Diffstat (limited to 'llvm/test/CodeGen')
-rw-r--r--  llvm/test/CodeGen/AArch64/16bit-float-promotion-with-nofp.ll | 31
-rw-r--r--  llvm/test/CodeGen/AArch64/addsub.ll | 6
-rw-r--r--  llvm/test/CodeGen/AArch64/arm64ec-entry-thunks-local-linkage.ll | 6
-rw-r--r--  llvm/test/CodeGen/AArch64/callbr-asm-obj-file.ll | 2
-rw-r--r--  llvm/test/CodeGen/AArch64/framelayout-sve-calleesaves-fix.mir | 2
-rw-r--r--  llvm/test/CodeGen/AArch64/framelayout-sve.mir | 24
-rw-r--r--  llvm/test/CodeGen/AArch64/reverse-csr-restore-seq.mir | 101
-rw-r--r--  llvm/test/CodeGen/AArch64/sme-streaming-compatible-interface.ll | 32
-rw-r--r--  llvm/test/CodeGen/AArch64/sme-streaming-interface.ll | 32
-rw-r--r--  llvm/test/CodeGen/AArch64/sme2-intrinsics-ld1.ll | 32
-rw-r--r--  llvm/test/CodeGen/AArch64/sme2-intrinsics-ldnt1.ll | 32
-rw-r--r--  llvm/test/CodeGen/AArch64/stack-probing-sve.ll | 4
-rw-r--r--  llvm/test/CodeGen/AArch64/strictfp_f16_abi_promote.ll | 255
-rw-r--r--  llvm/test/CodeGen/AArch64/sve-alloca.ll | 16
-rw-r--r--  llvm/test/CodeGen/AArch64/sve-calling-convention-mixed.ll | 32
-rw-r--r--  llvm/test/CodeGen/AArch64/sve-tailcall.ll | 32
-rw-r--r--  llvm/test/CodeGen/AArch64/sve2p1-intrinsics-pmov-to-pred.ll | 82
-rw-r--r--  llvm/test/CodeGen/AArch64/sve2p1-intrinsics-pmov-to-vector.ll | 45
-rw-r--r--  llvm/test/CodeGen/AArch64/unwind-preserved.ll | 96
-rw-r--r--  llvm/test/CodeGen/AMDGPU/GlobalISel/combine-extract-vector-load.mir | 40
-rw-r--r--  llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fdiv-sqrt-to-rsq.mir | 584
-rw-r--r--  llvm/test/CodeGen/AMDGPU/bf16.ll | 7262
-rw-r--r--  llvm/test/CodeGen/AMDGPU/fmed3-cast-combine.ll | 6
-rw-r--r--  llvm/test/CodeGen/AMDGPU/global-atomics-fp.ll | 30
-rw-r--r--  llvm/test/CodeGen/AMDGPU/isel-amdgpu-cs-chain-preserve-cc.ll | 20
-rw-r--r--  llvm/test/CodeGen/AMDGPU/local-atomics-fp.ll | 12
-rw-r--r--  llvm/test/CodeGen/AMDGPU/vector_shuffle.packed.ll | 72
-rw-r--r--  llvm/test/CodeGen/Hexagon/atomicrmw-uinc-udec-wrap.ll | 6
-rw-r--r--  llvm/test/CodeGen/Hexagon/bit-store-upper-sub-hi.mir | 21
-rw-r--r--  llvm/test/CodeGen/Hexagon/isel/select-vec.ll | 2
-rw-r--r--  llvm/test/CodeGen/Hexagon/load-const-extend-opt.ll | 68
-rw-r--r--  llvm/test/CodeGen/Hexagon/post-inc-vec.mir | 413
-rw-r--r--  llvm/test/CodeGen/Hexagon/post_inc_store.mir | 168
-rw-r--r--  llvm/test/CodeGen/Hexagon/postincopt-crash.mir | 58
-rw-r--r--  llvm/test/CodeGen/Hexagon/postincopt-dcfetch.mir | 19
-rw-r--r--  llvm/test/CodeGen/Hexagon/reg-by-name.ll | 4
-rw-r--r--  llvm/test/CodeGen/Hexagon/store-const-extend-opt.ll | 72
-rw-r--r--  llvm/test/CodeGen/Hexagon/tfr-slotindex.ll | 26
-rw-r--r--  llvm/test/CodeGen/Hexagon/valid-offset-loadbsw4.mir | 32
-rw-r--r--  llvm/test/CodeGen/LoongArch/intrinsic-memcpy.ll | 34
-rw-r--r--  llvm/test/CodeGen/NVPTX/param-load-store.ll | 93
-rw-r--r--  llvm/test/CodeGen/NVPTX/unaligned-param-load-store.ll | 385
-rw-r--r--  llvm/test/CodeGen/RISCV/fold-addi-loadstore.ll | 8
-rw-r--r--  llvm/test/CodeGen/RISCV/make-compressible.mir | 499
-rw-r--r--  llvm/test/CodeGen/RISCV/pr51206.ll | 12
-rw-r--r--  llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-gather.ll | 30
-rw-r--r--  llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vsadd-vp.ll | 1701
-rw-r--r--  llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vsaddu-vp.ll | 1697
-rw-r--r--  llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vssub-vp.ll | 1745
-rw-r--r--  llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vssubu-vp.ll | 1740
-rw-r--r--  llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwsubu.ll | 32
-rw-r--r--  llvm/test/CodeGen/RISCV/rvv/vsadd-vp.ll | 2015
-rw-r--r--  llvm/test/CodeGen/RISCV/rvv/vsaddu-vp.ll | 2014
-rw-r--r--  llvm/test/CodeGen/RISCV/rvv/vssub-vp.ll | 2067
-rw-r--r--  llvm/test/CodeGen/RISCV/rvv/vssubu-vp.ll | 2065
-rw-r--r--  llvm/test/CodeGen/RISCV/select.ll | 252
-rw-r--r--  llvm/test/CodeGen/RISCV/sext-zext-trunc.ll | 61
-rw-r--r--  llvm/test/CodeGen/RISCV/split-offsets.ll | 4
-rw-r--r--  llvm/test/CodeGen/RISCV/srem-vector-lkk.ll | 8
-rw-r--r--  llvm/test/CodeGen/RISCV/urem-vector-lkk.ll | 8
-rw-r--r--  llvm/test/CodeGen/SPIRV/extensions/SPV_INTEL_usm_storage_classes/intel-usm-addrspaces.ll | 84
-rw-r--r--  llvm/test/CodeGen/SPIRV/extensions/SPV_KHR_subgroup_rotate/subgroup-rotate.ll | 357
-rw-r--r--  llvm/test/CodeGen/SPIRV/switch-no-jump-table.ll | 30
-rw-r--r--  llvm/test/CodeGen/SystemZ/fp-strict-conv-17.ll | 24
-rw-r--r--  llvm/test/CodeGen/WebAssembly/lower-em-ehsjlj-multi-return.ll | 4
-rw-r--r--  llvm/test/CodeGen/WebAssembly/multivalue-dont-move-def-past-use.mir | 2
-rw-r--r--  llvm/test/CodeGen/WebAssembly/multivalue-stackify.ll | 2
-rw-r--r--  llvm/test/CodeGen/WebAssembly/multivalue.ll | 10
-rw-r--r--  llvm/test/CodeGen/WebAssembly/multivalue_libcall.ll | 2
-rw-r--r--  llvm/test/CodeGen/X86/GlobalISel/calllowering-tailcall.ll | 24
-rw-r--r--  llvm/test/CodeGen/X86/tailcall-cgp-dup.ll | 58
71 files changed, 21921 insertions, 4923 deletions
diff --git a/llvm/test/CodeGen/AArch64/16bit-float-promotion-with-nofp.ll b/llvm/test/CodeGen/AArch64/16bit-float-promotion-with-nofp.ll
new file mode 100644
index 0000000..bfe9ab8
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/16bit-float-promotion-with-nofp.ll
@@ -0,0 +1,31 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
+; RUN: llc -mtriple=aarch64 -mattr=-fp-armv8 -o - %s | FileCheck %s
+
+define half @f2h(float %a) {
+; CHECK-LABEL: f2h:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: .cfi_offset w30, -16
+; CHECK-NEXT: bl __gnu_f2h_ieee
+; CHECK-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: ret
+entry:
+ %0 = fptrunc float %a to half
+ ret half %0
+}
+
+define bfloat @f2bfloat(float %a) {
+; CHECK-LABEL: f2bfloat:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: .cfi_offset w30, -16
+; CHECK-NEXT: bl __truncsfbf2
+; CHECK-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: ret
+entry:
+ %0 = fptrunc float %a to bfloat
+ ret bfloat %0
+}
+
diff --git a/llvm/test/CodeGen/AArch64/addsub.ll b/llvm/test/CodeGen/AArch64/addsub.ll
index 1b86fe6..20215fe9 100644
--- a/llvm/test/CodeGen/AArch64/addsub.ll
+++ b/llvm/test/CodeGen/AArch64/addsub.ll
@@ -662,17 +662,13 @@ define dso_local i32 @_extract_crng_crng() {
; CHECK-NEXT: cmn x8, #1272
; CHECK-NEXT: b.pl .LBB36_3
; CHECK-NEXT: .LBB36_2: // %if.then
-; CHECK-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill
-; CHECK-NEXT: .cfi_def_cfa_offset 16
-; CHECK-NEXT: .cfi_offset w30, -16
; CHECK-NEXT: adrp x8, primary_crng
; CHECK-NEXT: ldr w8, [x8, :lo12:primary_crng]
; CHECK-NEXT: cmp w8, #0
; CHECK-NEXT: adrp x8, input_pool
; CHECK-NEXT: add x8, x8, :lo12:input_pool
; CHECK-NEXT: csel x0, xzr, x8, eq
-; CHECK-NEXT: bl crng_reseed
-; CHECK-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: b crng_reseed
; CHECK-NEXT: .LBB36_3: // %if.end
; CHECK-NEXT: ret
entry:
diff --git a/llvm/test/CodeGen/AArch64/arm64ec-entry-thunks-local-linkage.ll b/llvm/test/CodeGen/AArch64/arm64ec-entry-thunks-local-linkage.ll
index 00ae34b..217f08b 100644
--- a/llvm/test/CodeGen/AArch64/arm64ec-entry-thunks-local-linkage.ll
+++ b/llvm/test/CodeGen/AArch64/arm64ec-entry-thunks-local-linkage.ll
@@ -2,7 +2,8 @@
; Validates when local linkage functions get a thunk generated.
-; Being called does not cause a thunk to be generated.
+; Being called does not cause a thunk to be generated or the symbol name to be mangled.
+; CHECK-NOT: "#does_not_have_addr_taken":
; CHECK-NOT: $ientry_thunk$cdecl$v$f;
define internal void @does_not_have_addr_taken(float) nounwind {
ret void
@@ -12,7 +13,8 @@ define void @calls_does_not_have_addr_taken() nounwind {
ret void
}
-; Having an address taken does cause a thunk to be generated.
+; Having an address taken does cause a thunk to be generated and the symbol name to be mangled.
+; CHECK: "#has_addr_taken":
; CHECK: $ientry_thunk$cdecl$v$i8;
define internal void @has_addr_taken(i64) nounwind {
ret void
diff --git a/llvm/test/CodeGen/AArch64/callbr-asm-obj-file.ll b/llvm/test/CodeGen/AArch64/callbr-asm-obj-file.ll
index 94041bf..e601f03 100644
--- a/llvm/test/CodeGen/AArch64/callbr-asm-obj-file.ll
+++ b/llvm/test/CodeGen/AArch64/callbr-asm-obj-file.ll
@@ -40,7 +40,7 @@ declare dso_local i32 @g(...) local_unnamed_addr
declare dso_local i32 @i(...) local_unnamed_addr
; CHECK-LABEL: <test2>:
-; CHECK: bl {{.*}} <test2+0x18>
+; CHECK: b {{.*}} <test2+0x1c>
; CHECK-LABEL: <$d.5>:
; CHECK-LABEL: <$x.6>:
; CHECK-NEXT: b {{.*}} <test2+0x18>
diff --git a/llvm/test/CodeGen/AArch64/framelayout-sve-calleesaves-fix.mir b/llvm/test/CodeGen/AArch64/framelayout-sve-calleesaves-fix.mir
index 3dba21d..aed3145 100644
--- a/llvm/test/CodeGen/AArch64/framelayout-sve-calleesaves-fix.mir
+++ b/llvm/test/CodeGen/AArch64/framelayout-sve-calleesaves-fix.mir
@@ -19,8 +19,8 @@
; CHECK-NEXT: // implicit-def: $p4
; CHECK-NEXT: addvl sp, sp, #1
; CHECK-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 16 * VG
- ; CHECK-NEXT: ldr p4, [sp, #7, mul vl] // 2-byte Folded Reload
; CHECK-NEXT: ldr z8, [sp, #1, mul vl] // 16-byte Folded Reload
+ ; CHECK-NEXT: ldr p4, [sp, #7, mul vl] // 2-byte Folded Reload
; CHECK-NEXT: addvl sp, sp, #2
; CHECK-NEXT: .cfi_def_cfa wsp, 16
; CHECK-NEXT: .cfi_restore z8
diff --git a/llvm/test/CodeGen/AArch64/framelayout-sve.mir b/llvm/test/CodeGen/AArch64/framelayout-sve.mir
index 213d791..f7920e5 100644
--- a/llvm/test/CodeGen/AArch64/framelayout-sve.mir
+++ b/llvm/test/CodeGen/AArch64/framelayout-sve.mir
@@ -772,9 +772,9 @@ body: |
# CHECK: $sp = frame-destroy ADDXri $sp, 32, 0
# CHECK-NEXT: frame-destroy CFI_INSTRUCTION escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x18, 0x92, 0x2e, 0x00, 0x1e, 0x22
-# CHECK-NEXT: $z10 = frame-destroy LDR_ZXI $sp, 0
+# CHECK-NEXT: $z10 = frame-destroy LDR_ZXI $sp, 0
# CHECK-NEXT: $z9 = frame-destroy LDR_ZXI $sp, 1
-# CHECK-NEXT: $z8 = frame-destroy LDR_ZXI $sp, 2
+# CHECK-NEXT: $z8 = frame-destroy LDR_ZXI $sp, 2
# CHECK-NEXT: $sp = frame-destroy ADDVL_XXI $sp, 3
# CHECK-NEXT: frame-destroy CFI_INSTRUCTION def_cfa $wsp, 16
# CHECK-NEXT: frame-destroy CFI_INSTRUCTION restore $z8
@@ -873,14 +873,14 @@ body: |
# CHECK-NEXT: frame-destroy CFI_INSTRUCTION escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x20, 0x22, 0x11, 0x98, 0x01, 0x92, 0x2e, 0x00, 0x1e, 0x22
# CHECK: $sp = frame-destroy ADDVL_XXI $sp, 1
# CHECK-NEXT: frame-destroy CFI_INSTRUCTION escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x20, 0x22, 0x11, 0x90, 0x01, 0x92, 0x2e, 0x00, 0x1e, 0x22
-# CHECK: $p15 = frame-destroy LDR_PXI $sp, 4
-# CHECK: $p14 = frame-destroy LDR_PXI $sp, 5
-# CHECK: $p5 = frame-destroy LDR_PXI $sp, 14
-# CHECK: $p4 = frame-destroy LDR_PXI $sp, 15
# CHECK: $z23 = frame-destroy LDR_ZXI $sp, 2
# CHECK: $z22 = frame-destroy LDR_ZXI $sp, 3
# CHECK: $z9 = frame-destroy LDR_ZXI $sp, 16
# CHECK: $z8 = frame-destroy LDR_ZXI $sp, 17
+# CHECK: $p15 = frame-destroy LDR_PXI $sp, 4
+# CHECK: $p14 = frame-destroy LDR_PXI $sp, 5
+# CHECK: $p5 = frame-destroy LDR_PXI $sp, 14
+# CHECK: $p4 = frame-destroy LDR_PXI $sp, 15
# CHECK: $sp = frame-destroy ADDVL_XXI $sp, 18
# CHECK-NEXT: frame-destroy CFI_INSTRUCTION def_cfa $wsp, 32
# CHECK-NEXT: frame-destroy CFI_INSTRUCTION restore $z8
@@ -1037,14 +1037,14 @@ body: |
# CHECK-NEXT: $sp = frame-setup ANDXri killed $[[TMP]]
# CHECK: $sp = frame-destroy ADDVL_XXI $fp, -18
+# CHECK: $z23 = frame-destroy LDR_ZXI $sp, 2
+# CHECK-NEXT: $z22 = frame-destroy LDR_ZXI $sp, 3
+# CHECK: $z9 = frame-destroy LDR_ZXI $sp, 16
+# CHECK-NEXT: $z8 = frame-destroy LDR_ZXI $sp, 17
# CHECK-NEXT: $p15 = frame-destroy LDR_PXI $sp, 4
# CHECK-NEXT: $p14 = frame-destroy LDR_PXI $sp, 5
# CHECK: $p5 = frame-destroy LDR_PXI $sp, 14
# CHECK-NEXT: $p4 = frame-destroy LDR_PXI $sp, 15
-# CHECK-NEXT: $z23 = frame-destroy LDR_ZXI $sp, 2
-# CHECK-NEXT: $z22 = frame-destroy LDR_ZXI $sp, 3
-# CHECK: $z9 = frame-destroy LDR_ZXI $sp, 16
-# CHECK-NEXT: $z8 = frame-destroy LDR_ZXI $sp, 17
# CHECK-NEXT: frame-destroy CFI_INSTRUCTION restore $z8
# CHECK-NEXT: frame-destroy CFI_INSTRUCTION restore $z9
# CHECK-NEXT: frame-destroy CFI_INSTRUCTION restore $z10
@@ -1198,10 +1198,10 @@ body: |
# CHECK: $sp = frame-destroy ADDVL_XXI $sp, 7
# CHECK-NEXT: frame-destroy CFI_INSTRUCTION escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x18, 0x92, 0x2e, 0x00, 0x1e, 0x22
-# CHECK-NEXT: $p15 = frame-destroy LDR_PXI $sp, 6
-# CHECK-NEXT: $p4 = frame-destroy LDR_PXI $sp, 7
# CHECK-NEXT: $z23 = frame-destroy LDR_ZXI $sp, 1
# CHECK-NEXT: $z8 = frame-destroy LDR_ZXI $sp, 2
+# CHECK-NEXT: $p15 = frame-destroy LDR_PXI $sp, 6
+# CHECK-NEXT: $p4 = frame-destroy LDR_PXI $sp, 7
# CHECK-NEXT: $sp = frame-destroy ADDVL_XXI $sp, 3
# CHECK-NEXT: frame-destroy CFI_INSTRUCTION def_cfa $wsp, 16
# CHECK-NEXT: frame-destroy CFI_INSTRUCTION restore $z8
diff --git a/llvm/test/CodeGen/AArch64/reverse-csr-restore-seq.mir b/llvm/test/CodeGen/AArch64/reverse-csr-restore-seq.mir
deleted file mode 100644
index de4baec..0000000
--- a/llvm/test/CodeGen/AArch64/reverse-csr-restore-seq.mir
+++ /dev/null
@@ -1,101 +0,0 @@
-# RUN: llc -run-pass=prologepilog -reverse-csr-restore-seq -o - -mtriple=aarch64-- %s | FileCheck %s --check-prefixes=CHECK
-# RUN: llc -start-before=prologepilog -stop-after=aarch64-ldst-opt -reverse-csr-restore-seq -o - -mtriple=aarch64-- %s | FileCheck %s --check-prefixes=CHECK
-#
---- |
-
- define void @foo() nounwind { entry: unreachable }
-
- define void @bar() nounwind { entry: unreachable }
-
- define void @baz() nounwind { entry: unreachable }
-
-...
----
-name: foo
-# CHECK-LABEL: name: foo
-tracksRegLiveness: true
-body: |
- bb.0:
- $x19 = IMPLICIT_DEF
- $x20 = IMPLICIT_DEF
- $x21 = IMPLICIT_DEF
- $x22 = IMPLICIT_DEF
- $x23 = IMPLICIT_DEF
- $x24 = IMPLICIT_DEF
- $x25 = IMPLICIT_DEF
- $x26 = IMPLICIT_DEF
-
- ; The local stack size is 0, so the last ldp in the sequence will also
- ; restore the stack.
- ; CHECK: $x24, $x23 = frame-destroy LDPXi $sp, 2
- ; CHECK-NEXT: $x22, $x21 = frame-destroy LDPXi $sp, 4
- ; CHECK-NEXT: $x20, $x19 = frame-destroy LDPXi $sp, 6
-
- ; The ldp and the stack increment get merged even before
- ; the load-store optimizer.
- ; CHECK-NEXT: early-clobber $sp, $x26, $x25 = frame-destroy LDPXpost $sp, 8
-
- RET_ReallyLR
-...
----
-name: bar
-# CHECK-LABEL: name: bar
-tracksRegLiveness: true
-stack:
- - { id : 0, size: 8, alignment: 4,
- stack-id: default, callee-saved-register: '', callee-saved-restored: true,
- local-offset: -4, debug-info-variable: '', debug-info-expression: '',
- debug-info-location: '' }
-
-body: |
- bb.0:
- $x19 = IMPLICIT_DEF
- $x20 = IMPLICIT_DEF
- $x21 = IMPLICIT_DEF
- $x22 = IMPLICIT_DEF
- $x23 = IMPLICIT_DEF
- $x24 = IMPLICIT_DEF
- $x25 = IMPLICIT_DEF
- $x26 = IMPLICIT_DEF
-
- ; The local stack size is not 0, and we can combine the CSR stack size with
- ; the local stack size. This results in rewriting the offsets for all the
- ; save/restores and forbids us to merge the stack adjustment and the last pop.
- ; In this case, there is no point of moving the first CSR pair at the end.
- ; We do it anyway, as it's a small price to pay for the resulting
- ; simplification in the epilogue emission code.
- ; CHECK: $x24, $x23 = frame-destroy LDPXi $sp, 4
- ; CHECK-NEXT: $x22, $x21 = frame-destroy LDPXi $sp, 6
- ; CHECK-NEXT: $x20, $x19 = frame-destroy LDPXi $sp, 8
- ; CHECK-NEXT: $x26, $x25 = frame-destroy LDPXi $sp, 2
- ; CHECK-NEXT: $sp = frame-destroy ADDXri $sp, 80, 0
- RET_ReallyLR
-...
----
-# Check that the load from the offset 0 is moved at the end even when hasFP is
-# false.
-name: baz
-# CHECK-LABEL: name: baz
-alignment: 4
-tracksRegLiveness: true
-frameInfo:
- adjustsStack: true
- hasCalls: true
-body: |
- bb.0:
- successors: %bb.1
-
- $x0 = IMPLICIT_DEF
- $x20 = IMPLICIT_DEF
- $x21 = IMPLICIT_DEF
-
- ADJCALLSTACKDOWN 0, 0, implicit-def dead $sp, implicit $sp
- BL @foo, csr_aarch64_aapcs, implicit-def dead $lr, implicit $sp, implicit $x0, implicit-def $sp, implicit-def $x0
- ADJCALLSTACKUP 0, 0, implicit-def dead $sp, implicit $sp
- B %bb.1
-
- bb.1:
- ; CHECK: $x21, $x20 = frame-destroy LDPXi $sp, 2
- ; CHECK-NEXT: early-clobber $sp, $lr = frame-destroy LDRXpost $sp, 32
- RET_ReallyLR
-...
diff --git a/llvm/test/CodeGen/AArch64/sme-streaming-compatible-interface.ll b/llvm/test/CodeGen/AArch64/sme-streaming-compatible-interface.ll
index 296f2be..6d2abf7 100644
--- a/llvm/test/CodeGen/AArch64/sme-streaming-compatible-interface.ll
+++ b/llvm/test/CodeGen/AArch64/sme-streaming-compatible-interface.ll
@@ -226,30 +226,30 @@ define <vscale x 2 x double> @streaming_compatible_with_scalable_vectors(<vscale
; CHECK-NEXT: ldr z1, [sp] // 16-byte Folded Reload
; CHECK-NEXT: fadd z0.d, z1.d, z0.d
; CHECK-NEXT: addvl sp, sp, #2
-; CHECK-NEXT: ldr p15, [sp, #4, mul vl] // 2-byte Folded Reload
; CHECK-NEXT: ldr z23, [sp, #2, mul vl] // 16-byte Folded Reload
; CHECK-NEXT: ldr z22, [sp, #3, mul vl] // 16-byte Folded Reload
-; CHECK-NEXT: ldr p14, [sp, #5, mul vl] // 2-byte Folded Reload
; CHECK-NEXT: ldr z21, [sp, #4, mul vl] // 16-byte Folded Reload
; CHECK-NEXT: ldr z20, [sp, #5, mul vl] // 16-byte Folded Reload
-; CHECK-NEXT: ldr p13, [sp, #6, mul vl] // 2-byte Folded Reload
; CHECK-NEXT: ldr z19, [sp, #6, mul vl] // 16-byte Folded Reload
; CHECK-NEXT: ldr z18, [sp, #7, mul vl] // 16-byte Folded Reload
-; CHECK-NEXT: ldr p12, [sp, #7, mul vl] // 2-byte Folded Reload
; CHECK-NEXT: ldr z17, [sp, #8, mul vl] // 16-byte Folded Reload
; CHECK-NEXT: ldr z16, [sp, #9, mul vl] // 16-byte Folded Reload
-; CHECK-NEXT: ldr p11, [sp, #8, mul vl] // 2-byte Folded Reload
; CHECK-NEXT: ldr z15, [sp, #10, mul vl] // 16-byte Folded Reload
; CHECK-NEXT: ldr z14, [sp, #11, mul vl] // 16-byte Folded Reload
-; CHECK-NEXT: ldr p10, [sp, #9, mul vl] // 2-byte Folded Reload
; CHECK-NEXT: ldr z13, [sp, #12, mul vl] // 16-byte Folded Reload
; CHECK-NEXT: ldr z12, [sp, #13, mul vl] // 16-byte Folded Reload
-; CHECK-NEXT: ldr p9, [sp, #10, mul vl] // 2-byte Folded Reload
; CHECK-NEXT: ldr z11, [sp, #14, mul vl] // 16-byte Folded Reload
; CHECK-NEXT: ldr z10, [sp, #15, mul vl] // 16-byte Folded Reload
-; CHECK-NEXT: ldr p8, [sp, #11, mul vl] // 2-byte Folded Reload
; CHECK-NEXT: ldr z9, [sp, #16, mul vl] // 16-byte Folded Reload
; CHECK-NEXT: ldr z8, [sp, #17, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr p15, [sp, #4, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: ldr p14, [sp, #5, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: ldr p13, [sp, #6, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: ldr p12, [sp, #7, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: ldr p11, [sp, #8, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: ldr p10, [sp, #9, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: ldr p9, [sp, #10, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: ldr p8, [sp, #11, mul vl] // 2-byte Folded Reload
; CHECK-NEXT: ldr p7, [sp, #12, mul vl] // 2-byte Folded Reload
; CHECK-NEXT: ldr p6, [sp, #13, mul vl] // 2-byte Folded Reload
; CHECK-NEXT: ldr p5, [sp, #14, mul vl] // 2-byte Folded Reload
@@ -318,30 +318,30 @@ define <vscale x 2 x i1> @streaming_compatible_with_predicate_vectors(<vscale x
; CHECK-NEXT: ldr p1, [sp, #6, mul vl] // 2-byte Folded Reload
; CHECK-NEXT: and p0.b, p1/z, p1.b, p0.b
; CHECK-NEXT: addvl sp, sp, #1
-; CHECK-NEXT: ldr p15, [sp, #4, mul vl] // 2-byte Folded Reload
; CHECK-NEXT: ldr z23, [sp, #2, mul vl] // 16-byte Folded Reload
; CHECK-NEXT: ldr z22, [sp, #3, mul vl] // 16-byte Folded Reload
-; CHECK-NEXT: ldr p14, [sp, #5, mul vl] // 2-byte Folded Reload
; CHECK-NEXT: ldr z21, [sp, #4, mul vl] // 16-byte Folded Reload
; CHECK-NEXT: ldr z20, [sp, #5, mul vl] // 16-byte Folded Reload
-; CHECK-NEXT: ldr p13, [sp, #6, mul vl] // 2-byte Folded Reload
; CHECK-NEXT: ldr z19, [sp, #6, mul vl] // 16-byte Folded Reload
; CHECK-NEXT: ldr z18, [sp, #7, mul vl] // 16-byte Folded Reload
-; CHECK-NEXT: ldr p12, [sp, #7, mul vl] // 2-byte Folded Reload
; CHECK-NEXT: ldr z17, [sp, #8, mul vl] // 16-byte Folded Reload
; CHECK-NEXT: ldr z16, [sp, #9, mul vl] // 16-byte Folded Reload
-; CHECK-NEXT: ldr p11, [sp, #8, mul vl] // 2-byte Folded Reload
; CHECK-NEXT: ldr z15, [sp, #10, mul vl] // 16-byte Folded Reload
; CHECK-NEXT: ldr z14, [sp, #11, mul vl] // 16-byte Folded Reload
-; CHECK-NEXT: ldr p10, [sp, #9, mul vl] // 2-byte Folded Reload
; CHECK-NEXT: ldr z13, [sp, #12, mul vl] // 16-byte Folded Reload
; CHECK-NEXT: ldr z12, [sp, #13, mul vl] // 16-byte Folded Reload
-; CHECK-NEXT: ldr p9, [sp, #10, mul vl] // 2-byte Folded Reload
; CHECK-NEXT: ldr z11, [sp, #14, mul vl] // 16-byte Folded Reload
; CHECK-NEXT: ldr z10, [sp, #15, mul vl] // 16-byte Folded Reload
-; CHECK-NEXT: ldr p8, [sp, #11, mul vl] // 2-byte Folded Reload
; CHECK-NEXT: ldr z9, [sp, #16, mul vl] // 16-byte Folded Reload
; CHECK-NEXT: ldr z8, [sp, #17, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr p15, [sp, #4, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: ldr p14, [sp, #5, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: ldr p13, [sp, #6, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: ldr p12, [sp, #7, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: ldr p11, [sp, #8, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: ldr p10, [sp, #9, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: ldr p9, [sp, #10, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: ldr p8, [sp, #11, mul vl] // 2-byte Folded Reload
; CHECK-NEXT: ldr p7, [sp, #12, mul vl] // 2-byte Folded Reload
; CHECK-NEXT: ldr p6, [sp, #13, mul vl] // 2-byte Folded Reload
; CHECK-NEXT: ldr p5, [sp, #14, mul vl] // 2-byte Folded Reload
diff --git a/llvm/test/CodeGen/AArch64/sme-streaming-interface.ll b/llvm/test/CodeGen/AArch64/sme-streaming-interface.ll
index 86918a59..de676ac 100644
--- a/llvm/test/CodeGen/AArch64/sme-streaming-interface.ll
+++ b/llvm/test/CodeGen/AArch64/sme-streaming-interface.ll
@@ -187,30 +187,30 @@ define <vscale x 4 x i32> @smstart_clobber_sve(<vscale x 4 x i32> %x) nounwind {
; CHECK-NEXT: smstop sm
; CHECK-NEXT: ldr z0, [sp] // 16-byte Folded Reload
; CHECK-NEXT: addvl sp, sp, #1
-; CHECK-NEXT: ldr p15, [sp, #4, mul vl] // 2-byte Folded Reload
; CHECK-NEXT: ldr z23, [sp, #2, mul vl] // 16-byte Folded Reload
; CHECK-NEXT: ldr z22, [sp, #3, mul vl] // 16-byte Folded Reload
-; CHECK-NEXT: ldr p14, [sp, #5, mul vl] // 2-byte Folded Reload
; CHECK-NEXT: ldr z21, [sp, #4, mul vl] // 16-byte Folded Reload
; CHECK-NEXT: ldr z20, [sp, #5, mul vl] // 16-byte Folded Reload
-; CHECK-NEXT: ldr p13, [sp, #6, mul vl] // 2-byte Folded Reload
; CHECK-NEXT: ldr z19, [sp, #6, mul vl] // 16-byte Folded Reload
; CHECK-NEXT: ldr z18, [sp, #7, mul vl] // 16-byte Folded Reload
-; CHECK-NEXT: ldr p12, [sp, #7, mul vl] // 2-byte Folded Reload
; CHECK-NEXT: ldr z17, [sp, #8, mul vl] // 16-byte Folded Reload
; CHECK-NEXT: ldr z16, [sp, #9, mul vl] // 16-byte Folded Reload
-; CHECK-NEXT: ldr p11, [sp, #8, mul vl] // 2-byte Folded Reload
; CHECK-NEXT: ldr z15, [sp, #10, mul vl] // 16-byte Folded Reload
; CHECK-NEXT: ldr z14, [sp, #11, mul vl] // 16-byte Folded Reload
-; CHECK-NEXT: ldr p10, [sp, #9, mul vl] // 2-byte Folded Reload
; CHECK-NEXT: ldr z13, [sp, #12, mul vl] // 16-byte Folded Reload
; CHECK-NEXT: ldr z12, [sp, #13, mul vl] // 16-byte Folded Reload
-; CHECK-NEXT: ldr p9, [sp, #10, mul vl] // 2-byte Folded Reload
; CHECK-NEXT: ldr z11, [sp, #14, mul vl] // 16-byte Folded Reload
; CHECK-NEXT: ldr z10, [sp, #15, mul vl] // 16-byte Folded Reload
-; CHECK-NEXT: ldr p8, [sp, #11, mul vl] // 2-byte Folded Reload
; CHECK-NEXT: ldr z9, [sp, #16, mul vl] // 16-byte Folded Reload
; CHECK-NEXT: ldr z8, [sp, #17, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr p15, [sp, #4, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: ldr p14, [sp, #5, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: ldr p13, [sp, #6, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: ldr p12, [sp, #7, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: ldr p11, [sp, #8, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: ldr p10, [sp, #9, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: ldr p9, [sp, #10, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: ldr p8, [sp, #11, mul vl] // 2-byte Folded Reload
; CHECK-NEXT: ldr p7, [sp, #12, mul vl] // 2-byte Folded Reload
; CHECK-NEXT: ldr p6, [sp, #13, mul vl] // 2-byte Folded Reload
; CHECK-NEXT: ldr p5, [sp, #14, mul vl] // 2-byte Folded Reload
@@ -267,30 +267,30 @@ define <vscale x 4 x i32> @smstart_clobber_sve_duplicate(<vscale x 4 x i32> %x)
; CHECK-NEXT: smstop sm
; CHECK-NEXT: ldr z0, [sp] // 16-byte Folded Reload
; CHECK-NEXT: addvl sp, sp, #1
-; CHECK-NEXT: ldr p15, [sp, #4, mul vl] // 2-byte Folded Reload
; CHECK-NEXT: ldr z23, [sp, #2, mul vl] // 16-byte Folded Reload
; CHECK-NEXT: ldr z22, [sp, #3, mul vl] // 16-byte Folded Reload
-; CHECK-NEXT: ldr p14, [sp, #5, mul vl] // 2-byte Folded Reload
; CHECK-NEXT: ldr z21, [sp, #4, mul vl] // 16-byte Folded Reload
; CHECK-NEXT: ldr z20, [sp, #5, mul vl] // 16-byte Folded Reload
-; CHECK-NEXT: ldr p13, [sp, #6, mul vl] // 2-byte Folded Reload
; CHECK-NEXT: ldr z19, [sp, #6, mul vl] // 16-byte Folded Reload
; CHECK-NEXT: ldr z18, [sp, #7, mul vl] // 16-byte Folded Reload
-; CHECK-NEXT: ldr p12, [sp, #7, mul vl] // 2-byte Folded Reload
; CHECK-NEXT: ldr z17, [sp, #8, mul vl] // 16-byte Folded Reload
; CHECK-NEXT: ldr z16, [sp, #9, mul vl] // 16-byte Folded Reload
-; CHECK-NEXT: ldr p11, [sp, #8, mul vl] // 2-byte Folded Reload
; CHECK-NEXT: ldr z15, [sp, #10, mul vl] // 16-byte Folded Reload
; CHECK-NEXT: ldr z14, [sp, #11, mul vl] // 16-byte Folded Reload
-; CHECK-NEXT: ldr p10, [sp, #9, mul vl] // 2-byte Folded Reload
; CHECK-NEXT: ldr z13, [sp, #12, mul vl] // 16-byte Folded Reload
; CHECK-NEXT: ldr z12, [sp, #13, mul vl] // 16-byte Folded Reload
-; CHECK-NEXT: ldr p9, [sp, #10, mul vl] // 2-byte Folded Reload
; CHECK-NEXT: ldr z11, [sp, #14, mul vl] // 16-byte Folded Reload
; CHECK-NEXT: ldr z10, [sp, #15, mul vl] // 16-byte Folded Reload
-; CHECK-NEXT: ldr p8, [sp, #11, mul vl] // 2-byte Folded Reload
; CHECK-NEXT: ldr z9, [sp, #16, mul vl] // 16-byte Folded Reload
; CHECK-NEXT: ldr z8, [sp, #17, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr p15, [sp, #4, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: ldr p14, [sp, #5, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: ldr p13, [sp, #6, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: ldr p12, [sp, #7, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: ldr p11, [sp, #8, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: ldr p10, [sp, #9, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: ldr p9, [sp, #10, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: ldr p8, [sp, #11, mul vl] // 2-byte Folded Reload
; CHECK-NEXT: ldr p7, [sp, #12, mul vl] // 2-byte Folded Reload
; CHECK-NEXT: ldr p6, [sp, #13, mul vl] // 2-byte Folded Reload
; CHECK-NEXT: ldr p5, [sp, #14, mul vl] // 2-byte Folded Reload
diff --git a/llvm/test/CodeGen/AArch64/sme2-intrinsics-ld1.ll b/llvm/test/CodeGen/AArch64/sme2-intrinsics-ld1.ll
index b7119fc..ea7808d 100644
--- a/llvm/test/CodeGen/AArch64/sme2-intrinsics-ld1.ll
+++ b/llvm/test/CodeGen/AArch64/sme2-intrinsics-ld1.ll
@@ -129,7 +129,6 @@ define <vscale x 32 x i8> @ld1_x2_i8_z0_z8(<vscale x 16 x i8> %unused, <vscale x
; CONTIGUOUS-NEXT: ldr z0, [sp]
; CONTIGUOUS-NEXT: ldr z1, [sp, #1, mul vl]
; CONTIGUOUS-NEXT: addvl sp, sp, #2
-; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
@@ -145,6 +144,7 @@ define <vscale x 32 x i8> @ld1_x2_i8_z0_z8(<vscale x 16 x i8> %unused, <vscale x
; CONTIGUOUS-NEXT: ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; CONTIGUOUS-NEXT: addvl sp, sp, #16
; CONTIGUOUS-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
; CONTIGUOUS-NEXT: ret
@@ -284,7 +284,6 @@ define <vscale x 32 x i8> @ld1_x2_i8_z0_z8_scalar(<vscale x 16 x i8> %unused, <v
; CONTIGUOUS-NEXT: ldr z0, [sp]
; CONTIGUOUS-NEXT: ldr z1, [sp, #1, mul vl]
; CONTIGUOUS-NEXT: addvl sp, sp, #2
-; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
@@ -300,6 +299,7 @@ define <vscale x 32 x i8> @ld1_x2_i8_z0_z8_scalar(<vscale x 16 x i8> %unused, <v
; CONTIGUOUS-NEXT: ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; CONTIGUOUS-NEXT: addvl sp, sp, #16
; CONTIGUOUS-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
; CONTIGUOUS-NEXT: ret
@@ -440,7 +440,6 @@ define <vscale x 16 x i16> @ld1_x2_i16_z0_z8(<vscale x 8 x i16> %unused, <vscale
; CONTIGUOUS-NEXT: ldr z0, [sp]
; CONTIGUOUS-NEXT: ldr z1, [sp, #1, mul vl]
; CONTIGUOUS-NEXT: addvl sp, sp, #2
-; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
@@ -456,6 +455,7 @@ define <vscale x 16 x i16> @ld1_x2_i16_z0_z8(<vscale x 8 x i16> %unused, <vscale
; CONTIGUOUS-NEXT: ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; CONTIGUOUS-NEXT: addvl sp, sp, #16
; CONTIGUOUS-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
; CONTIGUOUS-NEXT: ret
@@ -595,7 +595,6 @@ define <vscale x 16 x i16> @ld1_x2_i16_z0_z8_scalar(<vscale x 8 x i16> %unused,
; CONTIGUOUS-NEXT: ldr z0, [sp]
; CONTIGUOUS-NEXT: ldr z1, [sp, #1, mul vl]
; CONTIGUOUS-NEXT: addvl sp, sp, #2
-; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
@@ -611,6 +610,7 @@ define <vscale x 16 x i16> @ld1_x2_i16_z0_z8_scalar(<vscale x 8 x i16> %unused,
; CONTIGUOUS-NEXT: ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; CONTIGUOUS-NEXT: addvl sp, sp, #16
; CONTIGUOUS-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
; CONTIGUOUS-NEXT: ret
@@ -751,7 +751,6 @@ define <vscale x 8 x i32> @ld1_x2_i32_z0_z8(<vscale x 4 x i32> %unused, <vscale
; CONTIGUOUS-NEXT: ldr z0, [sp]
; CONTIGUOUS-NEXT: ldr z1, [sp, #1, mul vl]
; CONTIGUOUS-NEXT: addvl sp, sp, #2
-; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
@@ -767,6 +766,7 @@ define <vscale x 8 x i32> @ld1_x2_i32_z0_z8(<vscale x 4 x i32> %unused, <vscale
; CONTIGUOUS-NEXT: ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; CONTIGUOUS-NEXT: addvl sp, sp, #16
; CONTIGUOUS-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
; CONTIGUOUS-NEXT: ret
@@ -906,7 +906,6 @@ define <vscale x 8 x i32> @ld1_x2_i32_z0_z8_scalar(<vscale x 4 x i32> %unused, <
; CONTIGUOUS-NEXT: ldr z0, [sp]
; CONTIGUOUS-NEXT: ldr z1, [sp, #1, mul vl]
; CONTIGUOUS-NEXT: addvl sp, sp, #2
-; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
@@ -922,6 +921,7 @@ define <vscale x 8 x i32> @ld1_x2_i32_z0_z8_scalar(<vscale x 4 x i32> %unused, <
; CONTIGUOUS-NEXT: ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; CONTIGUOUS-NEXT: addvl sp, sp, #16
; CONTIGUOUS-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
; CONTIGUOUS-NEXT: ret
@@ -1062,7 +1062,6 @@ define <vscale x 4 x i64> @ld1_x2_i64_z0_z8(<vscale x 2 x i64> %unused, <vscale
; CONTIGUOUS-NEXT: ldr z0, [sp]
; CONTIGUOUS-NEXT: ldr z1, [sp, #1, mul vl]
; CONTIGUOUS-NEXT: addvl sp, sp, #2
-; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
@@ -1078,6 +1077,7 @@ define <vscale x 4 x i64> @ld1_x2_i64_z0_z8(<vscale x 2 x i64> %unused, <vscale
; CONTIGUOUS-NEXT: ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; CONTIGUOUS-NEXT: addvl sp, sp, #16
; CONTIGUOUS-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
; CONTIGUOUS-NEXT: ret
@@ -1217,7 +1217,6 @@ define <vscale x 4 x i64> @ld1_x2_i64_z0_z8_scalar(<vscale x 2 x i64> %unused, <
; CONTIGUOUS-NEXT: ldr z0, [sp]
; CONTIGUOUS-NEXT: ldr z1, [sp, #1, mul vl]
; CONTIGUOUS-NEXT: addvl sp, sp, #2
-; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
@@ -1233,6 +1232,7 @@ define <vscale x 4 x i64> @ld1_x2_i64_z0_z8_scalar(<vscale x 2 x i64> %unused, <
; CONTIGUOUS-NEXT: ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; CONTIGUOUS-NEXT: addvl sp, sp, #16
; CONTIGUOUS-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
; CONTIGUOUS-NEXT: ret
@@ -1380,7 +1380,6 @@ define <vscale x 64 x i8> @ld1_x4_i8_z0_z4_z8_z12(<vscale x 16 x i8> %unused, <v
; CONTIGUOUS-NEXT: ldr z2, [sp, #2, mul vl]
; CONTIGUOUS-NEXT: ldr z3, [sp, #3, mul vl]
; CONTIGUOUS-NEXT: addvl sp, sp, #4
-; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
@@ -1395,6 +1394,7 @@ define <vscale x 64 x i8> @ld1_x4_i8_z0_z4_z8_z12(<vscale x 16 x i8> %unused, <v
; CONTIGUOUS-NEXT: ldr z11, [sp, #12, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z10, [sp, #13, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z9, [sp, #14, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; CONTIGUOUS-NEXT: addvl sp, sp, #15
; CONTIGUOUS-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
; CONTIGUOUS-NEXT: ret
@@ -1545,7 +1545,6 @@ define <vscale x 64 x i8> @ld1_x4_i8_z0_z4_z8_z12_scalar(<vscale x 16 x i8> %unu
; CONTIGUOUS-NEXT: ldr z2, [sp, #2, mul vl]
; CONTIGUOUS-NEXT: ldr z3, [sp, #3, mul vl]
; CONTIGUOUS-NEXT: addvl sp, sp, #4
-; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
@@ -1560,6 +1559,7 @@ define <vscale x 64 x i8> @ld1_x4_i8_z0_z4_z8_z12_scalar(<vscale x 16 x i8> %unu
; CONTIGUOUS-NEXT: ldr z11, [sp, #12, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z10, [sp, #13, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z9, [sp, #14, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; CONTIGUOUS-NEXT: addvl sp, sp, #15
; CONTIGUOUS-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
; CONTIGUOUS-NEXT: ret
@@ -1711,7 +1711,6 @@ define <vscale x 32 x i16> @ld1_x4_i16_z0_z4_z8_z12(<vscale x 8 x i16> %unused,
; CONTIGUOUS-NEXT: ldr z2, [sp, #2, mul vl]
; CONTIGUOUS-NEXT: ldr z3, [sp, #3, mul vl]
; CONTIGUOUS-NEXT: addvl sp, sp, #4
-; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
@@ -1726,6 +1725,7 @@ define <vscale x 32 x i16> @ld1_x4_i16_z0_z4_z8_z12(<vscale x 8 x i16> %unused,
; CONTIGUOUS-NEXT: ldr z11, [sp, #12, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z10, [sp, #13, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z9, [sp, #14, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; CONTIGUOUS-NEXT: addvl sp, sp, #15
; CONTIGUOUS-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
; CONTIGUOUS-NEXT: ret
@@ -1877,7 +1877,6 @@ define <vscale x 32 x i16> @ld1_x4_i16_z0_z4_z8_z12_scalar(<vscale x 8 x i16> %u
; CONTIGUOUS-NEXT: ldr z2, [sp, #2, mul vl]
; CONTIGUOUS-NEXT: ldr z3, [sp, #3, mul vl]
; CONTIGUOUS-NEXT: addvl sp, sp, #4
-; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
@@ -1892,6 +1891,7 @@ define <vscale x 32 x i16> @ld1_x4_i16_z0_z4_z8_z12_scalar(<vscale x 8 x i16> %u
; CONTIGUOUS-NEXT: ldr z11, [sp, #12, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z10, [sp, #13, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z9, [sp, #14, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; CONTIGUOUS-NEXT: addvl sp, sp, #15
; CONTIGUOUS-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
; CONTIGUOUS-NEXT: ret
@@ -2043,7 +2043,6 @@ define <vscale x 16 x i32> @ld1_x4_i32_z0_z4_z8_z12(<vscale x 4 x i32> %unused,
; CONTIGUOUS-NEXT: ldr z2, [sp, #2, mul vl]
; CONTIGUOUS-NEXT: ldr z3, [sp, #3, mul vl]
; CONTIGUOUS-NEXT: addvl sp, sp, #4
-; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
@@ -2058,6 +2057,7 @@ define <vscale x 16 x i32> @ld1_x4_i32_z0_z4_z8_z12(<vscale x 4 x i32> %unused,
; CONTIGUOUS-NEXT: ldr z11, [sp, #12, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z10, [sp, #13, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z9, [sp, #14, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; CONTIGUOUS-NEXT: addvl sp, sp, #15
; CONTIGUOUS-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
; CONTIGUOUS-NEXT: ret
@@ -2209,7 +2209,6 @@ define <vscale x 16 x i32> @ld1_x4_i32_z0_z4_z8_z12_scalar(<vscale x 4 x i32> %u
; CONTIGUOUS-NEXT: ldr z2, [sp, #2, mul vl]
; CONTIGUOUS-NEXT: ldr z3, [sp, #3, mul vl]
; CONTIGUOUS-NEXT: addvl sp, sp, #4
-; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
@@ -2224,6 +2223,7 @@ define <vscale x 16 x i32> @ld1_x4_i32_z0_z4_z8_z12_scalar(<vscale x 4 x i32> %u
; CONTIGUOUS-NEXT: ldr z11, [sp, #12, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z10, [sp, #13, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z9, [sp, #14, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; CONTIGUOUS-NEXT: addvl sp, sp, #15
; CONTIGUOUS-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
; CONTIGUOUS-NEXT: ret
@@ -2375,7 +2375,6 @@ define <vscale x 8 x i64> @ld1_x4_i64_z0_z4_z8_z12(<vscale x 2 x i64> %unused, <
; CONTIGUOUS-NEXT: ldr z2, [sp, #2, mul vl]
; CONTIGUOUS-NEXT: ldr z3, [sp, #3, mul vl]
; CONTIGUOUS-NEXT: addvl sp, sp, #4
-; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
@@ -2390,6 +2389,7 @@ define <vscale x 8 x i64> @ld1_x4_i64_z0_z4_z8_z12(<vscale x 2 x i64> %unused, <
; CONTIGUOUS-NEXT: ldr z11, [sp, #12, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z10, [sp, #13, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z9, [sp, #14, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; CONTIGUOUS-NEXT: addvl sp, sp, #15
; CONTIGUOUS-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
; CONTIGUOUS-NEXT: ret
@@ -2541,7 +2541,6 @@ define <vscale x 8 x i64> @ld1_x4_i64_z0_z4_z8_z12_scalar(<vscale x 2 x i64> %un
; CONTIGUOUS-NEXT: ldr z2, [sp, #2, mul vl]
; CONTIGUOUS-NEXT: ldr z3, [sp, #3, mul vl]
; CONTIGUOUS-NEXT: addvl sp, sp, #4
-; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
@@ -2556,6 +2555,7 @@ define <vscale x 8 x i64> @ld1_x4_i64_z0_z4_z8_z12_scalar(<vscale x 2 x i64> %un
; CONTIGUOUS-NEXT: ldr z11, [sp, #12, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z10, [sp, #13, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z9, [sp, #14, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; CONTIGUOUS-NEXT: addvl sp, sp, #15
; CONTIGUOUS-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
; CONTIGUOUS-NEXT: ret
diff --git a/llvm/test/CodeGen/AArch64/sme2-intrinsics-ldnt1.ll b/llvm/test/CodeGen/AArch64/sme2-intrinsics-ldnt1.ll
index 1fb251a..7e2d28f 100644
--- a/llvm/test/CodeGen/AArch64/sme2-intrinsics-ldnt1.ll
+++ b/llvm/test/CodeGen/AArch64/sme2-intrinsics-ldnt1.ll
@@ -82,7 +82,6 @@ define <vscale x 32 x i8> @ldnt1_x2_i8_z0_z8(<vscale x 16 x i8> %unused, <vscale
; CONTIGUOUS-NEXT: ldr z0, [sp]
; CONTIGUOUS-NEXT: ldr z1, [sp, #1, mul vl]
; CONTIGUOUS-NEXT: addvl sp, sp, #2
-; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
@@ -98,6 +97,7 @@ define <vscale x 32 x i8> @ldnt1_x2_i8_z0_z8(<vscale x 16 x i8> %unused, <vscale
; CONTIGUOUS-NEXT: ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; CONTIGUOUS-NEXT: addvl sp, sp, #16
; CONTIGUOUS-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
; CONTIGUOUS-NEXT: ret
@@ -190,7 +190,6 @@ define <vscale x 32 x i8> @ldnt1_x2_i8_z0_z8_scalar(<vscale x 16 x i8> %unused,
; CONTIGUOUS-NEXT: ldr z0, [sp]
; CONTIGUOUS-NEXT: ldr z1, [sp, #1, mul vl]
; CONTIGUOUS-NEXT: addvl sp, sp, #2
-; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
@@ -206,6 +205,7 @@ define <vscale x 32 x i8> @ldnt1_x2_i8_z0_z8_scalar(<vscale x 16 x i8> %unused,
; CONTIGUOUS-NEXT: ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; CONTIGUOUS-NEXT: addvl sp, sp, #16
; CONTIGUOUS-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
; CONTIGUOUS-NEXT: ret
@@ -299,7 +299,6 @@ define <vscale x 16 x i16> @ldnt1_x2_i16_z0_z8(<vscale x 8 x i16> %unused, <vsca
; CONTIGUOUS-NEXT: ldr z0, [sp]
; CONTIGUOUS-NEXT: ldr z1, [sp, #1, mul vl]
; CONTIGUOUS-NEXT: addvl sp, sp, #2
-; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
@@ -315,6 +314,7 @@ define <vscale x 16 x i16> @ldnt1_x2_i16_z0_z8(<vscale x 8 x i16> %unused, <vsca
; CONTIGUOUS-NEXT: ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; CONTIGUOUS-NEXT: addvl sp, sp, #16
; CONTIGUOUS-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
; CONTIGUOUS-NEXT: ret
@@ -407,7 +407,6 @@ define <vscale x 16 x i16> @ldnt1_x2_i16_z0_z8_scalar(<vscale x 8 x i16> %unused
; CONTIGUOUS-NEXT: ldr z0, [sp]
; CONTIGUOUS-NEXT: ldr z1, [sp, #1, mul vl]
; CONTIGUOUS-NEXT: addvl sp, sp, #2
-; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
@@ -423,6 +422,7 @@ define <vscale x 16 x i16> @ldnt1_x2_i16_z0_z8_scalar(<vscale x 8 x i16> %unused
; CONTIGUOUS-NEXT: ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; CONTIGUOUS-NEXT: addvl sp, sp, #16
; CONTIGUOUS-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
; CONTIGUOUS-NEXT: ret
@@ -516,7 +516,6 @@ define <vscale x 8 x i32> @ldnt1_x2_i32_z0_z8(<vscale x 4 x i32> %unused, <vscal
; CONTIGUOUS-NEXT: ldr z0, [sp]
; CONTIGUOUS-NEXT: ldr z1, [sp, #1, mul vl]
; CONTIGUOUS-NEXT: addvl sp, sp, #2
-; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
@@ -532,6 +531,7 @@ define <vscale x 8 x i32> @ldnt1_x2_i32_z0_z8(<vscale x 4 x i32> %unused, <vscal
; CONTIGUOUS-NEXT: ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; CONTIGUOUS-NEXT: addvl sp, sp, #16
; CONTIGUOUS-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
; CONTIGUOUS-NEXT: ret
@@ -624,7 +624,6 @@ define <vscale x 8 x i32> @ldnt1_x2_i32_z0_z8_scalar(<vscale x 4 x i32> %unused,
; CONTIGUOUS-NEXT: ldr z0, [sp]
; CONTIGUOUS-NEXT: ldr z1, [sp, #1, mul vl]
; CONTIGUOUS-NEXT: addvl sp, sp, #2
-; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
@@ -640,6 +639,7 @@ define <vscale x 8 x i32> @ldnt1_x2_i32_z0_z8_scalar(<vscale x 4 x i32> %unused,
; CONTIGUOUS-NEXT: ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; CONTIGUOUS-NEXT: addvl sp, sp, #16
; CONTIGUOUS-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
; CONTIGUOUS-NEXT: ret
@@ -733,7 +733,6 @@ define <vscale x 4 x i64> @ldnt1_x2_i64_z0_z8(<vscale x 2 x i64> %unused, <vscal
; CONTIGUOUS-NEXT: ldr z0, [sp]
; CONTIGUOUS-NEXT: ldr z1, [sp, #1, mul vl]
; CONTIGUOUS-NEXT: addvl sp, sp, #2
-; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
@@ -749,6 +748,7 @@ define <vscale x 4 x i64> @ldnt1_x2_i64_z0_z8(<vscale x 2 x i64> %unused, <vscal
; CONTIGUOUS-NEXT: ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; CONTIGUOUS-NEXT: addvl sp, sp, #16
; CONTIGUOUS-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
; CONTIGUOUS-NEXT: ret
@@ -841,7 +841,6 @@ define <vscale x 4 x i64> @ldnt1_x2_i64_z0_z8_scalar(<vscale x 2 x i64> %unused,
; CONTIGUOUS-NEXT: ldr z0, [sp]
; CONTIGUOUS-NEXT: ldr z1, [sp, #1, mul vl]
; CONTIGUOUS-NEXT: addvl sp, sp, #2
-; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
@@ -857,6 +856,7 @@ define <vscale x 4 x i64> @ldnt1_x2_i64_z0_z8_scalar(<vscale x 2 x i64> %unused,
; CONTIGUOUS-NEXT: ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; CONTIGUOUS-NEXT: addvl sp, sp, #16
; CONTIGUOUS-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
; CONTIGUOUS-NEXT: ret
@@ -955,7 +955,6 @@ define <vscale x 64 x i8> @ldnt1_x4_i8_z0_z4_z8_z12(<vscale x 16 x i8> %unused,
; CONTIGUOUS-NEXT: ldr z2, [sp, #2, mul vl]
; CONTIGUOUS-NEXT: ldr z3, [sp, #3, mul vl]
; CONTIGUOUS-NEXT: addvl sp, sp, #4
-; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
@@ -970,6 +969,7 @@ define <vscale x 64 x i8> @ldnt1_x4_i8_z0_z4_z8_z12(<vscale x 16 x i8> %unused,
; CONTIGUOUS-NEXT: ldr z11, [sp, #12, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z10, [sp, #13, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z9, [sp, #14, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; CONTIGUOUS-NEXT: addvl sp, sp, #15
; CONTIGUOUS-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
; CONTIGUOUS-NEXT: ret
@@ -1071,7 +1071,6 @@ define <vscale x 64 x i8> @ldnt1_x4_i8_z0_z4_z8_z12_scalar(<vscale x 16 x i8> %u
; CONTIGUOUS-NEXT: ldr z2, [sp, #2, mul vl]
; CONTIGUOUS-NEXT: ldr z3, [sp, #3, mul vl]
; CONTIGUOUS-NEXT: addvl sp, sp, #4
-; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
@@ -1086,6 +1085,7 @@ define <vscale x 64 x i8> @ldnt1_x4_i8_z0_z4_z8_z12_scalar(<vscale x 16 x i8> %u
; CONTIGUOUS-NEXT: ldr z11, [sp, #12, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z10, [sp, #13, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z9, [sp, #14, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; CONTIGUOUS-NEXT: addvl sp, sp, #15
; CONTIGUOUS-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
; CONTIGUOUS-NEXT: ret
@@ -1188,7 +1188,6 @@ define <vscale x 32 x i16> @ldnt1_x4_i16_z0_z4_z8_z12(<vscale x 8 x i16> %unused
; CONTIGUOUS-NEXT: ldr z2, [sp, #2, mul vl]
; CONTIGUOUS-NEXT: ldr z3, [sp, #3, mul vl]
; CONTIGUOUS-NEXT: addvl sp, sp, #4
-; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
@@ -1203,6 +1202,7 @@ define <vscale x 32 x i16> @ldnt1_x4_i16_z0_z4_z8_z12(<vscale x 8 x i16> %unused
; CONTIGUOUS-NEXT: ldr z11, [sp, #12, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z10, [sp, #13, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z9, [sp, #14, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; CONTIGUOUS-NEXT: addvl sp, sp, #15
; CONTIGUOUS-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
; CONTIGUOUS-NEXT: ret
@@ -1304,7 +1304,6 @@ define <vscale x 32 x i16> @ldnt1_x4_i16_z0_z4_z8_z12_scalar(<vscale x 8 x i16>
; CONTIGUOUS-NEXT: ldr z2, [sp, #2, mul vl]
; CONTIGUOUS-NEXT: ldr z3, [sp, #3, mul vl]
; CONTIGUOUS-NEXT: addvl sp, sp, #4
-; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
@@ -1319,6 +1318,7 @@ define <vscale x 32 x i16> @ldnt1_x4_i16_z0_z4_z8_z12_scalar(<vscale x 8 x i16>
; CONTIGUOUS-NEXT: ldr z11, [sp, #12, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z10, [sp, #13, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z9, [sp, #14, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; CONTIGUOUS-NEXT: addvl sp, sp, #15
; CONTIGUOUS-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
; CONTIGUOUS-NEXT: ret
@@ -1421,7 +1421,6 @@ define <vscale x 16 x i32> @ldnt1_x4_i32_z0_z4_z8_z12(<vscale x 4 x i32> %unused
; CONTIGUOUS-NEXT: ldr z2, [sp, #2, mul vl]
; CONTIGUOUS-NEXT: ldr z3, [sp, #3, mul vl]
; CONTIGUOUS-NEXT: addvl sp, sp, #4
-; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
@@ -1436,6 +1435,7 @@ define <vscale x 16 x i32> @ldnt1_x4_i32_z0_z4_z8_z12(<vscale x 4 x i32> %unused
; CONTIGUOUS-NEXT: ldr z11, [sp, #12, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z10, [sp, #13, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z9, [sp, #14, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; CONTIGUOUS-NEXT: addvl sp, sp, #15
; CONTIGUOUS-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
; CONTIGUOUS-NEXT: ret
@@ -1537,7 +1537,6 @@ define <vscale x 16 x i32> @ldnt1_x4_i32_z0_z4_z8_z12_scalar(<vscale x 4 x i32>
; CONTIGUOUS-NEXT: ldr z2, [sp, #2, mul vl]
; CONTIGUOUS-NEXT: ldr z3, [sp, #3, mul vl]
; CONTIGUOUS-NEXT: addvl sp, sp, #4
-; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
@@ -1552,6 +1551,7 @@ define <vscale x 16 x i32> @ldnt1_x4_i32_z0_z4_z8_z12_scalar(<vscale x 4 x i32>
; CONTIGUOUS-NEXT: ldr z11, [sp, #12, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z10, [sp, #13, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z9, [sp, #14, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; CONTIGUOUS-NEXT: addvl sp, sp, #15
; CONTIGUOUS-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
; CONTIGUOUS-NEXT: ret
@@ -1654,7 +1654,6 @@ define <vscale x 8 x i64> @ldnt1_x4_i64_z0_z4_z8_z12(<vscale x 2 x i64> %unused,
; CONTIGUOUS-NEXT: ldr z2, [sp, #2, mul vl]
; CONTIGUOUS-NEXT: ldr z3, [sp, #3, mul vl]
; CONTIGUOUS-NEXT: addvl sp, sp, #4
-; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
@@ -1669,6 +1668,7 @@ define <vscale x 8 x i64> @ldnt1_x4_i64_z0_z4_z8_z12(<vscale x 2 x i64> %unused,
; CONTIGUOUS-NEXT: ldr z11, [sp, #12, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z10, [sp, #13, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z9, [sp, #14, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; CONTIGUOUS-NEXT: addvl sp, sp, #15
; CONTIGUOUS-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
; CONTIGUOUS-NEXT: ret
@@ -1770,7 +1770,6 @@ define <vscale x 8 x i64> @ldnt1_x4_i64_z0_z4_z8_z12_scalar(<vscale x 2 x i64> %
; CONTIGUOUS-NEXT: ldr z2, [sp, #2, mul vl]
; CONTIGUOUS-NEXT: ldr z3, [sp, #3, mul vl]
; CONTIGUOUS-NEXT: addvl sp, sp, #4
-; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
@@ -1785,6 +1784,7 @@ define <vscale x 8 x i64> @ldnt1_x4_i64_z0_z4_z8_z12_scalar(<vscale x 2 x i64> %
; CONTIGUOUS-NEXT: ldr z11, [sp, #12, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z10, [sp, #13, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z9, [sp, #14, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; CONTIGUOUS-NEXT: addvl sp, sp, #15
; CONTIGUOUS-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
; CONTIGUOUS-NEXT: ret
diff --git a/llvm/test/CodeGen/AArch64/stack-probing-sve.ll b/llvm/test/CodeGen/AArch64/stack-probing-sve.ll
index 1ad7870..56d865e 100644
--- a/llvm/test/CodeGen/AArch64/stack-probing-sve.ll
+++ b/llvm/test/CodeGen/AArch64/stack-probing-sve.ll
@@ -380,7 +380,6 @@ define void @sve_16v_1p_csr(<vscale x 4 x float> %a) #0 {
; CHECK-NEXT: .cfi_escape 0x10, 0x4f, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x40, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d15 @ cfa - 16 - 64 * VG
; CHECK-NEXT: //APP
; CHECK-NEXT: //NO_APP
-; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; CHECK-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload
; CHECK-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload
; CHECK-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
@@ -397,6 +396,7 @@ define void @sve_16v_1p_csr(<vscale x 4 x float> %a) #0 {
; CHECK-NEXT: ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload
; CHECK-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload
; CHECK-NEXT: ldr z8, [sp, #16, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; CHECK-NEXT: addvl sp, sp, #17
; CHECK-NEXT: .cfi_def_cfa wsp, 16
; CHECK-NEXT: .cfi_restore z8
@@ -697,10 +697,10 @@ define void @sve_unprobed_area(<vscale x 4 x float> %a, i32 %n) #0 {
; CHECK-NEXT: //NO_APP
; CHECK-NEXT: addvl sp, sp, #4
; CHECK-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 32 * VG
-; CHECK-NEXT: ldr p9, [sp, #7, mul vl] // 2-byte Folded Reload
; CHECK-NEXT: ldr z10, [sp, #1, mul vl] // 16-byte Folded Reload
; CHECK-NEXT: ldr z9, [sp, #2, mul vl] // 16-byte Folded Reload
; CHECK-NEXT: ldr z8, [sp, #3, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr p9, [sp, #7, mul vl] // 2-byte Folded Reload
; CHECK-NEXT: addvl sp, sp, #4
; CHECK-NEXT: .cfi_def_cfa wsp, 16
; CHECK-NEXT: .cfi_restore z8
diff --git a/llvm/test/CodeGen/AArch64/strictfp_f16_abi_promote.ll b/llvm/test/CodeGen/AArch64/strictfp_f16_abi_promote.ll
index 37186cf..9fa5208 100644
--- a/llvm/test/CodeGen/AArch64/strictfp_f16_abi_promote.ll
+++ b/llvm/test/CodeGen/AArch64/strictfp_f16_abi_promote.ll
@@ -70,22 +70,20 @@ define void @v3f16_arg(<3 x half> %arg, ptr %ptr) #0 {
; NOFP16-NEXT: .cfi_offset w22, -32
; NOFP16-NEXT: .cfi_offset w30, -48
; NOFP16-NEXT: mov w21, w0
-; NOFP16-NEXT: and w0, w2, #0xffff
+; NOFP16-NEXT: and w0, w1, #0xffff
; NOFP16-NEXT: mov x19, x3
-; NOFP16-NEXT: mov w20, w1
+; NOFP16-NEXT: mov w20, w2
; NOFP16-NEXT: bl __gnu_h2f_ieee
; NOFP16-NEXT: mov w22, w0
; NOFP16-NEXT: and w0, w21, #0xffff
; NOFP16-NEXT: bl __gnu_h2f_ieee
-; NOFP16-NEXT: mov w21, w0
+; NOFP16-NEXT: mov w8, w0
; NOFP16-NEXT: and w0, w20, #0xffff
+; NOFP16-NEXT: orr x21, x8, x22, lsl #32
; NOFP16-NEXT: bl __gnu_h2f_ieee
-; NOFP16-NEXT: mov w8, w21
-; NOFP16-NEXT: // kill: def $w0 killed $w0 def $x0
-; NOFP16-NEXT: str w22, [x19, #8]
-; NOFP16-NEXT: orr x8, x8, x0, lsl #32
+; NOFP16-NEXT: str x21, [x19]
; NOFP16-NEXT: ldp x22, x21, [sp, #16] // 16-byte Folded Reload
-; NOFP16-NEXT: str x8, [x19]
+; NOFP16-NEXT: str w0, [x19, #8]
; NOFP16-NEXT: ldp x20, x19, [sp, #32] // 16-byte Folded Reload
; NOFP16-NEXT: ldr x30, [sp], #48 // 8-byte Folded Reload
; NOFP16-NEXT: ret
@@ -133,26 +131,107 @@ define void @v4f16_arg(<4 x half> %arg, ptr %ptr) #0 {
ret void
}
-; FIXME:
-; define half @f16_return(float %arg) #0 {
-; %fptrunc = call half @llvm.experimental.constrained.fptrunc.f16.f32(float %arg, metadata !"round.tonearest", metadata !"fpexcept.strict")
-; ret half %fptrunc
-; }
+ define half @f16_return(float %arg) #0 {
+; NOFP16-LABEL: f16_return:
+; NOFP16: // %bb.0:
+; NOFP16-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill
+; NOFP16-NEXT: .cfi_def_cfa_offset 16
+; NOFP16-NEXT: .cfi_offset w30, -16
+; NOFP16-NEXT: bl __gnu_f2h_ieee
+; NOFP16-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload
+; NOFP16-NEXT: ret
+ %fptrunc = call half @llvm.experimental.constrained.fptrunc.f16.f32(float %arg, metadata !"round.tonearest", metadata !"fpexcept.strict")
+ ret half %fptrunc
+ }
-; define <2 x half> @v2f16_return(<2 x float> %arg) #0 {
-; %fptrunc = call <2 x half> @llvm.experimental.constrained.fptrunc.v2f16.v2f32(<2 x float> %arg, metadata !"round.tonearest", metadata !"fpexcept.strict")
-; ret <2 x half> %fptrunc
-; }
+ define <2 x half> @v2f16_return(<2 x float> %arg) #0 {
+; NOFP16-LABEL: v2f16_return:
+; NOFP16: // %bb.0:
+; NOFP16-NEXT: str x30, [sp, #-32]! // 8-byte Folded Spill
+; NOFP16-NEXT: stp x20, x19, [sp, #16] // 16-byte Folded Spill
+; NOFP16-NEXT: .cfi_def_cfa_offset 32
+; NOFP16-NEXT: .cfi_offset w19, -8
+; NOFP16-NEXT: .cfi_offset w20, -16
+; NOFP16-NEXT: .cfi_offset w30, -32
+; NOFP16-NEXT: mov w19, w0
+; NOFP16-NEXT: mov w0, w1
+; NOFP16-NEXT: bl __gnu_f2h_ieee
+; NOFP16-NEXT: mov w20, w0
+; NOFP16-NEXT: mov w0, w19
+; NOFP16-NEXT: bl __gnu_f2h_ieee
+; NOFP16-NEXT: mov w1, w20
+; NOFP16-NEXT: ldp x20, x19, [sp, #16] // 16-byte Folded Reload
+; NOFP16-NEXT: ldr x30, [sp], #32 // 8-byte Folded Reload
+; NOFP16-NEXT: ret
+ %fptrunc = call <2 x half> @llvm.experimental.constrained.fptrunc.v2f16.v2f32(<2 x float> %arg, metadata !"round.tonearest", metadata !"fpexcept.strict")
+ ret <2 x half> %fptrunc
+ }
-; define <3 x half> @v3f16_return(<3 x float> %arg) #0 {
-; %fptrunc = call <3 x half> @llvm.experimental.constrained.fptrunc.v3f16.v3f32(<3 x float> %arg, metadata !"round.tonearest", metadata !"fpexcept.strict")
-; ret <3 x half> %fptrunc
-; }
+ define <3 x half> @v3f16_return(<3 x float> %arg) #0 {
+; NOFP16-LABEL: v3f16_return:
+; NOFP16: // %bb.0:
+; NOFP16-NEXT: stp x30, x21, [sp, #-32]! // 16-byte Folded Spill
+; NOFP16-NEXT: stp x20, x19, [sp, #16] // 16-byte Folded Spill
+; NOFP16-NEXT: .cfi_def_cfa_offset 32
+; NOFP16-NEXT: .cfi_offset w19, -8
+; NOFP16-NEXT: .cfi_offset w20, -16
+; NOFP16-NEXT: .cfi_offset w21, -24
+; NOFP16-NEXT: .cfi_offset w30, -32
+; NOFP16-NEXT: mov w20, w0
+; NOFP16-NEXT: mov w0, w2
+; NOFP16-NEXT: mov w19, w1
+; NOFP16-NEXT: bl __gnu_f2h_ieee
+; NOFP16-NEXT: mov w21, w0
+; NOFP16-NEXT: mov w0, w19
+; NOFP16-NEXT: bl __gnu_f2h_ieee
+; NOFP16-NEXT: mov w19, w0
+; NOFP16-NEXT: mov w0, w20
+; NOFP16-NEXT: bl __gnu_f2h_ieee
+; NOFP16-NEXT: mov w1, w19
+; NOFP16-NEXT: ldp x20, x19, [sp, #16] // 16-byte Folded Reload
+; NOFP16-NEXT: mov w2, w21
+; NOFP16-NEXT: ldp x30, x21, [sp], #32 // 16-byte Folded Reload
+; NOFP16-NEXT: ret
+ %fptrunc = call <3 x half> @llvm.experimental.constrained.fptrunc.v3f16.v3f32(<3 x float> %arg, metadata !"round.tonearest", metadata !"fpexcept.strict")
+ ret <3 x half> %fptrunc
+ }
-; define <4 x half> @v4f16_return(<4 x float> %arg) #0 {
-; %fptrunc = call <4 x half> @llvm.experimental.constrained.fptrunc.v4f16.v4f32(<4 x float> %arg, metadata !"round.tonearest", metadata !"fpexcept.strict")
-; ret <4 x half> %fptrunc
-; }
+ define <4 x half> @v4f16_return(<4 x float> %arg) #0 {
+; NOFP16-LABEL: v4f16_return:
+; NOFP16: // %bb.0:
+; NOFP16-NEXT: str x30, [sp, #-48]! // 8-byte Folded Spill
+; NOFP16-NEXT: stp x22, x21, [sp, #16] // 16-byte Folded Spill
+; NOFP16-NEXT: stp x20, x19, [sp, #32] // 16-byte Folded Spill
+; NOFP16-NEXT: .cfi_def_cfa_offset 48
+; NOFP16-NEXT: .cfi_offset w19, -8
+; NOFP16-NEXT: .cfi_offset w20, -16
+; NOFP16-NEXT: .cfi_offset w21, -24
+; NOFP16-NEXT: .cfi_offset w22, -32
+; NOFP16-NEXT: .cfi_offset w30, -48
+; NOFP16-NEXT: mov w21, w0
+; NOFP16-NEXT: mov w0, w3
+; NOFP16-NEXT: mov w19, w2
+; NOFP16-NEXT: mov w20, w1
+; NOFP16-NEXT: bl __gnu_f2h_ieee
+; NOFP16-NEXT: mov w22, w0
+; NOFP16-NEXT: mov w0, w19
+; NOFP16-NEXT: bl __gnu_f2h_ieee
+; NOFP16-NEXT: mov w19, w0
+; NOFP16-NEXT: mov w0, w20
+; NOFP16-NEXT: bl __gnu_f2h_ieee
+; NOFP16-NEXT: mov w20, w0
+; NOFP16-NEXT: mov w0, w21
+; NOFP16-NEXT: bl __gnu_f2h_ieee
+; NOFP16-NEXT: mov w1, w20
+; NOFP16-NEXT: mov w2, w19
+; NOFP16-NEXT: mov w3, w22
+; NOFP16-NEXT: ldp x20, x19, [sp, #32] // 16-byte Folded Reload
+; NOFP16-NEXT: ldp x22, x21, [sp, #16] // 16-byte Folded Reload
+; NOFP16-NEXT: ldr x30, [sp], #48 // 8-byte Folded Reload
+; NOFP16-NEXT: ret
+ %fptrunc = call <4 x half> @llvm.experimental.constrained.fptrunc.v4f16.v4f32(<4 x float> %arg, metadata !"round.tonearest", metadata !"fpexcept.strict")
+ ret <4 x half> %fptrunc
+ }
; FIXME:
; define void @outgoing_f16_arg(ptr %ptr) #0 {
@@ -182,46 +261,17 @@ define void @v4f16_arg(<4 x half> %arg, ptr %ptr) #0 {
define void @outgoing_v4f16_return(ptr %ptr) #0 {
; NOFP16-LABEL: outgoing_v4f16_return:
; NOFP16: // %bb.0:
-; NOFP16-NEXT: stp x30, x23, [sp, #-48]! // 16-byte Folded Spill
-; NOFP16-NEXT: stp x22, x21, [sp, #16] // 16-byte Folded Spill
-; NOFP16-NEXT: stp x20, x19, [sp, #32] // 16-byte Folded Spill
-; NOFP16-NEXT: .cfi_def_cfa_offset 48
+; NOFP16-NEXT: stp x30, x19, [sp, #-16]! // 16-byte Folded Spill
+; NOFP16-NEXT: .cfi_def_cfa_offset 16
; NOFP16-NEXT: .cfi_offset w19, -8
-; NOFP16-NEXT: .cfi_offset w20, -16
-; NOFP16-NEXT: .cfi_offset w21, -24
-; NOFP16-NEXT: .cfi_offset w22, -32
-; NOFP16-NEXT: .cfi_offset w23, -40
-; NOFP16-NEXT: .cfi_offset w30, -48
+; NOFP16-NEXT: .cfi_offset w30, -16
; NOFP16-NEXT: mov x19, x0
; NOFP16-NEXT: bl v4f16_result
-; NOFP16-NEXT: and w0, w0, #0xffff
-; NOFP16-NEXT: mov w20, w1
-; NOFP16-NEXT: mov w21, w2
-; NOFP16-NEXT: mov w22, w3
-; NOFP16-NEXT: bl __gnu_h2f_ieee
-; NOFP16-NEXT: mov w23, w0
-; NOFP16-NEXT: and w0, w20, #0xffff
-; NOFP16-NEXT: bl __gnu_h2f_ieee
-; NOFP16-NEXT: mov w20, w0
-; NOFP16-NEXT: and w0, w21, #0xffff
-; NOFP16-NEXT: bl __gnu_h2f_ieee
-; NOFP16-NEXT: mov w21, w0
-; NOFP16-NEXT: and w0, w22, #0xffff
-; NOFP16-NEXT: bl __gnu_h2f_ieee
-; NOFP16-NEXT: bl __gnu_f2h_ieee
-; NOFP16-NEXT: strh w0, [x19, #6]
-; NOFP16-NEXT: mov w0, w21
-; NOFP16-NEXT: bl __gnu_f2h_ieee
-; NOFP16-NEXT: strh w0, [x19, #4]
-; NOFP16-NEXT: mov w0, w20
-; NOFP16-NEXT: bl __gnu_f2h_ieee
-; NOFP16-NEXT: strh w0, [x19, #2]
-; NOFP16-NEXT: mov w0, w23
-; NOFP16-NEXT: bl __gnu_f2h_ieee
+; NOFP16-NEXT: strh w2, [x19, #4]
+; NOFP16-NEXT: strh w3, [x19, #6]
+; NOFP16-NEXT: strh w1, [x19, #2]
; NOFP16-NEXT: strh w0, [x19]
-; NOFP16-NEXT: ldp x20, x19, [sp, #32] // 16-byte Folded Reload
-; NOFP16-NEXT: ldp x22, x21, [sp, #16] // 16-byte Folded Reload
-; NOFP16-NEXT: ldp x30, x23, [sp], #48 // 16-byte Folded Reload
+; NOFP16-NEXT: ldp x30, x19, [sp], #16 // 16-byte Folded Reload
; NOFP16-NEXT: ret
%val = call <4 x half> @v4f16_result()
store <4 x half> %val, ptr %ptr
@@ -231,82 +281,21 @@ define void @outgoing_v4f16_return(ptr %ptr) #0 {
define void @outgoing_v8f16_return(ptr %ptr) #0 {
; NOFP16-LABEL: outgoing_v8f16_return:
; NOFP16: // %bb.0:
-; NOFP16-NEXT: stp x30, x27, [sp, #-80]! // 16-byte Folded Spill
-; NOFP16-NEXT: stp x26, x25, [sp, #16] // 16-byte Folded Spill
-; NOFP16-NEXT: stp x24, x23, [sp, #32] // 16-byte Folded Spill
-; NOFP16-NEXT: stp x22, x21, [sp, #48] // 16-byte Folded Spill
-; NOFP16-NEXT: stp x20, x19, [sp, #64] // 16-byte Folded Spill
-; NOFP16-NEXT: .cfi_def_cfa_offset 80
+; NOFP16-NEXT: stp x30, x19, [sp, #-16]! // 16-byte Folded Spill
+; NOFP16-NEXT: .cfi_def_cfa_offset 16
; NOFP16-NEXT: .cfi_offset w19, -8
-; NOFP16-NEXT: .cfi_offset w20, -16
-; NOFP16-NEXT: .cfi_offset w21, -24
-; NOFP16-NEXT: .cfi_offset w22, -32
-; NOFP16-NEXT: .cfi_offset w23, -40
-; NOFP16-NEXT: .cfi_offset w24, -48
-; NOFP16-NEXT: .cfi_offset w25, -56
-; NOFP16-NEXT: .cfi_offset w26, -64
-; NOFP16-NEXT: .cfi_offset w27, -72
-; NOFP16-NEXT: .cfi_offset w30, -80
+; NOFP16-NEXT: .cfi_offset w30, -16
; NOFP16-NEXT: mov x19, x0
; NOFP16-NEXT: bl v8f16_result
-; NOFP16-NEXT: and w0, w0, #0xffff
-; NOFP16-NEXT: mov w21, w1
-; NOFP16-NEXT: mov w22, w2
-; NOFP16-NEXT: mov w23, w3
-; NOFP16-NEXT: mov w24, w4
-; NOFP16-NEXT: mov w25, w5
-; NOFP16-NEXT: mov w26, w6
-; NOFP16-NEXT: mov w27, w7
-; NOFP16-NEXT: bl __gnu_h2f_ieee
-; NOFP16-NEXT: mov w20, w0
-; NOFP16-NEXT: and w0, w21, #0xffff
-; NOFP16-NEXT: bl __gnu_h2f_ieee
-; NOFP16-NEXT: mov w21, w0
-; NOFP16-NEXT: and w0, w22, #0xffff
-; NOFP16-NEXT: bl __gnu_h2f_ieee
-; NOFP16-NEXT: mov w22, w0
-; NOFP16-NEXT: and w0, w23, #0xffff
-; NOFP16-NEXT: bl __gnu_h2f_ieee
-; NOFP16-NEXT: mov w23, w0
-; NOFP16-NEXT: and w0, w24, #0xffff
-; NOFP16-NEXT: bl __gnu_h2f_ieee
-; NOFP16-NEXT: mov w24, w0
-; NOFP16-NEXT: and w0, w25, #0xffff
-; NOFP16-NEXT: bl __gnu_h2f_ieee
-; NOFP16-NEXT: mov w25, w0
-; NOFP16-NEXT: and w0, w26, #0xffff
-; NOFP16-NEXT: bl __gnu_h2f_ieee
-; NOFP16-NEXT: mov w26, w0
-; NOFP16-NEXT: and w0, w27, #0xffff
-; NOFP16-NEXT: bl __gnu_h2f_ieee
-; NOFP16-NEXT: bl __gnu_f2h_ieee
-; NOFP16-NEXT: strh w0, [x19, #14]
-; NOFP16-NEXT: mov w0, w26
-; NOFP16-NEXT: bl __gnu_f2h_ieee
-; NOFP16-NEXT: strh w0, [x19, #12]
-; NOFP16-NEXT: mov w0, w25
-; NOFP16-NEXT: bl __gnu_f2h_ieee
-; NOFP16-NEXT: strh w0, [x19, #10]
-; NOFP16-NEXT: mov w0, w24
-; NOFP16-NEXT: bl __gnu_f2h_ieee
-; NOFP16-NEXT: strh w0, [x19, #8]
-; NOFP16-NEXT: mov w0, w23
-; NOFP16-NEXT: bl __gnu_f2h_ieee
-; NOFP16-NEXT: strh w0, [x19, #6]
-; NOFP16-NEXT: mov w0, w22
-; NOFP16-NEXT: bl __gnu_f2h_ieee
-; NOFP16-NEXT: strh w0, [x19, #4]
-; NOFP16-NEXT: mov w0, w21
-; NOFP16-NEXT: bl __gnu_f2h_ieee
-; NOFP16-NEXT: strh w0, [x19, #2]
-; NOFP16-NEXT: mov w0, w20
-; NOFP16-NEXT: bl __gnu_f2h_ieee
+; NOFP16-NEXT: strh w5, [x19, #10]
+; NOFP16-NEXT: strh w7, [x19, #14]
+; NOFP16-NEXT: strh w6, [x19, #12]
+; NOFP16-NEXT: strh w4, [x19, #8]
+; NOFP16-NEXT: strh w3, [x19, #6]
+; NOFP16-NEXT: strh w2, [x19, #4]
+; NOFP16-NEXT: strh w1, [x19, #2]
; NOFP16-NEXT: strh w0, [x19]
-; NOFP16-NEXT: ldp x20, x19, [sp, #64] // 16-byte Folded Reload
-; NOFP16-NEXT: ldp x22, x21, [sp, #48] // 16-byte Folded Reload
-; NOFP16-NEXT: ldp x24, x23, [sp, #32] // 16-byte Folded Reload
-; NOFP16-NEXT: ldp x26, x25, [sp, #16] // 16-byte Folded Reload
-; NOFP16-NEXT: ldp x30, x27, [sp], #80 // 16-byte Folded Reload
+; NOFP16-NEXT: ldp x30, x19, [sp], #16 // 16-byte Folded Reload
; NOFP16-NEXT: ret
%val = call <8 x half> @v8f16_result()
store <8 x half> %val, ptr %ptr
diff --git a/llvm/test/CodeGen/AArch64/sve-alloca.ll b/llvm/test/CodeGen/AArch64/sve-alloca.ll
index 47e49b8..d227538 100644
--- a/llvm/test/CodeGen/AArch64/sve-alloca.ll
+++ b/llvm/test/CodeGen/AArch64/sve-alloca.ll
@@ -66,30 +66,30 @@ define void @foo(<vscale x 4 x i64> %dst, i1 %cond) {
; CHECK-NEXT: st1d { z0.d }, p0, [x0]
; CHECK-NEXT: bl bar
; CHECK-NEXT: addvl sp, x29, #-18
-; CHECK-NEXT: ldr p15, [sp, #4, mul vl] // 2-byte Folded Reload
; CHECK-NEXT: ldr z23, [sp, #2, mul vl] // 16-byte Folded Reload
; CHECK-NEXT: ldr z22, [sp, #3, mul vl] // 16-byte Folded Reload
-; CHECK-NEXT: ldr p14, [sp, #5, mul vl] // 2-byte Folded Reload
; CHECK-NEXT: ldr z21, [sp, #4, mul vl] // 16-byte Folded Reload
; CHECK-NEXT: ldr z20, [sp, #5, mul vl] // 16-byte Folded Reload
-; CHECK-NEXT: ldr p13, [sp, #6, mul vl] // 2-byte Folded Reload
; CHECK-NEXT: ldr z19, [sp, #6, mul vl] // 16-byte Folded Reload
; CHECK-NEXT: ldr z18, [sp, #7, mul vl] // 16-byte Folded Reload
-; CHECK-NEXT: ldr p12, [sp, #7, mul vl] // 2-byte Folded Reload
; CHECK-NEXT: ldr z17, [sp, #8, mul vl] // 16-byte Folded Reload
; CHECK-NEXT: ldr z16, [sp, #9, mul vl] // 16-byte Folded Reload
-; CHECK-NEXT: ldr p11, [sp, #8, mul vl] // 2-byte Folded Reload
; CHECK-NEXT: ldr z15, [sp, #10, mul vl] // 16-byte Folded Reload
; CHECK-NEXT: ldr z14, [sp, #11, mul vl] // 16-byte Folded Reload
-; CHECK-NEXT: ldr p10, [sp, #9, mul vl] // 2-byte Folded Reload
; CHECK-NEXT: ldr z13, [sp, #12, mul vl] // 16-byte Folded Reload
; CHECK-NEXT: ldr z12, [sp, #13, mul vl] // 16-byte Folded Reload
-; CHECK-NEXT: ldr p9, [sp, #10, mul vl] // 2-byte Folded Reload
; CHECK-NEXT: ldr z11, [sp, #14, mul vl] // 16-byte Folded Reload
; CHECK-NEXT: ldr z10, [sp, #15, mul vl] // 16-byte Folded Reload
-; CHECK-NEXT: ldr p8, [sp, #11, mul vl] // 2-byte Folded Reload
; CHECK-NEXT: ldr z9, [sp, #16, mul vl] // 16-byte Folded Reload
; CHECK-NEXT: ldr z8, [sp, #17, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr p15, [sp, #4, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: ldr p14, [sp, #5, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: ldr p13, [sp, #6, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: ldr p12, [sp, #7, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: ldr p11, [sp, #8, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: ldr p10, [sp, #9, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: ldr p9, [sp, #10, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: ldr p8, [sp, #11, mul vl] // 2-byte Folded Reload
; CHECK-NEXT: ldr p7, [sp, #12, mul vl] // 2-byte Folded Reload
; CHECK-NEXT: ldr p6, [sp, #13, mul vl] // 2-byte Folded Reload
; CHECK-NEXT: ldr p5, [sp, #14, mul vl] // 2-byte Folded Reload
diff --git a/llvm/test/CodeGen/AArch64/sve-calling-convention-mixed.ll b/llvm/test/CodeGen/AArch64/sve-calling-convention-mixed.ll
index 9851583..3965af6 100644
--- a/llvm/test/CodeGen/AArch64/sve-calling-convention-mixed.ll
+++ b/llvm/test/CodeGen/AArch64/sve-calling-convention-mixed.ll
@@ -567,30 +567,30 @@ define <vscale x 4 x float> @sve_caller_non_sve_callee_high_range(<vscale x 4 x
; CHECK-NEXT: bl non_sve_callee_high_range
; CHECK-NEXT: ldr z0, [sp] // 16-byte Folded Reload
; CHECK-NEXT: addvl sp, sp, #3
-; CHECK-NEXT: ldr p15, [sp, #4, mul vl] // 2-byte Folded Reload
; CHECK-NEXT: ldr z23, [sp, #2, mul vl] // 16-byte Folded Reload
; CHECK-NEXT: ldr z22, [sp, #3, mul vl] // 16-byte Folded Reload
-; CHECK-NEXT: ldr p14, [sp, #5, mul vl] // 2-byte Folded Reload
; CHECK-NEXT: ldr z21, [sp, #4, mul vl] // 16-byte Folded Reload
; CHECK-NEXT: ldr z20, [sp, #5, mul vl] // 16-byte Folded Reload
-; CHECK-NEXT: ldr p13, [sp, #6, mul vl] // 2-byte Folded Reload
; CHECK-NEXT: ldr z19, [sp, #6, mul vl] // 16-byte Folded Reload
; CHECK-NEXT: ldr z18, [sp, #7, mul vl] // 16-byte Folded Reload
-; CHECK-NEXT: ldr p12, [sp, #7, mul vl] // 2-byte Folded Reload
; CHECK-NEXT: ldr z17, [sp, #8, mul vl] // 16-byte Folded Reload
; CHECK-NEXT: ldr z16, [sp, #9, mul vl] // 16-byte Folded Reload
-; CHECK-NEXT: ldr p11, [sp, #8, mul vl] // 2-byte Folded Reload
; CHECK-NEXT: ldr z15, [sp, #10, mul vl] // 16-byte Folded Reload
; CHECK-NEXT: ldr z14, [sp, #11, mul vl] // 16-byte Folded Reload
-; CHECK-NEXT: ldr p10, [sp, #9, mul vl] // 2-byte Folded Reload
; CHECK-NEXT: ldr z13, [sp, #12, mul vl] // 16-byte Folded Reload
; CHECK-NEXT: ldr z12, [sp, #13, mul vl] // 16-byte Folded Reload
-; CHECK-NEXT: ldr p9, [sp, #10, mul vl] // 2-byte Folded Reload
; CHECK-NEXT: ldr z11, [sp, #14, mul vl] // 16-byte Folded Reload
; CHECK-NEXT: ldr z10, [sp, #15, mul vl] // 16-byte Folded Reload
-; CHECK-NEXT: ldr p8, [sp, #11, mul vl] // 2-byte Folded Reload
; CHECK-NEXT: ldr z9, [sp, #16, mul vl] // 16-byte Folded Reload
; CHECK-NEXT: ldr z8, [sp, #17, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr p15, [sp, #4, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: ldr p14, [sp, #5, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: ldr p13, [sp, #6, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: ldr p12, [sp, #7, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: ldr p11, [sp, #8, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: ldr p10, [sp, #9, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: ldr p9, [sp, #10, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: ldr p8, [sp, #11, mul vl] // 2-byte Folded Reload
; CHECK-NEXT: ldr p7, [sp, #12, mul vl] // 2-byte Folded Reload
; CHECK-NEXT: ldr p6, [sp, #13, mul vl] // 2-byte Folded Reload
; CHECK-NEXT: ldr p5, [sp, #14, mul vl] // 2-byte Folded Reload
@@ -659,30 +659,30 @@ define <vscale x 4 x float> @sve_ret_caller_non_sve_callee_high_range() {
; CHECK-NEXT: fmov s7, #7.00000000
; CHECK-NEXT: bl non_sve_callee_high_range
; CHECK-NEXT: addvl sp, sp, #2
-; CHECK-NEXT: ldr p15, [sp, #4, mul vl] // 2-byte Folded Reload
; CHECK-NEXT: ldr z23, [sp, #2, mul vl] // 16-byte Folded Reload
; CHECK-NEXT: ldr z22, [sp, #3, mul vl] // 16-byte Folded Reload
-; CHECK-NEXT: ldr p14, [sp, #5, mul vl] // 2-byte Folded Reload
; CHECK-NEXT: ldr z21, [sp, #4, mul vl] // 16-byte Folded Reload
; CHECK-NEXT: ldr z20, [sp, #5, mul vl] // 16-byte Folded Reload
-; CHECK-NEXT: ldr p13, [sp, #6, mul vl] // 2-byte Folded Reload
; CHECK-NEXT: ldr z19, [sp, #6, mul vl] // 16-byte Folded Reload
; CHECK-NEXT: ldr z18, [sp, #7, mul vl] // 16-byte Folded Reload
-; CHECK-NEXT: ldr p12, [sp, #7, mul vl] // 2-byte Folded Reload
; CHECK-NEXT: ldr z17, [sp, #8, mul vl] // 16-byte Folded Reload
; CHECK-NEXT: ldr z16, [sp, #9, mul vl] // 16-byte Folded Reload
-; CHECK-NEXT: ldr p11, [sp, #8, mul vl] // 2-byte Folded Reload
; CHECK-NEXT: ldr z15, [sp, #10, mul vl] // 16-byte Folded Reload
; CHECK-NEXT: ldr z14, [sp, #11, mul vl] // 16-byte Folded Reload
-; CHECK-NEXT: ldr p10, [sp, #9, mul vl] // 2-byte Folded Reload
; CHECK-NEXT: ldr z13, [sp, #12, mul vl] // 16-byte Folded Reload
; CHECK-NEXT: ldr z12, [sp, #13, mul vl] // 16-byte Folded Reload
-; CHECK-NEXT: ldr p9, [sp, #10, mul vl] // 2-byte Folded Reload
; CHECK-NEXT: ldr z11, [sp, #14, mul vl] // 16-byte Folded Reload
; CHECK-NEXT: ldr z10, [sp, #15, mul vl] // 16-byte Folded Reload
-; CHECK-NEXT: ldr p8, [sp, #11, mul vl] // 2-byte Folded Reload
; CHECK-NEXT: ldr z9, [sp, #16, mul vl] // 16-byte Folded Reload
; CHECK-NEXT: ldr z8, [sp, #17, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr p15, [sp, #4, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: ldr p14, [sp, #5, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: ldr p13, [sp, #6, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: ldr p12, [sp, #7, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: ldr p11, [sp, #8, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: ldr p10, [sp, #9, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: ldr p9, [sp, #10, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: ldr p8, [sp, #11, mul vl] // 2-byte Folded Reload
; CHECK-NEXT: ldr p7, [sp, #12, mul vl] // 2-byte Folded Reload
; CHECK-NEXT: ldr p6, [sp, #13, mul vl] // 2-byte Folded Reload
; CHECK-NEXT: ldr p5, [sp, #14, mul vl] // 2-byte Folded Reload
diff --git a/llvm/test/CodeGen/AArch64/sve-tailcall.ll b/llvm/test/CodeGen/AArch64/sve-tailcall.ll
index f32c80d..4ddf007 100644
--- a/llvm/test/CodeGen/AArch64/sve-tailcall.ll
+++ b/llvm/test/CodeGen/AArch64/sve-tailcall.ll
@@ -83,30 +83,30 @@ define i32 @sve_caller_non_sve_callee(<vscale x 4 x i32> %arg) nounwind {
; CHECK-NEXT: //APP
; CHECK-NEXT: //NO_APP
; CHECK-NEXT: bl non_sve_callee
-; CHECK-NEXT: ldr p15, [sp, #4, mul vl] // 2-byte Folded Reload
; CHECK-NEXT: ldr z23, [sp, #2, mul vl] // 16-byte Folded Reload
; CHECK-NEXT: ldr z22, [sp, #3, mul vl] // 16-byte Folded Reload
-; CHECK-NEXT: ldr p14, [sp, #5, mul vl] // 2-byte Folded Reload
; CHECK-NEXT: ldr z21, [sp, #4, mul vl] // 16-byte Folded Reload
; CHECK-NEXT: ldr z20, [sp, #5, mul vl] // 16-byte Folded Reload
-; CHECK-NEXT: ldr p13, [sp, #6, mul vl] // 2-byte Folded Reload
; CHECK-NEXT: ldr z19, [sp, #6, mul vl] // 16-byte Folded Reload
; CHECK-NEXT: ldr z18, [sp, #7, mul vl] // 16-byte Folded Reload
-; CHECK-NEXT: ldr p12, [sp, #7, mul vl] // 2-byte Folded Reload
; CHECK-NEXT: ldr z17, [sp, #8, mul vl] // 16-byte Folded Reload
; CHECK-NEXT: ldr z16, [sp, #9, mul vl] // 16-byte Folded Reload
-; CHECK-NEXT: ldr p11, [sp, #8, mul vl] // 2-byte Folded Reload
; CHECK-NEXT: ldr z15, [sp, #10, mul vl] // 16-byte Folded Reload
; CHECK-NEXT: ldr z14, [sp, #11, mul vl] // 16-byte Folded Reload
-; CHECK-NEXT: ldr p10, [sp, #9, mul vl] // 2-byte Folded Reload
; CHECK-NEXT: ldr z13, [sp, #12, mul vl] // 16-byte Folded Reload
; CHECK-NEXT: ldr z12, [sp, #13, mul vl] // 16-byte Folded Reload
-; CHECK-NEXT: ldr p9, [sp, #10, mul vl] // 2-byte Folded Reload
; CHECK-NEXT: ldr z11, [sp, #14, mul vl] // 16-byte Folded Reload
; CHECK-NEXT: ldr z10, [sp, #15, mul vl] // 16-byte Folded Reload
-; CHECK-NEXT: ldr p8, [sp, #11, mul vl] // 2-byte Folded Reload
; CHECK-NEXT: ldr z9, [sp, #16, mul vl] // 16-byte Folded Reload
; CHECK-NEXT: ldr z8, [sp, #17, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr p15, [sp, #4, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: ldr p14, [sp, #5, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: ldr p13, [sp, #6, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: ldr p12, [sp, #7, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: ldr p11, [sp, #8, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: ldr p10, [sp, #9, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: ldr p9, [sp, #10, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: ldr p8, [sp, #11, mul vl] // 2-byte Folded Reload
; CHECK-NEXT: ldr p7, [sp, #12, mul vl] // 2-byte Folded Reload
; CHECK-NEXT: ldr p6, [sp, #13, mul vl] // 2-byte Folded Reload
; CHECK-NEXT: ldr p5, [sp, #14, mul vl] // 2-byte Folded Reload
@@ -158,30 +158,30 @@ define i32 @sve_caller_non_sve_callee_fastcc(<vscale x 4 x i32> %arg) nounwind {
; CHECK-NEXT: //APP
; CHECK-NEXT: //NO_APP
; CHECK-NEXT: bl non_sve_callee
-; CHECK-NEXT: ldr p15, [sp, #4, mul vl] // 2-byte Folded Reload
; CHECK-NEXT: ldr z23, [sp, #2, mul vl] // 16-byte Folded Reload
; CHECK-NEXT: ldr z22, [sp, #3, mul vl] // 16-byte Folded Reload
-; CHECK-NEXT: ldr p14, [sp, #5, mul vl] // 2-byte Folded Reload
; CHECK-NEXT: ldr z21, [sp, #4, mul vl] // 16-byte Folded Reload
; CHECK-NEXT: ldr z20, [sp, #5, mul vl] // 16-byte Folded Reload
-; CHECK-NEXT: ldr p13, [sp, #6, mul vl] // 2-byte Folded Reload
; CHECK-NEXT: ldr z19, [sp, #6, mul vl] // 16-byte Folded Reload
; CHECK-NEXT: ldr z18, [sp, #7, mul vl] // 16-byte Folded Reload
-; CHECK-NEXT: ldr p12, [sp, #7, mul vl] // 2-byte Folded Reload
; CHECK-NEXT: ldr z17, [sp, #8, mul vl] // 16-byte Folded Reload
; CHECK-NEXT: ldr z16, [sp, #9, mul vl] // 16-byte Folded Reload
-; CHECK-NEXT: ldr p11, [sp, #8, mul vl] // 2-byte Folded Reload
; CHECK-NEXT: ldr z15, [sp, #10, mul vl] // 16-byte Folded Reload
; CHECK-NEXT: ldr z14, [sp, #11, mul vl] // 16-byte Folded Reload
-; CHECK-NEXT: ldr p10, [sp, #9, mul vl] // 2-byte Folded Reload
; CHECK-NEXT: ldr z13, [sp, #12, mul vl] // 16-byte Folded Reload
; CHECK-NEXT: ldr z12, [sp, #13, mul vl] // 16-byte Folded Reload
-; CHECK-NEXT: ldr p9, [sp, #10, mul vl] // 2-byte Folded Reload
; CHECK-NEXT: ldr z11, [sp, #14, mul vl] // 16-byte Folded Reload
; CHECK-NEXT: ldr z10, [sp, #15, mul vl] // 16-byte Folded Reload
-; CHECK-NEXT: ldr p8, [sp, #11, mul vl] // 2-byte Folded Reload
; CHECK-NEXT: ldr z9, [sp, #16, mul vl] // 16-byte Folded Reload
; CHECK-NEXT: ldr z8, [sp, #17, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr p15, [sp, #4, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: ldr p14, [sp, #5, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: ldr p13, [sp, #6, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: ldr p12, [sp, #7, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: ldr p11, [sp, #8, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: ldr p10, [sp, #9, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: ldr p9, [sp, #10, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: ldr p8, [sp, #11, mul vl] // 2-byte Folded Reload
; CHECK-NEXT: ldr p7, [sp, #12, mul vl] // 2-byte Folded Reload
; CHECK-NEXT: ldr p6, [sp, #13, mul vl] // 2-byte Folded Reload
; CHECK-NEXT: ldr p5, [sp, #14, mul vl] // 2-byte Folded Reload
diff --git a/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-pmov-to-pred.ll b/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-pmov-to-pred.ll
index 7cae1d2..a592dcd 100644
--- a/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-pmov-to-pred.ll
+++ b/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-pmov-to-pred.ll
@@ -4,12 +4,7 @@
define <vscale x 16 x i1> @test_pmov_to_pred_i8(<vscale x 16 x i8> %zn) {
; CHECK-LABEL: test_pmov_to_pred_i8:
; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill
-; CHECK-NEXT: .cfi_def_cfa_offset 16
-; CHECK-NEXT: .cfi_offset w30, -16
-; CHECK-NEXT: mov w0, wzr
-; CHECK-NEXT: bl llvm.aarch64.sve.pmov.to.pred.lane.nxv16i8
-; CHECK-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: pmov p0.b, z0
; CHECK-NEXT: ret
entry:
%res = call <vscale x 16 x i1> @llvm.aarch64.sve.pmov.to.pred.lane.nxv16i8(<vscale x 16 x i8> %zn, i32 0)
@@ -19,27 +14,10 @@ define <vscale x 16 x i1> @test_pmov_to_pred_i8(<vscale x 16 x i8> %zn) {
define <vscale x 8 x i1> @test_pmov_to_pred_i16(<vscale x 8 x i16> %zn) {
; CHECK-LABEL: test_pmov_to_pred_i16:
; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: stp x29, x30, [sp, #-16]! // 16-byte Folded Spill
-; CHECK-NEXT: addvl sp, sp, #-2
-; CHECK-NEXT: str p4, [sp, #7, mul vl] // 2-byte Folded Spill
-; CHECK-NEXT: str z8, [sp, #1, mul vl] // 16-byte Folded Spill
-; CHECK-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 16 * VG
-; CHECK-NEXT: .cfi_offset w30, -8
-; CHECK-NEXT: .cfi_offset w29, -16
-; CHECK-NEXT: .cfi_escape 0x10, 0x48, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x78, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d8 @ cfa - 16 - 8 * VG
-; CHECK-NEXT: mov w0, wzr
-; CHECK-NEXT: mov z8.d, z0.d
-; CHECK-NEXT: bl llvm.aarch64.sve.pmov.to.pred.lane.nxv8i16
-; CHECK-NEXT: mov z0.d, z8.d
-; CHECK-NEXT: mov w0, #1 // =0x1
-; CHECK-NEXT: mov p4.b, p0.b
-; CHECK-NEXT: bl llvm.aarch64.sve.pmov.to.pred.lane.nxv8i16
-; CHECK-NEXT: ptrue p1.h
-; CHECK-NEXT: ldr z8, [sp, #1, mul vl] // 16-byte Folded Reload
-; CHECK-NEXT: eor p0.b, p1/z, p4.b, p0.b
-; CHECK-NEXT: ldr p4, [sp, #7, mul vl] // 2-byte Folded Reload
-; CHECK-NEXT: addvl sp, sp, #2
-; CHECK-NEXT: ldp x29, x30, [sp], #16 // 16-byte Folded Reload
+; CHECK-NEXT: ptrue p0.h
+; CHECK-NEXT: pmov p1.h, z0[0]
+; CHECK-NEXT: pmov p2.h, z0[1]
+; CHECK-NEXT: eor p0.b, p0/z, p1.b, p2.b
; CHECK-NEXT: ret
entry:
%res1 = call <vscale x 8 x i1> @llvm.aarch64.sve.pmov.to.pred.lane.nxv8i16(<vscale x 8 x i16> %zn, i32 0)
@@ -52,27 +30,10 @@ define <vscale x 8 x i1> @test_pmov_to_pred_i16(<vscale x 8 x i16> %zn) {
define <vscale x 4 x i1> @test_pmov_to_pred_i32(<vscale x 4 x i32> %zn) {
; CHECK-LABEL: test_pmov_to_pred_i32:
; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: stp x29, x30, [sp, #-16]! // 16-byte Folded Spill
-; CHECK-NEXT: addvl sp, sp, #-2
-; CHECK-NEXT: str p4, [sp, #7, mul vl] // 2-byte Folded Spill
-; CHECK-NEXT: str z8, [sp, #1, mul vl] // 16-byte Folded Spill
-; CHECK-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 16 * VG
-; CHECK-NEXT: .cfi_offset w30, -8
-; CHECK-NEXT: .cfi_offset w29, -16
-; CHECK-NEXT: .cfi_escape 0x10, 0x48, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x78, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d8 @ cfa - 16 - 8 * VG
-; CHECK-NEXT: mov w0, wzr
-; CHECK-NEXT: mov z8.d, z0.d
-; CHECK-NEXT: bl llvm.aarch64.sve.pmov.to.pred.lane.nxv4i32
-; CHECK-NEXT: mov z0.d, z8.d
-; CHECK-NEXT: mov w0, #3 // =0x3
-; CHECK-NEXT: mov p4.b, p0.b
-; CHECK-NEXT: bl llvm.aarch64.sve.pmov.to.pred.lane.nxv4i32
-; CHECK-NEXT: ptrue p1.s
-; CHECK-NEXT: ldr z8, [sp, #1, mul vl] // 16-byte Folded Reload
-; CHECK-NEXT: eor p0.b, p1/z, p4.b, p0.b
-; CHECK-NEXT: ldr p4, [sp, #7, mul vl] // 2-byte Folded Reload
-; CHECK-NEXT: addvl sp, sp, #2
-; CHECK-NEXT: ldp x29, x30, [sp], #16 // 16-byte Folded Reload
+; CHECK-NEXT: ptrue p0.s
+; CHECK-NEXT: pmov p1.s, z0[0]
+; CHECK-NEXT: pmov p2.s, z0[3]
+; CHECK-NEXT: eor p0.b, p0/z, p1.b, p2.b
; CHECK-NEXT: ret
entry:
%res1 = call <vscale x 4 x i1> @llvm.aarch64.sve.pmov.to.pred.lane.nxv4i32(<vscale x 4 x i32> %zn, i32 0)
@@ -85,27 +46,10 @@ define <vscale x 4 x i1> @test_pmov_to_pred_i32(<vscale x 4 x i32> %zn) {
define <vscale x 2 x i1> @test_pmov_to_pred_i64(<vscale x 2 x i64> %zn) {
; CHECK-LABEL: test_pmov_to_pred_i64:
; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: stp x29, x30, [sp, #-16]! // 16-byte Folded Spill
-; CHECK-NEXT: addvl sp, sp, #-2
-; CHECK-NEXT: str p4, [sp, #7, mul vl] // 2-byte Folded Spill
-; CHECK-NEXT: str z8, [sp, #1, mul vl] // 16-byte Folded Spill
-; CHECK-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 16 * VG
-; CHECK-NEXT: .cfi_offset w30, -8
-; CHECK-NEXT: .cfi_offset w29, -16
-; CHECK-NEXT: .cfi_escape 0x10, 0x48, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x78, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d8 @ cfa - 16 - 8 * VG
-; CHECK-NEXT: mov w0, wzr
-; CHECK-NEXT: mov z8.d, z0.d
-; CHECK-NEXT: bl llvm.aarch64.sve.pmov.to.pred.lane.nxv2i64
-; CHECK-NEXT: mov z0.d, z8.d
-; CHECK-NEXT: mov w0, #7 // =0x7
-; CHECK-NEXT: mov p4.b, p0.b
-; CHECK-NEXT: bl llvm.aarch64.sve.pmov.to.pred.lane.nxv2i64
-; CHECK-NEXT: ptrue p1.d
-; CHECK-NEXT: ldr z8, [sp, #1, mul vl] // 16-byte Folded Reload
-; CHECK-NEXT: eor p0.b, p1/z, p4.b, p0.b
-; CHECK-NEXT: ldr p4, [sp, #7, mul vl] // 2-byte Folded Reload
-; CHECK-NEXT: addvl sp, sp, #2
-; CHECK-NEXT: ldp x29, x30, [sp], #16 // 16-byte Folded Reload
+; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: pmov p1.d, z0[0]
+; CHECK-NEXT: pmov p2.d, z0[7]
+; CHECK-NEXT: eor p0.b, p0/z, p1.b, p2.b
; CHECK-NEXT: ret
entry:
%res1 = call <vscale x 2 x i1> @llvm.aarch64.sve.pmov.to.pred.lane.nxv2i64(<vscale x 2 x i64> %zn, i32 0)
diff --git a/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-pmov-to-vector.ll b/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-pmov-to-vector.ll
index 58b240b..b7f36c6 100644
--- a/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-pmov-to-vector.ll
+++ b/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-pmov-to-vector.ll
@@ -6,12 +6,7 @@
define <vscale x 8 x i16> @test_pmov_to_vector_i16(<vscale x 8 x i16> %zn, <vscale x 8 x i1> %pn) {
; CHECK-LABEL: test_pmov_to_vector_i16:
; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill
-; CHECK-NEXT: .cfi_def_cfa_offset 16
-; CHECK-NEXT: .cfi_offset w30, -16
-; CHECK-NEXT: mov w0, #1 // =0x1
-; CHECK-NEXT: bl llvm.aarch64.sve.pmov.to.vector.lane.merging.nxv8i16
-; CHECK-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: pmov z0[1], p0.h
; CHECK-NEXT: ret
entry:
%res = call <vscale x 8 x i16> @llvm.aarch64.sve.pmov.to.vector.lane.merging.nxv8i16(<vscale x 8 x i16> %zn, <vscale x 8 x i1> %pn, i32 1)
@@ -21,12 +16,7 @@ define <vscale x 8 x i16> @test_pmov_to_vector_i16(<vscale x 8 x i16> %zn, <vsca
define <vscale x 4 x i32> @test_pmov_to_vector_i32(<vscale x 4 x i32> %zn, <vscale x 4 x i1> %pn) {
; CHECK-LABEL: test_pmov_to_vector_i32:
; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill
-; CHECK-NEXT: .cfi_def_cfa_offset 16
-; CHECK-NEXT: .cfi_offset w30, -16
-; CHECK-NEXT: mov w0, #3 // =0x3
-; CHECK-NEXT: bl llvm.aarch64.sve.pmov.to.vector.lane.merging.nxv4i32
-; CHECK-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: pmov z0[3], p0.s
; CHECK-NEXT: ret
entry:
%res = call <vscale x 4 x i32> @llvm.aarch64.sve.pmov.to.vector.lane.merging.nxv4i32(<vscale x 4 x i32> %zn, <vscale x 4 x i1> %pn, i32 3)
@@ -36,12 +26,7 @@ define <vscale x 4 x i32> @test_pmov_to_vector_i32(<vscale x 4 x i32> %zn, <vsca
define <vscale x 2 x i64> @test_pmov_to_vector_i64(<vscale x 2 x i64> %zn, <vscale x 2 x i1> %pn) {
; CHECK-LABEL: test_pmov_to_vector_i64:
; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill
-; CHECK-NEXT: .cfi_def_cfa_offset 16
-; CHECK-NEXT: .cfi_offset w30, -16
-; CHECK-NEXT: mov w0, #7 // =0x7
-; CHECK-NEXT: bl llvm.aarch64.sve.pmov.to.vector.lane.merging.nxv2i64
-; CHECK-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: pmov z0[7], p0.d
; CHECK-NEXT: ret
entry:
%res = call <vscale x 2 x i64> @llvm.aarch64.sve.pmov.to.vector.lane.merging.nxv2i64(<vscale x 2 x i64> %zn, <vscale x 2 x i1> %pn, i32 7)
@@ -54,11 +39,7 @@ define <vscale x 2 x i64> @test_pmov_to_vector_i64(<vscale x 2 x i64> %zn, <vsca
define <vscale x 16 x i8> @test_pmov_to_vector_zero_i8(<vscale x 16 x i1> %pn) {
; CHECK-LABEL: test_pmov_to_vector_zero_i8:
; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill
-; CHECK-NEXT: .cfi_def_cfa_offset 16
-; CHECK-NEXT: .cfi_offset w30, -16
-; CHECK-NEXT: bl llvm.aarch64.sve.pmov.to.vector.lane.zeroing.nxv16i8
-; CHECK-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: pmov z0, p0.b
; CHECK-NEXT: ret
entry:
%res = call <vscale x 16 x i8> @llvm.aarch64.sve.pmov.to.vector.lane.zeroing.nxv16i8(<vscale x 16 x i1> %pn)
@@ -68,11 +49,7 @@ define <vscale x 16 x i8> @test_pmov_to_vector_zero_i8(<vscale x 16 x i1> %pn) {
define <vscale x 8 x i16> @test_pmov_to_vector_zero_i16(<vscale x 8 x i1> %pn) {
; CHECK-LABEL: test_pmov_to_vector_zero_i16:
; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill
-; CHECK-NEXT: .cfi_def_cfa_offset 16
-; CHECK-NEXT: .cfi_offset w30, -16
-; CHECK-NEXT: bl llvm.aarch64.sve.pmov.to.vector.lane.zeroing.nxv8i16
-; CHECK-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: pmov z0[0], p0.h
; CHECK-NEXT: ret
entry:
%res = call <vscale x 8 x i16> @llvm.aarch64.sve.pmov.to.vector.lane.zeroing.nxv8i16(<vscale x 8 x i1> %pn)
@@ -82,11 +59,7 @@ define <vscale x 8 x i16> @test_pmov_to_vector_zero_i16(<vscale x 8 x i1> %pn) {
define <vscale x 4 x i32> @test_pmov_to_vector_zero_i32(<vscale x 4 x i1> %pn) {
; CHECK-LABEL: test_pmov_to_vector_zero_i32:
; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill
-; CHECK-NEXT: .cfi_def_cfa_offset 16
-; CHECK-NEXT: .cfi_offset w30, -16
-; CHECK-NEXT: bl llvm.aarch64.sve.pmov.to.vector.lane.zeroing.nxv4i32
-; CHECK-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: pmov z0[0], p0.s
; CHECK-NEXT: ret
entry:
%res = call <vscale x 4 x i32> @llvm.aarch64.sve.pmov.to.vector.lane.zeroing.nxv4i32(<vscale x 4 x i1> %pn)
@@ -96,11 +69,7 @@ define <vscale x 4 x i32> @test_pmov_to_vector_zero_i32(<vscale x 4 x i1> %pn) {
define <vscale x 2 x i64> @test_pmov_to_vector_zero_i64(<vscale x 2 x i1> %pn) {
; CHECK-LABEL: test_pmov_to_vector_zero_i64:
; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill
-; CHECK-NEXT: .cfi_def_cfa_offset 16
-; CHECK-NEXT: .cfi_offset w30, -16
-; CHECK-NEXT: bl llvm.aarch64.sve.pmov.to.vector.lane.zeroing.nxv2i64
-; CHECK-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: pmov z0[0], p0.d
; CHECK-NEXT: ret
entry:
%res = call <vscale x 2 x i64> @llvm.aarch64.sve.pmov.to.vector.lane.zeroing.nxv2i64(<vscale x 2 x i1> %pn)
diff --git a/llvm/test/CodeGen/AArch64/unwind-preserved.ll b/llvm/test/CodeGen/AArch64/unwind-preserved.ll
index f3c4d217..822be14 100644
--- a/llvm/test/CodeGen/AArch64/unwind-preserved.ll
+++ b/llvm/test/CodeGen/AArch64/unwind-preserved.ll
@@ -63,18 +63,6 @@ define <vscale x 4 x i32> @invoke_callee_may_throw_sve(<vscale x 4 x i32> %v) uw
; CHECK-NEXT: ldr z0, [sp, #1, mul vl] // 16-byte Folded Reload
; CHECK-NEXT: addvl sp, sp, #2
; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x90, 0x01, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 144 * VG
-; CHECK-NEXT: ldr p15, [sp, #4, mul vl] // 2-byte Folded Reload
-; CHECK-NEXT: ldr p14, [sp, #5, mul vl] // 2-byte Folded Reload
-; CHECK-NEXT: ldr p13, [sp, #6, mul vl] // 2-byte Folded Reload
-; CHECK-NEXT: ldr p12, [sp, #7, mul vl] // 2-byte Folded Reload
-; CHECK-NEXT: ldr p11, [sp, #8, mul vl] // 2-byte Folded Reload
-; CHECK-NEXT: ldr p10, [sp, #9, mul vl] // 2-byte Folded Reload
-; CHECK-NEXT: ldr p9, [sp, #10, mul vl] // 2-byte Folded Reload
-; CHECK-NEXT: ldr p8, [sp, #11, mul vl] // 2-byte Folded Reload
-; CHECK-NEXT: ldr p7, [sp, #12, mul vl] // 2-byte Folded Reload
-; CHECK-NEXT: ldr p6, [sp, #13, mul vl] // 2-byte Folded Reload
-; CHECK-NEXT: ldr p5, [sp, #14, mul vl] // 2-byte Folded Reload
-; CHECK-NEXT: ldr p4, [sp, #15, mul vl] // 2-byte Folded Reload
; CHECK-NEXT: ldr z23, [sp, #2, mul vl] // 16-byte Folded Reload
; CHECK-NEXT: ldr z22, [sp, #3, mul vl] // 16-byte Folded Reload
; CHECK-NEXT: ldr z21, [sp, #4, mul vl] // 16-byte Folded Reload
@@ -91,6 +79,18 @@ define <vscale x 4 x i32> @invoke_callee_may_throw_sve(<vscale x 4 x i32> %v) uw
; CHECK-NEXT: ldr z10, [sp, #15, mul vl] // 16-byte Folded Reload
; CHECK-NEXT: ldr z9, [sp, #16, mul vl] // 16-byte Folded Reload
; CHECK-NEXT: ldr z8, [sp, #17, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr p15, [sp, #4, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: ldr p14, [sp, #5, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: ldr p13, [sp, #6, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: ldr p12, [sp, #7, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: ldr p11, [sp, #8, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: ldr p10, [sp, #9, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: ldr p9, [sp, #10, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: ldr p8, [sp, #11, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: ldr p7, [sp, #12, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: ldr p6, [sp, #13, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: ldr p5, [sp, #14, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: ldr p4, [sp, #15, mul vl] // 2-byte Folded Reload
; CHECK-NEXT: addvl sp, sp, #18
; CHECK-NEXT: .cfi_def_cfa wsp, 16
; CHECK-NEXT: .cfi_restore z8
@@ -112,18 +112,6 @@ define <vscale x 4 x i32> @invoke_callee_may_throw_sve(<vscale x 4 x i32> %v) uw
; CHECK-NEXT: ldr z0, [sp] // 16-byte Folded Reload
; CHECK-NEXT: addvl sp, sp, #2
; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x90, 0x01, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 144 * VG
-; CHECK-NEXT: ldr p15, [sp, #4, mul vl] // 2-byte Folded Reload
-; CHECK-NEXT: ldr p14, [sp, #5, mul vl] // 2-byte Folded Reload
-; CHECK-NEXT: ldr p13, [sp, #6, mul vl] // 2-byte Folded Reload
-; CHECK-NEXT: ldr p12, [sp, #7, mul vl] // 2-byte Folded Reload
-; CHECK-NEXT: ldr p11, [sp, #8, mul vl] // 2-byte Folded Reload
-; CHECK-NEXT: ldr p10, [sp, #9, mul vl] // 2-byte Folded Reload
-; CHECK-NEXT: ldr p9, [sp, #10, mul vl] // 2-byte Folded Reload
-; CHECK-NEXT: ldr p8, [sp, #11, mul vl] // 2-byte Folded Reload
-; CHECK-NEXT: ldr p7, [sp, #12, mul vl] // 2-byte Folded Reload
-; CHECK-NEXT: ldr p6, [sp, #13, mul vl] // 2-byte Folded Reload
-; CHECK-NEXT: ldr p5, [sp, #14, mul vl] // 2-byte Folded Reload
-; CHECK-NEXT: ldr p4, [sp, #15, mul vl] // 2-byte Folded Reload
; CHECK-NEXT: ldr z23, [sp, #2, mul vl] // 16-byte Folded Reload
; CHECK-NEXT: ldr z22, [sp, #3, mul vl] // 16-byte Folded Reload
; CHECK-NEXT: ldr z21, [sp, #4, mul vl] // 16-byte Folded Reload
@@ -140,6 +128,18 @@ define <vscale x 4 x i32> @invoke_callee_may_throw_sve(<vscale x 4 x i32> %v) uw
; CHECK-NEXT: ldr z10, [sp, #15, mul vl] // 16-byte Folded Reload
; CHECK-NEXT: ldr z9, [sp, #16, mul vl] // 16-byte Folded Reload
; CHECK-NEXT: ldr z8, [sp, #17, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr p15, [sp, #4, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: ldr p14, [sp, #5, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: ldr p13, [sp, #6, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: ldr p12, [sp, #7, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: ldr p11, [sp, #8, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: ldr p10, [sp, #9, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: ldr p9, [sp, #10, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: ldr p8, [sp, #11, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: ldr p7, [sp, #12, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: ldr p6, [sp, #13, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: ldr p5, [sp, #14, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: ldr p4, [sp, #15, mul vl] // 2-byte Folded Reload
; CHECK-NEXT: addvl sp, sp, #18
; CHECK-NEXT: .cfi_def_cfa wsp, 16
; CHECK-NEXT: .cfi_restore z8
@@ -215,18 +215,6 @@ define <vscale x 4 x i32> @invoke_callee_may_throw_sve(<vscale x 4 x i32> %v) uw
; GISEL-NEXT: ldr z0, [sp, #1, mul vl] // 16-byte Folded Reload
; GISEL-NEXT: addvl sp, sp, #2
; GISEL-NEXT: .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x90, 0x01, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 144 * VG
-; GISEL-NEXT: ldr p15, [sp, #4, mul vl] // 2-byte Folded Reload
-; GISEL-NEXT: ldr p14, [sp, #5, mul vl] // 2-byte Folded Reload
-; GISEL-NEXT: ldr p13, [sp, #6, mul vl] // 2-byte Folded Reload
-; GISEL-NEXT: ldr p12, [sp, #7, mul vl] // 2-byte Folded Reload
-; GISEL-NEXT: ldr p11, [sp, #8, mul vl] // 2-byte Folded Reload
-; GISEL-NEXT: ldr p10, [sp, #9, mul vl] // 2-byte Folded Reload
-; GISEL-NEXT: ldr p9, [sp, #10, mul vl] // 2-byte Folded Reload
-; GISEL-NEXT: ldr p8, [sp, #11, mul vl] // 2-byte Folded Reload
-; GISEL-NEXT: ldr p7, [sp, #12, mul vl] // 2-byte Folded Reload
-; GISEL-NEXT: ldr p6, [sp, #13, mul vl] // 2-byte Folded Reload
-; GISEL-NEXT: ldr p5, [sp, #14, mul vl] // 2-byte Folded Reload
-; GISEL-NEXT: ldr p4, [sp, #15, mul vl] // 2-byte Folded Reload
; GISEL-NEXT: ldr z23, [sp, #2, mul vl] // 16-byte Folded Reload
; GISEL-NEXT: ldr z22, [sp, #3, mul vl] // 16-byte Folded Reload
; GISEL-NEXT: ldr z21, [sp, #4, mul vl] // 16-byte Folded Reload
@@ -243,6 +231,18 @@ define <vscale x 4 x i32> @invoke_callee_may_throw_sve(<vscale x 4 x i32> %v) uw
; GISEL-NEXT: ldr z10, [sp, #15, mul vl] // 16-byte Folded Reload
; GISEL-NEXT: ldr z9, [sp, #16, mul vl] // 16-byte Folded Reload
; GISEL-NEXT: ldr z8, [sp, #17, mul vl] // 16-byte Folded Reload
+; GISEL-NEXT: ldr p15, [sp, #4, mul vl] // 2-byte Folded Reload
+; GISEL-NEXT: ldr p14, [sp, #5, mul vl] // 2-byte Folded Reload
+; GISEL-NEXT: ldr p13, [sp, #6, mul vl] // 2-byte Folded Reload
+; GISEL-NEXT: ldr p12, [sp, #7, mul vl] // 2-byte Folded Reload
+; GISEL-NEXT: ldr p11, [sp, #8, mul vl] // 2-byte Folded Reload
+; GISEL-NEXT: ldr p10, [sp, #9, mul vl] // 2-byte Folded Reload
+; GISEL-NEXT: ldr p9, [sp, #10, mul vl] // 2-byte Folded Reload
+; GISEL-NEXT: ldr p8, [sp, #11, mul vl] // 2-byte Folded Reload
+; GISEL-NEXT: ldr p7, [sp, #12, mul vl] // 2-byte Folded Reload
+; GISEL-NEXT: ldr p6, [sp, #13, mul vl] // 2-byte Folded Reload
+; GISEL-NEXT: ldr p5, [sp, #14, mul vl] // 2-byte Folded Reload
+; GISEL-NEXT: ldr p4, [sp, #15, mul vl] // 2-byte Folded Reload
; GISEL-NEXT: addvl sp, sp, #18
; GISEL-NEXT: .cfi_def_cfa wsp, 16
; GISEL-NEXT: .cfi_restore z8
@@ -264,18 +264,6 @@ define <vscale x 4 x i32> @invoke_callee_may_throw_sve(<vscale x 4 x i32> %v) uw
; GISEL-NEXT: ldr z0, [sp] // 16-byte Folded Reload
; GISEL-NEXT: addvl sp, sp, #2
; GISEL-NEXT: .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x90, 0x01, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 144 * VG
-; GISEL-NEXT: ldr p15, [sp, #4, mul vl] // 2-byte Folded Reload
-; GISEL-NEXT: ldr p14, [sp, #5, mul vl] // 2-byte Folded Reload
-; GISEL-NEXT: ldr p13, [sp, #6, mul vl] // 2-byte Folded Reload
-; GISEL-NEXT: ldr p12, [sp, #7, mul vl] // 2-byte Folded Reload
-; GISEL-NEXT: ldr p11, [sp, #8, mul vl] // 2-byte Folded Reload
-; GISEL-NEXT: ldr p10, [sp, #9, mul vl] // 2-byte Folded Reload
-; GISEL-NEXT: ldr p9, [sp, #10, mul vl] // 2-byte Folded Reload
-; GISEL-NEXT: ldr p8, [sp, #11, mul vl] // 2-byte Folded Reload
-; GISEL-NEXT: ldr p7, [sp, #12, mul vl] // 2-byte Folded Reload
-; GISEL-NEXT: ldr p6, [sp, #13, mul vl] // 2-byte Folded Reload
-; GISEL-NEXT: ldr p5, [sp, #14, mul vl] // 2-byte Folded Reload
-; GISEL-NEXT: ldr p4, [sp, #15, mul vl] // 2-byte Folded Reload
; GISEL-NEXT: ldr z23, [sp, #2, mul vl] // 16-byte Folded Reload
; GISEL-NEXT: ldr z22, [sp, #3, mul vl] // 16-byte Folded Reload
; GISEL-NEXT: ldr z21, [sp, #4, mul vl] // 16-byte Folded Reload
@@ -292,6 +280,18 @@ define <vscale x 4 x i32> @invoke_callee_may_throw_sve(<vscale x 4 x i32> %v) uw
; GISEL-NEXT: ldr z10, [sp, #15, mul vl] // 16-byte Folded Reload
; GISEL-NEXT: ldr z9, [sp, #16, mul vl] // 16-byte Folded Reload
; GISEL-NEXT: ldr z8, [sp, #17, mul vl] // 16-byte Folded Reload
+; GISEL-NEXT: ldr p15, [sp, #4, mul vl] // 2-byte Folded Reload
+; GISEL-NEXT: ldr p14, [sp, #5, mul vl] // 2-byte Folded Reload
+; GISEL-NEXT: ldr p13, [sp, #6, mul vl] // 2-byte Folded Reload
+; GISEL-NEXT: ldr p12, [sp, #7, mul vl] // 2-byte Folded Reload
+; GISEL-NEXT: ldr p11, [sp, #8, mul vl] // 2-byte Folded Reload
+; GISEL-NEXT: ldr p10, [sp, #9, mul vl] // 2-byte Folded Reload
+; GISEL-NEXT: ldr p9, [sp, #10, mul vl] // 2-byte Folded Reload
+; GISEL-NEXT: ldr p8, [sp, #11, mul vl] // 2-byte Folded Reload
+; GISEL-NEXT: ldr p7, [sp, #12, mul vl] // 2-byte Folded Reload
+; GISEL-NEXT: ldr p6, [sp, #13, mul vl] // 2-byte Folded Reload
+; GISEL-NEXT: ldr p5, [sp, #14, mul vl] // 2-byte Folded Reload
+; GISEL-NEXT: ldr p4, [sp, #15, mul vl] // 2-byte Folded Reload
; GISEL-NEXT: addvl sp, sp, #18
; GISEL-NEXT: .cfi_def_cfa wsp, 16
; GISEL-NEXT: .cfi_restore z8
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-extract-vector-load.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-extract-vector-load.mir
new file mode 100644
index 0000000..aa72a9e
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-extract-vector-load.mir
@@ -0,0 +1,40 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+# RUN: llc -mtriple=amdgcn -run-pass=amdgpu-prelegalizer-combiner -verify-machineinstrs %s -o - | FileCheck %s
+
+# Tries to emit a foldable G_PTR_ADD with (p1, s32) operands.
+---
+name: test_ptradd_crash__offset_smaller
+tracksRegLiveness: true
+body: |
+ bb.0:
+ ; CHECK-LABEL: name: test_ptradd_crash__offset_smaller
+ ; CHECK: [[C:%[0-9]+]]:_(p1) = G_CONSTANT i64 12
+ ; CHECK-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[C]](p1) :: (load (s32), addrspace 1)
+ ; CHECK-NEXT: $sgpr0 = COPY [[LOAD]](s32)
+ ; CHECK-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0
+ %1:_(p1) = G_CONSTANT i64 0
+ %3:_(s32) = G_CONSTANT i32 3
+ %0:_(<4 x s32>) = G_LOAD %1 :: (load (<4 x s32>) from `ptr addrspace(1) null`, addrspace 1)
+ %2:_(s32) = G_EXTRACT_VECTOR_ELT %0, %3
+ $sgpr0 = COPY %2
+ SI_RETURN_TO_EPILOG implicit $sgpr0
+...
+
+# Tries to emit a foldable G_PTR_ADD with (p1, s128) operands.
+---
+name: test_ptradd_crash__offset_wider
+tracksRegLiveness: true
+body: |
+ bb.0:
+ ; CHECK-LABEL: name: test_ptradd_crash__offset_wider
+ ; CHECK: [[C:%[0-9]+]]:_(p1) = G_CONSTANT i64 12
+ ; CHECK-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[C]](p1) :: (load (s32), addrspace 1)
+ ; CHECK-NEXT: $sgpr0 = COPY [[LOAD]](s32)
+ ; CHECK-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0
+ %1:_(p1) = G_CONSTANT i64 0
+ %3:_(s128) = G_CONSTANT i128 3
+ %0:_(<4 x s32>) = G_LOAD %1 :: (load (<4 x s32>) from `ptr addrspace(1) null`, addrspace 1)
+ %2:_(s32) = G_EXTRACT_VECTOR_ELT %0, %3
+ $sgpr0 = COPY %2
+ SI_RETURN_TO_EPILOG implicit $sgpr0
+...
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fdiv-sqrt-to-rsq.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fdiv-sqrt-to-rsq.mir
new file mode 100644
index 0000000..6c5339e
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fdiv-sqrt-to-rsq.mir
@@ -0,0 +1,584 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 4
+# RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -run-pass=amdgpu-postlegalizer-combiner -verify-machineinstrs %s -o - | FileCheck -check-prefix=GCN %s
+
+---
+name: rsq_f16
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $vgpr0
+
+ ; GCN-LABEL: name: rsq_f16
+ ; GCN: liveins: $vgpr0
+ ; GCN-NEXT: {{ $}}
+ ; GCN-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+ ; GCN-NEXT: %x:_(s16) = G_TRUNC [[COPY]](s32)
+ ; GCN-NEXT: [[INT:%[0-9]+]]:_(s16) = contract G_INTRINSIC intrinsic(@llvm.amdgcn.rsq), %x(s16)
+ ; GCN-NEXT: %ext:_(s32) = G_ANYEXT [[INT]](s16)
+ ; GCN-NEXT: $vgpr0 = COPY %ext(s32)
+ %0:_(s32) = COPY $vgpr0
+ %x:_(s16) = G_TRUNC %0:_(s32)
+ %sqrt:_(s16) = contract G_FSQRT %x
+ %one:_(s16) = G_FCONSTANT half 1.0
+ %rsq:_(s16) = contract G_FDIV %one, %sqrt
+ %ext:_(s32) = G_ANYEXT %rsq:_(s16)
+ $vgpr0 = COPY %ext
+
+...
+
+---
+name: rsq_f16_missing_contract0
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $vgpr0
+
+ ; GCN-LABEL: name: rsq_f16_missing_contract0
+ ; GCN: liveins: $vgpr0
+ ; GCN-NEXT: {{ $}}
+ ; GCN-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+ ; GCN-NEXT: %x:_(s16) = G_TRUNC [[COPY]](s32)
+ ; GCN-NEXT: %sqrt:_(s16) = G_FSQRT %x
+ ; GCN-NEXT: %one:_(s16) = G_FCONSTANT half 0xH3C00
+ ; GCN-NEXT: %rsq:_(s16) = contract G_FDIV %one, %sqrt
+ ; GCN-NEXT: %ext:_(s32) = G_ANYEXT %rsq(s16)
+ ; GCN-NEXT: $vgpr0 = COPY %ext(s32)
+ %0:_(s32) = COPY $vgpr0
+ %x:_(s16) = G_TRUNC %0:_(s32)
+ %sqrt:_(s16) = G_FSQRT %x
+ %one:_(s16) = G_FCONSTANT half 1.0
+ %rsq:_(s16) = contract G_FDIV %one, %sqrt
+ %ext:_(s32) = G_ANYEXT %rsq:_(s16)
+ $vgpr0 = COPY %ext
+
+...
+
+---
+name: rsq_f16_missing_contract1
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $vgpr0
+
+ ; GCN-LABEL: name: rsq_f16_missing_contract1
+ ; GCN: liveins: $vgpr0
+ ; GCN-NEXT: {{ $}}
+ ; GCN-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+ ; GCN-NEXT: %x:_(s16) = G_TRUNC [[COPY]](s32)
+ ; GCN-NEXT: %sqrt:_(s16) = contract G_FSQRT %x
+ ; GCN-NEXT: %one:_(s16) = G_FCONSTANT half 0xH3C00
+ ; GCN-NEXT: %rsq:_(s16) = G_FDIV %one, %sqrt
+ ; GCN-NEXT: %ext:_(s32) = G_ANYEXT %rsq(s16)
+ ; GCN-NEXT: $vgpr0 = COPY %ext(s32)
+ %0:_(s32) = COPY $vgpr0
+ %x:_(s16) = G_TRUNC %0:_(s32)
+ %sqrt:_(s16) = contract G_FSQRT %x
+ %one:_(s16) = G_FCONSTANT half 1.0
+ %rsq:_(s16) = G_FDIV %one, %sqrt
+ %ext:_(s32) = G_ANYEXT %rsq:_(s16)
+ $vgpr0 = COPY %ext
+
+...
+
+---
+name: neg_rsq_f16
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $vgpr0
+
+ ; GCN-LABEL: name: neg_rsq_f16
+ ; GCN: liveins: $vgpr0
+ ; GCN-NEXT: {{ $}}
+ ; GCN-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+ ; GCN-NEXT: %x:_(s16) = G_TRUNC [[COPY]](s32)
+ ; GCN-NEXT: [[INT:%[0-9]+]]:_(s16) = contract G_INTRINSIC intrinsic(@llvm.amdgcn.rsq), %x(s16)
+ ; GCN-NEXT: %rsq:_(s16) = contract G_FNEG [[INT]]
+ ; GCN-NEXT: %ext:_(s32) = G_ANYEXT %rsq(s16)
+ ; GCN-NEXT: $vgpr0 = COPY %ext(s32)
+ %0:_(s32) = COPY $vgpr0
+ %x:_(s16) = G_TRUNC %0:_(s32)
+ %sqrt:_(s16) = contract G_FSQRT %x
+ %neg_one:_(s16) = G_FCONSTANT half -1.0
+ %rsq:_(s16) = contract G_FDIV %neg_one, %sqrt
+ %ext:_(s32) = G_ANYEXT %rsq:_(s16)
+ $vgpr0 = COPY %ext
+
+...
+
+---
+name: neg_rsq_f16_missing_contract0
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $vgpr0
+
+ ; GCN-LABEL: name: neg_rsq_f16_missing_contract0
+ ; GCN: liveins: $vgpr0
+ ; GCN-NEXT: {{ $}}
+ ; GCN-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+ ; GCN-NEXT: %x:_(s16) = G_TRUNC [[COPY]](s32)
+ ; GCN-NEXT: %sqrt:_(s16) = G_FSQRT %x
+ ; GCN-NEXT: %neg_one:_(s16) = G_FCONSTANT half 0xHBC00
+ ; GCN-NEXT: %rsq:_(s16) = contract G_FDIV %neg_one, %sqrt
+ ; GCN-NEXT: %ext:_(s32) = G_ANYEXT %rsq(s16)
+ ; GCN-NEXT: $vgpr0 = COPY %ext(s32)
+ %0:_(s32) = COPY $vgpr0
+ %x:_(s16) = G_TRUNC %0:_(s32)
+ %sqrt:_(s16) = G_FSQRT %x
+ %neg_one:_(s16) = G_FCONSTANT half -1.0
+ %rsq:_(s16) = contract G_FDIV %neg_one, %sqrt
+ %ext:_(s32) = G_ANYEXT %rsq:_(s16)
+ $vgpr0 = COPY %ext
+
+...
+
+---
+name: neg_rsq_f16_missing_contract1
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $vgpr0
+
+ ; GCN-LABEL: name: neg_rsq_f16_missing_contract1
+ ; GCN: liveins: $vgpr0
+ ; GCN-NEXT: {{ $}}
+ ; GCN-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+ ; GCN-NEXT: %x:_(s16) = G_TRUNC [[COPY]](s32)
+ ; GCN-NEXT: %sqrt:_(s16) = contract G_FSQRT %x
+ ; GCN-NEXT: %neg_one:_(s16) = G_FCONSTANT half 0xHBC00
+ ; GCN-NEXT: %rsq:_(s16) = G_FDIV %neg_one, %sqrt
+ ; GCN-NEXT: %ext:_(s32) = G_ANYEXT %rsq(s16)
+ ; GCN-NEXT: $vgpr0 = COPY %ext(s32)
+ %0:_(s32) = COPY $vgpr0
+ %x:_(s16) = G_TRUNC %0:_(s32)
+ %sqrt:_(s16) = contract G_FSQRT %x
+ %neg_one:_(s16) = G_FCONSTANT half -1.0
+ %rsq:_(s16) = G_FDIV %neg_one, %sqrt
+ %ext:_(s32) = G_ANYEXT %rsq:_(s16)
+ $vgpr0 = COPY %ext
+
+...
+
+---
+name: rsq_f16_multi_use
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $vgpr0
+
+ ; GCN-LABEL: name: rsq_f16_multi_use
+ ; GCN: liveins: $vgpr0
+ ; GCN-NEXT: {{ $}}
+ ; GCN-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+ ; GCN-NEXT: %x:_(s16) = G_TRUNC [[COPY]](s32)
+ ; GCN-NEXT: %sqrt:_(s16) = contract G_FSQRT %x
+ ; GCN-NEXT: %one:_(s16) = G_FCONSTANT half 0xH3C00
+ ; GCN-NEXT: %rsq:_(s16) = contract G_FDIV %one, %sqrt
+ ; GCN-NEXT: %ext:_(s32) = G_ANYEXT %rsq(s16)
+ ; GCN-NEXT: $vgpr0 = COPY %ext(s32)
+ ; GCN-NEXT: S_ENDPGM 0, implicit %sqrt(s16)
+ %0:_(s32) = COPY $vgpr0
+ %x:_(s16) = G_TRUNC %0:_(s32)
+ %sqrt:_(s16) = contract G_FSQRT %x
+ %one:_(s16) = G_FCONSTANT half 1.0
+ %rsq:_(s16) = contract G_FDIV %one, %sqrt
+ %ext:_(s32) = G_ANYEXT %rsq:_(s16)
+ $vgpr0 = COPY %ext
+ S_ENDPGM 0, implicit %sqrt
+
+...
+
+---
+name: rsq_f16_multi_use_missing_contract0
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $vgpr0
+
+ ; GCN-LABEL: name: rsq_f16_multi_use_missing_contract0
+ ; GCN: liveins: $vgpr0
+ ; GCN-NEXT: {{ $}}
+ ; GCN-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+ ; GCN-NEXT: %x:_(s16) = G_TRUNC [[COPY]](s32)
+ ; GCN-NEXT: %sqrt:_(s16) = G_FSQRT %x
+ ; GCN-NEXT: %one:_(s16) = G_FCONSTANT half 0xH3C00
+ ; GCN-NEXT: %rsq:_(s16) = contract G_FDIV %one, %sqrt
+ ; GCN-NEXT: %ext:_(s32) = G_ANYEXT %rsq(s16)
+ ; GCN-NEXT: $vgpr0 = COPY %ext(s32)
+ ; GCN-NEXT: S_ENDPGM 0, implicit %sqrt(s16)
+ %0:_(s32) = COPY $vgpr0
+ %x:_(s16) = G_TRUNC %0:_(s32)
+ %sqrt:_(s16) = G_FSQRT %x
+ %one:_(s16) = G_FCONSTANT half 1.0
+ %rsq:_(s16) = contract G_FDIV %one, %sqrt
+ %ext:_(s32) = G_ANYEXT %rsq:_(s16)
+ $vgpr0 = COPY %ext
+ S_ENDPGM 0, implicit %sqrt
+
+...
+
+---
+name: rsq_f16_multi_use_missing_contract1
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $vgpr0
+
+ ; GCN-LABEL: name: rsq_f16_multi_use_missing_contract1
+ ; GCN: liveins: $vgpr0
+ ; GCN-NEXT: {{ $}}
+ ; GCN-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+ ; GCN-NEXT: %x:_(s16) = G_TRUNC [[COPY]](s32)
+ ; GCN-NEXT: %sqrt:_(s16) = contract G_FSQRT %x
+ ; GCN-NEXT: %one:_(s16) = G_FCONSTANT half 0xH3C00
+ ; GCN-NEXT: %rsq:_(s16) = G_FDIV %one, %sqrt
+ ; GCN-NEXT: %ext:_(s32) = G_ANYEXT %rsq(s16)
+ ; GCN-NEXT: $vgpr0 = COPY %ext(s32)
+ ; GCN-NEXT: S_ENDPGM 0, implicit %sqrt(s16)
+ %0:_(s32) = COPY $vgpr0
+ %x:_(s16) = G_TRUNC %0:_(s32)
+ %sqrt:_(s16) = contract G_FSQRT %x
+ %one:_(s16) = G_FCONSTANT half 1.0
+ %rsq:_(s16) = G_FDIV %one, %sqrt
+ %ext:_(s32) = G_ANYEXT %rsq:_(s16)
+ $vgpr0 = COPY %ext
+ S_ENDPGM 0, implicit %sqrt
+
+...
+
+---
+name: rsq_f32
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $vgpr0
+
+ ; GCN-LABEL: name: rsq_f32
+ ; GCN: liveins: $vgpr0
+ ; GCN-NEXT: {{ $}}
+ ; GCN-NEXT: %x:_(s32) = COPY $vgpr0
+ ; GCN-NEXT: %sqrt:_(s32) = contract G_FSQRT %x
+ ; GCN-NEXT: %one:_(s32) = G_FCONSTANT float 1.000000e+00
+ ; GCN-NEXT: %rsq:_(s32) = contract G_FDIV %one, %sqrt
+ ; GCN-NEXT: $vgpr0 = COPY %rsq(s32)
+ %x:_(s32) = COPY $vgpr0
+ %sqrt:_(s32) = contract G_FSQRT %x
+ %one:_(s32) = G_FCONSTANT float 1.0
+ %rsq:_(s32) = contract G_FDIV %one, %sqrt
+ $vgpr0 = COPY %rsq
+
+...
+
+---
+name: neg_rsq_f32
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $vgpr0
+
+ ; GCN-LABEL: name: neg_rsq_f32
+ ; GCN: liveins: $vgpr0
+ ; GCN-NEXT: {{ $}}
+ ; GCN-NEXT: %x:_(s32) = COPY $vgpr0
+ ; GCN-NEXT: %sqrt:_(s32) = contract G_FSQRT %x
+ ; GCN-NEXT: %neg_one:_(s32) = G_FCONSTANT float -1.000000e+00
+ ; GCN-NEXT: %rsq:_(s32) = contract G_FDIV %neg_one, %sqrt
+ ; GCN-NEXT: $vgpr0 = COPY %rsq(s32)
+ %x:_(s32) = COPY $vgpr0
+ %sqrt:_(s32) = contract G_FSQRT %x
+ %neg_one:_(s32) = G_FCONSTANT float -1.0
+ %rsq:_(s32) = contract G_FDIV %neg_one, %sqrt
+ $vgpr0 = COPY %rsq
+
+...
+
+---
+name: afn_rsq_f32
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $vgpr0
+
+ ; GCN-LABEL: name: afn_rsq_f32
+ ; GCN: liveins: $vgpr0
+ ; GCN-NEXT: {{ $}}
+ ; GCN-NEXT: %x:_(s32) = COPY $vgpr0
+ ; GCN-NEXT: %sqrt:_(s32) = contract afn G_FSQRT %x
+ ; GCN-NEXT: %one:_(s32) = G_FCONSTANT float 1.000000e+00
+ ; GCN-NEXT: %rsq:_(s32) = contract afn G_FDIV %one, %sqrt
+ ; GCN-NEXT: $vgpr0 = COPY %rsq(s32)
+ %x:_(s32) = COPY $vgpr0
+ %sqrt:_(s32) = contract afn G_FSQRT %x
+ %one:_(s32) = G_FCONSTANT float 1.0
+ %rsq:_(s32) = contract afn G_FDIV %one, %sqrt
+ $vgpr0 = COPY %rsq
+
+...
+
+---
+name: afn_rsq_f32_multi_use
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $vgpr0
+
+ ; GCN-LABEL: name: afn_rsq_f32_multi_use
+ ; GCN: liveins: $vgpr0
+ ; GCN-NEXT: {{ $}}
+ ; GCN-NEXT: %x:_(s32) = COPY $vgpr0
+ ; GCN-NEXT: %sqrt:_(s32) = contract afn G_FSQRT %x
+ ; GCN-NEXT: %one:_(s32) = G_FCONSTANT float 1.000000e+00
+ ; GCN-NEXT: %rsq:_(s32) = contract afn G_FDIV %one, %sqrt
+ ; GCN-NEXT: %ret:_(s32) = G_FSUB %sqrt, %rsq
+ ; GCN-NEXT: $vgpr0 = COPY %ret(s32)
+ %x:_(s32) = COPY $vgpr0
+ %sqrt:_(s32) = contract afn G_FSQRT %x
+ %one:_(s32) = G_FCONSTANT float 1.0
+ %rsq:_(s32) = contract afn G_FDIV %one, %sqrt
+ %ret:_(s32) = G_FSUB %sqrt, %rsq
+ $vgpr0 = COPY %ret
+
+...
+
+---
+name: afn_neg_rsq_f32
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $vgpr0
+
+ ; GCN-LABEL: name: afn_neg_rsq_f32
+ ; GCN: liveins: $vgpr0
+ ; GCN-NEXT: {{ $}}
+ ; GCN-NEXT: %x:_(s32) = COPY $vgpr0
+ ; GCN-NEXT: %sqrt:_(s32) = contract afn G_FSQRT %x
+ ; GCN-NEXT: %neg_one:_(s32) = G_FCONSTANT float -1.000000e+00
+ ; GCN-NEXT: %rsq:_(s32) = contract afn G_FDIV %neg_one, %sqrt
+ ; GCN-NEXT: $vgpr0 = COPY %rsq(s32)
+ %x:_(s32) = COPY $vgpr0
+ %sqrt:_(s32) = contract afn G_FSQRT %x
+ %neg_one:_(s32) = G_FCONSTANT float -1.0
+ %rsq:_(s32) = contract afn G_FDIV %neg_one, %sqrt
+ $vgpr0 = COPY %rsq
+
+...
+
+
+---
+name: rsq_f64
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $vgpr0
+
+ ; GCN-LABEL: name: rsq_f64
+ ; GCN: liveins: $vgpr0
+ ; GCN-NEXT: {{ $}}
+ ; GCN-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+ ; GCN-NEXT: %x:_(s64) = G_ANYEXT [[COPY]](s32)
+ ; GCN-NEXT: %sqrt:_(s64) = contract G_FSQRT %x
+ ; GCN-NEXT: %one:_(s64) = G_FCONSTANT double 1.000000e+00
+ ; GCN-NEXT: %rsq:_(s64) = contract G_FDIV %one, %sqrt
+ ; GCN-NEXT: %ext:_(s32) = G_TRUNC %rsq(s64)
+ ; GCN-NEXT: $vgpr0 = COPY %ext(s32)
+ %0:_(s32) = COPY $vgpr0
+ %x:_(s64) = G_ANYEXT %0:_(s32)
+ %sqrt:_(s64) = contract G_FSQRT %x
+ %one:_(s64) = G_FCONSTANT double 1.0
+ %rsq:_(s64) = contract G_FDIV %one, %sqrt
+ %ext:_(s32) = G_TRUNC %rsq:_(s64)
+ $vgpr0 = COPY %ext
+
+...
+
+---
+name: neg_rsq_f64
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $vgpr0
+
+ ; GCN-LABEL: name: neg_rsq_f64
+ ; GCN: liveins: $vgpr0
+ ; GCN-NEXT: {{ $}}
+ ; GCN-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+ ; GCN-NEXT: %x:_(s64) = G_ANYEXT [[COPY]](s32)
+ ; GCN-NEXT: %sqrt:_(s64) = contract G_FSQRT %x
+ ; GCN-NEXT: %neg_one:_(s64) = G_FCONSTANT double -1.000000e+00
+ ; GCN-NEXT: %rsq:_(s64) = contract G_FDIV %neg_one, %sqrt
+ ; GCN-NEXT: %ext:_(s32) = G_TRUNC %rsq(s64)
+ ; GCN-NEXT: $vgpr0 = COPY %ext(s32)
+ %0:_(s32) = COPY $vgpr0
+ %x:_(s64) = G_ANYEXT %0:_(s32)
+ %sqrt:_(s64) = contract G_FSQRT %x
+ %neg_one:_(s64) = G_FCONSTANT double -1.0
+ %rsq:_(s64) = contract G_FDIV %neg_one, %sqrt
+ %ext:_(s32) = G_TRUNC %rsq:_(s64)
+ $vgpr0 = COPY %ext
+
+...
+
+---
+name: afn_rsq_f64
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $vgpr0
+
+ ; GCN-LABEL: name: afn_rsq_f64
+ ; GCN: liveins: $vgpr0
+ ; GCN-NEXT: {{ $}}
+ ; GCN-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+ ; GCN-NEXT: %x:_(s64) = G_ANYEXT [[COPY]](s32)
+ ; GCN-NEXT: %sqrt:_(s64) = contract afn G_FSQRT %x
+ ; GCN-NEXT: %one:_(s64) = G_FCONSTANT double 1.000000e+00
+ ; GCN-NEXT: %rsq:_(s64) = contract afn G_FDIV %one, %sqrt
+ ; GCN-NEXT: %ext:_(s32) = G_TRUNC %rsq(s64)
+ ; GCN-NEXT: $vgpr0 = COPY %ext(s32)
+ %0:_(s32) = COPY $vgpr0
+ %x:_(s64) = G_ANYEXT %0:_(s32)
+ %sqrt:_(s64) = contract afn G_FSQRT %x
+ %one:_(s64) = G_FCONSTANT double 1.0
+ %rsq:_(s64) = contract afn G_FDIV %one, %sqrt
+ %ext:_(s32) = G_TRUNC %rsq:_(s64)
+ $vgpr0 = COPY %ext
+
+...
+
+---
+name: afn_neg_rsq_f64
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $vgpr0
+
+ ; GCN-LABEL: name: afn_neg_rsq_f64
+ ; GCN: liveins: $vgpr0
+ ; GCN-NEXT: {{ $}}
+ ; GCN-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+ ; GCN-NEXT: %x:_(s64) = G_ANYEXT [[COPY]](s32)
+ ; GCN-NEXT: %sqrt:_(s64) = contract afn G_FSQRT %x
+ ; GCN-NEXT: %neg_one:_(s64) = G_FCONSTANT double -1.000000e+00
+ ; GCN-NEXT: %rsq:_(s64) = contract afn G_FDIV %neg_one, %sqrt
+ ; GCN-NEXT: %ext:_(s32) = G_TRUNC %rsq(s64)
+ ; GCN-NEXT: $vgpr0 = COPY %ext(s32)
+ %0:_(s32) = COPY $vgpr0
+ %x:_(s64) = G_ANYEXT %0:_(s32)
+ %sqrt:_(s64) = contract afn G_FSQRT %x
+ %neg_one:_(s64) = G_FCONSTANT double -1.0
+ %rsq:_(s64) = contract afn G_FDIV %neg_one, %sqrt
+ %ext:_(s32) = G_TRUNC %rsq:_(s64)
+ $vgpr0 = COPY %ext
+
+...
+
+
+---
+name: rsq_fract_num_f16
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $vgpr0
+
+ ; GCN-LABEL: name: rsq_fract_num_f16
+ ; GCN: liveins: $vgpr0
+ ; GCN-NEXT: {{ $}}
+ ; GCN-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+ ; GCN-NEXT: %x:_(s16) = G_TRUNC [[COPY]](s32)
+ ; GCN-NEXT: %fract:_(s16) = G_FCONSTANT half 0xH3800
+ ; GCN-NEXT: [[INT:%[0-9]+]]:_(s16) = contract G_INTRINSIC intrinsic(@llvm.amdgcn.rsq), %x(s16)
+ ; GCN-NEXT: %rsq:_(s16) = contract G_FMUL [[INT]], %fract
+ ; GCN-NEXT: %ext:_(s32) = G_ANYEXT %rsq(s16)
+ ; GCN-NEXT: $vgpr0 = COPY %ext(s32)
+ %0:_(s32) = COPY $vgpr0
+ %x:_(s16) = G_TRUNC %0:_(s32)
+ %sqrt:_(s16) = contract G_FSQRT %x
+ %fract:_(s16) = G_FCONSTANT half 0.5
+ %rsq:_(s16) = contract G_FDIV %fract, %sqrt
+ %ext:_(s32) = G_ANYEXT %rsq:_(s16)
+ $vgpr0 = COPY %ext
+
+...
+
+---
+name: neg_rsq_fract_num_f16
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $vgpr0
+
+ ; GCN-LABEL: name: neg_rsq_fract_num_f16
+ ; GCN: liveins: $vgpr0
+ ; GCN-NEXT: {{ $}}
+ ; GCN-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+ ; GCN-NEXT: %x:_(s16) = G_TRUNC [[COPY]](s32)
+ ; GCN-NEXT: %neg_fract:_(s16) = G_FCONSTANT half 0xHB800
+ ; GCN-NEXT: [[INT:%[0-9]+]]:_(s16) = contract G_INTRINSIC intrinsic(@llvm.amdgcn.rsq), %x(s16)
+ ; GCN-NEXT: %rsq:_(s16) = contract G_FMUL [[INT]], %neg_fract
+ ; GCN-NEXT: %ext:_(s32) = G_ANYEXT %rsq(s16)
+ ; GCN-NEXT: $vgpr0 = COPY %ext(s32)
+ %0:_(s32) = COPY $vgpr0
+ %x:_(s16) = G_TRUNC %0:_(s32)
+ %sqrt:_(s16) = contract G_FSQRT %x
+ %neg_fract:_(s16) = G_FCONSTANT half -0.5
+ %rsq:_(s16) = contract G_FDIV %neg_fract, %sqrt
+ %ext:_(s32) = G_ANYEXT %rsq:_(s16)
+ $vgpr0 = COPY %ext
+
+
+...
+
+---
+name: rsq_large_num_f16
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $vgpr0
+
+ ; GCN-LABEL: name: rsq_large_num_f16
+ ; GCN: liveins: $vgpr0
+ ; GCN-NEXT: {{ $}}
+ ; GCN-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+ ; GCN-NEXT: %x:_(s16) = G_TRUNC [[COPY]](s32)
+ ; GCN-NEXT: %ten:_(s16) = G_FCONSTANT half 0xH4900
+ ; GCN-NEXT: [[INT:%[0-9]+]]:_(s16) = contract G_INTRINSIC intrinsic(@llvm.amdgcn.rsq), %x(s16)
+ ; GCN-NEXT: %rsq:_(s16) = contract G_FMUL [[INT]], %ten
+ ; GCN-NEXT: %ext:_(s32) = G_ANYEXT %rsq(s16)
+ ; GCN-NEXT: $vgpr0 = COPY %ext(s32)
+ %0:_(s32) = COPY $vgpr0
+ %x:_(s16) = G_TRUNC %0:_(s32)
+ %sqrt:_(s16) = contract G_FSQRT %x
+ %ten:_(s16) = G_FCONSTANT half 10.0
+ %rsq:_(s16) = contract G_FDIV %ten, %sqrt
+ %ext:_(s32) = G_ANYEXT %rsq:_(s16)
+ $vgpr0 = COPY %ext
+
+...
+
+---
+name: neg_rsq_large_num_f16
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $vgpr0
+
+ ; GCN-LABEL: name: neg_rsq_large_num_f16
+ ; GCN: liveins: $vgpr0
+ ; GCN-NEXT: {{ $}}
+ ; GCN-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+ ; GCN-NEXT: %x:_(s16) = G_TRUNC [[COPY]](s32)
+ ; GCN-NEXT: %neg_ten:_(s16) = G_FCONSTANT half 0xHC900
+ ; GCN-NEXT: [[INT:%[0-9]+]]:_(s16) = contract G_INTRINSIC intrinsic(@llvm.amdgcn.rsq), %x(s16)
+ ; GCN-NEXT: %rsq:_(s16) = contract G_FMUL [[INT]], %neg_ten
+ ; GCN-NEXT: %ext:_(s32) = G_ANYEXT %rsq(s16)
+ ; GCN-NEXT: $vgpr0 = COPY %ext(s32)
+ %0:_(s32) = COPY $vgpr0
+ %x:_(s16) = G_TRUNC %0:_(s32)
+ %sqrt:_(s16) = contract G_FSQRT %x
+ %neg_ten:_(s16) = G_FCONSTANT half -10.0
+ %rsq:_(s16) = contract G_FDIV %neg_ten, %sqrt
+ %ext:_(s32) = G_ANYEXT %rsq:_(s16)
+ $vgpr0 = COPY %ext
+
+...
diff --git a/llvm/test/CodeGen/AMDGPU/bf16.ll b/llvm/test/CodeGen/AMDGPU/bf16.ll
index 63a09e4..8ec7dfd 100644
--- a/llvm/test/CodeGen/AMDGPU/bf16.ll
+++ b/llvm/test/CodeGen/AMDGPU/bf16.ll
@@ -2182,9 +2182,8 @@ define void @test_load_store_f32_to_bf16(ptr addrspace(1) %in, ptr addrspace(1)
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_bfe_u32 v1, v0, 16, 1
; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v0
-; GFX8-NEXT: v_and_b32_e32 v4, 0xff800000, v0
; GFX8-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1
-; GFX8-NEXT: v_or_b32_e32 v4, 0x400000, v4
+; GFX8-NEXT: v_or_b32_e32 v4, 0x400000, v0
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
; GFX8-NEXT: v_cndmask_b32_e32 v0, v1, v4, vcc
; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
@@ -2199,9 +2198,8 @@ define void @test_load_store_f32_to_bf16(ptr addrspace(1) %in, ptr addrspace(1)
; GFX9-NEXT: s_movk_i32 s4, 0x7fff
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1
-; GFX9-NEXT: v_and_b32_e32 v4, 0xff800000, v0
+; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v0
; GFX9-NEXT: v_add3_u32 v1, v1, v0, s4
-; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v4
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v4, vcc
; GFX9-NEXT: global_store_short_d16_hi v[2:3], v0, off
@@ -2212,10 +2210,9 @@ define void @test_load_store_f32_to_bf16(ptr addrspace(1) %in, ptr addrspace(1)
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: global_load_dword v0, v[0:1], off
-; GFX10-NEXT: s_mov_b32 s4, 0xff800000
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_bfe_u32 v1, v0, 16, 1
-; GFX10-NEXT: v_and_or_b32 v4, v0, s4, 0x400000
+; GFX10-NEXT: v_or_b32_e32 v4, 0x400000, v0
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
; GFX10-NEXT: v_add3_u32 v1, v1, v0, 0x7fff
; GFX10-NEXT: v_cndmask_b32_e32 v0, v1, v4, vcc_lo
@@ -2226,10 +2223,9 @@ define void @test_load_store_f32_to_bf16(ptr addrspace(1) %in, ptr addrspace(1)
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: global_load_b32 v0, v[0:1], off
-; GFX11-NEXT: s_mov_b32 s0, 0xff800000
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_bfe_u32 v1, v0, 16, 1
-; GFX11-NEXT: v_and_or_b32 v4, v0, s0, 0x400000
+; GFX11-NEXT: v_or_b32_e32 v4, 0x400000, v0
; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_add3_u32 v1, v1, v0, 0x7fff
@@ -2294,7 +2290,6 @@ define void @test_load_store_f64_to_bf16(ptr addrspace(1) %in, ptr addrspace(1)
; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v5
; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4
; GFX8-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[0:1]
-; GFX8-NEXT: v_and_b32_e32 v5, 0xff800000, v5
; GFX8-NEXT: v_or_b32_e32 v5, 0x400000, v5
; GFX8-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc
; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
@@ -2323,7 +2318,6 @@ define void @test_load_store_f64_to_bf16(ptr addrspace(1) %in, ptr addrspace(1)
; GFX9-NEXT: v_and_or_b32 v5, v1, s8, v4
; GFX9-NEXT: v_bfe_u32 v4, v4, 16, 1
; GFX9-NEXT: v_add3_u32 v4, v4, v5, s9
-; GFX9-NEXT: v_and_b32_e32 v5, 0xff800000, v5
; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v5
; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc
; GFX9-NEXT: global_store_short_d16_hi v[2:3], v0, off
@@ -2343,14 +2337,13 @@ define void @test_load_store_f64_to_bf16(ptr addrspace(1) %in, ptr addrspace(1)
; GFX10-NEXT: v_cmp_nlg_f64_e64 s4, |v[0:1]|, v[4:5]
; GFX10-NEXT: v_cndmask_b32_e64 v4, -1, 1, s5
; GFX10-NEXT: s_or_b32 vcc_lo, s4, vcc_lo
-; GFX10-NEXT: s_mov_b32 s4, 0xff800000
; GFX10-NEXT: v_add_nc_u32_e32 v4, v6, v4
; GFX10-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc_lo
; GFX10-NEXT: v_cmp_u_f64_e32 vcc_lo, v[0:1], v[0:1]
; GFX10-NEXT: v_and_or_b32 v5, 0x80000000, v1, v4
; GFX10-NEXT: v_bfe_u32 v4, v4, 16, 1
; GFX10-NEXT: v_add3_u32 v4, v4, v5, 0x7fff
-; GFX10-NEXT: v_and_or_b32 v5, v5, s4, 0x400000
+; GFX10-NEXT: v_or_b32_e32 v5, 0x400000, v5
; GFX10-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc_lo
; GFX10-NEXT: global_store_short_d16_hi v[2:3], v0, off
; GFX10-NEXT: s_setpc_b64 s[30:31]
@@ -2369,9 +2362,8 @@ define void @test_load_store_f64_to_bf16(ptr addrspace(1) %in, ptr addrspace(1)
; GFX11-NEXT: v_cmp_gt_f64_e64 s1, |v[0:1]|, v[4:5]
; GFX11-NEXT: v_cmp_nlg_f64_e64 s0, |v[0:1]|, v[4:5]
; GFX11-NEXT: v_cndmask_b32_e64 v4, -1, 1, s1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: s_or_b32 vcc_lo, s0, vcc_lo
-; GFX11-NEXT: s_mov_b32 s0, 0xff800000
; GFX11-NEXT: v_add_nc_u32_e32 v4, v6, v4
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc_lo
@@ -2380,7 +2372,7 @@ define void @test_load_store_f64_to_bf16(ptr addrspace(1) %in, ptr addrspace(1)
; GFX11-NEXT: v_bfe_u32 v4, v4, 16, 1
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_add3_u32 v4, v4, v5, 0x7fff
-; GFX11-NEXT: v_and_or_b32 v5, v5, s0, 0x400000
+; GFX11-NEXT: v_or_b32_e32 v5, 0x400000, v5
; GFX11-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc_lo
; GFX11-NEXT: global_store_d16_hi_b16 v[2:3], v0, off
; GFX11-NEXT: s_setpc_b64 s[30:31]
@@ -8999,8 +8991,7 @@ define bfloat @v_fadd_bf16(bfloat %a, bfloat %b) {
; GFX8-NEXT: v_bfe_u32 v1, v0, 16, 1
; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v0
; GFX8-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1
-; GFX8-NEXT: v_and_b32_e32 v2, 0xff800000, v0
-; GFX8-NEXT: v_or_b32_e32 v2, 0x400000, v2
+; GFX8-NEXT: v_or_b32_e32 v2, 0x400000, v0
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
; GFX8-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
@@ -9014,9 +9005,8 @@ define bfloat @v_fadd_bf16(bfloat %a, bfloat %b) {
; GFX9-NEXT: v_add_f32_e32 v0, v0, v1
; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1
; GFX9-NEXT: s_movk_i32 s4, 0x7fff
-; GFX9-NEXT: v_and_b32_e32 v2, 0xff800000, v0
; GFX9-NEXT: v_add3_u32 v1, v1, v0, s4
-; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v2
+; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v0
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0
@@ -9027,10 +9017,9 @@ define bfloat @v_fadd_bf16(bfloat %a, bfloat %b) {
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GFX10-NEXT: s_mov_b32 s4, 0xff800000
; GFX10-NEXT: v_add_f32_e32 v0, v0, v1
; GFX10-NEXT: v_bfe_u32 v1, v0, 16, 1
-; GFX10-NEXT: v_and_or_b32 v2, v0, s4, 0x400000
+; GFX10-NEXT: v_or_b32_e32 v2, 0x400000, v0
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
; GFX10-NEXT: v_add3_u32 v1, v1, v0, 0x7fff
; GFX10-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo
@@ -9042,11 +9031,10 @@ define bfloat @v_fadd_bf16(bfloat %a, bfloat %b) {
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GFX11-NEXT: s_mov_b32 s0, 0xff800000
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_add_f32_e32 v0, v0, v1
; GFX11-NEXT: v_bfe_u32 v1, v0, 16, 1
-; GFX11-NEXT: v_and_or_b32 v2, v0, s0, 0x400000
+; GFX11-NEXT: v_or_b32_e32 v2, 0x400000, v0
; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_add3_u32 v1, v1, v0, 0x7fff
@@ -9104,16 +9092,14 @@ define <2 x bfloat> @v_fadd_v2bf16(<2 x bfloat> %a, <2 x bfloat> %b) {
; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
-; GFX8-NEXT: v_and_b32_e32 v4, 0xff800000, v2
; GFX8-NEXT: v_add_f32_e32 v0, v0, v1
-; GFX8-NEXT: v_or_b32_e32 v4, 0x400000, v4
+; GFX8-NEXT: v_or_b32_e32 v4, 0x400000, v2
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
; GFX8-NEXT: v_bfe_u32 v1, v0, 16, 1
; GFX8-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v0
; GFX8-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1
-; GFX8-NEXT: v_and_b32_e32 v3, 0xff800000, v0
-; GFX8-NEXT: v_or_b32_e32 v3, 0x400000, v3
+; GFX8-NEXT: v_or_b32_e32 v3, 0x400000, v0
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
; GFX8-NEXT: v_cndmask_b32_e32 v0, v1, v3, vcc
; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
@@ -9126,20 +9112,18 @@ define <2 x bfloat> @v_fadd_v2bf16(<2 x bfloat> %a, <2 x bfloat> %b) {
; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v1
; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v0
; GFX9-NEXT: v_add_f32_e32 v2, v3, v2
-; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1
-; GFX9-NEXT: s_movk_i32 s4, 0x7fff
-; GFX9-NEXT: v_and_b32_e32 v4, 0xff800000, v2
; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1
+; GFX9-NEXT: s_movk_i32 s4, 0x7fff
+; GFX9-NEXT: v_add_f32_e32 v0, v0, v1
; GFX9-NEXT: v_add3_u32 v3, v3, v2, s4
-; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v4
+; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
-; GFX9-NEXT: v_add_f32_e32 v0, v0, v1
-; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1
-; GFX9-NEXT: v_and_b32_e32 v3, 0xff800000, v0
+; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
; GFX9-NEXT: v_add3_u32 v1, v1, v0, s4
-; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v3
+; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v0
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v3, vcc
; GFX9-NEXT: s_mov_b32 s4, 0x7060302
@@ -9153,14 +9137,13 @@ define <2 x bfloat> @v_fadd_v2bf16(<2 x bfloat> %a, <2 x bfloat> %b) {
; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v0
; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GFX10-NEXT: s_mov_b32 s4, 0xff800000
; GFX10-NEXT: v_add_f32_e32 v2, v3, v2
; GFX10-NEXT: v_add_f32_e32 v0, v0, v1
; GFX10-NEXT: v_bfe_u32 v1, v2, 16, 1
-; GFX10-NEXT: v_and_or_b32 v4, v2, s4, 0x400000
+; GFX10-NEXT: v_or_b32_e32 v4, 0x400000, v2
; GFX10-NEXT: v_bfe_u32 v3, v0, 16, 1
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
-; GFX10-NEXT: v_and_or_b32 v5, v0, s4, 0x400000
+; GFX10-NEXT: v_or_b32_e32 v5, 0x400000, v0
; GFX10-NEXT: v_add3_u32 v1, v1, v2, 0x7fff
; GFX10-NEXT: v_add3_u32 v3, v3, v0, 0x7fff
; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc_lo
@@ -9176,16 +9159,15 @@ define <2 x bfloat> @v_fadd_v2bf16(<2 x bfloat> %a, <2 x bfloat> %b) {
; GFX11-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v0
; GFX11-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GFX11-NEXT: s_mov_b32 s0, 0xff800000
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX11-NEXT: v_add_f32_e32 v0, v0, v1
; GFX11-NEXT: v_add_f32_e32 v2, v3, v2
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_bfe_u32 v3, v0, 16, 1
; GFX11-NEXT: v_bfe_u32 v1, v2, 16, 1
-; GFX11-NEXT: v_and_or_b32 v4, v2, s0, 0x400000
+; GFX11-NEXT: v_or_b32_e32 v4, 0x400000, v2
; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
-; GFX11-NEXT: v_and_or_b32 v5, v0, s0, 0x400000
+; GFX11-NEXT: v_or_b32_e32 v5, 0x400000, v0
; GFX11-NEXT: v_add3_u32 v3, v3, v0, 0x7fff
; GFX11-NEXT: v_add3_u32 v1, v1, v2, 0x7fff
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_4)
@@ -9255,8 +9237,7 @@ define <3 x bfloat> @v_fadd_v3bf16(<3 x bfloat> %a, <3 x bfloat> %b) {
; GFX8-NEXT: v_bfe_u32 v3, v1, 16, 1
; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v1
; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
-; GFX8-NEXT: v_and_b32_e32 v4, 0xff800000, v1
-; GFX8-NEXT: v_or_b32_e32 v4, 0x400000, v4
+; GFX8-NEXT: v_or_b32_e32 v4, 0x400000, v1
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
; GFX8-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc
; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v2
@@ -9268,16 +9249,14 @@ define <3 x bfloat> @v_fadd_v3bf16(<3 x bfloat> %a, <3 x bfloat> %b) {
; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX8-NEXT: v_add_u32_e32 v4, vcc, s4, v4
-; GFX8-NEXT: v_and_b32_e32 v5, 0xff800000, v3
; GFX8-NEXT: v_add_f32_e32 v0, v0, v2
-; GFX8-NEXT: v_or_b32_e32 v5, 0x400000, v5
+; GFX8-NEXT: v_or_b32_e32 v5, 0x400000, v3
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
; GFX8-NEXT: v_bfe_u32 v2, v0, 16, 1
; GFX8-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc
; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v0
; GFX8-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2
-; GFX8-NEXT: v_and_b32_e32 v4, 0xff800000, v0
-; GFX8-NEXT: v_or_b32_e32 v4, 0x400000, v4
+; GFX8-NEXT: v_or_b32_e32 v4, 0x400000, v0
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc
; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
@@ -9293,27 +9272,24 @@ define <3 x bfloat> @v_fadd_v3bf16(<3 x bfloat> %a, <3 x bfloat> %b) {
; GFX9-NEXT: v_add_f32_e32 v1, v1, v3
; GFX9-NEXT: v_bfe_u32 v3, v1, 16, 1
; GFX9-NEXT: s_movk_i32 s4, 0x7fff
-; GFX9-NEXT: v_and_b32_e32 v4, 0xff800000, v1
; GFX9-NEXT: v_add3_u32 v3, v3, v1, s4
-; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v4
+; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v1
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc
; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v2
; GFX9-NEXT: v_lshlrev_b32_e32 v4, 16, v0
; GFX9-NEXT: v_add_f32_e32 v3, v4, v3
-; GFX9-NEXT: v_bfe_u32 v4, v3, 16, 1
-; GFX9-NEXT: v_and_b32_e32 v5, 0xff800000, v3
; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX9-NEXT: v_bfe_u32 v4, v3, 16, 1
+; GFX9-NEXT: v_add_f32_e32 v0, v0, v2
; GFX9-NEXT: v_add3_u32 v4, v4, v3, s4
-; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v5
+; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v3
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
-; GFX9-NEXT: v_add_f32_e32 v0, v0, v2
-; GFX9-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc
; GFX9-NEXT: v_bfe_u32 v2, v0, 16, 1
-; GFX9-NEXT: v_and_b32_e32 v4, 0xff800000, v0
+; GFX9-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc
; GFX9-NEXT: v_add3_u32 v2, v2, v0, s4
-; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v4
+; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v0
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc
; GFX9-NEXT: s_mov_b32 s4, 0x7060302
@@ -9331,18 +9307,17 @@ define <3 x bfloat> @v_fadd_v3bf16(<3 x bfloat> %a, <3 x bfloat> %b) {
; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX10-NEXT: v_add_f32_e32 v4, v5, v4
-; GFX10-NEXT: s_mov_b32 s4, 0xff800000
; GFX10-NEXT: v_add_f32_e32 v0, v0, v2
; GFX10-NEXT: v_add_f32_e32 v1, v1, v3
; GFX10-NEXT: v_bfe_u32 v2, v4, 16, 1
-; GFX10-NEXT: v_and_or_b32 v7, v4, s4, 0x400000
+; GFX10-NEXT: v_or_b32_e32 v7, 0x400000, v4
; GFX10-NEXT: v_bfe_u32 v5, v0, 16, 1
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
; GFX10-NEXT: v_bfe_u32 v3, v1, 16, 1
; GFX10-NEXT: v_add3_u32 v2, v2, v4, 0x7fff
-; GFX10-NEXT: v_and_or_b32 v8, v0, s4, 0x400000
+; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v0
; GFX10-NEXT: v_add3_u32 v5, v5, v0, 0x7fff
-; GFX10-NEXT: v_and_or_b32 v6, v1, s4, 0x400000
+; GFX10-NEXT: v_or_b32_e32 v6, 0x400000, v1
; GFX10-NEXT: v_add3_u32 v3, v3, v1, 0x7fff
; GFX10-NEXT: v_cndmask_b32_e32 v2, v2, v7, vcc_lo
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
@@ -9426,17 +9401,15 @@ define <4 x bfloat> @v_fadd_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b) {
; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX8-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5
-; GFX8-NEXT: v_and_b32_e32 v6, 0xff800000, v4
; GFX8-NEXT: v_add_f32_e32 v1, v1, v3
-; GFX8-NEXT: v_or_b32_e32 v6, 0x400000, v6
+; GFX8-NEXT: v_or_b32_e32 v6, 0x400000, v4
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
; GFX8-NEXT: v_bfe_u32 v3, v1, 16, 1
; GFX8-NEXT: s_movk_i32 s4, 0x7fff
; GFX8-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc
; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v1
; GFX8-NEXT: v_add_u32_e32 v3, vcc, s4, v3
-; GFX8-NEXT: v_and_b32_e32 v5, 0xff800000, v1
-; GFX8-NEXT: v_or_b32_e32 v5, 0x400000, v5
+; GFX8-NEXT: v_or_b32_e32 v5, 0x400000, v1
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
; GFX8-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc
; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v2
@@ -9447,16 +9420,14 @@ define <4 x bfloat> @v_fadd_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b) {
; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX8-NEXT: v_add_u32_e32 v5, vcc, s4, v5
-; GFX8-NEXT: v_and_b32_e32 v6, 0xff800000, v3
; GFX8-NEXT: v_add_f32_e32 v0, v0, v2
-; GFX8-NEXT: v_or_b32_e32 v6, 0x400000, v6
+; GFX8-NEXT: v_or_b32_e32 v6, 0x400000, v3
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
; GFX8-NEXT: v_bfe_u32 v2, v0, 16, 1
; GFX8-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc
; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v0
; GFX8-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2
-; GFX8-NEXT: v_and_b32_e32 v5, 0xff800000, v0
-; GFX8-NEXT: v_or_b32_e32 v5, 0x400000, v5
+; GFX8-NEXT: v_or_b32_e32 v5, 0x400000, v0
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v5, vcc
; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1
@@ -9471,38 +9442,34 @@ define <4 x bfloat> @v_fadd_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b) {
; GFX9-NEXT: v_lshlrev_b32_e32 v4, 16, v3
; GFX9-NEXT: v_lshlrev_b32_e32 v5, 16, v1
; GFX9-NEXT: v_add_f32_e32 v4, v5, v4
-; GFX9-NEXT: v_bfe_u32 v5, v4, 16, 1
-; GFX9-NEXT: s_movk_i32 s4, 0x7fff
-; GFX9-NEXT: v_and_b32_e32 v6, 0xff800000, v4
; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GFX9-NEXT: v_bfe_u32 v5, v4, 16, 1
+; GFX9-NEXT: s_movk_i32 s4, 0x7fff
+; GFX9-NEXT: v_add_f32_e32 v1, v1, v3
; GFX9-NEXT: v_add3_u32 v5, v5, v4, s4
-; GFX9-NEXT: v_or_b32_e32 v6, 0x400000, v6
+; GFX9-NEXT: v_or_b32_e32 v6, 0x400000, v4
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
-; GFX9-NEXT: v_add_f32_e32 v1, v1, v3
-; GFX9-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc
; GFX9-NEXT: v_bfe_u32 v3, v1, 16, 1
-; GFX9-NEXT: v_and_b32_e32 v5, 0xff800000, v1
+; GFX9-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc
; GFX9-NEXT: v_add3_u32 v3, v3, v1, s4
-; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v5
+; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v1
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc
; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v2
; GFX9-NEXT: v_lshlrev_b32_e32 v5, 16, v0
; GFX9-NEXT: v_add_f32_e32 v3, v5, v3
-; GFX9-NEXT: v_bfe_u32 v5, v3, 16, 1
-; GFX9-NEXT: v_and_b32_e32 v6, 0xff800000, v3
; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX9-NEXT: v_bfe_u32 v5, v3, 16, 1
+; GFX9-NEXT: v_add_f32_e32 v0, v0, v2
; GFX9-NEXT: v_add3_u32 v5, v5, v3, s4
-; GFX9-NEXT: v_or_b32_e32 v6, 0x400000, v6
+; GFX9-NEXT: v_or_b32_e32 v6, 0x400000, v3
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
-; GFX9-NEXT: v_add_f32_e32 v0, v0, v2
-; GFX9-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc
; GFX9-NEXT: v_bfe_u32 v2, v0, 16, 1
-; GFX9-NEXT: v_and_b32_e32 v5, 0xff800000, v0
+; GFX9-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc
; GFX9-NEXT: v_add3_u32 v2, v2, v0, s4
-; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v5
+; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v0
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v5, vcc
; GFX9-NEXT: s_mov_b32 s4, 0x7060302
@@ -9523,31 +9490,30 @@ define <4 x bfloat> @v_fadd_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b) {
; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX10-NEXT: v_add_f32_e32 v1, v1, v3
-; GFX10-NEXT: v_add_f32_e32 v5, v7, v6
-; GFX10-NEXT: v_bfe_u32 v3, v4, 16, 1
-; GFX10-NEXT: s_mov_b32 s4, 0xff800000
+; GFX10-NEXT: v_add_f32_e32 v3, v7, v6
+; GFX10-NEXT: v_bfe_u32 v5, v4, 16, 1
+; GFX10-NEXT: v_or_b32_e32 v7, 0x400000, v4
; GFX10-NEXT: v_add_f32_e32 v0, v0, v2
-; GFX10-NEXT: v_and_or_b32 v6, v4, s4, 0x400000
-; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1
-; GFX10-NEXT: v_add3_u32 v3, v3, v4, 0x7fff
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
-; GFX10-NEXT: v_bfe_u32 v8, v0, 16, 1
+; GFX10-NEXT: v_bfe_u32 v6, v3, 16, 1
+; GFX10-NEXT: v_add3_u32 v5, v5, v4, 0x7fff
; GFX10-NEXT: v_bfe_u32 v2, v1, 16, 1
-; GFX10-NEXT: v_add3_u32 v4, v7, v5, 0x7fff
-; GFX10-NEXT: v_and_or_b32 v9, v1, s4, 0x400000
-; GFX10-NEXT: v_cndmask_b32_e32 v3, v3, v6, vcc_lo
-; GFX10-NEXT: v_and_or_b32 v6, v5, s4, 0x400000
-; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX10-NEXT: v_bfe_u32 v8, v0, 16, 1
+; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v1
+; GFX10-NEXT: v_add3_u32 v6, v6, v3, 0x7fff
+; GFX10-NEXT: v_cndmask_b32_e32 v4, v5, v7, vcc_lo
+; GFX10-NEXT: v_or_b32_e32 v5, 0x400000, v3
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
; GFX10-NEXT: v_add3_u32 v7, v8, v0, 0x7fff
-; GFX10-NEXT: v_and_or_b32 v8, v0, s4, 0x400000
+; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v0
; GFX10-NEXT: v_add3_u32 v2, v2, v1, 0x7fff
-; GFX10-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e32 v3, v6, v5, vcc_lo
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
; GFX10-NEXT: v_cndmask_b32_e32 v0, v7, v8, vcc_lo
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
-; GFX10-NEXT: v_perm_b32 v0, v0, v4, 0x7060302
+; GFX10-NEXT: v_perm_b32 v0, v0, v3, 0x7060302
; GFX10-NEXT: v_cndmask_b32_e32 v1, v2, v9, vcc_lo
-; GFX10-NEXT: v_perm_b32 v1, v1, v3, 0x7060302
+; GFX10-NEXT: v_perm_b32 v1, v1, v4, 0x7060302
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: v_fadd_v4bf16:
@@ -9555,45 +9521,42 @@ define <4 x bfloat> @v_fadd_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b) {
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_lshlrev_b32_e32 v6, 16, v2
; GFX11-NEXT: v_lshlrev_b32_e32 v7, 16, v0
-; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v3
; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX11-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v3
; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v1
-; GFX11-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; GFX11-NEXT: s_mov_b32 s0, 0xff800000
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_dual_add_f32 v0, v0, v2 :: v_dual_and_b32 v3, 0xffff0000, v3
-; GFX11-NEXT: v_add_f32_e32 v4, v5, v4
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_dual_add_f32 v0, v0, v2 :: v_dual_and_b32 v1, 0xffff0000, v1
+; GFX11-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
; GFX11-NEXT: v_bfe_u32 v8, v0, 16, 1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_add_f32_e32 v1, v1, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3)
-; GFX11-NEXT: v_bfe_u32 v3, v4, 16, 1
-; GFX11-NEXT: v_add_f32_e32 v5, v7, v6
-; GFX11-NEXT: v_and_or_b32 v6, v4, s0, 0x400000
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX11-NEXT: v_dual_add_f32 v3, v7, v6 :: v_dual_add_f32 v4, v5, v4
; GFX11-NEXT: v_bfe_u32 v2, v1, 16, 1
-; GFX11-NEXT: v_add3_u32 v3, v3, v4, 0x7fff
-; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1
-; GFX11-NEXT: v_and_or_b32 v9, v1, s0, 0x400000
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT: v_bfe_u32 v6, v3, 16, 1
+; GFX11-NEXT: v_bfe_u32 v5, v4, 16, 1
+; GFX11-NEXT: v_or_b32_e32 v7, 0x400000, v4
+; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
; GFX11-NEXT: v_add3_u32 v2, v2, v1, 0x7fff
-; GFX11-NEXT: v_cndmask_b32_e32 v3, v3, v6, vcc_lo
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_add3_u32 v4, v7, v5, 0x7fff
-; GFX11-NEXT: v_and_or_b32 v6, v5, s0, 0x400000
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-NEXT: v_add3_u32 v6, v6, v3, 0x7fff
+; GFX11-NEXT: v_add3_u32 v5, v5, v4, 0x7fff
+; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_4)
+; GFX11-NEXT: v_cndmask_b32_e32 v4, v5, v7, vcc_lo
+; GFX11-NEXT: v_or_b32_e32 v5, 0x400000, v3
+; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
; GFX11-NEXT: v_add3_u32 v7, v8, v0, 0x7fff
-; GFX11-NEXT: v_and_or_b32 v8, v0, s0, 0x400000
-; GFX11-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc_lo
+; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v0
+; GFX11-NEXT: v_cndmask_b32_e32 v3, v6, v5, vcc_lo
; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_cndmask_b32_e32 v0, v7, v8, vcc_lo
; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-NEXT: v_perm_b32 v0, v0, v3, 0x7060302
; GFX11-NEXT: v_cndmask_b32_e32 v1, v2, v9, vcc_lo
-; GFX11-NEXT: v_perm_b32 v0, v0, v4, 0x7060302
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX11-NEXT: v_perm_b32 v1, v1, v3, 0x7060302
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_perm_b32 v1, v1, v4, 0x7060302
; GFX11-NEXT: s_setpc_b64 s[30:31]
%op = fadd <4 x bfloat> %a, %b
ret <4 x bfloat> %op
@@ -9717,17 +9680,15 @@ define <8 x bfloat> @v_fadd_v8bf16(<8 x bfloat> %a, <8 x bfloat> %b) {
; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9
-; GFX8-NEXT: v_and_b32_e32 v10, 0xff800000, v8
; GFX8-NEXT: v_add_f32_e32 v3, v3, v7
-; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v10
+; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v8
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v8, v8
; GFX8-NEXT: v_bfe_u32 v7, v3, 16, 1
; GFX8-NEXT: s_movk_i32 s4, 0x7fff
; GFX8-NEXT: v_cndmask_b32_e32 v8, v9, v10, vcc
; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v3
; GFX8-NEXT: v_add_u32_e32 v7, vcc, s4, v7
-; GFX8-NEXT: v_and_b32_e32 v9, 0xff800000, v3
-; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v9
+; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v3
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
; GFX8-NEXT: v_cndmask_b32_e32 v3, v7, v9, vcc
; GFX8-NEXT: v_lshlrev_b32_e32 v7, 16, v6
@@ -9738,16 +9699,14 @@ define <8 x bfloat> @v_fadd_v8bf16(<8 x bfloat> %a, <8 x bfloat> %b) {
; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX8-NEXT: v_add_u32_e32 v9, vcc, s4, v9
-; GFX8-NEXT: v_and_b32_e32 v10, 0xff800000, v7
; GFX8-NEXT: v_add_f32_e32 v2, v2, v6
-; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v10
+; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v7
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v7, v7
; GFX8-NEXT: v_bfe_u32 v6, v2, 16, 1
; GFX8-NEXT: v_cndmask_b32_e32 v7, v9, v10, vcc
; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v2
; GFX8-NEXT: v_add_u32_e32 v6, vcc, s4, v6
-; GFX8-NEXT: v_and_b32_e32 v9, 0xff800000, v2
-; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v9
+; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v2
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
; GFX8-NEXT: v_cndmask_b32_e32 v2, v6, v9, vcc
; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v5
@@ -9758,16 +9717,14 @@ define <8 x bfloat> @v_fadd_v8bf16(<8 x bfloat> %a, <8 x bfloat> %b) {
; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX8-NEXT: v_add_u32_e32 v9, vcc, s4, v9
-; GFX8-NEXT: v_and_b32_e32 v10, 0xff800000, v6
; GFX8-NEXT: v_add_f32_e32 v1, v1, v5
-; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v10
+; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v6
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
; GFX8-NEXT: v_bfe_u32 v5, v1, 16, 1
; GFX8-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc
; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v1
; GFX8-NEXT: v_add_u32_e32 v5, vcc, s4, v5
-; GFX8-NEXT: v_and_b32_e32 v9, 0xff800000, v1
-; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v9
+; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v1
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
; GFX8-NEXT: v_cndmask_b32_e32 v1, v5, v9, vcc
; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v4
@@ -9778,16 +9735,14 @@ define <8 x bfloat> @v_fadd_v8bf16(<8 x bfloat> %a, <8 x bfloat> %b) {
; GFX8-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX8-NEXT: v_add_u32_e32 v9, vcc, s4, v9
-; GFX8-NEXT: v_and_b32_e32 v10, 0xff800000, v5
; GFX8-NEXT: v_add_f32_e32 v0, v0, v4
-; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v10
+; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v5
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
; GFX8-NEXT: v_bfe_u32 v4, v0, 16, 1
; GFX8-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc
; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v0
; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4
-; GFX8-NEXT: v_and_b32_e32 v9, 0xff800000, v0
-; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v9
+; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v0
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
; GFX8-NEXT: v_cndmask_b32_e32 v0, v4, v9, vcc
; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v3
@@ -9806,74 +9761,66 @@ define <8 x bfloat> @v_fadd_v8bf16(<8 x bfloat> %a, <8 x bfloat> %b) {
; GFX9-NEXT: v_lshlrev_b32_e32 v8, 16, v7
; GFX9-NEXT: v_lshlrev_b32_e32 v9, 16, v3
; GFX9-NEXT: v_add_f32_e32 v8, v9, v8
-; GFX9-NEXT: v_bfe_u32 v9, v8, 16, 1
-; GFX9-NEXT: s_movk_i32 s4, 0x7fff
-; GFX9-NEXT: v_and_b32_e32 v10, 0xff800000, v8
; GFX9-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; GFX9-NEXT: v_bfe_u32 v9, v8, 16, 1
+; GFX9-NEXT: s_movk_i32 s4, 0x7fff
+; GFX9-NEXT: v_add_f32_e32 v3, v3, v7
; GFX9-NEXT: v_add3_u32 v9, v9, v8, s4
-; GFX9-NEXT: v_or_b32_e32 v10, 0x400000, v10
+; GFX9-NEXT: v_or_b32_e32 v10, 0x400000, v8
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v8, v8
-; GFX9-NEXT: v_add_f32_e32 v3, v3, v7
-; GFX9-NEXT: v_cndmask_b32_e32 v8, v9, v10, vcc
; GFX9-NEXT: v_bfe_u32 v7, v3, 16, 1
-; GFX9-NEXT: v_and_b32_e32 v9, 0xff800000, v3
+; GFX9-NEXT: v_cndmask_b32_e32 v8, v9, v10, vcc
; GFX9-NEXT: v_add3_u32 v7, v7, v3, s4
-; GFX9-NEXT: v_or_b32_e32 v9, 0x400000, v9
+; GFX9-NEXT: v_or_b32_e32 v9, 0x400000, v3
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
; GFX9-NEXT: v_cndmask_b32_e32 v3, v7, v9, vcc
; GFX9-NEXT: v_lshlrev_b32_e32 v7, 16, v6
; GFX9-NEXT: v_lshlrev_b32_e32 v9, 16, v2
; GFX9-NEXT: v_add_f32_e32 v7, v9, v7
-; GFX9-NEXT: v_bfe_u32 v9, v7, 16, 1
-; GFX9-NEXT: v_and_b32_e32 v10, 0xff800000, v7
; GFX9-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX9-NEXT: v_bfe_u32 v9, v7, 16, 1
+; GFX9-NEXT: v_add_f32_e32 v2, v2, v6
; GFX9-NEXT: v_add3_u32 v9, v9, v7, s4
-; GFX9-NEXT: v_or_b32_e32 v10, 0x400000, v10
+; GFX9-NEXT: v_or_b32_e32 v10, 0x400000, v7
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v7, v7
-; GFX9-NEXT: v_add_f32_e32 v2, v2, v6
-; GFX9-NEXT: v_cndmask_b32_e32 v7, v9, v10, vcc
; GFX9-NEXT: v_bfe_u32 v6, v2, 16, 1
-; GFX9-NEXT: v_and_b32_e32 v9, 0xff800000, v2
+; GFX9-NEXT: v_cndmask_b32_e32 v7, v9, v10, vcc
; GFX9-NEXT: v_add3_u32 v6, v6, v2, s4
-; GFX9-NEXT: v_or_b32_e32 v9, 0x400000, v9
+; GFX9-NEXT: v_or_b32_e32 v9, 0x400000, v2
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
; GFX9-NEXT: v_cndmask_b32_e32 v2, v6, v9, vcc
; GFX9-NEXT: v_lshlrev_b32_e32 v6, 16, v5
; GFX9-NEXT: v_lshlrev_b32_e32 v9, 16, v1
; GFX9-NEXT: v_add_f32_e32 v6, v9, v6
-; GFX9-NEXT: v_bfe_u32 v9, v6, 16, 1
-; GFX9-NEXT: v_and_b32_e32 v10, 0xff800000, v6
; GFX9-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GFX9-NEXT: v_bfe_u32 v9, v6, 16, 1
+; GFX9-NEXT: v_add_f32_e32 v1, v1, v5
; GFX9-NEXT: v_add3_u32 v9, v9, v6, s4
-; GFX9-NEXT: v_or_b32_e32 v10, 0x400000, v10
+; GFX9-NEXT: v_or_b32_e32 v10, 0x400000, v6
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
-; GFX9-NEXT: v_add_f32_e32 v1, v1, v5
-; GFX9-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc
; GFX9-NEXT: v_bfe_u32 v5, v1, 16, 1
-; GFX9-NEXT: v_and_b32_e32 v9, 0xff800000, v1
+; GFX9-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc
; GFX9-NEXT: v_add3_u32 v5, v5, v1, s4
-; GFX9-NEXT: v_or_b32_e32 v9, 0x400000, v9
+; GFX9-NEXT: v_or_b32_e32 v9, 0x400000, v1
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v9, vcc
; GFX9-NEXT: v_lshlrev_b32_e32 v5, 16, v4
; GFX9-NEXT: v_lshlrev_b32_e32 v9, 16, v0
; GFX9-NEXT: v_add_f32_e32 v5, v9, v5
-; GFX9-NEXT: v_bfe_u32 v9, v5, 16, 1
-; GFX9-NEXT: v_and_b32_e32 v10, 0xff800000, v5
; GFX9-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX9-NEXT: v_bfe_u32 v9, v5, 16, 1
+; GFX9-NEXT: v_add_f32_e32 v0, v0, v4
; GFX9-NEXT: v_add3_u32 v9, v9, v5, s4
-; GFX9-NEXT: v_or_b32_e32 v10, 0x400000, v10
+; GFX9-NEXT: v_or_b32_e32 v10, 0x400000, v5
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
-; GFX9-NEXT: v_add_f32_e32 v0, v0, v4
-; GFX9-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc
; GFX9-NEXT: v_bfe_u32 v4, v0, 16, 1
-; GFX9-NEXT: v_and_b32_e32 v9, 0xff800000, v0
+; GFX9-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc
; GFX9-NEXT: v_add3_u32 v4, v4, v0, s4
-; GFX9-NEXT: v_or_b32_e32 v9, 0x400000, v9
+; GFX9-NEXT: v_or_b32_e32 v9, 0x400000, v0
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v9, vcc
; GFX9-NEXT: s_mov_b32 s4, 0x7060302
@@ -9890,62 +9837,61 @@ define <8 x bfloat> @v_fadd_v8bf16(<8 x bfloat> %a, <8 x bfloat> %b) {
; GFX10-NEXT: v_lshlrev_b32_e32 v9, 16, v3
; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
-; GFX10-NEXT: v_lshlrev_b32_e32 v11, 16, v2
+; GFX10-NEXT: v_lshlrev_b32_e32 v10, 16, v2
; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX10-NEXT: v_add_f32_e32 v8, v9, v8
; GFX10-NEXT: v_lshlrev_b32_e32 v9, 16, v6
-; GFX10-NEXT: v_add_f32_e32 v3, v3, v7
; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
-; GFX10-NEXT: s_mov_b32 s4, 0xff800000
-; GFX10-NEXT: v_bfe_u32 v10, v8, 16, 1
-; GFX10-NEXT: v_and_or_b32 v7, v8, s4, 0x400000
-; GFX10-NEXT: v_add_f32_e32 v9, v11, v9
-; GFX10-NEXT: v_bfe_u32 v11, v3, 16, 1
+; GFX10-NEXT: v_add_f32_e32 v3, v3, v7
+; GFX10-NEXT: v_lshlrev_b32_e32 v15, 16, v0
+; GFX10-NEXT: v_bfe_u32 v11, v8, 16, 1
+; GFX10-NEXT: v_add_f32_e32 v7, v10, v9
+; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v8
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8
-; GFX10-NEXT: v_add3_u32 v10, v10, v8, 0x7fff
; GFX10-NEXT: v_add_f32_e32 v2, v2, v6
-; GFX10-NEXT: v_bfe_u32 v8, v9, 16, 1
+; GFX10-NEXT: v_add3_u32 v10, v11, v8, 0x7fff
+; GFX10-NEXT: v_bfe_u32 v11, v3, 16, 1
+; GFX10-NEXT: v_bfe_u32 v12, v7, 16, 1
; GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v5
-; GFX10-NEXT: v_and_or_b32 v12, v9, s4, 0x400000
-; GFX10-NEXT: v_cndmask_b32_e32 v7, v10, v7, vcc_lo
-; GFX10-NEXT: v_add3_u32 v10, v11, v3, 0x7fff
-; GFX10-NEXT: v_lshlrev_b32_e32 v11, 16, v1
; GFX10-NEXT: v_bfe_u32 v13, v2, 16, 1
-; GFX10-NEXT: v_add3_u32 v8, v8, v9, 0x7fff
-; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9
+; GFX10-NEXT: v_cndmask_b32_e32 v8, v10, v9, vcc_lo
+; GFX10-NEXT: v_lshlrev_b32_e32 v10, 16, v1
+; GFX10-NEXT: v_add3_u32 v9, v11, v3, 0x7fff
+; GFX10-NEXT: v_add3_u32 v11, v12, v7, 0x7fff
+; GFX10-NEXT: v_or_b32_e32 v12, 0x400000, v7
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
+; GFX10-NEXT: v_add_f32_e32 v6, v10, v6
+; GFX10-NEXT: v_add3_u32 v10, v13, v2, 0x7fff
; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
-; GFX10-NEXT: v_add_f32_e32 v6, v11, v6
-; GFX10-NEXT: v_add3_u32 v9, v13, v2, 0x7fff
; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX10-NEXT: v_lshlrev_b32_e32 v13, 16, v4
-; GFX10-NEXT: v_lshlrev_b32_e32 v15, 16, v0
-; GFX10-NEXT: v_cndmask_b32_e32 v8, v8, v12, vcc_lo
-; GFX10-NEXT: v_and_or_b32 v11, v2, s4, 0x400000
+; GFX10-NEXT: v_cndmask_b32_e32 v7, v11, v12, vcc_lo
+; GFX10-NEXT: v_or_b32_e32 v11, 0x400000, v2
; GFX10-NEXT: v_bfe_u32 v12, v6, 16, 1
; GFX10-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
; GFX10-NEXT: v_add_f32_e32 v1, v1, v5
; GFX10-NEXT: v_add_f32_e32 v5, v15, v13
-; GFX10-NEXT: v_and_or_b32 v14, v3, s4, 0x400000
+; GFX10-NEXT: v_or_b32_e32 v14, 0x400000, v3
; GFX10-NEXT: v_add_f32_e32 v0, v0, v4
-; GFX10-NEXT: v_cndmask_b32_e32 v2, v9, v11, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e32 v2, v10, v11, vcc_lo
; GFX10-NEXT: v_add3_u32 v4, v12, v6, 0x7fff
-; GFX10-NEXT: v_and_or_b32 v9, v6, s4, 0x400000
+; GFX10-NEXT: v_or_b32_e32 v10, 0x400000, v6
; GFX10-NEXT: v_bfe_u32 v11, v1, 16, 1
; GFX10-NEXT: v_bfe_u32 v12, v5, 16, 1
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
; GFX10-NEXT: v_bfe_u32 v13, v0, 16, 1
-; GFX10-NEXT: v_and_or_b32 v15, v1, s4, 0x400000
+; GFX10-NEXT: v_or_b32_e32 v15, 0x400000, v1
; GFX10-NEXT: v_add3_u32 v6, v11, v1, 0x7fff
-; GFX10-NEXT: v_and_or_b32 v11, v5, s4, 0x400000
-; GFX10-NEXT: v_cndmask_b32_e32 v4, v4, v9, vcc_lo
-; GFX10-NEXT: v_add3_u32 v9, v12, v5, 0x7fff
+; GFX10-NEXT: v_or_b32_e32 v11, 0x400000, v5
+; GFX10-NEXT: v_cndmask_b32_e32 v4, v4, v10, vcc_lo
+; GFX10-NEXT: v_add3_u32 v10, v12, v5, 0x7fff
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
; GFX10-NEXT: v_add3_u32 v12, v13, v0, 0x7fff
-; GFX10-NEXT: v_and_or_b32 v13, v0, s4, 0x400000
-; GFX10-NEXT: v_perm_b32 v2, v2, v8, 0x7060302
-; GFX10-NEXT: v_cndmask_b32_e32 v5, v9, v11, vcc_lo
+; GFX10-NEXT: v_or_b32_e32 v13, 0x400000, v0
+; GFX10-NEXT: v_perm_b32 v2, v2, v7, 0x7060302
+; GFX10-NEXT: v_cndmask_b32_e32 v5, v10, v11, vcc_lo
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
; GFX10-NEXT: v_cndmask_b32_e32 v0, v12, v13, vcc_lo
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
@@ -9953,81 +9899,80 @@ define <8 x bfloat> @v_fadd_v8bf16(<8 x bfloat> %a, <8 x bfloat> %b) {
; GFX10-NEXT: v_cndmask_b32_e32 v1, v6, v15, vcc_lo
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
; GFX10-NEXT: v_perm_b32 v1, v1, v4, 0x7060302
-; GFX10-NEXT: v_cndmask_b32_e32 v3, v10, v14, vcc_lo
-; GFX10-NEXT: v_perm_b32 v3, v3, v7, 0x7060302
+; GFX10-NEXT: v_cndmask_b32_e32 v3, v9, v14, vcc_lo
+; GFX10-NEXT: v_perm_b32 v3, v3, v8, 0x7060302
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: v_fadd_v8bf16:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_lshlrev_b32_e32 v15, 16, v0
-; GFX11-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GFX11-NEXT: v_lshlrev_b32_e32 v11, 16, v2
+; GFX11-NEXT: v_lshlrev_b32_e32 v10, 16, v2
; GFX11-NEXT: v_lshlrev_b32_e32 v8, 16, v7
; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX11-NEXT: v_lshlrev_b32_e32 v9, 16, v3
-; GFX11-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
-; GFX11-NEXT: s_mov_b32 s0, 0xff800000
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_dual_add_f32 v8, v9, v8 :: v_dual_lshlrev_b32 v9, 16, v6
-; GFX11-NEXT: v_bfe_u32 v10, v8, 16, 1
+; GFX11-NEXT: v_lshlrev_b32_e32 v15, 16, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_dual_add_f32 v8, v9, v8 :: v_dual_and_b32 v7, 0xffff0000, v7
+; GFX11-NEXT: v_lshlrev_b32_e32 v9, 16, v6
; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_add_f32_e32 v9, v11, v9
-; GFX11-NEXT: v_add3_u32 v10, v10, v8, 0x7fff
-; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_and_or_b32 v12, v9, s0, 0x400000
-; GFX11-NEXT: v_add_f32_e32 v2, v2, v6
-; GFX11-NEXT: v_lshlrev_b32_e32 v6, 16, v5
; GFX11-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_bfe_u32 v13, v2, 16, 1
+; GFX11-NEXT: v_bfe_u32 v11, v8, 16, 1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4)
; GFX11-NEXT: v_add_f32_e32 v3, v3, v7
-; GFX11-NEXT: v_and_or_b32 v7, v8, s0, 0x400000
-; GFX11-NEXT: v_bfe_u32 v8, v9, 16, 1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_cndmask_b32_e32 v7, v10, v7, vcc_lo
-; GFX11-NEXT: v_add3_u32 v8, v8, v9, 0x7fff
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9
-; GFX11-NEXT: v_add3_u32 v9, v13, v2, 0x7fff
-; GFX11-NEXT: v_lshlrev_b32_e32 v13, 16, v4
-; GFX11-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
+; GFX11-NEXT: v_add_f32_e32 v7, v10, v9
+; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v8
+; GFX11-NEXT: v_add3_u32 v10, v11, v8, 0x7fff
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX11-NEXT: v_bfe_u32 v11, v3, 16, 1
-; GFX11-NEXT: v_cndmask_b32_e32 v8, v8, v12, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
-; GFX11-NEXT: v_and_or_b32 v14, v3, s0, 0x400000
-; GFX11-NEXT: v_add_f32_e32 v0, v0, v4
-; GFX11-NEXT: v_add3_u32 v10, v11, v3, 0x7fff
-; GFX11-NEXT: v_lshlrev_b32_e32 v11, 16, v1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_dual_add_f32 v6, v11, v6 :: v_dual_and_b32 v1, 0xffff0000, v1
-; GFX11-NEXT: v_and_or_b32 v11, v2, s0, 0x400000
+; GFX11-NEXT: v_bfe_u32 v12, v7, 16, 1
+; GFX11-NEXT: v_or_b32_e32 v14, 0x400000, v3
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-NEXT: v_cndmask_b32_e32 v8, v10, v9, vcc_lo
+; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
+; GFX11-NEXT: v_add3_u32 v9, v11, v3, 0x7fff
+; GFX11-NEXT: v_add3_u32 v11, v12, v7, 0x7fff
+; GFX11-NEXT: v_or_b32_e32 v12, 0x400000, v7
+; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
+; GFX11-NEXT: v_lshlrev_b32_e32 v10, 16, v1
+; GFX11-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-NEXT: v_dual_cndmask_b32 v7, v11, v12 :: v_dual_add_f32 v2, v2, v6
+; GFX11-NEXT: v_lshlrev_b32_e32 v6, 16, v5
; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
+; GFX11-NEXT: v_bfe_u32 v13, v2, 16, 1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX11-NEXT: v_add_f32_e32 v6, v10, v6
+; GFX11-NEXT: v_or_b32_e32 v11, 0x400000, v2
+; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX11-NEXT: v_add3_u32 v10, v13, v2, 0x7fff
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_bfe_u32 v12, v6, 16, 1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3)
-; GFX11-NEXT: v_cndmask_b32_e32 v2, v9, v11, vcc_lo
-; GFX11-NEXT: v_and_or_b32 v9, v6, s0, 0x400000
+; GFX11-NEXT: v_cndmask_b32_e32 v2, v10, v11, vcc_lo
+; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v6
; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
-; GFX11-NEXT: v_add_f32_e32 v1, v1, v5
+; GFX11-NEXT: v_lshlrev_b32_e32 v13, 16, v4
+; GFX11-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
+; GFX11-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GFX11-NEXT: v_perm_b32 v2, v2, v7, 0x7060302
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_add_f32_e32 v0, v0, v4
; GFX11-NEXT: v_add3_u32 v4, v12, v6, 0x7fff
-; GFX11-NEXT: v_perm_b32 v2, v2, v8, 0x7060302
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_bfe_u32 v11, v1, 16, 1
-; GFX11-NEXT: v_cndmask_b32_e32 v4, v4, v9, vcc_lo
+; GFX11-NEXT: v_dual_add_f32 v1, v1, v5 :: v_dual_cndmask_b32 v4, v4, v10
; GFX11-NEXT: v_add_f32_e32 v5, v15, v13
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX11-NEXT: v_bfe_u32 v11, v1, 16, 1
; GFX11-NEXT: v_bfe_u32 v13, v0, 16, 1
-; GFX11-NEXT: v_and_or_b32 v15, v1, s0, 0x400000
-; GFX11-NEXT: v_add3_u32 v6, v11, v1, 0x7fff
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-NEXT: v_or_b32_e32 v15, 0x400000, v1
; GFX11-NEXT: v_bfe_u32 v12, v5, 16, 1
-; GFX11-NEXT: v_and_or_b32 v11, v5, s0, 0x400000
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX11-NEXT: v_add3_u32 v6, v11, v1, 0x7fff
+; GFX11-NEXT: v_or_b32_e32 v11, 0x400000, v5
; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX11-NEXT: v_add3_u32 v9, v12, v5, 0x7fff
+; GFX11-NEXT: v_add3_u32 v10, v12, v5, 0x7fff
; GFX11-NEXT: v_add3_u32 v12, v13, v0, 0x7fff
-; GFX11-NEXT: v_and_or_b32 v13, v0, s0, 0x400000
+; GFX11-NEXT: v_or_b32_e32 v13, 0x400000, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_cndmask_b32_e32 v5, v9, v11, vcc_lo
+; GFX11-NEXT: v_cndmask_b32_e32 v5, v10, v11, vcc_lo
; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
; GFX11-NEXT: v_cndmask_b32_e32 v0, v12, v13, vcc_lo
; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
@@ -10036,9 +9981,9 @@ define <8 x bfloat> @v_fadd_v8bf16(<8 x bfloat> %a, <8 x bfloat> %b) {
; GFX11-NEXT: v_cndmask_b32_e32 v1, v6, v15, vcc_lo
; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
; GFX11-NEXT: v_perm_b32 v1, v1, v4, 0x7060302
-; GFX11-NEXT: v_cndmask_b32_e32 v3, v10, v14, vcc_lo
+; GFX11-NEXT: v_cndmask_b32_e32 v3, v9, v14, vcc_lo
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_perm_b32 v3, v3, v7, 0x7060302
+; GFX11-NEXT: v_perm_b32 v3, v3, v8, 0x7060302
; GFX11-NEXT: s_setpc_b64 s[30:31]
%op = fadd <8 x bfloat> %a, %b
ret <8 x bfloat> %op
@@ -10263,16 +10208,14 @@ define <16 x bfloat> @v_fadd_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) {
; GFX8-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
; GFX8-NEXT: v_add_u32_e32 v17, vcc, s4, v17
-; GFX8-NEXT: v_and_b32_e32 v18, 0xff800000, v16
; GFX8-NEXT: v_add_f32_e32 v7, v7, v15
-; GFX8-NEXT: v_or_b32_e32 v18, 0x400000, v18
+; GFX8-NEXT: v_or_b32_e32 v18, 0x400000, v16
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v16, v16
; GFX8-NEXT: v_bfe_u32 v15, v7, 16, 1
; GFX8-NEXT: v_cndmask_b32_e32 v16, v17, v18, vcc
; GFX8-NEXT: v_add_u32_e32 v15, vcc, v15, v7
; GFX8-NEXT: v_add_u32_e32 v15, vcc, s4, v15
-; GFX8-NEXT: v_and_b32_e32 v17, 0xff800000, v7
-; GFX8-NEXT: v_or_b32_e32 v17, 0x400000, v17
+; GFX8-NEXT: v_or_b32_e32 v17, 0x400000, v7
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v7, v7
; GFX8-NEXT: v_cndmask_b32_e32 v7, v15, v17, vcc
; GFX8-NEXT: v_lshlrev_b32_e32 v15, 16, v14
@@ -10283,16 +10226,14 @@ define <16 x bfloat> @v_fadd_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) {
; GFX8-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
; GFX8-NEXT: v_add_u32_e32 v17, vcc, s4, v17
-; GFX8-NEXT: v_and_b32_e32 v18, 0xff800000, v15
; GFX8-NEXT: v_add_f32_e32 v6, v6, v14
-; GFX8-NEXT: v_or_b32_e32 v18, 0x400000, v18
+; GFX8-NEXT: v_or_b32_e32 v18, 0x400000, v15
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v15, v15
; GFX8-NEXT: v_bfe_u32 v14, v6, 16, 1
; GFX8-NEXT: v_cndmask_b32_e32 v15, v17, v18, vcc
; GFX8-NEXT: v_add_u32_e32 v14, vcc, v14, v6
; GFX8-NEXT: v_add_u32_e32 v14, vcc, s4, v14
-; GFX8-NEXT: v_and_b32_e32 v17, 0xff800000, v6
-; GFX8-NEXT: v_or_b32_e32 v17, 0x400000, v17
+; GFX8-NEXT: v_or_b32_e32 v17, 0x400000, v6
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
; GFX8-NEXT: v_cndmask_b32_e32 v6, v14, v17, vcc
; GFX8-NEXT: v_lshlrev_b32_e32 v14, 16, v13
@@ -10303,16 +10244,14 @@ define <16 x bfloat> @v_fadd_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) {
; GFX8-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
; GFX8-NEXT: v_add_u32_e32 v17, vcc, s4, v17
-; GFX8-NEXT: v_and_b32_e32 v18, 0xff800000, v14
; GFX8-NEXT: v_add_f32_e32 v5, v5, v13
-; GFX8-NEXT: v_or_b32_e32 v18, 0x400000, v18
+; GFX8-NEXT: v_or_b32_e32 v18, 0x400000, v14
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v14, v14
; GFX8-NEXT: v_bfe_u32 v13, v5, 16, 1
; GFX8-NEXT: v_cndmask_b32_e32 v14, v17, v18, vcc
; GFX8-NEXT: v_add_u32_e32 v13, vcc, v13, v5
; GFX8-NEXT: v_add_u32_e32 v13, vcc, s4, v13
-; GFX8-NEXT: v_and_b32_e32 v17, 0xff800000, v5
-; GFX8-NEXT: v_or_b32_e32 v17, 0x400000, v17
+; GFX8-NEXT: v_or_b32_e32 v17, 0x400000, v5
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
; GFX8-NEXT: v_cndmask_b32_e32 v5, v13, v17, vcc
; GFX8-NEXT: v_lshlrev_b32_e32 v13, 16, v12
@@ -10323,16 +10262,14 @@ define <16 x bfloat> @v_fadd_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) {
; GFX8-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
; GFX8-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
; GFX8-NEXT: v_add_u32_e32 v17, vcc, s4, v17
-; GFX8-NEXT: v_and_b32_e32 v18, 0xff800000, v13
; GFX8-NEXT: v_add_f32_e32 v4, v4, v12
-; GFX8-NEXT: v_or_b32_e32 v18, 0x400000, v18
+; GFX8-NEXT: v_or_b32_e32 v18, 0x400000, v13
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v13, v13
; GFX8-NEXT: v_bfe_u32 v12, v4, 16, 1
; GFX8-NEXT: v_cndmask_b32_e32 v13, v17, v18, vcc
; GFX8-NEXT: v_add_u32_e32 v12, vcc, v12, v4
; GFX8-NEXT: v_add_u32_e32 v12, vcc, s4, v12
-; GFX8-NEXT: v_and_b32_e32 v17, 0xff800000, v4
-; GFX8-NEXT: v_or_b32_e32 v17, 0x400000, v17
+; GFX8-NEXT: v_or_b32_e32 v17, 0x400000, v4
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
; GFX8-NEXT: v_cndmask_b32_e32 v4, v12, v17, vcc
; GFX8-NEXT: v_lshlrev_b32_e32 v12, 16, v11
@@ -10343,16 +10280,14 @@ define <16 x bfloat> @v_fadd_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) {
; GFX8-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
; GFX8-NEXT: v_add_u32_e32 v17, vcc, s4, v17
-; GFX8-NEXT: v_and_b32_e32 v18, 0xff800000, v12
; GFX8-NEXT: v_add_f32_e32 v3, v3, v11
-; GFX8-NEXT: v_or_b32_e32 v18, 0x400000, v18
+; GFX8-NEXT: v_or_b32_e32 v18, 0x400000, v12
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v12, v12
; GFX8-NEXT: v_bfe_u32 v11, v3, 16, 1
; GFX8-NEXT: v_cndmask_b32_e32 v12, v17, v18, vcc
; GFX8-NEXT: v_add_u32_e32 v11, vcc, v11, v3
; GFX8-NEXT: v_add_u32_e32 v11, vcc, s4, v11
-; GFX8-NEXT: v_and_b32_e32 v17, 0xff800000, v3
-; GFX8-NEXT: v_or_b32_e32 v17, 0x400000, v17
+; GFX8-NEXT: v_or_b32_e32 v17, 0x400000, v3
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
; GFX8-NEXT: v_cndmask_b32_e32 v3, v11, v17, vcc
; GFX8-NEXT: v_lshlrev_b32_e32 v11, 16, v10
@@ -10363,16 +10298,14 @@ define <16 x bfloat> @v_fadd_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) {
; GFX8-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX8-NEXT: v_add_u32_e32 v17, vcc, s4, v17
-; GFX8-NEXT: v_and_b32_e32 v18, 0xff800000, v11
; GFX8-NEXT: v_add_f32_e32 v2, v2, v10
-; GFX8-NEXT: v_or_b32_e32 v18, 0x400000, v18
+; GFX8-NEXT: v_or_b32_e32 v18, 0x400000, v11
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v11, v11
; GFX8-NEXT: v_bfe_u32 v10, v2, 16, 1
; GFX8-NEXT: v_cndmask_b32_e32 v11, v17, v18, vcc
; GFX8-NEXT: v_add_u32_e32 v10, vcc, v10, v2
; GFX8-NEXT: v_add_u32_e32 v10, vcc, s4, v10
-; GFX8-NEXT: v_and_b32_e32 v17, 0xff800000, v2
-; GFX8-NEXT: v_or_b32_e32 v17, 0x400000, v17
+; GFX8-NEXT: v_or_b32_e32 v17, 0x400000, v2
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
; GFX8-NEXT: v_cndmask_b32_e32 v2, v10, v17, vcc
; GFX8-NEXT: v_lshlrev_b32_e32 v10, 16, v9
@@ -10383,16 +10316,14 @@ define <16 x bfloat> @v_fadd_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) {
; GFX8-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX8-NEXT: v_add_u32_e32 v17, vcc, s4, v17
-; GFX8-NEXT: v_and_b32_e32 v18, 0xff800000, v10
; GFX8-NEXT: v_add_f32_e32 v1, v1, v9
-; GFX8-NEXT: v_or_b32_e32 v18, 0x400000, v18
+; GFX8-NEXT: v_or_b32_e32 v18, 0x400000, v10
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v10, v10
; GFX8-NEXT: v_bfe_u32 v9, v1, 16, 1
; GFX8-NEXT: v_cndmask_b32_e32 v10, v17, v18, vcc
; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v1
; GFX8-NEXT: v_add_u32_e32 v9, vcc, s4, v9
-; GFX8-NEXT: v_and_b32_e32 v17, 0xff800000, v1
-; GFX8-NEXT: v_or_b32_e32 v17, 0x400000, v17
+; GFX8-NEXT: v_or_b32_e32 v17, 0x400000, v1
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
; GFX8-NEXT: v_cndmask_b32_e32 v1, v9, v17, vcc
; GFX8-NEXT: v_lshlrev_b32_e32 v9, 16, v8
@@ -10403,16 +10334,14 @@ define <16 x bfloat> @v_fadd_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) {
; GFX8-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX8-NEXT: v_add_u32_e32 v17, vcc, s4, v17
-; GFX8-NEXT: v_and_b32_e32 v18, 0xff800000, v9
; GFX8-NEXT: v_add_f32_e32 v0, v0, v8
-; GFX8-NEXT: v_or_b32_e32 v18, 0x400000, v18
+; GFX8-NEXT: v_or_b32_e32 v18, 0x400000, v9
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v9, v9
; GFX8-NEXT: v_bfe_u32 v8, v0, 16, 1
; GFX8-NEXT: v_cndmask_b32_e32 v9, v17, v18, vcc
; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v0
; GFX8-NEXT: v_add_u32_e32 v8, vcc, s4, v8
-; GFX8-NEXT: v_and_b32_e32 v17, 0xff800000, v0
-; GFX8-NEXT: v_or_b32_e32 v17, 0x400000, v17
+; GFX8-NEXT: v_or_b32_e32 v17, 0x400000, v0
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
; GFX8-NEXT: v_cndmask_b32_e32 v0, v8, v17, vcc
; GFX8-NEXT: v_lshrrev_b32_e32 v7, 16, v7
@@ -10439,146 +10368,130 @@ define <16 x bfloat> @v_fadd_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) {
; GFX9-NEXT: v_lshlrev_b32_e32 v16, 16, v15
; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v7
; GFX9-NEXT: v_add_f32_e32 v16, v17, v16
-; GFX9-NEXT: v_bfe_u32 v17, v16, 16, 1
-; GFX9-NEXT: s_movk_i32 s4, 0x7fff
-; GFX9-NEXT: v_and_b32_e32 v18, 0xff800000, v16
; GFX9-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
; GFX9-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
+; GFX9-NEXT: v_bfe_u32 v17, v16, 16, 1
+; GFX9-NEXT: s_movk_i32 s4, 0x7fff
+; GFX9-NEXT: v_add_f32_e32 v7, v7, v15
; GFX9-NEXT: v_add3_u32 v17, v17, v16, s4
-; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v18
+; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v16
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v16, v16
-; GFX9-NEXT: v_add_f32_e32 v7, v7, v15
-; GFX9-NEXT: v_cndmask_b32_e32 v16, v17, v18, vcc
; GFX9-NEXT: v_bfe_u32 v15, v7, 16, 1
-; GFX9-NEXT: v_and_b32_e32 v17, 0xff800000, v7
+; GFX9-NEXT: v_cndmask_b32_e32 v16, v17, v18, vcc
; GFX9-NEXT: v_add3_u32 v15, v15, v7, s4
-; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v17
+; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v7
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v7, v7
; GFX9-NEXT: v_cndmask_b32_e32 v7, v15, v17, vcc
; GFX9-NEXT: v_lshlrev_b32_e32 v15, 16, v14
; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v6
; GFX9-NEXT: v_add_f32_e32 v15, v17, v15
-; GFX9-NEXT: v_bfe_u32 v17, v15, 16, 1
-; GFX9-NEXT: v_and_b32_e32 v18, 0xff800000, v15
; GFX9-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
; GFX9-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
+; GFX9-NEXT: v_bfe_u32 v17, v15, 16, 1
+; GFX9-NEXT: v_add_f32_e32 v6, v6, v14
; GFX9-NEXT: v_add3_u32 v17, v17, v15, s4
-; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v18
+; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v15
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v15, v15
-; GFX9-NEXT: v_add_f32_e32 v6, v6, v14
-; GFX9-NEXT: v_cndmask_b32_e32 v15, v17, v18, vcc
; GFX9-NEXT: v_bfe_u32 v14, v6, 16, 1
-; GFX9-NEXT: v_and_b32_e32 v17, 0xff800000, v6
+; GFX9-NEXT: v_cndmask_b32_e32 v15, v17, v18, vcc
; GFX9-NEXT: v_add3_u32 v14, v14, v6, s4
-; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v17
+; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v6
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
; GFX9-NEXT: v_cndmask_b32_e32 v6, v14, v17, vcc
; GFX9-NEXT: v_lshlrev_b32_e32 v14, 16, v13
; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v5
; GFX9-NEXT: v_add_f32_e32 v14, v17, v14
-; GFX9-NEXT: v_bfe_u32 v17, v14, 16, 1
-; GFX9-NEXT: v_and_b32_e32 v18, 0xff800000, v14
; GFX9-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
; GFX9-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
+; GFX9-NEXT: v_bfe_u32 v17, v14, 16, 1
+; GFX9-NEXT: v_add_f32_e32 v5, v5, v13
; GFX9-NEXT: v_add3_u32 v17, v17, v14, s4
-; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v18
+; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v14
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v14, v14
-; GFX9-NEXT: v_add_f32_e32 v5, v5, v13
-; GFX9-NEXT: v_cndmask_b32_e32 v14, v17, v18, vcc
; GFX9-NEXT: v_bfe_u32 v13, v5, 16, 1
-; GFX9-NEXT: v_and_b32_e32 v17, 0xff800000, v5
+; GFX9-NEXT: v_cndmask_b32_e32 v14, v17, v18, vcc
; GFX9-NEXT: v_add3_u32 v13, v13, v5, s4
-; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v17
+; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v5
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
; GFX9-NEXT: v_cndmask_b32_e32 v5, v13, v17, vcc
; GFX9-NEXT: v_lshlrev_b32_e32 v13, 16, v12
; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v4
; GFX9-NEXT: v_add_f32_e32 v13, v17, v13
-; GFX9-NEXT: v_bfe_u32 v17, v13, 16, 1
-; GFX9-NEXT: v_and_b32_e32 v18, 0xff800000, v13
; GFX9-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
; GFX9-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
+; GFX9-NEXT: v_bfe_u32 v17, v13, 16, 1
+; GFX9-NEXT: v_add_f32_e32 v4, v4, v12
; GFX9-NEXT: v_add3_u32 v17, v17, v13, s4
-; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v18
+; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v13
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v13, v13
-; GFX9-NEXT: v_add_f32_e32 v4, v4, v12
-; GFX9-NEXT: v_cndmask_b32_e32 v13, v17, v18, vcc
; GFX9-NEXT: v_bfe_u32 v12, v4, 16, 1
-; GFX9-NEXT: v_and_b32_e32 v17, 0xff800000, v4
+; GFX9-NEXT: v_cndmask_b32_e32 v13, v17, v18, vcc
; GFX9-NEXT: v_add3_u32 v12, v12, v4, s4
-; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v17
+; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v4
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
; GFX9-NEXT: v_cndmask_b32_e32 v4, v12, v17, vcc
; GFX9-NEXT: v_lshlrev_b32_e32 v12, 16, v11
; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v3
; GFX9-NEXT: v_add_f32_e32 v12, v17, v12
-; GFX9-NEXT: v_bfe_u32 v17, v12, 16, 1
-; GFX9-NEXT: v_and_b32_e32 v18, 0xff800000, v12
; GFX9-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; GFX9-NEXT: v_bfe_u32 v17, v12, 16, 1
+; GFX9-NEXT: v_add_f32_e32 v3, v3, v11
; GFX9-NEXT: v_add3_u32 v17, v17, v12, s4
-; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v18
+; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v12
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v12, v12
-; GFX9-NEXT: v_add_f32_e32 v3, v3, v11
-; GFX9-NEXT: v_cndmask_b32_e32 v12, v17, v18, vcc
; GFX9-NEXT: v_bfe_u32 v11, v3, 16, 1
-; GFX9-NEXT: v_and_b32_e32 v17, 0xff800000, v3
+; GFX9-NEXT: v_cndmask_b32_e32 v12, v17, v18, vcc
; GFX9-NEXT: v_add3_u32 v11, v11, v3, s4
-; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v17
+; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v3
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
; GFX9-NEXT: v_cndmask_b32_e32 v3, v11, v17, vcc
; GFX9-NEXT: v_lshlrev_b32_e32 v11, 16, v10
; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v2
; GFX9-NEXT: v_add_f32_e32 v11, v17, v11
-; GFX9-NEXT: v_bfe_u32 v17, v11, 16, 1
-; GFX9-NEXT: v_and_b32_e32 v18, 0xff800000, v11
; GFX9-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX9-NEXT: v_bfe_u32 v17, v11, 16, 1
+; GFX9-NEXT: v_add_f32_e32 v2, v2, v10
; GFX9-NEXT: v_add3_u32 v17, v17, v11, s4
-; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v18
+; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v11
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v11, v11
-; GFX9-NEXT: v_add_f32_e32 v2, v2, v10
-; GFX9-NEXT: v_cndmask_b32_e32 v11, v17, v18, vcc
; GFX9-NEXT: v_bfe_u32 v10, v2, 16, 1
-; GFX9-NEXT: v_and_b32_e32 v17, 0xff800000, v2
+; GFX9-NEXT: v_cndmask_b32_e32 v11, v17, v18, vcc
; GFX9-NEXT: v_add3_u32 v10, v10, v2, s4
-; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v17
+; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v2
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
; GFX9-NEXT: v_cndmask_b32_e32 v2, v10, v17, vcc
; GFX9-NEXT: v_lshlrev_b32_e32 v10, 16, v9
; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v1
; GFX9-NEXT: v_add_f32_e32 v10, v17, v10
-; GFX9-NEXT: v_bfe_u32 v17, v10, 16, 1
-; GFX9-NEXT: v_and_b32_e32 v18, 0xff800000, v10
; GFX9-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GFX9-NEXT: v_bfe_u32 v17, v10, 16, 1
+; GFX9-NEXT: v_add_f32_e32 v1, v1, v9
; GFX9-NEXT: v_add3_u32 v17, v17, v10, s4
-; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v18
+; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v10
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v10, v10
-; GFX9-NEXT: v_add_f32_e32 v1, v1, v9
-; GFX9-NEXT: v_cndmask_b32_e32 v10, v17, v18, vcc
; GFX9-NEXT: v_bfe_u32 v9, v1, 16, 1
-; GFX9-NEXT: v_and_b32_e32 v17, 0xff800000, v1
+; GFX9-NEXT: v_cndmask_b32_e32 v10, v17, v18, vcc
; GFX9-NEXT: v_add3_u32 v9, v9, v1, s4
-; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v17
+; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v1
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
; GFX9-NEXT: v_cndmask_b32_e32 v1, v9, v17, vcc
; GFX9-NEXT: v_lshlrev_b32_e32 v9, 16, v8
; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v0
; GFX9-NEXT: v_add_f32_e32 v9, v17, v9
-; GFX9-NEXT: v_bfe_u32 v17, v9, 16, 1
-; GFX9-NEXT: v_and_b32_e32 v18, 0xff800000, v9
; GFX9-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX9-NEXT: v_bfe_u32 v17, v9, 16, 1
+; GFX9-NEXT: v_add_f32_e32 v0, v0, v8
; GFX9-NEXT: v_add3_u32 v17, v17, v9, s4
-; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v18
+; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v9
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v9, v9
-; GFX9-NEXT: v_add_f32_e32 v0, v0, v8
-; GFX9-NEXT: v_cndmask_b32_e32 v9, v17, v18, vcc
; GFX9-NEXT: v_bfe_u32 v8, v0, 16, 1
-; GFX9-NEXT: v_and_b32_e32 v17, 0xff800000, v0
+; GFX9-NEXT: v_cndmask_b32_e32 v9, v17, v18, vcc
; GFX9-NEXT: v_add3_u32 v8, v8, v0, s4
-; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v17
+; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v0
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
; GFX9-NEXT: v_cndmask_b32_e32 v0, v8, v17, vcc
; GFX9-NEXT: s_mov_b32 s4, 0x7060302
@@ -10599,27 +10512,26 @@ define <16 x bfloat> @v_fadd_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) {
; GFX10-NEXT: v_lshlrev_b32_e32 v17, 16, v7
; GFX10-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
-; GFX10-NEXT: s_mov_b32 s4, 0xff800000
; GFX10-NEXT: v_lshlrev_b32_e32 v18, 16, v6
+; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
; GFX10-NEXT: v_add_f32_e32 v16, v17, v16
; GFX10-NEXT: v_lshlrev_b32_e32 v17, 16, v14
; GFX10-NEXT: v_add_f32_e32 v7, v7, v15
; GFX10-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
-; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
; GFX10-NEXT: v_bfe_u32 v15, v16, 16, 1
-; GFX10-NEXT: v_and_or_b32 v20, v16, s4, 0x400000
+; GFX10-NEXT: v_or_b32_e32 v20, 0x400000, v16
; GFX10-NEXT: v_bfe_u32 v19, v7, 16, 1
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v16, v16
; GFX10-NEXT: v_add_f32_e32 v17, v18, v17
; GFX10-NEXT: v_add3_u32 v15, v15, v16, 0x7fff
; GFX10-NEXT: v_add_f32_e32 v6, v6, v14
; GFX10-NEXT: v_add3_u32 v18, v19, v7, 0x7fff
-; GFX10-NEXT: v_and_or_b32 v19, v7, s4, 0x400000
+; GFX10-NEXT: v_or_b32_e32 v19, 0x400000, v7
; GFX10-NEXT: v_bfe_u32 v21, v17, 16, 1
; GFX10-NEXT: v_cndmask_b32_e32 v15, v15, v20, vcc_lo
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
; GFX10-NEXT: v_lshlrev_b32_e32 v20, 16, v5
-; GFX10-NEXT: v_and_or_b32 v16, v17, s4, 0x400000
+; GFX10-NEXT: v_or_b32_e32 v16, 0x400000, v17
; GFX10-NEXT: v_add3_u32 v14, v21, v17, 0x7fff
; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
; GFX10-NEXT: v_cndmask_b32_e32 v7, v18, v19, vcc_lo
@@ -10633,7 +10545,7 @@ define <16 x bfloat> @v_fadd_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) {
; GFX10-NEXT: v_add_f32_e32 v5, v5, v13
; GFX10-NEXT: v_cndmask_b32_e32 v14, v14, v16, vcc_lo
; GFX10-NEXT: v_add3_u32 v16, v18, v6, 0x7fff
-; GFX10-NEXT: v_and_or_b32 v13, v6, s4, 0x400000
+; GFX10-NEXT: v_or_b32_e32 v13, 0x400000, v6
; GFX10-NEXT: v_lshlrev_b32_e32 v18, 16, v12
; GFX10-NEXT: v_bfe_u32 v20, v17, 16, 1
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
@@ -10643,10 +10555,10 @@ define <16 x bfloat> @v_fadd_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) {
; GFX10-NEXT: v_cndmask_b32_e32 v6, v16, v13, vcc_lo
; GFX10-NEXT: v_add_f32_e32 v13, v19, v18
; GFX10-NEXT: v_add3_u32 v16, v20, v17, 0x7fff
-; GFX10-NEXT: v_and_or_b32 v18, v17, s4, 0x400000
+; GFX10-NEXT: v_or_b32_e32 v18, 0x400000, v17
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17
; GFX10-NEXT: v_add3_u32 v19, v21, v5, 0x7fff
-; GFX10-NEXT: v_and_or_b32 v20, v5, s4, 0x400000
+; GFX10-NEXT: v_or_b32_e32 v20, 0x400000, v5
; GFX10-NEXT: v_bfe_u32 v21, v13, 16, 1
; GFX10-NEXT: v_add_f32_e32 v4, v4, v12
; GFX10-NEXT: v_cndmask_b32_e32 v16, v16, v18, vcc_lo
@@ -10656,14 +10568,14 @@ define <16 x bfloat> @v_fadd_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) {
; GFX10-NEXT: v_add3_u32 v17, v21, v13, 0x7fff
; GFX10-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
; GFX10-NEXT: v_cndmask_b32_e32 v5, v19, v20, vcc_lo
-; GFX10-NEXT: v_and_or_b32 v19, v13, s4, 0x400000
+; GFX10-NEXT: v_or_b32_e32 v19, 0x400000, v13
; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
; GFX10-NEXT: v_add_f32_e32 v12, v18, v12
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v13, v13
; GFX10-NEXT: v_bfe_u32 v20, v4, 16, 1
; GFX10-NEXT: v_lshlrev_b32_e32 v18, 16, v10
; GFX10-NEXT: v_add_f32_e32 v3, v3, v11
-; GFX10-NEXT: v_and_or_b32 v22, v12, s4, 0x400000
+; GFX10-NEXT: v_or_b32_e32 v22, 0x400000, v12
; GFX10-NEXT: v_cndmask_b32_e32 v13, v17, v19, vcc_lo
; GFX10-NEXT: v_bfe_u32 v17, v12, 16, 1
; GFX10-NEXT: v_lshlrev_b32_e32 v19, 16, v2
@@ -10675,12 +10587,12 @@ define <16 x bfloat> @v_fadd_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) {
; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v12, v12
; GFX10-NEXT: v_add3_u32 v19, v20, v3, 0x7fff
-; GFX10-NEXT: v_and_or_b32 v20, v3, s4, 0x400000
+; GFX10-NEXT: v_or_b32_e32 v20, 0x400000, v3
; GFX10-NEXT: v_bfe_u32 v23, v18, 16, 1
; GFX10-NEXT: v_add_f32_e32 v2, v2, v10
; GFX10-NEXT: v_cndmask_b32_e32 v12, v17, v22, vcc_lo
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
-; GFX10-NEXT: v_and_or_b32 v17, v18, s4, 0x400000
+; GFX10-NEXT: v_or_b32_e32 v17, 0x400000, v18
; GFX10-NEXT: v_add3_u32 v10, v23, v18, 0x7fff
; GFX10-NEXT: v_lshlrev_b32_e32 v22, 16, v1
; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
@@ -10689,8 +10601,8 @@ define <16 x bfloat> @v_fadd_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) {
; GFX10-NEXT: v_lshlrev_b32_e32 v20, 16, v9
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v18, v18
; GFX10-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
-; GFX10-NEXT: v_and_or_b32 v18, v2, s4, 0x400000
-; GFX10-NEXT: v_and_or_b32 v21, v4, s4, 0x400000
+; GFX10-NEXT: v_or_b32_e32 v18, 0x400000, v2
+; GFX10-NEXT: v_or_b32_e32 v21, 0x400000, v4
; GFX10-NEXT: v_perm_b32 v3, v3, v12, 0x7060302
; GFX10-NEXT: v_cndmask_b32_e32 v10, v10, v17, vcc_lo
; GFX10-NEXT: v_add3_u32 v17, v19, v2, 0x7fff
@@ -10702,17 +10614,17 @@ define <16 x bfloat> @v_fadd_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) {
; GFX10-NEXT: v_bfe_u32 v23, v19, 16, 1
; GFX10-NEXT: v_add_f32_e32 v1, v1, v9
; GFX10-NEXT: v_add_f32_e32 v9, v22, v20
-; GFX10-NEXT: v_and_or_b32 v22, v19, s4, 0x400000
+; GFX10-NEXT: v_or_b32_e32 v22, 0x400000, v19
; GFX10-NEXT: v_add_f32_e32 v0, v0, v8
; GFX10-NEXT: v_add3_u32 v20, v23, v19, 0x7fff
; GFX10-NEXT: v_bfe_u32 v8, v1, 16, 1
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v19, v19
; GFX10-NEXT: v_bfe_u32 v23, v9, 16, 1
-; GFX10-NEXT: v_and_or_b32 v24, v9, s4, 0x400000
-; GFX10-NEXT: v_and_or_b32 v25, v0, s4, 0x400000
+; GFX10-NEXT: v_or_b32_e32 v24, 0x400000, v9
+; GFX10-NEXT: v_or_b32_e32 v25, 0x400000, v0
; GFX10-NEXT: v_add3_u32 v8, v8, v1, 0x7fff
; GFX10-NEXT: v_cndmask_b32_e32 v19, v20, v22, vcc_lo
-; GFX10-NEXT: v_and_or_b32 v22, v1, s4, 0x400000
+; GFX10-NEXT: v_or_b32_e32 v22, 0x400000, v1
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
; GFX10-NEXT: v_bfe_u32 v20, v0, 16, 1
; GFX10-NEXT: v_add3_u32 v23, v23, v9, 0x7fff
@@ -10741,12 +10653,11 @@ define <16 x bfloat> @v_fadd_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) {
; GFX11-NEXT: v_lshlrev_b32_e32 v16, 16, v15
; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
; GFX11-NEXT: v_lshlrev_b32_e32 v17, 16, v7
-; GFX11-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
-; GFX11-NEXT: s_mov_b32 s0, 0xff800000
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_dual_add_f32 v16, v17, v16 :: v_dual_lshlrev_b32 v17, 16, v14
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-NEXT: v_dual_add_f32 v16, v17, v16 :: v_dual_and_b32 v15, 0xffff0000, v15
+; GFX11-NEXT: v_lshlrev_b32_e32 v17, 16, v14
; GFX11-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
-; GFX11-NEXT: v_and_or_b32 v20, v16, s0, 0x400000
+; GFX11-NEXT: v_or_b32_e32 v20, 0x400000, v16
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX11-NEXT: v_add_f32_e32 v17, v18, v17
; GFX11-NEXT: v_add_f32_e32 v6, v6, v14
@@ -10759,13 +10670,13 @@ define <16 x bfloat> @v_fadd_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) {
; GFX11-NEXT: v_add_f32_e32 v7, v7, v15
; GFX11-NEXT: v_bfe_u32 v15, v16, 16, 1
; GFX11-NEXT: v_add3_u32 v15, v15, v16, 0x7fff
-; GFX11-NEXT: v_and_or_b32 v16, v17, s0, 0x400000
+; GFX11-NEXT: v_or_b32_e32 v16, 0x400000, v17
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_dual_cndmask_b32 v15, v15, v20 :: v_dual_lshlrev_b32 v20, 16, v5
; GFX11-NEXT: v_bfe_u32 v19, v7, 16, 1
; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
; GFX11-NEXT: v_add3_u32 v18, v19, v7, 0x7fff
-; GFX11-NEXT: v_and_or_b32 v19, v7, s0, 0x400000
+; GFX11-NEXT: v_or_b32_e32 v19, 0x400000, v7
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_4)
; GFX11-NEXT: v_cndmask_b32_e32 v7, v18, v19, vcc_lo
; GFX11-NEXT: v_bfe_u32 v18, v6, 16, 1
@@ -10787,32 +10698,32 @@ define <16 x bfloat> @v_fadd_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) {
; GFX11-NEXT: v_lshlrev_b32_e32 v12, 16, v11
; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
; GFX11-NEXT: v_add_f32_e32 v5, v5, v13
-; GFX11-NEXT: v_and_or_b32 v13, v6, s0, 0x400000
+; GFX11-NEXT: v_or_b32_e32 v13, 0x400000, v6
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_4)
; GFX11-NEXT: v_dual_cndmask_b32 v6, v16, v13 :: v_dual_add_f32 v13, v19, v18
; GFX11-NEXT: v_add3_u32 v16, v20, v17, 0x7fff
-; GFX11-NEXT: v_and_or_b32 v18, v17, s0, 0x400000
+; GFX11-NEXT: v_or_b32_e32 v18, 0x400000, v17
; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17
; GFX11-NEXT: v_perm_b32 v6, v6, v14, 0x7060302
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_4)
; GFX11-NEXT: v_cndmask_b32_e32 v16, v16, v18, vcc_lo
; GFX11-NEXT: v_lshlrev_b32_e32 v18, 16, v3
; GFX11-NEXT: v_bfe_u32 v21, v5, 16, 1
-; GFX11-NEXT: v_and_or_b32 v20, v5, s0, 0x400000
+; GFX11-NEXT: v_or_b32_e32 v20, 0x400000, v5
; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
; GFX11-NEXT: v_add_f32_e32 v12, v18, v12
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4)
; GFX11-NEXT: v_add3_u32 v19, v21, v5, 0x7fff
; GFX11-NEXT: v_bfe_u32 v21, v13, 16, 1
; GFX11-NEXT: v_lshlrev_b32_e32 v18, 16, v10
-; GFX11-NEXT: v_and_or_b32 v22, v12, s0, 0x400000
+; GFX11-NEXT: v_or_b32_e32 v22, 0x400000, v12
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX11-NEXT: v_cndmask_b32_e32 v5, v19, v20, vcc_lo
; GFX11-NEXT: v_add3_u32 v17, v21, v13, 0x7fff
-; GFX11-NEXT: v_and_or_b32 v19, v13, s0, 0x400000
+; GFX11-NEXT: v_or_b32_e32 v19, 0x400000, v13
; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v13, v13
; GFX11-NEXT: v_bfe_u32 v20, v4, 16, 1
-; GFX11-NEXT: v_and_or_b32 v21, v4, s0, 0x400000
+; GFX11-NEXT: v_or_b32_e32 v21, 0x400000, v4
; GFX11-NEXT: v_perm_b32 v5, v5, v16, 0x7060302
; GFX11-NEXT: v_cndmask_b32_e32 v13, v17, v19, vcc_lo
; GFX11-NEXT: v_bfe_u32 v17, v12, 16, 1
@@ -10828,7 +10739,7 @@ define <16 x bfloat> @v_fadd_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) {
; GFX11-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
; GFX11-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
; GFX11-NEXT: v_bfe_u32 v23, v18, 16, 1
-; GFX11-NEXT: v_and_or_b32 v17, v18, s0, 0x400000
+; GFX11-NEXT: v_or_b32_e32 v17, 0x400000, v18
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX11-NEXT: v_dual_add_f32 v2, v2, v10 :: v_dual_and_b32 v1, 0xffff0000, v1
; GFX11-NEXT: v_add_f32_e32 v3, v3, v11
@@ -10838,13 +10749,13 @@ define <16 x bfloat> @v_fadd_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) {
; GFX11-NEXT: v_bfe_u32 v20, v3, 16, 1
; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
; GFX11-NEXT: v_add3_u32 v19, v20, v3, 0x7fff
-; GFX11-NEXT: v_and_or_b32 v20, v3, s0, 0x400000
+; GFX11-NEXT: v_or_b32_e32 v20, 0x400000, v3
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_cndmask_b32_e32 v3, v19, v20, vcc_lo
; GFX11-NEXT: v_bfe_u32 v19, v2, 16, 1
; GFX11-NEXT: v_lshlrev_b32_e32 v20, 16, v9
; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v18, v18
-; GFX11-NEXT: v_and_or_b32 v18, v2, s0, 0x400000
+; GFX11-NEXT: v_or_b32_e32 v18, 0x400000, v2
; GFX11-NEXT: v_perm_b32 v3, v3, v12, 0x7060302
; GFX11-NEXT: v_cndmask_b32_e32 v10, v10, v17, vcc_lo
; GFX11-NEXT: v_add3_u32 v17, v19, v2, 0x7fff
@@ -10861,13 +10772,13 @@ define <16 x bfloat> @v_fadd_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) {
; GFX11-NEXT: v_add_f32_e32 v9, v22, v20
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
; GFX11-NEXT: v_add3_u32 v20, v23, v19, 0x7fff
-; GFX11-NEXT: v_and_or_b32 v22, v19, s0, 0x400000
-; GFX11-NEXT: v_and_or_b32 v25, v0, s0, 0x400000
+; GFX11-NEXT: v_or_b32_e32 v22, 0x400000, v19
+; GFX11-NEXT: v_or_b32_e32 v25, 0x400000, v0
; GFX11-NEXT: v_bfe_u32 v8, v1, 16, 1
; GFX11-NEXT: v_bfe_u32 v23, v9, 16, 1
-; GFX11-NEXT: v_and_or_b32 v24, v9, s0, 0x400000
+; GFX11-NEXT: v_or_b32_e32 v24, 0x400000, v9
; GFX11-NEXT: v_cndmask_b32_e32 v19, v20, v22, vcc_lo
-; GFX11-NEXT: v_and_or_b32 v22, v1, s0, 0x400000
+; GFX11-NEXT: v_or_b32_e32 v22, 0x400000, v1
; GFX11-NEXT: v_add3_u32 v8, v8, v1, 0x7fff
; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
; GFX11-NEXT: v_bfe_u32 v20, v0, 16, 1
@@ -11434,16 +11345,14 @@ define <32 x bfloat> @v_fadd_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) {
; GFX8-NEXT: v_and_b32_e32 v30, 0xffff0000, v30
; GFX8-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
; GFX8-NEXT: v_add_u32_e32 v32, vcc, s4, v32
-; GFX8-NEXT: v_and_b32_e32 v33, 0xff800000, v31
; GFX8-NEXT: v_add_f32_e32 v14, v14, v30
-; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v33
+; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v31
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v31, v31
; GFX8-NEXT: v_bfe_u32 v30, v14, 16, 1
; GFX8-NEXT: v_cndmask_b32_e32 v31, v32, v33, vcc
; GFX8-NEXT: v_add_u32_e32 v30, vcc, v30, v14
; GFX8-NEXT: v_add_u32_e32 v30, vcc, s4, v30
-; GFX8-NEXT: v_and_b32_e32 v32, 0xff800000, v14
-; GFX8-NEXT: v_or_b32_e32 v32, 0x400000, v32
+; GFX8-NEXT: v_or_b32_e32 v32, 0x400000, v14
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v14, v14
; GFX8-NEXT: v_cndmask_b32_e32 v14, v30, v32, vcc
; GFX8-NEXT: v_lshlrev_b32_e32 v30, 16, v29
@@ -11465,29 +11374,25 @@ define <32 x bfloat> @v_fadd_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) {
; GFX8-NEXT: v_add_f32_e32 v30, v15, v30
; GFX8-NEXT: v_bfe_u32 v15, v33, 16, 1
; GFX8-NEXT: v_add_u32_e32 v15, vcc, v15, v33
-; GFX8-NEXT: v_and_b32_e32 v34, 0xff800000, v33
; GFX8-NEXT: v_add_u32_e32 v15, vcc, s4, v15
-; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v34
+; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v33
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v33, v33
; GFX8-NEXT: v_bfe_u32 v33, v30, 16, 1
; GFX8-NEXT: v_cndmask_b32_e32 v15, v15, v34, vcc
; GFX8-NEXT: v_add_u32_e32 v33, vcc, v33, v30
-; GFX8-NEXT: v_and_b32_e32 v34, 0xff800000, v30
; GFX8-NEXT: v_add_u32_e32 v33, vcc, s4, v33
-; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v34
+; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v30
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v30, v30
; GFX8-NEXT: v_cndmask_b32_e32 v30, v33, v34, vcc
; GFX8-NEXT: v_bfe_u32 v33, v32, 16, 1
; GFX8-NEXT: v_add_u32_e32 v33, vcc, v33, v32
; GFX8-NEXT: v_add_u32_e32 v33, vcc, s4, v33
-; GFX8-NEXT: v_and_b32_e32 v34, 0xff800000, v32
-; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v34
+; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v32
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v32, v32
; GFX8-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc
; GFX8-NEXT: v_add_u32_e32 v29, vcc, v29, v13
; GFX8-NEXT: v_add_u32_e32 v29, vcc, s4, v29
-; GFX8-NEXT: v_and_b32_e32 v33, 0xff800000, v13
-; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v33
+; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v13
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v13, v13
; GFX8-NEXT: v_cndmask_b32_e32 v13, v29, v33, vcc
; GFX8-NEXT: v_lshlrev_b32_e32 v29, 16, v28
@@ -11498,16 +11403,14 @@ define <32 x bfloat> @v_fadd_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) {
; GFX8-NEXT: v_and_b32_e32 v28, 0xffff0000, v28
; GFX8-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
; GFX8-NEXT: v_add_u32_e32 v33, vcc, s4, v33
-; GFX8-NEXT: v_and_b32_e32 v34, 0xff800000, v29
; GFX8-NEXT: v_add_f32_e32 v12, v12, v28
-; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v34
+; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v29
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v29, v29
; GFX8-NEXT: v_bfe_u32 v28, v12, 16, 1
; GFX8-NEXT: v_cndmask_b32_e32 v29, v33, v34, vcc
; GFX8-NEXT: v_add_u32_e32 v28, vcc, v28, v12
; GFX8-NEXT: v_add_u32_e32 v28, vcc, s4, v28
-; GFX8-NEXT: v_and_b32_e32 v33, 0xff800000, v12
-; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v33
+; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v12
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v12, v12
; GFX8-NEXT: v_cndmask_b32_e32 v12, v28, v33, vcc
; GFX8-NEXT: v_lshlrev_b32_e32 v28, 16, v27
@@ -11518,16 +11421,14 @@ define <32 x bfloat> @v_fadd_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) {
; GFX8-NEXT: v_and_b32_e32 v27, 0xffff0000, v27
; GFX8-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
; GFX8-NEXT: v_add_u32_e32 v33, vcc, s4, v33
-; GFX8-NEXT: v_and_b32_e32 v34, 0xff800000, v28
; GFX8-NEXT: v_add_f32_e32 v11, v11, v27
-; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v34
+; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v28
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v28, v28
; GFX8-NEXT: v_bfe_u32 v27, v11, 16, 1
; GFX8-NEXT: v_cndmask_b32_e32 v28, v33, v34, vcc
; GFX8-NEXT: v_add_u32_e32 v27, vcc, v27, v11
; GFX8-NEXT: v_add_u32_e32 v27, vcc, s4, v27
-; GFX8-NEXT: v_and_b32_e32 v33, 0xff800000, v11
-; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v33
+; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v11
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v11, v11
; GFX8-NEXT: v_cndmask_b32_e32 v11, v27, v33, vcc
; GFX8-NEXT: v_lshlrev_b32_e32 v27, 16, v26
@@ -11538,16 +11439,14 @@ define <32 x bfloat> @v_fadd_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) {
; GFX8-NEXT: v_and_b32_e32 v26, 0xffff0000, v26
; GFX8-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
; GFX8-NEXT: v_add_u32_e32 v33, vcc, s4, v33
-; GFX8-NEXT: v_and_b32_e32 v34, 0xff800000, v27
; GFX8-NEXT: v_add_f32_e32 v10, v10, v26
-; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v34
+; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v27
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v27, v27
; GFX8-NEXT: v_bfe_u32 v26, v10, 16, 1
; GFX8-NEXT: v_cndmask_b32_e32 v27, v33, v34, vcc
; GFX8-NEXT: v_add_u32_e32 v26, vcc, v26, v10
; GFX8-NEXT: v_add_u32_e32 v26, vcc, s4, v26
-; GFX8-NEXT: v_and_b32_e32 v33, 0xff800000, v10
-; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v33
+; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v10
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v10, v10
; GFX8-NEXT: v_cndmask_b32_e32 v10, v26, v33, vcc
; GFX8-NEXT: v_lshlrev_b32_e32 v26, 16, v25
@@ -11558,16 +11457,14 @@ define <32 x bfloat> @v_fadd_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) {
; GFX8-NEXT: v_and_b32_e32 v25, 0xffff0000, v25
; GFX8-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
; GFX8-NEXT: v_add_u32_e32 v33, vcc, s4, v33
-; GFX8-NEXT: v_and_b32_e32 v34, 0xff800000, v26
; GFX8-NEXT: v_add_f32_e32 v9, v9, v25
-; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v34
+; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v26
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v26, v26
; GFX8-NEXT: v_bfe_u32 v25, v9, 16, 1
; GFX8-NEXT: v_cndmask_b32_e32 v26, v33, v34, vcc
; GFX8-NEXT: v_add_u32_e32 v25, vcc, v25, v9
; GFX8-NEXT: v_add_u32_e32 v25, vcc, s4, v25
-; GFX8-NEXT: v_and_b32_e32 v33, 0xff800000, v9
-; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v33
+; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v9
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v9, v9
; GFX8-NEXT: v_cndmask_b32_e32 v9, v25, v33, vcc
; GFX8-NEXT: v_lshlrev_b32_e32 v25, 16, v24
@@ -11578,16 +11475,14 @@ define <32 x bfloat> @v_fadd_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) {
; GFX8-NEXT: v_and_b32_e32 v24, 0xffff0000, v24
; GFX8-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
; GFX8-NEXT: v_add_u32_e32 v33, vcc, s4, v33
-; GFX8-NEXT: v_and_b32_e32 v34, 0xff800000, v25
; GFX8-NEXT: v_add_f32_e32 v8, v8, v24
-; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v34
+; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v25
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v25, v25
; GFX8-NEXT: v_bfe_u32 v24, v8, 16, 1
; GFX8-NEXT: v_cndmask_b32_e32 v25, v33, v34, vcc
; GFX8-NEXT: v_add_u32_e32 v24, vcc, v24, v8
; GFX8-NEXT: v_add_u32_e32 v24, vcc, s4, v24
-; GFX8-NEXT: v_and_b32_e32 v33, 0xff800000, v8
-; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v33
+; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v8
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v8, v8
; GFX8-NEXT: v_cndmask_b32_e32 v8, v24, v33, vcc
; GFX8-NEXT: v_lshlrev_b32_e32 v24, 16, v23
@@ -11598,16 +11493,14 @@ define <32 x bfloat> @v_fadd_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) {
; GFX8-NEXT: v_and_b32_e32 v23, 0xffff0000, v23
; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
; GFX8-NEXT: v_add_u32_e32 v33, vcc, s4, v33
-; GFX8-NEXT: v_and_b32_e32 v34, 0xff800000, v24
; GFX8-NEXT: v_add_f32_e32 v7, v7, v23
-; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v34
+; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v24
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v24, v24
; GFX8-NEXT: v_bfe_u32 v23, v7, 16, 1
; GFX8-NEXT: v_cndmask_b32_e32 v24, v33, v34, vcc
; GFX8-NEXT: v_add_u32_e32 v23, vcc, v23, v7
; GFX8-NEXT: v_add_u32_e32 v23, vcc, s4, v23
-; GFX8-NEXT: v_and_b32_e32 v33, 0xff800000, v7
-; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v33
+; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v7
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v7, v7
; GFX8-NEXT: v_cndmask_b32_e32 v7, v23, v33, vcc
; GFX8-NEXT: v_lshlrev_b32_e32 v23, 16, v22
@@ -11618,16 +11511,14 @@ define <32 x bfloat> @v_fadd_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) {
; GFX8-NEXT: v_and_b32_e32 v22, 0xffff0000, v22
; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
; GFX8-NEXT: v_add_u32_e32 v33, vcc, s4, v33
-; GFX8-NEXT: v_and_b32_e32 v34, 0xff800000, v23
; GFX8-NEXT: v_add_f32_e32 v6, v6, v22
-; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v34
+; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v23
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v23, v23
; GFX8-NEXT: v_bfe_u32 v22, v6, 16, 1
; GFX8-NEXT: v_cndmask_b32_e32 v23, v33, v34, vcc
; GFX8-NEXT: v_add_u32_e32 v22, vcc, v22, v6
; GFX8-NEXT: v_add_u32_e32 v22, vcc, s4, v22
-; GFX8-NEXT: v_and_b32_e32 v33, 0xff800000, v6
-; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v33
+; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v6
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
; GFX8-NEXT: v_cndmask_b32_e32 v6, v22, v33, vcc
; GFX8-NEXT: v_lshlrev_b32_e32 v22, 16, v21
@@ -11638,16 +11529,14 @@ define <32 x bfloat> @v_fadd_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) {
; GFX8-NEXT: v_and_b32_e32 v21, 0xffff0000, v21
; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
; GFX8-NEXT: v_add_u32_e32 v33, vcc, s4, v33
-; GFX8-NEXT: v_and_b32_e32 v34, 0xff800000, v22
; GFX8-NEXT: v_add_f32_e32 v5, v5, v21
-; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v34
+; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v22
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v22, v22
; GFX8-NEXT: v_bfe_u32 v21, v5, 16, 1
; GFX8-NEXT: v_cndmask_b32_e32 v22, v33, v34, vcc
; GFX8-NEXT: v_add_u32_e32 v21, vcc, v21, v5
; GFX8-NEXT: v_add_u32_e32 v21, vcc, s4, v21
-; GFX8-NEXT: v_and_b32_e32 v33, 0xff800000, v5
-; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v33
+; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v5
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
; GFX8-NEXT: v_cndmask_b32_e32 v5, v21, v33, vcc
; GFX8-NEXT: v_lshlrev_b32_e32 v21, 16, v20
@@ -11658,16 +11547,14 @@ define <32 x bfloat> @v_fadd_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) {
; GFX8-NEXT: v_and_b32_e32 v20, 0xffff0000, v20
; GFX8-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
; GFX8-NEXT: v_add_u32_e32 v33, vcc, s4, v33
-; GFX8-NEXT: v_and_b32_e32 v34, 0xff800000, v21
; GFX8-NEXT: v_add_f32_e32 v4, v4, v20
-; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v34
+; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v21
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v21, v21
; GFX8-NEXT: v_bfe_u32 v20, v4, 16, 1
; GFX8-NEXT: v_cndmask_b32_e32 v21, v33, v34, vcc
; GFX8-NEXT: v_add_u32_e32 v20, vcc, v20, v4
; GFX8-NEXT: v_add_u32_e32 v20, vcc, s4, v20
-; GFX8-NEXT: v_and_b32_e32 v33, 0xff800000, v4
-; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v33
+; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v4
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
; GFX8-NEXT: v_cndmask_b32_e32 v4, v20, v33, vcc
; GFX8-NEXT: v_lshlrev_b32_e32 v20, 16, v19
@@ -11678,16 +11565,14 @@ define <32 x bfloat> @v_fadd_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) {
; GFX8-NEXT: v_and_b32_e32 v19, 0xffff0000, v19
; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
; GFX8-NEXT: v_add_u32_e32 v33, vcc, s4, v33
-; GFX8-NEXT: v_and_b32_e32 v34, 0xff800000, v20
; GFX8-NEXT: v_add_f32_e32 v3, v3, v19
-; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v34
+; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v20
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v20, v20
; GFX8-NEXT: v_bfe_u32 v19, v3, 16, 1
; GFX8-NEXT: v_cndmask_b32_e32 v20, v33, v34, vcc
; GFX8-NEXT: v_add_u32_e32 v19, vcc, v19, v3
; GFX8-NEXT: v_add_u32_e32 v19, vcc, s4, v19
-; GFX8-NEXT: v_and_b32_e32 v33, 0xff800000, v3
-; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v33
+; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v3
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
; GFX8-NEXT: v_cndmask_b32_e32 v3, v19, v33, vcc
; GFX8-NEXT: v_lshlrev_b32_e32 v19, 16, v18
@@ -11698,16 +11583,14 @@ define <32 x bfloat> @v_fadd_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) {
; GFX8-NEXT: v_and_b32_e32 v18, 0xffff0000, v18
; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX8-NEXT: v_add_u32_e32 v33, vcc, s4, v33
-; GFX8-NEXT: v_and_b32_e32 v34, 0xff800000, v19
; GFX8-NEXT: v_add_f32_e32 v2, v2, v18
-; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v34
+; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v19
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v19, v19
; GFX8-NEXT: v_bfe_u32 v18, v2, 16, 1
; GFX8-NEXT: v_cndmask_b32_e32 v19, v33, v34, vcc
; GFX8-NEXT: v_add_u32_e32 v18, vcc, v18, v2
; GFX8-NEXT: v_add_u32_e32 v18, vcc, s4, v18
-; GFX8-NEXT: v_and_b32_e32 v33, 0xff800000, v2
-; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v33
+; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v2
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
; GFX8-NEXT: v_cndmask_b32_e32 v2, v18, v33, vcc
; GFX8-NEXT: v_lshlrev_b32_e32 v18, 16, v17
@@ -11718,16 +11601,14 @@ define <32 x bfloat> @v_fadd_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) {
; GFX8-NEXT: v_and_b32_e32 v17, 0xffff0000, v17
; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX8-NEXT: v_add_u32_e32 v33, vcc, s4, v33
-; GFX8-NEXT: v_and_b32_e32 v34, 0xff800000, v18
; GFX8-NEXT: v_add_f32_e32 v1, v1, v17
-; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v34
+; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v18
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v18, v18
; GFX8-NEXT: v_bfe_u32 v17, v1, 16, 1
; GFX8-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc
; GFX8-NEXT: v_add_u32_e32 v17, vcc, v17, v1
; GFX8-NEXT: v_add_u32_e32 v17, vcc, s4, v17
-; GFX8-NEXT: v_and_b32_e32 v33, 0xff800000, v1
-; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v33
+; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v1
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
; GFX8-NEXT: v_cndmask_b32_e32 v1, v17, v33, vcc
; GFX8-NEXT: v_lshlrev_b32_e32 v17, 16, v16
@@ -11738,16 +11619,14 @@ define <32 x bfloat> @v_fadd_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) {
; GFX8-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX8-NEXT: v_add_u32_e32 v33, vcc, s4, v33
-; GFX8-NEXT: v_and_b32_e32 v34, 0xff800000, v17
; GFX8-NEXT: v_add_f32_e32 v0, v0, v16
-; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v34
+; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v17
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v17, v17
; GFX8-NEXT: v_bfe_u32 v16, v0, 16, 1
; GFX8-NEXT: v_cndmask_b32_e32 v17, v33, v34, vcc
; GFX8-NEXT: v_add_u32_e32 v16, vcc, v16, v0
; GFX8-NEXT: v_add_u32_e32 v16, vcc, s4, v16
-; GFX8-NEXT: v_and_b32_e32 v33, 0xff800000, v0
-; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v33
+; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v0
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
; GFX8-NEXT: v_cndmask_b32_e32 v0, v16, v33, vcc
; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
@@ -11788,292 +11667,260 @@ define <32 x bfloat> @v_fadd_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) {
; GFX9-NEXT: v_lshlrev_b32_e32 v31, 16, v30
; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v14
; GFX9-NEXT: v_add_f32_e32 v31, v32, v31
-; GFX9-NEXT: s_movk_i32 s4, 0x7fff
-; GFX9-NEXT: v_bfe_u32 v32, v31, 16, 1
-; GFX9-NEXT: v_and_b32_e32 v33, 0xff800000, v31
; GFX9-NEXT: v_and_b32_e32 v30, 0xffff0000, v30
; GFX9-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
+; GFX9-NEXT: s_movk_i32 s4, 0x7fff
+; GFX9-NEXT: v_bfe_u32 v32, v31, 16, 1
+; GFX9-NEXT: v_add_f32_e32 v14, v14, v30
; GFX9-NEXT: v_add3_u32 v32, v32, v31, s4
-; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v33
+; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v31
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v31, v31
-; GFX9-NEXT: v_add_f32_e32 v14, v14, v30
-; GFX9-NEXT: v_cndmask_b32_e32 v31, v32, v33, vcc
; GFX9-NEXT: v_bfe_u32 v30, v14, 16, 1
-; GFX9-NEXT: v_and_b32_e32 v32, 0xff800000, v14
+; GFX9-NEXT: v_cndmask_b32_e32 v31, v32, v33, vcc
; GFX9-NEXT: v_add3_u32 v30, v30, v14, s4
-; GFX9-NEXT: v_or_b32_e32 v32, 0x400000, v32
+; GFX9-NEXT: v_or_b32_e32 v32, 0x400000, v14
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v14, v14
; GFX9-NEXT: v_cndmask_b32_e32 v14, v30, v32, vcc
; GFX9-NEXT: v_lshlrev_b32_e32 v30, 16, v29
; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v13
+; GFX9-NEXT: v_add_f32_e32 v30, v32, v30
; GFX9-NEXT: v_and_b32_e32 v29, 0xffff0000, v29
; GFX9-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
-; GFX9-NEXT: v_add_f32_e32 v13, v13, v29
-; GFX9-NEXT: buffer_load_dword v29, off, s[0:3], s32
-; GFX9-NEXT: v_add_f32_e32 v30, v32, v30
; GFX9-NEXT: v_bfe_u32 v32, v30, 16, 1
-; GFX9-NEXT: v_and_b32_e32 v33, 0xff800000, v30
+; GFX9-NEXT: v_add_f32_e32 v13, v13, v29
; GFX9-NEXT: v_add3_u32 v32, v32, v30, s4
-; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v33
+; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v30
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v30, v30
+; GFX9-NEXT: v_bfe_u32 v29, v13, 16, 1
; GFX9-NEXT: v_cndmask_b32_e32 v30, v32, v33, vcc
-; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v15
+; GFX9-NEXT: v_add3_u32 v29, v29, v13, s4
+; GFX9-NEXT: v_or_b32_e32 v32, 0x400000, v13
+; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v13, v13
+; GFX9-NEXT: v_cndmask_b32_e32 v13, v29, v32, vcc
+; GFX9-NEXT: v_lshlrev_b32_e32 v29, 16, v28
+; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v12
+; GFX9-NEXT: v_add_f32_e32 v32, v32, v29
+; GFX9-NEXT: buffer_load_dword v29, off, s[0:3], s32
+; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v15
; GFX9-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
+; GFX9-NEXT: v_and_b32_e32 v28, 0xffff0000, v28
+; GFX9-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
+; GFX9-NEXT: v_add_f32_e32 v12, v12, v28
+; GFX9-NEXT: v_bfe_u32 v28, v12, 16, 1
+; GFX9-NEXT: v_add3_u32 v28, v28, v12, s4
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v29
-; GFX9-NEXT: v_add_f32_e32 v32, v32, v33
+; GFX9-NEXT: v_lshlrev_b32_e32 v34, 16, v29
+; GFX9-NEXT: v_add_f32_e32 v33, v33, v34
; GFX9-NEXT: v_and_b32_e32 v29, 0xffff0000, v29
; GFX9-NEXT: v_add_f32_e32 v29, v15, v29
-; GFX9-NEXT: v_bfe_u32 v15, v32, 16, 1
-; GFX9-NEXT: v_and_b32_e32 v33, 0xff800000, v32
-; GFX9-NEXT: v_add3_u32 v15, v15, v32, s4
-; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v33
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32
-; GFX9-NEXT: v_cndmask_b32_e32 v15, v15, v33, vcc
-; GFX9-NEXT: v_bfe_u32 v32, v29, 16, 1
-; GFX9-NEXT: v_and_b32_e32 v33, 0xff800000, v29
-; GFX9-NEXT: v_add3_u32 v32, v32, v29, s4
-; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v33
+; GFX9-NEXT: v_bfe_u32 v15, v33, 16, 1
+; GFX9-NEXT: v_add3_u32 v15, v15, v33, s4
+; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v33
+; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v33, v33
+; GFX9-NEXT: v_bfe_u32 v33, v29, 16, 1
+; GFX9-NEXT: v_cndmask_b32_e32 v15, v15, v34, vcc
+; GFX9-NEXT: v_add3_u32 v33, v33, v29, s4
+; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v29
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v29, v29
-; GFX9-NEXT: v_cndmask_b32_e32 v29, v32, v33, vcc
-; GFX9-NEXT: v_bfe_u32 v32, v13, 16, 1
-; GFX9-NEXT: v_and_b32_e32 v33, 0xff800000, v13
-; GFX9-NEXT: v_add3_u32 v32, v32, v13, s4
-; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v33
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v13, v13
-; GFX9-NEXT: v_cndmask_b32_e32 v13, v32, v33, vcc
-; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v28
-; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v12
-; GFX9-NEXT: v_add_f32_e32 v32, v33, v32
+; GFX9-NEXT: v_cndmask_b32_e32 v29, v33, v34, vcc
; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1
-; GFX9-NEXT: v_and_b32_e32 v34, 0xff800000, v32
-; GFX9-NEXT: v_and_b32_e32 v28, 0xffff0000, v28
-; GFX9-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
; GFX9-NEXT: v_add3_u32 v33, v33, v32, s4
-; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v34
+; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v32
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32
-; GFX9-NEXT: v_add_f32_e32 v12, v12, v28
; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc
-; GFX9-NEXT: v_bfe_u32 v28, v12, 16, 1
-; GFX9-NEXT: v_and_b32_e32 v33, 0xff800000, v12
-; GFX9-NEXT: v_add3_u32 v28, v28, v12, s4
-; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v33
+; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v12
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v12, v12
; GFX9-NEXT: v_cndmask_b32_e32 v12, v28, v33, vcc
; GFX9-NEXT: v_lshlrev_b32_e32 v28, 16, v27
; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v11
; GFX9-NEXT: v_add_f32_e32 v28, v33, v28
-; GFX9-NEXT: v_bfe_u32 v33, v28, 16, 1
-; GFX9-NEXT: v_and_b32_e32 v34, 0xff800000, v28
; GFX9-NEXT: v_and_b32_e32 v27, 0xffff0000, v27
; GFX9-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
+; GFX9-NEXT: v_bfe_u32 v33, v28, 16, 1
+; GFX9-NEXT: v_add_f32_e32 v11, v11, v27
; GFX9-NEXT: v_add3_u32 v33, v33, v28, s4
-; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v34
+; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v28
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v28, v28
-; GFX9-NEXT: v_add_f32_e32 v11, v11, v27
-; GFX9-NEXT: v_cndmask_b32_e32 v28, v33, v34, vcc
; GFX9-NEXT: v_bfe_u32 v27, v11, 16, 1
-; GFX9-NEXT: v_and_b32_e32 v33, 0xff800000, v11
+; GFX9-NEXT: v_cndmask_b32_e32 v28, v33, v34, vcc
; GFX9-NEXT: v_add3_u32 v27, v27, v11, s4
-; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v33
+; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v11
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v11, v11
; GFX9-NEXT: v_cndmask_b32_e32 v11, v27, v33, vcc
; GFX9-NEXT: v_lshlrev_b32_e32 v27, 16, v26
; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v10
; GFX9-NEXT: v_add_f32_e32 v27, v33, v27
-; GFX9-NEXT: v_bfe_u32 v33, v27, 16, 1
-; GFX9-NEXT: v_and_b32_e32 v34, 0xff800000, v27
; GFX9-NEXT: v_and_b32_e32 v26, 0xffff0000, v26
; GFX9-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
+; GFX9-NEXT: v_bfe_u32 v33, v27, 16, 1
+; GFX9-NEXT: v_add_f32_e32 v10, v10, v26
; GFX9-NEXT: v_add3_u32 v33, v33, v27, s4
-; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v34
+; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v27
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v27, v27
-; GFX9-NEXT: v_add_f32_e32 v10, v10, v26
-; GFX9-NEXT: v_cndmask_b32_e32 v27, v33, v34, vcc
; GFX9-NEXT: v_bfe_u32 v26, v10, 16, 1
-; GFX9-NEXT: v_and_b32_e32 v33, 0xff800000, v10
+; GFX9-NEXT: v_cndmask_b32_e32 v27, v33, v34, vcc
; GFX9-NEXT: v_add3_u32 v26, v26, v10, s4
-; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v33
+; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v10
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v10, v10
; GFX9-NEXT: v_cndmask_b32_e32 v10, v26, v33, vcc
; GFX9-NEXT: v_lshlrev_b32_e32 v26, 16, v25
; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v9
; GFX9-NEXT: v_add_f32_e32 v26, v33, v26
-; GFX9-NEXT: v_bfe_u32 v33, v26, 16, 1
-; GFX9-NEXT: v_and_b32_e32 v34, 0xff800000, v26
; GFX9-NEXT: v_and_b32_e32 v25, 0xffff0000, v25
; GFX9-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
+; GFX9-NEXT: v_bfe_u32 v33, v26, 16, 1
+; GFX9-NEXT: v_add_f32_e32 v9, v9, v25
; GFX9-NEXT: v_add3_u32 v33, v33, v26, s4
-; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v34
+; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v26
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v26, v26
-; GFX9-NEXT: v_add_f32_e32 v9, v9, v25
-; GFX9-NEXT: v_cndmask_b32_e32 v26, v33, v34, vcc
; GFX9-NEXT: v_bfe_u32 v25, v9, 16, 1
-; GFX9-NEXT: v_and_b32_e32 v33, 0xff800000, v9
+; GFX9-NEXT: v_cndmask_b32_e32 v26, v33, v34, vcc
; GFX9-NEXT: v_add3_u32 v25, v25, v9, s4
-; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v33
+; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v9
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v9, v9
; GFX9-NEXT: v_cndmask_b32_e32 v9, v25, v33, vcc
; GFX9-NEXT: v_lshlrev_b32_e32 v25, 16, v24
; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v8
; GFX9-NEXT: v_add_f32_e32 v25, v33, v25
-; GFX9-NEXT: v_bfe_u32 v33, v25, 16, 1
-; GFX9-NEXT: v_and_b32_e32 v34, 0xff800000, v25
; GFX9-NEXT: v_and_b32_e32 v24, 0xffff0000, v24
; GFX9-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
+; GFX9-NEXT: v_bfe_u32 v33, v25, 16, 1
+; GFX9-NEXT: v_add_f32_e32 v8, v8, v24
; GFX9-NEXT: v_add3_u32 v33, v33, v25, s4
-; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v34
+; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v25
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v25, v25
-; GFX9-NEXT: v_add_f32_e32 v8, v8, v24
-; GFX9-NEXT: v_cndmask_b32_e32 v25, v33, v34, vcc
; GFX9-NEXT: v_bfe_u32 v24, v8, 16, 1
-; GFX9-NEXT: v_and_b32_e32 v33, 0xff800000, v8
+; GFX9-NEXT: v_cndmask_b32_e32 v25, v33, v34, vcc
; GFX9-NEXT: v_add3_u32 v24, v24, v8, s4
-; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v33
+; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v8
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v8, v8
; GFX9-NEXT: v_cndmask_b32_e32 v8, v24, v33, vcc
; GFX9-NEXT: v_lshlrev_b32_e32 v24, 16, v23
; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v7
; GFX9-NEXT: v_add_f32_e32 v24, v33, v24
-; GFX9-NEXT: v_bfe_u32 v33, v24, 16, 1
-; GFX9-NEXT: v_and_b32_e32 v34, 0xff800000, v24
; GFX9-NEXT: v_and_b32_e32 v23, 0xffff0000, v23
; GFX9-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
+; GFX9-NEXT: v_bfe_u32 v33, v24, 16, 1
+; GFX9-NEXT: v_add_f32_e32 v7, v7, v23
; GFX9-NEXT: v_add3_u32 v33, v33, v24, s4
-; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v34
+; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v24
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v24, v24
-; GFX9-NEXT: v_add_f32_e32 v7, v7, v23
-; GFX9-NEXT: v_cndmask_b32_e32 v24, v33, v34, vcc
; GFX9-NEXT: v_bfe_u32 v23, v7, 16, 1
-; GFX9-NEXT: v_and_b32_e32 v33, 0xff800000, v7
+; GFX9-NEXT: v_cndmask_b32_e32 v24, v33, v34, vcc
; GFX9-NEXT: v_add3_u32 v23, v23, v7, s4
-; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v33
+; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v7
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v7, v7
; GFX9-NEXT: v_cndmask_b32_e32 v7, v23, v33, vcc
; GFX9-NEXT: v_lshlrev_b32_e32 v23, 16, v22
; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v6
; GFX9-NEXT: v_add_f32_e32 v23, v33, v23
-; GFX9-NEXT: v_bfe_u32 v33, v23, 16, 1
-; GFX9-NEXT: v_and_b32_e32 v34, 0xff800000, v23
; GFX9-NEXT: v_and_b32_e32 v22, 0xffff0000, v22
; GFX9-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
+; GFX9-NEXT: v_bfe_u32 v33, v23, 16, 1
+; GFX9-NEXT: v_add_f32_e32 v6, v6, v22
; GFX9-NEXT: v_add3_u32 v33, v33, v23, s4
-; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v34
+; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v23
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v23, v23
-; GFX9-NEXT: v_add_f32_e32 v6, v6, v22
-; GFX9-NEXT: v_cndmask_b32_e32 v23, v33, v34, vcc
; GFX9-NEXT: v_bfe_u32 v22, v6, 16, 1
-; GFX9-NEXT: v_and_b32_e32 v33, 0xff800000, v6
+; GFX9-NEXT: v_cndmask_b32_e32 v23, v33, v34, vcc
; GFX9-NEXT: v_add3_u32 v22, v22, v6, s4
-; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v33
+; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v6
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
; GFX9-NEXT: v_cndmask_b32_e32 v6, v22, v33, vcc
; GFX9-NEXT: v_lshlrev_b32_e32 v22, 16, v21
; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v5
; GFX9-NEXT: v_add_f32_e32 v22, v33, v22
-; GFX9-NEXT: v_bfe_u32 v33, v22, 16, 1
-; GFX9-NEXT: v_and_b32_e32 v34, 0xff800000, v22
; GFX9-NEXT: v_and_b32_e32 v21, 0xffff0000, v21
; GFX9-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
+; GFX9-NEXT: v_bfe_u32 v33, v22, 16, 1
+; GFX9-NEXT: v_add_f32_e32 v5, v5, v21
; GFX9-NEXT: v_add3_u32 v33, v33, v22, s4
-; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v34
+; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v22
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v22, v22
-; GFX9-NEXT: v_add_f32_e32 v5, v5, v21
-; GFX9-NEXT: v_cndmask_b32_e32 v22, v33, v34, vcc
; GFX9-NEXT: v_bfe_u32 v21, v5, 16, 1
-; GFX9-NEXT: v_and_b32_e32 v33, 0xff800000, v5
+; GFX9-NEXT: v_cndmask_b32_e32 v22, v33, v34, vcc
; GFX9-NEXT: v_add3_u32 v21, v21, v5, s4
-; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v33
+; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v5
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
; GFX9-NEXT: v_cndmask_b32_e32 v5, v21, v33, vcc
; GFX9-NEXT: v_lshlrev_b32_e32 v21, 16, v20
; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v4
; GFX9-NEXT: v_add_f32_e32 v21, v33, v21
-; GFX9-NEXT: v_bfe_u32 v33, v21, 16, 1
-; GFX9-NEXT: v_and_b32_e32 v34, 0xff800000, v21
; GFX9-NEXT: v_and_b32_e32 v20, 0xffff0000, v20
; GFX9-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
+; GFX9-NEXT: v_bfe_u32 v33, v21, 16, 1
+; GFX9-NEXT: v_add_f32_e32 v4, v4, v20
; GFX9-NEXT: v_add3_u32 v33, v33, v21, s4
-; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v34
+; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v21
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v21, v21
-; GFX9-NEXT: v_add_f32_e32 v4, v4, v20
-; GFX9-NEXT: v_cndmask_b32_e32 v21, v33, v34, vcc
; GFX9-NEXT: v_bfe_u32 v20, v4, 16, 1
-; GFX9-NEXT: v_and_b32_e32 v33, 0xff800000, v4
+; GFX9-NEXT: v_cndmask_b32_e32 v21, v33, v34, vcc
; GFX9-NEXT: v_add3_u32 v20, v20, v4, s4
-; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v33
+; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v4
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
; GFX9-NEXT: v_cndmask_b32_e32 v4, v20, v33, vcc
; GFX9-NEXT: v_lshlrev_b32_e32 v20, 16, v19
; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v3
; GFX9-NEXT: v_add_f32_e32 v20, v33, v20
-; GFX9-NEXT: v_bfe_u32 v33, v20, 16, 1
-; GFX9-NEXT: v_and_b32_e32 v34, 0xff800000, v20
; GFX9-NEXT: v_and_b32_e32 v19, 0xffff0000, v19
; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; GFX9-NEXT: v_bfe_u32 v33, v20, 16, 1
+; GFX9-NEXT: v_add_f32_e32 v3, v3, v19
; GFX9-NEXT: v_add3_u32 v33, v33, v20, s4
-; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v34
+; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v20
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v20, v20
-; GFX9-NEXT: v_add_f32_e32 v3, v3, v19
-; GFX9-NEXT: v_cndmask_b32_e32 v20, v33, v34, vcc
; GFX9-NEXT: v_bfe_u32 v19, v3, 16, 1
-; GFX9-NEXT: v_and_b32_e32 v33, 0xff800000, v3
+; GFX9-NEXT: v_cndmask_b32_e32 v20, v33, v34, vcc
; GFX9-NEXT: v_add3_u32 v19, v19, v3, s4
-; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v33
+; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v3
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
; GFX9-NEXT: v_cndmask_b32_e32 v3, v19, v33, vcc
; GFX9-NEXT: v_lshlrev_b32_e32 v19, 16, v18
; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v2
; GFX9-NEXT: v_add_f32_e32 v19, v33, v19
-; GFX9-NEXT: v_bfe_u32 v33, v19, 16, 1
-; GFX9-NEXT: v_and_b32_e32 v34, 0xff800000, v19
; GFX9-NEXT: v_and_b32_e32 v18, 0xffff0000, v18
; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX9-NEXT: v_bfe_u32 v33, v19, 16, 1
+; GFX9-NEXT: v_add_f32_e32 v2, v2, v18
; GFX9-NEXT: v_add3_u32 v33, v33, v19, s4
-; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v34
+; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v19
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v19, v19
-; GFX9-NEXT: v_add_f32_e32 v2, v2, v18
-; GFX9-NEXT: v_cndmask_b32_e32 v19, v33, v34, vcc
; GFX9-NEXT: v_bfe_u32 v18, v2, 16, 1
-; GFX9-NEXT: v_and_b32_e32 v33, 0xff800000, v2
+; GFX9-NEXT: v_cndmask_b32_e32 v19, v33, v34, vcc
; GFX9-NEXT: v_add3_u32 v18, v18, v2, s4
-; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v33
+; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v2
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
; GFX9-NEXT: v_cndmask_b32_e32 v2, v18, v33, vcc
; GFX9-NEXT: v_lshlrev_b32_e32 v18, 16, v17
; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v1
; GFX9-NEXT: v_add_f32_e32 v18, v33, v18
-; GFX9-NEXT: v_bfe_u32 v33, v18, 16, 1
-; GFX9-NEXT: v_and_b32_e32 v34, 0xff800000, v18
; GFX9-NEXT: v_and_b32_e32 v17, 0xffff0000, v17
; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GFX9-NEXT: v_bfe_u32 v33, v18, 16, 1
+; GFX9-NEXT: v_add_f32_e32 v1, v1, v17
; GFX9-NEXT: v_add3_u32 v33, v33, v18, s4
-; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v34
+; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v18
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v18, v18
-; GFX9-NEXT: v_add_f32_e32 v1, v1, v17
-; GFX9-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc
; GFX9-NEXT: v_bfe_u32 v17, v1, 16, 1
-; GFX9-NEXT: v_and_b32_e32 v33, 0xff800000, v1
+; GFX9-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc
; GFX9-NEXT: v_add3_u32 v17, v17, v1, s4
-; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v33
+; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v1
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
; GFX9-NEXT: v_cndmask_b32_e32 v1, v17, v33, vcc
; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v16
; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v0
; GFX9-NEXT: v_add_f32_e32 v17, v33, v17
-; GFX9-NEXT: v_bfe_u32 v33, v17, 16, 1
-; GFX9-NEXT: v_and_b32_e32 v34, 0xff800000, v17
; GFX9-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX9-NEXT: v_bfe_u32 v33, v17, 16, 1
+; GFX9-NEXT: v_add_f32_e32 v0, v0, v16
; GFX9-NEXT: v_add3_u32 v33, v33, v17, s4
-; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v34
+; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v17
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v17, v17
-; GFX9-NEXT: v_add_f32_e32 v0, v0, v16
-; GFX9-NEXT: v_cndmask_b32_e32 v17, v33, v34, vcc
; GFX9-NEXT: v_bfe_u32 v16, v0, 16, 1
-; GFX9-NEXT: v_and_b32_e32 v33, 0xff800000, v0
+; GFX9-NEXT: v_cndmask_b32_e32 v17, v33, v34, vcc
; GFX9-NEXT: v_add3_u32 v16, v16, v0, s4
-; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v33
+; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v0
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
; GFX9-NEXT: v_cndmask_b32_e32 v0, v16, v33, vcc
; GFX9-NEXT: s_mov_b32 s4, 0x7060302
@@ -12098,7 +11945,7 @@ define <32 x bfloat> @v_fadd_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) {
; GFX10-LABEL: v_fadd_v32bf16:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: buffer_load_dword v31, off, s[0:3], s32
+; GFX10-NEXT: buffer_load_dword v32, off, s[0:3], s32
; GFX10-NEXT: v_lshlrev_b32_e32 v39, 16, v27
; GFX10-NEXT: v_lshlrev_b32_e32 v48, 16, v11
; GFX10-NEXT: v_and_b32_e32 v27, 0xffff0000, v27
@@ -12163,7 +12010,6 @@ define <32 x bfloat> @v_fadd_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) {
; GFX10-NEXT: v_bfe_u32 v55, v11, 16, 1
; GFX10-NEXT: v_bfe_u32 v65, v49, 16, 1
; GFX10-NEXT: v_bfe_u32 v67, v10, 16, 1
-; GFX10-NEXT: s_mov_b32 s23, 0xff800000
; GFX10-NEXT: v_lshlrev_b32_e32 v33, 16, v30
; GFX10-NEXT: v_lshlrev_b32_e32 v34, 16, v14
; GFX10-NEXT: v_and_b32_e32 v30, 0xffff0000, v30
@@ -12179,10 +12025,10 @@ define <32 x bfloat> @v_fadd_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) {
; GFX10-NEXT: v_add_f32_e32 v1, v1, v17
; GFX10-NEXT: v_add_f32_e32 v17, v26, v50
; GFX10-NEXT: v_add_f32_e32 v0, v0, v16
-; GFX10-NEXT: v_and_or_b32 v54, v39, s23, 0x400000
-; GFX10-NEXT: v_and_or_b32 v64, v11, s23, 0x400000
-; GFX10-NEXT: v_and_or_b32 v66, v49, s23, 0x400000
-; GFX10-NEXT: v_and_or_b32 v68, v10, s23, 0x400000
+; GFX10-NEXT: v_or_b32_e32 v54, 0x400000, v39
+; GFX10-NEXT: v_or_b32_e32 v64, 0x400000, v11
+; GFX10-NEXT: v_or_b32_e32 v66, 0x400000, v49
+; GFX10-NEXT: v_or_b32_e32 v68, 0x400000, v10
; GFX10-NEXT: v_cmp_u_f32_e64 s9, v39, v39
; GFX10-NEXT: v_add3_u32 v39, v53, v39, 0x7fff
; GFX10-NEXT: v_cmp_u_f32_e64 s10, v11, v11
@@ -12220,28 +12066,28 @@ define <32 x bfloat> @v_fadd_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) {
; GFX10-NEXT: v_bfe_u32 v27, v14, 16, 1
; GFX10-NEXT: v_bfe_u32 v29, v35, 16, 1
; GFX10-NEXT: v_bfe_u32 v34, v13, 16, 1
-; GFX10-NEXT: v_and_or_b32 v48, v37, s23, 0x400000
-; GFX10-NEXT: v_and_or_b32 v52, v12, s23, 0x400000
+; GFX10-NEXT: v_or_b32_e32 v48, 0x400000, v37
+; GFX10-NEXT: v_or_b32_e32 v52, 0x400000, v12
; GFX10-NEXT: v_cmp_u_f32_e64 s7, v37, v37
; GFX10-NEXT: v_add3_u32 v37, v38, v37, 0x7fff
; GFX10-NEXT: v_cmp_u_f32_e64 s8, v12, v12
; GFX10-NEXT: v_add3_u32 v12, v50, v12, 0x7fff
; GFX10-NEXT: v_cmp_u_f32_e64 s10, v18, v18
; GFX10-NEXT: v_add3_u32 v54, v54, v18, 0x7fff
-; GFX10-NEXT: v_and_or_b32 v18, v18, s23, 0x400000
+; GFX10-NEXT: v_or_b32_e32 v18, 0x400000, v18
; GFX10-NEXT: v_cmp_u_f32_e64 s11, v1, v1
; GFX10-NEXT: v_add3_u32 v64, v64, v1, 0x7fff
-; GFX10-NEXT: v_and_or_b32 v1, v1, s23, 0x400000
+; GFX10-NEXT: v_or_b32_e32 v1, 0x400000, v1
; GFX10-NEXT: v_cmp_u_f32_e64 s12, v17, v17
; GFX10-NEXT: v_add3_u32 v66, v66, v17, 0x7fff
-; GFX10-NEXT: v_and_or_b32 v17, v17, s23, 0x400000
+; GFX10-NEXT: v_or_b32_e32 v17, 0x400000, v17
; GFX10-NEXT: v_cmp_u_f32_e64 s22, v0, v0
; GFX10-NEXT: v_add3_u32 v68, v68, v0, 0x7fff
-; GFX10-NEXT: v_and_or_b32 v0, v0, s23, 0x400000
-; GFX10-NEXT: v_and_or_b32 v26, v33, s23, 0x400000
-; GFX10-NEXT: v_and_or_b32 v28, v14, s23, 0x400000
-; GFX10-NEXT: v_and_or_b32 v30, v35, s23, 0x400000
-; GFX10-NEXT: v_and_or_b32 v36, v13, s23, 0x400000
+; GFX10-NEXT: v_or_b32_e32 v0, 0x400000, v0
+; GFX10-NEXT: v_or_b32_e32 v26, 0x400000, v33
+; GFX10-NEXT: v_or_b32_e32 v28, 0x400000, v14
+; GFX10-NEXT: v_or_b32_e32 v30, 0x400000, v35
+; GFX10-NEXT: v_or_b32_e32 v36, 0x400000, v13
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v33, v33
; GFX10-NEXT: v_add3_u32 v16, v16, v33, 0x7fff
; GFX10-NEXT: v_bfe_u32 v33, v51, 16, 1
@@ -12260,12 +12106,12 @@ define <32 x bfloat> @v_fadd_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) {
; GFX10-NEXT: v_cndmask_b32_e64 v17, v66, v17, s12
; GFX10-NEXT: v_cndmask_b32_e64 v0, v68, v0, s22
; GFX10-NEXT: v_cndmask_b32_e64 v1, v64, v1, s11
-; GFX10-NEXT: v_lshlrev_b32_e32 v32, 16, v15
+; GFX10-NEXT: v_lshlrev_b32_e32 v31, 16, v15
; GFX10-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
-; GFX10-NEXT: v_and_or_b32 v27, v51, s23, 0x400000
+; GFX10-NEXT: v_or_b32_e32 v27, 0x400000, v51
; GFX10-NEXT: v_bfe_u32 v35, v9, 16, 1
; GFX10-NEXT: v_bfe_u32 v38, v25, 16, 1
-; GFX10-NEXT: v_and_or_b32 v67, v24, s23, 0x400000
+; GFX10-NEXT: v_or_b32_e32 v67, 0x400000, v24
; GFX10-NEXT: v_cmp_u_f32_e64 s13, v51, v51
; GFX10-NEXT: v_add3_u32 v33, v33, v51, 0x7fff
; GFX10-NEXT: v_bfe_u32 v51, v7, 16, 1
@@ -12282,51 +12128,51 @@ define <32 x bfloat> @v_fadd_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) {
; GFX10-NEXT: v_bfe_u32 v36, v3, 16, 1
; GFX10-NEXT: v_cmp_u_f32_e64 s8, v19, v19
; GFX10-NEXT: v_add3_u32 v48, v48, v19, 0x7fff
-; GFX10-NEXT: v_and_or_b32 v19, v19, s23, 0x400000
+; GFX10-NEXT: v_or_b32_e32 v19, 0x400000, v19
; GFX10-NEXT: v_cmp_u_f32_e64 s9, v2, v2
; GFX10-NEXT: v_add3_u32 v52, v52, v2, 0x7fff
-; GFX10-NEXT: v_and_or_b32 v2, v2, s23, 0x400000
+; GFX10-NEXT: v_or_b32_e32 v2, 0x400000, v2
; GFX10-NEXT: v_perm_b32 v0, v0, v17, 0x7060302
; GFX10-NEXT: v_perm_b32 v1, v1, v18, 0x7060302
-; GFX10-NEXT: v_and_or_b32 v34, v9, s23, 0x400000
-; GFX10-NEXT: v_and_or_b32 v50, v25, s23, 0x400000
+; GFX10-NEXT: v_or_b32_e32 v34, 0x400000, v9
+; GFX10-NEXT: v_or_b32_e32 v50, 0x400000, v25
; GFX10-NEXT: v_bfe_u32 v53, v8, 16, 1
; GFX10-NEXT: v_cmp_u_f32_e64 s14, v9, v9
; GFX10-NEXT: v_add3_u32 v9, v35, v9, 0x7fff
-; GFX10-NEXT: v_and_or_b32 v35, v7, s23, 0x400000
+; GFX10-NEXT: v_or_b32_e32 v35, 0x400000, v7
; GFX10-NEXT: v_cmp_u_f32_e64 s15, v25, v25
; GFX10-NEXT: v_add3_u32 v25, v38, v25, 0x7fff
; GFX10-NEXT: v_bfe_u32 v38, v23, 16, 1
; GFX10-NEXT: v_cmp_u_f32_e64 s18, v7, v7
; GFX10-NEXT: v_add3_u32 v7, v51, v7, 0x7fff
-; GFX10-NEXT: v_and_or_b32 v51, v6, s23, 0x400000
+; GFX10-NEXT: v_or_b32_e32 v51, 0x400000, v6
; GFX10-NEXT: v_cmp_u_f32_e64 s20, v6, v6
; GFX10-NEXT: v_add3_u32 v6, v65, v6, 0x7fff
; GFX10-NEXT: v_bfe_u32 v65, v5, 16, 1
; GFX10-NEXT: v_cmp_u_f32_e64 s4, v21, v21
; GFX10-NEXT: v_add3_u32 v26, v26, v21, 0x7fff
-; GFX10-NEXT: v_and_or_b32 v21, v21, s23, 0x400000
+; GFX10-NEXT: v_or_b32_e32 v21, 0x400000, v21
; GFX10-NEXT: v_cmp_u_f32_e64 s5, v4, v4
; GFX10-NEXT: v_add3_u32 v28, v28, v4, 0x7fff
-; GFX10-NEXT: v_and_or_b32 v4, v4, s23, 0x400000
+; GFX10-NEXT: v_or_b32_e32 v4, 0x400000, v4
; GFX10-NEXT: v_cmp_u_f32_e64 s6, v20, v20
; GFX10-NEXT: v_add3_u32 v30, v30, v20, 0x7fff
-; GFX10-NEXT: v_and_or_b32 v20, v20, s23, 0x400000
+; GFX10-NEXT: v_or_b32_e32 v20, 0x400000, v20
; GFX10-NEXT: v_cmp_u_f32_e64 s7, v3, v3
; GFX10-NEXT: v_add3_u32 v36, v36, v3, 0x7fff
-; GFX10-NEXT: v_and_or_b32 v3, v3, s23, 0x400000
+; GFX10-NEXT: v_or_b32_e32 v3, 0x400000, v3
; GFX10-NEXT: v_cndmask_b32_e64 v19, v48, v19, s8
; GFX10-NEXT: v_cndmask_b32_e64 v2, v52, v2, s9
-; GFX10-NEXT: v_and_or_b32 v55, v8, s23, 0x400000
+; GFX10-NEXT: v_or_b32_e32 v55, 0x400000, v8
; GFX10-NEXT: v_cmp_u_f32_e64 s16, v8, v8
; GFX10-NEXT: v_add3_u32 v8, v53, v8, 0x7fff
-; GFX10-NEXT: v_and_or_b32 v53, v23, s23, 0x400000
+; GFX10-NEXT: v_or_b32_e32 v53, 0x400000, v23
; GFX10-NEXT: v_cmp_u_f32_e64 s19, v23, v23
; GFX10-NEXT: v_add3_u32 v23, v38, v23, 0x7fff
; GFX10-NEXT: v_bfe_u32 v38, v22, 16, 1
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
; GFX10-NEXT: v_add3_u32 v65, v65, v5, 0x7fff
-; GFX10-NEXT: v_and_or_b32 v5, v5, s23, 0x400000
+; GFX10-NEXT: v_or_b32_e32 v5, 0x400000, v5
; GFX10-NEXT: v_cndmask_b32_e64 v21, v26, v21, s4
; GFX10-NEXT: v_cndmask_b32_e64 v4, v28, v4, s5
; GFX10-NEXT: v_cndmask_b32_e64 v20, v30, v20, s6
@@ -12334,7 +12180,7 @@ define <32 x bfloat> @v_fadd_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) {
; GFX10-NEXT: v_perm_b32 v2, v2, v19, 0x7060302
; GFX10-NEXT: v_cmp_u_f32_e64 s21, v22, v22
; GFX10-NEXT: v_add3_u32 v38, v38, v22, 0x7fff
-; GFX10-NEXT: v_and_or_b32 v22, v22, s23, 0x400000
+; GFX10-NEXT: v_or_b32_e32 v22, 0x400000, v22
; GFX10-NEXT: v_cndmask_b32_e32 v5, v65, v5, vcc_lo
; GFX10-NEXT: v_perm_b32 v3, v3, v20, 0x7060302
; GFX10-NEXT: v_perm_b32 v4, v4, v21, 0x7060302
@@ -12358,14 +12204,14 @@ define <32 x bfloat> @v_fadd_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) {
; GFX10-NEXT: v_perm_b32 v13, v13, v29, 0x7060302
; GFX10-NEXT: v_perm_b32 v14, v14, v16, 0x7060302
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_lshlrev_b32_e32 v17, 16, v31
-; GFX10-NEXT: v_and_b32_e32 v18, 0xffff0000, v31
-; GFX10-NEXT: v_add_f32_e32 v17, v32, v17
+; GFX10-NEXT: v_lshlrev_b32_e32 v17, 16, v32
+; GFX10-NEXT: v_and_b32_e32 v18, 0xffff0000, v32
+; GFX10-NEXT: v_add_f32_e32 v17, v31, v17
; GFX10-NEXT: v_add_f32_e32 v15, v15, v18
; GFX10-NEXT: v_bfe_u32 v18, v17, 16, 1
; GFX10-NEXT: v_bfe_u32 v19, v15, 16, 1
-; GFX10-NEXT: v_and_or_b32 v20, v17, s23, 0x400000
-; GFX10-NEXT: v_and_or_b32 v21, v15, s23, 0x400000
+; GFX10-NEXT: v_or_b32_e32 v20, 0x400000, v17
+; GFX10-NEXT: v_or_b32_e32 v21, 0x400000, v15
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17
; GFX10-NEXT: v_cmp_u_f32_e64 s4, v15, v15
; GFX10-NEXT: v_add3_u32 v17, v18, v17, 0x7fff
@@ -12378,212 +12224,219 @@ define <32 x bfloat> @v_fadd_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) {
; GFX11-LABEL: v_fadd_v32bf16:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: scratch_load_b32 v31, off, s32
+; GFX11-NEXT: scratch_load_b32 v32, off, s32
+; GFX11-NEXT: v_lshlrev_b32_e32 v67, 16, v21
+; GFX11-NEXT: v_lshlrev_b32_e32 v68, 16, v5
+; GFX11-NEXT: v_and_b32_e32 v21, 0xffff0000, v21
+; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
; GFX11-NEXT: v_lshlrev_b32_e32 v83, 16, v17
; GFX11-NEXT: v_lshlrev_b32_e32 v84, 16, v1
; GFX11-NEXT: v_and_b32_e32 v17, 0xffff0000, v17
; GFX11-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GFX11-NEXT: v_lshlrev_b32_e32 v49, 16, v26
+; GFX11-NEXT: v_dual_add_f32 v5, v5, v21 :: v_dual_and_b32 v26, 0xffff0000, v26
; GFX11-NEXT: v_lshlrev_b32_e32 v53, 16, v24
-; GFX11-NEXT: v_and_b32_e32 v24, 0xffff0000, v24
-; GFX11-NEXT: s_mov_b32 s0, 0xff800000
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3)
-; GFX11-NEXT: v_dual_add_f32 v1, v1, v17 :: v_dual_lshlrev_b32 v64, 16, v7
-; GFX11-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-NEXT: v_dual_add_f32 v1, v1, v17 :: v_dual_and_b32 v24, 0xffff0000, v24
+; GFX11-NEXT: v_lshlrev_b32_e32 v71, 16, v19
+; GFX11-NEXT: v_bfe_u32 v103, v5, 16, 1
+; GFX11-NEXT: v_and_b32_e32 v19, 0xffff0000, v19
; GFX11-NEXT: v_lshlrev_b32_e32 v81, 16, v18
-; GFX11-NEXT: v_lshlrev_b32_e32 v85, 16, v16
-; GFX11-NEXT: v_lshlrev_b32_e32 v86, 16, v0
; GFX11-NEXT: v_bfe_u32 v135, v1, 16, 1
-; GFX11-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
-; GFX11-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GFX11-NEXT: v_lshlrev_b32_e32 v55, 16, v23
-; GFX11-NEXT: v_and_b32_e32 v23, 0xffff0000, v23
-; GFX11-NEXT: v_and_or_b32 v144, v1, s0, 0x400000
+; GFX11-NEXT: v_or_b32_e32 v112, 0x400000, v5
+; GFX11-NEXT: v_or_b32_e32 v144, 0x400000, v1
+; GFX11-NEXT: v_add3_u32 v103, v103, v5, 0x7fff
+; GFX11-NEXT: v_lshlrev_b32_e32 v80, 16, v3
+; GFX11-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
; GFX11-NEXT: v_add3_u32 v135, v135, v1, 0x7fff
; GFX11-NEXT: v_lshlrev_b32_e32 v82, 16, v2
-; GFX11-NEXT: v_lshlrev_b32_e32 v54, 16, v8
-; GFX11-NEXT: v_dual_add_f32 v17, v86, v85 :: v_dual_and_b32 v8, 0xffff0000, v8
-; GFX11-NEXT: v_dual_add_f32 v7, v7, v23 :: v_dual_lshlrev_b32 v36, 16, v13
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_dual_add_f32 v8, v8, v24 :: v_dual_lshlrev_b32 v39, 16, v27
-; GFX11-NEXT: v_dual_add_f32 v0, v0, v16 :: v_dual_lshlrev_b32 v49, 16, v26
-; GFX11-NEXT: v_add_f32_e32 v24, v64, v55
-; GFX11-NEXT: v_bfe_u32 v87, v7, 16, 1
+; GFX11-NEXT: v_lshlrev_b32_e32 v52, 16, v9
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-NEXT: v_dual_add_f32 v3, v3, v19 :: v_dual_lshlrev_b32 v54, 16, v8
+; GFX11-NEXT: v_lshlrev_b32_e32 v85, 16, v16
+; GFX11-NEXT: v_dual_add_f32 v19, v82, v81 :: v_dual_lshlrev_b32 v64, 16, v7
+; GFX11-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
; GFX11-NEXT: v_lshlrev_b32_e32 v65, 16, v22
; GFX11-NEXT: v_lshlrev_b32_e32 v66, 16, v6
-; GFX11-NEXT: v_and_b32_e32 v22, 0xffff0000, v22
-; GFX11-NEXT: v_bfe_u32 v85, v24, 16, 1
-; GFX11-NEXT: v_lshlrev_b32_e32 v67, 16, v21
-; GFX11-NEXT: v_lshlrev_b32_e32 v68, 16, v5
-; GFX11-NEXT: v_and_b32_e32 v21, 0xffff0000, v21
-; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-NEXT: v_bfe_u32 v129, v19, 16, 1
+; GFX11-NEXT: v_or_b32_e32 v130, 0x400000, v19
+; GFX11-NEXT: v_lshlrev_b32_e32 v48, 16, v11
+; GFX11-NEXT: v_bfe_u32 v119, v3, 16, 1
+; GFX11-NEXT: v_lshlrev_b32_e32 v51, 16, v25
+; GFX11-NEXT: v_add3_u32 v129, v129, v19, 0x7fff
+; GFX11-NEXT: v_lshlrev_b32_e32 v86, 16, v0
+; GFX11-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
+; GFX11-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX11-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT: v_dual_add_f32 v17, v86, v85 :: v_dual_and_b32 v2, 0xffff0000, v2
+; GFX11-NEXT: v_dual_add_f32 v8, v8, v24 :: v_dual_lshlrev_b32 v39, 16, v27
+; GFX11-NEXT: v_or_b32_e32 v128, 0x400000, v3
+; GFX11-NEXT: v_add3_u32 v119, v119, v3, 0x7fff
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-NEXT: v_bfe_u32 v145, v17, 16, 1
+; GFX11-NEXT: v_or_b32_e32 v146, 0x400000, v17
+; GFX11-NEXT: v_and_b32_e32 v18, 0xffff0000, v18
+; GFX11-NEXT: v_and_b32_e32 v25, 0xffff0000, v25
; GFX11-NEXT: v_lshlrev_b32_e32 v70, 16, v4
-; GFX11-NEXT: v_and_or_b32 v86, v24, s0, 0x400000
-; GFX11-NEXT: v_and_or_b32 v96, v7, s0, 0x400000
+; GFX11-NEXT: v_add3_u32 v145, v145, v17, 0x7fff
+; GFX11-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
+; GFX11-NEXT: v_lshlrev_b32_e32 v55, 16, v23
+; GFX11-NEXT: v_and_b32_e32 v23, 0xffff0000, v23
+; GFX11-NEXT: v_lshlrev_b32_e32 v50, 16, v10
+; GFX11-NEXT: v_add_f32_e32 v2, v2, v18
+; GFX11-NEXT: v_add_f32_e32 v0, v0, v16
+; GFX11-NEXT: v_dual_add_f32 v24, v64, v55 :: v_dual_lshlrev_b32 v37, 16, v28
+; GFX11-NEXT: v_add_f32_e32 v7, v7, v23
+; GFX11-NEXT: v_dual_add_f32 v23, v66, v65 :: v_dual_add_f32 v18, v84, v83
+; GFX11-NEXT: v_dual_add_f32 v9, v9, v25 :: v_dual_and_b32 v28, 0xffff0000, v28
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT: v_bfe_u32 v85, v24, 16, 1
+; GFX11-NEXT: v_bfe_u32 v97, v23, 16, 1
+; GFX11-NEXT: v_or_b32_e32 v86, 0x400000, v24
+; GFX11-NEXT: v_or_b32_e32 v98, 0x400000, v23
+; GFX11-NEXT: v_bfe_u32 v87, v7, 16, 1
; GFX11-NEXT: v_add3_u32 v85, v85, v24, 0x7fff
; GFX11-NEXT: v_lshlrev_b32_e32 v69, 16, v20
-; GFX11-NEXT: v_add3_u32 v87, v87, v7, 0x7fff
-; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
-; GFX11-NEXT: v_dual_add_f32 v23, v66, v65 :: v_dual_lshlrev_b32 v48, 16, v11
-; GFX11-NEXT: v_and_b32_e32 v27, 0xffff0000, v27
-; GFX11-NEXT: v_dual_add_f32 v5, v5, v21 :: v_dual_lshlrev_b32 v50, 16, v10
-; GFX11-NEXT: v_dual_add_f32 v21, v70, v69 :: v_dual_and_b32 v26, 0xffff0000, v26
-; GFX11-NEXT: v_and_b32_e32 v18, 0xffff0000, v18
-; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
-; GFX11-NEXT: v_add_f32_e32 v6, v6, v22
-; GFX11-NEXT: v_lshlrev_b32_e32 v52, 16, v9
-; GFX11-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
-; GFX11-NEXT: v_bfe_u32 v97, v23, 16, 1
-; GFX11-NEXT: v_add_f32_e32 v2, v2, v18
-; GFX11-NEXT: v_add_f32_e32 v18, v84, v83
-; GFX11-NEXT: v_bfe_u32 v83, v8, 16, 1
-; GFX11-NEXT: v_bfe_u32 v99, v6, 16, 1
-; GFX11-NEXT: v_bfe_u32 v103, v5, 16, 1
-; GFX11-NEXT: v_bfe_u32 v113, v21, 16, 1
-; GFX11-NEXT: v_lshlrev_b32_e32 v71, 16, v19
-; GFX11-NEXT: v_and_or_b32 v84, v8, s0, 0x400000
-; GFX11-NEXT: v_and_or_b32 v98, v23, s0, 0x400000
-; GFX11-NEXT: v_and_or_b32 v100, v6, s0, 0x400000
-; GFX11-NEXT: v_and_or_b32 v112, v5, s0, 0x400000
-; GFX11-NEXT: v_and_or_b32 v114, v21, s0, 0x400000
-; GFX11-NEXT: v_add3_u32 v83, v83, v8, 0x7fff
-; GFX11-NEXT: v_and_b32_e32 v19, 0xffff0000, v19
-; GFX11-NEXT: v_add3_u32 v97, v97, v23, 0x7fff
; GFX11-NEXT: v_and_b32_e32 v20, 0xffff0000, v20
-; GFX11-NEXT: v_add3_u32 v99, v99, v6, 0x7fff
-; GFX11-NEXT: v_add3_u32 v103, v103, v5, 0x7fff
-; GFX11-NEXT: v_lshlrev_b32_e32 v80, 16, v3
-; GFX11-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
-; GFX11-NEXT: v_add3_u32 v113, v113, v21, 0x7fff
+; GFX11-NEXT: v_add3_u32 v97, v97, v23, 0x7fff
; GFX11-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
-; GFX11-NEXT: v_lshlrev_b32_e32 v38, 16, v12
; GFX11-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
-; GFX11-NEXT: v_dual_add_f32 v3, v3, v19 :: v_dual_and_b32 v10, 0xffff0000, v10
-; GFX11-NEXT: v_dual_add_f32 v22, v68, v67 :: v_dual_lshlrev_b32 v51, 16, v25
-; GFX11-NEXT: v_lshlrev_b32_e32 v37, 16, v28
-; GFX11-NEXT: v_dual_add_f32 v4, v4, v20 :: v_dual_and_b32 v25, 0xffff0000, v25
+; GFX11-NEXT: v_or_b32_e32 v96, 0x400000, v7
+; GFX11-NEXT: v_add3_u32 v87, v87, v7, 0x7fff
+; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
+; GFX11-NEXT: v_add_f32_e32 v4, v4, v20
; GFX11-NEXT: v_add_f32_e32 v20, v80, v71
-; GFX11-NEXT: v_dual_add_f32 v19, v82, v81 :: v_dual_and_b32 v28, 0xffff0000, v28
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3)
-; GFX11-NEXT: v_dual_add_f32 v9, v9, v25 :: v_dual_and_b32 v12, 0xffff0000, v12
-; GFX11-NEXT: v_add_f32_e32 v25, v54, v53
+; GFX11-NEXT: v_bfe_u32 v71, v9, 16, 1
+; GFX11-NEXT: v_or_b32_e32 v80, 0x400000, v9
; GFX11-NEXT: v_lshlrev_b32_e32 v35, 16, v29
-; GFX11-NEXT: v_and_b32_e32 v29, 0xffff0000, v29
-; GFX11-NEXT: v_dual_add_f32 v10, v10, v26 :: v_dual_and_b32 v13, 0xffff0000, v13
-; GFX11-NEXT: v_dual_add_f32 v12, v12, v28 :: v_dual_lshlrev_b32 v33, 16, v30
-; GFX11-NEXT: v_add_f32_e32 v28, v48, v39
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3)
-; GFX11-NEXT: v_dual_add_f32 v13, v13, v29 :: v_dual_lshlrev_b32 v34, 16, v14
-; GFX11-NEXT: v_dual_add_f32 v11, v11, v27 :: v_dual_and_b32 v14, 0xffff0000, v14
-; GFX11-NEXT: v_dual_add_f32 v27, v50, v49 :: v_dual_add_f32 v26, v52, v51
-; GFX11-NEXT: v_dual_add_f32 v29, v38, v37 :: v_dual_and_b32 v30, 0xffff0000, v30
-; GFX11-NEXT: v_lshlrev_b32_e32 v32, 16, v15
-; GFX11-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
-; GFX11-NEXT: v_bfe_u32 v39, v13, 16, 1
+; GFX11-NEXT: v_dual_add_f32 v21, v70, v69 :: v_dual_and_b32 v10, 0xffff0000, v10
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-NEXT: v_add3_u32 v71, v71, v9, 0x7fff
+; GFX11-NEXT: v_and_b32_e32 v22, 0xffff0000, v22
+; GFX11-NEXT: v_dual_add_f32 v10, v10, v26 :: v_dual_and_b32 v29, 0xffff0000, v29
+; GFX11-NEXT: v_and_b32_e32 v27, 0xffff0000, v27
+; GFX11-NEXT: v_add_f32_e32 v26, v52, v51
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-NEXT: v_add_f32_e32 v6, v6, v22
+; GFX11-NEXT: v_lshlrev_b32_e32 v36, 16, v13
+; GFX11-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
+; GFX11-NEXT: v_dual_add_f32 v11, v11, v27 :: v_dual_lshlrev_b32 v34, 16, v14
+; GFX11-NEXT: v_dual_add_f32 v22, v68, v67 :: v_dual_lshlrev_b32 v33, 16, v30
+; GFX11-NEXT: v_dual_add_f32 v27, v50, v49 :: v_dual_lshlrev_b32 v38, 16, v12
+; GFX11-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
+; GFX11-NEXT: v_dual_add_f32 v25, v54, v53 :: v_dual_and_b32 v12, 0xffff0000, v12
+; GFX11-NEXT: v_dual_add_f32 v13, v13, v29 :: v_dual_and_b32 v30, 0xffff0000, v30
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-NEXT: v_add_f32_e32 v29, v38, v37
+; GFX11-NEXT: v_lshlrev_b32_e32 v31, 16, v15
+; GFX11-NEXT: v_dual_add_f32 v12, v12, v28 :: v_dual_and_b32 v15, 0xffff0000, v15
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_4)
; GFX11-NEXT: v_add_f32_e32 v14, v14, v30
+; GFX11-NEXT: v_add_f32_e32 v28, v48, v39
; GFX11-NEXT: v_dual_add_f32 v30, v36, v35 :: v_dual_add_f32 v33, v34, v33
-; GFX11-NEXT: v_and_or_b32 v48, v13, s0, 0x400000
-; GFX11-NEXT: v_bfe_u32 v49, v29, 16, 1
+; GFX11-NEXT: v_bfe_u32 v39, v13, 16, 1
; GFX11-NEXT: v_bfe_u32 v35, v14, 16, 1
-; GFX11-NEXT: v_and_or_b32 v36, v14, s0, 0x400000
+; GFX11-NEXT: v_or_b32_e32 v36, 0x400000, v14
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-NEXT: v_bfe_u32 v37, v30, 16, 1
; GFX11-NEXT: v_bfe_u32 v16, v33, 16, 1
-; GFX11-NEXT: v_and_or_b32 v34, v33, s0, 0x400000
+; GFX11-NEXT: v_or_b32_e32 v34, 0x400000, v33
; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v33, v33
-; GFX11-NEXT: v_bfe_u32 v37, v30, 16, 1
; GFX11-NEXT: v_add3_u32 v35, v35, v14, 0x7fff
+; GFX11-NEXT: v_or_b32_e32 v38, 0x400000, v30
; GFX11-NEXT: v_add3_u32 v16, v16, v33, 0x7fff
-; GFX11-NEXT: v_and_or_b32 v38, v30, s0, 0x400000
-; GFX11-NEXT: v_add3_u32 v39, v39, v13, 0x7fff
; GFX11-NEXT: v_add3_u32 v37, v37, v30, 0x7fff
-; GFX11-NEXT: v_and_or_b32 v50, v29, s0, 0x400000
+; GFX11-NEXT: v_or_b32_e32 v48, 0x400000, v13
+; GFX11-NEXT: v_bfe_u32 v49, v29, 16, 1
+; GFX11-NEXT: v_add3_u32 v39, v39, v13, 0x7fff
; GFX11-NEXT: v_cndmask_b32_e32 v16, v16, v34, vcc_lo
; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v14, v14
+; GFX11-NEXT: v_or_b32_e32 v50, 0x400000, v29
; GFX11-NEXT: v_bfe_u32 v51, v12, 16, 1
; GFX11-NEXT: v_add3_u32 v49, v49, v29, 0x7fff
-; GFX11-NEXT: v_and_or_b32 v52, v12, s0, 0x400000
-; GFX11-NEXT: v_bfe_u32 v53, v28, 16, 1
+; GFX11-NEXT: v_or_b32_e32 v52, 0x400000, v12
; GFX11-NEXT: v_cndmask_b32_e32 v14, v35, v36, vcc_lo
; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v30, v30
+; GFX11-NEXT: v_bfe_u32 v53, v28, 16, 1
; GFX11-NEXT: v_add3_u32 v51, v51, v12, 0x7fff
-; GFX11-NEXT: v_and_or_b32 v54, v28, s0, 0x400000
+; GFX11-NEXT: v_or_b32_e32 v54, 0x400000, v28
; GFX11-NEXT: v_bfe_u32 v55, v11, 16, 1
-; GFX11-NEXT: v_add3_u32 v53, v53, v28, 0x7fff
; GFX11-NEXT: v_cndmask_b32_e32 v30, v37, v38, vcc_lo
; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v13, v13
-; GFX11-NEXT: v_and_or_b32 v64, v11, s0, 0x400000
+; GFX11-NEXT: v_add3_u32 v53, v53, v28, 0x7fff
+; GFX11-NEXT: v_or_b32_e32 v64, 0x400000, v11
; GFX11-NEXT: v_bfe_u32 v65, v27, 16, 1
; GFX11-NEXT: v_add3_u32 v55, v55, v11, 0x7fff
-; GFX11-NEXT: v_and_or_b32 v66, v27, s0, 0x400000
; GFX11-NEXT: v_cndmask_b32_e32 v13, v39, v48, vcc_lo
; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v29, v29
+; GFX11-NEXT: v_or_b32_e32 v66, 0x400000, v27
; GFX11-NEXT: v_bfe_u32 v67, v10, 16, 1
; GFX11-NEXT: v_add3_u32 v65, v65, v27, 0x7fff
-; GFX11-NEXT: v_and_or_b32 v68, v10, s0, 0x400000
-; GFX11-NEXT: v_bfe_u32 v69, v26, 16, 1
+; GFX11-NEXT: v_or_b32_e32 v68, 0x400000, v10
; GFX11-NEXT: v_cndmask_b32_e32 v29, v49, v50, vcc_lo
; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v12, v12
+; GFX11-NEXT: v_bfe_u32 v69, v26, 16, 1
; GFX11-NEXT: v_add3_u32 v67, v67, v10, 0x7fff
-; GFX11-NEXT: v_and_or_b32 v70, v26, s0, 0x400000
-; GFX11-NEXT: v_bfe_u32 v71, v9, 16, 1
-; GFX11-NEXT: v_add3_u32 v69, v69, v26, 0x7fff
+; GFX11-NEXT: v_or_b32_e32 v70, 0x400000, v26
+; GFX11-NEXT: v_bfe_u32 v81, v25, 16, 1
; GFX11-NEXT: v_cndmask_b32_e32 v12, v51, v52, vcc_lo
; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v28, v28
-; GFX11-NEXT: v_and_or_b32 v80, v9, s0, 0x400000
-; GFX11-NEXT: v_bfe_u32 v81, v25, 16, 1
-; GFX11-NEXT: v_add3_u32 v71, v71, v9, 0x7fff
-; GFX11-NEXT: v_and_or_b32 v82, v25, s0, 0x400000
+; GFX11-NEXT: v_add3_u32 v69, v69, v26, 0x7fff
+; GFX11-NEXT: v_or_b32_e32 v82, 0x400000, v25
+; GFX11-NEXT: v_bfe_u32 v83, v8, 16, 1
+; GFX11-NEXT: v_add3_u32 v81, v81, v25, 0x7fff
; GFX11-NEXT: v_cndmask_b32_e32 v28, v53, v54, vcc_lo
; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v11, v11
-; GFX11-NEXT: v_add3_u32 v81, v81, v25, 0x7fff
-; GFX11-NEXT: v_bfe_u32 v101, v22, 16, 1
-; GFX11-NEXT: v_and_or_b32 v102, v22, s0, 0x400000
-; GFX11-NEXT: v_bfe_u32 v115, v4, 16, 1
+; GFX11-NEXT: v_or_b32_e32 v84, 0x400000, v8
+; GFX11-NEXT: v_add3_u32 v83, v83, v8, 0x7fff
+; GFX11-NEXT: v_bfe_u32 v99, v6, 16, 1
+; GFX11-NEXT: v_or_b32_e32 v100, 0x400000, v6
; GFX11-NEXT: v_cndmask_b32_e32 v11, v55, v64, vcc_lo
; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v27, v27
-; GFX11-NEXT: v_add3_u32 v101, v101, v22, 0x7fff
-; GFX11-NEXT: v_and_or_b32 v116, v4, s0, 0x400000
-; GFX11-NEXT: v_bfe_u32 v117, v20, 16, 1
-; GFX11-NEXT: v_add3_u32 v115, v115, v4, 0x7fff
+; GFX11-NEXT: v_bfe_u32 v101, v22, 16, 1
+; GFX11-NEXT: v_add3_u32 v99, v99, v6, 0x7fff
+; GFX11-NEXT: v_or_b32_e32 v102, 0x400000, v22
+; GFX11-NEXT: v_bfe_u32 v113, v21, 16, 1
; GFX11-NEXT: v_cndmask_b32_e32 v27, v65, v66, vcc_lo
; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v10, v10
-; GFX11-NEXT: v_and_or_b32 v118, v20, s0, 0x400000
-; GFX11-NEXT: v_bfe_u32 v129, v19, 16, 1
-; GFX11-NEXT: v_add3_u32 v117, v117, v20, 0x7fff
-; GFX11-NEXT: v_and_or_b32 v130, v19, s0, 0x400000
+; GFX11-NEXT: v_add3_u32 v101, v101, v22, 0x7fff
+; GFX11-NEXT: v_or_b32_e32 v114, 0x400000, v21
+; GFX11-NEXT: v_bfe_u32 v115, v4, 16, 1
+; GFX11-NEXT: v_add3_u32 v113, v113, v21, 0x7fff
; GFX11-NEXT: v_cndmask_b32_e32 v10, v67, v68, vcc_lo
; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v26, v26
-; GFX11-NEXT: v_bfe_u32 v133, v18, 16, 1
-; GFX11-NEXT: v_add3_u32 v129, v129, v19, 0x7fff
-; GFX11-NEXT: v_and_or_b32 v134, v18, s0, 0x400000
-; GFX11-NEXT: v_bfe_u32 v145, v17, 16, 1
+; GFX11-NEXT: v_or_b32_e32 v116, 0x400000, v4
+; GFX11-NEXT: v_bfe_u32 v117, v20, 16, 1
+; GFX11-NEXT: v_add3_u32 v115, v115, v4, 0x7fff
+; GFX11-NEXT: v_or_b32_e32 v118, 0x400000, v20
; GFX11-NEXT: v_cndmask_b32_e32 v26, v69, v70, vcc_lo
; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9
-; GFX11-NEXT: v_add3_u32 v133, v133, v18, 0x7fff
-; GFX11-NEXT: v_and_or_b32 v146, v17, s0, 0x400000
+; GFX11-NEXT: v_add3_u32 v117, v117, v20, 0x7fff
+; GFX11-NEXT: v_bfe_u32 v133, v18, 16, 1
+; GFX11-NEXT: v_or_b32_e32 v134, 0x400000, v18
; GFX11-NEXT: v_bfe_u32 v147, v0, 16, 1
-; GFX11-NEXT: v_add3_u32 v145, v145, v17, 0x7fff
; GFX11-NEXT: v_cndmask_b32_e32 v9, v71, v80, vcc_lo
; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v25, v25
-; GFX11-NEXT: v_bfe_u32 v131, v2, 16, 1
-; GFX11-NEXT: v_and_or_b32 v33, v0, s0, 0x400000
+; GFX11-NEXT: v_add3_u32 v133, v133, v18, 0x7fff
+; GFX11-NEXT: v_or_b32_e32 v33, 0x400000, v0
; GFX11-NEXT: v_add3_u32 v147, v147, v0, 0x7fff
-; GFX11-NEXT: v_and_or_b32 v132, v2, s0, 0x400000
+; GFX11-NEXT: v_bfe_u32 v131, v2, 16, 1
; GFX11-NEXT: v_cndmask_b32_e32 v25, v81, v82, vcc_lo
; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8
-; GFX11-NEXT: v_add3_u32 v131, v131, v2, 0x7fff
-; GFX11-NEXT: v_bfe_u32 v119, v3, 16, 1
-; GFX11-NEXT: v_and_or_b32 v128, v3, s0, 0x400000
+; GFX11-NEXT: v_or_b32_e32 v132, 0x400000, v2
; GFX11-NEXT: v_perm_b32 v9, v9, v26, 0x7060302
+; GFX11-NEXT: v_add3_u32 v131, v131, v2, 0x7fff
+; GFX11-NEXT: v_perm_b32 v10, v10, v27, 0x7060302
; GFX11-NEXT: v_cndmask_b32_e32 v8, v83, v84, vcc_lo
; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v24, v24
-; GFX11-NEXT: v_add3_u32 v119, v119, v3, 0x7fff
-; GFX11-NEXT: v_perm_b32 v10, v10, v27, 0x7060302
; GFX11-NEXT: v_perm_b32 v11, v11, v28, 0x7060302
+; GFX11-NEXT: v_perm_b32 v12, v12, v29, 0x7060302
+; GFX11-NEXT: v_perm_b32 v13, v13, v30, 0x7060302
; GFX11-NEXT: v_perm_b32 v8, v8, v25, 0x7060302
; GFX11-NEXT: v_cndmask_b32_e32 v24, v85, v86, vcc_lo
; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
-; GFX11-NEXT: v_perm_b32 v12, v12, v29, 0x7060302
-; GFX11-NEXT: v_perm_b32 v13, v13, v30, 0x7060302
; GFX11-NEXT: v_perm_b32 v14, v14, v16, 0x7060302
; GFX11-NEXT: v_cndmask_b32_e32 v7, v87, v96, vcc_lo
; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v23, v23
@@ -12622,22 +12475,21 @@ define <32 x bfloat> @v_fadd_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) {
; GFX11-NEXT: v_perm_b32 v0, v0, v17, 0x7060302
; GFX11-NEXT: v_cndmask_b32_e32 v2, v131, v132, vcc_lo
; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_perm_b32 v2, v2, v19, 0x7060302
; GFX11-NEXT: v_cndmask_b32_e32 v3, v119, v128, vcc_lo
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_3)
; GFX11-NEXT: v_perm_b32 v3, v3, v20, 0x7060302
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_lshlrev_b32_e32 v17, 16, v31
-; GFX11-NEXT: v_and_b32_e32 v18, 0xffff0000, v31
-; GFX11-NEXT: v_perm_b32 v2, v2, v19, 0x7060302
-; GFX11-NEXT: v_add_f32_e32 v17, v32, v17
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_lshlrev_b32_e32 v17, 16, v32
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_dual_add_f32 v17, v31, v17 :: v_dual_and_b32 v18, 0xffff0000, v32
; GFX11-NEXT: v_add_f32_e32 v15, v15, v18
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_bfe_u32 v18, v17, 16, 1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX11-NEXT: v_bfe_u32 v19, v15, 16, 1
-; GFX11-NEXT: v_and_or_b32 v20, v17, s0, 0x400000
+; GFX11-NEXT: v_or_b32_e32 v20, 0x400000, v17
; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17
-; GFX11-NEXT: v_and_or_b32 v21, v15, s0, 0x400000
+; GFX11-NEXT: v_or_b32_e32 v21, 0x400000, v15
; GFX11-NEXT: v_add3_u32 v18, v18, v17, 0x7fff
; GFX11-NEXT: v_add3_u32 v19, v19, v15, 0x7fff
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
@@ -12678,8 +12530,7 @@ define bfloat @v_fadd_bf16_fpimm_0(bfloat %arg0) {
; GFX8-NEXT: v_bfe_u32 v1, v0, 16, 1
; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v0
; GFX8-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1
-; GFX8-NEXT: v_and_b32_e32 v2, 0xff800000, v0
-; GFX8-NEXT: v_or_b32_e32 v2, 0x400000, v2
+; GFX8-NEXT: v_or_b32_e32 v2, 0x400000, v0
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
; GFX8-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
@@ -12692,9 +12543,8 @@ define bfloat @v_fadd_bf16_fpimm_0(bfloat %arg0) {
; GFX9-NEXT: v_add_f32_e32 v0, 1.0, v0
; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1
; GFX9-NEXT: s_movk_i32 s4, 0x7fff
-; GFX9-NEXT: v_and_b32_e32 v2, 0xff800000, v0
; GFX9-NEXT: v_add3_u32 v1, v1, v0, s4
-; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v2
+; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v0
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0
@@ -12704,10 +12554,9 @@ define bfloat @v_fadd_bf16_fpimm_0(bfloat %arg0) {
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GFX10-NEXT: s_mov_b32 s4, 0xff800000
; GFX10-NEXT: v_add_f32_e32 v0, 1.0, v0
; GFX10-NEXT: v_bfe_u32 v1, v0, 16, 1
-; GFX10-NEXT: v_and_or_b32 v2, v0, s4, 0x400000
+; GFX10-NEXT: v_or_b32_e32 v2, 0x400000, v0
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
; GFX10-NEXT: v_add3_u32 v1, v1, v0, 0x7fff
; GFX10-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo
@@ -12718,11 +12567,10 @@ define bfloat @v_fadd_bf16_fpimm_0(bfloat %arg0) {
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GFX11-NEXT: s_mov_b32 s0, 0xff800000
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_add_f32_e32 v0, 1.0, v0
; GFX11-NEXT: v_bfe_u32 v1, v0, 16, 1
-; GFX11-NEXT: v_and_or_b32 v2, v0, s0, 0x400000
+; GFX11-NEXT: v_or_b32_e32 v2, 0x400000, v0
; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_add3_u32 v1, v1, v0, 0x7fff
@@ -12761,8 +12609,7 @@ define bfloat @v_fadd_bf16_fpimm_1(bfloat %arg0) {
; GFX8-NEXT: v_bfe_u32 v1, v0, 16, 1
; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v0
; GFX8-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1
-; GFX8-NEXT: v_and_b32_e32 v2, 0xff800000, v0
-; GFX8-NEXT: v_or_b32_e32 v2, 0x400000, v2
+; GFX8-NEXT: v_or_b32_e32 v2, 0x400000, v0
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
; GFX8-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
@@ -12775,9 +12622,8 @@ define bfloat @v_fadd_bf16_fpimm_1(bfloat %arg0) {
; GFX9-NEXT: v_add_f32_e32 v0, 0x42280000, v0
; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1
; GFX9-NEXT: s_movk_i32 s4, 0x7fff
-; GFX9-NEXT: v_and_b32_e32 v2, 0xff800000, v0
; GFX9-NEXT: v_add3_u32 v1, v1, v0, s4
-; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v2
+; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v0
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0
@@ -12787,10 +12633,9 @@ define bfloat @v_fadd_bf16_fpimm_1(bfloat %arg0) {
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GFX10-NEXT: s_mov_b32 s4, 0xff800000
; GFX10-NEXT: v_add_f32_e32 v0, 0x42280000, v0
; GFX10-NEXT: v_bfe_u32 v1, v0, 16, 1
-; GFX10-NEXT: v_and_or_b32 v2, v0, s4, 0x400000
+; GFX10-NEXT: v_or_b32_e32 v2, 0x400000, v0
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
; GFX10-NEXT: v_add3_u32 v1, v1, v0, 0x7fff
; GFX10-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo
@@ -12801,11 +12646,10 @@ define bfloat @v_fadd_bf16_fpimm_1(bfloat %arg0) {
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GFX11-NEXT: s_mov_b32 s0, 0xff800000
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_add_f32_e32 v0, 0x42280000, v0
; GFX11-NEXT: v_bfe_u32 v1, v0, 16, 1
-; GFX11-NEXT: v_and_or_b32 v2, v0, s0, 0x400000
+; GFX11-NEXT: v_or_b32_e32 v2, 0x400000, v0
; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_add3_u32 v1, v1, v0, 0x7fff
@@ -12849,8 +12693,7 @@ define bfloat @v_fsub_bf16(bfloat %a, bfloat %b) {
; GFX8-NEXT: v_bfe_u32 v1, v0, 16, 1
; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v0
; GFX8-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1
-; GFX8-NEXT: v_and_b32_e32 v2, 0xff800000, v0
-; GFX8-NEXT: v_or_b32_e32 v2, 0x400000, v2
+; GFX8-NEXT: v_or_b32_e32 v2, 0x400000, v0
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
; GFX8-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
@@ -12864,9 +12707,8 @@ define bfloat @v_fsub_bf16(bfloat %a, bfloat %b) {
; GFX9-NEXT: v_sub_f32_e32 v0, v0, v1
; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1
; GFX9-NEXT: s_movk_i32 s4, 0x7fff
-; GFX9-NEXT: v_and_b32_e32 v2, 0xff800000, v0
; GFX9-NEXT: v_add3_u32 v1, v1, v0, s4
-; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v2
+; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v0
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0
@@ -12877,10 +12719,9 @@ define bfloat @v_fsub_bf16(bfloat %a, bfloat %b) {
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GFX10-NEXT: s_mov_b32 s4, 0xff800000
; GFX10-NEXT: v_sub_f32_e32 v0, v0, v1
; GFX10-NEXT: v_bfe_u32 v1, v0, 16, 1
-; GFX10-NEXT: v_and_or_b32 v2, v0, s4, 0x400000
+; GFX10-NEXT: v_or_b32_e32 v2, 0x400000, v0
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
; GFX10-NEXT: v_add3_u32 v1, v1, v0, 0x7fff
; GFX10-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo
@@ -12892,11 +12733,10 @@ define bfloat @v_fsub_bf16(bfloat %a, bfloat %b) {
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GFX11-NEXT: s_mov_b32 s0, 0xff800000
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_sub_f32_e32 v0, v0, v1
; GFX11-NEXT: v_bfe_u32 v1, v0, 16, 1
-; GFX11-NEXT: v_and_or_b32 v2, v0, s0, 0x400000
+; GFX11-NEXT: v_or_b32_e32 v2, 0x400000, v0
; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_add3_u32 v1, v1, v0, 0x7fff
@@ -12954,16 +12794,14 @@ define <2 x bfloat> @v_fsub_v2bf16(<2 x bfloat> %a, <2 x bfloat> %b) {
; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
-; GFX8-NEXT: v_and_b32_e32 v4, 0xff800000, v2
; GFX8-NEXT: v_sub_f32_e32 v0, v0, v1
-; GFX8-NEXT: v_or_b32_e32 v4, 0x400000, v4
+; GFX8-NEXT: v_or_b32_e32 v4, 0x400000, v2
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
; GFX8-NEXT: v_bfe_u32 v1, v0, 16, 1
; GFX8-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v0
; GFX8-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1
-; GFX8-NEXT: v_and_b32_e32 v3, 0xff800000, v0
-; GFX8-NEXT: v_or_b32_e32 v3, 0x400000, v3
+; GFX8-NEXT: v_or_b32_e32 v3, 0x400000, v0
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
; GFX8-NEXT: v_cndmask_b32_e32 v0, v1, v3, vcc
; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
@@ -12976,20 +12814,18 @@ define <2 x bfloat> @v_fsub_v2bf16(<2 x bfloat> %a, <2 x bfloat> %b) {
; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v1
; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v0
; GFX9-NEXT: v_sub_f32_e32 v2, v3, v2
-; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1
-; GFX9-NEXT: s_movk_i32 s4, 0x7fff
-; GFX9-NEXT: v_and_b32_e32 v4, 0xff800000, v2
; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1
+; GFX9-NEXT: s_movk_i32 s4, 0x7fff
+; GFX9-NEXT: v_sub_f32_e32 v0, v0, v1
; GFX9-NEXT: v_add3_u32 v3, v3, v2, s4
-; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v4
+; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
-; GFX9-NEXT: v_sub_f32_e32 v0, v0, v1
-; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1
-; GFX9-NEXT: v_and_b32_e32 v3, 0xff800000, v0
+; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
; GFX9-NEXT: v_add3_u32 v1, v1, v0, s4
-; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v3
+; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v0
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v3, vcc
; GFX9-NEXT: s_mov_b32 s4, 0x7060302
@@ -13003,14 +12839,13 @@ define <2 x bfloat> @v_fsub_v2bf16(<2 x bfloat> %a, <2 x bfloat> %b) {
; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v0
; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GFX10-NEXT: s_mov_b32 s4, 0xff800000
; GFX10-NEXT: v_sub_f32_e32 v2, v3, v2
; GFX10-NEXT: v_sub_f32_e32 v0, v0, v1
; GFX10-NEXT: v_bfe_u32 v1, v2, 16, 1
-; GFX10-NEXT: v_and_or_b32 v4, v2, s4, 0x400000
+; GFX10-NEXT: v_or_b32_e32 v4, 0x400000, v2
; GFX10-NEXT: v_bfe_u32 v3, v0, 16, 1
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
-; GFX10-NEXT: v_and_or_b32 v5, v0, s4, 0x400000
+; GFX10-NEXT: v_or_b32_e32 v5, 0x400000, v0
; GFX10-NEXT: v_add3_u32 v1, v1, v2, 0x7fff
; GFX10-NEXT: v_add3_u32 v3, v3, v0, 0x7fff
; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc_lo
@@ -13026,16 +12861,15 @@ define <2 x bfloat> @v_fsub_v2bf16(<2 x bfloat> %a, <2 x bfloat> %b) {
; GFX11-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v0
; GFX11-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GFX11-NEXT: s_mov_b32 s0, 0xff800000
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX11-NEXT: v_sub_f32_e32 v0, v0, v1
; GFX11-NEXT: v_sub_f32_e32 v2, v3, v2
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_bfe_u32 v3, v0, 16, 1
; GFX11-NEXT: v_bfe_u32 v1, v2, 16, 1
-; GFX11-NEXT: v_and_or_b32 v4, v2, s0, 0x400000
+; GFX11-NEXT: v_or_b32_e32 v4, 0x400000, v2
; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
-; GFX11-NEXT: v_and_or_b32 v5, v0, s0, 0x400000
+; GFX11-NEXT: v_or_b32_e32 v5, 0x400000, v0
; GFX11-NEXT: v_add3_u32 v3, v3, v0, 0x7fff
; GFX11-NEXT: v_add3_u32 v1, v1, v2, 0x7fff
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_4)
@@ -13105,8 +12939,7 @@ define <3 x bfloat> @v_fsub_v3bf16(<3 x bfloat> %a, <3 x bfloat> %b) {
; GFX8-NEXT: v_bfe_u32 v3, v1, 16, 1
; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v1
; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
-; GFX8-NEXT: v_and_b32_e32 v4, 0xff800000, v1
-; GFX8-NEXT: v_or_b32_e32 v4, 0x400000, v4
+; GFX8-NEXT: v_or_b32_e32 v4, 0x400000, v1
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
; GFX8-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc
; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v2
@@ -13118,16 +12951,14 @@ define <3 x bfloat> @v_fsub_v3bf16(<3 x bfloat> %a, <3 x bfloat> %b) {
; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX8-NEXT: v_add_u32_e32 v4, vcc, s4, v4
-; GFX8-NEXT: v_and_b32_e32 v5, 0xff800000, v3
; GFX8-NEXT: v_sub_f32_e32 v0, v0, v2
-; GFX8-NEXT: v_or_b32_e32 v5, 0x400000, v5
+; GFX8-NEXT: v_or_b32_e32 v5, 0x400000, v3
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
; GFX8-NEXT: v_bfe_u32 v2, v0, 16, 1
; GFX8-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc
; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v0
; GFX8-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2
-; GFX8-NEXT: v_and_b32_e32 v4, 0xff800000, v0
-; GFX8-NEXT: v_or_b32_e32 v4, 0x400000, v4
+; GFX8-NEXT: v_or_b32_e32 v4, 0x400000, v0
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc
; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
@@ -13143,27 +12974,24 @@ define <3 x bfloat> @v_fsub_v3bf16(<3 x bfloat> %a, <3 x bfloat> %b) {
; GFX9-NEXT: v_sub_f32_e32 v1, v1, v3
; GFX9-NEXT: v_bfe_u32 v3, v1, 16, 1
; GFX9-NEXT: s_movk_i32 s4, 0x7fff
-; GFX9-NEXT: v_and_b32_e32 v4, 0xff800000, v1
; GFX9-NEXT: v_add3_u32 v3, v3, v1, s4
-; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v4
+; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v1
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc
; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v2
; GFX9-NEXT: v_lshlrev_b32_e32 v4, 16, v0
; GFX9-NEXT: v_sub_f32_e32 v3, v4, v3
-; GFX9-NEXT: v_bfe_u32 v4, v3, 16, 1
-; GFX9-NEXT: v_and_b32_e32 v5, 0xff800000, v3
; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX9-NEXT: v_bfe_u32 v4, v3, 16, 1
+; GFX9-NEXT: v_sub_f32_e32 v0, v0, v2
; GFX9-NEXT: v_add3_u32 v4, v4, v3, s4
-; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v5
+; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v3
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
-; GFX9-NEXT: v_sub_f32_e32 v0, v0, v2
-; GFX9-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc
; GFX9-NEXT: v_bfe_u32 v2, v0, 16, 1
-; GFX9-NEXT: v_and_b32_e32 v4, 0xff800000, v0
+; GFX9-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc
; GFX9-NEXT: v_add3_u32 v2, v2, v0, s4
-; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v4
+; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v0
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc
; GFX9-NEXT: s_mov_b32 s4, 0x7060302
@@ -13181,18 +13009,17 @@ define <3 x bfloat> @v_fsub_v3bf16(<3 x bfloat> %a, <3 x bfloat> %b) {
; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX10-NEXT: v_sub_f32_e32 v4, v5, v4
-; GFX10-NEXT: s_mov_b32 s4, 0xff800000
; GFX10-NEXT: v_sub_f32_e32 v0, v0, v2
; GFX10-NEXT: v_sub_f32_e32 v1, v1, v3
; GFX10-NEXT: v_bfe_u32 v2, v4, 16, 1
-; GFX10-NEXT: v_and_or_b32 v7, v4, s4, 0x400000
+; GFX10-NEXT: v_or_b32_e32 v7, 0x400000, v4
; GFX10-NEXT: v_bfe_u32 v5, v0, 16, 1
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
; GFX10-NEXT: v_bfe_u32 v3, v1, 16, 1
; GFX10-NEXT: v_add3_u32 v2, v2, v4, 0x7fff
-; GFX10-NEXT: v_and_or_b32 v8, v0, s4, 0x400000
+; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v0
; GFX10-NEXT: v_add3_u32 v5, v5, v0, 0x7fff
-; GFX10-NEXT: v_and_or_b32 v6, v1, s4, 0x400000
+; GFX10-NEXT: v_or_b32_e32 v6, 0x400000, v1
; GFX10-NEXT: v_add3_u32 v3, v3, v1, 0x7fff
; GFX10-NEXT: v_cndmask_b32_e32 v2, v2, v7, vcc_lo
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
@@ -13276,17 +13103,15 @@ define <4 x bfloat> @v_fsub_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b) {
; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX8-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5
-; GFX8-NEXT: v_and_b32_e32 v6, 0xff800000, v4
; GFX8-NEXT: v_sub_f32_e32 v1, v1, v3
-; GFX8-NEXT: v_or_b32_e32 v6, 0x400000, v6
+; GFX8-NEXT: v_or_b32_e32 v6, 0x400000, v4
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
; GFX8-NEXT: v_bfe_u32 v3, v1, 16, 1
; GFX8-NEXT: s_movk_i32 s4, 0x7fff
; GFX8-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc
; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v1
; GFX8-NEXT: v_add_u32_e32 v3, vcc, s4, v3
-; GFX8-NEXT: v_and_b32_e32 v5, 0xff800000, v1
-; GFX8-NEXT: v_or_b32_e32 v5, 0x400000, v5
+; GFX8-NEXT: v_or_b32_e32 v5, 0x400000, v1
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
; GFX8-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc
; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v2
@@ -13297,16 +13122,14 @@ define <4 x bfloat> @v_fsub_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b) {
; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX8-NEXT: v_add_u32_e32 v5, vcc, s4, v5
-; GFX8-NEXT: v_and_b32_e32 v6, 0xff800000, v3
; GFX8-NEXT: v_sub_f32_e32 v0, v0, v2
-; GFX8-NEXT: v_or_b32_e32 v6, 0x400000, v6
+; GFX8-NEXT: v_or_b32_e32 v6, 0x400000, v3
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
; GFX8-NEXT: v_bfe_u32 v2, v0, 16, 1
; GFX8-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc
; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v0
; GFX8-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2
-; GFX8-NEXT: v_and_b32_e32 v5, 0xff800000, v0
-; GFX8-NEXT: v_or_b32_e32 v5, 0x400000, v5
+; GFX8-NEXT: v_or_b32_e32 v5, 0x400000, v0
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v5, vcc
; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1
@@ -13321,38 +13144,34 @@ define <4 x bfloat> @v_fsub_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b) {
; GFX9-NEXT: v_lshlrev_b32_e32 v4, 16, v3
; GFX9-NEXT: v_lshlrev_b32_e32 v5, 16, v1
; GFX9-NEXT: v_sub_f32_e32 v4, v5, v4
-; GFX9-NEXT: v_bfe_u32 v5, v4, 16, 1
-; GFX9-NEXT: s_movk_i32 s4, 0x7fff
-; GFX9-NEXT: v_and_b32_e32 v6, 0xff800000, v4
; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GFX9-NEXT: v_bfe_u32 v5, v4, 16, 1
+; GFX9-NEXT: s_movk_i32 s4, 0x7fff
+; GFX9-NEXT: v_sub_f32_e32 v1, v1, v3
; GFX9-NEXT: v_add3_u32 v5, v5, v4, s4
-; GFX9-NEXT: v_or_b32_e32 v6, 0x400000, v6
+; GFX9-NEXT: v_or_b32_e32 v6, 0x400000, v4
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
-; GFX9-NEXT: v_sub_f32_e32 v1, v1, v3
-; GFX9-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc
; GFX9-NEXT: v_bfe_u32 v3, v1, 16, 1
-; GFX9-NEXT: v_and_b32_e32 v5, 0xff800000, v1
+; GFX9-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc
; GFX9-NEXT: v_add3_u32 v3, v3, v1, s4
-; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v5
+; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v1
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc
; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v2
; GFX9-NEXT: v_lshlrev_b32_e32 v5, 16, v0
; GFX9-NEXT: v_sub_f32_e32 v3, v5, v3
-; GFX9-NEXT: v_bfe_u32 v5, v3, 16, 1
-; GFX9-NEXT: v_and_b32_e32 v6, 0xff800000, v3
; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX9-NEXT: v_bfe_u32 v5, v3, 16, 1
+; GFX9-NEXT: v_sub_f32_e32 v0, v0, v2
; GFX9-NEXT: v_add3_u32 v5, v5, v3, s4
-; GFX9-NEXT: v_or_b32_e32 v6, 0x400000, v6
+; GFX9-NEXT: v_or_b32_e32 v6, 0x400000, v3
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
-; GFX9-NEXT: v_sub_f32_e32 v0, v0, v2
-; GFX9-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc
; GFX9-NEXT: v_bfe_u32 v2, v0, 16, 1
-; GFX9-NEXT: v_and_b32_e32 v5, 0xff800000, v0
+; GFX9-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc
; GFX9-NEXT: v_add3_u32 v2, v2, v0, s4
-; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v5
+; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v0
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v5, vcc
; GFX9-NEXT: s_mov_b32 s4, 0x7060302
@@ -13373,31 +13192,30 @@ define <4 x bfloat> @v_fsub_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b) {
; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX10-NEXT: v_sub_f32_e32 v1, v1, v3
-; GFX10-NEXT: v_sub_f32_e32 v5, v7, v6
-; GFX10-NEXT: v_bfe_u32 v3, v4, 16, 1
-; GFX10-NEXT: s_mov_b32 s4, 0xff800000
+; GFX10-NEXT: v_sub_f32_e32 v3, v7, v6
+; GFX10-NEXT: v_bfe_u32 v5, v4, 16, 1
+; GFX10-NEXT: v_or_b32_e32 v7, 0x400000, v4
; GFX10-NEXT: v_sub_f32_e32 v0, v0, v2
-; GFX10-NEXT: v_and_or_b32 v6, v4, s4, 0x400000
-; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1
-; GFX10-NEXT: v_add3_u32 v3, v3, v4, 0x7fff
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
-; GFX10-NEXT: v_bfe_u32 v8, v0, 16, 1
+; GFX10-NEXT: v_bfe_u32 v6, v3, 16, 1
+; GFX10-NEXT: v_add3_u32 v5, v5, v4, 0x7fff
; GFX10-NEXT: v_bfe_u32 v2, v1, 16, 1
-; GFX10-NEXT: v_add3_u32 v4, v7, v5, 0x7fff
-; GFX10-NEXT: v_and_or_b32 v9, v1, s4, 0x400000
-; GFX10-NEXT: v_cndmask_b32_e32 v3, v3, v6, vcc_lo
-; GFX10-NEXT: v_and_or_b32 v6, v5, s4, 0x400000
-; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX10-NEXT: v_bfe_u32 v8, v0, 16, 1
+; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v1
+; GFX10-NEXT: v_add3_u32 v6, v6, v3, 0x7fff
+; GFX10-NEXT: v_cndmask_b32_e32 v4, v5, v7, vcc_lo
+; GFX10-NEXT: v_or_b32_e32 v5, 0x400000, v3
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
; GFX10-NEXT: v_add3_u32 v7, v8, v0, 0x7fff
-; GFX10-NEXT: v_and_or_b32 v8, v0, s4, 0x400000
+; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v0
; GFX10-NEXT: v_add3_u32 v2, v2, v1, 0x7fff
-; GFX10-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e32 v3, v6, v5, vcc_lo
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
; GFX10-NEXT: v_cndmask_b32_e32 v0, v7, v8, vcc_lo
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
-; GFX10-NEXT: v_perm_b32 v0, v0, v4, 0x7060302
+; GFX10-NEXT: v_perm_b32 v0, v0, v3, 0x7060302
; GFX10-NEXT: v_cndmask_b32_e32 v1, v2, v9, vcc_lo
-; GFX10-NEXT: v_perm_b32 v1, v1, v3, 0x7060302
+; GFX10-NEXT: v_perm_b32 v1, v1, v4, 0x7060302
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: v_fsub_v4bf16:
@@ -13405,45 +13223,42 @@ define <4 x bfloat> @v_fsub_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b) {
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_lshlrev_b32_e32 v6, 16, v2
; GFX11-NEXT: v_lshlrev_b32_e32 v7, 16, v0
-; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v3
; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX11-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v3
; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v1
-; GFX11-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; GFX11-NEXT: s_mov_b32 s0, 0xff800000
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_dual_sub_f32 v0, v0, v2 :: v_dual_and_b32 v3, 0xffff0000, v3
-; GFX11-NEXT: v_sub_f32_e32 v4, v5, v4
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_dual_sub_f32 v0, v0, v2 :: v_dual_and_b32 v1, 0xffff0000, v1
+; GFX11-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
; GFX11-NEXT: v_bfe_u32 v8, v0, 16, 1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_sub_f32_e32 v1, v1, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3)
-; GFX11-NEXT: v_bfe_u32 v3, v4, 16, 1
-; GFX11-NEXT: v_sub_f32_e32 v5, v7, v6
-; GFX11-NEXT: v_and_or_b32 v6, v4, s0, 0x400000
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX11-NEXT: v_dual_sub_f32 v3, v7, v6 :: v_dual_sub_f32 v4, v5, v4
; GFX11-NEXT: v_bfe_u32 v2, v1, 16, 1
-; GFX11-NEXT: v_add3_u32 v3, v3, v4, 0x7fff
-; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1
-; GFX11-NEXT: v_and_or_b32 v9, v1, s0, 0x400000
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT: v_bfe_u32 v6, v3, 16, 1
+; GFX11-NEXT: v_bfe_u32 v5, v4, 16, 1
+; GFX11-NEXT: v_or_b32_e32 v7, 0x400000, v4
+; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
; GFX11-NEXT: v_add3_u32 v2, v2, v1, 0x7fff
-; GFX11-NEXT: v_cndmask_b32_e32 v3, v3, v6, vcc_lo
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_add3_u32 v4, v7, v5, 0x7fff
-; GFX11-NEXT: v_and_or_b32 v6, v5, s0, 0x400000
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-NEXT: v_add3_u32 v6, v6, v3, 0x7fff
+; GFX11-NEXT: v_add3_u32 v5, v5, v4, 0x7fff
+; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_4)
+; GFX11-NEXT: v_cndmask_b32_e32 v4, v5, v7, vcc_lo
+; GFX11-NEXT: v_or_b32_e32 v5, 0x400000, v3
+; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
; GFX11-NEXT: v_add3_u32 v7, v8, v0, 0x7fff
-; GFX11-NEXT: v_and_or_b32 v8, v0, s0, 0x400000
-; GFX11-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc_lo
+; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v0
+; GFX11-NEXT: v_cndmask_b32_e32 v3, v6, v5, vcc_lo
; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_cndmask_b32_e32 v0, v7, v8, vcc_lo
; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-NEXT: v_perm_b32 v0, v0, v3, 0x7060302
; GFX11-NEXT: v_cndmask_b32_e32 v1, v2, v9, vcc_lo
-; GFX11-NEXT: v_perm_b32 v0, v0, v4, 0x7060302
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX11-NEXT: v_perm_b32 v1, v1, v3, 0x7060302
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_perm_b32 v1, v1, v4, 0x7060302
; GFX11-NEXT: s_setpc_b64 s[30:31]
%op = fsub <4 x bfloat> %a, %b
ret <4 x bfloat> %op
@@ -13481,8 +13296,7 @@ define bfloat @v_fmul_bf16(bfloat %a, bfloat %b) {
; GFX8-NEXT: v_bfe_u32 v1, v0, 16, 1
; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v0
; GFX8-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1
-; GFX8-NEXT: v_and_b32_e32 v2, 0xff800000, v0
-; GFX8-NEXT: v_or_b32_e32 v2, 0x400000, v2
+; GFX8-NEXT: v_or_b32_e32 v2, 0x400000, v0
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
; GFX8-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
@@ -13496,9 +13310,8 @@ define bfloat @v_fmul_bf16(bfloat %a, bfloat %b) {
; GFX9-NEXT: v_mul_f32_e32 v0, v0, v1
; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1
; GFX9-NEXT: s_movk_i32 s4, 0x7fff
-; GFX9-NEXT: v_and_b32_e32 v2, 0xff800000, v0
; GFX9-NEXT: v_add3_u32 v1, v1, v0, s4
-; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v2
+; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v0
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0
@@ -13509,10 +13322,9 @@ define bfloat @v_fmul_bf16(bfloat %a, bfloat %b) {
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GFX10-NEXT: s_mov_b32 s4, 0xff800000
; GFX10-NEXT: v_mul_f32_e32 v0, v0, v1
; GFX10-NEXT: v_bfe_u32 v1, v0, 16, 1
-; GFX10-NEXT: v_and_or_b32 v2, v0, s4, 0x400000
+; GFX10-NEXT: v_or_b32_e32 v2, 0x400000, v0
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
; GFX10-NEXT: v_add3_u32 v1, v1, v0, 0x7fff
; GFX10-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo
@@ -13524,11 +13336,10 @@ define bfloat @v_fmul_bf16(bfloat %a, bfloat %b) {
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GFX11-NEXT: s_mov_b32 s0, 0xff800000
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_mul_f32_e32 v0, v0, v1
; GFX11-NEXT: v_bfe_u32 v1, v0, 16, 1
-; GFX11-NEXT: v_and_or_b32 v2, v0, s0, 0x400000
+; GFX11-NEXT: v_or_b32_e32 v2, 0x400000, v0
; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_add3_u32 v1, v1, v0, 0x7fff
@@ -13586,16 +13397,14 @@ define <2 x bfloat> @v_fmul_v2bf16(<2 x bfloat> %a, <2 x bfloat> %b) {
; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
-; GFX8-NEXT: v_and_b32_e32 v4, 0xff800000, v2
; GFX8-NEXT: v_mul_f32_e32 v0, v0, v1
-; GFX8-NEXT: v_or_b32_e32 v4, 0x400000, v4
+; GFX8-NEXT: v_or_b32_e32 v4, 0x400000, v2
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
; GFX8-NEXT: v_bfe_u32 v1, v0, 16, 1
; GFX8-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v0
; GFX8-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1
-; GFX8-NEXT: v_and_b32_e32 v3, 0xff800000, v0
-; GFX8-NEXT: v_or_b32_e32 v3, 0x400000, v3
+; GFX8-NEXT: v_or_b32_e32 v3, 0x400000, v0
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
; GFX8-NEXT: v_cndmask_b32_e32 v0, v1, v3, vcc
; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
@@ -13608,20 +13417,18 @@ define <2 x bfloat> @v_fmul_v2bf16(<2 x bfloat> %a, <2 x bfloat> %b) {
; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v1
; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v0
; GFX9-NEXT: v_mul_f32_e32 v2, v3, v2
-; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1
-; GFX9-NEXT: s_movk_i32 s4, 0x7fff
-; GFX9-NEXT: v_and_b32_e32 v4, 0xff800000, v2
; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1
+; GFX9-NEXT: s_movk_i32 s4, 0x7fff
+; GFX9-NEXT: v_mul_f32_e32 v0, v0, v1
; GFX9-NEXT: v_add3_u32 v3, v3, v2, s4
-; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v4
+; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
-; GFX9-NEXT: v_mul_f32_e32 v0, v0, v1
-; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1
-; GFX9-NEXT: v_and_b32_e32 v3, 0xff800000, v0
+; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
; GFX9-NEXT: v_add3_u32 v1, v1, v0, s4
-; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v3
+; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v0
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v3, vcc
; GFX9-NEXT: s_mov_b32 s4, 0x7060302
@@ -13635,14 +13442,13 @@ define <2 x bfloat> @v_fmul_v2bf16(<2 x bfloat> %a, <2 x bfloat> %b) {
; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v0
; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GFX10-NEXT: s_mov_b32 s4, 0xff800000
; GFX10-NEXT: v_mul_f32_e32 v2, v3, v2
; GFX10-NEXT: v_mul_f32_e32 v0, v0, v1
; GFX10-NEXT: v_bfe_u32 v1, v2, 16, 1
-; GFX10-NEXT: v_and_or_b32 v4, v2, s4, 0x400000
+; GFX10-NEXT: v_or_b32_e32 v4, 0x400000, v2
; GFX10-NEXT: v_bfe_u32 v3, v0, 16, 1
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
-; GFX10-NEXT: v_and_or_b32 v5, v0, s4, 0x400000
+; GFX10-NEXT: v_or_b32_e32 v5, 0x400000, v0
; GFX10-NEXT: v_add3_u32 v1, v1, v2, 0x7fff
; GFX10-NEXT: v_add3_u32 v3, v3, v0, 0x7fff
; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc_lo
@@ -13658,16 +13464,15 @@ define <2 x bfloat> @v_fmul_v2bf16(<2 x bfloat> %a, <2 x bfloat> %b) {
; GFX11-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v0
; GFX11-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GFX11-NEXT: s_mov_b32 s0, 0xff800000
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX11-NEXT: v_mul_f32_e32 v0, v0, v1
; GFX11-NEXT: v_mul_f32_e32 v2, v3, v2
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_bfe_u32 v3, v0, 16, 1
; GFX11-NEXT: v_bfe_u32 v1, v2, 16, 1
-; GFX11-NEXT: v_and_or_b32 v4, v2, s0, 0x400000
+; GFX11-NEXT: v_or_b32_e32 v4, 0x400000, v2
; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
-; GFX11-NEXT: v_and_or_b32 v5, v0, s0, 0x400000
+; GFX11-NEXT: v_or_b32_e32 v5, 0x400000, v0
; GFX11-NEXT: v_add3_u32 v3, v3, v0, 0x7fff
; GFX11-NEXT: v_add3_u32 v1, v1, v2, 0x7fff
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_4)
@@ -13737,8 +13542,7 @@ define <3 x bfloat> @v_fmul_v3bf16(<3 x bfloat> %a, <3 x bfloat> %b) {
; GFX8-NEXT: v_bfe_u32 v3, v1, 16, 1
; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v1
; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
-; GFX8-NEXT: v_and_b32_e32 v4, 0xff800000, v1
-; GFX8-NEXT: v_or_b32_e32 v4, 0x400000, v4
+; GFX8-NEXT: v_or_b32_e32 v4, 0x400000, v1
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
; GFX8-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc
; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v2
@@ -13750,16 +13554,14 @@ define <3 x bfloat> @v_fmul_v3bf16(<3 x bfloat> %a, <3 x bfloat> %b) {
; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX8-NEXT: v_add_u32_e32 v4, vcc, s4, v4
-; GFX8-NEXT: v_and_b32_e32 v5, 0xff800000, v3
; GFX8-NEXT: v_mul_f32_e32 v0, v0, v2
-; GFX8-NEXT: v_or_b32_e32 v5, 0x400000, v5
+; GFX8-NEXT: v_or_b32_e32 v5, 0x400000, v3
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
; GFX8-NEXT: v_bfe_u32 v2, v0, 16, 1
; GFX8-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc
; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v0
; GFX8-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2
-; GFX8-NEXT: v_and_b32_e32 v4, 0xff800000, v0
-; GFX8-NEXT: v_or_b32_e32 v4, 0x400000, v4
+; GFX8-NEXT: v_or_b32_e32 v4, 0x400000, v0
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc
; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
@@ -13775,27 +13577,24 @@ define <3 x bfloat> @v_fmul_v3bf16(<3 x bfloat> %a, <3 x bfloat> %b) {
; GFX9-NEXT: v_mul_f32_e32 v1, v1, v3
; GFX9-NEXT: v_bfe_u32 v3, v1, 16, 1
; GFX9-NEXT: s_movk_i32 s4, 0x7fff
-; GFX9-NEXT: v_and_b32_e32 v4, 0xff800000, v1
; GFX9-NEXT: v_add3_u32 v3, v3, v1, s4
-; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v4
+; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v1
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc
; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v2
; GFX9-NEXT: v_lshlrev_b32_e32 v4, 16, v0
; GFX9-NEXT: v_mul_f32_e32 v3, v4, v3
-; GFX9-NEXT: v_bfe_u32 v4, v3, 16, 1
-; GFX9-NEXT: v_and_b32_e32 v5, 0xff800000, v3
; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX9-NEXT: v_bfe_u32 v4, v3, 16, 1
+; GFX9-NEXT: v_mul_f32_e32 v0, v0, v2
; GFX9-NEXT: v_add3_u32 v4, v4, v3, s4
-; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v5
+; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v3
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
-; GFX9-NEXT: v_mul_f32_e32 v0, v0, v2
-; GFX9-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc
; GFX9-NEXT: v_bfe_u32 v2, v0, 16, 1
-; GFX9-NEXT: v_and_b32_e32 v4, 0xff800000, v0
+; GFX9-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc
; GFX9-NEXT: v_add3_u32 v2, v2, v0, s4
-; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v4
+; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v0
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc
; GFX9-NEXT: s_mov_b32 s4, 0x7060302
@@ -13813,18 +13612,17 @@ define <3 x bfloat> @v_fmul_v3bf16(<3 x bfloat> %a, <3 x bfloat> %b) {
; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX10-NEXT: v_mul_f32_e32 v4, v5, v4
-; GFX10-NEXT: s_mov_b32 s4, 0xff800000
; GFX10-NEXT: v_mul_f32_e32 v0, v0, v2
; GFX10-NEXT: v_mul_f32_e32 v1, v1, v3
; GFX10-NEXT: v_bfe_u32 v2, v4, 16, 1
-; GFX10-NEXT: v_and_or_b32 v7, v4, s4, 0x400000
+; GFX10-NEXT: v_or_b32_e32 v7, 0x400000, v4
; GFX10-NEXT: v_bfe_u32 v5, v0, 16, 1
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
; GFX10-NEXT: v_bfe_u32 v3, v1, 16, 1
; GFX10-NEXT: v_add3_u32 v2, v2, v4, 0x7fff
-; GFX10-NEXT: v_and_or_b32 v8, v0, s4, 0x400000
+; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v0
; GFX10-NEXT: v_add3_u32 v5, v5, v0, 0x7fff
-; GFX10-NEXT: v_and_or_b32 v6, v1, s4, 0x400000
+; GFX10-NEXT: v_or_b32_e32 v6, 0x400000, v1
; GFX10-NEXT: v_add3_u32 v3, v3, v1, 0x7fff
; GFX10-NEXT: v_cndmask_b32_e32 v2, v2, v7, vcc_lo
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
@@ -13908,17 +13706,15 @@ define <4 x bfloat> @v_fmul_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b) {
; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX8-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5
-; GFX8-NEXT: v_and_b32_e32 v6, 0xff800000, v4
; GFX8-NEXT: v_mul_f32_e32 v1, v1, v3
-; GFX8-NEXT: v_or_b32_e32 v6, 0x400000, v6
+; GFX8-NEXT: v_or_b32_e32 v6, 0x400000, v4
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
; GFX8-NEXT: v_bfe_u32 v3, v1, 16, 1
; GFX8-NEXT: s_movk_i32 s4, 0x7fff
; GFX8-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc
; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v1
; GFX8-NEXT: v_add_u32_e32 v3, vcc, s4, v3
-; GFX8-NEXT: v_and_b32_e32 v5, 0xff800000, v1
-; GFX8-NEXT: v_or_b32_e32 v5, 0x400000, v5
+; GFX8-NEXT: v_or_b32_e32 v5, 0x400000, v1
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
; GFX8-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc
; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v2
@@ -13929,16 +13725,14 @@ define <4 x bfloat> @v_fmul_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b) {
; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX8-NEXT: v_add_u32_e32 v5, vcc, s4, v5
-; GFX8-NEXT: v_and_b32_e32 v6, 0xff800000, v3
; GFX8-NEXT: v_mul_f32_e32 v0, v0, v2
-; GFX8-NEXT: v_or_b32_e32 v6, 0x400000, v6
+; GFX8-NEXT: v_or_b32_e32 v6, 0x400000, v3
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
; GFX8-NEXT: v_bfe_u32 v2, v0, 16, 1
; GFX8-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc
; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v0
; GFX8-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2
-; GFX8-NEXT: v_and_b32_e32 v5, 0xff800000, v0
-; GFX8-NEXT: v_or_b32_e32 v5, 0x400000, v5
+; GFX8-NEXT: v_or_b32_e32 v5, 0x400000, v0
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v5, vcc
; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1
@@ -13953,38 +13747,34 @@ define <4 x bfloat> @v_fmul_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b) {
; GFX9-NEXT: v_lshlrev_b32_e32 v4, 16, v3
; GFX9-NEXT: v_lshlrev_b32_e32 v5, 16, v1
; GFX9-NEXT: v_mul_f32_e32 v4, v5, v4
-; GFX9-NEXT: v_bfe_u32 v5, v4, 16, 1
-; GFX9-NEXT: s_movk_i32 s4, 0x7fff
-; GFX9-NEXT: v_and_b32_e32 v6, 0xff800000, v4
; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GFX9-NEXT: v_bfe_u32 v5, v4, 16, 1
+; GFX9-NEXT: s_movk_i32 s4, 0x7fff
+; GFX9-NEXT: v_mul_f32_e32 v1, v1, v3
; GFX9-NEXT: v_add3_u32 v5, v5, v4, s4
-; GFX9-NEXT: v_or_b32_e32 v6, 0x400000, v6
+; GFX9-NEXT: v_or_b32_e32 v6, 0x400000, v4
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
-; GFX9-NEXT: v_mul_f32_e32 v1, v1, v3
-; GFX9-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc
; GFX9-NEXT: v_bfe_u32 v3, v1, 16, 1
-; GFX9-NEXT: v_and_b32_e32 v5, 0xff800000, v1
+; GFX9-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc
; GFX9-NEXT: v_add3_u32 v3, v3, v1, s4
-; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v5
+; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v1
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc
; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v2
; GFX9-NEXT: v_lshlrev_b32_e32 v5, 16, v0
; GFX9-NEXT: v_mul_f32_e32 v3, v5, v3
-; GFX9-NEXT: v_bfe_u32 v5, v3, 16, 1
-; GFX9-NEXT: v_and_b32_e32 v6, 0xff800000, v3
; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX9-NEXT: v_bfe_u32 v5, v3, 16, 1
+; GFX9-NEXT: v_mul_f32_e32 v0, v0, v2
; GFX9-NEXT: v_add3_u32 v5, v5, v3, s4
-; GFX9-NEXT: v_or_b32_e32 v6, 0x400000, v6
+; GFX9-NEXT: v_or_b32_e32 v6, 0x400000, v3
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
-; GFX9-NEXT: v_mul_f32_e32 v0, v0, v2
-; GFX9-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc
; GFX9-NEXT: v_bfe_u32 v2, v0, 16, 1
-; GFX9-NEXT: v_and_b32_e32 v5, 0xff800000, v0
+; GFX9-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc
; GFX9-NEXT: v_add3_u32 v2, v2, v0, s4
-; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v5
+; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v0
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v5, vcc
; GFX9-NEXT: s_mov_b32 s4, 0x7060302
@@ -14005,31 +13795,30 @@ define <4 x bfloat> @v_fmul_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b) {
; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX10-NEXT: v_mul_f32_e32 v1, v1, v3
-; GFX10-NEXT: v_mul_f32_e32 v5, v7, v6
-; GFX10-NEXT: v_bfe_u32 v3, v4, 16, 1
-; GFX10-NEXT: s_mov_b32 s4, 0xff800000
+; GFX10-NEXT: v_mul_f32_e32 v3, v7, v6
+; GFX10-NEXT: v_bfe_u32 v5, v4, 16, 1
+; GFX10-NEXT: v_or_b32_e32 v7, 0x400000, v4
; GFX10-NEXT: v_mul_f32_e32 v0, v0, v2
-; GFX10-NEXT: v_and_or_b32 v6, v4, s4, 0x400000
-; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1
-; GFX10-NEXT: v_add3_u32 v3, v3, v4, 0x7fff
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
-; GFX10-NEXT: v_bfe_u32 v8, v0, 16, 1
+; GFX10-NEXT: v_bfe_u32 v6, v3, 16, 1
+; GFX10-NEXT: v_add3_u32 v5, v5, v4, 0x7fff
; GFX10-NEXT: v_bfe_u32 v2, v1, 16, 1
-; GFX10-NEXT: v_add3_u32 v4, v7, v5, 0x7fff
-; GFX10-NEXT: v_and_or_b32 v9, v1, s4, 0x400000
-; GFX10-NEXT: v_cndmask_b32_e32 v3, v3, v6, vcc_lo
-; GFX10-NEXT: v_and_or_b32 v6, v5, s4, 0x400000
-; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX10-NEXT: v_bfe_u32 v8, v0, 16, 1
+; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v1
+; GFX10-NEXT: v_add3_u32 v6, v6, v3, 0x7fff
+; GFX10-NEXT: v_cndmask_b32_e32 v4, v5, v7, vcc_lo
+; GFX10-NEXT: v_or_b32_e32 v5, 0x400000, v3
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
; GFX10-NEXT: v_add3_u32 v7, v8, v0, 0x7fff
-; GFX10-NEXT: v_and_or_b32 v8, v0, s4, 0x400000
+; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v0
; GFX10-NEXT: v_add3_u32 v2, v2, v1, 0x7fff
-; GFX10-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e32 v3, v6, v5, vcc_lo
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
; GFX10-NEXT: v_cndmask_b32_e32 v0, v7, v8, vcc_lo
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
-; GFX10-NEXT: v_perm_b32 v0, v0, v4, 0x7060302
+; GFX10-NEXT: v_perm_b32 v0, v0, v3, 0x7060302
; GFX10-NEXT: v_cndmask_b32_e32 v1, v2, v9, vcc_lo
-; GFX10-NEXT: v_perm_b32 v1, v1, v3, 0x7060302
+; GFX10-NEXT: v_perm_b32 v1, v1, v4, 0x7060302
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: v_fmul_v4bf16:
@@ -14037,45 +13826,42 @@ define <4 x bfloat> @v_fmul_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b) {
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_lshlrev_b32_e32 v6, 16, v2
; GFX11-NEXT: v_lshlrev_b32_e32 v7, 16, v0
-; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v3
; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX11-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v3
; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v1
-; GFX11-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; GFX11-NEXT: s_mov_b32 s0, 0xff800000
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_dual_mul_f32 v0, v0, v2 :: v_dual_and_b32 v3, 0xffff0000, v3
-; GFX11-NEXT: v_mul_f32_e32 v4, v5, v4
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_dual_mul_f32 v0, v0, v2 :: v_dual_and_b32 v1, 0xffff0000, v1
+; GFX11-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
; GFX11-NEXT: v_bfe_u32 v8, v0, 16, 1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_mul_f32_e32 v1, v1, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3)
-; GFX11-NEXT: v_bfe_u32 v3, v4, 16, 1
-; GFX11-NEXT: v_mul_f32_e32 v5, v7, v6
-; GFX11-NEXT: v_and_or_b32 v6, v4, s0, 0x400000
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX11-NEXT: v_dual_mul_f32 v3, v7, v6 :: v_dual_mul_f32 v4, v5, v4
; GFX11-NEXT: v_bfe_u32 v2, v1, 16, 1
-; GFX11-NEXT: v_add3_u32 v3, v3, v4, 0x7fff
-; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1
-; GFX11-NEXT: v_and_or_b32 v9, v1, s0, 0x400000
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT: v_bfe_u32 v6, v3, 16, 1
+; GFX11-NEXT: v_bfe_u32 v5, v4, 16, 1
+; GFX11-NEXT: v_or_b32_e32 v7, 0x400000, v4
+; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
; GFX11-NEXT: v_add3_u32 v2, v2, v1, 0x7fff
-; GFX11-NEXT: v_cndmask_b32_e32 v3, v3, v6, vcc_lo
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_add3_u32 v4, v7, v5, 0x7fff
-; GFX11-NEXT: v_and_or_b32 v6, v5, s0, 0x400000
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-NEXT: v_add3_u32 v6, v6, v3, 0x7fff
+; GFX11-NEXT: v_add3_u32 v5, v5, v4, 0x7fff
+; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_4)
+; GFX11-NEXT: v_cndmask_b32_e32 v4, v5, v7, vcc_lo
+; GFX11-NEXT: v_or_b32_e32 v5, 0x400000, v3
+; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
; GFX11-NEXT: v_add3_u32 v7, v8, v0, 0x7fff
-; GFX11-NEXT: v_and_or_b32 v8, v0, s0, 0x400000
-; GFX11-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc_lo
+; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v0
+; GFX11-NEXT: v_cndmask_b32_e32 v3, v6, v5, vcc_lo
; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_cndmask_b32_e32 v0, v7, v8, vcc_lo
; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-NEXT: v_perm_b32 v0, v0, v3, 0x7060302
; GFX11-NEXT: v_cndmask_b32_e32 v1, v2, v9, vcc_lo
-; GFX11-NEXT: v_perm_b32 v0, v0, v4, 0x7060302
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX11-NEXT: v_perm_b32 v1, v1, v3, 0x7060302
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_perm_b32 v1, v1, v4, 0x7060302
; GFX11-NEXT: s_setpc_b64 s[30:31]
%op = fmul <4 x bfloat> %a, %b
ret <4 x bfloat> %op
@@ -14199,17 +13985,15 @@ define <8 x bfloat> @v_fmul_v8bf16(<8 x bfloat> %a, <8 x bfloat> %b) {
; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9
-; GFX8-NEXT: v_and_b32_e32 v10, 0xff800000, v8
; GFX8-NEXT: v_mul_f32_e32 v3, v3, v7
-; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v10
+; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v8
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v8, v8
; GFX8-NEXT: v_bfe_u32 v7, v3, 16, 1
; GFX8-NEXT: s_movk_i32 s4, 0x7fff
; GFX8-NEXT: v_cndmask_b32_e32 v8, v9, v10, vcc
; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v3
; GFX8-NEXT: v_add_u32_e32 v7, vcc, s4, v7
-; GFX8-NEXT: v_and_b32_e32 v9, 0xff800000, v3
-; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v9
+; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v3
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
; GFX8-NEXT: v_cndmask_b32_e32 v3, v7, v9, vcc
; GFX8-NEXT: v_lshlrev_b32_e32 v7, 16, v6
@@ -14220,16 +14004,14 @@ define <8 x bfloat> @v_fmul_v8bf16(<8 x bfloat> %a, <8 x bfloat> %b) {
; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX8-NEXT: v_add_u32_e32 v9, vcc, s4, v9
-; GFX8-NEXT: v_and_b32_e32 v10, 0xff800000, v7
; GFX8-NEXT: v_mul_f32_e32 v2, v2, v6
-; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v10
+; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v7
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v7, v7
; GFX8-NEXT: v_bfe_u32 v6, v2, 16, 1
; GFX8-NEXT: v_cndmask_b32_e32 v7, v9, v10, vcc
; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v2
; GFX8-NEXT: v_add_u32_e32 v6, vcc, s4, v6
-; GFX8-NEXT: v_and_b32_e32 v9, 0xff800000, v2
-; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v9
+; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v2
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
; GFX8-NEXT: v_cndmask_b32_e32 v2, v6, v9, vcc
; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v5
@@ -14240,16 +14022,14 @@ define <8 x bfloat> @v_fmul_v8bf16(<8 x bfloat> %a, <8 x bfloat> %b) {
; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX8-NEXT: v_add_u32_e32 v9, vcc, s4, v9
-; GFX8-NEXT: v_and_b32_e32 v10, 0xff800000, v6
; GFX8-NEXT: v_mul_f32_e32 v1, v1, v5
-; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v10
+; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v6
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
; GFX8-NEXT: v_bfe_u32 v5, v1, 16, 1
; GFX8-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc
; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v1
; GFX8-NEXT: v_add_u32_e32 v5, vcc, s4, v5
-; GFX8-NEXT: v_and_b32_e32 v9, 0xff800000, v1
-; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v9
+; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v1
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
; GFX8-NEXT: v_cndmask_b32_e32 v1, v5, v9, vcc
; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v4
@@ -14260,16 +14040,14 @@ define <8 x bfloat> @v_fmul_v8bf16(<8 x bfloat> %a, <8 x bfloat> %b) {
; GFX8-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX8-NEXT: v_add_u32_e32 v9, vcc, s4, v9
-; GFX8-NEXT: v_and_b32_e32 v10, 0xff800000, v5
; GFX8-NEXT: v_mul_f32_e32 v0, v0, v4
-; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v10
+; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v5
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
; GFX8-NEXT: v_bfe_u32 v4, v0, 16, 1
; GFX8-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc
; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v0
; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4
-; GFX8-NEXT: v_and_b32_e32 v9, 0xff800000, v0
-; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v9
+; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v0
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
; GFX8-NEXT: v_cndmask_b32_e32 v0, v4, v9, vcc
; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v3
@@ -14288,74 +14066,66 @@ define <8 x bfloat> @v_fmul_v8bf16(<8 x bfloat> %a, <8 x bfloat> %b) {
; GFX9-NEXT: v_lshlrev_b32_e32 v8, 16, v7
; GFX9-NEXT: v_lshlrev_b32_e32 v9, 16, v3
; GFX9-NEXT: v_mul_f32_e32 v8, v9, v8
-; GFX9-NEXT: v_bfe_u32 v9, v8, 16, 1
-; GFX9-NEXT: s_movk_i32 s4, 0x7fff
-; GFX9-NEXT: v_and_b32_e32 v10, 0xff800000, v8
; GFX9-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; GFX9-NEXT: v_bfe_u32 v9, v8, 16, 1
+; GFX9-NEXT: s_movk_i32 s4, 0x7fff
+; GFX9-NEXT: v_mul_f32_e32 v3, v3, v7
; GFX9-NEXT: v_add3_u32 v9, v9, v8, s4
-; GFX9-NEXT: v_or_b32_e32 v10, 0x400000, v10
+; GFX9-NEXT: v_or_b32_e32 v10, 0x400000, v8
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v8, v8
-; GFX9-NEXT: v_mul_f32_e32 v3, v3, v7
-; GFX9-NEXT: v_cndmask_b32_e32 v8, v9, v10, vcc
; GFX9-NEXT: v_bfe_u32 v7, v3, 16, 1
-; GFX9-NEXT: v_and_b32_e32 v9, 0xff800000, v3
+; GFX9-NEXT: v_cndmask_b32_e32 v8, v9, v10, vcc
; GFX9-NEXT: v_add3_u32 v7, v7, v3, s4
-; GFX9-NEXT: v_or_b32_e32 v9, 0x400000, v9
+; GFX9-NEXT: v_or_b32_e32 v9, 0x400000, v3
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
; GFX9-NEXT: v_cndmask_b32_e32 v3, v7, v9, vcc
; GFX9-NEXT: v_lshlrev_b32_e32 v7, 16, v6
; GFX9-NEXT: v_lshlrev_b32_e32 v9, 16, v2
; GFX9-NEXT: v_mul_f32_e32 v7, v9, v7
-; GFX9-NEXT: v_bfe_u32 v9, v7, 16, 1
-; GFX9-NEXT: v_and_b32_e32 v10, 0xff800000, v7
; GFX9-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX9-NEXT: v_bfe_u32 v9, v7, 16, 1
+; GFX9-NEXT: v_mul_f32_e32 v2, v2, v6
; GFX9-NEXT: v_add3_u32 v9, v9, v7, s4
-; GFX9-NEXT: v_or_b32_e32 v10, 0x400000, v10
+; GFX9-NEXT: v_or_b32_e32 v10, 0x400000, v7
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v7, v7
-; GFX9-NEXT: v_mul_f32_e32 v2, v2, v6
-; GFX9-NEXT: v_cndmask_b32_e32 v7, v9, v10, vcc
; GFX9-NEXT: v_bfe_u32 v6, v2, 16, 1
-; GFX9-NEXT: v_and_b32_e32 v9, 0xff800000, v2
+; GFX9-NEXT: v_cndmask_b32_e32 v7, v9, v10, vcc
; GFX9-NEXT: v_add3_u32 v6, v6, v2, s4
-; GFX9-NEXT: v_or_b32_e32 v9, 0x400000, v9
+; GFX9-NEXT: v_or_b32_e32 v9, 0x400000, v2
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
; GFX9-NEXT: v_cndmask_b32_e32 v2, v6, v9, vcc
; GFX9-NEXT: v_lshlrev_b32_e32 v6, 16, v5
; GFX9-NEXT: v_lshlrev_b32_e32 v9, 16, v1
; GFX9-NEXT: v_mul_f32_e32 v6, v9, v6
-; GFX9-NEXT: v_bfe_u32 v9, v6, 16, 1
-; GFX9-NEXT: v_and_b32_e32 v10, 0xff800000, v6
; GFX9-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GFX9-NEXT: v_bfe_u32 v9, v6, 16, 1
+; GFX9-NEXT: v_mul_f32_e32 v1, v1, v5
; GFX9-NEXT: v_add3_u32 v9, v9, v6, s4
-; GFX9-NEXT: v_or_b32_e32 v10, 0x400000, v10
+; GFX9-NEXT: v_or_b32_e32 v10, 0x400000, v6
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
-; GFX9-NEXT: v_mul_f32_e32 v1, v1, v5
-; GFX9-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc
; GFX9-NEXT: v_bfe_u32 v5, v1, 16, 1
-; GFX9-NEXT: v_and_b32_e32 v9, 0xff800000, v1
+; GFX9-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc
; GFX9-NEXT: v_add3_u32 v5, v5, v1, s4
-; GFX9-NEXT: v_or_b32_e32 v9, 0x400000, v9
+; GFX9-NEXT: v_or_b32_e32 v9, 0x400000, v1
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v9, vcc
; GFX9-NEXT: v_lshlrev_b32_e32 v5, 16, v4
; GFX9-NEXT: v_lshlrev_b32_e32 v9, 16, v0
; GFX9-NEXT: v_mul_f32_e32 v5, v9, v5
-; GFX9-NEXT: v_bfe_u32 v9, v5, 16, 1
-; GFX9-NEXT: v_and_b32_e32 v10, 0xff800000, v5
; GFX9-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX9-NEXT: v_bfe_u32 v9, v5, 16, 1
+; GFX9-NEXT: v_mul_f32_e32 v0, v0, v4
; GFX9-NEXT: v_add3_u32 v9, v9, v5, s4
-; GFX9-NEXT: v_or_b32_e32 v10, 0x400000, v10
+; GFX9-NEXT: v_or_b32_e32 v10, 0x400000, v5
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
-; GFX9-NEXT: v_mul_f32_e32 v0, v0, v4
-; GFX9-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc
; GFX9-NEXT: v_bfe_u32 v4, v0, 16, 1
-; GFX9-NEXT: v_and_b32_e32 v9, 0xff800000, v0
+; GFX9-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc
; GFX9-NEXT: v_add3_u32 v4, v4, v0, s4
-; GFX9-NEXT: v_or_b32_e32 v9, 0x400000, v9
+; GFX9-NEXT: v_or_b32_e32 v9, 0x400000, v0
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v9, vcc
; GFX9-NEXT: s_mov_b32 s4, 0x7060302
@@ -14372,62 +14142,61 @@ define <8 x bfloat> @v_fmul_v8bf16(<8 x bfloat> %a, <8 x bfloat> %b) {
; GFX10-NEXT: v_lshlrev_b32_e32 v9, 16, v3
; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
-; GFX10-NEXT: v_lshlrev_b32_e32 v11, 16, v2
+; GFX10-NEXT: v_lshlrev_b32_e32 v10, 16, v2
; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX10-NEXT: v_mul_f32_e32 v8, v9, v8
; GFX10-NEXT: v_lshlrev_b32_e32 v9, 16, v6
-; GFX10-NEXT: v_mul_f32_e32 v3, v3, v7
; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
-; GFX10-NEXT: s_mov_b32 s4, 0xff800000
-; GFX10-NEXT: v_bfe_u32 v10, v8, 16, 1
-; GFX10-NEXT: v_and_or_b32 v7, v8, s4, 0x400000
-; GFX10-NEXT: v_mul_f32_e32 v9, v11, v9
-; GFX10-NEXT: v_bfe_u32 v11, v3, 16, 1
+; GFX10-NEXT: v_mul_f32_e32 v3, v3, v7
+; GFX10-NEXT: v_lshlrev_b32_e32 v15, 16, v0
+; GFX10-NEXT: v_bfe_u32 v11, v8, 16, 1
+; GFX10-NEXT: v_mul_f32_e32 v7, v10, v9
+; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v8
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8
-; GFX10-NEXT: v_add3_u32 v10, v10, v8, 0x7fff
; GFX10-NEXT: v_mul_f32_e32 v2, v2, v6
-; GFX10-NEXT: v_bfe_u32 v8, v9, 16, 1
+; GFX10-NEXT: v_add3_u32 v10, v11, v8, 0x7fff
+; GFX10-NEXT: v_bfe_u32 v11, v3, 16, 1
+; GFX10-NEXT: v_bfe_u32 v12, v7, 16, 1
; GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v5
-; GFX10-NEXT: v_and_or_b32 v12, v9, s4, 0x400000
-; GFX10-NEXT: v_cndmask_b32_e32 v7, v10, v7, vcc_lo
-; GFX10-NEXT: v_add3_u32 v10, v11, v3, 0x7fff
-; GFX10-NEXT: v_lshlrev_b32_e32 v11, 16, v1
; GFX10-NEXT: v_bfe_u32 v13, v2, 16, 1
-; GFX10-NEXT: v_add3_u32 v8, v8, v9, 0x7fff
-; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9
+; GFX10-NEXT: v_cndmask_b32_e32 v8, v10, v9, vcc_lo
+; GFX10-NEXT: v_lshlrev_b32_e32 v10, 16, v1
+; GFX10-NEXT: v_add3_u32 v9, v11, v3, 0x7fff
+; GFX10-NEXT: v_add3_u32 v11, v12, v7, 0x7fff
+; GFX10-NEXT: v_or_b32_e32 v12, 0x400000, v7
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
+; GFX10-NEXT: v_mul_f32_e32 v6, v10, v6
+; GFX10-NEXT: v_add3_u32 v10, v13, v2, 0x7fff
; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
-; GFX10-NEXT: v_mul_f32_e32 v6, v11, v6
-; GFX10-NEXT: v_add3_u32 v9, v13, v2, 0x7fff
; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX10-NEXT: v_lshlrev_b32_e32 v13, 16, v4
-; GFX10-NEXT: v_lshlrev_b32_e32 v15, 16, v0
-; GFX10-NEXT: v_cndmask_b32_e32 v8, v8, v12, vcc_lo
-; GFX10-NEXT: v_and_or_b32 v11, v2, s4, 0x400000
+; GFX10-NEXT: v_cndmask_b32_e32 v7, v11, v12, vcc_lo
+; GFX10-NEXT: v_or_b32_e32 v11, 0x400000, v2
; GFX10-NEXT: v_bfe_u32 v12, v6, 16, 1
; GFX10-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
; GFX10-NEXT: v_mul_f32_e32 v1, v1, v5
; GFX10-NEXT: v_mul_f32_e32 v5, v15, v13
-; GFX10-NEXT: v_and_or_b32 v14, v3, s4, 0x400000
+; GFX10-NEXT: v_or_b32_e32 v14, 0x400000, v3
; GFX10-NEXT: v_mul_f32_e32 v0, v0, v4
-; GFX10-NEXT: v_cndmask_b32_e32 v2, v9, v11, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e32 v2, v10, v11, vcc_lo
; GFX10-NEXT: v_add3_u32 v4, v12, v6, 0x7fff
-; GFX10-NEXT: v_and_or_b32 v9, v6, s4, 0x400000
+; GFX10-NEXT: v_or_b32_e32 v10, 0x400000, v6
; GFX10-NEXT: v_bfe_u32 v11, v1, 16, 1
; GFX10-NEXT: v_bfe_u32 v12, v5, 16, 1
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
; GFX10-NEXT: v_bfe_u32 v13, v0, 16, 1
-; GFX10-NEXT: v_and_or_b32 v15, v1, s4, 0x400000
+; GFX10-NEXT: v_or_b32_e32 v15, 0x400000, v1
; GFX10-NEXT: v_add3_u32 v6, v11, v1, 0x7fff
-; GFX10-NEXT: v_and_or_b32 v11, v5, s4, 0x400000
-; GFX10-NEXT: v_cndmask_b32_e32 v4, v4, v9, vcc_lo
-; GFX10-NEXT: v_add3_u32 v9, v12, v5, 0x7fff
+; GFX10-NEXT: v_or_b32_e32 v11, 0x400000, v5
+; GFX10-NEXT: v_cndmask_b32_e32 v4, v4, v10, vcc_lo
+; GFX10-NEXT: v_add3_u32 v10, v12, v5, 0x7fff
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
; GFX10-NEXT: v_add3_u32 v12, v13, v0, 0x7fff
-; GFX10-NEXT: v_and_or_b32 v13, v0, s4, 0x400000
-; GFX10-NEXT: v_perm_b32 v2, v2, v8, 0x7060302
-; GFX10-NEXT: v_cndmask_b32_e32 v5, v9, v11, vcc_lo
+; GFX10-NEXT: v_or_b32_e32 v13, 0x400000, v0
+; GFX10-NEXT: v_perm_b32 v2, v2, v7, 0x7060302
+; GFX10-NEXT: v_cndmask_b32_e32 v5, v10, v11, vcc_lo
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
; GFX10-NEXT: v_cndmask_b32_e32 v0, v12, v13, vcc_lo
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
@@ -14435,81 +14204,80 @@ define <8 x bfloat> @v_fmul_v8bf16(<8 x bfloat> %a, <8 x bfloat> %b) {
; GFX10-NEXT: v_cndmask_b32_e32 v1, v6, v15, vcc_lo
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
; GFX10-NEXT: v_perm_b32 v1, v1, v4, 0x7060302
-; GFX10-NEXT: v_cndmask_b32_e32 v3, v10, v14, vcc_lo
-; GFX10-NEXT: v_perm_b32 v3, v3, v7, 0x7060302
+; GFX10-NEXT: v_cndmask_b32_e32 v3, v9, v14, vcc_lo
+; GFX10-NEXT: v_perm_b32 v3, v3, v8, 0x7060302
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: v_fmul_v8bf16:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_lshlrev_b32_e32 v15, 16, v0
-; GFX11-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GFX11-NEXT: v_lshlrev_b32_e32 v11, 16, v2
+; GFX11-NEXT: v_lshlrev_b32_e32 v10, 16, v2
; GFX11-NEXT: v_lshlrev_b32_e32 v8, 16, v7
; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX11-NEXT: v_lshlrev_b32_e32 v9, 16, v3
-; GFX11-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
-; GFX11-NEXT: s_mov_b32 s0, 0xff800000
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_dual_mul_f32 v8, v9, v8 :: v_dual_lshlrev_b32 v9, 16, v6
-; GFX11-NEXT: v_bfe_u32 v10, v8, 16, 1
+; GFX11-NEXT: v_lshlrev_b32_e32 v15, 16, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_dual_mul_f32 v8, v9, v8 :: v_dual_and_b32 v7, 0xffff0000, v7
+; GFX11-NEXT: v_lshlrev_b32_e32 v9, 16, v6
; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_mul_f32_e32 v9, v11, v9
-; GFX11-NEXT: v_add3_u32 v10, v10, v8, 0x7fff
-; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_and_or_b32 v12, v9, s0, 0x400000
-; GFX11-NEXT: v_mul_f32_e32 v2, v2, v6
-; GFX11-NEXT: v_lshlrev_b32_e32 v6, 16, v5
; GFX11-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_bfe_u32 v13, v2, 16, 1
+; GFX11-NEXT: v_bfe_u32 v11, v8, 16, 1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4)
; GFX11-NEXT: v_mul_f32_e32 v3, v3, v7
-; GFX11-NEXT: v_and_or_b32 v7, v8, s0, 0x400000
-; GFX11-NEXT: v_bfe_u32 v8, v9, 16, 1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_cndmask_b32_e32 v7, v10, v7, vcc_lo
-; GFX11-NEXT: v_add3_u32 v8, v8, v9, 0x7fff
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9
-; GFX11-NEXT: v_add3_u32 v9, v13, v2, 0x7fff
-; GFX11-NEXT: v_lshlrev_b32_e32 v13, 16, v4
-; GFX11-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
+; GFX11-NEXT: v_mul_f32_e32 v7, v10, v9
+; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v8
+; GFX11-NEXT: v_add3_u32 v10, v11, v8, 0x7fff
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX11-NEXT: v_bfe_u32 v11, v3, 16, 1
-; GFX11-NEXT: v_cndmask_b32_e32 v8, v8, v12, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
-; GFX11-NEXT: v_and_or_b32 v14, v3, s0, 0x400000
-; GFX11-NEXT: v_mul_f32_e32 v0, v0, v4
-; GFX11-NEXT: v_add3_u32 v10, v11, v3, 0x7fff
-; GFX11-NEXT: v_lshlrev_b32_e32 v11, 16, v1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_dual_mul_f32 v6, v11, v6 :: v_dual_and_b32 v1, 0xffff0000, v1
-; GFX11-NEXT: v_and_or_b32 v11, v2, s0, 0x400000
+; GFX11-NEXT: v_bfe_u32 v12, v7, 16, 1
+; GFX11-NEXT: v_or_b32_e32 v14, 0x400000, v3
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-NEXT: v_cndmask_b32_e32 v8, v10, v9, vcc_lo
+; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
+; GFX11-NEXT: v_add3_u32 v9, v11, v3, 0x7fff
+; GFX11-NEXT: v_add3_u32 v11, v12, v7, 0x7fff
+; GFX11-NEXT: v_or_b32_e32 v12, 0x400000, v7
+; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
+; GFX11-NEXT: v_lshlrev_b32_e32 v10, 16, v1
+; GFX11-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-NEXT: v_dual_cndmask_b32 v7, v11, v12 :: v_dual_mul_f32 v2, v2, v6
+; GFX11-NEXT: v_lshlrev_b32_e32 v6, 16, v5
; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
+; GFX11-NEXT: v_bfe_u32 v13, v2, 16, 1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX11-NEXT: v_mul_f32_e32 v6, v10, v6
+; GFX11-NEXT: v_or_b32_e32 v11, 0x400000, v2
+; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX11-NEXT: v_add3_u32 v10, v13, v2, 0x7fff
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_bfe_u32 v12, v6, 16, 1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3)
-; GFX11-NEXT: v_cndmask_b32_e32 v2, v9, v11, vcc_lo
-; GFX11-NEXT: v_and_or_b32 v9, v6, s0, 0x400000
+; GFX11-NEXT: v_cndmask_b32_e32 v2, v10, v11, vcc_lo
+; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v6
; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
-; GFX11-NEXT: v_mul_f32_e32 v1, v1, v5
+; GFX11-NEXT: v_lshlrev_b32_e32 v13, 16, v4
+; GFX11-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
+; GFX11-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GFX11-NEXT: v_perm_b32 v2, v2, v7, 0x7060302
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_mul_f32_e32 v0, v0, v4
; GFX11-NEXT: v_add3_u32 v4, v12, v6, 0x7fff
-; GFX11-NEXT: v_perm_b32 v2, v2, v8, 0x7060302
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_bfe_u32 v11, v1, 16, 1
-; GFX11-NEXT: v_cndmask_b32_e32 v4, v4, v9, vcc_lo
+; GFX11-NEXT: v_dual_mul_f32 v1, v1, v5 :: v_dual_cndmask_b32 v4, v4, v10
; GFX11-NEXT: v_mul_f32_e32 v5, v15, v13
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX11-NEXT: v_bfe_u32 v11, v1, 16, 1
; GFX11-NEXT: v_bfe_u32 v13, v0, 16, 1
-; GFX11-NEXT: v_and_or_b32 v15, v1, s0, 0x400000
-; GFX11-NEXT: v_add3_u32 v6, v11, v1, 0x7fff
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-NEXT: v_or_b32_e32 v15, 0x400000, v1
; GFX11-NEXT: v_bfe_u32 v12, v5, 16, 1
-; GFX11-NEXT: v_and_or_b32 v11, v5, s0, 0x400000
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX11-NEXT: v_add3_u32 v6, v11, v1, 0x7fff
+; GFX11-NEXT: v_or_b32_e32 v11, 0x400000, v5
; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX11-NEXT: v_add3_u32 v9, v12, v5, 0x7fff
+; GFX11-NEXT: v_add3_u32 v10, v12, v5, 0x7fff
; GFX11-NEXT: v_add3_u32 v12, v13, v0, 0x7fff
-; GFX11-NEXT: v_and_or_b32 v13, v0, s0, 0x400000
+; GFX11-NEXT: v_or_b32_e32 v13, 0x400000, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_cndmask_b32_e32 v5, v9, v11, vcc_lo
+; GFX11-NEXT: v_cndmask_b32_e32 v5, v10, v11, vcc_lo
; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
; GFX11-NEXT: v_cndmask_b32_e32 v0, v12, v13, vcc_lo
; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
@@ -14518,9 +14286,9 @@ define <8 x bfloat> @v_fmul_v8bf16(<8 x bfloat> %a, <8 x bfloat> %b) {
; GFX11-NEXT: v_cndmask_b32_e32 v1, v6, v15, vcc_lo
; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
; GFX11-NEXT: v_perm_b32 v1, v1, v4, 0x7060302
-; GFX11-NEXT: v_cndmask_b32_e32 v3, v10, v14, vcc_lo
+; GFX11-NEXT: v_cndmask_b32_e32 v3, v9, v14, vcc_lo
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_perm_b32 v3, v3, v7, 0x7060302
+; GFX11-NEXT: v_perm_b32 v3, v3, v8, 0x7060302
; GFX11-NEXT: s_setpc_b64 s[30:31]
%op = fmul <8 x bfloat> %a, %b
ret <8 x bfloat> %op
@@ -14745,16 +14513,14 @@ define <16 x bfloat> @v_fmul_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) {
; GFX8-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
; GFX8-NEXT: v_add_u32_e32 v17, vcc, s4, v17
-; GFX8-NEXT: v_and_b32_e32 v18, 0xff800000, v16
; GFX8-NEXT: v_mul_f32_e32 v7, v7, v15
-; GFX8-NEXT: v_or_b32_e32 v18, 0x400000, v18
+; GFX8-NEXT: v_or_b32_e32 v18, 0x400000, v16
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v16, v16
; GFX8-NEXT: v_bfe_u32 v15, v7, 16, 1
; GFX8-NEXT: v_cndmask_b32_e32 v16, v17, v18, vcc
; GFX8-NEXT: v_add_u32_e32 v15, vcc, v15, v7
; GFX8-NEXT: v_add_u32_e32 v15, vcc, s4, v15
-; GFX8-NEXT: v_and_b32_e32 v17, 0xff800000, v7
-; GFX8-NEXT: v_or_b32_e32 v17, 0x400000, v17
+; GFX8-NEXT: v_or_b32_e32 v17, 0x400000, v7
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v7, v7
; GFX8-NEXT: v_cndmask_b32_e32 v7, v15, v17, vcc
; GFX8-NEXT: v_lshlrev_b32_e32 v15, 16, v14
@@ -14765,16 +14531,14 @@ define <16 x bfloat> @v_fmul_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) {
; GFX8-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
; GFX8-NEXT: v_add_u32_e32 v17, vcc, s4, v17
-; GFX8-NEXT: v_and_b32_e32 v18, 0xff800000, v15
; GFX8-NEXT: v_mul_f32_e32 v6, v6, v14
-; GFX8-NEXT: v_or_b32_e32 v18, 0x400000, v18
+; GFX8-NEXT: v_or_b32_e32 v18, 0x400000, v15
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v15, v15
; GFX8-NEXT: v_bfe_u32 v14, v6, 16, 1
; GFX8-NEXT: v_cndmask_b32_e32 v15, v17, v18, vcc
; GFX8-NEXT: v_add_u32_e32 v14, vcc, v14, v6
; GFX8-NEXT: v_add_u32_e32 v14, vcc, s4, v14
-; GFX8-NEXT: v_and_b32_e32 v17, 0xff800000, v6
-; GFX8-NEXT: v_or_b32_e32 v17, 0x400000, v17
+; GFX8-NEXT: v_or_b32_e32 v17, 0x400000, v6
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
; GFX8-NEXT: v_cndmask_b32_e32 v6, v14, v17, vcc
; GFX8-NEXT: v_lshlrev_b32_e32 v14, 16, v13
@@ -14785,16 +14549,14 @@ define <16 x bfloat> @v_fmul_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) {
; GFX8-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
; GFX8-NEXT: v_add_u32_e32 v17, vcc, s4, v17
-; GFX8-NEXT: v_and_b32_e32 v18, 0xff800000, v14
; GFX8-NEXT: v_mul_f32_e32 v5, v5, v13
-; GFX8-NEXT: v_or_b32_e32 v18, 0x400000, v18
+; GFX8-NEXT: v_or_b32_e32 v18, 0x400000, v14
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v14, v14
; GFX8-NEXT: v_bfe_u32 v13, v5, 16, 1
; GFX8-NEXT: v_cndmask_b32_e32 v14, v17, v18, vcc
; GFX8-NEXT: v_add_u32_e32 v13, vcc, v13, v5
; GFX8-NEXT: v_add_u32_e32 v13, vcc, s4, v13
-; GFX8-NEXT: v_and_b32_e32 v17, 0xff800000, v5
-; GFX8-NEXT: v_or_b32_e32 v17, 0x400000, v17
+; GFX8-NEXT: v_or_b32_e32 v17, 0x400000, v5
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
; GFX8-NEXT: v_cndmask_b32_e32 v5, v13, v17, vcc
; GFX8-NEXT: v_lshlrev_b32_e32 v13, 16, v12
@@ -14805,16 +14567,14 @@ define <16 x bfloat> @v_fmul_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) {
; GFX8-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
; GFX8-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
; GFX8-NEXT: v_add_u32_e32 v17, vcc, s4, v17
-; GFX8-NEXT: v_and_b32_e32 v18, 0xff800000, v13
; GFX8-NEXT: v_mul_f32_e32 v4, v4, v12
-; GFX8-NEXT: v_or_b32_e32 v18, 0x400000, v18
+; GFX8-NEXT: v_or_b32_e32 v18, 0x400000, v13
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v13, v13
; GFX8-NEXT: v_bfe_u32 v12, v4, 16, 1
; GFX8-NEXT: v_cndmask_b32_e32 v13, v17, v18, vcc
; GFX8-NEXT: v_add_u32_e32 v12, vcc, v12, v4
; GFX8-NEXT: v_add_u32_e32 v12, vcc, s4, v12
-; GFX8-NEXT: v_and_b32_e32 v17, 0xff800000, v4
-; GFX8-NEXT: v_or_b32_e32 v17, 0x400000, v17
+; GFX8-NEXT: v_or_b32_e32 v17, 0x400000, v4
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
; GFX8-NEXT: v_cndmask_b32_e32 v4, v12, v17, vcc
; GFX8-NEXT: v_lshlrev_b32_e32 v12, 16, v11
@@ -14825,16 +14585,14 @@ define <16 x bfloat> @v_fmul_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) {
; GFX8-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
; GFX8-NEXT: v_add_u32_e32 v17, vcc, s4, v17
-; GFX8-NEXT: v_and_b32_e32 v18, 0xff800000, v12
; GFX8-NEXT: v_mul_f32_e32 v3, v3, v11
-; GFX8-NEXT: v_or_b32_e32 v18, 0x400000, v18
+; GFX8-NEXT: v_or_b32_e32 v18, 0x400000, v12
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v12, v12
; GFX8-NEXT: v_bfe_u32 v11, v3, 16, 1
; GFX8-NEXT: v_cndmask_b32_e32 v12, v17, v18, vcc
; GFX8-NEXT: v_add_u32_e32 v11, vcc, v11, v3
; GFX8-NEXT: v_add_u32_e32 v11, vcc, s4, v11
-; GFX8-NEXT: v_and_b32_e32 v17, 0xff800000, v3
-; GFX8-NEXT: v_or_b32_e32 v17, 0x400000, v17
+; GFX8-NEXT: v_or_b32_e32 v17, 0x400000, v3
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
; GFX8-NEXT: v_cndmask_b32_e32 v3, v11, v17, vcc
; GFX8-NEXT: v_lshlrev_b32_e32 v11, 16, v10
@@ -14845,16 +14603,14 @@ define <16 x bfloat> @v_fmul_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) {
; GFX8-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX8-NEXT: v_add_u32_e32 v17, vcc, s4, v17
-; GFX8-NEXT: v_and_b32_e32 v18, 0xff800000, v11
; GFX8-NEXT: v_mul_f32_e32 v2, v2, v10
-; GFX8-NEXT: v_or_b32_e32 v18, 0x400000, v18
+; GFX8-NEXT: v_or_b32_e32 v18, 0x400000, v11
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v11, v11
; GFX8-NEXT: v_bfe_u32 v10, v2, 16, 1
; GFX8-NEXT: v_cndmask_b32_e32 v11, v17, v18, vcc
; GFX8-NEXT: v_add_u32_e32 v10, vcc, v10, v2
; GFX8-NEXT: v_add_u32_e32 v10, vcc, s4, v10
-; GFX8-NEXT: v_and_b32_e32 v17, 0xff800000, v2
-; GFX8-NEXT: v_or_b32_e32 v17, 0x400000, v17
+; GFX8-NEXT: v_or_b32_e32 v17, 0x400000, v2
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
; GFX8-NEXT: v_cndmask_b32_e32 v2, v10, v17, vcc
; GFX8-NEXT: v_lshlrev_b32_e32 v10, 16, v9
@@ -14865,16 +14621,14 @@ define <16 x bfloat> @v_fmul_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) {
; GFX8-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX8-NEXT: v_add_u32_e32 v17, vcc, s4, v17
-; GFX8-NEXT: v_and_b32_e32 v18, 0xff800000, v10
; GFX8-NEXT: v_mul_f32_e32 v1, v1, v9
-; GFX8-NEXT: v_or_b32_e32 v18, 0x400000, v18
+; GFX8-NEXT: v_or_b32_e32 v18, 0x400000, v10
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v10, v10
; GFX8-NEXT: v_bfe_u32 v9, v1, 16, 1
; GFX8-NEXT: v_cndmask_b32_e32 v10, v17, v18, vcc
; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v1
; GFX8-NEXT: v_add_u32_e32 v9, vcc, s4, v9
-; GFX8-NEXT: v_and_b32_e32 v17, 0xff800000, v1
-; GFX8-NEXT: v_or_b32_e32 v17, 0x400000, v17
+; GFX8-NEXT: v_or_b32_e32 v17, 0x400000, v1
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
; GFX8-NEXT: v_cndmask_b32_e32 v1, v9, v17, vcc
; GFX8-NEXT: v_lshlrev_b32_e32 v9, 16, v8
@@ -14885,16 +14639,14 @@ define <16 x bfloat> @v_fmul_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) {
; GFX8-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX8-NEXT: v_add_u32_e32 v17, vcc, s4, v17
-; GFX8-NEXT: v_and_b32_e32 v18, 0xff800000, v9
; GFX8-NEXT: v_mul_f32_e32 v0, v0, v8
-; GFX8-NEXT: v_or_b32_e32 v18, 0x400000, v18
+; GFX8-NEXT: v_or_b32_e32 v18, 0x400000, v9
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v9, v9
; GFX8-NEXT: v_bfe_u32 v8, v0, 16, 1
; GFX8-NEXT: v_cndmask_b32_e32 v9, v17, v18, vcc
; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v0
; GFX8-NEXT: v_add_u32_e32 v8, vcc, s4, v8
-; GFX8-NEXT: v_and_b32_e32 v17, 0xff800000, v0
-; GFX8-NEXT: v_or_b32_e32 v17, 0x400000, v17
+; GFX8-NEXT: v_or_b32_e32 v17, 0x400000, v0
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
; GFX8-NEXT: v_cndmask_b32_e32 v0, v8, v17, vcc
; GFX8-NEXT: v_lshrrev_b32_e32 v7, 16, v7
@@ -14921,146 +14673,130 @@ define <16 x bfloat> @v_fmul_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) {
; GFX9-NEXT: v_lshlrev_b32_e32 v16, 16, v15
; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v7
; GFX9-NEXT: v_mul_f32_e32 v16, v17, v16
-; GFX9-NEXT: v_bfe_u32 v17, v16, 16, 1
-; GFX9-NEXT: s_movk_i32 s4, 0x7fff
-; GFX9-NEXT: v_and_b32_e32 v18, 0xff800000, v16
; GFX9-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
; GFX9-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
+; GFX9-NEXT: v_bfe_u32 v17, v16, 16, 1
+; GFX9-NEXT: s_movk_i32 s4, 0x7fff
+; GFX9-NEXT: v_mul_f32_e32 v7, v7, v15
; GFX9-NEXT: v_add3_u32 v17, v17, v16, s4
-; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v18
+; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v16
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v16, v16
-; GFX9-NEXT: v_mul_f32_e32 v7, v7, v15
-; GFX9-NEXT: v_cndmask_b32_e32 v16, v17, v18, vcc
; GFX9-NEXT: v_bfe_u32 v15, v7, 16, 1
-; GFX9-NEXT: v_and_b32_e32 v17, 0xff800000, v7
+; GFX9-NEXT: v_cndmask_b32_e32 v16, v17, v18, vcc
; GFX9-NEXT: v_add3_u32 v15, v15, v7, s4
-; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v17
+; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v7
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v7, v7
; GFX9-NEXT: v_cndmask_b32_e32 v7, v15, v17, vcc
; GFX9-NEXT: v_lshlrev_b32_e32 v15, 16, v14
; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v6
; GFX9-NEXT: v_mul_f32_e32 v15, v17, v15
-; GFX9-NEXT: v_bfe_u32 v17, v15, 16, 1
-; GFX9-NEXT: v_and_b32_e32 v18, 0xff800000, v15
; GFX9-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
; GFX9-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
+; GFX9-NEXT: v_bfe_u32 v17, v15, 16, 1
+; GFX9-NEXT: v_mul_f32_e32 v6, v6, v14
; GFX9-NEXT: v_add3_u32 v17, v17, v15, s4
-; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v18
+; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v15
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v15, v15
-; GFX9-NEXT: v_mul_f32_e32 v6, v6, v14
-; GFX9-NEXT: v_cndmask_b32_e32 v15, v17, v18, vcc
; GFX9-NEXT: v_bfe_u32 v14, v6, 16, 1
-; GFX9-NEXT: v_and_b32_e32 v17, 0xff800000, v6
+; GFX9-NEXT: v_cndmask_b32_e32 v15, v17, v18, vcc
; GFX9-NEXT: v_add3_u32 v14, v14, v6, s4
-; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v17
+; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v6
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
; GFX9-NEXT: v_cndmask_b32_e32 v6, v14, v17, vcc
; GFX9-NEXT: v_lshlrev_b32_e32 v14, 16, v13
; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v5
; GFX9-NEXT: v_mul_f32_e32 v14, v17, v14
-; GFX9-NEXT: v_bfe_u32 v17, v14, 16, 1
-; GFX9-NEXT: v_and_b32_e32 v18, 0xff800000, v14
; GFX9-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
; GFX9-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
+; GFX9-NEXT: v_bfe_u32 v17, v14, 16, 1
+; GFX9-NEXT: v_mul_f32_e32 v5, v5, v13
; GFX9-NEXT: v_add3_u32 v17, v17, v14, s4
-; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v18
+; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v14
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v14, v14
-; GFX9-NEXT: v_mul_f32_e32 v5, v5, v13
-; GFX9-NEXT: v_cndmask_b32_e32 v14, v17, v18, vcc
; GFX9-NEXT: v_bfe_u32 v13, v5, 16, 1
-; GFX9-NEXT: v_and_b32_e32 v17, 0xff800000, v5
+; GFX9-NEXT: v_cndmask_b32_e32 v14, v17, v18, vcc
; GFX9-NEXT: v_add3_u32 v13, v13, v5, s4
-; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v17
+; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v5
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
; GFX9-NEXT: v_cndmask_b32_e32 v5, v13, v17, vcc
; GFX9-NEXT: v_lshlrev_b32_e32 v13, 16, v12
; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v4
; GFX9-NEXT: v_mul_f32_e32 v13, v17, v13
-; GFX9-NEXT: v_bfe_u32 v17, v13, 16, 1
-; GFX9-NEXT: v_and_b32_e32 v18, 0xff800000, v13
; GFX9-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
; GFX9-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
+; GFX9-NEXT: v_bfe_u32 v17, v13, 16, 1
+; GFX9-NEXT: v_mul_f32_e32 v4, v4, v12
; GFX9-NEXT: v_add3_u32 v17, v17, v13, s4
-; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v18
+; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v13
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v13, v13
-; GFX9-NEXT: v_mul_f32_e32 v4, v4, v12
-; GFX9-NEXT: v_cndmask_b32_e32 v13, v17, v18, vcc
; GFX9-NEXT: v_bfe_u32 v12, v4, 16, 1
-; GFX9-NEXT: v_and_b32_e32 v17, 0xff800000, v4
+; GFX9-NEXT: v_cndmask_b32_e32 v13, v17, v18, vcc
; GFX9-NEXT: v_add3_u32 v12, v12, v4, s4
-; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v17
+; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v4
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
; GFX9-NEXT: v_cndmask_b32_e32 v4, v12, v17, vcc
; GFX9-NEXT: v_lshlrev_b32_e32 v12, 16, v11
; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v3
; GFX9-NEXT: v_mul_f32_e32 v12, v17, v12
-; GFX9-NEXT: v_bfe_u32 v17, v12, 16, 1
-; GFX9-NEXT: v_and_b32_e32 v18, 0xff800000, v12
; GFX9-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; GFX9-NEXT: v_bfe_u32 v17, v12, 16, 1
+; GFX9-NEXT: v_mul_f32_e32 v3, v3, v11
; GFX9-NEXT: v_add3_u32 v17, v17, v12, s4
-; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v18
+; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v12
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v12, v12
-; GFX9-NEXT: v_mul_f32_e32 v3, v3, v11
-; GFX9-NEXT: v_cndmask_b32_e32 v12, v17, v18, vcc
; GFX9-NEXT: v_bfe_u32 v11, v3, 16, 1
-; GFX9-NEXT: v_and_b32_e32 v17, 0xff800000, v3
+; GFX9-NEXT: v_cndmask_b32_e32 v12, v17, v18, vcc
; GFX9-NEXT: v_add3_u32 v11, v11, v3, s4
-; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v17
+; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v3
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
; GFX9-NEXT: v_cndmask_b32_e32 v3, v11, v17, vcc
; GFX9-NEXT: v_lshlrev_b32_e32 v11, 16, v10
; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v2
; GFX9-NEXT: v_mul_f32_e32 v11, v17, v11
-; GFX9-NEXT: v_bfe_u32 v17, v11, 16, 1
-; GFX9-NEXT: v_and_b32_e32 v18, 0xff800000, v11
; GFX9-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX9-NEXT: v_bfe_u32 v17, v11, 16, 1
+; GFX9-NEXT: v_mul_f32_e32 v2, v2, v10
; GFX9-NEXT: v_add3_u32 v17, v17, v11, s4
-; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v18
+; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v11
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v11, v11
-; GFX9-NEXT: v_mul_f32_e32 v2, v2, v10
-; GFX9-NEXT: v_cndmask_b32_e32 v11, v17, v18, vcc
; GFX9-NEXT: v_bfe_u32 v10, v2, 16, 1
-; GFX9-NEXT: v_and_b32_e32 v17, 0xff800000, v2
+; GFX9-NEXT: v_cndmask_b32_e32 v11, v17, v18, vcc
; GFX9-NEXT: v_add3_u32 v10, v10, v2, s4
-; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v17
+; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v2
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
; GFX9-NEXT: v_cndmask_b32_e32 v2, v10, v17, vcc
; GFX9-NEXT: v_lshlrev_b32_e32 v10, 16, v9
; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v1
; GFX9-NEXT: v_mul_f32_e32 v10, v17, v10
-; GFX9-NEXT: v_bfe_u32 v17, v10, 16, 1
-; GFX9-NEXT: v_and_b32_e32 v18, 0xff800000, v10
; GFX9-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GFX9-NEXT: v_bfe_u32 v17, v10, 16, 1
+; GFX9-NEXT: v_mul_f32_e32 v1, v1, v9
; GFX9-NEXT: v_add3_u32 v17, v17, v10, s4
-; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v18
+; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v10
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v10, v10
-; GFX9-NEXT: v_mul_f32_e32 v1, v1, v9
-; GFX9-NEXT: v_cndmask_b32_e32 v10, v17, v18, vcc
; GFX9-NEXT: v_bfe_u32 v9, v1, 16, 1
-; GFX9-NEXT: v_and_b32_e32 v17, 0xff800000, v1
+; GFX9-NEXT: v_cndmask_b32_e32 v10, v17, v18, vcc
; GFX9-NEXT: v_add3_u32 v9, v9, v1, s4
-; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v17
+; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v1
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
; GFX9-NEXT: v_cndmask_b32_e32 v1, v9, v17, vcc
; GFX9-NEXT: v_lshlrev_b32_e32 v9, 16, v8
; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v0
; GFX9-NEXT: v_mul_f32_e32 v9, v17, v9
-; GFX9-NEXT: v_bfe_u32 v17, v9, 16, 1
-; GFX9-NEXT: v_and_b32_e32 v18, 0xff800000, v9
; GFX9-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX9-NEXT: v_bfe_u32 v17, v9, 16, 1
+; GFX9-NEXT: v_mul_f32_e32 v0, v0, v8
; GFX9-NEXT: v_add3_u32 v17, v17, v9, s4
-; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v18
+; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v9
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v9, v9
-; GFX9-NEXT: v_mul_f32_e32 v0, v0, v8
-; GFX9-NEXT: v_cndmask_b32_e32 v9, v17, v18, vcc
; GFX9-NEXT: v_bfe_u32 v8, v0, 16, 1
-; GFX9-NEXT: v_and_b32_e32 v17, 0xff800000, v0
+; GFX9-NEXT: v_cndmask_b32_e32 v9, v17, v18, vcc
; GFX9-NEXT: v_add3_u32 v8, v8, v0, s4
-; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v17
+; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v0
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
; GFX9-NEXT: v_cndmask_b32_e32 v0, v8, v17, vcc
; GFX9-NEXT: s_mov_b32 s4, 0x7060302
@@ -15081,27 +14817,26 @@ define <16 x bfloat> @v_fmul_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) {
; GFX10-NEXT: v_lshlrev_b32_e32 v17, 16, v7
; GFX10-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
-; GFX10-NEXT: s_mov_b32 s4, 0xff800000
; GFX10-NEXT: v_lshlrev_b32_e32 v18, 16, v6
+; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
; GFX10-NEXT: v_mul_f32_e32 v16, v17, v16
; GFX10-NEXT: v_lshlrev_b32_e32 v17, 16, v14
; GFX10-NEXT: v_mul_f32_e32 v7, v7, v15
; GFX10-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
-; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
; GFX10-NEXT: v_bfe_u32 v15, v16, 16, 1
-; GFX10-NEXT: v_and_or_b32 v20, v16, s4, 0x400000
+; GFX10-NEXT: v_or_b32_e32 v20, 0x400000, v16
; GFX10-NEXT: v_bfe_u32 v19, v7, 16, 1
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v16, v16
; GFX10-NEXT: v_mul_f32_e32 v17, v18, v17
; GFX10-NEXT: v_add3_u32 v15, v15, v16, 0x7fff
; GFX10-NEXT: v_mul_f32_e32 v6, v6, v14
; GFX10-NEXT: v_add3_u32 v18, v19, v7, 0x7fff
-; GFX10-NEXT: v_and_or_b32 v19, v7, s4, 0x400000
+; GFX10-NEXT: v_or_b32_e32 v19, 0x400000, v7
; GFX10-NEXT: v_bfe_u32 v21, v17, 16, 1
; GFX10-NEXT: v_cndmask_b32_e32 v15, v15, v20, vcc_lo
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
; GFX10-NEXT: v_lshlrev_b32_e32 v20, 16, v5
-; GFX10-NEXT: v_and_or_b32 v16, v17, s4, 0x400000
+; GFX10-NEXT: v_or_b32_e32 v16, 0x400000, v17
; GFX10-NEXT: v_add3_u32 v14, v21, v17, 0x7fff
; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
; GFX10-NEXT: v_cndmask_b32_e32 v7, v18, v19, vcc_lo
@@ -15115,7 +14850,7 @@ define <16 x bfloat> @v_fmul_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) {
; GFX10-NEXT: v_mul_f32_e32 v5, v5, v13
; GFX10-NEXT: v_cndmask_b32_e32 v14, v14, v16, vcc_lo
; GFX10-NEXT: v_add3_u32 v16, v18, v6, 0x7fff
-; GFX10-NEXT: v_and_or_b32 v13, v6, s4, 0x400000
+; GFX10-NEXT: v_or_b32_e32 v13, 0x400000, v6
; GFX10-NEXT: v_lshlrev_b32_e32 v18, 16, v12
; GFX10-NEXT: v_bfe_u32 v20, v17, 16, 1
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
@@ -15125,10 +14860,10 @@ define <16 x bfloat> @v_fmul_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) {
; GFX10-NEXT: v_cndmask_b32_e32 v6, v16, v13, vcc_lo
; GFX10-NEXT: v_mul_f32_e32 v13, v19, v18
; GFX10-NEXT: v_add3_u32 v16, v20, v17, 0x7fff
-; GFX10-NEXT: v_and_or_b32 v18, v17, s4, 0x400000
+; GFX10-NEXT: v_or_b32_e32 v18, 0x400000, v17
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17
; GFX10-NEXT: v_add3_u32 v19, v21, v5, 0x7fff
-; GFX10-NEXT: v_and_or_b32 v20, v5, s4, 0x400000
+; GFX10-NEXT: v_or_b32_e32 v20, 0x400000, v5
; GFX10-NEXT: v_bfe_u32 v21, v13, 16, 1
; GFX10-NEXT: v_mul_f32_e32 v4, v4, v12
; GFX10-NEXT: v_cndmask_b32_e32 v16, v16, v18, vcc_lo
@@ -15138,14 +14873,14 @@ define <16 x bfloat> @v_fmul_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) {
; GFX10-NEXT: v_add3_u32 v17, v21, v13, 0x7fff
; GFX10-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
; GFX10-NEXT: v_cndmask_b32_e32 v5, v19, v20, vcc_lo
-; GFX10-NEXT: v_and_or_b32 v19, v13, s4, 0x400000
+; GFX10-NEXT: v_or_b32_e32 v19, 0x400000, v13
; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
; GFX10-NEXT: v_mul_f32_e32 v12, v18, v12
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v13, v13
; GFX10-NEXT: v_bfe_u32 v20, v4, 16, 1
; GFX10-NEXT: v_lshlrev_b32_e32 v18, 16, v10
; GFX10-NEXT: v_mul_f32_e32 v3, v3, v11
-; GFX10-NEXT: v_and_or_b32 v22, v12, s4, 0x400000
+; GFX10-NEXT: v_or_b32_e32 v22, 0x400000, v12
; GFX10-NEXT: v_cndmask_b32_e32 v13, v17, v19, vcc_lo
; GFX10-NEXT: v_bfe_u32 v17, v12, 16, 1
; GFX10-NEXT: v_lshlrev_b32_e32 v19, 16, v2
@@ -15157,12 +14892,12 @@ define <16 x bfloat> @v_fmul_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) {
; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v12, v12
; GFX10-NEXT: v_add3_u32 v19, v20, v3, 0x7fff
-; GFX10-NEXT: v_and_or_b32 v20, v3, s4, 0x400000
+; GFX10-NEXT: v_or_b32_e32 v20, 0x400000, v3
; GFX10-NEXT: v_bfe_u32 v23, v18, 16, 1
; GFX10-NEXT: v_mul_f32_e32 v2, v2, v10
; GFX10-NEXT: v_cndmask_b32_e32 v12, v17, v22, vcc_lo
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
-; GFX10-NEXT: v_and_or_b32 v17, v18, s4, 0x400000
+; GFX10-NEXT: v_or_b32_e32 v17, 0x400000, v18
; GFX10-NEXT: v_add3_u32 v10, v23, v18, 0x7fff
; GFX10-NEXT: v_lshlrev_b32_e32 v22, 16, v1
; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
@@ -15171,8 +14906,8 @@ define <16 x bfloat> @v_fmul_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) {
; GFX10-NEXT: v_lshlrev_b32_e32 v20, 16, v9
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v18, v18
; GFX10-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
-; GFX10-NEXT: v_and_or_b32 v18, v2, s4, 0x400000
-; GFX10-NEXT: v_and_or_b32 v21, v4, s4, 0x400000
+; GFX10-NEXT: v_or_b32_e32 v18, 0x400000, v2
+; GFX10-NEXT: v_or_b32_e32 v21, 0x400000, v4
; GFX10-NEXT: v_perm_b32 v3, v3, v12, 0x7060302
; GFX10-NEXT: v_cndmask_b32_e32 v10, v10, v17, vcc_lo
; GFX10-NEXT: v_add3_u32 v17, v19, v2, 0x7fff
@@ -15184,17 +14919,17 @@ define <16 x bfloat> @v_fmul_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) {
; GFX10-NEXT: v_bfe_u32 v23, v19, 16, 1
; GFX10-NEXT: v_mul_f32_e32 v1, v1, v9
; GFX10-NEXT: v_mul_f32_e32 v9, v22, v20
-; GFX10-NEXT: v_and_or_b32 v22, v19, s4, 0x400000
+; GFX10-NEXT: v_or_b32_e32 v22, 0x400000, v19
; GFX10-NEXT: v_mul_f32_e32 v0, v0, v8
; GFX10-NEXT: v_add3_u32 v20, v23, v19, 0x7fff
; GFX10-NEXT: v_bfe_u32 v8, v1, 16, 1
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v19, v19
; GFX10-NEXT: v_bfe_u32 v23, v9, 16, 1
-; GFX10-NEXT: v_and_or_b32 v24, v9, s4, 0x400000
-; GFX10-NEXT: v_and_or_b32 v25, v0, s4, 0x400000
+; GFX10-NEXT: v_or_b32_e32 v24, 0x400000, v9
+; GFX10-NEXT: v_or_b32_e32 v25, 0x400000, v0
; GFX10-NEXT: v_add3_u32 v8, v8, v1, 0x7fff
; GFX10-NEXT: v_cndmask_b32_e32 v19, v20, v22, vcc_lo
-; GFX10-NEXT: v_and_or_b32 v22, v1, s4, 0x400000
+; GFX10-NEXT: v_or_b32_e32 v22, 0x400000, v1
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
; GFX10-NEXT: v_bfe_u32 v20, v0, 16, 1
; GFX10-NEXT: v_add3_u32 v23, v23, v9, 0x7fff
@@ -15223,12 +14958,11 @@ define <16 x bfloat> @v_fmul_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) {
; GFX11-NEXT: v_lshlrev_b32_e32 v16, 16, v15
; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
; GFX11-NEXT: v_lshlrev_b32_e32 v17, 16, v7
-; GFX11-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
-; GFX11-NEXT: s_mov_b32 s0, 0xff800000
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_dual_mul_f32 v16, v17, v16 :: v_dual_lshlrev_b32 v17, 16, v14
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-NEXT: v_dual_mul_f32 v16, v17, v16 :: v_dual_and_b32 v15, 0xffff0000, v15
+; GFX11-NEXT: v_lshlrev_b32_e32 v17, 16, v14
; GFX11-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
-; GFX11-NEXT: v_and_or_b32 v20, v16, s0, 0x400000
+; GFX11-NEXT: v_or_b32_e32 v20, 0x400000, v16
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX11-NEXT: v_mul_f32_e32 v17, v18, v17
; GFX11-NEXT: v_mul_f32_e32 v6, v6, v14
@@ -15241,13 +14975,13 @@ define <16 x bfloat> @v_fmul_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) {
; GFX11-NEXT: v_mul_f32_e32 v7, v7, v15
; GFX11-NEXT: v_bfe_u32 v15, v16, 16, 1
; GFX11-NEXT: v_add3_u32 v15, v15, v16, 0x7fff
-; GFX11-NEXT: v_and_or_b32 v16, v17, s0, 0x400000
+; GFX11-NEXT: v_or_b32_e32 v16, 0x400000, v17
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_dual_cndmask_b32 v15, v15, v20 :: v_dual_lshlrev_b32 v20, 16, v5
; GFX11-NEXT: v_bfe_u32 v19, v7, 16, 1
; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
; GFX11-NEXT: v_add3_u32 v18, v19, v7, 0x7fff
-; GFX11-NEXT: v_and_or_b32 v19, v7, s0, 0x400000
+; GFX11-NEXT: v_or_b32_e32 v19, 0x400000, v7
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_4)
; GFX11-NEXT: v_cndmask_b32_e32 v7, v18, v19, vcc_lo
; GFX11-NEXT: v_bfe_u32 v18, v6, 16, 1
@@ -15269,32 +15003,32 @@ define <16 x bfloat> @v_fmul_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) {
; GFX11-NEXT: v_lshlrev_b32_e32 v12, 16, v11
; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
; GFX11-NEXT: v_mul_f32_e32 v5, v5, v13
-; GFX11-NEXT: v_and_or_b32 v13, v6, s0, 0x400000
+; GFX11-NEXT: v_or_b32_e32 v13, 0x400000, v6
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_4)
; GFX11-NEXT: v_dual_cndmask_b32 v6, v16, v13 :: v_dual_mul_f32 v13, v19, v18
; GFX11-NEXT: v_add3_u32 v16, v20, v17, 0x7fff
-; GFX11-NEXT: v_and_or_b32 v18, v17, s0, 0x400000
+; GFX11-NEXT: v_or_b32_e32 v18, 0x400000, v17
; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17
; GFX11-NEXT: v_perm_b32 v6, v6, v14, 0x7060302
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_4)
; GFX11-NEXT: v_cndmask_b32_e32 v16, v16, v18, vcc_lo
; GFX11-NEXT: v_lshlrev_b32_e32 v18, 16, v3
; GFX11-NEXT: v_bfe_u32 v21, v5, 16, 1
-; GFX11-NEXT: v_and_or_b32 v20, v5, s0, 0x400000
+; GFX11-NEXT: v_or_b32_e32 v20, 0x400000, v5
; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
; GFX11-NEXT: v_mul_f32_e32 v12, v18, v12
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4)
; GFX11-NEXT: v_add3_u32 v19, v21, v5, 0x7fff
; GFX11-NEXT: v_bfe_u32 v21, v13, 16, 1
; GFX11-NEXT: v_lshlrev_b32_e32 v18, 16, v10
-; GFX11-NEXT: v_and_or_b32 v22, v12, s0, 0x400000
+; GFX11-NEXT: v_or_b32_e32 v22, 0x400000, v12
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX11-NEXT: v_cndmask_b32_e32 v5, v19, v20, vcc_lo
; GFX11-NEXT: v_add3_u32 v17, v21, v13, 0x7fff
-; GFX11-NEXT: v_and_or_b32 v19, v13, s0, 0x400000
+; GFX11-NEXT: v_or_b32_e32 v19, 0x400000, v13
; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v13, v13
; GFX11-NEXT: v_bfe_u32 v20, v4, 16, 1
-; GFX11-NEXT: v_and_or_b32 v21, v4, s0, 0x400000
+; GFX11-NEXT: v_or_b32_e32 v21, 0x400000, v4
; GFX11-NEXT: v_perm_b32 v5, v5, v16, 0x7060302
; GFX11-NEXT: v_cndmask_b32_e32 v13, v17, v19, vcc_lo
; GFX11-NEXT: v_bfe_u32 v17, v12, 16, 1
@@ -15310,7 +15044,7 @@ define <16 x bfloat> @v_fmul_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) {
; GFX11-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
; GFX11-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
; GFX11-NEXT: v_bfe_u32 v23, v18, 16, 1
-; GFX11-NEXT: v_and_or_b32 v17, v18, s0, 0x400000
+; GFX11-NEXT: v_or_b32_e32 v17, 0x400000, v18
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX11-NEXT: v_dual_mul_f32 v2, v2, v10 :: v_dual_and_b32 v1, 0xffff0000, v1
; GFX11-NEXT: v_mul_f32_e32 v3, v3, v11
@@ -15320,13 +15054,13 @@ define <16 x bfloat> @v_fmul_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) {
; GFX11-NEXT: v_bfe_u32 v20, v3, 16, 1
; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
; GFX11-NEXT: v_add3_u32 v19, v20, v3, 0x7fff
-; GFX11-NEXT: v_and_or_b32 v20, v3, s0, 0x400000
+; GFX11-NEXT: v_or_b32_e32 v20, 0x400000, v3
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_cndmask_b32_e32 v3, v19, v20, vcc_lo
; GFX11-NEXT: v_bfe_u32 v19, v2, 16, 1
; GFX11-NEXT: v_lshlrev_b32_e32 v20, 16, v9
; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v18, v18
-; GFX11-NEXT: v_and_or_b32 v18, v2, s0, 0x400000
+; GFX11-NEXT: v_or_b32_e32 v18, 0x400000, v2
; GFX11-NEXT: v_perm_b32 v3, v3, v12, 0x7060302
; GFX11-NEXT: v_cndmask_b32_e32 v10, v10, v17, vcc_lo
; GFX11-NEXT: v_add3_u32 v17, v19, v2, 0x7fff
@@ -15343,13 +15077,13 @@ define <16 x bfloat> @v_fmul_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) {
; GFX11-NEXT: v_mul_f32_e32 v9, v22, v20
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
; GFX11-NEXT: v_add3_u32 v20, v23, v19, 0x7fff
-; GFX11-NEXT: v_and_or_b32 v22, v19, s0, 0x400000
-; GFX11-NEXT: v_and_or_b32 v25, v0, s0, 0x400000
+; GFX11-NEXT: v_or_b32_e32 v22, 0x400000, v19
+; GFX11-NEXT: v_or_b32_e32 v25, 0x400000, v0
; GFX11-NEXT: v_bfe_u32 v8, v1, 16, 1
; GFX11-NEXT: v_bfe_u32 v23, v9, 16, 1
-; GFX11-NEXT: v_and_or_b32 v24, v9, s0, 0x400000
+; GFX11-NEXT: v_or_b32_e32 v24, 0x400000, v9
; GFX11-NEXT: v_cndmask_b32_e32 v19, v20, v22, vcc_lo
-; GFX11-NEXT: v_and_or_b32 v22, v1, s0, 0x400000
+; GFX11-NEXT: v_or_b32_e32 v22, 0x400000, v1
; GFX11-NEXT: v_add3_u32 v8, v8, v1, 0x7fff
; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
; GFX11-NEXT: v_bfe_u32 v20, v0, 16, 1
@@ -15916,16 +15650,14 @@ define <32 x bfloat> @v_fmul_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) {
; GFX8-NEXT: v_and_b32_e32 v30, 0xffff0000, v30
; GFX8-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
; GFX8-NEXT: v_add_u32_e32 v32, vcc, s4, v32
-; GFX8-NEXT: v_and_b32_e32 v33, 0xff800000, v31
; GFX8-NEXT: v_mul_f32_e32 v14, v14, v30
-; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v33
+; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v31
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v31, v31
; GFX8-NEXT: v_bfe_u32 v30, v14, 16, 1
; GFX8-NEXT: v_cndmask_b32_e32 v31, v32, v33, vcc
; GFX8-NEXT: v_add_u32_e32 v30, vcc, v30, v14
; GFX8-NEXT: v_add_u32_e32 v30, vcc, s4, v30
-; GFX8-NEXT: v_and_b32_e32 v32, 0xff800000, v14
-; GFX8-NEXT: v_or_b32_e32 v32, 0x400000, v32
+; GFX8-NEXT: v_or_b32_e32 v32, 0x400000, v14
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v14, v14
; GFX8-NEXT: v_cndmask_b32_e32 v14, v30, v32, vcc
; GFX8-NEXT: v_lshlrev_b32_e32 v30, 16, v29
@@ -15947,29 +15679,25 @@ define <32 x bfloat> @v_fmul_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) {
; GFX8-NEXT: v_mul_f32_e32 v30, v15, v30
; GFX8-NEXT: v_bfe_u32 v15, v33, 16, 1
; GFX8-NEXT: v_add_u32_e32 v15, vcc, v15, v33
-; GFX8-NEXT: v_and_b32_e32 v34, 0xff800000, v33
; GFX8-NEXT: v_add_u32_e32 v15, vcc, s4, v15
-; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v34
+; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v33
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v33, v33
; GFX8-NEXT: v_bfe_u32 v33, v30, 16, 1
; GFX8-NEXT: v_cndmask_b32_e32 v15, v15, v34, vcc
; GFX8-NEXT: v_add_u32_e32 v33, vcc, v33, v30
-; GFX8-NEXT: v_and_b32_e32 v34, 0xff800000, v30
; GFX8-NEXT: v_add_u32_e32 v33, vcc, s4, v33
-; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v34
+; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v30
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v30, v30
; GFX8-NEXT: v_cndmask_b32_e32 v30, v33, v34, vcc
; GFX8-NEXT: v_bfe_u32 v33, v32, 16, 1
; GFX8-NEXT: v_add_u32_e32 v33, vcc, v33, v32
; GFX8-NEXT: v_add_u32_e32 v33, vcc, s4, v33
-; GFX8-NEXT: v_and_b32_e32 v34, 0xff800000, v32
-; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v34
+; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v32
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v32, v32
; GFX8-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc
; GFX8-NEXT: v_add_u32_e32 v29, vcc, v29, v13
; GFX8-NEXT: v_add_u32_e32 v29, vcc, s4, v29
-; GFX8-NEXT: v_and_b32_e32 v33, 0xff800000, v13
-; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v33
+; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v13
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v13, v13
; GFX8-NEXT: v_cndmask_b32_e32 v13, v29, v33, vcc
; GFX8-NEXT: v_lshlrev_b32_e32 v29, 16, v28
@@ -15980,16 +15708,14 @@ define <32 x bfloat> @v_fmul_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) {
; GFX8-NEXT: v_and_b32_e32 v28, 0xffff0000, v28
; GFX8-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
; GFX8-NEXT: v_add_u32_e32 v33, vcc, s4, v33
-; GFX8-NEXT: v_and_b32_e32 v34, 0xff800000, v29
; GFX8-NEXT: v_mul_f32_e32 v12, v12, v28
-; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v34
+; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v29
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v29, v29
; GFX8-NEXT: v_bfe_u32 v28, v12, 16, 1
; GFX8-NEXT: v_cndmask_b32_e32 v29, v33, v34, vcc
; GFX8-NEXT: v_add_u32_e32 v28, vcc, v28, v12
; GFX8-NEXT: v_add_u32_e32 v28, vcc, s4, v28
-; GFX8-NEXT: v_and_b32_e32 v33, 0xff800000, v12
-; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v33
+; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v12
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v12, v12
; GFX8-NEXT: v_cndmask_b32_e32 v12, v28, v33, vcc
; GFX8-NEXT: v_lshlrev_b32_e32 v28, 16, v27
@@ -16000,16 +15726,14 @@ define <32 x bfloat> @v_fmul_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) {
; GFX8-NEXT: v_and_b32_e32 v27, 0xffff0000, v27
; GFX8-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
; GFX8-NEXT: v_add_u32_e32 v33, vcc, s4, v33
-; GFX8-NEXT: v_and_b32_e32 v34, 0xff800000, v28
; GFX8-NEXT: v_mul_f32_e32 v11, v11, v27
-; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v34
+; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v28
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v28, v28
; GFX8-NEXT: v_bfe_u32 v27, v11, 16, 1
; GFX8-NEXT: v_cndmask_b32_e32 v28, v33, v34, vcc
; GFX8-NEXT: v_add_u32_e32 v27, vcc, v27, v11
; GFX8-NEXT: v_add_u32_e32 v27, vcc, s4, v27
-; GFX8-NEXT: v_and_b32_e32 v33, 0xff800000, v11
-; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v33
+; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v11
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v11, v11
; GFX8-NEXT: v_cndmask_b32_e32 v11, v27, v33, vcc
; GFX8-NEXT: v_lshlrev_b32_e32 v27, 16, v26
@@ -16020,16 +15744,14 @@ define <32 x bfloat> @v_fmul_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) {
; GFX8-NEXT: v_and_b32_e32 v26, 0xffff0000, v26
; GFX8-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
; GFX8-NEXT: v_add_u32_e32 v33, vcc, s4, v33
-; GFX8-NEXT: v_and_b32_e32 v34, 0xff800000, v27
; GFX8-NEXT: v_mul_f32_e32 v10, v10, v26
-; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v34
+; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v27
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v27, v27
; GFX8-NEXT: v_bfe_u32 v26, v10, 16, 1
; GFX8-NEXT: v_cndmask_b32_e32 v27, v33, v34, vcc
; GFX8-NEXT: v_add_u32_e32 v26, vcc, v26, v10
; GFX8-NEXT: v_add_u32_e32 v26, vcc, s4, v26
-; GFX8-NEXT: v_and_b32_e32 v33, 0xff800000, v10
-; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v33
+; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v10
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v10, v10
; GFX8-NEXT: v_cndmask_b32_e32 v10, v26, v33, vcc
; GFX8-NEXT: v_lshlrev_b32_e32 v26, 16, v25
@@ -16040,16 +15762,14 @@ define <32 x bfloat> @v_fmul_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) {
; GFX8-NEXT: v_and_b32_e32 v25, 0xffff0000, v25
; GFX8-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
; GFX8-NEXT: v_add_u32_e32 v33, vcc, s4, v33
-; GFX8-NEXT: v_and_b32_e32 v34, 0xff800000, v26
; GFX8-NEXT: v_mul_f32_e32 v9, v9, v25
-; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v34
+; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v26
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v26, v26
; GFX8-NEXT: v_bfe_u32 v25, v9, 16, 1
; GFX8-NEXT: v_cndmask_b32_e32 v26, v33, v34, vcc
; GFX8-NEXT: v_add_u32_e32 v25, vcc, v25, v9
; GFX8-NEXT: v_add_u32_e32 v25, vcc, s4, v25
-; GFX8-NEXT: v_and_b32_e32 v33, 0xff800000, v9
-; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v33
+; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v9
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v9, v9
; GFX8-NEXT: v_cndmask_b32_e32 v9, v25, v33, vcc
; GFX8-NEXT: v_lshlrev_b32_e32 v25, 16, v24
@@ -16060,16 +15780,14 @@ define <32 x bfloat> @v_fmul_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) {
; GFX8-NEXT: v_and_b32_e32 v24, 0xffff0000, v24
; GFX8-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
; GFX8-NEXT: v_add_u32_e32 v33, vcc, s4, v33
-; GFX8-NEXT: v_and_b32_e32 v34, 0xff800000, v25
; GFX8-NEXT: v_mul_f32_e32 v8, v8, v24
-; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v34
+; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v25
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v25, v25
; GFX8-NEXT: v_bfe_u32 v24, v8, 16, 1
; GFX8-NEXT: v_cndmask_b32_e32 v25, v33, v34, vcc
; GFX8-NEXT: v_add_u32_e32 v24, vcc, v24, v8
; GFX8-NEXT: v_add_u32_e32 v24, vcc, s4, v24
-; GFX8-NEXT: v_and_b32_e32 v33, 0xff800000, v8
-; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v33
+; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v8
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v8, v8
; GFX8-NEXT: v_cndmask_b32_e32 v8, v24, v33, vcc
; GFX8-NEXT: v_lshlrev_b32_e32 v24, 16, v23
@@ -16080,16 +15798,14 @@ define <32 x bfloat> @v_fmul_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) {
; GFX8-NEXT: v_and_b32_e32 v23, 0xffff0000, v23
; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
; GFX8-NEXT: v_add_u32_e32 v33, vcc, s4, v33
-; GFX8-NEXT: v_and_b32_e32 v34, 0xff800000, v24
; GFX8-NEXT: v_mul_f32_e32 v7, v7, v23
-; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v34
+; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v24
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v24, v24
; GFX8-NEXT: v_bfe_u32 v23, v7, 16, 1
; GFX8-NEXT: v_cndmask_b32_e32 v24, v33, v34, vcc
; GFX8-NEXT: v_add_u32_e32 v23, vcc, v23, v7
; GFX8-NEXT: v_add_u32_e32 v23, vcc, s4, v23
-; GFX8-NEXT: v_and_b32_e32 v33, 0xff800000, v7
-; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v33
+; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v7
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v7, v7
; GFX8-NEXT: v_cndmask_b32_e32 v7, v23, v33, vcc
; GFX8-NEXT: v_lshlrev_b32_e32 v23, 16, v22
@@ -16100,16 +15816,14 @@ define <32 x bfloat> @v_fmul_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) {
; GFX8-NEXT: v_and_b32_e32 v22, 0xffff0000, v22
; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
; GFX8-NEXT: v_add_u32_e32 v33, vcc, s4, v33
-; GFX8-NEXT: v_and_b32_e32 v34, 0xff800000, v23
; GFX8-NEXT: v_mul_f32_e32 v6, v6, v22
-; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v34
+; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v23
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v23, v23
; GFX8-NEXT: v_bfe_u32 v22, v6, 16, 1
; GFX8-NEXT: v_cndmask_b32_e32 v23, v33, v34, vcc
; GFX8-NEXT: v_add_u32_e32 v22, vcc, v22, v6
; GFX8-NEXT: v_add_u32_e32 v22, vcc, s4, v22
-; GFX8-NEXT: v_and_b32_e32 v33, 0xff800000, v6
-; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v33
+; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v6
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
; GFX8-NEXT: v_cndmask_b32_e32 v6, v22, v33, vcc
; GFX8-NEXT: v_lshlrev_b32_e32 v22, 16, v21
@@ -16120,16 +15834,14 @@ define <32 x bfloat> @v_fmul_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) {
; GFX8-NEXT: v_and_b32_e32 v21, 0xffff0000, v21
; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
; GFX8-NEXT: v_add_u32_e32 v33, vcc, s4, v33
-; GFX8-NEXT: v_and_b32_e32 v34, 0xff800000, v22
; GFX8-NEXT: v_mul_f32_e32 v5, v5, v21
-; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v34
+; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v22
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v22, v22
; GFX8-NEXT: v_bfe_u32 v21, v5, 16, 1
; GFX8-NEXT: v_cndmask_b32_e32 v22, v33, v34, vcc
; GFX8-NEXT: v_add_u32_e32 v21, vcc, v21, v5
; GFX8-NEXT: v_add_u32_e32 v21, vcc, s4, v21
-; GFX8-NEXT: v_and_b32_e32 v33, 0xff800000, v5
-; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v33
+; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v5
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
; GFX8-NEXT: v_cndmask_b32_e32 v5, v21, v33, vcc
; GFX8-NEXT: v_lshlrev_b32_e32 v21, 16, v20
@@ -16140,16 +15852,14 @@ define <32 x bfloat> @v_fmul_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) {
; GFX8-NEXT: v_and_b32_e32 v20, 0xffff0000, v20
; GFX8-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
; GFX8-NEXT: v_add_u32_e32 v33, vcc, s4, v33
-; GFX8-NEXT: v_and_b32_e32 v34, 0xff800000, v21
; GFX8-NEXT: v_mul_f32_e32 v4, v4, v20
-; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v34
+; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v21
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v21, v21
; GFX8-NEXT: v_bfe_u32 v20, v4, 16, 1
; GFX8-NEXT: v_cndmask_b32_e32 v21, v33, v34, vcc
; GFX8-NEXT: v_add_u32_e32 v20, vcc, v20, v4
; GFX8-NEXT: v_add_u32_e32 v20, vcc, s4, v20
-; GFX8-NEXT: v_and_b32_e32 v33, 0xff800000, v4
-; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v33
+; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v4
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
; GFX8-NEXT: v_cndmask_b32_e32 v4, v20, v33, vcc
; GFX8-NEXT: v_lshlrev_b32_e32 v20, 16, v19
@@ -16160,16 +15870,14 @@ define <32 x bfloat> @v_fmul_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) {
; GFX8-NEXT: v_and_b32_e32 v19, 0xffff0000, v19
; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
; GFX8-NEXT: v_add_u32_e32 v33, vcc, s4, v33
-; GFX8-NEXT: v_and_b32_e32 v34, 0xff800000, v20
; GFX8-NEXT: v_mul_f32_e32 v3, v3, v19
-; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v34
+; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v20
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v20, v20
; GFX8-NEXT: v_bfe_u32 v19, v3, 16, 1
; GFX8-NEXT: v_cndmask_b32_e32 v20, v33, v34, vcc
; GFX8-NEXT: v_add_u32_e32 v19, vcc, v19, v3
; GFX8-NEXT: v_add_u32_e32 v19, vcc, s4, v19
-; GFX8-NEXT: v_and_b32_e32 v33, 0xff800000, v3
-; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v33
+; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v3
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
; GFX8-NEXT: v_cndmask_b32_e32 v3, v19, v33, vcc
; GFX8-NEXT: v_lshlrev_b32_e32 v19, 16, v18
@@ -16180,16 +15888,14 @@ define <32 x bfloat> @v_fmul_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) {
; GFX8-NEXT: v_and_b32_e32 v18, 0xffff0000, v18
; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX8-NEXT: v_add_u32_e32 v33, vcc, s4, v33
-; GFX8-NEXT: v_and_b32_e32 v34, 0xff800000, v19
; GFX8-NEXT: v_mul_f32_e32 v2, v2, v18
-; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v34
+; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v19
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v19, v19
; GFX8-NEXT: v_bfe_u32 v18, v2, 16, 1
; GFX8-NEXT: v_cndmask_b32_e32 v19, v33, v34, vcc
; GFX8-NEXT: v_add_u32_e32 v18, vcc, v18, v2
; GFX8-NEXT: v_add_u32_e32 v18, vcc, s4, v18
-; GFX8-NEXT: v_and_b32_e32 v33, 0xff800000, v2
-; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v33
+; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v2
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
; GFX8-NEXT: v_cndmask_b32_e32 v2, v18, v33, vcc
; GFX8-NEXT: v_lshlrev_b32_e32 v18, 16, v17
@@ -16200,16 +15906,14 @@ define <32 x bfloat> @v_fmul_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) {
; GFX8-NEXT: v_and_b32_e32 v17, 0xffff0000, v17
; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX8-NEXT: v_add_u32_e32 v33, vcc, s4, v33
-; GFX8-NEXT: v_and_b32_e32 v34, 0xff800000, v18
; GFX8-NEXT: v_mul_f32_e32 v1, v1, v17
-; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v34
+; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v18
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v18, v18
; GFX8-NEXT: v_bfe_u32 v17, v1, 16, 1
; GFX8-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc
; GFX8-NEXT: v_add_u32_e32 v17, vcc, v17, v1
; GFX8-NEXT: v_add_u32_e32 v17, vcc, s4, v17
-; GFX8-NEXT: v_and_b32_e32 v33, 0xff800000, v1
-; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v33
+; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v1
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
; GFX8-NEXT: v_cndmask_b32_e32 v1, v17, v33, vcc
; GFX8-NEXT: v_lshlrev_b32_e32 v17, 16, v16
@@ -16220,16 +15924,14 @@ define <32 x bfloat> @v_fmul_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) {
; GFX8-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX8-NEXT: v_add_u32_e32 v33, vcc, s4, v33
-; GFX8-NEXT: v_and_b32_e32 v34, 0xff800000, v17
; GFX8-NEXT: v_mul_f32_e32 v0, v0, v16
-; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v34
+; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v17
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v17, v17
; GFX8-NEXT: v_bfe_u32 v16, v0, 16, 1
; GFX8-NEXT: v_cndmask_b32_e32 v17, v33, v34, vcc
; GFX8-NEXT: v_add_u32_e32 v16, vcc, v16, v0
; GFX8-NEXT: v_add_u32_e32 v16, vcc, s4, v16
-; GFX8-NEXT: v_and_b32_e32 v33, 0xff800000, v0
-; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v33
+; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v0
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
; GFX8-NEXT: v_cndmask_b32_e32 v0, v16, v33, vcc
; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
@@ -16270,292 +15972,260 @@ define <32 x bfloat> @v_fmul_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) {
; GFX9-NEXT: v_lshlrev_b32_e32 v31, 16, v30
; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v14
; GFX9-NEXT: v_mul_f32_e32 v31, v32, v31
-; GFX9-NEXT: s_movk_i32 s4, 0x7fff
-; GFX9-NEXT: v_bfe_u32 v32, v31, 16, 1
-; GFX9-NEXT: v_and_b32_e32 v33, 0xff800000, v31
; GFX9-NEXT: v_and_b32_e32 v30, 0xffff0000, v30
; GFX9-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
+; GFX9-NEXT: s_movk_i32 s4, 0x7fff
+; GFX9-NEXT: v_bfe_u32 v32, v31, 16, 1
+; GFX9-NEXT: v_mul_f32_e32 v14, v14, v30
; GFX9-NEXT: v_add3_u32 v32, v32, v31, s4
-; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v33
+; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v31
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v31, v31
-; GFX9-NEXT: v_mul_f32_e32 v14, v14, v30
-; GFX9-NEXT: v_cndmask_b32_e32 v31, v32, v33, vcc
; GFX9-NEXT: v_bfe_u32 v30, v14, 16, 1
-; GFX9-NEXT: v_and_b32_e32 v32, 0xff800000, v14
+; GFX9-NEXT: v_cndmask_b32_e32 v31, v32, v33, vcc
; GFX9-NEXT: v_add3_u32 v30, v30, v14, s4
-; GFX9-NEXT: v_or_b32_e32 v32, 0x400000, v32
+; GFX9-NEXT: v_or_b32_e32 v32, 0x400000, v14
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v14, v14
; GFX9-NEXT: v_cndmask_b32_e32 v14, v30, v32, vcc
; GFX9-NEXT: v_lshlrev_b32_e32 v30, 16, v29
; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v13
+; GFX9-NEXT: v_mul_f32_e32 v30, v32, v30
; GFX9-NEXT: v_and_b32_e32 v29, 0xffff0000, v29
; GFX9-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
-; GFX9-NEXT: v_mul_f32_e32 v13, v13, v29
-; GFX9-NEXT: buffer_load_dword v29, off, s[0:3], s32
-; GFX9-NEXT: v_mul_f32_e32 v30, v32, v30
; GFX9-NEXT: v_bfe_u32 v32, v30, 16, 1
-; GFX9-NEXT: v_and_b32_e32 v33, 0xff800000, v30
+; GFX9-NEXT: v_mul_f32_e32 v13, v13, v29
; GFX9-NEXT: v_add3_u32 v32, v32, v30, s4
-; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v33
+; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v30
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v30, v30
+; GFX9-NEXT: v_bfe_u32 v29, v13, 16, 1
; GFX9-NEXT: v_cndmask_b32_e32 v30, v32, v33, vcc
-; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v15
+; GFX9-NEXT: v_add3_u32 v29, v29, v13, s4
+; GFX9-NEXT: v_or_b32_e32 v32, 0x400000, v13
+; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v13, v13
+; GFX9-NEXT: v_cndmask_b32_e32 v13, v29, v32, vcc
+; GFX9-NEXT: v_lshlrev_b32_e32 v29, 16, v28
+; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v12
+; GFX9-NEXT: v_mul_f32_e32 v32, v32, v29
+; GFX9-NEXT: buffer_load_dword v29, off, s[0:3], s32
+; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v15
; GFX9-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
+; GFX9-NEXT: v_and_b32_e32 v28, 0xffff0000, v28
+; GFX9-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
+; GFX9-NEXT: v_mul_f32_e32 v12, v12, v28
+; GFX9-NEXT: v_bfe_u32 v28, v12, 16, 1
+; GFX9-NEXT: v_add3_u32 v28, v28, v12, s4
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v29
-; GFX9-NEXT: v_mul_f32_e32 v32, v32, v33
+; GFX9-NEXT: v_lshlrev_b32_e32 v34, 16, v29
+; GFX9-NEXT: v_mul_f32_e32 v33, v33, v34
; GFX9-NEXT: v_and_b32_e32 v29, 0xffff0000, v29
; GFX9-NEXT: v_mul_f32_e32 v29, v15, v29
-; GFX9-NEXT: v_bfe_u32 v15, v32, 16, 1
-; GFX9-NEXT: v_and_b32_e32 v33, 0xff800000, v32
-; GFX9-NEXT: v_add3_u32 v15, v15, v32, s4
-; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v33
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32
-; GFX9-NEXT: v_cndmask_b32_e32 v15, v15, v33, vcc
-; GFX9-NEXT: v_bfe_u32 v32, v29, 16, 1
-; GFX9-NEXT: v_and_b32_e32 v33, 0xff800000, v29
-; GFX9-NEXT: v_add3_u32 v32, v32, v29, s4
-; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v33
+; GFX9-NEXT: v_bfe_u32 v15, v33, 16, 1
+; GFX9-NEXT: v_add3_u32 v15, v15, v33, s4
+; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v33
+; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v33, v33
+; GFX9-NEXT: v_bfe_u32 v33, v29, 16, 1
+; GFX9-NEXT: v_cndmask_b32_e32 v15, v15, v34, vcc
+; GFX9-NEXT: v_add3_u32 v33, v33, v29, s4
+; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v29
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v29, v29
-; GFX9-NEXT: v_cndmask_b32_e32 v29, v32, v33, vcc
-; GFX9-NEXT: v_bfe_u32 v32, v13, 16, 1
-; GFX9-NEXT: v_and_b32_e32 v33, 0xff800000, v13
-; GFX9-NEXT: v_add3_u32 v32, v32, v13, s4
-; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v33
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v13, v13
-; GFX9-NEXT: v_cndmask_b32_e32 v13, v32, v33, vcc
-; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v28
-; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v12
-; GFX9-NEXT: v_mul_f32_e32 v32, v33, v32
+; GFX9-NEXT: v_cndmask_b32_e32 v29, v33, v34, vcc
; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1
-; GFX9-NEXT: v_and_b32_e32 v34, 0xff800000, v32
-; GFX9-NEXT: v_and_b32_e32 v28, 0xffff0000, v28
-; GFX9-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
; GFX9-NEXT: v_add3_u32 v33, v33, v32, s4
-; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v34
+; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v32
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32
-; GFX9-NEXT: v_mul_f32_e32 v12, v12, v28
; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc
-; GFX9-NEXT: v_bfe_u32 v28, v12, 16, 1
-; GFX9-NEXT: v_and_b32_e32 v33, 0xff800000, v12
-; GFX9-NEXT: v_add3_u32 v28, v28, v12, s4
-; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v33
+; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v12
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v12, v12
; GFX9-NEXT: v_cndmask_b32_e32 v12, v28, v33, vcc
; GFX9-NEXT: v_lshlrev_b32_e32 v28, 16, v27
; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v11
; GFX9-NEXT: v_mul_f32_e32 v28, v33, v28
-; GFX9-NEXT: v_bfe_u32 v33, v28, 16, 1
-; GFX9-NEXT: v_and_b32_e32 v34, 0xff800000, v28
; GFX9-NEXT: v_and_b32_e32 v27, 0xffff0000, v27
; GFX9-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
+; GFX9-NEXT: v_bfe_u32 v33, v28, 16, 1
+; GFX9-NEXT: v_mul_f32_e32 v11, v11, v27
; GFX9-NEXT: v_add3_u32 v33, v33, v28, s4
-; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v34
+; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v28
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v28, v28
-; GFX9-NEXT: v_mul_f32_e32 v11, v11, v27
-; GFX9-NEXT: v_cndmask_b32_e32 v28, v33, v34, vcc
; GFX9-NEXT: v_bfe_u32 v27, v11, 16, 1
-; GFX9-NEXT: v_and_b32_e32 v33, 0xff800000, v11
+; GFX9-NEXT: v_cndmask_b32_e32 v28, v33, v34, vcc
; GFX9-NEXT: v_add3_u32 v27, v27, v11, s4
-; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v33
+; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v11
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v11, v11
; GFX9-NEXT: v_cndmask_b32_e32 v11, v27, v33, vcc
; GFX9-NEXT: v_lshlrev_b32_e32 v27, 16, v26
; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v10
; GFX9-NEXT: v_mul_f32_e32 v27, v33, v27
-; GFX9-NEXT: v_bfe_u32 v33, v27, 16, 1
-; GFX9-NEXT: v_and_b32_e32 v34, 0xff800000, v27
; GFX9-NEXT: v_and_b32_e32 v26, 0xffff0000, v26
; GFX9-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
+; GFX9-NEXT: v_bfe_u32 v33, v27, 16, 1
+; GFX9-NEXT: v_mul_f32_e32 v10, v10, v26
; GFX9-NEXT: v_add3_u32 v33, v33, v27, s4
-; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v34
+; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v27
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v27, v27
-; GFX9-NEXT: v_mul_f32_e32 v10, v10, v26
-; GFX9-NEXT: v_cndmask_b32_e32 v27, v33, v34, vcc
; GFX9-NEXT: v_bfe_u32 v26, v10, 16, 1
-; GFX9-NEXT: v_and_b32_e32 v33, 0xff800000, v10
+; GFX9-NEXT: v_cndmask_b32_e32 v27, v33, v34, vcc
; GFX9-NEXT: v_add3_u32 v26, v26, v10, s4
-; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v33
+; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v10
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v10, v10
; GFX9-NEXT: v_cndmask_b32_e32 v10, v26, v33, vcc
; GFX9-NEXT: v_lshlrev_b32_e32 v26, 16, v25
; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v9
; GFX9-NEXT: v_mul_f32_e32 v26, v33, v26
-; GFX9-NEXT: v_bfe_u32 v33, v26, 16, 1
-; GFX9-NEXT: v_and_b32_e32 v34, 0xff800000, v26
; GFX9-NEXT: v_and_b32_e32 v25, 0xffff0000, v25
; GFX9-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
+; GFX9-NEXT: v_bfe_u32 v33, v26, 16, 1
+; GFX9-NEXT: v_mul_f32_e32 v9, v9, v25
; GFX9-NEXT: v_add3_u32 v33, v33, v26, s4
-; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v34
+; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v26
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v26, v26
-; GFX9-NEXT: v_mul_f32_e32 v9, v9, v25
-; GFX9-NEXT: v_cndmask_b32_e32 v26, v33, v34, vcc
; GFX9-NEXT: v_bfe_u32 v25, v9, 16, 1
-; GFX9-NEXT: v_and_b32_e32 v33, 0xff800000, v9
+; GFX9-NEXT: v_cndmask_b32_e32 v26, v33, v34, vcc
; GFX9-NEXT: v_add3_u32 v25, v25, v9, s4
-; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v33
+; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v9
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v9, v9
; GFX9-NEXT: v_cndmask_b32_e32 v9, v25, v33, vcc
; GFX9-NEXT: v_lshlrev_b32_e32 v25, 16, v24
; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v8
; GFX9-NEXT: v_mul_f32_e32 v25, v33, v25
-; GFX9-NEXT: v_bfe_u32 v33, v25, 16, 1
-; GFX9-NEXT: v_and_b32_e32 v34, 0xff800000, v25
; GFX9-NEXT: v_and_b32_e32 v24, 0xffff0000, v24
; GFX9-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
+; GFX9-NEXT: v_bfe_u32 v33, v25, 16, 1
+; GFX9-NEXT: v_mul_f32_e32 v8, v8, v24
; GFX9-NEXT: v_add3_u32 v33, v33, v25, s4
-; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v34
+; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v25
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v25, v25
-; GFX9-NEXT: v_mul_f32_e32 v8, v8, v24
-; GFX9-NEXT: v_cndmask_b32_e32 v25, v33, v34, vcc
; GFX9-NEXT: v_bfe_u32 v24, v8, 16, 1
-; GFX9-NEXT: v_and_b32_e32 v33, 0xff800000, v8
+; GFX9-NEXT: v_cndmask_b32_e32 v25, v33, v34, vcc
; GFX9-NEXT: v_add3_u32 v24, v24, v8, s4
-; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v33
+; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v8
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v8, v8
; GFX9-NEXT: v_cndmask_b32_e32 v8, v24, v33, vcc
; GFX9-NEXT: v_lshlrev_b32_e32 v24, 16, v23
; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v7
; GFX9-NEXT: v_mul_f32_e32 v24, v33, v24
-; GFX9-NEXT: v_bfe_u32 v33, v24, 16, 1
-; GFX9-NEXT: v_and_b32_e32 v34, 0xff800000, v24
; GFX9-NEXT: v_and_b32_e32 v23, 0xffff0000, v23
; GFX9-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
+; GFX9-NEXT: v_bfe_u32 v33, v24, 16, 1
+; GFX9-NEXT: v_mul_f32_e32 v7, v7, v23
; GFX9-NEXT: v_add3_u32 v33, v33, v24, s4
-; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v34
+; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v24
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v24, v24
-; GFX9-NEXT: v_mul_f32_e32 v7, v7, v23
-; GFX9-NEXT: v_cndmask_b32_e32 v24, v33, v34, vcc
; GFX9-NEXT: v_bfe_u32 v23, v7, 16, 1
-; GFX9-NEXT: v_and_b32_e32 v33, 0xff800000, v7
+; GFX9-NEXT: v_cndmask_b32_e32 v24, v33, v34, vcc
; GFX9-NEXT: v_add3_u32 v23, v23, v7, s4
-; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v33
+; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v7
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v7, v7
; GFX9-NEXT: v_cndmask_b32_e32 v7, v23, v33, vcc
; GFX9-NEXT: v_lshlrev_b32_e32 v23, 16, v22
; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v6
; GFX9-NEXT: v_mul_f32_e32 v23, v33, v23
-; GFX9-NEXT: v_bfe_u32 v33, v23, 16, 1
-; GFX9-NEXT: v_and_b32_e32 v34, 0xff800000, v23
; GFX9-NEXT: v_and_b32_e32 v22, 0xffff0000, v22
; GFX9-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
+; GFX9-NEXT: v_bfe_u32 v33, v23, 16, 1
+; GFX9-NEXT: v_mul_f32_e32 v6, v6, v22
; GFX9-NEXT: v_add3_u32 v33, v33, v23, s4
-; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v34
+; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v23
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v23, v23
-; GFX9-NEXT: v_mul_f32_e32 v6, v6, v22
-; GFX9-NEXT: v_cndmask_b32_e32 v23, v33, v34, vcc
; GFX9-NEXT: v_bfe_u32 v22, v6, 16, 1
-; GFX9-NEXT: v_and_b32_e32 v33, 0xff800000, v6
+; GFX9-NEXT: v_cndmask_b32_e32 v23, v33, v34, vcc
; GFX9-NEXT: v_add3_u32 v22, v22, v6, s4
-; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v33
+; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v6
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
; GFX9-NEXT: v_cndmask_b32_e32 v6, v22, v33, vcc
; GFX9-NEXT: v_lshlrev_b32_e32 v22, 16, v21
; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v5
; GFX9-NEXT: v_mul_f32_e32 v22, v33, v22
-; GFX9-NEXT: v_bfe_u32 v33, v22, 16, 1
-; GFX9-NEXT: v_and_b32_e32 v34, 0xff800000, v22
; GFX9-NEXT: v_and_b32_e32 v21, 0xffff0000, v21
; GFX9-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
+; GFX9-NEXT: v_bfe_u32 v33, v22, 16, 1
+; GFX9-NEXT: v_mul_f32_e32 v5, v5, v21
; GFX9-NEXT: v_add3_u32 v33, v33, v22, s4
-; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v34
+; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v22
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v22, v22
-; GFX9-NEXT: v_mul_f32_e32 v5, v5, v21
-; GFX9-NEXT: v_cndmask_b32_e32 v22, v33, v34, vcc
; GFX9-NEXT: v_bfe_u32 v21, v5, 16, 1
-; GFX9-NEXT: v_and_b32_e32 v33, 0xff800000, v5
+; GFX9-NEXT: v_cndmask_b32_e32 v22, v33, v34, vcc
; GFX9-NEXT: v_add3_u32 v21, v21, v5, s4
-; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v33
+; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v5
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
; GFX9-NEXT: v_cndmask_b32_e32 v5, v21, v33, vcc
; GFX9-NEXT: v_lshlrev_b32_e32 v21, 16, v20
; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v4
; GFX9-NEXT: v_mul_f32_e32 v21, v33, v21
-; GFX9-NEXT: v_bfe_u32 v33, v21, 16, 1
-; GFX9-NEXT: v_and_b32_e32 v34, 0xff800000, v21
; GFX9-NEXT: v_and_b32_e32 v20, 0xffff0000, v20
; GFX9-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
+; GFX9-NEXT: v_bfe_u32 v33, v21, 16, 1
+; GFX9-NEXT: v_mul_f32_e32 v4, v4, v20
; GFX9-NEXT: v_add3_u32 v33, v33, v21, s4
-; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v34
+; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v21
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v21, v21
-; GFX9-NEXT: v_mul_f32_e32 v4, v4, v20
-; GFX9-NEXT: v_cndmask_b32_e32 v21, v33, v34, vcc
; GFX9-NEXT: v_bfe_u32 v20, v4, 16, 1
-; GFX9-NEXT: v_and_b32_e32 v33, 0xff800000, v4
+; GFX9-NEXT: v_cndmask_b32_e32 v21, v33, v34, vcc
; GFX9-NEXT: v_add3_u32 v20, v20, v4, s4
-; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v33
+; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v4
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
; GFX9-NEXT: v_cndmask_b32_e32 v4, v20, v33, vcc
; GFX9-NEXT: v_lshlrev_b32_e32 v20, 16, v19
; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v3
; GFX9-NEXT: v_mul_f32_e32 v20, v33, v20
-; GFX9-NEXT: v_bfe_u32 v33, v20, 16, 1
-; GFX9-NEXT: v_and_b32_e32 v34, 0xff800000, v20
; GFX9-NEXT: v_and_b32_e32 v19, 0xffff0000, v19
; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; GFX9-NEXT: v_bfe_u32 v33, v20, 16, 1
+; GFX9-NEXT: v_mul_f32_e32 v3, v3, v19
; GFX9-NEXT: v_add3_u32 v33, v33, v20, s4
-; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v34
+; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v20
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v20, v20
-; GFX9-NEXT: v_mul_f32_e32 v3, v3, v19
-; GFX9-NEXT: v_cndmask_b32_e32 v20, v33, v34, vcc
; GFX9-NEXT: v_bfe_u32 v19, v3, 16, 1
-; GFX9-NEXT: v_and_b32_e32 v33, 0xff800000, v3
+; GFX9-NEXT: v_cndmask_b32_e32 v20, v33, v34, vcc
; GFX9-NEXT: v_add3_u32 v19, v19, v3, s4
-; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v33
+; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v3
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
; GFX9-NEXT: v_cndmask_b32_e32 v3, v19, v33, vcc
; GFX9-NEXT: v_lshlrev_b32_e32 v19, 16, v18
; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v2
; GFX9-NEXT: v_mul_f32_e32 v19, v33, v19
-; GFX9-NEXT: v_bfe_u32 v33, v19, 16, 1
-; GFX9-NEXT: v_and_b32_e32 v34, 0xff800000, v19
; GFX9-NEXT: v_and_b32_e32 v18, 0xffff0000, v18
; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX9-NEXT: v_bfe_u32 v33, v19, 16, 1
+; GFX9-NEXT: v_mul_f32_e32 v2, v2, v18
; GFX9-NEXT: v_add3_u32 v33, v33, v19, s4
-; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v34
+; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v19
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v19, v19
-; GFX9-NEXT: v_mul_f32_e32 v2, v2, v18
-; GFX9-NEXT: v_cndmask_b32_e32 v19, v33, v34, vcc
; GFX9-NEXT: v_bfe_u32 v18, v2, 16, 1
-; GFX9-NEXT: v_and_b32_e32 v33, 0xff800000, v2
+; GFX9-NEXT: v_cndmask_b32_e32 v19, v33, v34, vcc
; GFX9-NEXT: v_add3_u32 v18, v18, v2, s4
-; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v33
+; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v2
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
; GFX9-NEXT: v_cndmask_b32_e32 v2, v18, v33, vcc
; GFX9-NEXT: v_lshlrev_b32_e32 v18, 16, v17
; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v1
; GFX9-NEXT: v_mul_f32_e32 v18, v33, v18
-; GFX9-NEXT: v_bfe_u32 v33, v18, 16, 1
-; GFX9-NEXT: v_and_b32_e32 v34, 0xff800000, v18
; GFX9-NEXT: v_and_b32_e32 v17, 0xffff0000, v17
; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GFX9-NEXT: v_bfe_u32 v33, v18, 16, 1
+; GFX9-NEXT: v_mul_f32_e32 v1, v1, v17
; GFX9-NEXT: v_add3_u32 v33, v33, v18, s4
-; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v34
+; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v18
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v18, v18
-; GFX9-NEXT: v_mul_f32_e32 v1, v1, v17
-; GFX9-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc
; GFX9-NEXT: v_bfe_u32 v17, v1, 16, 1
-; GFX9-NEXT: v_and_b32_e32 v33, 0xff800000, v1
+; GFX9-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc
; GFX9-NEXT: v_add3_u32 v17, v17, v1, s4
-; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v33
+; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v1
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
; GFX9-NEXT: v_cndmask_b32_e32 v1, v17, v33, vcc
; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v16
; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v0
; GFX9-NEXT: v_mul_f32_e32 v17, v33, v17
-; GFX9-NEXT: v_bfe_u32 v33, v17, 16, 1
-; GFX9-NEXT: v_and_b32_e32 v34, 0xff800000, v17
; GFX9-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX9-NEXT: v_bfe_u32 v33, v17, 16, 1
+; GFX9-NEXT: v_mul_f32_e32 v0, v0, v16
; GFX9-NEXT: v_add3_u32 v33, v33, v17, s4
-; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v34
+; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v17
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v17, v17
-; GFX9-NEXT: v_mul_f32_e32 v0, v0, v16
-; GFX9-NEXT: v_cndmask_b32_e32 v17, v33, v34, vcc
; GFX9-NEXT: v_bfe_u32 v16, v0, 16, 1
-; GFX9-NEXT: v_and_b32_e32 v33, 0xff800000, v0
+; GFX9-NEXT: v_cndmask_b32_e32 v17, v33, v34, vcc
; GFX9-NEXT: v_add3_u32 v16, v16, v0, s4
-; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v33
+; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v0
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
; GFX9-NEXT: v_cndmask_b32_e32 v0, v16, v33, vcc
; GFX9-NEXT: s_mov_b32 s4, 0x7060302
@@ -16580,7 +16250,7 @@ define <32 x bfloat> @v_fmul_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) {
; GFX10-LABEL: v_fmul_v32bf16:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: buffer_load_dword v31, off, s[0:3], s32
+; GFX10-NEXT: buffer_load_dword v32, off, s[0:3], s32
; GFX10-NEXT: v_lshlrev_b32_e32 v39, 16, v27
; GFX10-NEXT: v_lshlrev_b32_e32 v48, 16, v11
; GFX10-NEXT: v_and_b32_e32 v27, 0xffff0000, v27
@@ -16645,7 +16315,6 @@ define <32 x bfloat> @v_fmul_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) {
; GFX10-NEXT: v_bfe_u32 v55, v11, 16, 1
; GFX10-NEXT: v_bfe_u32 v65, v49, 16, 1
; GFX10-NEXT: v_bfe_u32 v67, v10, 16, 1
-; GFX10-NEXT: s_mov_b32 s23, 0xff800000
; GFX10-NEXT: v_lshlrev_b32_e32 v33, 16, v30
; GFX10-NEXT: v_lshlrev_b32_e32 v34, 16, v14
; GFX10-NEXT: v_and_b32_e32 v30, 0xffff0000, v30
@@ -16661,10 +16330,10 @@ define <32 x bfloat> @v_fmul_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) {
; GFX10-NEXT: v_mul_f32_e32 v1, v1, v17
; GFX10-NEXT: v_mul_f32_e32 v17, v26, v50
; GFX10-NEXT: v_mul_f32_e32 v0, v0, v16
-; GFX10-NEXT: v_and_or_b32 v54, v39, s23, 0x400000
-; GFX10-NEXT: v_and_or_b32 v64, v11, s23, 0x400000
-; GFX10-NEXT: v_and_or_b32 v66, v49, s23, 0x400000
-; GFX10-NEXT: v_and_or_b32 v68, v10, s23, 0x400000
+; GFX10-NEXT: v_or_b32_e32 v54, 0x400000, v39
+; GFX10-NEXT: v_or_b32_e32 v64, 0x400000, v11
+; GFX10-NEXT: v_or_b32_e32 v66, 0x400000, v49
+; GFX10-NEXT: v_or_b32_e32 v68, 0x400000, v10
; GFX10-NEXT: v_cmp_u_f32_e64 s9, v39, v39
; GFX10-NEXT: v_add3_u32 v39, v53, v39, 0x7fff
; GFX10-NEXT: v_cmp_u_f32_e64 s10, v11, v11
@@ -16702,28 +16371,28 @@ define <32 x bfloat> @v_fmul_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) {
; GFX10-NEXT: v_bfe_u32 v27, v14, 16, 1
; GFX10-NEXT: v_bfe_u32 v29, v35, 16, 1
; GFX10-NEXT: v_bfe_u32 v34, v13, 16, 1
-; GFX10-NEXT: v_and_or_b32 v48, v37, s23, 0x400000
-; GFX10-NEXT: v_and_or_b32 v52, v12, s23, 0x400000
+; GFX10-NEXT: v_or_b32_e32 v48, 0x400000, v37
+; GFX10-NEXT: v_or_b32_e32 v52, 0x400000, v12
; GFX10-NEXT: v_cmp_u_f32_e64 s7, v37, v37
; GFX10-NEXT: v_add3_u32 v37, v38, v37, 0x7fff
; GFX10-NEXT: v_cmp_u_f32_e64 s8, v12, v12
; GFX10-NEXT: v_add3_u32 v12, v50, v12, 0x7fff
; GFX10-NEXT: v_cmp_u_f32_e64 s10, v18, v18
; GFX10-NEXT: v_add3_u32 v54, v54, v18, 0x7fff
-; GFX10-NEXT: v_and_or_b32 v18, v18, s23, 0x400000
+; GFX10-NEXT: v_or_b32_e32 v18, 0x400000, v18
; GFX10-NEXT: v_cmp_u_f32_e64 s11, v1, v1
; GFX10-NEXT: v_add3_u32 v64, v64, v1, 0x7fff
-; GFX10-NEXT: v_and_or_b32 v1, v1, s23, 0x400000
+; GFX10-NEXT: v_or_b32_e32 v1, 0x400000, v1
; GFX10-NEXT: v_cmp_u_f32_e64 s12, v17, v17
; GFX10-NEXT: v_add3_u32 v66, v66, v17, 0x7fff
-; GFX10-NEXT: v_and_or_b32 v17, v17, s23, 0x400000
+; GFX10-NEXT: v_or_b32_e32 v17, 0x400000, v17
; GFX10-NEXT: v_cmp_u_f32_e64 s22, v0, v0
; GFX10-NEXT: v_add3_u32 v68, v68, v0, 0x7fff
-; GFX10-NEXT: v_and_or_b32 v0, v0, s23, 0x400000
-; GFX10-NEXT: v_and_or_b32 v26, v33, s23, 0x400000
-; GFX10-NEXT: v_and_or_b32 v28, v14, s23, 0x400000
-; GFX10-NEXT: v_and_or_b32 v30, v35, s23, 0x400000
-; GFX10-NEXT: v_and_or_b32 v36, v13, s23, 0x400000
+; GFX10-NEXT: v_or_b32_e32 v0, 0x400000, v0
+; GFX10-NEXT: v_or_b32_e32 v26, 0x400000, v33
+; GFX10-NEXT: v_or_b32_e32 v28, 0x400000, v14
+; GFX10-NEXT: v_or_b32_e32 v30, 0x400000, v35
+; GFX10-NEXT: v_or_b32_e32 v36, 0x400000, v13
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v33, v33
; GFX10-NEXT: v_add3_u32 v16, v16, v33, 0x7fff
; GFX10-NEXT: v_bfe_u32 v33, v51, 16, 1
@@ -16742,12 +16411,12 @@ define <32 x bfloat> @v_fmul_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) {
; GFX10-NEXT: v_cndmask_b32_e64 v17, v66, v17, s12
; GFX10-NEXT: v_cndmask_b32_e64 v0, v68, v0, s22
; GFX10-NEXT: v_cndmask_b32_e64 v1, v64, v1, s11
-; GFX10-NEXT: v_lshlrev_b32_e32 v32, 16, v15
+; GFX10-NEXT: v_lshlrev_b32_e32 v31, 16, v15
; GFX10-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
-; GFX10-NEXT: v_and_or_b32 v27, v51, s23, 0x400000
+; GFX10-NEXT: v_or_b32_e32 v27, 0x400000, v51
; GFX10-NEXT: v_bfe_u32 v35, v9, 16, 1
; GFX10-NEXT: v_bfe_u32 v38, v25, 16, 1
-; GFX10-NEXT: v_and_or_b32 v67, v24, s23, 0x400000
+; GFX10-NEXT: v_or_b32_e32 v67, 0x400000, v24
; GFX10-NEXT: v_cmp_u_f32_e64 s13, v51, v51
; GFX10-NEXT: v_add3_u32 v33, v33, v51, 0x7fff
; GFX10-NEXT: v_bfe_u32 v51, v7, 16, 1
@@ -16764,51 +16433,51 @@ define <32 x bfloat> @v_fmul_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) {
; GFX10-NEXT: v_bfe_u32 v36, v3, 16, 1
; GFX10-NEXT: v_cmp_u_f32_e64 s8, v19, v19
; GFX10-NEXT: v_add3_u32 v48, v48, v19, 0x7fff
-; GFX10-NEXT: v_and_or_b32 v19, v19, s23, 0x400000
+; GFX10-NEXT: v_or_b32_e32 v19, 0x400000, v19
; GFX10-NEXT: v_cmp_u_f32_e64 s9, v2, v2
; GFX10-NEXT: v_add3_u32 v52, v52, v2, 0x7fff
-; GFX10-NEXT: v_and_or_b32 v2, v2, s23, 0x400000
+; GFX10-NEXT: v_or_b32_e32 v2, 0x400000, v2
; GFX10-NEXT: v_perm_b32 v0, v0, v17, 0x7060302
; GFX10-NEXT: v_perm_b32 v1, v1, v18, 0x7060302
-; GFX10-NEXT: v_and_or_b32 v34, v9, s23, 0x400000
-; GFX10-NEXT: v_and_or_b32 v50, v25, s23, 0x400000
+; GFX10-NEXT: v_or_b32_e32 v34, 0x400000, v9
+; GFX10-NEXT: v_or_b32_e32 v50, 0x400000, v25
; GFX10-NEXT: v_bfe_u32 v53, v8, 16, 1
; GFX10-NEXT: v_cmp_u_f32_e64 s14, v9, v9
; GFX10-NEXT: v_add3_u32 v9, v35, v9, 0x7fff
-; GFX10-NEXT: v_and_or_b32 v35, v7, s23, 0x400000
+; GFX10-NEXT: v_or_b32_e32 v35, 0x400000, v7
; GFX10-NEXT: v_cmp_u_f32_e64 s15, v25, v25
; GFX10-NEXT: v_add3_u32 v25, v38, v25, 0x7fff
; GFX10-NEXT: v_bfe_u32 v38, v23, 16, 1
; GFX10-NEXT: v_cmp_u_f32_e64 s18, v7, v7
; GFX10-NEXT: v_add3_u32 v7, v51, v7, 0x7fff
-; GFX10-NEXT: v_and_or_b32 v51, v6, s23, 0x400000
+; GFX10-NEXT: v_or_b32_e32 v51, 0x400000, v6
; GFX10-NEXT: v_cmp_u_f32_e64 s20, v6, v6
; GFX10-NEXT: v_add3_u32 v6, v65, v6, 0x7fff
; GFX10-NEXT: v_bfe_u32 v65, v5, 16, 1
; GFX10-NEXT: v_cmp_u_f32_e64 s4, v21, v21
; GFX10-NEXT: v_add3_u32 v26, v26, v21, 0x7fff
-; GFX10-NEXT: v_and_or_b32 v21, v21, s23, 0x400000
+; GFX10-NEXT: v_or_b32_e32 v21, 0x400000, v21
; GFX10-NEXT: v_cmp_u_f32_e64 s5, v4, v4
; GFX10-NEXT: v_add3_u32 v28, v28, v4, 0x7fff
-; GFX10-NEXT: v_and_or_b32 v4, v4, s23, 0x400000
+; GFX10-NEXT: v_or_b32_e32 v4, 0x400000, v4
; GFX10-NEXT: v_cmp_u_f32_e64 s6, v20, v20
; GFX10-NEXT: v_add3_u32 v30, v30, v20, 0x7fff
-; GFX10-NEXT: v_and_or_b32 v20, v20, s23, 0x400000
+; GFX10-NEXT: v_or_b32_e32 v20, 0x400000, v20
; GFX10-NEXT: v_cmp_u_f32_e64 s7, v3, v3
; GFX10-NEXT: v_add3_u32 v36, v36, v3, 0x7fff
-; GFX10-NEXT: v_and_or_b32 v3, v3, s23, 0x400000
+; GFX10-NEXT: v_or_b32_e32 v3, 0x400000, v3
; GFX10-NEXT: v_cndmask_b32_e64 v19, v48, v19, s8
; GFX10-NEXT: v_cndmask_b32_e64 v2, v52, v2, s9
-; GFX10-NEXT: v_and_or_b32 v55, v8, s23, 0x400000
+; GFX10-NEXT: v_or_b32_e32 v55, 0x400000, v8
; GFX10-NEXT: v_cmp_u_f32_e64 s16, v8, v8
; GFX10-NEXT: v_add3_u32 v8, v53, v8, 0x7fff
-; GFX10-NEXT: v_and_or_b32 v53, v23, s23, 0x400000
+; GFX10-NEXT: v_or_b32_e32 v53, 0x400000, v23
; GFX10-NEXT: v_cmp_u_f32_e64 s19, v23, v23
; GFX10-NEXT: v_add3_u32 v23, v38, v23, 0x7fff
; GFX10-NEXT: v_bfe_u32 v38, v22, 16, 1
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
; GFX10-NEXT: v_add3_u32 v65, v65, v5, 0x7fff
-; GFX10-NEXT: v_and_or_b32 v5, v5, s23, 0x400000
+; GFX10-NEXT: v_or_b32_e32 v5, 0x400000, v5
; GFX10-NEXT: v_cndmask_b32_e64 v21, v26, v21, s4
; GFX10-NEXT: v_cndmask_b32_e64 v4, v28, v4, s5
; GFX10-NEXT: v_cndmask_b32_e64 v20, v30, v20, s6
@@ -16816,7 +16485,7 @@ define <32 x bfloat> @v_fmul_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) {
; GFX10-NEXT: v_perm_b32 v2, v2, v19, 0x7060302
; GFX10-NEXT: v_cmp_u_f32_e64 s21, v22, v22
; GFX10-NEXT: v_add3_u32 v38, v38, v22, 0x7fff
-; GFX10-NEXT: v_and_or_b32 v22, v22, s23, 0x400000
+; GFX10-NEXT: v_or_b32_e32 v22, 0x400000, v22
; GFX10-NEXT: v_cndmask_b32_e32 v5, v65, v5, vcc_lo
; GFX10-NEXT: v_perm_b32 v3, v3, v20, 0x7060302
; GFX10-NEXT: v_perm_b32 v4, v4, v21, 0x7060302
@@ -16840,14 +16509,14 @@ define <32 x bfloat> @v_fmul_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) {
; GFX10-NEXT: v_perm_b32 v13, v13, v29, 0x7060302
; GFX10-NEXT: v_perm_b32 v14, v14, v16, 0x7060302
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_lshlrev_b32_e32 v17, 16, v31
-; GFX10-NEXT: v_and_b32_e32 v18, 0xffff0000, v31
-; GFX10-NEXT: v_mul_f32_e32 v17, v32, v17
+; GFX10-NEXT: v_lshlrev_b32_e32 v17, 16, v32
+; GFX10-NEXT: v_and_b32_e32 v18, 0xffff0000, v32
+; GFX10-NEXT: v_mul_f32_e32 v17, v31, v17
; GFX10-NEXT: v_mul_f32_e32 v15, v15, v18
; GFX10-NEXT: v_bfe_u32 v18, v17, 16, 1
; GFX10-NEXT: v_bfe_u32 v19, v15, 16, 1
-; GFX10-NEXT: v_and_or_b32 v20, v17, s23, 0x400000
-; GFX10-NEXT: v_and_or_b32 v21, v15, s23, 0x400000
+; GFX10-NEXT: v_or_b32_e32 v20, 0x400000, v17
+; GFX10-NEXT: v_or_b32_e32 v21, 0x400000, v15
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17
; GFX10-NEXT: v_cmp_u_f32_e64 s4, v15, v15
; GFX10-NEXT: v_add3_u32 v17, v18, v17, 0x7fff
@@ -16860,212 +16529,219 @@ define <32 x bfloat> @v_fmul_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) {
; GFX11-LABEL: v_fmul_v32bf16:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: scratch_load_b32 v31, off, s32
+; GFX11-NEXT: scratch_load_b32 v32, off, s32
+; GFX11-NEXT: v_lshlrev_b32_e32 v67, 16, v21
+; GFX11-NEXT: v_lshlrev_b32_e32 v68, 16, v5
+; GFX11-NEXT: v_and_b32_e32 v21, 0xffff0000, v21
+; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
; GFX11-NEXT: v_lshlrev_b32_e32 v83, 16, v17
; GFX11-NEXT: v_lshlrev_b32_e32 v84, 16, v1
; GFX11-NEXT: v_and_b32_e32 v17, 0xffff0000, v17
; GFX11-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GFX11-NEXT: v_lshlrev_b32_e32 v49, 16, v26
+; GFX11-NEXT: v_dual_mul_f32 v5, v5, v21 :: v_dual_and_b32 v26, 0xffff0000, v26
; GFX11-NEXT: v_lshlrev_b32_e32 v53, 16, v24
-; GFX11-NEXT: v_and_b32_e32 v24, 0xffff0000, v24
-; GFX11-NEXT: s_mov_b32 s0, 0xff800000
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3)
-; GFX11-NEXT: v_dual_mul_f32 v1, v1, v17 :: v_dual_lshlrev_b32 v64, 16, v7
-; GFX11-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-NEXT: v_dual_mul_f32 v1, v1, v17 :: v_dual_and_b32 v24, 0xffff0000, v24
+; GFX11-NEXT: v_lshlrev_b32_e32 v71, 16, v19
+; GFX11-NEXT: v_bfe_u32 v103, v5, 16, 1
+; GFX11-NEXT: v_and_b32_e32 v19, 0xffff0000, v19
; GFX11-NEXT: v_lshlrev_b32_e32 v81, 16, v18
-; GFX11-NEXT: v_lshlrev_b32_e32 v85, 16, v16
-; GFX11-NEXT: v_lshlrev_b32_e32 v86, 16, v0
; GFX11-NEXT: v_bfe_u32 v135, v1, 16, 1
-; GFX11-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
-; GFX11-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GFX11-NEXT: v_lshlrev_b32_e32 v55, 16, v23
-; GFX11-NEXT: v_and_b32_e32 v23, 0xffff0000, v23
-; GFX11-NEXT: v_and_or_b32 v144, v1, s0, 0x400000
+; GFX11-NEXT: v_or_b32_e32 v112, 0x400000, v5
+; GFX11-NEXT: v_or_b32_e32 v144, 0x400000, v1
+; GFX11-NEXT: v_add3_u32 v103, v103, v5, 0x7fff
+; GFX11-NEXT: v_lshlrev_b32_e32 v80, 16, v3
+; GFX11-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
; GFX11-NEXT: v_add3_u32 v135, v135, v1, 0x7fff
; GFX11-NEXT: v_lshlrev_b32_e32 v82, 16, v2
-; GFX11-NEXT: v_lshlrev_b32_e32 v54, 16, v8
-; GFX11-NEXT: v_dual_mul_f32 v17, v86, v85 :: v_dual_and_b32 v8, 0xffff0000, v8
-; GFX11-NEXT: v_dual_mul_f32 v7, v7, v23 :: v_dual_lshlrev_b32 v36, 16, v13
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_dual_mul_f32 v8, v8, v24 :: v_dual_lshlrev_b32 v39, 16, v27
-; GFX11-NEXT: v_dual_mul_f32 v0, v0, v16 :: v_dual_lshlrev_b32 v49, 16, v26
-; GFX11-NEXT: v_mul_f32_e32 v24, v64, v55
-; GFX11-NEXT: v_bfe_u32 v87, v7, 16, 1
+; GFX11-NEXT: v_lshlrev_b32_e32 v52, 16, v9
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-NEXT: v_dual_mul_f32 v3, v3, v19 :: v_dual_lshlrev_b32 v54, 16, v8
+; GFX11-NEXT: v_lshlrev_b32_e32 v85, 16, v16
+; GFX11-NEXT: v_dual_mul_f32 v19, v82, v81 :: v_dual_lshlrev_b32 v64, 16, v7
+; GFX11-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
; GFX11-NEXT: v_lshlrev_b32_e32 v65, 16, v22
; GFX11-NEXT: v_lshlrev_b32_e32 v66, 16, v6
-; GFX11-NEXT: v_and_b32_e32 v22, 0xffff0000, v22
-; GFX11-NEXT: v_bfe_u32 v85, v24, 16, 1
-; GFX11-NEXT: v_lshlrev_b32_e32 v67, 16, v21
-; GFX11-NEXT: v_lshlrev_b32_e32 v68, 16, v5
-; GFX11-NEXT: v_and_b32_e32 v21, 0xffff0000, v21
-; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-NEXT: v_bfe_u32 v129, v19, 16, 1
+; GFX11-NEXT: v_or_b32_e32 v130, 0x400000, v19
+; GFX11-NEXT: v_lshlrev_b32_e32 v48, 16, v11
+; GFX11-NEXT: v_bfe_u32 v119, v3, 16, 1
+; GFX11-NEXT: v_lshlrev_b32_e32 v51, 16, v25
+; GFX11-NEXT: v_add3_u32 v129, v129, v19, 0x7fff
+; GFX11-NEXT: v_lshlrev_b32_e32 v86, 16, v0
+; GFX11-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
+; GFX11-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX11-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT: v_dual_mul_f32 v17, v86, v85 :: v_dual_and_b32 v2, 0xffff0000, v2
+; GFX11-NEXT: v_dual_mul_f32 v8, v8, v24 :: v_dual_lshlrev_b32 v39, 16, v27
+; GFX11-NEXT: v_or_b32_e32 v128, 0x400000, v3
+; GFX11-NEXT: v_add3_u32 v119, v119, v3, 0x7fff
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-NEXT: v_bfe_u32 v145, v17, 16, 1
+; GFX11-NEXT: v_or_b32_e32 v146, 0x400000, v17
+; GFX11-NEXT: v_and_b32_e32 v18, 0xffff0000, v18
+; GFX11-NEXT: v_and_b32_e32 v25, 0xffff0000, v25
; GFX11-NEXT: v_lshlrev_b32_e32 v70, 16, v4
-; GFX11-NEXT: v_and_or_b32 v86, v24, s0, 0x400000
-; GFX11-NEXT: v_and_or_b32 v96, v7, s0, 0x400000
+; GFX11-NEXT: v_add3_u32 v145, v145, v17, 0x7fff
+; GFX11-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
+; GFX11-NEXT: v_lshlrev_b32_e32 v55, 16, v23
+; GFX11-NEXT: v_and_b32_e32 v23, 0xffff0000, v23
+; GFX11-NEXT: v_lshlrev_b32_e32 v50, 16, v10
+; GFX11-NEXT: v_mul_f32_e32 v2, v2, v18
+; GFX11-NEXT: v_mul_f32_e32 v0, v0, v16
+; GFX11-NEXT: v_dual_mul_f32 v24, v64, v55 :: v_dual_lshlrev_b32 v37, 16, v28
+; GFX11-NEXT: v_mul_f32_e32 v7, v7, v23
+; GFX11-NEXT: v_dual_mul_f32 v23, v66, v65 :: v_dual_mul_f32 v18, v84, v83
+; GFX11-NEXT: v_dual_mul_f32 v9, v9, v25 :: v_dual_and_b32 v28, 0xffff0000, v28
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT: v_bfe_u32 v85, v24, 16, 1
+; GFX11-NEXT: v_bfe_u32 v97, v23, 16, 1
+; GFX11-NEXT: v_or_b32_e32 v86, 0x400000, v24
+; GFX11-NEXT: v_or_b32_e32 v98, 0x400000, v23
+; GFX11-NEXT: v_bfe_u32 v87, v7, 16, 1
; GFX11-NEXT: v_add3_u32 v85, v85, v24, 0x7fff
; GFX11-NEXT: v_lshlrev_b32_e32 v69, 16, v20
-; GFX11-NEXT: v_add3_u32 v87, v87, v7, 0x7fff
-; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
-; GFX11-NEXT: v_dual_mul_f32 v23, v66, v65 :: v_dual_lshlrev_b32 v48, 16, v11
-; GFX11-NEXT: v_and_b32_e32 v27, 0xffff0000, v27
-; GFX11-NEXT: v_dual_mul_f32 v5, v5, v21 :: v_dual_lshlrev_b32 v50, 16, v10
-; GFX11-NEXT: v_dual_mul_f32 v21, v70, v69 :: v_dual_and_b32 v26, 0xffff0000, v26
-; GFX11-NEXT: v_and_b32_e32 v18, 0xffff0000, v18
-; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
-; GFX11-NEXT: v_mul_f32_e32 v6, v6, v22
-; GFX11-NEXT: v_lshlrev_b32_e32 v52, 16, v9
-; GFX11-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
-; GFX11-NEXT: v_bfe_u32 v97, v23, 16, 1
-; GFX11-NEXT: v_mul_f32_e32 v2, v2, v18
-; GFX11-NEXT: v_mul_f32_e32 v18, v84, v83
-; GFX11-NEXT: v_bfe_u32 v83, v8, 16, 1
-; GFX11-NEXT: v_bfe_u32 v99, v6, 16, 1
-; GFX11-NEXT: v_bfe_u32 v103, v5, 16, 1
-; GFX11-NEXT: v_bfe_u32 v113, v21, 16, 1
-; GFX11-NEXT: v_lshlrev_b32_e32 v71, 16, v19
-; GFX11-NEXT: v_and_or_b32 v84, v8, s0, 0x400000
-; GFX11-NEXT: v_and_or_b32 v98, v23, s0, 0x400000
-; GFX11-NEXT: v_and_or_b32 v100, v6, s0, 0x400000
-; GFX11-NEXT: v_and_or_b32 v112, v5, s0, 0x400000
-; GFX11-NEXT: v_and_or_b32 v114, v21, s0, 0x400000
-; GFX11-NEXT: v_add3_u32 v83, v83, v8, 0x7fff
-; GFX11-NEXT: v_and_b32_e32 v19, 0xffff0000, v19
-; GFX11-NEXT: v_add3_u32 v97, v97, v23, 0x7fff
; GFX11-NEXT: v_and_b32_e32 v20, 0xffff0000, v20
-; GFX11-NEXT: v_add3_u32 v99, v99, v6, 0x7fff
-; GFX11-NEXT: v_add3_u32 v103, v103, v5, 0x7fff
-; GFX11-NEXT: v_lshlrev_b32_e32 v80, 16, v3
-; GFX11-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
-; GFX11-NEXT: v_add3_u32 v113, v113, v21, 0x7fff
+; GFX11-NEXT: v_add3_u32 v97, v97, v23, 0x7fff
; GFX11-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
-; GFX11-NEXT: v_lshlrev_b32_e32 v38, 16, v12
; GFX11-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
-; GFX11-NEXT: v_dual_mul_f32 v3, v3, v19 :: v_dual_and_b32 v10, 0xffff0000, v10
-; GFX11-NEXT: v_dual_mul_f32 v22, v68, v67 :: v_dual_lshlrev_b32 v51, 16, v25
-; GFX11-NEXT: v_lshlrev_b32_e32 v37, 16, v28
-; GFX11-NEXT: v_dual_mul_f32 v4, v4, v20 :: v_dual_and_b32 v25, 0xffff0000, v25
+; GFX11-NEXT: v_or_b32_e32 v96, 0x400000, v7
+; GFX11-NEXT: v_add3_u32 v87, v87, v7, 0x7fff
+; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
+; GFX11-NEXT: v_mul_f32_e32 v4, v4, v20
; GFX11-NEXT: v_mul_f32_e32 v20, v80, v71
-; GFX11-NEXT: v_dual_mul_f32 v19, v82, v81 :: v_dual_and_b32 v28, 0xffff0000, v28
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3)
-; GFX11-NEXT: v_dual_mul_f32 v9, v9, v25 :: v_dual_and_b32 v12, 0xffff0000, v12
-; GFX11-NEXT: v_mul_f32_e32 v25, v54, v53
+; GFX11-NEXT: v_bfe_u32 v71, v9, 16, 1
+; GFX11-NEXT: v_or_b32_e32 v80, 0x400000, v9
; GFX11-NEXT: v_lshlrev_b32_e32 v35, 16, v29
-; GFX11-NEXT: v_and_b32_e32 v29, 0xffff0000, v29
-; GFX11-NEXT: v_dual_mul_f32 v10, v10, v26 :: v_dual_and_b32 v13, 0xffff0000, v13
-; GFX11-NEXT: v_dual_mul_f32 v12, v12, v28 :: v_dual_lshlrev_b32 v33, 16, v30
-; GFX11-NEXT: v_mul_f32_e32 v28, v48, v39
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3)
-; GFX11-NEXT: v_dual_mul_f32 v13, v13, v29 :: v_dual_lshlrev_b32 v34, 16, v14
-; GFX11-NEXT: v_dual_mul_f32 v11, v11, v27 :: v_dual_and_b32 v14, 0xffff0000, v14
-; GFX11-NEXT: v_dual_mul_f32 v27, v50, v49 :: v_dual_mul_f32 v26, v52, v51
-; GFX11-NEXT: v_dual_mul_f32 v29, v38, v37 :: v_dual_and_b32 v30, 0xffff0000, v30
-; GFX11-NEXT: v_lshlrev_b32_e32 v32, 16, v15
-; GFX11-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
-; GFX11-NEXT: v_bfe_u32 v39, v13, 16, 1
+; GFX11-NEXT: v_dual_mul_f32 v21, v70, v69 :: v_dual_and_b32 v10, 0xffff0000, v10
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-NEXT: v_add3_u32 v71, v71, v9, 0x7fff
+; GFX11-NEXT: v_and_b32_e32 v22, 0xffff0000, v22
+; GFX11-NEXT: v_dual_mul_f32 v10, v10, v26 :: v_dual_and_b32 v29, 0xffff0000, v29
+; GFX11-NEXT: v_and_b32_e32 v27, 0xffff0000, v27
+; GFX11-NEXT: v_mul_f32_e32 v26, v52, v51
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-NEXT: v_mul_f32_e32 v6, v6, v22
+; GFX11-NEXT: v_lshlrev_b32_e32 v36, 16, v13
+; GFX11-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
+; GFX11-NEXT: v_dual_mul_f32 v11, v11, v27 :: v_dual_lshlrev_b32 v34, 16, v14
+; GFX11-NEXT: v_dual_mul_f32 v22, v68, v67 :: v_dual_lshlrev_b32 v33, 16, v30
+; GFX11-NEXT: v_dual_mul_f32 v27, v50, v49 :: v_dual_lshlrev_b32 v38, 16, v12
+; GFX11-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
+; GFX11-NEXT: v_dual_mul_f32 v25, v54, v53 :: v_dual_and_b32 v12, 0xffff0000, v12
+; GFX11-NEXT: v_dual_mul_f32 v13, v13, v29 :: v_dual_and_b32 v30, 0xffff0000, v30
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-NEXT: v_mul_f32_e32 v29, v38, v37
+; GFX11-NEXT: v_lshlrev_b32_e32 v31, 16, v15
+; GFX11-NEXT: v_dual_mul_f32 v12, v12, v28 :: v_dual_and_b32 v15, 0xffff0000, v15
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_4)
; GFX11-NEXT: v_mul_f32_e32 v14, v14, v30
+; GFX11-NEXT: v_mul_f32_e32 v28, v48, v39
; GFX11-NEXT: v_dual_mul_f32 v30, v36, v35 :: v_dual_mul_f32 v33, v34, v33
-; GFX11-NEXT: v_and_or_b32 v48, v13, s0, 0x400000
-; GFX11-NEXT: v_bfe_u32 v49, v29, 16, 1
+; GFX11-NEXT: v_bfe_u32 v39, v13, 16, 1
; GFX11-NEXT: v_bfe_u32 v35, v14, 16, 1
-; GFX11-NEXT: v_and_or_b32 v36, v14, s0, 0x400000
+; GFX11-NEXT: v_or_b32_e32 v36, 0x400000, v14
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-NEXT: v_bfe_u32 v37, v30, 16, 1
; GFX11-NEXT: v_bfe_u32 v16, v33, 16, 1
-; GFX11-NEXT: v_and_or_b32 v34, v33, s0, 0x400000
+; GFX11-NEXT: v_or_b32_e32 v34, 0x400000, v33
; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v33, v33
-; GFX11-NEXT: v_bfe_u32 v37, v30, 16, 1
; GFX11-NEXT: v_add3_u32 v35, v35, v14, 0x7fff
+; GFX11-NEXT: v_or_b32_e32 v38, 0x400000, v30
; GFX11-NEXT: v_add3_u32 v16, v16, v33, 0x7fff
-; GFX11-NEXT: v_and_or_b32 v38, v30, s0, 0x400000
-; GFX11-NEXT: v_add3_u32 v39, v39, v13, 0x7fff
; GFX11-NEXT: v_add3_u32 v37, v37, v30, 0x7fff
-; GFX11-NEXT: v_and_or_b32 v50, v29, s0, 0x400000
+; GFX11-NEXT: v_or_b32_e32 v48, 0x400000, v13
+; GFX11-NEXT: v_bfe_u32 v49, v29, 16, 1
+; GFX11-NEXT: v_add3_u32 v39, v39, v13, 0x7fff
; GFX11-NEXT: v_cndmask_b32_e32 v16, v16, v34, vcc_lo
; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v14, v14
+; GFX11-NEXT: v_or_b32_e32 v50, 0x400000, v29
; GFX11-NEXT: v_bfe_u32 v51, v12, 16, 1
; GFX11-NEXT: v_add3_u32 v49, v49, v29, 0x7fff
-; GFX11-NEXT: v_and_or_b32 v52, v12, s0, 0x400000
-; GFX11-NEXT: v_bfe_u32 v53, v28, 16, 1
+; GFX11-NEXT: v_or_b32_e32 v52, 0x400000, v12
; GFX11-NEXT: v_cndmask_b32_e32 v14, v35, v36, vcc_lo
; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v30, v30
+; GFX11-NEXT: v_bfe_u32 v53, v28, 16, 1
; GFX11-NEXT: v_add3_u32 v51, v51, v12, 0x7fff
-; GFX11-NEXT: v_and_or_b32 v54, v28, s0, 0x400000
+; GFX11-NEXT: v_or_b32_e32 v54, 0x400000, v28
; GFX11-NEXT: v_bfe_u32 v55, v11, 16, 1
-; GFX11-NEXT: v_add3_u32 v53, v53, v28, 0x7fff
; GFX11-NEXT: v_cndmask_b32_e32 v30, v37, v38, vcc_lo
; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v13, v13
-; GFX11-NEXT: v_and_or_b32 v64, v11, s0, 0x400000
+; GFX11-NEXT: v_add3_u32 v53, v53, v28, 0x7fff
+; GFX11-NEXT: v_or_b32_e32 v64, 0x400000, v11
; GFX11-NEXT: v_bfe_u32 v65, v27, 16, 1
; GFX11-NEXT: v_add3_u32 v55, v55, v11, 0x7fff
-; GFX11-NEXT: v_and_or_b32 v66, v27, s0, 0x400000
; GFX11-NEXT: v_cndmask_b32_e32 v13, v39, v48, vcc_lo
; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v29, v29
+; GFX11-NEXT: v_or_b32_e32 v66, 0x400000, v27
; GFX11-NEXT: v_bfe_u32 v67, v10, 16, 1
; GFX11-NEXT: v_add3_u32 v65, v65, v27, 0x7fff
-; GFX11-NEXT: v_and_or_b32 v68, v10, s0, 0x400000
-; GFX11-NEXT: v_bfe_u32 v69, v26, 16, 1
+; GFX11-NEXT: v_or_b32_e32 v68, 0x400000, v10
; GFX11-NEXT: v_cndmask_b32_e32 v29, v49, v50, vcc_lo
; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v12, v12
+; GFX11-NEXT: v_bfe_u32 v69, v26, 16, 1
; GFX11-NEXT: v_add3_u32 v67, v67, v10, 0x7fff
-; GFX11-NEXT: v_and_or_b32 v70, v26, s0, 0x400000
-; GFX11-NEXT: v_bfe_u32 v71, v9, 16, 1
-; GFX11-NEXT: v_add3_u32 v69, v69, v26, 0x7fff
+; GFX11-NEXT: v_or_b32_e32 v70, 0x400000, v26
+; GFX11-NEXT: v_bfe_u32 v81, v25, 16, 1
; GFX11-NEXT: v_cndmask_b32_e32 v12, v51, v52, vcc_lo
; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v28, v28
-; GFX11-NEXT: v_and_or_b32 v80, v9, s0, 0x400000
-; GFX11-NEXT: v_bfe_u32 v81, v25, 16, 1
-; GFX11-NEXT: v_add3_u32 v71, v71, v9, 0x7fff
-; GFX11-NEXT: v_and_or_b32 v82, v25, s0, 0x400000
+; GFX11-NEXT: v_add3_u32 v69, v69, v26, 0x7fff
+; GFX11-NEXT: v_or_b32_e32 v82, 0x400000, v25
+; GFX11-NEXT: v_bfe_u32 v83, v8, 16, 1
+; GFX11-NEXT: v_add3_u32 v81, v81, v25, 0x7fff
; GFX11-NEXT: v_cndmask_b32_e32 v28, v53, v54, vcc_lo
; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v11, v11
-; GFX11-NEXT: v_add3_u32 v81, v81, v25, 0x7fff
-; GFX11-NEXT: v_bfe_u32 v101, v22, 16, 1
-; GFX11-NEXT: v_and_or_b32 v102, v22, s0, 0x400000
-; GFX11-NEXT: v_bfe_u32 v115, v4, 16, 1
+; GFX11-NEXT: v_or_b32_e32 v84, 0x400000, v8
+; GFX11-NEXT: v_add3_u32 v83, v83, v8, 0x7fff
+; GFX11-NEXT: v_bfe_u32 v99, v6, 16, 1
+; GFX11-NEXT: v_or_b32_e32 v100, 0x400000, v6
; GFX11-NEXT: v_cndmask_b32_e32 v11, v55, v64, vcc_lo
; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v27, v27
-; GFX11-NEXT: v_add3_u32 v101, v101, v22, 0x7fff
-; GFX11-NEXT: v_and_or_b32 v116, v4, s0, 0x400000
-; GFX11-NEXT: v_bfe_u32 v117, v20, 16, 1
-; GFX11-NEXT: v_add3_u32 v115, v115, v4, 0x7fff
+; GFX11-NEXT: v_bfe_u32 v101, v22, 16, 1
+; GFX11-NEXT: v_add3_u32 v99, v99, v6, 0x7fff
+; GFX11-NEXT: v_or_b32_e32 v102, 0x400000, v22
+; GFX11-NEXT: v_bfe_u32 v113, v21, 16, 1
; GFX11-NEXT: v_cndmask_b32_e32 v27, v65, v66, vcc_lo
; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v10, v10
-; GFX11-NEXT: v_and_or_b32 v118, v20, s0, 0x400000
-; GFX11-NEXT: v_bfe_u32 v129, v19, 16, 1
-; GFX11-NEXT: v_add3_u32 v117, v117, v20, 0x7fff
-; GFX11-NEXT: v_and_or_b32 v130, v19, s0, 0x400000
+; GFX11-NEXT: v_add3_u32 v101, v101, v22, 0x7fff
+; GFX11-NEXT: v_or_b32_e32 v114, 0x400000, v21
+; GFX11-NEXT: v_bfe_u32 v115, v4, 16, 1
+; GFX11-NEXT: v_add3_u32 v113, v113, v21, 0x7fff
; GFX11-NEXT: v_cndmask_b32_e32 v10, v67, v68, vcc_lo
; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v26, v26
-; GFX11-NEXT: v_bfe_u32 v133, v18, 16, 1
-; GFX11-NEXT: v_add3_u32 v129, v129, v19, 0x7fff
-; GFX11-NEXT: v_and_or_b32 v134, v18, s0, 0x400000
-; GFX11-NEXT: v_bfe_u32 v145, v17, 16, 1
+; GFX11-NEXT: v_or_b32_e32 v116, 0x400000, v4
+; GFX11-NEXT: v_bfe_u32 v117, v20, 16, 1
+; GFX11-NEXT: v_add3_u32 v115, v115, v4, 0x7fff
+; GFX11-NEXT: v_or_b32_e32 v118, 0x400000, v20
; GFX11-NEXT: v_cndmask_b32_e32 v26, v69, v70, vcc_lo
; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9
-; GFX11-NEXT: v_add3_u32 v133, v133, v18, 0x7fff
-; GFX11-NEXT: v_and_or_b32 v146, v17, s0, 0x400000
+; GFX11-NEXT: v_add3_u32 v117, v117, v20, 0x7fff
+; GFX11-NEXT: v_bfe_u32 v133, v18, 16, 1
+; GFX11-NEXT: v_or_b32_e32 v134, 0x400000, v18
; GFX11-NEXT: v_bfe_u32 v147, v0, 16, 1
-; GFX11-NEXT: v_add3_u32 v145, v145, v17, 0x7fff
; GFX11-NEXT: v_cndmask_b32_e32 v9, v71, v80, vcc_lo
; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v25, v25
-; GFX11-NEXT: v_bfe_u32 v131, v2, 16, 1
-; GFX11-NEXT: v_and_or_b32 v33, v0, s0, 0x400000
+; GFX11-NEXT: v_add3_u32 v133, v133, v18, 0x7fff
+; GFX11-NEXT: v_or_b32_e32 v33, 0x400000, v0
; GFX11-NEXT: v_add3_u32 v147, v147, v0, 0x7fff
-; GFX11-NEXT: v_and_or_b32 v132, v2, s0, 0x400000
+; GFX11-NEXT: v_bfe_u32 v131, v2, 16, 1
; GFX11-NEXT: v_cndmask_b32_e32 v25, v81, v82, vcc_lo
; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8
-; GFX11-NEXT: v_add3_u32 v131, v131, v2, 0x7fff
-; GFX11-NEXT: v_bfe_u32 v119, v3, 16, 1
-; GFX11-NEXT: v_and_or_b32 v128, v3, s0, 0x400000
+; GFX11-NEXT: v_or_b32_e32 v132, 0x400000, v2
; GFX11-NEXT: v_perm_b32 v9, v9, v26, 0x7060302
+; GFX11-NEXT: v_add3_u32 v131, v131, v2, 0x7fff
+; GFX11-NEXT: v_perm_b32 v10, v10, v27, 0x7060302
; GFX11-NEXT: v_cndmask_b32_e32 v8, v83, v84, vcc_lo
; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v24, v24
-; GFX11-NEXT: v_add3_u32 v119, v119, v3, 0x7fff
-; GFX11-NEXT: v_perm_b32 v10, v10, v27, 0x7060302
; GFX11-NEXT: v_perm_b32 v11, v11, v28, 0x7060302
+; GFX11-NEXT: v_perm_b32 v12, v12, v29, 0x7060302
+; GFX11-NEXT: v_perm_b32 v13, v13, v30, 0x7060302
; GFX11-NEXT: v_perm_b32 v8, v8, v25, 0x7060302
; GFX11-NEXT: v_cndmask_b32_e32 v24, v85, v86, vcc_lo
; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
-; GFX11-NEXT: v_perm_b32 v12, v12, v29, 0x7060302
-; GFX11-NEXT: v_perm_b32 v13, v13, v30, 0x7060302
; GFX11-NEXT: v_perm_b32 v14, v14, v16, 0x7060302
; GFX11-NEXT: v_cndmask_b32_e32 v7, v87, v96, vcc_lo
; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v23, v23
@@ -17104,22 +16780,21 @@ define <32 x bfloat> @v_fmul_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) {
; GFX11-NEXT: v_perm_b32 v0, v0, v17, 0x7060302
; GFX11-NEXT: v_cndmask_b32_e32 v2, v131, v132, vcc_lo
; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_perm_b32 v2, v2, v19, 0x7060302
; GFX11-NEXT: v_cndmask_b32_e32 v3, v119, v128, vcc_lo
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_3)
; GFX11-NEXT: v_perm_b32 v3, v3, v20, 0x7060302
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_lshlrev_b32_e32 v17, 16, v31
-; GFX11-NEXT: v_and_b32_e32 v18, 0xffff0000, v31
-; GFX11-NEXT: v_perm_b32 v2, v2, v19, 0x7060302
-; GFX11-NEXT: v_mul_f32_e32 v17, v32, v17
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_lshlrev_b32_e32 v17, 16, v32
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_dual_mul_f32 v17, v31, v17 :: v_dual_and_b32 v18, 0xffff0000, v32
; GFX11-NEXT: v_mul_f32_e32 v15, v15, v18
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_bfe_u32 v18, v17, 16, 1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX11-NEXT: v_bfe_u32 v19, v15, 16, 1
-; GFX11-NEXT: v_and_or_b32 v20, v17, s0, 0x400000
+; GFX11-NEXT: v_or_b32_e32 v20, 0x400000, v17
; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17
-; GFX11-NEXT: v_and_or_b32 v21, v15, s0, 0x400000
+; GFX11-NEXT: v_or_b32_e32 v21, 0x400000, v15
; GFX11-NEXT: v_add3_u32 v18, v18, v17, 0x7fff
; GFX11-NEXT: v_add3_u32 v19, v19, v15, 0x7fff
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
@@ -17194,9 +16869,8 @@ define bfloat @v_fdiv_bf16(bfloat %a, bfloat %b) {
; GFX8-NEXT: v_div_fixup_f32 v0, v2, v1, v0
; GFX8-NEXT: v_bfe_u32 v1, v0, 16, 1
; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v0
-; GFX8-NEXT: v_and_b32_e32 v2, 0xff800000, v0
; GFX8-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1
-; GFX8-NEXT: v_or_b32_e32 v2, 0x400000, v2
+; GFX8-NEXT: v_or_b32_e32 v2, 0x400000, v0
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
; GFX8-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
@@ -17220,9 +16894,8 @@ define bfloat @v_fdiv_bf16(bfloat %a, bfloat %b) {
; GFX9-NEXT: v_div_fmas_f32 v2, v2, v4, v5
; GFX9-NEXT: v_div_fixup_f32 v0, v2, v1, v0
; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1
-; GFX9-NEXT: v_and_b32_e32 v2, 0xff800000, v0
+; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v0
; GFX9-NEXT: v_add3_u32 v1, v1, v0, s4
-; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v2
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0
@@ -17235,7 +16908,6 @@ define bfloat @v_fdiv_bf16(bfloat %a, bfloat %b) {
; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX10-NEXT: v_div_scale_f32 v2, s4, v1, v1, v0
; GFX10-NEXT: v_div_scale_f32 v5, vcc_lo, v0, v1, v0
-; GFX10-NEXT: s_mov_b32 s4, 0xff800000
; GFX10-NEXT: v_rcp_f32_e32 v3, v2
; GFX10-NEXT: v_fma_f32 v4, -v2, v3, 1.0
; GFX10-NEXT: v_fmac_f32_e32 v3, v4, v3
@@ -17246,7 +16918,7 @@ define bfloat @v_fdiv_bf16(bfloat %a, bfloat %b) {
; GFX10-NEXT: v_div_fmas_f32 v2, v2, v3, v4
; GFX10-NEXT: v_div_fixup_f32 v0, v2, v1, v0
; GFX10-NEXT: v_bfe_u32 v1, v0, 16, 1
-; GFX10-NEXT: v_and_or_b32 v2, v0, s4, 0x400000
+; GFX10-NEXT: v_or_b32_e32 v2, 0x400000, v0
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
; GFX10-NEXT: v_add3_u32 v1, v1, v0, 0x7fff
; GFX10-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo
@@ -17258,7 +16930,6 @@ define bfloat @v_fdiv_bf16(bfloat %a, bfloat %b) {
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-NEXT: s_mov_b32 s0, 0xff800000
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_div_scale_f32 v2, null, v1, v1, v0
; GFX11-NEXT: v_rcp_f32_e32 v3, v2
@@ -17277,7 +16948,7 @@ define bfloat @v_fdiv_bf16(bfloat %a, bfloat %b) {
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_div_fixup_f32 v0, v2, v1, v0
; GFX11-NEXT: v_bfe_u32 v1, v0, 16, 1
-; GFX11-NEXT: v_and_or_b32 v2, v0, s0, 0x400000
+; GFX11-NEXT: v_or_b32_e32 v2, 0x400000, v0
; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_add3_u32 v1, v1, v0, 0x7fff
@@ -17637,8 +17308,7 @@ define bfloat @v_minnum_bf16(bfloat %a, bfloat %b) {
; GFX8-NEXT: v_bfe_u32 v1, v0, 16, 1
; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v0
; GFX8-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1
-; GFX8-NEXT: v_and_b32_e32 v2, 0xff800000, v0
-; GFX8-NEXT: v_or_b32_e32 v2, 0x400000, v2
+; GFX8-NEXT: v_or_b32_e32 v2, 0x400000, v0
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
; GFX8-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
@@ -17652,9 +17322,8 @@ define bfloat @v_minnum_bf16(bfloat %a, bfloat %b) {
; GFX9-NEXT: v_min_f32_e32 v0, v0, v1
; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1
; GFX9-NEXT: s_movk_i32 s4, 0x7fff
-; GFX9-NEXT: v_and_b32_e32 v2, 0xff800000, v0
; GFX9-NEXT: v_add3_u32 v1, v1, v0, s4
-; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v2
+; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v0
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0
@@ -17665,10 +17334,9 @@ define bfloat @v_minnum_bf16(bfloat %a, bfloat %b) {
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GFX10-NEXT: s_mov_b32 s4, 0xff800000
; GFX10-NEXT: v_min_f32_e32 v0, v0, v1
; GFX10-NEXT: v_bfe_u32 v1, v0, 16, 1
-; GFX10-NEXT: v_and_or_b32 v2, v0, s4, 0x400000
+; GFX10-NEXT: v_or_b32_e32 v2, 0x400000, v0
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
; GFX10-NEXT: v_add3_u32 v1, v1, v0, 0x7fff
; GFX10-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo
@@ -17680,11 +17348,10 @@ define bfloat @v_minnum_bf16(bfloat %a, bfloat %b) {
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GFX11-NEXT: s_mov_b32 s0, 0xff800000
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_min_f32_e32 v0, v0, v1
; GFX11-NEXT: v_bfe_u32 v1, v0, 16, 1
-; GFX11-NEXT: v_and_or_b32 v2, v0, s0, 0x400000
+; GFX11-NEXT: v_or_b32_e32 v2, 0x400000, v0
; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_add3_u32 v1, v1, v0, 0x7fff
@@ -17750,16 +17417,14 @@ define <2 x bfloat> @v_minnum_v2bf16(<2 x bfloat> %a, <2 x bfloat> %b) {
; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
-; GFX8-NEXT: v_and_b32_e32 v4, 0xff800000, v2
; GFX8-NEXT: v_min_f32_e32 v0, v0, v1
-; GFX8-NEXT: v_or_b32_e32 v4, 0x400000, v4
+; GFX8-NEXT: v_or_b32_e32 v4, 0x400000, v2
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
; GFX8-NEXT: v_bfe_u32 v1, v0, 16, 1
; GFX8-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v0
; GFX8-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1
-; GFX8-NEXT: v_and_b32_e32 v3, 0xff800000, v0
-; GFX8-NEXT: v_or_b32_e32 v3, 0x400000, v3
+; GFX8-NEXT: v_or_b32_e32 v3, 0x400000, v0
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
; GFX8-NEXT: v_cndmask_b32_e32 v0, v1, v3, vcc
; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
@@ -17772,20 +17437,18 @@ define <2 x bfloat> @v_minnum_v2bf16(<2 x bfloat> %a, <2 x bfloat> %b) {
; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v1
; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v0
; GFX9-NEXT: v_min_f32_e32 v2, v3, v2
-; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1
-; GFX9-NEXT: s_movk_i32 s4, 0x7fff
-; GFX9-NEXT: v_and_b32_e32 v4, 0xff800000, v2
; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1
+; GFX9-NEXT: s_movk_i32 s4, 0x7fff
+; GFX9-NEXT: v_min_f32_e32 v0, v0, v1
; GFX9-NEXT: v_add3_u32 v3, v3, v2, s4
-; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v4
+; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
-; GFX9-NEXT: v_min_f32_e32 v0, v0, v1
-; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1
-; GFX9-NEXT: v_and_b32_e32 v3, 0xff800000, v0
+; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
; GFX9-NEXT: v_add3_u32 v1, v1, v0, s4
-; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v3
+; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v0
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v3, vcc
; GFX9-NEXT: s_mov_b32 s4, 0x7060302
@@ -17799,14 +17462,13 @@ define <2 x bfloat> @v_minnum_v2bf16(<2 x bfloat> %a, <2 x bfloat> %b) {
; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v0
; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GFX10-NEXT: s_mov_b32 s4, 0xff800000
; GFX10-NEXT: v_min_f32_e32 v2, v3, v2
; GFX10-NEXT: v_min_f32_e32 v0, v0, v1
; GFX10-NEXT: v_bfe_u32 v1, v2, 16, 1
-; GFX10-NEXT: v_and_or_b32 v4, v2, s4, 0x400000
+; GFX10-NEXT: v_or_b32_e32 v4, 0x400000, v2
; GFX10-NEXT: v_bfe_u32 v3, v0, 16, 1
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
-; GFX10-NEXT: v_and_or_b32 v5, v0, s4, 0x400000
+; GFX10-NEXT: v_or_b32_e32 v5, 0x400000, v0
; GFX10-NEXT: v_add3_u32 v1, v1, v2, 0x7fff
; GFX10-NEXT: v_add3_u32 v3, v3, v0, 0x7fff
; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc_lo
@@ -17822,16 +17484,15 @@ define <2 x bfloat> @v_minnum_v2bf16(<2 x bfloat> %a, <2 x bfloat> %b) {
; GFX11-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v0
; GFX11-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GFX11-NEXT: s_mov_b32 s0, 0xff800000
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX11-NEXT: v_min_f32_e32 v0, v0, v1
; GFX11-NEXT: v_min_f32_e32 v2, v3, v2
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_bfe_u32 v3, v0, 16, 1
; GFX11-NEXT: v_bfe_u32 v1, v2, 16, 1
-; GFX11-NEXT: v_and_or_b32 v4, v2, s0, 0x400000
+; GFX11-NEXT: v_or_b32_e32 v4, 0x400000, v2
; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
-; GFX11-NEXT: v_and_or_b32 v5, v0, s0, 0x400000
+; GFX11-NEXT: v_or_b32_e32 v5, 0x400000, v0
; GFX11-NEXT: v_add3_u32 v3, v3, v0, 0x7fff
; GFX11-NEXT: v_add3_u32 v1, v1, v2, 0x7fff
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_4)
@@ -17913,8 +17574,7 @@ define <3 x bfloat> @v_minnum_v3bf16(<3 x bfloat> %a, <3 x bfloat> %b) {
; GFX8-NEXT: v_bfe_u32 v3, v1, 16, 1
; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v1
; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
-; GFX8-NEXT: v_and_b32_e32 v4, 0xff800000, v1
-; GFX8-NEXT: v_or_b32_e32 v4, 0x400000, v4
+; GFX8-NEXT: v_or_b32_e32 v4, 0x400000, v1
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
; GFX8-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc
; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v2
@@ -17926,16 +17586,14 @@ define <3 x bfloat> @v_minnum_v3bf16(<3 x bfloat> %a, <3 x bfloat> %b) {
; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX8-NEXT: v_add_u32_e32 v4, vcc, s4, v4
-; GFX8-NEXT: v_and_b32_e32 v5, 0xff800000, v3
; GFX8-NEXT: v_min_f32_e32 v0, v0, v2
-; GFX8-NEXT: v_or_b32_e32 v5, 0x400000, v5
+; GFX8-NEXT: v_or_b32_e32 v5, 0x400000, v3
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
; GFX8-NEXT: v_bfe_u32 v2, v0, 16, 1
; GFX8-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc
; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v0
; GFX8-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2
-; GFX8-NEXT: v_and_b32_e32 v4, 0xff800000, v0
-; GFX8-NEXT: v_or_b32_e32 v4, 0x400000, v4
+; GFX8-NEXT: v_or_b32_e32 v4, 0x400000, v0
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc
; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
@@ -17951,27 +17609,24 @@ define <3 x bfloat> @v_minnum_v3bf16(<3 x bfloat> %a, <3 x bfloat> %b) {
; GFX9-NEXT: v_min_f32_e32 v1, v1, v3
; GFX9-NEXT: v_bfe_u32 v3, v1, 16, 1
; GFX9-NEXT: s_movk_i32 s4, 0x7fff
-; GFX9-NEXT: v_and_b32_e32 v4, 0xff800000, v1
; GFX9-NEXT: v_add3_u32 v3, v3, v1, s4
-; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v4
+; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v1
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc
; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v2
; GFX9-NEXT: v_lshlrev_b32_e32 v4, 16, v0
; GFX9-NEXT: v_min_f32_e32 v3, v4, v3
-; GFX9-NEXT: v_bfe_u32 v4, v3, 16, 1
-; GFX9-NEXT: v_and_b32_e32 v5, 0xff800000, v3
; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX9-NEXT: v_bfe_u32 v4, v3, 16, 1
+; GFX9-NEXT: v_min_f32_e32 v0, v0, v2
; GFX9-NEXT: v_add3_u32 v4, v4, v3, s4
-; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v5
+; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v3
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
-; GFX9-NEXT: v_min_f32_e32 v0, v0, v2
-; GFX9-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc
; GFX9-NEXT: v_bfe_u32 v2, v0, 16, 1
-; GFX9-NEXT: v_and_b32_e32 v4, 0xff800000, v0
+; GFX9-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc
; GFX9-NEXT: v_add3_u32 v2, v2, v0, s4
-; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v4
+; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v0
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc
; GFX9-NEXT: s_mov_b32 s4, 0x7060302
@@ -17989,18 +17644,17 @@ define <3 x bfloat> @v_minnum_v3bf16(<3 x bfloat> %a, <3 x bfloat> %b) {
; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX10-NEXT: v_min_f32_e32 v4, v5, v4
-; GFX10-NEXT: s_mov_b32 s4, 0xff800000
; GFX10-NEXT: v_min_f32_e32 v0, v0, v2
; GFX10-NEXT: v_min_f32_e32 v1, v1, v3
; GFX10-NEXT: v_bfe_u32 v2, v4, 16, 1
-; GFX10-NEXT: v_and_or_b32 v7, v4, s4, 0x400000
+; GFX10-NEXT: v_or_b32_e32 v7, 0x400000, v4
; GFX10-NEXT: v_bfe_u32 v5, v0, 16, 1
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
; GFX10-NEXT: v_bfe_u32 v3, v1, 16, 1
; GFX10-NEXT: v_add3_u32 v2, v2, v4, 0x7fff
-; GFX10-NEXT: v_and_or_b32 v8, v0, s4, 0x400000
+; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v0
; GFX10-NEXT: v_add3_u32 v5, v5, v0, 0x7fff
-; GFX10-NEXT: v_and_or_b32 v6, v1, s4, 0x400000
+; GFX10-NEXT: v_or_b32_e32 v6, 0x400000, v1
; GFX10-NEXT: v_add3_u32 v3, v3, v1, 0x7fff
; GFX10-NEXT: v_cndmask_b32_e32 v2, v2, v7, vcc_lo
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
@@ -18100,17 +17754,15 @@ define <4 x bfloat> @v_minnum_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b) {
; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX8-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5
-; GFX8-NEXT: v_and_b32_e32 v6, 0xff800000, v4
; GFX8-NEXT: v_min_f32_e32 v1, v1, v3
-; GFX8-NEXT: v_or_b32_e32 v6, 0x400000, v6
+; GFX8-NEXT: v_or_b32_e32 v6, 0x400000, v4
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
; GFX8-NEXT: v_bfe_u32 v3, v1, 16, 1
; GFX8-NEXT: s_movk_i32 s4, 0x7fff
; GFX8-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc
; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v1
; GFX8-NEXT: v_add_u32_e32 v3, vcc, s4, v3
-; GFX8-NEXT: v_and_b32_e32 v5, 0xff800000, v1
-; GFX8-NEXT: v_or_b32_e32 v5, 0x400000, v5
+; GFX8-NEXT: v_or_b32_e32 v5, 0x400000, v1
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
; GFX8-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc
; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v2
@@ -18121,16 +17773,14 @@ define <4 x bfloat> @v_minnum_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b) {
; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX8-NEXT: v_add_u32_e32 v5, vcc, s4, v5
-; GFX8-NEXT: v_and_b32_e32 v6, 0xff800000, v3
; GFX8-NEXT: v_min_f32_e32 v0, v0, v2
-; GFX8-NEXT: v_or_b32_e32 v6, 0x400000, v6
+; GFX8-NEXT: v_or_b32_e32 v6, 0x400000, v3
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
; GFX8-NEXT: v_bfe_u32 v2, v0, 16, 1
; GFX8-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc
; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v0
; GFX8-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2
-; GFX8-NEXT: v_and_b32_e32 v5, 0xff800000, v0
-; GFX8-NEXT: v_or_b32_e32 v5, 0x400000, v5
+; GFX8-NEXT: v_or_b32_e32 v5, 0x400000, v0
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v5, vcc
; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1
@@ -18145,38 +17795,34 @@ define <4 x bfloat> @v_minnum_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b) {
; GFX9-NEXT: v_lshlrev_b32_e32 v4, 16, v3
; GFX9-NEXT: v_lshlrev_b32_e32 v5, 16, v1
; GFX9-NEXT: v_min_f32_e32 v4, v5, v4
-; GFX9-NEXT: v_bfe_u32 v5, v4, 16, 1
-; GFX9-NEXT: s_movk_i32 s4, 0x7fff
-; GFX9-NEXT: v_and_b32_e32 v6, 0xff800000, v4
; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GFX9-NEXT: v_bfe_u32 v5, v4, 16, 1
+; GFX9-NEXT: s_movk_i32 s4, 0x7fff
+; GFX9-NEXT: v_min_f32_e32 v1, v1, v3
; GFX9-NEXT: v_add3_u32 v5, v5, v4, s4
-; GFX9-NEXT: v_or_b32_e32 v6, 0x400000, v6
+; GFX9-NEXT: v_or_b32_e32 v6, 0x400000, v4
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
-; GFX9-NEXT: v_min_f32_e32 v1, v1, v3
-; GFX9-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc
; GFX9-NEXT: v_bfe_u32 v3, v1, 16, 1
-; GFX9-NEXT: v_and_b32_e32 v5, 0xff800000, v1
+; GFX9-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc
; GFX9-NEXT: v_add3_u32 v3, v3, v1, s4
-; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v5
+; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v1
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc
; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v2
; GFX9-NEXT: v_lshlrev_b32_e32 v5, 16, v0
; GFX9-NEXT: v_min_f32_e32 v3, v5, v3
-; GFX9-NEXT: v_bfe_u32 v5, v3, 16, 1
-; GFX9-NEXT: v_and_b32_e32 v6, 0xff800000, v3
; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX9-NEXT: v_bfe_u32 v5, v3, 16, 1
+; GFX9-NEXT: v_min_f32_e32 v0, v0, v2
; GFX9-NEXT: v_add3_u32 v5, v5, v3, s4
-; GFX9-NEXT: v_or_b32_e32 v6, 0x400000, v6
+; GFX9-NEXT: v_or_b32_e32 v6, 0x400000, v3
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
-; GFX9-NEXT: v_min_f32_e32 v0, v0, v2
-; GFX9-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc
; GFX9-NEXT: v_bfe_u32 v2, v0, 16, 1
-; GFX9-NEXT: v_and_b32_e32 v5, 0xff800000, v0
+; GFX9-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc
; GFX9-NEXT: v_add3_u32 v2, v2, v0, s4
-; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v5
+; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v0
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v5, vcc
; GFX9-NEXT: s_mov_b32 s4, 0x7060302
@@ -18197,31 +17843,30 @@ define <4 x bfloat> @v_minnum_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b) {
; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX10-NEXT: v_min_f32_e32 v1, v1, v3
-; GFX10-NEXT: v_min_f32_e32 v5, v7, v6
-; GFX10-NEXT: v_bfe_u32 v3, v4, 16, 1
-; GFX10-NEXT: s_mov_b32 s4, 0xff800000
+; GFX10-NEXT: v_min_f32_e32 v3, v7, v6
+; GFX10-NEXT: v_bfe_u32 v5, v4, 16, 1
+; GFX10-NEXT: v_or_b32_e32 v7, 0x400000, v4
; GFX10-NEXT: v_min_f32_e32 v0, v0, v2
-; GFX10-NEXT: v_and_or_b32 v6, v4, s4, 0x400000
-; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1
-; GFX10-NEXT: v_add3_u32 v3, v3, v4, 0x7fff
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
-; GFX10-NEXT: v_bfe_u32 v8, v0, 16, 1
+; GFX10-NEXT: v_bfe_u32 v6, v3, 16, 1
+; GFX10-NEXT: v_add3_u32 v5, v5, v4, 0x7fff
; GFX10-NEXT: v_bfe_u32 v2, v1, 16, 1
-; GFX10-NEXT: v_add3_u32 v4, v7, v5, 0x7fff
-; GFX10-NEXT: v_and_or_b32 v9, v1, s4, 0x400000
-; GFX10-NEXT: v_cndmask_b32_e32 v3, v3, v6, vcc_lo
-; GFX10-NEXT: v_and_or_b32 v6, v5, s4, 0x400000
-; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX10-NEXT: v_bfe_u32 v8, v0, 16, 1
+; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v1
+; GFX10-NEXT: v_add3_u32 v6, v6, v3, 0x7fff
+; GFX10-NEXT: v_cndmask_b32_e32 v4, v5, v7, vcc_lo
+; GFX10-NEXT: v_or_b32_e32 v5, 0x400000, v3
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
; GFX10-NEXT: v_add3_u32 v7, v8, v0, 0x7fff
-; GFX10-NEXT: v_and_or_b32 v8, v0, s4, 0x400000
+; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v0
; GFX10-NEXT: v_add3_u32 v2, v2, v1, 0x7fff
-; GFX10-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e32 v3, v6, v5, vcc_lo
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
; GFX10-NEXT: v_cndmask_b32_e32 v0, v7, v8, vcc_lo
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
-; GFX10-NEXT: v_perm_b32 v0, v0, v4, 0x7060302
+; GFX10-NEXT: v_perm_b32 v0, v0, v3, 0x7060302
; GFX10-NEXT: v_cndmask_b32_e32 v1, v2, v9, vcc_lo
-; GFX10-NEXT: v_perm_b32 v1, v1, v3, 0x7060302
+; GFX10-NEXT: v_perm_b32 v1, v1, v4, 0x7060302
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: v_minnum_v4bf16:
@@ -18229,45 +17874,42 @@ define <4 x bfloat> @v_minnum_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b) {
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_lshlrev_b32_e32 v6, 16, v2
; GFX11-NEXT: v_lshlrev_b32_e32 v7, 16, v0
-; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v3
; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX11-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v3
; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v1
-; GFX11-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; GFX11-NEXT: s_mov_b32 s0, 0xff800000
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_dual_min_f32 v0, v0, v2 :: v_dual_and_b32 v3, 0xffff0000, v3
-; GFX11-NEXT: v_min_f32_e32 v4, v5, v4
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_dual_min_f32 v0, v0, v2 :: v_dual_and_b32 v1, 0xffff0000, v1
+; GFX11-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
; GFX11-NEXT: v_bfe_u32 v8, v0, 16, 1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_min_f32_e32 v1, v1, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3)
-; GFX11-NEXT: v_bfe_u32 v3, v4, 16, 1
-; GFX11-NEXT: v_min_f32_e32 v5, v7, v6
-; GFX11-NEXT: v_and_or_b32 v6, v4, s0, 0x400000
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX11-NEXT: v_dual_min_f32 v3, v7, v6 :: v_dual_min_f32 v4, v5, v4
; GFX11-NEXT: v_bfe_u32 v2, v1, 16, 1
-; GFX11-NEXT: v_add3_u32 v3, v3, v4, 0x7fff
-; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1
-; GFX11-NEXT: v_and_or_b32 v9, v1, s0, 0x400000
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT: v_bfe_u32 v6, v3, 16, 1
+; GFX11-NEXT: v_bfe_u32 v5, v4, 16, 1
+; GFX11-NEXT: v_or_b32_e32 v7, 0x400000, v4
+; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
; GFX11-NEXT: v_add3_u32 v2, v2, v1, 0x7fff
-; GFX11-NEXT: v_cndmask_b32_e32 v3, v3, v6, vcc_lo
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_add3_u32 v4, v7, v5, 0x7fff
-; GFX11-NEXT: v_and_or_b32 v6, v5, s0, 0x400000
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-NEXT: v_add3_u32 v6, v6, v3, 0x7fff
+; GFX11-NEXT: v_add3_u32 v5, v5, v4, 0x7fff
+; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_4)
+; GFX11-NEXT: v_cndmask_b32_e32 v4, v5, v7, vcc_lo
+; GFX11-NEXT: v_or_b32_e32 v5, 0x400000, v3
+; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
; GFX11-NEXT: v_add3_u32 v7, v8, v0, 0x7fff
-; GFX11-NEXT: v_and_or_b32 v8, v0, s0, 0x400000
-; GFX11-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc_lo
+; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v0
+; GFX11-NEXT: v_cndmask_b32_e32 v3, v6, v5, vcc_lo
; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_cndmask_b32_e32 v0, v7, v8, vcc_lo
; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-NEXT: v_perm_b32 v0, v0, v3, 0x7060302
; GFX11-NEXT: v_cndmask_b32_e32 v1, v2, v9, vcc_lo
-; GFX11-NEXT: v_perm_b32 v0, v0, v4, 0x7060302
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX11-NEXT: v_perm_b32 v1, v1, v3, 0x7060302
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_perm_b32 v1, v1, v4, 0x7060302
; GFX11-NEXT: s_setpc_b64 s[30:31]
%op = call <4 x bfloat> @llvm.minnum.v4bf16(<4 x bfloat> %a, <4 x bfloat> %b)
ret <4 x bfloat> %op
@@ -18423,17 +18065,15 @@ define <8 x bfloat> @v_minnum_v8bf16(<8 x bfloat> %a, <8 x bfloat> %b) {
; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9
-; GFX8-NEXT: v_and_b32_e32 v10, 0xff800000, v8
; GFX8-NEXT: v_min_f32_e32 v3, v3, v7
-; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v10
+; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v8
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v8, v8
; GFX8-NEXT: v_bfe_u32 v7, v3, 16, 1
; GFX8-NEXT: s_movk_i32 s4, 0x7fff
; GFX8-NEXT: v_cndmask_b32_e32 v8, v9, v10, vcc
; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v3
; GFX8-NEXT: v_add_u32_e32 v7, vcc, s4, v7
-; GFX8-NEXT: v_and_b32_e32 v9, 0xff800000, v3
-; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v9
+; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v3
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
; GFX8-NEXT: v_cndmask_b32_e32 v3, v7, v9, vcc
; GFX8-NEXT: v_lshlrev_b32_e32 v7, 16, v6
@@ -18444,16 +18084,14 @@ define <8 x bfloat> @v_minnum_v8bf16(<8 x bfloat> %a, <8 x bfloat> %b) {
; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX8-NEXT: v_add_u32_e32 v9, vcc, s4, v9
-; GFX8-NEXT: v_and_b32_e32 v10, 0xff800000, v7
; GFX8-NEXT: v_min_f32_e32 v2, v2, v6
-; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v10
+; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v7
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v7, v7
; GFX8-NEXT: v_bfe_u32 v6, v2, 16, 1
; GFX8-NEXT: v_cndmask_b32_e32 v7, v9, v10, vcc
; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v2
; GFX8-NEXT: v_add_u32_e32 v6, vcc, s4, v6
-; GFX8-NEXT: v_and_b32_e32 v9, 0xff800000, v2
-; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v9
+; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v2
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
; GFX8-NEXT: v_cndmask_b32_e32 v2, v6, v9, vcc
; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v5
@@ -18464,16 +18102,14 @@ define <8 x bfloat> @v_minnum_v8bf16(<8 x bfloat> %a, <8 x bfloat> %b) {
; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX8-NEXT: v_add_u32_e32 v9, vcc, s4, v9
-; GFX8-NEXT: v_and_b32_e32 v10, 0xff800000, v6
; GFX8-NEXT: v_min_f32_e32 v1, v1, v5
-; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v10
+; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v6
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
; GFX8-NEXT: v_bfe_u32 v5, v1, 16, 1
; GFX8-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc
; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v1
; GFX8-NEXT: v_add_u32_e32 v5, vcc, s4, v5
-; GFX8-NEXT: v_and_b32_e32 v9, 0xff800000, v1
-; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v9
+; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v1
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
; GFX8-NEXT: v_cndmask_b32_e32 v1, v5, v9, vcc
; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v4
@@ -18484,16 +18120,14 @@ define <8 x bfloat> @v_minnum_v8bf16(<8 x bfloat> %a, <8 x bfloat> %b) {
; GFX8-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX8-NEXT: v_add_u32_e32 v9, vcc, s4, v9
-; GFX8-NEXT: v_and_b32_e32 v10, 0xff800000, v5
; GFX8-NEXT: v_min_f32_e32 v0, v0, v4
-; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v10
+; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v5
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
; GFX8-NEXT: v_bfe_u32 v4, v0, 16, 1
; GFX8-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc
; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v0
; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4
-; GFX8-NEXT: v_and_b32_e32 v9, 0xff800000, v0
-; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v9
+; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v0
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
; GFX8-NEXT: v_cndmask_b32_e32 v0, v4, v9, vcc
; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v3
@@ -18512,74 +18146,66 @@ define <8 x bfloat> @v_minnum_v8bf16(<8 x bfloat> %a, <8 x bfloat> %b) {
; GFX9-NEXT: v_lshlrev_b32_e32 v8, 16, v7
; GFX9-NEXT: v_lshlrev_b32_e32 v9, 16, v3
; GFX9-NEXT: v_min_f32_e32 v8, v9, v8
-; GFX9-NEXT: v_bfe_u32 v9, v8, 16, 1
-; GFX9-NEXT: s_movk_i32 s4, 0x7fff
-; GFX9-NEXT: v_and_b32_e32 v10, 0xff800000, v8
; GFX9-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; GFX9-NEXT: v_bfe_u32 v9, v8, 16, 1
+; GFX9-NEXT: s_movk_i32 s4, 0x7fff
+; GFX9-NEXT: v_min_f32_e32 v3, v3, v7
; GFX9-NEXT: v_add3_u32 v9, v9, v8, s4
-; GFX9-NEXT: v_or_b32_e32 v10, 0x400000, v10
+; GFX9-NEXT: v_or_b32_e32 v10, 0x400000, v8
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v8, v8
-; GFX9-NEXT: v_min_f32_e32 v3, v3, v7
-; GFX9-NEXT: v_cndmask_b32_e32 v8, v9, v10, vcc
; GFX9-NEXT: v_bfe_u32 v7, v3, 16, 1
-; GFX9-NEXT: v_and_b32_e32 v9, 0xff800000, v3
+; GFX9-NEXT: v_cndmask_b32_e32 v8, v9, v10, vcc
; GFX9-NEXT: v_add3_u32 v7, v7, v3, s4
-; GFX9-NEXT: v_or_b32_e32 v9, 0x400000, v9
+; GFX9-NEXT: v_or_b32_e32 v9, 0x400000, v3
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
; GFX9-NEXT: v_cndmask_b32_e32 v3, v7, v9, vcc
; GFX9-NEXT: v_lshlrev_b32_e32 v7, 16, v6
; GFX9-NEXT: v_lshlrev_b32_e32 v9, 16, v2
; GFX9-NEXT: v_min_f32_e32 v7, v9, v7
-; GFX9-NEXT: v_bfe_u32 v9, v7, 16, 1
-; GFX9-NEXT: v_and_b32_e32 v10, 0xff800000, v7
; GFX9-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX9-NEXT: v_bfe_u32 v9, v7, 16, 1
+; GFX9-NEXT: v_min_f32_e32 v2, v2, v6
; GFX9-NEXT: v_add3_u32 v9, v9, v7, s4
-; GFX9-NEXT: v_or_b32_e32 v10, 0x400000, v10
+; GFX9-NEXT: v_or_b32_e32 v10, 0x400000, v7
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v7, v7
-; GFX9-NEXT: v_min_f32_e32 v2, v2, v6
-; GFX9-NEXT: v_cndmask_b32_e32 v7, v9, v10, vcc
; GFX9-NEXT: v_bfe_u32 v6, v2, 16, 1
-; GFX9-NEXT: v_and_b32_e32 v9, 0xff800000, v2
+; GFX9-NEXT: v_cndmask_b32_e32 v7, v9, v10, vcc
; GFX9-NEXT: v_add3_u32 v6, v6, v2, s4
-; GFX9-NEXT: v_or_b32_e32 v9, 0x400000, v9
+; GFX9-NEXT: v_or_b32_e32 v9, 0x400000, v2
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
; GFX9-NEXT: v_cndmask_b32_e32 v2, v6, v9, vcc
; GFX9-NEXT: v_lshlrev_b32_e32 v6, 16, v5
; GFX9-NEXT: v_lshlrev_b32_e32 v9, 16, v1
; GFX9-NEXT: v_min_f32_e32 v6, v9, v6
-; GFX9-NEXT: v_bfe_u32 v9, v6, 16, 1
-; GFX9-NEXT: v_and_b32_e32 v10, 0xff800000, v6
; GFX9-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GFX9-NEXT: v_bfe_u32 v9, v6, 16, 1
+; GFX9-NEXT: v_min_f32_e32 v1, v1, v5
; GFX9-NEXT: v_add3_u32 v9, v9, v6, s4
-; GFX9-NEXT: v_or_b32_e32 v10, 0x400000, v10
+; GFX9-NEXT: v_or_b32_e32 v10, 0x400000, v6
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
-; GFX9-NEXT: v_min_f32_e32 v1, v1, v5
-; GFX9-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc
; GFX9-NEXT: v_bfe_u32 v5, v1, 16, 1
-; GFX9-NEXT: v_and_b32_e32 v9, 0xff800000, v1
+; GFX9-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc
; GFX9-NEXT: v_add3_u32 v5, v5, v1, s4
-; GFX9-NEXT: v_or_b32_e32 v9, 0x400000, v9
+; GFX9-NEXT: v_or_b32_e32 v9, 0x400000, v1
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v9, vcc
; GFX9-NEXT: v_lshlrev_b32_e32 v5, 16, v4
; GFX9-NEXT: v_lshlrev_b32_e32 v9, 16, v0
; GFX9-NEXT: v_min_f32_e32 v5, v9, v5
-; GFX9-NEXT: v_bfe_u32 v9, v5, 16, 1
-; GFX9-NEXT: v_and_b32_e32 v10, 0xff800000, v5
; GFX9-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX9-NEXT: v_bfe_u32 v9, v5, 16, 1
+; GFX9-NEXT: v_min_f32_e32 v0, v0, v4
; GFX9-NEXT: v_add3_u32 v9, v9, v5, s4
-; GFX9-NEXT: v_or_b32_e32 v10, 0x400000, v10
+; GFX9-NEXT: v_or_b32_e32 v10, 0x400000, v5
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
-; GFX9-NEXT: v_min_f32_e32 v0, v0, v4
-; GFX9-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc
; GFX9-NEXT: v_bfe_u32 v4, v0, 16, 1
-; GFX9-NEXT: v_and_b32_e32 v9, 0xff800000, v0
+; GFX9-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc
; GFX9-NEXT: v_add3_u32 v4, v4, v0, s4
-; GFX9-NEXT: v_or_b32_e32 v9, 0x400000, v9
+; GFX9-NEXT: v_or_b32_e32 v9, 0x400000, v0
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v9, vcc
; GFX9-NEXT: s_mov_b32 s4, 0x7060302
@@ -18596,62 +18222,61 @@ define <8 x bfloat> @v_minnum_v8bf16(<8 x bfloat> %a, <8 x bfloat> %b) {
; GFX10-NEXT: v_lshlrev_b32_e32 v9, 16, v3
; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
-; GFX10-NEXT: v_lshlrev_b32_e32 v11, 16, v2
+; GFX10-NEXT: v_lshlrev_b32_e32 v10, 16, v2
; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX10-NEXT: v_min_f32_e32 v8, v9, v8
; GFX10-NEXT: v_lshlrev_b32_e32 v9, 16, v6
-; GFX10-NEXT: v_min_f32_e32 v3, v3, v7
; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
-; GFX10-NEXT: s_mov_b32 s4, 0xff800000
-; GFX10-NEXT: v_bfe_u32 v10, v8, 16, 1
-; GFX10-NEXT: v_and_or_b32 v7, v8, s4, 0x400000
-; GFX10-NEXT: v_min_f32_e32 v9, v11, v9
-; GFX10-NEXT: v_bfe_u32 v11, v3, 16, 1
+; GFX10-NEXT: v_min_f32_e32 v3, v3, v7
+; GFX10-NEXT: v_lshlrev_b32_e32 v15, 16, v0
+; GFX10-NEXT: v_bfe_u32 v11, v8, 16, 1
+; GFX10-NEXT: v_min_f32_e32 v7, v10, v9
+; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v8
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8
-; GFX10-NEXT: v_add3_u32 v10, v10, v8, 0x7fff
; GFX10-NEXT: v_min_f32_e32 v2, v2, v6
-; GFX10-NEXT: v_bfe_u32 v8, v9, 16, 1
+; GFX10-NEXT: v_add3_u32 v10, v11, v8, 0x7fff
+; GFX10-NEXT: v_bfe_u32 v11, v3, 16, 1
+; GFX10-NEXT: v_bfe_u32 v12, v7, 16, 1
; GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v5
-; GFX10-NEXT: v_and_or_b32 v12, v9, s4, 0x400000
-; GFX10-NEXT: v_cndmask_b32_e32 v7, v10, v7, vcc_lo
-; GFX10-NEXT: v_add3_u32 v10, v11, v3, 0x7fff
-; GFX10-NEXT: v_lshlrev_b32_e32 v11, 16, v1
; GFX10-NEXT: v_bfe_u32 v13, v2, 16, 1
-; GFX10-NEXT: v_add3_u32 v8, v8, v9, 0x7fff
-; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9
+; GFX10-NEXT: v_cndmask_b32_e32 v8, v10, v9, vcc_lo
+; GFX10-NEXT: v_lshlrev_b32_e32 v10, 16, v1
+; GFX10-NEXT: v_add3_u32 v9, v11, v3, 0x7fff
+; GFX10-NEXT: v_add3_u32 v11, v12, v7, 0x7fff
+; GFX10-NEXT: v_or_b32_e32 v12, 0x400000, v7
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
+; GFX10-NEXT: v_min_f32_e32 v6, v10, v6
+; GFX10-NEXT: v_add3_u32 v10, v13, v2, 0x7fff
; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
-; GFX10-NEXT: v_min_f32_e32 v6, v11, v6
-; GFX10-NEXT: v_add3_u32 v9, v13, v2, 0x7fff
; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX10-NEXT: v_lshlrev_b32_e32 v13, 16, v4
-; GFX10-NEXT: v_lshlrev_b32_e32 v15, 16, v0
-; GFX10-NEXT: v_cndmask_b32_e32 v8, v8, v12, vcc_lo
-; GFX10-NEXT: v_and_or_b32 v11, v2, s4, 0x400000
+; GFX10-NEXT: v_cndmask_b32_e32 v7, v11, v12, vcc_lo
+; GFX10-NEXT: v_or_b32_e32 v11, 0x400000, v2
; GFX10-NEXT: v_bfe_u32 v12, v6, 16, 1
; GFX10-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
; GFX10-NEXT: v_min_f32_e32 v1, v1, v5
; GFX10-NEXT: v_min_f32_e32 v5, v15, v13
-; GFX10-NEXT: v_and_or_b32 v14, v3, s4, 0x400000
+; GFX10-NEXT: v_or_b32_e32 v14, 0x400000, v3
; GFX10-NEXT: v_min_f32_e32 v0, v0, v4
-; GFX10-NEXT: v_cndmask_b32_e32 v2, v9, v11, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e32 v2, v10, v11, vcc_lo
; GFX10-NEXT: v_add3_u32 v4, v12, v6, 0x7fff
-; GFX10-NEXT: v_and_or_b32 v9, v6, s4, 0x400000
+; GFX10-NEXT: v_or_b32_e32 v10, 0x400000, v6
; GFX10-NEXT: v_bfe_u32 v11, v1, 16, 1
; GFX10-NEXT: v_bfe_u32 v12, v5, 16, 1
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
; GFX10-NEXT: v_bfe_u32 v13, v0, 16, 1
-; GFX10-NEXT: v_and_or_b32 v15, v1, s4, 0x400000
+; GFX10-NEXT: v_or_b32_e32 v15, 0x400000, v1
; GFX10-NEXT: v_add3_u32 v6, v11, v1, 0x7fff
-; GFX10-NEXT: v_and_or_b32 v11, v5, s4, 0x400000
-; GFX10-NEXT: v_cndmask_b32_e32 v4, v4, v9, vcc_lo
-; GFX10-NEXT: v_add3_u32 v9, v12, v5, 0x7fff
+; GFX10-NEXT: v_or_b32_e32 v11, 0x400000, v5
+; GFX10-NEXT: v_cndmask_b32_e32 v4, v4, v10, vcc_lo
+; GFX10-NEXT: v_add3_u32 v10, v12, v5, 0x7fff
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
; GFX10-NEXT: v_add3_u32 v12, v13, v0, 0x7fff
-; GFX10-NEXT: v_and_or_b32 v13, v0, s4, 0x400000
-; GFX10-NEXT: v_perm_b32 v2, v2, v8, 0x7060302
-; GFX10-NEXT: v_cndmask_b32_e32 v5, v9, v11, vcc_lo
+; GFX10-NEXT: v_or_b32_e32 v13, 0x400000, v0
+; GFX10-NEXT: v_perm_b32 v2, v2, v7, 0x7060302
+; GFX10-NEXT: v_cndmask_b32_e32 v5, v10, v11, vcc_lo
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
; GFX10-NEXT: v_cndmask_b32_e32 v0, v12, v13, vcc_lo
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
@@ -18659,81 +18284,80 @@ define <8 x bfloat> @v_minnum_v8bf16(<8 x bfloat> %a, <8 x bfloat> %b) {
; GFX10-NEXT: v_cndmask_b32_e32 v1, v6, v15, vcc_lo
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
; GFX10-NEXT: v_perm_b32 v1, v1, v4, 0x7060302
-; GFX10-NEXT: v_cndmask_b32_e32 v3, v10, v14, vcc_lo
-; GFX10-NEXT: v_perm_b32 v3, v3, v7, 0x7060302
+; GFX10-NEXT: v_cndmask_b32_e32 v3, v9, v14, vcc_lo
+; GFX10-NEXT: v_perm_b32 v3, v3, v8, 0x7060302
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: v_minnum_v8bf16:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_lshlrev_b32_e32 v15, 16, v0
-; GFX11-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GFX11-NEXT: v_lshlrev_b32_e32 v11, 16, v2
+; GFX11-NEXT: v_lshlrev_b32_e32 v10, 16, v2
; GFX11-NEXT: v_lshlrev_b32_e32 v8, 16, v7
; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX11-NEXT: v_lshlrev_b32_e32 v9, 16, v3
-; GFX11-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
-; GFX11-NEXT: s_mov_b32 s0, 0xff800000
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_dual_min_f32 v8, v9, v8 :: v_dual_lshlrev_b32 v9, 16, v6
-; GFX11-NEXT: v_bfe_u32 v10, v8, 16, 1
+; GFX11-NEXT: v_lshlrev_b32_e32 v15, 16, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_dual_min_f32 v8, v9, v8 :: v_dual_and_b32 v7, 0xffff0000, v7
+; GFX11-NEXT: v_lshlrev_b32_e32 v9, 16, v6
; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_min_f32_e32 v9, v11, v9
-; GFX11-NEXT: v_add3_u32 v10, v10, v8, 0x7fff
-; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_and_or_b32 v12, v9, s0, 0x400000
-; GFX11-NEXT: v_min_f32_e32 v2, v2, v6
-; GFX11-NEXT: v_lshlrev_b32_e32 v6, 16, v5
; GFX11-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_bfe_u32 v13, v2, 16, 1
+; GFX11-NEXT: v_bfe_u32 v11, v8, 16, 1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4)
; GFX11-NEXT: v_min_f32_e32 v3, v3, v7
-; GFX11-NEXT: v_and_or_b32 v7, v8, s0, 0x400000
-; GFX11-NEXT: v_bfe_u32 v8, v9, 16, 1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_cndmask_b32_e32 v7, v10, v7, vcc_lo
-; GFX11-NEXT: v_add3_u32 v8, v8, v9, 0x7fff
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9
-; GFX11-NEXT: v_add3_u32 v9, v13, v2, 0x7fff
-; GFX11-NEXT: v_lshlrev_b32_e32 v13, 16, v4
-; GFX11-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
+; GFX11-NEXT: v_min_f32_e32 v7, v10, v9
+; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v8
+; GFX11-NEXT: v_add3_u32 v10, v11, v8, 0x7fff
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX11-NEXT: v_bfe_u32 v11, v3, 16, 1
-; GFX11-NEXT: v_cndmask_b32_e32 v8, v8, v12, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
-; GFX11-NEXT: v_and_or_b32 v14, v3, s0, 0x400000
-; GFX11-NEXT: v_min_f32_e32 v0, v0, v4
-; GFX11-NEXT: v_add3_u32 v10, v11, v3, 0x7fff
-; GFX11-NEXT: v_lshlrev_b32_e32 v11, 16, v1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_dual_min_f32 v6, v11, v6 :: v_dual_and_b32 v1, 0xffff0000, v1
-; GFX11-NEXT: v_and_or_b32 v11, v2, s0, 0x400000
+; GFX11-NEXT: v_bfe_u32 v12, v7, 16, 1
+; GFX11-NEXT: v_or_b32_e32 v14, 0x400000, v3
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-NEXT: v_cndmask_b32_e32 v8, v10, v9, vcc_lo
+; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
+; GFX11-NEXT: v_add3_u32 v9, v11, v3, 0x7fff
+; GFX11-NEXT: v_add3_u32 v11, v12, v7, 0x7fff
+; GFX11-NEXT: v_or_b32_e32 v12, 0x400000, v7
+; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
+; GFX11-NEXT: v_lshlrev_b32_e32 v10, 16, v1
+; GFX11-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-NEXT: v_dual_cndmask_b32 v7, v11, v12 :: v_dual_min_f32 v2, v2, v6
+; GFX11-NEXT: v_lshlrev_b32_e32 v6, 16, v5
; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
+; GFX11-NEXT: v_bfe_u32 v13, v2, 16, 1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX11-NEXT: v_min_f32_e32 v6, v10, v6
+; GFX11-NEXT: v_or_b32_e32 v11, 0x400000, v2
+; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX11-NEXT: v_add3_u32 v10, v13, v2, 0x7fff
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_bfe_u32 v12, v6, 16, 1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3)
-; GFX11-NEXT: v_cndmask_b32_e32 v2, v9, v11, vcc_lo
-; GFX11-NEXT: v_and_or_b32 v9, v6, s0, 0x400000
+; GFX11-NEXT: v_cndmask_b32_e32 v2, v10, v11, vcc_lo
+; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v6
; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
-; GFX11-NEXT: v_min_f32_e32 v1, v1, v5
+; GFX11-NEXT: v_lshlrev_b32_e32 v13, 16, v4
+; GFX11-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
+; GFX11-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GFX11-NEXT: v_perm_b32 v2, v2, v7, 0x7060302
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_min_f32_e32 v0, v0, v4
; GFX11-NEXT: v_add3_u32 v4, v12, v6, 0x7fff
-; GFX11-NEXT: v_perm_b32 v2, v2, v8, 0x7060302
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_bfe_u32 v11, v1, 16, 1
-; GFX11-NEXT: v_cndmask_b32_e32 v4, v4, v9, vcc_lo
+; GFX11-NEXT: v_dual_min_f32 v1, v1, v5 :: v_dual_cndmask_b32 v4, v4, v10
; GFX11-NEXT: v_min_f32_e32 v5, v15, v13
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX11-NEXT: v_bfe_u32 v11, v1, 16, 1
; GFX11-NEXT: v_bfe_u32 v13, v0, 16, 1
-; GFX11-NEXT: v_and_or_b32 v15, v1, s0, 0x400000
-; GFX11-NEXT: v_add3_u32 v6, v11, v1, 0x7fff
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-NEXT: v_or_b32_e32 v15, 0x400000, v1
; GFX11-NEXT: v_bfe_u32 v12, v5, 16, 1
-; GFX11-NEXT: v_and_or_b32 v11, v5, s0, 0x400000
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX11-NEXT: v_add3_u32 v6, v11, v1, 0x7fff
+; GFX11-NEXT: v_or_b32_e32 v11, 0x400000, v5
; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX11-NEXT: v_add3_u32 v9, v12, v5, 0x7fff
+; GFX11-NEXT: v_add3_u32 v10, v12, v5, 0x7fff
; GFX11-NEXT: v_add3_u32 v12, v13, v0, 0x7fff
-; GFX11-NEXT: v_and_or_b32 v13, v0, s0, 0x400000
+; GFX11-NEXT: v_or_b32_e32 v13, 0x400000, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_cndmask_b32_e32 v5, v9, v11, vcc_lo
+; GFX11-NEXT: v_cndmask_b32_e32 v5, v10, v11, vcc_lo
; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
; GFX11-NEXT: v_cndmask_b32_e32 v0, v12, v13, vcc_lo
; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
@@ -18742,9 +18366,9 @@ define <8 x bfloat> @v_minnum_v8bf16(<8 x bfloat> %a, <8 x bfloat> %b) {
; GFX11-NEXT: v_cndmask_b32_e32 v1, v6, v15, vcc_lo
; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
; GFX11-NEXT: v_perm_b32 v1, v1, v4, 0x7060302
-; GFX11-NEXT: v_cndmask_b32_e32 v3, v10, v14, vcc_lo
+; GFX11-NEXT: v_cndmask_b32_e32 v3, v9, v14, vcc_lo
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_perm_b32 v3, v3, v7, 0x7060302
+; GFX11-NEXT: v_perm_b32 v3, v3, v8, 0x7060302
; GFX11-NEXT: s_setpc_b64 s[30:31]
%op = call <8 x bfloat> @llvm.minnum.v8bf16(<8 x bfloat> %a, <8 x bfloat> %b)
ret <8 x bfloat> %op
@@ -19033,16 +18657,14 @@ define <16 x bfloat> @v_minnum_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) {
; GFX8-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
; GFX8-NEXT: v_add_u32_e32 v17, vcc, s4, v17
-; GFX8-NEXT: v_and_b32_e32 v18, 0xff800000, v16
; GFX8-NEXT: v_min_f32_e32 v7, v7, v15
-; GFX8-NEXT: v_or_b32_e32 v18, 0x400000, v18
+; GFX8-NEXT: v_or_b32_e32 v18, 0x400000, v16
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v16, v16
; GFX8-NEXT: v_bfe_u32 v15, v7, 16, 1
; GFX8-NEXT: v_cndmask_b32_e32 v16, v17, v18, vcc
; GFX8-NEXT: v_add_u32_e32 v15, vcc, v15, v7
; GFX8-NEXT: v_add_u32_e32 v15, vcc, s4, v15
-; GFX8-NEXT: v_and_b32_e32 v17, 0xff800000, v7
-; GFX8-NEXT: v_or_b32_e32 v17, 0x400000, v17
+; GFX8-NEXT: v_or_b32_e32 v17, 0x400000, v7
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v7, v7
; GFX8-NEXT: v_cndmask_b32_e32 v7, v15, v17, vcc
; GFX8-NEXT: v_lshlrev_b32_e32 v15, 16, v14
@@ -19053,16 +18675,14 @@ define <16 x bfloat> @v_minnum_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) {
; GFX8-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
; GFX8-NEXT: v_add_u32_e32 v17, vcc, s4, v17
-; GFX8-NEXT: v_and_b32_e32 v18, 0xff800000, v15
; GFX8-NEXT: v_min_f32_e32 v6, v6, v14
-; GFX8-NEXT: v_or_b32_e32 v18, 0x400000, v18
+; GFX8-NEXT: v_or_b32_e32 v18, 0x400000, v15
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v15, v15
; GFX8-NEXT: v_bfe_u32 v14, v6, 16, 1
; GFX8-NEXT: v_cndmask_b32_e32 v15, v17, v18, vcc
; GFX8-NEXT: v_add_u32_e32 v14, vcc, v14, v6
; GFX8-NEXT: v_add_u32_e32 v14, vcc, s4, v14
-; GFX8-NEXT: v_and_b32_e32 v17, 0xff800000, v6
-; GFX8-NEXT: v_or_b32_e32 v17, 0x400000, v17
+; GFX8-NEXT: v_or_b32_e32 v17, 0x400000, v6
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
; GFX8-NEXT: v_cndmask_b32_e32 v6, v14, v17, vcc
; GFX8-NEXT: v_lshlrev_b32_e32 v14, 16, v13
@@ -19073,16 +18693,14 @@ define <16 x bfloat> @v_minnum_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) {
; GFX8-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
; GFX8-NEXT: v_add_u32_e32 v17, vcc, s4, v17
-; GFX8-NEXT: v_and_b32_e32 v18, 0xff800000, v14
; GFX8-NEXT: v_min_f32_e32 v5, v5, v13
-; GFX8-NEXT: v_or_b32_e32 v18, 0x400000, v18
+; GFX8-NEXT: v_or_b32_e32 v18, 0x400000, v14
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v14, v14
; GFX8-NEXT: v_bfe_u32 v13, v5, 16, 1
; GFX8-NEXT: v_cndmask_b32_e32 v14, v17, v18, vcc
; GFX8-NEXT: v_add_u32_e32 v13, vcc, v13, v5
; GFX8-NEXT: v_add_u32_e32 v13, vcc, s4, v13
-; GFX8-NEXT: v_and_b32_e32 v17, 0xff800000, v5
-; GFX8-NEXT: v_or_b32_e32 v17, 0x400000, v17
+; GFX8-NEXT: v_or_b32_e32 v17, 0x400000, v5
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
; GFX8-NEXT: v_cndmask_b32_e32 v5, v13, v17, vcc
; GFX8-NEXT: v_lshlrev_b32_e32 v13, 16, v12
@@ -19093,16 +18711,14 @@ define <16 x bfloat> @v_minnum_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) {
; GFX8-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
; GFX8-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
; GFX8-NEXT: v_add_u32_e32 v17, vcc, s4, v17
-; GFX8-NEXT: v_and_b32_e32 v18, 0xff800000, v13
; GFX8-NEXT: v_min_f32_e32 v4, v4, v12
-; GFX8-NEXT: v_or_b32_e32 v18, 0x400000, v18
+; GFX8-NEXT: v_or_b32_e32 v18, 0x400000, v13
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v13, v13
; GFX8-NEXT: v_bfe_u32 v12, v4, 16, 1
; GFX8-NEXT: v_cndmask_b32_e32 v13, v17, v18, vcc
; GFX8-NEXT: v_add_u32_e32 v12, vcc, v12, v4
; GFX8-NEXT: v_add_u32_e32 v12, vcc, s4, v12
-; GFX8-NEXT: v_and_b32_e32 v17, 0xff800000, v4
-; GFX8-NEXT: v_or_b32_e32 v17, 0x400000, v17
+; GFX8-NEXT: v_or_b32_e32 v17, 0x400000, v4
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
; GFX8-NEXT: v_cndmask_b32_e32 v4, v12, v17, vcc
; GFX8-NEXT: v_lshlrev_b32_e32 v12, 16, v11
@@ -19113,16 +18729,14 @@ define <16 x bfloat> @v_minnum_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) {
; GFX8-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
; GFX8-NEXT: v_add_u32_e32 v17, vcc, s4, v17
-; GFX8-NEXT: v_and_b32_e32 v18, 0xff800000, v12
; GFX8-NEXT: v_min_f32_e32 v3, v3, v11
-; GFX8-NEXT: v_or_b32_e32 v18, 0x400000, v18
+; GFX8-NEXT: v_or_b32_e32 v18, 0x400000, v12
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v12, v12
; GFX8-NEXT: v_bfe_u32 v11, v3, 16, 1
; GFX8-NEXT: v_cndmask_b32_e32 v12, v17, v18, vcc
; GFX8-NEXT: v_add_u32_e32 v11, vcc, v11, v3
; GFX8-NEXT: v_add_u32_e32 v11, vcc, s4, v11
-; GFX8-NEXT: v_and_b32_e32 v17, 0xff800000, v3
-; GFX8-NEXT: v_or_b32_e32 v17, 0x400000, v17
+; GFX8-NEXT: v_or_b32_e32 v17, 0x400000, v3
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
; GFX8-NEXT: v_cndmask_b32_e32 v3, v11, v17, vcc
; GFX8-NEXT: v_lshlrev_b32_e32 v11, 16, v10
@@ -19133,16 +18747,14 @@ define <16 x bfloat> @v_minnum_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) {
; GFX8-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX8-NEXT: v_add_u32_e32 v17, vcc, s4, v17
-; GFX8-NEXT: v_and_b32_e32 v18, 0xff800000, v11
; GFX8-NEXT: v_min_f32_e32 v2, v2, v10
-; GFX8-NEXT: v_or_b32_e32 v18, 0x400000, v18
+; GFX8-NEXT: v_or_b32_e32 v18, 0x400000, v11
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v11, v11
; GFX8-NEXT: v_bfe_u32 v10, v2, 16, 1
; GFX8-NEXT: v_cndmask_b32_e32 v11, v17, v18, vcc
; GFX8-NEXT: v_add_u32_e32 v10, vcc, v10, v2
; GFX8-NEXT: v_add_u32_e32 v10, vcc, s4, v10
-; GFX8-NEXT: v_and_b32_e32 v17, 0xff800000, v2
-; GFX8-NEXT: v_or_b32_e32 v17, 0x400000, v17
+; GFX8-NEXT: v_or_b32_e32 v17, 0x400000, v2
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
; GFX8-NEXT: v_cndmask_b32_e32 v2, v10, v17, vcc
; GFX8-NEXT: v_lshlrev_b32_e32 v10, 16, v9
@@ -19153,16 +18765,14 @@ define <16 x bfloat> @v_minnum_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) {
; GFX8-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX8-NEXT: v_add_u32_e32 v17, vcc, s4, v17
-; GFX8-NEXT: v_and_b32_e32 v18, 0xff800000, v10
; GFX8-NEXT: v_min_f32_e32 v1, v1, v9
-; GFX8-NEXT: v_or_b32_e32 v18, 0x400000, v18
+; GFX8-NEXT: v_or_b32_e32 v18, 0x400000, v10
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v10, v10
; GFX8-NEXT: v_bfe_u32 v9, v1, 16, 1
; GFX8-NEXT: v_cndmask_b32_e32 v10, v17, v18, vcc
; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v1
; GFX8-NEXT: v_add_u32_e32 v9, vcc, s4, v9
-; GFX8-NEXT: v_and_b32_e32 v17, 0xff800000, v1
-; GFX8-NEXT: v_or_b32_e32 v17, 0x400000, v17
+; GFX8-NEXT: v_or_b32_e32 v17, 0x400000, v1
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
; GFX8-NEXT: v_cndmask_b32_e32 v1, v9, v17, vcc
; GFX8-NEXT: v_lshlrev_b32_e32 v9, 16, v8
@@ -19173,16 +18783,14 @@ define <16 x bfloat> @v_minnum_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) {
; GFX8-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX8-NEXT: v_add_u32_e32 v17, vcc, s4, v17
-; GFX8-NEXT: v_and_b32_e32 v18, 0xff800000, v9
; GFX8-NEXT: v_min_f32_e32 v0, v0, v8
-; GFX8-NEXT: v_or_b32_e32 v18, 0x400000, v18
+; GFX8-NEXT: v_or_b32_e32 v18, 0x400000, v9
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v9, v9
; GFX8-NEXT: v_bfe_u32 v8, v0, 16, 1
; GFX8-NEXT: v_cndmask_b32_e32 v9, v17, v18, vcc
; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v0
; GFX8-NEXT: v_add_u32_e32 v8, vcc, s4, v8
-; GFX8-NEXT: v_and_b32_e32 v17, 0xff800000, v0
-; GFX8-NEXT: v_or_b32_e32 v17, 0x400000, v17
+; GFX8-NEXT: v_or_b32_e32 v17, 0x400000, v0
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
; GFX8-NEXT: v_cndmask_b32_e32 v0, v8, v17, vcc
; GFX8-NEXT: v_lshrrev_b32_e32 v7, 16, v7
@@ -19209,146 +18817,130 @@ define <16 x bfloat> @v_minnum_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) {
; GFX9-NEXT: v_lshlrev_b32_e32 v16, 16, v15
; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v7
; GFX9-NEXT: v_min_f32_e32 v16, v17, v16
-; GFX9-NEXT: v_bfe_u32 v17, v16, 16, 1
-; GFX9-NEXT: s_movk_i32 s4, 0x7fff
-; GFX9-NEXT: v_and_b32_e32 v18, 0xff800000, v16
; GFX9-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
; GFX9-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
+; GFX9-NEXT: v_bfe_u32 v17, v16, 16, 1
+; GFX9-NEXT: s_movk_i32 s4, 0x7fff
+; GFX9-NEXT: v_min_f32_e32 v7, v7, v15
; GFX9-NEXT: v_add3_u32 v17, v17, v16, s4
-; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v18
+; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v16
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v16, v16
-; GFX9-NEXT: v_min_f32_e32 v7, v7, v15
-; GFX9-NEXT: v_cndmask_b32_e32 v16, v17, v18, vcc
; GFX9-NEXT: v_bfe_u32 v15, v7, 16, 1
-; GFX9-NEXT: v_and_b32_e32 v17, 0xff800000, v7
+; GFX9-NEXT: v_cndmask_b32_e32 v16, v17, v18, vcc
; GFX9-NEXT: v_add3_u32 v15, v15, v7, s4
-; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v17
+; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v7
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v7, v7
; GFX9-NEXT: v_cndmask_b32_e32 v7, v15, v17, vcc
; GFX9-NEXT: v_lshlrev_b32_e32 v15, 16, v14
; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v6
; GFX9-NEXT: v_min_f32_e32 v15, v17, v15
-; GFX9-NEXT: v_bfe_u32 v17, v15, 16, 1
-; GFX9-NEXT: v_and_b32_e32 v18, 0xff800000, v15
; GFX9-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
; GFX9-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
+; GFX9-NEXT: v_bfe_u32 v17, v15, 16, 1
+; GFX9-NEXT: v_min_f32_e32 v6, v6, v14
; GFX9-NEXT: v_add3_u32 v17, v17, v15, s4
-; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v18
+; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v15
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v15, v15
-; GFX9-NEXT: v_min_f32_e32 v6, v6, v14
-; GFX9-NEXT: v_cndmask_b32_e32 v15, v17, v18, vcc
; GFX9-NEXT: v_bfe_u32 v14, v6, 16, 1
-; GFX9-NEXT: v_and_b32_e32 v17, 0xff800000, v6
+; GFX9-NEXT: v_cndmask_b32_e32 v15, v17, v18, vcc
; GFX9-NEXT: v_add3_u32 v14, v14, v6, s4
-; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v17
+; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v6
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
; GFX9-NEXT: v_cndmask_b32_e32 v6, v14, v17, vcc
; GFX9-NEXT: v_lshlrev_b32_e32 v14, 16, v13
; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v5
; GFX9-NEXT: v_min_f32_e32 v14, v17, v14
-; GFX9-NEXT: v_bfe_u32 v17, v14, 16, 1
-; GFX9-NEXT: v_and_b32_e32 v18, 0xff800000, v14
; GFX9-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
; GFX9-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
+; GFX9-NEXT: v_bfe_u32 v17, v14, 16, 1
+; GFX9-NEXT: v_min_f32_e32 v5, v5, v13
; GFX9-NEXT: v_add3_u32 v17, v17, v14, s4
-; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v18
+; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v14
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v14, v14
-; GFX9-NEXT: v_min_f32_e32 v5, v5, v13
-; GFX9-NEXT: v_cndmask_b32_e32 v14, v17, v18, vcc
; GFX9-NEXT: v_bfe_u32 v13, v5, 16, 1
-; GFX9-NEXT: v_and_b32_e32 v17, 0xff800000, v5
+; GFX9-NEXT: v_cndmask_b32_e32 v14, v17, v18, vcc
; GFX9-NEXT: v_add3_u32 v13, v13, v5, s4
-; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v17
+; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v5
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
; GFX9-NEXT: v_cndmask_b32_e32 v5, v13, v17, vcc
; GFX9-NEXT: v_lshlrev_b32_e32 v13, 16, v12
; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v4
; GFX9-NEXT: v_min_f32_e32 v13, v17, v13
-; GFX9-NEXT: v_bfe_u32 v17, v13, 16, 1
-; GFX9-NEXT: v_and_b32_e32 v18, 0xff800000, v13
; GFX9-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
; GFX9-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
+; GFX9-NEXT: v_bfe_u32 v17, v13, 16, 1
+; GFX9-NEXT: v_min_f32_e32 v4, v4, v12
; GFX9-NEXT: v_add3_u32 v17, v17, v13, s4
-; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v18
+; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v13
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v13, v13
-; GFX9-NEXT: v_min_f32_e32 v4, v4, v12
-; GFX9-NEXT: v_cndmask_b32_e32 v13, v17, v18, vcc
; GFX9-NEXT: v_bfe_u32 v12, v4, 16, 1
-; GFX9-NEXT: v_and_b32_e32 v17, 0xff800000, v4
+; GFX9-NEXT: v_cndmask_b32_e32 v13, v17, v18, vcc
; GFX9-NEXT: v_add3_u32 v12, v12, v4, s4
-; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v17
+; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v4
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
; GFX9-NEXT: v_cndmask_b32_e32 v4, v12, v17, vcc
; GFX9-NEXT: v_lshlrev_b32_e32 v12, 16, v11
; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v3
; GFX9-NEXT: v_min_f32_e32 v12, v17, v12
-; GFX9-NEXT: v_bfe_u32 v17, v12, 16, 1
-; GFX9-NEXT: v_and_b32_e32 v18, 0xff800000, v12
; GFX9-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; GFX9-NEXT: v_bfe_u32 v17, v12, 16, 1
+; GFX9-NEXT: v_min_f32_e32 v3, v3, v11
; GFX9-NEXT: v_add3_u32 v17, v17, v12, s4
-; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v18
+; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v12
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v12, v12
-; GFX9-NEXT: v_min_f32_e32 v3, v3, v11
-; GFX9-NEXT: v_cndmask_b32_e32 v12, v17, v18, vcc
; GFX9-NEXT: v_bfe_u32 v11, v3, 16, 1
-; GFX9-NEXT: v_and_b32_e32 v17, 0xff800000, v3
+; GFX9-NEXT: v_cndmask_b32_e32 v12, v17, v18, vcc
; GFX9-NEXT: v_add3_u32 v11, v11, v3, s4
-; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v17
+; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v3
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
; GFX9-NEXT: v_cndmask_b32_e32 v3, v11, v17, vcc
; GFX9-NEXT: v_lshlrev_b32_e32 v11, 16, v10
; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v2
; GFX9-NEXT: v_min_f32_e32 v11, v17, v11
-; GFX9-NEXT: v_bfe_u32 v17, v11, 16, 1
-; GFX9-NEXT: v_and_b32_e32 v18, 0xff800000, v11
; GFX9-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX9-NEXT: v_bfe_u32 v17, v11, 16, 1
+; GFX9-NEXT: v_min_f32_e32 v2, v2, v10
; GFX9-NEXT: v_add3_u32 v17, v17, v11, s4
-; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v18
+; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v11
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v11, v11
-; GFX9-NEXT: v_min_f32_e32 v2, v2, v10
-; GFX9-NEXT: v_cndmask_b32_e32 v11, v17, v18, vcc
; GFX9-NEXT: v_bfe_u32 v10, v2, 16, 1
-; GFX9-NEXT: v_and_b32_e32 v17, 0xff800000, v2
+; GFX9-NEXT: v_cndmask_b32_e32 v11, v17, v18, vcc
; GFX9-NEXT: v_add3_u32 v10, v10, v2, s4
-; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v17
+; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v2
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
; GFX9-NEXT: v_cndmask_b32_e32 v2, v10, v17, vcc
; GFX9-NEXT: v_lshlrev_b32_e32 v10, 16, v9
; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v1
; GFX9-NEXT: v_min_f32_e32 v10, v17, v10
-; GFX9-NEXT: v_bfe_u32 v17, v10, 16, 1
-; GFX9-NEXT: v_and_b32_e32 v18, 0xff800000, v10
; GFX9-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GFX9-NEXT: v_bfe_u32 v17, v10, 16, 1
+; GFX9-NEXT: v_min_f32_e32 v1, v1, v9
; GFX9-NEXT: v_add3_u32 v17, v17, v10, s4
-; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v18
+; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v10
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v10, v10
-; GFX9-NEXT: v_min_f32_e32 v1, v1, v9
-; GFX9-NEXT: v_cndmask_b32_e32 v10, v17, v18, vcc
; GFX9-NEXT: v_bfe_u32 v9, v1, 16, 1
-; GFX9-NEXT: v_and_b32_e32 v17, 0xff800000, v1
+; GFX9-NEXT: v_cndmask_b32_e32 v10, v17, v18, vcc
; GFX9-NEXT: v_add3_u32 v9, v9, v1, s4
-; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v17
+; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v1
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
; GFX9-NEXT: v_cndmask_b32_e32 v1, v9, v17, vcc
; GFX9-NEXT: v_lshlrev_b32_e32 v9, 16, v8
; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v0
; GFX9-NEXT: v_min_f32_e32 v9, v17, v9
-; GFX9-NEXT: v_bfe_u32 v17, v9, 16, 1
-; GFX9-NEXT: v_and_b32_e32 v18, 0xff800000, v9
; GFX9-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX9-NEXT: v_bfe_u32 v17, v9, 16, 1
+; GFX9-NEXT: v_min_f32_e32 v0, v0, v8
; GFX9-NEXT: v_add3_u32 v17, v17, v9, s4
-; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v18
+; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v9
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v9, v9
-; GFX9-NEXT: v_min_f32_e32 v0, v0, v8
-; GFX9-NEXT: v_cndmask_b32_e32 v9, v17, v18, vcc
; GFX9-NEXT: v_bfe_u32 v8, v0, 16, 1
-; GFX9-NEXT: v_and_b32_e32 v17, 0xff800000, v0
+; GFX9-NEXT: v_cndmask_b32_e32 v9, v17, v18, vcc
; GFX9-NEXT: v_add3_u32 v8, v8, v0, s4
-; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v17
+; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v0
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
; GFX9-NEXT: v_cndmask_b32_e32 v0, v8, v17, vcc
; GFX9-NEXT: s_mov_b32 s4, 0x7060302
@@ -19369,27 +18961,26 @@ define <16 x bfloat> @v_minnum_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) {
; GFX10-NEXT: v_lshlrev_b32_e32 v17, 16, v7
; GFX10-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
-; GFX10-NEXT: s_mov_b32 s4, 0xff800000
; GFX10-NEXT: v_lshlrev_b32_e32 v18, 16, v6
+; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
; GFX10-NEXT: v_min_f32_e32 v16, v17, v16
; GFX10-NEXT: v_lshlrev_b32_e32 v17, 16, v14
; GFX10-NEXT: v_min_f32_e32 v7, v7, v15
; GFX10-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
-; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
; GFX10-NEXT: v_bfe_u32 v15, v16, 16, 1
-; GFX10-NEXT: v_and_or_b32 v20, v16, s4, 0x400000
+; GFX10-NEXT: v_or_b32_e32 v20, 0x400000, v16
; GFX10-NEXT: v_bfe_u32 v19, v7, 16, 1
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v16, v16
; GFX10-NEXT: v_min_f32_e32 v17, v18, v17
; GFX10-NEXT: v_add3_u32 v15, v15, v16, 0x7fff
; GFX10-NEXT: v_min_f32_e32 v6, v6, v14
; GFX10-NEXT: v_add3_u32 v18, v19, v7, 0x7fff
-; GFX10-NEXT: v_and_or_b32 v19, v7, s4, 0x400000
+; GFX10-NEXT: v_or_b32_e32 v19, 0x400000, v7
; GFX10-NEXT: v_bfe_u32 v21, v17, 16, 1
; GFX10-NEXT: v_cndmask_b32_e32 v15, v15, v20, vcc_lo
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
; GFX10-NEXT: v_lshlrev_b32_e32 v20, 16, v5
-; GFX10-NEXT: v_and_or_b32 v16, v17, s4, 0x400000
+; GFX10-NEXT: v_or_b32_e32 v16, 0x400000, v17
; GFX10-NEXT: v_add3_u32 v14, v21, v17, 0x7fff
; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
; GFX10-NEXT: v_cndmask_b32_e32 v7, v18, v19, vcc_lo
@@ -19403,7 +18994,7 @@ define <16 x bfloat> @v_minnum_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) {
; GFX10-NEXT: v_min_f32_e32 v5, v5, v13
; GFX10-NEXT: v_cndmask_b32_e32 v14, v14, v16, vcc_lo
; GFX10-NEXT: v_add3_u32 v16, v18, v6, 0x7fff
-; GFX10-NEXT: v_and_or_b32 v13, v6, s4, 0x400000
+; GFX10-NEXT: v_or_b32_e32 v13, 0x400000, v6
; GFX10-NEXT: v_lshlrev_b32_e32 v18, 16, v12
; GFX10-NEXT: v_bfe_u32 v20, v17, 16, 1
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
@@ -19413,10 +19004,10 @@ define <16 x bfloat> @v_minnum_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) {
; GFX10-NEXT: v_cndmask_b32_e32 v6, v16, v13, vcc_lo
; GFX10-NEXT: v_min_f32_e32 v13, v19, v18
; GFX10-NEXT: v_add3_u32 v16, v20, v17, 0x7fff
-; GFX10-NEXT: v_and_or_b32 v18, v17, s4, 0x400000
+; GFX10-NEXT: v_or_b32_e32 v18, 0x400000, v17
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17
; GFX10-NEXT: v_add3_u32 v19, v21, v5, 0x7fff
-; GFX10-NEXT: v_and_or_b32 v20, v5, s4, 0x400000
+; GFX10-NEXT: v_or_b32_e32 v20, 0x400000, v5
; GFX10-NEXT: v_bfe_u32 v21, v13, 16, 1
; GFX10-NEXT: v_min_f32_e32 v4, v4, v12
; GFX10-NEXT: v_cndmask_b32_e32 v16, v16, v18, vcc_lo
@@ -19426,14 +19017,14 @@ define <16 x bfloat> @v_minnum_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) {
; GFX10-NEXT: v_add3_u32 v17, v21, v13, 0x7fff
; GFX10-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
; GFX10-NEXT: v_cndmask_b32_e32 v5, v19, v20, vcc_lo
-; GFX10-NEXT: v_and_or_b32 v19, v13, s4, 0x400000
+; GFX10-NEXT: v_or_b32_e32 v19, 0x400000, v13
; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
; GFX10-NEXT: v_min_f32_e32 v12, v18, v12
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v13, v13
; GFX10-NEXT: v_bfe_u32 v20, v4, 16, 1
; GFX10-NEXT: v_lshlrev_b32_e32 v18, 16, v10
; GFX10-NEXT: v_min_f32_e32 v3, v3, v11
-; GFX10-NEXT: v_and_or_b32 v22, v12, s4, 0x400000
+; GFX10-NEXT: v_or_b32_e32 v22, 0x400000, v12
; GFX10-NEXT: v_cndmask_b32_e32 v13, v17, v19, vcc_lo
; GFX10-NEXT: v_bfe_u32 v17, v12, 16, 1
; GFX10-NEXT: v_lshlrev_b32_e32 v19, 16, v2
@@ -19445,12 +19036,12 @@ define <16 x bfloat> @v_minnum_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) {
; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v12, v12
; GFX10-NEXT: v_add3_u32 v19, v20, v3, 0x7fff
-; GFX10-NEXT: v_and_or_b32 v20, v3, s4, 0x400000
+; GFX10-NEXT: v_or_b32_e32 v20, 0x400000, v3
; GFX10-NEXT: v_bfe_u32 v23, v18, 16, 1
; GFX10-NEXT: v_min_f32_e32 v2, v2, v10
; GFX10-NEXT: v_cndmask_b32_e32 v12, v17, v22, vcc_lo
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
-; GFX10-NEXT: v_and_or_b32 v17, v18, s4, 0x400000
+; GFX10-NEXT: v_or_b32_e32 v17, 0x400000, v18
; GFX10-NEXT: v_add3_u32 v10, v23, v18, 0x7fff
; GFX10-NEXT: v_lshlrev_b32_e32 v22, 16, v1
; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
@@ -19459,8 +19050,8 @@ define <16 x bfloat> @v_minnum_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) {
; GFX10-NEXT: v_lshlrev_b32_e32 v20, 16, v9
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v18, v18
; GFX10-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
-; GFX10-NEXT: v_and_or_b32 v18, v2, s4, 0x400000
-; GFX10-NEXT: v_and_or_b32 v21, v4, s4, 0x400000
+; GFX10-NEXT: v_or_b32_e32 v18, 0x400000, v2
+; GFX10-NEXT: v_or_b32_e32 v21, 0x400000, v4
; GFX10-NEXT: v_perm_b32 v3, v3, v12, 0x7060302
; GFX10-NEXT: v_cndmask_b32_e32 v10, v10, v17, vcc_lo
; GFX10-NEXT: v_add3_u32 v17, v19, v2, 0x7fff
@@ -19472,17 +19063,17 @@ define <16 x bfloat> @v_minnum_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) {
; GFX10-NEXT: v_bfe_u32 v23, v19, 16, 1
; GFX10-NEXT: v_min_f32_e32 v1, v1, v9
; GFX10-NEXT: v_min_f32_e32 v9, v22, v20
-; GFX10-NEXT: v_and_or_b32 v22, v19, s4, 0x400000
+; GFX10-NEXT: v_or_b32_e32 v22, 0x400000, v19
; GFX10-NEXT: v_min_f32_e32 v0, v0, v8
; GFX10-NEXT: v_add3_u32 v20, v23, v19, 0x7fff
; GFX10-NEXT: v_bfe_u32 v8, v1, 16, 1
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v19, v19
; GFX10-NEXT: v_bfe_u32 v23, v9, 16, 1
-; GFX10-NEXT: v_and_or_b32 v24, v9, s4, 0x400000
-; GFX10-NEXT: v_and_or_b32 v25, v0, s4, 0x400000
+; GFX10-NEXT: v_or_b32_e32 v24, 0x400000, v9
+; GFX10-NEXT: v_or_b32_e32 v25, 0x400000, v0
; GFX10-NEXT: v_add3_u32 v8, v8, v1, 0x7fff
; GFX10-NEXT: v_cndmask_b32_e32 v19, v20, v22, vcc_lo
-; GFX10-NEXT: v_and_or_b32 v22, v1, s4, 0x400000
+; GFX10-NEXT: v_or_b32_e32 v22, 0x400000, v1
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
; GFX10-NEXT: v_bfe_u32 v20, v0, 16, 1
; GFX10-NEXT: v_add3_u32 v23, v23, v9, 0x7fff
@@ -19511,12 +19102,11 @@ define <16 x bfloat> @v_minnum_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) {
; GFX11-NEXT: v_lshlrev_b32_e32 v16, 16, v15
; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
; GFX11-NEXT: v_lshlrev_b32_e32 v17, 16, v7
-; GFX11-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
-; GFX11-NEXT: s_mov_b32 s0, 0xff800000
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_dual_min_f32 v16, v17, v16 :: v_dual_lshlrev_b32 v17, 16, v14
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-NEXT: v_dual_min_f32 v16, v17, v16 :: v_dual_and_b32 v15, 0xffff0000, v15
+; GFX11-NEXT: v_lshlrev_b32_e32 v17, 16, v14
; GFX11-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
-; GFX11-NEXT: v_and_or_b32 v20, v16, s0, 0x400000
+; GFX11-NEXT: v_or_b32_e32 v20, 0x400000, v16
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX11-NEXT: v_min_f32_e32 v17, v18, v17
; GFX11-NEXT: v_min_f32_e32 v6, v6, v14
@@ -19529,13 +19119,13 @@ define <16 x bfloat> @v_minnum_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) {
; GFX11-NEXT: v_min_f32_e32 v7, v7, v15
; GFX11-NEXT: v_bfe_u32 v15, v16, 16, 1
; GFX11-NEXT: v_add3_u32 v15, v15, v16, 0x7fff
-; GFX11-NEXT: v_and_or_b32 v16, v17, s0, 0x400000
+; GFX11-NEXT: v_or_b32_e32 v16, 0x400000, v17
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_dual_cndmask_b32 v15, v15, v20 :: v_dual_lshlrev_b32 v20, 16, v5
; GFX11-NEXT: v_bfe_u32 v19, v7, 16, 1
; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
; GFX11-NEXT: v_add3_u32 v18, v19, v7, 0x7fff
-; GFX11-NEXT: v_and_or_b32 v19, v7, s0, 0x400000
+; GFX11-NEXT: v_or_b32_e32 v19, 0x400000, v7
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_4)
; GFX11-NEXT: v_cndmask_b32_e32 v7, v18, v19, vcc_lo
; GFX11-NEXT: v_bfe_u32 v18, v6, 16, 1
@@ -19557,32 +19147,32 @@ define <16 x bfloat> @v_minnum_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) {
; GFX11-NEXT: v_lshlrev_b32_e32 v12, 16, v11
; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
; GFX11-NEXT: v_min_f32_e32 v5, v5, v13
-; GFX11-NEXT: v_and_or_b32 v13, v6, s0, 0x400000
+; GFX11-NEXT: v_or_b32_e32 v13, 0x400000, v6
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_4)
; GFX11-NEXT: v_dual_cndmask_b32 v6, v16, v13 :: v_dual_min_f32 v13, v19, v18
; GFX11-NEXT: v_add3_u32 v16, v20, v17, 0x7fff
-; GFX11-NEXT: v_and_or_b32 v18, v17, s0, 0x400000
+; GFX11-NEXT: v_or_b32_e32 v18, 0x400000, v17
; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17
; GFX11-NEXT: v_perm_b32 v6, v6, v14, 0x7060302
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_4)
; GFX11-NEXT: v_cndmask_b32_e32 v16, v16, v18, vcc_lo
; GFX11-NEXT: v_lshlrev_b32_e32 v18, 16, v3
; GFX11-NEXT: v_bfe_u32 v21, v5, 16, 1
-; GFX11-NEXT: v_and_or_b32 v20, v5, s0, 0x400000
+; GFX11-NEXT: v_or_b32_e32 v20, 0x400000, v5
; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
; GFX11-NEXT: v_min_f32_e32 v12, v18, v12
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4)
; GFX11-NEXT: v_add3_u32 v19, v21, v5, 0x7fff
; GFX11-NEXT: v_bfe_u32 v21, v13, 16, 1
; GFX11-NEXT: v_lshlrev_b32_e32 v18, 16, v10
-; GFX11-NEXT: v_and_or_b32 v22, v12, s0, 0x400000
+; GFX11-NEXT: v_or_b32_e32 v22, 0x400000, v12
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX11-NEXT: v_cndmask_b32_e32 v5, v19, v20, vcc_lo
; GFX11-NEXT: v_add3_u32 v17, v21, v13, 0x7fff
-; GFX11-NEXT: v_and_or_b32 v19, v13, s0, 0x400000
+; GFX11-NEXT: v_or_b32_e32 v19, 0x400000, v13
; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v13, v13
; GFX11-NEXT: v_bfe_u32 v20, v4, 16, 1
-; GFX11-NEXT: v_and_or_b32 v21, v4, s0, 0x400000
+; GFX11-NEXT: v_or_b32_e32 v21, 0x400000, v4
; GFX11-NEXT: v_perm_b32 v5, v5, v16, 0x7060302
; GFX11-NEXT: v_cndmask_b32_e32 v13, v17, v19, vcc_lo
; GFX11-NEXT: v_bfe_u32 v17, v12, 16, 1
@@ -19598,7 +19188,7 @@ define <16 x bfloat> @v_minnum_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) {
; GFX11-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
; GFX11-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
; GFX11-NEXT: v_bfe_u32 v23, v18, 16, 1
-; GFX11-NEXT: v_and_or_b32 v17, v18, s0, 0x400000
+; GFX11-NEXT: v_or_b32_e32 v17, 0x400000, v18
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX11-NEXT: v_dual_min_f32 v2, v2, v10 :: v_dual_and_b32 v1, 0xffff0000, v1
; GFX11-NEXT: v_min_f32_e32 v3, v3, v11
@@ -19608,13 +19198,13 @@ define <16 x bfloat> @v_minnum_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) {
; GFX11-NEXT: v_bfe_u32 v20, v3, 16, 1
; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
; GFX11-NEXT: v_add3_u32 v19, v20, v3, 0x7fff
-; GFX11-NEXT: v_and_or_b32 v20, v3, s0, 0x400000
+; GFX11-NEXT: v_or_b32_e32 v20, 0x400000, v3
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_cndmask_b32_e32 v3, v19, v20, vcc_lo
; GFX11-NEXT: v_bfe_u32 v19, v2, 16, 1
; GFX11-NEXT: v_lshlrev_b32_e32 v20, 16, v9
; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v18, v18
-; GFX11-NEXT: v_and_or_b32 v18, v2, s0, 0x400000
+; GFX11-NEXT: v_or_b32_e32 v18, 0x400000, v2
; GFX11-NEXT: v_perm_b32 v3, v3, v12, 0x7060302
; GFX11-NEXT: v_cndmask_b32_e32 v10, v10, v17, vcc_lo
; GFX11-NEXT: v_add3_u32 v17, v19, v2, 0x7fff
@@ -19631,13 +19221,13 @@ define <16 x bfloat> @v_minnum_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) {
; GFX11-NEXT: v_min_f32_e32 v9, v22, v20
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
; GFX11-NEXT: v_add3_u32 v20, v23, v19, 0x7fff
-; GFX11-NEXT: v_and_or_b32 v22, v19, s0, 0x400000
-; GFX11-NEXT: v_and_or_b32 v25, v0, s0, 0x400000
+; GFX11-NEXT: v_or_b32_e32 v22, 0x400000, v19
+; GFX11-NEXT: v_or_b32_e32 v25, 0x400000, v0
; GFX11-NEXT: v_bfe_u32 v8, v1, 16, 1
; GFX11-NEXT: v_bfe_u32 v23, v9, 16, 1
-; GFX11-NEXT: v_and_or_b32 v24, v9, s0, 0x400000
+; GFX11-NEXT: v_or_b32_e32 v24, 0x400000, v9
; GFX11-NEXT: v_cndmask_b32_e32 v19, v20, v22, vcc_lo
-; GFX11-NEXT: v_and_or_b32 v22, v1, s0, 0x400000
+; GFX11-NEXT: v_or_b32_e32 v22, 0x400000, v1
; GFX11-NEXT: v_add3_u32 v8, v8, v1, 0x7fff
; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
; GFX11-NEXT: v_bfe_u32 v20, v0, 16, 1
@@ -20332,16 +19922,14 @@ define <32 x bfloat> @v_minnum_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) {
; GFX8-NEXT: v_and_b32_e32 v30, 0xffff0000, v30
; GFX8-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
; GFX8-NEXT: v_add_u32_e32 v32, vcc, s4, v32
-; GFX8-NEXT: v_and_b32_e32 v33, 0xff800000, v31
; GFX8-NEXT: v_min_f32_e32 v14, v14, v30
-; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v33
+; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v31
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v31, v31
; GFX8-NEXT: v_bfe_u32 v30, v14, 16, 1
; GFX8-NEXT: v_cndmask_b32_e32 v31, v32, v33, vcc
; GFX8-NEXT: v_add_u32_e32 v30, vcc, v30, v14
; GFX8-NEXT: v_add_u32_e32 v30, vcc, s4, v30
-; GFX8-NEXT: v_and_b32_e32 v32, 0xff800000, v14
-; GFX8-NEXT: v_or_b32_e32 v32, 0x400000, v32
+; GFX8-NEXT: v_or_b32_e32 v32, 0x400000, v14
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v14, v14
; GFX8-NEXT: v_cndmask_b32_e32 v14, v30, v32, vcc
; GFX8-NEXT: v_lshlrev_b32_e32 v30, 16, v29
@@ -20363,29 +19951,25 @@ define <32 x bfloat> @v_minnum_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) {
; GFX8-NEXT: v_min_f32_e32 v30, v15, v30
; GFX8-NEXT: v_bfe_u32 v15, v33, 16, 1
; GFX8-NEXT: v_add_u32_e32 v15, vcc, v15, v33
-; GFX8-NEXT: v_and_b32_e32 v34, 0xff800000, v33
; GFX8-NEXT: v_add_u32_e32 v15, vcc, s4, v15
-; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v34
+; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v33
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v33, v33
; GFX8-NEXT: v_bfe_u32 v33, v30, 16, 1
; GFX8-NEXT: v_cndmask_b32_e32 v15, v15, v34, vcc
; GFX8-NEXT: v_add_u32_e32 v33, vcc, v33, v30
-; GFX8-NEXT: v_and_b32_e32 v34, 0xff800000, v30
; GFX8-NEXT: v_add_u32_e32 v33, vcc, s4, v33
-; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v34
+; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v30
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v30, v30
; GFX8-NEXT: v_cndmask_b32_e32 v30, v33, v34, vcc
; GFX8-NEXT: v_bfe_u32 v33, v32, 16, 1
; GFX8-NEXT: v_add_u32_e32 v33, vcc, v33, v32
; GFX8-NEXT: v_add_u32_e32 v33, vcc, s4, v33
-; GFX8-NEXT: v_and_b32_e32 v34, 0xff800000, v32
-; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v34
+; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v32
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v32, v32
; GFX8-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc
; GFX8-NEXT: v_add_u32_e32 v29, vcc, v29, v13
; GFX8-NEXT: v_add_u32_e32 v29, vcc, s4, v29
-; GFX8-NEXT: v_and_b32_e32 v33, 0xff800000, v13
-; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v33
+; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v13
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v13, v13
; GFX8-NEXT: v_cndmask_b32_e32 v13, v29, v33, vcc
; GFX8-NEXT: v_lshlrev_b32_e32 v29, 16, v28
@@ -20396,16 +19980,14 @@ define <32 x bfloat> @v_minnum_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) {
; GFX8-NEXT: v_and_b32_e32 v28, 0xffff0000, v28
; GFX8-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
; GFX8-NEXT: v_add_u32_e32 v33, vcc, s4, v33
-; GFX8-NEXT: v_and_b32_e32 v34, 0xff800000, v29
; GFX8-NEXT: v_min_f32_e32 v12, v12, v28
-; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v34
+; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v29
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v29, v29
; GFX8-NEXT: v_bfe_u32 v28, v12, 16, 1
; GFX8-NEXT: v_cndmask_b32_e32 v29, v33, v34, vcc
; GFX8-NEXT: v_add_u32_e32 v28, vcc, v28, v12
; GFX8-NEXT: v_add_u32_e32 v28, vcc, s4, v28
-; GFX8-NEXT: v_and_b32_e32 v33, 0xff800000, v12
-; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v33
+; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v12
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v12, v12
; GFX8-NEXT: v_cndmask_b32_e32 v12, v28, v33, vcc
; GFX8-NEXT: v_lshlrev_b32_e32 v28, 16, v27
@@ -20416,16 +19998,14 @@ define <32 x bfloat> @v_minnum_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) {
; GFX8-NEXT: v_and_b32_e32 v27, 0xffff0000, v27
; GFX8-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
; GFX8-NEXT: v_add_u32_e32 v33, vcc, s4, v33
-; GFX8-NEXT: v_and_b32_e32 v34, 0xff800000, v28
; GFX8-NEXT: v_min_f32_e32 v11, v11, v27
-; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v34
+; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v28
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v28, v28
; GFX8-NEXT: v_bfe_u32 v27, v11, 16, 1
; GFX8-NEXT: v_cndmask_b32_e32 v28, v33, v34, vcc
; GFX8-NEXT: v_add_u32_e32 v27, vcc, v27, v11
; GFX8-NEXT: v_add_u32_e32 v27, vcc, s4, v27
-; GFX8-NEXT: v_and_b32_e32 v33, 0xff800000, v11
-; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v33
+; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v11
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v11, v11
; GFX8-NEXT: v_cndmask_b32_e32 v11, v27, v33, vcc
; GFX8-NEXT: v_lshlrev_b32_e32 v27, 16, v26
@@ -20436,16 +20016,14 @@ define <32 x bfloat> @v_minnum_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) {
; GFX8-NEXT: v_and_b32_e32 v26, 0xffff0000, v26
; GFX8-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
; GFX8-NEXT: v_add_u32_e32 v33, vcc, s4, v33
-; GFX8-NEXT: v_and_b32_e32 v34, 0xff800000, v27
; GFX8-NEXT: v_min_f32_e32 v10, v10, v26
-; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v34
+; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v27
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v27, v27
; GFX8-NEXT: v_bfe_u32 v26, v10, 16, 1
; GFX8-NEXT: v_cndmask_b32_e32 v27, v33, v34, vcc
; GFX8-NEXT: v_add_u32_e32 v26, vcc, v26, v10
; GFX8-NEXT: v_add_u32_e32 v26, vcc, s4, v26
-; GFX8-NEXT: v_and_b32_e32 v33, 0xff800000, v10
-; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v33
+; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v10
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v10, v10
; GFX8-NEXT: v_cndmask_b32_e32 v10, v26, v33, vcc
; GFX8-NEXT: v_lshlrev_b32_e32 v26, 16, v25
@@ -20456,16 +20034,14 @@ define <32 x bfloat> @v_minnum_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) {
; GFX8-NEXT: v_and_b32_e32 v25, 0xffff0000, v25
; GFX8-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
; GFX8-NEXT: v_add_u32_e32 v33, vcc, s4, v33
-; GFX8-NEXT: v_and_b32_e32 v34, 0xff800000, v26
; GFX8-NEXT: v_min_f32_e32 v9, v9, v25
-; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v34
+; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v26
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v26, v26
; GFX8-NEXT: v_bfe_u32 v25, v9, 16, 1
; GFX8-NEXT: v_cndmask_b32_e32 v26, v33, v34, vcc
; GFX8-NEXT: v_add_u32_e32 v25, vcc, v25, v9
; GFX8-NEXT: v_add_u32_e32 v25, vcc, s4, v25
-; GFX8-NEXT: v_and_b32_e32 v33, 0xff800000, v9
-; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v33
+; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v9
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v9, v9
; GFX8-NEXT: v_cndmask_b32_e32 v9, v25, v33, vcc
; GFX8-NEXT: v_lshlrev_b32_e32 v25, 16, v24
@@ -20476,16 +20052,14 @@ define <32 x bfloat> @v_minnum_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) {
; GFX8-NEXT: v_and_b32_e32 v24, 0xffff0000, v24
; GFX8-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
; GFX8-NEXT: v_add_u32_e32 v33, vcc, s4, v33
-; GFX8-NEXT: v_and_b32_e32 v34, 0xff800000, v25
; GFX8-NEXT: v_min_f32_e32 v8, v8, v24
-; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v34
+; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v25
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v25, v25
; GFX8-NEXT: v_bfe_u32 v24, v8, 16, 1
; GFX8-NEXT: v_cndmask_b32_e32 v25, v33, v34, vcc
; GFX8-NEXT: v_add_u32_e32 v24, vcc, v24, v8
; GFX8-NEXT: v_add_u32_e32 v24, vcc, s4, v24
-; GFX8-NEXT: v_and_b32_e32 v33, 0xff800000, v8
-; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v33
+; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v8
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v8, v8
; GFX8-NEXT: v_cndmask_b32_e32 v8, v24, v33, vcc
; GFX8-NEXT: v_lshlrev_b32_e32 v24, 16, v23
@@ -20496,16 +20070,14 @@ define <32 x bfloat> @v_minnum_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) {
; GFX8-NEXT: v_and_b32_e32 v23, 0xffff0000, v23
; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
; GFX8-NEXT: v_add_u32_e32 v33, vcc, s4, v33
-; GFX8-NEXT: v_and_b32_e32 v34, 0xff800000, v24
; GFX8-NEXT: v_min_f32_e32 v7, v7, v23
-; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v34
+; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v24
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v24, v24
; GFX8-NEXT: v_bfe_u32 v23, v7, 16, 1
; GFX8-NEXT: v_cndmask_b32_e32 v24, v33, v34, vcc
; GFX8-NEXT: v_add_u32_e32 v23, vcc, v23, v7
; GFX8-NEXT: v_add_u32_e32 v23, vcc, s4, v23
-; GFX8-NEXT: v_and_b32_e32 v33, 0xff800000, v7
-; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v33
+; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v7
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v7, v7
; GFX8-NEXT: v_cndmask_b32_e32 v7, v23, v33, vcc
; GFX8-NEXT: v_lshlrev_b32_e32 v23, 16, v22
@@ -20516,16 +20088,14 @@ define <32 x bfloat> @v_minnum_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) {
; GFX8-NEXT: v_and_b32_e32 v22, 0xffff0000, v22
; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
; GFX8-NEXT: v_add_u32_e32 v33, vcc, s4, v33
-; GFX8-NEXT: v_and_b32_e32 v34, 0xff800000, v23
; GFX8-NEXT: v_min_f32_e32 v6, v6, v22
-; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v34
+; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v23
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v23, v23
; GFX8-NEXT: v_bfe_u32 v22, v6, 16, 1
; GFX8-NEXT: v_cndmask_b32_e32 v23, v33, v34, vcc
; GFX8-NEXT: v_add_u32_e32 v22, vcc, v22, v6
; GFX8-NEXT: v_add_u32_e32 v22, vcc, s4, v22
-; GFX8-NEXT: v_and_b32_e32 v33, 0xff800000, v6
-; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v33
+; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v6
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
; GFX8-NEXT: v_cndmask_b32_e32 v6, v22, v33, vcc
; GFX8-NEXT: v_lshlrev_b32_e32 v22, 16, v21
@@ -20536,16 +20106,14 @@ define <32 x bfloat> @v_minnum_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) {
; GFX8-NEXT: v_and_b32_e32 v21, 0xffff0000, v21
; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
; GFX8-NEXT: v_add_u32_e32 v33, vcc, s4, v33
-; GFX8-NEXT: v_and_b32_e32 v34, 0xff800000, v22
; GFX8-NEXT: v_min_f32_e32 v5, v5, v21
-; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v34
+; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v22
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v22, v22
; GFX8-NEXT: v_bfe_u32 v21, v5, 16, 1
; GFX8-NEXT: v_cndmask_b32_e32 v22, v33, v34, vcc
; GFX8-NEXT: v_add_u32_e32 v21, vcc, v21, v5
; GFX8-NEXT: v_add_u32_e32 v21, vcc, s4, v21
-; GFX8-NEXT: v_and_b32_e32 v33, 0xff800000, v5
-; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v33
+; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v5
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
; GFX8-NEXT: v_cndmask_b32_e32 v5, v21, v33, vcc
; GFX8-NEXT: v_lshlrev_b32_e32 v21, 16, v20
@@ -20556,16 +20124,14 @@ define <32 x bfloat> @v_minnum_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) {
; GFX8-NEXT: v_and_b32_e32 v20, 0xffff0000, v20
; GFX8-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
; GFX8-NEXT: v_add_u32_e32 v33, vcc, s4, v33
-; GFX8-NEXT: v_and_b32_e32 v34, 0xff800000, v21
; GFX8-NEXT: v_min_f32_e32 v4, v4, v20
-; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v34
+; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v21
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v21, v21
; GFX8-NEXT: v_bfe_u32 v20, v4, 16, 1
; GFX8-NEXT: v_cndmask_b32_e32 v21, v33, v34, vcc
; GFX8-NEXT: v_add_u32_e32 v20, vcc, v20, v4
; GFX8-NEXT: v_add_u32_e32 v20, vcc, s4, v20
-; GFX8-NEXT: v_and_b32_e32 v33, 0xff800000, v4
-; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v33
+; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v4
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
; GFX8-NEXT: v_cndmask_b32_e32 v4, v20, v33, vcc
; GFX8-NEXT: v_lshlrev_b32_e32 v20, 16, v19
@@ -20576,16 +20142,14 @@ define <32 x bfloat> @v_minnum_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) {
; GFX8-NEXT: v_and_b32_e32 v19, 0xffff0000, v19
; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
; GFX8-NEXT: v_add_u32_e32 v33, vcc, s4, v33
-; GFX8-NEXT: v_and_b32_e32 v34, 0xff800000, v20
; GFX8-NEXT: v_min_f32_e32 v3, v3, v19
-; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v34
+; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v20
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v20, v20
; GFX8-NEXT: v_bfe_u32 v19, v3, 16, 1
; GFX8-NEXT: v_cndmask_b32_e32 v20, v33, v34, vcc
; GFX8-NEXT: v_add_u32_e32 v19, vcc, v19, v3
; GFX8-NEXT: v_add_u32_e32 v19, vcc, s4, v19
-; GFX8-NEXT: v_and_b32_e32 v33, 0xff800000, v3
-; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v33
+; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v3
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
; GFX8-NEXT: v_cndmask_b32_e32 v3, v19, v33, vcc
; GFX8-NEXT: v_lshlrev_b32_e32 v19, 16, v18
@@ -20596,16 +20160,14 @@ define <32 x bfloat> @v_minnum_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) {
; GFX8-NEXT: v_and_b32_e32 v18, 0xffff0000, v18
; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX8-NEXT: v_add_u32_e32 v33, vcc, s4, v33
-; GFX8-NEXT: v_and_b32_e32 v34, 0xff800000, v19
; GFX8-NEXT: v_min_f32_e32 v2, v2, v18
-; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v34
+; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v19
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v19, v19
; GFX8-NEXT: v_bfe_u32 v18, v2, 16, 1
; GFX8-NEXT: v_cndmask_b32_e32 v19, v33, v34, vcc
; GFX8-NEXT: v_add_u32_e32 v18, vcc, v18, v2
; GFX8-NEXT: v_add_u32_e32 v18, vcc, s4, v18
-; GFX8-NEXT: v_and_b32_e32 v33, 0xff800000, v2
-; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v33
+; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v2
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
; GFX8-NEXT: v_cndmask_b32_e32 v2, v18, v33, vcc
; GFX8-NEXT: v_lshlrev_b32_e32 v18, 16, v17
@@ -20616,16 +20178,14 @@ define <32 x bfloat> @v_minnum_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) {
; GFX8-NEXT: v_and_b32_e32 v17, 0xffff0000, v17
; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX8-NEXT: v_add_u32_e32 v33, vcc, s4, v33
-; GFX8-NEXT: v_and_b32_e32 v34, 0xff800000, v18
; GFX8-NEXT: v_min_f32_e32 v1, v1, v17
-; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v34
+; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v18
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v18, v18
; GFX8-NEXT: v_bfe_u32 v17, v1, 16, 1
; GFX8-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc
; GFX8-NEXT: v_add_u32_e32 v17, vcc, v17, v1
; GFX8-NEXT: v_add_u32_e32 v17, vcc, s4, v17
-; GFX8-NEXT: v_and_b32_e32 v33, 0xff800000, v1
-; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v33
+; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v1
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
; GFX8-NEXT: v_cndmask_b32_e32 v1, v17, v33, vcc
; GFX8-NEXT: v_lshlrev_b32_e32 v17, 16, v16
@@ -20636,16 +20196,14 @@ define <32 x bfloat> @v_minnum_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) {
; GFX8-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX8-NEXT: v_add_u32_e32 v33, vcc, s4, v33
-; GFX8-NEXT: v_and_b32_e32 v34, 0xff800000, v17
; GFX8-NEXT: v_min_f32_e32 v0, v0, v16
-; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v34
+; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v17
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v17, v17
; GFX8-NEXT: v_bfe_u32 v16, v0, 16, 1
; GFX8-NEXT: v_cndmask_b32_e32 v17, v33, v34, vcc
; GFX8-NEXT: v_add_u32_e32 v16, vcc, v16, v0
; GFX8-NEXT: v_add_u32_e32 v16, vcc, s4, v16
-; GFX8-NEXT: v_and_b32_e32 v33, 0xff800000, v0
-; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v33
+; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v0
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
; GFX8-NEXT: v_cndmask_b32_e32 v0, v16, v33, vcc
; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
@@ -20686,292 +20244,260 @@ define <32 x bfloat> @v_minnum_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) {
; GFX9-NEXT: v_lshlrev_b32_e32 v31, 16, v30
; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v14
; GFX9-NEXT: v_min_f32_e32 v31, v32, v31
-; GFX9-NEXT: s_movk_i32 s4, 0x7fff
-; GFX9-NEXT: v_bfe_u32 v32, v31, 16, 1
-; GFX9-NEXT: v_and_b32_e32 v33, 0xff800000, v31
; GFX9-NEXT: v_and_b32_e32 v30, 0xffff0000, v30
; GFX9-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
+; GFX9-NEXT: s_movk_i32 s4, 0x7fff
+; GFX9-NEXT: v_bfe_u32 v32, v31, 16, 1
+; GFX9-NEXT: v_min_f32_e32 v14, v14, v30
; GFX9-NEXT: v_add3_u32 v32, v32, v31, s4
-; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v33
+; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v31
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v31, v31
-; GFX9-NEXT: v_min_f32_e32 v14, v14, v30
-; GFX9-NEXT: v_cndmask_b32_e32 v31, v32, v33, vcc
; GFX9-NEXT: v_bfe_u32 v30, v14, 16, 1
-; GFX9-NEXT: v_and_b32_e32 v32, 0xff800000, v14
+; GFX9-NEXT: v_cndmask_b32_e32 v31, v32, v33, vcc
; GFX9-NEXT: v_add3_u32 v30, v30, v14, s4
-; GFX9-NEXT: v_or_b32_e32 v32, 0x400000, v32
+; GFX9-NEXT: v_or_b32_e32 v32, 0x400000, v14
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v14, v14
; GFX9-NEXT: v_cndmask_b32_e32 v14, v30, v32, vcc
; GFX9-NEXT: v_lshlrev_b32_e32 v30, 16, v29
; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v13
+; GFX9-NEXT: v_min_f32_e32 v30, v32, v30
; GFX9-NEXT: v_and_b32_e32 v29, 0xffff0000, v29
; GFX9-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
-; GFX9-NEXT: v_min_f32_e32 v13, v13, v29
-; GFX9-NEXT: buffer_load_dword v29, off, s[0:3], s32
-; GFX9-NEXT: v_min_f32_e32 v30, v32, v30
; GFX9-NEXT: v_bfe_u32 v32, v30, 16, 1
-; GFX9-NEXT: v_and_b32_e32 v33, 0xff800000, v30
+; GFX9-NEXT: v_min_f32_e32 v13, v13, v29
; GFX9-NEXT: v_add3_u32 v32, v32, v30, s4
-; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v33
+; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v30
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v30, v30
+; GFX9-NEXT: v_bfe_u32 v29, v13, 16, 1
; GFX9-NEXT: v_cndmask_b32_e32 v30, v32, v33, vcc
-; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v15
+; GFX9-NEXT: v_add3_u32 v29, v29, v13, s4
+; GFX9-NEXT: v_or_b32_e32 v32, 0x400000, v13
+; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v13, v13
+; GFX9-NEXT: v_cndmask_b32_e32 v13, v29, v32, vcc
+; GFX9-NEXT: v_lshlrev_b32_e32 v29, 16, v28
+; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v12
+; GFX9-NEXT: v_min_f32_e32 v32, v32, v29
+; GFX9-NEXT: buffer_load_dword v29, off, s[0:3], s32
+; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v15
; GFX9-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
+; GFX9-NEXT: v_and_b32_e32 v28, 0xffff0000, v28
+; GFX9-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
+; GFX9-NEXT: v_min_f32_e32 v12, v12, v28
+; GFX9-NEXT: v_bfe_u32 v28, v12, 16, 1
+; GFX9-NEXT: v_add3_u32 v28, v28, v12, s4
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v29
-; GFX9-NEXT: v_min_f32_e32 v32, v32, v33
+; GFX9-NEXT: v_lshlrev_b32_e32 v34, 16, v29
+; GFX9-NEXT: v_min_f32_e32 v33, v33, v34
; GFX9-NEXT: v_and_b32_e32 v29, 0xffff0000, v29
; GFX9-NEXT: v_min_f32_e32 v29, v15, v29
-; GFX9-NEXT: v_bfe_u32 v15, v32, 16, 1
-; GFX9-NEXT: v_and_b32_e32 v33, 0xff800000, v32
-; GFX9-NEXT: v_add3_u32 v15, v15, v32, s4
-; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v33
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32
-; GFX9-NEXT: v_cndmask_b32_e32 v15, v15, v33, vcc
-; GFX9-NEXT: v_bfe_u32 v32, v29, 16, 1
-; GFX9-NEXT: v_and_b32_e32 v33, 0xff800000, v29
-; GFX9-NEXT: v_add3_u32 v32, v32, v29, s4
-; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v33
+; GFX9-NEXT: v_bfe_u32 v15, v33, 16, 1
+; GFX9-NEXT: v_add3_u32 v15, v15, v33, s4
+; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v33
+; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v33, v33
+; GFX9-NEXT: v_bfe_u32 v33, v29, 16, 1
+; GFX9-NEXT: v_cndmask_b32_e32 v15, v15, v34, vcc
+; GFX9-NEXT: v_add3_u32 v33, v33, v29, s4
+; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v29
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v29, v29
-; GFX9-NEXT: v_cndmask_b32_e32 v29, v32, v33, vcc
-; GFX9-NEXT: v_bfe_u32 v32, v13, 16, 1
-; GFX9-NEXT: v_and_b32_e32 v33, 0xff800000, v13
-; GFX9-NEXT: v_add3_u32 v32, v32, v13, s4
-; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v33
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v13, v13
-; GFX9-NEXT: v_cndmask_b32_e32 v13, v32, v33, vcc
-; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v28
-; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v12
-; GFX9-NEXT: v_min_f32_e32 v32, v33, v32
+; GFX9-NEXT: v_cndmask_b32_e32 v29, v33, v34, vcc
; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1
-; GFX9-NEXT: v_and_b32_e32 v34, 0xff800000, v32
-; GFX9-NEXT: v_and_b32_e32 v28, 0xffff0000, v28
-; GFX9-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
; GFX9-NEXT: v_add3_u32 v33, v33, v32, s4
-; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v34
+; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v32
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32
-; GFX9-NEXT: v_min_f32_e32 v12, v12, v28
; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc
-; GFX9-NEXT: v_bfe_u32 v28, v12, 16, 1
-; GFX9-NEXT: v_and_b32_e32 v33, 0xff800000, v12
-; GFX9-NEXT: v_add3_u32 v28, v28, v12, s4
-; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v33
+; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v12
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v12, v12
; GFX9-NEXT: v_cndmask_b32_e32 v12, v28, v33, vcc
; GFX9-NEXT: v_lshlrev_b32_e32 v28, 16, v27
; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v11
; GFX9-NEXT: v_min_f32_e32 v28, v33, v28
-; GFX9-NEXT: v_bfe_u32 v33, v28, 16, 1
-; GFX9-NEXT: v_and_b32_e32 v34, 0xff800000, v28
; GFX9-NEXT: v_and_b32_e32 v27, 0xffff0000, v27
; GFX9-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
+; GFX9-NEXT: v_bfe_u32 v33, v28, 16, 1
+; GFX9-NEXT: v_min_f32_e32 v11, v11, v27
; GFX9-NEXT: v_add3_u32 v33, v33, v28, s4
-; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v34
+; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v28
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v28, v28
-; GFX9-NEXT: v_min_f32_e32 v11, v11, v27
-; GFX9-NEXT: v_cndmask_b32_e32 v28, v33, v34, vcc
; GFX9-NEXT: v_bfe_u32 v27, v11, 16, 1
-; GFX9-NEXT: v_and_b32_e32 v33, 0xff800000, v11
+; GFX9-NEXT: v_cndmask_b32_e32 v28, v33, v34, vcc
; GFX9-NEXT: v_add3_u32 v27, v27, v11, s4
-; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v33
+; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v11
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v11, v11
; GFX9-NEXT: v_cndmask_b32_e32 v11, v27, v33, vcc
; GFX9-NEXT: v_lshlrev_b32_e32 v27, 16, v26
; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v10
; GFX9-NEXT: v_min_f32_e32 v27, v33, v27
-; GFX9-NEXT: v_bfe_u32 v33, v27, 16, 1
-; GFX9-NEXT: v_and_b32_e32 v34, 0xff800000, v27
; GFX9-NEXT: v_and_b32_e32 v26, 0xffff0000, v26
; GFX9-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
+; GFX9-NEXT: v_bfe_u32 v33, v27, 16, 1
+; GFX9-NEXT: v_min_f32_e32 v10, v10, v26
; GFX9-NEXT: v_add3_u32 v33, v33, v27, s4
-; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v34
+; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v27
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v27, v27
-; GFX9-NEXT: v_min_f32_e32 v10, v10, v26
-; GFX9-NEXT: v_cndmask_b32_e32 v27, v33, v34, vcc
; GFX9-NEXT: v_bfe_u32 v26, v10, 16, 1
-; GFX9-NEXT: v_and_b32_e32 v33, 0xff800000, v10
+; GFX9-NEXT: v_cndmask_b32_e32 v27, v33, v34, vcc
; GFX9-NEXT: v_add3_u32 v26, v26, v10, s4
-; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v33
+; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v10
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v10, v10
; GFX9-NEXT: v_cndmask_b32_e32 v10, v26, v33, vcc
; GFX9-NEXT: v_lshlrev_b32_e32 v26, 16, v25
; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v9
; GFX9-NEXT: v_min_f32_e32 v26, v33, v26
-; GFX9-NEXT: v_bfe_u32 v33, v26, 16, 1
-; GFX9-NEXT: v_and_b32_e32 v34, 0xff800000, v26
; GFX9-NEXT: v_and_b32_e32 v25, 0xffff0000, v25
; GFX9-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
+; GFX9-NEXT: v_bfe_u32 v33, v26, 16, 1
+; GFX9-NEXT: v_min_f32_e32 v9, v9, v25
; GFX9-NEXT: v_add3_u32 v33, v33, v26, s4
-; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v34
+; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v26
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v26, v26
-; GFX9-NEXT: v_min_f32_e32 v9, v9, v25
-; GFX9-NEXT: v_cndmask_b32_e32 v26, v33, v34, vcc
; GFX9-NEXT: v_bfe_u32 v25, v9, 16, 1
-; GFX9-NEXT: v_and_b32_e32 v33, 0xff800000, v9
+; GFX9-NEXT: v_cndmask_b32_e32 v26, v33, v34, vcc
; GFX9-NEXT: v_add3_u32 v25, v25, v9, s4
-; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v33
+; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v9
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v9, v9
; GFX9-NEXT: v_cndmask_b32_e32 v9, v25, v33, vcc
; GFX9-NEXT: v_lshlrev_b32_e32 v25, 16, v24
; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v8
; GFX9-NEXT: v_min_f32_e32 v25, v33, v25
-; GFX9-NEXT: v_bfe_u32 v33, v25, 16, 1
-; GFX9-NEXT: v_and_b32_e32 v34, 0xff800000, v25
; GFX9-NEXT: v_and_b32_e32 v24, 0xffff0000, v24
; GFX9-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
+; GFX9-NEXT: v_bfe_u32 v33, v25, 16, 1
+; GFX9-NEXT: v_min_f32_e32 v8, v8, v24
; GFX9-NEXT: v_add3_u32 v33, v33, v25, s4
-; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v34
+; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v25
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v25, v25
-; GFX9-NEXT: v_min_f32_e32 v8, v8, v24
-; GFX9-NEXT: v_cndmask_b32_e32 v25, v33, v34, vcc
; GFX9-NEXT: v_bfe_u32 v24, v8, 16, 1
-; GFX9-NEXT: v_and_b32_e32 v33, 0xff800000, v8
+; GFX9-NEXT: v_cndmask_b32_e32 v25, v33, v34, vcc
; GFX9-NEXT: v_add3_u32 v24, v24, v8, s4
-; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v33
+; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v8
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v8, v8
; GFX9-NEXT: v_cndmask_b32_e32 v8, v24, v33, vcc
; GFX9-NEXT: v_lshlrev_b32_e32 v24, 16, v23
; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v7
; GFX9-NEXT: v_min_f32_e32 v24, v33, v24
-; GFX9-NEXT: v_bfe_u32 v33, v24, 16, 1
-; GFX9-NEXT: v_and_b32_e32 v34, 0xff800000, v24
; GFX9-NEXT: v_and_b32_e32 v23, 0xffff0000, v23
; GFX9-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
+; GFX9-NEXT: v_bfe_u32 v33, v24, 16, 1
+; GFX9-NEXT: v_min_f32_e32 v7, v7, v23
; GFX9-NEXT: v_add3_u32 v33, v33, v24, s4
-; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v34
+; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v24
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v24, v24
-; GFX9-NEXT: v_min_f32_e32 v7, v7, v23
-; GFX9-NEXT: v_cndmask_b32_e32 v24, v33, v34, vcc
; GFX9-NEXT: v_bfe_u32 v23, v7, 16, 1
-; GFX9-NEXT: v_and_b32_e32 v33, 0xff800000, v7
+; GFX9-NEXT: v_cndmask_b32_e32 v24, v33, v34, vcc
; GFX9-NEXT: v_add3_u32 v23, v23, v7, s4
-; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v33
+; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v7
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v7, v7
; GFX9-NEXT: v_cndmask_b32_e32 v7, v23, v33, vcc
; GFX9-NEXT: v_lshlrev_b32_e32 v23, 16, v22
; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v6
; GFX9-NEXT: v_min_f32_e32 v23, v33, v23
-; GFX9-NEXT: v_bfe_u32 v33, v23, 16, 1
-; GFX9-NEXT: v_and_b32_e32 v34, 0xff800000, v23
; GFX9-NEXT: v_and_b32_e32 v22, 0xffff0000, v22
; GFX9-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
+; GFX9-NEXT: v_bfe_u32 v33, v23, 16, 1
+; GFX9-NEXT: v_min_f32_e32 v6, v6, v22
; GFX9-NEXT: v_add3_u32 v33, v33, v23, s4
-; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v34
+; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v23
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v23, v23
-; GFX9-NEXT: v_min_f32_e32 v6, v6, v22
-; GFX9-NEXT: v_cndmask_b32_e32 v23, v33, v34, vcc
; GFX9-NEXT: v_bfe_u32 v22, v6, 16, 1
-; GFX9-NEXT: v_and_b32_e32 v33, 0xff800000, v6
+; GFX9-NEXT: v_cndmask_b32_e32 v23, v33, v34, vcc
; GFX9-NEXT: v_add3_u32 v22, v22, v6, s4
-; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v33
+; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v6
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
; GFX9-NEXT: v_cndmask_b32_e32 v6, v22, v33, vcc
; GFX9-NEXT: v_lshlrev_b32_e32 v22, 16, v21
; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v5
; GFX9-NEXT: v_min_f32_e32 v22, v33, v22
-; GFX9-NEXT: v_bfe_u32 v33, v22, 16, 1
-; GFX9-NEXT: v_and_b32_e32 v34, 0xff800000, v22
; GFX9-NEXT: v_and_b32_e32 v21, 0xffff0000, v21
; GFX9-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
+; GFX9-NEXT: v_bfe_u32 v33, v22, 16, 1
+; GFX9-NEXT: v_min_f32_e32 v5, v5, v21
; GFX9-NEXT: v_add3_u32 v33, v33, v22, s4
-; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v34
+; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v22
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v22, v22
-; GFX9-NEXT: v_min_f32_e32 v5, v5, v21
-; GFX9-NEXT: v_cndmask_b32_e32 v22, v33, v34, vcc
; GFX9-NEXT: v_bfe_u32 v21, v5, 16, 1
-; GFX9-NEXT: v_and_b32_e32 v33, 0xff800000, v5
+; GFX9-NEXT: v_cndmask_b32_e32 v22, v33, v34, vcc
; GFX9-NEXT: v_add3_u32 v21, v21, v5, s4
-; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v33
+; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v5
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
; GFX9-NEXT: v_cndmask_b32_e32 v5, v21, v33, vcc
; GFX9-NEXT: v_lshlrev_b32_e32 v21, 16, v20
; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v4
; GFX9-NEXT: v_min_f32_e32 v21, v33, v21
-; GFX9-NEXT: v_bfe_u32 v33, v21, 16, 1
-; GFX9-NEXT: v_and_b32_e32 v34, 0xff800000, v21
; GFX9-NEXT: v_and_b32_e32 v20, 0xffff0000, v20
; GFX9-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
+; GFX9-NEXT: v_bfe_u32 v33, v21, 16, 1
+; GFX9-NEXT: v_min_f32_e32 v4, v4, v20
; GFX9-NEXT: v_add3_u32 v33, v33, v21, s4
-; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v34
+; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v21
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v21, v21
-; GFX9-NEXT: v_min_f32_e32 v4, v4, v20
-; GFX9-NEXT: v_cndmask_b32_e32 v21, v33, v34, vcc
; GFX9-NEXT: v_bfe_u32 v20, v4, 16, 1
-; GFX9-NEXT: v_and_b32_e32 v33, 0xff800000, v4
+; GFX9-NEXT: v_cndmask_b32_e32 v21, v33, v34, vcc
; GFX9-NEXT: v_add3_u32 v20, v20, v4, s4
-; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v33
+; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v4
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
; GFX9-NEXT: v_cndmask_b32_e32 v4, v20, v33, vcc
; GFX9-NEXT: v_lshlrev_b32_e32 v20, 16, v19
; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v3
; GFX9-NEXT: v_min_f32_e32 v20, v33, v20
-; GFX9-NEXT: v_bfe_u32 v33, v20, 16, 1
-; GFX9-NEXT: v_and_b32_e32 v34, 0xff800000, v20
; GFX9-NEXT: v_and_b32_e32 v19, 0xffff0000, v19
; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; GFX9-NEXT: v_bfe_u32 v33, v20, 16, 1
+; GFX9-NEXT: v_min_f32_e32 v3, v3, v19
; GFX9-NEXT: v_add3_u32 v33, v33, v20, s4
-; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v34
+; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v20
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v20, v20
-; GFX9-NEXT: v_min_f32_e32 v3, v3, v19
-; GFX9-NEXT: v_cndmask_b32_e32 v20, v33, v34, vcc
; GFX9-NEXT: v_bfe_u32 v19, v3, 16, 1
-; GFX9-NEXT: v_and_b32_e32 v33, 0xff800000, v3
+; GFX9-NEXT: v_cndmask_b32_e32 v20, v33, v34, vcc
; GFX9-NEXT: v_add3_u32 v19, v19, v3, s4
-; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v33
+; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v3
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
; GFX9-NEXT: v_cndmask_b32_e32 v3, v19, v33, vcc
; GFX9-NEXT: v_lshlrev_b32_e32 v19, 16, v18
; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v2
; GFX9-NEXT: v_min_f32_e32 v19, v33, v19
-; GFX9-NEXT: v_bfe_u32 v33, v19, 16, 1
-; GFX9-NEXT: v_and_b32_e32 v34, 0xff800000, v19
; GFX9-NEXT: v_and_b32_e32 v18, 0xffff0000, v18
; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX9-NEXT: v_bfe_u32 v33, v19, 16, 1
+; GFX9-NEXT: v_min_f32_e32 v2, v2, v18
; GFX9-NEXT: v_add3_u32 v33, v33, v19, s4
-; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v34
+; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v19
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v19, v19
-; GFX9-NEXT: v_min_f32_e32 v2, v2, v18
-; GFX9-NEXT: v_cndmask_b32_e32 v19, v33, v34, vcc
; GFX9-NEXT: v_bfe_u32 v18, v2, 16, 1
-; GFX9-NEXT: v_and_b32_e32 v33, 0xff800000, v2
+; GFX9-NEXT: v_cndmask_b32_e32 v19, v33, v34, vcc
; GFX9-NEXT: v_add3_u32 v18, v18, v2, s4
-; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v33
+; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v2
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
; GFX9-NEXT: v_cndmask_b32_e32 v2, v18, v33, vcc
; GFX9-NEXT: v_lshlrev_b32_e32 v18, 16, v17
; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v1
; GFX9-NEXT: v_min_f32_e32 v18, v33, v18
-; GFX9-NEXT: v_bfe_u32 v33, v18, 16, 1
-; GFX9-NEXT: v_and_b32_e32 v34, 0xff800000, v18
; GFX9-NEXT: v_and_b32_e32 v17, 0xffff0000, v17
; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GFX9-NEXT: v_bfe_u32 v33, v18, 16, 1
+; GFX9-NEXT: v_min_f32_e32 v1, v1, v17
; GFX9-NEXT: v_add3_u32 v33, v33, v18, s4
-; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v34
+; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v18
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v18, v18
-; GFX9-NEXT: v_min_f32_e32 v1, v1, v17
-; GFX9-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc
; GFX9-NEXT: v_bfe_u32 v17, v1, 16, 1
-; GFX9-NEXT: v_and_b32_e32 v33, 0xff800000, v1
+; GFX9-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc
; GFX9-NEXT: v_add3_u32 v17, v17, v1, s4
-; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v33
+; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v1
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
; GFX9-NEXT: v_cndmask_b32_e32 v1, v17, v33, vcc
; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v16
; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v0
; GFX9-NEXT: v_min_f32_e32 v17, v33, v17
-; GFX9-NEXT: v_bfe_u32 v33, v17, 16, 1
-; GFX9-NEXT: v_and_b32_e32 v34, 0xff800000, v17
; GFX9-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX9-NEXT: v_bfe_u32 v33, v17, 16, 1
+; GFX9-NEXT: v_min_f32_e32 v0, v0, v16
; GFX9-NEXT: v_add3_u32 v33, v33, v17, s4
-; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v34
+; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v17
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v17, v17
-; GFX9-NEXT: v_min_f32_e32 v0, v0, v16
-; GFX9-NEXT: v_cndmask_b32_e32 v17, v33, v34, vcc
; GFX9-NEXT: v_bfe_u32 v16, v0, 16, 1
-; GFX9-NEXT: v_and_b32_e32 v33, 0xff800000, v0
+; GFX9-NEXT: v_cndmask_b32_e32 v17, v33, v34, vcc
; GFX9-NEXT: v_add3_u32 v16, v16, v0, s4
-; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v33
+; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v0
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
; GFX9-NEXT: v_cndmask_b32_e32 v0, v16, v33, vcc
; GFX9-NEXT: s_mov_b32 s4, 0x7060302
@@ -20996,7 +20522,7 @@ define <32 x bfloat> @v_minnum_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) {
; GFX10-LABEL: v_minnum_v32bf16:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: buffer_load_dword v31, off, s[0:3], s32
+; GFX10-NEXT: buffer_load_dword v32, off, s[0:3], s32
; GFX10-NEXT: v_lshlrev_b32_e32 v39, 16, v27
; GFX10-NEXT: v_lshlrev_b32_e32 v48, 16, v11
; GFX10-NEXT: v_and_b32_e32 v27, 0xffff0000, v27
@@ -21061,7 +20587,6 @@ define <32 x bfloat> @v_minnum_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) {
; GFX10-NEXT: v_bfe_u32 v55, v11, 16, 1
; GFX10-NEXT: v_bfe_u32 v65, v49, 16, 1
; GFX10-NEXT: v_bfe_u32 v67, v10, 16, 1
-; GFX10-NEXT: s_mov_b32 s23, 0xff800000
; GFX10-NEXT: v_lshlrev_b32_e32 v33, 16, v30
; GFX10-NEXT: v_lshlrev_b32_e32 v34, 16, v14
; GFX10-NEXT: v_and_b32_e32 v30, 0xffff0000, v30
@@ -21077,10 +20602,10 @@ define <32 x bfloat> @v_minnum_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) {
; GFX10-NEXT: v_min_f32_e32 v1, v1, v17
; GFX10-NEXT: v_min_f32_e32 v17, v26, v50
; GFX10-NEXT: v_min_f32_e32 v0, v0, v16
-; GFX10-NEXT: v_and_or_b32 v54, v39, s23, 0x400000
-; GFX10-NEXT: v_and_or_b32 v64, v11, s23, 0x400000
-; GFX10-NEXT: v_and_or_b32 v66, v49, s23, 0x400000
-; GFX10-NEXT: v_and_or_b32 v68, v10, s23, 0x400000
+; GFX10-NEXT: v_or_b32_e32 v54, 0x400000, v39
+; GFX10-NEXT: v_or_b32_e32 v64, 0x400000, v11
+; GFX10-NEXT: v_or_b32_e32 v66, 0x400000, v49
+; GFX10-NEXT: v_or_b32_e32 v68, 0x400000, v10
; GFX10-NEXT: v_cmp_u_f32_e64 s9, v39, v39
; GFX10-NEXT: v_add3_u32 v39, v53, v39, 0x7fff
; GFX10-NEXT: v_cmp_u_f32_e64 s10, v11, v11
@@ -21118,28 +20643,28 @@ define <32 x bfloat> @v_minnum_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) {
; GFX10-NEXT: v_bfe_u32 v27, v14, 16, 1
; GFX10-NEXT: v_bfe_u32 v29, v35, 16, 1
; GFX10-NEXT: v_bfe_u32 v34, v13, 16, 1
-; GFX10-NEXT: v_and_or_b32 v48, v37, s23, 0x400000
-; GFX10-NEXT: v_and_or_b32 v52, v12, s23, 0x400000
+; GFX10-NEXT: v_or_b32_e32 v48, 0x400000, v37
+; GFX10-NEXT: v_or_b32_e32 v52, 0x400000, v12
; GFX10-NEXT: v_cmp_u_f32_e64 s7, v37, v37
; GFX10-NEXT: v_add3_u32 v37, v38, v37, 0x7fff
; GFX10-NEXT: v_cmp_u_f32_e64 s8, v12, v12
; GFX10-NEXT: v_add3_u32 v12, v50, v12, 0x7fff
; GFX10-NEXT: v_cmp_u_f32_e64 s10, v18, v18
; GFX10-NEXT: v_add3_u32 v54, v54, v18, 0x7fff
-; GFX10-NEXT: v_and_or_b32 v18, v18, s23, 0x400000
+; GFX10-NEXT: v_or_b32_e32 v18, 0x400000, v18
; GFX10-NEXT: v_cmp_u_f32_e64 s11, v1, v1
; GFX10-NEXT: v_add3_u32 v64, v64, v1, 0x7fff
-; GFX10-NEXT: v_and_or_b32 v1, v1, s23, 0x400000
+; GFX10-NEXT: v_or_b32_e32 v1, 0x400000, v1
; GFX10-NEXT: v_cmp_u_f32_e64 s12, v17, v17
; GFX10-NEXT: v_add3_u32 v66, v66, v17, 0x7fff
-; GFX10-NEXT: v_and_or_b32 v17, v17, s23, 0x400000
+; GFX10-NEXT: v_or_b32_e32 v17, 0x400000, v17
; GFX10-NEXT: v_cmp_u_f32_e64 s22, v0, v0
; GFX10-NEXT: v_add3_u32 v68, v68, v0, 0x7fff
-; GFX10-NEXT: v_and_or_b32 v0, v0, s23, 0x400000
-; GFX10-NEXT: v_and_or_b32 v26, v33, s23, 0x400000
-; GFX10-NEXT: v_and_or_b32 v28, v14, s23, 0x400000
-; GFX10-NEXT: v_and_or_b32 v30, v35, s23, 0x400000
-; GFX10-NEXT: v_and_or_b32 v36, v13, s23, 0x400000
+; GFX10-NEXT: v_or_b32_e32 v0, 0x400000, v0
+; GFX10-NEXT: v_or_b32_e32 v26, 0x400000, v33
+; GFX10-NEXT: v_or_b32_e32 v28, 0x400000, v14
+; GFX10-NEXT: v_or_b32_e32 v30, 0x400000, v35
+; GFX10-NEXT: v_or_b32_e32 v36, 0x400000, v13
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v33, v33
; GFX10-NEXT: v_add3_u32 v16, v16, v33, 0x7fff
; GFX10-NEXT: v_bfe_u32 v33, v51, 16, 1
@@ -21158,12 +20683,12 @@ define <32 x bfloat> @v_minnum_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) {
; GFX10-NEXT: v_cndmask_b32_e64 v17, v66, v17, s12
; GFX10-NEXT: v_cndmask_b32_e64 v0, v68, v0, s22
; GFX10-NEXT: v_cndmask_b32_e64 v1, v64, v1, s11
-; GFX10-NEXT: v_lshlrev_b32_e32 v32, 16, v15
+; GFX10-NEXT: v_lshlrev_b32_e32 v31, 16, v15
; GFX10-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
-; GFX10-NEXT: v_and_or_b32 v27, v51, s23, 0x400000
+; GFX10-NEXT: v_or_b32_e32 v27, 0x400000, v51
; GFX10-NEXT: v_bfe_u32 v35, v9, 16, 1
; GFX10-NEXT: v_bfe_u32 v38, v25, 16, 1
-; GFX10-NEXT: v_and_or_b32 v67, v24, s23, 0x400000
+; GFX10-NEXT: v_or_b32_e32 v67, 0x400000, v24
; GFX10-NEXT: v_cmp_u_f32_e64 s13, v51, v51
; GFX10-NEXT: v_add3_u32 v33, v33, v51, 0x7fff
; GFX10-NEXT: v_bfe_u32 v51, v7, 16, 1
@@ -21180,51 +20705,51 @@ define <32 x bfloat> @v_minnum_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) {
; GFX10-NEXT: v_bfe_u32 v36, v3, 16, 1
; GFX10-NEXT: v_cmp_u_f32_e64 s8, v19, v19
; GFX10-NEXT: v_add3_u32 v48, v48, v19, 0x7fff
-; GFX10-NEXT: v_and_or_b32 v19, v19, s23, 0x400000
+; GFX10-NEXT: v_or_b32_e32 v19, 0x400000, v19
; GFX10-NEXT: v_cmp_u_f32_e64 s9, v2, v2
; GFX10-NEXT: v_add3_u32 v52, v52, v2, 0x7fff
-; GFX10-NEXT: v_and_or_b32 v2, v2, s23, 0x400000
+; GFX10-NEXT: v_or_b32_e32 v2, 0x400000, v2
; GFX10-NEXT: v_perm_b32 v0, v0, v17, 0x7060302
; GFX10-NEXT: v_perm_b32 v1, v1, v18, 0x7060302
-; GFX10-NEXT: v_and_or_b32 v34, v9, s23, 0x400000
-; GFX10-NEXT: v_and_or_b32 v50, v25, s23, 0x400000
+; GFX10-NEXT: v_or_b32_e32 v34, 0x400000, v9
+; GFX10-NEXT: v_or_b32_e32 v50, 0x400000, v25
; GFX10-NEXT: v_bfe_u32 v53, v8, 16, 1
; GFX10-NEXT: v_cmp_u_f32_e64 s14, v9, v9
; GFX10-NEXT: v_add3_u32 v9, v35, v9, 0x7fff
-; GFX10-NEXT: v_and_or_b32 v35, v7, s23, 0x400000
+; GFX10-NEXT: v_or_b32_e32 v35, 0x400000, v7
; GFX10-NEXT: v_cmp_u_f32_e64 s15, v25, v25
; GFX10-NEXT: v_add3_u32 v25, v38, v25, 0x7fff
; GFX10-NEXT: v_bfe_u32 v38, v23, 16, 1
; GFX10-NEXT: v_cmp_u_f32_e64 s18, v7, v7
; GFX10-NEXT: v_add3_u32 v7, v51, v7, 0x7fff
-; GFX10-NEXT: v_and_or_b32 v51, v6, s23, 0x400000
+; GFX10-NEXT: v_or_b32_e32 v51, 0x400000, v6
; GFX10-NEXT: v_cmp_u_f32_e64 s20, v6, v6
; GFX10-NEXT: v_add3_u32 v6, v65, v6, 0x7fff
; GFX10-NEXT: v_bfe_u32 v65, v5, 16, 1
; GFX10-NEXT: v_cmp_u_f32_e64 s4, v21, v21
; GFX10-NEXT: v_add3_u32 v26, v26, v21, 0x7fff
-; GFX10-NEXT: v_and_or_b32 v21, v21, s23, 0x400000
+; GFX10-NEXT: v_or_b32_e32 v21, 0x400000, v21
; GFX10-NEXT: v_cmp_u_f32_e64 s5, v4, v4
; GFX10-NEXT: v_add3_u32 v28, v28, v4, 0x7fff
-; GFX10-NEXT: v_and_or_b32 v4, v4, s23, 0x400000
+; GFX10-NEXT: v_or_b32_e32 v4, 0x400000, v4
; GFX10-NEXT: v_cmp_u_f32_e64 s6, v20, v20
; GFX10-NEXT: v_add3_u32 v30, v30, v20, 0x7fff
-; GFX10-NEXT: v_and_or_b32 v20, v20, s23, 0x400000
+; GFX10-NEXT: v_or_b32_e32 v20, 0x400000, v20
; GFX10-NEXT: v_cmp_u_f32_e64 s7, v3, v3
; GFX10-NEXT: v_add3_u32 v36, v36, v3, 0x7fff
-; GFX10-NEXT: v_and_or_b32 v3, v3, s23, 0x400000
+; GFX10-NEXT: v_or_b32_e32 v3, 0x400000, v3
; GFX10-NEXT: v_cndmask_b32_e64 v19, v48, v19, s8
; GFX10-NEXT: v_cndmask_b32_e64 v2, v52, v2, s9
-; GFX10-NEXT: v_and_or_b32 v55, v8, s23, 0x400000
+; GFX10-NEXT: v_or_b32_e32 v55, 0x400000, v8
; GFX10-NEXT: v_cmp_u_f32_e64 s16, v8, v8
; GFX10-NEXT: v_add3_u32 v8, v53, v8, 0x7fff
-; GFX10-NEXT: v_and_or_b32 v53, v23, s23, 0x400000
+; GFX10-NEXT: v_or_b32_e32 v53, 0x400000, v23
; GFX10-NEXT: v_cmp_u_f32_e64 s19, v23, v23
; GFX10-NEXT: v_add3_u32 v23, v38, v23, 0x7fff
; GFX10-NEXT: v_bfe_u32 v38, v22, 16, 1
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
; GFX10-NEXT: v_add3_u32 v65, v65, v5, 0x7fff
-; GFX10-NEXT: v_and_or_b32 v5, v5, s23, 0x400000
+; GFX10-NEXT: v_or_b32_e32 v5, 0x400000, v5
; GFX10-NEXT: v_cndmask_b32_e64 v21, v26, v21, s4
; GFX10-NEXT: v_cndmask_b32_e64 v4, v28, v4, s5
; GFX10-NEXT: v_cndmask_b32_e64 v20, v30, v20, s6
@@ -21232,7 +20757,7 @@ define <32 x bfloat> @v_minnum_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) {
; GFX10-NEXT: v_perm_b32 v2, v2, v19, 0x7060302
; GFX10-NEXT: v_cmp_u_f32_e64 s21, v22, v22
; GFX10-NEXT: v_add3_u32 v38, v38, v22, 0x7fff
-; GFX10-NEXT: v_and_or_b32 v22, v22, s23, 0x400000
+; GFX10-NEXT: v_or_b32_e32 v22, 0x400000, v22
; GFX10-NEXT: v_cndmask_b32_e32 v5, v65, v5, vcc_lo
; GFX10-NEXT: v_perm_b32 v3, v3, v20, 0x7060302
; GFX10-NEXT: v_perm_b32 v4, v4, v21, 0x7060302
@@ -21256,14 +20781,14 @@ define <32 x bfloat> @v_minnum_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) {
; GFX10-NEXT: v_perm_b32 v13, v13, v29, 0x7060302
; GFX10-NEXT: v_perm_b32 v14, v14, v16, 0x7060302
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_lshlrev_b32_e32 v17, 16, v31
-; GFX10-NEXT: v_and_b32_e32 v18, 0xffff0000, v31
-; GFX10-NEXT: v_min_f32_e32 v17, v32, v17
+; GFX10-NEXT: v_lshlrev_b32_e32 v17, 16, v32
+; GFX10-NEXT: v_and_b32_e32 v18, 0xffff0000, v32
+; GFX10-NEXT: v_min_f32_e32 v17, v31, v17
; GFX10-NEXT: v_min_f32_e32 v15, v15, v18
; GFX10-NEXT: v_bfe_u32 v18, v17, 16, 1
; GFX10-NEXT: v_bfe_u32 v19, v15, 16, 1
-; GFX10-NEXT: v_and_or_b32 v20, v17, s23, 0x400000
-; GFX10-NEXT: v_and_or_b32 v21, v15, s23, 0x400000
+; GFX10-NEXT: v_or_b32_e32 v20, 0x400000, v17
+; GFX10-NEXT: v_or_b32_e32 v21, 0x400000, v15
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17
; GFX10-NEXT: v_cmp_u_f32_e64 s4, v15, v15
; GFX10-NEXT: v_add3_u32 v17, v18, v17, 0x7fff
@@ -21276,212 +20801,219 @@ define <32 x bfloat> @v_minnum_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) {
; GFX11-LABEL: v_minnum_v32bf16:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: scratch_load_b32 v31, off, s32
+; GFX11-NEXT: scratch_load_b32 v32, off, s32
+; GFX11-NEXT: v_lshlrev_b32_e32 v67, 16, v21
+; GFX11-NEXT: v_lshlrev_b32_e32 v68, 16, v5
+; GFX11-NEXT: v_and_b32_e32 v21, 0xffff0000, v21
+; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
; GFX11-NEXT: v_lshlrev_b32_e32 v83, 16, v17
; GFX11-NEXT: v_lshlrev_b32_e32 v84, 16, v1
; GFX11-NEXT: v_and_b32_e32 v17, 0xffff0000, v17
; GFX11-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GFX11-NEXT: v_lshlrev_b32_e32 v49, 16, v26
+; GFX11-NEXT: v_dual_min_f32 v5, v5, v21 :: v_dual_and_b32 v26, 0xffff0000, v26
; GFX11-NEXT: v_lshlrev_b32_e32 v53, 16, v24
-; GFX11-NEXT: v_and_b32_e32 v24, 0xffff0000, v24
-; GFX11-NEXT: s_mov_b32 s0, 0xff800000
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3)
-; GFX11-NEXT: v_dual_min_f32 v1, v1, v17 :: v_dual_lshlrev_b32 v64, 16, v7
-; GFX11-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-NEXT: v_dual_min_f32 v1, v1, v17 :: v_dual_and_b32 v24, 0xffff0000, v24
+; GFX11-NEXT: v_lshlrev_b32_e32 v71, 16, v19
+; GFX11-NEXT: v_bfe_u32 v103, v5, 16, 1
+; GFX11-NEXT: v_and_b32_e32 v19, 0xffff0000, v19
; GFX11-NEXT: v_lshlrev_b32_e32 v81, 16, v18
-; GFX11-NEXT: v_lshlrev_b32_e32 v85, 16, v16
-; GFX11-NEXT: v_lshlrev_b32_e32 v86, 16, v0
; GFX11-NEXT: v_bfe_u32 v135, v1, 16, 1
-; GFX11-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
-; GFX11-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GFX11-NEXT: v_lshlrev_b32_e32 v55, 16, v23
-; GFX11-NEXT: v_and_b32_e32 v23, 0xffff0000, v23
-; GFX11-NEXT: v_and_or_b32 v144, v1, s0, 0x400000
+; GFX11-NEXT: v_or_b32_e32 v112, 0x400000, v5
+; GFX11-NEXT: v_or_b32_e32 v144, 0x400000, v1
+; GFX11-NEXT: v_add3_u32 v103, v103, v5, 0x7fff
+; GFX11-NEXT: v_lshlrev_b32_e32 v80, 16, v3
+; GFX11-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
; GFX11-NEXT: v_add3_u32 v135, v135, v1, 0x7fff
; GFX11-NEXT: v_lshlrev_b32_e32 v82, 16, v2
-; GFX11-NEXT: v_lshlrev_b32_e32 v54, 16, v8
-; GFX11-NEXT: v_dual_min_f32 v17, v86, v85 :: v_dual_and_b32 v8, 0xffff0000, v8
-; GFX11-NEXT: v_dual_min_f32 v7, v7, v23 :: v_dual_lshlrev_b32 v36, 16, v13
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_dual_min_f32 v8, v8, v24 :: v_dual_lshlrev_b32 v39, 16, v27
-; GFX11-NEXT: v_dual_min_f32 v0, v0, v16 :: v_dual_lshlrev_b32 v49, 16, v26
-; GFX11-NEXT: v_min_f32_e32 v24, v64, v55
-; GFX11-NEXT: v_bfe_u32 v87, v7, 16, 1
+; GFX11-NEXT: v_lshlrev_b32_e32 v52, 16, v9
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-NEXT: v_dual_min_f32 v3, v3, v19 :: v_dual_lshlrev_b32 v54, 16, v8
+; GFX11-NEXT: v_lshlrev_b32_e32 v85, 16, v16
+; GFX11-NEXT: v_dual_min_f32 v19, v82, v81 :: v_dual_lshlrev_b32 v64, 16, v7
+; GFX11-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
; GFX11-NEXT: v_lshlrev_b32_e32 v65, 16, v22
; GFX11-NEXT: v_lshlrev_b32_e32 v66, 16, v6
-; GFX11-NEXT: v_and_b32_e32 v22, 0xffff0000, v22
-; GFX11-NEXT: v_bfe_u32 v85, v24, 16, 1
-; GFX11-NEXT: v_lshlrev_b32_e32 v67, 16, v21
-; GFX11-NEXT: v_lshlrev_b32_e32 v68, 16, v5
-; GFX11-NEXT: v_and_b32_e32 v21, 0xffff0000, v21
-; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-NEXT: v_bfe_u32 v129, v19, 16, 1
+; GFX11-NEXT: v_or_b32_e32 v130, 0x400000, v19
+; GFX11-NEXT: v_lshlrev_b32_e32 v48, 16, v11
+; GFX11-NEXT: v_bfe_u32 v119, v3, 16, 1
+; GFX11-NEXT: v_lshlrev_b32_e32 v51, 16, v25
+; GFX11-NEXT: v_add3_u32 v129, v129, v19, 0x7fff
+; GFX11-NEXT: v_lshlrev_b32_e32 v86, 16, v0
+; GFX11-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
+; GFX11-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX11-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT: v_dual_min_f32 v17, v86, v85 :: v_dual_and_b32 v2, 0xffff0000, v2
+; GFX11-NEXT: v_dual_min_f32 v8, v8, v24 :: v_dual_lshlrev_b32 v39, 16, v27
+; GFX11-NEXT: v_or_b32_e32 v128, 0x400000, v3
+; GFX11-NEXT: v_add3_u32 v119, v119, v3, 0x7fff
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-NEXT: v_bfe_u32 v145, v17, 16, 1
+; GFX11-NEXT: v_or_b32_e32 v146, 0x400000, v17
+; GFX11-NEXT: v_and_b32_e32 v18, 0xffff0000, v18
+; GFX11-NEXT: v_and_b32_e32 v25, 0xffff0000, v25
; GFX11-NEXT: v_lshlrev_b32_e32 v70, 16, v4
-; GFX11-NEXT: v_and_or_b32 v86, v24, s0, 0x400000
-; GFX11-NEXT: v_and_or_b32 v96, v7, s0, 0x400000
+; GFX11-NEXT: v_add3_u32 v145, v145, v17, 0x7fff
+; GFX11-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
+; GFX11-NEXT: v_lshlrev_b32_e32 v55, 16, v23
+; GFX11-NEXT: v_and_b32_e32 v23, 0xffff0000, v23
+; GFX11-NEXT: v_lshlrev_b32_e32 v50, 16, v10
+; GFX11-NEXT: v_min_f32_e32 v2, v2, v18
+; GFX11-NEXT: v_min_f32_e32 v0, v0, v16
+; GFX11-NEXT: v_dual_min_f32 v24, v64, v55 :: v_dual_lshlrev_b32 v37, 16, v28
+; GFX11-NEXT: v_min_f32_e32 v7, v7, v23
+; GFX11-NEXT: v_dual_min_f32 v23, v66, v65 :: v_dual_min_f32 v18, v84, v83
+; GFX11-NEXT: v_dual_min_f32 v9, v9, v25 :: v_dual_and_b32 v28, 0xffff0000, v28
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT: v_bfe_u32 v85, v24, 16, 1
+; GFX11-NEXT: v_bfe_u32 v97, v23, 16, 1
+; GFX11-NEXT: v_or_b32_e32 v86, 0x400000, v24
+; GFX11-NEXT: v_or_b32_e32 v98, 0x400000, v23
+; GFX11-NEXT: v_bfe_u32 v87, v7, 16, 1
; GFX11-NEXT: v_add3_u32 v85, v85, v24, 0x7fff
; GFX11-NEXT: v_lshlrev_b32_e32 v69, 16, v20
-; GFX11-NEXT: v_add3_u32 v87, v87, v7, 0x7fff
-; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
-; GFX11-NEXT: v_dual_min_f32 v23, v66, v65 :: v_dual_lshlrev_b32 v48, 16, v11
-; GFX11-NEXT: v_and_b32_e32 v27, 0xffff0000, v27
-; GFX11-NEXT: v_dual_min_f32 v5, v5, v21 :: v_dual_lshlrev_b32 v50, 16, v10
-; GFX11-NEXT: v_dual_min_f32 v21, v70, v69 :: v_dual_and_b32 v26, 0xffff0000, v26
-; GFX11-NEXT: v_and_b32_e32 v18, 0xffff0000, v18
-; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
-; GFX11-NEXT: v_min_f32_e32 v6, v6, v22
-; GFX11-NEXT: v_lshlrev_b32_e32 v52, 16, v9
-; GFX11-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
-; GFX11-NEXT: v_bfe_u32 v97, v23, 16, 1
-; GFX11-NEXT: v_min_f32_e32 v2, v2, v18
-; GFX11-NEXT: v_min_f32_e32 v18, v84, v83
-; GFX11-NEXT: v_bfe_u32 v83, v8, 16, 1
-; GFX11-NEXT: v_bfe_u32 v99, v6, 16, 1
-; GFX11-NEXT: v_bfe_u32 v103, v5, 16, 1
-; GFX11-NEXT: v_bfe_u32 v113, v21, 16, 1
-; GFX11-NEXT: v_lshlrev_b32_e32 v71, 16, v19
-; GFX11-NEXT: v_and_or_b32 v84, v8, s0, 0x400000
-; GFX11-NEXT: v_and_or_b32 v98, v23, s0, 0x400000
-; GFX11-NEXT: v_and_or_b32 v100, v6, s0, 0x400000
-; GFX11-NEXT: v_and_or_b32 v112, v5, s0, 0x400000
-; GFX11-NEXT: v_and_or_b32 v114, v21, s0, 0x400000
-; GFX11-NEXT: v_add3_u32 v83, v83, v8, 0x7fff
-; GFX11-NEXT: v_and_b32_e32 v19, 0xffff0000, v19
-; GFX11-NEXT: v_add3_u32 v97, v97, v23, 0x7fff
; GFX11-NEXT: v_and_b32_e32 v20, 0xffff0000, v20
-; GFX11-NEXT: v_add3_u32 v99, v99, v6, 0x7fff
-; GFX11-NEXT: v_add3_u32 v103, v103, v5, 0x7fff
-; GFX11-NEXT: v_lshlrev_b32_e32 v80, 16, v3
-; GFX11-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
-; GFX11-NEXT: v_add3_u32 v113, v113, v21, 0x7fff
+; GFX11-NEXT: v_add3_u32 v97, v97, v23, 0x7fff
; GFX11-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
-; GFX11-NEXT: v_lshlrev_b32_e32 v38, 16, v12
; GFX11-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
-; GFX11-NEXT: v_dual_min_f32 v3, v3, v19 :: v_dual_and_b32 v10, 0xffff0000, v10
-; GFX11-NEXT: v_dual_min_f32 v22, v68, v67 :: v_dual_lshlrev_b32 v51, 16, v25
-; GFX11-NEXT: v_lshlrev_b32_e32 v37, 16, v28
-; GFX11-NEXT: v_dual_min_f32 v4, v4, v20 :: v_dual_and_b32 v25, 0xffff0000, v25
+; GFX11-NEXT: v_or_b32_e32 v96, 0x400000, v7
+; GFX11-NEXT: v_add3_u32 v87, v87, v7, 0x7fff
+; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
+; GFX11-NEXT: v_min_f32_e32 v4, v4, v20
; GFX11-NEXT: v_min_f32_e32 v20, v80, v71
-; GFX11-NEXT: v_dual_min_f32 v19, v82, v81 :: v_dual_and_b32 v28, 0xffff0000, v28
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3)
-; GFX11-NEXT: v_dual_min_f32 v9, v9, v25 :: v_dual_and_b32 v12, 0xffff0000, v12
-; GFX11-NEXT: v_min_f32_e32 v25, v54, v53
+; GFX11-NEXT: v_bfe_u32 v71, v9, 16, 1
+; GFX11-NEXT: v_or_b32_e32 v80, 0x400000, v9
; GFX11-NEXT: v_lshlrev_b32_e32 v35, 16, v29
-; GFX11-NEXT: v_and_b32_e32 v29, 0xffff0000, v29
-; GFX11-NEXT: v_dual_min_f32 v10, v10, v26 :: v_dual_and_b32 v13, 0xffff0000, v13
-; GFX11-NEXT: v_dual_min_f32 v12, v12, v28 :: v_dual_lshlrev_b32 v33, 16, v30
-; GFX11-NEXT: v_min_f32_e32 v28, v48, v39
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3)
-; GFX11-NEXT: v_dual_min_f32 v13, v13, v29 :: v_dual_lshlrev_b32 v34, 16, v14
-; GFX11-NEXT: v_dual_min_f32 v11, v11, v27 :: v_dual_and_b32 v14, 0xffff0000, v14
-; GFX11-NEXT: v_dual_min_f32 v27, v50, v49 :: v_dual_min_f32 v26, v52, v51
-; GFX11-NEXT: v_dual_min_f32 v29, v38, v37 :: v_dual_and_b32 v30, 0xffff0000, v30
-; GFX11-NEXT: v_lshlrev_b32_e32 v32, 16, v15
-; GFX11-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
-; GFX11-NEXT: v_bfe_u32 v39, v13, 16, 1
+; GFX11-NEXT: v_dual_min_f32 v21, v70, v69 :: v_dual_and_b32 v10, 0xffff0000, v10
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-NEXT: v_add3_u32 v71, v71, v9, 0x7fff
+; GFX11-NEXT: v_and_b32_e32 v22, 0xffff0000, v22
+; GFX11-NEXT: v_dual_min_f32 v10, v10, v26 :: v_dual_and_b32 v29, 0xffff0000, v29
+; GFX11-NEXT: v_and_b32_e32 v27, 0xffff0000, v27
+; GFX11-NEXT: v_min_f32_e32 v26, v52, v51
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-NEXT: v_min_f32_e32 v6, v6, v22
+; GFX11-NEXT: v_lshlrev_b32_e32 v36, 16, v13
+; GFX11-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
+; GFX11-NEXT: v_dual_min_f32 v11, v11, v27 :: v_dual_lshlrev_b32 v34, 16, v14
+; GFX11-NEXT: v_dual_min_f32 v22, v68, v67 :: v_dual_lshlrev_b32 v33, 16, v30
+; GFX11-NEXT: v_dual_min_f32 v27, v50, v49 :: v_dual_lshlrev_b32 v38, 16, v12
+; GFX11-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
+; GFX11-NEXT: v_dual_min_f32 v25, v54, v53 :: v_dual_and_b32 v12, 0xffff0000, v12
+; GFX11-NEXT: v_dual_min_f32 v13, v13, v29 :: v_dual_and_b32 v30, 0xffff0000, v30
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-NEXT: v_min_f32_e32 v29, v38, v37
+; GFX11-NEXT: v_lshlrev_b32_e32 v31, 16, v15
+; GFX11-NEXT: v_dual_min_f32 v12, v12, v28 :: v_dual_and_b32 v15, 0xffff0000, v15
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_4)
; GFX11-NEXT: v_min_f32_e32 v14, v14, v30
+; GFX11-NEXT: v_min_f32_e32 v28, v48, v39
; GFX11-NEXT: v_dual_min_f32 v30, v36, v35 :: v_dual_min_f32 v33, v34, v33
-; GFX11-NEXT: v_and_or_b32 v48, v13, s0, 0x400000
-; GFX11-NEXT: v_bfe_u32 v49, v29, 16, 1
+; GFX11-NEXT: v_bfe_u32 v39, v13, 16, 1
; GFX11-NEXT: v_bfe_u32 v35, v14, 16, 1
-; GFX11-NEXT: v_and_or_b32 v36, v14, s0, 0x400000
+; GFX11-NEXT: v_or_b32_e32 v36, 0x400000, v14
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-NEXT: v_bfe_u32 v37, v30, 16, 1
; GFX11-NEXT: v_bfe_u32 v16, v33, 16, 1
-; GFX11-NEXT: v_and_or_b32 v34, v33, s0, 0x400000
+; GFX11-NEXT: v_or_b32_e32 v34, 0x400000, v33
; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v33, v33
-; GFX11-NEXT: v_bfe_u32 v37, v30, 16, 1
; GFX11-NEXT: v_add3_u32 v35, v35, v14, 0x7fff
+; GFX11-NEXT: v_or_b32_e32 v38, 0x400000, v30
; GFX11-NEXT: v_add3_u32 v16, v16, v33, 0x7fff
-; GFX11-NEXT: v_and_or_b32 v38, v30, s0, 0x400000
-; GFX11-NEXT: v_add3_u32 v39, v39, v13, 0x7fff
; GFX11-NEXT: v_add3_u32 v37, v37, v30, 0x7fff
-; GFX11-NEXT: v_and_or_b32 v50, v29, s0, 0x400000
+; GFX11-NEXT: v_or_b32_e32 v48, 0x400000, v13
+; GFX11-NEXT: v_bfe_u32 v49, v29, 16, 1
+; GFX11-NEXT: v_add3_u32 v39, v39, v13, 0x7fff
; GFX11-NEXT: v_cndmask_b32_e32 v16, v16, v34, vcc_lo
; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v14, v14
+; GFX11-NEXT: v_or_b32_e32 v50, 0x400000, v29
; GFX11-NEXT: v_bfe_u32 v51, v12, 16, 1
; GFX11-NEXT: v_add3_u32 v49, v49, v29, 0x7fff
-; GFX11-NEXT: v_and_or_b32 v52, v12, s0, 0x400000
-; GFX11-NEXT: v_bfe_u32 v53, v28, 16, 1
+; GFX11-NEXT: v_or_b32_e32 v52, 0x400000, v12
; GFX11-NEXT: v_cndmask_b32_e32 v14, v35, v36, vcc_lo
; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v30, v30
+; GFX11-NEXT: v_bfe_u32 v53, v28, 16, 1
; GFX11-NEXT: v_add3_u32 v51, v51, v12, 0x7fff
-; GFX11-NEXT: v_and_or_b32 v54, v28, s0, 0x400000
+; GFX11-NEXT: v_or_b32_e32 v54, 0x400000, v28
; GFX11-NEXT: v_bfe_u32 v55, v11, 16, 1
-; GFX11-NEXT: v_add3_u32 v53, v53, v28, 0x7fff
; GFX11-NEXT: v_cndmask_b32_e32 v30, v37, v38, vcc_lo
; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v13, v13
-; GFX11-NEXT: v_and_or_b32 v64, v11, s0, 0x400000
+; GFX11-NEXT: v_add3_u32 v53, v53, v28, 0x7fff
+; GFX11-NEXT: v_or_b32_e32 v64, 0x400000, v11
; GFX11-NEXT: v_bfe_u32 v65, v27, 16, 1
; GFX11-NEXT: v_add3_u32 v55, v55, v11, 0x7fff
-; GFX11-NEXT: v_and_or_b32 v66, v27, s0, 0x400000
; GFX11-NEXT: v_cndmask_b32_e32 v13, v39, v48, vcc_lo
; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v29, v29
+; GFX11-NEXT: v_or_b32_e32 v66, 0x400000, v27
; GFX11-NEXT: v_bfe_u32 v67, v10, 16, 1
; GFX11-NEXT: v_add3_u32 v65, v65, v27, 0x7fff
-; GFX11-NEXT: v_and_or_b32 v68, v10, s0, 0x400000
-; GFX11-NEXT: v_bfe_u32 v69, v26, 16, 1
+; GFX11-NEXT: v_or_b32_e32 v68, 0x400000, v10
; GFX11-NEXT: v_cndmask_b32_e32 v29, v49, v50, vcc_lo
; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v12, v12
+; GFX11-NEXT: v_bfe_u32 v69, v26, 16, 1
; GFX11-NEXT: v_add3_u32 v67, v67, v10, 0x7fff
-; GFX11-NEXT: v_and_or_b32 v70, v26, s0, 0x400000
-; GFX11-NEXT: v_bfe_u32 v71, v9, 16, 1
-; GFX11-NEXT: v_add3_u32 v69, v69, v26, 0x7fff
+; GFX11-NEXT: v_or_b32_e32 v70, 0x400000, v26
+; GFX11-NEXT: v_bfe_u32 v81, v25, 16, 1
; GFX11-NEXT: v_cndmask_b32_e32 v12, v51, v52, vcc_lo
; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v28, v28
-; GFX11-NEXT: v_and_or_b32 v80, v9, s0, 0x400000
-; GFX11-NEXT: v_bfe_u32 v81, v25, 16, 1
-; GFX11-NEXT: v_add3_u32 v71, v71, v9, 0x7fff
-; GFX11-NEXT: v_and_or_b32 v82, v25, s0, 0x400000
+; GFX11-NEXT: v_add3_u32 v69, v69, v26, 0x7fff
+; GFX11-NEXT: v_or_b32_e32 v82, 0x400000, v25
+; GFX11-NEXT: v_bfe_u32 v83, v8, 16, 1
+; GFX11-NEXT: v_add3_u32 v81, v81, v25, 0x7fff
; GFX11-NEXT: v_cndmask_b32_e32 v28, v53, v54, vcc_lo
; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v11, v11
-; GFX11-NEXT: v_add3_u32 v81, v81, v25, 0x7fff
-; GFX11-NEXT: v_bfe_u32 v101, v22, 16, 1
-; GFX11-NEXT: v_and_or_b32 v102, v22, s0, 0x400000
-; GFX11-NEXT: v_bfe_u32 v115, v4, 16, 1
+; GFX11-NEXT: v_or_b32_e32 v84, 0x400000, v8
+; GFX11-NEXT: v_add3_u32 v83, v83, v8, 0x7fff
+; GFX11-NEXT: v_bfe_u32 v99, v6, 16, 1
+; GFX11-NEXT: v_or_b32_e32 v100, 0x400000, v6
; GFX11-NEXT: v_cndmask_b32_e32 v11, v55, v64, vcc_lo
; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v27, v27
-; GFX11-NEXT: v_add3_u32 v101, v101, v22, 0x7fff
-; GFX11-NEXT: v_and_or_b32 v116, v4, s0, 0x400000
-; GFX11-NEXT: v_bfe_u32 v117, v20, 16, 1
-; GFX11-NEXT: v_add3_u32 v115, v115, v4, 0x7fff
+; GFX11-NEXT: v_bfe_u32 v101, v22, 16, 1
+; GFX11-NEXT: v_add3_u32 v99, v99, v6, 0x7fff
+; GFX11-NEXT: v_or_b32_e32 v102, 0x400000, v22
+; GFX11-NEXT: v_bfe_u32 v113, v21, 16, 1
; GFX11-NEXT: v_cndmask_b32_e32 v27, v65, v66, vcc_lo
; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v10, v10
-; GFX11-NEXT: v_and_or_b32 v118, v20, s0, 0x400000
-; GFX11-NEXT: v_bfe_u32 v129, v19, 16, 1
-; GFX11-NEXT: v_add3_u32 v117, v117, v20, 0x7fff
-; GFX11-NEXT: v_and_or_b32 v130, v19, s0, 0x400000
+; GFX11-NEXT: v_add3_u32 v101, v101, v22, 0x7fff
+; GFX11-NEXT: v_or_b32_e32 v114, 0x400000, v21
+; GFX11-NEXT: v_bfe_u32 v115, v4, 16, 1
+; GFX11-NEXT: v_add3_u32 v113, v113, v21, 0x7fff
; GFX11-NEXT: v_cndmask_b32_e32 v10, v67, v68, vcc_lo
; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v26, v26
-; GFX11-NEXT: v_bfe_u32 v133, v18, 16, 1
-; GFX11-NEXT: v_add3_u32 v129, v129, v19, 0x7fff
-; GFX11-NEXT: v_and_or_b32 v134, v18, s0, 0x400000
-; GFX11-NEXT: v_bfe_u32 v145, v17, 16, 1
+; GFX11-NEXT: v_or_b32_e32 v116, 0x400000, v4
+; GFX11-NEXT: v_bfe_u32 v117, v20, 16, 1
+; GFX11-NEXT: v_add3_u32 v115, v115, v4, 0x7fff
+; GFX11-NEXT: v_or_b32_e32 v118, 0x400000, v20
; GFX11-NEXT: v_cndmask_b32_e32 v26, v69, v70, vcc_lo
; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9
-; GFX11-NEXT: v_add3_u32 v133, v133, v18, 0x7fff
-; GFX11-NEXT: v_and_or_b32 v146, v17, s0, 0x400000
+; GFX11-NEXT: v_add3_u32 v117, v117, v20, 0x7fff
+; GFX11-NEXT: v_bfe_u32 v133, v18, 16, 1
+; GFX11-NEXT: v_or_b32_e32 v134, 0x400000, v18
; GFX11-NEXT: v_bfe_u32 v147, v0, 16, 1
-; GFX11-NEXT: v_add3_u32 v145, v145, v17, 0x7fff
; GFX11-NEXT: v_cndmask_b32_e32 v9, v71, v80, vcc_lo
; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v25, v25
-; GFX11-NEXT: v_bfe_u32 v131, v2, 16, 1
-; GFX11-NEXT: v_and_or_b32 v33, v0, s0, 0x400000
+; GFX11-NEXT: v_add3_u32 v133, v133, v18, 0x7fff
+; GFX11-NEXT: v_or_b32_e32 v33, 0x400000, v0
; GFX11-NEXT: v_add3_u32 v147, v147, v0, 0x7fff
-; GFX11-NEXT: v_and_or_b32 v132, v2, s0, 0x400000
+; GFX11-NEXT: v_bfe_u32 v131, v2, 16, 1
; GFX11-NEXT: v_cndmask_b32_e32 v25, v81, v82, vcc_lo
; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8
-; GFX11-NEXT: v_add3_u32 v131, v131, v2, 0x7fff
-; GFX11-NEXT: v_bfe_u32 v119, v3, 16, 1
-; GFX11-NEXT: v_and_or_b32 v128, v3, s0, 0x400000
+; GFX11-NEXT: v_or_b32_e32 v132, 0x400000, v2
; GFX11-NEXT: v_perm_b32 v9, v9, v26, 0x7060302
+; GFX11-NEXT: v_add3_u32 v131, v131, v2, 0x7fff
+; GFX11-NEXT: v_perm_b32 v10, v10, v27, 0x7060302
; GFX11-NEXT: v_cndmask_b32_e32 v8, v83, v84, vcc_lo
; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v24, v24
-; GFX11-NEXT: v_add3_u32 v119, v119, v3, 0x7fff
-; GFX11-NEXT: v_perm_b32 v10, v10, v27, 0x7060302
; GFX11-NEXT: v_perm_b32 v11, v11, v28, 0x7060302
+; GFX11-NEXT: v_perm_b32 v12, v12, v29, 0x7060302
+; GFX11-NEXT: v_perm_b32 v13, v13, v30, 0x7060302
; GFX11-NEXT: v_perm_b32 v8, v8, v25, 0x7060302
; GFX11-NEXT: v_cndmask_b32_e32 v24, v85, v86, vcc_lo
; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
-; GFX11-NEXT: v_perm_b32 v12, v12, v29, 0x7060302
-; GFX11-NEXT: v_perm_b32 v13, v13, v30, 0x7060302
; GFX11-NEXT: v_perm_b32 v14, v14, v16, 0x7060302
; GFX11-NEXT: v_cndmask_b32_e32 v7, v87, v96, vcc_lo
; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v23, v23
@@ -21520,22 +21052,21 @@ define <32 x bfloat> @v_minnum_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) {
; GFX11-NEXT: v_perm_b32 v0, v0, v17, 0x7060302
; GFX11-NEXT: v_cndmask_b32_e32 v2, v131, v132, vcc_lo
; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_perm_b32 v2, v2, v19, 0x7060302
; GFX11-NEXT: v_cndmask_b32_e32 v3, v119, v128, vcc_lo
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_3)
; GFX11-NEXT: v_perm_b32 v3, v3, v20, 0x7060302
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_lshlrev_b32_e32 v17, 16, v31
-; GFX11-NEXT: v_and_b32_e32 v18, 0xffff0000, v31
-; GFX11-NEXT: v_perm_b32 v2, v2, v19, 0x7060302
-; GFX11-NEXT: v_min_f32_e32 v17, v32, v17
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_lshlrev_b32_e32 v17, 16, v32
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_dual_min_f32 v17, v31, v17 :: v_dual_and_b32 v18, 0xffff0000, v32
; GFX11-NEXT: v_min_f32_e32 v15, v15, v18
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_bfe_u32 v18, v17, 16, 1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX11-NEXT: v_bfe_u32 v19, v15, 16, 1
-; GFX11-NEXT: v_and_or_b32 v20, v17, s0, 0x400000
+; GFX11-NEXT: v_or_b32_e32 v20, 0x400000, v17
; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17
-; GFX11-NEXT: v_and_or_b32 v21, v15, s0, 0x400000
+; GFX11-NEXT: v_or_b32_e32 v21, 0x400000, v15
; GFX11-NEXT: v_add3_u32 v18, v18, v17, 0x7fff
; GFX11-NEXT: v_add3_u32 v19, v19, v15, 0x7fff
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
@@ -21594,8 +21125,7 @@ define bfloat @v_maxnum_bf16(bfloat %a, bfloat %b) {
; GFX8-NEXT: v_bfe_u32 v1, v0, 16, 1
; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v0
; GFX8-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1
-; GFX8-NEXT: v_and_b32_e32 v2, 0xff800000, v0
-; GFX8-NEXT: v_or_b32_e32 v2, 0x400000, v2
+; GFX8-NEXT: v_or_b32_e32 v2, 0x400000, v0
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
; GFX8-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
@@ -21609,9 +21139,8 @@ define bfloat @v_maxnum_bf16(bfloat %a, bfloat %b) {
; GFX9-NEXT: v_max_f32_e32 v0, v0, v1
; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1
; GFX9-NEXT: s_movk_i32 s4, 0x7fff
-; GFX9-NEXT: v_and_b32_e32 v2, 0xff800000, v0
; GFX9-NEXT: v_add3_u32 v1, v1, v0, s4
-; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v2
+; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v0
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0
@@ -21622,10 +21151,9 @@ define bfloat @v_maxnum_bf16(bfloat %a, bfloat %b) {
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GFX10-NEXT: s_mov_b32 s4, 0xff800000
; GFX10-NEXT: v_max_f32_e32 v0, v0, v1
; GFX10-NEXT: v_bfe_u32 v1, v0, 16, 1
-; GFX10-NEXT: v_and_or_b32 v2, v0, s4, 0x400000
+; GFX10-NEXT: v_or_b32_e32 v2, 0x400000, v0
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
; GFX10-NEXT: v_add3_u32 v1, v1, v0, 0x7fff
; GFX10-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo
@@ -21637,11 +21165,10 @@ define bfloat @v_maxnum_bf16(bfloat %a, bfloat %b) {
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GFX11-NEXT: s_mov_b32 s0, 0xff800000
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_max_f32_e32 v0, v0, v1
; GFX11-NEXT: v_bfe_u32 v1, v0, 16, 1
-; GFX11-NEXT: v_and_or_b32 v2, v0, s0, 0x400000
+; GFX11-NEXT: v_or_b32_e32 v2, 0x400000, v0
; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_add3_u32 v1, v1, v0, 0x7fff
@@ -21707,16 +21234,14 @@ define <2 x bfloat> @v_maxnum_v2bf16(<2 x bfloat> %a, <2 x bfloat> %b) {
; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
-; GFX8-NEXT: v_and_b32_e32 v4, 0xff800000, v2
; GFX8-NEXT: v_max_f32_e32 v0, v0, v1
-; GFX8-NEXT: v_or_b32_e32 v4, 0x400000, v4
+; GFX8-NEXT: v_or_b32_e32 v4, 0x400000, v2
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
; GFX8-NEXT: v_bfe_u32 v1, v0, 16, 1
; GFX8-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v0
; GFX8-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1
-; GFX8-NEXT: v_and_b32_e32 v3, 0xff800000, v0
-; GFX8-NEXT: v_or_b32_e32 v3, 0x400000, v3
+; GFX8-NEXT: v_or_b32_e32 v3, 0x400000, v0
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
; GFX8-NEXT: v_cndmask_b32_e32 v0, v1, v3, vcc
; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
@@ -21729,20 +21254,18 @@ define <2 x bfloat> @v_maxnum_v2bf16(<2 x bfloat> %a, <2 x bfloat> %b) {
; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v1
; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v0
; GFX9-NEXT: v_max_f32_e32 v2, v3, v2
-; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1
-; GFX9-NEXT: s_movk_i32 s4, 0x7fff
-; GFX9-NEXT: v_and_b32_e32 v4, 0xff800000, v2
; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1
+; GFX9-NEXT: s_movk_i32 s4, 0x7fff
+; GFX9-NEXT: v_max_f32_e32 v0, v0, v1
; GFX9-NEXT: v_add3_u32 v3, v3, v2, s4
-; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v4
+; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
-; GFX9-NEXT: v_max_f32_e32 v0, v0, v1
-; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1
-; GFX9-NEXT: v_and_b32_e32 v3, 0xff800000, v0
+; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
; GFX9-NEXT: v_add3_u32 v1, v1, v0, s4
-; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v3
+; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v0
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v3, vcc
; GFX9-NEXT: s_mov_b32 s4, 0x7060302
@@ -21756,14 +21279,13 @@ define <2 x bfloat> @v_maxnum_v2bf16(<2 x bfloat> %a, <2 x bfloat> %b) {
; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v0
; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GFX10-NEXT: s_mov_b32 s4, 0xff800000
; GFX10-NEXT: v_max_f32_e32 v2, v3, v2
; GFX10-NEXT: v_max_f32_e32 v0, v0, v1
; GFX10-NEXT: v_bfe_u32 v1, v2, 16, 1
-; GFX10-NEXT: v_and_or_b32 v4, v2, s4, 0x400000
+; GFX10-NEXT: v_or_b32_e32 v4, 0x400000, v2
; GFX10-NEXT: v_bfe_u32 v3, v0, 16, 1
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
-; GFX10-NEXT: v_and_or_b32 v5, v0, s4, 0x400000
+; GFX10-NEXT: v_or_b32_e32 v5, 0x400000, v0
; GFX10-NEXT: v_add3_u32 v1, v1, v2, 0x7fff
; GFX10-NEXT: v_add3_u32 v3, v3, v0, 0x7fff
; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc_lo
@@ -21779,16 +21301,15 @@ define <2 x bfloat> @v_maxnum_v2bf16(<2 x bfloat> %a, <2 x bfloat> %b) {
; GFX11-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v0
; GFX11-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GFX11-NEXT: s_mov_b32 s0, 0xff800000
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX11-NEXT: v_max_f32_e32 v0, v0, v1
; GFX11-NEXT: v_max_f32_e32 v2, v3, v2
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_bfe_u32 v3, v0, 16, 1
; GFX11-NEXT: v_bfe_u32 v1, v2, 16, 1
-; GFX11-NEXT: v_and_or_b32 v4, v2, s0, 0x400000
+; GFX11-NEXT: v_or_b32_e32 v4, 0x400000, v2
; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
-; GFX11-NEXT: v_and_or_b32 v5, v0, s0, 0x400000
+; GFX11-NEXT: v_or_b32_e32 v5, 0x400000, v0
; GFX11-NEXT: v_add3_u32 v3, v3, v0, 0x7fff
; GFX11-NEXT: v_add3_u32 v1, v1, v2, 0x7fff
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_4)
@@ -21870,8 +21391,7 @@ define <3 x bfloat> @v_maxnum_v3bf16(<3 x bfloat> %a, <3 x bfloat> %b) {
; GFX8-NEXT: v_bfe_u32 v3, v1, 16, 1
; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v1
; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
-; GFX8-NEXT: v_and_b32_e32 v4, 0xff800000, v1
-; GFX8-NEXT: v_or_b32_e32 v4, 0x400000, v4
+; GFX8-NEXT: v_or_b32_e32 v4, 0x400000, v1
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
; GFX8-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc
; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v2
@@ -21883,16 +21403,14 @@ define <3 x bfloat> @v_maxnum_v3bf16(<3 x bfloat> %a, <3 x bfloat> %b) {
; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX8-NEXT: v_add_u32_e32 v4, vcc, s4, v4
-; GFX8-NEXT: v_and_b32_e32 v5, 0xff800000, v3
; GFX8-NEXT: v_max_f32_e32 v0, v0, v2
-; GFX8-NEXT: v_or_b32_e32 v5, 0x400000, v5
+; GFX8-NEXT: v_or_b32_e32 v5, 0x400000, v3
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
; GFX8-NEXT: v_bfe_u32 v2, v0, 16, 1
; GFX8-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc
; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v0
; GFX8-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2
-; GFX8-NEXT: v_and_b32_e32 v4, 0xff800000, v0
-; GFX8-NEXT: v_or_b32_e32 v4, 0x400000, v4
+; GFX8-NEXT: v_or_b32_e32 v4, 0x400000, v0
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc
; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
@@ -21908,27 +21426,24 @@ define <3 x bfloat> @v_maxnum_v3bf16(<3 x bfloat> %a, <3 x bfloat> %b) {
; GFX9-NEXT: v_max_f32_e32 v1, v1, v3
; GFX9-NEXT: v_bfe_u32 v3, v1, 16, 1
; GFX9-NEXT: s_movk_i32 s4, 0x7fff
-; GFX9-NEXT: v_and_b32_e32 v4, 0xff800000, v1
; GFX9-NEXT: v_add3_u32 v3, v3, v1, s4
-; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v4
+; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v1
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc
; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v2
; GFX9-NEXT: v_lshlrev_b32_e32 v4, 16, v0
; GFX9-NEXT: v_max_f32_e32 v3, v4, v3
-; GFX9-NEXT: v_bfe_u32 v4, v3, 16, 1
-; GFX9-NEXT: v_and_b32_e32 v5, 0xff800000, v3
; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX9-NEXT: v_bfe_u32 v4, v3, 16, 1
+; GFX9-NEXT: v_max_f32_e32 v0, v0, v2
; GFX9-NEXT: v_add3_u32 v4, v4, v3, s4
-; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v5
+; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v3
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
-; GFX9-NEXT: v_max_f32_e32 v0, v0, v2
-; GFX9-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc
; GFX9-NEXT: v_bfe_u32 v2, v0, 16, 1
-; GFX9-NEXT: v_and_b32_e32 v4, 0xff800000, v0
+; GFX9-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc
; GFX9-NEXT: v_add3_u32 v2, v2, v0, s4
-; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v4
+; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v0
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc
; GFX9-NEXT: s_mov_b32 s4, 0x7060302
@@ -21946,18 +21461,17 @@ define <3 x bfloat> @v_maxnum_v3bf16(<3 x bfloat> %a, <3 x bfloat> %b) {
; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX10-NEXT: v_max_f32_e32 v4, v5, v4
-; GFX10-NEXT: s_mov_b32 s4, 0xff800000
; GFX10-NEXT: v_max_f32_e32 v0, v0, v2
; GFX10-NEXT: v_max_f32_e32 v1, v1, v3
; GFX10-NEXT: v_bfe_u32 v2, v4, 16, 1
-; GFX10-NEXT: v_and_or_b32 v7, v4, s4, 0x400000
+; GFX10-NEXT: v_or_b32_e32 v7, 0x400000, v4
; GFX10-NEXT: v_bfe_u32 v5, v0, 16, 1
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
; GFX10-NEXT: v_bfe_u32 v3, v1, 16, 1
; GFX10-NEXT: v_add3_u32 v2, v2, v4, 0x7fff
-; GFX10-NEXT: v_and_or_b32 v8, v0, s4, 0x400000
+; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v0
; GFX10-NEXT: v_add3_u32 v5, v5, v0, 0x7fff
-; GFX10-NEXT: v_and_or_b32 v6, v1, s4, 0x400000
+; GFX10-NEXT: v_or_b32_e32 v6, 0x400000, v1
; GFX10-NEXT: v_add3_u32 v3, v3, v1, 0x7fff
; GFX10-NEXT: v_cndmask_b32_e32 v2, v2, v7, vcc_lo
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
@@ -22057,17 +21571,15 @@ define <4 x bfloat> @v_maxnum_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b) {
; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX8-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5
-; GFX8-NEXT: v_and_b32_e32 v6, 0xff800000, v4
; GFX8-NEXT: v_max_f32_e32 v1, v1, v3
-; GFX8-NEXT: v_or_b32_e32 v6, 0x400000, v6
+; GFX8-NEXT: v_or_b32_e32 v6, 0x400000, v4
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
; GFX8-NEXT: v_bfe_u32 v3, v1, 16, 1
; GFX8-NEXT: s_movk_i32 s4, 0x7fff
; GFX8-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc
; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v1
; GFX8-NEXT: v_add_u32_e32 v3, vcc, s4, v3
-; GFX8-NEXT: v_and_b32_e32 v5, 0xff800000, v1
-; GFX8-NEXT: v_or_b32_e32 v5, 0x400000, v5
+; GFX8-NEXT: v_or_b32_e32 v5, 0x400000, v1
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
; GFX8-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc
; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v2
@@ -22078,16 +21590,14 @@ define <4 x bfloat> @v_maxnum_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b) {
; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX8-NEXT: v_add_u32_e32 v5, vcc, s4, v5
-; GFX8-NEXT: v_and_b32_e32 v6, 0xff800000, v3
; GFX8-NEXT: v_max_f32_e32 v0, v0, v2
-; GFX8-NEXT: v_or_b32_e32 v6, 0x400000, v6
+; GFX8-NEXT: v_or_b32_e32 v6, 0x400000, v3
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
; GFX8-NEXT: v_bfe_u32 v2, v0, 16, 1
; GFX8-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc
; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v0
; GFX8-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2
-; GFX8-NEXT: v_and_b32_e32 v5, 0xff800000, v0
-; GFX8-NEXT: v_or_b32_e32 v5, 0x400000, v5
+; GFX8-NEXT: v_or_b32_e32 v5, 0x400000, v0
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v5, vcc
; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1
@@ -22102,38 +21612,34 @@ define <4 x bfloat> @v_maxnum_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b) {
; GFX9-NEXT: v_lshlrev_b32_e32 v4, 16, v3
; GFX9-NEXT: v_lshlrev_b32_e32 v5, 16, v1
; GFX9-NEXT: v_max_f32_e32 v4, v5, v4
-; GFX9-NEXT: v_bfe_u32 v5, v4, 16, 1
-; GFX9-NEXT: s_movk_i32 s4, 0x7fff
-; GFX9-NEXT: v_and_b32_e32 v6, 0xff800000, v4
; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GFX9-NEXT: v_bfe_u32 v5, v4, 16, 1
+; GFX9-NEXT: s_movk_i32 s4, 0x7fff
+; GFX9-NEXT: v_max_f32_e32 v1, v1, v3
; GFX9-NEXT: v_add3_u32 v5, v5, v4, s4
-; GFX9-NEXT: v_or_b32_e32 v6, 0x400000, v6
+; GFX9-NEXT: v_or_b32_e32 v6, 0x400000, v4
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
-; GFX9-NEXT: v_max_f32_e32 v1, v1, v3
-; GFX9-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc
; GFX9-NEXT: v_bfe_u32 v3, v1, 16, 1
-; GFX9-NEXT: v_and_b32_e32 v5, 0xff800000, v1
+; GFX9-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc
; GFX9-NEXT: v_add3_u32 v3, v3, v1, s4
-; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v5
+; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v1
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc
; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v2
; GFX9-NEXT: v_lshlrev_b32_e32 v5, 16, v0
; GFX9-NEXT: v_max_f32_e32 v3, v5, v3
-; GFX9-NEXT: v_bfe_u32 v5, v3, 16, 1
-; GFX9-NEXT: v_and_b32_e32 v6, 0xff800000, v3
; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX9-NEXT: v_bfe_u32 v5, v3, 16, 1
+; GFX9-NEXT: v_max_f32_e32 v0, v0, v2
; GFX9-NEXT: v_add3_u32 v5, v5, v3, s4
-; GFX9-NEXT: v_or_b32_e32 v6, 0x400000, v6
+; GFX9-NEXT: v_or_b32_e32 v6, 0x400000, v3
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
-; GFX9-NEXT: v_max_f32_e32 v0, v0, v2
-; GFX9-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc
; GFX9-NEXT: v_bfe_u32 v2, v0, 16, 1
-; GFX9-NEXT: v_and_b32_e32 v5, 0xff800000, v0
+; GFX9-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc
; GFX9-NEXT: v_add3_u32 v2, v2, v0, s4
-; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v5
+; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v0
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v5, vcc
; GFX9-NEXT: s_mov_b32 s4, 0x7060302
@@ -22154,31 +21660,30 @@ define <4 x bfloat> @v_maxnum_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b) {
; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX10-NEXT: v_max_f32_e32 v1, v1, v3
-; GFX10-NEXT: v_max_f32_e32 v5, v7, v6
-; GFX10-NEXT: v_bfe_u32 v3, v4, 16, 1
-; GFX10-NEXT: s_mov_b32 s4, 0xff800000
+; GFX10-NEXT: v_max_f32_e32 v3, v7, v6
+; GFX10-NEXT: v_bfe_u32 v5, v4, 16, 1
+; GFX10-NEXT: v_or_b32_e32 v7, 0x400000, v4
; GFX10-NEXT: v_max_f32_e32 v0, v0, v2
-; GFX10-NEXT: v_and_or_b32 v6, v4, s4, 0x400000
-; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1
-; GFX10-NEXT: v_add3_u32 v3, v3, v4, 0x7fff
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
-; GFX10-NEXT: v_bfe_u32 v8, v0, 16, 1
+; GFX10-NEXT: v_bfe_u32 v6, v3, 16, 1
+; GFX10-NEXT: v_add3_u32 v5, v5, v4, 0x7fff
; GFX10-NEXT: v_bfe_u32 v2, v1, 16, 1
-; GFX10-NEXT: v_add3_u32 v4, v7, v5, 0x7fff
-; GFX10-NEXT: v_and_or_b32 v9, v1, s4, 0x400000
-; GFX10-NEXT: v_cndmask_b32_e32 v3, v3, v6, vcc_lo
-; GFX10-NEXT: v_and_or_b32 v6, v5, s4, 0x400000
-; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX10-NEXT: v_bfe_u32 v8, v0, 16, 1
+; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v1
+; GFX10-NEXT: v_add3_u32 v6, v6, v3, 0x7fff
+; GFX10-NEXT: v_cndmask_b32_e32 v4, v5, v7, vcc_lo
+; GFX10-NEXT: v_or_b32_e32 v5, 0x400000, v3
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
; GFX10-NEXT: v_add3_u32 v7, v8, v0, 0x7fff
-; GFX10-NEXT: v_and_or_b32 v8, v0, s4, 0x400000
+; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v0
; GFX10-NEXT: v_add3_u32 v2, v2, v1, 0x7fff
-; GFX10-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e32 v3, v6, v5, vcc_lo
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
; GFX10-NEXT: v_cndmask_b32_e32 v0, v7, v8, vcc_lo
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
-; GFX10-NEXT: v_perm_b32 v0, v0, v4, 0x7060302
+; GFX10-NEXT: v_perm_b32 v0, v0, v3, 0x7060302
; GFX10-NEXT: v_cndmask_b32_e32 v1, v2, v9, vcc_lo
-; GFX10-NEXT: v_perm_b32 v1, v1, v3, 0x7060302
+; GFX10-NEXT: v_perm_b32 v1, v1, v4, 0x7060302
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: v_maxnum_v4bf16:
@@ -22186,45 +21691,42 @@ define <4 x bfloat> @v_maxnum_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b) {
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_lshlrev_b32_e32 v6, 16, v2
; GFX11-NEXT: v_lshlrev_b32_e32 v7, 16, v0
-; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v3
; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX11-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v3
; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v1
-; GFX11-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; GFX11-NEXT: s_mov_b32 s0, 0xff800000
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_dual_max_f32 v0, v0, v2 :: v_dual_and_b32 v3, 0xffff0000, v3
-; GFX11-NEXT: v_max_f32_e32 v4, v5, v4
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_dual_max_f32 v0, v0, v2 :: v_dual_and_b32 v1, 0xffff0000, v1
+; GFX11-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
; GFX11-NEXT: v_bfe_u32 v8, v0, 16, 1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_max_f32_e32 v1, v1, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3)
-; GFX11-NEXT: v_bfe_u32 v3, v4, 16, 1
-; GFX11-NEXT: v_max_f32_e32 v5, v7, v6
-; GFX11-NEXT: v_and_or_b32 v6, v4, s0, 0x400000
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX11-NEXT: v_dual_max_f32 v3, v7, v6 :: v_dual_max_f32 v4, v5, v4
; GFX11-NEXT: v_bfe_u32 v2, v1, 16, 1
-; GFX11-NEXT: v_add3_u32 v3, v3, v4, 0x7fff
-; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1
-; GFX11-NEXT: v_and_or_b32 v9, v1, s0, 0x400000
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT: v_bfe_u32 v6, v3, 16, 1
+; GFX11-NEXT: v_bfe_u32 v5, v4, 16, 1
+; GFX11-NEXT: v_or_b32_e32 v7, 0x400000, v4
+; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
; GFX11-NEXT: v_add3_u32 v2, v2, v1, 0x7fff
-; GFX11-NEXT: v_cndmask_b32_e32 v3, v3, v6, vcc_lo
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_add3_u32 v4, v7, v5, 0x7fff
-; GFX11-NEXT: v_and_or_b32 v6, v5, s0, 0x400000
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-NEXT: v_add3_u32 v6, v6, v3, 0x7fff
+; GFX11-NEXT: v_add3_u32 v5, v5, v4, 0x7fff
+; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_4)
+; GFX11-NEXT: v_cndmask_b32_e32 v4, v5, v7, vcc_lo
+; GFX11-NEXT: v_or_b32_e32 v5, 0x400000, v3
+; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
; GFX11-NEXT: v_add3_u32 v7, v8, v0, 0x7fff
-; GFX11-NEXT: v_and_or_b32 v8, v0, s0, 0x400000
-; GFX11-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc_lo
+; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v0
+; GFX11-NEXT: v_cndmask_b32_e32 v3, v6, v5, vcc_lo
; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_cndmask_b32_e32 v0, v7, v8, vcc_lo
; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-NEXT: v_perm_b32 v0, v0, v3, 0x7060302
; GFX11-NEXT: v_cndmask_b32_e32 v1, v2, v9, vcc_lo
-; GFX11-NEXT: v_perm_b32 v0, v0, v4, 0x7060302
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX11-NEXT: v_perm_b32 v1, v1, v3, 0x7060302
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_perm_b32 v1, v1, v4, 0x7060302
; GFX11-NEXT: s_setpc_b64 s[30:31]
%op = call <4 x bfloat> @llvm.maxnum.v4bf16(<4 x bfloat> %a, <4 x bfloat> %b)
ret <4 x bfloat> %op
@@ -22380,17 +21882,15 @@ define <8 x bfloat> @v_maxnum_v8bf16(<8 x bfloat> %a, <8 x bfloat> %b) {
; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9
-; GFX8-NEXT: v_and_b32_e32 v10, 0xff800000, v8
; GFX8-NEXT: v_max_f32_e32 v3, v3, v7
-; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v10
+; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v8
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v8, v8
; GFX8-NEXT: v_bfe_u32 v7, v3, 16, 1
; GFX8-NEXT: s_movk_i32 s4, 0x7fff
; GFX8-NEXT: v_cndmask_b32_e32 v8, v9, v10, vcc
; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v3
; GFX8-NEXT: v_add_u32_e32 v7, vcc, s4, v7
-; GFX8-NEXT: v_and_b32_e32 v9, 0xff800000, v3
-; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v9
+; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v3
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
; GFX8-NEXT: v_cndmask_b32_e32 v3, v7, v9, vcc
; GFX8-NEXT: v_lshlrev_b32_e32 v7, 16, v6
@@ -22401,16 +21901,14 @@ define <8 x bfloat> @v_maxnum_v8bf16(<8 x bfloat> %a, <8 x bfloat> %b) {
; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX8-NEXT: v_add_u32_e32 v9, vcc, s4, v9
-; GFX8-NEXT: v_and_b32_e32 v10, 0xff800000, v7
; GFX8-NEXT: v_max_f32_e32 v2, v2, v6
-; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v10
+; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v7
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v7, v7
; GFX8-NEXT: v_bfe_u32 v6, v2, 16, 1
; GFX8-NEXT: v_cndmask_b32_e32 v7, v9, v10, vcc
; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v2
; GFX8-NEXT: v_add_u32_e32 v6, vcc, s4, v6
-; GFX8-NEXT: v_and_b32_e32 v9, 0xff800000, v2
-; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v9
+; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v2
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
; GFX8-NEXT: v_cndmask_b32_e32 v2, v6, v9, vcc
; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v5
@@ -22421,16 +21919,14 @@ define <8 x bfloat> @v_maxnum_v8bf16(<8 x bfloat> %a, <8 x bfloat> %b) {
; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX8-NEXT: v_add_u32_e32 v9, vcc, s4, v9
-; GFX8-NEXT: v_and_b32_e32 v10, 0xff800000, v6
; GFX8-NEXT: v_max_f32_e32 v1, v1, v5
-; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v10
+; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v6
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
; GFX8-NEXT: v_bfe_u32 v5, v1, 16, 1
; GFX8-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc
; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v1
; GFX8-NEXT: v_add_u32_e32 v5, vcc, s4, v5
-; GFX8-NEXT: v_and_b32_e32 v9, 0xff800000, v1
-; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v9
+; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v1
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
; GFX8-NEXT: v_cndmask_b32_e32 v1, v5, v9, vcc
; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v4
@@ -22441,16 +21937,14 @@ define <8 x bfloat> @v_maxnum_v8bf16(<8 x bfloat> %a, <8 x bfloat> %b) {
; GFX8-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX8-NEXT: v_add_u32_e32 v9, vcc, s4, v9
-; GFX8-NEXT: v_and_b32_e32 v10, 0xff800000, v5
; GFX8-NEXT: v_max_f32_e32 v0, v0, v4
-; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v10
+; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v5
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
; GFX8-NEXT: v_bfe_u32 v4, v0, 16, 1
; GFX8-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc
; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v0
; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4
-; GFX8-NEXT: v_and_b32_e32 v9, 0xff800000, v0
-; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v9
+; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v0
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
; GFX8-NEXT: v_cndmask_b32_e32 v0, v4, v9, vcc
; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v3
@@ -22469,74 +21963,66 @@ define <8 x bfloat> @v_maxnum_v8bf16(<8 x bfloat> %a, <8 x bfloat> %b) {
; GFX9-NEXT: v_lshlrev_b32_e32 v8, 16, v7
; GFX9-NEXT: v_lshlrev_b32_e32 v9, 16, v3
; GFX9-NEXT: v_max_f32_e32 v8, v9, v8
-; GFX9-NEXT: v_bfe_u32 v9, v8, 16, 1
-; GFX9-NEXT: s_movk_i32 s4, 0x7fff
-; GFX9-NEXT: v_and_b32_e32 v10, 0xff800000, v8
; GFX9-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; GFX9-NEXT: v_bfe_u32 v9, v8, 16, 1
+; GFX9-NEXT: s_movk_i32 s4, 0x7fff
+; GFX9-NEXT: v_max_f32_e32 v3, v3, v7
; GFX9-NEXT: v_add3_u32 v9, v9, v8, s4
-; GFX9-NEXT: v_or_b32_e32 v10, 0x400000, v10
+; GFX9-NEXT: v_or_b32_e32 v10, 0x400000, v8
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v8, v8
-; GFX9-NEXT: v_max_f32_e32 v3, v3, v7
-; GFX9-NEXT: v_cndmask_b32_e32 v8, v9, v10, vcc
; GFX9-NEXT: v_bfe_u32 v7, v3, 16, 1
-; GFX9-NEXT: v_and_b32_e32 v9, 0xff800000, v3
+; GFX9-NEXT: v_cndmask_b32_e32 v8, v9, v10, vcc
; GFX9-NEXT: v_add3_u32 v7, v7, v3, s4
-; GFX9-NEXT: v_or_b32_e32 v9, 0x400000, v9
+; GFX9-NEXT: v_or_b32_e32 v9, 0x400000, v3
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
; GFX9-NEXT: v_cndmask_b32_e32 v3, v7, v9, vcc
; GFX9-NEXT: v_lshlrev_b32_e32 v7, 16, v6
; GFX9-NEXT: v_lshlrev_b32_e32 v9, 16, v2
; GFX9-NEXT: v_max_f32_e32 v7, v9, v7
-; GFX9-NEXT: v_bfe_u32 v9, v7, 16, 1
-; GFX9-NEXT: v_and_b32_e32 v10, 0xff800000, v7
; GFX9-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX9-NEXT: v_bfe_u32 v9, v7, 16, 1
+; GFX9-NEXT: v_max_f32_e32 v2, v2, v6
; GFX9-NEXT: v_add3_u32 v9, v9, v7, s4
-; GFX9-NEXT: v_or_b32_e32 v10, 0x400000, v10
+; GFX9-NEXT: v_or_b32_e32 v10, 0x400000, v7
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v7, v7
-; GFX9-NEXT: v_max_f32_e32 v2, v2, v6
-; GFX9-NEXT: v_cndmask_b32_e32 v7, v9, v10, vcc
; GFX9-NEXT: v_bfe_u32 v6, v2, 16, 1
-; GFX9-NEXT: v_and_b32_e32 v9, 0xff800000, v2
+; GFX9-NEXT: v_cndmask_b32_e32 v7, v9, v10, vcc
; GFX9-NEXT: v_add3_u32 v6, v6, v2, s4
-; GFX9-NEXT: v_or_b32_e32 v9, 0x400000, v9
+; GFX9-NEXT: v_or_b32_e32 v9, 0x400000, v2
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
; GFX9-NEXT: v_cndmask_b32_e32 v2, v6, v9, vcc
; GFX9-NEXT: v_lshlrev_b32_e32 v6, 16, v5
; GFX9-NEXT: v_lshlrev_b32_e32 v9, 16, v1
; GFX9-NEXT: v_max_f32_e32 v6, v9, v6
-; GFX9-NEXT: v_bfe_u32 v9, v6, 16, 1
-; GFX9-NEXT: v_and_b32_e32 v10, 0xff800000, v6
; GFX9-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GFX9-NEXT: v_bfe_u32 v9, v6, 16, 1
+; GFX9-NEXT: v_max_f32_e32 v1, v1, v5
; GFX9-NEXT: v_add3_u32 v9, v9, v6, s4
-; GFX9-NEXT: v_or_b32_e32 v10, 0x400000, v10
+; GFX9-NEXT: v_or_b32_e32 v10, 0x400000, v6
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
-; GFX9-NEXT: v_max_f32_e32 v1, v1, v5
-; GFX9-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc
; GFX9-NEXT: v_bfe_u32 v5, v1, 16, 1
-; GFX9-NEXT: v_and_b32_e32 v9, 0xff800000, v1
+; GFX9-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc
; GFX9-NEXT: v_add3_u32 v5, v5, v1, s4
-; GFX9-NEXT: v_or_b32_e32 v9, 0x400000, v9
+; GFX9-NEXT: v_or_b32_e32 v9, 0x400000, v1
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v9, vcc
; GFX9-NEXT: v_lshlrev_b32_e32 v5, 16, v4
; GFX9-NEXT: v_lshlrev_b32_e32 v9, 16, v0
; GFX9-NEXT: v_max_f32_e32 v5, v9, v5
-; GFX9-NEXT: v_bfe_u32 v9, v5, 16, 1
-; GFX9-NEXT: v_and_b32_e32 v10, 0xff800000, v5
; GFX9-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX9-NEXT: v_bfe_u32 v9, v5, 16, 1
+; GFX9-NEXT: v_max_f32_e32 v0, v0, v4
; GFX9-NEXT: v_add3_u32 v9, v9, v5, s4
-; GFX9-NEXT: v_or_b32_e32 v10, 0x400000, v10
+; GFX9-NEXT: v_or_b32_e32 v10, 0x400000, v5
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
-; GFX9-NEXT: v_max_f32_e32 v0, v0, v4
-; GFX9-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc
; GFX9-NEXT: v_bfe_u32 v4, v0, 16, 1
-; GFX9-NEXT: v_and_b32_e32 v9, 0xff800000, v0
+; GFX9-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc
; GFX9-NEXT: v_add3_u32 v4, v4, v0, s4
-; GFX9-NEXT: v_or_b32_e32 v9, 0x400000, v9
+; GFX9-NEXT: v_or_b32_e32 v9, 0x400000, v0
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v9, vcc
; GFX9-NEXT: s_mov_b32 s4, 0x7060302
@@ -22553,62 +22039,61 @@ define <8 x bfloat> @v_maxnum_v8bf16(<8 x bfloat> %a, <8 x bfloat> %b) {
; GFX10-NEXT: v_lshlrev_b32_e32 v9, 16, v3
; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
-; GFX10-NEXT: v_lshlrev_b32_e32 v11, 16, v2
+; GFX10-NEXT: v_lshlrev_b32_e32 v10, 16, v2
; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX10-NEXT: v_max_f32_e32 v8, v9, v8
; GFX10-NEXT: v_lshlrev_b32_e32 v9, 16, v6
-; GFX10-NEXT: v_max_f32_e32 v3, v3, v7
; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
-; GFX10-NEXT: s_mov_b32 s4, 0xff800000
-; GFX10-NEXT: v_bfe_u32 v10, v8, 16, 1
-; GFX10-NEXT: v_and_or_b32 v7, v8, s4, 0x400000
-; GFX10-NEXT: v_max_f32_e32 v9, v11, v9
-; GFX10-NEXT: v_bfe_u32 v11, v3, 16, 1
+; GFX10-NEXT: v_max_f32_e32 v3, v3, v7
+; GFX10-NEXT: v_lshlrev_b32_e32 v15, 16, v0
+; GFX10-NEXT: v_bfe_u32 v11, v8, 16, 1
+; GFX10-NEXT: v_max_f32_e32 v7, v10, v9
+; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v8
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8
-; GFX10-NEXT: v_add3_u32 v10, v10, v8, 0x7fff
; GFX10-NEXT: v_max_f32_e32 v2, v2, v6
-; GFX10-NEXT: v_bfe_u32 v8, v9, 16, 1
+; GFX10-NEXT: v_add3_u32 v10, v11, v8, 0x7fff
+; GFX10-NEXT: v_bfe_u32 v11, v3, 16, 1
+; GFX10-NEXT: v_bfe_u32 v12, v7, 16, 1
; GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v5
-; GFX10-NEXT: v_and_or_b32 v12, v9, s4, 0x400000
-; GFX10-NEXT: v_cndmask_b32_e32 v7, v10, v7, vcc_lo
-; GFX10-NEXT: v_add3_u32 v10, v11, v3, 0x7fff
-; GFX10-NEXT: v_lshlrev_b32_e32 v11, 16, v1
; GFX10-NEXT: v_bfe_u32 v13, v2, 16, 1
-; GFX10-NEXT: v_add3_u32 v8, v8, v9, 0x7fff
-; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9
+; GFX10-NEXT: v_cndmask_b32_e32 v8, v10, v9, vcc_lo
+; GFX10-NEXT: v_lshlrev_b32_e32 v10, 16, v1
+; GFX10-NEXT: v_add3_u32 v9, v11, v3, 0x7fff
+; GFX10-NEXT: v_add3_u32 v11, v12, v7, 0x7fff
+; GFX10-NEXT: v_or_b32_e32 v12, 0x400000, v7
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
+; GFX10-NEXT: v_max_f32_e32 v6, v10, v6
+; GFX10-NEXT: v_add3_u32 v10, v13, v2, 0x7fff
; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
-; GFX10-NEXT: v_max_f32_e32 v6, v11, v6
-; GFX10-NEXT: v_add3_u32 v9, v13, v2, 0x7fff
; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX10-NEXT: v_lshlrev_b32_e32 v13, 16, v4
-; GFX10-NEXT: v_lshlrev_b32_e32 v15, 16, v0
-; GFX10-NEXT: v_cndmask_b32_e32 v8, v8, v12, vcc_lo
-; GFX10-NEXT: v_and_or_b32 v11, v2, s4, 0x400000
+; GFX10-NEXT: v_cndmask_b32_e32 v7, v11, v12, vcc_lo
+; GFX10-NEXT: v_or_b32_e32 v11, 0x400000, v2
; GFX10-NEXT: v_bfe_u32 v12, v6, 16, 1
; GFX10-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
; GFX10-NEXT: v_max_f32_e32 v1, v1, v5
; GFX10-NEXT: v_max_f32_e32 v5, v15, v13
-; GFX10-NEXT: v_and_or_b32 v14, v3, s4, 0x400000
+; GFX10-NEXT: v_or_b32_e32 v14, 0x400000, v3
; GFX10-NEXT: v_max_f32_e32 v0, v0, v4
-; GFX10-NEXT: v_cndmask_b32_e32 v2, v9, v11, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e32 v2, v10, v11, vcc_lo
; GFX10-NEXT: v_add3_u32 v4, v12, v6, 0x7fff
-; GFX10-NEXT: v_and_or_b32 v9, v6, s4, 0x400000
+; GFX10-NEXT: v_or_b32_e32 v10, 0x400000, v6
; GFX10-NEXT: v_bfe_u32 v11, v1, 16, 1
; GFX10-NEXT: v_bfe_u32 v12, v5, 16, 1
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
; GFX10-NEXT: v_bfe_u32 v13, v0, 16, 1
-; GFX10-NEXT: v_and_or_b32 v15, v1, s4, 0x400000
+; GFX10-NEXT: v_or_b32_e32 v15, 0x400000, v1
; GFX10-NEXT: v_add3_u32 v6, v11, v1, 0x7fff
-; GFX10-NEXT: v_and_or_b32 v11, v5, s4, 0x400000
-; GFX10-NEXT: v_cndmask_b32_e32 v4, v4, v9, vcc_lo
-; GFX10-NEXT: v_add3_u32 v9, v12, v5, 0x7fff
+; GFX10-NEXT: v_or_b32_e32 v11, 0x400000, v5
+; GFX10-NEXT: v_cndmask_b32_e32 v4, v4, v10, vcc_lo
+; GFX10-NEXT: v_add3_u32 v10, v12, v5, 0x7fff
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
; GFX10-NEXT: v_add3_u32 v12, v13, v0, 0x7fff
-; GFX10-NEXT: v_and_or_b32 v13, v0, s4, 0x400000
-; GFX10-NEXT: v_perm_b32 v2, v2, v8, 0x7060302
-; GFX10-NEXT: v_cndmask_b32_e32 v5, v9, v11, vcc_lo
+; GFX10-NEXT: v_or_b32_e32 v13, 0x400000, v0
+; GFX10-NEXT: v_perm_b32 v2, v2, v7, 0x7060302
+; GFX10-NEXT: v_cndmask_b32_e32 v5, v10, v11, vcc_lo
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
; GFX10-NEXT: v_cndmask_b32_e32 v0, v12, v13, vcc_lo
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
@@ -22616,81 +22101,80 @@ define <8 x bfloat> @v_maxnum_v8bf16(<8 x bfloat> %a, <8 x bfloat> %b) {
; GFX10-NEXT: v_cndmask_b32_e32 v1, v6, v15, vcc_lo
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
; GFX10-NEXT: v_perm_b32 v1, v1, v4, 0x7060302
-; GFX10-NEXT: v_cndmask_b32_e32 v3, v10, v14, vcc_lo
-; GFX10-NEXT: v_perm_b32 v3, v3, v7, 0x7060302
+; GFX10-NEXT: v_cndmask_b32_e32 v3, v9, v14, vcc_lo
+; GFX10-NEXT: v_perm_b32 v3, v3, v8, 0x7060302
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: v_maxnum_v8bf16:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_lshlrev_b32_e32 v15, 16, v0
-; GFX11-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GFX11-NEXT: v_lshlrev_b32_e32 v11, 16, v2
+; GFX11-NEXT: v_lshlrev_b32_e32 v10, 16, v2
; GFX11-NEXT: v_lshlrev_b32_e32 v8, 16, v7
; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX11-NEXT: v_lshlrev_b32_e32 v9, 16, v3
-; GFX11-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
-; GFX11-NEXT: s_mov_b32 s0, 0xff800000
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_dual_max_f32 v8, v9, v8 :: v_dual_lshlrev_b32 v9, 16, v6
-; GFX11-NEXT: v_bfe_u32 v10, v8, 16, 1
+; GFX11-NEXT: v_lshlrev_b32_e32 v15, 16, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_dual_max_f32 v8, v9, v8 :: v_dual_and_b32 v7, 0xffff0000, v7
+; GFX11-NEXT: v_lshlrev_b32_e32 v9, 16, v6
; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_max_f32_e32 v9, v11, v9
-; GFX11-NEXT: v_add3_u32 v10, v10, v8, 0x7fff
-; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_and_or_b32 v12, v9, s0, 0x400000
-; GFX11-NEXT: v_max_f32_e32 v2, v2, v6
-; GFX11-NEXT: v_lshlrev_b32_e32 v6, 16, v5
; GFX11-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_bfe_u32 v13, v2, 16, 1
+; GFX11-NEXT: v_bfe_u32 v11, v8, 16, 1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4)
; GFX11-NEXT: v_max_f32_e32 v3, v3, v7
-; GFX11-NEXT: v_and_or_b32 v7, v8, s0, 0x400000
-; GFX11-NEXT: v_bfe_u32 v8, v9, 16, 1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_cndmask_b32_e32 v7, v10, v7, vcc_lo
-; GFX11-NEXT: v_add3_u32 v8, v8, v9, 0x7fff
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9
-; GFX11-NEXT: v_add3_u32 v9, v13, v2, 0x7fff
-; GFX11-NEXT: v_lshlrev_b32_e32 v13, 16, v4
-; GFX11-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
+; GFX11-NEXT: v_max_f32_e32 v7, v10, v9
+; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v8
+; GFX11-NEXT: v_add3_u32 v10, v11, v8, 0x7fff
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX11-NEXT: v_bfe_u32 v11, v3, 16, 1
-; GFX11-NEXT: v_cndmask_b32_e32 v8, v8, v12, vcc_lo
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
-; GFX11-NEXT: v_and_or_b32 v14, v3, s0, 0x400000
-; GFX11-NEXT: v_max_f32_e32 v0, v0, v4
-; GFX11-NEXT: v_add3_u32 v10, v11, v3, 0x7fff
-; GFX11-NEXT: v_lshlrev_b32_e32 v11, 16, v1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_dual_max_f32 v6, v11, v6 :: v_dual_and_b32 v1, 0xffff0000, v1
-; GFX11-NEXT: v_and_or_b32 v11, v2, s0, 0x400000
+; GFX11-NEXT: v_bfe_u32 v12, v7, 16, 1
+; GFX11-NEXT: v_or_b32_e32 v14, 0x400000, v3
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-NEXT: v_cndmask_b32_e32 v8, v10, v9, vcc_lo
+; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
+; GFX11-NEXT: v_add3_u32 v9, v11, v3, 0x7fff
+; GFX11-NEXT: v_add3_u32 v11, v12, v7, 0x7fff
+; GFX11-NEXT: v_or_b32_e32 v12, 0x400000, v7
+; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
+; GFX11-NEXT: v_lshlrev_b32_e32 v10, 16, v1
+; GFX11-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-NEXT: v_dual_cndmask_b32 v7, v11, v12 :: v_dual_max_f32 v2, v2, v6
+; GFX11-NEXT: v_lshlrev_b32_e32 v6, 16, v5
; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
+; GFX11-NEXT: v_bfe_u32 v13, v2, 16, 1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX11-NEXT: v_max_f32_e32 v6, v10, v6
+; GFX11-NEXT: v_or_b32_e32 v11, 0x400000, v2
+; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX11-NEXT: v_add3_u32 v10, v13, v2, 0x7fff
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_bfe_u32 v12, v6, 16, 1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3)
-; GFX11-NEXT: v_cndmask_b32_e32 v2, v9, v11, vcc_lo
-; GFX11-NEXT: v_and_or_b32 v9, v6, s0, 0x400000
+; GFX11-NEXT: v_cndmask_b32_e32 v2, v10, v11, vcc_lo
+; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v6
; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
-; GFX11-NEXT: v_max_f32_e32 v1, v1, v5
+; GFX11-NEXT: v_lshlrev_b32_e32 v13, 16, v4
+; GFX11-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
+; GFX11-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GFX11-NEXT: v_perm_b32 v2, v2, v7, 0x7060302
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_max_f32_e32 v0, v0, v4
; GFX11-NEXT: v_add3_u32 v4, v12, v6, 0x7fff
-; GFX11-NEXT: v_perm_b32 v2, v2, v8, 0x7060302
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_bfe_u32 v11, v1, 16, 1
-; GFX11-NEXT: v_cndmask_b32_e32 v4, v4, v9, vcc_lo
+; GFX11-NEXT: v_dual_max_f32 v1, v1, v5 :: v_dual_cndmask_b32 v4, v4, v10
; GFX11-NEXT: v_max_f32_e32 v5, v15, v13
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX11-NEXT: v_bfe_u32 v11, v1, 16, 1
; GFX11-NEXT: v_bfe_u32 v13, v0, 16, 1
-; GFX11-NEXT: v_and_or_b32 v15, v1, s0, 0x400000
-; GFX11-NEXT: v_add3_u32 v6, v11, v1, 0x7fff
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-NEXT: v_or_b32_e32 v15, 0x400000, v1
; GFX11-NEXT: v_bfe_u32 v12, v5, 16, 1
-; GFX11-NEXT: v_and_or_b32 v11, v5, s0, 0x400000
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX11-NEXT: v_add3_u32 v6, v11, v1, 0x7fff
+; GFX11-NEXT: v_or_b32_e32 v11, 0x400000, v5
; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX11-NEXT: v_add3_u32 v9, v12, v5, 0x7fff
+; GFX11-NEXT: v_add3_u32 v10, v12, v5, 0x7fff
; GFX11-NEXT: v_add3_u32 v12, v13, v0, 0x7fff
-; GFX11-NEXT: v_and_or_b32 v13, v0, s0, 0x400000
+; GFX11-NEXT: v_or_b32_e32 v13, 0x400000, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_cndmask_b32_e32 v5, v9, v11, vcc_lo
+; GFX11-NEXT: v_cndmask_b32_e32 v5, v10, v11, vcc_lo
; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
; GFX11-NEXT: v_cndmask_b32_e32 v0, v12, v13, vcc_lo
; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
@@ -22699,9 +22183,9 @@ define <8 x bfloat> @v_maxnum_v8bf16(<8 x bfloat> %a, <8 x bfloat> %b) {
; GFX11-NEXT: v_cndmask_b32_e32 v1, v6, v15, vcc_lo
; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
; GFX11-NEXT: v_perm_b32 v1, v1, v4, 0x7060302
-; GFX11-NEXT: v_cndmask_b32_e32 v3, v10, v14, vcc_lo
+; GFX11-NEXT: v_cndmask_b32_e32 v3, v9, v14, vcc_lo
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_perm_b32 v3, v3, v7, 0x7060302
+; GFX11-NEXT: v_perm_b32 v3, v3, v8, 0x7060302
; GFX11-NEXT: s_setpc_b64 s[30:31]
%op = call <8 x bfloat> @llvm.maxnum.v8bf16(<8 x bfloat> %a, <8 x bfloat> %b)
ret <8 x bfloat> %op
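; NOTE: the hunks above (and below) all apply the same simplification to the
; generated bf16 checks: on the NaN path of the float-to-bfloat16 rounding
; sequence, the value is no longer masked with 0xff800000 before OR-ing in
; 0x400000; the 0x400000 quiet bit is OR-ed into the value directly. A minimal
; C sketch of the pattern these CHECK lines encode follows; the helper name
; float_to_bf16 is illustrative only and is not part of this test.
;
;   #include <stdint.h>
;   #include <string.h>
;
;   static uint16_t float_to_bf16(float f) {
;       uint32_t bits;
;       memcpy(&bits, &f, sizeof bits);          /* v_* ops act on raw bits      */
;       uint32_t lsb     = (bits >> 16) & 1;     /* v_bfe_u32  x, 16, 1          */
;       uint32_t rounded = bits + lsb + 0x7fff;  /* v_add3_u32 ..., 0x7fff (RNE) */
;       uint32_t quieted = bits | 0x400000;      /* v_or_b32 0x400000, x (new)   */
;       /* old form: (bits & 0xff800000) | 0x400000 -- two instructions          */
;       uint32_t result  = (f != f) ? quieted : rounded; /* v_cmp_u / v_cndmask  */
;       return (uint16_t)(result >> 16);         /* keep high half (v_perm/shr)  */
;   }
;
; Both NaN forms yield a quiet NaN after truncating to the high 16 bits and can
; differ only in payload bits, so only the regenerated CHECK lines change while
; the IR bodies of the tests stay the same.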
@@ -22990,16 +22474,14 @@ define <16 x bfloat> @v_maxnum_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) {
; GFX8-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
; GFX8-NEXT: v_add_u32_e32 v17, vcc, s4, v17
-; GFX8-NEXT: v_and_b32_e32 v18, 0xff800000, v16
; GFX8-NEXT: v_max_f32_e32 v7, v7, v15
-; GFX8-NEXT: v_or_b32_e32 v18, 0x400000, v18
+; GFX8-NEXT: v_or_b32_e32 v18, 0x400000, v16
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v16, v16
; GFX8-NEXT: v_bfe_u32 v15, v7, 16, 1
; GFX8-NEXT: v_cndmask_b32_e32 v16, v17, v18, vcc
; GFX8-NEXT: v_add_u32_e32 v15, vcc, v15, v7
; GFX8-NEXT: v_add_u32_e32 v15, vcc, s4, v15
-; GFX8-NEXT: v_and_b32_e32 v17, 0xff800000, v7
-; GFX8-NEXT: v_or_b32_e32 v17, 0x400000, v17
+; GFX8-NEXT: v_or_b32_e32 v17, 0x400000, v7
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v7, v7
; GFX8-NEXT: v_cndmask_b32_e32 v7, v15, v17, vcc
; GFX8-NEXT: v_lshlrev_b32_e32 v15, 16, v14
@@ -23010,16 +22492,14 @@ define <16 x bfloat> @v_maxnum_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) {
; GFX8-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
; GFX8-NEXT: v_add_u32_e32 v17, vcc, s4, v17
-; GFX8-NEXT: v_and_b32_e32 v18, 0xff800000, v15
; GFX8-NEXT: v_max_f32_e32 v6, v6, v14
-; GFX8-NEXT: v_or_b32_e32 v18, 0x400000, v18
+; GFX8-NEXT: v_or_b32_e32 v18, 0x400000, v15
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v15, v15
; GFX8-NEXT: v_bfe_u32 v14, v6, 16, 1
; GFX8-NEXT: v_cndmask_b32_e32 v15, v17, v18, vcc
; GFX8-NEXT: v_add_u32_e32 v14, vcc, v14, v6
; GFX8-NEXT: v_add_u32_e32 v14, vcc, s4, v14
-; GFX8-NEXT: v_and_b32_e32 v17, 0xff800000, v6
-; GFX8-NEXT: v_or_b32_e32 v17, 0x400000, v17
+; GFX8-NEXT: v_or_b32_e32 v17, 0x400000, v6
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
; GFX8-NEXT: v_cndmask_b32_e32 v6, v14, v17, vcc
; GFX8-NEXT: v_lshlrev_b32_e32 v14, 16, v13
@@ -23030,16 +22510,14 @@ define <16 x bfloat> @v_maxnum_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) {
; GFX8-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
; GFX8-NEXT: v_add_u32_e32 v17, vcc, s4, v17
-; GFX8-NEXT: v_and_b32_e32 v18, 0xff800000, v14
; GFX8-NEXT: v_max_f32_e32 v5, v5, v13
-; GFX8-NEXT: v_or_b32_e32 v18, 0x400000, v18
+; GFX8-NEXT: v_or_b32_e32 v18, 0x400000, v14
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v14, v14
; GFX8-NEXT: v_bfe_u32 v13, v5, 16, 1
; GFX8-NEXT: v_cndmask_b32_e32 v14, v17, v18, vcc
; GFX8-NEXT: v_add_u32_e32 v13, vcc, v13, v5
; GFX8-NEXT: v_add_u32_e32 v13, vcc, s4, v13
-; GFX8-NEXT: v_and_b32_e32 v17, 0xff800000, v5
-; GFX8-NEXT: v_or_b32_e32 v17, 0x400000, v17
+; GFX8-NEXT: v_or_b32_e32 v17, 0x400000, v5
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
; GFX8-NEXT: v_cndmask_b32_e32 v5, v13, v17, vcc
; GFX8-NEXT: v_lshlrev_b32_e32 v13, 16, v12
@@ -23050,16 +22528,14 @@ define <16 x bfloat> @v_maxnum_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) {
; GFX8-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
; GFX8-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
; GFX8-NEXT: v_add_u32_e32 v17, vcc, s4, v17
-; GFX8-NEXT: v_and_b32_e32 v18, 0xff800000, v13
; GFX8-NEXT: v_max_f32_e32 v4, v4, v12
-; GFX8-NEXT: v_or_b32_e32 v18, 0x400000, v18
+; GFX8-NEXT: v_or_b32_e32 v18, 0x400000, v13
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v13, v13
; GFX8-NEXT: v_bfe_u32 v12, v4, 16, 1
; GFX8-NEXT: v_cndmask_b32_e32 v13, v17, v18, vcc
; GFX8-NEXT: v_add_u32_e32 v12, vcc, v12, v4
; GFX8-NEXT: v_add_u32_e32 v12, vcc, s4, v12
-; GFX8-NEXT: v_and_b32_e32 v17, 0xff800000, v4
-; GFX8-NEXT: v_or_b32_e32 v17, 0x400000, v17
+; GFX8-NEXT: v_or_b32_e32 v17, 0x400000, v4
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
; GFX8-NEXT: v_cndmask_b32_e32 v4, v12, v17, vcc
; GFX8-NEXT: v_lshlrev_b32_e32 v12, 16, v11
@@ -23070,16 +22546,14 @@ define <16 x bfloat> @v_maxnum_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) {
; GFX8-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
; GFX8-NEXT: v_add_u32_e32 v17, vcc, s4, v17
-; GFX8-NEXT: v_and_b32_e32 v18, 0xff800000, v12
; GFX8-NEXT: v_max_f32_e32 v3, v3, v11
-; GFX8-NEXT: v_or_b32_e32 v18, 0x400000, v18
+; GFX8-NEXT: v_or_b32_e32 v18, 0x400000, v12
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v12, v12
; GFX8-NEXT: v_bfe_u32 v11, v3, 16, 1
; GFX8-NEXT: v_cndmask_b32_e32 v12, v17, v18, vcc
; GFX8-NEXT: v_add_u32_e32 v11, vcc, v11, v3
; GFX8-NEXT: v_add_u32_e32 v11, vcc, s4, v11
-; GFX8-NEXT: v_and_b32_e32 v17, 0xff800000, v3
-; GFX8-NEXT: v_or_b32_e32 v17, 0x400000, v17
+; GFX8-NEXT: v_or_b32_e32 v17, 0x400000, v3
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
; GFX8-NEXT: v_cndmask_b32_e32 v3, v11, v17, vcc
; GFX8-NEXT: v_lshlrev_b32_e32 v11, 16, v10
@@ -23090,16 +22564,14 @@ define <16 x bfloat> @v_maxnum_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) {
; GFX8-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX8-NEXT: v_add_u32_e32 v17, vcc, s4, v17
-; GFX8-NEXT: v_and_b32_e32 v18, 0xff800000, v11
; GFX8-NEXT: v_max_f32_e32 v2, v2, v10
-; GFX8-NEXT: v_or_b32_e32 v18, 0x400000, v18
+; GFX8-NEXT: v_or_b32_e32 v18, 0x400000, v11
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v11, v11
; GFX8-NEXT: v_bfe_u32 v10, v2, 16, 1
; GFX8-NEXT: v_cndmask_b32_e32 v11, v17, v18, vcc
; GFX8-NEXT: v_add_u32_e32 v10, vcc, v10, v2
; GFX8-NEXT: v_add_u32_e32 v10, vcc, s4, v10
-; GFX8-NEXT: v_and_b32_e32 v17, 0xff800000, v2
-; GFX8-NEXT: v_or_b32_e32 v17, 0x400000, v17
+; GFX8-NEXT: v_or_b32_e32 v17, 0x400000, v2
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
; GFX8-NEXT: v_cndmask_b32_e32 v2, v10, v17, vcc
; GFX8-NEXT: v_lshlrev_b32_e32 v10, 16, v9
@@ -23110,16 +22582,14 @@ define <16 x bfloat> @v_maxnum_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) {
; GFX8-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX8-NEXT: v_add_u32_e32 v17, vcc, s4, v17
-; GFX8-NEXT: v_and_b32_e32 v18, 0xff800000, v10
; GFX8-NEXT: v_max_f32_e32 v1, v1, v9
-; GFX8-NEXT: v_or_b32_e32 v18, 0x400000, v18
+; GFX8-NEXT: v_or_b32_e32 v18, 0x400000, v10
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v10, v10
; GFX8-NEXT: v_bfe_u32 v9, v1, 16, 1
; GFX8-NEXT: v_cndmask_b32_e32 v10, v17, v18, vcc
; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v1
; GFX8-NEXT: v_add_u32_e32 v9, vcc, s4, v9
-; GFX8-NEXT: v_and_b32_e32 v17, 0xff800000, v1
-; GFX8-NEXT: v_or_b32_e32 v17, 0x400000, v17
+; GFX8-NEXT: v_or_b32_e32 v17, 0x400000, v1
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
; GFX8-NEXT: v_cndmask_b32_e32 v1, v9, v17, vcc
; GFX8-NEXT: v_lshlrev_b32_e32 v9, 16, v8
@@ -23130,16 +22600,14 @@ define <16 x bfloat> @v_maxnum_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) {
; GFX8-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX8-NEXT: v_add_u32_e32 v17, vcc, s4, v17
-; GFX8-NEXT: v_and_b32_e32 v18, 0xff800000, v9
; GFX8-NEXT: v_max_f32_e32 v0, v0, v8
-; GFX8-NEXT: v_or_b32_e32 v18, 0x400000, v18
+; GFX8-NEXT: v_or_b32_e32 v18, 0x400000, v9
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v9, v9
; GFX8-NEXT: v_bfe_u32 v8, v0, 16, 1
; GFX8-NEXT: v_cndmask_b32_e32 v9, v17, v18, vcc
; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v0
; GFX8-NEXT: v_add_u32_e32 v8, vcc, s4, v8
-; GFX8-NEXT: v_and_b32_e32 v17, 0xff800000, v0
-; GFX8-NEXT: v_or_b32_e32 v17, 0x400000, v17
+; GFX8-NEXT: v_or_b32_e32 v17, 0x400000, v0
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
; GFX8-NEXT: v_cndmask_b32_e32 v0, v8, v17, vcc
; GFX8-NEXT: v_lshrrev_b32_e32 v7, 16, v7
@@ -23166,146 +22634,130 @@ define <16 x bfloat> @v_maxnum_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) {
; GFX9-NEXT: v_lshlrev_b32_e32 v16, 16, v15
; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v7
; GFX9-NEXT: v_max_f32_e32 v16, v17, v16
-; GFX9-NEXT: v_bfe_u32 v17, v16, 16, 1
-; GFX9-NEXT: s_movk_i32 s4, 0x7fff
-; GFX9-NEXT: v_and_b32_e32 v18, 0xff800000, v16
; GFX9-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
; GFX9-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
+; GFX9-NEXT: v_bfe_u32 v17, v16, 16, 1
+; GFX9-NEXT: s_movk_i32 s4, 0x7fff
+; GFX9-NEXT: v_max_f32_e32 v7, v7, v15
; GFX9-NEXT: v_add3_u32 v17, v17, v16, s4
-; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v18
+; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v16
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v16, v16
-; GFX9-NEXT: v_max_f32_e32 v7, v7, v15
-; GFX9-NEXT: v_cndmask_b32_e32 v16, v17, v18, vcc
; GFX9-NEXT: v_bfe_u32 v15, v7, 16, 1
-; GFX9-NEXT: v_and_b32_e32 v17, 0xff800000, v7
+; GFX9-NEXT: v_cndmask_b32_e32 v16, v17, v18, vcc
; GFX9-NEXT: v_add3_u32 v15, v15, v7, s4
-; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v17
+; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v7
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v7, v7
; GFX9-NEXT: v_cndmask_b32_e32 v7, v15, v17, vcc
; GFX9-NEXT: v_lshlrev_b32_e32 v15, 16, v14
; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v6
; GFX9-NEXT: v_max_f32_e32 v15, v17, v15
-; GFX9-NEXT: v_bfe_u32 v17, v15, 16, 1
-; GFX9-NEXT: v_and_b32_e32 v18, 0xff800000, v15
; GFX9-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
; GFX9-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
+; GFX9-NEXT: v_bfe_u32 v17, v15, 16, 1
+; GFX9-NEXT: v_max_f32_e32 v6, v6, v14
; GFX9-NEXT: v_add3_u32 v17, v17, v15, s4
-; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v18
+; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v15
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v15, v15
-; GFX9-NEXT: v_max_f32_e32 v6, v6, v14
-; GFX9-NEXT: v_cndmask_b32_e32 v15, v17, v18, vcc
; GFX9-NEXT: v_bfe_u32 v14, v6, 16, 1
-; GFX9-NEXT: v_and_b32_e32 v17, 0xff800000, v6
+; GFX9-NEXT: v_cndmask_b32_e32 v15, v17, v18, vcc
; GFX9-NEXT: v_add3_u32 v14, v14, v6, s4
-; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v17
+; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v6
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
; GFX9-NEXT: v_cndmask_b32_e32 v6, v14, v17, vcc
; GFX9-NEXT: v_lshlrev_b32_e32 v14, 16, v13
; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v5
; GFX9-NEXT: v_max_f32_e32 v14, v17, v14
-; GFX9-NEXT: v_bfe_u32 v17, v14, 16, 1
-; GFX9-NEXT: v_and_b32_e32 v18, 0xff800000, v14
; GFX9-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
; GFX9-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
+; GFX9-NEXT: v_bfe_u32 v17, v14, 16, 1
+; GFX9-NEXT: v_max_f32_e32 v5, v5, v13
; GFX9-NEXT: v_add3_u32 v17, v17, v14, s4
-; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v18
+; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v14
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v14, v14
-; GFX9-NEXT: v_max_f32_e32 v5, v5, v13
-; GFX9-NEXT: v_cndmask_b32_e32 v14, v17, v18, vcc
; GFX9-NEXT: v_bfe_u32 v13, v5, 16, 1
-; GFX9-NEXT: v_and_b32_e32 v17, 0xff800000, v5
+; GFX9-NEXT: v_cndmask_b32_e32 v14, v17, v18, vcc
; GFX9-NEXT: v_add3_u32 v13, v13, v5, s4
-; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v17
+; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v5
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
; GFX9-NEXT: v_cndmask_b32_e32 v5, v13, v17, vcc
; GFX9-NEXT: v_lshlrev_b32_e32 v13, 16, v12
; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v4
; GFX9-NEXT: v_max_f32_e32 v13, v17, v13
-; GFX9-NEXT: v_bfe_u32 v17, v13, 16, 1
-; GFX9-NEXT: v_and_b32_e32 v18, 0xff800000, v13
; GFX9-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
; GFX9-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
+; GFX9-NEXT: v_bfe_u32 v17, v13, 16, 1
+; GFX9-NEXT: v_max_f32_e32 v4, v4, v12
; GFX9-NEXT: v_add3_u32 v17, v17, v13, s4
-; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v18
+; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v13
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v13, v13
-; GFX9-NEXT: v_max_f32_e32 v4, v4, v12
-; GFX9-NEXT: v_cndmask_b32_e32 v13, v17, v18, vcc
; GFX9-NEXT: v_bfe_u32 v12, v4, 16, 1
-; GFX9-NEXT: v_and_b32_e32 v17, 0xff800000, v4
+; GFX9-NEXT: v_cndmask_b32_e32 v13, v17, v18, vcc
; GFX9-NEXT: v_add3_u32 v12, v12, v4, s4
-; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v17
+; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v4
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
; GFX9-NEXT: v_cndmask_b32_e32 v4, v12, v17, vcc
; GFX9-NEXT: v_lshlrev_b32_e32 v12, 16, v11
; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v3
; GFX9-NEXT: v_max_f32_e32 v12, v17, v12
-; GFX9-NEXT: v_bfe_u32 v17, v12, 16, 1
-; GFX9-NEXT: v_and_b32_e32 v18, 0xff800000, v12
; GFX9-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; GFX9-NEXT: v_bfe_u32 v17, v12, 16, 1
+; GFX9-NEXT: v_max_f32_e32 v3, v3, v11
; GFX9-NEXT: v_add3_u32 v17, v17, v12, s4
-; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v18
+; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v12
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v12, v12
-; GFX9-NEXT: v_max_f32_e32 v3, v3, v11
-; GFX9-NEXT: v_cndmask_b32_e32 v12, v17, v18, vcc
; GFX9-NEXT: v_bfe_u32 v11, v3, 16, 1
-; GFX9-NEXT: v_and_b32_e32 v17, 0xff800000, v3
+; GFX9-NEXT: v_cndmask_b32_e32 v12, v17, v18, vcc
; GFX9-NEXT: v_add3_u32 v11, v11, v3, s4
-; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v17
+; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v3
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
; GFX9-NEXT: v_cndmask_b32_e32 v3, v11, v17, vcc
; GFX9-NEXT: v_lshlrev_b32_e32 v11, 16, v10
; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v2
; GFX9-NEXT: v_max_f32_e32 v11, v17, v11
-; GFX9-NEXT: v_bfe_u32 v17, v11, 16, 1
-; GFX9-NEXT: v_and_b32_e32 v18, 0xff800000, v11
; GFX9-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX9-NEXT: v_bfe_u32 v17, v11, 16, 1
+; GFX9-NEXT: v_max_f32_e32 v2, v2, v10
; GFX9-NEXT: v_add3_u32 v17, v17, v11, s4
-; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v18
+; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v11
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v11, v11
-; GFX9-NEXT: v_max_f32_e32 v2, v2, v10
-; GFX9-NEXT: v_cndmask_b32_e32 v11, v17, v18, vcc
; GFX9-NEXT: v_bfe_u32 v10, v2, 16, 1
-; GFX9-NEXT: v_and_b32_e32 v17, 0xff800000, v2
+; GFX9-NEXT: v_cndmask_b32_e32 v11, v17, v18, vcc
; GFX9-NEXT: v_add3_u32 v10, v10, v2, s4
-; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v17
+; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v2
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
; GFX9-NEXT: v_cndmask_b32_e32 v2, v10, v17, vcc
; GFX9-NEXT: v_lshlrev_b32_e32 v10, 16, v9
; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v1
; GFX9-NEXT: v_max_f32_e32 v10, v17, v10
-; GFX9-NEXT: v_bfe_u32 v17, v10, 16, 1
-; GFX9-NEXT: v_and_b32_e32 v18, 0xff800000, v10
; GFX9-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GFX9-NEXT: v_bfe_u32 v17, v10, 16, 1
+; GFX9-NEXT: v_max_f32_e32 v1, v1, v9
; GFX9-NEXT: v_add3_u32 v17, v17, v10, s4
-; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v18
+; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v10
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v10, v10
-; GFX9-NEXT: v_max_f32_e32 v1, v1, v9
-; GFX9-NEXT: v_cndmask_b32_e32 v10, v17, v18, vcc
; GFX9-NEXT: v_bfe_u32 v9, v1, 16, 1
-; GFX9-NEXT: v_and_b32_e32 v17, 0xff800000, v1
+; GFX9-NEXT: v_cndmask_b32_e32 v10, v17, v18, vcc
; GFX9-NEXT: v_add3_u32 v9, v9, v1, s4
-; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v17
+; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v1
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
; GFX9-NEXT: v_cndmask_b32_e32 v1, v9, v17, vcc
; GFX9-NEXT: v_lshlrev_b32_e32 v9, 16, v8
; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v0
; GFX9-NEXT: v_max_f32_e32 v9, v17, v9
-; GFX9-NEXT: v_bfe_u32 v17, v9, 16, 1
-; GFX9-NEXT: v_and_b32_e32 v18, 0xff800000, v9
; GFX9-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX9-NEXT: v_bfe_u32 v17, v9, 16, 1
+; GFX9-NEXT: v_max_f32_e32 v0, v0, v8
; GFX9-NEXT: v_add3_u32 v17, v17, v9, s4
-; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v18
+; GFX9-NEXT: v_or_b32_e32 v18, 0x400000, v9
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v9, v9
-; GFX9-NEXT: v_max_f32_e32 v0, v0, v8
-; GFX9-NEXT: v_cndmask_b32_e32 v9, v17, v18, vcc
; GFX9-NEXT: v_bfe_u32 v8, v0, 16, 1
-; GFX9-NEXT: v_and_b32_e32 v17, 0xff800000, v0
+; GFX9-NEXT: v_cndmask_b32_e32 v9, v17, v18, vcc
; GFX9-NEXT: v_add3_u32 v8, v8, v0, s4
-; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v17
+; GFX9-NEXT: v_or_b32_e32 v17, 0x400000, v0
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
; GFX9-NEXT: v_cndmask_b32_e32 v0, v8, v17, vcc
; GFX9-NEXT: s_mov_b32 s4, 0x7060302
@@ -23326,27 +22778,26 @@ define <16 x bfloat> @v_maxnum_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) {
; GFX10-NEXT: v_lshlrev_b32_e32 v17, 16, v7
; GFX10-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
-; GFX10-NEXT: s_mov_b32 s4, 0xff800000
; GFX10-NEXT: v_lshlrev_b32_e32 v18, 16, v6
+; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
; GFX10-NEXT: v_max_f32_e32 v16, v17, v16
; GFX10-NEXT: v_lshlrev_b32_e32 v17, 16, v14
; GFX10-NEXT: v_max_f32_e32 v7, v7, v15
; GFX10-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
-; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
; GFX10-NEXT: v_bfe_u32 v15, v16, 16, 1
-; GFX10-NEXT: v_and_or_b32 v20, v16, s4, 0x400000
+; GFX10-NEXT: v_or_b32_e32 v20, 0x400000, v16
; GFX10-NEXT: v_bfe_u32 v19, v7, 16, 1
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v16, v16
; GFX10-NEXT: v_max_f32_e32 v17, v18, v17
; GFX10-NEXT: v_add3_u32 v15, v15, v16, 0x7fff
; GFX10-NEXT: v_max_f32_e32 v6, v6, v14
; GFX10-NEXT: v_add3_u32 v18, v19, v7, 0x7fff
-; GFX10-NEXT: v_and_or_b32 v19, v7, s4, 0x400000
+; GFX10-NEXT: v_or_b32_e32 v19, 0x400000, v7
; GFX10-NEXT: v_bfe_u32 v21, v17, 16, 1
; GFX10-NEXT: v_cndmask_b32_e32 v15, v15, v20, vcc_lo
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
; GFX10-NEXT: v_lshlrev_b32_e32 v20, 16, v5
-; GFX10-NEXT: v_and_or_b32 v16, v17, s4, 0x400000
+; GFX10-NEXT: v_or_b32_e32 v16, 0x400000, v17
; GFX10-NEXT: v_add3_u32 v14, v21, v17, 0x7fff
; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
; GFX10-NEXT: v_cndmask_b32_e32 v7, v18, v19, vcc_lo
@@ -23360,7 +22811,7 @@ define <16 x bfloat> @v_maxnum_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) {
; GFX10-NEXT: v_max_f32_e32 v5, v5, v13
; GFX10-NEXT: v_cndmask_b32_e32 v14, v14, v16, vcc_lo
; GFX10-NEXT: v_add3_u32 v16, v18, v6, 0x7fff
-; GFX10-NEXT: v_and_or_b32 v13, v6, s4, 0x400000
+; GFX10-NEXT: v_or_b32_e32 v13, 0x400000, v6
; GFX10-NEXT: v_lshlrev_b32_e32 v18, 16, v12
; GFX10-NEXT: v_bfe_u32 v20, v17, 16, 1
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
@@ -23370,10 +22821,10 @@ define <16 x bfloat> @v_maxnum_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) {
; GFX10-NEXT: v_cndmask_b32_e32 v6, v16, v13, vcc_lo
; GFX10-NEXT: v_max_f32_e32 v13, v19, v18
; GFX10-NEXT: v_add3_u32 v16, v20, v17, 0x7fff
-; GFX10-NEXT: v_and_or_b32 v18, v17, s4, 0x400000
+; GFX10-NEXT: v_or_b32_e32 v18, 0x400000, v17
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17
; GFX10-NEXT: v_add3_u32 v19, v21, v5, 0x7fff
-; GFX10-NEXT: v_and_or_b32 v20, v5, s4, 0x400000
+; GFX10-NEXT: v_or_b32_e32 v20, 0x400000, v5
; GFX10-NEXT: v_bfe_u32 v21, v13, 16, 1
; GFX10-NEXT: v_max_f32_e32 v4, v4, v12
; GFX10-NEXT: v_cndmask_b32_e32 v16, v16, v18, vcc_lo
@@ -23383,14 +22834,14 @@ define <16 x bfloat> @v_maxnum_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) {
; GFX10-NEXT: v_add3_u32 v17, v21, v13, 0x7fff
; GFX10-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
; GFX10-NEXT: v_cndmask_b32_e32 v5, v19, v20, vcc_lo
-; GFX10-NEXT: v_and_or_b32 v19, v13, s4, 0x400000
+; GFX10-NEXT: v_or_b32_e32 v19, 0x400000, v13
; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
; GFX10-NEXT: v_max_f32_e32 v12, v18, v12
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v13, v13
; GFX10-NEXT: v_bfe_u32 v20, v4, 16, 1
; GFX10-NEXT: v_lshlrev_b32_e32 v18, 16, v10
; GFX10-NEXT: v_max_f32_e32 v3, v3, v11
-; GFX10-NEXT: v_and_or_b32 v22, v12, s4, 0x400000
+; GFX10-NEXT: v_or_b32_e32 v22, 0x400000, v12
; GFX10-NEXT: v_cndmask_b32_e32 v13, v17, v19, vcc_lo
; GFX10-NEXT: v_bfe_u32 v17, v12, 16, 1
; GFX10-NEXT: v_lshlrev_b32_e32 v19, 16, v2
@@ -23402,12 +22853,12 @@ define <16 x bfloat> @v_maxnum_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) {
; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v12, v12
; GFX10-NEXT: v_add3_u32 v19, v20, v3, 0x7fff
-; GFX10-NEXT: v_and_or_b32 v20, v3, s4, 0x400000
+; GFX10-NEXT: v_or_b32_e32 v20, 0x400000, v3
; GFX10-NEXT: v_bfe_u32 v23, v18, 16, 1
; GFX10-NEXT: v_max_f32_e32 v2, v2, v10
; GFX10-NEXT: v_cndmask_b32_e32 v12, v17, v22, vcc_lo
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
-; GFX10-NEXT: v_and_or_b32 v17, v18, s4, 0x400000
+; GFX10-NEXT: v_or_b32_e32 v17, 0x400000, v18
; GFX10-NEXT: v_add3_u32 v10, v23, v18, 0x7fff
; GFX10-NEXT: v_lshlrev_b32_e32 v22, 16, v1
; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
@@ -23416,8 +22867,8 @@ define <16 x bfloat> @v_maxnum_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) {
; GFX10-NEXT: v_lshlrev_b32_e32 v20, 16, v9
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v18, v18
; GFX10-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
-; GFX10-NEXT: v_and_or_b32 v18, v2, s4, 0x400000
-; GFX10-NEXT: v_and_or_b32 v21, v4, s4, 0x400000
+; GFX10-NEXT: v_or_b32_e32 v18, 0x400000, v2
+; GFX10-NEXT: v_or_b32_e32 v21, 0x400000, v4
; GFX10-NEXT: v_perm_b32 v3, v3, v12, 0x7060302
; GFX10-NEXT: v_cndmask_b32_e32 v10, v10, v17, vcc_lo
; GFX10-NEXT: v_add3_u32 v17, v19, v2, 0x7fff
@@ -23429,17 +22880,17 @@ define <16 x bfloat> @v_maxnum_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) {
; GFX10-NEXT: v_bfe_u32 v23, v19, 16, 1
; GFX10-NEXT: v_max_f32_e32 v1, v1, v9
; GFX10-NEXT: v_max_f32_e32 v9, v22, v20
-; GFX10-NEXT: v_and_or_b32 v22, v19, s4, 0x400000
+; GFX10-NEXT: v_or_b32_e32 v22, 0x400000, v19
; GFX10-NEXT: v_max_f32_e32 v0, v0, v8
; GFX10-NEXT: v_add3_u32 v20, v23, v19, 0x7fff
; GFX10-NEXT: v_bfe_u32 v8, v1, 16, 1
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v19, v19
; GFX10-NEXT: v_bfe_u32 v23, v9, 16, 1
-; GFX10-NEXT: v_and_or_b32 v24, v9, s4, 0x400000
-; GFX10-NEXT: v_and_or_b32 v25, v0, s4, 0x400000
+; GFX10-NEXT: v_or_b32_e32 v24, 0x400000, v9
+; GFX10-NEXT: v_or_b32_e32 v25, 0x400000, v0
; GFX10-NEXT: v_add3_u32 v8, v8, v1, 0x7fff
; GFX10-NEXT: v_cndmask_b32_e32 v19, v20, v22, vcc_lo
-; GFX10-NEXT: v_and_or_b32 v22, v1, s4, 0x400000
+; GFX10-NEXT: v_or_b32_e32 v22, 0x400000, v1
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
; GFX10-NEXT: v_bfe_u32 v20, v0, 16, 1
; GFX10-NEXT: v_add3_u32 v23, v23, v9, 0x7fff
@@ -23468,12 +22919,11 @@ define <16 x bfloat> @v_maxnum_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) {
; GFX11-NEXT: v_lshlrev_b32_e32 v16, 16, v15
; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
; GFX11-NEXT: v_lshlrev_b32_e32 v17, 16, v7
-; GFX11-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
-; GFX11-NEXT: s_mov_b32 s0, 0xff800000
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_dual_max_f32 v16, v17, v16 :: v_dual_lshlrev_b32 v17, 16, v14
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-NEXT: v_dual_max_f32 v16, v17, v16 :: v_dual_and_b32 v15, 0xffff0000, v15
+; GFX11-NEXT: v_lshlrev_b32_e32 v17, 16, v14
; GFX11-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
-; GFX11-NEXT: v_and_or_b32 v20, v16, s0, 0x400000
+; GFX11-NEXT: v_or_b32_e32 v20, 0x400000, v16
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX11-NEXT: v_max_f32_e32 v17, v18, v17
; GFX11-NEXT: v_max_f32_e32 v6, v6, v14
@@ -23486,13 +22936,13 @@ define <16 x bfloat> @v_maxnum_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) {
; GFX11-NEXT: v_max_f32_e32 v7, v7, v15
; GFX11-NEXT: v_bfe_u32 v15, v16, 16, 1
; GFX11-NEXT: v_add3_u32 v15, v15, v16, 0x7fff
-; GFX11-NEXT: v_and_or_b32 v16, v17, s0, 0x400000
+; GFX11-NEXT: v_or_b32_e32 v16, 0x400000, v17
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_dual_cndmask_b32 v15, v15, v20 :: v_dual_lshlrev_b32 v20, 16, v5
; GFX11-NEXT: v_bfe_u32 v19, v7, 16, 1
; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
; GFX11-NEXT: v_add3_u32 v18, v19, v7, 0x7fff
-; GFX11-NEXT: v_and_or_b32 v19, v7, s0, 0x400000
+; GFX11-NEXT: v_or_b32_e32 v19, 0x400000, v7
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_4)
; GFX11-NEXT: v_cndmask_b32_e32 v7, v18, v19, vcc_lo
; GFX11-NEXT: v_bfe_u32 v18, v6, 16, 1
@@ -23514,32 +22964,32 @@ define <16 x bfloat> @v_maxnum_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) {
; GFX11-NEXT: v_lshlrev_b32_e32 v12, 16, v11
; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
; GFX11-NEXT: v_max_f32_e32 v5, v5, v13
-; GFX11-NEXT: v_and_or_b32 v13, v6, s0, 0x400000
+; GFX11-NEXT: v_or_b32_e32 v13, 0x400000, v6
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_4)
; GFX11-NEXT: v_dual_cndmask_b32 v6, v16, v13 :: v_dual_max_f32 v13, v19, v18
; GFX11-NEXT: v_add3_u32 v16, v20, v17, 0x7fff
-; GFX11-NEXT: v_and_or_b32 v18, v17, s0, 0x400000
+; GFX11-NEXT: v_or_b32_e32 v18, 0x400000, v17
; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17
; GFX11-NEXT: v_perm_b32 v6, v6, v14, 0x7060302
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_4)
; GFX11-NEXT: v_cndmask_b32_e32 v16, v16, v18, vcc_lo
; GFX11-NEXT: v_lshlrev_b32_e32 v18, 16, v3
; GFX11-NEXT: v_bfe_u32 v21, v5, 16, 1
-; GFX11-NEXT: v_and_or_b32 v20, v5, s0, 0x400000
+; GFX11-NEXT: v_or_b32_e32 v20, 0x400000, v5
; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
; GFX11-NEXT: v_max_f32_e32 v12, v18, v12
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4)
; GFX11-NEXT: v_add3_u32 v19, v21, v5, 0x7fff
; GFX11-NEXT: v_bfe_u32 v21, v13, 16, 1
; GFX11-NEXT: v_lshlrev_b32_e32 v18, 16, v10
-; GFX11-NEXT: v_and_or_b32 v22, v12, s0, 0x400000
+; GFX11-NEXT: v_or_b32_e32 v22, 0x400000, v12
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX11-NEXT: v_cndmask_b32_e32 v5, v19, v20, vcc_lo
; GFX11-NEXT: v_add3_u32 v17, v21, v13, 0x7fff
-; GFX11-NEXT: v_and_or_b32 v19, v13, s0, 0x400000
+; GFX11-NEXT: v_or_b32_e32 v19, 0x400000, v13
; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v13, v13
; GFX11-NEXT: v_bfe_u32 v20, v4, 16, 1
-; GFX11-NEXT: v_and_or_b32 v21, v4, s0, 0x400000
+; GFX11-NEXT: v_or_b32_e32 v21, 0x400000, v4
; GFX11-NEXT: v_perm_b32 v5, v5, v16, 0x7060302
; GFX11-NEXT: v_cndmask_b32_e32 v13, v17, v19, vcc_lo
; GFX11-NEXT: v_bfe_u32 v17, v12, 16, 1
@@ -23555,7 +23005,7 @@ define <16 x bfloat> @v_maxnum_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) {
; GFX11-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
; GFX11-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
; GFX11-NEXT: v_bfe_u32 v23, v18, 16, 1
-; GFX11-NEXT: v_and_or_b32 v17, v18, s0, 0x400000
+; GFX11-NEXT: v_or_b32_e32 v17, 0x400000, v18
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX11-NEXT: v_dual_max_f32 v2, v2, v10 :: v_dual_and_b32 v1, 0xffff0000, v1
; GFX11-NEXT: v_max_f32_e32 v3, v3, v11
@@ -23565,13 +23015,13 @@ define <16 x bfloat> @v_maxnum_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) {
; GFX11-NEXT: v_bfe_u32 v20, v3, 16, 1
; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
; GFX11-NEXT: v_add3_u32 v19, v20, v3, 0x7fff
-; GFX11-NEXT: v_and_or_b32 v20, v3, s0, 0x400000
+; GFX11-NEXT: v_or_b32_e32 v20, 0x400000, v3
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_cndmask_b32_e32 v3, v19, v20, vcc_lo
; GFX11-NEXT: v_bfe_u32 v19, v2, 16, 1
; GFX11-NEXT: v_lshlrev_b32_e32 v20, 16, v9
; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v18, v18
-; GFX11-NEXT: v_and_or_b32 v18, v2, s0, 0x400000
+; GFX11-NEXT: v_or_b32_e32 v18, 0x400000, v2
; GFX11-NEXT: v_perm_b32 v3, v3, v12, 0x7060302
; GFX11-NEXT: v_cndmask_b32_e32 v10, v10, v17, vcc_lo
; GFX11-NEXT: v_add3_u32 v17, v19, v2, 0x7fff
@@ -23588,13 +23038,13 @@ define <16 x bfloat> @v_maxnum_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) {
; GFX11-NEXT: v_max_f32_e32 v9, v22, v20
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
; GFX11-NEXT: v_add3_u32 v20, v23, v19, 0x7fff
-; GFX11-NEXT: v_and_or_b32 v22, v19, s0, 0x400000
-; GFX11-NEXT: v_and_or_b32 v25, v0, s0, 0x400000
+; GFX11-NEXT: v_or_b32_e32 v22, 0x400000, v19
+; GFX11-NEXT: v_or_b32_e32 v25, 0x400000, v0
; GFX11-NEXT: v_bfe_u32 v8, v1, 16, 1
; GFX11-NEXT: v_bfe_u32 v23, v9, 16, 1
-; GFX11-NEXT: v_and_or_b32 v24, v9, s0, 0x400000
+; GFX11-NEXT: v_or_b32_e32 v24, 0x400000, v9
; GFX11-NEXT: v_cndmask_b32_e32 v19, v20, v22, vcc_lo
-; GFX11-NEXT: v_and_or_b32 v22, v1, s0, 0x400000
+; GFX11-NEXT: v_or_b32_e32 v22, 0x400000, v1
; GFX11-NEXT: v_add3_u32 v8, v8, v1, 0x7fff
; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
; GFX11-NEXT: v_bfe_u32 v20, v0, 16, 1
@@ -24289,16 +23739,14 @@ define <32 x bfloat> @v_maxnum_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) {
; GFX8-NEXT: v_and_b32_e32 v30, 0xffff0000, v30
; GFX8-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
; GFX8-NEXT: v_add_u32_e32 v32, vcc, s4, v32
-; GFX8-NEXT: v_and_b32_e32 v33, 0xff800000, v31
; GFX8-NEXT: v_max_f32_e32 v14, v14, v30
-; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v33
+; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v31
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v31, v31
; GFX8-NEXT: v_bfe_u32 v30, v14, 16, 1
; GFX8-NEXT: v_cndmask_b32_e32 v31, v32, v33, vcc
; GFX8-NEXT: v_add_u32_e32 v30, vcc, v30, v14
; GFX8-NEXT: v_add_u32_e32 v30, vcc, s4, v30
-; GFX8-NEXT: v_and_b32_e32 v32, 0xff800000, v14
-; GFX8-NEXT: v_or_b32_e32 v32, 0x400000, v32
+; GFX8-NEXT: v_or_b32_e32 v32, 0x400000, v14
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v14, v14
; GFX8-NEXT: v_cndmask_b32_e32 v14, v30, v32, vcc
; GFX8-NEXT: v_lshlrev_b32_e32 v30, 16, v29
@@ -24320,29 +23768,25 @@ define <32 x bfloat> @v_maxnum_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) {
; GFX8-NEXT: v_max_f32_e32 v30, v15, v30
; GFX8-NEXT: v_bfe_u32 v15, v33, 16, 1
; GFX8-NEXT: v_add_u32_e32 v15, vcc, v15, v33
-; GFX8-NEXT: v_and_b32_e32 v34, 0xff800000, v33
; GFX8-NEXT: v_add_u32_e32 v15, vcc, s4, v15
-; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v34
+; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v33
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v33, v33
; GFX8-NEXT: v_bfe_u32 v33, v30, 16, 1
; GFX8-NEXT: v_cndmask_b32_e32 v15, v15, v34, vcc
; GFX8-NEXT: v_add_u32_e32 v33, vcc, v33, v30
-; GFX8-NEXT: v_and_b32_e32 v34, 0xff800000, v30
; GFX8-NEXT: v_add_u32_e32 v33, vcc, s4, v33
-; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v34
+; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v30
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v30, v30
; GFX8-NEXT: v_cndmask_b32_e32 v30, v33, v34, vcc
; GFX8-NEXT: v_bfe_u32 v33, v32, 16, 1
; GFX8-NEXT: v_add_u32_e32 v33, vcc, v33, v32
; GFX8-NEXT: v_add_u32_e32 v33, vcc, s4, v33
-; GFX8-NEXT: v_and_b32_e32 v34, 0xff800000, v32
-; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v34
+; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v32
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v32, v32
; GFX8-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc
; GFX8-NEXT: v_add_u32_e32 v29, vcc, v29, v13
; GFX8-NEXT: v_add_u32_e32 v29, vcc, s4, v29
-; GFX8-NEXT: v_and_b32_e32 v33, 0xff800000, v13
-; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v33
+; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v13
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v13, v13
; GFX8-NEXT: v_cndmask_b32_e32 v13, v29, v33, vcc
; GFX8-NEXT: v_lshlrev_b32_e32 v29, 16, v28
@@ -24353,16 +23797,14 @@ define <32 x bfloat> @v_maxnum_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) {
; GFX8-NEXT: v_and_b32_e32 v28, 0xffff0000, v28
; GFX8-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
; GFX8-NEXT: v_add_u32_e32 v33, vcc, s4, v33
-; GFX8-NEXT: v_and_b32_e32 v34, 0xff800000, v29
; GFX8-NEXT: v_max_f32_e32 v12, v12, v28
-; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v34
+; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v29
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v29, v29
; GFX8-NEXT: v_bfe_u32 v28, v12, 16, 1
; GFX8-NEXT: v_cndmask_b32_e32 v29, v33, v34, vcc
; GFX8-NEXT: v_add_u32_e32 v28, vcc, v28, v12
; GFX8-NEXT: v_add_u32_e32 v28, vcc, s4, v28
-; GFX8-NEXT: v_and_b32_e32 v33, 0xff800000, v12
-; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v33
+; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v12
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v12, v12
; GFX8-NEXT: v_cndmask_b32_e32 v12, v28, v33, vcc
; GFX8-NEXT: v_lshlrev_b32_e32 v28, 16, v27
@@ -24373,16 +23815,14 @@ define <32 x bfloat> @v_maxnum_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) {
; GFX8-NEXT: v_and_b32_e32 v27, 0xffff0000, v27
; GFX8-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
; GFX8-NEXT: v_add_u32_e32 v33, vcc, s4, v33
-; GFX8-NEXT: v_and_b32_e32 v34, 0xff800000, v28
; GFX8-NEXT: v_max_f32_e32 v11, v11, v27
-; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v34
+; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v28
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v28, v28
; GFX8-NEXT: v_bfe_u32 v27, v11, 16, 1
; GFX8-NEXT: v_cndmask_b32_e32 v28, v33, v34, vcc
; GFX8-NEXT: v_add_u32_e32 v27, vcc, v27, v11
; GFX8-NEXT: v_add_u32_e32 v27, vcc, s4, v27
-; GFX8-NEXT: v_and_b32_e32 v33, 0xff800000, v11
-; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v33
+; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v11
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v11, v11
; GFX8-NEXT: v_cndmask_b32_e32 v11, v27, v33, vcc
; GFX8-NEXT: v_lshlrev_b32_e32 v27, 16, v26
@@ -24393,16 +23833,14 @@ define <32 x bfloat> @v_maxnum_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) {
; GFX8-NEXT: v_and_b32_e32 v26, 0xffff0000, v26
; GFX8-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
; GFX8-NEXT: v_add_u32_e32 v33, vcc, s4, v33
-; GFX8-NEXT: v_and_b32_e32 v34, 0xff800000, v27
; GFX8-NEXT: v_max_f32_e32 v10, v10, v26
-; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v34
+; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v27
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v27, v27
; GFX8-NEXT: v_bfe_u32 v26, v10, 16, 1
; GFX8-NEXT: v_cndmask_b32_e32 v27, v33, v34, vcc
; GFX8-NEXT: v_add_u32_e32 v26, vcc, v26, v10
; GFX8-NEXT: v_add_u32_e32 v26, vcc, s4, v26
-; GFX8-NEXT: v_and_b32_e32 v33, 0xff800000, v10
-; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v33
+; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v10
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v10, v10
; GFX8-NEXT: v_cndmask_b32_e32 v10, v26, v33, vcc
; GFX8-NEXT: v_lshlrev_b32_e32 v26, 16, v25
@@ -24413,16 +23851,14 @@ define <32 x bfloat> @v_maxnum_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) {
; GFX8-NEXT: v_and_b32_e32 v25, 0xffff0000, v25
; GFX8-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
; GFX8-NEXT: v_add_u32_e32 v33, vcc, s4, v33
-; GFX8-NEXT: v_and_b32_e32 v34, 0xff800000, v26
; GFX8-NEXT: v_max_f32_e32 v9, v9, v25
-; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v34
+; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v26
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v26, v26
; GFX8-NEXT: v_bfe_u32 v25, v9, 16, 1
; GFX8-NEXT: v_cndmask_b32_e32 v26, v33, v34, vcc
; GFX8-NEXT: v_add_u32_e32 v25, vcc, v25, v9
; GFX8-NEXT: v_add_u32_e32 v25, vcc, s4, v25
-; GFX8-NEXT: v_and_b32_e32 v33, 0xff800000, v9
-; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v33
+; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v9
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v9, v9
; GFX8-NEXT: v_cndmask_b32_e32 v9, v25, v33, vcc
; GFX8-NEXT: v_lshlrev_b32_e32 v25, 16, v24
@@ -24433,16 +23869,14 @@ define <32 x bfloat> @v_maxnum_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) {
; GFX8-NEXT: v_and_b32_e32 v24, 0xffff0000, v24
; GFX8-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
; GFX8-NEXT: v_add_u32_e32 v33, vcc, s4, v33
-; GFX8-NEXT: v_and_b32_e32 v34, 0xff800000, v25
; GFX8-NEXT: v_max_f32_e32 v8, v8, v24
-; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v34
+; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v25
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v25, v25
; GFX8-NEXT: v_bfe_u32 v24, v8, 16, 1
; GFX8-NEXT: v_cndmask_b32_e32 v25, v33, v34, vcc
; GFX8-NEXT: v_add_u32_e32 v24, vcc, v24, v8
; GFX8-NEXT: v_add_u32_e32 v24, vcc, s4, v24
-; GFX8-NEXT: v_and_b32_e32 v33, 0xff800000, v8
-; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v33
+; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v8
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v8, v8
; GFX8-NEXT: v_cndmask_b32_e32 v8, v24, v33, vcc
; GFX8-NEXT: v_lshlrev_b32_e32 v24, 16, v23
@@ -24453,16 +23887,14 @@ define <32 x bfloat> @v_maxnum_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) {
; GFX8-NEXT: v_and_b32_e32 v23, 0xffff0000, v23
; GFX8-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
; GFX8-NEXT: v_add_u32_e32 v33, vcc, s4, v33
-; GFX8-NEXT: v_and_b32_e32 v34, 0xff800000, v24
; GFX8-NEXT: v_max_f32_e32 v7, v7, v23
-; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v34
+; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v24
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v24, v24
; GFX8-NEXT: v_bfe_u32 v23, v7, 16, 1
; GFX8-NEXT: v_cndmask_b32_e32 v24, v33, v34, vcc
; GFX8-NEXT: v_add_u32_e32 v23, vcc, v23, v7
; GFX8-NEXT: v_add_u32_e32 v23, vcc, s4, v23
-; GFX8-NEXT: v_and_b32_e32 v33, 0xff800000, v7
-; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v33
+; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v7
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v7, v7
; GFX8-NEXT: v_cndmask_b32_e32 v7, v23, v33, vcc
; GFX8-NEXT: v_lshlrev_b32_e32 v23, 16, v22
@@ -24473,16 +23905,14 @@ define <32 x bfloat> @v_maxnum_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) {
; GFX8-NEXT: v_and_b32_e32 v22, 0xffff0000, v22
; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
; GFX8-NEXT: v_add_u32_e32 v33, vcc, s4, v33
-; GFX8-NEXT: v_and_b32_e32 v34, 0xff800000, v23
; GFX8-NEXT: v_max_f32_e32 v6, v6, v22
-; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v34
+; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v23
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v23, v23
; GFX8-NEXT: v_bfe_u32 v22, v6, 16, 1
; GFX8-NEXT: v_cndmask_b32_e32 v23, v33, v34, vcc
; GFX8-NEXT: v_add_u32_e32 v22, vcc, v22, v6
; GFX8-NEXT: v_add_u32_e32 v22, vcc, s4, v22
-; GFX8-NEXT: v_and_b32_e32 v33, 0xff800000, v6
-; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v33
+; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v6
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
; GFX8-NEXT: v_cndmask_b32_e32 v6, v22, v33, vcc
; GFX8-NEXT: v_lshlrev_b32_e32 v22, 16, v21
@@ -24493,16 +23923,14 @@ define <32 x bfloat> @v_maxnum_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) {
; GFX8-NEXT: v_and_b32_e32 v21, 0xffff0000, v21
; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
; GFX8-NEXT: v_add_u32_e32 v33, vcc, s4, v33
-; GFX8-NEXT: v_and_b32_e32 v34, 0xff800000, v22
; GFX8-NEXT: v_max_f32_e32 v5, v5, v21
-; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v34
+; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v22
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v22, v22
; GFX8-NEXT: v_bfe_u32 v21, v5, 16, 1
; GFX8-NEXT: v_cndmask_b32_e32 v22, v33, v34, vcc
; GFX8-NEXT: v_add_u32_e32 v21, vcc, v21, v5
; GFX8-NEXT: v_add_u32_e32 v21, vcc, s4, v21
-; GFX8-NEXT: v_and_b32_e32 v33, 0xff800000, v5
-; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v33
+; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v5
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
; GFX8-NEXT: v_cndmask_b32_e32 v5, v21, v33, vcc
; GFX8-NEXT: v_lshlrev_b32_e32 v21, 16, v20
@@ -24513,16 +23941,14 @@ define <32 x bfloat> @v_maxnum_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) {
; GFX8-NEXT: v_and_b32_e32 v20, 0xffff0000, v20
; GFX8-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
; GFX8-NEXT: v_add_u32_e32 v33, vcc, s4, v33
-; GFX8-NEXT: v_and_b32_e32 v34, 0xff800000, v21
; GFX8-NEXT: v_max_f32_e32 v4, v4, v20
-; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v34
+; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v21
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v21, v21
; GFX8-NEXT: v_bfe_u32 v20, v4, 16, 1
; GFX8-NEXT: v_cndmask_b32_e32 v21, v33, v34, vcc
; GFX8-NEXT: v_add_u32_e32 v20, vcc, v20, v4
; GFX8-NEXT: v_add_u32_e32 v20, vcc, s4, v20
-; GFX8-NEXT: v_and_b32_e32 v33, 0xff800000, v4
-; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v33
+; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v4
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
; GFX8-NEXT: v_cndmask_b32_e32 v4, v20, v33, vcc
; GFX8-NEXT: v_lshlrev_b32_e32 v20, 16, v19
@@ -24533,16 +23959,14 @@ define <32 x bfloat> @v_maxnum_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) {
; GFX8-NEXT: v_and_b32_e32 v19, 0xffff0000, v19
; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
; GFX8-NEXT: v_add_u32_e32 v33, vcc, s4, v33
-; GFX8-NEXT: v_and_b32_e32 v34, 0xff800000, v20
; GFX8-NEXT: v_max_f32_e32 v3, v3, v19
-; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v34
+; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v20
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v20, v20
; GFX8-NEXT: v_bfe_u32 v19, v3, 16, 1
; GFX8-NEXT: v_cndmask_b32_e32 v20, v33, v34, vcc
; GFX8-NEXT: v_add_u32_e32 v19, vcc, v19, v3
; GFX8-NEXT: v_add_u32_e32 v19, vcc, s4, v19
-; GFX8-NEXT: v_and_b32_e32 v33, 0xff800000, v3
-; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v33
+; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v3
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
; GFX8-NEXT: v_cndmask_b32_e32 v3, v19, v33, vcc
; GFX8-NEXT: v_lshlrev_b32_e32 v19, 16, v18
@@ -24553,16 +23977,14 @@ define <32 x bfloat> @v_maxnum_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) {
; GFX8-NEXT: v_and_b32_e32 v18, 0xffff0000, v18
; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX8-NEXT: v_add_u32_e32 v33, vcc, s4, v33
-; GFX8-NEXT: v_and_b32_e32 v34, 0xff800000, v19
; GFX8-NEXT: v_max_f32_e32 v2, v2, v18
-; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v34
+; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v19
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v19, v19
; GFX8-NEXT: v_bfe_u32 v18, v2, 16, 1
; GFX8-NEXT: v_cndmask_b32_e32 v19, v33, v34, vcc
; GFX8-NEXT: v_add_u32_e32 v18, vcc, v18, v2
; GFX8-NEXT: v_add_u32_e32 v18, vcc, s4, v18
-; GFX8-NEXT: v_and_b32_e32 v33, 0xff800000, v2
-; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v33
+; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v2
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
; GFX8-NEXT: v_cndmask_b32_e32 v2, v18, v33, vcc
; GFX8-NEXT: v_lshlrev_b32_e32 v18, 16, v17
@@ -24573,16 +23995,14 @@ define <32 x bfloat> @v_maxnum_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) {
; GFX8-NEXT: v_and_b32_e32 v17, 0xffff0000, v17
; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX8-NEXT: v_add_u32_e32 v33, vcc, s4, v33
-; GFX8-NEXT: v_and_b32_e32 v34, 0xff800000, v18
; GFX8-NEXT: v_max_f32_e32 v1, v1, v17
-; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v34
+; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v18
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v18, v18
; GFX8-NEXT: v_bfe_u32 v17, v1, 16, 1
; GFX8-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc
; GFX8-NEXT: v_add_u32_e32 v17, vcc, v17, v1
; GFX8-NEXT: v_add_u32_e32 v17, vcc, s4, v17
-; GFX8-NEXT: v_and_b32_e32 v33, 0xff800000, v1
-; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v33
+; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v1
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
; GFX8-NEXT: v_cndmask_b32_e32 v1, v17, v33, vcc
; GFX8-NEXT: v_lshlrev_b32_e32 v17, 16, v16
@@ -24593,16 +24013,14 @@ define <32 x bfloat> @v_maxnum_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) {
; GFX8-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX8-NEXT: v_add_u32_e32 v33, vcc, s4, v33
-; GFX8-NEXT: v_and_b32_e32 v34, 0xff800000, v17
; GFX8-NEXT: v_max_f32_e32 v0, v0, v16
-; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v34
+; GFX8-NEXT: v_or_b32_e32 v34, 0x400000, v17
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v17, v17
; GFX8-NEXT: v_bfe_u32 v16, v0, 16, 1
; GFX8-NEXT: v_cndmask_b32_e32 v17, v33, v34, vcc
; GFX8-NEXT: v_add_u32_e32 v16, vcc, v16, v0
; GFX8-NEXT: v_add_u32_e32 v16, vcc, s4, v16
-; GFX8-NEXT: v_and_b32_e32 v33, 0xff800000, v0
-; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v33
+; GFX8-NEXT: v_or_b32_e32 v33, 0x400000, v0
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
; GFX8-NEXT: v_cndmask_b32_e32 v0, v16, v33, vcc
; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
@@ -24643,292 +24061,260 @@ define <32 x bfloat> @v_maxnum_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) {
; GFX9-NEXT: v_lshlrev_b32_e32 v31, 16, v30
; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v14
; GFX9-NEXT: v_max_f32_e32 v31, v32, v31
-; GFX9-NEXT: s_movk_i32 s4, 0x7fff
-; GFX9-NEXT: v_bfe_u32 v32, v31, 16, 1
-; GFX9-NEXT: v_and_b32_e32 v33, 0xff800000, v31
; GFX9-NEXT: v_and_b32_e32 v30, 0xffff0000, v30
; GFX9-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
+; GFX9-NEXT: s_movk_i32 s4, 0x7fff
+; GFX9-NEXT: v_bfe_u32 v32, v31, 16, 1
+; GFX9-NEXT: v_max_f32_e32 v14, v14, v30
; GFX9-NEXT: v_add3_u32 v32, v32, v31, s4
-; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v33
+; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v31
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v31, v31
-; GFX9-NEXT: v_max_f32_e32 v14, v14, v30
-; GFX9-NEXT: v_cndmask_b32_e32 v31, v32, v33, vcc
; GFX9-NEXT: v_bfe_u32 v30, v14, 16, 1
-; GFX9-NEXT: v_and_b32_e32 v32, 0xff800000, v14
+; GFX9-NEXT: v_cndmask_b32_e32 v31, v32, v33, vcc
; GFX9-NEXT: v_add3_u32 v30, v30, v14, s4
-; GFX9-NEXT: v_or_b32_e32 v32, 0x400000, v32
+; GFX9-NEXT: v_or_b32_e32 v32, 0x400000, v14
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v14, v14
; GFX9-NEXT: v_cndmask_b32_e32 v14, v30, v32, vcc
; GFX9-NEXT: v_lshlrev_b32_e32 v30, 16, v29
; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v13
+; GFX9-NEXT: v_max_f32_e32 v30, v32, v30
; GFX9-NEXT: v_and_b32_e32 v29, 0xffff0000, v29
; GFX9-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
-; GFX9-NEXT: v_max_f32_e32 v13, v13, v29
-; GFX9-NEXT: buffer_load_dword v29, off, s[0:3], s32
-; GFX9-NEXT: v_max_f32_e32 v30, v32, v30
; GFX9-NEXT: v_bfe_u32 v32, v30, 16, 1
-; GFX9-NEXT: v_and_b32_e32 v33, 0xff800000, v30
+; GFX9-NEXT: v_max_f32_e32 v13, v13, v29
; GFX9-NEXT: v_add3_u32 v32, v32, v30, s4
-; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v33
+; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v30
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v30, v30
+; GFX9-NEXT: v_bfe_u32 v29, v13, 16, 1
; GFX9-NEXT: v_cndmask_b32_e32 v30, v32, v33, vcc
-; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v15
+; GFX9-NEXT: v_add3_u32 v29, v29, v13, s4
+; GFX9-NEXT: v_or_b32_e32 v32, 0x400000, v13
+; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v13, v13
+; GFX9-NEXT: v_cndmask_b32_e32 v13, v29, v32, vcc
+; GFX9-NEXT: v_lshlrev_b32_e32 v29, 16, v28
+; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v12
+; GFX9-NEXT: v_max_f32_e32 v32, v32, v29
+; GFX9-NEXT: buffer_load_dword v29, off, s[0:3], s32
+; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v15
; GFX9-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
+; GFX9-NEXT: v_and_b32_e32 v28, 0xffff0000, v28
+; GFX9-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
+; GFX9-NEXT: v_max_f32_e32 v12, v12, v28
+; GFX9-NEXT: v_bfe_u32 v28, v12, 16, 1
+; GFX9-NEXT: v_add3_u32 v28, v28, v12, s4
; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v29
-; GFX9-NEXT: v_max_f32_e32 v32, v32, v33
+; GFX9-NEXT: v_lshlrev_b32_e32 v34, 16, v29
+; GFX9-NEXT: v_max_f32_e32 v33, v33, v34
; GFX9-NEXT: v_and_b32_e32 v29, 0xffff0000, v29
; GFX9-NEXT: v_max_f32_e32 v29, v15, v29
-; GFX9-NEXT: v_bfe_u32 v15, v32, 16, 1
-; GFX9-NEXT: v_and_b32_e32 v33, 0xff800000, v32
-; GFX9-NEXT: v_add3_u32 v15, v15, v32, s4
-; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v33
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32
-; GFX9-NEXT: v_cndmask_b32_e32 v15, v15, v33, vcc
-; GFX9-NEXT: v_bfe_u32 v32, v29, 16, 1
-; GFX9-NEXT: v_and_b32_e32 v33, 0xff800000, v29
-; GFX9-NEXT: v_add3_u32 v32, v32, v29, s4
-; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v33
+; GFX9-NEXT: v_bfe_u32 v15, v33, 16, 1
+; GFX9-NEXT: v_add3_u32 v15, v15, v33, s4
+; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v33
+; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v33, v33
+; GFX9-NEXT: v_bfe_u32 v33, v29, 16, 1
+; GFX9-NEXT: v_cndmask_b32_e32 v15, v15, v34, vcc
+; GFX9-NEXT: v_add3_u32 v33, v33, v29, s4
+; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v29
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v29, v29
-; GFX9-NEXT: v_cndmask_b32_e32 v29, v32, v33, vcc
-; GFX9-NEXT: v_bfe_u32 v32, v13, 16, 1
-; GFX9-NEXT: v_and_b32_e32 v33, 0xff800000, v13
-; GFX9-NEXT: v_add3_u32 v32, v32, v13, s4
-; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v33
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v13, v13
-; GFX9-NEXT: v_cndmask_b32_e32 v13, v32, v33, vcc
-; GFX9-NEXT: v_lshlrev_b32_e32 v32, 16, v28
-; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v12
-; GFX9-NEXT: v_max_f32_e32 v32, v33, v32
+; GFX9-NEXT: v_cndmask_b32_e32 v29, v33, v34, vcc
; GFX9-NEXT: v_bfe_u32 v33, v32, 16, 1
-; GFX9-NEXT: v_and_b32_e32 v34, 0xff800000, v32
-; GFX9-NEXT: v_and_b32_e32 v28, 0xffff0000, v28
-; GFX9-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
; GFX9-NEXT: v_add3_u32 v33, v33, v32, s4
-; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v34
+; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v32
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v32, v32
-; GFX9-NEXT: v_max_f32_e32 v12, v12, v28
; GFX9-NEXT: v_cndmask_b32_e32 v32, v33, v34, vcc
-; GFX9-NEXT: v_bfe_u32 v28, v12, 16, 1
-; GFX9-NEXT: v_and_b32_e32 v33, 0xff800000, v12
-; GFX9-NEXT: v_add3_u32 v28, v28, v12, s4
-; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v33
+; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v12
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v12, v12
; GFX9-NEXT: v_cndmask_b32_e32 v12, v28, v33, vcc
; GFX9-NEXT: v_lshlrev_b32_e32 v28, 16, v27
; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v11
; GFX9-NEXT: v_max_f32_e32 v28, v33, v28
-; GFX9-NEXT: v_bfe_u32 v33, v28, 16, 1
-; GFX9-NEXT: v_and_b32_e32 v34, 0xff800000, v28
; GFX9-NEXT: v_and_b32_e32 v27, 0xffff0000, v27
; GFX9-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
+; GFX9-NEXT: v_bfe_u32 v33, v28, 16, 1
+; GFX9-NEXT: v_max_f32_e32 v11, v11, v27
; GFX9-NEXT: v_add3_u32 v33, v33, v28, s4
-; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v34
+; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v28
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v28, v28
-; GFX9-NEXT: v_max_f32_e32 v11, v11, v27
-; GFX9-NEXT: v_cndmask_b32_e32 v28, v33, v34, vcc
; GFX9-NEXT: v_bfe_u32 v27, v11, 16, 1
-; GFX9-NEXT: v_and_b32_e32 v33, 0xff800000, v11
+; GFX9-NEXT: v_cndmask_b32_e32 v28, v33, v34, vcc
; GFX9-NEXT: v_add3_u32 v27, v27, v11, s4
-; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v33
+; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v11
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v11, v11
; GFX9-NEXT: v_cndmask_b32_e32 v11, v27, v33, vcc
; GFX9-NEXT: v_lshlrev_b32_e32 v27, 16, v26
; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v10
; GFX9-NEXT: v_max_f32_e32 v27, v33, v27
-; GFX9-NEXT: v_bfe_u32 v33, v27, 16, 1
-; GFX9-NEXT: v_and_b32_e32 v34, 0xff800000, v27
; GFX9-NEXT: v_and_b32_e32 v26, 0xffff0000, v26
; GFX9-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
+; GFX9-NEXT: v_bfe_u32 v33, v27, 16, 1
+; GFX9-NEXT: v_max_f32_e32 v10, v10, v26
; GFX9-NEXT: v_add3_u32 v33, v33, v27, s4
-; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v34
+; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v27
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v27, v27
-; GFX9-NEXT: v_max_f32_e32 v10, v10, v26
-; GFX9-NEXT: v_cndmask_b32_e32 v27, v33, v34, vcc
; GFX9-NEXT: v_bfe_u32 v26, v10, 16, 1
-; GFX9-NEXT: v_and_b32_e32 v33, 0xff800000, v10
+; GFX9-NEXT: v_cndmask_b32_e32 v27, v33, v34, vcc
; GFX9-NEXT: v_add3_u32 v26, v26, v10, s4
-; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v33
+; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v10
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v10, v10
; GFX9-NEXT: v_cndmask_b32_e32 v10, v26, v33, vcc
; GFX9-NEXT: v_lshlrev_b32_e32 v26, 16, v25
; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v9
; GFX9-NEXT: v_max_f32_e32 v26, v33, v26
-; GFX9-NEXT: v_bfe_u32 v33, v26, 16, 1
-; GFX9-NEXT: v_and_b32_e32 v34, 0xff800000, v26
; GFX9-NEXT: v_and_b32_e32 v25, 0xffff0000, v25
; GFX9-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
+; GFX9-NEXT: v_bfe_u32 v33, v26, 16, 1
+; GFX9-NEXT: v_max_f32_e32 v9, v9, v25
; GFX9-NEXT: v_add3_u32 v33, v33, v26, s4
-; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v34
+; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v26
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v26, v26
-; GFX9-NEXT: v_max_f32_e32 v9, v9, v25
-; GFX9-NEXT: v_cndmask_b32_e32 v26, v33, v34, vcc
; GFX9-NEXT: v_bfe_u32 v25, v9, 16, 1
-; GFX9-NEXT: v_and_b32_e32 v33, 0xff800000, v9
+; GFX9-NEXT: v_cndmask_b32_e32 v26, v33, v34, vcc
; GFX9-NEXT: v_add3_u32 v25, v25, v9, s4
-; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v33
+; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v9
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v9, v9
; GFX9-NEXT: v_cndmask_b32_e32 v9, v25, v33, vcc
; GFX9-NEXT: v_lshlrev_b32_e32 v25, 16, v24
; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v8
; GFX9-NEXT: v_max_f32_e32 v25, v33, v25
-; GFX9-NEXT: v_bfe_u32 v33, v25, 16, 1
-; GFX9-NEXT: v_and_b32_e32 v34, 0xff800000, v25
; GFX9-NEXT: v_and_b32_e32 v24, 0xffff0000, v24
; GFX9-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
+; GFX9-NEXT: v_bfe_u32 v33, v25, 16, 1
+; GFX9-NEXT: v_max_f32_e32 v8, v8, v24
; GFX9-NEXT: v_add3_u32 v33, v33, v25, s4
-; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v34
+; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v25
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v25, v25
-; GFX9-NEXT: v_max_f32_e32 v8, v8, v24
-; GFX9-NEXT: v_cndmask_b32_e32 v25, v33, v34, vcc
; GFX9-NEXT: v_bfe_u32 v24, v8, 16, 1
-; GFX9-NEXT: v_and_b32_e32 v33, 0xff800000, v8
+; GFX9-NEXT: v_cndmask_b32_e32 v25, v33, v34, vcc
; GFX9-NEXT: v_add3_u32 v24, v24, v8, s4
-; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v33
+; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v8
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v8, v8
; GFX9-NEXT: v_cndmask_b32_e32 v8, v24, v33, vcc
; GFX9-NEXT: v_lshlrev_b32_e32 v24, 16, v23
; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v7
; GFX9-NEXT: v_max_f32_e32 v24, v33, v24
-; GFX9-NEXT: v_bfe_u32 v33, v24, 16, 1
-; GFX9-NEXT: v_and_b32_e32 v34, 0xff800000, v24
; GFX9-NEXT: v_and_b32_e32 v23, 0xffff0000, v23
; GFX9-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
+; GFX9-NEXT: v_bfe_u32 v33, v24, 16, 1
+; GFX9-NEXT: v_max_f32_e32 v7, v7, v23
; GFX9-NEXT: v_add3_u32 v33, v33, v24, s4
-; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v34
+; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v24
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v24, v24
-; GFX9-NEXT: v_max_f32_e32 v7, v7, v23
-; GFX9-NEXT: v_cndmask_b32_e32 v24, v33, v34, vcc
; GFX9-NEXT: v_bfe_u32 v23, v7, 16, 1
-; GFX9-NEXT: v_and_b32_e32 v33, 0xff800000, v7
+; GFX9-NEXT: v_cndmask_b32_e32 v24, v33, v34, vcc
; GFX9-NEXT: v_add3_u32 v23, v23, v7, s4
-; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v33
+; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v7
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v7, v7
; GFX9-NEXT: v_cndmask_b32_e32 v7, v23, v33, vcc
; GFX9-NEXT: v_lshlrev_b32_e32 v23, 16, v22
; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v6
; GFX9-NEXT: v_max_f32_e32 v23, v33, v23
-; GFX9-NEXT: v_bfe_u32 v33, v23, 16, 1
-; GFX9-NEXT: v_and_b32_e32 v34, 0xff800000, v23
; GFX9-NEXT: v_and_b32_e32 v22, 0xffff0000, v22
; GFX9-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
+; GFX9-NEXT: v_bfe_u32 v33, v23, 16, 1
+; GFX9-NEXT: v_max_f32_e32 v6, v6, v22
; GFX9-NEXT: v_add3_u32 v33, v33, v23, s4
-; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v34
+; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v23
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v23, v23
-; GFX9-NEXT: v_max_f32_e32 v6, v6, v22
-; GFX9-NEXT: v_cndmask_b32_e32 v23, v33, v34, vcc
; GFX9-NEXT: v_bfe_u32 v22, v6, 16, 1
-; GFX9-NEXT: v_and_b32_e32 v33, 0xff800000, v6
+; GFX9-NEXT: v_cndmask_b32_e32 v23, v33, v34, vcc
; GFX9-NEXT: v_add3_u32 v22, v22, v6, s4
-; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v33
+; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v6
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
; GFX9-NEXT: v_cndmask_b32_e32 v6, v22, v33, vcc
; GFX9-NEXT: v_lshlrev_b32_e32 v22, 16, v21
; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v5
; GFX9-NEXT: v_max_f32_e32 v22, v33, v22
-; GFX9-NEXT: v_bfe_u32 v33, v22, 16, 1
-; GFX9-NEXT: v_and_b32_e32 v34, 0xff800000, v22
; GFX9-NEXT: v_and_b32_e32 v21, 0xffff0000, v21
; GFX9-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
+; GFX9-NEXT: v_bfe_u32 v33, v22, 16, 1
+; GFX9-NEXT: v_max_f32_e32 v5, v5, v21
; GFX9-NEXT: v_add3_u32 v33, v33, v22, s4
-; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v34
+; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v22
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v22, v22
-; GFX9-NEXT: v_max_f32_e32 v5, v5, v21
-; GFX9-NEXT: v_cndmask_b32_e32 v22, v33, v34, vcc
; GFX9-NEXT: v_bfe_u32 v21, v5, 16, 1
-; GFX9-NEXT: v_and_b32_e32 v33, 0xff800000, v5
+; GFX9-NEXT: v_cndmask_b32_e32 v22, v33, v34, vcc
; GFX9-NEXT: v_add3_u32 v21, v21, v5, s4
-; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v33
+; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v5
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
; GFX9-NEXT: v_cndmask_b32_e32 v5, v21, v33, vcc
; GFX9-NEXT: v_lshlrev_b32_e32 v21, 16, v20
; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v4
; GFX9-NEXT: v_max_f32_e32 v21, v33, v21
-; GFX9-NEXT: v_bfe_u32 v33, v21, 16, 1
-; GFX9-NEXT: v_and_b32_e32 v34, 0xff800000, v21
; GFX9-NEXT: v_and_b32_e32 v20, 0xffff0000, v20
; GFX9-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
+; GFX9-NEXT: v_bfe_u32 v33, v21, 16, 1
+; GFX9-NEXT: v_max_f32_e32 v4, v4, v20
; GFX9-NEXT: v_add3_u32 v33, v33, v21, s4
-; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v34
+; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v21
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v21, v21
-; GFX9-NEXT: v_max_f32_e32 v4, v4, v20
-; GFX9-NEXT: v_cndmask_b32_e32 v21, v33, v34, vcc
; GFX9-NEXT: v_bfe_u32 v20, v4, 16, 1
-; GFX9-NEXT: v_and_b32_e32 v33, 0xff800000, v4
+; GFX9-NEXT: v_cndmask_b32_e32 v21, v33, v34, vcc
; GFX9-NEXT: v_add3_u32 v20, v20, v4, s4
-; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v33
+; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v4
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
; GFX9-NEXT: v_cndmask_b32_e32 v4, v20, v33, vcc
; GFX9-NEXT: v_lshlrev_b32_e32 v20, 16, v19
; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v3
; GFX9-NEXT: v_max_f32_e32 v20, v33, v20
-; GFX9-NEXT: v_bfe_u32 v33, v20, 16, 1
-; GFX9-NEXT: v_and_b32_e32 v34, 0xff800000, v20
; GFX9-NEXT: v_and_b32_e32 v19, 0xffff0000, v19
; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; GFX9-NEXT: v_bfe_u32 v33, v20, 16, 1
+; GFX9-NEXT: v_max_f32_e32 v3, v3, v19
; GFX9-NEXT: v_add3_u32 v33, v33, v20, s4
-; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v34
+; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v20
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v20, v20
-; GFX9-NEXT: v_max_f32_e32 v3, v3, v19
-; GFX9-NEXT: v_cndmask_b32_e32 v20, v33, v34, vcc
; GFX9-NEXT: v_bfe_u32 v19, v3, 16, 1
-; GFX9-NEXT: v_and_b32_e32 v33, 0xff800000, v3
+; GFX9-NEXT: v_cndmask_b32_e32 v20, v33, v34, vcc
; GFX9-NEXT: v_add3_u32 v19, v19, v3, s4
-; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v33
+; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v3
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
; GFX9-NEXT: v_cndmask_b32_e32 v3, v19, v33, vcc
; GFX9-NEXT: v_lshlrev_b32_e32 v19, 16, v18
; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v2
; GFX9-NEXT: v_max_f32_e32 v19, v33, v19
-; GFX9-NEXT: v_bfe_u32 v33, v19, 16, 1
-; GFX9-NEXT: v_and_b32_e32 v34, 0xff800000, v19
; GFX9-NEXT: v_and_b32_e32 v18, 0xffff0000, v18
; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX9-NEXT: v_bfe_u32 v33, v19, 16, 1
+; GFX9-NEXT: v_max_f32_e32 v2, v2, v18
; GFX9-NEXT: v_add3_u32 v33, v33, v19, s4
-; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v34
+; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v19
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v19, v19
-; GFX9-NEXT: v_max_f32_e32 v2, v2, v18
-; GFX9-NEXT: v_cndmask_b32_e32 v19, v33, v34, vcc
; GFX9-NEXT: v_bfe_u32 v18, v2, 16, 1
-; GFX9-NEXT: v_and_b32_e32 v33, 0xff800000, v2
+; GFX9-NEXT: v_cndmask_b32_e32 v19, v33, v34, vcc
; GFX9-NEXT: v_add3_u32 v18, v18, v2, s4
-; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v33
+; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v2
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
; GFX9-NEXT: v_cndmask_b32_e32 v2, v18, v33, vcc
; GFX9-NEXT: v_lshlrev_b32_e32 v18, 16, v17
; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v1
; GFX9-NEXT: v_max_f32_e32 v18, v33, v18
-; GFX9-NEXT: v_bfe_u32 v33, v18, 16, 1
-; GFX9-NEXT: v_and_b32_e32 v34, 0xff800000, v18
; GFX9-NEXT: v_and_b32_e32 v17, 0xffff0000, v17
; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GFX9-NEXT: v_bfe_u32 v33, v18, 16, 1
+; GFX9-NEXT: v_max_f32_e32 v1, v1, v17
; GFX9-NEXT: v_add3_u32 v33, v33, v18, s4
-; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v34
+; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v18
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v18, v18
-; GFX9-NEXT: v_max_f32_e32 v1, v1, v17
-; GFX9-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc
; GFX9-NEXT: v_bfe_u32 v17, v1, 16, 1
-; GFX9-NEXT: v_and_b32_e32 v33, 0xff800000, v1
+; GFX9-NEXT: v_cndmask_b32_e32 v18, v33, v34, vcc
; GFX9-NEXT: v_add3_u32 v17, v17, v1, s4
-; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v33
+; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v1
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
; GFX9-NEXT: v_cndmask_b32_e32 v1, v17, v33, vcc
; GFX9-NEXT: v_lshlrev_b32_e32 v17, 16, v16
; GFX9-NEXT: v_lshlrev_b32_e32 v33, 16, v0
; GFX9-NEXT: v_max_f32_e32 v17, v33, v17
-; GFX9-NEXT: v_bfe_u32 v33, v17, 16, 1
-; GFX9-NEXT: v_and_b32_e32 v34, 0xff800000, v17
; GFX9-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX9-NEXT: v_bfe_u32 v33, v17, 16, 1
+; GFX9-NEXT: v_max_f32_e32 v0, v0, v16
; GFX9-NEXT: v_add3_u32 v33, v33, v17, s4
-; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v34
+; GFX9-NEXT: v_or_b32_e32 v34, 0x400000, v17
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v17, v17
-; GFX9-NEXT: v_max_f32_e32 v0, v0, v16
-; GFX9-NEXT: v_cndmask_b32_e32 v17, v33, v34, vcc
; GFX9-NEXT: v_bfe_u32 v16, v0, 16, 1
-; GFX9-NEXT: v_and_b32_e32 v33, 0xff800000, v0
+; GFX9-NEXT: v_cndmask_b32_e32 v17, v33, v34, vcc
; GFX9-NEXT: v_add3_u32 v16, v16, v0, s4
-; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v33
+; GFX9-NEXT: v_or_b32_e32 v33, 0x400000, v0
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
; GFX9-NEXT: v_cndmask_b32_e32 v0, v16, v33, vcc
; GFX9-NEXT: s_mov_b32 s4, 0x7060302
@@ -24953,7 +24339,7 @@ define <32 x bfloat> @v_maxnum_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) {
; GFX10-LABEL: v_maxnum_v32bf16:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: buffer_load_dword v31, off, s[0:3], s32
+; GFX10-NEXT: buffer_load_dword v32, off, s[0:3], s32
; GFX10-NEXT: v_lshlrev_b32_e32 v39, 16, v27
; GFX10-NEXT: v_lshlrev_b32_e32 v48, 16, v11
; GFX10-NEXT: v_and_b32_e32 v27, 0xffff0000, v27
@@ -25018,7 +24404,6 @@ define <32 x bfloat> @v_maxnum_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) {
; GFX10-NEXT: v_bfe_u32 v55, v11, 16, 1
; GFX10-NEXT: v_bfe_u32 v65, v49, 16, 1
; GFX10-NEXT: v_bfe_u32 v67, v10, 16, 1
-; GFX10-NEXT: s_mov_b32 s23, 0xff800000
; GFX10-NEXT: v_lshlrev_b32_e32 v33, 16, v30
; GFX10-NEXT: v_lshlrev_b32_e32 v34, 16, v14
; GFX10-NEXT: v_and_b32_e32 v30, 0xffff0000, v30
@@ -25034,10 +24419,10 @@ define <32 x bfloat> @v_maxnum_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) {
; GFX10-NEXT: v_max_f32_e32 v1, v1, v17
; GFX10-NEXT: v_max_f32_e32 v17, v26, v50
; GFX10-NEXT: v_max_f32_e32 v0, v0, v16
-; GFX10-NEXT: v_and_or_b32 v54, v39, s23, 0x400000
-; GFX10-NEXT: v_and_or_b32 v64, v11, s23, 0x400000
-; GFX10-NEXT: v_and_or_b32 v66, v49, s23, 0x400000
-; GFX10-NEXT: v_and_or_b32 v68, v10, s23, 0x400000
+; GFX10-NEXT: v_or_b32_e32 v54, 0x400000, v39
+; GFX10-NEXT: v_or_b32_e32 v64, 0x400000, v11
+; GFX10-NEXT: v_or_b32_e32 v66, 0x400000, v49
+; GFX10-NEXT: v_or_b32_e32 v68, 0x400000, v10
; GFX10-NEXT: v_cmp_u_f32_e64 s9, v39, v39
; GFX10-NEXT: v_add3_u32 v39, v53, v39, 0x7fff
; GFX10-NEXT: v_cmp_u_f32_e64 s10, v11, v11
@@ -25075,28 +24460,28 @@ define <32 x bfloat> @v_maxnum_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) {
; GFX10-NEXT: v_bfe_u32 v27, v14, 16, 1
; GFX10-NEXT: v_bfe_u32 v29, v35, 16, 1
; GFX10-NEXT: v_bfe_u32 v34, v13, 16, 1
-; GFX10-NEXT: v_and_or_b32 v48, v37, s23, 0x400000
-; GFX10-NEXT: v_and_or_b32 v52, v12, s23, 0x400000
+; GFX10-NEXT: v_or_b32_e32 v48, 0x400000, v37
+; GFX10-NEXT: v_or_b32_e32 v52, 0x400000, v12
; GFX10-NEXT: v_cmp_u_f32_e64 s7, v37, v37
; GFX10-NEXT: v_add3_u32 v37, v38, v37, 0x7fff
; GFX10-NEXT: v_cmp_u_f32_e64 s8, v12, v12
; GFX10-NEXT: v_add3_u32 v12, v50, v12, 0x7fff
; GFX10-NEXT: v_cmp_u_f32_e64 s10, v18, v18
; GFX10-NEXT: v_add3_u32 v54, v54, v18, 0x7fff
-; GFX10-NEXT: v_and_or_b32 v18, v18, s23, 0x400000
+; GFX10-NEXT: v_or_b32_e32 v18, 0x400000, v18
; GFX10-NEXT: v_cmp_u_f32_e64 s11, v1, v1
; GFX10-NEXT: v_add3_u32 v64, v64, v1, 0x7fff
-; GFX10-NEXT: v_and_or_b32 v1, v1, s23, 0x400000
+; GFX10-NEXT: v_or_b32_e32 v1, 0x400000, v1
; GFX10-NEXT: v_cmp_u_f32_e64 s12, v17, v17
; GFX10-NEXT: v_add3_u32 v66, v66, v17, 0x7fff
-; GFX10-NEXT: v_and_or_b32 v17, v17, s23, 0x400000
+; GFX10-NEXT: v_or_b32_e32 v17, 0x400000, v17
; GFX10-NEXT: v_cmp_u_f32_e64 s22, v0, v0
; GFX10-NEXT: v_add3_u32 v68, v68, v0, 0x7fff
-; GFX10-NEXT: v_and_or_b32 v0, v0, s23, 0x400000
-; GFX10-NEXT: v_and_or_b32 v26, v33, s23, 0x400000
-; GFX10-NEXT: v_and_or_b32 v28, v14, s23, 0x400000
-; GFX10-NEXT: v_and_or_b32 v30, v35, s23, 0x400000
-; GFX10-NEXT: v_and_or_b32 v36, v13, s23, 0x400000
+; GFX10-NEXT: v_or_b32_e32 v0, 0x400000, v0
+; GFX10-NEXT: v_or_b32_e32 v26, 0x400000, v33
+; GFX10-NEXT: v_or_b32_e32 v28, 0x400000, v14
+; GFX10-NEXT: v_or_b32_e32 v30, 0x400000, v35
+; GFX10-NEXT: v_or_b32_e32 v36, 0x400000, v13
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v33, v33
; GFX10-NEXT: v_add3_u32 v16, v16, v33, 0x7fff
; GFX10-NEXT: v_bfe_u32 v33, v51, 16, 1
@@ -25115,12 +24500,12 @@ define <32 x bfloat> @v_maxnum_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) {
; GFX10-NEXT: v_cndmask_b32_e64 v17, v66, v17, s12
; GFX10-NEXT: v_cndmask_b32_e64 v0, v68, v0, s22
; GFX10-NEXT: v_cndmask_b32_e64 v1, v64, v1, s11
-; GFX10-NEXT: v_lshlrev_b32_e32 v32, 16, v15
+; GFX10-NEXT: v_lshlrev_b32_e32 v31, 16, v15
; GFX10-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
-; GFX10-NEXT: v_and_or_b32 v27, v51, s23, 0x400000
+; GFX10-NEXT: v_or_b32_e32 v27, 0x400000, v51
; GFX10-NEXT: v_bfe_u32 v35, v9, 16, 1
; GFX10-NEXT: v_bfe_u32 v38, v25, 16, 1
-; GFX10-NEXT: v_and_or_b32 v67, v24, s23, 0x400000
+; GFX10-NEXT: v_or_b32_e32 v67, 0x400000, v24
; GFX10-NEXT: v_cmp_u_f32_e64 s13, v51, v51
; GFX10-NEXT: v_add3_u32 v33, v33, v51, 0x7fff
; GFX10-NEXT: v_bfe_u32 v51, v7, 16, 1
@@ -25137,51 +24522,51 @@ define <32 x bfloat> @v_maxnum_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) {
; GFX10-NEXT: v_bfe_u32 v36, v3, 16, 1
; GFX10-NEXT: v_cmp_u_f32_e64 s8, v19, v19
; GFX10-NEXT: v_add3_u32 v48, v48, v19, 0x7fff
-; GFX10-NEXT: v_and_or_b32 v19, v19, s23, 0x400000
+; GFX10-NEXT: v_or_b32_e32 v19, 0x400000, v19
; GFX10-NEXT: v_cmp_u_f32_e64 s9, v2, v2
; GFX10-NEXT: v_add3_u32 v52, v52, v2, 0x7fff
-; GFX10-NEXT: v_and_or_b32 v2, v2, s23, 0x400000
+; GFX10-NEXT: v_or_b32_e32 v2, 0x400000, v2
; GFX10-NEXT: v_perm_b32 v0, v0, v17, 0x7060302
; GFX10-NEXT: v_perm_b32 v1, v1, v18, 0x7060302
-; GFX10-NEXT: v_and_or_b32 v34, v9, s23, 0x400000
-; GFX10-NEXT: v_and_or_b32 v50, v25, s23, 0x400000
+; GFX10-NEXT: v_or_b32_e32 v34, 0x400000, v9
+; GFX10-NEXT: v_or_b32_e32 v50, 0x400000, v25
; GFX10-NEXT: v_bfe_u32 v53, v8, 16, 1
; GFX10-NEXT: v_cmp_u_f32_e64 s14, v9, v9
; GFX10-NEXT: v_add3_u32 v9, v35, v9, 0x7fff
-; GFX10-NEXT: v_and_or_b32 v35, v7, s23, 0x400000
+; GFX10-NEXT: v_or_b32_e32 v35, 0x400000, v7
; GFX10-NEXT: v_cmp_u_f32_e64 s15, v25, v25
; GFX10-NEXT: v_add3_u32 v25, v38, v25, 0x7fff
; GFX10-NEXT: v_bfe_u32 v38, v23, 16, 1
; GFX10-NEXT: v_cmp_u_f32_e64 s18, v7, v7
; GFX10-NEXT: v_add3_u32 v7, v51, v7, 0x7fff
-; GFX10-NEXT: v_and_or_b32 v51, v6, s23, 0x400000
+; GFX10-NEXT: v_or_b32_e32 v51, 0x400000, v6
; GFX10-NEXT: v_cmp_u_f32_e64 s20, v6, v6
; GFX10-NEXT: v_add3_u32 v6, v65, v6, 0x7fff
; GFX10-NEXT: v_bfe_u32 v65, v5, 16, 1
; GFX10-NEXT: v_cmp_u_f32_e64 s4, v21, v21
; GFX10-NEXT: v_add3_u32 v26, v26, v21, 0x7fff
-; GFX10-NEXT: v_and_or_b32 v21, v21, s23, 0x400000
+; GFX10-NEXT: v_or_b32_e32 v21, 0x400000, v21
; GFX10-NEXT: v_cmp_u_f32_e64 s5, v4, v4
; GFX10-NEXT: v_add3_u32 v28, v28, v4, 0x7fff
-; GFX10-NEXT: v_and_or_b32 v4, v4, s23, 0x400000
+; GFX10-NEXT: v_or_b32_e32 v4, 0x400000, v4
; GFX10-NEXT: v_cmp_u_f32_e64 s6, v20, v20
; GFX10-NEXT: v_add3_u32 v30, v30, v20, 0x7fff
-; GFX10-NEXT: v_and_or_b32 v20, v20, s23, 0x400000
+; GFX10-NEXT: v_or_b32_e32 v20, 0x400000, v20
; GFX10-NEXT: v_cmp_u_f32_e64 s7, v3, v3
; GFX10-NEXT: v_add3_u32 v36, v36, v3, 0x7fff
-; GFX10-NEXT: v_and_or_b32 v3, v3, s23, 0x400000
+; GFX10-NEXT: v_or_b32_e32 v3, 0x400000, v3
; GFX10-NEXT: v_cndmask_b32_e64 v19, v48, v19, s8
; GFX10-NEXT: v_cndmask_b32_e64 v2, v52, v2, s9
-; GFX10-NEXT: v_and_or_b32 v55, v8, s23, 0x400000
+; GFX10-NEXT: v_or_b32_e32 v55, 0x400000, v8
; GFX10-NEXT: v_cmp_u_f32_e64 s16, v8, v8
; GFX10-NEXT: v_add3_u32 v8, v53, v8, 0x7fff
-; GFX10-NEXT: v_and_or_b32 v53, v23, s23, 0x400000
+; GFX10-NEXT: v_or_b32_e32 v53, 0x400000, v23
; GFX10-NEXT: v_cmp_u_f32_e64 s19, v23, v23
; GFX10-NEXT: v_add3_u32 v23, v38, v23, 0x7fff
; GFX10-NEXT: v_bfe_u32 v38, v22, 16, 1
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
; GFX10-NEXT: v_add3_u32 v65, v65, v5, 0x7fff
-; GFX10-NEXT: v_and_or_b32 v5, v5, s23, 0x400000
+; GFX10-NEXT: v_or_b32_e32 v5, 0x400000, v5
; GFX10-NEXT: v_cndmask_b32_e64 v21, v26, v21, s4
; GFX10-NEXT: v_cndmask_b32_e64 v4, v28, v4, s5
; GFX10-NEXT: v_cndmask_b32_e64 v20, v30, v20, s6
@@ -25189,7 +24574,7 @@ define <32 x bfloat> @v_maxnum_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) {
; GFX10-NEXT: v_perm_b32 v2, v2, v19, 0x7060302
; GFX10-NEXT: v_cmp_u_f32_e64 s21, v22, v22
; GFX10-NEXT: v_add3_u32 v38, v38, v22, 0x7fff
-; GFX10-NEXT: v_and_or_b32 v22, v22, s23, 0x400000
+; GFX10-NEXT: v_or_b32_e32 v22, 0x400000, v22
; GFX10-NEXT: v_cndmask_b32_e32 v5, v65, v5, vcc_lo
; GFX10-NEXT: v_perm_b32 v3, v3, v20, 0x7060302
; GFX10-NEXT: v_perm_b32 v4, v4, v21, 0x7060302
@@ -25213,14 +24598,14 @@ define <32 x bfloat> @v_maxnum_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) {
; GFX10-NEXT: v_perm_b32 v13, v13, v29, 0x7060302
; GFX10-NEXT: v_perm_b32 v14, v14, v16, 0x7060302
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_lshlrev_b32_e32 v17, 16, v31
-; GFX10-NEXT: v_and_b32_e32 v18, 0xffff0000, v31
-; GFX10-NEXT: v_max_f32_e32 v17, v32, v17
+; GFX10-NEXT: v_lshlrev_b32_e32 v17, 16, v32
+; GFX10-NEXT: v_and_b32_e32 v18, 0xffff0000, v32
+; GFX10-NEXT: v_max_f32_e32 v17, v31, v17
; GFX10-NEXT: v_max_f32_e32 v15, v15, v18
; GFX10-NEXT: v_bfe_u32 v18, v17, 16, 1
; GFX10-NEXT: v_bfe_u32 v19, v15, 16, 1
-; GFX10-NEXT: v_and_or_b32 v20, v17, s23, 0x400000
-; GFX10-NEXT: v_and_or_b32 v21, v15, s23, 0x400000
+; GFX10-NEXT: v_or_b32_e32 v20, 0x400000, v17
+; GFX10-NEXT: v_or_b32_e32 v21, 0x400000, v15
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17
; GFX10-NEXT: v_cmp_u_f32_e64 s4, v15, v15
; GFX10-NEXT: v_add3_u32 v17, v18, v17, 0x7fff
@@ -25233,212 +24618,219 @@ define <32 x bfloat> @v_maxnum_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) {
; GFX11-LABEL: v_maxnum_v32bf16:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: scratch_load_b32 v31, off, s32
+; GFX11-NEXT: scratch_load_b32 v32, off, s32
+; GFX11-NEXT: v_lshlrev_b32_e32 v67, 16, v21
+; GFX11-NEXT: v_lshlrev_b32_e32 v68, 16, v5
+; GFX11-NEXT: v_and_b32_e32 v21, 0xffff0000, v21
+; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
; GFX11-NEXT: v_lshlrev_b32_e32 v83, 16, v17
; GFX11-NEXT: v_lshlrev_b32_e32 v84, 16, v1
; GFX11-NEXT: v_and_b32_e32 v17, 0xffff0000, v17
; GFX11-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GFX11-NEXT: v_lshlrev_b32_e32 v49, 16, v26
+; GFX11-NEXT: v_dual_max_f32 v5, v5, v21 :: v_dual_and_b32 v26, 0xffff0000, v26
; GFX11-NEXT: v_lshlrev_b32_e32 v53, 16, v24
-; GFX11-NEXT: v_and_b32_e32 v24, 0xffff0000, v24
-; GFX11-NEXT: s_mov_b32 s0, 0xff800000
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3)
-; GFX11-NEXT: v_dual_max_f32 v1, v1, v17 :: v_dual_lshlrev_b32 v64, 16, v7
-; GFX11-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-NEXT: v_dual_max_f32 v1, v1, v17 :: v_dual_and_b32 v24, 0xffff0000, v24
+; GFX11-NEXT: v_lshlrev_b32_e32 v71, 16, v19
+; GFX11-NEXT: v_bfe_u32 v103, v5, 16, 1
+; GFX11-NEXT: v_and_b32_e32 v19, 0xffff0000, v19
; GFX11-NEXT: v_lshlrev_b32_e32 v81, 16, v18
-; GFX11-NEXT: v_lshlrev_b32_e32 v85, 16, v16
-; GFX11-NEXT: v_lshlrev_b32_e32 v86, 16, v0
; GFX11-NEXT: v_bfe_u32 v135, v1, 16, 1
-; GFX11-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
-; GFX11-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GFX11-NEXT: v_lshlrev_b32_e32 v55, 16, v23
-; GFX11-NEXT: v_and_b32_e32 v23, 0xffff0000, v23
-; GFX11-NEXT: v_and_or_b32 v144, v1, s0, 0x400000
+; GFX11-NEXT: v_or_b32_e32 v112, 0x400000, v5
+; GFX11-NEXT: v_or_b32_e32 v144, 0x400000, v1
+; GFX11-NEXT: v_add3_u32 v103, v103, v5, 0x7fff
+; GFX11-NEXT: v_lshlrev_b32_e32 v80, 16, v3
+; GFX11-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
; GFX11-NEXT: v_add3_u32 v135, v135, v1, 0x7fff
; GFX11-NEXT: v_lshlrev_b32_e32 v82, 16, v2
-; GFX11-NEXT: v_lshlrev_b32_e32 v54, 16, v8
-; GFX11-NEXT: v_dual_max_f32 v17, v86, v85 :: v_dual_and_b32 v8, 0xffff0000, v8
-; GFX11-NEXT: v_dual_max_f32 v7, v7, v23 :: v_dual_lshlrev_b32 v36, 16, v13
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_dual_max_f32 v8, v8, v24 :: v_dual_lshlrev_b32 v39, 16, v27
-; GFX11-NEXT: v_dual_max_f32 v0, v0, v16 :: v_dual_lshlrev_b32 v49, 16, v26
-; GFX11-NEXT: v_max_f32_e32 v24, v64, v55
-; GFX11-NEXT: v_bfe_u32 v87, v7, 16, 1
+; GFX11-NEXT: v_lshlrev_b32_e32 v52, 16, v9
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-NEXT: v_dual_max_f32 v3, v3, v19 :: v_dual_lshlrev_b32 v54, 16, v8
+; GFX11-NEXT: v_lshlrev_b32_e32 v85, 16, v16
+; GFX11-NEXT: v_dual_max_f32 v19, v82, v81 :: v_dual_lshlrev_b32 v64, 16, v7
+; GFX11-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
; GFX11-NEXT: v_lshlrev_b32_e32 v65, 16, v22
; GFX11-NEXT: v_lshlrev_b32_e32 v66, 16, v6
-; GFX11-NEXT: v_and_b32_e32 v22, 0xffff0000, v22
-; GFX11-NEXT: v_bfe_u32 v85, v24, 16, 1
-; GFX11-NEXT: v_lshlrev_b32_e32 v67, 16, v21
-; GFX11-NEXT: v_lshlrev_b32_e32 v68, 16, v5
-; GFX11-NEXT: v_and_b32_e32 v21, 0xffff0000, v21
-; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-NEXT: v_bfe_u32 v129, v19, 16, 1
+; GFX11-NEXT: v_or_b32_e32 v130, 0x400000, v19
+; GFX11-NEXT: v_lshlrev_b32_e32 v48, 16, v11
+; GFX11-NEXT: v_bfe_u32 v119, v3, 16, 1
+; GFX11-NEXT: v_lshlrev_b32_e32 v51, 16, v25
+; GFX11-NEXT: v_add3_u32 v129, v129, v19, 0x7fff
+; GFX11-NEXT: v_lshlrev_b32_e32 v86, 16, v0
+; GFX11-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
+; GFX11-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX11-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT: v_dual_max_f32 v17, v86, v85 :: v_dual_and_b32 v2, 0xffff0000, v2
+; GFX11-NEXT: v_dual_max_f32 v8, v8, v24 :: v_dual_lshlrev_b32 v39, 16, v27
+; GFX11-NEXT: v_or_b32_e32 v128, 0x400000, v3
+; GFX11-NEXT: v_add3_u32 v119, v119, v3, 0x7fff
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-NEXT: v_bfe_u32 v145, v17, 16, 1
+; GFX11-NEXT: v_or_b32_e32 v146, 0x400000, v17
+; GFX11-NEXT: v_and_b32_e32 v18, 0xffff0000, v18
+; GFX11-NEXT: v_and_b32_e32 v25, 0xffff0000, v25
; GFX11-NEXT: v_lshlrev_b32_e32 v70, 16, v4
-; GFX11-NEXT: v_and_or_b32 v86, v24, s0, 0x400000
-; GFX11-NEXT: v_and_or_b32 v96, v7, s0, 0x400000
+; GFX11-NEXT: v_add3_u32 v145, v145, v17, 0x7fff
+; GFX11-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
+; GFX11-NEXT: v_lshlrev_b32_e32 v55, 16, v23
+; GFX11-NEXT: v_and_b32_e32 v23, 0xffff0000, v23
+; GFX11-NEXT: v_lshlrev_b32_e32 v50, 16, v10
+; GFX11-NEXT: v_max_f32_e32 v2, v2, v18
+; GFX11-NEXT: v_max_f32_e32 v0, v0, v16
+; GFX11-NEXT: v_dual_max_f32 v24, v64, v55 :: v_dual_lshlrev_b32 v37, 16, v28
+; GFX11-NEXT: v_max_f32_e32 v7, v7, v23
+; GFX11-NEXT: v_dual_max_f32 v23, v66, v65 :: v_dual_max_f32 v18, v84, v83
+; GFX11-NEXT: v_dual_max_f32 v9, v9, v25 :: v_dual_and_b32 v28, 0xffff0000, v28
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT: v_bfe_u32 v85, v24, 16, 1
+; GFX11-NEXT: v_bfe_u32 v97, v23, 16, 1
+; GFX11-NEXT: v_or_b32_e32 v86, 0x400000, v24
+; GFX11-NEXT: v_or_b32_e32 v98, 0x400000, v23
+; GFX11-NEXT: v_bfe_u32 v87, v7, 16, 1
; GFX11-NEXT: v_add3_u32 v85, v85, v24, 0x7fff
; GFX11-NEXT: v_lshlrev_b32_e32 v69, 16, v20
-; GFX11-NEXT: v_add3_u32 v87, v87, v7, 0x7fff
-; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
-; GFX11-NEXT: v_dual_max_f32 v23, v66, v65 :: v_dual_lshlrev_b32 v48, 16, v11
-; GFX11-NEXT: v_and_b32_e32 v27, 0xffff0000, v27
-; GFX11-NEXT: v_dual_max_f32 v5, v5, v21 :: v_dual_lshlrev_b32 v50, 16, v10
-; GFX11-NEXT: v_dual_max_f32 v21, v70, v69 :: v_dual_and_b32 v26, 0xffff0000, v26
-; GFX11-NEXT: v_and_b32_e32 v18, 0xffff0000, v18
-; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
-; GFX11-NEXT: v_max_f32_e32 v6, v6, v22
-; GFX11-NEXT: v_lshlrev_b32_e32 v52, 16, v9
-; GFX11-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
-; GFX11-NEXT: v_bfe_u32 v97, v23, 16, 1
-; GFX11-NEXT: v_max_f32_e32 v2, v2, v18
-; GFX11-NEXT: v_max_f32_e32 v18, v84, v83
-; GFX11-NEXT: v_bfe_u32 v83, v8, 16, 1
-; GFX11-NEXT: v_bfe_u32 v99, v6, 16, 1
-; GFX11-NEXT: v_bfe_u32 v103, v5, 16, 1
-; GFX11-NEXT: v_bfe_u32 v113, v21, 16, 1
-; GFX11-NEXT: v_lshlrev_b32_e32 v71, 16, v19
-; GFX11-NEXT: v_and_or_b32 v84, v8, s0, 0x400000
-; GFX11-NEXT: v_and_or_b32 v98, v23, s0, 0x400000
-; GFX11-NEXT: v_and_or_b32 v100, v6, s0, 0x400000
-; GFX11-NEXT: v_and_or_b32 v112, v5, s0, 0x400000
-; GFX11-NEXT: v_and_or_b32 v114, v21, s0, 0x400000
-; GFX11-NEXT: v_add3_u32 v83, v83, v8, 0x7fff
-; GFX11-NEXT: v_and_b32_e32 v19, 0xffff0000, v19
-; GFX11-NEXT: v_add3_u32 v97, v97, v23, 0x7fff
; GFX11-NEXT: v_and_b32_e32 v20, 0xffff0000, v20
-; GFX11-NEXT: v_add3_u32 v99, v99, v6, 0x7fff
-; GFX11-NEXT: v_add3_u32 v103, v103, v5, 0x7fff
-; GFX11-NEXT: v_lshlrev_b32_e32 v80, 16, v3
-; GFX11-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
-; GFX11-NEXT: v_add3_u32 v113, v113, v21, 0x7fff
+; GFX11-NEXT: v_add3_u32 v97, v97, v23, 0x7fff
; GFX11-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
-; GFX11-NEXT: v_lshlrev_b32_e32 v38, 16, v12
; GFX11-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
-; GFX11-NEXT: v_dual_max_f32 v3, v3, v19 :: v_dual_and_b32 v10, 0xffff0000, v10
-; GFX11-NEXT: v_dual_max_f32 v22, v68, v67 :: v_dual_lshlrev_b32 v51, 16, v25
-; GFX11-NEXT: v_lshlrev_b32_e32 v37, 16, v28
-; GFX11-NEXT: v_dual_max_f32 v4, v4, v20 :: v_dual_and_b32 v25, 0xffff0000, v25
+; GFX11-NEXT: v_or_b32_e32 v96, 0x400000, v7
+; GFX11-NEXT: v_add3_u32 v87, v87, v7, 0x7fff
+; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
+; GFX11-NEXT: v_max_f32_e32 v4, v4, v20
; GFX11-NEXT: v_max_f32_e32 v20, v80, v71
-; GFX11-NEXT: v_dual_max_f32 v19, v82, v81 :: v_dual_and_b32 v28, 0xffff0000, v28
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3)
-; GFX11-NEXT: v_dual_max_f32 v9, v9, v25 :: v_dual_and_b32 v12, 0xffff0000, v12
-; GFX11-NEXT: v_max_f32_e32 v25, v54, v53
+; GFX11-NEXT: v_bfe_u32 v71, v9, 16, 1
+; GFX11-NEXT: v_or_b32_e32 v80, 0x400000, v9
; GFX11-NEXT: v_lshlrev_b32_e32 v35, 16, v29
-; GFX11-NEXT: v_and_b32_e32 v29, 0xffff0000, v29
-; GFX11-NEXT: v_dual_max_f32 v10, v10, v26 :: v_dual_and_b32 v13, 0xffff0000, v13
-; GFX11-NEXT: v_dual_max_f32 v12, v12, v28 :: v_dual_lshlrev_b32 v33, 16, v30
-; GFX11-NEXT: v_max_f32_e32 v28, v48, v39
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3)
-; GFX11-NEXT: v_dual_max_f32 v13, v13, v29 :: v_dual_lshlrev_b32 v34, 16, v14
-; GFX11-NEXT: v_dual_max_f32 v11, v11, v27 :: v_dual_and_b32 v14, 0xffff0000, v14
-; GFX11-NEXT: v_dual_max_f32 v27, v50, v49 :: v_dual_max_f32 v26, v52, v51
-; GFX11-NEXT: v_dual_max_f32 v29, v38, v37 :: v_dual_and_b32 v30, 0xffff0000, v30
-; GFX11-NEXT: v_lshlrev_b32_e32 v32, 16, v15
-; GFX11-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
-; GFX11-NEXT: v_bfe_u32 v39, v13, 16, 1
+; GFX11-NEXT: v_dual_max_f32 v21, v70, v69 :: v_dual_and_b32 v10, 0xffff0000, v10
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-NEXT: v_add3_u32 v71, v71, v9, 0x7fff
+; GFX11-NEXT: v_and_b32_e32 v22, 0xffff0000, v22
+; GFX11-NEXT: v_dual_max_f32 v10, v10, v26 :: v_dual_and_b32 v29, 0xffff0000, v29
+; GFX11-NEXT: v_and_b32_e32 v27, 0xffff0000, v27
+; GFX11-NEXT: v_max_f32_e32 v26, v52, v51
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-NEXT: v_max_f32_e32 v6, v6, v22
+; GFX11-NEXT: v_lshlrev_b32_e32 v36, 16, v13
+; GFX11-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
+; GFX11-NEXT: v_dual_max_f32 v11, v11, v27 :: v_dual_lshlrev_b32 v34, 16, v14
+; GFX11-NEXT: v_dual_max_f32 v22, v68, v67 :: v_dual_lshlrev_b32 v33, 16, v30
+; GFX11-NEXT: v_dual_max_f32 v27, v50, v49 :: v_dual_lshlrev_b32 v38, 16, v12
+; GFX11-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
+; GFX11-NEXT: v_dual_max_f32 v25, v54, v53 :: v_dual_and_b32 v12, 0xffff0000, v12
+; GFX11-NEXT: v_dual_max_f32 v13, v13, v29 :: v_dual_and_b32 v30, 0xffff0000, v30
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-NEXT: v_max_f32_e32 v29, v38, v37
+; GFX11-NEXT: v_lshlrev_b32_e32 v31, 16, v15
+; GFX11-NEXT: v_dual_max_f32 v12, v12, v28 :: v_dual_and_b32 v15, 0xffff0000, v15
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_4)
; GFX11-NEXT: v_max_f32_e32 v14, v14, v30
+; GFX11-NEXT: v_max_f32_e32 v28, v48, v39
; GFX11-NEXT: v_dual_max_f32 v30, v36, v35 :: v_dual_max_f32 v33, v34, v33
-; GFX11-NEXT: v_and_or_b32 v48, v13, s0, 0x400000
-; GFX11-NEXT: v_bfe_u32 v49, v29, 16, 1
+; GFX11-NEXT: v_bfe_u32 v39, v13, 16, 1
; GFX11-NEXT: v_bfe_u32 v35, v14, 16, 1
-; GFX11-NEXT: v_and_or_b32 v36, v14, s0, 0x400000
+; GFX11-NEXT: v_or_b32_e32 v36, 0x400000, v14
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-NEXT: v_bfe_u32 v37, v30, 16, 1
; GFX11-NEXT: v_bfe_u32 v16, v33, 16, 1
-; GFX11-NEXT: v_and_or_b32 v34, v33, s0, 0x400000
+; GFX11-NEXT: v_or_b32_e32 v34, 0x400000, v33
; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v33, v33
-; GFX11-NEXT: v_bfe_u32 v37, v30, 16, 1
; GFX11-NEXT: v_add3_u32 v35, v35, v14, 0x7fff
+; GFX11-NEXT: v_or_b32_e32 v38, 0x400000, v30
; GFX11-NEXT: v_add3_u32 v16, v16, v33, 0x7fff
-; GFX11-NEXT: v_and_or_b32 v38, v30, s0, 0x400000
-; GFX11-NEXT: v_add3_u32 v39, v39, v13, 0x7fff
; GFX11-NEXT: v_add3_u32 v37, v37, v30, 0x7fff
-; GFX11-NEXT: v_and_or_b32 v50, v29, s0, 0x400000
+; GFX11-NEXT: v_or_b32_e32 v48, 0x400000, v13
+; GFX11-NEXT: v_bfe_u32 v49, v29, 16, 1
+; GFX11-NEXT: v_add3_u32 v39, v39, v13, 0x7fff
; GFX11-NEXT: v_cndmask_b32_e32 v16, v16, v34, vcc_lo
; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v14, v14
+; GFX11-NEXT: v_or_b32_e32 v50, 0x400000, v29
; GFX11-NEXT: v_bfe_u32 v51, v12, 16, 1
; GFX11-NEXT: v_add3_u32 v49, v49, v29, 0x7fff
-; GFX11-NEXT: v_and_or_b32 v52, v12, s0, 0x400000
-; GFX11-NEXT: v_bfe_u32 v53, v28, 16, 1
+; GFX11-NEXT: v_or_b32_e32 v52, 0x400000, v12
; GFX11-NEXT: v_cndmask_b32_e32 v14, v35, v36, vcc_lo
; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v30, v30
+; GFX11-NEXT: v_bfe_u32 v53, v28, 16, 1
; GFX11-NEXT: v_add3_u32 v51, v51, v12, 0x7fff
-; GFX11-NEXT: v_and_or_b32 v54, v28, s0, 0x400000
+; GFX11-NEXT: v_or_b32_e32 v54, 0x400000, v28
; GFX11-NEXT: v_bfe_u32 v55, v11, 16, 1
-; GFX11-NEXT: v_add3_u32 v53, v53, v28, 0x7fff
; GFX11-NEXT: v_cndmask_b32_e32 v30, v37, v38, vcc_lo
; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v13, v13
-; GFX11-NEXT: v_and_or_b32 v64, v11, s0, 0x400000
+; GFX11-NEXT: v_add3_u32 v53, v53, v28, 0x7fff
+; GFX11-NEXT: v_or_b32_e32 v64, 0x400000, v11
; GFX11-NEXT: v_bfe_u32 v65, v27, 16, 1
; GFX11-NEXT: v_add3_u32 v55, v55, v11, 0x7fff
-; GFX11-NEXT: v_and_or_b32 v66, v27, s0, 0x400000
; GFX11-NEXT: v_cndmask_b32_e32 v13, v39, v48, vcc_lo
; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v29, v29
+; GFX11-NEXT: v_or_b32_e32 v66, 0x400000, v27
; GFX11-NEXT: v_bfe_u32 v67, v10, 16, 1
; GFX11-NEXT: v_add3_u32 v65, v65, v27, 0x7fff
-; GFX11-NEXT: v_and_or_b32 v68, v10, s0, 0x400000
-; GFX11-NEXT: v_bfe_u32 v69, v26, 16, 1
+; GFX11-NEXT: v_or_b32_e32 v68, 0x400000, v10
; GFX11-NEXT: v_cndmask_b32_e32 v29, v49, v50, vcc_lo
; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v12, v12
+; GFX11-NEXT: v_bfe_u32 v69, v26, 16, 1
; GFX11-NEXT: v_add3_u32 v67, v67, v10, 0x7fff
-; GFX11-NEXT: v_and_or_b32 v70, v26, s0, 0x400000
-; GFX11-NEXT: v_bfe_u32 v71, v9, 16, 1
-; GFX11-NEXT: v_add3_u32 v69, v69, v26, 0x7fff
+; GFX11-NEXT: v_or_b32_e32 v70, 0x400000, v26
+; GFX11-NEXT: v_bfe_u32 v81, v25, 16, 1
; GFX11-NEXT: v_cndmask_b32_e32 v12, v51, v52, vcc_lo
; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v28, v28
-; GFX11-NEXT: v_and_or_b32 v80, v9, s0, 0x400000
-; GFX11-NEXT: v_bfe_u32 v81, v25, 16, 1
-; GFX11-NEXT: v_add3_u32 v71, v71, v9, 0x7fff
-; GFX11-NEXT: v_and_or_b32 v82, v25, s0, 0x400000
+; GFX11-NEXT: v_add3_u32 v69, v69, v26, 0x7fff
+; GFX11-NEXT: v_or_b32_e32 v82, 0x400000, v25
+; GFX11-NEXT: v_bfe_u32 v83, v8, 16, 1
+; GFX11-NEXT: v_add3_u32 v81, v81, v25, 0x7fff
; GFX11-NEXT: v_cndmask_b32_e32 v28, v53, v54, vcc_lo
; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v11, v11
-; GFX11-NEXT: v_add3_u32 v81, v81, v25, 0x7fff
-; GFX11-NEXT: v_bfe_u32 v101, v22, 16, 1
-; GFX11-NEXT: v_and_or_b32 v102, v22, s0, 0x400000
-; GFX11-NEXT: v_bfe_u32 v115, v4, 16, 1
+; GFX11-NEXT: v_or_b32_e32 v84, 0x400000, v8
+; GFX11-NEXT: v_add3_u32 v83, v83, v8, 0x7fff
+; GFX11-NEXT: v_bfe_u32 v99, v6, 16, 1
+; GFX11-NEXT: v_or_b32_e32 v100, 0x400000, v6
; GFX11-NEXT: v_cndmask_b32_e32 v11, v55, v64, vcc_lo
; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v27, v27
-; GFX11-NEXT: v_add3_u32 v101, v101, v22, 0x7fff
-; GFX11-NEXT: v_and_or_b32 v116, v4, s0, 0x400000
-; GFX11-NEXT: v_bfe_u32 v117, v20, 16, 1
-; GFX11-NEXT: v_add3_u32 v115, v115, v4, 0x7fff
+; GFX11-NEXT: v_bfe_u32 v101, v22, 16, 1
+; GFX11-NEXT: v_add3_u32 v99, v99, v6, 0x7fff
+; GFX11-NEXT: v_or_b32_e32 v102, 0x400000, v22
+; GFX11-NEXT: v_bfe_u32 v113, v21, 16, 1
; GFX11-NEXT: v_cndmask_b32_e32 v27, v65, v66, vcc_lo
; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v10, v10
-; GFX11-NEXT: v_and_or_b32 v118, v20, s0, 0x400000
-; GFX11-NEXT: v_bfe_u32 v129, v19, 16, 1
-; GFX11-NEXT: v_add3_u32 v117, v117, v20, 0x7fff
-; GFX11-NEXT: v_and_or_b32 v130, v19, s0, 0x400000
+; GFX11-NEXT: v_add3_u32 v101, v101, v22, 0x7fff
+; GFX11-NEXT: v_or_b32_e32 v114, 0x400000, v21
+; GFX11-NEXT: v_bfe_u32 v115, v4, 16, 1
+; GFX11-NEXT: v_add3_u32 v113, v113, v21, 0x7fff
; GFX11-NEXT: v_cndmask_b32_e32 v10, v67, v68, vcc_lo
; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v26, v26
-; GFX11-NEXT: v_bfe_u32 v133, v18, 16, 1
-; GFX11-NEXT: v_add3_u32 v129, v129, v19, 0x7fff
-; GFX11-NEXT: v_and_or_b32 v134, v18, s0, 0x400000
-; GFX11-NEXT: v_bfe_u32 v145, v17, 16, 1
+; GFX11-NEXT: v_or_b32_e32 v116, 0x400000, v4
+; GFX11-NEXT: v_bfe_u32 v117, v20, 16, 1
+; GFX11-NEXT: v_add3_u32 v115, v115, v4, 0x7fff
+; GFX11-NEXT: v_or_b32_e32 v118, 0x400000, v20
; GFX11-NEXT: v_cndmask_b32_e32 v26, v69, v70, vcc_lo
; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v9, v9
-; GFX11-NEXT: v_add3_u32 v133, v133, v18, 0x7fff
-; GFX11-NEXT: v_and_or_b32 v146, v17, s0, 0x400000
+; GFX11-NEXT: v_add3_u32 v117, v117, v20, 0x7fff
+; GFX11-NEXT: v_bfe_u32 v133, v18, 16, 1
+; GFX11-NEXT: v_or_b32_e32 v134, 0x400000, v18
; GFX11-NEXT: v_bfe_u32 v147, v0, 16, 1
-; GFX11-NEXT: v_add3_u32 v145, v145, v17, 0x7fff
; GFX11-NEXT: v_cndmask_b32_e32 v9, v71, v80, vcc_lo
; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v25, v25
-; GFX11-NEXT: v_bfe_u32 v131, v2, 16, 1
-; GFX11-NEXT: v_and_or_b32 v33, v0, s0, 0x400000
+; GFX11-NEXT: v_add3_u32 v133, v133, v18, 0x7fff
+; GFX11-NEXT: v_or_b32_e32 v33, 0x400000, v0
; GFX11-NEXT: v_add3_u32 v147, v147, v0, 0x7fff
-; GFX11-NEXT: v_and_or_b32 v132, v2, s0, 0x400000
+; GFX11-NEXT: v_bfe_u32 v131, v2, 16, 1
; GFX11-NEXT: v_cndmask_b32_e32 v25, v81, v82, vcc_lo
; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v8, v8
-; GFX11-NEXT: v_add3_u32 v131, v131, v2, 0x7fff
-; GFX11-NEXT: v_bfe_u32 v119, v3, 16, 1
-; GFX11-NEXT: v_and_or_b32 v128, v3, s0, 0x400000
+; GFX11-NEXT: v_or_b32_e32 v132, 0x400000, v2
; GFX11-NEXT: v_perm_b32 v9, v9, v26, 0x7060302
+; GFX11-NEXT: v_add3_u32 v131, v131, v2, 0x7fff
+; GFX11-NEXT: v_perm_b32 v10, v10, v27, 0x7060302
; GFX11-NEXT: v_cndmask_b32_e32 v8, v83, v84, vcc_lo
; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v24, v24
-; GFX11-NEXT: v_add3_u32 v119, v119, v3, 0x7fff
-; GFX11-NEXT: v_perm_b32 v10, v10, v27, 0x7060302
; GFX11-NEXT: v_perm_b32 v11, v11, v28, 0x7060302
+; GFX11-NEXT: v_perm_b32 v12, v12, v29, 0x7060302
+; GFX11-NEXT: v_perm_b32 v13, v13, v30, 0x7060302
; GFX11-NEXT: v_perm_b32 v8, v8, v25, 0x7060302
; GFX11-NEXT: v_cndmask_b32_e32 v24, v85, v86, vcc_lo
; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
-; GFX11-NEXT: v_perm_b32 v12, v12, v29, 0x7060302
-; GFX11-NEXT: v_perm_b32 v13, v13, v30, 0x7060302
; GFX11-NEXT: v_perm_b32 v14, v14, v16, 0x7060302
; GFX11-NEXT: v_cndmask_b32_e32 v7, v87, v96, vcc_lo
; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v23, v23
@@ -25477,22 +24869,21 @@ define <32 x bfloat> @v_maxnum_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) {
; GFX11-NEXT: v_perm_b32 v0, v0, v17, 0x7060302
; GFX11-NEXT: v_cndmask_b32_e32 v2, v131, v132, vcc_lo
; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_perm_b32 v2, v2, v19, 0x7060302
; GFX11-NEXT: v_cndmask_b32_e32 v3, v119, v128, vcc_lo
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_3)
; GFX11-NEXT: v_perm_b32 v3, v3, v20, 0x7060302
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_lshlrev_b32_e32 v17, 16, v31
-; GFX11-NEXT: v_and_b32_e32 v18, 0xffff0000, v31
-; GFX11-NEXT: v_perm_b32 v2, v2, v19, 0x7060302
-; GFX11-NEXT: v_max_f32_e32 v17, v32, v17
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_lshlrev_b32_e32 v17, 16, v32
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_dual_max_f32 v17, v31, v17 :: v_dual_and_b32 v18, 0xffff0000, v32
; GFX11-NEXT: v_max_f32_e32 v15, v15, v18
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_bfe_u32 v18, v17, 16, 1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX11-NEXT: v_bfe_u32 v19, v15, 16, 1
-; GFX11-NEXT: v_and_or_b32 v20, v17, s0, 0x400000
+; GFX11-NEXT: v_or_b32_e32 v20, 0x400000, v17
; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v17, v17
-; GFX11-NEXT: v_and_or_b32 v21, v15, s0, 0x400000
+; GFX11-NEXT: v_or_b32_e32 v21, 0x400000, v15
; GFX11-NEXT: v_add3_u32 v18, v18, v17, 0x7fff
; GFX11-NEXT: v_add3_u32 v19, v19, v15, 0x7fff
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
@@ -25586,8 +24977,7 @@ define bfloat @v_sqrt_bf16(bfloat %a) {
; GFX8-NEXT: v_bfe_u32 v1, v0, 16, 1
; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v0
; GFX8-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1
-; GFX8-NEXT: v_and_b32_e32 v2, 0xff800000, v0
-; GFX8-NEXT: v_or_b32_e32 v2, 0x400000, v2
+; GFX8-NEXT: v_or_b32_e32 v2, 0x400000, v0
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
; GFX8-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
@@ -25617,9 +25007,8 @@ define bfloat @v_sqrt_bf16(bfloat %a) {
; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc
; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1
; GFX9-NEXT: s_movk_i32 s4, 0x7fff
-; GFX9-NEXT: v_and_b32_e32 v2, 0xff800000, v0
; GFX9-NEXT: v_add3_u32 v1, v1, v0, s4
-; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v2
+; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v0
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0
@@ -25641,13 +25030,12 @@ define bfloat @v_sqrt_bf16(bfloat %a) {
; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, v2, s4
; GFX10-NEXT: v_cmp_lt_f32_e64 s4, 0, v5
; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, v3, s4
-; GFX10-NEXT: s_mov_b32 s4, 0xff800000
; GFX10-NEXT: v_mul_f32_e32 v2, 0x37800000, v1
; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc_lo
; GFX10-NEXT: v_cmp_class_f32_e64 vcc_lo, v0, 0x260
; GFX10-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo
; GFX10-NEXT: v_bfe_u32 v1, v0, 16, 1
-; GFX10-NEXT: v_and_or_b32 v2, v0, s4, 0x400000
+; GFX10-NEXT: v_or_b32_e32 v2, 0x400000, v0
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
; GFX10-NEXT: v_add3_u32 v1, v1, v0, 0x7fff
; GFX10-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo
@@ -25674,9 +25062,8 @@ define bfloat @v_sqrt_bf16(bfloat %a) {
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX11-NEXT: v_cndmask_b32_e64 v1, v1, v2, s0
; GFX11-NEXT: v_cmp_lt_f32_e64 s0, 0, v5
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_cndmask_b32_e64 v1, v1, v3, s0
-; GFX11-NEXT: s_mov_b32 s0, 0xff800000
; GFX11-NEXT: v_mul_f32_e32 v2, 0x37800000, v1
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc_lo
@@ -25684,7 +25071,7 @@ define bfloat @v_sqrt_bf16(bfloat %a) {
; GFX11-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
; GFX11-NEXT: v_bfe_u32 v1, v0, 16, 1
-; GFX11-NEXT: v_and_or_b32 v2, v0, s0, 0x400000
+; GFX11-NEXT: v_or_b32_e32 v2, 0x400000, v0
; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
; GFX11-NEXT: v_add3_u32 v1, v1, v0, 0x7fff
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
@@ -25724,8 +25111,7 @@ define bfloat @v_ldexp_bf16_i32(bfloat %a, i32 %b) {
; GFX8-NEXT: v_bfe_u32 v1, v0, 16, 1
; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v0
; GFX8-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1
-; GFX8-NEXT: v_and_b32_e32 v2, 0xff800000, v0
-; GFX8-NEXT: v_or_b32_e32 v2, 0x400000, v2
+; GFX8-NEXT: v_or_b32_e32 v2, 0x400000, v0
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
; GFX8-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
@@ -25738,9 +25124,8 @@ define bfloat @v_ldexp_bf16_i32(bfloat %a, i32 %b) {
; GFX9-NEXT: v_ldexp_f32 v0, v0, v1
; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1
; GFX9-NEXT: s_movk_i32 s4, 0x7fff
-; GFX9-NEXT: v_and_b32_e32 v2, 0xff800000, v0
; GFX9-NEXT: v_add3_u32 v1, v1, v0, s4
-; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v2
+; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v0
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0
@@ -25750,10 +25135,9 @@ define bfloat @v_ldexp_bf16_i32(bfloat %a, i32 %b) {
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GFX10-NEXT: s_mov_b32 s4, 0xff800000
; GFX10-NEXT: v_ldexp_f32 v0, v0, v1
; GFX10-NEXT: v_bfe_u32 v1, v0, 16, 1
-; GFX10-NEXT: v_and_or_b32 v2, v0, s4, 0x400000
+; GFX10-NEXT: v_or_b32_e32 v2, 0x400000, v0
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
; GFX10-NEXT: v_add3_u32 v1, v1, v0, 0x7fff
; GFX10-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo
@@ -25764,11 +25148,10 @@ define bfloat @v_ldexp_bf16_i32(bfloat %a, i32 %b) {
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GFX11-NEXT: s_mov_b32 s0, 0xff800000
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_ldexp_f32 v0, v0, v1
; GFX11-NEXT: v_bfe_u32 v1, v0, 16, 1
-; GFX11-NEXT: v_and_or_b32 v2, v0, s0, 0x400000
+; GFX11-NEXT: v_or_b32_e32 v2, 0x400000, v0
; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_add3_u32 v1, v1, v0, 0x7fff
@@ -25816,8 +25199,7 @@ define { bfloat, i16 } @v_frexp_bf16_i16(bfloat %a) {
; GFX8-NEXT: v_bfe_u32 v2, v0, 16, 1
; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v0
; GFX8-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2
-; GFX8-NEXT: v_and_b32_e32 v3, 0xff800000, v0
-; GFX8-NEXT: v_or_b32_e32 v3, 0x400000, v3
+; GFX8-NEXT: v_or_b32_e32 v3, 0x400000, v0
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc
; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
@@ -25831,9 +25213,8 @@ define { bfloat, i16 } @v_frexp_bf16_i16(bfloat %a) {
; GFX9-NEXT: v_frexp_mant_f32_e32 v0, v1
; GFX9-NEXT: v_bfe_u32 v2, v0, 16, 1
; GFX9-NEXT: s_movk_i32 s4, 0x7fff
-; GFX9-NEXT: v_and_b32_e32 v3, 0xff800000, v0
; GFX9-NEXT: v_add3_u32 v2, v2, v0, s4
-; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v3
+; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v0
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc
; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0
@@ -25844,11 +25225,10 @@ define { bfloat, i16 } @v_frexp_bf16_i16(bfloat %a) {
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v0
-; GFX10-NEXT: s_mov_b32 s4, 0xff800000
; GFX10-NEXT: v_frexp_mant_f32_e32 v0, v1
; GFX10-NEXT: v_frexp_exp_i32_f32_e32 v1, v1
; GFX10-NEXT: v_bfe_u32 v2, v0, 16, 1
-; GFX10-NEXT: v_and_or_b32 v3, v0, s4, 0x400000
+; GFX10-NEXT: v_or_b32_e32 v3, 0x400000, v0
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
; GFX10-NEXT: v_add3_u32 v2, v2, v0, 0x7fff
; GFX10-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc_lo
@@ -25947,8 +25327,7 @@ define bfloat @v_log_bf16(bfloat %a) {
; GFX8-NEXT: v_bfe_u32 v1, v0, 16, 1
; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v0
; GFX8-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1
-; GFX8-NEXT: v_and_b32_e32 v2, 0xff800000, v0
-; GFX8-NEXT: v_or_b32_e32 v2, 0x400000, v2
+; GFX8-NEXT: v_or_b32_e32 v2, 0x400000, v0
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
; GFX8-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
@@ -25978,9 +25357,8 @@ define bfloat @v_log_bf16(bfloat %a) {
; GFX9-NEXT: v_sub_f32_e32 v0, v0, v1
; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1
; GFX9-NEXT: s_movk_i32 s4, 0x7fff
-; GFX9-NEXT: v_and_b32_e32 v2, 0xff800000, v0
; GFX9-NEXT: v_add3_u32 v1, v1, v0, s4
-; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v2
+; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v0
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0
@@ -25990,7 +25368,6 @@ define bfloat @v_log_bf16(bfloat %a) {
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GFX10-NEXT: s_mov_b32 s4, 0xff800000
; GFX10-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0
; GFX10-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x4f800000, vcc_lo
; GFX10-NEXT: v_mul_f32_e32 v0, v0, v1
@@ -26004,7 +25381,7 @@ define bfloat @v_log_bf16(bfloat %a) {
; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo
; GFX10-NEXT: v_sub_f32_e32 v0, v0, v2
; GFX10-NEXT: v_bfe_u32 v1, v0, 16, 1
-; GFX10-NEXT: v_and_or_b32 v2, v0, s4, 0x400000
+; GFX10-NEXT: v_or_b32_e32 v2, 0x400000, v0
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
; GFX10-NEXT: v_add3_u32 v1, v1, v0, 0x7fff
; GFX10-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo
@@ -26015,7 +25392,6 @@ define bfloat @v_log_bf16(bfloat %a) {
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GFX11-NEXT: s_mov_b32 s0, 0xff800000
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0
; GFX11-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x4f800000, vcc_lo
@@ -26035,7 +25411,7 @@ define bfloat @v_log_bf16(bfloat %a) {
; GFX11-NEXT: v_sub_f32_e32 v0, v0, v2
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
; GFX11-NEXT: v_bfe_u32 v1, v0, 16, 1
-; GFX11-NEXT: v_and_or_b32 v2, v0, s0, 0x400000
+; GFX11-NEXT: v_or_b32_e32 v2, 0x400000, v0
; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
; GFX11-NEXT: v_add3_u32 v1, v1, v0, 0x7fff
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
@@ -26097,8 +25473,7 @@ define bfloat @v_log2_bf16(bfloat %a) {
; GFX8-NEXT: v_bfe_u32 v1, v0, 16, 1
; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v0
; GFX8-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1
-; GFX8-NEXT: v_and_b32_e32 v2, 0xff800000, v0
-; GFX8-NEXT: v_or_b32_e32 v2, 0x400000, v2
+; GFX8-NEXT: v_or_b32_e32 v2, 0x400000, v0
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
; GFX8-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
@@ -26119,9 +25494,8 @@ define bfloat @v_log2_bf16(bfloat %a) {
; GFX9-NEXT: s_movk_i32 s4, 0x7fff
; GFX9-NEXT: v_sub_f32_e32 v0, v0, v1
; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1
-; GFX9-NEXT: v_and_b32_e32 v2, 0xff800000, v0
; GFX9-NEXT: v_add3_u32 v1, v1, v0, s4
-; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v2
+; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v0
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0
@@ -26131,7 +25505,6 @@ define bfloat @v_log2_bf16(bfloat %a) {
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GFX10-NEXT: s_mov_b32 s4, 0xff800000
; GFX10-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0
; GFX10-NEXT: v_cndmask_b32_e64 v2, 1.0, 0x4f800000, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, 0x42000000, vcc_lo
@@ -26139,7 +25512,7 @@ define bfloat @v_log2_bf16(bfloat %a) {
; GFX10-NEXT: v_log_f32_e32 v0, v0
; GFX10-NEXT: v_sub_f32_e32 v0, v0, v1
; GFX10-NEXT: v_bfe_u32 v1, v0, 16, 1
-; GFX10-NEXT: v_and_or_b32 v2, v0, s4, 0x400000
+; GFX10-NEXT: v_or_b32_e32 v2, 0x400000, v0
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
; GFX10-NEXT: v_add3_u32 v1, v1, v0, 0x7fff
; GFX10-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo
@@ -26150,7 +25523,6 @@ define bfloat @v_log2_bf16(bfloat %a) {
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GFX11-NEXT: s_mov_b32 s0, 0xff800000
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0
; GFX11-NEXT: v_cndmask_b32_e64 v2, 1.0, 0x4f800000, vcc_lo
@@ -26161,7 +25533,7 @@ define bfloat @v_log2_bf16(bfloat %a) {
; GFX11-NEXT: s_waitcnt_depctr 0xfff
; GFX11-NEXT: v_sub_f32_e32 v0, v0, v1
; GFX11-NEXT: v_bfe_u32 v1, v0, 16, 1
-; GFX11-NEXT: v_and_or_b32 v2, v0, s0, 0x400000
+; GFX11-NEXT: v_or_b32_e32 v2, 0x400000, v0
; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_add3_u32 v1, v1, v0, 0x7fff
@@ -26257,8 +25629,7 @@ define bfloat @v_log10_bf16(bfloat %a) {
; GFX8-NEXT: v_bfe_u32 v1, v0, 16, 1
; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v0
; GFX8-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1
-; GFX8-NEXT: v_and_b32_e32 v2, 0xff800000, v0
-; GFX8-NEXT: v_or_b32_e32 v2, 0x400000, v2
+; GFX8-NEXT: v_or_b32_e32 v2, 0x400000, v0
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
; GFX8-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
@@ -26288,9 +25659,8 @@ define bfloat @v_log10_bf16(bfloat %a) {
; GFX9-NEXT: v_sub_f32_e32 v0, v0, v1
; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1
; GFX9-NEXT: s_movk_i32 s4, 0x7fff
-; GFX9-NEXT: v_and_b32_e32 v2, 0xff800000, v0
; GFX9-NEXT: v_add3_u32 v1, v1, v0, s4
-; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v2
+; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v0
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0
@@ -26300,7 +25670,6 @@ define bfloat @v_log10_bf16(bfloat %a) {
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GFX10-NEXT: s_mov_b32 s4, 0xff800000
; GFX10-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0
; GFX10-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x4f800000, vcc_lo
; GFX10-NEXT: v_mul_f32_e32 v0, v0, v1
@@ -26314,7 +25683,7 @@ define bfloat @v_log10_bf16(bfloat %a) {
; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo
; GFX10-NEXT: v_sub_f32_e32 v0, v0, v2
; GFX10-NEXT: v_bfe_u32 v1, v0, 16, 1
-; GFX10-NEXT: v_and_or_b32 v2, v0, s4, 0x400000
+; GFX10-NEXT: v_or_b32_e32 v2, 0x400000, v0
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
; GFX10-NEXT: v_add3_u32 v1, v1, v0, 0x7fff
; GFX10-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo
@@ -26325,7 +25694,6 @@ define bfloat @v_log10_bf16(bfloat %a) {
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GFX11-NEXT: s_mov_b32 s0, 0xff800000
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0x800000, v0
; GFX11-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x4f800000, vcc_lo
@@ -26345,7 +25713,7 @@ define bfloat @v_log10_bf16(bfloat %a) {
; GFX11-NEXT: v_sub_f32_e32 v0, v0, v2
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
; GFX11-NEXT: v_bfe_u32 v1, v0, 16, 1
-; GFX11-NEXT: v_and_or_b32 v2, v0, s0, 0x400000
+; GFX11-NEXT: v_or_b32_e32 v2, 0x400000, v0
; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
; GFX11-NEXT: v_add3_u32 v1, v1, v0, 0x7fff
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
@@ -26442,8 +25810,7 @@ define bfloat @v_exp_bf16(bfloat %a) {
; GFX8-NEXT: v_bfe_u32 v1, v0, 16, 1
; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v0
; GFX8-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1
-; GFX8-NEXT: v_and_b32_e32 v2, 0xff800000, v0
-; GFX8-NEXT: v_or_b32_e32 v2, 0x400000, v2
+; GFX8-NEXT: v_or_b32_e32 v2, 0x400000, v0
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
; GFX8-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
@@ -26473,9 +25840,8 @@ define bfloat @v_exp_bf16(bfloat %a) {
; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1
; GFX9-NEXT: s_movk_i32 s4, 0x7fff
-; GFX9-NEXT: v_and_b32_e32 v2, 0xff800000, v0
; GFX9-NEXT: v_add3_u32 v1, v1, v0, s4
-; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v2
+; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v0
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0
@@ -26485,7 +25851,6 @@ define bfloat @v_exp_bf16(bfloat %a) {
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GFX10-NEXT: s_mov_b32 s4, 0xff800000
; GFX10-NEXT: v_mul_f32_e32 v1, 0x3fb8aa3b, v0
; GFX10-NEXT: v_cmp_ngt_f32_e32 vcc_lo, 0xc2ce8ed0, v0
; GFX10-NEXT: v_rndne_f32_e32 v2, v1
@@ -26500,7 +25865,7 @@ define bfloat @v_exp_bf16(bfloat %a) {
; GFX10-NEXT: v_cmp_nlt_f32_e32 vcc_lo, 0x42b17218, v0
; GFX10-NEXT: v_cndmask_b32_e32 v0, 0x7f800000, v1, vcc_lo
; GFX10-NEXT: v_bfe_u32 v1, v0, 16, 1
-; GFX10-NEXT: v_and_or_b32 v2, v0, s4, 0x400000
+; GFX10-NEXT: v_or_b32_e32 v2, 0x400000, v0
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
; GFX10-NEXT: v_add3_u32 v1, v1, v0, 0x7fff
; GFX10-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo
@@ -26511,7 +25876,6 @@ define bfloat @v_exp_bf16(bfloat %a) {
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GFX11-NEXT: s_mov_b32 s0, 0xff800000
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_mul_f32_e32 v1, 0x3fb8aa3b, v0
; GFX11-NEXT: v_rndne_f32_e32 v2, v1
@@ -26532,7 +25896,7 @@ define bfloat @v_exp_bf16(bfloat %a) {
; GFX11-NEXT: v_cndmask_b32_e32 v0, 0x7f800000, v1, vcc_lo
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
; GFX11-NEXT: v_bfe_u32 v1, v0, 16, 1
-; GFX11-NEXT: v_and_or_b32 v2, v0, s0, 0x400000
+; GFX11-NEXT: v_or_b32_e32 v2, 0x400000, v0
; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
; GFX11-NEXT: v_add3_u32 v1, v1, v0, 0x7fff
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
@@ -26594,8 +25958,7 @@ define bfloat @v_exp2_bf16(bfloat %a) {
; GFX8-NEXT: v_bfe_u32 v1, v0, 16, 1
; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v0
; GFX8-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1
-; GFX8-NEXT: v_and_b32_e32 v2, 0xff800000, v0
-; GFX8-NEXT: v_or_b32_e32 v2, 0x400000, v2
+; GFX8-NEXT: v_or_b32_e32 v2, 0x400000, v0
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
; GFX8-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
@@ -26616,9 +25979,8 @@ define bfloat @v_exp2_bf16(bfloat %a) {
; GFX9-NEXT: s_movk_i32 s4, 0x7fff
; GFX9-NEXT: v_mul_f32_e32 v0, v0, v1
; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1
-; GFX9-NEXT: v_and_b32_e32 v2, 0xff800000, v0
; GFX9-NEXT: v_add3_u32 v1, v1, v0, s4
-; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v2
+; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v0
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0
@@ -26628,7 +25990,6 @@ define bfloat @v_exp2_bf16(bfloat %a) {
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GFX10-NEXT: s_mov_b32 s4, 0xff800000
; GFX10-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0xc2fc0000, v0
; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, 0x42800000, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e64 v1, 1.0, 0x1f800000, vcc_lo
@@ -26636,7 +25997,7 @@ define bfloat @v_exp2_bf16(bfloat %a) {
; GFX10-NEXT: v_exp_f32_e32 v0, v0
; GFX10-NEXT: v_mul_f32_e32 v0, v0, v1
; GFX10-NEXT: v_bfe_u32 v1, v0, 16, 1
-; GFX10-NEXT: v_and_or_b32 v2, v0, s4, 0x400000
+; GFX10-NEXT: v_or_b32_e32 v2, 0x400000, v0
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
; GFX10-NEXT: v_add3_u32 v1, v1, v0, 0x7fff
; GFX10-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo
@@ -26647,7 +26008,6 @@ define bfloat @v_exp2_bf16(bfloat %a) {
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GFX11-NEXT: s_mov_b32 s0, 0xff800000
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_cmp_gt_f32_e32 vcc_lo, 0xc2fc0000, v0
; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 0x42800000, vcc_lo
@@ -26658,7 +26018,7 @@ define bfloat @v_exp2_bf16(bfloat %a) {
; GFX11-NEXT: s_waitcnt_depctr 0xfff
; GFX11-NEXT: v_mul_f32_e32 v0, v0, v1
; GFX11-NEXT: v_bfe_u32 v1, v0, 16, 1
-; GFX11-NEXT: v_and_or_b32 v2, v0, s0, 0x400000
+; GFX11-NEXT: v_or_b32_e32 v2, 0x400000, v0
; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_add3_u32 v1, v1, v0, 0x7fff
@@ -26752,8 +26112,7 @@ define bfloat @v_exp10_bf16(bfloat %a) {
; GFX8-NEXT: v_bfe_u32 v1, v0, 16, 1
; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v0
; GFX8-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1
-; GFX8-NEXT: v_and_b32_e32 v2, 0xff800000, v0
-; GFX8-NEXT: v_or_b32_e32 v2, 0x400000, v2
+; GFX8-NEXT: v_or_b32_e32 v2, 0x400000, v0
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
; GFX8-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
@@ -26783,9 +26142,8 @@ define bfloat @v_exp10_bf16(bfloat %a) {
; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1
; GFX9-NEXT: s_movk_i32 s4, 0x7fff
-; GFX9-NEXT: v_and_b32_e32 v2, 0xff800000, v0
; GFX9-NEXT: v_add3_u32 v1, v1, v0, s4
-; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v2
+; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v0
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0
@@ -26795,7 +26153,6 @@ define bfloat @v_exp10_bf16(bfloat %a) {
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GFX10-NEXT: s_mov_b32 s4, 0xff800000
; GFX10-NEXT: v_mul_f32_e32 v1, 0x40549a78, v0
; GFX10-NEXT: v_cmp_ngt_f32_e32 vcc_lo, 0xc23369f4, v0
; GFX10-NEXT: v_rndne_f32_e32 v2, v1
@@ -26810,7 +26167,7 @@ define bfloat @v_exp10_bf16(bfloat %a) {
; GFX10-NEXT: v_cmp_nlt_f32_e32 vcc_lo, 0x421a209b, v0
; GFX10-NEXT: v_cndmask_b32_e32 v0, 0x7f800000, v1, vcc_lo
; GFX10-NEXT: v_bfe_u32 v1, v0, 16, 1
-; GFX10-NEXT: v_and_or_b32 v2, v0, s4, 0x400000
+; GFX10-NEXT: v_or_b32_e32 v2, 0x400000, v0
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
; GFX10-NEXT: v_add3_u32 v1, v1, v0, 0x7fff
; GFX10-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo
@@ -26821,7 +26178,6 @@ define bfloat @v_exp10_bf16(bfloat %a) {
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GFX11-NEXT: s_mov_b32 s0, 0xff800000
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_mul_f32_e32 v1, 0x40549a78, v0
; GFX11-NEXT: v_rndne_f32_e32 v2, v1
@@ -26842,7 +26198,7 @@ define bfloat @v_exp10_bf16(bfloat %a) {
; GFX11-NEXT: v_cndmask_b32_e32 v0, 0x7f800000, v1, vcc_lo
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
; GFX11-NEXT: v_bfe_u32 v1, v0, 16, 1
-; GFX11-NEXT: v_and_or_b32 v2, v0, s0, 0x400000
+; GFX11-NEXT: v_or_b32_e32 v2, 0x400000, v0
; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
; GFX11-NEXT: v_add3_u32 v1, v1, v0, 0x7fff
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
@@ -26882,8 +26238,7 @@ define bfloat @v_ceil_bf16(bfloat %a) {
; GFX8-NEXT: v_bfe_u32 v1, v0, 16, 1
; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v0
; GFX8-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1
-; GFX8-NEXT: v_and_b32_e32 v2, 0xff800000, v0
-; GFX8-NEXT: v_or_b32_e32 v2, 0x400000, v2
+; GFX8-NEXT: v_or_b32_e32 v2, 0x400000, v0
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
; GFX8-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
@@ -26896,9 +26251,8 @@ define bfloat @v_ceil_bf16(bfloat %a) {
; GFX9-NEXT: v_ceil_f32_e32 v0, v0
; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1
; GFX9-NEXT: s_movk_i32 s4, 0x7fff
-; GFX9-NEXT: v_and_b32_e32 v2, 0xff800000, v0
; GFX9-NEXT: v_add3_u32 v1, v1, v0, s4
-; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v2
+; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v0
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0
@@ -26908,10 +26262,9 @@ define bfloat @v_ceil_bf16(bfloat %a) {
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GFX10-NEXT: s_mov_b32 s4, 0xff800000
; GFX10-NEXT: v_ceil_f32_e32 v0, v0
; GFX10-NEXT: v_bfe_u32 v1, v0, 16, 1
-; GFX10-NEXT: v_and_or_b32 v2, v0, s4, 0x400000
+; GFX10-NEXT: v_or_b32_e32 v2, 0x400000, v0
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
; GFX10-NEXT: v_add3_u32 v1, v1, v0, 0x7fff
; GFX10-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo
@@ -26922,11 +26275,10 @@ define bfloat @v_ceil_bf16(bfloat %a) {
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GFX11-NEXT: s_mov_b32 s0, 0xff800000
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_ceil_f32_e32 v0, v0
; GFX11-NEXT: v_bfe_u32 v1, v0, 16, 1
-; GFX11-NEXT: v_and_or_b32 v2, v0, s0, 0x400000
+; GFX11-NEXT: v_or_b32_e32 v2, 0x400000, v0
; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_add3_u32 v1, v1, v0, 0x7fff
@@ -26967,8 +26319,7 @@ define bfloat @v_trunc_bf16(bfloat %a) {
; GFX8-NEXT: v_bfe_u32 v1, v0, 16, 1
; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v0
; GFX8-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1
-; GFX8-NEXT: v_and_b32_e32 v2, 0xff800000, v0
-; GFX8-NEXT: v_or_b32_e32 v2, 0x400000, v2
+; GFX8-NEXT: v_or_b32_e32 v2, 0x400000, v0
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
; GFX8-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
@@ -26981,9 +26332,8 @@ define bfloat @v_trunc_bf16(bfloat %a) {
; GFX9-NEXT: v_trunc_f32_e32 v0, v0
; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1
; GFX9-NEXT: s_movk_i32 s4, 0x7fff
-; GFX9-NEXT: v_and_b32_e32 v2, 0xff800000, v0
; GFX9-NEXT: v_add3_u32 v1, v1, v0, s4
-; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v2
+; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v0
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0
@@ -26993,10 +26343,9 @@ define bfloat @v_trunc_bf16(bfloat %a) {
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GFX10-NEXT: s_mov_b32 s4, 0xff800000
; GFX10-NEXT: v_trunc_f32_e32 v0, v0
; GFX10-NEXT: v_bfe_u32 v1, v0, 16, 1
-; GFX10-NEXT: v_and_or_b32 v2, v0, s4, 0x400000
+; GFX10-NEXT: v_or_b32_e32 v2, 0x400000, v0
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
; GFX10-NEXT: v_add3_u32 v1, v1, v0, 0x7fff
; GFX10-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo
@@ -27007,11 +26356,10 @@ define bfloat @v_trunc_bf16(bfloat %a) {
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GFX11-NEXT: s_mov_b32 s0, 0xff800000
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_trunc_f32_e32 v0, v0
; GFX11-NEXT: v_bfe_u32 v1, v0, 16, 1
-; GFX11-NEXT: v_and_or_b32 v2, v0, s0, 0x400000
+; GFX11-NEXT: v_or_b32_e32 v2, 0x400000, v0
; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_add3_u32 v1, v1, v0, 0x7fff
@@ -27052,8 +26400,7 @@ define bfloat @v_rint_bf16(bfloat %a) {
; GFX8-NEXT: v_bfe_u32 v1, v0, 16, 1
; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v0
; GFX8-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1
-; GFX8-NEXT: v_and_b32_e32 v2, 0xff800000, v0
-; GFX8-NEXT: v_or_b32_e32 v2, 0x400000, v2
+; GFX8-NEXT: v_or_b32_e32 v2, 0x400000, v0
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
; GFX8-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
@@ -27066,9 +26413,8 @@ define bfloat @v_rint_bf16(bfloat %a) {
; GFX9-NEXT: v_rndne_f32_e32 v0, v0
; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1
; GFX9-NEXT: s_movk_i32 s4, 0x7fff
-; GFX9-NEXT: v_and_b32_e32 v2, 0xff800000, v0
; GFX9-NEXT: v_add3_u32 v1, v1, v0, s4
-; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v2
+; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v0
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0
@@ -27078,10 +26424,9 @@ define bfloat @v_rint_bf16(bfloat %a) {
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GFX10-NEXT: s_mov_b32 s4, 0xff800000
; GFX10-NEXT: v_rndne_f32_e32 v0, v0
; GFX10-NEXT: v_bfe_u32 v1, v0, 16, 1
-; GFX10-NEXT: v_and_or_b32 v2, v0, s4, 0x400000
+; GFX10-NEXT: v_or_b32_e32 v2, 0x400000, v0
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
; GFX10-NEXT: v_add3_u32 v1, v1, v0, 0x7fff
; GFX10-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo
@@ -27092,11 +26437,10 @@ define bfloat @v_rint_bf16(bfloat %a) {
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GFX11-NEXT: s_mov_b32 s0, 0xff800000
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_rndne_f32_e32 v0, v0
; GFX11-NEXT: v_bfe_u32 v1, v0, 16, 1
-; GFX11-NEXT: v_and_or_b32 v2, v0, s0, 0x400000
+; GFX11-NEXT: v_or_b32_e32 v2, 0x400000, v0
; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_add3_u32 v1, v1, v0, 0x7fff
@@ -27137,8 +26481,7 @@ define bfloat @v_nearbyint_bf16(bfloat %a) {
; GFX8-NEXT: v_bfe_u32 v1, v0, 16, 1
; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v0
; GFX8-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1
-; GFX8-NEXT: v_and_b32_e32 v2, 0xff800000, v0
-; GFX8-NEXT: v_or_b32_e32 v2, 0x400000, v2
+; GFX8-NEXT: v_or_b32_e32 v2, 0x400000, v0
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
; GFX8-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
@@ -27151,9 +26494,8 @@ define bfloat @v_nearbyint_bf16(bfloat %a) {
; GFX9-NEXT: v_rndne_f32_e32 v0, v0
; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1
; GFX9-NEXT: s_movk_i32 s4, 0x7fff
-; GFX9-NEXT: v_and_b32_e32 v2, 0xff800000, v0
; GFX9-NEXT: v_add3_u32 v1, v1, v0, s4
-; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v2
+; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v0
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0
@@ -27163,10 +26505,9 @@ define bfloat @v_nearbyint_bf16(bfloat %a) {
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GFX10-NEXT: s_mov_b32 s4, 0xff800000
; GFX10-NEXT: v_rndne_f32_e32 v0, v0
; GFX10-NEXT: v_bfe_u32 v1, v0, 16, 1
-; GFX10-NEXT: v_and_or_b32 v2, v0, s4, 0x400000
+; GFX10-NEXT: v_or_b32_e32 v2, 0x400000, v0
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
; GFX10-NEXT: v_add3_u32 v1, v1, v0, 0x7fff
; GFX10-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo
@@ -27177,11 +26518,10 @@ define bfloat @v_nearbyint_bf16(bfloat %a) {
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GFX11-NEXT: s_mov_b32 s0, 0xff800000
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_rndne_f32_e32 v0, v0
; GFX11-NEXT: v_bfe_u32 v1, v0, 16, 1
-; GFX11-NEXT: v_and_or_b32 v2, v0, s0, 0x400000
+; GFX11-NEXT: v_or_b32_e32 v2, 0x400000, v0
; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_add3_u32 v1, v1, v0, 0x7fff
@@ -27240,8 +26580,7 @@ define bfloat @v_round_bf16(bfloat %a) {
; GFX8-NEXT: v_bfe_u32 v1, v0, 16, 1
; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v0
; GFX8-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1
-; GFX8-NEXT: v_and_b32_e32 v2, 0xff800000, v0
-; GFX8-NEXT: v_or_b32_e32 v2, 0x400000, v2
+; GFX8-NEXT: v_or_b32_e32 v2, 0x400000, v0
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
; GFX8-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
@@ -27260,9 +26599,8 @@ define bfloat @v_round_bf16(bfloat %a) {
; GFX9-NEXT: v_add_f32_e32 v0, v1, v0
; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1
; GFX9-NEXT: s_movk_i32 s4, 0x7fff
-; GFX9-NEXT: v_and_b32_e32 v2, 0xff800000, v0
; GFX9-NEXT: v_add3_u32 v1, v1, v0, s4
-; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v2
+; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v0
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0
@@ -27276,11 +26614,10 @@ define bfloat @v_round_bf16(bfloat %a) {
; GFX10-NEXT: v_sub_f32_e32 v2, v0, v1
; GFX10-NEXT: v_cmp_ge_f32_e64 s4, |v2|, 0.5
; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, 1.0, s4
-; GFX10-NEXT: s_mov_b32 s4, 0xff800000
; GFX10-NEXT: v_bfi_b32 v0, 0x7fffffff, v2, v0
; GFX10-NEXT: v_add_f32_e32 v0, v1, v0
; GFX10-NEXT: v_bfe_u32 v1, v0, 16, 1
-; GFX10-NEXT: v_and_or_b32 v2, v0, s4, 0x400000
+; GFX10-NEXT: v_or_b32_e32 v2, 0x400000, v0
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
; GFX10-NEXT: v_add3_u32 v1, v1, v0, 0x7fff
; GFX10-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo
@@ -27297,13 +26634,12 @@ define bfloat @v_round_bf16(bfloat %a) {
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_cmp_ge_f32_e64 s0, |v2|, 0.5
; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 1.0, s0
-; GFX11-NEXT: s_mov_b32 s0, 0xff800000
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_bfi_b32 v0, 0x7fffffff, v2, v0
; GFX11-NEXT: v_add_f32_e32 v0, v1, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
; GFX11-NEXT: v_bfe_u32 v1, v0, 16, 1
-; GFX11-NEXT: v_and_or_b32 v2, v0, s0, 0x400000
+; GFX11-NEXT: v_or_b32_e32 v2, 0x400000, v0
; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
; GFX11-NEXT: v_add3_u32 v1, v1, v0, 0x7fff
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
@@ -27343,8 +26679,7 @@ define bfloat @v_roundeven_bf16(bfloat %a) {
; GFX8-NEXT: v_bfe_u32 v1, v0, 16, 1
; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v0
; GFX8-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1
-; GFX8-NEXT: v_and_b32_e32 v2, 0xff800000, v0
-; GFX8-NEXT: v_or_b32_e32 v2, 0x400000, v2
+; GFX8-NEXT: v_or_b32_e32 v2, 0x400000, v0
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
; GFX8-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
@@ -27357,9 +26692,8 @@ define bfloat @v_roundeven_bf16(bfloat %a) {
; GFX9-NEXT: v_rndne_f32_e32 v0, v0
; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1
; GFX9-NEXT: s_movk_i32 s4, 0x7fff
-; GFX9-NEXT: v_and_b32_e32 v2, 0xff800000, v0
; GFX9-NEXT: v_add3_u32 v1, v1, v0, s4
-; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v2
+; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v0
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0
@@ -27369,10 +26703,9 @@ define bfloat @v_roundeven_bf16(bfloat %a) {
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GFX10-NEXT: s_mov_b32 s4, 0xff800000
; GFX10-NEXT: v_rndne_f32_e32 v0, v0
; GFX10-NEXT: v_bfe_u32 v1, v0, 16, 1
-; GFX10-NEXT: v_and_or_b32 v2, v0, s4, 0x400000
+; GFX10-NEXT: v_or_b32_e32 v2, 0x400000, v0
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
; GFX10-NEXT: v_add3_u32 v1, v1, v0, 0x7fff
; GFX10-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo
@@ -27383,11 +26716,10 @@ define bfloat @v_roundeven_bf16(bfloat %a) {
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GFX11-NEXT: s_mov_b32 s0, 0xff800000
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_rndne_f32_e32 v0, v0
; GFX11-NEXT: v_bfe_u32 v1, v0, 16, 1
-; GFX11-NEXT: v_and_or_b32 v2, v0, s0, 0x400000
+; GFX11-NEXT: v_or_b32_e32 v2, 0x400000, v0
; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_add3_u32 v1, v1, v0, 0x7fff
@@ -27428,8 +26760,7 @@ define bfloat @v_floor_bf16(bfloat %a) {
; GFX8-NEXT: v_bfe_u32 v1, v0, 16, 1
; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v0
; GFX8-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1
-; GFX8-NEXT: v_and_b32_e32 v2, 0xff800000, v0
-; GFX8-NEXT: v_or_b32_e32 v2, 0x400000, v2
+; GFX8-NEXT: v_or_b32_e32 v2, 0x400000, v0
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
; GFX8-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
@@ -27442,9 +26773,8 @@ define bfloat @v_floor_bf16(bfloat %a) {
; GFX9-NEXT: v_floor_f32_e32 v0, v0
; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1
; GFX9-NEXT: s_movk_i32 s4, 0x7fff
-; GFX9-NEXT: v_and_b32_e32 v2, 0xff800000, v0
; GFX9-NEXT: v_add3_u32 v1, v1, v0, s4
-; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v2
+; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v0
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0
@@ -27454,10 +26784,9 @@ define bfloat @v_floor_bf16(bfloat %a) {
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GFX10-NEXT: s_mov_b32 s4, 0xff800000
; GFX10-NEXT: v_floor_f32_e32 v0, v0
; GFX10-NEXT: v_bfe_u32 v1, v0, 16, 1
-; GFX10-NEXT: v_and_or_b32 v2, v0, s4, 0x400000
+; GFX10-NEXT: v_or_b32_e32 v2, 0x400000, v0
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
; GFX10-NEXT: v_add3_u32 v1, v1, v0, 0x7fff
; GFX10-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo
@@ -27468,11 +26797,10 @@ define bfloat @v_floor_bf16(bfloat %a) {
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GFX11-NEXT: s_mov_b32 s0, 0xff800000
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_floor_f32_e32 v0, v0
; GFX11-NEXT: v_bfe_u32 v1, v0, 16, 1
-; GFX11-NEXT: v_and_or_b32 v2, v0, s0, 0x400000
+; GFX11-NEXT: v_or_b32_e32 v2, 0x400000, v0
; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_add3_u32 v1, v1, v0, 0x7fff
@@ -27505,8 +26833,7 @@ define bfloat @v_canonicalize_bf16(bfloat %a) {
; GFX8-NEXT: v_bfe_u32 v1, v0, 16, 1
; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v0
; GFX8-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1
-; GFX8-NEXT: v_and_b32_e32 v2, 0xff800000, v0
-; GFX8-NEXT: v_or_b32_e32 v2, 0x400000, v2
+; GFX8-NEXT: v_or_b32_e32 v2, 0x400000, v0
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
; GFX8-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
@@ -27519,9 +26846,8 @@ define bfloat @v_canonicalize_bf16(bfloat %a) {
; GFX9-NEXT: v_max_f32_e32 v0, v0, v0
; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1
; GFX9-NEXT: s_movk_i32 s4, 0x7fff
-; GFX9-NEXT: v_and_b32_e32 v2, 0xff800000, v0
; GFX9-NEXT: v_add3_u32 v1, v1, v0, s4
-; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v2
+; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v0
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0
@@ -27531,10 +26857,9 @@ define bfloat @v_canonicalize_bf16(bfloat %a) {
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GFX10-NEXT: s_mov_b32 s4, 0xff800000
; GFX10-NEXT: v_max_f32_e32 v0, v0, v0
; GFX10-NEXT: v_bfe_u32 v1, v0, 16, 1
-; GFX10-NEXT: v_and_or_b32 v2, v0, s4, 0x400000
+; GFX10-NEXT: v_or_b32_e32 v2, 0x400000, v0
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
; GFX10-NEXT: v_add3_u32 v1, v1, v0, 0x7fff
; GFX10-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo
@@ -27545,11 +26870,10 @@ define bfloat @v_canonicalize_bf16(bfloat %a) {
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GFX11-NEXT: s_mov_b32 s0, 0xff800000
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_max_f32_e32 v0, v0, v0
; GFX11-NEXT: v_bfe_u32 v1, v0, 16, 1
-; GFX11-NEXT: v_and_or_b32 v2, v0, s0, 0x400000
+; GFX11-NEXT: v_or_b32_e32 v2, 0x400000, v0
; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_add3_u32 v1, v1, v0, 0x7fff
@@ -31058,9 +30382,8 @@ define bfloat @v_sitofp_i16_to_bf16(i16 %x) {
; GFX8-NEXT: v_cvt_f32_i32_sdwa v0, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
; GFX8-NEXT: v_bfe_u32 v2, v0, 16, 1
; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v0
-; GFX8-NEXT: v_and_b32_e32 v1, 0xff800000, v0
; GFX8-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2
-; GFX8-NEXT: v_or_b32_e32 v1, 0x400000, v1
+; GFX8-NEXT: v_or_b32_e32 v1, 0x400000, v0
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
@@ -31072,9 +30395,8 @@ define bfloat @v_sitofp_i16_to_bf16(i16 %x) {
; GFX9-NEXT: v_cvt_f32_i32_sdwa v0, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
; GFX9-NEXT: s_movk_i32 s4, 0x7fff
; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1
-; GFX9-NEXT: v_and_b32_e32 v2, 0xff800000, v0
+; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v0
; GFX9-NEXT: v_add3_u32 v1, v1, v0, s4
-; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v2
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0
@@ -31084,9 +30406,8 @@ define bfloat @v_sitofp_i16_to_bf16(i16 %x) {
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_cvt_f32_i32_sdwa v0, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
-; GFX10-NEXT: s_mov_b32 s4, 0xff800000
; GFX10-NEXT: v_bfe_u32 v1, v0, 16, 1
-; GFX10-NEXT: v_and_or_b32 v2, v0, s4, 0x400000
+; GFX10-NEXT: v_or_b32_e32 v2, 0x400000, v0
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
; GFX10-NEXT: v_add3_u32 v1, v1, v0, 0x7fff
; GFX10-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo
@@ -31097,11 +30418,10 @@ define bfloat @v_sitofp_i16_to_bf16(i16 %x) {
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_bfe_i32 v0, v0, 0, 16
-; GFX11-NEXT: s_mov_b32 s0, 0xff800000
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_cvt_f32_i32_e32 v0, v0
; GFX11-NEXT: v_bfe_u32 v1, v0, 16, 1
-; GFX11-NEXT: v_and_or_b32 v2, v0, s0, 0x400000
+; GFX11-NEXT: v_or_b32_e32 v2, 0x400000, v0
; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_add3_u32 v1, v1, v0, 0x7fff
@@ -31143,16 +30463,14 @@ define <2 x bfloat> @v_sitofp_v2i16_to_v2bf16(<2 x i16> %x) {
; GFX8-NEXT: v_cvt_f32_i32_sdwa v0, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
; GFX8-NEXT: v_bfe_u32 v3, v1, 16, 1
; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v1
-; GFX8-NEXT: v_and_b32_e32 v2, 0xff800000, v1
; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
-; GFX8-NEXT: v_or_b32_e32 v2, 0x400000, v2
+; GFX8-NEXT: v_or_b32_e32 v2, 0x400000, v1
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
; GFX8-NEXT: v_cndmask_b32_e32 v1, v3, v2, vcc
; GFX8-NEXT: v_bfe_u32 v2, v0, 16, 1
; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v0
; GFX8-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2
-; GFX8-NEXT: v_and_b32_e32 v3, 0xff800000, v0
-; GFX8-NEXT: v_or_b32_e32 v3, 0x400000, v3
+; GFX8-NEXT: v_or_b32_e32 v3, 0x400000, v0
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc
; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
@@ -31166,15 +30484,13 @@ define <2 x bfloat> @v_sitofp_v2i16_to_v2bf16(<2 x i16> %x) {
; GFX9-NEXT: v_cvt_f32_i32_sdwa v0, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
; GFX9-NEXT: s_movk_i32 s4, 0x7fff
; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1
-; GFX9-NEXT: v_and_b32_e32 v3, 0xff800000, v1
+; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1
; GFX9-NEXT: v_add3_u32 v2, v2, v1, s4
-; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v3
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc
; GFX9-NEXT: v_bfe_u32 v2, v0, 16, 1
-; GFX9-NEXT: v_and_b32_e32 v3, 0xff800000, v0
; GFX9-NEXT: v_add3_u32 v2, v2, v0, s4
-; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v3
+; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v0
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc
; GFX9-NEXT: s_mov_b32 s4, 0x7060302
@@ -31186,12 +30502,11 @@ define <2 x bfloat> @v_sitofp_v2i16_to_v2bf16(<2 x i16> %x) {
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_cvt_f32_i32_sdwa v1, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
; GFX10-NEXT: v_cvt_f32_i32_sdwa v0, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX10-NEXT: s_mov_b32 s4, 0xff800000
; GFX10-NEXT: v_bfe_u32 v2, v1, 16, 1
; GFX10-NEXT: v_bfe_u32 v3, v0, 16, 1
-; GFX10-NEXT: v_and_or_b32 v4, v1, s4, 0x400000
+; GFX10-NEXT: v_or_b32_e32 v4, 0x400000, v1
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
-; GFX10-NEXT: v_and_or_b32 v5, v0, s4, 0x400000
+; GFX10-NEXT: v_or_b32_e32 v5, 0x400000, v0
; GFX10-NEXT: v_add3_u32 v2, v2, v1, 0x7fff
; GFX10-NEXT: v_add3_u32 v3, v3, v0, 0x7fff
; GFX10-NEXT: v_cndmask_b32_e32 v1, v2, v4, vcc_lo
@@ -31205,16 +30520,15 @@ define <2 x bfloat> @v_sitofp_v2i16_to_v2bf16(<2 x i16> %x) {
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_bfe_i32 v1, v0, 0, 16
; GFX11-NEXT: v_ashrrev_i32_e32 v0, 16, v0
-; GFX11-NEXT: s_mov_b32 s0, 0xff800000
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_cvt_f32_i32_e32 v1, v1
; GFX11-NEXT: v_cvt_f32_i32_e32 v0, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_bfe_u32 v2, v1, 16, 1
; GFX11-NEXT: v_bfe_u32 v3, v0, 16, 1
-; GFX11-NEXT: v_and_or_b32 v4, v1, s0, 0x400000
+; GFX11-NEXT: v_or_b32_e32 v4, 0x400000, v1
; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
-; GFX11-NEXT: v_and_or_b32 v5, v0, s0, 0x400000
+; GFX11-NEXT: v_or_b32_e32 v5, 0x400000, v0
; GFX11-NEXT: v_add3_u32 v2, v2, v1, 0x7fff
; GFX11-NEXT: v_add3_u32 v3, v3, v0, 0x7fff
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
@@ -31263,26 +30577,22 @@ define <3 x bfloat> @v_sitofp_v3i16_to_v3bf16(<3 x i16> %x) {
; GFX8-NEXT: v_cvt_f32_i32_sdwa v1, sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
; GFX8-NEXT: v_cvt_f32_i32_sdwa v4, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
; GFX8-NEXT: v_cvt_f32_i32_sdwa v0, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX8-NEXT: s_movk_i32 s4, 0x7fff
; GFX8-NEXT: v_bfe_u32 v3, v1, 16, 1
; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v1
-; GFX8-NEXT: v_and_b32_e32 v2, 0xff800000, v1
; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
-; GFX8-NEXT: v_or_b32_e32 v2, 0x400000, v2
+; GFX8-NEXT: v_or_b32_e32 v2, 0x400000, v1
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
; GFX8-NEXT: v_cndmask_b32_e32 v1, v3, v2, vcc
; GFX8-NEXT: v_bfe_u32 v3, v4, 16, 1
; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v4
-; GFX8-NEXT: v_and_b32_e32 v2, 0xff800000, v4
-; GFX8-NEXT: v_add_u32_e32 v3, vcc, s4, v3
-; GFX8-NEXT: v_or_b32_e32 v2, 0x400000, v2
+; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
+; GFX8-NEXT: v_or_b32_e32 v2, 0x400000, v4
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
; GFX8-NEXT: v_cndmask_b32_e32 v2, v3, v2, vcc
; GFX8-NEXT: v_bfe_u32 v3, v0, 16, 1
; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v0
; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
-; GFX8-NEXT: v_and_b32_e32 v4, 0xff800000, v0
-; GFX8-NEXT: v_or_b32_e32 v4, 0x400000, v4
+; GFX8-NEXT: v_or_b32_e32 v4, 0x400000, v0
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
; GFX8-NEXT: v_cndmask_b32_e32 v0, v3, v4, vcc
; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
@@ -31294,25 +30604,22 @@ define <3 x bfloat> @v_sitofp_v3i16_to_v3bf16(<3 x i16> %x) {
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_cvt_f32_i32_sdwa v1, sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
-; GFX9-NEXT: v_cvt_f32_i32_sdwa v4, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
+; GFX9-NEXT: v_cvt_f32_i32_sdwa v2, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
; GFX9-NEXT: s_movk_i32 s4, 0x7fff
; GFX9-NEXT: v_cvt_f32_i32_sdwa v0, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1
-; GFX9-NEXT: v_and_b32_e32 v3, 0xff800000, v1
-; GFX9-NEXT: v_add3_u32 v2, v2, v1, s4
-; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v3
+; GFX9-NEXT: v_bfe_u32 v3, v1, 16, 1
+; GFX9-NEXT: v_add3_u32 v3, v3, v1, s4
+; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v1
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
-; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc
-; GFX9-NEXT: v_bfe_u32 v2, v4, 16, 1
-; GFX9-NEXT: v_and_b32_e32 v3, 0xff800000, v4
-; GFX9-NEXT: v_add3_u32 v2, v2, v4, s4
-; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v3
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
-; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc
+; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1
+; GFX9-NEXT: v_add3_u32 v3, v3, v2, s4
+; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2
+; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
+; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
; GFX9-NEXT: v_bfe_u32 v3, v0, 16, 1
-; GFX9-NEXT: v_and_b32_e32 v4, 0xff800000, v0
; GFX9-NEXT: v_add3_u32 v3, v3, v0, s4
-; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v4
+; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v0
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v4, vcc
; GFX9-NEXT: s_mov_b32 s4, 0x7060302
@@ -31325,24 +30632,23 @@ define <3 x bfloat> @v_sitofp_v3i16_to_v3bf16(<3 x i16> %x) {
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_cvt_f32_i32_sdwa v2, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
; GFX10-NEXT: v_cvt_f32_i32_sdwa v0, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX10-NEXT: s_mov_b32 s4, 0xff800000
; GFX10-NEXT: v_cvt_f32_i32_sdwa v1, sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
; GFX10-NEXT: v_bfe_u32 v3, v2, 16, 1
; GFX10-NEXT: v_bfe_u32 v5, v0, 16, 1
-; GFX10-NEXT: v_and_or_b32 v7, v2, s4, 0x400000
+; GFX10-NEXT: v_or_b32_e32 v7, 0x400000, v2
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
-; GFX10-NEXT: v_bfe_u32 v6, v1, 16, 1
+; GFX10-NEXT: v_bfe_u32 v4, v1, 16, 1
; GFX10-NEXT: v_add3_u32 v3, v3, v2, 0x7fff
-; GFX10-NEXT: v_and_or_b32 v8, v0, s4, 0x400000
+; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v0
; GFX10-NEXT: v_add3_u32 v5, v5, v0, 0x7fff
-; GFX10-NEXT: v_and_or_b32 v4, v1, s4, 0x400000
-; GFX10-NEXT: v_add3_u32 v6, v6, v1, 0x7fff
+; GFX10-NEXT: v_or_b32_e32 v6, 0x400000, v1
+; GFX10-NEXT: v_add3_u32 v4, v4, v1, 0x7fff
; GFX10-NEXT: v_cndmask_b32_e32 v2, v3, v7, vcc_lo
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
; GFX10-NEXT: v_cndmask_b32_e32 v0, v5, v8, vcc_lo
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
; GFX10-NEXT: v_perm_b32 v0, v0, v2, 0x7060302
-; GFX10-NEXT: v_cndmask_b32_e32 v1, v6, v4, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e32 v1, v4, v6, vcc_lo
; GFX10-NEXT: v_alignbit_b32 v1, s4, v1, 16
; GFX10-NEXT: s_setpc_b64 s[30:31]
%op = sitofp <3 x i16> %x to <3 x bfloat>
@@ -31393,31 +30699,27 @@ define <4 x bfloat> @v_sitofp_v4i16_to_v4bf16(<4 x i16> %x) {
; GFX8-NEXT: s_movk_i32 s4, 0x7fff
; GFX8-NEXT: v_bfe_u32 v4, v2, 16, 1
; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v2
-; GFX8-NEXT: v_and_b32_e32 v3, 0xff800000, v2
; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4
-; GFX8-NEXT: v_or_b32_e32 v3, 0x400000, v3
+; GFX8-NEXT: v_or_b32_e32 v3, 0x400000, v2
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
; GFX8-NEXT: v_cndmask_b32_e32 v2, v4, v3, vcc
; GFX8-NEXT: v_bfe_u32 v3, v1, 16, 1
; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v1
; GFX8-NEXT: v_add_u32_e32 v3, vcc, s4, v3
-; GFX8-NEXT: v_and_b32_e32 v4, 0xff800000, v1
-; GFX8-NEXT: v_or_b32_e32 v4, 0x400000, v4
+; GFX8-NEXT: v_or_b32_e32 v4, 0x400000, v1
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
; GFX8-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc
; GFX8-NEXT: v_bfe_u32 v4, v5, 16, 1
; GFX8-NEXT: v_cvt_f32_i32_sdwa v0, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v5
-; GFX8-NEXT: v_and_b32_e32 v3, 0xff800000, v5
; GFX8-NEXT: v_add_u32_e32 v4, vcc, s4, v4
-; GFX8-NEXT: v_or_b32_e32 v3, 0x400000, v3
+; GFX8-NEXT: v_or_b32_e32 v3, 0x400000, v5
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
; GFX8-NEXT: v_cndmask_b32_e32 v3, v4, v3, vcc
; GFX8-NEXT: v_bfe_u32 v4, v0, 16, 1
; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v0
; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4
-; GFX8-NEXT: v_and_b32_e32 v5, 0xff800000, v0
-; GFX8-NEXT: v_or_b32_e32 v5, 0x400000, v5
+; GFX8-NEXT: v_or_b32_e32 v5, 0x400000, v0
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
; GFX8-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc
; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1
@@ -31430,32 +30732,28 @@ define <4 x bfloat> @v_sitofp_v4i16_to_v4bf16(<4 x i16> %x) {
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_cvt_f32_i32_sdwa v2, sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
-; GFX9-NEXT: v_cvt_f32_i32_sdwa v1, sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
; GFX9-NEXT: s_movk_i32 s4, 0x7fff
-; GFX9-NEXT: v_cvt_f32_i32_sdwa v5, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
+; GFX9-NEXT: v_cvt_f32_i32_sdwa v1, sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1
-; GFX9-NEXT: v_and_b32_e32 v4, 0xff800000, v2
; GFX9-NEXT: v_add3_u32 v3, v3, v2, s4
-; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v4
+; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
-; GFX9-NEXT: v_bfe_u32 v3, v1, 16, 1
-; GFX9-NEXT: v_and_b32_e32 v4, 0xff800000, v1
+; GFX9-NEXT: v_cvt_f32_i32_sdwa v4, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
; GFX9-NEXT: v_cvt_f32_i32_sdwa v0, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX9-NEXT: v_bfe_u32 v3, v1, 16, 1
; GFX9-NEXT: v_add3_u32 v3, v3, v1, s4
-; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v4
+; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v1
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
-; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc
-; GFX9-NEXT: v_bfe_u32 v3, v5, 16, 1
-; GFX9-NEXT: v_and_b32_e32 v4, 0xff800000, v5
-; GFX9-NEXT: v_add3_u32 v3, v3, v5, s4
-; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v4
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
-; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc
+; GFX9-NEXT: v_bfe_u32 v3, v4, 16, 1
+; GFX9-NEXT: v_add3_u32 v3, v3, v4, s4
+; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v4
+; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
; GFX9-NEXT: v_bfe_u32 v4, v0, 16, 1
-; GFX9-NEXT: v_and_b32_e32 v5, 0xff800000, v0
+; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc
; GFX9-NEXT: v_add3_u32 v4, v4, v0, s4
-; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v5
+; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v0
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc
; GFX9-NEXT: s_mov_b32 s4, 0x7060302
@@ -31468,30 +30766,29 @@ define <4 x bfloat> @v_sitofp_v4i16_to_v4bf16(<4 x i16> %x) {
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_cvt_f32_i32_sdwa v2, sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
; GFX10-NEXT: v_cvt_f32_i32_sdwa v3, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
-; GFX10-NEXT: s_mov_b32 s4, 0xff800000
; GFX10-NEXT: v_cvt_f32_i32_sdwa v0, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
; GFX10-NEXT: v_cvt_f32_i32_sdwa v1, sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX10-NEXT: v_bfe_u32 v5, v2, 16, 1
-; GFX10-NEXT: v_and_or_b32 v4, v2, s4, 0x400000
-; GFX10-NEXT: v_bfe_u32 v7, v3, 16, 1
+; GFX10-NEXT: v_bfe_u32 v4, v2, 16, 1
+; GFX10-NEXT: v_or_b32_e32 v5, 0x400000, v2
+; GFX10-NEXT: v_bfe_u32 v8, v3, 16, 1
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
-; GFX10-NEXT: v_and_or_b32 v6, v3, s4, 0x400000
-; GFX10-NEXT: v_add3_u32 v5, v5, v2, 0x7fff
+; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v3
+; GFX10-NEXT: v_add3_u32 v4, v4, v2, 0x7fff
; GFX10-NEXT: v_bfe_u32 v10, v0, 16, 1
-; GFX10-NEXT: v_add3_u32 v7, v7, v3, 0x7fff
-; GFX10-NEXT: v_bfe_u32 v8, v1, 16, 1
-; GFX10-NEXT: v_and_or_b32 v11, v0, s4, 0x400000
-; GFX10-NEXT: v_cndmask_b32_e32 v2, v5, v4, vcc_lo
+; GFX10-NEXT: v_add3_u32 v8, v8, v3, 0x7fff
+; GFX10-NEXT: v_bfe_u32 v6, v1, 16, 1
+; GFX10-NEXT: v_or_b32_e32 v11, 0x400000, v0
+; GFX10-NEXT: v_cndmask_b32_e32 v2, v4, v5, vcc_lo
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
; GFX10-NEXT: v_add3_u32 v10, v10, v0, 0x7fff
-; GFX10-NEXT: v_and_or_b32 v9, v1, s4, 0x400000
-; GFX10-NEXT: v_add3_u32 v8, v8, v1, 0x7fff
-; GFX10-NEXT: v_cndmask_b32_e32 v3, v7, v6, vcc_lo
+; GFX10-NEXT: v_or_b32_e32 v7, 0x400000, v1
+; GFX10-NEXT: v_add3_u32 v6, v6, v1, 0x7fff
+; GFX10-NEXT: v_cndmask_b32_e32 v3, v8, v9, vcc_lo
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
; GFX10-NEXT: v_cndmask_b32_e32 v0, v10, v11, vcc_lo
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
; GFX10-NEXT: v_perm_b32 v0, v0, v3, 0x7060302
-; GFX10-NEXT: v_cndmask_b32_e32 v1, v8, v9, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e32 v1, v6, v7, vcc_lo
; GFX10-NEXT: v_perm_b32 v1, v1, v2, 0x7060302
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
@@ -31499,37 +30796,39 @@ define <4 x bfloat> @v_sitofp_v4i16_to_v4bf16(<4 x i16> %x) {
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_bfe_i32 v2, v1, 0, 16
+; GFX11-NEXT: v_ashrrev_i32_e32 v1, 16, v1
; GFX11-NEXT: v_bfe_i32 v3, v0, 0, 16
; GFX11-NEXT: v_ashrrev_i32_e32 v0, 16, v0
-; GFX11-NEXT: s_mov_b32 s0, 0xff800000
-; GFX11-NEXT: v_ashrrev_i32_e32 v1, 16, v1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX11-NEXT: v_cvt_f32_i32_e32 v2, v2
+; GFX11-NEXT: v_cvt_f32_i32_e32 v1, v1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX11-NEXT: v_cvt_f32_i32_e32 v3, v3
; GFX11-NEXT: v_cvt_f32_i32_e32 v0, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_cvt_f32_i32_e32 v1, v1
-; GFX11-NEXT: v_bfe_u32 v5, v2, 16, 1
-; GFX11-NEXT: v_and_or_b32 v4, v2, s0, 0x400000
-; GFX11-NEXT: v_bfe_u32 v7, v3, 16, 1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-NEXT: v_bfe_u32 v4, v2, 16, 1
+; GFX11-NEXT: v_or_b32_e32 v5, 0x400000, v2
+; GFX11-NEXT: v_bfe_u32 v6, v1, 16, 1
+; GFX11-NEXT: v_bfe_u32 v8, v3, 16, 1
; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
-; GFX11-NEXT: v_and_or_b32 v6, v3, s0, 0x400000
-; GFX11-NEXT: v_add3_u32 v5, v5, v2, 0x7fff
+; GFX11-NEXT: v_add3_u32 v4, v4, v2, 0x7fff
+; GFX11-NEXT: v_or_b32_e32 v7, 0x400000, v1
+; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v3
; GFX11-NEXT: v_bfe_u32 v10, v0, 16, 1
-; GFX11-NEXT: v_add3_u32 v7, v7, v3, 0x7fff
-; GFX11-NEXT: v_bfe_u32 v8, v1, 16, 1
-; GFX11-NEXT: v_and_or_b32 v11, v0, s0, 0x400000
-; GFX11-NEXT: v_cndmask_b32_e32 v2, v5, v4, vcc_lo
+; GFX11-NEXT: v_add3_u32 v6, v6, v1, 0x7fff
+; GFX11-NEXT: v_add3_u32 v8, v8, v3, 0x7fff
+; GFX11-NEXT: v_cndmask_b32_e32 v2, v4, v5, vcc_lo
; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX11-NEXT: v_or_b32_e32 v11, 0x400000, v0
; GFX11-NEXT: v_add3_u32 v10, v10, v0, 0x7fff
-; GFX11-NEXT: v_and_or_b32 v9, v1, s0, 0x400000
-; GFX11-NEXT: v_add3_u32 v8, v8, v1, 0x7fff
-; GFX11-NEXT: v_cndmask_b32_e32 v3, v7, v6, vcc_lo
+; GFX11-NEXT: v_cndmask_b32_e32 v3, v8, v9, vcc_lo
; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_cndmask_b32_e32 v0, v10, v11, vcc_lo
; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
-; GFX11-NEXT: v_cndmask_b32_e32 v1, v8, v9, vcc_lo
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_perm_b32 v0, v0, v3, 0x7060302
+; GFX11-NEXT: v_cndmask_b32_e32 v1, v6, v7, vcc_lo
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_perm_b32 v1, v1, v2, 0x7060302
; GFX11-NEXT: s_setpc_b64 s[30:31]
%op = sitofp <4 x i16> %x to <4 x bfloat>
@@ -31557,9 +30856,8 @@ define bfloat @v_sitofp_i32_to_bf16(i32 %x) {
; GFX8-NEXT: v_cvt_f32_i32_e32 v0, v0
; GFX8-NEXT: v_bfe_u32 v2, v0, 16, 1
; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v0
-; GFX8-NEXT: v_and_b32_e32 v1, 0xff800000, v0
; GFX8-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2
-; GFX8-NEXT: v_or_b32_e32 v1, 0x400000, v1
+; GFX8-NEXT: v_or_b32_e32 v1, 0x400000, v0
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
@@ -31571,9 +30869,8 @@ define bfloat @v_sitofp_i32_to_bf16(i32 %x) {
; GFX9-NEXT: v_cvt_f32_i32_e32 v0, v0
; GFX9-NEXT: s_movk_i32 s4, 0x7fff
; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1
-; GFX9-NEXT: v_and_b32_e32 v2, 0xff800000, v0
+; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v0
; GFX9-NEXT: v_add3_u32 v1, v1, v0, s4
-; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v2
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0
@@ -31583,9 +30880,8 @@ define bfloat @v_sitofp_i32_to_bf16(i32 %x) {
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_cvt_f32_i32_e32 v0, v0
-; GFX10-NEXT: s_mov_b32 s4, 0xff800000
; GFX10-NEXT: v_bfe_u32 v1, v0, 16, 1
-; GFX10-NEXT: v_and_or_b32 v2, v0, s4, 0x400000
+; GFX10-NEXT: v_or_b32_e32 v2, 0x400000, v0
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
; GFX10-NEXT: v_add3_u32 v1, v1, v0, 0x7fff
; GFX10-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo
@@ -31596,10 +30892,9 @@ define bfloat @v_sitofp_i32_to_bf16(i32 %x) {
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_cvt_f32_i32_e32 v0, v0
-; GFX11-NEXT: s_mov_b32 s0, 0xff800000
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
; GFX11-NEXT: v_bfe_u32 v1, v0, 16, 1
-; GFX11-NEXT: v_and_or_b32 v2, v0, s0, 0x400000
+; GFX11-NEXT: v_or_b32_e32 v2, 0x400000, v0
; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
; GFX11-NEXT: v_add3_u32 v1, v1, v0, 0x7fff
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
@@ -31636,16 +30931,14 @@ define <2 x bfloat> @v_sitofp_v2i32_to_v2bf16(<2 x i32> %x) {
; GFX8-NEXT: v_cvt_f32_i32_e32 v1, v1
; GFX8-NEXT: v_bfe_u32 v3, v0, 16, 1
; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v0
-; GFX8-NEXT: v_and_b32_e32 v2, 0xff800000, v0
; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
-; GFX8-NEXT: v_or_b32_e32 v2, 0x400000, v2
+; GFX8-NEXT: v_or_b32_e32 v2, 0x400000, v0
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
; GFX8-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc
; GFX8-NEXT: v_bfe_u32 v3, v1, 16, 1
; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v1
-; GFX8-NEXT: v_and_b32_e32 v2, 0xff800000, v1
; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
-; GFX8-NEXT: v_or_b32_e32 v2, 0x400000, v2
+; GFX8-NEXT: v_or_b32_e32 v2, 0x400000, v1
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
; GFX8-NEXT: v_cndmask_b32_e32 v1, v3, v2, vcc
; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1
@@ -31659,15 +30952,13 @@ define <2 x bfloat> @v_sitofp_v2i32_to_v2bf16(<2 x i32> %x) {
; GFX9-NEXT: v_cvt_f32_i32_e32 v1, v1
; GFX9-NEXT: s_movk_i32 s4, 0x7fff
; GFX9-NEXT: v_bfe_u32 v2, v0, 16, 1
-; GFX9-NEXT: v_and_b32_e32 v3, 0xff800000, v0
; GFX9-NEXT: v_add3_u32 v2, v2, v0, s4
-; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v3
+; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v0
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc
; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1
-; GFX9-NEXT: v_and_b32_e32 v3, 0xff800000, v1
; GFX9-NEXT: v_add3_u32 v2, v2, v1, s4
-; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v3
+; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc
; GFX9-NEXT: s_mov_b32 s4, 0x7060302
@@ -31679,12 +30970,11 @@ define <2 x bfloat> @v_sitofp_v2i32_to_v2bf16(<2 x i32> %x) {
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_cvt_f32_i32_e32 v0, v0
; GFX10-NEXT: v_cvt_f32_i32_e32 v1, v1
-; GFX10-NEXT: s_mov_b32 s4, 0xff800000
; GFX10-NEXT: v_bfe_u32 v2, v0, 16, 1
; GFX10-NEXT: v_bfe_u32 v3, v1, 16, 1
-; GFX10-NEXT: v_and_or_b32 v4, v0, s4, 0x400000
+; GFX10-NEXT: v_or_b32_e32 v4, 0x400000, v0
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX10-NEXT: v_and_or_b32 v5, v1, s4, 0x400000
+; GFX10-NEXT: v_or_b32_e32 v5, 0x400000, v1
; GFX10-NEXT: v_add3_u32 v2, v2, v0, 0x7fff
; GFX10-NEXT: v_add3_u32 v3, v3, v1, 0x7fff
; GFX10-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc_lo
@@ -31698,13 +30988,12 @@ define <2 x bfloat> @v_sitofp_v2i32_to_v2bf16(<2 x i32> %x) {
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_cvt_f32_i32_e32 v0, v0
; GFX11-NEXT: v_cvt_f32_i32_e32 v1, v1
-; GFX11-NEXT: s_mov_b32 s0, 0xff800000
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_bfe_u32 v2, v0, 16, 1
; GFX11-NEXT: v_bfe_u32 v3, v1, 16, 1
-; GFX11-NEXT: v_and_or_b32 v4, v0, s0, 0x400000
+; GFX11-NEXT: v_or_b32_e32 v4, 0x400000, v0
; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX11-NEXT: v_and_or_b32 v5, v1, s0, 0x400000
+; GFX11-NEXT: v_or_b32_e32 v5, 0x400000, v1
; GFX11-NEXT: v_add3_u32 v2, v2, v0, 0x7fff
; GFX11-NEXT: v_add3_u32 v3, v3, v1, 0x7fff
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
@@ -31749,23 +31038,20 @@ define <3 x bfloat> @v_sitofp_v3i32_to_v3bf16(<3 x i32> %x) {
; GFX8-NEXT: v_cvt_f32_i32_e32 v1, v1
; GFX8-NEXT: v_bfe_u32 v4, v2, 16, 1
; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v2
-; GFX8-NEXT: v_and_b32_e32 v3, 0xff800000, v2
; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4
-; GFX8-NEXT: v_or_b32_e32 v3, 0x400000, v3
+; GFX8-NEXT: v_or_b32_e32 v3, 0x400000, v2
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
; GFX8-NEXT: v_cndmask_b32_e32 v2, v4, v3, vcc
; GFX8-NEXT: v_bfe_u32 v4, v0, 16, 1
; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v0
-; GFX8-NEXT: v_and_b32_e32 v3, 0xff800000, v0
; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4
-; GFX8-NEXT: v_or_b32_e32 v3, 0x400000, v3
+; GFX8-NEXT: v_or_b32_e32 v3, 0x400000, v0
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
; GFX8-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc
; GFX8-NEXT: v_bfe_u32 v4, v1, 16, 1
; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v1
-; GFX8-NEXT: v_and_b32_e32 v3, 0xff800000, v1
; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4
-; GFX8-NEXT: v_or_b32_e32 v3, 0x400000, v3
+; GFX8-NEXT: v_or_b32_e32 v3, 0x400000, v1
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
; GFX8-NEXT: v_cndmask_b32_e32 v1, v4, v3, vcc
; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v2
@@ -31782,21 +31068,18 @@ define <3 x bfloat> @v_sitofp_v3i32_to_v3bf16(<3 x i32> %x) {
; GFX9-NEXT: s_movk_i32 s4, 0x7fff
; GFX9-NEXT: v_cvt_f32_i32_e32 v1, v1
; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1
-; GFX9-NEXT: v_and_b32_e32 v4, 0xff800000, v2
; GFX9-NEXT: v_add3_u32 v3, v3, v2, s4
-; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v4
+; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
; GFX9-NEXT: v_bfe_u32 v3, v0, 16, 1
-; GFX9-NEXT: v_and_b32_e32 v4, 0xff800000, v0
; GFX9-NEXT: v_add3_u32 v3, v3, v0, s4
-; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v4
+; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v0
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v4, vcc
; GFX9-NEXT: v_bfe_u32 v3, v1, 16, 1
-; GFX9-NEXT: v_and_b32_e32 v4, 0xff800000, v1
; GFX9-NEXT: v_add3_u32 v3, v3, v1, s4
-; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v4
+; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v1
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc
; GFX9-NEXT: s_mov_b32 s4, 0x7060302
@@ -31809,24 +31092,23 @@ define <3 x bfloat> @v_sitofp_v3i32_to_v3bf16(<3 x i32> %x) {
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_cvt_f32_i32_e32 v0, v0
; GFX10-NEXT: v_cvt_f32_i32_e32 v1, v1
-; GFX10-NEXT: s_mov_b32 s4, 0xff800000
; GFX10-NEXT: v_cvt_f32_i32_e32 v2, v2
; GFX10-NEXT: v_bfe_u32 v3, v0, 16, 1
; GFX10-NEXT: v_bfe_u32 v5, v1, 16, 1
-; GFX10-NEXT: v_and_or_b32 v7, v0, s4, 0x400000
+; GFX10-NEXT: v_or_b32_e32 v7, 0x400000, v0
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX10-NEXT: v_bfe_u32 v6, v2, 16, 1
+; GFX10-NEXT: v_bfe_u32 v4, v2, 16, 1
; GFX10-NEXT: v_add3_u32 v3, v3, v0, 0x7fff
-; GFX10-NEXT: v_and_or_b32 v8, v1, s4, 0x400000
+; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v1
; GFX10-NEXT: v_add3_u32 v5, v5, v1, 0x7fff
-; GFX10-NEXT: v_and_or_b32 v4, v2, s4, 0x400000
-; GFX10-NEXT: v_add3_u32 v6, v6, v2, 0x7fff
+; GFX10-NEXT: v_or_b32_e32 v6, 0x400000, v2
+; GFX10-NEXT: v_add3_u32 v4, v4, v2, 0x7fff
; GFX10-NEXT: v_cndmask_b32_e32 v0, v3, v7, vcc_lo
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
; GFX10-NEXT: v_cndmask_b32_e32 v1, v5, v8, vcc_lo
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
; GFX10-NEXT: v_perm_b32 v0, v1, v0, 0x7060302
-; GFX10-NEXT: v_cndmask_b32_e32 v2, v6, v4, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e32 v2, v4, v6, vcc_lo
; GFX10-NEXT: v_alignbit_b32 v1, s4, v2, 16
; GFX10-NEXT: s_setpc_b64 s[30:31]
%op = sitofp <3 x i32> %x to <3 x bfloat>
@@ -31869,31 +31151,27 @@ define <4 x bfloat> @v_sitofp_v4i32_to_v4bf16(<4 x i32> %x) {
; GFX8-NEXT: s_movk_i32 s4, 0x7fff
; GFX8-NEXT: v_bfe_u32 v5, v2, 16, 1
; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v2
-; GFX8-NEXT: v_and_b32_e32 v4, 0xff800000, v2
; GFX8-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5
-; GFX8-NEXT: v_or_b32_e32 v4, 0x400000, v4
+; GFX8-NEXT: v_or_b32_e32 v4, 0x400000, v2
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
; GFX8-NEXT: v_cndmask_b32_e32 v2, v5, v4, vcc
; GFX8-NEXT: v_bfe_u32 v5, v3, 16, 1
; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v3
-; GFX8-NEXT: v_and_b32_e32 v4, 0xff800000, v3
; GFX8-NEXT: v_add_u32_e32 v5, vcc, s4, v5
-; GFX8-NEXT: v_or_b32_e32 v4, 0x400000, v4
+; GFX8-NEXT: v_or_b32_e32 v4, 0x400000, v3
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
; GFX8-NEXT: v_cndmask_b32_e32 v3, v5, v4, vcc
; GFX8-NEXT: v_bfe_u32 v5, v0, 16, 1
; GFX8-NEXT: v_cvt_f32_i32_e32 v1, v1
; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v0
-; GFX8-NEXT: v_and_b32_e32 v4, 0xff800000, v0
-; GFX8-NEXT: v_add_u32_e32 v5, vcc, s4, v5
-; GFX8-NEXT: v_or_b32_e32 v4, 0x400000, v4
+; GFX8-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5
+; GFX8-NEXT: v_or_b32_e32 v4, 0x400000, v0
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
; GFX8-NEXT: v_cndmask_b32_e32 v0, v5, v4, vcc
; GFX8-NEXT: v_bfe_u32 v5, v1, 16, 1
; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v1
-; GFX8-NEXT: v_and_b32_e32 v4, 0xff800000, v1
; GFX8-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5
-; GFX8-NEXT: v_or_b32_e32 v4, 0x400000, v4
+; GFX8-NEXT: v_or_b32_e32 v4, 0x400000, v1
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
; GFX8-NEXT: v_cndmask_b32_e32 v1, v5, v4, vcc
; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v3
@@ -31910,28 +31188,24 @@ define <4 x bfloat> @v_sitofp_v4i32_to_v4bf16(<4 x i32> %x) {
; GFX9-NEXT: s_movk_i32 s4, 0x7fff
; GFX9-NEXT: v_cvt_f32_i32_e32 v0, v0
; GFX9-NEXT: v_bfe_u32 v4, v2, 16, 1
-; GFX9-NEXT: v_and_b32_e32 v5, 0xff800000, v2
; GFX9-NEXT: v_add3_u32 v4, v4, v2, s4
-; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v5
+; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v2
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
; GFX9-NEXT: v_cndmask_b32_e32 v2, v4, v5, vcc
; GFX9-NEXT: v_bfe_u32 v4, v3, 16, 1
-; GFX9-NEXT: v_and_b32_e32 v5, 0xff800000, v3
; GFX9-NEXT: v_cvt_f32_i32_e32 v1, v1
; GFX9-NEXT: v_add3_u32 v4, v4, v3, s4
-; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v5
+; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v3
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
; GFX9-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc
; GFX9-NEXT: v_bfe_u32 v4, v0, 16, 1
-; GFX9-NEXT: v_and_b32_e32 v5, 0xff800000, v0
; GFX9-NEXT: v_add3_u32 v4, v4, v0, s4
-; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v5
+; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v0
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc
; GFX9-NEXT: v_bfe_u32 v4, v1, 16, 1
-; GFX9-NEXT: v_and_b32_e32 v5, 0xff800000, v1
; GFX9-NEXT: v_add3_u32 v4, v4, v1, s4
-; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v5
+; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v1
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
; GFX9-NEXT: v_cndmask_b32_e32 v1, v4, v5, vcc
; GFX9-NEXT: s_mov_b32 s4, 0x7060302
@@ -31944,30 +31218,29 @@ define <4 x bfloat> @v_sitofp_v4i32_to_v4bf16(<4 x i32> %x) {
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_cvt_f32_i32_e32 v2, v2
; GFX10-NEXT: v_cvt_f32_i32_e32 v0, v0
-; GFX10-NEXT: s_mov_b32 s4, 0xff800000
; GFX10-NEXT: v_cvt_f32_i32_e32 v1, v1
; GFX10-NEXT: v_cvt_f32_i32_e32 v3, v3
-; GFX10-NEXT: v_bfe_u32 v5, v2, 16, 1
-; GFX10-NEXT: v_and_or_b32 v4, v2, s4, 0x400000
-; GFX10-NEXT: v_bfe_u32 v8, v0, 16, 1
+; GFX10-NEXT: v_bfe_u32 v4, v2, 16, 1
+; GFX10-NEXT: v_or_b32_e32 v5, 0x400000, v2
+; GFX10-NEXT: v_bfe_u32 v7, v0, 16, 1
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
-; GFX10-NEXT: v_and_or_b32 v9, v0, s4, 0x400000
-; GFX10-NEXT: v_add3_u32 v5, v5, v2, 0x7fff
-; GFX10-NEXT: v_bfe_u32 v10, v1, 16, 1
-; GFX10-NEXT: v_add3_u32 v8, v8, v0, 0x7fff
-; GFX10-NEXT: v_bfe_u32 v7, v3, 16, 1
-; GFX10-NEXT: v_and_or_b32 v11, v1, s4, 0x400000
-; GFX10-NEXT: v_cndmask_b32_e32 v2, v5, v4, vcc_lo
+; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v0
+; GFX10-NEXT: v_add3_u32 v4, v4, v2, 0x7fff
+; GFX10-NEXT: v_bfe_u32 v9, v1, 16, 1
+; GFX10-NEXT: v_add3_u32 v7, v7, v0, 0x7fff
+; GFX10-NEXT: v_bfe_u32 v6, v3, 16, 1
+; GFX10-NEXT: v_or_b32_e32 v10, 0x400000, v1
+; GFX10-NEXT: v_cndmask_b32_e32 v2, v4, v5, vcc_lo
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX10-NEXT: v_add3_u32 v4, v10, v1, 0x7fff
-; GFX10-NEXT: v_and_or_b32 v6, v3, s4, 0x400000
-; GFX10-NEXT: v_add3_u32 v5, v7, v3, 0x7fff
-; GFX10-NEXT: v_cndmask_b32_e32 v0, v8, v9, vcc_lo
+; GFX10-NEXT: v_add3_u32 v9, v9, v1, 0x7fff
+; GFX10-NEXT: v_add3_u32 v6, v6, v3, 0x7fff
+; GFX10-NEXT: v_or_b32_e32 v4, 0x400000, v3
+; GFX10-NEXT: v_cndmask_b32_e32 v0, v7, v8, vcc_lo
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
-; GFX10-NEXT: v_cndmask_b32_e32 v1, v4, v11, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e32 v1, v9, v10, vcc_lo
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
; GFX10-NEXT: v_perm_b32 v0, v1, v0, 0x7060302
-; GFX10-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e32 v3, v6, v4, vcc_lo
; GFX10-NEXT: v_perm_b32 v1, v3, v2, 0x7060302
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
@@ -31976,32 +31249,32 @@ define <4 x bfloat> @v_sitofp_v4i32_to_v4bf16(<4 x i32> %x) {
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_cvt_f32_i32_e32 v2, v2
; GFX11-NEXT: v_cvt_f32_i32_e32 v0, v0
-; GFX11-NEXT: s_mov_b32 s0, 0xff800000
-; GFX11-NEXT: v_cvt_f32_i32_e32 v3, v3
; GFX11-NEXT: v_cvt_f32_i32_e32 v1, v1
-; GFX11-NEXT: v_bfe_u32 v5, v2, 16, 1
-; GFX11-NEXT: v_and_or_b32 v4, v2, s0, 0x400000
-; GFX11-NEXT: v_bfe_u32 v8, v0, 16, 1
+; GFX11-NEXT: v_cvt_f32_i32_e32 v3, v3
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-NEXT: v_bfe_u32 v4, v2, 16, 1
+; GFX11-NEXT: v_or_b32_e32 v5, 0x400000, v2
+; GFX11-NEXT: v_bfe_u32 v7, v0, 16, 1
+; GFX11-NEXT: v_bfe_u32 v9, v1, 16, 1
; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
-; GFX11-NEXT: v_bfe_u32 v7, v3, 16, 1
-; GFX11-NEXT: v_add3_u32 v5, v5, v2, 0x7fff
-; GFX11-NEXT: v_and_or_b32 v9, v0, s0, 0x400000
-; GFX11-NEXT: v_bfe_u32 v10, v1, 16, 1
-; GFX11-NEXT: v_and_or_b32 v11, v1, s0, 0x400000
-; GFX11-NEXT: v_add3_u32 v8, v8, v0, 0x7fff
-; GFX11-NEXT: v_cndmask_b32_e32 v2, v5, v4, vcc_lo
+; GFX11-NEXT: v_add3_u32 v4, v4, v2, 0x7fff
+; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v0
+; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v1
+; GFX11-NEXT: v_add3_u32 v7, v7, v0, 0x7fff
+; GFX11-NEXT: v_add3_u32 v9, v9, v1, 0x7fff
+; GFX11-NEXT: v_cndmask_b32_e32 v2, v4, v5, vcc_lo
; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX11-NEXT: v_and_or_b32 v6, v3, s0, 0x400000
-; GFX11-NEXT: v_add3_u32 v4, v10, v1, 0x7fff
-; GFX11-NEXT: v_add3_u32 v5, v7, v3, 0x7fff
-; GFX11-NEXT: v_cndmask_b32_e32 v0, v8, v9, vcc_lo
+; GFX11-NEXT: v_bfe_u32 v6, v3, 16, 1
+; GFX11-NEXT: v_or_b32_e32 v4, 0x400000, v3
+; GFX11-NEXT: v_cndmask_b32_e32 v0, v7, v8, vcc_lo
; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_cndmask_b32_e32 v1, v4, v11, vcc_lo
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_add3_u32 v6, v6, v3, 0x7fff
+; GFX11-NEXT: v_cndmask_b32_e32 v1, v9, v10, vcc_lo
; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
-; GFX11-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc_lo
; GFX11-NEXT: v_perm_b32 v0, v1, v0, 0x7060302
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_cndmask_b32_e32 v3, v6, v4, vcc_lo
; GFX11-NEXT: v_perm_b32 v1, v3, v2, 0x7060302
; GFX11-NEXT: s_setpc_b64 s[30:31]
%op = sitofp <4 x i32> %x to <4 x bfloat>
@@ -32063,8 +31336,7 @@ define bfloat @v_sitofp_i64_to_bf16(i64 %x) {
; GFX8-NEXT: v_bfe_u32 v1, v0, 16, 1
; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v0
; GFX8-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1
-; GFX8-NEXT: v_and_b32_e32 v2, 0xff800000, v0
-; GFX8-NEXT: v_or_b32_e32 v2, 0x400000, v2
+; GFX8-NEXT: v_or_b32_e32 v2, 0x400000, v0
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
; GFX8-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
@@ -32087,9 +31359,8 @@ define bfloat @v_sitofp_i64_to_bf16(i64 %x) {
; GFX9-NEXT: v_sub_u32_e32 v1, 32, v2
; GFX9-NEXT: v_ldexp_f32 v0, v0, v1
; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1
-; GFX9-NEXT: v_and_b32_e32 v2, 0xff800000, v0
; GFX9-NEXT: v_add3_u32 v1, v1, v0, s4
-; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v2
+; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v0
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0
@@ -32100,7 +31371,6 @@ define bfloat @v_sitofp_i64_to_bf16(i64 %x) {
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_xor_b32_e32 v2, v0, v1
; GFX10-NEXT: v_ffbh_i32_e32 v3, v1
-; GFX10-NEXT: s_mov_b32 s4, 0xff800000
; GFX10-NEXT: v_ashrrev_i32_e32 v2, 31, v2
; GFX10-NEXT: v_add_nc_u32_e32 v3, -1, v3
; GFX10-NEXT: v_add_nc_u32_e32 v2, 32, v2
@@ -32112,7 +31382,7 @@ define bfloat @v_sitofp_i64_to_bf16(i64 %x) {
; GFX10-NEXT: v_cvt_f32_i32_e32 v0, v0
; GFX10-NEXT: v_ldexp_f32 v0, v0, v1
; GFX10-NEXT: v_bfe_u32 v1, v0, 16, 1
-; GFX10-NEXT: v_and_or_b32 v2, v0, s4, 0x400000
+; GFX10-NEXT: v_or_b32_e32 v2, 0x400000, v0
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
; GFX10-NEXT: v_add3_u32 v1, v1, v0, 0x7fff
; GFX10-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo
@@ -32124,7 +31394,6 @@ define bfloat @v_sitofp_i64_to_bf16(i64 %x) {
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_xor_b32_e32 v2, v0, v1
; GFX11-NEXT: v_cls_i32_e32 v3, v1
-; GFX11-NEXT: s_mov_b32 s0, 0xff800000
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_ashrrev_i32_e32 v2, 31, v2
; GFX11-NEXT: v_add_nc_u32_e32 v3, -1, v3
@@ -32141,7 +31410,7 @@ define bfloat @v_sitofp_i64_to_bf16(i64 %x) {
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_ldexp_f32 v0, v0, v1
; GFX11-NEXT: v_bfe_u32 v1, v0, 16, 1
-; GFX11-NEXT: v_and_or_b32 v2, v0, s0, 0x400000
+; GFX11-NEXT: v_or_b32_e32 v2, 0x400000, v0
; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_add3_u32 v1, v1, v0, 0x7fff
@@ -32240,22 +31509,20 @@ define <2 x bfloat> @v_sitofp_v2i64_to_v2bf16(<2 x i64> %x) {
; GFX8-NEXT: v_ashrrev_i32_e32 v1, 31, v1
; GFX8-NEXT: v_add_u32_e32 v0, vcc, -1, v0
; GFX8-NEXT: v_add_u32_e32 v1, vcc, 32, v1
-; GFX8-NEXT: v_min_u32_e32 v7, v0, v1
-; GFX8-NEXT: v_lshlrev_b64 v[0:1], v7, v[2:3]
-; GFX8-NEXT: v_and_b32_e32 v6, 0xff800000, v4
+; GFX8-NEXT: v_min_u32_e32 v6, v0, v1
+; GFX8-NEXT: v_lshlrev_b64 v[0:1], v6, v[2:3]
+; GFX8-NEXT: v_or_b32_e32 v2, 0x400000, v4
; GFX8-NEXT: v_min_u32_e32 v0, 1, v0
; GFX8-NEXT: v_or_b32_e32 v0, v1, v0
; GFX8-NEXT: v_cvt_f32_i32_e32 v0, v0
-; GFX8-NEXT: v_or_b32_e32 v2, 0x400000, v6
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
; GFX8-NEXT: v_cndmask_b32_e32 v1, v5, v2, vcc
-; GFX8-NEXT: v_sub_u32_e32 v2, vcc, 32, v7
+; GFX8-NEXT: v_sub_u32_e32 v2, vcc, 32, v6
; GFX8-NEXT: v_ldexp_f32 v0, v0, v2
; GFX8-NEXT: v_bfe_u32 v2, v0, 16, 1
; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v0
; GFX8-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2
-; GFX8-NEXT: v_and_b32_e32 v3, 0xff800000, v0
-; GFX8-NEXT: v_or_b32_e32 v3, 0x400000, v3
+; GFX8-NEXT: v_or_b32_e32 v3, 0x400000, v0
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc
; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
@@ -32285,21 +31552,19 @@ define <2 x bfloat> @v_sitofp_v2i64_to_v2bf16(<2 x i64> %x) {
; GFX9-NEXT: v_ashrrev_i32_e32 v1, 31, v1
; GFX9-NEXT: v_add_u32_e32 v0, -1, v0
; GFX9-NEXT: v_add_u32_e32 v1, 32, v1
-; GFX9-NEXT: v_min_u32_e32 v7, v0, v1
-; GFX9-NEXT: v_lshlrev_b64 v[0:1], v7, v[2:3]
-; GFX9-NEXT: v_and_b32_e32 v6, 0xff800000, v4
+; GFX9-NEXT: v_min_u32_e32 v6, v0, v1
+; GFX9-NEXT: v_lshlrev_b64 v[0:1], v6, v[2:3]
+; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v4
; GFX9-NEXT: v_min_u32_e32 v0, 1, v0
; GFX9-NEXT: v_or_b32_e32 v0, v1, v0
; GFX9-NEXT: v_cvt_f32_i32_e32 v0, v0
-; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v6
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v2, vcc
-; GFX9-NEXT: v_sub_u32_e32 v2, 32, v7
+; GFX9-NEXT: v_sub_u32_e32 v2, 32, v6
; GFX9-NEXT: v_ldexp_f32 v0, v0, v2
; GFX9-NEXT: v_bfe_u32 v2, v0, 16, 1
-; GFX9-NEXT: v_and_b32_e32 v3, 0xff800000, v0
; GFX9-NEXT: v_add3_u32 v2, v2, v0, s4
-; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v3
+; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v0
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc
; GFX9-NEXT: s_mov_b32 s4, 0x7060302
@@ -32313,7 +31578,6 @@ define <2 x bfloat> @v_sitofp_v2i64_to_v2bf16(<2 x i64> %x) {
; GFX10-NEXT: v_xor_b32_e32 v5, v2, v3
; GFX10-NEXT: v_ffbh_i32_e32 v6, v1
; GFX10-NEXT: v_ffbh_i32_e32 v7, v3
-; GFX10-NEXT: s_mov_b32 s4, 0xff800000
; GFX10-NEXT: v_ashrrev_i32_e32 v4, 31, v4
; GFX10-NEXT: v_ashrrev_i32_e32 v5, 31, v5
; GFX10-NEXT: v_add_nc_u32_e32 v6, -1, v6
@@ -32336,9 +31600,9 @@ define <2 x bfloat> @v_sitofp_v2i64_to_v2bf16(<2 x i64> %x) {
; GFX10-NEXT: v_ldexp_f32 v1, v1, v3
; GFX10-NEXT: v_bfe_u32 v2, v0, 16, 1
; GFX10-NEXT: v_bfe_u32 v3, v1, 16, 1
-; GFX10-NEXT: v_and_or_b32 v4, v0, s4, 0x400000
+; GFX10-NEXT: v_or_b32_e32 v4, 0x400000, v0
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX10-NEXT: v_and_or_b32 v5, v1, s4, 0x400000
+; GFX10-NEXT: v_or_b32_e32 v5, 0x400000, v1
; GFX10-NEXT: v_add3_u32 v2, v2, v0, 0x7fff
; GFX10-NEXT: v_add3_u32 v3, v3, v1, 0x7fff
; GFX10-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc_lo
@@ -32354,9 +31618,10 @@ define <2 x bfloat> @v_sitofp_v2i64_to_v2bf16(<2 x i64> %x) {
; GFX11-NEXT: v_xor_b32_e32 v5, v2, v3
; GFX11-NEXT: v_cls_i32_e32 v6, v1
; GFX11-NEXT: v_cls_i32_e32 v7, v3
-; GFX11-NEXT: s_mov_b32 s0, 0xff800000
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX11-NEXT: v_ashrrev_i32_e32 v4, 31, v4
; GFX11-NEXT: v_ashrrev_i32_e32 v5, 31, v5
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX11-NEXT: v_add_nc_u32_e32 v6, -1, v6
; GFX11-NEXT: v_add_nc_u32_e32 v7, -1, v7
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
@@ -32385,9 +31650,9 @@ define <2 x bfloat> @v_sitofp_v2i64_to_v2bf16(<2 x i64> %x) {
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_bfe_u32 v2, v0, 16, 1
; GFX11-NEXT: v_bfe_u32 v3, v1, 16, 1
-; GFX11-NEXT: v_and_or_b32 v4, v0, s0, 0x400000
+; GFX11-NEXT: v_or_b32_e32 v4, 0x400000, v0
; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX11-NEXT: v_and_or_b32 v5, v1, s0, 0x400000
+; GFX11-NEXT: v_or_b32_e32 v5, 0x400000, v1
; GFX11-NEXT: v_add3_u32 v2, v2, v0, 0x7fff
; GFX11-NEXT: v_add3_u32 v3, v3, v1, 0x7fff
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
@@ -32515,23 +31780,22 @@ define <3 x bfloat> @v_sitofp_v3i64_to_v3bf16(<3 x i64> %x) {
; GFX8-NEXT: s_movk_i32 s4, 0x7fff
; GFX8-NEXT: v_min_u32_e32 v7, v7, v8
; GFX8-NEXT: v_add_u32_e32 v5, vcc, s4, v5
-; GFX8-NEXT: v_and_b32_e32 v6, 0xff800000, v4
; GFX8-NEXT: v_lshlrev_b64 v[0:1], v7, v[0:1]
-; GFX8-NEXT: v_or_b32_e32 v6, 0x400000, v6
+; GFX8-NEXT: v_or_b32_e32 v6, 0x400000, v4
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
; GFX8-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc
; GFX8-NEXT: v_min_u32_e32 v0, 1, v0
+; GFX8-NEXT: v_xor_b32_e32 v6, v2, v3
; GFX8-NEXT: v_or_b32_e32 v0, v1, v0
+; GFX8-NEXT: v_ffbh_i32_e32 v5, v3
+; GFX8-NEXT: v_ashrrev_i32_e32 v6, 31, v6
+; GFX8-NEXT: v_cvt_f32_i32_e32 v0, v0
+; GFX8-NEXT: v_add_u32_e32 v5, vcc, -1, v5
+; GFX8-NEXT: v_add_u32_e32 v6, vcc, 32, v6
+; GFX8-NEXT: v_min_u32_e32 v5, v5, v6
+; GFX8-NEXT: v_lshlrev_b64 v[2:3], v5, v[2:3]
; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v4
; GFX8-NEXT: v_sub_u32_e32 v4, vcc, 32, v7
-; GFX8-NEXT: v_xor_b32_e32 v7, v2, v3
-; GFX8-NEXT: v_ffbh_i32_e32 v6, v3
-; GFX8-NEXT: v_ashrrev_i32_e32 v7, 31, v7
-; GFX8-NEXT: v_cvt_f32_i32_e32 v0, v0
-; GFX8-NEXT: v_add_u32_e32 v6, vcc, -1, v6
-; GFX8-NEXT: v_add_u32_e32 v7, vcc, 32, v7
-; GFX8-NEXT: v_min_u32_e32 v6, v6, v7
-; GFX8-NEXT: v_lshlrev_b64 v[2:3], v6, v[2:3]
; GFX8-NEXT: v_ldexp_f32 v0, v0, v4
; GFX8-NEXT: v_min_u32_e32 v2, 1, v2
; GFX8-NEXT: v_bfe_u32 v4, v0, 16, 1
@@ -32539,17 +31803,15 @@ define <3 x bfloat> @v_sitofp_v3i64_to_v3bf16(<3 x i64> %x) {
; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v0
; GFX8-NEXT: v_cvt_f32_i32_e32 v2, v2
; GFX8-NEXT: v_add_u32_e32 v4, vcc, s4, v4
-; GFX8-NEXT: v_and_b32_e32 v5, 0xff800000, v0
-; GFX8-NEXT: v_or_b32_e32 v5, 0x400000, v5
+; GFX8-NEXT: v_or_b32_e32 v6, 0x400000, v0
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
-; GFX8-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc
-; GFX8-NEXT: v_sub_u32_e32 v3, vcc, 32, v6
+; GFX8-NEXT: v_cndmask_b32_e32 v0, v4, v6, vcc
+; GFX8-NEXT: v_sub_u32_e32 v3, vcc, 32, v5
; GFX8-NEXT: v_ldexp_f32 v2, v2, v3
; GFX8-NEXT: v_bfe_u32 v3, v2, 16, 1
; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v2
; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
-; GFX8-NEXT: v_and_b32_e32 v4, 0xff800000, v2
-; GFX8-NEXT: v_or_b32_e32 v4, 0x400000, v4
+; GFX8-NEXT: v_or_b32_e32 v4, 0x400000, v2
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
; GFX8-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v2
@@ -32565,30 +31827,29 @@ define <3 x bfloat> @v_sitofp_v3i64_to_v3bf16(<3 x i64> %x) {
; GFX9-NEXT: v_add_u32_e32 v6, -1, v6
; GFX9-NEXT: v_add_u32_e32 v7, 32, v7
; GFX9-NEXT: v_min_u32_e32 v6, v6, v7
-; GFX9-NEXT: v_xor_b32_e32 v8, v0, v1
; GFX9-NEXT: v_lshlrev_b64 v[4:5], v6, v[4:5]
-; GFX9-NEXT: v_ffbh_i32_e32 v7, v1
-; GFX9-NEXT: v_ashrrev_i32_e32 v8, 31, v8
-; GFX9-NEXT: v_add_u32_e32 v7, -1, v7
-; GFX9-NEXT: v_add_u32_e32 v8, 32, v8
+; GFX9-NEXT: v_xor_b32_e32 v7, v0, v1
; GFX9-NEXT: v_min_u32_e32 v4, 1, v4
-; GFX9-NEXT: v_min_u32_e32 v7, v7, v8
; GFX9-NEXT: v_or_b32_e32 v4, v5, v4
-; GFX9-NEXT: v_lshlrev_b64 v[0:1], v7, v[0:1]
+; GFX9-NEXT: v_sub_u32_e32 v5, 32, v6
+; GFX9-NEXT: v_ffbh_i32_e32 v6, v1
+; GFX9-NEXT: v_ashrrev_i32_e32 v7, 31, v7
+; GFX9-NEXT: v_add_u32_e32 v6, -1, v6
+; GFX9-NEXT: v_add_u32_e32 v7, 32, v7
+; GFX9-NEXT: v_min_u32_e32 v6, v6, v7
+; GFX9-NEXT: v_lshlrev_b64 v[0:1], v6, v[0:1]
; GFX9-NEXT: v_cvt_f32_i32_e32 v4, v4
; GFX9-NEXT: v_min_u32_e32 v0, 1, v0
; GFX9-NEXT: v_or_b32_e32 v0, v1, v0
-; GFX9-NEXT: v_sub_u32_e32 v5, 32, v6
; GFX9-NEXT: v_cvt_f32_i32_e32 v0, v0
; GFX9-NEXT: v_ldexp_f32 v4, v4, v5
; GFX9-NEXT: v_bfe_u32 v5, v4, 16, 1
; GFX9-NEXT: s_movk_i32 s4, 0x7fff
-; GFX9-NEXT: v_and_b32_e32 v6, 0xff800000, v4
; GFX9-NEXT: v_add3_u32 v5, v5, v4, s4
-; GFX9-NEXT: v_or_b32_e32 v6, 0x400000, v6
+; GFX9-NEXT: v_or_b32_e32 v7, 0x400000, v4
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
-; GFX9-NEXT: v_sub_u32_e32 v1, 32, v7
-; GFX9-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc
+; GFX9-NEXT: v_sub_u32_e32 v1, 32, v6
+; GFX9-NEXT: v_cndmask_b32_e32 v4, v5, v7, vcc
; GFX9-NEXT: v_ldexp_f32 v5, v0, v1
; GFX9-NEXT: v_bfe_u32 v0, v5, 16, 1
; GFX9-NEXT: v_xor_b32_e32 v1, v2, v3
@@ -32597,21 +31858,19 @@ define <3 x bfloat> @v_sitofp_v3i64_to_v3bf16(<3 x i64> %x) {
; GFX9-NEXT: v_ashrrev_i32_e32 v1, 31, v1
; GFX9-NEXT: v_add_u32_e32 v0, -1, v0
; GFX9-NEXT: v_add_u32_e32 v1, 32, v1
-; GFX9-NEXT: v_min_u32_e32 v8, v0, v1
-; GFX9-NEXT: v_lshlrev_b64 v[0:1], v8, v[2:3]
-; GFX9-NEXT: v_and_b32_e32 v7, 0xff800000, v5
+; GFX9-NEXT: v_min_u32_e32 v7, v0, v1
+; GFX9-NEXT: v_lshlrev_b64 v[0:1], v7, v[2:3]
+; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v5
; GFX9-NEXT: v_min_u32_e32 v0, 1, v0
; GFX9-NEXT: v_or_b32_e32 v0, v1, v0
; GFX9-NEXT: v_cvt_f32_i32_e32 v0, v0
-; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v7
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
; GFX9-NEXT: v_cndmask_b32_e32 v1, v6, v2, vcc
-; GFX9-NEXT: v_sub_u32_e32 v2, 32, v8
+; GFX9-NEXT: v_sub_u32_e32 v2, 32, v7
; GFX9-NEXT: v_ldexp_f32 v0, v0, v2
; GFX9-NEXT: v_bfe_u32 v2, v0, 16, 1
-; GFX9-NEXT: v_and_b32_e32 v3, 0xff800000, v0
; GFX9-NEXT: v_add3_u32 v2, v2, v0, s4
-; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v3
+; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v0
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc
; GFX9-NEXT: s_mov_b32 s4, 0x7060302
@@ -32638,7 +31897,6 @@ define <3 x bfloat> @v_sitofp_v3i64_to_v3bf16(<3 x i64> %x) {
; GFX10-NEXT: v_add_nc_u32_e32 v11, -1, v11
; GFX10-NEXT: v_add_nc_u32_e32 v9, 32, v9
; GFX10-NEXT: v_min_u32_e32 v8, v10, v8
-; GFX10-NEXT: s_mov_b32 s4, 0xff800000
; GFX10-NEXT: v_min_u32_e32 v6, v6, v7
; GFX10-NEXT: v_min_u32_e32 v7, v11, v9
; GFX10-NEXT: v_lshlrev_b64 v[0:1], v8, v[0:1]
@@ -32660,13 +31918,13 @@ define <3 x bfloat> @v_sitofp_v3i64_to_v3bf16(<3 x i64> %x) {
; GFX10-NEXT: v_ldexp_f32 v1, v1, v6
; GFX10-NEXT: v_ldexp_f32 v2, v2, v4
; GFX10-NEXT: v_bfe_u32 v3, v0, 16, 1
-; GFX10-NEXT: v_and_or_b32 v7, v0, s4, 0x400000
+; GFX10-NEXT: v_or_b32_e32 v7, 0x400000, v0
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
; GFX10-NEXT: v_bfe_u32 v5, v2, 16, 1
; GFX10-NEXT: v_bfe_u32 v4, v1, 16, 1
; GFX10-NEXT: v_add3_u32 v3, v3, v0, 0x7fff
-; GFX10-NEXT: v_and_or_b32 v8, v2, s4, 0x400000
-; GFX10-NEXT: v_and_or_b32 v6, v1, s4, 0x400000
+; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v2
+; GFX10-NEXT: v_or_b32_e32 v6, 0x400000, v1
; GFX10-NEXT: v_add3_u32 v5, v5, v2, 0x7fff
; GFX10-NEXT: v_add3_u32 v4, v4, v1, 0x7fff
; GFX10-NEXT: v_cndmask_b32_e32 v0, v3, v7, vcc_lo
@@ -32820,10 +32078,9 @@ define <4 x bfloat> @v_sitofp_v4i64_to_v4bf16(<4 x i64> %x) {
; GFX8-NEXT: v_ashrrev_i32_e32 v5, 31, v5
; GFX8-NEXT: v_add_u32_e32 v4, vcc, -1, v4
; GFX8-NEXT: v_add_u32_e32 v5, vcc, 32, v5
-; GFX8-NEXT: v_min_u32_e32 v11, v4, v5
-; GFX8-NEXT: v_lshlrev_b64 v[4:5], v11, v[6:7]
-; GFX8-NEXT: v_and_b32_e32 v10, 0xff800000, v8
-; GFX8-NEXT: v_or_b32_e32 v6, 0x400000, v10
+; GFX8-NEXT: v_min_u32_e32 v10, v4, v5
+; GFX8-NEXT: v_lshlrev_b64 v[4:5], v10, v[6:7]
+; GFX8-NEXT: v_or_b32_e32 v6, 0x400000, v8
; GFX8-NEXT: v_min_u32_e32 v4, 1, v4
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v8, v8
; GFX8-NEXT: v_or_b32_e32 v4, v5, v4
@@ -32836,7 +32093,7 @@ define <4 x bfloat> @v_sitofp_v4i64_to_v4bf16(<4 x i64> %x) {
; GFX8-NEXT: v_add_u32_e32 v9, vcc, 32, v9
; GFX8-NEXT: v_min_u32_e32 v8, v8, v9
; GFX8-NEXT: v_lshlrev_b64 v[0:1], v8, v[0:1]
-; GFX8-NEXT: v_sub_u32_e32 v6, vcc, 32, v11
+; GFX8-NEXT: v_sub_u32_e32 v6, vcc, 32, v10
; GFX8-NEXT: v_ldexp_f32 v4, v4, v6
; GFX8-NEXT: v_min_u32_e32 v0, 1, v0
; GFX8-NEXT: v_bfe_u32 v6, v4, 16, 1
@@ -32844,8 +32101,7 @@ define <4 x bfloat> @v_sitofp_v4i64_to_v4bf16(<4 x i64> %x) {
; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v4
; GFX8-NEXT: v_cvt_f32_i32_e32 v0, v0
; GFX8-NEXT: v_add_u32_e32 v6, vcc, s4, v6
-; GFX8-NEXT: v_and_b32_e32 v7, 0xff800000, v4
-; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v7
+; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v4
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
; GFX8-NEXT: v_cndmask_b32_e32 v4, v6, v7, vcc
; GFX8-NEXT: v_sub_u32_e32 v1, vcc, 32, v8
@@ -32858,22 +32114,20 @@ define <4 x bfloat> @v_sitofp_v4i64_to_v4bf16(<4 x i64> %x) {
; GFX8-NEXT: v_ashrrev_i32_e32 v1, 31, v1
; GFX8-NEXT: v_add_u32_e32 v0, vcc, -1, v0
; GFX8-NEXT: v_add_u32_e32 v1, vcc, 32, v1
-; GFX8-NEXT: v_min_u32_e32 v9, v0, v1
-; GFX8-NEXT: v_lshlrev_b64 v[0:1], v9, v[2:3]
-; GFX8-NEXT: v_and_b32_e32 v8, 0xff800000, v6
+; GFX8-NEXT: v_min_u32_e32 v8, v0, v1
+; GFX8-NEXT: v_lshlrev_b64 v[0:1], v8, v[2:3]
+; GFX8-NEXT: v_or_b32_e32 v2, 0x400000, v6
; GFX8-NEXT: v_min_u32_e32 v0, 1, v0
; GFX8-NEXT: v_or_b32_e32 v0, v1, v0
; GFX8-NEXT: v_cvt_f32_i32_e32 v0, v0
-; GFX8-NEXT: v_or_b32_e32 v2, 0x400000, v8
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
; GFX8-NEXT: v_cndmask_b32_e32 v1, v7, v2, vcc
-; GFX8-NEXT: v_sub_u32_e32 v2, vcc, 32, v9
+; GFX8-NEXT: v_sub_u32_e32 v2, vcc, 32, v8
; GFX8-NEXT: v_ldexp_f32 v0, v0, v2
; GFX8-NEXT: v_bfe_u32 v2, v0, 16, 1
; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v0
; GFX8-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2
-; GFX8-NEXT: v_and_b32_e32 v3, 0xff800000, v0
-; GFX8-NEXT: v_or_b32_e32 v3, 0x400000, v3
+; GFX8-NEXT: v_or_b32_e32 v3, 0x400000, v0
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc
; GFX8-NEXT: v_lshrrev_b32_e32 v4, 16, v4
@@ -32905,34 +32159,32 @@ define <4 x bfloat> @v_sitofp_v4i64_to_v4bf16(<4 x i64> %x) {
; GFX9-NEXT: v_ashrrev_i32_e32 v5, 31, v5
; GFX9-NEXT: v_add_u32_e32 v4, -1, v4
; GFX9-NEXT: v_add_u32_e32 v5, 32, v5
-; GFX9-NEXT: v_min_u32_e32 v11, v4, v5
-; GFX9-NEXT: v_lshlrev_b64 v[4:5], v11, v[6:7]
-; GFX9-NEXT: v_and_b32_e32 v10, 0xff800000, v8
-; GFX9-NEXT: v_or_b32_e32 v6, 0x400000, v10
-; GFX9-NEXT: v_min_u32_e32 v4, 1, v4
+; GFX9-NEXT: v_min_u32_e32 v10, v4, v5
+; GFX9-NEXT: v_lshlrev_b64 v[4:5], v10, v[6:7]
+; GFX9-NEXT: v_or_b32_e32 v6, 0x400000, v8
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v8, v8
+; GFX9-NEXT: v_xor_b32_e32 v8, v0, v1
+; GFX9-NEXT: v_ffbh_i32_e32 v7, v1
+; GFX9-NEXT: v_ashrrev_i32_e32 v8, 31, v8
+; GFX9-NEXT: v_add_u32_e32 v7, -1, v7
+; GFX9-NEXT: v_add_u32_e32 v8, 32, v8
+; GFX9-NEXT: v_min_u32_e32 v4, 1, v4
+; GFX9-NEXT: v_min_u32_e32 v7, v7, v8
; GFX9-NEXT: v_or_b32_e32 v4, v5, v4
-; GFX9-NEXT: v_cndmask_b32_e32 v5, v9, v6, vcc
-; GFX9-NEXT: v_xor_b32_e32 v9, v0, v1
-; GFX9-NEXT: v_ffbh_i32_e32 v8, v1
-; GFX9-NEXT: v_ashrrev_i32_e32 v9, 31, v9
-; GFX9-NEXT: v_add_u32_e32 v8, -1, v8
-; GFX9-NEXT: v_add_u32_e32 v9, 32, v9
-; GFX9-NEXT: v_min_u32_e32 v8, v8, v9
-; GFX9-NEXT: v_lshlrev_b64 v[0:1], v8, v[0:1]
+; GFX9-NEXT: v_lshlrev_b64 v[0:1], v7, v[0:1]
; GFX9-NEXT: v_cvt_f32_i32_e32 v4, v4
; GFX9-NEXT: v_min_u32_e32 v0, 1, v0
; GFX9-NEXT: v_or_b32_e32 v0, v1, v0
-; GFX9-NEXT: v_sub_u32_e32 v6, 32, v11
+; GFX9-NEXT: v_cndmask_b32_e32 v5, v9, v6, vcc
+; GFX9-NEXT: v_sub_u32_e32 v6, 32, v10
; GFX9-NEXT: v_cvt_f32_i32_e32 v0, v0
; GFX9-NEXT: v_ldexp_f32 v4, v4, v6
; GFX9-NEXT: v_bfe_u32 v6, v4, 16, 1
-; GFX9-NEXT: v_and_b32_e32 v7, 0xff800000, v4
; GFX9-NEXT: v_add3_u32 v6, v6, v4, s4
-; GFX9-NEXT: v_or_b32_e32 v7, 0x400000, v7
+; GFX9-NEXT: v_or_b32_e32 v8, 0x400000, v4
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
-; GFX9-NEXT: v_sub_u32_e32 v1, 32, v8
-; GFX9-NEXT: v_cndmask_b32_e32 v4, v6, v7, vcc
+; GFX9-NEXT: v_sub_u32_e32 v1, 32, v7
+; GFX9-NEXT: v_cndmask_b32_e32 v4, v6, v8, vcc
; GFX9-NEXT: v_ldexp_f32 v6, v0, v1
; GFX9-NEXT: v_bfe_u32 v0, v6, 16, 1
; GFX9-NEXT: v_xor_b32_e32 v1, v2, v3
@@ -32941,21 +32193,19 @@ define <4 x bfloat> @v_sitofp_v4i64_to_v4bf16(<4 x i64> %x) {
; GFX9-NEXT: v_ashrrev_i32_e32 v1, 31, v1
; GFX9-NEXT: v_add_u32_e32 v0, -1, v0
; GFX9-NEXT: v_add_u32_e32 v1, 32, v1
-; GFX9-NEXT: v_min_u32_e32 v9, v0, v1
-; GFX9-NEXT: v_lshlrev_b64 v[0:1], v9, v[2:3]
-; GFX9-NEXT: v_and_b32_e32 v8, 0xff800000, v6
+; GFX9-NEXT: v_min_u32_e32 v8, v0, v1
+; GFX9-NEXT: v_lshlrev_b64 v[0:1], v8, v[2:3]
+; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v6
; GFX9-NEXT: v_min_u32_e32 v0, 1, v0
; GFX9-NEXT: v_or_b32_e32 v0, v1, v0
; GFX9-NEXT: v_cvt_f32_i32_e32 v0, v0
-; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v8
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
; GFX9-NEXT: v_cndmask_b32_e32 v1, v7, v2, vcc
-; GFX9-NEXT: v_sub_u32_e32 v2, 32, v9
+; GFX9-NEXT: v_sub_u32_e32 v2, 32, v8
; GFX9-NEXT: v_ldexp_f32 v0, v0, v2
; GFX9-NEXT: v_bfe_u32 v2, v0, 16, 1
-; GFX9-NEXT: v_and_b32_e32 v3, 0xff800000, v0
; GFX9-NEXT: v_add3_u32 v2, v2, v0, s4
-; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v3
+; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v0
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc
; GFX9-NEXT: s_mov_b32 s4, 0x7060302
@@ -32989,7 +32239,6 @@ define <4 x bfloat> @v_sitofp_v4i64_to_v4bf16(<4 x i64> %x) {
; GFX10-NEXT: v_lshlrev_b64 v[4:5], v8, v[4:5]
; GFX10-NEXT: v_add_nc_u32_e32 v9, 32, v9
; GFX10-NEXT: v_add_nc_u32_e32 v13, -1, v13
-; GFX10-NEXT: s_mov_b32 s4, 0xff800000
; GFX10-NEXT: v_lshlrev_b64 v[6:7], v10, v[6:7]
; GFX10-NEXT: v_min_u32_e32 v9, v12, v9
; GFX10-NEXT: v_min_u32_e32 v11, v13, v14
@@ -33015,21 +32264,21 @@ define <4 x bfloat> @v_sitofp_v4i64_to_v4bf16(<4 x i64> %x) {
; GFX10-NEXT: v_bfe_u32 v7, v2, 16, 1
; GFX10-NEXT: v_ldexp_f32 v3, v3, v4
; GFX10-NEXT: v_ldexp_f32 v0, v0, v5
-; GFX10-NEXT: v_and_or_b32 v5, v2, s4, 0x400000
+; GFX10-NEXT: v_or_b32_e32 v5, 0x400000, v2
; GFX10-NEXT: v_ldexp_f32 v1, v1, v6
; GFX10-NEXT: v_add3_u32 v4, v7, v2, 0x7fff
; GFX10-NEXT: v_bfe_u32 v6, v3, 16, 1
; GFX10-NEXT: v_bfe_u32 v7, v0, 16, 1
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
; GFX10-NEXT: v_bfe_u32 v8, v1, 16, 1
-; GFX10-NEXT: v_and_or_b32 v9, v3, s4, 0x400000
+; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v3
; GFX10-NEXT: v_cndmask_b32_e32 v2, v4, v5, vcc_lo
; GFX10-NEXT: v_add3_u32 v4, v6, v3, 0x7fff
; GFX10-NEXT: v_add3_u32 v5, v7, v0, 0x7fff
-; GFX10-NEXT: v_and_or_b32 v6, v0, s4, 0x400000
+; GFX10-NEXT: v_or_b32_e32 v6, 0x400000, v0
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
; GFX10-NEXT: v_add3_u32 v7, v8, v1, 0x7fff
-; GFX10-NEXT: v_and_or_b32 v8, v1, s4, 0x400000
+; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v1
; GFX10-NEXT: v_cndmask_b32_e32 v0, v5, v6, vcc_lo
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
; GFX10-NEXT: v_cndmask_b32_e32 v1, v7, v8, vcc_lo
@@ -33065,16 +32314,15 @@ define <4 x bfloat> @v_sitofp_v4i64_to_v4bf16(<4 x i64> %x) {
; GFX11-NEXT: v_lshlrev_b64 v[4:5], v8, v[4:5]
; GFX11-NEXT: v_add_nc_u32_e32 v9, 32, v9
; GFX11-NEXT: v_add_nc_u32_e32 v13, -1, v13
-; GFX11-NEXT: s_mov_b32 s0, 0xff800000
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX11-NEXT: v_lshlrev_b64 v[6:7], v10, v[6:7]
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX11-NEXT: v_min_u32_e32 v9, v12, v9
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
; GFX11-NEXT: v_min_u32_e32 v11, v13, v14
; GFX11-NEXT: v_min_u32_e32 v4, 1, v4
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX11-NEXT: v_lshlrev_b64 v[0:1], v9, v[0:1]
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX11-NEXT: v_lshlrev_b64 v[2:3], v11, v[2:3]
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3)
; GFX11-NEXT: v_or_b32_e32 v4, v5, v4
; GFX11-NEXT: v_min_u32_e32 v5, 1, v6
; GFX11-NEXT: v_sub_nc_u32_e32 v6, 32, v8
@@ -33096,21 +32344,21 @@ define <4 x bfloat> @v_sitofp_v4i64_to_v4bf16(<4 x i64> %x) {
; GFX11-NEXT: v_bfe_u32 v7, v2, 16, 1
; GFX11-NEXT: v_ldexp_f32 v3, v3, v4
; GFX11-NEXT: v_ldexp_f32 v0, v0, v5
-; GFX11-NEXT: v_and_or_b32 v5, v2, s0, 0x400000
+; GFX11-NEXT: v_or_b32_e32 v5, 0x400000, v2
; GFX11-NEXT: v_ldexp_f32 v1, v1, v6
; GFX11-NEXT: v_add3_u32 v4, v7, v2, 0x7fff
; GFX11-NEXT: v_bfe_u32 v6, v3, 16, 1
; GFX11-NEXT: v_bfe_u32 v7, v0, 16, 1
; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
; GFX11-NEXT: v_bfe_u32 v8, v1, 16, 1
-; GFX11-NEXT: v_and_or_b32 v9, v3, s0, 0x400000
+; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v3
; GFX11-NEXT: v_cndmask_b32_e32 v2, v4, v5, vcc_lo
; GFX11-NEXT: v_add3_u32 v4, v6, v3, 0x7fff
; GFX11-NEXT: v_add3_u32 v5, v7, v0, 0x7fff
-; GFX11-NEXT: v_and_or_b32 v6, v0, s0, 0x400000
+; GFX11-NEXT: v_or_b32_e32 v6, 0x400000, v0
; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
; GFX11-NEXT: v_add3_u32 v7, v8, v1, 0x7fff
-; GFX11-NEXT: v_and_or_b32 v8, v1, s0, 0x400000
+; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v1
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3)
; GFX11-NEXT: v_cndmask_b32_e32 v0, v5, v6, vcc_lo
; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
@@ -33148,9 +32396,8 @@ define bfloat @v_uitofp_i16_to_bf16(i16 %x) {
; GFX8-NEXT: v_cvt_f32_u32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
; GFX8-NEXT: v_bfe_u32 v1, v0, 16, 1
; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v0
-; GFX8-NEXT: v_and_b32_e32 v2, 0xff800000, v0
; GFX8-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1
-; GFX8-NEXT: v_or_b32_e32 v2, 0x400000, v2
+; GFX8-NEXT: v_or_b32_e32 v2, 0x400000, v0
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
; GFX8-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
@@ -33162,9 +32409,8 @@ define bfloat @v_uitofp_i16_to_bf16(i16 %x) {
; GFX9-NEXT: v_cvt_f32_u32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
; GFX9-NEXT: s_movk_i32 s4, 0x7fff
; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1
-; GFX9-NEXT: v_and_b32_e32 v2, 0xff800000, v0
+; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v0
; GFX9-NEXT: v_add3_u32 v1, v1, v0, s4
-; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v2
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0
@@ -33174,9 +32420,8 @@ define bfloat @v_uitofp_i16_to_bf16(i16 %x) {
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_cvt_f32_u32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
-; GFX10-NEXT: s_mov_b32 s4, 0xff800000
; GFX10-NEXT: v_bfe_u32 v1, v0, 16, 1
-; GFX10-NEXT: v_and_or_b32 v2, v0, s4, 0x400000
+; GFX10-NEXT: v_or_b32_e32 v2, 0x400000, v0
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
; GFX10-NEXT: v_add3_u32 v1, v1, v0, 0x7fff
; GFX10-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo
@@ -33187,11 +32432,10 @@ define bfloat @v_uitofp_i16_to_bf16(i16 %x) {
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX11-NEXT: s_mov_b32 s0, 0xff800000
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_cvt_f32_u32_e32 v0, v0
; GFX11-NEXT: v_bfe_u32 v1, v0, 16, 1
-; GFX11-NEXT: v_and_or_b32 v2, v0, s0, 0x400000
+; GFX11-NEXT: v_or_b32_e32 v2, 0x400000, v0
; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_add3_u32 v1, v1, v0, 0x7fff
@@ -33233,16 +32477,14 @@ define <2 x bfloat> @v_uitofp_v2i16_to_v2bf16(<2 x i16> %x) {
; GFX8-NEXT: v_cvt_f32_u32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
; GFX8-NEXT: v_bfe_u32 v2, v1, 16, 1
; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v1
-; GFX8-NEXT: v_and_b32_e32 v3, 0xff800000, v1
; GFX8-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2
-; GFX8-NEXT: v_or_b32_e32 v3, 0x400000, v3
+; GFX8-NEXT: v_or_b32_e32 v3, 0x400000, v1
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
; GFX8-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc
; GFX8-NEXT: v_bfe_u32 v2, v0, 16, 1
; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v0
; GFX8-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2
-; GFX8-NEXT: v_and_b32_e32 v3, 0xff800000, v0
-; GFX8-NEXT: v_or_b32_e32 v3, 0x400000, v3
+; GFX8-NEXT: v_or_b32_e32 v3, 0x400000, v0
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc
; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
@@ -33256,15 +32498,13 @@ define <2 x bfloat> @v_uitofp_v2i16_to_v2bf16(<2 x i16> %x) {
; GFX9-NEXT: v_cvt_f32_u32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
; GFX9-NEXT: s_movk_i32 s4, 0x7fff
; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1
-; GFX9-NEXT: v_and_b32_e32 v3, 0xff800000, v1
+; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1
; GFX9-NEXT: v_add3_u32 v2, v2, v1, s4
-; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v3
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc
; GFX9-NEXT: v_bfe_u32 v2, v0, 16, 1
-; GFX9-NEXT: v_and_b32_e32 v3, 0xff800000, v0
; GFX9-NEXT: v_add3_u32 v2, v2, v0, s4
-; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v3
+; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v0
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc
; GFX9-NEXT: s_mov_b32 s4, 0x7060302
@@ -33276,12 +32516,11 @@ define <2 x bfloat> @v_uitofp_v2i16_to_v2bf16(<2 x i16> %x) {
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_cvt_f32_u32_sdwa v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
; GFX10-NEXT: v_cvt_f32_u32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX10-NEXT: s_mov_b32 s4, 0xff800000
; GFX10-NEXT: v_bfe_u32 v2, v1, 16, 1
; GFX10-NEXT: v_bfe_u32 v3, v0, 16, 1
-; GFX10-NEXT: v_and_or_b32 v4, v1, s4, 0x400000
+; GFX10-NEXT: v_or_b32_e32 v4, 0x400000, v1
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
-; GFX10-NEXT: v_and_or_b32 v5, v0, s4, 0x400000
+; GFX10-NEXT: v_or_b32_e32 v5, 0x400000, v0
; GFX10-NEXT: v_add3_u32 v2, v2, v1, 0x7fff
; GFX10-NEXT: v_add3_u32 v3, v3, v0, 0x7fff
; GFX10-NEXT: v_cndmask_b32_e32 v1, v2, v4, vcc_lo
@@ -33295,16 +32534,15 @@ define <2 x bfloat> @v_uitofp_v2i16_to_v2bf16(<2 x i16> %x) {
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v0
; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GFX11-NEXT: s_mov_b32 s0, 0xff800000
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_cvt_f32_u32_e32 v1, v1
; GFX11-NEXT: v_cvt_f32_u32_e32 v0, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_bfe_u32 v2, v1, 16, 1
; GFX11-NEXT: v_bfe_u32 v3, v0, 16, 1
-; GFX11-NEXT: v_and_or_b32 v4, v1, s0, 0x400000
+; GFX11-NEXT: v_or_b32_e32 v4, 0x400000, v1
; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
-; GFX11-NEXT: v_and_or_b32 v5, v0, s0, 0x400000
+; GFX11-NEXT: v_or_b32_e32 v5, 0x400000, v0
; GFX11-NEXT: v_add3_u32 v2, v2, v1, 0x7fff
; GFX11-NEXT: v_add3_u32 v3, v3, v0, 0x7fff
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
@@ -33357,22 +32595,19 @@ define <3 x bfloat> @v_uitofp_v3i16_to_v3bf16(<3 x i16> %x) {
; GFX8-NEXT: v_bfe_u32 v2, v1, 16, 1
; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v1
; GFX8-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2
-; GFX8-NEXT: v_and_b32_e32 v3, 0xff800000, v1
-; GFX8-NEXT: v_or_b32_e32 v3, 0x400000, v3
+; GFX8-NEXT: v_or_b32_e32 v3, 0x400000, v1
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
; GFX8-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc
; GFX8-NEXT: v_bfe_u32 v2, v4, 16, 1
; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v4
; GFX8-NEXT: v_add_u32_e32 v2, vcc, s4, v2
-; GFX8-NEXT: v_and_b32_e32 v3, 0xff800000, v4
-; GFX8-NEXT: v_or_b32_e32 v3, 0x400000, v3
+; GFX8-NEXT: v_or_b32_e32 v3, 0x400000, v4
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc
; GFX8-NEXT: v_bfe_u32 v3, v0, 16, 1
; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v0
; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
-; GFX8-NEXT: v_and_b32_e32 v4, 0xff800000, v0
-; GFX8-NEXT: v_or_b32_e32 v4, 0x400000, v4
+; GFX8-NEXT: v_or_b32_e32 v4, 0x400000, v0
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
; GFX8-NEXT: v_cndmask_b32_e32 v0, v3, v4, vcc
; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
@@ -33384,25 +32619,22 @@ define <3 x bfloat> @v_uitofp_v3i16_to_v3bf16(<3 x i16> %x) {
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_cvt_f32_u32_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
-; GFX9-NEXT: v_cvt_f32_u32_sdwa v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
+; GFX9-NEXT: v_cvt_f32_u32_sdwa v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
; GFX9-NEXT: s_movk_i32 s4, 0x7fff
; GFX9-NEXT: v_cvt_f32_u32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1
-; GFX9-NEXT: v_and_b32_e32 v3, 0xff800000, v1
-; GFX9-NEXT: v_add3_u32 v2, v2, v1, s4
-; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v3
+; GFX9-NEXT: v_bfe_u32 v3, v1, 16, 1
+; GFX9-NEXT: v_add3_u32 v3, v3, v1, s4
+; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v1
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
-; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc
-; GFX9-NEXT: v_bfe_u32 v2, v4, 16, 1
-; GFX9-NEXT: v_and_b32_e32 v3, 0xff800000, v4
-; GFX9-NEXT: v_add3_u32 v2, v2, v4, s4
-; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v3
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
-; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc
+; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1
+; GFX9-NEXT: v_add3_u32 v3, v3, v2, s4
+; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2
+; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
+; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
; GFX9-NEXT: v_bfe_u32 v3, v0, 16, 1
-; GFX9-NEXT: v_and_b32_e32 v4, 0xff800000, v0
; GFX9-NEXT: v_add3_u32 v3, v3, v0, s4
-; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v4
+; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v0
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v4, vcc
; GFX9-NEXT: s_mov_b32 s4, 0x7060302
@@ -33415,17 +32647,16 @@ define <3 x bfloat> @v_uitofp_v3i16_to_v3bf16(<3 x i16> %x) {
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_cvt_f32_u32_sdwa v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
; GFX10-NEXT: v_cvt_f32_u32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX10-NEXT: s_mov_b32 s4, 0xff800000
; GFX10-NEXT: v_cvt_f32_u32_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
; GFX10-NEXT: v_bfe_u32 v3, v2, 16, 1
; GFX10-NEXT: v_bfe_u32 v5, v0, 16, 1
-; GFX10-NEXT: v_and_or_b32 v7, v2, s4, 0x400000
+; GFX10-NEXT: v_or_b32_e32 v7, 0x400000, v2
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
; GFX10-NEXT: v_bfe_u32 v4, v1, 16, 1
; GFX10-NEXT: v_add3_u32 v3, v3, v2, 0x7fff
-; GFX10-NEXT: v_and_or_b32 v8, v0, s4, 0x400000
+; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v0
; GFX10-NEXT: v_add3_u32 v5, v5, v0, 0x7fff
-; GFX10-NEXT: v_and_or_b32 v6, v1, s4, 0x400000
+; GFX10-NEXT: v_or_b32_e32 v6, 0x400000, v1
; GFX10-NEXT: v_add3_u32 v4, v4, v1, 0x7fff
; GFX10-NEXT: v_cndmask_b32_e32 v2, v3, v7, vcc_lo
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
@@ -33484,30 +32715,26 @@ define <4 x bfloat> @v_uitofp_v4i16_to_v4bf16(<4 x i16> %x) {
; GFX8-NEXT: v_bfe_u32 v3, v2, 16, 1
; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v2
; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
-; GFX8-NEXT: v_and_b32_e32 v4, 0xff800000, v2
-; GFX8-NEXT: v_or_b32_e32 v4, 0x400000, v4
+; GFX8-NEXT: v_or_b32_e32 v4, 0x400000, v2
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
; GFX8-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
; GFX8-NEXT: v_bfe_u32 v3, v1, 16, 1
; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v1
; GFX8-NEXT: v_add_u32_e32 v3, vcc, s4, v3
-; GFX8-NEXT: v_and_b32_e32 v4, 0xff800000, v1
-; GFX8-NEXT: v_or_b32_e32 v4, 0x400000, v4
+; GFX8-NEXT: v_or_b32_e32 v4, 0x400000, v1
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
; GFX8-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc
; GFX8-NEXT: v_bfe_u32 v3, v5, 16, 1
; GFX8-NEXT: v_cvt_f32_u32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v5
; GFX8-NEXT: v_add_u32_e32 v3, vcc, s4, v3
-; GFX8-NEXT: v_and_b32_e32 v4, 0xff800000, v5
-; GFX8-NEXT: v_or_b32_e32 v4, 0x400000, v4
+; GFX8-NEXT: v_or_b32_e32 v4, 0x400000, v5
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
; GFX8-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc
; GFX8-NEXT: v_bfe_u32 v4, v0, 16, 1
; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v0
; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4
-; GFX8-NEXT: v_and_b32_e32 v5, 0xff800000, v0
-; GFX8-NEXT: v_or_b32_e32 v5, 0x400000, v5
+; GFX8-NEXT: v_or_b32_e32 v5, 0x400000, v0
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
; GFX8-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc
; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1
@@ -33520,32 +32747,28 @@ define <4 x bfloat> @v_uitofp_v4i16_to_v4bf16(<4 x i16> %x) {
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_cvt_f32_u32_sdwa v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
-; GFX9-NEXT: v_cvt_f32_u32_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
; GFX9-NEXT: s_movk_i32 s4, 0x7fff
-; GFX9-NEXT: v_cvt_f32_u32_sdwa v5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
+; GFX9-NEXT: v_cvt_f32_u32_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1
-; GFX9-NEXT: v_and_b32_e32 v4, 0xff800000, v2
; GFX9-NEXT: v_add3_u32 v3, v3, v2, s4
-; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v4
+; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
-; GFX9-NEXT: v_bfe_u32 v3, v1, 16, 1
-; GFX9-NEXT: v_and_b32_e32 v4, 0xff800000, v1
+; GFX9-NEXT: v_cvt_f32_u32_sdwa v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
; GFX9-NEXT: v_cvt_f32_u32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX9-NEXT: v_bfe_u32 v3, v1, 16, 1
; GFX9-NEXT: v_add3_u32 v3, v3, v1, s4
-; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v4
+; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v1
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
-; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc
-; GFX9-NEXT: v_bfe_u32 v3, v5, 16, 1
-; GFX9-NEXT: v_and_b32_e32 v4, 0xff800000, v5
-; GFX9-NEXT: v_add3_u32 v3, v3, v5, s4
-; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v4
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
-; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc
+; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc
+; GFX9-NEXT: v_bfe_u32 v3, v4, 16, 1
+; GFX9-NEXT: v_add3_u32 v3, v3, v4, s4
+; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v4
+; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
; GFX9-NEXT: v_bfe_u32 v4, v0, 16, 1
-; GFX9-NEXT: v_and_b32_e32 v5, 0xff800000, v0
+; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc
; GFX9-NEXT: v_add3_u32 v4, v4, v0, s4
-; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v5
+; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v0
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc
; GFX9-NEXT: s_mov_b32 s4, 0x7060302
@@ -33558,23 +32781,22 @@ define <4 x bfloat> @v_uitofp_v4i16_to_v4bf16(<4 x i16> %x) {
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_cvt_f32_u32_sdwa v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
; GFX10-NEXT: v_cvt_f32_u32_sdwa v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
-; GFX10-NEXT: s_mov_b32 s4, 0xff800000
; GFX10-NEXT: v_cvt_f32_u32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
; GFX10-NEXT: v_cvt_f32_u32_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
; GFX10-NEXT: v_bfe_u32 v4, v2, 16, 1
-; GFX10-NEXT: v_and_or_b32 v5, v2, s4, 0x400000
+; GFX10-NEXT: v_or_b32_e32 v5, 0x400000, v2
; GFX10-NEXT: v_bfe_u32 v8, v3, 16, 1
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
-; GFX10-NEXT: v_and_or_b32 v9, v3, s4, 0x400000
+; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v3
; GFX10-NEXT: v_add3_u32 v4, v4, v2, 0x7fff
; GFX10-NEXT: v_bfe_u32 v10, v0, 16, 1
; GFX10-NEXT: v_add3_u32 v8, v8, v3, 0x7fff
; GFX10-NEXT: v_bfe_u32 v6, v1, 16, 1
-; GFX10-NEXT: v_and_or_b32 v11, v0, s4, 0x400000
+; GFX10-NEXT: v_or_b32_e32 v11, 0x400000, v0
; GFX10-NEXT: v_cndmask_b32_e32 v2, v4, v5, vcc_lo
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
; GFX10-NEXT: v_add3_u32 v10, v10, v0, 0x7fff
-; GFX10-NEXT: v_and_or_b32 v7, v1, s4, 0x400000
+; GFX10-NEXT: v_or_b32_e32 v7, 0x400000, v1
; GFX10-NEXT: v_add3_u32 v6, v6, v1, 0x7fff
; GFX10-NEXT: v_cndmask_b32_e32 v3, v8, v9, vcc_lo
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
@@ -33589,17 +32811,16 @@ define <4 x bfloat> @v_uitofp_v4i16_to_v4bf16(<4 x i16> %x) {
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_and_b32_e32 v2, 0xffff, v1
-; GFX11-NEXT: s_mov_b32 s0, 0xff800000
; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_cvt_f32_u32_e32 v2, v2
; GFX11-NEXT: v_cvt_f32_u32_e32 v1, v1
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4)
; GFX11-NEXT: v_bfe_u32 v4, v2, 16, 1
-; GFX11-NEXT: v_and_or_b32 v5, v2, s0, 0x400000
+; GFX11-NEXT: v_or_b32_e32 v5, 0x400000, v2
; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
; GFX11-NEXT: v_bfe_u32 v6, v1, 16, 1
-; GFX11-NEXT: v_and_or_b32 v7, v1, s0, 0x400000
+; GFX11-NEXT: v_or_b32_e32 v7, 0x400000, v1
; GFX11-NEXT: v_add3_u32 v4, v4, v2, 0x7fff
; GFX11-NEXT: v_and_b32_e32 v3, 0xffff, v0
; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0
@@ -33610,11 +32831,11 @@ define <4 x bfloat> @v_uitofp_v4i16_to_v4bf16(<4 x i16> %x) {
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_cvt_f32_u32_e32 v0, v0
; GFX11-NEXT: v_bfe_u32 v8, v3, 16, 1
-; GFX11-NEXT: v_and_or_b32 v9, v3, s0, 0x400000
+; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v3
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_4)
; GFX11-NEXT: v_bfe_u32 v10, v0, 16, 1
; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
-; GFX11-NEXT: v_and_or_b32 v11, v0, s0, 0x400000
+; GFX11-NEXT: v_or_b32_e32 v11, 0x400000, v0
; GFX11-NEXT: v_add3_u32 v8, v8, v3, 0x7fff
; GFX11-NEXT: v_add3_u32 v10, v10, v0, 0x7fff
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
@@ -33652,9 +32873,8 @@ define bfloat @v_uitofp_i32_to_bf16(i32 %x) {
; GFX8-NEXT: v_cvt_f32_u32_e32 v0, v0
; GFX8-NEXT: v_bfe_u32 v2, v0, 16, 1
; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v0
-; GFX8-NEXT: v_and_b32_e32 v1, 0xff800000, v0
; GFX8-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2
-; GFX8-NEXT: v_or_b32_e32 v1, 0x400000, v1
+; GFX8-NEXT: v_or_b32_e32 v1, 0x400000, v0
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
@@ -33666,9 +32886,8 @@ define bfloat @v_uitofp_i32_to_bf16(i32 %x) {
; GFX9-NEXT: v_cvt_f32_u32_e32 v0, v0
; GFX9-NEXT: s_movk_i32 s4, 0x7fff
; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1
-; GFX9-NEXT: v_and_b32_e32 v2, 0xff800000, v0
+; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v0
; GFX9-NEXT: v_add3_u32 v1, v1, v0, s4
-; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v2
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0
@@ -33678,9 +32897,8 @@ define bfloat @v_uitofp_i32_to_bf16(i32 %x) {
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_cvt_f32_u32_e32 v0, v0
-; GFX10-NEXT: s_mov_b32 s4, 0xff800000
; GFX10-NEXT: v_bfe_u32 v1, v0, 16, 1
-; GFX10-NEXT: v_and_or_b32 v2, v0, s4, 0x400000
+; GFX10-NEXT: v_or_b32_e32 v2, 0x400000, v0
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
; GFX10-NEXT: v_add3_u32 v1, v1, v0, 0x7fff
; GFX10-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo
@@ -33691,10 +32909,9 @@ define bfloat @v_uitofp_i32_to_bf16(i32 %x) {
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_cvt_f32_u32_e32 v0, v0
-; GFX11-NEXT: s_mov_b32 s0, 0xff800000
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
; GFX11-NEXT: v_bfe_u32 v1, v0, 16, 1
-; GFX11-NEXT: v_and_or_b32 v2, v0, s0, 0x400000
+; GFX11-NEXT: v_or_b32_e32 v2, 0x400000, v0
; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
; GFX11-NEXT: v_add3_u32 v1, v1, v0, 0x7fff
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
@@ -33731,16 +32948,14 @@ define <2 x bfloat> @v_uitofp_v2i32_to_v2bf16(<2 x i32> %x) {
; GFX8-NEXT: v_cvt_f32_u32_e32 v1, v1
; GFX8-NEXT: v_bfe_u32 v3, v0, 16, 1
; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v0
-; GFX8-NEXT: v_and_b32_e32 v2, 0xff800000, v0
; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
-; GFX8-NEXT: v_or_b32_e32 v2, 0x400000, v2
+; GFX8-NEXT: v_or_b32_e32 v2, 0x400000, v0
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
; GFX8-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc
; GFX8-NEXT: v_bfe_u32 v3, v1, 16, 1
; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v1
-; GFX8-NEXT: v_and_b32_e32 v2, 0xff800000, v1
; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
-; GFX8-NEXT: v_or_b32_e32 v2, 0x400000, v2
+; GFX8-NEXT: v_or_b32_e32 v2, 0x400000, v1
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
; GFX8-NEXT: v_cndmask_b32_e32 v1, v3, v2, vcc
; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1
@@ -33754,15 +32969,13 @@ define <2 x bfloat> @v_uitofp_v2i32_to_v2bf16(<2 x i32> %x) {
; GFX9-NEXT: v_cvt_f32_u32_e32 v1, v1
; GFX9-NEXT: s_movk_i32 s4, 0x7fff
; GFX9-NEXT: v_bfe_u32 v2, v0, 16, 1
-; GFX9-NEXT: v_and_b32_e32 v3, 0xff800000, v0
; GFX9-NEXT: v_add3_u32 v2, v2, v0, s4
-; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v3
+; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v0
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc
; GFX9-NEXT: v_bfe_u32 v2, v1, 16, 1
-; GFX9-NEXT: v_and_b32_e32 v3, 0xff800000, v1
; GFX9-NEXT: v_add3_u32 v2, v2, v1, s4
-; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v3
+; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v1
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc
; GFX9-NEXT: s_mov_b32 s4, 0x7060302
@@ -33774,12 +32987,11 @@ define <2 x bfloat> @v_uitofp_v2i32_to_v2bf16(<2 x i32> %x) {
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_cvt_f32_u32_e32 v0, v0
; GFX10-NEXT: v_cvt_f32_u32_e32 v1, v1
-; GFX10-NEXT: s_mov_b32 s4, 0xff800000
; GFX10-NEXT: v_bfe_u32 v2, v0, 16, 1
; GFX10-NEXT: v_bfe_u32 v3, v1, 16, 1
-; GFX10-NEXT: v_and_or_b32 v4, v0, s4, 0x400000
+; GFX10-NEXT: v_or_b32_e32 v4, 0x400000, v0
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX10-NEXT: v_and_or_b32 v5, v1, s4, 0x400000
+; GFX10-NEXT: v_or_b32_e32 v5, 0x400000, v1
; GFX10-NEXT: v_add3_u32 v2, v2, v0, 0x7fff
; GFX10-NEXT: v_add3_u32 v3, v3, v1, 0x7fff
; GFX10-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc_lo
@@ -33793,13 +33005,12 @@ define <2 x bfloat> @v_uitofp_v2i32_to_v2bf16(<2 x i32> %x) {
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_cvt_f32_u32_e32 v0, v0
; GFX11-NEXT: v_cvt_f32_u32_e32 v1, v1
-; GFX11-NEXT: s_mov_b32 s0, 0xff800000
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_bfe_u32 v2, v0, 16, 1
; GFX11-NEXT: v_bfe_u32 v3, v1, 16, 1
-; GFX11-NEXT: v_and_or_b32 v4, v0, s0, 0x400000
+; GFX11-NEXT: v_or_b32_e32 v4, 0x400000, v0
; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX11-NEXT: v_and_or_b32 v5, v1, s0, 0x400000
+; GFX11-NEXT: v_or_b32_e32 v5, 0x400000, v1
; GFX11-NEXT: v_add3_u32 v2, v2, v0, 0x7fff
; GFX11-NEXT: v_add3_u32 v3, v3, v1, 0x7fff
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
@@ -33844,23 +33055,20 @@ define <3 x bfloat> @v_uitofp_v3i32_to_v3bf16(<3 x i32> %x) {
; GFX8-NEXT: v_cvt_f32_u32_e32 v1, v1
; GFX8-NEXT: v_bfe_u32 v4, v2, 16, 1
; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v2
-; GFX8-NEXT: v_and_b32_e32 v3, 0xff800000, v2
; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4
-; GFX8-NEXT: v_or_b32_e32 v3, 0x400000, v3
+; GFX8-NEXT: v_or_b32_e32 v3, 0x400000, v2
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
; GFX8-NEXT: v_cndmask_b32_e32 v2, v4, v3, vcc
; GFX8-NEXT: v_bfe_u32 v4, v0, 16, 1
; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v0
-; GFX8-NEXT: v_and_b32_e32 v3, 0xff800000, v0
; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4
-; GFX8-NEXT: v_or_b32_e32 v3, 0x400000, v3
+; GFX8-NEXT: v_or_b32_e32 v3, 0x400000, v0
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
; GFX8-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc
; GFX8-NEXT: v_bfe_u32 v4, v1, 16, 1
; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v1
-; GFX8-NEXT: v_and_b32_e32 v3, 0xff800000, v1
; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4
-; GFX8-NEXT: v_or_b32_e32 v3, 0x400000, v3
+; GFX8-NEXT: v_or_b32_e32 v3, 0x400000, v1
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
; GFX8-NEXT: v_cndmask_b32_e32 v1, v4, v3, vcc
; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v2
@@ -33877,21 +33085,18 @@ define <3 x bfloat> @v_uitofp_v3i32_to_v3bf16(<3 x i32> %x) {
; GFX9-NEXT: s_movk_i32 s4, 0x7fff
; GFX9-NEXT: v_cvt_f32_u32_e32 v1, v1
; GFX9-NEXT: v_bfe_u32 v3, v2, 16, 1
-; GFX9-NEXT: v_and_b32_e32 v4, 0xff800000, v2
; GFX9-NEXT: v_add3_u32 v3, v3, v2, s4
-; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v4
+; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v2
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
; GFX9-NEXT: v_bfe_u32 v3, v0, 16, 1
-; GFX9-NEXT: v_and_b32_e32 v4, 0xff800000, v0
; GFX9-NEXT: v_add3_u32 v3, v3, v0, s4
-; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v4
+; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v0
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v4, vcc
; GFX9-NEXT: v_bfe_u32 v3, v1, 16, 1
-; GFX9-NEXT: v_and_b32_e32 v4, 0xff800000, v1
; GFX9-NEXT: v_add3_u32 v3, v3, v1, s4
-; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v4
+; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v1
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc
; GFX9-NEXT: s_mov_b32 s4, 0x7060302
@@ -33904,24 +33109,23 @@ define <3 x bfloat> @v_uitofp_v3i32_to_v3bf16(<3 x i32> %x) {
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_cvt_f32_u32_e32 v0, v0
; GFX10-NEXT: v_cvt_f32_u32_e32 v1, v1
-; GFX10-NEXT: s_mov_b32 s4, 0xff800000
; GFX10-NEXT: v_cvt_f32_u32_e32 v2, v2
; GFX10-NEXT: v_bfe_u32 v3, v0, 16, 1
; GFX10-NEXT: v_bfe_u32 v5, v1, 16, 1
-; GFX10-NEXT: v_and_or_b32 v7, v0, s4, 0x400000
+; GFX10-NEXT: v_or_b32_e32 v7, 0x400000, v0
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX10-NEXT: v_bfe_u32 v6, v2, 16, 1
+; GFX10-NEXT: v_bfe_u32 v4, v2, 16, 1
; GFX10-NEXT: v_add3_u32 v3, v3, v0, 0x7fff
-; GFX10-NEXT: v_and_or_b32 v8, v1, s4, 0x400000
+; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v1
; GFX10-NEXT: v_add3_u32 v5, v5, v1, 0x7fff
-; GFX10-NEXT: v_and_or_b32 v4, v2, s4, 0x400000
-; GFX10-NEXT: v_add3_u32 v6, v6, v2, 0x7fff
+; GFX10-NEXT: v_or_b32_e32 v6, 0x400000, v2
+; GFX10-NEXT: v_add3_u32 v4, v4, v2, 0x7fff
; GFX10-NEXT: v_cndmask_b32_e32 v0, v3, v7, vcc_lo
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
; GFX10-NEXT: v_cndmask_b32_e32 v1, v5, v8, vcc_lo
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
; GFX10-NEXT: v_perm_b32 v0, v1, v0, 0x7060302
-; GFX10-NEXT: v_cndmask_b32_e32 v2, v6, v4, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e32 v2, v4, v6, vcc_lo
; GFX10-NEXT: v_alignbit_b32 v1, s4, v2, 16
; GFX10-NEXT: s_setpc_b64 s[30:31]
%op = uitofp <3 x i32> %x to <3 x bfloat>
@@ -33964,31 +33168,27 @@ define <4 x bfloat> @v_uitofp_v4i32_to_v4bf16(<4 x i32> %x) {
; GFX8-NEXT: s_movk_i32 s4, 0x7fff
; GFX8-NEXT: v_bfe_u32 v5, v2, 16, 1
; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v2
-; GFX8-NEXT: v_and_b32_e32 v4, 0xff800000, v2
; GFX8-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5
-; GFX8-NEXT: v_or_b32_e32 v4, 0x400000, v4
+; GFX8-NEXT: v_or_b32_e32 v4, 0x400000, v2
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
; GFX8-NEXT: v_cndmask_b32_e32 v2, v5, v4, vcc
; GFX8-NEXT: v_bfe_u32 v5, v3, 16, 1
; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v3
-; GFX8-NEXT: v_and_b32_e32 v4, 0xff800000, v3
; GFX8-NEXT: v_add_u32_e32 v5, vcc, s4, v5
-; GFX8-NEXT: v_or_b32_e32 v4, 0x400000, v4
+; GFX8-NEXT: v_or_b32_e32 v4, 0x400000, v3
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
; GFX8-NEXT: v_cndmask_b32_e32 v3, v5, v4, vcc
; GFX8-NEXT: v_bfe_u32 v5, v0, 16, 1
; GFX8-NEXT: v_cvt_f32_u32_e32 v1, v1
; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v0
-; GFX8-NEXT: v_and_b32_e32 v4, 0xff800000, v0
-; GFX8-NEXT: v_add_u32_e32 v5, vcc, s4, v5
-; GFX8-NEXT: v_or_b32_e32 v4, 0x400000, v4
+; GFX8-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5
+; GFX8-NEXT: v_or_b32_e32 v4, 0x400000, v0
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
; GFX8-NEXT: v_cndmask_b32_e32 v0, v5, v4, vcc
; GFX8-NEXT: v_bfe_u32 v5, v1, 16, 1
; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v1
-; GFX8-NEXT: v_and_b32_e32 v4, 0xff800000, v1
; GFX8-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5
-; GFX8-NEXT: v_or_b32_e32 v4, 0x400000, v4
+; GFX8-NEXT: v_or_b32_e32 v4, 0x400000, v1
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
; GFX8-NEXT: v_cndmask_b32_e32 v1, v5, v4, vcc
; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v3
@@ -34005,28 +33205,24 @@ define <4 x bfloat> @v_uitofp_v4i32_to_v4bf16(<4 x i32> %x) {
; GFX9-NEXT: s_movk_i32 s4, 0x7fff
; GFX9-NEXT: v_cvt_f32_u32_e32 v0, v0
; GFX9-NEXT: v_bfe_u32 v4, v2, 16, 1
-; GFX9-NEXT: v_and_b32_e32 v5, 0xff800000, v2
; GFX9-NEXT: v_add3_u32 v4, v4, v2, s4
-; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v5
+; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v2
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
; GFX9-NEXT: v_cndmask_b32_e32 v2, v4, v5, vcc
; GFX9-NEXT: v_bfe_u32 v4, v3, 16, 1
-; GFX9-NEXT: v_and_b32_e32 v5, 0xff800000, v3
; GFX9-NEXT: v_cvt_f32_u32_e32 v1, v1
; GFX9-NEXT: v_add3_u32 v4, v4, v3, s4
-; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v5
+; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v3
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
; GFX9-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc
; GFX9-NEXT: v_bfe_u32 v4, v0, 16, 1
-; GFX9-NEXT: v_and_b32_e32 v5, 0xff800000, v0
; GFX9-NEXT: v_add3_u32 v4, v4, v0, s4
-; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v5
+; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v0
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc
; GFX9-NEXT: v_bfe_u32 v4, v1, 16, 1
-; GFX9-NEXT: v_and_b32_e32 v5, 0xff800000, v1
; GFX9-NEXT: v_add3_u32 v4, v4, v1, s4
-; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v5
+; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v1
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
; GFX9-NEXT: v_cndmask_b32_e32 v1, v4, v5, vcc
; GFX9-NEXT: s_mov_b32 s4, 0x7060302
@@ -34039,30 +33235,29 @@ define <4 x bfloat> @v_uitofp_v4i32_to_v4bf16(<4 x i32> %x) {
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_cvt_f32_u32_e32 v2, v2
; GFX10-NEXT: v_cvt_f32_u32_e32 v0, v0
-; GFX10-NEXT: s_mov_b32 s4, 0xff800000
; GFX10-NEXT: v_cvt_f32_u32_e32 v1, v1
; GFX10-NEXT: v_cvt_f32_u32_e32 v3, v3
-; GFX10-NEXT: v_bfe_u32 v5, v2, 16, 1
-; GFX10-NEXT: v_and_or_b32 v4, v2, s4, 0x400000
-; GFX10-NEXT: v_bfe_u32 v8, v0, 16, 1
+; GFX10-NEXT: v_bfe_u32 v4, v2, 16, 1
+; GFX10-NEXT: v_or_b32_e32 v5, 0x400000, v2
+; GFX10-NEXT: v_bfe_u32 v7, v0, 16, 1
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
-; GFX10-NEXT: v_and_or_b32 v9, v0, s4, 0x400000
-; GFX10-NEXT: v_add3_u32 v5, v5, v2, 0x7fff
-; GFX10-NEXT: v_bfe_u32 v10, v1, 16, 1
-; GFX10-NEXT: v_add3_u32 v8, v8, v0, 0x7fff
-; GFX10-NEXT: v_bfe_u32 v7, v3, 16, 1
-; GFX10-NEXT: v_and_or_b32 v11, v1, s4, 0x400000
-; GFX10-NEXT: v_cndmask_b32_e32 v2, v5, v4, vcc_lo
+; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v0
+; GFX10-NEXT: v_add3_u32 v4, v4, v2, 0x7fff
+; GFX10-NEXT: v_bfe_u32 v9, v1, 16, 1
+; GFX10-NEXT: v_add3_u32 v7, v7, v0, 0x7fff
+; GFX10-NEXT: v_bfe_u32 v6, v3, 16, 1
+; GFX10-NEXT: v_or_b32_e32 v10, 0x400000, v1
+; GFX10-NEXT: v_cndmask_b32_e32 v2, v4, v5, vcc_lo
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX10-NEXT: v_add3_u32 v4, v10, v1, 0x7fff
-; GFX10-NEXT: v_and_or_b32 v6, v3, s4, 0x400000
-; GFX10-NEXT: v_add3_u32 v5, v7, v3, 0x7fff
-; GFX10-NEXT: v_cndmask_b32_e32 v0, v8, v9, vcc_lo
+; GFX10-NEXT: v_add3_u32 v9, v9, v1, 0x7fff
+; GFX10-NEXT: v_add3_u32 v6, v6, v3, 0x7fff
+; GFX10-NEXT: v_or_b32_e32 v4, 0x400000, v3
+; GFX10-NEXT: v_cndmask_b32_e32 v0, v7, v8, vcc_lo
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
-; GFX10-NEXT: v_cndmask_b32_e32 v1, v4, v11, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e32 v1, v9, v10, vcc_lo
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
; GFX10-NEXT: v_perm_b32 v0, v1, v0, 0x7060302
-; GFX10-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e32 v3, v6, v4, vcc_lo
; GFX10-NEXT: v_perm_b32 v1, v3, v2, 0x7060302
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
@@ -34071,32 +33266,32 @@ define <4 x bfloat> @v_uitofp_v4i32_to_v4bf16(<4 x i32> %x) {
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_cvt_f32_u32_e32 v2, v2
; GFX11-NEXT: v_cvt_f32_u32_e32 v0, v0
-; GFX11-NEXT: s_mov_b32 s0, 0xff800000
-; GFX11-NEXT: v_cvt_f32_u32_e32 v3, v3
; GFX11-NEXT: v_cvt_f32_u32_e32 v1, v1
-; GFX11-NEXT: v_bfe_u32 v5, v2, 16, 1
-; GFX11-NEXT: v_and_or_b32 v4, v2, s0, 0x400000
-; GFX11-NEXT: v_bfe_u32 v8, v0, 16, 1
+; GFX11-NEXT: v_cvt_f32_u32_e32 v3, v3
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-NEXT: v_bfe_u32 v4, v2, 16, 1
+; GFX11-NEXT: v_or_b32_e32 v5, 0x400000, v2
+; GFX11-NEXT: v_bfe_u32 v7, v0, 16, 1
+; GFX11-NEXT: v_bfe_u32 v9, v1, 16, 1
; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
-; GFX11-NEXT: v_bfe_u32 v7, v3, 16, 1
-; GFX11-NEXT: v_add3_u32 v5, v5, v2, 0x7fff
-; GFX11-NEXT: v_and_or_b32 v9, v0, s0, 0x400000
-; GFX11-NEXT: v_bfe_u32 v10, v1, 16, 1
-; GFX11-NEXT: v_and_or_b32 v11, v1, s0, 0x400000
-; GFX11-NEXT: v_add3_u32 v8, v8, v0, 0x7fff
-; GFX11-NEXT: v_cndmask_b32_e32 v2, v5, v4, vcc_lo
+; GFX11-NEXT: v_add3_u32 v4, v4, v2, 0x7fff
+; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v0
+; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v1
+; GFX11-NEXT: v_add3_u32 v7, v7, v0, 0x7fff
+; GFX11-NEXT: v_add3_u32 v9, v9, v1, 0x7fff
+; GFX11-NEXT: v_cndmask_b32_e32 v2, v4, v5, vcc_lo
; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX11-NEXT: v_and_or_b32 v6, v3, s0, 0x400000
-; GFX11-NEXT: v_add3_u32 v4, v10, v1, 0x7fff
-; GFX11-NEXT: v_add3_u32 v5, v7, v3, 0x7fff
-; GFX11-NEXT: v_cndmask_b32_e32 v0, v8, v9, vcc_lo
+; GFX11-NEXT: v_bfe_u32 v6, v3, 16, 1
+; GFX11-NEXT: v_or_b32_e32 v4, 0x400000, v3
+; GFX11-NEXT: v_cndmask_b32_e32 v0, v7, v8, vcc_lo
; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_cndmask_b32_e32 v1, v4, v11, vcc_lo
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_add3_u32 v6, v6, v3, 0x7fff
+; GFX11-NEXT: v_cndmask_b32_e32 v1, v9, v10, vcc_lo
; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
-; GFX11-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc_lo
; GFX11-NEXT: v_perm_b32 v0, v1, v0, 0x7060302
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_cndmask_b32_e32 v3, v6, v4, vcc_lo
; GFX11-NEXT: v_perm_b32 v1, v3, v2, 0x7060302
; GFX11-NEXT: s_setpc_b64 s[30:31]
%op = uitofp <4 x i32> %x to <4 x bfloat>
@@ -34146,8 +33341,7 @@ define bfloat @v_uitofp_i64_to_bf16(i64 %x) {
; GFX8-NEXT: v_bfe_u32 v1, v0, 16, 1
; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v0
; GFX8-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1
-; GFX8-NEXT: v_and_b32_e32 v2, 0xff800000, v0
-; GFX8-NEXT: v_or_b32_e32 v2, 0x400000, v2
+; GFX8-NEXT: v_or_b32_e32 v2, 0x400000, v0
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
; GFX8-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
@@ -34166,9 +33360,8 @@ define bfloat @v_uitofp_i64_to_bf16(i64 %x) {
; GFX9-NEXT: v_sub_u32_e32 v1, 32, v2
; GFX9-NEXT: v_ldexp_f32 v0, v0, v1
; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1
-; GFX9-NEXT: v_and_b32_e32 v2, 0xff800000, v0
+; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v0
; GFX9-NEXT: v_add3_u32 v1, v1, v0, s4
-; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v2
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0
@@ -34178,7 +33371,6 @@ define bfloat @v_uitofp_i64_to_bf16(i64 %x) {
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_ffbh_u32_e32 v2, v1
-; GFX10-NEXT: s_mov_b32 s4, 0xff800000
; GFX10-NEXT: v_min_u32_e32 v2, 32, v2
; GFX10-NEXT: v_lshlrev_b64 v[0:1], v2, v[0:1]
; GFX10-NEXT: v_min_u32_e32 v0, 1, v0
@@ -34187,7 +33379,7 @@ define bfloat @v_uitofp_i64_to_bf16(i64 %x) {
; GFX10-NEXT: v_cvt_f32_u32_e32 v0, v0
; GFX10-NEXT: v_ldexp_f32 v0, v0, v1
; GFX10-NEXT: v_bfe_u32 v1, v0, 16, 1
-; GFX10-NEXT: v_and_or_b32 v2, v0, s4, 0x400000
+; GFX10-NEXT: v_or_b32_e32 v2, 0x400000, v0
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
; GFX10-NEXT: v_add3_u32 v1, v1, v0, 0x7fff
; GFX10-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo
@@ -34198,7 +33390,6 @@ define bfloat @v_uitofp_i64_to_bf16(i64 %x) {
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_clz_i32_u32_e32 v2, v1
-; GFX11-NEXT: s_mov_b32 s0, 0xff800000
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_min_u32_e32 v2, 32, v2
; GFX11-NEXT: v_lshlrev_b64 v[0:1], v2, v[0:1]
@@ -34211,7 +33402,7 @@ define bfloat @v_uitofp_i64_to_bf16(i64 %x) {
; GFX11-NEXT: v_ldexp_f32 v0, v0, v1
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
; GFX11-NEXT: v_bfe_u32 v1, v0, 16, 1
-; GFX11-NEXT: v_and_or_b32 v2, v0, s0, 0x400000
+; GFX11-NEXT: v_or_b32_e32 v2, 0x400000, v0
; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
; GFX11-NEXT: v_add3_u32 v1, v1, v0, 0x7fff
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
@@ -34284,22 +33475,20 @@ define <2 x bfloat> @v_uitofp_v2i64_to_v2bf16(<2 x i64> %x) {
; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v4
; GFX8-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v0
; GFX8-NEXT: v_ffbh_u32_e32 v0, v3
-; GFX8-NEXT: v_min_u32_e32 v7, 32, v0
-; GFX8-NEXT: v_lshlrev_b64 v[0:1], v7, v[2:3]
-; GFX8-NEXT: v_and_b32_e32 v6, 0xff800000, v4
+; GFX8-NEXT: v_min_u32_e32 v6, 32, v0
+; GFX8-NEXT: v_lshlrev_b64 v[0:1], v6, v[2:3]
+; GFX8-NEXT: v_or_b32_e32 v2, 0x400000, v4
; GFX8-NEXT: v_min_u32_e32 v0, 1, v0
; GFX8-NEXT: v_or_b32_e32 v0, v1, v0
; GFX8-NEXT: v_cvt_f32_u32_e32 v0, v0
-; GFX8-NEXT: v_or_b32_e32 v2, 0x400000, v6
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
; GFX8-NEXT: v_cndmask_b32_e32 v1, v5, v2, vcc
-; GFX8-NEXT: v_sub_u32_e32 v2, vcc, 32, v7
+; GFX8-NEXT: v_sub_u32_e32 v2, vcc, 32, v6
; GFX8-NEXT: v_ldexp_f32 v0, v0, v2
; GFX8-NEXT: v_bfe_u32 v2, v0, 16, 1
; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v0
; GFX8-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2
-; GFX8-NEXT: v_and_b32_e32 v3, 0xff800000, v0
-; GFX8-NEXT: v_or_b32_e32 v3, 0x400000, v3
+; GFX8-NEXT: v_or_b32_e32 v3, 0x400000, v0
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc
; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
@@ -34321,21 +33510,19 @@ define <2 x bfloat> @v_uitofp_v2i64_to_v2bf16(<2 x i64> %x) {
; GFX9-NEXT: v_bfe_u32 v0, v4, 16, 1
; GFX9-NEXT: v_add3_u32 v5, v0, v4, s4
; GFX9-NEXT: v_ffbh_u32_e32 v0, v3
-; GFX9-NEXT: v_min_u32_e32 v7, 32, v0
-; GFX9-NEXT: v_lshlrev_b64 v[0:1], v7, v[2:3]
-; GFX9-NEXT: v_and_b32_e32 v6, 0xff800000, v4
+; GFX9-NEXT: v_min_u32_e32 v6, 32, v0
+; GFX9-NEXT: v_lshlrev_b64 v[0:1], v6, v[2:3]
+; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v4
; GFX9-NEXT: v_min_u32_e32 v0, 1, v0
; GFX9-NEXT: v_or_b32_e32 v0, v1, v0
; GFX9-NEXT: v_cvt_f32_u32_e32 v0, v0
-; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v6
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v2, vcc
-; GFX9-NEXT: v_sub_u32_e32 v2, 32, v7
+; GFX9-NEXT: v_sub_u32_e32 v2, 32, v6
; GFX9-NEXT: v_ldexp_f32 v0, v0, v2
; GFX9-NEXT: v_bfe_u32 v2, v0, 16, 1
-; GFX9-NEXT: v_and_b32_e32 v3, 0xff800000, v0
; GFX9-NEXT: v_add3_u32 v2, v2, v0, s4
-; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v3
+; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v0
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc
; GFX9-NEXT: s_mov_b32 s4, 0x7060302
@@ -34347,7 +33534,6 @@ define <2 x bfloat> @v_uitofp_v2i64_to_v2bf16(<2 x i64> %x) {
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_ffbh_u32_e32 v4, v1
; GFX10-NEXT: v_ffbh_u32_e32 v5, v3
-; GFX10-NEXT: s_mov_b32 s4, 0xff800000
; GFX10-NEXT: v_min_u32_e32 v4, 32, v4
; GFX10-NEXT: v_min_u32_e32 v5, 32, v5
; GFX10-NEXT: v_lshlrev_b64 v[0:1], v4, v[0:1]
@@ -34364,9 +33550,9 @@ define <2 x bfloat> @v_uitofp_v2i64_to_v2bf16(<2 x i64> %x) {
; GFX10-NEXT: v_ldexp_f32 v1, v1, v3
; GFX10-NEXT: v_bfe_u32 v2, v0, 16, 1
; GFX10-NEXT: v_bfe_u32 v3, v1, 16, 1
-; GFX10-NEXT: v_and_or_b32 v4, v0, s4, 0x400000
+; GFX10-NEXT: v_or_b32_e32 v4, 0x400000, v0
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX10-NEXT: v_and_or_b32 v5, v1, s4, 0x400000
+; GFX10-NEXT: v_or_b32_e32 v5, 0x400000, v1
; GFX10-NEXT: v_add3_u32 v2, v2, v0, 0x7fff
; GFX10-NEXT: v_add3_u32 v3, v3, v1, 0x7fff
; GFX10-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc_lo
@@ -34380,7 +33566,6 @@ define <2 x bfloat> @v_uitofp_v2i64_to_v2bf16(<2 x i64> %x) {
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_clz_i32_u32_e32 v4, v1
; GFX11-NEXT: v_clz_i32_u32_e32 v5, v3
-; GFX11-NEXT: s_mov_b32 s0, 0xff800000
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_min_u32_e32 v4, 32, v4
; GFX11-NEXT: v_min_u32_e32 v5, 32, v5
@@ -34404,9 +33589,9 @@ define <2 x bfloat> @v_uitofp_v2i64_to_v2bf16(<2 x i64> %x) {
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_bfe_u32 v2, v0, 16, 1
; GFX11-NEXT: v_bfe_u32 v3, v1, 16, 1
-; GFX11-NEXT: v_and_or_b32 v4, v0, s0, 0x400000
+; GFX11-NEXT: v_or_b32_e32 v4, 0x400000, v0
; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX11-NEXT: v_and_or_b32 v5, v1, s0, 0x400000
+; GFX11-NEXT: v_or_b32_e32 v5, 0x400000, v1
; GFX11-NEXT: v_add3_u32 v2, v2, v0, 0x7fff
; GFX11-NEXT: v_add3_u32 v3, v3, v1, 0x7fff
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
@@ -34503,16 +33688,15 @@ define <3 x bfloat> @v_uitofp_v3i64_to_v3bf16(<3 x i64> %x) {
; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v4
; GFX8-NEXT: s_movk_i32 s4, 0x7fff
; GFX8-NEXT: v_add_u32_e32 v5, vcc, s4, v5
-; GFX8-NEXT: v_and_b32_e32 v6, 0xff800000, v4
; GFX8-NEXT: v_min_u32_e32 v0, 1, v0
-; GFX8-NEXT: v_or_b32_e32 v6, 0x400000, v6
+; GFX8-NEXT: v_or_b32_e32 v6, 0x400000, v4
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
; GFX8-NEXT: v_or_b32_e32 v0, v1, v0
; GFX8-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc
; GFX8-NEXT: v_cvt_f32_u32_e32 v0, v0
-; GFX8-NEXT: v_ffbh_u32_e32 v6, v3
-; GFX8-NEXT: v_min_u32_e32 v6, 32, v6
-; GFX8-NEXT: v_lshlrev_b64 v[2:3], v6, v[2:3]
+; GFX8-NEXT: v_ffbh_u32_e32 v5, v3
+; GFX8-NEXT: v_min_u32_e32 v5, 32, v5
+; GFX8-NEXT: v_lshlrev_b64 v[2:3], v5, v[2:3]
; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v4
; GFX8-NEXT: v_sub_u32_e32 v4, vcc, 32, v7
; GFX8-NEXT: v_ldexp_f32 v0, v0, v4
@@ -34522,17 +33706,15 @@ define <3 x bfloat> @v_uitofp_v3i64_to_v3bf16(<3 x i64> %x) {
; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v0
; GFX8-NEXT: v_cvt_f32_u32_e32 v2, v2
; GFX8-NEXT: v_add_u32_e32 v4, vcc, s4, v4
-; GFX8-NEXT: v_and_b32_e32 v5, 0xff800000, v0
-; GFX8-NEXT: v_or_b32_e32 v5, 0x400000, v5
+; GFX8-NEXT: v_or_b32_e32 v6, 0x400000, v0
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
-; GFX8-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc
-; GFX8-NEXT: v_sub_u32_e32 v3, vcc, 32, v6
+; GFX8-NEXT: v_cndmask_b32_e32 v0, v4, v6, vcc
+; GFX8-NEXT: v_sub_u32_e32 v3, vcc, 32, v5
; GFX8-NEXT: v_ldexp_f32 v2, v2, v3
; GFX8-NEXT: v_bfe_u32 v3, v2, 16, 1
; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v2
; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
-; GFX8-NEXT: v_and_b32_e32 v4, 0xff800000, v2
-; GFX8-NEXT: v_or_b32_e32 v4, 0x400000, v4
+; GFX8-NEXT: v_or_b32_e32 v4, 0x400000, v2
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
; GFX8-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v2
@@ -34545,44 +33727,41 @@ define <3 x bfloat> @v_uitofp_v3i64_to_v3bf16(<3 x i64> %x) {
; GFX9-NEXT: v_ffbh_u32_e32 v6, v5
; GFX9-NEXT: v_min_u32_e32 v6, 32, v6
; GFX9-NEXT: v_lshlrev_b64 v[4:5], v6, v[4:5]
-; GFX9-NEXT: v_ffbh_u32_e32 v7, v1
+; GFX9-NEXT: s_movk_i32 s4, 0x7fff
; GFX9-NEXT: v_min_u32_e32 v4, 1, v4
-; GFX9-NEXT: v_min_u32_e32 v7, 32, v7
; GFX9-NEXT: v_or_b32_e32 v4, v5, v4
-; GFX9-NEXT: v_lshlrev_b64 v[0:1], v7, v[0:1]
+; GFX9-NEXT: v_sub_u32_e32 v5, 32, v6
+; GFX9-NEXT: v_ffbh_u32_e32 v6, v1
+; GFX9-NEXT: v_min_u32_e32 v6, 32, v6
+; GFX9-NEXT: v_lshlrev_b64 v[0:1], v6, v[0:1]
; GFX9-NEXT: v_cvt_f32_u32_e32 v4, v4
; GFX9-NEXT: v_min_u32_e32 v0, 1, v0
; GFX9-NEXT: v_or_b32_e32 v0, v1, v0
-; GFX9-NEXT: v_sub_u32_e32 v5, 32, v6
; GFX9-NEXT: v_cvt_f32_u32_e32 v0, v0
; GFX9-NEXT: v_ldexp_f32 v4, v4, v5
; GFX9-NEXT: v_bfe_u32 v5, v4, 16, 1
-; GFX9-NEXT: s_movk_i32 s4, 0x7fff
-; GFX9-NEXT: v_and_b32_e32 v6, 0xff800000, v4
; GFX9-NEXT: v_add3_u32 v5, v5, v4, s4
-; GFX9-NEXT: v_or_b32_e32 v6, 0x400000, v6
+; GFX9-NEXT: v_or_b32_e32 v7, 0x400000, v4
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
-; GFX9-NEXT: v_sub_u32_e32 v1, 32, v7
-; GFX9-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc
+; GFX9-NEXT: v_sub_u32_e32 v1, 32, v6
+; GFX9-NEXT: v_cndmask_b32_e32 v4, v5, v7, vcc
; GFX9-NEXT: v_ldexp_f32 v5, v0, v1
; GFX9-NEXT: v_bfe_u32 v0, v5, 16, 1
; GFX9-NEXT: v_add3_u32 v6, v0, v5, s4
; GFX9-NEXT: v_ffbh_u32_e32 v0, v3
-; GFX9-NEXT: v_min_u32_e32 v8, 32, v0
-; GFX9-NEXT: v_lshlrev_b64 v[0:1], v8, v[2:3]
-; GFX9-NEXT: v_and_b32_e32 v7, 0xff800000, v5
+; GFX9-NEXT: v_min_u32_e32 v7, 32, v0
+; GFX9-NEXT: v_lshlrev_b64 v[0:1], v7, v[2:3]
+; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v5
; GFX9-NEXT: v_min_u32_e32 v0, 1, v0
; GFX9-NEXT: v_or_b32_e32 v0, v1, v0
; GFX9-NEXT: v_cvt_f32_u32_e32 v0, v0
-; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v7
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
; GFX9-NEXT: v_cndmask_b32_e32 v1, v6, v2, vcc
-; GFX9-NEXT: v_sub_u32_e32 v2, 32, v8
+; GFX9-NEXT: v_sub_u32_e32 v2, 32, v7
; GFX9-NEXT: v_ldexp_f32 v0, v0, v2
; GFX9-NEXT: v_bfe_u32 v2, v0, 16, 1
-; GFX9-NEXT: v_and_b32_e32 v3, 0xff800000, v0
; GFX9-NEXT: v_add3_u32 v2, v2, v0, s4
-; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v3
+; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v0
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc
; GFX9-NEXT: s_mov_b32 s4, 0x7060302
@@ -34596,7 +33775,6 @@ define <3 x bfloat> @v_uitofp_v3i64_to_v3bf16(<3 x i64> %x) {
; GFX10-NEXT: v_ffbh_u32_e32 v6, v1
; GFX10-NEXT: v_ffbh_u32_e32 v8, v3
; GFX10-NEXT: v_ffbh_u32_e32 v7, v5
-; GFX10-NEXT: s_mov_b32 s4, 0xff800000
; GFX10-NEXT: v_min_u32_e32 v6, 32, v6
; GFX10-NEXT: v_min_u32_e32 v8, 32, v8
; GFX10-NEXT: v_min_u32_e32 v7, 32, v7
@@ -34620,13 +33798,13 @@ define <3 x bfloat> @v_uitofp_v3i64_to_v3bf16(<3 x i64> %x) {
; GFX10-NEXT: v_ldexp_f32 v1, v1, v7
; GFX10-NEXT: v_bfe_u32 v3, v0, 16, 1
; GFX10-NEXT: v_bfe_u32 v5, v2, 16, 1
-; GFX10-NEXT: v_and_or_b32 v7, v0, s4, 0x400000
+; GFX10-NEXT: v_or_b32_e32 v7, 0x400000, v0
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
; GFX10-NEXT: v_bfe_u32 v4, v1, 16, 1
; GFX10-NEXT: v_add3_u32 v3, v3, v0, 0x7fff
-; GFX10-NEXT: v_and_or_b32 v8, v2, s4, 0x400000
+; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v2
; GFX10-NEXT: v_add3_u32 v5, v5, v2, 0x7fff
-; GFX10-NEXT: v_and_or_b32 v6, v1, s4, 0x400000
+; GFX10-NEXT: v_or_b32_e32 v6, 0x400000, v1
; GFX10-NEXT: v_add3_u32 v4, v4, v1, 0x7fff
; GFX10-NEXT: v_cndmask_b32_e32 v0, v3, v7, vcc_lo
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
@@ -34739,19 +33917,18 @@ define <4 x bfloat> @v_uitofp_v4i64_to_v4bf16(<4 x i64> %x) {
; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v8
; GFX8-NEXT: v_add_u32_e32 v9, vcc, s4, v4
; GFX8-NEXT: v_ffbh_u32_e32 v4, v7
-; GFX8-NEXT: v_min_u32_e32 v11, 32, v4
-; GFX8-NEXT: v_lshlrev_b64 v[4:5], v11, v[6:7]
-; GFX8-NEXT: v_and_b32_e32 v10, 0xff800000, v8
+; GFX8-NEXT: v_min_u32_e32 v10, 32, v4
+; GFX8-NEXT: v_lshlrev_b64 v[4:5], v10, v[6:7]
+; GFX8-NEXT: v_or_b32_e32 v6, 0x400000, v8
; GFX8-NEXT: v_min_u32_e32 v4, 1, v4
; GFX8-NEXT: v_or_b32_e32 v4, v5, v4
; GFX8-NEXT: v_cvt_f32_u32_e32 v4, v4
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v8, v8
; GFX8-NEXT: v_ffbh_u32_e32 v8, v1
; GFX8-NEXT: v_min_u32_e32 v8, 32, v8
-; GFX8-NEXT: v_or_b32_e32 v6, 0x400000, v10
; GFX8-NEXT: v_lshlrev_b64 v[0:1], v8, v[0:1]
; GFX8-NEXT: v_cndmask_b32_e32 v5, v9, v6, vcc
-; GFX8-NEXT: v_sub_u32_e32 v6, vcc, 32, v11
+; GFX8-NEXT: v_sub_u32_e32 v6, vcc, 32, v10
; GFX8-NEXT: v_ldexp_f32 v4, v4, v6
; GFX8-NEXT: v_min_u32_e32 v0, 1, v0
; GFX8-NEXT: v_bfe_u32 v6, v4, 16, 1
@@ -34759,8 +33936,7 @@ define <4 x bfloat> @v_uitofp_v4i64_to_v4bf16(<4 x i64> %x) {
; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v4
; GFX8-NEXT: v_cvt_f32_u32_e32 v0, v0
; GFX8-NEXT: v_add_u32_e32 v6, vcc, s4, v6
-; GFX8-NEXT: v_and_b32_e32 v7, 0xff800000, v4
-; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v7
+; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v4
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
; GFX8-NEXT: v_cndmask_b32_e32 v4, v6, v7, vcc
; GFX8-NEXT: v_sub_u32_e32 v1, vcc, 32, v8
@@ -34769,22 +33945,20 @@ define <4 x bfloat> @v_uitofp_v4i64_to_v4bf16(<4 x i64> %x) {
; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v6
; GFX8-NEXT: v_add_u32_e32 v7, vcc, s4, v0
; GFX8-NEXT: v_ffbh_u32_e32 v0, v3
-; GFX8-NEXT: v_min_u32_e32 v9, 32, v0
-; GFX8-NEXT: v_lshlrev_b64 v[0:1], v9, v[2:3]
-; GFX8-NEXT: v_and_b32_e32 v8, 0xff800000, v6
+; GFX8-NEXT: v_min_u32_e32 v8, 32, v0
+; GFX8-NEXT: v_lshlrev_b64 v[0:1], v8, v[2:3]
+; GFX8-NEXT: v_or_b32_e32 v2, 0x400000, v6
; GFX8-NEXT: v_min_u32_e32 v0, 1, v0
; GFX8-NEXT: v_or_b32_e32 v0, v1, v0
; GFX8-NEXT: v_cvt_f32_u32_e32 v0, v0
-; GFX8-NEXT: v_or_b32_e32 v2, 0x400000, v8
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
; GFX8-NEXT: v_cndmask_b32_e32 v1, v7, v2, vcc
-; GFX8-NEXT: v_sub_u32_e32 v2, vcc, 32, v9
+; GFX8-NEXT: v_sub_u32_e32 v2, vcc, 32, v8
; GFX8-NEXT: v_ldexp_f32 v0, v0, v2
; GFX8-NEXT: v_bfe_u32 v2, v0, 16, 1
; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v0
; GFX8-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2
-; GFX8-NEXT: v_and_b32_e32 v3, 0xff800000, v0
-; GFX8-NEXT: v_or_b32_e32 v3, 0x400000, v3
+; GFX8-NEXT: v_or_b32_e32 v3, 0x400000, v0
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc
; GFX8-NEXT: v_lshrrev_b32_e32 v4, 16, v4
@@ -34808,49 +33982,45 @@ define <4 x bfloat> @v_uitofp_v4i64_to_v4bf16(<4 x i64> %x) {
; GFX9-NEXT: v_bfe_u32 v4, v8, 16, 1
; GFX9-NEXT: v_add3_u32 v9, v4, v8, s4
; GFX9-NEXT: v_ffbh_u32_e32 v4, v7
-; GFX9-NEXT: v_min_u32_e32 v11, 32, v4
-; GFX9-NEXT: v_lshlrev_b64 v[4:5], v11, v[6:7]
-; GFX9-NEXT: v_and_b32_e32 v10, 0xff800000, v8
-; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v8, v8
-; GFX9-NEXT: v_ffbh_u32_e32 v8, v1
+; GFX9-NEXT: v_min_u32_e32 v10, 32, v4
+; GFX9-NEXT: v_lshlrev_b64 v[4:5], v10, v[6:7]
+; GFX9-NEXT: v_ffbh_u32_e32 v7, v1
; GFX9-NEXT: v_min_u32_e32 v4, 1, v4
-; GFX9-NEXT: v_min_u32_e32 v8, 32, v8
+; GFX9-NEXT: v_min_u32_e32 v7, 32, v7
; GFX9-NEXT: v_or_b32_e32 v4, v5, v4
-; GFX9-NEXT: v_lshlrev_b64 v[0:1], v8, v[0:1]
+; GFX9-NEXT: v_lshlrev_b64 v[0:1], v7, v[0:1]
; GFX9-NEXT: v_cvt_f32_u32_e32 v4, v4
; GFX9-NEXT: v_min_u32_e32 v0, 1, v0
-; GFX9-NEXT: v_or_b32_e32 v6, 0x400000, v10
+; GFX9-NEXT: v_or_b32_e32 v6, 0x400000, v8
+; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v8, v8
; GFX9-NEXT: v_or_b32_e32 v0, v1, v0
; GFX9-NEXT: v_cndmask_b32_e32 v5, v9, v6, vcc
-; GFX9-NEXT: v_sub_u32_e32 v6, 32, v11
+; GFX9-NEXT: v_sub_u32_e32 v6, 32, v10
; GFX9-NEXT: v_cvt_f32_u32_e32 v0, v0
; GFX9-NEXT: v_ldexp_f32 v4, v4, v6
; GFX9-NEXT: v_bfe_u32 v6, v4, 16, 1
-; GFX9-NEXT: v_and_b32_e32 v7, 0xff800000, v4
; GFX9-NEXT: v_add3_u32 v6, v6, v4, s4
-; GFX9-NEXT: v_or_b32_e32 v7, 0x400000, v7
+; GFX9-NEXT: v_or_b32_e32 v8, 0x400000, v4
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
-; GFX9-NEXT: v_sub_u32_e32 v1, 32, v8
-; GFX9-NEXT: v_cndmask_b32_e32 v4, v6, v7, vcc
+; GFX9-NEXT: v_sub_u32_e32 v1, 32, v7
+; GFX9-NEXT: v_cndmask_b32_e32 v4, v6, v8, vcc
; GFX9-NEXT: v_ldexp_f32 v6, v0, v1
; GFX9-NEXT: v_bfe_u32 v0, v6, 16, 1
; GFX9-NEXT: v_add3_u32 v7, v0, v6, s4
; GFX9-NEXT: v_ffbh_u32_e32 v0, v3
-; GFX9-NEXT: v_min_u32_e32 v9, 32, v0
-; GFX9-NEXT: v_lshlrev_b64 v[0:1], v9, v[2:3]
-; GFX9-NEXT: v_and_b32_e32 v8, 0xff800000, v6
+; GFX9-NEXT: v_min_u32_e32 v8, 32, v0
+; GFX9-NEXT: v_lshlrev_b64 v[0:1], v8, v[2:3]
+; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v6
; GFX9-NEXT: v_min_u32_e32 v0, 1, v0
; GFX9-NEXT: v_or_b32_e32 v0, v1, v0
; GFX9-NEXT: v_cvt_f32_u32_e32 v0, v0
-; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v8
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
; GFX9-NEXT: v_cndmask_b32_e32 v1, v7, v2, vcc
-; GFX9-NEXT: v_sub_u32_e32 v2, 32, v9
+; GFX9-NEXT: v_sub_u32_e32 v2, 32, v8
; GFX9-NEXT: v_ldexp_f32 v0, v0, v2
; GFX9-NEXT: v_bfe_u32 v2, v0, 16, 1
-; GFX9-NEXT: v_and_b32_e32 v3, 0xff800000, v0
; GFX9-NEXT: v_add3_u32 v2, v2, v0, s4
-; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v3
+; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v0
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc
; GFX9-NEXT: s_mov_b32 s4, 0x7060302
@@ -34865,7 +34035,6 @@ define <4 x bfloat> @v_uitofp_v4i64_to_v4bf16(<4 x i64> %x) {
; GFX10-NEXT: v_ffbh_u32_e32 v10, v1
; GFX10-NEXT: v_ffbh_u32_e32 v11, v3
; GFX10-NEXT: v_ffbh_u32_e32 v9, v7
-; GFX10-NEXT: s_mov_b32 s4, 0xff800000
; GFX10-NEXT: v_min_u32_e32 v8, 32, v8
; GFX10-NEXT: v_min_u32_e32 v10, 32, v10
; GFX10-NEXT: v_min_u32_e32 v11, 32, v11
@@ -34894,27 +34063,27 @@ define <4 x bfloat> @v_uitofp_v4i64_to_v4bf16(<4 x i64> %x) {
; GFX10-NEXT: v_ldexp_f32 v1, v1, v3
; GFX10-NEXT: v_cvt_f32_u32_e32 v4, v6
; GFX10-NEXT: v_bfe_u32 v3, v2, 16, 1
-; GFX10-NEXT: v_and_or_b32 v5, v2, s4, 0x400000
+; GFX10-NEXT: v_or_b32_e32 v5, 0x400000, v2
; GFX10-NEXT: v_bfe_u32 v7, v0, 16, 1
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
; GFX10-NEXT: v_ldexp_f32 v4, v4, v9
; GFX10-NEXT: v_add3_u32 v3, v3, v2, 0x7fff
-; GFX10-NEXT: v_and_or_b32 v8, v0, s4, 0x400000
-; GFX10-NEXT: v_bfe_u32 v9, v1, 16, 1
+; GFX10-NEXT: v_bfe_u32 v8, v1, 16, 1
+; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v0
; GFX10-NEXT: v_add3_u32 v7, v7, v0, 0x7fff
; GFX10-NEXT: v_bfe_u32 v6, v4, 16, 1
; GFX10-NEXT: v_cndmask_b32_e32 v2, v3, v5, vcc_lo
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX10-NEXT: v_add3_u32 v3, v9, v1, 0x7fff
-; GFX10-NEXT: v_and_or_b32 v5, v1, s4, 0x400000
+; GFX10-NEXT: v_add3_u32 v8, v8, v1, 0x7fff
+; GFX10-NEXT: v_or_b32_e32 v3, 0x400000, v1
; GFX10-NEXT: v_add3_u32 v6, v6, v4, 0x7fff
-; GFX10-NEXT: v_and_or_b32 v9, v4, s4, 0x400000
-; GFX10-NEXT: v_cndmask_b32_e32 v0, v7, v8, vcc_lo
+; GFX10-NEXT: v_or_b32_e32 v5, 0x400000, v4
+; GFX10-NEXT: v_cndmask_b32_e32 v0, v7, v9, vcc_lo
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
-; GFX10-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e32 v1, v8, v3, vcc_lo
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
; GFX10-NEXT: v_perm_b32 v0, v1, v0, 0x7060302
-; GFX10-NEXT: v_cndmask_b32_e32 v3, v6, v9, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e32 v3, v6, v5, vcc_lo
; GFX10-NEXT: v_perm_b32 v1, v3, v2, 0x7060302
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
@@ -34925,9 +34094,10 @@ define <4 x bfloat> @v_uitofp_v4i64_to_v4bf16(<4 x i64> %x) {
; GFX11-NEXT: v_clz_i32_u32_e32 v10, v1
; GFX11-NEXT: v_clz_i32_u32_e32 v11, v3
; GFX11-NEXT: v_clz_i32_u32_e32 v9, v7
-; GFX11-NEXT: s_mov_b32 s0, 0xff800000
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX11-NEXT: v_min_u32_e32 v8, 32, v8
; GFX11-NEXT: v_min_u32_e32 v10, 32, v10
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX11-NEXT: v_min_u32_e32 v11, 32, v11
; GFX11-NEXT: v_min_u32_e32 v9, 32, v9
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
@@ -34948,41 +34118,42 @@ define <4 x bfloat> @v_uitofp_v4i64_to_v4bf16(<4 x i64> %x) {
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4)
; GFX11-NEXT: v_or_b32_e32 v1, v3, v2
; GFX11-NEXT: v_sub_nc_u32_e32 v5, 32, v10
-; GFX11-NEXT: v_or_b32_e32 v6, v7, v6
-; GFX11-NEXT: v_cvt_f32_u32_e32 v2, v4
; GFX11-NEXT: v_sub_nc_u32_e32 v3, 32, v11
+; GFX11-NEXT: v_cvt_f32_u32_e32 v2, v4
; GFX11-NEXT: v_cvt_f32_u32_e32 v0, v0
; GFX11-NEXT: v_cvt_f32_u32_e32 v1, v1
-; GFX11-NEXT: v_cvt_f32_u32_e32 v4, v6
-; GFX11-NEXT: v_ldexp_f32 v2, v2, v8
+; GFX11-NEXT: v_or_b32_e32 v6, v7, v6
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT: v_ldexp_f32 v2, v2, v8
; GFX11-NEXT: v_ldexp_f32 v0, v0, v5
-; GFX11-NEXT: v_ldexp_f32 v1, v1, v3
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_ldexp_f32 v4, v4, v9
+; GFX11-NEXT: v_ldexp_f32 v1, v1, v3
+; GFX11-NEXT: v_cvt_f32_u32_e32 v4, v6
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4)
; GFX11-NEXT: v_bfe_u32 v3, v2, 16, 1
-; GFX11-NEXT: v_and_or_b32 v5, v2, s0, 0x400000
+; GFX11-NEXT: v_or_b32_e32 v5, 0x400000, v2
; GFX11-NEXT: v_bfe_u32 v7, v0, 16, 1
+; GFX11-NEXT: v_bfe_u32 v8, v1, 16, 1
; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
-; GFX11-NEXT: v_bfe_u32 v6, v4, 16, 1
; GFX11-NEXT: v_add3_u32 v3, v3, v2, 0x7fff
-; GFX11-NEXT: v_and_or_b32 v8, v0, s0, 0x400000
-; GFX11-NEXT: v_bfe_u32 v9, v1, 16, 1
+; GFX11-NEXT: v_ldexp_f32 v4, v4, v9
+; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v0
; GFX11-NEXT: v_add3_u32 v7, v7, v0, 0x7fff
-; GFX11-NEXT: v_add3_u32 v6, v6, v4, 0x7fff
+; GFX11-NEXT: v_add3_u32 v8, v8, v1, 0x7fff
; GFX11-NEXT: v_cndmask_b32_e32 v2, v3, v5, vcc_lo
; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX11-NEXT: v_add3_u32 v3, v9, v1, 0x7fff
-; GFX11-NEXT: v_and_or_b32 v5, v1, s0, 0x400000
-; GFX11-NEXT: v_and_or_b32 v9, v4, s0, 0x400000
-; GFX11-NEXT: v_cndmask_b32_e32 v0, v7, v8, vcc_lo
+; GFX11-NEXT: v_bfe_u32 v6, v4, 16, 1
+; GFX11-NEXT: v_or_b32_e32 v3, 0x400000, v1
+; GFX11-NEXT: v_or_b32_e32 v5, 0x400000, v4
+; GFX11-NEXT: v_cndmask_b32_e32 v0, v7, v9, vcc_lo
; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc_lo
+; GFX11-NEXT: v_add3_u32 v6, v6, v4, 0x7fff
+; GFX11-NEXT: v_cndmask_b32_e32 v1, v8, v3, vcc_lo
; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
-; GFX11-NEXT: v_cndmask_b32_e32 v3, v6, v9, vcc_lo
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX11-NEXT: v_perm_b32 v0, v1, v0, 0x7060302
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11-NEXT: v_cndmask_b32_e32 v3, v6, v5, vcc_lo
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_perm_b32 v1, v3, v2, 0x7060302
; GFX11-NEXT: s_setpc_b64 s[30:31]
%op = uitofp <4 x i64> %x to <4 x bfloat>
@@ -40088,8 +39259,7 @@ define bfloat @v_fma_bf16(bfloat %a, bfloat %b, bfloat %c) {
; GFX8-NEXT: v_bfe_u32 v1, v0, 16, 1
; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v0
; GFX8-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1
-; GFX8-NEXT: v_and_b32_e32 v2, 0xff800000, v0
-; GFX8-NEXT: v_or_b32_e32 v2, 0x400000, v2
+; GFX8-NEXT: v_or_b32_e32 v2, 0x400000, v0
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
; GFX8-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
@@ -40104,9 +39274,8 @@ define bfloat @v_fma_bf16(bfloat %a, bfloat %b, bfloat %c) {
; GFX9-NEXT: v_fma_f32 v0, v0, v1, v2
; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1
; GFX9-NEXT: s_movk_i32 s4, 0x7fff
-; GFX9-NEXT: v_and_b32_e32 v2, 0xff800000, v0
; GFX9-NEXT: v_add3_u32 v1, v1, v0, s4
-; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v2
+; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v0
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0
@@ -40118,10 +39287,9 @@ define bfloat @v_fma_bf16(bfloat %a, bfloat %b, bfloat %c) {
; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GFX10-NEXT: s_mov_b32 s4, 0xff800000
; GFX10-NEXT: v_fmac_f32_e32 v2, v0, v1
; GFX10-NEXT: v_bfe_u32 v0, v2, 16, 1
-; GFX10-NEXT: v_and_or_b32 v1, v2, s4, 0x400000
+; GFX10-NEXT: v_or_b32_e32 v1, 0x400000, v2
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
; GFX10-NEXT: v_add3_u32 v0, v0, v2, 0x7fff
; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo
@@ -40134,11 +39302,10 @@ define bfloat @v_fma_bf16(bfloat %a, bfloat %b, bfloat %c) {
; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GFX11-NEXT: s_mov_b32 s0, 0xff800000
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_fmac_f32_e32 v2, v0, v1
; GFX11-NEXT: v_bfe_u32 v0, v2, 16, 1
-; GFX11-NEXT: v_and_or_b32 v1, v2, s0, 0x400000
+; GFX11-NEXT: v_or_b32_e32 v1, 0x400000, v2
; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_add3_u32 v0, v0, v2, 0x7fff
@@ -40206,16 +39373,14 @@ define <2 x bfloat> @v_fma_v2bf16(<2 x bfloat> %a, <2 x bfloat> %b, <2 x bfloat>
; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4
-; GFX8-NEXT: v_and_b32_e32 v5, 0xff800000, v3
; GFX8-NEXT: v_fma_f32 v0, v0, v1, v2
-; GFX8-NEXT: v_or_b32_e32 v5, 0x400000, v5
+; GFX8-NEXT: v_or_b32_e32 v5, 0x400000, v3
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
; GFX8-NEXT: v_bfe_u32 v1, v0, 16, 1
; GFX8-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc
; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v0
; GFX8-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1
-; GFX8-NEXT: v_and_b32_e32 v2, 0xff800000, v0
-; GFX8-NEXT: v_or_b32_e32 v2, 0x400000, v2
+; GFX8-NEXT: v_or_b32_e32 v2, 0x400000, v0
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
; GFX8-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
@@ -40234,16 +39399,14 @@ define <2 x bfloat> @v_fma_v2bf16(<2 x bfloat> %a, <2 x bfloat> %b, <2 x bfloat>
; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX9-NEXT: v_bfe_u32 v4, v3, 16, 1
; GFX9-NEXT: s_movk_i32 s4, 0x7fff
-; GFX9-NEXT: v_and_b32_e32 v5, 0xff800000, v3
; GFX9-NEXT: v_fma_f32 v0, v0, v1, v2
; GFX9-NEXT: v_add3_u32 v4, v4, v3, s4
-; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v5
+; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v3
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1
-; GFX9-NEXT: v_and_b32_e32 v2, 0xff800000, v0
; GFX9-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc
; GFX9-NEXT: v_add3_u32 v1, v1, v0, s4
-; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v2
+; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v0
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
; GFX9-NEXT: s_mov_b32 s4, 0x7060302
@@ -40259,14 +39422,13 @@ define <2 x bfloat> @v_fma_v2bf16(<2 x bfloat> %a, <2 x bfloat> %b, <2 x bfloat>
; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GFX10-NEXT: s_mov_b32 s4, 0xff800000
; GFX10-NEXT: v_fmac_f32_e32 v3, v5, v4
; GFX10-NEXT: v_fmac_f32_e32 v2, v0, v1
; GFX10-NEXT: v_bfe_u32 v0, v3, 16, 1
-; GFX10-NEXT: v_and_or_b32 v4, v3, s4, 0x400000
+; GFX10-NEXT: v_or_b32_e32 v4, 0x400000, v3
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
; GFX10-NEXT: v_bfe_u32 v1, v2, 16, 1
-; GFX10-NEXT: v_and_or_b32 v5, v2, s4, 0x400000
+; GFX10-NEXT: v_or_b32_e32 v5, 0x400000, v2
; GFX10-NEXT: v_add3_u32 v0, v0, v3, 0x7fff
; GFX10-NEXT: v_add3_u32 v1, v1, v2, 0x7fff
; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc_lo
@@ -40284,15 +39446,14 @@ define <2 x bfloat> @v_fma_v2bf16(<2 x bfloat> %a, <2 x bfloat> %b, <2 x bfloat>
; GFX11-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v2
; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
-; GFX11-NEXT: s_mov_b32 s0, 0xff800000
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_dual_fmac_f32 v2, v0, v1 :: v_dual_fmac_f32 v3, v5, v4
; GFX11-NEXT: v_bfe_u32 v1, v2, 16, 1
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX11-NEXT: v_bfe_u32 v0, v3, 16, 1
-; GFX11-NEXT: v_and_or_b32 v4, v3, s0, 0x400000
+; GFX11-NEXT: v_or_b32_e32 v4, 0x400000, v3
; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
-; GFX11-NEXT: v_and_or_b32 v5, v2, s0, 0x400000
+; GFX11-NEXT: v_or_b32_e32 v5, 0x400000, v2
; GFX11-NEXT: v_add3_u32 v1, v1, v2, 0x7fff
; GFX11-NEXT: v_add3_u32 v0, v0, v3, 0x7fff
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_4)
@@ -40375,8 +39536,7 @@ define <3 x bfloat> @v_fma_v3bf16(<3 x bfloat> %a, <3 x bfloat> %b, <3 x bfloat>
; GFX8-NEXT: v_bfe_u32 v3, v1, 16, 1
; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v1
; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
-; GFX8-NEXT: v_and_b32_e32 v5, 0xff800000, v1
-; GFX8-NEXT: v_or_b32_e32 v5, 0x400000, v5
+; GFX8-NEXT: v_or_b32_e32 v5, 0x400000, v1
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
; GFX8-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc
; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v4
@@ -40390,16 +39550,14 @@ define <3 x bfloat> @v_fma_v3bf16(<3 x bfloat> %a, <3 x bfloat> %b, <3 x bfloat>
; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX8-NEXT: v_add_u32_e32 v5, vcc, s4, v5
-; GFX8-NEXT: v_and_b32_e32 v6, 0xff800000, v3
; GFX8-NEXT: v_fma_f32 v0, v0, v2, v4
-; GFX8-NEXT: v_or_b32_e32 v6, 0x400000, v6
+; GFX8-NEXT: v_or_b32_e32 v6, 0x400000, v3
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
; GFX8-NEXT: v_bfe_u32 v2, v0, 16, 1
; GFX8-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc
; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v0
; GFX8-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2
-; GFX8-NEXT: v_and_b32_e32 v4, 0xff800000, v0
-; GFX8-NEXT: v_or_b32_e32 v4, 0x400000, v4
+; GFX8-NEXT: v_or_b32_e32 v4, 0x400000, v0
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc
; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
@@ -40416,9 +39574,8 @@ define <3 x bfloat> @v_fma_v3bf16(<3 x bfloat> %a, <3 x bfloat> %b, <3 x bfloat>
; GFX9-NEXT: v_fma_f32 v1, v1, v3, v5
; GFX9-NEXT: v_bfe_u32 v3, v1, 16, 1
; GFX9-NEXT: s_movk_i32 s4, 0x7fff
-; GFX9-NEXT: v_and_b32_e32 v5, 0xff800000, v1
; GFX9-NEXT: v_add3_u32 v3, v3, v1, s4
-; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v5
+; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v1
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc
; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v4
@@ -40429,16 +39586,14 @@ define <3 x bfloat> @v_fma_v3bf16(<3 x bfloat> %a, <3 x bfloat> %b, <3 x bfloat>
; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX9-NEXT: v_bfe_u32 v5, v3, 16, 1
-; GFX9-NEXT: v_and_b32_e32 v6, 0xff800000, v3
; GFX9-NEXT: v_fma_f32 v0, v0, v2, v4
; GFX9-NEXT: v_add3_u32 v5, v5, v3, s4
-; GFX9-NEXT: v_or_b32_e32 v6, 0x400000, v6
+; GFX9-NEXT: v_or_b32_e32 v6, 0x400000, v3
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
; GFX9-NEXT: v_bfe_u32 v2, v0, 16, 1
-; GFX9-NEXT: v_and_b32_e32 v4, 0xff800000, v0
; GFX9-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc
; GFX9-NEXT: v_add3_u32 v2, v2, v0, s4
-; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v4
+; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v0
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc
; GFX9-NEXT: s_mov_b32 s4, 0x7060302
@@ -40460,16 +39615,15 @@ define <3 x bfloat> @v_fma_v3bf16(<3 x bfloat> %a, <3 x bfloat> %b, <3 x bfloat>
; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX10-NEXT: v_fmac_f32_e32 v6, v8, v7
; GFX10-NEXT: v_fmac_f32_e32 v5, v1, v3
-; GFX10-NEXT: s_mov_b32 s4, 0xff800000
; GFX10-NEXT: v_fmac_f32_e32 v4, v0, v2
; GFX10-NEXT: v_bfe_u32 v1, v6, 16, 1
-; GFX10-NEXT: v_and_or_b32 v3, v6, s4, 0x400000
+; GFX10-NEXT: v_or_b32_e32 v3, 0x400000, v6
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
; GFX10-NEXT: v_bfe_u32 v0, v5, 16, 1
; GFX10-NEXT: v_bfe_u32 v2, v4, 16, 1
; GFX10-NEXT: v_add3_u32 v1, v1, v6, 0x7fff
-; GFX10-NEXT: v_and_or_b32 v7, v4, s4, 0x400000
-; GFX10-NEXT: v_and_or_b32 v8, v5, s4, 0x400000
+; GFX10-NEXT: v_or_b32_e32 v7, 0x400000, v4
+; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v5
; GFX10-NEXT: v_add3_u32 v0, v0, v5, 0x7fff
; GFX10-NEXT: v_add3_u32 v2, v2, v4, 0x7fff
; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc_lo
@@ -40572,17 +39726,15 @@ define <4 x bfloat> @v_fma_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b, <4 x bfloat>
; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7
-; GFX8-NEXT: v_and_b32_e32 v8, 0xff800000, v6
; GFX8-NEXT: v_fma_f32 v1, v1, v3, v5
-; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v8
+; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v6
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
; GFX8-NEXT: v_bfe_u32 v3, v1, 16, 1
; GFX8-NEXT: s_movk_i32 s4, 0x7fff
; GFX8-NEXT: v_cndmask_b32_e32 v6, v7, v8, vcc
; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v1
; GFX8-NEXT: v_add_u32_e32 v3, vcc, s4, v3
-; GFX8-NEXT: v_and_b32_e32 v5, 0xff800000, v1
-; GFX8-NEXT: v_or_b32_e32 v5, 0x400000, v5
+; GFX8-NEXT: v_or_b32_e32 v5, 0x400000, v1
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
; GFX8-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc
; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v4
@@ -40595,16 +39747,14 @@ define <4 x bfloat> @v_fma_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b, <4 x bfloat>
; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX8-NEXT: v_add_u32_e32 v5, vcc, s4, v5
-; GFX8-NEXT: v_and_b32_e32 v7, 0xff800000, v3
; GFX8-NEXT: v_fma_f32 v0, v0, v2, v4
-; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v7
+; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v3
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
; GFX8-NEXT: v_bfe_u32 v2, v0, 16, 1
; GFX8-NEXT: v_cndmask_b32_e32 v3, v5, v7, vcc
; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v0
; GFX8-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2
-; GFX8-NEXT: v_and_b32_e32 v4, 0xff800000, v0
-; GFX8-NEXT: v_or_b32_e32 v4, 0x400000, v4
+; GFX8-NEXT: v_or_b32_e32 v4, 0x400000, v0
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc
; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1
@@ -40625,16 +39775,14 @@ define <4 x bfloat> @v_fma_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b, <4 x bfloat>
; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX9-NEXT: v_bfe_u32 v7, v6, 16, 1
; GFX9-NEXT: s_movk_i32 s4, 0x7fff
-; GFX9-NEXT: v_and_b32_e32 v8, 0xff800000, v6
; GFX9-NEXT: v_fma_f32 v1, v1, v3, v5
; GFX9-NEXT: v_add3_u32 v7, v7, v6, s4
-; GFX9-NEXT: v_or_b32_e32 v8, 0x400000, v8
+; GFX9-NEXT: v_or_b32_e32 v8, 0x400000, v6
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
; GFX9-NEXT: v_bfe_u32 v3, v1, 16, 1
-; GFX9-NEXT: v_and_b32_e32 v5, 0xff800000, v1
; GFX9-NEXT: v_cndmask_b32_e32 v6, v7, v8, vcc
; GFX9-NEXT: v_add3_u32 v3, v3, v1, s4
-; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v5
+; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v1
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc
; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v4
@@ -40645,16 +39793,14 @@ define <4 x bfloat> @v_fma_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b, <4 x bfloat>
; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX9-NEXT: v_bfe_u32 v5, v3, 16, 1
-; GFX9-NEXT: v_and_b32_e32 v7, 0xff800000, v3
; GFX9-NEXT: v_fma_f32 v0, v0, v2, v4
; GFX9-NEXT: v_add3_u32 v5, v5, v3, s4
-; GFX9-NEXT: v_or_b32_e32 v7, 0x400000, v7
+; GFX9-NEXT: v_or_b32_e32 v7, 0x400000, v3
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
; GFX9-NEXT: v_bfe_u32 v2, v0, 16, 1
-; GFX9-NEXT: v_and_b32_e32 v4, 0xff800000, v0
; GFX9-NEXT: v_cndmask_b32_e32 v3, v5, v7, vcc
; GFX9-NEXT: v_add3_u32 v2, v2, v0, s4
-; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v4
+; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v0
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc
; GFX9-NEXT: s_mov_b32 s4, 0x7060302
@@ -40681,22 +39827,21 @@ define <4 x bfloat> @v_fma_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b, <4 x bfloat>
; GFX10-NEXT: v_bfe_u32 v10, v6, 16, 1
; GFX10-NEXT: v_fmac_f32_e32 v5, v1, v3
; GFX10-NEXT: v_fmac_f32_e32 v7, v9, v8
-; GFX10-NEXT: s_mov_b32 s4, 0xff800000
+; GFX10-NEXT: v_or_b32_e32 v1, 0x400000, v6
; GFX10-NEXT: v_fmac_f32_e32 v4, v0, v2
; GFX10-NEXT: v_add3_u32 v0, v10, v6, 0x7fff
-; GFX10-NEXT: v_and_or_b32 v1, v6, s4, 0x400000
; GFX10-NEXT: v_bfe_u32 v2, v5, 16, 1
; GFX10-NEXT: v_bfe_u32 v3, v7, 16, 1
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
; GFX10-NEXT: v_bfe_u32 v8, v4, 16, 1
-; GFX10-NEXT: v_and_or_b32 v9, v5, s4, 0x400000
+; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v5
; GFX10-NEXT: v_cndmask_b32_e32 v1, v0, v1, vcc_lo
; GFX10-NEXT: v_add3_u32 v0, v2, v5, 0x7fff
; GFX10-NEXT: v_add3_u32 v2, v3, v7, 0x7fff
-; GFX10-NEXT: v_and_or_b32 v3, v7, s4, 0x400000
+; GFX10-NEXT: v_or_b32_e32 v3, 0x400000, v7
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
; GFX10-NEXT: v_add3_u32 v6, v8, v4, 0x7fff
-; GFX10-NEXT: v_and_or_b32 v8, v4, s4, 0x400000
+; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v4
; GFX10-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc_lo
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
; GFX10-NEXT: v_cndmask_b32_e32 v3, v6, v8, vcc_lo
@@ -40717,14 +39862,13 @@ define <4 x bfloat> @v_fma_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b, <4 x bfloat>
; GFX11-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
; GFX11-NEXT: v_lshlrev_b32_e32 v6, 16, v5
; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
-; GFX11-NEXT: s_mov_b32 s0, 0xff800000
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX11-NEXT: v_fmac_f32_e32 v5, v1, v3
; GFX11-NEXT: v_dual_fmac_f32 v6, v8, v7 :: v_dual_lshlrev_b32 v7, 16, v4
; GFX11-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_bfe_u32 v10, v6, 16, 1
-; GFX11-NEXT: v_and_or_b32 v1, v6, s0, 0x400000
+; GFX11-NEXT: v_or_b32_e32 v1, 0x400000, v6
; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
; GFX11-NEXT: v_lshlrev_b32_e32 v8, 16, v2
; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
@@ -40736,14 +39880,14 @@ define <4 x bfloat> @v_fma_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b, <4 x bfloat>
; GFX11-NEXT: v_fmac_f32_e32 v7, v9, v8
; GFX11-NEXT: v_bfe_u32 v8, v4, 16, 1
; GFX11-NEXT: v_add3_u32 v0, v2, v5, 0x7fff
-; GFX11-NEXT: v_and_or_b32 v9, v5, s0, 0x400000
+; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v5
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_4)
; GFX11-NEXT: v_bfe_u32 v3, v7, 16, 1
; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
; GFX11-NEXT: v_add3_u32 v6, v8, v4, 0x7fff
-; GFX11-NEXT: v_and_or_b32 v8, v4, s0, 0x400000
+; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v4
; GFX11-NEXT: v_add3_u32 v2, v3, v7, 0x7fff
-; GFX11-NEXT: v_and_or_b32 v3, v7, s0, 0x400000
+; GFX11-NEXT: v_or_b32_e32 v3, 0x400000, v7
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_3)
; GFX11-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc_lo
; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
@@ -40803,8 +39947,7 @@ define bfloat @v_fmuladd_bf16(bfloat %a, bfloat %b, bfloat %c) {
; GFX8-NEXT: v_bfe_u32 v1, v0, 16, 1
; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v0
; GFX8-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1
-; GFX8-NEXT: v_and_b32_e32 v3, 0xff800000, v0
-; GFX8-NEXT: v_or_b32_e32 v3, 0x400000, v3
+; GFX8-NEXT: v_or_b32_e32 v3, 0x400000, v0
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
; GFX8-NEXT: v_cndmask_b32_e32 v0, v1, v3, vcc
; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
@@ -40813,8 +39956,7 @@ define bfloat @v_fmuladd_bf16(bfloat %a, bfloat %b, bfloat %c) {
; GFX8-NEXT: v_bfe_u32 v1, v0, 16, 1
; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v0
; GFX8-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1
-; GFX8-NEXT: v_and_b32_e32 v2, 0xff800000, v0
-; GFX8-NEXT: v_or_b32_e32 v2, 0x400000, v2
+; GFX8-NEXT: v_or_b32_e32 v2, 0x400000, v0
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
; GFX8-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
@@ -40828,18 +39970,16 @@ define bfloat @v_fmuladd_bf16(bfloat %a, bfloat %b, bfloat %c) {
; GFX9-NEXT: v_mul_f32_e32 v0, v0, v1
; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1
; GFX9-NEXT: s_movk_i32 s4, 0x7fff
-; GFX9-NEXT: v_and_b32_e32 v3, 0xff800000, v0
; GFX9-NEXT: v_add3_u32 v1, v1, v0, s4
-; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v3
+; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v0
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v3, vcc
; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v2
; GFX9-NEXT: v_add_f32_e32 v0, v0, v1
; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1
-; GFX9-NEXT: v_and_b32_e32 v2, 0xff800000, v0
; GFX9-NEXT: v_add3_u32 v1, v1, v0, s4
-; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v2
+; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v0
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0
@@ -40850,10 +39990,9 @@ define bfloat @v_fmuladd_bf16(bfloat %a, bfloat %b, bfloat %c) {
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GFX10-NEXT: s_mov_b32 s4, 0xff800000
; GFX10-NEXT: v_mul_f32_e32 v0, v0, v1
; GFX10-NEXT: v_bfe_u32 v1, v0, 16, 1
-; GFX10-NEXT: v_and_or_b32 v3, v0, s4, 0x400000
+; GFX10-NEXT: v_or_b32_e32 v3, 0x400000, v0
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
; GFX10-NEXT: v_add3_u32 v1, v1, v0, 0x7fff
; GFX10-NEXT: v_cndmask_b32_e32 v0, v1, v3, vcc_lo
@@ -40861,7 +40000,7 @@ define bfloat @v_fmuladd_bf16(bfloat %a, bfloat %b, bfloat %c) {
; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX10-NEXT: v_add_f32_e32 v0, v0, v1
; GFX10-NEXT: v_bfe_u32 v1, v0, 16, 1
-; GFX10-NEXT: v_and_or_b32 v2, v0, s4, 0x400000
+; GFX10-NEXT: v_or_b32_e32 v2, 0x400000, v0
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
; GFX10-NEXT: v_add3_u32 v1, v1, v0, 0x7fff
; GFX10-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo
@@ -40873,11 +40012,10 @@ define bfloat @v_fmuladd_bf16(bfloat %a, bfloat %b, bfloat %c) {
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GFX11-NEXT: s_mov_b32 s0, 0xff800000
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_mul_f32_e32 v0, v0, v1
; GFX11-NEXT: v_bfe_u32 v1, v0, 16, 1
-; GFX11-NEXT: v_and_or_b32 v3, v0, s0, 0x400000
+; GFX11-NEXT: v_or_b32_e32 v3, 0x400000, v0
; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_add3_u32 v1, v1, v0, 0x7fff
@@ -40887,7 +40025,7 @@ define bfloat @v_fmuladd_bf16(bfloat %a, bfloat %b, bfloat %c) {
; GFX11-NEXT: v_add_f32_e32 v0, v0, v1
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
; GFX11-NEXT: v_bfe_u32 v1, v0, 16, 1
-; GFX11-NEXT: v_and_or_b32 v2, v0, s0, 0x400000
+; GFX11-NEXT: v_or_b32_e32 v2, 0x400000, v0
; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
; GFX11-NEXT: v_add3_u32 v1, v1, v0, 0x7fff
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
@@ -40958,8 +40096,7 @@ define <2 x bfloat> @v_fmuladd_v2bf16(<2 x bfloat> %a, <2 x bfloat> %b, <2 x bfl
; GFX8-NEXT: v_bfe_u32 v4, v3, 16, 1
; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v3
; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0x7fff, v4
-; GFX8-NEXT: v_and_b32_e32 v5, 0xff800000, v3
-; GFX8-NEXT: v_or_b32_e32 v5, 0x400000, v5
+; GFX8-NEXT: v_or_b32_e32 v5, 0x400000, v3
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
; GFX8-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc
; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
@@ -40971,16 +40108,14 @@ define <2 x bfloat> @v_fmuladd_v2bf16(<2 x bfloat> %a, <2 x bfloat> %b, <2 x bfl
; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX8-NEXT: v_add_u32_e32 v4, vcc, s4, v4
-; GFX8-NEXT: v_and_b32_e32 v5, 0xff800000, v3
; GFX8-NEXT: v_mul_f32_e32 v0, v0, v1
-; GFX8-NEXT: v_or_b32_e32 v5, 0x400000, v5
+; GFX8-NEXT: v_or_b32_e32 v5, 0x400000, v3
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
; GFX8-NEXT: v_bfe_u32 v1, v0, 16, 1
; GFX8-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc
; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v0
; GFX8-NEXT: v_add_u32_e32 v1, vcc, s4, v1
-; GFX8-NEXT: v_and_b32_e32 v4, 0xff800000, v0
-; GFX8-NEXT: v_or_b32_e32 v4, 0x400000, v4
+; GFX8-NEXT: v_or_b32_e32 v4, 0x400000, v0
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
; GFX8-NEXT: v_cndmask_b32_e32 v0, v1, v4, vcc
; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
@@ -40989,8 +40124,7 @@ define <2 x bfloat> @v_fmuladd_v2bf16(<2 x bfloat> %a, <2 x bfloat> %b, <2 x bfl
; GFX8-NEXT: v_bfe_u32 v1, v0, 16, 1
; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v0
; GFX8-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1
-; GFX8-NEXT: v_and_b32_e32 v2, 0xff800000, v0
-; GFX8-NEXT: v_or_b32_e32 v2, 0x400000, v2
+; GFX8-NEXT: v_or_b32_e32 v2, 0x400000, v0
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
; GFX8-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
@@ -41005,36 +40139,32 @@ define <2 x bfloat> @v_fmuladd_v2bf16(<2 x bfloat> %a, <2 x bfloat> %b, <2 x bfl
; GFX9-NEXT: v_mul_f32_e32 v3, v4, v3
; GFX9-NEXT: v_bfe_u32 v4, v3, 16, 1
; GFX9-NEXT: s_movk_i32 s4, 0x7fff
-; GFX9-NEXT: v_and_b32_e32 v5, 0xff800000, v3
; GFX9-NEXT: v_add3_u32 v4, v4, v3, s4
-; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v5
+; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v3
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
; GFX9-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc
; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
; GFX9-NEXT: v_lshlrev_b32_e32 v4, 16, v2
; GFX9-NEXT: v_add_f32_e32 v3, v3, v4
-; GFX9-NEXT: v_bfe_u32 v4, v3, 16, 1
-; GFX9-NEXT: v_and_b32_e32 v5, 0xff800000, v3
; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX9-NEXT: v_bfe_u32 v4, v3, 16, 1
+; GFX9-NEXT: v_mul_f32_e32 v0, v0, v1
; GFX9-NEXT: v_add3_u32 v4, v4, v3, s4
-; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v5
+; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v3
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
-; GFX9-NEXT: v_mul_f32_e32 v0, v0, v1
-; GFX9-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc
; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1
-; GFX9-NEXT: v_and_b32_e32 v4, 0xff800000, v0
+; GFX9-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc
; GFX9-NEXT: v_add3_u32 v1, v1, v0, s4
-; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v4
+; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v0
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v4, vcc
; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v2
; GFX9-NEXT: v_add_f32_e32 v0, v0, v1
; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1
-; GFX9-NEXT: v_and_b32_e32 v2, 0xff800000, v0
; GFX9-NEXT: v_add3_u32 v1, v1, v0, s4
-; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v2
+; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v0
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
; GFX9-NEXT: s_mov_b32 s4, 0x7060302
@@ -41048,14 +40178,13 @@ define <2 x bfloat> @v_fmuladd_v2bf16(<2 x bfloat> %a, <2 x bfloat> %b, <2 x bfl
; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v0
; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GFX10-NEXT: s_mov_b32 s4, 0xff800000
; GFX10-NEXT: v_mul_f32_e32 v3, v4, v3
; GFX10-NEXT: v_mul_f32_e32 v0, v0, v1
; GFX10-NEXT: v_bfe_u32 v1, v3, 16, 1
-; GFX10-NEXT: v_and_or_b32 v5, v3, s4, 0x400000
+; GFX10-NEXT: v_or_b32_e32 v5, 0x400000, v3
; GFX10-NEXT: v_bfe_u32 v4, v0, 16, 1
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
-; GFX10-NEXT: v_and_or_b32 v6, v0, s4, 0x400000
+; GFX10-NEXT: v_or_b32_e32 v6, 0x400000, v0
; GFX10-NEXT: v_add3_u32 v1, v1, v3, 0x7fff
; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v2
; GFX10-NEXT: v_add3_u32 v4, v4, v0, 0x7fff
@@ -41066,13 +40195,13 @@ define <2 x bfloat> @v_fmuladd_v2bf16(<2 x bfloat> %a, <2 x bfloat> %b, <2 x bfl
; GFX10-NEXT: v_cndmask_b32_e32 v0, v4, v6, vcc_lo
; GFX10-NEXT: v_add_f32_e32 v1, v1, v3
; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GFX10-NEXT: v_and_or_b32 v4, v1, s4, 0x400000
+; GFX10-NEXT: v_or_b32_e32 v4, 0x400000, v1
; GFX10-NEXT: v_add_f32_e32 v0, v0, v2
; GFX10-NEXT: v_bfe_u32 v2, v1, 16, 1
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
; GFX10-NEXT: v_bfe_u32 v3, v0, 16, 1
; GFX10-NEXT: v_add3_u32 v2, v2, v1, 0x7fff
-; GFX10-NEXT: v_and_or_b32 v5, v0, s4, 0x400000
+; GFX10-NEXT: v_or_b32_e32 v5, 0x400000, v0
; GFX10-NEXT: v_add3_u32 v3, v3, v0, 0x7fff
; GFX10-NEXT: v_cndmask_b32_e32 v1, v2, v4, vcc_lo
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
@@ -41086,19 +40215,17 @@ define <2 x bfloat> @v_fmuladd_v2bf16(<2 x bfloat> %a, <2 x bfloat> %b, <2 x bfl
; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v1
; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v0
; GFX11-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; GFX11-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GFX11-NEXT: s_mov_b32 s0, 0xff800000
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_mul_f32_e32 v3, v4, v3
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_dual_mul_f32 v3, v4, v3 :: v_dual_and_b32 v0, 0xffff0000, v0
; GFX11-NEXT: v_mul_f32_e32 v0, v0, v1
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
; GFX11-NEXT: v_bfe_u32 v1, v3, 16, 1
-; GFX11-NEXT: v_and_or_b32 v5, v3, s0, 0x400000
+; GFX11-NEXT: v_or_b32_e32 v5, 0x400000, v3
; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
; GFX11-NEXT: v_add3_u32 v1, v1, v3, 0x7fff
; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v2
; GFX11-NEXT: v_bfe_u32 v4, v0, 16, 1
-; GFX11-NEXT: v_and_or_b32 v6, v0, s0, 0x400000
+; GFX11-NEXT: v_or_b32_e32 v6, 0x400000, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
; GFX11-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc_lo
; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
@@ -41109,7 +40236,7 @@ define <2 x bfloat> @v_fmuladd_v2bf16(<2 x bfloat> %a, <2 x bfloat> %b, <2 x bfl
; GFX11-NEXT: v_dual_cndmask_b32 v0, v4, v6 :: v_dual_add_f32 v1, v1, v3
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GFX11-NEXT: v_and_or_b32 v4, v1, s0, 0x400000
+; GFX11-NEXT: v_or_b32_e32 v4, 0x400000, v1
; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_add_f32_e32 v0, v0, v2
@@ -41117,7 +40244,7 @@ define <2 x bfloat> @v_fmuladd_v2bf16(<2 x bfloat> %a, <2 x bfloat> %b, <2 x bfl
; GFX11-NEXT: v_bfe_u32 v3, v0, 16, 1
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
; GFX11-NEXT: v_add3_u32 v2, v2, v1, 0x7fff
-; GFX11-NEXT: v_and_or_b32 v5, v0, s0, 0x400000
+; GFX11-NEXT: v_or_b32_e32 v5, 0x400000, v0
; GFX11-NEXT: v_add3_u32 v3, v3, v0, 0x7fff
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
; GFX11-NEXT: v_cndmask_b32_e32 v1, v2, v4, vcc_lo
@@ -41210,8 +40337,7 @@ define <3 x bfloat> @v_fmuladd_v3bf16(<3 x bfloat> %a, <3 x bfloat> %b, <3 x bfl
; GFX8-NEXT: v_bfe_u32 v3, v1, 16, 1
; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v1
; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fff, v3
-; GFX8-NEXT: v_and_b32_e32 v6, 0xff800000, v1
-; GFX8-NEXT: v_or_b32_e32 v6, 0x400000, v6
+; GFX8-NEXT: v_or_b32_e32 v6, 0x400000, v1
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
; GFX8-NEXT: v_cndmask_b32_e32 v1, v3, v6, vcc
; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
@@ -41221,8 +40347,7 @@ define <3 x bfloat> @v_fmuladd_v3bf16(<3 x bfloat> %a, <3 x bfloat> %b, <3 x bfl
; GFX8-NEXT: s_movk_i32 s4, 0x7fff
; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v1
; GFX8-NEXT: v_add_u32_e32 v3, vcc, s4, v3
-; GFX8-NEXT: v_and_b32_e32 v5, 0xff800000, v1
-; GFX8-NEXT: v_or_b32_e32 v5, 0x400000, v5
+; GFX8-NEXT: v_or_b32_e32 v5, 0x400000, v1
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
; GFX8-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc
; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v2
@@ -41231,8 +40356,7 @@ define <3 x bfloat> @v_fmuladd_v3bf16(<3 x bfloat> %a, <3 x bfloat> %b, <3 x bfl
; GFX8-NEXT: v_bfe_u32 v5, v3, 16, 1
; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v3
; GFX8-NEXT: v_add_u32_e32 v5, vcc, s4, v5
-; GFX8-NEXT: v_and_b32_e32 v6, 0xff800000, v3
-; GFX8-NEXT: v_or_b32_e32 v6, 0x400000, v6
+; GFX8-NEXT: v_or_b32_e32 v6, 0x400000, v3
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
; GFX8-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc
; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
@@ -41243,16 +40367,14 @@ define <3 x bfloat> @v_fmuladd_v3bf16(<3 x bfloat> %a, <3 x bfloat> %b, <3 x bfl
; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX8-NEXT: v_add_u32_e32 v5, vcc, s4, v5
-; GFX8-NEXT: v_and_b32_e32 v6, 0xff800000, v3
; GFX8-NEXT: v_mul_f32_e32 v0, v0, v2
-; GFX8-NEXT: v_or_b32_e32 v6, 0x400000, v6
+; GFX8-NEXT: v_or_b32_e32 v6, 0x400000, v3
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
; GFX8-NEXT: v_bfe_u32 v2, v0, 16, 1
; GFX8-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc
; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v0
; GFX8-NEXT: v_add_u32_e32 v2, vcc, s4, v2
-; GFX8-NEXT: v_and_b32_e32 v5, 0xff800000, v0
-; GFX8-NEXT: v_or_b32_e32 v5, 0x400000, v5
+; GFX8-NEXT: v_or_b32_e32 v5, 0x400000, v0
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v5, vcc
; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
@@ -41261,8 +40383,7 @@ define <3 x bfloat> @v_fmuladd_v3bf16(<3 x bfloat> %a, <3 x bfloat> %b, <3 x bfl
; GFX8-NEXT: v_bfe_u32 v2, v0, 16, 1
; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v0
; GFX8-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2
-; GFX8-NEXT: v_and_b32_e32 v4, 0xff800000, v0
-; GFX8-NEXT: v_or_b32_e32 v4, 0x400000, v4
+; GFX8-NEXT: v_or_b32_e32 v4, 0x400000, v0
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc
; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
@@ -41278,54 +40399,48 @@ define <3 x bfloat> @v_fmuladd_v3bf16(<3 x bfloat> %a, <3 x bfloat> %b, <3 x bfl
; GFX9-NEXT: v_mul_f32_e32 v1, v1, v3
; GFX9-NEXT: v_bfe_u32 v3, v1, 16, 1
; GFX9-NEXT: s_movk_i32 s4, 0x7fff
-; GFX9-NEXT: v_and_b32_e32 v6, 0xff800000, v1
; GFX9-NEXT: v_add3_u32 v3, v3, v1, s4
-; GFX9-NEXT: v_or_b32_e32 v6, 0x400000, v6
+; GFX9-NEXT: v_or_b32_e32 v6, 0x400000, v1
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v6, vcc
; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v5
; GFX9-NEXT: v_add_f32_e32 v1, v1, v3
; GFX9-NEXT: v_bfe_u32 v3, v1, 16, 1
-; GFX9-NEXT: v_and_b32_e32 v5, 0xff800000, v1
; GFX9-NEXT: v_add3_u32 v3, v3, v1, s4
-; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v5
+; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v1
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc
; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v2
; GFX9-NEXT: v_lshlrev_b32_e32 v5, 16, v0
; GFX9-NEXT: v_mul_f32_e32 v3, v5, v3
; GFX9-NEXT: v_bfe_u32 v5, v3, 16, 1
-; GFX9-NEXT: v_and_b32_e32 v6, 0xff800000, v3
; GFX9-NEXT: v_add3_u32 v5, v5, v3, s4
-; GFX9-NEXT: v_or_b32_e32 v6, 0x400000, v6
+; GFX9-NEXT: v_or_b32_e32 v6, 0x400000, v3
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
; GFX9-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc
; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
; GFX9-NEXT: v_lshlrev_b32_e32 v5, 16, v4
; GFX9-NEXT: v_add_f32_e32 v3, v3, v5
-; GFX9-NEXT: v_bfe_u32 v5, v3, 16, 1
-; GFX9-NEXT: v_and_b32_e32 v6, 0xff800000, v3
; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX9-NEXT: v_bfe_u32 v5, v3, 16, 1
+; GFX9-NEXT: v_mul_f32_e32 v0, v0, v2
; GFX9-NEXT: v_add3_u32 v5, v5, v3, s4
-; GFX9-NEXT: v_or_b32_e32 v6, 0x400000, v6
+; GFX9-NEXT: v_or_b32_e32 v6, 0x400000, v3
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
-; GFX9-NEXT: v_mul_f32_e32 v0, v0, v2
-; GFX9-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc
; GFX9-NEXT: v_bfe_u32 v2, v0, 16, 1
-; GFX9-NEXT: v_and_b32_e32 v5, 0xff800000, v0
+; GFX9-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc
; GFX9-NEXT: v_add3_u32 v2, v2, v0, s4
-; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v5
+; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v0
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v5, vcc
; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v4
; GFX9-NEXT: v_add_f32_e32 v0, v0, v2
; GFX9-NEXT: v_bfe_u32 v2, v0, 16, 1
-; GFX9-NEXT: v_and_b32_e32 v4, 0xff800000, v0
; GFX9-NEXT: v_add3_u32 v2, v2, v0, s4
-; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v4
+; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v0
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc
; GFX9-NEXT: s_mov_b32 s4, 0x7060302
@@ -41343,41 +40458,40 @@ define <3 x bfloat> @v_fmuladd_v3bf16(<3 x bfloat> %a, <3 x bfloat> %b, <3 x bfl
; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX10-NEXT: v_mul_f32_e32 v1, v1, v3
-; GFX10-NEXT: s_mov_b32 s4, 0xff800000
; GFX10-NEXT: v_mul_f32_e32 v3, v7, v6
; GFX10-NEXT: v_mul_f32_e32 v0, v0, v2
; GFX10-NEXT: v_bfe_u32 v2, v1, 16, 1
-; GFX10-NEXT: v_and_or_b32 v8, v1, s4, 0x400000
-; GFX10-NEXT: v_bfe_u32 v6, v3, 16, 1
+; GFX10-NEXT: v_or_b32_e32 v6, 0x400000, v1
+; GFX10-NEXT: v_bfe_u32 v7, v3, 16, 1
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
-; GFX10-NEXT: v_bfe_u32 v7, v0, 16, 1
+; GFX10-NEXT: v_bfe_u32 v8, v0, 16, 1
; GFX10-NEXT: v_add3_u32 v2, v2, v1, 0x7fff
-; GFX10-NEXT: v_and_or_b32 v9, v3, s4, 0x400000
-; GFX10-NEXT: v_add3_u32 v6, v6, v3, 0x7fff
-; GFX10-NEXT: v_and_or_b32 v10, v0, s4, 0x400000
-; GFX10-NEXT: v_add3_u32 v7, v7, v0, 0x7fff
-; GFX10-NEXT: v_cndmask_b32_e32 v1, v2, v8, vcc_lo
+; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v3
+; GFX10-NEXT: v_add3_u32 v7, v7, v3, 0x7fff
+; GFX10-NEXT: v_or_b32_e32 v10, 0x400000, v0
+; GFX10-NEXT: v_add3_u32 v8, v8, v0, 0x7fff
+; GFX10-NEXT: v_cndmask_b32_e32 v1, v2, v6, vcc_lo
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v5
; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v4
; GFX10-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; GFX10-NEXT: v_cndmask_b32_e32 v2, v6, v9, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e32 v2, v7, v9, vcc_lo
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
; GFX10-NEXT: v_add_f32_e32 v1, v1, v3
; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
-; GFX10-NEXT: v_cndmask_b32_e32 v0, v7, v10, vcc_lo
-; GFX10-NEXT: v_and_or_b32 v6, v1, s4, 0x400000
+; GFX10-NEXT: v_cndmask_b32_e32 v0, v8, v10, vcc_lo
+; GFX10-NEXT: v_or_b32_e32 v6, 0x400000, v1
; GFX10-NEXT: v_add_f32_e32 v2, v2, v5
; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX10-NEXT: v_bfe_u32 v3, v2, 16, 1
; GFX10-NEXT: v_add_f32_e32 v0, v0, v4
-; GFX10-NEXT: v_and_or_b32 v7, v2, s4, 0x400000
+; GFX10-NEXT: v_or_b32_e32 v7, 0x400000, v2
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
; GFX10-NEXT: v_bfe_u32 v4, v1, 16, 1
; GFX10-NEXT: v_add3_u32 v3, v3, v2, 0x7fff
; GFX10-NEXT: v_bfe_u32 v5, v0, 16, 1
-; GFX10-NEXT: v_and_or_b32 v8, v0, s4, 0x400000
+; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v0
; GFX10-NEXT: v_add3_u32 v4, v4, v1, 0x7fff
; GFX10-NEXT: v_cndmask_b32_e32 v2, v3, v7, vcc_lo
; GFX10-NEXT: v_add3_u32 v5, v5, v0, 0x7fff
@@ -41492,8 +40606,7 @@ define <4 x bfloat> @v_fmuladd_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b, <4 x bfl
; GFX8-NEXT: v_bfe_u32 v7, v6, 16, 1
; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v6
; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7
-; GFX8-NEXT: v_and_b32_e32 v8, 0xff800000, v6
-; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v8
+; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v6
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
; GFX8-NEXT: v_cndmask_b32_e32 v6, v7, v8, vcc
; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
@@ -41505,16 +40618,14 @@ define <4 x bfloat> @v_fmuladd_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b, <4 x bfl
; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX8-NEXT: v_add_u32_e32 v7, vcc, s4, v7
-; GFX8-NEXT: v_and_b32_e32 v8, 0xff800000, v6
; GFX8-NEXT: v_mul_f32_e32 v1, v1, v3
-; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v8
+; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v6
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
; GFX8-NEXT: v_bfe_u32 v3, v1, 16, 1
; GFX8-NEXT: v_cndmask_b32_e32 v6, v7, v8, vcc
; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v1
; GFX8-NEXT: v_add_u32_e32 v3, vcc, s4, v3
-; GFX8-NEXT: v_and_b32_e32 v7, 0xff800000, v1
-; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v7
+; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v1
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
; GFX8-NEXT: v_cndmask_b32_e32 v1, v3, v7, vcc
; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
@@ -41523,8 +40634,7 @@ define <4 x bfloat> @v_fmuladd_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b, <4 x bfl
; GFX8-NEXT: v_bfe_u32 v3, v1, 16, 1
; GFX8-NEXT: v_add_u32_e32 v3, vcc, v3, v1
; GFX8-NEXT: v_add_u32_e32 v3, vcc, s4, v3
-; GFX8-NEXT: v_and_b32_e32 v5, 0xff800000, v1
-; GFX8-NEXT: v_or_b32_e32 v5, 0x400000, v5
+; GFX8-NEXT: v_or_b32_e32 v5, 0x400000, v1
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
; GFX8-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc
; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v2
@@ -41533,8 +40643,7 @@ define <4 x bfloat> @v_fmuladd_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b, <4 x bfl
; GFX8-NEXT: v_bfe_u32 v5, v3, 16, 1
; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v3
; GFX8-NEXT: v_add_u32_e32 v5, vcc, s4, v5
-; GFX8-NEXT: v_and_b32_e32 v7, 0xff800000, v3
-; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v7
+; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v3
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
; GFX8-NEXT: v_cndmask_b32_e32 v3, v5, v7, vcc
; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
@@ -41545,16 +40654,14 @@ define <4 x bfloat> @v_fmuladd_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b, <4 x bfl
; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX8-NEXT: v_add_u32_e32 v5, vcc, s4, v5
-; GFX8-NEXT: v_and_b32_e32 v7, 0xff800000, v3
; GFX8-NEXT: v_mul_f32_e32 v0, v0, v2
-; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v7
+; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v3
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
; GFX8-NEXT: v_bfe_u32 v2, v0, 16, 1
; GFX8-NEXT: v_cndmask_b32_e32 v3, v5, v7, vcc
; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v0
; GFX8-NEXT: v_add_u32_e32 v2, vcc, s4, v2
-; GFX8-NEXT: v_and_b32_e32 v5, 0xff800000, v0
-; GFX8-NEXT: v_or_b32_e32 v5, 0x400000, v5
+; GFX8-NEXT: v_or_b32_e32 v5, 0x400000, v0
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v5, vcc
; GFX8-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
@@ -41563,8 +40670,7 @@ define <4 x bfloat> @v_fmuladd_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b, <4 x bfl
; GFX8-NEXT: v_bfe_u32 v2, v0, 16, 1
; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v0
; GFX8-NEXT: v_add_u32_e32 v2, vcc, 0x7fff, v2
-; GFX8-NEXT: v_and_b32_e32 v4, 0xff800000, v0
-; GFX8-NEXT: v_or_b32_e32 v4, 0x400000, v4
+; GFX8-NEXT: v_or_b32_e32 v4, 0x400000, v0
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc
; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1
@@ -41581,72 +40687,64 @@ define <4 x bfloat> @v_fmuladd_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b, <4 x bfl
; GFX9-NEXT: v_mul_f32_e32 v6, v7, v6
; GFX9-NEXT: v_bfe_u32 v7, v6, 16, 1
; GFX9-NEXT: s_movk_i32 s4, 0x7fff
-; GFX9-NEXT: v_and_b32_e32 v8, 0xff800000, v6
; GFX9-NEXT: v_add3_u32 v7, v7, v6, s4
-; GFX9-NEXT: v_or_b32_e32 v8, 0x400000, v8
+; GFX9-NEXT: v_or_b32_e32 v8, 0x400000, v6
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
; GFX9-NEXT: v_cndmask_b32_e32 v6, v7, v8, vcc
; GFX9-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
; GFX9-NEXT: v_lshlrev_b32_e32 v7, 16, v5
; GFX9-NEXT: v_add_f32_e32 v6, v6, v7
-; GFX9-NEXT: v_bfe_u32 v7, v6, 16, 1
-; GFX9-NEXT: v_and_b32_e32 v8, 0xff800000, v6
; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GFX9-NEXT: v_bfe_u32 v7, v6, 16, 1
+; GFX9-NEXT: v_mul_f32_e32 v1, v1, v3
; GFX9-NEXT: v_add3_u32 v7, v7, v6, s4
-; GFX9-NEXT: v_or_b32_e32 v8, 0x400000, v8
+; GFX9-NEXT: v_or_b32_e32 v8, 0x400000, v6
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
-; GFX9-NEXT: v_mul_f32_e32 v1, v1, v3
-; GFX9-NEXT: v_cndmask_b32_e32 v6, v7, v8, vcc
; GFX9-NEXT: v_bfe_u32 v3, v1, 16, 1
-; GFX9-NEXT: v_and_b32_e32 v7, 0xff800000, v1
+; GFX9-NEXT: v_cndmask_b32_e32 v6, v7, v8, vcc
; GFX9-NEXT: v_add3_u32 v3, v3, v1, s4
-; GFX9-NEXT: v_or_b32_e32 v7, 0x400000, v7
+; GFX9-NEXT: v_or_b32_e32 v7, 0x400000, v1
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v7, vcc
; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v5
; GFX9-NEXT: v_add_f32_e32 v1, v1, v3
; GFX9-NEXT: v_bfe_u32 v3, v1, 16, 1
-; GFX9-NEXT: v_and_b32_e32 v5, 0xff800000, v1
; GFX9-NEXT: v_add3_u32 v3, v3, v1, s4
-; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v5
+; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v1
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc
; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v2
; GFX9-NEXT: v_lshlrev_b32_e32 v5, 16, v0
; GFX9-NEXT: v_mul_f32_e32 v3, v5, v3
; GFX9-NEXT: v_bfe_u32 v5, v3, 16, 1
-; GFX9-NEXT: v_and_b32_e32 v7, 0xff800000, v3
; GFX9-NEXT: v_add3_u32 v5, v5, v3, s4
-; GFX9-NEXT: v_or_b32_e32 v7, 0x400000, v7
+; GFX9-NEXT: v_or_b32_e32 v7, 0x400000, v3
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
; GFX9-NEXT: v_cndmask_b32_e32 v3, v5, v7, vcc
; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
; GFX9-NEXT: v_lshlrev_b32_e32 v5, 16, v4
; GFX9-NEXT: v_add_f32_e32 v3, v3, v5
-; GFX9-NEXT: v_bfe_u32 v5, v3, 16, 1
-; GFX9-NEXT: v_and_b32_e32 v7, 0xff800000, v3
; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX9-NEXT: v_bfe_u32 v5, v3, 16, 1
+; GFX9-NEXT: v_mul_f32_e32 v0, v0, v2
; GFX9-NEXT: v_add3_u32 v5, v5, v3, s4
-; GFX9-NEXT: v_or_b32_e32 v7, 0x400000, v7
+; GFX9-NEXT: v_or_b32_e32 v7, 0x400000, v3
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
-; GFX9-NEXT: v_mul_f32_e32 v0, v0, v2
-; GFX9-NEXT: v_cndmask_b32_e32 v3, v5, v7, vcc
; GFX9-NEXT: v_bfe_u32 v2, v0, 16, 1
-; GFX9-NEXT: v_and_b32_e32 v5, 0xff800000, v0
+; GFX9-NEXT: v_cndmask_b32_e32 v3, v5, v7, vcc
; GFX9-NEXT: v_add3_u32 v2, v2, v0, s4
-; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v5
+; GFX9-NEXT: v_or_b32_e32 v5, 0x400000, v0
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v5, vcc
; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v4
; GFX9-NEXT: v_add_f32_e32 v0, v0, v2
; GFX9-NEXT: v_bfe_u32 v2, v0, 16, 1
-; GFX9-NEXT: v_and_b32_e32 v4, 0xff800000, v0
; GFX9-NEXT: v_add3_u32 v2, v2, v0, s4
-; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v4
+; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v0
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc
; GFX9-NEXT: s_mov_b32 s4, 0x7060302
@@ -41667,45 +40765,44 @@ define <4 x bfloat> @v_fmuladd_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b, <4 x bfl
; GFX10-NEXT: v_lshlrev_b32_e32 v7, 16, v2
; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX10-NEXT: v_mul_f32_e32 v1, v1, v3
-; GFX10-NEXT: s_mov_b32 s4, 0xff800000
+; GFX10-NEXT: v_lshlrev_b32_e32 v8, 16, v5
; GFX10-NEXT: v_bfe_u32 v10, v6, 16, 1
-; GFX10-NEXT: v_and_or_b32 v3, v6, s4, 0x400000
+; GFX10-NEXT: v_or_b32_e32 v3, 0x400000, v6
; GFX10-NEXT: v_mul_f32_e32 v7, v9, v7
; GFX10-NEXT: v_mul_f32_e32 v0, v0, v2
; GFX10-NEXT: v_bfe_u32 v2, v1, 16, 1
; GFX10-NEXT: v_add3_u32 v10, v10, v6, 0x7fff
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
-; GFX10-NEXT: v_and_or_b32 v6, v1, s4, 0x400000
+; GFX10-NEXT: v_or_b32_e32 v6, 0x400000, v1
; GFX10-NEXT: v_bfe_u32 v9, v7, 16, 1
; GFX10-NEXT: v_add3_u32 v2, v2, v1, 0x7fff
; GFX10-NEXT: v_bfe_u32 v11, v0, 16, 1
; GFX10-NEXT: v_cndmask_b32_e32 v3, v10, v3, vcc_lo
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
-; GFX10-NEXT: v_and_or_b32 v10, v7, s4, 0x400000
+; GFX10-NEXT: v_or_b32_e32 v10, 0x400000, v7
; GFX10-NEXT: v_add3_u32 v9, v9, v7, 0x7fff
-; GFX10-NEXT: v_lshlrev_b32_e32 v8, 16, v5
+; GFX10-NEXT: v_or_b32_e32 v12, 0x400000, v0
; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
; GFX10-NEXT: v_cndmask_b32_e32 v1, v2, v6, vcc_lo
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
-; GFX10-NEXT: v_and_or_b32 v12, v0, s4, 0x400000
; GFX10-NEXT: v_add3_u32 v11, v11, v0, 0x7fff
+; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
; GFX10-NEXT: v_add_f32_e32 v3, v3, v8
; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX10-NEXT: v_cndmask_b32_e32 v2, v9, v10, vcc_lo
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
; GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v4
; GFX10-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
+; GFX10-NEXT: v_bfe_u32 v7, v3, 16, 1
; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX10-NEXT: v_cndmask_b32_e32 v0, v11, v12, vcc_lo
-; GFX10-NEXT: v_bfe_u32 v7, v3, 16, 1
; GFX10-NEXT: v_add_f32_e32 v1, v1, v5
-; GFX10-NEXT: v_and_or_b32 v5, v3, s4, 0x400000
+; GFX10-NEXT: v_or_b32_e32 v5, 0x400000, v3
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
; GFX10-NEXT: v_add_f32_e32 v2, v2, v6
; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
; GFX10-NEXT: v_bfe_u32 v6, v1, 16, 1
-; GFX10-NEXT: v_and_or_b32 v9, v1, s4, 0x400000
+; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v1
; GFX10-NEXT: v_add_f32_e32 v0, v0, v4
; GFX10-NEXT: v_add3_u32 v4, v7, v3, 0x7fff
; GFX10-NEXT: v_bfe_u32 v7, v2, 16, 1
@@ -41713,10 +40810,10 @@ define <4 x bfloat> @v_fmuladd_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b, <4 x bfl
; GFX10-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc_lo
; GFX10-NEXT: v_add3_u32 v4, v6, v1, 0x7fff
; GFX10-NEXT: v_add3_u32 v5, v7, v2, 0x7fff
-; GFX10-NEXT: v_and_or_b32 v6, v2, s4, 0x400000
+; GFX10-NEXT: v_or_b32_e32 v6, 0x400000, v2
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
; GFX10-NEXT: v_add3_u32 v7, v8, v0, 0x7fff
-; GFX10-NEXT: v_and_or_b32 v8, v0, s4, 0x400000
+; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v0
; GFX10-NEXT: v_cndmask_b32_e32 v2, v5, v6, vcc_lo
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
; GFX10-NEXT: v_cndmask_b32_e32 v0, v7, v8, vcc_lo
@@ -41736,7 +40833,6 @@ define <4 x bfloat> @v_fmuladd_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b, <4 x bfl
; GFX11-NEXT: v_lshlrev_b32_e32 v8, 16, v5
; GFX11-NEXT: v_lshlrev_b32_e32 v6, 16, v3
; GFX11-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
-; GFX11-NEXT: s_mov_b32 s0, 0xff800000
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
; GFX11-NEXT: v_dual_mul_f32 v6, v7, v6 :: v_dual_and_b32 v5, 0xffff0000, v5
; GFX11-NEXT: v_lshlrev_b32_e32 v7, 16, v2
@@ -41744,20 +40840,20 @@ define <4 x bfloat> @v_fmuladd_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b, <4 x bfl
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX11-NEXT: v_bfe_u32 v10, v6, 16, 1
; GFX11-NEXT: v_mul_f32_e32 v7, v9, v7
-; GFX11-NEXT: v_and_or_b32 v3, v6, s0, 0x400000
+; GFX11-NEXT: v_or_b32_e32 v3, 0x400000, v6
; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_3)
; GFX11-NEXT: v_add3_u32 v10, v10, v6, 0x7fff
-; GFX11-NEXT: v_and_or_b32 v6, v1, s0, 0x400000
+; GFX11-NEXT: v_or_b32_e32 v6, 0x400000, v1
; GFX11-NEXT: v_bfe_u32 v9, v7, 16, 1
; GFX11-NEXT: v_dual_cndmask_b32 v3, v10, v3 :: v_dual_mul_f32 v0, v0, v2
; GFX11-NEXT: v_bfe_u32 v2, v1, 16, 1
; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
-; GFX11-NEXT: v_and_or_b32 v10, v7, s0, 0x400000
+; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v7
; GFX11-NEXT: v_add3_u32 v9, v9, v7, 0x7fff
; GFX11-NEXT: v_bfe_u32 v11, v0, 16, 1
; GFX11-NEXT: v_add3_u32 v2, v2, v1, 0x7fff
-; GFX11-NEXT: v_and_or_b32 v12, v0, s0, 0x400000
+; GFX11-NEXT: v_or_b32_e32 v12, 0x400000, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX11-NEXT: v_add3_u32 v11, v11, v0, 0x7fff
; GFX11-NEXT: v_dual_cndmask_b32 v1, v2, v6 :: v_dual_lshlrev_b32 v6, 16, v4
@@ -41769,7 +40865,7 @@ define <4 x bfloat> @v_fmuladd_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b, <4 x bfl
; GFX11-NEXT: v_dual_add_f32 v1, v1, v5 :: v_dual_and_b32 v2, 0xffff0000, v2
; GFX11-NEXT: v_cndmask_b32_e32 v0, v11, v12, vcc_lo
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_and_or_b32 v9, v1, s0, 0x400000
+; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v1
; GFX11-NEXT: v_add_f32_e32 v2, v2, v6
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3)
; GFX11-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
@@ -41781,7 +40877,7 @@ define <4 x bfloat> @v_fmuladd_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b, <4 x bfl
; GFX11-NEXT: v_bfe_u32 v8, v0, 16, 1
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
; GFX11-NEXT: v_bfe_u32 v7, v3, 16, 1
-; GFX11-NEXT: v_and_or_b32 v5, v3, s0, 0x400000
+; GFX11-NEXT: v_or_b32_e32 v5, 0x400000, v3
; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
; GFX11-NEXT: v_add3_u32 v4, v7, v3, 0x7fff
; GFX11-NEXT: v_bfe_u32 v7, v2, 16, 1
@@ -41789,10 +40885,10 @@ define <4 x bfloat> @v_fmuladd_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b, <4 x bfl
; GFX11-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc_lo
; GFX11-NEXT: v_add3_u32 v4, v6, v1, 0x7fff
; GFX11-NEXT: v_add3_u32 v5, v7, v2, 0x7fff
-; GFX11-NEXT: v_and_or_b32 v6, v2, s0, 0x400000
+; GFX11-NEXT: v_or_b32_e32 v6, 0x400000, v2
; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
; GFX11-NEXT: v_add3_u32 v7, v8, v0, 0x7fff
-; GFX11-NEXT: v_and_or_b32 v8, v0, s0, 0x400000
+; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3)
; GFX11-NEXT: v_cndmask_b32_e32 v2, v5, v6, vcc_lo
; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
diff --git a/llvm/test/CodeGen/AMDGPU/fmed3-cast-combine.ll b/llvm/test/CodeGen/AMDGPU/fmed3-cast-combine.ll
index d35871e..99b163d 100644
--- a/llvm/test/CodeGen/AMDGPU/fmed3-cast-combine.ll
+++ b/llvm/test/CodeGen/AMDGPU/fmed3-cast-combine.ll
@@ -790,8 +790,7 @@ define bfloat @fmed3_f32_fpext_f16_fptrunc_bf16(half %arg0, half %arg1, half %ar
; GFX8-NEXT: v_bfe_u32 v1, v0, 16, 1
; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v0
; GFX8-NEXT: v_add_u32_e32 v1, vcc, 0x7fff, v1
-; GFX8-NEXT: v_and_b32_e32 v2, 0xff800000, v0
-; GFX8-NEXT: v_or_b32_e32 v2, 0x400000, v2
+; GFX8-NEXT: v_or_b32_e32 v2, 0x400000, v0
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
; GFX8-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
@@ -806,9 +805,8 @@ define bfloat @fmed3_f32_fpext_f16_fptrunc_bf16(half %arg0, half %arg1, half %ar
; GFX9-NEXT: s_movk_i32 s4, 0x7fff
; GFX9-NEXT: v_med3_f32 v0, v0, v1, v2
; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1
-; GFX9-NEXT: v_and_b32_e32 v2, 0xff800000, v0
; GFX9-NEXT: v_add3_u32 v1, v1, v0, s4
-; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v2
+; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v0
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0
diff --git a/llvm/test/CodeGen/AMDGPU/global-atomics-fp.ll b/llvm/test/CodeGen/AMDGPU/global-atomics-fp.ll
index 9142858..5889de7 100644
--- a/llvm/test/CodeGen/AMDGPU/global-atomics-fp.ll
+++ b/llvm/test/CodeGen/AMDGPU/global-atomics-fp.ll
@@ -1524,9 +1524,8 @@ define amdgpu_kernel void @global_atomic_fadd_ret_bf16_agent(ptr addrspace(1) %p
; GFX900-NEXT: v_lshrrev_b32_sdwa v1, s5, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; GFX900-NEXT: v_add_f32_e32 v1, 4.0, v1
; GFX900-NEXT: v_bfe_u32 v3, v1, 16, 1
-; GFX900-NEXT: v_and_b32_e32 v4, 0xff800000, v1
+; GFX900-NEXT: v_or_b32_e32 v4, 0x400000, v1
; GFX900-NEXT: v_add3_u32 v3, v3, v1, s4
-; GFX900-NEXT: v_or_b32_e32 v4, 0x400000, v4
; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
; GFX900-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc
; GFX900-NEXT: v_lshlrev_b32_sdwa v1, s5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
@@ -1566,9 +1565,8 @@ define amdgpu_kernel void @global_atomic_fadd_ret_bf16_agent(ptr addrspace(1) %p
; GFX908-NEXT: v_lshrrev_b32_sdwa v1, s5, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; GFX908-NEXT: v_add_f32_e32 v1, 4.0, v1
; GFX908-NEXT: v_bfe_u32 v3, v1, 16, 1
-; GFX908-NEXT: v_and_b32_e32 v4, 0xff800000, v1
+; GFX908-NEXT: v_or_b32_e32 v4, 0x400000, v1
; GFX908-NEXT: v_add3_u32 v3, v3, v1, s4
-; GFX908-NEXT: v_or_b32_e32 v4, 0x400000, v4
; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
; GFX908-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc
; GFX908-NEXT: v_lshlrev_b32_sdwa v1, s5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
@@ -1608,9 +1606,8 @@ define amdgpu_kernel void @global_atomic_fadd_ret_bf16_agent(ptr addrspace(1) %p
; GFX90A-NEXT: v_lshrrev_b32_sdwa v1, s5, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; GFX90A-NEXT: v_add_f32_e32 v1, 4.0, v1
; GFX90A-NEXT: v_bfe_u32 v2, v1, 16, 1
-; GFX90A-NEXT: v_and_b32_e32 v4, 0xff800000, v1
+; GFX90A-NEXT: v_or_b32_e32 v4, 0x400000, v1
; GFX90A-NEXT: v_add3_u32 v2, v2, v1, s4
-; GFX90A-NEXT: v_or_b32_e32 v4, 0x400000, v4
; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
; GFX90A-NEXT: v_cndmask_b32_e32 v1, v2, v4, vcc
; GFX90A-NEXT: v_lshlrev_b32_sdwa v1, s5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
@@ -1632,7 +1629,6 @@ define amdgpu_kernel void @global_atomic_fadd_ret_bf16_agent(ptr addrspace(1) %p
; GFX10: ; %bb.0:
; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX10-NEXT: v_mov_b32_e32 v0, 0
-; GFX10-NEXT: s_mov_b32 s5, 0xff800000
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: s_and_b32 s0, s2, -4
; GFX10-NEXT: s_mov_b32 s1, s3
@@ -1650,7 +1646,7 @@ define amdgpu_kernel void @global_atomic_fadd_ret_bf16_agent(ptr addrspace(1) %p
; GFX10-NEXT: v_lshrrev_b32_sdwa v1, s2, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; GFX10-NEXT: v_add_f32_e32 v1, 4.0, v1
; GFX10-NEXT: v_bfe_u32 v3, v1, 16, 1
-; GFX10-NEXT: v_and_or_b32 v4, v1, s5, 0x400000
+; GFX10-NEXT: v_or_b32_e32 v4, 0x400000, v1
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
; GFX10-NEXT: v_add3_u32 v3, v3, v1, 0x7fff
; GFX10-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc_lo
@@ -1673,7 +1669,6 @@ define amdgpu_kernel void @global_atomic_fadd_ret_bf16_agent(ptr addrspace(1) %p
; GFX11-LABEL: global_atomic_fadd_ret_bf16_agent:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
-; GFX11-NEXT: s_mov_b32 s5, 0xff800000
; GFX11-NEXT: v_mov_b32_e32 v0, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_and_b32 s0, s2, -4
@@ -1694,7 +1689,7 @@ define amdgpu_kernel void @global_atomic_fadd_ret_bf16_agent(ptr addrspace(1) %p
; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX11-NEXT: v_add_f32_e32 v1, 4.0, v1
; GFX11-NEXT: v_bfe_u32 v3, v1, 16, 1
-; GFX11-NEXT: v_and_or_b32 v4, v1, s5, 0x400000
+; GFX11-NEXT: v_or_b32_e32 v4, 0x400000, v1
; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
; GFX11-NEXT: v_add3_u32 v3, v3, v1, 0x7fff
; GFX11-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc_lo
@@ -1744,9 +1739,8 @@ define amdgpu_kernel void @global_atomic_fadd_ret_bf16_system(ptr addrspace(1) %
; GFX900-NEXT: v_lshrrev_b32_sdwa v1, s5, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; GFX900-NEXT: v_add_f32_e32 v1, 4.0, v1
; GFX900-NEXT: v_bfe_u32 v3, v1, 16, 1
-; GFX900-NEXT: v_and_b32_e32 v4, 0xff800000, v1
+; GFX900-NEXT: v_or_b32_e32 v4, 0x400000, v1
; GFX900-NEXT: v_add3_u32 v3, v3, v1, s4
-; GFX900-NEXT: v_or_b32_e32 v4, 0x400000, v4
; GFX900-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
; GFX900-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc
; GFX900-NEXT: v_lshlrev_b32_sdwa v1, s5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
@@ -1786,9 +1780,8 @@ define amdgpu_kernel void @global_atomic_fadd_ret_bf16_system(ptr addrspace(1) %
; GFX908-NEXT: v_lshrrev_b32_sdwa v1, s5, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; GFX908-NEXT: v_add_f32_e32 v1, 4.0, v1
; GFX908-NEXT: v_bfe_u32 v3, v1, 16, 1
-; GFX908-NEXT: v_and_b32_e32 v4, 0xff800000, v1
+; GFX908-NEXT: v_or_b32_e32 v4, 0x400000, v1
; GFX908-NEXT: v_add3_u32 v3, v3, v1, s4
-; GFX908-NEXT: v_or_b32_e32 v4, 0x400000, v4
; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
; GFX908-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc
; GFX908-NEXT: v_lshlrev_b32_sdwa v1, s5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
@@ -1828,9 +1821,8 @@ define amdgpu_kernel void @global_atomic_fadd_ret_bf16_system(ptr addrspace(1) %
; GFX90A-NEXT: v_lshrrev_b32_sdwa v1, s5, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; GFX90A-NEXT: v_add_f32_e32 v1, 4.0, v1
; GFX90A-NEXT: v_bfe_u32 v2, v1, 16, 1
-; GFX90A-NEXT: v_and_b32_e32 v4, 0xff800000, v1
+; GFX90A-NEXT: v_or_b32_e32 v4, 0x400000, v1
; GFX90A-NEXT: v_add3_u32 v2, v2, v1, s4
-; GFX90A-NEXT: v_or_b32_e32 v4, 0x400000, v4
; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
; GFX90A-NEXT: v_cndmask_b32_e32 v1, v2, v4, vcc
; GFX90A-NEXT: v_lshlrev_b32_sdwa v1, s5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
@@ -1854,7 +1846,6 @@ define amdgpu_kernel void @global_atomic_fadd_ret_bf16_system(ptr addrspace(1) %
; GFX10: ; %bb.0:
; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX10-NEXT: v_mov_b32_e32 v0, 0
-; GFX10-NEXT: s_mov_b32 s5, 0xff800000
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: s_and_b32 s0, s2, -4
; GFX10-NEXT: s_mov_b32 s1, s3
@@ -1872,7 +1863,7 @@ define amdgpu_kernel void @global_atomic_fadd_ret_bf16_system(ptr addrspace(1) %
; GFX10-NEXT: v_lshrrev_b32_sdwa v1, s2, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; GFX10-NEXT: v_add_f32_e32 v1, 4.0, v1
; GFX10-NEXT: v_bfe_u32 v3, v1, 16, 1
-; GFX10-NEXT: v_and_or_b32 v4, v1, s5, 0x400000
+; GFX10-NEXT: v_or_b32_e32 v4, 0x400000, v1
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
; GFX10-NEXT: v_add3_u32 v3, v3, v1, 0x7fff
; GFX10-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc_lo
@@ -1895,7 +1886,6 @@ define amdgpu_kernel void @global_atomic_fadd_ret_bf16_system(ptr addrspace(1) %
; GFX11-LABEL: global_atomic_fadd_ret_bf16_system:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
-; GFX11-NEXT: s_mov_b32 s5, 0xff800000
; GFX11-NEXT: v_mov_b32_e32 v0, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_and_b32 s0, s2, -4
@@ -1916,7 +1906,7 @@ define amdgpu_kernel void @global_atomic_fadd_ret_bf16_system(ptr addrspace(1) %
; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX11-NEXT: v_add_f32_e32 v1, 4.0, v1
; GFX11-NEXT: v_bfe_u32 v3, v1, 16, 1
-; GFX11-NEXT: v_and_or_b32 v4, v1, s5, 0x400000
+; GFX11-NEXT: v_or_b32_e32 v4, 0x400000, v1
; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
; GFX11-NEXT: v_add3_u32 v3, v3, v1, 0x7fff
; GFX11-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc_lo
diff --git a/llvm/test/CodeGen/AMDGPU/isel-amdgpu-cs-chain-preserve-cc.ll b/llvm/test/CodeGen/AMDGPU/isel-amdgpu-cs-chain-preserve-cc.ll
index 6a7fb71..ba946fe 100644
--- a/llvm/test/CodeGen/AMDGPU/isel-amdgpu-cs-chain-preserve-cc.ll
+++ b/llvm/test/CodeGen/AMDGPU/isel-amdgpu-cs-chain-preserve-cc.ll
@@ -912,10 +912,9 @@ define amdgpu_cs_chain_preserve void @amdgpu_cs_chain_cc_bfloat(bfloat inreg %a,
; DAGISEL-GFX11-WF32-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 32767
; DAGISEL-GFX11-WF32-NEXT: [[V_ADD3_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD3_U32_e64 killed [[V_BFE_U32_e64_]], [[V_ADD_F32_e64_]], killed [[S_MOV_B32_]], implicit $exec
; DAGISEL-GFX11-WF32-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 4194304
- ; DAGISEL-GFX11-WF32-NEXT: [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 -8388608
- ; DAGISEL-GFX11-WF32-NEXT: [[V_AND_OR_B32_e64_:%[0-9]+]]:vgpr_32 = V_AND_OR_B32_e64 [[V_ADD_F32_e64_]], killed [[S_MOV_B32_2]], killed [[S_MOV_B32_1]], implicit $exec
+ ; DAGISEL-GFX11-WF32-NEXT: [[V_OR_B32_e64_:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[V_ADD_F32_e64_]], killed [[S_MOV_B32_1]], implicit $exec
; DAGISEL-GFX11-WF32-NEXT: [[V_CMP_U_F32_e64_:%[0-9]+]]:sreg_32_xm0_xexec = nofpexcept V_CMP_U_F32_e64 0, [[V_ADD_F32_e64_]], 0, [[V_ADD_F32_e64_]], 0, implicit $mode, implicit $exec
- ; DAGISEL-GFX11-WF32-NEXT: [[V_CNDMASK_B32_e64_:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, killed [[V_ADD3_U32_e64_]], 0, killed [[V_AND_OR_B32_e64_]], killed [[V_CMP_U_F32_e64_]], implicit $exec
+ ; DAGISEL-GFX11-WF32-NEXT: [[V_CNDMASK_B32_e64_:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, killed [[V_ADD3_U32_e64_]], 0, killed [[V_OR_B32_e64_]], killed [[V_CMP_U_F32_e64_]], implicit $exec
; DAGISEL-GFX11-WF32-NEXT: [[DEF:%[0-9]+]]:sreg_64 = IMPLICIT_DEF
; DAGISEL-GFX11-WF32-NEXT: [[COPY2:%[0-9]+]]:vreg_64 = COPY [[DEF]]
; DAGISEL-GFX11-WF32-NEXT: FLAT_STORE_SHORT_D16_HI killed [[COPY2]], killed [[V_CNDMASK_B32_e64_]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s16) into `ptr poison`)
@@ -934,10 +933,9 @@ define amdgpu_cs_chain_preserve void @amdgpu_cs_chain_cc_bfloat(bfloat inreg %a,
; DAGISEL-GFX11-WF64-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 32767
; DAGISEL-GFX11-WF64-NEXT: [[V_ADD3_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD3_U32_e64 killed [[V_BFE_U32_e64_]], [[V_ADD_F32_e64_]], killed [[S_MOV_B32_]], implicit $exec
; DAGISEL-GFX11-WF64-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 4194304
- ; DAGISEL-GFX11-WF64-NEXT: [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 -8388608
- ; DAGISEL-GFX11-WF64-NEXT: [[V_AND_OR_B32_e64_:%[0-9]+]]:vgpr_32 = V_AND_OR_B32_e64 [[V_ADD_F32_e64_]], killed [[S_MOV_B32_2]], killed [[S_MOV_B32_1]], implicit $exec
+ ; DAGISEL-GFX11-WF64-NEXT: [[V_OR_B32_e64_:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[V_ADD_F32_e64_]], killed [[S_MOV_B32_1]], implicit $exec
; DAGISEL-GFX11-WF64-NEXT: [[V_CMP_U_F32_e64_:%[0-9]+]]:sreg_64_xexec = nofpexcept V_CMP_U_F32_e64 0, [[V_ADD_F32_e64_]], 0, [[V_ADD_F32_e64_]], 0, implicit $mode, implicit $exec
- ; DAGISEL-GFX11-WF64-NEXT: [[V_CNDMASK_B32_e64_:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, killed [[V_ADD3_U32_e64_]], 0, killed [[V_AND_OR_B32_e64_]], killed [[V_CMP_U_F32_e64_]], implicit $exec
+ ; DAGISEL-GFX11-WF64-NEXT: [[V_CNDMASK_B32_e64_:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, killed [[V_ADD3_U32_e64_]], 0, killed [[V_OR_B32_e64_]], killed [[V_CMP_U_F32_e64_]], implicit $exec
; DAGISEL-GFX11-WF64-NEXT: [[DEF:%[0-9]+]]:sreg_64 = IMPLICIT_DEF
; DAGISEL-GFX11-WF64-NEXT: [[COPY2:%[0-9]+]]:vreg_64 = COPY [[DEF]]
; DAGISEL-GFX11-WF64-NEXT: FLAT_STORE_SHORT_D16_HI killed [[COPY2]], killed [[V_CNDMASK_B32_e64_]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s16) into `ptr poison`)
@@ -956,10 +954,9 @@ define amdgpu_cs_chain_preserve void @amdgpu_cs_chain_cc_bfloat(bfloat inreg %a,
; DAGISEL-GFX10-WF32-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 32767
; DAGISEL-GFX10-WF32-NEXT: [[V_ADD3_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD3_U32_e64 killed [[V_BFE_U32_e64_]], [[V_ADD_F32_e64_]], killed [[S_MOV_B32_]], implicit $exec
; DAGISEL-GFX10-WF32-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 4194304
- ; DAGISEL-GFX10-WF32-NEXT: [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 -8388608
- ; DAGISEL-GFX10-WF32-NEXT: [[V_AND_OR_B32_e64_:%[0-9]+]]:vgpr_32 = V_AND_OR_B32_e64 [[V_ADD_F32_e64_]], killed [[S_MOV_B32_2]], killed [[S_MOV_B32_1]], implicit $exec
+ ; DAGISEL-GFX10-WF32-NEXT: [[V_OR_B32_e64_:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[V_ADD_F32_e64_]], killed [[S_MOV_B32_1]], implicit $exec
; DAGISEL-GFX10-WF32-NEXT: [[V_CMP_U_F32_e64_:%[0-9]+]]:sreg_32_xm0_xexec = nofpexcept V_CMP_U_F32_e64 0, [[V_ADD_F32_e64_]], 0, [[V_ADD_F32_e64_]], 0, implicit $mode, implicit $exec
- ; DAGISEL-GFX10-WF32-NEXT: [[V_CNDMASK_B32_e64_:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, killed [[V_ADD3_U32_e64_]], 0, killed [[V_AND_OR_B32_e64_]], killed [[V_CMP_U_F32_e64_]], implicit $exec
+ ; DAGISEL-GFX10-WF32-NEXT: [[V_CNDMASK_B32_e64_:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, killed [[V_ADD3_U32_e64_]], 0, killed [[V_OR_B32_e64_]], killed [[V_CMP_U_F32_e64_]], implicit $exec
; DAGISEL-GFX10-WF32-NEXT: [[DEF:%[0-9]+]]:sreg_64 = IMPLICIT_DEF
; DAGISEL-GFX10-WF32-NEXT: [[COPY2:%[0-9]+]]:vreg_64 = COPY [[DEF]]
; DAGISEL-GFX10-WF32-NEXT: FLAT_STORE_SHORT_D16_HI killed [[COPY2]], killed [[V_CNDMASK_B32_e64_]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s16) into `ptr poison`)
@@ -978,10 +975,9 @@ define amdgpu_cs_chain_preserve void @amdgpu_cs_chain_cc_bfloat(bfloat inreg %a,
; DAGISEL-GFX10-WF64-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 32767
; DAGISEL-GFX10-WF64-NEXT: [[V_ADD3_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD3_U32_e64 killed [[V_BFE_U32_e64_]], [[V_ADD_F32_e64_]], killed [[S_MOV_B32_]], implicit $exec
; DAGISEL-GFX10-WF64-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 4194304
- ; DAGISEL-GFX10-WF64-NEXT: [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 -8388608
- ; DAGISEL-GFX10-WF64-NEXT: [[V_AND_OR_B32_e64_:%[0-9]+]]:vgpr_32 = V_AND_OR_B32_e64 [[V_ADD_F32_e64_]], killed [[S_MOV_B32_2]], killed [[S_MOV_B32_1]], implicit $exec
+ ; DAGISEL-GFX10-WF64-NEXT: [[V_OR_B32_e64_:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[V_ADD_F32_e64_]], killed [[S_MOV_B32_1]], implicit $exec
; DAGISEL-GFX10-WF64-NEXT: [[V_CMP_U_F32_e64_:%[0-9]+]]:sreg_64_xexec = nofpexcept V_CMP_U_F32_e64 0, [[V_ADD_F32_e64_]], 0, [[V_ADD_F32_e64_]], 0, implicit $mode, implicit $exec
- ; DAGISEL-GFX10-WF64-NEXT: [[V_CNDMASK_B32_e64_:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, killed [[V_ADD3_U32_e64_]], 0, killed [[V_AND_OR_B32_e64_]], killed [[V_CMP_U_F32_e64_]], implicit $exec
+ ; DAGISEL-GFX10-WF64-NEXT: [[V_CNDMASK_B32_e64_:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, killed [[V_ADD3_U32_e64_]], 0, killed [[V_OR_B32_e64_]], killed [[V_CMP_U_F32_e64_]], implicit $exec
; DAGISEL-GFX10-WF64-NEXT: [[DEF:%[0-9]+]]:sreg_64 = IMPLICIT_DEF
; DAGISEL-GFX10-WF64-NEXT: [[COPY2:%[0-9]+]]:vreg_64 = COPY [[DEF]]
; DAGISEL-GFX10-WF64-NEXT: FLAT_STORE_SHORT_D16_HI killed [[COPY2]], killed [[V_CNDMASK_B32_e64_]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s16) into `ptr poison`)
diff --git a/llvm/test/CodeGen/AMDGPU/local-atomics-fp.ll b/llvm/test/CodeGen/AMDGPU/local-atomics-fp.ll
index 48ae98f..5e76dfd 100644
--- a/llvm/test/CodeGen/AMDGPU/local-atomics-fp.ll
+++ b/llvm/test/CodeGen/AMDGPU/local-atomics-fp.ll
@@ -1413,9 +1413,8 @@ define bfloat @lds_atomic_fadd_ret_bf16(ptr addrspace(3) %ptr) nounwind {
; VI-NEXT: v_add_f32_e32 v3, 4.0, v3
; VI-NEXT: v_bfe_u32 v6, v3, 16, 1
; VI-NEXT: v_add_u32_e32 v6, vcc, v6, v3
-; VI-NEXT: v_and_b32_e32 v7, 0xff800000, v3
; VI-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6
-; VI-NEXT: v_or_b32_e32 v7, 0x400000, v7
+; VI-NEXT: v_or_b32_e32 v7, 0x400000, v3
; VI-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
; VI-NEXT: v_cndmask_b32_e32 v3, v6, v7, vcc
; VI-NEXT: v_and_b32_e32 v5, v4, v2
@@ -1451,9 +1450,8 @@ define bfloat @lds_atomic_fadd_ret_bf16(ptr addrspace(3) %ptr) nounwind {
; GFX9-NEXT: v_lshrrev_b32_sdwa v3, v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; GFX9-NEXT: v_add_f32_e32 v3, 4.0, v3
; GFX9-NEXT: v_bfe_u32 v5, v3, 16, 1
-; GFX9-NEXT: v_and_b32_e32 v6, 0xff800000, v3
+; GFX9-NEXT: v_or_b32_e32 v6, 0x400000, v3
; GFX9-NEXT: v_add3_u32 v5, v5, v3, s6
-; GFX9-NEXT: v_or_b32_e32 v6, 0x400000, v6
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
; GFX9-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc
; GFX9-NEXT: v_lshlrev_b32_sdwa v3, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
@@ -1560,9 +1558,8 @@ define void @lds_atomic_fadd_noret_bf16(ptr addrspace(3) %ptr) nounwind {
; VI-NEXT: v_add_f32_e32 v4, 4.0, v4
; VI-NEXT: v_bfe_u32 v6, v4, 16, 1
; VI-NEXT: v_add_u32_e32 v6, vcc, v6, v4
-; VI-NEXT: v_and_b32_e32 v7, 0xff800000, v4
; VI-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6
-; VI-NEXT: v_or_b32_e32 v7, 0x400000, v7
+; VI-NEXT: v_or_b32_e32 v7, 0x400000, v4
; VI-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
; VI-NEXT: v_cndmask_b32_e32 v4, v6, v7, vcc
; VI-NEXT: v_and_b32_e32 v5, v3, v2
@@ -1597,9 +1594,8 @@ define void @lds_atomic_fadd_noret_bf16(ptr addrspace(3) %ptr) nounwind {
; GFX9-NEXT: v_lshrrev_b32_sdwa v4, v0, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; GFX9-NEXT: v_add_f32_e32 v4, 4.0, v4
; GFX9-NEXT: v_bfe_u32 v5, v4, 16, 1
-; GFX9-NEXT: v_and_b32_e32 v6, 0xff800000, v4
+; GFX9-NEXT: v_or_b32_e32 v6, 0x400000, v4
; GFX9-NEXT: v_add3_u32 v5, v5, v4, s6
-; GFX9-NEXT: v_or_b32_e32 v6, 0x400000, v6
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
; GFX9-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc
; GFX9-NEXT: v_lshlrev_b32_sdwa v4, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
diff --git a/llvm/test/CodeGen/AMDGPU/vector_shuffle.packed.ll b/llvm/test/CodeGen/AMDGPU/vector_shuffle.packed.ll
index 672c93b..66c49ba 100644
--- a/llvm/test/CodeGen/AMDGPU/vector_shuffle.packed.ll
+++ b/llvm/test/CodeGen/AMDGPU/vector_shuffle.packed.ll
@@ -4259,65 +4259,57 @@ define amdgpu_kernel void @fma_shuffle_v2bf16(ptr addrspace(1) nocapture readonl
; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; GFX9-NEXT: v_fma_f32 v7, v8, v9, v7
; GFX9-NEXT: v_fma_f32 v1, v8, v5, v1
-; GFX9-NEXT: v_fma_f32 v8, v12, v9, v11
; GFX9-NEXT: v_fma_f32 v2, v12, v5, v2
; GFX9-NEXT: v_bfe_u32 v5, v7, 16, 1
-; GFX9-NEXT: v_and_b32_e32 v9, 0xff800000, v7
+; GFX9-NEXT: v_fma_f32 v8, v12, v9, v11
+; GFX9-NEXT: v_or_b32_e32 v9, 0x400000, v7
; GFX9-NEXT: v_bfe_u32 v11, v1, 16, 1
-; GFX9-NEXT: v_and_b32_e32 v12, 0xff800000, v1
; GFX9-NEXT: v_add3_u32 v5, v5, v7, s2
-; GFX9-NEXT: v_or_b32_e32 v9, 0x400000, v9
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v7, v7
+; GFX9-NEXT: v_or_b32_e32 v12, 0x400000, v1
; GFX9-NEXT: v_bfe_u32 v13, v8, 16, 1
-; GFX9-NEXT: v_and_b32_e32 v14, 0xff800000, v8
; GFX9-NEXT: v_add3_u32 v11, v11, v1, s2
-; GFX9-NEXT: v_or_b32_e32 v12, 0x400000, v12
; GFX9-NEXT: v_cndmask_b32_e32 v5, v5, v9, vcc
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
+; GFX9-NEXT: v_or_b32_e32 v14, 0x400000, v8
; GFX9-NEXT: v_bfe_u32 v15, v2, 16, 1
-; GFX9-NEXT: v_and_b32_e32 v16, 0xff800000, v2
; GFX9-NEXT: v_add3_u32 v13, v13, v8, s2
-; GFX9-NEXT: v_or_b32_e32 v14, 0x400000, v14
; GFX9-NEXT: v_cndmask_b32_e32 v1, v11, v12, vcc
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v8, v8
+; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; GFX9-NEXT: v_lshlrev_b32_e32 v10, 16, v6
+; GFX9-NEXT: v_or_b32_e32 v16, 0x400000, v2
; GFX9-NEXT: v_add3_u32 v15, v15, v2, s2
-; GFX9-NEXT: v_or_b32_e32 v16, 0x400000, v16
; GFX9-NEXT: v_cndmask_b32_e32 v7, v13, v14, vcc
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
-; GFX9-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
-; GFX9-NEXT: v_lshlrev_b32_e32 v10, 16, v6
-; GFX9-NEXT: v_cndmask_b32_e32 v2, v15, v16, vcc
; GFX9-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX9-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
-; GFX9-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
+; GFX9-NEXT: v_cndmask_b32_e32 v2, v15, v16, vcc
; GFX9-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
+; GFX9-NEXT: v_fma_f32 v1, v3, v10, v1
+; GFX9-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX9-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
-; GFX9-NEXT: v_fma_f32 v1, v3, v10, v1
; GFX9-NEXT: v_fma_f32 v3, v3, v6, v5
+; GFX9-NEXT: v_bfe_u32 v5, v1, 16, 1
; GFX9-NEXT: v_fma_f32 v2, v4, v10, v2
; GFX9-NEXT: v_fma_f32 v4, v4, v6, v7
-; GFX9-NEXT: v_bfe_u32 v5, v1, 16, 1
-; GFX9-NEXT: v_and_b32_e32 v6, 0xff800000, v1
+; GFX9-NEXT: v_or_b32_e32 v6, 0x400000, v1
; GFX9-NEXT: v_bfe_u32 v7, v3, 16, 1
-; GFX9-NEXT: v_and_b32_e32 v8, 0xff800000, v3
; GFX9-NEXT: v_add3_u32 v5, v5, v1, s2
-; GFX9-NEXT: v_or_b32_e32 v6, 0x400000, v6
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
+; GFX9-NEXT: v_or_b32_e32 v8, 0x400000, v3
; GFX9-NEXT: v_bfe_u32 v9, v2, 16, 1
-; GFX9-NEXT: v_and_b32_e32 v10, 0xff800000, v2
; GFX9-NEXT: v_add3_u32 v7, v7, v3, s2
-; GFX9-NEXT: v_or_b32_e32 v8, 0x400000, v8
; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v6, vcc
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
+; GFX9-NEXT: v_or_b32_e32 v10, 0x400000, v2
; GFX9-NEXT: v_bfe_u32 v11, v4, 16, 1
-; GFX9-NEXT: v_and_b32_e32 v12, 0xff800000, v4
; GFX9-NEXT: v_add3_u32 v9, v9, v2, s2
-; GFX9-NEXT: v_or_b32_e32 v10, 0x400000, v10
; GFX9-NEXT: v_cndmask_b32_e32 v3, v7, v8, vcc
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
+; GFX9-NEXT: v_or_b32_e32 v12, 0x400000, v4
; GFX9-NEXT: v_add3_u32 v11, v11, v4, s2
-; GFX9-NEXT: v_or_b32_e32 v12, 0x400000, v12
; GFX9-NEXT: v_cndmask_b32_e32 v2, v9, v10, vcc
; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
; GFX9-NEXT: v_cndmask_b32_e32 v4, v11, v12, vcc
@@ -4332,7 +4324,6 @@ define amdgpu_kernel void @fma_shuffle_v2bf16(ptr addrspace(1) nocapture readonl
; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x10
; GFX10-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x0
; GFX10-NEXT: v_lshlrev_b32_e32 v6, 3, v0
-; GFX10-NEXT: s_mov_b32 s2, 0xff800000
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: s_clause 0x2
; GFX10-NEXT: global_load_dwordx2 v[0:1], v6, s[0:1]
@@ -4355,20 +4346,20 @@ define amdgpu_kernel void @fma_shuffle_v2bf16(ptr addrspace(1) nocapture readonl
; GFX10-NEXT: v_fmac_f32_e32 v11, v12, v9
; GFX10-NEXT: v_fmac_f32_e32 v1, v12, v4
; GFX10-NEXT: v_bfe_u32 v4, v7, 16, 1
-; GFX10-NEXT: v_and_or_b32 v8, v7, s2, 0x400000
+; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v7
; GFX10-NEXT: v_bfe_u32 v9, v0, 16, 1
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v7, v7
-; GFX10-NEXT: v_and_or_b32 v12, v0, s2, 0x400000
+; GFX10-NEXT: v_or_b32_e32 v12, 0x400000, v0
; GFX10-NEXT: v_add3_u32 v4, v4, v7, 0x7fff
; GFX10-NEXT: v_bfe_u32 v15, v1, 16, 1
; GFX10-NEXT: v_add3_u32 v9, v9, v0, 0x7fff
; GFX10-NEXT: v_bfe_u32 v13, v11, 16, 1
-; GFX10-NEXT: v_and_or_b32 v16, v1, s2, 0x400000
+; GFX10-NEXT: v_or_b32_e32 v16, 0x400000, v1
; GFX10-NEXT: v_cndmask_b32_e32 v4, v4, v8, vcc_lo
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
; GFX10-NEXT: v_add3_u32 v15, v15, v1, 0x7fff
; GFX10-NEXT: v_lshlrev_b32_e32 v10, 16, v5
-; GFX10-NEXT: v_and_or_b32 v14, v11, s2, 0x400000
+; GFX10-NEXT: v_or_b32_e32 v14, 0x400000, v11
; GFX10-NEXT: v_add3_u32 v13, v13, v11, 0x7fff
; GFX10-NEXT: v_cndmask_b32_e32 v0, v9, v12, vcc_lo
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
@@ -4382,7 +4373,7 @@ define amdgpu_kernel void @fma_shuffle_v2bf16(ptr addrspace(1) nocapture readonl
; GFX10-NEXT: v_fmac_f32_e32 v0, v2, v10
; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX10-NEXT: v_cndmask_b32_e32 v7, v13, v14, vcc_lo
-; GFX10-NEXT: v_and_or_b32 v8, v4, s2, 0x400000
+; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v4
; GFX10-NEXT: v_bfe_u32 v2, v0, 16, 1
; GFX10-NEXT: v_fmac_f32_e32 v1, v3, v10
; GFX10-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
@@ -4390,14 +4381,14 @@ define amdgpu_kernel void @fma_shuffle_v2bf16(ptr addrspace(1) nocapture readonl
; GFX10-NEXT: v_add3_u32 v2, v2, v0, 0x7fff
; GFX10-NEXT: v_bfe_u32 v9, v1, 16, 1
; GFX10-NEXT: v_fmac_f32_e32 v7, v3, v5
-; GFX10-NEXT: v_and_or_b32 v3, v0, s2, 0x400000
-; GFX10-NEXT: v_and_or_b32 v10, v1, s2, 0x400000
+; GFX10-NEXT: v_or_b32_e32 v3, 0x400000, v0
+; GFX10-NEXT: v_or_b32_e32 v10, 0x400000, v1
; GFX10-NEXT: v_bfe_u32 v5, v4, 16, 1
; GFX10-NEXT: v_add3_u32 v9, v9, v1, 0x7fff
; GFX10-NEXT: v_bfe_u32 v11, v7, 16, 1
; GFX10-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc_lo
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
-; GFX10-NEXT: v_and_or_b32 v12, v7, s2, 0x400000
+; GFX10-NEXT: v_or_b32_e32 v12, 0x400000, v7
; GFX10-NEXT: v_add3_u32 v5, v5, v4, 0x7fff
; GFX10-NEXT: v_add3_u32 v11, v11, v7, 0x7fff
; GFX10-NEXT: v_cndmask_b32_e32 v1, v9, v10, vcc_lo
@@ -4416,7 +4407,6 @@ define amdgpu_kernel void @fma_shuffle_v2bf16(ptr addrspace(1) nocapture readonl
; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x10
; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x0
; GFX11-NEXT: v_lshlrev_b32_e32 v6, 3, v0
-; GFX11-NEXT: s_mov_b32 s0, 0xff800000
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_clause 0x2
; GFX11-NEXT: global_load_b64 v[0:1], v6, s[2:3]
@@ -4438,11 +4428,11 @@ define amdgpu_kernel void @fma_shuffle_v2bf16(ptr addrspace(1) nocapture readonl
; GFX11-NEXT: v_and_b32_e32 v7, 0xffff0000, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
; GFX11-NEXT: v_bfe_u32 v13, v11, 16, 1
-; GFX11-NEXT: v_and_or_b32 v14, v11, s0, 0x400000
+; GFX11-NEXT: v_or_b32_e32 v14, 0x400000, v11
; GFX11-NEXT: v_bfe_u32 v15, v1, 16, 1
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4)
; GFX11-NEXT: v_fmac_f32_e32 v7, v8, v9
-; GFX11-NEXT: v_and_or_b32 v16, v1, s0, 0x400000
+; GFX11-NEXT: v_or_b32_e32 v16, 0x400000, v1
; GFX11-NEXT: v_add3_u32 v13, v13, v11, 0x7fff
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX11-NEXT: v_add3_u32 v15, v15, v1, 0x7fff
@@ -4450,11 +4440,11 @@ define amdgpu_kernel void @fma_shuffle_v2bf16(ptr addrspace(1) nocapture readonl
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3)
; GFX11-NEXT: v_fmac_f32_e32 v0, v8, v4
; GFX11-NEXT: v_bfe_u32 v4, v7, 16, 1
-; GFX11-NEXT: v_and_or_b32 v8, v7, s0, 0x400000
+; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v7
; GFX11-NEXT: v_bfe_u32 v9, v0, 16, 1
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
; GFX11-NEXT: v_add3_u32 v4, v4, v7, 0x7fff
-; GFX11-NEXT: v_and_or_b32 v12, v0, s0, 0x400000
+; GFX11-NEXT: v_or_b32_e32 v12, 0x400000, v0
; GFX11-NEXT: v_add3_u32 v9, v9, v0, 0x7fff
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_cndmask_b32_e32 v4, v4, v8, vcc_lo
@@ -4466,7 +4456,7 @@ define amdgpu_kernel void @fma_shuffle_v2bf16(ptr addrspace(1) nocapture readonl
; GFX11-NEXT: v_dual_fmac_f32 v4, v2, v5 :: v_dual_cndmask_b32 v1, v15, v16
; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v11, v11
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_and_or_b32 v8, v4, s0, 0x400000
+; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v4
; GFX11-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX11-NEXT: v_cndmask_b32_e32 v7, v13, v14, vcc_lo
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
@@ -4480,14 +4470,14 @@ define amdgpu_kernel void @fma_shuffle_v2bf16(ptr addrspace(1) nocapture readonl
; GFX11-NEXT: v_add3_u32 v9, v9, v1, 0x7fff
; GFX11-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX11-NEXT: v_bfe_u32 v11, v7, 16, 1
-; GFX11-NEXT: v_and_or_b32 v12, v7, s0, 0x400000
+; GFX11-NEXT: v_or_b32_e32 v12, 0x400000, v7
; GFX11-NEXT: v_add3_u32 v5, v5, v4, 0x7fff
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_3)
; GFX11-NEXT: v_fmac_f32_e32 v0, v2, v10
-; GFX11-NEXT: v_and_or_b32 v10, v1, s0, 0x400000
+; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v1
; GFX11-NEXT: v_add3_u32 v11, v11, v7, 0x7fff
; GFX11-NEXT: v_bfe_u32 v2, v0, 16, 1
-; GFX11-NEXT: v_and_or_b32 v3, v0, s0, 0x400000
+; GFX11-NEXT: v_or_b32_e32 v3, 0x400000, v0
; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_add3_u32 v2, v2, v0, 0x7fff
diff --git a/llvm/test/CodeGen/Hexagon/atomicrmw-uinc-udec-wrap.ll b/llvm/test/CodeGen/Hexagon/atomicrmw-uinc-udec-wrap.ll
index 9d7570b..d51c955 100644
--- a/llvm/test/CodeGen/Hexagon/atomicrmw-uinc-udec-wrap.ll
+++ b/llvm/test/CodeGen/Hexagon/atomicrmw-uinc-udec-wrap.ll
@@ -160,10 +160,8 @@ define i64 @atomicrmw_uinc_wrap_i64(ptr %ptr, i64 %val) {
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: p0 = cmp.gtu(r3:2,r5:4)
-; CHECK-NEXT: }
-; CHECK-NEXT: {
-; CHECK-NEXT: r8 = mux(p0,r8,r1)
-; CHECK-NEXT: r9 = mux(p0,r9,r1)
+; CHECK-NEXT: if (!p0.new) r8 = add(r1,#0)
+; CHECK-NEXT: if (!p0.new) r9 = add(r1,#0)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: memd_locked(r0,p0) = r9:8
diff --git a/llvm/test/CodeGen/Hexagon/bit-store-upper-sub-hi.mir b/llvm/test/CodeGen/Hexagon/bit-store-upper-sub-hi.mir
new file mode 100644
index 0000000..ef84043
--- /dev/null
+++ b/llvm/test/CodeGen/Hexagon/bit-store-upper-sub-hi.mir
@@ -0,0 +1,21 @@
+# RUN: llc -march=hexagon -run-pass=hexagon-bit-simplify -o - %s | FileCheck %s
+
+# This test checks that the HexagonBitSimplify pass correctly replaces an
+# S2_storerh_io with an S2_storerf_io that stores the upper halfword
+# of a high subregister, using appropriate subregister boundaries.
+
+# CHECK: S2_storerf_io %0, 28, %{{[0-9]+}}.isub_hi
+# CHECK-NOT: S2_storerf_io %0, 28, %{{[0-9]+}}.isub_lo
+
+---
+name: test_store
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $r0
+ %0:intregs = COPY $r0
+ %1:doubleregs = IMPLICIT_DEF
+ %2:doubleregs = IMPLICIT_DEF
+ %3:doubleregs = S2_shuffoh %2, %1
+ S2_storerh_io %0, 28, %3.isub_hi
+...
diff --git a/llvm/test/CodeGen/Hexagon/isel/select-vec.ll b/llvm/test/CodeGen/Hexagon/isel/select-vec.ll
index 4e54aa4..7073c1a 100644
--- a/llvm/test/CodeGen/Hexagon/isel/select-vec.ll
+++ b/llvm/test/CodeGen/Hexagon/isel/select-vec.ll
@@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -march=hexagon < %s | FileCheck %s
+; RUN: llc -march=hexagon -hexagon-expand-condsets=0 < %s | FileCheck %s
define <4 x i8> @f0(<4 x i8> %a0, <4 x i8> %a1, i32 %a2) #0 {
; CHECK-LABEL: f0:
diff --git a/llvm/test/CodeGen/Hexagon/load-const-extend-opt.ll b/llvm/test/CodeGen/Hexagon/load-const-extend-opt.ll
new file mode 100644
index 0000000..6f9e83c23
--- /dev/null
+++ b/llvm/test/CodeGen/Hexagon/load-const-extend-opt.ll
@@ -0,0 +1,68 @@
+; RUN: llc -march=hexagon -O3 -hexagon-small-data-threshold=0 < %s | FileCheck %s
+; This test checks that if there are more than 2 uses of a constant address, the
+; value is moved into a register and all instances of the constant are replaced
+; with the register. The GenMemAbsolute pass generates an absolute-set instruction
+; if there are more than 2 uses of this register.
+
+; CHECK: loadi32_3
+; CHECK-NOT: r{{[0-9]+}} = memw(##441652)
+; CHECK-NOT: r{{[0-9]+}} = memw(r{{[0-9]+}}+#0)
+; CHECK: r{{[0-9]+}} = memw(r[[REG:[0-9]+]]=##441652)
+; CHECK-NOT: r{{[0-9]+}} = memw(##441652)
+; CHECK: r{{[0-9]+}} = memw(r[[REG]]+#0)
+; CHECK-NOT: r{{[0-9]+}} = memw(##441652)
+; CHECK: r{{[0-9]+}} = memw(r[[REG]]+#0)
+; CHECK-NOT: r{{[0-9]+}} = memw(##441652)
+
+define void @loadi32_3() #0 {
+entry:
+ %0 = load volatile i32, ptr inttoptr (i32 441652 to ptr), align 4
+ %1 = load volatile i32, ptr inttoptr (i32 441652 to ptr), align 4
+ %2 = load volatile i32, ptr inttoptr (i32 441652 to ptr), align 4
+ ret void
+}
+
+; CHECK: loadi32_2
+; CHECK-NOT: r{{[0-9]+}} = ##441652
+; CHECK: r{{[0-9]+}} = memw(##441652)
+; CHECK: r{{[0-9]+}} = memw(##441652)
+
+define void @loadi32_2() #0 {
+entry:
+ %0 = load volatile i32, ptr inttoptr (i32 441652 to ptr), align 4
+ %1 = load volatile i32, ptr inttoptr (i32 441652 to ptr), align 4
+ ret void
+}
+
+; CHECK: loadi32_abs_global_3
+; CHECK-NOT: r{{[0-9]+}} = memw(##globalInt)
+; CHECK-NOT: r{{[0-9]+}} = memw(r{{[0-9]+}}+#0)
+; CHECK: r{{[0-9]+}} = memw(r[[REG:[0-9]+]]=##globalInt)
+; CHECK-NOT: r{{[0-9]+}} = memw(##globalInt)
+; CHECK: r{{[0-9]+}} = memw(r[[REG]]+#0)
+; CHECK-NOT: r{{[0-9]+}} = memw(##globalInt)
+; CHECK: r{{[0-9]+}} = memw(r[[REG]]+#0)
+; CHECK-NOT: r{{[0-9]+}} = memw(##globalInt)
+
+@globalInt = external global i32, align 8
+define void @loadi32_abs_global_3() #0 {
+entry:
+ %0 = load volatile i32, ptr @globalInt, align 4
+ %1 = load volatile i32, ptr @globalInt, align 4
+ %2 = load volatile i32, ptr @globalInt, align 4
+ ret void
+}
+
+; CHECK: loadi32_abs_global_2
+; CHECK-NOT: r[[REG:[0-9]+]] = ##globalInt
+; CHECK: r{{[0-9]+}} = memw(##globalInt)
+; CHECK: r{{[0-9]+}} = memw(##globalInt)
+
+define void @loadi32_abs_global_2() #0 {
+entry:
+ %0 = load volatile i32, ptr @globalInt, align 4
+ %1 = load volatile i32, ptr @globalInt, align 4
+ ret void
+}
+
+attributes #0 = { nounwind }
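For reference, the absolute-set form that the CHECK lines above look for turns repeated absolute loads into one address-setting load plus register-offset loads. A rough sketch in Hexagon assembly (editorial illustration only, not part of the patch; register numbers are made up):

    // before: every use re-materializes the constant address
    r1 = memw(##441652)
    r2 = memw(##441652)
    r3 = memw(##441652)
    // after: the first load also sets an address register (absolute-set),
    // and the remaining loads go through that register
    r1 = memw(r4=##441652)
    r2 = memw(r4+#0)
    r3 = memw(r4+#0)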
diff --git a/llvm/test/CodeGen/Hexagon/post-inc-vec.mir b/llvm/test/CodeGen/Hexagon/post-inc-vec.mir
new file mode 100644
index 0000000..3788dc3
--- /dev/null
+++ b/llvm/test/CodeGen/Hexagon/post-inc-vec.mir
@@ -0,0 +1,413 @@
+# RUN: llc -march=hexagon -run-pass hexagon-postincopt %s -o - | FileCheck %s
+
+# Test that we do not generate two post-increment vector loads/stores
+# in the loop.
+# CHECK: J2_loop0r
+# CHECK: V6_vS32b_pi
+# CHECK-NOT: = V6_vL32b_pi
+# CHECK: V6_vL32b_ai
+# CHECK: V6_vL32b_ai
+# CHECK: V6_vS32b_ai
+# CHECK: ENDLOOP0
+
+--- |
+ ; Function Attrs: nocallback nofree nosync nounwind willreturn memory(none)
+ declare <1024 x i1> @llvm.hexagon.V6.pred.scalar2v2.128B(i32) #0
+
+ ; Function Attrs: nocallback nofree nosync nounwind willreturn memory(write)
+ declare void @llvm.hexagon.V6.vS32b.qpred.ai.128B(<1024 x i1>, ptr, <32 x i32>) #1
+
+ ; Function Attrs: nocallback nofree nosync nounwind willreturn memory(none)
+ declare <64 x i32> @llvm.hexagon.V6.vcombine.128B(<32 x i32>, <32 x i32>) #0
+
+ ; Function Attrs: nocallback nofree nosync nounwind willreturn memory(none)
+ declare <32 x i32> @llvm.hexagon.V6.hi.128B(<64 x i32>) #0
+
+ ; Function Attrs: nocallback nofree nosync nounwind willreturn memory(none)
+ declare <32 x i32> @llvm.hexagon.V6.lo.128B(<64 x i32>) #0
+
+ ; Function Attrs: nocallback nofree nosync nounwind willreturn memory(none)
+ declare <32 x i32> @llvm.hexagon.V6.lvsplath.128B(i32) #0
+
+ ; Function Attrs: nocallback nofree nosync nounwind willreturn memory(none)
+ declare <32 x i32> @llvm.hexagon.V6.vasrhbsat.128B(<32 x i32>, <32 x i32>, i32) #0
+
+ ; Function Attrs: nocallback nofree nosync nounwind willreturn memory(none)
+ declare <32 x i32> @llvm.hexagon.V6.vaddhsat.128B(<32 x i32>, <32 x i32>) #0
+
+ ; Function Attrs: nocallback nofree nosync nounwind willreturn memory(none)
+ declare <32 x i32> @llvm.hexagon.V6.vasrh.128B(<32 x i32>, i32) #0
+
+ ; Function Attrs: nocallback nofree nosync nounwind willreturn memory(none)
+ declare <32 x i32> @llvm.hexagon.V6.vavgh.128B(<32 x i32>, <32 x i32>) #0
+
+ ; Function Attrs: nocallback nofree nosync nounwind willreturn memory(none)
+ declare <64 x i32> @llvm.hexagon.V6.vmpabusv.128B(<64 x i32>, <64 x i32>) #0
+
+ ; Function Attrs: nocallback nofree nosync nounwind willreturn memory(inaccessiblemem: write)
+ declare void @llvm.assume(i1 noundef) #2
+
+ ; Function Attrs: noinline nounwind
+ define void @blah(i32 %0, i32 %1, ptr noalias %2, ptr noalias nocapture readonly %3, ptr noalias nocapture readonly %4, ptr nocapture readnone %5, ptr nocapture readnone %6, i32 %7, i32 %8, ptr nocapture readonly %9, ptr nocapture readonly %10) local_unnamed_addr #3 {
+ entry:
+ %11 = call i32 @llvm.hexagon.S2.extractu(i32 %0, i32 23, i32 9)
+ %12 = shl i32 %11, 7
+ %mul16.i = mul nsw i32 %12, %1
+ %add.i = add nsw i32 %1, 1
+ %mul17.i = mul nsw i32 %add.i, %12
+ %cmp184.i = icmp slt i32 %mul16.i, %mul17.i
+ br i1 %cmp184.i, label %for.body.lr.ph.i, label %for.end.i
+
+ for.body.lr.ph.i: ; preds = %entry
+ %13 = tail call <64 x i32> @llvm.hexagon.V6.vcombine.128B(<32 x i32> <i32 1077952576, i32 1077952576, i32 1077952576, i32 1077952576, i32 1077952576, i32 1077952576, i32 1077952576, i32 1077952576, i32 1077952576, i32 1077952576, i32 1077952576, i32 1077952576, i32 1077952576, i32 1077952576, i32 1077952576, i32 1077952576, i32 1077952576, i32 1077952576, i32 1077952576, i32 1077952576, i32 1077952576, i32 1077952576, i32 1077952576, i32 1077952576, i32 1077952576, i32 1077952576, i32 1077952576, i32 1077952576, i32 1077952576, i32 1077952576, i32 1077952576, i32 1077952576>, <32 x i32> <i32 269488144, i32 269488144, i32 269488144, i32 269488144, i32 269488144, i32 269488144, i32 269488144, i32 269488144, i32 269488144, i32 269488144, i32 269488144, i32 269488144, i32 269488144, i32 269488144, i32 269488144, i32 269488144, i32 269488144, i32 269488144, i32 269488144, i32 269488144, i32 269488144, i32 269488144, i32 269488144, i32 269488144, i32 269488144, i32 269488144, i32 269488144, i32 269488144, i32 269488144, i32 269488144, i32 269488144, i32 269488144>) #5
+ %14 = tail call <64 x i32> @llvm.hexagon.V6.vcombine.128B(<32 x i32> zeroinitializer, <32 x i32> zeroinitializer) #5
+ %15 = tail call <32 x i32> @llvm.hexagon.V6.lvsplath.128B(i32 32) #5
+ %cgep = getelementptr i8, ptr %2, i32 %mul16.i
+ %cgep8 = getelementptr i8, ptr %4, i32 %mul16.i
+ %cgep9 = getelementptr i8, ptr %3, i32 %mul16.i
+ br label %for.body.i
+
+ for.body.i: ; preds = %for.body.i, %for.body.lr.ph.i
+ %lsr.iv6 = phi ptr [ %cgep12, %for.body.i ], [ %cgep9, %for.body.lr.ph.i ]
+ %lsr.iv3 = phi ptr [ %cgep11, %for.body.i ], [ %cgep8, %for.body.lr.ph.i ]
+ %lsr.iv = phi ptr [ %cgep10, %for.body.i ], [ %cgep, %for.body.lr.ph.i ]
+ %elemIdx.05.i = phi i32 [ %mul16.i, %for.body.lr.ph.i ], [ %add19.i, %for.body.i ]
+ %16 = load <32 x i32>, ptr %lsr.iv6, align 128
+ %17 = load <32 x i32>, ptr %lsr.iv3, align 128
+ %18 = tail call <64 x i32> @llvm.hexagon.V6.vcombine.128B(<32 x i32> %17, <32 x i32> %16) #5
+ %19 = tail call <64 x i32> @llvm.hexagon.V6.vmpabusv.128B(<64 x i32> %13, <64 x i32> %18) #5
+ %20 = tail call <64 x i32> @llvm.hexagon.V6.vmpabusv.128B(<64 x i32> %14, <64 x i32> %18) #5
+ %21 = tail call <32 x i32> @llvm.hexagon.V6.hi.128B(<64 x i32> %19) #5
+ %22 = tail call <32 x i32> @llvm.hexagon.V6.hi.128B(<64 x i32> %20) #5
+ %23 = tail call <32 x i32> @llvm.hexagon.V6.vasrh.128B(<32 x i32> %22, i32 7) #5
+ %24 = tail call <32 x i32> @llvm.hexagon.V6.vavgh.128B(<32 x i32> %21, <32 x i32> %23) #5
+ %25 = tail call <32 x i32> @llvm.hexagon.V6.lo.128B(<64 x i32> %19) #5
+ %26 = tail call <32 x i32> @llvm.hexagon.V6.lo.128B(<64 x i32> %20) #5
+ %27 = tail call <32 x i32> @llvm.hexagon.V6.vasrh.128B(<32 x i32> %26, i32 7) #5
+ %28 = tail call <32 x i32> @llvm.hexagon.V6.vavgh.128B(<32 x i32> %25, <32 x i32> %27) #5
+ %29 = tail call <32 x i32> @llvm.hexagon.V6.vaddhsat.128B(<32 x i32> %24, <32 x i32> %15) #5
+ %30 = tail call <32 x i32> @llvm.hexagon.V6.vaddhsat.128B(<32 x i32> %28, <32 x i32> %15) #5
+ %31 = tail call <32 x i32> @llvm.hexagon.V6.vasrhbsat.128B(<32 x i32> %29, <32 x i32> %30, i32 4) #5
+ store <32 x i32> %31, ptr %lsr.iv, align 128
+ %add19.i = add nsw i32 %elemIdx.05.i, 128
+ %cmp18.i = icmp slt i32 %add19.i, %mul17.i
+ %cgep10 = getelementptr i8, ptr %lsr.iv, i32 128
+ %cgep11 = getelementptr i8, ptr %lsr.iv3, i32 128
+ %cgep12 = getelementptr i8, ptr %lsr.iv6, i32 128
+ br i1 %cmp18.i, label %for.body.i, label %for.end.i
+
+ for.end.i: ; preds = %for.body.i, %entry
+ ret void
+ }
+
+ ; Function Attrs: nocallback nofree nosync nounwind willreturn memory(argmem: readwrite)
+ declare void @llvm.lifetime.start.p0(i64 immarg, ptr nocapture) #4
+
+ ; Function Attrs: nocallback nofree nosync nounwind willreturn memory(none)
+ declare i32 @llvm.hexagon.S2.extractu(i32, i32 immarg, i32 immarg) #0
+
+ attributes #0 = { nocallback nofree nosync nounwind willreturn memory(none) }
+ attributes #1 = { nocallback nofree nosync nounwind willreturn memory(write) }
+ attributes #2 = { nocallback nofree nosync nounwind willreturn memory(inaccessiblemem: write) }
+ attributes #3 = { noinline nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="true" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="hexagonv68" "target-features"="+hvx-length128b,+hvxv68,+v68,-long-calls,-small-data" "unsafe-fp-math"="false" "use-soft-float"="false" }
+ attributes #4 = { nocallback nofree nosync nounwind willreturn memory(argmem: readwrite) }
+ attributes #5 = { nounwind }
+
+...
+---
+name: blah
+alignment: 16
+exposesReturnsTwice: false
+legalized: false
+regBankSelected: false
+selected: false
+failedISel: false
+tracksRegLiveness: true
+hasWinCFI: false
+callsEHReturn: false
+callsUnwindInit: false
+hasEHCatchret: false
+hasEHScopes: false
+hasEHFunclets: false
+isOutlined: false
+debugInstrRef: false
+failsVerification: false
+tracksDebugUserValues: false
+registers:
+ - { id: 0, class: intregs, preferred-register: '' }
+ - { id: 1, class: intregs, preferred-register: '' }
+ - { id: 2, class: hvxwr, preferred-register: '' }
+ - { id: 3, class: hvxwr, preferred-register: '' }
+ - { id: 4, class: hvxvr, preferred-register: '' }
+ - { id: 5, class: intregs, preferred-register: '' }
+ - { id: 6, class: intregs, preferred-register: '' }
+ - { id: 7, class: intregs, preferred-register: '' }
+ - { id: 8, class: intregs, preferred-register: '' }
+ - { id: 9, class: intregs, preferred-register: '' }
+ - { id: 10, class: intregs, preferred-register: '' }
+ - { id: 11, class: intregs, preferred-register: '' }
+ - { id: 12, class: intregs, preferred-register: '' }
+ - { id: 13, class: intregs, preferred-register: '' }
+ - { id: 14, class: intregs, preferred-register: '' }
+ - { id: 15, class: intregs, preferred-register: '' }
+ - { id: 16, class: intregs, preferred-register: '' }
+ - { id: 17, class: intregs, preferred-register: '' }
+ - { id: 18, class: intregs, preferred-register: '' }
+ - { id: 19, class: intregs, preferred-register: '' }
+ - { id: 20, class: intregs, preferred-register: '' }
+ - { id: 21, class: intregs, preferred-register: '' }
+ - { id: 22, class: intregs, preferred-register: '' }
+ - { id: 23, class: intregs, preferred-register: '' }
+ - { id: 24, class: intregs, preferred-register: '' }
+ - { id: 25, class: predregs, preferred-register: '' }
+ - { id: 26, class: predregs, preferred-register: '' }
+ - { id: 27, class: hvxvr, preferred-register: '' }
+ - { id: 28, class: intregs, preferred-register: '' }
+ - { id: 29, class: hvxvr, preferred-register: '' }
+ - { id: 30, class: intregs, preferred-register: '' }
+ - { id: 31, class: hvxvr, preferred-register: '' }
+ - { id: 32, class: intregs, preferred-register: '' }
+ - { id: 33, class: hvxvr, preferred-register: '' }
+ - { id: 34, class: hvxvr, preferred-register: '' }
+ - { id: 35, class: hvxwr, preferred-register: '' }
+ - { id: 36, class: hvxwr, preferred-register: '' }
+ - { id: 37, class: hvxwr, preferred-register: '' }
+ - { id: 38, class: hvxvr, preferred-register: '' }
+ - { id: 39, class: hvxvr, preferred-register: '' }
+ - { id: 40, class: intregs, preferred-register: '' }
+ - { id: 41, class: hvxvr, preferred-register: '' }
+ - { id: 42, class: hvxvr, preferred-register: '' }
+ - { id: 43, class: hvxvr, preferred-register: '' }
+ - { id: 44, class: hvxvr, preferred-register: '' }
+ - { id: 45, class: hvxvr, preferred-register: '' }
+ - { id: 46, class: hvxvr, preferred-register: '' }
+ - { id: 47, class: hvxvr, preferred-register: '' }
+ - { id: 48, class: hvxvr, preferred-register: '' }
+ - { id: 49, class: intregslow8, preferred-register: '' }
+ - { id: 50, class: hvxvr, preferred-register: '' }
+ - { id: 51, class: predregs, preferred-register: '' }
+ - { id: 52, class: intregs, preferred-register: '' }
+ - { id: 53, class: intregs, preferred-register: '' }
+ - { id: 54, class: intregs, preferred-register: '' }
+ - { id: 55, class: intregs, preferred-register: '' }
+ - { id: 56, class: intregs, preferred-register: '' }
+ - { id: 57, class: intregs, preferred-register: '' }
+ - { id: 58, class: intregs, preferred-register: '' }
+ - { id: 59, class: intregs, preferred-register: '' }
+ - { id: 60, class: intregs, preferred-register: '' }
+ - { id: 61, class: hvxvr, preferred-register: '' }
+ - { id: 62, class: intregs, preferred-register: '' }
+ - { id: 63, class: hvxvr, preferred-register: '' }
+ - { id: 64, class: intregs, preferred-register: '' }
+ - { id: 65, class: hvxwr, preferred-register: '' }
+ - { id: 66, class: hvxwr, preferred-register: '' }
+ - { id: 67, class: hvxwr, preferred-register: '' }
+ - { id: 68, class: hvxvr, preferred-register: '' }
+ - { id: 69, class: hvxvr, preferred-register: '' }
+ - { id: 70, class: hvxvr, preferred-register: '' }
+ - { id: 71, class: hvxvr, preferred-register: '' }
+ - { id: 72, class: hvxvr, preferred-register: '' }
+ - { id: 73, class: hvxvr, preferred-register: '' }
+ - { id: 74, class: hvxvr, preferred-register: '' }
+ - { id: 75, class: intregs, preferred-register: '' }
+ - { id: 76, class: intregs, preferred-register: '' }
+ - { id: 77, class: intregs, preferred-register: '' }
+ - { id: 78, class: intregs, preferred-register: '' }
+ - { id: 79, class: hvxvr, preferred-register: '' }
+ - { id: 80, class: intregs, preferred-register: '' }
+ - { id: 81, class: hvxvr, preferred-register: '' }
+ - { id: 82, class: intregs, preferred-register: '' }
+ - { id: 83, class: hvxwr, preferred-register: '' }
+ - { id: 84, class: hvxwr, preferred-register: '' }
+ - { id: 85, class: hvxwr, preferred-register: '' }
+ - { id: 86, class: hvxvr, preferred-register: '' }
+ - { id: 87, class: hvxvr, preferred-register: '' }
+ - { id: 88, class: hvxvr, preferred-register: '' }
+ - { id: 89, class: hvxvr, preferred-register: '' }
+ - { id: 90, class: hvxvr, preferred-register: '' }
+ - { id: 91, class: hvxvr, preferred-register: '' }
+ - { id: 92, class: hvxvr, preferred-register: '' }
+ - { id: 93, class: intregs, preferred-register: '' }
+ - { id: 94, class: intregs, preferred-register: '' }
+ - { id: 95, class: intregs, preferred-register: '' }
+ - { id: 96, class: intregs, preferred-register: '' }
+ - { id: 97, class: predregs, preferred-register: '' }
+ - { id: 98, class: predregs, preferred-register: '' }
+liveins:
+ - { reg: '$r0', virtual-reg: '%16' }
+ - { reg: '$r1', virtual-reg: '%17' }
+ - { reg: '$r2', virtual-reg: '%18' }
+ - { reg: '$r3', virtual-reg: '%19' }
+ - { reg: '$r4', virtual-reg: '%20' }
+frameInfo:
+ isFrameAddressTaken: false
+ isReturnAddressTaken: false
+ hasStackMap: false
+ hasPatchPoint: false
+ stackSize: 0
+ offsetAdjustment: 0
+ maxAlignment: 4
+ adjustsStack: false
+ hasCalls: false
+ stackProtector: ''
+ functionContext: ''
+ maxCallFrameSize: 4294967295
+ cvBytesOfCalleeSavedRegisters: 0
+ hasOpaqueSPAdjustment: false
+ hasVAStart: false
+ hasMustTailInVarArgFunc: false
+ hasTailCall: false
+ localFrameSize: 0
+ savePoint: ''
+ restorePoint: ''
+fixedStack:
+ - { id: 0, type: default, offset: 24, size: 4, alignment: 8, stack-id: default,
+ isImmutable: true, isAliased: false, callee-saved-register: '', callee-saved-restored: true,
+ debug-info-variable: '', debug-info-expression: '', debug-info-location: '' }
+ - { id: 1, type: default, offset: 20, size: 4, alignment: 4, stack-id: default,
+ isImmutable: true, isAliased: false, callee-saved-register: '', callee-saved-restored: true,
+ debug-info-variable: '', debug-info-expression: '', debug-info-location: '' }
+ - { id: 2, type: default, offset: 16, size: 4, alignment: 8, stack-id: default,
+ isImmutable: true, isAliased: false, callee-saved-register: '', callee-saved-restored: true,
+ debug-info-variable: '', debug-info-expression: '', debug-info-location: '' }
+ - { id: 3, type: default, offset: 12, size: 4, alignment: 4, stack-id: default,
+ isImmutable: true, isAliased: false, callee-saved-register: '', callee-saved-restored: true,
+ debug-info-variable: '', debug-info-expression: '', debug-info-location: '' }
+ - { id: 4, type: default, offset: 8, size: 4, alignment: 8, stack-id: default,
+ isImmutable: true, isAliased: false, callee-saved-register: '', callee-saved-restored: true,
+ debug-info-variable: '', debug-info-expression: '', debug-info-location: '' }
+stack: []
+entry_values: []
+callSites: []
+debugValueSubstitutions: []
+constants: []
+machineFunctionInfo: {}
+body: |
+ bb.0.entry:
+ successors: %bb.1(0x40000000), %bb.3(0x40000000)
+ liveins: $r0, $r1, $r2, $r3, $r4
+
+ %20:intregs = COPY $r4
+ %19:intregs = COPY $r3
+ %18:intregs = COPY $r2
+ %17:intregs = COPY $r1
+ %16:intregs = COPY $r0
+ %22:intregs = S2_extractu %16, 23, 9
+ %23:intregs = S2_asl_i_r %22, 7
+ %0:intregs = nsw M2_mpyi %23, %17
+ %24:intregs = nsw A2_addi %17, 1
+ %1:intregs = nsw M2_mpyi %24, %23
+ %25:predregs = C2_cmpgt %1, %0
+ J2_jumpf %25, %bb.3, implicit-def dead $pc
+ J2_jump %bb.1, implicit-def dead $pc
+
+ bb.1.for.body.lr.ph.i:
+ successors: %bb.4(0x40000000), %bb.6(0x40000000)
+
+ %28:intregs = A2_tfrsi 269488144
+ %27:hvxvr = V6_lvsplatw %28
+ %30:intregs = A2_tfrsi 1077952576
+ %29:hvxvr = V6_lvsplatw %30
+ %2:hvxwr = REG_SEQUENCE %29, %subreg.vsub_hi, %27, %subreg.vsub_lo
+ %31:hvxvr = V6_vd0
+ %3:hvxwr = REG_SEQUENCE %31, %subreg.vsub_hi, %31, %subreg.vsub_lo
+ %32:intregs = A2_tfrsi 32
+ %4:hvxvr = V6_lvsplath %32
+ %5:intregs = A2_add %18, %0
+ %6:intregs = A2_add %20, %0
+ %7:intregs = A2_add %19, %0
+ %40:intregs = A2_tfrsi 7
+ %49:intregslow8 = A2_tfrsi 4
+ %52:intregs = A2_sub %1, %0
+ %53:intregs = A2_addi %52, 127
+ %54:intregs = S2_lsr_i_r %53, 7
+ %55:intregs = COPY %54
+ %56:intregs = S2_lsr_i_r %55, 1
+ %57:intregs = A2_andir %55, 1
+ %97:predregs = C2_cmpgtui %56, 0
+ J2_jumpf %97, %bb.6, implicit-def $pc
+ J2_jump %bb.4, implicit-def $pc
+
+ bb.4:
+ successors: %bb.5(0x80000000)
+
+ J2_loop0r %bb.5, %56, implicit-def $lc0, implicit-def $sa0, implicit-def $usr
+ J2_jump %bb.5, implicit-def $pc
+
+ bb.5:
+ successors: %bb.5(0x40000000), %bb.6(0x40000000)
+
+ %58:intregs = PHI %7, %bb.4, %80, %bb.5
+ %59:intregs = PHI %6, %bb.4, %82, %bb.5
+ %60:intregs = PHI %5, %bb.4, %93, %bb.5
+ %61:hvxvr, %62:intregs = V6_vL32b_pi %58, 128 :: (load (s1024) from %ir.lsr.iv6)
+ %63:hvxvr, %64:intregs = V6_vL32b_pi %59, 128 :: (load (s1024) from %ir.lsr.iv3)
+ %65:hvxwr = REG_SEQUENCE %63, %subreg.vsub_hi, %61, %subreg.vsub_lo
+ %66:hvxwr = V6_vmpabusv %2, %65
+ %67:hvxwr = V6_vmpabusv %3, %65
+ %68:hvxvr = V6_vasrh %67.vsub_hi, %40
+ %69:hvxvr = V6_vavgh %66.vsub_hi, %68
+ %70:hvxvr = V6_vasrh %67.vsub_lo, %40
+ %71:hvxvr = V6_vavgh %66.vsub_lo, %70
+ %72:hvxvr = V6_vaddhsat %69, %4
+ %73:hvxvr = V6_vaddhsat %71, %4
+ %74:hvxvr = V6_vasrhbsat %72, %73, %49
+ %75:intregs = V6_vS32b_pi %60, 128, %74 :: (store (s1024) into %ir.lsr.iv)
+ %79:hvxvr, %80:intregs = V6_vL32b_pi %62, 128 :: (load (s1024) from %ir.lsr.iv6 + 128)
+ %81:hvxvr, %82:intregs = V6_vL32b_pi %64, 128 :: (load (s1024) from %ir.lsr.iv3 + 128)
+ %83:hvxwr = REG_SEQUENCE %81, %subreg.vsub_hi, %79, %subreg.vsub_lo
+ %84:hvxwr = V6_vmpabusv %2, %83
+ %85:hvxwr = V6_vmpabusv %3, %83
+ %86:hvxvr = V6_vasrh %85.vsub_hi, %40
+ %87:hvxvr = V6_vavgh %84.vsub_hi, %86
+ %88:hvxvr = V6_vasrh %85.vsub_lo, %40
+ %89:hvxvr = V6_vavgh %84.vsub_lo, %88
+ %90:hvxvr = V6_vaddhsat %87, %4
+ %91:hvxvr = V6_vaddhsat %89, %4
+ %92:hvxvr = V6_vasrhbsat %90, %91, %49
+ %93:intregs = V6_vS32b_pi %75, 128, %92 :: (store (s1024) into %ir.lsr.iv + 128)
+ ENDLOOP0 %bb.5, implicit-def $pc, implicit-def $lc0, implicit $sa0, implicit $lc0
+ J2_jump %bb.6, implicit-def $pc
+
+ bb.6:
+ successors: %bb.7(0x40000000), %bb.8(0x40000000)
+
+ %94:intregs = PHI %7, %bb.1, %80, %bb.5
+ %95:intregs = PHI %6, %bb.1, %82, %bb.5
+ %96:intregs = PHI %5, %bb.1, %93, %bb.5
+ %98:predregs = C2_cmpgtui %57, 0
+ J2_jumpf %98, %bb.8, implicit-def $pc
+ J2_jump %bb.7, implicit-def $pc
+
+ bb.7:
+ successors: %bb.2(0x80000000)
+
+ J2_jump %bb.2, implicit-def $pc
+
+ bb.2.for.body.i (machine-block-address-taken):
+ successors: %bb.8(0x04000000)
+
+ %33:hvxvr, %15:intregs = V6_vL32b_pi %94, 128 :: (load (s1024) from %ir.lsr.iv6)
+ %34:hvxvr, %14:intregs = V6_vL32b_pi %95, 128 :: (load (s1024) from %ir.lsr.iv3)
+ %35:hvxwr = REG_SEQUENCE %34, %subreg.vsub_hi, %33, %subreg.vsub_lo
+ %36:hvxwr = V6_vmpabusv %2, %35
+ %37:hvxwr = V6_vmpabusv %3, %35
+ %41:hvxvr = V6_vasrh %37.vsub_hi, %40
+ %42:hvxvr = V6_vavgh %36.vsub_hi, %41
+ %45:hvxvr = V6_vasrh %37.vsub_lo, %40
+ %46:hvxvr = V6_vavgh %36.vsub_lo, %45
+ %47:hvxvr = V6_vaddhsat %42, %4
+ %48:hvxvr = V6_vaddhsat %46, %4
+ %50:hvxvr = V6_vasrhbsat %47, %48, %49
+ %13:intregs = V6_vS32b_pi %96, 128, %50 :: (store (s1024) into %ir.lsr.iv)
+ J2_jump %bb.8, implicit-def $pc
+
+ bb.8:
+ successors: %bb.3(0x80000000)
+
+ J2_jump %bb.3, implicit-def $pc
+
+ bb.3.for.end.i:
+ PS_jmpret $r31, implicit-def dead $pc
+
+...
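For context on the CHECK lines at the top of this file (editorial note, not part of the patch): the checks require the loads in the hardware loop to use the base+offset form while only the store keeps a post-increment. In Hexagon HVX assembly the two addressing forms look roughly like this (register numbers are illustrative):

    v0 = vmem(r1++#1)   // post-increment form (V6_vL32b_pi): r1 advances by one vector
    v0 = vmem(r1+#0)    // base+offset form (V6_vL32b_ai): r1 is left unchanged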
diff --git a/llvm/test/CodeGen/Hexagon/post_inc_store.mir b/llvm/test/CodeGen/Hexagon/post_inc_store.mir
new file mode 100644
index 0000000..3e3f51a
--- /dev/null
+++ b/llvm/test/CodeGen/Hexagon/post_inc_store.mir
@@ -0,0 +1,168 @@
+# RUN: llc -march=hexagon -run-pass hexagon-postincopt %s -o - | FileCheck %s
+
+# Test that we convert a post-inc load and store to a regular load and post-inc
+# store.
+# CHECK: J2_loop0r
+# CHECK-NOT: = L2_loadruh_pi
+# CHECK: L2_loadruh_io
+# CHECK: S2_storerh_pi
+# CHECK: ENDLOOP0
+
+--- |
+ ; Function Attrs: nofree norecurse nounwind
+ define dso_local void @blam(i32 %arg, ptr nocapture %arg1, i16 signext %arg2) local_unnamed_addr #0 {
+ bb:
+ %icmp = icmp eq i32 %arg, 0
+ br i1 %icmp, label %bb13, label %bb3
+
+ bb3: ; preds = %bb, %bb10
+ %phi = phi i32 [ %add11, %bb10 ], [ 0, %bb ]
+ %mul = mul i32 %phi, %arg
+ %cgep = getelementptr i16, ptr %arg1, i32 %mul
+ br label %bb4
+
+ bb4: ; preds = %bb4, %bb3
+ %lsr.iv = phi i32 [ %lsr.iv.next, %bb4 ], [ %arg, %bb3 ]
+ %phi5 = phi ptr [ %cgep, %bb3 ], [ %cgep1, %bb4 ]
+ %load = load i16, ptr %phi5, align 2
+ %add = add i16 %load, %arg2
+ store i16 %add, ptr %phi5, align 2
+ %lsr.iv.next = add i32 %lsr.iv, -1
+ %icmp8 = icmp eq i32 %lsr.iv.next, 0
+ %cgep1 = getelementptr i16, ptr %phi5, i32 1
+ br i1 %icmp8, label %bb10, label %bb4
+
+ bb10: ; preds = %bb4
+ %add11 = add nuw i32 %phi, 1
+ %icmp12 = icmp eq i32 %add11, %arg
+ br i1 %icmp12, label %bb13, label %bb3
+
+ bb13: ; preds = %bb10, %bb
+ ret void
+ }
+
+ attributes #0 = { nofree norecurse nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="hexagonv68" "target-features"="+v68,-long-calls" "unsafe-fp-math"="false" "use-soft-float"="false" }
+
+...
+---
+name: blam
+alignment: 16
+exposesReturnsTwice: false
+legalized: false
+regBankSelected: false
+selected: false
+failedISel: false
+tracksRegLiveness: true
+hasWinCFI: false
+callsEHReturn: false
+callsUnwindInit: false
+hasEHCatchret: false
+hasEHScopes: false
+hasEHFunclets: false
+isOutlined: false
+debugInstrRef: false
+failsVerification: false
+tracksDebugUserValues: false
+registers:
+ - { id: 0, class: intregs, preferred-register: '' }
+ - { id: 1, class: intregs, preferred-register: '' }
+ - { id: 2, class: intregs, preferred-register: '' }
+ - { id: 3, class: intregs, preferred-register: '' }
+ - { id: 4, class: intregs, preferred-register: '' }
+ - { id: 5, class: intregs, preferred-register: '' }
+ - { id: 6, class: intregs, preferred-register: '' }
+ - { id: 7, class: intregs, preferred-register: '' }
+ - { id: 8, class: intregs, preferred-register: '' }
+ - { id: 9, class: intregs, preferred-register: '' }
+ - { id: 10, class: intregs, preferred-register: '' }
+ - { id: 11, class: intregs, preferred-register: '' }
+ - { id: 12, class: predregs, preferred-register: '' }
+ - { id: 13, class: intregs, preferred-register: '' }
+ - { id: 14, class: intregs, preferred-register: '' }
+ - { id: 15, class: intregs, preferred-register: '' }
+ - { id: 16, class: predregs, preferred-register: '' }
+ - { id: 17, class: predregs, preferred-register: '' }
+ - { id: 18, class: predregs, preferred-register: '' }
+ - { id: 19, class: predregs, preferred-register: '' }
+ - { id: 20, class: intregs, preferred-register: '' }
+ - { id: 21, class: intregs, preferred-register: '' }
+liveins:
+ - { reg: '$r0', virtual-reg: '%7' }
+ - { reg: '$r1', virtual-reg: '%8' }
+ - { reg: '$r2', virtual-reg: '%9' }
+frameInfo:
+ isFrameAddressTaken: false
+ isReturnAddressTaken: false
+ hasStackMap: false
+ hasPatchPoint: false
+ stackSize: 0
+ offsetAdjustment: 0
+ maxAlignment: 1
+ adjustsStack: false
+ hasCalls: false
+ stackProtector: ''
+ functionContext: ''
+ maxCallFrameSize: 4294967295
+ cvBytesOfCalleeSavedRegisters: 0
+ hasOpaqueSPAdjustment: false
+ hasVAStart: false
+ hasMustTailInVarArgFunc: false
+ hasTailCall: false
+ localFrameSize: 0
+ savePoint: ''
+ restorePoint: ''
+fixedStack: []
+stack: []
+entry_values: []
+callSites: []
+debugValueSubstitutions: []
+constants: []
+machineFunctionInfo: {}
+body: |
+ bb.0.bb:
+ successors: %bb.4(0x30000000), %bb.5(0x50000000)
+ liveins: $r0, $r1, $r2
+
+ %9:intregs = COPY $r2
+ %8:intregs = COPY $r1
+ %7:intregs = COPY $r0
+ %21:intregs = COPY %7
+ %20:intregs = COPY %7
+ %12:predregs = C2_cmpeqi %7, 0
+ J2_jumpt %12, %bb.4, implicit-def $pc
+
+ bb.5:
+ successors: %bb.1(0x80000000)
+
+ %11:intregs = A2_tfrsi 0
+ J2_loop1r %bb.1, %21, implicit-def $lc1, implicit-def $sa1
+
+ bb.1.bb3 (machine-block-address-taken):
+ successors: %bb.2(0x80000000)
+
+ %0:intregs = PHI %11, %bb.5, %6, %bb.3
+ %13:intregs = M2_mpyi %0, %7
+ %1:intregs = S2_addasl_rrri %8, %13, 1
+ J2_loop0r %bb.2, %20, implicit-def $lc0, implicit-def $sa0, implicit-def $usr
+
+ bb.2.bb4 (machine-block-address-taken):
+ successors: %bb.3(0x04000000), %bb.2(0x7c000000)
+
+ %3:intregs = PHI %1, %bb.1, %5, %bb.2
+ %14:intregs = L2_loadruh_io %3, 0 :: (load (s16) from %ir.phi5)
+ %15:intregs = A2_add %14, %9
+ %5:intregs = S2_storerh_pi %3, 2, %15 :: (store (s16) into %ir.phi5)
+ ENDLOOP0 %bb.2, implicit-def $pc, implicit-def $lc0, implicit $sa0, implicit $lc0
+ J2_jump %bb.3, implicit-def dead $pc
+
+ bb.3.bb10:
+ successors: %bb.4(0x04000000), %bb.1(0x7c000000)
+
+ %6:intregs = nuw A2_addi %0, 1
+ ENDLOOP1 %bb.1, implicit-def $pc, implicit-def $lc1, implicit $sa1, implicit $lc1
+ J2_jump %bb.4, implicit-def dead $pc
+
+ bb.4.bb13:
+ PS_jmpret $r31, implicit-def dead $pc
+
+...
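A minimal sketch of the constraint the CHECK lines above encode (editorial, not part of the patch; the %base, %sum, %v and %b names are made up): when a load and a store in the loop share the same base register, only one of them should post-increment it.

    before (both instructions post-increment the same base):
      %v, %b1 = L2_loadruh_pi %base, 2
      %b2     = S2_storerh_pi %base, 2, %sum
    after (the load uses a plain base+offset access; the store keeps the increment):
      %v  = L2_loadruh_io %base, 0
      %b2 = S2_storerh_pi %base, 2, %sum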
diff --git a/llvm/test/CodeGen/Hexagon/postincopt-crash.mir b/llvm/test/CodeGen/Hexagon/postincopt-crash.mir
new file mode 100644
index 0000000..e220534
--- /dev/null
+++ b/llvm/test/CodeGen/Hexagon/postincopt-crash.mir
@@ -0,0 +1,58 @@
+# RUN: llc -march=hexagon -run-pass=hexagon-postincopt %s -o /dev/null
+# REQUIRES: asserts
+# Test that we do not hit unreachable code when handling L4_ior_memoph_io.
+
+...
+---
+name: foo
+alignment: 4
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ successors: %bb.4(0x30000000), %bb.5(0x50000000)
+ liveins: $r0, $r1, $r2
+
+ %9:intregs = COPY $r2
+ %8:intregs = COPY $r1
+ %7:intregs = COPY $r0
+ %21:intregs = COPY %7
+ %20:intregs = COPY %7
+ %12:predregs = C2_cmpeqi %7, 0
+ J2_jumpt %12, %bb.4, implicit-def $pc
+
+ bb.5:
+ successors: %bb.1(0x80000000)
+
+ %11:intregs = A2_tfrsi 0
+ J2_loop1r %bb.1, %21, implicit-def $lc1, implicit-def $sa1
+
+ bb.1:
+ successors: %bb.2(0x80000000)
+
+ %0:intregs = PHI %11, %bb.5, %6, %bb.3
+ %13:intregs = M2_mpyi %0, %7
+ %1:intregs = S2_addasl_rrri %8, %13, 1
+ J2_loop0r %bb.2, %20, implicit-def $lc0, implicit-def $sa0, implicit-def $usr
+
+ bb.2:
+ successors: %bb.3(0x04000000), %bb.2(0x7c000000)
+
+ %3:intregs = PHI %1, %bb.1, %5, %bb.2
+ %14:intregs = L2_loadruh_io %3, 0
+ L4_ior_memoph_io %3:intregs, 0, 21
+ %15:intregs = A2_add %14, %9
+ %5:intregs = S2_storerh_pi %3, 2, %15
+ ENDLOOP0 %bb.2, implicit-def $pc, implicit-def $lc0, implicit $sa0, implicit $lc0
+ J2_jump %bb.3, implicit-def dead $pc
+
+ bb.3:
+ successors: %bb.4(0x04000000), %bb.1(0x7c000000)
+
+ %6:intregs = nuw A2_addi %0, 1
+ ENDLOOP1 %bb.1, implicit-def $pc, implicit-def $lc1, implicit $sa1, implicit $lc1
+ J2_jump %bb.4, implicit-def dead $pc
+
+ bb.4:
+ PS_jmpret $r31, implicit-def dead $pc
+
+...
diff --git a/llvm/test/CodeGen/Hexagon/postincopt-dcfetch.mir b/llvm/test/CodeGen/Hexagon/postincopt-dcfetch.mir
new file mode 100644
index 0000000..27d653c9
--- /dev/null
+++ b/llvm/test/CodeGen/Hexagon/postincopt-dcfetch.mir
@@ -0,0 +1,19 @@
+# RUN: llc -march=hexagon -run-pass hexagon-postincopt %s -o - | FileCheck %s
+# Check that this doesn't crash.
+# CHECK: Y2_dcfetchbo
+
+name: fred
+tracksRegLiveness: true
+body: |
+ bb.0:
+ successors: %bb.1
+ %0:intregs = IMPLICIT_DEF
+
+ bb.1:
+ successors: %bb.1
+
+ %1:intregs = PHI %0:intregs, %bb.0, %2:intregs, %bb.1
+ Y2_dcfetchbo %1:intregs, 0
+ %2:intregs = A2_addi %1:intregs, 1
+ J2_jump %bb.1, implicit-def dead $pc
+...
diff --git a/llvm/test/CodeGen/Hexagon/reg-by-name.ll b/llvm/test/CodeGen/Hexagon/reg-by-name.ll
index 4abea83..cc8807e 100644
--- a/llvm/test/CodeGen/Hexagon/reg-by-name.ll
+++ b/llvm/test/CodeGen/Hexagon/reg-by-name.ll
@@ -1,4 +1,4 @@
-; RUN: llc -march=hexagon < %s | FileCheck %s
+; RUN: llc -march=hexagon -hexagon-tfr-cleanup=0 < %s | FileCheck %s
target triple = "hexagon"
@@ -647,7 +647,7 @@ entry:
ret i32 %1
}
-attributes #0 = { noinline nounwind optnone "target-cpu"="hexagonv62" }
+attributes #0 = { noinline nounwind optnone "target-cpu"="hexagonv73" }
attributes #1 = { nounwind }
attributes #2 = { nounwind readonly }
diff --git a/llvm/test/CodeGen/Hexagon/store-const-extend-opt.ll b/llvm/test/CodeGen/Hexagon/store-const-extend-opt.ll
new file mode 100644
index 0000000..dccf176
--- /dev/null
+++ b/llvm/test/CodeGen/Hexagon/store-const-extend-opt.ll
@@ -0,0 +1,72 @@
+; RUN: llc -march=hexagon -O3 -hexagon-small-data-threshold=0 < %s | FileCheck %s
+; This test checks that if there are more than 2 uses of a constant address, the
+; address is moved into a register and all uses of the constant are replaced with
+; the register. The GenMemAbsolute pass generates an absolute-set instruction if
+; there are more than 2 uses of this register.
+
+; CHECK: storetrunci32_3
+; CHECK-NOT: memw(##441652) = r{{[0-9]+}}
+; CHECK-NOT: memw(r{{[0-9]+}}+#0) = r{{[0-9]+}}
+; CHECK:memw(r[[REG:[0-9]+]]=##441652) = r{{[0-9]+}}
+; CHECK-NOT: memw(##441652) = r{{[0-9]+}}
+; CHECK:memw(r[[REG]]+#0) = r{{[0-9]+}}
+; CHECK-NOT: memw(##441652) = r{{[0-9]+}}
+; CHECK:memw(r[[REG]]+#0) = r{{[0-9]+}}
+; CHECK-NOT: memw(##441652) = r{{[0-9]+}}
+
+define void @storetrunci32_3(i64 %descr_addr, i32 %rpm_or_sys, i32 %kkr) #0 {
+entry:
+ %conv = trunc i64 %descr_addr to i32
+ store volatile i32 %conv, ptr inttoptr (i32 441652 to ptr), align 4
+ store volatile i32 %rpm_or_sys, ptr inttoptr (i32 441652 to ptr), align 4
+ store volatile i32 %kkr, ptr inttoptr (i32 441652 to ptr), align 4
+ ret void
+}
+
+; CHECK: storetrunci32_2
+; CHECK-NOT: r{{[0-9]+}} = ##441652
+; CHECK: memw(##441652) = r{{[0-9]+}}
+; CHECK: memw(##441652) = r{{[0-9]+}}
+
+define void @storetrunci32_2(i64 %descr_addr, i32 %rpm_or_sys) #0 {
+entry:
+ %conv = trunc i64 %descr_addr to i32
+ store volatile i32 %conv, ptr inttoptr (i32 441652 to ptr), align 4
+ store volatile i32 %rpm_or_sys, ptr inttoptr (i32 441652 to ptr), align 4
+ ret void
+}
+
+; CHECK: storetrunci32_abs_global_3
+; CHECK-NOT: memw(##globalInt) = r{{[0-9]+}}
+; CHECK-NOT: memw(r{{[0-9]+}}+#0) = r{{[0-9]+}}
+; CHECK:memw(r[[REG:[0-9]+]]=##globalInt) = r{{[0-9]+}}
+; CHECK-NOT: memw(##globalInt) = r{{[0-9]+}}
+; CHECK:memw(r[[REG]]+#0) = r{{[0-9]+}}
+; CHECK-NOT: memw(##globalInt) = r{{[0-9]+}}
+; CHECK:memw(r[[REG]]+#0) = r{{[0-9]+}}
+; CHECK-NOT: memw(##globalInt) = r{{[0-9]+}}
+
+@globalInt = external global i32, align 8
+define void @storetrunci32_abs_global_3(i64 %descr_addr, i32 %rpm_or_sys, i32 %kkr) #0 {
+entry:
+ %conv = trunc i64 %descr_addr to i32
+ store volatile i32 %conv, ptr @globalInt, align 4
+ store volatile i32 %rpm_or_sys, ptr @globalInt, align 4
+ store volatile i32 %kkr, ptr @globalInt, align 4
+ ret void
+}
+
+; CHECK: storetrunci32_abs_global_2
+; CHECK-NOT:r[[REG:[0-9]+]] = ##globalInt
+; CHECK:memw(##globalInt) = r{{[0-9]+}}
+; CHECK:memw(##globalInt) = r{{[0-9]+}}
+
+define void @storetrunci32_abs_global_2(i64 %descr_addr, i32 %rpm_or_sys) #0 {
+entry:
+ %conv = trunc i64 %descr_addr to i32
+ store volatile i32 %conv, ptr @globalInt, align 4
+ store volatile i32 %rpm_or_sys, ptr @globalInt, align 4
+ ret void
+}
+
+attributes #0 = { nounwind }
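The store-side counterpart of the rewrite illustrated for load-const-extend-opt.ll looks roughly like this (editorial sketch, register numbers made up): the first store sets the address register via the absolute-set form, and later stores reuse it instead of re-materializing the constant.

    memw(r4=##441652) = r1
    memw(r4+#0) = r2
    memw(r4+#0) = r3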
diff --git a/llvm/test/CodeGen/Hexagon/tfr-slotindex.ll b/llvm/test/CodeGen/Hexagon/tfr-slotindex.ll
new file mode 100644
index 0000000..cebba94
--- /dev/null
+++ b/llvm/test/CodeGen/Hexagon/tfr-slotindex.ll
@@ -0,0 +1,26 @@
+; Check that after tfr-cleanup the COPY to $r0 is converted to a tfrsi instruction.
+; The tfrsi instruction must use the same slot index as the COPY instruction
+; to avoid breaking live interval information.
+; Check that there is no machine verifier crash.
+
+; RUN: llc -stop-after=tfr-cleanup -verify-machineinstrs %s -o - | FileCheck %s
+
+; CHECK: $r0 = A2_tfrsi 34767
+
+target datalayout = "e-m:e-p:32:32:32-a:0-n16:32-i64:64:64-i32:32:32-i16:16:16-i1:8:8-f32:32:32-f64:64:64-v32:32:32-v64:64:64-v512:512:512-v1024:1024:1024-v2048:2048:2048"
+target triple = "hexagon"
+
+; Function Attrs: nounwind optsize
+define dso_local i32 @foo() local_unnamed_addr #0 {
+entry:
+ call void @bar(i32 34767) #1
+ call void @baz(i32 34767) #1
+ ret i32 15
+}
+
+declare void @bar(i32) local_unnamed_addr
+
+declare void @baz(i32) local_unnamed_addr
+
+attributes #0 = { nounwind optsize "frame-pointer"="all" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="hexagonv68" "target-features"="+v68,-long-calls" }
+attributes #1 = { noduplicate nomerge nounwind }
diff --git a/llvm/test/CodeGen/Hexagon/valid-offset-loadbsw4.mir b/llvm/test/CodeGen/Hexagon/valid-offset-loadbsw4.mir
new file mode 100644
index 0000000..fca42d5
--- /dev/null
+++ b/llvm/test/CodeGen/Hexagon/valid-offset-loadbsw4.mir
@@ -0,0 +1,32 @@
+# RUN: llc -march=hexagon -run-pass hexagon-postincopt -o - %s | FileCheck %s
+# REQUIRES: asserts
+
+# Check that this doesn't crash:
+# CHECK: L2_loadbsw4_io
+
+---
+name: fred
+tracksRegLiveness: true
+liveins:
+ - { reg: '$r0', virtual-reg: '%0' }
+body: |
+ bb.0:
+ successors: %bb.1(0x80000000)
+ liveins: $r0
+
+ %0:intregs = COPY $r0
+ %1:intregs = A2_tfrsi 240
+ %2:doubleregs = IMPLICIT_DEF
+ %3:doubleregs = IMPLICIT_DEF
+
+ bb.1:
+ successors: %bb.1(0x80000000)
+
+ %4:intregs = PHI %1, %bb.0, %5, %bb.1
+ %6:doubleregs = L2_loadbsw4_io %4, 0
+ %7:doubleregs = M2_vrmac_s0 %2, %6, %3
+ S2_storeri_io %0, 0, %7.isub_lo
+ %5:intregs = nuw A2_addi %4, 256
+ J2_jump %bb.1, implicit-def dead $pc
+
+...
diff --git a/llvm/test/CodeGen/LoongArch/intrinsic-memcpy.ll b/llvm/test/CodeGen/LoongArch/intrinsic-memcpy.ll
new file mode 100644
index 0000000..0945300
--- /dev/null
+++ b/llvm/test/CodeGen/LoongArch/intrinsic-memcpy.ll
@@ -0,0 +1,34 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc --mtriple=loongarch64 < %s | FileCheck %s
+
+%Box = type [6 x i64]
+
+define void @box(ptr noalias nocapture noundef writeonly sret(%Box) align 16 dereferenceable(48) %b, i64 %i) {
+; CHECK-LABEL: box:
+; CHECK: # %bb.0:
+; CHECK-NEXT: addi.d $sp, $sp, -96
+; CHECK-NEXT: .cfi_def_cfa_offset 96
+; CHECK-NEXT: slli.d $a2, $a1, 5
+; CHECK-NEXT: alsl.d $a1, $a1, $a2, 4
+; CHECK-NEXT: addi.d $a2, $sp, 0
+; CHECK-NEXT: add.d $a3, $a2, $a1
+; CHECK-NEXT: ldx.d $a1, $a1, $a2
+; CHECK-NEXT: st.d $a1, $a0, 0
+; CHECK-NEXT: ld.d $a1, $a3, 40
+; CHECK-NEXT: st.d $a1, $a0, 40
+; CHECK-NEXT: ld.d $a1, $a3, 32
+; CHECK-NEXT: st.d $a1, $a0, 32
+; CHECK-NEXT: ld.d $a1, $a3, 24
+; CHECK-NEXT: st.d $a1, $a0, 24
+; CHECK-NEXT: ld.d $a1, $a3, 16
+; CHECK-NEXT: st.d $a1, $a0, 16
+; CHECK-NEXT: ori $a1, $a3, 8
+; CHECK-NEXT: ld.d $a1, $a1, 0
+; CHECK-NEXT: st.d $a1, $a0, 8
+; CHECK-NEXT: addi.d $sp, $sp, 96
+; CHECK-NEXT: ret
+ %1 = alloca [2 x %Box], align 16
+ %2 = getelementptr inbounds [2 x %Box], ptr %1, i64 0, i64 %i
+ call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 16 dereferenceable(48) %b, ptr noundef nonnull align 16 dereferenceable(48) %2, i64 48, i1 false)
+ ret void
+}
diff --git a/llvm/test/CodeGen/NVPTX/param-load-store.ll b/llvm/test/CodeGen/NVPTX/param-load-store.ll
index c14dc88..a29d4e1 100644
--- a/llvm/test/CodeGen/NVPTX/param-load-store.ll
+++ b/llvm/test/CodeGen/NVPTX/param-load-store.ll
@@ -1135,31 +1135,86 @@ define %s_i8i32x4 @test_s_i1i32x4(%s_i8i32x4 %a) {
; CHECK-DAG: ld.param.u8 %r{{.*}}, [test_s_i1i32x4p_param_0+2];
; CHECK-DAG: ld.param.u8 %r{{.*}}, [test_s_i1i32x4p_param_0+1];
; CHECK-DAG: ld.param.u8 %r{{.*}}, [test_s_i1i32x4p_param_0];
-; --- TODO
-; --- Unaligned parameter store/ return value load is broken in both nvcc
-; --- and llvm and needs to be fixed.
; CHECK: .param .align 1 .b8 param0[25];
-; CHECK-DAG: st.param.b32 [param0+0],
-; CHECK-DAG: st.param.b32 [param0+4],
+; CHECK-DAG: st.param.b8 [param0+0],
+; CHECK-DAG: st.param.b8 [param0+1],
+; CHECK-DAG: st.param.b8 [param0+2],
+; CHECK-DAG: st.param.b8 [param0+3],
+; CHECK-DAG: st.param.b8 [param0+4],
+; CHECK-DAG: st.param.b8 [param0+5],
+; CHECK-DAG: st.param.b8 [param0+6],
+; CHECK-DAG: st.param.b8 [param0+7],
; CHECK-DAG: st.param.b8 [param0+8],
-; CHECK-DAG: st.param.b32 [param0+9],
-; CHECK-DAG: st.param.b32 [param0+13],
-; CHECK-DAG: st.param.b64 [param0+17],
+; CHECK-DAG: st.param.b8 [param0+9],
+; CHECK-DAG: st.param.b8 [param0+10],
+; CHECK-DAG: st.param.b8 [param0+11],
+; CHECK-DAG: st.param.b8 [param0+12],
+; CHECK-DAG: st.param.b8 [param0+13],
+; CHECK-DAG: st.param.b8 [param0+14],
+; CHECK-DAG: st.param.b8 [param0+15],
+; CHECK-DAG: st.param.b8 [param0+16],
+; CHECK-DAG: st.param.b8 [param0+17],
+; CHECK-DAG: st.param.b8 [param0+18],
+; CHECK-DAG: st.param.b8 [param0+19],
+; CHECK-DAG: st.param.b8 [param0+20],
+; CHECK-DAG: st.param.b8 [param0+21],
+; CHECK-DAG: st.param.b8 [param0+22],
+; CHECK-DAG: st.param.b8 [param0+23],
+; CHECK-DAG: st.param.b8 [param0+24],
; CHECK: .param .align 1 .b8 retval0[25];
; CHECK: call.uni (retval0),
; CHECK-NEXT: test_s_i1i32x4p,
-; CHECK-DAG: ld.param.b32 %r41, [retval0+0];
-; CHECK-DAG: ld.param.b32 %r42, [retval0+4];
-; CHECK-DAG: ld.param.b8 %rs2, [retval0+8];
-; CHECK-DAG: ld.param.b32 %r43, [retval0+9];
-; CHECK-DAG: ld.param.b32 %r44, [retval0+13];
-; CHECK-DAG: ld.param.b64 %rd23, [retval0+17];
-; CHECK-DAG: st.param.b32 [func_retval0+0],
-; CHECK-DAG: st.param.b32 [func_retval0+4],
+; CHECK-DAG: ld.param.b8 %rs{{[0-9]+}}, [retval0+0];
+; CHECK-DAG: ld.param.b8 %rs{{[0-9]+}}, [retval0+1];
+; CHECK-DAG: ld.param.b8 %rs{{[0-9]+}}, [retval0+2];
+; CHECK-DAG: ld.param.b8 %rs{{[0-9]+}}, [retval0+3];
+; CHECK-DAG: ld.param.b8 %rs{{[0-9]+}}, [retval0+4];
+; CHECK-DAG: ld.param.b8 %rs{{[0-9]+}}, [retval0+5];
+; CHECK-DAG: ld.param.b8 %rs{{[0-9]+}}, [retval0+6];
+; CHECK-DAG: ld.param.b8 %rs{{[0-9]+}}, [retval0+7];
+; CHECK-DAG: ld.param.b8 %rs{{[0-9]+}}, [retval0+8];
+; CHECK-DAG: ld.param.b8 %rs{{[0-9]+}}, [retval0+9];
+; CHECK-DAG: ld.param.b8 %rs{{[0-9]+}}, [retval0+10];
+; CHECK-DAG: ld.param.b8 %rs{{[0-9]+}}, [retval0+11];
+; CHECK-DAG: ld.param.b8 %rs{{[0-9]+}}, [retval0+12];
+; CHECK-DAG: ld.param.b8 %rs{{[0-9]+}}, [retval0+13];
+; CHECK-DAG: ld.param.b8 %rs{{[0-9]+}}, [retval0+14];
+; CHECK-DAG: ld.param.b8 %rs{{[0-9]+}}, [retval0+15];
+; CHECK-DAG: ld.param.b8 %rs{{[0-9]+}}, [retval0+16];
+; CHECK-DAG: ld.param.b8 %rs{{[0-9]+}}, [retval0+17];
+; CHECK-DAG: ld.param.b8 %rs{{[0-9]+}}, [retval0+18];
+; CHECK-DAG: ld.param.b8 %rs{{[0-9]+}}, [retval0+19];
+; CHECK-DAG: ld.param.b8 %rs{{[0-9]+}}, [retval0+20];
+; CHECK-DAG: ld.param.b8 %rs{{[0-9]+}}, [retval0+21];
+; CHECK-DAG: ld.param.b8 %rs{{[0-9]+}}, [retval0+22];
+; CHECK-DAG: ld.param.b8 %rs{{[0-9]+}}, [retval0+23];
+; CHECK-DAG: ld.param.b8 %rs{{[0-9]+}}, [retval0+24];
+; CHECK: } // callseq
+; CHECK-DAG: st.param.b8 [func_retval0+0],
+; CHECK-DAG: st.param.b8 [func_retval0+1],
+; CHECK-DAG: st.param.b8 [func_retval0+2],
+; CHECK-DAG: st.param.b8 [func_retval0+3],
+; CHECK-DAG: st.param.b8 [func_retval0+4],
+; CHECK-DAG: st.param.b8 [func_retval0+5],
+; CHECK-DAG: st.param.b8 [func_retval0+6],
+; CHECK-DAG: st.param.b8 [func_retval0+7],
; CHECK-DAG: st.param.b8 [func_retval0+8],
-; CHECK-DAG: st.param.b32 [func_retval0+9],
-; CHECK-DAG: st.param.b32 [func_retval0+13],
-; CHECK-DAG: st.param.b64 [func_retval0+17],
+; CHECK-DAG: st.param.b8 [func_retval0+9],
+; CHECK-DAG: st.param.b8 [func_retval0+10],
+; CHECK-DAG: st.param.b8 [func_retval0+11],
+; CHECK-DAG: st.param.b8 [func_retval0+12],
+; CHECK-DAG: st.param.b8 [func_retval0+13],
+; CHECK-DAG: st.param.b8 [func_retval0+14],
+; CHECK-DAG: st.param.b8 [func_retval0+15],
+; CHECK-DAG: st.param.b8 [func_retval0+16],
+; CHECK-DAG: st.param.b8 [func_retval0+17],
+; CHECK-DAG: st.param.b8 [func_retval0+18],
+; CHECK-DAG: st.param.b8 [func_retval0+19],
+; CHECK-DAG: st.param.b8 [func_retval0+20],
+; CHECK-DAG: st.param.b8 [func_retval0+21],
+; CHECK-DAG: st.param.b8 [func_retval0+22],
+; CHECK-DAG: st.param.b8 [func_retval0+23],
+; CHECK-DAG: st.param.b8 [func_retval0+24],
define %s_i8i32x4p @test_s_i1i32x4p(%s_i8i32x4p %a) {
%r = tail call %s_i8i32x4p @test_s_i1i32x4p(%s_i8i32x4p %a);
diff --git a/llvm/test/CodeGen/NVPTX/unaligned-param-load-store.ll b/llvm/test/CodeGen/NVPTX/unaligned-param-load-store.ll
new file mode 100644
index 0000000..40a3e9e
--- /dev/null
+++ b/llvm/test/CodeGen/NVPTX/unaligned-param-load-store.ll
@@ -0,0 +1,385 @@
+; Verifies correctness of load/store of parameters and return values.
+; RUN: llc < %s -march=nvptx64 -mcpu=sm_35 -O0 -verify-machineinstrs | FileCheck -allow-deprecated-dag-overlap %s
+; RUN: %if ptxas %{ llc < %s -march=nvptx64 -mcpu=sm_35 -O0 -verify-machineinstrs | %ptxas-verify %}
+
+%s_i8i16p = type { <{ i16, i8, i16 }>, i64 }
+%s_i8i32p = type { <{ i32, i8, i32 }>, i64 }
+%s_i8i64p = type { <{ i64, i8, i64 }>, i64 }
+%s_i8f16p = type { <{ half, i8, half }>, i64 }
+%s_i8f16x2p = type { <{ <2 x half>, i8, <2 x half> }>, i64 }
+%s_i8f32p = type { <{ float, i8, float }>, i64 }
+%s_i8f64p = type { <{ double, i8, double }>, i64 }
+
+; -- All loads/stores of parameter fields that are only byte-aligned must be
+;    done one byte at a time.
+; -- Notes:
+; -- There are two fields of interest in the packed part of the struct: one
+;    with a proper offset and one without. The former should be loaded or
+;    stored as a whole, and the latter one byte at a time.
+; -- Only loads and stores of these fields are checked in the following
+;    series of tests, to keep the tests concise.
+
+; CHECK: .visible .func (.param .align 8 .b8 func_retval0[16])
+; CHECK-LABEL: test_s_i8i16p(
+; CHECK: .param .align 8 .b8 test_s_i8i16p_param_0[16]
+; CHECK-DAG: ld.param.u16 [[P0:%rs[0-9]+]], [test_s_i8i16p_param_0];
+; CHECK-DAG: ld.param.u8 [[P2_0:%rs[0-9]+]], [test_s_i8i16p_param_0+3];
+; CHECK-DAG: ld.param.u8 [[P2_1:%rs[0-9]+]], [test_s_i8i16p_param_0+4];
+; CHECK-DAG: shl.b16 [[P2_1_shl:%rs[0-9]+]], [[P2_1]], 8;
+; CHECK-DAG: or.b16 [[P2_1_or:%rs[0-9]+]], [[P2_1_shl]], [[P2_0]];
+; CHECK: { // callseq
+; CHECK: .param .align 8 .b8 param0[16];
+; CHECK-DAG: st.param.b16 [param0+0], [[P0]];
+; CHECK-DAG: st.param.b8 [param0+3], [[P2_1_or]];
+; CHECK-DAG: st.param.b8 [param0+4], [[P2_1]];
+; CHECK: .param .align 8 .b8 retval0[16];
+; CHECK-NEXT: call.uni (retval0),
+; CHECK-NEXT: test_s_i8i16p,
+; CHECK-NEXT: (
+; CHECK-NEXT: param0
+; CHECK-NEXT: );
+; CHECK-DAG: ld.param.b16 [[R0:%rs[0-9]+]], [retval0+0];
+; CHECK-DAG: ld.param.b8 [[R2_0:%rs[0-9]+]], [retval0+3];
+; CHECK-DAG: ld.param.b8 [[R2_1:%rs[0-9]+]], [retval0+4];
+; CHECK: } // callseq
+; CHECK-DAG: st.param.b16 [func_retval0+0], [[R0]];
+; CHECK-DAG: shl.b16 [[R2_1_shl:%rs[0-9]+]], [[R2_1]], 8;
+; CHECK-DAG: and.b16 [[R2_0_and:%rs[0-9]+]], [[R2_0]], 255;
+; CHECK-DAG: or.b16 [[R2:%rs[0-9]+]], [[R2_0_and]], [[R2_1_shl]];
+; CHECK-DAG: st.param.b8 [func_retval0+3], [[R2]];
+; CHECK-DAG: and.b16 [[R2_1_and:%rs[0-9]+]], [[R2_1]], 255;
+; CHECK-DAG: st.param.b8 [func_retval0+4], [[R2_1_and]];
+; CHECK: ret;
+
+define %s_i8i16p @test_s_i8i16p(%s_i8i16p %a) {
+ %r = tail call %s_i8i16p @test_s_i8i16p(%s_i8i16p %a)
+ ret %s_i8i16p %r
+}
+
+; CHECK: .visible .func (.param .align 8 .b8 func_retval0[24])
+; CHECK-LABEL: test_s_i8i32p(
+; CHECK: .param .align 8 .b8 test_s_i8i32p_param_0[24]
+; CHECK-DAG: ld.param.u32 [[P0:%r[0-9]+]], [test_s_i8i32p_param_0];
+; CHECK-DAG: ld.param.u8 [[P2_0:%r[0-9]+]], [test_s_i8i32p_param_0+5];
+; CHECK-DAG: ld.param.u8 [[P2_1:%r[0-9]+]], [test_s_i8i32p_param_0+6];
+; CHECK-DAG: ld.param.u8 [[P2_2:%r[0-9]+]], [test_s_i8i32p_param_0+7];
+; CHECK-DAG: ld.param.u8 [[P2_3:%r[0-9]+]], [test_s_i8i32p_param_0+8];
+; CHECK-DAG: shl.b32 [[P2_1_shl:%r[0-9]+]], [[P2_1]], 8;
+; CHECK-DAG: shl.b32 [[P2_2_shl:%r[0-9]+]], [[P2_2]], 16;
+; CHECK-DAG: shl.b32 [[P2_3_shl:%r[0-9]+]], [[P2_3]], 24;
+; CHECK-DAG: or.b32 [[P2_or:%r[0-9]+]], [[P2_1_shl]], [[P2_0]];
+; CHECK-DAG: or.b32 [[P2_or_1:%r[0-9]+]], [[P2_3_shl]], [[P2_2_shl]];
+; CHECK-DAG: or.b32 [[P2:%r[0-9]+]], [[P2_or_1]], [[P2_or]];
+; CHECK-DAG: shr.u32 [[P2_1_shr:%r[0-9]+]], [[P2]], 8;
+; CHECK-DAG: shr.u32 [[P2_2_shr:%r[0-9]+]], [[P2_or_1]], 16;
+; CHECK: { // callseq
+; CHECK-DAG: .param .align 8 .b8 param0[24];
+; CHECK-DAG: st.param.b32 [param0+0], [[P0]];
+; CHECK-DAG: st.param.b8 [param0+5], [[P2]];
+; CHECK-DAG: st.param.b8 [param0+6], [[P2_1_shr]];
+; CHECK-DAG: st.param.b8 [param0+7], [[P2_2_shr]];
+; CHECK-DAG: st.param.b8 [param0+8], [[P2_3]];
+; CHECK: .param .align 8 .b8 retval0[24];
+; CHECK-NEXT: call.uni (retval0),
+; CHECK-NEXT: test_s_i8i32p,
+; CHECK-NEXT: (
+; CHECK-NEXT: param0
+; CHECK-NEXT: );
+; CHECK-DAG: ld.param.b32 [[R0:%r[0-9]+]], [retval0+0];
+; CHECK-DAG: ld.param.b8 [[R2_0:%rs[0-9]+]], [retval0+5];
+; CHECK-DAG: ld.param.b8 [[R2_1:%rs[0-9]+]], [retval0+6];
+; CHECK-DAG: ld.param.b8 [[R2_2:%rs[0-9]+]], [retval0+7];
+; CHECK-DAG: ld.param.b8 [[R2_3:%rs[0-9]+]], [retval0+8];
+; CHECK: } // callseq
+; CHECK-DAG: st.param.b32 [func_retval0+0], [[R0]];
+; CHECK-DAG: st.param.b8 [func_retval0+5],
+; CHECK-DAG: st.param.b8 [func_retval0+6],
+; CHECK-DAG: st.param.b8 [func_retval0+7],
+; CHECK-DAG: st.param.b8 [func_retval0+8],
+; CHECK: ret;
+
+define %s_i8i32p @test_s_i8i32p(%s_i8i32p %a) {
+ %r = tail call %s_i8i32p @test_s_i8i32p(%s_i8i32p %a)
+ ret %s_i8i32p %r
+}
+
+; CHECK: .visible .func (.param .align 8 .b8 func_retval0[32])
+; CHECK-LABEL: test_s_i8i64p(
+; CHECK: .param .align 8 .b8 test_s_i8i64p_param_0[32]
+; CHECK-DAG: ld.param.u64 [[P0:%rd[0-9]+]], [test_s_i8i64p_param_0];
+; CHECK-DAG: ld.param.u8 [[P2_0:%rd[0-9]+]], [test_s_i8i64p_param_0+9];
+; CHECK-DAG: ld.param.u8 [[P2_1:%rd[0-9]+]], [test_s_i8i64p_param_0+10];
+; CHECK-DAG: ld.param.u8 [[P2_2:%rd[0-9]+]], [test_s_i8i64p_param_0+11];
+; CHECK-DAG: ld.param.u8 [[P2_3:%rd[0-9]+]], [test_s_i8i64p_param_0+12];
+; CHECK-DAG: ld.param.u8 [[P2_4:%rd[0-9]+]], [test_s_i8i64p_param_0+13];
+; CHECK-DAG: ld.param.u8 [[P2_5:%rd[0-9]+]], [test_s_i8i64p_param_0+14];
+; CHECK-DAG: ld.param.u8 [[P2_6:%rd[0-9]+]], [test_s_i8i64p_param_0+15];
+; CHECK-DAG: ld.param.u8 [[P2_7:%rd[0-9]+]], [test_s_i8i64p_param_0+16];
+; CHECK-DAG: shl.b64 [[P2_1_shl:%rd[0-9]+]], [[P2_1]], 8;
+; CHECK-DAG: shl.b64 [[P2_2_shl:%rd[0-9]+]], [[P2_2]], 16;
+; CHECK-DAG: shl.b64 [[P2_3_shl:%rd[0-9]+]], [[P2_3]], 24;
+; CHECK-DAG: or.b64 [[P2_or_0:%rd[0-9]+]], [[P2_1_shl]], [[P2_0]];
+; CHECK-DAG: or.b64 [[P2_or_1:%rd[0-9]+]], [[P2_3_shl]], [[P2_2_shl]];
+; CHECK-DAG: or.b64 [[P2_or_2:%rd[0-9]+]], [[P2_or_1]], [[P2_or_0]];
+; CHECK-DAG: shl.b64 [[P2_5_shl:%rd[0-9]+]], [[P2_5]], 8;
+; CHECK-DAG: shl.b64 [[P2_6_shl:%rd[0-9]+]], [[P2_6]], 16;
+; CHECK-DAG: shl.b64 [[P2_7_shl:%rd[0-9]+]], [[P2_7]], 24;
+; CHECK-DAG: or.b64 [[P2_or_3:%rd[0-9]+]], [[P2_5_shl]], [[P2_4]];
+; CHECK-DAG: or.b64 [[P2_or_4:%rd[0-9]+]], [[P2_7_shl]], [[P2_6_shl]];
+; CHECK-DAG: or.b64 [[P2_or_5:%rd[0-9]+]], [[P2_or_4]], [[P2_or_3]];
+; CHECK-DAG: shl.b64 [[P2_or_shl:%rd[0-9]+]], [[P2_or_5]], 32;
+; CHECK-DAG: or.b64 [[P2:%rd[0-9]+]], [[P2_or_shl]], [[P2_or_2]];
+; CHECK-DAG: shr.u64 [[P2_shr_1:%rd[0-9]+]], [[P2]], 8;
+; CHECK-DAG: shr.u64 [[P2_shr_2:%rd[0-9]+]], [[P2]], 16;
+; CHECK-DAG: shr.u64 [[P2_shr_3:%rd[0-9]+]], [[P2]], 24;
+; CHECK-DAG: bfe.u64 [[P2_bfe_4:%rd[0-9]+]], [[P2_or_5]], 8, 24;
+; CHECK-DAG: bfe.u64 [[P2_bfe_5:%rd[0-9]+]], [[P2_or_5]], 16, 16;
+; CHECK-DAG: bfe.u64 [[P2_bfe_6:%rd[0-9]+]], [[P2_or_5]], 24, 8;
+; CHECK: { // callseq
+; CHECK: .param .align 8 .b8 param0[32];
+; CHECK-DAG: st.param.b64 [param0+0], [[P0]];
+; CHECK-DAG: st.param.b8 [param0+9], [[P2]];
+; CHECK-DAG: st.param.b8 [param0+10], [[P2_shr_1]];
+; CHECK-DAG: st.param.b8 [param0+11], [[P2_shr_2]];
+; CHECK-DAG: st.param.b8 [param0+12], [[P2_shr_3]];
+; CHECK-DAG: st.param.b8 [param0+13], [[P2_or_5]];
+; CHECK-DAG: st.param.b8 [param0+14], [[P2_bfe_4]];
+; CHECK-DAG: st.param.b8 [param0+15], [[P2_bfe_5]];
+; CHECK-DAG: st.param.b8 [param0+16], [[P2_bfe_6]];
+; CHECK: .param .align 8 .b8 retval0[32];
+; CHECK-NEXT: call.uni (retval0),
+; CHECK-NEXT: test_s_i8i64p,
+; CHECK-NEXT: (
+; CHECK-NEXT: param0
+; CHECK-NEXT: );
+; CHECK-DAG: ld.param.b64 [[R0:%rd[0-9]+]], [retval0+0];
+; CHECK-DAG: ld.param.b8 [[R2_0:%rs[0-9]+]], [retval0+9];
+; CHECK-DAG: ld.param.b8 [[R2_1:%rs[0-9]+]], [retval0+10];
+; CHECK-DAG: ld.param.b8 [[R2_2:%rs[0-9]+]], [retval0+11];
+; CHECK-DAG: ld.param.b8 [[R2_3:%rs[0-9]+]], [retval0+12];
+; CHECK-DAG: ld.param.b8 [[R2_4:%rs[0-9]+]], [retval0+13];
+; CHECK-DAG: ld.param.b8 [[R2_5:%rs[0-9]+]], [retval0+14];
+; CHECK-DAG: ld.param.b8 [[R2_6:%rs[0-9]+]], [retval0+15];
+; CHECK-DAG: ld.param.b8 [[R2_7:%rs[0-9]+]], [retval0+16];
+; CHECK: } // callseq
+; CHECK-DAG: st.param.b64 [func_retval0+0], [[R0]];
+; CHECK-DAG: st.param.b8 [func_retval0+9],
+; CHECK-DAG: st.param.b8 [func_retval0+10],
+; CHECK-DAG: st.param.b8 [func_retval0+11],
+; CHECK-DAG: st.param.b8 [func_retval0+12],
+; CHECK-DAG: st.param.b8 [func_retval0+13],
+; CHECK-DAG: st.param.b8 [func_retval0+14],
+; CHECK-DAG: st.param.b8 [func_retval0+15],
+; CHECK-DAG: st.param.b8 [func_retval0+16],
+; CHECK: ret;
+
+define %s_i8i64p @test_s_i8i64p(%s_i8i64p %a) {
+ %r = tail call %s_i8i64p @test_s_i8i64p(%s_i8i64p %a)
+ ret %s_i8i64p %r
+}
+
+; CHECK: .visible .func (.param .align 8 .b8 func_retval0[16])
+; CHECK-LABEL: test_s_i8f16p(
+; CHECK: .param .align 8 .b8 test_s_i8f16p_param_0[16]
+; CHECK-DAG: ld.param.b16 [[P0:%rs[0-9]+]], [test_s_i8f16p_param_0];
+; CHECK-DAG: ld.param.u8 [[P2_0:%rs[0-9]+]], [test_s_i8f16p_param_0+3];
+; CHECK-DAG: ld.param.u8 [[P2_1:%rs[0-9]+]], [test_s_i8f16p_param_0+4];
+; CHECK-DAG: shl.b16 [[P2_1_shl:%rs[0-9]+]], [[P2_1]], 8;
+; CHECK-DAG: or.b16 [[P2_1_or:%rs[0-9]+]], [[P2_1_shl]], [[P2_0]];
+; CHECK: { // callseq
+; CHECK: .param .align 8 .b8 param0[16];
+; CHECK-DAG: st.param.b16 [param0+0], [[P0]];
+; CHECK-DAG: st.param.b8 [param0+3], [[P2_1_or]];
+; CHECK-DAG: st.param.b8 [param0+4], [[P2_1]];
+; CHECK: .param .align 8 .b8 retval0[16];
+; CHECK-NEXT: call.uni (retval0),
+; CHECK-NEXT: test_s_i8f16p,
+; CHECK-NEXT: (
+; CHECK-NEXT: param0
+; CHECK-NEXT: );
+; CHECK-DAG: ld.param.b16 [[R0:%rs[0-9]+]], [retval0+0];
+; CHECK-DAG: ld.param.b8 [[R2I_0:%rs[0-9]+]], [retval0+3];
+; CHECK-DAG: ld.param.b8 [[R2I_1:%rs[0-9]+]], [retval0+4];
+; CHECK: } // callseq
+; CHECK-DAG: st.param.b16 [func_retval0+0], [[R0]];
+; CHECK-DAG: shl.b16 [[R2I_1_shl:%rs[0-9]+]], [[R2I_1]], 8;
+; CHECK-DAG: and.b16 [[R2I_0_and:%rs[0-9]+]], [[R2I_0]], 255;
+; CHECK-DAG: or.b16 [[R2I:%rs[0-9]+]], [[R2I_0_and]], [[R2I_1_shl]];
+; CHECK-DAG: st.param.b8 [func_retval0+3], [[R2I]];
+; CHECK-DAG: and.b16 [[R2I_1_and:%rs[0-9]+]], [[R2I_1]], 255;
+; CHECK-DAG: st.param.b8 [func_retval0+4], [[R2I_1_and]];
+; CHECK: ret;
+
+define %s_i8f16p @test_s_i8f16p(%s_i8f16p %a) {
+ %r = tail call %s_i8f16p @test_s_i8f16p(%s_i8f16p %a)
+ ret %s_i8f16p %r
+}
+
+; CHECK: .visible .func (.param .align 8 .b8 func_retval0[24])
+; CHECK-LABEL: test_s_i8f16x2p(
+; CHECK: .param .align 8 .b8 test_s_i8f16x2p_param_0[24]
+; CHECK-DAG: ld.param.b32 [[P0:%r[0-9]+]], [test_s_i8f16x2p_param_0];
+; CHECK-DAG: ld.param.u8 [[P2_0:%r[0-9]+]], [test_s_i8f16x2p_param_0+5];
+; CHECK-DAG: ld.param.u8 [[P2_1:%r[0-9]+]], [test_s_i8f16x2p_param_0+6];
+; CHECK-DAG: ld.param.u8 [[P2_2:%r[0-9]+]], [test_s_i8f16x2p_param_0+7];
+; CHECK-DAG: ld.param.u8 [[P2_3:%r[0-9]+]], [test_s_i8f16x2p_param_0+8];
+; CHECK-DAG: shl.b32 [[P2_1_shl:%r[0-9]+]], [[P2_1]], 8;
+; CHECK-DAG: shl.b32 [[P2_2_shl:%r[0-9]+]], [[P2_2]], 16;
+; CHECK-DAG: shl.b32 [[P2_3_shl:%r[0-9]+]], [[P2_3]], 24;
+; CHECK-DAG: or.b32 [[P2_or:%r[0-9]+]], [[P2_1_shl]], [[P2_0]];
+; CHECK-DAG: or.b32 [[P2_or_1:%r[0-9]+]], [[P2_3_shl]], [[P2_2_shl]];
+; CHECK-DAG: or.b32 [[P2:%r[0-9]+]], [[P2_or_1]], [[P2_or]];
+; CHECK-DAG: shr.u32 [[P2_1_shr:%r[0-9]+]], [[P2]], 8;
+; CHECK-DAG: shr.u32 [[P2_2_shr:%r[0-9]+]], [[P2_or_1]], 16;
+; CHECK: { // callseq
+; CHECK-DAG: .param .align 8 .b8 param0[24];
+; CHECK-DAG: st.param.b32 [param0+0], [[P0]];
+; CHECK-DAG: st.param.b8 [param0+5], [[P2]];
+; CHECK-DAG: st.param.b8 [param0+6], [[P2_1_shr]];
+; CHECK-DAG: st.param.b8 [param0+7], [[P2_2_shr]];
+; CHECK-DAG: st.param.b8 [param0+8], [[P2_3]];
+; CHECK: .param .align 8 .b8 retval0[24];
+; CHECK-NEXT: call.uni (retval0),
+; CHECK-NEXT: test_s_i8f16x2p,
+; CHECK-NEXT: (
+; CHECK-NEXT: param0
+; CHECK-NEXT: );
+; CHECK-DAG: ld.param.b32 [[R0:%r[0-9]+]], [retval0+0];
+; CHECK-DAG: ld.param.b8 [[R2_0:%rs[0-9]+]], [retval0+5];
+; CHECK-DAG: ld.param.b8 [[R2_1:%rs[0-9]+]], [retval0+6];
+; CHECK-DAG: ld.param.b8 [[R2_2:%rs[0-9]+]], [retval0+7];
+; CHECK-DAG: ld.param.b8 [[R2_3:%rs[0-9]+]], [retval0+8];
+; CHECK: } // callseq
+; CHECK-DAG: st.param.b32 [func_retval0+0], [[R0]];
+; CHECK-DAG: st.param.b8 [func_retval0+5],
+; CHECK-DAG: st.param.b8 [func_retval0+6],
+; CHECK-DAG: st.param.b8 [func_retval0+7],
+; CHECK-DAG: st.param.b8 [func_retval0+8],
+; CHECK: ret;
+
+define %s_i8f16x2p @test_s_i8f16x2p(%s_i8f16x2p %a) {
+ %r = tail call %s_i8f16x2p @test_s_i8f16x2p(%s_i8f16x2p %a)
+ ret %s_i8f16x2p %r
+}
+
+; CHECK: .visible .func (.param .align 8 .b8 func_retval0[24])
+; CHECK-LABEL: test_s_i8f32p(
+; CHECK: .param .align 8 .b8 test_s_i8f32p_param_0[24]
+; CHECK-DAG: ld.param.f32 [[P0:%f[0-9]+]], [test_s_i8f32p_param_0];
+; CHECK-DAG: ld.param.u8 [[P2_0:%r[0-9]+]], [test_s_i8f32p_param_0+5];
+; CHECK-DAG: ld.param.u8 [[P2_1:%r[0-9]+]], [test_s_i8f32p_param_0+6];
+; CHECK-DAG: ld.param.u8 [[P2_2:%r[0-9]+]], [test_s_i8f32p_param_0+7];
+; CHECK-DAG: ld.param.u8 [[P2_3:%r[0-9]+]], [test_s_i8f32p_param_0+8];
+; CHECK-DAG: shl.b32 [[P2_1_shl:%r[0-9]+]], [[P2_1]], 8;
+; CHECK-DAG: shl.b32 [[P2_2_shl:%r[0-9]+]], [[P2_2]], 16;
+; CHECK-DAG: shl.b32 [[P2_3_shl:%r[0-9]+]], [[P2_3]], 24;
+; CHECK-DAG: or.b32 [[P2_or:%r[0-9]+]], [[P2_1_shl]], [[P2_0]];
+; CHECK-DAG: or.b32 [[P2_or_1:%r[0-9]+]], [[P2_3_shl]], [[P2_2_shl]];
+; CHECK-DAG: or.b32 [[P2:%r[0-9]+]], [[P2_or_1]], [[P2_or]];
+; CHECK-DAG: shr.u32 [[P2_1_shr:%r[0-9]+]], [[P2]], 8;
+; CHECK-DAG: shr.u32 [[P2_2_shr:%r[0-9]+]], [[P2_or_1]], 16;
+; CHECK: { // callseq
+; CHECK-DAG: .param .align 8 .b8 param0[24];
+; CHECK-DAG: st.param.f32 [param0+0], [[P0]];
+; CHECK-DAG: st.param.b8 [param0+5], [[P2]];
+; CHECK-DAG: st.param.b8 [param0+6], [[P2_1_shr]];
+; CHECK-DAG: st.param.b8 [param0+7], [[P2_2_shr]];
+; CHECK-DAG: st.param.b8 [param0+8], [[P2_3]];
+; CHECK: .param .align 8 .b8 retval0[24];
+; CHECK-NEXT: call.uni (retval0),
+; CHECK-NEXT: test_s_i8f32p,
+; CHECK-NEXT: (
+; CHECK-NEXT: param0
+; CHECK-NEXT: );
+; CHECK-DAG: ld.param.f32 [[R0:%f[0-9]+]], [retval0+0];
+; CHECK-DAG: ld.param.b8 [[R2_0:%rs[0-9]+]], [retval0+5];
+; CHECK-DAG: ld.param.b8 [[R2_1:%rs[0-9]+]], [retval0+6];
+; CHECK-DAG: ld.param.b8 [[R2_2:%rs[0-9]+]], [retval0+7];
+; CHECK-DAG: ld.param.b8 [[R2_3:%rs[0-9]+]], [retval0+8];
+; CHECK: } // callseq
+; CHECK-DAG: st.param.f32 [func_retval0+0], [[R0]];
+; CHECK-DAG: st.param.b8 [func_retval0+5],
+; CHECK-DAG: st.param.b8 [func_retval0+6],
+; CHECK-DAG: st.param.b8 [func_retval0+7],
+; CHECK-DAG: st.param.b8 [func_retval0+8],
+; CHECK: ret;
+
+define %s_i8f32p @test_s_i8f32p(%s_i8f32p %a) {
+ %r = tail call %s_i8f32p @test_s_i8f32p(%s_i8f32p %a)
+ ret %s_i8f32p %r
+}
+
+; CHECK: .visible .func (.param .align 8 .b8 func_retval0[32])
+; CHECK-LABEL: test_s_i8f64p(
+; CHECK: .param .align 8 .b8 test_s_i8f64p_param_0[32]
+; CHECK-DAG: ld.param.f64 [[P0:%fd[0-9]+]], [test_s_i8f64p_param_0];
+; CHECK-DAG: ld.param.u8 [[P2_0:%rd[0-9]+]], [test_s_i8f64p_param_0+9];
+; CHECK-DAG: ld.param.u8 [[P2_1:%rd[0-9]+]], [test_s_i8f64p_param_0+10];
+; CHECK-DAG: ld.param.u8 [[P2_2:%rd[0-9]+]], [test_s_i8f64p_param_0+11];
+; CHECK-DAG: ld.param.u8 [[P2_3:%rd[0-9]+]], [test_s_i8f64p_param_0+12];
+; CHECK-DAG: ld.param.u8 [[P2_4:%rd[0-9]+]], [test_s_i8f64p_param_0+13];
+; CHECK-DAG: ld.param.u8 [[P2_5:%rd[0-9]+]], [test_s_i8f64p_param_0+14];
+; CHECK-DAG: ld.param.u8 [[P2_6:%rd[0-9]+]], [test_s_i8f64p_param_0+15];
+; CHECK-DAG: ld.param.u8 [[P2_7:%rd[0-9]+]], [test_s_i8f64p_param_0+16];
+; CHECK-DAG: shl.b64 [[P2_1_shl:%rd[0-9]+]], [[P2_1]], 8;
+; CHECK-DAG: shl.b64 [[P2_2_shl:%rd[0-9]+]], [[P2_2]], 16;
+; CHECK-DAG: shl.b64 [[P2_3_shl:%rd[0-9]+]], [[P2_3]], 24;
+; CHECK-DAG: or.b64 [[P2_or_0:%rd[0-9]+]], [[P2_1_shl]], [[P2_0]];
+; CHECK-DAG: or.b64 [[P2_or_1:%rd[0-9]+]], [[P2_3_shl]], [[P2_2_shl]];
+; CHECK-DAG: or.b64 [[P2_or_2:%rd[0-9]+]], [[P2_or_1]], [[P2_or_0]];
+; CHECK-DAG: shl.b64 [[P2_5_shl:%rd[0-9]+]], [[P2_5]], 8;
+; CHECK-DAG: shl.b64 [[P2_6_shl:%rd[0-9]+]], [[P2_6]], 16;
+; CHECK-DAG: shl.b64 [[P2_7_shl:%rd[0-9]+]], [[P2_7]], 24;
+; CHECK-DAG: or.b64 [[P2_or_3:%rd[0-9]+]], [[P2_5_shl]], [[P2_4]];
+; CHECK-DAG: or.b64 [[P2_or_4:%rd[0-9]+]], [[P2_7_shl]], [[P2_6_shl]];
+; CHECK-DAG: or.b64 [[P2_or_5:%rd[0-9]+]], [[P2_or_4]], [[P2_or_3]];
+; CHECK-DAG: shl.b64 [[P2_or_shl:%rd[0-9]+]], [[P2_or_5]], 32;
+; CHECK-DAG: or.b64 [[P2:%rd[0-9]+]], [[P2_or_shl]], [[P2_or_2]];
+; CHECK-DAG: shr.u64 [[P2_shr_1:%rd[0-9]+]], [[P2]], 8;
+; CHECK-DAG: shr.u64 [[P2_shr_2:%rd[0-9]+]], [[P2]], 16;
+; CHECK-DAG: shr.u64 [[P2_shr_3:%rd[0-9]+]], [[P2]], 24;
+; CHECK-DAG: bfe.u64 [[P2_bfe_4:%rd[0-9]+]], [[P2_or_5]], 8, 24;
+; CHECK-DAG: bfe.u64 [[P2_bfe_5:%rd[0-9]+]], [[P2_or_5]], 16, 16;
+; CHECK-DAG: bfe.u64 [[P2_bfe_6:%rd[0-9]+]], [[P2_or_5]], 24, 8;
+; CHECK: { // callseq
+; CHECK: .param .align 8 .b8 param0[32];
+; CHECK-DAG: st.param.f64 [param0+0], [[P0]];
+; CHECK-DAG: st.param.b8 [param0+9], [[P2]];
+; CHECK-DAG: st.param.b8 [param0+10], [[P2_shr_1]];
+; CHECK-DAG: st.param.b8 [param0+11], [[P2_shr_2]];
+; CHECK-DAG: st.param.b8 [param0+12], [[P2_shr_3]];
+; CHECK-DAG: st.param.b8 [param0+13], [[P2_or_5]];
+; CHECK-DAG: st.param.b8 [param0+14], [[P2_bfe_4]];
+; CHECK-DAG: st.param.b8 [param0+15], [[P2_bfe_5]];
+; CHECK-DAG: st.param.b8 [param0+16], [[P2_bfe_6]];
+; CHECK: .param .align 8 .b8 retval0[32];
+; CHECK-NEXT: call.uni (retval0),
+; CHECK-NEXT: test_s_i8f64p,
+; CHECK-NEXT: (
+; CHECK-NEXT: param0
+; CHECK-NEXT: );
+; CHECK-DAG: ld.param.f64 [[R0:%fd[0-9]+]], [retval0+0];
+; CHECK-DAG: ld.param.b8 [[R2_0:%rs[0-9]+]], [retval0+9];
+; CHECK-DAG: ld.param.b8 [[R2_1:%rs[0-9]+]], [retval0+10];
+; CHECK-DAG: ld.param.b8 [[R2_2:%rs[0-9]+]], [retval0+11];
+; CHECK-DAG: ld.param.b8 [[R2_3:%rs[0-9]+]], [retval0+12];
+; CHECK-DAG: ld.param.b8 [[R2_4:%rs[0-9]+]], [retval0+13];
+; CHECK-DAG: ld.param.b8 [[R2_5:%rs[0-9]+]], [retval0+14];
+; CHECK-DAG: ld.param.b8 [[R2_6:%rs[0-9]+]], [retval0+15];
+; CHECK-DAG: ld.param.b8 [[R2_7:%rs[0-9]+]], [retval0+16];
+; CHECK: } // callseq
+; CHECK-DAG: st.param.f64 [func_retval0+0], [[R0]];
+; CHECK-DAG: st.param.b8 [func_retval0+9],
+; CHECK-DAG: st.param.b8 [func_retval0+10],
+; CHECK-DAG: st.param.b8 [func_retval0+11],
+; CHECK-DAG: st.param.b8 [func_retval0+12],
+; CHECK-DAG: st.param.b8 [func_retval0+13],
+; CHECK-DAG: st.param.b8 [func_retval0+14],
+; CHECK-DAG: st.param.b8 [func_retval0+15],
+; CHECK-DAG: st.param.b8 [func_retval0+16],
+; CHECK: ret;
+
+define %s_i8f64p @test_s_i8f64p(%s_i8f64p %a) {
+ %r = tail call %s_i8f64p @test_s_i8f64p(%s_i8f64p %a)
+ ret %s_i8f64p %r
+}
diff --git a/llvm/test/CodeGen/RISCV/fold-addi-loadstore.ll b/llvm/test/CodeGen/RISCV/fold-addi-loadstore.ll
index 91e7399..3c2e846 100644
--- a/llvm/test/CodeGen/RISCV/fold-addi-loadstore.ll
+++ b/llvm/test/CodeGen/RISCV/fold-addi-loadstore.ll
@@ -1,12 +1,12 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=riscv32 -verify-machineinstrs < %s \
-; RUN: -riscv-enable-sink-fold | FileCheck -check-prefix=RV32I %s
+; RUN: | FileCheck -check-prefix=RV32I %s
; RUN: llc -mtriple=riscv32 -verify-machineinstrs -code-model=medium < %s \
-; RUN: -riscv-enable-sink-fold | FileCheck -check-prefix=RV32I-MEDIUM %s
+; RUN: | FileCheck -check-prefix=RV32I-MEDIUM %s
; RUN: llc -mtriple=riscv64 -verify-machineinstrs < %s \
-; RUN: -riscv-enable-sink-fold | FileCheck -check-prefix=RV64I %s
+; RUN: | FileCheck -check-prefix=RV64I %s
; RUN: llc -mtriple=riscv64 -verify-machineinstrs -code-model=medium < %s \
-; RUN: -riscv-enable-sink-fold | FileCheck -check-prefix=RV64I-MEDIUM %s
+; RUN: | FileCheck -check-prefix=RV64I-MEDIUM %s
; We can often fold an ADDI into the offset of load/store instructions:
; (load (addi base, off1), off2) -> (load base, off1+off2)
diff --git a/llvm/test/CodeGen/RISCV/make-compressible.mir b/llvm/test/CodeGen/RISCV/make-compressible.mir
index 2105a13..03da38a 100644
--- a/llvm/test/CodeGen/RISCV/make-compressible.mir
+++ b/llvm/test/CodeGen/RISCV/make-compressible.mir
@@ -1,8 +1,14 @@
# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
# RUN: llc -o - %s -mtriple=riscv32 -mattr=+c,+f,+d -simplify-mir \
-# RUN: -run-pass=riscv-make-compressible | FileCheck --check-prefix=RV32 %s
+# RUN: -run-pass=riscv-make-compressible | FileCheck --check-prefixes=RV32,RV32C %s
# RUN: llc -o - %s -mtriple=riscv64 -mattr=+c,+f,+d -simplify-mir \
-# RUN: -run-pass=riscv-make-compressible | FileCheck --check-prefix=RV64 %s
+# RUN: -run-pass=riscv-make-compressible | FileCheck --check-prefixes=RV64,RV64C %s
+# RUN: llc -o - %s -mtriple=riscv32 -mattr=+d,+zcf -simplify-mir \
+# RUN: -run-pass=riscv-make-compressible | FileCheck --check-prefixes=RV32,RV32ZCF %s
+# RUN: llc -o - %s -mtriple=riscv32 -mattr=+d,+zca -simplify-mir \
+# RUN: -run-pass=riscv-make-compressible | FileCheck --check-prefixes=RV32,RV32ZCA %s
+# RUN: llc -o - %s -mtriple=riscv64 -mattr=+d,+zca -simplify-mir \
+# RUN: -run-pass=riscv-make-compressible | FileCheck --check-prefixes=RV64,RV64ZCA %s
--- |
define void @store_common_value(ptr %a, ptr %b, ptr %c) #0 {
@@ -288,7 +294,7 @@
ret { double, double } %3
}
- attributes #0 = { minsize "target-features"="+c,+f,+d" }
+ attributes #0 = { minsize }
...
---
@@ -306,6 +312,7 @@ body: |
; RV32-NEXT: SW $x13, killed renamable $x11, 0 :: (store (s32) into %ir.b)
; RV32-NEXT: SW $x13, killed renamable $x12, 0 :: (store (s32) into %ir.c)
; RV32-NEXT: PseudoRET
+ ;
; RV64-LABEL: name: store_common_value
; RV64: liveins: $x10, $x11, $x12
; RV64-NEXT: {{ $}}
@@ -327,14 +334,15 @@ body: |
bb.0.entry:
liveins: $x10, $x11, $x12, $f16_f
- ; RV32-LABEL: name: store_common_value_float
- ; RV32: liveins: $x10, $x11, $x12, $f16_f
- ; RV32-NEXT: {{ $}}
- ; RV32-NEXT: $f15_f = FSGNJ_S $f16_f, $f16_f
- ; RV32-NEXT: FSW $f15_f, killed renamable $x10, 0 :: (store (s32) into %ir.a)
- ; RV32-NEXT: FSW $f15_f, killed renamable $x11, 0 :: (store (s32) into %ir.b)
- ; RV32-NEXT: FSW killed $f15_f, killed renamable $x12, 0 :: (store (s32) into %ir.c)
- ; RV32-NEXT: PseudoRET
+ ; RV32C-LABEL: name: store_common_value_float
+ ; RV32C: liveins: $x10, $x11, $x12, $f16_f
+ ; RV32C-NEXT: {{ $}}
+ ; RV32C-NEXT: $f15_f = FSGNJ_S $f16_f, $f16_f
+ ; RV32C-NEXT: FSW $f15_f, killed renamable $x10, 0 :: (store (s32) into %ir.a)
+ ; RV32C-NEXT: FSW $f15_f, killed renamable $x11, 0 :: (store (s32) into %ir.b)
+ ; RV32C-NEXT: FSW killed $f15_f, killed renamable $x12, 0 :: (store (s32) into %ir.c)
+ ; RV32C-NEXT: PseudoRET
+ ;
; RV64-LABEL: name: store_common_value_float
; RV64: liveins: $x10, $x11, $x12, $f16_f
; RV64-NEXT: {{ $}}
@@ -342,6 +350,23 @@ body: |
; RV64-NEXT: FSW renamable $f16_f, killed renamable $x11, 0 :: (store (s32) into %ir.b)
; RV64-NEXT: FSW killed renamable $f16_f, killed renamable $x12, 0 :: (store (s32) into %ir.c)
; RV64-NEXT: PseudoRET
+ ;
+ ; RV32ZCF-LABEL: name: store_common_value_float
+ ; RV32ZCF: liveins: $x10, $x11, $x12, $f16_f
+ ; RV32ZCF-NEXT: {{ $}}
+ ; RV32ZCF-NEXT: $f15_f = FSGNJ_S $f16_f, $f16_f
+ ; RV32ZCF-NEXT: FSW $f15_f, killed renamable $x10, 0 :: (store (s32) into %ir.a)
+ ; RV32ZCF-NEXT: FSW $f15_f, killed renamable $x11, 0 :: (store (s32) into %ir.b)
+ ; RV32ZCF-NEXT: FSW killed $f15_f, killed renamable $x12, 0 :: (store (s32) into %ir.c)
+ ; RV32ZCF-NEXT: PseudoRET
+ ;
+ ; RV32ZCA-LABEL: name: store_common_value_float
+ ; RV32ZCA: liveins: $x10, $x11, $x12, $f16_f
+ ; RV32ZCA-NEXT: {{ $}}
+ ; RV32ZCA-NEXT: FSW renamable $f16_f, killed renamable $x10, 0 :: (store (s32) into %ir.a)
+ ; RV32ZCA-NEXT: FSW renamable $f16_f, killed renamable $x11, 0 :: (store (s32) into %ir.b)
+ ; RV32ZCA-NEXT: FSW killed renamable $f16_f, killed renamable $x12, 0 :: (store (s32) into %ir.c)
+ ; RV32ZCA-NEXT: PseudoRET
FSW renamable $f16_f, killed renamable $x10, 0 :: (store (s32) into %ir.a)
FSW renamable $f16_f, killed renamable $x11, 0 :: (store (s32) into %ir.b)
FSW killed renamable $f16_f, killed renamable $x12, 0 :: (store (s32) into %ir.c)
@@ -355,22 +380,47 @@ body: |
bb.0.entry:
liveins: $x10, $x11, $x12, $f16_d
- ; RV32-LABEL: name: store_common_value_double
- ; RV32: liveins: $x10, $x11, $x12, $f16_d
- ; RV32-NEXT: {{ $}}
- ; RV32-NEXT: $f15_d = FSGNJ_D $f16_d, $f16_d
- ; RV32-NEXT: FSD $f15_d, killed renamable $x10, 0 :: (store (s64) into %ir.a)
- ; RV32-NEXT: FSD $f15_d, killed renamable $x11, 0 :: (store (s64) into %ir.b)
- ; RV32-NEXT: FSD killed $f15_d, killed renamable $x12, 0 :: (store (s64) into %ir.c)
- ; RV32-NEXT: PseudoRET
- ; RV64-LABEL: name: store_common_value_double
- ; RV64: liveins: $x10, $x11, $x12, $f16_d
- ; RV64-NEXT: {{ $}}
- ; RV64-NEXT: $f15_d = FSGNJ_D $f16_d, $f16_d
- ; RV64-NEXT: FSD $f15_d, killed renamable $x10, 0 :: (store (s64) into %ir.a)
- ; RV64-NEXT: FSD $f15_d, killed renamable $x11, 0 :: (store (s64) into %ir.b)
- ; RV64-NEXT: FSD killed $f15_d, killed renamable $x12, 0 :: (store (s64) into %ir.c)
- ; RV64-NEXT: PseudoRET
+ ; RV32C-LABEL: name: store_common_value_double
+ ; RV32C: liveins: $x10, $x11, $x12, $f16_d
+ ; RV32C-NEXT: {{ $}}
+ ; RV32C-NEXT: $f15_d = FSGNJ_D $f16_d, $f16_d
+ ; RV32C-NEXT: FSD $f15_d, killed renamable $x10, 0 :: (store (s64) into %ir.a)
+ ; RV32C-NEXT: FSD $f15_d, killed renamable $x11, 0 :: (store (s64) into %ir.b)
+ ; RV32C-NEXT: FSD killed $f15_d, killed renamable $x12, 0 :: (store (s64) into %ir.c)
+ ; RV32C-NEXT: PseudoRET
+ ;
+ ; RV64C-LABEL: name: store_common_value_double
+ ; RV64C: liveins: $x10, $x11, $x12, $f16_d
+ ; RV64C-NEXT: {{ $}}
+ ; RV64C-NEXT: $f15_d = FSGNJ_D $f16_d, $f16_d
+ ; RV64C-NEXT: FSD $f15_d, killed renamable $x10, 0 :: (store (s64) into %ir.a)
+ ; RV64C-NEXT: FSD $f15_d, killed renamable $x11, 0 :: (store (s64) into %ir.b)
+ ; RV64C-NEXT: FSD killed $f15_d, killed renamable $x12, 0 :: (store (s64) into %ir.c)
+ ; RV64C-NEXT: PseudoRET
+ ;
+ ; RV32ZCF-LABEL: name: store_common_value_double
+ ; RV32ZCF: liveins: $x10, $x11, $x12, $f16_d
+ ; RV32ZCF-NEXT: {{ $}}
+ ; RV32ZCF-NEXT: FSD renamable $f16_d, killed renamable $x10, 0 :: (store (s64) into %ir.a)
+ ; RV32ZCF-NEXT: FSD renamable $f16_d, killed renamable $x11, 0 :: (store (s64) into %ir.b)
+ ; RV32ZCF-NEXT: FSD killed renamable $f16_d, killed renamable $x12, 0 :: (store (s64) into %ir.c)
+ ; RV32ZCF-NEXT: PseudoRET
+ ;
+ ; RV32ZCA-LABEL: name: store_common_value_double
+ ; RV32ZCA: liveins: $x10, $x11, $x12, $f16_d
+ ; RV32ZCA-NEXT: {{ $}}
+ ; RV32ZCA-NEXT: FSD renamable $f16_d, killed renamable $x10, 0 :: (store (s64) into %ir.a)
+ ; RV32ZCA-NEXT: FSD renamable $f16_d, killed renamable $x11, 0 :: (store (s64) into %ir.b)
+ ; RV32ZCA-NEXT: FSD killed renamable $f16_d, killed renamable $x12, 0 :: (store (s64) into %ir.c)
+ ; RV32ZCA-NEXT: PseudoRET
+ ;
+ ; RV64ZCA-LABEL: name: store_common_value_double
+ ; RV64ZCA: liveins: $x10, $x11, $x12, $f16_d
+ ; RV64ZCA-NEXT: {{ $}}
+ ; RV64ZCA-NEXT: FSD renamable $f16_d, killed renamable $x10, 0 :: (store (s64) into %ir.a)
+ ; RV64ZCA-NEXT: FSD renamable $f16_d, killed renamable $x11, 0 :: (store (s64) into %ir.b)
+ ; RV64ZCA-NEXT: FSD killed renamable $f16_d, killed renamable $x12, 0 :: (store (s64) into %ir.c)
+ ; RV64ZCA-NEXT: PseudoRET
FSD renamable $f16_d, killed renamable $x10, 0 :: (store (s64) into %ir.a)
FSD renamable $f16_d, killed renamable $x11, 0 :: (store (s64) into %ir.b)
FSD killed renamable $f16_d, killed renamable $x12, 0 :: (store (s64) into %ir.c)
@@ -395,6 +445,7 @@ body: |
; RV32-NEXT: renamable $x10 = ADDI $x0, 5
; RV32-NEXT: SW killed renamable $x10, killed $x11, 0 :: (volatile store (s32) into %ir.p)
; RV32-NEXT: PseudoRET
+ ;
; RV64-LABEL: name: store_common_ptr
; RV64: liveins: $x16
; RV64-NEXT: {{ $}}
@@ -432,6 +483,7 @@ body: |
; RV32-NEXT: SW killed renamable $x10, $x11, 0 :: (volatile store (s32) into %ir.p)
; RV32-NEXT: SW killed $x11, $x11, 0 :: (volatile store (s32) into %ir.q)
; RV32-NEXT: PseudoRET
+ ;
; RV64-LABEL: name: store_common_ptr_self
; RV64: liveins: $x16
; RV64-NEXT: {{ $}}
@@ -457,14 +509,15 @@ body: |
bb.0.entry:
liveins: $x16, $f10_f, $f11_f, $f12_f
- ; RV32-LABEL: name: store_common_ptr_float
- ; RV32: liveins: $x16, $f10_f, $f11_f, $f12_f
- ; RV32-NEXT: {{ $}}
- ; RV32-NEXT: $x10 = ADDI $x16, 0
- ; RV32-NEXT: FSW killed renamable $f10_f, $x10, 0 :: (volatile store (s32) into %ir.p)
- ; RV32-NEXT: FSW killed renamable $f11_f, $x10, 0 :: (volatile store (s32) into %ir.p)
- ; RV32-NEXT: FSW killed renamable $f12_f, killed $x10, 0 :: (volatile store (s32) into %ir.p)
- ; RV32-NEXT: PseudoRET
+ ; RV32C-LABEL: name: store_common_ptr_float
+ ; RV32C: liveins: $x16, $f10_f, $f11_f, $f12_f
+ ; RV32C-NEXT: {{ $}}
+ ; RV32C-NEXT: $x10 = ADDI $x16, 0
+ ; RV32C-NEXT: FSW killed renamable $f10_f, $x10, 0 :: (volatile store (s32) into %ir.p)
+ ; RV32C-NEXT: FSW killed renamable $f11_f, $x10, 0 :: (volatile store (s32) into %ir.p)
+ ; RV32C-NEXT: FSW killed renamable $f12_f, killed $x10, 0 :: (volatile store (s32) into %ir.p)
+ ; RV32C-NEXT: PseudoRET
+ ;
; RV64-LABEL: name: store_common_ptr_float
; RV64: liveins: $x16, $f10_f, $f11_f, $f12_f
; RV64-NEXT: {{ $}}
@@ -472,6 +525,23 @@ body: |
; RV64-NEXT: FSW killed renamable $f11_f, renamable $x16, 0 :: (volatile store (s32) into %ir.p)
; RV64-NEXT: FSW killed renamable $f12_f, killed renamable $x16, 0 :: (volatile store (s32) into %ir.p)
; RV64-NEXT: PseudoRET
+ ;
+ ; RV32ZCF-LABEL: name: store_common_ptr_float
+ ; RV32ZCF: liveins: $x16, $f10_f, $f11_f, $f12_f
+ ; RV32ZCF-NEXT: {{ $}}
+ ; RV32ZCF-NEXT: $x10 = ADDI $x16, 0
+ ; RV32ZCF-NEXT: FSW killed renamable $f10_f, $x10, 0 :: (volatile store (s32) into %ir.p)
+ ; RV32ZCF-NEXT: FSW killed renamable $f11_f, $x10, 0 :: (volatile store (s32) into %ir.p)
+ ; RV32ZCF-NEXT: FSW killed renamable $f12_f, killed $x10, 0 :: (volatile store (s32) into %ir.p)
+ ; RV32ZCF-NEXT: PseudoRET
+ ;
+ ; RV32ZCA-LABEL: name: store_common_ptr_float
+ ; RV32ZCA: liveins: $x16, $f10_f, $f11_f, $f12_f
+ ; RV32ZCA-NEXT: {{ $}}
+ ; RV32ZCA-NEXT: FSW killed renamable $f10_f, renamable $x16, 0 :: (volatile store (s32) into %ir.p)
+ ; RV32ZCA-NEXT: FSW killed renamable $f11_f, renamable $x16, 0 :: (volatile store (s32) into %ir.p)
+ ; RV32ZCA-NEXT: FSW killed renamable $f12_f, killed renamable $x16, 0 :: (volatile store (s32) into %ir.p)
+ ; RV32ZCA-NEXT: PseudoRET
FSW killed renamable $f10_f, renamable $x16, 0 :: (volatile store (s32) into %ir.p)
FSW killed renamable $f11_f, renamable $x16, 0 :: (volatile store (s32) into %ir.p)
FSW killed renamable $f12_f, killed renamable $x16, 0 :: (volatile store (s32) into %ir.p)
@@ -485,22 +555,47 @@ body: |
bb.0.entry:
liveins: $x16, $f10_d, $f11_d, $f12_d
- ; RV32-LABEL: name: store_common_ptr_double
- ; RV32: liveins: $x16, $f10_d, $f11_d, $f12_d
- ; RV32-NEXT: {{ $}}
- ; RV32-NEXT: $x10 = ADDI $x16, 0
- ; RV32-NEXT: FSD killed renamable $f10_d, $x10, 0 :: (volatile store (s64) into %ir.p)
- ; RV32-NEXT: FSD killed renamable $f11_d, $x10, 0 :: (volatile store (s64) into %ir.p)
- ; RV32-NEXT: FSD killed renamable $f12_d, killed $x10, 0 :: (volatile store (s64) into %ir.p)
- ; RV32-NEXT: PseudoRET
- ; RV64-LABEL: name: store_common_ptr_double
- ; RV64: liveins: $x16, $f10_d, $f11_d, $f12_d
- ; RV64-NEXT: {{ $}}
- ; RV64-NEXT: $x10 = ADDI $x16, 0
- ; RV64-NEXT: FSD killed renamable $f10_d, $x10, 0 :: (volatile store (s64) into %ir.p)
- ; RV64-NEXT: FSD killed renamable $f11_d, $x10, 0 :: (volatile store (s64) into %ir.p)
- ; RV64-NEXT: FSD killed renamable $f12_d, killed $x10, 0 :: (volatile store (s64) into %ir.p)
- ; RV64-NEXT: PseudoRET
+ ; RV32C-LABEL: name: store_common_ptr_double
+ ; RV32C: liveins: $x16, $f10_d, $f11_d, $f12_d
+ ; RV32C-NEXT: {{ $}}
+ ; RV32C-NEXT: $x10 = ADDI $x16, 0
+ ; RV32C-NEXT: FSD killed renamable $f10_d, $x10, 0 :: (volatile store (s64) into %ir.p)
+ ; RV32C-NEXT: FSD killed renamable $f11_d, $x10, 0 :: (volatile store (s64) into %ir.p)
+ ; RV32C-NEXT: FSD killed renamable $f12_d, killed $x10, 0 :: (volatile store (s64) into %ir.p)
+ ; RV32C-NEXT: PseudoRET
+ ;
+ ; RV64C-LABEL: name: store_common_ptr_double
+ ; RV64C: liveins: $x16, $f10_d, $f11_d, $f12_d
+ ; RV64C-NEXT: {{ $}}
+ ; RV64C-NEXT: $x10 = ADDI $x16, 0
+ ; RV64C-NEXT: FSD killed renamable $f10_d, $x10, 0 :: (volatile store (s64) into %ir.p)
+ ; RV64C-NEXT: FSD killed renamable $f11_d, $x10, 0 :: (volatile store (s64) into %ir.p)
+ ; RV64C-NEXT: FSD killed renamable $f12_d, killed $x10, 0 :: (volatile store (s64) into %ir.p)
+ ; RV64C-NEXT: PseudoRET
+ ;
+ ; RV32ZCF-LABEL: name: store_common_ptr_double
+ ; RV32ZCF: liveins: $x16, $f10_d, $f11_d, $f12_d
+ ; RV32ZCF-NEXT: {{ $}}
+ ; RV32ZCF-NEXT: FSD killed renamable $f10_d, renamable $x16, 0 :: (volatile store (s64) into %ir.p)
+ ; RV32ZCF-NEXT: FSD killed renamable $f11_d, renamable $x16, 0 :: (volatile store (s64) into %ir.p)
+ ; RV32ZCF-NEXT: FSD killed renamable $f12_d, killed renamable $x16, 0 :: (volatile store (s64) into %ir.p)
+ ; RV32ZCF-NEXT: PseudoRET
+ ;
+ ; RV32ZCA-LABEL: name: store_common_ptr_double
+ ; RV32ZCA: liveins: $x16, $f10_d, $f11_d, $f12_d
+ ; RV32ZCA-NEXT: {{ $}}
+ ; RV32ZCA-NEXT: FSD killed renamable $f10_d, renamable $x16, 0 :: (volatile store (s64) into %ir.p)
+ ; RV32ZCA-NEXT: FSD killed renamable $f11_d, renamable $x16, 0 :: (volatile store (s64) into %ir.p)
+ ; RV32ZCA-NEXT: FSD killed renamable $f12_d, killed renamable $x16, 0 :: (volatile store (s64) into %ir.p)
+ ; RV32ZCA-NEXT: PseudoRET
+ ;
+ ; RV64ZCA-LABEL: name: store_common_ptr_double
+ ; RV64ZCA: liveins: $x16, $f10_d, $f11_d, $f12_d
+ ; RV64ZCA-NEXT: {{ $}}
+ ; RV64ZCA-NEXT: FSD killed renamable $f10_d, renamable $x16, 0 :: (volatile store (s64) into %ir.p)
+ ; RV64ZCA-NEXT: FSD killed renamable $f11_d, renamable $x16, 0 :: (volatile store (s64) into %ir.p)
+ ; RV64ZCA-NEXT: FSD killed renamable $f12_d, killed renamable $x16, 0 :: (volatile store (s64) into %ir.p)
+ ; RV64ZCA-NEXT: PseudoRET
FSD killed renamable $f10_d, renamable $x16, 0 :: (volatile store (s64) into %ir.p)
FSD killed renamable $f11_d, renamable $x16, 0 :: (volatile store (s64) into %ir.p)
FSD killed renamable $f12_d, killed renamable $x16, 0 :: (volatile store (s64) into %ir.p)
@@ -522,6 +617,7 @@ body: |
; RV32-NEXT: dead renamable $x10 = LW $x11, 0 :: (volatile load (s32) from %ir.p)
; RV32-NEXT: dead renamable $x10 = LW killed $x11, 0 :: (volatile load (s32) from %ir.p)
; RV32-NEXT: PseudoRET
+ ;
; RV64-LABEL: name: load_common_ptr
; RV64: liveins: $x16
; RV64-NEXT: {{ $}}
@@ -543,14 +639,15 @@ body: |
bb.0.entry:
liveins: $x16
- ; RV32-LABEL: name: load_common_ptr_float
- ; RV32: liveins: $x16
- ; RV32-NEXT: {{ $}}
- ; RV32-NEXT: $x10 = ADDI $x16, 0
- ; RV32-NEXT: renamable $f10_f = FLW $x10, 0 :: (load (s32) from %ir.g)
- ; RV32-NEXT: renamable $f11_f = FLW $x10, 4 :: (load (s32) from %ir.arrayidx1)
- ; RV32-NEXT: renamable $f12_f = FLW killed $x10, 8 :: (load (s32) from %ir.arrayidx2)
- ; RV32-NEXT: PseudoTAIL target-flags(riscv-call) @load_common_ptr_float_1, implicit $x2, implicit $f10_f, implicit $f11_f, implicit $f12_f
+ ; RV32C-LABEL: name: load_common_ptr_float
+ ; RV32C: liveins: $x16
+ ; RV32C-NEXT: {{ $}}
+ ; RV32C-NEXT: $x10 = ADDI $x16, 0
+ ; RV32C-NEXT: renamable $f10_f = FLW $x10, 0 :: (load (s32) from %ir.g)
+ ; RV32C-NEXT: renamable $f11_f = FLW $x10, 4 :: (load (s32) from %ir.arrayidx1)
+ ; RV32C-NEXT: renamable $f12_f = FLW killed $x10, 8 :: (load (s32) from %ir.arrayidx2)
+ ; RV32C-NEXT: PseudoTAIL target-flags(riscv-call) @load_common_ptr_float_1, implicit $x2, implicit $f10_f, implicit $f11_f, implicit $f12_f
+ ;
; RV64-LABEL: name: load_common_ptr_float
; RV64: liveins: $x16
; RV64-NEXT: {{ $}}
@@ -558,6 +655,23 @@ body: |
; RV64-NEXT: renamable $f11_f = FLW renamable $x16, 4 :: (load (s32) from %ir.arrayidx1)
; RV64-NEXT: renamable $f12_f = FLW killed renamable $x16, 8 :: (load (s32) from %ir.arrayidx2)
; RV64-NEXT: PseudoTAIL target-flags(riscv-call) @load_common_ptr_float_1, implicit $x2, implicit $f10_f, implicit $f11_f, implicit $f12_f
+ ;
+ ; RV32ZCF-LABEL: name: load_common_ptr_float
+ ; RV32ZCF: liveins: $x16
+ ; RV32ZCF-NEXT: {{ $}}
+ ; RV32ZCF-NEXT: $x10 = ADDI $x16, 0
+ ; RV32ZCF-NEXT: renamable $f10_f = FLW $x10, 0 :: (load (s32) from %ir.g)
+ ; RV32ZCF-NEXT: renamable $f11_f = FLW $x10, 4 :: (load (s32) from %ir.arrayidx1)
+ ; RV32ZCF-NEXT: renamable $f12_f = FLW killed $x10, 8 :: (load (s32) from %ir.arrayidx2)
+ ; RV32ZCF-NEXT: PseudoTAIL target-flags(riscv-call) @load_common_ptr_float_1, implicit $x2, implicit $f10_f, implicit $f11_f, implicit $f12_f
+ ;
+ ; RV32ZCA-LABEL: name: load_common_ptr_float
+ ; RV32ZCA: liveins: $x16
+ ; RV32ZCA-NEXT: {{ $}}
+ ; RV32ZCA-NEXT: renamable $f10_f = FLW renamable $x16, 0 :: (load (s32) from %ir.g)
+ ; RV32ZCA-NEXT: renamable $f11_f = FLW renamable $x16, 4 :: (load (s32) from %ir.arrayidx1)
+ ; RV32ZCA-NEXT: renamable $f12_f = FLW killed renamable $x16, 8 :: (load (s32) from %ir.arrayidx2)
+ ; RV32ZCA-NEXT: PseudoTAIL target-flags(riscv-call) @load_common_ptr_float_1, implicit $x2, implicit $f10_f, implicit $f11_f, implicit $f12_f
renamable $f10_f = FLW renamable $x16, 0 :: (load (s32) from %ir.g)
renamable $f11_f = FLW renamable $x16, 4 :: (load (s32) from %ir.arrayidx1)
renamable $f12_f = FLW killed renamable $x16, 8 :: (load (s32) from %ir.arrayidx2)
@@ -571,22 +685,47 @@ body: |
bb.0.entry:
liveins: $x16
- ; RV32-LABEL: name: load_common_ptr_double
- ; RV32: liveins: $x16
- ; RV32-NEXT: {{ $}}
- ; RV32-NEXT: $x10 = ADDI $x16, 0
- ; RV32-NEXT: renamable $f10_d = FLD $x10, 0 :: (load (s64) from %ir.g)
- ; RV32-NEXT: renamable $f11_d = FLD $x10, 8 :: (load (s64) from %ir.arrayidx1)
- ; RV32-NEXT: renamable $f12_d = FLD killed $x10, 16 :: (load (s64) from %ir.arrayidx2)
- ; RV32-NEXT: PseudoTAIL target-flags(riscv-call) @load_common_ptr_double_1, implicit $x2, implicit $f10_d, implicit $f11_d, implicit $f12_d
- ; RV64-LABEL: name: load_common_ptr_double
- ; RV64: liveins: $x16
- ; RV64-NEXT: {{ $}}
- ; RV64-NEXT: $x10 = ADDI $x16, 0
- ; RV64-NEXT: renamable $f10_d = FLD $x10, 0 :: (load (s64) from %ir.g)
- ; RV64-NEXT: renamable $f11_d = FLD $x10, 8 :: (load (s64) from %ir.arrayidx1)
- ; RV64-NEXT: renamable $f12_d = FLD killed $x10, 16 :: (load (s64) from %ir.arrayidx2)
- ; RV64-NEXT: PseudoTAIL target-flags(riscv-call) @load_common_ptr_double_1, implicit $x2, implicit $f10_d, implicit $f11_d, implicit $f12_d
+ ; RV32C-LABEL: name: load_common_ptr_double
+ ; RV32C: liveins: $x16
+ ; RV32C-NEXT: {{ $}}
+ ; RV32C-NEXT: $x10 = ADDI $x16, 0
+ ; RV32C-NEXT: renamable $f10_d = FLD $x10, 0 :: (load (s64) from %ir.g)
+ ; RV32C-NEXT: renamable $f11_d = FLD $x10, 8 :: (load (s64) from %ir.arrayidx1)
+ ; RV32C-NEXT: renamable $f12_d = FLD killed $x10, 16 :: (load (s64) from %ir.arrayidx2)
+ ; RV32C-NEXT: PseudoTAIL target-flags(riscv-call) @load_common_ptr_double_1, implicit $x2, implicit $f10_d, implicit $f11_d, implicit $f12_d
+ ;
+ ; RV64C-LABEL: name: load_common_ptr_double
+ ; RV64C: liveins: $x16
+ ; RV64C-NEXT: {{ $}}
+ ; RV64C-NEXT: $x10 = ADDI $x16, 0
+ ; RV64C-NEXT: renamable $f10_d = FLD $x10, 0 :: (load (s64) from %ir.g)
+ ; RV64C-NEXT: renamable $f11_d = FLD $x10, 8 :: (load (s64) from %ir.arrayidx1)
+ ; RV64C-NEXT: renamable $f12_d = FLD killed $x10, 16 :: (load (s64) from %ir.arrayidx2)
+ ; RV64C-NEXT: PseudoTAIL target-flags(riscv-call) @load_common_ptr_double_1, implicit $x2, implicit $f10_d, implicit $f11_d, implicit $f12_d
+ ;
+ ; RV32ZCF-LABEL: name: load_common_ptr_double
+ ; RV32ZCF: liveins: $x16
+ ; RV32ZCF-NEXT: {{ $}}
+ ; RV32ZCF-NEXT: renamable $f10_d = FLD renamable $x16, 0 :: (load (s64) from %ir.g)
+ ; RV32ZCF-NEXT: renamable $f11_d = FLD renamable $x16, 8 :: (load (s64) from %ir.arrayidx1)
+ ; RV32ZCF-NEXT: renamable $f12_d = FLD killed renamable $x16, 16 :: (load (s64) from %ir.arrayidx2)
+ ; RV32ZCF-NEXT: PseudoTAIL target-flags(riscv-call) @load_common_ptr_double_1, implicit $x2, implicit $f10_d, implicit $f11_d, implicit $f12_d
+ ;
+ ; RV32ZCA-LABEL: name: load_common_ptr_double
+ ; RV32ZCA: liveins: $x16
+ ; RV32ZCA-NEXT: {{ $}}
+ ; RV32ZCA-NEXT: renamable $f10_d = FLD renamable $x16, 0 :: (load (s64) from %ir.g)
+ ; RV32ZCA-NEXT: renamable $f11_d = FLD renamable $x16, 8 :: (load (s64) from %ir.arrayidx1)
+ ; RV32ZCA-NEXT: renamable $f12_d = FLD killed renamable $x16, 16 :: (load (s64) from %ir.arrayidx2)
+ ; RV32ZCA-NEXT: PseudoTAIL target-flags(riscv-call) @load_common_ptr_double_1, implicit $x2, implicit $f10_d, implicit $f11_d, implicit $f12_d
+ ;
+ ; RV64ZCA-LABEL: name: load_common_ptr_double
+ ; RV64ZCA: liveins: $x16
+ ; RV64ZCA-NEXT: {{ $}}
+ ; RV64ZCA-NEXT: renamable $f10_d = FLD renamable $x16, 0 :: (load (s64) from %ir.g)
+ ; RV64ZCA-NEXT: renamable $f11_d = FLD renamable $x16, 8 :: (load (s64) from %ir.arrayidx1)
+ ; RV64ZCA-NEXT: renamable $f12_d = FLD killed renamable $x16, 16 :: (load (s64) from %ir.arrayidx2)
+ ; RV64ZCA-NEXT: PseudoTAIL target-flags(riscv-call) @load_common_ptr_double_1, implicit $x2, implicit $f10_d, implicit $f11_d, implicit $f12_d
renamable $f10_d = FLD renamable $x16, 0 :: (load (s64) from %ir.g)
renamable $f11_d = FLD renamable $x16, 8 :: (load (s64) from %ir.arrayidx1)
renamable $f12_d = FLD killed renamable $x16, 16 :: (load (s64) from %ir.arrayidx2)
@@ -613,6 +752,7 @@ body: |
; RV32-NEXT: renamable $x11 = ADDI $x0, 7
; RV32-NEXT: SW killed renamable $x11, killed $x12, 28 :: (volatile store (s32) into %ir.3)
; RV32-NEXT: PseudoRET
+ ;
; RV64-LABEL: name: store_large_offset
; RV64: liveins: $x10
; RV64-NEXT: {{ $}}
@@ -644,15 +784,16 @@ body: |
bb.0.entry:
liveins: $x10, $f10_f, $f11_f, $f12_f, $f13_f
- ; RV32-LABEL: name: store_large_offset_float
- ; RV32: liveins: $x10, $f10_f, $f11_f, $f12_f, $f13_f
- ; RV32-NEXT: {{ $}}
- ; RV32-NEXT: $x11 = ADDI $x10, 384
- ; RV32-NEXT: FSW killed renamable $f10_f, $x11, 16 :: (volatile store (s32) into %ir.0)
- ; RV32-NEXT: FSW killed renamable $f11_f, $x11, 20 :: (volatile store (s32) into %ir.1)
- ; RV32-NEXT: FSW killed renamable $f12_f, $x11, 24 :: (volatile store (s32) into %ir.2)
- ; RV32-NEXT: FSW killed renamable $f13_f, killed $x11, 28 :: (volatile store (s32) into %ir.3)
- ; RV32-NEXT: PseudoRET
+ ; RV32C-LABEL: name: store_large_offset_float
+ ; RV32C: liveins: $x10, $f10_f, $f11_f, $f12_f, $f13_f
+ ; RV32C-NEXT: {{ $}}
+ ; RV32C-NEXT: $x11 = ADDI $x10, 384
+ ; RV32C-NEXT: FSW killed renamable $f10_f, $x11, 16 :: (volatile store (s32) into %ir.0)
+ ; RV32C-NEXT: FSW killed renamable $f11_f, $x11, 20 :: (volatile store (s32) into %ir.1)
+ ; RV32C-NEXT: FSW killed renamable $f12_f, $x11, 24 :: (volatile store (s32) into %ir.2)
+ ; RV32C-NEXT: FSW killed renamable $f13_f, killed $x11, 28 :: (volatile store (s32) into %ir.3)
+ ; RV32C-NEXT: PseudoRET
+ ;
; RV64-LABEL: name: store_large_offset_float
; RV64: liveins: $x10, $f10_f, $f11_f, $f12_f, $f13_f
; RV64-NEXT: {{ $}}
@@ -661,6 +802,25 @@ body: |
; RV64-NEXT: FSW killed renamable $f12_f, renamable $x10, 408 :: (volatile store (s32) into %ir.2)
; RV64-NEXT: FSW killed renamable $f13_f, killed renamable $x10, 412 :: (volatile store (s32) into %ir.3)
; RV64-NEXT: PseudoRET
+ ;
+ ; RV32ZCF-LABEL: name: store_large_offset_float
+ ; RV32ZCF: liveins: $x10, $f10_f, $f11_f, $f12_f, $f13_f
+ ; RV32ZCF-NEXT: {{ $}}
+ ; RV32ZCF-NEXT: $x11 = ADDI $x10, 384
+ ; RV32ZCF-NEXT: FSW killed renamable $f10_f, $x11, 16 :: (volatile store (s32) into %ir.0)
+ ; RV32ZCF-NEXT: FSW killed renamable $f11_f, $x11, 20 :: (volatile store (s32) into %ir.1)
+ ; RV32ZCF-NEXT: FSW killed renamable $f12_f, $x11, 24 :: (volatile store (s32) into %ir.2)
+ ; RV32ZCF-NEXT: FSW killed renamable $f13_f, killed $x11, 28 :: (volatile store (s32) into %ir.3)
+ ; RV32ZCF-NEXT: PseudoRET
+ ;
+ ; RV32ZCA-LABEL: name: store_large_offset_float
+ ; RV32ZCA: liveins: $x10, $f10_f, $f11_f, $f12_f, $f13_f
+ ; RV32ZCA-NEXT: {{ $}}
+ ; RV32ZCA-NEXT: FSW killed renamable $f10_f, renamable $x10, 400 :: (volatile store (s32) into %ir.0)
+ ; RV32ZCA-NEXT: FSW killed renamable $f11_f, renamable $x10, 404 :: (volatile store (s32) into %ir.1)
+ ; RV32ZCA-NEXT: FSW killed renamable $f12_f, renamable $x10, 408 :: (volatile store (s32) into %ir.2)
+ ; RV32ZCA-NEXT: FSW killed renamable $f13_f, killed renamable $x10, 412 :: (volatile store (s32) into %ir.3)
+ ; RV32ZCA-NEXT: PseudoRET
FSW killed renamable $f10_f, renamable $x10, 400 :: (volatile store (s32) into %ir.0)
FSW killed renamable $f11_f, renamable $x10, 404 :: (volatile store (s32) into %ir.1)
FSW killed renamable $f12_f, renamable $x10, 408 :: (volatile store (s32) into %ir.2)
@@ -675,24 +835,52 @@ body: |
bb.0.entry:
liveins: $x10, $f10_d, $f11_d, $f12_d, $f13_d
- ; RV32-LABEL: name: store_large_offset_double
- ; RV32: liveins: $x10, $f10_d, $f11_d, $f12_d, $f13_d
- ; RV32-NEXT: {{ $}}
- ; RV32-NEXT: $x11 = ADDI $x10, 768
- ; RV32-NEXT: FSD killed renamable $f10_d, $x11, 32 :: (volatile store (s64) into %ir.0)
- ; RV32-NEXT: FSD killed renamable $f11_d, $x11, 40 :: (volatile store (s64) into %ir.1)
- ; RV32-NEXT: FSD killed renamable $f12_d, $x11, 48 :: (volatile store (s64) into %ir.2)
- ; RV32-NEXT: FSD killed renamable $f13_d, killed $x11, 56 :: (volatile store (s64) into %ir.3)
- ; RV32-NEXT: PseudoRET
- ; RV64-LABEL: name: store_large_offset_double
- ; RV64: liveins: $x10, $f10_d, $f11_d, $f12_d, $f13_d
- ; RV64-NEXT: {{ $}}
- ; RV64-NEXT: $x11 = ADDI $x10, 768
- ; RV64-NEXT: FSD killed renamable $f10_d, $x11, 32 :: (volatile store (s64) into %ir.0)
- ; RV64-NEXT: FSD killed renamable $f11_d, $x11, 40 :: (volatile store (s64) into %ir.1)
- ; RV64-NEXT: FSD killed renamable $f12_d, $x11, 48 :: (volatile store (s64) into %ir.2)
- ; RV64-NEXT: FSD killed renamable $f13_d, killed $x11, 56 :: (volatile store (s64) into %ir.3)
- ; RV64-NEXT: PseudoRET
+ ; RV32C-LABEL: name: store_large_offset_double
+ ; RV32C: liveins: $x10, $f10_d, $f11_d, $f12_d, $f13_d
+ ; RV32C-NEXT: {{ $}}
+ ; RV32C-NEXT: $x11 = ADDI $x10, 768
+ ; RV32C-NEXT: FSD killed renamable $f10_d, $x11, 32 :: (volatile store (s64) into %ir.0)
+ ; RV32C-NEXT: FSD killed renamable $f11_d, $x11, 40 :: (volatile store (s64) into %ir.1)
+ ; RV32C-NEXT: FSD killed renamable $f12_d, $x11, 48 :: (volatile store (s64) into %ir.2)
+ ; RV32C-NEXT: FSD killed renamable $f13_d, killed $x11, 56 :: (volatile store (s64) into %ir.3)
+ ; RV32C-NEXT: PseudoRET
+ ;
+ ; RV64C-LABEL: name: store_large_offset_double
+ ; RV64C: liveins: $x10, $f10_d, $f11_d, $f12_d, $f13_d
+ ; RV64C-NEXT: {{ $}}
+ ; RV64C-NEXT: $x11 = ADDI $x10, 768
+ ; RV64C-NEXT: FSD killed renamable $f10_d, $x11, 32 :: (volatile store (s64) into %ir.0)
+ ; RV64C-NEXT: FSD killed renamable $f11_d, $x11, 40 :: (volatile store (s64) into %ir.1)
+ ; RV64C-NEXT: FSD killed renamable $f12_d, $x11, 48 :: (volatile store (s64) into %ir.2)
+ ; RV64C-NEXT: FSD killed renamable $f13_d, killed $x11, 56 :: (volatile store (s64) into %ir.3)
+ ; RV64C-NEXT: PseudoRET
+ ;
+ ; RV32ZCF-LABEL: name: store_large_offset_double
+ ; RV32ZCF: liveins: $x10, $f10_d, $f11_d, $f12_d, $f13_d
+ ; RV32ZCF-NEXT: {{ $}}
+ ; RV32ZCF-NEXT: FSD killed renamable $f10_d, renamable $x10, 800 :: (volatile store (s64) into %ir.0)
+ ; RV32ZCF-NEXT: FSD killed renamable $f11_d, renamable $x10, 808 :: (volatile store (s64) into %ir.1)
+ ; RV32ZCF-NEXT: FSD killed renamable $f12_d, renamable $x10, 816 :: (volatile store (s64) into %ir.2)
+ ; RV32ZCF-NEXT: FSD killed renamable $f13_d, killed renamable $x10, 824 :: (volatile store (s64) into %ir.3)
+ ; RV32ZCF-NEXT: PseudoRET
+ ;
+ ; RV32ZCA-LABEL: name: store_large_offset_double
+ ; RV32ZCA: liveins: $x10, $f10_d, $f11_d, $f12_d, $f13_d
+ ; RV32ZCA-NEXT: {{ $}}
+ ; RV32ZCA-NEXT: FSD killed renamable $f10_d, renamable $x10, 800 :: (volatile store (s64) into %ir.0)
+ ; RV32ZCA-NEXT: FSD killed renamable $f11_d, renamable $x10, 808 :: (volatile store (s64) into %ir.1)
+ ; RV32ZCA-NEXT: FSD killed renamable $f12_d, renamable $x10, 816 :: (volatile store (s64) into %ir.2)
+ ; RV32ZCA-NEXT: FSD killed renamable $f13_d, killed renamable $x10, 824 :: (volatile store (s64) into %ir.3)
+ ; RV32ZCA-NEXT: PseudoRET
+ ;
+ ; RV64ZCA-LABEL: name: store_large_offset_double
+ ; RV64ZCA: liveins: $x10, $f10_d, $f11_d, $f12_d, $f13_d
+ ; RV64ZCA-NEXT: {{ $}}
+ ; RV64ZCA-NEXT: FSD killed renamable $f10_d, renamable $x10, 800 :: (volatile store (s64) into %ir.0)
+ ; RV64ZCA-NEXT: FSD killed renamable $f11_d, renamable $x10, 808 :: (volatile store (s64) into %ir.1)
+ ; RV64ZCA-NEXT: FSD killed renamable $f12_d, renamable $x10, 816 :: (volatile store (s64) into %ir.2)
+ ; RV64ZCA-NEXT: FSD killed renamable $f13_d, killed renamable $x10, 824 :: (volatile store (s64) into %ir.3)
+ ; RV64ZCA-NEXT: PseudoRET
FSD killed renamable $f10_d, renamable $x10, 800 :: (volatile store (s64) into %ir.0)
FSD killed renamable $f11_d, renamable $x10, 808 :: (volatile store (s64) into %ir.1)
FSD killed renamable $f12_d, renamable $x10, 816 :: (volatile store (s64) into %ir.2)
@@ -716,6 +904,7 @@ body: |
; RV32-NEXT: dead renamable $x11 = LW $x12, 24 :: (volatile load (s32) from %ir.2)
; RV32-NEXT: dead renamable $x10 = LW killed $x12, 28 :: (volatile load (s32) from %ir.3)
; RV32-NEXT: PseudoRET
+ ;
; RV64-LABEL: name: load_large_offset
; RV64: liveins: $x10
; RV64-NEXT: {{ $}}
@@ -739,14 +928,15 @@ body: |
bb.0.entry:
liveins: $x10
- ; RV32-LABEL: name: load_large_offset_float
- ; RV32: liveins: $x10
- ; RV32-NEXT: {{ $}}
- ; RV32-NEXT: $x11 = ADDI $x10, 384
- ; RV32-NEXT: renamable $f10_f = FLW $x11, 16 :: (load (s32) from %ir.arrayidx)
- ; RV32-NEXT: renamable $f11_f = FLW $x11, 20 :: (load (s32) from %ir.arrayidx1)
- ; RV32-NEXT: renamable $f12_f = FLW killed $x11, 24 :: (load (s32) from %ir.arrayidx2)
- ; RV32-NEXT: PseudoTAIL target-flags(riscv-call) @load_large_offset_float_1, implicit $x2, implicit $f10_f, implicit $f11_f, implicit $f12_f
+ ; RV32C-LABEL: name: load_large_offset_float
+ ; RV32C: liveins: $x10
+ ; RV32C-NEXT: {{ $}}
+ ; RV32C-NEXT: $x11 = ADDI $x10, 384
+ ; RV32C-NEXT: renamable $f10_f = FLW $x11, 16 :: (load (s32) from %ir.arrayidx)
+ ; RV32C-NEXT: renamable $f11_f = FLW $x11, 20 :: (load (s32) from %ir.arrayidx1)
+ ; RV32C-NEXT: renamable $f12_f = FLW killed $x11, 24 :: (load (s32) from %ir.arrayidx2)
+ ; RV32C-NEXT: PseudoTAIL target-flags(riscv-call) @load_large_offset_float_1, implicit $x2, implicit $f10_f, implicit $f11_f, implicit $f12_f
+ ;
; RV64-LABEL: name: load_large_offset_float
; RV64: liveins: $x10
; RV64-NEXT: {{ $}}
@@ -754,6 +944,23 @@ body: |
; RV64-NEXT: renamable $f11_f = FLW renamable $x10, 404 :: (load (s32) from %ir.arrayidx1)
; RV64-NEXT: renamable $f12_f = FLW killed renamable $x10, 408 :: (load (s32) from %ir.arrayidx2)
; RV64-NEXT: PseudoTAIL target-flags(riscv-call) @load_large_offset_float_1, implicit $x2, implicit $f10_f, implicit $f11_f, implicit $f12_f
+ ;
+ ; RV32ZCF-LABEL: name: load_large_offset_float
+ ; RV32ZCF: liveins: $x10
+ ; RV32ZCF-NEXT: {{ $}}
+ ; RV32ZCF-NEXT: $x11 = ADDI $x10, 384
+ ; RV32ZCF-NEXT: renamable $f10_f = FLW $x11, 16 :: (load (s32) from %ir.arrayidx)
+ ; RV32ZCF-NEXT: renamable $f11_f = FLW $x11, 20 :: (load (s32) from %ir.arrayidx1)
+ ; RV32ZCF-NEXT: renamable $f12_f = FLW killed $x11, 24 :: (load (s32) from %ir.arrayidx2)
+ ; RV32ZCF-NEXT: PseudoTAIL target-flags(riscv-call) @load_large_offset_float_1, implicit $x2, implicit $f10_f, implicit $f11_f, implicit $f12_f
+ ;
+ ; RV32ZCA-LABEL: name: load_large_offset_float
+ ; RV32ZCA: liveins: $x10
+ ; RV32ZCA-NEXT: {{ $}}
+ ; RV32ZCA-NEXT: renamable $f10_f = FLW renamable $x10, 400 :: (load (s32) from %ir.arrayidx)
+ ; RV32ZCA-NEXT: renamable $f11_f = FLW renamable $x10, 404 :: (load (s32) from %ir.arrayidx1)
+ ; RV32ZCA-NEXT: renamable $f12_f = FLW killed renamable $x10, 408 :: (load (s32) from %ir.arrayidx2)
+ ; RV32ZCA-NEXT: PseudoTAIL target-flags(riscv-call) @load_large_offset_float_1, implicit $x2, implicit $f10_f, implicit $f11_f, implicit $f12_f
renamable $f10_f = FLW renamable $x10, 400 :: (load (s32) from %ir.arrayidx)
renamable $f11_f = FLW renamable $x10, 404 :: (load (s32) from %ir.arrayidx1)
renamable $f12_f = FLW killed renamable $x10, 408 :: (load (s32) from %ir.arrayidx2)
@@ -767,22 +974,47 @@ body: |
bb.0.entry:
liveins: $x10
- ; RV32-LABEL: name: load_large_offset_double
- ; RV32: liveins: $x10
- ; RV32-NEXT: {{ $}}
- ; RV32-NEXT: $x11 = ADDI $x10, 768
- ; RV32-NEXT: renamable $f10_d = FLD $x11, 32 :: (load (s64) from %ir.arrayidx)
- ; RV32-NEXT: renamable $f11_d = FLD $x11, 40 :: (load (s64) from %ir.arrayidx1)
- ; RV32-NEXT: renamable $f12_d = FLD killed $x11, 48 :: (load (s64) from %ir.arrayidx2)
- ; RV32-NEXT: PseudoTAIL target-flags(riscv-call) @load_large_offset_double_1, implicit $x2, implicit $f10_d, implicit $f11_d, implicit $f12_d
- ; RV64-LABEL: name: load_large_offset_double
- ; RV64: liveins: $x10
- ; RV64-NEXT: {{ $}}
- ; RV64-NEXT: $x11 = ADDI $x10, 768
- ; RV64-NEXT: renamable $f10_d = FLD $x11, 32 :: (load (s64) from %ir.arrayidx)
- ; RV64-NEXT: renamable $f11_d = FLD $x11, 40 :: (load (s64) from %ir.arrayidx1)
- ; RV64-NEXT: renamable $f12_d = FLD killed $x11, 48 :: (load (s64) from %ir.arrayidx2)
- ; RV64-NEXT: PseudoTAIL target-flags(riscv-call) @load_large_offset_double_1, implicit $x2, implicit $f10_d, implicit $f11_d, implicit $f12_d
+ ; RV32C-LABEL: name: load_large_offset_double
+ ; RV32C: liveins: $x10
+ ; RV32C-NEXT: {{ $}}
+ ; RV32C-NEXT: $x11 = ADDI $x10, 768
+ ; RV32C-NEXT: renamable $f10_d = FLD $x11, 32 :: (load (s64) from %ir.arrayidx)
+ ; RV32C-NEXT: renamable $f11_d = FLD $x11, 40 :: (load (s64) from %ir.arrayidx1)
+ ; RV32C-NEXT: renamable $f12_d = FLD killed $x11, 48 :: (load (s64) from %ir.arrayidx2)
+ ; RV32C-NEXT: PseudoTAIL target-flags(riscv-call) @load_large_offset_double_1, implicit $x2, implicit $f10_d, implicit $f11_d, implicit $f12_d
+ ;
+ ; RV64C-LABEL: name: load_large_offset_double
+ ; RV64C: liveins: $x10
+ ; RV64C-NEXT: {{ $}}
+ ; RV64C-NEXT: $x11 = ADDI $x10, 768
+ ; RV64C-NEXT: renamable $f10_d = FLD $x11, 32 :: (load (s64) from %ir.arrayidx)
+ ; RV64C-NEXT: renamable $f11_d = FLD $x11, 40 :: (load (s64) from %ir.arrayidx1)
+ ; RV64C-NEXT: renamable $f12_d = FLD killed $x11, 48 :: (load (s64) from %ir.arrayidx2)
+ ; RV64C-NEXT: PseudoTAIL target-flags(riscv-call) @load_large_offset_double_1, implicit $x2, implicit $f10_d, implicit $f11_d, implicit $f12_d
+ ;
+ ; RV32ZCF-LABEL: name: load_large_offset_double
+ ; RV32ZCF: liveins: $x10
+ ; RV32ZCF-NEXT: {{ $}}
+ ; RV32ZCF-NEXT: renamable $f10_d = FLD renamable $x10, 800 :: (load (s64) from %ir.arrayidx)
+ ; RV32ZCF-NEXT: renamable $f11_d = FLD renamable $x10, 808 :: (load (s64) from %ir.arrayidx1)
+ ; RV32ZCF-NEXT: renamable $f12_d = FLD killed renamable $x10, 816 :: (load (s64) from %ir.arrayidx2)
+ ; RV32ZCF-NEXT: PseudoTAIL target-flags(riscv-call) @load_large_offset_double_1, implicit $x2, implicit $f10_d, implicit $f11_d, implicit $f12_d
+ ;
+ ; RV32ZCA-LABEL: name: load_large_offset_double
+ ; RV32ZCA: liveins: $x10
+ ; RV32ZCA-NEXT: {{ $}}
+ ; RV32ZCA-NEXT: renamable $f10_d = FLD renamable $x10, 800 :: (load (s64) from %ir.arrayidx)
+ ; RV32ZCA-NEXT: renamable $f11_d = FLD renamable $x10, 808 :: (load (s64) from %ir.arrayidx1)
+ ; RV32ZCA-NEXT: renamable $f12_d = FLD killed renamable $x10, 816 :: (load (s64) from %ir.arrayidx2)
+ ; RV32ZCA-NEXT: PseudoTAIL target-flags(riscv-call) @load_large_offset_double_1, implicit $x2, implicit $f10_d, implicit $f11_d, implicit $f12_d
+ ;
+ ; RV64ZCA-LABEL: name: load_large_offset_double
+ ; RV64ZCA: liveins: $x10
+ ; RV64ZCA-NEXT: {{ $}}
+ ; RV64ZCA-NEXT: renamable $f10_d = FLD renamable $x10, 800 :: (load (s64) from %ir.arrayidx)
+ ; RV64ZCA-NEXT: renamable $f11_d = FLD renamable $x10, 808 :: (load (s64) from %ir.arrayidx1)
+ ; RV64ZCA-NEXT: renamable $f12_d = FLD killed renamable $x10, 816 :: (load (s64) from %ir.arrayidx2)
+ ; RV64ZCA-NEXT: PseudoTAIL target-flags(riscv-call) @load_large_offset_double_1, implicit $x2, implicit $f10_d, implicit $f11_d, implicit $f12_d
renamable $f10_d = FLD renamable $x10, 800 :: (load (s64) from %ir.arrayidx)
renamable $f11_d = FLD renamable $x10, 808 :: (load (s64) from %ir.arrayidx1)
renamable $f12_d = FLD killed renamable $x10, 816 :: (load (s64) from %ir.arrayidx2)
@@ -801,6 +1033,7 @@ body: |
; RV32-NEXT: {{ $}}
; RV32-NEXT: SW $x0, killed renamable $x10, 0 :: (store (s32) into %ir.a)
; RV32-NEXT: PseudoRET
+ ;
; RV64-LABEL: name: store_common_value_no_opt
; RV64: liveins: $x10
; RV64-NEXT: {{ $}}
@@ -822,6 +1055,7 @@ body: |
; RV32-NEXT: {{ $}}
; RV32-NEXT: FSW killed renamable $f16_f, killed renamable $x10, 0 :: (store (s32) into %ir.a)
; RV32-NEXT: PseudoRET
+ ;
; RV64-LABEL: name: store_common_value_float_no_opt
; RV64: liveins: $x10, $f16_f
; RV64-NEXT: {{ $}}
@@ -843,6 +1077,7 @@ body: |
; RV32-NEXT: {{ $}}
; RV32-NEXT: FSD killed renamable $f16_d, killed renamable $x10, 0 :: (store (s64) into %ir.a)
; RV32-NEXT: PseudoRET
+ ;
; RV64-LABEL: name: store_common_value_double_no_opt
; RV64: liveins: $x10, $f16_d
; RV64-NEXT: {{ $}}
@@ -865,6 +1100,7 @@ body: |
; RV32-NEXT: renamable $x10 = ADDI $x0, 1
; RV32-NEXT: SW killed renamable $x10, killed renamable $x16, 0 :: (volatile store (s32) into %ir.p)
; RV32-NEXT: PseudoRET
+ ;
; RV64-LABEL: name: store_common_ptr_no_opt
; RV64: liveins: $x16
; RV64-NEXT: {{ $}}
@@ -888,6 +1124,7 @@ body: |
; RV32-NEXT: {{ $}}
; RV32-NEXT: FSW killed renamable $f10_f, killed renamable $x16, 0 :: (volatile store (s32) into %ir.p)
; RV32-NEXT: PseudoRET
+ ;
; RV64-LABEL: name: store_common_ptr_float_no_opt
; RV64: liveins: $x16, $f10_f
; RV64-NEXT: {{ $}}
@@ -909,6 +1146,7 @@ body: |
; RV32-NEXT: {{ $}}
; RV32-NEXT: FSD killed renamable $f10_d, killed renamable $x16, 0 :: (volatile store (s64) into %ir.p)
; RV32-NEXT: PseudoRET
+ ;
; RV64-LABEL: name: store_common_ptr_double_no_opt
; RV64: liveins: $x16, $f10_d
; RV64-NEXT: {{ $}}
@@ -930,6 +1168,7 @@ body: |
; RV32-NEXT: {{ $}}
; RV32-NEXT: dead renamable $x10 = LW killed renamable $x16, 0 :: (volatile load (s32) from %ir.p)
; RV32-NEXT: PseudoRET
+ ;
; RV64-LABEL: name: load_common_ptr_no_opt
; RV64: liveins: $x16
; RV64-NEXT: {{ $}}
@@ -951,6 +1190,7 @@ body: |
; RV32-NEXT: {{ $}}
; RV32-NEXT: renamable $f10_f = FLW killed renamable $x16, 0 :: (load (s32) from %ir.g)
; RV32-NEXT: PseudoRET implicit $f10_f
+ ;
; RV64-LABEL: name: load_common_ptr_float_no_opt
; RV64: liveins: $x16
; RV64-NEXT: {{ $}}
@@ -972,6 +1212,7 @@ body: |
; RV32-NEXT: {{ $}}
; RV32-NEXT: renamable $f10_d = FLD killed renamable $x16, 0 :: (load (s64) from %ir.g)
; RV32-NEXT: PseudoRET implicit $f10_d
+ ;
; RV64-LABEL: name: load_common_ptr_double_no_opt
; RV64: liveins: $x16
; RV64-NEXT: {{ $}}
@@ -996,6 +1237,7 @@ body: |
; RV32-NEXT: renamable $x11 = ADDI $x0, 3
; RV32-NEXT: SW killed renamable $x11, killed renamable $x10, 404 :: (volatile store (s32) into %ir.1)
; RV32-NEXT: PseudoRET
+ ;
; RV64-LABEL: name: store_large_offset_no_opt
; RV64: liveins: $x10
; RV64-NEXT: {{ $}}
@@ -1024,6 +1266,7 @@ body: |
; RV32-NEXT: FSW killed renamable $f10_f, renamable $x10, 400 :: (volatile store (s32) into %ir.0)
; RV32-NEXT: FSW killed renamable $f11_f, killed renamable $x10, 404 :: (volatile store (s32) into %ir.1)
; RV32-NEXT: PseudoRET
+ ;
; RV64-LABEL: name: store_large_offset_float_no_opt
; RV64: liveins: $x10, $f10_f, $f11_f
; RV64-NEXT: {{ $}}
@@ -1048,6 +1291,7 @@ body: |
; RV32-NEXT: FSD killed renamable $f10_d, renamable $x10, 800 :: (volatile store (s64) into %ir.0)
; RV32-NEXT: FSD killed renamable $f11_d, killed renamable $x10, 808 :: (volatile store (s64) into %ir.1)
; RV32-NEXT: PseudoRET
+ ;
; RV64-LABEL: name: store_large_offset_double_no_opt
; RV64: liveins: $x10, $f10_d, $f11_d
; RV64-NEXT: {{ $}}
@@ -1072,6 +1316,7 @@ body: |
; RV32-NEXT: dead renamable $x11 = LW renamable $x10, 400 :: (volatile load (s32) from %ir.0)
; RV32-NEXT: dead renamable $x10 = LW killed renamable $x10, 404 :: (volatile load (s32) from %ir.1)
; RV32-NEXT: PseudoRET
+ ;
; RV64-LABEL: name: load_large_offset_no_opt
; RV64: liveins: $x10
; RV64-NEXT: {{ $}}
@@ -1096,6 +1341,7 @@ body: |
; RV32-NEXT: renamable $f10_f = FLW renamable $x10, 400 :: (load (s32) from %ir.arrayidx)
; RV32-NEXT: renamable $f11_f = FLW killed renamable $x10, 404 :: (load (s32) from %ir.arrayidx1)
; RV32-NEXT: PseudoRET implicit $f10_f, implicit $f11_f
+ ;
; RV64-LABEL: name: load_large_offset_float_no_opt
; RV64: liveins: $x10
; RV64-NEXT: {{ $}}
@@ -1120,6 +1366,7 @@ body: |
; RV32-NEXT: renamable $f10_d = FLD renamable $x10, 800 :: (load (s64) from %ir.arrayidx)
; RV32-NEXT: renamable $f11_d = FLD killed renamable $x10, 808 :: (load (s64) from %ir.arrayidx1)
; RV32-NEXT: PseudoRET implicit $f10_d, implicit $f11_d
+ ;
; RV64-LABEL: name: load_large_offset_double_no_opt
; RV64: liveins: $x10
; RV64-NEXT: {{ $}}
diff --git a/llvm/test/CodeGen/RISCV/pr51206.ll b/llvm/test/CodeGen/RISCV/pr51206.ll
index f54031a..8aa145f 100644
--- a/llvm/test/CodeGen/RISCV/pr51206.ll
+++ b/llvm/test/CodeGen/RISCV/pr51206.ll
@@ -27,16 +27,12 @@ define signext i32 @wobble() nounwind {
; CHECK-NEXT: lui a2, %hi(global.3)
; CHECK-NEXT: li a3, 5
; CHECK-NEXT: sw a1, %lo(global.3)(a2)
-; CHECK-NEXT: bltu a0, a3, .LBB0_2
-; CHECK-NEXT: # %bb.1: # %bb10
-; CHECK-NEXT: addi sp, sp, -16
-; CHECK-NEXT: sd ra, 8(sp) # 8-byte Folded Spill
-; CHECK-NEXT: call quux
-; CHECK-NEXT: ld ra, 8(sp) # 8-byte Folded Reload
-; CHECK-NEXT: addi sp, sp, 16
-; CHECK-NEXT: .LBB0_2: # %bb12
+; CHECK-NEXT: bgeu a0, a3, .LBB0_2
+; CHECK-NEXT: # %bb.1: # %bb12
; CHECK-NEXT: li a0, 0
; CHECK-NEXT: ret
+; CHECK-NEXT: .LBB0_2: # %bb10
+; CHECK-NEXT: tail quux
bb:
%tmp = load i8, ptr @global, align 1
%tmp1 = zext i8 %tmp to i32
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-gather.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-gather.ll
index 1724b48..a09ab3e 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-gather.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-gather.ll
@@ -1,12 +1,12 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=riscv32 -mattr=+m,+d,+zfh,+zvfh,+v -target-abi=ilp32d \
-; RUN: -riscv-enable-sink-fold -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,RV32,RV32V
+; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,RV32,RV32V
; RUN: llc -mtriple=riscv64 -mattr=+m,+d,+zfh,+zvfh,+v -target-abi=lp64d \
-; RUN: -riscv-enable-sink-fold -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,RV64,RV64V
+; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,RV64,RV64V
; RUN: llc -mtriple=riscv32 -mattr=+m,+d,+zfh,+zvfh,+zve32f,+zvl128b -target-abi=ilp32d \
-; RUN: -riscv-enable-sink-fold -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,RV32,RV32ZVE32F
+; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,RV32,RV32ZVE32F
; RUN: llc -mtriple=riscv64 -mattr=+m,+d,+zfh,+zvfh,+zve32f,+zvl128b -target-abi=lp64d \
-; RUN: -riscv-enable-sink-fold -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,RV64,RV64ZVE32F
+; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,RV64,RV64ZVE32F
declare <1 x i8> @llvm.masked.gather.v1i8.v1p0(<1 x ptr>, i32, <1 x i1>, <1 x i8>)
@@ -15086,31 +15086,27 @@ define <32 x i64> @mgather_strided_split(ptr %base) {
ret <32 x i64> %x
}
-; FIXME: This is a miscompile triggered by the mgather ->
-; riscv.masked.strided.load combine. In order for it to trigger we need either a
-; strided gather that RISCVGatherScatterLowering doesn't pick up, or a new
-; strided gather generated by the widening sew combine.
define <4 x i32> @masked_gather_widen_sew_negative_stride(ptr %base) {
; RV32V-LABEL: masked_gather_widen_sew_negative_stride:
; RV32V: # %bb.0:
-; RV32V-NEXT: addi a0, a0, -128
-; RV32V-NEXT: li a1, -128
+; RV32V-NEXT: addi a0, a0, 136
+; RV32V-NEXT: li a1, -136
; RV32V-NEXT: vsetivli zero, 2, e64, m1, ta, ma
; RV32V-NEXT: vlse64.v v8, (a0), a1
; RV32V-NEXT: ret
;
; RV64V-LABEL: masked_gather_widen_sew_negative_stride:
; RV64V: # %bb.0:
-; RV64V-NEXT: addi a0, a0, -128
-; RV64V-NEXT: li a1, -128
+; RV64V-NEXT: addi a0, a0, 136
+; RV64V-NEXT: li a1, -136
; RV64V-NEXT: vsetivli zero, 2, e64, m1, ta, ma
; RV64V-NEXT: vlse64.v v8, (a0), a1
; RV64V-NEXT: ret
;
; RV32ZVE32F-LABEL: masked_gather_widen_sew_negative_stride:
; RV32ZVE32F: # %bb.0:
-; RV32ZVE32F-NEXT: lui a1, 16392
-; RV32ZVE32F-NEXT: addi a1, a1, 1152
+; RV32ZVE32F-NEXT: lui a1, 16393
+; RV32ZVE32F-NEXT: addi a1, a1, -888
; RV32ZVE32F-NEXT: vsetivli zero, 4, e32, m1, ta, ma
; RV32ZVE32F-NEXT: vmv.s.x v9, a1
; RV32ZVE32F-NEXT: vluxei8.v v8, (a0), v9
@@ -15118,8 +15114,8 @@ define <4 x i32> @masked_gather_widen_sew_negative_stride(ptr %base) {
;
; RV64ZVE32F-LABEL: masked_gather_widen_sew_negative_stride:
; RV64ZVE32F: # %bb.0:
-; RV64ZVE32F-NEXT: addi a1, a0, 128
-; RV64ZVE32F-NEXT: lw a2, 132(a0)
+; RV64ZVE32F-NEXT: addi a1, a0, 136
+; RV64ZVE32F-NEXT: lw a2, 140(a0)
; RV64ZVE32F-NEXT: lw a3, 0(a0)
; RV64ZVE32F-NEXT: lw a0, 4(a0)
; RV64ZVE32F-NEXT: vsetivli zero, 4, e32, m1, ta, ma
@@ -15128,7 +15124,7 @@ define <4 x i32> @masked_gather_widen_sew_negative_stride(ptr %base) {
; RV64ZVE32F-NEXT: vslide1down.vx v8, v8, a3
; RV64ZVE32F-NEXT: vslide1down.vx v8, v8, a0
; RV64ZVE32F-NEXT: ret
- %ptrs = getelementptr i32, ptr %base, <4 x i64> <i64 32, i64 33, i64 0, i64 1>
+ %ptrs = getelementptr i32, ptr %base, <4 x i64> <i64 34, i64 35, i64 0, i64 1>
%x = call <4 x i32> @llvm.masked.gather.v4i32.v32p0(<4 x ptr> %ptrs, i32 8, <4 x i1> shufflevector(<4 x i1> insertelement(<4 x i1> poison, i1 true, i32 0), <4 x i1> poison, <4 x i32> zeroinitializer), <4 x i32> poison)
ret <4 x i32> %x
}
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vsadd-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vsadd-vp.ll
new file mode 100644
index 0000000..6c5dd04
--- /dev/null
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vsadd-vp.ll
@@ -0,0 +1,1701 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=riscv32 -mattr=+v -verify-machineinstrs < %s \
+; RUN: | FileCheck %s --check-prefixes=CHECK,RV32
+; RUN: llc -mtriple=riscv64 -mattr=+v -verify-machineinstrs < %s \
+; RUN: | FileCheck %s --check-prefixes=CHECK,RV64
+
+declare <8 x i7> @llvm.vp.sadd.sat.v8i7(<8 x i7>, <8 x i7>, <8 x i1>, i32)
+
+define <8 x i7> @vsadd_vv_v8i7(<8 x i7> %va, <8 x i7> %b, <8 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vsadd_vv_v8i7:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma
+; CHECK-NEXT: vadd.vv v9, v9, v9
+; CHECK-NEXT: vsra.vi v9, v9, 1
+; CHECK-NEXT: vadd.vv v8, v8, v8
+; CHECK-NEXT: vsra.vi v8, v8, 1
+; CHECK-NEXT: vsetvli zero, a0, e8, mf2, ta, ma
+; CHECK-NEXT: vadd.vv v8, v8, v9, v0.t
+; CHECK-NEXT: li a0, 63
+; CHECK-NEXT: vmin.vx v8, v8, a0, v0.t
+; CHECK-NEXT: li a0, 192
+; CHECK-NEXT: vmax.vx v8, v8, a0, v0.t
+; CHECK-NEXT: ret
+ %v = call <8 x i7> @llvm.vp.sadd.sat.v8i7(<8 x i7> %va, <8 x i7> %b, <8 x i1> %m, i32 %evl)
+ ret <8 x i7> %v
+}
+
+declare <2 x i8> @llvm.vp.sadd.sat.v2i8(<2 x i8>, <2 x i8>, <2 x i1>, i32)
+
+define <2 x i8> @vsadd_vv_v2i8(<2 x i8> %va, <2 x i8> %b, <2 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vsadd_vv_v2i8:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e8, mf8, ta, ma
+; CHECK-NEXT: vsadd.vv v8, v8, v9, v0.t
+; CHECK-NEXT: ret
+ %v = call <2 x i8> @llvm.vp.sadd.sat.v2i8(<2 x i8> %va, <2 x i8> %b, <2 x i1> %m, i32 %evl)
+ ret <2 x i8> %v
+}
+
+define <2 x i8> @vsadd_vv_v2i8_unmasked(<2 x i8> %va, <2 x i8> %b, i32 zeroext %evl) {
+; CHECK-LABEL: vsadd_vv_v2i8_unmasked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e8, mf8, ta, ma
+; CHECK-NEXT: vsadd.vv v8, v8, v9
+; CHECK-NEXT: ret
+ %head = insertelement <2 x i1> poison, i1 true, i32 0
+ %m = shufflevector <2 x i1> %head, <2 x i1> poison, <2 x i32> zeroinitializer
+ %v = call <2 x i8> @llvm.vp.sadd.sat.v2i8(<2 x i8> %va, <2 x i8> %b, <2 x i1> %m, i32 %evl)
+ ret <2 x i8> %v
+}
+
+define <2 x i8> @vsadd_vx_v2i8(<2 x i8> %va, i8 %b, <2 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vsadd_vx_v2i8:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, ma
+; CHECK-NEXT: vsadd.vx v8, v8, a0, v0.t
+; CHECK-NEXT: ret
+ %elt.head = insertelement <2 x i8> poison, i8 %b, i32 0
+ %vb = shufflevector <2 x i8> %elt.head, <2 x i8> poison, <2 x i32> zeroinitializer
+ %v = call <2 x i8> @llvm.vp.sadd.sat.v2i8(<2 x i8> %va, <2 x i8> %vb, <2 x i1> %m, i32 %evl)
+ ret <2 x i8> %v
+}
+
+define <2 x i8> @vsadd_vx_v2i8_unmasked(<2 x i8> %va, i8 %b, i32 zeroext %evl) {
+; CHECK-LABEL: vsadd_vx_v2i8_unmasked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, ma
+; CHECK-NEXT: vsadd.vx v8, v8, a0
+; CHECK-NEXT: ret
+ %elt.head = insertelement <2 x i8> poison, i8 %b, i32 0
+ %vb = shufflevector <2 x i8> %elt.head, <2 x i8> poison, <2 x i32> zeroinitializer
+ %head = insertelement <2 x i1> poison, i1 true, i32 0
+ %m = shufflevector <2 x i1> %head, <2 x i1> poison, <2 x i32> zeroinitializer
+ %v = call <2 x i8> @llvm.vp.sadd.sat.v2i8(<2 x i8> %va, <2 x i8> %vb, <2 x i1> %m, i32 %evl)
+ ret <2 x i8> %v
+}
+
+define <2 x i8> @vsadd_vi_v2i8(<2 x i8> %va, <2 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vsadd_vi_v2i8:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e8, mf8, ta, ma
+; CHECK-NEXT: vsadd.vi v8, v8, -1, v0.t
+; CHECK-NEXT: ret
+ %elt.head = insertelement <2 x i8> poison, i8 -1, i32 0
+ %vb = shufflevector <2 x i8> %elt.head, <2 x i8> poison, <2 x i32> zeroinitializer
+ %v = call <2 x i8> @llvm.vp.sadd.sat.v2i8(<2 x i8> %va, <2 x i8> %vb, <2 x i1> %m, i32 %evl)
+ ret <2 x i8> %v
+}
+
+define <2 x i8> @vsadd_vi_v2i8_unmasked(<2 x i8> %va, i32 zeroext %evl) {
+; CHECK-LABEL: vsadd_vi_v2i8_unmasked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e8, mf8, ta, ma
+; CHECK-NEXT: vsadd.vi v8, v8, -1
+; CHECK-NEXT: ret
+ %elt.head = insertelement <2 x i8> poison, i8 -1, i32 0
+ %vb = shufflevector <2 x i8> %elt.head, <2 x i8> poison, <2 x i32> zeroinitializer
+ %head = insertelement <2 x i1> poison, i1 true, i32 0
+ %m = shufflevector <2 x i1> %head, <2 x i1> poison, <2 x i32> zeroinitializer
+ %v = call <2 x i8> @llvm.vp.sadd.sat.v2i8(<2 x i8> %va, <2 x i8> %vb, <2 x i1> %m, i32 %evl)
+ ret <2 x i8> %v
+}
+
+declare <4 x i8> @llvm.vp.sadd.sat.v4i8(<4 x i8>, <4 x i8>, <4 x i1>, i32)
+
+define <4 x i8> @vsadd_vv_v4i8(<4 x i8> %va, <4 x i8> %b, <4 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vsadd_vv_v4i8:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e8, mf4, ta, ma
+; CHECK-NEXT: vsadd.vv v8, v8, v9, v0.t
+; CHECK-NEXT: ret
+ %v = call <4 x i8> @llvm.vp.sadd.sat.v4i8(<4 x i8> %va, <4 x i8> %b, <4 x i1> %m, i32 %evl)
+ ret <4 x i8> %v
+}
+
+define <4 x i8> @vsadd_vv_v4i8_unmasked(<4 x i8> %va, <4 x i8> %b, i32 zeroext %evl) {
+; CHECK-LABEL: vsadd_vv_v4i8_unmasked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e8, mf4, ta, ma
+; CHECK-NEXT: vsadd.vv v8, v8, v9
+; CHECK-NEXT: ret
+ %head = insertelement <4 x i1> poison, i1 true, i32 0
+ %m = shufflevector <4 x i1> %head, <4 x i1> poison, <4 x i32> zeroinitializer
+ %v = call <4 x i8> @llvm.vp.sadd.sat.v4i8(<4 x i8> %va, <4 x i8> %b, <4 x i1> %m, i32 %evl)
+ ret <4 x i8> %v
+}
+
+define <4 x i8> @vsadd_vx_v4i8(<4 x i8> %va, i8 %b, <4 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vsadd_vx_v4i8:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, ma
+; CHECK-NEXT: vsadd.vx v8, v8, a0, v0.t
+; CHECK-NEXT: ret
+ %elt.head = insertelement <4 x i8> poison, i8 %b, i32 0
+ %vb = shufflevector <4 x i8> %elt.head, <4 x i8> poison, <4 x i32> zeroinitializer
+ %v = call <4 x i8> @llvm.vp.sadd.sat.v4i8(<4 x i8> %va, <4 x i8> %vb, <4 x i1> %m, i32 %evl)
+ ret <4 x i8> %v
+}
+
+define <4 x i8> @vsadd_vx_v4i8_commute(<4 x i8> %va, i8 %b, <4 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vsadd_vx_v4i8_commute:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, ma
+; CHECK-NEXT: vsadd.vx v8, v8, a0, v0.t
+; CHECK-NEXT: ret
+ %elt.head = insertelement <4 x i8> poison, i8 %b, i32 0
+ %vb = shufflevector <4 x i8> %elt.head, <4 x i8> poison, <4 x i32> zeroinitializer
+ %v = call <4 x i8> @llvm.vp.sadd.sat.v4i8(<4 x i8> %vb, <4 x i8> %va, <4 x i1> %m, i32 %evl)
+ ret <4 x i8> %v
+}
+
+define <4 x i8> @vsadd_vx_v4i8_unmasked(<4 x i8> %va, i8 %b, i32 zeroext %evl) {
+; CHECK-LABEL: vsadd_vx_v4i8_unmasked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, ma
+; CHECK-NEXT: vsadd.vx v8, v8, a0
+; CHECK-NEXT: ret
+ %elt.head = insertelement <4 x i8> poison, i8 %b, i32 0
+ %vb = shufflevector <4 x i8> %elt.head, <4 x i8> poison, <4 x i32> zeroinitializer
+ %head = insertelement <4 x i1> poison, i1 true, i32 0
+ %m = shufflevector <4 x i1> %head, <4 x i1> poison, <4 x i32> zeroinitializer
+ %v = call <4 x i8> @llvm.vp.sadd.sat.v4i8(<4 x i8> %va, <4 x i8> %vb, <4 x i1> %m, i32 %evl)
+ ret <4 x i8> %v
+}
+
+define <4 x i8> @vsadd_vi_v4i8(<4 x i8> %va, <4 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vsadd_vi_v4i8:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e8, mf4, ta, ma
+; CHECK-NEXT: vsadd.vi v8, v8, -1, v0.t
+; CHECK-NEXT: ret
+ %elt.head = insertelement <4 x i8> poison, i8 -1, i32 0
+ %vb = shufflevector <4 x i8> %elt.head, <4 x i8> poison, <4 x i32> zeroinitializer
+ %v = call <4 x i8> @llvm.vp.sadd.sat.v4i8(<4 x i8> %va, <4 x i8> %vb, <4 x i1> %m, i32 %evl)
+ ret <4 x i8> %v
+}
+
+define <4 x i8> @vsadd_vi_v4i8_unmasked(<4 x i8> %va, i32 zeroext %evl) {
+; CHECK-LABEL: vsadd_vi_v4i8_unmasked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e8, mf4, ta, ma
+; CHECK-NEXT: vsadd.vi v8, v8, -1
+; CHECK-NEXT: ret
+ %elt.head = insertelement <4 x i8> poison, i8 -1, i32 0
+ %vb = shufflevector <4 x i8> %elt.head, <4 x i8> poison, <4 x i32> zeroinitializer
+ %head = insertelement <4 x i1> poison, i1 true, i32 0
+ %m = shufflevector <4 x i1> %head, <4 x i1> poison, <4 x i32> zeroinitializer
+ %v = call <4 x i8> @llvm.vp.sadd.sat.v4i8(<4 x i8> %va, <4 x i8> %vb, <4 x i1> %m, i32 %evl)
+ ret <4 x i8> %v
+}
+
+declare <5 x i8> @llvm.vp.sadd.sat.v5i8(<5 x i8>, <5 x i8>, <5 x i1>, i32)
+
+define <5 x i8> @vsadd_vv_v5i8(<5 x i8> %va, <5 x i8> %b, <5 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vsadd_vv_v5i8:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e8, mf2, ta, ma
+; CHECK-NEXT: vsadd.vv v8, v8, v9, v0.t
+; CHECK-NEXT: ret
+ %v = call <5 x i8> @llvm.vp.sadd.sat.v5i8(<5 x i8> %va, <5 x i8> %b, <5 x i1> %m, i32 %evl)
+ ret <5 x i8> %v
+}
+
+define <5 x i8> @vsadd_vv_v5i8_unmasked(<5 x i8> %va, <5 x i8> %b, i32 zeroext %evl) {
+; CHECK-LABEL: vsadd_vv_v5i8_unmasked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e8, mf2, ta, ma
+; CHECK-NEXT: vsadd.vv v8, v8, v9
+; CHECK-NEXT: ret
+ %head = insertelement <5 x i1> poison, i1 true, i32 0
+ %m = shufflevector <5 x i1> %head, <5 x i1> poison, <5 x i32> zeroinitializer
+ %v = call <5 x i8> @llvm.vp.sadd.sat.v5i8(<5 x i8> %va, <5 x i8> %b, <5 x i1> %m, i32 %evl)
+ ret <5 x i8> %v
+}
+
+define <5 x i8> @vsadd_vx_v5i8(<5 x i8> %va, i8 %b, <5 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vsadd_vx_v5i8:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma
+; CHECK-NEXT: vsadd.vx v8, v8, a0, v0.t
+; CHECK-NEXT: ret
+ %elt.head = insertelement <5 x i8> poison, i8 %b, i32 0
+ %vb = shufflevector <5 x i8> %elt.head, <5 x i8> poison, <5 x i32> zeroinitializer
+ %v = call <5 x i8> @llvm.vp.sadd.sat.v5i8(<5 x i8> %va, <5 x i8> %vb, <5 x i1> %m, i32 %evl)
+ ret <5 x i8> %v
+}
+
+define <5 x i8> @vsadd_vx_v5i8_unmasked(<5 x i8> %va, i8 %b, i32 zeroext %evl) {
+; CHECK-LABEL: vsadd_vx_v5i8_unmasked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma
+; CHECK-NEXT: vsadd.vx v8, v8, a0
+; CHECK-NEXT: ret
+ %elt.head = insertelement <5 x i8> poison, i8 %b, i32 0
+ %vb = shufflevector <5 x i8> %elt.head, <5 x i8> poison, <5 x i32> zeroinitializer
+ %head = insertelement <5 x i1> poison, i1 true, i32 0
+ %m = shufflevector <5 x i1> %head, <5 x i1> poison, <5 x i32> zeroinitializer
+ %v = call <5 x i8> @llvm.vp.sadd.sat.v5i8(<5 x i8> %va, <5 x i8> %vb, <5 x i1> %m, i32 %evl)
+ ret <5 x i8> %v
+}
+
+define <5 x i8> @vsadd_vi_v5i8(<5 x i8> %va, <5 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vsadd_vi_v5i8:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e8, mf2, ta, ma
+; CHECK-NEXT: vsadd.vi v8, v8, -1, v0.t
+; CHECK-NEXT: ret
+ %elt.head = insertelement <5 x i8> poison, i8 -1, i32 0
+ %vb = shufflevector <5 x i8> %elt.head, <5 x i8> poison, <5 x i32> zeroinitializer
+ %v = call <5 x i8> @llvm.vp.sadd.sat.v5i8(<5 x i8> %va, <5 x i8> %vb, <5 x i1> %m, i32 %evl)
+ ret <5 x i8> %v
+}
+
+define <5 x i8> @vsadd_vi_v5i8_unmasked(<5 x i8> %va, i32 zeroext %evl) {
+; CHECK-LABEL: vsadd_vi_v5i8_unmasked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e8, mf2, ta, ma
+; CHECK-NEXT: vsadd.vi v8, v8, -1
+; CHECK-NEXT: ret
+ %elt.head = insertelement <5 x i8> poison, i8 -1, i32 0
+ %vb = shufflevector <5 x i8> %elt.head, <5 x i8> poison, <5 x i32> zeroinitializer
+ %head = insertelement <5 x i1> poison, i1 true, i32 0
+ %m = shufflevector <5 x i1> %head, <5 x i1> poison, <5 x i32> zeroinitializer
+ %v = call <5 x i8> @llvm.vp.sadd.sat.v5i8(<5 x i8> %va, <5 x i8> %vb, <5 x i1> %m, i32 %evl)
+ ret <5 x i8> %v
+}
+
+declare <8 x i8> @llvm.vp.sadd.sat.v8i8(<8 x i8>, <8 x i8>, <8 x i1>, i32)
+
+define <8 x i8> @vsadd_vv_v8i8(<8 x i8> %va, <8 x i8> %b, <8 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vsadd_vv_v8i8:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e8, mf2, ta, ma
+; CHECK-NEXT: vsadd.vv v8, v8, v9, v0.t
+; CHECK-NEXT: ret
+ %v = call <8 x i8> @llvm.vp.sadd.sat.v8i8(<8 x i8> %va, <8 x i8> %b, <8 x i1> %m, i32 %evl)
+ ret <8 x i8> %v
+}
+
+define <8 x i8> @vsadd_vv_v8i8_unmasked(<8 x i8> %va, <8 x i8> %b, i32 zeroext %evl) {
+; CHECK-LABEL: vsadd_vv_v8i8_unmasked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e8, mf2, ta, ma
+; CHECK-NEXT: vsadd.vv v8, v8, v9
+; CHECK-NEXT: ret
+ %head = insertelement <8 x i1> poison, i1 true, i32 0
+ %m = shufflevector <8 x i1> %head, <8 x i1> poison, <8 x i32> zeroinitializer
+ %v = call <8 x i8> @llvm.vp.sadd.sat.v8i8(<8 x i8> %va, <8 x i8> %b, <8 x i1> %m, i32 %evl)
+ ret <8 x i8> %v
+}
+
+define <8 x i8> @vsadd_vx_v8i8(<8 x i8> %va, i8 %b, <8 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vsadd_vx_v8i8:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma
+; CHECK-NEXT: vsadd.vx v8, v8, a0, v0.t
+; CHECK-NEXT: ret
+ %elt.head = insertelement <8 x i8> poison, i8 %b, i32 0
+ %vb = shufflevector <8 x i8> %elt.head, <8 x i8> poison, <8 x i32> zeroinitializer
+ %v = call <8 x i8> @llvm.vp.sadd.sat.v8i8(<8 x i8> %va, <8 x i8> %vb, <8 x i1> %m, i32 %evl)
+ ret <8 x i8> %v
+}
+
+define <8 x i8> @vsadd_vx_v8i8_unmasked(<8 x i8> %va, i8 %b, i32 zeroext %evl) {
+; CHECK-LABEL: vsadd_vx_v8i8_unmasked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma
+; CHECK-NEXT: vsadd.vx v8, v8, a0
+; CHECK-NEXT: ret
+ %elt.head = insertelement <8 x i8> poison, i8 %b, i32 0
+ %vb = shufflevector <8 x i8> %elt.head, <8 x i8> poison, <8 x i32> zeroinitializer
+ %head = insertelement <8 x i1> poison, i1 true, i32 0
+ %m = shufflevector <8 x i1> %head, <8 x i1> poison, <8 x i32> zeroinitializer
+ %v = call <8 x i8> @llvm.vp.sadd.sat.v8i8(<8 x i8> %va, <8 x i8> %vb, <8 x i1> %m, i32 %evl)
+ ret <8 x i8> %v
+}
+
+define <8 x i8> @vsadd_vi_v8i8(<8 x i8> %va, <8 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vsadd_vi_v8i8:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e8, mf2, ta, ma
+; CHECK-NEXT: vsadd.vi v8, v8, -1, v0.t
+; CHECK-NEXT: ret
+ %elt.head = insertelement <8 x i8> poison, i8 -1, i32 0
+ %vb = shufflevector <8 x i8> %elt.head, <8 x i8> poison, <8 x i32> zeroinitializer
+ %v = call <8 x i8> @llvm.vp.sadd.sat.v8i8(<8 x i8> %va, <8 x i8> %vb, <8 x i1> %m, i32 %evl)
+ ret <8 x i8> %v
+}
+
+define <8 x i8> @vsadd_vi_v8i8_unmasked(<8 x i8> %va, i32 zeroext %evl) {
+; CHECK-LABEL: vsadd_vi_v8i8_unmasked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e8, mf2, ta, ma
+; CHECK-NEXT: vsadd.vi v8, v8, -1
+; CHECK-NEXT: ret
+ %elt.head = insertelement <8 x i8> poison, i8 -1, i32 0
+ %vb = shufflevector <8 x i8> %elt.head, <8 x i8> poison, <8 x i32> zeroinitializer
+ %head = insertelement <8 x i1> poison, i1 true, i32 0
+ %m = shufflevector <8 x i1> %head, <8 x i1> poison, <8 x i32> zeroinitializer
+ %v = call <8 x i8> @llvm.vp.sadd.sat.v8i8(<8 x i8> %va, <8 x i8> %vb, <8 x i1> %m, i32 %evl)
+ ret <8 x i8> %v
+}
+
+declare <16 x i8> @llvm.vp.sadd.sat.v16i8(<16 x i8>, <16 x i8>, <16 x i1>, i32)
+
+define <16 x i8> @vsadd_vv_v16i8(<16 x i8> %va, <16 x i8> %b, <16 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vsadd_vv_v16i8:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e8, m1, ta, ma
+; CHECK-NEXT: vsadd.vv v8, v8, v9, v0.t
+; CHECK-NEXT: ret
+ %v = call <16 x i8> @llvm.vp.sadd.sat.v16i8(<16 x i8> %va, <16 x i8> %b, <16 x i1> %m, i32 %evl)
+ ret <16 x i8> %v
+}
+
+define <16 x i8> @vsadd_vv_v16i8_unmasked(<16 x i8> %va, <16 x i8> %b, i32 zeroext %evl) {
+; CHECK-LABEL: vsadd_vv_v16i8_unmasked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e8, m1, ta, ma
+; CHECK-NEXT: vsadd.vv v8, v8, v9
+; CHECK-NEXT: ret
+ %head = insertelement <16 x i1> poison, i1 true, i32 0
+ %m = shufflevector <16 x i1> %head, <16 x i1> poison, <16 x i32> zeroinitializer
+ %v = call <16 x i8> @llvm.vp.sadd.sat.v16i8(<16 x i8> %va, <16 x i8> %b, <16 x i1> %m, i32 %evl)
+ ret <16 x i8> %v
+}
+
+define <16 x i8> @vsadd_vx_v16i8(<16 x i8> %va, i8 %b, <16 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vsadd_vx_v16i8:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, ma
+; CHECK-NEXT: vsadd.vx v8, v8, a0, v0.t
+; CHECK-NEXT: ret
+ %elt.head = insertelement <16 x i8> poison, i8 %b, i32 0
+ %vb = shufflevector <16 x i8> %elt.head, <16 x i8> poison, <16 x i32> zeroinitializer
+ %v = call <16 x i8> @llvm.vp.sadd.sat.v16i8(<16 x i8> %va, <16 x i8> %vb, <16 x i1> %m, i32 %evl)
+ ret <16 x i8> %v
+}
+
+define <16 x i8> @vsadd_vx_v16i8_unmasked(<16 x i8> %va, i8 %b, i32 zeroext %evl) {
+; CHECK-LABEL: vsadd_vx_v16i8_unmasked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, ma
+; CHECK-NEXT: vsadd.vx v8, v8, a0
+; CHECK-NEXT: ret
+ %elt.head = insertelement <16 x i8> poison, i8 %b, i32 0
+ %vb = shufflevector <16 x i8> %elt.head, <16 x i8> poison, <16 x i32> zeroinitializer
+ %head = insertelement <16 x i1> poison, i1 true, i32 0
+ %m = shufflevector <16 x i1> %head, <16 x i1> poison, <16 x i32> zeroinitializer
+ %v = call <16 x i8> @llvm.vp.sadd.sat.v16i8(<16 x i8> %va, <16 x i8> %vb, <16 x i1> %m, i32 %evl)
+ ret <16 x i8> %v
+}
+
+define <16 x i8> @vsadd_vi_v16i8(<16 x i8> %va, <16 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vsadd_vi_v16i8:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e8, m1, ta, ma
+; CHECK-NEXT: vsadd.vi v8, v8, -1, v0.t
+; CHECK-NEXT: ret
+ %elt.head = insertelement <16 x i8> poison, i8 -1, i32 0
+ %vb = shufflevector <16 x i8> %elt.head, <16 x i8> poison, <16 x i32> zeroinitializer
+ %v = call <16 x i8> @llvm.vp.sadd.sat.v16i8(<16 x i8> %va, <16 x i8> %vb, <16 x i1> %m, i32 %evl)
+ ret <16 x i8> %v
+}
+
+define <16 x i8> @vsadd_vi_v16i8_unmasked(<16 x i8> %va, i32 zeroext %evl) {
+; CHECK-LABEL: vsadd_vi_v16i8_unmasked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e8, m1, ta, ma
+; CHECK-NEXT: vsadd.vi v8, v8, -1
+; CHECK-NEXT: ret
+ %elt.head = insertelement <16 x i8> poison, i8 -1, i32 0
+ %vb = shufflevector <16 x i8> %elt.head, <16 x i8> poison, <16 x i32> zeroinitializer
+ %head = insertelement <16 x i1> poison, i1 true, i32 0
+ %m = shufflevector <16 x i1> %head, <16 x i1> poison, <16 x i32> zeroinitializer
+ %v = call <16 x i8> @llvm.vp.sadd.sat.v16i8(<16 x i8> %va, <16 x i8> %vb, <16 x i1> %m, i32 %evl)
+ ret <16 x i8> %v
+}
+
+declare <256 x i8> @llvm.vp.sadd.sat.v258i8(<256 x i8>, <256 x i8>, <256 x i1>, i32)
+
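+; <256 x i8> is wider than the guaranteed m8 register group, so the VP operation
+; below is expected to be split into two m8 halves. In the generated code the EVL
+; for the upper half appears to be computed branchlessly as max(evl - 128, 0) by
+; the sltu/addi/and sequence, and the lower half's EVL is clamped to 128 by the
+; bltu branch.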
+define <256 x i8> @vsadd_vi_v258i8(<256 x i8> %va, <256 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vsadd_vi_v258i8:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmv1r.v v24, v0
+; CHECK-NEXT: li a2, 128
+; CHECK-NEXT: vsetvli zero, a2, e8, m8, ta, ma
+; CHECK-NEXT: vlm.v v0, (a0)
+; CHECK-NEXT: addi a0, a1, -128
+; CHECK-NEXT: sltu a3, a1, a0
+; CHECK-NEXT: addi a3, a3, -1
+; CHECK-NEXT: and a0, a3, a0
+; CHECK-NEXT: vsetvli zero, a0, e8, m8, ta, ma
+; CHECK-NEXT: vsadd.vi v16, v16, -1, v0.t
+; CHECK-NEXT: bltu a1, a2, .LBB32_2
+; CHECK-NEXT: # %bb.1:
+; CHECK-NEXT: li a1, 128
+; CHECK-NEXT: .LBB32_2:
+; CHECK-NEXT: vsetvli zero, a1, e8, m8, ta, ma
+; CHECK-NEXT: vmv1r.v v0, v24
+; CHECK-NEXT: vsadd.vi v8, v8, -1, v0.t
+; CHECK-NEXT: ret
+ %elt.head = insertelement <256 x i8> poison, i8 -1, i32 0
+ %vb = shufflevector <256 x i8> %elt.head, <256 x i8> poison, <256 x i32> zeroinitializer
+ %v = call <256 x i8> @llvm.vp.sadd.sat.v258i8(<256 x i8> %va, <256 x i8> %vb, <256 x i1> %m, i32 %evl)
+ ret <256 x i8> %v
+}
+
+define <256 x i8> @vsadd_vi_v258i8_unmasked(<256 x i8> %va, i32 zeroext %evl) {
+; CHECK-LABEL: vsadd_vi_v258i8_unmasked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: li a2, 128
+; CHECK-NEXT: mv a1, a0
+; CHECK-NEXT: bltu a0, a2, .LBB33_2
+; CHECK-NEXT: # %bb.1:
+; CHECK-NEXT: li a1, 128
+; CHECK-NEXT: .LBB33_2:
+; CHECK-NEXT: vsetvli zero, a1, e8, m8, ta, ma
+; CHECK-NEXT: vsadd.vi v8, v8, -1
+; CHECK-NEXT: addi a1, a0, -128
+; CHECK-NEXT: sltu a0, a0, a1
+; CHECK-NEXT: addi a0, a0, -1
+; CHECK-NEXT: and a0, a0, a1
+; CHECK-NEXT: vsetvli zero, a0, e8, m8, ta, ma
+; CHECK-NEXT: vsadd.vi v16, v16, -1
+; CHECK-NEXT: ret
+ %elt.head = insertelement <256 x i8> poison, i8 -1, i32 0
+ %vb = shufflevector <256 x i8> %elt.head, <256 x i8> poison, <256 x i32> zeroinitializer
+ %head = insertelement <256 x i1> poison, i1 true, i32 0
+ %m = shufflevector <256 x i1> %head, <256 x i1> poison, <256 x i32> zeroinitializer
+ %v = call <256 x i8> @llvm.vp.sadd.sat.v258i8(<256 x i8> %va, <256 x i8> %vb, <256 x i1> %m, i32 %evl)
+ ret <256 x i8> %v
+}
+
+; Test splitting when the %evl is a known constant.
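+; With %evl = 129 the lower half gets EVL 128 and the upper half gets
+; 129 - 128 = 1 (the vsetivli zero, 1 below); the upper half of the <256 x i1>
+; mask is reloaded from memory with vlm.v before the second vsadd.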
+
+define <256 x i8> @vsadd_vi_v258i8_evl129(<256 x i8> %va, <256 x i1> %m) {
+; CHECK-LABEL: vsadd_vi_v258i8_evl129:
+; CHECK: # %bb.0:
+; CHECK-NEXT: li a1, 128
+; CHECK-NEXT: vsetvli zero, a1, e8, m8, ta, ma
+; CHECK-NEXT: vlm.v v24, (a0)
+; CHECK-NEXT: vsadd.vi v8, v8, -1, v0.t
+; CHECK-NEXT: vsetivli zero, 1, e8, m8, ta, ma
+; CHECK-NEXT: vmv1r.v v0, v24
+; CHECK-NEXT: vsadd.vi v16, v16, -1, v0.t
+; CHECK-NEXT: ret
+ %elt.head = insertelement <256 x i8> poison, i8 -1, i32 0
+ %vb = shufflevector <256 x i8> %elt.head, <256 x i8> poison, <256 x i32> zeroinitializer
+ %v = call <256 x i8> @llvm.vp.sadd.sat.v258i8(<256 x i8> %va, <256 x i8> %vb, <256 x i1> %m, i32 129)
+ ret <256 x i8> %v
+}
+
+; FIXME: The upper half does nothing: its vsadd runs with an EVL of 0 and could
+; be removed.
+
+define <256 x i8> @vsadd_vi_v258i8_evl128(<256 x i8> %va, <256 x i1> %m) {
+; CHECK-LABEL: vsadd_vi_v258i8_evl128:
+; CHECK: # %bb.0:
+; CHECK-NEXT: li a1, 128
+; CHECK-NEXT: vsetvli zero, a1, e8, m8, ta, ma
+; CHECK-NEXT: vlm.v v24, (a0)
+; CHECK-NEXT: vsadd.vi v8, v8, -1, v0.t
+; CHECK-NEXT: vsetivli zero, 0, e8, m8, ta, ma
+; CHECK-NEXT: vmv1r.v v0, v24
+; CHECK-NEXT: vsadd.vi v16, v16, -1, v0.t
+; CHECK-NEXT: ret
+ %elt.head = insertelement <256 x i8> poison, i8 -1, i32 0
+ %vb = shufflevector <256 x i8> %elt.head, <256 x i8> poison, <256 x i32> zeroinitializer
+ %v = call <256 x i8> @llvm.vp.sadd.sat.v258i8(<256 x i8> %va, <256 x i8> %vb, <256 x i1> %m, i32 128)
+ ret <256 x i8> %v
+}
+
+declare <2 x i16> @llvm.vp.sadd.sat.v2i16(<2 x i16>, <2 x i16>, <2 x i1>, i32)
+
+define <2 x i16> @vsadd_vv_v2i16(<2 x i16> %va, <2 x i16> %b, <2 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vsadd_vv_v2i16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e16, mf4, ta, ma
+; CHECK-NEXT: vsadd.vv v8, v8, v9, v0.t
+; CHECK-NEXT: ret
+ %v = call <2 x i16> @llvm.vp.sadd.sat.v2i16(<2 x i16> %va, <2 x i16> %b, <2 x i1> %m, i32 %evl)
+ ret <2 x i16> %v
+}
+
+define <2 x i16> @vsadd_vv_v2i16_unmasked(<2 x i16> %va, <2 x i16> %b, i32 zeroext %evl) {
+; CHECK-LABEL: vsadd_vv_v2i16_unmasked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e16, mf4, ta, ma
+; CHECK-NEXT: vsadd.vv v8, v8, v9
+; CHECK-NEXT: ret
+ %head = insertelement <2 x i1> poison, i1 true, i32 0
+ %m = shufflevector <2 x i1> %head, <2 x i1> poison, <2 x i32> zeroinitializer
+ %v = call <2 x i16> @llvm.vp.sadd.sat.v2i16(<2 x i16> %va, <2 x i16> %b, <2 x i1> %m, i32 %evl)
+ ret <2 x i16> %v
+}
+
+define <2 x i16> @vsadd_vx_v2i16(<2 x i16> %va, i16 %b, <2 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vsadd_vx_v2i16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma
+; CHECK-NEXT: vsadd.vx v8, v8, a0, v0.t
+; CHECK-NEXT: ret
+ %elt.head = insertelement <2 x i16> poison, i16 %b, i32 0
+ %vb = shufflevector <2 x i16> %elt.head, <2 x i16> poison, <2 x i32> zeroinitializer
+ %v = call <2 x i16> @llvm.vp.sadd.sat.v2i16(<2 x i16> %va, <2 x i16> %vb, <2 x i1> %m, i32 %evl)
+ ret <2 x i16> %v
+}
+
+define <2 x i16> @vsadd_vx_v2i16_unmasked(<2 x i16> %va, i16 %b, i32 zeroext %evl) {
+; CHECK-LABEL: vsadd_vx_v2i16_unmasked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma
+; CHECK-NEXT: vsadd.vx v8, v8, a0
+; CHECK-NEXT: ret
+ %elt.head = insertelement <2 x i16> poison, i16 %b, i32 0
+ %vb = shufflevector <2 x i16> %elt.head, <2 x i16> poison, <2 x i32> zeroinitializer
+ %head = insertelement <2 x i1> poison, i1 true, i32 0
+ %m = shufflevector <2 x i1> %head, <2 x i1> poison, <2 x i32> zeroinitializer
+ %v = call <2 x i16> @llvm.vp.sadd.sat.v2i16(<2 x i16> %va, <2 x i16> %vb, <2 x i1> %m, i32 %evl)
+ ret <2 x i16> %v
+}
+
+define <2 x i16> @vsadd_vi_v2i16(<2 x i16> %va, <2 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vsadd_vi_v2i16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e16, mf4, ta, ma
+; CHECK-NEXT: vsadd.vi v8, v8, -1, v0.t
+; CHECK-NEXT: ret
+ %elt.head = insertelement <2 x i16> poison, i16 -1, i32 0
+ %vb = shufflevector <2 x i16> %elt.head, <2 x i16> poison, <2 x i32> zeroinitializer
+ %v = call <2 x i16> @llvm.vp.sadd.sat.v2i16(<2 x i16> %va, <2 x i16> %vb, <2 x i1> %m, i32 %evl)
+ ret <2 x i16> %v
+}
+
+define <2 x i16> @vsadd_vi_v2i16_unmasked(<2 x i16> %va, i32 zeroext %evl) {
+; CHECK-LABEL: vsadd_vi_v2i16_unmasked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e16, mf4, ta, ma
+; CHECK-NEXT: vsadd.vi v8, v8, -1
+; CHECK-NEXT: ret
+ %elt.head = insertelement <2 x i16> poison, i16 -1, i32 0
+ %vb = shufflevector <2 x i16> %elt.head, <2 x i16> poison, <2 x i32> zeroinitializer
+ %head = insertelement <2 x i1> poison, i1 true, i32 0
+ %m = shufflevector <2 x i1> %head, <2 x i1> poison, <2 x i32> zeroinitializer
+ %v = call <2 x i16> @llvm.vp.sadd.sat.v2i16(<2 x i16> %va, <2 x i16> %vb, <2 x i1> %m, i32 %evl)
+ ret <2 x i16> %v
+}
+
+declare <4 x i16> @llvm.vp.sadd.sat.v4i16(<4 x i16>, <4 x i16>, <4 x i1>, i32)
+
+define <4 x i16> @vsadd_vv_v4i16(<4 x i16> %va, <4 x i16> %b, <4 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vsadd_vv_v4i16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e16, mf2, ta, ma
+; CHECK-NEXT: vsadd.vv v8, v8, v9, v0.t
+; CHECK-NEXT: ret
+ %v = call <4 x i16> @llvm.vp.sadd.sat.v4i16(<4 x i16> %va, <4 x i16> %b, <4 x i1> %m, i32 %evl)
+ ret <4 x i16> %v
+}
+
+define <4 x i16> @vsadd_vv_v4i16_unmasked(<4 x i16> %va, <4 x i16> %b, i32 zeroext %evl) {
+; CHECK-LABEL: vsadd_vv_v4i16_unmasked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e16, mf2, ta, ma
+; CHECK-NEXT: vsadd.vv v8, v8, v9
+; CHECK-NEXT: ret
+ %head = insertelement <4 x i1> poison, i1 true, i32 0
+ %m = shufflevector <4 x i1> %head, <4 x i1> poison, <4 x i32> zeroinitializer
+ %v = call <4 x i16> @llvm.vp.sadd.sat.v4i16(<4 x i16> %va, <4 x i16> %b, <4 x i1> %m, i32 %evl)
+ ret <4 x i16> %v
+}
+
+define <4 x i16> @vsadd_vx_v4i16(<4 x i16> %va, i16 %b, <4 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vsadd_vx_v4i16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma
+; CHECK-NEXT: vsadd.vx v8, v8, a0, v0.t
+; CHECK-NEXT: ret
+ %elt.head = insertelement <4 x i16> poison, i16 %b, i32 0
+ %vb = shufflevector <4 x i16> %elt.head, <4 x i16> poison, <4 x i32> zeroinitializer
+ %v = call <4 x i16> @llvm.vp.sadd.sat.v4i16(<4 x i16> %va, <4 x i16> %vb, <4 x i1> %m, i32 %evl)
+ ret <4 x i16> %v
+}
+
+define <4 x i16> @vsadd_vx_v4i16_unmasked(<4 x i16> %va, i16 %b, i32 zeroext %evl) {
+; CHECK-LABEL: vsadd_vx_v4i16_unmasked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma
+; CHECK-NEXT: vsadd.vx v8, v8, a0
+; CHECK-NEXT: ret
+ %elt.head = insertelement <4 x i16> poison, i16 %b, i32 0
+ %vb = shufflevector <4 x i16> %elt.head, <4 x i16> poison, <4 x i32> zeroinitializer
+ %head = insertelement <4 x i1> poison, i1 true, i32 0
+ %m = shufflevector <4 x i1> %head, <4 x i1> poison, <4 x i32> zeroinitializer
+ %v = call <4 x i16> @llvm.vp.sadd.sat.v4i16(<4 x i16> %va, <4 x i16> %vb, <4 x i1> %m, i32 %evl)
+ ret <4 x i16> %v
+}
+
+define <4 x i16> @vsadd_vi_v4i16(<4 x i16> %va, <4 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vsadd_vi_v4i16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e16, mf2, ta, ma
+; CHECK-NEXT: vsadd.vi v8, v8, -1, v0.t
+; CHECK-NEXT: ret
+ %elt.head = insertelement <4 x i16> poison, i16 -1, i32 0
+ %vb = shufflevector <4 x i16> %elt.head, <4 x i16> poison, <4 x i32> zeroinitializer
+ %v = call <4 x i16> @llvm.vp.sadd.sat.v4i16(<4 x i16> %va, <4 x i16> %vb, <4 x i1> %m, i32 %evl)
+ ret <4 x i16> %v
+}
+
+define <4 x i16> @vsadd_vi_v4i16_unmasked(<4 x i16> %va, i32 zeroext %evl) {
+; CHECK-LABEL: vsadd_vi_v4i16_unmasked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e16, mf2, ta, ma
+; CHECK-NEXT: vsadd.vi v8, v8, -1
+; CHECK-NEXT: ret
+ %elt.head = insertelement <4 x i16> poison, i16 -1, i32 0
+ %vb = shufflevector <4 x i16> %elt.head, <4 x i16> poison, <4 x i32> zeroinitializer
+ %head = insertelement <4 x i1> poison, i1 true, i32 0
+ %m = shufflevector <4 x i1> %head, <4 x i1> poison, <4 x i32> zeroinitializer
+ %v = call <4 x i16> @llvm.vp.sadd.sat.v4i16(<4 x i16> %va, <4 x i16> %vb, <4 x i1> %m, i32 %evl)
+ ret <4 x i16> %v
+}
+
+declare <8 x i16> @llvm.vp.sadd.sat.v8i16(<8 x i16>, <8 x i16>, <8 x i1>, i32)
+
+define <8 x i16> @vsadd_vv_v8i16(<8 x i16> %va, <8 x i16> %b, <8 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vsadd_vv_v8i16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, ma
+; CHECK-NEXT: vsadd.vv v8, v8, v9, v0.t
+; CHECK-NEXT: ret
+ %v = call <8 x i16> @llvm.vp.sadd.sat.v8i16(<8 x i16> %va, <8 x i16> %b, <8 x i1> %m, i32 %evl)
+ ret <8 x i16> %v
+}
+
+define <8 x i16> @vsadd_vv_v8i16_unmasked(<8 x i16> %va, <8 x i16> %b, i32 zeroext %evl) {
+; CHECK-LABEL: vsadd_vv_v8i16_unmasked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, ma
+; CHECK-NEXT: vsadd.vv v8, v8, v9
+; CHECK-NEXT: ret
+ %head = insertelement <8 x i1> poison, i1 true, i32 0
+ %m = shufflevector <8 x i1> %head, <8 x i1> poison, <8 x i32> zeroinitializer
+ %v = call <8 x i16> @llvm.vp.sadd.sat.v8i16(<8 x i16> %va, <8 x i16> %b, <8 x i1> %m, i32 %evl)
+ ret <8 x i16> %v
+}
+
+define <8 x i16> @vsadd_vx_v8i16(<8 x i16> %va, i16 %b, <8 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vsadd_vx_v8i16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma
+; CHECK-NEXT: vsadd.vx v8, v8, a0, v0.t
+; CHECK-NEXT: ret
+ %elt.head = insertelement <8 x i16> poison, i16 %b, i32 0
+ %vb = shufflevector <8 x i16> %elt.head, <8 x i16> poison, <8 x i32> zeroinitializer
+ %v = call <8 x i16> @llvm.vp.sadd.sat.v8i16(<8 x i16> %va, <8 x i16> %vb, <8 x i1> %m, i32 %evl)
+ ret <8 x i16> %v
+}
+
+define <8 x i16> @vsadd_vx_v8i16_unmasked(<8 x i16> %va, i16 %b, i32 zeroext %evl) {
+; CHECK-LABEL: vsadd_vx_v8i16_unmasked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma
+; CHECK-NEXT: vsadd.vx v8, v8, a0
+; CHECK-NEXT: ret
+ %elt.head = insertelement <8 x i16> poison, i16 %b, i32 0
+ %vb = shufflevector <8 x i16> %elt.head, <8 x i16> poison, <8 x i32> zeroinitializer
+ %head = insertelement <8 x i1> poison, i1 true, i32 0
+ %m = shufflevector <8 x i1> %head, <8 x i1> poison, <8 x i32> zeroinitializer
+ %v = call <8 x i16> @llvm.vp.sadd.sat.v8i16(<8 x i16> %va, <8 x i16> %vb, <8 x i1> %m, i32 %evl)
+ ret <8 x i16> %v
+}
+
+define <8 x i16> @vsadd_vi_v8i16(<8 x i16> %va, <8 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vsadd_vi_v8i16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, ma
+; CHECK-NEXT: vsadd.vi v8, v8, -1, v0.t
+; CHECK-NEXT: ret
+ %elt.head = insertelement <8 x i16> poison, i16 -1, i32 0
+ %vb = shufflevector <8 x i16> %elt.head, <8 x i16> poison, <8 x i32> zeroinitializer
+ %v = call <8 x i16> @llvm.vp.sadd.sat.v8i16(<8 x i16> %va, <8 x i16> %vb, <8 x i1> %m, i32 %evl)
+ ret <8 x i16> %v
+}
+
+define <8 x i16> @vsadd_vi_v8i16_unmasked(<8 x i16> %va, i32 zeroext %evl) {
+; CHECK-LABEL: vsadd_vi_v8i16_unmasked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, ma
+; CHECK-NEXT: vsadd.vi v8, v8, -1
+; CHECK-NEXT: ret
+ %elt.head = insertelement <8 x i16> poison, i16 -1, i32 0
+ %vb = shufflevector <8 x i16> %elt.head, <8 x i16> poison, <8 x i32> zeroinitializer
+ %head = insertelement <8 x i1> poison, i1 true, i32 0
+ %m = shufflevector <8 x i1> %head, <8 x i1> poison, <8 x i32> zeroinitializer
+ %v = call <8 x i16> @llvm.vp.sadd.sat.v8i16(<8 x i16> %va, <8 x i16> %vb, <8 x i1> %m, i32 %evl)
+ ret <8 x i16> %v
+}
+
+declare <16 x i16> @llvm.vp.sadd.sat.v16i16(<16 x i16>, <16 x i16>, <16 x i1>, i32)
+
+define <16 x i16> @vsadd_vv_v16i16(<16 x i16> %va, <16 x i16> %b, <16 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vsadd_vv_v16i16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e16, m2, ta, ma
+; CHECK-NEXT: vsadd.vv v8, v8, v10, v0.t
+; CHECK-NEXT: ret
+ %v = call <16 x i16> @llvm.vp.sadd.sat.v16i16(<16 x i16> %va, <16 x i16> %b, <16 x i1> %m, i32 %evl)
+ ret <16 x i16> %v
+}
+
+define <16 x i16> @vsadd_vv_v16i16_unmasked(<16 x i16> %va, <16 x i16> %b, i32 zeroext %evl) {
+; CHECK-LABEL: vsadd_vv_v16i16_unmasked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e16, m2, ta, ma
+; CHECK-NEXT: vsadd.vv v8, v8, v10
+; CHECK-NEXT: ret
+ %head = insertelement <16 x i1> poison, i1 true, i32 0
+ %m = shufflevector <16 x i1> %head, <16 x i1> poison, <16 x i32> zeroinitializer
+ %v = call <16 x i16> @llvm.vp.sadd.sat.v16i16(<16 x i16> %va, <16 x i16> %b, <16 x i1> %m, i32 %evl)
+ ret <16 x i16> %v
+}
+
+define <16 x i16> @vsadd_vx_v16i16(<16 x i16> %va, i16 %b, <16 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vsadd_vx_v16i16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, ma
+; CHECK-NEXT: vsadd.vx v8, v8, a0, v0.t
+; CHECK-NEXT: ret
+ %elt.head = insertelement <16 x i16> poison, i16 %b, i32 0
+ %vb = shufflevector <16 x i16> %elt.head, <16 x i16> poison, <16 x i32> zeroinitializer
+ %v = call <16 x i16> @llvm.vp.sadd.sat.v16i16(<16 x i16> %va, <16 x i16> %vb, <16 x i1> %m, i32 %evl)
+ ret <16 x i16> %v
+}
+
+define <16 x i16> @vsadd_vx_v16i16_unmasked(<16 x i16> %va, i16 %b, i32 zeroext %evl) {
+; CHECK-LABEL: vsadd_vx_v16i16_unmasked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, ma
+; CHECK-NEXT: vsadd.vx v8, v8, a0
+; CHECK-NEXT: ret
+ %elt.head = insertelement <16 x i16> poison, i16 %b, i32 0
+ %vb = shufflevector <16 x i16> %elt.head, <16 x i16> poison, <16 x i32> zeroinitializer
+ %head = insertelement <16 x i1> poison, i1 true, i32 0
+ %m = shufflevector <16 x i1> %head, <16 x i1> poison, <16 x i32> zeroinitializer
+ %v = call <16 x i16> @llvm.vp.sadd.sat.v16i16(<16 x i16> %va, <16 x i16> %vb, <16 x i1> %m, i32 %evl)
+ ret <16 x i16> %v
+}
+
+define <16 x i16> @vsadd_vi_v16i16(<16 x i16> %va, <16 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vsadd_vi_v16i16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e16, m2, ta, ma
+; CHECK-NEXT: vsadd.vi v8, v8, -1, v0.t
+; CHECK-NEXT: ret
+ %elt.head = insertelement <16 x i16> poison, i16 -1, i32 0
+ %vb = shufflevector <16 x i16> %elt.head, <16 x i16> poison, <16 x i32> zeroinitializer
+ %v = call <16 x i16> @llvm.vp.sadd.sat.v16i16(<16 x i16> %va, <16 x i16> %vb, <16 x i1> %m, i32 %evl)
+ ret <16 x i16> %v
+}
+
+define <16 x i16> @vsadd_vi_v16i16_unmasked(<16 x i16> %va, i32 zeroext %evl) {
+; CHECK-LABEL: vsadd_vi_v16i16_unmasked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e16, m2, ta, ma
+; CHECK-NEXT: vsadd.vi v8, v8, -1
+; CHECK-NEXT: ret
+ %elt.head = insertelement <16 x i16> poison, i16 -1, i32 0
+ %vb = shufflevector <16 x i16> %elt.head, <16 x i16> poison, <16 x i32> zeroinitializer
+ %head = insertelement <16 x i1> poison, i1 true, i32 0
+ %m = shufflevector <16 x i1> %head, <16 x i1> poison, <16 x i32> zeroinitializer
+ %v = call <16 x i16> @llvm.vp.sadd.sat.v16i16(<16 x i16> %va, <16 x i16> %vb, <16 x i1> %m, i32 %evl)
+ ret <16 x i16> %v
+}
+
+declare <2 x i32> @llvm.vp.sadd.sat.v2i32(<2 x i32>, <2 x i32>, <2 x i1>, i32)
+
+define <2 x i32> @vsadd_vv_v2i32(<2 x i32> %va, <2 x i32> %b, <2 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vsadd_vv_v2i32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, ma
+; CHECK-NEXT: vsadd.vv v8, v8, v9, v0.t
+; CHECK-NEXT: ret
+ %v = call <2 x i32> @llvm.vp.sadd.sat.v2i32(<2 x i32> %va, <2 x i32> %b, <2 x i1> %m, i32 %evl)
+ ret <2 x i32> %v
+}
+
+define <2 x i32> @vsadd_vv_v2i32_unmasked(<2 x i32> %va, <2 x i32> %b, i32 zeroext %evl) {
+; CHECK-LABEL: vsadd_vv_v2i32_unmasked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, ma
+; CHECK-NEXT: vsadd.vv v8, v8, v9
+; CHECK-NEXT: ret
+ %head = insertelement <2 x i1> poison, i1 true, i32 0
+ %m = shufflevector <2 x i1> %head, <2 x i1> poison, <2 x i32> zeroinitializer
+ %v = call <2 x i32> @llvm.vp.sadd.sat.v2i32(<2 x i32> %va, <2 x i32> %b, <2 x i1> %m, i32 %evl)
+ ret <2 x i32> %v
+}
+
+define <2 x i32> @vsadd_vx_v2i32(<2 x i32> %va, i32 %b, <2 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vsadd_vx_v2i32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma
+; CHECK-NEXT: vsadd.vx v8, v8, a0, v0.t
+; CHECK-NEXT: ret
+ %elt.head = insertelement <2 x i32> poison, i32 %b, i32 0
+ %vb = shufflevector <2 x i32> %elt.head, <2 x i32> poison, <2 x i32> zeroinitializer
+ %v = call <2 x i32> @llvm.vp.sadd.sat.v2i32(<2 x i32> %va, <2 x i32> %vb, <2 x i1> %m, i32 %evl)
+ ret <2 x i32> %v
+}
+
+define <2 x i32> @vsadd_vx_v2i32_unmasked(<2 x i32> %va, i32 %b, i32 zeroext %evl) {
+; CHECK-LABEL: vsadd_vx_v2i32_unmasked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma
+; CHECK-NEXT: vsadd.vx v8, v8, a0
+; CHECK-NEXT: ret
+ %elt.head = insertelement <2 x i32> poison, i32 %b, i32 0
+ %vb = shufflevector <2 x i32> %elt.head, <2 x i32> poison, <2 x i32> zeroinitializer
+ %head = insertelement <2 x i1> poison, i1 true, i32 0
+ %m = shufflevector <2 x i1> %head, <2 x i1> poison, <2 x i32> zeroinitializer
+ %v = call <2 x i32> @llvm.vp.sadd.sat.v2i32(<2 x i32> %va, <2 x i32> %vb, <2 x i1> %m, i32 %evl)
+ ret <2 x i32> %v
+}
+
+define <2 x i32> @vsadd_vi_v2i32(<2 x i32> %va, <2 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vsadd_vi_v2i32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, ma
+; CHECK-NEXT: vsadd.vi v8, v8, -1, v0.t
+; CHECK-NEXT: ret
+ %elt.head = insertelement <2 x i32> poison, i32 -1, i32 0
+ %vb = shufflevector <2 x i32> %elt.head, <2 x i32> poison, <2 x i32> zeroinitializer
+ %v = call <2 x i32> @llvm.vp.sadd.sat.v2i32(<2 x i32> %va, <2 x i32> %vb, <2 x i1> %m, i32 %evl)
+ ret <2 x i32> %v
+}
+
+define <2 x i32> @vsadd_vi_v2i32_unmasked(<2 x i32> %va, i32 zeroext %evl) {
+; CHECK-LABEL: vsadd_vi_v2i32_unmasked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, ma
+; CHECK-NEXT: vsadd.vi v8, v8, -1
+; CHECK-NEXT: ret
+ %elt.head = insertelement <2 x i32> poison, i32 -1, i32 0
+ %vb = shufflevector <2 x i32> %elt.head, <2 x i32> poison, <2 x i32> zeroinitializer
+ %head = insertelement <2 x i1> poison, i1 true, i32 0
+ %m = shufflevector <2 x i1> %head, <2 x i1> poison, <2 x i32> zeroinitializer
+ %v = call <2 x i32> @llvm.vp.sadd.sat.v2i32(<2 x i32> %va, <2 x i32> %vb, <2 x i1> %m, i32 %evl)
+ ret <2 x i32> %v
+}
+
+declare <4 x i32> @llvm.vp.sadd.sat.v4i32(<4 x i32>, <4 x i32>, <4 x i1>, i32)
+
+define <4 x i32> @vsadd_vv_v4i32(<4 x i32> %va, <4 x i32> %b, <4 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vsadd_vv_v4i32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, ma
+; CHECK-NEXT: vsadd.vv v8, v8, v9, v0.t
+; CHECK-NEXT: ret
+ %v = call <4 x i32> @llvm.vp.sadd.sat.v4i32(<4 x i32> %va, <4 x i32> %b, <4 x i1> %m, i32 %evl)
+ ret <4 x i32> %v
+}
+
+define <4 x i32> @vsadd_vv_v4i32_unmasked(<4 x i32> %va, <4 x i32> %b, i32 zeroext %evl) {
+; CHECK-LABEL: vsadd_vv_v4i32_unmasked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, ma
+; CHECK-NEXT: vsadd.vv v8, v8, v9
+; CHECK-NEXT: ret
+ %head = insertelement <4 x i1> poison, i1 true, i32 0
+ %m = shufflevector <4 x i1> %head, <4 x i1> poison, <4 x i32> zeroinitializer
+ %v = call <4 x i32> @llvm.vp.sadd.sat.v4i32(<4 x i32> %va, <4 x i32> %b, <4 x i1> %m, i32 %evl)
+ ret <4 x i32> %v
+}
+
+define <4 x i32> @vsadd_vx_v4i32(<4 x i32> %va, i32 %b, <4 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vsadd_vx_v4i32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma
+; CHECK-NEXT: vsadd.vx v8, v8, a0, v0.t
+; CHECK-NEXT: ret
+ %elt.head = insertelement <4 x i32> poison, i32 %b, i32 0
+ %vb = shufflevector <4 x i32> %elt.head, <4 x i32> poison, <4 x i32> zeroinitializer
+ %v = call <4 x i32> @llvm.vp.sadd.sat.v4i32(<4 x i32> %va, <4 x i32> %vb, <4 x i1> %m, i32 %evl)
+ ret <4 x i32> %v
+}
+
+define <4 x i32> @vsadd_vx_v4i32_unmasked(<4 x i32> %va, i32 %b, i32 zeroext %evl) {
+; CHECK-LABEL: vsadd_vx_v4i32_unmasked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma
+; CHECK-NEXT: vsadd.vx v8, v8, a0
+; CHECK-NEXT: ret
+ %elt.head = insertelement <4 x i32> poison, i32 %b, i32 0
+ %vb = shufflevector <4 x i32> %elt.head, <4 x i32> poison, <4 x i32> zeroinitializer
+ %head = insertelement <4 x i1> poison, i1 true, i32 0
+ %m = shufflevector <4 x i1> %head, <4 x i1> poison, <4 x i32> zeroinitializer
+ %v = call <4 x i32> @llvm.vp.sadd.sat.v4i32(<4 x i32> %va, <4 x i32> %vb, <4 x i1> %m, i32 %evl)
+ ret <4 x i32> %v
+}
+
+define <4 x i32> @vsadd_vi_v4i32(<4 x i32> %va, <4 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vsadd_vi_v4i32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, ma
+; CHECK-NEXT: vsadd.vi v8, v8, -1, v0.t
+; CHECK-NEXT: ret
+ %elt.head = insertelement <4 x i32> poison, i32 -1, i32 0
+ %vb = shufflevector <4 x i32> %elt.head, <4 x i32> poison, <4 x i32> zeroinitializer
+ %v = call <4 x i32> @llvm.vp.sadd.sat.v4i32(<4 x i32> %va, <4 x i32> %vb, <4 x i1> %m, i32 %evl)
+ ret <4 x i32> %v
+}
+
+define <4 x i32> @vsadd_vi_v4i32_unmasked(<4 x i32> %va, i32 zeroext %evl) {
+; CHECK-LABEL: vsadd_vi_v4i32_unmasked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, ma
+; CHECK-NEXT: vsadd.vi v8, v8, -1
+; CHECK-NEXT: ret
+ %elt.head = insertelement <4 x i32> poison, i32 -1, i32 0
+ %vb = shufflevector <4 x i32> %elt.head, <4 x i32> poison, <4 x i32> zeroinitializer
+ %head = insertelement <4 x i1> poison, i1 true, i32 0
+ %m = shufflevector <4 x i1> %head, <4 x i1> poison, <4 x i32> zeroinitializer
+ %v = call <4 x i32> @llvm.vp.sadd.sat.v4i32(<4 x i32> %va, <4 x i32> %vb, <4 x i1> %m, i32 %evl)
+ ret <4 x i32> %v
+}
+
+declare <8 x i32> @llvm.vp.sadd.sat.v8i32(<8 x i32>, <8 x i32>, <8 x i1>, i32)
+
+define <8 x i32> @vsadd_vv_v8i32(<8 x i32> %va, <8 x i32> %b, <8 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vsadd_vv_v8i32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, ma
+; CHECK-NEXT: vsadd.vv v8, v8, v10, v0.t
+; CHECK-NEXT: ret
+ %v = call <8 x i32> @llvm.vp.sadd.sat.v8i32(<8 x i32> %va, <8 x i32> %b, <8 x i1> %m, i32 %evl)
+ ret <8 x i32> %v
+}
+
+define <8 x i32> @vsadd_vv_v8i32_unmasked(<8 x i32> %va, <8 x i32> %b, i32 zeroext %evl) {
+; CHECK-LABEL: vsadd_vv_v8i32_unmasked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, ma
+; CHECK-NEXT: vsadd.vv v8, v8, v10
+; CHECK-NEXT: ret
+ %head = insertelement <8 x i1> poison, i1 true, i32 0
+ %m = shufflevector <8 x i1> %head, <8 x i1> poison, <8 x i32> zeroinitializer
+ %v = call <8 x i32> @llvm.vp.sadd.sat.v8i32(<8 x i32> %va, <8 x i32> %b, <8 x i1> %m, i32 %evl)
+ ret <8 x i32> %v
+}
+
+define <8 x i32> @vsadd_vx_v8i32(<8 x i32> %va, i32 %b, <8 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vsadd_vx_v8i32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, ma
+; CHECK-NEXT: vsadd.vx v8, v8, a0, v0.t
+; CHECK-NEXT: ret
+ %elt.head = insertelement <8 x i32> poison, i32 %b, i32 0
+ %vb = shufflevector <8 x i32> %elt.head, <8 x i32> poison, <8 x i32> zeroinitializer
+ %v = call <8 x i32> @llvm.vp.sadd.sat.v8i32(<8 x i32> %va, <8 x i32> %vb, <8 x i1> %m, i32 %evl)
+ ret <8 x i32> %v
+}
+
+define <8 x i32> @vsadd_vx_v8i32_unmasked(<8 x i32> %va, i32 %b, i32 zeroext %evl) {
+; CHECK-LABEL: vsadd_vx_v8i32_unmasked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, ma
+; CHECK-NEXT: vsadd.vx v8, v8, a0
+; CHECK-NEXT: ret
+ %elt.head = insertelement <8 x i32> poison, i32 %b, i32 0
+ %vb = shufflevector <8 x i32> %elt.head, <8 x i32> poison, <8 x i32> zeroinitializer
+ %head = insertelement <8 x i1> poison, i1 true, i32 0
+ %m = shufflevector <8 x i1> %head, <8 x i1> poison, <8 x i32> zeroinitializer
+ %v = call <8 x i32> @llvm.vp.sadd.sat.v8i32(<8 x i32> %va, <8 x i32> %vb, <8 x i1> %m, i32 %evl)
+ ret <8 x i32> %v
+}
+
+define <8 x i32> @vsadd_vi_v8i32(<8 x i32> %va, <8 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vsadd_vi_v8i32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, ma
+; CHECK-NEXT: vsadd.vi v8, v8, -1, v0.t
+; CHECK-NEXT: ret
+ %elt.head = insertelement <8 x i32> poison, i32 -1, i32 0
+ %vb = shufflevector <8 x i32> %elt.head, <8 x i32> poison, <8 x i32> zeroinitializer
+ %v = call <8 x i32> @llvm.vp.sadd.sat.v8i32(<8 x i32> %va, <8 x i32> %vb, <8 x i1> %m, i32 %evl)
+ ret <8 x i32> %v
+}
+
+define <8 x i32> @vsadd_vi_v8i32_unmasked(<8 x i32> %va, i32 zeroext %evl) {
+; CHECK-LABEL: vsadd_vi_v8i32_unmasked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, ma
+; CHECK-NEXT: vsadd.vi v8, v8, -1
+; CHECK-NEXT: ret
+ %elt.head = insertelement <8 x i32> poison, i32 -1, i32 0
+ %vb = shufflevector <8 x i32> %elt.head, <8 x i32> poison, <8 x i32> zeroinitializer
+ %head = insertelement <8 x i1> poison, i1 true, i32 0
+ %m = shufflevector <8 x i1> %head, <8 x i1> poison, <8 x i32> zeroinitializer
+ %v = call <8 x i32> @llvm.vp.sadd.sat.v8i32(<8 x i32> %va, <8 x i32> %vb, <8 x i1> %m, i32 %evl)
+ ret <8 x i32> %v
+}
+
+declare <16 x i32> @llvm.vp.sadd.sat.v16i32(<16 x i32>, <16 x i32>, <16 x i1>, i32)
+
+define <16 x i32> @vsadd_vv_v16i32(<16 x i32> %va, <16 x i32> %b, <16 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vsadd_vv_v16i32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, ma
+; CHECK-NEXT: vsadd.vv v8, v8, v12, v0.t
+; CHECK-NEXT: ret
+ %v = call <16 x i32> @llvm.vp.sadd.sat.v16i32(<16 x i32> %va, <16 x i32> %b, <16 x i1> %m, i32 %evl)
+ ret <16 x i32> %v
+}
+
+define <16 x i32> @vsadd_vv_v16i32_unmasked(<16 x i32> %va, <16 x i32> %b, i32 zeroext %evl) {
+; CHECK-LABEL: vsadd_vv_v16i32_unmasked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, ma
+; CHECK-NEXT: vsadd.vv v8, v8, v12
+; CHECK-NEXT: ret
+ %head = insertelement <16 x i1> poison, i1 true, i32 0
+ %m = shufflevector <16 x i1> %head, <16 x i1> poison, <16 x i32> zeroinitializer
+ %v = call <16 x i32> @llvm.vp.sadd.sat.v16i32(<16 x i32> %va, <16 x i32> %b, <16 x i1> %m, i32 %evl)
+ ret <16 x i32> %v
+}
+
+define <16 x i32> @vsadd_vx_v16i32(<16 x i32> %va, i32 %b, <16 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vsadd_vx_v16i32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e32, m4, ta, ma
+; CHECK-NEXT: vsadd.vx v8, v8, a0, v0.t
+; CHECK-NEXT: ret
+ %elt.head = insertelement <16 x i32> poison, i32 %b, i32 0
+ %vb = shufflevector <16 x i32> %elt.head, <16 x i32> poison, <16 x i32> zeroinitializer
+ %v = call <16 x i32> @llvm.vp.sadd.sat.v16i32(<16 x i32> %va, <16 x i32> %vb, <16 x i1> %m, i32 %evl)
+ ret <16 x i32> %v
+}
+
+define <16 x i32> @vsadd_vx_v16i32_unmasked(<16 x i32> %va, i32 %b, i32 zeroext %evl) {
+; CHECK-LABEL: vsadd_vx_v16i32_unmasked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e32, m4, ta, ma
+; CHECK-NEXT: vsadd.vx v8, v8, a0
+; CHECK-NEXT: ret
+ %elt.head = insertelement <16 x i32> poison, i32 %b, i32 0
+ %vb = shufflevector <16 x i32> %elt.head, <16 x i32> poison, <16 x i32> zeroinitializer
+ %head = insertelement <16 x i1> poison, i1 true, i32 0
+ %m = shufflevector <16 x i1> %head, <16 x i1> poison, <16 x i32> zeroinitializer
+ %v = call <16 x i32> @llvm.vp.sadd.sat.v16i32(<16 x i32> %va, <16 x i32> %vb, <16 x i1> %m, i32 %evl)
+ ret <16 x i32> %v
+}
+
+define <16 x i32> @vsadd_vi_v16i32(<16 x i32> %va, <16 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vsadd_vi_v16i32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, ma
+; CHECK-NEXT: vsadd.vi v8, v8, -1, v0.t
+; CHECK-NEXT: ret
+ %elt.head = insertelement <16 x i32> poison, i32 -1, i32 0
+ %vb = shufflevector <16 x i32> %elt.head, <16 x i32> poison, <16 x i32> zeroinitializer
+ %v = call <16 x i32> @llvm.vp.sadd.sat.v16i32(<16 x i32> %va, <16 x i32> %vb, <16 x i1> %m, i32 %evl)
+ ret <16 x i32> %v
+}
+
+define <16 x i32> @vsadd_vi_v16i32_unmasked(<16 x i32> %va, i32 zeroext %evl) {
+; CHECK-LABEL: vsadd_vi_v16i32_unmasked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, ma
+; CHECK-NEXT: vsadd.vi v8, v8, -1
+; CHECK-NEXT: ret
+ %elt.head = insertelement <16 x i32> poison, i32 -1, i32 0
+ %vb = shufflevector <16 x i32> %elt.head, <16 x i32> poison, <16 x i32> zeroinitializer
+ %head = insertelement <16 x i1> poison, i1 true, i32 0
+ %m = shufflevector <16 x i1> %head, <16 x i1> poison, <16 x i32> zeroinitializer
+ %v = call <16 x i32> @llvm.vp.sadd.sat.v16i32(<16 x i32> %va, <16 x i32> %vb, <16 x i1> %m, i32 %evl)
+ ret <16 x i32> %v
+}
+
+declare <2 x i64> @llvm.vp.sadd.sat.v2i64(<2 x i64>, <2 x i64>, <2 x i1>, i32)
+
+define <2 x i64> @vsadd_vv_v2i64(<2 x i64> %va, <2 x i64> %b, <2 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vsadd_vv_v2i64:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, ma
+; CHECK-NEXT: vsadd.vv v8, v8, v9, v0.t
+; CHECK-NEXT: ret
+ %v = call <2 x i64> @llvm.vp.sadd.sat.v2i64(<2 x i64> %va, <2 x i64> %b, <2 x i1> %m, i32 %evl)
+ ret <2 x i64> %v
+}
+
+define <2 x i64> @vsadd_vv_v2i64_unmasked(<2 x i64> %va, <2 x i64> %b, i32 zeroext %evl) {
+; CHECK-LABEL: vsadd_vv_v2i64_unmasked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, ma
+; CHECK-NEXT: vsadd.vv v8, v8, v9
+; CHECK-NEXT: ret
+ %head = insertelement <2 x i1> poison, i1 true, i32 0
+ %m = shufflevector <2 x i1> %head, <2 x i1> poison, <2 x i32> zeroinitializer
+ %v = call <2 x i64> @llvm.vp.sadd.sat.v2i64(<2 x i64> %va, <2 x i64> %b, <2 x i1> %m, i32 %evl)
+ ret <2 x i64> %v
+}
+
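+; On RV32 there is no 64-bit GPR, so an i64 scalar splat cannot use vsadd.vx
+; directly: the (lo, hi) pair is stored to the stack and broadcast with a
+; zero-stride vlse64.v, and the operation falls back to vsadd.vv. RV64 keeps the
+; vsadd.vx form.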
+define <2 x i64> @vsadd_vx_v2i64(<2 x i64> %va, i64 %b, <2 x i1> %m, i32 zeroext %evl) {
+; RV32-LABEL: vsadd_vx_v2i64:
+; RV32: # %bb.0:
+; RV32-NEXT: addi sp, sp, -16
+; RV32-NEXT: .cfi_def_cfa_offset 16
+; RV32-NEXT: sw a1, 12(sp)
+; RV32-NEXT: sw a0, 8(sp)
+; RV32-NEXT: addi a0, sp, 8
+; RV32-NEXT: vsetivli zero, 2, e64, m1, ta, ma
+; RV32-NEXT: vlse64.v v9, (a0), zero
+; RV32-NEXT: vsetvli zero, a2, e64, m1, ta, ma
+; RV32-NEXT: vsadd.vv v8, v8, v9, v0.t
+; RV32-NEXT: addi sp, sp, 16
+; RV32-NEXT: ret
+;
+; RV64-LABEL: vsadd_vx_v2i64:
+; RV64: # %bb.0:
+; RV64-NEXT: vsetvli zero, a1, e64, m1, ta, ma
+; RV64-NEXT: vsadd.vx v8, v8, a0, v0.t
+; RV64-NEXT: ret
+ %elt.head = insertelement <2 x i64> poison, i64 %b, i32 0
+ %vb = shufflevector <2 x i64> %elt.head, <2 x i64> poison, <2 x i32> zeroinitializer
+ %v = call <2 x i64> @llvm.vp.sadd.sat.v2i64(<2 x i64> %va, <2 x i64> %vb, <2 x i1> %m, i32 %evl)
+ ret <2 x i64> %v
+}
+
+define <2 x i64> @vsadd_vx_v2i64_unmasked(<2 x i64> %va, i64 %b, i32 zeroext %evl) {
+; RV32-LABEL: vsadd_vx_v2i64_unmasked:
+; RV32: # %bb.0:
+; RV32-NEXT: addi sp, sp, -16
+; RV32-NEXT: .cfi_def_cfa_offset 16
+; RV32-NEXT: sw a1, 12(sp)
+; RV32-NEXT: sw a0, 8(sp)
+; RV32-NEXT: addi a0, sp, 8
+; RV32-NEXT: vsetivli zero, 2, e64, m1, ta, ma
+; RV32-NEXT: vlse64.v v9, (a0), zero
+; RV32-NEXT: vsetvli zero, a2, e64, m1, ta, ma
+; RV32-NEXT: vsadd.vv v8, v8, v9
+; RV32-NEXT: addi sp, sp, 16
+; RV32-NEXT: ret
+;
+; RV64-LABEL: vsadd_vx_v2i64_unmasked:
+; RV64: # %bb.0:
+; RV64-NEXT: vsetvli zero, a1, e64, m1, ta, ma
+; RV64-NEXT: vsadd.vx v8, v8, a0
+; RV64-NEXT: ret
+ %elt.head = insertelement <2 x i64> poison, i64 %b, i32 0
+ %vb = shufflevector <2 x i64> %elt.head, <2 x i64> poison, <2 x i32> zeroinitializer
+ %head = insertelement <2 x i1> poison, i1 true, i32 0
+ %m = shufflevector <2 x i1> %head, <2 x i1> poison, <2 x i32> zeroinitializer
+ %v = call <2 x i64> @llvm.vp.sadd.sat.v2i64(<2 x i64> %va, <2 x i64> %vb, <2 x i1> %m, i32 %evl)
+ ret <2 x i64> %v
+}
+
+define <2 x i64> @vsadd_vi_v2i64(<2 x i64> %va, <2 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vsadd_vi_v2i64:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, ma
+; CHECK-NEXT: vsadd.vi v8, v8, -1, v0.t
+; CHECK-NEXT: ret
+ %elt.head = insertelement <2 x i64> poison, i64 -1, i32 0
+ %vb = shufflevector <2 x i64> %elt.head, <2 x i64> poison, <2 x i32> zeroinitializer
+ %v = call <2 x i64> @llvm.vp.sadd.sat.v2i64(<2 x i64> %va, <2 x i64> %vb, <2 x i1> %m, i32 %evl)
+ ret <2 x i64> %v
+}
+
+define <2 x i64> @vsadd_vi_v2i64_unmasked(<2 x i64> %va, i32 zeroext %evl) {
+; CHECK-LABEL: vsadd_vi_v2i64_unmasked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, ma
+; CHECK-NEXT: vsadd.vi v8, v8, -1
+; CHECK-NEXT: ret
+ %elt.head = insertelement <2 x i64> poison, i64 -1, i32 0
+ %vb = shufflevector <2 x i64> %elt.head, <2 x i64> poison, <2 x i32> zeroinitializer
+ %head = insertelement <2 x i1> poison, i1 true, i32 0
+ %m = shufflevector <2 x i1> %head, <2 x i1> poison, <2 x i32> zeroinitializer
+ %v = call <2 x i64> @llvm.vp.sadd.sat.v2i64(<2 x i64> %va, <2 x i64> %vb, <2 x i1> %m, i32 %evl)
+ ret <2 x i64> %v
+}
+
+declare <4 x i64> @llvm.vp.sadd.sat.v4i64(<4 x i64>, <4 x i64>, <4 x i1>, i32)
+
+define <4 x i64> @vsadd_vv_v4i64(<4 x i64> %va, <4 x i64> %b, <4 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vsadd_vv_v4i64:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e64, m2, ta, ma
+; CHECK-NEXT: vsadd.vv v8, v8, v10, v0.t
+; CHECK-NEXT: ret
+ %v = call <4 x i64> @llvm.vp.sadd.sat.v4i64(<4 x i64> %va, <4 x i64> %b, <4 x i1> %m, i32 %evl)
+ ret <4 x i64> %v
+}
+
+define <4 x i64> @vsadd_vv_v4i64_unmasked(<4 x i64> %va, <4 x i64> %b, i32 zeroext %evl) {
+; CHECK-LABEL: vsadd_vv_v4i64_unmasked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e64, m2, ta, ma
+; CHECK-NEXT: vsadd.vv v8, v8, v10
+; CHECK-NEXT: ret
+ %head = insertelement <4 x i1> poison, i1 true, i32 0
+ %m = shufflevector <4 x i1> %head, <4 x i1> poison, <4 x i32> zeroinitializer
+ %v = call <4 x i64> @llvm.vp.sadd.sat.v4i64(<4 x i64> %va, <4 x i64> %b, <4 x i1> %m, i32 %evl)
+ ret <4 x i64> %v
+}
+
+define <4 x i64> @vsadd_vx_v4i64(<4 x i64> %va, i64 %b, <4 x i1> %m, i32 zeroext %evl) {
+; RV32-LABEL: vsadd_vx_v4i64:
+; RV32: # %bb.0:
+; RV32-NEXT: addi sp, sp, -16
+; RV32-NEXT: .cfi_def_cfa_offset 16
+; RV32-NEXT: sw a1, 12(sp)
+; RV32-NEXT: sw a0, 8(sp)
+; RV32-NEXT: addi a0, sp, 8
+; RV32-NEXT: vsetivli zero, 4, e64, m2, ta, ma
+; RV32-NEXT: vlse64.v v10, (a0), zero
+; RV32-NEXT: vsetvli zero, a2, e64, m2, ta, ma
+; RV32-NEXT: vsadd.vv v8, v8, v10, v0.t
+; RV32-NEXT: addi sp, sp, 16
+; RV32-NEXT: ret
+;
+; RV64-LABEL: vsadd_vx_v4i64:
+; RV64: # %bb.0:
+; RV64-NEXT: vsetvli zero, a1, e64, m2, ta, ma
+; RV64-NEXT: vsadd.vx v8, v8, a0, v0.t
+; RV64-NEXT: ret
+ %elt.head = insertelement <4 x i64> poison, i64 %b, i32 0
+ %vb = shufflevector <4 x i64> %elt.head, <4 x i64> poison, <4 x i32> zeroinitializer
+ %v = call <4 x i64> @llvm.vp.sadd.sat.v4i64(<4 x i64> %va, <4 x i64> %vb, <4 x i1> %m, i32 %evl)
+ ret <4 x i64> %v
+}
+
+define <4 x i64> @vsadd_vx_v4i64_unmasked(<4 x i64> %va, i64 %b, i32 zeroext %evl) {
+; RV32-LABEL: vsadd_vx_v4i64_unmasked:
+; RV32: # %bb.0:
+; RV32-NEXT: addi sp, sp, -16
+; RV32-NEXT: .cfi_def_cfa_offset 16
+; RV32-NEXT: sw a1, 12(sp)
+; RV32-NEXT: sw a0, 8(sp)
+; RV32-NEXT: addi a0, sp, 8
+; RV32-NEXT: vsetivli zero, 4, e64, m2, ta, ma
+; RV32-NEXT: vlse64.v v10, (a0), zero
+; RV32-NEXT: vsetvli zero, a2, e64, m2, ta, ma
+; RV32-NEXT: vsadd.vv v8, v8, v10
+; RV32-NEXT: addi sp, sp, 16
+; RV32-NEXT: ret
+;
+; RV64-LABEL: vsadd_vx_v4i64_unmasked:
+; RV64: # %bb.0:
+; RV64-NEXT: vsetvli zero, a1, e64, m2, ta, ma
+; RV64-NEXT: vsadd.vx v8, v8, a0
+; RV64-NEXT: ret
+ %elt.head = insertelement <4 x i64> poison, i64 %b, i32 0
+ %vb = shufflevector <4 x i64> %elt.head, <4 x i64> poison, <4 x i32> zeroinitializer
+ %head = insertelement <4 x i1> poison, i1 true, i32 0
+ %m = shufflevector <4 x i1> %head, <4 x i1> poison, <4 x i32> zeroinitializer
+ %v = call <4 x i64> @llvm.vp.sadd.sat.v4i64(<4 x i64> %va, <4 x i64> %vb, <4 x i1> %m, i32 %evl)
+ ret <4 x i64> %v
+}
+
+define <4 x i64> @vsadd_vi_v4i64(<4 x i64> %va, <4 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vsadd_vi_v4i64:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e64, m2, ta, ma
+; CHECK-NEXT: vsadd.vi v8, v8, -1, v0.t
+; CHECK-NEXT: ret
+ %elt.head = insertelement <4 x i64> poison, i64 -1, i32 0
+ %vb = shufflevector <4 x i64> %elt.head, <4 x i64> poison, <4 x i32> zeroinitializer
+ %v = call <4 x i64> @llvm.vp.sadd.sat.v4i64(<4 x i64> %va, <4 x i64> %vb, <4 x i1> %m, i32 %evl)
+ ret <4 x i64> %v
+}
+
+define <4 x i64> @vsadd_vi_v4i64_unmasked(<4 x i64> %va, i32 zeroext %evl) {
+; CHECK-LABEL: vsadd_vi_v4i64_unmasked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e64, m2, ta, ma
+; CHECK-NEXT: vsadd.vi v8, v8, -1
+; CHECK-NEXT: ret
+ %elt.head = insertelement <4 x i64> poison, i64 -1, i32 0
+ %vb = shufflevector <4 x i64> %elt.head, <4 x i64> poison, <4 x i32> zeroinitializer
+ %head = insertelement <4 x i1> poison, i1 true, i32 0
+ %m = shufflevector <4 x i1> %head, <4 x i1> poison, <4 x i32> zeroinitializer
+ %v = call <4 x i64> @llvm.vp.sadd.sat.v4i64(<4 x i64> %va, <4 x i64> %vb, <4 x i1> %m, i32 %evl)
+ ret <4 x i64> %v
+}
+
+declare <8 x i64> @llvm.vp.sadd.sat.v8i64(<8 x i64>, <8 x i64>, <8 x i1>, i32)
+
+define <8 x i64> @vsadd_vv_v8i64(<8 x i64> %va, <8 x i64> %b, <8 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vsadd_vv_v8i64:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e64, m4, ta, ma
+; CHECK-NEXT: vsadd.vv v8, v8, v12, v0.t
+; CHECK-NEXT: ret
+ %v = call <8 x i64> @llvm.vp.sadd.sat.v8i64(<8 x i64> %va, <8 x i64> %b, <8 x i1> %m, i32 %evl)
+ ret <8 x i64> %v
+}
+
+define <8 x i64> @vsadd_vv_v8i64_unmasked(<8 x i64> %va, <8 x i64> %b, i32 zeroext %evl) {
+; CHECK-LABEL: vsadd_vv_v8i64_unmasked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e64, m4, ta, ma
+; CHECK-NEXT: vsadd.vv v8, v8, v12
+; CHECK-NEXT: ret
+ %head = insertelement <8 x i1> poison, i1 true, i32 0
+ %m = shufflevector <8 x i1> %head, <8 x i1> poison, <8 x i32> zeroinitializer
+ %v = call <8 x i64> @llvm.vp.sadd.sat.v8i64(<8 x i64> %va, <8 x i64> %b, <8 x i1> %m, i32 %evl)
+ ret <8 x i64> %v
+}
+
+define <8 x i64> @vsadd_vx_v8i64(<8 x i64> %va, i64 %b, <8 x i1> %m, i32 zeroext %evl) {
+; RV32-LABEL: vsadd_vx_v8i64:
+; RV32: # %bb.0:
+; RV32-NEXT: addi sp, sp, -16
+; RV32-NEXT: .cfi_def_cfa_offset 16
+; RV32-NEXT: sw a1, 12(sp)
+; RV32-NEXT: sw a0, 8(sp)
+; RV32-NEXT: addi a0, sp, 8
+; RV32-NEXT: vsetivli zero, 8, e64, m4, ta, ma
+; RV32-NEXT: vlse64.v v12, (a0), zero
+; RV32-NEXT: vsetvli zero, a2, e64, m4, ta, ma
+; RV32-NEXT: vsadd.vv v8, v8, v12, v0.t
+; RV32-NEXT: addi sp, sp, 16
+; RV32-NEXT: ret
+;
+; RV64-LABEL: vsadd_vx_v8i64:
+; RV64: # %bb.0:
+; RV64-NEXT: vsetvli zero, a1, e64, m4, ta, ma
+; RV64-NEXT: vsadd.vx v8, v8, a0, v0.t
+; RV64-NEXT: ret
+ %elt.head = insertelement <8 x i64> poison, i64 %b, i32 0
+ %vb = shufflevector <8 x i64> %elt.head, <8 x i64> poison, <8 x i32> zeroinitializer
+ %v = call <8 x i64> @llvm.vp.sadd.sat.v8i64(<8 x i64> %va, <8 x i64> %vb, <8 x i1> %m, i32 %evl)
+ ret <8 x i64> %v
+}
+
+define <8 x i64> @vsadd_vx_v8i64_unmasked(<8 x i64> %va, i64 %b, i32 zeroext %evl) {
+; RV32-LABEL: vsadd_vx_v8i64_unmasked:
+; RV32: # %bb.0:
+; RV32-NEXT: addi sp, sp, -16
+; RV32-NEXT: .cfi_def_cfa_offset 16
+; RV32-NEXT: sw a1, 12(sp)
+; RV32-NEXT: sw a0, 8(sp)
+; RV32-NEXT: addi a0, sp, 8
+; RV32-NEXT: vsetivli zero, 8, e64, m4, ta, ma
+; RV32-NEXT: vlse64.v v12, (a0), zero
+; RV32-NEXT: vsetvli zero, a2, e64, m4, ta, ma
+; RV32-NEXT: vsadd.vv v8, v8, v12
+; RV32-NEXT: addi sp, sp, 16
+; RV32-NEXT: ret
+;
+; RV64-LABEL: vsadd_vx_v8i64_unmasked:
+; RV64: # %bb.0:
+; RV64-NEXT: vsetvli zero, a1, e64, m4, ta, ma
+; RV64-NEXT: vsadd.vx v8, v8, a0
+; RV64-NEXT: ret
+ %elt.head = insertelement <8 x i64> poison, i64 %b, i32 0
+ %vb = shufflevector <8 x i64> %elt.head, <8 x i64> poison, <8 x i32> zeroinitializer
+ %head = insertelement <8 x i1> poison, i1 true, i32 0
+ %m = shufflevector <8 x i1> %head, <8 x i1> poison, <8 x i32> zeroinitializer
+ %v = call <8 x i64> @llvm.vp.sadd.sat.v8i64(<8 x i64> %va, <8 x i64> %vb, <8 x i1> %m, i32 %evl)
+ ret <8 x i64> %v
+}
+
+define <8 x i64> @vsadd_vi_v8i64(<8 x i64> %va, <8 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vsadd_vi_v8i64:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e64, m4, ta, ma
+; CHECK-NEXT: vsadd.vi v8, v8, -1, v0.t
+; CHECK-NEXT: ret
+ %elt.head = insertelement <8 x i64> poison, i64 -1, i32 0
+ %vb = shufflevector <8 x i64> %elt.head, <8 x i64> poison, <8 x i32> zeroinitializer
+ %v = call <8 x i64> @llvm.vp.sadd.sat.v8i64(<8 x i64> %va, <8 x i64> %vb, <8 x i1> %m, i32 %evl)
+ ret <8 x i64> %v
+}
+
+define <8 x i64> @vsadd_vi_v8i64_unmasked(<8 x i64> %va, i32 zeroext %evl) {
+; CHECK-LABEL: vsadd_vi_v8i64_unmasked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e64, m4, ta, ma
+; CHECK-NEXT: vsadd.vi v8, v8, -1
+; CHECK-NEXT: ret
+ %elt.head = insertelement <8 x i64> poison, i64 -1, i32 0
+ %vb = shufflevector <8 x i64> %elt.head, <8 x i64> poison, <8 x i32> zeroinitializer
+ %head = insertelement <8 x i1> poison, i1 true, i32 0
+ %m = shufflevector <8 x i1> %head, <8 x i1> poison, <8 x i32> zeroinitializer
+ %v = call <8 x i64> @llvm.vp.sadd.sat.v8i64(<8 x i64> %va, <8 x i64> %vb, <8 x i1> %m, i32 %evl)
+ ret <8 x i64> %v
+}
+
+declare <16 x i64> @llvm.vp.sadd.sat.v16i64(<16 x i64>, <16 x i64>, <16 x i1>, i32)
+
+define <16 x i64> @vsadd_vv_v16i64(<16 x i64> %va, <16 x i64> %b, <16 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vsadd_vv_v16i64:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma
+; CHECK-NEXT: vsadd.vv v8, v8, v16, v0.t
+; CHECK-NEXT: ret
+ %v = call <16 x i64> @llvm.vp.sadd.sat.v16i64(<16 x i64> %va, <16 x i64> %b, <16 x i1> %m, i32 %evl)
+ ret <16 x i64> %v
+}
+
+define <16 x i64> @vsadd_vv_v16i64_unmasked(<16 x i64> %va, <16 x i64> %b, i32 zeroext %evl) {
+; CHECK-LABEL: vsadd_vv_v16i64_unmasked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma
+; CHECK-NEXT: vsadd.vv v8, v8, v16
+; CHECK-NEXT: ret
+ %head = insertelement <16 x i1> poison, i1 true, i32 0
+ %m = shufflevector <16 x i1> %head, <16 x i1> poison, <16 x i32> zeroinitializer
+ %v = call <16 x i64> @llvm.vp.sadd.sat.v16i64(<16 x i64> %va, <16 x i64> %b, <16 x i1> %m, i32 %evl)
+ ret <16 x i64> %v
+}
+
+define <16 x i64> @vsadd_vx_v16i64(<16 x i64> %va, i64 %b, <16 x i1> %m, i32 zeroext %evl) {
+; RV32-LABEL: vsadd_vx_v16i64:
+; RV32: # %bb.0:
+; RV32-NEXT: addi sp, sp, -16
+; RV32-NEXT: .cfi_def_cfa_offset 16
+; RV32-NEXT: sw a1, 12(sp)
+; RV32-NEXT: sw a0, 8(sp)
+; RV32-NEXT: addi a0, sp, 8
+; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma
+; RV32-NEXT: vlse64.v v16, (a0), zero
+; RV32-NEXT: vsetvli zero, a2, e64, m8, ta, ma
+; RV32-NEXT: vsadd.vv v8, v8, v16, v0.t
+; RV32-NEXT: addi sp, sp, 16
+; RV32-NEXT: ret
+;
+; RV64-LABEL: vsadd_vx_v16i64:
+; RV64: # %bb.0:
+; RV64-NEXT: vsetvli zero, a1, e64, m8, ta, ma
+; RV64-NEXT: vsadd.vx v8, v8, a0, v0.t
+; RV64-NEXT: ret
+ %elt.head = insertelement <16 x i64> poison, i64 %b, i32 0
+ %vb = shufflevector <16 x i64> %elt.head, <16 x i64> poison, <16 x i32> zeroinitializer
+ %v = call <16 x i64> @llvm.vp.sadd.sat.v16i64(<16 x i64> %va, <16 x i64> %vb, <16 x i1> %m, i32 %evl)
+ ret <16 x i64> %v
+}
+
+define <16 x i64> @vsadd_vx_v16i64_unmasked(<16 x i64> %va, i64 %b, i32 zeroext %evl) {
+; RV32-LABEL: vsadd_vx_v16i64_unmasked:
+; RV32: # %bb.0:
+; RV32-NEXT: addi sp, sp, -16
+; RV32-NEXT: .cfi_def_cfa_offset 16
+; RV32-NEXT: sw a1, 12(sp)
+; RV32-NEXT: sw a0, 8(sp)
+; RV32-NEXT: addi a0, sp, 8
+; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma
+; RV32-NEXT: vlse64.v v16, (a0), zero
+; RV32-NEXT: vsetvli zero, a2, e64, m8, ta, ma
+; RV32-NEXT: vsadd.vv v8, v8, v16
+; RV32-NEXT: addi sp, sp, 16
+; RV32-NEXT: ret
+;
+; RV64-LABEL: vsadd_vx_v16i64_unmasked:
+; RV64: # %bb.0:
+; RV64-NEXT: vsetvli zero, a1, e64, m8, ta, ma
+; RV64-NEXT: vsadd.vx v8, v8, a0
+; RV64-NEXT: ret
+ %elt.head = insertelement <16 x i64> poison, i64 %b, i32 0
+ %vb = shufflevector <16 x i64> %elt.head, <16 x i64> poison, <16 x i32> zeroinitializer
+ %head = insertelement <16 x i1> poison, i1 true, i32 0
+ %m = shufflevector <16 x i1> %head, <16 x i1> poison, <16 x i32> zeroinitializer
+ %v = call <16 x i64> @llvm.vp.sadd.sat.v16i64(<16 x i64> %va, <16 x i64> %vb, <16 x i1> %m, i32 %evl)
+ ret <16 x i64> %v
+}
+
+define <16 x i64> @vsadd_vi_v16i64(<16 x i64> %va, <16 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vsadd_vi_v16i64:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma
+; CHECK-NEXT: vsadd.vi v8, v8, -1, v0.t
+; CHECK-NEXT: ret
+ %elt.head = insertelement <16 x i64> poison, i64 -1, i32 0
+ %vb = shufflevector <16 x i64> %elt.head, <16 x i64> poison, <16 x i32> zeroinitializer
+ %v = call <16 x i64> @llvm.vp.sadd.sat.v16i64(<16 x i64> %va, <16 x i64> %vb, <16 x i1> %m, i32 %evl)
+ ret <16 x i64> %v
+}
+
+define <16 x i64> @vsadd_vi_v16i64_unmasked(<16 x i64> %va, i32 zeroext %evl) {
+; CHECK-LABEL: vsadd_vi_v16i64_unmasked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma
+; CHECK-NEXT: vsadd.vi v8, v8, -1
+; CHECK-NEXT: ret
+ %elt.head = insertelement <16 x i64> poison, i64 -1, i32 0
+ %vb = shufflevector <16 x i64> %elt.head, <16 x i64> poison, <16 x i32> zeroinitializer
+ %head = insertelement <16 x i1> poison, i1 true, i32 0
+ %m = shufflevector <16 x i1> %head, <16 x i1> poison, <16 x i32> zeroinitializer
+ %v = call <16 x i64> @llvm.vp.sadd.sat.v16i64(<16 x i64> %va, <16 x i64> %vb, <16 x i1> %m, i32 %evl)
+ ret <16 x i64> %v
+}
+
+; Test that split-legalization works as expected.
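+; <32 x i64> is split into two <16 x i64> halves: the low half uses
+; min(evl, 16) as its AVL and the high half uses the remainder
+; (evl - 16, clamped at zero).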
+
+declare <32 x i64> @llvm.vp.sadd.sat.v32i64(<32 x i64>, <32 x i64>, <32 x i1>, i32)
+
+define <32 x i64> @vsadd_vx_v32i64(<32 x i64> %va, <32 x i1> %m, i32 zeroext %evl) {
+; RV32-LABEL: vsadd_vx_v32i64:
+; RV32: # %bb.0:
+; RV32-NEXT: vsetivli zero, 2, e8, mf4, ta, ma
+; RV32-NEXT: vslidedown.vi v1, v0, 2
+; RV32-NEXT: li a1, 32
+; RV32-NEXT: vsetvli zero, a1, e32, m8, ta, ma
+; RV32-NEXT: li a2, 16
+; RV32-NEXT: vmv.v.i v24, -1
+; RV32-NEXT: mv a1, a0
+; RV32-NEXT: bltu a0, a2, .LBB108_2
+; RV32-NEXT: # %bb.1:
+; RV32-NEXT: li a1, 16
+; RV32-NEXT: .LBB108_2:
+; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma
+; RV32-NEXT: vsadd.vv v8, v8, v24, v0.t
+; RV32-NEXT: addi a1, a0, -16
+; RV32-NEXT: sltu a0, a0, a1
+; RV32-NEXT: addi a0, a0, -1
+; RV32-NEXT: and a0, a0, a1
+; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
+; RV32-NEXT: vmv1r.v v0, v1
+; RV32-NEXT: vsadd.vv v16, v16, v24, v0.t
+; RV32-NEXT: ret
+;
+; RV64-LABEL: vsadd_vx_v32i64:
+; RV64: # %bb.0:
+; RV64-NEXT: vsetivli zero, 2, e8, mf4, ta, ma
+; RV64-NEXT: li a2, 16
+; RV64-NEXT: vslidedown.vi v24, v0, 2
+; RV64-NEXT: mv a1, a0
+; RV64-NEXT: bltu a0, a2, .LBB108_2
+; RV64-NEXT: # %bb.1:
+; RV64-NEXT: li a1, 16
+; RV64-NEXT: .LBB108_2:
+; RV64-NEXT: vsetvli zero, a1, e64, m8, ta, ma
+; RV64-NEXT: vsadd.vi v8, v8, -1, v0.t
+; RV64-NEXT: addi a1, a0, -16
+; RV64-NEXT: sltu a0, a0, a1
+; RV64-NEXT: addi a0, a0, -1
+; RV64-NEXT: and a0, a0, a1
+; RV64-NEXT: vsetvli zero, a0, e64, m8, ta, ma
+; RV64-NEXT: vmv1r.v v0, v24
+; RV64-NEXT: vsadd.vi v16, v16, -1, v0.t
+; RV64-NEXT: ret
+ %elt.head = insertelement <32 x i64> poison, i64 -1, i32 0
+ %vb = shufflevector <32 x i64> %elt.head, <32 x i64> poison, <32 x i32> zeroinitializer
+ %v = call <32 x i64> @llvm.vp.sadd.sat.v32i64(<32 x i64> %va, <32 x i64> %vb, <32 x i1> %m, i32 %evl)
+ ret <32 x i64> %v
+}
+
+define <32 x i64> @vsadd_vi_v32i64_unmasked(<32 x i64> %va, i32 zeroext %evl) {
+; RV32-LABEL: vsadd_vi_v32i64_unmasked:
+; RV32: # %bb.0:
+; RV32-NEXT: li a1, 32
+; RV32-NEXT: vsetvli zero, a1, e32, m8, ta, ma
+; RV32-NEXT: li a2, 16
+; RV32-NEXT: vmv.v.i v24, -1
+; RV32-NEXT: mv a1, a0
+; RV32-NEXT: bltu a0, a2, .LBB109_2
+; RV32-NEXT: # %bb.1:
+; RV32-NEXT: li a1, 16
+; RV32-NEXT: .LBB109_2:
+; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma
+; RV32-NEXT: vsadd.vv v8, v8, v24
+; RV32-NEXT: addi a1, a0, -16
+; RV32-NEXT: sltu a0, a0, a1
+; RV32-NEXT: addi a0, a0, -1
+; RV32-NEXT: and a0, a0, a1
+; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
+; RV32-NEXT: vsadd.vv v16, v16, v24
+; RV32-NEXT: ret
+;
+; RV64-LABEL: vsadd_vi_v32i64_unmasked:
+; RV64: # %bb.0:
+; RV64-NEXT: li a2, 16
+; RV64-NEXT: mv a1, a0
+; RV64-NEXT: bltu a0, a2, .LBB109_2
+; RV64-NEXT: # %bb.1:
+; RV64-NEXT: li a1, 16
+; RV64-NEXT: .LBB109_2:
+; RV64-NEXT: vsetvli zero, a1, e64, m8, ta, ma
+; RV64-NEXT: vsadd.vi v8, v8, -1
+; RV64-NEXT: addi a1, a0, -16
+; RV64-NEXT: sltu a0, a0, a1
+; RV64-NEXT: addi a0, a0, -1
+; RV64-NEXT: and a0, a0, a1
+; RV64-NEXT: vsetvli zero, a0, e64, m8, ta, ma
+; RV64-NEXT: vsadd.vi v16, v16, -1
+; RV64-NEXT: ret
+ %elt.head = insertelement <32 x i64> poison, i64 -1, i32 0
+ %vb = shufflevector <32 x i64> %elt.head, <32 x i64> poison, <32 x i32> zeroinitializer
+ %head = insertelement <32 x i1> poison, i1 true, i32 0
+ %m = shufflevector <32 x i1> %head, <32 x i1> poison, <32 x i32> zeroinitializer
+ %v = call <32 x i64> @llvm.vp.sadd.sat.v32i64(<32 x i64> %va, <32 x i64> %vb, <32 x i1> %m, i32 %evl)
+ ret <32 x i64> %v
+}
+
+; FIXME: We don't match vsadd.vi on RV32.
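+; Instead, RV32 materializes the -1 splat with vmv.v.i and uses vsadd.vv,
+; whereas RV64 folds the immediate into vsadd.vi.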
+
+define <32 x i64> @vsadd_vx_v32i64_evl12(<32 x i64> %va, <32 x i1> %m) {
+; RV32-LABEL: vsadd_vx_v32i64_evl12:
+; RV32: # %bb.0:
+; RV32-NEXT: vsetivli zero, 2, e8, mf4, ta, ma
+; RV32-NEXT: vslidedown.vi v1, v0, 2
+; RV32-NEXT: li a0, 32
+; RV32-NEXT: vsetvli zero, a0, e32, m8, ta, ma
+; RV32-NEXT: vmv.v.i v24, -1
+; RV32-NEXT: vsetivli zero, 12, e64, m8, ta, ma
+; RV32-NEXT: vsadd.vv v8, v8, v24, v0.t
+; RV32-NEXT: vsetivli zero, 0, e64, m8, ta, ma
+; RV32-NEXT: vmv1r.v v0, v1
+; RV32-NEXT: vsadd.vv v16, v16, v24, v0.t
+; RV32-NEXT: ret
+;
+; RV64-LABEL: vsadd_vx_v32i64_evl12:
+; RV64: # %bb.0:
+; RV64-NEXT: vsetivli zero, 2, e8, mf4, ta, ma
+; RV64-NEXT: vslidedown.vi v24, v0, 2
+; RV64-NEXT: vsetivli zero, 12, e64, m8, ta, ma
+; RV64-NEXT: vsadd.vi v8, v8, -1, v0.t
+; RV64-NEXT: vsetivli zero, 0, e64, m8, ta, ma
+; RV64-NEXT: vmv1r.v v0, v24
+; RV64-NEXT: vsadd.vi v16, v16, -1, v0.t
+; RV64-NEXT: ret
+ %elt.head = insertelement <32 x i64> poison, i64 -1, i32 0
+ %vb = shufflevector <32 x i64> %elt.head, <32 x i64> poison, <32 x i32> zeroinitializer
+ %v = call <32 x i64> @llvm.vp.sadd.sat.v32i64(<32 x i64> %va, <32 x i64> %vb, <32 x i1> %m, i32 12)
+ ret <32 x i64> %v
+}
+
+define <32 x i64> @vsadd_vx_v32i64_evl27(<32 x i64> %va, <32 x i1> %m) {
+; RV32-LABEL: vsadd_vx_v32i64_evl27:
+; RV32: # %bb.0:
+; RV32-NEXT: vsetivli zero, 2, e8, mf4, ta, ma
+; RV32-NEXT: vslidedown.vi v1, v0, 2
+; RV32-NEXT: li a0, 32
+; RV32-NEXT: vsetvli zero, a0, e32, m8, ta, ma
+; RV32-NEXT: vmv.v.i v24, -1
+; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma
+; RV32-NEXT: vsadd.vv v8, v8, v24, v0.t
+; RV32-NEXT: vsetivli zero, 11, e64, m8, ta, ma
+; RV32-NEXT: vmv1r.v v0, v1
+; RV32-NEXT: vsadd.vv v16, v16, v24, v0.t
+; RV32-NEXT: ret
+;
+; RV64-LABEL: vsadd_vx_v32i64_evl27:
+; RV64: # %bb.0:
+; RV64-NEXT: vsetivli zero, 2, e8, mf4, ta, ma
+; RV64-NEXT: vslidedown.vi v24, v0, 2
+; RV64-NEXT: vsetivli zero, 16, e64, m8, ta, ma
+; RV64-NEXT: vsadd.vi v8, v8, -1, v0.t
+; RV64-NEXT: vsetivli zero, 11, e64, m8, ta, ma
+; RV64-NEXT: vmv1r.v v0, v24
+; RV64-NEXT: vsadd.vi v16, v16, -1, v0.t
+; RV64-NEXT: ret
+ %elt.head = insertelement <32 x i64> poison, i64 -1, i32 0
+ %vb = shufflevector <32 x i64> %elt.head, <32 x i64> poison, <32 x i32> zeroinitializer
+ %v = call <32 x i64> @llvm.vp.sadd.sat.v32i64(<32 x i64> %va, <32 x i64> %vb, <32 x i1> %m, i32 27)
+ ret <32 x i64> %v
+}
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vsaddu-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vsaddu-vp.ll
new file mode 100644
index 0000000..6227f8a
--- /dev/null
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vsaddu-vp.ll
@@ -0,0 +1,1697 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=riscv32 -mattr=+v -verify-machineinstrs < %s \
+; RUN: | FileCheck %s --check-prefixes=CHECK,RV32
+; RUN: llc -mtriple=riscv64 -mattr=+v -verify-machineinstrs < %s \
+; RUN: | FileCheck %s --check-prefixes=CHECK,RV64
+
+declare <8 x i7> @llvm.vp.uadd.sat.v8i7(<8 x i7>, <8 x i7>, <8 x i1>, i32)
+
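+; i7 is promoted to i8: both operands are masked to 7 bits, the addition is
+; done with a plain vadd, and the result is clamped to 127 with vminu to give
+; unsigned saturation at the 7-bit boundary.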
+define <8 x i7> @vsaddu_vv_v8i7(<8 x i7> %va, <8 x i7> %b, <8 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vsaddu_vv_v8i7:
+; CHECK: # %bb.0:
+; CHECK-NEXT: li a1, 127
+; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma
+; CHECK-NEXT: vand.vx v9, v9, a1
+; CHECK-NEXT: vand.vx v8, v8, a1
+; CHECK-NEXT: vsetvli zero, a0, e8, mf2, ta, ma
+; CHECK-NEXT: vadd.vv v8, v8, v9, v0.t
+; CHECK-NEXT: vminu.vx v8, v8, a1, v0.t
+; CHECK-NEXT: ret
+ %v = call <8 x i7> @llvm.vp.uadd.sat.v8i7(<8 x i7> %va, <8 x i7> %b, <8 x i1> %m, i32 %evl)
+ ret <8 x i7> %v
+}
+
+declare <2 x i8> @llvm.vp.uadd.sat.v2i8(<2 x i8>, <2 x i8>, <2 x i1>, i32)
+
+define <2 x i8> @vsaddu_vv_v2i8(<2 x i8> %va, <2 x i8> %b, <2 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vsaddu_vv_v2i8:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e8, mf8, ta, ma
+; CHECK-NEXT: vsaddu.vv v8, v8, v9, v0.t
+; CHECK-NEXT: ret
+ %v = call <2 x i8> @llvm.vp.uadd.sat.v2i8(<2 x i8> %va, <2 x i8> %b, <2 x i1> %m, i32 %evl)
+ ret <2 x i8> %v
+}
+
+define <2 x i8> @vsaddu_vv_v2i8_unmasked(<2 x i8> %va, <2 x i8> %b, i32 zeroext %evl) {
+; CHECK-LABEL: vsaddu_vv_v2i8_unmasked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e8, mf8, ta, ma
+; CHECK-NEXT: vsaddu.vv v8, v8, v9
+; CHECK-NEXT: ret
+ %head = insertelement <2 x i1> poison, i1 true, i32 0
+ %m = shufflevector <2 x i1> %head, <2 x i1> poison, <2 x i32> zeroinitializer
+ %v = call <2 x i8> @llvm.vp.uadd.sat.v2i8(<2 x i8> %va, <2 x i8> %b, <2 x i1> %m, i32 %evl)
+ ret <2 x i8> %v
+}
+
+define <2 x i8> @vsaddu_vx_v2i8(<2 x i8> %va, i8 %b, <2 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vsaddu_vx_v2i8:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, ma
+; CHECK-NEXT: vsaddu.vx v8, v8, a0, v0.t
+; CHECK-NEXT: ret
+ %elt.head = insertelement <2 x i8> poison, i8 %b, i32 0
+ %vb = shufflevector <2 x i8> %elt.head, <2 x i8> poison, <2 x i32> zeroinitializer
+ %v = call <2 x i8> @llvm.vp.uadd.sat.v2i8(<2 x i8> %va, <2 x i8> %vb, <2 x i1> %m, i32 %evl)
+ ret <2 x i8> %v
+}
+
+define <2 x i8> @vsaddu_vx_v2i8_unmasked(<2 x i8> %va, i8 %b, i32 zeroext %evl) {
+; CHECK-LABEL: vsaddu_vx_v2i8_unmasked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, ma
+; CHECK-NEXT: vsaddu.vx v8, v8, a0
+; CHECK-NEXT: ret
+ %elt.head = insertelement <2 x i8> poison, i8 %b, i32 0
+ %vb = shufflevector <2 x i8> %elt.head, <2 x i8> poison, <2 x i32> zeroinitializer
+ %head = insertelement <2 x i1> poison, i1 true, i32 0
+ %m = shufflevector <2 x i1> %head, <2 x i1> poison, <2 x i32> zeroinitializer
+ %v = call <2 x i8> @llvm.vp.uadd.sat.v2i8(<2 x i8> %va, <2 x i8> %vb, <2 x i1> %m, i32 %evl)
+ ret <2 x i8> %v
+}
+
+define <2 x i8> @vsaddu_vi_v2i8(<2 x i8> %va, <2 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vsaddu_vi_v2i8:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e8, mf8, ta, ma
+; CHECK-NEXT: vsaddu.vi v8, v8, -1, v0.t
+; CHECK-NEXT: ret
+ %elt.head = insertelement <2 x i8> poison, i8 -1, i32 0
+ %vb = shufflevector <2 x i8> %elt.head, <2 x i8> poison, <2 x i32> zeroinitializer
+ %v = call <2 x i8> @llvm.vp.uadd.sat.v2i8(<2 x i8> %va, <2 x i8> %vb, <2 x i1> %m, i32 %evl)
+ ret <2 x i8> %v
+}
+
+define <2 x i8> @vsaddu_vi_v2i8_unmasked(<2 x i8> %va, i32 zeroext %evl) {
+; CHECK-LABEL: vsaddu_vi_v2i8_unmasked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e8, mf8, ta, ma
+; CHECK-NEXT: vsaddu.vi v8, v8, -1
+; CHECK-NEXT: ret
+ %elt.head = insertelement <2 x i8> poison, i8 -1, i32 0
+ %vb = shufflevector <2 x i8> %elt.head, <2 x i8> poison, <2 x i32> zeroinitializer
+ %head = insertelement <2 x i1> poison, i1 true, i32 0
+ %m = shufflevector <2 x i1> %head, <2 x i1> poison, <2 x i32> zeroinitializer
+ %v = call <2 x i8> @llvm.vp.uadd.sat.v2i8(<2 x i8> %va, <2 x i8> %vb, <2 x i1> %m, i32 %evl)
+ ret <2 x i8> %v
+}
+
+declare <4 x i8> @llvm.vp.uadd.sat.v4i8(<4 x i8>, <4 x i8>, <4 x i1>, i32)
+
+define <4 x i8> @vsaddu_vv_v4i8(<4 x i8> %va, <4 x i8> %b, <4 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vsaddu_vv_v4i8:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e8, mf4, ta, ma
+; CHECK-NEXT: vsaddu.vv v8, v8, v9, v0.t
+; CHECK-NEXT: ret
+ %v = call <4 x i8> @llvm.vp.uadd.sat.v4i8(<4 x i8> %va, <4 x i8> %b, <4 x i1> %m, i32 %evl)
+ ret <4 x i8> %v
+}
+
+define <4 x i8> @vsaddu_vv_v4i8_unmasked(<4 x i8> %va, <4 x i8> %b, i32 zeroext %evl) {
+; CHECK-LABEL: vsaddu_vv_v4i8_unmasked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e8, mf4, ta, ma
+; CHECK-NEXT: vsaddu.vv v8, v8, v9
+; CHECK-NEXT: ret
+ %head = insertelement <4 x i1> poison, i1 true, i32 0
+ %m = shufflevector <4 x i1> %head, <4 x i1> poison, <4 x i32> zeroinitializer
+ %v = call <4 x i8> @llvm.vp.uadd.sat.v4i8(<4 x i8> %va, <4 x i8> %b, <4 x i1> %m, i32 %evl)
+ ret <4 x i8> %v
+}
+
+define <4 x i8> @vsaddu_vx_v4i8(<4 x i8> %va, i8 %b, <4 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vsaddu_vx_v4i8:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, ma
+; CHECK-NEXT: vsaddu.vx v8, v8, a0, v0.t
+; CHECK-NEXT: ret
+ %elt.head = insertelement <4 x i8> poison, i8 %b, i32 0
+ %vb = shufflevector <4 x i8> %elt.head, <4 x i8> poison, <4 x i32> zeroinitializer
+ %v = call <4 x i8> @llvm.vp.uadd.sat.v4i8(<4 x i8> %va, <4 x i8> %vb, <4 x i1> %m, i32 %evl)
+ ret <4 x i8> %v
+}
+
+define <4 x i8> @vsaddu_vx_v4i8_commute(<4 x i8> %va, i8 %b, <4 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vsaddu_vx_v4i8_commute:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, ma
+; CHECK-NEXT: vsaddu.vx v8, v8, a0, v0.t
+; CHECK-NEXT: ret
+ %elt.head = insertelement <4 x i8> poison, i8 %b, i32 0
+ %vb = shufflevector <4 x i8> %elt.head, <4 x i8> poison, <4 x i32> zeroinitializer
+ %v = call <4 x i8> @llvm.vp.uadd.sat.v4i8(<4 x i8> %vb, <4 x i8> %va, <4 x i1> %m, i32 %evl)
+ ret <4 x i8> %v
+}
+
+define <4 x i8> @vsaddu_vx_v4i8_unmasked(<4 x i8> %va, i8 %b, i32 zeroext %evl) {
+; CHECK-LABEL: vsaddu_vx_v4i8_unmasked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, ma
+; CHECK-NEXT: vsaddu.vx v8, v8, a0
+; CHECK-NEXT: ret
+ %elt.head = insertelement <4 x i8> poison, i8 %b, i32 0
+ %vb = shufflevector <4 x i8> %elt.head, <4 x i8> poison, <4 x i32> zeroinitializer
+ %head = insertelement <4 x i1> poison, i1 true, i32 0
+ %m = shufflevector <4 x i1> %head, <4 x i1> poison, <4 x i32> zeroinitializer
+ %v = call <4 x i8> @llvm.vp.uadd.sat.v4i8(<4 x i8> %va, <4 x i8> %vb, <4 x i1> %m, i32 %evl)
+ ret <4 x i8> %v
+}
+
+define <4 x i8> @vsaddu_vi_v4i8(<4 x i8> %va, <4 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vsaddu_vi_v4i8:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e8, mf4, ta, ma
+; CHECK-NEXT: vsaddu.vi v8, v8, -1, v0.t
+; CHECK-NEXT: ret
+ %elt.head = insertelement <4 x i8> poison, i8 -1, i32 0
+ %vb = shufflevector <4 x i8> %elt.head, <4 x i8> poison, <4 x i32> zeroinitializer
+ %v = call <4 x i8> @llvm.vp.uadd.sat.v4i8(<4 x i8> %va, <4 x i8> %vb, <4 x i1> %m, i32 %evl)
+ ret <4 x i8> %v
+}
+
+define <4 x i8> @vsaddu_vi_v4i8_unmasked(<4 x i8> %va, i32 zeroext %evl) {
+; CHECK-LABEL: vsaddu_vi_v4i8_unmasked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e8, mf4, ta, ma
+; CHECK-NEXT: vsaddu.vi v8, v8, -1
+; CHECK-NEXT: ret
+ %elt.head = insertelement <4 x i8> poison, i8 -1, i32 0
+ %vb = shufflevector <4 x i8> %elt.head, <4 x i8> poison, <4 x i32> zeroinitializer
+ %head = insertelement <4 x i1> poison, i1 true, i32 0
+ %m = shufflevector <4 x i1> %head, <4 x i1> poison, <4 x i32> zeroinitializer
+ %v = call <4 x i8> @llvm.vp.uadd.sat.v4i8(<4 x i8> %va, <4 x i8> %vb, <4 x i1> %m, i32 %evl)
+ ret <4 x i8> %v
+}
+
+declare <5 x i8> @llvm.vp.uadd.sat.v5i8(<5 x i8>, <5 x i8>, <5 x i1>, i32)
+
+define <5 x i8> @vsaddu_vv_v5i8(<5 x i8> %va, <5 x i8> %b, <5 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vsaddu_vv_v5i8:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e8, mf2, ta, ma
+; CHECK-NEXT: vsaddu.vv v8, v8, v9, v0.t
+; CHECK-NEXT: ret
+ %v = call <5 x i8> @llvm.vp.uadd.sat.v5i8(<5 x i8> %va, <5 x i8> %b, <5 x i1> %m, i32 %evl)
+ ret <5 x i8> %v
+}
+
+define <5 x i8> @vsaddu_vv_v5i8_unmasked(<5 x i8> %va, <5 x i8> %b, i32 zeroext %evl) {
+; CHECK-LABEL: vsaddu_vv_v5i8_unmasked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e8, mf2, ta, ma
+; CHECK-NEXT: vsaddu.vv v8, v8, v9
+; CHECK-NEXT: ret
+ %head = insertelement <5 x i1> poison, i1 true, i32 0
+ %m = shufflevector <5 x i1> %head, <5 x i1> poison, <5 x i32> zeroinitializer
+ %v = call <5 x i8> @llvm.vp.uadd.sat.v5i8(<5 x i8> %va, <5 x i8> %b, <5 x i1> %m, i32 %evl)
+ ret <5 x i8> %v
+}
+
+define <5 x i8> @vsaddu_vx_v5i8(<5 x i8> %va, i8 %b, <5 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vsaddu_vx_v5i8:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma
+; CHECK-NEXT: vsaddu.vx v8, v8, a0, v0.t
+; CHECK-NEXT: ret
+ %elt.head = insertelement <5 x i8> poison, i8 %b, i32 0
+ %vb = shufflevector <5 x i8> %elt.head, <5 x i8> poison, <5 x i32> zeroinitializer
+ %v = call <5 x i8> @llvm.vp.uadd.sat.v5i8(<5 x i8> %va, <5 x i8> %vb, <5 x i1> %m, i32 %evl)
+ ret <5 x i8> %v
+}
+
+define <5 x i8> @vsaddu_vx_v5i8_unmasked(<5 x i8> %va, i8 %b, i32 zeroext %evl) {
+; CHECK-LABEL: vsaddu_vx_v5i8_unmasked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma
+; CHECK-NEXT: vsaddu.vx v8, v8, a0
+; CHECK-NEXT: ret
+ %elt.head = insertelement <5 x i8> poison, i8 %b, i32 0
+ %vb = shufflevector <5 x i8> %elt.head, <5 x i8> poison, <5 x i32> zeroinitializer
+ %head = insertelement <5 x i1> poison, i1 true, i32 0
+ %m = shufflevector <5 x i1> %head, <5 x i1> poison, <5 x i32> zeroinitializer
+ %v = call <5 x i8> @llvm.vp.uadd.sat.v5i8(<5 x i8> %va, <5 x i8> %vb, <5 x i1> %m, i32 %evl)
+ ret <5 x i8> %v
+}
+
+define <5 x i8> @vsaddu_vi_v5i8(<5 x i8> %va, <5 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vsaddu_vi_v5i8:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e8, mf2, ta, ma
+; CHECK-NEXT: vsaddu.vi v8, v8, -1, v0.t
+; CHECK-NEXT: ret
+ %elt.head = insertelement <5 x i8> poison, i8 -1, i32 0
+ %vb = shufflevector <5 x i8> %elt.head, <5 x i8> poison, <5 x i32> zeroinitializer
+ %v = call <5 x i8> @llvm.vp.uadd.sat.v5i8(<5 x i8> %va, <5 x i8> %vb, <5 x i1> %m, i32 %evl)
+ ret <5 x i8> %v
+}
+
+define <5 x i8> @vsaddu_vi_v5i8_unmasked(<5 x i8> %va, i32 zeroext %evl) {
+; CHECK-LABEL: vsaddu_vi_v5i8_unmasked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e8, mf2, ta, ma
+; CHECK-NEXT: vsaddu.vi v8, v8, -1
+; CHECK-NEXT: ret
+ %elt.head = insertelement <5 x i8> poison, i8 -1, i32 0
+ %vb = shufflevector <5 x i8> %elt.head, <5 x i8> poison, <5 x i32> zeroinitializer
+ %head = insertelement <5 x i1> poison, i1 true, i32 0
+ %m = shufflevector <5 x i1> %head, <5 x i1> poison, <5 x i32> zeroinitializer
+ %v = call <5 x i8> @llvm.vp.uadd.sat.v5i8(<5 x i8> %va, <5 x i8> %vb, <5 x i1> %m, i32 %evl)
+ ret <5 x i8> %v
+}
+
+declare <8 x i8> @llvm.vp.uadd.sat.v8i8(<8 x i8>, <8 x i8>, <8 x i1>, i32)
+
+define <8 x i8> @vsaddu_vv_v8i8(<8 x i8> %va, <8 x i8> %b, <8 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vsaddu_vv_v8i8:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e8, mf2, ta, ma
+; CHECK-NEXT: vsaddu.vv v8, v8, v9, v0.t
+; CHECK-NEXT: ret
+ %v = call <8 x i8> @llvm.vp.uadd.sat.v8i8(<8 x i8> %va, <8 x i8> %b, <8 x i1> %m, i32 %evl)
+ ret <8 x i8> %v
+}
+
+define <8 x i8> @vsaddu_vv_v8i8_unmasked(<8 x i8> %va, <8 x i8> %b, i32 zeroext %evl) {
+; CHECK-LABEL: vsaddu_vv_v8i8_unmasked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e8, mf2, ta, ma
+; CHECK-NEXT: vsaddu.vv v8, v8, v9
+; CHECK-NEXT: ret
+ %head = insertelement <8 x i1> poison, i1 true, i32 0
+ %m = shufflevector <8 x i1> %head, <8 x i1> poison, <8 x i32> zeroinitializer
+ %v = call <8 x i8> @llvm.vp.uadd.sat.v8i8(<8 x i8> %va, <8 x i8> %b, <8 x i1> %m, i32 %evl)
+ ret <8 x i8> %v
+}
+
+define <8 x i8> @vsaddu_vx_v8i8(<8 x i8> %va, i8 %b, <8 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vsaddu_vx_v8i8:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma
+; CHECK-NEXT: vsaddu.vx v8, v8, a0, v0.t
+; CHECK-NEXT: ret
+ %elt.head = insertelement <8 x i8> poison, i8 %b, i32 0
+ %vb = shufflevector <8 x i8> %elt.head, <8 x i8> poison, <8 x i32> zeroinitializer
+ %v = call <8 x i8> @llvm.vp.uadd.sat.v8i8(<8 x i8> %va, <8 x i8> %vb, <8 x i1> %m, i32 %evl)
+ ret <8 x i8> %v
+}
+
+define <8 x i8> @vsaddu_vx_v8i8_unmasked(<8 x i8> %va, i8 %b, i32 zeroext %evl) {
+; CHECK-LABEL: vsaddu_vx_v8i8_unmasked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma
+; CHECK-NEXT: vsaddu.vx v8, v8, a0
+; CHECK-NEXT: ret
+ %elt.head = insertelement <8 x i8> poison, i8 %b, i32 0
+ %vb = shufflevector <8 x i8> %elt.head, <8 x i8> poison, <8 x i32> zeroinitializer
+ %head = insertelement <8 x i1> poison, i1 true, i32 0
+ %m = shufflevector <8 x i1> %head, <8 x i1> poison, <8 x i32> zeroinitializer
+ %v = call <8 x i8> @llvm.vp.uadd.sat.v8i8(<8 x i8> %va, <8 x i8> %vb, <8 x i1> %m, i32 %evl)
+ ret <8 x i8> %v
+}
+
+define <8 x i8> @vsaddu_vi_v8i8(<8 x i8> %va, <8 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vsaddu_vi_v8i8:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e8, mf2, ta, ma
+; CHECK-NEXT: vsaddu.vi v8, v8, -1, v0.t
+; CHECK-NEXT: ret
+ %elt.head = insertelement <8 x i8> poison, i8 -1, i32 0
+ %vb = shufflevector <8 x i8> %elt.head, <8 x i8> poison, <8 x i32> zeroinitializer
+ %v = call <8 x i8> @llvm.vp.uadd.sat.v8i8(<8 x i8> %va, <8 x i8> %vb, <8 x i1> %m, i32 %evl)
+ ret <8 x i8> %v
+}
+
+define <8 x i8> @vsaddu_vi_v8i8_unmasked(<8 x i8> %va, i32 zeroext %evl) {
+; CHECK-LABEL: vsaddu_vi_v8i8_unmasked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e8, mf2, ta, ma
+; CHECK-NEXT: vsaddu.vi v8, v8, -1
+; CHECK-NEXT: ret
+ %elt.head = insertelement <8 x i8> poison, i8 -1, i32 0
+ %vb = shufflevector <8 x i8> %elt.head, <8 x i8> poison, <8 x i32> zeroinitializer
+ %head = insertelement <8 x i1> poison, i1 true, i32 0
+ %m = shufflevector <8 x i1> %head, <8 x i1> poison, <8 x i32> zeroinitializer
+ %v = call <8 x i8> @llvm.vp.uadd.sat.v8i8(<8 x i8> %va, <8 x i8> %vb, <8 x i1> %m, i32 %evl)
+ ret <8 x i8> %v
+}
+
+declare <16 x i8> @llvm.vp.uadd.sat.v16i8(<16 x i8>, <16 x i8>, <16 x i1>, i32)
+
+define <16 x i8> @vsaddu_vv_v16i8(<16 x i8> %va, <16 x i8> %b, <16 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vsaddu_vv_v16i8:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e8, m1, ta, ma
+; CHECK-NEXT: vsaddu.vv v8, v8, v9, v0.t
+; CHECK-NEXT: ret
+ %v = call <16 x i8> @llvm.vp.uadd.sat.v16i8(<16 x i8> %va, <16 x i8> %b, <16 x i1> %m, i32 %evl)
+ ret <16 x i8> %v
+}
+
+define <16 x i8> @vsaddu_vv_v16i8_unmasked(<16 x i8> %va, <16 x i8> %b, i32 zeroext %evl) {
+; CHECK-LABEL: vsaddu_vv_v16i8_unmasked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e8, m1, ta, ma
+; CHECK-NEXT: vsaddu.vv v8, v8, v9
+; CHECK-NEXT: ret
+ %head = insertelement <16 x i1> poison, i1 true, i32 0
+ %m = shufflevector <16 x i1> %head, <16 x i1> poison, <16 x i32> zeroinitializer
+ %v = call <16 x i8> @llvm.vp.uadd.sat.v16i8(<16 x i8> %va, <16 x i8> %b, <16 x i1> %m, i32 %evl)
+ ret <16 x i8> %v
+}
+
+define <16 x i8> @vsaddu_vx_v16i8(<16 x i8> %va, i8 %b, <16 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vsaddu_vx_v16i8:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, ma
+; CHECK-NEXT: vsaddu.vx v8, v8, a0, v0.t
+; CHECK-NEXT: ret
+ %elt.head = insertelement <16 x i8> poison, i8 %b, i32 0
+ %vb = shufflevector <16 x i8> %elt.head, <16 x i8> poison, <16 x i32> zeroinitializer
+ %v = call <16 x i8> @llvm.vp.uadd.sat.v16i8(<16 x i8> %va, <16 x i8> %vb, <16 x i1> %m, i32 %evl)
+ ret <16 x i8> %v
+}
+
+define <16 x i8> @vsaddu_vx_v16i8_unmasked(<16 x i8> %va, i8 %b, i32 zeroext %evl) {
+; CHECK-LABEL: vsaddu_vx_v16i8_unmasked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, ma
+; CHECK-NEXT: vsaddu.vx v8, v8, a0
+; CHECK-NEXT: ret
+ %elt.head = insertelement <16 x i8> poison, i8 %b, i32 0
+ %vb = shufflevector <16 x i8> %elt.head, <16 x i8> poison, <16 x i32> zeroinitializer
+ %head = insertelement <16 x i1> poison, i1 true, i32 0
+ %m = shufflevector <16 x i1> %head, <16 x i1> poison, <16 x i32> zeroinitializer
+ %v = call <16 x i8> @llvm.vp.uadd.sat.v16i8(<16 x i8> %va, <16 x i8> %vb, <16 x i1> %m, i32 %evl)
+ ret <16 x i8> %v
+}
+
+define <16 x i8> @vsaddu_vi_v16i8(<16 x i8> %va, <16 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vsaddu_vi_v16i8:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e8, m1, ta, ma
+; CHECK-NEXT: vsaddu.vi v8, v8, -1, v0.t
+; CHECK-NEXT: ret
+ %elt.head = insertelement <16 x i8> poison, i8 -1, i32 0
+ %vb = shufflevector <16 x i8> %elt.head, <16 x i8> poison, <16 x i32> zeroinitializer
+ %v = call <16 x i8> @llvm.vp.uadd.sat.v16i8(<16 x i8> %va, <16 x i8> %vb, <16 x i1> %m, i32 %evl)
+ ret <16 x i8> %v
+}
+
+define <16 x i8> @vsaddu_vi_v16i8_unmasked(<16 x i8> %va, i32 zeroext %evl) {
+; CHECK-LABEL: vsaddu_vi_v16i8_unmasked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e8, m1, ta, ma
+; CHECK-NEXT: vsaddu.vi v8, v8, -1
+; CHECK-NEXT: ret
+ %elt.head = insertelement <16 x i8> poison, i8 -1, i32 0
+ %vb = shufflevector <16 x i8> %elt.head, <16 x i8> poison, <16 x i32> zeroinitializer
+ %head = insertelement <16 x i1> poison, i1 true, i32 0
+ %m = shufflevector <16 x i1> %head, <16 x i1> poison, <16 x i32> zeroinitializer
+ %v = call <16 x i8> @llvm.vp.uadd.sat.v16i8(<16 x i8> %va, <16 x i8> %vb, <16 x i1> %m, i32 %evl)
+ ret <16 x i8> %v
+}
+
+declare <256 x i8> @llvm.vp.uadd.sat.v258i8(<256 x i8>, <256 x i8>, <256 x i1>, i32)
+
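+; The <256 x i1> mask is split in two: v0 covers the low 128 elements, and the
+; mask for the high elements is loaded with vlm from the address in a0.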
+define <256 x i8> @vsaddu_vi_v258i8(<256 x i8> %va, <256 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vsaddu_vi_v258i8:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmv1r.v v24, v0
+; CHECK-NEXT: li a2, 128
+; CHECK-NEXT: vsetvli zero, a2, e8, m8, ta, ma
+; CHECK-NEXT: vlm.v v0, (a0)
+; CHECK-NEXT: addi a0, a1, -128
+; CHECK-NEXT: sltu a3, a1, a0
+; CHECK-NEXT: addi a3, a3, -1
+; CHECK-NEXT: and a0, a3, a0
+; CHECK-NEXT: vsetvli zero, a0, e8, m8, ta, ma
+; CHECK-NEXT: vsaddu.vi v16, v16, -1, v0.t
+; CHECK-NEXT: bltu a1, a2, .LBB32_2
+; CHECK-NEXT: # %bb.1:
+; CHECK-NEXT: li a1, 128
+; CHECK-NEXT: .LBB32_2:
+; CHECK-NEXT: vsetvli zero, a1, e8, m8, ta, ma
+; CHECK-NEXT: vmv1r.v v0, v24
+; CHECK-NEXT: vsaddu.vi v8, v8, -1, v0.t
+; CHECK-NEXT: ret
+ %elt.head = insertelement <256 x i8> poison, i8 -1, i32 0
+ %vb = shufflevector <256 x i8> %elt.head, <256 x i8> poison, <256 x i32> zeroinitializer
+ %v = call <256 x i8> @llvm.vp.uadd.sat.v258i8(<256 x i8> %va, <256 x i8> %vb, <256 x i1> %m, i32 %evl)
+ ret <256 x i8> %v
+}
+
+define <256 x i8> @vsaddu_vi_v258i8_unmasked(<256 x i8> %va, i32 zeroext %evl) {
+; CHECK-LABEL: vsaddu_vi_v258i8_unmasked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: li a2, 128
+; CHECK-NEXT: mv a1, a0
+; CHECK-NEXT: bltu a0, a2, .LBB33_2
+; CHECK-NEXT: # %bb.1:
+; CHECK-NEXT: li a1, 128
+; CHECK-NEXT: .LBB33_2:
+; CHECK-NEXT: vsetvli zero, a1, e8, m8, ta, ma
+; CHECK-NEXT: vsaddu.vi v8, v8, -1
+; CHECK-NEXT: addi a1, a0, -128
+; CHECK-NEXT: sltu a0, a0, a1
+; CHECK-NEXT: addi a0, a0, -1
+; CHECK-NEXT: and a0, a0, a1
+; CHECK-NEXT: vsetvli zero, a0, e8, m8, ta, ma
+; CHECK-NEXT: vsaddu.vi v16, v16, -1
+; CHECK-NEXT: ret
+ %elt.head = insertelement <256 x i8> poison, i8 -1, i32 0
+ %vb = shufflevector <256 x i8> %elt.head, <256 x i8> poison, <256 x i32> zeroinitializer
+ %head = insertelement <256 x i1> poison, i1 true, i32 0
+ %m = shufflevector <256 x i1> %head, <256 x i1> poison, <256 x i32> zeroinitializer
+ %v = call <256 x i8> @llvm.vp.uadd.sat.v258i8(<256 x i8> %va, <256 x i8> %vb, <256 x i1> %m, i32 %evl)
+ ret <256 x i8> %v
+}
+
+; Test splitting when the %evl is a known constant.
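+; With evl=129 the low <128 x i8> half gets an AVL of 128 and the high half an
+; AVL of 1.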
+
+define <256 x i8> @vsaddu_vi_v258i8_evl129(<256 x i8> %va, <256 x i1> %m) {
+; CHECK-LABEL: vsaddu_vi_v258i8_evl129:
+; CHECK: # %bb.0:
+; CHECK-NEXT: li a1, 128
+; CHECK-NEXT: vsetvli zero, a1, e8, m8, ta, ma
+; CHECK-NEXT: vlm.v v24, (a0)
+; CHECK-NEXT: vsaddu.vi v8, v8, -1, v0.t
+; CHECK-NEXT: vsetivli zero, 1, e8, m8, ta, ma
+; CHECK-NEXT: vmv1r.v v0, v24
+; CHECK-NEXT: vsaddu.vi v16, v16, -1, v0.t
+; CHECK-NEXT: ret
+ %elt.head = insertelement <256 x i8> poison, i8 -1, i32 0
+ %vb = shufflevector <256 x i8> %elt.head, <256 x i8> poison, <256 x i32> zeroinitializer
+ %v = call <256 x i8> @llvm.vp.uadd.sat.v258i8(<256 x i8> %va, <256 x i8> %vb, <256 x i1> %m, i32 129)
+ ret <256 x i8> %v
+}
+
+; FIXME: The upper half is doing nothing.
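+; With evl=128 the high half is given an AVL of 0, so its vsaddu.vi updates no
+; elements and could be omitted.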
+
+define <256 x i8> @vsaddu_vi_v258i8_evl128(<256 x i8> %va, <256 x i1> %m) {
+; CHECK-LABEL: vsaddu_vi_v258i8_evl128:
+; CHECK: # %bb.0:
+; CHECK-NEXT: li a1, 128
+; CHECK-NEXT: vsetvli zero, a1, e8, m8, ta, ma
+; CHECK-NEXT: vlm.v v24, (a0)
+; CHECK-NEXT: vsaddu.vi v8, v8, -1, v0.t
+; CHECK-NEXT: vsetivli zero, 0, e8, m8, ta, ma
+; CHECK-NEXT: vmv1r.v v0, v24
+; CHECK-NEXT: vsaddu.vi v16, v16, -1, v0.t
+; CHECK-NEXT: ret
+ %elt.head = insertelement <256 x i8> poison, i8 -1, i32 0
+ %vb = shufflevector <256 x i8> %elt.head, <256 x i8> poison, <256 x i32> zeroinitializer
+ %v = call <256 x i8> @llvm.vp.uadd.sat.v258i8(<256 x i8> %va, <256 x i8> %vb, <256 x i1> %m, i32 128)
+ ret <256 x i8> %v
+}
+
+declare <2 x i16> @llvm.vp.uadd.sat.v2i16(<2 x i16>, <2 x i16>, <2 x i1>, i32)
+
+define <2 x i16> @vsaddu_vv_v2i16(<2 x i16> %va, <2 x i16> %b, <2 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vsaddu_vv_v2i16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e16, mf4, ta, ma
+; CHECK-NEXT: vsaddu.vv v8, v8, v9, v0.t
+; CHECK-NEXT: ret
+ %v = call <2 x i16> @llvm.vp.uadd.sat.v2i16(<2 x i16> %va, <2 x i16> %b, <2 x i1> %m, i32 %evl)
+ ret <2 x i16> %v
+}
+
+define <2 x i16> @vsaddu_vv_v2i16_unmasked(<2 x i16> %va, <2 x i16> %b, i32 zeroext %evl) {
+; CHECK-LABEL: vsaddu_vv_v2i16_unmasked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e16, mf4, ta, ma
+; CHECK-NEXT: vsaddu.vv v8, v8, v9
+; CHECK-NEXT: ret
+ %head = insertelement <2 x i1> poison, i1 true, i32 0
+ %m = shufflevector <2 x i1> %head, <2 x i1> poison, <2 x i32> zeroinitializer
+ %v = call <2 x i16> @llvm.vp.uadd.sat.v2i16(<2 x i16> %va, <2 x i16> %b, <2 x i1> %m, i32 %evl)
+ ret <2 x i16> %v
+}
+
+define <2 x i16> @vsaddu_vx_v2i16(<2 x i16> %va, i16 %b, <2 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vsaddu_vx_v2i16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma
+; CHECK-NEXT: vsaddu.vx v8, v8, a0, v0.t
+; CHECK-NEXT: ret
+ %elt.head = insertelement <2 x i16> poison, i16 %b, i32 0
+ %vb = shufflevector <2 x i16> %elt.head, <2 x i16> poison, <2 x i32> zeroinitializer
+ %v = call <2 x i16> @llvm.vp.uadd.sat.v2i16(<2 x i16> %va, <2 x i16> %vb, <2 x i1> %m, i32 %evl)
+ ret <2 x i16> %v
+}
+
+define <2 x i16> @vsaddu_vx_v2i16_unmasked(<2 x i16> %va, i16 %b, i32 zeroext %evl) {
+; CHECK-LABEL: vsaddu_vx_v2i16_unmasked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma
+; CHECK-NEXT: vsaddu.vx v8, v8, a0
+; CHECK-NEXT: ret
+ %elt.head = insertelement <2 x i16> poison, i16 %b, i32 0
+ %vb = shufflevector <2 x i16> %elt.head, <2 x i16> poison, <2 x i32> zeroinitializer
+ %head = insertelement <2 x i1> poison, i1 true, i32 0
+ %m = shufflevector <2 x i1> %head, <2 x i1> poison, <2 x i32> zeroinitializer
+ %v = call <2 x i16> @llvm.vp.uadd.sat.v2i16(<2 x i16> %va, <2 x i16> %vb, <2 x i1> %m, i32 %evl)
+ ret <2 x i16> %v
+}
+
+define <2 x i16> @vsaddu_vi_v2i16(<2 x i16> %va, <2 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vsaddu_vi_v2i16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e16, mf4, ta, ma
+; CHECK-NEXT: vsaddu.vi v8, v8, -1, v0.t
+; CHECK-NEXT: ret
+ %elt.head = insertelement <2 x i16> poison, i16 -1, i32 0
+ %vb = shufflevector <2 x i16> %elt.head, <2 x i16> poison, <2 x i32> zeroinitializer
+ %v = call <2 x i16> @llvm.vp.uadd.sat.v2i16(<2 x i16> %va, <2 x i16> %vb, <2 x i1> %m, i32 %evl)
+ ret <2 x i16> %v
+}
+
+define <2 x i16> @vsaddu_vi_v2i16_unmasked(<2 x i16> %va, i32 zeroext %evl) {
+; CHECK-LABEL: vsaddu_vi_v2i16_unmasked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e16, mf4, ta, ma
+; CHECK-NEXT: vsaddu.vi v8, v8, -1
+; CHECK-NEXT: ret
+ %elt.head = insertelement <2 x i16> poison, i16 -1, i32 0
+ %vb = shufflevector <2 x i16> %elt.head, <2 x i16> poison, <2 x i32> zeroinitializer
+ %head = insertelement <2 x i1> poison, i1 true, i32 0
+ %m = shufflevector <2 x i1> %head, <2 x i1> poison, <2 x i32> zeroinitializer
+ %v = call <2 x i16> @llvm.vp.uadd.sat.v2i16(<2 x i16> %va, <2 x i16> %vb, <2 x i1> %m, i32 %evl)
+ ret <2 x i16> %v
+}
+
+declare <4 x i16> @llvm.vp.uadd.sat.v4i16(<4 x i16>, <4 x i16>, <4 x i1>, i32)
+
+define <4 x i16> @vsaddu_vv_v4i16(<4 x i16> %va, <4 x i16> %b, <4 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vsaddu_vv_v4i16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e16, mf2, ta, ma
+; CHECK-NEXT: vsaddu.vv v8, v8, v9, v0.t
+; CHECK-NEXT: ret
+ %v = call <4 x i16> @llvm.vp.uadd.sat.v4i16(<4 x i16> %va, <4 x i16> %b, <4 x i1> %m, i32 %evl)
+ ret <4 x i16> %v
+}
+
+define <4 x i16> @vsaddu_vv_v4i16_unmasked(<4 x i16> %va, <4 x i16> %b, i32 zeroext %evl) {
+; CHECK-LABEL: vsaddu_vv_v4i16_unmasked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e16, mf2, ta, ma
+; CHECK-NEXT: vsaddu.vv v8, v8, v9
+; CHECK-NEXT: ret
+ %head = insertelement <4 x i1> poison, i1 true, i32 0
+ %m = shufflevector <4 x i1> %head, <4 x i1> poison, <4 x i32> zeroinitializer
+ %v = call <4 x i16> @llvm.vp.uadd.sat.v4i16(<4 x i16> %va, <4 x i16> %b, <4 x i1> %m, i32 %evl)
+ ret <4 x i16> %v
+}
+
+define <4 x i16> @vsaddu_vx_v4i16(<4 x i16> %va, i16 %b, <4 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vsaddu_vx_v4i16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma
+; CHECK-NEXT: vsaddu.vx v8, v8, a0, v0.t
+; CHECK-NEXT: ret
+ %elt.head = insertelement <4 x i16> poison, i16 %b, i32 0
+ %vb = shufflevector <4 x i16> %elt.head, <4 x i16> poison, <4 x i32> zeroinitializer
+ %v = call <4 x i16> @llvm.vp.uadd.sat.v4i16(<4 x i16> %va, <4 x i16> %vb, <4 x i1> %m, i32 %evl)
+ ret <4 x i16> %v
+}
+
+define <4 x i16> @vsaddu_vx_v4i16_unmasked(<4 x i16> %va, i16 %b, i32 zeroext %evl) {
+; CHECK-LABEL: vsaddu_vx_v4i16_unmasked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma
+; CHECK-NEXT: vsaddu.vx v8, v8, a0
+; CHECK-NEXT: ret
+ %elt.head = insertelement <4 x i16> poison, i16 %b, i32 0
+ %vb = shufflevector <4 x i16> %elt.head, <4 x i16> poison, <4 x i32> zeroinitializer
+ %head = insertelement <4 x i1> poison, i1 true, i32 0
+ %m = shufflevector <4 x i1> %head, <4 x i1> poison, <4 x i32> zeroinitializer
+ %v = call <4 x i16> @llvm.vp.uadd.sat.v4i16(<4 x i16> %va, <4 x i16> %vb, <4 x i1> %m, i32 %evl)
+ ret <4 x i16> %v
+}
+
+define <4 x i16> @vsaddu_vi_v4i16(<4 x i16> %va, <4 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vsaddu_vi_v4i16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e16, mf2, ta, ma
+; CHECK-NEXT: vsaddu.vi v8, v8, -1, v0.t
+; CHECK-NEXT: ret
+ %elt.head = insertelement <4 x i16> poison, i16 -1, i32 0
+ %vb = shufflevector <4 x i16> %elt.head, <4 x i16> poison, <4 x i32> zeroinitializer
+ %v = call <4 x i16> @llvm.vp.uadd.sat.v4i16(<4 x i16> %va, <4 x i16> %vb, <4 x i1> %m, i32 %evl)
+ ret <4 x i16> %v
+}
+
+define <4 x i16> @vsaddu_vi_v4i16_unmasked(<4 x i16> %va, i32 zeroext %evl) {
+; CHECK-LABEL: vsaddu_vi_v4i16_unmasked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e16, mf2, ta, ma
+; CHECK-NEXT: vsaddu.vi v8, v8, -1
+; CHECK-NEXT: ret
+ %elt.head = insertelement <4 x i16> poison, i16 -1, i32 0
+ %vb = shufflevector <4 x i16> %elt.head, <4 x i16> poison, <4 x i32> zeroinitializer
+ %head = insertelement <4 x i1> poison, i1 true, i32 0
+ %m = shufflevector <4 x i1> %head, <4 x i1> poison, <4 x i32> zeroinitializer
+ %v = call <4 x i16> @llvm.vp.uadd.sat.v4i16(<4 x i16> %va, <4 x i16> %vb, <4 x i1> %m, i32 %evl)
+ ret <4 x i16> %v
+}
+
+declare <8 x i16> @llvm.vp.uadd.sat.v8i16(<8 x i16>, <8 x i16>, <8 x i1>, i32)
+
+define <8 x i16> @vsaddu_vv_v8i16(<8 x i16> %va, <8 x i16> %b, <8 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vsaddu_vv_v8i16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, ma
+; CHECK-NEXT: vsaddu.vv v8, v8, v9, v0.t
+; CHECK-NEXT: ret
+ %v = call <8 x i16> @llvm.vp.uadd.sat.v8i16(<8 x i16> %va, <8 x i16> %b, <8 x i1> %m, i32 %evl)
+ ret <8 x i16> %v
+}
+
+define <8 x i16> @vsaddu_vv_v8i16_unmasked(<8 x i16> %va, <8 x i16> %b, i32 zeroext %evl) {
+; CHECK-LABEL: vsaddu_vv_v8i16_unmasked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, ma
+; CHECK-NEXT: vsaddu.vv v8, v8, v9
+; CHECK-NEXT: ret
+ %head = insertelement <8 x i1> poison, i1 true, i32 0
+ %m = shufflevector <8 x i1> %head, <8 x i1> poison, <8 x i32> zeroinitializer
+ %v = call <8 x i16> @llvm.vp.uadd.sat.v8i16(<8 x i16> %va, <8 x i16> %b, <8 x i1> %m, i32 %evl)
+ ret <8 x i16> %v
+}
+
+define <8 x i16> @vsaddu_vx_v8i16(<8 x i16> %va, i16 %b, <8 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vsaddu_vx_v8i16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma
+; CHECK-NEXT: vsaddu.vx v8, v8, a0, v0.t
+; CHECK-NEXT: ret
+ %elt.head = insertelement <8 x i16> poison, i16 %b, i32 0
+ %vb = shufflevector <8 x i16> %elt.head, <8 x i16> poison, <8 x i32> zeroinitializer
+ %v = call <8 x i16> @llvm.vp.uadd.sat.v8i16(<8 x i16> %va, <8 x i16> %vb, <8 x i1> %m, i32 %evl)
+ ret <8 x i16> %v
+}
+
+define <8 x i16> @vsaddu_vx_v8i16_unmasked(<8 x i16> %va, i16 %b, i32 zeroext %evl) {
+; CHECK-LABEL: vsaddu_vx_v8i16_unmasked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma
+; CHECK-NEXT: vsaddu.vx v8, v8, a0
+; CHECK-NEXT: ret
+ %elt.head = insertelement <8 x i16> poison, i16 %b, i32 0
+ %vb = shufflevector <8 x i16> %elt.head, <8 x i16> poison, <8 x i32> zeroinitializer
+ %head = insertelement <8 x i1> poison, i1 true, i32 0
+ %m = shufflevector <8 x i1> %head, <8 x i1> poison, <8 x i32> zeroinitializer
+ %v = call <8 x i16> @llvm.vp.uadd.sat.v8i16(<8 x i16> %va, <8 x i16> %vb, <8 x i1> %m, i32 %evl)
+ ret <8 x i16> %v
+}
+
+define <8 x i16> @vsaddu_vi_v8i16(<8 x i16> %va, <8 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vsaddu_vi_v8i16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, ma
+; CHECK-NEXT: vsaddu.vi v8, v8, -1, v0.t
+; CHECK-NEXT: ret
+ %elt.head = insertelement <8 x i16> poison, i16 -1, i32 0
+ %vb = shufflevector <8 x i16> %elt.head, <8 x i16> poison, <8 x i32> zeroinitializer
+ %v = call <8 x i16> @llvm.vp.uadd.sat.v8i16(<8 x i16> %va, <8 x i16> %vb, <8 x i1> %m, i32 %evl)
+ ret <8 x i16> %v
+}
+
+define <8 x i16> @vsaddu_vi_v8i16_unmasked(<8 x i16> %va, i32 zeroext %evl) {
+; CHECK-LABEL: vsaddu_vi_v8i16_unmasked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, ma
+; CHECK-NEXT: vsaddu.vi v8, v8, -1
+; CHECK-NEXT: ret
+ %elt.head = insertelement <8 x i16> poison, i16 -1, i32 0
+ %vb = shufflevector <8 x i16> %elt.head, <8 x i16> poison, <8 x i32> zeroinitializer
+ %head = insertelement <8 x i1> poison, i1 true, i32 0
+ %m = shufflevector <8 x i1> %head, <8 x i1> poison, <8 x i32> zeroinitializer
+ %v = call <8 x i16> @llvm.vp.uadd.sat.v8i16(<8 x i16> %va, <8 x i16> %vb, <8 x i1> %m, i32 %evl)
+ ret <8 x i16> %v
+}
+
+declare <16 x i16> @llvm.vp.uadd.sat.v16i16(<16 x i16>, <16 x i16>, <16 x i1>, i32)
+
+define <16 x i16> @vsaddu_vv_v16i16(<16 x i16> %va, <16 x i16> %b, <16 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vsaddu_vv_v16i16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e16, m2, ta, ma
+; CHECK-NEXT: vsaddu.vv v8, v8, v10, v0.t
+; CHECK-NEXT: ret
+ %v = call <16 x i16> @llvm.vp.uadd.sat.v16i16(<16 x i16> %va, <16 x i16> %b, <16 x i1> %m, i32 %evl)
+ ret <16 x i16> %v
+}
+
+define <16 x i16> @vsaddu_vv_v16i16_unmasked(<16 x i16> %va, <16 x i16> %b, i32 zeroext %evl) {
+; CHECK-LABEL: vsaddu_vv_v16i16_unmasked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e16, m2, ta, ma
+; CHECK-NEXT: vsaddu.vv v8, v8, v10
+; CHECK-NEXT: ret
+ %head = insertelement <16 x i1> poison, i1 true, i32 0
+ %m = shufflevector <16 x i1> %head, <16 x i1> poison, <16 x i32> zeroinitializer
+ %v = call <16 x i16> @llvm.vp.uadd.sat.v16i16(<16 x i16> %va, <16 x i16> %b, <16 x i1> %m, i32 %evl)
+ ret <16 x i16> %v
+}
+
+define <16 x i16> @vsaddu_vx_v16i16(<16 x i16> %va, i16 %b, <16 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vsaddu_vx_v16i16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, ma
+; CHECK-NEXT: vsaddu.vx v8, v8, a0, v0.t
+; CHECK-NEXT: ret
+ %elt.head = insertelement <16 x i16> poison, i16 %b, i32 0
+ %vb = shufflevector <16 x i16> %elt.head, <16 x i16> poison, <16 x i32> zeroinitializer
+ %v = call <16 x i16> @llvm.vp.uadd.sat.v16i16(<16 x i16> %va, <16 x i16> %vb, <16 x i1> %m, i32 %evl)
+ ret <16 x i16> %v
+}
+
+define <16 x i16> @vsaddu_vx_v16i16_unmasked(<16 x i16> %va, i16 %b, i32 zeroext %evl) {
+; CHECK-LABEL: vsaddu_vx_v16i16_unmasked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, ma
+; CHECK-NEXT: vsaddu.vx v8, v8, a0
+; CHECK-NEXT: ret
+ %elt.head = insertelement <16 x i16> poison, i16 %b, i32 0
+ %vb = shufflevector <16 x i16> %elt.head, <16 x i16> poison, <16 x i32> zeroinitializer
+ %head = insertelement <16 x i1> poison, i1 true, i32 0
+ %m = shufflevector <16 x i1> %head, <16 x i1> poison, <16 x i32> zeroinitializer
+ %v = call <16 x i16> @llvm.vp.uadd.sat.v16i16(<16 x i16> %va, <16 x i16> %vb, <16 x i1> %m, i32 %evl)
+ ret <16 x i16> %v
+}
+
+define <16 x i16> @vsaddu_vi_v16i16(<16 x i16> %va, <16 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vsaddu_vi_v16i16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e16, m2, ta, ma
+; CHECK-NEXT: vsaddu.vi v8, v8, -1, v0.t
+; CHECK-NEXT: ret
+ %elt.head = insertelement <16 x i16> poison, i16 -1, i32 0
+ %vb = shufflevector <16 x i16> %elt.head, <16 x i16> poison, <16 x i32> zeroinitializer
+ %v = call <16 x i16> @llvm.vp.uadd.sat.v16i16(<16 x i16> %va, <16 x i16> %vb, <16 x i1> %m, i32 %evl)
+ ret <16 x i16> %v
+}
+
+define <16 x i16> @vsaddu_vi_v16i16_unmasked(<16 x i16> %va, i32 zeroext %evl) {
+; CHECK-LABEL: vsaddu_vi_v16i16_unmasked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e16, m2, ta, ma
+; CHECK-NEXT: vsaddu.vi v8, v8, -1
+; CHECK-NEXT: ret
+ %elt.head = insertelement <16 x i16> poison, i16 -1, i32 0
+ %vb = shufflevector <16 x i16> %elt.head, <16 x i16> poison, <16 x i32> zeroinitializer
+ %head = insertelement <16 x i1> poison, i1 true, i32 0
+ %m = shufflevector <16 x i1> %head, <16 x i1> poison, <16 x i32> zeroinitializer
+ %v = call <16 x i16> @llvm.vp.uadd.sat.v16i16(<16 x i16> %va, <16 x i16> %vb, <16 x i1> %m, i32 %evl)
+ ret <16 x i16> %v
+}
+
+declare <2 x i32> @llvm.vp.uadd.sat.v2i32(<2 x i32>, <2 x i32>, <2 x i1>, i32)
+
+define <2 x i32> @vsaddu_vv_v2i32(<2 x i32> %va, <2 x i32> %b, <2 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vsaddu_vv_v2i32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, ma
+; CHECK-NEXT: vsaddu.vv v8, v8, v9, v0.t
+; CHECK-NEXT: ret
+ %v = call <2 x i32> @llvm.vp.uadd.sat.v2i32(<2 x i32> %va, <2 x i32> %b, <2 x i1> %m, i32 %evl)
+ ret <2 x i32> %v
+}
+
+define <2 x i32> @vsaddu_vv_v2i32_unmasked(<2 x i32> %va, <2 x i32> %b, i32 zeroext %evl) {
+; CHECK-LABEL: vsaddu_vv_v2i32_unmasked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, ma
+; CHECK-NEXT: vsaddu.vv v8, v8, v9
+; CHECK-NEXT: ret
+ %head = insertelement <2 x i1> poison, i1 true, i32 0
+ %m = shufflevector <2 x i1> %head, <2 x i1> poison, <2 x i32> zeroinitializer
+ %v = call <2 x i32> @llvm.vp.uadd.sat.v2i32(<2 x i32> %va, <2 x i32> %b, <2 x i1> %m, i32 %evl)
+ ret <2 x i32> %v
+}
+
+define <2 x i32> @vsaddu_vx_v2i32(<2 x i32> %va, i32 %b, <2 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vsaddu_vx_v2i32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma
+; CHECK-NEXT: vsaddu.vx v8, v8, a0, v0.t
+; CHECK-NEXT: ret
+ %elt.head = insertelement <2 x i32> poison, i32 %b, i32 0
+ %vb = shufflevector <2 x i32> %elt.head, <2 x i32> poison, <2 x i32> zeroinitializer
+ %v = call <2 x i32> @llvm.vp.uadd.sat.v2i32(<2 x i32> %va, <2 x i32> %vb, <2 x i1> %m, i32 %evl)
+ ret <2 x i32> %v
+}
+
+define <2 x i32> @vsaddu_vx_v2i32_unmasked(<2 x i32> %va, i32 %b, i32 zeroext %evl) {
+; CHECK-LABEL: vsaddu_vx_v2i32_unmasked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma
+; CHECK-NEXT: vsaddu.vx v8, v8, a0
+; CHECK-NEXT: ret
+ %elt.head = insertelement <2 x i32> poison, i32 %b, i32 0
+ %vb = shufflevector <2 x i32> %elt.head, <2 x i32> poison, <2 x i32> zeroinitializer
+ %head = insertelement <2 x i1> poison, i1 true, i32 0
+ %m = shufflevector <2 x i1> %head, <2 x i1> poison, <2 x i32> zeroinitializer
+ %v = call <2 x i32> @llvm.vp.uadd.sat.v2i32(<2 x i32> %va, <2 x i32> %vb, <2 x i1> %m, i32 %evl)
+ ret <2 x i32> %v
+}
+
+define <2 x i32> @vsaddu_vi_v2i32(<2 x i32> %va, <2 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vsaddu_vi_v2i32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, ma
+; CHECK-NEXT: vsaddu.vi v8, v8, -1, v0.t
+; CHECK-NEXT: ret
+ %elt.head = insertelement <2 x i32> poison, i32 -1, i32 0
+ %vb = shufflevector <2 x i32> %elt.head, <2 x i32> poison, <2 x i32> zeroinitializer
+ %v = call <2 x i32> @llvm.vp.uadd.sat.v2i32(<2 x i32> %va, <2 x i32> %vb, <2 x i1> %m, i32 %evl)
+ ret <2 x i32> %v
+}
+
+define <2 x i32> @vsaddu_vi_v2i32_unmasked(<2 x i32> %va, i32 zeroext %evl) {
+; CHECK-LABEL: vsaddu_vi_v2i32_unmasked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, ma
+; CHECK-NEXT: vsaddu.vi v8, v8, -1
+; CHECK-NEXT: ret
+ %elt.head = insertelement <2 x i32> poison, i32 -1, i32 0
+ %vb = shufflevector <2 x i32> %elt.head, <2 x i32> poison, <2 x i32> zeroinitializer
+ %head = insertelement <2 x i1> poison, i1 true, i32 0
+ %m = shufflevector <2 x i1> %head, <2 x i1> poison, <2 x i32> zeroinitializer
+ %v = call <2 x i32> @llvm.vp.uadd.sat.v2i32(<2 x i32> %va, <2 x i32> %vb, <2 x i1> %m, i32 %evl)
+ ret <2 x i32> %v
+}
+
+declare <4 x i32> @llvm.vp.uadd.sat.v4i32(<4 x i32>, <4 x i32>, <4 x i1>, i32)
+
+define <4 x i32> @vsaddu_vv_v4i32(<4 x i32> %va, <4 x i32> %b, <4 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vsaddu_vv_v4i32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, ma
+; CHECK-NEXT: vsaddu.vv v8, v8, v9, v0.t
+; CHECK-NEXT: ret
+ %v = call <4 x i32> @llvm.vp.uadd.sat.v4i32(<4 x i32> %va, <4 x i32> %b, <4 x i1> %m, i32 %evl)
+ ret <4 x i32> %v
+}
+
+define <4 x i32> @vsaddu_vv_v4i32_unmasked(<4 x i32> %va, <4 x i32> %b, i32 zeroext %evl) {
+; CHECK-LABEL: vsaddu_vv_v4i32_unmasked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, ma
+; CHECK-NEXT: vsaddu.vv v8, v8, v9
+; CHECK-NEXT: ret
+ %head = insertelement <4 x i1> poison, i1 true, i32 0
+ %m = shufflevector <4 x i1> %head, <4 x i1> poison, <4 x i32> zeroinitializer
+ %v = call <4 x i32> @llvm.vp.uadd.sat.v4i32(<4 x i32> %va, <4 x i32> %b, <4 x i1> %m, i32 %evl)
+ ret <4 x i32> %v
+}
+
+define <4 x i32> @vsaddu_vx_v4i32(<4 x i32> %va, i32 %b, <4 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vsaddu_vx_v4i32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma
+; CHECK-NEXT: vsaddu.vx v8, v8, a0, v0.t
+; CHECK-NEXT: ret
+ %elt.head = insertelement <4 x i32> poison, i32 %b, i32 0
+ %vb = shufflevector <4 x i32> %elt.head, <4 x i32> poison, <4 x i32> zeroinitializer
+ %v = call <4 x i32> @llvm.vp.uadd.sat.v4i32(<4 x i32> %va, <4 x i32> %vb, <4 x i1> %m, i32 %evl)
+ ret <4 x i32> %v
+}
+
+define <4 x i32> @vsaddu_vx_v4i32_unmasked(<4 x i32> %va, i32 %b, i32 zeroext %evl) {
+; CHECK-LABEL: vsaddu_vx_v4i32_unmasked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma
+; CHECK-NEXT: vsaddu.vx v8, v8, a0
+; CHECK-NEXT: ret
+ %elt.head = insertelement <4 x i32> poison, i32 %b, i32 0
+ %vb = shufflevector <4 x i32> %elt.head, <4 x i32> poison, <4 x i32> zeroinitializer
+ %head = insertelement <4 x i1> poison, i1 true, i32 0
+ %m = shufflevector <4 x i1> %head, <4 x i1> poison, <4 x i32> zeroinitializer
+ %v = call <4 x i32> @llvm.vp.uadd.sat.v4i32(<4 x i32> %va, <4 x i32> %vb, <4 x i1> %m, i32 %evl)
+ ret <4 x i32> %v
+}
+
+define <4 x i32> @vsaddu_vi_v4i32(<4 x i32> %va, <4 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vsaddu_vi_v4i32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, ma
+; CHECK-NEXT: vsaddu.vi v8, v8, -1, v0.t
+; CHECK-NEXT: ret
+ %elt.head = insertelement <4 x i32> poison, i32 -1, i32 0
+ %vb = shufflevector <4 x i32> %elt.head, <4 x i32> poison, <4 x i32> zeroinitializer
+ %v = call <4 x i32> @llvm.vp.uadd.sat.v4i32(<4 x i32> %va, <4 x i32> %vb, <4 x i1> %m, i32 %evl)
+ ret <4 x i32> %v
+}
+
+define <4 x i32> @vsaddu_vi_v4i32_unmasked(<4 x i32> %va, i32 zeroext %evl) {
+; CHECK-LABEL: vsaddu_vi_v4i32_unmasked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, ma
+; CHECK-NEXT: vsaddu.vi v8, v8, -1
+; CHECK-NEXT: ret
+ %elt.head = insertelement <4 x i32> poison, i32 -1, i32 0
+ %vb = shufflevector <4 x i32> %elt.head, <4 x i32> poison, <4 x i32> zeroinitializer
+ %head = insertelement <4 x i1> poison, i1 true, i32 0
+ %m = shufflevector <4 x i1> %head, <4 x i1> poison, <4 x i32> zeroinitializer
+ %v = call <4 x i32> @llvm.vp.uadd.sat.v4i32(<4 x i32> %va, <4 x i32> %vb, <4 x i1> %m, i32 %evl)
+ ret <4 x i32> %v
+}
+
+declare <8 x i32> @llvm.vp.uadd.sat.v8i32(<8 x i32>, <8 x i32>, <8 x i1>, i32)
+
+define <8 x i32> @vsaddu_vv_v8i32(<8 x i32> %va, <8 x i32> %b, <8 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vsaddu_vv_v8i32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, ma
+; CHECK-NEXT: vsaddu.vv v8, v8, v10, v0.t
+; CHECK-NEXT: ret
+ %v = call <8 x i32> @llvm.vp.uadd.sat.v8i32(<8 x i32> %va, <8 x i32> %b, <8 x i1> %m, i32 %evl)
+ ret <8 x i32> %v
+}
+
+define <8 x i32> @vsaddu_vv_v8i32_unmasked(<8 x i32> %va, <8 x i32> %b, i32 zeroext %evl) {
+; CHECK-LABEL: vsaddu_vv_v8i32_unmasked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, ma
+; CHECK-NEXT: vsaddu.vv v8, v8, v10
+; CHECK-NEXT: ret
+ %head = insertelement <8 x i1> poison, i1 true, i32 0
+ %m = shufflevector <8 x i1> %head, <8 x i1> poison, <8 x i32> zeroinitializer
+ %v = call <8 x i32> @llvm.vp.uadd.sat.v8i32(<8 x i32> %va, <8 x i32> %b, <8 x i1> %m, i32 %evl)
+ ret <8 x i32> %v
+}
+
+define <8 x i32> @vsaddu_vx_v8i32(<8 x i32> %va, i32 %b, <8 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vsaddu_vx_v8i32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, ma
+; CHECK-NEXT: vsaddu.vx v8, v8, a0, v0.t
+; CHECK-NEXT: ret
+ %elt.head = insertelement <8 x i32> poison, i32 %b, i32 0
+ %vb = shufflevector <8 x i32> %elt.head, <8 x i32> poison, <8 x i32> zeroinitializer
+ %v = call <8 x i32> @llvm.vp.uadd.sat.v8i32(<8 x i32> %va, <8 x i32> %vb, <8 x i1> %m, i32 %evl)
+ ret <8 x i32> %v
+}
+
+define <8 x i32> @vsaddu_vx_v8i32_unmasked(<8 x i32> %va, i32 %b, i32 zeroext %evl) {
+; CHECK-LABEL: vsaddu_vx_v8i32_unmasked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, ma
+; CHECK-NEXT: vsaddu.vx v8, v8, a0
+; CHECK-NEXT: ret
+ %elt.head = insertelement <8 x i32> poison, i32 %b, i32 0
+ %vb = shufflevector <8 x i32> %elt.head, <8 x i32> poison, <8 x i32> zeroinitializer
+ %head = insertelement <8 x i1> poison, i1 true, i32 0
+ %m = shufflevector <8 x i1> %head, <8 x i1> poison, <8 x i32> zeroinitializer
+ %v = call <8 x i32> @llvm.vp.uadd.sat.v8i32(<8 x i32> %va, <8 x i32> %vb, <8 x i1> %m, i32 %evl)
+ ret <8 x i32> %v
+}
+
+define <8 x i32> @vsaddu_vi_v8i32(<8 x i32> %va, <8 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vsaddu_vi_v8i32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, ma
+; CHECK-NEXT: vsaddu.vi v8, v8, -1, v0.t
+; CHECK-NEXT: ret
+ %elt.head = insertelement <8 x i32> poison, i32 -1, i32 0
+ %vb = shufflevector <8 x i32> %elt.head, <8 x i32> poison, <8 x i32> zeroinitializer
+ %v = call <8 x i32> @llvm.vp.uadd.sat.v8i32(<8 x i32> %va, <8 x i32> %vb, <8 x i1> %m, i32 %evl)
+ ret <8 x i32> %v
+}
+
+define <8 x i32> @vsaddu_vi_v8i32_unmasked(<8 x i32> %va, i32 zeroext %evl) {
+; CHECK-LABEL: vsaddu_vi_v8i32_unmasked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, ma
+; CHECK-NEXT: vsaddu.vi v8, v8, -1
+; CHECK-NEXT: ret
+ %elt.head = insertelement <8 x i32> poison, i32 -1, i32 0
+ %vb = shufflevector <8 x i32> %elt.head, <8 x i32> poison, <8 x i32> zeroinitializer
+ %head = insertelement <8 x i1> poison, i1 true, i32 0
+ %m = shufflevector <8 x i1> %head, <8 x i1> poison, <8 x i32> zeroinitializer
+ %v = call <8 x i32> @llvm.vp.uadd.sat.v8i32(<8 x i32> %va, <8 x i32> %vb, <8 x i1> %m, i32 %evl)
+ ret <8 x i32> %v
+}
+
+declare <16 x i32> @llvm.vp.uadd.sat.v16i32(<16 x i32>, <16 x i32>, <16 x i1>, i32)
+
+define <16 x i32> @vsaddu_vv_v16i32(<16 x i32> %va, <16 x i32> %b, <16 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vsaddu_vv_v16i32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, ma
+; CHECK-NEXT: vsaddu.vv v8, v8, v12, v0.t
+; CHECK-NEXT: ret
+ %v = call <16 x i32> @llvm.vp.uadd.sat.v16i32(<16 x i32> %va, <16 x i32> %b, <16 x i1> %m, i32 %evl)
+ ret <16 x i32> %v
+}
+
+define <16 x i32> @vsaddu_vv_v16i32_unmasked(<16 x i32> %va, <16 x i32> %b, i32 zeroext %evl) {
+; CHECK-LABEL: vsaddu_vv_v16i32_unmasked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, ma
+; CHECK-NEXT: vsaddu.vv v8, v8, v12
+; CHECK-NEXT: ret
+ %head = insertelement <16 x i1> poison, i1 true, i32 0
+ %m = shufflevector <16 x i1> %head, <16 x i1> poison, <16 x i32> zeroinitializer
+ %v = call <16 x i32> @llvm.vp.uadd.sat.v16i32(<16 x i32> %va, <16 x i32> %b, <16 x i1> %m, i32 %evl)
+ ret <16 x i32> %v
+}
+
+define <16 x i32> @vsaddu_vx_v16i32(<16 x i32> %va, i32 %b, <16 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vsaddu_vx_v16i32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e32, m4, ta, ma
+; CHECK-NEXT: vsaddu.vx v8, v8, a0, v0.t
+; CHECK-NEXT: ret
+ %elt.head = insertelement <16 x i32> poison, i32 %b, i32 0
+ %vb = shufflevector <16 x i32> %elt.head, <16 x i32> poison, <16 x i32> zeroinitializer
+ %v = call <16 x i32> @llvm.vp.uadd.sat.v16i32(<16 x i32> %va, <16 x i32> %vb, <16 x i1> %m, i32 %evl)
+ ret <16 x i32> %v
+}
+
+define <16 x i32> @vsaddu_vx_v16i32_unmasked(<16 x i32> %va, i32 %b, i32 zeroext %evl) {
+; CHECK-LABEL: vsaddu_vx_v16i32_unmasked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e32, m4, ta, ma
+; CHECK-NEXT: vsaddu.vx v8, v8, a0
+; CHECK-NEXT: ret
+ %elt.head = insertelement <16 x i32> poison, i32 %b, i32 0
+ %vb = shufflevector <16 x i32> %elt.head, <16 x i32> poison, <16 x i32> zeroinitializer
+ %head = insertelement <16 x i1> poison, i1 true, i32 0
+ %m = shufflevector <16 x i1> %head, <16 x i1> poison, <16 x i32> zeroinitializer
+ %v = call <16 x i32> @llvm.vp.uadd.sat.v16i32(<16 x i32> %va, <16 x i32> %vb, <16 x i1> %m, i32 %evl)
+ ret <16 x i32> %v
+}
+
+define <16 x i32> @vsaddu_vi_v16i32(<16 x i32> %va, <16 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vsaddu_vi_v16i32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, ma
+; CHECK-NEXT: vsaddu.vi v8, v8, -1, v0.t
+; CHECK-NEXT: ret
+ %elt.head = insertelement <16 x i32> poison, i32 -1, i32 0
+ %vb = shufflevector <16 x i32> %elt.head, <16 x i32> poison, <16 x i32> zeroinitializer
+ %v = call <16 x i32> @llvm.vp.uadd.sat.v16i32(<16 x i32> %va, <16 x i32> %vb, <16 x i1> %m, i32 %evl)
+ ret <16 x i32> %v
+}
+
+define <16 x i32> @vsaddu_vi_v16i32_unmasked(<16 x i32> %va, i32 zeroext %evl) {
+; CHECK-LABEL: vsaddu_vi_v16i32_unmasked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, ma
+; CHECK-NEXT: vsaddu.vi v8, v8, -1
+; CHECK-NEXT: ret
+ %elt.head = insertelement <16 x i32> poison, i32 -1, i32 0
+ %vb = shufflevector <16 x i32> %elt.head, <16 x i32> poison, <16 x i32> zeroinitializer
+ %head = insertelement <16 x i1> poison, i1 true, i32 0
+ %m = shufflevector <16 x i1> %head, <16 x i1> poison, <16 x i32> zeroinitializer
+ %v = call <16 x i32> @llvm.vp.uadd.sat.v16i32(<16 x i32> %va, <16 x i32> %vb, <16 x i1> %m, i32 %evl)
+ ret <16 x i32> %v
+}
+
+declare <2 x i64> @llvm.vp.uadd.sat.v2i64(<2 x i64>, <2 x i64>, <2 x i1>, i32)
+
+define <2 x i64> @vsaddu_vv_v2i64(<2 x i64> %va, <2 x i64> %b, <2 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vsaddu_vv_v2i64:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, ma
+; CHECK-NEXT: vsaddu.vv v8, v8, v9, v0.t
+; CHECK-NEXT: ret
+ %v = call <2 x i64> @llvm.vp.uadd.sat.v2i64(<2 x i64> %va, <2 x i64> %b, <2 x i1> %m, i32 %evl)
+ ret <2 x i64> %v
+}
+
+define <2 x i64> @vsaddu_vv_v2i64_unmasked(<2 x i64> %va, <2 x i64> %b, i32 zeroext %evl) {
+; CHECK-LABEL: vsaddu_vv_v2i64_unmasked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, ma
+; CHECK-NEXT: vsaddu.vv v8, v8, v9
+; CHECK-NEXT: ret
+ %head = insertelement <2 x i1> poison, i1 true, i32 0
+ %m = shufflevector <2 x i1> %head, <2 x i1> poison, <2 x i32> zeroinitializer
+ %v = call <2 x i64> @llvm.vp.uadd.sat.v2i64(<2 x i64> %va, <2 x i64> %b, <2 x i1> %m, i32 %evl)
+ ret <2 x i64> %v
+}
+
+define <2 x i64> @vsaddu_vx_v2i64(<2 x i64> %va, i64 %b, <2 x i1> %m, i32 zeroext %evl) {
+; RV32-LABEL: vsaddu_vx_v2i64:
+; RV32: # %bb.0:
+; RV32-NEXT: addi sp, sp, -16
+; RV32-NEXT: .cfi_def_cfa_offset 16
+; RV32-NEXT: sw a1, 12(sp)
+; RV32-NEXT: sw a0, 8(sp)
+; RV32-NEXT: addi a0, sp, 8
+; RV32-NEXT: vsetivli zero, 2, e64, m1, ta, ma
+; RV32-NEXT: vlse64.v v9, (a0), zero
+; RV32-NEXT: vsetvli zero, a2, e64, m1, ta, ma
+; RV32-NEXT: vsaddu.vv v8, v8, v9, v0.t
+; RV32-NEXT: addi sp, sp, 16
+; RV32-NEXT: ret
+;
+; RV64-LABEL: vsaddu_vx_v2i64:
+; RV64: # %bb.0:
+; RV64-NEXT: vsetvli zero, a1, e64, m1, ta, ma
+; RV64-NEXT: vsaddu.vx v8, v8, a0, v0.t
+; RV64-NEXT: ret
+ %elt.head = insertelement <2 x i64> poison, i64 %b, i32 0
+ %vb = shufflevector <2 x i64> %elt.head, <2 x i64> poison, <2 x i32> zeroinitializer
+ %v = call <2 x i64> @llvm.vp.uadd.sat.v2i64(<2 x i64> %va, <2 x i64> %vb, <2 x i1> %m, i32 %evl)
+ ret <2 x i64> %v
+}
+
+define <2 x i64> @vsaddu_vx_v2i64_unmasked(<2 x i64> %va, i64 %b, i32 zeroext %evl) {
+; RV32-LABEL: vsaddu_vx_v2i64_unmasked:
+; RV32: # %bb.0:
+; RV32-NEXT: addi sp, sp, -16
+; RV32-NEXT: .cfi_def_cfa_offset 16
+; RV32-NEXT: sw a1, 12(sp)
+; RV32-NEXT: sw a0, 8(sp)
+; RV32-NEXT: addi a0, sp, 8
+; RV32-NEXT: vsetivli zero, 2, e64, m1, ta, ma
+; RV32-NEXT: vlse64.v v9, (a0), zero
+; RV32-NEXT: vsetvli zero, a2, e64, m1, ta, ma
+; RV32-NEXT: vsaddu.vv v8, v8, v9
+; RV32-NEXT: addi sp, sp, 16
+; RV32-NEXT: ret
+;
+; RV64-LABEL: vsaddu_vx_v2i64_unmasked:
+; RV64: # %bb.0:
+; RV64-NEXT: vsetvli zero, a1, e64, m1, ta, ma
+; RV64-NEXT: vsaddu.vx v8, v8, a0
+; RV64-NEXT: ret
+ %elt.head = insertelement <2 x i64> poison, i64 %b, i32 0
+ %vb = shufflevector <2 x i64> %elt.head, <2 x i64> poison, <2 x i32> zeroinitializer
+ %head = insertelement <2 x i1> poison, i1 true, i32 0
+ %m = shufflevector <2 x i1> %head, <2 x i1> poison, <2 x i32> zeroinitializer
+ %v = call <2 x i64> @llvm.vp.uadd.sat.v2i64(<2 x i64> %va, <2 x i64> %vb, <2 x i1> %m, i32 %evl)
+ ret <2 x i64> %v
+}
+
+define <2 x i64> @vsaddu_vi_v2i64(<2 x i64> %va, <2 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vsaddu_vi_v2i64:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, ma
+; CHECK-NEXT: vsaddu.vi v8, v8, -1, v0.t
+; CHECK-NEXT: ret
+ %elt.head = insertelement <2 x i64> poison, i64 -1, i32 0
+ %vb = shufflevector <2 x i64> %elt.head, <2 x i64> poison, <2 x i32> zeroinitializer
+ %v = call <2 x i64> @llvm.vp.uadd.sat.v2i64(<2 x i64> %va, <2 x i64> %vb, <2 x i1> %m, i32 %evl)
+ ret <2 x i64> %v
+}
+
+define <2 x i64> @vsaddu_vi_v2i64_unmasked(<2 x i64> %va, i32 zeroext %evl) {
+; CHECK-LABEL: vsaddu_vi_v2i64_unmasked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, ma
+; CHECK-NEXT: vsaddu.vi v8, v8, -1
+; CHECK-NEXT: ret
+ %elt.head = insertelement <2 x i64> poison, i64 -1, i32 0
+ %vb = shufflevector <2 x i64> %elt.head, <2 x i64> poison, <2 x i32> zeroinitializer
+ %head = insertelement <2 x i1> poison, i1 true, i32 0
+ %m = shufflevector <2 x i1> %head, <2 x i1> poison, <2 x i32> zeroinitializer
+ %v = call <2 x i64> @llvm.vp.uadd.sat.v2i64(<2 x i64> %va, <2 x i64> %vb, <2 x i1> %m, i32 %evl)
+ ret <2 x i64> %v
+}
+
+declare <4 x i64> @llvm.vp.uadd.sat.v4i64(<4 x i64>, <4 x i64>, <4 x i1>, i32)
+
+define <4 x i64> @vsaddu_vv_v4i64(<4 x i64> %va, <4 x i64> %b, <4 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vsaddu_vv_v4i64:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e64, m2, ta, ma
+; CHECK-NEXT: vsaddu.vv v8, v8, v10, v0.t
+; CHECK-NEXT: ret
+ %v = call <4 x i64> @llvm.vp.uadd.sat.v4i64(<4 x i64> %va, <4 x i64> %b, <4 x i1> %m, i32 %evl)
+ ret <4 x i64> %v
+}
+
+define <4 x i64> @vsaddu_vv_v4i64_unmasked(<4 x i64> %va, <4 x i64> %b, i32 zeroext %evl) {
+; CHECK-LABEL: vsaddu_vv_v4i64_unmasked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e64, m2, ta, ma
+; CHECK-NEXT: vsaddu.vv v8, v8, v10
+; CHECK-NEXT: ret
+ %head = insertelement <4 x i1> poison, i1 true, i32 0
+ %m = shufflevector <4 x i1> %head, <4 x i1> poison, <4 x i32> zeroinitializer
+ %v = call <4 x i64> @llvm.vp.uadd.sat.v4i64(<4 x i64> %va, <4 x i64> %b, <4 x i1> %m, i32 %evl)
+ ret <4 x i64> %v
+}
+
+define <4 x i64> @vsaddu_vx_v4i64(<4 x i64> %va, i64 %b, <4 x i1> %m, i32 zeroext %evl) {
+; RV32-LABEL: vsaddu_vx_v4i64:
+; RV32: # %bb.0:
+; RV32-NEXT: addi sp, sp, -16
+; RV32-NEXT: .cfi_def_cfa_offset 16
+; RV32-NEXT: sw a1, 12(sp)
+; RV32-NEXT: sw a0, 8(sp)
+; RV32-NEXT: addi a0, sp, 8
+; RV32-NEXT: vsetivli zero, 4, e64, m2, ta, ma
+; RV32-NEXT: vlse64.v v10, (a0), zero
+; RV32-NEXT: vsetvli zero, a2, e64, m2, ta, ma
+; RV32-NEXT: vsaddu.vv v8, v8, v10, v0.t
+; RV32-NEXT: addi sp, sp, 16
+; RV32-NEXT: ret
+;
+; RV64-LABEL: vsaddu_vx_v4i64:
+; RV64: # %bb.0:
+; RV64-NEXT: vsetvli zero, a1, e64, m2, ta, ma
+; RV64-NEXT: vsaddu.vx v8, v8, a0, v0.t
+; RV64-NEXT: ret
+ %elt.head = insertelement <4 x i64> poison, i64 %b, i32 0
+ %vb = shufflevector <4 x i64> %elt.head, <4 x i64> poison, <4 x i32> zeroinitializer
+ %v = call <4 x i64> @llvm.vp.uadd.sat.v4i64(<4 x i64> %va, <4 x i64> %vb, <4 x i1> %m, i32 %evl)
+ ret <4 x i64> %v
+}
+
+define <4 x i64> @vsaddu_vx_v4i64_unmasked(<4 x i64> %va, i64 %b, i32 zeroext %evl) {
+; RV32-LABEL: vsaddu_vx_v4i64_unmasked:
+; RV32: # %bb.0:
+; RV32-NEXT: addi sp, sp, -16
+; RV32-NEXT: .cfi_def_cfa_offset 16
+; RV32-NEXT: sw a1, 12(sp)
+; RV32-NEXT: sw a0, 8(sp)
+; RV32-NEXT: addi a0, sp, 8
+; RV32-NEXT: vsetivli zero, 4, e64, m2, ta, ma
+; RV32-NEXT: vlse64.v v10, (a0), zero
+; RV32-NEXT: vsetvli zero, a2, e64, m2, ta, ma
+; RV32-NEXT: vsaddu.vv v8, v8, v10
+; RV32-NEXT: addi sp, sp, 16
+; RV32-NEXT: ret
+;
+; RV64-LABEL: vsaddu_vx_v4i64_unmasked:
+; RV64: # %bb.0:
+; RV64-NEXT: vsetvli zero, a1, e64, m2, ta, ma
+; RV64-NEXT: vsaddu.vx v8, v8, a0
+; RV64-NEXT: ret
+ %elt.head = insertelement <4 x i64> poison, i64 %b, i32 0
+ %vb = shufflevector <4 x i64> %elt.head, <4 x i64> poison, <4 x i32> zeroinitializer
+ %head = insertelement <4 x i1> poison, i1 true, i32 0
+ %m = shufflevector <4 x i1> %head, <4 x i1> poison, <4 x i32> zeroinitializer
+ %v = call <4 x i64> @llvm.vp.uadd.sat.v4i64(<4 x i64> %va, <4 x i64> %vb, <4 x i1> %m, i32 %evl)
+ ret <4 x i64> %v
+}
+
+define <4 x i64> @vsaddu_vi_v4i64(<4 x i64> %va, <4 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vsaddu_vi_v4i64:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e64, m2, ta, ma
+; CHECK-NEXT: vsaddu.vi v8, v8, -1, v0.t
+; CHECK-NEXT: ret
+ %elt.head = insertelement <4 x i64> poison, i64 -1, i32 0
+ %vb = shufflevector <4 x i64> %elt.head, <4 x i64> poison, <4 x i32> zeroinitializer
+ %v = call <4 x i64> @llvm.vp.uadd.sat.v4i64(<4 x i64> %va, <4 x i64> %vb, <4 x i1> %m, i32 %evl)
+ ret <4 x i64> %v
+}
+
+define <4 x i64> @vsaddu_vi_v4i64_unmasked(<4 x i64> %va, i32 zeroext %evl) {
+; CHECK-LABEL: vsaddu_vi_v4i64_unmasked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e64, m2, ta, ma
+; CHECK-NEXT: vsaddu.vi v8, v8, -1
+; CHECK-NEXT: ret
+ %elt.head = insertelement <4 x i64> poison, i64 -1, i32 0
+ %vb = shufflevector <4 x i64> %elt.head, <4 x i64> poison, <4 x i32> zeroinitializer
+ %head = insertelement <4 x i1> poison, i1 true, i32 0
+ %m = shufflevector <4 x i1> %head, <4 x i1> poison, <4 x i32> zeroinitializer
+ %v = call <4 x i64> @llvm.vp.uadd.sat.v4i64(<4 x i64> %va, <4 x i64> %vb, <4 x i1> %m, i32 %evl)
+ ret <4 x i64> %v
+}
+
+declare <8 x i64> @llvm.vp.uadd.sat.v8i64(<8 x i64>, <8 x i64>, <8 x i1>, i32)
+
+define <8 x i64> @vsaddu_vv_v8i64(<8 x i64> %va, <8 x i64> %b, <8 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vsaddu_vv_v8i64:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e64, m4, ta, ma
+; CHECK-NEXT: vsaddu.vv v8, v8, v12, v0.t
+; CHECK-NEXT: ret
+ %v = call <8 x i64> @llvm.vp.uadd.sat.v8i64(<8 x i64> %va, <8 x i64> %b, <8 x i1> %m, i32 %evl)
+ ret <8 x i64> %v
+}
+
+define <8 x i64> @vsaddu_vv_v8i64_unmasked(<8 x i64> %va, <8 x i64> %b, i32 zeroext %evl) {
+; CHECK-LABEL: vsaddu_vv_v8i64_unmasked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e64, m4, ta, ma
+; CHECK-NEXT: vsaddu.vv v8, v8, v12
+; CHECK-NEXT: ret
+ %head = insertelement <8 x i1> poison, i1 true, i32 0
+ %m = shufflevector <8 x i1> %head, <8 x i1> poison, <8 x i32> zeroinitializer
+ %v = call <8 x i64> @llvm.vp.uadd.sat.v8i64(<8 x i64> %va, <8 x i64> %b, <8 x i1> %m, i32 %evl)
+ ret <8 x i64> %v
+}
+
+define <8 x i64> @vsaddu_vx_v8i64(<8 x i64> %va, i64 %b, <8 x i1> %m, i32 zeroext %evl) {
+; RV32-LABEL: vsaddu_vx_v8i64:
+; RV32: # %bb.0:
+; RV32-NEXT: addi sp, sp, -16
+; RV32-NEXT: .cfi_def_cfa_offset 16
+; RV32-NEXT: sw a1, 12(sp)
+; RV32-NEXT: sw a0, 8(sp)
+; RV32-NEXT: addi a0, sp, 8
+; RV32-NEXT: vsetivli zero, 8, e64, m4, ta, ma
+; RV32-NEXT: vlse64.v v12, (a0), zero
+; RV32-NEXT: vsetvli zero, a2, e64, m4, ta, ma
+; RV32-NEXT: vsaddu.vv v8, v8, v12, v0.t
+; RV32-NEXT: addi sp, sp, 16
+; RV32-NEXT: ret
+;
+; RV64-LABEL: vsaddu_vx_v8i64:
+; RV64: # %bb.0:
+; RV64-NEXT: vsetvli zero, a1, e64, m4, ta, ma
+; RV64-NEXT: vsaddu.vx v8, v8, a0, v0.t
+; RV64-NEXT: ret
+ %elt.head = insertelement <8 x i64> poison, i64 %b, i32 0
+ %vb = shufflevector <8 x i64> %elt.head, <8 x i64> poison, <8 x i32> zeroinitializer
+ %v = call <8 x i64> @llvm.vp.uadd.sat.v8i64(<8 x i64> %va, <8 x i64> %vb, <8 x i1> %m, i32 %evl)
+ ret <8 x i64> %v
+}
+
+define <8 x i64> @vsaddu_vx_v8i64_unmasked(<8 x i64> %va, i64 %b, i32 zeroext %evl) {
+; RV32-LABEL: vsaddu_vx_v8i64_unmasked:
+; RV32: # %bb.0:
+; RV32-NEXT: addi sp, sp, -16
+; RV32-NEXT: .cfi_def_cfa_offset 16
+; RV32-NEXT: sw a1, 12(sp)
+; RV32-NEXT: sw a0, 8(sp)
+; RV32-NEXT: addi a0, sp, 8
+; RV32-NEXT: vsetivli zero, 8, e64, m4, ta, ma
+; RV32-NEXT: vlse64.v v12, (a0), zero
+; RV32-NEXT: vsetvli zero, a2, e64, m4, ta, ma
+; RV32-NEXT: vsaddu.vv v8, v8, v12
+; RV32-NEXT: addi sp, sp, 16
+; RV32-NEXT: ret
+;
+; RV64-LABEL: vsaddu_vx_v8i64_unmasked:
+; RV64: # %bb.0:
+; RV64-NEXT: vsetvli zero, a1, e64, m4, ta, ma
+; RV64-NEXT: vsaddu.vx v8, v8, a0
+; RV64-NEXT: ret
+ %elt.head = insertelement <8 x i64> poison, i64 %b, i32 0
+ %vb = shufflevector <8 x i64> %elt.head, <8 x i64> poison, <8 x i32> zeroinitializer
+ %head = insertelement <8 x i1> poison, i1 true, i32 0
+ %m = shufflevector <8 x i1> %head, <8 x i1> poison, <8 x i32> zeroinitializer
+ %v = call <8 x i64> @llvm.vp.uadd.sat.v8i64(<8 x i64> %va, <8 x i64> %vb, <8 x i1> %m, i32 %evl)
+ ret <8 x i64> %v
+}
+
+define <8 x i64> @vsaddu_vi_v8i64(<8 x i64> %va, <8 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vsaddu_vi_v8i64:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e64, m4, ta, ma
+; CHECK-NEXT: vsaddu.vi v8, v8, -1, v0.t
+; CHECK-NEXT: ret
+ %elt.head = insertelement <8 x i64> poison, i64 -1, i32 0
+ %vb = shufflevector <8 x i64> %elt.head, <8 x i64> poison, <8 x i32> zeroinitializer
+ %v = call <8 x i64> @llvm.vp.uadd.sat.v8i64(<8 x i64> %va, <8 x i64> %vb, <8 x i1> %m, i32 %evl)
+ ret <8 x i64> %v
+}
+
+define <8 x i64> @vsaddu_vi_v8i64_unmasked(<8 x i64> %va, i32 zeroext %evl) {
+; CHECK-LABEL: vsaddu_vi_v8i64_unmasked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e64, m4, ta, ma
+; CHECK-NEXT: vsaddu.vi v8, v8, -1
+; CHECK-NEXT: ret
+ %elt.head = insertelement <8 x i64> poison, i64 -1, i32 0
+ %vb = shufflevector <8 x i64> %elt.head, <8 x i64> poison, <8 x i32> zeroinitializer
+ %head = insertelement <8 x i1> poison, i1 true, i32 0
+ %m = shufflevector <8 x i1> %head, <8 x i1> poison, <8 x i32> zeroinitializer
+ %v = call <8 x i64> @llvm.vp.uadd.sat.v8i64(<8 x i64> %va, <8 x i64> %vb, <8 x i1> %m, i32 %evl)
+ ret <8 x i64> %v
+}
+
+declare <16 x i64> @llvm.vp.uadd.sat.v16i64(<16 x i64>, <16 x i64>, <16 x i1>, i32)
+
+define <16 x i64> @vsaddu_vv_v16i64(<16 x i64> %va, <16 x i64> %b, <16 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vsaddu_vv_v16i64:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma
+; CHECK-NEXT: vsaddu.vv v8, v8, v16, v0.t
+; CHECK-NEXT: ret
+ %v = call <16 x i64> @llvm.vp.uadd.sat.v16i64(<16 x i64> %va, <16 x i64> %b, <16 x i1> %m, i32 %evl)
+ ret <16 x i64> %v
+}
+
+define <16 x i64> @vsaddu_vv_v16i64_unmasked(<16 x i64> %va, <16 x i64> %b, i32 zeroext %evl) {
+; CHECK-LABEL: vsaddu_vv_v16i64_unmasked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma
+; CHECK-NEXT: vsaddu.vv v8, v8, v16
+; CHECK-NEXT: ret
+ %head = insertelement <16 x i1> poison, i1 true, i32 0
+ %m = shufflevector <16 x i1> %head, <16 x i1> poison, <16 x i32> zeroinitializer
+ %v = call <16 x i64> @llvm.vp.uadd.sat.v16i64(<16 x i64> %va, <16 x i64> %b, <16 x i1> %m, i32 %evl)
+ ret <16 x i64> %v
+}
+
+define <16 x i64> @vsaddu_vx_v16i64(<16 x i64> %va, i64 %b, <16 x i1> %m, i32 zeroext %evl) {
+; RV32-LABEL: vsaddu_vx_v16i64:
+; RV32: # %bb.0:
+; RV32-NEXT: addi sp, sp, -16
+; RV32-NEXT: .cfi_def_cfa_offset 16
+; RV32-NEXT: sw a1, 12(sp)
+; RV32-NEXT: sw a0, 8(sp)
+; RV32-NEXT: addi a0, sp, 8
+; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma
+; RV32-NEXT: vlse64.v v16, (a0), zero
+; RV32-NEXT: vsetvli zero, a2, e64, m8, ta, ma
+; RV32-NEXT: vsaddu.vv v8, v8, v16, v0.t
+; RV32-NEXT: addi sp, sp, 16
+; RV32-NEXT: ret
+;
+; RV64-LABEL: vsaddu_vx_v16i64:
+; RV64: # %bb.0:
+; RV64-NEXT: vsetvli zero, a1, e64, m8, ta, ma
+; RV64-NEXT: vsaddu.vx v8, v8, a0, v0.t
+; RV64-NEXT: ret
+ %elt.head = insertelement <16 x i64> poison, i64 %b, i32 0
+ %vb = shufflevector <16 x i64> %elt.head, <16 x i64> poison, <16 x i32> zeroinitializer
+ %v = call <16 x i64> @llvm.vp.uadd.sat.v16i64(<16 x i64> %va, <16 x i64> %vb, <16 x i1> %m, i32 %evl)
+ ret <16 x i64> %v
+}
+
+define <16 x i64> @vsaddu_vx_v16i64_unmasked(<16 x i64> %va, i64 %b, i32 zeroext %evl) {
+; RV32-LABEL: vsaddu_vx_v16i64_unmasked:
+; RV32: # %bb.0:
+; RV32-NEXT: addi sp, sp, -16
+; RV32-NEXT: .cfi_def_cfa_offset 16
+; RV32-NEXT: sw a1, 12(sp)
+; RV32-NEXT: sw a0, 8(sp)
+; RV32-NEXT: addi a0, sp, 8
+; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma
+; RV32-NEXT: vlse64.v v16, (a0), zero
+; RV32-NEXT: vsetvli zero, a2, e64, m8, ta, ma
+; RV32-NEXT: vsaddu.vv v8, v8, v16
+; RV32-NEXT: addi sp, sp, 16
+; RV32-NEXT: ret
+;
+; RV64-LABEL: vsaddu_vx_v16i64_unmasked:
+; RV64: # %bb.0:
+; RV64-NEXT: vsetvli zero, a1, e64, m8, ta, ma
+; RV64-NEXT: vsaddu.vx v8, v8, a0
+; RV64-NEXT: ret
+ %elt.head = insertelement <16 x i64> poison, i64 %b, i32 0
+ %vb = shufflevector <16 x i64> %elt.head, <16 x i64> poison, <16 x i32> zeroinitializer
+ %head = insertelement <16 x i1> poison, i1 true, i32 0
+ %m = shufflevector <16 x i1> %head, <16 x i1> poison, <16 x i32> zeroinitializer
+ %v = call <16 x i64> @llvm.vp.uadd.sat.v16i64(<16 x i64> %va, <16 x i64> %vb, <16 x i1> %m, i32 %evl)
+ ret <16 x i64> %v
+}
+
+define <16 x i64> @vsaddu_vi_v16i64(<16 x i64> %va, <16 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vsaddu_vi_v16i64:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma
+; CHECK-NEXT: vsaddu.vi v8, v8, -1, v0.t
+; CHECK-NEXT: ret
+ %elt.head = insertelement <16 x i64> poison, i64 -1, i32 0
+ %vb = shufflevector <16 x i64> %elt.head, <16 x i64> poison, <16 x i32> zeroinitializer
+ %v = call <16 x i64> @llvm.vp.uadd.sat.v16i64(<16 x i64> %va, <16 x i64> %vb, <16 x i1> %m, i32 %evl)
+ ret <16 x i64> %v
+}
+
+define <16 x i64> @vsaddu_vi_v16i64_unmasked(<16 x i64> %va, i32 zeroext %evl) {
+; CHECK-LABEL: vsaddu_vi_v16i64_unmasked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma
+; CHECK-NEXT: vsaddu.vi v8, v8, -1
+; CHECK-NEXT: ret
+ %elt.head = insertelement <16 x i64> poison, i64 -1, i32 0
+ %vb = shufflevector <16 x i64> %elt.head, <16 x i64> poison, <16 x i32> zeroinitializer
+ %head = insertelement <16 x i1> poison, i1 true, i32 0
+ %m = shufflevector <16 x i1> %head, <16 x i1> poison, <16 x i32> zeroinitializer
+ %v = call <16 x i64> @llvm.vp.uadd.sat.v16i64(<16 x i64> %va, <16 x i64> %vb, <16 x i1> %m, i32 %evl)
+ ret <16 x i64> %v
+}
+
+; Test that split-legalization works as expected.
+
+declare <32 x i64> @llvm.vp.uadd.sat.v32i64(<32 x i64>, <32 x i64>, <32 x i1>, i32)
+
+define <32 x i64> @vsaddu_vx_v32i64(<32 x i64> %va, <32 x i1> %m, i32 zeroext %evl) {
+; RV32-LABEL: vsaddu_vx_v32i64:
+; RV32: # %bb.0:
+; RV32-NEXT: vsetivli zero, 2, e8, mf4, ta, ma
+; RV32-NEXT: vslidedown.vi v1, v0, 2
+; RV32-NEXT: li a1, 32
+; RV32-NEXT: vsetvli zero, a1, e32, m8, ta, ma
+; RV32-NEXT: li a2, 16
+; RV32-NEXT: vmv.v.i v24, -1
+; RV32-NEXT: mv a1, a0
+; RV32-NEXT: bltu a0, a2, .LBB108_2
+; RV32-NEXT: # %bb.1:
+; RV32-NEXT: li a1, 16
+; RV32-NEXT: .LBB108_2:
+; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma
+; RV32-NEXT: vsaddu.vv v8, v8, v24, v0.t
+; RV32-NEXT: addi a1, a0, -16
+; RV32-NEXT: sltu a0, a0, a1
+; RV32-NEXT: addi a0, a0, -1
+; RV32-NEXT: and a0, a0, a1
+; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
+; RV32-NEXT: vmv1r.v v0, v1
+; RV32-NEXT: vsaddu.vv v16, v16, v24, v0.t
+; RV32-NEXT: ret
+;
+; RV64-LABEL: vsaddu_vx_v32i64:
+; RV64: # %bb.0:
+; RV64-NEXT: vsetivli zero, 2, e8, mf4, ta, ma
+; RV64-NEXT: li a2, 16
+; RV64-NEXT: vslidedown.vi v24, v0, 2
+; RV64-NEXT: mv a1, a0
+; RV64-NEXT: bltu a0, a2, .LBB108_2
+; RV64-NEXT: # %bb.1:
+; RV64-NEXT: li a1, 16
+; RV64-NEXT: .LBB108_2:
+; RV64-NEXT: vsetvli zero, a1, e64, m8, ta, ma
+; RV64-NEXT: vsaddu.vi v8, v8, -1, v0.t
+; RV64-NEXT: addi a1, a0, -16
+; RV64-NEXT: sltu a0, a0, a1
+; RV64-NEXT: addi a0, a0, -1
+; RV64-NEXT: and a0, a0, a1
+; RV64-NEXT: vsetvli zero, a0, e64, m8, ta, ma
+; RV64-NEXT: vmv1r.v v0, v24
+; RV64-NEXT: vsaddu.vi v16, v16, -1, v0.t
+; RV64-NEXT: ret
+ %elt.head = insertelement <32 x i64> poison, i64 -1, i32 0
+ %vb = shufflevector <32 x i64> %elt.head, <32 x i64> poison, <32 x i32> zeroinitializer
+ %v = call <32 x i64> @llvm.vp.uadd.sat.v32i64(<32 x i64> %va, <32 x i64> %vb, <32 x i1> %m, i32 %evl)
+ ret <32 x i64> %v
+}
+
+define <32 x i64> @vsaddu_vi_v32i64_unmasked(<32 x i64> %va, i32 zeroext %evl) {
+; RV32-LABEL: vsaddu_vi_v32i64_unmasked:
+; RV32: # %bb.0:
+; RV32-NEXT: li a1, 32
+; RV32-NEXT: vsetvli zero, a1, e32, m8, ta, ma
+; RV32-NEXT: li a2, 16
+; RV32-NEXT: vmv.v.i v24, -1
+; RV32-NEXT: mv a1, a0
+; RV32-NEXT: bltu a0, a2, .LBB109_2
+; RV32-NEXT: # %bb.1:
+; RV32-NEXT: li a1, 16
+; RV32-NEXT: .LBB109_2:
+; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma
+; RV32-NEXT: vsaddu.vv v8, v8, v24
+; RV32-NEXT: addi a1, a0, -16
+; RV32-NEXT: sltu a0, a0, a1
+; RV32-NEXT: addi a0, a0, -1
+; RV32-NEXT: and a0, a0, a1
+; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
+; RV32-NEXT: vsaddu.vv v16, v16, v24
+; RV32-NEXT: ret
+;
+; RV64-LABEL: vsaddu_vi_v32i64_unmasked:
+; RV64: # %bb.0:
+; RV64-NEXT: li a2, 16
+; RV64-NEXT: mv a1, a0
+; RV64-NEXT: bltu a0, a2, .LBB109_2
+; RV64-NEXT: # %bb.1:
+; RV64-NEXT: li a1, 16
+; RV64-NEXT: .LBB109_2:
+; RV64-NEXT: vsetvli zero, a1, e64, m8, ta, ma
+; RV64-NEXT: vsaddu.vi v8, v8, -1
+; RV64-NEXT: addi a1, a0, -16
+; RV64-NEXT: sltu a0, a0, a1
+; RV64-NEXT: addi a0, a0, -1
+; RV64-NEXT: and a0, a0, a1
+; RV64-NEXT: vsetvli zero, a0, e64, m8, ta, ma
+; RV64-NEXT: vsaddu.vi v16, v16, -1
+; RV64-NEXT: ret
+ %elt.head = insertelement <32 x i64> poison, i64 -1, i32 0
+ %vb = shufflevector <32 x i64> %elt.head, <32 x i64> poison, <32 x i32> zeroinitializer
+ %head = insertelement <32 x i1> poison, i1 true, i32 0
+ %m = shufflevector <32 x i1> %head, <32 x i1> poison, <32 x i32> zeroinitializer
+ %v = call <32 x i64> @llvm.vp.uadd.sat.v32i64(<32 x i64> %va, <32 x i64> %vb, <32 x i1> %m, i32 %evl)
+ ret <32 x i64> %v
+}
+
+; FIXME: We don't match vsaddu.vi on RV32; the -1 splat is materialized with
+; vmv.v.i and combined with vsaddu.vv instead.
+
+define <32 x i64> @vsaddu_vx_v32i64_evl12(<32 x i64> %va, <32 x i1> %m) {
+; RV32-LABEL: vsaddu_vx_v32i64_evl12:
+; RV32: # %bb.0:
+; RV32-NEXT: vsetivli zero, 2, e8, mf4, ta, ma
+; RV32-NEXT: vslidedown.vi v1, v0, 2
+; RV32-NEXT: li a0, 32
+; RV32-NEXT: vsetvli zero, a0, e32, m8, ta, ma
+; RV32-NEXT: vmv.v.i v24, -1
+; RV32-NEXT: vsetivli zero, 12, e64, m8, ta, ma
+; RV32-NEXT: vsaddu.vv v8, v8, v24, v0.t
+; RV32-NEXT: vsetivli zero, 0, e64, m8, ta, ma
+; RV32-NEXT: vmv1r.v v0, v1
+; RV32-NEXT: vsaddu.vv v16, v16, v24, v0.t
+; RV32-NEXT: ret
+;
+; RV64-LABEL: vsaddu_vx_v32i64_evl12:
+; RV64: # %bb.0:
+; RV64-NEXT: vsetivli zero, 2, e8, mf4, ta, ma
+; RV64-NEXT: vslidedown.vi v24, v0, 2
+; RV64-NEXT: vsetivli zero, 12, e64, m8, ta, ma
+; RV64-NEXT: vsaddu.vi v8, v8, -1, v0.t
+; RV64-NEXT: vsetivli zero, 0, e64, m8, ta, ma
+; RV64-NEXT: vmv1r.v v0, v24
+; RV64-NEXT: vsaddu.vi v16, v16, -1, v0.t
+; RV64-NEXT: ret
+ %elt.head = insertelement <32 x i64> poison, i64 -1, i32 0
+ %vb = shufflevector <32 x i64> %elt.head, <32 x i64> poison, <32 x i32> zeroinitializer
+ %v = call <32 x i64> @llvm.vp.uadd.sat.v32i64(<32 x i64> %va, <32 x i64> %vb, <32 x i1> %m, i32 12)
+ ret <32 x i64> %v
+}
+
+define <32 x i64> @vsaddu_vx_v32i64_evl27(<32 x i64> %va, <32 x i1> %m) {
+; RV32-LABEL: vsaddu_vx_v32i64_evl27:
+; RV32: # %bb.0:
+; RV32-NEXT: vsetivli zero, 2, e8, mf4, ta, ma
+; RV32-NEXT: vslidedown.vi v1, v0, 2
+; RV32-NEXT: li a0, 32
+; RV32-NEXT: vsetvli zero, a0, e32, m8, ta, ma
+; RV32-NEXT: vmv.v.i v24, -1
+; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma
+; RV32-NEXT: vsaddu.vv v8, v8, v24, v0.t
+; RV32-NEXT: vsetivli zero, 11, e64, m8, ta, ma
+; RV32-NEXT: vmv1r.v v0, v1
+; RV32-NEXT: vsaddu.vv v16, v16, v24, v0.t
+; RV32-NEXT: ret
+;
+; RV64-LABEL: vsaddu_vx_v32i64_evl27:
+; RV64: # %bb.0:
+; RV64-NEXT: vsetivli zero, 2, e8, mf4, ta, ma
+; RV64-NEXT: vslidedown.vi v24, v0, 2
+; RV64-NEXT: vsetivli zero, 16, e64, m8, ta, ma
+; RV64-NEXT: vsaddu.vi v8, v8, -1, v0.t
+; RV64-NEXT: vsetivli zero, 11, e64, m8, ta, ma
+; RV64-NEXT: vmv1r.v v0, v24
+; RV64-NEXT: vsaddu.vi v16, v16, -1, v0.t
+; RV64-NEXT: ret
+ %elt.head = insertelement <32 x i64> poison, i64 -1, i32 0
+ %vb = shufflevector <32 x i64> %elt.head, <32 x i64> poison, <32 x i32> zeroinitializer
+ %v = call <32 x i64> @llvm.vp.uadd.sat.v32i64(<32 x i64> %va, <32 x i64> %vb, <32 x i1> %m, i32 27)
+ ret <32 x i64> %v
+}
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vssub-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vssub-vp.ll
new file mode 100644
index 0000000..6360cf4
--- /dev/null
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vssub-vp.ll
@@ -0,0 +1,1745 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=riscv32 -mattr=+v -verify-machineinstrs < %s \
+; RUN: | FileCheck %s --check-prefixes=CHECK,RV32
+; RUN: llc -mtriple=riscv64 -mattr=+v -verify-machineinstrs < %s \
+; RUN: | FileCheck %s --check-prefixes=CHECK,RV64
+
+declare <8 x i7> @llvm.vp.ssub.sat.v8i7(<8 x i7>, <8 x i7>, <8 x i1>, i32)
+
+define <8 x i7> @vssub_vv_v8i7(<8 x i7> %va, <8 x i7> %b, <8 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vssub_vv_v8i7:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma
+; CHECK-NEXT: vadd.vv v9, v9, v9
+; CHECK-NEXT: vsra.vi v9, v9, 1
+; CHECK-NEXT: vadd.vv v8, v8, v8
+; CHECK-NEXT: vsra.vi v8, v8, 1
+; CHECK-NEXT: vsetvli zero, a0, e8, mf2, ta, ma
+; CHECK-NEXT: vsub.vv v8, v8, v9, v0.t
+; CHECK-NEXT: li a0, 63
+; CHECK-NEXT: vmin.vx v8, v8, a0, v0.t
+; CHECK-NEXT: li a0, 192
+; CHECK-NEXT: vmax.vx v8, v8, a0, v0.t
+; CHECK-NEXT: ret
+ %v = call <8 x i7> @llvm.vp.ssub.sat.v8i7(<8 x i7> %va, <8 x i7> %b, <8 x i1> %m, i32 %evl)
+ ret <8 x i7> %v
+}
+
+declare <2 x i8> @llvm.vp.ssub.sat.v2i8(<2 x i8>, <2 x i8>, <2 x i1>, i32)
+
+define <2 x i8> @vssub_vv_v2i8(<2 x i8> %va, <2 x i8> %b, <2 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vssub_vv_v2i8:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e8, mf8, ta, ma
+; CHECK-NEXT: vssub.vv v8, v8, v9, v0.t
+; CHECK-NEXT: ret
+ %v = call <2 x i8> @llvm.vp.ssub.sat.v2i8(<2 x i8> %va, <2 x i8> %b, <2 x i1> %m, i32 %evl)
+ ret <2 x i8> %v
+}
+
+define <2 x i8> @vssub_vv_v2i8_unmasked(<2 x i8> %va, <2 x i8> %b, i32 zeroext %evl) {
+; CHECK-LABEL: vssub_vv_v2i8_unmasked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e8, mf8, ta, ma
+; CHECK-NEXT: vssub.vv v8, v8, v9
+; CHECK-NEXT: ret
+ %head = insertelement <2 x i1> poison, i1 true, i32 0
+ %m = shufflevector <2 x i1> %head, <2 x i1> poison, <2 x i32> zeroinitializer
+ %v = call <2 x i8> @llvm.vp.ssub.sat.v2i8(<2 x i8> %va, <2 x i8> %b, <2 x i1> %m, i32 %evl)
+ ret <2 x i8> %v
+}
+
+define <2 x i8> @vssub_vx_v2i8(<2 x i8> %va, i8 %b, <2 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vssub_vx_v2i8:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, ma
+; CHECK-NEXT: vssub.vx v8, v8, a0, v0.t
+; CHECK-NEXT: ret
+ %elt.head = insertelement <2 x i8> poison, i8 %b, i32 0
+ %vb = shufflevector <2 x i8> %elt.head, <2 x i8> poison, <2 x i32> zeroinitializer
+ %v = call <2 x i8> @llvm.vp.ssub.sat.v2i8(<2 x i8> %va, <2 x i8> %vb, <2 x i1> %m, i32 %evl)
+ ret <2 x i8> %v
+}
+
+define <2 x i8> @vssub_vx_v2i8_unmasked(<2 x i8> %va, i8 %b, i32 zeroext %evl) {
+; CHECK-LABEL: vssub_vx_v2i8_unmasked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, ma
+; CHECK-NEXT: vssub.vx v8, v8, a0
+; CHECK-NEXT: ret
+ %elt.head = insertelement <2 x i8> poison, i8 %b, i32 0
+ %vb = shufflevector <2 x i8> %elt.head, <2 x i8> poison, <2 x i32> zeroinitializer
+ %head = insertelement <2 x i1> poison, i1 true, i32 0
+ %m = shufflevector <2 x i1> %head, <2 x i1> poison, <2 x i32> zeroinitializer
+ %v = call <2 x i8> @llvm.vp.ssub.sat.v2i8(<2 x i8> %va, <2 x i8> %vb, <2 x i1> %m, i32 %evl)
+ ret <2 x i8> %v
+}
+
+define <2 x i8> @vssub_vi_v2i8(<2 x i8> %va, <2 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vssub_vi_v2i8:
+; CHECK: # %bb.0:
+; CHECK-NEXT: li a1, -1
+; CHECK-NEXT: vsetvli zero, a0, e8, mf8, ta, ma
+; CHECK-NEXT: vssub.vx v8, v8, a1, v0.t
+; CHECK-NEXT: ret
+ %elt.head = insertelement <2 x i8> poison, i8 -1, i32 0
+ %vb = shufflevector <2 x i8> %elt.head, <2 x i8> poison, <2 x i32> zeroinitializer
+ %v = call <2 x i8> @llvm.vp.ssub.sat.v2i8(<2 x i8> %va, <2 x i8> %vb, <2 x i1> %m, i32 %evl)
+ ret <2 x i8> %v
+}
+
+define <2 x i8> @vssub_vi_v2i8_unmasked(<2 x i8> %va, i32 zeroext %evl) {
+; CHECK-LABEL: vssub_vi_v2i8_unmasked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: li a1, -1
+; CHECK-NEXT: vsetvli zero, a0, e8, mf8, ta, ma
+; CHECK-NEXT: vssub.vx v8, v8, a1
+; CHECK-NEXT: ret
+ %elt.head = insertelement <2 x i8> poison, i8 -1, i32 0
+ %vb = shufflevector <2 x i8> %elt.head, <2 x i8> poison, <2 x i32> zeroinitializer
+ %head = insertelement <2 x i1> poison, i1 true, i32 0
+ %m = shufflevector <2 x i1> %head, <2 x i1> poison, <2 x i32> zeroinitializer
+ %v = call <2 x i8> @llvm.vp.ssub.sat.v2i8(<2 x i8> %va, <2 x i8> %vb, <2 x i1> %m, i32 %evl)
+ ret <2 x i8> %v
+}
+
+declare <4 x i8> @llvm.vp.ssub.sat.v4i8(<4 x i8>, <4 x i8>, <4 x i1>, i32)
+
+define <4 x i8> @vssub_vv_v4i8(<4 x i8> %va, <4 x i8> %b, <4 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vssub_vv_v4i8:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e8, mf4, ta, ma
+; CHECK-NEXT: vssub.vv v8, v8, v9, v0.t
+; CHECK-NEXT: ret
+ %v = call <4 x i8> @llvm.vp.ssub.sat.v4i8(<4 x i8> %va, <4 x i8> %b, <4 x i1> %m, i32 %evl)
+ ret <4 x i8> %v
+}
+
+define <4 x i8> @vssub_vv_v4i8_unmasked(<4 x i8> %va, <4 x i8> %b, i32 zeroext %evl) {
+; CHECK-LABEL: vssub_vv_v4i8_unmasked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e8, mf4, ta, ma
+; CHECK-NEXT: vssub.vv v8, v8, v9
+; CHECK-NEXT: ret
+ %head = insertelement <4 x i1> poison, i1 true, i32 0
+ %m = shufflevector <4 x i1> %head, <4 x i1> poison, <4 x i32> zeroinitializer
+ %v = call <4 x i8> @llvm.vp.ssub.sat.v4i8(<4 x i8> %va, <4 x i8> %b, <4 x i1> %m, i32 %evl)
+ ret <4 x i8> %v
+}
+
+define <4 x i8> @vssub_vx_v4i8(<4 x i8> %va, i8 %b, <4 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vssub_vx_v4i8:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, ma
+; CHECK-NEXT: vssub.vx v8, v8, a0, v0.t
+; CHECK-NEXT: ret
+ %elt.head = insertelement <4 x i8> poison, i8 %b, i32 0
+ %vb = shufflevector <4 x i8> %elt.head, <4 x i8> poison, <4 x i32> zeroinitializer
+ %v = call <4 x i8> @llvm.vp.ssub.sat.v4i8(<4 x i8> %va, <4 x i8> %vb, <4 x i1> %m, i32 %evl)
+ ret <4 x i8> %v
+}
+
+define <4 x i8> @vssub_vx_v4i8_commute(<4 x i8> %va, i8 %b, <4 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vssub_vx_v4i8_commute:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetivli zero, 4, e8, mf4, ta, ma
+; CHECK-NEXT: vmv.v.x v9, a0
+; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, ma
+; CHECK-NEXT: vssub.vv v8, v9, v8, v0.t
+; CHECK-NEXT: ret
+ %elt.head = insertelement <4 x i8> poison, i8 %b, i32 0
+ %vb = shufflevector <4 x i8> %elt.head, <4 x i8> poison, <4 x i32> zeroinitializer
+ %v = call <4 x i8> @llvm.vp.ssub.sat.v4i8(<4 x i8> %vb, <4 x i8> %va, <4 x i1> %m, i32 %evl)
+ ret <4 x i8> %v
+}
+
+define <4 x i8> @vssub_vx_v4i8_unmasked(<4 x i8> %va, i8 %b, i32 zeroext %evl) {
+; CHECK-LABEL: vssub_vx_v4i8_unmasked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, ma
+; CHECK-NEXT: vssub.vx v8, v8, a0
+; CHECK-NEXT: ret
+ %elt.head = insertelement <4 x i8> poison, i8 %b, i32 0
+ %vb = shufflevector <4 x i8> %elt.head, <4 x i8> poison, <4 x i32> zeroinitializer
+ %head = insertelement <4 x i1> poison, i1 true, i32 0
+ %m = shufflevector <4 x i1> %head, <4 x i1> poison, <4 x i32> zeroinitializer
+ %v = call <4 x i8> @llvm.vp.ssub.sat.v4i8(<4 x i8> %va, <4 x i8> %vb, <4 x i1> %m, i32 %evl)
+ ret <4 x i8> %v
+}
+
+define <4 x i8> @vssub_vi_v4i8(<4 x i8> %va, <4 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vssub_vi_v4i8:
+; CHECK: # %bb.0:
+; CHECK-NEXT: li a1, -1
+; CHECK-NEXT: vsetvli zero, a0, e8, mf4, ta, ma
+; CHECK-NEXT: vssub.vx v8, v8, a1, v0.t
+; CHECK-NEXT: ret
+ %elt.head = insertelement <4 x i8> poison, i8 -1, i32 0
+ %vb = shufflevector <4 x i8> %elt.head, <4 x i8> poison, <4 x i32> zeroinitializer
+ %v = call <4 x i8> @llvm.vp.ssub.sat.v4i8(<4 x i8> %va, <4 x i8> %vb, <4 x i1> %m, i32 %evl)
+ ret <4 x i8> %v
+}
+
+define <4 x i8> @vssub_vi_v4i8_unmasked(<4 x i8> %va, i32 zeroext %evl) {
+; CHECK-LABEL: vssub_vi_v4i8_unmasked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: li a1, -1
+; CHECK-NEXT: vsetvli zero, a0, e8, mf4, ta, ma
+; CHECK-NEXT: vssub.vx v8, v8, a1
+; CHECK-NEXT: ret
+ %elt.head = insertelement <4 x i8> poison, i8 -1, i32 0
+ %vb = shufflevector <4 x i8> %elt.head, <4 x i8> poison, <4 x i32> zeroinitializer
+ %head = insertelement <4 x i1> poison, i1 true, i32 0
+ %m = shufflevector <4 x i1> %head, <4 x i1> poison, <4 x i32> zeroinitializer
+ %v = call <4 x i8> @llvm.vp.ssub.sat.v4i8(<4 x i8> %va, <4 x i8> %vb, <4 x i1> %m, i32 %evl)
+ ret <4 x i8> %v
+}
+
+declare <5 x i8> @llvm.vp.ssub.sat.v5i8(<5 x i8>, <5 x i8>, <5 x i1>, i32)
+
+define <5 x i8> @vssub_vv_v5i8(<5 x i8> %va, <5 x i8> %b, <5 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vssub_vv_v5i8:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e8, mf2, ta, ma
+; CHECK-NEXT: vssub.vv v8, v8, v9, v0.t
+; CHECK-NEXT: ret
+ %v = call <5 x i8> @llvm.vp.ssub.sat.v5i8(<5 x i8> %va, <5 x i8> %b, <5 x i1> %m, i32 %evl)
+ ret <5 x i8> %v
+}
+
+define <5 x i8> @vssub_vv_v5i8_unmasked(<5 x i8> %va, <5 x i8> %b, i32 zeroext %evl) {
+; CHECK-LABEL: vssub_vv_v5i8_unmasked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e8, mf2, ta, ma
+; CHECK-NEXT: vssub.vv v8, v8, v9
+; CHECK-NEXT: ret
+ %head = insertelement <5 x i1> poison, i1 true, i32 0
+ %m = shufflevector <5 x i1> %head, <5 x i1> poison, <5 x i32> zeroinitializer
+ %v = call <5 x i8> @llvm.vp.ssub.sat.v5i8(<5 x i8> %va, <5 x i8> %b, <5 x i1> %m, i32 %evl)
+ ret <5 x i8> %v
+}
+
+define <5 x i8> @vssub_vx_v5i8(<5 x i8> %va, i8 %b, <5 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vssub_vx_v5i8:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma
+; CHECK-NEXT: vssub.vx v8, v8, a0, v0.t
+; CHECK-NEXT: ret
+ %elt.head = insertelement <5 x i8> poison, i8 %b, i32 0
+ %vb = shufflevector <5 x i8> %elt.head, <5 x i8> poison, <5 x i32> zeroinitializer
+ %v = call <5 x i8> @llvm.vp.ssub.sat.v5i8(<5 x i8> %va, <5 x i8> %vb, <5 x i1> %m, i32 %evl)
+ ret <5 x i8> %v
+}
+
+define <5 x i8> @vssub_vx_v5i8_unmasked(<5 x i8> %va, i8 %b, i32 zeroext %evl) {
+; CHECK-LABEL: vssub_vx_v5i8_unmasked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma
+; CHECK-NEXT: vssub.vx v8, v8, a0
+; CHECK-NEXT: ret
+ %elt.head = insertelement <5 x i8> poison, i8 %b, i32 0
+ %vb = shufflevector <5 x i8> %elt.head, <5 x i8> poison, <5 x i32> zeroinitializer
+ %head = insertelement <5 x i1> poison, i1 true, i32 0
+ %m = shufflevector <5 x i1> %head, <5 x i1> poison, <5 x i32> zeroinitializer
+ %v = call <5 x i8> @llvm.vp.ssub.sat.v5i8(<5 x i8> %va, <5 x i8> %vb, <5 x i1> %m, i32 %evl)
+ ret <5 x i8> %v
+}
+
+define <5 x i8> @vssub_vi_v5i8(<5 x i8> %va, <5 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vssub_vi_v5i8:
+; CHECK: # %bb.0:
+; CHECK-NEXT: li a1, -1
+; CHECK-NEXT: vsetvli zero, a0, e8, mf2, ta, ma
+; CHECK-NEXT: vssub.vx v8, v8, a1, v0.t
+; CHECK-NEXT: ret
+ %elt.head = insertelement <5 x i8> poison, i8 -1, i32 0
+ %vb = shufflevector <5 x i8> %elt.head, <5 x i8> poison, <5 x i32> zeroinitializer
+ %v = call <5 x i8> @llvm.vp.ssub.sat.v5i8(<5 x i8> %va, <5 x i8> %vb, <5 x i1> %m, i32 %evl)
+ ret <5 x i8> %v
+}
+
+define <5 x i8> @vssub_vi_v5i8_unmasked(<5 x i8> %va, i32 zeroext %evl) {
+; CHECK-LABEL: vssub_vi_v5i8_unmasked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: li a1, -1
+; CHECK-NEXT: vsetvli zero, a0, e8, mf2, ta, ma
+; CHECK-NEXT: vssub.vx v8, v8, a1
+; CHECK-NEXT: ret
+ %elt.head = insertelement <5 x i8> poison, i8 -1, i32 0
+ %vb = shufflevector <5 x i8> %elt.head, <5 x i8> poison, <5 x i32> zeroinitializer
+ %head = insertelement <5 x i1> poison, i1 true, i32 0
+ %m = shufflevector <5 x i1> %head, <5 x i1> poison, <5 x i32> zeroinitializer
+ %v = call <5 x i8> @llvm.vp.ssub.sat.v5i8(<5 x i8> %va, <5 x i8> %vb, <5 x i1> %m, i32 %evl)
+ ret <5 x i8> %v
+}
+
+declare <8 x i8> @llvm.vp.ssub.sat.v8i8(<8 x i8>, <8 x i8>, <8 x i1>, i32)
+
+define <8 x i8> @vssub_vv_v8i8(<8 x i8> %va, <8 x i8> %b, <8 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vssub_vv_v8i8:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e8, mf2, ta, ma
+; CHECK-NEXT: vssub.vv v8, v8, v9, v0.t
+; CHECK-NEXT: ret
+ %v = call <8 x i8> @llvm.vp.ssub.sat.v8i8(<8 x i8> %va, <8 x i8> %b, <8 x i1> %m, i32 %evl)
+ ret <8 x i8> %v
+}
+
+define <8 x i8> @vssub_vv_v8i8_unmasked(<8 x i8> %va, <8 x i8> %b, i32 zeroext %evl) {
+; CHECK-LABEL: vssub_vv_v8i8_unmasked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e8, mf2, ta, ma
+; CHECK-NEXT: vssub.vv v8, v8, v9
+; CHECK-NEXT: ret
+ %head = insertelement <8 x i1> poison, i1 true, i32 0
+ %m = shufflevector <8 x i1> %head, <8 x i1> poison, <8 x i32> zeroinitializer
+ %v = call <8 x i8> @llvm.vp.ssub.sat.v8i8(<8 x i8> %va, <8 x i8> %b, <8 x i1> %m, i32 %evl)
+ ret <8 x i8> %v
+}
+
+define <8 x i8> @vssub_vx_v8i8(<8 x i8> %va, i8 %b, <8 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vssub_vx_v8i8:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma
+; CHECK-NEXT: vssub.vx v8, v8, a0, v0.t
+; CHECK-NEXT: ret
+ %elt.head = insertelement <8 x i8> poison, i8 %b, i32 0
+ %vb = shufflevector <8 x i8> %elt.head, <8 x i8> poison, <8 x i32> zeroinitializer
+ %v = call <8 x i8> @llvm.vp.ssub.sat.v8i8(<8 x i8> %va, <8 x i8> %vb, <8 x i1> %m, i32 %evl)
+ ret <8 x i8> %v
+}
+
+define <8 x i8> @vssub_vx_v8i8_unmasked(<8 x i8> %va, i8 %b, i32 zeroext %evl) {
+; CHECK-LABEL: vssub_vx_v8i8_unmasked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma
+; CHECK-NEXT: vssub.vx v8, v8, a0
+; CHECK-NEXT: ret
+ %elt.head = insertelement <8 x i8> poison, i8 %b, i32 0
+ %vb = shufflevector <8 x i8> %elt.head, <8 x i8> poison, <8 x i32> zeroinitializer
+ %head = insertelement <8 x i1> poison, i1 true, i32 0
+ %m = shufflevector <8 x i1> %head, <8 x i1> poison, <8 x i32> zeroinitializer
+ %v = call <8 x i8> @llvm.vp.ssub.sat.v8i8(<8 x i8> %va, <8 x i8> %vb, <8 x i1> %m, i32 %evl)
+ ret <8 x i8> %v
+}
+
+define <8 x i8> @vssub_vi_v8i8(<8 x i8> %va, <8 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vssub_vi_v8i8:
+; CHECK: # %bb.0:
+; CHECK-NEXT: li a1, -1
+; CHECK-NEXT: vsetvli zero, a0, e8, mf2, ta, ma
+; CHECK-NEXT: vssub.vx v8, v8, a1, v0.t
+; CHECK-NEXT: ret
+ %elt.head = insertelement <8 x i8> poison, i8 -1, i32 0
+ %vb = shufflevector <8 x i8> %elt.head, <8 x i8> poison, <8 x i32> zeroinitializer
+ %v = call <8 x i8> @llvm.vp.ssub.sat.v8i8(<8 x i8> %va, <8 x i8> %vb, <8 x i1> %m, i32 %evl)
+ ret <8 x i8> %v
+}
+
+define <8 x i8> @vssub_vi_v8i8_unmasked(<8 x i8> %va, i32 zeroext %evl) {
+; CHECK-LABEL: vssub_vi_v8i8_unmasked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: li a1, -1
+; CHECK-NEXT: vsetvli zero, a0, e8, mf2, ta, ma
+; CHECK-NEXT: vssub.vx v8, v8, a1
+; CHECK-NEXT: ret
+ %elt.head = insertelement <8 x i8> poison, i8 -1, i32 0
+ %vb = shufflevector <8 x i8> %elt.head, <8 x i8> poison, <8 x i32> zeroinitializer
+ %head = insertelement <8 x i1> poison, i1 true, i32 0
+ %m = shufflevector <8 x i1> %head, <8 x i1> poison, <8 x i32> zeroinitializer
+ %v = call <8 x i8> @llvm.vp.ssub.sat.v8i8(<8 x i8> %va, <8 x i8> %vb, <8 x i1> %m, i32 %evl)
+ ret <8 x i8> %v
+}
+
+declare <16 x i8> @llvm.vp.ssub.sat.v16i8(<16 x i8>, <16 x i8>, <16 x i1>, i32)
+
+define <16 x i8> @vssub_vv_v16i8(<16 x i8> %va, <16 x i8> %b, <16 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vssub_vv_v16i8:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e8, m1, ta, ma
+; CHECK-NEXT: vssub.vv v8, v8, v9, v0.t
+; CHECK-NEXT: ret
+ %v = call <16 x i8> @llvm.vp.ssub.sat.v16i8(<16 x i8> %va, <16 x i8> %b, <16 x i1> %m, i32 %evl)
+ ret <16 x i8> %v
+}
+
+define <16 x i8> @vssub_vv_v16i8_unmasked(<16 x i8> %va, <16 x i8> %b, i32 zeroext %evl) {
+; CHECK-LABEL: vssub_vv_v16i8_unmasked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e8, m1, ta, ma
+; CHECK-NEXT: vssub.vv v8, v8, v9
+; CHECK-NEXT: ret
+ %head = insertelement <16 x i1> poison, i1 true, i32 0
+ %m = shufflevector <16 x i1> %head, <16 x i1> poison, <16 x i32> zeroinitializer
+ %v = call <16 x i8> @llvm.vp.ssub.sat.v16i8(<16 x i8> %va, <16 x i8> %b, <16 x i1> %m, i32 %evl)
+ ret <16 x i8> %v
+}
+
+define <16 x i8> @vssub_vx_v16i8(<16 x i8> %va, i8 %b, <16 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vssub_vx_v16i8:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, ma
+; CHECK-NEXT: vssub.vx v8, v8, a0, v0.t
+; CHECK-NEXT: ret
+ %elt.head = insertelement <16 x i8> poison, i8 %b, i32 0
+ %vb = shufflevector <16 x i8> %elt.head, <16 x i8> poison, <16 x i32> zeroinitializer
+ %v = call <16 x i8> @llvm.vp.ssub.sat.v16i8(<16 x i8> %va, <16 x i8> %vb, <16 x i1> %m, i32 %evl)
+ ret <16 x i8> %v
+}
+
+define <16 x i8> @vssub_vx_v16i8_unmasked(<16 x i8> %va, i8 %b, i32 zeroext %evl) {
+; CHECK-LABEL: vssub_vx_v16i8_unmasked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, ma
+; CHECK-NEXT: vssub.vx v8, v8, a0
+; CHECK-NEXT: ret
+ %elt.head = insertelement <16 x i8> poison, i8 %b, i32 0
+ %vb = shufflevector <16 x i8> %elt.head, <16 x i8> poison, <16 x i32> zeroinitializer
+ %head = insertelement <16 x i1> poison, i1 true, i32 0
+ %m = shufflevector <16 x i1> %head, <16 x i1> poison, <16 x i32> zeroinitializer
+ %v = call <16 x i8> @llvm.vp.ssub.sat.v16i8(<16 x i8> %va, <16 x i8> %vb, <16 x i1> %m, i32 %evl)
+ ret <16 x i8> %v
+}
+
+define <16 x i8> @vssub_vi_v16i8(<16 x i8> %va, <16 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vssub_vi_v16i8:
+; CHECK: # %bb.0:
+; CHECK-NEXT: li a1, -1
+; CHECK-NEXT: vsetvli zero, a0, e8, m1, ta, ma
+; CHECK-NEXT: vssub.vx v8, v8, a1, v0.t
+; CHECK-NEXT: ret
+ %elt.head = insertelement <16 x i8> poison, i8 -1, i32 0
+ %vb = shufflevector <16 x i8> %elt.head, <16 x i8> poison, <16 x i32> zeroinitializer
+ %v = call <16 x i8> @llvm.vp.ssub.sat.v16i8(<16 x i8> %va, <16 x i8> %vb, <16 x i1> %m, i32 %evl)
+ ret <16 x i8> %v
+}
+
+define <16 x i8> @vssub_vi_v16i8_unmasked(<16 x i8> %va, i32 zeroext %evl) {
+; CHECK-LABEL: vssub_vi_v16i8_unmasked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: li a1, -1
+; CHECK-NEXT: vsetvli zero, a0, e8, m1, ta, ma
+; CHECK-NEXT: vssub.vx v8, v8, a1
+; CHECK-NEXT: ret
+ %elt.head = insertelement <16 x i8> poison, i8 -1, i32 0
+ %vb = shufflevector <16 x i8> %elt.head, <16 x i8> poison, <16 x i32> zeroinitializer
+ %head = insertelement <16 x i1> poison, i1 true, i32 0
+ %m = shufflevector <16 x i1> %head, <16 x i1> poison, <16 x i32> zeroinitializer
+ %v = call <16 x i8> @llvm.vp.ssub.sat.v16i8(<16 x i8> %va, <16 x i8> %vb, <16 x i1> %m, i32 %evl)
+ ret <16 x i8> %v
+}
+
+declare <256 x i8> @llvm.vp.ssub.sat.v258i8(<256 x i8>, <256 x i8>, <256 x i1>, i32)
+
+define <256 x i8> @vssub_vi_v258i8(<256 x i8> %va, <256 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vssub_vi_v258i8:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmv1r.v v24, v0
+; CHECK-NEXT: li a2, 128
+; CHECK-NEXT: vsetvli zero, a2, e8, m8, ta, ma
+; CHECK-NEXT: vlm.v v0, (a0)
+; CHECK-NEXT: addi a0, a1, -128
+; CHECK-NEXT: sltu a3, a1, a0
+; CHECK-NEXT: addi a3, a3, -1
+; CHECK-NEXT: and a3, a3, a0
+; CHECK-NEXT: li a0, -1
+; CHECK-NEXT: vsetvli zero, a3, e8, m8, ta, ma
+; CHECK-NEXT: vssub.vx v16, v16, a0, v0.t
+; CHECK-NEXT: bltu a1, a2, .LBB32_2
+; CHECK-NEXT: # %bb.1:
+; CHECK-NEXT: li a1, 128
+; CHECK-NEXT: .LBB32_2:
+; CHECK-NEXT: vsetvli zero, a1, e8, m8, ta, ma
+; CHECK-NEXT: vmv1r.v v0, v24
+; CHECK-NEXT: vssub.vx v8, v8, a0, v0.t
+; CHECK-NEXT: ret
+ %elt.head = insertelement <256 x i8> poison, i8 -1, i32 0
+ %vb = shufflevector <256 x i8> %elt.head, <256 x i8> poison, <256 x i32> zeroinitializer
+ %v = call <256 x i8> @llvm.vp.ssub.sat.v258i8(<256 x i8> %va, <256 x i8> %vb, <256 x i1> %m, i32 %evl)
+ ret <256 x i8> %v
+}
+
+define <256 x i8> @vssub_vi_v258i8_unmasked(<256 x i8> %va, i32 zeroext %evl) {
+; CHECK-LABEL: vssub_vi_v258i8_unmasked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: li a2, 128
+; CHECK-NEXT: mv a1, a0
+; CHECK-NEXT: bltu a0, a2, .LBB33_2
+; CHECK-NEXT: # %bb.1:
+; CHECK-NEXT: li a1, 128
+; CHECK-NEXT: .LBB33_2:
+; CHECK-NEXT: li a2, -1
+; CHECK-NEXT: vsetvli zero, a1, e8, m8, ta, ma
+; CHECK-NEXT: vssub.vx v8, v8, a2
+; CHECK-NEXT: addi a1, a0, -128
+; CHECK-NEXT: sltu a0, a0, a1
+; CHECK-NEXT: addi a0, a0, -1
+; CHECK-NEXT: and a0, a0, a1
+; CHECK-NEXT: vsetvli zero, a0, e8, m8, ta, ma
+; CHECK-NEXT: vssub.vx v16, v16, a2
+; CHECK-NEXT: ret
+ %elt.head = insertelement <256 x i8> poison, i8 -1, i32 0
+ %vb = shufflevector <256 x i8> %elt.head, <256 x i8> poison, <256 x i32> zeroinitializer
+ %head = insertelement <256 x i1> poison, i1 true, i32 0
+ %m = shufflevector <256 x i1> %head, <256 x i1> poison, <256 x i32> zeroinitializer
+ %v = call <256 x i8> @llvm.vp.ssub.sat.v258i8(<256 x i8> %va, <256 x i8> %vb, <256 x i1> %m, i32 %evl)
+ ret <256 x i8> %v
+}
+
+; Test splitting when the %evl is a known constant.
+
+define <256 x i8> @vssub_vi_v258i8_evl129(<256 x i8> %va, <256 x i1> %m) {
+; CHECK-LABEL: vssub_vi_v258i8_evl129:
+; CHECK: # %bb.0:
+; CHECK-NEXT: li a1, 128
+; CHECK-NEXT: vsetvli zero, a1, e8, m8, ta, ma
+; CHECK-NEXT: vlm.v v24, (a0)
+; CHECK-NEXT: li a0, -1
+; CHECK-NEXT: vssub.vx v8, v8, a0, v0.t
+; CHECK-NEXT: vsetivli zero, 1, e8, m8, ta, ma
+; CHECK-NEXT: vmv1r.v v0, v24
+; CHECK-NEXT: vssub.vx v16, v16, a0, v0.t
+; CHECK-NEXT: ret
+ %elt.head = insertelement <256 x i8> poison, i8 -1, i32 0
+ %vb = shufflevector <256 x i8> %elt.head, <256 x i8> poison, <256 x i32> zeroinitializer
+ %v = call <256 x i8> @llvm.vp.ssub.sat.v258i8(<256 x i8> %va, <256 x i8> %vb, <256 x i1> %m, i32 129)
+ ret <256 x i8> %v
+}
+
+; FIXME: The upper half is doing nothing; with an EVL of 128 the second
+; vssub.vx executes with vl=0.
+
+define <256 x i8> @vssub_vi_v258i8_evl128(<256 x i8> %va, <256 x i1> %m) {
+; CHECK-LABEL: vssub_vi_v258i8_evl128:
+; CHECK: # %bb.0:
+; CHECK-NEXT: li a1, 128
+; CHECK-NEXT: vsetvli zero, a1, e8, m8, ta, ma
+; CHECK-NEXT: vlm.v v24, (a0)
+; CHECK-NEXT: li a0, -1
+; CHECK-NEXT: vssub.vx v8, v8, a0, v0.t
+; CHECK-NEXT: vsetivli zero, 0, e8, m8, ta, ma
+; CHECK-NEXT: vmv1r.v v0, v24
+; CHECK-NEXT: vssub.vx v16, v16, a0, v0.t
+; CHECK-NEXT: ret
+ %elt.head = insertelement <256 x i8> poison, i8 -1, i32 0
+ %vb = shufflevector <256 x i8> %elt.head, <256 x i8> poison, <256 x i32> zeroinitializer
+ %v = call <256 x i8> @llvm.vp.ssub.sat.v258i8(<256 x i8> %va, <256 x i8> %vb, <256 x i1> %m, i32 128)
+ ret <256 x i8> %v
+}
+
+declare <2 x i16> @llvm.vp.ssub.sat.v2i16(<2 x i16>, <2 x i16>, <2 x i1>, i32)
+
+define <2 x i16> @vssub_vv_v2i16(<2 x i16> %va, <2 x i16> %b, <2 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vssub_vv_v2i16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e16, mf4, ta, ma
+; CHECK-NEXT: vssub.vv v8, v8, v9, v0.t
+; CHECK-NEXT: ret
+ %v = call <2 x i16> @llvm.vp.ssub.sat.v2i16(<2 x i16> %va, <2 x i16> %b, <2 x i1> %m, i32 %evl)
+ ret <2 x i16> %v
+}
+
+define <2 x i16> @vssub_vv_v2i16_unmasked(<2 x i16> %va, <2 x i16> %b, i32 zeroext %evl) {
+; CHECK-LABEL: vssub_vv_v2i16_unmasked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e16, mf4, ta, ma
+; CHECK-NEXT: vssub.vv v8, v8, v9
+; CHECK-NEXT: ret
+ %head = insertelement <2 x i1> poison, i1 true, i32 0
+ %m = shufflevector <2 x i1> %head, <2 x i1> poison, <2 x i32> zeroinitializer
+ %v = call <2 x i16> @llvm.vp.ssub.sat.v2i16(<2 x i16> %va, <2 x i16> %b, <2 x i1> %m, i32 %evl)
+ ret <2 x i16> %v
+}
+
+define <2 x i16> @vssub_vx_v2i16(<2 x i16> %va, i16 %b, <2 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vssub_vx_v2i16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma
+; CHECK-NEXT: vssub.vx v8, v8, a0, v0.t
+; CHECK-NEXT: ret
+ %elt.head = insertelement <2 x i16> poison, i16 %b, i32 0
+ %vb = shufflevector <2 x i16> %elt.head, <2 x i16> poison, <2 x i32> zeroinitializer
+ %v = call <2 x i16> @llvm.vp.ssub.sat.v2i16(<2 x i16> %va, <2 x i16> %vb, <2 x i1> %m, i32 %evl)
+ ret <2 x i16> %v
+}
+
+define <2 x i16> @vssub_vx_v2i16_unmasked(<2 x i16> %va, i16 %b, i32 zeroext %evl) {
+; CHECK-LABEL: vssub_vx_v2i16_unmasked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma
+; CHECK-NEXT: vssub.vx v8, v8, a0
+; CHECK-NEXT: ret
+ %elt.head = insertelement <2 x i16> poison, i16 %b, i32 0
+ %vb = shufflevector <2 x i16> %elt.head, <2 x i16> poison, <2 x i32> zeroinitializer
+ %head = insertelement <2 x i1> poison, i1 true, i32 0
+ %m = shufflevector <2 x i1> %head, <2 x i1> poison, <2 x i32> zeroinitializer
+ %v = call <2 x i16> @llvm.vp.ssub.sat.v2i16(<2 x i16> %va, <2 x i16> %vb, <2 x i1> %m, i32 %evl)
+ ret <2 x i16> %v
+}
+
+define <2 x i16> @vssub_vi_v2i16(<2 x i16> %va, <2 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vssub_vi_v2i16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: li a1, -1
+; CHECK-NEXT: vsetvli zero, a0, e16, mf4, ta, ma
+; CHECK-NEXT: vssub.vx v8, v8, a1, v0.t
+; CHECK-NEXT: ret
+ %elt.head = insertelement <2 x i16> poison, i16 -1, i32 0
+ %vb = shufflevector <2 x i16> %elt.head, <2 x i16> poison, <2 x i32> zeroinitializer
+ %v = call <2 x i16> @llvm.vp.ssub.sat.v2i16(<2 x i16> %va, <2 x i16> %vb, <2 x i1> %m, i32 %evl)
+ ret <2 x i16> %v
+}
+
+define <2 x i16> @vssub_vi_v2i16_unmasked(<2 x i16> %va, i32 zeroext %evl) {
+; CHECK-LABEL: vssub_vi_v2i16_unmasked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: li a1, -1
+; CHECK-NEXT: vsetvli zero, a0, e16, mf4, ta, ma
+; CHECK-NEXT: vssub.vx v8, v8, a1
+; CHECK-NEXT: ret
+ %elt.head = insertelement <2 x i16> poison, i16 -1, i32 0
+ %vb = shufflevector <2 x i16> %elt.head, <2 x i16> poison, <2 x i32> zeroinitializer
+ %head = insertelement <2 x i1> poison, i1 true, i32 0
+ %m = shufflevector <2 x i1> %head, <2 x i1> poison, <2 x i32> zeroinitializer
+ %v = call <2 x i16> @llvm.vp.ssub.sat.v2i16(<2 x i16> %va, <2 x i16> %vb, <2 x i1> %m, i32 %evl)
+ ret <2 x i16> %v
+}
+
+declare <4 x i16> @llvm.vp.ssub.sat.v4i16(<4 x i16>, <4 x i16>, <4 x i1>, i32)
+
+define <4 x i16> @vssub_vv_v4i16(<4 x i16> %va, <4 x i16> %b, <4 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vssub_vv_v4i16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e16, mf2, ta, ma
+; CHECK-NEXT: vssub.vv v8, v8, v9, v0.t
+; CHECK-NEXT: ret
+ %v = call <4 x i16> @llvm.vp.ssub.sat.v4i16(<4 x i16> %va, <4 x i16> %b, <4 x i1> %m, i32 %evl)
+ ret <4 x i16> %v
+}
+
+define <4 x i16> @vssub_vv_v4i16_unmasked(<4 x i16> %va, <4 x i16> %b, i32 zeroext %evl) {
+; CHECK-LABEL: vssub_vv_v4i16_unmasked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e16, mf2, ta, ma
+; CHECK-NEXT: vssub.vv v8, v8, v9
+; CHECK-NEXT: ret
+ %head = insertelement <4 x i1> poison, i1 true, i32 0
+ %m = shufflevector <4 x i1> %head, <4 x i1> poison, <4 x i32> zeroinitializer
+ %v = call <4 x i16> @llvm.vp.ssub.sat.v4i16(<4 x i16> %va, <4 x i16> %b, <4 x i1> %m, i32 %evl)
+ ret <4 x i16> %v
+}
+
+define <4 x i16> @vssub_vx_v4i16(<4 x i16> %va, i16 %b, <4 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vssub_vx_v4i16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma
+; CHECK-NEXT: vssub.vx v8, v8, a0, v0.t
+; CHECK-NEXT: ret
+ %elt.head = insertelement <4 x i16> poison, i16 %b, i32 0
+ %vb = shufflevector <4 x i16> %elt.head, <4 x i16> poison, <4 x i32> zeroinitializer
+ %v = call <4 x i16> @llvm.vp.ssub.sat.v4i16(<4 x i16> %va, <4 x i16> %vb, <4 x i1> %m, i32 %evl)
+ ret <4 x i16> %v
+}
+
+define <4 x i16> @vssub_vx_v4i16_unmasked(<4 x i16> %va, i16 %b, i32 zeroext %evl) {
+; CHECK-LABEL: vssub_vx_v4i16_unmasked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma
+; CHECK-NEXT: vssub.vx v8, v8, a0
+; CHECK-NEXT: ret
+ %elt.head = insertelement <4 x i16> poison, i16 %b, i32 0
+ %vb = shufflevector <4 x i16> %elt.head, <4 x i16> poison, <4 x i32> zeroinitializer
+ %head = insertelement <4 x i1> poison, i1 true, i32 0
+ %m = shufflevector <4 x i1> %head, <4 x i1> poison, <4 x i32> zeroinitializer
+ %v = call <4 x i16> @llvm.vp.ssub.sat.v4i16(<4 x i16> %va, <4 x i16> %vb, <4 x i1> %m, i32 %evl)
+ ret <4 x i16> %v
+}
+
+define <4 x i16> @vssub_vi_v4i16(<4 x i16> %va, <4 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vssub_vi_v4i16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: li a1, -1
+; CHECK-NEXT: vsetvli zero, a0, e16, mf2, ta, ma
+; CHECK-NEXT: vssub.vx v8, v8, a1, v0.t
+; CHECK-NEXT: ret
+ %elt.head = insertelement <4 x i16> poison, i16 -1, i32 0
+ %vb = shufflevector <4 x i16> %elt.head, <4 x i16> poison, <4 x i32> zeroinitializer
+ %v = call <4 x i16> @llvm.vp.ssub.sat.v4i16(<4 x i16> %va, <4 x i16> %vb, <4 x i1> %m, i32 %evl)
+ ret <4 x i16> %v
+}
+
+define <4 x i16> @vssub_vi_v4i16_unmasked(<4 x i16> %va, i32 zeroext %evl) {
+; CHECK-LABEL: vssub_vi_v4i16_unmasked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: li a1, -1
+; CHECK-NEXT: vsetvli zero, a0, e16, mf2, ta, ma
+; CHECK-NEXT: vssub.vx v8, v8, a1
+; CHECK-NEXT: ret
+ %elt.head = insertelement <4 x i16> poison, i16 -1, i32 0
+ %vb = shufflevector <4 x i16> %elt.head, <4 x i16> poison, <4 x i32> zeroinitializer
+ %head = insertelement <4 x i1> poison, i1 true, i32 0
+ %m = shufflevector <4 x i1> %head, <4 x i1> poison, <4 x i32> zeroinitializer
+ %v = call <4 x i16> @llvm.vp.ssub.sat.v4i16(<4 x i16> %va, <4 x i16> %vb, <4 x i1> %m, i32 %evl)
+ ret <4 x i16> %v
+}
+
+declare <8 x i16> @llvm.vp.ssub.sat.v8i16(<8 x i16>, <8 x i16>, <8 x i1>, i32)
+
+define <8 x i16> @vssub_vv_v8i16(<8 x i16> %va, <8 x i16> %b, <8 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vssub_vv_v8i16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, ma
+; CHECK-NEXT: vssub.vv v8, v8, v9, v0.t
+; CHECK-NEXT: ret
+ %v = call <8 x i16> @llvm.vp.ssub.sat.v8i16(<8 x i16> %va, <8 x i16> %b, <8 x i1> %m, i32 %evl)
+ ret <8 x i16> %v
+}
+
+define <8 x i16> @vssub_vv_v8i16_unmasked(<8 x i16> %va, <8 x i16> %b, i32 zeroext %evl) {
+; CHECK-LABEL: vssub_vv_v8i16_unmasked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, ma
+; CHECK-NEXT: vssub.vv v8, v8, v9
+; CHECK-NEXT: ret
+ %head = insertelement <8 x i1> poison, i1 true, i32 0
+ %m = shufflevector <8 x i1> %head, <8 x i1> poison, <8 x i32> zeroinitializer
+ %v = call <8 x i16> @llvm.vp.ssub.sat.v8i16(<8 x i16> %va, <8 x i16> %b, <8 x i1> %m, i32 %evl)
+ ret <8 x i16> %v
+}
+
+define <8 x i16> @vssub_vx_v8i16(<8 x i16> %va, i16 %b, <8 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vssub_vx_v8i16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma
+; CHECK-NEXT: vssub.vx v8, v8, a0, v0.t
+; CHECK-NEXT: ret
+ %elt.head = insertelement <8 x i16> poison, i16 %b, i32 0
+ %vb = shufflevector <8 x i16> %elt.head, <8 x i16> poison, <8 x i32> zeroinitializer
+ %v = call <8 x i16> @llvm.vp.ssub.sat.v8i16(<8 x i16> %va, <8 x i16> %vb, <8 x i1> %m, i32 %evl)
+ ret <8 x i16> %v
+}
+
+define <8 x i16> @vssub_vx_v8i16_unmasked(<8 x i16> %va, i16 %b, i32 zeroext %evl) {
+; CHECK-LABEL: vssub_vx_v8i16_unmasked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma
+; CHECK-NEXT: vssub.vx v8, v8, a0
+; CHECK-NEXT: ret
+ %elt.head = insertelement <8 x i16> poison, i16 %b, i32 0
+ %vb = shufflevector <8 x i16> %elt.head, <8 x i16> poison, <8 x i32> zeroinitializer
+ %head = insertelement <8 x i1> poison, i1 true, i32 0
+ %m = shufflevector <8 x i1> %head, <8 x i1> poison, <8 x i32> zeroinitializer
+ %v = call <8 x i16> @llvm.vp.ssub.sat.v8i16(<8 x i16> %va, <8 x i16> %vb, <8 x i1> %m, i32 %evl)
+ ret <8 x i16> %v
+}
+
+define <8 x i16> @vssub_vi_v8i16(<8 x i16> %va, <8 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vssub_vi_v8i16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: li a1, -1
+; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, ma
+; CHECK-NEXT: vssub.vx v8, v8, a1, v0.t
+; CHECK-NEXT: ret
+ %elt.head = insertelement <8 x i16> poison, i16 -1, i32 0
+ %vb = shufflevector <8 x i16> %elt.head, <8 x i16> poison, <8 x i32> zeroinitializer
+ %v = call <8 x i16> @llvm.vp.ssub.sat.v8i16(<8 x i16> %va, <8 x i16> %vb, <8 x i1> %m, i32 %evl)
+ ret <8 x i16> %v
+}
+
+define <8 x i16> @vssub_vi_v8i16_unmasked(<8 x i16> %va, i32 zeroext %evl) {
+; CHECK-LABEL: vssub_vi_v8i16_unmasked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: li a1, -1
+; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, ma
+; CHECK-NEXT: vssub.vx v8, v8, a1
+; CHECK-NEXT: ret
+ %elt.head = insertelement <8 x i16> poison, i16 -1, i32 0
+ %vb = shufflevector <8 x i16> %elt.head, <8 x i16> poison, <8 x i32> zeroinitializer
+ %head = insertelement <8 x i1> poison, i1 true, i32 0
+ %m = shufflevector <8 x i1> %head, <8 x i1> poison, <8 x i32> zeroinitializer
+ %v = call <8 x i16> @llvm.vp.ssub.sat.v8i16(<8 x i16> %va, <8 x i16> %vb, <8 x i1> %m, i32 %evl)
+ ret <8 x i16> %v
+}
+
+declare <16 x i16> @llvm.vp.ssub.sat.v16i16(<16 x i16>, <16 x i16>, <16 x i1>, i32)
+
+define <16 x i16> @vssub_vv_v16i16(<16 x i16> %va, <16 x i16> %b, <16 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vssub_vv_v16i16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e16, m2, ta, ma
+; CHECK-NEXT: vssub.vv v8, v8, v10, v0.t
+; CHECK-NEXT: ret
+ %v = call <16 x i16> @llvm.vp.ssub.sat.v16i16(<16 x i16> %va, <16 x i16> %b, <16 x i1> %m, i32 %evl)
+ ret <16 x i16> %v
+}
+
+define <16 x i16> @vssub_vv_v16i16_unmasked(<16 x i16> %va, <16 x i16> %b, i32 zeroext %evl) {
+; CHECK-LABEL: vssub_vv_v16i16_unmasked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e16, m2, ta, ma
+; CHECK-NEXT: vssub.vv v8, v8, v10
+; CHECK-NEXT: ret
+ %head = insertelement <16 x i1> poison, i1 true, i32 0
+ %m = shufflevector <16 x i1> %head, <16 x i1> poison, <16 x i32> zeroinitializer
+ %v = call <16 x i16> @llvm.vp.ssub.sat.v16i16(<16 x i16> %va, <16 x i16> %b, <16 x i1> %m, i32 %evl)
+ ret <16 x i16> %v
+}
+
+define <16 x i16> @vssub_vx_v16i16(<16 x i16> %va, i16 %b, <16 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vssub_vx_v16i16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, ma
+; CHECK-NEXT: vssub.vx v8, v8, a0, v0.t
+; CHECK-NEXT: ret
+ %elt.head = insertelement <16 x i16> poison, i16 %b, i32 0
+ %vb = shufflevector <16 x i16> %elt.head, <16 x i16> poison, <16 x i32> zeroinitializer
+ %v = call <16 x i16> @llvm.vp.ssub.sat.v16i16(<16 x i16> %va, <16 x i16> %vb, <16 x i1> %m, i32 %evl)
+ ret <16 x i16> %v
+}
+
+define <16 x i16> @vssub_vx_v16i16_unmasked(<16 x i16> %va, i16 %b, i32 zeroext %evl) {
+; CHECK-LABEL: vssub_vx_v16i16_unmasked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, ma
+; CHECK-NEXT: vssub.vx v8, v8, a0
+; CHECK-NEXT: ret
+ %elt.head = insertelement <16 x i16> poison, i16 %b, i32 0
+ %vb = shufflevector <16 x i16> %elt.head, <16 x i16> poison, <16 x i32> zeroinitializer
+ %head = insertelement <16 x i1> poison, i1 true, i32 0
+ %m = shufflevector <16 x i1> %head, <16 x i1> poison, <16 x i32> zeroinitializer
+ %v = call <16 x i16> @llvm.vp.ssub.sat.v16i16(<16 x i16> %va, <16 x i16> %vb, <16 x i1> %m, i32 %evl)
+ ret <16 x i16> %v
+}
+
+define <16 x i16> @vssub_vi_v16i16(<16 x i16> %va, <16 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vssub_vi_v16i16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: li a1, -1
+; CHECK-NEXT: vsetvli zero, a0, e16, m2, ta, ma
+; CHECK-NEXT: vssub.vx v8, v8, a1, v0.t
+; CHECK-NEXT: ret
+ %elt.head = insertelement <16 x i16> poison, i16 -1, i32 0
+ %vb = shufflevector <16 x i16> %elt.head, <16 x i16> poison, <16 x i32> zeroinitializer
+ %v = call <16 x i16> @llvm.vp.ssub.sat.v16i16(<16 x i16> %va, <16 x i16> %vb, <16 x i1> %m, i32 %evl)
+ ret <16 x i16> %v
+}
+
+define <16 x i16> @vssub_vi_v16i16_unmasked(<16 x i16> %va, i32 zeroext %evl) {
+; CHECK-LABEL: vssub_vi_v16i16_unmasked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: li a1, -1
+; CHECK-NEXT: vsetvli zero, a0, e16, m2, ta, ma
+; CHECK-NEXT: vssub.vx v8, v8, a1
+; CHECK-NEXT: ret
+ %elt.head = insertelement <16 x i16> poison, i16 -1, i32 0
+ %vb = shufflevector <16 x i16> %elt.head, <16 x i16> poison, <16 x i32> zeroinitializer
+ %head = insertelement <16 x i1> poison, i1 true, i32 0
+ %m = shufflevector <16 x i1> %head, <16 x i1> poison, <16 x i32> zeroinitializer
+ %v = call <16 x i16> @llvm.vp.ssub.sat.v16i16(<16 x i16> %va, <16 x i16> %vb, <16 x i1> %m, i32 %evl)
+ ret <16 x i16> %v
+}
+
+declare <2 x i32> @llvm.vp.ssub.sat.v2i32(<2 x i32>, <2 x i32>, <2 x i1>, i32)
+
+define <2 x i32> @vssub_vv_v2i32(<2 x i32> %va, <2 x i32> %b, <2 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vssub_vv_v2i32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, ma
+; CHECK-NEXT: vssub.vv v8, v8, v9, v0.t
+; CHECK-NEXT: ret
+ %v = call <2 x i32> @llvm.vp.ssub.sat.v2i32(<2 x i32> %va, <2 x i32> %b, <2 x i1> %m, i32 %evl)
+ ret <2 x i32> %v
+}
+
+define <2 x i32> @vssub_vv_v2i32_unmasked(<2 x i32> %va, <2 x i32> %b, i32 zeroext %evl) {
+; CHECK-LABEL: vssub_vv_v2i32_unmasked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, ma
+; CHECK-NEXT: vssub.vv v8, v8, v9
+; CHECK-NEXT: ret
+ %head = insertelement <2 x i1> poison, i1 true, i32 0
+ %m = shufflevector <2 x i1> %head, <2 x i1> poison, <2 x i32> zeroinitializer
+ %v = call <2 x i32> @llvm.vp.ssub.sat.v2i32(<2 x i32> %va, <2 x i32> %b, <2 x i1> %m, i32 %evl)
+ ret <2 x i32> %v
+}
+
+define <2 x i32> @vssub_vx_v2i32(<2 x i32> %va, i32 %b, <2 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vssub_vx_v2i32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma
+; CHECK-NEXT: vssub.vx v8, v8, a0, v0.t
+; CHECK-NEXT: ret
+ %elt.head = insertelement <2 x i32> poison, i32 %b, i32 0
+ %vb = shufflevector <2 x i32> %elt.head, <2 x i32> poison, <2 x i32> zeroinitializer
+ %v = call <2 x i32> @llvm.vp.ssub.sat.v2i32(<2 x i32> %va, <2 x i32> %vb, <2 x i1> %m, i32 %evl)
+ ret <2 x i32> %v
+}
+
+define <2 x i32> @vssub_vx_v2i32_unmasked(<2 x i32> %va, i32 %b, i32 zeroext %evl) {
+; CHECK-LABEL: vssub_vx_v2i32_unmasked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma
+; CHECK-NEXT: vssub.vx v8, v8, a0
+; CHECK-NEXT: ret
+ %elt.head = insertelement <2 x i32> poison, i32 %b, i32 0
+ %vb = shufflevector <2 x i32> %elt.head, <2 x i32> poison, <2 x i32> zeroinitializer
+ %head = insertelement <2 x i1> poison, i1 true, i32 0
+ %m = shufflevector <2 x i1> %head, <2 x i1> poison, <2 x i32> zeroinitializer
+ %v = call <2 x i32> @llvm.vp.ssub.sat.v2i32(<2 x i32> %va, <2 x i32> %vb, <2 x i1> %m, i32 %evl)
+ ret <2 x i32> %v
+}
+
+define <2 x i32> @vssub_vi_v2i32(<2 x i32> %va, <2 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vssub_vi_v2i32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: li a1, -1
+; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, ma
+; CHECK-NEXT: vssub.vx v8, v8, a1, v0.t
+; CHECK-NEXT: ret
+ %elt.head = insertelement <2 x i32> poison, i32 -1, i32 0
+ %vb = shufflevector <2 x i32> %elt.head, <2 x i32> poison, <2 x i32> zeroinitializer
+ %v = call <2 x i32> @llvm.vp.ssub.sat.v2i32(<2 x i32> %va, <2 x i32> %vb, <2 x i1> %m, i32 %evl)
+ ret <2 x i32> %v
+}
+
+define <2 x i32> @vssub_vi_v2i32_unmasked(<2 x i32> %va, i32 zeroext %evl) {
+; CHECK-LABEL: vssub_vi_v2i32_unmasked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: li a1, -1
+; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, ma
+; CHECK-NEXT: vssub.vx v8, v8, a1
+; CHECK-NEXT: ret
+ %elt.head = insertelement <2 x i32> poison, i32 -1, i32 0
+ %vb = shufflevector <2 x i32> %elt.head, <2 x i32> poison, <2 x i32> zeroinitializer
+ %head = insertelement <2 x i1> poison, i1 true, i32 0
+ %m = shufflevector <2 x i1> %head, <2 x i1> poison, <2 x i32> zeroinitializer
+ %v = call <2 x i32> @llvm.vp.ssub.sat.v2i32(<2 x i32> %va, <2 x i32> %vb, <2 x i1> %m, i32 %evl)
+ ret <2 x i32> %v
+}
+
+declare <4 x i32> @llvm.vp.ssub.sat.v4i32(<4 x i32>, <4 x i32>, <4 x i1>, i32)
+
+define <4 x i32> @vssub_vv_v4i32(<4 x i32> %va, <4 x i32> %b, <4 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vssub_vv_v4i32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, ma
+; CHECK-NEXT: vssub.vv v8, v8, v9, v0.t
+; CHECK-NEXT: ret
+ %v = call <4 x i32> @llvm.vp.ssub.sat.v4i32(<4 x i32> %va, <4 x i32> %b, <4 x i1> %m, i32 %evl)
+ ret <4 x i32> %v
+}
+
+define <4 x i32> @vssub_vv_v4i32_unmasked(<4 x i32> %va, <4 x i32> %b, i32 zeroext %evl) {
+; CHECK-LABEL: vssub_vv_v4i32_unmasked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, ma
+; CHECK-NEXT: vssub.vv v8, v8, v9
+; CHECK-NEXT: ret
+ %head = insertelement <4 x i1> poison, i1 true, i32 0
+ %m = shufflevector <4 x i1> %head, <4 x i1> poison, <4 x i32> zeroinitializer
+ %v = call <4 x i32> @llvm.vp.ssub.sat.v4i32(<4 x i32> %va, <4 x i32> %b, <4 x i1> %m, i32 %evl)
+ ret <4 x i32> %v
+}
+
+define <4 x i32> @vssub_vx_v4i32(<4 x i32> %va, i32 %b, <4 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vssub_vx_v4i32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma
+; CHECK-NEXT: vssub.vx v8, v8, a0, v0.t
+; CHECK-NEXT: ret
+ %elt.head = insertelement <4 x i32> poison, i32 %b, i32 0
+ %vb = shufflevector <4 x i32> %elt.head, <4 x i32> poison, <4 x i32> zeroinitializer
+ %v = call <4 x i32> @llvm.vp.ssub.sat.v4i32(<4 x i32> %va, <4 x i32> %vb, <4 x i1> %m, i32 %evl)
+ ret <4 x i32> %v
+}
+
+define <4 x i32> @vssub_vx_v4i32_unmasked(<4 x i32> %va, i32 %b, i32 zeroext %evl) {
+; CHECK-LABEL: vssub_vx_v4i32_unmasked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma
+; CHECK-NEXT: vssub.vx v8, v8, a0
+; CHECK-NEXT: ret
+ %elt.head = insertelement <4 x i32> poison, i32 %b, i32 0
+ %vb = shufflevector <4 x i32> %elt.head, <4 x i32> poison, <4 x i32> zeroinitializer
+ %head = insertelement <4 x i1> poison, i1 true, i32 0
+ %m = shufflevector <4 x i1> %head, <4 x i1> poison, <4 x i32> zeroinitializer
+ %v = call <4 x i32> @llvm.vp.ssub.sat.v4i32(<4 x i32> %va, <4 x i32> %vb, <4 x i1> %m, i32 %evl)
+ ret <4 x i32> %v
+}
+
+define <4 x i32> @vssub_vi_v4i32(<4 x i32> %va, <4 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vssub_vi_v4i32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: li a1, -1
+; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, ma
+; CHECK-NEXT: vssub.vx v8, v8, a1, v0.t
+; CHECK-NEXT: ret
+ %elt.head = insertelement <4 x i32> poison, i32 -1, i32 0
+ %vb = shufflevector <4 x i32> %elt.head, <4 x i32> poison, <4 x i32> zeroinitializer
+ %v = call <4 x i32> @llvm.vp.ssub.sat.v4i32(<4 x i32> %va, <4 x i32> %vb, <4 x i1> %m, i32 %evl)
+ ret <4 x i32> %v
+}
+
+define <4 x i32> @vssub_vi_v4i32_unmasked(<4 x i32> %va, i32 zeroext %evl) {
+; CHECK-LABEL: vssub_vi_v4i32_unmasked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: li a1, -1
+; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, ma
+; CHECK-NEXT: vssub.vx v8, v8, a1
+; CHECK-NEXT: ret
+ %elt.head = insertelement <4 x i32> poison, i32 -1, i32 0
+ %vb = shufflevector <4 x i32> %elt.head, <4 x i32> poison, <4 x i32> zeroinitializer
+ %head = insertelement <4 x i1> poison, i1 true, i32 0
+ %m = shufflevector <4 x i1> %head, <4 x i1> poison, <4 x i32> zeroinitializer
+ %v = call <4 x i32> @llvm.vp.ssub.sat.v4i32(<4 x i32> %va, <4 x i32> %vb, <4 x i1> %m, i32 %evl)
+ ret <4 x i32> %v
+}
+
+declare <8 x i32> @llvm.vp.ssub.sat.v8i32(<8 x i32>, <8 x i32>, <8 x i1>, i32)
+
+define <8 x i32> @vssub_vv_v8i32(<8 x i32> %va, <8 x i32> %b, <8 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vssub_vv_v8i32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, ma
+; CHECK-NEXT: vssub.vv v8, v8, v10, v0.t
+; CHECK-NEXT: ret
+ %v = call <8 x i32> @llvm.vp.ssub.sat.v8i32(<8 x i32> %va, <8 x i32> %b, <8 x i1> %m, i32 %evl)
+ ret <8 x i32> %v
+}
+
+define <8 x i32> @vssub_vv_v8i32_unmasked(<8 x i32> %va, <8 x i32> %b, i32 zeroext %evl) {
+; CHECK-LABEL: vssub_vv_v8i32_unmasked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, ma
+; CHECK-NEXT: vssub.vv v8, v8, v10
+; CHECK-NEXT: ret
+ %head = insertelement <8 x i1> poison, i1 true, i32 0
+ %m = shufflevector <8 x i1> %head, <8 x i1> poison, <8 x i32> zeroinitializer
+ %v = call <8 x i32> @llvm.vp.ssub.sat.v8i32(<8 x i32> %va, <8 x i32> %b, <8 x i1> %m, i32 %evl)
+ ret <8 x i32> %v
+}
+
+define <8 x i32> @vssub_vx_v8i32(<8 x i32> %va, i32 %b, <8 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vssub_vx_v8i32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, ma
+; CHECK-NEXT: vssub.vx v8, v8, a0, v0.t
+; CHECK-NEXT: ret
+ %elt.head = insertelement <8 x i32> poison, i32 %b, i32 0
+ %vb = shufflevector <8 x i32> %elt.head, <8 x i32> poison, <8 x i32> zeroinitializer
+ %v = call <8 x i32> @llvm.vp.ssub.sat.v8i32(<8 x i32> %va, <8 x i32> %vb, <8 x i1> %m, i32 %evl)
+ ret <8 x i32> %v
+}
+
+define <8 x i32> @vssub_vx_v8i32_unmasked(<8 x i32> %va, i32 %b, i32 zeroext %evl) {
+; CHECK-LABEL: vssub_vx_v8i32_unmasked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, ma
+; CHECK-NEXT: vssub.vx v8, v8, a0
+; CHECK-NEXT: ret
+ %elt.head = insertelement <8 x i32> poison, i32 %b, i32 0
+ %vb = shufflevector <8 x i32> %elt.head, <8 x i32> poison, <8 x i32> zeroinitializer
+ %head = insertelement <8 x i1> poison, i1 true, i32 0
+ %m = shufflevector <8 x i1> %head, <8 x i1> poison, <8 x i32> zeroinitializer
+ %v = call <8 x i32> @llvm.vp.ssub.sat.v8i32(<8 x i32> %va, <8 x i32> %vb, <8 x i1> %m, i32 %evl)
+ ret <8 x i32> %v
+}
+
+define <8 x i32> @vssub_vi_v8i32(<8 x i32> %va, <8 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vssub_vi_v8i32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: li a1, -1
+; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, ma
+; CHECK-NEXT: vssub.vx v8, v8, a1, v0.t
+; CHECK-NEXT: ret
+ %elt.head = insertelement <8 x i32> poison, i32 -1, i32 0
+ %vb = shufflevector <8 x i32> %elt.head, <8 x i32> poison, <8 x i32> zeroinitializer
+ %v = call <8 x i32> @llvm.vp.ssub.sat.v8i32(<8 x i32> %va, <8 x i32> %vb, <8 x i1> %m, i32 %evl)
+ ret <8 x i32> %v
+}
+
+define <8 x i32> @vssub_vi_v8i32_unmasked(<8 x i32> %va, i32 zeroext %evl) {
+; CHECK-LABEL: vssub_vi_v8i32_unmasked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: li a1, -1
+; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, ma
+; CHECK-NEXT: vssub.vx v8, v8, a1
+; CHECK-NEXT: ret
+ %elt.head = insertelement <8 x i32> poison, i32 -1, i32 0
+ %vb = shufflevector <8 x i32> %elt.head, <8 x i32> poison, <8 x i32> zeroinitializer
+ %head = insertelement <8 x i1> poison, i1 true, i32 0
+ %m = shufflevector <8 x i1> %head, <8 x i1> poison, <8 x i32> zeroinitializer
+ %v = call <8 x i32> @llvm.vp.ssub.sat.v8i32(<8 x i32> %va, <8 x i32> %vb, <8 x i1> %m, i32 %evl)
+ ret <8 x i32> %v
+}
+
+declare <16 x i32> @llvm.vp.ssub.sat.v16i32(<16 x i32>, <16 x i32>, <16 x i1>, i32)
+
+define <16 x i32> @vssub_vv_v16i32(<16 x i32> %va, <16 x i32> %b, <16 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vssub_vv_v16i32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, ma
+; CHECK-NEXT: vssub.vv v8, v8, v12, v0.t
+; CHECK-NEXT: ret
+ %v = call <16 x i32> @llvm.vp.ssub.sat.v16i32(<16 x i32> %va, <16 x i32> %b, <16 x i1> %m, i32 %evl)
+ ret <16 x i32> %v
+}
+
+define <16 x i32> @vssub_vv_v16i32_unmasked(<16 x i32> %va, <16 x i32> %b, i32 zeroext %evl) {
+; CHECK-LABEL: vssub_vv_v16i32_unmasked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, ma
+; CHECK-NEXT: vssub.vv v8, v8, v12
+; CHECK-NEXT: ret
+ %head = insertelement <16 x i1> poison, i1 true, i32 0
+ %m = shufflevector <16 x i1> %head, <16 x i1> poison, <16 x i32> zeroinitializer
+ %v = call <16 x i32> @llvm.vp.ssub.sat.v16i32(<16 x i32> %va, <16 x i32> %b, <16 x i1> %m, i32 %evl)
+ ret <16 x i32> %v
+}
+
+define <16 x i32> @vssub_vx_v16i32(<16 x i32> %va, i32 %b, <16 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vssub_vx_v16i32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e32, m4, ta, ma
+; CHECK-NEXT: vssub.vx v8, v8, a0, v0.t
+; CHECK-NEXT: ret
+ %elt.head = insertelement <16 x i32> poison, i32 %b, i32 0
+ %vb = shufflevector <16 x i32> %elt.head, <16 x i32> poison, <16 x i32> zeroinitializer
+ %v = call <16 x i32> @llvm.vp.ssub.sat.v16i32(<16 x i32> %va, <16 x i32> %vb, <16 x i1> %m, i32 %evl)
+ ret <16 x i32> %v
+}
+
+define <16 x i32> @vssub_vx_v16i32_unmasked(<16 x i32> %va, i32 %b, i32 zeroext %evl) {
+; CHECK-LABEL: vssub_vx_v16i32_unmasked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e32, m4, ta, ma
+; CHECK-NEXT: vssub.vx v8, v8, a0
+; CHECK-NEXT: ret
+ %elt.head = insertelement <16 x i32> poison, i32 %b, i32 0
+ %vb = shufflevector <16 x i32> %elt.head, <16 x i32> poison, <16 x i32> zeroinitializer
+ %head = insertelement <16 x i1> poison, i1 true, i32 0
+ %m = shufflevector <16 x i1> %head, <16 x i1> poison, <16 x i32> zeroinitializer
+ %v = call <16 x i32> @llvm.vp.ssub.sat.v16i32(<16 x i32> %va, <16 x i32> %vb, <16 x i1> %m, i32 %evl)
+ ret <16 x i32> %v
+}
+
+define <16 x i32> @vssub_vi_v16i32(<16 x i32> %va, <16 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vssub_vi_v16i32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: li a1, -1
+; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, ma
+; CHECK-NEXT: vssub.vx v8, v8, a1, v0.t
+; CHECK-NEXT: ret
+ %elt.head = insertelement <16 x i32> poison, i32 -1, i32 0
+ %vb = shufflevector <16 x i32> %elt.head, <16 x i32> poison, <16 x i32> zeroinitializer
+ %v = call <16 x i32> @llvm.vp.ssub.sat.v16i32(<16 x i32> %va, <16 x i32> %vb, <16 x i1> %m, i32 %evl)
+ ret <16 x i32> %v
+}
+
+define <16 x i32> @vssub_vi_v16i32_unmasked(<16 x i32> %va, i32 zeroext %evl) {
+; CHECK-LABEL: vssub_vi_v16i32_unmasked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: li a1, -1
+; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, ma
+; CHECK-NEXT: vssub.vx v8, v8, a1
+; CHECK-NEXT: ret
+ %elt.head = insertelement <16 x i32> poison, i32 -1, i32 0
+ %vb = shufflevector <16 x i32> %elt.head, <16 x i32> poison, <16 x i32> zeroinitializer
+ %head = insertelement <16 x i1> poison, i1 true, i32 0
+ %m = shufflevector <16 x i1> %head, <16 x i1> poison, <16 x i32> zeroinitializer
+ %v = call <16 x i32> @llvm.vp.ssub.sat.v16i32(<16 x i32> %va, <16 x i32> %vb, <16 x i1> %m, i32 %evl)
+ ret <16 x i32> %v
+}
+
+declare <2 x i64> @llvm.vp.ssub.sat.v2i64(<2 x i64>, <2 x i64>, <2 x i1>, i32)
+
+define <2 x i64> @vssub_vv_v2i64(<2 x i64> %va, <2 x i64> %b, <2 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vssub_vv_v2i64:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, ma
+; CHECK-NEXT: vssub.vv v8, v8, v9, v0.t
+; CHECK-NEXT: ret
+ %v = call <2 x i64> @llvm.vp.ssub.sat.v2i64(<2 x i64> %va, <2 x i64> %b, <2 x i1> %m, i32 %evl)
+ ret <2 x i64> %v
+}
+
+define <2 x i64> @vssub_vv_v2i64_unmasked(<2 x i64> %va, <2 x i64> %b, i32 zeroext %evl) {
+; CHECK-LABEL: vssub_vv_v2i64_unmasked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, ma
+; CHECK-NEXT: vssub.vv v8, v8, v9
+; CHECK-NEXT: ret
+ %head = insertelement <2 x i1> poison, i1 true, i32 0
+ %m = shufflevector <2 x i1> %head, <2 x i1> poison, <2 x i32> zeroinitializer
+ %v = call <2 x i64> @llvm.vp.ssub.sat.v2i64(<2 x i64> %va, <2 x i64> %b, <2 x i1> %m, i32 %evl)
+ ret <2 x i64> %v
+}
+
+define <2 x i64> @vssub_vx_v2i64(<2 x i64> %va, i64 %b, <2 x i1> %m, i32 zeroext %evl) {
+; RV32-LABEL: vssub_vx_v2i64:
+; RV32: # %bb.0:
+; RV32-NEXT: addi sp, sp, -16
+; RV32-NEXT: .cfi_def_cfa_offset 16
+; RV32-NEXT: sw a1, 12(sp)
+; RV32-NEXT: sw a0, 8(sp)
+; RV32-NEXT: addi a0, sp, 8
+; RV32-NEXT: vsetivli zero, 2, e64, m1, ta, ma
+; RV32-NEXT: vlse64.v v9, (a0), zero
+; RV32-NEXT: vsetvli zero, a2, e64, m1, ta, ma
+; RV32-NEXT: vssub.vv v8, v8, v9, v0.t
+; RV32-NEXT: addi sp, sp, 16
+; RV32-NEXT: ret
+;
+; RV64-LABEL: vssub_vx_v2i64:
+; RV64: # %bb.0:
+; RV64-NEXT: vsetvli zero, a1, e64, m1, ta, ma
+; RV64-NEXT: vssub.vx v8, v8, a0, v0.t
+; RV64-NEXT: ret
+ %elt.head = insertelement <2 x i64> poison, i64 %b, i32 0
+ %vb = shufflevector <2 x i64> %elt.head, <2 x i64> poison, <2 x i32> zeroinitializer
+ %v = call <2 x i64> @llvm.vp.ssub.sat.v2i64(<2 x i64> %va, <2 x i64> %vb, <2 x i1> %m, i32 %evl)
+ ret <2 x i64> %v
+}
+
+define <2 x i64> @vssub_vx_v2i64_unmasked(<2 x i64> %va, i64 %b, i32 zeroext %evl) {
+; RV32-LABEL: vssub_vx_v2i64_unmasked:
+; RV32: # %bb.0:
+; RV32-NEXT: addi sp, sp, -16
+; RV32-NEXT: .cfi_def_cfa_offset 16
+; RV32-NEXT: sw a1, 12(sp)
+; RV32-NEXT: sw a0, 8(sp)
+; RV32-NEXT: addi a0, sp, 8
+; RV32-NEXT: vsetivli zero, 2, e64, m1, ta, ma
+; RV32-NEXT: vlse64.v v9, (a0), zero
+; RV32-NEXT: vsetvli zero, a2, e64, m1, ta, ma
+; RV32-NEXT: vssub.vv v8, v8, v9
+; RV32-NEXT: addi sp, sp, 16
+; RV32-NEXT: ret
+;
+; RV64-LABEL: vssub_vx_v2i64_unmasked:
+; RV64: # %bb.0:
+; RV64-NEXT: vsetvli zero, a1, e64, m1, ta, ma
+; RV64-NEXT: vssub.vx v8, v8, a0
+; RV64-NEXT: ret
+ %elt.head = insertelement <2 x i64> poison, i64 %b, i32 0
+ %vb = shufflevector <2 x i64> %elt.head, <2 x i64> poison, <2 x i32> zeroinitializer
+ %head = insertelement <2 x i1> poison, i1 true, i32 0
+ %m = shufflevector <2 x i1> %head, <2 x i1> poison, <2 x i32> zeroinitializer
+ %v = call <2 x i64> @llvm.vp.ssub.sat.v2i64(<2 x i64> %va, <2 x i64> %vb, <2 x i1> %m, i32 %evl)
+ ret <2 x i64> %v
+}
+
+define <2 x i64> @vssub_vi_v2i64(<2 x i64> %va, <2 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vssub_vi_v2i64:
+; CHECK: # %bb.0:
+; CHECK-NEXT: li a1, -1
+; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, ma
+; CHECK-NEXT: vssub.vx v8, v8, a1, v0.t
+; CHECK-NEXT: ret
+ %elt.head = insertelement <2 x i64> poison, i64 -1, i32 0
+ %vb = shufflevector <2 x i64> %elt.head, <2 x i64> poison, <2 x i32> zeroinitializer
+ %v = call <2 x i64> @llvm.vp.ssub.sat.v2i64(<2 x i64> %va, <2 x i64> %vb, <2 x i1> %m, i32 %evl)
+ ret <2 x i64> %v
+}
+
+define <2 x i64> @vssub_vi_v2i64_unmasked(<2 x i64> %va, i32 zeroext %evl) {
+; CHECK-LABEL: vssub_vi_v2i64_unmasked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: li a1, -1
+; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, ma
+; CHECK-NEXT: vssub.vx v8, v8, a1
+; CHECK-NEXT: ret
+ %elt.head = insertelement <2 x i64> poison, i64 -1, i32 0
+ %vb = shufflevector <2 x i64> %elt.head, <2 x i64> poison, <2 x i32> zeroinitializer
+ %head = insertelement <2 x i1> poison, i1 true, i32 0
+ %m = shufflevector <2 x i1> %head, <2 x i1> poison, <2 x i32> zeroinitializer
+ %v = call <2 x i64> @llvm.vp.ssub.sat.v2i64(<2 x i64> %va, <2 x i64> %vb, <2 x i1> %m, i32 %evl)
+ ret <2 x i64> %v
+}
+
+declare <4 x i64> @llvm.vp.ssub.sat.v4i64(<4 x i64>, <4 x i64>, <4 x i1>, i32)
+
+define <4 x i64> @vssub_vv_v4i64(<4 x i64> %va, <4 x i64> %b, <4 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vssub_vv_v4i64:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e64, m2, ta, ma
+; CHECK-NEXT: vssub.vv v8, v8, v10, v0.t
+; CHECK-NEXT: ret
+ %v = call <4 x i64> @llvm.vp.ssub.sat.v4i64(<4 x i64> %va, <4 x i64> %b, <4 x i1> %m, i32 %evl)
+ ret <4 x i64> %v
+}
+
+define <4 x i64> @vssub_vv_v4i64_unmasked(<4 x i64> %va, <4 x i64> %b, i32 zeroext %evl) {
+; CHECK-LABEL: vssub_vv_v4i64_unmasked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e64, m2, ta, ma
+; CHECK-NEXT: vssub.vv v8, v8, v10
+; CHECK-NEXT: ret
+ %head = insertelement <4 x i1> poison, i1 true, i32 0
+ %m = shufflevector <4 x i1> %head, <4 x i1> poison, <4 x i32> zeroinitializer
+ %v = call <4 x i64> @llvm.vp.ssub.sat.v4i64(<4 x i64> %va, <4 x i64> %b, <4 x i1> %m, i32 %evl)
+ ret <4 x i64> %v
+}
+
+define <4 x i64> @vssub_vx_v4i64(<4 x i64> %va, i64 %b, <4 x i1> %m, i32 zeroext %evl) {
+; RV32-LABEL: vssub_vx_v4i64:
+; RV32: # %bb.0:
+; RV32-NEXT: addi sp, sp, -16
+; RV32-NEXT: .cfi_def_cfa_offset 16
+; RV32-NEXT: sw a1, 12(sp)
+; RV32-NEXT: sw a0, 8(sp)
+; RV32-NEXT: addi a0, sp, 8
+; RV32-NEXT: vsetivli zero, 4, e64, m2, ta, ma
+; RV32-NEXT: vlse64.v v10, (a0), zero
+; RV32-NEXT: vsetvli zero, a2, e64, m2, ta, ma
+; RV32-NEXT: vssub.vv v8, v8, v10, v0.t
+; RV32-NEXT: addi sp, sp, 16
+; RV32-NEXT: ret
+;
+; RV64-LABEL: vssub_vx_v4i64:
+; RV64: # %bb.0:
+; RV64-NEXT: vsetvli zero, a1, e64, m2, ta, ma
+; RV64-NEXT: vssub.vx v8, v8, a0, v0.t
+; RV64-NEXT: ret
+ %elt.head = insertelement <4 x i64> poison, i64 %b, i32 0
+ %vb = shufflevector <4 x i64> %elt.head, <4 x i64> poison, <4 x i32> zeroinitializer
+ %v = call <4 x i64> @llvm.vp.ssub.sat.v4i64(<4 x i64> %va, <4 x i64> %vb, <4 x i1> %m, i32 %evl)
+ ret <4 x i64> %v
+}
+
+define <4 x i64> @vssub_vx_v4i64_unmasked(<4 x i64> %va, i64 %b, i32 zeroext %evl) {
+; RV32-LABEL: vssub_vx_v4i64_unmasked:
+; RV32: # %bb.0:
+; RV32-NEXT: addi sp, sp, -16
+; RV32-NEXT: .cfi_def_cfa_offset 16
+; RV32-NEXT: sw a1, 12(sp)
+; RV32-NEXT: sw a0, 8(sp)
+; RV32-NEXT: addi a0, sp, 8
+; RV32-NEXT: vsetivli zero, 4, e64, m2, ta, ma
+; RV32-NEXT: vlse64.v v10, (a0), zero
+; RV32-NEXT: vsetvli zero, a2, e64, m2, ta, ma
+; RV32-NEXT: vssub.vv v8, v8, v10
+; RV32-NEXT: addi sp, sp, 16
+; RV32-NEXT: ret
+;
+; RV64-LABEL: vssub_vx_v4i64_unmasked:
+; RV64: # %bb.0:
+; RV64-NEXT: vsetvli zero, a1, e64, m2, ta, ma
+; RV64-NEXT: vssub.vx v8, v8, a0
+; RV64-NEXT: ret
+ %elt.head = insertelement <4 x i64> poison, i64 %b, i32 0
+ %vb = shufflevector <4 x i64> %elt.head, <4 x i64> poison, <4 x i32> zeroinitializer
+ %head = insertelement <4 x i1> poison, i1 true, i32 0
+ %m = shufflevector <4 x i1> %head, <4 x i1> poison, <4 x i32> zeroinitializer
+ %v = call <4 x i64> @llvm.vp.ssub.sat.v4i64(<4 x i64> %va, <4 x i64> %vb, <4 x i1> %m, i32 %evl)
+ ret <4 x i64> %v
+}
+
+define <4 x i64> @vssub_vi_v4i64(<4 x i64> %va, <4 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vssub_vi_v4i64:
+; CHECK: # %bb.0:
+; CHECK-NEXT: li a1, -1
+; CHECK-NEXT: vsetvli zero, a0, e64, m2, ta, ma
+; CHECK-NEXT: vssub.vx v8, v8, a1, v0.t
+; CHECK-NEXT: ret
+ %elt.head = insertelement <4 x i64> poison, i64 -1, i32 0
+ %vb = shufflevector <4 x i64> %elt.head, <4 x i64> poison, <4 x i32> zeroinitializer
+ %v = call <4 x i64> @llvm.vp.ssub.sat.v4i64(<4 x i64> %va, <4 x i64> %vb, <4 x i1> %m, i32 %evl)
+ ret <4 x i64> %v
+}
+
+define <4 x i64> @vssub_vi_v4i64_unmasked(<4 x i64> %va, i32 zeroext %evl) {
+; CHECK-LABEL: vssub_vi_v4i64_unmasked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: li a1, -1
+; CHECK-NEXT: vsetvli zero, a0, e64, m2, ta, ma
+; CHECK-NEXT: vssub.vx v8, v8, a1
+; CHECK-NEXT: ret
+ %elt.head = insertelement <4 x i64> poison, i64 -1, i32 0
+ %vb = shufflevector <4 x i64> %elt.head, <4 x i64> poison, <4 x i32> zeroinitializer
+ %head = insertelement <4 x i1> poison, i1 true, i32 0
+ %m = shufflevector <4 x i1> %head, <4 x i1> poison, <4 x i32> zeroinitializer
+ %v = call <4 x i64> @llvm.vp.ssub.sat.v4i64(<4 x i64> %va, <4 x i64> %vb, <4 x i1> %m, i32 %evl)
+ ret <4 x i64> %v
+}
+
+declare <8 x i64> @llvm.vp.ssub.sat.v8i64(<8 x i64>, <8 x i64>, <8 x i1>, i32)
+
+define <8 x i64> @vssub_vv_v8i64(<8 x i64> %va, <8 x i64> %b, <8 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vssub_vv_v8i64:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e64, m4, ta, ma
+; CHECK-NEXT: vssub.vv v8, v8, v12, v0.t
+; CHECK-NEXT: ret
+ %v = call <8 x i64> @llvm.vp.ssub.sat.v8i64(<8 x i64> %va, <8 x i64> %b, <8 x i1> %m, i32 %evl)
+ ret <8 x i64> %v
+}
+
+define <8 x i64> @vssub_vv_v8i64_unmasked(<8 x i64> %va, <8 x i64> %b, i32 zeroext %evl) {
+; CHECK-LABEL: vssub_vv_v8i64_unmasked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e64, m4, ta, ma
+; CHECK-NEXT: vssub.vv v8, v8, v12
+; CHECK-NEXT: ret
+ %head = insertelement <8 x i1> poison, i1 true, i32 0
+ %m = shufflevector <8 x i1> %head, <8 x i1> poison, <8 x i32> zeroinitializer
+ %v = call <8 x i64> @llvm.vp.ssub.sat.v8i64(<8 x i64> %va, <8 x i64> %b, <8 x i1> %m, i32 %evl)
+ ret <8 x i64> %v
+}
+
+define <8 x i64> @vssub_vx_v8i64(<8 x i64> %va, i64 %b, <8 x i1> %m, i32 zeroext %evl) {
+; RV32-LABEL: vssub_vx_v8i64:
+; RV32: # %bb.0:
+; RV32-NEXT: addi sp, sp, -16
+; RV32-NEXT: .cfi_def_cfa_offset 16
+; RV32-NEXT: sw a1, 12(sp)
+; RV32-NEXT: sw a0, 8(sp)
+; RV32-NEXT: addi a0, sp, 8
+; RV32-NEXT: vsetivli zero, 8, e64, m4, ta, ma
+; RV32-NEXT: vlse64.v v12, (a0), zero
+; RV32-NEXT: vsetvli zero, a2, e64, m4, ta, ma
+; RV32-NEXT: vssub.vv v8, v8, v12, v0.t
+; RV32-NEXT: addi sp, sp, 16
+; RV32-NEXT: ret
+;
+; RV64-LABEL: vssub_vx_v8i64:
+; RV64: # %bb.0:
+; RV64-NEXT: vsetvli zero, a1, e64, m4, ta, ma
+; RV64-NEXT: vssub.vx v8, v8, a0, v0.t
+; RV64-NEXT: ret
+ %elt.head = insertelement <8 x i64> poison, i64 %b, i32 0
+ %vb = shufflevector <8 x i64> %elt.head, <8 x i64> poison, <8 x i32> zeroinitializer
+ %v = call <8 x i64> @llvm.vp.ssub.sat.v8i64(<8 x i64> %va, <8 x i64> %vb, <8 x i1> %m, i32 %evl)
+ ret <8 x i64> %v
+}
+
+define <8 x i64> @vssub_vx_v8i64_unmasked(<8 x i64> %va, i64 %b, i32 zeroext %evl) {
+; RV32-LABEL: vssub_vx_v8i64_unmasked:
+; RV32: # %bb.0:
+; RV32-NEXT: addi sp, sp, -16
+; RV32-NEXT: .cfi_def_cfa_offset 16
+; RV32-NEXT: sw a1, 12(sp)
+; RV32-NEXT: sw a0, 8(sp)
+; RV32-NEXT: addi a0, sp, 8
+; RV32-NEXT: vsetivli zero, 8, e64, m4, ta, ma
+; RV32-NEXT: vlse64.v v12, (a0), zero
+; RV32-NEXT: vsetvli zero, a2, e64, m4, ta, ma
+; RV32-NEXT: vssub.vv v8, v8, v12
+; RV32-NEXT: addi sp, sp, 16
+; RV32-NEXT: ret
+;
+; RV64-LABEL: vssub_vx_v8i64_unmasked:
+; RV64: # %bb.0:
+; RV64-NEXT: vsetvli zero, a1, e64, m4, ta, ma
+; RV64-NEXT: vssub.vx v8, v8, a0
+; RV64-NEXT: ret
+ %elt.head = insertelement <8 x i64> poison, i64 %b, i32 0
+ %vb = shufflevector <8 x i64> %elt.head, <8 x i64> poison, <8 x i32> zeroinitializer
+ %head = insertelement <8 x i1> poison, i1 true, i32 0
+ %m = shufflevector <8 x i1> %head, <8 x i1> poison, <8 x i32> zeroinitializer
+ %v = call <8 x i64> @llvm.vp.ssub.sat.v8i64(<8 x i64> %va, <8 x i64> %vb, <8 x i1> %m, i32 %evl)
+ ret <8 x i64> %v
+}
+
+define <8 x i64> @vssub_vi_v8i64(<8 x i64> %va, <8 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vssub_vi_v8i64:
+; CHECK: # %bb.0:
+; CHECK-NEXT: li a1, -1
+; CHECK-NEXT: vsetvli zero, a0, e64, m4, ta, ma
+; CHECK-NEXT: vssub.vx v8, v8, a1, v0.t
+; CHECK-NEXT: ret
+ %elt.head = insertelement <8 x i64> poison, i64 -1, i32 0
+ %vb = shufflevector <8 x i64> %elt.head, <8 x i64> poison, <8 x i32> zeroinitializer
+ %v = call <8 x i64> @llvm.vp.ssub.sat.v8i64(<8 x i64> %va, <8 x i64> %vb, <8 x i1> %m, i32 %evl)
+ ret <8 x i64> %v
+}
+
+define <8 x i64> @vssub_vi_v8i64_unmasked(<8 x i64> %va, i32 zeroext %evl) {
+; CHECK-LABEL: vssub_vi_v8i64_unmasked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: li a1, -1
+; CHECK-NEXT: vsetvli zero, a0, e64, m4, ta, ma
+; CHECK-NEXT: vssub.vx v8, v8, a1
+; CHECK-NEXT: ret
+ %elt.head = insertelement <8 x i64> poison, i64 -1, i32 0
+ %vb = shufflevector <8 x i64> %elt.head, <8 x i64> poison, <8 x i32> zeroinitializer
+ %head = insertelement <8 x i1> poison, i1 true, i32 0
+ %m = shufflevector <8 x i1> %head, <8 x i1> poison, <8 x i32> zeroinitializer
+ %v = call <8 x i64> @llvm.vp.ssub.sat.v8i64(<8 x i64> %va, <8 x i64> %vb, <8 x i1> %m, i32 %evl)
+ ret <8 x i64> %v
+}
+
+declare <16 x i64> @llvm.vp.ssub.sat.v16i64(<16 x i64>, <16 x i64>, <16 x i1>, i32)
+
+define <16 x i64> @vssub_vv_v16i64(<16 x i64> %va, <16 x i64> %b, <16 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vssub_vv_v16i64:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma
+; CHECK-NEXT: vssub.vv v8, v8, v16, v0.t
+; CHECK-NEXT: ret
+ %v = call <16 x i64> @llvm.vp.ssub.sat.v16i64(<16 x i64> %va, <16 x i64> %b, <16 x i1> %m, i32 %evl)
+ ret <16 x i64> %v
+}
+
+define <16 x i64> @vssub_vv_v16i64_unmasked(<16 x i64> %va, <16 x i64> %b, i32 zeroext %evl) {
+; CHECK-LABEL: vssub_vv_v16i64_unmasked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma
+; CHECK-NEXT: vssub.vv v8, v8, v16
+; CHECK-NEXT: ret
+ %head = insertelement <16 x i1> poison, i1 true, i32 0
+ %m = shufflevector <16 x i1> %head, <16 x i1> poison, <16 x i32> zeroinitializer
+ %v = call <16 x i64> @llvm.vp.ssub.sat.v16i64(<16 x i64> %va, <16 x i64> %b, <16 x i1> %m, i32 %evl)
+ ret <16 x i64> %v
+}
+
+define <16 x i64> @vssub_vx_v16i64(<16 x i64> %va, i64 %b, <16 x i1> %m, i32 zeroext %evl) {
+; RV32-LABEL: vssub_vx_v16i64:
+; RV32: # %bb.0:
+; RV32-NEXT: addi sp, sp, -16
+; RV32-NEXT: .cfi_def_cfa_offset 16
+; RV32-NEXT: sw a1, 12(sp)
+; RV32-NEXT: sw a0, 8(sp)
+; RV32-NEXT: addi a0, sp, 8
+; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma
+; RV32-NEXT: vlse64.v v16, (a0), zero
+; RV32-NEXT: vsetvli zero, a2, e64, m8, ta, ma
+; RV32-NEXT: vssub.vv v8, v8, v16, v0.t
+; RV32-NEXT: addi sp, sp, 16
+; RV32-NEXT: ret
+;
+; RV64-LABEL: vssub_vx_v16i64:
+; RV64: # %bb.0:
+; RV64-NEXT: vsetvli zero, a1, e64, m8, ta, ma
+; RV64-NEXT: vssub.vx v8, v8, a0, v0.t
+; RV64-NEXT: ret
+ %elt.head = insertelement <16 x i64> poison, i64 %b, i32 0
+ %vb = shufflevector <16 x i64> %elt.head, <16 x i64> poison, <16 x i32> zeroinitializer
+ %v = call <16 x i64> @llvm.vp.ssub.sat.v16i64(<16 x i64> %va, <16 x i64> %vb, <16 x i1> %m, i32 %evl)
+ ret <16 x i64> %v
+}
+
+define <16 x i64> @vssub_vx_v16i64_unmasked(<16 x i64> %va, i64 %b, i32 zeroext %evl) {
+; RV32-LABEL: vssub_vx_v16i64_unmasked:
+; RV32: # %bb.0:
+; RV32-NEXT: addi sp, sp, -16
+; RV32-NEXT: .cfi_def_cfa_offset 16
+; RV32-NEXT: sw a1, 12(sp)
+; RV32-NEXT: sw a0, 8(sp)
+; RV32-NEXT: addi a0, sp, 8
+; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma
+; RV32-NEXT: vlse64.v v16, (a0), zero
+; RV32-NEXT: vsetvli zero, a2, e64, m8, ta, ma
+; RV32-NEXT: vssub.vv v8, v8, v16
+; RV32-NEXT: addi sp, sp, 16
+; RV32-NEXT: ret
+;
+; RV64-LABEL: vssub_vx_v16i64_unmasked:
+; RV64: # %bb.0:
+; RV64-NEXT: vsetvli zero, a1, e64, m8, ta, ma
+; RV64-NEXT: vssub.vx v8, v8, a0
+; RV64-NEXT: ret
+ %elt.head = insertelement <16 x i64> poison, i64 %b, i32 0
+ %vb = shufflevector <16 x i64> %elt.head, <16 x i64> poison, <16 x i32> zeroinitializer
+ %head = insertelement <16 x i1> poison, i1 true, i32 0
+ %m = shufflevector <16 x i1> %head, <16 x i1> poison, <16 x i32> zeroinitializer
+ %v = call <16 x i64> @llvm.vp.ssub.sat.v16i64(<16 x i64> %va, <16 x i64> %vb, <16 x i1> %m, i32 %evl)
+ ret <16 x i64> %v
+}
+
+define <16 x i64> @vssub_vi_v16i64(<16 x i64> %va, <16 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vssub_vi_v16i64:
+; CHECK: # %bb.0:
+; CHECK-NEXT: li a1, -1
+; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma
+; CHECK-NEXT: vssub.vx v8, v8, a1, v0.t
+; CHECK-NEXT: ret
+ %elt.head = insertelement <16 x i64> poison, i64 -1, i32 0
+ %vb = shufflevector <16 x i64> %elt.head, <16 x i64> poison, <16 x i32> zeroinitializer
+ %v = call <16 x i64> @llvm.vp.ssub.sat.v16i64(<16 x i64> %va, <16 x i64> %vb, <16 x i1> %m, i32 %evl)
+ ret <16 x i64> %v
+}
+
+define <16 x i64> @vssub_vi_v16i64_unmasked(<16 x i64> %va, i32 zeroext %evl) {
+; CHECK-LABEL: vssub_vi_v16i64_unmasked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: li a1, -1
+; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma
+; CHECK-NEXT: vssub.vx v8, v8, a1
+; CHECK-NEXT: ret
+ %elt.head = insertelement <16 x i64> poison, i64 -1, i32 0
+ %vb = shufflevector <16 x i64> %elt.head, <16 x i64> poison, <16 x i32> zeroinitializer
+ %head = insertelement <16 x i1> poison, i1 true, i32 0
+ %m = shufflevector <16 x i1> %head, <16 x i1> poison, <16 x i32> zeroinitializer
+ %v = call <16 x i64> @llvm.vp.ssub.sat.v16i64(<16 x i64> %va, <16 x i64> %vb, <16 x i1> %m, i32 %evl)
+ ret <16 x i64> %v
+}
+
+; Test that split-legalization works as expected.
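+; A <32 x i64> operand does not fit in a single register group, so the
+; operation is split into two <16 x i64> halves: the EVL is clamped to 16 for
+; the low half and the remaining count, if any, is applied to the high half.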
+
+declare <32 x i64> @llvm.vp.ssub.sat.v32i64(<32 x i64>, <32 x i64>, <32 x i1>, i32)
+
+define <32 x i64> @vssub_vx_v32i64(<32 x i64> %va, <32 x i1> %m, i32 zeroext %evl) {
+; RV32-LABEL: vssub_vx_v32i64:
+; RV32: # %bb.0:
+; RV32-NEXT: vsetivli zero, 2, e8, mf4, ta, ma
+; RV32-NEXT: vslidedown.vi v1, v0, 2
+; RV32-NEXT: li a1, 32
+; RV32-NEXT: vsetvli zero, a1, e32, m8, ta, ma
+; RV32-NEXT: li a2, 16
+; RV32-NEXT: vmv.v.i v24, -1
+; RV32-NEXT: mv a1, a0
+; RV32-NEXT: bltu a0, a2, .LBB108_2
+; RV32-NEXT: # %bb.1:
+; RV32-NEXT: li a1, 16
+; RV32-NEXT: .LBB108_2:
+; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma
+; RV32-NEXT: vssub.vv v8, v8, v24, v0.t
+; RV32-NEXT: addi a1, a0, -16
+; RV32-NEXT: sltu a0, a0, a1
+; RV32-NEXT: addi a0, a0, -1
+; RV32-NEXT: and a0, a0, a1
+; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
+; RV32-NEXT: vmv1r.v v0, v1
+; RV32-NEXT: vssub.vv v16, v16, v24, v0.t
+; RV32-NEXT: ret
+;
+; RV64-LABEL: vssub_vx_v32i64:
+; RV64: # %bb.0:
+; RV64-NEXT: vsetivli zero, 2, e8, mf4, ta, ma
+; RV64-NEXT: li a2, 16
+; RV64-NEXT: vslidedown.vi v24, v0, 2
+; RV64-NEXT: mv a1, a0
+; RV64-NEXT: bltu a0, a2, .LBB108_2
+; RV64-NEXT: # %bb.1:
+; RV64-NEXT: li a1, 16
+; RV64-NEXT: .LBB108_2:
+; RV64-NEXT: li a2, -1
+; RV64-NEXT: vsetvli zero, a1, e64, m8, ta, ma
+; RV64-NEXT: vssub.vx v8, v8, a2, v0.t
+; RV64-NEXT: addi a1, a0, -16
+; RV64-NEXT: sltu a0, a0, a1
+; RV64-NEXT: addi a0, a0, -1
+; RV64-NEXT: and a0, a0, a1
+; RV64-NEXT: vsetvli zero, a0, e64, m8, ta, ma
+; RV64-NEXT: vmv1r.v v0, v24
+; RV64-NEXT: vssub.vx v16, v16, a2, v0.t
+; RV64-NEXT: ret
+ %elt.head = insertelement <32 x i64> poison, i64 -1, i32 0
+ %vb = shufflevector <32 x i64> %elt.head, <32 x i64> poison, <32 x i32> zeroinitializer
+ %v = call <32 x i64> @llvm.vp.ssub.sat.v32i64(<32 x i64> %va, <32 x i64> %vb, <32 x i1> %m, i32 %evl)
+ ret <32 x i64> %v
+}
+
+define <32 x i64> @vssub_vi_v32i64_unmasked(<32 x i64> %va, i32 zeroext %evl) {
+; RV32-LABEL: vssub_vi_v32i64_unmasked:
+; RV32: # %bb.0:
+; RV32-NEXT: li a1, 32
+; RV32-NEXT: vsetvli zero, a1, e32, m8, ta, ma
+; RV32-NEXT: li a2, 16
+; RV32-NEXT: vmv.v.i v24, -1
+; RV32-NEXT: mv a1, a0
+; RV32-NEXT: bltu a0, a2, .LBB109_2
+; RV32-NEXT: # %bb.1:
+; RV32-NEXT: li a1, 16
+; RV32-NEXT: .LBB109_2:
+; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma
+; RV32-NEXT: vssub.vv v8, v8, v24
+; RV32-NEXT: addi a1, a0, -16
+; RV32-NEXT: sltu a0, a0, a1
+; RV32-NEXT: addi a0, a0, -1
+; RV32-NEXT: and a0, a0, a1
+; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
+; RV32-NEXT: vssub.vv v16, v16, v24
+; RV32-NEXT: ret
+;
+; RV64-LABEL: vssub_vi_v32i64_unmasked:
+; RV64: # %bb.0:
+; RV64-NEXT: li a2, 16
+; RV64-NEXT: mv a1, a0
+; RV64-NEXT: bltu a0, a2, .LBB109_2
+; RV64-NEXT: # %bb.1:
+; RV64-NEXT: li a1, 16
+; RV64-NEXT: .LBB109_2:
+; RV64-NEXT: li a2, -1
+; RV64-NEXT: vsetvli zero, a1, e64, m8, ta, ma
+; RV64-NEXT: vssub.vx v8, v8, a2
+; RV64-NEXT: addi a1, a0, -16
+; RV64-NEXT: sltu a0, a0, a1
+; RV64-NEXT: addi a0, a0, -1
+; RV64-NEXT: and a0, a0, a1
+; RV64-NEXT: vsetvli zero, a0, e64, m8, ta, ma
+; RV64-NEXT: vssub.vx v16, v16, a2
+; RV64-NEXT: ret
+ %elt.head = insertelement <32 x i64> poison, i64 -1, i32 0
+ %vb = shufflevector <32 x i64> %elt.head, <32 x i64> poison, <32 x i32> zeroinitializer
+ %head = insertelement <32 x i1> poison, i1 true, i32 0
+ %m = shufflevector <32 x i1> %head, <32 x i1> poison, <32 x i32> zeroinitializer
+ %v = call <32 x i64> @llvm.vp.ssub.sat.v32i64(<32 x i64> %va, <32 x i64> %vb, <32 x i1> %m, i32 %evl)
+ ret <32 x i64> %v
+}
+
+; FIXME: We don't match vssub.vi on RV32.
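+; RV32 splats the -1 with vmv.v.i and uses vssub.vv instead, while RV64
+; materializes -1 in a scalar register and uses vssub.vx.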
+
+define <32 x i64> @vssub_vx_v32i64_evl12(<32 x i64> %va, <32 x i1> %m) {
+; RV32-LABEL: vssub_vx_v32i64_evl12:
+; RV32: # %bb.0:
+; RV32-NEXT: vsetivli zero, 2, e8, mf4, ta, ma
+; RV32-NEXT: vslidedown.vi v1, v0, 2
+; RV32-NEXT: li a0, 32
+; RV32-NEXT: vsetvli zero, a0, e32, m8, ta, ma
+; RV32-NEXT: vmv.v.i v24, -1
+; RV32-NEXT: vsetivli zero, 12, e64, m8, ta, ma
+; RV32-NEXT: vssub.vv v8, v8, v24, v0.t
+; RV32-NEXT: vsetivli zero, 0, e64, m8, ta, ma
+; RV32-NEXT: vmv1r.v v0, v1
+; RV32-NEXT: vssub.vv v16, v16, v24, v0.t
+; RV32-NEXT: ret
+;
+; RV64-LABEL: vssub_vx_v32i64_evl12:
+; RV64: # %bb.0:
+; RV64-NEXT: vsetivli zero, 2, e8, mf4, ta, ma
+; RV64-NEXT: vslidedown.vi v24, v0, 2
+; RV64-NEXT: li a0, -1
+; RV64-NEXT: vsetivli zero, 12, e64, m8, ta, ma
+; RV64-NEXT: vssub.vx v8, v8, a0, v0.t
+; RV64-NEXT: vsetivli zero, 0, e64, m8, ta, ma
+; RV64-NEXT: vmv1r.v v0, v24
+; RV64-NEXT: vssub.vx v16, v16, a0, v0.t
+; RV64-NEXT: ret
+ %elt.head = insertelement <32 x i64> poison, i64 -1, i32 0
+ %vb = shufflevector <32 x i64> %elt.head, <32 x i64> poison, <32 x i32> zeroinitializer
+ %v = call <32 x i64> @llvm.vp.ssub.sat.v32i64(<32 x i64> %va, <32 x i64> %vb, <32 x i1> %m, i32 12)
+ ret <32 x i64> %v
+}
+
+define <32 x i64> @vssub_vx_v32i64_evl27(<32 x i64> %va, <32 x i1> %m) {
+; RV32-LABEL: vssub_vx_v32i64_evl27:
+; RV32: # %bb.0:
+; RV32-NEXT: vsetivli zero, 2, e8, mf4, ta, ma
+; RV32-NEXT: vslidedown.vi v1, v0, 2
+; RV32-NEXT: li a0, 32
+; RV32-NEXT: vsetvli zero, a0, e32, m8, ta, ma
+; RV32-NEXT: vmv.v.i v24, -1
+; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma
+; RV32-NEXT: vssub.vv v8, v8, v24, v0.t
+; RV32-NEXT: vsetivli zero, 11, e64, m8, ta, ma
+; RV32-NEXT: vmv1r.v v0, v1
+; RV32-NEXT: vssub.vv v16, v16, v24, v0.t
+; RV32-NEXT: ret
+;
+; RV64-LABEL: vssub_vx_v32i64_evl27:
+; RV64: # %bb.0:
+; RV64-NEXT: vsetivli zero, 2, e8, mf4, ta, ma
+; RV64-NEXT: vslidedown.vi v24, v0, 2
+; RV64-NEXT: li a0, -1
+; RV64-NEXT: vsetivli zero, 16, e64, m8, ta, ma
+; RV64-NEXT: vssub.vx v8, v8, a0, v0.t
+; RV64-NEXT: vsetivli zero, 11, e64, m8, ta, ma
+; RV64-NEXT: vmv1r.v v0, v24
+; RV64-NEXT: vssub.vx v16, v16, a0, v0.t
+; RV64-NEXT: ret
+ %elt.head = insertelement <32 x i64> poison, i64 -1, i32 0
+ %vb = shufflevector <32 x i64> %elt.head, <32 x i64> poison, <32 x i32> zeroinitializer
+ %v = call <32 x i64> @llvm.vp.ssub.sat.v32i64(<32 x i64> %va, <32 x i64> %vb, <32 x i1> %m, i32 27)
+ ret <32 x i64> %v
+}
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vssubu-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vssubu-vp.ll
new file mode 100644
index 0000000..6ea9758
--- /dev/null
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vssubu-vp.ll
@@ -0,0 +1,1740 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=riscv32 -mattr=+v -verify-machineinstrs < %s \
+; RUN: | FileCheck %s --check-prefixes=CHECK,RV32
+; RUN: llc -mtriple=riscv64 -mattr=+v -verify-machineinstrs < %s \
+; RUN: | FileCheck %s --check-prefixes=CHECK,RV64
+
+declare <8 x i7> @llvm.vp.usub.sat.v8i7(<8 x i7>, <8 x i7>, <8 x i1>, i32)
+
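+; The illegal i7 element type is promoted to i8; both operands are masked with
+; 127 so the unsigned saturating subtract sees the original 7-bit values.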
+define <8 x i7> @vssubu_vv_v8i7(<8 x i7> %va, <8 x i7> %b, <8 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vssubu_vv_v8i7:
+; CHECK: # %bb.0:
+; CHECK-NEXT: li a1, 127
+; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma
+; CHECK-NEXT: vand.vx v9, v9, a1
+; CHECK-NEXT: vand.vx v8, v8, a1
+; CHECK-NEXT: vsetvli zero, a0, e8, mf2, ta, ma
+; CHECK-NEXT: vssubu.vv v8, v8, v9, v0.t
+; CHECK-NEXT: ret
+ %v = call <8 x i7> @llvm.vp.usub.sat.v8i7(<8 x i7> %va, <8 x i7> %b, <8 x i1> %m, i32 %evl)
+ ret <8 x i7> %v
+}
+
+declare <2 x i8> @llvm.vp.usub.sat.v2i8(<2 x i8>, <2 x i8>, <2 x i1>, i32)
+
+define <2 x i8> @vssubu_vv_v2i8(<2 x i8> %va, <2 x i8> %b, <2 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vssubu_vv_v2i8:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e8, mf8, ta, ma
+; CHECK-NEXT: vssubu.vv v8, v8, v9, v0.t
+; CHECK-NEXT: ret
+ %v = call <2 x i8> @llvm.vp.usub.sat.v2i8(<2 x i8> %va, <2 x i8> %b, <2 x i1> %m, i32 %evl)
+ ret <2 x i8> %v
+}
+
+define <2 x i8> @vssubu_vv_v2i8_unmasked(<2 x i8> %va, <2 x i8> %b, i32 zeroext %evl) {
+; CHECK-LABEL: vssubu_vv_v2i8_unmasked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e8, mf8, ta, ma
+; CHECK-NEXT: vssubu.vv v8, v8, v9
+; CHECK-NEXT: ret
+ %head = insertelement <2 x i1> poison, i1 true, i32 0
+ %m = shufflevector <2 x i1> %head, <2 x i1> poison, <2 x i32> zeroinitializer
+ %v = call <2 x i8> @llvm.vp.usub.sat.v2i8(<2 x i8> %va, <2 x i8> %b, <2 x i1> %m, i32 %evl)
+ ret <2 x i8> %v
+}
+
+define <2 x i8> @vssubu_vx_v2i8(<2 x i8> %va, i8 %b, <2 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vssubu_vx_v2i8:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, ma
+; CHECK-NEXT: vssubu.vx v8, v8, a0, v0.t
+; CHECK-NEXT: ret
+ %elt.head = insertelement <2 x i8> poison, i8 %b, i32 0
+ %vb = shufflevector <2 x i8> %elt.head, <2 x i8> poison, <2 x i32> zeroinitializer
+ %v = call <2 x i8> @llvm.vp.usub.sat.v2i8(<2 x i8> %va, <2 x i8> %vb, <2 x i1> %m, i32 %evl)
+ ret <2 x i8> %v
+}
+
+define <2 x i8> @vssubu_vx_v2i8_unmasked(<2 x i8> %va, i8 %b, i32 zeroext %evl) {
+; CHECK-LABEL: vssubu_vx_v2i8_unmasked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, ma
+; CHECK-NEXT: vssubu.vx v8, v8, a0
+; CHECK-NEXT: ret
+ %elt.head = insertelement <2 x i8> poison, i8 %b, i32 0
+ %vb = shufflevector <2 x i8> %elt.head, <2 x i8> poison, <2 x i32> zeroinitializer
+ %head = insertelement <2 x i1> poison, i1 true, i32 0
+ %m = shufflevector <2 x i1> %head, <2 x i1> poison, <2 x i32> zeroinitializer
+ %v = call <2 x i8> @llvm.vp.usub.sat.v2i8(<2 x i8> %va, <2 x i8> %vb, <2 x i1> %m, i32 %evl)
+ ret <2 x i8> %v
+}
+
+define <2 x i8> @vssubu_vi_v2i8(<2 x i8> %va, <2 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vssubu_vi_v2i8:
+; CHECK: # %bb.0:
+; CHECK-NEXT: li a1, -1
+; CHECK-NEXT: vsetvli zero, a0, e8, mf8, ta, ma
+; CHECK-NEXT: vssubu.vx v8, v8, a1, v0.t
+; CHECK-NEXT: ret
+ %elt.head = insertelement <2 x i8> poison, i8 -1, i32 0
+ %vb = shufflevector <2 x i8> %elt.head, <2 x i8> poison, <2 x i32> zeroinitializer
+ %v = call <2 x i8> @llvm.vp.usub.sat.v2i8(<2 x i8> %va, <2 x i8> %vb, <2 x i1> %m, i32 %evl)
+ ret <2 x i8> %v
+}
+
+define <2 x i8> @vssubu_vi_v2i8_unmasked(<2 x i8> %va, i32 zeroext %evl) {
+; CHECK-LABEL: vssubu_vi_v2i8_unmasked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: li a1, -1
+; CHECK-NEXT: vsetvli zero, a0, e8, mf8, ta, ma
+; CHECK-NEXT: vssubu.vx v8, v8, a1
+; CHECK-NEXT: ret
+ %elt.head = insertelement <2 x i8> poison, i8 -1, i32 0
+ %vb = shufflevector <2 x i8> %elt.head, <2 x i8> poison, <2 x i32> zeroinitializer
+ %head = insertelement <2 x i1> poison, i1 true, i32 0
+ %m = shufflevector <2 x i1> %head, <2 x i1> poison, <2 x i32> zeroinitializer
+ %v = call <2 x i8> @llvm.vp.usub.sat.v2i8(<2 x i8> %va, <2 x i8> %vb, <2 x i1> %m, i32 %evl)
+ ret <2 x i8> %v
+}
+
+declare <4 x i8> @llvm.vp.usub.sat.v4i8(<4 x i8>, <4 x i8>, <4 x i1>, i32)
+
+define <4 x i8> @vssubu_vv_v4i8(<4 x i8> %va, <4 x i8> %b, <4 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vssubu_vv_v4i8:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e8, mf4, ta, ma
+; CHECK-NEXT: vssubu.vv v8, v8, v9, v0.t
+; CHECK-NEXT: ret
+ %v = call <4 x i8> @llvm.vp.usub.sat.v4i8(<4 x i8> %va, <4 x i8> %b, <4 x i1> %m, i32 %evl)
+ ret <4 x i8> %v
+}
+
+define <4 x i8> @vssubu_vv_v4i8_unmasked(<4 x i8> %va, <4 x i8> %b, i32 zeroext %evl) {
+; CHECK-LABEL: vssubu_vv_v4i8_unmasked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e8, mf4, ta, ma
+; CHECK-NEXT: vssubu.vv v8, v8, v9
+; CHECK-NEXT: ret
+ %head = insertelement <4 x i1> poison, i1 true, i32 0
+ %m = shufflevector <4 x i1> %head, <4 x i1> poison, <4 x i32> zeroinitializer
+ %v = call <4 x i8> @llvm.vp.usub.sat.v4i8(<4 x i8> %va, <4 x i8> %b, <4 x i1> %m, i32 %evl)
+ ret <4 x i8> %v
+}
+
+define <4 x i8> @vssubu_vx_v4i8(<4 x i8> %va, i8 %b, <4 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vssubu_vx_v4i8:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, ma
+; CHECK-NEXT: vssubu.vx v8, v8, a0, v0.t
+; CHECK-NEXT: ret
+ %elt.head = insertelement <4 x i8> poison, i8 %b, i32 0
+ %vb = shufflevector <4 x i8> %elt.head, <4 x i8> poison, <4 x i32> zeroinitializer
+ %v = call <4 x i8> @llvm.vp.usub.sat.v4i8(<4 x i8> %va, <4 x i8> %vb, <4 x i1> %m, i32 %evl)
+ ret <4 x i8> %v
+}
+
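+; vssubu.vx only takes the scalar as the subtrahend, so the commuted form
+; splats %b with vmv.v.x and falls back to vssubu.vv.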
+define <4 x i8> @vssubu_vx_v4i8_commute(<4 x i8> %va, i8 %b, <4 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vssubu_vx_v4i8_commute:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetivli zero, 4, e8, mf4, ta, ma
+; CHECK-NEXT: vmv.v.x v9, a0
+; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, ma
+; CHECK-NEXT: vssubu.vv v8, v9, v8, v0.t
+; CHECK-NEXT: ret
+ %elt.head = insertelement <4 x i8> poison, i8 %b, i32 0
+ %vb = shufflevector <4 x i8> %elt.head, <4 x i8> poison, <4 x i32> zeroinitializer
+ %v = call <4 x i8> @llvm.vp.usub.sat.v4i8(<4 x i8> %vb, <4 x i8> %va, <4 x i1> %m, i32 %evl)
+ ret <4 x i8> %v
+}
+
+define <4 x i8> @vssubu_vx_v4i8_unmasked(<4 x i8> %va, i8 %b, i32 zeroext %evl) {
+; CHECK-LABEL: vssubu_vx_v4i8_unmasked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, ma
+; CHECK-NEXT: vssubu.vx v8, v8, a0
+; CHECK-NEXT: ret
+ %elt.head = insertelement <4 x i8> poison, i8 %b, i32 0
+ %vb = shufflevector <4 x i8> %elt.head, <4 x i8> poison, <4 x i32> zeroinitializer
+ %head = insertelement <4 x i1> poison, i1 true, i32 0
+ %m = shufflevector <4 x i1> %head, <4 x i1> poison, <4 x i32> zeroinitializer
+ %v = call <4 x i8> @llvm.vp.usub.sat.v4i8(<4 x i8> %va, <4 x i8> %vb, <4 x i1> %m, i32 %evl)
+ ret <4 x i8> %v
+}
+
+define <4 x i8> @vssubu_vi_v4i8(<4 x i8> %va, <4 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vssubu_vi_v4i8:
+; CHECK: # %bb.0:
+; CHECK-NEXT: li a1, -1
+; CHECK-NEXT: vsetvli zero, a0, e8, mf4, ta, ma
+; CHECK-NEXT: vssubu.vx v8, v8, a1, v0.t
+; CHECK-NEXT: ret
+ %elt.head = insertelement <4 x i8> poison, i8 -1, i32 0
+ %vb = shufflevector <4 x i8> %elt.head, <4 x i8> poison, <4 x i32> zeroinitializer
+ %v = call <4 x i8> @llvm.vp.usub.sat.v4i8(<4 x i8> %va, <4 x i8> %vb, <4 x i1> %m, i32 %evl)
+ ret <4 x i8> %v
+}
+
+define <4 x i8> @vssubu_vi_v4i8_unmasked(<4 x i8> %va, i32 zeroext %evl) {
+; CHECK-LABEL: vssubu_vi_v4i8_unmasked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: li a1, -1
+; CHECK-NEXT: vsetvli zero, a0, e8, mf4, ta, ma
+; CHECK-NEXT: vssubu.vx v8, v8, a1
+; CHECK-NEXT: ret
+ %elt.head = insertelement <4 x i8> poison, i8 -1, i32 0
+ %vb = shufflevector <4 x i8> %elt.head, <4 x i8> poison, <4 x i32> zeroinitializer
+ %head = insertelement <4 x i1> poison, i1 true, i32 0
+ %m = shufflevector <4 x i1> %head, <4 x i1> poison, <4 x i32> zeroinitializer
+ %v = call <4 x i8> @llvm.vp.usub.sat.v4i8(<4 x i8> %va, <4 x i8> %vb, <4 x i1> %m, i32 %evl)
+ ret <4 x i8> %v
+}
+
+declare <5 x i8> @llvm.vp.usub.sat.v5i8(<5 x i8>, <5 x i8>, <5 x i1>, i32)
+
+define <5 x i8> @vssubu_vv_v5i8(<5 x i8> %va, <5 x i8> %b, <5 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vssubu_vv_v5i8:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e8, mf2, ta, ma
+; CHECK-NEXT: vssubu.vv v8, v8, v9, v0.t
+; CHECK-NEXT: ret
+ %v = call <5 x i8> @llvm.vp.usub.sat.v5i8(<5 x i8> %va, <5 x i8> %b, <5 x i1> %m, i32 %evl)
+ ret <5 x i8> %v
+}
+
+define <5 x i8> @vssubu_vv_v5i8_unmasked(<5 x i8> %va, <5 x i8> %b, i32 zeroext %evl) {
+; CHECK-LABEL: vssubu_vv_v5i8_unmasked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e8, mf2, ta, ma
+; CHECK-NEXT: vssubu.vv v8, v8, v9
+; CHECK-NEXT: ret
+ %head = insertelement <5 x i1> poison, i1 true, i32 0
+ %m = shufflevector <5 x i1> %head, <5 x i1> poison, <5 x i32> zeroinitializer
+ %v = call <5 x i8> @llvm.vp.usub.sat.v5i8(<5 x i8> %va, <5 x i8> %b, <5 x i1> %m, i32 %evl)
+ ret <5 x i8> %v
+}
+
+define <5 x i8> @vssubu_vx_v5i8(<5 x i8> %va, i8 %b, <5 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vssubu_vx_v5i8:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma
+; CHECK-NEXT: vssubu.vx v8, v8, a0, v0.t
+; CHECK-NEXT: ret
+ %elt.head = insertelement <5 x i8> poison, i8 %b, i32 0
+ %vb = shufflevector <5 x i8> %elt.head, <5 x i8> poison, <5 x i32> zeroinitializer
+ %v = call <5 x i8> @llvm.vp.usub.sat.v5i8(<5 x i8> %va, <5 x i8> %vb, <5 x i1> %m, i32 %evl)
+ ret <5 x i8> %v
+}
+
+define <5 x i8> @vssubu_vx_v5i8_unmasked(<5 x i8> %va, i8 %b, i32 zeroext %evl) {
+; CHECK-LABEL: vssubu_vx_v5i8_unmasked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma
+; CHECK-NEXT: vssubu.vx v8, v8, a0
+; CHECK-NEXT: ret
+ %elt.head = insertelement <5 x i8> poison, i8 %b, i32 0
+ %vb = shufflevector <5 x i8> %elt.head, <5 x i8> poison, <5 x i32> zeroinitializer
+ %head = insertelement <5 x i1> poison, i1 true, i32 0
+ %m = shufflevector <5 x i1> %head, <5 x i1> poison, <5 x i32> zeroinitializer
+ %v = call <5 x i8> @llvm.vp.usub.sat.v5i8(<5 x i8> %va, <5 x i8> %vb, <5 x i1> %m, i32 %evl)
+ ret <5 x i8> %v
+}
+
+define <5 x i8> @vssubu_vi_v5i8(<5 x i8> %va, <5 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vssubu_vi_v5i8:
+; CHECK: # %bb.0:
+; CHECK-NEXT: li a1, -1
+; CHECK-NEXT: vsetvli zero, a0, e8, mf2, ta, ma
+; CHECK-NEXT: vssubu.vx v8, v8, a1, v0.t
+; CHECK-NEXT: ret
+ %elt.head = insertelement <5 x i8> poison, i8 -1, i32 0
+ %vb = shufflevector <5 x i8> %elt.head, <5 x i8> poison, <5 x i32> zeroinitializer
+ %v = call <5 x i8> @llvm.vp.usub.sat.v5i8(<5 x i8> %va, <5 x i8> %vb, <5 x i1> %m, i32 %evl)
+ ret <5 x i8> %v
+}
+
+define <5 x i8> @vssubu_vi_v5i8_unmasked(<5 x i8> %va, i32 zeroext %evl) {
+; CHECK-LABEL: vssubu_vi_v5i8_unmasked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: li a1, -1
+; CHECK-NEXT: vsetvli zero, a0, e8, mf2, ta, ma
+; CHECK-NEXT: vssubu.vx v8, v8, a1
+; CHECK-NEXT: ret
+ %elt.head = insertelement <5 x i8> poison, i8 -1, i32 0
+ %vb = shufflevector <5 x i8> %elt.head, <5 x i8> poison, <5 x i32> zeroinitializer
+ %head = insertelement <5 x i1> poison, i1 true, i32 0
+ %m = shufflevector <5 x i1> %head, <5 x i1> poison, <5 x i32> zeroinitializer
+ %v = call <5 x i8> @llvm.vp.usub.sat.v5i8(<5 x i8> %va, <5 x i8> %vb, <5 x i1> %m, i32 %evl)
+ ret <5 x i8> %v
+}
+
+declare <8 x i8> @llvm.vp.usub.sat.v8i8(<8 x i8>, <8 x i8>, <8 x i1>, i32)
+
+define <8 x i8> @vssubu_vv_v8i8(<8 x i8> %va, <8 x i8> %b, <8 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vssubu_vv_v8i8:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e8, mf2, ta, ma
+; CHECK-NEXT: vssubu.vv v8, v8, v9, v0.t
+; CHECK-NEXT: ret
+ %v = call <8 x i8> @llvm.vp.usub.sat.v8i8(<8 x i8> %va, <8 x i8> %b, <8 x i1> %m, i32 %evl)
+ ret <8 x i8> %v
+}
+
+define <8 x i8> @vssubu_vv_v8i8_unmasked(<8 x i8> %va, <8 x i8> %b, i32 zeroext %evl) {
+; CHECK-LABEL: vssubu_vv_v8i8_unmasked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e8, mf2, ta, ma
+; CHECK-NEXT: vssubu.vv v8, v8, v9
+; CHECK-NEXT: ret
+ %head = insertelement <8 x i1> poison, i1 true, i32 0
+ %m = shufflevector <8 x i1> %head, <8 x i1> poison, <8 x i32> zeroinitializer
+ %v = call <8 x i8> @llvm.vp.usub.sat.v8i8(<8 x i8> %va, <8 x i8> %b, <8 x i1> %m, i32 %evl)
+ ret <8 x i8> %v
+}
+
+define <8 x i8> @vssubu_vx_v8i8(<8 x i8> %va, i8 %b, <8 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vssubu_vx_v8i8:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma
+; CHECK-NEXT: vssubu.vx v8, v8, a0, v0.t
+; CHECK-NEXT: ret
+ %elt.head = insertelement <8 x i8> poison, i8 %b, i32 0
+ %vb = shufflevector <8 x i8> %elt.head, <8 x i8> poison, <8 x i32> zeroinitializer
+ %v = call <8 x i8> @llvm.vp.usub.sat.v8i8(<8 x i8> %va, <8 x i8> %vb, <8 x i1> %m, i32 %evl)
+ ret <8 x i8> %v
+}
+
+define <8 x i8> @vssubu_vx_v8i8_unmasked(<8 x i8> %va, i8 %b, i32 zeroext %evl) {
+; CHECK-LABEL: vssubu_vx_v8i8_unmasked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma
+; CHECK-NEXT: vssubu.vx v8, v8, a0
+; CHECK-NEXT: ret
+ %elt.head = insertelement <8 x i8> poison, i8 %b, i32 0
+ %vb = shufflevector <8 x i8> %elt.head, <8 x i8> poison, <8 x i32> zeroinitializer
+ %head = insertelement <8 x i1> poison, i1 true, i32 0
+ %m = shufflevector <8 x i1> %head, <8 x i1> poison, <8 x i32> zeroinitializer
+ %v = call <8 x i8> @llvm.vp.usub.sat.v8i8(<8 x i8> %va, <8 x i8> %vb, <8 x i1> %m, i32 %evl)
+ ret <8 x i8> %v
+}
+
+define <8 x i8> @vssubu_vi_v8i8(<8 x i8> %va, <8 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vssubu_vi_v8i8:
+; CHECK: # %bb.0:
+; CHECK-NEXT: li a1, -1
+; CHECK-NEXT: vsetvli zero, a0, e8, mf2, ta, ma
+; CHECK-NEXT: vssubu.vx v8, v8, a1, v0.t
+; CHECK-NEXT: ret
+ %elt.head = insertelement <8 x i8> poison, i8 -1, i32 0
+ %vb = shufflevector <8 x i8> %elt.head, <8 x i8> poison, <8 x i32> zeroinitializer
+ %v = call <8 x i8> @llvm.vp.usub.sat.v8i8(<8 x i8> %va, <8 x i8> %vb, <8 x i1> %m, i32 %evl)
+ ret <8 x i8> %v
+}
+
+define <8 x i8> @vssubu_vi_v8i8_unmasked(<8 x i8> %va, i32 zeroext %evl) {
+; CHECK-LABEL: vssubu_vi_v8i8_unmasked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: li a1, -1
+; CHECK-NEXT: vsetvli zero, a0, e8, mf2, ta, ma
+; CHECK-NEXT: vssubu.vx v8, v8, a1
+; CHECK-NEXT: ret
+ %elt.head = insertelement <8 x i8> poison, i8 -1, i32 0
+ %vb = shufflevector <8 x i8> %elt.head, <8 x i8> poison, <8 x i32> zeroinitializer
+ %head = insertelement <8 x i1> poison, i1 true, i32 0
+ %m = shufflevector <8 x i1> %head, <8 x i1> poison, <8 x i32> zeroinitializer
+ %v = call <8 x i8> @llvm.vp.usub.sat.v8i8(<8 x i8> %va, <8 x i8> %vb, <8 x i1> %m, i32 %evl)
+ ret <8 x i8> %v
+}
+
+declare <16 x i8> @llvm.vp.usub.sat.v16i8(<16 x i8>, <16 x i8>, <16 x i1>, i32)
+
+define <16 x i8> @vssubu_vv_v16i8(<16 x i8> %va, <16 x i8> %b, <16 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vssubu_vv_v16i8:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e8, m1, ta, ma
+; CHECK-NEXT: vssubu.vv v8, v8, v9, v0.t
+; CHECK-NEXT: ret
+ %v = call <16 x i8> @llvm.vp.usub.sat.v16i8(<16 x i8> %va, <16 x i8> %b, <16 x i1> %m, i32 %evl)
+ ret <16 x i8> %v
+}
+
+define <16 x i8> @vssubu_vv_v16i8_unmasked(<16 x i8> %va, <16 x i8> %b, i32 zeroext %evl) {
+; CHECK-LABEL: vssubu_vv_v16i8_unmasked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e8, m1, ta, ma
+; CHECK-NEXT: vssubu.vv v8, v8, v9
+; CHECK-NEXT: ret
+ %head = insertelement <16 x i1> poison, i1 true, i32 0
+ %m = shufflevector <16 x i1> %head, <16 x i1> poison, <16 x i32> zeroinitializer
+ %v = call <16 x i8> @llvm.vp.usub.sat.v16i8(<16 x i8> %va, <16 x i8> %b, <16 x i1> %m, i32 %evl)
+ ret <16 x i8> %v
+}
+
+define <16 x i8> @vssubu_vx_v16i8(<16 x i8> %va, i8 %b, <16 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vssubu_vx_v16i8:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, ma
+; CHECK-NEXT: vssubu.vx v8, v8, a0, v0.t
+; CHECK-NEXT: ret
+ %elt.head = insertelement <16 x i8> poison, i8 %b, i32 0
+ %vb = shufflevector <16 x i8> %elt.head, <16 x i8> poison, <16 x i32> zeroinitializer
+ %v = call <16 x i8> @llvm.vp.usub.sat.v16i8(<16 x i8> %va, <16 x i8> %vb, <16 x i1> %m, i32 %evl)
+ ret <16 x i8> %v
+}
+
+define <16 x i8> @vssubu_vx_v16i8_unmasked(<16 x i8> %va, i8 %b, i32 zeroext %evl) {
+; CHECK-LABEL: vssubu_vx_v16i8_unmasked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, ma
+; CHECK-NEXT: vssubu.vx v8, v8, a0
+; CHECK-NEXT: ret
+ %elt.head = insertelement <16 x i8> poison, i8 %b, i32 0
+ %vb = shufflevector <16 x i8> %elt.head, <16 x i8> poison, <16 x i32> zeroinitializer
+ %head = insertelement <16 x i1> poison, i1 true, i32 0
+ %m = shufflevector <16 x i1> %head, <16 x i1> poison, <16 x i32> zeroinitializer
+ %v = call <16 x i8> @llvm.vp.usub.sat.v16i8(<16 x i8> %va, <16 x i8> %vb, <16 x i1> %m, i32 %evl)
+ ret <16 x i8> %v
+}
+
+define <16 x i8> @vssubu_vi_v16i8(<16 x i8> %va, <16 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vssubu_vi_v16i8:
+; CHECK: # %bb.0:
+; CHECK-NEXT: li a1, -1
+; CHECK-NEXT: vsetvli zero, a0, e8, m1, ta, ma
+; CHECK-NEXT: vssubu.vx v8, v8, a1, v0.t
+; CHECK-NEXT: ret
+ %elt.head = insertelement <16 x i8> poison, i8 -1, i32 0
+ %vb = shufflevector <16 x i8> %elt.head, <16 x i8> poison, <16 x i32> zeroinitializer
+ %v = call <16 x i8> @llvm.vp.usub.sat.v16i8(<16 x i8> %va, <16 x i8> %vb, <16 x i1> %m, i32 %evl)
+ ret <16 x i8> %v
+}
+
+define <16 x i8> @vssubu_vi_v16i8_unmasked(<16 x i8> %va, i32 zeroext %evl) {
+; CHECK-LABEL: vssubu_vi_v16i8_unmasked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: li a1, -1
+; CHECK-NEXT: vsetvli zero, a0, e8, m1, ta, ma
+; CHECK-NEXT: vssubu.vx v8, v8, a1
+; CHECK-NEXT: ret
+ %elt.head = insertelement <16 x i8> poison, i8 -1, i32 0
+ %vb = shufflevector <16 x i8> %elt.head, <16 x i8> poison, <16 x i32> zeroinitializer
+ %head = insertelement <16 x i1> poison, i1 true, i32 0
+ %m = shufflevector <16 x i1> %head, <16 x i1> poison, <16 x i32> zeroinitializer
+ %v = call <16 x i8> @llvm.vp.usub.sat.v16i8(<16 x i8> %va, <16 x i8> %vb, <16 x i1> %m, i32 %evl)
+ ret <16 x i8> %v
+}
+
+declare <256 x i8> @llvm.vp.usub.sat.v258i8(<256 x i8>, <256 x i8>, <256 x i1>, i32)
+
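+; The <256 x i8> operation is split into two <128 x i8> halves; the mask for
+; the upper half is loaded with vlm.v from the address in a0.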
+define <256 x i8> @vssubu_vi_v258i8(<256 x i8> %va, <256 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vssubu_vi_v258i8:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmv1r.v v24, v0
+; CHECK-NEXT: li a2, 128
+; CHECK-NEXT: vsetvli zero, a2, e8, m8, ta, ma
+; CHECK-NEXT: vlm.v v0, (a0)
+; CHECK-NEXT: addi a0, a1, -128
+; CHECK-NEXT: sltu a3, a1, a0
+; CHECK-NEXT: addi a3, a3, -1
+; CHECK-NEXT: and a3, a3, a0
+; CHECK-NEXT: li a0, -1
+; CHECK-NEXT: vsetvli zero, a3, e8, m8, ta, ma
+; CHECK-NEXT: vssubu.vx v16, v16, a0, v0.t
+; CHECK-NEXT: bltu a1, a2, .LBB32_2
+; CHECK-NEXT: # %bb.1:
+; CHECK-NEXT: li a1, 128
+; CHECK-NEXT: .LBB32_2:
+; CHECK-NEXT: vsetvli zero, a1, e8, m8, ta, ma
+; CHECK-NEXT: vmv1r.v v0, v24
+; CHECK-NEXT: vssubu.vx v8, v8, a0, v0.t
+; CHECK-NEXT: ret
+ %elt.head = insertelement <256 x i8> poison, i8 -1, i32 0
+ %vb = shufflevector <256 x i8> %elt.head, <256 x i8> poison, <256 x i32> zeroinitializer
+ %v = call <256 x i8> @llvm.vp.usub.sat.v258i8(<256 x i8> %va, <256 x i8> %vb, <256 x i1> %m, i32 %evl)
+ ret <256 x i8> %v
+}
+
+define <256 x i8> @vssubu_vi_v258i8_unmasked(<256 x i8> %va, i32 zeroext %evl) {
+; CHECK-LABEL: vssubu_vi_v258i8_unmasked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: li a2, 128
+; CHECK-NEXT: mv a1, a0
+; CHECK-NEXT: bltu a0, a2, .LBB33_2
+; CHECK-NEXT: # %bb.1:
+; CHECK-NEXT: li a1, 128
+; CHECK-NEXT: .LBB33_2:
+; CHECK-NEXT: li a2, -1
+; CHECK-NEXT: vsetvli zero, a1, e8, m8, ta, ma
+; CHECK-NEXT: vssubu.vx v8, v8, a2
+; CHECK-NEXT: addi a1, a0, -128
+; CHECK-NEXT: sltu a0, a0, a1
+; CHECK-NEXT: addi a0, a0, -1
+; CHECK-NEXT: and a0, a0, a1
+; CHECK-NEXT: vsetvli zero, a0, e8, m8, ta, ma
+; CHECK-NEXT: vssubu.vx v16, v16, a2
+; CHECK-NEXT: ret
+ %elt.head = insertelement <256 x i8> poison, i8 -1, i32 0
+ %vb = shufflevector <256 x i8> %elt.head, <256 x i8> poison, <256 x i32> zeroinitializer
+ %head = insertelement <256 x i1> poison, i1 true, i32 0
+ %m = shufflevector <256 x i1> %head, <256 x i1> poison, <256 x i32> zeroinitializer
+ %v = call <256 x i8> @llvm.vp.usub.sat.v258i8(<256 x i8> %va, <256 x i8> %vb, <256 x i1> %m, i32 %evl)
+ ret <256 x i8> %v
+}
+
+; Test splitting when the %evl is a known constant.
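+; With %evl = 129 the two halves execute with constant VLs of 128 and 1, so no
+; runtime EVL computation is needed.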
+
+define <256 x i8> @vssubu_vi_v258i8_evl129(<256 x i8> %va, <256 x i1> %m) {
+; CHECK-LABEL: vssubu_vi_v258i8_evl129:
+; CHECK: # %bb.0:
+; CHECK-NEXT: li a1, 128
+; CHECK-NEXT: vsetvli zero, a1, e8, m8, ta, ma
+; CHECK-NEXT: vlm.v v24, (a0)
+; CHECK-NEXT: li a0, -1
+; CHECK-NEXT: vssubu.vx v8, v8, a0, v0.t
+; CHECK-NEXT: vsetivli zero, 1, e8, m8, ta, ma
+; CHECK-NEXT: vmv1r.v v0, v24
+; CHECK-NEXT: vssubu.vx v16, v16, a0, v0.t
+; CHECK-NEXT: ret
+ %elt.head = insertelement <256 x i8> poison, i8 -1, i32 0
+ %vb = shufflevector <256 x i8> %elt.head, <256 x i8> poison, <256 x i32> zeroinitializer
+ %v = call <256 x i8> @llvm.vp.usub.sat.v258i8(<256 x i8> %va, <256 x i8> %vb, <256 x i1> %m, i32 129)
+ ret <256 x i8> %v
+}
+
+; FIXME: The upper half does nothing: with an EVL of 128 it executes with a
+; VL of 0 and could be elided.
+
+define <256 x i8> @vssubu_vi_v258i8_evl128(<256 x i8> %va, <256 x i1> %m) {
+; CHECK-LABEL: vssubu_vi_v258i8_evl128:
+; CHECK: # %bb.0:
+; CHECK-NEXT: li a1, 128
+; CHECK-NEXT: vsetvli zero, a1, e8, m8, ta, ma
+; CHECK-NEXT: vlm.v v24, (a0)
+; CHECK-NEXT: li a0, -1
+; CHECK-NEXT: vssubu.vx v8, v8, a0, v0.t
+; CHECK-NEXT: vsetivli zero, 0, e8, m8, ta, ma
+; CHECK-NEXT: vmv1r.v v0, v24
+; CHECK-NEXT: vssubu.vx v16, v16, a0, v0.t
+; CHECK-NEXT: ret
+ %elt.head = insertelement <256 x i8> poison, i8 -1, i32 0
+ %vb = shufflevector <256 x i8> %elt.head, <256 x i8> poison, <256 x i32> zeroinitializer
+ %v = call <256 x i8> @llvm.vp.usub.sat.v258i8(<256 x i8> %va, <256 x i8> %vb, <256 x i1> %m, i32 128)
+ ret <256 x i8> %v
+}
+
+declare <2 x i16> @llvm.vp.usub.sat.v2i16(<2 x i16>, <2 x i16>, <2 x i1>, i32)
+
+define <2 x i16> @vssubu_vv_v2i16(<2 x i16> %va, <2 x i16> %b, <2 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vssubu_vv_v2i16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e16, mf4, ta, ma
+; CHECK-NEXT: vssubu.vv v8, v8, v9, v0.t
+; CHECK-NEXT: ret
+ %v = call <2 x i16> @llvm.vp.usub.sat.v2i16(<2 x i16> %va, <2 x i16> %b, <2 x i1> %m, i32 %evl)
+ ret <2 x i16> %v
+}
+
+define <2 x i16> @vssubu_vv_v2i16_unmasked(<2 x i16> %va, <2 x i16> %b, i32 zeroext %evl) {
+; CHECK-LABEL: vssubu_vv_v2i16_unmasked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e16, mf4, ta, ma
+; CHECK-NEXT: vssubu.vv v8, v8, v9
+; CHECK-NEXT: ret
+ %head = insertelement <2 x i1> poison, i1 true, i32 0
+ %m = shufflevector <2 x i1> %head, <2 x i1> poison, <2 x i32> zeroinitializer
+ %v = call <2 x i16> @llvm.vp.usub.sat.v2i16(<2 x i16> %va, <2 x i16> %b, <2 x i1> %m, i32 %evl)
+ ret <2 x i16> %v
+}
+
+define <2 x i16> @vssubu_vx_v2i16(<2 x i16> %va, i16 %b, <2 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vssubu_vx_v2i16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma
+; CHECK-NEXT: vssubu.vx v8, v8, a0, v0.t
+; CHECK-NEXT: ret
+ %elt.head = insertelement <2 x i16> poison, i16 %b, i32 0
+ %vb = shufflevector <2 x i16> %elt.head, <2 x i16> poison, <2 x i32> zeroinitializer
+ %v = call <2 x i16> @llvm.vp.usub.sat.v2i16(<2 x i16> %va, <2 x i16> %vb, <2 x i1> %m, i32 %evl)
+ ret <2 x i16> %v
+}
+
+define <2 x i16> @vssubu_vx_v2i16_unmasked(<2 x i16> %va, i16 %b, i32 zeroext %evl) {
+; CHECK-LABEL: vssubu_vx_v2i16_unmasked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma
+; CHECK-NEXT: vssubu.vx v8, v8, a0
+; CHECK-NEXT: ret
+ %elt.head = insertelement <2 x i16> poison, i16 %b, i32 0
+ %vb = shufflevector <2 x i16> %elt.head, <2 x i16> poison, <2 x i32> zeroinitializer
+ %head = insertelement <2 x i1> poison, i1 true, i32 0
+ %m = shufflevector <2 x i1> %head, <2 x i1> poison, <2 x i32> zeroinitializer
+ %v = call <2 x i16> @llvm.vp.usub.sat.v2i16(<2 x i16> %va, <2 x i16> %vb, <2 x i1> %m, i32 %evl)
+ ret <2 x i16> %v
+}
+
+define <2 x i16> @vssubu_vi_v2i16(<2 x i16> %va, <2 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vssubu_vi_v2i16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: li a1, -1
+; CHECK-NEXT: vsetvli zero, a0, e16, mf4, ta, ma
+; CHECK-NEXT: vssubu.vx v8, v8, a1, v0.t
+; CHECK-NEXT: ret
+ %elt.head = insertelement <2 x i16> poison, i16 -1, i32 0
+ %vb = shufflevector <2 x i16> %elt.head, <2 x i16> poison, <2 x i32> zeroinitializer
+ %v = call <2 x i16> @llvm.vp.usub.sat.v2i16(<2 x i16> %va, <2 x i16> %vb, <2 x i1> %m, i32 %evl)
+ ret <2 x i16> %v
+}
+
+define <2 x i16> @vssubu_vi_v2i16_unmasked(<2 x i16> %va, i32 zeroext %evl) {
+; CHECK-LABEL: vssubu_vi_v2i16_unmasked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: li a1, -1
+; CHECK-NEXT: vsetvli zero, a0, e16, mf4, ta, ma
+; CHECK-NEXT: vssubu.vx v8, v8, a1
+; CHECK-NEXT: ret
+ %elt.head = insertelement <2 x i16> poison, i16 -1, i32 0
+ %vb = shufflevector <2 x i16> %elt.head, <2 x i16> poison, <2 x i32> zeroinitializer
+ %head = insertelement <2 x i1> poison, i1 true, i32 0
+ %m = shufflevector <2 x i1> %head, <2 x i1> poison, <2 x i32> zeroinitializer
+ %v = call <2 x i16> @llvm.vp.usub.sat.v2i16(<2 x i16> %va, <2 x i16> %vb, <2 x i1> %m, i32 %evl)
+ ret <2 x i16> %v
+}
+
+declare <4 x i16> @llvm.vp.usub.sat.v4i16(<4 x i16>, <4 x i16>, <4 x i1>, i32)
+
+define <4 x i16> @vssubu_vv_v4i16(<4 x i16> %va, <4 x i16> %b, <4 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vssubu_vv_v4i16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e16, mf2, ta, ma
+; CHECK-NEXT: vssubu.vv v8, v8, v9, v0.t
+; CHECK-NEXT: ret
+ %v = call <4 x i16> @llvm.vp.usub.sat.v4i16(<4 x i16> %va, <4 x i16> %b, <4 x i1> %m, i32 %evl)
+ ret <4 x i16> %v
+}
+
+define <4 x i16> @vssubu_vv_v4i16_unmasked(<4 x i16> %va, <4 x i16> %b, i32 zeroext %evl) {
+; CHECK-LABEL: vssubu_vv_v4i16_unmasked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e16, mf2, ta, ma
+; CHECK-NEXT: vssubu.vv v8, v8, v9
+; CHECK-NEXT: ret
+ %head = insertelement <4 x i1> poison, i1 true, i32 0
+ %m = shufflevector <4 x i1> %head, <4 x i1> poison, <4 x i32> zeroinitializer
+ %v = call <4 x i16> @llvm.vp.usub.sat.v4i16(<4 x i16> %va, <4 x i16> %b, <4 x i1> %m, i32 %evl)
+ ret <4 x i16> %v
+}
+
+define <4 x i16> @vssubu_vx_v4i16(<4 x i16> %va, i16 %b, <4 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vssubu_vx_v4i16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma
+; CHECK-NEXT: vssubu.vx v8, v8, a0, v0.t
+; CHECK-NEXT: ret
+ %elt.head = insertelement <4 x i16> poison, i16 %b, i32 0
+ %vb = shufflevector <4 x i16> %elt.head, <4 x i16> poison, <4 x i32> zeroinitializer
+ %v = call <4 x i16> @llvm.vp.usub.sat.v4i16(<4 x i16> %va, <4 x i16> %vb, <4 x i1> %m, i32 %evl)
+ ret <4 x i16> %v
+}
+
+define <4 x i16> @vssubu_vx_v4i16_unmasked(<4 x i16> %va, i16 %b, i32 zeroext %evl) {
+; CHECK-LABEL: vssubu_vx_v4i16_unmasked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma
+; CHECK-NEXT: vssubu.vx v8, v8, a0
+; CHECK-NEXT: ret
+ %elt.head = insertelement <4 x i16> poison, i16 %b, i32 0
+ %vb = shufflevector <4 x i16> %elt.head, <4 x i16> poison, <4 x i32> zeroinitializer
+ %head = insertelement <4 x i1> poison, i1 true, i32 0
+ %m = shufflevector <4 x i1> %head, <4 x i1> poison, <4 x i32> zeroinitializer
+ %v = call <4 x i16> @llvm.vp.usub.sat.v4i16(<4 x i16> %va, <4 x i16> %vb, <4 x i1> %m, i32 %evl)
+ ret <4 x i16> %v
+}
+
+define <4 x i16> @vssubu_vi_v4i16(<4 x i16> %va, <4 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vssubu_vi_v4i16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: li a1, -1
+; CHECK-NEXT: vsetvli zero, a0, e16, mf2, ta, ma
+; CHECK-NEXT: vssubu.vx v8, v8, a1, v0.t
+; CHECK-NEXT: ret
+ %elt.head = insertelement <4 x i16> poison, i16 -1, i32 0
+ %vb = shufflevector <4 x i16> %elt.head, <4 x i16> poison, <4 x i32> zeroinitializer
+ %v = call <4 x i16> @llvm.vp.usub.sat.v4i16(<4 x i16> %va, <4 x i16> %vb, <4 x i1> %m, i32 %evl)
+ ret <4 x i16> %v
+}
+
+define <4 x i16> @vssubu_vi_v4i16_unmasked(<4 x i16> %va, i32 zeroext %evl) {
+; CHECK-LABEL: vssubu_vi_v4i16_unmasked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: li a1, -1
+; CHECK-NEXT: vsetvli zero, a0, e16, mf2, ta, ma
+; CHECK-NEXT: vssubu.vx v8, v8, a1
+; CHECK-NEXT: ret
+ %elt.head = insertelement <4 x i16> poison, i16 -1, i32 0
+ %vb = shufflevector <4 x i16> %elt.head, <4 x i16> poison, <4 x i32> zeroinitializer
+ %head = insertelement <4 x i1> poison, i1 true, i32 0
+ %m = shufflevector <4 x i1> %head, <4 x i1> poison, <4 x i32> zeroinitializer
+ %v = call <4 x i16> @llvm.vp.usub.sat.v4i16(<4 x i16> %va, <4 x i16> %vb, <4 x i1> %m, i32 %evl)
+ ret <4 x i16> %v
+}
+
+declare <8 x i16> @llvm.vp.usub.sat.v8i16(<8 x i16>, <8 x i16>, <8 x i1>, i32)
+
+define <8 x i16> @vssubu_vv_v8i16(<8 x i16> %va, <8 x i16> %b, <8 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vssubu_vv_v8i16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, ma
+; CHECK-NEXT: vssubu.vv v8, v8, v9, v0.t
+; CHECK-NEXT: ret
+ %v = call <8 x i16> @llvm.vp.usub.sat.v8i16(<8 x i16> %va, <8 x i16> %b, <8 x i1> %m, i32 %evl)
+ ret <8 x i16> %v
+}
+
+define <8 x i16> @vssubu_vv_v8i16_unmasked(<8 x i16> %va, <8 x i16> %b, i32 zeroext %evl) {
+; CHECK-LABEL: vssubu_vv_v8i16_unmasked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, ma
+; CHECK-NEXT: vssubu.vv v8, v8, v9
+; CHECK-NEXT: ret
+ %head = insertelement <8 x i1> poison, i1 true, i32 0
+ %m = shufflevector <8 x i1> %head, <8 x i1> poison, <8 x i32> zeroinitializer
+ %v = call <8 x i16> @llvm.vp.usub.sat.v8i16(<8 x i16> %va, <8 x i16> %b, <8 x i1> %m, i32 %evl)
+ ret <8 x i16> %v
+}
+
+define <8 x i16> @vssubu_vx_v8i16(<8 x i16> %va, i16 %b, <8 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vssubu_vx_v8i16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma
+; CHECK-NEXT: vssubu.vx v8, v8, a0, v0.t
+; CHECK-NEXT: ret
+ %elt.head = insertelement <8 x i16> poison, i16 %b, i32 0
+ %vb = shufflevector <8 x i16> %elt.head, <8 x i16> poison, <8 x i32> zeroinitializer
+ %v = call <8 x i16> @llvm.vp.usub.sat.v8i16(<8 x i16> %va, <8 x i16> %vb, <8 x i1> %m, i32 %evl)
+ ret <8 x i16> %v
+}
+
+define <8 x i16> @vssubu_vx_v8i16_unmasked(<8 x i16> %va, i16 %b, i32 zeroext %evl) {
+; CHECK-LABEL: vssubu_vx_v8i16_unmasked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma
+; CHECK-NEXT: vssubu.vx v8, v8, a0
+; CHECK-NEXT: ret
+ %elt.head = insertelement <8 x i16> poison, i16 %b, i32 0
+ %vb = shufflevector <8 x i16> %elt.head, <8 x i16> poison, <8 x i32> zeroinitializer
+ %head = insertelement <8 x i1> poison, i1 true, i32 0
+ %m = shufflevector <8 x i1> %head, <8 x i1> poison, <8 x i32> zeroinitializer
+ %v = call <8 x i16> @llvm.vp.usub.sat.v8i16(<8 x i16> %va, <8 x i16> %vb, <8 x i1> %m, i32 %evl)
+ ret <8 x i16> %v
+}
+
+define <8 x i16> @vssubu_vi_v8i16(<8 x i16> %va, <8 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vssubu_vi_v8i16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: li a1, -1
+; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, ma
+; CHECK-NEXT: vssubu.vx v8, v8, a1, v0.t
+; CHECK-NEXT: ret
+ %elt.head = insertelement <8 x i16> poison, i16 -1, i32 0
+ %vb = shufflevector <8 x i16> %elt.head, <8 x i16> poison, <8 x i32> zeroinitializer
+ %v = call <8 x i16> @llvm.vp.usub.sat.v8i16(<8 x i16> %va, <8 x i16> %vb, <8 x i1> %m, i32 %evl)
+ ret <8 x i16> %v
+}
+
+define <8 x i16> @vssubu_vi_v8i16_unmasked(<8 x i16> %va, i32 zeroext %evl) {
+; CHECK-LABEL: vssubu_vi_v8i16_unmasked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: li a1, -1
+; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, ma
+; CHECK-NEXT: vssubu.vx v8, v8, a1
+; CHECK-NEXT: ret
+ %elt.head = insertelement <8 x i16> poison, i16 -1, i32 0
+ %vb = shufflevector <8 x i16> %elt.head, <8 x i16> poison, <8 x i32> zeroinitializer
+ %head = insertelement <8 x i1> poison, i1 true, i32 0
+ %m = shufflevector <8 x i1> %head, <8 x i1> poison, <8 x i32> zeroinitializer
+ %v = call <8 x i16> @llvm.vp.usub.sat.v8i16(<8 x i16> %va, <8 x i16> %vb, <8 x i1> %m, i32 %evl)
+ ret <8 x i16> %v
+}
+
+declare <16 x i16> @llvm.vp.usub.sat.v16i16(<16 x i16>, <16 x i16>, <16 x i1>, i32)
+
+define <16 x i16> @vssubu_vv_v16i16(<16 x i16> %va, <16 x i16> %b, <16 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vssubu_vv_v16i16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e16, m2, ta, ma
+; CHECK-NEXT: vssubu.vv v8, v8, v10, v0.t
+; CHECK-NEXT: ret
+ %v = call <16 x i16> @llvm.vp.usub.sat.v16i16(<16 x i16> %va, <16 x i16> %b, <16 x i1> %m, i32 %evl)
+ ret <16 x i16> %v
+}
+
+define <16 x i16> @vssubu_vv_v16i16_unmasked(<16 x i16> %va, <16 x i16> %b, i32 zeroext %evl) {
+; CHECK-LABEL: vssubu_vv_v16i16_unmasked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e16, m2, ta, ma
+; CHECK-NEXT: vssubu.vv v8, v8, v10
+; CHECK-NEXT: ret
+ %head = insertelement <16 x i1> poison, i1 true, i32 0
+ %m = shufflevector <16 x i1> %head, <16 x i1> poison, <16 x i32> zeroinitializer
+ %v = call <16 x i16> @llvm.vp.usub.sat.v16i16(<16 x i16> %va, <16 x i16> %b, <16 x i1> %m, i32 %evl)
+ ret <16 x i16> %v
+}
+
+define <16 x i16> @vssubu_vx_v16i16(<16 x i16> %va, i16 %b, <16 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vssubu_vx_v16i16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, ma
+; CHECK-NEXT: vssubu.vx v8, v8, a0, v0.t
+; CHECK-NEXT: ret
+ %elt.head = insertelement <16 x i16> poison, i16 %b, i32 0
+ %vb = shufflevector <16 x i16> %elt.head, <16 x i16> poison, <16 x i32> zeroinitializer
+ %v = call <16 x i16> @llvm.vp.usub.sat.v16i16(<16 x i16> %va, <16 x i16> %vb, <16 x i1> %m, i32 %evl)
+ ret <16 x i16> %v
+}
+
+define <16 x i16> @vssubu_vx_v16i16_unmasked(<16 x i16> %va, i16 %b, i32 zeroext %evl) {
+; CHECK-LABEL: vssubu_vx_v16i16_unmasked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, ma
+; CHECK-NEXT: vssubu.vx v8, v8, a0
+; CHECK-NEXT: ret
+ %elt.head = insertelement <16 x i16> poison, i16 %b, i32 0
+ %vb = shufflevector <16 x i16> %elt.head, <16 x i16> poison, <16 x i32> zeroinitializer
+ %head = insertelement <16 x i1> poison, i1 true, i32 0
+ %m = shufflevector <16 x i1> %head, <16 x i1> poison, <16 x i32> zeroinitializer
+ %v = call <16 x i16> @llvm.vp.usub.sat.v16i16(<16 x i16> %va, <16 x i16> %vb, <16 x i1> %m, i32 %evl)
+ ret <16 x i16> %v
+}
+
+define <16 x i16> @vssubu_vi_v16i16(<16 x i16> %va, <16 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vssubu_vi_v16i16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: li a1, -1
+; CHECK-NEXT: vsetvli zero, a0, e16, m2, ta, ma
+; CHECK-NEXT: vssubu.vx v8, v8, a1, v0.t
+; CHECK-NEXT: ret
+ %elt.head = insertelement <16 x i16> poison, i16 -1, i32 0
+ %vb = shufflevector <16 x i16> %elt.head, <16 x i16> poison, <16 x i32> zeroinitializer
+ %v = call <16 x i16> @llvm.vp.usub.sat.v16i16(<16 x i16> %va, <16 x i16> %vb, <16 x i1> %m, i32 %evl)
+ ret <16 x i16> %v
+}
+
+define <16 x i16> @vssubu_vi_v16i16_unmasked(<16 x i16> %va, i32 zeroext %evl) {
+; CHECK-LABEL: vssubu_vi_v16i16_unmasked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: li a1, -1
+; CHECK-NEXT: vsetvli zero, a0, e16, m2, ta, ma
+; CHECK-NEXT: vssubu.vx v8, v8, a1
+; CHECK-NEXT: ret
+ %elt.head = insertelement <16 x i16> poison, i16 -1, i32 0
+ %vb = shufflevector <16 x i16> %elt.head, <16 x i16> poison, <16 x i32> zeroinitializer
+ %head = insertelement <16 x i1> poison, i1 true, i32 0
+ %m = shufflevector <16 x i1> %head, <16 x i1> poison, <16 x i32> zeroinitializer
+ %v = call <16 x i16> @llvm.vp.usub.sat.v16i16(<16 x i16> %va, <16 x i16> %vb, <16 x i1> %m, i32 %evl)
+ ret <16 x i16> %v
+}
+
+declare <2 x i32> @llvm.vp.usub.sat.v2i32(<2 x i32>, <2 x i32>, <2 x i1>, i32)
+
+define <2 x i32> @vssubu_vv_v2i32(<2 x i32> %va, <2 x i32> %b, <2 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vssubu_vv_v2i32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, ma
+; CHECK-NEXT: vssubu.vv v8, v8, v9, v0.t
+; CHECK-NEXT: ret
+ %v = call <2 x i32> @llvm.vp.usub.sat.v2i32(<2 x i32> %va, <2 x i32> %b, <2 x i1> %m, i32 %evl)
+ ret <2 x i32> %v
+}
+
+define <2 x i32> @vssubu_vv_v2i32_unmasked(<2 x i32> %va, <2 x i32> %b, i32 zeroext %evl) {
+; CHECK-LABEL: vssubu_vv_v2i32_unmasked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, ma
+; CHECK-NEXT: vssubu.vv v8, v8, v9
+; CHECK-NEXT: ret
+ %head = insertelement <2 x i1> poison, i1 true, i32 0
+ %m = shufflevector <2 x i1> %head, <2 x i1> poison, <2 x i32> zeroinitializer
+ %v = call <2 x i32> @llvm.vp.usub.sat.v2i32(<2 x i32> %va, <2 x i32> %b, <2 x i1> %m, i32 %evl)
+ ret <2 x i32> %v
+}
+
+define <2 x i32> @vssubu_vx_v2i32(<2 x i32> %va, i32 %b, <2 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vssubu_vx_v2i32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma
+; CHECK-NEXT: vssubu.vx v8, v8, a0, v0.t
+; CHECK-NEXT: ret
+ %elt.head = insertelement <2 x i32> poison, i32 %b, i32 0
+ %vb = shufflevector <2 x i32> %elt.head, <2 x i32> poison, <2 x i32> zeroinitializer
+ %v = call <2 x i32> @llvm.vp.usub.sat.v2i32(<2 x i32> %va, <2 x i32> %vb, <2 x i1> %m, i32 %evl)
+ ret <2 x i32> %v
+}
+
+define <2 x i32> @vssubu_vx_v2i32_unmasked(<2 x i32> %va, i32 %b, i32 zeroext %evl) {
+; CHECK-LABEL: vssubu_vx_v2i32_unmasked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma
+; CHECK-NEXT: vssubu.vx v8, v8, a0
+; CHECK-NEXT: ret
+ %elt.head = insertelement <2 x i32> poison, i32 %b, i32 0
+ %vb = shufflevector <2 x i32> %elt.head, <2 x i32> poison, <2 x i32> zeroinitializer
+ %head = insertelement <2 x i1> poison, i1 true, i32 0
+ %m = shufflevector <2 x i1> %head, <2 x i1> poison, <2 x i32> zeroinitializer
+ %v = call <2 x i32> @llvm.vp.usub.sat.v2i32(<2 x i32> %va, <2 x i32> %vb, <2 x i1> %m, i32 %evl)
+ ret <2 x i32> %v
+}
+
+define <2 x i32> @vssubu_vi_v2i32(<2 x i32> %va, <2 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vssubu_vi_v2i32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: li a1, -1
+; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, ma
+; CHECK-NEXT: vssubu.vx v8, v8, a1, v0.t
+; CHECK-NEXT: ret
+ %elt.head = insertelement <2 x i32> poison, i32 -1, i32 0
+ %vb = shufflevector <2 x i32> %elt.head, <2 x i32> poison, <2 x i32> zeroinitializer
+ %v = call <2 x i32> @llvm.vp.usub.sat.v2i32(<2 x i32> %va, <2 x i32> %vb, <2 x i1> %m, i32 %evl)
+ ret <2 x i32> %v
+}
+
+define <2 x i32> @vssubu_vi_v2i32_unmasked(<2 x i32> %va, i32 zeroext %evl) {
+; CHECK-LABEL: vssubu_vi_v2i32_unmasked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: li a1, -1
+; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, ma
+; CHECK-NEXT: vssubu.vx v8, v8, a1
+; CHECK-NEXT: ret
+ %elt.head = insertelement <2 x i32> poison, i32 -1, i32 0
+ %vb = shufflevector <2 x i32> %elt.head, <2 x i32> poison, <2 x i32> zeroinitializer
+ %head = insertelement <2 x i1> poison, i1 true, i32 0
+ %m = shufflevector <2 x i1> %head, <2 x i1> poison, <2 x i32> zeroinitializer
+ %v = call <2 x i32> @llvm.vp.usub.sat.v2i32(<2 x i32> %va, <2 x i32> %vb, <2 x i1> %m, i32 %evl)
+ ret <2 x i32> %v
+}
+
+declare <4 x i32> @llvm.vp.usub.sat.v4i32(<4 x i32>, <4 x i32>, <4 x i1>, i32)
+
+define <4 x i32> @vssubu_vv_v4i32(<4 x i32> %va, <4 x i32> %b, <4 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vssubu_vv_v4i32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, ma
+; CHECK-NEXT: vssubu.vv v8, v8, v9, v0.t
+; CHECK-NEXT: ret
+ %v = call <4 x i32> @llvm.vp.usub.sat.v4i32(<4 x i32> %va, <4 x i32> %b, <4 x i1> %m, i32 %evl)
+ ret <4 x i32> %v
+}
+
+define <4 x i32> @vssubu_vv_v4i32_unmasked(<4 x i32> %va, <4 x i32> %b, i32 zeroext %evl) {
+; CHECK-LABEL: vssubu_vv_v4i32_unmasked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, ma
+; CHECK-NEXT: vssubu.vv v8, v8, v9
+; CHECK-NEXT: ret
+ %head = insertelement <4 x i1> poison, i1 true, i32 0
+ %m = shufflevector <4 x i1> %head, <4 x i1> poison, <4 x i32> zeroinitializer
+ %v = call <4 x i32> @llvm.vp.usub.sat.v4i32(<4 x i32> %va, <4 x i32> %b, <4 x i1> %m, i32 %evl)
+ ret <4 x i32> %v
+}
+
+define <4 x i32> @vssubu_vx_v4i32(<4 x i32> %va, i32 %b, <4 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vssubu_vx_v4i32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma
+; CHECK-NEXT: vssubu.vx v8, v8, a0, v0.t
+; CHECK-NEXT: ret
+ %elt.head = insertelement <4 x i32> poison, i32 %b, i32 0
+ %vb = shufflevector <4 x i32> %elt.head, <4 x i32> poison, <4 x i32> zeroinitializer
+ %v = call <4 x i32> @llvm.vp.usub.sat.v4i32(<4 x i32> %va, <4 x i32> %vb, <4 x i1> %m, i32 %evl)
+ ret <4 x i32> %v
+}
+
+define <4 x i32> @vssubu_vx_v4i32_unmasked(<4 x i32> %va, i32 %b, i32 zeroext %evl) {
+; CHECK-LABEL: vssubu_vx_v4i32_unmasked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma
+; CHECK-NEXT: vssubu.vx v8, v8, a0
+; CHECK-NEXT: ret
+ %elt.head = insertelement <4 x i32> poison, i32 %b, i32 0
+ %vb = shufflevector <4 x i32> %elt.head, <4 x i32> poison, <4 x i32> zeroinitializer
+ %head = insertelement <4 x i1> poison, i1 true, i32 0
+ %m = shufflevector <4 x i1> %head, <4 x i1> poison, <4 x i32> zeroinitializer
+ %v = call <4 x i32> @llvm.vp.usub.sat.v4i32(<4 x i32> %va, <4 x i32> %vb, <4 x i1> %m, i32 %evl)
+ ret <4 x i32> %v
+}
+
+define <4 x i32> @vssubu_vi_v4i32(<4 x i32> %va, <4 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vssubu_vi_v4i32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: li a1, -1
+; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, ma
+; CHECK-NEXT: vssubu.vx v8, v8, a1, v0.t
+; CHECK-NEXT: ret
+ %elt.head = insertelement <4 x i32> poison, i32 -1, i32 0
+ %vb = shufflevector <4 x i32> %elt.head, <4 x i32> poison, <4 x i32> zeroinitializer
+ %v = call <4 x i32> @llvm.vp.usub.sat.v4i32(<4 x i32> %va, <4 x i32> %vb, <4 x i1> %m, i32 %evl)
+ ret <4 x i32> %v
+}
+
+define <4 x i32> @vssubu_vi_v4i32_unmasked(<4 x i32> %va, i32 zeroext %evl) {
+; CHECK-LABEL: vssubu_vi_v4i32_unmasked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: li a1, -1
+; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, ma
+; CHECK-NEXT: vssubu.vx v8, v8, a1
+; CHECK-NEXT: ret
+ %elt.head = insertelement <4 x i32> poison, i32 -1, i32 0
+ %vb = shufflevector <4 x i32> %elt.head, <4 x i32> poison, <4 x i32> zeroinitializer
+ %head = insertelement <4 x i1> poison, i1 true, i32 0
+ %m = shufflevector <4 x i1> %head, <4 x i1> poison, <4 x i32> zeroinitializer
+ %v = call <4 x i32> @llvm.vp.usub.sat.v4i32(<4 x i32> %va, <4 x i32> %vb, <4 x i1> %m, i32 %evl)
+ ret <4 x i32> %v
+}
+
+declare <8 x i32> @llvm.vp.usub.sat.v8i32(<8 x i32>, <8 x i32>, <8 x i1>, i32)
+
+define <8 x i32> @vssubu_vv_v8i32(<8 x i32> %va, <8 x i32> %b, <8 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vssubu_vv_v8i32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, ma
+; CHECK-NEXT: vssubu.vv v8, v8, v10, v0.t
+; CHECK-NEXT: ret
+ %v = call <8 x i32> @llvm.vp.usub.sat.v8i32(<8 x i32> %va, <8 x i32> %b, <8 x i1> %m, i32 %evl)
+ ret <8 x i32> %v
+}
+
+define <8 x i32> @vssubu_vv_v8i32_unmasked(<8 x i32> %va, <8 x i32> %b, i32 zeroext %evl) {
+; CHECK-LABEL: vssubu_vv_v8i32_unmasked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, ma
+; CHECK-NEXT: vssubu.vv v8, v8, v10
+; CHECK-NEXT: ret
+ %head = insertelement <8 x i1> poison, i1 true, i32 0
+ %m = shufflevector <8 x i1> %head, <8 x i1> poison, <8 x i32> zeroinitializer
+ %v = call <8 x i32> @llvm.vp.usub.sat.v8i32(<8 x i32> %va, <8 x i32> %b, <8 x i1> %m, i32 %evl)
+ ret <8 x i32> %v
+}
+
+define <8 x i32> @vssubu_vx_v8i32(<8 x i32> %va, i32 %b, <8 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vssubu_vx_v8i32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, ma
+; CHECK-NEXT: vssubu.vx v8, v8, a0, v0.t
+; CHECK-NEXT: ret
+ %elt.head = insertelement <8 x i32> poison, i32 %b, i32 0
+ %vb = shufflevector <8 x i32> %elt.head, <8 x i32> poison, <8 x i32> zeroinitializer
+ %v = call <8 x i32> @llvm.vp.usub.sat.v8i32(<8 x i32> %va, <8 x i32> %vb, <8 x i1> %m, i32 %evl)
+ ret <8 x i32> %v
+}
+
+define <8 x i32> @vssubu_vx_v8i32_unmasked(<8 x i32> %va, i32 %b, i32 zeroext %evl) {
+; CHECK-LABEL: vssubu_vx_v8i32_unmasked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, ma
+; CHECK-NEXT: vssubu.vx v8, v8, a0
+; CHECK-NEXT: ret
+ %elt.head = insertelement <8 x i32> poison, i32 %b, i32 0
+ %vb = shufflevector <8 x i32> %elt.head, <8 x i32> poison, <8 x i32> zeroinitializer
+ %head = insertelement <8 x i1> poison, i1 true, i32 0
+ %m = shufflevector <8 x i1> %head, <8 x i1> poison, <8 x i32> zeroinitializer
+ %v = call <8 x i32> @llvm.vp.usub.sat.v8i32(<8 x i32> %va, <8 x i32> %vb, <8 x i1> %m, i32 %evl)
+ ret <8 x i32> %v
+}
+
+define <8 x i32> @vssubu_vi_v8i32(<8 x i32> %va, <8 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vssubu_vi_v8i32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: li a1, -1
+; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, ma
+; CHECK-NEXT: vssubu.vx v8, v8, a1, v0.t
+; CHECK-NEXT: ret
+ %elt.head = insertelement <8 x i32> poison, i32 -1, i32 0
+ %vb = shufflevector <8 x i32> %elt.head, <8 x i32> poison, <8 x i32> zeroinitializer
+ %v = call <8 x i32> @llvm.vp.usub.sat.v8i32(<8 x i32> %va, <8 x i32> %vb, <8 x i1> %m, i32 %evl)
+ ret <8 x i32> %v
+}
+
+define <8 x i32> @vssubu_vi_v8i32_unmasked(<8 x i32> %va, i32 zeroext %evl) {
+; CHECK-LABEL: vssubu_vi_v8i32_unmasked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: li a1, -1
+; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, ma
+; CHECK-NEXT: vssubu.vx v8, v8, a1
+; CHECK-NEXT: ret
+ %elt.head = insertelement <8 x i32> poison, i32 -1, i32 0
+ %vb = shufflevector <8 x i32> %elt.head, <8 x i32> poison, <8 x i32> zeroinitializer
+ %head = insertelement <8 x i1> poison, i1 true, i32 0
+ %m = shufflevector <8 x i1> %head, <8 x i1> poison, <8 x i32> zeroinitializer
+ %v = call <8 x i32> @llvm.vp.usub.sat.v8i32(<8 x i32> %va, <8 x i32> %vb, <8 x i1> %m, i32 %evl)
+ ret <8 x i32> %v
+}
+
+declare <16 x i32> @llvm.vp.usub.sat.v16i32(<16 x i32>, <16 x i32>, <16 x i1>, i32)
+
+define <16 x i32> @vssubu_vv_v16i32(<16 x i32> %va, <16 x i32> %b, <16 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vssubu_vv_v16i32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, ma
+; CHECK-NEXT: vssubu.vv v8, v8, v12, v0.t
+; CHECK-NEXT: ret
+ %v = call <16 x i32> @llvm.vp.usub.sat.v16i32(<16 x i32> %va, <16 x i32> %b, <16 x i1> %m, i32 %evl)
+ ret <16 x i32> %v
+}
+
+define <16 x i32> @vssubu_vv_v16i32_unmasked(<16 x i32> %va, <16 x i32> %b, i32 zeroext %evl) {
+; CHECK-LABEL: vssubu_vv_v16i32_unmasked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, ma
+; CHECK-NEXT: vssubu.vv v8, v8, v12
+; CHECK-NEXT: ret
+ %head = insertelement <16 x i1> poison, i1 true, i32 0
+ %m = shufflevector <16 x i1> %head, <16 x i1> poison, <16 x i32> zeroinitializer
+ %v = call <16 x i32> @llvm.vp.usub.sat.v16i32(<16 x i32> %va, <16 x i32> %b, <16 x i1> %m, i32 %evl)
+ ret <16 x i32> %v
+}
+
+define <16 x i32> @vssubu_vx_v16i32(<16 x i32> %va, i32 %b, <16 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vssubu_vx_v16i32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e32, m4, ta, ma
+; CHECK-NEXT: vssubu.vx v8, v8, a0, v0.t
+; CHECK-NEXT: ret
+ %elt.head = insertelement <16 x i32> poison, i32 %b, i32 0
+ %vb = shufflevector <16 x i32> %elt.head, <16 x i32> poison, <16 x i32> zeroinitializer
+ %v = call <16 x i32> @llvm.vp.usub.sat.v16i32(<16 x i32> %va, <16 x i32> %vb, <16 x i1> %m, i32 %evl)
+ ret <16 x i32> %v
+}
+
+define <16 x i32> @vssubu_vx_v16i32_unmasked(<16 x i32> %va, i32 %b, i32 zeroext %evl) {
+; CHECK-LABEL: vssubu_vx_v16i32_unmasked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e32, m4, ta, ma
+; CHECK-NEXT: vssubu.vx v8, v8, a0
+; CHECK-NEXT: ret
+ %elt.head = insertelement <16 x i32> poison, i32 %b, i32 0
+ %vb = shufflevector <16 x i32> %elt.head, <16 x i32> poison, <16 x i32> zeroinitializer
+ %head = insertelement <16 x i1> poison, i1 true, i32 0
+ %m = shufflevector <16 x i1> %head, <16 x i1> poison, <16 x i32> zeroinitializer
+ %v = call <16 x i32> @llvm.vp.usub.sat.v16i32(<16 x i32> %va, <16 x i32> %vb, <16 x i1> %m, i32 %evl)
+ ret <16 x i32> %v
+}
+
+define <16 x i32> @vssubu_vi_v16i32(<16 x i32> %va, <16 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vssubu_vi_v16i32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: li a1, -1
+; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, ma
+; CHECK-NEXT: vssubu.vx v8, v8, a1, v0.t
+; CHECK-NEXT: ret
+ %elt.head = insertelement <16 x i32> poison, i32 -1, i32 0
+ %vb = shufflevector <16 x i32> %elt.head, <16 x i32> poison, <16 x i32> zeroinitializer
+ %v = call <16 x i32> @llvm.vp.usub.sat.v16i32(<16 x i32> %va, <16 x i32> %vb, <16 x i1> %m, i32 %evl)
+ ret <16 x i32> %v
+}
+
+define <16 x i32> @vssubu_vi_v16i32_unmasked(<16 x i32> %va, i32 zeroext %evl) {
+; CHECK-LABEL: vssubu_vi_v16i32_unmasked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: li a1, -1
+; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, ma
+; CHECK-NEXT: vssubu.vx v8, v8, a1
+; CHECK-NEXT: ret
+ %elt.head = insertelement <16 x i32> poison, i32 -1, i32 0
+ %vb = shufflevector <16 x i32> %elt.head, <16 x i32> poison, <16 x i32> zeroinitializer
+ %head = insertelement <16 x i1> poison, i1 true, i32 0
+ %m = shufflevector <16 x i1> %head, <16 x i1> poison, <16 x i32> zeroinitializer
+ %v = call <16 x i32> @llvm.vp.usub.sat.v16i32(<16 x i32> %va, <16 x i32> %vb, <16 x i1> %m, i32 %evl)
+ ret <16 x i32> %v
+}
+
+declare <2 x i64> @llvm.vp.usub.sat.v2i64(<2 x i64>, <2 x i64>, <2 x i1>, i32)
+
+define <2 x i64> @vssubu_vv_v2i64(<2 x i64> %va, <2 x i64> %b, <2 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vssubu_vv_v2i64:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, ma
+; CHECK-NEXT: vssubu.vv v8, v8, v9, v0.t
+; CHECK-NEXT: ret
+ %v = call <2 x i64> @llvm.vp.usub.sat.v2i64(<2 x i64> %va, <2 x i64> %b, <2 x i1> %m, i32 %evl)
+ ret <2 x i64> %v
+}
+
+define <2 x i64> @vssubu_vv_v2i64_unmasked(<2 x i64> %va, <2 x i64> %b, i32 zeroext %evl) {
+; CHECK-LABEL: vssubu_vv_v2i64_unmasked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, ma
+; CHECK-NEXT: vssubu.vv v8, v8, v9
+; CHECK-NEXT: ret
+ %head = insertelement <2 x i1> poison, i1 true, i32 0
+ %m = shufflevector <2 x i1> %head, <2 x i1> poison, <2 x i32> zeroinitializer
+ %v = call <2 x i64> @llvm.vp.usub.sat.v2i64(<2 x i64> %va, <2 x i64> %b, <2 x i1> %m, i32 %evl)
+ ret <2 x i64> %v
+}
+
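+; The i64 scalar splat differs between RV32 and RV64: RV32 has no 64-bit GPR,
+; so the two halves of %b are spilled to the stack and splatted with a
+; zero-stride vlse64.v, and the .vv form is used; RV64 selects vssubu.vx
+; directly.
+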
+define <2 x i64> @vssubu_vx_v2i64(<2 x i64> %va, i64 %b, <2 x i1> %m, i32 zeroext %evl) {
+; RV32-LABEL: vssubu_vx_v2i64:
+; RV32: # %bb.0:
+; RV32-NEXT: addi sp, sp, -16
+; RV32-NEXT: .cfi_def_cfa_offset 16
+; RV32-NEXT: sw a1, 12(sp)
+; RV32-NEXT: sw a0, 8(sp)
+; RV32-NEXT: addi a0, sp, 8
+; RV32-NEXT: vsetivli zero, 2, e64, m1, ta, ma
+; RV32-NEXT: vlse64.v v9, (a0), zero
+; RV32-NEXT: vsetvli zero, a2, e64, m1, ta, ma
+; RV32-NEXT: vssubu.vv v8, v8, v9, v0.t
+; RV32-NEXT: addi sp, sp, 16
+; RV32-NEXT: ret
+;
+; RV64-LABEL: vssubu_vx_v2i64:
+; RV64: # %bb.0:
+; RV64-NEXT: vsetvli zero, a1, e64, m1, ta, ma
+; RV64-NEXT: vssubu.vx v8, v8, a0, v0.t
+; RV64-NEXT: ret
+ %elt.head = insertelement <2 x i64> poison, i64 %b, i32 0
+ %vb = shufflevector <2 x i64> %elt.head, <2 x i64> poison, <2 x i32> zeroinitializer
+ %v = call <2 x i64> @llvm.vp.usub.sat.v2i64(<2 x i64> %va, <2 x i64> %vb, <2 x i1> %m, i32 %evl)
+ ret <2 x i64> %v
+}
+
+define <2 x i64> @vssubu_vx_v2i64_unmasked(<2 x i64> %va, i64 %b, i32 zeroext %evl) {
+; RV32-LABEL: vssubu_vx_v2i64_unmasked:
+; RV32: # %bb.0:
+; RV32-NEXT: addi sp, sp, -16
+; RV32-NEXT: .cfi_def_cfa_offset 16
+; RV32-NEXT: sw a1, 12(sp)
+; RV32-NEXT: sw a0, 8(sp)
+; RV32-NEXT: addi a0, sp, 8
+; RV32-NEXT: vsetivli zero, 2, e64, m1, ta, ma
+; RV32-NEXT: vlse64.v v9, (a0), zero
+; RV32-NEXT: vsetvli zero, a2, e64, m1, ta, ma
+; RV32-NEXT: vssubu.vv v8, v8, v9
+; RV32-NEXT: addi sp, sp, 16
+; RV32-NEXT: ret
+;
+; RV64-LABEL: vssubu_vx_v2i64_unmasked:
+; RV64: # %bb.0:
+; RV64-NEXT: vsetvli zero, a1, e64, m1, ta, ma
+; RV64-NEXT: vssubu.vx v8, v8, a0
+; RV64-NEXT: ret
+ %elt.head = insertelement <2 x i64> poison, i64 %b, i32 0
+ %vb = shufflevector <2 x i64> %elt.head, <2 x i64> poison, <2 x i32> zeroinitializer
+ %head = insertelement <2 x i1> poison, i1 true, i32 0
+ %m = shufflevector <2 x i1> %head, <2 x i1> poison, <2 x i32> zeroinitializer
+ %v = call <2 x i64> @llvm.vp.usub.sat.v2i64(<2 x i64> %va, <2 x i64> %vb, <2 x i1> %m, i32 %evl)
+ ret <2 x i64> %v
+}
+
+define <2 x i64> @vssubu_vi_v2i64(<2 x i64> %va, <2 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vssubu_vi_v2i64:
+; CHECK: # %bb.0:
+; CHECK-NEXT: li a1, -1
+; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, ma
+; CHECK-NEXT: vssubu.vx v8, v8, a1, v0.t
+; CHECK-NEXT: ret
+ %elt.head = insertelement <2 x i64> poison, i64 -1, i32 0
+ %vb = shufflevector <2 x i64> %elt.head, <2 x i64> poison, <2 x i32> zeroinitializer
+ %v = call <2 x i64> @llvm.vp.usub.sat.v2i64(<2 x i64> %va, <2 x i64> %vb, <2 x i1> %m, i32 %evl)
+ ret <2 x i64> %v
+}
+
+define <2 x i64> @vssubu_vi_v2i64_unmasked(<2 x i64> %va, i32 zeroext %evl) {
+; CHECK-LABEL: vssubu_vi_v2i64_unmasked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: li a1, -1
+; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, ma
+; CHECK-NEXT: vssubu.vx v8, v8, a1
+; CHECK-NEXT: ret
+ %elt.head = insertelement <2 x i64> poison, i64 -1, i32 0
+ %vb = shufflevector <2 x i64> %elt.head, <2 x i64> poison, <2 x i32> zeroinitializer
+ %head = insertelement <2 x i1> poison, i1 true, i32 0
+ %m = shufflevector <2 x i1> %head, <2 x i1> poison, <2 x i32> zeroinitializer
+ %v = call <2 x i64> @llvm.vp.usub.sat.v2i64(<2 x i64> %va, <2 x i64> %vb, <2 x i1> %m, i32 %evl)
+ ret <2 x i64> %v
+}
+
+declare <4 x i64> @llvm.vp.usub.sat.v4i64(<4 x i64>, <4 x i64>, <4 x i1>, i32)
+
+define <4 x i64> @vssubu_vv_v4i64(<4 x i64> %va, <4 x i64> %b, <4 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vssubu_vv_v4i64:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e64, m2, ta, ma
+; CHECK-NEXT: vssubu.vv v8, v8, v10, v0.t
+; CHECK-NEXT: ret
+ %v = call <4 x i64> @llvm.vp.usub.sat.v4i64(<4 x i64> %va, <4 x i64> %b, <4 x i1> %m, i32 %evl)
+ ret <4 x i64> %v
+}
+
+define <4 x i64> @vssubu_vv_v4i64_unmasked(<4 x i64> %va, <4 x i64> %b, i32 zeroext %evl) {
+; CHECK-LABEL: vssubu_vv_v4i64_unmasked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e64, m2, ta, ma
+; CHECK-NEXT: vssubu.vv v8, v8, v10
+; CHECK-NEXT: ret
+ %head = insertelement <4 x i1> poison, i1 true, i32 0
+ %m = shufflevector <4 x i1> %head, <4 x i1> poison, <4 x i32> zeroinitializer
+ %v = call <4 x i64> @llvm.vp.usub.sat.v4i64(<4 x i64> %va, <4 x i64> %b, <4 x i1> %m, i32 %evl)
+ ret <4 x i64> %v
+}
+
+define <4 x i64> @vssubu_vx_v4i64(<4 x i64> %va, i64 %b, <4 x i1> %m, i32 zeroext %evl) {
+; RV32-LABEL: vssubu_vx_v4i64:
+; RV32: # %bb.0:
+; RV32-NEXT: addi sp, sp, -16
+; RV32-NEXT: .cfi_def_cfa_offset 16
+; RV32-NEXT: sw a1, 12(sp)
+; RV32-NEXT: sw a0, 8(sp)
+; RV32-NEXT: addi a0, sp, 8
+; RV32-NEXT: vsetivli zero, 4, e64, m2, ta, ma
+; RV32-NEXT: vlse64.v v10, (a0), zero
+; RV32-NEXT: vsetvli zero, a2, e64, m2, ta, ma
+; RV32-NEXT: vssubu.vv v8, v8, v10, v0.t
+; RV32-NEXT: addi sp, sp, 16
+; RV32-NEXT: ret
+;
+; RV64-LABEL: vssubu_vx_v4i64:
+; RV64: # %bb.0:
+; RV64-NEXT: vsetvli zero, a1, e64, m2, ta, ma
+; RV64-NEXT: vssubu.vx v8, v8, a0, v0.t
+; RV64-NEXT: ret
+ %elt.head = insertelement <4 x i64> poison, i64 %b, i32 0
+ %vb = shufflevector <4 x i64> %elt.head, <4 x i64> poison, <4 x i32> zeroinitializer
+ %v = call <4 x i64> @llvm.vp.usub.sat.v4i64(<4 x i64> %va, <4 x i64> %vb, <4 x i1> %m, i32 %evl)
+ ret <4 x i64> %v
+}
+
+define <4 x i64> @vssubu_vx_v4i64_unmasked(<4 x i64> %va, i64 %b, i32 zeroext %evl) {
+; RV32-LABEL: vssubu_vx_v4i64_unmasked:
+; RV32: # %bb.0:
+; RV32-NEXT: addi sp, sp, -16
+; RV32-NEXT: .cfi_def_cfa_offset 16
+; RV32-NEXT: sw a1, 12(sp)
+; RV32-NEXT: sw a0, 8(sp)
+; RV32-NEXT: addi a0, sp, 8
+; RV32-NEXT: vsetivli zero, 4, e64, m2, ta, ma
+; RV32-NEXT: vlse64.v v10, (a0), zero
+; RV32-NEXT: vsetvli zero, a2, e64, m2, ta, ma
+; RV32-NEXT: vssubu.vv v8, v8, v10
+; RV32-NEXT: addi sp, sp, 16
+; RV32-NEXT: ret
+;
+; RV64-LABEL: vssubu_vx_v4i64_unmasked:
+; RV64: # %bb.0:
+; RV64-NEXT: vsetvli zero, a1, e64, m2, ta, ma
+; RV64-NEXT: vssubu.vx v8, v8, a0
+; RV64-NEXT: ret
+ %elt.head = insertelement <4 x i64> poison, i64 %b, i32 0
+ %vb = shufflevector <4 x i64> %elt.head, <4 x i64> poison, <4 x i32> zeroinitializer
+ %head = insertelement <4 x i1> poison, i1 true, i32 0
+ %m = shufflevector <4 x i1> %head, <4 x i1> poison, <4 x i32> zeroinitializer
+ %v = call <4 x i64> @llvm.vp.usub.sat.v4i64(<4 x i64> %va, <4 x i64> %vb, <4 x i1> %m, i32 %evl)
+ ret <4 x i64> %v
+}
+
+define <4 x i64> @vssubu_vi_v4i64(<4 x i64> %va, <4 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vssubu_vi_v4i64:
+; CHECK: # %bb.0:
+; CHECK-NEXT: li a1, -1
+; CHECK-NEXT: vsetvli zero, a0, e64, m2, ta, ma
+; CHECK-NEXT: vssubu.vx v8, v8, a1, v0.t
+; CHECK-NEXT: ret
+ %elt.head = insertelement <4 x i64> poison, i64 -1, i32 0
+ %vb = shufflevector <4 x i64> %elt.head, <4 x i64> poison, <4 x i32> zeroinitializer
+ %v = call <4 x i64> @llvm.vp.usub.sat.v4i64(<4 x i64> %va, <4 x i64> %vb, <4 x i1> %m, i32 %evl)
+ ret <4 x i64> %v
+}
+
+define <4 x i64> @vssubu_vi_v4i64_unmasked(<4 x i64> %va, i32 zeroext %evl) {
+; CHECK-LABEL: vssubu_vi_v4i64_unmasked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: li a1, -1
+; CHECK-NEXT: vsetvli zero, a0, e64, m2, ta, ma
+; CHECK-NEXT: vssubu.vx v8, v8, a1
+; CHECK-NEXT: ret
+ %elt.head = insertelement <4 x i64> poison, i64 -1, i32 0
+ %vb = shufflevector <4 x i64> %elt.head, <4 x i64> poison, <4 x i32> zeroinitializer
+ %head = insertelement <4 x i1> poison, i1 true, i32 0
+ %m = shufflevector <4 x i1> %head, <4 x i1> poison, <4 x i32> zeroinitializer
+ %v = call <4 x i64> @llvm.vp.usub.sat.v4i64(<4 x i64> %va, <4 x i64> %vb, <4 x i1> %m, i32 %evl)
+ ret <4 x i64> %v
+}
+
+declare <8 x i64> @llvm.vp.usub.sat.v8i64(<8 x i64>, <8 x i64>, <8 x i1>, i32)
+
+define <8 x i64> @vssubu_vv_v8i64(<8 x i64> %va, <8 x i64> %b, <8 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vssubu_vv_v8i64:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e64, m4, ta, ma
+; CHECK-NEXT: vssubu.vv v8, v8, v12, v0.t
+; CHECK-NEXT: ret
+ %v = call <8 x i64> @llvm.vp.usub.sat.v8i64(<8 x i64> %va, <8 x i64> %b, <8 x i1> %m, i32 %evl)
+ ret <8 x i64> %v
+}
+
+define <8 x i64> @vssubu_vv_v8i64_unmasked(<8 x i64> %va, <8 x i64> %b, i32 zeroext %evl) {
+; CHECK-LABEL: vssubu_vv_v8i64_unmasked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e64, m4, ta, ma
+; CHECK-NEXT: vssubu.vv v8, v8, v12
+; CHECK-NEXT: ret
+ %head = insertelement <8 x i1> poison, i1 true, i32 0
+ %m = shufflevector <8 x i1> %head, <8 x i1> poison, <8 x i32> zeroinitializer
+ %v = call <8 x i64> @llvm.vp.usub.sat.v8i64(<8 x i64> %va, <8 x i64> %b, <8 x i1> %m, i32 %evl)
+ ret <8 x i64> %v
+}
+
+define <8 x i64> @vssubu_vx_v8i64(<8 x i64> %va, i64 %b, <8 x i1> %m, i32 zeroext %evl) {
+; RV32-LABEL: vssubu_vx_v8i64:
+; RV32: # %bb.0:
+; RV32-NEXT: addi sp, sp, -16
+; RV32-NEXT: .cfi_def_cfa_offset 16
+; RV32-NEXT: sw a1, 12(sp)
+; RV32-NEXT: sw a0, 8(sp)
+; RV32-NEXT: addi a0, sp, 8
+; RV32-NEXT: vsetivli zero, 8, e64, m4, ta, ma
+; RV32-NEXT: vlse64.v v12, (a0), zero
+; RV32-NEXT: vsetvli zero, a2, e64, m4, ta, ma
+; RV32-NEXT: vssubu.vv v8, v8, v12, v0.t
+; RV32-NEXT: addi sp, sp, 16
+; RV32-NEXT: ret
+;
+; RV64-LABEL: vssubu_vx_v8i64:
+; RV64: # %bb.0:
+; RV64-NEXT: vsetvli zero, a1, e64, m4, ta, ma
+; RV64-NEXT: vssubu.vx v8, v8, a0, v0.t
+; RV64-NEXT: ret
+ %elt.head = insertelement <8 x i64> poison, i64 %b, i32 0
+ %vb = shufflevector <8 x i64> %elt.head, <8 x i64> poison, <8 x i32> zeroinitializer
+ %v = call <8 x i64> @llvm.vp.usub.sat.v8i64(<8 x i64> %va, <8 x i64> %vb, <8 x i1> %m, i32 %evl)
+ ret <8 x i64> %v
+}
+
+define <8 x i64> @vssubu_vx_v8i64_unmasked(<8 x i64> %va, i64 %b, i32 zeroext %evl) {
+; RV32-LABEL: vssubu_vx_v8i64_unmasked:
+; RV32: # %bb.0:
+; RV32-NEXT: addi sp, sp, -16
+; RV32-NEXT: .cfi_def_cfa_offset 16
+; RV32-NEXT: sw a1, 12(sp)
+; RV32-NEXT: sw a0, 8(sp)
+; RV32-NEXT: addi a0, sp, 8
+; RV32-NEXT: vsetivli zero, 8, e64, m4, ta, ma
+; RV32-NEXT: vlse64.v v12, (a0), zero
+; RV32-NEXT: vsetvli zero, a2, e64, m4, ta, ma
+; RV32-NEXT: vssubu.vv v8, v8, v12
+; RV32-NEXT: addi sp, sp, 16
+; RV32-NEXT: ret
+;
+; RV64-LABEL: vssubu_vx_v8i64_unmasked:
+; RV64: # %bb.0:
+; RV64-NEXT: vsetvli zero, a1, e64, m4, ta, ma
+; RV64-NEXT: vssubu.vx v8, v8, a0
+; RV64-NEXT: ret
+ %elt.head = insertelement <8 x i64> poison, i64 %b, i32 0
+ %vb = shufflevector <8 x i64> %elt.head, <8 x i64> poison, <8 x i32> zeroinitializer
+ %head = insertelement <8 x i1> poison, i1 true, i32 0
+ %m = shufflevector <8 x i1> %head, <8 x i1> poison, <8 x i32> zeroinitializer
+ %v = call <8 x i64> @llvm.vp.usub.sat.v8i64(<8 x i64> %va, <8 x i64> %vb, <8 x i1> %m, i32 %evl)
+ ret <8 x i64> %v
+}
+
+define <8 x i64> @vssubu_vi_v8i64(<8 x i64> %va, <8 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vssubu_vi_v8i64:
+; CHECK: # %bb.0:
+; CHECK-NEXT: li a1, -1
+; CHECK-NEXT: vsetvli zero, a0, e64, m4, ta, ma
+; CHECK-NEXT: vssubu.vx v8, v8, a1, v0.t
+; CHECK-NEXT: ret
+ %elt.head = insertelement <8 x i64> poison, i64 -1, i32 0
+ %vb = shufflevector <8 x i64> %elt.head, <8 x i64> poison, <8 x i32> zeroinitializer
+ %v = call <8 x i64> @llvm.vp.usub.sat.v8i64(<8 x i64> %va, <8 x i64> %vb, <8 x i1> %m, i32 %evl)
+ ret <8 x i64> %v
+}
+
+define <8 x i64> @vssubu_vi_v8i64_unmasked(<8 x i64> %va, i32 zeroext %evl) {
+; CHECK-LABEL: vssubu_vi_v8i64_unmasked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: li a1, -1
+; CHECK-NEXT: vsetvli zero, a0, e64, m4, ta, ma
+; CHECK-NEXT: vssubu.vx v8, v8, a1
+; CHECK-NEXT: ret
+ %elt.head = insertelement <8 x i64> poison, i64 -1, i32 0
+ %vb = shufflevector <8 x i64> %elt.head, <8 x i64> poison, <8 x i32> zeroinitializer
+ %head = insertelement <8 x i1> poison, i1 true, i32 0
+ %m = shufflevector <8 x i1> %head, <8 x i1> poison, <8 x i32> zeroinitializer
+ %v = call <8 x i64> @llvm.vp.usub.sat.v8i64(<8 x i64> %va, <8 x i64> %vb, <8 x i1> %m, i32 %evl)
+ ret <8 x i64> %v
+}
+
+declare <16 x i64> @llvm.vp.usub.sat.v16i64(<16 x i64>, <16 x i64>, <16 x i1>, i32)
+
+define <16 x i64> @vssubu_vv_v16i64(<16 x i64> %va, <16 x i64> %b, <16 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vssubu_vv_v16i64:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma
+; CHECK-NEXT: vssubu.vv v8, v8, v16, v0.t
+; CHECK-NEXT: ret
+ %v = call <16 x i64> @llvm.vp.usub.sat.v16i64(<16 x i64> %va, <16 x i64> %b, <16 x i1> %m, i32 %evl)
+ ret <16 x i64> %v
+}
+
+define <16 x i64> @vssubu_vv_v16i64_unmasked(<16 x i64> %va, <16 x i64> %b, i32 zeroext %evl) {
+; CHECK-LABEL: vssubu_vv_v16i64_unmasked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma
+; CHECK-NEXT: vssubu.vv v8, v8, v16
+; CHECK-NEXT: ret
+ %head = insertelement <16 x i1> poison, i1 true, i32 0
+ %m = shufflevector <16 x i1> %head, <16 x i1> poison, <16 x i32> zeroinitializer
+ %v = call <16 x i64> @llvm.vp.usub.sat.v16i64(<16 x i64> %va, <16 x i64> %b, <16 x i1> %m, i32 %evl)
+ ret <16 x i64> %v
+}
+
+define <16 x i64> @vssubu_vx_v16i64(<16 x i64> %va, i64 %b, <16 x i1> %m, i32 zeroext %evl) {
+; RV32-LABEL: vssubu_vx_v16i64:
+; RV32: # %bb.0:
+; RV32-NEXT: addi sp, sp, -16
+; RV32-NEXT: .cfi_def_cfa_offset 16
+; RV32-NEXT: sw a1, 12(sp)
+; RV32-NEXT: sw a0, 8(sp)
+; RV32-NEXT: addi a0, sp, 8
+; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma
+; RV32-NEXT: vlse64.v v16, (a0), zero
+; RV32-NEXT: vsetvli zero, a2, e64, m8, ta, ma
+; RV32-NEXT: vssubu.vv v8, v8, v16, v0.t
+; RV32-NEXT: addi sp, sp, 16
+; RV32-NEXT: ret
+;
+; RV64-LABEL: vssubu_vx_v16i64:
+; RV64: # %bb.0:
+; RV64-NEXT: vsetvli zero, a1, e64, m8, ta, ma
+; RV64-NEXT: vssubu.vx v8, v8, a0, v0.t
+; RV64-NEXT: ret
+ %elt.head = insertelement <16 x i64> poison, i64 %b, i32 0
+ %vb = shufflevector <16 x i64> %elt.head, <16 x i64> poison, <16 x i32> zeroinitializer
+ %v = call <16 x i64> @llvm.vp.usub.sat.v16i64(<16 x i64> %va, <16 x i64> %vb, <16 x i1> %m, i32 %evl)
+ ret <16 x i64> %v
+}
+
+define <16 x i64> @vssubu_vx_v16i64_unmasked(<16 x i64> %va, i64 %b, i32 zeroext %evl) {
+; RV32-LABEL: vssubu_vx_v16i64_unmasked:
+; RV32: # %bb.0:
+; RV32-NEXT: addi sp, sp, -16
+; RV32-NEXT: .cfi_def_cfa_offset 16
+; RV32-NEXT: sw a1, 12(sp)
+; RV32-NEXT: sw a0, 8(sp)
+; RV32-NEXT: addi a0, sp, 8
+; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma
+; RV32-NEXT: vlse64.v v16, (a0), zero
+; RV32-NEXT: vsetvli zero, a2, e64, m8, ta, ma
+; RV32-NEXT: vssubu.vv v8, v8, v16
+; RV32-NEXT: addi sp, sp, 16
+; RV32-NEXT: ret
+;
+; RV64-LABEL: vssubu_vx_v16i64_unmasked:
+; RV64: # %bb.0:
+; RV64-NEXT: vsetvli zero, a1, e64, m8, ta, ma
+; RV64-NEXT: vssubu.vx v8, v8, a0
+; RV64-NEXT: ret
+ %elt.head = insertelement <16 x i64> poison, i64 %b, i32 0
+ %vb = shufflevector <16 x i64> %elt.head, <16 x i64> poison, <16 x i32> zeroinitializer
+ %head = insertelement <16 x i1> poison, i1 true, i32 0
+ %m = shufflevector <16 x i1> %head, <16 x i1> poison, <16 x i32> zeroinitializer
+ %v = call <16 x i64> @llvm.vp.usub.sat.v16i64(<16 x i64> %va, <16 x i64> %vb, <16 x i1> %m, i32 %evl)
+ ret <16 x i64> %v
+}
+
+define <16 x i64> @vssubu_vi_v16i64(<16 x i64> %va, <16 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vssubu_vi_v16i64:
+; CHECK: # %bb.0:
+; CHECK-NEXT: li a1, -1
+; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma
+; CHECK-NEXT: vssubu.vx v8, v8, a1, v0.t
+; CHECK-NEXT: ret
+ %elt.head = insertelement <16 x i64> poison, i64 -1, i32 0
+ %vb = shufflevector <16 x i64> %elt.head, <16 x i64> poison, <16 x i32> zeroinitializer
+ %v = call <16 x i64> @llvm.vp.usub.sat.v16i64(<16 x i64> %va, <16 x i64> %vb, <16 x i1> %m, i32 %evl)
+ ret <16 x i64> %v
+}
+
+define <16 x i64> @vssubu_vi_v16i64_unmasked(<16 x i64> %va, i32 zeroext %evl) {
+; CHECK-LABEL: vssubu_vi_v16i64_unmasked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: li a1, -1
+; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma
+; CHECK-NEXT: vssubu.vx v8, v8, a1
+; CHECK-NEXT: ret
+ %elt.head = insertelement <16 x i64> poison, i64 -1, i32 0
+ %vb = shufflevector <16 x i64> %elt.head, <16 x i64> poison, <16 x i32> zeroinitializer
+ %head = insertelement <16 x i1> poison, i1 true, i32 0
+ %m = shufflevector <16 x i1> %head, <16 x i1> poison, <16 x i32> zeroinitializer
+ %v = call <16 x i64> @llvm.vp.usub.sat.v16i64(<16 x i64> %va, <16 x i64> %vb, <16 x i1> %m, i32 %evl)
+ ret <16 x i64> %v
+}
+
+; Test that split-legalization works as expected.
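+; <32 x i64> needs two LMUL=8 register groups, so the mask is split with
+; vslidedown.vi and each half gets its own EVL. On RV32 the -1 splat is also
+; materialized with vmv.v.i at e32 and the .vv form is used, while RV64 keeps
+; -1 in a scalar register and uses vssubu.vx.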
+
+declare <32 x i64> @llvm.vp.usub.sat.v32i64(<32 x i64>, <32 x i64>, <32 x i1>, i32)
+
+define <32 x i64> @vssubu_vx_v32i64(<32 x i64> %va, <32 x i1> %m, i32 zeroext %evl) {
+; RV32-LABEL: vssubu_vx_v32i64:
+; RV32: # %bb.0:
+; RV32-NEXT: vsetivli zero, 2, e8, mf4, ta, ma
+; RV32-NEXT: vslidedown.vi v1, v0, 2
+; RV32-NEXT: li a1, 32
+; RV32-NEXT: vsetvli zero, a1, e32, m8, ta, ma
+; RV32-NEXT: li a2, 16
+; RV32-NEXT: vmv.v.i v24, -1
+; RV32-NEXT: mv a1, a0
+; RV32-NEXT: bltu a0, a2, .LBB108_2
+; RV32-NEXT: # %bb.1:
+; RV32-NEXT: li a1, 16
+; RV32-NEXT: .LBB108_2:
+; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma
+; RV32-NEXT: vssubu.vv v8, v8, v24, v0.t
+; RV32-NEXT: addi a1, a0, -16
+; RV32-NEXT: sltu a0, a0, a1
+; RV32-NEXT: addi a0, a0, -1
+; RV32-NEXT: and a0, a0, a1
+; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
+; RV32-NEXT: vmv1r.v v0, v1
+; RV32-NEXT: vssubu.vv v16, v16, v24, v0.t
+; RV32-NEXT: ret
+;
+; RV64-LABEL: vssubu_vx_v32i64:
+; RV64: # %bb.0:
+; RV64-NEXT: vsetivli zero, 2, e8, mf4, ta, ma
+; RV64-NEXT: li a2, 16
+; RV64-NEXT: vslidedown.vi v24, v0, 2
+; RV64-NEXT: mv a1, a0
+; RV64-NEXT: bltu a0, a2, .LBB108_2
+; RV64-NEXT: # %bb.1:
+; RV64-NEXT: li a1, 16
+; RV64-NEXT: .LBB108_2:
+; RV64-NEXT: li a2, -1
+; RV64-NEXT: vsetvli zero, a1, e64, m8, ta, ma
+; RV64-NEXT: vssubu.vx v8, v8, a2, v0.t
+; RV64-NEXT: addi a1, a0, -16
+; RV64-NEXT: sltu a0, a0, a1
+; RV64-NEXT: addi a0, a0, -1
+; RV64-NEXT: and a0, a0, a1
+; RV64-NEXT: vsetvli zero, a0, e64, m8, ta, ma
+; RV64-NEXT: vmv1r.v v0, v24
+; RV64-NEXT: vssubu.vx v16, v16, a2, v0.t
+; RV64-NEXT: ret
+ %elt.head = insertelement <32 x i64> poison, i64 -1, i32 0
+ %vb = shufflevector <32 x i64> %elt.head, <32 x i64> poison, <32 x i32> zeroinitializer
+ %v = call <32 x i64> @llvm.vp.usub.sat.v32i64(<32 x i64> %va, <32 x i64> %vb, <32 x i1> %m, i32 %evl)
+ ret <32 x i64> %v
+}
+
+define <32 x i64> @vssubu_vi_v32i64_unmasked(<32 x i64> %va, i32 zeroext %evl) {
+; RV32-LABEL: vssubu_vi_v32i64_unmasked:
+; RV32: # %bb.0:
+; RV32-NEXT: li a1, 32
+; RV32-NEXT: vsetvli zero, a1, e32, m8, ta, ma
+; RV32-NEXT: li a2, 16
+; RV32-NEXT: vmv.v.i v24, -1
+; RV32-NEXT: mv a1, a0
+; RV32-NEXT: bltu a0, a2, .LBB109_2
+; RV32-NEXT: # %bb.1:
+; RV32-NEXT: li a1, 16
+; RV32-NEXT: .LBB109_2:
+; RV32-NEXT: vsetvli zero, a1, e64, m8, ta, ma
+; RV32-NEXT: vssubu.vv v8, v8, v24
+; RV32-NEXT: addi a1, a0, -16
+; RV32-NEXT: sltu a0, a0, a1
+; RV32-NEXT: addi a0, a0, -1
+; RV32-NEXT: and a0, a0, a1
+; RV32-NEXT: vsetvli zero, a0, e64, m8, ta, ma
+; RV32-NEXT: vssubu.vv v16, v16, v24
+; RV32-NEXT: ret
+;
+; RV64-LABEL: vssubu_vi_v32i64_unmasked:
+; RV64: # %bb.0:
+; RV64-NEXT: li a2, 16
+; RV64-NEXT: mv a1, a0
+; RV64-NEXT: bltu a0, a2, .LBB109_2
+; RV64-NEXT: # %bb.1:
+; RV64-NEXT: li a1, 16
+; RV64-NEXT: .LBB109_2:
+; RV64-NEXT: li a2, -1
+; RV64-NEXT: vsetvli zero, a1, e64, m8, ta, ma
+; RV64-NEXT: vssubu.vx v8, v8, a2
+; RV64-NEXT: addi a1, a0, -16
+; RV64-NEXT: sltu a0, a0, a1
+; RV64-NEXT: addi a0, a0, -1
+; RV64-NEXT: and a0, a0, a1
+; RV64-NEXT: vsetvli zero, a0, e64, m8, ta, ma
+; RV64-NEXT: vssubu.vx v16, v16, a2
+; RV64-NEXT: ret
+ %elt.head = insertelement <32 x i64> poison, i64 -1, i32 0
+ %vb = shufflevector <32 x i64> %elt.head, <32 x i64> poison, <32 x i32> zeroinitializer
+ %head = insertelement <32 x i1> poison, i1 true, i32 0
+ %m = shufflevector <32 x i1> %head, <32 x i1> poison, <32 x i32> zeroinitializer
+ %v = call <32 x i64> @llvm.vp.usub.sat.v32i64(<32 x i64> %va, <32 x i64> %vb, <32 x i1> %m, i32 %evl)
+ ret <32 x i64> %v
+}
+
+; FIXME: We don't match vssubu.vi on RV32.
+
+define <32 x i64> @vssubu_vx_v32i64_evl12(<32 x i64> %va, <32 x i1> %m) {
+; RV32-LABEL: vssubu_vx_v32i64_evl12:
+; RV32: # %bb.0:
+; RV32-NEXT: vsetivli zero, 2, e8, mf4, ta, ma
+; RV32-NEXT: vslidedown.vi v1, v0, 2
+; RV32-NEXT: li a0, 32
+; RV32-NEXT: vsetvli zero, a0, e32, m8, ta, ma
+; RV32-NEXT: vmv.v.i v24, -1
+; RV32-NEXT: vsetivli zero, 12, e64, m8, ta, ma
+; RV32-NEXT: vssubu.vv v8, v8, v24, v0.t
+; RV32-NEXT: vsetivli zero, 0, e64, m8, ta, ma
+; RV32-NEXT: vmv1r.v v0, v1
+; RV32-NEXT: vssubu.vv v16, v16, v24, v0.t
+; RV32-NEXT: ret
+;
+; RV64-LABEL: vssubu_vx_v32i64_evl12:
+; RV64: # %bb.0:
+; RV64-NEXT: vsetivli zero, 2, e8, mf4, ta, ma
+; RV64-NEXT: vslidedown.vi v24, v0, 2
+; RV64-NEXT: li a0, -1
+; RV64-NEXT: vsetivli zero, 12, e64, m8, ta, ma
+; RV64-NEXT: vssubu.vx v8, v8, a0, v0.t
+; RV64-NEXT: vsetivli zero, 0, e64, m8, ta, ma
+; RV64-NEXT: vmv1r.v v0, v24
+; RV64-NEXT: vssubu.vx v16, v16, a0, v0.t
+; RV64-NEXT: ret
+ %elt.head = insertelement <32 x i64> poison, i64 -1, i32 0
+ %vb = shufflevector <32 x i64> %elt.head, <32 x i64> poison, <32 x i32> zeroinitializer
+ %v = call <32 x i64> @llvm.vp.usub.sat.v32i64(<32 x i64> %va, <32 x i64> %vb, <32 x i1> %m, i32 12)
+ ret <32 x i64> %v
+}
+
+define <32 x i64> @vssubu_vx_v32i64_evl27(<32 x i64> %va, <32 x i1> %m) {
+; RV32-LABEL: vssubu_vx_v32i64_evl27:
+; RV32: # %bb.0:
+; RV32-NEXT: vsetivli zero, 2, e8, mf4, ta, ma
+; RV32-NEXT: vslidedown.vi v1, v0, 2
+; RV32-NEXT: li a0, 32
+; RV32-NEXT: vsetvli zero, a0, e32, m8, ta, ma
+; RV32-NEXT: vmv.v.i v24, -1
+; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma
+; RV32-NEXT: vssubu.vv v8, v8, v24, v0.t
+; RV32-NEXT: vsetivli zero, 11, e64, m8, ta, ma
+; RV32-NEXT: vmv1r.v v0, v1
+; RV32-NEXT: vssubu.vv v16, v16, v24, v0.t
+; RV32-NEXT: ret
+;
+; RV64-LABEL: vssubu_vx_v32i64_evl27:
+; RV64: # %bb.0:
+; RV64-NEXT: vsetivli zero, 2, e8, mf4, ta, ma
+; RV64-NEXT: vslidedown.vi v24, v0, 2
+; RV64-NEXT: li a0, -1
+; RV64-NEXT: vsetivli zero, 16, e64, m8, ta, ma
+; RV64-NEXT: vssubu.vx v8, v8, a0, v0.t
+; RV64-NEXT: vsetivli zero, 11, e64, m8, ta, ma
+; RV64-NEXT: vmv1r.v v0, v24
+; RV64-NEXT: vssubu.vx v16, v16, a0, v0.t
+; RV64-NEXT: ret
+ %elt.head = insertelement <32 x i64> poison, i64 -1, i32 0
+ %vb = shufflevector <32 x i64> %elt.head, <32 x i64> poison, <32 x i32> zeroinitializer
+ %v = call <32 x i64> @llvm.vp.usub.sat.v32i64(<32 x i64> %va, <32 x i64> %vb, <32 x i1> %m, i32 27)
+ ret <32 x i64> %v
+}
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwsubu.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwsubu.ll
index 574c265..a084b53 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwsubu.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwsubu.ll
@@ -385,12 +385,12 @@ define <32 x i64> @vwsubu_v32i64(ptr %x, ptr %y) nounwind {
define <2 x i32> @vwsubu_v2i32_v2i8(ptr %x, ptr %y) {
; CHECK-LABEL: vwsubu_v2i32_v2i8:
; CHECK: # %bb.0:
-; CHECK-NEXT: vsetivli zero, 2, e16, mf4, ta, ma
+; CHECK-NEXT: vsetivli zero, 2, e8, mf8, ta, ma
; CHECK-NEXT: vle8.v v8, (a0)
; CHECK-NEXT: vle8.v v9, (a1)
-; CHECK-NEXT: vzext.vf2 v10, v8
-; CHECK-NEXT: vzext.vf2 v11, v9
-; CHECK-NEXT: vwsubu.vv v8, v10, v11
+; CHECK-NEXT: vwsubu.vv v10, v8, v9
+; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, ma
+; CHECK-NEXT: vsext.vf2 v8, v10
; CHECK-NEXT: ret
%a = load <2 x i8>, ptr %x
%b = load <2 x i8>, ptr %y
@@ -899,12 +899,12 @@ define <2 x i64> @vwsubu_vx_v2i64_i64(ptr %x, ptr %y) nounwind {
define <2 x i32> @vwsubu_v2i32_of_v2i8(ptr %x, ptr %y) {
; CHECK-LABEL: vwsubu_v2i32_of_v2i8:
; CHECK: # %bb.0:
-; CHECK-NEXT: vsetivli zero, 2, e16, mf4, ta, ma
+; CHECK-NEXT: vsetivli zero, 2, e8, mf8, ta, ma
; CHECK-NEXT: vle8.v v8, (a0)
; CHECK-NEXT: vle8.v v9, (a1)
-; CHECK-NEXT: vzext.vf2 v10, v8
-; CHECK-NEXT: vzext.vf2 v11, v9
-; CHECK-NEXT: vwsubu.vv v8, v10, v11
+; CHECK-NEXT: vwsubu.vv v10, v8, v9
+; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, ma
+; CHECK-NEXT: vsext.vf2 v8, v10
; CHECK-NEXT: ret
%a = load <2 x i8>, ptr %x
%b = load <2 x i8>, ptr %y
@@ -917,12 +917,12 @@ define <2 x i32> @vwsubu_v2i32_of_v2i8(ptr %x, ptr %y) {
define <2 x i64> @vwsubu_v2i64_of_v2i8(ptr %x, ptr %y) {
; CHECK-LABEL: vwsubu_v2i64_of_v2i8:
; CHECK: # %bb.0:
-; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, ma
+; CHECK-NEXT: vsetivli zero, 2, e8, mf8, ta, ma
; CHECK-NEXT: vle8.v v8, (a0)
; CHECK-NEXT: vle8.v v9, (a1)
-; CHECK-NEXT: vzext.vf4 v10, v8
-; CHECK-NEXT: vzext.vf4 v11, v9
-; CHECK-NEXT: vwsubu.vv v8, v10, v11
+; CHECK-NEXT: vwsubu.vv v10, v8, v9
+; CHECK-NEXT: vsetvli zero, zero, e64, m1, ta, ma
+; CHECK-NEXT: vsext.vf4 v8, v10
; CHECK-NEXT: ret
%a = load <2 x i8>, ptr %x
%b = load <2 x i8>, ptr %y
@@ -935,12 +935,12 @@ define <2 x i64> @vwsubu_v2i64_of_v2i8(ptr %x, ptr %y) {
define <2 x i64> @vwsubu_v2i64_of_v2i16(ptr %x, ptr %y) {
; CHECK-LABEL: vwsubu_v2i64_of_v2i16:
; CHECK: # %bb.0:
-; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, ma
+; CHECK-NEXT: vsetivli zero, 2, e16, mf4, ta, ma
; CHECK-NEXT: vle16.v v8, (a0)
; CHECK-NEXT: vle16.v v9, (a1)
-; CHECK-NEXT: vzext.vf2 v10, v8
-; CHECK-NEXT: vzext.vf2 v11, v9
-; CHECK-NEXT: vwsubu.vv v8, v10, v11
+; CHECK-NEXT: vwsubu.vv v10, v8, v9
+; CHECK-NEXT: vsetvli zero, zero, e64, m1, ta, ma
+; CHECK-NEXT: vsext.vf2 v8, v10
; CHECK-NEXT: ret
%a = load <2 x i16>, ptr %x
%b = load <2 x i16>, ptr %y
diff --git a/llvm/test/CodeGen/RISCV/rvv/vsadd-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vsadd-vp.ll
new file mode 100644
index 0000000..caaeae5
--- /dev/null
+++ b/llvm/test/CodeGen/RISCV/rvv/vsadd-vp.ll
@@ -0,0 +1,2015 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=riscv32 -mattr=+v -verify-machineinstrs < %s \
+; RUN: | FileCheck %s --check-prefixes=CHECK,RV32
+; RUN: llc -mtriple=riscv64 -mattr=+v -verify-machineinstrs < %s \
+; RUN: | FileCheck %s --check-prefixes=CHECK,RV64
+
+declare <vscale x 8 x i7> @llvm.vp.sadd.sat.nxv8i7(<vscale x 8 x i7>, <vscale x 8 x i7>, <vscale x 8 x i1>, i32)
+
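+; i7 is promoted to i8: the input is sign-extended from bit 6 with the
+; vadd.vv/vsra.vi pair, the saturating add becomes a plain vadd.vx, and the
+; result is clamped to the signed i7 range with vmin.vx 63 / vmax.vx 192 (-64).
+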
+define <vscale x 8 x i7> @vsadd_vx_nxv8i7(<vscale x 8 x i7> %a, i7 signext %b, <vscale x 8 x i1> %mask, i32 zeroext %evl) {
+; CHECK-LABEL: vsadd_vx_nxv8i7:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a2, zero, e8, m1, ta, ma
+; CHECK-NEXT: vadd.vv v8, v8, v8
+; CHECK-NEXT: vsra.vi v8, v8, 1
+; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, ma
+; CHECK-NEXT: vadd.vx v8, v8, a0, v0.t
+; CHECK-NEXT: li a0, 63
+; CHECK-NEXT: vmin.vx v8, v8, a0, v0.t
+; CHECK-NEXT: li a0, 192
+; CHECK-NEXT: vmax.vx v8, v8, a0, v0.t
+; CHECK-NEXT: ret
+ %elt.head = insertelement <vscale x 8 x i7> poison, i7 %b, i32 0
+ %vb = shufflevector <vscale x 8 x i7> %elt.head, <vscale x 8 x i7> poison, <vscale x 8 x i32> zeroinitializer
+ %v = call <vscale x 8 x i7> @llvm.vp.sadd.sat.nxv8i7(<vscale x 8 x i7> %a, <vscale x 8 x i7> %vb, <vscale x 8 x i1> %mask, i32 %evl)
+ ret <vscale x 8 x i7> %v
+}
+
+declare <vscale x 1 x i8> @llvm.vp.sadd.sat.nxv1i8(<vscale x 1 x i8>, <vscale x 1 x i8>, <vscale x 1 x i1>, i32)
+
+define <vscale x 1 x i8> @vsadd_vv_nxv1i8(<vscale x 1 x i8> %va, <vscale x 1 x i8> %b, <vscale x 1 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vsadd_vv_nxv1i8:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e8, mf8, ta, ma
+; CHECK-NEXT: vsadd.vv v8, v8, v9, v0.t
+; CHECK-NEXT: ret
+ %v = call <vscale x 1 x i8> @llvm.vp.sadd.sat.nxv1i8(<vscale x 1 x i8> %va, <vscale x 1 x i8> %b, <vscale x 1 x i1> %m, i32 %evl)
+ ret <vscale x 1 x i8> %v
+}
+
+define <vscale x 1 x i8> @vsadd_vv_nxv1i8_unmasked(<vscale x 1 x i8> %va, <vscale x 1 x i8> %b, i32 zeroext %evl) {
+; CHECK-LABEL: vsadd_vv_nxv1i8_unmasked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e8, mf8, ta, ma
+; CHECK-NEXT: vsadd.vv v8, v8, v9
+; CHECK-NEXT: ret
+ %head = insertelement <vscale x 1 x i1> poison, i1 true, i32 0
+ %m = shufflevector <vscale x 1 x i1> %head, <vscale x 1 x i1> poison, <vscale x 1 x i32> zeroinitializer
+ %v = call <vscale x 1 x i8> @llvm.vp.sadd.sat.nxv1i8(<vscale x 1 x i8> %va, <vscale x 1 x i8> %b, <vscale x 1 x i1> %m, i32 %evl)
+ ret <vscale x 1 x i8> %v
+}
+
+define <vscale x 1 x i8> @vsadd_vx_nxv1i8(<vscale x 1 x i8> %va, i8 %b, <vscale x 1 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vsadd_vx_nxv1i8:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, ma
+; CHECK-NEXT: vsadd.vx v8, v8, a0, v0.t
+; CHECK-NEXT: ret
+ %elt.head = insertelement <vscale x 1 x i8> poison, i8 %b, i32 0
+ %vb = shufflevector <vscale x 1 x i8> %elt.head, <vscale x 1 x i8> poison, <vscale x 1 x i32> zeroinitializer
+ %v = call <vscale x 1 x i8> @llvm.vp.sadd.sat.nxv1i8(<vscale x 1 x i8> %va, <vscale x 1 x i8> %vb, <vscale x 1 x i1> %m, i32 %evl)
+ ret <vscale x 1 x i8> %v
+}
+
+define <vscale x 1 x i8> @vsadd_vx_nxv1i8_commute(<vscale x 1 x i8> %va, i8 %b, <vscale x 1 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vsadd_vx_nxv1i8_commute:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, ma
+; CHECK-NEXT: vsadd.vx v8, v8, a0, v0.t
+; CHECK-NEXT: ret
+ %elt.head = insertelement <vscale x 1 x i8> poison, i8 %b, i32 0
+ %vb = shufflevector <vscale x 1 x i8> %elt.head, <vscale x 1 x i8> poison, <vscale x 1 x i32> zeroinitializer
+ %v = call <vscale x 1 x i8> @llvm.vp.sadd.sat.nxv1i8(<vscale x 1 x i8> %vb, <vscale x 1 x i8> %va, <vscale x 1 x i1> %m, i32 %evl)
+ ret <vscale x 1 x i8> %v
+}
+
+define <vscale x 1 x i8> @vsadd_vx_nxv1i8_unmasked(<vscale x 1 x i8> %va, i8 %b, i32 zeroext %evl) {
+; CHECK-LABEL: vsadd_vx_nxv1i8_unmasked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, ma
+; CHECK-NEXT: vsadd.vx v8, v8, a0
+; CHECK-NEXT: ret
+ %elt.head = insertelement <vscale x 1 x i8> poison, i8 %b, i32 0
+ %vb = shufflevector <vscale x 1 x i8> %elt.head, <vscale x 1 x i8> poison, <vscale x 1 x i32> zeroinitializer
+ %head = insertelement <vscale x 1 x i1> poison, i1 true, i32 0
+ %m = shufflevector <vscale x 1 x i1> %head, <vscale x 1 x i1> poison, <vscale x 1 x i32> zeroinitializer
+ %v = call <vscale x 1 x i8> @llvm.vp.sadd.sat.nxv1i8(<vscale x 1 x i8> %va, <vscale x 1 x i8> %vb, <vscale x 1 x i1> %m, i32 %evl)
+ ret <vscale x 1 x i8> %v
+}
+
+define <vscale x 1 x i8> @vsadd_vi_nxv1i8(<vscale x 1 x i8> %va, <vscale x 1 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vsadd_vi_nxv1i8:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e8, mf8, ta, ma
+; CHECK-NEXT: vsadd.vi v8, v8, -1, v0.t
+; CHECK-NEXT: ret
+ %elt.head = insertelement <vscale x 1 x i8> poison, i8 -1, i32 0
+ %vb = shufflevector <vscale x 1 x i8> %elt.head, <vscale x 1 x i8> poison, <vscale x 1 x i32> zeroinitializer
+ %v = call <vscale x 1 x i8> @llvm.vp.sadd.sat.nxv1i8(<vscale x 1 x i8> %va, <vscale x 1 x i8> %vb, <vscale x 1 x i1> %m, i32 %evl)
+ ret <vscale x 1 x i8> %v
+}
+
+define <vscale x 1 x i8> @vsadd_vi_nxv1i8_unmasked(<vscale x 1 x i8> %va, i32 zeroext %evl) {
+; CHECK-LABEL: vsadd_vi_nxv1i8_unmasked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e8, mf8, ta, ma
+; CHECK-NEXT: vsadd.vi v8, v8, -1
+; CHECK-NEXT: ret
+ %elt.head = insertelement <vscale x 1 x i8> poison, i8 -1, i32 0
+ %vb = shufflevector <vscale x 1 x i8> %elt.head, <vscale x 1 x i8> poison, <vscale x 1 x i32> zeroinitializer
+ %head = insertelement <vscale x 1 x i1> poison, i1 true, i32 0
+ %m = shufflevector <vscale x 1 x i1> %head, <vscale x 1 x i1> poison, <vscale x 1 x i32> zeroinitializer
+ %v = call <vscale x 1 x i8> @llvm.vp.sadd.sat.nxv1i8(<vscale x 1 x i8> %va, <vscale x 1 x i8> %vb, <vscale x 1 x i1> %m, i32 %evl)
+ ret <vscale x 1 x i8> %v
+}
+
+declare <vscale x 2 x i8> @llvm.vp.sadd.sat.nxv2i8(<vscale x 2 x i8>, <vscale x 2 x i8>, <vscale x 2 x i1>, i32)
+
+define <vscale x 2 x i8> @vsadd_vv_nxv2i8(<vscale x 2 x i8> %va, <vscale x 2 x i8> %b, <vscale x 2 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vsadd_vv_nxv2i8:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e8, mf4, ta, ma
+; CHECK-NEXT: vsadd.vv v8, v8, v9, v0.t
+; CHECK-NEXT: ret
+ %v = call <vscale x 2 x i8> @llvm.vp.sadd.sat.nxv2i8(<vscale x 2 x i8> %va, <vscale x 2 x i8> %b, <vscale x 2 x i1> %m, i32 %evl)
+ ret <vscale x 2 x i8> %v
+}
+
+define <vscale x 2 x i8> @vsadd_vv_nxv2i8_unmasked(<vscale x 2 x i8> %va, <vscale x 2 x i8> %b, i32 zeroext %evl) {
+; CHECK-LABEL: vsadd_vv_nxv2i8_unmasked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e8, mf4, ta, ma
+; CHECK-NEXT: vsadd.vv v8, v8, v9
+; CHECK-NEXT: ret
+ %head = insertelement <vscale x 2 x i1> poison, i1 true, i32 0
+ %m = shufflevector <vscale x 2 x i1> %head, <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer
+ %v = call <vscale x 2 x i8> @llvm.vp.sadd.sat.nxv2i8(<vscale x 2 x i8> %va, <vscale x 2 x i8> %b, <vscale x 2 x i1> %m, i32 %evl)
+ ret <vscale x 2 x i8> %v
+}
+
+define <vscale x 2 x i8> @vsadd_vx_nxv2i8(<vscale x 2 x i8> %va, i8 %b, <vscale x 2 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vsadd_vx_nxv2i8:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, ma
+; CHECK-NEXT: vsadd.vx v8, v8, a0, v0.t
+; CHECK-NEXT: ret
+ %elt.head = insertelement <vscale x 2 x i8> poison, i8 %b, i32 0
+ %vb = shufflevector <vscale x 2 x i8> %elt.head, <vscale x 2 x i8> poison, <vscale x 2 x i32> zeroinitializer
+ %v = call <vscale x 2 x i8> @llvm.vp.sadd.sat.nxv2i8(<vscale x 2 x i8> %va, <vscale x 2 x i8> %vb, <vscale x 2 x i1> %m, i32 %evl)
+ ret <vscale x 2 x i8> %v
+}
+
+define <vscale x 2 x i8> @vsadd_vx_nxv2i8_unmasked(<vscale x 2 x i8> %va, i8 %b, i32 zeroext %evl) {
+; CHECK-LABEL: vsadd_vx_nxv2i8_unmasked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, ma
+; CHECK-NEXT: vsadd.vx v8, v8, a0
+; CHECK-NEXT: ret
+ %elt.head = insertelement <vscale x 2 x i8> poison, i8 %b, i32 0
+ %vb = shufflevector <vscale x 2 x i8> %elt.head, <vscale x 2 x i8> poison, <vscale x 2 x i32> zeroinitializer
+ %head = insertelement <vscale x 2 x i1> poison, i1 true, i32 0
+ %m = shufflevector <vscale x 2 x i1> %head, <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer
+ %v = call <vscale x 2 x i8> @llvm.vp.sadd.sat.nxv2i8(<vscale x 2 x i8> %va, <vscale x 2 x i8> %vb, <vscale x 2 x i1> %m, i32 %evl)
+ ret <vscale x 2 x i8> %v
+}
+
+define <vscale x 2 x i8> @vsadd_vi_nxv2i8(<vscale x 2 x i8> %va, <vscale x 2 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vsadd_vi_nxv2i8:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e8, mf4, ta, ma
+; CHECK-NEXT: vsadd.vi v8, v8, -1, v0.t
+; CHECK-NEXT: ret
+ %elt.head = insertelement <vscale x 2 x i8> poison, i8 -1, i32 0
+ %vb = shufflevector <vscale x 2 x i8> %elt.head, <vscale x 2 x i8> poison, <vscale x 2 x i32> zeroinitializer
+ %v = call <vscale x 2 x i8> @llvm.vp.sadd.sat.nxv2i8(<vscale x 2 x i8> %va, <vscale x 2 x i8> %vb, <vscale x 2 x i1> %m, i32 %evl)
+ ret <vscale x 2 x i8> %v
+}
+
+define <vscale x 2 x i8> @vsadd_vi_nxv2i8_unmasked(<vscale x 2 x i8> %va, i32 zeroext %evl) {
+; CHECK-LABEL: vsadd_vi_nxv2i8_unmasked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e8, mf4, ta, ma
+; CHECK-NEXT: vsadd.vi v8, v8, -1
+; CHECK-NEXT: ret
+ %elt.head = insertelement <vscale x 2 x i8> poison, i8 -1, i32 0
+ %vb = shufflevector <vscale x 2 x i8> %elt.head, <vscale x 2 x i8> poison, <vscale x 2 x i32> zeroinitializer
+ %head = insertelement <vscale x 2 x i1> poison, i1 true, i32 0
+ %m = shufflevector <vscale x 2 x i1> %head, <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer
+ %v = call <vscale x 2 x i8> @llvm.vp.sadd.sat.nxv2i8(<vscale x 2 x i8> %va, <vscale x 2 x i8> %vb, <vscale x 2 x i1> %m, i32 %evl)
+ ret <vscale x 2 x i8> %v
+}
+
+declare <vscale x 3 x i8> @llvm.vp.sadd.sat.nxv3i8(<vscale x 3 x i8>, <vscale x 3 x i8>, <vscale x 3 x i1>, i32)
+
+define <vscale x 3 x i8> @vsadd_vv_nxv3i8(<vscale x 3 x i8> %va, <vscale x 3 x i8> %b, <vscale x 3 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vsadd_vv_nxv3i8:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e8, mf2, ta, ma
+; CHECK-NEXT: vsadd.vv v8, v8, v9, v0.t
+; CHECK-NEXT: ret
+ %v = call <vscale x 3 x i8> @llvm.vp.sadd.sat.nxv3i8(<vscale x 3 x i8> %va, <vscale x 3 x i8> %b, <vscale x 3 x i1> %m, i32 %evl)
+ ret <vscale x 3 x i8> %v
+}
+
+define <vscale x 3 x i8> @vsadd_vv_nxv3i8_unmasked(<vscale x 3 x i8> %va, <vscale x 3 x i8> %b, i32 zeroext %evl) {
+; CHECK-LABEL: vsadd_vv_nxv3i8_unmasked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e8, mf2, ta, ma
+; CHECK-NEXT: vsadd.vv v8, v8, v9
+; CHECK-NEXT: ret
+ %head = insertelement <vscale x 3 x i1> poison, i1 true, i32 0
+ %m = shufflevector <vscale x 3 x i1> %head, <vscale x 3 x i1> poison, <vscale x 3 x i32> zeroinitializer
+ %v = call <vscale x 3 x i8> @llvm.vp.sadd.sat.nxv3i8(<vscale x 3 x i8> %va, <vscale x 3 x i8> %b, <vscale x 3 x i1> %m, i32 %evl)
+ ret <vscale x 3 x i8> %v
+}
+
+define <vscale x 3 x i8> @vsadd_vx_nxv3i8(<vscale x 3 x i8> %va, i8 %b, <vscale x 3 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vsadd_vx_nxv3i8:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma
+; CHECK-NEXT: vsadd.vx v8, v8, a0, v0.t
+; CHECK-NEXT: ret
+ %elt.head = insertelement <vscale x 3 x i8> poison, i8 %b, i32 0
+ %vb = shufflevector <vscale x 3 x i8> %elt.head, <vscale x 3 x i8> poison, <vscale x 3 x i32> zeroinitializer
+ %v = call <vscale x 3 x i8> @llvm.vp.sadd.sat.nxv3i8(<vscale x 3 x i8> %va, <vscale x 3 x i8> %vb, <vscale x 3 x i1> %m, i32 %evl)
+ ret <vscale x 3 x i8> %v
+}
+
+define <vscale x 3 x i8> @vsadd_vx_nxv3i8_unmasked(<vscale x 3 x i8> %va, i8 %b, i32 zeroext %evl) {
+; CHECK-LABEL: vsadd_vx_nxv3i8_unmasked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma
+; CHECK-NEXT: vsadd.vx v8, v8, a0
+; CHECK-NEXT: ret
+ %elt.head = insertelement <vscale x 3 x i8> poison, i8 %b, i32 0
+ %vb = shufflevector <vscale x 3 x i8> %elt.head, <vscale x 3 x i8> poison, <vscale x 3 x i32> zeroinitializer
+ %head = insertelement <vscale x 3 x i1> poison, i1 true, i32 0
+ %m = shufflevector <vscale x 3 x i1> %head, <vscale x 3 x i1> poison, <vscale x 3 x i32> zeroinitializer
+ %v = call <vscale x 3 x i8> @llvm.vp.sadd.sat.nxv3i8(<vscale x 3 x i8> %va, <vscale x 3 x i8> %vb, <vscale x 3 x i1> %m, i32 %evl)
+ ret <vscale x 3 x i8> %v
+}
+
+define <vscale x 3 x i8> @vsadd_vi_nxv3i8(<vscale x 3 x i8> %va, <vscale x 3 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vsadd_vi_nxv3i8:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e8, mf2, ta, ma
+; CHECK-NEXT: vsadd.vi v8, v8, -1, v0.t
+; CHECK-NEXT: ret
+ %elt.head = insertelement <vscale x 3 x i8> poison, i8 -1, i32 0
+ %vb = shufflevector <vscale x 3 x i8> %elt.head, <vscale x 3 x i8> poison, <vscale x 3 x i32> zeroinitializer
+ %v = call <vscale x 3 x i8> @llvm.vp.sadd.sat.nxv3i8(<vscale x 3 x i8> %va, <vscale x 3 x i8> %vb, <vscale x 3 x i1> %m, i32 %evl)
+ ret <vscale x 3 x i8> %v
+}
+
+define <vscale x 3 x i8> @vsadd_vi_nxv3i8_unmasked(<vscale x 3 x i8> %va, i32 zeroext %evl) {
+; CHECK-LABEL: vsadd_vi_nxv3i8_unmasked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e8, mf2, ta, ma
+; CHECK-NEXT: vsadd.vi v8, v8, -1
+; CHECK-NEXT: ret
+ %elt.head = insertelement <vscale x 3 x i8> poison, i8 -1, i32 0
+ %vb = shufflevector <vscale x 3 x i8> %elt.head, <vscale x 3 x i8> poison, <vscale x 3 x i32> zeroinitializer
+ %head = insertelement <vscale x 3 x i1> poison, i1 true, i32 0
+ %m = shufflevector <vscale x 3 x i1> %head, <vscale x 3 x i1> poison, <vscale x 3 x i32> zeroinitializer
+ %v = call <vscale x 3 x i8> @llvm.vp.sadd.sat.nxv3i8(<vscale x 3 x i8> %va, <vscale x 3 x i8> %vb, <vscale x 3 x i1> %m, i32 %evl)
+ ret <vscale x 3 x i8> %v
+}
+
+declare <vscale x 4 x i8> @llvm.vp.sadd.sat.nxv4i8(<vscale x 4 x i8>, <vscale x 4 x i8>, <vscale x 4 x i1>, i32)
+
+define <vscale x 4 x i8> @vsadd_vv_nxv4i8(<vscale x 4 x i8> %va, <vscale x 4 x i8> %b, <vscale x 4 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vsadd_vv_nxv4i8:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e8, mf2, ta, ma
+; CHECK-NEXT: vsadd.vv v8, v8, v9, v0.t
+; CHECK-NEXT: ret
+ %v = call <vscale x 4 x i8> @llvm.vp.sadd.sat.nxv4i8(<vscale x 4 x i8> %va, <vscale x 4 x i8> %b, <vscale x 4 x i1> %m, i32 %evl)
+ ret <vscale x 4 x i8> %v
+}
+
+define <vscale x 4 x i8> @vsadd_vv_nxv4i8_unmasked(<vscale x 4 x i8> %va, <vscale x 4 x i8> %b, i32 zeroext %evl) {
+; CHECK-LABEL: vsadd_vv_nxv4i8_unmasked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e8, mf2, ta, ma
+; CHECK-NEXT: vsadd.vv v8, v8, v9
+; CHECK-NEXT: ret
+ %head = insertelement <vscale x 4 x i1> poison, i1 true, i32 0
+ %m = shufflevector <vscale x 4 x i1> %head, <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer
+ %v = call <vscale x 4 x i8> @llvm.vp.sadd.sat.nxv4i8(<vscale x 4 x i8> %va, <vscale x 4 x i8> %b, <vscale x 4 x i1> %m, i32 %evl)
+ ret <vscale x 4 x i8> %v
+}
+
+define <vscale x 4 x i8> @vsadd_vx_nxv4i8(<vscale x 4 x i8> %va, i8 %b, <vscale x 4 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vsadd_vx_nxv4i8:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma
+; CHECK-NEXT: vsadd.vx v8, v8, a0, v0.t
+; CHECK-NEXT: ret
+ %elt.head = insertelement <vscale x 4 x i8> poison, i8 %b, i32 0
+ %vb = shufflevector <vscale x 4 x i8> %elt.head, <vscale x 4 x i8> poison, <vscale x 4 x i32> zeroinitializer
+ %v = call <vscale x 4 x i8> @llvm.vp.sadd.sat.nxv4i8(<vscale x 4 x i8> %va, <vscale x 4 x i8> %vb, <vscale x 4 x i1> %m, i32 %evl)
+ ret <vscale x 4 x i8> %v
+}
+
+define <vscale x 4 x i8> @vsadd_vx_nxv4i8_unmasked(<vscale x 4 x i8> %va, i8 %b, i32 zeroext %evl) {
+; CHECK-LABEL: vsadd_vx_nxv4i8_unmasked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma
+; CHECK-NEXT: vsadd.vx v8, v8, a0
+; CHECK-NEXT: ret
+ %elt.head = insertelement <vscale x 4 x i8> poison, i8 %b, i32 0
+ %vb = shufflevector <vscale x 4 x i8> %elt.head, <vscale x 4 x i8> poison, <vscale x 4 x i32> zeroinitializer
+ %head = insertelement <vscale x 4 x i1> poison, i1 true, i32 0
+ %m = shufflevector <vscale x 4 x i1> %head, <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer
+ %v = call <vscale x 4 x i8> @llvm.vp.sadd.sat.nxv4i8(<vscale x 4 x i8> %va, <vscale x 4 x i8> %vb, <vscale x 4 x i1> %m, i32 %evl)
+ ret <vscale x 4 x i8> %v
+}
+
+define <vscale x 4 x i8> @vsadd_vi_nxv4i8(<vscale x 4 x i8> %va, <vscale x 4 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vsadd_vi_nxv4i8:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e8, mf2, ta, ma
+; CHECK-NEXT: vsadd.vi v8, v8, -1, v0.t
+; CHECK-NEXT: ret
+ %elt.head = insertelement <vscale x 4 x i8> poison, i8 -1, i32 0
+ %vb = shufflevector <vscale x 4 x i8> %elt.head, <vscale x 4 x i8> poison, <vscale x 4 x i32> zeroinitializer
+ %v = call <vscale x 4 x i8> @llvm.vp.sadd.sat.nxv4i8(<vscale x 4 x i8> %va, <vscale x 4 x i8> %vb, <vscale x 4 x i1> %m, i32 %evl)
+ ret <vscale x 4 x i8> %v
+}
+
+define <vscale x 4 x i8> @vsadd_vi_nxv4i8_unmasked(<vscale x 4 x i8> %va, i32 zeroext %evl) {
+; CHECK-LABEL: vsadd_vi_nxv4i8_unmasked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e8, mf2, ta, ma
+; CHECK-NEXT: vsadd.vi v8, v8, -1
+; CHECK-NEXT: ret
+ %elt.head = insertelement <vscale x 4 x i8> poison, i8 -1, i32 0
+ %vb = shufflevector <vscale x 4 x i8> %elt.head, <vscale x 4 x i8> poison, <vscale x 4 x i32> zeroinitializer
+ %head = insertelement <vscale x 4 x i1> poison, i1 true, i32 0
+ %m = shufflevector <vscale x 4 x i1> %head, <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer
+ %v = call <vscale x 4 x i8> @llvm.vp.sadd.sat.nxv4i8(<vscale x 4 x i8> %va, <vscale x 4 x i8> %vb, <vscale x 4 x i1> %m, i32 %evl)
+ ret <vscale x 4 x i8> %v
+}
+
+declare <vscale x 8 x i8> @llvm.vp.sadd.sat.nxv8i8(<vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i1>, i32)
+
+define <vscale x 8 x i8> @vsadd_vv_nxv8i8(<vscale x 8 x i8> %va, <vscale x 8 x i8> %b, <vscale x 8 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vsadd_vv_nxv8i8:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e8, m1, ta, ma
+; CHECK-NEXT: vsadd.vv v8, v8, v9, v0.t
+; CHECK-NEXT: ret
+ %v = call <vscale x 8 x i8> @llvm.vp.sadd.sat.nxv8i8(<vscale x 8 x i8> %va, <vscale x 8 x i8> %b, <vscale x 8 x i1> %m, i32 %evl)
+ ret <vscale x 8 x i8> %v
+}
+
+define <vscale x 8 x i8> @vsadd_vv_nxv8i8_unmasked(<vscale x 8 x i8> %va, <vscale x 8 x i8> %b, i32 zeroext %evl) {
+; CHECK-LABEL: vsadd_vv_nxv8i8_unmasked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e8, m1, ta, ma
+; CHECK-NEXT: vsadd.vv v8, v8, v9
+; CHECK-NEXT: ret
+ %head = insertelement <vscale x 8 x i1> poison, i1 true, i32 0
+ %m = shufflevector <vscale x 8 x i1> %head, <vscale x 8 x i1> poison, <vscale x 8 x i32> zeroinitializer
+ %v = call <vscale x 8 x i8> @llvm.vp.sadd.sat.nxv8i8(<vscale x 8 x i8> %va, <vscale x 8 x i8> %b, <vscale x 8 x i1> %m, i32 %evl)
+ ret <vscale x 8 x i8> %v
+}
+
+define <vscale x 8 x i8> @vsadd_vx_nxv8i8(<vscale x 8 x i8> %va, i8 %b, <vscale x 8 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vsadd_vx_nxv8i8:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, ma
+; CHECK-NEXT: vsadd.vx v8, v8, a0, v0.t
+; CHECK-NEXT: ret
+ %elt.head = insertelement <vscale x 8 x i8> poison, i8 %b, i32 0
+ %vb = shufflevector <vscale x 8 x i8> %elt.head, <vscale x 8 x i8> poison, <vscale x 8 x i32> zeroinitializer
+ %v = call <vscale x 8 x i8> @llvm.vp.sadd.sat.nxv8i8(<vscale x 8 x i8> %va, <vscale x 8 x i8> %vb, <vscale x 8 x i1> %m, i32 %evl)
+ ret <vscale x 8 x i8> %v
+}
+
+define <vscale x 8 x i8> @vsadd_vx_nxv8i8_unmasked(<vscale x 8 x i8> %va, i8 %b, i32 zeroext %evl) {
+; CHECK-LABEL: vsadd_vx_nxv8i8_unmasked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, ma
+; CHECK-NEXT: vsadd.vx v8, v8, a0
+; CHECK-NEXT: ret
+ %elt.head = insertelement <vscale x 8 x i8> poison, i8 %b, i32 0
+ %vb = shufflevector <vscale x 8 x i8> %elt.head, <vscale x 8 x i8> poison, <vscale x 8 x i32> zeroinitializer
+ %head = insertelement <vscale x 8 x i1> poison, i1 true, i32 0
+ %m = shufflevector <vscale x 8 x i1> %head, <vscale x 8 x i1> poison, <vscale x 8 x i32> zeroinitializer
+ %v = call <vscale x 8 x i8> @llvm.vp.sadd.sat.nxv8i8(<vscale x 8 x i8> %va, <vscale x 8 x i8> %vb, <vscale x 8 x i1> %m, i32 %evl)
+ ret <vscale x 8 x i8> %v
+}
+
+define <vscale x 8 x i8> @vsadd_vi_nxv8i8(<vscale x 8 x i8> %va, <vscale x 8 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vsadd_vi_nxv8i8:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e8, m1, ta, ma
+; CHECK-NEXT: vsadd.vi v8, v8, -1, v0.t
+; CHECK-NEXT: ret
+ %elt.head = insertelement <vscale x 8 x i8> poison, i8 -1, i32 0
+ %vb = shufflevector <vscale x 8 x i8> %elt.head, <vscale x 8 x i8> poison, <vscale x 8 x i32> zeroinitializer
+ %v = call <vscale x 8 x i8> @llvm.vp.sadd.sat.nxv8i8(<vscale x 8 x i8> %va, <vscale x 8 x i8> %vb, <vscale x 8 x i1> %m, i32 %evl)
+ ret <vscale x 8 x i8> %v
+}
+
+define <vscale x 8 x i8> @vsadd_vi_nxv8i8_unmasked(<vscale x 8 x i8> %va, i32 zeroext %evl) {
+; CHECK-LABEL: vsadd_vi_nxv8i8_unmasked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e8, m1, ta, ma
+; CHECK-NEXT: vsadd.vi v8, v8, -1
+; CHECK-NEXT: ret
+ %elt.head = insertelement <vscale x 8 x i8> poison, i8 -1, i32 0
+ %vb = shufflevector <vscale x 8 x i8> %elt.head, <vscale x 8 x i8> poison, <vscale x 8 x i32> zeroinitializer
+ %head = insertelement <vscale x 8 x i1> poison, i1 true, i32 0
+ %m = shufflevector <vscale x 8 x i1> %head, <vscale x 8 x i1> poison, <vscale x 8 x i32> zeroinitializer
+ %v = call <vscale x 8 x i8> @llvm.vp.sadd.sat.nxv8i8(<vscale x 8 x i8> %va, <vscale x 8 x i8> %vb, <vscale x 8 x i1> %m, i32 %evl)
+ ret <vscale x 8 x i8> %v
+}
+
+declare <vscale x 16 x i8> @llvm.vp.sadd.sat.nxv16i8(<vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i1>, i32)
+
+define <vscale x 16 x i8> @vsadd_vv_nxv16i8(<vscale x 16 x i8> %va, <vscale x 16 x i8> %b, <vscale x 16 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vsadd_vv_nxv16i8:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e8, m2, ta, ma
+; CHECK-NEXT: vsadd.vv v8, v8, v10, v0.t
+; CHECK-NEXT: ret
+ %v = call <vscale x 16 x i8> @llvm.vp.sadd.sat.nxv16i8(<vscale x 16 x i8> %va, <vscale x 16 x i8> %b, <vscale x 16 x i1> %m, i32 %evl)
+ ret <vscale x 16 x i8> %v
+}
+
+define <vscale x 16 x i8> @vsadd_vv_nxv16i8_unmasked(<vscale x 16 x i8> %va, <vscale x 16 x i8> %b, i32 zeroext %evl) {
+; CHECK-LABEL: vsadd_vv_nxv16i8_unmasked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e8, m2, ta, ma
+; CHECK-NEXT: vsadd.vv v8, v8, v10
+; CHECK-NEXT: ret
+ %head = insertelement <vscale x 16 x i1> poison, i1 true, i32 0
+ %m = shufflevector <vscale x 16 x i1> %head, <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer
+ %v = call <vscale x 16 x i8> @llvm.vp.sadd.sat.nxv16i8(<vscale x 16 x i8> %va, <vscale x 16 x i8> %b, <vscale x 16 x i1> %m, i32 %evl)
+ ret <vscale x 16 x i8> %v
+}
+
+define <vscale x 16 x i8> @vsadd_vx_nxv16i8(<vscale x 16 x i8> %va, i8 %b, <vscale x 16 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vsadd_vx_nxv16i8:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e8, m2, ta, ma
+; CHECK-NEXT: vsadd.vx v8, v8, a0, v0.t
+; CHECK-NEXT: ret
+ %elt.head = insertelement <vscale x 16 x i8> poison, i8 %b, i32 0
+ %vb = shufflevector <vscale x 16 x i8> %elt.head, <vscale x 16 x i8> poison, <vscale x 16 x i32> zeroinitializer
+ %v = call <vscale x 16 x i8> @llvm.vp.sadd.sat.nxv16i8(<vscale x 16 x i8> %va, <vscale x 16 x i8> %vb, <vscale x 16 x i1> %m, i32 %evl)
+ ret <vscale x 16 x i8> %v
+}
+
+define <vscale x 16 x i8> @vsadd_vx_nxv16i8_unmasked(<vscale x 16 x i8> %va, i8 %b, i32 zeroext %evl) {
+; CHECK-LABEL: vsadd_vx_nxv16i8_unmasked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e8, m2, ta, ma
+; CHECK-NEXT: vsadd.vx v8, v8, a0
+; CHECK-NEXT: ret
+ %elt.head = insertelement <vscale x 16 x i8> poison, i8 %b, i32 0
+ %vb = shufflevector <vscale x 16 x i8> %elt.head, <vscale x 16 x i8> poison, <vscale x 16 x i32> zeroinitializer
+ %head = insertelement <vscale x 16 x i1> poison, i1 true, i32 0
+ %m = shufflevector <vscale x 16 x i1> %head, <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer
+ %v = call <vscale x 16 x i8> @llvm.vp.sadd.sat.nxv16i8(<vscale x 16 x i8> %va, <vscale x 16 x i8> %vb, <vscale x 16 x i1> %m, i32 %evl)
+ ret <vscale x 16 x i8> %v
+}
+
+define <vscale x 16 x i8> @vsadd_vi_nxv16i8(<vscale x 16 x i8> %va, <vscale x 16 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vsadd_vi_nxv16i8:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e8, m2, ta, ma
+; CHECK-NEXT: vsadd.vi v8, v8, -1, v0.t
+; CHECK-NEXT: ret
+ %elt.head = insertelement <vscale x 16 x i8> poison, i8 -1, i32 0
+ %vb = shufflevector <vscale x 16 x i8> %elt.head, <vscale x 16 x i8> poison, <vscale x 16 x i32> zeroinitializer
+ %v = call <vscale x 16 x i8> @llvm.vp.sadd.sat.nxv16i8(<vscale x 16 x i8> %va, <vscale x 16 x i8> %vb, <vscale x 16 x i1> %m, i32 %evl)
+ ret <vscale x 16 x i8> %v
+}
+
+define <vscale x 16 x i8> @vsadd_vi_nxv16i8_unmasked(<vscale x 16 x i8> %va, i32 zeroext %evl) {
+; CHECK-LABEL: vsadd_vi_nxv16i8_unmasked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e8, m2, ta, ma
+; CHECK-NEXT: vsadd.vi v8, v8, -1
+; CHECK-NEXT: ret
+ %elt.head = insertelement <vscale x 16 x i8> poison, i8 -1, i32 0
+ %vb = shufflevector <vscale x 16 x i8> %elt.head, <vscale x 16 x i8> poison, <vscale x 16 x i32> zeroinitializer
+ %head = insertelement <vscale x 16 x i1> poison, i1 true, i32 0
+ %m = shufflevector <vscale x 16 x i1> %head, <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer
+ %v = call <vscale x 16 x i8> @llvm.vp.sadd.sat.nxv16i8(<vscale x 16 x i8> %va, <vscale x 16 x i8> %vb, <vscale x 16 x i1> %m, i32 %evl)
+ ret <vscale x 16 x i8> %v
+}
+
+declare <vscale x 32 x i8> @llvm.vp.sadd.sat.nxv32i8(<vscale x 32 x i8>, <vscale x 32 x i8>, <vscale x 32 x i1>, i32)
+
+define <vscale x 32 x i8> @vsadd_vv_nxv32i8(<vscale x 32 x i8> %va, <vscale x 32 x i8> %b, <vscale x 32 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vsadd_vv_nxv32i8:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e8, m4, ta, ma
+; CHECK-NEXT: vsadd.vv v8, v8, v12, v0.t
+; CHECK-NEXT: ret
+ %v = call <vscale x 32 x i8> @llvm.vp.sadd.sat.nxv32i8(<vscale x 32 x i8> %va, <vscale x 32 x i8> %b, <vscale x 32 x i1> %m, i32 %evl)
+ ret <vscale x 32 x i8> %v
+}
+
+define <vscale x 32 x i8> @vsadd_vv_nxv32i8_unmasked(<vscale x 32 x i8> %va, <vscale x 32 x i8> %b, i32 zeroext %evl) {
+; CHECK-LABEL: vsadd_vv_nxv32i8_unmasked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e8, m4, ta, ma
+; CHECK-NEXT: vsadd.vv v8, v8, v12
+; CHECK-NEXT: ret
+ %head = insertelement <vscale x 32 x i1> poison, i1 true, i32 0
+ %m = shufflevector <vscale x 32 x i1> %head, <vscale x 32 x i1> poison, <vscale x 32 x i32> zeroinitializer
+ %v = call <vscale x 32 x i8> @llvm.vp.sadd.sat.nxv32i8(<vscale x 32 x i8> %va, <vscale x 32 x i8> %b, <vscale x 32 x i1> %m, i32 %evl)
+ ret <vscale x 32 x i8> %v
+}
+
+define <vscale x 32 x i8> @vsadd_vx_nxv32i8(<vscale x 32 x i8> %va, i8 %b, <vscale x 32 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vsadd_vx_nxv32i8:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e8, m4, ta, ma
+; CHECK-NEXT: vsadd.vx v8, v8, a0, v0.t
+; CHECK-NEXT: ret
+ %elt.head = insertelement <vscale x 32 x i8> poison, i8 %b, i32 0
+ %vb = shufflevector <vscale x 32 x i8> %elt.head, <vscale x 32 x i8> poison, <vscale x 32 x i32> zeroinitializer
+ %v = call <vscale x 32 x i8> @llvm.vp.sadd.sat.nxv32i8(<vscale x 32 x i8> %va, <vscale x 32 x i8> %vb, <vscale x 32 x i1> %m, i32 %evl)
+ ret <vscale x 32 x i8> %v
+}
+
+define <vscale x 32 x i8> @vsadd_vx_nxv32i8_unmasked(<vscale x 32 x i8> %va, i8 %b, i32 zeroext %evl) {
+; CHECK-LABEL: vsadd_vx_nxv32i8_unmasked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e8, m4, ta, ma
+; CHECK-NEXT: vsadd.vx v8, v8, a0
+; CHECK-NEXT: ret
+ %elt.head = insertelement <vscale x 32 x i8> poison, i8 %b, i32 0
+ %vb = shufflevector <vscale x 32 x i8> %elt.head, <vscale x 32 x i8> poison, <vscale x 32 x i32> zeroinitializer
+ %head = insertelement <vscale x 32 x i1> poison, i1 true, i32 0
+ %m = shufflevector <vscale x 32 x i1> %head, <vscale x 32 x i1> poison, <vscale x 32 x i32> zeroinitializer
+ %v = call <vscale x 32 x i8> @llvm.vp.sadd.sat.nxv32i8(<vscale x 32 x i8> %va, <vscale x 32 x i8> %vb, <vscale x 32 x i1> %m, i32 %evl)
+ ret <vscale x 32 x i8> %v
+}
+
+define <vscale x 32 x i8> @vsadd_vi_nxv32i8(<vscale x 32 x i8> %va, <vscale x 32 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vsadd_vi_nxv32i8:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e8, m4, ta, ma
+; CHECK-NEXT: vsadd.vi v8, v8, -1, v0.t
+; CHECK-NEXT: ret
+ %elt.head = insertelement <vscale x 32 x i8> poison, i8 -1, i32 0
+ %vb = shufflevector <vscale x 32 x i8> %elt.head, <vscale x 32 x i8> poison, <vscale x 32 x i32> zeroinitializer
+ %v = call <vscale x 32 x i8> @llvm.vp.sadd.sat.nxv32i8(<vscale x 32 x i8> %va, <vscale x 32 x i8> %vb, <vscale x 32 x i1> %m, i32 %evl)
+ ret <vscale x 32 x i8> %v
+}
+
+define <vscale x 32 x i8> @vsadd_vi_nxv32i8_unmasked(<vscale x 32 x i8> %va, i32 zeroext %evl) {
+; CHECK-LABEL: vsadd_vi_nxv32i8_unmasked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e8, m4, ta, ma
+; CHECK-NEXT: vsadd.vi v8, v8, -1
+; CHECK-NEXT: ret
+ %elt.head = insertelement <vscale x 32 x i8> poison, i8 -1, i32 0
+ %vb = shufflevector <vscale x 32 x i8> %elt.head, <vscale x 32 x i8> poison, <vscale x 32 x i32> zeroinitializer
+ %head = insertelement <vscale x 32 x i1> poison, i1 true, i32 0
+ %m = shufflevector <vscale x 32 x i1> %head, <vscale x 32 x i1> poison, <vscale x 32 x i32> zeroinitializer
+ %v = call <vscale x 32 x i8> @llvm.vp.sadd.sat.nxv32i8(<vscale x 32 x i8> %va, <vscale x 32 x i8> %vb, <vscale x 32 x i1> %m, i32 %evl)
+ ret <vscale x 32 x i8> %v
+}
+
+declare <vscale x 64 x i8> @llvm.vp.sadd.sat.nxv64i8(<vscale x 64 x i8>, <vscale x 64 x i8>, <vscale x 64 x i1>, i32)
+
+define <vscale x 64 x i8> @vsadd_vv_nxv64i8(<vscale x 64 x i8> %va, <vscale x 64 x i8> %b, <vscale x 64 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vsadd_vv_nxv64i8:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e8, m8, ta, ma
+; CHECK-NEXT: vsadd.vv v8, v8, v16, v0.t
+; CHECK-NEXT: ret
+ %v = call <vscale x 64 x i8> @llvm.vp.sadd.sat.nxv64i8(<vscale x 64 x i8> %va, <vscale x 64 x i8> %b, <vscale x 64 x i1> %m, i32 %evl)
+ ret <vscale x 64 x i8> %v
+}
+
+define <vscale x 64 x i8> @vsadd_vv_nxv64i8_unmasked(<vscale x 64 x i8> %va, <vscale x 64 x i8> %b, i32 zeroext %evl) {
+; CHECK-LABEL: vsadd_vv_nxv64i8_unmasked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e8, m8, ta, ma
+; CHECK-NEXT: vsadd.vv v8, v8, v16
+; CHECK-NEXT: ret
+ %head = insertelement <vscale x 64 x i1> poison, i1 true, i32 0
+ %m = shufflevector <vscale x 64 x i1> %head, <vscale x 64 x i1> poison, <vscale x 64 x i32> zeroinitializer
+ %v = call <vscale x 64 x i8> @llvm.vp.sadd.sat.nxv64i8(<vscale x 64 x i8> %va, <vscale x 64 x i8> %b, <vscale x 64 x i1> %m, i32 %evl)
+ ret <vscale x 64 x i8> %v
+}
+
+define <vscale x 64 x i8> @vsadd_vx_nxv64i8(<vscale x 64 x i8> %va, i8 %b, <vscale x 64 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vsadd_vx_nxv64i8:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e8, m8, ta, ma
+; CHECK-NEXT: vsadd.vx v8, v8, a0, v0.t
+; CHECK-NEXT: ret
+ %elt.head = insertelement <vscale x 64 x i8> poison, i8 %b, i32 0
+ %vb = shufflevector <vscale x 64 x i8> %elt.head, <vscale x 64 x i8> poison, <vscale x 64 x i32> zeroinitializer
+ %v = call <vscale x 64 x i8> @llvm.vp.sadd.sat.nxv64i8(<vscale x 64 x i8> %va, <vscale x 64 x i8> %vb, <vscale x 64 x i1> %m, i32 %evl)
+ ret <vscale x 64 x i8> %v
+}
+
+define <vscale x 64 x i8> @vsadd_vx_nxv64i8_unmasked(<vscale x 64 x i8> %va, i8 %b, i32 zeroext %evl) {
+; CHECK-LABEL: vsadd_vx_nxv64i8_unmasked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e8, m8, ta, ma
+; CHECK-NEXT: vsadd.vx v8, v8, a0
+; CHECK-NEXT: ret
+ %elt.head = insertelement <vscale x 64 x i8> poison, i8 %b, i32 0
+ %vb = shufflevector <vscale x 64 x i8> %elt.head, <vscale x 64 x i8> poison, <vscale x 64 x i32> zeroinitializer
+ %head = insertelement <vscale x 64 x i1> poison, i1 true, i32 0
+ %m = shufflevector <vscale x 64 x i1> %head, <vscale x 64 x i1> poison, <vscale x 64 x i32> zeroinitializer
+ %v = call <vscale x 64 x i8> @llvm.vp.sadd.sat.nxv64i8(<vscale x 64 x i8> %va, <vscale x 64 x i8> %vb, <vscale x 64 x i1> %m, i32 %evl)
+ ret <vscale x 64 x i8> %v
+}
+
+define <vscale x 64 x i8> @vsadd_vi_nxv64i8(<vscale x 64 x i8> %va, <vscale x 64 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vsadd_vi_nxv64i8:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e8, m8, ta, ma
+; CHECK-NEXT: vsadd.vi v8, v8, -1, v0.t
+; CHECK-NEXT: ret
+ %elt.head = insertelement <vscale x 64 x i8> poison, i8 -1, i32 0
+ %vb = shufflevector <vscale x 64 x i8> %elt.head, <vscale x 64 x i8> poison, <vscale x 64 x i32> zeroinitializer
+ %v = call <vscale x 64 x i8> @llvm.vp.sadd.sat.nxv64i8(<vscale x 64 x i8> %va, <vscale x 64 x i8> %vb, <vscale x 64 x i1> %m, i32 %evl)
+ ret <vscale x 64 x i8> %v
+}
+
+define <vscale x 64 x i8> @vsadd_vi_nxv64i8_unmasked(<vscale x 64 x i8> %va, i32 zeroext %evl) {
+; CHECK-LABEL: vsadd_vi_nxv64i8_unmasked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e8, m8, ta, ma
+; CHECK-NEXT: vsadd.vi v8, v8, -1
+; CHECK-NEXT: ret
+ %elt.head = insertelement <vscale x 64 x i8> poison, i8 -1, i32 0
+ %vb = shufflevector <vscale x 64 x i8> %elt.head, <vscale x 64 x i8> poison, <vscale x 64 x i32> zeroinitializer
+ %head = insertelement <vscale x 64 x i1> poison, i1 true, i32 0
+ %m = shufflevector <vscale x 64 x i1> %head, <vscale x 64 x i1> poison, <vscale x 64 x i32> zeroinitializer
+ %v = call <vscale x 64 x i8> @llvm.vp.sadd.sat.nxv64i8(<vscale x 64 x i8> %va, <vscale x 64 x i8> %vb, <vscale x 64 x i1> %m, i32 %evl)
+ ret <vscale x 64 x i8> %v
+}
+
+; Test that split-legalization works when the mask itself needs splitting.
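+; nxv128i8 is twice the widest single register group (nxv64i8 at e8/m8), so the
+; operands, the mask and the EVL all get split in two. In the checks below,
+; vlenb << 3 recomputes VLMAX for e8/m8; the sub/sltu/addi/and sequence is a
+; branchless max(evl - VLMAX, 0) used as the EVL of the high half, and the
+; bltu/mv pair clamps the low half to min(evl, VLMAX). In the masked variant
+; the low half of the mask is kept in v24 while the high half is reloaded into
+; v0 with vlm.v from the address in a0.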
+
+declare <vscale x 128 x i8> @llvm.vp.sadd.sat.nxv128i8(<vscale x 128 x i8>, <vscale x 128 x i8>, <vscale x 128 x i1>, i32)
+
+define <vscale x 128 x i8> @vsadd_vi_nxv128i8(<vscale x 128 x i8> %va, <vscale x 128 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vsadd_vi_nxv128i8:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmv1r.v v24, v0
+; CHECK-NEXT: vsetvli a2, zero, e8, m8, ta, ma
+; CHECK-NEXT: vlm.v v0, (a0)
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: slli a0, a0, 3
+; CHECK-NEXT: sub a2, a1, a0
+; CHECK-NEXT: sltu a3, a1, a2
+; CHECK-NEXT: addi a3, a3, -1
+; CHECK-NEXT: and a2, a3, a2
+; CHECK-NEXT: vsetvli zero, a2, e8, m8, ta, ma
+; CHECK-NEXT: vsadd.vi v16, v16, -1, v0.t
+; CHECK-NEXT: bltu a1, a0, .LBB50_2
+; CHECK-NEXT: # %bb.1:
+; CHECK-NEXT: mv a1, a0
+; CHECK-NEXT: .LBB50_2:
+; CHECK-NEXT: vsetvli zero, a1, e8, m8, ta, ma
+; CHECK-NEXT: vmv1r.v v0, v24
+; CHECK-NEXT: vsadd.vi v8, v8, -1, v0.t
+; CHECK-NEXT: ret
+ %elt.head = insertelement <vscale x 128 x i8> poison, i8 -1, i32 0
+ %vb = shufflevector <vscale x 128 x i8> %elt.head, <vscale x 128 x i8> poison, <vscale x 128 x i32> zeroinitializer
+ %v = call <vscale x 128 x i8> @llvm.vp.sadd.sat.nxv128i8(<vscale x 128 x i8> %va, <vscale x 128 x i8> %vb, <vscale x 128 x i1> %m, i32 %evl)
+ ret <vscale x 128 x i8> %v
+}
+
+define <vscale x 128 x i8> @vsadd_vi_nxv128i8_unmasked(<vscale x 128 x i8> %va, i32 zeroext %evl) {
+; CHECK-LABEL: vsadd_vi_nxv128i8_unmasked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: csrr a1, vlenb
+; CHECK-NEXT: slli a1, a1, 3
+; CHECK-NEXT: sub a2, a0, a1
+; CHECK-NEXT: sltu a3, a0, a2
+; CHECK-NEXT: addi a3, a3, -1
+; CHECK-NEXT: and a2, a3, a2
+; CHECK-NEXT: vsetvli zero, a2, e8, m8, ta, ma
+; CHECK-NEXT: vsadd.vi v16, v16, -1
+; CHECK-NEXT: bltu a0, a1, .LBB51_2
+; CHECK-NEXT: # %bb.1:
+; CHECK-NEXT: mv a0, a1
+; CHECK-NEXT: .LBB51_2:
+; CHECK-NEXT: vsetvli zero, a0, e8, m8, ta, ma
+; CHECK-NEXT: vsadd.vi v8, v8, -1
+; CHECK-NEXT: ret
+ %elt.head = insertelement <vscale x 128 x i8> poison, i8 -1, i32 0
+ %vb = shufflevector <vscale x 128 x i8> %elt.head, <vscale x 128 x i8> poison, <vscale x 128 x i32> zeroinitializer
+ %head = insertelement <vscale x 128 x i1> poison, i1 true, i32 0
+ %m = shufflevector <vscale x 128 x i1> %head, <vscale x 128 x i1> poison, <vscale x 128 x i32> zeroinitializer
+ %v = call <vscale x 128 x i8> @llvm.vp.sadd.sat.nxv128i8(<vscale x 128 x i8> %va, <vscale x 128 x i8> %vb, <vscale x 128 x i1> %m, i32 %evl)
+ ret <vscale x 128 x i8> %v
+}
+
+declare <vscale x 1 x i16> @llvm.vp.sadd.sat.nxv1i16(<vscale x 1 x i16>, <vscale x 1 x i16>, <vscale x 1 x i1>, i32)
+
+define <vscale x 1 x i16> @vsadd_vv_nxv1i16(<vscale x 1 x i16> %va, <vscale x 1 x i16> %b, <vscale x 1 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vsadd_vv_nxv1i16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e16, mf4, ta, ma
+; CHECK-NEXT: vsadd.vv v8, v8, v9, v0.t
+; CHECK-NEXT: ret
+ %v = call <vscale x 1 x i16> @llvm.vp.sadd.sat.nxv1i16(<vscale x 1 x i16> %va, <vscale x 1 x i16> %b, <vscale x 1 x i1> %m, i32 %evl)
+ ret <vscale x 1 x i16> %v
+}
+
+define <vscale x 1 x i16> @vsadd_vv_nxv1i16_unmasked(<vscale x 1 x i16> %va, <vscale x 1 x i16> %b, i32 zeroext %evl) {
+; CHECK-LABEL: vsadd_vv_nxv1i16_unmasked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e16, mf4, ta, ma
+; CHECK-NEXT: vsadd.vv v8, v8, v9
+; CHECK-NEXT: ret
+ %head = insertelement <vscale x 1 x i1> poison, i1 true, i32 0
+ %m = shufflevector <vscale x 1 x i1> %head, <vscale x 1 x i1> poison, <vscale x 1 x i32> zeroinitializer
+ %v = call <vscale x 1 x i16> @llvm.vp.sadd.sat.nxv1i16(<vscale x 1 x i16> %va, <vscale x 1 x i16> %b, <vscale x 1 x i1> %m, i32 %evl)
+ ret <vscale x 1 x i16> %v
+}
+
+define <vscale x 1 x i16> @vsadd_vx_nxv1i16(<vscale x 1 x i16> %va, i16 %b, <vscale x 1 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vsadd_vx_nxv1i16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma
+; CHECK-NEXT: vsadd.vx v8, v8, a0, v0.t
+; CHECK-NEXT: ret
+ %elt.head = insertelement <vscale x 1 x i16> poison, i16 %b, i32 0
+ %vb = shufflevector <vscale x 1 x i16> %elt.head, <vscale x 1 x i16> poison, <vscale x 1 x i32> zeroinitializer
+ %v = call <vscale x 1 x i16> @llvm.vp.sadd.sat.nxv1i16(<vscale x 1 x i16> %va, <vscale x 1 x i16> %vb, <vscale x 1 x i1> %m, i32 %evl)
+ ret <vscale x 1 x i16> %v
+}
+
+define <vscale x 1 x i16> @vsadd_vx_nxv1i16_unmasked(<vscale x 1 x i16> %va, i16 %b, i32 zeroext %evl) {
+; CHECK-LABEL: vsadd_vx_nxv1i16_unmasked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma
+; CHECK-NEXT: vsadd.vx v8, v8, a0
+; CHECK-NEXT: ret
+ %elt.head = insertelement <vscale x 1 x i16> poison, i16 %b, i32 0
+ %vb = shufflevector <vscale x 1 x i16> %elt.head, <vscale x 1 x i16> poison, <vscale x 1 x i32> zeroinitializer
+ %head = insertelement <vscale x 1 x i1> poison, i1 true, i32 0
+ %m = shufflevector <vscale x 1 x i1> %head, <vscale x 1 x i1> poison, <vscale x 1 x i32> zeroinitializer
+ %v = call <vscale x 1 x i16> @llvm.vp.sadd.sat.nxv1i16(<vscale x 1 x i16> %va, <vscale x 1 x i16> %vb, <vscale x 1 x i1> %m, i32 %evl)
+ ret <vscale x 1 x i16> %v
+}
+
+define <vscale x 1 x i16> @vsadd_vi_nxv1i16(<vscale x 1 x i16> %va, <vscale x 1 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vsadd_vi_nxv1i16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e16, mf4, ta, ma
+; CHECK-NEXT: vsadd.vi v8, v8, -1, v0.t
+; CHECK-NEXT: ret
+ %elt.head = insertelement <vscale x 1 x i16> poison, i16 -1, i32 0
+ %vb = shufflevector <vscale x 1 x i16> %elt.head, <vscale x 1 x i16> poison, <vscale x 1 x i32> zeroinitializer
+ %v = call <vscale x 1 x i16> @llvm.vp.sadd.sat.nxv1i16(<vscale x 1 x i16> %va, <vscale x 1 x i16> %vb, <vscale x 1 x i1> %m, i32 %evl)
+ ret <vscale x 1 x i16> %v
+}
+
+define <vscale x 1 x i16> @vsadd_vi_nxv1i16_unmasked(<vscale x 1 x i16> %va, i32 zeroext %evl) {
+; CHECK-LABEL: vsadd_vi_nxv1i16_unmasked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e16, mf4, ta, ma
+; CHECK-NEXT: vsadd.vi v8, v8, -1
+; CHECK-NEXT: ret
+ %elt.head = insertelement <vscale x 1 x i16> poison, i16 -1, i32 0
+ %vb = shufflevector <vscale x 1 x i16> %elt.head, <vscale x 1 x i16> poison, <vscale x 1 x i32> zeroinitializer
+ %head = insertelement <vscale x 1 x i1> poison, i1 true, i32 0
+ %m = shufflevector <vscale x 1 x i1> %head, <vscale x 1 x i1> poison, <vscale x 1 x i32> zeroinitializer
+ %v = call <vscale x 1 x i16> @llvm.vp.sadd.sat.nxv1i16(<vscale x 1 x i16> %va, <vscale x 1 x i16> %vb, <vscale x 1 x i1> %m, i32 %evl)
+ ret <vscale x 1 x i16> %v
+}
+
+declare <vscale x 2 x i16> @llvm.vp.sadd.sat.nxv2i16(<vscale x 2 x i16>, <vscale x 2 x i16>, <vscale x 2 x i1>, i32)
+
+define <vscale x 2 x i16> @vsadd_vv_nxv2i16(<vscale x 2 x i16> %va, <vscale x 2 x i16> %b, <vscale x 2 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vsadd_vv_nxv2i16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e16, mf2, ta, ma
+; CHECK-NEXT: vsadd.vv v8, v8, v9, v0.t
+; CHECK-NEXT: ret
+ %v = call <vscale x 2 x i16> @llvm.vp.sadd.sat.nxv2i16(<vscale x 2 x i16> %va, <vscale x 2 x i16> %b, <vscale x 2 x i1> %m, i32 %evl)
+ ret <vscale x 2 x i16> %v
+}
+
+define <vscale x 2 x i16> @vsadd_vv_nxv2i16_unmasked(<vscale x 2 x i16> %va, <vscale x 2 x i16> %b, i32 zeroext %evl) {
+; CHECK-LABEL: vsadd_vv_nxv2i16_unmasked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e16, mf2, ta, ma
+; CHECK-NEXT: vsadd.vv v8, v8, v9
+; CHECK-NEXT: ret
+ %head = insertelement <vscale x 2 x i1> poison, i1 true, i32 0
+ %m = shufflevector <vscale x 2 x i1> %head, <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer
+ %v = call <vscale x 2 x i16> @llvm.vp.sadd.sat.nxv2i16(<vscale x 2 x i16> %va, <vscale x 2 x i16> %b, <vscale x 2 x i1> %m, i32 %evl)
+ ret <vscale x 2 x i16> %v
+}
+
+define <vscale x 2 x i16> @vsadd_vx_nxv2i16(<vscale x 2 x i16> %va, i16 %b, <vscale x 2 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vsadd_vx_nxv2i16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma
+; CHECK-NEXT: vsadd.vx v8, v8, a0, v0.t
+; CHECK-NEXT: ret
+ %elt.head = insertelement <vscale x 2 x i16> poison, i16 %b, i32 0
+ %vb = shufflevector <vscale x 2 x i16> %elt.head, <vscale x 2 x i16> poison, <vscale x 2 x i32> zeroinitializer
+ %v = call <vscale x 2 x i16> @llvm.vp.sadd.sat.nxv2i16(<vscale x 2 x i16> %va, <vscale x 2 x i16> %vb, <vscale x 2 x i1> %m, i32 %evl)
+ ret <vscale x 2 x i16> %v
+}
+
+define <vscale x 2 x i16> @vsadd_vx_nxv2i16_unmasked(<vscale x 2 x i16> %va, i16 %b, i32 zeroext %evl) {
+; CHECK-LABEL: vsadd_vx_nxv2i16_unmasked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma
+; CHECK-NEXT: vsadd.vx v8, v8, a0
+; CHECK-NEXT: ret
+ %elt.head = insertelement <vscale x 2 x i16> poison, i16 %b, i32 0
+ %vb = shufflevector <vscale x 2 x i16> %elt.head, <vscale x 2 x i16> poison, <vscale x 2 x i32> zeroinitializer
+ %head = insertelement <vscale x 2 x i1> poison, i1 true, i32 0
+ %m = shufflevector <vscale x 2 x i1> %head, <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer
+ %v = call <vscale x 2 x i16> @llvm.vp.sadd.sat.nxv2i16(<vscale x 2 x i16> %va, <vscale x 2 x i16> %vb, <vscale x 2 x i1> %m, i32 %evl)
+ ret <vscale x 2 x i16> %v
+}
+
+define <vscale x 2 x i16> @vsadd_vi_nxv2i16(<vscale x 2 x i16> %va, <vscale x 2 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vsadd_vi_nxv2i16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e16, mf2, ta, ma
+; CHECK-NEXT: vsadd.vi v8, v8, -1, v0.t
+; CHECK-NEXT: ret
+ %elt.head = insertelement <vscale x 2 x i16> poison, i16 -1, i32 0
+ %vb = shufflevector <vscale x 2 x i16> %elt.head, <vscale x 2 x i16> poison, <vscale x 2 x i32> zeroinitializer
+ %v = call <vscale x 2 x i16> @llvm.vp.sadd.sat.nxv2i16(<vscale x 2 x i16> %va, <vscale x 2 x i16> %vb, <vscale x 2 x i1> %m, i32 %evl)
+ ret <vscale x 2 x i16> %v
+}
+
+define <vscale x 2 x i16> @vsadd_vi_nxv2i16_unmasked(<vscale x 2 x i16> %va, i32 zeroext %evl) {
+; CHECK-LABEL: vsadd_vi_nxv2i16_unmasked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e16, mf2, ta, ma
+; CHECK-NEXT: vsadd.vi v8, v8, -1
+; CHECK-NEXT: ret
+ %elt.head = insertelement <vscale x 2 x i16> poison, i16 -1, i32 0
+ %vb = shufflevector <vscale x 2 x i16> %elt.head, <vscale x 2 x i16> poison, <vscale x 2 x i32> zeroinitializer
+ %head = insertelement <vscale x 2 x i1> poison, i1 true, i32 0
+ %m = shufflevector <vscale x 2 x i1> %head, <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer
+ %v = call <vscale x 2 x i16> @llvm.vp.sadd.sat.nxv2i16(<vscale x 2 x i16> %va, <vscale x 2 x i16> %vb, <vscale x 2 x i1> %m, i32 %evl)
+ ret <vscale x 2 x i16> %v
+}
+
+declare <vscale x 4 x i16> @llvm.vp.sadd.sat.nxv4i16(<vscale x 4 x i16>, <vscale x 4 x i16>, <vscale x 4 x i1>, i32)
+
+define <vscale x 4 x i16> @vsadd_vv_nxv4i16(<vscale x 4 x i16> %va, <vscale x 4 x i16> %b, <vscale x 4 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vsadd_vv_nxv4i16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, ma
+; CHECK-NEXT: vsadd.vv v8, v8, v9, v0.t
+; CHECK-NEXT: ret
+ %v = call <vscale x 4 x i16> @llvm.vp.sadd.sat.nxv4i16(<vscale x 4 x i16> %va, <vscale x 4 x i16> %b, <vscale x 4 x i1> %m, i32 %evl)
+ ret <vscale x 4 x i16> %v
+}
+
+define <vscale x 4 x i16> @vsadd_vv_nxv4i16_unmasked(<vscale x 4 x i16> %va, <vscale x 4 x i16> %b, i32 zeroext %evl) {
+; CHECK-LABEL: vsadd_vv_nxv4i16_unmasked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, ma
+; CHECK-NEXT: vsadd.vv v8, v8, v9
+; CHECK-NEXT: ret
+ %head = insertelement <vscale x 4 x i1> poison, i1 true, i32 0
+ %m = shufflevector <vscale x 4 x i1> %head, <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer
+ %v = call <vscale x 4 x i16> @llvm.vp.sadd.sat.nxv4i16(<vscale x 4 x i16> %va, <vscale x 4 x i16> %b, <vscale x 4 x i1> %m, i32 %evl)
+ ret <vscale x 4 x i16> %v
+}
+
+define <vscale x 4 x i16> @vsadd_vx_nxv4i16(<vscale x 4 x i16> %va, i16 %b, <vscale x 4 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vsadd_vx_nxv4i16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma
+; CHECK-NEXT: vsadd.vx v8, v8, a0, v0.t
+; CHECK-NEXT: ret
+ %elt.head = insertelement <vscale x 4 x i16> poison, i16 %b, i32 0
+ %vb = shufflevector <vscale x 4 x i16> %elt.head, <vscale x 4 x i16> poison, <vscale x 4 x i32> zeroinitializer
+ %v = call <vscale x 4 x i16> @llvm.vp.sadd.sat.nxv4i16(<vscale x 4 x i16> %va, <vscale x 4 x i16> %vb, <vscale x 4 x i1> %m, i32 %evl)
+ ret <vscale x 4 x i16> %v
+}
+
+define <vscale x 4 x i16> @vsadd_vx_nxv4i16_unmasked(<vscale x 4 x i16> %va, i16 %b, i32 zeroext %evl) {
+; CHECK-LABEL: vsadd_vx_nxv4i16_unmasked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma
+; CHECK-NEXT: vsadd.vx v8, v8, a0
+; CHECK-NEXT: ret
+ %elt.head = insertelement <vscale x 4 x i16> poison, i16 %b, i32 0
+ %vb = shufflevector <vscale x 4 x i16> %elt.head, <vscale x 4 x i16> poison, <vscale x 4 x i32> zeroinitializer
+ %head = insertelement <vscale x 4 x i1> poison, i1 true, i32 0
+ %m = shufflevector <vscale x 4 x i1> %head, <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer
+ %v = call <vscale x 4 x i16> @llvm.vp.sadd.sat.nxv4i16(<vscale x 4 x i16> %va, <vscale x 4 x i16> %vb, <vscale x 4 x i1> %m, i32 %evl)
+ ret <vscale x 4 x i16> %v
+}
+
+define <vscale x 4 x i16> @vsadd_vi_nxv4i16(<vscale x 4 x i16> %va, <vscale x 4 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vsadd_vi_nxv4i16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, ma
+; CHECK-NEXT: vsadd.vi v8, v8, -1, v0.t
+; CHECK-NEXT: ret
+ %elt.head = insertelement <vscale x 4 x i16> poison, i16 -1, i32 0
+ %vb = shufflevector <vscale x 4 x i16> %elt.head, <vscale x 4 x i16> poison, <vscale x 4 x i32> zeroinitializer
+ %v = call <vscale x 4 x i16> @llvm.vp.sadd.sat.nxv4i16(<vscale x 4 x i16> %va, <vscale x 4 x i16> %vb, <vscale x 4 x i1> %m, i32 %evl)
+ ret <vscale x 4 x i16> %v
+}
+
+define <vscale x 4 x i16> @vsadd_vi_nxv4i16_unmasked(<vscale x 4 x i16> %va, i32 zeroext %evl) {
+; CHECK-LABEL: vsadd_vi_nxv4i16_unmasked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, ma
+; CHECK-NEXT: vsadd.vi v8, v8, -1
+; CHECK-NEXT: ret
+ %elt.head = insertelement <vscale x 4 x i16> poison, i16 -1, i32 0
+ %vb = shufflevector <vscale x 4 x i16> %elt.head, <vscale x 4 x i16> poison, <vscale x 4 x i32> zeroinitializer
+ %head = insertelement <vscale x 4 x i1> poison, i1 true, i32 0
+ %m = shufflevector <vscale x 4 x i1> %head, <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer
+ %v = call <vscale x 4 x i16> @llvm.vp.sadd.sat.nxv4i16(<vscale x 4 x i16> %va, <vscale x 4 x i16> %vb, <vscale x 4 x i1> %m, i32 %evl)
+ ret <vscale x 4 x i16> %v
+}
+
+declare <vscale x 8 x i16> @llvm.vp.sadd.sat.nxv8i16(<vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i1>, i32)
+
+define <vscale x 8 x i16> @vsadd_vv_nxv8i16(<vscale x 8 x i16> %va, <vscale x 8 x i16> %b, <vscale x 8 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vsadd_vv_nxv8i16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e16, m2, ta, ma
+; CHECK-NEXT: vsadd.vv v8, v8, v10, v0.t
+; CHECK-NEXT: ret
+ %v = call <vscale x 8 x i16> @llvm.vp.sadd.sat.nxv8i16(<vscale x 8 x i16> %va, <vscale x 8 x i16> %b, <vscale x 8 x i1> %m, i32 %evl)
+ ret <vscale x 8 x i16> %v
+}
+
+define <vscale x 8 x i16> @vsadd_vv_nxv8i16_unmasked(<vscale x 8 x i16> %va, <vscale x 8 x i16> %b, i32 zeroext %evl) {
+; CHECK-LABEL: vsadd_vv_nxv8i16_unmasked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e16, m2, ta, ma
+; CHECK-NEXT: vsadd.vv v8, v8, v10
+; CHECK-NEXT: ret
+ %head = insertelement <vscale x 8 x i1> poison, i1 true, i32 0
+ %m = shufflevector <vscale x 8 x i1> %head, <vscale x 8 x i1> poison, <vscale x 8 x i32> zeroinitializer
+ %v = call <vscale x 8 x i16> @llvm.vp.sadd.sat.nxv8i16(<vscale x 8 x i16> %va, <vscale x 8 x i16> %b, <vscale x 8 x i1> %m, i32 %evl)
+ ret <vscale x 8 x i16> %v
+}
+
+define <vscale x 8 x i16> @vsadd_vx_nxv8i16(<vscale x 8 x i16> %va, i16 %b, <vscale x 8 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vsadd_vx_nxv8i16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, ma
+; CHECK-NEXT: vsadd.vx v8, v8, a0, v0.t
+; CHECK-NEXT: ret
+ %elt.head = insertelement <vscale x 8 x i16> poison, i16 %b, i32 0
+ %vb = shufflevector <vscale x 8 x i16> %elt.head, <vscale x 8 x i16> poison, <vscale x 8 x i32> zeroinitializer
+ %v = call <vscale x 8 x i16> @llvm.vp.sadd.sat.nxv8i16(<vscale x 8 x i16> %va, <vscale x 8 x i16> %vb, <vscale x 8 x i1> %m, i32 %evl)
+ ret <vscale x 8 x i16> %v
+}
+
+define <vscale x 8 x i16> @vsadd_vx_nxv8i16_unmasked(<vscale x 8 x i16> %va, i16 %b, i32 zeroext %evl) {
+; CHECK-LABEL: vsadd_vx_nxv8i16_unmasked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, ma
+; CHECK-NEXT: vsadd.vx v8, v8, a0
+; CHECK-NEXT: ret
+ %elt.head = insertelement <vscale x 8 x i16> poison, i16 %b, i32 0
+ %vb = shufflevector <vscale x 8 x i16> %elt.head, <vscale x 8 x i16> poison, <vscale x 8 x i32> zeroinitializer
+ %head = insertelement <vscale x 8 x i1> poison, i1 true, i32 0
+ %m = shufflevector <vscale x 8 x i1> %head, <vscale x 8 x i1> poison, <vscale x 8 x i32> zeroinitializer
+ %v = call <vscale x 8 x i16> @llvm.vp.sadd.sat.nxv8i16(<vscale x 8 x i16> %va, <vscale x 8 x i16> %vb, <vscale x 8 x i1> %m, i32 %evl)
+ ret <vscale x 8 x i16> %v
+}
+
+define <vscale x 8 x i16> @vsadd_vi_nxv8i16(<vscale x 8 x i16> %va, <vscale x 8 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vsadd_vi_nxv8i16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e16, m2, ta, ma
+; CHECK-NEXT: vsadd.vi v8, v8, -1, v0.t
+; CHECK-NEXT: ret
+ %elt.head = insertelement <vscale x 8 x i16> poison, i16 -1, i32 0
+ %vb = shufflevector <vscale x 8 x i16> %elt.head, <vscale x 8 x i16> poison, <vscale x 8 x i32> zeroinitializer
+ %v = call <vscale x 8 x i16> @llvm.vp.sadd.sat.nxv8i16(<vscale x 8 x i16> %va, <vscale x 8 x i16> %vb, <vscale x 8 x i1> %m, i32 %evl)
+ ret <vscale x 8 x i16> %v
+}
+
+define <vscale x 8 x i16> @vsadd_vi_nxv8i16_unmasked(<vscale x 8 x i16> %va, i32 zeroext %evl) {
+; CHECK-LABEL: vsadd_vi_nxv8i16_unmasked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e16, m2, ta, ma
+; CHECK-NEXT: vsadd.vi v8, v8, -1
+; CHECK-NEXT: ret
+ %elt.head = insertelement <vscale x 8 x i16> poison, i16 -1, i32 0
+ %vb = shufflevector <vscale x 8 x i16> %elt.head, <vscale x 8 x i16> poison, <vscale x 8 x i32> zeroinitializer
+ %head = insertelement <vscale x 8 x i1> poison, i1 true, i32 0
+ %m = shufflevector <vscale x 8 x i1> %head, <vscale x 8 x i1> poison, <vscale x 8 x i32> zeroinitializer
+ %v = call <vscale x 8 x i16> @llvm.vp.sadd.sat.nxv8i16(<vscale x 8 x i16> %va, <vscale x 8 x i16> %vb, <vscale x 8 x i1> %m, i32 %evl)
+ ret <vscale x 8 x i16> %v
+}
+
+declare <vscale x 16 x i16> @llvm.vp.sadd.sat.nxv16i16(<vscale x 16 x i16>, <vscale x 16 x i16>, <vscale x 16 x i1>, i32)
+
+define <vscale x 16 x i16> @vsadd_vv_nxv16i16(<vscale x 16 x i16> %va, <vscale x 16 x i16> %b, <vscale x 16 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vsadd_vv_nxv16i16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e16, m4, ta, ma
+; CHECK-NEXT: vsadd.vv v8, v8, v12, v0.t
+; CHECK-NEXT: ret
+ %v = call <vscale x 16 x i16> @llvm.vp.sadd.sat.nxv16i16(<vscale x 16 x i16> %va, <vscale x 16 x i16> %b, <vscale x 16 x i1> %m, i32 %evl)
+ ret <vscale x 16 x i16> %v
+}
+
+define <vscale x 16 x i16> @vsadd_vv_nxv16i16_unmasked(<vscale x 16 x i16> %va, <vscale x 16 x i16> %b, i32 zeroext %evl) {
+; CHECK-LABEL: vsadd_vv_nxv16i16_unmasked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e16, m4, ta, ma
+; CHECK-NEXT: vsadd.vv v8, v8, v12
+; CHECK-NEXT: ret
+ %head = insertelement <vscale x 16 x i1> poison, i1 true, i32 0
+ %m = shufflevector <vscale x 16 x i1> %head, <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer
+ %v = call <vscale x 16 x i16> @llvm.vp.sadd.sat.nxv16i16(<vscale x 16 x i16> %va, <vscale x 16 x i16> %b, <vscale x 16 x i1> %m, i32 %evl)
+ ret <vscale x 16 x i16> %v
+}
+
+define <vscale x 16 x i16> @vsadd_vx_nxv16i16(<vscale x 16 x i16> %va, i16 %b, <vscale x 16 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vsadd_vx_nxv16i16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e16, m4, ta, ma
+; CHECK-NEXT: vsadd.vx v8, v8, a0, v0.t
+; CHECK-NEXT: ret
+ %elt.head = insertelement <vscale x 16 x i16> poison, i16 %b, i32 0
+ %vb = shufflevector <vscale x 16 x i16> %elt.head, <vscale x 16 x i16> poison, <vscale x 16 x i32> zeroinitializer
+ %v = call <vscale x 16 x i16> @llvm.vp.sadd.sat.nxv16i16(<vscale x 16 x i16> %va, <vscale x 16 x i16> %vb, <vscale x 16 x i1> %m, i32 %evl)
+ ret <vscale x 16 x i16> %v
+}
+
+define <vscale x 16 x i16> @vsadd_vx_nxv16i16_unmasked(<vscale x 16 x i16> %va, i16 %b, i32 zeroext %evl) {
+; CHECK-LABEL: vsadd_vx_nxv16i16_unmasked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e16, m4, ta, ma
+; CHECK-NEXT: vsadd.vx v8, v8, a0
+; CHECK-NEXT: ret
+ %elt.head = insertelement <vscale x 16 x i16> poison, i16 %b, i32 0
+ %vb = shufflevector <vscale x 16 x i16> %elt.head, <vscale x 16 x i16> poison, <vscale x 16 x i32> zeroinitializer
+ %head = insertelement <vscale x 16 x i1> poison, i1 true, i32 0
+ %m = shufflevector <vscale x 16 x i1> %head, <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer
+ %v = call <vscale x 16 x i16> @llvm.vp.sadd.sat.nxv16i16(<vscale x 16 x i16> %va, <vscale x 16 x i16> %vb, <vscale x 16 x i1> %m, i32 %evl)
+ ret <vscale x 16 x i16> %v
+}
+
+define <vscale x 16 x i16> @vsadd_vi_nxv16i16(<vscale x 16 x i16> %va, <vscale x 16 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vsadd_vi_nxv16i16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e16, m4, ta, ma
+; CHECK-NEXT: vsadd.vi v8, v8, -1, v0.t
+; CHECK-NEXT: ret
+ %elt.head = insertelement <vscale x 16 x i16> poison, i16 -1, i32 0
+ %vb = shufflevector <vscale x 16 x i16> %elt.head, <vscale x 16 x i16> poison, <vscale x 16 x i32> zeroinitializer
+ %v = call <vscale x 16 x i16> @llvm.vp.sadd.sat.nxv16i16(<vscale x 16 x i16> %va, <vscale x 16 x i16> %vb, <vscale x 16 x i1> %m, i32 %evl)
+ ret <vscale x 16 x i16> %v
+}
+
+define <vscale x 16 x i16> @vsadd_vi_nxv16i16_unmasked(<vscale x 16 x i16> %va, i32 zeroext %evl) {
+; CHECK-LABEL: vsadd_vi_nxv16i16_unmasked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e16, m4, ta, ma
+; CHECK-NEXT: vsadd.vi v8, v8, -1
+; CHECK-NEXT: ret
+ %elt.head = insertelement <vscale x 16 x i16> poison, i16 -1, i32 0
+ %vb = shufflevector <vscale x 16 x i16> %elt.head, <vscale x 16 x i16> poison, <vscale x 16 x i32> zeroinitializer
+ %head = insertelement <vscale x 16 x i1> poison, i1 true, i32 0
+ %m = shufflevector <vscale x 16 x i1> %head, <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer
+ %v = call <vscale x 16 x i16> @llvm.vp.sadd.sat.nxv16i16(<vscale x 16 x i16> %va, <vscale x 16 x i16> %vb, <vscale x 16 x i1> %m, i32 %evl)
+ ret <vscale x 16 x i16> %v
+}
+
+declare <vscale x 32 x i16> @llvm.vp.sadd.sat.nxv32i16(<vscale x 32 x i16>, <vscale x 32 x i16>, <vscale x 32 x i1>, i32)
+
+define <vscale x 32 x i16> @vsadd_vv_nxv32i16(<vscale x 32 x i16> %va, <vscale x 32 x i16> %b, <vscale x 32 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vsadd_vv_nxv32i16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e16, m8, ta, ma
+; CHECK-NEXT: vsadd.vv v8, v8, v16, v0.t
+; CHECK-NEXT: ret
+ %v = call <vscale x 32 x i16> @llvm.vp.sadd.sat.nxv32i16(<vscale x 32 x i16> %va, <vscale x 32 x i16> %b, <vscale x 32 x i1> %m, i32 %evl)
+ ret <vscale x 32 x i16> %v
+}
+
+define <vscale x 32 x i16> @vsadd_vv_nxv32i16_unmasked(<vscale x 32 x i16> %va, <vscale x 32 x i16> %b, i32 zeroext %evl) {
+; CHECK-LABEL: vsadd_vv_nxv32i16_unmasked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e16, m8, ta, ma
+; CHECK-NEXT: vsadd.vv v8, v8, v16
+; CHECK-NEXT: ret
+ %head = insertelement <vscale x 32 x i1> poison, i1 true, i32 0
+ %m = shufflevector <vscale x 32 x i1> %head, <vscale x 32 x i1> poison, <vscale x 32 x i32> zeroinitializer
+ %v = call <vscale x 32 x i16> @llvm.vp.sadd.sat.nxv32i16(<vscale x 32 x i16> %va, <vscale x 32 x i16> %b, <vscale x 32 x i1> %m, i32 %evl)
+ ret <vscale x 32 x i16> %v
+}
+
+define <vscale x 32 x i16> @vsadd_vx_nxv32i16(<vscale x 32 x i16> %va, i16 %b, <vscale x 32 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vsadd_vx_nxv32i16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e16, m8, ta, ma
+; CHECK-NEXT: vsadd.vx v8, v8, a0, v0.t
+; CHECK-NEXT: ret
+ %elt.head = insertelement <vscale x 32 x i16> poison, i16 %b, i32 0
+ %vb = shufflevector <vscale x 32 x i16> %elt.head, <vscale x 32 x i16> poison, <vscale x 32 x i32> zeroinitializer
+ %v = call <vscale x 32 x i16> @llvm.vp.sadd.sat.nxv32i16(<vscale x 32 x i16> %va, <vscale x 32 x i16> %vb, <vscale x 32 x i1> %m, i32 %evl)
+ ret <vscale x 32 x i16> %v
+}
+
+define <vscale x 32 x i16> @vsadd_vx_nxv32i16_unmasked(<vscale x 32 x i16> %va, i16 %b, i32 zeroext %evl) {
+; CHECK-LABEL: vsadd_vx_nxv32i16_unmasked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e16, m8, ta, ma
+; CHECK-NEXT: vsadd.vx v8, v8, a0
+; CHECK-NEXT: ret
+ %elt.head = insertelement <vscale x 32 x i16> poison, i16 %b, i32 0
+ %vb = shufflevector <vscale x 32 x i16> %elt.head, <vscale x 32 x i16> poison, <vscale x 32 x i32> zeroinitializer
+ %head = insertelement <vscale x 32 x i1> poison, i1 true, i32 0
+ %m = shufflevector <vscale x 32 x i1> %head, <vscale x 32 x i1> poison, <vscale x 32 x i32> zeroinitializer
+ %v = call <vscale x 32 x i16> @llvm.vp.sadd.sat.nxv32i16(<vscale x 32 x i16> %va, <vscale x 32 x i16> %vb, <vscale x 32 x i1> %m, i32 %evl)
+ ret <vscale x 32 x i16> %v
+}
+
+define <vscale x 32 x i16> @vsadd_vi_nxv32i16(<vscale x 32 x i16> %va, <vscale x 32 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vsadd_vi_nxv32i16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e16, m8, ta, ma
+; CHECK-NEXT: vsadd.vi v8, v8, -1, v0.t
+; CHECK-NEXT: ret
+ %elt.head = insertelement <vscale x 32 x i16> poison, i16 -1, i32 0
+ %vb = shufflevector <vscale x 32 x i16> %elt.head, <vscale x 32 x i16> poison, <vscale x 32 x i32> zeroinitializer
+ %v = call <vscale x 32 x i16> @llvm.vp.sadd.sat.nxv32i16(<vscale x 32 x i16> %va, <vscale x 32 x i16> %vb, <vscale x 32 x i1> %m, i32 %evl)
+ ret <vscale x 32 x i16> %v
+}
+
+define <vscale x 32 x i16> @vsadd_vi_nxv32i16_unmasked(<vscale x 32 x i16> %va, i32 zeroext %evl) {
+; CHECK-LABEL: vsadd_vi_nxv32i16_unmasked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e16, m8, ta, ma
+; CHECK-NEXT: vsadd.vi v8, v8, -1
+; CHECK-NEXT: ret
+ %elt.head = insertelement <vscale x 32 x i16> poison, i16 -1, i32 0
+ %vb = shufflevector <vscale x 32 x i16> %elt.head, <vscale x 32 x i16> poison, <vscale x 32 x i32> zeroinitializer
+ %head = insertelement <vscale x 32 x i1> poison, i1 true, i32 0
+ %m = shufflevector <vscale x 32 x i1> %head, <vscale x 32 x i1> poison, <vscale x 32 x i32> zeroinitializer
+ %v = call <vscale x 32 x i16> @llvm.vp.sadd.sat.nxv32i16(<vscale x 32 x i16> %va, <vscale x 32 x i16> %vb, <vscale x 32 x i1> %m, i32 %evl)
+ ret <vscale x 32 x i16> %v
+}
+
+declare <vscale x 1 x i32> @llvm.vp.sadd.sat.nxv1i32(<vscale x 1 x i32>, <vscale x 1 x i32>, <vscale x 1 x i1>, i32)
+
+define <vscale x 1 x i32> @vsadd_vv_nxv1i32(<vscale x 1 x i32> %va, <vscale x 1 x i32> %b, <vscale x 1 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vsadd_vv_nxv1i32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, ma
+; CHECK-NEXT: vsadd.vv v8, v8, v9, v0.t
+; CHECK-NEXT: ret
+ %v = call <vscale x 1 x i32> @llvm.vp.sadd.sat.nxv1i32(<vscale x 1 x i32> %va, <vscale x 1 x i32> %b, <vscale x 1 x i1> %m, i32 %evl)
+ ret <vscale x 1 x i32> %v
+}
+
+define <vscale x 1 x i32> @vsadd_vv_nxv1i32_unmasked(<vscale x 1 x i32> %va, <vscale x 1 x i32> %b, i32 zeroext %evl) {
+; CHECK-LABEL: vsadd_vv_nxv1i32_unmasked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, ma
+; CHECK-NEXT: vsadd.vv v8, v8, v9
+; CHECK-NEXT: ret
+ %head = insertelement <vscale x 1 x i1> poison, i1 true, i32 0
+ %m = shufflevector <vscale x 1 x i1> %head, <vscale x 1 x i1> poison, <vscale x 1 x i32> zeroinitializer
+ %v = call <vscale x 1 x i32> @llvm.vp.sadd.sat.nxv1i32(<vscale x 1 x i32> %va, <vscale x 1 x i32> %b, <vscale x 1 x i1> %m, i32 %evl)
+ ret <vscale x 1 x i32> %v
+}
+
+define <vscale x 1 x i32> @vsadd_vx_nxv1i32(<vscale x 1 x i32> %va, i32 %b, <vscale x 1 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vsadd_vx_nxv1i32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma
+; CHECK-NEXT: vsadd.vx v8, v8, a0, v0.t
+; CHECK-NEXT: ret
+ %elt.head = insertelement <vscale x 1 x i32> poison, i32 %b, i32 0
+ %vb = shufflevector <vscale x 1 x i32> %elt.head, <vscale x 1 x i32> poison, <vscale x 1 x i32> zeroinitializer
+ %v = call <vscale x 1 x i32> @llvm.vp.sadd.sat.nxv1i32(<vscale x 1 x i32> %va, <vscale x 1 x i32> %vb, <vscale x 1 x i1> %m, i32 %evl)
+ ret <vscale x 1 x i32> %v
+}
+
+define <vscale x 1 x i32> @vsadd_vx_nxv1i32_unmasked(<vscale x 1 x i32> %va, i32 %b, i32 zeroext %evl) {
+; CHECK-LABEL: vsadd_vx_nxv1i32_unmasked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma
+; CHECK-NEXT: vsadd.vx v8, v8, a0
+; CHECK-NEXT: ret
+ %elt.head = insertelement <vscale x 1 x i32> poison, i32 %b, i32 0
+ %vb = shufflevector <vscale x 1 x i32> %elt.head, <vscale x 1 x i32> poison, <vscale x 1 x i32> zeroinitializer
+ %head = insertelement <vscale x 1 x i1> poison, i1 true, i32 0
+ %m = shufflevector <vscale x 1 x i1> %head, <vscale x 1 x i1> poison, <vscale x 1 x i32> zeroinitializer
+ %v = call <vscale x 1 x i32> @llvm.vp.sadd.sat.nxv1i32(<vscale x 1 x i32> %va, <vscale x 1 x i32> %vb, <vscale x 1 x i1> %m, i32 %evl)
+ ret <vscale x 1 x i32> %v
+}
+
+define <vscale x 1 x i32> @vsadd_vi_nxv1i32(<vscale x 1 x i32> %va, <vscale x 1 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vsadd_vi_nxv1i32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, ma
+; CHECK-NEXT: vsadd.vi v8, v8, -1, v0.t
+; CHECK-NEXT: ret
+ %elt.head = insertelement <vscale x 1 x i32> poison, i32 -1, i32 0
+ %vb = shufflevector <vscale x 1 x i32> %elt.head, <vscale x 1 x i32> poison, <vscale x 1 x i32> zeroinitializer
+ %v = call <vscale x 1 x i32> @llvm.vp.sadd.sat.nxv1i32(<vscale x 1 x i32> %va, <vscale x 1 x i32> %vb, <vscale x 1 x i1> %m, i32 %evl)
+ ret <vscale x 1 x i32> %v
+}
+
+define <vscale x 1 x i32> @vsadd_vi_nxv1i32_unmasked(<vscale x 1 x i32> %va, i32 zeroext %evl) {
+; CHECK-LABEL: vsadd_vi_nxv1i32_unmasked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, ma
+; CHECK-NEXT: vsadd.vi v8, v8, -1
+; CHECK-NEXT: ret
+ %elt.head = insertelement <vscale x 1 x i32> poison, i32 -1, i32 0
+ %vb = shufflevector <vscale x 1 x i32> %elt.head, <vscale x 1 x i32> poison, <vscale x 1 x i32> zeroinitializer
+ %head = insertelement <vscale x 1 x i1> poison, i1 true, i32 0
+ %m = shufflevector <vscale x 1 x i1> %head, <vscale x 1 x i1> poison, <vscale x 1 x i32> zeroinitializer
+ %v = call <vscale x 1 x i32> @llvm.vp.sadd.sat.nxv1i32(<vscale x 1 x i32> %va, <vscale x 1 x i32> %vb, <vscale x 1 x i1> %m, i32 %evl)
+ ret <vscale x 1 x i32> %v
+}
+
+declare <vscale x 2 x i32> @llvm.vp.sadd.sat.nxv2i32(<vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i1>, i32)
+
+define <vscale x 2 x i32> @vsadd_vv_nxv2i32(<vscale x 2 x i32> %va, <vscale x 2 x i32> %b, <vscale x 2 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vsadd_vv_nxv2i32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, ma
+; CHECK-NEXT: vsadd.vv v8, v8, v9, v0.t
+; CHECK-NEXT: ret
+ %v = call <vscale x 2 x i32> @llvm.vp.sadd.sat.nxv2i32(<vscale x 2 x i32> %va, <vscale x 2 x i32> %b, <vscale x 2 x i1> %m, i32 %evl)
+ ret <vscale x 2 x i32> %v
+}
+
+define <vscale x 2 x i32> @vsadd_vv_nxv2i32_unmasked(<vscale x 2 x i32> %va, <vscale x 2 x i32> %b, i32 zeroext %evl) {
+; CHECK-LABEL: vsadd_vv_nxv2i32_unmasked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, ma
+; CHECK-NEXT: vsadd.vv v8, v8, v9
+; CHECK-NEXT: ret
+ %head = insertelement <vscale x 2 x i1> poison, i1 true, i32 0
+ %m = shufflevector <vscale x 2 x i1> %head, <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer
+ %v = call <vscale x 2 x i32> @llvm.vp.sadd.sat.nxv2i32(<vscale x 2 x i32> %va, <vscale x 2 x i32> %b, <vscale x 2 x i1> %m, i32 %evl)
+ ret <vscale x 2 x i32> %v
+}
+
+define <vscale x 2 x i32> @vsadd_vx_nxv2i32(<vscale x 2 x i32> %va, i32 %b, <vscale x 2 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vsadd_vx_nxv2i32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma
+; CHECK-NEXT: vsadd.vx v8, v8, a0, v0.t
+; CHECK-NEXT: ret
+ %elt.head = insertelement <vscale x 2 x i32> poison, i32 %b, i32 0
+ %vb = shufflevector <vscale x 2 x i32> %elt.head, <vscale x 2 x i32> poison, <vscale x 2 x i32> zeroinitializer
+ %v = call <vscale x 2 x i32> @llvm.vp.sadd.sat.nxv2i32(<vscale x 2 x i32> %va, <vscale x 2 x i32> %vb, <vscale x 2 x i1> %m, i32 %evl)
+ ret <vscale x 2 x i32> %v
+}
+
+define <vscale x 2 x i32> @vsadd_vx_nxv2i32_unmasked(<vscale x 2 x i32> %va, i32 %b, i32 zeroext %evl) {
+; CHECK-LABEL: vsadd_vx_nxv2i32_unmasked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma
+; CHECK-NEXT: vsadd.vx v8, v8, a0
+; CHECK-NEXT: ret
+ %elt.head = insertelement <vscale x 2 x i32> poison, i32 %b, i32 0
+ %vb = shufflevector <vscale x 2 x i32> %elt.head, <vscale x 2 x i32> poison, <vscale x 2 x i32> zeroinitializer
+ %head = insertelement <vscale x 2 x i1> poison, i1 true, i32 0
+ %m = shufflevector <vscale x 2 x i1> %head, <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer
+ %v = call <vscale x 2 x i32> @llvm.vp.sadd.sat.nxv2i32(<vscale x 2 x i32> %va, <vscale x 2 x i32> %vb, <vscale x 2 x i1> %m, i32 %evl)
+ ret <vscale x 2 x i32> %v
+}
+
+define <vscale x 2 x i32> @vsadd_vi_nxv2i32(<vscale x 2 x i32> %va, <vscale x 2 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vsadd_vi_nxv2i32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, ma
+; CHECK-NEXT: vsadd.vi v8, v8, -1, v0.t
+; CHECK-NEXT: ret
+ %elt.head = insertelement <vscale x 2 x i32> poison, i32 -1, i32 0
+ %vb = shufflevector <vscale x 2 x i32> %elt.head, <vscale x 2 x i32> poison, <vscale x 2 x i32> zeroinitializer
+ %v = call <vscale x 2 x i32> @llvm.vp.sadd.sat.nxv2i32(<vscale x 2 x i32> %va, <vscale x 2 x i32> %vb, <vscale x 2 x i1> %m, i32 %evl)
+ ret <vscale x 2 x i32> %v
+}
+
+define <vscale x 2 x i32> @vsadd_vi_nxv2i32_unmasked(<vscale x 2 x i32> %va, i32 zeroext %evl) {
+; CHECK-LABEL: vsadd_vi_nxv2i32_unmasked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, ma
+; CHECK-NEXT: vsadd.vi v8, v8, -1
+; CHECK-NEXT: ret
+ %elt.head = insertelement <vscale x 2 x i32> poison, i32 -1, i32 0
+ %vb = shufflevector <vscale x 2 x i32> %elt.head, <vscale x 2 x i32> poison, <vscale x 2 x i32> zeroinitializer
+ %head = insertelement <vscale x 2 x i1> poison, i1 true, i32 0
+ %m = shufflevector <vscale x 2 x i1> %head, <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer
+ %v = call <vscale x 2 x i32> @llvm.vp.sadd.sat.nxv2i32(<vscale x 2 x i32> %va, <vscale x 2 x i32> %vb, <vscale x 2 x i1> %m, i32 %evl)
+ ret <vscale x 2 x i32> %v
+}
+
+declare <vscale x 4 x i32> @llvm.vp.sadd.sat.nxv4i32(<vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i1>, i32)
+
+define <vscale x 4 x i32> @vsadd_vv_nxv4i32(<vscale x 4 x i32> %va, <vscale x 4 x i32> %b, <vscale x 4 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vsadd_vv_nxv4i32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, ma
+; CHECK-NEXT: vsadd.vv v8, v8, v10, v0.t
+; CHECK-NEXT: ret
+ %v = call <vscale x 4 x i32> @llvm.vp.sadd.sat.nxv4i32(<vscale x 4 x i32> %va, <vscale x 4 x i32> %b, <vscale x 4 x i1> %m, i32 %evl)
+ ret <vscale x 4 x i32> %v
+}
+
+define <vscale x 4 x i32> @vsadd_vv_nxv4i32_unmasked(<vscale x 4 x i32> %va, <vscale x 4 x i32> %b, i32 zeroext %evl) {
+; CHECK-LABEL: vsadd_vv_nxv4i32_unmasked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, ma
+; CHECK-NEXT: vsadd.vv v8, v8, v10
+; CHECK-NEXT: ret
+ %head = insertelement <vscale x 4 x i1> poison, i1 true, i32 0
+ %m = shufflevector <vscale x 4 x i1> %head, <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer
+ %v = call <vscale x 4 x i32> @llvm.vp.sadd.sat.nxv4i32(<vscale x 4 x i32> %va, <vscale x 4 x i32> %b, <vscale x 4 x i1> %m, i32 %evl)
+ ret <vscale x 4 x i32> %v
+}
+
+define <vscale x 4 x i32> @vsadd_vx_nxv4i32(<vscale x 4 x i32> %va, i32 %b, <vscale x 4 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vsadd_vx_nxv4i32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, ma
+; CHECK-NEXT: vsadd.vx v8, v8, a0, v0.t
+; CHECK-NEXT: ret
+ %elt.head = insertelement <vscale x 4 x i32> poison, i32 %b, i32 0
+ %vb = shufflevector <vscale x 4 x i32> %elt.head, <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
+ %v = call <vscale x 4 x i32> @llvm.vp.sadd.sat.nxv4i32(<vscale x 4 x i32> %va, <vscale x 4 x i32> %vb, <vscale x 4 x i1> %m, i32 %evl)
+ ret <vscale x 4 x i32> %v
+}
+
+define <vscale x 4 x i32> @vsadd_vx_nxv4i32_unmasked(<vscale x 4 x i32> %va, i32 %b, i32 zeroext %evl) {
+; CHECK-LABEL: vsadd_vx_nxv4i32_unmasked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, ma
+; CHECK-NEXT: vsadd.vx v8, v8, a0
+; CHECK-NEXT: ret
+ %elt.head = insertelement <vscale x 4 x i32> poison, i32 %b, i32 0
+ %vb = shufflevector <vscale x 4 x i32> %elt.head, <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
+ %head = insertelement <vscale x 4 x i1> poison, i1 true, i32 0
+ %m = shufflevector <vscale x 4 x i1> %head, <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer
+ %v = call <vscale x 4 x i32> @llvm.vp.sadd.sat.nxv4i32(<vscale x 4 x i32> %va, <vscale x 4 x i32> %vb, <vscale x 4 x i1> %m, i32 %evl)
+ ret <vscale x 4 x i32> %v
+}
+
+define <vscale x 4 x i32> @vsadd_vi_nxv4i32(<vscale x 4 x i32> %va, <vscale x 4 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vsadd_vi_nxv4i32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, ma
+; CHECK-NEXT: vsadd.vi v8, v8, -1, v0.t
+; CHECK-NEXT: ret
+ %elt.head = insertelement <vscale x 4 x i32> poison, i32 -1, i32 0
+ %vb = shufflevector <vscale x 4 x i32> %elt.head, <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
+ %v = call <vscale x 4 x i32> @llvm.vp.sadd.sat.nxv4i32(<vscale x 4 x i32> %va, <vscale x 4 x i32> %vb, <vscale x 4 x i1> %m, i32 %evl)
+ ret <vscale x 4 x i32> %v
+}
+
+define <vscale x 4 x i32> @vsadd_vi_nxv4i32_unmasked(<vscale x 4 x i32> %va, i32 zeroext %evl) {
+; CHECK-LABEL: vsadd_vi_nxv4i32_unmasked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, ma
+; CHECK-NEXT: vsadd.vi v8, v8, -1
+; CHECK-NEXT: ret
+ %elt.head = insertelement <vscale x 4 x i32> poison, i32 -1, i32 0
+ %vb = shufflevector <vscale x 4 x i32> %elt.head, <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
+ %head = insertelement <vscale x 4 x i1> poison, i1 true, i32 0
+ %m = shufflevector <vscale x 4 x i1> %head, <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer
+ %v = call <vscale x 4 x i32> @llvm.vp.sadd.sat.nxv4i32(<vscale x 4 x i32> %va, <vscale x 4 x i32> %vb, <vscale x 4 x i1> %m, i32 %evl)
+ ret <vscale x 4 x i32> %v
+}
+
+declare <vscale x 8 x i32> @llvm.vp.sadd.sat.nxv8i32(<vscale x 8 x i32>, <vscale x 8 x i32>, <vscale x 8 x i1>, i32)
+
+define <vscale x 8 x i32> @vsadd_vv_nxv8i32(<vscale x 8 x i32> %va, <vscale x 8 x i32> %b, <vscale x 8 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vsadd_vv_nxv8i32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, ma
+; CHECK-NEXT: vsadd.vv v8, v8, v12, v0.t
+; CHECK-NEXT: ret
+ %v = call <vscale x 8 x i32> @llvm.vp.sadd.sat.nxv8i32(<vscale x 8 x i32> %va, <vscale x 8 x i32> %b, <vscale x 8 x i1> %m, i32 %evl)
+ ret <vscale x 8 x i32> %v
+}
+
+define <vscale x 8 x i32> @vsadd_vv_nxv8i32_unmasked(<vscale x 8 x i32> %va, <vscale x 8 x i32> %b, i32 zeroext %evl) {
+; CHECK-LABEL: vsadd_vv_nxv8i32_unmasked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, ma
+; CHECK-NEXT: vsadd.vv v8, v8, v12
+; CHECK-NEXT: ret
+ %head = insertelement <vscale x 8 x i1> poison, i1 true, i32 0
+ %m = shufflevector <vscale x 8 x i1> %head, <vscale x 8 x i1> poison, <vscale x 8 x i32> zeroinitializer
+ %v = call <vscale x 8 x i32> @llvm.vp.sadd.sat.nxv8i32(<vscale x 8 x i32> %va, <vscale x 8 x i32> %b, <vscale x 8 x i1> %m, i32 %evl)
+ ret <vscale x 8 x i32> %v
+}
+
+define <vscale x 8 x i32> @vsadd_vx_nxv8i32(<vscale x 8 x i32> %va, i32 %b, <vscale x 8 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vsadd_vx_nxv8i32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e32, m4, ta, ma
+; CHECK-NEXT: vsadd.vx v8, v8, a0, v0.t
+; CHECK-NEXT: ret
+ %elt.head = insertelement <vscale x 8 x i32> poison, i32 %b, i32 0
+ %vb = shufflevector <vscale x 8 x i32> %elt.head, <vscale x 8 x i32> poison, <vscale x 8 x i32> zeroinitializer
+ %v = call <vscale x 8 x i32> @llvm.vp.sadd.sat.nxv8i32(<vscale x 8 x i32> %va, <vscale x 8 x i32> %vb, <vscale x 8 x i1> %m, i32 %evl)
+ ret <vscale x 8 x i32> %v
+}
+
+define <vscale x 8 x i32> @vsadd_vx_nxv8i32_unmasked(<vscale x 8 x i32> %va, i32 %b, i32 zeroext %evl) {
+; CHECK-LABEL: vsadd_vx_nxv8i32_unmasked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e32, m4, ta, ma
+; CHECK-NEXT: vsadd.vx v8, v8, a0
+; CHECK-NEXT: ret
+ %elt.head = insertelement <vscale x 8 x i32> poison, i32 %b, i32 0
+ %vb = shufflevector <vscale x 8 x i32> %elt.head, <vscale x 8 x i32> poison, <vscale x 8 x i32> zeroinitializer
+ %head = insertelement <vscale x 8 x i1> poison, i1 true, i32 0
+ %m = shufflevector <vscale x 8 x i1> %head, <vscale x 8 x i1> poison, <vscale x 8 x i32> zeroinitializer
+ %v = call <vscale x 8 x i32> @llvm.vp.sadd.sat.nxv8i32(<vscale x 8 x i32> %va, <vscale x 8 x i32> %vb, <vscale x 8 x i1> %m, i32 %evl)
+ ret <vscale x 8 x i32> %v
+}
+
+define <vscale x 8 x i32> @vsadd_vi_nxv8i32(<vscale x 8 x i32> %va, <vscale x 8 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vsadd_vi_nxv8i32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, ma
+; CHECK-NEXT: vsadd.vi v8, v8, -1, v0.t
+; CHECK-NEXT: ret
+ %elt.head = insertelement <vscale x 8 x i32> poison, i32 -1, i32 0
+ %vb = shufflevector <vscale x 8 x i32> %elt.head, <vscale x 8 x i32> poison, <vscale x 8 x i32> zeroinitializer
+ %v = call <vscale x 8 x i32> @llvm.vp.sadd.sat.nxv8i32(<vscale x 8 x i32> %va, <vscale x 8 x i32> %vb, <vscale x 8 x i1> %m, i32 %evl)
+ ret <vscale x 8 x i32> %v
+}
+
+define <vscale x 8 x i32> @vsadd_vi_nxv8i32_unmasked(<vscale x 8 x i32> %va, i32 zeroext %evl) {
+; CHECK-LABEL: vsadd_vi_nxv8i32_unmasked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, ma
+; CHECK-NEXT: vsadd.vi v8, v8, -1
+; CHECK-NEXT: ret
+ %elt.head = insertelement <vscale x 8 x i32> poison, i32 -1, i32 0
+ %vb = shufflevector <vscale x 8 x i32> %elt.head, <vscale x 8 x i32> poison, <vscale x 8 x i32> zeroinitializer
+ %head = insertelement <vscale x 8 x i1> poison, i1 true, i32 0
+ %m = shufflevector <vscale x 8 x i1> %head, <vscale x 8 x i1> poison, <vscale x 8 x i32> zeroinitializer
+ %v = call <vscale x 8 x i32> @llvm.vp.sadd.sat.nxv8i32(<vscale x 8 x i32> %va, <vscale x 8 x i32> %vb, <vscale x 8 x i1> %m, i32 %evl)
+ ret <vscale x 8 x i32> %v
+}
+
+declare <vscale x 16 x i32> @llvm.vp.sadd.sat.nxv16i32(<vscale x 16 x i32>, <vscale x 16 x i32>, <vscale x 16 x i1>, i32)
+
+define <vscale x 16 x i32> @vsadd_vv_nxv16i32(<vscale x 16 x i32> %va, <vscale x 16 x i32> %b, <vscale x 16 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vsadd_vv_nxv16i32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e32, m8, ta, ma
+; CHECK-NEXT: vsadd.vv v8, v8, v16, v0.t
+; CHECK-NEXT: ret
+ %v = call <vscale x 16 x i32> @llvm.vp.sadd.sat.nxv16i32(<vscale x 16 x i32> %va, <vscale x 16 x i32> %b, <vscale x 16 x i1> %m, i32 %evl)
+ ret <vscale x 16 x i32> %v
+}
+
+define <vscale x 16 x i32> @vsadd_vv_nxv16i32_unmasked(<vscale x 16 x i32> %va, <vscale x 16 x i32> %b, i32 zeroext %evl) {
+; CHECK-LABEL: vsadd_vv_nxv16i32_unmasked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e32, m8, ta, ma
+; CHECK-NEXT: vsadd.vv v8, v8, v16
+; CHECK-NEXT: ret
+ %head = insertelement <vscale x 16 x i1> poison, i1 true, i32 0
+ %m = shufflevector <vscale x 16 x i1> %head, <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer
+ %v = call <vscale x 16 x i32> @llvm.vp.sadd.sat.nxv16i32(<vscale x 16 x i32> %va, <vscale x 16 x i32> %b, <vscale x 16 x i1> %m, i32 %evl)
+ ret <vscale x 16 x i32> %v
+}
+
+define <vscale x 16 x i32> @vsadd_vx_nxv16i32(<vscale x 16 x i32> %va, i32 %b, <vscale x 16 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vsadd_vx_nxv16i32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, ma
+; CHECK-NEXT: vsadd.vx v8, v8, a0, v0.t
+; CHECK-NEXT: ret
+ %elt.head = insertelement <vscale x 16 x i32> poison, i32 %b, i32 0
+ %vb = shufflevector <vscale x 16 x i32> %elt.head, <vscale x 16 x i32> poison, <vscale x 16 x i32> zeroinitializer
+ %v = call <vscale x 16 x i32> @llvm.vp.sadd.sat.nxv16i32(<vscale x 16 x i32> %va, <vscale x 16 x i32> %vb, <vscale x 16 x i1> %m, i32 %evl)
+ ret <vscale x 16 x i32> %v
+}
+
+define <vscale x 16 x i32> @vsadd_vx_nxv16i32_unmasked(<vscale x 16 x i32> %va, i32 %b, i32 zeroext %evl) {
+; CHECK-LABEL: vsadd_vx_nxv16i32_unmasked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, ma
+; CHECK-NEXT: vsadd.vx v8, v8, a0
+; CHECK-NEXT: ret
+ %elt.head = insertelement <vscale x 16 x i32> poison, i32 %b, i32 0
+ %vb = shufflevector <vscale x 16 x i32> %elt.head, <vscale x 16 x i32> poison, <vscale x 16 x i32> zeroinitializer
+ %head = insertelement <vscale x 16 x i1> poison, i1 true, i32 0
+ %m = shufflevector <vscale x 16 x i1> %head, <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer
+ %v = call <vscale x 16 x i32> @llvm.vp.sadd.sat.nxv16i32(<vscale x 16 x i32> %va, <vscale x 16 x i32> %vb, <vscale x 16 x i1> %m, i32 %evl)
+ ret <vscale x 16 x i32> %v
+}
+
+define <vscale x 16 x i32> @vsadd_vi_nxv16i32(<vscale x 16 x i32> %va, <vscale x 16 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vsadd_vi_nxv16i32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e32, m8, ta, ma
+; CHECK-NEXT: vsadd.vi v8, v8, -1, v0.t
+; CHECK-NEXT: ret
+ %elt.head = insertelement <vscale x 16 x i32> poison, i32 -1, i32 0
+ %vb = shufflevector <vscale x 16 x i32> %elt.head, <vscale x 16 x i32> poison, <vscale x 16 x i32> zeroinitializer
+ %v = call <vscale x 16 x i32> @llvm.vp.sadd.sat.nxv16i32(<vscale x 16 x i32> %va, <vscale x 16 x i32> %vb, <vscale x 16 x i1> %m, i32 %evl)
+ ret <vscale x 16 x i32> %v
+}
+
+define <vscale x 16 x i32> @vsadd_vi_nxv16i32_unmasked(<vscale x 16 x i32> %va, i32 zeroext %evl) {
+; CHECK-LABEL: vsadd_vi_nxv16i32_unmasked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e32, m8, ta, ma
+; CHECK-NEXT: vsadd.vi v8, v8, -1
+; CHECK-NEXT: ret
+ %elt.head = insertelement <vscale x 16 x i32> poison, i32 -1, i32 0
+ %vb = shufflevector <vscale x 16 x i32> %elt.head, <vscale x 16 x i32> poison, <vscale x 16 x i32> zeroinitializer
+ %head = insertelement <vscale x 16 x i1> poison, i1 true, i32 0
+ %m = shufflevector <vscale x 16 x i1> %head, <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer
+ %v = call <vscale x 16 x i32> @llvm.vp.sadd.sat.nxv16i32(<vscale x 16 x i32> %va, <vscale x 16 x i32> %vb, <vscale x 16 x i1> %m, i32 %evl)
+ ret <vscale x 16 x i32> %v
+}
+
+; Test that split-legalization works when the mask needs manual splitting.
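+; <vscale x 32 x i32> exceeds the largest legal register group (LMUL=8, i.e.
+; <vscale x 16 x i32>), so the operation is legalized as two m8 halves. As the
+; CHECK lines below show, the upper half of the i1 mask is extracted with
+; vslidedown.vx, the EVL is split between the halves using vlenb arithmetic
+; (the sub/sltu/addi/and sequence clamps the high-half EVL at zero), and
+; vsadd.vi is emitted once per half.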
+
+declare <vscale x 32 x i32> @llvm.vp.sadd.sat.nxv32i32(<vscale x 32 x i32>, <vscale x 32 x i32>, <vscale x 32 x i1>, i32)
+
+define <vscale x 32 x i32> @vsadd_vi_nxv32i32(<vscale x 32 x i32> %va, <vscale x 32 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vsadd_vi_nxv32i32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmv1r.v v24, v0
+; CHECK-NEXT: csrr a1, vlenb
+; CHECK-NEXT: srli a2, a1, 2
+; CHECK-NEXT: vsetvli a3, zero, e8, mf2, ta, ma
+; CHECK-NEXT: vslidedown.vx v0, v0, a2
+; CHECK-NEXT: slli a1, a1, 1
+; CHECK-NEXT: sub a2, a0, a1
+; CHECK-NEXT: sltu a3, a0, a2
+; CHECK-NEXT: addi a3, a3, -1
+; CHECK-NEXT: and a2, a3, a2
+; CHECK-NEXT: vsetvli zero, a2, e32, m8, ta, ma
+; CHECK-NEXT: vsadd.vi v16, v16, -1, v0.t
+; CHECK-NEXT: bltu a0, a1, .LBB118_2
+; CHECK-NEXT: # %bb.1:
+; CHECK-NEXT: mv a0, a1
+; CHECK-NEXT: .LBB118_2:
+; CHECK-NEXT: vsetvli zero, a0, e32, m8, ta, ma
+; CHECK-NEXT: vmv1r.v v0, v24
+; CHECK-NEXT: vsadd.vi v8, v8, -1, v0.t
+; CHECK-NEXT: ret
+ %elt.head = insertelement <vscale x 32 x i32> poison, i32 -1, i32 0
+ %vb = shufflevector <vscale x 32 x i32> %elt.head, <vscale x 32 x i32> poison, <vscale x 32 x i32> zeroinitializer
+ %v = call <vscale x 32 x i32> @llvm.vp.sadd.sat.nxv32i32(<vscale x 32 x i32> %va, <vscale x 32 x i32> %vb, <vscale x 32 x i1> %m, i32 %evl)
+ ret <vscale x 32 x i32> %v
+}
+
+define <vscale x 32 x i32> @vsadd_vi_nxv32i32_unmasked(<vscale x 32 x i32> %va, i32 zeroext %evl) {
+; CHECK-LABEL: vsadd_vi_nxv32i32_unmasked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: csrr a1, vlenb
+; CHECK-NEXT: slli a1, a1, 1
+; CHECK-NEXT: sub a2, a0, a1
+; CHECK-NEXT: sltu a3, a0, a2
+; CHECK-NEXT: addi a3, a3, -1
+; CHECK-NEXT: and a2, a3, a2
+; CHECK-NEXT: vsetvli zero, a2, e32, m8, ta, ma
+; CHECK-NEXT: vsadd.vi v16, v16, -1
+; CHECK-NEXT: bltu a0, a1, .LBB119_2
+; CHECK-NEXT: # %bb.1:
+; CHECK-NEXT: mv a0, a1
+; CHECK-NEXT: .LBB119_2:
+; CHECK-NEXT: vsetvli zero, a0, e32, m8, ta, ma
+; CHECK-NEXT: vsadd.vi v8, v8, -1
+; CHECK-NEXT: ret
+ %elt.head = insertelement <vscale x 32 x i32> poison, i32 -1, i32 0
+ %vb = shufflevector <vscale x 32 x i32> %elt.head, <vscale x 32 x i32> poison, <vscale x 32 x i32> zeroinitializer
+ %head = insertelement <vscale x 32 x i1> poison, i1 true, i32 0
+ %m = shufflevector <vscale x 32 x i1> %head, <vscale x 32 x i1> poison, <vscale x 32 x i32> zeroinitializer
+ %v = call <vscale x 32 x i32> @llvm.vp.sadd.sat.nxv32i32(<vscale x 32 x i32> %va, <vscale x 32 x i32> %vb, <vscale x 32 x i1> %m, i32 %evl)
+ ret <vscale x 32 x i32> %v
+}
+
+declare <vscale x 1 x i64> @llvm.vp.sadd.sat.nxv1i64(<vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i1>, i32)
+
+define <vscale x 1 x i64> @vsadd_vv_nxv1i64(<vscale x 1 x i64> %va, <vscale x 1 x i64> %b, <vscale x 1 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vsadd_vv_nxv1i64:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, ma
+; CHECK-NEXT: vsadd.vv v8, v8, v9, v0.t
+; CHECK-NEXT: ret
+ %v = call <vscale x 1 x i64> @llvm.vp.sadd.sat.nxv1i64(<vscale x 1 x i64> %va, <vscale x 1 x i64> %b, <vscale x 1 x i1> %m, i32 %evl)
+ ret <vscale x 1 x i64> %v
+}
+
+define <vscale x 1 x i64> @vsadd_vv_nxv1i64_unmasked(<vscale x 1 x i64> %va, <vscale x 1 x i64> %b, i32 zeroext %evl) {
+; CHECK-LABEL: vsadd_vv_nxv1i64_unmasked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, ma
+; CHECK-NEXT: vsadd.vv v8, v8, v9
+; CHECK-NEXT: ret
+ %head = insertelement <vscale x 1 x i1> poison, i1 true, i32 0
+ %m = shufflevector <vscale x 1 x i1> %head, <vscale x 1 x i1> poison, <vscale x 1 x i32> zeroinitializer
+ %v = call <vscale x 1 x i64> @llvm.vp.sadd.sat.nxv1i64(<vscale x 1 x i64> %va, <vscale x 1 x i64> %b, <vscale x 1 x i1> %m, i32 %evl)
+ ret <vscale x 1 x i64> %v
+}
+
+define <vscale x 1 x i64> @vsadd_vx_nxv1i64(<vscale x 1 x i64> %va, i64 %b, <vscale x 1 x i1> %m, i32 zeroext %evl) {
+; RV32-LABEL: vsadd_vx_nxv1i64:
+; RV32: # %bb.0:
+; RV32-NEXT: addi sp, sp, -16
+; RV32-NEXT: .cfi_def_cfa_offset 16
+; RV32-NEXT: sw a1, 12(sp)
+; RV32-NEXT: sw a0, 8(sp)
+; RV32-NEXT: addi a0, sp, 8
+; RV32-NEXT: vsetvli a1, zero, e64, m1, ta, ma
+; RV32-NEXT: vlse64.v v9, (a0), zero
+; RV32-NEXT: vsetvli zero, a2, e64, m1, ta, ma
+; RV32-NEXT: vsadd.vv v8, v8, v9, v0.t
+; RV32-NEXT: addi sp, sp, 16
+; RV32-NEXT: ret
+;
+; RV64-LABEL: vsadd_vx_nxv1i64:
+; RV64: # %bb.0:
+; RV64-NEXT: vsetvli zero, a1, e64, m1, ta, ma
+; RV64-NEXT: vsadd.vx v8, v8, a0, v0.t
+; RV64-NEXT: ret
+ %elt.head = insertelement <vscale x 1 x i64> poison, i64 %b, i32 0
+ %vb = shufflevector <vscale x 1 x i64> %elt.head, <vscale x 1 x i64> poison, <vscale x 1 x i32> zeroinitializer
+ %v = call <vscale x 1 x i64> @llvm.vp.sadd.sat.nxv1i64(<vscale x 1 x i64> %va, <vscale x 1 x i64> %vb, <vscale x 1 x i1> %m, i32 %evl)
+ ret <vscale x 1 x i64> %v
+}
+
+define <vscale x 1 x i64> @vsadd_vx_nxv1i64_unmasked(<vscale x 1 x i64> %va, i64 %b, i32 zeroext %evl) {
+; RV32-LABEL: vsadd_vx_nxv1i64_unmasked:
+; RV32: # %bb.0:
+; RV32-NEXT: addi sp, sp, -16
+; RV32-NEXT: .cfi_def_cfa_offset 16
+; RV32-NEXT: sw a1, 12(sp)
+; RV32-NEXT: sw a0, 8(sp)
+; RV32-NEXT: addi a0, sp, 8
+; RV32-NEXT: vsetvli a1, zero, e64, m1, ta, ma
+; RV32-NEXT: vlse64.v v9, (a0), zero
+; RV32-NEXT: vsetvli zero, a2, e64, m1, ta, ma
+; RV32-NEXT: vsadd.vv v8, v8, v9
+; RV32-NEXT: addi sp, sp, 16
+; RV32-NEXT: ret
+;
+; RV64-LABEL: vsadd_vx_nxv1i64_unmasked:
+; RV64: # %bb.0:
+; RV64-NEXT: vsetvli zero, a1, e64, m1, ta, ma
+; RV64-NEXT: vsadd.vx v8, v8, a0
+; RV64-NEXT: ret
+ %elt.head = insertelement <vscale x 1 x i64> poison, i64 %b, i32 0
+ %vb = shufflevector <vscale x 1 x i64> %elt.head, <vscale x 1 x i64> poison, <vscale x 1 x i32> zeroinitializer
+ %head = insertelement <vscale x 1 x i1> poison, i1 true, i32 0
+ %m = shufflevector <vscale x 1 x i1> %head, <vscale x 1 x i1> poison, <vscale x 1 x i32> zeroinitializer
+ %v = call <vscale x 1 x i64> @llvm.vp.sadd.sat.nxv1i64(<vscale x 1 x i64> %va, <vscale x 1 x i64> %vb, <vscale x 1 x i1> %m, i32 %evl)
+ ret <vscale x 1 x i64> %v
+}
+
+define <vscale x 1 x i64> @vsadd_vi_nxv1i64(<vscale x 1 x i64> %va, <vscale x 1 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vsadd_vi_nxv1i64:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, ma
+; CHECK-NEXT: vsadd.vi v8, v8, -1, v0.t
+; CHECK-NEXT: ret
+ %elt.head = insertelement <vscale x 1 x i64> poison, i64 -1, i32 0
+ %vb = shufflevector <vscale x 1 x i64> %elt.head, <vscale x 1 x i64> poison, <vscale x 1 x i32> zeroinitializer
+ %v = call <vscale x 1 x i64> @llvm.vp.sadd.sat.nxv1i64(<vscale x 1 x i64> %va, <vscale x 1 x i64> %vb, <vscale x 1 x i1> %m, i32 %evl)
+ ret <vscale x 1 x i64> %v
+}
+
+define <vscale x 1 x i64> @vsadd_vi_nxv1i64_unmasked(<vscale x 1 x i64> %va, i32 zeroext %evl) {
+; CHECK-LABEL: vsadd_vi_nxv1i64_unmasked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, ma
+; CHECK-NEXT: vsadd.vi v8, v8, -1
+; CHECK-NEXT: ret
+ %elt.head = insertelement <vscale x 1 x i64> poison, i64 -1, i32 0
+ %vb = shufflevector <vscale x 1 x i64> %elt.head, <vscale x 1 x i64> poison, <vscale x 1 x i32> zeroinitializer
+ %head = insertelement <vscale x 1 x i1> poison, i1 true, i32 0
+ %m = shufflevector <vscale x 1 x i1> %head, <vscale x 1 x i1> poison, <vscale x 1 x i32> zeroinitializer
+ %v = call <vscale x 1 x i64> @llvm.vp.sadd.sat.nxv1i64(<vscale x 1 x i64> %va, <vscale x 1 x i64> %vb, <vscale x 1 x i1> %m, i32 %evl)
+ ret <vscale x 1 x i64> %v
+}
+
+declare <vscale x 2 x i64> @llvm.vp.sadd.sat.nxv2i64(<vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i1>, i32)
+
+define <vscale x 2 x i64> @vsadd_vv_nxv2i64(<vscale x 2 x i64> %va, <vscale x 2 x i64> %b, <vscale x 2 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vsadd_vv_nxv2i64:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e64, m2, ta, ma
+; CHECK-NEXT: vsadd.vv v8, v8, v10, v0.t
+; CHECK-NEXT: ret
+ %v = call <vscale x 2 x i64> @llvm.vp.sadd.sat.nxv2i64(<vscale x 2 x i64> %va, <vscale x 2 x i64> %b, <vscale x 2 x i1> %m, i32 %evl)
+ ret <vscale x 2 x i64> %v
+}
+
+define <vscale x 2 x i64> @vsadd_vv_nxv2i64_unmasked(<vscale x 2 x i64> %va, <vscale x 2 x i64> %b, i32 zeroext %evl) {
+; CHECK-LABEL: vsadd_vv_nxv2i64_unmasked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e64, m2, ta, ma
+; CHECK-NEXT: vsadd.vv v8, v8, v10
+; CHECK-NEXT: ret
+ %head = insertelement <vscale x 2 x i1> poison, i1 true, i32 0
+ %m = shufflevector <vscale x 2 x i1> %head, <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer
+ %v = call <vscale x 2 x i64> @llvm.vp.sadd.sat.nxv2i64(<vscale x 2 x i64> %va, <vscale x 2 x i64> %b, <vscale x 2 x i1> %m, i32 %evl)
+ ret <vscale x 2 x i64> %v
+}
+
+define <vscale x 2 x i64> @vsadd_vx_nxv2i64(<vscale x 2 x i64> %va, i64 %b, <vscale x 2 x i1> %m, i32 zeroext %evl) {
+; RV32-LABEL: vsadd_vx_nxv2i64:
+; RV32: # %bb.0:
+; RV32-NEXT: addi sp, sp, -16
+; RV32-NEXT: .cfi_def_cfa_offset 16
+; RV32-NEXT: sw a1, 12(sp)
+; RV32-NEXT: sw a0, 8(sp)
+; RV32-NEXT: addi a0, sp, 8
+; RV32-NEXT: vsetvli a1, zero, e64, m2, ta, ma
+; RV32-NEXT: vlse64.v v10, (a0), zero
+; RV32-NEXT: vsetvli zero, a2, e64, m2, ta, ma
+; RV32-NEXT: vsadd.vv v8, v8, v10, v0.t
+; RV32-NEXT: addi sp, sp, 16
+; RV32-NEXT: ret
+;
+; RV64-LABEL: vsadd_vx_nxv2i64:
+; RV64: # %bb.0:
+; RV64-NEXT: vsetvli zero, a1, e64, m2, ta, ma
+; RV64-NEXT: vsadd.vx v8, v8, a0, v0.t
+; RV64-NEXT: ret
+ %elt.head = insertelement <vscale x 2 x i64> poison, i64 %b, i32 0
+ %vb = shufflevector <vscale x 2 x i64> %elt.head, <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer
+ %v = call <vscale x 2 x i64> @llvm.vp.sadd.sat.nxv2i64(<vscale x 2 x i64> %va, <vscale x 2 x i64> %vb, <vscale x 2 x i1> %m, i32 %evl)
+ ret <vscale x 2 x i64> %v
+}
+
+define <vscale x 2 x i64> @vsadd_vx_nxv2i64_unmasked(<vscale x 2 x i64> %va, i64 %b, i32 zeroext %evl) {
+; RV32-LABEL: vsadd_vx_nxv2i64_unmasked:
+; RV32: # %bb.0:
+; RV32-NEXT: addi sp, sp, -16
+; RV32-NEXT: .cfi_def_cfa_offset 16
+; RV32-NEXT: sw a1, 12(sp)
+; RV32-NEXT: sw a0, 8(sp)
+; RV32-NEXT: addi a0, sp, 8
+; RV32-NEXT: vsetvli a1, zero, e64, m2, ta, ma
+; RV32-NEXT: vlse64.v v10, (a0), zero
+; RV32-NEXT: vsetvli zero, a2, e64, m2, ta, ma
+; RV32-NEXT: vsadd.vv v8, v8, v10
+; RV32-NEXT: addi sp, sp, 16
+; RV32-NEXT: ret
+;
+; RV64-LABEL: vsadd_vx_nxv2i64_unmasked:
+; RV64: # %bb.0:
+; RV64-NEXT: vsetvli zero, a1, e64, m2, ta, ma
+; RV64-NEXT: vsadd.vx v8, v8, a0
+; RV64-NEXT: ret
+ %elt.head = insertelement <vscale x 2 x i64> poison, i64 %b, i32 0
+ %vb = shufflevector <vscale x 2 x i64> %elt.head, <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer
+ %head = insertelement <vscale x 2 x i1> poison, i1 true, i32 0
+ %m = shufflevector <vscale x 2 x i1> %head, <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer
+ %v = call <vscale x 2 x i64> @llvm.vp.sadd.sat.nxv2i64(<vscale x 2 x i64> %va, <vscale x 2 x i64> %vb, <vscale x 2 x i1> %m, i32 %evl)
+ ret <vscale x 2 x i64> %v
+}
+
+define <vscale x 2 x i64> @vsadd_vi_nxv2i64(<vscale x 2 x i64> %va, <vscale x 2 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vsadd_vi_nxv2i64:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e64, m2, ta, ma
+; CHECK-NEXT: vsadd.vi v8, v8, -1, v0.t
+; CHECK-NEXT: ret
+ %elt.head = insertelement <vscale x 2 x i64> poison, i64 -1, i32 0
+ %vb = shufflevector <vscale x 2 x i64> %elt.head, <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer
+ %v = call <vscale x 2 x i64> @llvm.vp.sadd.sat.nxv2i64(<vscale x 2 x i64> %va, <vscale x 2 x i64> %vb, <vscale x 2 x i1> %m, i32 %evl)
+ ret <vscale x 2 x i64> %v
+}
+
+define <vscale x 2 x i64> @vsadd_vi_nxv2i64_unmasked(<vscale x 2 x i64> %va, i32 zeroext %evl) {
+; CHECK-LABEL: vsadd_vi_nxv2i64_unmasked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e64, m2, ta, ma
+; CHECK-NEXT: vsadd.vi v8, v8, -1
+; CHECK-NEXT: ret
+ %elt.head = insertelement <vscale x 2 x i64> poison, i64 -1, i32 0
+ %vb = shufflevector <vscale x 2 x i64> %elt.head, <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer
+ %head = insertelement <vscale x 2 x i1> poison, i1 true, i32 0
+ %m = shufflevector <vscale x 2 x i1> %head, <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer
+ %v = call <vscale x 2 x i64> @llvm.vp.sadd.sat.nxv2i64(<vscale x 2 x i64> %va, <vscale x 2 x i64> %vb, <vscale x 2 x i1> %m, i32 %evl)
+ ret <vscale x 2 x i64> %v
+}
+
+declare <vscale x 4 x i64> @llvm.vp.sadd.sat.nxv4i64(<vscale x 4 x i64>, <vscale x 4 x i64>, <vscale x 4 x i1>, i32)
+
+define <vscale x 4 x i64> @vsadd_vv_nxv4i64(<vscale x 4 x i64> %va, <vscale x 4 x i64> %b, <vscale x 4 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vsadd_vv_nxv4i64:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e64, m4, ta, ma
+; CHECK-NEXT: vsadd.vv v8, v8, v12, v0.t
+; CHECK-NEXT: ret
+ %v = call <vscale x 4 x i64> @llvm.vp.sadd.sat.nxv4i64(<vscale x 4 x i64> %va, <vscale x 4 x i64> %b, <vscale x 4 x i1> %m, i32 %evl)
+ ret <vscale x 4 x i64> %v
+}
+
+define <vscale x 4 x i64> @vsadd_vv_nxv4i64_unmasked(<vscale x 4 x i64> %va, <vscale x 4 x i64> %b, i32 zeroext %evl) {
+; CHECK-LABEL: vsadd_vv_nxv4i64_unmasked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e64, m4, ta, ma
+; CHECK-NEXT: vsadd.vv v8, v8, v12
+; CHECK-NEXT: ret
+ %head = insertelement <vscale x 4 x i1> poison, i1 true, i32 0
+ %m = shufflevector <vscale x 4 x i1> %head, <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer
+ %v = call <vscale x 4 x i64> @llvm.vp.sadd.sat.nxv4i64(<vscale x 4 x i64> %va, <vscale x 4 x i64> %b, <vscale x 4 x i1> %m, i32 %evl)
+ ret <vscale x 4 x i64> %v
+}
+
+define <vscale x 4 x i64> @vsadd_vx_nxv4i64(<vscale x 4 x i64> %va, i64 %b, <vscale x 4 x i1> %m, i32 zeroext %evl) {
+; RV32-LABEL: vsadd_vx_nxv4i64:
+; RV32: # %bb.0:
+; RV32-NEXT: addi sp, sp, -16
+; RV32-NEXT: .cfi_def_cfa_offset 16
+; RV32-NEXT: sw a1, 12(sp)
+; RV32-NEXT: sw a0, 8(sp)
+; RV32-NEXT: addi a0, sp, 8
+; RV32-NEXT: vsetvli a1, zero, e64, m4, ta, ma
+; RV32-NEXT: vlse64.v v12, (a0), zero
+; RV32-NEXT: vsetvli zero, a2, e64, m4, ta, ma
+; RV32-NEXT: vsadd.vv v8, v8, v12, v0.t
+; RV32-NEXT: addi sp, sp, 16
+; RV32-NEXT: ret
+;
+; RV64-LABEL: vsadd_vx_nxv4i64:
+; RV64: # %bb.0:
+; RV64-NEXT: vsetvli zero, a1, e64, m4, ta, ma
+; RV64-NEXT: vsadd.vx v8, v8, a0, v0.t
+; RV64-NEXT: ret
+ %elt.head = insertelement <vscale x 4 x i64> poison, i64 %b, i32 0
+ %vb = shufflevector <vscale x 4 x i64> %elt.head, <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer
+ %v = call <vscale x 4 x i64> @llvm.vp.sadd.sat.nxv4i64(<vscale x 4 x i64> %va, <vscale x 4 x i64> %vb, <vscale x 4 x i1> %m, i32 %evl)
+ ret <vscale x 4 x i64> %v
+}
+
+define <vscale x 4 x i64> @vsadd_vx_nxv4i64_unmasked(<vscale x 4 x i64> %va, i64 %b, i32 zeroext %evl) {
+; RV32-LABEL: vsadd_vx_nxv4i64_unmasked:
+; RV32: # %bb.0:
+; RV32-NEXT: addi sp, sp, -16
+; RV32-NEXT: .cfi_def_cfa_offset 16
+; RV32-NEXT: sw a1, 12(sp)
+; RV32-NEXT: sw a0, 8(sp)
+; RV32-NEXT: addi a0, sp, 8
+; RV32-NEXT: vsetvli a1, zero, e64, m4, ta, ma
+; RV32-NEXT: vlse64.v v12, (a0), zero
+; RV32-NEXT: vsetvli zero, a2, e64, m4, ta, ma
+; RV32-NEXT: vsadd.vv v8, v8, v12
+; RV32-NEXT: addi sp, sp, 16
+; RV32-NEXT: ret
+;
+; RV64-LABEL: vsadd_vx_nxv4i64_unmasked:
+; RV64: # %bb.0:
+; RV64-NEXT: vsetvli zero, a1, e64, m4, ta, ma
+; RV64-NEXT: vsadd.vx v8, v8, a0
+; RV64-NEXT: ret
+ %elt.head = insertelement <vscale x 4 x i64> poison, i64 %b, i32 0
+ %vb = shufflevector <vscale x 4 x i64> %elt.head, <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer
+ %head = insertelement <vscale x 4 x i1> poison, i1 true, i32 0
+ %m = shufflevector <vscale x 4 x i1> %head, <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer
+ %v = call <vscale x 4 x i64> @llvm.vp.sadd.sat.nxv4i64(<vscale x 4 x i64> %va, <vscale x 4 x i64> %vb, <vscale x 4 x i1> %m, i32 %evl)
+ ret <vscale x 4 x i64> %v
+}
+
+define <vscale x 4 x i64> @vsadd_vi_nxv4i64(<vscale x 4 x i64> %va, <vscale x 4 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vsadd_vi_nxv4i64:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e64, m4, ta, ma
+; CHECK-NEXT: vsadd.vi v8, v8, -1, v0.t
+; CHECK-NEXT: ret
+ %elt.head = insertelement <vscale x 4 x i64> poison, i64 -1, i32 0
+ %vb = shufflevector <vscale x 4 x i64> %elt.head, <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer
+ %v = call <vscale x 4 x i64> @llvm.vp.sadd.sat.nxv4i64(<vscale x 4 x i64> %va, <vscale x 4 x i64> %vb, <vscale x 4 x i1> %m, i32 %evl)
+ ret <vscale x 4 x i64> %v
+}
+
+define <vscale x 4 x i64> @vsadd_vi_nxv4i64_unmasked(<vscale x 4 x i64> %va, i32 zeroext %evl) {
+; CHECK-LABEL: vsadd_vi_nxv4i64_unmasked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e64, m4, ta, ma
+; CHECK-NEXT: vsadd.vi v8, v8, -1
+; CHECK-NEXT: ret
+ %elt.head = insertelement <vscale x 4 x i64> poison, i64 -1, i32 0
+ %vb = shufflevector <vscale x 4 x i64> %elt.head, <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer
+ %head = insertelement <vscale x 4 x i1> poison, i1 true, i32 0
+ %m = shufflevector <vscale x 4 x i1> %head, <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer
+ %v = call <vscale x 4 x i64> @llvm.vp.sadd.sat.nxv4i64(<vscale x 4 x i64> %va, <vscale x 4 x i64> %vb, <vscale x 4 x i1> %m, i32 %evl)
+ ret <vscale x 4 x i64> %v
+}
+
+declare <vscale x 8 x i64> @llvm.vp.sadd.sat.nxv8i64(<vscale x 8 x i64>, <vscale x 8 x i64>, <vscale x 8 x i1>, i32)
+
+define <vscale x 8 x i64> @vsadd_vv_nxv8i64(<vscale x 8 x i64> %va, <vscale x 8 x i64> %b, <vscale x 8 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vsadd_vv_nxv8i64:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma
+; CHECK-NEXT: vsadd.vv v8, v8, v16, v0.t
+; CHECK-NEXT: ret
+ %v = call <vscale x 8 x i64> @llvm.vp.sadd.sat.nxv8i64(<vscale x 8 x i64> %va, <vscale x 8 x i64> %b, <vscale x 8 x i1> %m, i32 %evl)
+ ret <vscale x 8 x i64> %v
+}
+
+define <vscale x 8 x i64> @vsadd_vv_nxv8i64_unmasked(<vscale x 8 x i64> %va, <vscale x 8 x i64> %b, i32 zeroext %evl) {
+; CHECK-LABEL: vsadd_vv_nxv8i64_unmasked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma
+; CHECK-NEXT: vsadd.vv v8, v8, v16
+; CHECK-NEXT: ret
+ %head = insertelement <vscale x 8 x i1> poison, i1 true, i32 0
+ %m = shufflevector <vscale x 8 x i1> %head, <vscale x 8 x i1> poison, <vscale x 8 x i32> zeroinitializer
+ %v = call <vscale x 8 x i64> @llvm.vp.sadd.sat.nxv8i64(<vscale x 8 x i64> %va, <vscale x 8 x i64> %b, <vscale x 8 x i1> %m, i32 %evl)
+ ret <vscale x 8 x i64> %v
+}
+
+define <vscale x 8 x i64> @vsadd_vx_nxv8i64(<vscale x 8 x i64> %va, i64 %b, <vscale x 8 x i1> %m, i32 zeroext %evl) {
+; RV32-LABEL: vsadd_vx_nxv8i64:
+; RV32: # %bb.0:
+; RV32-NEXT: addi sp, sp, -16
+; RV32-NEXT: .cfi_def_cfa_offset 16
+; RV32-NEXT: sw a1, 12(sp)
+; RV32-NEXT: sw a0, 8(sp)
+; RV32-NEXT: addi a0, sp, 8
+; RV32-NEXT: vsetvli a1, zero, e64, m8, ta, ma
+; RV32-NEXT: vlse64.v v16, (a0), zero
+; RV32-NEXT: vsetvli zero, a2, e64, m8, ta, ma
+; RV32-NEXT: vsadd.vv v8, v8, v16, v0.t
+; RV32-NEXT: addi sp, sp, 16
+; RV32-NEXT: ret
+;
+; RV64-LABEL: vsadd_vx_nxv8i64:
+; RV64: # %bb.0:
+; RV64-NEXT: vsetvli zero, a1, e64, m8, ta, ma
+; RV64-NEXT: vsadd.vx v8, v8, a0, v0.t
+; RV64-NEXT: ret
+ %elt.head = insertelement <vscale x 8 x i64> poison, i64 %b, i32 0
+ %vb = shufflevector <vscale x 8 x i64> %elt.head, <vscale x 8 x i64> poison, <vscale x 8 x i32> zeroinitializer
+ %v = call <vscale x 8 x i64> @llvm.vp.sadd.sat.nxv8i64(<vscale x 8 x i64> %va, <vscale x 8 x i64> %vb, <vscale x 8 x i1> %m, i32 %evl)
+ ret <vscale x 8 x i64> %v
+}
+
+define <vscale x 8 x i64> @vsadd_vx_nxv8i64_unmasked(<vscale x 8 x i64> %va, i64 %b, i32 zeroext %evl) {
+; RV32-LABEL: vsadd_vx_nxv8i64_unmasked:
+; RV32: # %bb.0:
+; RV32-NEXT: addi sp, sp, -16
+; RV32-NEXT: .cfi_def_cfa_offset 16
+; RV32-NEXT: sw a1, 12(sp)
+; RV32-NEXT: sw a0, 8(sp)
+; RV32-NEXT: addi a0, sp, 8
+; RV32-NEXT: vsetvli a1, zero, e64, m8, ta, ma
+; RV32-NEXT: vlse64.v v16, (a0), zero
+; RV32-NEXT: vsetvli zero, a2, e64, m8, ta, ma
+; RV32-NEXT: vsadd.vv v8, v8, v16
+; RV32-NEXT: addi sp, sp, 16
+; RV32-NEXT: ret
+;
+; RV64-LABEL: vsadd_vx_nxv8i64_unmasked:
+; RV64: # %bb.0:
+; RV64-NEXT: vsetvli zero, a1, e64, m8, ta, ma
+; RV64-NEXT: vsadd.vx v8, v8, a0
+; RV64-NEXT: ret
+ %elt.head = insertelement <vscale x 8 x i64> poison, i64 %b, i32 0
+ %vb = shufflevector <vscale x 8 x i64> %elt.head, <vscale x 8 x i64> poison, <vscale x 8 x i32> zeroinitializer
+ %head = insertelement <vscale x 8 x i1> poison, i1 true, i32 0
+ %m = shufflevector <vscale x 8 x i1> %head, <vscale x 8 x i1> poison, <vscale x 8 x i32> zeroinitializer
+ %v = call <vscale x 8 x i64> @llvm.vp.sadd.sat.nxv8i64(<vscale x 8 x i64> %va, <vscale x 8 x i64> %vb, <vscale x 8 x i1> %m, i32 %evl)
+ ret <vscale x 8 x i64> %v
+}
+
+define <vscale x 8 x i64> @vsadd_vi_nxv8i64(<vscale x 8 x i64> %va, <vscale x 8 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vsadd_vi_nxv8i64:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma
+; CHECK-NEXT: vsadd.vi v8, v8, -1, v0.t
+; CHECK-NEXT: ret
+ %elt.head = insertelement <vscale x 8 x i64> poison, i64 -1, i32 0
+ %vb = shufflevector <vscale x 8 x i64> %elt.head, <vscale x 8 x i64> poison, <vscale x 8 x i32> zeroinitializer
+ %v = call <vscale x 8 x i64> @llvm.vp.sadd.sat.nxv8i64(<vscale x 8 x i64> %va, <vscale x 8 x i64> %vb, <vscale x 8 x i1> %m, i32 %evl)
+ ret <vscale x 8 x i64> %v
+}
+
+define <vscale x 8 x i64> @vsadd_vi_nxv8i64_unmasked(<vscale x 8 x i64> %va, i32 zeroext %evl) {
+; CHECK-LABEL: vsadd_vi_nxv8i64_unmasked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma
+; CHECK-NEXT: vsadd.vi v8, v8, -1
+; CHECK-NEXT: ret
+ %elt.head = insertelement <vscale x 8 x i64> poison, i64 -1, i32 0
+ %vb = shufflevector <vscale x 8 x i64> %elt.head, <vscale x 8 x i64> poison, <vscale x 8 x i32> zeroinitializer
+ %head = insertelement <vscale x 8 x i1> poison, i1 true, i32 0
+ %m = shufflevector <vscale x 8 x i1> %head, <vscale x 8 x i1> poison, <vscale x 8 x i32> zeroinitializer
+ %v = call <vscale x 8 x i64> @llvm.vp.sadd.sat.nxv8i64(<vscale x 8 x i64> %va, <vscale x 8 x i64> %vb, <vscale x 8 x i1> %m, i32 %evl)
+ ret <vscale x 8 x i64> %v
+}
diff --git a/llvm/test/CodeGen/RISCV/rvv/vsaddu-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vsaddu-vp.ll
new file mode 100644
index 0000000..c0779e5
--- /dev/null
+++ b/llvm/test/CodeGen/RISCV/rvv/vsaddu-vp.ll
@@ -0,0 +1,2014 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=riscv32 -mattr=+v -verify-machineinstrs < %s \
+; RUN: | FileCheck %s --check-prefixes=CHECK,RV32
+; RUN: llc -mtriple=riscv64 -mattr=+v -verify-machineinstrs < %s \
+; RUN: | FileCheck %s --check-prefixes=CHECK,RV64
+
+declare <vscale x 8 x i7> @llvm.vp.uadd.sat.nxv8i7(<vscale x 8 x i7>, <vscale x 8 x i7>, <vscale x 8 x i1>, i32)
+
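+; i7 is not a legal element type, so the saturating add is expanded: both
+; operands are zero-extended into i8 lanes by masking with 127, added with a
+; plain vadd.vv, and the result is clamped back to the i7 range with vminu.vx
+; (see the CHECK lines below).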
+define <vscale x 8 x i7> @vsaddu_vx_nxv8i7(<vscale x 8 x i7> %a, i7 signext %b, <vscale x 8 x i1> %mask, i32 zeroext %evl) {
+; CHECK-LABEL: vsaddu_vx_nxv8i7:
+; CHECK: # %bb.0:
+; CHECK-NEXT: li a2, 127
+; CHECK-NEXT: vsetvli a3, zero, e8, m1, ta, ma
+; CHECK-NEXT: vand.vx v8, v8, a2
+; CHECK-NEXT: vmv.v.x v9, a0
+; CHECK-NEXT: vand.vx v9, v9, a2
+; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, ma
+; CHECK-NEXT: vadd.vv v8, v8, v9, v0.t
+; CHECK-NEXT: vminu.vx v8, v8, a2, v0.t
+; CHECK-NEXT: ret
+ %elt.head = insertelement <vscale x 8 x i7> poison, i7 %b, i32 0
+ %vb = shufflevector <vscale x 8 x i7> %elt.head, <vscale x 8 x i7> poison, <vscale x 8 x i32> zeroinitializer
+ %v = call <vscale x 8 x i7> @llvm.vp.uadd.sat.nxv8i7(<vscale x 8 x i7> %a, <vscale x 8 x i7> %vb, <vscale x 8 x i1> %mask, i32 %evl)
+ ret <vscale x 8 x i7> %v
+}
+
+declare <vscale x 1 x i8> @llvm.vp.uadd.sat.nxv1i8(<vscale x 1 x i8>, <vscale x 1 x i8>, <vscale x 1 x i1>, i32)
+
+define <vscale x 1 x i8> @vsaddu_vv_nxv1i8(<vscale x 1 x i8> %va, <vscale x 1 x i8> %b, <vscale x 1 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vsaddu_vv_nxv1i8:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e8, mf8, ta, ma
+; CHECK-NEXT: vsaddu.vv v8, v8, v9, v0.t
+; CHECK-NEXT: ret
+ %v = call <vscale x 1 x i8> @llvm.vp.uadd.sat.nxv1i8(<vscale x 1 x i8> %va, <vscale x 1 x i8> %b, <vscale x 1 x i1> %m, i32 %evl)
+ ret <vscale x 1 x i8> %v
+}
+
+define <vscale x 1 x i8> @vsaddu_vv_nxv1i8_unmasked(<vscale x 1 x i8> %va, <vscale x 1 x i8> %b, i32 zeroext %evl) {
+; CHECK-LABEL: vsaddu_vv_nxv1i8_unmasked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e8, mf8, ta, ma
+; CHECK-NEXT: vsaddu.vv v8, v8, v9
+; CHECK-NEXT: ret
+ %head = insertelement <vscale x 1 x i1> poison, i1 true, i32 0
+ %m = shufflevector <vscale x 1 x i1> %head, <vscale x 1 x i1> poison, <vscale x 1 x i32> zeroinitializer
+ %v = call <vscale x 1 x i8> @llvm.vp.uadd.sat.nxv1i8(<vscale x 1 x i8> %va, <vscale x 1 x i8> %b, <vscale x 1 x i1> %m, i32 %evl)
+ ret <vscale x 1 x i8> %v
+}
+
+define <vscale x 1 x i8> @vsaddu_vx_nxv1i8(<vscale x 1 x i8> %va, i8 %b, <vscale x 1 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vsaddu_vx_nxv1i8:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, ma
+; CHECK-NEXT: vsaddu.vx v8, v8, a0, v0.t
+; CHECK-NEXT: ret
+ %elt.head = insertelement <vscale x 1 x i8> poison, i8 %b, i32 0
+ %vb = shufflevector <vscale x 1 x i8> %elt.head, <vscale x 1 x i8> poison, <vscale x 1 x i32> zeroinitializer
+ %v = call <vscale x 1 x i8> @llvm.vp.uadd.sat.nxv1i8(<vscale x 1 x i8> %va, <vscale x 1 x i8> %vb, <vscale x 1 x i1> %m, i32 %evl)
+ ret <vscale x 1 x i8> %v
+}
+
+define <vscale x 1 x i8> @vsaddu_vx_nxv1i8_commute(<vscale x 1 x i8> %va, i8 %b, <vscale x 1 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vsaddu_vx_nxv1i8_commute:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, ma
+; CHECK-NEXT: vsaddu.vx v8, v8, a0, v0.t
+; CHECK-NEXT: ret
+ %elt.head = insertelement <vscale x 1 x i8> poison, i8 %b, i32 0
+ %vb = shufflevector <vscale x 1 x i8> %elt.head, <vscale x 1 x i8> poison, <vscale x 1 x i32> zeroinitializer
+ %v = call <vscale x 1 x i8> @llvm.vp.uadd.sat.nxv1i8(<vscale x 1 x i8> %vb, <vscale x 1 x i8> %va, <vscale x 1 x i1> %m, i32 %evl)
+ ret <vscale x 1 x i8> %v
+}
+
+define <vscale x 1 x i8> @vsaddu_vx_nxv1i8_unmasked(<vscale x 1 x i8> %va, i8 %b, i32 zeroext %evl) {
+; CHECK-LABEL: vsaddu_vx_nxv1i8_unmasked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, ma
+; CHECK-NEXT: vsaddu.vx v8, v8, a0
+; CHECK-NEXT: ret
+ %elt.head = insertelement <vscale x 1 x i8> poison, i8 %b, i32 0
+ %vb = shufflevector <vscale x 1 x i8> %elt.head, <vscale x 1 x i8> poison, <vscale x 1 x i32> zeroinitializer
+ %head = insertelement <vscale x 1 x i1> poison, i1 true, i32 0
+ %m = shufflevector <vscale x 1 x i1> %head, <vscale x 1 x i1> poison, <vscale x 1 x i32> zeroinitializer
+ %v = call <vscale x 1 x i8> @llvm.vp.uadd.sat.nxv1i8(<vscale x 1 x i8> %va, <vscale x 1 x i8> %vb, <vscale x 1 x i1> %m, i32 %evl)
+ ret <vscale x 1 x i8> %v
+}
+
+define <vscale x 1 x i8> @vsaddu_vi_nxv1i8(<vscale x 1 x i8> %va, <vscale x 1 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vsaddu_vi_nxv1i8:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e8, mf8, ta, ma
+; CHECK-NEXT: vsaddu.vi v8, v8, -1, v0.t
+; CHECK-NEXT: ret
+ %elt.head = insertelement <vscale x 1 x i8> poison, i8 -1, i32 0
+ %vb = shufflevector <vscale x 1 x i8> %elt.head, <vscale x 1 x i8> poison, <vscale x 1 x i32> zeroinitializer
+ %v = call <vscale x 1 x i8> @llvm.vp.uadd.sat.nxv1i8(<vscale x 1 x i8> %va, <vscale x 1 x i8> %vb, <vscale x 1 x i1> %m, i32 %evl)
+ ret <vscale x 1 x i8> %v
+}
+
+define <vscale x 1 x i8> @vsaddu_vi_nxv1i8_unmasked(<vscale x 1 x i8> %va, i32 zeroext %evl) {
+; CHECK-LABEL: vsaddu_vi_nxv1i8_unmasked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e8, mf8, ta, ma
+; CHECK-NEXT: vsaddu.vi v8, v8, -1
+; CHECK-NEXT: ret
+ %elt.head = insertelement <vscale x 1 x i8> poison, i8 -1, i32 0
+ %vb = shufflevector <vscale x 1 x i8> %elt.head, <vscale x 1 x i8> poison, <vscale x 1 x i32> zeroinitializer
+ %head = insertelement <vscale x 1 x i1> poison, i1 true, i32 0
+ %m = shufflevector <vscale x 1 x i1> %head, <vscale x 1 x i1> poison, <vscale x 1 x i32> zeroinitializer
+ %v = call <vscale x 1 x i8> @llvm.vp.uadd.sat.nxv1i8(<vscale x 1 x i8> %va, <vscale x 1 x i8> %vb, <vscale x 1 x i1> %m, i32 %evl)
+ ret <vscale x 1 x i8> %v
+}
+
+declare <vscale x 2 x i8> @llvm.vp.uadd.sat.nxv2i8(<vscale x 2 x i8>, <vscale x 2 x i8>, <vscale x 2 x i1>, i32)
+
+define <vscale x 2 x i8> @vsaddu_vv_nxv2i8(<vscale x 2 x i8> %va, <vscale x 2 x i8> %b, <vscale x 2 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vsaddu_vv_nxv2i8:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e8, mf4, ta, ma
+; CHECK-NEXT: vsaddu.vv v8, v8, v9, v0.t
+; CHECK-NEXT: ret
+ %v = call <vscale x 2 x i8> @llvm.vp.uadd.sat.nxv2i8(<vscale x 2 x i8> %va, <vscale x 2 x i8> %b, <vscale x 2 x i1> %m, i32 %evl)
+ ret <vscale x 2 x i8> %v
+}
+
+define <vscale x 2 x i8> @vsaddu_vv_nxv2i8_unmasked(<vscale x 2 x i8> %va, <vscale x 2 x i8> %b, i32 zeroext %evl) {
+; CHECK-LABEL: vsaddu_vv_nxv2i8_unmasked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e8, mf4, ta, ma
+; CHECK-NEXT: vsaddu.vv v8, v8, v9
+; CHECK-NEXT: ret
+ %head = insertelement <vscale x 2 x i1> poison, i1 true, i32 0
+ %m = shufflevector <vscale x 2 x i1> %head, <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer
+ %v = call <vscale x 2 x i8> @llvm.vp.uadd.sat.nxv2i8(<vscale x 2 x i8> %va, <vscale x 2 x i8> %b, <vscale x 2 x i1> %m, i32 %evl)
+ ret <vscale x 2 x i8> %v
+}
+
+define <vscale x 2 x i8> @vsaddu_vx_nxv2i8(<vscale x 2 x i8> %va, i8 %b, <vscale x 2 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vsaddu_vx_nxv2i8:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, ma
+; CHECK-NEXT: vsaddu.vx v8, v8, a0, v0.t
+; CHECK-NEXT: ret
+ %elt.head = insertelement <vscale x 2 x i8> poison, i8 %b, i32 0
+ %vb = shufflevector <vscale x 2 x i8> %elt.head, <vscale x 2 x i8> poison, <vscale x 2 x i32> zeroinitializer
+ %v = call <vscale x 2 x i8> @llvm.vp.uadd.sat.nxv2i8(<vscale x 2 x i8> %va, <vscale x 2 x i8> %vb, <vscale x 2 x i1> %m, i32 %evl)
+ ret <vscale x 2 x i8> %v
+}
+
+define <vscale x 2 x i8> @vsaddu_vx_nxv2i8_unmasked(<vscale x 2 x i8> %va, i8 %b, i32 zeroext %evl) {
+; CHECK-LABEL: vsaddu_vx_nxv2i8_unmasked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, ma
+; CHECK-NEXT: vsaddu.vx v8, v8, a0
+; CHECK-NEXT: ret
+ %elt.head = insertelement <vscale x 2 x i8> poison, i8 %b, i32 0
+ %vb = shufflevector <vscale x 2 x i8> %elt.head, <vscale x 2 x i8> poison, <vscale x 2 x i32> zeroinitializer
+ %head = insertelement <vscale x 2 x i1> poison, i1 true, i32 0
+ %m = shufflevector <vscale x 2 x i1> %head, <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer
+ %v = call <vscale x 2 x i8> @llvm.vp.uadd.sat.nxv2i8(<vscale x 2 x i8> %va, <vscale x 2 x i8> %vb, <vscale x 2 x i1> %m, i32 %evl)
+ ret <vscale x 2 x i8> %v
+}
+
+define <vscale x 2 x i8> @vsaddu_vi_nxv2i8(<vscale x 2 x i8> %va, <vscale x 2 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vsaddu_vi_nxv2i8:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e8, mf4, ta, ma
+; CHECK-NEXT: vsaddu.vi v8, v8, -1, v0.t
+; CHECK-NEXT: ret
+ %elt.head = insertelement <vscale x 2 x i8> poison, i8 -1, i32 0
+ %vb = shufflevector <vscale x 2 x i8> %elt.head, <vscale x 2 x i8> poison, <vscale x 2 x i32> zeroinitializer
+ %v = call <vscale x 2 x i8> @llvm.vp.uadd.sat.nxv2i8(<vscale x 2 x i8> %va, <vscale x 2 x i8> %vb, <vscale x 2 x i1> %m, i32 %evl)
+ ret <vscale x 2 x i8> %v
+}
+
+define <vscale x 2 x i8> @vsaddu_vi_nxv2i8_unmasked(<vscale x 2 x i8> %va, i32 zeroext %evl) {
+; CHECK-LABEL: vsaddu_vi_nxv2i8_unmasked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e8, mf4, ta, ma
+; CHECK-NEXT: vsaddu.vi v8, v8, -1
+; CHECK-NEXT: ret
+ %elt.head = insertelement <vscale x 2 x i8> poison, i8 -1, i32 0
+ %vb = shufflevector <vscale x 2 x i8> %elt.head, <vscale x 2 x i8> poison, <vscale x 2 x i32> zeroinitializer
+ %head = insertelement <vscale x 2 x i1> poison, i1 true, i32 0
+ %m = shufflevector <vscale x 2 x i1> %head, <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer
+ %v = call <vscale x 2 x i8> @llvm.vp.uadd.sat.nxv2i8(<vscale x 2 x i8> %va, <vscale x 2 x i8> %vb, <vscale x 2 x i1> %m, i32 %evl)
+ ret <vscale x 2 x i8> %v
+}
+
+declare <vscale x 3 x i8> @llvm.vp.uadd.sat.nxv3i8(<vscale x 3 x i8>, <vscale x 3 x i8>, <vscale x 3 x i1>, i32)
+
+define <vscale x 3 x i8> @vsaddu_vv_nxv3i8(<vscale x 3 x i8> %va, <vscale x 3 x i8> %b, <vscale x 3 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vsaddu_vv_nxv3i8:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e8, mf2, ta, ma
+; CHECK-NEXT: vsaddu.vv v8, v8, v9, v0.t
+; CHECK-NEXT: ret
+ %v = call <vscale x 3 x i8> @llvm.vp.uadd.sat.nxv3i8(<vscale x 3 x i8> %va, <vscale x 3 x i8> %b, <vscale x 3 x i1> %m, i32 %evl)
+ ret <vscale x 3 x i8> %v
+}
+
+define <vscale x 3 x i8> @vsaddu_vv_nxv3i8_unmasked(<vscale x 3 x i8> %va, <vscale x 3 x i8> %b, i32 zeroext %evl) {
+; CHECK-LABEL: vsaddu_vv_nxv3i8_unmasked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e8, mf2, ta, ma
+; CHECK-NEXT: vsaddu.vv v8, v8, v9
+; CHECK-NEXT: ret
+ %head = insertelement <vscale x 3 x i1> poison, i1 true, i32 0
+ %m = shufflevector <vscale x 3 x i1> %head, <vscale x 3 x i1> poison, <vscale x 3 x i32> zeroinitializer
+ %v = call <vscale x 3 x i8> @llvm.vp.uadd.sat.nxv3i8(<vscale x 3 x i8> %va, <vscale x 3 x i8> %b, <vscale x 3 x i1> %m, i32 %evl)
+ ret <vscale x 3 x i8> %v
+}
+
+define <vscale x 3 x i8> @vsaddu_vx_nxv3i8(<vscale x 3 x i8> %va, i8 %b, <vscale x 3 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vsaddu_vx_nxv3i8:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma
+; CHECK-NEXT: vsaddu.vx v8, v8, a0, v0.t
+; CHECK-NEXT: ret
+ %elt.head = insertelement <vscale x 3 x i8> poison, i8 %b, i32 0
+ %vb = shufflevector <vscale x 3 x i8> %elt.head, <vscale x 3 x i8> poison, <vscale x 3 x i32> zeroinitializer
+ %v = call <vscale x 3 x i8> @llvm.vp.uadd.sat.nxv3i8(<vscale x 3 x i8> %va, <vscale x 3 x i8> %vb, <vscale x 3 x i1> %m, i32 %evl)
+ ret <vscale x 3 x i8> %v
+}
+
+define <vscale x 3 x i8> @vsaddu_vx_nxv3i8_unmasked(<vscale x 3 x i8> %va, i8 %b, i32 zeroext %evl) {
+; CHECK-LABEL: vsaddu_vx_nxv3i8_unmasked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma
+; CHECK-NEXT: vsaddu.vx v8, v8, a0
+; CHECK-NEXT: ret
+ %elt.head = insertelement <vscale x 3 x i8> poison, i8 %b, i32 0
+ %vb = shufflevector <vscale x 3 x i8> %elt.head, <vscale x 3 x i8> poison, <vscale x 3 x i32> zeroinitializer
+ %head = insertelement <vscale x 3 x i1> poison, i1 true, i32 0
+ %m = shufflevector <vscale x 3 x i1> %head, <vscale x 3 x i1> poison, <vscale x 3 x i32> zeroinitializer
+ %v = call <vscale x 3 x i8> @llvm.vp.uadd.sat.nxv3i8(<vscale x 3 x i8> %va, <vscale x 3 x i8> %vb, <vscale x 3 x i1> %m, i32 %evl)
+ ret <vscale x 3 x i8> %v
+}
+
+define <vscale x 3 x i8> @vsaddu_vi_nxv3i8(<vscale x 3 x i8> %va, <vscale x 3 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vsaddu_vi_nxv3i8:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e8, mf2, ta, ma
+; CHECK-NEXT: vsaddu.vi v8, v8, -1, v0.t
+; CHECK-NEXT: ret
+ %elt.head = insertelement <vscale x 3 x i8> poison, i8 -1, i32 0
+ %vb = shufflevector <vscale x 3 x i8> %elt.head, <vscale x 3 x i8> poison, <vscale x 3 x i32> zeroinitializer
+ %v = call <vscale x 3 x i8> @llvm.vp.uadd.sat.nxv3i8(<vscale x 3 x i8> %va, <vscale x 3 x i8> %vb, <vscale x 3 x i1> %m, i32 %evl)
+ ret <vscale x 3 x i8> %v
+}
+
+define <vscale x 3 x i8> @vsaddu_vi_nxv3i8_unmasked(<vscale x 3 x i8> %va, i32 zeroext %evl) {
+; CHECK-LABEL: vsaddu_vi_nxv3i8_unmasked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e8, mf2, ta, ma
+; CHECK-NEXT: vsaddu.vi v8, v8, -1
+; CHECK-NEXT: ret
+ %elt.head = insertelement <vscale x 3 x i8> poison, i8 -1, i32 0
+ %vb = shufflevector <vscale x 3 x i8> %elt.head, <vscale x 3 x i8> poison, <vscale x 3 x i32> zeroinitializer
+ %head = insertelement <vscale x 3 x i1> poison, i1 true, i32 0
+ %m = shufflevector <vscale x 3 x i1> %head, <vscale x 3 x i1> poison, <vscale x 3 x i32> zeroinitializer
+ %v = call <vscale x 3 x i8> @llvm.vp.uadd.sat.nxv3i8(<vscale x 3 x i8> %va, <vscale x 3 x i8> %vb, <vscale x 3 x i1> %m, i32 %evl)
+ ret <vscale x 3 x i8> %v
+}
+
+declare <vscale x 4 x i8> @llvm.vp.uadd.sat.nxv4i8(<vscale x 4 x i8>, <vscale x 4 x i8>, <vscale x 4 x i1>, i32)
+
+define <vscale x 4 x i8> @vsaddu_vv_nxv4i8(<vscale x 4 x i8> %va, <vscale x 4 x i8> %b, <vscale x 4 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vsaddu_vv_nxv4i8:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e8, mf2, ta, ma
+; CHECK-NEXT: vsaddu.vv v8, v8, v9, v0.t
+; CHECK-NEXT: ret
+ %v = call <vscale x 4 x i8> @llvm.vp.uadd.sat.nxv4i8(<vscale x 4 x i8> %va, <vscale x 4 x i8> %b, <vscale x 4 x i1> %m, i32 %evl)
+ ret <vscale x 4 x i8> %v
+}
+
+define <vscale x 4 x i8> @vsaddu_vv_nxv4i8_unmasked(<vscale x 4 x i8> %va, <vscale x 4 x i8> %b, i32 zeroext %evl) {
+; CHECK-LABEL: vsaddu_vv_nxv4i8_unmasked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e8, mf2, ta, ma
+; CHECK-NEXT: vsaddu.vv v8, v8, v9
+; CHECK-NEXT: ret
+ %head = insertelement <vscale x 4 x i1> poison, i1 true, i32 0
+ %m = shufflevector <vscale x 4 x i1> %head, <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer
+ %v = call <vscale x 4 x i8> @llvm.vp.uadd.sat.nxv4i8(<vscale x 4 x i8> %va, <vscale x 4 x i8> %b, <vscale x 4 x i1> %m, i32 %evl)
+ ret <vscale x 4 x i8> %v
+}
+
+define <vscale x 4 x i8> @vsaddu_vx_nxv4i8(<vscale x 4 x i8> %va, i8 %b, <vscale x 4 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vsaddu_vx_nxv4i8:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma
+; CHECK-NEXT: vsaddu.vx v8, v8, a0, v0.t
+; CHECK-NEXT: ret
+ %elt.head = insertelement <vscale x 4 x i8> poison, i8 %b, i32 0
+ %vb = shufflevector <vscale x 4 x i8> %elt.head, <vscale x 4 x i8> poison, <vscale x 4 x i32> zeroinitializer
+ %v = call <vscale x 4 x i8> @llvm.vp.uadd.sat.nxv4i8(<vscale x 4 x i8> %va, <vscale x 4 x i8> %vb, <vscale x 4 x i1> %m, i32 %evl)
+ ret <vscale x 4 x i8> %v
+}
+
+define <vscale x 4 x i8> @vsaddu_vx_nxv4i8_unmasked(<vscale x 4 x i8> %va, i8 %b, i32 zeroext %evl) {
+; CHECK-LABEL: vsaddu_vx_nxv4i8_unmasked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma
+; CHECK-NEXT: vsaddu.vx v8, v8, a0
+; CHECK-NEXT: ret
+ %elt.head = insertelement <vscale x 4 x i8> poison, i8 %b, i32 0
+ %vb = shufflevector <vscale x 4 x i8> %elt.head, <vscale x 4 x i8> poison, <vscale x 4 x i32> zeroinitializer
+ %head = insertelement <vscale x 4 x i1> poison, i1 true, i32 0
+ %m = shufflevector <vscale x 4 x i1> %head, <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer
+ %v = call <vscale x 4 x i8> @llvm.vp.uadd.sat.nxv4i8(<vscale x 4 x i8> %va, <vscale x 4 x i8> %vb, <vscale x 4 x i1> %m, i32 %evl)
+ ret <vscale x 4 x i8> %v
+}
+
+define <vscale x 4 x i8> @vsaddu_vi_nxv4i8(<vscale x 4 x i8> %va, <vscale x 4 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vsaddu_vi_nxv4i8:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e8, mf2, ta, ma
+; CHECK-NEXT: vsaddu.vi v8, v8, -1, v0.t
+; CHECK-NEXT: ret
+ %elt.head = insertelement <vscale x 4 x i8> poison, i8 -1, i32 0
+ %vb = shufflevector <vscale x 4 x i8> %elt.head, <vscale x 4 x i8> poison, <vscale x 4 x i32> zeroinitializer
+ %v = call <vscale x 4 x i8> @llvm.vp.uadd.sat.nxv4i8(<vscale x 4 x i8> %va, <vscale x 4 x i8> %vb, <vscale x 4 x i1> %m, i32 %evl)
+ ret <vscale x 4 x i8> %v
+}
+
+define <vscale x 4 x i8> @vsaddu_vi_nxv4i8_unmasked(<vscale x 4 x i8> %va, i32 zeroext %evl) {
+; CHECK-LABEL: vsaddu_vi_nxv4i8_unmasked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e8, mf2, ta, ma
+; CHECK-NEXT: vsaddu.vi v8, v8, -1
+; CHECK-NEXT: ret
+ %elt.head = insertelement <vscale x 4 x i8> poison, i8 -1, i32 0
+ %vb = shufflevector <vscale x 4 x i8> %elt.head, <vscale x 4 x i8> poison, <vscale x 4 x i32> zeroinitializer
+ %head = insertelement <vscale x 4 x i1> poison, i1 true, i32 0
+ %m = shufflevector <vscale x 4 x i1> %head, <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer
+ %v = call <vscale x 4 x i8> @llvm.vp.uadd.sat.nxv4i8(<vscale x 4 x i8> %va, <vscale x 4 x i8> %vb, <vscale x 4 x i1> %m, i32 %evl)
+ ret <vscale x 4 x i8> %v
+}
+
+declare <vscale x 8 x i8> @llvm.vp.uadd.sat.nxv8i8(<vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i1>, i32)
+
+define <vscale x 8 x i8> @vsaddu_vv_nxv8i8(<vscale x 8 x i8> %va, <vscale x 8 x i8> %b, <vscale x 8 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vsaddu_vv_nxv8i8:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e8, m1, ta, ma
+; CHECK-NEXT: vsaddu.vv v8, v8, v9, v0.t
+; CHECK-NEXT: ret
+ %v = call <vscale x 8 x i8> @llvm.vp.uadd.sat.nxv8i8(<vscale x 8 x i8> %va, <vscale x 8 x i8> %b, <vscale x 8 x i1> %m, i32 %evl)
+ ret <vscale x 8 x i8> %v
+}
+
+define <vscale x 8 x i8> @vsaddu_vv_nxv8i8_unmasked(<vscale x 8 x i8> %va, <vscale x 8 x i8> %b, i32 zeroext %evl) {
+; CHECK-LABEL: vsaddu_vv_nxv8i8_unmasked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e8, m1, ta, ma
+; CHECK-NEXT: vsaddu.vv v8, v8, v9
+; CHECK-NEXT: ret
+ %head = insertelement <vscale x 8 x i1> poison, i1 true, i32 0
+ %m = shufflevector <vscale x 8 x i1> %head, <vscale x 8 x i1> poison, <vscale x 8 x i32> zeroinitializer
+ %v = call <vscale x 8 x i8> @llvm.vp.uadd.sat.nxv8i8(<vscale x 8 x i8> %va, <vscale x 8 x i8> %b, <vscale x 8 x i1> %m, i32 %evl)
+ ret <vscale x 8 x i8> %v
+}
+
+define <vscale x 8 x i8> @vsaddu_vx_nxv8i8(<vscale x 8 x i8> %va, i8 %b, <vscale x 8 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vsaddu_vx_nxv8i8:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, ma
+; CHECK-NEXT: vsaddu.vx v8, v8, a0, v0.t
+; CHECK-NEXT: ret
+ %elt.head = insertelement <vscale x 8 x i8> poison, i8 %b, i32 0
+ %vb = shufflevector <vscale x 8 x i8> %elt.head, <vscale x 8 x i8> poison, <vscale x 8 x i32> zeroinitializer
+ %v = call <vscale x 8 x i8> @llvm.vp.uadd.sat.nxv8i8(<vscale x 8 x i8> %va, <vscale x 8 x i8> %vb, <vscale x 8 x i1> %m, i32 %evl)
+ ret <vscale x 8 x i8> %v
+}
+
+define <vscale x 8 x i8> @vsaddu_vx_nxv8i8_unmasked(<vscale x 8 x i8> %va, i8 %b, i32 zeroext %evl) {
+; CHECK-LABEL: vsaddu_vx_nxv8i8_unmasked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, ma
+; CHECK-NEXT: vsaddu.vx v8, v8, a0
+; CHECK-NEXT: ret
+ %elt.head = insertelement <vscale x 8 x i8> poison, i8 %b, i32 0
+ %vb = shufflevector <vscale x 8 x i8> %elt.head, <vscale x 8 x i8> poison, <vscale x 8 x i32> zeroinitializer
+ %head = insertelement <vscale x 8 x i1> poison, i1 true, i32 0
+ %m = shufflevector <vscale x 8 x i1> %head, <vscale x 8 x i1> poison, <vscale x 8 x i32> zeroinitializer
+ %v = call <vscale x 8 x i8> @llvm.vp.uadd.sat.nxv8i8(<vscale x 8 x i8> %va, <vscale x 8 x i8> %vb, <vscale x 8 x i1> %m, i32 %evl)
+ ret <vscale x 8 x i8> %v
+}
+
+define <vscale x 8 x i8> @vsaddu_vi_nxv8i8(<vscale x 8 x i8> %va, <vscale x 8 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vsaddu_vi_nxv8i8:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e8, m1, ta, ma
+; CHECK-NEXT: vsaddu.vi v8, v8, -1, v0.t
+; CHECK-NEXT: ret
+ %elt.head = insertelement <vscale x 8 x i8> poison, i8 -1, i32 0
+ %vb = shufflevector <vscale x 8 x i8> %elt.head, <vscale x 8 x i8> poison, <vscale x 8 x i32> zeroinitializer
+ %v = call <vscale x 8 x i8> @llvm.vp.uadd.sat.nxv8i8(<vscale x 8 x i8> %va, <vscale x 8 x i8> %vb, <vscale x 8 x i1> %m, i32 %evl)
+ ret <vscale x 8 x i8> %v
+}
+
+define <vscale x 8 x i8> @vsaddu_vi_nxv8i8_unmasked(<vscale x 8 x i8> %va, i32 zeroext %evl) {
+; CHECK-LABEL: vsaddu_vi_nxv8i8_unmasked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e8, m1, ta, ma
+; CHECK-NEXT: vsaddu.vi v8, v8, -1
+; CHECK-NEXT: ret
+ %elt.head = insertelement <vscale x 8 x i8> poison, i8 -1, i32 0
+ %vb = shufflevector <vscale x 8 x i8> %elt.head, <vscale x 8 x i8> poison, <vscale x 8 x i32> zeroinitializer
+ %head = insertelement <vscale x 8 x i1> poison, i1 true, i32 0
+ %m = shufflevector <vscale x 8 x i1> %head, <vscale x 8 x i1> poison, <vscale x 8 x i32> zeroinitializer
+ %v = call <vscale x 8 x i8> @llvm.vp.uadd.sat.nxv8i8(<vscale x 8 x i8> %va, <vscale x 8 x i8> %vb, <vscale x 8 x i1> %m, i32 %evl)
+ ret <vscale x 8 x i8> %v
+}
+
+declare <vscale x 16 x i8> @llvm.vp.uadd.sat.nxv16i8(<vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i1>, i32)
+
+define <vscale x 16 x i8> @vsaddu_vv_nxv16i8(<vscale x 16 x i8> %va, <vscale x 16 x i8> %b, <vscale x 16 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vsaddu_vv_nxv16i8:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e8, m2, ta, ma
+; CHECK-NEXT: vsaddu.vv v8, v8, v10, v0.t
+; CHECK-NEXT: ret
+ %v = call <vscale x 16 x i8> @llvm.vp.uadd.sat.nxv16i8(<vscale x 16 x i8> %va, <vscale x 16 x i8> %b, <vscale x 16 x i1> %m, i32 %evl)
+ ret <vscale x 16 x i8> %v
+}
+
+define <vscale x 16 x i8> @vsaddu_vv_nxv16i8_unmasked(<vscale x 16 x i8> %va, <vscale x 16 x i8> %b, i32 zeroext %evl) {
+; CHECK-LABEL: vsaddu_vv_nxv16i8_unmasked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e8, m2, ta, ma
+; CHECK-NEXT: vsaddu.vv v8, v8, v10
+; CHECK-NEXT: ret
+ %head = insertelement <vscale x 16 x i1> poison, i1 true, i32 0
+ %m = shufflevector <vscale x 16 x i1> %head, <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer
+ %v = call <vscale x 16 x i8> @llvm.vp.uadd.sat.nxv16i8(<vscale x 16 x i8> %va, <vscale x 16 x i8> %b, <vscale x 16 x i1> %m, i32 %evl)
+ ret <vscale x 16 x i8> %v
+}
+
+define <vscale x 16 x i8> @vsaddu_vx_nxv16i8(<vscale x 16 x i8> %va, i8 %b, <vscale x 16 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vsaddu_vx_nxv16i8:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e8, m2, ta, ma
+; CHECK-NEXT: vsaddu.vx v8, v8, a0, v0.t
+; CHECK-NEXT: ret
+ %elt.head = insertelement <vscale x 16 x i8> poison, i8 %b, i32 0
+ %vb = shufflevector <vscale x 16 x i8> %elt.head, <vscale x 16 x i8> poison, <vscale x 16 x i32> zeroinitializer
+ %v = call <vscale x 16 x i8> @llvm.vp.uadd.sat.nxv16i8(<vscale x 16 x i8> %va, <vscale x 16 x i8> %vb, <vscale x 16 x i1> %m, i32 %evl)
+ ret <vscale x 16 x i8> %v
+}
+
+define <vscale x 16 x i8> @vsaddu_vx_nxv16i8_unmasked(<vscale x 16 x i8> %va, i8 %b, i32 zeroext %evl) {
+; CHECK-LABEL: vsaddu_vx_nxv16i8_unmasked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e8, m2, ta, ma
+; CHECK-NEXT: vsaddu.vx v8, v8, a0
+; CHECK-NEXT: ret
+ %elt.head = insertelement <vscale x 16 x i8> poison, i8 %b, i32 0
+ %vb = shufflevector <vscale x 16 x i8> %elt.head, <vscale x 16 x i8> poison, <vscale x 16 x i32> zeroinitializer
+ %head = insertelement <vscale x 16 x i1> poison, i1 true, i32 0
+ %m = shufflevector <vscale x 16 x i1> %head, <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer
+ %v = call <vscale x 16 x i8> @llvm.vp.uadd.sat.nxv16i8(<vscale x 16 x i8> %va, <vscale x 16 x i8> %vb, <vscale x 16 x i1> %m, i32 %evl)
+ ret <vscale x 16 x i8> %v
+}
+
+define <vscale x 16 x i8> @vsaddu_vi_nxv16i8(<vscale x 16 x i8> %va, <vscale x 16 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vsaddu_vi_nxv16i8:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e8, m2, ta, ma
+; CHECK-NEXT: vsaddu.vi v8, v8, -1, v0.t
+; CHECK-NEXT: ret
+ %elt.head = insertelement <vscale x 16 x i8> poison, i8 -1, i32 0
+ %vb = shufflevector <vscale x 16 x i8> %elt.head, <vscale x 16 x i8> poison, <vscale x 16 x i32> zeroinitializer
+ %v = call <vscale x 16 x i8> @llvm.vp.uadd.sat.nxv16i8(<vscale x 16 x i8> %va, <vscale x 16 x i8> %vb, <vscale x 16 x i1> %m, i32 %evl)
+ ret <vscale x 16 x i8> %v
+}
+
+define <vscale x 16 x i8> @vsaddu_vi_nxv16i8_unmasked(<vscale x 16 x i8> %va, i32 zeroext %evl) {
+; CHECK-LABEL: vsaddu_vi_nxv16i8_unmasked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e8, m2, ta, ma
+; CHECK-NEXT: vsaddu.vi v8, v8, -1
+; CHECK-NEXT: ret
+ %elt.head = insertelement <vscale x 16 x i8> poison, i8 -1, i32 0
+ %vb = shufflevector <vscale x 16 x i8> %elt.head, <vscale x 16 x i8> poison, <vscale x 16 x i32> zeroinitializer
+ %head = insertelement <vscale x 16 x i1> poison, i1 true, i32 0
+ %m = shufflevector <vscale x 16 x i1> %head, <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer
+ %v = call <vscale x 16 x i8> @llvm.vp.uadd.sat.nxv16i8(<vscale x 16 x i8> %va, <vscale x 16 x i8> %vb, <vscale x 16 x i1> %m, i32 %evl)
+ ret <vscale x 16 x i8> %v
+}
+
+declare <vscale x 32 x i8> @llvm.vp.uadd.sat.nxv32i8(<vscale x 32 x i8>, <vscale x 32 x i8>, <vscale x 32 x i1>, i32)
+
+define <vscale x 32 x i8> @vsaddu_vv_nxv32i8(<vscale x 32 x i8> %va, <vscale x 32 x i8> %b, <vscale x 32 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vsaddu_vv_nxv32i8:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e8, m4, ta, ma
+; CHECK-NEXT: vsaddu.vv v8, v8, v12, v0.t
+; CHECK-NEXT: ret
+ %v = call <vscale x 32 x i8> @llvm.vp.uadd.sat.nxv32i8(<vscale x 32 x i8> %va, <vscale x 32 x i8> %b, <vscale x 32 x i1> %m, i32 %evl)
+ ret <vscale x 32 x i8> %v
+}
+
+define <vscale x 32 x i8> @vsaddu_vv_nxv32i8_unmasked(<vscale x 32 x i8> %va, <vscale x 32 x i8> %b, i32 zeroext %evl) {
+; CHECK-LABEL: vsaddu_vv_nxv32i8_unmasked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e8, m4, ta, ma
+; CHECK-NEXT: vsaddu.vv v8, v8, v12
+; CHECK-NEXT: ret
+ %head = insertelement <vscale x 32 x i1> poison, i1 true, i32 0
+ %m = shufflevector <vscale x 32 x i1> %head, <vscale x 32 x i1> poison, <vscale x 32 x i32> zeroinitializer
+ %v = call <vscale x 32 x i8> @llvm.vp.uadd.sat.nxv32i8(<vscale x 32 x i8> %va, <vscale x 32 x i8> %b, <vscale x 32 x i1> %m, i32 %evl)
+ ret <vscale x 32 x i8> %v
+}
+
+define <vscale x 32 x i8> @vsaddu_vx_nxv32i8(<vscale x 32 x i8> %va, i8 %b, <vscale x 32 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vsaddu_vx_nxv32i8:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e8, m4, ta, ma
+; CHECK-NEXT: vsaddu.vx v8, v8, a0, v0.t
+; CHECK-NEXT: ret
+ %elt.head = insertelement <vscale x 32 x i8> poison, i8 %b, i32 0
+ %vb = shufflevector <vscale x 32 x i8> %elt.head, <vscale x 32 x i8> poison, <vscale x 32 x i32> zeroinitializer
+ %v = call <vscale x 32 x i8> @llvm.vp.uadd.sat.nxv32i8(<vscale x 32 x i8> %va, <vscale x 32 x i8> %vb, <vscale x 32 x i1> %m, i32 %evl)
+ ret <vscale x 32 x i8> %v
+}
+
+define <vscale x 32 x i8> @vsaddu_vx_nxv32i8_unmasked(<vscale x 32 x i8> %va, i8 %b, i32 zeroext %evl) {
+; CHECK-LABEL: vsaddu_vx_nxv32i8_unmasked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e8, m4, ta, ma
+; CHECK-NEXT: vsaddu.vx v8, v8, a0
+; CHECK-NEXT: ret
+ %elt.head = insertelement <vscale x 32 x i8> poison, i8 %b, i32 0
+ %vb = shufflevector <vscale x 32 x i8> %elt.head, <vscale x 32 x i8> poison, <vscale x 32 x i32> zeroinitializer
+ %head = insertelement <vscale x 32 x i1> poison, i1 true, i32 0
+ %m = shufflevector <vscale x 32 x i1> %head, <vscale x 32 x i1> poison, <vscale x 32 x i32> zeroinitializer
+ %v = call <vscale x 32 x i8> @llvm.vp.uadd.sat.nxv32i8(<vscale x 32 x i8> %va, <vscale x 32 x i8> %vb, <vscale x 32 x i1> %m, i32 %evl)
+ ret <vscale x 32 x i8> %v
+}
+
+define <vscale x 32 x i8> @vsaddu_vi_nxv32i8(<vscale x 32 x i8> %va, <vscale x 32 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vsaddu_vi_nxv32i8:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e8, m4, ta, ma
+; CHECK-NEXT: vsaddu.vi v8, v8, -1, v0.t
+; CHECK-NEXT: ret
+ %elt.head = insertelement <vscale x 32 x i8> poison, i8 -1, i32 0
+ %vb = shufflevector <vscale x 32 x i8> %elt.head, <vscale x 32 x i8> poison, <vscale x 32 x i32> zeroinitializer
+ %v = call <vscale x 32 x i8> @llvm.vp.uadd.sat.nxv32i8(<vscale x 32 x i8> %va, <vscale x 32 x i8> %vb, <vscale x 32 x i1> %m, i32 %evl)
+ ret <vscale x 32 x i8> %v
+}
+
+define <vscale x 32 x i8> @vsaddu_vi_nxv32i8_unmasked(<vscale x 32 x i8> %va, i32 zeroext %evl) {
+; CHECK-LABEL: vsaddu_vi_nxv32i8_unmasked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e8, m4, ta, ma
+; CHECK-NEXT: vsaddu.vi v8, v8, -1
+; CHECK-NEXT: ret
+ %elt.head = insertelement <vscale x 32 x i8> poison, i8 -1, i32 0
+ %vb = shufflevector <vscale x 32 x i8> %elt.head, <vscale x 32 x i8> poison, <vscale x 32 x i32> zeroinitializer
+ %head = insertelement <vscale x 32 x i1> poison, i1 true, i32 0
+ %m = shufflevector <vscale x 32 x i1> %head, <vscale x 32 x i1> poison, <vscale x 32 x i32> zeroinitializer
+ %v = call <vscale x 32 x i8> @llvm.vp.uadd.sat.nxv32i8(<vscale x 32 x i8> %va, <vscale x 32 x i8> %vb, <vscale x 32 x i1> %m, i32 %evl)
+ ret <vscale x 32 x i8> %v
+}
+
+declare <vscale x 64 x i8> @llvm.vp.uadd.sat.nxv64i8(<vscale x 64 x i8>, <vscale x 64 x i8>, <vscale x 64 x i1>, i32)
+
+define <vscale x 64 x i8> @vsaddu_vv_nxv64i8(<vscale x 64 x i8> %va, <vscale x 64 x i8> %b, <vscale x 64 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vsaddu_vv_nxv64i8:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e8, m8, ta, ma
+; CHECK-NEXT: vsaddu.vv v8, v8, v16, v0.t
+; CHECK-NEXT: ret
+ %v = call <vscale x 64 x i8> @llvm.vp.uadd.sat.nxv64i8(<vscale x 64 x i8> %va, <vscale x 64 x i8> %b, <vscale x 64 x i1> %m, i32 %evl)
+ ret <vscale x 64 x i8> %v
+}
+
+define <vscale x 64 x i8> @vsaddu_vv_nxv64i8_unmasked(<vscale x 64 x i8> %va, <vscale x 64 x i8> %b, i32 zeroext %evl) {
+; CHECK-LABEL: vsaddu_vv_nxv64i8_unmasked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e8, m8, ta, ma
+; CHECK-NEXT: vsaddu.vv v8, v8, v16
+; CHECK-NEXT: ret
+ %head = insertelement <vscale x 64 x i1> poison, i1 true, i32 0
+ %m = shufflevector <vscale x 64 x i1> %head, <vscale x 64 x i1> poison, <vscale x 64 x i32> zeroinitializer
+ %v = call <vscale x 64 x i8> @llvm.vp.uadd.sat.nxv64i8(<vscale x 64 x i8> %va, <vscale x 64 x i8> %b, <vscale x 64 x i1> %m, i32 %evl)
+ ret <vscale x 64 x i8> %v
+}
+
+define <vscale x 64 x i8> @vsaddu_vx_nxv64i8(<vscale x 64 x i8> %va, i8 %b, <vscale x 64 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vsaddu_vx_nxv64i8:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e8, m8, ta, ma
+; CHECK-NEXT: vsaddu.vx v8, v8, a0, v0.t
+; CHECK-NEXT: ret
+ %elt.head = insertelement <vscale x 64 x i8> poison, i8 %b, i32 0
+ %vb = shufflevector <vscale x 64 x i8> %elt.head, <vscale x 64 x i8> poison, <vscale x 64 x i32> zeroinitializer
+ %v = call <vscale x 64 x i8> @llvm.vp.uadd.sat.nxv64i8(<vscale x 64 x i8> %va, <vscale x 64 x i8> %vb, <vscale x 64 x i1> %m, i32 %evl)
+ ret <vscale x 64 x i8> %v
+}
+
+define <vscale x 64 x i8> @vsaddu_vx_nxv64i8_unmasked(<vscale x 64 x i8> %va, i8 %b, i32 zeroext %evl) {
+; CHECK-LABEL: vsaddu_vx_nxv64i8_unmasked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e8, m8, ta, ma
+; CHECK-NEXT: vsaddu.vx v8, v8, a0
+; CHECK-NEXT: ret
+ %elt.head = insertelement <vscale x 64 x i8> poison, i8 %b, i32 0
+ %vb = shufflevector <vscale x 64 x i8> %elt.head, <vscale x 64 x i8> poison, <vscale x 64 x i32> zeroinitializer
+ %head = insertelement <vscale x 64 x i1> poison, i1 true, i32 0
+ %m = shufflevector <vscale x 64 x i1> %head, <vscale x 64 x i1> poison, <vscale x 64 x i32> zeroinitializer
+ %v = call <vscale x 64 x i8> @llvm.vp.uadd.sat.nxv64i8(<vscale x 64 x i8> %va, <vscale x 64 x i8> %vb, <vscale x 64 x i1> %m, i32 %evl)
+ ret <vscale x 64 x i8> %v
+}
+
+define <vscale x 64 x i8> @vsaddu_vi_nxv64i8(<vscale x 64 x i8> %va, <vscale x 64 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vsaddu_vi_nxv64i8:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e8, m8, ta, ma
+; CHECK-NEXT: vsaddu.vi v8, v8, -1, v0.t
+; CHECK-NEXT: ret
+ %elt.head = insertelement <vscale x 64 x i8> poison, i8 -1, i32 0
+ %vb = shufflevector <vscale x 64 x i8> %elt.head, <vscale x 64 x i8> poison, <vscale x 64 x i32> zeroinitializer
+ %v = call <vscale x 64 x i8> @llvm.vp.uadd.sat.nxv64i8(<vscale x 64 x i8> %va, <vscale x 64 x i8> %vb, <vscale x 64 x i1> %m, i32 %evl)
+ ret <vscale x 64 x i8> %v
+}
+
+define <vscale x 64 x i8> @vsaddu_vi_nxv64i8_unmasked(<vscale x 64 x i8> %va, i32 zeroext %evl) {
+; CHECK-LABEL: vsaddu_vi_nxv64i8_unmasked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e8, m8, ta, ma
+; CHECK-NEXT: vsaddu.vi v8, v8, -1
+; CHECK-NEXT: ret
+ %elt.head = insertelement <vscale x 64 x i8> poison, i8 -1, i32 0
+ %vb = shufflevector <vscale x 64 x i8> %elt.head, <vscale x 64 x i8> poison, <vscale x 64 x i32> zeroinitializer
+ %head = insertelement <vscale x 64 x i1> poison, i1 true, i32 0
+ %m = shufflevector <vscale x 64 x i1> %head, <vscale x 64 x i1> poison, <vscale x 64 x i32> zeroinitializer
+ %v = call <vscale x 64 x i8> @llvm.vp.uadd.sat.nxv64i8(<vscale x 64 x i8> %va, <vscale x 64 x i8> %vb, <vscale x 64 x i1> %m, i32 %evl)
+ ret <vscale x 64 x i8> %v
+}
+
+; Test that split-legalization works when the mask itself needs splitting.
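+; The checks below show the expected shape of the split: the nxv128i8 operation
+; is legalized as two m8 halves, where the high half's EVL is (roughly) computed
+; branchlessly as max(%evl - vlenb*8, 0) via the sub/sltu/addi/and sequence, and
+; the low half's EVL is clamped to VLMAX with the bltu/mv branch.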
+
+declare <vscale x 128 x i8> @llvm.vp.uadd.sat.nxv128i8(<vscale x 128 x i8>, <vscale x 128 x i8>, <vscale x 128 x i1>, i32)
+
+define <vscale x 128 x i8> @vsaddu_vi_nxv128i8(<vscale x 128 x i8> %va, <vscale x 128 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vsaddu_vi_nxv128i8:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmv1r.v v24, v0
+; CHECK-NEXT: vsetvli a2, zero, e8, m8, ta, ma
+; CHECK-NEXT: vlm.v v0, (a0)
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: slli a0, a0, 3
+; CHECK-NEXT: sub a2, a1, a0
+; CHECK-NEXT: sltu a3, a1, a2
+; CHECK-NEXT: addi a3, a3, -1
+; CHECK-NEXT: and a2, a3, a2
+; CHECK-NEXT: vsetvli zero, a2, e8, m8, ta, ma
+; CHECK-NEXT: vsaddu.vi v16, v16, -1, v0.t
+; CHECK-NEXT: bltu a1, a0, .LBB50_2
+; CHECK-NEXT: # %bb.1:
+; CHECK-NEXT: mv a1, a0
+; CHECK-NEXT: .LBB50_2:
+; CHECK-NEXT: vsetvli zero, a1, e8, m8, ta, ma
+; CHECK-NEXT: vmv1r.v v0, v24
+; CHECK-NEXT: vsaddu.vi v8, v8, -1, v0.t
+; CHECK-NEXT: ret
+ %elt.head = insertelement <vscale x 128 x i8> poison, i8 -1, i32 0
+ %vb = shufflevector <vscale x 128 x i8> %elt.head, <vscale x 128 x i8> poison, <vscale x 128 x i32> zeroinitializer
+ %v = call <vscale x 128 x i8> @llvm.vp.uadd.sat.nxv128i8(<vscale x 128 x i8> %va, <vscale x 128 x i8> %vb, <vscale x 128 x i1> %m, i32 %evl)
+ ret <vscale x 128 x i8> %v
+}
+
+define <vscale x 128 x i8> @vsaddu_vi_nxv128i8_unmasked(<vscale x 128 x i8> %va, i32 zeroext %evl) {
+; CHECK-LABEL: vsaddu_vi_nxv128i8_unmasked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: csrr a1, vlenb
+; CHECK-NEXT: slli a1, a1, 3
+; CHECK-NEXT: sub a2, a0, a1
+; CHECK-NEXT: sltu a3, a0, a2
+; CHECK-NEXT: addi a3, a3, -1
+; CHECK-NEXT: and a2, a3, a2
+; CHECK-NEXT: vsetvli zero, a2, e8, m8, ta, ma
+; CHECK-NEXT: vsaddu.vi v16, v16, -1
+; CHECK-NEXT: bltu a0, a1, .LBB51_2
+; CHECK-NEXT: # %bb.1:
+; CHECK-NEXT: mv a0, a1
+; CHECK-NEXT: .LBB51_2:
+; CHECK-NEXT: vsetvli zero, a0, e8, m8, ta, ma
+; CHECK-NEXT: vsaddu.vi v8, v8, -1
+; CHECK-NEXT: ret
+ %elt.head = insertelement <vscale x 128 x i8> poison, i8 -1, i32 0
+ %vb = shufflevector <vscale x 128 x i8> %elt.head, <vscale x 128 x i8> poison, <vscale x 128 x i32> zeroinitializer
+ %head = insertelement <vscale x 128 x i1> poison, i1 true, i32 0
+ %m = shufflevector <vscale x 128 x i1> %head, <vscale x 128 x i1> poison, <vscale x 128 x i32> zeroinitializer
+ %v = call <vscale x 128 x i8> @llvm.vp.uadd.sat.nxv128i8(<vscale x 128 x i8> %va, <vscale x 128 x i8> %vb, <vscale x 128 x i1> %m, i32 %evl)
+ ret <vscale x 128 x i8> %v
+}
+
+declare <vscale x 1 x i16> @llvm.vp.uadd.sat.nxv1i16(<vscale x 1 x i16>, <vscale x 1 x i16>, <vscale x 1 x i1>, i32)
+
+define <vscale x 1 x i16> @vsaddu_vv_nxv1i16(<vscale x 1 x i16> %va, <vscale x 1 x i16> %b, <vscale x 1 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vsaddu_vv_nxv1i16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e16, mf4, ta, ma
+; CHECK-NEXT: vsaddu.vv v8, v8, v9, v0.t
+; CHECK-NEXT: ret
+ %v = call <vscale x 1 x i16> @llvm.vp.uadd.sat.nxv1i16(<vscale x 1 x i16> %va, <vscale x 1 x i16> %b, <vscale x 1 x i1> %m, i32 %evl)
+ ret <vscale x 1 x i16> %v
+}
+
+define <vscale x 1 x i16> @vsaddu_vv_nxv1i16_unmasked(<vscale x 1 x i16> %va, <vscale x 1 x i16> %b, i32 zeroext %evl) {
+; CHECK-LABEL: vsaddu_vv_nxv1i16_unmasked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e16, mf4, ta, ma
+; CHECK-NEXT: vsaddu.vv v8, v8, v9
+; CHECK-NEXT: ret
+ %head = insertelement <vscale x 1 x i1> poison, i1 true, i32 0
+ %m = shufflevector <vscale x 1 x i1> %head, <vscale x 1 x i1> poison, <vscale x 1 x i32> zeroinitializer
+ %v = call <vscale x 1 x i16> @llvm.vp.uadd.sat.nxv1i16(<vscale x 1 x i16> %va, <vscale x 1 x i16> %b, <vscale x 1 x i1> %m, i32 %evl)
+ ret <vscale x 1 x i16> %v
+}
+
+define <vscale x 1 x i16> @vsaddu_vx_nxv1i16(<vscale x 1 x i16> %va, i16 %b, <vscale x 1 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vsaddu_vx_nxv1i16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma
+; CHECK-NEXT: vsaddu.vx v8, v8, a0, v0.t
+; CHECK-NEXT: ret
+ %elt.head = insertelement <vscale x 1 x i16> poison, i16 %b, i32 0
+ %vb = shufflevector <vscale x 1 x i16> %elt.head, <vscale x 1 x i16> poison, <vscale x 1 x i32> zeroinitializer
+ %v = call <vscale x 1 x i16> @llvm.vp.uadd.sat.nxv1i16(<vscale x 1 x i16> %va, <vscale x 1 x i16> %vb, <vscale x 1 x i1> %m, i32 %evl)
+ ret <vscale x 1 x i16> %v
+}
+
+define <vscale x 1 x i16> @vsaddu_vx_nxv1i16_unmasked(<vscale x 1 x i16> %va, i16 %b, i32 zeroext %evl) {
+; CHECK-LABEL: vsaddu_vx_nxv1i16_unmasked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma
+; CHECK-NEXT: vsaddu.vx v8, v8, a0
+; CHECK-NEXT: ret
+ %elt.head = insertelement <vscale x 1 x i16> poison, i16 %b, i32 0
+ %vb = shufflevector <vscale x 1 x i16> %elt.head, <vscale x 1 x i16> poison, <vscale x 1 x i32> zeroinitializer
+ %head = insertelement <vscale x 1 x i1> poison, i1 true, i32 0
+ %m = shufflevector <vscale x 1 x i1> %head, <vscale x 1 x i1> poison, <vscale x 1 x i32> zeroinitializer
+ %v = call <vscale x 1 x i16> @llvm.vp.uadd.sat.nxv1i16(<vscale x 1 x i16> %va, <vscale x 1 x i16> %vb, <vscale x 1 x i1> %m, i32 %evl)
+ ret <vscale x 1 x i16> %v
+}
+
+define <vscale x 1 x i16> @vsaddu_vi_nxv1i16(<vscale x 1 x i16> %va, <vscale x 1 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vsaddu_vi_nxv1i16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e16, mf4, ta, ma
+; CHECK-NEXT: vsaddu.vi v8, v8, -1, v0.t
+; CHECK-NEXT: ret
+ %elt.head = insertelement <vscale x 1 x i16> poison, i16 -1, i32 0
+ %vb = shufflevector <vscale x 1 x i16> %elt.head, <vscale x 1 x i16> poison, <vscale x 1 x i32> zeroinitializer
+ %v = call <vscale x 1 x i16> @llvm.vp.uadd.sat.nxv1i16(<vscale x 1 x i16> %va, <vscale x 1 x i16> %vb, <vscale x 1 x i1> %m, i32 %evl)
+ ret <vscale x 1 x i16> %v
+}
+
+define <vscale x 1 x i16> @vsaddu_vi_nxv1i16_unmasked(<vscale x 1 x i16> %va, i32 zeroext %evl) {
+; CHECK-LABEL: vsaddu_vi_nxv1i16_unmasked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e16, mf4, ta, ma
+; CHECK-NEXT: vsaddu.vi v8, v8, -1
+; CHECK-NEXT: ret
+ %elt.head = insertelement <vscale x 1 x i16> poison, i16 -1, i32 0
+ %vb = shufflevector <vscale x 1 x i16> %elt.head, <vscale x 1 x i16> poison, <vscale x 1 x i32> zeroinitializer
+ %head = insertelement <vscale x 1 x i1> poison, i1 true, i32 0
+ %m = shufflevector <vscale x 1 x i1> %head, <vscale x 1 x i1> poison, <vscale x 1 x i32> zeroinitializer
+ %v = call <vscale x 1 x i16> @llvm.vp.uadd.sat.nxv1i16(<vscale x 1 x i16> %va, <vscale x 1 x i16> %vb, <vscale x 1 x i1> %m, i32 %evl)
+ ret <vscale x 1 x i16> %v
+}
+
+declare <vscale x 2 x i16> @llvm.vp.uadd.sat.nxv2i16(<vscale x 2 x i16>, <vscale x 2 x i16>, <vscale x 2 x i1>, i32)
+
+define <vscale x 2 x i16> @vsaddu_vv_nxv2i16(<vscale x 2 x i16> %va, <vscale x 2 x i16> %b, <vscale x 2 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vsaddu_vv_nxv2i16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e16, mf2, ta, ma
+; CHECK-NEXT: vsaddu.vv v8, v8, v9, v0.t
+; CHECK-NEXT: ret
+ %v = call <vscale x 2 x i16> @llvm.vp.uadd.sat.nxv2i16(<vscale x 2 x i16> %va, <vscale x 2 x i16> %b, <vscale x 2 x i1> %m, i32 %evl)
+ ret <vscale x 2 x i16> %v
+}
+
+define <vscale x 2 x i16> @vsaddu_vv_nxv2i16_unmasked(<vscale x 2 x i16> %va, <vscale x 2 x i16> %b, i32 zeroext %evl) {
+; CHECK-LABEL: vsaddu_vv_nxv2i16_unmasked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e16, mf2, ta, ma
+; CHECK-NEXT: vsaddu.vv v8, v8, v9
+; CHECK-NEXT: ret
+ %head = insertelement <vscale x 2 x i1> poison, i1 true, i32 0
+ %m = shufflevector <vscale x 2 x i1> %head, <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer
+ %v = call <vscale x 2 x i16> @llvm.vp.uadd.sat.nxv2i16(<vscale x 2 x i16> %va, <vscale x 2 x i16> %b, <vscale x 2 x i1> %m, i32 %evl)
+ ret <vscale x 2 x i16> %v
+}
+
+define <vscale x 2 x i16> @vsaddu_vx_nxv2i16(<vscale x 2 x i16> %va, i16 %b, <vscale x 2 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vsaddu_vx_nxv2i16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma
+; CHECK-NEXT: vsaddu.vx v8, v8, a0, v0.t
+; CHECK-NEXT: ret
+ %elt.head = insertelement <vscale x 2 x i16> poison, i16 %b, i32 0
+ %vb = shufflevector <vscale x 2 x i16> %elt.head, <vscale x 2 x i16> poison, <vscale x 2 x i32> zeroinitializer
+ %v = call <vscale x 2 x i16> @llvm.vp.uadd.sat.nxv2i16(<vscale x 2 x i16> %va, <vscale x 2 x i16> %vb, <vscale x 2 x i1> %m, i32 %evl)
+ ret <vscale x 2 x i16> %v
+}
+
+define <vscale x 2 x i16> @vsaddu_vx_nxv2i16_unmasked(<vscale x 2 x i16> %va, i16 %b, i32 zeroext %evl) {
+; CHECK-LABEL: vsaddu_vx_nxv2i16_unmasked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma
+; CHECK-NEXT: vsaddu.vx v8, v8, a0
+; CHECK-NEXT: ret
+ %elt.head = insertelement <vscale x 2 x i16> poison, i16 %b, i32 0
+ %vb = shufflevector <vscale x 2 x i16> %elt.head, <vscale x 2 x i16> poison, <vscale x 2 x i32> zeroinitializer
+ %head = insertelement <vscale x 2 x i1> poison, i1 true, i32 0
+ %m = shufflevector <vscale x 2 x i1> %head, <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer
+ %v = call <vscale x 2 x i16> @llvm.vp.uadd.sat.nxv2i16(<vscale x 2 x i16> %va, <vscale x 2 x i16> %vb, <vscale x 2 x i1> %m, i32 %evl)
+ ret <vscale x 2 x i16> %v
+}
+
+define <vscale x 2 x i16> @vsaddu_vi_nxv2i16(<vscale x 2 x i16> %va, <vscale x 2 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vsaddu_vi_nxv2i16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e16, mf2, ta, ma
+; CHECK-NEXT: vsaddu.vi v8, v8, -1, v0.t
+; CHECK-NEXT: ret
+ %elt.head = insertelement <vscale x 2 x i16> poison, i16 -1, i32 0
+ %vb = shufflevector <vscale x 2 x i16> %elt.head, <vscale x 2 x i16> poison, <vscale x 2 x i32> zeroinitializer
+ %v = call <vscale x 2 x i16> @llvm.vp.uadd.sat.nxv2i16(<vscale x 2 x i16> %va, <vscale x 2 x i16> %vb, <vscale x 2 x i1> %m, i32 %evl)
+ ret <vscale x 2 x i16> %v
+}
+
+define <vscale x 2 x i16> @vsaddu_vi_nxv2i16_unmasked(<vscale x 2 x i16> %va, i32 zeroext %evl) {
+; CHECK-LABEL: vsaddu_vi_nxv2i16_unmasked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e16, mf2, ta, ma
+; CHECK-NEXT: vsaddu.vi v8, v8, -1
+; CHECK-NEXT: ret
+ %elt.head = insertelement <vscale x 2 x i16> poison, i16 -1, i32 0
+ %vb = shufflevector <vscale x 2 x i16> %elt.head, <vscale x 2 x i16> poison, <vscale x 2 x i32> zeroinitializer
+ %head = insertelement <vscale x 2 x i1> poison, i1 true, i32 0
+ %m = shufflevector <vscale x 2 x i1> %head, <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer
+ %v = call <vscale x 2 x i16> @llvm.vp.uadd.sat.nxv2i16(<vscale x 2 x i16> %va, <vscale x 2 x i16> %vb, <vscale x 2 x i1> %m, i32 %evl)
+ ret <vscale x 2 x i16> %v
+}
+
+declare <vscale x 4 x i16> @llvm.vp.uadd.sat.nxv4i16(<vscale x 4 x i16>, <vscale x 4 x i16>, <vscale x 4 x i1>, i32)
+
+define <vscale x 4 x i16> @vsaddu_vv_nxv4i16(<vscale x 4 x i16> %va, <vscale x 4 x i16> %b, <vscale x 4 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vsaddu_vv_nxv4i16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, ma
+; CHECK-NEXT: vsaddu.vv v8, v8, v9, v0.t
+; CHECK-NEXT: ret
+ %v = call <vscale x 4 x i16> @llvm.vp.uadd.sat.nxv4i16(<vscale x 4 x i16> %va, <vscale x 4 x i16> %b, <vscale x 4 x i1> %m, i32 %evl)
+ ret <vscale x 4 x i16> %v
+}
+
+define <vscale x 4 x i16> @vsaddu_vv_nxv4i16_unmasked(<vscale x 4 x i16> %va, <vscale x 4 x i16> %b, i32 zeroext %evl) {
+; CHECK-LABEL: vsaddu_vv_nxv4i16_unmasked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, ma
+; CHECK-NEXT: vsaddu.vv v8, v8, v9
+; CHECK-NEXT: ret
+ %head = insertelement <vscale x 4 x i1> poison, i1 true, i32 0
+ %m = shufflevector <vscale x 4 x i1> %head, <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer
+ %v = call <vscale x 4 x i16> @llvm.vp.uadd.sat.nxv4i16(<vscale x 4 x i16> %va, <vscale x 4 x i16> %b, <vscale x 4 x i1> %m, i32 %evl)
+ ret <vscale x 4 x i16> %v
+}
+
+define <vscale x 4 x i16> @vsaddu_vx_nxv4i16(<vscale x 4 x i16> %va, i16 %b, <vscale x 4 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vsaddu_vx_nxv4i16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma
+; CHECK-NEXT: vsaddu.vx v8, v8, a0, v0.t
+; CHECK-NEXT: ret
+ %elt.head = insertelement <vscale x 4 x i16> poison, i16 %b, i32 0
+ %vb = shufflevector <vscale x 4 x i16> %elt.head, <vscale x 4 x i16> poison, <vscale x 4 x i32> zeroinitializer
+ %v = call <vscale x 4 x i16> @llvm.vp.uadd.sat.nxv4i16(<vscale x 4 x i16> %va, <vscale x 4 x i16> %vb, <vscale x 4 x i1> %m, i32 %evl)
+ ret <vscale x 4 x i16> %v
+}
+
+define <vscale x 4 x i16> @vsaddu_vx_nxv4i16_unmasked(<vscale x 4 x i16> %va, i16 %b, i32 zeroext %evl) {
+; CHECK-LABEL: vsaddu_vx_nxv4i16_unmasked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma
+; CHECK-NEXT: vsaddu.vx v8, v8, a0
+; CHECK-NEXT: ret
+ %elt.head = insertelement <vscale x 4 x i16> poison, i16 %b, i32 0
+ %vb = shufflevector <vscale x 4 x i16> %elt.head, <vscale x 4 x i16> poison, <vscale x 4 x i32> zeroinitializer
+ %head = insertelement <vscale x 4 x i1> poison, i1 true, i32 0
+ %m = shufflevector <vscale x 4 x i1> %head, <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer
+ %v = call <vscale x 4 x i16> @llvm.vp.uadd.sat.nxv4i16(<vscale x 4 x i16> %va, <vscale x 4 x i16> %vb, <vscale x 4 x i1> %m, i32 %evl)
+ ret <vscale x 4 x i16> %v
+}
+
+define <vscale x 4 x i16> @vsaddu_vi_nxv4i16(<vscale x 4 x i16> %va, <vscale x 4 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vsaddu_vi_nxv4i16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, ma
+; CHECK-NEXT: vsaddu.vi v8, v8, -1, v0.t
+; CHECK-NEXT: ret
+ %elt.head = insertelement <vscale x 4 x i16> poison, i16 -1, i32 0
+ %vb = shufflevector <vscale x 4 x i16> %elt.head, <vscale x 4 x i16> poison, <vscale x 4 x i32> zeroinitializer
+ %v = call <vscale x 4 x i16> @llvm.vp.uadd.sat.nxv4i16(<vscale x 4 x i16> %va, <vscale x 4 x i16> %vb, <vscale x 4 x i1> %m, i32 %evl)
+ ret <vscale x 4 x i16> %v
+}
+
+define <vscale x 4 x i16> @vsaddu_vi_nxv4i16_unmasked(<vscale x 4 x i16> %va, i32 zeroext %evl) {
+; CHECK-LABEL: vsaddu_vi_nxv4i16_unmasked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, ma
+; CHECK-NEXT: vsaddu.vi v8, v8, -1
+; CHECK-NEXT: ret
+ %elt.head = insertelement <vscale x 4 x i16> poison, i16 -1, i32 0
+ %vb = shufflevector <vscale x 4 x i16> %elt.head, <vscale x 4 x i16> poison, <vscale x 4 x i32> zeroinitializer
+ %head = insertelement <vscale x 4 x i1> poison, i1 true, i32 0
+ %m = shufflevector <vscale x 4 x i1> %head, <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer
+ %v = call <vscale x 4 x i16> @llvm.vp.uadd.sat.nxv4i16(<vscale x 4 x i16> %va, <vscale x 4 x i16> %vb, <vscale x 4 x i1> %m, i32 %evl)
+ ret <vscale x 4 x i16> %v
+}
+
+declare <vscale x 8 x i16> @llvm.vp.uadd.sat.nxv8i16(<vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i1>, i32)
+
+define <vscale x 8 x i16> @vsaddu_vv_nxv8i16(<vscale x 8 x i16> %va, <vscale x 8 x i16> %b, <vscale x 8 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vsaddu_vv_nxv8i16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e16, m2, ta, ma
+; CHECK-NEXT: vsaddu.vv v8, v8, v10, v0.t
+; CHECK-NEXT: ret
+ %v = call <vscale x 8 x i16> @llvm.vp.uadd.sat.nxv8i16(<vscale x 8 x i16> %va, <vscale x 8 x i16> %b, <vscale x 8 x i1> %m, i32 %evl)
+ ret <vscale x 8 x i16> %v
+}
+
+define <vscale x 8 x i16> @vsaddu_vv_nxv8i16_unmasked(<vscale x 8 x i16> %va, <vscale x 8 x i16> %b, i32 zeroext %evl) {
+; CHECK-LABEL: vsaddu_vv_nxv8i16_unmasked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e16, m2, ta, ma
+; CHECK-NEXT: vsaddu.vv v8, v8, v10
+; CHECK-NEXT: ret
+ %head = insertelement <vscale x 8 x i1> poison, i1 true, i32 0
+ %m = shufflevector <vscale x 8 x i1> %head, <vscale x 8 x i1> poison, <vscale x 8 x i32> zeroinitializer
+ %v = call <vscale x 8 x i16> @llvm.vp.uadd.sat.nxv8i16(<vscale x 8 x i16> %va, <vscale x 8 x i16> %b, <vscale x 8 x i1> %m, i32 %evl)
+ ret <vscale x 8 x i16> %v
+}
+
+define <vscale x 8 x i16> @vsaddu_vx_nxv8i16(<vscale x 8 x i16> %va, i16 %b, <vscale x 8 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vsaddu_vx_nxv8i16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, ma
+; CHECK-NEXT: vsaddu.vx v8, v8, a0, v0.t
+; CHECK-NEXT: ret
+ %elt.head = insertelement <vscale x 8 x i16> poison, i16 %b, i32 0
+ %vb = shufflevector <vscale x 8 x i16> %elt.head, <vscale x 8 x i16> poison, <vscale x 8 x i32> zeroinitializer
+ %v = call <vscale x 8 x i16> @llvm.vp.uadd.sat.nxv8i16(<vscale x 8 x i16> %va, <vscale x 8 x i16> %vb, <vscale x 8 x i1> %m, i32 %evl)
+ ret <vscale x 8 x i16> %v
+}
+
+define <vscale x 8 x i16> @vsaddu_vx_nxv8i16_unmasked(<vscale x 8 x i16> %va, i16 %b, i32 zeroext %evl) {
+; CHECK-LABEL: vsaddu_vx_nxv8i16_unmasked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, ma
+; CHECK-NEXT: vsaddu.vx v8, v8, a0
+; CHECK-NEXT: ret
+ %elt.head = insertelement <vscale x 8 x i16> poison, i16 %b, i32 0
+ %vb = shufflevector <vscale x 8 x i16> %elt.head, <vscale x 8 x i16> poison, <vscale x 8 x i32> zeroinitializer
+ %head = insertelement <vscale x 8 x i1> poison, i1 true, i32 0
+ %m = shufflevector <vscale x 8 x i1> %head, <vscale x 8 x i1> poison, <vscale x 8 x i32> zeroinitializer
+ %v = call <vscale x 8 x i16> @llvm.vp.uadd.sat.nxv8i16(<vscale x 8 x i16> %va, <vscale x 8 x i16> %vb, <vscale x 8 x i1> %m, i32 %evl)
+ ret <vscale x 8 x i16> %v
+}
+
+define <vscale x 8 x i16> @vsaddu_vi_nxv8i16(<vscale x 8 x i16> %va, <vscale x 8 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vsaddu_vi_nxv8i16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e16, m2, ta, ma
+; CHECK-NEXT: vsaddu.vi v8, v8, -1, v0.t
+; CHECK-NEXT: ret
+ %elt.head = insertelement <vscale x 8 x i16> poison, i16 -1, i32 0
+ %vb = shufflevector <vscale x 8 x i16> %elt.head, <vscale x 8 x i16> poison, <vscale x 8 x i32> zeroinitializer
+ %v = call <vscale x 8 x i16> @llvm.vp.uadd.sat.nxv8i16(<vscale x 8 x i16> %va, <vscale x 8 x i16> %vb, <vscale x 8 x i1> %m, i32 %evl)
+ ret <vscale x 8 x i16> %v
+}
+
+define <vscale x 8 x i16> @vsaddu_vi_nxv8i16_unmasked(<vscale x 8 x i16> %va, i32 zeroext %evl) {
+; CHECK-LABEL: vsaddu_vi_nxv8i16_unmasked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e16, m2, ta, ma
+; CHECK-NEXT: vsaddu.vi v8, v8, -1
+; CHECK-NEXT: ret
+ %elt.head = insertelement <vscale x 8 x i16> poison, i16 -1, i32 0
+ %vb = shufflevector <vscale x 8 x i16> %elt.head, <vscale x 8 x i16> poison, <vscale x 8 x i32> zeroinitializer
+ %head = insertelement <vscale x 8 x i1> poison, i1 true, i32 0
+ %m = shufflevector <vscale x 8 x i1> %head, <vscale x 8 x i1> poison, <vscale x 8 x i32> zeroinitializer
+ %v = call <vscale x 8 x i16> @llvm.vp.uadd.sat.nxv8i16(<vscale x 8 x i16> %va, <vscale x 8 x i16> %vb, <vscale x 8 x i1> %m, i32 %evl)
+ ret <vscale x 8 x i16> %v
+}
+
+declare <vscale x 16 x i16> @llvm.vp.uadd.sat.nxv16i16(<vscale x 16 x i16>, <vscale x 16 x i16>, <vscale x 16 x i1>, i32)
+
+define <vscale x 16 x i16> @vsaddu_vv_nxv16i16(<vscale x 16 x i16> %va, <vscale x 16 x i16> %b, <vscale x 16 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vsaddu_vv_nxv16i16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e16, m4, ta, ma
+; CHECK-NEXT: vsaddu.vv v8, v8, v12, v0.t
+; CHECK-NEXT: ret
+ %v = call <vscale x 16 x i16> @llvm.vp.uadd.sat.nxv16i16(<vscale x 16 x i16> %va, <vscale x 16 x i16> %b, <vscale x 16 x i1> %m, i32 %evl)
+ ret <vscale x 16 x i16> %v
+}
+
+define <vscale x 16 x i16> @vsaddu_vv_nxv16i16_unmasked(<vscale x 16 x i16> %va, <vscale x 16 x i16> %b, i32 zeroext %evl) {
+; CHECK-LABEL: vsaddu_vv_nxv16i16_unmasked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e16, m4, ta, ma
+; CHECK-NEXT: vsaddu.vv v8, v8, v12
+; CHECK-NEXT: ret
+ %head = insertelement <vscale x 16 x i1> poison, i1 true, i32 0
+ %m = shufflevector <vscale x 16 x i1> %head, <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer
+ %v = call <vscale x 16 x i16> @llvm.vp.uadd.sat.nxv16i16(<vscale x 16 x i16> %va, <vscale x 16 x i16> %b, <vscale x 16 x i1> %m, i32 %evl)
+ ret <vscale x 16 x i16> %v
+}
+
+define <vscale x 16 x i16> @vsaddu_vx_nxv16i16(<vscale x 16 x i16> %va, i16 %b, <vscale x 16 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vsaddu_vx_nxv16i16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e16, m4, ta, ma
+; CHECK-NEXT: vsaddu.vx v8, v8, a0, v0.t
+; CHECK-NEXT: ret
+ %elt.head = insertelement <vscale x 16 x i16> poison, i16 %b, i32 0
+ %vb = shufflevector <vscale x 16 x i16> %elt.head, <vscale x 16 x i16> poison, <vscale x 16 x i32> zeroinitializer
+ %v = call <vscale x 16 x i16> @llvm.vp.uadd.sat.nxv16i16(<vscale x 16 x i16> %va, <vscale x 16 x i16> %vb, <vscale x 16 x i1> %m, i32 %evl)
+ ret <vscale x 16 x i16> %v
+}
+
+define <vscale x 16 x i16> @vsaddu_vx_nxv16i16_unmasked(<vscale x 16 x i16> %va, i16 %b, i32 zeroext %evl) {
+; CHECK-LABEL: vsaddu_vx_nxv16i16_unmasked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e16, m4, ta, ma
+; CHECK-NEXT: vsaddu.vx v8, v8, a0
+; CHECK-NEXT: ret
+ %elt.head = insertelement <vscale x 16 x i16> poison, i16 %b, i32 0
+ %vb = shufflevector <vscale x 16 x i16> %elt.head, <vscale x 16 x i16> poison, <vscale x 16 x i32> zeroinitializer
+ %head = insertelement <vscale x 16 x i1> poison, i1 true, i32 0
+ %m = shufflevector <vscale x 16 x i1> %head, <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer
+ %v = call <vscale x 16 x i16> @llvm.vp.uadd.sat.nxv16i16(<vscale x 16 x i16> %va, <vscale x 16 x i16> %vb, <vscale x 16 x i1> %m, i32 %evl)
+ ret <vscale x 16 x i16> %v
+}
+
+define <vscale x 16 x i16> @vsaddu_vi_nxv16i16(<vscale x 16 x i16> %va, <vscale x 16 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vsaddu_vi_nxv16i16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e16, m4, ta, ma
+; CHECK-NEXT: vsaddu.vi v8, v8, -1, v0.t
+; CHECK-NEXT: ret
+ %elt.head = insertelement <vscale x 16 x i16> poison, i16 -1, i32 0
+ %vb = shufflevector <vscale x 16 x i16> %elt.head, <vscale x 16 x i16> poison, <vscale x 16 x i32> zeroinitializer
+ %v = call <vscale x 16 x i16> @llvm.vp.uadd.sat.nxv16i16(<vscale x 16 x i16> %va, <vscale x 16 x i16> %vb, <vscale x 16 x i1> %m, i32 %evl)
+ ret <vscale x 16 x i16> %v
+}
+
+define <vscale x 16 x i16> @vsaddu_vi_nxv16i16_unmasked(<vscale x 16 x i16> %va, i32 zeroext %evl) {
+; CHECK-LABEL: vsaddu_vi_nxv16i16_unmasked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e16, m4, ta, ma
+; CHECK-NEXT: vsaddu.vi v8, v8, -1
+; CHECK-NEXT: ret
+ %elt.head = insertelement <vscale x 16 x i16> poison, i16 -1, i32 0
+ %vb = shufflevector <vscale x 16 x i16> %elt.head, <vscale x 16 x i16> poison, <vscale x 16 x i32> zeroinitializer
+ %head = insertelement <vscale x 16 x i1> poison, i1 true, i32 0
+ %m = shufflevector <vscale x 16 x i1> %head, <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer
+ %v = call <vscale x 16 x i16> @llvm.vp.uadd.sat.nxv16i16(<vscale x 16 x i16> %va, <vscale x 16 x i16> %vb, <vscale x 16 x i1> %m, i32 %evl)
+ ret <vscale x 16 x i16> %v
+}
+
+declare <vscale x 32 x i16> @llvm.vp.uadd.sat.nxv32i16(<vscale x 32 x i16>, <vscale x 32 x i16>, <vscale x 32 x i1>, i32)
+
+define <vscale x 32 x i16> @vsaddu_vv_nxv32i16(<vscale x 32 x i16> %va, <vscale x 32 x i16> %b, <vscale x 32 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vsaddu_vv_nxv32i16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e16, m8, ta, ma
+; CHECK-NEXT: vsaddu.vv v8, v8, v16, v0.t
+; CHECK-NEXT: ret
+ %v = call <vscale x 32 x i16> @llvm.vp.uadd.sat.nxv32i16(<vscale x 32 x i16> %va, <vscale x 32 x i16> %b, <vscale x 32 x i1> %m, i32 %evl)
+ ret <vscale x 32 x i16> %v
+}
+
+define <vscale x 32 x i16> @vsaddu_vv_nxv32i16_unmasked(<vscale x 32 x i16> %va, <vscale x 32 x i16> %b, i32 zeroext %evl) {
+; CHECK-LABEL: vsaddu_vv_nxv32i16_unmasked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e16, m8, ta, ma
+; CHECK-NEXT: vsaddu.vv v8, v8, v16
+; CHECK-NEXT: ret
+ %head = insertelement <vscale x 32 x i1> poison, i1 true, i32 0
+ %m = shufflevector <vscale x 32 x i1> %head, <vscale x 32 x i1> poison, <vscale x 32 x i32> zeroinitializer
+ %v = call <vscale x 32 x i16> @llvm.vp.uadd.sat.nxv32i16(<vscale x 32 x i16> %va, <vscale x 32 x i16> %b, <vscale x 32 x i1> %m, i32 %evl)
+ ret <vscale x 32 x i16> %v
+}
+
+define <vscale x 32 x i16> @vsaddu_vx_nxv32i16(<vscale x 32 x i16> %va, i16 %b, <vscale x 32 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vsaddu_vx_nxv32i16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e16, m8, ta, ma
+; CHECK-NEXT: vsaddu.vx v8, v8, a0, v0.t
+; CHECK-NEXT: ret
+ %elt.head = insertelement <vscale x 32 x i16> poison, i16 %b, i32 0
+ %vb = shufflevector <vscale x 32 x i16> %elt.head, <vscale x 32 x i16> poison, <vscale x 32 x i32> zeroinitializer
+ %v = call <vscale x 32 x i16> @llvm.vp.uadd.sat.nxv32i16(<vscale x 32 x i16> %va, <vscale x 32 x i16> %vb, <vscale x 32 x i1> %m, i32 %evl)
+ ret <vscale x 32 x i16> %v
+}
+
+define <vscale x 32 x i16> @vsaddu_vx_nxv32i16_unmasked(<vscale x 32 x i16> %va, i16 %b, i32 zeroext %evl) {
+; CHECK-LABEL: vsaddu_vx_nxv32i16_unmasked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e16, m8, ta, ma
+; CHECK-NEXT: vsaddu.vx v8, v8, a0
+; CHECK-NEXT: ret
+ %elt.head = insertelement <vscale x 32 x i16> poison, i16 %b, i32 0
+ %vb = shufflevector <vscale x 32 x i16> %elt.head, <vscale x 32 x i16> poison, <vscale x 32 x i32> zeroinitializer
+ %head = insertelement <vscale x 32 x i1> poison, i1 true, i32 0
+ %m = shufflevector <vscale x 32 x i1> %head, <vscale x 32 x i1> poison, <vscale x 32 x i32> zeroinitializer
+ %v = call <vscale x 32 x i16> @llvm.vp.uadd.sat.nxv32i16(<vscale x 32 x i16> %va, <vscale x 32 x i16> %vb, <vscale x 32 x i1> %m, i32 %evl)
+ ret <vscale x 32 x i16> %v
+}
+
+define <vscale x 32 x i16> @vsaddu_vi_nxv32i16(<vscale x 32 x i16> %va, <vscale x 32 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vsaddu_vi_nxv32i16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e16, m8, ta, ma
+; CHECK-NEXT: vsaddu.vi v8, v8, -1, v0.t
+; CHECK-NEXT: ret
+ %elt.head = insertelement <vscale x 32 x i16> poison, i16 -1, i32 0
+ %vb = shufflevector <vscale x 32 x i16> %elt.head, <vscale x 32 x i16> poison, <vscale x 32 x i32> zeroinitializer
+ %v = call <vscale x 32 x i16> @llvm.vp.uadd.sat.nxv32i16(<vscale x 32 x i16> %va, <vscale x 32 x i16> %vb, <vscale x 32 x i1> %m, i32 %evl)
+ ret <vscale x 32 x i16> %v
+}
+
+define <vscale x 32 x i16> @vsaddu_vi_nxv32i16_unmasked(<vscale x 32 x i16> %va, i32 zeroext %evl) {
+; CHECK-LABEL: vsaddu_vi_nxv32i16_unmasked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e16, m8, ta, ma
+; CHECK-NEXT: vsaddu.vi v8, v8, -1
+; CHECK-NEXT: ret
+ %elt.head = insertelement <vscale x 32 x i16> poison, i16 -1, i32 0
+ %vb = shufflevector <vscale x 32 x i16> %elt.head, <vscale x 32 x i16> poison, <vscale x 32 x i32> zeroinitializer
+ %head = insertelement <vscale x 32 x i1> poison, i1 true, i32 0
+ %m = shufflevector <vscale x 32 x i1> %head, <vscale x 32 x i1> poison, <vscale x 32 x i32> zeroinitializer
+ %v = call <vscale x 32 x i16> @llvm.vp.uadd.sat.nxv32i16(<vscale x 32 x i16> %va, <vscale x 32 x i16> %vb, <vscale x 32 x i1> %m, i32 %evl)
+ ret <vscale x 32 x i16> %v
+}
+
+declare <vscale x 1 x i32> @llvm.vp.uadd.sat.nxv1i32(<vscale x 1 x i32>, <vscale x 1 x i32>, <vscale x 1 x i1>, i32)
+
+define <vscale x 1 x i32> @vsaddu_vv_nxv1i32(<vscale x 1 x i32> %va, <vscale x 1 x i32> %b, <vscale x 1 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vsaddu_vv_nxv1i32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, ma
+; CHECK-NEXT: vsaddu.vv v8, v8, v9, v0.t
+; CHECK-NEXT: ret
+ %v = call <vscale x 1 x i32> @llvm.vp.uadd.sat.nxv1i32(<vscale x 1 x i32> %va, <vscale x 1 x i32> %b, <vscale x 1 x i1> %m, i32 %evl)
+ ret <vscale x 1 x i32> %v
+}
+
+define <vscale x 1 x i32> @vsaddu_vv_nxv1i32_unmasked(<vscale x 1 x i32> %va, <vscale x 1 x i32> %b, i32 zeroext %evl) {
+; CHECK-LABEL: vsaddu_vv_nxv1i32_unmasked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, ma
+; CHECK-NEXT: vsaddu.vv v8, v8, v9
+; CHECK-NEXT: ret
+ %head = insertelement <vscale x 1 x i1> poison, i1 true, i32 0
+ %m = shufflevector <vscale x 1 x i1> %head, <vscale x 1 x i1> poison, <vscale x 1 x i32> zeroinitializer
+ %v = call <vscale x 1 x i32> @llvm.vp.uadd.sat.nxv1i32(<vscale x 1 x i32> %va, <vscale x 1 x i32> %b, <vscale x 1 x i1> %m, i32 %evl)
+ ret <vscale x 1 x i32> %v
+}
+
+define <vscale x 1 x i32> @vsaddu_vx_nxv1i32(<vscale x 1 x i32> %va, i32 %b, <vscale x 1 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vsaddu_vx_nxv1i32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma
+; CHECK-NEXT: vsaddu.vx v8, v8, a0, v0.t
+; CHECK-NEXT: ret
+ %elt.head = insertelement <vscale x 1 x i32> poison, i32 %b, i32 0
+ %vb = shufflevector <vscale x 1 x i32> %elt.head, <vscale x 1 x i32> poison, <vscale x 1 x i32> zeroinitializer
+ %v = call <vscale x 1 x i32> @llvm.vp.uadd.sat.nxv1i32(<vscale x 1 x i32> %va, <vscale x 1 x i32> %vb, <vscale x 1 x i1> %m, i32 %evl)
+ ret <vscale x 1 x i32> %v
+}
+
+define <vscale x 1 x i32> @vsaddu_vx_nxv1i32_unmasked(<vscale x 1 x i32> %va, i32 %b, i32 zeroext %evl) {
+; CHECK-LABEL: vsaddu_vx_nxv1i32_unmasked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma
+; CHECK-NEXT: vsaddu.vx v8, v8, a0
+; CHECK-NEXT: ret
+ %elt.head = insertelement <vscale x 1 x i32> poison, i32 %b, i32 0
+ %vb = shufflevector <vscale x 1 x i32> %elt.head, <vscale x 1 x i32> poison, <vscale x 1 x i32> zeroinitializer
+ %head = insertelement <vscale x 1 x i1> poison, i1 true, i32 0
+ %m = shufflevector <vscale x 1 x i1> %head, <vscale x 1 x i1> poison, <vscale x 1 x i32> zeroinitializer
+ %v = call <vscale x 1 x i32> @llvm.vp.uadd.sat.nxv1i32(<vscale x 1 x i32> %va, <vscale x 1 x i32> %vb, <vscale x 1 x i1> %m, i32 %evl)
+ ret <vscale x 1 x i32> %v
+}
+
+define <vscale x 1 x i32> @vsaddu_vi_nxv1i32(<vscale x 1 x i32> %va, <vscale x 1 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vsaddu_vi_nxv1i32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, ma
+; CHECK-NEXT: vsaddu.vi v8, v8, -1, v0.t
+; CHECK-NEXT: ret
+ %elt.head = insertelement <vscale x 1 x i32> poison, i32 -1, i32 0
+ %vb = shufflevector <vscale x 1 x i32> %elt.head, <vscale x 1 x i32> poison, <vscale x 1 x i32> zeroinitializer
+ %v = call <vscale x 1 x i32> @llvm.vp.uadd.sat.nxv1i32(<vscale x 1 x i32> %va, <vscale x 1 x i32> %vb, <vscale x 1 x i1> %m, i32 %evl)
+ ret <vscale x 1 x i32> %v
+}
+
+define <vscale x 1 x i32> @vsaddu_vi_nxv1i32_unmasked(<vscale x 1 x i32> %va, i32 zeroext %evl) {
+; CHECK-LABEL: vsaddu_vi_nxv1i32_unmasked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, ma
+; CHECK-NEXT: vsaddu.vi v8, v8, -1
+; CHECK-NEXT: ret
+ %elt.head = insertelement <vscale x 1 x i32> poison, i32 -1, i32 0
+ %vb = shufflevector <vscale x 1 x i32> %elt.head, <vscale x 1 x i32> poison, <vscale x 1 x i32> zeroinitializer
+ %head = insertelement <vscale x 1 x i1> poison, i1 true, i32 0
+ %m = shufflevector <vscale x 1 x i1> %head, <vscale x 1 x i1> poison, <vscale x 1 x i32> zeroinitializer
+ %v = call <vscale x 1 x i32> @llvm.vp.uadd.sat.nxv1i32(<vscale x 1 x i32> %va, <vscale x 1 x i32> %vb, <vscale x 1 x i1> %m, i32 %evl)
+ ret <vscale x 1 x i32> %v
+}
+
+declare <vscale x 2 x i32> @llvm.vp.uadd.sat.nxv2i32(<vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i1>, i32)
+
+define <vscale x 2 x i32> @vsaddu_vv_nxv2i32(<vscale x 2 x i32> %va, <vscale x 2 x i32> %b, <vscale x 2 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vsaddu_vv_nxv2i32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, ma
+; CHECK-NEXT: vsaddu.vv v8, v8, v9, v0.t
+; CHECK-NEXT: ret
+ %v = call <vscale x 2 x i32> @llvm.vp.uadd.sat.nxv2i32(<vscale x 2 x i32> %va, <vscale x 2 x i32> %b, <vscale x 2 x i1> %m, i32 %evl)
+ ret <vscale x 2 x i32> %v
+}
+
+define <vscale x 2 x i32> @vsaddu_vv_nxv2i32_unmasked(<vscale x 2 x i32> %va, <vscale x 2 x i32> %b, i32 zeroext %evl) {
+; CHECK-LABEL: vsaddu_vv_nxv2i32_unmasked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, ma
+; CHECK-NEXT: vsaddu.vv v8, v8, v9
+; CHECK-NEXT: ret
+ %head = insertelement <vscale x 2 x i1> poison, i1 true, i32 0
+ %m = shufflevector <vscale x 2 x i1> %head, <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer
+ %v = call <vscale x 2 x i32> @llvm.vp.uadd.sat.nxv2i32(<vscale x 2 x i32> %va, <vscale x 2 x i32> %b, <vscale x 2 x i1> %m, i32 %evl)
+ ret <vscale x 2 x i32> %v
+}
+
+define <vscale x 2 x i32> @vsaddu_vx_nxv2i32(<vscale x 2 x i32> %va, i32 %b, <vscale x 2 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vsaddu_vx_nxv2i32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma
+; CHECK-NEXT: vsaddu.vx v8, v8, a0, v0.t
+; CHECK-NEXT: ret
+ %elt.head = insertelement <vscale x 2 x i32> poison, i32 %b, i32 0
+ %vb = shufflevector <vscale x 2 x i32> %elt.head, <vscale x 2 x i32> poison, <vscale x 2 x i32> zeroinitializer
+ %v = call <vscale x 2 x i32> @llvm.vp.uadd.sat.nxv2i32(<vscale x 2 x i32> %va, <vscale x 2 x i32> %vb, <vscale x 2 x i1> %m, i32 %evl)
+ ret <vscale x 2 x i32> %v
+}
+
+define <vscale x 2 x i32> @vsaddu_vx_nxv2i32_unmasked(<vscale x 2 x i32> %va, i32 %b, i32 zeroext %evl) {
+; CHECK-LABEL: vsaddu_vx_nxv2i32_unmasked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma
+; CHECK-NEXT: vsaddu.vx v8, v8, a0
+; CHECK-NEXT: ret
+ %elt.head = insertelement <vscale x 2 x i32> poison, i32 %b, i32 0
+ %vb = shufflevector <vscale x 2 x i32> %elt.head, <vscale x 2 x i32> poison, <vscale x 2 x i32> zeroinitializer
+ %head = insertelement <vscale x 2 x i1> poison, i1 true, i32 0
+ %m = shufflevector <vscale x 2 x i1> %head, <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer
+ %v = call <vscale x 2 x i32> @llvm.vp.uadd.sat.nxv2i32(<vscale x 2 x i32> %va, <vscale x 2 x i32> %vb, <vscale x 2 x i1> %m, i32 %evl)
+ ret <vscale x 2 x i32> %v
+}
+
+define <vscale x 2 x i32> @vsaddu_vi_nxv2i32(<vscale x 2 x i32> %va, <vscale x 2 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vsaddu_vi_nxv2i32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, ma
+; CHECK-NEXT: vsaddu.vi v8, v8, -1, v0.t
+; CHECK-NEXT: ret
+ %elt.head = insertelement <vscale x 2 x i32> poison, i32 -1, i32 0
+ %vb = shufflevector <vscale x 2 x i32> %elt.head, <vscale x 2 x i32> poison, <vscale x 2 x i32> zeroinitializer
+ %v = call <vscale x 2 x i32> @llvm.vp.uadd.sat.nxv2i32(<vscale x 2 x i32> %va, <vscale x 2 x i32> %vb, <vscale x 2 x i1> %m, i32 %evl)
+ ret <vscale x 2 x i32> %v
+}
+
+define <vscale x 2 x i32> @vsaddu_vi_nxv2i32_unmasked(<vscale x 2 x i32> %va, i32 zeroext %evl) {
+; CHECK-LABEL: vsaddu_vi_nxv2i32_unmasked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, ma
+; CHECK-NEXT: vsaddu.vi v8, v8, -1
+; CHECK-NEXT: ret
+ %elt.head = insertelement <vscale x 2 x i32> poison, i32 -1, i32 0
+ %vb = shufflevector <vscale x 2 x i32> %elt.head, <vscale x 2 x i32> poison, <vscale x 2 x i32> zeroinitializer
+ %head = insertelement <vscale x 2 x i1> poison, i1 true, i32 0
+ %m = shufflevector <vscale x 2 x i1> %head, <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer
+ %v = call <vscale x 2 x i32> @llvm.vp.uadd.sat.nxv2i32(<vscale x 2 x i32> %va, <vscale x 2 x i32> %vb, <vscale x 2 x i1> %m, i32 %evl)
+ ret <vscale x 2 x i32> %v
+}
+
+declare <vscale x 4 x i32> @llvm.vp.uadd.sat.nxv4i32(<vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i1>, i32)
+
+define <vscale x 4 x i32> @vsaddu_vv_nxv4i32(<vscale x 4 x i32> %va, <vscale x 4 x i32> %b, <vscale x 4 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vsaddu_vv_nxv4i32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, ma
+; CHECK-NEXT: vsaddu.vv v8, v8, v10, v0.t
+; CHECK-NEXT: ret
+ %v = call <vscale x 4 x i32> @llvm.vp.uadd.sat.nxv4i32(<vscale x 4 x i32> %va, <vscale x 4 x i32> %b, <vscale x 4 x i1> %m, i32 %evl)
+ ret <vscale x 4 x i32> %v
+}
+
+define <vscale x 4 x i32> @vsaddu_vv_nxv4i32_unmasked(<vscale x 4 x i32> %va, <vscale x 4 x i32> %b, i32 zeroext %evl) {
+; CHECK-LABEL: vsaddu_vv_nxv4i32_unmasked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, ma
+; CHECK-NEXT: vsaddu.vv v8, v8, v10
+; CHECK-NEXT: ret
+ %head = insertelement <vscale x 4 x i1> poison, i1 true, i32 0
+ %m = shufflevector <vscale x 4 x i1> %head, <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer
+ %v = call <vscale x 4 x i32> @llvm.vp.uadd.sat.nxv4i32(<vscale x 4 x i32> %va, <vscale x 4 x i32> %b, <vscale x 4 x i1> %m, i32 %evl)
+ ret <vscale x 4 x i32> %v
+}
+
+define <vscale x 4 x i32> @vsaddu_vx_nxv4i32(<vscale x 4 x i32> %va, i32 %b, <vscale x 4 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vsaddu_vx_nxv4i32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, ma
+; CHECK-NEXT: vsaddu.vx v8, v8, a0, v0.t
+; CHECK-NEXT: ret
+ %elt.head = insertelement <vscale x 4 x i32> poison, i32 %b, i32 0
+ %vb = shufflevector <vscale x 4 x i32> %elt.head, <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
+ %v = call <vscale x 4 x i32> @llvm.vp.uadd.sat.nxv4i32(<vscale x 4 x i32> %va, <vscale x 4 x i32> %vb, <vscale x 4 x i1> %m, i32 %evl)
+ ret <vscale x 4 x i32> %v
+}
+
+define <vscale x 4 x i32> @vsaddu_vx_nxv4i32_unmasked(<vscale x 4 x i32> %va, i32 %b, i32 zeroext %evl) {
+; CHECK-LABEL: vsaddu_vx_nxv4i32_unmasked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, ma
+; CHECK-NEXT: vsaddu.vx v8, v8, a0
+; CHECK-NEXT: ret
+ %elt.head = insertelement <vscale x 4 x i32> poison, i32 %b, i32 0
+ %vb = shufflevector <vscale x 4 x i32> %elt.head, <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
+ %head = insertelement <vscale x 4 x i1> poison, i1 true, i32 0
+ %m = shufflevector <vscale x 4 x i1> %head, <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer
+ %v = call <vscale x 4 x i32> @llvm.vp.uadd.sat.nxv4i32(<vscale x 4 x i32> %va, <vscale x 4 x i32> %vb, <vscale x 4 x i1> %m, i32 %evl)
+ ret <vscale x 4 x i32> %v
+}
+
+define <vscale x 4 x i32> @vsaddu_vi_nxv4i32(<vscale x 4 x i32> %va, <vscale x 4 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vsaddu_vi_nxv4i32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, ma
+; CHECK-NEXT: vsaddu.vi v8, v8, -1, v0.t
+; CHECK-NEXT: ret
+ %elt.head = insertelement <vscale x 4 x i32> poison, i32 -1, i32 0
+ %vb = shufflevector <vscale x 4 x i32> %elt.head, <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
+ %v = call <vscale x 4 x i32> @llvm.vp.uadd.sat.nxv4i32(<vscale x 4 x i32> %va, <vscale x 4 x i32> %vb, <vscale x 4 x i1> %m, i32 %evl)
+ ret <vscale x 4 x i32> %v
+}
+
+define <vscale x 4 x i32> @vsaddu_vi_nxv4i32_unmasked(<vscale x 4 x i32> %va, i32 zeroext %evl) {
+; CHECK-LABEL: vsaddu_vi_nxv4i32_unmasked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, ma
+; CHECK-NEXT: vsaddu.vi v8, v8, -1
+; CHECK-NEXT: ret
+ %elt.head = insertelement <vscale x 4 x i32> poison, i32 -1, i32 0
+ %vb = shufflevector <vscale x 4 x i32> %elt.head, <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
+ %head = insertelement <vscale x 4 x i1> poison, i1 true, i32 0
+ %m = shufflevector <vscale x 4 x i1> %head, <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer
+ %v = call <vscale x 4 x i32> @llvm.vp.uadd.sat.nxv4i32(<vscale x 4 x i32> %va, <vscale x 4 x i32> %vb, <vscale x 4 x i1> %m, i32 %evl)
+ ret <vscale x 4 x i32> %v
+}
+
+declare <vscale x 8 x i32> @llvm.vp.uadd.sat.nxv8i32(<vscale x 8 x i32>, <vscale x 8 x i32>, <vscale x 8 x i1>, i32)
+
+define <vscale x 8 x i32> @vsaddu_vv_nxv8i32(<vscale x 8 x i32> %va, <vscale x 8 x i32> %b, <vscale x 8 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vsaddu_vv_nxv8i32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, ma
+; CHECK-NEXT: vsaddu.vv v8, v8, v12, v0.t
+; CHECK-NEXT: ret
+ %v = call <vscale x 8 x i32> @llvm.vp.uadd.sat.nxv8i32(<vscale x 8 x i32> %va, <vscale x 8 x i32> %b, <vscale x 8 x i1> %m, i32 %evl)
+ ret <vscale x 8 x i32> %v
+}
+
+define <vscale x 8 x i32> @vsaddu_vv_nxv8i32_unmasked(<vscale x 8 x i32> %va, <vscale x 8 x i32> %b, i32 zeroext %evl) {
+; CHECK-LABEL: vsaddu_vv_nxv8i32_unmasked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, ma
+; CHECK-NEXT: vsaddu.vv v8, v8, v12
+; CHECK-NEXT: ret
+ %head = insertelement <vscale x 8 x i1> poison, i1 true, i32 0
+ %m = shufflevector <vscale x 8 x i1> %head, <vscale x 8 x i1> poison, <vscale x 8 x i32> zeroinitializer
+ %v = call <vscale x 8 x i32> @llvm.vp.uadd.sat.nxv8i32(<vscale x 8 x i32> %va, <vscale x 8 x i32> %b, <vscale x 8 x i1> %m, i32 %evl)
+ ret <vscale x 8 x i32> %v
+}
+
+define <vscale x 8 x i32> @vsaddu_vx_nxv8i32(<vscale x 8 x i32> %va, i32 %b, <vscale x 8 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vsaddu_vx_nxv8i32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e32, m4, ta, ma
+; CHECK-NEXT: vsaddu.vx v8, v8, a0, v0.t
+; CHECK-NEXT: ret
+ %elt.head = insertelement <vscale x 8 x i32> poison, i32 %b, i32 0
+ %vb = shufflevector <vscale x 8 x i32> %elt.head, <vscale x 8 x i32> poison, <vscale x 8 x i32> zeroinitializer
+ %v = call <vscale x 8 x i32> @llvm.vp.uadd.sat.nxv8i32(<vscale x 8 x i32> %va, <vscale x 8 x i32> %vb, <vscale x 8 x i1> %m, i32 %evl)
+ ret <vscale x 8 x i32> %v
+}
+
+define <vscale x 8 x i32> @vsaddu_vx_nxv8i32_unmasked(<vscale x 8 x i32> %va, i32 %b, i32 zeroext %evl) {
+; CHECK-LABEL: vsaddu_vx_nxv8i32_unmasked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e32, m4, ta, ma
+; CHECK-NEXT: vsaddu.vx v8, v8, a0
+; CHECK-NEXT: ret
+ %elt.head = insertelement <vscale x 8 x i32> poison, i32 %b, i32 0
+ %vb = shufflevector <vscale x 8 x i32> %elt.head, <vscale x 8 x i32> poison, <vscale x 8 x i32> zeroinitializer
+ %head = insertelement <vscale x 8 x i1> poison, i1 true, i32 0
+ %m = shufflevector <vscale x 8 x i1> %head, <vscale x 8 x i1> poison, <vscale x 8 x i32> zeroinitializer
+ %v = call <vscale x 8 x i32> @llvm.vp.uadd.sat.nxv8i32(<vscale x 8 x i32> %va, <vscale x 8 x i32> %vb, <vscale x 8 x i1> %m, i32 %evl)
+ ret <vscale x 8 x i32> %v
+}
+
+define <vscale x 8 x i32> @vsaddu_vi_nxv8i32(<vscale x 8 x i32> %va, <vscale x 8 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vsaddu_vi_nxv8i32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, ma
+; CHECK-NEXT: vsaddu.vi v8, v8, -1, v0.t
+; CHECK-NEXT: ret
+ %elt.head = insertelement <vscale x 8 x i32> poison, i32 -1, i32 0
+ %vb = shufflevector <vscale x 8 x i32> %elt.head, <vscale x 8 x i32> poison, <vscale x 8 x i32> zeroinitializer
+ %v = call <vscale x 8 x i32> @llvm.vp.uadd.sat.nxv8i32(<vscale x 8 x i32> %va, <vscale x 8 x i32> %vb, <vscale x 8 x i1> %m, i32 %evl)
+ ret <vscale x 8 x i32> %v
+}
+
+define <vscale x 8 x i32> @vsaddu_vi_nxv8i32_unmasked(<vscale x 8 x i32> %va, i32 zeroext %evl) {
+; CHECK-LABEL: vsaddu_vi_nxv8i32_unmasked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, ma
+; CHECK-NEXT: vsaddu.vi v8, v8, -1
+; CHECK-NEXT: ret
+ %elt.head = insertelement <vscale x 8 x i32> poison, i32 -1, i32 0
+ %vb = shufflevector <vscale x 8 x i32> %elt.head, <vscale x 8 x i32> poison, <vscale x 8 x i32> zeroinitializer
+ %head = insertelement <vscale x 8 x i1> poison, i1 true, i32 0
+ %m = shufflevector <vscale x 8 x i1> %head, <vscale x 8 x i1> poison, <vscale x 8 x i32> zeroinitializer
+ %v = call <vscale x 8 x i32> @llvm.vp.uadd.sat.nxv8i32(<vscale x 8 x i32> %va, <vscale x 8 x i32> %vb, <vscale x 8 x i1> %m, i32 %evl)
+ ret <vscale x 8 x i32> %v
+}
+
+declare <vscale x 16 x i32> @llvm.vp.uadd.sat.nxv16i32(<vscale x 16 x i32>, <vscale x 16 x i32>, <vscale x 16 x i1>, i32)
+
+define <vscale x 16 x i32> @vsaddu_vv_nxv16i32(<vscale x 16 x i32> %va, <vscale x 16 x i32> %b, <vscale x 16 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vsaddu_vv_nxv16i32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e32, m8, ta, ma
+; CHECK-NEXT: vsaddu.vv v8, v8, v16, v0.t
+; CHECK-NEXT: ret
+ %v = call <vscale x 16 x i32> @llvm.vp.uadd.sat.nxv16i32(<vscale x 16 x i32> %va, <vscale x 16 x i32> %b, <vscale x 16 x i1> %m, i32 %evl)
+ ret <vscale x 16 x i32> %v
+}
+
+define <vscale x 16 x i32> @vsaddu_vv_nxv16i32_unmasked(<vscale x 16 x i32> %va, <vscale x 16 x i32> %b, i32 zeroext %evl) {
+; CHECK-LABEL: vsaddu_vv_nxv16i32_unmasked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e32, m8, ta, ma
+; CHECK-NEXT: vsaddu.vv v8, v8, v16
+; CHECK-NEXT: ret
+ %head = insertelement <vscale x 16 x i1> poison, i1 true, i32 0
+ %m = shufflevector <vscale x 16 x i1> %head, <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer
+ %v = call <vscale x 16 x i32> @llvm.vp.uadd.sat.nxv16i32(<vscale x 16 x i32> %va, <vscale x 16 x i32> %b, <vscale x 16 x i1> %m, i32 %evl)
+ ret <vscale x 16 x i32> %v
+}
+
+define <vscale x 16 x i32> @vsaddu_vx_nxv16i32(<vscale x 16 x i32> %va, i32 %b, <vscale x 16 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vsaddu_vx_nxv16i32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, ma
+; CHECK-NEXT: vsaddu.vx v8, v8, a0, v0.t
+; CHECK-NEXT: ret
+ %elt.head = insertelement <vscale x 16 x i32> poison, i32 %b, i32 0
+ %vb = shufflevector <vscale x 16 x i32> %elt.head, <vscale x 16 x i32> poison, <vscale x 16 x i32> zeroinitializer
+ %v = call <vscale x 16 x i32> @llvm.vp.uadd.sat.nxv16i32(<vscale x 16 x i32> %va, <vscale x 16 x i32> %vb, <vscale x 16 x i1> %m, i32 %evl)
+ ret <vscale x 16 x i32> %v
+}
+
+define <vscale x 16 x i32> @vsaddu_vx_nxv16i32_unmasked(<vscale x 16 x i32> %va, i32 %b, i32 zeroext %evl) {
+; CHECK-LABEL: vsaddu_vx_nxv16i32_unmasked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, ma
+; CHECK-NEXT: vsaddu.vx v8, v8, a0
+; CHECK-NEXT: ret
+ %elt.head = insertelement <vscale x 16 x i32> poison, i32 %b, i32 0
+ %vb = shufflevector <vscale x 16 x i32> %elt.head, <vscale x 16 x i32> poison, <vscale x 16 x i32> zeroinitializer
+ %head = insertelement <vscale x 16 x i1> poison, i1 true, i32 0
+ %m = shufflevector <vscale x 16 x i1> %head, <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer
+ %v = call <vscale x 16 x i32> @llvm.vp.uadd.sat.nxv16i32(<vscale x 16 x i32> %va, <vscale x 16 x i32> %vb, <vscale x 16 x i1> %m, i32 %evl)
+ ret <vscale x 16 x i32> %v
+}
+
+define <vscale x 16 x i32> @vsaddu_vi_nxv16i32(<vscale x 16 x i32> %va, <vscale x 16 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vsaddu_vi_nxv16i32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e32, m8, ta, ma
+; CHECK-NEXT: vsaddu.vi v8, v8, -1, v0.t
+; CHECK-NEXT: ret
+ %elt.head = insertelement <vscale x 16 x i32> poison, i32 -1, i32 0
+ %vb = shufflevector <vscale x 16 x i32> %elt.head, <vscale x 16 x i32> poison, <vscale x 16 x i32> zeroinitializer
+ %v = call <vscale x 16 x i32> @llvm.vp.uadd.sat.nxv16i32(<vscale x 16 x i32> %va, <vscale x 16 x i32> %vb, <vscale x 16 x i1> %m, i32 %evl)
+ ret <vscale x 16 x i32> %v
+}
+
+define <vscale x 16 x i32> @vsaddu_vi_nxv16i32_unmasked(<vscale x 16 x i32> %va, i32 zeroext %evl) {
+; CHECK-LABEL: vsaddu_vi_nxv16i32_unmasked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e32, m8, ta, ma
+; CHECK-NEXT: vsaddu.vi v8, v8, -1
+; CHECK-NEXT: ret
+ %elt.head = insertelement <vscale x 16 x i32> poison, i32 -1, i32 0
+ %vb = shufflevector <vscale x 16 x i32> %elt.head, <vscale x 16 x i32> poison, <vscale x 16 x i32> zeroinitializer
+ %head = insertelement <vscale x 16 x i1> poison, i1 true, i32 0
+ %m = shufflevector <vscale x 16 x i1> %head, <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer
+ %v = call <vscale x 16 x i32> @llvm.vp.uadd.sat.nxv16i32(<vscale x 16 x i32> %va, <vscale x 16 x i32> %vb, <vscale x 16 x i1> %m, i32 %evl)
+ ret <vscale x 16 x i32> %v
+}
+
+; Test that split-legalization works when the mask needs manual splitting.
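+; (Informal reading of the CHECK lines below, not part of the autogenerated
+; assertions: <vscale x 32 x i32> is twice the largest legal LMUL=8 register
+; group, so the operation is split into two <vscale x 16 x i32> halves. The
+; EVL for the upper half is max(evl - VLMAX of the low half, 0), and the upper
+; half of the <vscale x 32 x i1> mask is extracted with vslidedown before
+; being moved back into v0.)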
+
+declare <vscale x 32 x i32> @llvm.vp.uadd.sat.nxv32i32(<vscale x 32 x i32>, <vscale x 32 x i32>, <vscale x 32 x i1>, i32)
+
+define <vscale x 32 x i32> @vsaddu_vi_nxv32i32(<vscale x 32 x i32> %va, <vscale x 32 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vsaddu_vi_nxv32i32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmv1r.v v24, v0
+; CHECK-NEXT: csrr a1, vlenb
+; CHECK-NEXT: srli a2, a1, 2
+; CHECK-NEXT: vsetvli a3, zero, e8, mf2, ta, ma
+; CHECK-NEXT: vslidedown.vx v0, v0, a2
+; CHECK-NEXT: slli a1, a1, 1
+; CHECK-NEXT: sub a2, a0, a1
+; CHECK-NEXT: sltu a3, a0, a2
+; CHECK-NEXT: addi a3, a3, -1
+; CHECK-NEXT: and a2, a3, a2
+; CHECK-NEXT: vsetvli zero, a2, e32, m8, ta, ma
+; CHECK-NEXT: vsaddu.vi v16, v16, -1, v0.t
+; CHECK-NEXT: bltu a0, a1, .LBB118_2
+; CHECK-NEXT: # %bb.1:
+; CHECK-NEXT: mv a0, a1
+; CHECK-NEXT: .LBB118_2:
+; CHECK-NEXT: vsetvli zero, a0, e32, m8, ta, ma
+; CHECK-NEXT: vmv1r.v v0, v24
+; CHECK-NEXT: vsaddu.vi v8, v8, -1, v0.t
+; CHECK-NEXT: ret
+ %elt.head = insertelement <vscale x 32 x i32> poison, i32 -1, i32 0
+ %vb = shufflevector <vscale x 32 x i32> %elt.head, <vscale x 32 x i32> poison, <vscale x 32 x i32> zeroinitializer
+ %v = call <vscale x 32 x i32> @llvm.vp.uadd.sat.nxv32i32(<vscale x 32 x i32> %va, <vscale x 32 x i32> %vb, <vscale x 32 x i1> %m, i32 %evl)
+ ret <vscale x 32 x i32> %v
+}
+
+define <vscale x 32 x i32> @vsaddu_vi_nxv32i32_unmasked(<vscale x 32 x i32> %va, i32 zeroext %evl) {
+; CHECK-LABEL: vsaddu_vi_nxv32i32_unmasked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: csrr a1, vlenb
+; CHECK-NEXT: slli a1, a1, 1
+; CHECK-NEXT: sub a2, a0, a1
+; CHECK-NEXT: sltu a3, a0, a2
+; CHECK-NEXT: addi a3, a3, -1
+; CHECK-NEXT: and a2, a3, a2
+; CHECK-NEXT: vsetvli zero, a2, e32, m8, ta, ma
+; CHECK-NEXT: vsaddu.vi v16, v16, -1
+; CHECK-NEXT: bltu a0, a1, .LBB119_2
+; CHECK-NEXT: # %bb.1:
+; CHECK-NEXT: mv a0, a1
+; CHECK-NEXT: .LBB119_2:
+; CHECK-NEXT: vsetvli zero, a0, e32, m8, ta, ma
+; CHECK-NEXT: vsaddu.vi v8, v8, -1
+; CHECK-NEXT: ret
+ %elt.head = insertelement <vscale x 32 x i32> poison, i32 -1, i32 0
+ %vb = shufflevector <vscale x 32 x i32> %elt.head, <vscale x 32 x i32> poison, <vscale x 32 x i32> zeroinitializer
+ %head = insertelement <vscale x 32 x i1> poison, i1 true, i32 0
+ %m = shufflevector <vscale x 32 x i1> %head, <vscale x 32 x i1> poison, <vscale x 32 x i32> zeroinitializer
+ %v = call <vscale x 32 x i32> @llvm.vp.uadd.sat.nxv32i32(<vscale x 32 x i32> %va, <vscale x 32 x i32> %vb, <vscale x 32 x i1> %m, i32 %evl)
+ ret <vscale x 32 x i32> %v
+}
+
+declare <vscale x 1 x i64> @llvm.vp.uadd.sat.nxv1i64(<vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i1>, i32)
+
+define <vscale x 1 x i64> @vsaddu_vv_nxv1i64(<vscale x 1 x i64> %va, <vscale x 1 x i64> %b, <vscale x 1 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vsaddu_vv_nxv1i64:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, ma
+; CHECK-NEXT: vsaddu.vv v8, v8, v9, v0.t
+; CHECK-NEXT: ret
+ %v = call <vscale x 1 x i64> @llvm.vp.uadd.sat.nxv1i64(<vscale x 1 x i64> %va, <vscale x 1 x i64> %b, <vscale x 1 x i1> %m, i32 %evl)
+ ret <vscale x 1 x i64> %v
+}
+
+define <vscale x 1 x i64> @vsaddu_vv_nxv1i64_unmasked(<vscale x 1 x i64> %va, <vscale x 1 x i64> %b, i32 zeroext %evl) {
+; CHECK-LABEL: vsaddu_vv_nxv1i64_unmasked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, ma
+; CHECK-NEXT: vsaddu.vv v8, v8, v9
+; CHECK-NEXT: ret
+ %head = insertelement <vscale x 1 x i1> poison, i1 true, i32 0
+ %m = shufflevector <vscale x 1 x i1> %head, <vscale x 1 x i1> poison, <vscale x 1 x i32> zeroinitializer
+ %v = call <vscale x 1 x i64> @llvm.vp.uadd.sat.nxv1i64(<vscale x 1 x i64> %va, <vscale x 1 x i64> %b, <vscale x 1 x i1> %m, i32 %evl)
+ ret <vscale x 1 x i64> %v
+}
+
+define <vscale x 1 x i64> @vsaddu_vx_nxv1i64(<vscale x 1 x i64> %va, i64 %b, <vscale x 1 x i1> %m, i32 zeroext %evl) {
+; RV32-LABEL: vsaddu_vx_nxv1i64:
+; RV32: # %bb.0:
+; RV32-NEXT: addi sp, sp, -16
+; RV32-NEXT: .cfi_def_cfa_offset 16
+; RV32-NEXT: sw a1, 12(sp)
+; RV32-NEXT: sw a0, 8(sp)
+; RV32-NEXT: addi a0, sp, 8
+; RV32-NEXT: vsetvli a1, zero, e64, m1, ta, ma
+; RV32-NEXT: vlse64.v v9, (a0), zero
+; RV32-NEXT: vsetvli zero, a2, e64, m1, ta, ma
+; RV32-NEXT: vsaddu.vv v8, v8, v9, v0.t
+; RV32-NEXT: addi sp, sp, 16
+; RV32-NEXT: ret
+;
+; RV64-LABEL: vsaddu_vx_nxv1i64:
+; RV64: # %bb.0:
+; RV64-NEXT: vsetvli zero, a1, e64, m1, ta, ma
+; RV64-NEXT: vsaddu.vx v8, v8, a0, v0.t
+; RV64-NEXT: ret
+ %elt.head = insertelement <vscale x 1 x i64> poison, i64 %b, i32 0
+ %vb = shufflevector <vscale x 1 x i64> %elt.head, <vscale x 1 x i64> poison, <vscale x 1 x i32> zeroinitializer
+ %v = call <vscale x 1 x i64> @llvm.vp.uadd.sat.nxv1i64(<vscale x 1 x i64> %va, <vscale x 1 x i64> %vb, <vscale x 1 x i1> %m, i32 %evl)
+ ret <vscale x 1 x i64> %v
+}
+
+define <vscale x 1 x i64> @vsaddu_vx_nxv1i64_unmasked(<vscale x 1 x i64> %va, i64 %b, i32 zeroext %evl) {
+; RV32-LABEL: vsaddu_vx_nxv1i64_unmasked:
+; RV32: # %bb.0:
+; RV32-NEXT: addi sp, sp, -16
+; RV32-NEXT: .cfi_def_cfa_offset 16
+; RV32-NEXT: sw a1, 12(sp)
+; RV32-NEXT: sw a0, 8(sp)
+; RV32-NEXT: addi a0, sp, 8
+; RV32-NEXT: vsetvli a1, zero, e64, m1, ta, ma
+; RV32-NEXT: vlse64.v v9, (a0), zero
+; RV32-NEXT: vsetvli zero, a2, e64, m1, ta, ma
+; RV32-NEXT: vsaddu.vv v8, v8, v9
+; RV32-NEXT: addi sp, sp, 16
+; RV32-NEXT: ret
+;
+; RV64-LABEL: vsaddu_vx_nxv1i64_unmasked:
+; RV64: # %bb.0:
+; RV64-NEXT: vsetvli zero, a1, e64, m1, ta, ma
+; RV64-NEXT: vsaddu.vx v8, v8, a0
+; RV64-NEXT: ret
+ %elt.head = insertelement <vscale x 1 x i64> poison, i64 %b, i32 0
+ %vb = shufflevector <vscale x 1 x i64> %elt.head, <vscale x 1 x i64> poison, <vscale x 1 x i32> zeroinitializer
+ %head = insertelement <vscale x 1 x i1> poison, i1 true, i32 0
+ %m = shufflevector <vscale x 1 x i1> %head, <vscale x 1 x i1> poison, <vscale x 1 x i32> zeroinitializer
+ %v = call <vscale x 1 x i64> @llvm.vp.uadd.sat.nxv1i64(<vscale x 1 x i64> %va, <vscale x 1 x i64> %vb, <vscale x 1 x i1> %m, i32 %evl)
+ ret <vscale x 1 x i64> %v
+}
+
+define <vscale x 1 x i64> @vsaddu_vi_nxv1i64(<vscale x 1 x i64> %va, <vscale x 1 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vsaddu_vi_nxv1i64:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, ma
+; CHECK-NEXT: vsaddu.vi v8, v8, -1, v0.t
+; CHECK-NEXT: ret
+ %elt.head = insertelement <vscale x 1 x i64> poison, i64 -1, i32 0
+ %vb = shufflevector <vscale x 1 x i64> %elt.head, <vscale x 1 x i64> poison, <vscale x 1 x i32> zeroinitializer
+ %v = call <vscale x 1 x i64> @llvm.vp.uadd.sat.nxv1i64(<vscale x 1 x i64> %va, <vscale x 1 x i64> %vb, <vscale x 1 x i1> %m, i32 %evl)
+ ret <vscale x 1 x i64> %v
+}
+
+define <vscale x 1 x i64> @vsaddu_vi_nxv1i64_unmasked(<vscale x 1 x i64> %va, i32 zeroext %evl) {
+; CHECK-LABEL: vsaddu_vi_nxv1i64_unmasked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, ma
+; CHECK-NEXT: vsaddu.vi v8, v8, -1
+; CHECK-NEXT: ret
+ %elt.head = insertelement <vscale x 1 x i64> poison, i64 -1, i32 0
+ %vb = shufflevector <vscale x 1 x i64> %elt.head, <vscale x 1 x i64> poison, <vscale x 1 x i32> zeroinitializer
+ %head = insertelement <vscale x 1 x i1> poison, i1 true, i32 0
+ %m = shufflevector <vscale x 1 x i1> %head, <vscale x 1 x i1> poison, <vscale x 1 x i32> zeroinitializer
+ %v = call <vscale x 1 x i64> @llvm.vp.uadd.sat.nxv1i64(<vscale x 1 x i64> %va, <vscale x 1 x i64> %vb, <vscale x 1 x i1> %m, i32 %evl)
+ ret <vscale x 1 x i64> %v
+}
+
+declare <vscale x 2 x i64> @llvm.vp.uadd.sat.nxv2i64(<vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i1>, i32)
+
+define <vscale x 2 x i64> @vsaddu_vv_nxv2i64(<vscale x 2 x i64> %va, <vscale x 2 x i64> %b, <vscale x 2 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vsaddu_vv_nxv2i64:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e64, m2, ta, ma
+; CHECK-NEXT: vsaddu.vv v8, v8, v10, v0.t
+; CHECK-NEXT: ret
+ %v = call <vscale x 2 x i64> @llvm.vp.uadd.sat.nxv2i64(<vscale x 2 x i64> %va, <vscale x 2 x i64> %b, <vscale x 2 x i1> %m, i32 %evl)
+ ret <vscale x 2 x i64> %v
+}
+
+define <vscale x 2 x i64> @vsaddu_vv_nxv2i64_unmasked(<vscale x 2 x i64> %va, <vscale x 2 x i64> %b, i32 zeroext %evl) {
+; CHECK-LABEL: vsaddu_vv_nxv2i64_unmasked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e64, m2, ta, ma
+; CHECK-NEXT: vsaddu.vv v8, v8, v10
+; CHECK-NEXT: ret
+ %head = insertelement <vscale x 2 x i1> poison, i1 true, i32 0
+ %m = shufflevector <vscale x 2 x i1> %head, <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer
+ %v = call <vscale x 2 x i64> @llvm.vp.uadd.sat.nxv2i64(<vscale x 2 x i64> %va, <vscale x 2 x i64> %b, <vscale x 2 x i1> %m, i32 %evl)
+ ret <vscale x 2 x i64> %v
+}
+
+define <vscale x 2 x i64> @vsaddu_vx_nxv2i64(<vscale x 2 x i64> %va, i64 %b, <vscale x 2 x i1> %m, i32 zeroext %evl) {
+; RV32-LABEL: vsaddu_vx_nxv2i64:
+; RV32: # %bb.0:
+; RV32-NEXT: addi sp, sp, -16
+; RV32-NEXT: .cfi_def_cfa_offset 16
+; RV32-NEXT: sw a1, 12(sp)
+; RV32-NEXT: sw a0, 8(sp)
+; RV32-NEXT: addi a0, sp, 8
+; RV32-NEXT: vsetvli a1, zero, e64, m2, ta, ma
+; RV32-NEXT: vlse64.v v10, (a0), zero
+; RV32-NEXT: vsetvli zero, a2, e64, m2, ta, ma
+; RV32-NEXT: vsaddu.vv v8, v8, v10, v0.t
+; RV32-NEXT: addi sp, sp, 16
+; RV32-NEXT: ret
+;
+; RV64-LABEL: vsaddu_vx_nxv2i64:
+; RV64: # %bb.0:
+; RV64-NEXT: vsetvli zero, a1, e64, m2, ta, ma
+; RV64-NEXT: vsaddu.vx v8, v8, a0, v0.t
+; RV64-NEXT: ret
+ %elt.head = insertelement <vscale x 2 x i64> poison, i64 %b, i32 0
+ %vb = shufflevector <vscale x 2 x i64> %elt.head, <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer
+ %v = call <vscale x 2 x i64> @llvm.vp.uadd.sat.nxv2i64(<vscale x 2 x i64> %va, <vscale x 2 x i64> %vb, <vscale x 2 x i1> %m, i32 %evl)
+ ret <vscale x 2 x i64> %v
+}
+
+define <vscale x 2 x i64> @vsaddu_vx_nxv2i64_unmasked(<vscale x 2 x i64> %va, i64 %b, i32 zeroext %evl) {
+; RV32-LABEL: vsaddu_vx_nxv2i64_unmasked:
+; RV32: # %bb.0:
+; RV32-NEXT: addi sp, sp, -16
+; RV32-NEXT: .cfi_def_cfa_offset 16
+; RV32-NEXT: sw a1, 12(sp)
+; RV32-NEXT: sw a0, 8(sp)
+; RV32-NEXT: addi a0, sp, 8
+; RV32-NEXT: vsetvli a1, zero, e64, m2, ta, ma
+; RV32-NEXT: vlse64.v v10, (a0), zero
+; RV32-NEXT: vsetvli zero, a2, e64, m2, ta, ma
+; RV32-NEXT: vsaddu.vv v8, v8, v10
+; RV32-NEXT: addi sp, sp, 16
+; RV32-NEXT: ret
+;
+; RV64-LABEL: vsaddu_vx_nxv2i64_unmasked:
+; RV64: # %bb.0:
+; RV64-NEXT: vsetvli zero, a1, e64, m2, ta, ma
+; RV64-NEXT: vsaddu.vx v8, v8, a0
+; RV64-NEXT: ret
+ %elt.head = insertelement <vscale x 2 x i64> poison, i64 %b, i32 0
+ %vb = shufflevector <vscale x 2 x i64> %elt.head, <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer
+ %head = insertelement <vscale x 2 x i1> poison, i1 true, i32 0
+ %m = shufflevector <vscale x 2 x i1> %head, <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer
+ %v = call <vscale x 2 x i64> @llvm.vp.uadd.sat.nxv2i64(<vscale x 2 x i64> %va, <vscale x 2 x i64> %vb, <vscale x 2 x i1> %m, i32 %evl)
+ ret <vscale x 2 x i64> %v
+}
+
+define <vscale x 2 x i64> @vsaddu_vi_nxv2i64(<vscale x 2 x i64> %va, <vscale x 2 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vsaddu_vi_nxv2i64:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e64, m2, ta, ma
+; CHECK-NEXT: vsaddu.vi v8, v8, -1, v0.t
+; CHECK-NEXT: ret
+ %elt.head = insertelement <vscale x 2 x i64> poison, i64 -1, i32 0
+ %vb = shufflevector <vscale x 2 x i64> %elt.head, <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer
+ %v = call <vscale x 2 x i64> @llvm.vp.uadd.sat.nxv2i64(<vscale x 2 x i64> %va, <vscale x 2 x i64> %vb, <vscale x 2 x i1> %m, i32 %evl)
+ ret <vscale x 2 x i64> %v
+}
+
+define <vscale x 2 x i64> @vsaddu_vi_nxv2i64_unmasked(<vscale x 2 x i64> %va, i32 zeroext %evl) {
+; CHECK-LABEL: vsaddu_vi_nxv2i64_unmasked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e64, m2, ta, ma
+; CHECK-NEXT: vsaddu.vi v8, v8, -1
+; CHECK-NEXT: ret
+ %elt.head = insertelement <vscale x 2 x i64> poison, i64 -1, i32 0
+ %vb = shufflevector <vscale x 2 x i64> %elt.head, <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer
+ %head = insertelement <vscale x 2 x i1> poison, i1 true, i32 0
+ %m = shufflevector <vscale x 2 x i1> %head, <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer
+ %v = call <vscale x 2 x i64> @llvm.vp.uadd.sat.nxv2i64(<vscale x 2 x i64> %va, <vscale x 2 x i64> %vb, <vscale x 2 x i1> %m, i32 %evl)
+ ret <vscale x 2 x i64> %v
+}
+
+declare <vscale x 4 x i64> @llvm.vp.uadd.sat.nxv4i64(<vscale x 4 x i64>, <vscale x 4 x i64>, <vscale x 4 x i1>, i32)
+
+define <vscale x 4 x i64> @vsaddu_vv_nxv4i64(<vscale x 4 x i64> %va, <vscale x 4 x i64> %b, <vscale x 4 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vsaddu_vv_nxv4i64:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e64, m4, ta, ma
+; CHECK-NEXT: vsaddu.vv v8, v8, v12, v0.t
+; CHECK-NEXT: ret
+ %v = call <vscale x 4 x i64> @llvm.vp.uadd.sat.nxv4i64(<vscale x 4 x i64> %va, <vscale x 4 x i64> %b, <vscale x 4 x i1> %m, i32 %evl)
+ ret <vscale x 4 x i64> %v
+}
+
+define <vscale x 4 x i64> @vsaddu_vv_nxv4i64_unmasked(<vscale x 4 x i64> %va, <vscale x 4 x i64> %b, i32 zeroext %evl) {
+; CHECK-LABEL: vsaddu_vv_nxv4i64_unmasked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e64, m4, ta, ma
+; CHECK-NEXT: vsaddu.vv v8, v8, v12
+; CHECK-NEXT: ret
+ %head = insertelement <vscale x 4 x i1> poison, i1 true, i32 0
+ %m = shufflevector <vscale x 4 x i1> %head, <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer
+ %v = call <vscale x 4 x i64> @llvm.vp.uadd.sat.nxv4i64(<vscale x 4 x i64> %va, <vscale x 4 x i64> %b, <vscale x 4 x i1> %m, i32 %evl)
+ ret <vscale x 4 x i64> %v
+}
+
+define <vscale x 4 x i64> @vsaddu_vx_nxv4i64(<vscale x 4 x i64> %va, i64 %b, <vscale x 4 x i1> %m, i32 zeroext %evl) {
+; RV32-LABEL: vsaddu_vx_nxv4i64:
+; RV32: # %bb.0:
+; RV32-NEXT: addi sp, sp, -16
+; RV32-NEXT: .cfi_def_cfa_offset 16
+; RV32-NEXT: sw a1, 12(sp)
+; RV32-NEXT: sw a0, 8(sp)
+; RV32-NEXT: addi a0, sp, 8
+; RV32-NEXT: vsetvli a1, zero, e64, m4, ta, ma
+; RV32-NEXT: vlse64.v v12, (a0), zero
+; RV32-NEXT: vsetvli zero, a2, e64, m4, ta, ma
+; RV32-NEXT: vsaddu.vv v8, v8, v12, v0.t
+; RV32-NEXT: addi sp, sp, 16
+; RV32-NEXT: ret
+;
+; RV64-LABEL: vsaddu_vx_nxv4i64:
+; RV64: # %bb.0:
+; RV64-NEXT: vsetvli zero, a1, e64, m4, ta, ma
+; RV64-NEXT: vsaddu.vx v8, v8, a0, v0.t
+; RV64-NEXT: ret
+ %elt.head = insertelement <vscale x 4 x i64> poison, i64 %b, i32 0
+ %vb = shufflevector <vscale x 4 x i64> %elt.head, <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer
+ %v = call <vscale x 4 x i64> @llvm.vp.uadd.sat.nxv4i64(<vscale x 4 x i64> %va, <vscale x 4 x i64> %vb, <vscale x 4 x i1> %m, i32 %evl)
+ ret <vscale x 4 x i64> %v
+}
+
+define <vscale x 4 x i64> @vsaddu_vx_nxv4i64_unmasked(<vscale x 4 x i64> %va, i64 %b, i32 zeroext %evl) {
+; RV32-LABEL: vsaddu_vx_nxv4i64_unmasked:
+; RV32: # %bb.0:
+; RV32-NEXT: addi sp, sp, -16
+; RV32-NEXT: .cfi_def_cfa_offset 16
+; RV32-NEXT: sw a1, 12(sp)
+; RV32-NEXT: sw a0, 8(sp)
+; RV32-NEXT: addi a0, sp, 8
+; RV32-NEXT: vsetvli a1, zero, e64, m4, ta, ma
+; RV32-NEXT: vlse64.v v12, (a0), zero
+; RV32-NEXT: vsetvli zero, a2, e64, m4, ta, ma
+; RV32-NEXT: vsaddu.vv v8, v8, v12
+; RV32-NEXT: addi sp, sp, 16
+; RV32-NEXT: ret
+;
+; RV64-LABEL: vsaddu_vx_nxv4i64_unmasked:
+; RV64: # %bb.0:
+; RV64-NEXT: vsetvli zero, a1, e64, m4, ta, ma
+; RV64-NEXT: vsaddu.vx v8, v8, a0
+; RV64-NEXT: ret
+ %elt.head = insertelement <vscale x 4 x i64> poison, i64 %b, i32 0
+ %vb = shufflevector <vscale x 4 x i64> %elt.head, <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer
+ %head = insertelement <vscale x 4 x i1> poison, i1 true, i32 0
+ %m = shufflevector <vscale x 4 x i1> %head, <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer
+ %v = call <vscale x 4 x i64> @llvm.vp.uadd.sat.nxv4i64(<vscale x 4 x i64> %va, <vscale x 4 x i64> %vb, <vscale x 4 x i1> %m, i32 %evl)
+ ret <vscale x 4 x i64> %v
+}
+
+define <vscale x 4 x i64> @vsaddu_vi_nxv4i64(<vscale x 4 x i64> %va, <vscale x 4 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vsaddu_vi_nxv4i64:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e64, m4, ta, ma
+; CHECK-NEXT: vsaddu.vi v8, v8, -1, v0.t
+; CHECK-NEXT: ret
+ %elt.head = insertelement <vscale x 4 x i64> poison, i64 -1, i32 0
+ %vb = shufflevector <vscale x 4 x i64> %elt.head, <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer
+ %v = call <vscale x 4 x i64> @llvm.vp.uadd.sat.nxv4i64(<vscale x 4 x i64> %va, <vscale x 4 x i64> %vb, <vscale x 4 x i1> %m, i32 %evl)
+ ret <vscale x 4 x i64> %v
+}
+
+define <vscale x 4 x i64> @vsaddu_vi_nxv4i64_unmasked(<vscale x 4 x i64> %va, i32 zeroext %evl) {
+; CHECK-LABEL: vsaddu_vi_nxv4i64_unmasked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e64, m4, ta, ma
+; CHECK-NEXT: vsaddu.vi v8, v8, -1
+; CHECK-NEXT: ret
+ %elt.head = insertelement <vscale x 4 x i64> poison, i64 -1, i32 0
+ %vb = shufflevector <vscale x 4 x i64> %elt.head, <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer
+ %head = insertelement <vscale x 4 x i1> poison, i1 true, i32 0
+ %m = shufflevector <vscale x 4 x i1> %head, <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer
+ %v = call <vscale x 4 x i64> @llvm.vp.uadd.sat.nxv4i64(<vscale x 4 x i64> %va, <vscale x 4 x i64> %vb, <vscale x 4 x i1> %m, i32 %evl)
+ ret <vscale x 4 x i64> %v
+}
+
+declare <vscale x 8 x i64> @llvm.vp.uadd.sat.nxv8i64(<vscale x 8 x i64>, <vscale x 8 x i64>, <vscale x 8 x i1>, i32)
+
+define <vscale x 8 x i64> @vsaddu_vv_nxv8i64(<vscale x 8 x i64> %va, <vscale x 8 x i64> %b, <vscale x 8 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vsaddu_vv_nxv8i64:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma
+; CHECK-NEXT: vsaddu.vv v8, v8, v16, v0.t
+; CHECK-NEXT: ret
+ %v = call <vscale x 8 x i64> @llvm.vp.uadd.sat.nxv8i64(<vscale x 8 x i64> %va, <vscale x 8 x i64> %b, <vscale x 8 x i1> %m, i32 %evl)
+ ret <vscale x 8 x i64> %v
+}
+
+define <vscale x 8 x i64> @vsaddu_vv_nxv8i64_unmasked(<vscale x 8 x i64> %va, <vscale x 8 x i64> %b, i32 zeroext %evl) {
+; CHECK-LABEL: vsaddu_vv_nxv8i64_unmasked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma
+; CHECK-NEXT: vsaddu.vv v8, v8, v16
+; CHECK-NEXT: ret
+ %head = insertelement <vscale x 8 x i1> poison, i1 true, i32 0
+ %m = shufflevector <vscale x 8 x i1> %head, <vscale x 8 x i1> poison, <vscale x 8 x i32> zeroinitializer
+ %v = call <vscale x 8 x i64> @llvm.vp.uadd.sat.nxv8i64(<vscale x 8 x i64> %va, <vscale x 8 x i64> %b, <vscale x 8 x i1> %m, i32 %evl)
+ ret <vscale x 8 x i64> %v
+}
+
+define <vscale x 8 x i64> @vsaddu_vx_nxv8i64(<vscale x 8 x i64> %va, i64 %b, <vscale x 8 x i1> %m, i32 zeroext %evl) {
+; RV32-LABEL: vsaddu_vx_nxv8i64:
+; RV32: # %bb.0:
+; RV32-NEXT: addi sp, sp, -16
+; RV32-NEXT: .cfi_def_cfa_offset 16
+; RV32-NEXT: sw a1, 12(sp)
+; RV32-NEXT: sw a0, 8(sp)
+; RV32-NEXT: addi a0, sp, 8
+; RV32-NEXT: vsetvli a1, zero, e64, m8, ta, ma
+; RV32-NEXT: vlse64.v v16, (a0), zero
+; RV32-NEXT: vsetvli zero, a2, e64, m8, ta, ma
+; RV32-NEXT: vsaddu.vv v8, v8, v16, v0.t
+; RV32-NEXT: addi sp, sp, 16
+; RV32-NEXT: ret
+;
+; RV64-LABEL: vsaddu_vx_nxv8i64:
+; RV64: # %bb.0:
+; RV64-NEXT: vsetvli zero, a1, e64, m8, ta, ma
+; RV64-NEXT: vsaddu.vx v8, v8, a0, v0.t
+; RV64-NEXT: ret
+ %elt.head = insertelement <vscale x 8 x i64> poison, i64 %b, i32 0
+ %vb = shufflevector <vscale x 8 x i64> %elt.head, <vscale x 8 x i64> poison, <vscale x 8 x i32> zeroinitializer
+ %v = call <vscale x 8 x i64> @llvm.vp.uadd.sat.nxv8i64(<vscale x 8 x i64> %va, <vscale x 8 x i64> %vb, <vscale x 8 x i1> %m, i32 %evl)
+ ret <vscale x 8 x i64> %v
+}
+
+define <vscale x 8 x i64> @vsaddu_vx_nxv8i64_unmasked(<vscale x 8 x i64> %va, i64 %b, i32 zeroext %evl) {
+; RV32-LABEL: vsaddu_vx_nxv8i64_unmasked:
+; RV32: # %bb.0:
+; RV32-NEXT: addi sp, sp, -16
+; RV32-NEXT: .cfi_def_cfa_offset 16
+; RV32-NEXT: sw a1, 12(sp)
+; RV32-NEXT: sw a0, 8(sp)
+; RV32-NEXT: addi a0, sp, 8
+; RV32-NEXT: vsetvli a1, zero, e64, m8, ta, ma
+; RV32-NEXT: vlse64.v v16, (a0), zero
+; RV32-NEXT: vsetvli zero, a2, e64, m8, ta, ma
+; RV32-NEXT: vsaddu.vv v8, v8, v16
+; RV32-NEXT: addi sp, sp, 16
+; RV32-NEXT: ret
+;
+; RV64-LABEL: vsaddu_vx_nxv8i64_unmasked:
+; RV64: # %bb.0:
+; RV64-NEXT: vsetvli zero, a1, e64, m8, ta, ma
+; RV64-NEXT: vsaddu.vx v8, v8, a0
+; RV64-NEXT: ret
+ %elt.head = insertelement <vscale x 8 x i64> poison, i64 %b, i32 0
+ %vb = shufflevector <vscale x 8 x i64> %elt.head, <vscale x 8 x i64> poison, <vscale x 8 x i32> zeroinitializer
+ %head = insertelement <vscale x 8 x i1> poison, i1 true, i32 0
+ %m = shufflevector <vscale x 8 x i1> %head, <vscale x 8 x i1> poison, <vscale x 8 x i32> zeroinitializer
+ %v = call <vscale x 8 x i64> @llvm.vp.uadd.sat.nxv8i64(<vscale x 8 x i64> %va, <vscale x 8 x i64> %vb, <vscale x 8 x i1> %m, i32 %evl)
+ ret <vscale x 8 x i64> %v
+}
+
+define <vscale x 8 x i64> @vsaddu_vi_nxv8i64(<vscale x 8 x i64> %va, <vscale x 8 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vsaddu_vi_nxv8i64:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma
+; CHECK-NEXT: vsaddu.vi v8, v8, -1, v0.t
+; CHECK-NEXT: ret
+ %elt.head = insertelement <vscale x 8 x i64> poison, i64 -1, i32 0
+ %vb = shufflevector <vscale x 8 x i64> %elt.head, <vscale x 8 x i64> poison, <vscale x 8 x i32> zeroinitializer
+ %v = call <vscale x 8 x i64> @llvm.vp.uadd.sat.nxv8i64(<vscale x 8 x i64> %va, <vscale x 8 x i64> %vb, <vscale x 8 x i1> %m, i32 %evl)
+ ret <vscale x 8 x i64> %v
+}
+
+define <vscale x 8 x i64> @vsaddu_vi_nxv8i64_unmasked(<vscale x 8 x i64> %va, i32 zeroext %evl) {
+; CHECK-LABEL: vsaddu_vi_nxv8i64_unmasked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma
+; CHECK-NEXT: vsaddu.vi v8, v8, -1
+; CHECK-NEXT: ret
+ %elt.head = insertelement <vscale x 8 x i64> poison, i64 -1, i32 0
+ %vb = shufflevector <vscale x 8 x i64> %elt.head, <vscale x 8 x i64> poison, <vscale x 8 x i32> zeroinitializer
+ %head = insertelement <vscale x 8 x i1> poison, i1 true, i32 0
+ %m = shufflevector <vscale x 8 x i1> %head, <vscale x 8 x i1> poison, <vscale x 8 x i32> zeroinitializer
+ %v = call <vscale x 8 x i64> @llvm.vp.uadd.sat.nxv8i64(<vscale x 8 x i64> %va, <vscale x 8 x i64> %vb, <vscale x 8 x i1> %m, i32 %evl)
+ ret <vscale x 8 x i64> %v
+}
diff --git a/llvm/test/CodeGen/RISCV/rvv/vssub-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vssub-vp.ll
new file mode 100644
index 0000000..2d51a2e
--- /dev/null
+++ b/llvm/test/CodeGen/RISCV/rvv/vssub-vp.ll
@@ -0,0 +1,2067 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=riscv32 -mattr=+v -verify-machineinstrs < %s \
+; RUN: | FileCheck %s --check-prefixes=CHECK,RV32
+; RUN: llc -mtriple=riscv64 -mattr=+v -verify-machineinstrs < %s \
+; RUN: | FileCheck %s --check-prefixes=CHECK,RV64
+
+declare <vscale x 8 x i7> @llvm.vp.ssub.sat.nxv8i7(<vscale x 8 x i7>, <vscale x 8 x i7>, <vscale x 8 x i1>, i32)
+
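+; (Informal reading of the CHECK lines below: i7 is not a legal element type,
+; so the vector operand is sign-extended to i8 via a shift-left/arithmetic
+; shift-right pair, a plain vsub is performed, and the result is clamped to
+; the signed i7 range [-64, 63]; the 192 immediate is -64 as an i8.)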
+define <vscale x 8 x i7> @vssub_vx_nxv8i7(<vscale x 8 x i7> %a, i7 signext %b, <vscale x 8 x i1> %mask, i32 zeroext %evl) {
+; CHECK-LABEL: vssub_vx_nxv8i7:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a2, zero, e8, m1, ta, ma
+; CHECK-NEXT: vadd.vv v8, v8, v8
+; CHECK-NEXT: vsra.vi v8, v8, 1
+; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, ma
+; CHECK-NEXT: vsub.vx v8, v8, a0, v0.t
+; CHECK-NEXT: li a0, 63
+; CHECK-NEXT: vmin.vx v8, v8, a0, v0.t
+; CHECK-NEXT: li a0, 192
+; CHECK-NEXT: vmax.vx v8, v8, a0, v0.t
+; CHECK-NEXT: ret
+ %elt.head = insertelement <vscale x 8 x i7> poison, i7 %b, i32 0
+ %vb = shufflevector <vscale x 8 x i7> %elt.head, <vscale x 8 x i7> poison, <vscale x 8 x i32> zeroinitializer
+ %v = call <vscale x 8 x i7> @llvm.vp.ssub.sat.nxv8i7(<vscale x 8 x i7> %a, <vscale x 8 x i7> %vb, <vscale x 8 x i1> %mask, i32 %evl)
+ ret <vscale x 8 x i7> %v
+}
+
+declare <vscale x 1 x i8> @llvm.vp.ssub.sat.nxv1i8(<vscale x 1 x i8>, <vscale x 1 x i8>, <vscale x 1 x i1>, i32)
+
+define <vscale x 1 x i8> @vssub_vv_nxv1i8(<vscale x 1 x i8> %va, <vscale x 1 x i8> %b, <vscale x 1 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vssub_vv_nxv1i8:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e8, mf8, ta, ma
+; CHECK-NEXT: vssub.vv v8, v8, v9, v0.t
+; CHECK-NEXT: ret
+ %v = call <vscale x 1 x i8> @llvm.vp.ssub.sat.nxv1i8(<vscale x 1 x i8> %va, <vscale x 1 x i8> %b, <vscale x 1 x i1> %m, i32 %evl)
+ ret <vscale x 1 x i8> %v
+}
+
+define <vscale x 1 x i8> @vssub_vv_nxv1i8_unmasked(<vscale x 1 x i8> %va, <vscale x 1 x i8> %b, i32 zeroext %evl) {
+; CHECK-LABEL: vssub_vv_nxv1i8_unmasked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e8, mf8, ta, ma
+; CHECK-NEXT: vssub.vv v8, v8, v9
+; CHECK-NEXT: ret
+ %head = insertelement <vscale x 1 x i1> poison, i1 true, i32 0
+ %m = shufflevector <vscale x 1 x i1> %head, <vscale x 1 x i1> poison, <vscale x 1 x i32> zeroinitializer
+ %v = call <vscale x 1 x i8> @llvm.vp.ssub.sat.nxv1i8(<vscale x 1 x i8> %va, <vscale x 1 x i8> %b, <vscale x 1 x i1> %m, i32 %evl)
+ ret <vscale x 1 x i8> %v
+}
+
+define <vscale x 1 x i8> @vssub_vx_nxv1i8(<vscale x 1 x i8> %va, i8 %b, <vscale x 1 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vssub_vx_nxv1i8:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, ma
+; CHECK-NEXT: vssub.vx v8, v8, a0, v0.t
+; CHECK-NEXT: ret
+ %elt.head = insertelement <vscale x 1 x i8> poison, i8 %b, i32 0
+ %vb = shufflevector <vscale x 1 x i8> %elt.head, <vscale x 1 x i8> poison, <vscale x 1 x i32> zeroinitializer
+ %v = call <vscale x 1 x i8> @llvm.vp.ssub.sat.nxv1i8(<vscale x 1 x i8> %va, <vscale x 1 x i8> %vb, <vscale x 1 x i1> %m, i32 %evl)
+ ret <vscale x 1 x i8> %v
+}
+
+define <vscale x 1 x i8> @vssub_vx_nxv1i8_commute(<vscale x 1 x i8> %va, i8 %b, <vscale x 1 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vssub_vx_nxv1i8_commute:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a2, zero, e8, mf8, ta, ma
+; CHECK-NEXT: vmv.v.x v9, a0
+; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, ma
+; CHECK-NEXT: vssub.vv v8, v9, v8, v0.t
+; CHECK-NEXT: ret
+ %elt.head = insertelement <vscale x 1 x i8> poison, i8 %b, i32 0
+ %vb = shufflevector <vscale x 1 x i8> %elt.head, <vscale x 1 x i8> poison, <vscale x 1 x i32> zeroinitializer
+ %v = call <vscale x 1 x i8> @llvm.vp.ssub.sat.nxv1i8(<vscale x 1 x i8> %vb, <vscale x 1 x i8> %va, <vscale x 1 x i1> %m, i32 %evl)
+ ret <vscale x 1 x i8> %v
+}
+
+define <vscale x 1 x i8> @vssub_vx_nxv1i8_unmasked(<vscale x 1 x i8> %va, i8 %b, i32 zeroext %evl) {
+; CHECK-LABEL: vssub_vx_nxv1i8_unmasked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, ma
+; CHECK-NEXT: vssub.vx v8, v8, a0
+; CHECK-NEXT: ret
+ %elt.head = insertelement <vscale x 1 x i8> poison, i8 %b, i32 0
+ %vb = shufflevector <vscale x 1 x i8> %elt.head, <vscale x 1 x i8> poison, <vscale x 1 x i32> zeroinitializer
+ %head = insertelement <vscale x 1 x i1> poison, i1 true, i32 0
+ %m = shufflevector <vscale x 1 x i1> %head, <vscale x 1 x i1> poison, <vscale x 1 x i32> zeroinitializer
+ %v = call <vscale x 1 x i8> @llvm.vp.ssub.sat.nxv1i8(<vscale x 1 x i8> %va, <vscale x 1 x i8> %vb, <vscale x 1 x i1> %m, i32 %evl)
+ ret <vscale x 1 x i8> %v
+}
+
+define <vscale x 1 x i8> @vssub_vi_nxv1i8(<vscale x 1 x i8> %va, <vscale x 1 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vssub_vi_nxv1i8:
+; CHECK: # %bb.0:
+; CHECK-NEXT: li a1, -1
+; CHECK-NEXT: vsetvli zero, a0, e8, mf8, ta, ma
+; CHECK-NEXT: vssub.vx v8, v8, a1, v0.t
+; CHECK-NEXT: ret
+ %elt.head = insertelement <vscale x 1 x i8> poison, i8 -1, i32 0
+ %vb = shufflevector <vscale x 1 x i8> %elt.head, <vscale x 1 x i8> poison, <vscale x 1 x i32> zeroinitializer
+ %v = call <vscale x 1 x i8> @llvm.vp.ssub.sat.nxv1i8(<vscale x 1 x i8> %va, <vscale x 1 x i8> %vb, <vscale x 1 x i1> %m, i32 %evl)
+ ret <vscale x 1 x i8> %v
+}
+
+define <vscale x 1 x i8> @vssub_vi_nxv1i8_unmasked(<vscale x 1 x i8> %va, i32 zeroext %evl) {
+; CHECK-LABEL: vssub_vi_nxv1i8_unmasked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: li a1, -1
+; CHECK-NEXT: vsetvli zero, a0, e8, mf8, ta, ma
+; CHECK-NEXT: vssub.vx v8, v8, a1
+; CHECK-NEXT: ret
+ %elt.head = insertelement <vscale x 1 x i8> poison, i8 -1, i32 0
+ %vb = shufflevector <vscale x 1 x i8> %elt.head, <vscale x 1 x i8> poison, <vscale x 1 x i32> zeroinitializer
+ %head = insertelement <vscale x 1 x i1> poison, i1 true, i32 0
+ %m = shufflevector <vscale x 1 x i1> %head, <vscale x 1 x i1> poison, <vscale x 1 x i32> zeroinitializer
+ %v = call <vscale x 1 x i8> @llvm.vp.ssub.sat.nxv1i8(<vscale x 1 x i8> %va, <vscale x 1 x i8> %vb, <vscale x 1 x i1> %m, i32 %evl)
+ ret <vscale x 1 x i8> %v
+}
+
+declare <vscale x 2 x i8> @llvm.vp.ssub.sat.nxv2i8(<vscale x 2 x i8>, <vscale x 2 x i8>, <vscale x 2 x i1>, i32)
+
+define <vscale x 2 x i8> @vssub_vv_nxv2i8(<vscale x 2 x i8> %va, <vscale x 2 x i8> %b, <vscale x 2 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vssub_vv_nxv2i8:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e8, mf4, ta, ma
+; CHECK-NEXT: vssub.vv v8, v8, v9, v0.t
+; CHECK-NEXT: ret
+ %v = call <vscale x 2 x i8> @llvm.vp.ssub.sat.nxv2i8(<vscale x 2 x i8> %va, <vscale x 2 x i8> %b, <vscale x 2 x i1> %m, i32 %evl)
+ ret <vscale x 2 x i8> %v
+}
+
+define <vscale x 2 x i8> @vssub_vv_nxv2i8_unmasked(<vscale x 2 x i8> %va, <vscale x 2 x i8> %b, i32 zeroext %evl) {
+; CHECK-LABEL: vssub_vv_nxv2i8_unmasked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e8, mf4, ta, ma
+; CHECK-NEXT: vssub.vv v8, v8, v9
+; CHECK-NEXT: ret
+ %head = insertelement <vscale x 2 x i1> poison, i1 true, i32 0
+ %m = shufflevector <vscale x 2 x i1> %head, <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer
+ %v = call <vscale x 2 x i8> @llvm.vp.ssub.sat.nxv2i8(<vscale x 2 x i8> %va, <vscale x 2 x i8> %b, <vscale x 2 x i1> %m, i32 %evl)
+ ret <vscale x 2 x i8> %v
+}
+
+define <vscale x 2 x i8> @vssub_vx_nxv2i8(<vscale x 2 x i8> %va, i8 %b, <vscale x 2 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vssub_vx_nxv2i8:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, ma
+; CHECK-NEXT: vssub.vx v8, v8, a0, v0.t
+; CHECK-NEXT: ret
+ %elt.head = insertelement <vscale x 2 x i8> poison, i8 %b, i32 0
+ %vb = shufflevector <vscale x 2 x i8> %elt.head, <vscale x 2 x i8> poison, <vscale x 2 x i32> zeroinitializer
+ %v = call <vscale x 2 x i8> @llvm.vp.ssub.sat.nxv2i8(<vscale x 2 x i8> %va, <vscale x 2 x i8> %vb, <vscale x 2 x i1> %m, i32 %evl)
+ ret <vscale x 2 x i8> %v
+}
+
+define <vscale x 2 x i8> @vssub_vx_nxv2i8_unmasked(<vscale x 2 x i8> %va, i8 %b, i32 zeroext %evl) {
+; CHECK-LABEL: vssub_vx_nxv2i8_unmasked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, ma
+; CHECK-NEXT: vssub.vx v8, v8, a0
+; CHECK-NEXT: ret
+ %elt.head = insertelement <vscale x 2 x i8> poison, i8 %b, i32 0
+ %vb = shufflevector <vscale x 2 x i8> %elt.head, <vscale x 2 x i8> poison, <vscale x 2 x i32> zeroinitializer
+ %head = insertelement <vscale x 2 x i1> poison, i1 true, i32 0
+ %m = shufflevector <vscale x 2 x i1> %head, <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer
+ %v = call <vscale x 2 x i8> @llvm.vp.ssub.sat.nxv2i8(<vscale x 2 x i8> %va, <vscale x 2 x i8> %vb, <vscale x 2 x i1> %m, i32 %evl)
+ ret <vscale x 2 x i8> %v
+}
+
+define <vscale x 2 x i8> @vssub_vi_nxv2i8(<vscale x 2 x i8> %va, <vscale x 2 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vssub_vi_nxv2i8:
+; CHECK: # %bb.0:
+; CHECK-NEXT: li a1, -1
+; CHECK-NEXT: vsetvli zero, a0, e8, mf4, ta, ma
+; CHECK-NEXT: vssub.vx v8, v8, a1, v0.t
+; CHECK-NEXT: ret
+ %elt.head = insertelement <vscale x 2 x i8> poison, i8 -1, i32 0
+ %vb = shufflevector <vscale x 2 x i8> %elt.head, <vscale x 2 x i8> poison, <vscale x 2 x i32> zeroinitializer
+ %v = call <vscale x 2 x i8> @llvm.vp.ssub.sat.nxv2i8(<vscale x 2 x i8> %va, <vscale x 2 x i8> %vb, <vscale x 2 x i1> %m, i32 %evl)
+ ret <vscale x 2 x i8> %v
+}
+
+define <vscale x 2 x i8> @vssub_vi_nxv2i8_unmasked(<vscale x 2 x i8> %va, i32 zeroext %evl) {
+; CHECK-LABEL: vssub_vi_nxv2i8_unmasked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: li a1, -1
+; CHECK-NEXT: vsetvli zero, a0, e8, mf4, ta, ma
+; CHECK-NEXT: vssub.vx v8, v8, a1
+; CHECK-NEXT: ret
+ %elt.head = insertelement <vscale x 2 x i8> poison, i8 -1, i32 0
+ %vb = shufflevector <vscale x 2 x i8> %elt.head, <vscale x 2 x i8> poison, <vscale x 2 x i32> zeroinitializer
+ %head = insertelement <vscale x 2 x i1> poison, i1 true, i32 0
+ %m = shufflevector <vscale x 2 x i1> %head, <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer
+ %v = call <vscale x 2 x i8> @llvm.vp.ssub.sat.nxv2i8(<vscale x 2 x i8> %va, <vscale x 2 x i8> %vb, <vscale x 2 x i1> %m, i32 %evl)
+ ret <vscale x 2 x i8> %v
+}
+
+declare <vscale x 3 x i8> @llvm.vp.ssub.sat.nxv3i8(<vscale x 3 x i8>, <vscale x 3 x i8>, <vscale x 3 x i1>, i32)
+
+define <vscale x 3 x i8> @vssub_vv_nxv3i8(<vscale x 3 x i8> %va, <vscale x 3 x i8> %b, <vscale x 3 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vssub_vv_nxv3i8:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e8, mf2, ta, ma
+; CHECK-NEXT: vssub.vv v8, v8, v9, v0.t
+; CHECK-NEXT: ret
+ %v = call <vscale x 3 x i8> @llvm.vp.ssub.sat.nxv3i8(<vscale x 3 x i8> %va, <vscale x 3 x i8> %b, <vscale x 3 x i1> %m, i32 %evl)
+ ret <vscale x 3 x i8> %v
+}
+
+define <vscale x 3 x i8> @vssub_vv_nxv3i8_unmasked(<vscale x 3 x i8> %va, <vscale x 3 x i8> %b, i32 zeroext %evl) {
+; CHECK-LABEL: vssub_vv_nxv3i8_unmasked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e8, mf2, ta, ma
+; CHECK-NEXT: vssub.vv v8, v8, v9
+; CHECK-NEXT: ret
+ %head = insertelement <vscale x 3 x i1> poison, i1 true, i32 0
+ %m = shufflevector <vscale x 3 x i1> %head, <vscale x 3 x i1> poison, <vscale x 3 x i32> zeroinitializer
+ %v = call <vscale x 3 x i8> @llvm.vp.ssub.sat.nxv3i8(<vscale x 3 x i8> %va, <vscale x 3 x i8> %b, <vscale x 3 x i1> %m, i32 %evl)
+ ret <vscale x 3 x i8> %v
+}
+
+define <vscale x 3 x i8> @vssub_vx_nxv3i8(<vscale x 3 x i8> %va, i8 %b, <vscale x 3 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vssub_vx_nxv3i8:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma
+; CHECK-NEXT: vssub.vx v8, v8, a0, v0.t
+; CHECK-NEXT: ret
+ %elt.head = insertelement <vscale x 3 x i8> poison, i8 %b, i32 0
+ %vb = shufflevector <vscale x 3 x i8> %elt.head, <vscale x 3 x i8> poison, <vscale x 3 x i32> zeroinitializer
+ %v = call <vscale x 3 x i8> @llvm.vp.ssub.sat.nxv3i8(<vscale x 3 x i8> %va, <vscale x 3 x i8> %vb, <vscale x 3 x i1> %m, i32 %evl)
+ ret <vscale x 3 x i8> %v
+}
+
+define <vscale x 3 x i8> @vssub_vx_nxv3i8_unmasked(<vscale x 3 x i8> %va, i8 %b, i32 zeroext %evl) {
+; CHECK-LABEL: vssub_vx_nxv3i8_unmasked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma
+; CHECK-NEXT: vssub.vx v8, v8, a0
+; CHECK-NEXT: ret
+ %elt.head = insertelement <vscale x 3 x i8> poison, i8 %b, i32 0
+ %vb = shufflevector <vscale x 3 x i8> %elt.head, <vscale x 3 x i8> poison, <vscale x 3 x i32> zeroinitializer
+ %head = insertelement <vscale x 3 x i1> poison, i1 true, i32 0
+ %m = shufflevector <vscale x 3 x i1> %head, <vscale x 3 x i1> poison, <vscale x 3 x i32> zeroinitializer
+ %v = call <vscale x 3 x i8> @llvm.vp.ssub.sat.nxv3i8(<vscale x 3 x i8> %va, <vscale x 3 x i8> %vb, <vscale x 3 x i1> %m, i32 %evl)
+ ret <vscale x 3 x i8> %v
+}
+
+define <vscale x 3 x i8> @vssub_vi_nxv3i8(<vscale x 3 x i8> %va, <vscale x 3 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vssub_vi_nxv3i8:
+; CHECK: # %bb.0:
+; CHECK-NEXT: li a1, -1
+; CHECK-NEXT: vsetvli zero, a0, e8, mf2, ta, ma
+; CHECK-NEXT: vssub.vx v8, v8, a1, v0.t
+; CHECK-NEXT: ret
+ %elt.head = insertelement <vscale x 3 x i8> poison, i8 -1, i32 0
+ %vb = shufflevector <vscale x 3 x i8> %elt.head, <vscale x 3 x i8> poison, <vscale x 3 x i32> zeroinitializer
+ %v = call <vscale x 3 x i8> @llvm.vp.ssub.sat.nxv3i8(<vscale x 3 x i8> %va, <vscale x 3 x i8> %vb, <vscale x 3 x i1> %m, i32 %evl)
+ ret <vscale x 3 x i8> %v
+}
+
+define <vscale x 3 x i8> @vssub_vi_nxv3i8_unmasked(<vscale x 3 x i8> %va, i32 zeroext %evl) {
+; CHECK-LABEL: vssub_vi_nxv3i8_unmasked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: li a1, -1
+; CHECK-NEXT: vsetvli zero, a0, e8, mf2, ta, ma
+; CHECK-NEXT: vssub.vx v8, v8, a1
+; CHECK-NEXT: ret
+ %elt.head = insertelement <vscale x 3 x i8> poison, i8 -1, i32 0
+ %vb = shufflevector <vscale x 3 x i8> %elt.head, <vscale x 3 x i8> poison, <vscale x 3 x i32> zeroinitializer
+ %head = insertelement <vscale x 3 x i1> poison, i1 true, i32 0
+ %m = shufflevector <vscale x 3 x i1> %head, <vscale x 3 x i1> poison, <vscale x 3 x i32> zeroinitializer
+ %v = call <vscale x 3 x i8> @llvm.vp.ssub.sat.nxv3i8(<vscale x 3 x i8> %va, <vscale x 3 x i8> %vb, <vscale x 3 x i1> %m, i32 %evl)
+ ret <vscale x 3 x i8> %v
+}
+
+declare <vscale x 4 x i8> @llvm.vp.ssub.sat.nxv4i8(<vscale x 4 x i8>, <vscale x 4 x i8>, <vscale x 4 x i1>, i32)
+
+define <vscale x 4 x i8> @vssub_vv_nxv4i8(<vscale x 4 x i8> %va, <vscale x 4 x i8> %b, <vscale x 4 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vssub_vv_nxv4i8:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e8, mf2, ta, ma
+; CHECK-NEXT: vssub.vv v8, v8, v9, v0.t
+; CHECK-NEXT: ret
+ %v = call <vscale x 4 x i8> @llvm.vp.ssub.sat.nxv4i8(<vscale x 4 x i8> %va, <vscale x 4 x i8> %b, <vscale x 4 x i1> %m, i32 %evl)
+ ret <vscale x 4 x i8> %v
+}
+
+define <vscale x 4 x i8> @vssub_vv_nxv4i8_unmasked(<vscale x 4 x i8> %va, <vscale x 4 x i8> %b, i32 zeroext %evl) {
+; CHECK-LABEL: vssub_vv_nxv4i8_unmasked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e8, mf2, ta, ma
+; CHECK-NEXT: vssub.vv v8, v8, v9
+; CHECK-NEXT: ret
+ %head = insertelement <vscale x 4 x i1> poison, i1 true, i32 0
+ %m = shufflevector <vscale x 4 x i1> %head, <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer
+ %v = call <vscale x 4 x i8> @llvm.vp.ssub.sat.nxv4i8(<vscale x 4 x i8> %va, <vscale x 4 x i8> %b, <vscale x 4 x i1> %m, i32 %evl)
+ ret <vscale x 4 x i8> %v
+}
+
+define <vscale x 4 x i8> @vssub_vx_nxv4i8(<vscale x 4 x i8> %va, i8 %b, <vscale x 4 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vssub_vx_nxv4i8:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma
+; CHECK-NEXT: vssub.vx v8, v8, a0, v0.t
+; CHECK-NEXT: ret
+ %elt.head = insertelement <vscale x 4 x i8> poison, i8 %b, i32 0
+ %vb = shufflevector <vscale x 4 x i8> %elt.head, <vscale x 4 x i8> poison, <vscale x 4 x i32> zeroinitializer
+ %v = call <vscale x 4 x i8> @llvm.vp.ssub.sat.nxv4i8(<vscale x 4 x i8> %va, <vscale x 4 x i8> %vb, <vscale x 4 x i1> %m, i32 %evl)
+ ret <vscale x 4 x i8> %v
+}
+
+define <vscale x 4 x i8> @vssub_vx_nxv4i8_unmasked(<vscale x 4 x i8> %va, i8 %b, i32 zeroext %evl) {
+; CHECK-LABEL: vssub_vx_nxv4i8_unmasked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma
+; CHECK-NEXT: vssub.vx v8, v8, a0
+; CHECK-NEXT: ret
+ %elt.head = insertelement <vscale x 4 x i8> poison, i8 %b, i32 0
+ %vb = shufflevector <vscale x 4 x i8> %elt.head, <vscale x 4 x i8> poison, <vscale x 4 x i32> zeroinitializer
+ %head = insertelement <vscale x 4 x i1> poison, i1 true, i32 0
+ %m = shufflevector <vscale x 4 x i1> %head, <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer
+ %v = call <vscale x 4 x i8> @llvm.vp.ssub.sat.nxv4i8(<vscale x 4 x i8> %va, <vscale x 4 x i8> %vb, <vscale x 4 x i1> %m, i32 %evl)
+ ret <vscale x 4 x i8> %v
+}
+
+define <vscale x 4 x i8> @vssub_vi_nxv4i8(<vscale x 4 x i8> %va, <vscale x 4 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vssub_vi_nxv4i8:
+; CHECK: # %bb.0:
+; CHECK-NEXT: li a1, -1
+; CHECK-NEXT: vsetvli zero, a0, e8, mf2, ta, ma
+; CHECK-NEXT: vssub.vx v8, v8, a1, v0.t
+; CHECK-NEXT: ret
+ %elt.head = insertelement <vscale x 4 x i8> poison, i8 -1, i32 0
+ %vb = shufflevector <vscale x 4 x i8> %elt.head, <vscale x 4 x i8> poison, <vscale x 4 x i32> zeroinitializer
+ %v = call <vscale x 4 x i8> @llvm.vp.ssub.sat.nxv4i8(<vscale x 4 x i8> %va, <vscale x 4 x i8> %vb, <vscale x 4 x i1> %m, i32 %evl)
+ ret <vscale x 4 x i8> %v
+}
+
+define <vscale x 4 x i8> @vssub_vi_nxv4i8_unmasked(<vscale x 4 x i8> %va, i32 zeroext %evl) {
+; CHECK-LABEL: vssub_vi_nxv4i8_unmasked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: li a1, -1
+; CHECK-NEXT: vsetvli zero, a0, e8, mf2, ta, ma
+; CHECK-NEXT: vssub.vx v8, v8, a1
+; CHECK-NEXT: ret
+ %elt.head = insertelement <vscale x 4 x i8> poison, i8 -1, i32 0
+ %vb = shufflevector <vscale x 4 x i8> %elt.head, <vscale x 4 x i8> poison, <vscale x 4 x i32> zeroinitializer
+ %head = insertelement <vscale x 4 x i1> poison, i1 true, i32 0
+ %m = shufflevector <vscale x 4 x i1> %head, <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer
+ %v = call <vscale x 4 x i8> @llvm.vp.ssub.sat.nxv4i8(<vscale x 4 x i8> %va, <vscale x 4 x i8> %vb, <vscale x 4 x i1> %m, i32 %evl)
+ ret <vscale x 4 x i8> %v
+}
+
+declare <vscale x 8 x i8> @llvm.vp.ssub.sat.nxv8i8(<vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i1>, i32)
+
+define <vscale x 8 x i8> @vssub_vv_nxv8i8(<vscale x 8 x i8> %va, <vscale x 8 x i8> %b, <vscale x 8 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vssub_vv_nxv8i8:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e8, m1, ta, ma
+; CHECK-NEXT: vssub.vv v8, v8, v9, v0.t
+; CHECK-NEXT: ret
+ %v = call <vscale x 8 x i8> @llvm.vp.ssub.sat.nxv8i8(<vscale x 8 x i8> %va, <vscale x 8 x i8> %b, <vscale x 8 x i1> %m, i32 %evl)
+ ret <vscale x 8 x i8> %v
+}
+
+define <vscale x 8 x i8> @vssub_vv_nxv8i8_unmasked(<vscale x 8 x i8> %va, <vscale x 8 x i8> %b, i32 zeroext %evl) {
+; CHECK-LABEL: vssub_vv_nxv8i8_unmasked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e8, m1, ta, ma
+; CHECK-NEXT: vssub.vv v8, v8, v9
+; CHECK-NEXT: ret
+ %head = insertelement <vscale x 8 x i1> poison, i1 true, i32 0
+ %m = shufflevector <vscale x 8 x i1> %head, <vscale x 8 x i1> poison, <vscale x 8 x i32> zeroinitializer
+ %v = call <vscale x 8 x i8> @llvm.vp.ssub.sat.nxv8i8(<vscale x 8 x i8> %va, <vscale x 8 x i8> %b, <vscale x 8 x i1> %m, i32 %evl)
+ ret <vscale x 8 x i8> %v
+}
+
+define <vscale x 8 x i8> @vssub_vx_nxv8i8(<vscale x 8 x i8> %va, i8 %b, <vscale x 8 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vssub_vx_nxv8i8:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, ma
+; CHECK-NEXT: vssub.vx v8, v8, a0, v0.t
+; CHECK-NEXT: ret
+ %elt.head = insertelement <vscale x 8 x i8> poison, i8 %b, i32 0
+ %vb = shufflevector <vscale x 8 x i8> %elt.head, <vscale x 8 x i8> poison, <vscale x 8 x i32> zeroinitializer
+ %v = call <vscale x 8 x i8> @llvm.vp.ssub.sat.nxv8i8(<vscale x 8 x i8> %va, <vscale x 8 x i8> %vb, <vscale x 8 x i1> %m, i32 %evl)
+ ret <vscale x 8 x i8> %v
+}
+
+define <vscale x 8 x i8> @vssub_vx_nxv8i8_unmasked(<vscale x 8 x i8> %va, i8 %b, i32 zeroext %evl) {
+; CHECK-LABEL: vssub_vx_nxv8i8_unmasked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, ma
+; CHECK-NEXT: vssub.vx v8, v8, a0
+; CHECK-NEXT: ret
+ %elt.head = insertelement <vscale x 8 x i8> poison, i8 %b, i32 0
+ %vb = shufflevector <vscale x 8 x i8> %elt.head, <vscale x 8 x i8> poison, <vscale x 8 x i32> zeroinitializer
+ %head = insertelement <vscale x 8 x i1> poison, i1 true, i32 0
+ %m = shufflevector <vscale x 8 x i1> %head, <vscale x 8 x i1> poison, <vscale x 8 x i32> zeroinitializer
+ %v = call <vscale x 8 x i8> @llvm.vp.ssub.sat.nxv8i8(<vscale x 8 x i8> %va, <vscale x 8 x i8> %vb, <vscale x 8 x i1> %m, i32 %evl)
+ ret <vscale x 8 x i8> %v
+}
+
+define <vscale x 8 x i8> @vssub_vi_nxv8i8(<vscale x 8 x i8> %va, <vscale x 8 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vssub_vi_nxv8i8:
+; CHECK: # %bb.0:
+; CHECK-NEXT: li a1, -1
+; CHECK-NEXT: vsetvli zero, a0, e8, m1, ta, ma
+; CHECK-NEXT: vssub.vx v8, v8, a1, v0.t
+; CHECK-NEXT: ret
+ %elt.head = insertelement <vscale x 8 x i8> poison, i8 -1, i32 0
+ %vb = shufflevector <vscale x 8 x i8> %elt.head, <vscale x 8 x i8> poison, <vscale x 8 x i32> zeroinitializer
+ %v = call <vscale x 8 x i8> @llvm.vp.ssub.sat.nxv8i8(<vscale x 8 x i8> %va, <vscale x 8 x i8> %vb, <vscale x 8 x i1> %m, i32 %evl)
+ ret <vscale x 8 x i8> %v
+}
+
+define <vscale x 8 x i8> @vssub_vi_nxv8i8_unmasked(<vscale x 8 x i8> %va, i32 zeroext %evl) {
+; CHECK-LABEL: vssub_vi_nxv8i8_unmasked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: li a1, -1
+; CHECK-NEXT: vsetvli zero, a0, e8, m1, ta, ma
+; CHECK-NEXT: vssub.vx v8, v8, a1
+; CHECK-NEXT: ret
+ %elt.head = insertelement <vscale x 8 x i8> poison, i8 -1, i32 0
+ %vb = shufflevector <vscale x 8 x i8> %elt.head, <vscale x 8 x i8> poison, <vscale x 8 x i32> zeroinitializer
+ %head = insertelement <vscale x 8 x i1> poison, i1 true, i32 0
+ %m = shufflevector <vscale x 8 x i1> %head, <vscale x 8 x i1> poison, <vscale x 8 x i32> zeroinitializer
+ %v = call <vscale x 8 x i8> @llvm.vp.ssub.sat.nxv8i8(<vscale x 8 x i8> %va, <vscale x 8 x i8> %vb, <vscale x 8 x i1> %m, i32 %evl)
+ ret <vscale x 8 x i8> %v
+}
+
+declare <vscale x 16 x i8> @llvm.vp.ssub.sat.nxv16i8(<vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i1>, i32)
+
+define <vscale x 16 x i8> @vssub_vv_nxv16i8(<vscale x 16 x i8> %va, <vscale x 16 x i8> %b, <vscale x 16 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vssub_vv_nxv16i8:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e8, m2, ta, ma
+; CHECK-NEXT: vssub.vv v8, v8, v10, v0.t
+; CHECK-NEXT: ret
+ %v = call <vscale x 16 x i8> @llvm.vp.ssub.sat.nxv16i8(<vscale x 16 x i8> %va, <vscale x 16 x i8> %b, <vscale x 16 x i1> %m, i32 %evl)
+ ret <vscale x 16 x i8> %v
+}
+
+define <vscale x 16 x i8> @vssub_vv_nxv16i8_unmasked(<vscale x 16 x i8> %va, <vscale x 16 x i8> %b, i32 zeroext %evl) {
+; CHECK-LABEL: vssub_vv_nxv16i8_unmasked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e8, m2, ta, ma
+; CHECK-NEXT: vssub.vv v8, v8, v10
+; CHECK-NEXT: ret
+ %head = insertelement <vscale x 16 x i1> poison, i1 true, i32 0
+ %m = shufflevector <vscale x 16 x i1> %head, <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer
+ %v = call <vscale x 16 x i8> @llvm.vp.ssub.sat.nxv16i8(<vscale x 16 x i8> %va, <vscale x 16 x i8> %b, <vscale x 16 x i1> %m, i32 %evl)
+ ret <vscale x 16 x i8> %v
+}
+
+define <vscale x 16 x i8> @vssub_vx_nxv16i8(<vscale x 16 x i8> %va, i8 %b, <vscale x 16 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vssub_vx_nxv16i8:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e8, m2, ta, ma
+; CHECK-NEXT: vssub.vx v8, v8, a0, v0.t
+; CHECK-NEXT: ret
+ %elt.head = insertelement <vscale x 16 x i8> poison, i8 %b, i32 0
+ %vb = shufflevector <vscale x 16 x i8> %elt.head, <vscale x 16 x i8> poison, <vscale x 16 x i32> zeroinitializer
+ %v = call <vscale x 16 x i8> @llvm.vp.ssub.sat.nxv16i8(<vscale x 16 x i8> %va, <vscale x 16 x i8> %vb, <vscale x 16 x i1> %m, i32 %evl)
+ ret <vscale x 16 x i8> %v
+}
+
+define <vscale x 16 x i8> @vssub_vx_nxv16i8_unmasked(<vscale x 16 x i8> %va, i8 %b, i32 zeroext %evl) {
+; CHECK-LABEL: vssub_vx_nxv16i8_unmasked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e8, m2, ta, ma
+; CHECK-NEXT: vssub.vx v8, v8, a0
+; CHECK-NEXT: ret
+ %elt.head = insertelement <vscale x 16 x i8> poison, i8 %b, i32 0
+ %vb = shufflevector <vscale x 16 x i8> %elt.head, <vscale x 16 x i8> poison, <vscale x 16 x i32> zeroinitializer
+ %head = insertelement <vscale x 16 x i1> poison, i1 true, i32 0
+ %m = shufflevector <vscale x 16 x i1> %head, <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer
+ %v = call <vscale x 16 x i8> @llvm.vp.ssub.sat.nxv16i8(<vscale x 16 x i8> %va, <vscale x 16 x i8> %vb, <vscale x 16 x i1> %m, i32 %evl)
+ ret <vscale x 16 x i8> %v
+}
+
+define <vscale x 16 x i8> @vssub_vi_nxv16i8(<vscale x 16 x i8> %va, <vscale x 16 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vssub_vi_nxv16i8:
+; CHECK: # %bb.0:
+; CHECK-NEXT: li a1, -1
+; CHECK-NEXT: vsetvli zero, a0, e8, m2, ta, ma
+; CHECK-NEXT: vssub.vx v8, v8, a1, v0.t
+; CHECK-NEXT: ret
+ %elt.head = insertelement <vscale x 16 x i8> poison, i8 -1, i32 0
+ %vb = shufflevector <vscale x 16 x i8> %elt.head, <vscale x 16 x i8> poison, <vscale x 16 x i32> zeroinitializer
+ %v = call <vscale x 16 x i8> @llvm.vp.ssub.sat.nxv16i8(<vscale x 16 x i8> %va, <vscale x 16 x i8> %vb, <vscale x 16 x i1> %m, i32 %evl)
+ ret <vscale x 16 x i8> %v
+}
+
+define <vscale x 16 x i8> @vssub_vi_nxv16i8_unmasked(<vscale x 16 x i8> %va, i32 zeroext %evl) {
+; CHECK-LABEL: vssub_vi_nxv16i8_unmasked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: li a1, -1
+; CHECK-NEXT: vsetvli zero, a0, e8, m2, ta, ma
+; CHECK-NEXT: vssub.vx v8, v8, a1
+; CHECK-NEXT: ret
+ %elt.head = insertelement <vscale x 16 x i8> poison, i8 -1, i32 0
+ %vb = shufflevector <vscale x 16 x i8> %elt.head, <vscale x 16 x i8> poison, <vscale x 16 x i32> zeroinitializer
+ %head = insertelement <vscale x 16 x i1> poison, i1 true, i32 0
+ %m = shufflevector <vscale x 16 x i1> %head, <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer
+ %v = call <vscale x 16 x i8> @llvm.vp.ssub.sat.nxv16i8(<vscale x 16 x i8> %va, <vscale x 16 x i8> %vb, <vscale x 16 x i1> %m, i32 %evl)
+ ret <vscale x 16 x i8> %v
+}
+
+declare <vscale x 32 x i8> @llvm.vp.ssub.sat.nxv32i8(<vscale x 32 x i8>, <vscale x 32 x i8>, <vscale x 32 x i1>, i32)
+
+define <vscale x 32 x i8> @vssub_vv_nxv32i8(<vscale x 32 x i8> %va, <vscale x 32 x i8> %b, <vscale x 32 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vssub_vv_nxv32i8:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e8, m4, ta, ma
+; CHECK-NEXT: vssub.vv v8, v8, v12, v0.t
+; CHECK-NEXT: ret
+ %v = call <vscale x 32 x i8> @llvm.vp.ssub.sat.nxv32i8(<vscale x 32 x i8> %va, <vscale x 32 x i8> %b, <vscale x 32 x i1> %m, i32 %evl)
+ ret <vscale x 32 x i8> %v
+}
+
+define <vscale x 32 x i8> @vssub_vv_nxv32i8_unmasked(<vscale x 32 x i8> %va, <vscale x 32 x i8> %b, i32 zeroext %evl) {
+; CHECK-LABEL: vssub_vv_nxv32i8_unmasked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e8, m4, ta, ma
+; CHECK-NEXT: vssub.vv v8, v8, v12
+; CHECK-NEXT: ret
+ %head = insertelement <vscale x 32 x i1> poison, i1 true, i32 0
+ %m = shufflevector <vscale x 32 x i1> %head, <vscale x 32 x i1> poison, <vscale x 32 x i32> zeroinitializer
+ %v = call <vscale x 32 x i8> @llvm.vp.ssub.sat.nxv32i8(<vscale x 32 x i8> %va, <vscale x 32 x i8> %b, <vscale x 32 x i1> %m, i32 %evl)
+ ret <vscale x 32 x i8> %v
+}
+
+define <vscale x 32 x i8> @vssub_vx_nxv32i8(<vscale x 32 x i8> %va, i8 %b, <vscale x 32 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vssub_vx_nxv32i8:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e8, m4, ta, ma
+; CHECK-NEXT: vssub.vx v8, v8, a0, v0.t
+; CHECK-NEXT: ret
+ %elt.head = insertelement <vscale x 32 x i8> poison, i8 %b, i32 0
+ %vb = shufflevector <vscale x 32 x i8> %elt.head, <vscale x 32 x i8> poison, <vscale x 32 x i32> zeroinitializer
+ %v = call <vscale x 32 x i8> @llvm.vp.ssub.sat.nxv32i8(<vscale x 32 x i8> %va, <vscale x 32 x i8> %vb, <vscale x 32 x i1> %m, i32 %evl)
+ ret <vscale x 32 x i8> %v
+}
+
+define <vscale x 32 x i8> @vssub_vx_nxv32i8_unmasked(<vscale x 32 x i8> %va, i8 %b, i32 zeroext %evl) {
+; CHECK-LABEL: vssub_vx_nxv32i8_unmasked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e8, m4, ta, ma
+; CHECK-NEXT: vssub.vx v8, v8, a0
+; CHECK-NEXT: ret
+ %elt.head = insertelement <vscale x 32 x i8> poison, i8 %b, i32 0
+ %vb = shufflevector <vscale x 32 x i8> %elt.head, <vscale x 32 x i8> poison, <vscale x 32 x i32> zeroinitializer
+ %head = insertelement <vscale x 32 x i1> poison, i1 true, i32 0
+ %m = shufflevector <vscale x 32 x i1> %head, <vscale x 32 x i1> poison, <vscale x 32 x i32> zeroinitializer
+ %v = call <vscale x 32 x i8> @llvm.vp.ssub.sat.nxv32i8(<vscale x 32 x i8> %va, <vscale x 32 x i8> %vb, <vscale x 32 x i1> %m, i32 %evl)
+ ret <vscale x 32 x i8> %v
+}
+
+define <vscale x 32 x i8> @vssub_vi_nxv32i8(<vscale x 32 x i8> %va, <vscale x 32 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vssub_vi_nxv32i8:
+; CHECK: # %bb.0:
+; CHECK-NEXT: li a1, -1
+; CHECK-NEXT: vsetvli zero, a0, e8, m4, ta, ma
+; CHECK-NEXT: vssub.vx v8, v8, a1, v0.t
+; CHECK-NEXT: ret
+ %elt.head = insertelement <vscale x 32 x i8> poison, i8 -1, i32 0
+ %vb = shufflevector <vscale x 32 x i8> %elt.head, <vscale x 32 x i8> poison, <vscale x 32 x i32> zeroinitializer
+ %v = call <vscale x 32 x i8> @llvm.vp.ssub.sat.nxv32i8(<vscale x 32 x i8> %va, <vscale x 32 x i8> %vb, <vscale x 32 x i1> %m, i32 %evl)
+ ret <vscale x 32 x i8> %v
+}
+
+define <vscale x 32 x i8> @vssub_vi_nxv32i8_unmasked(<vscale x 32 x i8> %va, i32 zeroext %evl) {
+; CHECK-LABEL: vssub_vi_nxv32i8_unmasked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: li a1, -1
+; CHECK-NEXT: vsetvli zero, a0, e8, m4, ta, ma
+; CHECK-NEXT: vssub.vx v8, v8, a1
+; CHECK-NEXT: ret
+ %elt.head = insertelement <vscale x 32 x i8> poison, i8 -1, i32 0
+ %vb = shufflevector <vscale x 32 x i8> %elt.head, <vscale x 32 x i8> poison, <vscale x 32 x i32> zeroinitializer
+ %head = insertelement <vscale x 32 x i1> poison, i1 true, i32 0
+ %m = shufflevector <vscale x 32 x i1> %head, <vscale x 32 x i1> poison, <vscale x 32 x i32> zeroinitializer
+ %v = call <vscale x 32 x i8> @llvm.vp.ssub.sat.nxv32i8(<vscale x 32 x i8> %va, <vscale x 32 x i8> %vb, <vscale x 32 x i1> %m, i32 %evl)
+ ret <vscale x 32 x i8> %v
+}
+
+declare <vscale x 64 x i8> @llvm.vp.ssub.sat.nxv64i8(<vscale x 64 x i8>, <vscale x 64 x i8>, <vscale x 64 x i1>, i32)
+
+define <vscale x 64 x i8> @vssub_vv_nxv64i8(<vscale x 64 x i8> %va, <vscale x 64 x i8> %b, <vscale x 64 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vssub_vv_nxv64i8:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e8, m8, ta, ma
+; CHECK-NEXT: vssub.vv v8, v8, v16, v0.t
+; CHECK-NEXT: ret
+ %v = call <vscale x 64 x i8> @llvm.vp.ssub.sat.nxv64i8(<vscale x 64 x i8> %va, <vscale x 64 x i8> %b, <vscale x 64 x i1> %m, i32 %evl)
+ ret <vscale x 64 x i8> %v
+}
+
+define <vscale x 64 x i8> @vssub_vv_nxv64i8_unmasked(<vscale x 64 x i8> %va, <vscale x 64 x i8> %b, i32 zeroext %evl) {
+; CHECK-LABEL: vssub_vv_nxv64i8_unmasked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e8, m8, ta, ma
+; CHECK-NEXT: vssub.vv v8, v8, v16
+; CHECK-NEXT: ret
+ %head = insertelement <vscale x 64 x i1> poison, i1 true, i32 0
+ %m = shufflevector <vscale x 64 x i1> %head, <vscale x 64 x i1> poison, <vscale x 64 x i32> zeroinitializer
+ %v = call <vscale x 64 x i8> @llvm.vp.ssub.sat.nxv64i8(<vscale x 64 x i8> %va, <vscale x 64 x i8> %b, <vscale x 64 x i1> %m, i32 %evl)
+ ret <vscale x 64 x i8> %v
+}
+
+define <vscale x 64 x i8> @vssub_vx_nxv64i8(<vscale x 64 x i8> %va, i8 %b, <vscale x 64 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vssub_vx_nxv64i8:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e8, m8, ta, ma
+; CHECK-NEXT: vssub.vx v8, v8, a0, v0.t
+; CHECK-NEXT: ret
+ %elt.head = insertelement <vscale x 64 x i8> poison, i8 %b, i32 0
+ %vb = shufflevector <vscale x 64 x i8> %elt.head, <vscale x 64 x i8> poison, <vscale x 64 x i32> zeroinitializer
+ %v = call <vscale x 64 x i8> @llvm.vp.ssub.sat.nxv64i8(<vscale x 64 x i8> %va, <vscale x 64 x i8> %vb, <vscale x 64 x i1> %m, i32 %evl)
+ ret <vscale x 64 x i8> %v
+}
+
+define <vscale x 64 x i8> @vssub_vx_nxv64i8_unmasked(<vscale x 64 x i8> %va, i8 %b, i32 zeroext %evl) {
+; CHECK-LABEL: vssub_vx_nxv64i8_unmasked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e8, m8, ta, ma
+; CHECK-NEXT: vssub.vx v8, v8, a0
+; CHECK-NEXT: ret
+ %elt.head = insertelement <vscale x 64 x i8> poison, i8 %b, i32 0
+ %vb = shufflevector <vscale x 64 x i8> %elt.head, <vscale x 64 x i8> poison, <vscale x 64 x i32> zeroinitializer
+ %head = insertelement <vscale x 64 x i1> poison, i1 true, i32 0
+ %m = shufflevector <vscale x 64 x i1> %head, <vscale x 64 x i1> poison, <vscale x 64 x i32> zeroinitializer
+ %v = call <vscale x 64 x i8> @llvm.vp.ssub.sat.nxv64i8(<vscale x 64 x i8> %va, <vscale x 64 x i8> %vb, <vscale x 64 x i1> %m, i32 %evl)
+ ret <vscale x 64 x i8> %v
+}
+
+define <vscale x 64 x i8> @vssub_vi_nxv64i8(<vscale x 64 x i8> %va, <vscale x 64 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vssub_vi_nxv64i8:
+; CHECK: # %bb.0:
+; CHECK-NEXT: li a1, -1
+; CHECK-NEXT: vsetvli zero, a0, e8, m8, ta, ma
+; CHECK-NEXT: vssub.vx v8, v8, a1, v0.t
+; CHECK-NEXT: ret
+ %elt.head = insertelement <vscale x 64 x i8> poison, i8 -1, i32 0
+ %vb = shufflevector <vscale x 64 x i8> %elt.head, <vscale x 64 x i8> poison, <vscale x 64 x i32> zeroinitializer
+ %v = call <vscale x 64 x i8> @llvm.vp.ssub.sat.nxv64i8(<vscale x 64 x i8> %va, <vscale x 64 x i8> %vb, <vscale x 64 x i1> %m, i32 %evl)
+ ret <vscale x 64 x i8> %v
+}
+
+define <vscale x 64 x i8> @vssub_vi_nxv64i8_unmasked(<vscale x 64 x i8> %va, i32 zeroext %evl) {
+; CHECK-LABEL: vssub_vi_nxv64i8_unmasked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: li a1, -1
+; CHECK-NEXT: vsetvli zero, a0, e8, m8, ta, ma
+; CHECK-NEXT: vssub.vx v8, v8, a1
+; CHECK-NEXT: ret
+ %elt.head = insertelement <vscale x 64 x i8> poison, i8 -1, i32 0
+ %vb = shufflevector <vscale x 64 x i8> %elt.head, <vscale x 64 x i8> poison, <vscale x 64 x i32> zeroinitializer
+ %head = insertelement <vscale x 64 x i1> poison, i1 true, i32 0
+ %m = shufflevector <vscale x 64 x i1> %head, <vscale x 64 x i1> poison, <vscale x 64 x i32> zeroinitializer
+ %v = call <vscale x 64 x i8> @llvm.vp.ssub.sat.nxv64i8(<vscale x 64 x i8> %va, <vscale x 64 x i8> %vb, <vscale x 64 x i1> %m, i32 %evl)
+ ret <vscale x 64 x i8> %v
+}
+
+; Test that split-legalization works when the mask itself needs splitting.
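+; nxv128i8 holds twice as many elements as a single e8/m8 register group
+; (vlenb * 8), so the operation is legalized as two m8 halves. In the checks
+; below, the high-half EVL is computed branchlessly: (evl - vlenb * 8) is
+; clamped to zero by the sltu/addi/and sequence, and the low-half EVL is
+; clamped to vlenb * 8 by the bltu/mv pair before the second vssub.vx.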
+
+declare <vscale x 128 x i8> @llvm.vp.ssub.sat.nxv128i8(<vscale x 128 x i8>, <vscale x 128 x i8>, <vscale x 128 x i1>, i32)
+
+define <vscale x 128 x i8> @vssub_vi_nxv128i8(<vscale x 128 x i8> %va, <vscale x 128 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vssub_vi_nxv128i8:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmv1r.v v24, v0
+; CHECK-NEXT: vsetvli a2, zero, e8, m8, ta, ma
+; CHECK-NEXT: vlm.v v0, (a0)
+; CHECK-NEXT: csrr a2, vlenb
+; CHECK-NEXT: slli a2, a2, 3
+; CHECK-NEXT: sub a0, a1, a2
+; CHECK-NEXT: sltu a3, a1, a0
+; CHECK-NEXT: addi a3, a3, -1
+; CHECK-NEXT: and a3, a3, a0
+; CHECK-NEXT: li a0, -1
+; CHECK-NEXT: vsetvli zero, a3, e8, m8, ta, ma
+; CHECK-NEXT: vssub.vx v16, v16, a0, v0.t
+; CHECK-NEXT: bltu a1, a2, .LBB50_2
+; CHECK-NEXT: # %bb.1:
+; CHECK-NEXT: mv a1, a2
+; CHECK-NEXT: .LBB50_2:
+; CHECK-NEXT: vsetvli zero, a1, e8, m8, ta, ma
+; CHECK-NEXT: vmv1r.v v0, v24
+; CHECK-NEXT: vssub.vx v8, v8, a0, v0.t
+; CHECK-NEXT: ret
+ %elt.head = insertelement <vscale x 128 x i8> poison, i8 -1, i32 0
+ %vb = shufflevector <vscale x 128 x i8> %elt.head, <vscale x 128 x i8> poison, <vscale x 128 x i32> zeroinitializer
+ %v = call <vscale x 128 x i8> @llvm.vp.ssub.sat.nxv128i8(<vscale x 128 x i8> %va, <vscale x 128 x i8> %vb, <vscale x 128 x i1> %m, i32 %evl)
+ ret <vscale x 128 x i8> %v
+}
+
+define <vscale x 128 x i8> @vssub_vi_nxv128i8_unmasked(<vscale x 128 x i8> %va, i32 zeroext %evl) {
+; CHECK-LABEL: vssub_vi_nxv128i8_unmasked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: csrr a1, vlenb
+; CHECK-NEXT: slli a1, a1, 3
+; CHECK-NEXT: sub a2, a0, a1
+; CHECK-NEXT: sltu a3, a0, a2
+; CHECK-NEXT: addi a3, a3, -1
+; CHECK-NEXT: and a3, a3, a2
+; CHECK-NEXT: li a2, -1
+; CHECK-NEXT: vsetvli zero, a3, e8, m8, ta, ma
+; CHECK-NEXT: vssub.vx v16, v16, a2
+; CHECK-NEXT: bltu a0, a1, .LBB51_2
+; CHECK-NEXT: # %bb.1:
+; CHECK-NEXT: mv a0, a1
+; CHECK-NEXT: .LBB51_2:
+; CHECK-NEXT: vsetvli zero, a0, e8, m8, ta, ma
+; CHECK-NEXT: vssub.vx v8, v8, a2
+; CHECK-NEXT: ret
+ %elt.head = insertelement <vscale x 128 x i8> poison, i8 -1, i32 0
+ %vb = shufflevector <vscale x 128 x i8> %elt.head, <vscale x 128 x i8> poison, <vscale x 128 x i32> zeroinitializer
+ %head = insertelement <vscale x 128 x i1> poison, i1 true, i32 0
+ %m = shufflevector <vscale x 128 x i1> %head, <vscale x 128 x i1> poison, <vscale x 128 x i32> zeroinitializer
+ %v = call <vscale x 128 x i8> @llvm.vp.ssub.sat.nxv128i8(<vscale x 128 x i8> %va, <vscale x 128 x i8> %vb, <vscale x 128 x i1> %m, i32 %evl)
+ ret <vscale x 128 x i8> %v
+}
+
+declare <vscale x 1 x i16> @llvm.vp.ssub.sat.nxv1i16(<vscale x 1 x i16>, <vscale x 1 x i16>, <vscale x 1 x i1>, i32)
+
+define <vscale x 1 x i16> @vssub_vv_nxv1i16(<vscale x 1 x i16> %va, <vscale x 1 x i16> %b, <vscale x 1 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vssub_vv_nxv1i16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e16, mf4, ta, ma
+; CHECK-NEXT: vssub.vv v8, v8, v9, v0.t
+; CHECK-NEXT: ret
+ %v = call <vscale x 1 x i16> @llvm.vp.ssub.sat.nxv1i16(<vscale x 1 x i16> %va, <vscale x 1 x i16> %b, <vscale x 1 x i1> %m, i32 %evl)
+ ret <vscale x 1 x i16> %v
+}
+
+define <vscale x 1 x i16> @vssub_vv_nxv1i16_unmasked(<vscale x 1 x i16> %va, <vscale x 1 x i16> %b, i32 zeroext %evl) {
+; CHECK-LABEL: vssub_vv_nxv1i16_unmasked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e16, mf4, ta, ma
+; CHECK-NEXT: vssub.vv v8, v8, v9
+; CHECK-NEXT: ret
+ %head = insertelement <vscale x 1 x i1> poison, i1 true, i32 0
+ %m = shufflevector <vscale x 1 x i1> %head, <vscale x 1 x i1> poison, <vscale x 1 x i32> zeroinitializer
+ %v = call <vscale x 1 x i16> @llvm.vp.ssub.sat.nxv1i16(<vscale x 1 x i16> %va, <vscale x 1 x i16> %b, <vscale x 1 x i1> %m, i32 %evl)
+ ret <vscale x 1 x i16> %v
+}
+
+define <vscale x 1 x i16> @vssub_vx_nxv1i16(<vscale x 1 x i16> %va, i16 %b, <vscale x 1 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vssub_vx_nxv1i16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma
+; CHECK-NEXT: vssub.vx v8, v8, a0, v0.t
+; CHECK-NEXT: ret
+ %elt.head = insertelement <vscale x 1 x i16> poison, i16 %b, i32 0
+ %vb = shufflevector <vscale x 1 x i16> %elt.head, <vscale x 1 x i16> poison, <vscale x 1 x i32> zeroinitializer
+ %v = call <vscale x 1 x i16> @llvm.vp.ssub.sat.nxv1i16(<vscale x 1 x i16> %va, <vscale x 1 x i16> %vb, <vscale x 1 x i1> %m, i32 %evl)
+ ret <vscale x 1 x i16> %v
+}
+
+define <vscale x 1 x i16> @vssub_vx_nxv1i16_unmasked(<vscale x 1 x i16> %va, i16 %b, i32 zeroext %evl) {
+; CHECK-LABEL: vssub_vx_nxv1i16_unmasked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma
+; CHECK-NEXT: vssub.vx v8, v8, a0
+; CHECK-NEXT: ret
+ %elt.head = insertelement <vscale x 1 x i16> poison, i16 %b, i32 0
+ %vb = shufflevector <vscale x 1 x i16> %elt.head, <vscale x 1 x i16> poison, <vscale x 1 x i32> zeroinitializer
+ %head = insertelement <vscale x 1 x i1> poison, i1 true, i32 0
+ %m = shufflevector <vscale x 1 x i1> %head, <vscale x 1 x i1> poison, <vscale x 1 x i32> zeroinitializer
+ %v = call <vscale x 1 x i16> @llvm.vp.ssub.sat.nxv1i16(<vscale x 1 x i16> %va, <vscale x 1 x i16> %vb, <vscale x 1 x i1> %m, i32 %evl)
+ ret <vscale x 1 x i16> %v
+}
+
+define <vscale x 1 x i16> @vssub_vi_nxv1i16(<vscale x 1 x i16> %va, <vscale x 1 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vssub_vi_nxv1i16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: li a1, -1
+; CHECK-NEXT: vsetvli zero, a0, e16, mf4, ta, ma
+; CHECK-NEXT: vssub.vx v8, v8, a1, v0.t
+; CHECK-NEXT: ret
+ %elt.head = insertelement <vscale x 1 x i16> poison, i16 -1, i32 0
+ %vb = shufflevector <vscale x 1 x i16> %elt.head, <vscale x 1 x i16> poison, <vscale x 1 x i32> zeroinitializer
+ %v = call <vscale x 1 x i16> @llvm.vp.ssub.sat.nxv1i16(<vscale x 1 x i16> %va, <vscale x 1 x i16> %vb, <vscale x 1 x i1> %m, i32 %evl)
+ ret <vscale x 1 x i16> %v
+}
+
+define <vscale x 1 x i16> @vssub_vi_nxv1i16_unmasked(<vscale x 1 x i16> %va, i32 zeroext %evl) {
+; CHECK-LABEL: vssub_vi_nxv1i16_unmasked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: li a1, -1
+; CHECK-NEXT: vsetvli zero, a0, e16, mf4, ta, ma
+; CHECK-NEXT: vssub.vx v8, v8, a1
+; CHECK-NEXT: ret
+ %elt.head = insertelement <vscale x 1 x i16> poison, i16 -1, i32 0
+ %vb = shufflevector <vscale x 1 x i16> %elt.head, <vscale x 1 x i16> poison, <vscale x 1 x i32> zeroinitializer
+ %head = insertelement <vscale x 1 x i1> poison, i1 true, i32 0
+ %m = shufflevector <vscale x 1 x i1> %head, <vscale x 1 x i1> poison, <vscale x 1 x i32> zeroinitializer
+ %v = call <vscale x 1 x i16> @llvm.vp.ssub.sat.nxv1i16(<vscale x 1 x i16> %va, <vscale x 1 x i16> %vb, <vscale x 1 x i1> %m, i32 %evl)
+ ret <vscale x 1 x i16> %v
+}
+
+declare <vscale x 2 x i16> @llvm.vp.ssub.sat.nxv2i16(<vscale x 2 x i16>, <vscale x 2 x i16>, <vscale x 2 x i1>, i32)
+
+define <vscale x 2 x i16> @vssub_vv_nxv2i16(<vscale x 2 x i16> %va, <vscale x 2 x i16> %b, <vscale x 2 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vssub_vv_nxv2i16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e16, mf2, ta, ma
+; CHECK-NEXT: vssub.vv v8, v8, v9, v0.t
+; CHECK-NEXT: ret
+ %v = call <vscale x 2 x i16> @llvm.vp.ssub.sat.nxv2i16(<vscale x 2 x i16> %va, <vscale x 2 x i16> %b, <vscale x 2 x i1> %m, i32 %evl)
+ ret <vscale x 2 x i16> %v
+}
+
+define <vscale x 2 x i16> @vssub_vv_nxv2i16_unmasked(<vscale x 2 x i16> %va, <vscale x 2 x i16> %b, i32 zeroext %evl) {
+; CHECK-LABEL: vssub_vv_nxv2i16_unmasked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e16, mf2, ta, ma
+; CHECK-NEXT: vssub.vv v8, v8, v9
+; CHECK-NEXT: ret
+ %head = insertelement <vscale x 2 x i1> poison, i1 true, i32 0
+ %m = shufflevector <vscale x 2 x i1> %head, <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer
+ %v = call <vscale x 2 x i16> @llvm.vp.ssub.sat.nxv2i16(<vscale x 2 x i16> %va, <vscale x 2 x i16> %b, <vscale x 2 x i1> %m, i32 %evl)
+ ret <vscale x 2 x i16> %v
+}
+
+define <vscale x 2 x i16> @vssub_vx_nxv2i16(<vscale x 2 x i16> %va, i16 %b, <vscale x 2 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vssub_vx_nxv2i16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma
+; CHECK-NEXT: vssub.vx v8, v8, a0, v0.t
+; CHECK-NEXT: ret
+ %elt.head = insertelement <vscale x 2 x i16> poison, i16 %b, i32 0
+ %vb = shufflevector <vscale x 2 x i16> %elt.head, <vscale x 2 x i16> poison, <vscale x 2 x i32> zeroinitializer
+ %v = call <vscale x 2 x i16> @llvm.vp.ssub.sat.nxv2i16(<vscale x 2 x i16> %va, <vscale x 2 x i16> %vb, <vscale x 2 x i1> %m, i32 %evl)
+ ret <vscale x 2 x i16> %v
+}
+
+define <vscale x 2 x i16> @vssub_vx_nxv2i16_unmasked(<vscale x 2 x i16> %va, i16 %b, i32 zeroext %evl) {
+; CHECK-LABEL: vssub_vx_nxv2i16_unmasked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma
+; CHECK-NEXT: vssub.vx v8, v8, a0
+; CHECK-NEXT: ret
+ %elt.head = insertelement <vscale x 2 x i16> poison, i16 %b, i32 0
+ %vb = shufflevector <vscale x 2 x i16> %elt.head, <vscale x 2 x i16> poison, <vscale x 2 x i32> zeroinitializer
+ %head = insertelement <vscale x 2 x i1> poison, i1 true, i32 0
+ %m = shufflevector <vscale x 2 x i1> %head, <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer
+ %v = call <vscale x 2 x i16> @llvm.vp.ssub.sat.nxv2i16(<vscale x 2 x i16> %va, <vscale x 2 x i16> %vb, <vscale x 2 x i1> %m, i32 %evl)
+ ret <vscale x 2 x i16> %v
+}
+
+define <vscale x 2 x i16> @vssub_vi_nxv2i16(<vscale x 2 x i16> %va, <vscale x 2 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vssub_vi_nxv2i16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: li a1, -1
+; CHECK-NEXT: vsetvli zero, a0, e16, mf2, ta, ma
+; CHECK-NEXT: vssub.vx v8, v8, a1, v0.t
+; CHECK-NEXT: ret
+ %elt.head = insertelement <vscale x 2 x i16> poison, i16 -1, i32 0
+ %vb = shufflevector <vscale x 2 x i16> %elt.head, <vscale x 2 x i16> poison, <vscale x 2 x i32> zeroinitializer
+ %v = call <vscale x 2 x i16> @llvm.vp.ssub.sat.nxv2i16(<vscale x 2 x i16> %va, <vscale x 2 x i16> %vb, <vscale x 2 x i1> %m, i32 %evl)
+ ret <vscale x 2 x i16> %v
+}
+
+define <vscale x 2 x i16> @vssub_vi_nxv2i16_unmasked(<vscale x 2 x i16> %va, i32 zeroext %evl) {
+; CHECK-LABEL: vssub_vi_nxv2i16_unmasked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: li a1, -1
+; CHECK-NEXT: vsetvli zero, a0, e16, mf2, ta, ma
+; CHECK-NEXT: vssub.vx v8, v8, a1
+; CHECK-NEXT: ret
+ %elt.head = insertelement <vscale x 2 x i16> poison, i16 -1, i32 0
+ %vb = shufflevector <vscale x 2 x i16> %elt.head, <vscale x 2 x i16> poison, <vscale x 2 x i32> zeroinitializer
+ %head = insertelement <vscale x 2 x i1> poison, i1 true, i32 0
+ %m = shufflevector <vscale x 2 x i1> %head, <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer
+ %v = call <vscale x 2 x i16> @llvm.vp.ssub.sat.nxv2i16(<vscale x 2 x i16> %va, <vscale x 2 x i16> %vb, <vscale x 2 x i1> %m, i32 %evl)
+ ret <vscale x 2 x i16> %v
+}
+
+declare <vscale x 4 x i16> @llvm.vp.ssub.sat.nxv4i16(<vscale x 4 x i16>, <vscale x 4 x i16>, <vscale x 4 x i1>, i32)
+
+define <vscale x 4 x i16> @vssub_vv_nxv4i16(<vscale x 4 x i16> %va, <vscale x 4 x i16> %b, <vscale x 4 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vssub_vv_nxv4i16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, ma
+; CHECK-NEXT: vssub.vv v8, v8, v9, v0.t
+; CHECK-NEXT: ret
+ %v = call <vscale x 4 x i16> @llvm.vp.ssub.sat.nxv4i16(<vscale x 4 x i16> %va, <vscale x 4 x i16> %b, <vscale x 4 x i1> %m, i32 %evl)
+ ret <vscale x 4 x i16> %v
+}
+
+define <vscale x 4 x i16> @vssub_vv_nxv4i16_unmasked(<vscale x 4 x i16> %va, <vscale x 4 x i16> %b, i32 zeroext %evl) {
+; CHECK-LABEL: vssub_vv_nxv4i16_unmasked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, ma
+; CHECK-NEXT: vssub.vv v8, v8, v9
+; CHECK-NEXT: ret
+ %head = insertelement <vscale x 4 x i1> poison, i1 true, i32 0
+ %m = shufflevector <vscale x 4 x i1> %head, <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer
+ %v = call <vscale x 4 x i16> @llvm.vp.ssub.sat.nxv4i16(<vscale x 4 x i16> %va, <vscale x 4 x i16> %b, <vscale x 4 x i1> %m, i32 %evl)
+ ret <vscale x 4 x i16> %v
+}
+
+define <vscale x 4 x i16> @vssub_vx_nxv4i16(<vscale x 4 x i16> %va, i16 %b, <vscale x 4 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vssub_vx_nxv4i16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma
+; CHECK-NEXT: vssub.vx v8, v8, a0, v0.t
+; CHECK-NEXT: ret
+ %elt.head = insertelement <vscale x 4 x i16> poison, i16 %b, i32 0
+ %vb = shufflevector <vscale x 4 x i16> %elt.head, <vscale x 4 x i16> poison, <vscale x 4 x i32> zeroinitializer
+ %v = call <vscale x 4 x i16> @llvm.vp.ssub.sat.nxv4i16(<vscale x 4 x i16> %va, <vscale x 4 x i16> %vb, <vscale x 4 x i1> %m, i32 %evl)
+ ret <vscale x 4 x i16> %v
+}
+
+define <vscale x 4 x i16> @vssub_vx_nxv4i16_unmasked(<vscale x 4 x i16> %va, i16 %b, i32 zeroext %evl) {
+; CHECK-LABEL: vssub_vx_nxv4i16_unmasked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma
+; CHECK-NEXT: vssub.vx v8, v8, a0
+; CHECK-NEXT: ret
+ %elt.head = insertelement <vscale x 4 x i16> poison, i16 %b, i32 0
+ %vb = shufflevector <vscale x 4 x i16> %elt.head, <vscale x 4 x i16> poison, <vscale x 4 x i32> zeroinitializer
+ %head = insertelement <vscale x 4 x i1> poison, i1 true, i32 0
+ %m = shufflevector <vscale x 4 x i1> %head, <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer
+ %v = call <vscale x 4 x i16> @llvm.vp.ssub.sat.nxv4i16(<vscale x 4 x i16> %va, <vscale x 4 x i16> %vb, <vscale x 4 x i1> %m, i32 %evl)
+ ret <vscale x 4 x i16> %v
+}
+
+define <vscale x 4 x i16> @vssub_vi_nxv4i16(<vscale x 4 x i16> %va, <vscale x 4 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vssub_vi_nxv4i16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: li a1, -1
+; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, ma
+; CHECK-NEXT: vssub.vx v8, v8, a1, v0.t
+; CHECK-NEXT: ret
+ %elt.head = insertelement <vscale x 4 x i16> poison, i16 -1, i32 0
+ %vb = shufflevector <vscale x 4 x i16> %elt.head, <vscale x 4 x i16> poison, <vscale x 4 x i32> zeroinitializer
+ %v = call <vscale x 4 x i16> @llvm.vp.ssub.sat.nxv4i16(<vscale x 4 x i16> %va, <vscale x 4 x i16> %vb, <vscale x 4 x i1> %m, i32 %evl)
+ ret <vscale x 4 x i16> %v
+}
+
+define <vscale x 4 x i16> @vssub_vi_nxv4i16_unmasked(<vscale x 4 x i16> %va, i32 zeroext %evl) {
+; CHECK-LABEL: vssub_vi_nxv4i16_unmasked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: li a1, -1
+; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, ma
+; CHECK-NEXT: vssub.vx v8, v8, a1
+; CHECK-NEXT: ret
+ %elt.head = insertelement <vscale x 4 x i16> poison, i16 -1, i32 0
+ %vb = shufflevector <vscale x 4 x i16> %elt.head, <vscale x 4 x i16> poison, <vscale x 4 x i32> zeroinitializer
+ %head = insertelement <vscale x 4 x i1> poison, i1 true, i32 0
+ %m = shufflevector <vscale x 4 x i1> %head, <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer
+ %v = call <vscale x 4 x i16> @llvm.vp.ssub.sat.nxv4i16(<vscale x 4 x i16> %va, <vscale x 4 x i16> %vb, <vscale x 4 x i1> %m, i32 %evl)
+ ret <vscale x 4 x i16> %v
+}
+
+declare <vscale x 8 x i16> @llvm.vp.ssub.sat.nxv8i16(<vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i1>, i32)
+
+define <vscale x 8 x i16> @vssub_vv_nxv8i16(<vscale x 8 x i16> %va, <vscale x 8 x i16> %b, <vscale x 8 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vssub_vv_nxv8i16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e16, m2, ta, ma
+; CHECK-NEXT: vssub.vv v8, v8, v10, v0.t
+; CHECK-NEXT: ret
+ %v = call <vscale x 8 x i16> @llvm.vp.ssub.sat.nxv8i16(<vscale x 8 x i16> %va, <vscale x 8 x i16> %b, <vscale x 8 x i1> %m, i32 %evl)
+ ret <vscale x 8 x i16> %v
+}
+
+define <vscale x 8 x i16> @vssub_vv_nxv8i16_unmasked(<vscale x 8 x i16> %va, <vscale x 8 x i16> %b, i32 zeroext %evl) {
+; CHECK-LABEL: vssub_vv_nxv8i16_unmasked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e16, m2, ta, ma
+; CHECK-NEXT: vssub.vv v8, v8, v10
+; CHECK-NEXT: ret
+ %head = insertelement <vscale x 8 x i1> poison, i1 true, i32 0
+ %m = shufflevector <vscale x 8 x i1> %head, <vscale x 8 x i1> poison, <vscale x 8 x i32> zeroinitializer
+ %v = call <vscale x 8 x i16> @llvm.vp.ssub.sat.nxv8i16(<vscale x 8 x i16> %va, <vscale x 8 x i16> %b, <vscale x 8 x i1> %m, i32 %evl)
+ ret <vscale x 8 x i16> %v
+}
+
+define <vscale x 8 x i16> @vssub_vx_nxv8i16(<vscale x 8 x i16> %va, i16 %b, <vscale x 8 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vssub_vx_nxv8i16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, ma
+; CHECK-NEXT: vssub.vx v8, v8, a0, v0.t
+; CHECK-NEXT: ret
+ %elt.head = insertelement <vscale x 8 x i16> poison, i16 %b, i32 0
+ %vb = shufflevector <vscale x 8 x i16> %elt.head, <vscale x 8 x i16> poison, <vscale x 8 x i32> zeroinitializer
+ %v = call <vscale x 8 x i16> @llvm.vp.ssub.sat.nxv8i16(<vscale x 8 x i16> %va, <vscale x 8 x i16> %vb, <vscale x 8 x i1> %m, i32 %evl)
+ ret <vscale x 8 x i16> %v
+}
+
+define <vscale x 8 x i16> @vssub_vx_nxv8i16_unmasked(<vscale x 8 x i16> %va, i16 %b, i32 zeroext %evl) {
+; CHECK-LABEL: vssub_vx_nxv8i16_unmasked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, ma
+; CHECK-NEXT: vssub.vx v8, v8, a0
+; CHECK-NEXT: ret
+ %elt.head = insertelement <vscale x 8 x i16> poison, i16 %b, i32 0
+ %vb = shufflevector <vscale x 8 x i16> %elt.head, <vscale x 8 x i16> poison, <vscale x 8 x i32> zeroinitializer
+ %head = insertelement <vscale x 8 x i1> poison, i1 true, i32 0
+ %m = shufflevector <vscale x 8 x i1> %head, <vscale x 8 x i1> poison, <vscale x 8 x i32> zeroinitializer
+ %v = call <vscale x 8 x i16> @llvm.vp.ssub.sat.nxv8i16(<vscale x 8 x i16> %va, <vscale x 8 x i16> %vb, <vscale x 8 x i1> %m, i32 %evl)
+ ret <vscale x 8 x i16> %v
+}
+
+define <vscale x 8 x i16> @vssub_vi_nxv8i16(<vscale x 8 x i16> %va, <vscale x 8 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vssub_vi_nxv8i16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: li a1, -1
+; CHECK-NEXT: vsetvli zero, a0, e16, m2, ta, ma
+; CHECK-NEXT: vssub.vx v8, v8, a1, v0.t
+; CHECK-NEXT: ret
+ %elt.head = insertelement <vscale x 8 x i16> poison, i16 -1, i32 0
+ %vb = shufflevector <vscale x 8 x i16> %elt.head, <vscale x 8 x i16> poison, <vscale x 8 x i32> zeroinitializer
+ %v = call <vscale x 8 x i16> @llvm.vp.ssub.sat.nxv8i16(<vscale x 8 x i16> %va, <vscale x 8 x i16> %vb, <vscale x 8 x i1> %m, i32 %evl)
+ ret <vscale x 8 x i16> %v
+}
+
+define <vscale x 8 x i16> @vssub_vi_nxv8i16_unmasked(<vscale x 8 x i16> %va, i32 zeroext %evl) {
+; CHECK-LABEL: vssub_vi_nxv8i16_unmasked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: li a1, -1
+; CHECK-NEXT: vsetvli zero, a0, e16, m2, ta, ma
+; CHECK-NEXT: vssub.vx v8, v8, a1
+; CHECK-NEXT: ret
+ %elt.head = insertelement <vscale x 8 x i16> poison, i16 -1, i32 0
+ %vb = shufflevector <vscale x 8 x i16> %elt.head, <vscale x 8 x i16> poison, <vscale x 8 x i32> zeroinitializer
+ %head = insertelement <vscale x 8 x i1> poison, i1 true, i32 0
+ %m = shufflevector <vscale x 8 x i1> %head, <vscale x 8 x i1> poison, <vscale x 8 x i32> zeroinitializer
+ %v = call <vscale x 8 x i16> @llvm.vp.ssub.sat.nxv8i16(<vscale x 8 x i16> %va, <vscale x 8 x i16> %vb, <vscale x 8 x i1> %m, i32 %evl)
+ ret <vscale x 8 x i16> %v
+}
+
+declare <vscale x 16 x i16> @llvm.vp.ssub.sat.nxv16i16(<vscale x 16 x i16>, <vscale x 16 x i16>, <vscale x 16 x i1>, i32)
+
+define <vscale x 16 x i16> @vssub_vv_nxv16i16(<vscale x 16 x i16> %va, <vscale x 16 x i16> %b, <vscale x 16 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vssub_vv_nxv16i16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e16, m4, ta, ma
+; CHECK-NEXT: vssub.vv v8, v8, v12, v0.t
+; CHECK-NEXT: ret
+ %v = call <vscale x 16 x i16> @llvm.vp.ssub.sat.nxv16i16(<vscale x 16 x i16> %va, <vscale x 16 x i16> %b, <vscale x 16 x i1> %m, i32 %evl)
+ ret <vscale x 16 x i16> %v
+}
+
+define <vscale x 16 x i16> @vssub_vv_nxv16i16_unmasked(<vscale x 16 x i16> %va, <vscale x 16 x i16> %b, i32 zeroext %evl) {
+; CHECK-LABEL: vssub_vv_nxv16i16_unmasked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e16, m4, ta, ma
+; CHECK-NEXT: vssub.vv v8, v8, v12
+; CHECK-NEXT: ret
+ %head = insertelement <vscale x 16 x i1> poison, i1 true, i32 0
+ %m = shufflevector <vscale x 16 x i1> %head, <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer
+ %v = call <vscale x 16 x i16> @llvm.vp.ssub.sat.nxv16i16(<vscale x 16 x i16> %va, <vscale x 16 x i16> %b, <vscale x 16 x i1> %m, i32 %evl)
+ ret <vscale x 16 x i16> %v
+}
+
+define <vscale x 16 x i16> @vssub_vx_nxv16i16(<vscale x 16 x i16> %va, i16 %b, <vscale x 16 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vssub_vx_nxv16i16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e16, m4, ta, ma
+; CHECK-NEXT: vssub.vx v8, v8, a0, v0.t
+; CHECK-NEXT: ret
+ %elt.head = insertelement <vscale x 16 x i16> poison, i16 %b, i32 0
+ %vb = shufflevector <vscale x 16 x i16> %elt.head, <vscale x 16 x i16> poison, <vscale x 16 x i32> zeroinitializer
+ %v = call <vscale x 16 x i16> @llvm.vp.ssub.sat.nxv16i16(<vscale x 16 x i16> %va, <vscale x 16 x i16> %vb, <vscale x 16 x i1> %m, i32 %evl)
+ ret <vscale x 16 x i16> %v
+}
+
+define <vscale x 16 x i16> @vssub_vx_nxv16i16_unmasked(<vscale x 16 x i16> %va, i16 %b, i32 zeroext %evl) {
+; CHECK-LABEL: vssub_vx_nxv16i16_unmasked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e16, m4, ta, ma
+; CHECK-NEXT: vssub.vx v8, v8, a0
+; CHECK-NEXT: ret
+ %elt.head = insertelement <vscale x 16 x i16> poison, i16 %b, i32 0
+ %vb = shufflevector <vscale x 16 x i16> %elt.head, <vscale x 16 x i16> poison, <vscale x 16 x i32> zeroinitializer
+ %head = insertelement <vscale x 16 x i1> poison, i1 true, i32 0
+ %m = shufflevector <vscale x 16 x i1> %head, <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer
+ %v = call <vscale x 16 x i16> @llvm.vp.ssub.sat.nxv16i16(<vscale x 16 x i16> %va, <vscale x 16 x i16> %vb, <vscale x 16 x i1> %m, i32 %evl)
+ ret <vscale x 16 x i16> %v
+}
+
+define <vscale x 16 x i16> @vssub_vi_nxv16i16(<vscale x 16 x i16> %va, <vscale x 16 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vssub_vi_nxv16i16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: li a1, -1
+; CHECK-NEXT: vsetvli zero, a0, e16, m4, ta, ma
+; CHECK-NEXT: vssub.vx v8, v8, a1, v0.t
+; CHECK-NEXT: ret
+ %elt.head = insertelement <vscale x 16 x i16> poison, i16 -1, i32 0
+ %vb = shufflevector <vscale x 16 x i16> %elt.head, <vscale x 16 x i16> poison, <vscale x 16 x i32> zeroinitializer
+ %v = call <vscale x 16 x i16> @llvm.vp.ssub.sat.nxv16i16(<vscale x 16 x i16> %va, <vscale x 16 x i16> %vb, <vscale x 16 x i1> %m, i32 %evl)
+ ret <vscale x 16 x i16> %v
+}
+
+define <vscale x 16 x i16> @vssub_vi_nxv16i16_unmasked(<vscale x 16 x i16> %va, i32 zeroext %evl) {
+; CHECK-LABEL: vssub_vi_nxv16i16_unmasked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: li a1, -1
+; CHECK-NEXT: vsetvli zero, a0, e16, m4, ta, ma
+; CHECK-NEXT: vssub.vx v8, v8, a1
+; CHECK-NEXT: ret
+ %elt.head = insertelement <vscale x 16 x i16> poison, i16 -1, i32 0
+ %vb = shufflevector <vscale x 16 x i16> %elt.head, <vscale x 16 x i16> poison, <vscale x 16 x i32> zeroinitializer
+ %head = insertelement <vscale x 16 x i1> poison, i1 true, i32 0
+ %m = shufflevector <vscale x 16 x i1> %head, <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer
+ %v = call <vscale x 16 x i16> @llvm.vp.ssub.sat.nxv16i16(<vscale x 16 x i16> %va, <vscale x 16 x i16> %vb, <vscale x 16 x i1> %m, i32 %evl)
+ ret <vscale x 16 x i16> %v
+}
+
+declare <vscale x 32 x i16> @llvm.vp.ssub.sat.nxv32i16(<vscale x 32 x i16>, <vscale x 32 x i16>, <vscale x 32 x i1>, i32)
+
+define <vscale x 32 x i16> @vssub_vv_nxv32i16(<vscale x 32 x i16> %va, <vscale x 32 x i16> %b, <vscale x 32 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vssub_vv_nxv32i16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e16, m8, ta, ma
+; CHECK-NEXT: vssub.vv v8, v8, v16, v0.t
+; CHECK-NEXT: ret
+ %v = call <vscale x 32 x i16> @llvm.vp.ssub.sat.nxv32i16(<vscale x 32 x i16> %va, <vscale x 32 x i16> %b, <vscale x 32 x i1> %m, i32 %evl)
+ ret <vscale x 32 x i16> %v
+}
+
+define <vscale x 32 x i16> @vssub_vv_nxv32i16_unmasked(<vscale x 32 x i16> %va, <vscale x 32 x i16> %b, i32 zeroext %evl) {
+; CHECK-LABEL: vssub_vv_nxv32i16_unmasked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e16, m8, ta, ma
+; CHECK-NEXT: vssub.vv v8, v8, v16
+; CHECK-NEXT: ret
+ %head = insertelement <vscale x 32 x i1> poison, i1 true, i32 0
+ %m = shufflevector <vscale x 32 x i1> %head, <vscale x 32 x i1> poison, <vscale x 32 x i32> zeroinitializer
+ %v = call <vscale x 32 x i16> @llvm.vp.ssub.sat.nxv32i16(<vscale x 32 x i16> %va, <vscale x 32 x i16> %b, <vscale x 32 x i1> %m, i32 %evl)
+ ret <vscale x 32 x i16> %v
+}
+
+define <vscale x 32 x i16> @vssub_vx_nxv32i16(<vscale x 32 x i16> %va, i16 %b, <vscale x 32 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vssub_vx_nxv32i16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e16, m8, ta, ma
+; CHECK-NEXT: vssub.vx v8, v8, a0, v0.t
+; CHECK-NEXT: ret
+ %elt.head = insertelement <vscale x 32 x i16> poison, i16 %b, i32 0
+ %vb = shufflevector <vscale x 32 x i16> %elt.head, <vscale x 32 x i16> poison, <vscale x 32 x i32> zeroinitializer
+ %v = call <vscale x 32 x i16> @llvm.vp.ssub.sat.nxv32i16(<vscale x 32 x i16> %va, <vscale x 32 x i16> %vb, <vscale x 32 x i1> %m, i32 %evl)
+ ret <vscale x 32 x i16> %v
+}
+
+define <vscale x 32 x i16> @vssub_vx_nxv32i16_unmasked(<vscale x 32 x i16> %va, i16 %b, i32 zeroext %evl) {
+; CHECK-LABEL: vssub_vx_nxv32i16_unmasked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e16, m8, ta, ma
+; CHECK-NEXT: vssub.vx v8, v8, a0
+; CHECK-NEXT: ret
+ %elt.head = insertelement <vscale x 32 x i16> poison, i16 %b, i32 0
+ %vb = shufflevector <vscale x 32 x i16> %elt.head, <vscale x 32 x i16> poison, <vscale x 32 x i32> zeroinitializer
+ %head = insertelement <vscale x 32 x i1> poison, i1 true, i32 0
+ %m = shufflevector <vscale x 32 x i1> %head, <vscale x 32 x i1> poison, <vscale x 32 x i32> zeroinitializer
+ %v = call <vscale x 32 x i16> @llvm.vp.ssub.sat.nxv32i16(<vscale x 32 x i16> %va, <vscale x 32 x i16> %vb, <vscale x 32 x i1> %m, i32 %evl)
+ ret <vscale x 32 x i16> %v
+}
+
+define <vscale x 32 x i16> @vssub_vi_nxv32i16(<vscale x 32 x i16> %va, <vscale x 32 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vssub_vi_nxv32i16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: li a1, -1
+; CHECK-NEXT: vsetvli zero, a0, e16, m8, ta, ma
+; CHECK-NEXT: vssub.vx v8, v8, a1, v0.t
+; CHECK-NEXT: ret
+ %elt.head = insertelement <vscale x 32 x i16> poison, i16 -1, i32 0
+ %vb = shufflevector <vscale x 32 x i16> %elt.head, <vscale x 32 x i16> poison, <vscale x 32 x i32> zeroinitializer
+ %v = call <vscale x 32 x i16> @llvm.vp.ssub.sat.nxv32i16(<vscale x 32 x i16> %va, <vscale x 32 x i16> %vb, <vscale x 32 x i1> %m, i32 %evl)
+ ret <vscale x 32 x i16> %v
+}
+
+define <vscale x 32 x i16> @vssub_vi_nxv32i16_unmasked(<vscale x 32 x i16> %va, i32 zeroext %evl) {
+; CHECK-LABEL: vssub_vi_nxv32i16_unmasked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: li a1, -1
+; CHECK-NEXT: vsetvli zero, a0, e16, m8, ta, ma
+; CHECK-NEXT: vssub.vx v8, v8, a1
+; CHECK-NEXT: ret
+ %elt.head = insertelement <vscale x 32 x i16> poison, i16 -1, i32 0
+ %vb = shufflevector <vscale x 32 x i16> %elt.head, <vscale x 32 x i16> poison, <vscale x 32 x i32> zeroinitializer
+ %head = insertelement <vscale x 32 x i1> poison, i1 true, i32 0
+ %m = shufflevector <vscale x 32 x i1> %head, <vscale x 32 x i1> poison, <vscale x 32 x i32> zeroinitializer
+ %v = call <vscale x 32 x i16> @llvm.vp.ssub.sat.nxv32i16(<vscale x 32 x i16> %va, <vscale x 32 x i16> %vb, <vscale x 32 x i1> %m, i32 %evl)
+ ret <vscale x 32 x i16> %v
+}
+
+declare <vscale x 1 x i32> @llvm.vp.ssub.sat.nxv1i32(<vscale x 1 x i32>, <vscale x 1 x i32>, <vscale x 1 x i1>, i32)
+
+define <vscale x 1 x i32> @vssub_vv_nxv1i32(<vscale x 1 x i32> %va, <vscale x 1 x i32> %b, <vscale x 1 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vssub_vv_nxv1i32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, ma
+; CHECK-NEXT: vssub.vv v8, v8, v9, v0.t
+; CHECK-NEXT: ret
+ %v = call <vscale x 1 x i32> @llvm.vp.ssub.sat.nxv1i32(<vscale x 1 x i32> %va, <vscale x 1 x i32> %b, <vscale x 1 x i1> %m, i32 %evl)
+ ret <vscale x 1 x i32> %v
+}
+
+define <vscale x 1 x i32> @vssub_vv_nxv1i32_unmasked(<vscale x 1 x i32> %va, <vscale x 1 x i32> %b, i32 zeroext %evl) {
+; CHECK-LABEL: vssub_vv_nxv1i32_unmasked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, ma
+; CHECK-NEXT: vssub.vv v8, v8, v9
+; CHECK-NEXT: ret
+ %head = insertelement <vscale x 1 x i1> poison, i1 true, i32 0
+ %m = shufflevector <vscale x 1 x i1> %head, <vscale x 1 x i1> poison, <vscale x 1 x i32> zeroinitializer
+ %v = call <vscale x 1 x i32> @llvm.vp.ssub.sat.nxv1i32(<vscale x 1 x i32> %va, <vscale x 1 x i32> %b, <vscale x 1 x i1> %m, i32 %evl)
+ ret <vscale x 1 x i32> %v
+}
+
+define <vscale x 1 x i32> @vssub_vx_nxv1i32(<vscale x 1 x i32> %va, i32 %b, <vscale x 1 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vssub_vx_nxv1i32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma
+; CHECK-NEXT: vssub.vx v8, v8, a0, v0.t
+; CHECK-NEXT: ret
+ %elt.head = insertelement <vscale x 1 x i32> poison, i32 %b, i32 0
+ %vb = shufflevector <vscale x 1 x i32> %elt.head, <vscale x 1 x i32> poison, <vscale x 1 x i32> zeroinitializer
+ %v = call <vscale x 1 x i32> @llvm.vp.ssub.sat.nxv1i32(<vscale x 1 x i32> %va, <vscale x 1 x i32> %vb, <vscale x 1 x i1> %m, i32 %evl)
+ ret <vscale x 1 x i32> %v
+}
+
+define <vscale x 1 x i32> @vssub_vx_nxv1i32_unmasked(<vscale x 1 x i32> %va, i32 %b, i32 zeroext %evl) {
+; CHECK-LABEL: vssub_vx_nxv1i32_unmasked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma
+; CHECK-NEXT: vssub.vx v8, v8, a0
+; CHECK-NEXT: ret
+ %elt.head = insertelement <vscale x 1 x i32> poison, i32 %b, i32 0
+ %vb = shufflevector <vscale x 1 x i32> %elt.head, <vscale x 1 x i32> poison, <vscale x 1 x i32> zeroinitializer
+ %head = insertelement <vscale x 1 x i1> poison, i1 true, i32 0
+ %m = shufflevector <vscale x 1 x i1> %head, <vscale x 1 x i1> poison, <vscale x 1 x i32> zeroinitializer
+ %v = call <vscale x 1 x i32> @llvm.vp.ssub.sat.nxv1i32(<vscale x 1 x i32> %va, <vscale x 1 x i32> %vb, <vscale x 1 x i1> %m, i32 %evl)
+ ret <vscale x 1 x i32> %v
+}
+
+define <vscale x 1 x i32> @vssub_vi_nxv1i32(<vscale x 1 x i32> %va, <vscale x 1 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vssub_vi_nxv1i32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: li a1, -1
+; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, ma
+; CHECK-NEXT: vssub.vx v8, v8, a1, v0.t
+; CHECK-NEXT: ret
+ %elt.head = insertelement <vscale x 1 x i32> poison, i32 -1, i32 0
+ %vb = shufflevector <vscale x 1 x i32> %elt.head, <vscale x 1 x i32> poison, <vscale x 1 x i32> zeroinitializer
+ %v = call <vscale x 1 x i32> @llvm.vp.ssub.sat.nxv1i32(<vscale x 1 x i32> %va, <vscale x 1 x i32> %vb, <vscale x 1 x i1> %m, i32 %evl)
+ ret <vscale x 1 x i32> %v
+}
+
+define <vscale x 1 x i32> @vssub_vi_nxv1i32_unmasked(<vscale x 1 x i32> %va, i32 zeroext %evl) {
+; CHECK-LABEL: vssub_vi_nxv1i32_unmasked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: li a1, -1
+; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, ma
+; CHECK-NEXT: vssub.vx v8, v8, a1
+; CHECK-NEXT: ret
+ %elt.head = insertelement <vscale x 1 x i32> poison, i32 -1, i32 0
+ %vb = shufflevector <vscale x 1 x i32> %elt.head, <vscale x 1 x i32> poison, <vscale x 1 x i32> zeroinitializer
+ %head = insertelement <vscale x 1 x i1> poison, i1 true, i32 0
+ %m = shufflevector <vscale x 1 x i1> %head, <vscale x 1 x i1> poison, <vscale x 1 x i32> zeroinitializer
+ %v = call <vscale x 1 x i32> @llvm.vp.ssub.sat.nxv1i32(<vscale x 1 x i32> %va, <vscale x 1 x i32> %vb, <vscale x 1 x i1> %m, i32 %evl)
+ ret <vscale x 1 x i32> %v
+}
+
+declare <vscale x 2 x i32> @llvm.vp.ssub.sat.nxv2i32(<vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i1>, i32)
+
+define <vscale x 2 x i32> @vssub_vv_nxv2i32(<vscale x 2 x i32> %va, <vscale x 2 x i32> %b, <vscale x 2 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vssub_vv_nxv2i32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, ma
+; CHECK-NEXT: vssub.vv v8, v8, v9, v0.t
+; CHECK-NEXT: ret
+ %v = call <vscale x 2 x i32> @llvm.vp.ssub.sat.nxv2i32(<vscale x 2 x i32> %va, <vscale x 2 x i32> %b, <vscale x 2 x i1> %m, i32 %evl)
+ ret <vscale x 2 x i32> %v
+}
+
+define <vscale x 2 x i32> @vssub_vv_nxv2i32_unmasked(<vscale x 2 x i32> %va, <vscale x 2 x i32> %b, i32 zeroext %evl) {
+; CHECK-LABEL: vssub_vv_nxv2i32_unmasked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, ma
+; CHECK-NEXT: vssub.vv v8, v8, v9
+; CHECK-NEXT: ret
+ %head = insertelement <vscale x 2 x i1> poison, i1 true, i32 0
+ %m = shufflevector <vscale x 2 x i1> %head, <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer
+ %v = call <vscale x 2 x i32> @llvm.vp.ssub.sat.nxv2i32(<vscale x 2 x i32> %va, <vscale x 2 x i32> %b, <vscale x 2 x i1> %m, i32 %evl)
+ ret <vscale x 2 x i32> %v
+}
+
+define <vscale x 2 x i32> @vssub_vx_nxv2i32(<vscale x 2 x i32> %va, i32 %b, <vscale x 2 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vssub_vx_nxv2i32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma
+; CHECK-NEXT: vssub.vx v8, v8, a0, v0.t
+; CHECK-NEXT: ret
+ %elt.head = insertelement <vscale x 2 x i32> poison, i32 %b, i32 0
+ %vb = shufflevector <vscale x 2 x i32> %elt.head, <vscale x 2 x i32> poison, <vscale x 2 x i32> zeroinitializer
+ %v = call <vscale x 2 x i32> @llvm.vp.ssub.sat.nxv2i32(<vscale x 2 x i32> %va, <vscale x 2 x i32> %vb, <vscale x 2 x i1> %m, i32 %evl)
+ ret <vscale x 2 x i32> %v
+}
+
+define <vscale x 2 x i32> @vssub_vx_nxv2i32_unmasked(<vscale x 2 x i32> %va, i32 %b, i32 zeroext %evl) {
+; CHECK-LABEL: vssub_vx_nxv2i32_unmasked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma
+; CHECK-NEXT: vssub.vx v8, v8, a0
+; CHECK-NEXT: ret
+ %elt.head = insertelement <vscale x 2 x i32> poison, i32 %b, i32 0
+ %vb = shufflevector <vscale x 2 x i32> %elt.head, <vscale x 2 x i32> poison, <vscale x 2 x i32> zeroinitializer
+ %head = insertelement <vscale x 2 x i1> poison, i1 true, i32 0
+ %m = shufflevector <vscale x 2 x i1> %head, <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer
+ %v = call <vscale x 2 x i32> @llvm.vp.ssub.sat.nxv2i32(<vscale x 2 x i32> %va, <vscale x 2 x i32> %vb, <vscale x 2 x i1> %m, i32 %evl)
+ ret <vscale x 2 x i32> %v
+}
+
+define <vscale x 2 x i32> @vssub_vi_nxv2i32(<vscale x 2 x i32> %va, <vscale x 2 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vssub_vi_nxv2i32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: li a1, -1
+; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, ma
+; CHECK-NEXT: vssub.vx v8, v8, a1, v0.t
+; CHECK-NEXT: ret
+ %elt.head = insertelement <vscale x 2 x i32> poison, i32 -1, i32 0
+ %vb = shufflevector <vscale x 2 x i32> %elt.head, <vscale x 2 x i32> poison, <vscale x 2 x i32> zeroinitializer
+ %v = call <vscale x 2 x i32> @llvm.vp.ssub.sat.nxv2i32(<vscale x 2 x i32> %va, <vscale x 2 x i32> %vb, <vscale x 2 x i1> %m, i32 %evl)
+ ret <vscale x 2 x i32> %v
+}
+
+define <vscale x 2 x i32> @vssub_vi_nxv2i32_unmasked(<vscale x 2 x i32> %va, i32 zeroext %evl) {
+; CHECK-LABEL: vssub_vi_nxv2i32_unmasked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: li a1, -1
+; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, ma
+; CHECK-NEXT: vssub.vx v8, v8, a1
+; CHECK-NEXT: ret
+ %elt.head = insertelement <vscale x 2 x i32> poison, i32 -1, i32 0
+ %vb = shufflevector <vscale x 2 x i32> %elt.head, <vscale x 2 x i32> poison, <vscale x 2 x i32> zeroinitializer
+ %head = insertelement <vscale x 2 x i1> poison, i1 true, i32 0
+ %m = shufflevector <vscale x 2 x i1> %head, <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer
+ %v = call <vscale x 2 x i32> @llvm.vp.ssub.sat.nxv2i32(<vscale x 2 x i32> %va, <vscale x 2 x i32> %vb, <vscale x 2 x i1> %m, i32 %evl)
+ ret <vscale x 2 x i32> %v
+}
+
+declare <vscale x 4 x i32> @llvm.vp.ssub.sat.nxv4i32(<vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i1>, i32)
+
+define <vscale x 4 x i32> @vssub_vv_nxv4i32(<vscale x 4 x i32> %va, <vscale x 4 x i32> %b, <vscale x 4 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vssub_vv_nxv4i32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, ma
+; CHECK-NEXT: vssub.vv v8, v8, v10, v0.t
+; CHECK-NEXT: ret
+ %v = call <vscale x 4 x i32> @llvm.vp.ssub.sat.nxv4i32(<vscale x 4 x i32> %va, <vscale x 4 x i32> %b, <vscale x 4 x i1> %m, i32 %evl)
+ ret <vscale x 4 x i32> %v
+}
+
+define <vscale x 4 x i32> @vssub_vv_nxv4i32_unmasked(<vscale x 4 x i32> %va, <vscale x 4 x i32> %b, i32 zeroext %evl) {
+; CHECK-LABEL: vssub_vv_nxv4i32_unmasked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, ma
+; CHECK-NEXT: vssub.vv v8, v8, v10
+; CHECK-NEXT: ret
+ %head = insertelement <vscale x 4 x i1> poison, i1 true, i32 0
+ %m = shufflevector <vscale x 4 x i1> %head, <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer
+ %v = call <vscale x 4 x i32> @llvm.vp.ssub.sat.nxv4i32(<vscale x 4 x i32> %va, <vscale x 4 x i32> %b, <vscale x 4 x i1> %m, i32 %evl)
+ ret <vscale x 4 x i32> %v
+}
+
+define <vscale x 4 x i32> @vssub_vx_nxv4i32(<vscale x 4 x i32> %va, i32 %b, <vscale x 4 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vssub_vx_nxv4i32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, ma
+; CHECK-NEXT: vssub.vx v8, v8, a0, v0.t
+; CHECK-NEXT: ret
+ %elt.head = insertelement <vscale x 4 x i32> poison, i32 %b, i32 0
+ %vb = shufflevector <vscale x 4 x i32> %elt.head, <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
+ %v = call <vscale x 4 x i32> @llvm.vp.ssub.sat.nxv4i32(<vscale x 4 x i32> %va, <vscale x 4 x i32> %vb, <vscale x 4 x i1> %m, i32 %evl)
+ ret <vscale x 4 x i32> %v
+}
+
+define <vscale x 4 x i32> @vssub_vx_nxv4i32_unmasked(<vscale x 4 x i32> %va, i32 %b, i32 zeroext %evl) {
+; CHECK-LABEL: vssub_vx_nxv4i32_unmasked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, ma
+; CHECK-NEXT: vssub.vx v8, v8, a0
+; CHECK-NEXT: ret
+ %elt.head = insertelement <vscale x 4 x i32> poison, i32 %b, i32 0
+ %vb = shufflevector <vscale x 4 x i32> %elt.head, <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
+ %head = insertelement <vscale x 4 x i1> poison, i1 true, i32 0
+ %m = shufflevector <vscale x 4 x i1> %head, <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer
+ %v = call <vscale x 4 x i32> @llvm.vp.ssub.sat.nxv4i32(<vscale x 4 x i32> %va, <vscale x 4 x i32> %vb, <vscale x 4 x i1> %m, i32 %evl)
+ ret <vscale x 4 x i32> %v
+}
+
+define <vscale x 4 x i32> @vssub_vi_nxv4i32(<vscale x 4 x i32> %va, <vscale x 4 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vssub_vi_nxv4i32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: li a1, -1
+; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, ma
+; CHECK-NEXT: vssub.vx v8, v8, a1, v0.t
+; CHECK-NEXT: ret
+ %elt.head = insertelement <vscale x 4 x i32> poison, i32 -1, i32 0
+ %vb = shufflevector <vscale x 4 x i32> %elt.head, <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
+ %v = call <vscale x 4 x i32> @llvm.vp.ssub.sat.nxv4i32(<vscale x 4 x i32> %va, <vscale x 4 x i32> %vb, <vscale x 4 x i1> %m, i32 %evl)
+ ret <vscale x 4 x i32> %v
+}
+
+define <vscale x 4 x i32> @vssub_vi_nxv4i32_unmasked(<vscale x 4 x i32> %va, i32 zeroext %evl) {
+; CHECK-LABEL: vssub_vi_nxv4i32_unmasked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: li a1, -1
+; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, ma
+; CHECK-NEXT: vssub.vx v8, v8, a1
+; CHECK-NEXT: ret
+ %elt.head = insertelement <vscale x 4 x i32> poison, i32 -1, i32 0
+ %vb = shufflevector <vscale x 4 x i32> %elt.head, <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
+ %head = insertelement <vscale x 4 x i1> poison, i1 true, i32 0
+ %m = shufflevector <vscale x 4 x i1> %head, <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer
+ %v = call <vscale x 4 x i32> @llvm.vp.ssub.sat.nxv4i32(<vscale x 4 x i32> %va, <vscale x 4 x i32> %vb, <vscale x 4 x i1> %m, i32 %evl)
+ ret <vscale x 4 x i32> %v
+}
+
+declare <vscale x 8 x i32> @llvm.vp.ssub.sat.nxv8i32(<vscale x 8 x i32>, <vscale x 8 x i32>, <vscale x 8 x i1>, i32)
+
+define <vscale x 8 x i32> @vssub_vv_nxv8i32(<vscale x 8 x i32> %va, <vscale x 8 x i32> %b, <vscale x 8 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vssub_vv_nxv8i32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, ma
+; CHECK-NEXT: vssub.vv v8, v8, v12, v0.t
+; CHECK-NEXT: ret
+ %v = call <vscale x 8 x i32> @llvm.vp.ssub.sat.nxv8i32(<vscale x 8 x i32> %va, <vscale x 8 x i32> %b, <vscale x 8 x i1> %m, i32 %evl)
+ ret <vscale x 8 x i32> %v
+}
+
+define <vscale x 8 x i32> @vssub_vv_nxv8i32_unmasked(<vscale x 8 x i32> %va, <vscale x 8 x i32> %b, i32 zeroext %evl) {
+; CHECK-LABEL: vssub_vv_nxv8i32_unmasked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, ma
+; CHECK-NEXT: vssub.vv v8, v8, v12
+; CHECK-NEXT: ret
+ %head = insertelement <vscale x 8 x i1> poison, i1 true, i32 0
+ %m = shufflevector <vscale x 8 x i1> %head, <vscale x 8 x i1> poison, <vscale x 8 x i32> zeroinitializer
+ %v = call <vscale x 8 x i32> @llvm.vp.ssub.sat.nxv8i32(<vscale x 8 x i32> %va, <vscale x 8 x i32> %b, <vscale x 8 x i1> %m, i32 %evl)
+ ret <vscale x 8 x i32> %v
+}
+
+define <vscale x 8 x i32> @vssub_vx_nxv8i32(<vscale x 8 x i32> %va, i32 %b, <vscale x 8 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vssub_vx_nxv8i32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e32, m4, ta, ma
+; CHECK-NEXT: vssub.vx v8, v8, a0, v0.t
+; CHECK-NEXT: ret
+ %elt.head = insertelement <vscale x 8 x i32> poison, i32 %b, i32 0
+ %vb = shufflevector <vscale x 8 x i32> %elt.head, <vscale x 8 x i32> poison, <vscale x 8 x i32> zeroinitializer
+ %v = call <vscale x 8 x i32> @llvm.vp.ssub.sat.nxv8i32(<vscale x 8 x i32> %va, <vscale x 8 x i32> %vb, <vscale x 8 x i1> %m, i32 %evl)
+ ret <vscale x 8 x i32> %v
+}
+
+define <vscale x 8 x i32> @vssub_vx_nxv8i32_unmasked(<vscale x 8 x i32> %va, i32 %b, i32 zeroext %evl) {
+; CHECK-LABEL: vssub_vx_nxv8i32_unmasked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e32, m4, ta, ma
+; CHECK-NEXT: vssub.vx v8, v8, a0
+; CHECK-NEXT: ret
+ %elt.head = insertelement <vscale x 8 x i32> poison, i32 %b, i32 0
+ %vb = shufflevector <vscale x 8 x i32> %elt.head, <vscale x 8 x i32> poison, <vscale x 8 x i32> zeroinitializer
+ %head = insertelement <vscale x 8 x i1> poison, i1 true, i32 0
+ %m = shufflevector <vscale x 8 x i1> %head, <vscale x 8 x i1> poison, <vscale x 8 x i32> zeroinitializer
+ %v = call <vscale x 8 x i32> @llvm.vp.ssub.sat.nxv8i32(<vscale x 8 x i32> %va, <vscale x 8 x i32> %vb, <vscale x 8 x i1> %m, i32 %evl)
+ ret <vscale x 8 x i32> %v
+}
+
+define <vscale x 8 x i32> @vssub_vi_nxv8i32(<vscale x 8 x i32> %va, <vscale x 8 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vssub_vi_nxv8i32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: li a1, -1
+; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, ma
+; CHECK-NEXT: vssub.vx v8, v8, a1, v0.t
+; CHECK-NEXT: ret
+ %elt.head = insertelement <vscale x 8 x i32> poison, i32 -1, i32 0
+ %vb = shufflevector <vscale x 8 x i32> %elt.head, <vscale x 8 x i32> poison, <vscale x 8 x i32> zeroinitializer
+ %v = call <vscale x 8 x i32> @llvm.vp.ssub.sat.nxv8i32(<vscale x 8 x i32> %va, <vscale x 8 x i32> %vb, <vscale x 8 x i1> %m, i32 %evl)
+ ret <vscale x 8 x i32> %v
+}
+
+define <vscale x 8 x i32> @vssub_vi_nxv8i32_unmasked(<vscale x 8 x i32> %va, i32 zeroext %evl) {
+; CHECK-LABEL: vssub_vi_nxv8i32_unmasked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: li a1, -1
+; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, ma
+; CHECK-NEXT: vssub.vx v8, v8, a1
+; CHECK-NEXT: ret
+ %elt.head = insertelement <vscale x 8 x i32> poison, i32 -1, i32 0
+ %vb = shufflevector <vscale x 8 x i32> %elt.head, <vscale x 8 x i32> poison, <vscale x 8 x i32> zeroinitializer
+ %head = insertelement <vscale x 8 x i1> poison, i1 true, i32 0
+ %m = shufflevector <vscale x 8 x i1> %head, <vscale x 8 x i1> poison, <vscale x 8 x i32> zeroinitializer
+ %v = call <vscale x 8 x i32> @llvm.vp.ssub.sat.nxv8i32(<vscale x 8 x i32> %va, <vscale x 8 x i32> %vb, <vscale x 8 x i1> %m, i32 %evl)
+ ret <vscale x 8 x i32> %v
+}
+
+declare <vscale x 16 x i32> @llvm.vp.ssub.sat.nxv16i32(<vscale x 16 x i32>, <vscale x 16 x i32>, <vscale x 16 x i1>, i32)
+
+define <vscale x 16 x i32> @vssub_vv_nxv16i32(<vscale x 16 x i32> %va, <vscale x 16 x i32> %b, <vscale x 16 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vssub_vv_nxv16i32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e32, m8, ta, ma
+; CHECK-NEXT: vssub.vv v8, v8, v16, v0.t
+; CHECK-NEXT: ret
+ %v = call <vscale x 16 x i32> @llvm.vp.ssub.sat.nxv16i32(<vscale x 16 x i32> %va, <vscale x 16 x i32> %b, <vscale x 16 x i1> %m, i32 %evl)
+ ret <vscale x 16 x i32> %v
+}
+
+define <vscale x 16 x i32> @vssub_vv_nxv16i32_unmasked(<vscale x 16 x i32> %va, <vscale x 16 x i32> %b, i32 zeroext %evl) {
+; CHECK-LABEL: vssub_vv_nxv16i32_unmasked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e32, m8, ta, ma
+; CHECK-NEXT: vssub.vv v8, v8, v16
+; CHECK-NEXT: ret
+ %head = insertelement <vscale x 16 x i1> poison, i1 true, i32 0
+ %m = shufflevector <vscale x 16 x i1> %head, <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer
+ %v = call <vscale x 16 x i32> @llvm.vp.ssub.sat.nxv16i32(<vscale x 16 x i32> %va, <vscale x 16 x i32> %b, <vscale x 16 x i1> %m, i32 %evl)
+ ret <vscale x 16 x i32> %v
+}
+
+define <vscale x 16 x i32> @vssub_vx_nxv16i32(<vscale x 16 x i32> %va, i32 %b, <vscale x 16 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vssub_vx_nxv16i32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, ma
+; CHECK-NEXT: vssub.vx v8, v8, a0, v0.t
+; CHECK-NEXT: ret
+ %elt.head = insertelement <vscale x 16 x i32> poison, i32 %b, i32 0
+ %vb = shufflevector <vscale x 16 x i32> %elt.head, <vscale x 16 x i32> poison, <vscale x 16 x i32> zeroinitializer
+ %v = call <vscale x 16 x i32> @llvm.vp.ssub.sat.nxv16i32(<vscale x 16 x i32> %va, <vscale x 16 x i32> %vb, <vscale x 16 x i1> %m, i32 %evl)
+ ret <vscale x 16 x i32> %v
+}
+
+define <vscale x 16 x i32> @vssub_vx_nxv16i32_unmasked(<vscale x 16 x i32> %va, i32 %b, i32 zeroext %evl) {
+; CHECK-LABEL: vssub_vx_nxv16i32_unmasked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, ma
+; CHECK-NEXT: vssub.vx v8, v8, a0
+; CHECK-NEXT: ret
+ %elt.head = insertelement <vscale x 16 x i32> poison, i32 %b, i32 0
+ %vb = shufflevector <vscale x 16 x i32> %elt.head, <vscale x 16 x i32> poison, <vscale x 16 x i32> zeroinitializer
+ %head = insertelement <vscale x 16 x i1> poison, i1 true, i32 0
+ %m = shufflevector <vscale x 16 x i1> %head, <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer
+ %v = call <vscale x 16 x i32> @llvm.vp.ssub.sat.nxv16i32(<vscale x 16 x i32> %va, <vscale x 16 x i32> %vb, <vscale x 16 x i1> %m, i32 %evl)
+ ret <vscale x 16 x i32> %v
+}
+
+define <vscale x 16 x i32> @vssub_vi_nxv16i32(<vscale x 16 x i32> %va, <vscale x 16 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vssub_vi_nxv16i32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: li a1, -1
+; CHECK-NEXT: vsetvli zero, a0, e32, m8, ta, ma
+; CHECK-NEXT: vssub.vx v8, v8, a1, v0.t
+; CHECK-NEXT: ret
+ %elt.head = insertelement <vscale x 16 x i32> poison, i32 -1, i32 0
+ %vb = shufflevector <vscale x 16 x i32> %elt.head, <vscale x 16 x i32> poison, <vscale x 16 x i32> zeroinitializer
+ %v = call <vscale x 16 x i32> @llvm.vp.ssub.sat.nxv16i32(<vscale x 16 x i32> %va, <vscale x 16 x i32> %vb, <vscale x 16 x i1> %m, i32 %evl)
+ ret <vscale x 16 x i32> %v
+}
+
+define <vscale x 16 x i32> @vssub_vi_nxv16i32_unmasked(<vscale x 16 x i32> %va, i32 zeroext %evl) {
+; CHECK-LABEL: vssub_vi_nxv16i32_unmasked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: li a1, -1
+; CHECK-NEXT: vsetvli zero, a0, e32, m8, ta, ma
+; CHECK-NEXT: vssub.vx v8, v8, a1
+; CHECK-NEXT: ret
+ %elt.head = insertelement <vscale x 16 x i32> poison, i32 -1, i32 0
+ %vb = shufflevector <vscale x 16 x i32> %elt.head, <vscale x 16 x i32> poison, <vscale x 16 x i32> zeroinitializer
+ %head = insertelement <vscale x 16 x i1> poison, i1 true, i32 0
+ %m = shufflevector <vscale x 16 x i1> %head, <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer
+ %v = call <vscale x 16 x i32> @llvm.vp.ssub.sat.nxv16i32(<vscale x 16 x i32> %va, <vscale x 16 x i32> %vb, <vscale x 16 x i1> %m, i32 %evl)
+ ret <vscale x 16 x i32> %v
+}
+
+; Test that split-legalization works when the mask needs manual splitting.
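+; (Explanatory note: <vscale x 32 x i32> is wider than the largest legal type at
+; SEW=32, LMUL=8 (<vscale x 16 x i32>), so the operation is legalized as two m8
+; halves. The checks below show the EVL and the <vscale x 32 x i1> mask being
+; split as well: the high half's mask is extracted from v0 with vslidedown.vx
+; before the second vssub.vx.)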
+
+declare <vscale x 32 x i32> @llvm.vp.ssub.sat.nxv32i32(<vscale x 32 x i32>, <vscale x 32 x i32>, <vscale x 32 x i1>, i32)
+
+define <vscale x 32 x i32> @vssub_vi_nxv32i32(<vscale x 32 x i32> %va, <vscale x 32 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vssub_vi_nxv32i32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmv1r.v v24, v0
+; CHECK-NEXT: csrr a2, vlenb
+; CHECK-NEXT: srli a1, a2, 2
+; CHECK-NEXT: vsetvli a3, zero, e8, mf2, ta, ma
+; CHECK-NEXT: vslidedown.vx v0, v0, a1
+; CHECK-NEXT: slli a2, a2, 1
+; CHECK-NEXT: sub a1, a0, a2
+; CHECK-NEXT: sltu a3, a0, a1
+; CHECK-NEXT: addi a3, a3, -1
+; CHECK-NEXT: and a3, a3, a1
+; CHECK-NEXT: li a1, -1
+; CHECK-NEXT: vsetvli zero, a3, e32, m8, ta, ma
+; CHECK-NEXT: vssub.vx v16, v16, a1, v0.t
+; CHECK-NEXT: bltu a0, a2, .LBB118_2
+; CHECK-NEXT: # %bb.1:
+; CHECK-NEXT: mv a0, a2
+; CHECK-NEXT: .LBB118_2:
+; CHECK-NEXT: vsetvli zero, a0, e32, m8, ta, ma
+; CHECK-NEXT: vmv1r.v v0, v24
+; CHECK-NEXT: vssub.vx v8, v8, a1, v0.t
+; CHECK-NEXT: ret
+ %elt.head = insertelement <vscale x 32 x i32> poison, i32 -1, i32 0
+ %vb = shufflevector <vscale x 32 x i32> %elt.head, <vscale x 32 x i32> poison, <vscale x 32 x i32> zeroinitializer
+ %v = call <vscale x 32 x i32> @llvm.vp.ssub.sat.nxv32i32(<vscale x 32 x i32> %va, <vscale x 32 x i32> %vb, <vscale x 32 x i1> %m, i32 %evl)
+ ret <vscale x 32 x i32> %v
+}
+
+define <vscale x 32 x i32> @vssub_vi_nxv32i32_unmasked(<vscale x 32 x i32> %va, i32 zeroext %evl) {
+; CHECK-LABEL: vssub_vi_nxv32i32_unmasked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: csrr a1, vlenb
+; CHECK-NEXT: slli a1, a1, 1
+; CHECK-NEXT: sub a2, a0, a1
+; CHECK-NEXT: sltu a3, a0, a2
+; CHECK-NEXT: addi a3, a3, -1
+; CHECK-NEXT: and a3, a3, a2
+; CHECK-NEXT: li a2, -1
+; CHECK-NEXT: vsetvli zero, a3, e32, m8, ta, ma
+; CHECK-NEXT: vssub.vx v16, v16, a2
+; CHECK-NEXT: bltu a0, a1, .LBB119_2
+; CHECK-NEXT: # %bb.1:
+; CHECK-NEXT: mv a0, a1
+; CHECK-NEXT: .LBB119_2:
+; CHECK-NEXT: vsetvli zero, a0, e32, m8, ta, ma
+; CHECK-NEXT: vssub.vx v8, v8, a2
+; CHECK-NEXT: ret
+ %elt.head = insertelement <vscale x 32 x i32> poison, i32 -1, i32 0
+ %vb = shufflevector <vscale x 32 x i32> %elt.head, <vscale x 32 x i32> poison, <vscale x 32 x i32> zeroinitializer
+ %head = insertelement <vscale x 32 x i1> poison, i1 true, i32 0
+ %m = shufflevector <vscale x 32 x i1> %head, <vscale x 32 x i1> poison, <vscale x 32 x i32> zeroinitializer
+ %v = call <vscale x 32 x i32> @llvm.vp.ssub.sat.nxv32i32(<vscale x 32 x i32> %va, <vscale x 32 x i32> %vb, <vscale x 32 x i1> %m, i32 %evl)
+ ret <vscale x 32 x i32> %v
+}
+
+declare <vscale x 1 x i64> @llvm.vp.ssub.sat.nxv1i64(<vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i1>, i32)
+
+define <vscale x 1 x i64> @vssub_vv_nxv1i64(<vscale x 1 x i64> %va, <vscale x 1 x i64> %b, <vscale x 1 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vssub_vv_nxv1i64:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, ma
+; CHECK-NEXT: vssub.vv v8, v8, v9, v0.t
+; CHECK-NEXT: ret
+ %v = call <vscale x 1 x i64> @llvm.vp.ssub.sat.nxv1i64(<vscale x 1 x i64> %va, <vscale x 1 x i64> %b, <vscale x 1 x i1> %m, i32 %evl)
+ ret <vscale x 1 x i64> %v
+}
+
+define <vscale x 1 x i64> @vssub_vv_nxv1i64_unmasked(<vscale x 1 x i64> %va, <vscale x 1 x i64> %b, i32 zeroext %evl) {
+; CHECK-LABEL: vssub_vv_nxv1i64_unmasked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, ma
+; CHECK-NEXT: vssub.vv v8, v8, v9
+; CHECK-NEXT: ret
+ %head = insertelement <vscale x 1 x i1> poison, i1 true, i32 0
+ %m = shufflevector <vscale x 1 x i1> %head, <vscale x 1 x i1> poison, <vscale x 1 x i32> zeroinitializer
+ %v = call <vscale x 1 x i64> @llvm.vp.ssub.sat.nxv1i64(<vscale x 1 x i64> %va, <vscale x 1 x i64> %b, <vscale x 1 x i1> %m, i32 %evl)
+ ret <vscale x 1 x i64> %v
+}
+
+define <vscale x 1 x i64> @vssub_vx_nxv1i64(<vscale x 1 x i64> %va, i64 %b, <vscale x 1 x i1> %m, i32 zeroext %evl) {
+; RV32-LABEL: vssub_vx_nxv1i64:
+; RV32: # %bb.0:
+; RV32-NEXT: addi sp, sp, -16
+; RV32-NEXT: .cfi_def_cfa_offset 16
+; RV32-NEXT: sw a1, 12(sp)
+; RV32-NEXT: sw a0, 8(sp)
+; RV32-NEXT: addi a0, sp, 8
+; RV32-NEXT: vsetvli a1, zero, e64, m1, ta, ma
+; RV32-NEXT: vlse64.v v9, (a0), zero
+; RV32-NEXT: vsetvli zero, a2, e64, m1, ta, ma
+; RV32-NEXT: vssub.vv v8, v8, v9, v0.t
+; RV32-NEXT: addi sp, sp, 16
+; RV32-NEXT: ret
+;
+; RV64-LABEL: vssub_vx_nxv1i64:
+; RV64: # %bb.0:
+; RV64-NEXT: vsetvli zero, a1, e64, m1, ta, ma
+; RV64-NEXT: vssub.vx v8, v8, a0, v0.t
+; RV64-NEXT: ret
+ %elt.head = insertelement <vscale x 1 x i64> poison, i64 %b, i32 0
+ %vb = shufflevector <vscale x 1 x i64> %elt.head, <vscale x 1 x i64> poison, <vscale x 1 x i32> zeroinitializer
+ %v = call <vscale x 1 x i64> @llvm.vp.ssub.sat.nxv1i64(<vscale x 1 x i64> %va, <vscale x 1 x i64> %vb, <vscale x 1 x i1> %m, i32 %evl)
+ ret <vscale x 1 x i64> %v
+}
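+
+; Note: on RV32 there is no 64-bit GPR, so the i64 scalar operand is spilled to
+; the stack and splatted with a zero-strided vlse64.v, and the vector-vector
+; form (vssub.vv) is used; RV64 can pass the scalar directly via vssub.vx.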
+
+define <vscale x 1 x i64> @vssub_vx_nxv1i64_unmasked(<vscale x 1 x i64> %va, i64 %b, i32 zeroext %evl) {
+; RV32-LABEL: vssub_vx_nxv1i64_unmasked:
+; RV32: # %bb.0:
+; RV32-NEXT: addi sp, sp, -16
+; RV32-NEXT: .cfi_def_cfa_offset 16
+; RV32-NEXT: sw a1, 12(sp)
+; RV32-NEXT: sw a0, 8(sp)
+; RV32-NEXT: addi a0, sp, 8
+; RV32-NEXT: vsetvli a1, zero, e64, m1, ta, ma
+; RV32-NEXT: vlse64.v v9, (a0), zero
+; RV32-NEXT: vsetvli zero, a2, e64, m1, ta, ma
+; RV32-NEXT: vssub.vv v8, v8, v9
+; RV32-NEXT: addi sp, sp, 16
+; RV32-NEXT: ret
+;
+; RV64-LABEL: vssub_vx_nxv1i64_unmasked:
+; RV64: # %bb.0:
+; RV64-NEXT: vsetvli zero, a1, e64, m1, ta, ma
+; RV64-NEXT: vssub.vx v8, v8, a0
+; RV64-NEXT: ret
+ %elt.head = insertelement <vscale x 1 x i64> poison, i64 %b, i32 0
+ %vb = shufflevector <vscale x 1 x i64> %elt.head, <vscale x 1 x i64> poison, <vscale x 1 x i32> zeroinitializer
+ %head = insertelement <vscale x 1 x i1> poison, i1 true, i32 0
+ %m = shufflevector <vscale x 1 x i1> %head, <vscale x 1 x i1> poison, <vscale x 1 x i32> zeroinitializer
+ %v = call <vscale x 1 x i64> @llvm.vp.ssub.sat.nxv1i64(<vscale x 1 x i64> %va, <vscale x 1 x i64> %vb, <vscale x 1 x i1> %m, i32 %evl)
+ ret <vscale x 1 x i64> %v
+}
+
+define <vscale x 1 x i64> @vssub_vi_nxv1i64(<vscale x 1 x i64> %va, <vscale x 1 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vssub_vi_nxv1i64:
+; CHECK: # %bb.0:
+; CHECK-NEXT: li a1, -1
+; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, ma
+; CHECK-NEXT: vssub.vx v8, v8, a1, v0.t
+; CHECK-NEXT: ret
+ %elt.head = insertelement <vscale x 1 x i64> poison, i64 -1, i32 0
+ %vb = shufflevector <vscale x 1 x i64> %elt.head, <vscale x 1 x i64> poison, <vscale x 1 x i32> zeroinitializer
+ %v = call <vscale x 1 x i64> @llvm.vp.ssub.sat.nxv1i64(<vscale x 1 x i64> %va, <vscale x 1 x i64> %vb, <vscale x 1 x i1> %m, i32 %evl)
+ ret <vscale x 1 x i64> %v
+}
+
+define <vscale x 1 x i64> @vssub_vi_nxv1i64_unmasked(<vscale x 1 x i64> %va, i32 zeroext %evl) {
+; CHECK-LABEL: vssub_vi_nxv1i64_unmasked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: li a1, -1
+; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, ma
+; CHECK-NEXT: vssub.vx v8, v8, a1
+; CHECK-NEXT: ret
+ %elt.head = insertelement <vscale x 1 x i64> poison, i64 -1, i32 0
+ %vb = shufflevector <vscale x 1 x i64> %elt.head, <vscale x 1 x i64> poison, <vscale x 1 x i32> zeroinitializer
+ %head = insertelement <vscale x 1 x i1> poison, i1 true, i32 0
+ %m = shufflevector <vscale x 1 x i1> %head, <vscale x 1 x i1> poison, <vscale x 1 x i32> zeroinitializer
+ %v = call <vscale x 1 x i64> @llvm.vp.ssub.sat.nxv1i64(<vscale x 1 x i64> %va, <vscale x 1 x i64> %vb, <vscale x 1 x i1> %m, i32 %evl)
+ ret <vscale x 1 x i64> %v
+}
+
+declare <vscale x 2 x i64> @llvm.vp.ssub.sat.nxv2i64(<vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i1>, i32)
+
+define <vscale x 2 x i64> @vssub_vv_nxv2i64(<vscale x 2 x i64> %va, <vscale x 2 x i64> %b, <vscale x 2 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vssub_vv_nxv2i64:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e64, m2, ta, ma
+; CHECK-NEXT: vssub.vv v8, v8, v10, v0.t
+; CHECK-NEXT: ret
+ %v = call <vscale x 2 x i64> @llvm.vp.ssub.sat.nxv2i64(<vscale x 2 x i64> %va, <vscale x 2 x i64> %b, <vscale x 2 x i1> %m, i32 %evl)
+ ret <vscale x 2 x i64> %v
+}
+
+define <vscale x 2 x i64> @vssub_vv_nxv2i64_unmasked(<vscale x 2 x i64> %va, <vscale x 2 x i64> %b, i32 zeroext %evl) {
+; CHECK-LABEL: vssub_vv_nxv2i64_unmasked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e64, m2, ta, ma
+; CHECK-NEXT: vssub.vv v8, v8, v10
+; CHECK-NEXT: ret
+ %head = insertelement <vscale x 2 x i1> poison, i1 true, i32 0
+ %m = shufflevector <vscale x 2 x i1> %head, <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer
+ %v = call <vscale x 2 x i64> @llvm.vp.ssub.sat.nxv2i64(<vscale x 2 x i64> %va, <vscale x 2 x i64> %b, <vscale x 2 x i1> %m, i32 %evl)
+ ret <vscale x 2 x i64> %v
+}
+
+define <vscale x 2 x i64> @vssub_vx_nxv2i64(<vscale x 2 x i64> %va, i64 %b, <vscale x 2 x i1> %m, i32 zeroext %evl) {
+; RV32-LABEL: vssub_vx_nxv2i64:
+; RV32: # %bb.0:
+; RV32-NEXT: addi sp, sp, -16
+; RV32-NEXT: .cfi_def_cfa_offset 16
+; RV32-NEXT: sw a1, 12(sp)
+; RV32-NEXT: sw a0, 8(sp)
+; RV32-NEXT: addi a0, sp, 8
+; RV32-NEXT: vsetvli a1, zero, e64, m2, ta, ma
+; RV32-NEXT: vlse64.v v10, (a0), zero
+; RV32-NEXT: vsetvli zero, a2, e64, m2, ta, ma
+; RV32-NEXT: vssub.vv v8, v8, v10, v0.t
+; RV32-NEXT: addi sp, sp, 16
+; RV32-NEXT: ret
+;
+; RV64-LABEL: vssub_vx_nxv2i64:
+; RV64: # %bb.0:
+; RV64-NEXT: vsetvli zero, a1, e64, m2, ta, ma
+; RV64-NEXT: vssub.vx v8, v8, a0, v0.t
+; RV64-NEXT: ret
+ %elt.head = insertelement <vscale x 2 x i64> poison, i64 %b, i32 0
+ %vb = shufflevector <vscale x 2 x i64> %elt.head, <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer
+ %v = call <vscale x 2 x i64> @llvm.vp.ssub.sat.nxv2i64(<vscale x 2 x i64> %va, <vscale x 2 x i64> %vb, <vscale x 2 x i1> %m, i32 %evl)
+ ret <vscale x 2 x i64> %v
+}
+
+define <vscale x 2 x i64> @vssub_vx_nxv2i64_unmasked(<vscale x 2 x i64> %va, i64 %b, i32 zeroext %evl) {
+; RV32-LABEL: vssub_vx_nxv2i64_unmasked:
+; RV32: # %bb.0:
+; RV32-NEXT: addi sp, sp, -16
+; RV32-NEXT: .cfi_def_cfa_offset 16
+; RV32-NEXT: sw a1, 12(sp)
+; RV32-NEXT: sw a0, 8(sp)
+; RV32-NEXT: addi a0, sp, 8
+; RV32-NEXT: vsetvli a1, zero, e64, m2, ta, ma
+; RV32-NEXT: vlse64.v v10, (a0), zero
+; RV32-NEXT: vsetvli zero, a2, e64, m2, ta, ma
+; RV32-NEXT: vssub.vv v8, v8, v10
+; RV32-NEXT: addi sp, sp, 16
+; RV32-NEXT: ret
+;
+; RV64-LABEL: vssub_vx_nxv2i64_unmasked:
+; RV64: # %bb.0:
+; RV64-NEXT: vsetvli zero, a1, e64, m2, ta, ma
+; RV64-NEXT: vssub.vx v8, v8, a0
+; RV64-NEXT: ret
+ %elt.head = insertelement <vscale x 2 x i64> poison, i64 %b, i32 0
+ %vb = shufflevector <vscale x 2 x i64> %elt.head, <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer
+ %head = insertelement <vscale x 2 x i1> poison, i1 true, i32 0
+ %m = shufflevector <vscale x 2 x i1> %head, <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer
+ %v = call <vscale x 2 x i64> @llvm.vp.ssub.sat.nxv2i64(<vscale x 2 x i64> %va, <vscale x 2 x i64> %vb, <vscale x 2 x i1> %m, i32 %evl)
+ ret <vscale x 2 x i64> %v
+}
+
+define <vscale x 2 x i64> @vssub_vi_nxv2i64(<vscale x 2 x i64> %va, <vscale x 2 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vssub_vi_nxv2i64:
+; CHECK: # %bb.0:
+; CHECK-NEXT: li a1, -1
+; CHECK-NEXT: vsetvli zero, a0, e64, m2, ta, ma
+; CHECK-NEXT: vssub.vx v8, v8, a1, v0.t
+; CHECK-NEXT: ret
+ %elt.head = insertelement <vscale x 2 x i64> poison, i64 -1, i32 0
+ %vb = shufflevector <vscale x 2 x i64> %elt.head, <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer
+ %v = call <vscale x 2 x i64> @llvm.vp.ssub.sat.nxv2i64(<vscale x 2 x i64> %va, <vscale x 2 x i64> %vb, <vscale x 2 x i1> %m, i32 %evl)
+ ret <vscale x 2 x i64> %v
+}
+
+define <vscale x 2 x i64> @vssub_vi_nxv2i64_unmasked(<vscale x 2 x i64> %va, i32 zeroext %evl) {
+; CHECK-LABEL: vssub_vi_nxv2i64_unmasked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: li a1, -1
+; CHECK-NEXT: vsetvli zero, a0, e64, m2, ta, ma
+; CHECK-NEXT: vssub.vx v8, v8, a1
+; CHECK-NEXT: ret
+ %elt.head = insertelement <vscale x 2 x i64> poison, i64 -1, i32 0
+ %vb = shufflevector <vscale x 2 x i64> %elt.head, <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer
+ %head = insertelement <vscale x 2 x i1> poison, i1 true, i32 0
+ %m = shufflevector <vscale x 2 x i1> %head, <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer
+ %v = call <vscale x 2 x i64> @llvm.vp.ssub.sat.nxv2i64(<vscale x 2 x i64> %va, <vscale x 2 x i64> %vb, <vscale x 2 x i1> %m, i32 %evl)
+ ret <vscale x 2 x i64> %v
+}
+
+declare <vscale x 4 x i64> @llvm.vp.ssub.sat.nxv4i64(<vscale x 4 x i64>, <vscale x 4 x i64>, <vscale x 4 x i1>, i32)
+
+define <vscale x 4 x i64> @vssub_vv_nxv4i64(<vscale x 4 x i64> %va, <vscale x 4 x i64> %b, <vscale x 4 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vssub_vv_nxv4i64:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e64, m4, ta, ma
+; CHECK-NEXT: vssub.vv v8, v8, v12, v0.t
+; CHECK-NEXT: ret
+ %v = call <vscale x 4 x i64> @llvm.vp.ssub.sat.nxv4i64(<vscale x 4 x i64> %va, <vscale x 4 x i64> %b, <vscale x 4 x i1> %m, i32 %evl)
+ ret <vscale x 4 x i64> %v
+}
+
+define <vscale x 4 x i64> @vssub_vv_nxv4i64_unmasked(<vscale x 4 x i64> %va, <vscale x 4 x i64> %b, i32 zeroext %evl) {
+; CHECK-LABEL: vssub_vv_nxv4i64_unmasked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e64, m4, ta, ma
+; CHECK-NEXT: vssub.vv v8, v8, v12
+; CHECK-NEXT: ret
+ %head = insertelement <vscale x 4 x i1> poison, i1 true, i32 0
+ %m = shufflevector <vscale x 4 x i1> %head, <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer
+ %v = call <vscale x 4 x i64> @llvm.vp.ssub.sat.nxv4i64(<vscale x 4 x i64> %va, <vscale x 4 x i64> %b, <vscale x 4 x i1> %m, i32 %evl)
+ ret <vscale x 4 x i64> %v
+}
+
+define <vscale x 4 x i64> @vssub_vx_nxv4i64(<vscale x 4 x i64> %va, i64 %b, <vscale x 4 x i1> %m, i32 zeroext %evl) {
+; RV32-LABEL: vssub_vx_nxv4i64:
+; RV32: # %bb.0:
+; RV32-NEXT: addi sp, sp, -16
+; RV32-NEXT: .cfi_def_cfa_offset 16
+; RV32-NEXT: sw a1, 12(sp)
+; RV32-NEXT: sw a0, 8(sp)
+; RV32-NEXT: addi a0, sp, 8
+; RV32-NEXT: vsetvli a1, zero, e64, m4, ta, ma
+; RV32-NEXT: vlse64.v v12, (a0), zero
+; RV32-NEXT: vsetvli zero, a2, e64, m4, ta, ma
+; RV32-NEXT: vssub.vv v8, v8, v12, v0.t
+; RV32-NEXT: addi sp, sp, 16
+; RV32-NEXT: ret
+;
+; RV64-LABEL: vssub_vx_nxv4i64:
+; RV64: # %bb.0:
+; RV64-NEXT: vsetvli zero, a1, e64, m4, ta, ma
+; RV64-NEXT: vssub.vx v8, v8, a0, v0.t
+; RV64-NEXT: ret
+ %elt.head = insertelement <vscale x 4 x i64> poison, i64 %b, i32 0
+ %vb = shufflevector <vscale x 4 x i64> %elt.head, <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer
+ %v = call <vscale x 4 x i64> @llvm.vp.ssub.sat.nxv4i64(<vscale x 4 x i64> %va, <vscale x 4 x i64> %vb, <vscale x 4 x i1> %m, i32 %evl)
+ ret <vscale x 4 x i64> %v
+}
+
+define <vscale x 4 x i64> @vssub_vx_nxv4i64_unmasked(<vscale x 4 x i64> %va, i64 %b, i32 zeroext %evl) {
+; RV32-LABEL: vssub_vx_nxv4i64_unmasked:
+; RV32: # %bb.0:
+; RV32-NEXT: addi sp, sp, -16
+; RV32-NEXT: .cfi_def_cfa_offset 16
+; RV32-NEXT: sw a1, 12(sp)
+; RV32-NEXT: sw a0, 8(sp)
+; RV32-NEXT: addi a0, sp, 8
+; RV32-NEXT: vsetvli a1, zero, e64, m4, ta, ma
+; RV32-NEXT: vlse64.v v12, (a0), zero
+; RV32-NEXT: vsetvli zero, a2, e64, m4, ta, ma
+; RV32-NEXT: vssub.vv v8, v8, v12
+; RV32-NEXT: addi sp, sp, 16
+; RV32-NEXT: ret
+;
+; RV64-LABEL: vssub_vx_nxv4i64_unmasked:
+; RV64: # %bb.0:
+; RV64-NEXT: vsetvli zero, a1, e64, m4, ta, ma
+; RV64-NEXT: vssub.vx v8, v8, a0
+; RV64-NEXT: ret
+ %elt.head = insertelement <vscale x 4 x i64> poison, i64 %b, i32 0
+ %vb = shufflevector <vscale x 4 x i64> %elt.head, <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer
+ %head = insertelement <vscale x 4 x i1> poison, i1 true, i32 0
+ %m = shufflevector <vscale x 4 x i1> %head, <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer
+ %v = call <vscale x 4 x i64> @llvm.vp.ssub.sat.nxv4i64(<vscale x 4 x i64> %va, <vscale x 4 x i64> %vb, <vscale x 4 x i1> %m, i32 %evl)
+ ret <vscale x 4 x i64> %v
+}
+
+define <vscale x 4 x i64> @vssub_vi_nxv4i64(<vscale x 4 x i64> %va, <vscale x 4 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vssub_vi_nxv4i64:
+; CHECK: # %bb.0:
+; CHECK-NEXT: li a1, -1
+; CHECK-NEXT: vsetvli zero, a0, e64, m4, ta, ma
+; CHECK-NEXT: vssub.vx v8, v8, a1, v0.t
+; CHECK-NEXT: ret
+ %elt.head = insertelement <vscale x 4 x i64> poison, i64 -1, i32 0
+ %vb = shufflevector <vscale x 4 x i64> %elt.head, <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer
+ %v = call <vscale x 4 x i64> @llvm.vp.ssub.sat.nxv4i64(<vscale x 4 x i64> %va, <vscale x 4 x i64> %vb, <vscale x 4 x i1> %m, i32 %evl)
+ ret <vscale x 4 x i64> %v
+}
+
+define <vscale x 4 x i64> @vssub_vi_nxv4i64_unmasked(<vscale x 4 x i64> %va, i32 zeroext %evl) {
+; CHECK-LABEL: vssub_vi_nxv4i64_unmasked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: li a1, -1
+; CHECK-NEXT: vsetvli zero, a0, e64, m4, ta, ma
+; CHECK-NEXT: vssub.vx v8, v8, a1
+; CHECK-NEXT: ret
+ %elt.head = insertelement <vscale x 4 x i64> poison, i64 -1, i32 0
+ %vb = shufflevector <vscale x 4 x i64> %elt.head, <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer
+ %head = insertelement <vscale x 4 x i1> poison, i1 true, i32 0
+ %m = shufflevector <vscale x 4 x i1> %head, <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer
+ %v = call <vscale x 4 x i64> @llvm.vp.ssub.sat.nxv4i64(<vscale x 4 x i64> %va, <vscale x 4 x i64> %vb, <vscale x 4 x i1> %m, i32 %evl)
+ ret <vscale x 4 x i64> %v
+}
+
+declare <vscale x 8 x i64> @llvm.vp.ssub.sat.nxv8i64(<vscale x 8 x i64>, <vscale x 8 x i64>, <vscale x 8 x i1>, i32)
+
+define <vscale x 8 x i64> @vssub_vv_nxv8i64(<vscale x 8 x i64> %va, <vscale x 8 x i64> %b, <vscale x 8 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vssub_vv_nxv8i64:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma
+; CHECK-NEXT: vssub.vv v8, v8, v16, v0.t
+; CHECK-NEXT: ret
+ %v = call <vscale x 8 x i64> @llvm.vp.ssub.sat.nxv8i64(<vscale x 8 x i64> %va, <vscale x 8 x i64> %b, <vscale x 8 x i1> %m, i32 %evl)
+ ret <vscale x 8 x i64> %v
+}
+
+define <vscale x 8 x i64> @vssub_vv_nxv8i64_unmasked(<vscale x 8 x i64> %va, <vscale x 8 x i64> %b, i32 zeroext %evl) {
+; CHECK-LABEL: vssub_vv_nxv8i64_unmasked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma
+; CHECK-NEXT: vssub.vv v8, v8, v16
+; CHECK-NEXT: ret
+ %head = insertelement <vscale x 8 x i1> poison, i1 true, i32 0
+ %m = shufflevector <vscale x 8 x i1> %head, <vscale x 8 x i1> poison, <vscale x 8 x i32> zeroinitializer
+ %v = call <vscale x 8 x i64> @llvm.vp.ssub.sat.nxv8i64(<vscale x 8 x i64> %va, <vscale x 8 x i64> %b, <vscale x 8 x i1> %m, i32 %evl)
+ ret <vscale x 8 x i64> %v
+}
+
+define <vscale x 8 x i64> @vssub_vx_nxv8i64(<vscale x 8 x i64> %va, i64 %b, <vscale x 8 x i1> %m, i32 zeroext %evl) {
+; RV32-LABEL: vssub_vx_nxv8i64:
+; RV32: # %bb.0:
+; RV32-NEXT: addi sp, sp, -16
+; RV32-NEXT: .cfi_def_cfa_offset 16
+; RV32-NEXT: sw a1, 12(sp)
+; RV32-NEXT: sw a0, 8(sp)
+; RV32-NEXT: addi a0, sp, 8
+; RV32-NEXT: vsetvli a1, zero, e64, m8, ta, ma
+; RV32-NEXT: vlse64.v v16, (a0), zero
+; RV32-NEXT: vsetvli zero, a2, e64, m8, ta, ma
+; RV32-NEXT: vssub.vv v8, v8, v16, v0.t
+; RV32-NEXT: addi sp, sp, 16
+; RV32-NEXT: ret
+;
+; RV64-LABEL: vssub_vx_nxv8i64:
+; RV64: # %bb.0:
+; RV64-NEXT: vsetvli zero, a1, e64, m8, ta, ma
+; RV64-NEXT: vssub.vx v8, v8, a0, v0.t
+; RV64-NEXT: ret
+ %elt.head = insertelement <vscale x 8 x i64> poison, i64 %b, i32 0
+ %vb = shufflevector <vscale x 8 x i64> %elt.head, <vscale x 8 x i64> poison, <vscale x 8 x i32> zeroinitializer
+ %v = call <vscale x 8 x i64> @llvm.vp.ssub.sat.nxv8i64(<vscale x 8 x i64> %va, <vscale x 8 x i64> %vb, <vscale x 8 x i1> %m, i32 %evl)
+ ret <vscale x 8 x i64> %v
+}
+
+define <vscale x 8 x i64> @vssub_vx_nxv8i64_unmasked(<vscale x 8 x i64> %va, i64 %b, i32 zeroext %evl) {
+; RV32-LABEL: vssub_vx_nxv8i64_unmasked:
+; RV32: # %bb.0:
+; RV32-NEXT: addi sp, sp, -16
+; RV32-NEXT: .cfi_def_cfa_offset 16
+; RV32-NEXT: sw a1, 12(sp)
+; RV32-NEXT: sw a0, 8(sp)
+; RV32-NEXT: addi a0, sp, 8
+; RV32-NEXT: vsetvli a1, zero, e64, m8, ta, ma
+; RV32-NEXT: vlse64.v v16, (a0), zero
+; RV32-NEXT: vsetvli zero, a2, e64, m8, ta, ma
+; RV32-NEXT: vssub.vv v8, v8, v16
+; RV32-NEXT: addi sp, sp, 16
+; RV32-NEXT: ret
+;
+; RV64-LABEL: vssub_vx_nxv8i64_unmasked:
+; RV64: # %bb.0:
+; RV64-NEXT: vsetvli zero, a1, e64, m8, ta, ma
+; RV64-NEXT: vssub.vx v8, v8, a0
+; RV64-NEXT: ret
+ %elt.head = insertelement <vscale x 8 x i64> poison, i64 %b, i32 0
+ %vb = shufflevector <vscale x 8 x i64> %elt.head, <vscale x 8 x i64> poison, <vscale x 8 x i32> zeroinitializer
+ %head = insertelement <vscale x 8 x i1> poison, i1 true, i32 0
+ %m = shufflevector <vscale x 8 x i1> %head, <vscale x 8 x i1> poison, <vscale x 8 x i32> zeroinitializer
+ %v = call <vscale x 8 x i64> @llvm.vp.ssub.sat.nxv8i64(<vscale x 8 x i64> %va, <vscale x 8 x i64> %vb, <vscale x 8 x i1> %m, i32 %evl)
+ ret <vscale x 8 x i64> %v
+}
+
+define <vscale x 8 x i64> @vssub_vi_nxv8i64(<vscale x 8 x i64> %va, <vscale x 8 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vssub_vi_nxv8i64:
+; CHECK: # %bb.0:
+; CHECK-NEXT: li a1, -1
+; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma
+; CHECK-NEXT: vssub.vx v8, v8, a1, v0.t
+; CHECK-NEXT: ret
+ %elt.head = insertelement <vscale x 8 x i64> poison, i64 -1, i32 0
+ %vb = shufflevector <vscale x 8 x i64> %elt.head, <vscale x 8 x i64> poison, <vscale x 8 x i32> zeroinitializer
+ %v = call <vscale x 8 x i64> @llvm.vp.ssub.sat.nxv8i64(<vscale x 8 x i64> %va, <vscale x 8 x i64> %vb, <vscale x 8 x i1> %m, i32 %evl)
+ ret <vscale x 8 x i64> %v
+}
+
+define <vscale x 8 x i64> @vssub_vi_nxv8i64_unmasked(<vscale x 8 x i64> %va, i32 zeroext %evl) {
+; CHECK-LABEL: vssub_vi_nxv8i64_unmasked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: li a1, -1
+; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma
+; CHECK-NEXT: vssub.vx v8, v8, a1
+; CHECK-NEXT: ret
+ %elt.head = insertelement <vscale x 8 x i64> poison, i64 -1, i32 0
+ %vb = shufflevector <vscale x 8 x i64> %elt.head, <vscale x 8 x i64> poison, <vscale x 8 x i32> zeroinitializer
+ %head = insertelement <vscale x 8 x i1> poison, i1 true, i32 0
+ %m = shufflevector <vscale x 8 x i1> %head, <vscale x 8 x i1> poison, <vscale x 8 x i32> zeroinitializer
+ %v = call <vscale x 8 x i64> @llvm.vp.ssub.sat.nxv8i64(<vscale x 8 x i64> %va, <vscale x 8 x i64> %vb, <vscale x 8 x i1> %m, i32 %evl)
+ ret <vscale x 8 x i64> %v
+}
diff --git a/llvm/test/CodeGen/RISCV/rvv/vssubu-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vssubu-vp.ll
new file mode 100644
index 0000000..e5589ce
--- /dev/null
+++ b/llvm/test/CodeGen/RISCV/rvv/vssubu-vp.ll
@@ -0,0 +1,2065 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=riscv32 -mattr=+v -verify-machineinstrs < %s \
+; RUN: | FileCheck %s --check-prefixes=CHECK,RV32
+; RUN: llc -mtriple=riscv64 -mattr=+v -verify-machineinstrs < %s \
+; RUN: | FileCheck %s --check-prefixes=CHECK,RV64
+
+declare <vscale x 8 x i7> @llvm.vp.usub.sat.nxv8i7(<vscale x 8 x i7>, <vscale x 8 x i7>, <vscale x 8 x i1>, i32)
+
+define <vscale x 8 x i7> @vssubu_vx_nxv8i7(<vscale x 8 x i7> %a, i7 signext %b, <vscale x 8 x i1> %mask, i32 zeroext %evl) {
+; CHECK-LABEL: vssubu_vx_nxv8i7:
+; CHECK: # %bb.0:
+; CHECK-NEXT: li a2, 127
+; CHECK-NEXT: vsetvli a3, zero, e8, m1, ta, ma
+; CHECK-NEXT: vand.vx v8, v8, a2
+; CHECK-NEXT: vmv.v.x v9, a0
+; CHECK-NEXT: vand.vx v9, v9, a2
+; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, ma
+; CHECK-NEXT: vssubu.vv v8, v8, v9, v0.t
+; CHECK-NEXT: ret
+ %elt.head = insertelement <vscale x 8 x i7> poison, i7 %b, i32 0
+ %vb = shufflevector <vscale x 8 x i7> %elt.head, <vscale x 8 x i7> poison, <vscale x 8 x i32> zeroinitializer
+ %v = call <vscale x 8 x i7> @llvm.vp.usub.sat.nxv8i7(<vscale x 8 x i7> %a, <vscale x 8 x i7> %vb, <vscale x 8 x i1> %mask, i32 %evl)
+ ret <vscale x 8 x i7> %v
+}
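+
+; Note: i7 is not a legal element type, so it is promoted to i8; both operands
+; are masked to 7 bits with vand.vx (127) before the unsigned saturating
+; subtract, which keeps the result consistent with i7 semantics.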
+
+declare <vscale x 1 x i8> @llvm.vp.usub.sat.nxv1i8(<vscale x 1 x i8>, <vscale x 1 x i8>, <vscale x 1 x i1>, i32)
+
+define <vscale x 1 x i8> @vssubu_vv_nxv1i8(<vscale x 1 x i8> %va, <vscale x 1 x i8> %b, <vscale x 1 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vssubu_vv_nxv1i8:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e8, mf8, ta, ma
+; CHECK-NEXT: vssubu.vv v8, v8, v9, v0.t
+; CHECK-NEXT: ret
+ %v = call <vscale x 1 x i8> @llvm.vp.usub.sat.nxv1i8(<vscale x 1 x i8> %va, <vscale x 1 x i8> %b, <vscale x 1 x i1> %m, i32 %evl)
+ ret <vscale x 1 x i8> %v
+}
+
+define <vscale x 1 x i8> @vssubu_vv_nxv1i8_unmasked(<vscale x 1 x i8> %va, <vscale x 1 x i8> %b, i32 zeroext %evl) {
+; CHECK-LABEL: vssubu_vv_nxv1i8_unmasked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e8, mf8, ta, ma
+; CHECK-NEXT: vssubu.vv v8, v8, v9
+; CHECK-NEXT: ret
+ %head = insertelement <vscale x 1 x i1> poison, i1 true, i32 0
+ %m = shufflevector <vscale x 1 x i1> %head, <vscale x 1 x i1> poison, <vscale x 1 x i32> zeroinitializer
+ %v = call <vscale x 1 x i8> @llvm.vp.usub.sat.nxv1i8(<vscale x 1 x i8> %va, <vscale x 1 x i8> %b, <vscale x 1 x i1> %m, i32 %evl)
+ ret <vscale x 1 x i8> %v
+}
+
+define <vscale x 1 x i8> @vssubu_vx_nxv1i8(<vscale x 1 x i8> %va, i8 %b, <vscale x 1 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vssubu_vx_nxv1i8:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, ma
+; CHECK-NEXT: vssubu.vx v8, v8, a0, v0.t
+; CHECK-NEXT: ret
+ %elt.head = insertelement <vscale x 1 x i8> poison, i8 %b, i32 0
+ %vb = shufflevector <vscale x 1 x i8> %elt.head, <vscale x 1 x i8> poison, <vscale x 1 x i32> zeroinitializer
+ %v = call <vscale x 1 x i8> @llvm.vp.usub.sat.nxv1i8(<vscale x 1 x i8> %va, <vscale x 1 x i8> %vb, <vscale x 1 x i1> %m, i32 %evl)
+ ret <vscale x 1 x i8> %v
+}
+
+define <vscale x 1 x i8> @vssubu_vx_nxv1i8_commute(<vscale x 1 x i8> %va, i8 %b, <vscale x 1 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vssubu_vx_nxv1i8_commute:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a2, zero, e8, mf8, ta, ma
+; CHECK-NEXT: vmv.v.x v9, a0
+; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, ma
+; CHECK-NEXT: vssubu.vv v8, v9, v8, v0.t
+; CHECK-NEXT: ret
+ %elt.head = insertelement <vscale x 1 x i8> poison, i8 %b, i32 0
+ %vb = shufflevector <vscale x 1 x i8> %elt.head, <vscale x 1 x i8> poison, <vscale x 1 x i32> zeroinitializer
+ %v = call <vscale x 1 x i8> @llvm.vp.usub.sat.nxv1i8(<vscale x 1 x i8> %vb, <vscale x 1 x i8> %va, <vscale x 1 x i1> %m, i32 %evl)
+ ret <vscale x 1 x i8> %v
+}
+
+define <vscale x 1 x i8> @vssubu_vx_nxv1i8_unmasked(<vscale x 1 x i8> %va, i8 %b, i32 zeroext %evl) {
+; CHECK-LABEL: vssubu_vx_nxv1i8_unmasked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, ma
+; CHECK-NEXT: vssubu.vx v8, v8, a0
+; CHECK-NEXT: ret
+ %elt.head = insertelement <vscale x 1 x i8> poison, i8 %b, i32 0
+ %vb = shufflevector <vscale x 1 x i8> %elt.head, <vscale x 1 x i8> poison, <vscale x 1 x i32> zeroinitializer
+ %head = insertelement <vscale x 1 x i1> poison, i1 true, i32 0
+ %m = shufflevector <vscale x 1 x i1> %head, <vscale x 1 x i1> poison, <vscale x 1 x i32> zeroinitializer
+ %v = call <vscale x 1 x i8> @llvm.vp.usub.sat.nxv1i8(<vscale x 1 x i8> %va, <vscale x 1 x i8> %vb, <vscale x 1 x i1> %m, i32 %evl)
+ ret <vscale x 1 x i8> %v
+}
+
+define <vscale x 1 x i8> @vssubu_vi_nxv1i8(<vscale x 1 x i8> %va, <vscale x 1 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vssubu_vi_nxv1i8:
+; CHECK: # %bb.0:
+; CHECK-NEXT: li a1, -1
+; CHECK-NEXT: vsetvli zero, a0, e8, mf8, ta, ma
+; CHECK-NEXT: vssubu.vx v8, v8, a1, v0.t
+; CHECK-NEXT: ret
+ %elt.head = insertelement <vscale x 1 x i8> poison, i8 -1, i32 0
+ %vb = shufflevector <vscale x 1 x i8> %elt.head, <vscale x 1 x i8> poison, <vscale x 1 x i32> zeroinitializer
+ %v = call <vscale x 1 x i8> @llvm.vp.usub.sat.nxv1i8(<vscale x 1 x i8> %va, <vscale x 1 x i8> %vb, <vscale x 1 x i1> %m, i32 %evl)
+ ret <vscale x 1 x i8> %v
+}
+
+define <vscale x 1 x i8> @vssubu_vi_nxv1i8_unmasked(<vscale x 1 x i8> %va, i32 zeroext %evl) {
+; CHECK-LABEL: vssubu_vi_nxv1i8_unmasked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: li a1, -1
+; CHECK-NEXT: vsetvli zero, a0, e8, mf8, ta, ma
+; CHECK-NEXT: vssubu.vx v8, v8, a1
+; CHECK-NEXT: ret
+ %elt.head = insertelement <vscale x 1 x i8> poison, i8 -1, i32 0
+ %vb = shufflevector <vscale x 1 x i8> %elt.head, <vscale x 1 x i8> poison, <vscale x 1 x i32> zeroinitializer
+ %head = insertelement <vscale x 1 x i1> poison, i1 true, i32 0
+ %m = shufflevector <vscale x 1 x i1> %head, <vscale x 1 x i1> poison, <vscale x 1 x i32> zeroinitializer
+ %v = call <vscale x 1 x i8> @llvm.vp.usub.sat.nxv1i8(<vscale x 1 x i8> %va, <vscale x 1 x i8> %vb, <vscale x 1 x i1> %m, i32 %evl)
+ ret <vscale x 1 x i8> %v
+}
+
+declare <vscale x 2 x i8> @llvm.vp.usub.sat.nxv2i8(<vscale x 2 x i8>, <vscale x 2 x i8>, <vscale x 2 x i1>, i32)
+
+define <vscale x 2 x i8> @vssubu_vv_nxv2i8(<vscale x 2 x i8> %va, <vscale x 2 x i8> %b, <vscale x 2 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vssubu_vv_nxv2i8:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e8, mf4, ta, ma
+; CHECK-NEXT: vssubu.vv v8, v8, v9, v0.t
+; CHECK-NEXT: ret
+ %v = call <vscale x 2 x i8> @llvm.vp.usub.sat.nxv2i8(<vscale x 2 x i8> %va, <vscale x 2 x i8> %b, <vscale x 2 x i1> %m, i32 %evl)
+ ret <vscale x 2 x i8> %v
+}
+
+define <vscale x 2 x i8> @vssubu_vv_nxv2i8_unmasked(<vscale x 2 x i8> %va, <vscale x 2 x i8> %b, i32 zeroext %evl) {
+; CHECK-LABEL: vssubu_vv_nxv2i8_unmasked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e8, mf4, ta, ma
+; CHECK-NEXT: vssubu.vv v8, v8, v9
+; CHECK-NEXT: ret
+ %head = insertelement <vscale x 2 x i1> poison, i1 true, i32 0
+ %m = shufflevector <vscale x 2 x i1> %head, <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer
+ %v = call <vscale x 2 x i8> @llvm.vp.usub.sat.nxv2i8(<vscale x 2 x i8> %va, <vscale x 2 x i8> %b, <vscale x 2 x i1> %m, i32 %evl)
+ ret <vscale x 2 x i8> %v
+}
+
+define <vscale x 2 x i8> @vssubu_vx_nxv2i8(<vscale x 2 x i8> %va, i8 %b, <vscale x 2 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vssubu_vx_nxv2i8:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, ma
+; CHECK-NEXT: vssubu.vx v8, v8, a0, v0.t
+; CHECK-NEXT: ret
+ %elt.head = insertelement <vscale x 2 x i8> poison, i8 %b, i32 0
+ %vb = shufflevector <vscale x 2 x i8> %elt.head, <vscale x 2 x i8> poison, <vscale x 2 x i32> zeroinitializer
+ %v = call <vscale x 2 x i8> @llvm.vp.usub.sat.nxv2i8(<vscale x 2 x i8> %va, <vscale x 2 x i8> %vb, <vscale x 2 x i1> %m, i32 %evl)
+ ret <vscale x 2 x i8> %v
+}
+
+define <vscale x 2 x i8> @vssubu_vx_nxv2i8_unmasked(<vscale x 2 x i8> %va, i8 %b, i32 zeroext %evl) {
+; CHECK-LABEL: vssubu_vx_nxv2i8_unmasked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, ma
+; CHECK-NEXT: vssubu.vx v8, v8, a0
+; CHECK-NEXT: ret
+ %elt.head = insertelement <vscale x 2 x i8> poison, i8 %b, i32 0
+ %vb = shufflevector <vscale x 2 x i8> %elt.head, <vscale x 2 x i8> poison, <vscale x 2 x i32> zeroinitializer
+ %head = insertelement <vscale x 2 x i1> poison, i1 true, i32 0
+ %m = shufflevector <vscale x 2 x i1> %head, <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer
+ %v = call <vscale x 2 x i8> @llvm.vp.usub.sat.nxv2i8(<vscale x 2 x i8> %va, <vscale x 2 x i8> %vb, <vscale x 2 x i1> %m, i32 %evl)
+ ret <vscale x 2 x i8> %v
+}
+
+define <vscale x 2 x i8> @vssubu_vi_nxv2i8(<vscale x 2 x i8> %va, <vscale x 2 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vssubu_vi_nxv2i8:
+; CHECK: # %bb.0:
+; CHECK-NEXT: li a1, -1
+; CHECK-NEXT: vsetvli zero, a0, e8, mf4, ta, ma
+; CHECK-NEXT: vssubu.vx v8, v8, a1, v0.t
+; CHECK-NEXT: ret
+ %elt.head = insertelement <vscale x 2 x i8> poison, i8 -1, i32 0
+ %vb = shufflevector <vscale x 2 x i8> %elt.head, <vscale x 2 x i8> poison, <vscale x 2 x i32> zeroinitializer
+ %v = call <vscale x 2 x i8> @llvm.vp.usub.sat.nxv2i8(<vscale x 2 x i8> %va, <vscale x 2 x i8> %vb, <vscale x 2 x i1> %m, i32 %evl)
+ ret <vscale x 2 x i8> %v
+}
+
+define <vscale x 2 x i8> @vssubu_vi_nxv2i8_unmasked(<vscale x 2 x i8> %va, i32 zeroext %evl) {
+; CHECK-LABEL: vssubu_vi_nxv2i8_unmasked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: li a1, -1
+; CHECK-NEXT: vsetvli zero, a0, e8, mf4, ta, ma
+; CHECK-NEXT: vssubu.vx v8, v8, a1
+; CHECK-NEXT: ret
+ %elt.head = insertelement <vscale x 2 x i8> poison, i8 -1, i32 0
+ %vb = shufflevector <vscale x 2 x i8> %elt.head, <vscale x 2 x i8> poison, <vscale x 2 x i32> zeroinitializer
+ %head = insertelement <vscale x 2 x i1> poison, i1 true, i32 0
+ %m = shufflevector <vscale x 2 x i1> %head, <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer
+ %v = call <vscale x 2 x i8> @llvm.vp.usub.sat.nxv2i8(<vscale x 2 x i8> %va, <vscale x 2 x i8> %vb, <vscale x 2 x i1> %m, i32 %evl)
+ ret <vscale x 2 x i8> %v
+}
+
+declare <vscale x 3 x i8> @llvm.vp.usub.sat.nxv3i8(<vscale x 3 x i8>, <vscale x 3 x i8>, <vscale x 3 x i1>, i32)
+
+define <vscale x 3 x i8> @vssubu_vv_nxv3i8(<vscale x 3 x i8> %va, <vscale x 3 x i8> %b, <vscale x 3 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vssubu_vv_nxv3i8:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e8, mf2, ta, ma
+; CHECK-NEXT: vssubu.vv v8, v8, v9, v0.t
+; CHECK-NEXT: ret
+ %v = call <vscale x 3 x i8> @llvm.vp.usub.sat.nxv3i8(<vscale x 3 x i8> %va, <vscale x 3 x i8> %b, <vscale x 3 x i1> %m, i32 %evl)
+ ret <vscale x 3 x i8> %v
+}
+
+define <vscale x 3 x i8> @vssubu_vv_nxv3i8_unmasked(<vscale x 3 x i8> %va, <vscale x 3 x i8> %b, i32 zeroext %evl) {
+; CHECK-LABEL: vssubu_vv_nxv3i8_unmasked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e8, mf2, ta, ma
+; CHECK-NEXT: vssubu.vv v8, v8, v9
+; CHECK-NEXT: ret
+ %head = insertelement <vscale x 3 x i1> poison, i1 true, i32 0
+ %m = shufflevector <vscale x 3 x i1> %head, <vscale x 3 x i1> poison, <vscale x 3 x i32> zeroinitializer
+ %v = call <vscale x 3 x i8> @llvm.vp.usub.sat.nxv3i8(<vscale x 3 x i8> %va, <vscale x 3 x i8> %b, <vscale x 3 x i1> %m, i32 %evl)
+ ret <vscale x 3 x i8> %v
+}
+
+define <vscale x 3 x i8> @vssubu_vx_nxv3i8(<vscale x 3 x i8> %va, i8 %b, <vscale x 3 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vssubu_vx_nxv3i8:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma
+; CHECK-NEXT: vssubu.vx v8, v8, a0, v0.t
+; CHECK-NEXT: ret
+ %elt.head = insertelement <vscale x 3 x i8> poison, i8 %b, i32 0
+ %vb = shufflevector <vscale x 3 x i8> %elt.head, <vscale x 3 x i8> poison, <vscale x 3 x i32> zeroinitializer
+ %v = call <vscale x 3 x i8> @llvm.vp.usub.sat.nxv3i8(<vscale x 3 x i8> %va, <vscale x 3 x i8> %vb, <vscale x 3 x i1> %m, i32 %evl)
+ ret <vscale x 3 x i8> %v
+}
+
+define <vscale x 3 x i8> @vssubu_vx_nxv3i8_unmasked(<vscale x 3 x i8> %va, i8 %b, i32 zeroext %evl) {
+; CHECK-LABEL: vssubu_vx_nxv3i8_unmasked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma
+; CHECK-NEXT: vssubu.vx v8, v8, a0
+; CHECK-NEXT: ret
+ %elt.head = insertelement <vscale x 3 x i8> poison, i8 %b, i32 0
+ %vb = shufflevector <vscale x 3 x i8> %elt.head, <vscale x 3 x i8> poison, <vscale x 3 x i32> zeroinitializer
+ %head = insertelement <vscale x 3 x i1> poison, i1 true, i32 0
+ %m = shufflevector <vscale x 3 x i1> %head, <vscale x 3 x i1> poison, <vscale x 3 x i32> zeroinitializer
+ %v = call <vscale x 3 x i8> @llvm.vp.usub.sat.nxv3i8(<vscale x 3 x i8> %va, <vscale x 3 x i8> %vb, <vscale x 3 x i1> %m, i32 %evl)
+ ret <vscale x 3 x i8> %v
+}
+
+define <vscale x 3 x i8> @vssubu_vi_nxv3i8(<vscale x 3 x i8> %va, <vscale x 3 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vssubu_vi_nxv3i8:
+; CHECK: # %bb.0:
+; CHECK-NEXT: li a1, -1
+; CHECK-NEXT: vsetvli zero, a0, e8, mf2, ta, ma
+; CHECK-NEXT: vssubu.vx v8, v8, a1, v0.t
+; CHECK-NEXT: ret
+ %elt.head = insertelement <vscale x 3 x i8> poison, i8 -1, i32 0
+ %vb = shufflevector <vscale x 3 x i8> %elt.head, <vscale x 3 x i8> poison, <vscale x 3 x i32> zeroinitializer
+ %v = call <vscale x 3 x i8> @llvm.vp.usub.sat.nxv3i8(<vscale x 3 x i8> %va, <vscale x 3 x i8> %vb, <vscale x 3 x i1> %m, i32 %evl)
+ ret <vscale x 3 x i8> %v
+}
+
+define <vscale x 3 x i8> @vssubu_vi_nxv3i8_unmasked(<vscale x 3 x i8> %va, i32 zeroext %evl) {
+; CHECK-LABEL: vssubu_vi_nxv3i8_unmasked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: li a1, -1
+; CHECK-NEXT: vsetvli zero, a0, e8, mf2, ta, ma
+; CHECK-NEXT: vssubu.vx v8, v8, a1
+; CHECK-NEXT: ret
+ %elt.head = insertelement <vscale x 3 x i8> poison, i8 -1, i32 0
+ %vb = shufflevector <vscale x 3 x i8> %elt.head, <vscale x 3 x i8> poison, <vscale x 3 x i32> zeroinitializer
+ %head = insertelement <vscale x 3 x i1> poison, i1 true, i32 0
+ %m = shufflevector <vscale x 3 x i1> %head, <vscale x 3 x i1> poison, <vscale x 3 x i32> zeroinitializer
+ %v = call <vscale x 3 x i8> @llvm.vp.usub.sat.nxv3i8(<vscale x 3 x i8> %va, <vscale x 3 x i8> %vb, <vscale x 3 x i1> %m, i32 %evl)
+ ret <vscale x 3 x i8> %v
+}
+
+declare <vscale x 4 x i8> @llvm.vp.usub.sat.nxv4i8(<vscale x 4 x i8>, <vscale x 4 x i8>, <vscale x 4 x i1>, i32)
+
+define <vscale x 4 x i8> @vssubu_vv_nxv4i8(<vscale x 4 x i8> %va, <vscale x 4 x i8> %b, <vscale x 4 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vssubu_vv_nxv4i8:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e8, mf2, ta, ma
+; CHECK-NEXT: vssubu.vv v8, v8, v9, v0.t
+; CHECK-NEXT: ret
+ %v = call <vscale x 4 x i8> @llvm.vp.usub.sat.nxv4i8(<vscale x 4 x i8> %va, <vscale x 4 x i8> %b, <vscale x 4 x i1> %m, i32 %evl)
+ ret <vscale x 4 x i8> %v
+}
+
+define <vscale x 4 x i8> @vssubu_vv_nxv4i8_unmasked(<vscale x 4 x i8> %va, <vscale x 4 x i8> %b, i32 zeroext %evl) {
+; CHECK-LABEL: vssubu_vv_nxv4i8_unmasked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e8, mf2, ta, ma
+; CHECK-NEXT: vssubu.vv v8, v8, v9
+; CHECK-NEXT: ret
+ %head = insertelement <vscale x 4 x i1> poison, i1 true, i32 0
+ %m = shufflevector <vscale x 4 x i1> %head, <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer
+ %v = call <vscale x 4 x i8> @llvm.vp.usub.sat.nxv4i8(<vscale x 4 x i8> %va, <vscale x 4 x i8> %b, <vscale x 4 x i1> %m, i32 %evl)
+ ret <vscale x 4 x i8> %v
+}
+
+define <vscale x 4 x i8> @vssubu_vx_nxv4i8(<vscale x 4 x i8> %va, i8 %b, <vscale x 4 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vssubu_vx_nxv4i8:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma
+; CHECK-NEXT: vssubu.vx v8, v8, a0, v0.t
+; CHECK-NEXT: ret
+ %elt.head = insertelement <vscale x 4 x i8> poison, i8 %b, i32 0
+ %vb = shufflevector <vscale x 4 x i8> %elt.head, <vscale x 4 x i8> poison, <vscale x 4 x i32> zeroinitializer
+ %v = call <vscale x 4 x i8> @llvm.vp.usub.sat.nxv4i8(<vscale x 4 x i8> %va, <vscale x 4 x i8> %vb, <vscale x 4 x i1> %m, i32 %evl)
+ ret <vscale x 4 x i8> %v
+}
+
+define <vscale x 4 x i8> @vssubu_vx_nxv4i8_unmasked(<vscale x 4 x i8> %va, i8 %b, i32 zeroext %evl) {
+; CHECK-LABEL: vssubu_vx_nxv4i8_unmasked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma
+; CHECK-NEXT: vssubu.vx v8, v8, a0
+; CHECK-NEXT: ret
+ %elt.head = insertelement <vscale x 4 x i8> poison, i8 %b, i32 0
+ %vb = shufflevector <vscale x 4 x i8> %elt.head, <vscale x 4 x i8> poison, <vscale x 4 x i32> zeroinitializer
+ %head = insertelement <vscale x 4 x i1> poison, i1 true, i32 0
+ %m = shufflevector <vscale x 4 x i1> %head, <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer
+ %v = call <vscale x 4 x i8> @llvm.vp.usub.sat.nxv4i8(<vscale x 4 x i8> %va, <vscale x 4 x i8> %vb, <vscale x 4 x i1> %m, i32 %evl)
+ ret <vscale x 4 x i8> %v
+}
+
+define <vscale x 4 x i8> @vssubu_vi_nxv4i8(<vscale x 4 x i8> %va, <vscale x 4 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vssubu_vi_nxv4i8:
+; CHECK: # %bb.0:
+; CHECK-NEXT: li a1, -1
+; CHECK-NEXT: vsetvli zero, a0, e8, mf2, ta, ma
+; CHECK-NEXT: vssubu.vx v8, v8, a1, v0.t
+; CHECK-NEXT: ret
+ %elt.head = insertelement <vscale x 4 x i8> poison, i8 -1, i32 0
+ %vb = shufflevector <vscale x 4 x i8> %elt.head, <vscale x 4 x i8> poison, <vscale x 4 x i32> zeroinitializer
+ %v = call <vscale x 4 x i8> @llvm.vp.usub.sat.nxv4i8(<vscale x 4 x i8> %va, <vscale x 4 x i8> %vb, <vscale x 4 x i1> %m, i32 %evl)
+ ret <vscale x 4 x i8> %v
+}
+
+define <vscale x 4 x i8> @vssubu_vi_nxv4i8_unmasked(<vscale x 4 x i8> %va, i32 zeroext %evl) {
+; CHECK-LABEL: vssubu_vi_nxv4i8_unmasked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: li a1, -1
+; CHECK-NEXT: vsetvli zero, a0, e8, mf2, ta, ma
+; CHECK-NEXT: vssubu.vx v8, v8, a1
+; CHECK-NEXT: ret
+ %elt.head = insertelement <vscale x 4 x i8> poison, i8 -1, i32 0
+ %vb = shufflevector <vscale x 4 x i8> %elt.head, <vscale x 4 x i8> poison, <vscale x 4 x i32> zeroinitializer
+ %head = insertelement <vscale x 4 x i1> poison, i1 true, i32 0
+ %m = shufflevector <vscale x 4 x i1> %head, <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer
+ %v = call <vscale x 4 x i8> @llvm.vp.usub.sat.nxv4i8(<vscale x 4 x i8> %va, <vscale x 4 x i8> %vb, <vscale x 4 x i1> %m, i32 %evl)
+ ret <vscale x 4 x i8> %v
+}
+
+declare <vscale x 8 x i8> @llvm.vp.usub.sat.nxv8i8(<vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i1>, i32)
+
+define <vscale x 8 x i8> @vssubu_vv_nxv8i8(<vscale x 8 x i8> %va, <vscale x 8 x i8> %b, <vscale x 8 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vssubu_vv_nxv8i8:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e8, m1, ta, ma
+; CHECK-NEXT: vssubu.vv v8, v8, v9, v0.t
+; CHECK-NEXT: ret
+ %v = call <vscale x 8 x i8> @llvm.vp.usub.sat.nxv8i8(<vscale x 8 x i8> %va, <vscale x 8 x i8> %b, <vscale x 8 x i1> %m, i32 %evl)
+ ret <vscale x 8 x i8> %v
+}
+
+define <vscale x 8 x i8> @vssubu_vv_nxv8i8_unmasked(<vscale x 8 x i8> %va, <vscale x 8 x i8> %b, i32 zeroext %evl) {
+; CHECK-LABEL: vssubu_vv_nxv8i8_unmasked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e8, m1, ta, ma
+; CHECK-NEXT: vssubu.vv v8, v8, v9
+; CHECK-NEXT: ret
+ %head = insertelement <vscale x 8 x i1> poison, i1 true, i32 0
+ %m = shufflevector <vscale x 8 x i1> %head, <vscale x 8 x i1> poison, <vscale x 8 x i32> zeroinitializer
+ %v = call <vscale x 8 x i8> @llvm.vp.usub.sat.nxv8i8(<vscale x 8 x i8> %va, <vscale x 8 x i8> %b, <vscale x 8 x i1> %m, i32 %evl)
+ ret <vscale x 8 x i8> %v
+}
+
+define <vscale x 8 x i8> @vssubu_vx_nxv8i8(<vscale x 8 x i8> %va, i8 %b, <vscale x 8 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vssubu_vx_nxv8i8:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, ma
+; CHECK-NEXT: vssubu.vx v8, v8, a0, v0.t
+; CHECK-NEXT: ret
+ %elt.head = insertelement <vscale x 8 x i8> poison, i8 %b, i32 0
+ %vb = shufflevector <vscale x 8 x i8> %elt.head, <vscale x 8 x i8> poison, <vscale x 8 x i32> zeroinitializer
+ %v = call <vscale x 8 x i8> @llvm.vp.usub.sat.nxv8i8(<vscale x 8 x i8> %va, <vscale x 8 x i8> %vb, <vscale x 8 x i1> %m, i32 %evl)
+ ret <vscale x 8 x i8> %v
+}
+
+define <vscale x 8 x i8> @vssubu_vx_nxv8i8_unmasked(<vscale x 8 x i8> %va, i8 %b, i32 zeroext %evl) {
+; CHECK-LABEL: vssubu_vx_nxv8i8_unmasked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, ma
+; CHECK-NEXT: vssubu.vx v8, v8, a0
+; CHECK-NEXT: ret
+ %elt.head = insertelement <vscale x 8 x i8> poison, i8 %b, i32 0
+ %vb = shufflevector <vscale x 8 x i8> %elt.head, <vscale x 8 x i8> poison, <vscale x 8 x i32> zeroinitializer
+ %head = insertelement <vscale x 8 x i1> poison, i1 true, i32 0
+ %m = shufflevector <vscale x 8 x i1> %head, <vscale x 8 x i1> poison, <vscale x 8 x i32> zeroinitializer
+ %v = call <vscale x 8 x i8> @llvm.vp.usub.sat.nxv8i8(<vscale x 8 x i8> %va, <vscale x 8 x i8> %vb, <vscale x 8 x i1> %m, i32 %evl)
+ ret <vscale x 8 x i8> %v
+}
+
+define <vscale x 8 x i8> @vssubu_vi_nxv8i8(<vscale x 8 x i8> %va, <vscale x 8 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vssubu_vi_nxv8i8:
+; CHECK: # %bb.0:
+; CHECK-NEXT: li a1, -1
+; CHECK-NEXT: vsetvli zero, a0, e8, m1, ta, ma
+; CHECK-NEXT: vssubu.vx v8, v8, a1, v0.t
+; CHECK-NEXT: ret
+ %elt.head = insertelement <vscale x 8 x i8> poison, i8 -1, i32 0
+ %vb = shufflevector <vscale x 8 x i8> %elt.head, <vscale x 8 x i8> poison, <vscale x 8 x i32> zeroinitializer
+ %v = call <vscale x 8 x i8> @llvm.vp.usub.sat.nxv8i8(<vscale x 8 x i8> %va, <vscale x 8 x i8> %vb, <vscale x 8 x i1> %m, i32 %evl)
+ ret <vscale x 8 x i8> %v
+}
+
+define <vscale x 8 x i8> @vssubu_vi_nxv8i8_unmasked(<vscale x 8 x i8> %va, i32 zeroext %evl) {
+; CHECK-LABEL: vssubu_vi_nxv8i8_unmasked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: li a1, -1
+; CHECK-NEXT: vsetvli zero, a0, e8, m1, ta, ma
+; CHECK-NEXT: vssubu.vx v8, v8, a1
+; CHECK-NEXT: ret
+ %elt.head = insertelement <vscale x 8 x i8> poison, i8 -1, i32 0
+ %vb = shufflevector <vscale x 8 x i8> %elt.head, <vscale x 8 x i8> poison, <vscale x 8 x i32> zeroinitializer
+ %head = insertelement <vscale x 8 x i1> poison, i1 true, i32 0
+ %m = shufflevector <vscale x 8 x i1> %head, <vscale x 8 x i1> poison, <vscale x 8 x i32> zeroinitializer
+ %v = call <vscale x 8 x i8> @llvm.vp.usub.sat.nxv8i8(<vscale x 8 x i8> %va, <vscale x 8 x i8> %vb, <vscale x 8 x i1> %m, i32 %evl)
+ ret <vscale x 8 x i8> %v
+}
+
+declare <vscale x 16 x i8> @llvm.vp.usub.sat.nxv16i8(<vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i1>, i32)
+
+define <vscale x 16 x i8> @vssubu_vv_nxv16i8(<vscale x 16 x i8> %va, <vscale x 16 x i8> %b, <vscale x 16 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vssubu_vv_nxv16i8:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e8, m2, ta, ma
+; CHECK-NEXT: vssubu.vv v8, v8, v10, v0.t
+; CHECK-NEXT: ret
+ %v = call <vscale x 16 x i8> @llvm.vp.usub.sat.nxv16i8(<vscale x 16 x i8> %va, <vscale x 16 x i8> %b, <vscale x 16 x i1> %m, i32 %evl)
+ ret <vscale x 16 x i8> %v
+}
+
+define <vscale x 16 x i8> @vssubu_vv_nxv16i8_unmasked(<vscale x 16 x i8> %va, <vscale x 16 x i8> %b, i32 zeroext %evl) {
+; CHECK-LABEL: vssubu_vv_nxv16i8_unmasked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e8, m2, ta, ma
+; CHECK-NEXT: vssubu.vv v8, v8, v10
+; CHECK-NEXT: ret
+ %head = insertelement <vscale x 16 x i1> poison, i1 true, i32 0
+ %m = shufflevector <vscale x 16 x i1> %head, <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer
+ %v = call <vscale x 16 x i8> @llvm.vp.usub.sat.nxv16i8(<vscale x 16 x i8> %va, <vscale x 16 x i8> %b, <vscale x 16 x i1> %m, i32 %evl)
+ ret <vscale x 16 x i8> %v
+}
+
+define <vscale x 16 x i8> @vssubu_vx_nxv16i8(<vscale x 16 x i8> %va, i8 %b, <vscale x 16 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vssubu_vx_nxv16i8:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e8, m2, ta, ma
+; CHECK-NEXT: vssubu.vx v8, v8, a0, v0.t
+; CHECK-NEXT: ret
+ %elt.head = insertelement <vscale x 16 x i8> poison, i8 %b, i32 0
+ %vb = shufflevector <vscale x 16 x i8> %elt.head, <vscale x 16 x i8> poison, <vscale x 16 x i32> zeroinitializer
+ %v = call <vscale x 16 x i8> @llvm.vp.usub.sat.nxv16i8(<vscale x 16 x i8> %va, <vscale x 16 x i8> %vb, <vscale x 16 x i1> %m, i32 %evl)
+ ret <vscale x 16 x i8> %v
+}
+
+define <vscale x 16 x i8> @vssubu_vx_nxv16i8_unmasked(<vscale x 16 x i8> %va, i8 %b, i32 zeroext %evl) {
+; CHECK-LABEL: vssubu_vx_nxv16i8_unmasked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e8, m2, ta, ma
+; CHECK-NEXT: vssubu.vx v8, v8, a0
+; CHECK-NEXT: ret
+ %elt.head = insertelement <vscale x 16 x i8> poison, i8 %b, i32 0
+ %vb = shufflevector <vscale x 16 x i8> %elt.head, <vscale x 16 x i8> poison, <vscale x 16 x i32> zeroinitializer
+ %head = insertelement <vscale x 16 x i1> poison, i1 true, i32 0
+ %m = shufflevector <vscale x 16 x i1> %head, <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer
+ %v = call <vscale x 16 x i8> @llvm.vp.usub.sat.nxv16i8(<vscale x 16 x i8> %va, <vscale x 16 x i8> %vb, <vscale x 16 x i1> %m, i32 %evl)
+ ret <vscale x 16 x i8> %v
+}
+
+define <vscale x 16 x i8> @vssubu_vi_nxv16i8(<vscale x 16 x i8> %va, <vscale x 16 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vssubu_vi_nxv16i8:
+; CHECK: # %bb.0:
+; CHECK-NEXT: li a1, -1
+; CHECK-NEXT: vsetvli zero, a0, e8, m2, ta, ma
+; CHECK-NEXT: vssubu.vx v8, v8, a1, v0.t
+; CHECK-NEXT: ret
+ %elt.head = insertelement <vscale x 16 x i8> poison, i8 -1, i32 0
+ %vb = shufflevector <vscale x 16 x i8> %elt.head, <vscale x 16 x i8> poison, <vscale x 16 x i32> zeroinitializer
+ %v = call <vscale x 16 x i8> @llvm.vp.usub.sat.nxv16i8(<vscale x 16 x i8> %va, <vscale x 16 x i8> %vb, <vscale x 16 x i1> %m, i32 %evl)
+ ret <vscale x 16 x i8> %v
+}
+
+define <vscale x 16 x i8> @vssubu_vi_nxv16i8_unmasked(<vscale x 16 x i8> %va, i32 zeroext %evl) {
+; CHECK-LABEL: vssubu_vi_nxv16i8_unmasked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: li a1, -1
+; CHECK-NEXT: vsetvli zero, a0, e8, m2, ta, ma
+; CHECK-NEXT: vssubu.vx v8, v8, a1
+; CHECK-NEXT: ret
+ %elt.head = insertelement <vscale x 16 x i8> poison, i8 -1, i32 0
+ %vb = shufflevector <vscale x 16 x i8> %elt.head, <vscale x 16 x i8> poison, <vscale x 16 x i32> zeroinitializer
+ %head = insertelement <vscale x 16 x i1> poison, i1 true, i32 0
+ %m = shufflevector <vscale x 16 x i1> %head, <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer
+ %v = call <vscale x 16 x i8> @llvm.vp.usub.sat.nxv16i8(<vscale x 16 x i8> %va, <vscale x 16 x i8> %vb, <vscale x 16 x i1> %m, i32 %evl)
+ ret <vscale x 16 x i8> %v
+}
+
+declare <vscale x 32 x i8> @llvm.vp.usub.sat.nxv32i8(<vscale x 32 x i8>, <vscale x 32 x i8>, <vscale x 32 x i1>, i32)
+
+define <vscale x 32 x i8> @vssubu_vv_nxv32i8(<vscale x 32 x i8> %va, <vscale x 32 x i8> %b, <vscale x 32 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vssubu_vv_nxv32i8:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e8, m4, ta, ma
+; CHECK-NEXT: vssubu.vv v8, v8, v12, v0.t
+; CHECK-NEXT: ret
+ %v = call <vscale x 32 x i8> @llvm.vp.usub.sat.nxv32i8(<vscale x 32 x i8> %va, <vscale x 32 x i8> %b, <vscale x 32 x i1> %m, i32 %evl)
+ ret <vscale x 32 x i8> %v
+}
+
+define <vscale x 32 x i8> @vssubu_vv_nxv32i8_unmasked(<vscale x 32 x i8> %va, <vscale x 32 x i8> %b, i32 zeroext %evl) {
+; CHECK-LABEL: vssubu_vv_nxv32i8_unmasked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e8, m4, ta, ma
+; CHECK-NEXT: vssubu.vv v8, v8, v12
+; CHECK-NEXT: ret
+ %head = insertelement <vscale x 32 x i1> poison, i1 true, i32 0
+ %m = shufflevector <vscale x 32 x i1> %head, <vscale x 32 x i1> poison, <vscale x 32 x i32> zeroinitializer
+ %v = call <vscale x 32 x i8> @llvm.vp.usub.sat.nxv32i8(<vscale x 32 x i8> %va, <vscale x 32 x i8> %b, <vscale x 32 x i1> %m, i32 %evl)
+ ret <vscale x 32 x i8> %v
+}
+
+define <vscale x 32 x i8> @vssubu_vx_nxv32i8(<vscale x 32 x i8> %va, i8 %b, <vscale x 32 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vssubu_vx_nxv32i8:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e8, m4, ta, ma
+; CHECK-NEXT: vssubu.vx v8, v8, a0, v0.t
+; CHECK-NEXT: ret
+ %elt.head = insertelement <vscale x 32 x i8> poison, i8 %b, i32 0
+ %vb = shufflevector <vscale x 32 x i8> %elt.head, <vscale x 32 x i8> poison, <vscale x 32 x i32> zeroinitializer
+ %v = call <vscale x 32 x i8> @llvm.vp.usub.sat.nxv32i8(<vscale x 32 x i8> %va, <vscale x 32 x i8> %vb, <vscale x 32 x i1> %m, i32 %evl)
+ ret <vscale x 32 x i8> %v
+}
+
+define <vscale x 32 x i8> @vssubu_vx_nxv32i8_unmasked(<vscale x 32 x i8> %va, i8 %b, i32 zeroext %evl) {
+; CHECK-LABEL: vssubu_vx_nxv32i8_unmasked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e8, m4, ta, ma
+; CHECK-NEXT: vssubu.vx v8, v8, a0
+; CHECK-NEXT: ret
+ %elt.head = insertelement <vscale x 32 x i8> poison, i8 %b, i32 0
+ %vb = shufflevector <vscale x 32 x i8> %elt.head, <vscale x 32 x i8> poison, <vscale x 32 x i32> zeroinitializer
+ %head = insertelement <vscale x 32 x i1> poison, i1 true, i32 0
+ %m = shufflevector <vscale x 32 x i1> %head, <vscale x 32 x i1> poison, <vscale x 32 x i32> zeroinitializer
+ %v = call <vscale x 32 x i8> @llvm.vp.usub.sat.nxv32i8(<vscale x 32 x i8> %va, <vscale x 32 x i8> %vb, <vscale x 32 x i1> %m, i32 %evl)
+ ret <vscale x 32 x i8> %v
+}
+
+define <vscale x 32 x i8> @vssubu_vi_nxv32i8(<vscale x 32 x i8> %va, <vscale x 32 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vssubu_vi_nxv32i8:
+; CHECK: # %bb.0:
+; CHECK-NEXT: li a1, -1
+; CHECK-NEXT: vsetvli zero, a0, e8, m4, ta, ma
+; CHECK-NEXT: vssubu.vx v8, v8, a1, v0.t
+; CHECK-NEXT: ret
+ %elt.head = insertelement <vscale x 32 x i8> poison, i8 -1, i32 0
+ %vb = shufflevector <vscale x 32 x i8> %elt.head, <vscale x 32 x i8> poison, <vscale x 32 x i32> zeroinitializer
+ %v = call <vscale x 32 x i8> @llvm.vp.usub.sat.nxv32i8(<vscale x 32 x i8> %va, <vscale x 32 x i8> %vb, <vscale x 32 x i1> %m, i32 %evl)
+ ret <vscale x 32 x i8> %v
+}
+
+define <vscale x 32 x i8> @vssubu_vi_nxv32i8_unmasked(<vscale x 32 x i8> %va, i32 zeroext %evl) {
+; CHECK-LABEL: vssubu_vi_nxv32i8_unmasked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: li a1, -1
+; CHECK-NEXT: vsetvli zero, a0, e8, m4, ta, ma
+; CHECK-NEXT: vssubu.vx v8, v8, a1
+; CHECK-NEXT: ret
+ %elt.head = insertelement <vscale x 32 x i8> poison, i8 -1, i32 0
+ %vb = shufflevector <vscale x 32 x i8> %elt.head, <vscale x 32 x i8> poison, <vscale x 32 x i32> zeroinitializer
+ %head = insertelement <vscale x 32 x i1> poison, i1 true, i32 0
+ %m = shufflevector <vscale x 32 x i1> %head, <vscale x 32 x i1> poison, <vscale x 32 x i32> zeroinitializer
+ %v = call <vscale x 32 x i8> @llvm.vp.usub.sat.nxv32i8(<vscale x 32 x i8> %va, <vscale x 32 x i8> %vb, <vscale x 32 x i1> %m, i32 %evl)
+ ret <vscale x 32 x i8> %v
+}
+
+declare <vscale x 64 x i8> @llvm.vp.usub.sat.nxv64i8(<vscale x 64 x i8>, <vscale x 64 x i8>, <vscale x 64 x i1>, i32)
+
+define <vscale x 64 x i8> @vssubu_vv_nxv64i8(<vscale x 64 x i8> %va, <vscale x 64 x i8> %b, <vscale x 64 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vssubu_vv_nxv64i8:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e8, m8, ta, ma
+; CHECK-NEXT: vssubu.vv v8, v8, v16, v0.t
+; CHECK-NEXT: ret
+ %v = call <vscale x 64 x i8> @llvm.vp.usub.sat.nxv64i8(<vscale x 64 x i8> %va, <vscale x 64 x i8> %b, <vscale x 64 x i1> %m, i32 %evl)
+ ret <vscale x 64 x i8> %v
+}
+
+define <vscale x 64 x i8> @vssubu_vv_nxv64i8_unmasked(<vscale x 64 x i8> %va, <vscale x 64 x i8> %b, i32 zeroext %evl) {
+; CHECK-LABEL: vssubu_vv_nxv64i8_unmasked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e8, m8, ta, ma
+; CHECK-NEXT: vssubu.vv v8, v8, v16
+; CHECK-NEXT: ret
+ %head = insertelement <vscale x 64 x i1> poison, i1 true, i32 0
+ %m = shufflevector <vscale x 64 x i1> %head, <vscale x 64 x i1> poison, <vscale x 64 x i32> zeroinitializer
+ %v = call <vscale x 64 x i8> @llvm.vp.usub.sat.nxv64i8(<vscale x 64 x i8> %va, <vscale x 64 x i8> %b, <vscale x 64 x i1> %m, i32 %evl)
+ ret <vscale x 64 x i8> %v
+}
+
+define <vscale x 64 x i8> @vssubu_vx_nxv64i8(<vscale x 64 x i8> %va, i8 %b, <vscale x 64 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vssubu_vx_nxv64i8:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e8, m8, ta, ma
+; CHECK-NEXT: vssubu.vx v8, v8, a0, v0.t
+; CHECK-NEXT: ret
+ %elt.head = insertelement <vscale x 64 x i8> poison, i8 %b, i32 0
+ %vb = shufflevector <vscale x 64 x i8> %elt.head, <vscale x 64 x i8> poison, <vscale x 64 x i32> zeroinitializer
+ %v = call <vscale x 64 x i8> @llvm.vp.usub.sat.nxv64i8(<vscale x 64 x i8> %va, <vscale x 64 x i8> %vb, <vscale x 64 x i1> %m, i32 %evl)
+ ret <vscale x 64 x i8> %v
+}
+
+define <vscale x 64 x i8> @vssubu_vx_nxv64i8_unmasked(<vscale x 64 x i8> %va, i8 %b, i32 zeroext %evl) {
+; CHECK-LABEL: vssubu_vx_nxv64i8_unmasked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e8, m8, ta, ma
+; CHECK-NEXT: vssubu.vx v8, v8, a0
+; CHECK-NEXT: ret
+ %elt.head = insertelement <vscale x 64 x i8> poison, i8 %b, i32 0
+ %vb = shufflevector <vscale x 64 x i8> %elt.head, <vscale x 64 x i8> poison, <vscale x 64 x i32> zeroinitializer
+ %head = insertelement <vscale x 64 x i1> poison, i1 true, i32 0
+ %m = shufflevector <vscale x 64 x i1> %head, <vscale x 64 x i1> poison, <vscale x 64 x i32> zeroinitializer
+ %v = call <vscale x 64 x i8> @llvm.vp.usub.sat.nxv64i8(<vscale x 64 x i8> %va, <vscale x 64 x i8> %vb, <vscale x 64 x i1> %m, i32 %evl)
+ ret <vscale x 64 x i8> %v
+}
+
+define <vscale x 64 x i8> @vssubu_vi_nxv64i8(<vscale x 64 x i8> %va, <vscale x 64 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vssubu_vi_nxv64i8:
+; CHECK: # %bb.0:
+; CHECK-NEXT: li a1, -1
+; CHECK-NEXT: vsetvli zero, a0, e8, m8, ta, ma
+; CHECK-NEXT: vssubu.vx v8, v8, a1, v0.t
+; CHECK-NEXT: ret
+ %elt.head = insertelement <vscale x 64 x i8> poison, i8 -1, i32 0
+ %vb = shufflevector <vscale x 64 x i8> %elt.head, <vscale x 64 x i8> poison, <vscale x 64 x i32> zeroinitializer
+ %v = call <vscale x 64 x i8> @llvm.vp.usub.sat.nxv64i8(<vscale x 64 x i8> %va, <vscale x 64 x i8> %vb, <vscale x 64 x i1> %m, i32 %evl)
+ ret <vscale x 64 x i8> %v
+}
+
+define <vscale x 64 x i8> @vssubu_vi_nxv64i8_unmasked(<vscale x 64 x i8> %va, i32 zeroext %evl) {
+; CHECK-LABEL: vssubu_vi_nxv64i8_unmasked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: li a1, -1
+; CHECK-NEXT: vsetvli zero, a0, e8, m8, ta, ma
+; CHECK-NEXT: vssubu.vx v8, v8, a1
+; CHECK-NEXT: ret
+ %elt.head = insertelement <vscale x 64 x i8> poison, i8 -1, i32 0
+ %vb = shufflevector <vscale x 64 x i8> %elt.head, <vscale x 64 x i8> poison, <vscale x 64 x i32> zeroinitializer
+ %head = insertelement <vscale x 64 x i1> poison, i1 true, i32 0
+ %m = shufflevector <vscale x 64 x i1> %head, <vscale x 64 x i1> poison, <vscale x 64 x i32> zeroinitializer
+ %v = call <vscale x 64 x i8> @llvm.vp.usub.sat.nxv64i8(<vscale x 64 x i8> %va, <vscale x 64 x i8> %vb, <vscale x 64 x i1> %m, i32 %evl)
+ ret <vscale x 64 x i8> %v
+}
+
+; Test that split-legalization works when the mask itself needs splitting.
+
+declare <vscale x 128 x i8> @llvm.vp.usub.sat.nxv128i8(<vscale x 128 x i8>, <vscale x 128 x i8>, <vscale x 128 x i1>, i32)
+
+define <vscale x 128 x i8> @vssubu_vi_nxv128i8(<vscale x 128 x i8> %va, <vscale x 128 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vssubu_vi_nxv128i8:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmv1r.v v24, v0
+; CHECK-NEXT: vsetvli a2, zero, e8, m8, ta, ma
+; CHECK-NEXT: vlm.v v0, (a0)
+; CHECK-NEXT: csrr a2, vlenb
+; CHECK-NEXT: slli a2, a2, 3
+; CHECK-NEXT: sub a0, a1, a2
+; CHECK-NEXT: sltu a3, a1, a0
+; CHECK-NEXT: addi a3, a3, -1
+; CHECK-NEXT: and a3, a3, a0
+; CHECK-NEXT: li a0, -1
+; CHECK-NEXT: vsetvli zero, a3, e8, m8, ta, ma
+; CHECK-NEXT: vssubu.vx v16, v16, a0, v0.t
+; CHECK-NEXT: bltu a1, a2, .LBB50_2
+; CHECK-NEXT: # %bb.1:
+; CHECK-NEXT: mv a1, a2
+; CHECK-NEXT: .LBB50_2:
+; CHECK-NEXT: vsetvli zero, a1, e8, m8, ta, ma
+; CHECK-NEXT: vmv1r.v v0, v24
+; CHECK-NEXT: vssubu.vx v8, v8, a0, v0.t
+; CHECK-NEXT: ret
+ %elt.head = insertelement <vscale x 128 x i8> poison, i8 -1, i32 0
+ %vb = shufflevector <vscale x 128 x i8> %elt.head, <vscale x 128 x i8> poison, <vscale x 128 x i32> zeroinitializer
+ %v = call <vscale x 128 x i8> @llvm.vp.usub.sat.nxv128i8(<vscale x 128 x i8> %va, <vscale x 128 x i8> %vb, <vscale x 128 x i1> %m, i32 %evl)
+ ret <vscale x 128 x i8> %v
+}
+
+define <vscale x 128 x i8> @vssubu_vi_nxv128i8_unmasked(<vscale x 128 x i8> %va, i32 zeroext %evl) {
+; CHECK-LABEL: vssubu_vi_nxv128i8_unmasked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: csrr a1, vlenb
+; CHECK-NEXT: slli a1, a1, 3
+; CHECK-NEXT: sub a2, a0, a1
+; CHECK-NEXT: sltu a3, a0, a2
+; CHECK-NEXT: addi a3, a3, -1
+; CHECK-NEXT: and a3, a3, a2
+; CHECK-NEXT: li a2, -1
+; CHECK-NEXT: vsetvli zero, a3, e8, m8, ta, ma
+; CHECK-NEXT: vssubu.vx v16, v16, a2
+; CHECK-NEXT: bltu a0, a1, .LBB51_2
+; CHECK-NEXT: # %bb.1:
+; CHECK-NEXT: mv a0, a1
+; CHECK-NEXT: .LBB51_2:
+; CHECK-NEXT: vsetvli zero, a0, e8, m8, ta, ma
+; CHECK-NEXT: vssubu.vx v8, v8, a2
+; CHECK-NEXT: ret
+ %elt.head = insertelement <vscale x 128 x i8> poison, i8 -1, i32 0
+ %vb = shufflevector <vscale x 128 x i8> %elt.head, <vscale x 128 x i8> poison, <vscale x 128 x i32> zeroinitializer
+ %head = insertelement <vscale x 128 x i1> poison, i1 true, i32 0
+ %m = shufflevector <vscale x 128 x i1> %head, <vscale x 128 x i1> poison, <vscale x 128 x i32> zeroinitializer
+ %v = call <vscale x 128 x i8> @llvm.vp.usub.sat.nxv128i8(<vscale x 128 x i8> %va, <vscale x 128 x i8> %vb, <vscale x 128 x i1> %m, i32 %evl)
+ ret <vscale x 128 x i8> %v
+}
+
+declare <vscale x 1 x i16> @llvm.vp.usub.sat.nxv1i16(<vscale x 1 x i16>, <vscale x 1 x i16>, <vscale x 1 x i1>, i32)
+
+define <vscale x 1 x i16> @vssubu_vv_nxv1i16(<vscale x 1 x i16> %va, <vscale x 1 x i16> %b, <vscale x 1 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vssubu_vv_nxv1i16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e16, mf4, ta, ma
+; CHECK-NEXT: vssubu.vv v8, v8, v9, v0.t
+; CHECK-NEXT: ret
+ %v = call <vscale x 1 x i16> @llvm.vp.usub.sat.nxv1i16(<vscale x 1 x i16> %va, <vscale x 1 x i16> %b, <vscale x 1 x i1> %m, i32 %evl)
+ ret <vscale x 1 x i16> %v
+}
+
+define <vscale x 1 x i16> @vssubu_vv_nxv1i16_unmasked(<vscale x 1 x i16> %va, <vscale x 1 x i16> %b, i32 zeroext %evl) {
+; CHECK-LABEL: vssubu_vv_nxv1i16_unmasked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e16, mf4, ta, ma
+; CHECK-NEXT: vssubu.vv v8, v8, v9
+; CHECK-NEXT: ret
+ %head = insertelement <vscale x 1 x i1> poison, i1 true, i32 0
+ %m = shufflevector <vscale x 1 x i1> %head, <vscale x 1 x i1> poison, <vscale x 1 x i32> zeroinitializer
+ %v = call <vscale x 1 x i16> @llvm.vp.usub.sat.nxv1i16(<vscale x 1 x i16> %va, <vscale x 1 x i16> %b, <vscale x 1 x i1> %m, i32 %evl)
+ ret <vscale x 1 x i16> %v
+}
+
+define <vscale x 1 x i16> @vssubu_vx_nxv1i16(<vscale x 1 x i16> %va, i16 %b, <vscale x 1 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vssubu_vx_nxv1i16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma
+; CHECK-NEXT: vssubu.vx v8, v8, a0, v0.t
+; CHECK-NEXT: ret
+ %elt.head = insertelement <vscale x 1 x i16> poison, i16 %b, i32 0
+ %vb = shufflevector <vscale x 1 x i16> %elt.head, <vscale x 1 x i16> poison, <vscale x 1 x i32> zeroinitializer
+ %v = call <vscale x 1 x i16> @llvm.vp.usub.sat.nxv1i16(<vscale x 1 x i16> %va, <vscale x 1 x i16> %vb, <vscale x 1 x i1> %m, i32 %evl)
+ ret <vscale x 1 x i16> %v
+}
+
+define <vscale x 1 x i16> @vssubu_vx_nxv1i16_unmasked(<vscale x 1 x i16> %va, i16 %b, i32 zeroext %evl) {
+; CHECK-LABEL: vssubu_vx_nxv1i16_unmasked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e16, mf4, ta, ma
+; CHECK-NEXT: vssubu.vx v8, v8, a0
+; CHECK-NEXT: ret
+ %elt.head = insertelement <vscale x 1 x i16> poison, i16 %b, i32 0
+ %vb = shufflevector <vscale x 1 x i16> %elt.head, <vscale x 1 x i16> poison, <vscale x 1 x i32> zeroinitializer
+ %head = insertelement <vscale x 1 x i1> poison, i1 true, i32 0
+ %m = shufflevector <vscale x 1 x i1> %head, <vscale x 1 x i1> poison, <vscale x 1 x i32> zeroinitializer
+ %v = call <vscale x 1 x i16> @llvm.vp.usub.sat.nxv1i16(<vscale x 1 x i16> %va, <vscale x 1 x i16> %vb, <vscale x 1 x i1> %m, i32 %evl)
+ ret <vscale x 1 x i16> %v
+}
+
+define <vscale x 1 x i16> @vssubu_vi_nxv1i16(<vscale x 1 x i16> %va, <vscale x 1 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vssubu_vi_nxv1i16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: li a1, -1
+; CHECK-NEXT: vsetvli zero, a0, e16, mf4, ta, ma
+; CHECK-NEXT: vssubu.vx v8, v8, a1, v0.t
+; CHECK-NEXT: ret
+ %elt.head = insertelement <vscale x 1 x i16> poison, i16 -1, i32 0
+ %vb = shufflevector <vscale x 1 x i16> %elt.head, <vscale x 1 x i16> poison, <vscale x 1 x i32> zeroinitializer
+ %v = call <vscale x 1 x i16> @llvm.vp.usub.sat.nxv1i16(<vscale x 1 x i16> %va, <vscale x 1 x i16> %vb, <vscale x 1 x i1> %m, i32 %evl)
+ ret <vscale x 1 x i16> %v
+}
+
+define <vscale x 1 x i16> @vssubu_vi_nxv1i16_unmasked(<vscale x 1 x i16> %va, i32 zeroext %evl) {
+; CHECK-LABEL: vssubu_vi_nxv1i16_unmasked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: li a1, -1
+; CHECK-NEXT: vsetvli zero, a0, e16, mf4, ta, ma
+; CHECK-NEXT: vssubu.vx v8, v8, a1
+; CHECK-NEXT: ret
+ %elt.head = insertelement <vscale x 1 x i16> poison, i16 -1, i32 0
+ %vb = shufflevector <vscale x 1 x i16> %elt.head, <vscale x 1 x i16> poison, <vscale x 1 x i32> zeroinitializer
+ %head = insertelement <vscale x 1 x i1> poison, i1 true, i32 0
+ %m = shufflevector <vscale x 1 x i1> %head, <vscale x 1 x i1> poison, <vscale x 1 x i32> zeroinitializer
+ %v = call <vscale x 1 x i16> @llvm.vp.usub.sat.nxv1i16(<vscale x 1 x i16> %va, <vscale x 1 x i16> %vb, <vscale x 1 x i1> %m, i32 %evl)
+ ret <vscale x 1 x i16> %v
+}
+
+declare <vscale x 2 x i16> @llvm.vp.usub.sat.nxv2i16(<vscale x 2 x i16>, <vscale x 2 x i16>, <vscale x 2 x i1>, i32)
+
+define <vscale x 2 x i16> @vssubu_vv_nxv2i16(<vscale x 2 x i16> %va, <vscale x 2 x i16> %b, <vscale x 2 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vssubu_vv_nxv2i16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e16, mf2, ta, ma
+; CHECK-NEXT: vssubu.vv v8, v8, v9, v0.t
+; CHECK-NEXT: ret
+ %v = call <vscale x 2 x i16> @llvm.vp.usub.sat.nxv2i16(<vscale x 2 x i16> %va, <vscale x 2 x i16> %b, <vscale x 2 x i1> %m, i32 %evl)
+ ret <vscale x 2 x i16> %v
+}
+
+define <vscale x 2 x i16> @vssubu_vv_nxv2i16_unmasked(<vscale x 2 x i16> %va, <vscale x 2 x i16> %b, i32 zeroext %evl) {
+; CHECK-LABEL: vssubu_vv_nxv2i16_unmasked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e16, mf2, ta, ma
+; CHECK-NEXT: vssubu.vv v8, v8, v9
+; CHECK-NEXT: ret
+ %head = insertelement <vscale x 2 x i1> poison, i1 true, i32 0
+ %m = shufflevector <vscale x 2 x i1> %head, <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer
+ %v = call <vscale x 2 x i16> @llvm.vp.usub.sat.nxv2i16(<vscale x 2 x i16> %va, <vscale x 2 x i16> %b, <vscale x 2 x i1> %m, i32 %evl)
+ ret <vscale x 2 x i16> %v
+}
+
+define <vscale x 2 x i16> @vssubu_vx_nxv2i16(<vscale x 2 x i16> %va, i16 %b, <vscale x 2 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vssubu_vx_nxv2i16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma
+; CHECK-NEXT: vssubu.vx v8, v8, a0, v0.t
+; CHECK-NEXT: ret
+ %elt.head = insertelement <vscale x 2 x i16> poison, i16 %b, i32 0
+ %vb = shufflevector <vscale x 2 x i16> %elt.head, <vscale x 2 x i16> poison, <vscale x 2 x i32> zeroinitializer
+ %v = call <vscale x 2 x i16> @llvm.vp.usub.sat.nxv2i16(<vscale x 2 x i16> %va, <vscale x 2 x i16> %vb, <vscale x 2 x i1> %m, i32 %evl)
+ ret <vscale x 2 x i16> %v
+}
+
+define <vscale x 2 x i16> @vssubu_vx_nxv2i16_unmasked(<vscale x 2 x i16> %va, i16 %b, i32 zeroext %evl) {
+; CHECK-LABEL: vssubu_vx_nxv2i16_unmasked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e16, mf2, ta, ma
+; CHECK-NEXT: vssubu.vx v8, v8, a0
+; CHECK-NEXT: ret
+ %elt.head = insertelement <vscale x 2 x i16> poison, i16 %b, i32 0
+ %vb = shufflevector <vscale x 2 x i16> %elt.head, <vscale x 2 x i16> poison, <vscale x 2 x i32> zeroinitializer
+ %head = insertelement <vscale x 2 x i1> poison, i1 true, i32 0
+ %m = shufflevector <vscale x 2 x i1> %head, <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer
+ %v = call <vscale x 2 x i16> @llvm.vp.usub.sat.nxv2i16(<vscale x 2 x i16> %va, <vscale x 2 x i16> %vb, <vscale x 2 x i1> %m, i32 %evl)
+ ret <vscale x 2 x i16> %v
+}
+
+define <vscale x 2 x i16> @vssubu_vi_nxv2i16(<vscale x 2 x i16> %va, <vscale x 2 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vssubu_vi_nxv2i16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: li a1, -1
+; CHECK-NEXT: vsetvli zero, a0, e16, mf2, ta, ma
+; CHECK-NEXT: vssubu.vx v8, v8, a1, v0.t
+; CHECK-NEXT: ret
+ %elt.head = insertelement <vscale x 2 x i16> poison, i16 -1, i32 0
+ %vb = shufflevector <vscale x 2 x i16> %elt.head, <vscale x 2 x i16> poison, <vscale x 2 x i32> zeroinitializer
+ %v = call <vscale x 2 x i16> @llvm.vp.usub.sat.nxv2i16(<vscale x 2 x i16> %va, <vscale x 2 x i16> %vb, <vscale x 2 x i1> %m, i32 %evl)
+ ret <vscale x 2 x i16> %v
+}
+
+define <vscale x 2 x i16> @vssubu_vi_nxv2i16_unmasked(<vscale x 2 x i16> %va, i32 zeroext %evl) {
+; CHECK-LABEL: vssubu_vi_nxv2i16_unmasked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: li a1, -1
+; CHECK-NEXT: vsetvli zero, a0, e16, mf2, ta, ma
+; CHECK-NEXT: vssubu.vx v8, v8, a1
+; CHECK-NEXT: ret
+ %elt.head = insertelement <vscale x 2 x i16> poison, i16 -1, i32 0
+ %vb = shufflevector <vscale x 2 x i16> %elt.head, <vscale x 2 x i16> poison, <vscale x 2 x i32> zeroinitializer
+ %head = insertelement <vscale x 2 x i1> poison, i1 true, i32 0
+ %m = shufflevector <vscale x 2 x i1> %head, <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer
+ %v = call <vscale x 2 x i16> @llvm.vp.usub.sat.nxv2i16(<vscale x 2 x i16> %va, <vscale x 2 x i16> %vb, <vscale x 2 x i1> %m, i32 %evl)
+ ret <vscale x 2 x i16> %v
+}
+
+declare <vscale x 4 x i16> @llvm.vp.usub.sat.nxv4i16(<vscale x 4 x i16>, <vscale x 4 x i16>, <vscale x 4 x i1>, i32)
+
+define <vscale x 4 x i16> @vssubu_vv_nxv4i16(<vscale x 4 x i16> %va, <vscale x 4 x i16> %b, <vscale x 4 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vssubu_vv_nxv4i16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, ma
+; CHECK-NEXT: vssubu.vv v8, v8, v9, v0.t
+; CHECK-NEXT: ret
+ %v = call <vscale x 4 x i16> @llvm.vp.usub.sat.nxv4i16(<vscale x 4 x i16> %va, <vscale x 4 x i16> %b, <vscale x 4 x i1> %m, i32 %evl)
+ ret <vscale x 4 x i16> %v
+}
+
+define <vscale x 4 x i16> @vssubu_vv_nxv4i16_unmasked(<vscale x 4 x i16> %va, <vscale x 4 x i16> %b, i32 zeroext %evl) {
+; CHECK-LABEL: vssubu_vv_nxv4i16_unmasked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, ma
+; CHECK-NEXT: vssubu.vv v8, v8, v9
+; CHECK-NEXT: ret
+ %head = insertelement <vscale x 4 x i1> poison, i1 true, i32 0
+ %m = shufflevector <vscale x 4 x i1> %head, <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer
+ %v = call <vscale x 4 x i16> @llvm.vp.usub.sat.nxv4i16(<vscale x 4 x i16> %va, <vscale x 4 x i16> %b, <vscale x 4 x i1> %m, i32 %evl)
+ ret <vscale x 4 x i16> %v
+}
+
+define <vscale x 4 x i16> @vssubu_vx_nxv4i16(<vscale x 4 x i16> %va, i16 %b, <vscale x 4 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vssubu_vx_nxv4i16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma
+; CHECK-NEXT: vssubu.vx v8, v8, a0, v0.t
+; CHECK-NEXT: ret
+ %elt.head = insertelement <vscale x 4 x i16> poison, i16 %b, i32 0
+ %vb = shufflevector <vscale x 4 x i16> %elt.head, <vscale x 4 x i16> poison, <vscale x 4 x i32> zeroinitializer
+ %v = call <vscale x 4 x i16> @llvm.vp.usub.sat.nxv4i16(<vscale x 4 x i16> %va, <vscale x 4 x i16> %vb, <vscale x 4 x i1> %m, i32 %evl)
+ ret <vscale x 4 x i16> %v
+}
+
+define <vscale x 4 x i16> @vssubu_vx_nxv4i16_unmasked(<vscale x 4 x i16> %va, i16 %b, i32 zeroext %evl) {
+; CHECK-LABEL: vssubu_vx_nxv4i16_unmasked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma
+; CHECK-NEXT: vssubu.vx v8, v8, a0
+; CHECK-NEXT: ret
+ %elt.head = insertelement <vscale x 4 x i16> poison, i16 %b, i32 0
+ %vb = shufflevector <vscale x 4 x i16> %elt.head, <vscale x 4 x i16> poison, <vscale x 4 x i32> zeroinitializer
+ %head = insertelement <vscale x 4 x i1> poison, i1 true, i32 0
+ %m = shufflevector <vscale x 4 x i1> %head, <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer
+ %v = call <vscale x 4 x i16> @llvm.vp.usub.sat.nxv4i16(<vscale x 4 x i16> %va, <vscale x 4 x i16> %vb, <vscale x 4 x i1> %m, i32 %evl)
+ ret <vscale x 4 x i16> %v
+}
+
+define <vscale x 4 x i16> @vssubu_vi_nxv4i16(<vscale x 4 x i16> %va, <vscale x 4 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vssubu_vi_nxv4i16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: li a1, -1
+; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, ma
+; CHECK-NEXT: vssubu.vx v8, v8, a1, v0.t
+; CHECK-NEXT: ret
+ %elt.head = insertelement <vscale x 4 x i16> poison, i16 -1, i32 0
+ %vb = shufflevector <vscale x 4 x i16> %elt.head, <vscale x 4 x i16> poison, <vscale x 4 x i32> zeroinitializer
+ %v = call <vscale x 4 x i16> @llvm.vp.usub.sat.nxv4i16(<vscale x 4 x i16> %va, <vscale x 4 x i16> %vb, <vscale x 4 x i1> %m, i32 %evl)
+ ret <vscale x 4 x i16> %v
+}
+
+define <vscale x 4 x i16> @vssubu_vi_nxv4i16_unmasked(<vscale x 4 x i16> %va, i32 zeroext %evl) {
+; CHECK-LABEL: vssubu_vi_nxv4i16_unmasked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: li a1, -1
+; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, ma
+; CHECK-NEXT: vssubu.vx v8, v8, a1
+; CHECK-NEXT: ret
+ %elt.head = insertelement <vscale x 4 x i16> poison, i16 -1, i32 0
+ %vb = shufflevector <vscale x 4 x i16> %elt.head, <vscale x 4 x i16> poison, <vscale x 4 x i32> zeroinitializer
+ %head = insertelement <vscale x 4 x i1> poison, i1 true, i32 0
+ %m = shufflevector <vscale x 4 x i1> %head, <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer
+ %v = call <vscale x 4 x i16> @llvm.vp.usub.sat.nxv4i16(<vscale x 4 x i16> %va, <vscale x 4 x i16> %vb, <vscale x 4 x i1> %m, i32 %evl)
+ ret <vscale x 4 x i16> %v
+}
+
+declare <vscale x 8 x i16> @llvm.vp.usub.sat.nxv8i16(<vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i1>, i32)
+
+define <vscale x 8 x i16> @vssubu_vv_nxv8i16(<vscale x 8 x i16> %va, <vscale x 8 x i16> %b, <vscale x 8 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vssubu_vv_nxv8i16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e16, m2, ta, ma
+; CHECK-NEXT: vssubu.vv v8, v8, v10, v0.t
+; CHECK-NEXT: ret
+ %v = call <vscale x 8 x i16> @llvm.vp.usub.sat.nxv8i16(<vscale x 8 x i16> %va, <vscale x 8 x i16> %b, <vscale x 8 x i1> %m, i32 %evl)
+ ret <vscale x 8 x i16> %v
+}
+
+define <vscale x 8 x i16> @vssubu_vv_nxv8i16_unmasked(<vscale x 8 x i16> %va, <vscale x 8 x i16> %b, i32 zeroext %evl) {
+; CHECK-LABEL: vssubu_vv_nxv8i16_unmasked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e16, m2, ta, ma
+; CHECK-NEXT: vssubu.vv v8, v8, v10
+; CHECK-NEXT: ret
+ %head = insertelement <vscale x 8 x i1> poison, i1 true, i32 0
+ %m = shufflevector <vscale x 8 x i1> %head, <vscale x 8 x i1> poison, <vscale x 8 x i32> zeroinitializer
+ %v = call <vscale x 8 x i16> @llvm.vp.usub.sat.nxv8i16(<vscale x 8 x i16> %va, <vscale x 8 x i16> %b, <vscale x 8 x i1> %m, i32 %evl)
+ ret <vscale x 8 x i16> %v
+}
+
+define <vscale x 8 x i16> @vssubu_vx_nxv8i16(<vscale x 8 x i16> %va, i16 %b, <vscale x 8 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vssubu_vx_nxv8i16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, ma
+; CHECK-NEXT: vssubu.vx v8, v8, a0, v0.t
+; CHECK-NEXT: ret
+ %elt.head = insertelement <vscale x 8 x i16> poison, i16 %b, i32 0
+ %vb = shufflevector <vscale x 8 x i16> %elt.head, <vscale x 8 x i16> poison, <vscale x 8 x i32> zeroinitializer
+ %v = call <vscale x 8 x i16> @llvm.vp.usub.sat.nxv8i16(<vscale x 8 x i16> %va, <vscale x 8 x i16> %vb, <vscale x 8 x i1> %m, i32 %evl)
+ ret <vscale x 8 x i16> %v
+}
+
+define <vscale x 8 x i16> @vssubu_vx_nxv8i16_unmasked(<vscale x 8 x i16> %va, i16 %b, i32 zeroext %evl) {
+; CHECK-LABEL: vssubu_vx_nxv8i16_unmasked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, ma
+; CHECK-NEXT: vssubu.vx v8, v8, a0
+; CHECK-NEXT: ret
+ %elt.head = insertelement <vscale x 8 x i16> poison, i16 %b, i32 0
+ %vb = shufflevector <vscale x 8 x i16> %elt.head, <vscale x 8 x i16> poison, <vscale x 8 x i32> zeroinitializer
+ %head = insertelement <vscale x 8 x i1> poison, i1 true, i32 0
+ %m = shufflevector <vscale x 8 x i1> %head, <vscale x 8 x i1> poison, <vscale x 8 x i32> zeroinitializer
+ %v = call <vscale x 8 x i16> @llvm.vp.usub.sat.nxv8i16(<vscale x 8 x i16> %va, <vscale x 8 x i16> %vb, <vscale x 8 x i1> %m, i32 %evl)
+ ret <vscale x 8 x i16> %v
+}
+
+define <vscale x 8 x i16> @vssubu_vi_nxv8i16(<vscale x 8 x i16> %va, <vscale x 8 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vssubu_vi_nxv8i16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: li a1, -1
+; CHECK-NEXT: vsetvli zero, a0, e16, m2, ta, ma
+; CHECK-NEXT: vssubu.vx v8, v8, a1, v0.t
+; CHECK-NEXT: ret
+ %elt.head = insertelement <vscale x 8 x i16> poison, i16 -1, i32 0
+ %vb = shufflevector <vscale x 8 x i16> %elt.head, <vscale x 8 x i16> poison, <vscale x 8 x i32> zeroinitializer
+ %v = call <vscale x 8 x i16> @llvm.vp.usub.sat.nxv8i16(<vscale x 8 x i16> %va, <vscale x 8 x i16> %vb, <vscale x 8 x i1> %m, i32 %evl)
+ ret <vscale x 8 x i16> %v
+}
+
+define <vscale x 8 x i16> @vssubu_vi_nxv8i16_unmasked(<vscale x 8 x i16> %va, i32 zeroext %evl) {
+; CHECK-LABEL: vssubu_vi_nxv8i16_unmasked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: li a1, -1
+; CHECK-NEXT: vsetvli zero, a0, e16, m2, ta, ma
+; CHECK-NEXT: vssubu.vx v8, v8, a1
+; CHECK-NEXT: ret
+ %elt.head = insertelement <vscale x 8 x i16> poison, i16 -1, i32 0
+ %vb = shufflevector <vscale x 8 x i16> %elt.head, <vscale x 8 x i16> poison, <vscale x 8 x i32> zeroinitializer
+ %head = insertelement <vscale x 8 x i1> poison, i1 true, i32 0
+ %m = shufflevector <vscale x 8 x i1> %head, <vscale x 8 x i1> poison, <vscale x 8 x i32> zeroinitializer
+ %v = call <vscale x 8 x i16> @llvm.vp.usub.sat.nxv8i16(<vscale x 8 x i16> %va, <vscale x 8 x i16> %vb, <vscale x 8 x i1> %m, i32 %evl)
+ ret <vscale x 8 x i16> %v
+}
+
+declare <vscale x 16 x i16> @llvm.vp.usub.sat.nxv16i16(<vscale x 16 x i16>, <vscale x 16 x i16>, <vscale x 16 x i1>, i32)
+
+define <vscale x 16 x i16> @vssubu_vv_nxv16i16(<vscale x 16 x i16> %va, <vscale x 16 x i16> %b, <vscale x 16 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vssubu_vv_nxv16i16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e16, m4, ta, ma
+; CHECK-NEXT: vssubu.vv v8, v8, v12, v0.t
+; CHECK-NEXT: ret
+ %v = call <vscale x 16 x i16> @llvm.vp.usub.sat.nxv16i16(<vscale x 16 x i16> %va, <vscale x 16 x i16> %b, <vscale x 16 x i1> %m, i32 %evl)
+ ret <vscale x 16 x i16> %v
+}
+
+define <vscale x 16 x i16> @vssubu_vv_nxv16i16_unmasked(<vscale x 16 x i16> %va, <vscale x 16 x i16> %b, i32 zeroext %evl) {
+; CHECK-LABEL: vssubu_vv_nxv16i16_unmasked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e16, m4, ta, ma
+; CHECK-NEXT: vssubu.vv v8, v8, v12
+; CHECK-NEXT: ret
+ %head = insertelement <vscale x 16 x i1> poison, i1 true, i32 0
+ %m = shufflevector <vscale x 16 x i1> %head, <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer
+ %v = call <vscale x 16 x i16> @llvm.vp.usub.sat.nxv16i16(<vscale x 16 x i16> %va, <vscale x 16 x i16> %b, <vscale x 16 x i1> %m, i32 %evl)
+ ret <vscale x 16 x i16> %v
+}
+
+define <vscale x 16 x i16> @vssubu_vx_nxv16i16(<vscale x 16 x i16> %va, i16 %b, <vscale x 16 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vssubu_vx_nxv16i16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e16, m4, ta, ma
+; CHECK-NEXT: vssubu.vx v8, v8, a0, v0.t
+; CHECK-NEXT: ret
+ %elt.head = insertelement <vscale x 16 x i16> poison, i16 %b, i32 0
+ %vb = shufflevector <vscale x 16 x i16> %elt.head, <vscale x 16 x i16> poison, <vscale x 16 x i32> zeroinitializer
+ %v = call <vscale x 16 x i16> @llvm.vp.usub.sat.nxv16i16(<vscale x 16 x i16> %va, <vscale x 16 x i16> %vb, <vscale x 16 x i1> %m, i32 %evl)
+ ret <vscale x 16 x i16> %v
+}
+
+define <vscale x 16 x i16> @vssubu_vx_nxv16i16_unmasked(<vscale x 16 x i16> %va, i16 %b, i32 zeroext %evl) {
+; CHECK-LABEL: vssubu_vx_nxv16i16_unmasked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e16, m4, ta, ma
+; CHECK-NEXT: vssubu.vx v8, v8, a0
+; CHECK-NEXT: ret
+ %elt.head = insertelement <vscale x 16 x i16> poison, i16 %b, i32 0
+ %vb = shufflevector <vscale x 16 x i16> %elt.head, <vscale x 16 x i16> poison, <vscale x 16 x i32> zeroinitializer
+ %head = insertelement <vscale x 16 x i1> poison, i1 true, i32 0
+ %m = shufflevector <vscale x 16 x i1> %head, <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer
+ %v = call <vscale x 16 x i16> @llvm.vp.usub.sat.nxv16i16(<vscale x 16 x i16> %va, <vscale x 16 x i16> %vb, <vscale x 16 x i1> %m, i32 %evl)
+ ret <vscale x 16 x i16> %v
+}
+
+define <vscale x 16 x i16> @vssubu_vi_nxv16i16(<vscale x 16 x i16> %va, <vscale x 16 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vssubu_vi_nxv16i16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: li a1, -1
+; CHECK-NEXT: vsetvli zero, a0, e16, m4, ta, ma
+; CHECK-NEXT: vssubu.vx v8, v8, a1, v0.t
+; CHECK-NEXT: ret
+ %elt.head = insertelement <vscale x 16 x i16> poison, i16 -1, i32 0
+ %vb = shufflevector <vscale x 16 x i16> %elt.head, <vscale x 16 x i16> poison, <vscale x 16 x i32> zeroinitializer
+ %v = call <vscale x 16 x i16> @llvm.vp.usub.sat.nxv16i16(<vscale x 16 x i16> %va, <vscale x 16 x i16> %vb, <vscale x 16 x i1> %m, i32 %evl)
+ ret <vscale x 16 x i16> %v
+}
+
+define <vscale x 16 x i16> @vssubu_vi_nxv16i16_unmasked(<vscale x 16 x i16> %va, i32 zeroext %evl) {
+; CHECK-LABEL: vssubu_vi_nxv16i16_unmasked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: li a1, -1
+; CHECK-NEXT: vsetvli zero, a0, e16, m4, ta, ma
+; CHECK-NEXT: vssubu.vx v8, v8, a1
+; CHECK-NEXT: ret
+ %elt.head = insertelement <vscale x 16 x i16> poison, i16 -1, i32 0
+ %vb = shufflevector <vscale x 16 x i16> %elt.head, <vscale x 16 x i16> poison, <vscale x 16 x i32> zeroinitializer
+ %head = insertelement <vscale x 16 x i1> poison, i1 true, i32 0
+ %m = shufflevector <vscale x 16 x i1> %head, <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer
+ %v = call <vscale x 16 x i16> @llvm.vp.usub.sat.nxv16i16(<vscale x 16 x i16> %va, <vscale x 16 x i16> %vb, <vscale x 16 x i1> %m, i32 %evl)
+ ret <vscale x 16 x i16> %v
+}
+
+declare <vscale x 32 x i16> @llvm.vp.usub.sat.nxv32i16(<vscale x 32 x i16>, <vscale x 32 x i16>, <vscale x 32 x i1>, i32)
+
+define <vscale x 32 x i16> @vssubu_vv_nxv32i16(<vscale x 32 x i16> %va, <vscale x 32 x i16> %b, <vscale x 32 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vssubu_vv_nxv32i16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e16, m8, ta, ma
+; CHECK-NEXT: vssubu.vv v8, v8, v16, v0.t
+; CHECK-NEXT: ret
+ %v = call <vscale x 32 x i16> @llvm.vp.usub.sat.nxv32i16(<vscale x 32 x i16> %va, <vscale x 32 x i16> %b, <vscale x 32 x i1> %m, i32 %evl)
+ ret <vscale x 32 x i16> %v
+}
+
+define <vscale x 32 x i16> @vssubu_vv_nxv32i16_unmasked(<vscale x 32 x i16> %va, <vscale x 32 x i16> %b, i32 zeroext %evl) {
+; CHECK-LABEL: vssubu_vv_nxv32i16_unmasked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e16, m8, ta, ma
+; CHECK-NEXT: vssubu.vv v8, v8, v16
+; CHECK-NEXT: ret
+ %head = insertelement <vscale x 32 x i1> poison, i1 true, i32 0
+ %m = shufflevector <vscale x 32 x i1> %head, <vscale x 32 x i1> poison, <vscale x 32 x i32> zeroinitializer
+ %v = call <vscale x 32 x i16> @llvm.vp.usub.sat.nxv32i16(<vscale x 32 x i16> %va, <vscale x 32 x i16> %b, <vscale x 32 x i1> %m, i32 %evl)
+ ret <vscale x 32 x i16> %v
+}
+
+define <vscale x 32 x i16> @vssubu_vx_nxv32i16(<vscale x 32 x i16> %va, i16 %b, <vscale x 32 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vssubu_vx_nxv32i16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e16, m8, ta, ma
+; CHECK-NEXT: vssubu.vx v8, v8, a0, v0.t
+; CHECK-NEXT: ret
+ %elt.head = insertelement <vscale x 32 x i16> poison, i16 %b, i32 0
+ %vb = shufflevector <vscale x 32 x i16> %elt.head, <vscale x 32 x i16> poison, <vscale x 32 x i32> zeroinitializer
+ %v = call <vscale x 32 x i16> @llvm.vp.usub.sat.nxv32i16(<vscale x 32 x i16> %va, <vscale x 32 x i16> %vb, <vscale x 32 x i1> %m, i32 %evl)
+ ret <vscale x 32 x i16> %v
+}
+
+define <vscale x 32 x i16> @vssubu_vx_nxv32i16_unmasked(<vscale x 32 x i16> %va, i16 %b, i32 zeroext %evl) {
+; CHECK-LABEL: vssubu_vx_nxv32i16_unmasked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e16, m8, ta, ma
+; CHECK-NEXT: vssubu.vx v8, v8, a0
+; CHECK-NEXT: ret
+ %elt.head = insertelement <vscale x 32 x i16> poison, i16 %b, i32 0
+ %vb = shufflevector <vscale x 32 x i16> %elt.head, <vscale x 32 x i16> poison, <vscale x 32 x i32> zeroinitializer
+ %head = insertelement <vscale x 32 x i1> poison, i1 true, i32 0
+ %m = shufflevector <vscale x 32 x i1> %head, <vscale x 32 x i1> poison, <vscale x 32 x i32> zeroinitializer
+ %v = call <vscale x 32 x i16> @llvm.vp.usub.sat.nxv32i16(<vscale x 32 x i16> %va, <vscale x 32 x i16> %vb, <vscale x 32 x i1> %m, i32 %evl)
+ ret <vscale x 32 x i16> %v
+}
+
+define <vscale x 32 x i16> @vssubu_vi_nxv32i16(<vscale x 32 x i16> %va, <vscale x 32 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vssubu_vi_nxv32i16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: li a1, -1
+; CHECK-NEXT: vsetvli zero, a0, e16, m8, ta, ma
+; CHECK-NEXT: vssubu.vx v8, v8, a1, v0.t
+; CHECK-NEXT: ret
+ %elt.head = insertelement <vscale x 32 x i16> poison, i16 -1, i32 0
+ %vb = shufflevector <vscale x 32 x i16> %elt.head, <vscale x 32 x i16> poison, <vscale x 32 x i32> zeroinitializer
+ %v = call <vscale x 32 x i16> @llvm.vp.usub.sat.nxv32i16(<vscale x 32 x i16> %va, <vscale x 32 x i16> %vb, <vscale x 32 x i1> %m, i32 %evl)
+ ret <vscale x 32 x i16> %v
+}
+
+define <vscale x 32 x i16> @vssubu_vi_nxv32i16_unmasked(<vscale x 32 x i16> %va, i32 zeroext %evl) {
+; CHECK-LABEL: vssubu_vi_nxv32i16_unmasked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: li a1, -1
+; CHECK-NEXT: vsetvli zero, a0, e16, m8, ta, ma
+; CHECK-NEXT: vssubu.vx v8, v8, a1
+; CHECK-NEXT: ret
+ %elt.head = insertelement <vscale x 32 x i16> poison, i16 -1, i32 0
+ %vb = shufflevector <vscale x 32 x i16> %elt.head, <vscale x 32 x i16> poison, <vscale x 32 x i32> zeroinitializer
+ %head = insertelement <vscale x 32 x i1> poison, i1 true, i32 0
+ %m = shufflevector <vscale x 32 x i1> %head, <vscale x 32 x i1> poison, <vscale x 32 x i32> zeroinitializer
+ %v = call <vscale x 32 x i16> @llvm.vp.usub.sat.nxv32i16(<vscale x 32 x i16> %va, <vscale x 32 x i16> %vb, <vscale x 32 x i1> %m, i32 %evl)
+ ret <vscale x 32 x i16> %v
+}
+
+declare <vscale x 1 x i32> @llvm.vp.usub.sat.nxv1i32(<vscale x 1 x i32>, <vscale x 1 x i32>, <vscale x 1 x i1>, i32)
+
+define <vscale x 1 x i32> @vssubu_vv_nxv1i32(<vscale x 1 x i32> %va, <vscale x 1 x i32> %b, <vscale x 1 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vssubu_vv_nxv1i32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, ma
+; CHECK-NEXT: vssubu.vv v8, v8, v9, v0.t
+; CHECK-NEXT: ret
+ %v = call <vscale x 1 x i32> @llvm.vp.usub.sat.nxv1i32(<vscale x 1 x i32> %va, <vscale x 1 x i32> %b, <vscale x 1 x i1> %m, i32 %evl)
+ ret <vscale x 1 x i32> %v
+}
+
+define <vscale x 1 x i32> @vssubu_vv_nxv1i32_unmasked(<vscale x 1 x i32> %va, <vscale x 1 x i32> %b, i32 zeroext %evl) {
+; CHECK-LABEL: vssubu_vv_nxv1i32_unmasked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, ma
+; CHECK-NEXT: vssubu.vv v8, v8, v9
+; CHECK-NEXT: ret
+ %head = insertelement <vscale x 1 x i1> poison, i1 true, i32 0
+ %m = shufflevector <vscale x 1 x i1> %head, <vscale x 1 x i1> poison, <vscale x 1 x i32> zeroinitializer
+ %v = call <vscale x 1 x i32> @llvm.vp.usub.sat.nxv1i32(<vscale x 1 x i32> %va, <vscale x 1 x i32> %b, <vscale x 1 x i1> %m, i32 %evl)
+ ret <vscale x 1 x i32> %v
+}
+
+define <vscale x 1 x i32> @vssubu_vx_nxv1i32(<vscale x 1 x i32> %va, i32 %b, <vscale x 1 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vssubu_vx_nxv1i32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma
+; CHECK-NEXT: vssubu.vx v8, v8, a0, v0.t
+; CHECK-NEXT: ret
+ %elt.head = insertelement <vscale x 1 x i32> poison, i32 %b, i32 0
+ %vb = shufflevector <vscale x 1 x i32> %elt.head, <vscale x 1 x i32> poison, <vscale x 1 x i32> zeroinitializer
+ %v = call <vscale x 1 x i32> @llvm.vp.usub.sat.nxv1i32(<vscale x 1 x i32> %va, <vscale x 1 x i32> %vb, <vscale x 1 x i1> %m, i32 %evl)
+ ret <vscale x 1 x i32> %v
+}
+
+define <vscale x 1 x i32> @vssubu_vx_nxv1i32_unmasked(<vscale x 1 x i32> %va, i32 %b, i32 zeroext %evl) {
+; CHECK-LABEL: vssubu_vx_nxv1i32_unmasked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma
+; CHECK-NEXT: vssubu.vx v8, v8, a0
+; CHECK-NEXT: ret
+ %elt.head = insertelement <vscale x 1 x i32> poison, i32 %b, i32 0
+ %vb = shufflevector <vscale x 1 x i32> %elt.head, <vscale x 1 x i32> poison, <vscale x 1 x i32> zeroinitializer
+ %head = insertelement <vscale x 1 x i1> poison, i1 true, i32 0
+ %m = shufflevector <vscale x 1 x i1> %head, <vscale x 1 x i1> poison, <vscale x 1 x i32> zeroinitializer
+ %v = call <vscale x 1 x i32> @llvm.vp.usub.sat.nxv1i32(<vscale x 1 x i32> %va, <vscale x 1 x i32> %vb, <vscale x 1 x i1> %m, i32 %evl)
+ ret <vscale x 1 x i32> %v
+}
+
+define <vscale x 1 x i32> @vssubu_vi_nxv1i32(<vscale x 1 x i32> %va, <vscale x 1 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vssubu_vi_nxv1i32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: li a1, -1
+; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, ma
+; CHECK-NEXT: vssubu.vx v8, v8, a1, v0.t
+; CHECK-NEXT: ret
+ %elt.head = insertelement <vscale x 1 x i32> poison, i32 -1, i32 0
+ %vb = shufflevector <vscale x 1 x i32> %elt.head, <vscale x 1 x i32> poison, <vscale x 1 x i32> zeroinitializer
+ %v = call <vscale x 1 x i32> @llvm.vp.usub.sat.nxv1i32(<vscale x 1 x i32> %va, <vscale x 1 x i32> %vb, <vscale x 1 x i1> %m, i32 %evl)
+ ret <vscale x 1 x i32> %v
+}
+
+define <vscale x 1 x i32> @vssubu_vi_nxv1i32_unmasked(<vscale x 1 x i32> %va, i32 zeroext %evl) {
+; CHECK-LABEL: vssubu_vi_nxv1i32_unmasked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: li a1, -1
+; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, ma
+; CHECK-NEXT: vssubu.vx v8, v8, a1
+; CHECK-NEXT: ret
+ %elt.head = insertelement <vscale x 1 x i32> poison, i32 -1, i32 0
+ %vb = shufflevector <vscale x 1 x i32> %elt.head, <vscale x 1 x i32> poison, <vscale x 1 x i32> zeroinitializer
+ %head = insertelement <vscale x 1 x i1> poison, i1 true, i32 0
+ %m = shufflevector <vscale x 1 x i1> %head, <vscale x 1 x i1> poison, <vscale x 1 x i32> zeroinitializer
+ %v = call <vscale x 1 x i32> @llvm.vp.usub.sat.nxv1i32(<vscale x 1 x i32> %va, <vscale x 1 x i32> %vb, <vscale x 1 x i1> %m, i32 %evl)
+ ret <vscale x 1 x i32> %v
+}
+
+declare <vscale x 2 x i32> @llvm.vp.usub.sat.nxv2i32(<vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i1>, i32)
+
+define <vscale x 2 x i32> @vssubu_vv_nxv2i32(<vscale x 2 x i32> %va, <vscale x 2 x i32> %b, <vscale x 2 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vssubu_vv_nxv2i32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, ma
+; CHECK-NEXT: vssubu.vv v8, v8, v9, v0.t
+; CHECK-NEXT: ret
+ %v = call <vscale x 2 x i32> @llvm.vp.usub.sat.nxv2i32(<vscale x 2 x i32> %va, <vscale x 2 x i32> %b, <vscale x 2 x i1> %m, i32 %evl)
+ ret <vscale x 2 x i32> %v
+}
+
+define <vscale x 2 x i32> @vssubu_vv_nxv2i32_unmasked(<vscale x 2 x i32> %va, <vscale x 2 x i32> %b, i32 zeroext %evl) {
+; CHECK-LABEL: vssubu_vv_nxv2i32_unmasked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, ma
+; CHECK-NEXT: vssubu.vv v8, v8, v9
+; CHECK-NEXT: ret
+ %head = insertelement <vscale x 2 x i1> poison, i1 true, i32 0
+ %m = shufflevector <vscale x 2 x i1> %head, <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer
+ %v = call <vscale x 2 x i32> @llvm.vp.usub.sat.nxv2i32(<vscale x 2 x i32> %va, <vscale x 2 x i32> %b, <vscale x 2 x i1> %m, i32 %evl)
+ ret <vscale x 2 x i32> %v
+}
+
+define <vscale x 2 x i32> @vssubu_vx_nxv2i32(<vscale x 2 x i32> %va, i32 %b, <vscale x 2 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vssubu_vx_nxv2i32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma
+; CHECK-NEXT: vssubu.vx v8, v8, a0, v0.t
+; CHECK-NEXT: ret
+ %elt.head = insertelement <vscale x 2 x i32> poison, i32 %b, i32 0
+ %vb = shufflevector <vscale x 2 x i32> %elt.head, <vscale x 2 x i32> poison, <vscale x 2 x i32> zeroinitializer
+ %v = call <vscale x 2 x i32> @llvm.vp.usub.sat.nxv2i32(<vscale x 2 x i32> %va, <vscale x 2 x i32> %vb, <vscale x 2 x i1> %m, i32 %evl)
+ ret <vscale x 2 x i32> %v
+}
+
+define <vscale x 2 x i32> @vssubu_vx_nxv2i32_unmasked(<vscale x 2 x i32> %va, i32 %b, i32 zeroext %evl) {
+; CHECK-LABEL: vssubu_vx_nxv2i32_unmasked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma
+; CHECK-NEXT: vssubu.vx v8, v8, a0
+; CHECK-NEXT: ret
+ %elt.head = insertelement <vscale x 2 x i32> poison, i32 %b, i32 0
+ %vb = shufflevector <vscale x 2 x i32> %elt.head, <vscale x 2 x i32> poison, <vscale x 2 x i32> zeroinitializer
+ %head = insertelement <vscale x 2 x i1> poison, i1 true, i32 0
+ %m = shufflevector <vscale x 2 x i1> %head, <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer
+ %v = call <vscale x 2 x i32> @llvm.vp.usub.sat.nxv2i32(<vscale x 2 x i32> %va, <vscale x 2 x i32> %vb, <vscale x 2 x i1> %m, i32 %evl)
+ ret <vscale x 2 x i32> %v
+}
+
+define <vscale x 2 x i32> @vssubu_vi_nxv2i32(<vscale x 2 x i32> %va, <vscale x 2 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vssubu_vi_nxv2i32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: li a1, -1
+; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, ma
+; CHECK-NEXT: vssubu.vx v8, v8, a1, v0.t
+; CHECK-NEXT: ret
+ %elt.head = insertelement <vscale x 2 x i32> poison, i32 -1, i32 0
+ %vb = shufflevector <vscale x 2 x i32> %elt.head, <vscale x 2 x i32> poison, <vscale x 2 x i32> zeroinitializer
+ %v = call <vscale x 2 x i32> @llvm.vp.usub.sat.nxv2i32(<vscale x 2 x i32> %va, <vscale x 2 x i32> %vb, <vscale x 2 x i1> %m, i32 %evl)
+ ret <vscale x 2 x i32> %v
+}
+
+define <vscale x 2 x i32> @vssubu_vi_nxv2i32_unmasked(<vscale x 2 x i32> %va, i32 zeroext %evl) {
+; CHECK-LABEL: vssubu_vi_nxv2i32_unmasked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: li a1, -1
+; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, ma
+; CHECK-NEXT: vssubu.vx v8, v8, a1
+; CHECK-NEXT: ret
+ %elt.head = insertelement <vscale x 2 x i32> poison, i32 -1, i32 0
+ %vb = shufflevector <vscale x 2 x i32> %elt.head, <vscale x 2 x i32> poison, <vscale x 2 x i32> zeroinitializer
+ %head = insertelement <vscale x 2 x i1> poison, i1 true, i32 0
+ %m = shufflevector <vscale x 2 x i1> %head, <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer
+ %v = call <vscale x 2 x i32> @llvm.vp.usub.sat.nxv2i32(<vscale x 2 x i32> %va, <vscale x 2 x i32> %vb, <vscale x 2 x i1> %m, i32 %evl)
+ ret <vscale x 2 x i32> %v
+}
+
+declare <vscale x 4 x i32> @llvm.vp.usub.sat.nxv4i32(<vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i1>, i32)
+
+define <vscale x 4 x i32> @vssubu_vv_nxv4i32(<vscale x 4 x i32> %va, <vscale x 4 x i32> %b, <vscale x 4 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vssubu_vv_nxv4i32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, ma
+; CHECK-NEXT: vssubu.vv v8, v8, v10, v0.t
+; CHECK-NEXT: ret
+ %v = call <vscale x 4 x i32> @llvm.vp.usub.sat.nxv4i32(<vscale x 4 x i32> %va, <vscale x 4 x i32> %b, <vscale x 4 x i1> %m, i32 %evl)
+ ret <vscale x 4 x i32> %v
+}
+
+define <vscale x 4 x i32> @vssubu_vv_nxv4i32_unmasked(<vscale x 4 x i32> %va, <vscale x 4 x i32> %b, i32 zeroext %evl) {
+; CHECK-LABEL: vssubu_vv_nxv4i32_unmasked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, ma
+; CHECK-NEXT: vssubu.vv v8, v8, v10
+; CHECK-NEXT: ret
+ %head = insertelement <vscale x 4 x i1> poison, i1 true, i32 0
+ %m = shufflevector <vscale x 4 x i1> %head, <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer
+ %v = call <vscale x 4 x i32> @llvm.vp.usub.sat.nxv4i32(<vscale x 4 x i32> %va, <vscale x 4 x i32> %b, <vscale x 4 x i1> %m, i32 %evl)
+ ret <vscale x 4 x i32> %v
+}
+
+define <vscale x 4 x i32> @vssubu_vx_nxv4i32(<vscale x 4 x i32> %va, i32 %b, <vscale x 4 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vssubu_vx_nxv4i32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, ma
+; CHECK-NEXT: vssubu.vx v8, v8, a0, v0.t
+; CHECK-NEXT: ret
+ %elt.head = insertelement <vscale x 4 x i32> poison, i32 %b, i32 0
+ %vb = shufflevector <vscale x 4 x i32> %elt.head, <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
+ %v = call <vscale x 4 x i32> @llvm.vp.usub.sat.nxv4i32(<vscale x 4 x i32> %va, <vscale x 4 x i32> %vb, <vscale x 4 x i1> %m, i32 %evl)
+ ret <vscale x 4 x i32> %v
+}
+
+define <vscale x 4 x i32> @vssubu_vx_nxv4i32_unmasked(<vscale x 4 x i32> %va, i32 %b, i32 zeroext %evl) {
+; CHECK-LABEL: vssubu_vx_nxv4i32_unmasked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, ma
+; CHECK-NEXT: vssubu.vx v8, v8, a0
+; CHECK-NEXT: ret
+ %elt.head = insertelement <vscale x 4 x i32> poison, i32 %b, i32 0
+ %vb = shufflevector <vscale x 4 x i32> %elt.head, <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
+ %head = insertelement <vscale x 4 x i1> poison, i1 true, i32 0
+ %m = shufflevector <vscale x 4 x i1> %head, <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer
+ %v = call <vscale x 4 x i32> @llvm.vp.usub.sat.nxv4i32(<vscale x 4 x i32> %va, <vscale x 4 x i32> %vb, <vscale x 4 x i1> %m, i32 %evl)
+ ret <vscale x 4 x i32> %v
+}
+
+define <vscale x 4 x i32> @vssubu_vi_nxv4i32(<vscale x 4 x i32> %va, <vscale x 4 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vssubu_vi_nxv4i32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: li a1, -1
+; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, ma
+; CHECK-NEXT: vssubu.vx v8, v8, a1, v0.t
+; CHECK-NEXT: ret
+ %elt.head = insertelement <vscale x 4 x i32> poison, i32 -1, i32 0
+ %vb = shufflevector <vscale x 4 x i32> %elt.head, <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
+ %v = call <vscale x 4 x i32> @llvm.vp.usub.sat.nxv4i32(<vscale x 4 x i32> %va, <vscale x 4 x i32> %vb, <vscale x 4 x i1> %m, i32 %evl)
+ ret <vscale x 4 x i32> %v
+}
+
+define <vscale x 4 x i32> @vssubu_vi_nxv4i32_unmasked(<vscale x 4 x i32> %va, i32 zeroext %evl) {
+; CHECK-LABEL: vssubu_vi_nxv4i32_unmasked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: li a1, -1
+; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, ma
+; CHECK-NEXT: vssubu.vx v8, v8, a1
+; CHECK-NEXT: ret
+ %elt.head = insertelement <vscale x 4 x i32> poison, i32 -1, i32 0
+ %vb = shufflevector <vscale x 4 x i32> %elt.head, <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
+ %head = insertelement <vscale x 4 x i1> poison, i1 true, i32 0
+ %m = shufflevector <vscale x 4 x i1> %head, <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer
+ %v = call <vscale x 4 x i32> @llvm.vp.usub.sat.nxv4i32(<vscale x 4 x i32> %va, <vscale x 4 x i32> %vb, <vscale x 4 x i1> %m, i32 %evl)
+ ret <vscale x 4 x i32> %v
+}
+
+declare <vscale x 8 x i32> @llvm.vp.usub.sat.nxv8i32(<vscale x 8 x i32>, <vscale x 8 x i32>, <vscale x 8 x i1>, i32)
+
+define <vscale x 8 x i32> @vssubu_vv_nxv8i32(<vscale x 8 x i32> %va, <vscale x 8 x i32> %b, <vscale x 8 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vssubu_vv_nxv8i32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, ma
+; CHECK-NEXT: vssubu.vv v8, v8, v12, v0.t
+; CHECK-NEXT: ret
+ %v = call <vscale x 8 x i32> @llvm.vp.usub.sat.nxv8i32(<vscale x 8 x i32> %va, <vscale x 8 x i32> %b, <vscale x 8 x i1> %m, i32 %evl)
+ ret <vscale x 8 x i32> %v
+}
+
+define <vscale x 8 x i32> @vssubu_vv_nxv8i32_unmasked(<vscale x 8 x i32> %va, <vscale x 8 x i32> %b, i32 zeroext %evl) {
+; CHECK-LABEL: vssubu_vv_nxv8i32_unmasked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, ma
+; CHECK-NEXT: vssubu.vv v8, v8, v12
+; CHECK-NEXT: ret
+ %head = insertelement <vscale x 8 x i1> poison, i1 true, i32 0
+ %m = shufflevector <vscale x 8 x i1> %head, <vscale x 8 x i1> poison, <vscale x 8 x i32> zeroinitializer
+ %v = call <vscale x 8 x i32> @llvm.vp.usub.sat.nxv8i32(<vscale x 8 x i32> %va, <vscale x 8 x i32> %b, <vscale x 8 x i1> %m, i32 %evl)
+ ret <vscale x 8 x i32> %v
+}
+
+define <vscale x 8 x i32> @vssubu_vx_nxv8i32(<vscale x 8 x i32> %va, i32 %b, <vscale x 8 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vssubu_vx_nxv8i32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e32, m4, ta, ma
+; CHECK-NEXT: vssubu.vx v8, v8, a0, v0.t
+; CHECK-NEXT: ret
+ %elt.head = insertelement <vscale x 8 x i32> poison, i32 %b, i32 0
+ %vb = shufflevector <vscale x 8 x i32> %elt.head, <vscale x 8 x i32> poison, <vscale x 8 x i32> zeroinitializer
+ %v = call <vscale x 8 x i32> @llvm.vp.usub.sat.nxv8i32(<vscale x 8 x i32> %va, <vscale x 8 x i32> %vb, <vscale x 8 x i1> %m, i32 %evl)
+ ret <vscale x 8 x i32> %v
+}
+
+define <vscale x 8 x i32> @vssubu_vx_nxv8i32_unmasked(<vscale x 8 x i32> %va, i32 %b, i32 zeroext %evl) {
+; CHECK-LABEL: vssubu_vx_nxv8i32_unmasked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e32, m4, ta, ma
+; CHECK-NEXT: vssubu.vx v8, v8, a0
+; CHECK-NEXT: ret
+ %elt.head = insertelement <vscale x 8 x i32> poison, i32 %b, i32 0
+ %vb = shufflevector <vscale x 8 x i32> %elt.head, <vscale x 8 x i32> poison, <vscale x 8 x i32> zeroinitializer
+ %head = insertelement <vscale x 8 x i1> poison, i1 true, i32 0
+ %m = shufflevector <vscale x 8 x i1> %head, <vscale x 8 x i1> poison, <vscale x 8 x i32> zeroinitializer
+ %v = call <vscale x 8 x i32> @llvm.vp.usub.sat.nxv8i32(<vscale x 8 x i32> %va, <vscale x 8 x i32> %vb, <vscale x 8 x i1> %m, i32 %evl)
+ ret <vscale x 8 x i32> %v
+}
+
+define <vscale x 8 x i32> @vssubu_vi_nxv8i32(<vscale x 8 x i32> %va, <vscale x 8 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vssubu_vi_nxv8i32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: li a1, -1
+; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, ma
+; CHECK-NEXT: vssubu.vx v8, v8, a1, v0.t
+; CHECK-NEXT: ret
+ %elt.head = insertelement <vscale x 8 x i32> poison, i32 -1, i32 0
+ %vb = shufflevector <vscale x 8 x i32> %elt.head, <vscale x 8 x i32> poison, <vscale x 8 x i32> zeroinitializer
+ %v = call <vscale x 8 x i32> @llvm.vp.usub.sat.nxv8i32(<vscale x 8 x i32> %va, <vscale x 8 x i32> %vb, <vscale x 8 x i1> %m, i32 %evl)
+ ret <vscale x 8 x i32> %v
+}
+
+define <vscale x 8 x i32> @vssubu_vi_nxv8i32_unmasked(<vscale x 8 x i32> %va, i32 zeroext %evl) {
+; CHECK-LABEL: vssubu_vi_nxv8i32_unmasked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: li a1, -1
+; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, ma
+; CHECK-NEXT: vssubu.vx v8, v8, a1
+; CHECK-NEXT: ret
+ %elt.head = insertelement <vscale x 8 x i32> poison, i32 -1, i32 0
+ %vb = shufflevector <vscale x 8 x i32> %elt.head, <vscale x 8 x i32> poison, <vscale x 8 x i32> zeroinitializer
+ %head = insertelement <vscale x 8 x i1> poison, i1 true, i32 0
+ %m = shufflevector <vscale x 8 x i1> %head, <vscale x 8 x i1> poison, <vscale x 8 x i32> zeroinitializer
+ %v = call <vscale x 8 x i32> @llvm.vp.usub.sat.nxv8i32(<vscale x 8 x i32> %va, <vscale x 8 x i32> %vb, <vscale x 8 x i1> %m, i32 %evl)
+ ret <vscale x 8 x i32> %v
+}
+
+declare <vscale x 16 x i32> @llvm.vp.usub.sat.nxv16i32(<vscale x 16 x i32>, <vscale x 16 x i32>, <vscale x 16 x i1>, i32)
+
+define <vscale x 16 x i32> @vssubu_vv_nxv16i32(<vscale x 16 x i32> %va, <vscale x 16 x i32> %b, <vscale x 16 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vssubu_vv_nxv16i32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e32, m8, ta, ma
+; CHECK-NEXT: vssubu.vv v8, v8, v16, v0.t
+; CHECK-NEXT: ret
+ %v = call <vscale x 16 x i32> @llvm.vp.usub.sat.nxv16i32(<vscale x 16 x i32> %va, <vscale x 16 x i32> %b, <vscale x 16 x i1> %m, i32 %evl)
+ ret <vscale x 16 x i32> %v
+}
+
+define <vscale x 16 x i32> @vssubu_vv_nxv16i32_unmasked(<vscale x 16 x i32> %va, <vscale x 16 x i32> %b, i32 zeroext %evl) {
+; CHECK-LABEL: vssubu_vv_nxv16i32_unmasked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e32, m8, ta, ma
+; CHECK-NEXT: vssubu.vv v8, v8, v16
+; CHECK-NEXT: ret
+ %head = insertelement <vscale x 16 x i1> poison, i1 true, i32 0
+ %m = shufflevector <vscale x 16 x i1> %head, <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer
+ %v = call <vscale x 16 x i32> @llvm.vp.usub.sat.nxv16i32(<vscale x 16 x i32> %va, <vscale x 16 x i32> %b, <vscale x 16 x i1> %m, i32 %evl)
+ ret <vscale x 16 x i32> %v
+}
+
+define <vscale x 16 x i32> @vssubu_vx_nxv16i32(<vscale x 16 x i32> %va, i32 %b, <vscale x 16 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vssubu_vx_nxv16i32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, ma
+; CHECK-NEXT: vssubu.vx v8, v8, a0, v0.t
+; CHECK-NEXT: ret
+ %elt.head = insertelement <vscale x 16 x i32> poison, i32 %b, i32 0
+ %vb = shufflevector <vscale x 16 x i32> %elt.head, <vscale x 16 x i32> poison, <vscale x 16 x i32> zeroinitializer
+ %v = call <vscale x 16 x i32> @llvm.vp.usub.sat.nxv16i32(<vscale x 16 x i32> %va, <vscale x 16 x i32> %vb, <vscale x 16 x i1> %m, i32 %evl)
+ ret <vscale x 16 x i32> %v
+}
+
+define <vscale x 16 x i32> @vssubu_vx_nxv16i32_unmasked(<vscale x 16 x i32> %va, i32 %b, i32 zeroext %evl) {
+; CHECK-LABEL: vssubu_vx_nxv16i32_unmasked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, ma
+; CHECK-NEXT: vssubu.vx v8, v8, a0
+; CHECK-NEXT: ret
+ %elt.head = insertelement <vscale x 16 x i32> poison, i32 %b, i32 0
+ %vb = shufflevector <vscale x 16 x i32> %elt.head, <vscale x 16 x i32> poison, <vscale x 16 x i32> zeroinitializer
+ %head = insertelement <vscale x 16 x i1> poison, i1 true, i32 0
+ %m = shufflevector <vscale x 16 x i1> %head, <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer
+ %v = call <vscale x 16 x i32> @llvm.vp.usub.sat.nxv16i32(<vscale x 16 x i32> %va, <vscale x 16 x i32> %vb, <vscale x 16 x i1> %m, i32 %evl)
+ ret <vscale x 16 x i32> %v
+}
+
+define <vscale x 16 x i32> @vssubu_vi_nxv16i32(<vscale x 16 x i32> %va, <vscale x 16 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vssubu_vi_nxv16i32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: li a1, -1
+; CHECK-NEXT: vsetvli zero, a0, e32, m8, ta, ma
+; CHECK-NEXT: vssubu.vx v8, v8, a1, v0.t
+; CHECK-NEXT: ret
+ %elt.head = insertelement <vscale x 16 x i32> poison, i32 -1, i32 0
+ %vb = shufflevector <vscale x 16 x i32> %elt.head, <vscale x 16 x i32> poison, <vscale x 16 x i32> zeroinitializer
+ %v = call <vscale x 16 x i32> @llvm.vp.usub.sat.nxv16i32(<vscale x 16 x i32> %va, <vscale x 16 x i32> %vb, <vscale x 16 x i1> %m, i32 %evl)
+ ret <vscale x 16 x i32> %v
+}
+
+define <vscale x 16 x i32> @vssubu_vi_nxv16i32_unmasked(<vscale x 16 x i32> %va, i32 zeroext %evl) {
+; CHECK-LABEL: vssubu_vi_nxv16i32_unmasked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: li a1, -1
+; CHECK-NEXT: vsetvli zero, a0, e32, m8, ta, ma
+; CHECK-NEXT: vssubu.vx v8, v8, a1
+; CHECK-NEXT: ret
+ %elt.head = insertelement <vscale x 16 x i32> poison, i32 -1, i32 0
+ %vb = shufflevector <vscale x 16 x i32> %elt.head, <vscale x 16 x i32> poison, <vscale x 16 x i32> zeroinitializer
+ %head = insertelement <vscale x 16 x i1> poison, i1 true, i32 0
+ %m = shufflevector <vscale x 16 x i1> %head, <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer
+ %v = call <vscale x 16 x i32> @llvm.vp.usub.sat.nxv16i32(<vscale x 16 x i32> %va, <vscale x 16 x i32> %vb, <vscale x 16 x i1> %m, i32 %evl)
+ ret <vscale x 16 x i32> %v
+}
+
+; Test that split-legalization works when the mask needs manual splitting.
+
+declare <vscale x 32 x i32> @llvm.vp.usub.sat.nxv32i32(<vscale x 32 x i32>, <vscale x 32 x i32>, <vscale x 32 x i1>, i32)
+
+define <vscale x 32 x i32> @vssubu_vi_nxv32i32(<vscale x 32 x i32> %va, <vscale x 32 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vssubu_vi_nxv32i32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmv1r.v v24, v0
+; CHECK-NEXT: csrr a2, vlenb
+; CHECK-NEXT: srli a1, a2, 2
+; CHECK-NEXT: vsetvli a3, zero, e8, mf2, ta, ma
+; CHECK-NEXT: vslidedown.vx v0, v0, a1
+; CHECK-NEXT: slli a2, a2, 1
+; CHECK-NEXT: sub a1, a0, a2
+; CHECK-NEXT: sltu a3, a0, a1
+; CHECK-NEXT: addi a3, a3, -1
+; CHECK-NEXT: and a3, a3, a1
+; CHECK-NEXT: li a1, -1
+; CHECK-NEXT: vsetvli zero, a3, e32, m8, ta, ma
+; CHECK-NEXT: vssubu.vx v16, v16, a1, v0.t
+; CHECK-NEXT: bltu a0, a2, .LBB118_2
+; CHECK-NEXT: # %bb.1:
+; CHECK-NEXT: mv a0, a2
+; CHECK-NEXT: .LBB118_2:
+; CHECK-NEXT: vsetvli zero, a0, e32, m8, ta, ma
+; CHECK-NEXT: vmv1r.v v0, v24
+; CHECK-NEXT: vssubu.vx v8, v8, a1, v0.t
+; CHECK-NEXT: ret
+ %elt.head = insertelement <vscale x 32 x i32> poison, i32 -1, i32 0
+ %vb = shufflevector <vscale x 32 x i32> %elt.head, <vscale x 32 x i32> poison, <vscale x 32 x i32> zeroinitializer
+ %v = call <vscale x 32 x i32> @llvm.vp.usub.sat.nxv32i32(<vscale x 32 x i32> %va, <vscale x 32 x i32> %vb, <vscale x 32 x i1> %m, i32 %evl)
+ ret <vscale x 32 x i32> %v
+}
+
+define <vscale x 32 x i32> @vssubu_vi_nxv32i32_unmasked(<vscale x 32 x i32> %va, i32 zeroext %evl) {
+; CHECK-LABEL: vssubu_vi_nxv32i32_unmasked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: csrr a1, vlenb
+; CHECK-NEXT: slli a1, a1, 1
+; CHECK-NEXT: sub a2, a0, a1
+; CHECK-NEXT: sltu a3, a0, a2
+; CHECK-NEXT: addi a3, a3, -1
+; CHECK-NEXT: and a3, a3, a2
+; CHECK-NEXT: li a2, -1
+; CHECK-NEXT: vsetvli zero, a3, e32, m8, ta, ma
+; CHECK-NEXT: vssubu.vx v16, v16, a2
+; CHECK-NEXT: bltu a0, a1, .LBB119_2
+; CHECK-NEXT: # %bb.1:
+; CHECK-NEXT: mv a0, a1
+; CHECK-NEXT: .LBB119_2:
+; CHECK-NEXT: vsetvli zero, a0, e32, m8, ta, ma
+; CHECK-NEXT: vssubu.vx v8, v8, a2
+; CHECK-NEXT: ret
+ %elt.head = insertelement <vscale x 32 x i32> poison, i32 -1, i32 0
+ %vb = shufflevector <vscale x 32 x i32> %elt.head, <vscale x 32 x i32> poison, <vscale x 32 x i32> zeroinitializer
+ %head = insertelement <vscale x 32 x i1> poison, i1 true, i32 0
+ %m = shufflevector <vscale x 32 x i1> %head, <vscale x 32 x i1> poison, <vscale x 32 x i32> zeroinitializer
+ %v = call <vscale x 32 x i32> @llvm.vp.usub.sat.nxv32i32(<vscale x 32 x i32> %va, <vscale x 32 x i32> %vb, <vscale x 32 x i1> %m, i32 %evl)
+ ret <vscale x 32 x i32> %v
+}
+
+declare <vscale x 1 x i64> @llvm.vp.usub.sat.nxv1i64(<vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i1>, i32)
+
+define <vscale x 1 x i64> @vssubu_vv_nxv1i64(<vscale x 1 x i64> %va, <vscale x 1 x i64> %b, <vscale x 1 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vssubu_vv_nxv1i64:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, ma
+; CHECK-NEXT: vssubu.vv v8, v8, v9, v0.t
+; CHECK-NEXT: ret
+ %v = call <vscale x 1 x i64> @llvm.vp.usub.sat.nxv1i64(<vscale x 1 x i64> %va, <vscale x 1 x i64> %b, <vscale x 1 x i1> %m, i32 %evl)
+ ret <vscale x 1 x i64> %v
+}
+
+define <vscale x 1 x i64> @vssubu_vv_nxv1i64_unmasked(<vscale x 1 x i64> %va, <vscale x 1 x i64> %b, i32 zeroext %evl) {
+; CHECK-LABEL: vssubu_vv_nxv1i64_unmasked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, ma
+; CHECK-NEXT: vssubu.vv v8, v8, v9
+; CHECK-NEXT: ret
+ %head = insertelement <vscale x 1 x i1> poison, i1 true, i32 0
+ %m = shufflevector <vscale x 1 x i1> %head, <vscale x 1 x i1> poison, <vscale x 1 x i32> zeroinitializer
+ %v = call <vscale x 1 x i64> @llvm.vp.usub.sat.nxv1i64(<vscale x 1 x i64> %va, <vscale x 1 x i64> %b, <vscale x 1 x i1> %m, i32 %evl)
+ ret <vscale x 1 x i64> %v
+}
+
+define <vscale x 1 x i64> @vssubu_vx_nxv1i64(<vscale x 1 x i64> %va, i64 %b, <vscale x 1 x i1> %m, i32 zeroext %evl) {
+; RV32-LABEL: vssubu_vx_nxv1i64:
+; RV32: # %bb.0:
+; RV32-NEXT: addi sp, sp, -16
+; RV32-NEXT: .cfi_def_cfa_offset 16
+; RV32-NEXT: sw a1, 12(sp)
+; RV32-NEXT: sw a0, 8(sp)
+; RV32-NEXT: addi a0, sp, 8
+; RV32-NEXT: vsetvli a1, zero, e64, m1, ta, ma
+; RV32-NEXT: vlse64.v v9, (a0), zero
+; RV32-NEXT: vsetvli zero, a2, e64, m1, ta, ma
+; RV32-NEXT: vssubu.vv v8, v8, v9, v0.t
+; RV32-NEXT: addi sp, sp, 16
+; RV32-NEXT: ret
+;
+; RV64-LABEL: vssubu_vx_nxv1i64:
+; RV64: # %bb.0:
+; RV64-NEXT: vsetvli zero, a1, e64, m1, ta, ma
+; RV64-NEXT: vssubu.vx v8, v8, a0, v0.t
+; RV64-NEXT: ret
+ %elt.head = insertelement <vscale x 1 x i64> poison, i64 %b, i32 0
+ %vb = shufflevector <vscale x 1 x i64> %elt.head, <vscale x 1 x i64> poison, <vscale x 1 x i32> zeroinitializer
+ %v = call <vscale x 1 x i64> @llvm.vp.usub.sat.nxv1i64(<vscale x 1 x i64> %va, <vscale x 1 x i64> %vb, <vscale x 1 x i1> %m, i32 %evl)
+ ret <vscale x 1 x i64> %v
+}
+
+define <vscale x 1 x i64> @vssubu_vx_nxv1i64_unmasked(<vscale x 1 x i64> %va, i64 %b, i32 zeroext %evl) {
+; RV32-LABEL: vssubu_vx_nxv1i64_unmasked:
+; RV32: # %bb.0:
+; RV32-NEXT: addi sp, sp, -16
+; RV32-NEXT: .cfi_def_cfa_offset 16
+; RV32-NEXT: sw a1, 12(sp)
+; RV32-NEXT: sw a0, 8(sp)
+; RV32-NEXT: addi a0, sp, 8
+; RV32-NEXT: vsetvli a1, zero, e64, m1, ta, ma
+; RV32-NEXT: vlse64.v v9, (a0), zero
+; RV32-NEXT: vsetvli zero, a2, e64, m1, ta, ma
+; RV32-NEXT: vssubu.vv v8, v8, v9
+; RV32-NEXT: addi sp, sp, 16
+; RV32-NEXT: ret
+;
+; RV64-LABEL: vssubu_vx_nxv1i64_unmasked:
+; RV64: # %bb.0:
+; RV64-NEXT: vsetvli zero, a1, e64, m1, ta, ma
+; RV64-NEXT: vssubu.vx v8, v8, a0
+; RV64-NEXT: ret
+ %elt.head = insertelement <vscale x 1 x i64> poison, i64 %b, i32 0
+ %vb = shufflevector <vscale x 1 x i64> %elt.head, <vscale x 1 x i64> poison, <vscale x 1 x i32> zeroinitializer
+ %head = insertelement <vscale x 1 x i1> poison, i1 true, i32 0
+ %m = shufflevector <vscale x 1 x i1> %head, <vscale x 1 x i1> poison, <vscale x 1 x i32> zeroinitializer
+ %v = call <vscale x 1 x i64> @llvm.vp.usub.sat.nxv1i64(<vscale x 1 x i64> %va, <vscale x 1 x i64> %vb, <vscale x 1 x i1> %m, i32 %evl)
+ ret <vscale x 1 x i64> %v
+}
+
+define <vscale x 1 x i64> @vssubu_vi_nxv1i64(<vscale x 1 x i64> %va, <vscale x 1 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vssubu_vi_nxv1i64:
+; CHECK: # %bb.0:
+; CHECK-NEXT: li a1, -1
+; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, ma
+; CHECK-NEXT: vssubu.vx v8, v8, a1, v0.t
+; CHECK-NEXT: ret
+ %elt.head = insertelement <vscale x 1 x i64> poison, i64 -1, i32 0
+ %vb = shufflevector <vscale x 1 x i64> %elt.head, <vscale x 1 x i64> poison, <vscale x 1 x i32> zeroinitializer
+ %v = call <vscale x 1 x i64> @llvm.vp.usub.sat.nxv1i64(<vscale x 1 x i64> %va, <vscale x 1 x i64> %vb, <vscale x 1 x i1> %m, i32 %evl)
+ ret <vscale x 1 x i64> %v
+}
+
+define <vscale x 1 x i64> @vssubu_vi_nxv1i64_unmasked(<vscale x 1 x i64> %va, i32 zeroext %evl) {
+; CHECK-LABEL: vssubu_vi_nxv1i64_unmasked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: li a1, -1
+; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, ma
+; CHECK-NEXT: vssubu.vx v8, v8, a1
+; CHECK-NEXT: ret
+ %elt.head = insertelement <vscale x 1 x i64> poison, i64 -1, i32 0
+ %vb = shufflevector <vscale x 1 x i64> %elt.head, <vscale x 1 x i64> poison, <vscale x 1 x i32> zeroinitializer
+ %head = insertelement <vscale x 1 x i1> poison, i1 true, i32 0
+ %m = shufflevector <vscale x 1 x i1> %head, <vscale x 1 x i1> poison, <vscale x 1 x i32> zeroinitializer
+ %v = call <vscale x 1 x i64> @llvm.vp.usub.sat.nxv1i64(<vscale x 1 x i64> %va, <vscale x 1 x i64> %vb, <vscale x 1 x i1> %m, i32 %evl)
+ ret <vscale x 1 x i64> %v
+}
+
+declare <vscale x 2 x i64> @llvm.vp.usub.sat.nxv2i64(<vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i1>, i32)
+
+define <vscale x 2 x i64> @vssubu_vv_nxv2i64(<vscale x 2 x i64> %va, <vscale x 2 x i64> %b, <vscale x 2 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vssubu_vv_nxv2i64:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e64, m2, ta, ma
+; CHECK-NEXT: vssubu.vv v8, v8, v10, v0.t
+; CHECK-NEXT: ret
+ %v = call <vscale x 2 x i64> @llvm.vp.usub.sat.nxv2i64(<vscale x 2 x i64> %va, <vscale x 2 x i64> %b, <vscale x 2 x i1> %m, i32 %evl)
+ ret <vscale x 2 x i64> %v
+}
+
+define <vscale x 2 x i64> @vssubu_vv_nxv2i64_unmasked(<vscale x 2 x i64> %va, <vscale x 2 x i64> %b, i32 zeroext %evl) {
+; CHECK-LABEL: vssubu_vv_nxv2i64_unmasked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e64, m2, ta, ma
+; CHECK-NEXT: vssubu.vv v8, v8, v10
+; CHECK-NEXT: ret
+ %head = insertelement <vscale x 2 x i1> poison, i1 true, i32 0
+ %m = shufflevector <vscale x 2 x i1> %head, <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer
+ %v = call <vscale x 2 x i64> @llvm.vp.usub.sat.nxv2i64(<vscale x 2 x i64> %va, <vscale x 2 x i64> %b, <vscale x 2 x i1> %m, i32 %evl)
+ ret <vscale x 2 x i64> %v
+}
+
+define <vscale x 2 x i64> @vssubu_vx_nxv2i64(<vscale x 2 x i64> %va, i64 %b, <vscale x 2 x i1> %m, i32 zeroext %evl) {
+; RV32-LABEL: vssubu_vx_nxv2i64:
+; RV32: # %bb.0:
+; RV32-NEXT: addi sp, sp, -16
+; RV32-NEXT: .cfi_def_cfa_offset 16
+; RV32-NEXT: sw a1, 12(sp)
+; RV32-NEXT: sw a0, 8(sp)
+; RV32-NEXT: addi a0, sp, 8
+; RV32-NEXT: vsetvli a1, zero, e64, m2, ta, ma
+; RV32-NEXT: vlse64.v v10, (a0), zero
+; RV32-NEXT: vsetvli zero, a2, e64, m2, ta, ma
+; RV32-NEXT: vssubu.vv v8, v8, v10, v0.t
+; RV32-NEXT: addi sp, sp, 16
+; RV32-NEXT: ret
+;
+; RV64-LABEL: vssubu_vx_nxv2i64:
+; RV64: # %bb.0:
+; RV64-NEXT: vsetvli zero, a1, e64, m2, ta, ma
+; RV64-NEXT: vssubu.vx v8, v8, a0, v0.t
+; RV64-NEXT: ret
+ %elt.head = insertelement <vscale x 2 x i64> poison, i64 %b, i32 0
+ %vb = shufflevector <vscale x 2 x i64> %elt.head, <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer
+ %v = call <vscale x 2 x i64> @llvm.vp.usub.sat.nxv2i64(<vscale x 2 x i64> %va, <vscale x 2 x i64> %vb, <vscale x 2 x i1> %m, i32 %evl)
+ ret <vscale x 2 x i64> %v
+}
+
+define <vscale x 2 x i64> @vssubu_vx_nxv2i64_unmasked(<vscale x 2 x i64> %va, i64 %b, i32 zeroext %evl) {
+; RV32-LABEL: vssubu_vx_nxv2i64_unmasked:
+; RV32: # %bb.0:
+; RV32-NEXT: addi sp, sp, -16
+; RV32-NEXT: .cfi_def_cfa_offset 16
+; RV32-NEXT: sw a1, 12(sp)
+; RV32-NEXT: sw a0, 8(sp)
+; RV32-NEXT: addi a0, sp, 8
+; RV32-NEXT: vsetvli a1, zero, e64, m2, ta, ma
+; RV32-NEXT: vlse64.v v10, (a0), zero
+; RV32-NEXT: vsetvli zero, a2, e64, m2, ta, ma
+; RV32-NEXT: vssubu.vv v8, v8, v10
+; RV32-NEXT: addi sp, sp, 16
+; RV32-NEXT: ret
+;
+; RV64-LABEL: vssubu_vx_nxv2i64_unmasked:
+; RV64: # %bb.0:
+; RV64-NEXT: vsetvli zero, a1, e64, m2, ta, ma
+; RV64-NEXT: vssubu.vx v8, v8, a0
+; RV64-NEXT: ret
+ %elt.head = insertelement <vscale x 2 x i64> poison, i64 %b, i32 0
+ %vb = shufflevector <vscale x 2 x i64> %elt.head, <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer
+ %head = insertelement <vscale x 2 x i1> poison, i1 true, i32 0
+ %m = shufflevector <vscale x 2 x i1> %head, <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer
+ %v = call <vscale x 2 x i64> @llvm.vp.usub.sat.nxv2i64(<vscale x 2 x i64> %va, <vscale x 2 x i64> %vb, <vscale x 2 x i1> %m, i32 %evl)
+ ret <vscale x 2 x i64> %v
+}
+
+define <vscale x 2 x i64> @vssubu_vi_nxv2i64(<vscale x 2 x i64> %va, <vscale x 2 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vssubu_vi_nxv2i64:
+; CHECK: # %bb.0:
+; CHECK-NEXT: li a1, -1
+; CHECK-NEXT: vsetvli zero, a0, e64, m2, ta, ma
+; CHECK-NEXT: vssubu.vx v8, v8, a1, v0.t
+; CHECK-NEXT: ret
+ %elt.head = insertelement <vscale x 2 x i64> poison, i64 -1, i32 0
+ %vb = shufflevector <vscale x 2 x i64> %elt.head, <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer
+ %v = call <vscale x 2 x i64> @llvm.vp.usub.sat.nxv2i64(<vscale x 2 x i64> %va, <vscale x 2 x i64> %vb, <vscale x 2 x i1> %m, i32 %evl)
+ ret <vscale x 2 x i64> %v
+}
+
+define <vscale x 2 x i64> @vssubu_vi_nxv2i64_unmasked(<vscale x 2 x i64> %va, i32 zeroext %evl) {
+; CHECK-LABEL: vssubu_vi_nxv2i64_unmasked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: li a1, -1
+; CHECK-NEXT: vsetvli zero, a0, e64, m2, ta, ma
+; CHECK-NEXT: vssubu.vx v8, v8, a1
+; CHECK-NEXT: ret
+ %elt.head = insertelement <vscale x 2 x i64> poison, i64 -1, i32 0
+ %vb = shufflevector <vscale x 2 x i64> %elt.head, <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer
+ %head = insertelement <vscale x 2 x i1> poison, i1 true, i32 0
+ %m = shufflevector <vscale x 2 x i1> %head, <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer
+ %v = call <vscale x 2 x i64> @llvm.vp.usub.sat.nxv2i64(<vscale x 2 x i64> %va, <vscale x 2 x i64> %vb, <vscale x 2 x i1> %m, i32 %evl)
+ ret <vscale x 2 x i64> %v
+}
+
+declare <vscale x 4 x i64> @llvm.vp.usub.sat.nxv4i64(<vscale x 4 x i64>, <vscale x 4 x i64>, <vscale x 4 x i1>, i32)
+
+define <vscale x 4 x i64> @vssubu_vv_nxv4i64(<vscale x 4 x i64> %va, <vscale x 4 x i64> %b, <vscale x 4 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vssubu_vv_nxv4i64:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e64, m4, ta, ma
+; CHECK-NEXT: vssubu.vv v8, v8, v12, v0.t
+; CHECK-NEXT: ret
+ %v = call <vscale x 4 x i64> @llvm.vp.usub.sat.nxv4i64(<vscale x 4 x i64> %va, <vscale x 4 x i64> %b, <vscale x 4 x i1> %m, i32 %evl)
+ ret <vscale x 4 x i64> %v
+}
+
+define <vscale x 4 x i64> @vssubu_vv_nxv4i64_unmasked(<vscale x 4 x i64> %va, <vscale x 4 x i64> %b, i32 zeroext %evl) {
+; CHECK-LABEL: vssubu_vv_nxv4i64_unmasked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e64, m4, ta, ma
+; CHECK-NEXT: vssubu.vv v8, v8, v12
+; CHECK-NEXT: ret
+ %head = insertelement <vscale x 4 x i1> poison, i1 true, i32 0
+ %m = shufflevector <vscale x 4 x i1> %head, <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer
+ %v = call <vscale x 4 x i64> @llvm.vp.usub.sat.nxv4i64(<vscale x 4 x i64> %va, <vscale x 4 x i64> %b, <vscale x 4 x i1> %m, i32 %evl)
+ ret <vscale x 4 x i64> %v
+}
+
+define <vscale x 4 x i64> @vssubu_vx_nxv4i64(<vscale x 4 x i64> %va, i64 %b, <vscale x 4 x i1> %m, i32 zeroext %evl) {
+; RV32-LABEL: vssubu_vx_nxv4i64:
+; RV32: # %bb.0:
+; RV32-NEXT: addi sp, sp, -16
+; RV32-NEXT: .cfi_def_cfa_offset 16
+; RV32-NEXT: sw a1, 12(sp)
+; RV32-NEXT: sw a0, 8(sp)
+; RV32-NEXT: addi a0, sp, 8
+; RV32-NEXT: vsetvli a1, zero, e64, m4, ta, ma
+; RV32-NEXT: vlse64.v v12, (a0), zero
+; RV32-NEXT: vsetvli zero, a2, e64, m4, ta, ma
+; RV32-NEXT: vssubu.vv v8, v8, v12, v0.t
+; RV32-NEXT: addi sp, sp, 16
+; RV32-NEXT: ret
+;
+; RV64-LABEL: vssubu_vx_nxv4i64:
+; RV64: # %bb.0:
+; RV64-NEXT: vsetvli zero, a1, e64, m4, ta, ma
+; RV64-NEXT: vssubu.vx v8, v8, a0, v0.t
+; RV64-NEXT: ret
+ %elt.head = insertelement <vscale x 4 x i64> poison, i64 %b, i32 0
+ %vb = shufflevector <vscale x 4 x i64> %elt.head, <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer
+ %v = call <vscale x 4 x i64> @llvm.vp.usub.sat.nxv4i64(<vscale x 4 x i64> %va, <vscale x 4 x i64> %vb, <vscale x 4 x i1> %m, i32 %evl)
+ ret <vscale x 4 x i64> %v
+}
+
+define <vscale x 4 x i64> @vssubu_vx_nxv4i64_unmasked(<vscale x 4 x i64> %va, i64 %b, i32 zeroext %evl) {
+; RV32-LABEL: vssubu_vx_nxv4i64_unmasked:
+; RV32: # %bb.0:
+; RV32-NEXT: addi sp, sp, -16
+; RV32-NEXT: .cfi_def_cfa_offset 16
+; RV32-NEXT: sw a1, 12(sp)
+; RV32-NEXT: sw a0, 8(sp)
+; RV32-NEXT: addi a0, sp, 8
+; RV32-NEXT: vsetvli a1, zero, e64, m4, ta, ma
+; RV32-NEXT: vlse64.v v12, (a0), zero
+; RV32-NEXT: vsetvli zero, a2, e64, m4, ta, ma
+; RV32-NEXT: vssubu.vv v8, v8, v12
+; RV32-NEXT: addi sp, sp, 16
+; RV32-NEXT: ret
+;
+; RV64-LABEL: vssubu_vx_nxv4i64_unmasked:
+; RV64: # %bb.0:
+; RV64-NEXT: vsetvli zero, a1, e64, m4, ta, ma
+; RV64-NEXT: vssubu.vx v8, v8, a0
+; RV64-NEXT: ret
+ %elt.head = insertelement <vscale x 4 x i64> poison, i64 %b, i32 0
+ %vb = shufflevector <vscale x 4 x i64> %elt.head, <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer
+ %head = insertelement <vscale x 4 x i1> poison, i1 true, i32 0
+ %m = shufflevector <vscale x 4 x i1> %head, <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer
+ %v = call <vscale x 4 x i64> @llvm.vp.usub.sat.nxv4i64(<vscale x 4 x i64> %va, <vscale x 4 x i64> %vb, <vscale x 4 x i1> %m, i32 %evl)
+ ret <vscale x 4 x i64> %v
+}
+
+define <vscale x 4 x i64> @vssubu_vi_nxv4i64(<vscale x 4 x i64> %va, <vscale x 4 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vssubu_vi_nxv4i64:
+; CHECK: # %bb.0:
+; CHECK-NEXT: li a1, -1
+; CHECK-NEXT: vsetvli zero, a0, e64, m4, ta, ma
+; CHECK-NEXT: vssubu.vx v8, v8, a1, v0.t
+; CHECK-NEXT: ret
+ %elt.head = insertelement <vscale x 4 x i64> poison, i64 -1, i32 0
+ %vb = shufflevector <vscale x 4 x i64> %elt.head, <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer
+ %v = call <vscale x 4 x i64> @llvm.vp.usub.sat.nxv4i64(<vscale x 4 x i64> %va, <vscale x 4 x i64> %vb, <vscale x 4 x i1> %m, i32 %evl)
+ ret <vscale x 4 x i64> %v
+}
+
+define <vscale x 4 x i64> @vssubu_vi_nxv4i64_unmasked(<vscale x 4 x i64> %va, i32 zeroext %evl) {
+; CHECK-LABEL: vssubu_vi_nxv4i64_unmasked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: li a1, -1
+; CHECK-NEXT: vsetvli zero, a0, e64, m4, ta, ma
+; CHECK-NEXT: vssubu.vx v8, v8, a1
+; CHECK-NEXT: ret
+ %elt.head = insertelement <vscale x 4 x i64> poison, i64 -1, i32 0
+ %vb = shufflevector <vscale x 4 x i64> %elt.head, <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer
+ %head = insertelement <vscale x 4 x i1> poison, i1 true, i32 0
+ %m = shufflevector <vscale x 4 x i1> %head, <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer
+ %v = call <vscale x 4 x i64> @llvm.vp.usub.sat.nxv4i64(<vscale x 4 x i64> %va, <vscale x 4 x i64> %vb, <vscale x 4 x i1> %m, i32 %evl)
+ ret <vscale x 4 x i64> %v
+}
+
+declare <vscale x 8 x i64> @llvm.vp.usub.sat.nxv8i64(<vscale x 8 x i64>, <vscale x 8 x i64>, <vscale x 8 x i1>, i32)
+
+define <vscale x 8 x i64> @vssubu_vv_nxv8i64(<vscale x 8 x i64> %va, <vscale x 8 x i64> %b, <vscale x 8 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vssubu_vv_nxv8i64:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma
+; CHECK-NEXT: vssubu.vv v8, v8, v16, v0.t
+; CHECK-NEXT: ret
+ %v = call <vscale x 8 x i64> @llvm.vp.usub.sat.nxv8i64(<vscale x 8 x i64> %va, <vscale x 8 x i64> %b, <vscale x 8 x i1> %m, i32 %evl)
+ ret <vscale x 8 x i64> %v
+}
+
+define <vscale x 8 x i64> @vssubu_vv_nxv8i64_unmasked(<vscale x 8 x i64> %va, <vscale x 8 x i64> %b, i32 zeroext %evl) {
+; CHECK-LABEL: vssubu_vv_nxv8i64_unmasked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma
+; CHECK-NEXT: vssubu.vv v8, v8, v16
+; CHECK-NEXT: ret
+ %head = insertelement <vscale x 8 x i1> poison, i1 true, i32 0
+ %m = shufflevector <vscale x 8 x i1> %head, <vscale x 8 x i1> poison, <vscale x 8 x i32> zeroinitializer
+ %v = call <vscale x 8 x i64> @llvm.vp.usub.sat.nxv8i64(<vscale x 8 x i64> %va, <vscale x 8 x i64> %b, <vscale x 8 x i1> %m, i32 %evl)
+ ret <vscale x 8 x i64> %v
+}
+
+define <vscale x 8 x i64> @vssubu_vx_nxv8i64(<vscale x 8 x i64> %va, i64 %b, <vscale x 8 x i1> %m, i32 zeroext %evl) {
+; RV32-LABEL: vssubu_vx_nxv8i64:
+; RV32: # %bb.0:
+; RV32-NEXT: addi sp, sp, -16
+; RV32-NEXT: .cfi_def_cfa_offset 16
+; RV32-NEXT: sw a1, 12(sp)
+; RV32-NEXT: sw a0, 8(sp)
+; RV32-NEXT: addi a0, sp, 8
+; RV32-NEXT: vsetvli a1, zero, e64, m8, ta, ma
+; RV32-NEXT: vlse64.v v16, (a0), zero
+; RV32-NEXT: vsetvli zero, a2, e64, m8, ta, ma
+; RV32-NEXT: vssubu.vv v8, v8, v16, v0.t
+; RV32-NEXT: addi sp, sp, 16
+; RV32-NEXT: ret
+;
+; RV64-LABEL: vssubu_vx_nxv8i64:
+; RV64: # %bb.0:
+; RV64-NEXT: vsetvli zero, a1, e64, m8, ta, ma
+; RV64-NEXT: vssubu.vx v8, v8, a0, v0.t
+; RV64-NEXT: ret
+ %elt.head = insertelement <vscale x 8 x i64> poison, i64 %b, i32 0
+ %vb = shufflevector <vscale x 8 x i64> %elt.head, <vscale x 8 x i64> poison, <vscale x 8 x i32> zeroinitializer
+ %v = call <vscale x 8 x i64> @llvm.vp.usub.sat.nxv8i64(<vscale x 8 x i64> %va, <vscale x 8 x i64> %vb, <vscale x 8 x i1> %m, i32 %evl)
+ ret <vscale x 8 x i64> %v
+}
+
+define <vscale x 8 x i64> @vssubu_vx_nxv8i64_unmasked(<vscale x 8 x i64> %va, i64 %b, i32 zeroext %evl) {
+; RV32-LABEL: vssubu_vx_nxv8i64_unmasked:
+; RV32: # %bb.0:
+; RV32-NEXT: addi sp, sp, -16
+; RV32-NEXT: .cfi_def_cfa_offset 16
+; RV32-NEXT: sw a1, 12(sp)
+; RV32-NEXT: sw a0, 8(sp)
+; RV32-NEXT: addi a0, sp, 8
+; RV32-NEXT: vsetvli a1, zero, e64, m8, ta, ma
+; RV32-NEXT: vlse64.v v16, (a0), zero
+; RV32-NEXT: vsetvli zero, a2, e64, m8, ta, ma
+; RV32-NEXT: vssubu.vv v8, v8, v16
+; RV32-NEXT: addi sp, sp, 16
+; RV32-NEXT: ret
+;
+; RV64-LABEL: vssubu_vx_nxv8i64_unmasked:
+; RV64: # %bb.0:
+; RV64-NEXT: vsetvli zero, a1, e64, m8, ta, ma
+; RV64-NEXT: vssubu.vx v8, v8, a0
+; RV64-NEXT: ret
+ %elt.head = insertelement <vscale x 8 x i64> poison, i64 %b, i32 0
+ %vb = shufflevector <vscale x 8 x i64> %elt.head, <vscale x 8 x i64> poison, <vscale x 8 x i32> zeroinitializer
+ %head = insertelement <vscale x 8 x i1> poison, i1 true, i32 0
+ %m = shufflevector <vscale x 8 x i1> %head, <vscale x 8 x i1> poison, <vscale x 8 x i32> zeroinitializer
+ %v = call <vscale x 8 x i64> @llvm.vp.usub.sat.nxv8i64(<vscale x 8 x i64> %va, <vscale x 8 x i64> %vb, <vscale x 8 x i1> %m, i32 %evl)
+ ret <vscale x 8 x i64> %v
+}
+
+define <vscale x 8 x i64> @vssubu_vi_nxv8i64(<vscale x 8 x i64> %va, <vscale x 8 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vssubu_vi_nxv8i64:
+; CHECK: # %bb.0:
+; CHECK-NEXT: li a1, -1
+; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma
+; CHECK-NEXT: vssubu.vx v8, v8, a1, v0.t
+; CHECK-NEXT: ret
+ %elt.head = insertelement <vscale x 8 x i64> poison, i64 -1, i32 0
+ %vb = shufflevector <vscale x 8 x i64> %elt.head, <vscale x 8 x i64> poison, <vscale x 8 x i32> zeroinitializer
+ %v = call <vscale x 8 x i64> @llvm.vp.usub.sat.nxv8i64(<vscale x 8 x i64> %va, <vscale x 8 x i64> %vb, <vscale x 8 x i1> %m, i32 %evl)
+ ret <vscale x 8 x i64> %v
+}
+
+define <vscale x 8 x i64> @vssubu_vi_nxv8i64_unmasked(<vscale x 8 x i64> %va, i32 zeroext %evl) {
+; CHECK-LABEL: vssubu_vi_nxv8i64_unmasked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: li a1, -1
+; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma
+; CHECK-NEXT: vssubu.vx v8, v8, a1
+; CHECK-NEXT: ret
+ %elt.head = insertelement <vscale x 8 x i64> poison, i64 -1, i32 0
+ %vb = shufflevector <vscale x 8 x i64> %elt.head, <vscale x 8 x i64> poison, <vscale x 8 x i32> zeroinitializer
+ %head = insertelement <vscale x 8 x i1> poison, i1 true, i32 0
+ %m = shufflevector <vscale x 8 x i1> %head, <vscale x 8 x i1> poison, <vscale x 8 x i32> zeroinitializer
+ %v = call <vscale x 8 x i64> @llvm.vp.usub.sat.nxv8i64(<vscale x 8 x i64> %va, <vscale x 8 x i64> %vb, <vscale x 8 x i1> %m, i32 %evl)
+ ret <vscale x 8 x i64> %v
+}
diff --git a/llvm/test/CodeGen/RISCV/select.ll b/llvm/test/CodeGen/RISCV/select.ll
index e01984b..e07e520 100644
--- a/llvm/test/CodeGen/RISCV/select.ll
+++ b/llvm/test/CodeGen/RISCV/select.ll
@@ -1606,23 +1606,255 @@ define i32 @select_cst_unknown(i32 signext %a, i32 signext %b) {
; RV64IMXVTCONDOPS-LABEL: select_cst_unknown:
; RV64IMXVTCONDOPS: # %bb.0:
; RV64IMXVTCONDOPS-NEXT: slt a0, a0, a1
-; RV64IMXVTCONDOPS-NEXT: li a1, -7
-; RV64IMXVTCONDOPS-NEXT: vt.maskcn a1, a1, a0
-; RV64IMXVTCONDOPS-NEXT: li a2, 5
-; RV64IMXVTCONDOPS-NEXT: vt.maskc a0, a2, a0
-; RV64IMXVTCONDOPS-NEXT: or a0, a0, a1
+; RV64IMXVTCONDOPS-NEXT: li a1, -12
+; RV64IMXVTCONDOPS-NEXT: vt.maskcn a0, a1, a0
+; RV64IMXVTCONDOPS-NEXT: addi a0, a0, 5
; RV64IMXVTCONDOPS-NEXT: ret
;
; CHECKZICOND-LABEL: select_cst_unknown:
; CHECKZICOND: # %bb.0:
; CHECKZICOND-NEXT: slt a0, a0, a1
-; CHECKZICOND-NEXT: li a1, -7
-; CHECKZICOND-NEXT: czero.nez a1, a1, a0
-; CHECKZICOND-NEXT: li a2, 5
-; CHECKZICOND-NEXT: czero.eqz a0, a2, a0
-; CHECKZICOND-NEXT: or a0, a0, a1
+; CHECKZICOND-NEXT: li a1, -12
+; CHECKZICOND-NEXT: czero.nez a0, a1, a0
+; CHECKZICOND-NEXT: addi a0, a0, 5
; CHECKZICOND-NEXT: ret
%cond = icmp slt i32 %a, %b
%ret = select i1 %cond, i32 5, i32 -7
ret i32 %ret
}
+
+define i32 @select_cst1(i1 zeroext %cond) {
+; RV32IM-LABEL: select_cst1:
+; RV32IM: # %bb.0:
+; RV32IM-NEXT: mv a1, a0
+; RV32IM-NEXT: li a0, 10
+; RV32IM-NEXT: bnez a1, .LBB43_2
+; RV32IM-NEXT: # %bb.1:
+; RV32IM-NEXT: li a0, 20
+; RV32IM-NEXT: .LBB43_2:
+; RV32IM-NEXT: ret
+;
+; RV64IM-LABEL: select_cst1:
+; RV64IM: # %bb.0:
+; RV64IM-NEXT: mv a1, a0
+; RV64IM-NEXT: li a0, 10
+; RV64IM-NEXT: bnez a1, .LBB43_2
+; RV64IM-NEXT: # %bb.1:
+; RV64IM-NEXT: li a0, 20
+; RV64IM-NEXT: .LBB43_2:
+; RV64IM-NEXT: ret
+;
+; RV64IMXVTCONDOPS-LABEL: select_cst1:
+; RV64IMXVTCONDOPS: # %bb.0:
+; RV64IMXVTCONDOPS-NEXT: li a1, 10
+; RV64IMXVTCONDOPS-NEXT: vt.maskcn a0, a1, a0
+; RV64IMXVTCONDOPS-NEXT: addi a0, a0, 10
+; RV64IMXVTCONDOPS-NEXT: ret
+;
+; CHECKZICOND-LABEL: select_cst1:
+; CHECKZICOND: # %bb.0:
+; CHECKZICOND-NEXT: li a1, 10
+; CHECKZICOND-NEXT: czero.nez a0, a1, a0
+; CHECKZICOND-NEXT: addi a0, a0, 10
+; CHECKZICOND-NEXT: ret
+ %ret = select i1 %cond, i32 10, i32 20
+ ret i32 %ret
+}
+
+define i32 @select_cst2(i1 zeroext %cond) {
+; RV32IM-LABEL: select_cst2:
+; RV32IM: # %bb.0:
+; RV32IM-NEXT: mv a1, a0
+; RV32IM-NEXT: li a0, 10
+; RV32IM-NEXT: bnez a1, .LBB44_2
+; RV32IM-NEXT: # %bb.1:
+; RV32IM-NEXT: lui a0, 5
+; RV32IM-NEXT: addi a0, a0, -480
+; RV32IM-NEXT: .LBB44_2:
+; RV32IM-NEXT: ret
+;
+; RV64IM-LABEL: select_cst2:
+; RV64IM: # %bb.0:
+; RV64IM-NEXT: mv a1, a0
+; RV64IM-NEXT: li a0, 10
+; RV64IM-NEXT: bnez a1, .LBB44_2
+; RV64IM-NEXT: # %bb.1:
+; RV64IM-NEXT: lui a0, 5
+; RV64IM-NEXT: addiw a0, a0, -480
+; RV64IM-NEXT: .LBB44_2:
+; RV64IM-NEXT: ret
+;
+; RV64IMXVTCONDOPS-LABEL: select_cst2:
+; RV64IMXVTCONDOPS: # %bb.0:
+; RV64IMXVTCONDOPS-NEXT: lui a1, 5
+; RV64IMXVTCONDOPS-NEXT: addiw a1, a1, -490
+; RV64IMXVTCONDOPS-NEXT: vt.maskcn a0, a1, a0
+; RV64IMXVTCONDOPS-NEXT: addi a0, a0, 10
+; RV64IMXVTCONDOPS-NEXT: ret
+;
+; RV32IMZICOND-LABEL: select_cst2:
+; RV32IMZICOND: # %bb.0:
+; RV32IMZICOND-NEXT: lui a1, 5
+; RV32IMZICOND-NEXT: addi a1, a1, -490
+; RV32IMZICOND-NEXT: czero.nez a0, a1, a0
+; RV32IMZICOND-NEXT: addi a0, a0, 10
+; RV32IMZICOND-NEXT: ret
+;
+; RV64IMZICOND-LABEL: select_cst2:
+; RV64IMZICOND: # %bb.0:
+; RV64IMZICOND-NEXT: lui a1, 5
+; RV64IMZICOND-NEXT: addiw a1, a1, -490
+; RV64IMZICOND-NEXT: czero.nez a0, a1, a0
+; RV64IMZICOND-NEXT: addi a0, a0, 10
+; RV64IMZICOND-NEXT: ret
+ %ret = select i1 %cond, i32 10, i32 20000
+ ret i32 %ret
+}
+
+define i32 @select_cst3(i1 zeroext %cond) {
+; RV32IM-LABEL: select_cst3:
+; RV32IM: # %bb.0:
+; RV32IM-NEXT: bnez a0, .LBB45_2
+; RV32IM-NEXT: # %bb.1:
+; RV32IM-NEXT: lui a0, 5
+; RV32IM-NEXT: addi a0, a0, -480
+; RV32IM-NEXT: ret
+; RV32IM-NEXT: .LBB45_2:
+; RV32IM-NEXT: lui a0, 7
+; RV32IM-NEXT: addi a0, a0, 1328
+; RV32IM-NEXT: ret
+;
+; RV64IM-LABEL: select_cst3:
+; RV64IM: # %bb.0:
+; RV64IM-NEXT: bnez a0, .LBB45_2
+; RV64IM-NEXT: # %bb.1:
+; RV64IM-NEXT: lui a0, 5
+; RV64IM-NEXT: addiw a0, a0, -480
+; RV64IM-NEXT: ret
+; RV64IM-NEXT: .LBB45_2:
+; RV64IM-NEXT: lui a0, 7
+; RV64IM-NEXT: addiw a0, a0, 1328
+; RV64IM-NEXT: ret
+;
+; RV64IMXVTCONDOPS-LABEL: select_cst3:
+; RV64IMXVTCONDOPS: # %bb.0:
+; RV64IMXVTCONDOPS-NEXT: lui a1, 1048574
+; RV64IMXVTCONDOPS-NEXT: addiw a1, a1, -1808
+; RV64IMXVTCONDOPS-NEXT: vt.maskcn a0, a1, a0
+; RV64IMXVTCONDOPS-NEXT: lui a1, 7
+; RV64IMXVTCONDOPS-NEXT: addiw a1, a1, 1328
+; RV64IMXVTCONDOPS-NEXT: add a0, a0, a1
+; RV64IMXVTCONDOPS-NEXT: ret
+;
+; RV32IMZICOND-LABEL: select_cst3:
+; RV32IMZICOND: # %bb.0:
+; RV32IMZICOND-NEXT: lui a1, 1048574
+; RV32IMZICOND-NEXT: addi a1, a1, -1808
+; RV32IMZICOND-NEXT: czero.nez a0, a1, a0
+; RV32IMZICOND-NEXT: lui a1, 7
+; RV32IMZICOND-NEXT: addi a1, a1, 1328
+; RV32IMZICOND-NEXT: add a0, a0, a1
+; RV32IMZICOND-NEXT: ret
+;
+; RV64IMZICOND-LABEL: select_cst3:
+; RV64IMZICOND: # %bb.0:
+; RV64IMZICOND-NEXT: lui a1, 1048574
+; RV64IMZICOND-NEXT: addiw a1, a1, -1808
+; RV64IMZICOND-NEXT: czero.nez a0, a1, a0
+; RV64IMZICOND-NEXT: lui a1, 7
+; RV64IMZICOND-NEXT: addiw a1, a1, 1328
+; RV64IMZICOND-NEXT: add a0, a0, a1
+; RV64IMZICOND-NEXT: ret
+ %ret = select i1 %cond, i32 30000, i32 20000
+ ret i32 %ret
+}
+
+define i32 @select_cst4(i1 zeroext %cond) {
+; CHECK-LABEL: select_cst4:
+; CHECK: # %bb.0:
+; CHECK-NEXT: neg a0, a0
+; CHECK-NEXT: xori a0, a0, 2047
+; CHECK-NEXT: ret
+ %ret = select i1 %cond, i32 -2048, i32 2047
+ ret i32 %ret
+}
+
+define i32 @select_cst5(i1 zeroext %cond) {
+; RV32IM-LABEL: select_cst5:
+; RV32IM: # %bb.0:
+; RV32IM-NEXT: mv a1, a0
+; RV32IM-NEXT: li a0, 2047
+; RV32IM-NEXT: bnez a1, .LBB47_2
+; RV32IM-NEXT: # %bb.1:
+; RV32IM-NEXT: lui a0, 1
+; RV32IM-NEXT: addi a0, a0, -2047
+; RV32IM-NEXT: .LBB47_2:
+; RV32IM-NEXT: ret
+;
+; RV64IM-LABEL: select_cst5:
+; RV64IM: # %bb.0:
+; RV64IM-NEXT: mv a1, a0
+; RV64IM-NEXT: li a0, 2047
+; RV64IM-NEXT: bnez a1, .LBB47_2
+; RV64IM-NEXT: # %bb.1:
+; RV64IM-NEXT: lui a0, 1
+; RV64IM-NEXT: addiw a0, a0, -2047
+; RV64IM-NEXT: .LBB47_2:
+; RV64IM-NEXT: ret
+;
+; RV64IMXVTCONDOPS-LABEL: select_cst5:
+; RV64IMXVTCONDOPS: # %bb.0:
+; RV64IMXVTCONDOPS-NEXT: li a1, 2
+; RV64IMXVTCONDOPS-NEXT: vt.maskcn a0, a1, a0
+; RV64IMXVTCONDOPS-NEXT: addi a0, a0, 2047
+; RV64IMXVTCONDOPS-NEXT: ret
+;
+; CHECKZICOND-LABEL: select_cst5:
+; CHECKZICOND: # %bb.0:
+; CHECKZICOND-NEXT: li a1, 2
+; CHECKZICOND-NEXT: czero.nez a0, a1, a0
+; CHECKZICOND-NEXT: addi a0, a0, 2047
+; CHECKZICOND-NEXT: ret
+ %ret = select i1 %cond, i32 2047, i32 2049
+ ret i32 %ret
+}
+
+define i32 @select_cst6(i1 zeroext %cond) {
+; RV32IM-LABEL: select_cst6:
+; RV32IM: # %bb.0:
+; RV32IM-NEXT: bnez a0, .LBB48_2
+; RV32IM-NEXT: # %bb.1:
+; RV32IM-NEXT: li a0, 2047
+; RV32IM-NEXT: ret
+; RV32IM-NEXT: .LBB48_2:
+; RV32IM-NEXT: lui a0, 1
+; RV32IM-NEXT: addi a0, a0, -2047
+; RV32IM-NEXT: ret
+;
+; RV64IM-LABEL: select_cst6:
+; RV64IM: # %bb.0:
+; RV64IM-NEXT: bnez a0, .LBB48_2
+; RV64IM-NEXT: # %bb.1:
+; RV64IM-NEXT: li a0, 2047
+; RV64IM-NEXT: ret
+; RV64IM-NEXT: .LBB48_2:
+; RV64IM-NEXT: lui a0, 1
+; RV64IM-NEXT: addiw a0, a0, -2047
+; RV64IM-NEXT: ret
+;
+; RV64IMXVTCONDOPS-LABEL: select_cst6:
+; RV64IMXVTCONDOPS: # %bb.0:
+; RV64IMXVTCONDOPS-NEXT: li a1, 2
+; RV64IMXVTCONDOPS-NEXT: vt.maskc a0, a1, a0
+; RV64IMXVTCONDOPS-NEXT: addi a0, a0, 2047
+; RV64IMXVTCONDOPS-NEXT: ret
+;
+; CHECKZICOND-LABEL: select_cst6:
+; CHECKZICOND: # %bb.0:
+; CHECKZICOND-NEXT: li a1, 2
+; CHECKZICOND-NEXT: czero.eqz a0, a1, a0
+; CHECKZICOND-NEXT: addi a0, a0, 2047
+; CHECKZICOND-NEXT: ret
+ %ret = select i1 %cond, i32 2049, i32 2047
+ ret i32 %ret
+}
diff --git a/llvm/test/CodeGen/RISCV/sext-zext-trunc.ll b/llvm/test/CodeGen/RISCV/sext-zext-trunc.ll
index a2a953c..87f2a63 100644
--- a/llvm/test/CodeGen/RISCV/sext-zext-trunc.ll
+++ b/llvm/test/CodeGen/RISCV/sext-zext-trunc.ll
@@ -871,3 +871,64 @@ define void @zext_nneg_dominating_icmp_i32_zeroext(i16 signext %0) {
5:
ret void
}
+
+; The load is used both extended and non-extended in the successor basic block.
+; The signed compare will cause the non-extended value to be exported out of the
+; first basic block using a sext to XLen. We need to CSE the zext nneg with the
+; sext so that we can form a sextload.
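+;
+; Illustrative sketch only (not part of the generated checks): once the zext
+; nneg and the sext are CSE'd, IR along the lines of
+;   %load = load i16, ptr %p
+;   %ext = sext i16 %load to i32
+; remains, and the extend folds into the load so it is selected as a single
+; sign-extending load (the `lh s0, 0(a0)` in the RV32I and RV64 checks below).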
+define void @load_zext_nneg_sext_cse(ptr %p) nounwind {
+; RV32I-LABEL: load_zext_nneg_sext_cse:
+; RV32I: # %bb.0:
+; RV32I-NEXT: addi sp, sp, -16
+; RV32I-NEXT: sw ra, 12(sp) # 4-byte Folded Spill
+; RV32I-NEXT: sw s0, 8(sp) # 4-byte Folded Spill
+; RV32I-NEXT: lh s0, 0(a0)
+; RV32I-NEXT: bltz s0, .LBB50_2
+; RV32I-NEXT: # %bb.1: # %bb1
+; RV32I-NEXT: mv a0, s0
+; RV32I-NEXT: call bar_i16
+; RV32I-NEXT: mv a0, s0
+; RV32I-NEXT: lw ra, 12(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s0, 8(sp) # 4-byte Folded Reload
+; RV32I-NEXT: addi sp, sp, 16
+; RV32I-NEXT: tail bar_i32
+; RV32I-NEXT: .LBB50_2: # %bb2
+; RV32I-NEXT: lw ra, 12(sp) # 4-byte Folded Reload
+; RV32I-NEXT: lw s0, 8(sp) # 4-byte Folded Reload
+; RV32I-NEXT: addi sp, sp, 16
+; RV32I-NEXT: ret
+;
+; RV64-LABEL: load_zext_nneg_sext_cse:
+; RV64: # %bb.0:
+; RV64-NEXT: addi sp, sp, -16
+; RV64-NEXT: sd ra, 8(sp) # 8-byte Folded Spill
+; RV64-NEXT: sd s0, 0(sp) # 8-byte Folded Spill
+; RV64-NEXT: lh s0, 0(a0)
+; RV64-NEXT: bltz s0, .LBB50_2
+; RV64-NEXT: # %bb.1: # %bb1
+; RV64-NEXT: mv a0, s0
+; RV64-NEXT: call bar_i16
+; RV64-NEXT: mv a0, s0
+; RV64-NEXT: ld ra, 8(sp) # 8-byte Folded Reload
+; RV64-NEXT: ld s0, 0(sp) # 8-byte Folded Reload
+; RV64-NEXT: addi sp, sp, 16
+; RV64-NEXT: tail bar_i32
+; RV64-NEXT: .LBB50_2: # %bb2
+; RV64-NEXT: ld ra, 8(sp) # 8-byte Folded Reload
+; RV64-NEXT: ld s0, 0(sp) # 8-byte Folded Reload
+; RV64-NEXT: addi sp, sp, 16
+; RV64-NEXT: ret
+ %load = load i16, ptr %p
+ %zext = zext nneg i16 %load to i32
+ %cmp = icmp sgt i16 %load, -1
+ br i1 %cmp, label %bb1, label %bb2
+
+bb1:
+ tail call void @bar_i16(i16 signext %load)
+ tail call void @bar_i32(i32 signext %zext)
+ br label %bb2
+
+bb2:
+ ret void
+}
+declare void @bar_i16(i16)
diff --git a/llvm/test/CodeGen/RISCV/split-offsets.ll b/llvm/test/CodeGen/RISCV/split-offsets.ll
index fc35bc4..8d065da 100644
--- a/llvm/test/CodeGen/RISCV/split-offsets.ll
+++ b/llvm/test/CodeGen/RISCV/split-offsets.ll
@@ -1,7 +1,7 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=riscv32 -verify-machineinstrs -riscv-enable-sink-fold < %s \
+; RUN: llc -mtriple=riscv32 -verify-machineinstrs < %s \
; RUN: | FileCheck %s -check-prefix=RV32I
-; RUN: llc -mtriple=riscv64 -verify-machineinstrs -riscv-enable-sink-fold < %s \
+; RUN: llc -mtriple=riscv64 -verify-machineinstrs < %s \
; RUN: | FileCheck %s -check-prefix=RV64I
; Check that memory accesses to array elements with large offsets have those
diff --git a/llvm/test/CodeGen/RISCV/srem-vector-lkk.ll b/llvm/test/CodeGen/RISCV/srem-vector-lkk.ll
index ec6e978..7fc4713 100644
--- a/llvm/test/CodeGen/RISCV/srem-vector-lkk.ll
+++ b/llvm/test/CodeGen/RISCV/srem-vector-lkk.ll
@@ -1,11 +1,11 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=riscv32 -verify-machineinstrs -riscv-enable-sink-fold < %s \
+; RUN: llc -mtriple=riscv32 -verify-machineinstrs < %s \
; RUN: | FileCheck -check-prefix=RV32I %s
-; RUN: llc -mtriple=riscv32 -mattr=+m -verify-machineinstrs -riscv-enable-sink-fold < %s \
+; RUN: llc -mtriple=riscv32 -mattr=+m -verify-machineinstrs < %s \
; RUN: | FileCheck -check-prefix=RV32IM %s
-; RUN: llc -mtriple=riscv64 -verify-machineinstrs -riscv-enable-sink-fold < %s \
+; RUN: llc -mtriple=riscv64 -verify-machineinstrs < %s \
; RUN: | FileCheck -check-prefix=RV64I %s
-; RUN: llc -mtriple=riscv64 -mattr=+m -verify-machineinstrs -riscv-enable-sink-fold < %s \
+; RUN: llc -mtriple=riscv64 -mattr=+m -verify-machineinstrs < %s \
; RUN: | FileCheck -check-prefix=RV64IM %s
define <4 x i16> @fold_srem_vec_1(<4 x i16> %x) nounwind {
diff --git a/llvm/test/CodeGen/RISCV/urem-vector-lkk.ll b/llvm/test/CodeGen/RISCV/urem-vector-lkk.ll
index eea8e64..540883fdc 100644
--- a/llvm/test/CodeGen/RISCV/urem-vector-lkk.ll
+++ b/llvm/test/CodeGen/RISCV/urem-vector-lkk.ll
@@ -1,11 +1,11 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=riscv32 -verify-machineinstrs -riscv-enable-sink-fold < %s \
+; RUN: llc -mtriple=riscv32 -verify-machineinstrs < %s \
; RUN: | FileCheck -check-prefixes=CHECK,RV32I %s
-; RUN: llc -mtriple=riscv32 -mattr=+m -verify-machineinstrs -riscv-enable-sink-fold < %s \
+; RUN: llc -mtriple=riscv32 -mattr=+m -verify-machineinstrs < %s \
; RUN: | FileCheck -check-prefixes=CHECK,RV32IM %s
-; RUN: llc -mtriple=riscv64 -verify-machineinstrs -riscv-enable-sink-fold < %s \
+; RUN: llc -mtriple=riscv64 -verify-machineinstrs < %s \
; RUN: | FileCheck -check-prefixes=CHECK,RV64I %s
-; RUN: llc -mtriple=riscv64 -mattr=+m -verify-machineinstrs -riscv-enable-sink-fold < %s \
+; RUN: llc -mtriple=riscv64 -mattr=+m -verify-machineinstrs < %s \
; RUN: | FileCheck -check-prefixes=CHECK,RV64IM %s
diff --git a/llvm/test/CodeGen/SPIRV/extensions/SPV_INTEL_usm_storage_classes/intel-usm-addrspaces.ll b/llvm/test/CodeGen/SPIRV/extensions/SPV_INTEL_usm_storage_classes/intel-usm-addrspaces.ll
new file mode 100644
index 0000000..30c1635
--- /dev/null
+++ b/llvm/test/CodeGen/SPIRV/extensions/SPV_INTEL_usm_storage_classes/intel-usm-addrspaces.ll
@@ -0,0 +1,84 @@
+; Modified from: https://github.com/KhronosGroup/SPIRV-LLVM-Translator/test/extensions/INTEL/SPV_INTEL_usm_storage_classes/intel_usm_addrspaces.ll
+
+; RUN: llc -O0 -mtriple=spirv32-unknown-unknown --spirv-extensions=SPV_INTEL_usm_storage_classes %s -o - | FileCheck %s --check-prefixes=CHECK-SPIRV,CHECK-SPIRV-EXT
+; TODO: %if spirv-tools %{ llc -O0 -mtriple=spirv64-unknown-unknown --spirv-extensions=SPV_INTEL_usm_storage_classes %s -o - -filetype=obj | spirv-val %}
+; RUN: llc -O0 -mtriple=spirv32-unknown-unknown %s -o - | FileCheck %s --check-prefixes=CHECK-SPIRV,CHECK-SPIRV-WITHOUT
+; TODO: %if spirv-tools %{ llc -O0 -mtriple=spirv64-unknown-unknown %s -o - -filetype=obj | spirv-val %}
+
+; CHECK-SPIRV-EXT: Capability USMStorageClassesINTEL
+; CHECK-SPIRV-WITHOUT-NOT: Capability USMStorageClassesINTEL
+; CHECK-SPIRV-EXT-DAG: %[[DevTy:[0-9]+]] = OpTypePointer DeviceOnlyINTEL %[[#]]
+; CHECK-SPIRV-EXT-DAG: %[[HostTy:[0-9]+]] = OpTypePointer HostOnlyINTEL %[[#]]
+; CHECK-SPIRV-DAG: %[[CrsWrkTy:[0-9]+]] = OpTypePointer CrossWorkgroup %[[#]]
+
+target datalayout = "e-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-n8:16:32:64"
+target triple = "spir64-unknown-unknown"
+
+define spir_kernel void @foo_kernel() {
+entry:
+ ret void
+}
+
+; CHECK-SPIRV: %[[Ptr1:[0-9]+]] = OpLoad %[[CrsWrkTy]] %[[#]]
+; CHECK-SPIRV-EXT: %[[CastedPtr1:[0-9]+]] = OpCrossWorkgroupCastToPtrINTEL %[[DevTy]] %[[Ptr1]]
+; CHECK-SPIRV-WITHOUT-NOT: OpCrossWorkgroupCastToPtrINTEL
+; CHECK-SPIRV-EXT: OpStore %[[#]] %[[CastedPtr1]]
+define spir_func void @test1(ptr addrspace(1) %arg_glob, ptr addrspace(5) %arg_dev) {
+entry:
+ %arg_glob.addr = alloca ptr addrspace(1), align 4
+ %arg_dev.addr = alloca ptr addrspace(5), align 4
+ store ptr addrspace(1) %arg_glob, ptr %arg_glob.addr, align 4
+ store ptr addrspace(5) %arg_dev, ptr %arg_dev.addr, align 4
+ %loaded_glob = load ptr addrspace(1), ptr %arg_glob.addr, align 4
+ %casted_ptr = addrspacecast ptr addrspace(1) %loaded_glob to ptr addrspace(5)
+ store ptr addrspace(5) %casted_ptr, ptr %arg_dev.addr, align 4
+ ret void
+}
+
+; CHECK-SPIRV: %[[Ptr2:[0-9]+]] = OpLoad %[[CrsWrkTy]] %[[#]]
+; CHECK-SPIRV-EXT: %[[CastedPtr2:[0-9]+]] = OpCrossWorkgroupCastToPtrINTEL %[[HostTy]] %[[Ptr2]]
+; CHECK-SPIRV-WITHOUT-NOT: OpCrossWorkgroupCastToPtrINTEL
+; CHECK-SPIRV-EXT: OpStore %[[#]] %[[CastedPtr2]]
+define spir_func void @test2(ptr addrspace(1) %arg_glob, ptr addrspace(6) %arg_host) {
+entry:
+ %arg_glob.addr = alloca ptr addrspace(1), align 4
+ %arg_host.addr = alloca ptr addrspace(6), align 4
+ store ptr addrspace(1) %arg_glob, ptr %arg_glob.addr, align 4
+ store ptr addrspace(6) %arg_host, ptr %arg_host.addr, align 4
+ %loaded_glob = load ptr addrspace(1), ptr %arg_glob.addr, align 4
+ %casted_ptr = addrspacecast ptr addrspace(1) %loaded_glob to ptr addrspace(6)
+ store ptr addrspace(6) %casted_ptr, ptr %arg_host.addr, align 4
+ ret void
+}
+
+; CHECK-SPIRV-EXT: %[[Ptr3:[0-9]+]] = OpLoad %[[DevTy]] %[[#]]
+; CHECK-SPIRV-EXT: %[[CastedPtr3:[0-9]+]] = OpPtrCastToCrossWorkgroupINTEL %[[CrsWrkTy]] %[[Ptr3]]
+; CHECK-SPIRV-WITHOUT-NOT: OpPtrCastToCrossWorkgroupINTEL
+; CHECK-SPIRV-EXT: OpStore %[[#]] %[[CastedPtr3]]
+define spir_func void @test3(ptr addrspace(1) %arg_glob, ptr addrspace(5) %arg_dev) {
+entry:
+ %arg_glob.addr = alloca ptr addrspace(1), align 4
+ %arg_dev.addr = alloca ptr addrspace(5), align 4
+ store ptr addrspace(1) %arg_glob, ptr %arg_glob.addr, align 4
+ store ptr addrspace(5) %arg_dev, ptr %arg_dev.addr, align 4
+ %loaded_dev = load ptr addrspace(5), ptr %arg_dev.addr, align 4
+ %casted_ptr = addrspacecast ptr addrspace(5) %loaded_dev to ptr addrspace(1)
+ store ptr addrspace(1) %casted_ptr, ptr %arg_glob.addr, align 4
+ ret void
+}
+
+; CHECK-SPIRV-EXT: %[[Ptr4:[0-9]+]] = OpLoad %[[HostTy]] %[[#]]
+; CHECK-SPIRV-EXT: %[[CastedPtr4:[0-9]+]] = OpPtrCastToCrossWorkgroupINTEL %[[CrsWrkTy]] %[[Ptr4]]
+; CHECK-SPIRV-WITHOUT-NOT: OpPtrCastToCrossWorkgroupINTEL
+; CHECK-SPIRV-EXT: OpStore %[[#]] %[[CastedPtr4]]
+define spir_func void @test4(ptr addrspace(1) %arg_glob, ptr addrspace(6) %arg_host) {
+entry:
+ %arg_glob.addr = alloca ptr addrspace(1), align 4
+ %arg_host.addr = alloca ptr addrspace(6), align 4
+ store ptr addrspace(1) %arg_glob, ptr %arg_glob.addr, align 4
+ store ptr addrspace(6) %arg_host, ptr %arg_host.addr, align 4
+ %loaded_host = load ptr addrspace(6), ptr %arg_host.addr, align 4
+ %casted_ptr = addrspacecast ptr addrspace(6) %loaded_host to ptr addrspace(1)
+ store ptr addrspace(1) %casted_ptr, ptr %arg_glob.addr, align 4
+ ret void
+}
diff --git a/llvm/test/CodeGen/SPIRV/extensions/SPV_KHR_subgroup_rotate/subgroup-rotate.ll b/llvm/test/CodeGen/SPIRV/extensions/SPV_KHR_subgroup_rotate/subgroup-rotate.ll
new file mode 100644
index 0000000..b1d6a09
--- /dev/null
+++ b/llvm/test/CodeGen/SPIRV/extensions/SPV_KHR_subgroup_rotate/subgroup-rotate.ll
@@ -0,0 +1,357 @@
+; RUN: not llc -O0 -mtriple=spirv32-unknown-unknown %s -o %t.spvt 2>&1 | FileCheck %s --check-prefix=CHECK-ERROR
+; RUN: llc -O0 -mtriple=spirv32-unknown-unknown --spirv-extensions=SPV_KHR_subgroup_rotate %s -o - | FileCheck %s
+; TODO: %if spirv-tools %{ llc -O0 -mtriple=spirv64-unknown-unknown --spirv-extensions=SPV_KHR_subgroup_rotate %s -o - -filetype=obj | spirv-val %}
+
+; CHECK-ERROR: LLVM ERROR: OpGroupNonUniformRotateKHR instruction requires the following SPIR-V extension: SPV_KHR_subgroup_rotate
+
+; CHECK: OpCapability GroupNonUniformRotateKHR
+; CHECK: OpExtension "SPV_KHR_subgroup_rotate"
+
+; CHECK-DAG: %[[TyInt8:.*]] = OpTypeInt 8 0
+; CHECK-DAG: %[[TyInt16:.*]] = OpTypeInt 16 0
+; CHECK-DAG: %[[TyInt32:.*]] = OpTypeInt 32 0
+; CHECK-DAG: %[[TyInt64:.*]] = OpTypeInt 64 0
+; CHECK-DAG: %[[TyFloat:.*]] = OpTypeFloat 32
+; CHECK-DAG: %[[TyHalf:.*]] = OpTypeFloat 16
+; CHECK-DAG: %[[TyDouble:.*]] = OpTypeFloat 64
+; CHECK-DAG: %[[ScopeSubgroup:.*]] = OpConstant %[[TyInt32]] 3
+; CHECK-DAG: %[[ConstInt2:.*]] = OpConstant %[[TyInt32]] 2
+; CHECK-DAG: %[[ConstInt4:.*]] = OpConstant %[[TyInt32]] 4
+
+target datalayout = "e-p:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024"
+target triple = "spir"
+
+; Function Attrs: convergent noinline norecurse nounwind optnone
+define dso_local spir_kernel void @testRotateChar(ptr addrspace(1) noundef align 1 %dst) #0 !kernel_arg_addr_space !3 !kernel_arg_access_qual !4 !kernel_arg_type !5 !kernel_arg_base_type !5 !kernel_arg_type_qual !6 {
+entry:
+ %dst.addr = alloca ptr addrspace(1), align 4
+ %v = alloca i8, align 1
+ store ptr addrspace(1) %dst, ptr %dst.addr, align 4
+ store i8 0, ptr %v, align 1
+ %value = load i8, ptr %v, align 1
+; CHECK: OpGroupNonUniformRotateKHR %[[TyInt8]] %[[ScopeSubgroup]] %[[#]] %[[ConstInt2]]
+ %call = call spir_func signext i8 @_Z16sub_group_rotateci(i8 noundef signext %value, i32 noundef 2) #2
+ %data = load ptr addrspace(1), ptr %dst.addr, align 4
+ %arrayidx = getelementptr inbounds i8, ptr addrspace(1) %data, i32 0
+ store i8 %call, ptr addrspace(1) %arrayidx, align 1
+ %value_clustered = load i8, ptr %v, align 1
+; CHECK: OpGroupNonUniformRotateKHR %[[TyInt8]] %[[ScopeSubgroup]] %[[#]] %[[ConstInt2]] %[[ConstInt4]]
+ %call1 = call spir_func signext i8 @_Z26sub_group_clustered_rotatecij(i8 noundef signext %value_clustered, i32 noundef 2, i32 noundef 4) #2
+ %data2 = load ptr addrspace(1), ptr %dst.addr, align 4
+ %arrayidx2 = getelementptr inbounds i8, ptr addrspace(1) %data2, i32 1
+ store i8 %call1, ptr addrspace(1) %arrayidx2, align 1
+ ret void
+}
+
+; Function Attrs: convergent nounwind
+declare spir_func signext i8 @_Z16sub_group_rotateci(i8 noundef signext, i32 noundef) #1
+
+; Function Attrs: convergent nounwind
+declare spir_func signext i8 @_Z26sub_group_clustered_rotatecij(i8 noundef signext, i32 noundef, i32 noundef) #1
+
+; Function Attrs: convergent noinline norecurse nounwind optnone
+define dso_local spir_kernel void @testRotateUChar(ptr addrspace(1) noundef align 1 %dst) #0 !kernel_arg_addr_space !3 !kernel_arg_access_qual !4 !kernel_arg_type !7 !kernel_arg_base_type !7 !kernel_arg_type_qual !6 {
+entry:
+ %dst.addr = alloca ptr addrspace(1), align 4
+ %v = alloca i8, align 1
+ store ptr addrspace(1) %dst, ptr %dst.addr, align 4
+ store i8 0, ptr %v, align 1
+ %value = load i8, ptr %v, align 1
+; CHECK: OpGroupNonUniformRotateKHR %[[TyInt8]] %[[ScopeSubgroup]] %[[#]] %[[ConstInt2]]
+ %call = call spir_func zeroext i8 @_Z16sub_group_rotatehi(i8 noundef zeroext %value, i32 noundef 2) #2
+ %data = load ptr addrspace(1), ptr %dst.addr, align 4
+ %arrayidx = getelementptr inbounds i8, ptr addrspace(1) %data, i32 0
+ store i8 %call, ptr addrspace(1) %arrayidx, align 1
+ %value_clustered = load i8, ptr %v, align 1
+; CHECK: OpGroupNonUniformRotateKHR %[[TyInt8]] %[[ScopeSubgroup]] %[[#]] %[[ConstInt2]] %[[ConstInt4]]
+ %call1 = call spir_func zeroext i8 @_Z26sub_group_clustered_rotatehij(i8 noundef zeroext %value_clustered, i32 noundef 2, i32 noundef 4) #2
+ %data2 = load ptr addrspace(1), ptr %dst.addr, align 4
+ %arrayidx2 = getelementptr inbounds i8, ptr addrspace(1) %data2, i32 1
+ store i8 %call1, ptr addrspace(1) %arrayidx2, align 1
+ ret void
+}
+
+; Function Attrs: convergent nounwind
+declare spir_func zeroext i8 @_Z16sub_group_rotatehi(i8 noundef zeroext, i32 noundef) #1
+
+; Function Attrs: convergent nounwind
+declare spir_func zeroext i8 @_Z26sub_group_clustered_rotatehij(i8 noundef zeroext, i32 noundef, i32 noundef) #1
+
+; Function Attrs: convergent noinline norecurse nounwind optnone
+define dso_local spir_kernel void @testRotateShort(ptr addrspace(1) noundef align 2 %dst) #0 !kernel_arg_addr_space !3 !kernel_arg_access_qual !4 !kernel_arg_type !8 !kernel_arg_base_type !8 !kernel_arg_type_qual !6 {
+entry:
+ %dst.addr = alloca ptr addrspace(1), align 4
+ %v = alloca i16, align 2
+ store ptr addrspace(1) %dst, ptr %dst.addr, align 4
+ store i16 0, ptr %v, align 2
+ %value = load i16, ptr %v, align 2
+ ; CHECK: OpGroupNonUniformRotateKHR %[[TyInt16]] %[[ScopeSubgroup]] %[[#]] %[[ConstInt2]]
+ %call = call spir_func signext i16 @_Z16sub_group_rotatesi(i16 noundef signext %value, i32 noundef 2) #2
+ %data = load ptr addrspace(1), ptr %dst.addr, align 4
+ %arrayidx = getelementptr inbounds i16, ptr addrspace(1) %data, i32 0
+ store i16 %call, ptr addrspace(1) %arrayidx, align 2
+ %value_clustered = load i16, ptr %v, align 2
+ ; CHECK: OpGroupNonUniformRotateKHR %[[TyInt16]] %[[ScopeSubgroup]] %[[#]] %[[ConstInt2]] %[[ConstInt4]]
+ %call1 = call spir_func signext i16 @_Z26sub_group_clustered_rotatesij(i16 noundef signext %value_clustered, i32 noundef 2, i32 noundef 4) #2
+ %data2 = load ptr addrspace(1), ptr %dst.addr, align 4
+ %arrayidx2 = getelementptr inbounds i16, ptr addrspace(1) %data2, i32 1
+ store i16 %call1, ptr addrspace(1) %arrayidx2, align 2
+ ret void
+}
+
+; Function Attrs: convergent nounwind
+declare spir_func signext i16 @_Z16sub_group_rotatesi(i16 noundef signext, i32 noundef) #1
+
+; Function Attrs: convergent nounwind
+declare spir_func signext i16 @_Z26sub_group_clustered_rotatesij(i16 noundef signext, i32 noundef, i32 noundef) #1
+
+; Function Attrs: convergent noinline norecurse nounwind optnone
+define dso_local spir_kernel void @testRotateUShort(ptr addrspace(1) noundef align 2 %dst) #0 !kernel_arg_addr_space !3 !kernel_arg_access_qual !4 !kernel_arg_type !9 !kernel_arg_base_type !9 !kernel_arg_type_qual !6 {
+entry:
+ %dst.addr = alloca ptr addrspace(1), align 4
+ %v = alloca i16, align 2
+ store ptr addrspace(1) %dst, ptr %dst.addr, align 4
+ store i16 0, ptr %v, align 2
+ %value = load i16, ptr %v, align 2
+ ; CHECK: OpGroupNonUniformRotateKHR %[[TyInt16]] %[[ScopeSubgroup]] %[[#]] %[[ConstInt2]]
+ %call = call spir_func zeroext i16 @_Z16sub_group_rotateti(i16 noundef zeroext %value, i32 noundef 2) #2
+ %data = load ptr addrspace(1), ptr %dst.addr, align 4
+ %arrayidx = getelementptr inbounds i16, ptr addrspace(1) %data, i32 0
+ store i16 %call, ptr addrspace(1) %arrayidx, align 2
+ %value_clustered = load i16, ptr %v, align 2
+ ; CHECK: OpGroupNonUniformRotateKHR %[[TyInt16]] %[[ScopeSubgroup]] %[[#]] %[[ConstInt2]] %[[ConstInt4]]
+ %call1 = call spir_func zeroext i16 @_Z26sub_group_clustered_rotatetij(i16 noundef zeroext %value_clustered, i32 noundef 2, i32 noundef 4) #2
+ %data2 = load ptr addrspace(1), ptr %dst.addr, align 4
+ %arrayidx2 = getelementptr inbounds i16, ptr addrspace(1) %data2, i32 1
+ store i16 %call1, ptr addrspace(1) %arrayidx2, align 2
+ ret void
+}
+
+; Function Attrs: convergent nounwind
+declare spir_func zeroext i16 @_Z16sub_group_rotateti(i16 noundef zeroext, i32 noundef) #1
+
+; Function Attrs: convergent nounwind
+declare spir_func zeroext i16 @_Z26sub_group_clustered_rotatetij(i16 noundef zeroext, i32 noundef, i32 noundef) #1
+
+; Function Attrs: convergent noinline norecurse nounwind optnone
+define dso_local spir_kernel void @testRotateInt(ptr addrspace(1) noundef align 4 %dst) #0 !kernel_arg_addr_space !3 !kernel_arg_access_qual !4 !kernel_arg_type !10 !kernel_arg_base_type !10 !kernel_arg_type_qual !6 {
+entry:
+ %dst.addr = alloca ptr addrspace(1), align 4
+ %v = alloca i32, align 4
+ store ptr addrspace(1) %dst, ptr %dst.addr, align 4
+ store i32 0, ptr %v, align 4
+ %value = load i32, ptr %v, align 4
+ ; CHECK: OpGroupNonUniformRotateKHR %[[TyInt32]] %[[ScopeSubgroup]] %[[#]] %[[ConstInt2]]
+ %call = call spir_func i32 @_Z16sub_group_rotateii(i32 noundef %value, i32 noundef 2) #2
+ %data = load ptr addrspace(1), ptr %dst.addr, align 4
+ %arrayidx = getelementptr inbounds i32, ptr addrspace(1) %data, i32 0
+ store i32 %call, ptr addrspace(1) %arrayidx, align 4
+ %value_clustered = load i32, ptr %v, align 4
+ ; CHECK: OpGroupNonUniformRotateKHR %[[TyInt32]] %[[ScopeSubgroup]] %[[#]] %[[ConstInt2]] %[[ConstInt4]]
+ %call1 = call spir_func i32 @_Z26sub_group_clustered_rotateiij(i32 noundef %value_clustered, i32 noundef 2, i32 noundef 4) #2
+ %data2 = load ptr addrspace(1), ptr %dst.addr, align 4
+ %arrayidx2 = getelementptr inbounds i32, ptr addrspace(1) %data2, i32 1
+ store i32 %call1, ptr addrspace(1) %arrayidx2, align 4
+ ret void
+}
+
+; Function Attrs: convergent nounwind
+declare spir_func i32 @_Z16sub_group_rotateii(i32 noundef, i32 noundef) #1
+
+; Function Attrs: convergent nounwind
+declare spir_func i32 @_Z26sub_group_clustered_rotateiij(i32 noundef, i32 noundef, i32 noundef) #1
+
+; Function Attrs: convergent noinline norecurse nounwind optnone
+define dso_local spir_kernel void @testRotateUInt(ptr addrspace(1) noundef align 4 %dst) #0 !kernel_arg_addr_space !3 !kernel_arg_access_qual !4 !kernel_arg_type !11 !kernel_arg_base_type !11 !kernel_arg_type_qual !6 {
+entry:
+ %dst.addr = alloca ptr addrspace(1), align 4
+ %v = alloca i32, align 4
+ store ptr addrspace(1) %dst, ptr %dst.addr, align 4
+ store i32 0, ptr %v, align 4
+ %value = load i32, ptr %v, align 4
+ ; CHECK: OpGroupNonUniformRotateKHR %[[TyInt32]] %[[ScopeSubgroup]] %[[#]] %[[ConstInt2]]
+ %call = call spir_func i32 @_Z16sub_group_rotateji(i32 noundef %value, i32 noundef 2) #2
+ %data = load ptr addrspace(1), ptr %dst.addr, align 4
+ %arrayidx = getelementptr inbounds i32, ptr addrspace(1) %data, i32 0
+ store i32 %call, ptr addrspace(1) %arrayidx, align 4
+ %value_clustered = load i32, ptr %v, align 4
+ ; CHECK: OpGroupNonUniformRotateKHR %[[TyInt32]] %[[ScopeSubgroup]] %[[#]] %[[ConstInt2]] %[[ConstInt4]]
+ %call1 = call spir_func i32 @_Z26sub_group_clustered_rotatejij(i32 noundef %value_clustered, i32 noundef 2, i32 noundef 4) #2
+ %data2 = load ptr addrspace(1), ptr %dst.addr, align 4
+ %arrayidx2 = getelementptr inbounds i32, ptr addrspace(1) %data2, i32 1
+ store i32 %call1, ptr addrspace(1) %arrayidx2, align 4
+ ret void
+}
+
+; Function Attrs: convergent nounwind
+declare spir_func i32 @_Z16sub_group_rotateji(i32 noundef, i32 noundef) #1
+
+; Function Attrs: convergent nounwind
+declare spir_func i32 @_Z26sub_group_clustered_rotatejij(i32 noundef, i32 noundef, i32 noundef) #1
+
+; Function Attrs: convergent noinline norecurse nounwind optnone
+define dso_local spir_kernel void @testRotateLong(ptr addrspace(1) noundef align 8 %dst) #0 !kernel_arg_addr_space !3 !kernel_arg_access_qual !4 !kernel_arg_type !12 !kernel_arg_base_type !12 !kernel_arg_type_qual !6 {
+entry:
+ %dst.addr = alloca ptr addrspace(1), align 4
+ %v = alloca i64, align 8
+ store ptr addrspace(1) %dst, ptr %dst.addr, align 4
+ store i64 0, ptr %v, align 8
+ %value = load i64, ptr %v, align 8
+ ; CHECK: OpGroupNonUniformRotateKHR %[[TyInt64]] %[[ScopeSubgroup]] %[[#]] %[[ConstInt2]]
+ %call = call spir_func i64 @_Z16sub_group_rotateli(i64 noundef %value, i32 noundef 2) #2
+ %data = load ptr addrspace(1), ptr %dst.addr, align 4
+ %arrayidx = getelementptr inbounds i64, ptr addrspace(1) %data, i32 0
+ store i64 %call, ptr addrspace(1) %arrayidx, align 8
+ %value_clustered = load i64, ptr %v, align 8
+ ; CHECK: OpGroupNonUniformRotateKHR %[[TyInt64]] %[[ScopeSubgroup]] %[[#]] %[[ConstInt2]] %[[ConstInt4]]
+ %call1 = call spir_func i64 @_Z26sub_group_clustered_rotatelij(i64 noundef %value_clustered, i32 noundef 2, i32 noundef 4) #2
+ %data2 = load ptr addrspace(1), ptr %dst.addr, align 4
+ %arrayidx2 = getelementptr inbounds i64, ptr addrspace(1) %data2, i32 1
+ store i64 %call1, ptr addrspace(1) %arrayidx2, align 8
+ ret void
+}
+
+; Function Attrs: convergent nounwind
+declare spir_func i64 @_Z16sub_group_rotateli(i64 noundef, i32 noundef) #1
+
+; Function Attrs: convergent nounwind
+declare spir_func i64 @_Z26sub_group_clustered_rotatelij(i64 noundef, i32 noundef, i32 noundef) #1
+
+; Function Attrs: convergent noinline norecurse nounwind optnone
+define dso_local spir_kernel void @testRotateULong(ptr addrspace(1) noundef align 8 %dst) #0 !kernel_arg_addr_space !3 !kernel_arg_access_qual !4 !kernel_arg_type !13 !kernel_arg_base_type !13 !kernel_arg_type_qual !6 {
+entry:
+ %dst.addr = alloca ptr addrspace(1), align 4
+ %v = alloca i64, align 8
+ store ptr addrspace(1) %dst, ptr %dst.addr, align 4
+ store i64 0, ptr %v, align 8
+ %value = load i64, ptr %v, align 8
+ ; CHECK: OpGroupNonUniformRotateKHR %[[TyInt64]] %[[ScopeSubgroup]] %[[#]] %[[ConstInt2]]
+ %call = call spir_func i64 @_Z16sub_group_rotatemi(i64 noundef %value, i32 noundef 2) #2
+ %data = load ptr addrspace(1), ptr %dst.addr, align 4
+ %arrayidx = getelementptr inbounds i64, ptr addrspace(1) %data, i32 0
+ store i64 %call, ptr addrspace(1) %arrayidx, align 8
+ %value_clustered = load i64, ptr %v, align 8
+ ; CHECK: OpGroupNonUniformRotateKHR %[[TyInt64]] %[[ScopeSubgroup]] %[[#]] %[[ConstInt2]] %[[ConstInt4]]
+ %call1 = call spir_func i64 @_Z26sub_group_clustered_rotatemij(i64 noundef %value_clustered, i32 noundef 2, i32 noundef 4) #2
+ %data2 = load ptr addrspace(1), ptr %dst.addr, align 4
+ %arrayidx2 = getelementptr inbounds i64, ptr addrspace(1) %data2, i32 1
+ store i64 %call1, ptr addrspace(1) %arrayidx2, align 8
+ ret void
+}
+
+; Function Attrs: convergent nounwind
+declare spir_func i64 @_Z16sub_group_rotatemi(i64 noundef, i32 noundef) #1
+
+; Function Attrs: convergent nounwind
+declare spir_func i64 @_Z26sub_group_clustered_rotatemij(i64 noundef, i32 noundef, i32 noundef) #1
+
+; Function Attrs: convergent noinline norecurse nounwind optnone
+define dso_local spir_kernel void @testRotateFloat(ptr addrspace(1) noundef align 4 %dst) #0 !kernel_arg_addr_space !3 !kernel_arg_access_qual !4 !kernel_arg_type !14 !kernel_arg_base_type !14 !kernel_arg_type_qual !6 {
+entry:
+ %dst.addr = alloca ptr addrspace(1), align 4
+ %v = alloca float, align 4
+ store ptr addrspace(1) %dst, ptr %dst.addr, align 4
+ store float 0.000000e+00, ptr %v, align 4
+ %value = load float, ptr %v, align 4
+ ; CHECK: OpGroupNonUniformRotateKHR %[[TyFloat]] %[[ScopeSubgroup]] %[[#]] %[[ConstInt2]]
+ %call = call spir_func float @_Z16sub_group_rotatefi(float noundef %value, i32 noundef 2) #2
+ %data = load ptr addrspace(1), ptr %dst.addr, align 4
+ %arrayidx = getelementptr inbounds float, ptr addrspace(1) %data, i32 0
+ store float %call, ptr addrspace(1) %arrayidx, align 4
+ %value_clustered = load float, ptr %v, align 4
+ ; CHECK: OpGroupNonUniformRotateKHR %[[TyFloat]] %[[ScopeSubgroup]] %[[#]] %[[ConstInt2]] %[[ConstInt4]]
+ %call1 = call spir_func float @_Z26sub_group_clustered_rotatefij(float noundef %value_clustered, i32 noundef 2, i32 noundef 4) #2
+ %data2 = load ptr addrspace(1), ptr %dst.addr, align 4
+ %arrayidx2 = getelementptr inbounds float, ptr addrspace(1) %data2, i32 1
+ store float %call1, ptr addrspace(1) %arrayidx2, align 4
+ ret void
+}
+
+; Function Attrs: convergent nounwind
+declare spir_func float @_Z16sub_group_rotatefi(float noundef, i32 noundef) #1
+
+; Function Attrs: convergent nounwind
+declare spir_func float @_Z26sub_group_clustered_rotatefij(float noundef, i32 noundef, i32 noundef) #1
+
+; Function Attrs: convergent noinline norecurse nounwind optnone
+define dso_local spir_kernel void @testRotateHalf(ptr addrspace(1) noundef align 2 %dst) #0 !kernel_arg_addr_space !3 !kernel_arg_access_qual !4 !kernel_arg_type !15 !kernel_arg_base_type !15 !kernel_arg_type_qual !6 {
+entry:
+ %dst.addr = alloca ptr addrspace(1), align 4
+ %v = alloca half, align 2
+ store ptr addrspace(1) %dst, ptr %dst.addr, align 4
+ store half 0xH0000, ptr %v, align 2
+ %value = load half, ptr %v, align 2
+ ; CHECK: OpGroupNonUniformRotateKHR %[[TyHalf]] %[[ScopeSubgroup]] %[[#]] %[[ConstInt2]]
+ %call = call spir_func half @_Z16sub_group_rotateDhi(half noundef %value, i32 noundef 2) #2
+ %data = load ptr addrspace(1), ptr %dst.addr, align 4
+ %arrayidx = getelementptr inbounds half, ptr addrspace(1) %data, i32 0
+ store half %call, ptr addrspace(1) %arrayidx, align 2
+ %value_clustered = load half, ptr %v, align 2
+ ; CHECK: OpGroupNonUniformRotateKHR %[[TyHalf]] %[[ScopeSubgroup]] %[[#]] %[[ConstInt2]] %[[ConstInt4]]
+ %call1 = call spir_func half @_Z26sub_group_clustered_rotateDhij(half noundef %value_clustered, i32 noundef 2, i32 noundef 4) #2
+ %data2 = load ptr addrspace(1), ptr %dst.addr, align 4
+ %arrayidx2 = getelementptr inbounds half, ptr addrspace(1) %data2, i32 1
+ store half %call1, ptr addrspace(1) %arrayidx2, align 2
+ ret void
+}
+
+; Function Attrs: convergent nounwind
+declare spir_func half @_Z16sub_group_rotateDhi(half noundef, i32 noundef) #1
+
+; Function Attrs: convergent nounwind
+declare spir_func half @_Z26sub_group_clustered_rotateDhij(half noundef, i32 noundef, i32 noundef) #1
+
+; Function Attrs: convergent noinline norecurse nounwind optnone
+define dso_local spir_kernel void @testRotateDouble(ptr addrspace(1) noundef align 8 %dst) #0 !kernel_arg_addr_space !3 !kernel_arg_access_qual !4 !kernel_arg_type !16 !kernel_arg_base_type !16 !kernel_arg_type_qual !6 {
+entry:
+ %dst.addr = alloca ptr addrspace(1), align 4
+ %v = alloca double, align 8
+ store ptr addrspace(1) %dst, ptr %dst.addr, align 4
+ store double 0.000000e+00, ptr %v, align 8
+ %value = load double, ptr %v, align 8
+ ; CHECK: OpGroupNonUniformRotateKHR %[[TyDouble]] %[[ScopeSubgroup]] %[[#]] %[[ConstInt2]]
+ %call = call spir_func double @_Z16sub_group_rotatedi(double noundef %value, i32 noundef 2) #2
+ %data = load ptr addrspace(1), ptr %dst.addr, align 4
+ %arrayidx = getelementptr inbounds double, ptr addrspace(1) %data, i32 0
+ store double %call, ptr addrspace(1) %arrayidx, align 8
+ %value_clustered = load double, ptr %v, align 8
+ ; CHECK: OpGroupNonUniformRotateKHR %[[TyDouble]] %[[ScopeSubgroup]] %[[#]] %[[ConstInt2]] %[[ConstInt4]]
+ %call1 = call spir_func double @_Z26sub_group_clustered_rotatedij(double noundef %value_clustered, i32 noundef 2, i32 noundef 4) #2
+ %data2 = load ptr addrspace(1), ptr %dst.addr, align 4
+ %arrayidx2 = getelementptr inbounds double, ptr addrspace(1) %data2, i32 1
+ store double %call1, ptr addrspace(1) %arrayidx2, align 8
+ ret void
+}
+
+; Function Attrs: convergent nounwind
+declare spir_func double @_Z16sub_group_rotatedi(double noundef, i32 noundef) #1
+
+; Function Attrs: convergent nounwind
+declare spir_func double @_Z26sub_group_clustered_rotatedij(double noundef, i32 noundef, i32 noundef) #1
+
+attributes #0 = { convergent noinline norecurse nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "uniform-work-group-size"="false" }
+attributes #1 = { convergent nounwind "no-trapping-math"="true" "stack-protector-buffer-size"="8" }
+attributes #2 = { convergent nounwind }
+
+!llvm.module.flags = !{!0}
+!opencl.ocl.version = !{!1}
+!opencl.spir.version = !{!1}
+!llvm.ident = !{!2}
+
+!0 = !{i32 1, !"wchar_size", i32 4}
+!1 = !{i32 2, i32 0}
+!2 = !{!"clang version 19.0.0"}
+!3 = !{i32 1}
+!4 = !{!"none"}
+!5 = !{!"char*"}
+!6 = !{!""}
+!7 = !{!"uchar*"}
+!8 = !{!"short*"}
+!9 = !{!"ushort*"}
+!10 = !{!"int*"}
+!11 = !{!"uint*"}
+!12 = !{!"long*"}
+!13 = !{!"ulong*"}
+!14 = !{!"float*"}
+!15 = !{!"half*"}
+!16 = !{!"double*"}
diff --git a/llvm/test/CodeGen/SPIRV/switch-no-jump-table.ll b/llvm/test/CodeGen/SPIRV/switch-no-jump-table.ll
new file mode 100644
index 0000000..c9c0f17
--- /dev/null
+++ b/llvm/test/CodeGen/SPIRV/switch-no-jump-table.ll
@@ -0,0 +1,30 @@
+; This test checks that jump tables are not generated for the switch instruction
+
+; RUN: llc -O0 -mtriple=spirv32-unknown-unknown %s -o - | FileCheck %s
+; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv64-unknown-unknown %s -o - -filetype=obj | spirv-val %}
+
+; CHECK: OpSwitch %[[#]] %[[Label:]]
+; CHECK-COUNT-4: OpBranch %[[Label]]
+
+target datalayout = "e-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-n8:16:32:64"
+target triple = "spir64-unknown-unknown"
+
+define spir_func void @foo(i32 noundef %val) {
+entry:
+ switch i32 %val, label %sw.epilog [
+ i32 0, label %sw.bb
+ i32 1, label %sw.bb2
+ i32 2, label %sw.bb3
+ i32 3, label %sw.bb4
+ ]
+sw.bb:
+ br label %sw.epilog
+sw.bb2:
+ br label %sw.epilog
+sw.bb3:
+ br label %sw.epilog
+sw.bb4:
+ br label %sw.epilog
+sw.epilog:
+ ret void
+}
diff --git a/llvm/test/CodeGen/SystemZ/fp-strict-conv-17.ll b/llvm/test/CodeGen/SystemZ/fp-strict-conv-17.ll
index eb0ff4b..3ff6324 100644
--- a/llvm/test/CodeGen/SystemZ/fp-strict-conv-17.ll
+++ b/llvm/test/CodeGen/SystemZ/fp-strict-conv-17.ll
@@ -20,7 +20,7 @@ declare i128 @llvm.experimental.constrained.fptoui.i128.f64(double, metadata)
declare i128 @llvm.experimental.constrained.fptoui.i128.f32(float, metadata)
; Test signed i128->f128.
-define fp128 @f1(i128 %i) {
+define fp128 @f1(i128 %i) #0 {
; CHECK-LABEL: f1:
; CHECK: brasl %r14, __floattitf@PLT
; CHECK: br %r14
@@ -31,7 +31,7 @@ define fp128 @f1(i128 %i) {
}
; Test signed i128->f64.
-define double @f2(i128 %i) {
+define double @f2(i128 %i) #0 {
; CHECK-LABEL: f2:
; CHECK: brasl %r14, __floattidf@PLT
; CHECK: br %r14
@@ -42,7 +42,7 @@ define double @f2(i128 %i) {
}
; Test signed i128->f32.
-define float @f3(i128 %i) {
+define float @f3(i128 %i) #0 {
; CHECK-LABEL: f3:
; CHECK: brasl %r14, __floattisf@PLT
; CHECK: br %r14
@@ -53,7 +53,7 @@ define float @f3(i128 %i) {
}
; Test unsigned i128->f128.
-define fp128 @f4(i128 %i) {
+define fp128 @f4(i128 %i) #0 {
; CHECK-LABEL: f4:
; CHECK: brasl %r14, __floatuntitf@PLT
; CHECK: br %r14
@@ -64,7 +64,7 @@ define fp128 @f4(i128 %i) {
}
; Test unsigned i128->f64.
-define double @f5(i128 %i) {
+define double @f5(i128 %i) #0 {
; CHECK-LABEL: f5:
; CHECK: brasl %r14, __floatuntidf@PLT
; CHECK: br %r14
@@ -75,7 +75,7 @@ define double @f5(i128 %i) {
}
; Test unsigned i128->f32.
-define float @f6(i128 %i) {
+define float @f6(i128 %i) #0 {
; CHECK-LABEL: f6:
; CHECK: brasl %r14, __floatuntisf@PLT
; CHECK: br %r14
@@ -86,7 +86,7 @@ define float @f6(i128 %i) {
}
; Test signed f128->i128.
-define i128 @f7(fp128 %f) {
+define i128 @f7(fp128 %f) #0 {
; CHECK-LABEL: f7:
; CHECK: brasl %r14, __fixtfti@PLT
; CHECK: br %r14
@@ -96,7 +96,7 @@ define i128 @f7(fp128 %f) {
}
; Test signed f64->i128.
-define i128 @f8(double %f) {
+define i128 @f8(double %f) #0 {
; CHECK-LABEL: f8:
; CHECK: brasl %r14, __fixdfti@PLT
; CHECK: br %r14
@@ -106,7 +106,7 @@ define i128 @f8(double %f) {
}
; Test signed f32->i128.
-define i128 @f9(float %f) {
+define i128 @f9(float %f) #0 {
; CHECK-LABEL: f9:
; CHECK: brasl %r14, __fixsfti@PLT
; CHECK: br %r14
@@ -116,7 +116,7 @@ define i128 @f9(float %f) {
}
; Test unsigned f128->i128.
-define i128 @f10(fp128 %f) {
+define i128 @f10(fp128 %f) #0 {
; CHECK-LABEL: f10:
; CHECK: brasl %r14, __fixunstfti@PLT
; CHECK: br %r14
@@ -126,7 +126,7 @@ define i128 @f10(fp128 %f) {
}
; Test unsigned f64->i128.
-define i128 @f11(double %f) {
+define i128 @f11(double %f) #0 {
; CHECK-LABEL: f11:
; CHECK: brasl %r14, __fixunsdfti@PLT
; CHECK: br %r14
@@ -136,7 +136,7 @@ define i128 @f11(double %f) {
}
; Test unsigned f32->i128.
-define i128 @f12(float %f) {
+define i128 @f12(float %f) #0 {
; CHECK-LABEL: f12:
; CHECK: brasl %r14, __fixunssfti@PLT
; CHECK: br %r14
diff --git a/llvm/test/CodeGen/WebAssembly/lower-em-ehsjlj-multi-return.ll b/llvm/test/CodeGen/WebAssembly/lower-em-ehsjlj-multi-return.ll
index 4f33439..daf46c6 100644
--- a/llvm/test/CodeGen/WebAssembly/lower-em-ehsjlj-multi-return.ll
+++ b/llvm/test/CodeGen/WebAssembly/lower-em-ehsjlj-multi-return.ll
@@ -1,5 +1,5 @@
-; RUN: not --crash llc < %s -enable-emscripten-cxx-exceptions -mattr=+multivalue 2>&1 | FileCheck %s --check-prefix=EH
-; RUN: not --crash llc < %s -enable-emscripten-sjlj -mattr=+multivalue 2>&1 | FileCheck %s --check-prefix=SJLJ
+; RUN: not --crash llc < %s -enable-emscripten-cxx-exceptions -mattr=+multivalue -wasm-emit-multivalue 2>&1 | FileCheck %s --check-prefix=EH
+; RUN: not --crash llc < %s -enable-emscripten-sjlj -mattr=+multivalue -wasm-emit-multivalue 2>&1 | FileCheck %s --check-prefix=SJLJ
; Currently multivalue returning functions are not supported in Emscripten EH /
; SjLj. Make sure they error out.
diff --git a/llvm/test/CodeGen/WebAssembly/multivalue-dont-move-def-past-use.mir b/llvm/test/CodeGen/WebAssembly/multivalue-dont-move-def-past-use.mir
index 4b4661b..4fadbd5 100644
--- a/llvm/test/CodeGen/WebAssembly/multivalue-dont-move-def-past-use.mir
+++ b/llvm/test/CodeGen/WebAssembly/multivalue-dont-move-def-past-use.mir
@@ -1,5 +1,5 @@
# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
-# RUN: llc -mtriple=wasm32-unknown-unknown -mattr=+multivalue -run-pass=wasm-reg-stackify -verify-machineinstrs %s -o - | FileCheck %s
+# RUN: llc -mtriple=wasm32-unknown-unknown -mattr=+multivalue -wasm-emit-multivalue -run-pass=wasm-reg-stackify -verify-machineinstrs %s -o - | FileCheck %s
--- |
target datalayout = "e-m:e-p:32:32-p10:8:8-p20:8:8-i64:64-n32:64-S128-ni:1:10:20"
diff --git a/llvm/test/CodeGen/WebAssembly/multivalue-stackify.ll b/llvm/test/CodeGen/WebAssembly/multivalue-stackify.ll
index 52a8c68..f4f93ac 100644
--- a/llvm/test/CodeGen/WebAssembly/multivalue-stackify.ll
+++ b/llvm/test/CodeGen/WebAssembly/multivalue-stackify.ll
@@ -1,7 +1,7 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; NOTE: Test functions have been generated by multivalue-stackify.py.
-; RUN: llc < %s -verify-machineinstrs -mattr=+multivalue | FileCheck %s
+; RUN: llc < %s -verify-machineinstrs -mattr=+multivalue -wasm-emit-multivalue | FileCheck %s
; Test that the multivalue stackification works
diff --git a/llvm/test/CodeGen/WebAssembly/multivalue.ll b/llvm/test/CodeGen/WebAssembly/multivalue.ll
index 675009c..846691e 100644
--- a/llvm/test/CodeGen/WebAssembly/multivalue.ll
+++ b/llvm/test/CodeGen/WebAssembly/multivalue.ll
@@ -1,7 +1,8 @@
-; RUN: llc < %s -asm-verbose=false -verify-machineinstrs -mcpu=mvp -mattr=+multivalue,+tail-call | FileCheck %s
-; RUN: llc < %s -asm-verbose=false -verify-machineinstrs -mcpu=mvp -mattr=+reference-types,+multivalue,+tail-call | FileCheck --check-prefix REF %s
-; RUN: llc < %s -asm-verbose=false -verify-machineinstrs -disable-wasm-fallthrough-return-opt -wasm-disable-explicit-locals -wasm-keep-registers -mcpu=mvp -mattr=+multivalue,+tail-call | FileCheck %s --check-prefix REGS
-; RUN: llc < %s --filetype=obj -mcpu=mvp -mattr=+multivalue,+tail-call | obj2yaml | FileCheck %s --check-prefix OBJ
+; RUN: llc < %s -asm-verbose=false -verify-machineinstrs -mcpu=mvp -mattr=+multivalue,+tail-call -wasm-emit-multivalue | FileCheck %s
+; RUN: llc < %s -asm-verbose=false -verify-machineinstrs -mcpu=mvp -mattr=+reference-types,+multivalue,+tail-call -wasm-emit-multivalue | FileCheck --check-prefix REF %s
+; RUN: llc < %s -asm-verbose=false -verify-machineinstrs -disable-wasm-fallthrough-return-opt -wasm-disable-explicit-locals -wasm-keep-registers -mcpu=mvp -mattr=+multivalue,+tail-call -wasm-emit-multivalue | FileCheck %s --check-prefix REGS
+; RUN: llc < %s --filetype=obj -mcpu=mvp -mattr=+multivalue,+tail-call -wasm-emit-multivalue | obj2yaml | FileCheck %s --check-prefix OBJ
+; RUN: llc < %s -asm-verbose=false -verify-machineinstrs -mcpu=mvp -mattr=+multivalue,+tail-call | FileCheck %s --check-prefix NO-MULTIVALUE
; Test that the multivalue calls, returns, function types, and block
; types work as expected.
@@ -19,6 +20,7 @@ declare void @use_i64(i64)
; CHECK-NEXT: i32.const 42{{$}}
; CHECK-NEXT: i64.const 42{{$}}
; CHECK-NEXT: end_function{{$}}
+; NO-MULTIVALUE-NOT: .functype pair_const () -> (i32, i64)
define %pair @pair_const() {
ret %pair { i32 42, i64 42 }
}
diff --git a/llvm/test/CodeGen/WebAssembly/multivalue_libcall.ll b/llvm/test/CodeGen/WebAssembly/multivalue_libcall.ll
index 47c5ae7b..7bf37b5 100644
--- a/llvm/test/CodeGen/WebAssembly/multivalue_libcall.ll
+++ b/llvm/test/CodeGen/WebAssembly/multivalue_libcall.ll
@@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
-; RUN: llc < %s -verify-machineinstrs -mcpu=mvp -mattr=+multivalue | FileCheck %s --check-prefix=MULTIVALUE
+; RUN: llc < %s -verify-machineinstrs -mcpu=mvp -mattr=+multivalue -wasm-emit-multivalue | FileCheck %s --check-prefix=MULTIVALUE
; RUN: llc < %s -verify-machineinstrs -mcpu=mvp | FileCheck %s --check-prefix=NO_MULTIVALUE
; Test libcall signatures when multivalue is enabled and disabled
diff --git a/llvm/test/CodeGen/X86/GlobalISel/calllowering-tailcall.ll b/llvm/test/CodeGen/X86/GlobalISel/calllowering-tailcall.ll
new file mode 100644
index 0000000..6a856c3
--- /dev/null
+++ b/llvm/test/CodeGen/X86/GlobalISel/calllowering-tailcall.ll
@@ -0,0 +1,24 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
+; RUN: llc -mtriple=x86_64-linux-gnu -global-isel < %s | FileCheck %s --check-prefix=X64
+; RUN: llc -mtriple=i686-linux-gnu -global-isel < %s | FileCheck %s --check-prefix=X86
+
+declare ptr @foo()
+
+define ptr @aligned_tailcall() nounwind {
+; X64-LABEL: aligned_tailcall:
+; X64: # %bb.0: # %entry
+; X64-NEXT: pushq %rax
+; X64-NEXT: callq foo
+; X64-NEXT: popq %rcx
+; X64-NEXT: retq
+;
+; X86-LABEL: aligned_tailcall:
+; X86: # %bb.0: # %entry
+; X86-NEXT: subl $12, %esp
+; X86-NEXT: calll foo
+; X86-NEXT: addl $12, %esp
+; X86-NEXT: retl
+entry:
+ %call = tail call align 8 ptr @foo()
+ ret ptr %call
+}
diff --git a/llvm/test/CodeGen/X86/tailcall-cgp-dup.ll b/llvm/test/CodeGen/X86/tailcall-cgp-dup.ll
index 401ed9f7..8a9ee60 100644
--- a/llvm/test/CodeGen/X86/tailcall-cgp-dup.ll
+++ b/llvm/test/CodeGen/X86/tailcall-cgp-dup.ll
@@ -339,7 +339,7 @@ return:
define ptr @strcpy_illegal_tailc(ptr %dest, i64 %sz, ptr readonly returned %src) nounwind {
; CHECK-LABEL: strcpy_illegal_tailc:
-; CHECK: ## %bb.0:
+; CHECK: ## %bb.0: ## %entry
; CHECK-NEXT: pushq %rbx
; CHECK-NEXT: movq %rdx, %rbx
; CHECK-NEXT: testq %rsi, %rsi
@@ -351,6 +351,7 @@ define ptr @strcpy_illegal_tailc(ptr %dest, i64 %sz, ptr readonly returned %src)
; CHECK-NEXT: movq %rbx, %rax
; CHECK-NEXT: popq %rbx
; CHECK-NEXT: retq
+entry:
%cmp = icmp eq i64 %sz, 0
br i1 %cmp, label %return, label %if.then
@@ -362,8 +363,63 @@ return:
ret ptr %src
}
+@i = global i32 0, align 4
+
+define i32 @undef_tailc() nounwind {
+; CHECK-LABEL: undef_tailc:
+; CHECK: ## %bb.0: ## %entry
+; CHECK-NEXT: cmpl $0, _i(%rip)
+; CHECK-NEXT: jne _qux ## TAILCALL
+; CHECK-NEXT: ## %bb.1: ## %return
+; CHECK-NEXT: retq
+entry:
+ %val = load i32, ptr @i, align 4
+ %cmp = icmp eq i32 %val, 0
+ br i1 %cmp, label %return, label %if.then
+
+if.then:
+ %rv_unused = tail call i32 @qux()
+ br label %return
+
+return:
+ ret i32 undef
+}
+
+define i32 @undef_and_known_tailc() nounwind {
+; CHECK-LABEL: undef_and_known_tailc:
+; CHECK: ## %bb.0: ## %entry
+; CHECK-NEXT: movl _i(%rip), %eax
+; CHECK-NEXT: cmpl $5, %eax
+; CHECK-NEXT: je _qux ## TAILCALL
+; CHECK-NEXT: ## %bb.1: ## %entry
+; CHECK-NEXT: cmpl $2, %eax
+; CHECK-NEXT: je _quux ## TAILCALL
+; CHECK-NEXT: ## %bb.2: ## %return
+; CHECK-NEXT: retq
+entry:
+ %val = load i32, ptr @i, align 4
+ switch i32 %val, label %return [
+ i32 2, label %case_2
+ i32 5, label %case_5
+ ]
+
+case_2:
+ %rv_unused = tail call i32 @quux()
+ br label %return
+
+case_5:
+ %rv = tail call i32 @qux()
+ br label %return
+
+return:
+ %phi = phi i32 [ undef, %case_2 ], [ %rv, %case_5 ], [ undef, %entry ]
+ ret i32 %phi
+}
+
declare void @llvm.memcpy.p0.p0.i64(ptr noalias nocapture writeonly, ptr noalias nocapture readonly, i64, i1)
declare void @llvm.memset.p0.i64(ptr nocapture writeonly, i8, i64, i1)
declare noalias ptr @malloc(i64)
declare ptr @strcpy(ptr noalias returned writeonly, ptr noalias nocapture readonly)
declare ptr @baz(ptr, ptr)
+declare i32 @qux()
+declare i32 @quux()