Diffstat (limited to 'llvm/test/CodeGen')
38 files changed, 28469 insertions(+), 7260 deletions(-)
diff --git a/llvm/test/CodeGen/AArch64/aarch64-matrix-umull-smull.ll b/llvm/test/CodeGen/AArch64/aarch64-matrix-umull-smull.ll index b54f262..4894932 100644 --- a/llvm/test/CodeGen/AArch64/aarch64-matrix-umull-smull.ll +++ b/llvm/test/CodeGen/AArch64/aarch64-matrix-umull-smull.ll @@ -755,199 +755,117 @@ define i64 @red_mla_dup_ext_u8_s8_s64(ptr noalias noundef readonly captures(none  ; CHECK-SD-NEXT:    // kill: def $w1 killed $w1 def $x1  ; CHECK-SD-NEXT:    cbz w2, .LBB6_3  ; CHECK-SD-NEXT:  // %bb.1: // %iter.check -; CHECK-SD-NEXT:    str x25, [sp, #-64]! // 8-byte Folded Spill -; CHECK-SD-NEXT:    stp x24, x23, [sp, #16] // 16-byte Folded Spill -; CHECK-SD-NEXT:    stp x22, x21, [sp, #32] // 16-byte Folded Spill -; CHECK-SD-NEXT:    stp x20, x19, [sp, #48] // 16-byte Folded Spill -; CHECK-SD-NEXT:    .cfi_def_cfa_offset 64 -; CHECK-SD-NEXT:    .cfi_offset w19, -8 -; CHECK-SD-NEXT:    .cfi_offset w20, -16 -; CHECK-SD-NEXT:    .cfi_offset w21, -24 -; CHECK-SD-NEXT:    .cfi_offset w22, -32 -; CHECK-SD-NEXT:    .cfi_offset w23, -40 -; CHECK-SD-NEXT:    .cfi_offset w24, -48 -; CHECK-SD-NEXT:    .cfi_offset w25, -64 -; CHECK-SD-NEXT:    sxtb x9, w1  ; CHECK-SD-NEXT:    cmp w2, #3 -; CHECK-SD-NEXT:    mov w10, w2 +; CHECK-SD-NEXT:    mov w9, w2  ; CHECK-SD-NEXT:    b.hi .LBB6_4  ; CHECK-SD-NEXT:  // %bb.2: -; CHECK-SD-NEXT:    mov x11, xzr +; CHECK-SD-NEXT:    mov x10, xzr  ; CHECK-SD-NEXT:    mov x8, xzr  ; CHECK-SD-NEXT:    b .LBB6_13  ; CHECK-SD-NEXT:  .LBB6_3: -; CHECK-SD-NEXT:    mov x0, xzr +; CHECK-SD-NEXT:    mov x8, xzr +; CHECK-SD-NEXT:    mov x0, x8  ; CHECK-SD-NEXT:    ret  ; CHECK-SD-NEXT:  .LBB6_4: // %vector.main.loop.iter.check -; CHECK-SD-NEXT:    dup v0.2d, x9  ; CHECK-SD-NEXT:    cmp w2, #16  ; CHECK-SD-NEXT:    b.hs .LBB6_6  ; CHECK-SD-NEXT:  // %bb.5: -; CHECK-SD-NEXT:    mov x11, xzr +; CHECK-SD-NEXT:    mov x10, xzr  ; CHECK-SD-NEXT:    mov x8, xzr  ; CHECK-SD-NEXT:    b .LBB6_10  ; CHECK-SD-NEXT:  .LBB6_6: // %vector.ph +; CHECK-SD-NEXT:    mov w8, w1 +; CHECK-SD-NEXT:    movi v0.2d, #0000000000000000  ; CHECK-SD-NEXT:    movi v1.2d, #0000000000000000 -; CHECK-SD-NEXT:    mov x8, v0.d[1] -; CHECK-SD-NEXT:    and x12, x10, #0xc +; CHECK-SD-NEXT:    sxtb x8, w8 +; CHECK-SD-NEXT:    movi v3.2d, #0000000000000000  ; CHECK-SD-NEXT:    movi v2.2d, #0000000000000000 +; CHECK-SD-NEXT:    movi v6.2d, #0000000000000000  ; CHECK-SD-NEXT:    movi v4.2d, #0000000000000000 -; CHECK-SD-NEXT:    and x11, x10, #0xfffffff0 -; CHECK-SD-NEXT:    movi v3.2d, #0000000000000000 +; CHECK-SD-NEXT:    and x11, x9, #0xc  ; CHECK-SD-NEXT:    movi v7.2d, #0000000000000000 -; CHECK-SD-NEXT:    mov x15, x0  ; CHECK-SD-NEXT:    movi v5.2d, #0000000000000000 -; CHECK-SD-NEXT:    movi v16.2d, #0000000000000000 -; CHECK-SD-NEXT:    and x16, x10, #0xfffffff0 -; CHECK-SD-NEXT:    movi v6.2d, #0000000000000000 -; CHECK-SD-NEXT:    fmov x13, d0 -; CHECK-SD-NEXT:    fmov x14, d0 +; CHECK-SD-NEXT:    and x10, x9, #0xfffffff0 +; CHECK-SD-NEXT:    dup v16.4s, w8 +; CHECK-SD-NEXT:    mov x8, x0 +; CHECK-SD-NEXT:    and x12, x9, #0xfffffff0  ; CHECK-SD-NEXT:  .LBB6_7: // %vector.body  ; CHECK-SD-NEXT:    // =>This Inner Loop Header: Depth=1 -; CHECK-SD-NEXT:    ldr q17, [x15], #16 -; CHECK-SD-NEXT:    subs x16, x16, #16 +; CHECK-SD-NEXT:    ldr q17, [x8], #16 +; CHECK-SD-NEXT:    subs x12, x12, #16  ; CHECK-SD-NEXT:    ushll v18.8h, v17.8b, #0 -; CHECK-SD-NEXT:    ushll2 v19.8h, v17.16b, #0 -; CHECK-SD-NEXT:    ushll v17.4s, v18.4h, #0 -; CHECK-SD-NEXT:    ushll2 v20.4s, v19.8h, #0 -; CHECK-SD-NEXT:    ushll2 
v18.4s, v18.8h, #0 -; CHECK-SD-NEXT:    ushll v19.4s, v19.4h, #0 -; CHECK-SD-NEXT:    ushll v21.2d, v17.2s, #0 -; CHECK-SD-NEXT:    ushll2 v22.2d, v20.4s, #0 -; CHECK-SD-NEXT:    ushll2 v17.2d, v17.4s, #0 -; CHECK-SD-NEXT:    ushll v23.2d, v18.2s, #0 -; CHECK-SD-NEXT:    ushll v20.2d, v20.2s, #0 -; CHECK-SD-NEXT:    ushll2 v18.2d, v18.4s, #0 -; CHECK-SD-NEXT:    fmov x17, d21 -; CHECK-SD-NEXT:    mov x2, v21.d[1] -; CHECK-SD-NEXT:    ushll v21.2d, v19.2s, #0 -; CHECK-SD-NEXT:    ushll2 v19.2d, v19.4s, #0 -; CHECK-SD-NEXT:    fmov x18, d22 -; CHECK-SD-NEXT:    fmov x1, d17 -; CHECK-SD-NEXT:    fmov x3, d23 -; CHECK-SD-NEXT:    fmov x21, d20 -; CHECK-SD-NEXT:    fmov x22, d18 -; CHECK-SD-NEXT:    fmov x19, d21 -; CHECK-SD-NEXT:    mul x17, x13, x17 -; CHECK-SD-NEXT:    mov x4, v22.d[1] -; CHECK-SD-NEXT:    fmov x24, d19 -; CHECK-SD-NEXT:    mov x5, v23.d[1] -; CHECK-SD-NEXT:    mov x6, v21.d[1] -; CHECK-SD-NEXT:    mov x7, v20.d[1] -; CHECK-SD-NEXT:    mov x20, v18.d[1] -; CHECK-SD-NEXT:    mov x23, v19.d[1] -; CHECK-SD-NEXT:    mov x25, v17.d[1] -; CHECK-SD-NEXT:    mul x18, x14, x18 -; CHECK-SD-NEXT:    mul x1, x13, x1 -; CHECK-SD-NEXT:    fmov d17, x17 -; CHECK-SD-NEXT:    mul x3, x13, x3 -; CHECK-SD-NEXT:    fmov d18, x18 -; CHECK-SD-NEXT:    mul x19, x13, x19 -; CHECK-SD-NEXT:    fmov d19, x1 -; CHECK-SD-NEXT:    mul x21, x13, x21 -; CHECK-SD-NEXT:    fmov d20, x3 -; CHECK-SD-NEXT:    mul x22, x13, x22 -; CHECK-SD-NEXT:    fmov d21, x19 -; CHECK-SD-NEXT:    mul x24, x13, x24 -; CHECK-SD-NEXT:    fmov d24, x21 -; CHECK-SD-NEXT:    mul x2, x8, x2 -; CHECK-SD-NEXT:    fmov d22, x22 -; CHECK-SD-NEXT:    mul x4, x8, x4 -; CHECK-SD-NEXT:    fmov d23, x24 -; CHECK-SD-NEXT:    mul x5, x8, x5 -; CHECK-SD-NEXT:    mov v17.d[1], x2 -; CHECK-SD-NEXT:    mul x6, x8, x6 -; CHECK-SD-NEXT:    mov v18.d[1], x4 -; CHECK-SD-NEXT:    mul x7, x8, x7 -; CHECK-SD-NEXT:    mov v20.d[1], x5 -; CHECK-SD-NEXT:    add v1.2d, v17.2d, v1.2d -; CHECK-SD-NEXT:    mul x20, x8, x20 -; CHECK-SD-NEXT:    mov v21.d[1], x6 -; CHECK-SD-NEXT:    add v6.2d, v18.2d, v6.2d -; CHECK-SD-NEXT:    mul x23, x8, x23 -; CHECK-SD-NEXT:    mov v24.d[1], x7 -; CHECK-SD-NEXT:    add v4.2d, v20.2d, v4.2d -; CHECK-SD-NEXT:    mul x17, x8, x25 -; CHECK-SD-NEXT:    mov v22.d[1], x20 -; CHECK-SD-NEXT:    add v7.2d, v21.2d, v7.2d -; CHECK-SD-NEXT:    mov v23.d[1], x23 -; CHECK-SD-NEXT:    add v16.2d, v24.2d, v16.2d -; CHECK-SD-NEXT:    mov v19.d[1], x17 -; CHECK-SD-NEXT:    add v3.2d, v22.2d, v3.2d -; CHECK-SD-NEXT:    add v5.2d, v23.2d, v5.2d -; CHECK-SD-NEXT:    add v2.2d, v19.2d, v2.2d +; CHECK-SD-NEXT:    ushll2 v17.8h, v17.16b, #0 +; CHECK-SD-NEXT:    ushll2 v19.4s, v18.8h, #0 +; CHECK-SD-NEXT:    ushll v20.4s, v17.4h, #0 +; CHECK-SD-NEXT:    ushll v18.4s, v18.4h, #0 +; CHECK-SD-NEXT:    ushll2 v17.4s, v17.8h, #0 +; CHECK-SD-NEXT:    smlal2 v2.2d, v16.4s, v19.4s +; CHECK-SD-NEXT:    smlal2 v4.2d, v16.4s, v20.4s +; CHECK-SD-NEXT:    smlal v6.2d, v16.2s, v20.2s +; CHECK-SD-NEXT:    smlal v3.2d, v16.2s, v19.2s +; CHECK-SD-NEXT:    smlal2 v1.2d, v16.4s, v18.4s +; CHECK-SD-NEXT:    smlal v7.2d, v16.2s, v17.2s +; CHECK-SD-NEXT:    smlal v0.2d, v16.2s, v18.2s +; CHECK-SD-NEXT:    smlal2 v5.2d, v16.4s, v17.4s  ; CHECK-SD-NEXT:    b.ne .LBB6_7  ; CHECK-SD-NEXT:  // %bb.8: // %middle.block -; CHECK-SD-NEXT:    add v1.2d, v1.2d, v7.2d -; CHECK-SD-NEXT:    add v4.2d, v4.2d, v16.2d -; CHECK-SD-NEXT:    cmp x11, x10 -; CHECK-SD-NEXT:    add v2.2d, v2.2d, v5.2d -; CHECK-SD-NEXT:    add v3.2d, v3.2d, v6.2d +; CHECK-SD-NEXT:    add v0.2d, v0.2d, v6.2d +; 
CHECK-SD-NEXT:    add v3.2d, v3.2d, v7.2d +; CHECK-SD-NEXT:    cmp x10, x9  ; CHECK-SD-NEXT:    add v1.2d, v1.2d, v4.2d -; CHECK-SD-NEXT:    add v2.2d, v2.2d, v3.2d +; CHECK-SD-NEXT:    add v2.2d, v2.2d, v5.2d +; CHECK-SD-NEXT:    add v0.2d, v0.2d, v3.2d  ; CHECK-SD-NEXT:    add v1.2d, v1.2d, v2.2d -; CHECK-SD-NEXT:    addp d1, v1.2d -; CHECK-SD-NEXT:    fmov x8, d1 +; CHECK-SD-NEXT:    add v0.2d, v0.2d, v1.2d +; CHECK-SD-NEXT:    addp d0, v0.2d +; CHECK-SD-NEXT:    fmov x8, d0  ; CHECK-SD-NEXT:    b.eq .LBB6_15  ; CHECK-SD-NEXT:  // %bb.9: // %vec.epilog.iter.check -; CHECK-SD-NEXT:    cbz x12, .LBB6_13 +; CHECK-SD-NEXT:    cbz x11, .LBB6_13  ; CHECK-SD-NEXT:  .LBB6_10: // %vec.epilog.ph +; CHECK-SD-NEXT:    movi v0.2d, #0000000000000000 +; CHECK-SD-NEXT:    mov w11, w1  ; CHECK-SD-NEXT:    movi v1.2d, #0000000000000000 -; CHECK-SD-NEXT:    movi v2.2d, #0000000000000000 -; CHECK-SD-NEXT:    mov x13, x11 +; CHECK-SD-NEXT:    sxtb x11, w11  ; CHECK-SD-NEXT:    movi v3.2d, #0x000000000000ff -; CHECK-SD-NEXT:    fmov x14, d0 -; CHECK-SD-NEXT:    and x11, x10, #0xfffffffc -; CHECK-SD-NEXT:    fmov x15, d0 -; CHECK-SD-NEXT:    sub x12, x13, x11 -; CHECK-SD-NEXT:    add x13, x0, x13 -; CHECK-SD-NEXT:    mov v1.d[0], x8 -; CHECK-SD-NEXT:    mov x8, v0.d[1] +; CHECK-SD-NEXT:    dup v2.2s, w11 +; CHECK-SD-NEXT:    mov x11, x10 +; CHECK-SD-NEXT:    and x10, x9, #0xfffffffc +; CHECK-SD-NEXT:    mov v0.d[0], x8 +; CHECK-SD-NEXT:    sub x8, x11, x10 +; CHECK-SD-NEXT:    add x11, x0, x11  ; CHECK-SD-NEXT:  .LBB6_11: // %vec.epilog.vector.body  ; CHECK-SD-NEXT:    // =>This Inner Loop Header: Depth=1 -; CHECK-SD-NEXT:    ldr s0, [x13], #4 -; CHECK-SD-NEXT:    adds x12, x12, #4 -; CHECK-SD-NEXT:    ushll v0.8h, v0.8b, #0 -; CHECK-SD-NEXT:    ushll v0.4s, v0.4h, #0 -; CHECK-SD-NEXT:    ushll v4.2d, v0.2s, #0 -; CHECK-SD-NEXT:    ushll2 v0.2d, v0.4s, #0 +; CHECK-SD-NEXT:    ldr s4, [x11], #4 +; CHECK-SD-NEXT:    adds x8, x8, #4 +; CHECK-SD-NEXT:    ushll v4.8h, v4.8b, #0 +; CHECK-SD-NEXT:    ushll v4.4s, v4.4h, #0 +; CHECK-SD-NEXT:    ushll v5.2d, v4.2s, #0 +; CHECK-SD-NEXT:    ushll2 v4.2d, v4.4s, #0 +; CHECK-SD-NEXT:    and v5.16b, v5.16b, v3.16b  ; CHECK-SD-NEXT:    and v4.16b, v4.16b, v3.16b -; CHECK-SD-NEXT:    and v0.16b, v0.16b, v3.16b -; CHECK-SD-NEXT:    fmov x16, d4 -; CHECK-SD-NEXT:    fmov x18, d0 -; CHECK-SD-NEXT:    mov x17, v4.d[1] -; CHECK-SD-NEXT:    mov x1, v0.d[1] -; CHECK-SD-NEXT:    mul x16, x14, x16 -; CHECK-SD-NEXT:    mul x18, x15, x18 -; CHECK-SD-NEXT:    mul x17, x8, x17 -; CHECK-SD-NEXT:    fmov d0, x16 -; CHECK-SD-NEXT:    mul x1, x8, x1 -; CHECK-SD-NEXT:    fmov d4, x18 -; CHECK-SD-NEXT:    mov v0.d[1], x17 -; CHECK-SD-NEXT:    mov v4.d[1], x1 -; CHECK-SD-NEXT:    add v1.2d, v0.2d, v1.2d -; CHECK-SD-NEXT:    add v2.2d, v4.2d, v2.2d +; CHECK-SD-NEXT:    xtn v5.2s, v5.2d +; CHECK-SD-NEXT:    xtn v4.2s, v4.2d +; CHECK-SD-NEXT:    smlal v1.2d, v2.2s, v4.2s +; CHECK-SD-NEXT:    smlal v0.2d, v2.2s, v5.2s  ; CHECK-SD-NEXT:    b.ne .LBB6_11  ; CHECK-SD-NEXT:  // %bb.12: // %vec.epilog.middle.block -; CHECK-SD-NEXT:    add v0.2d, v1.2d, v2.2d -; CHECK-SD-NEXT:    cmp x11, x10 +; CHECK-SD-NEXT:    add v0.2d, v0.2d, v1.2d +; CHECK-SD-NEXT:    cmp x10, x9  ; CHECK-SD-NEXT:    addp d0, v0.2d  ; CHECK-SD-NEXT:    fmov x8, d0  ; CHECK-SD-NEXT:    b.eq .LBB6_15  ; CHECK-SD-NEXT:  .LBB6_13: // %for.body.preheader -; CHECK-SD-NEXT:    sub x10, x10, x11 -; CHECK-SD-NEXT:    add x11, x0, x11 +; CHECK-SD-NEXT:    sxtb x11, w1 +; CHECK-SD-NEXT:    sub x9, x9, x10 +; CHECK-SD-NEXT:    add x10, x0, x10 
 ; CHECK-SD-NEXT:  .LBB6_14: // %for.body  ; CHECK-SD-NEXT:    // =>This Inner Loop Header: Depth=1 -; CHECK-SD-NEXT:    ldrb w12, [x11], #1 -; CHECK-SD-NEXT:    subs x10, x10, #1 -; CHECK-SD-NEXT:    smaddl x8, w12, w9, x8 +; CHECK-SD-NEXT:    ldrb w12, [x10], #1 +; CHECK-SD-NEXT:    subs x9, x9, #1 +; CHECK-SD-NEXT:    smaddl x8, w12, w11, x8  ; CHECK-SD-NEXT:    b.ne .LBB6_14 -; CHECK-SD-NEXT:  .LBB6_15: -; CHECK-SD-NEXT:    ldp x20, x19, [sp, #48] // 16-byte Folded Reload -; CHECK-SD-NEXT:    ldp x22, x21, [sp, #32] // 16-byte Folded Reload -; CHECK-SD-NEXT:    ldp x24, x23, [sp, #16] // 16-byte Folded Reload -; CHECK-SD-NEXT:    ldr x25, [sp], #64 // 8-byte Folded Reload +; CHECK-SD-NEXT:  .LBB6_15: // %for.cond.cleanup  ; CHECK-SD-NEXT:    mov x0, x8  ; CHECK-SD-NEXT:    ret  ; @@ -957,63 +875,64 @@ define i64 @red_mla_dup_ext_u8_s8_s64(ptr noalias noundef readonly captures(none  ; CHECK-GI-NEXT:    cbz w2, .LBB6_7  ; CHECK-GI-NEXT:  // %bb.1: // %iter.check  ; CHECK-GI-NEXT:    movi d0, #0000000000000000 -; CHECK-GI-NEXT:    sxtb x9, w1 -; CHECK-GI-NEXT:    mov x11, xzr +; CHECK-GI-NEXT:    mov x10, xzr  ; CHECK-GI-NEXT:    cmp w2, #4 -; CHECK-GI-NEXT:    mov w10, w2 +; CHECK-GI-NEXT:    mov w9, w2  ; CHECK-GI-NEXT:    b.lo .LBB6_12  ; CHECK-GI-NEXT:  // %bb.2: // %vector.main.loop.iter.check  ; CHECK-GI-NEXT:    movi d0, #0000000000000000 -; CHECK-GI-NEXT:    dup v1.2d, x9 -; CHECK-GI-NEXT:    mov x11, xzr +; CHECK-GI-NEXT:    mov x10, xzr  ; CHECK-GI-NEXT:    cmp w2, #16  ; CHECK-GI-NEXT:    b.lo .LBB6_9  ; CHECK-GI-NEXT:  // %bb.3: // %vector.ph +; CHECK-GI-NEXT:    mov w8, w1  ; CHECK-GI-NEXT:    movi v0.2d, #0000000000000000 -; CHECK-GI-NEXT:    xtn v2.2s, v1.2d -; CHECK-GI-NEXT:    and x8, x10, #0xc +; CHECK-GI-NEXT:    movi v1.2d, #0000000000000000 +; CHECK-GI-NEXT:    sxtb x8, w8 +; CHECK-GI-NEXT:    movi v2.2d, #0000000000000000  ; CHECK-GI-NEXT:    movi v3.2d, #0000000000000000  ; CHECK-GI-NEXT:    movi v4.2d, #0000000000000000 -; CHECK-GI-NEXT:    and x11, x10, #0xfffffff0 -; CHECK-GI-NEXT:    movi v5.2d, #0000000000000000  ; CHECK-GI-NEXT:    movi v6.2d, #0000000000000000 -; CHECK-GI-NEXT:    mov x12, x0 +; CHECK-GI-NEXT:    and x10, x9, #0xfffffff0 +; CHECK-GI-NEXT:    dup v5.2d, x8  ; CHECK-GI-NEXT:    movi v7.2d, #0000000000000000 -; CHECK-GI-NEXT:    movi v16.2d, #0000000000000000 -; CHECK-GI-NEXT:    and x13, x10, #0xfffffff0 -; CHECK-GI-NEXT:    movi v17.2d, #0000000000000000 +; CHECK-GI-NEXT:    and x8, x9, #0xc +; CHECK-GI-NEXT:    mov x11, x0 +; CHECK-GI-NEXT:    and x12, x9, #0xfffffff0 +; CHECK-GI-NEXT:    xtn v16.2s, v5.2d +; CHECK-GI-NEXT:    movi v5.2d, #0000000000000000  ; CHECK-GI-NEXT:  .LBB6_4: // %vector.body  ; CHECK-GI-NEXT:    // =>This Inner Loop Header: Depth=1 -; CHECK-GI-NEXT:    ldr q18, [x12], #16 -; CHECK-GI-NEXT:    subs x13, x13, #16 -; CHECK-GI-NEXT:    ushll v19.8h, v18.8b, #0 -; CHECK-GI-NEXT:    ushll2 v18.8h, v18.16b, #0 -; CHECK-GI-NEXT:    ushll v20.4s, v19.4h, #0 -; CHECK-GI-NEXT:    ushll2 v19.4s, v19.8h, #0 -; CHECK-GI-NEXT:    ushll v21.4s, v18.4h, #0 +; CHECK-GI-NEXT:    ldr q17, [x11], #16 +; CHECK-GI-NEXT:    subs x12, x12, #16 +; CHECK-GI-NEXT:    ushll v18.8h, v17.8b, #0 +; CHECK-GI-NEXT:    ushll2 v17.8h, v17.16b, #0 +; CHECK-GI-NEXT:    ushll v19.4s, v18.4h, #0  ; CHECK-GI-NEXT:    ushll2 v18.4s, v18.8h, #0 -; CHECK-GI-NEXT:    mov d22, v20.d[1] -; CHECK-GI-NEXT:    mov d23, v19.d[1] -; CHECK-GI-NEXT:    mov d24, v21.d[1] -; CHECK-GI-NEXT:    mov d25, v18.d[1] -; CHECK-GI-NEXT:    smlal v0.2d, v2.2s, v20.2s -; 
CHECK-GI-NEXT:    smlal v4.2d, v2.2s, v19.2s -; CHECK-GI-NEXT:    smlal v6.2d, v2.2s, v21.2s -; CHECK-GI-NEXT:    smlal v16.2d, v2.2s, v18.2s -; CHECK-GI-NEXT:    smlal v3.2d, v2.2s, v22.2s -; CHECK-GI-NEXT:    smlal v5.2d, v2.2s, v23.2s -; CHECK-GI-NEXT:    smlal v7.2d, v2.2s, v24.2s -; CHECK-GI-NEXT:    smlal v17.2d, v2.2s, v25.2s +; CHECK-GI-NEXT:    ushll v20.4s, v17.4h, #0 +; CHECK-GI-NEXT:    ushll2 v17.4s, v17.8h, #0 +; CHECK-GI-NEXT:    mov d21, v19.d[1] +; CHECK-GI-NEXT:    mov d22, v18.d[1] +; CHECK-GI-NEXT:    mov d23, v20.d[1] +; CHECK-GI-NEXT:    mov d24, v17.d[1] +; CHECK-GI-NEXT:    smlal v0.2d, v16.2s, v19.2s +; CHECK-GI-NEXT:    smlal v2.2d, v16.2s, v18.2s +; CHECK-GI-NEXT:    smlal v4.2d, v16.2s, v20.2s +; CHECK-GI-NEXT:    smlal v6.2d, v16.2s, v17.2s +; CHECK-GI-NEXT:    smlal v1.2d, v16.2s, v21.2s +; CHECK-GI-NEXT:    smlal v3.2d, v16.2s, v22.2s +; CHECK-GI-NEXT:    smlal v5.2d, v16.2s, v23.2s +; CHECK-GI-NEXT:    smlal v7.2d, v16.2s, v24.2s  ; CHECK-GI-NEXT:    b.ne .LBB6_4  ; CHECK-GI-NEXT:  // %bb.5: // %middle.block -; CHECK-GI-NEXT:    add v0.2d, v0.2d, v3.2d +; CHECK-GI-NEXT:    add v0.2d, v0.2d, v1.2d +; CHECK-GI-NEXT:    add v1.2d, v2.2d, v3.2d +; CHECK-GI-NEXT:    cmp x10, x9  ; CHECK-GI-NEXT:    add v2.2d, v4.2d, v5.2d -; CHECK-GI-NEXT:    cmp x11, x10  ; CHECK-GI-NEXT:    add v3.2d, v6.2d, v7.2d -; CHECK-GI-NEXT:    add v4.2d, v16.2d, v17.2d -; CHECK-GI-NEXT:    add v0.2d, v0.2d, v2.2d -; CHECK-GI-NEXT:    add v2.2d, v3.2d, v4.2d -; CHECK-GI-NEXT:    add v0.2d, v0.2d, v2.2d +; CHECK-GI-NEXT:    add v0.2d, v0.2d, v1.2d +; CHECK-GI-NEXT:    add v1.2d, v2.2d, v3.2d +; CHECK-GI-NEXT:    add v0.2d, v0.2d, v1.2d  ; CHECK-GI-NEXT:    addp d0, v0.2d  ; CHECK-GI-NEXT:    b.ne .LBB6_8  ; CHECK-GI-NEXT:  // %bb.6: @@ -1027,50 +946,54 @@ define i64 @red_mla_dup_ext_u8_s8_s64(ptr noalias noundef readonly captures(none  ; CHECK-GI-NEXT:  .LBB6_8: // %vec.epilog.iter.check  ; CHECK-GI-NEXT:    cbz x8, .LBB6_12  ; CHECK-GI-NEXT:  .LBB6_9: // %vec.epilog.ph +; CHECK-GI-NEXT:    mov w8, w1  ; CHECK-GI-NEXT:    mov v0.d[1], xzr -; CHECK-GI-NEXT:    movi v2.2d, #0000000000000000 -; CHECK-GI-NEXT:    mov x12, x11 -; CHECK-GI-NEXT:    xtn v1.2s, v1.2d -; CHECK-GI-NEXT:    and x11, x10, #0xfffffffc -; CHECK-GI-NEXT:    sub x8, x12, x11 -; CHECK-GI-NEXT:    add x12, x0, x12 +; CHECK-GI-NEXT:    movi v1.2d, #0000000000000000 +; CHECK-GI-NEXT:    sxtb x8, w8 +; CHECK-GI-NEXT:    mov x11, x10 +; CHECK-GI-NEXT:    and x10, x9, #0xfffffffc +; CHECK-GI-NEXT:    dup v2.2d, x8 +; CHECK-GI-NEXT:    sub x8, x11, x10 +; CHECK-GI-NEXT:    add x11, x0, x11 +; CHECK-GI-NEXT:    xtn v2.2s, v2.2d  ; CHECK-GI-NEXT:  .LBB6_10: // %vec.epilog.vector.body  ; CHECK-GI-NEXT:    // =>This Inner Loop Header: Depth=1 -; CHECK-GI-NEXT:    ldr w13, [x12], #4 +; CHECK-GI-NEXT:    ldr w12, [x11], #4  ; CHECK-GI-NEXT:    adds x8, x8, #4 -; CHECK-GI-NEXT:    fmov s3, w13 -; CHECK-GI-NEXT:    uxtb w13, w13 +; CHECK-GI-NEXT:    fmov s3, w12 +; CHECK-GI-NEXT:    uxtb w12, w12  ; CHECK-GI-NEXT:    mov b4, v3.b[2]  ; CHECK-GI-NEXT:    mov b5, v3.b[1]  ; CHECK-GI-NEXT:    mov b6, v3.b[3] -; CHECK-GI-NEXT:    fmov s3, w13 -; CHECK-GI-NEXT:    fmov w14, s4 -; CHECK-GI-NEXT:    fmov w15, s5 -; CHECK-GI-NEXT:    fmov w16, s6 +; CHECK-GI-NEXT:    fmov s3, w12 +; CHECK-GI-NEXT:    fmov w13, s4 +; CHECK-GI-NEXT:    fmov w14, s5 +; CHECK-GI-NEXT:    fmov w15, s6 +; CHECK-GI-NEXT:    uxtb w13, w13  ; CHECK-GI-NEXT:    uxtb w14, w14  ; CHECK-GI-NEXT:    uxtb w15, w15 -; CHECK-GI-NEXT:    uxtb w16, w16 -; CHECK-GI-NEXT:    fmov 
s4, w14 -; CHECK-GI-NEXT:    mov v3.s[1], w15 -; CHECK-GI-NEXT:    mov v4.s[1], w16 -; CHECK-GI-NEXT:    smlal v0.2d, v1.2s, v3.2s -; CHECK-GI-NEXT:    smlal v2.2d, v1.2s, v4.2s +; CHECK-GI-NEXT:    fmov s4, w13 +; CHECK-GI-NEXT:    mov v3.s[1], w14 +; CHECK-GI-NEXT:    mov v4.s[1], w15 +; CHECK-GI-NEXT:    smlal v0.2d, v2.2s, v3.2s +; CHECK-GI-NEXT:    smlal v1.2d, v2.2s, v4.2s  ; CHECK-GI-NEXT:    b.ne .LBB6_10  ; CHECK-GI-NEXT:  // %bb.11: // %vec.epilog.middle.block -; CHECK-GI-NEXT:    add v0.2d, v0.2d, v2.2d -; CHECK-GI-NEXT:    cmp x11, x10 +; CHECK-GI-NEXT:    add v0.2d, v0.2d, v1.2d +; CHECK-GI-NEXT:    cmp x10, x9  ; CHECK-GI-NEXT:    addp d0, v0.2d  ; CHECK-GI-NEXT:    fmov x8, d0  ; CHECK-GI-NEXT:    b.eq .LBB6_14  ; CHECK-GI-NEXT:  .LBB6_12: // %for.body.preheader -; CHECK-GI-NEXT:    sub x10, x10, x11 -; CHECK-GI-NEXT:    add x11, x0, x11 +; CHECK-GI-NEXT:    sxtb x11, w1 +; CHECK-GI-NEXT:    sub x9, x9, x10 +; CHECK-GI-NEXT:    add x10, x0, x10  ; CHECK-GI-NEXT:  .LBB6_13: // %for.body  ; CHECK-GI-NEXT:    // =>This Inner Loop Header: Depth=1 -; CHECK-GI-NEXT:    ldrb w8, [x11], #1 +; CHECK-GI-NEXT:    ldrb w8, [x10], #1  ; CHECK-GI-NEXT:    fmov x12, d0 -; CHECK-GI-NEXT:    subs x10, x10, #1 -; CHECK-GI-NEXT:    madd x8, x8, x9, x12 +; CHECK-GI-NEXT:    subs x9, x9, #1 +; CHECK-GI-NEXT:    madd x8, x8, x11, x12  ; CHECK-GI-NEXT:    fmov d0, x8  ; CHECK-GI-NEXT:    b.ne .LBB6_13  ; CHECK-GI-NEXT:  .LBB6_14: // %for.cond.cleanup diff --git a/llvm/test/CodeGen/AArch64/cfguard-arm64ec.ll b/llvm/test/CodeGen/AArch64/cfguard-arm64ec.ll index bdbc99e..75e7ac90 100644 --- a/llvm/test/CodeGen/AArch64/cfguard-arm64ec.ll +++ b/llvm/test/CodeGen/AArch64/cfguard-arm64ec.ll @@ -2,15 +2,58 @@  declare void @called()  declare void @escaped() -define void @f(ptr %dst) { +define void @f(ptr %dst, ptr readonly %f) {    call void @called() +; CHECK:         bl      "#called"    store ptr @escaped, ptr %dst -  ret void +  call void %f() +; CHECK:       adrp    x10, $iexit_thunk$cdecl$v$v +; CHECK-NEXT:  add     x10, x10, :lo12:$iexit_thunk$cdecl$v$v +; CHECK-NEXT:  str     x8, [x20] +; CHECK-NEXT:  adrp    x8, __os_arm64x_check_icall_cfg +; CHECK-NEXT:  ldr     x8, [x8, :lo12:__os_arm64x_check_icall_cfg] +; CHECK-NEXT:  mov     x11, +; CHECK-NEXT:  blr     x8 +; CHECK-NEXT:  blr     x11 +    ret void  } +; CHECK-LABEL:    .def "#called$exit_thunk"; +; CHECK-NEXT:     .scl 2; +; CHECK-NEXT:     .type 32; +; CHECK-NEXT:     .endef +; CHECK-NEXT:     .section .wowthk$aa,"xr",discard,"#called$exit_thunk" +; CHECK-NEXT:     .globl "#called$exit_thunk"            // -- Begin function #called$exit_thunk +; CHECK-NEXT:     .p2align 2 +; CHECK-NEXT: "#called$exit_thunk":                   // @"#called$exit_thunk" +; CHECK-NEXT:     .weak_anti_dep called +; CHECK-NEXT: called = "#called" +; CHECK-NEXT:     .weak_anti_dep "#called" +; CHECK-NEXT: "#called" = "#called$exit_thunk" +; CHECK-NEXT:    .seh_proc "#called$exit_thunk" +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT:     str x30, [sp, #-16]!                
// 8-byte Folded Spill +; CHECK-NEXT:     .seh_save_reg_x x30, 16 +; CHECK-NEXT:     .seh_endprologue +; CHECK-NEXT:     adrp x8, __os_arm64x_check_icall +; CHECK-NEXT:     adrp x11, called +; CHECK-NEXT:     add x11, x11, :lo12:called +; CHECK-NEXT:     ldr x8, [x8, :lo12:__os_arm64x_check_icall] +; CHECK-NEXT:     adrp x10, $iexit_thunk$cdecl$v$v +; CHECK-NEXT:     add x10, x10, :lo12:$iexit_thunk$cdecl$v$v +; CHECK-NEXT:     blr x8 +; CHECK-NEXT:     .seh_startepilogue +; CHECK-NEXT:     ldr x30, [sp], #16                  // 8-byte Folded Reload +; CHECK-NEXT:     .seh_save_reg_x x30, 16 +; CHECK-NEXT:     .seh_endepilogue +; CHECK-NEXT:     br x11 +; CHECK-NEXT:     .seh_endfunclet +; CHECK-NEXT:     .seh_endproc +  !llvm.module.flags = !{!0} -!0 = !{i32 2, !"cfguard", i32 1} +!0 = !{i32 2, !"cfguard", i32 2}  ; CHECK-LABEL: .section .gfids$y,"dr"  ; CHECK-NEXT:  .symidx escaped +; CHECK-NEXT:  .symidx $iexit_thunk$cdecl$v$v  ; CHECK-NOT:   .symidx diff --git a/llvm/test/CodeGen/AArch64/framelayout-split-sve.mir b/llvm/test/CodeGen/AArch64/framelayout-split-sve.mir index 35eafe8..f535e0f 100644 --- a/llvm/test/CodeGen/AArch64/framelayout-split-sve.mir +++ b/llvm/test/CodeGen/AArch64/framelayout-split-sve.mir @@ -68,13 +68,9 @@  # CHECK:      early-clobber $sp = frame-setup STRXpre killed $fp, $sp, -16 :: (store (s64) into %stack.4)  # CHECK-NEXT: frame-setup CFI_INSTRUCTION def_cfa_offset 16  # CHECK-NEXT: frame-setup CFI_INSTRUCTION offset $w29, -16 -# CHECK-NEXT: $sp = frame-setup SUBXri $sp, 1024, 0 -# CHECK-NEXT: frame-setup CFI_INSTRUCTION def_cfa_offset 1040 -# CHECK-NEXT: $sp = frame-setup ADDVL_XXI $sp, -1, implicit $vg -# CHECK-NEXT: frame-setup CFI_INSTRUCTION escape 0x0f, 0x09, 0x8f, 0x90, 0x08, 0x92, 0x2e, 0x00, 0x38, 0x1e, 0x22 -# CHECK-NEXT: $sp = frame-setup SUBXri $sp, 1040, 0 -# CHECK-NEXT: frame-setup CFI_INSTRUCTION escape 0x0f, 0x09, 0x8f, 0xa0, 0x10, 0x92, 0x2e, 0x00, 0x38, 0x1e, 0x22 -# CHECK-NEXT: $sp = frame-setup ADDVL_XXI $sp, -2, implicit $vg +# CHECK-NEXT: $sp = frame-setup SUBXri $sp, 2064, 0 +# CHECK-NEXT: frame-setup CFI_INSTRUCTION def_cfa_offset 2080 +# CHECK-NEXT: $sp = frame-setup ADDVL_XXI $sp, -3, implicit $vg  # CHECK-NEXT: frame-setup CFI_INSTRUCTION escape 0x0f, 0x09, 0x8f, 0xa0, 0x10, 0x92, 0x2e, 0x00, 0x48, 0x1e, 0x22  #  # CHECK-NEXT: $x8 = ADDXri $sp, 1040, 0 @@ -83,14 +79,10 @@  # CHECK-NEXT: $x8 = ADDXri $sp, 2064, 0  # CHECK-NEXT: STR_PXI $p0, killed $x8, 18 :: (store (<vscale x 1 x s16>) into %stack.1)  # -# CHECK-NEXT: $sp = frame-destroy ADDVL_XXI $sp, 2, implicit $vg -# CHECK-NEXT: frame-destroy CFI_INSTRUCTION escape 0x0f, 0x09, 0x8f, 0xa0, 0x10, 0x92, 0x2e, 0x00, 0x38, 0x1e, 0x22 -# CHECK-NEXT: $sp = frame-destroy ADDXri $sp, 1024, 0 -# CHECK-NEXT: frame-destroy CFI_INSTRUCTION escape 0x0f, 0x09, 0x8f, 0xa0, 0x08, 0x92, 0x2e, 0x00, 0x38, 0x1e, 0x22 -# CHECK-NEXT: $sp = frame-destroy ADDVL_XXI $sp, 1, implicit $vg -# CHECK-NEXT: frame-destroy CFI_INSTRUCTION def_cfa $wsp, 1056 -# CHECK-NEXT: $sp = frame-destroy ADDXri $sp, 1040, 0 -# CHECK-NEXT: frame-destroy CFI_INSTRUCTION def_cfa_offset 16 +# CHECK-NEXT: $sp = frame-destroy ADDXri $sp, 2064, 0 +# CHECK-NEXT: frame-destroy CFI_INSTRUCTION escape 0x0f, 0x08, 0x8f, 0x10, 0x92, 0x2e, 0x00, 0x48, 0x1e, 0x22 +# CHECK-NEXT: $sp = frame-destroy ADDVL_XXI $sp, 3, implicit $vg +# CHECK-NEXT: frame-destroy CFI_INSTRUCTION def_cfa $wsp, 16  # CHECK-NEXT: early-clobber $sp, $fp = frame-destroy LDRXpost $sp, 16 :: (load (s64) from %stack.4)  # CHECK-NEXT: frame-destroy CFI_INSTRUCTION 
def_cfa_offset 0  # CHECK-NEXT: frame-destroy CFI_INSTRUCTION restore $w29 @@ -100,38 +92,26 @@  # ASM:       str x29, [sp, #-16]!  # ASM-NEXT:  .cfi_def_cfa_offset 16  # ASM-NEXT:  .cfi_offset w29, -16 -# ASM-NEXT:  sub sp, sp, #1024 -# ASM-NEXT:  .cfi_def_cfa_offset 1040 -# ASM-NEXT:  addvl sp, sp, #-1 -# ASM-NEXT:  .cfi_escape 0x0f, 0x09, 0x8f, 0x90, 0x08, 0x92, 0x2e, 0x00, 0x38, 0x1e, 0x22 // sp + 1040 + 8 * VG -# ASM-NEXT:  sub sp, sp, #1040 -# ASM-NEXT:  .cfi_escape 0x0f, 0x09, 0x8f, 0xa0, 0x10, 0x92, 0x2e, 0x00, 0x38, 0x1e, 0x22 // sp + 2080 + 8 * VG -# ASM-NEXT:  addvl sp, sp, #-2 +# ASM-NEXT:  sub sp, sp, #2064 +# ASM-NEXT:  .cfi_def_cfa_offset 2080 +# ASM-NEXT:  addvl sp, sp, #-3  # ASM-NEXT:  .cfi_escape 0x0f, 0x09, 0x8f, 0xa0, 0x10, 0x92, 0x2e, 0x00, 0x48, 0x1e, 0x22 // sp + 2080 + 24 * VG  # -# ASM:	     addvl sp, sp, #2 -# ASM-NEXT:  .cfi_escape 0x0f, 0x09, 0x8f, 0xa0, 0x10, 0x92, 0x2e, 0x00, 0x38, 0x1e, 0x22 // sp + 2080 + 8 * VG -# ASM-NEXT:  add sp, sp, #1024 -# ASM-NEXT:  .cfi_escape 0x0f, 0x09, 0x8f, 0xa0, 0x08, 0x92, 0x2e, 0x00, 0x38, 0x1e, 0x22 // sp + 1056 + 8 * VG -# ASM-NEXT:  addvl sp, sp, #1 -# ASM-NEXT:  .cfi_def_cfa wsp, 1056 -# ASM-NEXT:  add sp, sp, #1040 -# ASM-NEXT:  .cfi_def_cfa_offset 16 +# ASM:	     add sp, sp, #2064 +# ASM-NEXT:  .cfi_escape 0x0f, 0x08, 0x8f, 0x10, 0x92, 0x2e, 0x00, 0x48, 0x1e, 0x22 // sp + 16 + 24 * VG +# ASM-NEXT:  addvl   sp, sp, #3 +# ASM-NEXT:  .cfi_def_cfa wsp, 16  # ASM-NEXT:  ldr x29, [sp], #16  # ASM-NEXT:  .cfi_def_cfa_offset 0  # ASM-NEXT:  .cfi_restore w29  # UNWINDINFO:      DW_CFA_def_cfa_offset: +16  # UNWINDINFO-NEXT: DW_CFA_offset: reg29 -16 -# UNWINDINFO:      DW_CFA_def_cfa_offset: +1040 -# UNWINDINFO:      DW_CFA_def_cfa_expression: DW_OP_breg31 +1040, DW_OP_bregx 0x2e +0, DW_OP_lit8, DW_OP_mul, DW_OP_plus -# UNWINDINFO:      DW_CFA_def_cfa_expression: DW_OP_breg31 +2080, DW_OP_bregx 0x2e +0, DW_OP_lit8, DW_OP_mul, DW_OP_plus +# UNWINDINFO:      DW_CFA_def_cfa_offset: +2080  # UNWINDINFO:      DW_CFA_def_cfa_expression: DW_OP_breg31 +2080, DW_OP_bregx 0x2e +0, DW_OP_lit24, DW_OP_mul, DW_OP_plus  # -# UNWINDINFO:      DW_CFA_def_cfa_expression: DW_OP_breg31 +2080, DW_OP_bregx 0x2e +0, DW_OP_lit8, DW_OP_mul, DW_OP_plus -# UNWINDINFO:      DW_CFA_def_cfa_expression: DW_OP_breg31 +1056, DW_OP_bregx 0x2e +0, DW_OP_lit8, DW_OP_mul, DW_OP_plus -# UNWINDINFO:      DW_CFA_def_cfa: reg31 +1056 -# UNWINDINFO:      DW_CFA_def_cfa_offset: +16 +# UNWINDINFO:      DW_CFA_def_cfa_expression: DW_OP_breg31 +16, DW_OP_bregx 0x2e +0, DW_OP_lit24, DW_OP_mul, DW_OP_plus +# UNWINDINFO:      DW_CFA_def_cfa: reg31 +16  # UNWINDINFO:      DW_CFA_def_cfa_offset: +0  # UNWINDINFO-NEXT: DW_CFA_restore: reg29 @@ -270,13 +250,9 @@ body:             |  # CHECK-NEXT: early-clobber $sp = frame-setup STRXpre killed $fp, $sp, -16 :: (store (s64) into %stack.5)  # CHECK-NEXT: frame-setup CFI_INSTRUCTION def_cfa_offset 16  # CHECK-NEXT: frame-setup CFI_INSTRUCTION offset $w29, -16 -# CHECK-NEXT: $sp = frame-setup SUBXri $sp, 1024, 0 -# CHECK-NEXT: frame-setup CFI_INSTRUCTION def_cfa_offset 1040 -# CHECK-NEXT: $sp = frame-setup ADDVL_XXI $sp, -1, implicit $vg -# CHECK-NEXT: frame-setup CFI_INSTRUCTION escape 0x0f, 0x09, 0x8f, 0x90, 0x08, 0x92, 0x2e, 0x00, 0x38, 0x1e, 0x22 -# CHECK-NEXT: $sp = frame-setup SUBXri $sp, 1040, 0 -# CHECK-NEXT: frame-setup CFI_INSTRUCTION escape 0x0f, 0x09, 0x8f, 0xa0, 0x10, 0x92, 0x2e, 0x00, 0x38, 0x1e, 0x22 -# CHECK-NEXT: $sp = frame-setup ADDVL_XXI $sp, -2, implicit $vg +# CHECK-NEXT: $sp = frame-setup SUBXri $sp, 2064, 0 +# 
CHECK-NEXT: frame-setup CFI_INSTRUCTION def_cfa_offset 2080 +# CHECK-NEXT: $sp = frame-setup ADDVL_XXI $sp, -3, implicit $vg  # CHECK-NEXT: frame-setup CFI_INSTRUCTION escape 0x0f, 0x09, 0x8f, 0xa0, 0x10, 0x92, 0x2e, 0x00, 0x48, 0x1e, 0x22  #  # CHECK-NEXT: $[[TMP:x[0-9]+]] = ADDXri $sp, 1040, 0 @@ -286,14 +262,10 @@ body:             |  # CHECK-NEXT: $[[TMP:x[0-9]+]] = ADDXri $sp, 2064, 0  # CHECK-NEXT: STR_PXI $p0, killed $[[TMP]], 23  # -# CHECK-NEXT: $sp = frame-destroy ADDVL_XXI $sp, 2, implicit $vg -# CHECK-NEXT: frame-destroy CFI_INSTRUCTION escape 0x0f, 0x09, 0x8f, 0xa0, 0x10, 0x92, 0x2e, 0x00, 0x38, 0x1e, 0x22 -# CHECK-NEXT: $sp = frame-destroy ADDXri $sp, 1024, 0 -# CHECK-NEXT: frame-destroy CFI_INSTRUCTION escape 0x0f, 0x09, 0x8f, 0xa0, 0x08, 0x92, 0x2e, 0x00, 0x38, 0x1e, 0x22 -# CHECK-NEXT: $sp = frame-destroy ADDVL_XXI $sp, 1, implicit $vg -# CHECK-NEXT: frame-destroy CFI_INSTRUCTION def_cfa $wsp, 1056 -# CHECK-NEXT: $sp = frame-destroy ADDXri $sp, 1040, 0 -# CHECK-NEXT: frame-destroy CFI_INSTRUCTION def_cfa_offset 16 +# CHECK-NEXT: $sp = frame-destroy ADDXri $sp, 2064, 0 +# CHECK-NEXT: frame-destroy CFI_INSTRUCTION escape 0x0f, 0x08, 0x8f, 0x10, 0x92, 0x2e, 0x00, 0x48, 0x1e, 0x22 +# CHECK-NEXT: $sp = frame-destroy ADDVL_XXI $sp, 3, implicit $vg +# CHECK-NEXT: frame-destroy CFI_INSTRUCTION def_cfa $wsp, 16  # CHECK-NEXT: early-clobber $sp, $fp = frame-destroy LDRXpost $sp, 16 :: (load (s64) from %stack.5)  # CHECK-NEXT: frame-destroy CFI_INSTRUCTION def_cfa_offset 0  # CHECK-NEXT: frame-destroy CFI_INSTRUCTION restore $w29 @@ -303,38 +275,27 @@ body:             |  # ASM:       str x29, [sp, #-16]!  # ASM-NEXT:  .cfi_def_cfa_offset 16  # ASM-NEXT:  .cfi_offset w29, -16 -# ASM-NEXT:  sub sp, sp, #1024 -# ASM-NEXT:  .cfi_def_cfa_offset 1040 -# ASM-NEXT:  addvl sp, sp, #-1 -# ASM-NEXT:  .cfi_escape 0x0f, 0x09, 0x8f, 0x90, 0x08, 0x92, 0x2e, 0x00, 0x38, 0x1e, 0x22 // sp + 1040 + 8 * VG -# ASM-NEXT:  sub sp, sp, #1040 -# ASM-NEXT:  .cfi_escape 0x0f, 0x09, 0x8f, 0xa0, 0x10, 0x92, 0x2e, 0x00, 0x38, 0x1e, 0x22 // sp + 2080 + 8 * VG -# ASM-NEXT:  addvl sp, sp, #-2 +# ASM-NEXT:  sub sp, sp, #2064 +# ASM-NEXT:  .cfi_def_cfa_offset 2080 +# ASM-NEXT:  addvl sp, sp, #-3  # ASM-NEXT:  .cfi_escape 0x0f, 0x09, 0x8f, 0xa0, 0x10, 0x92, 0x2e, 0x00, 0x48, 0x1e, 0x22 // sp + 2080 + 24 * VG  # -# ASM:       addvl sp, sp, #2 -# ASM-NEXT:  .cfi_escape 0x0f, 0x09, 0x8f, 0xa0, 0x10, 0x92, 0x2e, 0x00, 0x38, 0x1e, 0x22 // sp + 2080 + 8 * VG -# ASM-NEXT:  add sp, sp, #1024 -# ASM-NEXT:  .cfi_escape 0x0f, 0x09, 0x8f, 0xa0, 0x08, 0x92, 0x2e, 0x00, 0x38, 0x1e, 0x22 // sp + 1056 + 8 * VG -# ASM-NEXT:  addvl sp, sp, #1 -# ASM-NEXT:  .cfi_def_cfa wsp, 1056 -# ASM-NEXT:  add sp, sp, #1040 -# ASM-NEXT:  .cfi_def_cfa_offset 16 +# ASM:       add sp, sp, #2064 +# ASM-NEXT:  .cfi_escape 0x0f, 0x08, 0x8f, 0x10, 0x92, 0x2e, 0x00, 0x48, 0x1e, 0x22 // sp + 16 + 24 * VG +# ASM-NEXT:  addvl   sp, sp, #3 +# ASM-NEXT:  .cfi_def_cfa wsp, 16  # ASM-NEXT:  ldr x29, [sp], #16  # ASM-NEXT:  .cfi_def_cfa_offset 0  # ASM-NEXT:  .cfi_restore w29 +# ASM-NEXT:  ret  # UNWINDINFO: DW_CFA_def_cfa_offset: +16  # UNWINDINFO-NEXT: DW_CFA_offset: reg29 -16 -# UNWINDINFO: DW_CFA_def_cfa_offset: +1040 -# UNWINDINFO: DW_CFA_def_cfa_expression: DW_OP_breg31 +1040, DW_OP_bregx 0x2e +0, DW_OP_lit8, DW_OP_mul, DW_OP_plus -# UNWINDINFO: DW_CFA_def_cfa_expression: DW_OP_breg31 +2080, DW_OP_bregx 0x2e +0, DW_OP_lit8, DW_OP_mul, DW_OP_plus +# UNWINDINFO: DW_CFA_def_cfa_offset: +2080  # UNWINDINFO: DW_CFA_def_cfa_expression: DW_OP_breg31 +2080, 
DW_OP_bregx 0x2e +0, DW_OP_lit24, DW_OP_mul, DW_OP_plus  # -# UNWINDINFO: DW_CFA_def_cfa_expression: DW_OP_breg31 +2080, DW_OP_bregx 0x2e +0, DW_OP_lit8, DW_OP_mul, DW_OP_plus -# UNWINDINFO: DW_CFA_def_cfa_expression: DW_OP_breg31 +1056, DW_OP_bregx 0x2e +0, DW_OP_lit8, DW_OP_mul, DW_OP_plus -# UNWINDINFO: DW_CFA_def_cfa: reg31 +1056 -# UNWINDINFO: DW_CFA_def_cfa_offset: +16 +# UNWINDINFO: DW_CFA_def_cfa_expression: DW_OP_breg31 +16, DW_OP_bregx 0x2e +0, DW_OP_lit24, DW_OP_mul, DW_OP_plus +# UNWINDINFO: DW_CFA_def_cfa: reg31 +16  # UNWINDINFO: DW_CFA_def_cfa_offset: +0  # UNWINDINFO-NEXT: DW_CFA_restore: reg29 @@ -385,10 +346,8 @@ body:             |  # CHECK-NEXT: frame-setup CFI_INSTRUCTION def_cfa $w29, 16  # CHECK-NEXT: frame-setup CFI_INSTRUCTION offset $w30, -8  # CHECK-NEXT: frame-setup CFI_INSTRUCTION offset $w29, -16 -# CHECK-NEXT: $sp = frame-setup SUBXri $sp, 1024, 0 -# CHECK-NEXT: $sp = frame-setup ADDVL_XXI $sp, -1, implicit $vg -# CHECK-NEXT: $sp = frame-setup SUBXri $sp, 1040, 0 -# CHECK-NEXT: $sp = frame-setup ADDVL_XXI $sp, -2, implicit $vg +# CHECK-NEXT: $sp = frame-setup SUBXri $sp, 2064, 0 +# CHECK-NEXT: $sp = frame-setup ADDVL_XXI $sp, -3, implicit $vg  #  # CHECK-NEXT: $[[TMP:x[0-9]+]] = SUBXri $fp, 1024, 0  # CHECK-NEXT: STR_ZXI $z0, killed $[[TMP]], -2 @@ -396,10 +355,8 @@ body:             |  # CHECK-NEXT: STR_ZXI $z1, killed $[[TMP]], -3  # CHECK-NEXT: STR_PXI $p0, $fp, -1  # -# CHECK-NEXT: $sp = frame-destroy ADDVL_XXI $sp, 2, implicit $vg -# CHECK-NEXT: $sp = frame-destroy ADDXri $sp, 1024, 0 -# CHECK-NEXT: $sp = frame-destroy ADDVL_XXI $sp, 1, implicit $vg -# CHECK-NEXT: $sp = frame-destroy ADDXri $sp, 1040, 0 +# CHECK-NEXT: $sp = frame-destroy ADDXri $sp, 2064, 0 +# CHECK-NEXT: $sp = frame-destroy ADDVL_XXI $sp, 3, implicit $vg  # CHECK-NEXT: frame-destroy CFI_INSTRUCTION def_cfa $wsp, 16  # CHECK-NEXT: early-clobber $sp, $fp, $lr = frame-destroy LDPXpost $sp, 2 :: (load (s64) from %stack.6), (load (s64) from %stack.5)  # CHECK-NEXT: frame-destroy CFI_INSTRUCTION def_cfa_offset 0 @@ -414,15 +371,11 @@ body:             |  # ASM-NEXT:  .cfi_def_cfa w29, 16  # ASM-NEXT:  .cfi_offset w30, -8  # ASM-NEXT:  .cfi_offset w29, -16 -# ASM-NEXT:  sub sp, sp, #1024 -# ASM-NEXT:  addvl sp, sp, #-1 -# ASM-NEXT:  sub sp, sp, #1040 -# ASM-NEXT:  addvl sp, sp, #-2 +# ASM-NEXT:  sub sp, sp, #2064 +# ASM-NEXT:  addvl sp, sp, #-3  # -# ASM:       addvl sp, sp, #2 -# ASM-NEXT:  add sp, sp, #1024 -# ASM-NEXT:  addvl sp, sp, #1 -# ASM-NEXT:  add sp, sp, #1040 +# ASM:       add sp, sp, #2064 +# ASM-NEXT:  addvl sp, sp, #3  # ASM-NEXT:  .cfi_def_cfa wsp, 16  # ASM-NEXT:  ldp x29, x30, [sp], #16  # ASM-NEXT:  .cfi_def_cfa_offset 0 diff --git a/llvm/test/CodeGen/AArch64/replace-with-veclib-armpl.ll b/llvm/test/CodeGen/AArch64/replace-with-veclib-armpl.ll index 71c6380..8a0ac6d 100644 --- a/llvm/test/CodeGen/AArch64/replace-with-veclib-armpl.ll +++ b/llvm/test/CodeGen/AArch64/replace-with-veclib-armpl.ll @@ -780,6 +780,7 @@ define <vscale x 4 x float> @llvm_tanh_vscale_f32(<vscale x 4 x float> %in) #0 {  attributes #0 = { "target-features"="+sve" }  ;. -; CHECK: attributes #[[ATTR0:[0-9]+]] = { nocallback nofree nosync nounwind speculatable willreturn memory(none) } +; CHECK: attributes #[[ATTR0:[0-9]+]] = { nocallback nocreateundeforpoison nofree nosync nounwind speculatable willreturn memory(none) }  ; CHECK: attributes #[[ATTR1]] = { "target-features"="+sve" } +; CHECK: attributes #[[ATTR2:[0-9]+]] = { nocallback nofree nosync nounwind speculatable willreturn memory(none) }  ;. 
diff --git a/llvm/test/CodeGen/AArch64/split-sve-stack-frame-layout.ll b/llvm/test/CodeGen/AArch64/split-sve-stack-frame-layout.ll index 690a39d..c13dd33 100644 --- a/llvm/test/CodeGen/AArch64/split-sve-stack-frame-layout.ll +++ b/llvm/test/CodeGen/AArch64/split-sve-stack-frame-layout.ll @@ -19,20 +19,16 @@ define void @zpr_and_ppr_local(<vscale x 16 x i1> %pred, <vscale x 16 x i8> %vec  ; CHECK-LABEL: zpr_and_ppr_local:  ; CHECK:       // %bb.0:  ; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill -; CHECK-NEXT:    sub sp, sp, #1024 -; CHECK-NEXT:    addvl sp, sp, #-1 -; CHECK-NEXT:    sub sp, sp, #1024 -; CHECK-NEXT:    addvl sp, sp, #-1 +; CHECK-NEXT:    sub sp, sp, #2048 +; CHECK-NEXT:    addvl sp, sp, #-2  ; CHECK-NEXT:    .cfi_escape 0x0f, 0x09, 0x8f, 0x90, 0x10, 0x92, 0x2e, 0x00, 0x40, 0x1e, 0x22 // sp + 2064 + 16 * VG  ; CHECK-NEXT:    .cfi_offset w29, -16  ; CHECK-NEXT:    add x8, sp, #2048  ; CHECK-NEXT:    str p0, [x8, #15, mul vl]  ; CHECK-NEXT:    add x8, sp, #1024  ; CHECK-NEXT:    str z0, [x8] -; CHECK-NEXT:    addvl sp, sp, #1 -; CHECK-NEXT:    add sp, sp, #1024 -; CHECK-NEXT:    addvl sp, sp, #1 -; CHECK-NEXT:    add sp, sp, #1024 +; CHECK-NEXT:    add sp, sp, #2048 +; CHECK-NEXT:    addvl sp, sp, #2  ; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload  ; CHECK-NEXT:    ret    %ppr_local = alloca <vscale x 16 x i1> @@ -62,20 +58,16 @@ define void @zpr_and_ppr_local_fp(<vscale x 16 x i1> %pred, <vscale x 16 x i8> %  ; CHECK:       // %bb.0:  ; CHECK-NEXT:    stp x29, x30, [sp, #-16]! // 16-byte Folded Spill  ; CHECK-NEXT:    mov x29, sp -; CHECK-NEXT:    sub sp, sp, #1024 -; CHECK-NEXT:    addvl sp, sp, #-1 -; CHECK-NEXT:    sub sp, sp, #1024 -; CHECK-NEXT:    addvl sp, sp, #-1 +; CHECK-NEXT:    sub sp, sp, #2048 +; CHECK-NEXT:    addvl sp, sp, #-2  ; CHECK-NEXT:    .cfi_def_cfa w29, 16  ; CHECK-NEXT:    .cfi_offset w30, -8  ; CHECK-NEXT:    .cfi_offset w29, -16  ; CHECK-NEXT:    sub x8, x29, #1024  ; CHECK-NEXT:    str p0, [x29, #-1, mul vl]  ; CHECK-NEXT:    str z0, [x8, #-2, mul vl] -; CHECK-NEXT:    addvl sp, sp, #1 -; CHECK-NEXT:    add sp, sp, #1024 -; CHECK-NEXT:    addvl sp, sp, #1 -; CHECK-NEXT:    add sp, sp, #1024 +; CHECK-NEXT:    add sp, sp, #2048 +; CHECK-NEXT:    addvl sp, sp, #2  ; CHECK-NEXT:    ldp x29, x30, [sp], #16 // 16-byte Folded Reload  ; CHECK-NEXT:    ret    %ppr_local = alloca <vscale x 16 x i1> @@ -103,17 +95,15 @@ define void @fpr_and_ppr_local(<vscale x 16 x i1> %pred, double %double) "aarch6  ; CHECK-LABEL: fpr_and_ppr_local:  ; CHECK:       // %bb.0:  ; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill -; CHECK-NEXT:    sub sp, sp, #1024 +; CHECK-NEXT:    sub sp, sp, #2064  ; CHECK-NEXT:    addvl sp, sp, #-1 -; CHECK-NEXT:    sub sp, sp, #1040  ; CHECK-NEXT:    .cfi_escape 0x0f, 0x09, 0x8f, 0xa0, 0x10, 0x92, 0x2e, 0x00, 0x38, 0x1e, 0x22 // sp + 2080 + 8 * VG  ; CHECK-NEXT:    .cfi_offset w29, -16  ; CHECK-NEXT:    add x8, sp, #2064  ; CHECK-NEXT:    str p0, [x8, #7, mul vl]  ; CHECK-NEXT:    str d0, [sp, #1032] -; CHECK-NEXT:    add sp, sp, #1024 +; CHECK-NEXT:    add sp, sp, #2064  ; CHECK-NEXT:    addvl sp, sp, #1 -; CHECK-NEXT:    add sp, sp, #1040  ; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload  ; CHECK-NEXT:    ret    %ppr_local = alloca <vscale x 16 x i1> @@ -144,17 +134,15 @@ define void @fpr_and_ppr_local_fp(<vscale x 16 x i1> %pred, double %double) "aar  ; CHECK:       // %bb.0:  ; CHECK-NEXT:    stp x29, x30, [sp, #-16]! 
// 16-byte Folded Spill  ; CHECK-NEXT:    mov x29, sp -; CHECK-NEXT:    sub sp, sp, #1024 +; CHECK-NEXT:    sub sp, sp, #2064  ; CHECK-NEXT:    addvl sp, sp, #-1 -; CHECK-NEXT:    sub sp, sp, #1040  ; CHECK-NEXT:    .cfi_def_cfa w29, 16  ; CHECK-NEXT:    .cfi_offset w30, -8  ; CHECK-NEXT:    .cfi_offset w29, -16  ; CHECK-NEXT:    str p0, [x29, #-1, mul vl]  ; CHECK-NEXT:    str d0, [sp, #1032] -; CHECK-NEXT:    add sp, sp, #1024 +; CHECK-NEXT:    add sp, sp, #2064  ; CHECK-NEXT:    addvl sp, sp, #1 -; CHECK-NEXT:    add sp, sp, #1040  ; CHECK-NEXT:    ldp x29, x30, [sp], #16 // 16-byte Folded Reload  ; CHECK-NEXT:    ret    %ppr_local = alloca <vscale x 16 x i1> @@ -793,11 +781,8 @@ define void @zpr_and_ppr_local_stack_probing(<vscale x 16 x i1> %pred, <vscale x  ; CHECK-LABEL: zpr_and_ppr_local_stack_probing:  ; CHECK:       // %bb.0:  ; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill -; CHECK-NEXT:    sub sp, sp, #1024 -; CHECK-NEXT:    addvl sp, sp, #-1 -; CHECK-NEXT:    str xzr, [sp] -; CHECK-NEXT:    sub sp, sp, #1824 -; CHECK-NEXT:    addvl sp, sp, #-1 +; CHECK-NEXT:    sub sp, sp, #2848 +; CHECK-NEXT:    addvl sp, sp, #-2  ; CHECK-NEXT:    str xzr, [sp]  ; CHECK-NEXT:    .cfi_escape 0x0f, 0x09, 0x8f, 0xb0, 0x16, 0x92, 0x2e, 0x00, 0x40, 0x1e, 0x22 // sp + 2864 + 16 * VG  ; CHECK-NEXT:    .cfi_offset w29, -16 @@ -806,10 +791,8 @@ define void @zpr_and_ppr_local_stack_probing(<vscale x 16 x i1> %pred, <vscale x  ; CHECK-NEXT:    add x8, sp, #1824  ; CHECK-NEXT:    str z0, [x8]  ; CHECK-NEXT:    str x0, [sp] -; CHECK-NEXT:    addvl sp, sp, #1 -; CHECK-NEXT:    add sp, sp, #1024 -; CHECK-NEXT:    addvl sp, sp, #1 -; CHECK-NEXT:    add sp, sp, #1824 +; CHECK-NEXT:    add sp, sp, #2848 +; CHECK-NEXT:    addvl sp, sp, #2  ; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload  ; CHECK-NEXT:    ret    "probe-stack"="inline-asm" "stack-probe-size"="4096" "frame-pointer"="none" "aarch64_pstate_sm_compatible" diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-loads-stores.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-loads-stores.ll index becddae..b2ed8de 100644 --- a/llvm/test/CodeGen/AArch64/sve-fixed-length-loads-stores.ll +++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-loads-stores.ll @@ -1,19 +1,8 @@  ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py  ; RUN: llc -aarch64-sve-vector-bits-min=128  < %s | not grep ptrue  ; RUN: llc -aarch64-sve-vector-bits-min=256  < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_256 -; RUN: llc -aarch64-sve-vector-bits-min=384  < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_256  ; RUN: llc -aarch64-sve-vector-bits-min=512  < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512 -; RUN: llc -aarch64-sve-vector-bits-min=640  < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512 -; RUN: llc -aarch64-sve-vector-bits-min=768  < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512 -; RUN: llc -aarch64-sve-vector-bits-min=896  < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512  ; RUN: llc -aarch64-sve-vector-bits-min=1024 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_1024 -; RUN: llc -aarch64-sve-vector-bits-min=1152 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_1024 -; RUN: llc -aarch64-sve-vector-bits-min=1280 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_1024 -; RUN: llc -aarch64-sve-vector-bits-min=1408 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_1024 -; RUN: llc -aarch64-sve-vector-bits-min=1536 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_1024 -; RUN: llc 
-aarch64-sve-vector-bits-min=1664 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_1024 -; RUN: llc -aarch64-sve-vector-bits-min=1792 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_1024 -; RUN: llc -aarch64-sve-vector-bits-min=1920 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_1024  ; RUN: llc -aarch64-sve-vector-bits-min=2048 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_2048  target triple = "aarch64-unknown-linux-gnu" diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/memory-legalizer-atomic-fence.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/memory-legalizer-atomic-fence.ll index e86f747..37b5422 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/memory-legalizer-atomic-fence.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/memory-legalizer-atomic-fence.ll @@ -1,11 +1,11 @@  ; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 4 -; RUN: llc -global-isel -stop-after=si-memory-legalizer -mtriple=amdgcn-amd- -mcpu=gfx600 < %s | FileCheck -check-prefix=GFX6 %s -; RUN: llc -global-isel -stop-after=si-memory-legalizer -mtriple=amdgcn-amd- -mcpu=gfx803 < %s | FileCheck -check-prefix=GFX8 %s -; RUN: llc -global-isel -stop-after=si-memory-legalizer -mtriple=amdgcn-amd-amdhsa -mcpu=gfx803 < %s | FileCheck -check-prefix=GFX8 %s -; RUN: llc -global-isel -stop-after=si-memory-legalizer -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 < %s | FileCheck -check-prefix=GFX10WGP %s -; RUN: llc -global-isel -stop-after=si-memory-legalizer -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -mattr=+cumode < %s | FileCheck -check-prefix=GFX10CU %s -; RUN: llc -global-isel -stop-after=si-memory-legalizer -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 < %s | FileCheck -check-prefix=GFX11WGP %s -; RUN: llc -global-isel -stop-after=si-memory-legalizer -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -mattr=+cumode < %s | FileCheck -check-prefix=GFX11CU %s +; RUN: llc -global-isel -new-reg-bank-select -stop-after=si-memory-legalizer -mtriple=amdgcn-amd- -mcpu=gfx600 < %s | FileCheck -check-prefix=GFX6 %s +; RUN: llc -global-isel -new-reg-bank-select -stop-after=si-memory-legalizer -mtriple=amdgcn-amd- -mcpu=gfx803 < %s | FileCheck -check-prefix=GFX8 %s +; RUN: llc -global-isel -new-reg-bank-select -stop-after=si-memory-legalizer -mtriple=amdgcn-amd-amdhsa -mcpu=gfx803 < %s | FileCheck -check-prefix=GFX8 %s +; RUN: llc -global-isel -new-reg-bank-select -stop-after=si-memory-legalizer -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 < %s | FileCheck -check-prefix=GFX10WGP %s +; RUN: llc -global-isel -new-reg-bank-select -stop-after=si-memory-legalizer -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -mattr=+cumode < %s | FileCheck -check-prefix=GFX10CU %s +; RUN: llc -global-isel -new-reg-bank-select -stop-after=si-memory-legalizer -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 < %s | FileCheck -check-prefix=GFX11WGP %s +; RUN: llc -global-isel -new-reg-bank-select -stop-after=si-memory-legalizer -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -mattr=+cumode < %s | FileCheck -check-prefix=GFX11CU %s  ; Note: we use MIR test checks + stop after legalizer to prevent  ; tests from being optimized out. 
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/mmra.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/mmra.ll index 44b12a9..61a6137 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/mmra.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/mmra.ll @@ -1,5 +1,5 @@  ; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 2 -; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx900 -stop-after=finalize-isel < %s | FileCheck %s +; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx900 -stop-after=finalize-isel < %s | FileCheck %s  declare void @readsMem(ptr) #0  declare void @writesMem(ptr) #1 diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-cs-chain-fp-nosave.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-cs-chain-fp-nosave.ll new file mode 100644 index 0000000..06150e42 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/amdgpu-cs-chain-fp-nosave.ll @@ -0,0 +1,519 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 < %s 2>&1 | FileCheck -check-prefix=GFX12 %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx942 < %s 2>&1 | FileCheck -check-prefix=GFX942 %s + +; These situations are "special" in that they either have an alloca that is not +; in the entry block or that they have a dynamic alloca. Both situations affect +; prolog/epilog generation. + +declare amdgpu_gfx void @foo() + +define amdgpu_cs_chain void @test_alloca() { +; GFX12-LABEL: test_alloca: +; GFX12:       ; %bb.0: ; %.entry +; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT:    s_wait_expcnt 0x0 +; GFX12-NEXT:    s_wait_samplecnt 0x0 +; GFX12-NEXT:    s_wait_bvhcnt 0x0 +; GFX12-NEXT:    s_wait_kmcnt 0x0 +; GFX12-NEXT:    v_mov_b32_e32 v0, 0 +; GFX12-NEXT:    s_mov_b32 s32, 16 +; GFX12-NEXT:    s_wait_alu 0xfffe +; GFX12-NEXT:    s_mov_b32 s0, s32 +; GFX12-NEXT:    s_wait_alu 0xfffe +; GFX12-NEXT:    s_add_co_i32 s32, s0, 0x200 +; GFX12-NEXT:    scratch_store_b32 off, v0, s0 +; GFX12-NEXT:    s_endpgm +; +; GFX942-LABEL: test_alloca: +; GFX942:       ; %bb.0: ; %.entry +; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT:    s_mov_b32 s32, 16 +; GFX942-NEXT:    s_mov_b32 s0, s32 +; GFX942-NEXT:    v_mov_b32_e32 v0, 0 +; GFX942-NEXT:    s_add_i32 s32, s0, 0x400 +; GFX942-NEXT:    scratch_store_dword off, v0, s0 +; GFX942-NEXT:    s_endpgm +.entry: +  br label %SW_C + +SW_C:                                             ; preds = %.entry +  %v = alloca i32, i32 1, align 4, addrspace(5) +  store i32 0, ptr addrspace(5) %v, align 4 +  ret void +} + +define amdgpu_cs_chain void @test_alloca_var_uniform(i32 inreg %count) { +; GFX12-LABEL: test_alloca_var_uniform: +; GFX12:       ; %bb.0: +; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT:    s_wait_expcnt 0x0 +; GFX12-NEXT:    s_wait_samplecnt 0x0 +; GFX12-NEXT:    s_wait_bvhcnt 0x0 +; GFX12-NEXT:    s_wait_kmcnt 0x0 +; GFX12-NEXT:    s_lshl_b32 s0, s0, 2 +; GFX12-NEXT:    v_mov_b32_e32 v0, 0 +; GFX12-NEXT:    s_wait_alu 0xfffe +; GFX12-NEXT:    s_add_co_i32 s0, s0, 15 +; GFX12-NEXT:    s_mov_b32 s32, 16 +; GFX12-NEXT:    s_wait_alu 0xfffe +; GFX12-NEXT:    s_and_b32 s0, s0, -16 +; GFX12-NEXT:    s_mov_b32 s1, s32 +; GFX12-NEXT:    s_wait_alu 0xfffe +; GFX12-NEXT:    s_lshl_b32 s0, s0, 5 +; GFX12-NEXT:    scratch_store_b32 off, v0, s1 +; GFX12-NEXT:    s_wait_alu 0xfffe +; GFX12-NEXT:    s_add_co_i32 s32, s1, s0 +; GFX12-NEXT:    s_endpgm +; +; GFX942-LABEL: test_alloca_var_uniform: +; GFX942:       ; %bb.0: +; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) +; GFX942-NEXT:    s_lshl_b32 s0, s0, 2 +; GFX942-NEXT:    s_add_i32 s0, s0, 15 +; GFX942-NEXT:    s_mov_b32 s32, 16 +; GFX942-NEXT:    s_and_b32 s0, s0, -16 +; GFX942-NEXT:    v_mov_b32_e32 v0, 0 +; GFX942-NEXT:    s_lshl_b32 s0, s0, 6 +; GFX942-NEXT:    s_mov_b32 s1, s32 +; GFX942-NEXT:    s_add_i32 s32, s1, s0 +; GFX942-NEXT:    scratch_store_dword off, v0, s1 +; GFX942-NEXT:    s_endpgm +  %v = alloca i32, i32 %count, align 4, addrspace(5) +  store i32 0, ptr addrspace(5) %v, align 4 +  ret void +} + +define amdgpu_cs_chain void @test_alloca_var(i32 %count) { +; GFX12-LABEL: test_alloca_var: +; GFX12:       ; %bb.0: +; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT:    s_wait_expcnt 0x0 +; GFX12-NEXT:    s_wait_samplecnt 0x0 +; GFX12-NEXT:    s_wait_bvhcnt 0x0 +; GFX12-NEXT:    s_wait_kmcnt 0x0 +; GFX12-NEXT:    v_lshl_add_u32 v0, v8, 2, 15 +; GFX12-NEXT:    s_mov_b32 s1, exec_lo +; GFX12-NEXT:    s_mov_b32 s0, 0 +; GFX12-NEXT:    s_mov_b32 s32, 16 +; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_and_b32 v1, -16, v0 +; GFX12-NEXT:  .LBB2_1: ; =>This Inner Loop Header: Depth=1 +; GFX12-NEXT:    s_wait_alu 0xfffe +; GFX12-NEXT:    s_ctz_i32_b32 s2, s1 +; GFX12-NEXT:    s_wait_alu 0xfffe +; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT:    v_readlane_b32 s3, v1, s2 +; GFX12-NEXT:    s_bitset0_b32 s1, s2 +; GFX12-NEXT:    s_max_u32 s0, s0, s3 +; GFX12-NEXT:    s_wait_alu 0xfffe +; GFX12-NEXT:    s_cmp_lg_u32 s1, 0 +; GFX12-NEXT:    s_cbranch_scc1 .LBB2_1 +; GFX12-NEXT:  ; %bb.2: +; GFX12-NEXT:    s_mov_b32 s1, s32 +; GFX12-NEXT:    s_wait_alu 0xfffe +; GFX12-NEXT:    v_lshl_add_u32 v1, s0, 5, s1 +; GFX12-NEXT:    scratch_store_b32 off, v0, s1 +; GFX12-NEXT:    v_readfirstlane_b32 s32, v1 +; GFX12-NEXT:    s_endpgm +; +; GFX942-LABEL: test_alloca_var: +; GFX942:       ; %bb.0: +; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT:    v_lshl_add_u32 v0, v8, 2, 15 +; GFX942-NEXT:    v_and_b32_e32 v1, -16, v0 +; GFX942-NEXT:    v_mov_b32_e32 v0, 0 +; GFX942-NEXT:    s_mov_b64 s[0:1], exec +; GFX942-NEXT:    s_mov_b32 s2, 0 +; GFX942-NEXT:    s_mov_b32 s32, 16 +; GFX942-NEXT:  .LBB2_1: ; =>This Inner Loop Header: Depth=1 +; GFX942-NEXT:    s_ff1_i32_b64 s3, s[0:1] +; GFX942-NEXT:    v_readlane_b32 s4, v1, s3 +; GFX942-NEXT:    s_bitset0_b64 s[0:1], s3 +; GFX942-NEXT:    s_max_u32 s2, s2, s4 +; GFX942-NEXT:    s_cmp_lg_u64 s[0:1], 0 +; GFX942-NEXT:    s_cbranch_scc1 .LBB2_1 +; GFX942-NEXT:  ; %bb.2: +; GFX942-NEXT:    s_mov_b32 s0, s32 +; GFX942-NEXT:    v_mov_b32_e32 v1, s0 +; GFX942-NEXT:    v_lshl_add_u32 v1, s2, 6, v1 +; GFX942-NEXT:    scratch_store_dword off, v0, s0 +; GFX942-NEXT:    v_readfirstlane_b32 s32, v1 +; GFX942-NEXT:    s_endpgm +  %v = alloca i32, i32 %count, align 4, addrspace(5) +  store i32 0, ptr addrspace(5) %v, align 4 +  ret void +} + +define amdgpu_cs_chain void @test_alloca_and_call() { +; GFX12-LABEL: test_alloca_and_call: +; GFX12:       ; %bb.0: ; %.entry +; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT:    s_wait_expcnt 0x0 +; GFX12-NEXT:    s_wait_samplecnt 0x0 +; GFX12-NEXT:    s_wait_bvhcnt 0x0 +; GFX12-NEXT:    s_wait_kmcnt 0x0 +; GFX12-NEXT:    s_getpc_b64 s[0:1] +; GFX12-NEXT:    s_wait_alu 0xfffe +; GFX12-NEXT:    s_sext_i32_i16 s1, s1 +; GFX12-NEXT:    s_add_co_u32 s0, s0, foo@gotpcrel32@lo+12 +; GFX12-NEXT:    s_wait_alu 0xfffe +; GFX12-NEXT:    s_add_co_ci_u32 s1, s1, foo@gotpcrel32@hi+24 +; GFX12-NEXT:    v_mov_b32_e32 v0, 0 +; GFX12-NEXT:    
s_load_b64 s[0:1], s[0:1], 0x0 +; GFX12-NEXT:    s_mov_b32 s32, 16 +; GFX12-NEXT:    s_wait_alu 0xfffe +; GFX12-NEXT:    s_mov_b32 s2, s32 +; GFX12-NEXT:    s_wait_alu 0xfffe +; GFX12-NEXT:    s_add_co_i32 s32, s2, 0x200 +; GFX12-NEXT:    scratch_store_b32 off, v0, s2 +; GFX12-NEXT:    s_wait_kmcnt 0x0 +; GFX12-NEXT:    s_wait_alu 0xfffe +; GFX12-NEXT:    s_swappc_b64 s[30:31], s[0:1] +; GFX12-NEXT:    s_endpgm +; +; GFX942-LABEL: test_alloca_and_call: +; GFX942:       ; %bb.0: ; %.entry +; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT:    s_getpc_b64 s[0:1] +; GFX942-NEXT:    s_add_u32 s0, s0, foo@gotpcrel32@lo+4 +; GFX942-NEXT:    s_addc_u32 s1, s1, foo@gotpcrel32@hi+12 +; GFX942-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GFX942-NEXT:    s_mov_b32 s32, 16 +; GFX942-NEXT:    s_mov_b32 s2, s32 +; GFX942-NEXT:    v_mov_b32_e32 v0, 0 +; GFX942-NEXT:    s_add_i32 s32, s2, 0x400 +; GFX942-NEXT:    scratch_store_dword off, v0, s2 +; GFX942-NEXT:    s_waitcnt lgkmcnt(0) +; GFX942-NEXT:    s_swappc_b64 s[30:31], s[0:1] +; GFX942-NEXT:    s_endpgm +.entry: +  br label %SW_C + +SW_C:                                             ; preds = %.entry +  %v = alloca i32, i32 1, align 4, addrspace(5) +  store i32 0, ptr addrspace(5) %v, align 4 +  call amdgpu_gfx void @foo() +  ret void +} + +define amdgpu_cs_chain void @test_alloca_and_call_var_uniform(i32 inreg %count) { +; GFX12-LABEL: test_alloca_and_call_var_uniform: +; GFX12:       ; %bb.0: +; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT:    s_wait_expcnt 0x0 +; GFX12-NEXT:    s_wait_samplecnt 0x0 +; GFX12-NEXT:    s_wait_bvhcnt 0x0 +; GFX12-NEXT:    s_wait_kmcnt 0x0 +; GFX12-NEXT:    s_getpc_b64 s[2:3] +; GFX12-NEXT:    s_wait_alu 0xfffe +; GFX12-NEXT:    s_sext_i32_i16 s3, s3 +; GFX12-NEXT:    s_add_co_u32 s2, s2, foo@gotpcrel32@lo+12 +; GFX12-NEXT:    s_wait_alu 0xfffe +; GFX12-NEXT:    s_add_co_ci_u32 s3, s3, foo@gotpcrel32@hi+24 +; GFX12-NEXT:    s_lshl_b32 s0, s0, 2 +; GFX12-NEXT:    s_load_b64 s[2:3], s[2:3], 0x0 +; GFX12-NEXT:    s_add_co_i32 s0, s0, 15 +; GFX12-NEXT:    v_mov_b32_e32 v0, 0 +; GFX12-NEXT:    s_mov_b32 s32, 16 +; GFX12-NEXT:    s_wait_alu 0xfffe +; GFX12-NEXT:    s_and_b32 s0, s0, -16 +; GFX12-NEXT:    s_mov_b32 s1, s32 +; GFX12-NEXT:    s_wait_alu 0xfffe +; GFX12-NEXT:    s_lshl_b32 s0, s0, 5 +; GFX12-NEXT:    scratch_store_b32 off, v0, s1 +; GFX12-NEXT:    s_wait_alu 0xfffe +; GFX12-NEXT:    s_add_co_i32 s32, s1, s0 +; GFX12-NEXT:    s_wait_kmcnt 0x0 +; GFX12-NEXT:    s_wait_alu 0xfffe +; GFX12-NEXT:    s_swappc_b64 s[30:31], s[2:3] +; GFX12-NEXT:    s_endpgm +; +; GFX942-LABEL: test_alloca_and_call_var_uniform: +; GFX942:       ; %bb.0: +; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT:    s_lshl_b32 s0, s0, 2 +; GFX942-NEXT:    s_add_i32 s0, s0, 15 +; GFX942-NEXT:    s_and_b32 s0, s0, -16 +; GFX942-NEXT:    s_lshl_b32 s2, s0, 6 +; GFX942-NEXT:    s_getpc_b64 s[0:1] +; GFX942-NEXT:    s_add_u32 s0, s0, foo@gotpcrel32@lo+4 +; GFX942-NEXT:    s_addc_u32 s1, s1, foo@gotpcrel32@hi+12 +; GFX942-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GFX942-NEXT:    s_mov_b32 s32, 16 +; GFX942-NEXT:    v_mov_b32_e32 v0, 0 +; GFX942-NEXT:    s_mov_b32 s3, s32 +; GFX942-NEXT:    s_add_i32 s32, s3, s2 +; GFX942-NEXT:    scratch_store_dword off, v0, s3 +; GFX942-NEXT:    s_waitcnt lgkmcnt(0) +; GFX942-NEXT:    s_swappc_b64 s[30:31], s[0:1] +; GFX942-NEXT:    s_endpgm +  %v = alloca i32, i32 %count, align 4, addrspace(5) +  store i32 0, ptr addrspace(5) %v, align 4 +  call 
amdgpu_gfx void @foo() +  ret void +} + +define amdgpu_cs_chain void @test_alloca_and_call_var(i32 %count) { +; GFX12-LABEL: test_alloca_and_call_var: +; GFX12:       ; %bb.0: +; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT:    s_wait_expcnt 0x0 +; GFX12-NEXT:    s_wait_samplecnt 0x0 +; GFX12-NEXT:    s_wait_bvhcnt 0x0 +; GFX12-NEXT:    s_wait_kmcnt 0x0 +; GFX12-NEXT:    v_lshl_add_u32 v0, v8, 2, 15 +; GFX12-NEXT:    s_mov_b32 s1, exec_lo +; GFX12-NEXT:    s_mov_b32 s0, 0 +; GFX12-NEXT:    s_mov_b32 s32, 16 +; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_and_b32 v1, -16, v0 +; GFX12-NEXT:  .LBB5_1: ; =>This Inner Loop Header: Depth=1 +; GFX12-NEXT:    s_wait_alu 0xfffe +; GFX12-NEXT:    s_ctz_i32_b32 s2, s1 +; GFX12-NEXT:    s_wait_alu 0xfffe +; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT:    v_readlane_b32 s3, v1, s2 +; GFX12-NEXT:    s_bitset0_b32 s1, s2 +; GFX12-NEXT:    s_max_u32 s0, s0, s3 +; GFX12-NEXT:    s_wait_alu 0xfffe +; GFX12-NEXT:    s_cmp_lg_u32 s1, 0 +; GFX12-NEXT:    s_cbranch_scc1 .LBB5_1 +; GFX12-NEXT:  ; %bb.2: +; GFX12-NEXT:    s_getpc_b64 s[2:3] +; GFX12-NEXT:    s_wait_alu 0xfffe +; GFX12-NEXT:    s_sext_i32_i16 s3, s3 +; GFX12-NEXT:    s_add_co_u32 s2, s2, foo@gotpcrel32@lo+12 +; GFX12-NEXT:    s_wait_alu 0xfffe +; GFX12-NEXT:    s_add_co_ci_u32 s3, s3, foo@gotpcrel32@hi+24 +; GFX12-NEXT:    s_mov_b32 s1, s32 +; GFX12-NEXT:    s_load_b64 s[2:3], s[2:3], 0x0 +; GFX12-NEXT:    v_lshl_add_u32 v1, s0, 5, s1 +; GFX12-NEXT:    scratch_store_b32 off, v0, s1 +; GFX12-NEXT:    v_readfirstlane_b32 s32, v1 +; GFX12-NEXT:    s_wait_kmcnt 0x0 +; GFX12-NEXT:    s_wait_alu 0xf1ff +; GFX12-NEXT:    s_swappc_b64 s[30:31], s[2:3] +; GFX12-NEXT:    s_endpgm +; +; GFX942-LABEL: test_alloca_and_call_var: +; GFX942:       ; %bb.0: +; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT:    v_lshl_add_u32 v0, v8, 2, 15 +; GFX942-NEXT:    v_and_b32_e32 v1, -16, v0 +; GFX942-NEXT:    v_mov_b32_e32 v0, 0 +; GFX942-NEXT:    s_mov_b64 s[0:1], exec +; GFX942-NEXT:    s_mov_b32 s2, 0 +; GFX942-NEXT:    s_mov_b32 s32, 16 +; GFX942-NEXT:  .LBB5_1: ; =>This Inner Loop Header: Depth=1 +; GFX942-NEXT:    s_ff1_i32_b64 s3, s[0:1] +; GFX942-NEXT:    v_readlane_b32 s4, v1, s3 +; GFX942-NEXT:    s_bitset0_b64 s[0:1], s3 +; GFX942-NEXT:    s_max_u32 s2, s2, s4 +; GFX942-NEXT:    s_cmp_lg_u64 s[0:1], 0 +; GFX942-NEXT:    s_cbranch_scc1 .LBB5_1 +; GFX942-NEXT:  ; %bb.2: +; GFX942-NEXT:    s_getpc_b64 s[0:1] +; GFX942-NEXT:    s_add_u32 s0, s0, foo@gotpcrel32@lo+4 +; GFX942-NEXT:    s_addc_u32 s1, s1, foo@gotpcrel32@hi+12 +; GFX942-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GFX942-NEXT:    s_mov_b32 s3, s32 +; GFX942-NEXT:    v_mov_b32_e32 v1, s3 +; GFX942-NEXT:    v_lshl_add_u32 v1, s2, 6, v1 +; GFX942-NEXT:    scratch_store_dword off, v0, s3 +; GFX942-NEXT:    v_readfirstlane_b32 s32, v1 +; GFX942-NEXT:    s_waitcnt lgkmcnt(0) +; GFX942-NEXT:    s_swappc_b64 s[30:31], s[0:1] +; GFX942-NEXT:    s_endpgm +  %v = alloca i32, i32 %count, align 4, addrspace(5) +  store i32 0, ptr addrspace(5) %v, align 4 +  call amdgpu_gfx void @foo() +  ret void +} + +define amdgpu_cs_chain void @test_call_and_alloca() { +; GFX12-LABEL: test_call_and_alloca: +; GFX12:       ; %bb.0: ; %.entry +; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT:    s_wait_expcnt 0x0 +; GFX12-NEXT:    s_wait_samplecnt 0x0 +; GFX12-NEXT:    s_wait_bvhcnt 0x0 +; GFX12-NEXT:    s_wait_kmcnt 0x0 +; GFX12-NEXT:    s_getpc_b64 s[0:1] +; GFX12-NEXT:  
  s_wait_alu 0xfffe +; GFX12-NEXT:    s_sext_i32_i16 s1, s1 +; GFX12-NEXT:    s_add_co_u32 s0, s0, foo@gotpcrel32@lo+12 +; GFX12-NEXT:    s_wait_alu 0xfffe +; GFX12-NEXT:    s_add_co_ci_u32 s1, s1, foo@gotpcrel32@hi+24 +; GFX12-NEXT:    s_mov_b32 s32, 16 +; GFX12-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0 +; GFX12-NEXT:    s_mov_b32 s4, s32 +; GFX12-NEXT:    s_wait_alu 0xfffe +; GFX12-NEXT:    s_add_co_i32 s32, s4, 0x200 +; GFX12-NEXT:    s_wait_kmcnt 0x0 +; GFX12-NEXT:    s_wait_alu 0xfffe +; GFX12-NEXT:    s_swappc_b64 s[30:31], s[0:1] +; GFX12-NEXT:    v_mov_b32_e32 v0, 0 +; GFX12-NEXT:    scratch_store_b32 off, v0, s4 +; GFX12-NEXT:    s_endpgm +; +; GFX942-LABEL: test_call_and_alloca: +; GFX942:       ; %bb.0: ; %.entry +; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT:    s_getpc_b64 s[0:1] +; GFX942-NEXT:    s_add_u32 s0, s0, foo@gotpcrel32@lo+4 +; GFX942-NEXT:    s_addc_u32 s1, s1, foo@gotpcrel32@hi+12 +; GFX942-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GFX942-NEXT:    s_mov_b32 s32, 16 +; GFX942-NEXT:    s_mov_b32 s4, s32 +; GFX942-NEXT:    s_add_i32 s32, s4, 0x400 +; GFX942-NEXT:    s_waitcnt lgkmcnt(0) +; GFX942-NEXT:    s_swappc_b64 s[30:31], s[0:1] +; GFX942-NEXT:    v_mov_b32_e32 v0, 0 +; GFX942-NEXT:    scratch_store_dword off, v0, s4 +; GFX942-NEXT:    s_endpgm +.entry: +  br label %SW_C + +SW_C:                                             ; preds = %.entry +  %v = alloca i32, i32 1, align 4, addrspace(5) +  call amdgpu_gfx void @foo() +  store i32 0, ptr addrspace(5) %v, align 4 +  ret void +} + +define amdgpu_cs_chain void @test_call_and_alloca_var_uniform(i32 inreg %count) { +; GFX12-LABEL: test_call_and_alloca_var_uniform: +; GFX12:       ; %bb.0: +; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT:    s_wait_expcnt 0x0 +; GFX12-NEXT:    s_wait_samplecnt 0x0 +; GFX12-NEXT:    s_wait_bvhcnt 0x0 +; GFX12-NEXT:    s_wait_kmcnt 0x0 +; GFX12-NEXT:    s_getpc_b64 s[2:3] +; GFX12-NEXT:    s_wait_alu 0xfffe +; GFX12-NEXT:    s_sext_i32_i16 s3, s3 +; GFX12-NEXT:    s_add_co_u32 s2, s2, foo@gotpcrel32@lo+12 +; GFX12-NEXT:    s_wait_alu 0xfffe +; GFX12-NEXT:    s_add_co_ci_u32 s3, s3, foo@gotpcrel32@hi+24 +; GFX12-NEXT:    s_lshl_b32 s0, s0, 2 +; GFX12-NEXT:    s_load_b64 s[2:3], s[2:3], 0x0 +; GFX12-NEXT:    s_add_co_i32 s0, s0, 15 +; GFX12-NEXT:    s_mov_b32 s32, 16 +; GFX12-NEXT:    s_wait_alu 0xfffe +; GFX12-NEXT:    s_and_b32 s0, s0, -16 +; GFX12-NEXT:    s_mov_b32 s4, s32 +; GFX12-NEXT:    s_wait_alu 0xfffe +; GFX12-NEXT:    s_lshl_b32 s0, s0, 5 +; GFX12-NEXT:    v_mov_b32_e32 v40, 0 +; GFX12-NEXT:    s_wait_alu 0xfffe +; GFX12-NEXT:    s_add_co_i32 s32, s4, s0 +; GFX12-NEXT:    s_wait_kmcnt 0x0 +; GFX12-NEXT:    s_wait_alu 0xfffe +; GFX12-NEXT:    s_swappc_b64 s[30:31], s[2:3] +; GFX12-NEXT:    scratch_store_b32 off, v40, s4 +; GFX12-NEXT:    s_endpgm +; +; GFX942-LABEL: test_call_and_alloca_var_uniform: +; GFX942:       ; %bb.0: +; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT:    s_lshl_b32 s0, s0, 2 +; GFX942-NEXT:    s_add_i32 s0, s0, 15 +; GFX942-NEXT:    s_and_b32 s0, s0, -16 +; GFX942-NEXT:    s_lshl_b32 s2, s0, 6 +; GFX942-NEXT:    s_getpc_b64 s[0:1] +; GFX942-NEXT:    s_add_u32 s0, s0, foo@gotpcrel32@lo+4 +; GFX942-NEXT:    s_addc_u32 s1, s1, foo@gotpcrel32@hi+12 +; GFX942-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GFX942-NEXT:    s_mov_b32 s32, 16 +; GFX942-NEXT:    s_mov_b32 s4, s32 +; GFX942-NEXT:    v_mov_b32_e32 v40, 0 +; GFX942-NEXT:    s_add_i32 s32, s4, s2 +; GFX942-NEXT:    s_waitcnt lgkmcnt(0) +; 
GFX942-NEXT:    s_swappc_b64 s[30:31], s[0:1] +; GFX942-NEXT:    scratch_store_dword off, v40, s4 +; GFX942-NEXT:    s_endpgm +  %v = alloca i32, i32 %count, align 4, addrspace(5) +  call amdgpu_gfx void @foo() +  store i32 0, ptr addrspace(5) %v, align 4 +  ret void +} + +define amdgpu_cs_chain void @test_call_and_alloca_var(i32 %count) { +; GFX12-LABEL: test_call_and_alloca_var: +; GFX12:       ; %bb.0: +; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT:    s_wait_expcnt 0x0 +; GFX12-NEXT:    s_wait_samplecnt 0x0 +; GFX12-NEXT:    s_wait_bvhcnt 0x0 +; GFX12-NEXT:    s_wait_kmcnt 0x0 +; GFX12-NEXT:    v_lshl_add_u32 v0, v8, 2, 15 +; GFX12-NEXT:    v_mov_b32_e32 v40, 0 +; GFX12-NEXT:    s_mov_b32 s1, exec_lo +; GFX12-NEXT:    s_mov_b32 s0, 0 +; GFX12-NEXT:    s_mov_b32 s32, 16 +; GFX12-NEXT:    v_and_b32_e32 v0, -16, v0 +; GFX12-NEXT:  .LBB8_1: ; =>This Inner Loop Header: Depth=1 +; GFX12-NEXT:    s_wait_alu 0xfffe +; GFX12-NEXT:    s_ctz_i32_b32 s2, s1 +; GFX12-NEXT:    s_wait_alu 0xfffe +; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT:    v_readlane_b32 s3, v0, s2 +; GFX12-NEXT:    s_bitset0_b32 s1, s2 +; GFX12-NEXT:    s_max_u32 s0, s0, s3 +; GFX12-NEXT:    s_wait_alu 0xfffe +; GFX12-NEXT:    s_cmp_lg_u32 s1, 0 +; GFX12-NEXT:    s_cbranch_scc1 .LBB8_1 +; GFX12-NEXT:  ; %bb.2: +; GFX12-NEXT:    s_getpc_b64 s[2:3] +; GFX12-NEXT:    s_wait_alu 0xfffe +; GFX12-NEXT:    s_sext_i32_i16 s3, s3 +; GFX12-NEXT:    s_add_co_u32 s2, s2, foo@gotpcrel32@lo+12 +; GFX12-NEXT:    s_wait_alu 0xfffe +; GFX12-NEXT:    s_add_co_ci_u32 s3, s3, foo@gotpcrel32@hi+24 +; GFX12-NEXT:    s_mov_b32 s4, s32 +; GFX12-NEXT:    s_load_b64 s[2:3], s[2:3], 0x0 +; GFX12-NEXT:    v_lshl_add_u32 v0, s0, 5, s4 +; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT:    v_readfirstlane_b32 s32, v0 +; GFX12-NEXT:    s_wait_kmcnt 0x0 +; GFX12-NEXT:    s_wait_alu 0xf1ff +; GFX12-NEXT:    s_swappc_b64 s[30:31], s[2:3] +; GFX12-NEXT:    scratch_store_b32 off, v40, s4 +; GFX12-NEXT:    s_endpgm +; +; GFX942-LABEL: test_call_and_alloca_var: +; GFX942:       ; %bb.0: +; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX942-NEXT:    v_lshl_add_u32 v0, v8, 2, 15 +; GFX942-NEXT:    v_and_b32_e32 v0, -16, v0 +; GFX942-NEXT:    v_mov_b32_e32 v40, 0 +; GFX942-NEXT:    s_mov_b64 s[0:1], exec +; GFX942-NEXT:    s_mov_b32 s2, 0 +; GFX942-NEXT:    s_mov_b32 s32, 16 +; GFX942-NEXT:  .LBB8_1: ; =>This Inner Loop Header: Depth=1 +; GFX942-NEXT:    s_ff1_i32_b64 s3, s[0:1] +; GFX942-NEXT:    v_readlane_b32 s4, v0, s3 +; GFX942-NEXT:    s_bitset0_b64 s[0:1], s3 +; GFX942-NEXT:    s_max_u32 s2, s2, s4 +; GFX942-NEXT:    s_cmp_lg_u64 s[0:1], 0 +; GFX942-NEXT:    s_cbranch_scc1 .LBB8_1 +; GFX942-NEXT:  ; %bb.2: +; GFX942-NEXT:    s_getpc_b64 s[0:1] +; GFX942-NEXT:    s_add_u32 s0, s0, foo@gotpcrel32@lo+4 +; GFX942-NEXT:    s_addc_u32 s1, s1, foo@gotpcrel32@hi+12 +; GFX942-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GFX942-NEXT:    s_mov_b32 s4, s32 +; GFX942-NEXT:    v_mov_b32_e32 v0, s4 +; GFX942-NEXT:    v_lshl_add_u32 v0, s2, 6, v0 +; GFX942-NEXT:    s_nop 0 +; GFX942-NEXT:    v_readfirstlane_b32 s32, v0 +; GFX942-NEXT:    s_waitcnt lgkmcnt(0) +; GFX942-NEXT:    s_swappc_b64 s[30:31], s[0:1] +; GFX942-NEXT:    scratch_store_dword off, v40, s4 +; GFX942-NEXT:    s_endpgm +  %v = alloca i32, i32 %count, align 4, addrspace(5) +  call amdgpu_gfx void @foo() +  store i32 0, ptr addrspace(5) %v, align 4 +  ret void +} diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-rootn.ll 
b/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-rootn.ll index f6ae516..89d0394 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-rootn.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-rootn.ll @@ -1489,7 +1489,7 @@ attributes #2 = { noinline }  !0 = !{float 3.0}  ;.  ; CHECK: attributes #[[ATTR0]] = { strictfp } -; CHECK: attributes #[[ATTR1:[0-9]+]] = { nocallback nofree nosync nounwind speculatable willreturn memory(none) } +; CHECK: attributes #[[ATTR1:[0-9]+]] = { nocallback nocreateundeforpoison nofree nosync nounwind speculatable willreturn memory(none) }  ; CHECK: attributes #[[ATTR2:[0-9]+]] = { nounwind memory(read) }  ; CHECK: attributes #[[ATTR3]] = { noinline }  ; CHECK: attributes #[[ATTR4]] = { nobuiltin } diff --git a/llvm/test/CodeGen/AMDGPU/callbr.ll b/llvm/test/CodeGen/AMDGPU/callbr.ll new file mode 100644 index 0000000..253a6ec --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/callbr.ll @@ -0,0 +1,54 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc -mtriple=amdgcn -mcpu=gfx90a < %s | FileCheck %s + +define void @callbr_inline_asm(ptr %src, ptr %dst1, ptr %dst2, i32 %c) { +; CHECK-LABEL: callbr_inline_asm: +; CHECK:       ; %bb.0: +; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CHECK-NEXT:    flat_load_dword v0, v[0:1] +; CHECK-NEXT:    ;;#ASMSTART +; CHECK-NEXT:    v_cmp_gt_i32 vcc v6, 42; s_cbranch_vccnz .LBB0_2 +; CHECK-NEXT:    ;;#ASMEND +; CHECK-NEXT:  ; %bb.1: ; %fallthrough +; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0) +; CHECK-NEXT:    flat_store_dword v[2:3], v0 +; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0) +; CHECK-NEXT:    s_setpc_b64 s[30:31] +; CHECK-NEXT:  .LBB0_2: ; Inline asm indirect target +; CHECK-NEXT:    ; %indirect +; CHECK-NEXT:    ; Label of block must be emitted +; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0) +; CHECK-NEXT:    flat_store_dword v[4:5], v0 +; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0) +; CHECK-NEXT:    s_setpc_b64 s[30:31] +	%a = load i32, ptr %src, align 4 +	callbr void asm "v_cmp_gt_i32 vcc $0, 42; s_cbranch_vccnz ${1:l}", "r,!i"(i32 %c) to label %fallthrough [label %indirect] +fallthrough: +	store i32 %a, ptr %dst1, align 4 +	br label %ret +indirect: +	store i32 %a, ptr %dst2, align 4 +	br label %ret +ret: +	ret void +} + +define void @callbr_self_loop(i1 %c) { +; CHECK-LABEL: callbr_self_loop: +; CHECK:       ; %bb.0: +; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CHECK-NEXT:  .LBB1_1: ; %callbr +; CHECK-NEXT:    ; =>This Inner Loop Header: Depth=1 +; CHECK-NEXT:    ;;#ASMSTART +; CHECK-NEXT:    ;;#ASMEND +; CHECK-NEXT:    s_branch .LBB1_1 +; CHECK-NEXT:  .LBB1_2: ; Inline asm indirect target +; CHECK-NEXT:    ; %callbr.target.ret +; CHECK-NEXT:    ; Label of block must be emitted +; CHECK-NEXT:    s_setpc_b64 s[30:31] +  br label %callbr +callbr: +  callbr void asm "", "!i"() to label %callbr [label %ret] +ret: +  ret void +} diff --git a/llvm/test/CodeGen/AMDGPU/do-not-unify-divergent-exit-nodes-with-musttail.ll b/llvm/test/CodeGen/AMDGPU/do-not-unify-divergent-exit-nodes-with-musttail.ll index 007e3f0..076a99f 100644 --- a/llvm/test/CodeGen/AMDGPU/do-not-unify-divergent-exit-nodes-with-musttail.ll +++ b/llvm/test/CodeGen/AMDGPU/do-not-unify-divergent-exit-nodes-with-musttail.ll @@ -3,6 +3,7 @@  declare void @foo(ptr)  declare i1 @bar(ptr) +declare i32 @bar32(ptr)  define void @musttail_call_without_return_value(ptr %p) {  ; CHECK-LABEL: define void @musttail_call_without_return_value( @@ -28,6 +29,31 @@ 
bb.1:    ret void  } +define void @musttail_call_without_return_value_callbr(ptr %p) { +; CHECK-LABEL: define void @musttail_call_without_return_value_callbr( +; CHECK-SAME: ptr [[P:%.*]]) #[[ATTR0]] { +; CHECK-NEXT:  [[ENTRY:.*:]] +; CHECK-NEXT:    [[LOAD:%.*]] = load i32, ptr [[P]], align 1 +; CHECK-NEXT:    callbr void asm "", "r,!i"(i32 [[LOAD]]) +; CHECK-NEXT:            to label %[[BB_0:.*]] [label %bb.1] +; CHECK:       [[BB_0]]: +; CHECK-NEXT:    musttail call void @foo(ptr [[P]]) +; CHECK-NEXT:    ret void +; CHECK:       [[BB_1:.*:]] +; CHECK-NEXT:    ret void +; +entry: +  %load = load i32, ptr %p, align 1 +  callbr void asm "", "r,!i"(i32 %load) to label %bb.0 [label %bb.1] + +bb.0: +  musttail call void @foo(ptr %p) +  ret void + +bb.1: +  ret void +} +  define i1 @musttail_call_with_return_value(ptr %p) {  ; CHECK-LABEL: define i1 @musttail_call_with_return_value(  ; CHECK-SAME: ptr [[P:%.*]]) #[[ATTR0]] { @@ -51,3 +77,28 @@ bb.0:  bb.1:    ret i1 %load  } + +define i32 @musttail_call_with_return_value_callbr(ptr %p) { +; CHECK-LABEL: define i32 @musttail_call_with_return_value_callbr( +; CHECK-SAME: ptr [[P:%.*]]) #[[ATTR0]] { +; CHECK-NEXT:  [[ENTRY:.*:]] +; CHECK-NEXT:    [[LOAD:%.*]] = load i32, ptr [[P]], align 1 +; CHECK-NEXT:    callbr void asm "", "r,!i"(i32 [[LOAD]]) +; CHECK-NEXT:            to label %[[BB_0:.*]] [label %bb.1] +; CHECK:       [[BB_0]]: +; CHECK-NEXT:    [[RET:%.*]] = musttail call i32 @bar32(ptr [[P]]) +; CHECK-NEXT:    ret i32 [[RET]] +; CHECK:       [[BB_1:.*:]] +; CHECK-NEXT:    ret i32 [[LOAD]] +; +entry: +  %load = load i32, ptr %p, align 1 +  callbr void asm "", "r,!i"(i32 %load) to label %bb.0 [label %bb.1] + +bb.0: +  %ret = musttail call i32 @bar32(ptr %p) +  ret i32 %ret + +bb.1: +  ret i32 %load +} diff --git a/llvm/test/CodeGen/AMDGPU/infinite-loop.ll b/llvm/test/CodeGen/AMDGPU/infinite-loop.ll index 3e2e43f..df63592 100644 --- a/llvm/test/CodeGen/AMDGPU/infinite-loop.ll +++ b/llvm/test/CodeGen/AMDGPU/infinite-loop.ll @@ -36,26 +36,60 @@ loop:    br label %loop  } +define amdgpu_kernel void @infinite_loop_callbr(ptr addrspace(1) %out) { +; SI-LABEL: infinite_loop_callbr: +; SI:       ; %bb.0: ; %entry +; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9 +; SI-NEXT:    ;;#ASMSTART +; SI-NEXT:    ;;#ASMEND +; SI-NEXT:    s_mov_b32 s3, 0xf000 +; SI-NEXT:    s_mov_b32 s2, -1 +; SI-NEXT:    v_mov_b32_e32 v0, 0x3e7 +; SI-NEXT:    s_waitcnt lgkmcnt(0) +; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0 +; SI-NEXT:    s_waitcnt vmcnt(0) +; SI-NEXT:    s_endpgm +; IR-LABEL: @infinite_loop_callbr( +; IR-NEXT:  entry: +; IR-NEXT:    callbr void asm "", ""() +; IR-NEXT:            to label [[LOOP:%.*]] [] +; IR:       loop: +; IR-NEXT:    store volatile i32 999, ptr addrspace(1) [[OUT:%.*]], align 4 +; IR-NEXT:    br i1 true, label [[TRANSITIONBLOCK:%.*]], label [[DUMMYRETURNBLOCK:%.*]] +; IR:       TransitionBlock: +; IR-NEXT:    callbr void asm "", ""() +; IR-NEXT:            to label [[LOOP]] [] +; IR:       DummyReturnBlock: +; IR-NEXT:    ret void +; +entry: +  callbr void asm "", ""() to label %loop [] + +loop: +  store volatile i32 999, ptr addrspace(1) %out, align 4 +  callbr void asm "", ""() to label %loop [] +} +  define amdgpu_kernel void @infinite_loop_ret(ptr addrspace(1) %out) {  ; SI-LABEL: infinite_loop_ret:  ; SI:       ; %bb.0: ; %entry  ; SI-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0  ; SI-NEXT:    s_and_saveexec_b64 s[0:1], vcc -; SI-NEXT:    s_cbranch_execz .LBB1_3 +; SI-NEXT:    s_cbranch_execz .LBB2_3  ; SI-NEXT:  ; %bb.1: ; 
%loop.preheader  ; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9  ; SI-NEXT:    s_mov_b32 s3, 0xf000  ; SI-NEXT:    s_mov_b32 s2, -1  ; SI-NEXT:    v_mov_b32_e32 v0, 0x3e7  ; SI-NEXT:    s_and_b64 vcc, exec, -1 -; SI-NEXT:  .LBB1_2: ; %loop +; SI-NEXT:  .LBB2_2: ; %loop  ; SI-NEXT:    ; =>This Inner Loop Header: Depth=1  ; SI-NEXT:    s_waitcnt lgkmcnt(0)  ; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0  ; SI-NEXT:    s_waitcnt vmcnt(0)  ; SI-NEXT:    s_mov_b64 vcc, vcc -; SI-NEXT:    s_cbranch_vccnz .LBB1_2 -; SI-NEXT:  .LBB1_3: ; %UnifiedReturnBlock +; SI-NEXT:    s_cbranch_vccnz .LBB2_2 +; SI-NEXT:  .LBB2_3: ; %UnifiedReturnBlock  ; SI-NEXT:    s_endpgm  ; IR-LABEL: @infinite_loop_ret(  ; IR-NEXT:  entry: @@ -81,44 +115,93 @@ return:    ret void  } +define amdgpu_kernel void @infinite_loop_ret_callbr(ptr addrspace(1) %out) { +; SI-LABEL: infinite_loop_ret_callbr: +; SI:       ; %bb.0: ; %entry +; SI-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0 +; SI-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc +; SI-NEXT:    ;;#ASMSTART +; SI-NEXT:    ;;#ASMEND +; SI-NEXT:  ; %bb.1: ; %loop.preheader +; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9 +; SI-NEXT:    s_mov_b32 s3, 0xf000 +; SI-NEXT:    s_mov_b32 s2, -1 +; SI-NEXT:    v_mov_b32_e32 v0, 0x3e7 +; SI-NEXT:    s_waitcnt lgkmcnt(0) +; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0 +; SI-NEXT:    s_waitcnt vmcnt(0) +; SI-NEXT:  .LBB3_2: ; Inline asm indirect target +; SI-NEXT:    ; %UnifiedReturnBlock +; SI-NEXT:    ; Label of block must be emitted +; SI-NEXT:    s_endpgm +; IR-LABEL: @infinite_loop_ret_callbr( +; IR-NEXT:  entry: +; IR-NEXT:    [[TMP:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.x() +; IR-NEXT:    [[COND:%.*]] = icmp eq i32 [[TMP]], 1 +; IR-NEXT:    [[COND32:%.*]] = zext i1 [[COND]] to i32 +; IR-NEXT:    callbr void asm "", "r,!i"(i32 [[COND32]]) +; IR-NEXT:            to label [[LOOP:%.*]] [label %UnifiedReturnBlock] +; IR:       loop: +; IR-NEXT:    store volatile i32 999, ptr addrspace(1) [[OUT:%.*]], align 4 +; IR-NEXT:    br i1 true, label [[TRANSITIONBLOCK:%.*]], label [[UNIFIEDRETURNBLOCK:%.*]] +; IR:       TransitionBlock: +; IR-NEXT:    callbr void asm "", ""() +; IR-NEXT:            to label [[LOOP]] [] +; IR:       UnifiedReturnBlock: +; IR-NEXT:    ret void +; +entry: +  %tmp = tail call i32 @llvm.amdgcn.workitem.id.x() +  %cond = icmp eq i32 %tmp, 1 +  %cond32 = zext i1 %cond to i32 +  callbr void asm "", "r,!i"(i32 %cond32) to label %loop [label %return] + +loop: +  store volatile i32 999, ptr addrspace(1) %out, align 4 +  callbr void asm "", ""() to label %loop [] + +return: +  ret void +} +  define amdgpu_kernel void @infinite_loops(ptr addrspace(1) %out) {  ; SI-LABEL: infinite_loops:  ; SI:       ; %bb.0: ; %entry  ; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9  ; SI-NEXT:    s_mov_b64 s[2:3], -1 -; SI-NEXT:    s_cbranch_scc1 .LBB2_4 +; SI-NEXT:    s_cbranch_scc1 .LBB4_4  ; SI-NEXT:  ; %bb.1:  ; SI-NEXT:    s_mov_b32 s3, 0xf000  ; SI-NEXT:    s_mov_b32 s2, -1  ; SI-NEXT:    v_mov_b32_e32 v0, 0x378  ; SI-NEXT:    s_and_b64 vcc, exec, -1 -; SI-NEXT:  .LBB2_2: ; %loop2 +; SI-NEXT:  .LBB4_2: ; %loop2  ; SI-NEXT:    ; =>This Inner Loop Header: Depth=1  ; SI-NEXT:    s_waitcnt lgkmcnt(0)  ; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0  ; SI-NEXT:    s_waitcnt vmcnt(0)  ; SI-NEXT:    s_mov_b64 vcc, vcc -; SI-NEXT:    s_cbranch_vccnz .LBB2_2 +; SI-NEXT:    s_cbranch_vccnz .LBB4_2  ; SI-NEXT:  ; %bb.3: ; %Flow  ; SI-NEXT:    s_mov_b64 s[2:3], 0 -; SI-NEXT:  .LBB2_4: ; %Flow2 +; SI-NEXT:  .LBB4_4: ; %Flow2  ; 
SI-NEXT:    s_and_b64 vcc, exec, s[2:3]  ; SI-NEXT:    s_waitcnt lgkmcnt(0)  ; SI-NEXT:    s_mov_b64 vcc, vcc -; SI-NEXT:    s_cbranch_vccz .LBB2_7 +; SI-NEXT:    s_cbranch_vccz .LBB4_7  ; SI-NEXT:  ; %bb.5:  ; SI-NEXT:    s_mov_b32 s3, 0xf000  ; SI-NEXT:    s_mov_b32 s2, -1  ; SI-NEXT:    s_waitcnt expcnt(0)  ; SI-NEXT:    v_mov_b32_e32 v0, 0x3e7  ; SI-NEXT:    s_and_b64 vcc, exec, 0 -; SI-NEXT:  .LBB2_6: ; %loop1 +; SI-NEXT:  .LBB4_6: ; %loop1  ; SI-NEXT:    ; =>This Inner Loop Header: Depth=1  ; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0  ; SI-NEXT:    s_waitcnt vmcnt(0)  ; SI-NEXT:    s_mov_b64 vcc, vcc -; SI-NEXT:    s_cbranch_vccz .LBB2_6 -; SI-NEXT:  .LBB2_7: ; %DummyReturnBlock +; SI-NEXT:    s_cbranch_vccz .LBB4_6 +; SI-NEXT:  .LBB4_7: ; %DummyReturnBlock  ; SI-NEXT:    s_endpgm  ; IR-LABEL: @infinite_loops(  ; IR-NEXT:  entry: @@ -144,24 +227,78 @@ loop2:    br label %loop2  } +define amdgpu_kernel void @infinite_loops_callbr(ptr addrspace(1) %out) { +; SI-LABEL: infinite_loops_callbr: +; SI:       ; %bb.0: ; %entry +; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9 +; SI-NEXT:    s_waitcnt lgkmcnt(0) +; SI-NEXT:    ;;#ASMSTART +; SI-NEXT:    ;;#ASMEND +; SI-NEXT:  ; %bb.1: ; %loop1 +; SI-NEXT:    s_mov_b32 s3, 0xf000 +; SI-NEXT:    s_mov_b32 s2, -1 +; SI-NEXT:    v_mov_b32_e32 v0, 0x3e7 +; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0 +; SI-NEXT:    s_waitcnt vmcnt(0) +; SI-NEXT:    s_endpgm +; SI-NEXT:  .LBB5_2: ; Inline asm indirect target +; SI-NEXT:    ; %loop2.preheader +; SI-NEXT:    ; Label of block must be emitted +; SI-NEXT:    s_mov_b32 s3, 0xf000 +; SI-NEXT:    s_mov_b32 s2, -1 +; SI-NEXT:    v_mov_b32_e32 v0, 0x378 +; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0 +; SI-NEXT:    s_waitcnt vmcnt(0) +; SI-NEXT:    s_endpgm +; IR-LABEL: @infinite_loops_callbr( +; IR-NEXT:  entry: +; IR-NEXT:    callbr void asm "", "r,!i"(i32 poison) +; IR-NEXT:            to label [[LOOP1:%.*]] [label %loop2] +; IR:       loop1: +; IR-NEXT:    store volatile i32 999, ptr addrspace(1) [[OUT:%.*]], align 4 +; IR-NEXT:    br i1 true, label [[TRANSITIONBLOCK:%.*]], label [[DUMMYRETURNBLOCK:%.*]] +; IR:       TransitionBlock: +; IR-NEXT:    callbr void asm "", ""() +; IR-NEXT:            to label [[LOOP1]] [] +; IR:       loop2: +; IR-NEXT:    store volatile i32 888, ptr addrspace(1) [[OUT]], align 4 +; IR-NEXT:    br i1 true, label [[TRANSITIONBLOCK1:%.*]], label [[DUMMYRETURNBLOCK]] +; IR:       TransitionBlock1: +; IR-NEXT:    callbr void asm "", ""() +; IR-NEXT:            to label [[LOOP2:%.*]] [] +; IR:       DummyReturnBlock: +; IR-NEXT:    ret void +; +entry: +  callbr void asm "", "r,!i"(i32 poison) to label %loop1 [label %loop2] + +loop1: +  store volatile i32 999, ptr addrspace(1) %out, align 4 +  callbr void asm "", ""() to label %loop1 [] + +loop2: +  store volatile i32 888, ptr addrspace(1) %out, align 4 +  callbr void asm "", ""() to label %loop2 [] +} +  define amdgpu_kernel void @infinite_loop_nest_ret(ptr addrspace(1) %out) {  ; SI-LABEL: infinite_loop_nest_ret:  ; SI:       ; %bb.0: ; %entry  ; SI-NEXT:    v_cmp_ne_u32_e32 vcc, 1, v0  ; SI-NEXT:    s_and_saveexec_b64 s[0:1], vcc -; SI-NEXT:    s_cbranch_execz .LBB3_5 +; SI-NEXT:    s_cbranch_execz .LBB6_5  ; SI-NEXT:  ; %bb.1: ; %outer_loop.preheader  ; SI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x9  ; SI-NEXT:    v_cmp_ne_u32_e64 s[0:1], 3, v0  ; SI-NEXT:    s_mov_b32 s7, 0xf000  ; SI-NEXT:    s_mov_b32 s6, -1  ; SI-NEXT:    v_mov_b32_e32 v0, 0x3e7 -; SI-NEXT:  .LBB3_2: ; %outer_loop +; SI-NEXT:  .LBB6_2: ; 
%outer_loop  ; SI-NEXT:    ; =>This Loop Header: Depth=1 -; SI-NEXT:    ; Child Loop BB3_3 Depth 2 +; SI-NEXT:    ; Child Loop BB6_3 Depth 2  ; SI-NEXT:    s_mov_b64 s[2:3], 0 -; SI-NEXT:  .LBB3_3: ; %inner_loop -; SI-NEXT:    ; Parent Loop BB3_2 Depth=1 +; SI-NEXT:  .LBB6_3: ; %inner_loop +; SI-NEXT:    ; Parent Loop BB6_2 Depth=1  ; SI-NEXT:    ; => This Inner Loop Header: Depth=2  ; SI-NEXT:    s_and_b64 s[8:9], exec, s[0:1]  ; SI-NEXT:    s_or_b64 s[2:3], s[8:9], s[2:3] @@ -169,13 +306,13 @@ define amdgpu_kernel void @infinite_loop_nest_ret(ptr addrspace(1) %out) {  ; SI-NEXT:    buffer_store_dword v0, off, s[4:7], 0  ; SI-NEXT:    s_waitcnt vmcnt(0)  ; SI-NEXT:    s_andn2_b64 exec, exec, s[2:3] -; SI-NEXT:    s_cbranch_execnz .LBB3_3 +; SI-NEXT:    s_cbranch_execnz .LBB6_3  ; SI-NEXT:  ; %bb.4: ; %loop.exit.guard -; SI-NEXT:    ; in Loop: Header=BB3_2 Depth=1 +; SI-NEXT:    ; in Loop: Header=BB6_2 Depth=1  ; SI-NEXT:    s_or_b64 exec, exec, s[2:3]  ; SI-NEXT:    s_mov_b64 vcc, 0 -; SI-NEXT:    s_branch .LBB3_2 -; SI-NEXT:  .LBB3_5: ; %UnifiedReturnBlock +; SI-NEXT:    s_branch .LBB6_2 +; SI-NEXT:  .LBB6_5: ; %UnifiedReturnBlock  ; SI-NEXT:    s_endpgm  ; IR-LABEL: @infinite_loop_nest_ret(  ; IR-NEXT:  entry: @@ -212,4 +349,82 @@ return:    ret void  } +define amdgpu_kernel void @infinite_loop_nest_ret_callbr(ptr addrspace(1) %out) { +; SI-LABEL: infinite_loop_nest_ret_callbr: +; SI:       ; %bb.0: ; %entry +; SI-NEXT:    v_cmp_ne_u32_e32 vcc, 1, v0 +; SI-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc +; SI-NEXT:    ;;#ASMSTART +; SI-NEXT:    ;;#ASMEND +; SI-NEXT:  ; %bb.1: ; %outer_loop.preheader +; SI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x9 +; SI-NEXT:    s_mov_b32 s7, 0xf000 +; SI-NEXT:    s_mov_b32 s6, -1 +; SI-NEXT:    v_mov_b32_e32 v0, 0x3e7 +; SI-NEXT:    s_and_b64 s[0:1], exec, 0 +; SI-NEXT:    s_branch .LBB7_3 +; SI-NEXT:  .LBB7_2: ; %loop.exit.guard +; SI-NEXT:    ; in Loop: Header=BB7_3 Depth=1 +; SI-NEXT:    s_and_b64 vcc, exec, s[2:3] +; SI-NEXT:    s_cbranch_vccnz .LBB7_5 +; SI-NEXT:  .LBB7_3: ; %outer_loop +; SI-NEXT:    ; =>This Inner Loop Header: Depth=1 +; SI-NEXT:    ;;#ASMSTART +; SI-NEXT:    ;;#ASMEND +; SI-NEXT:    s_waitcnt lgkmcnt(0) +; SI-NEXT:    buffer_store_dword v0, off, s[4:7], 0 +; SI-NEXT:    s_waitcnt vmcnt(0) +; SI-NEXT:    s_mov_b64 s[2:3], -1 +; SI-NEXT:    s_mov_b64 vcc, s[0:1] +; SI-NEXT:    s_cbranch_vccz .LBB7_2 +; SI-NEXT:  ; %bb.4: ; %TransitionBlock.target.outer_loop +; SI-NEXT:    ; in Loop: Header=BB7_3 Depth=1 +; SI-NEXT:    s_mov_b64 s[2:3], 0 +; SI-NEXT:    s_branch .LBB7_2 +; SI-NEXT:  .LBB7_5: ; Inline asm indirect target +; SI-NEXT:    ; %UnifiedReturnBlock +; SI-NEXT:    ; Label of block must be emitted +; SI-NEXT:    s_endpgm +; IR-LABEL: @infinite_loop_nest_ret_callbr( +; IR-NEXT:  entry: +; IR-NEXT:    [[TMP:%.*]] = tail call i32 @llvm.amdgcn.workitem.id.x() +; IR-NEXT:    [[COND1:%.*]] = icmp ne i32 [[TMP]], 1 +; IR-NEXT:    [[COND1_32:%.*]] = zext i1 [[COND1]] to i32 +; IR-NEXT:    callbr void asm "", "r,!i"(i32 [[COND1_32]]) +; IR-NEXT:            to label [[OUTER_LOOP:%.*]] [label %UnifiedReturnBlock] +; IR:       outer_loop: +; IR-NEXT:    callbr void asm "", ""() +; IR-NEXT:            to label [[INNER_LOOP:%.*]] [] +; IR:       inner_loop: +; IR-NEXT:    store volatile i32 999, ptr addrspace(1) [[OUT:%.*]], align 4 +; IR-NEXT:    [[COND3:%.*]] = icmp eq i32 [[TMP]], 3 +; IR-NEXT:    [[COND3_32:%.*]] = zext i1 [[COND3]] to i32 +; IR-NEXT:    br i1 true, label [[TRANSITIONBLOCK:%.*]], label [[UNIFIEDRETURNBLOCK:%.*]] +; IR:      
 TransitionBlock: +; IR-NEXT:    callbr void asm "", "r,!i"(i32 [[COND3_32]]) +; IR-NEXT:            to label [[INNER_LOOP]] [label %outer_loop] +; IR:       UnifiedReturnBlock: +; IR-NEXT:    ret void +; +entry: +  %tmp = tail call i32 @llvm.amdgcn.workitem.id.x() +  %cond1 = icmp ne i32 %tmp, 1  ; avoid following BB optimizing away through the domination +  %cond1_32 = zext i1 %cond1 to i32 +  callbr void asm "", "r,!i"(i32 %cond1_32) to label %outer_loop [label %return] + +outer_loop: +  ; %cond2 = icmp eq i32 %tmp, 2 +  ; br i1 %cond2, label %outer_loop, label %inner_loop +  callbr void asm "", ""() to label %inner_loop [] + +inner_loop:                                     ; preds = %LeafBlock, %LeafBlock1 +  store volatile i32 999, ptr addrspace(1) %out, align 4 +  %cond3 = icmp eq i32 %tmp, 3 +  %cond3_32 = zext i1 %cond3 to i32 +  callbr void asm "", "r,!i"(i32 %cond3_32) to label %inner_loop [label %outer_loop] + +return: +  ret void +} +  declare i32 @llvm.amdgcn.workitem.id.x() diff --git a/llvm/test/CodeGen/AMDGPU/private-function.ll b/llvm/test/CodeGen/AMDGPU/private-function.ll new file mode 100644 index 0000000..8eefc9d --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/private-function.ll @@ -0,0 +1,16 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6 +; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 < %s | FileCheck %s + +define private void @foo() { +; CHECK-LABEL: foo: +; CHECK:       ; %bb.0: +; CHECK-NEXT:    s_wait_loadcnt_dscnt 0x0 +; CHECK-NEXT:    s_wait_expcnt 0x0 +; CHECK-NEXT:    s_wait_samplecnt 0x0 +; CHECK-NEXT:    s_wait_bvhcnt 0x0 +; CHECK-NEXT:    s_wait_kmcnt 0x0 +; CHECK-NEXT:    s_setpc_b64 s[30:31] +  ret void +} + +@var = global ptr @foo diff --git a/llvm/test/CodeGen/AMDGPU/set-gpr-idx-peephole.mir b/llvm/test/CodeGen/AMDGPU/set-gpr-idx-peephole.mir index 002d43f..1316569 100644 --- a/llvm/test/CodeGen/AMDGPU/set-gpr-idx-peephole.mir +++ b/llvm/test/CodeGen/AMDGPU/set-gpr-idx-peephole.mir @@ -1,5 +1,6 @@  # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py  # RUN: llc -mtriple=amdgcn -mcpu=gfx900 -run-pass si-pre-emit-peephole -verify-machineinstrs -o - %s | FileCheck -check-prefix=GCN %s -implicit-check-not=S_SET_GPR_IDX +# RUN: llc -mtriple=amdgcn -mcpu=gfx900 -run-pass si-pre-emit-peephole -verify-machineinstrs -o - %s -debugify-and-strip-all-safe | FileCheck -check-prefix=GCN %s -implicit-check-not=S_SET_GPR_IDX  # RUN: llc -mtriple=amdgcn -mcpu=gfx900 -passes si-pre-emit-peephole -o - %s | FileCheck -check-prefix=GCN %s -implicit-check-not=S_SET_GPR_IDX  --- @@ -41,6 +42,27 @@ body:             |  ...  
--- +name:            meta_in_between +body:             | +  bb.0: +    ; GCN-LABEL: name: meta_in_between +    ; GCN: S_SET_GPR_IDX_ON $sgpr2, 1, implicit-def $m0, implicit-def $mode, implicit undef $m0, implicit $mode +    ; GCN-NEXT: $vgpr16 = V_MOV_B32_indirect_read undef $vgpr1, implicit $exec, implicit $m0, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 +    ; GCN-NEXT: KILL $sgpr0 +    ; GCN-NEXT: $sgpr0 = IMPLICIT_DEF +    ; GCN-NEXT: $vgpr15 = V_MOV_B32_indirect_read undef $vgpr0, implicit $exec, implicit $m0, implicit killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 +    ; GCN-NEXT: S_SET_GPR_IDX_OFF implicit-def $mode, implicit $mode +  S_SET_GPR_IDX_ON $sgpr2, 1, implicit-def $m0, implicit-def $mode, implicit undef $m0, implicit $mode +  $vgpr16 = V_MOV_B32_indirect_read undef $vgpr1, implicit $exec, implicit $m0, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 +  S_SET_GPR_IDX_OFF implicit-def $mode, implicit $mode +  KILL $sgpr0 +  $sgpr0 = IMPLICIT_DEF +  S_SET_GPR_IDX_ON killed $sgpr2, 1, implicit-def $m0, implicit-def $mode, implicit undef $m0, implicit $mode +  $vgpr15 = V_MOV_B32_indirect_read undef $vgpr0, implicit $exec, implicit $m0, implicit killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 +  S_SET_GPR_IDX_OFF implicit-def $mode, implicit $mode +... + +---  name:            valu_write_in_between  body:             |    bb.0: diff --git a/llvm/test/CodeGen/AMDGPU/si-annotate-nested-control-flows.ll b/llvm/test/CodeGen/AMDGPU/si-annotate-nested-control-flows.ll index 34de1e4..01bcdad 100644 --- a/llvm/test/CodeGen/AMDGPU/si-annotate-nested-control-flows.ll +++ b/llvm/test/CodeGen/AMDGPU/si-annotate-nested-control-flows.ll @@ -3,15 +3,16 @@  ; RUN: llc -mtriple=amdgcn-amd-amdhsa %s -o - | FileCheck %s --check-prefix=ISA  define void @nested_inf_loop(i1 %0, i1 %1) { -; OPT-LABEL: @nested_inf_loop( -; OPT-NEXT:  BB: -; OPT-NEXT:    br label [[BB1:%.*]] -; OPT:       BB1: -; OPT-NEXT:    [[BRMERGE:%.*]] = select i1 [[TMP0:%.*]], i1 true, i1 [[TMP1:%.*]] -; OPT-NEXT:    br i1 [[BRMERGE]], label [[BB1]], label [[INFLOOP:%.*]] -; OPT:       infloop: -; OPT-NEXT:    br i1 true, label [[INFLOOP]], label [[DUMMYRETURNBLOCK:%.*]] -; OPT:       DummyReturnBlock: +; OPT-LABEL: define void @nested_inf_loop( +; OPT-SAME: i1 [[TMP0:%.*]], i1 [[TMP1:%.*]]) { +; OPT-NEXT:  [[BB:.*:]] +; OPT-NEXT:    br label %[[BB1:.*]] +; OPT:       [[BB1]]: +; OPT-NEXT:    [[BRMERGE:%.*]] = select i1 [[TMP0]], i1 true, i1 [[TMP1]] +; OPT-NEXT:    br i1 [[BRMERGE]], label %[[BB1]], label %[[INFLOOP:.*]] +; OPT:       [[INFLOOP]]: +; OPT-NEXT:    br i1 true, label %[[INFLOOP]], label %[[DUMMYRETURNBLOCK:.*]] +; OPT:       [[DUMMYRETURNBLOCK]]:  ; OPT-NEXT:    ret void  ;  ; ISA-LABEL: nested_inf_loop: @@ -63,3 +64,84 @@ BB4:  BB3:    br label %BB1  } + +define void @nested_inf_loop_callbr(i32 %0, i32 %1) { +; OPT-LABEL: define void @nested_inf_loop_callbr( +; OPT-SAME: i32 [[TMP0:%.*]], i32 [[TMP1:%.*]]) { +; OPT-NEXT:  [[BB:.*:]] +; OPT-NEXT:    callbr void asm "", ""() +; OPT-NEXT:            to label %[[BB1:.*]] [] +; OPT:       [[BB1]]: +; OPT-NEXT:    callbr void asm "", "r,!i"(i32 [[TMP0]]) +; OPT-NEXT:            to label %[[BB3:.*]] [label %BB2] +; OPT:       [[BB2:.*:]] +; OPT-NEXT:    callbr void asm "", ""() +; OPT-NEXT:            to label 
%[[BB4:.*]] [] +; OPT:       [[BB4]]: +; OPT-NEXT:    br i1 true, label %[[TRANSITIONBLOCK:.*]], label %[[DUMMYRETURNBLOCK:.*]] +; OPT:       [[TRANSITIONBLOCK]]: +; OPT-NEXT:    callbr void asm "", "r,!i"(i32 [[TMP1]]) +; OPT-NEXT:            to label %[[BB3]] [label %BB4] +; OPT:       [[BB3]]: +; OPT-NEXT:    callbr void asm "", ""() +; OPT-NEXT:            to label %[[BB1]] [] +; OPT:       [[DUMMYRETURNBLOCK]]: +; OPT-NEXT:    ret void +; +; ISA-LABEL: nested_inf_loop_callbr: +; ISA:       ; %bb.0: ; %BB +; ISA-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; ISA-NEXT:    ;;#ASMSTART +; ISA-NEXT:    ;;#ASMEND +; ISA-NEXT:    ; implicit-def: $sgpr6_sgpr7 +; ISA-NEXT:    ; implicit-def: $sgpr4_sgpr5 +; ISA-NEXT:  .LBB1_1: ; %BB1 +; ISA-NEXT:    ; =>This Inner Loop Header: Depth=1 +; ISA-NEXT:    ;;#ASMSTART +; ISA-NEXT:    ;;#ASMEND +; ISA-NEXT:    s_andn2_b64 s[6:7], s[6:7], exec +; ISA-NEXT:    s_and_b64 s[8:9], s[4:5], exec +; ISA-NEXT:    s_or_b64 s[6:7], s[6:7], s[8:9] +; ISA-NEXT:  .LBB1_2: ; %BB3 +; ISA-NEXT:    ; in Loop: Header=BB1_1 Depth=1 +; ISA-NEXT:    ;;#ASMSTART +; ISA-NEXT:    ;;#ASMEND +; ISA-NEXT:    s_andn2_b64 s[4:5], s[4:5], exec +; ISA-NEXT:    s_and_b64 s[8:9], s[6:7], exec +; ISA-NEXT:    s_or_b64 s[4:5], s[4:5], s[8:9] +; ISA-NEXT:    s_branch .LBB1_1 +; ISA-NEXT:  .LBB1_3: ; Inline asm indirect target +; ISA-NEXT:    ; %BB2 +; ISA-NEXT:    ; in Loop: Header=BB1_1 Depth=1 +; ISA-NEXT:    ; Label of block must be emitted +; ISA-NEXT:    ;;#ASMSTART +; ISA-NEXT:    ;;#ASMEND +; ISA-NEXT:    s_mov_b64 s[6:7], -1 +; ISA-NEXT:    s_and_saveexec_b64 s[8:9], s[4:5] +; ISA-NEXT:    s_cbranch_execz .LBB1_5 +; ISA-NEXT:  ; %bb.4: ; %TransitionBlock.target.BB3 +; ISA-NEXT:    ; in Loop: Header=BB1_1 Depth=1 +; ISA-NEXT:    s_xor_b64 s[6:7], exec, -1 +; ISA-NEXT:  .LBB1_5: ; %loop.exit.guard +; ISA-NEXT:    ; in Loop: Header=BB1_1 Depth=1 +; ISA-NEXT:    s_or_b64 exec, exec, s[8:9] +; ISA-NEXT:    s_and_b64 vcc, exec, s[6:7] +; ISA-NEXT:    s_mov_b64 s[6:7], 0 +; ISA-NEXT:    s_cbranch_vccz .LBB1_2 +; ISA-NEXT:  ; %bb.6: ; %DummyReturnBlock +; ISA-NEXT:    s_setpc_b64 s[30:31] +BB: +  callbr void asm "", ""() to label %BB1 [] + +BB1: +  callbr void asm "", "r,!i"(i32 %0) to label %BB3 [label %BB2] + +BB2: +  callbr void asm "", ""() to label %BB4 [] + +BB4: +  callbr void asm "", "r,!i"(i32 %1) to label %BB3 [label %BB4] + +BB3: +  callbr void asm "", ""() to label %BB1 [] +} diff --git a/llvm/test/CodeGen/AMDGPU/si-unify-exit-multiple-unreachables.ll b/llvm/test/CodeGen/AMDGPU/si-unify-exit-multiple-unreachables.ll index 4cbe682..004c279 100644 --- a/llvm/test/CodeGen/AMDGPU/si-unify-exit-multiple-unreachables.ll +++ b/llvm/test/CodeGen/AMDGPU/si-unify-exit-multiple-unreachables.ll @@ -1,5 +1,5 @@ -; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc < %s -mtriple=amdgcn-amd-amdhsa -mcpu=gfx906 -stop-after=amdgpu-unify-divergent-exit-nodes | FileCheck %s --check-prefix=UNIFY +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt < %s -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx906 -stop-after=amdgpu-unify-divergent-exit-nodes | FileCheck %s --check-prefix=UNIFY  ; RUN: llc < %s -mtriple=amdgcn-amd-amdhsa -mcpu=gfx906 | FileCheck %s  declare void @llvm.trap() @@ -70,8 +70,33 @@ define amdgpu_kernel void @kernel(i32 %a, ptr addrspace(1) %x, i32 noundef %n) {  ; CHECK-NEXT:    s_mov_b64 s[2:3], -1  ; CHECK-NEXT:    s_trap 2  ; CHECK-NEXT:    s_branch .LBB0_4 - - +; UNIFY-LABEL: @kernel( +; UNIFY-NEXT:  entry: 
+; UNIFY-NEXT:    [[TID:%.*]] = call i32 @llvm.amdgcn.workitem.id.x() +; UNIFY-NEXT:    [[CMP:%.*]] = icmp eq i32 [[N:%.*]], 256 +; UNIFY-NEXT:    br i1 [[CMP]], label [[IF_THEN:%.*]], label [[IF_ELSE:%.*]] +; UNIFY:       if.then: +; UNIFY-NEXT:    [[CMP1:%.*]] = icmp eq i32 [[A:%.*]], 0 +; UNIFY-NEXT:    br i1 [[CMP1]], label [[IF_END6_SINK_SPLIT:%.*]], label [[COND_FALSE:%.*]] +; UNIFY:       cond.false: +; UNIFY-NEXT:    call void @llvm.trap() +; UNIFY-NEXT:    unreachable +; UNIFY:       if.else: +; UNIFY-NEXT:    [[CMP2:%.*]] = icmp ult i32 [[TID]], 10 +; UNIFY-NEXT:    br i1 [[CMP2]], label [[IF_THEN3:%.*]], label [[IF_END6:%.*]] +; UNIFY:       if.then3: +; UNIFY-NEXT:    [[CMP1_I7:%.*]] = icmp eq i32 [[A]], 0 +; UNIFY-NEXT:    br i1 [[CMP1_I7]], label [[IF_END6_SINK_SPLIT]], label [[COND_FALSE_I8:%.*]] +; UNIFY:       cond.false.i8: +; UNIFY-NEXT:    call void @llvm.trap() +; UNIFY-NEXT:    unreachable +; UNIFY:       if.end6.sink.split: +; UNIFY-NEXT:    [[X1:%.*]] = getelementptr inbounds i32, ptr addrspace(1) [[X:%.*]], i32 [[TID]] +; UNIFY-NEXT:    store i32 [[A]], ptr addrspace(1) [[X1]], align 4 +; UNIFY-NEXT:    br label [[IF_END6]] +; UNIFY:       if.end6: +; UNIFY-NEXT:    ret void +;  entry:    %tid = call i32 @llvm.amdgcn.workitem.id.x()    %cmp = icmp eq i32 %n, 256 @@ -105,5 +130,129 @@ if.end6.sink.split:  if.end6:    ret void  } -;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: -; UNIFY: {{.*}} + +define amdgpu_kernel void @kernel_callbr(i32 %a, ptr addrspace(1) %x, i32 noundef %n) { +; CHECK-LABEL: kernel_callbr: +; CHECK:       ; %bb.0: ; %entry +; CHECK-NEXT:    s_load_dword s1, s[8:9], 0x10 +; CHECK-NEXT:    s_load_dword s0, s[8:9], 0x0 +; CHECK-NEXT:    s_waitcnt lgkmcnt(0) +; CHECK-NEXT:    s_cmpk_eq_i32 s1, 0x100 +; CHECK-NEXT:    s_cselect_b64 s[2:3], -1, 0 +; CHECK-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s[2:3] +; CHECK-NEXT:    ;;#ASMSTART +; CHECK-NEXT:    ;;#ASMEND +; CHECK-NEXT:  ; %bb.1: ; %if.then +; CHECK-NEXT:    s_cmp_eq_u32 s0, 0 +; CHECK-NEXT:    s_cselect_b64 s[2:3], -1, 0 +; CHECK-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s[2:3] +; CHECK-NEXT:    ;;#ASMSTART +; CHECK-NEXT:    ;;#ASMEND +; CHECK-NEXT:  .LBB1_2: ; %if.end6.sink.split +; CHECK-NEXT:    s_load_dwordx2 s[2:3], s[8:9], 0x8 +; CHECK-NEXT:    v_lshlrev_b32_e32 v0, 2, v0 +; CHECK-NEXT:    v_mov_b32_e32 v1, s0 +; CHECK-NEXT:    s_waitcnt lgkmcnt(0) +; CHECK-NEXT:    global_store_dword v0, v1, s[2:3] +; CHECK-NEXT:    ;;#ASMSTART +; CHECK-NEXT:    ;;#ASMEND +; CHECK-NEXT:  .LBB1_3: ; Inline asm indirect target +; CHECK-NEXT:    ; %UnifiedReturnBlock +; CHECK-NEXT:    ; Label of block must be emitted +; CHECK-NEXT:    s_endpgm +; CHECK-NEXT:  .LBB1_4: ; Inline asm indirect target +; CHECK-NEXT:    ; %if.else +; CHECK-NEXT:    ; Label of block must be emitted +; CHECK-NEXT:    v_cmp_gt_u32_e32 vcc, 10, v0 +; CHECK-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc +; CHECK-NEXT:    ;;#ASMSTART +; CHECK-NEXT:    ;;#ASMEND +; CHECK-NEXT:  ; %bb.5: ; %if.then3 +; CHECK-NEXT:    s_cmp_eq_u32 s0, 0 +; CHECK-NEXT:    s_cselect_b64 s[2:3], -1, 0 +; CHECK-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s[2:3] +; CHECK-NEXT:    ;;#ASMSTART +; CHECK-NEXT:    ;;#ASMEND +; CHECK-NEXT:    s_branch .LBB1_2 +; CHECK-NEXT:  .LBB1_6: ; Inline asm indirect target +; CHECK-NEXT:    ; %cond.false.i8 +; CHECK-NEXT:    ; Label of block must be emitted +; CHECK-NEXT:  .LBB1_7: ; Inline asm indirect target +; CHECK-NEXT:    ; %cond.false +; CHECK-NEXT:    ; Label of block must be emitted +; 
CHECK-NEXT:    s_trap 2 +; CHECK-NEXT:    ; divergent unreachable +; CHECK-NEXT:    s_branch .LBB1_3 +; UNIFY-LABEL: @kernel_callbr( +; UNIFY-NEXT:  entry: +; UNIFY-NEXT:    [[TID:%.*]] = call i32 @llvm.amdgcn.workitem.id.x() +; UNIFY-NEXT:    [[CMP:%.*]] = icmp eq i32 [[N:%.*]], 256 +; UNIFY-NEXT:    [[CMP32:%.*]] = zext i1 [[CMP]] to i32 +; UNIFY-NEXT:    callbr void asm "", "r,!i"(i32 [[CMP32]]) +; UNIFY-NEXT:            to label [[IF_THEN:%.*]] [label %if.else] +; UNIFY:       if.then: +; UNIFY-NEXT:    [[CMP1:%.*]] = icmp eq i32 [[A:%.*]], 0 +; UNIFY-NEXT:    [[CMP1_32:%.*]] = zext i1 [[CMP1]] to i32 +; UNIFY-NEXT:    callbr void asm "", "r,!i"(i32 [[CMP1_32]]) +; UNIFY-NEXT:            to label [[IF_END6_SINK_SPLIT:%.*]] [label %cond.false] +; UNIFY:       cond.false: +; UNIFY-NEXT:    call void @llvm.trap() +; UNIFY-NEXT:    unreachable +; UNIFY:       if.else: +; UNIFY-NEXT:    [[CMP2:%.*]] = icmp ult i32 [[TID]], 10 +; UNIFY-NEXT:    [[CMP2_32:%.*]] = zext i1 [[CMP2]] to i32 +; UNIFY-NEXT:    callbr void asm "", "r,!i"(i32 [[CMP2_32]]) +; UNIFY-NEXT:            to label [[IF_THEN3:%.*]] [label %if.end6] +; UNIFY:       if.then3: +; UNIFY-NEXT:    [[CMP1_I7:%.*]] = icmp eq i32 [[A]], 0 +; UNIFY-NEXT:    [[CMP1_I7_32:%.*]] = zext i1 [[CMP1_I7]] to i32 +; UNIFY-NEXT:    callbr void asm "", "r,!i"(i32 [[CMP1_I7_32]]) +; UNIFY-NEXT:            to label [[IF_END6_SINK_SPLIT]] [label %cond.false.i8] +; UNIFY:       cond.false.i8: +; UNIFY-NEXT:    call void @llvm.trap() +; UNIFY-NEXT:    unreachable +; UNIFY:       if.end6.sink.split: +; UNIFY-NEXT:    [[X1:%.*]] = getelementptr inbounds i32, ptr addrspace(1) [[X:%.*]], i32 [[TID]] +; UNIFY-NEXT:    store i32 [[A]], ptr addrspace(1) [[X1]], align 4 +; UNIFY-NEXT:    callbr void asm "", ""() +; UNIFY-NEXT:            to label [[IF_END6:%.*]] [] +; UNIFY:       if.end6: +; UNIFY-NEXT:    ret void +; +entry: +  %tid = call i32 @llvm.amdgcn.workitem.id.x() +  %cmp = icmp eq i32 %n, 256 +  %cmp32 = zext i1 %cmp to i32 +  callbr void asm "", "r,!i"(i32 %cmp32) to label %if.then [label %if.else] + +if.then: +  %cmp1 = icmp eq i32 %a, 0 +  %cmp1_32 = zext i1 %cmp1 to i32 +  callbr void asm "", "r,!i"(i32 %cmp1_32) to label %if.end6.sink.split [label %cond.false] + +cond.false: +  call void @llvm.trap() +  unreachable + +if.else: +  %cmp2 = icmp ult i32 %tid, 10 +  %cmp2_32 = zext i1 %cmp2 to i32 +  callbr void asm "", "r,!i"(i32 %cmp2_32) to label %if.then3 [label %if.end6] + +if.then3: +  %cmp1.i7 = icmp eq i32 %a, 0 +  %cmp1.i7_32 = zext i1 %cmp1.i7 to i32 +  callbr void asm "", "r,!i"(i32 %cmp1.i7_32) to label %if.end6.sink.split [label %cond.false.i8] + +cond.false.i8: +  call void @llvm.trap() +  unreachable + +if.end6.sink.split: +  %x1 = getelementptr inbounds i32, ptr addrspace(1) %x, i32 %tid +  store i32 %a, ptr addrspace(1) %x1, align 4 +  callbr void asm "", ""() to label %if.end6 [] + +if.end6: +  ret void +} diff --git a/llvm/test/CodeGen/AMDGPU/update-phi.ll b/llvm/test/CodeGen/AMDGPU/update-phi.ll index 50666be..684dc1a 100644 --- a/llvm/test/CodeGen/AMDGPU/update-phi.ll +++ b/llvm/test/CodeGen/AMDGPU/update-phi.ll @@ -37,3 +37,42 @@ n28:                                               ; preds = %.loopexit, %n28  n31:                                               ; preds =    ret void  } + +define amdgpu_ps void @_amdgpu_ps_main_callbr() local_unnamed_addr #3 { +; IR-LABEL: @_amdgpu_ps_main_callbr( +; IR-NEXT:  .entry: +; IR-NEXT:    callbr void asm "", ""() +; IR-NEXT:            to label [[DOTLOOPEXIT:%.*]] [] +; IR:       
.loopexit: +; IR-NEXT:    callbr void asm "", ""() +; IR-NEXT:            to label [[N28:%.*]] [] +; IR:       n28: +; IR-NEXT:    [[DOT01:%.*]] = phi float [ 0.000000e+00, [[DOTLOOPEXIT]] ], [ [[N29:%.*]], [[TRANSITIONBLOCK:%.*]] ] +; IR-NEXT:    [[N29]] = fadd float [[DOT01]], 1.000000e+00 +; IR-NEXT:    [[N30:%.*]] = fcmp ogt float [[N29]], 4.000000e+00 +; IR-NEXT:    [[N30_32:%.*]] = zext i1 [[N30]] to i32 +; IR-NEXT:    br i1 true, label [[TRANSITIONBLOCK]], label [[DUMMYRETURNBLOCK:%.*]] +; IR:       TransitionBlock: +; IR-NEXT:    callbr void asm "", "r,!i"(i32 [[N30_32]]) +; IR-NEXT:            to label [[DOTLOOPEXIT]] [label %n28] +; IR:       n31: +; IR-NEXT:    ret void +; IR:       DummyReturnBlock: +; IR-NEXT:    ret void +; +.entry: +  callbr void asm "", ""() to label %.loopexit [] + +.loopexit:                                        ; preds = %n28, %.entry +  callbr void asm "", ""() to label %n28 [] + +n28:                                               ; preds = %.loopexit, %n28 +  %.01 = phi float [ 0.000000e+00, %.loopexit ], [ %n29, %n28 ] +  %n29 = fadd float %.01, 1.0 +  %n30 = fcmp ogt float %n29, 4.000000e+00 +  %n30.32 = zext i1 %n30 to i32 +  callbr void asm "", "r,!i"(i32 %n30.32) to label %.loopexit [label %n28] + +n31:                                               ; preds = +  ret void +} diff --git a/llvm/test/CodeGen/ARM/llvm.sincos.ll b/llvm/test/CodeGen/ARM/llvm.sincos.ll index 9628405..1448fac 100644 --- a/llvm/test/CodeGen/ARM/llvm.sincos.ll +++ b/llvm/test/CodeGen/ARM/llvm.sincos.ll @@ -1,223 +1,1004 @@  ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2 -; RUN: llc -mtriple=thumbv7-gnu-linux < %s | FileCheck -check-prefixes=CHECK %s +; RUN: llc -mtriple=thumbv7-gnu-linux < %s | FileCheck -check-prefix=GNU %s +; RUN: llc -mtriple=armv7-linux-gnueabi -mcpu=cortex-a8 < %s | FileCheck -check-prefix=GNUEABI %s +; RUN: llc -mtriple=armv7-apple-ios6 -mcpu=cortex-a8 < %s | FileCheck -check-prefixes=IOS,IOS-NO-STRET %s +; RUN: llc -mtriple=armv7-apple-ios7 -mcpu=cortex-a8 < %s | FileCheck -check-prefixes=IOS,IOS-WITH-STRET %s +; RUN: llc -mtriple=thumbv7k-apple-watchos2.0 < %s | FileCheck -check-prefix=WATCHABI %s  define { half, half } @test_sincos_f16(half %a) { -; CHECK-LABEL: test_sincos_f16: -; CHECK:       @ %bb.0: -; CHECK-NEXT:    push {r4, lr} -; CHECK-NEXT:    sub sp, #8 -; CHECK-NEXT:    bl __gnu_h2f_ieee -; CHECK-NEXT:    add r1, sp, #4 -; CHECK-NEXT:    mov r2, sp -; CHECK-NEXT:    bl sincosf -; CHECK-NEXT:    ldr r0, [sp, #4] -; CHECK-NEXT:    bl __gnu_f2h_ieee -; CHECK-NEXT:    mov r4, r0 -; CHECK-NEXT:    ldr r0, [sp] -; CHECK-NEXT:    bl __gnu_f2h_ieee -; CHECK-NEXT:    mov r1, r0 -; CHECK-NEXT:    mov r0, r4 -; CHECK-NEXT:    add sp, #8 -; CHECK-NEXT:    pop {r4, pc} +; GNU-LABEL: test_sincos_f16: +; GNU:       @ %bb.0: +; GNU-NEXT:    push {r4, lr} +; GNU-NEXT:    sub sp, #8 +; GNU-NEXT:    bl __gnu_h2f_ieee +; GNU-NEXT:    add r1, sp, #4 +; GNU-NEXT:    mov r2, sp +; GNU-NEXT:    bl sincosf +; GNU-NEXT:    ldr r0, [sp, #4] +; GNU-NEXT:    bl __gnu_f2h_ieee +; GNU-NEXT:    mov r4, r0 +; GNU-NEXT:    ldr r0, [sp] +; GNU-NEXT:    bl __gnu_f2h_ieee +; GNU-NEXT:    mov r1, r0 +; GNU-NEXT:    mov r0, r4 +; GNU-NEXT:    add sp, #8 +; GNU-NEXT:    pop {r4, pc} +; +; GNUEABI-LABEL: test_sincos_f16: +; GNUEABI:       @ %bb.0: +; GNUEABI-NEXT:    .save {r4, lr} +; GNUEABI-NEXT:    push {r4, lr} +; GNUEABI-NEXT:    .pad #8 +; GNUEABI-NEXT:    sub sp, sp, #8 +; GNUEABI-NEXT:    bl __gnu_h2f_ieee +; 
GNUEABI-NEXT:    add r1, sp, #4 +; GNUEABI-NEXT:    mov r2, sp +; GNUEABI-NEXT:    bl sincosf +; GNUEABI-NEXT:    ldr r0, [sp, #4] +; GNUEABI-NEXT:    bl __gnu_f2h_ieee +; GNUEABI-NEXT:    mov r4, r0 +; GNUEABI-NEXT:    ldr r0, [sp] +; GNUEABI-NEXT:    bl __gnu_f2h_ieee +; GNUEABI-NEXT:    mov r1, r0 +; GNUEABI-NEXT:    mov r0, r4 +; GNUEABI-NEXT:    add sp, sp, #8 +; GNUEABI-NEXT:    pop {r4, pc} +; +; IOS-NO-STRET-LABEL: test_sincos_f16: +; IOS-NO-STRET:       @ %bb.0: +; IOS-NO-STRET-NEXT:    push {r4, r5, lr} +; IOS-NO-STRET-NEXT:    bl ___extendhfsf2 +; IOS-NO-STRET-NEXT:    mov r4, r0 +; IOS-NO-STRET-NEXT:    bl _sinf +; IOS-NO-STRET-NEXT:    bl ___truncsfhf2 +; IOS-NO-STRET-NEXT:    mov r5, r0 +; IOS-NO-STRET-NEXT:    mov r0, r4 +; IOS-NO-STRET-NEXT:    bl _cosf +; IOS-NO-STRET-NEXT:    bl ___truncsfhf2 +; IOS-NO-STRET-NEXT:    mov r1, r0 +; IOS-NO-STRET-NEXT:    mov r0, r5 +; IOS-NO-STRET-NEXT:    pop {r4, r5, pc} +; +; IOS-WITH-STRET-LABEL: test_sincos_f16: +; IOS-WITH-STRET:       @ %bb.0: +; IOS-WITH-STRET-NEXT:    push {r4, r5, lr} +; IOS-WITH-STRET-NEXT:    sub sp, sp, #8 +; IOS-WITH-STRET-NEXT:    bl ___extendhfsf2 +; IOS-WITH-STRET-NEXT:    mov r1, r0 +; IOS-WITH-STRET-NEXT:    mov r0, sp +; IOS-WITH-STRET-NEXT:    bl ___sincosf_stret +; IOS-WITH-STRET-NEXT:    ldm sp, {r0, r4} +; IOS-WITH-STRET-NEXT:    bl ___truncsfhf2 +; IOS-WITH-STRET-NEXT:    mov r5, r0 +; IOS-WITH-STRET-NEXT:    mov r0, r4 +; IOS-WITH-STRET-NEXT:    bl ___truncsfhf2 +; IOS-WITH-STRET-NEXT:    mov r1, r0 +; IOS-WITH-STRET-NEXT:    mov r0, r5 +; IOS-WITH-STRET-NEXT:    add sp, sp, #8 +; IOS-WITH-STRET-NEXT:    pop {r4, r5, pc} +; +; WATCHABI-LABEL: test_sincos_f16: +; WATCHABI:         .cfi_startproc +; WATCHABI-NEXT:  @ %bb.0: +; WATCHABI-NEXT:    push {r7, lr} +; WATCHABI-NEXT:    .cfi_def_cfa_offset 8 +; WATCHABI-NEXT:    .cfi_offset lr, -4 +; WATCHABI-NEXT:    .cfi_offset r7, -8 +; WATCHABI-NEXT:    sub sp, #8 +; WATCHABI-NEXT:    .cfi_def_cfa_offset 16 +; WATCHABI-NEXT:    vcvtb.f32.f16 s0, s0 +; WATCHABI-NEXT:    bl ___sincosf_stret +; WATCHABI-NEXT:    vcvtb.f16.f32 s0, s0 +; WATCHABI-NEXT:    vcvtb.f16.f32 s1, s1 +; WATCHABI-NEXT:    add sp, #8 +; WATCHABI-NEXT:    pop {r7, pc} +; WATCHABI-NEXT:    .cfi_endproc    %result = call { half, half } @llvm.sincos.f16(half %a)    ret { half, half } %result  }  define half @test_sincos_f16_only_use_sin(half %a) { -; CHECK-LABEL: test_sincos_f16_only_use_sin: -; CHECK:       @ %bb.0: -; CHECK-NEXT:    push {r7, lr} -; CHECK-NEXT:    sub sp, #8 -; CHECK-NEXT:    bl __gnu_h2f_ieee -; CHECK-NEXT:    add r1, sp, #4 -; CHECK-NEXT:    mov r2, sp -; CHECK-NEXT:    bl sincosf -; CHECK-NEXT:    ldr r0, [sp, #4] -; CHECK-NEXT:    bl __gnu_f2h_ieee -; CHECK-NEXT:    add sp, #8 -; CHECK-NEXT:    pop {r7, pc} +; GNU-LABEL: test_sincos_f16_only_use_sin: +; GNU:       @ %bb.0: +; GNU-NEXT:    push {r7, lr} +; GNU-NEXT:    sub sp, #8 +; GNU-NEXT:    bl __gnu_h2f_ieee +; GNU-NEXT:    add r1, sp, #4 +; GNU-NEXT:    mov r2, sp +; GNU-NEXT:    bl sincosf +; GNU-NEXT:    ldr r0, [sp, #4] +; GNU-NEXT:    bl __gnu_f2h_ieee +; GNU-NEXT:    add sp, #8 +; GNU-NEXT:    pop {r7, pc} +; +; GNUEABI-LABEL: test_sincos_f16_only_use_sin: +; GNUEABI:       @ %bb.0: +; GNUEABI-NEXT:    .save {r11, lr} +; GNUEABI-NEXT:    push {r11, lr} +; GNUEABI-NEXT:    .pad #8 +; GNUEABI-NEXT:    sub sp, sp, #8 +; GNUEABI-NEXT:    bl __gnu_h2f_ieee +; GNUEABI-NEXT:    add r1, sp, #4 +; GNUEABI-NEXT:    mov r2, sp +; GNUEABI-NEXT:    bl sincosf +; GNUEABI-NEXT:    ldr r0, [sp, #4] +; GNUEABI-NEXT:    bl 
__gnu_f2h_ieee +; GNUEABI-NEXT:    add sp, sp, #8 +; GNUEABI-NEXT:    pop {r11, pc} +; +; IOS-NO-STRET-LABEL: test_sincos_f16_only_use_sin: +; IOS-NO-STRET:       @ %bb.0: +; IOS-NO-STRET-NEXT:    push {lr} +; IOS-NO-STRET-NEXT:    bl ___extendhfsf2 +; IOS-NO-STRET-NEXT:    bl _sinf +; IOS-NO-STRET-NEXT:    bl ___truncsfhf2 +; IOS-NO-STRET-NEXT:    pop {lr} +; IOS-NO-STRET-NEXT:    bx lr +; +; IOS-WITH-STRET-LABEL: test_sincos_f16_only_use_sin: +; IOS-WITH-STRET:       @ %bb.0: +; IOS-WITH-STRET-NEXT:    push {lr} +; IOS-WITH-STRET-NEXT:    sub sp, sp, #8 +; IOS-WITH-STRET-NEXT:    bl ___extendhfsf2 +; IOS-WITH-STRET-NEXT:    mov r1, r0 +; IOS-WITH-STRET-NEXT:    mov r0, sp +; IOS-WITH-STRET-NEXT:    bl ___sincosf_stret +; IOS-WITH-STRET-NEXT:    ldr r0, [sp] +; IOS-WITH-STRET-NEXT:    bl ___truncsfhf2 +; IOS-WITH-STRET-NEXT:    add sp, sp, #8 +; IOS-WITH-STRET-NEXT:    pop {lr} +; IOS-WITH-STRET-NEXT:    bx lr +; +; WATCHABI-LABEL: test_sincos_f16_only_use_sin: +; WATCHABI:         .cfi_startproc +; WATCHABI-NEXT:  @ %bb.0: +; WATCHABI-NEXT:    push {r7, lr} +; WATCHABI-NEXT:    .cfi_def_cfa_offset 8 +; WATCHABI-NEXT:    .cfi_offset lr, -4 +; WATCHABI-NEXT:    .cfi_offset r7, -8 +; WATCHABI-NEXT:    sub sp, #8 +; WATCHABI-NEXT:    .cfi_def_cfa_offset 16 +; WATCHABI-NEXT:    vcvtb.f32.f16 s0, s0 +; WATCHABI-NEXT:    bl ___sincosf_stret +; WATCHABI-NEXT:    vcvtb.f16.f32 s0, s0 +; WATCHABI-NEXT:    add sp, #8 +; WATCHABI-NEXT:    pop {r7, pc} +; WATCHABI-NEXT:    .cfi_endproc    %result = call { half, half } @llvm.sincos.f16(half %a)    %result.0 = extractvalue { half, half } %result, 0    ret half %result.0  }  define half @test_sincos_f16_only_use_cos(half %a) { -; CHECK-LABEL: test_sincos_f16_only_use_cos: -; CHECK:       @ %bb.0: -; CHECK-NEXT:    push {r7, lr} -; CHECK-NEXT:    sub sp, #8 -; CHECK-NEXT:    bl __gnu_h2f_ieee -; CHECK-NEXT:    add r1, sp, #4 -; CHECK-NEXT:    mov r2, sp -; CHECK-NEXT:    bl sincosf -; CHECK-NEXT:    ldr r0, [sp] -; CHECK-NEXT:    bl __gnu_f2h_ieee -; CHECK-NEXT:    add sp, #8 -; CHECK-NEXT:    pop {r7, pc} +; GNU-LABEL: test_sincos_f16_only_use_cos: +; GNU:       @ %bb.0: +; GNU-NEXT:    push {r7, lr} +; GNU-NEXT:    sub sp, #8 +; GNU-NEXT:    bl __gnu_h2f_ieee +; GNU-NEXT:    add r1, sp, #4 +; GNU-NEXT:    mov r2, sp +; GNU-NEXT:    bl sincosf +; GNU-NEXT:    ldr r0, [sp] +; GNU-NEXT:    bl __gnu_f2h_ieee +; GNU-NEXT:    add sp, #8 +; GNU-NEXT:    pop {r7, pc} +; +; GNUEABI-LABEL: test_sincos_f16_only_use_cos: +; GNUEABI:       @ %bb.0: +; GNUEABI-NEXT:    .save {r11, lr} +; GNUEABI-NEXT:    push {r11, lr} +; GNUEABI-NEXT:    .pad #8 +; GNUEABI-NEXT:    sub sp, sp, #8 +; GNUEABI-NEXT:    bl __gnu_h2f_ieee +; GNUEABI-NEXT:    add r1, sp, #4 +; GNUEABI-NEXT:    mov r2, sp +; GNUEABI-NEXT:    bl sincosf +; GNUEABI-NEXT:    ldr r0, [sp] +; GNUEABI-NEXT:    bl __gnu_f2h_ieee +; GNUEABI-NEXT:    add sp, sp, #8 +; GNUEABI-NEXT:    pop {r11, pc} +; +; IOS-NO-STRET-LABEL: test_sincos_f16_only_use_cos: +; IOS-NO-STRET:       @ %bb.0: +; IOS-NO-STRET-NEXT:    push {lr} +; IOS-NO-STRET-NEXT:    bl ___extendhfsf2 +; IOS-NO-STRET-NEXT:    bl _cosf +; IOS-NO-STRET-NEXT:    bl ___truncsfhf2 +; IOS-NO-STRET-NEXT:    pop {lr} +; IOS-NO-STRET-NEXT:    bx lr +; +; IOS-WITH-STRET-LABEL: test_sincos_f16_only_use_cos: +; IOS-WITH-STRET:       @ %bb.0: +; IOS-WITH-STRET-NEXT:    push {lr} +; IOS-WITH-STRET-NEXT:    sub sp, sp, #8 +; IOS-WITH-STRET-NEXT:    bl ___extendhfsf2 +; IOS-WITH-STRET-NEXT:    mov r1, r0 +; IOS-WITH-STRET-NEXT:    mov r0, sp +; IOS-WITH-STRET-NEXT:    
bl ___sincosf_stret +; IOS-WITH-STRET-NEXT:    ldr r0, [sp, #4] +; IOS-WITH-STRET-NEXT:    bl ___truncsfhf2 +; IOS-WITH-STRET-NEXT:    add sp, sp, #8 +; IOS-WITH-STRET-NEXT:    pop {lr} +; IOS-WITH-STRET-NEXT:    bx lr +; +; WATCHABI-LABEL: test_sincos_f16_only_use_cos: +; WATCHABI:         .cfi_startproc +; WATCHABI-NEXT:  @ %bb.0: +; WATCHABI-NEXT:    push {r7, lr} +; WATCHABI-NEXT:    .cfi_def_cfa_offset 8 +; WATCHABI-NEXT:    .cfi_offset lr, -4 +; WATCHABI-NEXT:    .cfi_offset r7, -8 +; WATCHABI-NEXT:    sub sp, #8 +; WATCHABI-NEXT:    .cfi_def_cfa_offset 16 +; WATCHABI-NEXT:    vcvtb.f32.f16 s0, s0 +; WATCHABI-NEXT:    bl ___sincosf_stret +; WATCHABI-NEXT:    vcvtb.f16.f32 s0, s1 +; WATCHABI-NEXT:    add sp, #8 +; WATCHABI-NEXT:    pop {r7, pc} +; WATCHABI-NEXT:    .cfi_endproc    %result = call { half, half } @llvm.sincos.f16(half %a)    %result.1 = extractvalue { half, half } %result, 1    ret half %result.1  }  define { <2 x half>, <2 x half> } @test_sincos_v2f16(<2 x half> %a) { -; CHECK-LABEL: test_sincos_v2f16: -; CHECK:       @ %bb.0: -; CHECK-NEXT:    push {r4, lr} -; CHECK-NEXT:    vpush {d8} -; CHECK-NEXT:    sub sp, #24 -; CHECK-NEXT:    mov r4, r0 -; CHECK-NEXT:    mov r0, r1 -; CHECK-NEXT:    bl __gnu_h2f_ieee -; CHECK-NEXT:    add r1, sp, #12 -; CHECK-NEXT:    add r2, sp, #8 -; CHECK-NEXT:    bl sincosf -; CHECK-NEXT:    mov r0, r4 -; CHECK-NEXT:    bl __gnu_h2f_ieee -; CHECK-NEXT:    add r1, sp, #4 -; CHECK-NEXT:    mov r2, sp -; CHECK-NEXT:    bl sincosf -; CHECK-NEXT:    ldr r0, [sp, #12] -; CHECK-NEXT:    bl __gnu_f2h_ieee -; CHECK-NEXT:    ldr r1, [sp, #4] -; CHECK-NEXT:    strh.w r0, [sp, #22] -; CHECK-NEXT:    mov r0, r1 -; CHECK-NEXT:    bl __gnu_f2h_ieee -; CHECK-NEXT:    strh.w r0, [sp, #20] -; CHECK-NEXT:    add r0, sp, #20 -; CHECK-NEXT:    vld1.32 {d8[0]}, [r0:32] -; CHECK-NEXT:    ldr r0, [sp, #8] -; CHECK-NEXT:    bl __gnu_f2h_ieee -; CHECK-NEXT:    ldr r1, [sp] -; CHECK-NEXT:    strh.w r0, [sp, #18] -; CHECK-NEXT:    mov r0, r1 -; CHECK-NEXT:    bl __gnu_f2h_ieee -; CHECK-NEXT:    strh.w r0, [sp, #16] -; CHECK-NEXT:    add r0, sp, #16 -; CHECK-NEXT:    vmovl.u16 q9, d8 -; CHECK-NEXT:    vld1.32 {d16[0]}, [r0:32] -; CHECK-NEXT:    vmovl.u16 q8, d16 -; CHECK-NEXT:    vmov.32 r0, d18[0] -; CHECK-NEXT:    vmov.32 r1, d18[1] -; CHECK-NEXT:    vmov.32 r2, d16[0] -; CHECK-NEXT:    vmov.32 r3, d16[1] -; CHECK-NEXT:    add sp, #24 -; CHECK-NEXT:    vpop {d8} -; CHECK-NEXT:    pop {r4, pc} +; GNU-LABEL: test_sincos_v2f16: +; GNU:       @ %bb.0: +; GNU-NEXT:    push {r4, lr} +; GNU-NEXT:    vpush {d8} +; GNU-NEXT:    sub sp, #24 +; GNU-NEXT:    mov r4, r0 +; GNU-NEXT:    mov r0, r1 +; GNU-NEXT:    bl __gnu_h2f_ieee +; GNU-NEXT:    add r1, sp, #12 +; GNU-NEXT:    add r2, sp, #8 +; GNU-NEXT:    bl sincosf +; GNU-NEXT:    mov r0, r4 +; GNU-NEXT:    bl __gnu_h2f_ieee +; GNU-NEXT:    add r1, sp, #4 +; GNU-NEXT:    mov r2, sp +; GNU-NEXT:    bl sincosf +; GNU-NEXT:    ldr r0, [sp, #12] +; GNU-NEXT:    bl __gnu_f2h_ieee +; GNU-NEXT:    ldr r1, [sp, #4] +; GNU-NEXT:    strh.w r0, [sp, #22] +; GNU-NEXT:    mov r0, r1 +; GNU-NEXT:    bl __gnu_f2h_ieee +; GNU-NEXT:    strh.w r0, [sp, #20] +; GNU-NEXT:    add r0, sp, #20 +; GNU-NEXT:    vld1.32 {d8[0]}, [r0:32] +; GNU-NEXT:    ldr r0, [sp, #8] +; GNU-NEXT:    bl __gnu_f2h_ieee +; GNU-NEXT:    ldr r1, [sp] +; GNU-NEXT:    strh.w r0, [sp, #18] +; GNU-NEXT:    mov r0, r1 +; GNU-NEXT:    bl __gnu_f2h_ieee +; GNU-NEXT:    strh.w r0, [sp, #16] +; GNU-NEXT:    add r0, sp, #16 +; GNU-NEXT:    vmovl.u16 q9, d8 +; GNU-NEXT:    vld1.32 
{d16[0]}, [r0:32] +; GNU-NEXT:    vmovl.u16 q8, d16 +; GNU-NEXT:    vmov.32 r0, d18[0] +; GNU-NEXT:    vmov.32 r1, d18[1] +; GNU-NEXT:    vmov.32 r2, d16[0] +; GNU-NEXT:    vmov.32 r3, d16[1] +; GNU-NEXT:    add sp, #24 +; GNU-NEXT:    vpop {d8} +; GNU-NEXT:    pop {r4, pc} +; +; GNUEABI-LABEL: test_sincos_v2f16: +; GNUEABI:       @ %bb.0: +; GNUEABI-NEXT:    .save {r4, lr} +; GNUEABI-NEXT:    push {r4, lr} +; GNUEABI-NEXT:    .vsave {d8} +; GNUEABI-NEXT:    vpush {d8} +; GNUEABI-NEXT:    .pad #24 +; GNUEABI-NEXT:    sub sp, sp, #24 +; GNUEABI-NEXT:    mov r4, r0 +; GNUEABI-NEXT:    mov r0, r1 +; GNUEABI-NEXT:    bl __gnu_h2f_ieee +; GNUEABI-NEXT:    add r1, sp, #12 +; GNUEABI-NEXT:    add r2, sp, #8 +; GNUEABI-NEXT:    bl sincosf +; GNUEABI-NEXT:    mov r0, r4 +; GNUEABI-NEXT:    bl __gnu_h2f_ieee +; GNUEABI-NEXT:    add r1, sp, #4 +; GNUEABI-NEXT:    mov r2, sp +; GNUEABI-NEXT:    bl sincosf +; GNUEABI-NEXT:    ldr r0, [sp, #12] +; GNUEABI-NEXT:    bl __gnu_f2h_ieee +; GNUEABI-NEXT:    ldr r1, [sp, #4] +; GNUEABI-NEXT:    strh r0, [sp, #22] +; GNUEABI-NEXT:    mov r0, r1 +; GNUEABI-NEXT:    bl __gnu_f2h_ieee +; GNUEABI-NEXT:    strh r0, [sp, #20] +; GNUEABI-NEXT:    add r0, sp, #20 +; GNUEABI-NEXT:    vld1.32 {d8[0]}, [r0:32] +; GNUEABI-NEXT:    ldr r0, [sp, #8] +; GNUEABI-NEXT:    bl __gnu_f2h_ieee +; GNUEABI-NEXT:    ldr r1, [sp] +; GNUEABI-NEXT:    strh r0, [sp, #18] +; GNUEABI-NEXT:    mov r0, r1 +; GNUEABI-NEXT:    bl __gnu_f2h_ieee +; GNUEABI-NEXT:    strh r0, [sp, #16] +; GNUEABI-NEXT:    add r0, sp, #16 +; GNUEABI-NEXT:    vmovl.u16 q9, d8 +; GNUEABI-NEXT:    vld1.32 {d16[0]}, [r0:32] +; GNUEABI-NEXT:    vmovl.u16 q8, d16 +; GNUEABI-NEXT:    vmov.32 r0, d18[0] +; GNUEABI-NEXT:    vmov.32 r1, d18[1] +; GNUEABI-NEXT:    vmov.32 r2, d16[0] +; GNUEABI-NEXT:    vmov.32 r3, d16[1] +; GNUEABI-NEXT:    add sp, sp, #24 +; GNUEABI-NEXT:    vpop {d8} +; GNUEABI-NEXT:    pop {r4, pc} +; +; IOS-NO-STRET-LABEL: test_sincos_v2f16: +; IOS-NO-STRET:       @ %bb.0: +; IOS-NO-STRET-NEXT:    push {r4, r5, lr} +; IOS-NO-STRET-NEXT:    vpush {d8} +; IOS-NO-STRET-NEXT:    sub sp, sp, #8 +; IOS-NO-STRET-NEXT:    mov r5, r0 +; IOS-NO-STRET-NEXT:    mov r0, r1 +; IOS-NO-STRET-NEXT:    bl ___extendhfsf2 +; IOS-NO-STRET-NEXT:    mov r4, r0 +; IOS-NO-STRET-NEXT:    bl _sinf +; IOS-NO-STRET-NEXT:    bl ___truncsfhf2 +; IOS-NO-STRET-NEXT:    strh r0, [sp, #6] +; IOS-NO-STRET-NEXT:    mov r0, r5 +; IOS-NO-STRET-NEXT:    bl ___extendhfsf2 +; IOS-NO-STRET-NEXT:    mov r5, r0 +; IOS-NO-STRET-NEXT:    bl _sinf +; IOS-NO-STRET-NEXT:    bl ___truncsfhf2 +; IOS-NO-STRET-NEXT:    strh r0, [sp, #4] +; IOS-NO-STRET-NEXT:    add r0, sp, #4 +; IOS-NO-STRET-NEXT:    vld1.32 {d8[0]}, [r0:32] +; IOS-NO-STRET-NEXT:    mov r0, r4 +; IOS-NO-STRET-NEXT:    bl _cosf +; IOS-NO-STRET-NEXT:    bl ___truncsfhf2 +; IOS-NO-STRET-NEXT:    strh r0, [sp, #2] +; IOS-NO-STRET-NEXT:    mov r0, r5 +; IOS-NO-STRET-NEXT:    bl _cosf +; IOS-NO-STRET-NEXT:    bl ___truncsfhf2 +; IOS-NO-STRET-NEXT:    strh r0, [sp] +; IOS-NO-STRET-NEXT:    mov r0, sp +; IOS-NO-STRET-NEXT:    vld1.32 {d16[0]}, [r0:32] +; IOS-NO-STRET-NEXT:    vmovl.u16 q9, d8 +; IOS-NO-STRET-NEXT:    vmovl.u16 q8, d16 +; IOS-NO-STRET-NEXT:    vmov.32 r0, d18[0] +; IOS-NO-STRET-NEXT:    vmov.32 r1, d18[1] +; IOS-NO-STRET-NEXT:    vmov.32 r2, d16[0] +; IOS-NO-STRET-NEXT:    vmov.32 r3, d16[1] +; IOS-NO-STRET-NEXT:    add sp, sp, #8 +; IOS-NO-STRET-NEXT:    vpop {d8} +; IOS-NO-STRET-NEXT:    pop {r4, r5, pc} +; +; IOS-WITH-STRET-LABEL: test_sincos_v2f16: +; IOS-WITH-STRET:       @ 
%bb.0: +; IOS-WITH-STRET-NEXT:    push {r4, r5, lr} +; IOS-WITH-STRET-NEXT:    vpush {d8} +; IOS-WITH-STRET-NEXT:    sub sp, sp, #24 +; IOS-WITH-STRET-NEXT:    mov r4, r0 +; IOS-WITH-STRET-NEXT:    mov r0, r1 +; IOS-WITH-STRET-NEXT:    bl ___extendhfsf2 +; IOS-WITH-STRET-NEXT:    mov r1, r0 +; IOS-WITH-STRET-NEXT:    add r0, sp, #8 +; IOS-WITH-STRET-NEXT:    bl ___sincosf_stret +; IOS-WITH-STRET-NEXT:    mov r0, r4 +; IOS-WITH-STRET-NEXT:    bl ___extendhfsf2 +; IOS-WITH-STRET-NEXT:    mov r1, r0 +; IOS-WITH-STRET-NEXT:    mov r0, sp +; IOS-WITH-STRET-NEXT:    bl ___sincosf_stret +; IOS-WITH-STRET-NEXT:    ldr r0, [sp, #8] +; IOS-WITH-STRET-NEXT:    ldr r4, [sp, #12] +; IOS-WITH-STRET-NEXT:    bl ___truncsfhf2 +; IOS-WITH-STRET-NEXT:    ldm sp, {r1, r5} +; IOS-WITH-STRET-NEXT:    strh r0, [sp, #22] +; IOS-WITH-STRET-NEXT:    mov r0, r1 +; IOS-WITH-STRET-NEXT:    bl ___truncsfhf2 +; IOS-WITH-STRET-NEXT:    strh r0, [sp, #20] +; IOS-WITH-STRET-NEXT:    add r0, sp, #20 +; IOS-WITH-STRET-NEXT:    vld1.32 {d8[0]}, [r0:32] +; IOS-WITH-STRET-NEXT:    mov r0, r4 +; IOS-WITH-STRET-NEXT:    bl ___truncsfhf2 +; IOS-WITH-STRET-NEXT:    strh r0, [sp, #18] +; IOS-WITH-STRET-NEXT:    mov r0, r5 +; IOS-WITH-STRET-NEXT:    bl ___truncsfhf2 +; IOS-WITH-STRET-NEXT:    strh r0, [sp, #16] +; IOS-WITH-STRET-NEXT:    add r0, sp, #16 +; IOS-WITH-STRET-NEXT:    vmovl.u16 q9, d8 +; IOS-WITH-STRET-NEXT:    vld1.32 {d16[0]}, [r0:32] +; IOS-WITH-STRET-NEXT:    vmovl.u16 q8, d16 +; IOS-WITH-STRET-NEXT:    vmov.32 r0, d18[0] +; IOS-WITH-STRET-NEXT:    vmov.32 r1, d18[1] +; IOS-WITH-STRET-NEXT:    vmov.32 r2, d16[0] +; IOS-WITH-STRET-NEXT:    vmov.32 r3, d16[1] +; IOS-WITH-STRET-NEXT:    add sp, sp, #24 +; IOS-WITH-STRET-NEXT:    vpop {d8} +; IOS-WITH-STRET-NEXT:    pop {r4, r5, pc} +; +; WATCHABI-LABEL: test_sincos_v2f16: +; WATCHABI:         .cfi_startproc +; WATCHABI-NEXT:  @ %bb.0: +; WATCHABI-NEXT:    push {r7, lr} +; WATCHABI-NEXT:    .cfi_def_cfa_offset 8 +; WATCHABI-NEXT:    .cfi_offset lr, -4 +; WATCHABI-NEXT:    .cfi_offset r7, -8 +; WATCHABI-NEXT:    vpush {d10} +; WATCHABI-NEXT:    .cfi_def_cfa_offset 16 +; WATCHABI-NEXT:    vpush {d8} +; WATCHABI-NEXT:    .cfi_def_cfa_offset 24 +; WATCHABI-NEXT:    .cfi_offset d10, -16 +; WATCHABI-NEXT:    .cfi_offset d8, -24 +; WATCHABI-NEXT:    sub sp, #8 +; WATCHABI-NEXT:    .cfi_def_cfa_offset 32 +; WATCHABI-NEXT:    vmov.f32 s16, s0 +; WATCHABI-NEXT:    vcvtb.f32.f16 s0, s1 +; WATCHABI-NEXT:    bl ___sincosf_stret +; WATCHABI-NEXT:    vcvtb.f16.f32 s0, s0 +; WATCHABI-NEXT:    vcvtb.f32.f16 s4, s16 +; WATCHABI-NEXT:    vmov r0, s0 +; WATCHABI-NEXT:    vmov.f32 s0, s4 +; WATCHABI-NEXT:    vmov.f32 s20, s1 +; WATCHABI-NEXT:    strh.w r0, [sp, #6] +; WATCHABI-NEXT:    bl ___sincosf_stret +; WATCHABI-NEXT:    vcvtb.f16.f32 s0, s0 +; WATCHABI-NEXT:    vmov r0, s0 +; WATCHABI-NEXT:    vcvtb.f16.f32 s0, s20 +; WATCHABI-NEXT:    strh.w r0, [sp, #4] +; WATCHABI-NEXT:    add r0, sp, #4 +; WATCHABI-NEXT:    vld1.32 {d16[0]}, [r0:32] +; WATCHABI-NEXT:    vmov r0, s0 +; WATCHABI-NEXT:    vcvtb.f16.f32 s0, s1 +; WATCHABI-NEXT:    strh.w r0, [sp, #2] +; WATCHABI-NEXT:    vmov r0, s0 +; WATCHABI-NEXT:    vmovl.u16 q0, d16 +; WATCHABI-NEXT:    strh.w r0, [sp] +; WATCHABI-NEXT:    mov r0, sp +; WATCHABI-NEXT:    vld1.32 {d18[0]}, [r0:32] +; WATCHABI-NEXT:    vmovl.u16 q1, d18 +; WATCHABI-NEXT:    vmov.f32 s2, s4 +; WATCHABI-NEXT:    vmov.f32 s3, s5 +; WATCHABI-NEXT:    add sp, #8 +; WATCHABI-NEXT:    vpop {d8} +; WATCHABI-NEXT:    vpop {d10} +; WATCHABI-NEXT:    pop {r7, pc} +; 
WATCHABI-NEXT:    .cfi_endproc    %result = call { <2 x half>, <2 x half> } @llvm.sincos.v2f16(<2 x half> %a)    ret { <2 x half>, <2 x half> } %result  }  define { float, float } @test_sincos_f32(float %a) { -; CHECK-LABEL: test_sincos_f32: -; CHECK:       @ %bb.0: -; CHECK-NEXT:    push {r7, lr} -; CHECK-NEXT:    sub sp, #8 -; CHECK-NEXT:    add r1, sp, #4 -; CHECK-NEXT:    mov r2, sp -; CHECK-NEXT:    bl sincosf -; CHECK-NEXT:    ldrd r1, r0, [sp], #8 -; CHECK-NEXT:    pop {r7, pc} +; GNU-LABEL: test_sincos_f32: +; GNU:       @ %bb.0: +; GNU-NEXT:    push {r7, lr} +; GNU-NEXT:    sub sp, #8 +; GNU-NEXT:    add r1, sp, #4 +; GNU-NEXT:    mov r2, sp +; GNU-NEXT:    bl sincosf +; GNU-NEXT:    ldrd r1, r0, [sp], #8 +; GNU-NEXT:    pop {r7, pc} +; +; GNUEABI-LABEL: test_sincos_f32: +; GNUEABI:       @ %bb.0: +; GNUEABI-NEXT:    .save {r11, lr} +; GNUEABI-NEXT:    push {r11, lr} +; GNUEABI-NEXT:    .pad #8 +; GNUEABI-NEXT:    sub sp, sp, #8 +; GNUEABI-NEXT:    add r1, sp, #4 +; GNUEABI-NEXT:    mov r2, sp +; GNUEABI-NEXT:    bl sincosf +; GNUEABI-NEXT:    ldr r0, [sp, #4] +; GNUEABI-NEXT:    ldr r1, [sp], #8 +; GNUEABI-NEXT:    pop {r11, pc} +; +; IOS-NO-STRET-LABEL: test_sincos_f32: +; IOS-NO-STRET:       @ %bb.0: +; IOS-NO-STRET-NEXT:    push {r4, r5, lr} +; IOS-NO-STRET-NEXT:    mov r4, r0 +; IOS-NO-STRET-NEXT:    bl _sinf +; IOS-NO-STRET-NEXT:    mov r5, r0 +; IOS-NO-STRET-NEXT:    mov r0, r4 +; IOS-NO-STRET-NEXT:    bl _cosf +; IOS-NO-STRET-NEXT:    mov r1, r0 +; IOS-NO-STRET-NEXT:    mov r0, r5 +; IOS-NO-STRET-NEXT:    pop {r4, r5, pc} +; +; IOS-WITH-STRET-LABEL: test_sincos_f32: +; IOS-WITH-STRET:       @ %bb.0: +; IOS-WITH-STRET-NEXT:    push {lr} +; IOS-WITH-STRET-NEXT:    sub sp, sp, #8 +; IOS-WITH-STRET-NEXT:    mov r1, r0 +; IOS-WITH-STRET-NEXT:    mov r0, sp +; IOS-WITH-STRET-NEXT:    bl ___sincosf_stret +; IOS-WITH-STRET-NEXT:    pop {r0, r1} +; IOS-WITH-STRET-NEXT:    pop {lr} +; IOS-WITH-STRET-NEXT:    bx lr +; +; WATCHABI-LABEL: test_sincos_f32: +; WATCHABI:         .cfi_startproc +; WATCHABI-NEXT:  @ %bb.0: +; WATCHABI-NEXT:    push {r7, lr} +; WATCHABI-NEXT:    .cfi_def_cfa_offset 8 +; WATCHABI-NEXT:    .cfi_offset lr, -4 +; WATCHABI-NEXT:    .cfi_offset r7, -8 +; WATCHABI-NEXT:    sub sp, #8 +; WATCHABI-NEXT:    .cfi_def_cfa_offset 16 +; WATCHABI-NEXT:    bl ___sincosf_stret +; WATCHABI-NEXT:    add sp, #8 +; WATCHABI-NEXT:    pop {r7, pc} +; WATCHABI-NEXT:    .cfi_endproc    %result = call { float, float } @llvm.sincos.f32(float %a)    ret { float, float } %result  }  define { <2 x float>, <2 x float> } @test_sincos_v2f32(<2 x float> %a) { -; CHECK-LABEL: test_sincos_v2f32: -; CHECK:       @ %bb.0: -; CHECK-NEXT:    push {r7, lr} -; CHECK-NEXT:    vpush {d8} -; CHECK-NEXT:    sub sp, #16 -; CHECK-NEXT:    vmov d8, r0, r1 -; CHECK-NEXT:    add r1, sp, #4 -; CHECK-NEXT:    mov r2, sp -; CHECK-NEXT:    vmov r0, s17 -; CHECK-NEXT:    bl sincosf -; CHECK-NEXT:    vmov r0, s16 -; CHECK-NEXT:    add r1, sp, #12 -; CHECK-NEXT:    add r2, sp, #8 -; CHECK-NEXT:    bl sincosf -; CHECK-NEXT:    vldr s1, [sp, #4] -; CHECK-NEXT:    vldr s3, [sp] -; CHECK-NEXT:    vldr s0, [sp, #12] -; CHECK-NEXT:    vldr s2, [sp, #8] -; CHECK-NEXT:    vmov r0, r1, d0 -; CHECK-NEXT:    vmov r2, r3, d1 -; CHECK-NEXT:    add sp, #16 -; CHECK-NEXT:    vpop {d8} -; CHECK-NEXT:    pop {r7, pc} +; GNU-LABEL: test_sincos_v2f32: +; GNU:       @ %bb.0: +; GNU-NEXT:    push {r7, lr} +; GNU-NEXT:    vpush {d8} +; GNU-NEXT:    sub sp, #16 +; GNU-NEXT:    vmov d8, r0, r1 +; GNU-NEXT:    add r1, sp, #4 +; GNU-NEXT:   
 mov r2, sp +; GNU-NEXT:    vmov r0, s17 +; GNU-NEXT:    bl sincosf +; GNU-NEXT:    vmov r0, s16 +; GNU-NEXT:    add r1, sp, #12 +; GNU-NEXT:    add r2, sp, #8 +; GNU-NEXT:    bl sincosf +; GNU-NEXT:    vldr s1, [sp, #4] +; GNU-NEXT:    vldr s3, [sp] +; GNU-NEXT:    vldr s0, [sp, #12] +; GNU-NEXT:    vldr s2, [sp, #8] +; GNU-NEXT:    vmov r0, r1, d0 +; GNU-NEXT:    vmov r2, r3, d1 +; GNU-NEXT:    add sp, #16 +; GNU-NEXT:    vpop {d8} +; GNU-NEXT:    pop {r7, pc} +; +; GNUEABI-LABEL: test_sincos_v2f32: +; GNUEABI:       @ %bb.0: +; GNUEABI-NEXT:    .save {r11, lr} +; GNUEABI-NEXT:    push {r11, lr} +; GNUEABI-NEXT:    .vsave {d8} +; GNUEABI-NEXT:    vpush {d8} +; GNUEABI-NEXT:    .pad #16 +; GNUEABI-NEXT:    sub sp, sp, #16 +; GNUEABI-NEXT:    vmov d8, r0, r1 +; GNUEABI-NEXT:    add r1, sp, #4 +; GNUEABI-NEXT:    mov r2, sp +; GNUEABI-NEXT:    vmov r0, s17 +; GNUEABI-NEXT:    bl sincosf +; GNUEABI-NEXT:    vmov r0, s16 +; GNUEABI-NEXT:    add r1, sp, #12 +; GNUEABI-NEXT:    add r2, sp, #8 +; GNUEABI-NEXT:    bl sincosf +; GNUEABI-NEXT:    vldr s1, [sp, #4] +; GNUEABI-NEXT:    vldr s3, [sp] +; GNUEABI-NEXT:    vldr s0, [sp, #12] +; GNUEABI-NEXT:    vldr s2, [sp, #8] +; GNUEABI-NEXT:    vmov r0, r1, d0 +; GNUEABI-NEXT:    vmov r2, r3, d1 +; GNUEABI-NEXT:    add sp, sp, #16 +; GNUEABI-NEXT:    vpop {d8} +; GNUEABI-NEXT:    pop {r11, pc} +; +; IOS-NO-STRET-LABEL: test_sincos_v2f32: +; IOS-NO-STRET:       @ %bb.0: +; IOS-NO-STRET-NEXT:    push {r4, r5, r6, r7, lr} +; IOS-NO-STRET-NEXT:    vpush {d8} +; IOS-NO-STRET-NEXT:    vmov d8, r0, r1 +; IOS-NO-STRET-NEXT:    vmov r4, s17 +; IOS-NO-STRET-NEXT:    mov r0, r4 +; IOS-NO-STRET-NEXT:    bl _sinf +; IOS-NO-STRET-NEXT:    mov r5, r0 +; IOS-NO-STRET-NEXT:    mov r0, r4 +; IOS-NO-STRET-NEXT:    bl _cosf +; IOS-NO-STRET-NEXT:    vmov r6, s16 +; IOS-NO-STRET-NEXT:    mov r4, r0 +; IOS-NO-STRET-NEXT:    mov r0, r6 +; IOS-NO-STRET-NEXT:    bl _sinf +; IOS-NO-STRET-NEXT:    mov r7, r0 +; IOS-NO-STRET-NEXT:    mov r0, r6 +; IOS-NO-STRET-NEXT:    bl _cosf +; IOS-NO-STRET-NEXT:    mov r2, r0 +; IOS-NO-STRET-NEXT:    mov r0, r7 +; IOS-NO-STRET-NEXT:    mov r1, r5 +; IOS-NO-STRET-NEXT:    mov r3, r4 +; IOS-NO-STRET-NEXT:    vpop {d8} +; IOS-NO-STRET-NEXT:    pop {r4, r5, r6, r7, pc} +; +; IOS-WITH-STRET-LABEL: test_sincos_v2f32: +; IOS-WITH-STRET:       @ %bb.0: +; IOS-WITH-STRET-NEXT:    push {lr} +; IOS-WITH-STRET-NEXT:    vpush {d8} +; IOS-WITH-STRET-NEXT:    sub sp, sp, #16 +; IOS-WITH-STRET-NEXT:    vmov d8, r0, r1 +; IOS-WITH-STRET-NEXT:    mov r0, sp +; IOS-WITH-STRET-NEXT:    vmov r1, s17 +; IOS-WITH-STRET-NEXT:    bl ___sincosf_stret +; IOS-WITH-STRET-NEXT:    vmov r1, s16 +; IOS-WITH-STRET-NEXT:    add r0, sp, #8 +; IOS-WITH-STRET-NEXT:    bl ___sincosf_stret +; IOS-WITH-STRET-NEXT:    vldr s1, [sp] +; IOS-WITH-STRET-NEXT:    vldr s3, [sp, #4] +; IOS-WITH-STRET-NEXT:    vldr s0, [sp, #8] +; IOS-WITH-STRET-NEXT:    vldr s2, [sp, #12] +; IOS-WITH-STRET-NEXT:    vmov r0, r1, d0 +; IOS-WITH-STRET-NEXT:    vmov r2, r3, d1 +; IOS-WITH-STRET-NEXT:    add sp, sp, #16 +; IOS-WITH-STRET-NEXT:    vpop {d8} +; IOS-WITH-STRET-NEXT:    pop {lr} +; IOS-WITH-STRET-NEXT:    bx lr +; +; WATCHABI-LABEL: test_sincos_v2f32: +; WATCHABI:         .cfi_startproc +; WATCHABI-NEXT:  @ %bb.0: +; WATCHABI-NEXT:    push {r7, lr} +; WATCHABI-NEXT:    .cfi_def_cfa_offset 8 +; WATCHABI-NEXT:    .cfi_offset lr, -4 +; WATCHABI-NEXT:    .cfi_offset r7, -8 +; WATCHABI-NEXT:    vpush {d8, d9, d10} +; WATCHABI-NEXT:    .cfi_def_cfa_offset 32 +; WATCHABI-NEXT:    .cfi_offset d10, -16 +; 
WATCHABI-NEXT:    .cfi_offset d9, -24 +; WATCHABI-NEXT:    .cfi_offset d8, -32 +; WATCHABI-NEXT:    vmov.f64 d8, d0 +; WATCHABI-NEXT:    vmov.f32 s0, s17 +; WATCHABI-NEXT:    bl ___sincosf_stret +; WATCHABI-NEXT:    vmov.f32 s19, s0 +; WATCHABI-NEXT:    vmov.f32 s0, s16 +; WATCHABI-NEXT:    vmov.f32 s21, s1 +; WATCHABI-NEXT:    bl ___sincosf_stret +; WATCHABI-NEXT:    vmov.f32 s20, s1 +; WATCHABI-NEXT:    vmov.f32 s18, s0 +; WATCHABI-NEXT:    vmov.f64 d1, d10 +; WATCHABI-NEXT:    vmov.f64 d0, d9 +; WATCHABI-NEXT:    vpop {d8, d9, d10} +; WATCHABI-NEXT:    pop {r7, pc} +; WATCHABI-NEXT:    .cfi_endproc    %result = call { <2 x float>, <2 x float> } @llvm.sincos.v2f32(<2 x float> %a)    ret { <2 x float>, <2 x float> } %result  }  define { double, double } @test_sincos_f64(double %a) { -; CHECK-LABEL: test_sincos_f64: -; CHECK:       @ %bb.0: -; CHECK-NEXT:    push {r7, lr} -; CHECK-NEXT:    sub sp, #16 -; CHECK-NEXT:    add r2, sp, #8 -; CHECK-NEXT:    mov r3, sp -; CHECK-NEXT:    bl sincos -; CHECK-NEXT:    ldrd r0, r1, [sp, #8] -; CHECK-NEXT:    ldrd r2, r3, [sp], #16 -; CHECK-NEXT:    pop {r7, pc} +; GNU-LABEL: test_sincos_f64: +; GNU:       @ %bb.0: +; GNU-NEXT:    push {r7, lr} +; GNU-NEXT:    sub sp, #16 +; GNU-NEXT:    add r2, sp, #8 +; GNU-NEXT:    mov r3, sp +; GNU-NEXT:    bl sincos +; GNU-NEXT:    ldrd r0, r1, [sp, #8] +; GNU-NEXT:    ldrd r2, r3, [sp], #16 +; GNU-NEXT:    pop {r7, pc} +; +; GNUEABI-LABEL: test_sincos_f64: +; GNUEABI:       @ %bb.0: +; GNUEABI-NEXT:    .save {r11, lr} +; GNUEABI-NEXT:    push {r11, lr} +; GNUEABI-NEXT:    .pad #16 +; GNUEABI-NEXT:    sub sp, sp, #16 +; GNUEABI-NEXT:    add r2, sp, #8 +; GNUEABI-NEXT:    mov r3, sp +; GNUEABI-NEXT:    bl sincos +; GNUEABI-NEXT:    ldm sp, {r2, r3} +; GNUEABI-NEXT:    ldr r0, [sp, #8] +; GNUEABI-NEXT:    ldr r1, [sp, #12] +; GNUEABI-NEXT:    add sp, sp, #16 +; GNUEABI-NEXT:    pop {r11, pc} +; +; IOS-NO-STRET-LABEL: test_sincos_f64: +; IOS-NO-STRET:       @ %bb.0: +; IOS-NO-STRET-NEXT:    push {r4, r5, r6, r7, lr} +; IOS-NO-STRET-NEXT:    mov r4, r1 +; IOS-NO-STRET-NEXT:    mov r5, r0 +; IOS-NO-STRET-NEXT:    bl _sin +; IOS-NO-STRET-NEXT:    mov r6, r0 +; IOS-NO-STRET-NEXT:    mov r7, r1 +; IOS-NO-STRET-NEXT:    mov r0, r5 +; IOS-NO-STRET-NEXT:    mov r1, r4 +; IOS-NO-STRET-NEXT:    bl _cos +; IOS-NO-STRET-NEXT:    mov r2, r0 +; IOS-NO-STRET-NEXT:    mov r3, r1 +; IOS-NO-STRET-NEXT:    mov r0, r6 +; IOS-NO-STRET-NEXT:    mov r1, r7 +; IOS-NO-STRET-NEXT:    pop {r4, r5, r6, r7, pc} +; +; IOS-WITH-STRET-LABEL: test_sincos_f64: +; IOS-WITH-STRET:       @ %bb.0: +; IOS-WITH-STRET-NEXT:    push {lr} +; IOS-WITH-STRET-NEXT:    sub sp, sp, #16 +; IOS-WITH-STRET-NEXT:    mov r2, r1 +; IOS-WITH-STRET-NEXT:    mov r1, r0 +; IOS-WITH-STRET-NEXT:    mov r0, sp +; IOS-WITH-STRET-NEXT:    bl ___sincos_stret +; IOS-WITH-STRET-NEXT:    vldr d16, [sp, #8] +; IOS-WITH-STRET-NEXT:    ldm sp, {r0, r1} +; IOS-WITH-STRET-NEXT:    vmov r2, r3, d16 +; IOS-WITH-STRET-NEXT:    add sp, sp, #16 +; IOS-WITH-STRET-NEXT:    pop {lr} +; IOS-WITH-STRET-NEXT:    bx lr +; +; WATCHABI-LABEL: test_sincos_f64: +; WATCHABI:         .cfi_startproc +; WATCHABI-NEXT:  @ %bb.0: +; WATCHABI-NEXT:    push {r7, lr} +; WATCHABI-NEXT:    .cfi_def_cfa_offset 8 +; WATCHABI-NEXT:    .cfi_offset lr, -4 +; WATCHABI-NEXT:    .cfi_offset r7, -8 +; WATCHABI-NEXT:    sub sp, #8 +; WATCHABI-NEXT:    .cfi_def_cfa_offset 16 +; WATCHABI-NEXT:    bl ___sincos_stret +; WATCHABI-NEXT:    add sp, #8 +; WATCHABI-NEXT:    pop {r7, pc} +; WATCHABI-NEXT:    .cfi_endproc    %result = 
call { double, double } @llvm.sincos.f64(double %a)    ret { double, double } %result  }  define { <2 x double>, <2 x double> } @test_sincos_v2f64(<2 x double> %a) { -; CHECK-LABEL: test_sincos_v2f64: -; CHECK:       @ %bb.0: -; CHECK-NEXT:    push {r4, lr} -; CHECK-NEXT:    sub sp, #32 -; CHECK-NEXT:    mov r1, r3 -; CHECK-NEXT:    mov r12, r2 -; CHECK-NEXT:    add r2, sp, #24 -; CHECK-NEXT:    add r3, sp, #16 -; CHECK-NEXT:    mov r4, r0 -; CHECK-NEXT:    mov r0, r12 -; CHECK-NEXT:    bl sincos -; CHECK-NEXT:    ldrd r0, r1, [sp, #40] -; CHECK-NEXT:    add r2, sp, #8 -; CHECK-NEXT:    mov r3, sp -; CHECK-NEXT:    bl sincos -; CHECK-NEXT:    vldr d19, [sp, #8] -; CHECK-NEXT:    vldr d18, [sp, #24] -; CHECK-NEXT:    vldr d17, [sp] -; CHECK-NEXT:    vldr d16, [sp, #16] -; CHECK-NEXT:    vst1.64 {d18, d19}, [r4]! -; CHECK-NEXT:    vst1.64 {d16, d17}, [r4] -; CHECK-NEXT:    add sp, #32 -; CHECK-NEXT:    pop {r4, pc} +; GNU-LABEL: test_sincos_v2f64: +; GNU:       @ %bb.0: +; GNU-NEXT:    push {r4, lr} +; GNU-NEXT:    sub sp, #32 +; GNU-NEXT:    mov r1, r3 +; GNU-NEXT:    mov r12, r2 +; GNU-NEXT:    add r2, sp, #24 +; GNU-NEXT:    add r3, sp, #16 +; GNU-NEXT:    mov r4, r0 +; GNU-NEXT:    mov r0, r12 +; GNU-NEXT:    bl sincos +; GNU-NEXT:    ldrd r0, r1, [sp, #40] +; GNU-NEXT:    add r2, sp, #8 +; GNU-NEXT:    mov r3, sp +; GNU-NEXT:    bl sincos +; GNU-NEXT:    vldr d19, [sp, #8] +; GNU-NEXT:    vldr d18, [sp, #24] +; GNU-NEXT:    vldr d17, [sp] +; GNU-NEXT:    vldr d16, [sp, #16] +; GNU-NEXT:    vst1.64 {d18, d19}, [r4]! +; GNU-NEXT:    vst1.64 {d16, d17}, [r4] +; GNU-NEXT:    add sp, #32 +; GNU-NEXT:    pop {r4, pc} +; +; GNUEABI-LABEL: test_sincos_v2f64: +; GNUEABI:       @ %bb.0: +; GNUEABI-NEXT:    .save {r4, lr} +; GNUEABI-NEXT:    push {r4, lr} +; GNUEABI-NEXT:    .pad #32 +; GNUEABI-NEXT:    sub sp, sp, #32 +; GNUEABI-NEXT:    mov r1, r3 +; GNUEABI-NEXT:    mov r12, r2 +; GNUEABI-NEXT:    add r2, sp, #24 +; GNUEABI-NEXT:    add r3, sp, #16 +; GNUEABI-NEXT:    mov r4, r0 +; GNUEABI-NEXT:    mov r0, r12 +; GNUEABI-NEXT:    bl sincos +; GNUEABI-NEXT:    ldr r0, [sp, #40] +; GNUEABI-NEXT:    add r2, sp, #8 +; GNUEABI-NEXT:    ldr r1, [sp, #44] +; GNUEABI-NEXT:    mov r3, sp +; GNUEABI-NEXT:    bl sincos +; GNUEABI-NEXT:    vldr d19, [sp, #8] +; GNUEABI-NEXT:    vldr d18, [sp, #24] +; GNUEABI-NEXT:    vldr d17, [sp] +; GNUEABI-NEXT:    vldr d16, [sp, #16] +; GNUEABI-NEXT:    vst1.64 {d18, d19}, [r4]! 
+; GNUEABI-NEXT:    vst1.64 {d16, d17}, [r4] +; GNUEABI-NEXT:    add sp, sp, #32 +; GNUEABI-NEXT:    pop {r4, pc} +; +; IOS-NO-STRET-LABEL: test_sincos_v2f64: +; IOS-NO-STRET:       @ %bb.0: +; IOS-NO-STRET-NEXT:    push {r4, r5, r6, r7, r8, r10, r11, lr} +; IOS-NO-STRET-NEXT:    vpush {d8, d9, d10, d11} +; IOS-NO-STRET-NEXT:    ldr r8, [sp, #64] +; IOS-NO-STRET-NEXT:    mov r7, r1 +; IOS-NO-STRET-NEXT:    mov r4, r0 +; IOS-NO-STRET-NEXT:    mov r0, r3 +; IOS-NO-STRET-NEXT:    mov r6, r3 +; IOS-NO-STRET-NEXT:    mov r10, r2 +; IOS-NO-STRET-NEXT:    mov r1, r8 +; IOS-NO-STRET-NEXT:    bl _sin +; IOS-NO-STRET-NEXT:    mov r11, r0 +; IOS-NO-STRET-NEXT:    mov r5, r1 +; IOS-NO-STRET-NEXT:    mov r0, r6 +; IOS-NO-STRET-NEXT:    mov r1, r8 +; IOS-NO-STRET-NEXT:    bl _cos +; IOS-NO-STRET-NEXT:    vmov d9, r0, r1 +; IOS-NO-STRET-NEXT:    mov r0, r7 +; IOS-NO-STRET-NEXT:    mov r1, r10 +; IOS-NO-STRET-NEXT:    vmov d11, r11, r5 +; IOS-NO-STRET-NEXT:    bl _sin +; IOS-NO-STRET-NEXT:    vmov d10, r0, r1 +; IOS-NO-STRET-NEXT:    mov r0, r7 +; IOS-NO-STRET-NEXT:    mov r1, r10 +; IOS-NO-STRET-NEXT:    bl _cos +; IOS-NO-STRET-NEXT:    vmov d8, r0, r1 +; IOS-NO-STRET-NEXT:    vst1.32 {d10, d11}, [r4]! +; IOS-NO-STRET-NEXT:    vst1.32 {d8, d9}, [r4] +; IOS-NO-STRET-NEXT:    vpop {d8, d9, d10, d11} +; IOS-NO-STRET-NEXT:    pop {r4, r5, r6, r7, r8, r10, r11, pc} +; +; IOS-WITH-STRET-LABEL: test_sincos_v2f64: +; IOS-WITH-STRET:       @ %bb.0: +; IOS-WITH-STRET-NEXT:    push {r4, r5, r6, lr} +; IOS-WITH-STRET-NEXT:    sub sp, sp, #32 +; IOS-WITH-STRET-NEXT:    mov r4, r2 +; IOS-WITH-STRET-NEXT:    ldr r2, [sp, #48] +; IOS-WITH-STRET-NEXT:    mov r6, r0 +; IOS-WITH-STRET-NEXT:    add r0, sp, #16 +; IOS-WITH-STRET-NEXT:    mov r5, r1 +; IOS-WITH-STRET-NEXT:    mov r1, r3 +; IOS-WITH-STRET-NEXT:    bl ___sincos_stret +; IOS-WITH-STRET-NEXT:    mov r0, sp +; IOS-WITH-STRET-NEXT:    mov r1, r5 +; IOS-WITH-STRET-NEXT:    mov r2, r4 +; IOS-WITH-STRET-NEXT:    bl ___sincos_stret +; IOS-WITH-STRET-NEXT:    vldr d17, [sp, #16] +; IOS-WITH-STRET-NEXT:    vldr d16, [sp] +; IOS-WITH-STRET-NEXT:    vldr d19, [sp, #24] +; IOS-WITH-STRET-NEXT:    vldr d18, [sp, #8] +; IOS-WITH-STRET-NEXT:    vst1.32 {d16, d17}, [r6]! 
+; IOS-WITH-STRET-NEXT:    vst1.32 {d18, d19}, [r6] +; IOS-WITH-STRET-NEXT:    add sp, sp, #32 +; IOS-WITH-STRET-NEXT:    pop {r4, r5, r6, pc} +; +; WATCHABI-LABEL: test_sincos_v2f64: +; WATCHABI:         .cfi_startproc +; WATCHABI-NEXT:  @ %bb.0: +; WATCHABI-NEXT:    push {r7, lr} +; WATCHABI-NEXT:    .cfi_def_cfa_offset 8 +; WATCHABI-NEXT:    .cfi_offset lr, -4 +; WATCHABI-NEXT:    .cfi_offset r7, -8 +; WATCHABI-NEXT:    vpush {d8, d9, d10, d11, d12, d13} +; WATCHABI-NEXT:    .cfi_def_cfa_offset 56 +; WATCHABI-NEXT:    .cfi_offset d13, -16 +; WATCHABI-NEXT:    .cfi_offset d12, -24 +; WATCHABI-NEXT:    .cfi_offset d11, -32 +; WATCHABI-NEXT:    .cfi_offset d10, -40 +; WATCHABI-NEXT:    .cfi_offset d9, -48 +; WATCHABI-NEXT:    .cfi_offset d8, -56 +; WATCHABI-NEXT:    sub sp, #8 +; WATCHABI-NEXT:    .cfi_def_cfa_offset 64 +; WATCHABI-NEXT:    vorr q4, q0, q0 +; WATCHABI-NEXT:    vorr d0, d9, d9 +; WATCHABI-NEXT:    bl ___sincos_stret +; WATCHABI-NEXT:    vorr d11, d0, d0 +; WATCHABI-NEXT:    vorr d0, d8, d8 +; WATCHABI-NEXT:    vorr d13, d1, d1 +; WATCHABI-NEXT:    bl ___sincos_stret +; WATCHABI-NEXT:    vorr d12, d1, d1 +; WATCHABI-NEXT:    vorr d10, d0, d0 +; WATCHABI-NEXT:    vorr q1, q6, q6 +; WATCHABI-NEXT:    vorr q0, q5, q5 +; WATCHABI-NEXT:    add sp, #8 +; WATCHABI-NEXT:    vpop {d8, d9, d10, d11, d12, d13} +; WATCHABI-NEXT:    pop {r7, pc} +; WATCHABI-NEXT:    .cfi_endproc    %result = call { <2 x double>, <2 x double> } @llvm.sincos.v2f64(<2 x double> %a)    ret { <2 x double>, <2 x double> } %result  }  define { fp128, fp128 } @test_sincos_f128(fp128 %a) { -; CHECK-LABEL: test_sincos_f128: -; CHECK:       @ %bb.0: -; CHECK-NEXT:    push {r4, r5, r7, lr} -; CHECK-NEXT:    sub sp, #40 -; CHECK-NEXT:    mov r12, r3 -; CHECK-NEXT:    ldr r3, [sp, #56] -; CHECK-NEXT:    add.w lr, sp, #8 -; CHECK-NEXT:    mov r4, r0 -; CHECK-NEXT:    add r0, sp, #24 -; CHECK-NEXT:    strd r0, lr, [sp] -; CHECK-NEXT:    mov r0, r1 -; CHECK-NEXT:    mov r1, r2 -; CHECK-NEXT:    mov r2, r12 -; CHECK-NEXT:    bl sincosl -; CHECK-NEXT:    ldrd r2, r3, [sp, #16] -; CHECK-NEXT:    ldrd r12, r1, [sp, #8] -; CHECK-NEXT:    str r3, [r4, #28] -; CHECK-NEXT:    ldrd r3, r5, [sp, #32] -; CHECK-NEXT:    ldrd lr, r0, [sp, #24] -; CHECK-NEXT:    strd r1, r2, [r4, #20] -; CHECK-NEXT:    add.w r1, r4, #8 -; CHECK-NEXT:    stm.w r1, {r3, r5, r12} -; CHECK-NEXT:    strd lr, r0, [r4] -; CHECK-NEXT:    add sp, #40 -; CHECK-NEXT:    pop {r4, r5, r7, pc} +; GNU-LABEL: test_sincos_f128: +; GNU:       @ %bb.0: +; GNU-NEXT:    push {r4, r5, r7, lr} +; GNU-NEXT:    sub sp, #40 +; GNU-NEXT:    mov r12, r3 +; GNU-NEXT:    ldr r3, [sp, #56] +; GNU-NEXT:    add.w lr, sp, #8 +; GNU-NEXT:    mov r4, r0 +; GNU-NEXT:    add r0, sp, #24 +; GNU-NEXT:    strd r0, lr, [sp] +; GNU-NEXT:    mov r0, r1 +; GNU-NEXT:    mov r1, r2 +; GNU-NEXT:    mov r2, r12 +; GNU-NEXT:    bl sincosl +; GNU-NEXT:    ldrd r2, r3, [sp, #16] +; GNU-NEXT:    ldrd r12, r1, [sp, #8] +; GNU-NEXT:    str r3, [r4, #28] +; GNU-NEXT:    ldrd r3, r5, [sp, #32] +; GNU-NEXT:    ldrd lr, r0, [sp, #24] +; GNU-NEXT:    strd r1, r2, [r4, #20] +; GNU-NEXT:    add.w r1, r4, #8 +; GNU-NEXT:    stm.w r1, {r3, r5, r12} +; GNU-NEXT:    strd lr, r0, [r4] +; GNU-NEXT:    add sp, #40 +; GNU-NEXT:    pop {r4, r5, r7, pc} +; +; GNUEABI-LABEL: test_sincos_f128: +; GNUEABI:       @ %bb.0: +; GNUEABI-NEXT:    .save {r4, r5, r11, lr} +; GNUEABI-NEXT:    push {r4, r5, r11, lr} +; GNUEABI-NEXT:    .pad #40 +; GNUEABI-NEXT:    sub sp, sp, #40 +; GNUEABI-NEXT:    mov r12, r3 +; GNUEABI-NEXT:    ldr 
r3, [sp, #56] +; GNUEABI-NEXT:    mov r4, r0 +; GNUEABI-NEXT:    add r0, sp, #24 +; GNUEABI-NEXT:    add r5, sp, #8 +; GNUEABI-NEXT:    stm sp, {r0, r5} +; GNUEABI-NEXT:    mov r0, r1 +; GNUEABI-NEXT:    mov r1, r2 +; GNUEABI-NEXT:    mov r2, r12 +; GNUEABI-NEXT:    bl sincosl +; GNUEABI-NEXT:    add r3, sp, #12 +; GNUEABI-NEXT:    ldr r12, [sp, #8] +; GNUEABI-NEXT:    ldm r3, {r1, r2, r3} +; GNUEABI-NEXT:    str r3, [r4, #28] +; GNUEABI-NEXT:    ldr r0, [sp, #32] +; GNUEABI-NEXT:    ldr lr, [sp, #24] +; GNUEABI-NEXT:    ldr r5, [sp, #28] +; GNUEABI-NEXT:    ldr r3, [sp, #36] +; GNUEABI-NEXT:    str r2, [r4, #24] +; GNUEABI-NEXT:    str r1, [r4, #20] +; GNUEABI-NEXT:    add r1, r4, #8 +; GNUEABI-NEXT:    stm r1, {r0, r3, r12} +; GNUEABI-NEXT:    str r5, [r4, #4] +; GNUEABI-NEXT:    str lr, [r4] +; GNUEABI-NEXT:    add sp, sp, #40 +; GNUEABI-NEXT:    pop {r4, r5, r11, pc} +; +; IOS-LABEL: test_sincos_f128: +; IOS:       @ %bb.0: +; IOS-NEXT:    push {r4, r5, r6, r7, r8, lr} +; IOS-NEXT:    ldr r8, [sp, #24] +; IOS-NEXT:    mov r4, r0 +; IOS-NEXT:    mov r5, r3 +; IOS-NEXT:    mov r6, r2 +; IOS-NEXT:    mov r7, r1 +; IOS-NEXT:    mov r0, r1 +; IOS-NEXT:    mov r1, r2 +; IOS-NEXT:    mov r2, r3 +; IOS-NEXT:    mov r3, r8 +; IOS-NEXT:    bl _cosl +; IOS-NEXT:    add r9, r4, #16 +; IOS-NEXT:    stm r9, {r0, r1, r2, r3} +; IOS-NEXT:    mov r0, r7 +; IOS-NEXT:    mov r1, r6 +; IOS-NEXT:    mov r2, r5 +; IOS-NEXT:    mov r3, r8 +; IOS-NEXT:    bl _sinl +; IOS-NEXT:    stm r4, {r0, r1, r2, r3} +; IOS-NEXT:    pop {r4, r5, r6, r7, r8, pc} +; +; WATCHABI-LABEL: test_sincos_f128: +; WATCHABI:         .cfi_startproc +; WATCHABI-NEXT:  @ %bb.0: +; WATCHABI-NEXT:    push.w {r4, r5, r6, r7, r8, lr} +; WATCHABI-NEXT:    .cfi_def_cfa_offset 24 +; WATCHABI-NEXT:    .cfi_offset lr, -4 +; WATCHABI-NEXT:    .cfi_offset r7, -8 +; WATCHABI-NEXT:    .cfi_offset r6, -12 +; WATCHABI-NEXT:    .cfi_offset r5, -16 +; WATCHABI-NEXT:    .cfi_offset r4, -20 +; WATCHABI-NEXT:    .cfi_offset r8, -24 +; WATCHABI-NEXT:    sub sp, #8 +; WATCHABI-NEXT:    .cfi_def_cfa_offset 32 +; WATCHABI-NEXT:    ldr.w r8, [sp, #32] +; WATCHABI-NEXT:    mov r4, r0 +; WATCHABI-NEXT:    mov r5, r3 +; WATCHABI-NEXT:    mov r6, r2 +; WATCHABI-NEXT:    mov r7, r1 +; WATCHABI-NEXT:    mov r0, r1 +; WATCHABI-NEXT:    mov r1, r2 +; WATCHABI-NEXT:    mov r2, r3 +; WATCHABI-NEXT:    mov r3, r8 +; WATCHABI-NEXT:    bl _cosl +; WATCHABI-NEXT:    add.w r9, r4, #16 +; WATCHABI-NEXT:    stm.w r9, {r0, r1, r2, r3} +; WATCHABI-NEXT:    mov r0, r7 +; WATCHABI-NEXT:    mov r1, r6 +; WATCHABI-NEXT:    mov r2, r5 +; WATCHABI-NEXT:    mov r3, r8 +; WATCHABI-NEXT:    bl _sinl +; WATCHABI-NEXT:    stm r4!, {r0, r1, r2, r3} +; WATCHABI-NEXT:    add sp, #8 +; WATCHABI-NEXT:    pop.w {r4, r5, r6, r7, r8, pc} +; WATCHABI-NEXT:    .cfi_endproc    %result = call { fp128, fp128 } @llvm.sincos.f16(fp128 %a)    ret { fp128, fp128 } %result  } diff --git a/llvm/test/CodeGen/BPF/bpf_trap.ll b/llvm/test/CodeGen/BPF/bpf_trap.ll new file mode 100644 index 0000000..ab8df5f --- /dev/null +++ b/llvm/test/CodeGen/BPF/bpf_trap.ll @@ -0,0 +1,32 @@ +; RUN: llc < %s | FileCheck %s +; +target triple = "bpf" + +define i32 @test(i8 %x) { +entry: +  %0 = and i8 %x, 3 +  switch i8 %0, label %default.unreachable4 [ +    i8 0, label %return +    i8 1, label %sw.bb1 +    i8 2, label %sw.bb2 +    i8 3, label %sw.bb3 +  ] + +sw.bb1:                                           ; preds = %entry +  br label %return + +sw.bb2:                                           ; preds = %entry +  br label 
%return + +sw.bb3:                                           ; preds = %entry +  br label %return + +default.unreachable4:                             ; preds = %entry +  unreachable + +return:                                           ; preds = %entry, %sw.bb3, %sw.bb2, %sw.bb1 +  %retval.0 = phi i32 [ 12, %sw.bb1 ], [ 43, %sw.bb2 ], [ 54, %sw.bb3 ], [ 32, %entry ] +  ret i32 %retval.0 +} + +; CHECK-NOT: __bpf_trap diff --git a/llvm/test/CodeGen/LoongArch/lasx/fp-max-min.ll b/llvm/test/CodeGen/LoongArch/lasx/fp-max-min.ll index 48ec98c..8e08e1e 100644 --- a/llvm/test/CodeGen/LoongArch/lasx/fp-max-min.ll +++ b/llvm/test/CodeGen/LoongArch/lasx/fp-max-min.ll @@ -5,40 +5,10 @@  define void @minnum_v8f32(ptr %res, ptr %x, ptr %y) nounwind {  ; CHECK-LABEL: minnum_v8f32:  ; CHECK:       # %bb.0: # %entry -; CHECK-NEXT:    xvld $xr0, $a2, 0 -; CHECK-NEXT:    xvld $xr1, $a1, 0 -; CHECK-NEXT:    xvpickve.w $xr2, $xr0, 5 -; CHECK-NEXT:    xvpickve.w $xr3, $xr1, 5 -; CHECK-NEXT:    fmin.s $fa2, $fa3, $fa2 -; CHECK-NEXT:    xvpickve.w $xr3, $xr0, 4 -; CHECK-NEXT:    xvpickve.w $xr4, $xr1, 4 -; CHECK-NEXT:    fmin.s $fa3, $fa4, $fa3 -; CHECK-NEXT:    vextrins.w $vr3, $vr2, 16 -; CHECK-NEXT:    xvpickve.w $xr2, $xr0, 6 -; CHECK-NEXT:    xvpickve.w $xr4, $xr1, 6 -; CHECK-NEXT:    fmin.s $fa2, $fa4, $fa2 -; CHECK-NEXT:    vextrins.w $vr3, $vr2, 32 -; CHECK-NEXT:    xvpickve.w $xr2, $xr0, 7 -; CHECK-NEXT:    xvpickve.w $xr4, $xr1, 7 -; CHECK-NEXT:    fmin.s $fa2, $fa4, $fa2 -; CHECK-NEXT:    vextrins.w $vr3, $vr2, 48 -; CHECK-NEXT:    xvpickve.w $xr2, $xr0, 1 -; CHECK-NEXT:    xvpickve.w $xr4, $xr1, 1 -; CHECK-NEXT:    fmin.s $fa2, $fa4, $fa2 -; CHECK-NEXT:    xvpickve.w $xr4, $xr0, 0 -; CHECK-NEXT:    xvpickve.w $xr5, $xr1, 0 -; CHECK-NEXT:    fmin.s $fa4, $fa5, $fa4 -; CHECK-NEXT:    vextrins.w $vr4, $vr2, 16 -; CHECK-NEXT:    xvpickve.w $xr2, $xr0, 2 -; CHECK-NEXT:    xvpickve.w $xr5, $xr1, 2 -; CHECK-NEXT:    fmin.s $fa2, $fa5, $fa2 -; CHECK-NEXT:    vextrins.w $vr4, $vr2, 32 -; CHECK-NEXT:    xvpickve.w $xr0, $xr0, 3 -; CHECK-NEXT:    xvpickve.w $xr1, $xr1, 3 -; CHECK-NEXT:    fmin.s $fa0, $fa1, $fa0 -; CHECK-NEXT:    vextrins.w $vr4, $vr0, 48 -; CHECK-NEXT:    xvpermi.q $xr4, $xr3, 2 -; CHECK-NEXT:    xvst $xr4, $a0, 0 +; CHECK-NEXT:    xvld $xr0, $a1, 0 +; CHECK-NEXT:    xvld $xr1, $a2, 0 +; CHECK-NEXT:    xvfmin.s $xr0, $xr0, $xr1 +; CHECK-NEXT:    xvst $xr0, $a0, 0  ; CHECK-NEXT:    ret  entry:    %v0 = load <8 x float>, ptr %x @@ -51,23 +21,9 @@ entry:  define void @minnum_v4f64(ptr %res, ptr %x, ptr %y) nounwind {  ; CHECK-LABEL: minnum_v4f64:  ; CHECK:       # %bb.0: # %entry -; CHECK-NEXT:    xvld $xr0, $a2, 0 -; CHECK-NEXT:    xvld $xr1, $a1, 0 -; CHECK-NEXT:    xvpickve.d $xr2, $xr0, 3 -; CHECK-NEXT:    xvpickve.d $xr3, $xr1, 3 -; CHECK-NEXT:    fmin.d $fa2, $fa3, $fa2 -; CHECK-NEXT:    xvpickve.d $xr3, $xr0, 2 -; CHECK-NEXT:    xvpickve.d $xr4, $xr1, 2 -; CHECK-NEXT:    fmin.d $fa3, $fa4, $fa3 -; CHECK-NEXT:    vextrins.d $vr3, $vr2, 16 -; CHECK-NEXT:    xvpickve.d $xr2, $xr0, 1 -; CHECK-NEXT:    xvpickve.d $xr4, $xr1, 1 -; CHECK-NEXT:    fmin.d $fa2, $fa4, $fa2 -; CHECK-NEXT:    xvpickve.d $xr0, $xr0, 0 -; CHECK-NEXT:    xvpickve.d $xr1, $xr1, 0 -; CHECK-NEXT:    fmin.d $fa0, $fa1, $fa0 -; CHECK-NEXT:    vextrins.d $vr0, $vr2, 16 -; CHECK-NEXT:    xvpermi.q $xr0, $xr3, 2 +; CHECK-NEXT:    xvld $xr0, $a1, 0 +; CHECK-NEXT:    xvld $xr1, $a2, 0 +; CHECK-NEXT:    xvfmin.d $xr0, $xr0, $xr1  ; CHECK-NEXT:    xvst $xr0, $a0, 0  ; CHECK-NEXT:    ret  entry: @@ -81,40 +37,10 @@ entry:  define void 
@maxnum_v8f32(ptr %res, ptr %x, ptr %y) nounwind {  ; CHECK-LABEL: maxnum_v8f32:  ; CHECK:       # %bb.0: # %entry -; CHECK-NEXT:    xvld $xr0, $a2, 0 -; CHECK-NEXT:    xvld $xr1, $a1, 0 -; CHECK-NEXT:    xvpickve.w $xr2, $xr0, 5 -; CHECK-NEXT:    xvpickve.w $xr3, $xr1, 5 -; CHECK-NEXT:    fmax.s $fa2, $fa3, $fa2 -; CHECK-NEXT:    xvpickve.w $xr3, $xr0, 4 -; CHECK-NEXT:    xvpickve.w $xr4, $xr1, 4 -; CHECK-NEXT:    fmax.s $fa3, $fa4, $fa3 -; CHECK-NEXT:    vextrins.w $vr3, $vr2, 16 -; CHECK-NEXT:    xvpickve.w $xr2, $xr0, 6 -; CHECK-NEXT:    xvpickve.w $xr4, $xr1, 6 -; CHECK-NEXT:    fmax.s $fa2, $fa4, $fa2 -; CHECK-NEXT:    vextrins.w $vr3, $vr2, 32 -; CHECK-NEXT:    xvpickve.w $xr2, $xr0, 7 -; CHECK-NEXT:    xvpickve.w $xr4, $xr1, 7 -; CHECK-NEXT:    fmax.s $fa2, $fa4, $fa2 -; CHECK-NEXT:    vextrins.w $vr3, $vr2, 48 -; CHECK-NEXT:    xvpickve.w $xr2, $xr0, 1 -; CHECK-NEXT:    xvpickve.w $xr4, $xr1, 1 -; CHECK-NEXT:    fmax.s $fa2, $fa4, $fa2 -; CHECK-NEXT:    xvpickve.w $xr4, $xr0, 0 -; CHECK-NEXT:    xvpickve.w $xr5, $xr1, 0 -; CHECK-NEXT:    fmax.s $fa4, $fa5, $fa4 -; CHECK-NEXT:    vextrins.w $vr4, $vr2, 16 -; CHECK-NEXT:    xvpickve.w $xr2, $xr0, 2 -; CHECK-NEXT:    xvpickve.w $xr5, $xr1, 2 -; CHECK-NEXT:    fmax.s $fa2, $fa5, $fa2 -; CHECK-NEXT:    vextrins.w $vr4, $vr2, 32 -; CHECK-NEXT:    xvpickve.w $xr0, $xr0, 3 -; CHECK-NEXT:    xvpickve.w $xr1, $xr1, 3 -; CHECK-NEXT:    fmax.s $fa0, $fa1, $fa0 -; CHECK-NEXT:    vextrins.w $vr4, $vr0, 48 -; CHECK-NEXT:    xvpermi.q $xr4, $xr3, 2 -; CHECK-NEXT:    xvst $xr4, $a0, 0 +; CHECK-NEXT:    xvld $xr0, $a1, 0 +; CHECK-NEXT:    xvld $xr1, $a2, 0 +; CHECK-NEXT:    xvfmax.s $xr0, $xr0, $xr1 +; CHECK-NEXT:    xvst $xr0, $a0, 0  ; CHECK-NEXT:    ret  entry:    %v0 = load <8 x float>, ptr %x @@ -127,23 +53,9 @@ entry:  define void @maxnum_v4f64(ptr %res, ptr %x, ptr %y) nounwind {  ; CHECK-LABEL: maxnum_v4f64:  ; CHECK:       # %bb.0: # %entry -; CHECK-NEXT:    xvld $xr0, $a2, 0 -; CHECK-NEXT:    xvld $xr1, $a1, 0 -; CHECK-NEXT:    xvpickve.d $xr2, $xr0, 3 -; CHECK-NEXT:    xvpickve.d $xr3, $xr1, 3 -; CHECK-NEXT:    fmax.d $fa2, $fa3, $fa2 -; CHECK-NEXT:    xvpickve.d $xr3, $xr0, 2 -; CHECK-NEXT:    xvpickve.d $xr4, $xr1, 2 -; CHECK-NEXT:    fmax.d $fa3, $fa4, $fa3 -; CHECK-NEXT:    vextrins.d $vr3, $vr2, 16 -; CHECK-NEXT:    xvpickve.d $xr2, $xr0, 1 -; CHECK-NEXT:    xvpickve.d $xr4, $xr1, 1 -; CHECK-NEXT:    fmax.d $fa2, $fa4, $fa2 -; CHECK-NEXT:    xvpickve.d $xr0, $xr0, 0 -; CHECK-NEXT:    xvpickve.d $xr1, $xr1, 0 -; CHECK-NEXT:    fmax.d $fa0, $fa1, $fa0 -; CHECK-NEXT:    vextrins.d $vr0, $vr2, 16 -; CHECK-NEXT:    xvpermi.q $xr0, $xr3, 2 +; CHECK-NEXT:    xvld $xr0, $a1, 0 +; CHECK-NEXT:    xvld $xr1, $a2, 0 +; CHECK-NEXT:    xvfmax.d $xr0, $xr0, $xr1  ; CHECK-NEXT:    xvst $xr0, $a0, 0  ; CHECK-NEXT:    ret  entry: diff --git a/llvm/test/CodeGen/LoongArch/lsx/fp-max-min.ll b/llvm/test/CodeGen/LoongArch/lsx/fp-max-min.ll index 27ecb75..c173092 100644 --- a/llvm/test/CodeGen/LoongArch/lsx/fp-max-min.ll +++ b/llvm/test/CodeGen/LoongArch/lsx/fp-max-min.ll @@ -5,24 +5,10 @@  define void @minnum_v4f32(ptr %res, ptr %x, ptr %y) nounwind {  ; CHECK-LABEL: minnum_v4f32:  ; CHECK:       # %bb.0: # %entry -; CHECK-NEXT:    vld $vr0, $a2, 0 -; CHECK-NEXT:    vld $vr1, $a1, 0 -; CHECK-NEXT:    vreplvei.w $vr2, $vr0, 1 -; CHECK-NEXT:    vreplvei.w $vr3, $vr1, 1 -; CHECK-NEXT:    fmin.s $fa2, $fa3, $fa2 -; CHECK-NEXT:    vreplvei.w $vr3, $vr0, 0 -; CHECK-NEXT:    vreplvei.w $vr4, $vr1, 0 -; CHECK-NEXT:    fmin.s $fa3, $fa4, $fa3 -; CHECK-NEXT:    
vextrins.w $vr3, $vr2, 16 -; CHECK-NEXT:    vreplvei.w $vr2, $vr0, 2 -; CHECK-NEXT:    vreplvei.w $vr4, $vr1, 2 -; CHECK-NEXT:    fmin.s $fa2, $fa4, $fa2 -; CHECK-NEXT:    vextrins.w $vr3, $vr2, 32 -; CHECK-NEXT:    vreplvei.w $vr0, $vr0, 3 -; CHECK-NEXT:    vreplvei.w $vr1, $vr1, 3 -; CHECK-NEXT:    fmin.s $fa0, $fa1, $fa0 -; CHECK-NEXT:    vextrins.w $vr3, $vr0, 48 -; CHECK-NEXT:    vst $vr3, $a0, 0 +; CHECK-NEXT:    vld $vr0, $a1, 0 +; CHECK-NEXT:    vld $vr1, $a2, 0 +; CHECK-NEXT:    vfmin.s $vr0, $vr0, $vr1 +; CHECK-NEXT:    vst $vr0, $a0, 0  ; CHECK-NEXT:    ret  entry:    %v0 = load <4 x float>, ptr %x @@ -35,15 +21,9 @@ entry:  define void @minnum_v2f64(ptr %res, ptr %x, ptr %y) nounwind {  ; CHECK-LABEL: minnum_v2f64:  ; CHECK:       # %bb.0: # %entry -; CHECK-NEXT:    vld $vr0, $a2, 0 -; CHECK-NEXT:    vld $vr1, $a1, 0 -; CHECK-NEXT:    vreplvei.d $vr2, $vr0, 1 -; CHECK-NEXT:    vreplvei.d $vr3, $vr1, 1 -; CHECK-NEXT:    fmin.d $fa2, $fa3, $fa2 -; CHECK-NEXT:    vreplvei.d $vr0, $vr0, 0 -; CHECK-NEXT:    vreplvei.d $vr1, $vr1, 0 -; CHECK-NEXT:    fmin.d $fa0, $fa1, $fa0 -; CHECK-NEXT:    vextrins.d $vr0, $vr2, 16 +; CHECK-NEXT:    vld $vr0, $a1, 0 +; CHECK-NEXT:    vld $vr1, $a2, 0 +; CHECK-NEXT:    vfmin.d $vr0, $vr0, $vr1  ; CHECK-NEXT:    vst $vr0, $a0, 0  ; CHECK-NEXT:    ret  entry: @@ -57,24 +37,10 @@ entry:  define void @maxnum_v4f32(ptr %res, ptr %x, ptr %y) nounwind {  ; CHECK-LABEL: maxnum_v4f32:  ; CHECK:       # %bb.0: # %entry -; CHECK-NEXT:    vld $vr0, $a2, 0 -; CHECK-NEXT:    vld $vr1, $a1, 0 -; CHECK-NEXT:    vreplvei.w $vr2, $vr0, 1 -; CHECK-NEXT:    vreplvei.w $vr3, $vr1, 1 -; CHECK-NEXT:    fmax.s $fa2, $fa3, $fa2 -; CHECK-NEXT:    vreplvei.w $vr3, $vr0, 0 -; CHECK-NEXT:    vreplvei.w $vr4, $vr1, 0 -; CHECK-NEXT:    fmax.s $fa3, $fa4, $fa3 -; CHECK-NEXT:    vextrins.w $vr3, $vr2, 16 -; CHECK-NEXT:    vreplvei.w $vr2, $vr0, 2 -; CHECK-NEXT:    vreplvei.w $vr4, $vr1, 2 -; CHECK-NEXT:    fmax.s $fa2, $fa4, $fa2 -; CHECK-NEXT:    vextrins.w $vr3, $vr2, 32 -; CHECK-NEXT:    vreplvei.w $vr0, $vr0, 3 -; CHECK-NEXT:    vreplvei.w $vr1, $vr1, 3 -; CHECK-NEXT:    fmax.s $fa0, $fa1, $fa0 -; CHECK-NEXT:    vextrins.w $vr3, $vr0, 48 -; CHECK-NEXT:    vst $vr3, $a0, 0 +; CHECK-NEXT:    vld $vr0, $a1, 0 +; CHECK-NEXT:    vld $vr1, $a2, 0 +; CHECK-NEXT:    vfmax.s $vr0, $vr0, $vr1 +; CHECK-NEXT:    vst $vr0, $a0, 0  ; CHECK-NEXT:    ret  entry:    %v0 = load <4 x float>, ptr %x @@ -87,15 +53,9 @@ entry:  define void @maxnum_v2f64(ptr %res, ptr %x, ptr %y) nounwind {  ; CHECK-LABEL: maxnum_v2f64:  ; CHECK:       # %bb.0: # %entry -; CHECK-NEXT:    vld $vr0, $a2, 0 -; CHECK-NEXT:    vld $vr1, $a1, 0 -; CHECK-NEXT:    vreplvei.d $vr2, $vr0, 1 -; CHECK-NEXT:    vreplvei.d $vr3, $vr1, 1 -; CHECK-NEXT:    fmax.d $fa2, $fa3, $fa2 -; CHECK-NEXT:    vreplvei.d $vr0, $vr0, 0 -; CHECK-NEXT:    vreplvei.d $vr1, $vr1, 0 -; CHECK-NEXT:    fmax.d $fa0, $fa1, $fa0 -; CHECK-NEXT:    vextrins.d $vr0, $vr2, 16 +; CHECK-NEXT:    vld $vr0, $a1, 0 +; CHECK-NEXT:    vld $vr1, $a2, 0 +; CHECK-NEXT:    vfmax.d $vr0, $vr0, $vr1  ; CHECK-NEXT:    vst $vr0, $a0, 0  ; CHECK-NEXT:    ret  entry: diff --git a/llvm/test/CodeGen/NVPTX/cmpxchg-unsupported-syncscope.err.ll b/llvm/test/CodeGen/NVPTX/cmpxchg-unsupported-syncscope.err.ll new file mode 100644 index 0000000..d3853e2 --- /dev/null +++ b/llvm/test/CodeGen/NVPTX/cmpxchg-unsupported-syncscope.err.ll @@ -0,0 +1,11 @@ +; RUN: not llc -mcpu=sm_100a -mtriple=nvptx64 -mattr=+ptx86 %s 2>&1 | FileCheck %s + +; Test that we get a clear error message when using 
an unsupported syncscope. + +; CHECK: NVPTX backend does not support syncscope "agent" +; CHECK: Supported syncscopes are: singlethread, <empty string>, block, cluster, device +define i32 @cmpxchg_unsupported_syncscope_agent(ptr %addr, i32 %cmp, i32 %new) { +  %result = cmpxchg ptr %addr, i32 %cmp, i32 %new syncscope("agent") monotonic monotonic +  %value = extractvalue { i32, i1 } %result, 0 +  ret i32 %value +} diff --git a/llvm/test/CodeGen/RISCV/GlobalISel/rvv/vloxei-rv64.ll b/llvm/test/CodeGen/RISCV/GlobalISel/rvv/vloxei-rv64.ll new file mode 100644 index 0000000..5cb55f1 --- /dev/null +++ b/llvm/test/CodeGen/RISCV/GlobalISel/rvv/vloxei-rv64.ll @@ -0,0 +1,1341 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=riscv64 -mattr=+v,+zvfhmin,+zvfbfmin -global-isel -verify-machineinstrs \ +; RUN:   < %s | FileCheck %s + +; The intrinsics are not supported with RV32. + +declare <vscale x 1 x i8> @llvm.riscv.vloxei.nxv1i8.nxv1i64( +  <vscale x 1 x i8>, +  ptr, +  <vscale x 1 x i64>, +  i64); + +define <vscale x 1 x i8> @intrinsic_vloxei_v_nxv1i8_nxv1i8_nxv1i64(ptr %0, <vscale x 1 x i64> %1, i64 %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv1i8_nxv1i8_nxv1i64: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e8, mf8, ta, ma +; CHECK-NEXT:    vloxei64.v v9, (a0), v8 +; CHECK-NEXT:    vmv1r.v v8, v9 +; CHECK-NEXT:    ret +entry: +  %a = call <vscale x 1 x i8> @llvm.riscv.vloxei.nxv1i8.nxv1i64( +    <vscale x 1 x i8> poison, +    ptr %0, +    <vscale x 1 x i64> %1, +    i64 %2) + +  ret <vscale x 1 x i8> %a +} + +declare <vscale x 1 x i8> @llvm.riscv.vloxei.mask.nxv1i8.nxv1i64( +  <vscale x 1 x i8>, +  ptr, +  <vscale x 1 x i64>, +  <vscale x 1 x i1>, +  i64, +  i64); + +define <vscale x 1 x i8> @intrinsic_vloxei_mask_v_nxv1i8_nxv1i8_nxv1i64(<vscale x 1 x i8> %0, ptr %1, <vscale x 1 x i64> %2, <vscale x 1 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv1i8_nxv1i8_nxv1i64: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e8, mf8, ta, mu +; CHECK-NEXT:    vloxei64.v v8, (a0), v9, v0.t +; CHECK-NEXT:    ret +entry: +  %a = call <vscale x 1 x i8> @llvm.riscv.vloxei.mask.nxv1i8.nxv1i64( +    <vscale x 1 x i8> %0, +    ptr %1, +    <vscale x 1 x i64> %2, +    <vscale x 1 x i1> %3, +    i64 %4, i64 1) + +  ret <vscale x 1 x i8> %a +} + +declare <vscale x 2 x i8> @llvm.riscv.vloxei.nxv2i8.nxv2i64( +  <vscale x 2 x i8>, +  ptr, +  <vscale x 2 x i64>, +  i64); + +define <vscale x 2 x i8> @intrinsic_vloxei_v_nxv2i8_nxv2i8_nxv2i64(ptr %0, <vscale x 2 x i64> %1, i64 %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv2i8_nxv2i8_nxv2i64: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e8, mf4, ta, ma +; CHECK-NEXT:    vloxei64.v v10, (a0), v8 +; CHECK-NEXT:    vmv1r.v v8, v10 +; CHECK-NEXT:    ret +entry: +  %a = call <vscale x 2 x i8> @llvm.riscv.vloxei.nxv2i8.nxv2i64( +    <vscale x 2 x i8> poison, +    ptr %0, +    <vscale x 2 x i64> %1, +    i64 %2) + +  ret <vscale x 2 x i8> %a +} + +declare <vscale x 2 x i8> @llvm.riscv.vloxei.mask.nxv2i8.nxv2i64( +  <vscale x 2 x i8>, +  ptr, +  <vscale x 2 x i64>, +  <vscale x 2 x i1>, +  i64, +  i64); + +define <vscale x 2 x i8> @intrinsic_vloxei_mask_v_nxv2i8_nxv2i8_nxv2i64(<vscale x 2 x i8> %0, ptr %1, <vscale x 2 x i64> %2, <vscale x 2 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv2i8_nxv2i8_nxv2i64: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e8, mf4, ta, mu +; 
CHECK-NEXT:    vloxei64.v v8, (a0), v10, v0.t +; CHECK-NEXT:    ret +entry: +  %a = call <vscale x 2 x i8> @llvm.riscv.vloxei.mask.nxv2i8.nxv2i64( +    <vscale x 2 x i8> %0, +    ptr %1, +    <vscale x 2 x i64> %2, +    <vscale x 2 x i1> %3, +    i64 %4, i64 1) + +  ret <vscale x 2 x i8> %a +} + +declare <vscale x 4 x i8> @llvm.riscv.vloxei.nxv4i8.nxv4i64( +  <vscale x 4 x i8>, +  ptr, +  <vscale x 4 x i64>, +  i64); + +define <vscale x 4 x i8> @intrinsic_vloxei_v_nxv4i8_nxv4i8_nxv4i64(ptr %0, <vscale x 4 x i64> %1, i64 %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv4i8_nxv4i8_nxv4i64: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e8, mf2, ta, ma +; CHECK-NEXT:    vloxei64.v v12, (a0), v8 +; CHECK-NEXT:    vmv1r.v v8, v12 +; CHECK-NEXT:    ret +entry: +  %a = call <vscale x 4 x i8> @llvm.riscv.vloxei.nxv4i8.nxv4i64( +    <vscale x 4 x i8> poison, +    ptr %0, +    <vscale x 4 x i64> %1, +    i64 %2) + +  ret <vscale x 4 x i8> %a +} + +declare <vscale x 4 x i8> @llvm.riscv.vloxei.mask.nxv4i8.nxv4i64( +  <vscale x 4 x i8>, +  ptr, +  <vscale x 4 x i64>, +  <vscale x 4 x i1>, +  i64, +  i64); + +define <vscale x 4 x i8> @intrinsic_vloxei_mask_v_nxv4i8_nxv4i8_nxv4i64(<vscale x 4 x i8> %0, ptr %1, <vscale x 4 x i64> %2, <vscale x 4 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv4i8_nxv4i8_nxv4i64: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e8, mf2, ta, mu +; CHECK-NEXT:    vloxei64.v v8, (a0), v12, v0.t +; CHECK-NEXT:    ret +entry: +  %a = call <vscale x 4 x i8> @llvm.riscv.vloxei.mask.nxv4i8.nxv4i64( +    <vscale x 4 x i8> %0, +    ptr %1, +    <vscale x 4 x i64> %2, +    <vscale x 4 x i1> %3, +    i64 %4, i64 1) + +  ret <vscale x 4 x i8> %a +} + +declare <vscale x 8 x i8> @llvm.riscv.vloxei.nxv8i8.nxv8i64( +  <vscale x 8 x i8>, +  ptr, +  <vscale x 8 x i64>, +  i64); + +define <vscale x 8 x i8> @intrinsic_vloxei_v_nxv8i8_nxv8i8_nxv8i64(ptr %0, <vscale x 8 x i64> %1, i64 %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv8i8_nxv8i8_nxv8i64: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e8, m1, ta, ma +; CHECK-NEXT:    vloxei64.v v16, (a0), v8 +; CHECK-NEXT:    vmv.v.v v8, v16 +; CHECK-NEXT:    ret +entry: +  %a = call <vscale x 8 x i8> @llvm.riscv.vloxei.nxv8i8.nxv8i64( +    <vscale x 8 x i8> poison, +    ptr %0, +    <vscale x 8 x i64> %1, +    i64 %2) + +  ret <vscale x 8 x i8> %a +} + +declare <vscale x 8 x i8> @llvm.riscv.vloxei.mask.nxv8i8.nxv8i64( +  <vscale x 8 x i8>, +  ptr, +  <vscale x 8 x i64>, +  <vscale x 8 x i1>, +  i64, +  i64); + +define <vscale x 8 x i8> @intrinsic_vloxei_mask_v_nxv8i8_nxv8i8_nxv8i64(<vscale x 8 x i8> %0, ptr %1, <vscale x 8 x i64> %2, <vscale x 8 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv8i8_nxv8i8_nxv8i64: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e8, m1, ta, mu +; CHECK-NEXT:    vloxei64.v v8, (a0), v16, v0.t +; CHECK-NEXT:    ret +entry: +  %a = call <vscale x 8 x i8> @llvm.riscv.vloxei.mask.nxv8i8.nxv8i64( +    <vscale x 8 x i8> %0, +    ptr %1, +    <vscale x 8 x i64> %2, +    <vscale x 8 x i1> %3, +    i64 %4, i64 1) + +  ret <vscale x 8 x i8> %a +} + +declare <vscale x 1 x i16> @llvm.riscv.vloxei.nxv1i16.nxv1i64( +  <vscale x 1 x i16>, +  ptr, +  <vscale x 1 x i64>, +  i64); + +define <vscale x 1 x i16> @intrinsic_vloxei_v_nxv1i16_nxv1i16_nxv1i64(ptr %0, <vscale x 1 x i64> %1, i64 %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv1i16_nxv1i16_nxv1i64: +; CHECK:       # %bb.0: # %entry 
+; CHECK-NEXT:    vsetvli zero, a1, e16, mf4, ta, ma +; CHECK-NEXT:    vloxei64.v v9, (a0), v8 +; CHECK-NEXT:    vmv1r.v v8, v9 +; CHECK-NEXT:    ret +entry: +  %a = call <vscale x 1 x i16> @llvm.riscv.vloxei.nxv1i16.nxv1i64( +    <vscale x 1 x i16> poison, +    ptr %0, +    <vscale x 1 x i64> %1, +    i64 %2) + +  ret <vscale x 1 x i16> %a +} + +declare <vscale x 1 x i16> @llvm.riscv.vloxei.mask.nxv1i16.nxv1i64( +  <vscale x 1 x i16>, +  ptr, +  <vscale x 1 x i64>, +  <vscale x 1 x i1>, +  i64, +  i64); + +define <vscale x 1 x i16> @intrinsic_vloxei_mask_v_nxv1i16_nxv1i16_nxv1i64(<vscale x 1 x i16> %0, ptr %1, <vscale x 1 x i64> %2, <vscale x 1 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv1i16_nxv1i16_nxv1i64: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e16, mf4, ta, mu +; CHECK-NEXT:    vloxei64.v v8, (a0), v9, v0.t +; CHECK-NEXT:    ret +entry: +  %a = call <vscale x 1 x i16> @llvm.riscv.vloxei.mask.nxv1i16.nxv1i64( +    <vscale x 1 x i16> %0, +    ptr %1, +    <vscale x 1 x i64> %2, +    <vscale x 1 x i1> %3, +    i64 %4, i64 1) + +  ret <vscale x 1 x i16> %a +} + +declare <vscale x 2 x i16> @llvm.riscv.vloxei.nxv2i16.nxv2i64( +  <vscale x 2 x i16>, +  ptr, +  <vscale x 2 x i64>, +  i64); + +define <vscale x 2 x i16> @intrinsic_vloxei_v_nxv2i16_nxv2i16_nxv2i64(ptr %0, <vscale x 2 x i64> %1, i64 %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv2i16_nxv2i16_nxv2i64: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e16, mf2, ta, ma +; CHECK-NEXT:    vloxei64.v v10, (a0), v8 +; CHECK-NEXT:    vmv1r.v v8, v10 +; CHECK-NEXT:    ret +entry: +  %a = call <vscale x 2 x i16> @llvm.riscv.vloxei.nxv2i16.nxv2i64( +    <vscale x 2 x i16> poison, +    ptr %0, +    <vscale x 2 x i64> %1, +    i64 %2) + +  ret <vscale x 2 x i16> %a +} + +declare <vscale x 2 x i16> @llvm.riscv.vloxei.mask.nxv2i16.nxv2i64( +  <vscale x 2 x i16>, +  ptr, +  <vscale x 2 x i64>, +  <vscale x 2 x i1>, +  i64, +  i64); + +define <vscale x 2 x i16> @intrinsic_vloxei_mask_v_nxv2i16_nxv2i16_nxv2i64(<vscale x 2 x i16> %0, ptr %1, <vscale x 2 x i64> %2, <vscale x 2 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv2i16_nxv2i16_nxv2i64: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e16, mf2, ta, mu +; CHECK-NEXT:    vloxei64.v v8, (a0), v10, v0.t +; CHECK-NEXT:    ret +entry: +  %a = call <vscale x 2 x i16> @llvm.riscv.vloxei.mask.nxv2i16.nxv2i64( +    <vscale x 2 x i16> %0, +    ptr %1, +    <vscale x 2 x i64> %2, +    <vscale x 2 x i1> %3, +    i64 %4, i64 1) + +  ret <vscale x 2 x i16> %a +} + +declare <vscale x 4 x i16> @llvm.riscv.vloxei.nxv4i16.nxv4i64( +  <vscale x 4 x i16>, +  ptr, +  <vscale x 4 x i64>, +  i64); + +define <vscale x 4 x i16> @intrinsic_vloxei_v_nxv4i16_nxv4i16_nxv4i64(ptr %0, <vscale x 4 x i64> %1, i64 %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv4i16_nxv4i16_nxv4i64: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e16, m1, ta, ma +; CHECK-NEXT:    vloxei64.v v12, (a0), v8 +; CHECK-NEXT:    vmv.v.v v8, v12 +; CHECK-NEXT:    ret +entry: +  %a = call <vscale x 4 x i16> @llvm.riscv.vloxei.nxv4i16.nxv4i64( +    <vscale x 4 x i16> poison, +    ptr %0, +    <vscale x 4 x i64> %1, +    i64 %2) + +  ret <vscale x 4 x i16> %a +} + +declare <vscale x 4 x i16> @llvm.riscv.vloxei.mask.nxv4i16.nxv4i64( +  <vscale x 4 x i16>, +  ptr, +  <vscale x 4 x i64>, +  <vscale x 4 x i1>, +  i64, +  i64); + +define <vscale x 4 x i16> 
@intrinsic_vloxei_mask_v_nxv4i16_nxv4i16_nxv4i64(<vscale x 4 x i16> %0, ptr %1, <vscale x 4 x i64> %2, <vscale x 4 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv4i16_nxv4i16_nxv4i64: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e16, m1, ta, mu +; CHECK-NEXT:    vloxei64.v v8, (a0), v12, v0.t +; CHECK-NEXT:    ret +entry: +  %a = call <vscale x 4 x i16> @llvm.riscv.vloxei.mask.nxv4i16.nxv4i64( +    <vscale x 4 x i16> %0, +    ptr %1, +    <vscale x 4 x i64> %2, +    <vscale x 4 x i1> %3, +    i64 %4, i64 1) + +  ret <vscale x 4 x i16> %a +} + +declare <vscale x 8 x i16> @llvm.riscv.vloxei.nxv8i16.nxv8i64( +  <vscale x 8 x i16>, +  ptr, +  <vscale x 8 x i64>, +  i64); + +define <vscale x 8 x i16> @intrinsic_vloxei_v_nxv8i16_nxv8i16_nxv8i64(ptr %0, <vscale x 8 x i64> %1, i64 %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv8i16_nxv8i16_nxv8i64: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e16, m2, ta, ma +; CHECK-NEXT:    vloxei64.v v16, (a0), v8 +; CHECK-NEXT:    vmv.v.v v8, v16 +; CHECK-NEXT:    ret +entry: +  %a = call <vscale x 8 x i16> @llvm.riscv.vloxei.nxv8i16.nxv8i64( +    <vscale x 8 x i16> poison, +    ptr %0, +    <vscale x 8 x i64> %1, +    i64 %2) + +  ret <vscale x 8 x i16> %a +} + +declare <vscale x 8 x i16> @llvm.riscv.vloxei.mask.nxv8i16.nxv8i64( +  <vscale x 8 x i16>, +  ptr, +  <vscale x 8 x i64>, +  <vscale x 8 x i1>, +  i64, +  i64); + +define <vscale x 8 x i16> @intrinsic_vloxei_mask_v_nxv8i16_nxv8i16_nxv8i64(<vscale x 8 x i16> %0, ptr %1, <vscale x 8 x i64> %2, <vscale x 8 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv8i16_nxv8i16_nxv8i64: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e16, m2, ta, mu +; CHECK-NEXT:    vloxei64.v v8, (a0), v16, v0.t +; CHECK-NEXT:    ret +entry: +  %a = call <vscale x 8 x i16> @llvm.riscv.vloxei.mask.nxv8i16.nxv8i64( +    <vscale x 8 x i16> %0, +    ptr %1, +    <vscale x 8 x i64> %2, +    <vscale x 8 x i1> %3, +    i64 %4, i64 1) + +  ret <vscale x 8 x i16> %a +} + +declare <vscale x 1 x i32> @llvm.riscv.vloxei.nxv1i32.nxv1i64( +  <vscale x 1 x i32>, +  ptr, +  <vscale x 1 x i64>, +  i64); + +define <vscale x 1 x i32> @intrinsic_vloxei_v_nxv1i32_nxv1i32_nxv1i64(ptr %0, <vscale x 1 x i64> %1, i64 %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv1i32_nxv1i32_nxv1i64: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e32, mf2, ta, ma +; CHECK-NEXT:    vloxei64.v v9, (a0), v8 +; CHECK-NEXT:    vmv1r.v v8, v9 +; CHECK-NEXT:    ret +entry: +  %a = call <vscale x 1 x i32> @llvm.riscv.vloxei.nxv1i32.nxv1i64( +    <vscale x 1 x i32> poison, +    ptr %0, +    <vscale x 1 x i64> %1, +    i64 %2) + +  ret <vscale x 1 x i32> %a +} + +declare <vscale x 1 x i32> @llvm.riscv.vloxei.mask.nxv1i32.nxv1i64( +  <vscale x 1 x i32>, +  ptr, +  <vscale x 1 x i64>, +  <vscale x 1 x i1>, +  i64, +  i64); + +define <vscale x 1 x i32> @intrinsic_vloxei_mask_v_nxv1i32_nxv1i32_nxv1i64(<vscale x 1 x i32> %0, ptr %1, <vscale x 1 x i64> %2, <vscale x 1 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv1i32_nxv1i32_nxv1i64: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e32, mf2, ta, mu +; CHECK-NEXT:    vloxei64.v v8, (a0), v9, v0.t +; CHECK-NEXT:    ret +entry: +  %a = call <vscale x 1 x i32> @llvm.riscv.vloxei.mask.nxv1i32.nxv1i64( +    <vscale x 1 x i32> %0, +    ptr %1, +    <vscale x 1 x i64> %2, +    <vscale x 1 x i1> %3, +    i64 %4, i64 1) + +  ret <vscale x 1 x 
i32> %a +} + +declare <vscale x 2 x i32> @llvm.riscv.vloxei.nxv2i32.nxv2i64( +  <vscale x 2 x i32>, +  ptr, +  <vscale x 2 x i64>, +  i64); + +define <vscale x 2 x i32> @intrinsic_vloxei_v_nxv2i32_nxv2i32_nxv2i64(ptr %0, <vscale x 2 x i64> %1, i64 %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv2i32_nxv2i32_nxv2i64: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e32, m1, ta, ma +; CHECK-NEXT:    vloxei64.v v10, (a0), v8 +; CHECK-NEXT:    vmv.v.v v8, v10 +; CHECK-NEXT:    ret +entry: +  %a = call <vscale x 2 x i32> @llvm.riscv.vloxei.nxv2i32.nxv2i64( +    <vscale x 2 x i32> poison, +    ptr %0, +    <vscale x 2 x i64> %1, +    i64 %2) + +  ret <vscale x 2 x i32> %a +} + +declare <vscale x 2 x i32> @llvm.riscv.vloxei.mask.nxv2i32.nxv2i64( +  <vscale x 2 x i32>, +  ptr, +  <vscale x 2 x i64>, +  <vscale x 2 x i1>, +  i64, +  i64); + +define <vscale x 2 x i32> @intrinsic_vloxei_mask_v_nxv2i32_nxv2i32_nxv2i64(<vscale x 2 x i32> %0, ptr %1, <vscale x 2 x i64> %2, <vscale x 2 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv2i32_nxv2i32_nxv2i64: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e32, m1, ta, mu +; CHECK-NEXT:    vloxei64.v v8, (a0), v10, v0.t +; CHECK-NEXT:    ret +entry: +  %a = call <vscale x 2 x i32> @llvm.riscv.vloxei.mask.nxv2i32.nxv2i64( +    <vscale x 2 x i32> %0, +    ptr %1, +    <vscale x 2 x i64> %2, +    <vscale x 2 x i1> %3, +    i64 %4, i64 1) + +  ret <vscale x 2 x i32> %a +} + +declare <vscale x 4 x i32> @llvm.riscv.vloxei.nxv4i32.nxv4i64( +  <vscale x 4 x i32>, +  ptr, +  <vscale x 4 x i64>, +  i64); + +define <vscale x 4 x i32> @intrinsic_vloxei_v_nxv4i32_nxv4i32_nxv4i64(ptr %0, <vscale x 4 x i64> %1, i64 %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv4i32_nxv4i32_nxv4i64: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e32, m2, ta, ma +; CHECK-NEXT:    vloxei64.v v12, (a0), v8 +; CHECK-NEXT:    vmv.v.v v8, v12 +; CHECK-NEXT:    ret +entry: +  %a = call <vscale x 4 x i32> @llvm.riscv.vloxei.nxv4i32.nxv4i64( +    <vscale x 4 x i32> poison, +    ptr %0, +    <vscale x 4 x i64> %1, +    i64 %2) + +  ret <vscale x 4 x i32> %a +} + +declare <vscale x 4 x i32> @llvm.riscv.vloxei.mask.nxv4i32.nxv4i64( +  <vscale x 4 x i32>, +  ptr, +  <vscale x 4 x i64>, +  <vscale x 4 x i1>, +  i64, +  i64); + +define <vscale x 4 x i32> @intrinsic_vloxei_mask_v_nxv4i32_nxv4i32_nxv4i64(<vscale x 4 x i32> %0, ptr %1, <vscale x 4 x i64> %2, <vscale x 4 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv4i32_nxv4i32_nxv4i64: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e32, m2, ta, mu +; CHECK-NEXT:    vloxei64.v v8, (a0), v12, v0.t +; CHECK-NEXT:    ret +entry: +  %a = call <vscale x 4 x i32> @llvm.riscv.vloxei.mask.nxv4i32.nxv4i64( +    <vscale x 4 x i32> %0, +    ptr %1, +    <vscale x 4 x i64> %2, +    <vscale x 4 x i1> %3, +    i64 %4, i64 1) + +  ret <vscale x 4 x i32> %a +} + +declare <vscale x 8 x i32> @llvm.riscv.vloxei.nxv8i32.nxv8i64( +  <vscale x 8 x i32>, +  ptr, +  <vscale x 8 x i64>, +  i64); + +define <vscale x 8 x i32> @intrinsic_vloxei_v_nxv8i32_nxv8i32_nxv8i64(ptr %0, <vscale x 8 x i64> %1, i64 %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv8i32_nxv8i32_nxv8i64: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e32, m4, ta, ma +; CHECK-NEXT:    vloxei64.v v16, (a0), v8 +; CHECK-NEXT:    vmv.v.v v8, v16 +; CHECK-NEXT:    ret +entry: +  %a = call <vscale x 8 x i32> @llvm.riscv.vloxei.nxv8i32.nxv8i64( +    
<vscale x 8 x i32> poison, +    ptr %0, +    <vscale x 8 x i64> %1, +    i64 %2) + +  ret <vscale x 8 x i32> %a +} + +declare <vscale x 8 x i32> @llvm.riscv.vloxei.mask.nxv8i32.nxv8i64( +  <vscale x 8 x i32>, +  ptr, +  <vscale x 8 x i64>, +  <vscale x 8 x i1>, +  i64, +  i64); + +define <vscale x 8 x i32> @intrinsic_vloxei_mask_v_nxv8i32_nxv8i32_nxv8i64(<vscale x 8 x i32> %0, ptr %1, <vscale x 8 x i64> %2, <vscale x 8 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv8i32_nxv8i32_nxv8i64: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e32, m4, ta, mu +; CHECK-NEXT:    vloxei64.v v8, (a0), v16, v0.t +; CHECK-NEXT:    ret +entry: +  %a = call <vscale x 8 x i32> @llvm.riscv.vloxei.mask.nxv8i32.nxv8i64( +    <vscale x 8 x i32> %0, +    ptr %1, +    <vscale x 8 x i64> %2, +    <vscale x 8 x i1> %3, +    i64 %4, i64 1) + +  ret <vscale x 8 x i32> %a +} + +declare <vscale x 1 x i64> @llvm.riscv.vloxei.nxv1i64.nxv1i64( +  <vscale x 1 x i64>, +  ptr, +  <vscale x 1 x i64>, +  i64); + +define <vscale x 1 x i64> @intrinsic_vloxei_v_nxv1i64_nxv1i64_nxv1i64(ptr %0, <vscale x 1 x i64> %1, i64 %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv1i64_nxv1i64_nxv1i64: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e64, m1, ta, ma +; CHECK-NEXT:    vloxei64.v v8, (a0), v8 +; CHECK-NEXT:    ret +entry: +  %a = call <vscale x 1 x i64> @llvm.riscv.vloxei.nxv1i64.nxv1i64( +    <vscale x 1 x i64> poison, +    ptr %0, +    <vscale x 1 x i64> %1, +    i64 %2) + +  ret <vscale x 1 x i64> %a +} + +declare <vscale x 1 x i64> @llvm.riscv.vloxei.mask.nxv1i64.nxv1i64( +  <vscale x 1 x i64>, +  ptr, +  <vscale x 1 x i64>, +  <vscale x 1 x i1>, +  i64, +  i64); + +define <vscale x 1 x i64> @intrinsic_vloxei_mask_v_nxv1i64_nxv1i64_nxv1i64(<vscale x 1 x i64> %0, ptr %1, <vscale x 1 x i64> %2, <vscale x 1 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv1i64_nxv1i64_nxv1i64: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e64, m1, ta, mu +; CHECK-NEXT:    vloxei64.v v8, (a0), v9, v0.t +; CHECK-NEXT:    ret +entry: +  %a = call <vscale x 1 x i64> @llvm.riscv.vloxei.mask.nxv1i64.nxv1i64( +    <vscale x 1 x i64> %0, +    ptr %1, +    <vscale x 1 x i64> %2, +    <vscale x 1 x i1> %3, +    i64 %4, i64 1) + +  ret <vscale x 1 x i64> %a +} + +declare <vscale x 2 x i64> @llvm.riscv.vloxei.nxv2i64.nxv2i64( +  <vscale x 2 x i64>, +  ptr, +  <vscale x 2 x i64>, +  i64); + +define <vscale x 2 x i64> @intrinsic_vloxei_v_nxv2i64_nxv2i64_nxv2i64(ptr %0, <vscale x 2 x i64> %1, i64 %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv2i64_nxv2i64_nxv2i64: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e64, m2, ta, ma +; CHECK-NEXT:    vloxei64.v v8, (a0), v8 +; CHECK-NEXT:    ret +entry: +  %a = call <vscale x 2 x i64> @llvm.riscv.vloxei.nxv2i64.nxv2i64( +    <vscale x 2 x i64> poison, +    ptr %0, +    <vscale x 2 x i64> %1, +    i64 %2) + +  ret <vscale x 2 x i64> %a +} + +declare <vscale x 2 x i64> @llvm.riscv.vloxei.mask.nxv2i64.nxv2i64( +  <vscale x 2 x i64>, +  ptr, +  <vscale x 2 x i64>, +  <vscale x 2 x i1>, +  i64, +  i64); + +define <vscale x 2 x i64> @intrinsic_vloxei_mask_v_nxv2i64_nxv2i64_nxv2i64(<vscale x 2 x i64> %0, ptr %1, <vscale x 2 x i64> %2, <vscale x 2 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv2i64_nxv2i64_nxv2i64: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e64, m2, ta, mu +; CHECK-NEXT:    vloxei64.v v8, (a0), v10, v0.t +; 
CHECK-NEXT:    ret +entry: +  %a = call <vscale x 2 x i64> @llvm.riscv.vloxei.mask.nxv2i64.nxv2i64( +    <vscale x 2 x i64> %0, +    ptr %1, +    <vscale x 2 x i64> %2, +    <vscale x 2 x i1> %3, +    i64 %4, i64 1) + +  ret <vscale x 2 x i64> %a +} + +declare <vscale x 4 x i64> @llvm.riscv.vloxei.nxv4i64.nxv4i64( +  <vscale x 4 x i64>, +  ptr, +  <vscale x 4 x i64>, +  i64); + +define <vscale x 4 x i64> @intrinsic_vloxei_v_nxv4i64_nxv4i64_nxv4i64(ptr %0, <vscale x 4 x i64> %1, i64 %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv4i64_nxv4i64_nxv4i64: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e64, m4, ta, ma +; CHECK-NEXT:    vloxei64.v v8, (a0), v8 +; CHECK-NEXT:    ret +entry: +  %a = call <vscale x 4 x i64> @llvm.riscv.vloxei.nxv4i64.nxv4i64( +    <vscale x 4 x i64> poison, +    ptr %0, +    <vscale x 4 x i64> %1, +    i64 %2) + +  ret <vscale x 4 x i64> %a +} + +declare <vscale x 4 x i64> @llvm.riscv.vloxei.mask.nxv4i64.nxv4i64( +  <vscale x 4 x i64>, +  ptr, +  <vscale x 4 x i64>, +  <vscale x 4 x i1>, +  i64, +  i64); + +define <vscale x 4 x i64> @intrinsic_vloxei_mask_v_nxv4i64_nxv4i64_nxv4i64(<vscale x 4 x i64> %0, ptr %1, <vscale x 4 x i64> %2, <vscale x 4 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv4i64_nxv4i64_nxv4i64: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e64, m4, ta, mu +; CHECK-NEXT:    vloxei64.v v8, (a0), v12, v0.t +; CHECK-NEXT:    ret +entry: +  %a = call <vscale x 4 x i64> @llvm.riscv.vloxei.mask.nxv4i64.nxv4i64( +    <vscale x 4 x i64> %0, +    ptr %1, +    <vscale x 4 x i64> %2, +    <vscale x 4 x i1> %3, +    i64 %4, i64 1) + +  ret <vscale x 4 x i64> %a +} + +declare <vscale x 8 x i64> @llvm.riscv.vloxei.nxv8i64.nxv8i64( +  <vscale x 8 x i64>, +  ptr, +  <vscale x 8 x i64>, +  i64); + +define <vscale x 8 x i64> @intrinsic_vloxei_v_nxv8i64_nxv8i64_nxv8i64(ptr %0, <vscale x 8 x i64> %1, i64 %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv8i64_nxv8i64_nxv8i64: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e64, m8, ta, ma +; CHECK-NEXT:    vloxei64.v v8, (a0), v8 +; CHECK-NEXT:    ret +entry: +  %a = call <vscale x 8 x i64> @llvm.riscv.vloxei.nxv8i64.nxv8i64( +    <vscale x 8 x i64> poison, +    ptr %0, +    <vscale x 8 x i64> %1, +    i64 %2) + +  ret <vscale x 8 x i64> %a +} + +declare <vscale x 8 x i64> @llvm.riscv.vloxei.mask.nxv8i64.nxv8i64( +  <vscale x 8 x i64>, +  ptr, +  <vscale x 8 x i64>, +  <vscale x 8 x i1>, +  i64, +  i64); + +define <vscale x 8 x i64> @intrinsic_vloxei_mask_v_nxv8i64_nxv8i64_nxv8i64(<vscale x 8 x i64> %0, ptr %1, <vscale x 8 x i64> %2, <vscale x 8 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv8i64_nxv8i64_nxv8i64: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e64, m8, ta, mu +; CHECK-NEXT:    vloxei64.v v8, (a0), v16, v0.t +; CHECK-NEXT:    ret +entry: +  %a = call <vscale x 8 x i64> @llvm.riscv.vloxei.mask.nxv8i64.nxv8i64( +    <vscale x 8 x i64> %0, +    ptr %1, +    <vscale x 8 x i64> %2, +    <vscale x 8 x i1> %3, +    i64 %4, i64 1) + +  ret <vscale x 8 x i64> %a +} + +declare <vscale x 1 x half> @llvm.riscv.vloxei.nxv1f16.nxv1i64( +  <vscale x 1 x half>, +  ptr, +  <vscale x 1 x i64>, +  i64); + +define <vscale x 1 x half> @intrinsic_vloxei_v_nxv1f16_nxv1f16_nxv1i64(ptr %0, <vscale x 1 x i64> %1, i64 %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv1f16_nxv1f16_nxv1i64: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e16, mf4, ta, ma +; 
CHECK-NEXT:    vloxei64.v v9, (a0), v8 +; CHECK-NEXT:    vmv1r.v v8, v9 +; CHECK-NEXT:    ret +entry: +  %a = call <vscale x 1 x half> @llvm.riscv.vloxei.nxv1f16.nxv1i64( +    <vscale x 1 x half> poison, +    ptr %0, +    <vscale x 1 x i64> %1, +    i64 %2) + +  ret <vscale x 1 x half> %a +} + +declare <vscale x 1 x half> @llvm.riscv.vloxei.mask.nxv1f16.nxv1i64( +  <vscale x 1 x half>, +  ptr, +  <vscale x 1 x i64>, +  <vscale x 1 x i1>, +  i64, +  i64); + +define <vscale x 1 x half> @intrinsic_vloxei_mask_v_nxv1f16_nxv1f16_nxv1i64(<vscale x 1 x half> %0, ptr %1, <vscale x 1 x i64> %2, <vscale x 1 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv1f16_nxv1f16_nxv1i64: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e16, mf4, ta, mu +; CHECK-NEXT:    vloxei64.v v8, (a0), v9, v0.t +; CHECK-NEXT:    ret +entry: +  %a = call <vscale x 1 x half> @llvm.riscv.vloxei.mask.nxv1f16.nxv1i64( +    <vscale x 1 x half> %0, +    ptr %1, +    <vscale x 1 x i64> %2, +    <vscale x 1 x i1> %3, +    i64 %4, i64 1) + +  ret <vscale x 1 x half> %a +} + +declare <vscale x 2 x half> @llvm.riscv.vloxei.nxv2f16.nxv2i64( +  <vscale x 2 x half>, +  ptr, +  <vscale x 2 x i64>, +  i64); + +define <vscale x 2 x half> @intrinsic_vloxei_v_nxv2f16_nxv2f16_nxv2i64(ptr %0, <vscale x 2 x i64> %1, i64 %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv2f16_nxv2f16_nxv2i64: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e16, mf2, ta, ma +; CHECK-NEXT:    vloxei64.v v10, (a0), v8 +; CHECK-NEXT:    vmv1r.v v8, v10 +; CHECK-NEXT:    ret +entry: +  %a = call <vscale x 2 x half> @llvm.riscv.vloxei.nxv2f16.nxv2i64( +    <vscale x 2 x half> poison, +    ptr %0, +    <vscale x 2 x i64> %1, +    i64 %2) + +  ret <vscale x 2 x half> %a +} + +declare <vscale x 2 x half> @llvm.riscv.vloxei.mask.nxv2f16.nxv2i64( +  <vscale x 2 x half>, +  ptr, +  <vscale x 2 x i64>, +  <vscale x 2 x i1>, +  i64, +  i64); + +define <vscale x 2 x half> @intrinsic_vloxei_mask_v_nxv2f16_nxv2f16_nxv2i64(<vscale x 2 x half> %0, ptr %1, <vscale x 2 x i64> %2, <vscale x 2 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv2f16_nxv2f16_nxv2i64: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e16, mf2, ta, mu +; CHECK-NEXT:    vloxei64.v v8, (a0), v10, v0.t +; CHECK-NEXT:    ret +entry: +  %a = call <vscale x 2 x half> @llvm.riscv.vloxei.mask.nxv2f16.nxv2i64( +    <vscale x 2 x half> %0, +    ptr %1, +    <vscale x 2 x i64> %2, +    <vscale x 2 x i1> %3, +    i64 %4, i64 1) + +  ret <vscale x 2 x half> %a +} + +declare <vscale x 4 x half> @llvm.riscv.vloxei.nxv4f16.nxv4i64( +  <vscale x 4 x half>, +  ptr, +  <vscale x 4 x i64>, +  i64); + +define <vscale x 4 x half> @intrinsic_vloxei_v_nxv4f16_nxv4f16_nxv4i64(ptr %0, <vscale x 4 x i64> %1, i64 %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv4f16_nxv4f16_nxv4i64: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e16, m1, ta, ma +; CHECK-NEXT:    vloxei64.v v12, (a0), v8 +; CHECK-NEXT:    vmv.v.v v8, v12 +; CHECK-NEXT:    ret +entry: +  %a = call <vscale x 4 x half> @llvm.riscv.vloxei.nxv4f16.nxv4i64( +    <vscale x 4 x half> poison, +    ptr %0, +    <vscale x 4 x i64> %1, +    i64 %2) + +  ret <vscale x 4 x half> %a +} + +declare <vscale x 4 x half> @llvm.riscv.vloxei.mask.nxv4f16.nxv4i64( +  <vscale x 4 x half>, +  ptr, +  <vscale x 4 x i64>, +  <vscale x 4 x i1>, +  i64, +  i64); + +define <vscale x 4 x half> @intrinsic_vloxei_mask_v_nxv4f16_nxv4f16_nxv4i64(<vscale x 4 x half> 
%0, ptr %1, <vscale x 4 x i64> %2, <vscale x 4 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv4f16_nxv4f16_nxv4i64: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e16, m1, ta, mu +; CHECK-NEXT:    vloxei64.v v8, (a0), v12, v0.t +; CHECK-NEXT:    ret +entry: +  %a = call <vscale x 4 x half> @llvm.riscv.vloxei.mask.nxv4f16.nxv4i64( +    <vscale x 4 x half> %0, +    ptr %1, +    <vscale x 4 x i64> %2, +    <vscale x 4 x i1> %3, +    i64 %4, i64 1) + +  ret <vscale x 4 x half> %a +} + +declare <vscale x 8 x half> @llvm.riscv.vloxei.nxv8f16.nxv8i64( +  <vscale x 8 x half>, +  ptr, +  <vscale x 8 x i64>, +  i64); + +define <vscale x 8 x half> @intrinsic_vloxei_v_nxv8f16_nxv8f16_nxv8i64(ptr %0, <vscale x 8 x i64> %1, i64 %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv8f16_nxv8f16_nxv8i64: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e16, m2, ta, ma +; CHECK-NEXT:    vloxei64.v v16, (a0), v8 +; CHECK-NEXT:    vmv.v.v v8, v16 +; CHECK-NEXT:    ret +entry: +  %a = call <vscale x 8 x half> @llvm.riscv.vloxei.nxv8f16.nxv8i64( +    <vscale x 8 x half> poison, +    ptr %0, +    <vscale x 8 x i64> %1, +    i64 %2) + +  ret <vscale x 8 x half> %a +} + +declare <vscale x 8 x half> @llvm.riscv.vloxei.mask.nxv8f16.nxv8i64( +  <vscale x 8 x half>, +  ptr, +  <vscale x 8 x i64>, +  <vscale x 8 x i1>, +  i64, +  i64); + +define <vscale x 8 x half> @intrinsic_vloxei_mask_v_nxv8f16_nxv8f16_nxv8i64(<vscale x 8 x half> %0, ptr %1, <vscale x 8 x i64> %2, <vscale x 8 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv8f16_nxv8f16_nxv8i64: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e16, m2, ta, mu +; CHECK-NEXT:    vloxei64.v v8, (a0), v16, v0.t +; CHECK-NEXT:    ret +entry: +  %a = call <vscale x 8 x half> @llvm.riscv.vloxei.mask.nxv8f16.nxv8i64( +    <vscale x 8 x half> %0, +    ptr %1, +    <vscale x 8 x i64> %2, +    <vscale x 8 x i1> %3, +    i64 %4, i64 1) + +  ret <vscale x 8 x half> %a +} + +declare <vscale x 1 x float> @llvm.riscv.vloxei.nxv1f32.nxv1i64( +  <vscale x 1 x float>, +  ptr, +  <vscale x 1 x i64>, +  i64); + +define <vscale x 1 x float> @intrinsic_vloxei_v_nxv1f32_nxv1f32_nxv1i64(ptr %0, <vscale x 1 x i64> %1, i64 %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv1f32_nxv1f32_nxv1i64: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e32, mf2, ta, ma +; CHECK-NEXT:    vloxei64.v v9, (a0), v8 +; CHECK-NEXT:    vmv1r.v v8, v9 +; CHECK-NEXT:    ret +entry: +  %a = call <vscale x 1 x float> @llvm.riscv.vloxei.nxv1f32.nxv1i64( +    <vscale x 1 x float> poison, +    ptr %0, +    <vscale x 1 x i64> %1, +    i64 %2) + +  ret <vscale x 1 x float> %a +} + +declare <vscale x 1 x float> @llvm.riscv.vloxei.mask.nxv1f32.nxv1i64( +  <vscale x 1 x float>, +  ptr, +  <vscale x 1 x i64>, +  <vscale x 1 x i1>, +  i64, +  i64); + +define <vscale x 1 x float> @intrinsic_vloxei_mask_v_nxv1f32_nxv1f32_nxv1i64(<vscale x 1 x float> %0, ptr %1, <vscale x 1 x i64> %2, <vscale x 1 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv1f32_nxv1f32_nxv1i64: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e32, mf2, ta, mu +; CHECK-NEXT:    vloxei64.v v8, (a0), v9, v0.t +; CHECK-NEXT:    ret +entry: +  %a = call <vscale x 1 x float> @llvm.riscv.vloxei.mask.nxv1f32.nxv1i64( +    <vscale x 1 x float> %0, +    ptr %1, +    <vscale x 1 x i64> %2, +    <vscale x 1 x i1> %3, +    i64 %4, i64 1) + +  ret <vscale x 1 x float> %a +} + +declare <vscale 
x 2 x float> @llvm.riscv.vloxei.nxv2f32.nxv2i64( +  <vscale x 2 x float>, +  ptr, +  <vscale x 2 x i64>, +  i64); + +define <vscale x 2 x float> @intrinsic_vloxei_v_nxv2f32_nxv2f32_nxv2i64(ptr %0, <vscale x 2 x i64> %1, i64 %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv2f32_nxv2f32_nxv2i64: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e32, m1, ta, ma +; CHECK-NEXT:    vloxei64.v v10, (a0), v8 +; CHECK-NEXT:    vmv.v.v v8, v10 +; CHECK-NEXT:    ret +entry: +  %a = call <vscale x 2 x float> @llvm.riscv.vloxei.nxv2f32.nxv2i64( +    <vscale x 2 x float> poison, +    ptr %0, +    <vscale x 2 x i64> %1, +    i64 %2) + +  ret <vscale x 2 x float> %a +} + +declare <vscale x 2 x float> @llvm.riscv.vloxei.mask.nxv2f32.nxv2i64( +  <vscale x 2 x float>, +  ptr, +  <vscale x 2 x i64>, +  <vscale x 2 x i1>, +  i64, +  i64); + +define <vscale x 2 x float> @intrinsic_vloxei_mask_v_nxv2f32_nxv2f32_nxv2i64(<vscale x 2 x float> %0, ptr %1, <vscale x 2 x i64> %2, <vscale x 2 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv2f32_nxv2f32_nxv2i64: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e32, m1, ta, mu +; CHECK-NEXT:    vloxei64.v v8, (a0), v10, v0.t +; CHECK-NEXT:    ret +entry: +  %a = call <vscale x 2 x float> @llvm.riscv.vloxei.mask.nxv2f32.nxv2i64( +    <vscale x 2 x float> %0, +    ptr %1, +    <vscale x 2 x i64> %2, +    <vscale x 2 x i1> %3, +    i64 %4, i64 1) + +  ret <vscale x 2 x float> %a +} + +declare <vscale x 4 x float> @llvm.riscv.vloxei.nxv4f32.nxv4i64( +  <vscale x 4 x float>, +  ptr, +  <vscale x 4 x i64>, +  i64); + +define <vscale x 4 x float> @intrinsic_vloxei_v_nxv4f32_nxv4f32_nxv4i64(ptr %0, <vscale x 4 x i64> %1, i64 %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv4f32_nxv4f32_nxv4i64: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e32, m2, ta, ma +; CHECK-NEXT:    vloxei64.v v12, (a0), v8 +; CHECK-NEXT:    vmv.v.v v8, v12 +; CHECK-NEXT:    ret +entry: +  %a = call <vscale x 4 x float> @llvm.riscv.vloxei.nxv4f32.nxv4i64( +    <vscale x 4 x float> poison, +    ptr %0, +    <vscale x 4 x i64> %1, +    i64 %2) + +  ret <vscale x 4 x float> %a +} + +declare <vscale x 4 x float> @llvm.riscv.vloxei.mask.nxv4f32.nxv4i64( +  <vscale x 4 x float>, +  ptr, +  <vscale x 4 x i64>, +  <vscale x 4 x i1>, +  i64, +  i64); + +define <vscale x 4 x float> @intrinsic_vloxei_mask_v_nxv4f32_nxv4f32_nxv4i64(<vscale x 4 x float> %0, ptr %1, <vscale x 4 x i64> %2, <vscale x 4 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv4f32_nxv4f32_nxv4i64: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e32, m2, ta, mu +; CHECK-NEXT:    vloxei64.v v8, (a0), v12, v0.t +; CHECK-NEXT:    ret +entry: +  %a = call <vscale x 4 x float> @llvm.riscv.vloxei.mask.nxv4f32.nxv4i64( +    <vscale x 4 x float> %0, +    ptr %1, +    <vscale x 4 x i64> %2, +    <vscale x 4 x i1> %3, +    i64 %4, i64 1) + +  ret <vscale x 4 x float> %a +} + +declare <vscale x 8 x float> @llvm.riscv.vloxei.nxv8f32.nxv8i64( +  <vscale x 8 x float>, +  ptr, +  <vscale x 8 x i64>, +  i64); + +define <vscale x 8 x float> @intrinsic_vloxei_v_nxv8f32_nxv8f32_nxv8i64(ptr %0, <vscale x 8 x i64> %1, i64 %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv8f32_nxv8f32_nxv8i64: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e32, m4, ta, ma +; CHECK-NEXT:    vloxei64.v v16, (a0), v8 +; CHECK-NEXT:    vmv.v.v v8, v16 +; CHECK-NEXT:    ret +entry: +  %a = call <vscale x 8 x float> 
@llvm.riscv.vloxei.nxv8f32.nxv8i64( +    <vscale x 8 x float> poison, +    ptr %0, +    <vscale x 8 x i64> %1, +    i64 %2) + +  ret <vscale x 8 x float> %a +} + +declare <vscale x 8 x float> @llvm.riscv.vloxei.mask.nxv8f32.nxv8i64( +  <vscale x 8 x float>, +  ptr, +  <vscale x 8 x i64>, +  <vscale x 8 x i1>, +  i64, +  i64); + +define <vscale x 8 x float> @intrinsic_vloxei_mask_v_nxv8f32_nxv8f32_nxv8i64(<vscale x 8 x float> %0, ptr %1, <vscale x 8 x i64> %2, <vscale x 8 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv8f32_nxv8f32_nxv8i64: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e32, m4, ta, mu +; CHECK-NEXT:    vloxei64.v v8, (a0), v16, v0.t +; CHECK-NEXT:    ret +entry: +  %a = call <vscale x 8 x float> @llvm.riscv.vloxei.mask.nxv8f32.nxv8i64( +    <vscale x 8 x float> %0, +    ptr %1, +    <vscale x 8 x i64> %2, +    <vscale x 8 x i1> %3, +    i64 %4, i64 1) + +  ret <vscale x 8 x float> %a +} + +declare <vscale x 1 x double> @llvm.riscv.vloxei.nxv1f64.nxv1i64( +  <vscale x 1 x double>, +  ptr, +  <vscale x 1 x i64>, +  i64); + +define <vscale x 1 x double> @intrinsic_vloxei_v_nxv1f64_nxv1f64_nxv1i64(ptr %0, <vscale x 1 x i64> %1, i64 %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv1f64_nxv1f64_nxv1i64: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e64, m1, ta, ma +; CHECK-NEXT:    vloxei64.v v8, (a0), v8 +; CHECK-NEXT:    ret +entry: +  %a = call <vscale x 1 x double> @llvm.riscv.vloxei.nxv1f64.nxv1i64( +    <vscale x 1 x double> poison, +    ptr %0, +    <vscale x 1 x i64> %1, +    i64 %2) + +  ret <vscale x 1 x double> %a +} + +declare <vscale x 1 x double> @llvm.riscv.vloxei.mask.nxv1f64.nxv1i64( +  <vscale x 1 x double>, +  ptr, +  <vscale x 1 x i64>, +  <vscale x 1 x i1>, +  i64, +  i64); + +define <vscale x 1 x double> @intrinsic_vloxei_mask_v_nxv1f64_nxv1f64_nxv1i64(<vscale x 1 x double> %0, ptr %1, <vscale x 1 x i64> %2, <vscale x 1 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv1f64_nxv1f64_nxv1i64: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e64, m1, ta, mu +; CHECK-NEXT:    vloxei64.v v8, (a0), v9, v0.t +; CHECK-NEXT:    ret +entry: +  %a = call <vscale x 1 x double> @llvm.riscv.vloxei.mask.nxv1f64.nxv1i64( +    <vscale x 1 x double> %0, +    ptr %1, +    <vscale x 1 x i64> %2, +    <vscale x 1 x i1> %3, +    i64 %4, i64 1) + +  ret <vscale x 1 x double> %a +} + +declare <vscale x 2 x double> @llvm.riscv.vloxei.nxv2f64.nxv2i64( +  <vscale x 2 x double>, +  ptr, +  <vscale x 2 x i64>, +  i64); + +define <vscale x 2 x double> @intrinsic_vloxei_v_nxv2f64_nxv2f64_nxv2i64(ptr %0, <vscale x 2 x i64> %1, i64 %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv2f64_nxv2f64_nxv2i64: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e64, m2, ta, ma +; CHECK-NEXT:    vloxei64.v v8, (a0), v8 +; CHECK-NEXT:    ret +entry: +  %a = call <vscale x 2 x double> @llvm.riscv.vloxei.nxv2f64.nxv2i64( +    <vscale x 2 x double> poison, +    ptr %0, +    <vscale x 2 x i64> %1, +    i64 %2) + +  ret <vscale x 2 x double> %a +} + +declare <vscale x 2 x double> @llvm.riscv.vloxei.mask.nxv2f64.nxv2i64( +  <vscale x 2 x double>, +  ptr, +  <vscale x 2 x i64>, +  <vscale x 2 x i1>, +  i64, +  i64); + +define <vscale x 2 x double> @intrinsic_vloxei_mask_v_nxv2f64_nxv2f64_nxv2i64(<vscale x 2 x double> %0, ptr %1, <vscale x 2 x i64> %2, <vscale x 2 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv2f64_nxv2f64_nxv2i64: +; CHECK:      
 # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e64, m2, ta, mu +; CHECK-NEXT:    vloxei64.v v8, (a0), v10, v0.t +; CHECK-NEXT:    ret +entry: +  %a = call <vscale x 2 x double> @llvm.riscv.vloxei.mask.nxv2f64.nxv2i64( +    <vscale x 2 x double> %0, +    ptr %1, +    <vscale x 2 x i64> %2, +    <vscale x 2 x i1> %3, +    i64 %4, i64 1) + +  ret <vscale x 2 x double> %a +} + +declare <vscale x 4 x double> @llvm.riscv.vloxei.nxv4f64.nxv4i64( +  <vscale x 4 x double>, +  ptr, +  <vscale x 4 x i64>, +  i64); + +define <vscale x 4 x double> @intrinsic_vloxei_v_nxv4f64_nxv4f64_nxv4i64(ptr %0, <vscale x 4 x i64> %1, i64 %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv4f64_nxv4f64_nxv4i64: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e64, m4, ta, ma +; CHECK-NEXT:    vloxei64.v v8, (a0), v8 +; CHECK-NEXT:    ret +entry: +  %a = call <vscale x 4 x double> @llvm.riscv.vloxei.nxv4f64.nxv4i64( +    <vscale x 4 x double> poison, +    ptr %0, +    <vscale x 4 x i64> %1, +    i64 %2) + +  ret <vscale x 4 x double> %a +} + +declare <vscale x 4 x double> @llvm.riscv.vloxei.mask.nxv4f64.nxv4i64( +  <vscale x 4 x double>, +  ptr, +  <vscale x 4 x i64>, +  <vscale x 4 x i1>, +  i64, +  i64); + +define <vscale x 4 x double> @intrinsic_vloxei_mask_v_nxv4f64_nxv4f64_nxv4i64(<vscale x 4 x double> %0, ptr %1, <vscale x 4 x i64> %2, <vscale x 4 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv4f64_nxv4f64_nxv4i64: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e64, m4, ta, mu +; CHECK-NEXT:    vloxei64.v v8, (a0), v12, v0.t +; CHECK-NEXT:    ret +entry: +  %a = call <vscale x 4 x double> @llvm.riscv.vloxei.mask.nxv4f64.nxv4i64( +    <vscale x 4 x double> %0, +    ptr %1, +    <vscale x 4 x i64> %2, +    <vscale x 4 x i1> %3, +    i64 %4, i64 1) + +  ret <vscale x 4 x double> %a +} + +declare <vscale x 8 x double> @llvm.riscv.vloxei.nxv8f64.nxv8i64( +  <vscale x 8 x double>, +  ptr, +  <vscale x 8 x i64>, +  i64); + +define <vscale x 8 x double> @intrinsic_vloxei_v_nxv8f64_nxv8f64_nxv8i64(ptr %0, <vscale x 8 x i64> %1, i64 %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv8f64_nxv8f64_nxv8i64: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e64, m8, ta, ma +; CHECK-NEXT:    vloxei64.v v8, (a0), v8 +; CHECK-NEXT:    ret +entry: +  %a = call <vscale x 8 x double> @llvm.riscv.vloxei.nxv8f64.nxv8i64( +    <vscale x 8 x double> poison, +    ptr %0, +    <vscale x 8 x i64> %1, +    i64 %2) + +  ret <vscale x 8 x double> %a +} + +declare <vscale x 8 x double> @llvm.riscv.vloxei.mask.nxv8f64.nxv8i64( +  <vscale x 8 x double>, +  ptr, +  <vscale x 8 x i64>, +  <vscale x 8 x i1>, +  i64, +  i64); + +define <vscale x 8 x double> @intrinsic_vloxei_mask_v_nxv8f64_nxv8f64_nxv8i64(<vscale x 8 x double> %0, ptr %1, <vscale x 8 x i64> %2, <vscale x 8 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv8f64_nxv8f64_nxv8i64: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e64, m8, ta, mu +; CHECK-NEXT:    vloxei64.v v8, (a0), v16, v0.t +; CHECK-NEXT:    ret +entry: +  %a = call <vscale x 8 x double> @llvm.riscv.vloxei.mask.nxv8f64.nxv8i64( +    <vscale x 8 x double> %0, +    ptr %1, +    <vscale x 8 x i64> %2, +    <vscale x 8 x i1> %3, +    i64 %4, i64 1) + +  ret <vscale x 8 x double> %a +} diff --git a/llvm/test/CodeGen/RISCV/GlobalISel/rvv/vloxei.ll b/llvm/test/CodeGen/RISCV/GlobalISel/rvv/vloxei.ll new file mode 100644 index 0000000..fafd45b --- /dev/null +++ 
b/llvm/test/CodeGen/RISCV/GlobalISel/rvv/vloxei.ll @@ -0,0 +1,5100 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: sed 's/iXLen/i32/g' %s | llc -mtriple=riscv32 -mattr=+v,+zvfhmin,+zvfbfmin \ +; RUN:   -global-isel -verify-machineinstrs -target-abi=ilp32d | FileCheck %s +; RUN: sed 's/iXLen/i64/g' %s | llc -mtriple=riscv64 -mattr=+v,+zvfhmin,+zvfbfmin \ +; RUN:   -global-isel -verify-machineinstrs -target-abi=lp64d | FileCheck %s + +declare <vscale x 1 x i8> @llvm.riscv.vloxei.nxv1i8.nxv1i32( +  <vscale x 1 x i8>, +  ptr, +  <vscale x 1 x i32>, +  iXLen); + +define <vscale x 1 x i8> @intrinsic_vloxei_v_nxv1i8_nxv1i8_nxv1i32(ptr %0, <vscale x 1 x i32> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv1i8_nxv1i8_nxv1i32: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e8, mf8, ta, ma +; CHECK-NEXT:    vloxei32.v v9, (a0), v8 +; CHECK-NEXT:    vmv1r.v v8, v9 +; CHECK-NEXT:    ret +entry: +  %a = call <vscale x 1 x i8> @llvm.riscv.vloxei.nxv1i8.nxv1i32( +    <vscale x 1 x i8> poison, +    ptr %0, +    <vscale x 1 x i32> %1, +    iXLen %2) + +  ret <vscale x 1 x i8> %a +} + +declare <vscale x 1 x i8> @llvm.riscv.vloxei.mask.nxv1i8.nxv1i32( +  <vscale x 1 x i8>, +  ptr, +  <vscale x 1 x i32>, +  <vscale x 1 x i1>, +  iXLen, +  iXLen); + +define <vscale x 1 x i8> @intrinsic_vloxei_mask_v_nxv1i8_nxv1i8_nxv1i32(<vscale x 1 x i8> %0, ptr %1, <vscale x 1 x i32> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv1i8_nxv1i8_nxv1i32: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e8, mf8, ta, mu +; CHECK-NEXT:    vloxei32.v v8, (a0), v9, v0.t +; CHECK-NEXT:    ret +entry: +  %a = call <vscale x 1 x i8> @llvm.riscv.vloxei.mask.nxv1i8.nxv1i32( +    <vscale x 1 x i8> %0, +    ptr %1, +    <vscale x 1 x i32> %2, +    <vscale x 1 x i1> %3, +    iXLen %4, iXLen 1) + +  ret <vscale x 1 x i8> %a +} + +declare <vscale x 2 x i8> @llvm.riscv.vloxei.nxv2i8.nxv2i32( +  <vscale x 2 x i8>, +  ptr, +  <vscale x 2 x i32>, +  iXLen); + +define <vscale x 2 x i8> @intrinsic_vloxei_v_nxv2i8_nxv2i8_nxv2i32(ptr %0, <vscale x 2 x i32> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv2i8_nxv2i8_nxv2i32: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e8, mf4, ta, ma +; CHECK-NEXT:    vloxei32.v v9, (a0), v8 +; CHECK-NEXT:    vmv1r.v v8, v9 +; CHECK-NEXT:    ret +entry: +  %a = call <vscale x 2 x i8> @llvm.riscv.vloxei.nxv2i8.nxv2i32( +    <vscale x 2 x i8> poison, +    ptr %0, +    <vscale x 2 x i32> %1, +    iXLen %2) + +  ret <vscale x 2 x i8> %a +} + +declare <vscale x 2 x i8> @llvm.riscv.vloxei.mask.nxv2i8.nxv2i32( +  <vscale x 2 x i8>, +  ptr, +  <vscale x 2 x i32>, +  <vscale x 2 x i1>, +  iXLen, +  iXLen); + +define <vscale x 2 x i8> @intrinsic_vloxei_mask_v_nxv2i8_nxv2i8_nxv2i32(<vscale x 2 x i8> %0, ptr %1, <vscale x 2 x i32> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv2i8_nxv2i8_nxv2i32: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e8, mf4, ta, mu +; CHECK-NEXT:    vloxei32.v v8, (a0), v9, v0.t +; CHECK-NEXT:    ret +entry: +  %a = call <vscale x 2 x i8> @llvm.riscv.vloxei.mask.nxv2i8.nxv2i32( +    <vscale x 2 x i8> %0, +    ptr %1, +    <vscale x 2 x i32> %2, +    <vscale x 2 x i1> %3, +    iXLen %4, iXLen 1) + +  ret <vscale x 2 x i8> %a +} + +declare <vscale x 4 x i8> @llvm.riscv.vloxei.nxv4i8.nxv4i32( +  <vscale x 4 x i8>, +  ptr, +  <vscale x 4 x i32>, +  iXLen); + 
+define <vscale x 4 x i8> @intrinsic_vloxei_v_nxv4i8_nxv4i8_nxv4i32(ptr %0, <vscale x 4 x i32> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv4i8_nxv4i8_nxv4i32: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e8, mf2, ta, ma +; CHECK-NEXT:    vloxei32.v v10, (a0), v8 +; CHECK-NEXT:    vmv1r.v v8, v10 +; CHECK-NEXT:    ret +entry: +  %a = call <vscale x 4 x i8> @llvm.riscv.vloxei.nxv4i8.nxv4i32( +    <vscale x 4 x i8> poison, +    ptr %0, +    <vscale x 4 x i32> %1, +    iXLen %2) + +  ret <vscale x 4 x i8> %a +} + +declare <vscale x 4 x i8> @llvm.riscv.vloxei.mask.nxv4i8.nxv4i32( +  <vscale x 4 x i8>, +  ptr, +  <vscale x 4 x i32>, +  <vscale x 4 x i1>, +  iXLen, +  iXLen); + +define <vscale x 4 x i8> @intrinsic_vloxei_mask_v_nxv4i8_nxv4i8_nxv4i32(<vscale x 4 x i8> %0, ptr %1, <vscale x 4 x i32> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv4i8_nxv4i8_nxv4i32: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e8, mf2, ta, mu +; CHECK-NEXT:    vloxei32.v v8, (a0), v10, v0.t +; CHECK-NEXT:    ret +entry: +  %a = call <vscale x 4 x i8> @llvm.riscv.vloxei.mask.nxv4i8.nxv4i32( +    <vscale x 4 x i8> %0, +    ptr %1, +    <vscale x 4 x i32> %2, +    <vscale x 4 x i1> %3, +    iXLen %4, iXLen 1) + +  ret <vscale x 4 x i8> %a +} + +declare <vscale x 8 x i8> @llvm.riscv.vloxei.nxv8i8.nxv8i32( +  <vscale x 8 x i8>, +  ptr, +  <vscale x 8 x i32>, +  iXLen); + +define <vscale x 8 x i8> @intrinsic_vloxei_v_nxv8i8_nxv8i8_nxv8i32(ptr %0, <vscale x 8 x i32> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv8i8_nxv8i8_nxv8i32: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e8, m1, ta, ma +; CHECK-NEXT:    vloxei32.v v12, (a0), v8 +; CHECK-NEXT:    vmv.v.v v8, v12 +; CHECK-NEXT:    ret +entry: +  %a = call <vscale x 8 x i8> @llvm.riscv.vloxei.nxv8i8.nxv8i32( +    <vscale x 8 x i8> poison, +    ptr %0, +    <vscale x 8 x i32> %1, +    iXLen %2) + +  ret <vscale x 8 x i8> %a +} + +declare <vscale x 8 x i8> @llvm.riscv.vloxei.mask.nxv8i8.nxv8i32( +  <vscale x 8 x i8>, +  ptr, +  <vscale x 8 x i32>, +  <vscale x 8 x i1>, +  iXLen, +  iXLen); + +define <vscale x 8 x i8> @intrinsic_vloxei_mask_v_nxv8i8_nxv8i8_nxv8i32(<vscale x 8 x i8> %0, ptr %1, <vscale x 8 x i32> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv8i8_nxv8i8_nxv8i32: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e8, m1, ta, mu +; CHECK-NEXT:    vloxei32.v v8, (a0), v12, v0.t +; CHECK-NEXT:    ret +entry: +  %a = call <vscale x 8 x i8> @llvm.riscv.vloxei.mask.nxv8i8.nxv8i32( +    <vscale x 8 x i8> %0, +    ptr %1, +    <vscale x 8 x i32> %2, +    <vscale x 8 x i1> %3, +    iXLen %4, iXLen 1) + +  ret <vscale x 8 x i8> %a +} + +declare <vscale x 16 x i8> @llvm.riscv.vloxei.nxv16i8.nxv16i32( +  <vscale x 16 x i8>, +  ptr, +  <vscale x 16 x i32>, +  iXLen); + +define <vscale x 16 x i8> @intrinsic_vloxei_v_nxv16i8_nxv16i8_nxv16i32(ptr %0, <vscale x 16 x i32> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv16i8_nxv16i8_nxv16i32: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e8, m2, ta, ma +; CHECK-NEXT:    vloxei32.v v16, (a0), v8 +; CHECK-NEXT:    vmv.v.v v8, v16 +; CHECK-NEXT:    ret +entry: +  %a = call <vscale x 16 x i8> @llvm.riscv.vloxei.nxv16i8.nxv16i32( +    <vscale x 16 x i8> poison, +    ptr %0, +    <vscale x 16 x i32> %1, +    iXLen %2) + +  ret <vscale x 16 x i8> %a +} + +declare <vscale x 16 x i8> 
@llvm.riscv.vloxei.mask.nxv16i8.nxv16i32( +  <vscale x 16 x i8>, +  ptr, +  <vscale x 16 x i32>, +  <vscale x 16 x i1>, +  iXLen, +  iXLen); + +define <vscale x 16 x i8> @intrinsic_vloxei_mask_v_nxv16i8_nxv16i8_nxv16i32(<vscale x 16 x i8> %0, ptr %1, <vscale x 16 x i32> %2, <vscale x 16 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv16i8_nxv16i8_nxv16i32: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e8, m2, ta, mu +; CHECK-NEXT:    vloxei32.v v8, (a0), v16, v0.t +; CHECK-NEXT:    ret +entry: +  %a = call <vscale x 16 x i8> @llvm.riscv.vloxei.mask.nxv16i8.nxv16i32( +    <vscale x 16 x i8> %0, +    ptr %1, +    <vscale x 16 x i32> %2, +    <vscale x 16 x i1> %3, +    iXLen %4, iXLen 1) + +  ret <vscale x 16 x i8> %a +} + +declare <vscale x 1 x i16> @llvm.riscv.vloxei.nxv1i16.nxv1i32( +  <vscale x 1 x i16>, +  ptr, +  <vscale x 1 x i32>, +  iXLen); + +define <vscale x 1 x i16> @intrinsic_vloxei_v_nxv1i16_nxv1i16_nxv1i32(ptr %0, <vscale x 1 x i32> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv1i16_nxv1i16_nxv1i32: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e16, mf4, ta, ma +; CHECK-NEXT:    vloxei32.v v9, (a0), v8 +; CHECK-NEXT:    vmv1r.v v8, v9 +; CHECK-NEXT:    ret +entry: +  %a = call <vscale x 1 x i16> @llvm.riscv.vloxei.nxv1i16.nxv1i32( +    <vscale x 1 x i16> poison, +    ptr %0, +    <vscale x 1 x i32> %1, +    iXLen %2) + +  ret <vscale x 1 x i16> %a +} + +declare <vscale x 1 x i16> @llvm.riscv.vloxei.mask.nxv1i16.nxv1i32( +  <vscale x 1 x i16>, +  ptr, +  <vscale x 1 x i32>, +  <vscale x 1 x i1>, +  iXLen, +  iXLen); + +define <vscale x 1 x i16> @intrinsic_vloxei_mask_v_nxv1i16_nxv1i16_nxv1i32(<vscale x 1 x i16> %0, ptr %1, <vscale x 1 x i32> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv1i16_nxv1i16_nxv1i32: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e16, mf4, ta, mu +; CHECK-NEXT:    vloxei32.v v8, (a0), v9, v0.t +; CHECK-NEXT:    ret +entry: +  %a = call <vscale x 1 x i16> @llvm.riscv.vloxei.mask.nxv1i16.nxv1i32( +    <vscale x 1 x i16> %0, +    ptr %1, +    <vscale x 1 x i32> %2, +    <vscale x 1 x i1> %3, +    iXLen %4, iXLen 1) + +  ret <vscale x 1 x i16> %a +} + +declare <vscale x 2 x i16> @llvm.riscv.vloxei.nxv2i16.nxv2i32( +  <vscale x 2 x i16>, +  ptr, +  <vscale x 2 x i32>, +  iXLen); + +define <vscale x 2 x i16> @intrinsic_vloxei_v_nxv2i16_nxv2i16_nxv2i32(ptr %0, <vscale x 2 x i32> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv2i16_nxv2i16_nxv2i32: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e16, mf2, ta, ma +; CHECK-NEXT:    vloxei32.v v9, (a0), v8 +; CHECK-NEXT:    vmv1r.v v8, v9 +; CHECK-NEXT:    ret +entry: +  %a = call <vscale x 2 x i16> @llvm.riscv.vloxei.nxv2i16.nxv2i32( +    <vscale x 2 x i16> poison, +    ptr %0, +    <vscale x 2 x i32> %1, +    iXLen %2) + +  ret <vscale x 2 x i16> %a +} + +declare <vscale x 2 x i16> @llvm.riscv.vloxei.mask.nxv2i16.nxv2i32( +  <vscale x 2 x i16>, +  ptr, +  <vscale x 2 x i32>, +  <vscale x 2 x i1>, +  iXLen, +  iXLen); + +define <vscale x 2 x i16> @intrinsic_vloxei_mask_v_nxv2i16_nxv2i16_nxv2i32(<vscale x 2 x i16> %0, ptr %1, <vscale x 2 x i32> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv2i16_nxv2i16_nxv2i32: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e16, mf2, ta, mu +; CHECK-NEXT:    vloxei32.v v8, (a0), v9, v0.t +; CHECK-NEXT:    ret +entry: +  %a 
= call <vscale x 2 x i16> @llvm.riscv.vloxei.mask.nxv2i16.nxv2i32( +    <vscale x 2 x i16> %0, +    ptr %1, +    <vscale x 2 x i32> %2, +    <vscale x 2 x i1> %3, +    iXLen %4, iXLen 1) + +  ret <vscale x 2 x i16> %a +} + +declare <vscale x 4 x i16> @llvm.riscv.vloxei.nxv4i16.nxv4i32( +  <vscale x 4 x i16>, +  ptr, +  <vscale x 4 x i32>, +  iXLen); + +define <vscale x 4 x i16> @intrinsic_vloxei_v_nxv4i16_nxv4i16_nxv4i32(ptr %0, <vscale x 4 x i32> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv4i16_nxv4i16_nxv4i32: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e16, m1, ta, ma +; CHECK-NEXT:    vloxei32.v v10, (a0), v8 +; CHECK-NEXT:    vmv.v.v v8, v10 +; CHECK-NEXT:    ret +entry: +  %a = call <vscale x 4 x i16> @llvm.riscv.vloxei.nxv4i16.nxv4i32( +    <vscale x 4 x i16> poison, +    ptr %0, +    <vscale x 4 x i32> %1, +    iXLen %2) + +  ret <vscale x 4 x i16> %a +} + +declare <vscale x 4 x i16> @llvm.riscv.vloxei.mask.nxv4i16.nxv4i32( +  <vscale x 4 x i16>, +  ptr, +  <vscale x 4 x i32>, +  <vscale x 4 x i1>, +  iXLen, +  iXLen); + +define <vscale x 4 x i16> @intrinsic_vloxei_mask_v_nxv4i16_nxv4i16_nxv4i32(<vscale x 4 x i16> %0, ptr %1, <vscale x 4 x i32> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv4i16_nxv4i16_nxv4i32: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e16, m1, ta, mu +; CHECK-NEXT:    vloxei32.v v8, (a0), v10, v0.t +; CHECK-NEXT:    ret +entry: +  %a = call <vscale x 4 x i16> @llvm.riscv.vloxei.mask.nxv4i16.nxv4i32( +    <vscale x 4 x i16> %0, +    ptr %1, +    <vscale x 4 x i32> %2, +    <vscale x 4 x i1> %3, +    iXLen %4, iXLen 1) + +  ret <vscale x 4 x i16> %a +} + +declare <vscale x 8 x i16> @llvm.riscv.vloxei.nxv8i16.nxv8i32( +  <vscale x 8 x i16>, +  ptr, +  <vscale x 8 x i32>, +  iXLen); + +define <vscale x 8 x i16> @intrinsic_vloxei_v_nxv8i16_nxv8i16_nxv8i32(ptr %0, <vscale x 8 x i32> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv8i16_nxv8i16_nxv8i32: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e16, m2, ta, ma +; CHECK-NEXT:    vloxei32.v v12, (a0), v8 +; CHECK-NEXT:    vmv.v.v v8, v12 +; CHECK-NEXT:    ret +entry: +  %a = call <vscale x 8 x i16> @llvm.riscv.vloxei.nxv8i16.nxv8i32( +    <vscale x 8 x i16> poison, +    ptr %0, +    <vscale x 8 x i32> %1, +    iXLen %2) + +  ret <vscale x 8 x i16> %a +} + +declare <vscale x 8 x i16> @llvm.riscv.vloxei.mask.nxv8i16.nxv8i32( +  <vscale x 8 x i16>, +  ptr, +  <vscale x 8 x i32>, +  <vscale x 8 x i1>, +  iXLen, +  iXLen); + +define <vscale x 8 x i16> @intrinsic_vloxei_mask_v_nxv8i16_nxv8i16_nxv8i32(<vscale x 8 x i16> %0, ptr %1, <vscale x 8 x i32> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv8i16_nxv8i16_nxv8i32: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e16, m2, ta, mu +; CHECK-NEXT:    vloxei32.v v8, (a0), v12, v0.t +; CHECK-NEXT:    ret +entry: +  %a = call <vscale x 8 x i16> @llvm.riscv.vloxei.mask.nxv8i16.nxv8i32( +    <vscale x 8 x i16> %0, +    ptr %1, +    <vscale x 8 x i32> %2, +    <vscale x 8 x i1> %3, +    iXLen %4, iXLen 1) + +  ret <vscale x 8 x i16> %a +} + +declare <vscale x 16 x i16> @llvm.riscv.vloxei.nxv16i16.nxv16i32( +  <vscale x 16 x i16>, +  ptr, +  <vscale x 16 x i32>, +  iXLen); + +define <vscale x 16 x i16> @intrinsic_vloxei_v_nxv16i16_nxv16i16_nxv16i32(ptr %0, <vscale x 16 x i32> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv16i16_nxv16i16_nxv16i32: +; 
CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e16, m4, ta, ma +; CHECK-NEXT:    vloxei32.v v16, (a0), v8 +; CHECK-NEXT:    vmv.v.v v8, v16 +; CHECK-NEXT:    ret +entry: +  %a = call <vscale x 16 x i16> @llvm.riscv.vloxei.nxv16i16.nxv16i32( +    <vscale x 16 x i16> poison, +    ptr %0, +    <vscale x 16 x i32> %1, +    iXLen %2) + +  ret <vscale x 16 x i16> %a +} + +declare <vscale x 16 x i16> @llvm.riscv.vloxei.mask.nxv16i16.nxv16i32( +  <vscale x 16 x i16>, +  ptr, +  <vscale x 16 x i32>, +  <vscale x 16 x i1>, +  iXLen, +  iXLen); + +define <vscale x 16 x i16> @intrinsic_vloxei_mask_v_nxv16i16_nxv16i16_nxv16i32(<vscale x 16 x i16> %0, ptr %1, <vscale x 16 x i32> %2, <vscale x 16 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv16i16_nxv16i16_nxv16i32: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e16, m4, ta, mu +; CHECK-NEXT:    vloxei32.v v8, (a0), v16, v0.t +; CHECK-NEXT:    ret +entry: +  %a = call <vscale x 16 x i16> @llvm.riscv.vloxei.mask.nxv16i16.nxv16i32( +    <vscale x 16 x i16> %0, +    ptr %1, +    <vscale x 16 x i32> %2, +    <vscale x 16 x i1> %3, +    iXLen %4, iXLen 1) + +  ret <vscale x 16 x i16> %a +} + +declare <vscale x 1 x i32> @llvm.riscv.vloxei.nxv1i32.nxv1i32( +  <vscale x 1 x i32>, +  ptr, +  <vscale x 1 x i32>, +  iXLen); + +define <vscale x 1 x i32> @intrinsic_vloxei_v_nxv1i32_nxv1i32_nxv1i32(ptr %0, <vscale x 1 x i32> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv1i32_nxv1i32_nxv1i32: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e32, mf2, ta, ma +; CHECK-NEXT:    vloxei32.v v8, (a0), v8 +; CHECK-NEXT:    ret +entry: +  %a = call <vscale x 1 x i32> @llvm.riscv.vloxei.nxv1i32.nxv1i32( +    <vscale x 1 x i32> poison, +    ptr %0, +    <vscale x 1 x i32> %1, +    iXLen %2) + +  ret <vscale x 1 x i32> %a +} + +declare <vscale x 1 x i32> @llvm.riscv.vloxei.mask.nxv1i32.nxv1i32( +  <vscale x 1 x i32>, +  ptr, +  <vscale x 1 x i32>, +  <vscale x 1 x i1>, +  iXLen, +  iXLen); + +define <vscale x 1 x i32> @intrinsic_vloxei_mask_v_nxv1i32_nxv1i32_nxv1i32(<vscale x 1 x i32> %0, ptr %1, <vscale x 1 x i32> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv1i32_nxv1i32_nxv1i32: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e32, mf2, ta, mu +; CHECK-NEXT:    vloxei32.v v8, (a0), v9, v0.t +; CHECK-NEXT:    ret +entry: +  %a = call <vscale x 1 x i32> @llvm.riscv.vloxei.mask.nxv1i32.nxv1i32( +    <vscale x 1 x i32> %0, +    ptr %1, +    <vscale x 1 x i32> %2, +    <vscale x 1 x i1> %3, +    iXLen %4, iXLen 1) + +  ret <vscale x 1 x i32> %a +} + +declare <vscale x 2 x i32> @llvm.riscv.vloxei.nxv2i32.nxv2i32( +  <vscale x 2 x i32>, +  ptr, +  <vscale x 2 x i32>, +  iXLen); + +define <vscale x 2 x i32> @intrinsic_vloxei_v_nxv2i32_nxv2i32_nxv2i32(ptr %0, <vscale x 2 x i32> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv2i32_nxv2i32_nxv2i32: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e32, m1, ta, ma +; CHECK-NEXT:    vloxei32.v v8, (a0), v8 +; CHECK-NEXT:    ret +entry: +  %a = call <vscale x 2 x i32> @llvm.riscv.vloxei.nxv2i32.nxv2i32( +    <vscale x 2 x i32> poison, +    ptr %0, +    <vscale x 2 x i32> %1, +    iXLen %2) + +  ret <vscale x 2 x i32> %a +} + +declare <vscale x 2 x i32> @llvm.riscv.vloxei.mask.nxv2i32.nxv2i32( +  <vscale x 2 x i32>, +  ptr, +  <vscale x 2 x i32>, +  <vscale x 2 x i1>, +  iXLen, +  iXLen); + +define <vscale x 2 x i32> 
@intrinsic_vloxei_mask_v_nxv2i32_nxv2i32_nxv2i32(<vscale x 2 x i32> %0, ptr %1, <vscale x 2 x i32> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv2i32_nxv2i32_nxv2i32: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e32, m1, ta, mu +; CHECK-NEXT:    vloxei32.v v8, (a0), v9, v0.t +; CHECK-NEXT:    ret +entry: +  %a = call <vscale x 2 x i32> @llvm.riscv.vloxei.mask.nxv2i32.nxv2i32( +    <vscale x 2 x i32> %0, +    ptr %1, +    <vscale x 2 x i32> %2, +    <vscale x 2 x i1> %3, +    iXLen %4, iXLen 1) + +  ret <vscale x 2 x i32> %a +} + +declare <vscale x 4 x i32> @llvm.riscv.vloxei.nxv4i32.nxv4i32( +  <vscale x 4 x i32>, +  ptr, +  <vscale x 4 x i32>, +  iXLen); + +define <vscale x 4 x i32> @intrinsic_vloxei_v_nxv4i32_nxv4i32_nxv4i32(ptr %0, <vscale x 4 x i32> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv4i32_nxv4i32_nxv4i32: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e32, m2, ta, ma +; CHECK-NEXT:    vloxei32.v v8, (a0), v8 +; CHECK-NEXT:    ret +entry: +  %a = call <vscale x 4 x i32> @llvm.riscv.vloxei.nxv4i32.nxv4i32( +    <vscale x 4 x i32> poison, +    ptr %0, +    <vscale x 4 x i32> %1, +    iXLen %2) + +  ret <vscale x 4 x i32> %a +} + +declare <vscale x 4 x i32> @llvm.riscv.vloxei.mask.nxv4i32.nxv4i32( +  <vscale x 4 x i32>, +  ptr, +  <vscale x 4 x i32>, +  <vscale x 4 x i1>, +  iXLen, +  iXLen); + +define <vscale x 4 x i32> @intrinsic_vloxei_mask_v_nxv4i32_nxv4i32_nxv4i32(<vscale x 4 x i32> %0, ptr %1, <vscale x 4 x i32> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv4i32_nxv4i32_nxv4i32: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e32, m2, ta, mu +; CHECK-NEXT:    vloxei32.v v8, (a0), v10, v0.t +; CHECK-NEXT:    ret +entry: +  %a = call <vscale x 4 x i32> @llvm.riscv.vloxei.mask.nxv4i32.nxv4i32( +    <vscale x 4 x i32> %0, +    ptr %1, +    <vscale x 4 x i32> %2, +    <vscale x 4 x i1> %3, +    iXLen %4, iXLen 1) + +  ret <vscale x 4 x i32> %a +} + +declare <vscale x 8 x i32> @llvm.riscv.vloxei.nxv8i32.nxv8i32( +  <vscale x 8 x i32>, +  ptr, +  <vscale x 8 x i32>, +  iXLen); + +define <vscale x 8 x i32> @intrinsic_vloxei_v_nxv8i32_nxv8i32_nxv8i32(ptr %0, <vscale x 8 x i32> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv8i32_nxv8i32_nxv8i32: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e32, m4, ta, ma +; CHECK-NEXT:    vloxei32.v v8, (a0), v8 +; CHECK-NEXT:    ret +entry: +  %a = call <vscale x 8 x i32> @llvm.riscv.vloxei.nxv8i32.nxv8i32( +    <vscale x 8 x i32> poison, +    ptr %0, +    <vscale x 8 x i32> %1, +    iXLen %2) + +  ret <vscale x 8 x i32> %a +} + +declare <vscale x 8 x i32> @llvm.riscv.vloxei.mask.nxv8i32.nxv8i32( +  <vscale x 8 x i32>, +  ptr, +  <vscale x 8 x i32>, +  <vscale x 8 x i1>, +  iXLen, +  iXLen); + +define <vscale x 8 x i32> @intrinsic_vloxei_mask_v_nxv8i32_nxv8i32_nxv8i32(<vscale x 8 x i32> %0, ptr %1, <vscale x 8 x i32> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv8i32_nxv8i32_nxv8i32: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e32, m4, ta, mu +; CHECK-NEXT:    vloxei32.v v8, (a0), v12, v0.t +; CHECK-NEXT:    ret +entry: +  %a = call <vscale x 8 x i32> @llvm.riscv.vloxei.mask.nxv8i32.nxv8i32( +    <vscale x 8 x i32> %0, +    ptr %1, +    <vscale x 8 x i32> %2, +    <vscale x 8 x i1> %3, +    iXLen %4, iXLen 1) + +  ret <vscale x 8 x i32> %a +} + +declare <vscale x 16 
x i32> @llvm.riscv.vloxei.nxv16i32.nxv16i32( +  <vscale x 16 x i32>, +  ptr, +  <vscale x 16 x i32>, +  iXLen); + +define <vscale x 16 x i32> @intrinsic_vloxei_v_nxv16i32_nxv16i32_nxv16i32(ptr %0, <vscale x 16 x i32> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv16i32_nxv16i32_nxv16i32: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e32, m8, ta, ma +; CHECK-NEXT:    vloxei32.v v8, (a0), v8 +; CHECK-NEXT:    ret +entry: +  %a = call <vscale x 16 x i32> @llvm.riscv.vloxei.nxv16i32.nxv16i32( +    <vscale x 16 x i32> poison, +    ptr %0, +    <vscale x 16 x i32> %1, +    iXLen %2) + +  ret <vscale x 16 x i32> %a +} + +declare <vscale x 16 x i32> @llvm.riscv.vloxei.mask.nxv16i32.nxv16i32( +  <vscale x 16 x i32>, +  ptr, +  <vscale x 16 x i32>, +  <vscale x 16 x i1>, +  iXLen, +  iXLen); + +define <vscale x 16 x i32> @intrinsic_vloxei_mask_v_nxv16i32_nxv16i32_nxv16i32(<vscale x 16 x i32> %0, ptr %1, <vscale x 16 x i32> %2, <vscale x 16 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv16i32_nxv16i32_nxv16i32: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e32, m8, ta, mu +; CHECK-NEXT:    vloxei32.v v8, (a0), v16, v0.t +; CHECK-NEXT:    ret +entry: +  %a = call <vscale x 16 x i32> @llvm.riscv.vloxei.mask.nxv16i32.nxv16i32( +    <vscale x 16 x i32> %0, +    ptr %1, +    <vscale x 16 x i32> %2, +    <vscale x 16 x i1> %3, +    iXLen %4, iXLen 1) + +  ret <vscale x 16 x i32> %a +} + +declare <vscale x 1 x i64> @llvm.riscv.vloxei.nxv1i64.nxv1i32( +  <vscale x 1 x i64>, +  ptr, +  <vscale x 1 x i32>, +  iXLen); + +define <vscale x 1 x i64> @intrinsic_vloxei_v_nxv1i64_nxv1i64_nxv1i32(ptr %0, <vscale x 1 x i32> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv1i64_nxv1i64_nxv1i32: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e64, m1, ta, ma +; CHECK-NEXT:    vloxei32.v v9, (a0), v8 +; CHECK-NEXT:    vmv.v.v v8, v9 +; CHECK-NEXT:    ret +entry: +  %a = call <vscale x 1 x i64> @llvm.riscv.vloxei.nxv1i64.nxv1i32( +    <vscale x 1 x i64> poison, +    ptr %0, +    <vscale x 1 x i32> %1, +    iXLen %2) + +  ret <vscale x 1 x i64> %a +} + +declare <vscale x 1 x i64> @llvm.riscv.vloxei.mask.nxv1i64.nxv1i32( +  <vscale x 1 x i64>, +  ptr, +  <vscale x 1 x i32>, +  <vscale x 1 x i1>, +  iXLen, +  iXLen); + +define <vscale x 1 x i64> @intrinsic_vloxei_mask_v_nxv1i64_nxv1i64_nxv1i32(<vscale x 1 x i64> %0, ptr %1, <vscale x 1 x i32> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv1i64_nxv1i64_nxv1i32: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e64, m1, ta, mu +; CHECK-NEXT:    vloxei32.v v8, (a0), v9, v0.t +; CHECK-NEXT:    ret +entry: +  %a = call <vscale x 1 x i64> @llvm.riscv.vloxei.mask.nxv1i64.nxv1i32( +    <vscale x 1 x i64> %0, +    ptr %1, +    <vscale x 1 x i32> %2, +    <vscale x 1 x i1> %3, +    iXLen %4, iXLen 1) + +  ret <vscale x 1 x i64> %a +} + +declare <vscale x 2 x i64> @llvm.riscv.vloxei.nxv2i64.nxv2i32( +  <vscale x 2 x i64>, +  ptr, +  <vscale x 2 x i32>, +  iXLen); + +define <vscale x 2 x i64> @intrinsic_vloxei_v_nxv2i64_nxv2i64_nxv2i32(ptr %0, <vscale x 2 x i32> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv2i64_nxv2i64_nxv2i32: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetivli zero, 1, e8, m1, ta, ma +; CHECK-NEXT:    vmv1r.v v10, v8 +; CHECK-NEXT:    vsetvli zero, a1, e64, m2, ta, ma +; CHECK-NEXT:    vloxei32.v v8, (a0), v10 +; CHECK-NEXT:    ret +entry: +  %a = call 
<vscale x 2 x i64> @llvm.riscv.vloxei.nxv2i64.nxv2i32( +    <vscale x 2 x i64> poison, +    ptr %0, +    <vscale x 2 x i32> %1, +    iXLen %2) + +  ret <vscale x 2 x i64> %a +} + +declare <vscale x 2 x i64> @llvm.riscv.vloxei.mask.nxv2i64.nxv2i32( +  <vscale x 2 x i64>, +  ptr, +  <vscale x 2 x i32>, +  <vscale x 2 x i1>, +  iXLen, +  iXLen); + +define <vscale x 2 x i64> @intrinsic_vloxei_mask_v_nxv2i64_nxv2i64_nxv2i32(<vscale x 2 x i64> %0, ptr %1, <vscale x 2 x i32> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv2i64_nxv2i64_nxv2i32: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e64, m2, ta, mu +; CHECK-NEXT:    vloxei32.v v8, (a0), v10, v0.t +; CHECK-NEXT:    ret +entry: +  %a = call <vscale x 2 x i64> @llvm.riscv.vloxei.mask.nxv2i64.nxv2i32( +    <vscale x 2 x i64> %0, +    ptr %1, +    <vscale x 2 x i32> %2, +    <vscale x 2 x i1> %3, +    iXLen %4, iXLen 1) + +  ret <vscale x 2 x i64> %a +} + +declare <vscale x 4 x i64> @llvm.riscv.vloxei.nxv4i64.nxv4i32( +  <vscale x 4 x i64>, +  ptr, +  <vscale x 4 x i32>, +  iXLen); + +define <vscale x 4 x i64> @intrinsic_vloxei_v_nxv4i64_nxv4i64_nxv4i32(ptr %0, <vscale x 4 x i32> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv4i64_nxv4i64_nxv4i32: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetivli zero, 1, e8, m1, ta, ma +; CHECK-NEXT:    vmv2r.v v12, v8 +; CHECK-NEXT:    vsetvli zero, a1, e64, m4, ta, ma +; CHECK-NEXT:    vloxei32.v v8, (a0), v12 +; CHECK-NEXT:    ret +entry: +  %a = call <vscale x 4 x i64> @llvm.riscv.vloxei.nxv4i64.nxv4i32( +    <vscale x 4 x i64> poison, +    ptr %0, +    <vscale x 4 x i32> %1, +    iXLen %2) + +  ret <vscale x 4 x i64> %a +} + +declare <vscale x 4 x i64> @llvm.riscv.vloxei.mask.nxv4i64.nxv4i32( +  <vscale x 4 x i64>, +  ptr, +  <vscale x 4 x i32>, +  <vscale x 4 x i1>, +  iXLen, +  iXLen); + +define <vscale x 4 x i64> @intrinsic_vloxei_mask_v_nxv4i64_nxv4i64_nxv4i32(<vscale x 4 x i64> %0, ptr %1, <vscale x 4 x i32> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv4i64_nxv4i64_nxv4i32: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e64, m4, ta, mu +; CHECK-NEXT:    vloxei32.v v8, (a0), v12, v0.t +; CHECK-NEXT:    ret +entry: +  %a = call <vscale x 4 x i64> @llvm.riscv.vloxei.mask.nxv4i64.nxv4i32( +    <vscale x 4 x i64> %0, +    ptr %1, +    <vscale x 4 x i32> %2, +    <vscale x 4 x i1> %3, +    iXLen %4, iXLen 1) + +  ret <vscale x 4 x i64> %a +} + +declare <vscale x 8 x i64> @llvm.riscv.vloxei.nxv8i64.nxv8i32( +  <vscale x 8 x i64>, +  ptr, +  <vscale x 8 x i32>, +  iXLen); + +define <vscale x 8 x i64> @intrinsic_vloxei_v_nxv8i64_nxv8i64_nxv8i32(ptr %0, <vscale x 8 x i32> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv8i64_nxv8i64_nxv8i32: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetivli zero, 1, e8, m1, ta, ma +; CHECK-NEXT:    vmv4r.v v16, v8 +; CHECK-NEXT:    vsetvli zero, a1, e64, m8, ta, ma +; CHECK-NEXT:    vloxei32.v v8, (a0), v16 +; CHECK-NEXT:    ret +entry: +  %a = call <vscale x 8 x i64> @llvm.riscv.vloxei.nxv8i64.nxv8i32( +    <vscale x 8 x i64> poison, +    ptr %0, +    <vscale x 8 x i32> %1, +    iXLen %2) + +  ret <vscale x 8 x i64> %a +} + +declare <vscale x 8 x i64> @llvm.riscv.vloxei.mask.nxv8i64.nxv8i32( +  <vscale x 8 x i64>, +  ptr, +  <vscale x 8 x i32>, +  <vscale x 8 x i1>, +  iXLen, +  iXLen); + +define <vscale x 8 x i64> @intrinsic_vloxei_mask_v_nxv8i64_nxv8i64_nxv8i32(<vscale x 8 x i64> %0, ptr %1, 
<vscale x 8 x i32> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv8i64_nxv8i64_nxv8i32: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e64, m8, ta, mu +; CHECK-NEXT:    vloxei32.v v8, (a0), v16, v0.t +; CHECK-NEXT:    ret +entry: +  %a = call <vscale x 8 x i64> @llvm.riscv.vloxei.mask.nxv8i64.nxv8i32( +    <vscale x 8 x i64> %0, +    ptr %1, +    <vscale x 8 x i32> %2, +    <vscale x 8 x i1> %3, +    iXLen %4, iXLen 1) + +  ret <vscale x 8 x i64> %a +} + +declare <vscale x 1 x half> @llvm.riscv.vloxei.nxv1f16.nxv1i32( +  <vscale x 1 x half>, +  ptr, +  <vscale x 1 x i32>, +  iXLen); + +define <vscale x 1 x half> @intrinsic_vloxei_v_nxv1f16_nxv1f16_nxv1i32(ptr %0, <vscale x 1 x i32> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv1f16_nxv1f16_nxv1i32: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e16, mf4, ta, ma +; CHECK-NEXT:    vloxei32.v v9, (a0), v8 +; CHECK-NEXT:    vmv1r.v v8, v9 +; CHECK-NEXT:    ret +entry: +  %a = call <vscale x 1 x half> @llvm.riscv.vloxei.nxv1f16.nxv1i32( +    <vscale x 1 x half> poison, +    ptr %0, +    <vscale x 1 x i32> %1, +    iXLen %2) + +  ret <vscale x 1 x half> %a +} + +declare <vscale x 1 x half> @llvm.riscv.vloxei.mask.nxv1f16.nxv1i32( +  <vscale x 1 x half>, +  ptr, +  <vscale x 1 x i32>, +  <vscale x 1 x i1>, +  iXLen, +  iXLen); + +define <vscale x 1 x half> @intrinsic_vloxei_mask_v_nxv1f16_nxv1f16_nxv1i32(<vscale x 1 x half> %0, ptr %1, <vscale x 1 x i32> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv1f16_nxv1f16_nxv1i32: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e16, mf4, ta, mu +; CHECK-NEXT:    vloxei32.v v8, (a0), v9, v0.t +; CHECK-NEXT:    ret +entry: +  %a = call <vscale x 1 x half> @llvm.riscv.vloxei.mask.nxv1f16.nxv1i32( +    <vscale x 1 x half> %0, +    ptr %1, +    <vscale x 1 x i32> %2, +    <vscale x 1 x i1> %3, +    iXLen %4, iXLen 1) + +  ret <vscale x 1 x half> %a +} + +declare <vscale x 2 x half> @llvm.riscv.vloxei.nxv2f16.nxv2i32( +  <vscale x 2 x half>, +  ptr, +  <vscale x 2 x i32>, +  iXLen); + +define <vscale x 2 x half> @intrinsic_vloxei_v_nxv2f16_nxv2f16_nxv2i32(ptr %0, <vscale x 2 x i32> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv2f16_nxv2f16_nxv2i32: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e16, mf2, ta, ma +; CHECK-NEXT:    vloxei32.v v9, (a0), v8 +; CHECK-NEXT:    vmv1r.v v8, v9 +; CHECK-NEXT:    ret +entry: +  %a = call <vscale x 2 x half> @llvm.riscv.vloxei.nxv2f16.nxv2i32( +    <vscale x 2 x half> poison, +    ptr %0, +    <vscale x 2 x i32> %1, +    iXLen %2) + +  ret <vscale x 2 x half> %a +} + +declare <vscale x 2 x half> @llvm.riscv.vloxei.mask.nxv2f16.nxv2i32( +  <vscale x 2 x half>, +  ptr, +  <vscale x 2 x i32>, +  <vscale x 2 x i1>, +  iXLen, +  iXLen); + +define <vscale x 2 x half> @intrinsic_vloxei_mask_v_nxv2f16_nxv2f16_nxv2i32(<vscale x 2 x half> %0, ptr %1, <vscale x 2 x i32> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv2f16_nxv2f16_nxv2i32: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e16, mf2, ta, mu +; CHECK-NEXT:    vloxei32.v v8, (a0), v9, v0.t +; CHECK-NEXT:    ret +entry: +  %a = call <vscale x 2 x half> @llvm.riscv.vloxei.mask.nxv2f16.nxv2i32( +    <vscale x 2 x half> %0, +    ptr %1, +    <vscale x 2 x i32> %2, +    <vscale x 2 x i1> %3, +    iXLen %4, iXLen 1) + +  ret <vscale x 2 x half> %a +} + 
+declare <vscale x 4 x half> @llvm.riscv.vloxei.nxv4f16.nxv4i32( +  <vscale x 4 x half>, +  ptr, +  <vscale x 4 x i32>, +  iXLen); + +define <vscale x 4 x half> @intrinsic_vloxei_v_nxv4f16_nxv4f16_nxv4i32(ptr %0, <vscale x 4 x i32> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv4f16_nxv4f16_nxv4i32: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e16, m1, ta, ma +; CHECK-NEXT:    vloxei32.v v10, (a0), v8 +; CHECK-NEXT:    vmv.v.v v8, v10 +; CHECK-NEXT:    ret +entry: +  %a = call <vscale x 4 x half> @llvm.riscv.vloxei.nxv4f16.nxv4i32( +    <vscale x 4 x half> poison, +    ptr %0, +    <vscale x 4 x i32> %1, +    iXLen %2) + +  ret <vscale x 4 x half> %a +} + +declare <vscale x 4 x half> @llvm.riscv.vloxei.mask.nxv4f16.nxv4i32( +  <vscale x 4 x half>, +  ptr, +  <vscale x 4 x i32>, +  <vscale x 4 x i1>, +  iXLen, +  iXLen); + +define <vscale x 4 x half> @intrinsic_vloxei_mask_v_nxv4f16_nxv4f16_nxv4i32(<vscale x 4 x half> %0, ptr %1, <vscale x 4 x i32> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv4f16_nxv4f16_nxv4i32: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e16, m1, ta, mu +; CHECK-NEXT:    vloxei32.v v8, (a0), v10, v0.t +; CHECK-NEXT:    ret +entry: +  %a = call <vscale x 4 x half> @llvm.riscv.vloxei.mask.nxv4f16.nxv4i32( +    <vscale x 4 x half> %0, +    ptr %1, +    <vscale x 4 x i32> %2, +    <vscale x 4 x i1> %3, +    iXLen %4, iXLen 1) + +  ret <vscale x 4 x half> %a +} + +declare <vscale x 8 x half> @llvm.riscv.vloxei.nxv8f16.nxv8i32( +  <vscale x 8 x half>, +  ptr, +  <vscale x 8 x i32>, +  iXLen); + +define <vscale x 8 x half> @intrinsic_vloxei_v_nxv8f16_nxv8f16_nxv8i32(ptr %0, <vscale x 8 x i32> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv8f16_nxv8f16_nxv8i32: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e16, m2, ta, ma +; CHECK-NEXT:    vloxei32.v v12, (a0), v8 +; CHECK-NEXT:    vmv.v.v v8, v12 +; CHECK-NEXT:    ret +entry: +  %a = call <vscale x 8 x half> @llvm.riscv.vloxei.nxv8f16.nxv8i32( +    <vscale x 8 x half> poison, +    ptr %0, +    <vscale x 8 x i32> %1, +    iXLen %2) + +  ret <vscale x 8 x half> %a +} + +declare <vscale x 8 x half> @llvm.riscv.vloxei.mask.nxv8f16.nxv8i32( +  <vscale x 8 x half>, +  ptr, +  <vscale x 8 x i32>, +  <vscale x 8 x i1>, +  iXLen, +  iXLen); + +define <vscale x 8 x half> @intrinsic_vloxei_mask_v_nxv8f16_nxv8f16_nxv8i32(<vscale x 8 x half> %0, ptr %1, <vscale x 8 x i32> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv8f16_nxv8f16_nxv8i32: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e16, m2, ta, mu +; CHECK-NEXT:    vloxei32.v v8, (a0), v12, v0.t +; CHECK-NEXT:    ret +entry: +  %a = call <vscale x 8 x half> @llvm.riscv.vloxei.mask.nxv8f16.nxv8i32( +    <vscale x 8 x half> %0, +    ptr %1, +    <vscale x 8 x i32> %2, +    <vscale x 8 x i1> %3, +    iXLen %4, iXLen 1) + +  ret <vscale x 8 x half> %a +} + +declare <vscale x 16 x half> @llvm.riscv.vloxei.nxv16f16.nxv16i32( +  <vscale x 16 x half>, +  ptr, +  <vscale x 16 x i32>, +  iXLen); + +define <vscale x 16 x half> @intrinsic_vloxei_v_nxv16f16_nxv16f16_nxv16i32(ptr %0, <vscale x 16 x i32> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv16f16_nxv16f16_nxv16i32: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e16, m4, ta, ma +; CHECK-NEXT:    vloxei32.v v16, (a0), v8 +; CHECK-NEXT:    vmv.v.v v8, v16 +; CHECK-NEXT:    ret +entry: +  %a = 
call <vscale x 16 x half> @llvm.riscv.vloxei.nxv16f16.nxv16i32( +    <vscale x 16 x half> poison, +    ptr %0, +    <vscale x 16 x i32> %1, +    iXLen %2) + +  ret <vscale x 16 x half> %a +} + +declare <vscale x 16 x half> @llvm.riscv.vloxei.mask.nxv16f16.nxv16i32( +  <vscale x 16 x half>, +  ptr, +  <vscale x 16 x i32>, +  <vscale x 16 x i1>, +  iXLen, +  iXLen); + +define <vscale x 16 x half> @intrinsic_vloxei_mask_v_nxv16f16_nxv16f16_nxv16i32(<vscale x 16 x half> %0, ptr %1, <vscale x 16 x i32> %2, <vscale x 16 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv16f16_nxv16f16_nxv16i32: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e16, m4, ta, mu +; CHECK-NEXT:    vloxei32.v v8, (a0), v16, v0.t +; CHECK-NEXT:    ret +entry: +  %a = call <vscale x 16 x half> @llvm.riscv.vloxei.mask.nxv16f16.nxv16i32( +    <vscale x 16 x half> %0, +    ptr %1, +    <vscale x 16 x i32> %2, +    <vscale x 16 x i1> %3, +    iXLen %4, iXLen 1) + +  ret <vscale x 16 x half> %a +} + +declare <vscale x 1 x float> @llvm.riscv.vloxei.nxv1f32.nxv1i32( +  <vscale x 1 x float>, +  ptr, +  <vscale x 1 x i32>, +  iXLen); + +define <vscale x 1 x float> @intrinsic_vloxei_v_nxv1f32_nxv1f32_nxv1i32(ptr %0, <vscale x 1 x i32> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv1f32_nxv1f32_nxv1i32: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e32, mf2, ta, ma +; CHECK-NEXT:    vloxei32.v v8, (a0), v8 +; CHECK-NEXT:    ret +entry: +  %a = call <vscale x 1 x float> @llvm.riscv.vloxei.nxv1f32.nxv1i32( +    <vscale x 1 x float> poison, +    ptr %0, +    <vscale x 1 x i32> %1, +    iXLen %2) + +  ret <vscale x 1 x float> %a +} + +declare <vscale x 1 x float> @llvm.riscv.vloxei.mask.nxv1f32.nxv1i32( +  <vscale x 1 x float>, +  ptr, +  <vscale x 1 x i32>, +  <vscale x 1 x i1>, +  iXLen, +  iXLen); + +define <vscale x 1 x float> @intrinsic_vloxei_mask_v_nxv1f32_nxv1f32_nxv1i32(<vscale x 1 x float> %0, ptr %1, <vscale x 1 x i32> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv1f32_nxv1f32_nxv1i32: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e32, mf2, ta, mu +; CHECK-NEXT:    vloxei32.v v8, (a0), v9, v0.t +; CHECK-NEXT:    ret +entry: +  %a = call <vscale x 1 x float> @llvm.riscv.vloxei.mask.nxv1f32.nxv1i32( +    <vscale x 1 x float> %0, +    ptr %1, +    <vscale x 1 x i32> %2, +    <vscale x 1 x i1> %3, +    iXLen %4, iXLen 1) + +  ret <vscale x 1 x float> %a +} + +declare <vscale x 2 x float> @llvm.riscv.vloxei.nxv2f32.nxv2i32( +  <vscale x 2 x float>, +  ptr, +  <vscale x 2 x i32>, +  iXLen); + +define <vscale x 2 x float> @intrinsic_vloxei_v_nxv2f32_nxv2f32_nxv2i32(ptr %0, <vscale x 2 x i32> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv2f32_nxv2f32_nxv2i32: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e32, m1, ta, ma +; CHECK-NEXT:    vloxei32.v v8, (a0), v8 +; CHECK-NEXT:    ret +entry: +  %a = call <vscale x 2 x float> @llvm.riscv.vloxei.nxv2f32.nxv2i32( +    <vscale x 2 x float> poison, +    ptr %0, +    <vscale x 2 x i32> %1, +    iXLen %2) + +  ret <vscale x 2 x float> %a +} + +declare <vscale x 2 x float> @llvm.riscv.vloxei.mask.nxv2f32.nxv2i32( +  <vscale x 2 x float>, +  ptr, +  <vscale x 2 x i32>, +  <vscale x 2 x i1>, +  iXLen, +  iXLen); + +define <vscale x 2 x float> @intrinsic_vloxei_mask_v_nxv2f32_nxv2f32_nxv2i32(<vscale x 2 x float> %0, ptr %1, <vscale x 2 x i32> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: 
intrinsic_vloxei_mask_v_nxv2f32_nxv2f32_nxv2i32: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e32, m1, ta, mu +; CHECK-NEXT:    vloxei32.v v8, (a0), v9, v0.t +; CHECK-NEXT:    ret +entry: +  %a = call <vscale x 2 x float> @llvm.riscv.vloxei.mask.nxv2f32.nxv2i32( +    <vscale x 2 x float> %0, +    ptr %1, +    <vscale x 2 x i32> %2, +    <vscale x 2 x i1> %3, +    iXLen %4, iXLen 1) + +  ret <vscale x 2 x float> %a +} + +declare <vscale x 4 x float> @llvm.riscv.vloxei.nxv4f32.nxv4i32( +  <vscale x 4 x float>, +  ptr, +  <vscale x 4 x i32>, +  iXLen); + +define <vscale x 4 x float> @intrinsic_vloxei_v_nxv4f32_nxv4f32_nxv4i32(ptr %0, <vscale x 4 x i32> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv4f32_nxv4f32_nxv4i32: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e32, m2, ta, ma +; CHECK-NEXT:    vloxei32.v v8, (a0), v8 +; CHECK-NEXT:    ret +entry: +  %a = call <vscale x 4 x float> @llvm.riscv.vloxei.nxv4f32.nxv4i32( +    <vscale x 4 x float> poison, +    ptr %0, +    <vscale x 4 x i32> %1, +    iXLen %2) + +  ret <vscale x 4 x float> %a +} + +declare <vscale x 4 x float> @llvm.riscv.vloxei.mask.nxv4f32.nxv4i32( +  <vscale x 4 x float>, +  ptr, +  <vscale x 4 x i32>, +  <vscale x 4 x i1>, +  iXLen, +  iXLen); + +define <vscale x 4 x float> @intrinsic_vloxei_mask_v_nxv4f32_nxv4f32_nxv4i32(<vscale x 4 x float> %0, ptr %1, <vscale x 4 x i32> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv4f32_nxv4f32_nxv4i32: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e32, m2, ta, mu +; CHECK-NEXT:    vloxei32.v v8, (a0), v10, v0.t +; CHECK-NEXT:    ret +entry: +  %a = call <vscale x 4 x float> @llvm.riscv.vloxei.mask.nxv4f32.nxv4i32( +    <vscale x 4 x float> %0, +    ptr %1, +    <vscale x 4 x i32> %2, +    <vscale x 4 x i1> %3, +    iXLen %4, iXLen 1) + +  ret <vscale x 4 x float> %a +} + +declare <vscale x 8 x float> @llvm.riscv.vloxei.nxv8f32.nxv8i32( +  <vscale x 8 x float>, +  ptr, +  <vscale x 8 x i32>, +  iXLen); + +define <vscale x 8 x float> @intrinsic_vloxei_v_nxv8f32_nxv8f32_nxv8i32(ptr %0, <vscale x 8 x i32> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv8f32_nxv8f32_nxv8i32: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e32, m4, ta, ma +; CHECK-NEXT:    vloxei32.v v8, (a0), v8 +; CHECK-NEXT:    ret +entry: +  %a = call <vscale x 8 x float> @llvm.riscv.vloxei.nxv8f32.nxv8i32( +    <vscale x 8 x float> poison, +    ptr %0, +    <vscale x 8 x i32> %1, +    iXLen %2) + +  ret <vscale x 8 x float> %a +} + +declare <vscale x 8 x float> @llvm.riscv.vloxei.mask.nxv8f32.nxv8i32( +  <vscale x 8 x float>, +  ptr, +  <vscale x 8 x i32>, +  <vscale x 8 x i1>, +  iXLen, +  iXLen); + +define <vscale x 8 x float> @intrinsic_vloxei_mask_v_nxv8f32_nxv8f32_nxv8i32(<vscale x 8 x float> %0, ptr %1, <vscale x 8 x i32> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv8f32_nxv8f32_nxv8i32: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e32, m4, ta, mu +; CHECK-NEXT:    vloxei32.v v8, (a0), v12, v0.t +; CHECK-NEXT:    ret +entry: +  %a = call <vscale x 8 x float> @llvm.riscv.vloxei.mask.nxv8f32.nxv8i32( +    <vscale x 8 x float> %0, +    ptr %1, +    <vscale x 8 x i32> %2, +    <vscale x 8 x i1> %3, +    iXLen %4, iXLen 1) + +  ret <vscale x 8 x float> %a +} + +declare <vscale x 16 x float> @llvm.riscv.vloxei.nxv16f32.nxv16i32( +  <vscale x 16 x float>, +  ptr, +  <vscale x 16 x i32>, 
+  iXLen); + +define <vscale x 16 x float> @intrinsic_vloxei_v_nxv16f32_nxv16f32_nxv16i32(ptr %0, <vscale x 16 x i32> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv16f32_nxv16f32_nxv16i32: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e32, m8, ta, ma +; CHECK-NEXT:    vloxei32.v v8, (a0), v8 +; CHECK-NEXT:    ret +entry: +  %a = call <vscale x 16 x float> @llvm.riscv.vloxei.nxv16f32.nxv16i32( +    <vscale x 16 x float> poison, +    ptr %0, +    <vscale x 16 x i32> %1, +    iXLen %2) + +  ret <vscale x 16 x float> %a +} + +declare <vscale x 16 x float> @llvm.riscv.vloxei.mask.nxv16f32.nxv16i32( +  <vscale x 16 x float>, +  ptr, +  <vscale x 16 x i32>, +  <vscale x 16 x i1>, +  iXLen, +  iXLen); + +define <vscale x 16 x float> @intrinsic_vloxei_mask_v_nxv16f32_nxv16f32_nxv16i32(<vscale x 16 x float> %0, ptr %1, <vscale x 16 x i32> %2, <vscale x 16 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv16f32_nxv16f32_nxv16i32: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e32, m8, ta, mu +; CHECK-NEXT:    vloxei32.v v8, (a0), v16, v0.t +; CHECK-NEXT:    ret +entry: +  %a = call <vscale x 16 x float> @llvm.riscv.vloxei.mask.nxv16f32.nxv16i32( +    <vscale x 16 x float> %0, +    ptr %1, +    <vscale x 16 x i32> %2, +    <vscale x 16 x i1> %3, +    iXLen %4, iXLen 1) + +  ret <vscale x 16 x float> %a +} + +declare <vscale x 1 x double> @llvm.riscv.vloxei.nxv1f64.nxv1i32( +  <vscale x 1 x double>, +  ptr, +  <vscale x 1 x i32>, +  iXLen); + +define <vscale x 1 x double> @intrinsic_vloxei_v_nxv1f64_nxv1f64_nxv1i32(ptr %0, <vscale x 1 x i32> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv1f64_nxv1f64_nxv1i32: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e64, m1, ta, ma +; CHECK-NEXT:    vloxei32.v v9, (a0), v8 +; CHECK-NEXT:    vmv.v.v v8, v9 +; CHECK-NEXT:    ret +entry: +  %a = call <vscale x 1 x double> @llvm.riscv.vloxei.nxv1f64.nxv1i32( +    <vscale x 1 x double> poison, +    ptr %0, +    <vscale x 1 x i32> %1, +    iXLen %2) + +  ret <vscale x 1 x double> %a +} + +declare <vscale x 1 x double> @llvm.riscv.vloxei.mask.nxv1f64.nxv1i32( +  <vscale x 1 x double>, +  ptr, +  <vscale x 1 x i32>, +  <vscale x 1 x i1>, +  iXLen, +  iXLen); + +define <vscale x 1 x double> @intrinsic_vloxei_mask_v_nxv1f64_nxv1f64_nxv1i32(<vscale x 1 x double> %0, ptr %1, <vscale x 1 x i32> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv1f64_nxv1f64_nxv1i32: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e64, m1, ta, mu +; CHECK-NEXT:    vloxei32.v v8, (a0), v9, v0.t +; CHECK-NEXT:    ret +entry: +  %a = call <vscale x 1 x double> @llvm.riscv.vloxei.mask.nxv1f64.nxv1i32( +    <vscale x 1 x double> %0, +    ptr %1, +    <vscale x 1 x i32> %2, +    <vscale x 1 x i1> %3, +    iXLen %4, iXLen 1) + +  ret <vscale x 1 x double> %a +} + +declare <vscale x 2 x double> @llvm.riscv.vloxei.nxv2f64.nxv2i32( +  <vscale x 2 x double>, +  ptr, +  <vscale x 2 x i32>, +  iXLen); + +define <vscale x 2 x double> @intrinsic_vloxei_v_nxv2f64_nxv2f64_nxv2i32(ptr %0, <vscale x 2 x i32> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv2f64_nxv2f64_nxv2i32: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetivli zero, 1, e8, m1, ta, ma +; CHECK-NEXT:    vmv1r.v v10, v8 +; CHECK-NEXT:    vsetvli zero, a1, e64, m2, ta, ma +; CHECK-NEXT:    vloxei32.v v8, (a0), v10 +; CHECK-NEXT:    ret +entry: +  %a = call <vscale x 2 x double> 
@llvm.riscv.vloxei.nxv2f64.nxv2i32( +    <vscale x 2 x double> poison, +    ptr %0, +    <vscale x 2 x i32> %1, +    iXLen %2) + +  ret <vscale x 2 x double> %a +} + +declare <vscale x 2 x double> @llvm.riscv.vloxei.mask.nxv2f64.nxv2i32( +  <vscale x 2 x double>, +  ptr, +  <vscale x 2 x i32>, +  <vscale x 2 x i1>, +  iXLen, +  iXLen); + +define <vscale x 2 x double> @intrinsic_vloxei_mask_v_nxv2f64_nxv2f64_nxv2i32(<vscale x 2 x double> %0, ptr %1, <vscale x 2 x i32> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv2f64_nxv2f64_nxv2i32: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e64, m2, ta, mu +; CHECK-NEXT:    vloxei32.v v8, (a0), v10, v0.t +; CHECK-NEXT:    ret +entry: +  %a = call <vscale x 2 x double> @llvm.riscv.vloxei.mask.nxv2f64.nxv2i32( +    <vscale x 2 x double> %0, +    ptr %1, +    <vscale x 2 x i32> %2, +    <vscale x 2 x i1> %3, +    iXLen %4, iXLen 1) + +  ret <vscale x 2 x double> %a +} + +declare <vscale x 4 x double> @llvm.riscv.vloxei.nxv4f64.nxv4i32( +  <vscale x 4 x double>, +  ptr, +  <vscale x 4 x i32>, +  iXLen); + +define <vscale x 4 x double> @intrinsic_vloxei_v_nxv4f64_nxv4f64_nxv4i32(ptr %0, <vscale x 4 x i32> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv4f64_nxv4f64_nxv4i32: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetivli zero, 1, e8, m1, ta, ma +; CHECK-NEXT:    vmv2r.v v12, v8 +; CHECK-NEXT:    vsetvli zero, a1, e64, m4, ta, ma +; CHECK-NEXT:    vloxei32.v v8, (a0), v12 +; CHECK-NEXT:    ret +entry: +  %a = call <vscale x 4 x double> @llvm.riscv.vloxei.nxv4f64.nxv4i32( +    <vscale x 4 x double> poison, +    ptr %0, +    <vscale x 4 x i32> %1, +    iXLen %2) + +  ret <vscale x 4 x double> %a +} + +declare <vscale x 4 x double> @llvm.riscv.vloxei.mask.nxv4f64.nxv4i32( +  <vscale x 4 x double>, +  ptr, +  <vscale x 4 x i32>, +  <vscale x 4 x i1>, +  iXLen, +  iXLen); + +define <vscale x 4 x double> @intrinsic_vloxei_mask_v_nxv4f64_nxv4f64_nxv4i32(<vscale x 4 x double> %0, ptr %1, <vscale x 4 x i32> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv4f64_nxv4f64_nxv4i32: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e64, m4, ta, mu +; CHECK-NEXT:    vloxei32.v v8, (a0), v12, v0.t +; CHECK-NEXT:    ret +entry: +  %a = call <vscale x 4 x double> @llvm.riscv.vloxei.mask.nxv4f64.nxv4i32( +    <vscale x 4 x double> %0, +    ptr %1, +    <vscale x 4 x i32> %2, +    <vscale x 4 x i1> %3, +    iXLen %4, iXLen 1) + +  ret <vscale x 4 x double> %a +} + +declare <vscale x 8 x double> @llvm.riscv.vloxei.nxv8f64.nxv8i32( +  <vscale x 8 x double>, +  ptr, +  <vscale x 8 x i32>, +  iXLen); + +define <vscale x 8 x double> @intrinsic_vloxei_v_nxv8f64_nxv8f64_nxv8i32(ptr %0, <vscale x 8 x i32> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv8f64_nxv8f64_nxv8i32: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetivli zero, 1, e8, m1, ta, ma +; CHECK-NEXT:    vmv4r.v v16, v8 +; CHECK-NEXT:    vsetvli zero, a1, e64, m8, ta, ma +; CHECK-NEXT:    vloxei32.v v8, (a0), v16 +; CHECK-NEXT:    ret +entry: +  %a = call <vscale x 8 x double> @llvm.riscv.vloxei.nxv8f64.nxv8i32( +    <vscale x 8 x double> poison, +    ptr %0, +    <vscale x 8 x i32> %1, +    iXLen %2) + +  ret <vscale x 8 x double> %a +} + +declare <vscale x 8 x double> @llvm.riscv.vloxei.mask.nxv8f64.nxv8i32( +  <vscale x 8 x double>, +  ptr, +  <vscale x 8 x i32>, +  <vscale x 8 x i1>, +  iXLen, +  iXLen); + +define <vscale x 8 x double> 
@intrinsic_vloxei_mask_v_nxv8f64_nxv8f64_nxv8i32(<vscale x 8 x double> %0, ptr %1, <vscale x 8 x i32> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv8f64_nxv8f64_nxv8i32: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e64, m8, ta, mu +; CHECK-NEXT:    vloxei32.v v8, (a0), v16, v0.t +; CHECK-NEXT:    ret +entry: +  %a = call <vscale x 8 x double> @llvm.riscv.vloxei.mask.nxv8f64.nxv8i32( +    <vscale x 8 x double> %0, +    ptr %1, +    <vscale x 8 x i32> %2, +    <vscale x 8 x i1> %3, +    iXLen %4, iXLen 1) + +  ret <vscale x 8 x double> %a +} + +declare <vscale x 1 x i8> @llvm.riscv.vloxei.nxv1i8.nxv1i16( +  <vscale x 1 x i8>, +  ptr, +  <vscale x 1 x i16>, +  iXLen); + +define <vscale x 1 x i8> @intrinsic_vloxei_v_nxv1i8_nxv1i8_nxv1i16(ptr %0, <vscale x 1 x i16> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv1i8_nxv1i8_nxv1i16: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e8, mf8, ta, ma +; CHECK-NEXT:    vloxei16.v v9, (a0), v8 +; CHECK-NEXT:    vmv1r.v v8, v9 +; CHECK-NEXT:    ret +entry: +  %a = call <vscale x 1 x i8> @llvm.riscv.vloxei.nxv1i8.nxv1i16( +    <vscale x 1 x i8> poison, +    ptr %0, +    <vscale x 1 x i16> %1, +    iXLen %2) + +  ret <vscale x 1 x i8> %a +} + +declare <vscale x 1 x i8> @llvm.riscv.vloxei.mask.nxv1i8.nxv1i16( +  <vscale x 1 x i8>, +  ptr, +  <vscale x 1 x i16>, +  <vscale x 1 x i1>, +  iXLen, +  iXLen); + +define <vscale x 1 x i8> @intrinsic_vloxei_mask_v_nxv1i8_nxv1i8_nxv1i16(<vscale x 1 x i8> %0, ptr %1, <vscale x 1 x i16> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv1i8_nxv1i8_nxv1i16: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e8, mf8, ta, mu +; CHECK-NEXT:    vloxei16.v v8, (a0), v9, v0.t +; CHECK-NEXT:    ret +entry: +  %a = call <vscale x 1 x i8> @llvm.riscv.vloxei.mask.nxv1i8.nxv1i16( +    <vscale x 1 x i8> %0, +    ptr %1, +    <vscale x 1 x i16> %2, +    <vscale x 1 x i1> %3, +    iXLen %4, iXLen 1) + +  ret <vscale x 1 x i8> %a +} + +declare <vscale x 2 x i8> @llvm.riscv.vloxei.nxv2i8.nxv2i16( +  <vscale x 2 x i8>, +  ptr, +  <vscale x 2 x i16>, +  iXLen); + +define <vscale x 2 x i8> @intrinsic_vloxei_v_nxv2i8_nxv2i8_nxv2i16(ptr %0, <vscale x 2 x i16> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv2i8_nxv2i8_nxv2i16: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e8, mf4, ta, ma +; CHECK-NEXT:    vloxei16.v v9, (a0), v8 +; CHECK-NEXT:    vmv1r.v v8, v9 +; CHECK-NEXT:    ret +entry: +  %a = call <vscale x 2 x i8> @llvm.riscv.vloxei.nxv2i8.nxv2i16( +    <vscale x 2 x i8> poison, +    ptr %0, +    <vscale x 2 x i16> %1, +    iXLen %2) + +  ret <vscale x 2 x i8> %a +} + +declare <vscale x 2 x i8> @llvm.riscv.vloxei.mask.nxv2i8.nxv2i16( +  <vscale x 2 x i8>, +  ptr, +  <vscale x 2 x i16>, +  <vscale x 2 x i1>, +  iXLen, +  iXLen); + +define <vscale x 2 x i8> @intrinsic_vloxei_mask_v_nxv2i8_nxv2i8_nxv2i16(<vscale x 2 x i8> %0, ptr %1, <vscale x 2 x i16> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv2i8_nxv2i8_nxv2i16: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e8, mf4, ta, mu +; CHECK-NEXT:    vloxei16.v v8, (a0), v9, v0.t +; CHECK-NEXT:    ret +entry: +  %a = call <vscale x 2 x i8> @llvm.riscv.vloxei.mask.nxv2i8.nxv2i16( +    <vscale x 2 x i8> %0, +    ptr %1, +    <vscale x 2 x i16> %2, +    <vscale x 2 x i1> %3, +    iXLen %4, iXLen 1) + +  ret <vscale x 2 x i8> %a 
+} + +declare <vscale x 4 x i8> @llvm.riscv.vloxei.nxv4i8.nxv4i16( +  <vscale x 4 x i8>, +  ptr, +  <vscale x 4 x i16>, +  iXLen); + +define <vscale x 4 x i8> @intrinsic_vloxei_v_nxv4i8_nxv4i8_nxv4i16(ptr %0, <vscale x 4 x i16> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv4i8_nxv4i8_nxv4i16: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e8, mf2, ta, ma +; CHECK-NEXT:    vloxei16.v v9, (a0), v8 +; CHECK-NEXT:    vmv1r.v v8, v9 +; CHECK-NEXT:    ret +entry: +  %a = call <vscale x 4 x i8> @llvm.riscv.vloxei.nxv4i8.nxv4i16( +    <vscale x 4 x i8> poison, +    ptr %0, +    <vscale x 4 x i16> %1, +    iXLen %2) + +  ret <vscale x 4 x i8> %a +} + +declare <vscale x 4 x i8> @llvm.riscv.vloxei.mask.nxv4i8.nxv4i16( +  <vscale x 4 x i8>, +  ptr, +  <vscale x 4 x i16>, +  <vscale x 4 x i1>, +  iXLen, +  iXLen); + +define <vscale x 4 x i8> @intrinsic_vloxei_mask_v_nxv4i8_nxv4i8_nxv4i16(<vscale x 4 x i8> %0, ptr %1, <vscale x 4 x i16> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv4i8_nxv4i8_nxv4i16: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e8, mf2, ta, mu +; CHECK-NEXT:    vloxei16.v v8, (a0), v9, v0.t +; CHECK-NEXT:    ret +entry: +  %a = call <vscale x 4 x i8> @llvm.riscv.vloxei.mask.nxv4i8.nxv4i16( +    <vscale x 4 x i8> %0, +    ptr %1, +    <vscale x 4 x i16> %2, +    <vscale x 4 x i1> %3, +    iXLen %4, iXLen 1) + +  ret <vscale x 4 x i8> %a +} + +declare <vscale x 8 x i8> @llvm.riscv.vloxei.nxv8i8.nxv8i16( +  <vscale x 8 x i8>, +  ptr, +  <vscale x 8 x i16>, +  iXLen); + +define <vscale x 8 x i8> @intrinsic_vloxei_v_nxv8i8_nxv8i8_nxv8i16(ptr %0, <vscale x 8 x i16> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv8i8_nxv8i8_nxv8i16: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e8, m1, ta, ma +; CHECK-NEXT:    vloxei16.v v10, (a0), v8 +; CHECK-NEXT:    vmv.v.v v8, v10 +; CHECK-NEXT:    ret +entry: +  %a = call <vscale x 8 x i8> @llvm.riscv.vloxei.nxv8i8.nxv8i16( +    <vscale x 8 x i8> poison, +    ptr %0, +    <vscale x 8 x i16> %1, +    iXLen %2) + +  ret <vscale x 8 x i8> %a +} + +declare <vscale x 8 x i8> @llvm.riscv.vloxei.mask.nxv8i8.nxv8i16( +  <vscale x 8 x i8>, +  ptr, +  <vscale x 8 x i16>, +  <vscale x 8 x i1>, +  iXLen, +  iXLen); + +define <vscale x 8 x i8> @intrinsic_vloxei_mask_v_nxv8i8_nxv8i8_nxv8i16(<vscale x 8 x i8> %0, ptr %1, <vscale x 8 x i16> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv8i8_nxv8i8_nxv8i16: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e8, m1, ta, mu +; CHECK-NEXT:    vloxei16.v v8, (a0), v10, v0.t +; CHECK-NEXT:    ret +entry: +  %a = call <vscale x 8 x i8> @llvm.riscv.vloxei.mask.nxv8i8.nxv8i16( +    <vscale x 8 x i8> %0, +    ptr %1, +    <vscale x 8 x i16> %2, +    <vscale x 8 x i1> %3, +    iXLen %4, iXLen 1) + +  ret <vscale x 8 x i8> %a +} + +declare <vscale x 16 x i8> @llvm.riscv.vloxei.nxv16i8.nxv16i16( +  <vscale x 16 x i8>, +  ptr, +  <vscale x 16 x i16>, +  iXLen); + +define <vscale x 16 x i8> @intrinsic_vloxei_v_nxv16i8_nxv16i8_nxv16i16(ptr %0, <vscale x 16 x i16> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv16i8_nxv16i8_nxv16i16: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e8, m2, ta, ma +; CHECK-NEXT:    vloxei16.v v12, (a0), v8 +; CHECK-NEXT:    vmv.v.v v8, v12 +; CHECK-NEXT:    ret +entry: +  %a = call <vscale x 16 x i8> @llvm.riscv.vloxei.nxv16i8.nxv16i16( +    <vscale x 16 x i8> 
poison, +    ptr %0, +    <vscale x 16 x i16> %1, +    iXLen %2) + +  ret <vscale x 16 x i8> %a +} + +declare <vscale x 16 x i8> @llvm.riscv.vloxei.mask.nxv16i8.nxv16i16( +  <vscale x 16 x i8>, +  ptr, +  <vscale x 16 x i16>, +  <vscale x 16 x i1>, +  iXLen, +  iXLen); + +define <vscale x 16 x i8> @intrinsic_vloxei_mask_v_nxv16i8_nxv16i8_nxv16i16(<vscale x 16 x i8> %0, ptr %1, <vscale x 16 x i16> %2, <vscale x 16 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv16i8_nxv16i8_nxv16i16: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e8, m2, ta, mu +; CHECK-NEXT:    vloxei16.v v8, (a0), v12, v0.t +; CHECK-NEXT:    ret +entry: +  %a = call <vscale x 16 x i8> @llvm.riscv.vloxei.mask.nxv16i8.nxv16i16( +    <vscale x 16 x i8> %0, +    ptr %1, +    <vscale x 16 x i16> %2, +    <vscale x 16 x i1> %3, +    iXLen %4, iXLen 1) + +  ret <vscale x 16 x i8> %a +} + +declare <vscale x 32 x i8> @llvm.riscv.vloxei.nxv32i8.nxv32i16( +  <vscale x 32 x i8>, +  ptr, +  <vscale x 32 x i16>, +  iXLen); + +define <vscale x 32 x i8> @intrinsic_vloxei_v_nxv32i8_nxv32i8_nxv32i16(ptr %0, <vscale x 32 x i16> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv32i8_nxv32i8_nxv32i16: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e8, m4, ta, ma +; CHECK-NEXT:    vloxei16.v v16, (a0), v8 +; CHECK-NEXT:    vmv.v.v v8, v16 +; CHECK-NEXT:    ret +entry: +  %a = call <vscale x 32 x i8> @llvm.riscv.vloxei.nxv32i8.nxv32i16( +    <vscale x 32 x i8> poison, +    ptr %0, +    <vscale x 32 x i16> %1, +    iXLen %2) + +  ret <vscale x 32 x i8> %a +} + +declare <vscale x 32 x i8> @llvm.riscv.vloxei.mask.nxv32i8.nxv32i16( +  <vscale x 32 x i8>, +  ptr, +  <vscale x 32 x i16>, +  <vscale x 32 x i1>, +  iXLen, +  iXLen); + +define <vscale x 32 x i8> @intrinsic_vloxei_mask_v_nxv32i8_nxv32i8_nxv32i16(<vscale x 32 x i8> %0, ptr %1, <vscale x 32 x i16> %2, <vscale x 32 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv32i8_nxv32i8_nxv32i16: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e8, m4, ta, mu +; CHECK-NEXT:    vloxei16.v v8, (a0), v16, v0.t +; CHECK-NEXT:    ret +entry: +  %a = call <vscale x 32 x i8> @llvm.riscv.vloxei.mask.nxv32i8.nxv32i16( +    <vscale x 32 x i8> %0, +    ptr %1, +    <vscale x 32 x i16> %2, +    <vscale x 32 x i1> %3, +    iXLen %4, iXLen 1) + +  ret <vscale x 32 x i8> %a +} + +declare <vscale x 1 x i16> @llvm.riscv.vloxei.nxv1i16.nxv1i16( +  <vscale x 1 x i16>, +  ptr, +  <vscale x 1 x i16>, +  iXLen); + +define <vscale x 1 x i16> @intrinsic_vloxei_v_nxv1i16_nxv1i16_nxv1i16(ptr %0, <vscale x 1 x i16> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv1i16_nxv1i16_nxv1i16: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e16, mf4, ta, ma +; CHECK-NEXT:    vloxei16.v v8, (a0), v8 +; CHECK-NEXT:    ret +entry: +  %a = call <vscale x 1 x i16> @llvm.riscv.vloxei.nxv1i16.nxv1i16( +    <vscale x 1 x i16> poison, +    ptr %0, +    <vscale x 1 x i16> %1, +    iXLen %2) + +  ret <vscale x 1 x i16> %a +} + +declare <vscale x 1 x i16> @llvm.riscv.vloxei.mask.nxv1i16.nxv1i16( +  <vscale x 1 x i16>, +  ptr, +  <vscale x 1 x i16>, +  <vscale x 1 x i1>, +  iXLen, +  iXLen); + +define <vscale x 1 x i16> @intrinsic_vloxei_mask_v_nxv1i16_nxv1i16_nxv1i16(<vscale x 1 x i16> %0, ptr %1, <vscale x 1 x i16> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv1i16_nxv1i16_nxv1i16: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli 
zero, a1, e16, mf4, ta, mu +; CHECK-NEXT:    vloxei16.v v8, (a0), v9, v0.t +; CHECK-NEXT:    ret +entry: +  %a = call <vscale x 1 x i16> @llvm.riscv.vloxei.mask.nxv1i16.nxv1i16( +    <vscale x 1 x i16> %0, +    ptr %1, +    <vscale x 1 x i16> %2, +    <vscale x 1 x i1> %3, +    iXLen %4, iXLen 1) + +  ret <vscale x 1 x i16> %a +} + +declare <vscale x 2 x i16> @llvm.riscv.vloxei.nxv2i16.nxv2i16( +  <vscale x 2 x i16>, +  ptr, +  <vscale x 2 x i16>, +  iXLen); + +define <vscale x 2 x i16> @intrinsic_vloxei_v_nxv2i16_nxv2i16_nxv2i16(ptr %0, <vscale x 2 x i16> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv2i16_nxv2i16_nxv2i16: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e16, mf2, ta, ma +; CHECK-NEXT:    vloxei16.v v8, (a0), v8 +; CHECK-NEXT:    ret +entry: +  %a = call <vscale x 2 x i16> @llvm.riscv.vloxei.nxv2i16.nxv2i16( +    <vscale x 2 x i16> poison, +    ptr %0, +    <vscale x 2 x i16> %1, +    iXLen %2) + +  ret <vscale x 2 x i16> %a +} + +declare <vscale x 2 x i16> @llvm.riscv.vloxei.mask.nxv2i16.nxv2i16( +  <vscale x 2 x i16>, +  ptr, +  <vscale x 2 x i16>, +  <vscale x 2 x i1>, +  iXLen, +  iXLen); + +define <vscale x 2 x i16> @intrinsic_vloxei_mask_v_nxv2i16_nxv2i16_nxv2i16(<vscale x 2 x i16> %0, ptr %1, <vscale x 2 x i16> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv2i16_nxv2i16_nxv2i16: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e16, mf2, ta, mu +; CHECK-NEXT:    vloxei16.v v8, (a0), v9, v0.t +; CHECK-NEXT:    ret +entry: +  %a = call <vscale x 2 x i16> @llvm.riscv.vloxei.mask.nxv2i16.nxv2i16( +    <vscale x 2 x i16> %0, +    ptr %1, +    <vscale x 2 x i16> %2, +    <vscale x 2 x i1> %3, +    iXLen %4, iXLen 1) + +  ret <vscale x 2 x i16> %a +} + +declare <vscale x 4 x i16> @llvm.riscv.vloxei.nxv4i16.nxv4i16( +  <vscale x 4 x i16>, +  ptr, +  <vscale x 4 x i16>, +  iXLen); + +define <vscale x 4 x i16> @intrinsic_vloxei_v_nxv4i16_nxv4i16_nxv4i16(ptr %0, <vscale x 4 x i16> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv4i16_nxv4i16_nxv4i16: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e16, m1, ta, ma +; CHECK-NEXT:    vloxei16.v v8, (a0), v8 +; CHECK-NEXT:    ret +entry: +  %a = call <vscale x 4 x i16> @llvm.riscv.vloxei.nxv4i16.nxv4i16( +    <vscale x 4 x i16> poison, +    ptr %0, +    <vscale x 4 x i16> %1, +    iXLen %2) + +  ret <vscale x 4 x i16> %a +} + +declare <vscale x 4 x i16> @llvm.riscv.vloxei.mask.nxv4i16.nxv4i16( +  <vscale x 4 x i16>, +  ptr, +  <vscale x 4 x i16>, +  <vscale x 4 x i1>, +  iXLen, +  iXLen); + +define <vscale x 4 x i16> @intrinsic_vloxei_mask_v_nxv4i16_nxv4i16_nxv4i16(<vscale x 4 x i16> %0, ptr %1, <vscale x 4 x i16> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv4i16_nxv4i16_nxv4i16: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e16, m1, ta, mu +; CHECK-NEXT:    vloxei16.v v8, (a0), v9, v0.t +; CHECK-NEXT:    ret +entry: +  %a = call <vscale x 4 x i16> @llvm.riscv.vloxei.mask.nxv4i16.nxv4i16( +    <vscale x 4 x i16> %0, +    ptr %1, +    <vscale x 4 x i16> %2, +    <vscale x 4 x i1> %3, +    iXLen %4, iXLen 1) + +  ret <vscale x 4 x i16> %a +} + +declare <vscale x 8 x i16> @llvm.riscv.vloxei.nxv8i16.nxv8i16( +  <vscale x 8 x i16>, +  ptr, +  <vscale x 8 x i16>, +  iXLen); + +define <vscale x 8 x i16> @intrinsic_vloxei_v_nxv8i16_nxv8i16_nxv8i16(ptr %0, <vscale x 8 x i16> %1, iXLen %2) nounwind { +; CHECK-LABEL: 
intrinsic_vloxei_v_nxv8i16_nxv8i16_nxv8i16: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e16, m2, ta, ma +; CHECK-NEXT:    vloxei16.v v8, (a0), v8 +; CHECK-NEXT:    ret +entry: +  %a = call <vscale x 8 x i16> @llvm.riscv.vloxei.nxv8i16.nxv8i16( +    <vscale x 8 x i16> poison, +    ptr %0, +    <vscale x 8 x i16> %1, +    iXLen %2) + +  ret <vscale x 8 x i16> %a +} + +declare <vscale x 8 x i16> @llvm.riscv.vloxei.mask.nxv8i16.nxv8i16( +  <vscale x 8 x i16>, +  ptr, +  <vscale x 8 x i16>, +  <vscale x 8 x i1>, +  iXLen, +  iXLen); + +define <vscale x 8 x i16> @intrinsic_vloxei_mask_v_nxv8i16_nxv8i16_nxv8i16(<vscale x 8 x i16> %0, ptr %1, <vscale x 8 x i16> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv8i16_nxv8i16_nxv8i16: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e16, m2, ta, mu +; CHECK-NEXT:    vloxei16.v v8, (a0), v10, v0.t +; CHECK-NEXT:    ret +entry: +  %a = call <vscale x 8 x i16> @llvm.riscv.vloxei.mask.nxv8i16.nxv8i16( +    <vscale x 8 x i16> %0, +    ptr %1, +    <vscale x 8 x i16> %2, +    <vscale x 8 x i1> %3, +    iXLen %4, iXLen 1) + +  ret <vscale x 8 x i16> %a +} + +declare <vscale x 16 x i16> @llvm.riscv.vloxei.nxv16i16.nxv16i16( +  <vscale x 16 x i16>, +  ptr, +  <vscale x 16 x i16>, +  iXLen); + +define <vscale x 16 x i16> @intrinsic_vloxei_v_nxv16i16_nxv16i16_nxv16i16(ptr %0, <vscale x 16 x i16> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv16i16_nxv16i16_nxv16i16: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e16, m4, ta, ma +; CHECK-NEXT:    vloxei16.v v8, (a0), v8 +; CHECK-NEXT:    ret +entry: +  %a = call <vscale x 16 x i16> @llvm.riscv.vloxei.nxv16i16.nxv16i16( +    <vscale x 16 x i16> poison, +    ptr %0, +    <vscale x 16 x i16> %1, +    iXLen %2) + +  ret <vscale x 16 x i16> %a +} + +declare <vscale x 16 x i16> @llvm.riscv.vloxei.mask.nxv16i16.nxv16i16( +  <vscale x 16 x i16>, +  ptr, +  <vscale x 16 x i16>, +  <vscale x 16 x i1>, +  iXLen, +  iXLen); + +define <vscale x 16 x i16> @intrinsic_vloxei_mask_v_nxv16i16_nxv16i16_nxv16i16(<vscale x 16 x i16> %0, ptr %1, <vscale x 16 x i16> %2, <vscale x 16 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv16i16_nxv16i16_nxv16i16: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e16, m4, ta, mu +; CHECK-NEXT:    vloxei16.v v8, (a0), v12, v0.t +; CHECK-NEXT:    ret +entry: +  %a = call <vscale x 16 x i16> @llvm.riscv.vloxei.mask.nxv16i16.nxv16i16( +    <vscale x 16 x i16> %0, +    ptr %1, +    <vscale x 16 x i16> %2, +    <vscale x 16 x i1> %3, +    iXLen %4, iXLen 1) + +  ret <vscale x 16 x i16> %a +} + +declare <vscale x 32 x i16> @llvm.riscv.vloxei.nxv32i16.nxv32i16( +  <vscale x 32 x i16>, +  ptr, +  <vscale x 32 x i16>, +  iXLen); + +define <vscale x 32 x i16> @intrinsic_vloxei_v_nxv32i16_nxv32i16_nxv32i16(ptr %0, <vscale x 32 x i16> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv32i16_nxv32i16_nxv32i16: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e16, m8, ta, ma +; CHECK-NEXT:    vloxei16.v v8, (a0), v8 +; CHECK-NEXT:    ret +entry: +  %a = call <vscale x 32 x i16> @llvm.riscv.vloxei.nxv32i16.nxv32i16( +    <vscale x 32 x i16> poison, +    ptr %0, +    <vscale x 32 x i16> %1, +    iXLen %2) + +  ret <vscale x 32 x i16> %a +} + +declare <vscale x 32 x i16> @llvm.riscv.vloxei.mask.nxv32i16.nxv32i16( +  <vscale x 32 x i16>, +  ptr, +  <vscale x 32 x i16>, +  <vscale x 32 x i1>, +  iXLen, +  iXLen); 
+ +define <vscale x 32 x i16> @intrinsic_vloxei_mask_v_nxv32i16_nxv32i16_nxv32i16(<vscale x 32 x i16> %0, ptr %1, <vscale x 32 x i16> %2, <vscale x 32 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv32i16_nxv32i16_nxv32i16: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e16, m8, ta, mu +; CHECK-NEXT:    vloxei16.v v8, (a0), v16, v0.t +; CHECK-NEXT:    ret +entry: +  %a = call <vscale x 32 x i16> @llvm.riscv.vloxei.mask.nxv32i16.nxv32i16( +    <vscale x 32 x i16> %0, +    ptr %1, +    <vscale x 32 x i16> %2, +    <vscale x 32 x i1> %3, +    iXLen %4, iXLen 1) + +  ret <vscale x 32 x i16> %a +} + +declare <vscale x 1 x i32> @llvm.riscv.vloxei.nxv1i32.nxv1i16( +  <vscale x 1 x i32>, +  ptr, +  <vscale x 1 x i16>, +  iXLen); + +define <vscale x 1 x i32> @intrinsic_vloxei_v_nxv1i32_nxv1i32_nxv1i16(ptr %0, <vscale x 1 x i16> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv1i32_nxv1i32_nxv1i16: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e32, mf2, ta, ma +; CHECK-NEXT:    vloxei16.v v9, (a0), v8 +; CHECK-NEXT:    vmv1r.v v8, v9 +; CHECK-NEXT:    ret +entry: +  %a = call <vscale x 1 x i32> @llvm.riscv.vloxei.nxv1i32.nxv1i16( +    <vscale x 1 x i32> poison, +    ptr %0, +    <vscale x 1 x i16> %1, +    iXLen %2) + +  ret <vscale x 1 x i32> %a +} + +declare <vscale x 1 x i32> @llvm.riscv.vloxei.mask.nxv1i32.nxv1i16( +  <vscale x 1 x i32>, +  ptr, +  <vscale x 1 x i16>, +  <vscale x 1 x i1>, +  iXLen, +  iXLen); + +define <vscale x 1 x i32> @intrinsic_vloxei_mask_v_nxv1i32_nxv1i32_nxv1i16(<vscale x 1 x i32> %0, ptr %1, <vscale x 1 x i16> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv1i32_nxv1i32_nxv1i16: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e32, mf2, ta, mu +; CHECK-NEXT:    vloxei16.v v8, (a0), v9, v0.t +; CHECK-NEXT:    ret +entry: +  %a = call <vscale x 1 x i32> @llvm.riscv.vloxei.mask.nxv1i32.nxv1i16( +    <vscale x 1 x i32> %0, +    ptr %1, +    <vscale x 1 x i16> %2, +    <vscale x 1 x i1> %3, +    iXLen %4, iXLen 1) + +  ret <vscale x 1 x i32> %a +} + +declare <vscale x 2 x i32> @llvm.riscv.vloxei.nxv2i32.nxv2i16( +  <vscale x 2 x i32>, +  ptr, +  <vscale x 2 x i16>, +  iXLen); + +define <vscale x 2 x i32> @intrinsic_vloxei_v_nxv2i32_nxv2i32_nxv2i16(ptr %0, <vscale x 2 x i16> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv2i32_nxv2i32_nxv2i16: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e32, m1, ta, ma +; CHECK-NEXT:    vloxei16.v v9, (a0), v8 +; CHECK-NEXT:    vmv.v.v v8, v9 +; CHECK-NEXT:    ret +entry: +  %a = call <vscale x 2 x i32> @llvm.riscv.vloxei.nxv2i32.nxv2i16( +    <vscale x 2 x i32> poison, +    ptr %0, +    <vscale x 2 x i16> %1, +    iXLen %2) + +  ret <vscale x 2 x i32> %a +} + +declare <vscale x 2 x i32> @llvm.riscv.vloxei.mask.nxv2i32.nxv2i16( +  <vscale x 2 x i32>, +  ptr, +  <vscale x 2 x i16>, +  <vscale x 2 x i1>, +  iXLen, +  iXLen); + +define <vscale x 2 x i32> @intrinsic_vloxei_mask_v_nxv2i32_nxv2i32_nxv2i16(<vscale x 2 x i32> %0, ptr %1, <vscale x 2 x i16> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv2i32_nxv2i32_nxv2i16: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e32, m1, ta, mu +; CHECK-NEXT:    vloxei16.v v8, (a0), v9, v0.t +; CHECK-NEXT:    ret +entry: +  %a = call <vscale x 2 x i32> @llvm.riscv.vloxei.mask.nxv2i32.nxv2i16( +    <vscale x 2 x i32> %0, +    ptr %1, +    <vscale x 2 x i16> 
%2, +    <vscale x 2 x i1> %3, +    iXLen %4, iXLen 1) + +  ret <vscale x 2 x i32> %a +} + +declare <vscale x 4 x i32> @llvm.riscv.vloxei.nxv4i32.nxv4i16( +  <vscale x 4 x i32>, +  ptr, +  <vscale x 4 x i16>, +  iXLen); + +define <vscale x 4 x i32> @intrinsic_vloxei_v_nxv4i32_nxv4i32_nxv4i16(ptr %0, <vscale x 4 x i16> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv4i32_nxv4i32_nxv4i16: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetivli zero, 1, e8, m1, ta, ma +; CHECK-NEXT:    vmv1r.v v10, v8 +; CHECK-NEXT:    vsetvli zero, a1, e32, m2, ta, ma +; CHECK-NEXT:    vloxei16.v v8, (a0), v10 +; CHECK-NEXT:    ret +entry: +  %a = call <vscale x 4 x i32> @llvm.riscv.vloxei.nxv4i32.nxv4i16( +    <vscale x 4 x i32> poison, +    ptr %0, +    <vscale x 4 x i16> %1, +    iXLen %2) + +  ret <vscale x 4 x i32> %a +} + +declare <vscale x 4 x i32> @llvm.riscv.vloxei.mask.nxv4i32.nxv4i16( +  <vscale x 4 x i32>, +  ptr, +  <vscale x 4 x i16>, +  <vscale x 4 x i1>, +  iXLen, +  iXLen); + +define <vscale x 4 x i32> @intrinsic_vloxei_mask_v_nxv4i32_nxv4i32_nxv4i16(<vscale x 4 x i32> %0, ptr %1, <vscale x 4 x i16> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv4i32_nxv4i32_nxv4i16: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e32, m2, ta, mu +; CHECK-NEXT:    vloxei16.v v8, (a0), v10, v0.t +; CHECK-NEXT:    ret +entry: +  %a = call <vscale x 4 x i32> @llvm.riscv.vloxei.mask.nxv4i32.nxv4i16( +    <vscale x 4 x i32> %0, +    ptr %1, +    <vscale x 4 x i16> %2, +    <vscale x 4 x i1> %3, +    iXLen %4, iXLen 1) + +  ret <vscale x 4 x i32> %a +} + +declare <vscale x 8 x i32> @llvm.riscv.vloxei.nxv8i32.nxv8i16( +  <vscale x 8 x i32>, +  ptr, +  <vscale x 8 x i16>, +  iXLen); + +define <vscale x 8 x i32> @intrinsic_vloxei_v_nxv8i32_nxv8i32_nxv8i16(ptr %0, <vscale x 8 x i16> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv8i32_nxv8i32_nxv8i16: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetivli zero, 1, e8, m1, ta, ma +; CHECK-NEXT:    vmv2r.v v12, v8 +; CHECK-NEXT:    vsetvli zero, a1, e32, m4, ta, ma +; CHECK-NEXT:    vloxei16.v v8, (a0), v12 +; CHECK-NEXT:    ret +entry: +  %a = call <vscale x 8 x i32> @llvm.riscv.vloxei.nxv8i32.nxv8i16( +    <vscale x 8 x i32> poison, +    ptr %0, +    <vscale x 8 x i16> %1, +    iXLen %2) + +  ret <vscale x 8 x i32> %a +} + +declare <vscale x 8 x i32> @llvm.riscv.vloxei.mask.nxv8i32.nxv8i16( +  <vscale x 8 x i32>, +  ptr, +  <vscale x 8 x i16>, +  <vscale x 8 x i1>, +  iXLen, +  iXLen); + +define <vscale x 8 x i32> @intrinsic_vloxei_mask_v_nxv8i32_nxv8i32_nxv8i16(<vscale x 8 x i32> %0, ptr %1, <vscale x 8 x i16> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv8i32_nxv8i32_nxv8i16: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e32, m4, ta, mu +; CHECK-NEXT:    vloxei16.v v8, (a0), v12, v0.t +; CHECK-NEXT:    ret +entry: +  %a = call <vscale x 8 x i32> @llvm.riscv.vloxei.mask.nxv8i32.nxv8i16( +    <vscale x 8 x i32> %0, +    ptr %1, +    <vscale x 8 x i16> %2, +    <vscale x 8 x i1> %3, +    iXLen %4, iXLen 1) + +  ret <vscale x 8 x i32> %a +} + +declare <vscale x 16 x i32> @llvm.riscv.vloxei.nxv16i32.nxv16i16( +  <vscale x 16 x i32>, +  ptr, +  <vscale x 16 x i16>, +  iXLen); + +define <vscale x 16 x i32> @intrinsic_vloxei_v_nxv16i32_nxv16i32_nxv16i16(ptr %0, <vscale x 16 x i16> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv16i32_nxv16i32_nxv16i16: +; CHECK:       # %bb.0: # %entry +; 
CHECK-NEXT:    vsetivli zero, 1, e8, m1, ta, ma +; CHECK-NEXT:    vmv4r.v v16, v8 +; CHECK-NEXT:    vsetvli zero, a1, e32, m8, ta, ma +; CHECK-NEXT:    vloxei16.v v8, (a0), v16 +; CHECK-NEXT:    ret +entry: +  %a = call <vscale x 16 x i32> @llvm.riscv.vloxei.nxv16i32.nxv16i16( +    <vscale x 16 x i32> poison, +    ptr %0, +    <vscale x 16 x i16> %1, +    iXLen %2) + +  ret <vscale x 16 x i32> %a +} + +declare <vscale x 16 x i32> @llvm.riscv.vloxei.mask.nxv16i32.nxv16i16( +  <vscale x 16 x i32>, +  ptr, +  <vscale x 16 x i16>, +  <vscale x 16 x i1>, +  iXLen, +  iXLen); + +define <vscale x 16 x i32> @intrinsic_vloxei_mask_v_nxv16i32_nxv16i32_nxv16i16(<vscale x 16 x i32> %0, ptr %1, <vscale x 16 x i16> %2, <vscale x 16 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv16i32_nxv16i32_nxv16i16: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e32, m8, ta, mu +; CHECK-NEXT:    vloxei16.v v8, (a0), v16, v0.t +; CHECK-NEXT:    ret +entry: +  %a = call <vscale x 16 x i32> @llvm.riscv.vloxei.mask.nxv16i32.nxv16i16( +    <vscale x 16 x i32> %0, +    ptr %1, +    <vscale x 16 x i16> %2, +    <vscale x 16 x i1> %3, +    iXLen %4, iXLen 1) + +  ret <vscale x 16 x i32> %a +} + +declare <vscale x 1 x i64> @llvm.riscv.vloxei.nxv1i64.nxv1i16( +  <vscale x 1 x i64>, +  ptr, +  <vscale x 1 x i16>, +  iXLen); + +define <vscale x 1 x i64> @intrinsic_vloxei_v_nxv1i64_nxv1i64_nxv1i16(ptr %0, <vscale x 1 x i16> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv1i64_nxv1i64_nxv1i16: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e64, m1, ta, ma +; CHECK-NEXT:    vloxei16.v v9, (a0), v8 +; CHECK-NEXT:    vmv.v.v v8, v9 +; CHECK-NEXT:    ret +entry: +  %a = call <vscale x 1 x i64> @llvm.riscv.vloxei.nxv1i64.nxv1i16( +    <vscale x 1 x i64> poison, +    ptr %0, +    <vscale x 1 x i16> %1, +    iXLen %2) + +  ret <vscale x 1 x i64> %a +} + +declare <vscale x 1 x i64> @llvm.riscv.vloxei.mask.nxv1i64.nxv1i16( +  <vscale x 1 x i64>, +  ptr, +  <vscale x 1 x i16>, +  <vscale x 1 x i1>, +  iXLen, +  iXLen); + +define <vscale x 1 x i64> @intrinsic_vloxei_mask_v_nxv1i64_nxv1i64_nxv1i16(<vscale x 1 x i64> %0, ptr %1, <vscale x 1 x i16> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv1i64_nxv1i64_nxv1i16: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e64, m1, ta, mu +; CHECK-NEXT:    vloxei16.v v8, (a0), v9, v0.t +; CHECK-NEXT:    ret +entry: +  %a = call <vscale x 1 x i64> @llvm.riscv.vloxei.mask.nxv1i64.nxv1i16( +    <vscale x 1 x i64> %0, +    ptr %1, +    <vscale x 1 x i16> %2, +    <vscale x 1 x i1> %3, +    iXLen %4, iXLen 1) + +  ret <vscale x 1 x i64> %a +} + +declare <vscale x 2 x i64> @llvm.riscv.vloxei.nxv2i64.nxv2i16( +  <vscale x 2 x i64>, +  ptr, +  <vscale x 2 x i16>, +  iXLen); + +define <vscale x 2 x i64> @intrinsic_vloxei_v_nxv2i64_nxv2i64_nxv2i16(ptr %0, <vscale x 2 x i16> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv2i64_nxv2i64_nxv2i16: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetivli zero, 1, e8, m1, ta, ma +; CHECK-NEXT:    vmv1r.v v10, v8 +; CHECK-NEXT:    vsetvli zero, a1, e64, m2, ta, ma +; CHECK-NEXT:    vloxei16.v v8, (a0), v10 +; CHECK-NEXT:    ret +entry: +  %a = call <vscale x 2 x i64> @llvm.riscv.vloxei.nxv2i64.nxv2i16( +    <vscale x 2 x i64> poison, +    ptr %0, +    <vscale x 2 x i16> %1, +    iXLen %2) + +  ret <vscale x 2 x i64> %a +} + +declare <vscale x 2 x i64> @llvm.riscv.vloxei.mask.nxv2i64.nxv2i16( +  <vscale 
x 2 x i64>, +  ptr, +  <vscale x 2 x i16>, +  <vscale x 2 x i1>, +  iXLen, +  iXLen); + +define <vscale x 2 x i64> @intrinsic_vloxei_mask_v_nxv2i64_nxv2i64_nxv2i16(<vscale x 2 x i64> %0, ptr %1, <vscale x 2 x i16> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv2i64_nxv2i64_nxv2i16: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e64, m2, ta, mu +; CHECK-NEXT:    vloxei16.v v8, (a0), v10, v0.t +; CHECK-NEXT:    ret +entry: +  %a = call <vscale x 2 x i64> @llvm.riscv.vloxei.mask.nxv2i64.nxv2i16( +    <vscale x 2 x i64> %0, +    ptr %1, +    <vscale x 2 x i16> %2, +    <vscale x 2 x i1> %3, +    iXLen %4, iXLen 1) + +  ret <vscale x 2 x i64> %a +} + +declare <vscale x 4 x i64> @llvm.riscv.vloxei.nxv4i64.nxv4i16( +  <vscale x 4 x i64>, +  ptr, +  <vscale x 4 x i16>, +  iXLen); + +define <vscale x 4 x i64> @intrinsic_vloxei_v_nxv4i64_nxv4i64_nxv4i16(ptr %0, <vscale x 4 x i16> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv4i64_nxv4i64_nxv4i16: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetivli zero, 1, e8, m1, ta, ma +; CHECK-NEXT:    vmv1r.v v12, v8 +; CHECK-NEXT:    vsetvli zero, a1, e64, m4, ta, ma +; CHECK-NEXT:    vloxei16.v v8, (a0), v12 +; CHECK-NEXT:    ret +entry: +  %a = call <vscale x 4 x i64> @llvm.riscv.vloxei.nxv4i64.nxv4i16( +    <vscale x 4 x i64> poison, +    ptr %0, +    <vscale x 4 x i16> %1, +    iXLen %2) + +  ret <vscale x 4 x i64> %a +} + +declare <vscale x 4 x i64> @llvm.riscv.vloxei.mask.nxv4i64.nxv4i16( +  <vscale x 4 x i64>, +  ptr, +  <vscale x 4 x i16>, +  <vscale x 4 x i1>, +  iXLen, +  iXLen); + +define <vscale x 4 x i64> @intrinsic_vloxei_mask_v_nxv4i64_nxv4i64_nxv4i16(<vscale x 4 x i64> %0, ptr %1, <vscale x 4 x i16> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv4i64_nxv4i64_nxv4i16: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e64, m4, ta, mu +; CHECK-NEXT:    vloxei16.v v8, (a0), v12, v0.t +; CHECK-NEXT:    ret +entry: +  %a = call <vscale x 4 x i64> @llvm.riscv.vloxei.mask.nxv4i64.nxv4i16( +    <vscale x 4 x i64> %0, +    ptr %1, +    <vscale x 4 x i16> %2, +    <vscale x 4 x i1> %3, +    iXLen %4, iXLen 1) + +  ret <vscale x 4 x i64> %a +} + +declare <vscale x 8 x i64> @llvm.riscv.vloxei.nxv8i64.nxv8i16( +  <vscale x 8 x i64>, +  ptr, +  <vscale x 8 x i16>, +  iXLen); + +define <vscale x 8 x i64> @intrinsic_vloxei_v_nxv8i64_nxv8i64_nxv8i16(ptr %0, <vscale x 8 x i16> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv8i64_nxv8i64_nxv8i16: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetivli zero, 1, e8, m1, ta, ma +; CHECK-NEXT:    vmv2r.v v16, v8 +; CHECK-NEXT:    vsetvli zero, a1, e64, m8, ta, ma +; CHECK-NEXT:    vloxei16.v v8, (a0), v16 +; CHECK-NEXT:    ret +entry: +  %a = call <vscale x 8 x i64> @llvm.riscv.vloxei.nxv8i64.nxv8i16( +    <vscale x 8 x i64> poison, +    ptr %0, +    <vscale x 8 x i16> %1, +    iXLen %2) + +  ret <vscale x 8 x i64> %a +} + +declare <vscale x 8 x i64> @llvm.riscv.vloxei.mask.nxv8i64.nxv8i16( +  <vscale x 8 x i64>, +  ptr, +  <vscale x 8 x i16>, +  <vscale x 8 x i1>, +  iXLen, +  iXLen); + +define <vscale x 8 x i64> @intrinsic_vloxei_mask_v_nxv8i64_nxv8i64_nxv8i16(<vscale x 8 x i64> %0, ptr %1, <vscale x 8 x i16> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv8i64_nxv8i64_nxv8i16: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e64, m8, ta, mu +; CHECK-NEXT:    vloxei16.v v8, (a0), 
v16, v0.t +; CHECK-NEXT:    ret +entry: +  %a = call <vscale x 8 x i64> @llvm.riscv.vloxei.mask.nxv8i64.nxv8i16( +    <vscale x 8 x i64> %0, +    ptr %1, +    <vscale x 8 x i16> %2, +    <vscale x 8 x i1> %3, +    iXLen %4, iXLen 1) + +  ret <vscale x 8 x i64> %a +} + +declare <vscale x 1 x half> @llvm.riscv.vloxei.nxv1f16.nxv1i16( +  <vscale x 1 x half>, +  ptr, +  <vscale x 1 x i16>, +  iXLen); + +define <vscale x 1 x half> @intrinsic_vloxei_v_nxv1f16_nxv1f16_nxv1i16(ptr %0, <vscale x 1 x i16> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv1f16_nxv1f16_nxv1i16: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e16, mf4, ta, ma +; CHECK-NEXT:    vloxei16.v v8, (a0), v8 +; CHECK-NEXT:    ret +entry: +  %a = call <vscale x 1 x half> @llvm.riscv.vloxei.nxv1f16.nxv1i16( +    <vscale x 1 x half> poison, +    ptr %0, +    <vscale x 1 x i16> %1, +    iXLen %2) + +  ret <vscale x 1 x half> %a +} + +declare <vscale x 1 x half> @llvm.riscv.vloxei.mask.nxv1f16.nxv1i16( +  <vscale x 1 x half>, +  ptr, +  <vscale x 1 x i16>, +  <vscale x 1 x i1>, +  iXLen, +  iXLen); + +define <vscale x 1 x half> @intrinsic_vloxei_mask_v_nxv1f16_nxv1f16_nxv1i16(<vscale x 1 x half> %0, ptr %1, <vscale x 1 x i16> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv1f16_nxv1f16_nxv1i16: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e16, mf4, ta, mu +; CHECK-NEXT:    vloxei16.v v8, (a0), v9, v0.t +; CHECK-NEXT:    ret +entry: +  %a = call <vscale x 1 x half> @llvm.riscv.vloxei.mask.nxv1f16.nxv1i16( +    <vscale x 1 x half> %0, +    ptr %1, +    <vscale x 1 x i16> %2, +    <vscale x 1 x i1> %3, +    iXLen %4, iXLen 1) + +  ret <vscale x 1 x half> %a +} + +declare <vscale x 2 x half> @llvm.riscv.vloxei.nxv2f16.nxv2i16( +  <vscale x 2 x half>, +  ptr, +  <vscale x 2 x i16>, +  iXLen); + +define <vscale x 2 x half> @intrinsic_vloxei_v_nxv2f16_nxv2f16_nxv2i16(ptr %0, <vscale x 2 x i16> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv2f16_nxv2f16_nxv2i16: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e16, mf2, ta, ma +; CHECK-NEXT:    vloxei16.v v8, (a0), v8 +; CHECK-NEXT:    ret +entry: +  %a = call <vscale x 2 x half> @llvm.riscv.vloxei.nxv2f16.nxv2i16( +    <vscale x 2 x half> poison, +    ptr %0, +    <vscale x 2 x i16> %1, +    iXLen %2) + +  ret <vscale x 2 x half> %a +} + +declare <vscale x 2 x half> @llvm.riscv.vloxei.mask.nxv2f16.nxv2i16( +  <vscale x 2 x half>, +  ptr, +  <vscale x 2 x i16>, +  <vscale x 2 x i1>, +  iXLen, +  iXLen); + +define <vscale x 2 x half> @intrinsic_vloxei_mask_v_nxv2f16_nxv2f16_nxv2i16(<vscale x 2 x half> %0, ptr %1, <vscale x 2 x i16> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv2f16_nxv2f16_nxv2i16: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e16, mf2, ta, mu +; CHECK-NEXT:    vloxei16.v v8, (a0), v9, v0.t +; CHECK-NEXT:    ret +entry: +  %a = call <vscale x 2 x half> @llvm.riscv.vloxei.mask.nxv2f16.nxv2i16( +    <vscale x 2 x half> %0, +    ptr %1, +    <vscale x 2 x i16> %2, +    <vscale x 2 x i1> %3, +    iXLen %4, iXLen 1) + +  ret <vscale x 2 x half> %a +} + +declare <vscale x 4 x half> @llvm.riscv.vloxei.nxv4f16.nxv4i16( +  <vscale x 4 x half>, +  ptr, +  <vscale x 4 x i16>, +  iXLen); + +define <vscale x 4 x half> @intrinsic_vloxei_v_nxv4f16_nxv4f16_nxv4i16(ptr %0, <vscale x 4 x i16> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv4f16_nxv4f16_nxv4i16: +; CHECK:    
   # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e16, m1, ta, ma +; CHECK-NEXT:    vloxei16.v v8, (a0), v8 +; CHECK-NEXT:    ret +entry: +  %a = call <vscale x 4 x half> @llvm.riscv.vloxei.nxv4f16.nxv4i16( +    <vscale x 4 x half> poison, +    ptr %0, +    <vscale x 4 x i16> %1, +    iXLen %2) + +  ret <vscale x 4 x half> %a +} + +declare <vscale x 4 x half> @llvm.riscv.vloxei.mask.nxv4f16.nxv4i16( +  <vscale x 4 x half>, +  ptr, +  <vscale x 4 x i16>, +  <vscale x 4 x i1>, +  iXLen, +  iXLen); + +define <vscale x 4 x half> @intrinsic_vloxei_mask_v_nxv4f16_nxv4f16_nxv4i16(<vscale x 4 x half> %0, ptr %1, <vscale x 4 x i16> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv4f16_nxv4f16_nxv4i16: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e16, m1, ta, mu +; CHECK-NEXT:    vloxei16.v v8, (a0), v9, v0.t +; CHECK-NEXT:    ret +entry: +  %a = call <vscale x 4 x half> @llvm.riscv.vloxei.mask.nxv4f16.nxv4i16( +    <vscale x 4 x half> %0, +    ptr %1, +    <vscale x 4 x i16> %2, +    <vscale x 4 x i1> %3, +    iXLen %4, iXLen 1) + +  ret <vscale x 4 x half> %a +} + +declare <vscale x 8 x half> @llvm.riscv.vloxei.nxv8f16.nxv8i16( +  <vscale x 8 x half>, +  ptr, +  <vscale x 8 x i16>, +  iXLen); + +define <vscale x 8 x half> @intrinsic_vloxei_v_nxv8f16_nxv8f16_nxv8i16(ptr %0, <vscale x 8 x i16> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv8f16_nxv8f16_nxv8i16: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e16, m2, ta, ma +; CHECK-NEXT:    vloxei16.v v8, (a0), v8 +; CHECK-NEXT:    ret +entry: +  %a = call <vscale x 8 x half> @llvm.riscv.vloxei.nxv8f16.nxv8i16( +    <vscale x 8 x half> poison, +    ptr %0, +    <vscale x 8 x i16> %1, +    iXLen %2) + +  ret <vscale x 8 x half> %a +} + +declare <vscale x 8 x half> @llvm.riscv.vloxei.mask.nxv8f16.nxv8i16( +  <vscale x 8 x half>, +  ptr, +  <vscale x 8 x i16>, +  <vscale x 8 x i1>, +  iXLen, +  iXLen); + +define <vscale x 8 x half> @intrinsic_vloxei_mask_v_nxv8f16_nxv8f16_nxv8i16(<vscale x 8 x half> %0, ptr %1, <vscale x 8 x i16> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv8f16_nxv8f16_nxv8i16: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e16, m2, ta, mu +; CHECK-NEXT:    vloxei16.v v8, (a0), v10, v0.t +; CHECK-NEXT:    ret +entry: +  %a = call <vscale x 8 x half> @llvm.riscv.vloxei.mask.nxv8f16.nxv8i16( +    <vscale x 8 x half> %0, +    ptr %1, +    <vscale x 8 x i16> %2, +    <vscale x 8 x i1> %3, +    iXLen %4, iXLen 1) + +  ret <vscale x 8 x half> %a +} + +declare <vscale x 16 x half> @llvm.riscv.vloxei.nxv16f16.nxv16i16( +  <vscale x 16 x half>, +  ptr, +  <vscale x 16 x i16>, +  iXLen); + +define <vscale x 16 x half> @intrinsic_vloxei_v_nxv16f16_nxv16f16_nxv16i16(ptr %0, <vscale x 16 x i16> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv16f16_nxv16f16_nxv16i16: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e16, m4, ta, ma +; CHECK-NEXT:    vloxei16.v v8, (a0), v8 +; CHECK-NEXT:    ret +entry: +  %a = call <vscale x 16 x half> @llvm.riscv.vloxei.nxv16f16.nxv16i16( +    <vscale x 16 x half> poison, +    ptr %0, +    <vscale x 16 x i16> %1, +    iXLen %2) + +  ret <vscale x 16 x half> %a +} + +declare <vscale x 16 x half> @llvm.riscv.vloxei.mask.nxv16f16.nxv16i16( +  <vscale x 16 x half>, +  ptr, +  <vscale x 16 x i16>, +  <vscale x 16 x i1>, +  iXLen, +  iXLen); + +define <vscale x 16 x half> 
@intrinsic_vloxei_mask_v_nxv16f16_nxv16f16_nxv16i16(<vscale x 16 x half> %0, ptr %1, <vscale x 16 x i16> %2, <vscale x 16 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv16f16_nxv16f16_nxv16i16: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e16, m4, ta, mu +; CHECK-NEXT:    vloxei16.v v8, (a0), v12, v0.t +; CHECK-NEXT:    ret +entry: +  %a = call <vscale x 16 x half> @llvm.riscv.vloxei.mask.nxv16f16.nxv16i16( +    <vscale x 16 x half> %0, +    ptr %1, +    <vscale x 16 x i16> %2, +    <vscale x 16 x i1> %3, +    iXLen %4, iXLen 1) + +  ret <vscale x 16 x half> %a +} + +declare <vscale x 32 x half> @llvm.riscv.vloxei.nxv32f16.nxv32i16( +  <vscale x 32 x half>, +  ptr, +  <vscale x 32 x i16>, +  iXLen); + +define <vscale x 32 x half> @intrinsic_vloxei_v_nxv32f16_nxv32f16_nxv32i16(ptr %0, <vscale x 32 x i16> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv32f16_nxv32f16_nxv32i16: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e16, m8, ta, ma +; CHECK-NEXT:    vloxei16.v v8, (a0), v8 +; CHECK-NEXT:    ret +entry: +  %a = call <vscale x 32 x half> @llvm.riscv.vloxei.nxv32f16.nxv32i16( +    <vscale x 32 x half> poison, +    ptr %0, +    <vscale x 32 x i16> %1, +    iXLen %2) + +  ret <vscale x 32 x half> %a +} + +declare <vscale x 32 x half> @llvm.riscv.vloxei.mask.nxv32f16.nxv32i16( +  <vscale x 32 x half>, +  ptr, +  <vscale x 32 x i16>, +  <vscale x 32 x i1>, +  iXLen, +  iXLen); + +define <vscale x 32 x half> @intrinsic_vloxei_mask_v_nxv32f16_nxv32f16_nxv32i16(<vscale x 32 x half> %0, ptr %1, <vscale x 32 x i16> %2, <vscale x 32 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv32f16_nxv32f16_nxv32i16: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e16, m8, ta, mu +; CHECK-NEXT:    vloxei16.v v8, (a0), v16, v0.t +; CHECK-NEXT:    ret +entry: +  %a = call <vscale x 32 x half> @llvm.riscv.vloxei.mask.nxv32f16.nxv32i16( +    <vscale x 32 x half> %0, +    ptr %1, +    <vscale x 32 x i16> %2, +    <vscale x 32 x i1> %3, +    iXLen %4, iXLen 1) + +  ret <vscale x 32 x half> %a +} + +declare <vscale x 1 x float> @llvm.riscv.vloxei.nxv1f32.nxv1i16( +  <vscale x 1 x float>, +  ptr, +  <vscale x 1 x i16>, +  iXLen); + +define <vscale x 1 x float> @intrinsic_vloxei_v_nxv1f32_nxv1f32_nxv1i16(ptr %0, <vscale x 1 x i16> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv1f32_nxv1f32_nxv1i16: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e32, mf2, ta, ma +; CHECK-NEXT:    vloxei16.v v9, (a0), v8 +; CHECK-NEXT:    vmv1r.v v8, v9 +; CHECK-NEXT:    ret +entry: +  %a = call <vscale x 1 x float> @llvm.riscv.vloxei.nxv1f32.nxv1i16( +    <vscale x 1 x float> poison, +    ptr %0, +    <vscale x 1 x i16> %1, +    iXLen %2) + +  ret <vscale x 1 x float> %a +} + +declare <vscale x 1 x float> @llvm.riscv.vloxei.mask.nxv1f32.nxv1i16( +  <vscale x 1 x float>, +  ptr, +  <vscale x 1 x i16>, +  <vscale x 1 x i1>, +  iXLen, +  iXLen); + +define <vscale x 1 x float> @intrinsic_vloxei_mask_v_nxv1f32_nxv1f32_nxv1i16(<vscale x 1 x float> %0, ptr %1, <vscale x 1 x i16> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv1f32_nxv1f32_nxv1i16: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e32, mf2, ta, mu +; CHECK-NEXT:    vloxei16.v v8, (a0), v9, v0.t +; CHECK-NEXT:    ret +entry: +  %a = call <vscale x 1 x float> @llvm.riscv.vloxei.mask.nxv1f32.nxv1i16( +    <vscale x 1 x float> %0, +    ptr %1, +   
 <vscale x 1 x i16> %2, +    <vscale x 1 x i1> %3, +    iXLen %4, iXLen 1) + +  ret <vscale x 1 x float> %a +} + +declare <vscale x 2 x float> @llvm.riscv.vloxei.nxv2f32.nxv2i16( +  <vscale x 2 x float>, +  ptr, +  <vscale x 2 x i16>, +  iXLen); + +define <vscale x 2 x float> @intrinsic_vloxei_v_nxv2f32_nxv2f32_nxv2i16(ptr %0, <vscale x 2 x i16> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv2f32_nxv2f32_nxv2i16: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e32, m1, ta, ma +; CHECK-NEXT:    vloxei16.v v9, (a0), v8 +; CHECK-NEXT:    vmv.v.v v8, v9 +; CHECK-NEXT:    ret +entry: +  %a = call <vscale x 2 x float> @llvm.riscv.vloxei.nxv2f32.nxv2i16( +    <vscale x 2 x float> poison, +    ptr %0, +    <vscale x 2 x i16> %1, +    iXLen %2) + +  ret <vscale x 2 x float> %a +} + +declare <vscale x 2 x float> @llvm.riscv.vloxei.mask.nxv2f32.nxv2i16( +  <vscale x 2 x float>, +  ptr, +  <vscale x 2 x i16>, +  <vscale x 2 x i1>, +  iXLen, +  iXLen); + +define <vscale x 2 x float> @intrinsic_vloxei_mask_v_nxv2f32_nxv2f32_nxv2i16(<vscale x 2 x float> %0, ptr %1, <vscale x 2 x i16> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv2f32_nxv2f32_nxv2i16: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e32, m1, ta, mu +; CHECK-NEXT:    vloxei16.v v8, (a0), v9, v0.t +; CHECK-NEXT:    ret +entry: +  %a = call <vscale x 2 x float> @llvm.riscv.vloxei.mask.nxv2f32.nxv2i16( +    <vscale x 2 x float> %0, +    ptr %1, +    <vscale x 2 x i16> %2, +    <vscale x 2 x i1> %3, +    iXLen %4, iXLen 1) + +  ret <vscale x 2 x float> %a +} + +declare <vscale x 4 x float> @llvm.riscv.vloxei.nxv4f32.nxv4i16( +  <vscale x 4 x float>, +  ptr, +  <vscale x 4 x i16>, +  iXLen); + +define <vscale x 4 x float> @intrinsic_vloxei_v_nxv4f32_nxv4f32_nxv4i16(ptr %0, <vscale x 4 x i16> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv4f32_nxv4f32_nxv4i16: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetivli zero, 1, e8, m1, ta, ma +; CHECK-NEXT:    vmv1r.v v10, v8 +; CHECK-NEXT:    vsetvli zero, a1, e32, m2, ta, ma +; CHECK-NEXT:    vloxei16.v v8, (a0), v10 +; CHECK-NEXT:    ret +entry: +  %a = call <vscale x 4 x float> @llvm.riscv.vloxei.nxv4f32.nxv4i16( +    <vscale x 4 x float> poison, +    ptr %0, +    <vscale x 4 x i16> %1, +    iXLen %2) + +  ret <vscale x 4 x float> %a +} + +declare <vscale x 4 x float> @llvm.riscv.vloxei.mask.nxv4f32.nxv4i16( +  <vscale x 4 x float>, +  ptr, +  <vscale x 4 x i16>, +  <vscale x 4 x i1>, +  iXLen, +  iXLen); + +define <vscale x 4 x float> @intrinsic_vloxei_mask_v_nxv4f32_nxv4f32_nxv4i16(<vscale x 4 x float> %0, ptr %1, <vscale x 4 x i16> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv4f32_nxv4f32_nxv4i16: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e32, m2, ta, mu +; CHECK-NEXT:    vloxei16.v v8, (a0), v10, v0.t +; CHECK-NEXT:    ret +entry: +  %a = call <vscale x 4 x float> @llvm.riscv.vloxei.mask.nxv4f32.nxv4i16( +    <vscale x 4 x float> %0, +    ptr %1, +    <vscale x 4 x i16> %2, +    <vscale x 4 x i1> %3, +    iXLen %4, iXLen 1) + +  ret <vscale x 4 x float> %a +} + +declare <vscale x 8 x float> @llvm.riscv.vloxei.nxv8f32.nxv8i16( +  <vscale x 8 x float>, +  ptr, +  <vscale x 8 x i16>, +  iXLen); + +define <vscale x 8 x float> @intrinsic_vloxei_v_nxv8f32_nxv8f32_nxv8i16(ptr %0, <vscale x 8 x i16> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv8f32_nxv8f32_nxv8i16: +; CHECK:       # 
%bb.0: # %entry +; CHECK-NEXT:    vsetivli zero, 1, e8, m1, ta, ma +; CHECK-NEXT:    vmv2r.v v12, v8 +; CHECK-NEXT:    vsetvli zero, a1, e32, m4, ta, ma +; CHECK-NEXT:    vloxei16.v v8, (a0), v12 +; CHECK-NEXT:    ret +entry: +  %a = call <vscale x 8 x float> @llvm.riscv.vloxei.nxv8f32.nxv8i16( +    <vscale x 8 x float> poison, +    ptr %0, +    <vscale x 8 x i16> %1, +    iXLen %2) + +  ret <vscale x 8 x float> %a +} + +declare <vscale x 8 x float> @llvm.riscv.vloxei.mask.nxv8f32.nxv8i16( +  <vscale x 8 x float>, +  ptr, +  <vscale x 8 x i16>, +  <vscale x 8 x i1>, +  iXLen, +  iXLen); + +define <vscale x 8 x float> @intrinsic_vloxei_mask_v_nxv8f32_nxv8f32_nxv8i16(<vscale x 8 x float> %0, ptr %1, <vscale x 8 x i16> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv8f32_nxv8f32_nxv8i16: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e32, m4, ta, mu +; CHECK-NEXT:    vloxei16.v v8, (a0), v12, v0.t +; CHECK-NEXT:    ret +entry: +  %a = call <vscale x 8 x float> @llvm.riscv.vloxei.mask.nxv8f32.nxv8i16( +    <vscale x 8 x float> %0, +    ptr %1, +    <vscale x 8 x i16> %2, +    <vscale x 8 x i1> %3, +    iXLen %4, iXLen 1) + +  ret <vscale x 8 x float> %a +} + +declare <vscale x 16 x float> @llvm.riscv.vloxei.nxv16f32.nxv16i16( +  <vscale x 16 x float>, +  ptr, +  <vscale x 16 x i16>, +  iXLen); + +define <vscale x 16 x float> @intrinsic_vloxei_v_nxv16f32_nxv16f32_nxv16i16(ptr %0, <vscale x 16 x i16> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv16f32_nxv16f32_nxv16i16: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetivli zero, 1, e8, m1, ta, ma +; CHECK-NEXT:    vmv4r.v v16, v8 +; CHECK-NEXT:    vsetvli zero, a1, e32, m8, ta, ma +; CHECK-NEXT:    vloxei16.v v8, (a0), v16 +; CHECK-NEXT:    ret +entry: +  %a = call <vscale x 16 x float> @llvm.riscv.vloxei.nxv16f32.nxv16i16( +    <vscale x 16 x float> poison, +    ptr %0, +    <vscale x 16 x i16> %1, +    iXLen %2) + +  ret <vscale x 16 x float> %a +} + +declare <vscale x 16 x float> @llvm.riscv.vloxei.mask.nxv16f32.nxv16i16( +  <vscale x 16 x float>, +  ptr, +  <vscale x 16 x i16>, +  <vscale x 16 x i1>, +  iXLen, +  iXLen); + +define <vscale x 16 x float> @intrinsic_vloxei_mask_v_nxv16f32_nxv16f32_nxv16i16(<vscale x 16 x float> %0, ptr %1, <vscale x 16 x i16> %2, <vscale x 16 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv16f32_nxv16f32_nxv16i16: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e32, m8, ta, mu +; CHECK-NEXT:    vloxei16.v v8, (a0), v16, v0.t +; CHECK-NEXT:    ret +entry: +  %a = call <vscale x 16 x float> @llvm.riscv.vloxei.mask.nxv16f32.nxv16i16( +    <vscale x 16 x float> %0, +    ptr %1, +    <vscale x 16 x i16> %2, +    <vscale x 16 x i1> %3, +    iXLen %4, iXLen 1) + +  ret <vscale x 16 x float> %a +} + +declare <vscale x 1 x double> @llvm.riscv.vloxei.nxv1f64.nxv1i16( +  <vscale x 1 x double>, +  ptr, +  <vscale x 1 x i16>, +  iXLen); + +define <vscale x 1 x double> @intrinsic_vloxei_v_nxv1f64_nxv1f64_nxv1i16(ptr %0, <vscale x 1 x i16> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv1f64_nxv1f64_nxv1i16: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e64, m1, ta, ma +; CHECK-NEXT:    vloxei16.v v9, (a0), v8 +; CHECK-NEXT:    vmv.v.v v8, v9 +; CHECK-NEXT:    ret +entry: +  %a = call <vscale x 1 x double> @llvm.riscv.vloxei.nxv1f64.nxv1i16( +    <vscale x 1 x double> poison, +    ptr %0, +    <vscale x 1 x i16> %1, +    iXLen %2) + +  ret <vscale x 1 x 
double> %a +} + +declare <vscale x 1 x double> @llvm.riscv.vloxei.mask.nxv1f64.nxv1i16( +  <vscale x 1 x double>, +  ptr, +  <vscale x 1 x i16>, +  <vscale x 1 x i1>, +  iXLen, +  iXLen); + +define <vscale x 1 x double> @intrinsic_vloxei_mask_v_nxv1f64_nxv1f64_nxv1i16(<vscale x 1 x double> %0, ptr %1, <vscale x 1 x i16> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv1f64_nxv1f64_nxv1i16: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e64, m1, ta, mu +; CHECK-NEXT:    vloxei16.v v8, (a0), v9, v0.t +; CHECK-NEXT:    ret +entry: +  %a = call <vscale x 1 x double> @llvm.riscv.vloxei.mask.nxv1f64.nxv1i16( +    <vscale x 1 x double> %0, +    ptr %1, +    <vscale x 1 x i16> %2, +    <vscale x 1 x i1> %3, +    iXLen %4, iXLen 1) + +  ret <vscale x 1 x double> %a +} + +declare <vscale x 2 x double> @llvm.riscv.vloxei.nxv2f64.nxv2i16( +  <vscale x 2 x double>, +  ptr, +  <vscale x 2 x i16>, +  iXLen); + +define <vscale x 2 x double> @intrinsic_vloxei_v_nxv2f64_nxv2f64_nxv2i16(ptr %0, <vscale x 2 x i16> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv2f64_nxv2f64_nxv2i16: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetivli zero, 1, e8, m1, ta, ma +; CHECK-NEXT:    vmv1r.v v10, v8 +; CHECK-NEXT:    vsetvli zero, a1, e64, m2, ta, ma +; CHECK-NEXT:    vloxei16.v v8, (a0), v10 +; CHECK-NEXT:    ret +entry: +  %a = call <vscale x 2 x double> @llvm.riscv.vloxei.nxv2f64.nxv2i16( +    <vscale x 2 x double> poison, +    ptr %0, +    <vscale x 2 x i16> %1, +    iXLen %2) + +  ret <vscale x 2 x double> %a +} + +declare <vscale x 2 x double> @llvm.riscv.vloxei.mask.nxv2f64.nxv2i16( +  <vscale x 2 x double>, +  ptr, +  <vscale x 2 x i16>, +  <vscale x 2 x i1>, +  iXLen, +  iXLen); + +define <vscale x 2 x double> @intrinsic_vloxei_mask_v_nxv2f64_nxv2f64_nxv2i16(<vscale x 2 x double> %0, ptr %1, <vscale x 2 x i16> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv2f64_nxv2f64_nxv2i16: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e64, m2, ta, mu +; CHECK-NEXT:    vloxei16.v v8, (a0), v10, v0.t +; CHECK-NEXT:    ret +entry: +  %a = call <vscale x 2 x double> @llvm.riscv.vloxei.mask.nxv2f64.nxv2i16( +    <vscale x 2 x double> %0, +    ptr %1, +    <vscale x 2 x i16> %2, +    <vscale x 2 x i1> %3, +    iXLen %4, iXLen 1) + +  ret <vscale x 2 x double> %a +} + +declare <vscale x 4 x double> @llvm.riscv.vloxei.nxv4f64.nxv4i16( +  <vscale x 4 x double>, +  ptr, +  <vscale x 4 x i16>, +  iXLen); + +define <vscale x 4 x double> @intrinsic_vloxei_v_nxv4f64_nxv4f64_nxv4i16(ptr %0, <vscale x 4 x i16> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv4f64_nxv4f64_nxv4i16: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetivli zero, 1, e8, m1, ta, ma +; CHECK-NEXT:    vmv1r.v v12, v8 +; CHECK-NEXT:    vsetvli zero, a1, e64, m4, ta, ma +; CHECK-NEXT:    vloxei16.v v8, (a0), v12 +; CHECK-NEXT:    ret +entry: +  %a = call <vscale x 4 x double> @llvm.riscv.vloxei.nxv4f64.nxv4i16( +    <vscale x 4 x double> poison, +    ptr %0, +    <vscale x 4 x i16> %1, +    iXLen %2) + +  ret <vscale x 4 x double> %a +} + +declare <vscale x 4 x double> @llvm.riscv.vloxei.mask.nxv4f64.nxv4i16( +  <vscale x 4 x double>, +  ptr, +  <vscale x 4 x i16>, +  <vscale x 4 x i1>, +  iXLen, +  iXLen); + +define <vscale x 4 x double> @intrinsic_vloxei_mask_v_nxv4f64_nxv4f64_nxv4i16(<vscale x 4 x double> %0, ptr %1, <vscale x 4 x i16> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind { +; 
CHECK-LABEL: intrinsic_vloxei_mask_v_nxv4f64_nxv4f64_nxv4i16: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e64, m4, ta, mu +; CHECK-NEXT:    vloxei16.v v8, (a0), v12, v0.t +; CHECK-NEXT:    ret +entry: +  %a = call <vscale x 4 x double> @llvm.riscv.vloxei.mask.nxv4f64.nxv4i16( +    <vscale x 4 x double> %0, +    ptr %1, +    <vscale x 4 x i16> %2, +    <vscale x 4 x i1> %3, +    iXLen %4, iXLen 1) + +  ret <vscale x 4 x double> %a +} + +declare <vscale x 8 x double> @llvm.riscv.vloxei.nxv8f64.nxv8i16( +  <vscale x 8 x double>, +  ptr, +  <vscale x 8 x i16>, +  iXLen); + +define <vscale x 8 x double> @intrinsic_vloxei_v_nxv8f64_nxv8f64_nxv8i16(ptr %0, <vscale x 8 x i16> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv8f64_nxv8f64_nxv8i16: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetivli zero, 1, e8, m1, ta, ma +; CHECK-NEXT:    vmv2r.v v16, v8 +; CHECK-NEXT:    vsetvli zero, a1, e64, m8, ta, ma +; CHECK-NEXT:    vloxei16.v v8, (a0), v16 +; CHECK-NEXT:    ret +entry: +  %a = call <vscale x 8 x double> @llvm.riscv.vloxei.nxv8f64.nxv8i16( +    <vscale x 8 x double> poison, +    ptr %0, +    <vscale x 8 x i16> %1, +    iXLen %2) + +  ret <vscale x 8 x double> %a +} + +declare <vscale x 8 x double> @llvm.riscv.vloxei.mask.nxv8f64.nxv8i16( +  <vscale x 8 x double>, +  ptr, +  <vscale x 8 x i16>, +  <vscale x 8 x i1>, +  iXLen, +  iXLen); + +define <vscale x 8 x double> @intrinsic_vloxei_mask_v_nxv8f64_nxv8f64_nxv8i16(<vscale x 8 x double> %0, ptr %1, <vscale x 8 x i16> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv8f64_nxv8f64_nxv8i16: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e64, m8, ta, mu +; CHECK-NEXT:    vloxei16.v v8, (a0), v16, v0.t +; CHECK-NEXT:    ret +entry: +  %a = call <vscale x 8 x double> @llvm.riscv.vloxei.mask.nxv8f64.nxv8i16( +    <vscale x 8 x double> %0, +    ptr %1, +    <vscale x 8 x i16> %2, +    <vscale x 8 x i1> %3, +    iXLen %4, iXLen 1) + +  ret <vscale x 8 x double> %a +} + +declare <vscale x 1 x i8> @llvm.riscv.vloxei.nxv1i8.nxv1i8( +  <vscale x 1 x i8>, +  ptr, +  <vscale x 1 x i8>, +  iXLen); + +define <vscale x 1 x i8> @intrinsic_vloxei_v_nxv1i8_nxv1i8_nxv1i8(ptr %0, <vscale x 1 x i8> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv1i8_nxv1i8_nxv1i8: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e8, mf8, ta, ma +; CHECK-NEXT:    vloxei8.v v8, (a0), v8 +; CHECK-NEXT:    ret +entry: +  %a = call <vscale x 1 x i8> @llvm.riscv.vloxei.nxv1i8.nxv1i8( +    <vscale x 1 x i8> poison, +    ptr %0, +    <vscale x 1 x i8> %1, +    iXLen %2) + +  ret <vscale x 1 x i8> %a +} + +declare <vscale x 1 x i8> @llvm.riscv.vloxei.mask.nxv1i8.nxv1i8( +  <vscale x 1 x i8>, +  ptr, +  <vscale x 1 x i8>, +  <vscale x 1 x i1>, +  iXLen, +  iXLen); + +define <vscale x 1 x i8> @intrinsic_vloxei_mask_v_nxv1i8_nxv1i8_nxv1i8(<vscale x 1 x i8> %0, ptr %1, <vscale x 1 x i8> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv1i8_nxv1i8_nxv1i8: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e8, mf8, ta, mu +; CHECK-NEXT:    vloxei8.v v8, (a0), v9, v0.t +; CHECK-NEXT:    ret +entry: +  %a = call <vscale x 1 x i8> @llvm.riscv.vloxei.mask.nxv1i8.nxv1i8( +    <vscale x 1 x i8> %0, +    ptr %1, +    <vscale x 1 x i8> %2, +    <vscale x 1 x i1> %3, +    iXLen %4, iXLen 1) + +  ret <vscale x 1 x i8> %a +} + +declare <vscale x 2 x i8> @llvm.riscv.vloxei.nxv2i8.nxv2i8( +  <vscale x 2 x 
i8>, +  ptr, +  <vscale x 2 x i8>, +  iXLen); + +define <vscale x 2 x i8> @intrinsic_vloxei_v_nxv2i8_nxv2i8_nxv2i8(ptr %0, <vscale x 2 x i8> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv2i8_nxv2i8_nxv2i8: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e8, mf4, ta, ma +; CHECK-NEXT:    vloxei8.v v8, (a0), v8 +; CHECK-NEXT:    ret +entry: +  %a = call <vscale x 2 x i8> @llvm.riscv.vloxei.nxv2i8.nxv2i8( +    <vscale x 2 x i8> poison, +    ptr %0, +    <vscale x 2 x i8> %1, +    iXLen %2) + +  ret <vscale x 2 x i8> %a +} + +declare <vscale x 2 x i8> @llvm.riscv.vloxei.mask.nxv2i8.nxv2i8( +  <vscale x 2 x i8>, +  ptr, +  <vscale x 2 x i8>, +  <vscale x 2 x i1>, +  iXLen, +  iXLen); + +define <vscale x 2 x i8> @intrinsic_vloxei_mask_v_nxv2i8_nxv2i8_nxv2i8(<vscale x 2 x i8> %0, ptr %1, <vscale x 2 x i8> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv2i8_nxv2i8_nxv2i8: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e8, mf4, ta, mu +; CHECK-NEXT:    vloxei8.v v8, (a0), v9, v0.t +; CHECK-NEXT:    ret +entry: +  %a = call <vscale x 2 x i8> @llvm.riscv.vloxei.mask.nxv2i8.nxv2i8( +    <vscale x 2 x i8> %0, +    ptr %1, +    <vscale x 2 x i8> %2, +    <vscale x 2 x i1> %3, +    iXLen %4, iXLen 1) + +  ret <vscale x 2 x i8> %a +} + +declare <vscale x 4 x i8> @llvm.riscv.vloxei.nxv4i8.nxv4i8( +  <vscale x 4 x i8>, +  ptr, +  <vscale x 4 x i8>, +  iXLen); + +define <vscale x 4 x i8> @intrinsic_vloxei_v_nxv4i8_nxv4i8_nxv4i8(ptr %0, <vscale x 4 x i8> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv4i8_nxv4i8_nxv4i8: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e8, mf2, ta, ma +; CHECK-NEXT:    vloxei8.v v8, (a0), v8 +; CHECK-NEXT:    ret +entry: +  %a = call <vscale x 4 x i8> @llvm.riscv.vloxei.nxv4i8.nxv4i8( +    <vscale x 4 x i8> poison, +    ptr %0, +    <vscale x 4 x i8> %1, +    iXLen %2) + +  ret <vscale x 4 x i8> %a +} + +declare <vscale x 4 x i8> @llvm.riscv.vloxei.mask.nxv4i8.nxv4i8( +  <vscale x 4 x i8>, +  ptr, +  <vscale x 4 x i8>, +  <vscale x 4 x i1>, +  iXLen, +  iXLen); + +define <vscale x 4 x i8> @intrinsic_vloxei_mask_v_nxv4i8_nxv4i8_nxv4i8(<vscale x 4 x i8> %0, ptr %1, <vscale x 4 x i8> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv4i8_nxv4i8_nxv4i8: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e8, mf2, ta, mu +; CHECK-NEXT:    vloxei8.v v8, (a0), v9, v0.t +; CHECK-NEXT:    ret +entry: +  %a = call <vscale x 4 x i8> @llvm.riscv.vloxei.mask.nxv4i8.nxv4i8( +    <vscale x 4 x i8> %0, +    ptr %1, +    <vscale x 4 x i8> %2, +    <vscale x 4 x i1> %3, +    iXLen %4, iXLen 1) + +  ret <vscale x 4 x i8> %a +} + +declare <vscale x 8 x i8> @llvm.riscv.vloxei.nxv8i8.nxv8i8( +  <vscale x 8 x i8>, +  ptr, +  <vscale x 8 x i8>, +  iXLen); + +define <vscale x 8 x i8> @intrinsic_vloxei_v_nxv8i8_nxv8i8_nxv8i8(ptr %0, <vscale x 8 x i8> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv8i8_nxv8i8_nxv8i8: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e8, m1, ta, ma +; CHECK-NEXT:    vloxei8.v v8, (a0), v8 +; CHECK-NEXT:    ret +entry: +  %a = call <vscale x 8 x i8> @llvm.riscv.vloxei.nxv8i8.nxv8i8( +    <vscale x 8 x i8> poison, +    ptr %0, +    <vscale x 8 x i8> %1, +    iXLen %2) + +  ret <vscale x 8 x i8> %a +} + +declare <vscale x 8 x i8> @llvm.riscv.vloxei.mask.nxv8i8.nxv8i8( +  <vscale x 8 x i8>, +  ptr, +  <vscale x 8 x i8>, +  <vscale x 8 x i1>, +  
iXLen, +  iXLen); + +define <vscale x 8 x i8> @intrinsic_vloxei_mask_v_nxv8i8_nxv8i8_nxv8i8(<vscale x 8 x i8> %0, ptr %1, <vscale x 8 x i8> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv8i8_nxv8i8_nxv8i8: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e8, m1, ta, mu +; CHECK-NEXT:    vloxei8.v v8, (a0), v9, v0.t +; CHECK-NEXT:    ret +entry: +  %a = call <vscale x 8 x i8> @llvm.riscv.vloxei.mask.nxv8i8.nxv8i8( +    <vscale x 8 x i8> %0, +    ptr %1, +    <vscale x 8 x i8> %2, +    <vscale x 8 x i1> %3, +    iXLen %4, iXLen 1) + +  ret <vscale x 8 x i8> %a +} + +declare <vscale x 16 x i8> @llvm.riscv.vloxei.nxv16i8.nxv16i8( +  <vscale x 16 x i8>, +  ptr, +  <vscale x 16 x i8>, +  iXLen); + +define <vscale x 16 x i8> @intrinsic_vloxei_v_nxv16i8_nxv16i8_nxv16i8(ptr %0, <vscale x 16 x i8> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv16i8_nxv16i8_nxv16i8: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e8, m2, ta, ma +; CHECK-NEXT:    vloxei8.v v8, (a0), v8 +; CHECK-NEXT:    ret +entry: +  %a = call <vscale x 16 x i8> @llvm.riscv.vloxei.nxv16i8.nxv16i8( +    <vscale x 16 x i8> poison, +    ptr %0, +    <vscale x 16 x i8> %1, +    iXLen %2) + +  ret <vscale x 16 x i8> %a +} + +declare <vscale x 16 x i8> @llvm.riscv.vloxei.mask.nxv16i8.nxv16i8( +  <vscale x 16 x i8>, +  ptr, +  <vscale x 16 x i8>, +  <vscale x 16 x i1>, +  iXLen, +  iXLen); + +define <vscale x 16 x i8> @intrinsic_vloxei_mask_v_nxv16i8_nxv16i8_nxv16i8(<vscale x 16 x i8> %0, ptr %1, <vscale x 16 x i8> %2, <vscale x 16 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv16i8_nxv16i8_nxv16i8: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e8, m2, ta, mu +; CHECK-NEXT:    vloxei8.v v8, (a0), v10, v0.t +; CHECK-NEXT:    ret +entry: +  %a = call <vscale x 16 x i8> @llvm.riscv.vloxei.mask.nxv16i8.nxv16i8( +    <vscale x 16 x i8> %0, +    ptr %1, +    <vscale x 16 x i8> %2, +    <vscale x 16 x i1> %3, +    iXLen %4, iXLen 1) + +  ret <vscale x 16 x i8> %a +} + +declare <vscale x 32 x i8> @llvm.riscv.vloxei.nxv32i8.nxv32i8( +  <vscale x 32 x i8>, +  ptr, +  <vscale x 32 x i8>, +  iXLen); + +define <vscale x 32 x i8> @intrinsic_vloxei_v_nxv32i8_nxv32i8_nxv32i8(ptr %0, <vscale x 32 x i8> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv32i8_nxv32i8_nxv32i8: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e8, m4, ta, ma +; CHECK-NEXT:    vloxei8.v v8, (a0), v8 +; CHECK-NEXT:    ret +entry: +  %a = call <vscale x 32 x i8> @llvm.riscv.vloxei.nxv32i8.nxv32i8( +    <vscale x 32 x i8> poison, +    ptr %0, +    <vscale x 32 x i8> %1, +    iXLen %2) + +  ret <vscale x 32 x i8> %a +} + +declare <vscale x 32 x i8> @llvm.riscv.vloxei.mask.nxv32i8.nxv32i8( +  <vscale x 32 x i8>, +  ptr, +  <vscale x 32 x i8>, +  <vscale x 32 x i1>, +  iXLen, +  iXLen); + +define <vscale x 32 x i8> @intrinsic_vloxei_mask_v_nxv32i8_nxv32i8_nxv32i8(<vscale x 32 x i8> %0, ptr %1, <vscale x 32 x i8> %2, <vscale x 32 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv32i8_nxv32i8_nxv32i8: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e8, m4, ta, mu +; CHECK-NEXT:    vloxei8.v v8, (a0), v12, v0.t +; CHECK-NEXT:    ret +entry: +  %a = call <vscale x 32 x i8> @llvm.riscv.vloxei.mask.nxv32i8.nxv32i8( +    <vscale x 32 x i8> %0, +    ptr %1, +    <vscale x 32 x i8> %2, +    <vscale x 32 x i1> %3, +    iXLen %4, iXLen 1) + +  ret <vscale x 32 x i8> %a 
+} + +declare <vscale x 64 x i8> @llvm.riscv.vloxei.nxv64i8.nxv64i8( +  <vscale x 64 x i8>, +  ptr, +  <vscale x 64 x i8>, +  iXLen); + +define <vscale x 64 x i8> @intrinsic_vloxei_v_nxv64i8_nxv64i8_nxv64i8(ptr %0, <vscale x 64 x i8> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv64i8_nxv64i8_nxv64i8: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e8, m8, ta, ma +; CHECK-NEXT:    vloxei8.v v8, (a0), v8 +; CHECK-NEXT:    ret +entry: +  %a = call <vscale x 64 x i8> @llvm.riscv.vloxei.nxv64i8.nxv64i8( +    <vscale x 64 x i8> poison, +    ptr %0, +    <vscale x 64 x i8> %1, +    iXLen %2) + +  ret <vscale x 64 x i8> %a +} + +declare <vscale x 64 x i8> @llvm.riscv.vloxei.mask.nxv64i8.nxv64i8( +  <vscale x 64 x i8>, +  ptr, +  <vscale x 64 x i8>, +  <vscale x 64 x i1>, +  iXLen, +  iXLen); + +define <vscale x 64 x i8> @intrinsic_vloxei_mask_v_nxv64i8_nxv64i8_nxv64i8(<vscale x 64 x i8> %0, ptr %1, <vscale x 64 x i8> %2, <vscale x 64 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv64i8_nxv64i8_nxv64i8: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e8, m8, ta, mu +; CHECK-NEXT:    vloxei8.v v8, (a0), v16, v0.t +; CHECK-NEXT:    ret +entry: +  %a = call <vscale x 64 x i8> @llvm.riscv.vloxei.mask.nxv64i8.nxv64i8( +    <vscale x 64 x i8> %0, +    ptr %1, +    <vscale x 64 x i8> %2, +    <vscale x 64 x i1> %3, +    iXLen %4, iXLen 1) + +  ret <vscale x 64 x i8> %a +} + +declare <vscale x 1 x i16> @llvm.riscv.vloxei.nxv1i16.nxv1i8( +  <vscale x 1 x i16>, +  ptr, +  <vscale x 1 x i8>, +  iXLen); + +define <vscale x 1 x i16> @intrinsic_vloxei_v_nxv1i16_nxv1i16_nxv1i8(ptr %0, <vscale x 1 x i8> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv1i16_nxv1i16_nxv1i8: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e16, mf4, ta, ma +; CHECK-NEXT:    vloxei8.v v9, (a0), v8 +; CHECK-NEXT:    vmv1r.v v8, v9 +; CHECK-NEXT:    ret +entry: +  %a = call <vscale x 1 x i16> @llvm.riscv.vloxei.nxv1i16.nxv1i8( +    <vscale x 1 x i16> poison, +    ptr %0, +    <vscale x 1 x i8> %1, +    iXLen %2) + +  ret <vscale x 1 x i16> %a +} + +declare <vscale x 1 x i16> @llvm.riscv.vloxei.mask.nxv1i16.nxv1i8( +  <vscale x 1 x i16>, +  ptr, +  <vscale x 1 x i8>, +  <vscale x 1 x i1>, +  iXLen, +  iXLen); + +define <vscale x 1 x i16> @intrinsic_vloxei_mask_v_nxv1i16_nxv1i16_nxv1i8(<vscale x 1 x i16> %0, ptr %1, <vscale x 1 x i8> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv1i16_nxv1i16_nxv1i8: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e16, mf4, ta, mu +; CHECK-NEXT:    vloxei8.v v8, (a0), v9, v0.t +; CHECK-NEXT:    ret +entry: +  %a = call <vscale x 1 x i16> @llvm.riscv.vloxei.mask.nxv1i16.nxv1i8( +    <vscale x 1 x i16> %0, +    ptr %1, +    <vscale x 1 x i8> %2, +    <vscale x 1 x i1> %3, +    iXLen %4, iXLen 1) + +  ret <vscale x 1 x i16> %a +} + +declare <vscale x 2 x i16> @llvm.riscv.vloxei.nxv2i16.nxv2i8( +  <vscale x 2 x i16>, +  ptr, +  <vscale x 2 x i8>, +  iXLen); + +define <vscale x 2 x i16> @intrinsic_vloxei_v_nxv2i16_nxv2i16_nxv2i8(ptr %0, <vscale x 2 x i8> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv2i16_nxv2i16_nxv2i8: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e16, mf2, ta, ma +; CHECK-NEXT:    vloxei8.v v9, (a0), v8 +; CHECK-NEXT:    vmv1r.v v8, v9 +; CHECK-NEXT:    ret +entry: +  %a = call <vscale x 2 x i16> @llvm.riscv.vloxei.nxv2i16.nxv2i8( +    <vscale x 2 x i16> poison, +    
ptr %0, +    <vscale x 2 x i8> %1, +    iXLen %2) + +  ret <vscale x 2 x i16> %a +} + +declare <vscale x 2 x i16> @llvm.riscv.vloxei.mask.nxv2i16.nxv2i8( +  <vscale x 2 x i16>, +  ptr, +  <vscale x 2 x i8>, +  <vscale x 2 x i1>, +  iXLen, +  iXLen); + +define <vscale x 2 x i16> @intrinsic_vloxei_mask_v_nxv2i16_nxv2i16_nxv2i8(<vscale x 2 x i16> %0, ptr %1, <vscale x 2 x i8> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv2i16_nxv2i16_nxv2i8: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e16, mf2, ta, mu +; CHECK-NEXT:    vloxei8.v v8, (a0), v9, v0.t +; CHECK-NEXT:    ret +entry: +  %a = call <vscale x 2 x i16> @llvm.riscv.vloxei.mask.nxv2i16.nxv2i8( +    <vscale x 2 x i16> %0, +    ptr %1, +    <vscale x 2 x i8> %2, +    <vscale x 2 x i1> %3, +    iXLen %4, iXLen 1) + +  ret <vscale x 2 x i16> %a +} + +declare <vscale x 4 x i16> @llvm.riscv.vloxei.nxv4i16.nxv4i8( +  <vscale x 4 x i16>, +  ptr, +  <vscale x 4 x i8>, +  iXLen); + +define <vscale x 4 x i16> @intrinsic_vloxei_v_nxv4i16_nxv4i16_nxv4i8(ptr %0, <vscale x 4 x i8> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv4i16_nxv4i16_nxv4i8: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e16, m1, ta, ma +; CHECK-NEXT:    vloxei8.v v9, (a0), v8 +; CHECK-NEXT:    vmv.v.v v8, v9 +; CHECK-NEXT:    ret +entry: +  %a = call <vscale x 4 x i16> @llvm.riscv.vloxei.nxv4i16.nxv4i8( +    <vscale x 4 x i16> poison, +    ptr %0, +    <vscale x 4 x i8> %1, +    iXLen %2) + +  ret <vscale x 4 x i16> %a +} + +declare <vscale x 4 x i16> @llvm.riscv.vloxei.mask.nxv4i16.nxv4i8( +  <vscale x 4 x i16>, +  ptr, +  <vscale x 4 x i8>, +  <vscale x 4 x i1>, +  iXLen, +  iXLen); + +define <vscale x 4 x i16> @intrinsic_vloxei_mask_v_nxv4i16_nxv4i16_nxv4i8(<vscale x 4 x i16> %0, ptr %1, <vscale x 4 x i8> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv4i16_nxv4i16_nxv4i8: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e16, m1, ta, mu +; CHECK-NEXT:    vloxei8.v v8, (a0), v9, v0.t +; CHECK-NEXT:    ret +entry: +  %a = call <vscale x 4 x i16> @llvm.riscv.vloxei.mask.nxv4i16.nxv4i8( +    <vscale x 4 x i16> %0, +    ptr %1, +    <vscale x 4 x i8> %2, +    <vscale x 4 x i1> %3, +    iXLen %4, iXLen 1) + +  ret <vscale x 4 x i16> %a +} + +declare <vscale x 8 x i16> @llvm.riscv.vloxei.nxv8i16.nxv8i8( +  <vscale x 8 x i16>, +  ptr, +  <vscale x 8 x i8>, +  iXLen); + +define <vscale x 8 x i16> @intrinsic_vloxei_v_nxv8i16_nxv8i16_nxv8i8(ptr %0, <vscale x 8 x i8> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv8i16_nxv8i16_nxv8i8: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetivli zero, 1, e8, m1, ta, ma +; CHECK-NEXT:    vmv1r.v v10, v8 +; CHECK-NEXT:    vsetvli zero, a1, e16, m2, ta, ma +; CHECK-NEXT:    vloxei8.v v8, (a0), v10 +; CHECK-NEXT:    ret +entry: +  %a = call <vscale x 8 x i16> @llvm.riscv.vloxei.nxv8i16.nxv8i8( +    <vscale x 8 x i16> poison, +    ptr %0, +    <vscale x 8 x i8> %1, +    iXLen %2) + +  ret <vscale x 8 x i16> %a +} + +declare <vscale x 8 x i16> @llvm.riscv.vloxei.mask.nxv8i16.nxv8i8( +  <vscale x 8 x i16>, +  ptr, +  <vscale x 8 x i8>, +  <vscale x 8 x i1>, +  iXLen, +  iXLen); + +define <vscale x 8 x i16> @intrinsic_vloxei_mask_v_nxv8i16_nxv8i16_nxv8i8(<vscale x 8 x i16> %0, ptr %1, <vscale x 8 x i8> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv8i16_nxv8i16_nxv8i8: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    
vsetvli zero, a1, e16, m2, ta, mu +; CHECK-NEXT:    vloxei8.v v8, (a0), v10, v0.t +; CHECK-NEXT:    ret +entry: +  %a = call <vscale x 8 x i16> @llvm.riscv.vloxei.mask.nxv8i16.nxv8i8( +    <vscale x 8 x i16> %0, +    ptr %1, +    <vscale x 8 x i8> %2, +    <vscale x 8 x i1> %3, +    iXLen %4, iXLen 1) + +  ret <vscale x 8 x i16> %a +} + +declare <vscale x 16 x i16> @llvm.riscv.vloxei.nxv16i16.nxv16i8( +  <vscale x 16 x i16>, +  ptr, +  <vscale x 16 x i8>, +  iXLen); + +define <vscale x 16 x i16> @intrinsic_vloxei_v_nxv16i16_nxv16i16_nxv16i8(ptr %0, <vscale x 16 x i8> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv16i16_nxv16i16_nxv16i8: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetivli zero, 1, e8, m1, ta, ma +; CHECK-NEXT:    vmv2r.v v12, v8 +; CHECK-NEXT:    vsetvli zero, a1, e16, m4, ta, ma +; CHECK-NEXT:    vloxei8.v v8, (a0), v12 +; CHECK-NEXT:    ret +entry: +  %a = call <vscale x 16 x i16> @llvm.riscv.vloxei.nxv16i16.nxv16i8( +    <vscale x 16 x i16> poison, +    ptr %0, +    <vscale x 16 x i8> %1, +    iXLen %2) + +  ret <vscale x 16 x i16> %a +} + +declare <vscale x 16 x i16> @llvm.riscv.vloxei.mask.nxv16i16.nxv16i8( +  <vscale x 16 x i16>, +  ptr, +  <vscale x 16 x i8>, +  <vscale x 16 x i1>, +  iXLen, +  iXLen); + +define <vscale x 16 x i16> @intrinsic_vloxei_mask_v_nxv16i16_nxv16i16_nxv16i8(<vscale x 16 x i16> %0, ptr %1, <vscale x 16 x i8> %2, <vscale x 16 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv16i16_nxv16i16_nxv16i8: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e16, m4, ta, mu +; CHECK-NEXT:    vloxei8.v v8, (a0), v12, v0.t +; CHECK-NEXT:    ret +entry: +  %a = call <vscale x 16 x i16> @llvm.riscv.vloxei.mask.nxv16i16.nxv16i8( +    <vscale x 16 x i16> %0, +    ptr %1, +    <vscale x 16 x i8> %2, +    <vscale x 16 x i1> %3, +    iXLen %4, iXLen 1) + +  ret <vscale x 16 x i16> %a +} + +declare <vscale x 32 x i16> @llvm.riscv.vloxei.nxv32i16.nxv32i8( +  <vscale x 32 x i16>, +  ptr, +  <vscale x 32 x i8>, +  iXLen); + +define <vscale x 32 x i16> @intrinsic_vloxei_v_nxv32i16_nxv32i16_nxv32i8(ptr %0, <vscale x 32 x i8> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv32i16_nxv32i16_nxv32i8: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetivli zero, 1, e8, m1, ta, ma +; CHECK-NEXT:    vmv4r.v v16, v8 +; CHECK-NEXT:    vsetvli zero, a1, e16, m8, ta, ma +; CHECK-NEXT:    vloxei8.v v8, (a0), v16 +; CHECK-NEXT:    ret +entry: +  %a = call <vscale x 32 x i16> @llvm.riscv.vloxei.nxv32i16.nxv32i8( +    <vscale x 32 x i16> poison, +    ptr %0, +    <vscale x 32 x i8> %1, +    iXLen %2) + +  ret <vscale x 32 x i16> %a +} + +declare <vscale x 32 x i16> @llvm.riscv.vloxei.mask.nxv32i16.nxv32i8( +  <vscale x 32 x i16>, +  ptr, +  <vscale x 32 x i8>, +  <vscale x 32 x i1>, +  iXLen, +  iXLen); + +define <vscale x 32 x i16> @intrinsic_vloxei_mask_v_nxv32i16_nxv32i16_nxv32i8(<vscale x 32 x i16> %0, ptr %1, <vscale x 32 x i8> %2, <vscale x 32 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv32i16_nxv32i16_nxv32i8: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e16, m8, ta, mu +; CHECK-NEXT:    vloxei8.v v8, (a0), v16, v0.t +; CHECK-NEXT:    ret +entry: +  %a = call <vscale x 32 x i16> @llvm.riscv.vloxei.mask.nxv32i16.nxv32i8( +    <vscale x 32 x i16> %0, +    ptr %1, +    <vscale x 32 x i8> %2, +    <vscale x 32 x i1> %3, +    iXLen %4, iXLen 1) + +  ret <vscale x 32 x i16> %a +} + +declare <vscale x 1 x i32> @llvm.riscv.vloxei.nxv1i32.nxv1i8( 
+  <vscale x 1 x i32>, +  ptr, +  <vscale x 1 x i8>, +  iXLen); + +define <vscale x 1 x i32> @intrinsic_vloxei_v_nxv1i32_nxv1i32_nxv1i8(ptr %0, <vscale x 1 x i8> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv1i32_nxv1i32_nxv1i8: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e32, mf2, ta, ma +; CHECK-NEXT:    vloxei8.v v9, (a0), v8 +; CHECK-NEXT:    vmv1r.v v8, v9 +; CHECK-NEXT:    ret +entry: +  %a = call <vscale x 1 x i32> @llvm.riscv.vloxei.nxv1i32.nxv1i8( +    <vscale x 1 x i32> poison, +    ptr %0, +    <vscale x 1 x i8> %1, +    iXLen %2) + +  ret <vscale x 1 x i32> %a +} + +declare <vscale x 1 x i32> @llvm.riscv.vloxei.mask.nxv1i32.nxv1i8( +  <vscale x 1 x i32>, +  ptr, +  <vscale x 1 x i8>, +  <vscale x 1 x i1>, +  iXLen, +  iXLen); + +define <vscale x 1 x i32> @intrinsic_vloxei_mask_v_nxv1i32_nxv1i32_nxv1i8(<vscale x 1 x i32> %0, ptr %1, <vscale x 1 x i8> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv1i32_nxv1i32_nxv1i8: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e32, mf2, ta, mu +; CHECK-NEXT:    vloxei8.v v8, (a0), v9, v0.t +; CHECK-NEXT:    ret +entry: +  %a = call <vscale x 1 x i32> @llvm.riscv.vloxei.mask.nxv1i32.nxv1i8( +    <vscale x 1 x i32> %0, +    ptr %1, +    <vscale x 1 x i8> %2, +    <vscale x 1 x i1> %3, +    iXLen %4, iXLen 1) + +  ret <vscale x 1 x i32> %a +} + +declare <vscale x 2 x i32> @llvm.riscv.vloxei.nxv2i32.nxv2i8( +  <vscale x 2 x i32>, +  ptr, +  <vscale x 2 x i8>, +  iXLen); + +define <vscale x 2 x i32> @intrinsic_vloxei_v_nxv2i32_nxv2i32_nxv2i8(ptr %0, <vscale x 2 x i8> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv2i32_nxv2i32_nxv2i8: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e32, m1, ta, ma +; CHECK-NEXT:    vloxei8.v v9, (a0), v8 +; CHECK-NEXT:    vmv.v.v v8, v9 +; CHECK-NEXT:    ret +entry: +  %a = call <vscale x 2 x i32> @llvm.riscv.vloxei.nxv2i32.nxv2i8( +    <vscale x 2 x i32> poison, +    ptr %0, +    <vscale x 2 x i8> %1, +    iXLen %2) + +  ret <vscale x 2 x i32> %a +} + +declare <vscale x 2 x i32> @llvm.riscv.vloxei.mask.nxv2i32.nxv2i8( +  <vscale x 2 x i32>, +  ptr, +  <vscale x 2 x i8>, +  <vscale x 2 x i1>, +  iXLen, +  iXLen); + +define <vscale x 2 x i32> @intrinsic_vloxei_mask_v_nxv2i32_nxv2i32_nxv2i8(<vscale x 2 x i32> %0, ptr %1, <vscale x 2 x i8> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv2i32_nxv2i32_nxv2i8: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e32, m1, ta, mu +; CHECK-NEXT:    vloxei8.v v8, (a0), v9, v0.t +; CHECK-NEXT:    ret +entry: +  %a = call <vscale x 2 x i32> @llvm.riscv.vloxei.mask.nxv2i32.nxv2i8( +    <vscale x 2 x i32> %0, +    ptr %1, +    <vscale x 2 x i8> %2, +    <vscale x 2 x i1> %3, +    iXLen %4, iXLen 1) + +  ret <vscale x 2 x i32> %a +} + +declare <vscale x 4 x i32> @llvm.riscv.vloxei.nxv4i32.nxv4i8( +  <vscale x 4 x i32>, +  ptr, +  <vscale x 4 x i8>, +  iXLen); + +define <vscale x 4 x i32> @intrinsic_vloxei_v_nxv4i32_nxv4i32_nxv4i8(ptr %0, <vscale x 4 x i8> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv4i32_nxv4i32_nxv4i8: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetivli zero, 1, e8, m1, ta, ma +; CHECK-NEXT:    vmv1r.v v10, v8 +; CHECK-NEXT:    vsetvli zero, a1, e32, m2, ta, ma +; CHECK-NEXT:    vloxei8.v v8, (a0), v10 +; CHECK-NEXT:    ret +entry: +  %a = call <vscale x 4 x i32> @llvm.riscv.vloxei.nxv4i32.nxv4i8( +    <vscale x 4 x i32> poison, +    
ptr %0, +    <vscale x 4 x i8> %1, +    iXLen %2) + +  ret <vscale x 4 x i32> %a +} + +declare <vscale x 4 x i32> @llvm.riscv.vloxei.mask.nxv4i32.nxv4i8( +  <vscale x 4 x i32>, +  ptr, +  <vscale x 4 x i8>, +  <vscale x 4 x i1>, +  iXLen, +  iXLen); + +define <vscale x 4 x i32> @intrinsic_vloxei_mask_v_nxv4i32_nxv4i32_nxv4i8(<vscale x 4 x i32> %0, ptr %1, <vscale x 4 x i8> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv4i32_nxv4i32_nxv4i8: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e32, m2, ta, mu +; CHECK-NEXT:    vloxei8.v v8, (a0), v10, v0.t +; CHECK-NEXT:    ret +entry: +  %a = call <vscale x 4 x i32> @llvm.riscv.vloxei.mask.nxv4i32.nxv4i8( +    <vscale x 4 x i32> %0, +    ptr %1, +    <vscale x 4 x i8> %2, +    <vscale x 4 x i1> %3, +    iXLen %4, iXLen 1) + +  ret <vscale x 4 x i32> %a +} + +declare <vscale x 8 x i32> @llvm.riscv.vloxei.nxv8i32.nxv8i8( +  <vscale x 8 x i32>, +  ptr, +  <vscale x 8 x i8>, +  iXLen); + +define <vscale x 8 x i32> @intrinsic_vloxei_v_nxv8i32_nxv8i32_nxv8i8(ptr %0, <vscale x 8 x i8> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv8i32_nxv8i32_nxv8i8: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetivli zero, 1, e8, m1, ta, ma +; CHECK-NEXT:    vmv1r.v v12, v8 +; CHECK-NEXT:    vsetvli zero, a1, e32, m4, ta, ma +; CHECK-NEXT:    vloxei8.v v8, (a0), v12 +; CHECK-NEXT:    ret +entry: +  %a = call <vscale x 8 x i32> @llvm.riscv.vloxei.nxv8i32.nxv8i8( +    <vscale x 8 x i32> poison, +    ptr %0, +    <vscale x 8 x i8> %1, +    iXLen %2) + +  ret <vscale x 8 x i32> %a +} + +declare <vscale x 8 x i32> @llvm.riscv.vloxei.mask.nxv8i32.nxv8i8( +  <vscale x 8 x i32>, +  ptr, +  <vscale x 8 x i8>, +  <vscale x 8 x i1>, +  iXLen, +  iXLen); + +define <vscale x 8 x i32> @intrinsic_vloxei_mask_v_nxv8i32_nxv8i32_nxv8i8(<vscale x 8 x i32> %0, ptr %1, <vscale x 8 x i8> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv8i32_nxv8i32_nxv8i8: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e32, m4, ta, mu +; CHECK-NEXT:    vloxei8.v v8, (a0), v12, v0.t +; CHECK-NEXT:    ret +entry: +  %a = call <vscale x 8 x i32> @llvm.riscv.vloxei.mask.nxv8i32.nxv8i8( +    <vscale x 8 x i32> %0, +    ptr %1, +    <vscale x 8 x i8> %2, +    <vscale x 8 x i1> %3, +    iXLen %4, iXLen 1) + +  ret <vscale x 8 x i32> %a +} + +declare <vscale x 16 x i32> @llvm.riscv.vloxei.nxv16i32.nxv16i8( +  <vscale x 16 x i32>, +  ptr, +  <vscale x 16 x i8>, +  iXLen); + +define <vscale x 16 x i32> @intrinsic_vloxei_v_nxv16i32_nxv16i32_nxv16i8(ptr %0, <vscale x 16 x i8> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv16i32_nxv16i32_nxv16i8: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetivli zero, 1, e8, m1, ta, ma +; CHECK-NEXT:    vmv2r.v v16, v8 +; CHECK-NEXT:    vsetvli zero, a1, e32, m8, ta, ma +; CHECK-NEXT:    vloxei8.v v8, (a0), v16 +; CHECK-NEXT:    ret +entry: +  %a = call <vscale x 16 x i32> @llvm.riscv.vloxei.nxv16i32.nxv16i8( +    <vscale x 16 x i32> poison, +    ptr %0, +    <vscale x 16 x i8> %1, +    iXLen %2) + +  ret <vscale x 16 x i32> %a +} + +declare <vscale x 16 x i32> @llvm.riscv.vloxei.mask.nxv16i32.nxv16i8( +  <vscale x 16 x i32>, +  ptr, +  <vscale x 16 x i8>, +  <vscale x 16 x i1>, +  iXLen, +  iXLen); + +define <vscale x 16 x i32> @intrinsic_vloxei_mask_v_nxv16i32_nxv16i32_nxv16i8(<vscale x 16 x i32> %0, ptr %1, <vscale x 16 x i8> %2, <vscale x 16 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: 
intrinsic_vloxei_mask_v_nxv16i32_nxv16i32_nxv16i8: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e32, m8, ta, mu +; CHECK-NEXT:    vloxei8.v v8, (a0), v16, v0.t +; CHECK-NEXT:    ret +entry: +  %a = call <vscale x 16 x i32> @llvm.riscv.vloxei.mask.nxv16i32.nxv16i8( +    <vscale x 16 x i32> %0, +    ptr %1, +    <vscale x 16 x i8> %2, +    <vscale x 16 x i1> %3, +    iXLen %4, iXLen 1) + +  ret <vscale x 16 x i32> %a +} + +declare <vscale x 1 x i64> @llvm.riscv.vloxei.nxv1i64.nxv1i8( +  <vscale x 1 x i64>, +  ptr, +  <vscale x 1 x i8>, +  iXLen); + +define <vscale x 1 x i64> @intrinsic_vloxei_v_nxv1i64_nxv1i64_nxv1i8(ptr %0, <vscale x 1 x i8> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv1i64_nxv1i64_nxv1i8: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e64, m1, ta, ma +; CHECK-NEXT:    vloxei8.v v9, (a0), v8 +; CHECK-NEXT:    vmv.v.v v8, v9 +; CHECK-NEXT:    ret +entry: +  %a = call <vscale x 1 x i64> @llvm.riscv.vloxei.nxv1i64.nxv1i8( +    <vscale x 1 x i64> poison, +    ptr %0, +    <vscale x 1 x i8> %1, +    iXLen %2) + +  ret <vscale x 1 x i64> %a +} + +declare <vscale x 1 x i64> @llvm.riscv.vloxei.mask.nxv1i64.nxv1i8( +  <vscale x 1 x i64>, +  ptr, +  <vscale x 1 x i8>, +  <vscale x 1 x i1>, +  iXLen, +  iXLen); + +define <vscale x 1 x i64> @intrinsic_vloxei_mask_v_nxv1i64_nxv1i64_nxv1i8(<vscale x 1 x i64> %0, ptr %1, <vscale x 1 x i8> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv1i64_nxv1i64_nxv1i8: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e64, m1, ta, mu +; CHECK-NEXT:    vloxei8.v v8, (a0), v9, v0.t +; CHECK-NEXT:    ret +entry: +  %a = call <vscale x 1 x i64> @llvm.riscv.vloxei.mask.nxv1i64.nxv1i8( +    <vscale x 1 x i64> %0, +    ptr %1, +    <vscale x 1 x i8> %2, +    <vscale x 1 x i1> %3, +    iXLen %4, iXLen 1) + +  ret <vscale x 1 x i64> %a +} + +declare <vscale x 2 x i64> @llvm.riscv.vloxei.nxv2i64.nxv2i8( +  <vscale x 2 x i64>, +  ptr, +  <vscale x 2 x i8>, +  iXLen); + +define <vscale x 2 x i64> @intrinsic_vloxei_v_nxv2i64_nxv2i64_nxv2i8(ptr %0, <vscale x 2 x i8> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv2i64_nxv2i64_nxv2i8: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetivli zero, 1, e8, m1, ta, ma +; CHECK-NEXT:    vmv1r.v v10, v8 +; CHECK-NEXT:    vsetvli zero, a1, e64, m2, ta, ma +; CHECK-NEXT:    vloxei8.v v8, (a0), v10 +; CHECK-NEXT:    ret +entry: +  %a = call <vscale x 2 x i64> @llvm.riscv.vloxei.nxv2i64.nxv2i8( +    <vscale x 2 x i64> poison, +    ptr %0, +    <vscale x 2 x i8> %1, +    iXLen %2) + +  ret <vscale x 2 x i64> %a +} + +declare <vscale x 2 x i64> @llvm.riscv.vloxei.mask.nxv2i64.nxv2i8( +  <vscale x 2 x i64>, +  ptr, +  <vscale x 2 x i8>, +  <vscale x 2 x i1>, +  iXLen, +  iXLen); + +define <vscale x 2 x i64> @intrinsic_vloxei_mask_v_nxv2i64_nxv2i64_nxv2i8(<vscale x 2 x i64> %0, ptr %1, <vscale x 2 x i8> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv2i64_nxv2i64_nxv2i8: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e64, m2, ta, mu +; CHECK-NEXT:    vloxei8.v v8, (a0), v10, v0.t +; CHECK-NEXT:    ret +entry: +  %a = call <vscale x 2 x i64> @llvm.riscv.vloxei.mask.nxv2i64.nxv2i8( +    <vscale x 2 x i64> %0, +    ptr %1, +    <vscale x 2 x i8> %2, +    <vscale x 2 x i1> %3, +    iXLen %4, iXLen 1) + +  ret <vscale x 2 x i64> %a +} + +declare <vscale x 4 x i64> @llvm.riscv.vloxei.nxv4i64.nxv4i8( +  <vscale x 4 x i64>, +  
ptr, +  <vscale x 4 x i8>, +  iXLen); + +define <vscale x 4 x i64> @intrinsic_vloxei_v_nxv4i64_nxv4i64_nxv4i8(ptr %0, <vscale x 4 x i8> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv4i64_nxv4i64_nxv4i8: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetivli zero, 1, e8, m1, ta, ma +; CHECK-NEXT:    vmv1r.v v12, v8 +; CHECK-NEXT:    vsetvli zero, a1, e64, m4, ta, ma +; CHECK-NEXT:    vloxei8.v v8, (a0), v12 +; CHECK-NEXT:    ret +entry: +  %a = call <vscale x 4 x i64> @llvm.riscv.vloxei.nxv4i64.nxv4i8( +    <vscale x 4 x i64> poison, +    ptr %0, +    <vscale x 4 x i8> %1, +    iXLen %2) + +  ret <vscale x 4 x i64> %a +} + +declare <vscale x 4 x i64> @llvm.riscv.vloxei.mask.nxv4i64.nxv4i8( +  <vscale x 4 x i64>, +  ptr, +  <vscale x 4 x i8>, +  <vscale x 4 x i1>, +  iXLen, +  iXLen); + +define <vscale x 4 x i64> @intrinsic_vloxei_mask_v_nxv4i64_nxv4i64_nxv4i8(<vscale x 4 x i64> %0, ptr %1, <vscale x 4 x i8> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv4i64_nxv4i64_nxv4i8: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e64, m4, ta, mu +; CHECK-NEXT:    vloxei8.v v8, (a0), v12, v0.t +; CHECK-NEXT:    ret +entry: +  %a = call <vscale x 4 x i64> @llvm.riscv.vloxei.mask.nxv4i64.nxv4i8( +    <vscale x 4 x i64> %0, +    ptr %1, +    <vscale x 4 x i8> %2, +    <vscale x 4 x i1> %3, +    iXLen %4, iXLen 1) + +  ret <vscale x 4 x i64> %a +} + +declare <vscale x 8 x i64> @llvm.riscv.vloxei.nxv8i64.nxv8i8( +  <vscale x 8 x i64>, +  ptr, +  <vscale x 8 x i8>, +  iXLen); + +define <vscale x 8 x i64> @intrinsic_vloxei_v_nxv8i64_nxv8i64_nxv8i8(ptr %0, <vscale x 8 x i8> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv8i64_nxv8i64_nxv8i8: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetivli zero, 1, e8, m1, ta, ma +; CHECK-NEXT:    vmv1r.v v16, v8 +; CHECK-NEXT:    vsetvli zero, a1, e64, m8, ta, ma +; CHECK-NEXT:    vloxei8.v v8, (a0), v16 +; CHECK-NEXT:    ret +entry: +  %a = call <vscale x 8 x i64> @llvm.riscv.vloxei.nxv8i64.nxv8i8( +    <vscale x 8 x i64> poison, +    ptr %0, +    <vscale x 8 x i8> %1, +    iXLen %2) + +  ret <vscale x 8 x i64> %a +} + +declare <vscale x 8 x i64> @llvm.riscv.vloxei.mask.nxv8i64.nxv8i8( +  <vscale x 8 x i64>, +  ptr, +  <vscale x 8 x i8>, +  <vscale x 8 x i1>, +  iXLen, +  iXLen); + +define <vscale x 8 x i64> @intrinsic_vloxei_mask_v_nxv8i64_nxv8i64_nxv8i8(<vscale x 8 x i64> %0, ptr %1, <vscale x 8 x i8> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv8i64_nxv8i64_nxv8i8: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e64, m8, ta, mu +; CHECK-NEXT:    vloxei8.v v8, (a0), v16, v0.t +; CHECK-NEXT:    ret +entry: +  %a = call <vscale x 8 x i64> @llvm.riscv.vloxei.mask.nxv8i64.nxv8i8( +    <vscale x 8 x i64> %0, +    ptr %1, +    <vscale x 8 x i8> %2, +    <vscale x 8 x i1> %3, +    iXLen %4, iXLen 1) + +  ret <vscale x 8 x i64> %a +} + +declare <vscale x 1 x half> @llvm.riscv.vloxei.nxv1f16.nxv1i8( +  <vscale x 1 x half>, +  ptr, +  <vscale x 1 x i8>, +  iXLen); + +define <vscale x 1 x half> @intrinsic_vloxei_v_nxv1f16_nxv1f16_nxv1i8(ptr %0, <vscale x 1 x i8> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv1f16_nxv1f16_nxv1i8: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e16, mf4, ta, ma +; CHECK-NEXT:    vloxei8.v v9, (a0), v8 +; CHECK-NEXT:    vmv1r.v v8, v9 +; CHECK-NEXT:    ret +entry: +  %a = call <vscale x 1 x half> @llvm.riscv.vloxei.nxv1f16.nxv1i8( +    
<vscale x 1 x half> poison, +    ptr %0, +    <vscale x 1 x i8> %1, +    iXLen %2) + +  ret <vscale x 1 x half> %a +} + +declare <vscale x 1 x half> @llvm.riscv.vloxei.mask.nxv1f16.nxv1i8( +  <vscale x 1 x half>, +  ptr, +  <vscale x 1 x i8>, +  <vscale x 1 x i1>, +  iXLen, +  iXLen); + +define <vscale x 1 x half> @intrinsic_vloxei_mask_v_nxv1f16_nxv1f16_nxv1i8(<vscale x 1 x half> %0, ptr %1, <vscale x 1 x i8> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv1f16_nxv1f16_nxv1i8: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e16, mf4, ta, mu +; CHECK-NEXT:    vloxei8.v v8, (a0), v9, v0.t +; CHECK-NEXT:    ret +entry: +  %a = call <vscale x 1 x half> @llvm.riscv.vloxei.mask.nxv1f16.nxv1i8( +    <vscale x 1 x half> %0, +    ptr %1, +    <vscale x 1 x i8> %2, +    <vscale x 1 x i1> %3, +    iXLen %4, iXLen 1) + +  ret <vscale x 1 x half> %a +} + +declare <vscale x 2 x half> @llvm.riscv.vloxei.nxv2f16.nxv2i8( +  <vscale x 2 x half>, +  ptr, +  <vscale x 2 x i8>, +  iXLen); + +define <vscale x 2 x half> @intrinsic_vloxei_v_nxv2f16_nxv2f16_nxv2i8(ptr %0, <vscale x 2 x i8> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv2f16_nxv2f16_nxv2i8: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e16, mf2, ta, ma +; CHECK-NEXT:    vloxei8.v v9, (a0), v8 +; CHECK-NEXT:    vmv1r.v v8, v9 +; CHECK-NEXT:    ret +entry: +  %a = call <vscale x 2 x half> @llvm.riscv.vloxei.nxv2f16.nxv2i8( +    <vscale x 2 x half> poison, +    ptr %0, +    <vscale x 2 x i8> %1, +    iXLen %2) + +  ret <vscale x 2 x half> %a +} + +declare <vscale x 2 x half> @llvm.riscv.vloxei.mask.nxv2f16.nxv2i8( +  <vscale x 2 x half>, +  ptr, +  <vscale x 2 x i8>, +  <vscale x 2 x i1>, +  iXLen, +  iXLen); + +define <vscale x 2 x half> @intrinsic_vloxei_mask_v_nxv2f16_nxv2f16_nxv2i8(<vscale x 2 x half> %0, ptr %1, <vscale x 2 x i8> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv2f16_nxv2f16_nxv2i8: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e16, mf2, ta, mu +; CHECK-NEXT:    vloxei8.v v8, (a0), v9, v0.t +; CHECK-NEXT:    ret +entry: +  %a = call <vscale x 2 x half> @llvm.riscv.vloxei.mask.nxv2f16.nxv2i8( +    <vscale x 2 x half> %0, +    ptr %1, +    <vscale x 2 x i8> %2, +    <vscale x 2 x i1> %3, +    iXLen %4, iXLen 1) + +  ret <vscale x 2 x half> %a +} + +declare <vscale x 4 x half> @llvm.riscv.vloxei.nxv4f16.nxv4i8( +  <vscale x 4 x half>, +  ptr, +  <vscale x 4 x i8>, +  iXLen); + +define <vscale x 4 x half> @intrinsic_vloxei_v_nxv4f16_nxv4f16_nxv4i8(ptr %0, <vscale x 4 x i8> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv4f16_nxv4f16_nxv4i8: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e16, m1, ta, ma +; CHECK-NEXT:    vloxei8.v v9, (a0), v8 +; CHECK-NEXT:    vmv.v.v v8, v9 +; CHECK-NEXT:    ret +entry: +  %a = call <vscale x 4 x half> @llvm.riscv.vloxei.nxv4f16.nxv4i8( +    <vscale x 4 x half> poison, +    ptr %0, +    <vscale x 4 x i8> %1, +    iXLen %2) + +  ret <vscale x 4 x half> %a +} + +declare <vscale x 4 x half> @llvm.riscv.vloxei.mask.nxv4f16.nxv4i8( +  <vscale x 4 x half>, +  ptr, +  <vscale x 4 x i8>, +  <vscale x 4 x i1>, +  iXLen, +  iXLen); + +define <vscale x 4 x half> @intrinsic_vloxei_mask_v_nxv4f16_nxv4f16_nxv4i8(<vscale x 4 x half> %0, ptr %1, <vscale x 4 x i8> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv4f16_nxv4f16_nxv4i8: +; CHECK:       # %bb.0: # %entry +; 
CHECK-NEXT:    vsetvli zero, a1, e16, m1, ta, mu +; CHECK-NEXT:    vloxei8.v v8, (a0), v9, v0.t +; CHECK-NEXT:    ret +entry: +  %a = call <vscale x 4 x half> @llvm.riscv.vloxei.mask.nxv4f16.nxv4i8( +    <vscale x 4 x half> %0, +    ptr %1, +    <vscale x 4 x i8> %2, +    <vscale x 4 x i1> %3, +    iXLen %4, iXLen 1) + +  ret <vscale x 4 x half> %a +} + +declare <vscale x 8 x half> @llvm.riscv.vloxei.nxv8f16.nxv8i8( +  <vscale x 8 x half>, +  ptr, +  <vscale x 8 x i8>, +  iXLen); + +define <vscale x 8 x half> @intrinsic_vloxei_v_nxv8f16_nxv8f16_nxv8i8(ptr %0, <vscale x 8 x i8> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv8f16_nxv8f16_nxv8i8: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetivli zero, 1, e8, m1, ta, ma +; CHECK-NEXT:    vmv1r.v v10, v8 +; CHECK-NEXT:    vsetvli zero, a1, e16, m2, ta, ma +; CHECK-NEXT:    vloxei8.v v8, (a0), v10 +; CHECK-NEXT:    ret +entry: +  %a = call <vscale x 8 x half> @llvm.riscv.vloxei.nxv8f16.nxv8i8( +    <vscale x 8 x half> poison, +    ptr %0, +    <vscale x 8 x i8> %1, +    iXLen %2) + +  ret <vscale x 8 x half> %a +} + +declare <vscale x 8 x half> @llvm.riscv.vloxei.mask.nxv8f16.nxv8i8( +  <vscale x 8 x half>, +  ptr, +  <vscale x 8 x i8>, +  <vscale x 8 x i1>, +  iXLen, +  iXLen); + +define <vscale x 8 x half> @intrinsic_vloxei_mask_v_nxv8f16_nxv8f16_nxv8i8(<vscale x 8 x half> %0, ptr %1, <vscale x 8 x i8> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv8f16_nxv8f16_nxv8i8: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e16, m2, ta, mu +; CHECK-NEXT:    vloxei8.v v8, (a0), v10, v0.t +; CHECK-NEXT:    ret +entry: +  %a = call <vscale x 8 x half> @llvm.riscv.vloxei.mask.nxv8f16.nxv8i8( +    <vscale x 8 x half> %0, +    ptr %1, +    <vscale x 8 x i8> %2, +    <vscale x 8 x i1> %3, +    iXLen %4, iXLen 1) + +  ret <vscale x 8 x half> %a +} + +declare <vscale x 16 x half> @llvm.riscv.vloxei.nxv16f16.nxv16i8( +  <vscale x 16 x half>, +  ptr, +  <vscale x 16 x i8>, +  iXLen); + +define <vscale x 16 x half> @intrinsic_vloxei_v_nxv16f16_nxv16f16_nxv16i8(ptr %0, <vscale x 16 x i8> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv16f16_nxv16f16_nxv16i8: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetivli zero, 1, e8, m1, ta, ma +; CHECK-NEXT:    vmv2r.v v12, v8 +; CHECK-NEXT:    vsetvli zero, a1, e16, m4, ta, ma +; CHECK-NEXT:    vloxei8.v v8, (a0), v12 +; CHECK-NEXT:    ret +entry: +  %a = call <vscale x 16 x half> @llvm.riscv.vloxei.nxv16f16.nxv16i8( +    <vscale x 16 x half> poison, +    ptr %0, +    <vscale x 16 x i8> %1, +    iXLen %2) + +  ret <vscale x 16 x half> %a +} + +declare <vscale x 16 x half> @llvm.riscv.vloxei.mask.nxv16f16.nxv16i8( +  <vscale x 16 x half>, +  ptr, +  <vscale x 16 x i8>, +  <vscale x 16 x i1>, +  iXLen, +  iXLen); + +define <vscale x 16 x half> @intrinsic_vloxei_mask_v_nxv16f16_nxv16f16_nxv16i8(<vscale x 16 x half> %0, ptr %1, <vscale x 16 x i8> %2, <vscale x 16 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv16f16_nxv16f16_nxv16i8: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e16, m4, ta, mu +; CHECK-NEXT:    vloxei8.v v8, (a0), v12, v0.t +; CHECK-NEXT:    ret +entry: +  %a = call <vscale x 16 x half> @llvm.riscv.vloxei.mask.nxv16f16.nxv16i8( +    <vscale x 16 x half> %0, +    ptr %1, +    <vscale x 16 x i8> %2, +    <vscale x 16 x i1> %3, +    iXLen %4, iXLen 1) + +  ret <vscale x 16 x half> %a +} + +declare <vscale x 32 x half> 
@llvm.riscv.vloxei.nxv32f16.nxv32i8( +  <vscale x 32 x half>, +  ptr, +  <vscale x 32 x i8>, +  iXLen); + +define <vscale x 32 x half> @intrinsic_vloxei_v_nxv32f16_nxv32f16_nxv32i8(ptr %0, <vscale x 32 x i8> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv32f16_nxv32f16_nxv32i8: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetivli zero, 1, e8, m1, ta, ma +; CHECK-NEXT:    vmv4r.v v16, v8 +; CHECK-NEXT:    vsetvli zero, a1, e16, m8, ta, ma +; CHECK-NEXT:    vloxei8.v v8, (a0), v16 +; CHECK-NEXT:    ret +entry: +  %a = call <vscale x 32 x half> @llvm.riscv.vloxei.nxv32f16.nxv32i8( +    <vscale x 32 x half> poison, +    ptr %0, +    <vscale x 32 x i8> %1, +    iXLen %2) + +  ret <vscale x 32 x half> %a +} + +declare <vscale x 32 x half> @llvm.riscv.vloxei.mask.nxv32f16.nxv32i8( +  <vscale x 32 x half>, +  ptr, +  <vscale x 32 x i8>, +  <vscale x 32 x i1>, +  iXLen, +  iXLen); + +define <vscale x 32 x half> @intrinsic_vloxei_mask_v_nxv32f16_nxv32f16_nxv32i8(<vscale x 32 x half> %0, ptr %1, <vscale x 32 x i8> %2, <vscale x 32 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv32f16_nxv32f16_nxv32i8: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e16, m8, ta, mu +; CHECK-NEXT:    vloxei8.v v8, (a0), v16, v0.t +; CHECK-NEXT:    ret +entry: +  %a = call <vscale x 32 x half> @llvm.riscv.vloxei.mask.nxv32f16.nxv32i8( +    <vscale x 32 x half> %0, +    ptr %1, +    <vscale x 32 x i8> %2, +    <vscale x 32 x i1> %3, +    iXLen %4, iXLen 1) + +  ret <vscale x 32 x half> %a +} + +declare <vscale x 1 x float> @llvm.riscv.vloxei.nxv1f32.nxv1i8( +  <vscale x 1 x float>, +  ptr, +  <vscale x 1 x i8>, +  iXLen); + +define <vscale x 1 x float> @intrinsic_vloxei_v_nxv1f32_nxv1f32_nxv1i8(ptr %0, <vscale x 1 x i8> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv1f32_nxv1f32_nxv1i8: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e32, mf2, ta, ma +; CHECK-NEXT:    vloxei8.v v9, (a0), v8 +; CHECK-NEXT:    vmv1r.v v8, v9 +; CHECK-NEXT:    ret +entry: +  %a = call <vscale x 1 x float> @llvm.riscv.vloxei.nxv1f32.nxv1i8( +    <vscale x 1 x float> poison, +    ptr %0, +    <vscale x 1 x i8> %1, +    iXLen %2) + +  ret <vscale x 1 x float> %a +} + +declare <vscale x 1 x float> @llvm.riscv.vloxei.mask.nxv1f32.nxv1i8( +  <vscale x 1 x float>, +  ptr, +  <vscale x 1 x i8>, +  <vscale x 1 x i1>, +  iXLen, +  iXLen); + +define <vscale x 1 x float> @intrinsic_vloxei_mask_v_nxv1f32_nxv1f32_nxv1i8(<vscale x 1 x float> %0, ptr %1, <vscale x 1 x i8> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv1f32_nxv1f32_nxv1i8: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e32, mf2, ta, mu +; CHECK-NEXT:    vloxei8.v v8, (a0), v9, v0.t +; CHECK-NEXT:    ret +entry: +  %a = call <vscale x 1 x float> @llvm.riscv.vloxei.mask.nxv1f32.nxv1i8( +    <vscale x 1 x float> %0, +    ptr %1, +    <vscale x 1 x i8> %2, +    <vscale x 1 x i1> %3, +    iXLen %4, iXLen 1) + +  ret <vscale x 1 x float> %a +} + +declare <vscale x 2 x float> @llvm.riscv.vloxei.nxv2f32.nxv2i8( +  <vscale x 2 x float>, +  ptr, +  <vscale x 2 x i8>, +  iXLen); + +define <vscale x 2 x float> @intrinsic_vloxei_v_nxv2f32_nxv2f32_nxv2i8(ptr %0, <vscale x 2 x i8> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv2f32_nxv2f32_nxv2i8: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e32, m1, ta, ma +; CHECK-NEXT:    vloxei8.v v9, (a0), v8 +; CHECK-NEXT:    vmv.v.v v8, v9 +; 
CHECK-NEXT:    ret +entry: +  %a = call <vscale x 2 x float> @llvm.riscv.vloxei.nxv2f32.nxv2i8( +    <vscale x 2 x float> poison, +    ptr %0, +    <vscale x 2 x i8> %1, +    iXLen %2) + +  ret <vscale x 2 x float> %a +} + +declare <vscale x 2 x float> @llvm.riscv.vloxei.mask.nxv2f32.nxv2i8( +  <vscale x 2 x float>, +  ptr, +  <vscale x 2 x i8>, +  <vscale x 2 x i1>, +  iXLen, +  iXLen); + +define <vscale x 2 x float> @intrinsic_vloxei_mask_v_nxv2f32_nxv2f32_nxv2i8(<vscale x 2 x float> %0, ptr %1, <vscale x 2 x i8> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv2f32_nxv2f32_nxv2i8: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e32, m1, ta, mu +; CHECK-NEXT:    vloxei8.v v8, (a0), v9, v0.t +; CHECK-NEXT:    ret +entry: +  %a = call <vscale x 2 x float> @llvm.riscv.vloxei.mask.nxv2f32.nxv2i8( +    <vscale x 2 x float> %0, +    ptr %1, +    <vscale x 2 x i8> %2, +    <vscale x 2 x i1> %3, +    iXLen %4, iXLen 1) + +  ret <vscale x 2 x float> %a +} + +declare <vscale x 4 x float> @llvm.riscv.vloxei.nxv4f32.nxv4i8( +  <vscale x 4 x float>, +  ptr, +  <vscale x 4 x i8>, +  iXLen); + +define <vscale x 4 x float> @intrinsic_vloxei_v_nxv4f32_nxv4f32_nxv4i8(ptr %0, <vscale x 4 x i8> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv4f32_nxv4f32_nxv4i8: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetivli zero, 1, e8, m1, ta, ma +; CHECK-NEXT:    vmv1r.v v10, v8 +; CHECK-NEXT:    vsetvli zero, a1, e32, m2, ta, ma +; CHECK-NEXT:    vloxei8.v v8, (a0), v10 +; CHECK-NEXT:    ret +entry: +  %a = call <vscale x 4 x float> @llvm.riscv.vloxei.nxv4f32.nxv4i8( +    <vscale x 4 x float> poison, +    ptr %0, +    <vscale x 4 x i8> %1, +    iXLen %2) + +  ret <vscale x 4 x float> %a +} + +declare <vscale x 4 x float> @llvm.riscv.vloxei.mask.nxv4f32.nxv4i8( +  <vscale x 4 x float>, +  ptr, +  <vscale x 4 x i8>, +  <vscale x 4 x i1>, +  iXLen, +  iXLen); + +define <vscale x 4 x float> @intrinsic_vloxei_mask_v_nxv4f32_nxv4f32_nxv4i8(<vscale x 4 x float> %0, ptr %1, <vscale x 4 x i8> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv4f32_nxv4f32_nxv4i8: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e32, m2, ta, mu +; CHECK-NEXT:    vloxei8.v v8, (a0), v10, v0.t +; CHECK-NEXT:    ret +entry: +  %a = call <vscale x 4 x float> @llvm.riscv.vloxei.mask.nxv4f32.nxv4i8( +    <vscale x 4 x float> %0, +    ptr %1, +    <vscale x 4 x i8> %2, +    <vscale x 4 x i1> %3, +    iXLen %4, iXLen 1) + +  ret <vscale x 4 x float> %a +} + +declare <vscale x 8 x float> @llvm.riscv.vloxei.nxv8f32.nxv8i8( +  <vscale x 8 x float>, +  ptr, +  <vscale x 8 x i8>, +  iXLen); + +define <vscale x 8 x float> @intrinsic_vloxei_v_nxv8f32_nxv8f32_nxv8i8(ptr %0, <vscale x 8 x i8> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv8f32_nxv8f32_nxv8i8: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetivli zero, 1, e8, m1, ta, ma +; CHECK-NEXT:    vmv1r.v v12, v8 +; CHECK-NEXT:    vsetvli zero, a1, e32, m4, ta, ma +; CHECK-NEXT:    vloxei8.v v8, (a0), v12 +; CHECK-NEXT:    ret +entry: +  %a = call <vscale x 8 x float> @llvm.riscv.vloxei.nxv8f32.nxv8i8( +    <vscale x 8 x float> poison, +    ptr %0, +    <vscale x 8 x i8> %1, +    iXLen %2) + +  ret <vscale x 8 x float> %a +} + +declare <vscale x 8 x float> @llvm.riscv.vloxei.mask.nxv8f32.nxv8i8( +  <vscale x 8 x float>, +  ptr, +  <vscale x 8 x i8>, +  <vscale x 8 x i1>, +  iXLen, +  iXLen); + +define <vscale x 8 x float> 
@intrinsic_vloxei_mask_v_nxv8f32_nxv8f32_nxv8i8(<vscale x 8 x float> %0, ptr %1, <vscale x 8 x i8> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv8f32_nxv8f32_nxv8i8: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e32, m4, ta, mu +; CHECK-NEXT:    vloxei8.v v8, (a0), v12, v0.t +; CHECK-NEXT:    ret +entry: +  %a = call <vscale x 8 x float> @llvm.riscv.vloxei.mask.nxv8f32.nxv8i8( +    <vscale x 8 x float> %0, +    ptr %1, +    <vscale x 8 x i8> %2, +    <vscale x 8 x i1> %3, +    iXLen %4, iXLen 1) + +  ret <vscale x 8 x float> %a +} + +declare <vscale x 16 x float> @llvm.riscv.vloxei.nxv16f32.nxv16i8( +  <vscale x 16 x float>, +  ptr, +  <vscale x 16 x i8>, +  iXLen); + +define <vscale x 16 x float> @intrinsic_vloxei_v_nxv16f32_nxv16f32_nxv16i8(ptr %0, <vscale x 16 x i8> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv16f32_nxv16f32_nxv16i8: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetivli zero, 1, e8, m1, ta, ma +; CHECK-NEXT:    vmv2r.v v16, v8 +; CHECK-NEXT:    vsetvli zero, a1, e32, m8, ta, ma +; CHECK-NEXT:    vloxei8.v v8, (a0), v16 +; CHECK-NEXT:    ret +entry: +  %a = call <vscale x 16 x float> @llvm.riscv.vloxei.nxv16f32.nxv16i8( +    <vscale x 16 x float> poison, +    ptr %0, +    <vscale x 16 x i8> %1, +    iXLen %2) + +  ret <vscale x 16 x float> %a +} + +declare <vscale x 16 x float> @llvm.riscv.vloxei.mask.nxv16f32.nxv16i8( +  <vscale x 16 x float>, +  ptr, +  <vscale x 16 x i8>, +  <vscale x 16 x i1>, +  iXLen, +  iXLen); + +define <vscale x 16 x float> @intrinsic_vloxei_mask_v_nxv16f32_nxv16f32_nxv16i8(<vscale x 16 x float> %0, ptr %1, <vscale x 16 x i8> %2, <vscale x 16 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv16f32_nxv16f32_nxv16i8: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e32, m8, ta, mu +; CHECK-NEXT:    vloxei8.v v8, (a0), v16, v0.t +; CHECK-NEXT:    ret +entry: +  %a = call <vscale x 16 x float> @llvm.riscv.vloxei.mask.nxv16f32.nxv16i8( +    <vscale x 16 x float> %0, +    ptr %1, +    <vscale x 16 x i8> %2, +    <vscale x 16 x i1> %3, +    iXLen %4, iXLen 1) + +  ret <vscale x 16 x float> %a +} + +declare <vscale x 1 x double> @llvm.riscv.vloxei.nxv1f64.nxv1i8( +  <vscale x 1 x double>, +  ptr, +  <vscale x 1 x i8>, +  iXLen); + +define <vscale x 1 x double> @intrinsic_vloxei_v_nxv1f64_nxv1f64_nxv1i8(ptr %0, <vscale x 1 x i8> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv1f64_nxv1f64_nxv1i8: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e64, m1, ta, ma +; CHECK-NEXT:    vloxei8.v v9, (a0), v8 +; CHECK-NEXT:    vmv.v.v v8, v9 +; CHECK-NEXT:    ret +entry: +  %a = call <vscale x 1 x double> @llvm.riscv.vloxei.nxv1f64.nxv1i8( +    <vscale x 1 x double> poison, +    ptr %0, +    <vscale x 1 x i8> %1, +    iXLen %2) + +  ret <vscale x 1 x double> %a +} + +declare <vscale x 1 x double> @llvm.riscv.vloxei.mask.nxv1f64.nxv1i8( +  <vscale x 1 x double>, +  ptr, +  <vscale x 1 x i8>, +  <vscale x 1 x i1>, +  iXLen, +  iXLen); + +define <vscale x 1 x double> @intrinsic_vloxei_mask_v_nxv1f64_nxv1f64_nxv1i8(<vscale x 1 x double> %0, ptr %1, <vscale x 1 x i8> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv1f64_nxv1f64_nxv1i8: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e64, m1, ta, mu +; CHECK-NEXT:    vloxei8.v v8, (a0), v9, v0.t +; CHECK-NEXT:    ret +entry: +  %a = call <vscale x 1 x double> 
@llvm.riscv.vloxei.mask.nxv1f64.nxv1i8( +    <vscale x 1 x double> %0, +    ptr %1, +    <vscale x 1 x i8> %2, +    <vscale x 1 x i1> %3, +    iXLen %4, iXLen 1) + +  ret <vscale x 1 x double> %a +} + +declare <vscale x 2 x double> @llvm.riscv.vloxei.nxv2f64.nxv2i8( +  <vscale x 2 x double>, +  ptr, +  <vscale x 2 x i8>, +  iXLen); + +define <vscale x 2 x double> @intrinsic_vloxei_v_nxv2f64_nxv2f64_nxv2i8(ptr %0, <vscale x 2 x i8> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv2f64_nxv2f64_nxv2i8: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetivli zero, 1, e8, m1, ta, ma +; CHECK-NEXT:    vmv1r.v v10, v8 +; CHECK-NEXT:    vsetvli zero, a1, e64, m2, ta, ma +; CHECK-NEXT:    vloxei8.v v8, (a0), v10 +; CHECK-NEXT:    ret +entry: +  %a = call <vscale x 2 x double> @llvm.riscv.vloxei.nxv2f64.nxv2i8( +    <vscale x 2 x double> poison, +    ptr %0, +    <vscale x 2 x i8> %1, +    iXLen %2) + +  ret <vscale x 2 x double> %a +} + +declare <vscale x 2 x double> @llvm.riscv.vloxei.mask.nxv2f64.nxv2i8( +  <vscale x 2 x double>, +  ptr, +  <vscale x 2 x i8>, +  <vscale x 2 x i1>, +  iXLen, +  iXLen); + +define <vscale x 2 x double> @intrinsic_vloxei_mask_v_nxv2f64_nxv2f64_nxv2i8(<vscale x 2 x double> %0, ptr %1, <vscale x 2 x i8> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv2f64_nxv2f64_nxv2i8: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e64, m2, ta, mu +; CHECK-NEXT:    vloxei8.v v8, (a0), v10, v0.t +; CHECK-NEXT:    ret +entry: +  %a = call <vscale x 2 x double> @llvm.riscv.vloxei.mask.nxv2f64.nxv2i8( +    <vscale x 2 x double> %0, +    ptr %1, +    <vscale x 2 x i8> %2, +    <vscale x 2 x i1> %3, +    iXLen %4, iXLen 1) + +  ret <vscale x 2 x double> %a +} + +declare <vscale x 4 x double> @llvm.riscv.vloxei.nxv4f64.nxv4i8( +  <vscale x 4 x double>, +  ptr, +  <vscale x 4 x i8>, +  iXLen); + +define <vscale x 4 x double> @intrinsic_vloxei_v_nxv4f64_nxv4f64_nxv4i8(ptr %0, <vscale x 4 x i8> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv4f64_nxv4f64_nxv4i8: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetivli zero, 1, e8, m1, ta, ma +; CHECK-NEXT:    vmv1r.v v12, v8 +; CHECK-NEXT:    vsetvli zero, a1, e64, m4, ta, ma +; CHECK-NEXT:    vloxei8.v v8, (a0), v12 +; CHECK-NEXT:    ret +entry: +  %a = call <vscale x 4 x double> @llvm.riscv.vloxei.nxv4f64.nxv4i8( +    <vscale x 4 x double> poison, +    ptr %0, +    <vscale x 4 x i8> %1, +    iXLen %2) + +  ret <vscale x 4 x double> %a +} + +declare <vscale x 4 x double> @llvm.riscv.vloxei.mask.nxv4f64.nxv4i8( +  <vscale x 4 x double>, +  ptr, +  <vscale x 4 x i8>, +  <vscale x 4 x i1>, +  iXLen, +  iXLen); + +define <vscale x 4 x double> @intrinsic_vloxei_mask_v_nxv4f64_nxv4f64_nxv4i8(<vscale x 4 x double> %0, ptr %1, <vscale x 4 x i8> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv4f64_nxv4f64_nxv4i8: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e64, m4, ta, mu +; CHECK-NEXT:    vloxei8.v v8, (a0), v12, v0.t +; CHECK-NEXT:    ret +entry: +  %a = call <vscale x 4 x double> @llvm.riscv.vloxei.mask.nxv4f64.nxv4i8( +    <vscale x 4 x double> %0, +    ptr %1, +    <vscale x 4 x i8> %2, +    <vscale x 4 x i1> %3, +    iXLen %4, iXLen 1) + +  ret <vscale x 4 x double> %a +} + +declare <vscale x 8 x double> @llvm.riscv.vloxei.nxv8f64.nxv8i8( +  <vscale x 8 x double>, +  ptr, +  <vscale x 8 x i8>, +  iXLen); + +define <vscale x 8 x double> 
@intrinsic_vloxei_v_nxv8f64_nxv8f64_nxv8i8(ptr %0, <vscale x 8 x i8> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vloxei_v_nxv8f64_nxv8f64_nxv8i8: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetivli zero, 1, e8, m1, ta, ma +; CHECK-NEXT:    vmv1r.v v16, v8 +; CHECK-NEXT:    vsetvli zero, a1, e64, m8, ta, ma +; CHECK-NEXT:    vloxei8.v v8, (a0), v16 +; CHECK-NEXT:    ret +entry: +  %a = call <vscale x 8 x double> @llvm.riscv.vloxei.nxv8f64.nxv8i8( +    <vscale x 8 x double> poison, +    ptr %0, +    <vscale x 8 x i8> %1, +    iXLen %2) + +  ret <vscale x 8 x double> %a +} + +declare <vscale x 8 x double> @llvm.riscv.vloxei.mask.nxv8f64.nxv8i8( +  <vscale x 8 x double>, +  ptr, +  <vscale x 8 x i8>, +  <vscale x 8 x i1>, +  iXLen, +  iXLen); + +define <vscale x 8 x double> @intrinsic_vloxei_mask_v_nxv8f64_nxv8f64_nxv8i8(<vscale x 8 x double> %0, ptr %1, <vscale x 8 x i8> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vloxei_mask_v_nxv8f64_nxv8f64_nxv8i8: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e64, m8, ta, mu +; CHECK-NEXT:    vloxei8.v v8, (a0), v16, v0.t +; CHECK-NEXT:    ret +entry: +  %a = call <vscale x 8 x double> @llvm.riscv.vloxei.mask.nxv8f64.nxv8i8( +    <vscale x 8 x double> %0, +    ptr %1, +    <vscale x 8 x i8> %2, +    <vscale x 8 x i1> %3, +    iXLen %4, iXLen 1) + +  ret <vscale x 8 x double> %a +} diff --git a/llvm/test/CodeGen/RISCV/GlobalISel/rvv/vluxei-rv64.ll b/llvm/test/CodeGen/RISCV/GlobalISel/rvv/vluxei-rv64.ll new file mode 100644 index 0000000..916af25 --- /dev/null +++ b/llvm/test/CodeGen/RISCV/GlobalISel/rvv/vluxei-rv64.ll @@ -0,0 +1,1341 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=riscv64 -mattr=+v,+zvfhmin,+zvfbfmin -global-isel -verify-machineinstrs \ +; RUN:   < %s | FileCheck %s + +; The intrinsics are not supported with RV32. 
+ +declare <vscale x 1 x i8> @llvm.riscv.vluxei.nxv1i8.nxv1i64( +  <vscale x 1 x i8>, +  ptr, +  <vscale x 1 x i64>, +  i64); + +define <vscale x 1 x i8> @intrinsic_vluxei_v_nxv1i8_nxv1i8_nxv1i64(ptr %0, <vscale x 1 x i64> %1, i64 %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv1i8_nxv1i8_nxv1i64: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e8, mf8, ta, ma +; CHECK-NEXT:    vluxei64.v v9, (a0), v8 +; CHECK-NEXT:    vmv1r.v v8, v9 +; CHECK-NEXT:    ret +entry: +  %a = call <vscale x 1 x i8> @llvm.riscv.vluxei.nxv1i8.nxv1i64( +    <vscale x 1 x i8> poison, +    ptr %0, +    <vscale x 1 x i64> %1, +    i64 %2) + +  ret <vscale x 1 x i8> %a +} + +declare <vscale x 1 x i8> @llvm.riscv.vluxei.mask.nxv1i8.nxv1i64( +  <vscale x 1 x i8>, +  ptr, +  <vscale x 1 x i64>, +  <vscale x 1 x i1>, +  i64, +  i64); + +define <vscale x 1 x i8> @intrinsic_vluxei_mask_v_nxv1i8_nxv1i8_nxv1i64(<vscale x 1 x i8> %0, ptr %1, <vscale x 1 x i64> %2, <vscale x 1 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv1i8_nxv1i8_nxv1i64: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e8, mf8, ta, mu +; CHECK-NEXT:    vluxei64.v v8, (a0), v9, v0.t +; CHECK-NEXT:    ret +entry: +  %a = call <vscale x 1 x i8> @llvm.riscv.vluxei.mask.nxv1i8.nxv1i64( +    <vscale x 1 x i8> %0, +    ptr %1, +    <vscale x 1 x i64> %2, +    <vscale x 1 x i1> %3, +    i64 %4, i64 1) + +  ret <vscale x 1 x i8> %a +} + +declare <vscale x 2 x i8> @llvm.riscv.vluxei.nxv2i8.nxv2i64( +  <vscale x 2 x i8>, +  ptr, +  <vscale x 2 x i64>, +  i64); + +define <vscale x 2 x i8> @intrinsic_vluxei_v_nxv2i8_nxv2i8_nxv2i64(ptr %0, <vscale x 2 x i64> %1, i64 %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv2i8_nxv2i8_nxv2i64: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e8, mf4, ta, ma +; CHECK-NEXT:    vluxei64.v v10, (a0), v8 +; CHECK-NEXT:    vmv1r.v v8, v10 +; CHECK-NEXT:    ret +entry: +  %a = call <vscale x 2 x i8> @llvm.riscv.vluxei.nxv2i8.nxv2i64( +    <vscale x 2 x i8> poison, +    ptr %0, +    <vscale x 2 x i64> %1, +    i64 %2) + +  ret <vscale x 2 x i8> %a +} + +declare <vscale x 2 x i8> @llvm.riscv.vluxei.mask.nxv2i8.nxv2i64( +  <vscale x 2 x i8>, +  ptr, +  <vscale x 2 x i64>, +  <vscale x 2 x i1>, +  i64, +  i64); + +define <vscale x 2 x i8> @intrinsic_vluxei_mask_v_nxv2i8_nxv2i8_nxv2i64(<vscale x 2 x i8> %0, ptr %1, <vscale x 2 x i64> %2, <vscale x 2 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv2i8_nxv2i8_nxv2i64: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e8, mf4, ta, mu +; CHECK-NEXT:    vluxei64.v v8, (a0), v10, v0.t +; CHECK-NEXT:    ret +entry: +  %a = call <vscale x 2 x i8> @llvm.riscv.vluxei.mask.nxv2i8.nxv2i64( +    <vscale x 2 x i8> %0, +    ptr %1, +    <vscale x 2 x i64> %2, +    <vscale x 2 x i1> %3, +    i64 %4, i64 1) + +  ret <vscale x 2 x i8> %a +} + +declare <vscale x 4 x i8> @llvm.riscv.vluxei.nxv4i8.nxv4i64( +  <vscale x 4 x i8>, +  ptr, +  <vscale x 4 x i64>, +  i64); + +define <vscale x 4 x i8> @intrinsic_vluxei_v_nxv4i8_nxv4i8_nxv4i64(ptr %0, <vscale x 4 x i64> %1, i64 %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv4i8_nxv4i8_nxv4i64: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e8, mf2, ta, ma +; CHECK-NEXT:    vluxei64.v v12, (a0), v8 +; CHECK-NEXT:    vmv1r.v v8, v12 +; CHECK-NEXT:    ret +entry: +  %a = call <vscale x 4 x i8> @llvm.riscv.vluxei.nxv4i8.nxv4i64( +    <vscale x 4 x i8> poison, +    ptr %0, +    <vscale x 4 x i64> %1, +    
i64 %2) + +  ret <vscale x 4 x i8> %a +} + +declare <vscale x 4 x i8> @llvm.riscv.vluxei.mask.nxv4i8.nxv4i64( +  <vscale x 4 x i8>, +  ptr, +  <vscale x 4 x i64>, +  <vscale x 4 x i1>, +  i64, +  i64); + +define <vscale x 4 x i8> @intrinsic_vluxei_mask_v_nxv4i8_nxv4i8_nxv4i64(<vscale x 4 x i8> %0, ptr %1, <vscale x 4 x i64> %2, <vscale x 4 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv4i8_nxv4i8_nxv4i64: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e8, mf2, ta, mu +; CHECK-NEXT:    vluxei64.v v8, (a0), v12, v0.t +; CHECK-NEXT:    ret +entry: +  %a = call <vscale x 4 x i8> @llvm.riscv.vluxei.mask.nxv4i8.nxv4i64( +    <vscale x 4 x i8> %0, +    ptr %1, +    <vscale x 4 x i64> %2, +    <vscale x 4 x i1> %3, +    i64 %4, i64 1) + +  ret <vscale x 4 x i8> %a +} + +declare <vscale x 8 x i8> @llvm.riscv.vluxei.nxv8i8.nxv8i64( +  <vscale x 8 x i8>, +  ptr, +  <vscale x 8 x i64>, +  i64); + +define <vscale x 8 x i8> @intrinsic_vluxei_v_nxv8i8_nxv8i8_nxv8i64(ptr %0, <vscale x 8 x i64> %1, i64 %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv8i8_nxv8i8_nxv8i64: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e8, m1, ta, ma +; CHECK-NEXT:    vluxei64.v v16, (a0), v8 +; CHECK-NEXT:    vmv.v.v v8, v16 +; CHECK-NEXT:    ret +entry: +  %a = call <vscale x 8 x i8> @llvm.riscv.vluxei.nxv8i8.nxv8i64( +    <vscale x 8 x i8> poison, +    ptr %0, +    <vscale x 8 x i64> %1, +    i64 %2) + +  ret <vscale x 8 x i8> %a +} + +declare <vscale x 8 x i8> @llvm.riscv.vluxei.mask.nxv8i8.nxv8i64( +  <vscale x 8 x i8>, +  ptr, +  <vscale x 8 x i64>, +  <vscale x 8 x i1>, +  i64, +  i64); + +define <vscale x 8 x i8> @intrinsic_vluxei_mask_v_nxv8i8_nxv8i8_nxv8i64(<vscale x 8 x i8> %0, ptr %1, <vscale x 8 x i64> %2, <vscale x 8 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv8i8_nxv8i8_nxv8i64: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e8, m1, ta, mu +; CHECK-NEXT:    vluxei64.v v8, (a0), v16, v0.t +; CHECK-NEXT:    ret +entry: +  %a = call <vscale x 8 x i8> @llvm.riscv.vluxei.mask.nxv8i8.nxv8i64( +    <vscale x 8 x i8> %0, +    ptr %1, +    <vscale x 8 x i64> %2, +    <vscale x 8 x i1> %3, +    i64 %4, i64 1) + +  ret <vscale x 8 x i8> %a +} + +declare <vscale x 1 x i16> @llvm.riscv.vluxei.nxv1i16.nxv1i64( +  <vscale x 1 x i16>, +  ptr, +  <vscale x 1 x i64>, +  i64); + +define <vscale x 1 x i16> @intrinsic_vluxei_v_nxv1i16_nxv1i16_nxv1i64(ptr %0, <vscale x 1 x i64> %1, i64 %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv1i16_nxv1i16_nxv1i64: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e16, mf4, ta, ma +; CHECK-NEXT:    vluxei64.v v9, (a0), v8 +; CHECK-NEXT:    vmv1r.v v8, v9 +; CHECK-NEXT:    ret +entry: +  %a = call <vscale x 1 x i16> @llvm.riscv.vluxei.nxv1i16.nxv1i64( +    <vscale x 1 x i16> poison, +    ptr %0, +    <vscale x 1 x i64> %1, +    i64 %2) + +  ret <vscale x 1 x i16> %a +} + +declare <vscale x 1 x i16> @llvm.riscv.vluxei.mask.nxv1i16.nxv1i64( +  <vscale x 1 x i16>, +  ptr, +  <vscale x 1 x i64>, +  <vscale x 1 x i1>, +  i64, +  i64); + +define <vscale x 1 x i16> @intrinsic_vluxei_mask_v_nxv1i16_nxv1i16_nxv1i64(<vscale x 1 x i16> %0, ptr %1, <vscale x 1 x i64> %2, <vscale x 1 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv1i16_nxv1i16_nxv1i64: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e16, mf4, ta, mu +; CHECK-NEXT:    vluxei64.v v8, (a0), v9, v0.t +; CHECK-NEXT:    ret +entry: +  %a = call <vscale 
x 1 x i16> @llvm.riscv.vluxei.mask.nxv1i16.nxv1i64( +    <vscale x 1 x i16> %0, +    ptr %1, +    <vscale x 1 x i64> %2, +    <vscale x 1 x i1> %3, +    i64 %4, i64 1) + +  ret <vscale x 1 x i16> %a +} + +declare <vscale x 2 x i16> @llvm.riscv.vluxei.nxv2i16.nxv2i64( +  <vscale x 2 x i16>, +  ptr, +  <vscale x 2 x i64>, +  i64); + +define <vscale x 2 x i16> @intrinsic_vluxei_v_nxv2i16_nxv2i16_nxv2i64(ptr %0, <vscale x 2 x i64> %1, i64 %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv2i16_nxv2i16_nxv2i64: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e16, mf2, ta, ma +; CHECK-NEXT:    vluxei64.v v10, (a0), v8 +; CHECK-NEXT:    vmv1r.v v8, v10 +; CHECK-NEXT:    ret +entry: +  %a = call <vscale x 2 x i16> @llvm.riscv.vluxei.nxv2i16.nxv2i64( +    <vscale x 2 x i16> poison, +    ptr %0, +    <vscale x 2 x i64> %1, +    i64 %2) + +  ret <vscale x 2 x i16> %a +} + +declare <vscale x 2 x i16> @llvm.riscv.vluxei.mask.nxv2i16.nxv2i64( +  <vscale x 2 x i16>, +  ptr, +  <vscale x 2 x i64>, +  <vscale x 2 x i1>, +  i64, +  i64); + +define <vscale x 2 x i16> @intrinsic_vluxei_mask_v_nxv2i16_nxv2i16_nxv2i64(<vscale x 2 x i16> %0, ptr %1, <vscale x 2 x i64> %2, <vscale x 2 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv2i16_nxv2i16_nxv2i64: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e16, mf2, ta, mu +; CHECK-NEXT:    vluxei64.v v8, (a0), v10, v0.t +; CHECK-NEXT:    ret +entry: +  %a = call <vscale x 2 x i16> @llvm.riscv.vluxei.mask.nxv2i16.nxv2i64( +    <vscale x 2 x i16> %0, +    ptr %1, +    <vscale x 2 x i64> %2, +    <vscale x 2 x i1> %3, +    i64 %4, i64 1) + +  ret <vscale x 2 x i16> %a +} + +declare <vscale x 4 x i16> @llvm.riscv.vluxei.nxv4i16.nxv4i64( +  <vscale x 4 x i16>, +  ptr, +  <vscale x 4 x i64>, +  i64); + +define <vscale x 4 x i16> @intrinsic_vluxei_v_nxv4i16_nxv4i16_nxv4i64(ptr %0, <vscale x 4 x i64> %1, i64 %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv4i16_nxv4i16_nxv4i64: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e16, m1, ta, ma +; CHECK-NEXT:    vluxei64.v v12, (a0), v8 +; CHECK-NEXT:    vmv.v.v v8, v12 +; CHECK-NEXT:    ret +entry: +  %a = call <vscale x 4 x i16> @llvm.riscv.vluxei.nxv4i16.nxv4i64( +    <vscale x 4 x i16> poison, +    ptr %0, +    <vscale x 4 x i64> %1, +    i64 %2) + +  ret <vscale x 4 x i16> %a +} + +declare <vscale x 4 x i16> @llvm.riscv.vluxei.mask.nxv4i16.nxv4i64( +  <vscale x 4 x i16>, +  ptr, +  <vscale x 4 x i64>, +  <vscale x 4 x i1>, +  i64, +  i64); + +define <vscale x 4 x i16> @intrinsic_vluxei_mask_v_nxv4i16_nxv4i16_nxv4i64(<vscale x 4 x i16> %0, ptr %1, <vscale x 4 x i64> %2, <vscale x 4 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv4i16_nxv4i16_nxv4i64: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e16, m1, ta, mu +; CHECK-NEXT:    vluxei64.v v8, (a0), v12, v0.t +; CHECK-NEXT:    ret +entry: +  %a = call <vscale x 4 x i16> @llvm.riscv.vluxei.mask.nxv4i16.nxv4i64( +    <vscale x 4 x i16> %0, +    ptr %1, +    <vscale x 4 x i64> %2, +    <vscale x 4 x i1> %3, +    i64 %4, i64 1) + +  ret <vscale x 4 x i16> %a +} + +declare <vscale x 8 x i16> @llvm.riscv.vluxei.nxv8i16.nxv8i64( +  <vscale x 8 x i16>, +  ptr, +  <vscale x 8 x i64>, +  i64); + +define <vscale x 8 x i16> @intrinsic_vluxei_v_nxv8i16_nxv8i16_nxv8i64(ptr %0, <vscale x 8 x i64> %1, i64 %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv8i16_nxv8i16_nxv8i64: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, 
e16, m2, ta, ma +; CHECK-NEXT:    vluxei64.v v16, (a0), v8 +; CHECK-NEXT:    vmv.v.v v8, v16 +; CHECK-NEXT:    ret +entry: +  %a = call <vscale x 8 x i16> @llvm.riscv.vluxei.nxv8i16.nxv8i64( +    <vscale x 8 x i16> poison, +    ptr %0, +    <vscale x 8 x i64> %1, +    i64 %2) + +  ret <vscale x 8 x i16> %a +} + +declare <vscale x 8 x i16> @llvm.riscv.vluxei.mask.nxv8i16.nxv8i64( +  <vscale x 8 x i16>, +  ptr, +  <vscale x 8 x i64>, +  <vscale x 8 x i1>, +  i64, +  i64); + +define <vscale x 8 x i16> @intrinsic_vluxei_mask_v_nxv8i16_nxv8i16_nxv8i64(<vscale x 8 x i16> %0, ptr %1, <vscale x 8 x i64> %2, <vscale x 8 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv8i16_nxv8i16_nxv8i64: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e16, m2, ta, mu +; CHECK-NEXT:    vluxei64.v v8, (a0), v16, v0.t +; CHECK-NEXT:    ret +entry: +  %a = call <vscale x 8 x i16> @llvm.riscv.vluxei.mask.nxv8i16.nxv8i64( +    <vscale x 8 x i16> %0, +    ptr %1, +    <vscale x 8 x i64> %2, +    <vscale x 8 x i1> %3, +    i64 %4, i64 1) + +  ret <vscale x 8 x i16> %a +} + +declare <vscale x 1 x i32> @llvm.riscv.vluxei.nxv1i32.nxv1i64( +  <vscale x 1 x i32>, +  ptr, +  <vscale x 1 x i64>, +  i64); + +define <vscale x 1 x i32> @intrinsic_vluxei_v_nxv1i32_nxv1i32_nxv1i64(ptr %0, <vscale x 1 x i64> %1, i64 %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv1i32_nxv1i32_nxv1i64: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e32, mf2, ta, ma +; CHECK-NEXT:    vluxei64.v v9, (a0), v8 +; CHECK-NEXT:    vmv1r.v v8, v9 +; CHECK-NEXT:    ret +entry: +  %a = call <vscale x 1 x i32> @llvm.riscv.vluxei.nxv1i32.nxv1i64( +    <vscale x 1 x i32> poison, +    ptr %0, +    <vscale x 1 x i64> %1, +    i64 %2) + +  ret <vscale x 1 x i32> %a +} + +declare <vscale x 1 x i32> @llvm.riscv.vluxei.mask.nxv1i32.nxv1i64( +  <vscale x 1 x i32>, +  ptr, +  <vscale x 1 x i64>, +  <vscale x 1 x i1>, +  i64, +  i64); + +define <vscale x 1 x i32> @intrinsic_vluxei_mask_v_nxv1i32_nxv1i32_nxv1i64(<vscale x 1 x i32> %0, ptr %1, <vscale x 1 x i64> %2, <vscale x 1 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv1i32_nxv1i32_nxv1i64: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e32, mf2, ta, mu +; CHECK-NEXT:    vluxei64.v v8, (a0), v9, v0.t +; CHECK-NEXT:    ret +entry: +  %a = call <vscale x 1 x i32> @llvm.riscv.vluxei.mask.nxv1i32.nxv1i64( +    <vscale x 1 x i32> %0, +    ptr %1, +    <vscale x 1 x i64> %2, +    <vscale x 1 x i1> %3, +    i64 %4, i64 1) + +  ret <vscale x 1 x i32> %a +} + +declare <vscale x 2 x i32> @llvm.riscv.vluxei.nxv2i32.nxv2i64( +  <vscale x 2 x i32>, +  ptr, +  <vscale x 2 x i64>, +  i64); + +define <vscale x 2 x i32> @intrinsic_vluxei_v_nxv2i32_nxv2i32_nxv2i64(ptr %0, <vscale x 2 x i64> %1, i64 %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv2i32_nxv2i32_nxv2i64: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e32, m1, ta, ma +; CHECK-NEXT:    vluxei64.v v10, (a0), v8 +; CHECK-NEXT:    vmv.v.v v8, v10 +; CHECK-NEXT:    ret +entry: +  %a = call <vscale x 2 x i32> @llvm.riscv.vluxei.nxv2i32.nxv2i64( +    <vscale x 2 x i32> poison, +    ptr %0, +    <vscale x 2 x i64> %1, +    i64 %2) + +  ret <vscale x 2 x i32> %a +} + +declare <vscale x 2 x i32> @llvm.riscv.vluxei.mask.nxv2i32.nxv2i64( +  <vscale x 2 x i32>, +  ptr, +  <vscale x 2 x i64>, +  <vscale x 2 x i1>, +  i64, +  i64); + +define <vscale x 2 x i32> @intrinsic_vluxei_mask_v_nxv2i32_nxv2i32_nxv2i64(<vscale x 2 x i32> %0, ptr %1, 
<vscale x 2 x i64> %2, <vscale x 2 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv2i32_nxv2i32_nxv2i64: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e32, m1, ta, mu +; CHECK-NEXT:    vluxei64.v v8, (a0), v10, v0.t +; CHECK-NEXT:    ret +entry: +  %a = call <vscale x 2 x i32> @llvm.riscv.vluxei.mask.nxv2i32.nxv2i64( +    <vscale x 2 x i32> %0, +    ptr %1, +    <vscale x 2 x i64> %2, +    <vscale x 2 x i1> %3, +    i64 %4, i64 1) + +  ret <vscale x 2 x i32> %a +} + +declare <vscale x 4 x i32> @llvm.riscv.vluxei.nxv4i32.nxv4i64( +  <vscale x 4 x i32>, +  ptr, +  <vscale x 4 x i64>, +  i64); + +define <vscale x 4 x i32> @intrinsic_vluxei_v_nxv4i32_nxv4i32_nxv4i64(ptr %0, <vscale x 4 x i64> %1, i64 %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv4i32_nxv4i32_nxv4i64: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e32, m2, ta, ma +; CHECK-NEXT:    vluxei64.v v12, (a0), v8 +; CHECK-NEXT:    vmv.v.v v8, v12 +; CHECK-NEXT:    ret +entry: +  %a = call <vscale x 4 x i32> @llvm.riscv.vluxei.nxv4i32.nxv4i64( +    <vscale x 4 x i32> poison, +    ptr %0, +    <vscale x 4 x i64> %1, +    i64 %2) + +  ret <vscale x 4 x i32> %a +} + +declare <vscale x 4 x i32> @llvm.riscv.vluxei.mask.nxv4i32.nxv4i64( +  <vscale x 4 x i32>, +  ptr, +  <vscale x 4 x i64>, +  <vscale x 4 x i1>, +  i64, +  i64); + +define <vscale x 4 x i32> @intrinsic_vluxei_mask_v_nxv4i32_nxv4i32_nxv4i64(<vscale x 4 x i32> %0, ptr %1, <vscale x 4 x i64> %2, <vscale x 4 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv4i32_nxv4i32_nxv4i64: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e32, m2, ta, mu +; CHECK-NEXT:    vluxei64.v v8, (a0), v12, v0.t +; CHECK-NEXT:    ret +entry: +  %a = call <vscale x 4 x i32> @llvm.riscv.vluxei.mask.nxv4i32.nxv4i64( +    <vscale x 4 x i32> %0, +    ptr %1, +    <vscale x 4 x i64> %2, +    <vscale x 4 x i1> %3, +    i64 %4, i64 1) + +  ret <vscale x 4 x i32> %a +} + +declare <vscale x 8 x i32> @llvm.riscv.vluxei.nxv8i32.nxv8i64( +  <vscale x 8 x i32>, +  ptr, +  <vscale x 8 x i64>, +  i64); + +define <vscale x 8 x i32> @intrinsic_vluxei_v_nxv8i32_nxv8i32_nxv8i64(ptr %0, <vscale x 8 x i64> %1, i64 %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv8i32_nxv8i32_nxv8i64: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e32, m4, ta, ma +; CHECK-NEXT:    vluxei64.v v16, (a0), v8 +; CHECK-NEXT:    vmv.v.v v8, v16 +; CHECK-NEXT:    ret +entry: +  %a = call <vscale x 8 x i32> @llvm.riscv.vluxei.nxv8i32.nxv8i64( +    <vscale x 8 x i32> poison, +    ptr %0, +    <vscale x 8 x i64> %1, +    i64 %2) + +  ret <vscale x 8 x i32> %a +} + +declare <vscale x 8 x i32> @llvm.riscv.vluxei.mask.nxv8i32.nxv8i64( +  <vscale x 8 x i32>, +  ptr, +  <vscale x 8 x i64>, +  <vscale x 8 x i1>, +  i64, +  i64); + +define <vscale x 8 x i32> @intrinsic_vluxei_mask_v_nxv8i32_nxv8i32_nxv8i64(<vscale x 8 x i32> %0, ptr %1, <vscale x 8 x i64> %2, <vscale x 8 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv8i32_nxv8i32_nxv8i64: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e32, m4, ta, mu +; CHECK-NEXT:    vluxei64.v v8, (a0), v16, v0.t +; CHECK-NEXT:    ret +entry: +  %a = call <vscale x 8 x i32> @llvm.riscv.vluxei.mask.nxv8i32.nxv8i64( +    <vscale x 8 x i32> %0, +    ptr %1, +    <vscale x 8 x i64> %2, +    <vscale x 8 x i1> %3, +    i64 %4, i64 1) + +  ret <vscale x 8 x i32> %a +} + +declare <vscale x 1 x i64> @llvm.riscv.vluxei.nxv1i64.nxv1i64( +  
<vscale x 1 x i64>, +  ptr, +  <vscale x 1 x i64>, +  i64); + +define <vscale x 1 x i64> @intrinsic_vluxei_v_nxv1i64_nxv1i64_nxv1i64(ptr %0, <vscale x 1 x i64> %1, i64 %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv1i64_nxv1i64_nxv1i64: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e64, m1, ta, ma +; CHECK-NEXT:    vluxei64.v v8, (a0), v8 +; CHECK-NEXT:    ret +entry: +  %a = call <vscale x 1 x i64> @llvm.riscv.vluxei.nxv1i64.nxv1i64( +    <vscale x 1 x i64> poison, +    ptr %0, +    <vscale x 1 x i64> %1, +    i64 %2) + +  ret <vscale x 1 x i64> %a +} + +declare <vscale x 1 x i64> @llvm.riscv.vluxei.mask.nxv1i64.nxv1i64( +  <vscale x 1 x i64>, +  ptr, +  <vscale x 1 x i64>, +  <vscale x 1 x i1>, +  i64, +  i64); + +define <vscale x 1 x i64> @intrinsic_vluxei_mask_v_nxv1i64_nxv1i64_nxv1i64(<vscale x 1 x i64> %0, ptr %1, <vscale x 1 x i64> %2, <vscale x 1 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv1i64_nxv1i64_nxv1i64: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e64, m1, ta, mu +; CHECK-NEXT:    vluxei64.v v8, (a0), v9, v0.t +; CHECK-NEXT:    ret +entry: +  %a = call <vscale x 1 x i64> @llvm.riscv.vluxei.mask.nxv1i64.nxv1i64( +    <vscale x 1 x i64> %0, +    ptr %1, +    <vscale x 1 x i64> %2, +    <vscale x 1 x i1> %3, +    i64 %4, i64 1) + +  ret <vscale x 1 x i64> %a +} + +declare <vscale x 2 x i64> @llvm.riscv.vluxei.nxv2i64.nxv2i64( +  <vscale x 2 x i64>, +  ptr, +  <vscale x 2 x i64>, +  i64); + +define <vscale x 2 x i64> @intrinsic_vluxei_v_nxv2i64_nxv2i64_nxv2i64(ptr %0, <vscale x 2 x i64> %1, i64 %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv2i64_nxv2i64_nxv2i64: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e64, m2, ta, ma +; CHECK-NEXT:    vluxei64.v v8, (a0), v8 +; CHECK-NEXT:    ret +entry: +  %a = call <vscale x 2 x i64> @llvm.riscv.vluxei.nxv2i64.nxv2i64( +    <vscale x 2 x i64> poison, +    ptr %0, +    <vscale x 2 x i64> %1, +    i64 %2) + +  ret <vscale x 2 x i64> %a +} + +declare <vscale x 2 x i64> @llvm.riscv.vluxei.mask.nxv2i64.nxv2i64( +  <vscale x 2 x i64>, +  ptr, +  <vscale x 2 x i64>, +  <vscale x 2 x i1>, +  i64, +  i64); + +define <vscale x 2 x i64> @intrinsic_vluxei_mask_v_nxv2i64_nxv2i64_nxv2i64(<vscale x 2 x i64> %0, ptr %1, <vscale x 2 x i64> %2, <vscale x 2 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv2i64_nxv2i64_nxv2i64: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e64, m2, ta, mu +; CHECK-NEXT:    vluxei64.v v8, (a0), v10, v0.t +; CHECK-NEXT:    ret +entry: +  %a = call <vscale x 2 x i64> @llvm.riscv.vluxei.mask.nxv2i64.nxv2i64( +    <vscale x 2 x i64> %0, +    ptr %1, +    <vscale x 2 x i64> %2, +    <vscale x 2 x i1> %3, +    i64 %4, i64 1) + +  ret <vscale x 2 x i64> %a +} + +declare <vscale x 4 x i64> @llvm.riscv.vluxei.nxv4i64.nxv4i64( +  <vscale x 4 x i64>, +  ptr, +  <vscale x 4 x i64>, +  i64); + +define <vscale x 4 x i64> @intrinsic_vluxei_v_nxv4i64_nxv4i64_nxv4i64(ptr %0, <vscale x 4 x i64> %1, i64 %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv4i64_nxv4i64_nxv4i64: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e64, m4, ta, ma +; CHECK-NEXT:    vluxei64.v v8, (a0), v8 +; CHECK-NEXT:    ret +entry: +  %a = call <vscale x 4 x i64> @llvm.riscv.vluxei.nxv4i64.nxv4i64( +    <vscale x 4 x i64> poison, +    ptr %0, +    <vscale x 4 x i64> %1, +    i64 %2) + +  ret <vscale x 4 x i64> %a +} + +declare <vscale x 4 x i64> @llvm.riscv.vluxei.mask.nxv4i64.nxv4i64( 
+  <vscale x 4 x i64>, +  ptr, +  <vscale x 4 x i64>, +  <vscale x 4 x i1>, +  i64, +  i64); + +define <vscale x 4 x i64> @intrinsic_vluxei_mask_v_nxv4i64_nxv4i64_nxv4i64(<vscale x 4 x i64> %0, ptr %1, <vscale x 4 x i64> %2, <vscale x 4 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv4i64_nxv4i64_nxv4i64: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e64, m4, ta, mu +; CHECK-NEXT:    vluxei64.v v8, (a0), v12, v0.t +; CHECK-NEXT:    ret +entry: +  %a = call <vscale x 4 x i64> @llvm.riscv.vluxei.mask.nxv4i64.nxv4i64( +    <vscale x 4 x i64> %0, +    ptr %1, +    <vscale x 4 x i64> %2, +    <vscale x 4 x i1> %3, +    i64 %4, i64 1) + +  ret <vscale x 4 x i64> %a +} + +declare <vscale x 8 x i64> @llvm.riscv.vluxei.nxv8i64.nxv8i64( +  <vscale x 8 x i64>, +  ptr, +  <vscale x 8 x i64>, +  i64); + +define <vscale x 8 x i64> @intrinsic_vluxei_v_nxv8i64_nxv8i64_nxv8i64(ptr %0, <vscale x 8 x i64> %1, i64 %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv8i64_nxv8i64_nxv8i64: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e64, m8, ta, ma +; CHECK-NEXT:    vluxei64.v v8, (a0), v8 +; CHECK-NEXT:    ret +entry: +  %a = call <vscale x 8 x i64> @llvm.riscv.vluxei.nxv8i64.nxv8i64( +    <vscale x 8 x i64> poison, +    ptr %0, +    <vscale x 8 x i64> %1, +    i64 %2) + +  ret <vscale x 8 x i64> %a +} + +declare <vscale x 8 x i64> @llvm.riscv.vluxei.mask.nxv8i64.nxv8i64( +  <vscale x 8 x i64>, +  ptr, +  <vscale x 8 x i64>, +  <vscale x 8 x i1>, +  i64, +  i64); + +define <vscale x 8 x i64> @intrinsic_vluxei_mask_v_nxv8i64_nxv8i64_nxv8i64(<vscale x 8 x i64> %0, ptr %1, <vscale x 8 x i64> %2, <vscale x 8 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv8i64_nxv8i64_nxv8i64: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e64, m8, ta, mu +; CHECK-NEXT:    vluxei64.v v8, (a0), v16, v0.t +; CHECK-NEXT:    ret +entry: +  %a = call <vscale x 8 x i64> @llvm.riscv.vluxei.mask.nxv8i64.nxv8i64( +    <vscale x 8 x i64> %0, +    ptr %1, +    <vscale x 8 x i64> %2, +    <vscale x 8 x i1> %3, +    i64 %4, i64 1) + +  ret <vscale x 8 x i64> %a +} + +declare <vscale x 1 x half> @llvm.riscv.vluxei.nxv1f16.nxv1i64( +  <vscale x 1 x half>, +  ptr, +  <vscale x 1 x i64>, +  i64); + +define <vscale x 1 x half> @intrinsic_vluxei_v_nxv1f16_nxv1f16_nxv1i64(ptr %0, <vscale x 1 x i64> %1, i64 %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv1f16_nxv1f16_nxv1i64: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e16, mf4, ta, ma +; CHECK-NEXT:    vluxei64.v v9, (a0), v8 +; CHECK-NEXT:    vmv1r.v v8, v9 +; CHECK-NEXT:    ret +entry: +  %a = call <vscale x 1 x half> @llvm.riscv.vluxei.nxv1f16.nxv1i64( +    <vscale x 1 x half> poison, +    ptr %0, +    <vscale x 1 x i64> %1, +    i64 %2) + +  ret <vscale x 1 x half> %a +} + +declare <vscale x 1 x half> @llvm.riscv.vluxei.mask.nxv1f16.nxv1i64( +  <vscale x 1 x half>, +  ptr, +  <vscale x 1 x i64>, +  <vscale x 1 x i1>, +  i64, +  i64); + +define <vscale x 1 x half> @intrinsic_vluxei_mask_v_nxv1f16_nxv1f16_nxv1i64(<vscale x 1 x half> %0, ptr %1, <vscale x 1 x i64> %2, <vscale x 1 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv1f16_nxv1f16_nxv1i64: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e16, mf4, ta, mu +; CHECK-NEXT:    vluxei64.v v8, (a0), v9, v0.t +; CHECK-NEXT:    ret +entry: +  %a = call <vscale x 1 x half> @llvm.riscv.vluxei.mask.nxv1f16.nxv1i64( +    <vscale x 1 x half> %0, +    ptr %1, +  
  <vscale x 1 x i64> %2, +    <vscale x 1 x i1> %3, +    i64 %4, i64 1) + +  ret <vscale x 1 x half> %a +} + +declare <vscale x 2 x half> @llvm.riscv.vluxei.nxv2f16.nxv2i64( +  <vscale x 2 x half>, +  ptr, +  <vscale x 2 x i64>, +  i64); + +define <vscale x 2 x half> @intrinsic_vluxei_v_nxv2f16_nxv2f16_nxv2i64(ptr %0, <vscale x 2 x i64> %1, i64 %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv2f16_nxv2f16_nxv2i64: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e16, mf2, ta, ma +; CHECK-NEXT:    vluxei64.v v10, (a0), v8 +; CHECK-NEXT:    vmv1r.v v8, v10 +; CHECK-NEXT:    ret +entry: +  %a = call <vscale x 2 x half> @llvm.riscv.vluxei.nxv2f16.nxv2i64( +    <vscale x 2 x half> poison, +    ptr %0, +    <vscale x 2 x i64> %1, +    i64 %2) + +  ret <vscale x 2 x half> %a +} + +declare <vscale x 2 x half> @llvm.riscv.vluxei.mask.nxv2f16.nxv2i64( +  <vscale x 2 x half>, +  ptr, +  <vscale x 2 x i64>, +  <vscale x 2 x i1>, +  i64, +  i64); + +define <vscale x 2 x half> @intrinsic_vluxei_mask_v_nxv2f16_nxv2f16_nxv2i64(<vscale x 2 x half> %0, ptr %1, <vscale x 2 x i64> %2, <vscale x 2 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv2f16_nxv2f16_nxv2i64: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e16, mf2, ta, mu +; CHECK-NEXT:    vluxei64.v v8, (a0), v10, v0.t +; CHECK-NEXT:    ret +entry: +  %a = call <vscale x 2 x half> @llvm.riscv.vluxei.mask.nxv2f16.nxv2i64( +    <vscale x 2 x half> %0, +    ptr %1, +    <vscale x 2 x i64> %2, +    <vscale x 2 x i1> %3, +    i64 %4, i64 1) + +  ret <vscale x 2 x half> %a +} + +declare <vscale x 4 x half> @llvm.riscv.vluxei.nxv4f16.nxv4i64( +  <vscale x 4 x half>, +  ptr, +  <vscale x 4 x i64>, +  i64); + +define <vscale x 4 x half> @intrinsic_vluxei_v_nxv4f16_nxv4f16_nxv4i64(ptr %0, <vscale x 4 x i64> %1, i64 %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv4f16_nxv4f16_nxv4i64: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e16, m1, ta, ma +; CHECK-NEXT:    vluxei64.v v12, (a0), v8 +; CHECK-NEXT:    vmv.v.v v8, v12 +; CHECK-NEXT:    ret +entry: +  %a = call <vscale x 4 x half> @llvm.riscv.vluxei.nxv4f16.nxv4i64( +    <vscale x 4 x half> poison, +    ptr %0, +    <vscale x 4 x i64> %1, +    i64 %2) + +  ret <vscale x 4 x half> %a +} + +declare <vscale x 4 x half> @llvm.riscv.vluxei.mask.nxv4f16.nxv4i64( +  <vscale x 4 x half>, +  ptr, +  <vscale x 4 x i64>, +  <vscale x 4 x i1>, +  i64, +  i64); + +define <vscale x 4 x half> @intrinsic_vluxei_mask_v_nxv4f16_nxv4f16_nxv4i64(<vscale x 4 x half> %0, ptr %1, <vscale x 4 x i64> %2, <vscale x 4 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv4f16_nxv4f16_nxv4i64: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e16, m1, ta, mu +; CHECK-NEXT:    vluxei64.v v8, (a0), v12, v0.t +; CHECK-NEXT:    ret +entry: +  %a = call <vscale x 4 x half> @llvm.riscv.vluxei.mask.nxv4f16.nxv4i64( +    <vscale x 4 x half> %0, +    ptr %1, +    <vscale x 4 x i64> %2, +    <vscale x 4 x i1> %3, +    i64 %4, i64 1) + +  ret <vscale x 4 x half> %a +} + +declare <vscale x 8 x half> @llvm.riscv.vluxei.nxv8f16.nxv8i64( +  <vscale x 8 x half>, +  ptr, +  <vscale x 8 x i64>, +  i64); + +define <vscale x 8 x half> @intrinsic_vluxei_v_nxv8f16_nxv8f16_nxv8i64(ptr %0, <vscale x 8 x i64> %1, i64 %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv8f16_nxv8f16_nxv8i64: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e16, m2, ta, ma +; CHECK-NEXT:    vluxei64.v v16, (a0), v8 +; 
CHECK-NEXT:    vmv.v.v v8, v16 +; CHECK-NEXT:    ret +entry: +  %a = call <vscale x 8 x half> @llvm.riscv.vluxei.nxv8f16.nxv8i64( +    <vscale x 8 x half> poison, +    ptr %0, +    <vscale x 8 x i64> %1, +    i64 %2) + +  ret <vscale x 8 x half> %a +} + +declare <vscale x 8 x half> @llvm.riscv.vluxei.mask.nxv8f16.nxv8i64( +  <vscale x 8 x half>, +  ptr, +  <vscale x 8 x i64>, +  <vscale x 8 x i1>, +  i64, +  i64); + +define <vscale x 8 x half> @intrinsic_vluxei_mask_v_nxv8f16_nxv8f16_nxv8i64(<vscale x 8 x half> %0, ptr %1, <vscale x 8 x i64> %2, <vscale x 8 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv8f16_nxv8f16_nxv8i64: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e16, m2, ta, mu +; CHECK-NEXT:    vluxei64.v v8, (a0), v16, v0.t +; CHECK-NEXT:    ret +entry: +  %a = call <vscale x 8 x half> @llvm.riscv.vluxei.mask.nxv8f16.nxv8i64( +    <vscale x 8 x half> %0, +    ptr %1, +    <vscale x 8 x i64> %2, +    <vscale x 8 x i1> %3, +    i64 %4, i64 1) + +  ret <vscale x 8 x half> %a +} + +declare <vscale x 1 x float> @llvm.riscv.vluxei.nxv1f32.nxv1i64( +  <vscale x 1 x float>, +  ptr, +  <vscale x 1 x i64>, +  i64); + +define <vscale x 1 x float> @intrinsic_vluxei_v_nxv1f32_nxv1f32_nxv1i64(ptr %0, <vscale x 1 x i64> %1, i64 %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv1f32_nxv1f32_nxv1i64: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e32, mf2, ta, ma +; CHECK-NEXT:    vluxei64.v v9, (a0), v8 +; CHECK-NEXT:    vmv1r.v v8, v9 +; CHECK-NEXT:    ret +entry: +  %a = call <vscale x 1 x float> @llvm.riscv.vluxei.nxv1f32.nxv1i64( +    <vscale x 1 x float> poison, +    ptr %0, +    <vscale x 1 x i64> %1, +    i64 %2) + +  ret <vscale x 1 x float> %a +} + +declare <vscale x 1 x float> @llvm.riscv.vluxei.mask.nxv1f32.nxv1i64( +  <vscale x 1 x float>, +  ptr, +  <vscale x 1 x i64>, +  <vscale x 1 x i1>, +  i64, +  i64); + +define <vscale x 1 x float> @intrinsic_vluxei_mask_v_nxv1f32_nxv1f32_nxv1i64(<vscale x 1 x float> %0, ptr %1, <vscale x 1 x i64> %2, <vscale x 1 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv1f32_nxv1f32_nxv1i64: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e32, mf2, ta, mu +; CHECK-NEXT:    vluxei64.v v8, (a0), v9, v0.t +; CHECK-NEXT:    ret +entry: +  %a = call <vscale x 1 x float> @llvm.riscv.vluxei.mask.nxv1f32.nxv1i64( +    <vscale x 1 x float> %0, +    ptr %1, +    <vscale x 1 x i64> %2, +    <vscale x 1 x i1> %3, +    i64 %4, i64 1) + +  ret <vscale x 1 x float> %a +} + +declare <vscale x 2 x float> @llvm.riscv.vluxei.nxv2f32.nxv2i64( +  <vscale x 2 x float>, +  ptr, +  <vscale x 2 x i64>, +  i64); + +define <vscale x 2 x float> @intrinsic_vluxei_v_nxv2f32_nxv2f32_nxv2i64(ptr %0, <vscale x 2 x i64> %1, i64 %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv2f32_nxv2f32_nxv2i64: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e32, m1, ta, ma +; CHECK-NEXT:    vluxei64.v v10, (a0), v8 +; CHECK-NEXT:    vmv.v.v v8, v10 +; CHECK-NEXT:    ret +entry: +  %a = call <vscale x 2 x float> @llvm.riscv.vluxei.nxv2f32.nxv2i64( +    <vscale x 2 x float> poison, +    ptr %0, +    <vscale x 2 x i64> %1, +    i64 %2) + +  ret <vscale x 2 x float> %a +} + +declare <vscale x 2 x float> @llvm.riscv.vluxei.mask.nxv2f32.nxv2i64( +  <vscale x 2 x float>, +  ptr, +  <vscale x 2 x i64>, +  <vscale x 2 x i1>, +  i64, +  i64); + +define <vscale x 2 x float> @intrinsic_vluxei_mask_v_nxv2f32_nxv2f32_nxv2i64(<vscale x 2 x float> %0, ptr %1, <vscale x 2 
x i64> %2, <vscale x 2 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv2f32_nxv2f32_nxv2i64: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e32, m1, ta, mu +; CHECK-NEXT:    vluxei64.v v8, (a0), v10, v0.t +; CHECK-NEXT:    ret +entry: +  %a = call <vscale x 2 x float> @llvm.riscv.vluxei.mask.nxv2f32.nxv2i64( +    <vscale x 2 x float> %0, +    ptr %1, +    <vscale x 2 x i64> %2, +    <vscale x 2 x i1> %3, +    i64 %4, i64 1) + +  ret <vscale x 2 x float> %a +} + +declare <vscale x 4 x float> @llvm.riscv.vluxei.nxv4f32.nxv4i64( +  <vscale x 4 x float>, +  ptr, +  <vscale x 4 x i64>, +  i64); + +define <vscale x 4 x float> @intrinsic_vluxei_v_nxv4f32_nxv4f32_nxv4i64(ptr %0, <vscale x 4 x i64> %1, i64 %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv4f32_nxv4f32_nxv4i64: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e32, m2, ta, ma +; CHECK-NEXT:    vluxei64.v v12, (a0), v8 +; CHECK-NEXT:    vmv.v.v v8, v12 +; CHECK-NEXT:    ret +entry: +  %a = call <vscale x 4 x float> @llvm.riscv.vluxei.nxv4f32.nxv4i64( +    <vscale x 4 x float> poison, +    ptr %0, +    <vscale x 4 x i64> %1, +    i64 %2) + +  ret <vscale x 4 x float> %a +} + +declare <vscale x 4 x float> @llvm.riscv.vluxei.mask.nxv4f32.nxv4i64( +  <vscale x 4 x float>, +  ptr, +  <vscale x 4 x i64>, +  <vscale x 4 x i1>, +  i64, +  i64); + +define <vscale x 4 x float> @intrinsic_vluxei_mask_v_nxv4f32_nxv4f32_nxv4i64(<vscale x 4 x float> %0, ptr %1, <vscale x 4 x i64> %2, <vscale x 4 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv4f32_nxv4f32_nxv4i64: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e32, m2, ta, mu +; CHECK-NEXT:    vluxei64.v v8, (a0), v12, v0.t +; CHECK-NEXT:    ret +entry: +  %a = call <vscale x 4 x float> @llvm.riscv.vluxei.mask.nxv4f32.nxv4i64( +    <vscale x 4 x float> %0, +    ptr %1, +    <vscale x 4 x i64> %2, +    <vscale x 4 x i1> %3, +    i64 %4, i64 1) + +  ret <vscale x 4 x float> %a +} + +declare <vscale x 8 x float> @llvm.riscv.vluxei.nxv8f32.nxv8i64( +  <vscale x 8 x float>, +  ptr, +  <vscale x 8 x i64>, +  i64); + +define <vscale x 8 x float> @intrinsic_vluxei_v_nxv8f32_nxv8f32_nxv8i64(ptr %0, <vscale x 8 x i64> %1, i64 %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv8f32_nxv8f32_nxv8i64: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e32, m4, ta, ma +; CHECK-NEXT:    vluxei64.v v16, (a0), v8 +; CHECK-NEXT:    vmv.v.v v8, v16 +; CHECK-NEXT:    ret +entry: +  %a = call <vscale x 8 x float> @llvm.riscv.vluxei.nxv8f32.nxv8i64( +    <vscale x 8 x float> poison, +    ptr %0, +    <vscale x 8 x i64> %1, +    i64 %2) + +  ret <vscale x 8 x float> %a +} + +declare <vscale x 8 x float> @llvm.riscv.vluxei.mask.nxv8f32.nxv8i64( +  <vscale x 8 x float>, +  ptr, +  <vscale x 8 x i64>, +  <vscale x 8 x i1>, +  i64, +  i64); + +define <vscale x 8 x float> @intrinsic_vluxei_mask_v_nxv8f32_nxv8f32_nxv8i64(<vscale x 8 x float> %0, ptr %1, <vscale x 8 x i64> %2, <vscale x 8 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv8f32_nxv8f32_nxv8i64: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e32, m4, ta, mu +; CHECK-NEXT:    vluxei64.v v8, (a0), v16, v0.t +; CHECK-NEXT:    ret +entry: +  %a = call <vscale x 8 x float> @llvm.riscv.vluxei.mask.nxv8f32.nxv8i64( +    <vscale x 8 x float> %0, +    ptr %1, +    <vscale x 8 x i64> %2, +    <vscale x 8 x i1> %3, +    i64 %4, i64 1) + +  ret <vscale x 8 x float> %a +} + +declare <vscale x 1 x 
double> @llvm.riscv.vluxei.nxv1f64.nxv1i64( +  <vscale x 1 x double>, +  ptr, +  <vscale x 1 x i64>, +  i64); + +define <vscale x 1 x double> @intrinsic_vluxei_v_nxv1f64_nxv1f64_nxv1i64(ptr %0, <vscale x 1 x i64> %1, i64 %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv1f64_nxv1f64_nxv1i64: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e64, m1, ta, ma +; CHECK-NEXT:    vluxei64.v v8, (a0), v8 +; CHECK-NEXT:    ret +entry: +  %a = call <vscale x 1 x double> @llvm.riscv.vluxei.nxv1f64.nxv1i64( +    <vscale x 1 x double> poison, +    ptr %0, +    <vscale x 1 x i64> %1, +    i64 %2) + +  ret <vscale x 1 x double> %a +} + +declare <vscale x 1 x double> @llvm.riscv.vluxei.mask.nxv1f64.nxv1i64( +  <vscale x 1 x double>, +  ptr, +  <vscale x 1 x i64>, +  <vscale x 1 x i1>, +  i64, +  i64); + +define <vscale x 1 x double> @intrinsic_vluxei_mask_v_nxv1f64_nxv1f64_nxv1i64(<vscale x 1 x double> %0, ptr %1, <vscale x 1 x i64> %2, <vscale x 1 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv1f64_nxv1f64_nxv1i64: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e64, m1, ta, mu +; CHECK-NEXT:    vluxei64.v v8, (a0), v9, v0.t +; CHECK-NEXT:    ret +entry: +  %a = call <vscale x 1 x double> @llvm.riscv.vluxei.mask.nxv1f64.nxv1i64( +    <vscale x 1 x double> %0, +    ptr %1, +    <vscale x 1 x i64> %2, +    <vscale x 1 x i1> %3, +    i64 %4, i64 1) + +  ret <vscale x 1 x double> %a +} + +declare <vscale x 2 x double> @llvm.riscv.vluxei.nxv2f64.nxv2i64( +  <vscale x 2 x double>, +  ptr, +  <vscale x 2 x i64>, +  i64); + +define <vscale x 2 x double> @intrinsic_vluxei_v_nxv2f64_nxv2f64_nxv2i64(ptr %0, <vscale x 2 x i64> %1, i64 %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv2f64_nxv2f64_nxv2i64: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e64, m2, ta, ma +; CHECK-NEXT:    vluxei64.v v8, (a0), v8 +; CHECK-NEXT:    ret +entry: +  %a = call <vscale x 2 x double> @llvm.riscv.vluxei.nxv2f64.nxv2i64( +    <vscale x 2 x double> poison, +    ptr %0, +    <vscale x 2 x i64> %1, +    i64 %2) + +  ret <vscale x 2 x double> %a +} + +declare <vscale x 2 x double> @llvm.riscv.vluxei.mask.nxv2f64.nxv2i64( +  <vscale x 2 x double>, +  ptr, +  <vscale x 2 x i64>, +  <vscale x 2 x i1>, +  i64, +  i64); + +define <vscale x 2 x double> @intrinsic_vluxei_mask_v_nxv2f64_nxv2f64_nxv2i64(<vscale x 2 x double> %0, ptr %1, <vscale x 2 x i64> %2, <vscale x 2 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv2f64_nxv2f64_nxv2i64: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e64, m2, ta, mu +; CHECK-NEXT:    vluxei64.v v8, (a0), v10, v0.t +; CHECK-NEXT:    ret +entry: +  %a = call <vscale x 2 x double> @llvm.riscv.vluxei.mask.nxv2f64.nxv2i64( +    <vscale x 2 x double> %0, +    ptr %1, +    <vscale x 2 x i64> %2, +    <vscale x 2 x i1> %3, +    i64 %4, i64 1) + +  ret <vscale x 2 x double> %a +} + +declare <vscale x 4 x double> @llvm.riscv.vluxei.nxv4f64.nxv4i64( +  <vscale x 4 x double>, +  ptr, +  <vscale x 4 x i64>, +  i64); + +define <vscale x 4 x double> @intrinsic_vluxei_v_nxv4f64_nxv4f64_nxv4i64(ptr %0, <vscale x 4 x i64> %1, i64 %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv4f64_nxv4f64_nxv4i64: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e64, m4, ta, ma +; CHECK-NEXT:    vluxei64.v v8, (a0), v8 +; CHECK-NEXT:    ret +entry: +  %a = call <vscale x 4 x double> @llvm.riscv.vluxei.nxv4f64.nxv4i64( +    <vscale x 4 x double> poison, +    ptr %0, +    
<vscale x 4 x i64> %1, +    i64 %2) + +  ret <vscale x 4 x double> %a +} + +declare <vscale x 4 x double> @llvm.riscv.vluxei.mask.nxv4f64.nxv4i64( +  <vscale x 4 x double>, +  ptr, +  <vscale x 4 x i64>, +  <vscale x 4 x i1>, +  i64, +  i64); + +define <vscale x 4 x double> @intrinsic_vluxei_mask_v_nxv4f64_nxv4f64_nxv4i64(<vscale x 4 x double> %0, ptr %1, <vscale x 4 x i64> %2, <vscale x 4 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv4f64_nxv4f64_nxv4i64: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e64, m4, ta, mu +; CHECK-NEXT:    vluxei64.v v8, (a0), v12, v0.t +; CHECK-NEXT:    ret +entry: +  %a = call <vscale x 4 x double> @llvm.riscv.vluxei.mask.nxv4f64.nxv4i64( +    <vscale x 4 x double> %0, +    ptr %1, +    <vscale x 4 x i64> %2, +    <vscale x 4 x i1> %3, +    i64 %4, i64 1) + +  ret <vscale x 4 x double> %a +} + +declare <vscale x 8 x double> @llvm.riscv.vluxei.nxv8f64.nxv8i64( +  <vscale x 8 x double>, +  ptr, +  <vscale x 8 x i64>, +  i64); + +define <vscale x 8 x double> @intrinsic_vluxei_v_nxv8f64_nxv8f64_nxv8i64(ptr %0, <vscale x 8 x i64> %1, i64 %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv8f64_nxv8f64_nxv8i64: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e64, m8, ta, ma +; CHECK-NEXT:    vluxei64.v v8, (a0), v8 +; CHECK-NEXT:    ret +entry: +  %a = call <vscale x 8 x double> @llvm.riscv.vluxei.nxv8f64.nxv8i64( +    <vscale x 8 x double> poison, +    ptr %0, +    <vscale x 8 x i64> %1, +    i64 %2) + +  ret <vscale x 8 x double> %a +} + +declare <vscale x 8 x double> @llvm.riscv.vluxei.mask.nxv8f64.nxv8i64( +  <vscale x 8 x double>, +  ptr, +  <vscale x 8 x i64>, +  <vscale x 8 x i1>, +  i64, +  i64); + +define <vscale x 8 x double> @intrinsic_vluxei_mask_v_nxv8f64_nxv8f64_nxv8i64(<vscale x 8 x double> %0, ptr %1, <vscale x 8 x i64> %2, <vscale x 8 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv8f64_nxv8f64_nxv8i64: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e64, m8, ta, mu +; CHECK-NEXT:    vluxei64.v v8, (a0), v16, v0.t +; CHECK-NEXT:    ret +entry: +  %a = call <vscale x 8 x double> @llvm.riscv.vluxei.mask.nxv8f64.nxv8i64( +    <vscale x 8 x double> %0, +    ptr %1, +    <vscale x 8 x i64> %2, +    <vscale x 8 x i1> %3, +    i64 %4, i64 1) + +  ret <vscale x 8 x double> %a +} diff --git a/llvm/test/CodeGen/RISCV/GlobalISel/rvv/vluxei.ll b/llvm/test/CodeGen/RISCV/GlobalISel/rvv/vluxei.ll new file mode 100644 index 0000000..8dd32a1 --- /dev/null +++ b/llvm/test/CodeGen/RISCV/GlobalISel/rvv/vluxei.ll @@ -0,0 +1,5100 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: sed 's/iXLen/i32/g' %s | llc -mtriple=riscv32 -mattr=+v,+zvfhmin,+zvfbfmin \ +; RUN:   -global-isel -verify-machineinstrs -target-abi=ilp32d | FileCheck %s +; RUN: sed 's/iXLen/i64/g' %s | llc -mtriple=riscv64 -mattr=+v,+zvfhmin,+zvfbfmin \ +; RUN:   -global-isel -verify-machineinstrs -target-abi=lp64d | FileCheck %s + +declare <vscale x 1 x i8> @llvm.riscv.vluxei.nxv1i8.nxv1i32( +  <vscale x 1 x i8>, +  ptr, +  <vscale x 1 x i32>, +  iXLen); + +define <vscale x 1 x i8> @intrinsic_vluxei_v_nxv1i8_nxv1i8_nxv1i32(ptr %0, <vscale x 1 x i32> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv1i8_nxv1i8_nxv1i32: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e8, mf8, ta, ma +; CHECK-NEXT:    vluxei32.v v9, (a0), v8 +; CHECK-NEXT:    vmv1r.v v8, v9 +; CHECK-NEXT:    ret +entry: +  %a = call <vscale x 1 x i8> 
@llvm.riscv.vluxei.nxv1i8.nxv1i32( +    <vscale x 1 x i8> poison, +    ptr %0, +    <vscale x 1 x i32> %1, +    iXLen %2) + +  ret <vscale x 1 x i8> %a +} + +declare <vscale x 1 x i8> @llvm.riscv.vluxei.mask.nxv1i8.nxv1i32( +  <vscale x 1 x i8>, +  ptr, +  <vscale x 1 x i32>, +  <vscale x 1 x i1>, +  iXLen, +  iXLen); + +define <vscale x 1 x i8> @intrinsic_vluxei_mask_v_nxv1i8_nxv1i8_nxv1i32(<vscale x 1 x i8> %0, ptr %1, <vscale x 1 x i32> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv1i8_nxv1i8_nxv1i32: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e8, mf8, ta, mu +; CHECK-NEXT:    vluxei32.v v8, (a0), v9, v0.t +; CHECK-NEXT:    ret +entry: +  %a = call <vscale x 1 x i8> @llvm.riscv.vluxei.mask.nxv1i8.nxv1i32( +    <vscale x 1 x i8> %0, +    ptr %1, +    <vscale x 1 x i32> %2, +    <vscale x 1 x i1> %3, +    iXLen %4, iXLen 1) + +  ret <vscale x 1 x i8> %a +} + +declare <vscale x 2 x i8> @llvm.riscv.vluxei.nxv2i8.nxv2i32( +  <vscale x 2 x i8>, +  ptr, +  <vscale x 2 x i32>, +  iXLen); + +define <vscale x 2 x i8> @intrinsic_vluxei_v_nxv2i8_nxv2i8_nxv2i32(ptr %0, <vscale x 2 x i32> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv2i8_nxv2i8_nxv2i32: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e8, mf4, ta, ma +; CHECK-NEXT:    vluxei32.v v9, (a0), v8 +; CHECK-NEXT:    vmv1r.v v8, v9 +; CHECK-NEXT:    ret +entry: +  %a = call <vscale x 2 x i8> @llvm.riscv.vluxei.nxv2i8.nxv2i32( +    <vscale x 2 x i8> poison, +    ptr %0, +    <vscale x 2 x i32> %1, +    iXLen %2) + +  ret <vscale x 2 x i8> %a +} + +declare <vscale x 2 x i8> @llvm.riscv.vluxei.mask.nxv2i8.nxv2i32( +  <vscale x 2 x i8>, +  ptr, +  <vscale x 2 x i32>, +  <vscale x 2 x i1>, +  iXLen, +  iXLen); + +define <vscale x 2 x i8> @intrinsic_vluxei_mask_v_nxv2i8_nxv2i8_nxv2i32(<vscale x 2 x i8> %0, ptr %1, <vscale x 2 x i32> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv2i8_nxv2i8_nxv2i32: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e8, mf4, ta, mu +; CHECK-NEXT:    vluxei32.v v8, (a0), v9, v0.t +; CHECK-NEXT:    ret +entry: +  %a = call <vscale x 2 x i8> @llvm.riscv.vluxei.mask.nxv2i8.nxv2i32( +    <vscale x 2 x i8> %0, +    ptr %1, +    <vscale x 2 x i32> %2, +    <vscale x 2 x i1> %3, +    iXLen %4, iXLen 1) + +  ret <vscale x 2 x i8> %a +} + +declare <vscale x 4 x i8> @llvm.riscv.vluxei.nxv4i8.nxv4i32( +  <vscale x 4 x i8>, +  ptr, +  <vscale x 4 x i32>, +  iXLen); + +define <vscale x 4 x i8> @intrinsic_vluxei_v_nxv4i8_nxv4i8_nxv4i32(ptr %0, <vscale x 4 x i32> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv4i8_nxv4i8_nxv4i32: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e8, mf2, ta, ma +; CHECK-NEXT:    vluxei32.v v10, (a0), v8 +; CHECK-NEXT:    vmv1r.v v8, v10 +; CHECK-NEXT:    ret +entry: +  %a = call <vscale x 4 x i8> @llvm.riscv.vluxei.nxv4i8.nxv4i32( +    <vscale x 4 x i8> poison, +    ptr %0, +    <vscale x 4 x i32> %1, +    iXLen %2) + +  ret <vscale x 4 x i8> %a +} + +declare <vscale x 4 x i8> @llvm.riscv.vluxei.mask.nxv4i8.nxv4i32( +  <vscale x 4 x i8>, +  ptr, +  <vscale x 4 x i32>, +  <vscale x 4 x i1>, +  iXLen, +  iXLen); + +define <vscale x 4 x i8> @intrinsic_vluxei_mask_v_nxv4i8_nxv4i8_nxv4i32(<vscale x 4 x i8> %0, ptr %1, <vscale x 4 x i32> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv4i8_nxv4i8_nxv4i32: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    
vsetvli zero, a1, e8, mf2, ta, mu +; CHECK-NEXT:    vluxei32.v v8, (a0), v10, v0.t +; CHECK-NEXT:    ret +entry: +  %a = call <vscale x 4 x i8> @llvm.riscv.vluxei.mask.nxv4i8.nxv4i32( +    <vscale x 4 x i8> %0, +    ptr %1, +    <vscale x 4 x i32> %2, +    <vscale x 4 x i1> %3, +    iXLen %4, iXLen 1) + +  ret <vscale x 4 x i8> %a +} + +declare <vscale x 8 x i8> @llvm.riscv.vluxei.nxv8i8.nxv8i32( +  <vscale x 8 x i8>, +  ptr, +  <vscale x 8 x i32>, +  iXLen); + +define <vscale x 8 x i8> @intrinsic_vluxei_v_nxv8i8_nxv8i8_nxv8i32(ptr %0, <vscale x 8 x i32> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv8i8_nxv8i8_nxv8i32: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e8, m1, ta, ma +; CHECK-NEXT:    vluxei32.v v12, (a0), v8 +; CHECK-NEXT:    vmv.v.v v8, v12 +; CHECK-NEXT:    ret +entry: +  %a = call <vscale x 8 x i8> @llvm.riscv.vluxei.nxv8i8.nxv8i32( +    <vscale x 8 x i8> poison, +    ptr %0, +    <vscale x 8 x i32> %1, +    iXLen %2) + +  ret <vscale x 8 x i8> %a +} + +declare <vscale x 8 x i8> @llvm.riscv.vluxei.mask.nxv8i8.nxv8i32( +  <vscale x 8 x i8>, +  ptr, +  <vscale x 8 x i32>, +  <vscale x 8 x i1>, +  iXLen, +  iXLen); + +define <vscale x 8 x i8> @intrinsic_vluxei_mask_v_nxv8i8_nxv8i8_nxv8i32(<vscale x 8 x i8> %0, ptr %1, <vscale x 8 x i32> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv8i8_nxv8i8_nxv8i32: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e8, m1, ta, mu +; CHECK-NEXT:    vluxei32.v v8, (a0), v12, v0.t +; CHECK-NEXT:    ret +entry: +  %a = call <vscale x 8 x i8> @llvm.riscv.vluxei.mask.nxv8i8.nxv8i32( +    <vscale x 8 x i8> %0, +    ptr %1, +    <vscale x 8 x i32> %2, +    <vscale x 8 x i1> %3, +    iXLen %4, iXLen 1) + +  ret <vscale x 8 x i8> %a +} + +declare <vscale x 16 x i8> @llvm.riscv.vluxei.nxv16i8.nxv16i32( +  <vscale x 16 x i8>, +  ptr, +  <vscale x 16 x i32>, +  iXLen); + +define <vscale x 16 x i8> @intrinsic_vluxei_v_nxv16i8_nxv16i8_nxv16i32(ptr %0, <vscale x 16 x i32> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv16i8_nxv16i8_nxv16i32: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e8, m2, ta, ma +; CHECK-NEXT:    vluxei32.v v16, (a0), v8 +; CHECK-NEXT:    vmv.v.v v8, v16 +; CHECK-NEXT:    ret +entry: +  %a = call <vscale x 16 x i8> @llvm.riscv.vluxei.nxv16i8.nxv16i32( +    <vscale x 16 x i8> poison, +    ptr %0, +    <vscale x 16 x i32> %1, +    iXLen %2) + +  ret <vscale x 16 x i8> %a +} + +declare <vscale x 16 x i8> @llvm.riscv.vluxei.mask.nxv16i8.nxv16i32( +  <vscale x 16 x i8>, +  ptr, +  <vscale x 16 x i32>, +  <vscale x 16 x i1>, +  iXLen, +  iXLen); + +define <vscale x 16 x i8> @intrinsic_vluxei_mask_v_nxv16i8_nxv16i8_nxv16i32(<vscale x 16 x i8> %0, ptr %1, <vscale x 16 x i32> %2, <vscale x 16 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv16i8_nxv16i8_nxv16i32: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e8, m2, ta, mu +; CHECK-NEXT:    vluxei32.v v8, (a0), v16, v0.t +; CHECK-NEXT:    ret +entry: +  %a = call <vscale x 16 x i8> @llvm.riscv.vluxei.mask.nxv16i8.nxv16i32( +    <vscale x 16 x i8> %0, +    ptr %1, +    <vscale x 16 x i32> %2, +    <vscale x 16 x i1> %3, +    iXLen %4, iXLen 1) + +  ret <vscale x 16 x i8> %a +} + +declare <vscale x 1 x i16> @llvm.riscv.vluxei.nxv1i16.nxv1i32( +  <vscale x 1 x i16>, +  ptr, +  <vscale x 1 x i32>, +  iXLen); + +define <vscale x 1 x i16> @intrinsic_vluxei_v_nxv1i16_nxv1i16_nxv1i32(ptr %0, <vscale x 1 x i32> 
%1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv1i16_nxv1i16_nxv1i32: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e16, mf4, ta, ma +; CHECK-NEXT:    vluxei32.v v9, (a0), v8 +; CHECK-NEXT:    vmv1r.v v8, v9 +; CHECK-NEXT:    ret +entry: +  %a = call <vscale x 1 x i16> @llvm.riscv.vluxei.nxv1i16.nxv1i32( +    <vscale x 1 x i16> poison, +    ptr %0, +    <vscale x 1 x i32> %1, +    iXLen %2) + +  ret <vscale x 1 x i16> %a +} + +declare <vscale x 1 x i16> @llvm.riscv.vluxei.mask.nxv1i16.nxv1i32( +  <vscale x 1 x i16>, +  ptr, +  <vscale x 1 x i32>, +  <vscale x 1 x i1>, +  iXLen, +  iXLen); + +define <vscale x 1 x i16> @intrinsic_vluxei_mask_v_nxv1i16_nxv1i16_nxv1i32(<vscale x 1 x i16> %0, ptr %1, <vscale x 1 x i32> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv1i16_nxv1i16_nxv1i32: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e16, mf4, ta, mu +; CHECK-NEXT:    vluxei32.v v8, (a0), v9, v0.t +; CHECK-NEXT:    ret +entry: +  %a = call <vscale x 1 x i16> @llvm.riscv.vluxei.mask.nxv1i16.nxv1i32( +    <vscale x 1 x i16> %0, +    ptr %1, +    <vscale x 1 x i32> %2, +    <vscale x 1 x i1> %3, +    iXLen %4, iXLen 1) + +  ret <vscale x 1 x i16> %a +} + +declare <vscale x 2 x i16> @llvm.riscv.vluxei.nxv2i16.nxv2i32( +  <vscale x 2 x i16>, +  ptr, +  <vscale x 2 x i32>, +  iXLen); + +define <vscale x 2 x i16> @intrinsic_vluxei_v_nxv2i16_nxv2i16_nxv2i32(ptr %0, <vscale x 2 x i32> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv2i16_nxv2i16_nxv2i32: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e16, mf2, ta, ma +; CHECK-NEXT:    vluxei32.v v9, (a0), v8 +; CHECK-NEXT:    vmv1r.v v8, v9 +; CHECK-NEXT:    ret +entry: +  %a = call <vscale x 2 x i16> @llvm.riscv.vluxei.nxv2i16.nxv2i32( +    <vscale x 2 x i16> poison, +    ptr %0, +    <vscale x 2 x i32> %1, +    iXLen %2) + +  ret <vscale x 2 x i16> %a +} + +declare <vscale x 2 x i16> @llvm.riscv.vluxei.mask.nxv2i16.nxv2i32( +  <vscale x 2 x i16>, +  ptr, +  <vscale x 2 x i32>, +  <vscale x 2 x i1>, +  iXLen, +  iXLen); + +define <vscale x 2 x i16> @intrinsic_vluxei_mask_v_nxv2i16_nxv2i16_nxv2i32(<vscale x 2 x i16> %0, ptr %1, <vscale x 2 x i32> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv2i16_nxv2i16_nxv2i32: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e16, mf2, ta, mu +; CHECK-NEXT:    vluxei32.v v8, (a0), v9, v0.t +; CHECK-NEXT:    ret +entry: +  %a = call <vscale x 2 x i16> @llvm.riscv.vluxei.mask.nxv2i16.nxv2i32( +    <vscale x 2 x i16> %0, +    ptr %1, +    <vscale x 2 x i32> %2, +    <vscale x 2 x i1> %3, +    iXLen %4, iXLen 1) + +  ret <vscale x 2 x i16> %a +} + +declare <vscale x 4 x i16> @llvm.riscv.vluxei.nxv4i16.nxv4i32( +  <vscale x 4 x i16>, +  ptr, +  <vscale x 4 x i32>, +  iXLen); + +define <vscale x 4 x i16> @intrinsic_vluxei_v_nxv4i16_nxv4i16_nxv4i32(ptr %0, <vscale x 4 x i32> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv4i16_nxv4i16_nxv4i32: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e16, m1, ta, ma +; CHECK-NEXT:    vluxei32.v v10, (a0), v8 +; CHECK-NEXT:    vmv.v.v v8, v10 +; CHECK-NEXT:    ret +entry: +  %a = call <vscale x 4 x i16> @llvm.riscv.vluxei.nxv4i16.nxv4i32( +    <vscale x 4 x i16> poison, +    ptr %0, +    <vscale x 4 x i32> %1, +    iXLen %2) + +  ret <vscale x 4 x i16> %a +} + +declare <vscale x 4 x i16> @llvm.riscv.vluxei.mask.nxv4i16.nxv4i32( +  <vscale x 4 x 
i16>, +  ptr, +  <vscale x 4 x i32>, +  <vscale x 4 x i1>, +  iXLen, +  iXLen); + +define <vscale x 4 x i16> @intrinsic_vluxei_mask_v_nxv4i16_nxv4i16_nxv4i32(<vscale x 4 x i16> %0, ptr %1, <vscale x 4 x i32> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv4i16_nxv4i16_nxv4i32: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e16, m1, ta, mu +; CHECK-NEXT:    vluxei32.v v8, (a0), v10, v0.t +; CHECK-NEXT:    ret +entry: +  %a = call <vscale x 4 x i16> @llvm.riscv.vluxei.mask.nxv4i16.nxv4i32( +    <vscale x 4 x i16> %0, +    ptr %1, +    <vscale x 4 x i32> %2, +    <vscale x 4 x i1> %3, +    iXLen %4, iXLen 1) + +  ret <vscale x 4 x i16> %a +} + +declare <vscale x 8 x i16> @llvm.riscv.vluxei.nxv8i16.nxv8i32( +  <vscale x 8 x i16>, +  ptr, +  <vscale x 8 x i32>, +  iXLen); + +define <vscale x 8 x i16> @intrinsic_vluxei_v_nxv8i16_nxv8i16_nxv8i32(ptr %0, <vscale x 8 x i32> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv8i16_nxv8i16_nxv8i32: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e16, m2, ta, ma +; CHECK-NEXT:    vluxei32.v v12, (a0), v8 +; CHECK-NEXT:    vmv.v.v v8, v12 +; CHECK-NEXT:    ret +entry: +  %a = call <vscale x 8 x i16> @llvm.riscv.vluxei.nxv8i16.nxv8i32( +    <vscale x 8 x i16> poison, +    ptr %0, +    <vscale x 8 x i32> %1, +    iXLen %2) + +  ret <vscale x 8 x i16> %a +} + +declare <vscale x 8 x i16> @llvm.riscv.vluxei.mask.nxv8i16.nxv8i32( +  <vscale x 8 x i16>, +  ptr, +  <vscale x 8 x i32>, +  <vscale x 8 x i1>, +  iXLen, +  iXLen); + +define <vscale x 8 x i16> @intrinsic_vluxei_mask_v_nxv8i16_nxv8i16_nxv8i32(<vscale x 8 x i16> %0, ptr %1, <vscale x 8 x i32> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv8i16_nxv8i16_nxv8i32: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e16, m2, ta, mu +; CHECK-NEXT:    vluxei32.v v8, (a0), v12, v0.t +; CHECK-NEXT:    ret +entry: +  %a = call <vscale x 8 x i16> @llvm.riscv.vluxei.mask.nxv8i16.nxv8i32( +    <vscale x 8 x i16> %0, +    ptr %1, +    <vscale x 8 x i32> %2, +    <vscale x 8 x i1> %3, +    iXLen %4, iXLen 1) + +  ret <vscale x 8 x i16> %a +} + +declare <vscale x 16 x i16> @llvm.riscv.vluxei.nxv16i16.nxv16i32( +  <vscale x 16 x i16>, +  ptr, +  <vscale x 16 x i32>, +  iXLen); + +define <vscale x 16 x i16> @intrinsic_vluxei_v_nxv16i16_nxv16i16_nxv16i32(ptr %0, <vscale x 16 x i32> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv16i16_nxv16i16_nxv16i32: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e16, m4, ta, ma +; CHECK-NEXT:    vluxei32.v v16, (a0), v8 +; CHECK-NEXT:    vmv.v.v v8, v16 +; CHECK-NEXT:    ret +entry: +  %a = call <vscale x 16 x i16> @llvm.riscv.vluxei.nxv16i16.nxv16i32( +    <vscale x 16 x i16> poison, +    ptr %0, +    <vscale x 16 x i32> %1, +    iXLen %2) + +  ret <vscale x 16 x i16> %a +} + +declare <vscale x 16 x i16> @llvm.riscv.vluxei.mask.nxv16i16.nxv16i32( +  <vscale x 16 x i16>, +  ptr, +  <vscale x 16 x i32>, +  <vscale x 16 x i1>, +  iXLen, +  iXLen); + +define <vscale x 16 x i16> @intrinsic_vluxei_mask_v_nxv16i16_nxv16i16_nxv16i32(<vscale x 16 x i16> %0, ptr %1, <vscale x 16 x i32> %2, <vscale x 16 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv16i16_nxv16i16_nxv16i32: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e16, m4, ta, mu +; CHECK-NEXT:    vluxei32.v v8, (a0), v16, v0.t +; CHECK-NEXT:    ret +entry: +  %a = call <vscale x 16 x i16> 
@llvm.riscv.vluxei.mask.nxv16i16.nxv16i32( +    <vscale x 16 x i16> %0, +    ptr %1, +    <vscale x 16 x i32> %2, +    <vscale x 16 x i1> %3, +    iXLen %4, iXLen 1) + +  ret <vscale x 16 x i16> %a +} + +declare <vscale x 1 x i32> @llvm.riscv.vluxei.nxv1i32.nxv1i32( +  <vscale x 1 x i32>, +  ptr, +  <vscale x 1 x i32>, +  iXLen); + +define <vscale x 1 x i32> @intrinsic_vluxei_v_nxv1i32_nxv1i32_nxv1i32(ptr %0, <vscale x 1 x i32> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv1i32_nxv1i32_nxv1i32: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e32, mf2, ta, ma +; CHECK-NEXT:    vluxei32.v v8, (a0), v8 +; CHECK-NEXT:    ret +entry: +  %a = call <vscale x 1 x i32> @llvm.riscv.vluxei.nxv1i32.nxv1i32( +    <vscale x 1 x i32> poison, +    ptr %0, +    <vscale x 1 x i32> %1, +    iXLen %2) + +  ret <vscale x 1 x i32> %a +} + +declare <vscale x 1 x i32> @llvm.riscv.vluxei.mask.nxv1i32.nxv1i32( +  <vscale x 1 x i32>, +  ptr, +  <vscale x 1 x i32>, +  <vscale x 1 x i1>, +  iXLen, +  iXLen); + +define <vscale x 1 x i32> @intrinsic_vluxei_mask_v_nxv1i32_nxv1i32_nxv1i32(<vscale x 1 x i32> %0, ptr %1, <vscale x 1 x i32> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv1i32_nxv1i32_nxv1i32: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e32, mf2, ta, mu +; CHECK-NEXT:    vluxei32.v v8, (a0), v9, v0.t +; CHECK-NEXT:    ret +entry: +  %a = call <vscale x 1 x i32> @llvm.riscv.vluxei.mask.nxv1i32.nxv1i32( +    <vscale x 1 x i32> %0, +    ptr %1, +    <vscale x 1 x i32> %2, +    <vscale x 1 x i1> %3, +    iXLen %4, iXLen 1) + +  ret <vscale x 1 x i32> %a +} + +declare <vscale x 2 x i32> @llvm.riscv.vluxei.nxv2i32.nxv2i32( +  <vscale x 2 x i32>, +  ptr, +  <vscale x 2 x i32>, +  iXLen); + +define <vscale x 2 x i32> @intrinsic_vluxei_v_nxv2i32_nxv2i32_nxv2i32(ptr %0, <vscale x 2 x i32> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv2i32_nxv2i32_nxv2i32: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e32, m1, ta, ma +; CHECK-NEXT:    vluxei32.v v8, (a0), v8 +; CHECK-NEXT:    ret +entry: +  %a = call <vscale x 2 x i32> @llvm.riscv.vluxei.nxv2i32.nxv2i32( +    <vscale x 2 x i32> poison, +    ptr %0, +    <vscale x 2 x i32> %1, +    iXLen %2) + +  ret <vscale x 2 x i32> %a +} + +declare <vscale x 2 x i32> @llvm.riscv.vluxei.mask.nxv2i32.nxv2i32( +  <vscale x 2 x i32>, +  ptr, +  <vscale x 2 x i32>, +  <vscale x 2 x i1>, +  iXLen, +  iXLen); + +define <vscale x 2 x i32> @intrinsic_vluxei_mask_v_nxv2i32_nxv2i32_nxv2i32(<vscale x 2 x i32> %0, ptr %1, <vscale x 2 x i32> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv2i32_nxv2i32_nxv2i32: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e32, m1, ta, mu +; CHECK-NEXT:    vluxei32.v v8, (a0), v9, v0.t +; CHECK-NEXT:    ret +entry: +  %a = call <vscale x 2 x i32> @llvm.riscv.vluxei.mask.nxv2i32.nxv2i32( +    <vscale x 2 x i32> %0, +    ptr %1, +    <vscale x 2 x i32> %2, +    <vscale x 2 x i1> %3, +    iXLen %4, iXLen 1) + +  ret <vscale x 2 x i32> %a +} + +declare <vscale x 4 x i32> @llvm.riscv.vluxei.nxv4i32.nxv4i32( +  <vscale x 4 x i32>, +  ptr, +  <vscale x 4 x i32>, +  iXLen); + +define <vscale x 4 x i32> @intrinsic_vluxei_v_nxv4i32_nxv4i32_nxv4i32(ptr %0, <vscale x 4 x i32> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv4i32_nxv4i32_nxv4i32: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e32, m2, ta, ma +; CHECK-NEXT:    
vluxei32.v v8, (a0), v8 +; CHECK-NEXT:    ret +entry: +  %a = call <vscale x 4 x i32> @llvm.riscv.vluxei.nxv4i32.nxv4i32( +    <vscale x 4 x i32> poison, +    ptr %0, +    <vscale x 4 x i32> %1, +    iXLen %2) + +  ret <vscale x 4 x i32> %a +} + +declare <vscale x 4 x i32> @llvm.riscv.vluxei.mask.nxv4i32.nxv4i32( +  <vscale x 4 x i32>, +  ptr, +  <vscale x 4 x i32>, +  <vscale x 4 x i1>, +  iXLen, +  iXLen); + +define <vscale x 4 x i32> @intrinsic_vluxei_mask_v_nxv4i32_nxv4i32_nxv4i32(<vscale x 4 x i32> %0, ptr %1, <vscale x 4 x i32> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv4i32_nxv4i32_nxv4i32: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e32, m2, ta, mu +; CHECK-NEXT:    vluxei32.v v8, (a0), v10, v0.t +; CHECK-NEXT:    ret +entry: +  %a = call <vscale x 4 x i32> @llvm.riscv.vluxei.mask.nxv4i32.nxv4i32( +    <vscale x 4 x i32> %0, +    ptr %1, +    <vscale x 4 x i32> %2, +    <vscale x 4 x i1> %3, +    iXLen %4, iXLen 1) + +  ret <vscale x 4 x i32> %a +} + +declare <vscale x 8 x i32> @llvm.riscv.vluxei.nxv8i32.nxv8i32( +  <vscale x 8 x i32>, +  ptr, +  <vscale x 8 x i32>, +  iXLen); + +define <vscale x 8 x i32> @intrinsic_vluxei_v_nxv8i32_nxv8i32_nxv8i32(ptr %0, <vscale x 8 x i32> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv8i32_nxv8i32_nxv8i32: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e32, m4, ta, ma +; CHECK-NEXT:    vluxei32.v v8, (a0), v8 +; CHECK-NEXT:    ret +entry: +  %a = call <vscale x 8 x i32> @llvm.riscv.vluxei.nxv8i32.nxv8i32( +    <vscale x 8 x i32> poison, +    ptr %0, +    <vscale x 8 x i32> %1, +    iXLen %2) + +  ret <vscale x 8 x i32> %a +} + +declare <vscale x 8 x i32> @llvm.riscv.vluxei.mask.nxv8i32.nxv8i32( +  <vscale x 8 x i32>, +  ptr, +  <vscale x 8 x i32>, +  <vscale x 8 x i1>, +  iXLen, +  iXLen); + +define <vscale x 8 x i32> @intrinsic_vluxei_mask_v_nxv8i32_nxv8i32_nxv8i32(<vscale x 8 x i32> %0, ptr %1, <vscale x 8 x i32> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv8i32_nxv8i32_nxv8i32: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e32, m4, ta, mu +; CHECK-NEXT:    vluxei32.v v8, (a0), v12, v0.t +; CHECK-NEXT:    ret +entry: +  %a = call <vscale x 8 x i32> @llvm.riscv.vluxei.mask.nxv8i32.nxv8i32( +    <vscale x 8 x i32> %0, +    ptr %1, +    <vscale x 8 x i32> %2, +    <vscale x 8 x i1> %3, +    iXLen %4, iXLen 1) + +  ret <vscale x 8 x i32> %a +} + +declare <vscale x 16 x i32> @llvm.riscv.vluxei.nxv16i32.nxv16i32( +  <vscale x 16 x i32>, +  ptr, +  <vscale x 16 x i32>, +  iXLen); + +define <vscale x 16 x i32> @intrinsic_vluxei_v_nxv16i32_nxv16i32_nxv16i32(ptr %0, <vscale x 16 x i32> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv16i32_nxv16i32_nxv16i32: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e32, m8, ta, ma +; CHECK-NEXT:    vluxei32.v v8, (a0), v8 +; CHECK-NEXT:    ret +entry: +  %a = call <vscale x 16 x i32> @llvm.riscv.vluxei.nxv16i32.nxv16i32( +    <vscale x 16 x i32> poison, +    ptr %0, +    <vscale x 16 x i32> %1, +    iXLen %2) + +  ret <vscale x 16 x i32> %a +} + +declare <vscale x 16 x i32> @llvm.riscv.vluxei.mask.nxv16i32.nxv16i32( +  <vscale x 16 x i32>, +  ptr, +  <vscale x 16 x i32>, +  <vscale x 16 x i1>, +  iXLen, +  iXLen); + +define <vscale x 16 x i32> @intrinsic_vluxei_mask_v_nxv16i32_nxv16i32_nxv16i32(<vscale x 16 x i32> %0, ptr %1, <vscale x 16 x i32> %2, <vscale x 16 x i1> %3, iXLen %4) nounwind { +; 
CHECK-LABEL: intrinsic_vluxei_mask_v_nxv16i32_nxv16i32_nxv16i32: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e32, m8, ta, mu +; CHECK-NEXT:    vluxei32.v v8, (a0), v16, v0.t +; CHECK-NEXT:    ret +entry: +  %a = call <vscale x 16 x i32> @llvm.riscv.vluxei.mask.nxv16i32.nxv16i32( +    <vscale x 16 x i32> %0, +    ptr %1, +    <vscale x 16 x i32> %2, +    <vscale x 16 x i1> %3, +    iXLen %4, iXLen 1) + +  ret <vscale x 16 x i32> %a +} + +declare <vscale x 1 x i64> @llvm.riscv.vluxei.nxv1i64.nxv1i32( +  <vscale x 1 x i64>, +  ptr, +  <vscale x 1 x i32>, +  iXLen); + +define <vscale x 1 x i64> @intrinsic_vluxei_v_nxv1i64_nxv1i64_nxv1i32(ptr %0, <vscale x 1 x i32> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv1i64_nxv1i64_nxv1i32: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e64, m1, ta, ma +; CHECK-NEXT:    vluxei32.v v9, (a0), v8 +; CHECK-NEXT:    vmv.v.v v8, v9 +; CHECK-NEXT:    ret +entry: +  %a = call <vscale x 1 x i64> @llvm.riscv.vluxei.nxv1i64.nxv1i32( +    <vscale x 1 x i64> poison, +    ptr %0, +    <vscale x 1 x i32> %1, +    iXLen %2) + +  ret <vscale x 1 x i64> %a +} + +declare <vscale x 1 x i64> @llvm.riscv.vluxei.mask.nxv1i64.nxv1i32( +  <vscale x 1 x i64>, +  ptr, +  <vscale x 1 x i32>, +  <vscale x 1 x i1>, +  iXLen, +  iXLen); + +define <vscale x 1 x i64> @intrinsic_vluxei_mask_v_nxv1i64_nxv1i64_nxv1i32(<vscale x 1 x i64> %0, ptr %1, <vscale x 1 x i32> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv1i64_nxv1i64_nxv1i32: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e64, m1, ta, mu +; CHECK-NEXT:    vluxei32.v v8, (a0), v9, v0.t +; CHECK-NEXT:    ret +entry: +  %a = call <vscale x 1 x i64> @llvm.riscv.vluxei.mask.nxv1i64.nxv1i32( +    <vscale x 1 x i64> %0, +    ptr %1, +    <vscale x 1 x i32> %2, +    <vscale x 1 x i1> %3, +    iXLen %4, iXLen 1) + +  ret <vscale x 1 x i64> %a +} + +declare <vscale x 2 x i64> @llvm.riscv.vluxei.nxv2i64.nxv2i32( +  <vscale x 2 x i64>, +  ptr, +  <vscale x 2 x i32>, +  iXLen); + +define <vscale x 2 x i64> @intrinsic_vluxei_v_nxv2i64_nxv2i64_nxv2i32(ptr %0, <vscale x 2 x i32> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv2i64_nxv2i64_nxv2i32: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetivli zero, 1, e8, m1, ta, ma +; CHECK-NEXT:    vmv1r.v v10, v8 +; CHECK-NEXT:    vsetvli zero, a1, e64, m2, ta, ma +; CHECK-NEXT:    vluxei32.v v8, (a0), v10 +; CHECK-NEXT:    ret +entry: +  %a = call <vscale x 2 x i64> @llvm.riscv.vluxei.nxv2i64.nxv2i32( +    <vscale x 2 x i64> poison, +    ptr %0, +    <vscale x 2 x i32> %1, +    iXLen %2) + +  ret <vscale x 2 x i64> %a +} + +declare <vscale x 2 x i64> @llvm.riscv.vluxei.mask.nxv2i64.nxv2i32( +  <vscale x 2 x i64>, +  ptr, +  <vscale x 2 x i32>, +  <vscale x 2 x i1>, +  iXLen, +  iXLen); + +define <vscale x 2 x i64> @intrinsic_vluxei_mask_v_nxv2i64_nxv2i64_nxv2i32(<vscale x 2 x i64> %0, ptr %1, <vscale x 2 x i32> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv2i64_nxv2i64_nxv2i32: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e64, m2, ta, mu +; CHECK-NEXT:    vluxei32.v v8, (a0), v10, v0.t +; CHECK-NEXT:    ret +entry: +  %a = call <vscale x 2 x i64> @llvm.riscv.vluxei.mask.nxv2i64.nxv2i32( +    <vscale x 2 x i64> %0, +    ptr %1, +    <vscale x 2 x i32> %2, +    <vscale x 2 x i1> %3, +    iXLen %4, iXLen 1) + +  ret <vscale x 2 x i64> %a +} + +declare <vscale x 4 x i64> 
@llvm.riscv.vluxei.nxv4i64.nxv4i32( +  <vscale x 4 x i64>, +  ptr, +  <vscale x 4 x i32>, +  iXLen); + +define <vscale x 4 x i64> @intrinsic_vluxei_v_nxv4i64_nxv4i64_nxv4i32(ptr %0, <vscale x 4 x i32> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv4i64_nxv4i64_nxv4i32: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetivli zero, 1, e8, m1, ta, ma +; CHECK-NEXT:    vmv2r.v v12, v8 +; CHECK-NEXT:    vsetvli zero, a1, e64, m4, ta, ma +; CHECK-NEXT:    vluxei32.v v8, (a0), v12 +; CHECK-NEXT:    ret +entry: +  %a = call <vscale x 4 x i64> @llvm.riscv.vluxei.nxv4i64.nxv4i32( +    <vscale x 4 x i64> poison, +    ptr %0, +    <vscale x 4 x i32> %1, +    iXLen %2) + +  ret <vscale x 4 x i64> %a +} + +declare <vscale x 4 x i64> @llvm.riscv.vluxei.mask.nxv4i64.nxv4i32( +  <vscale x 4 x i64>, +  ptr, +  <vscale x 4 x i32>, +  <vscale x 4 x i1>, +  iXLen, +  iXLen); + +define <vscale x 4 x i64> @intrinsic_vluxei_mask_v_nxv4i64_nxv4i64_nxv4i32(<vscale x 4 x i64> %0, ptr %1, <vscale x 4 x i32> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv4i64_nxv4i64_nxv4i32: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e64, m4, ta, mu +; CHECK-NEXT:    vluxei32.v v8, (a0), v12, v0.t +; CHECK-NEXT:    ret +entry: +  %a = call <vscale x 4 x i64> @llvm.riscv.vluxei.mask.nxv4i64.nxv4i32( +    <vscale x 4 x i64> %0, +    ptr %1, +    <vscale x 4 x i32> %2, +    <vscale x 4 x i1> %3, +    iXLen %4, iXLen 1) + +  ret <vscale x 4 x i64> %a +} + +declare <vscale x 8 x i64> @llvm.riscv.vluxei.nxv8i64.nxv8i32( +  <vscale x 8 x i64>, +  ptr, +  <vscale x 8 x i32>, +  iXLen); + +define <vscale x 8 x i64> @intrinsic_vluxei_v_nxv8i64_nxv8i64_nxv8i32(ptr %0, <vscale x 8 x i32> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv8i64_nxv8i64_nxv8i32: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetivli zero, 1, e8, m1, ta, ma +; CHECK-NEXT:    vmv4r.v v16, v8 +; CHECK-NEXT:    vsetvli zero, a1, e64, m8, ta, ma +; CHECK-NEXT:    vluxei32.v v8, (a0), v16 +; CHECK-NEXT:    ret +entry: +  %a = call <vscale x 8 x i64> @llvm.riscv.vluxei.nxv8i64.nxv8i32( +    <vscale x 8 x i64> poison, +    ptr %0, +    <vscale x 8 x i32> %1, +    iXLen %2) + +  ret <vscale x 8 x i64> %a +} + +declare <vscale x 8 x i64> @llvm.riscv.vluxei.mask.nxv8i64.nxv8i32( +  <vscale x 8 x i64>, +  ptr, +  <vscale x 8 x i32>, +  <vscale x 8 x i1>, +  iXLen, +  iXLen); + +define <vscale x 8 x i64> @intrinsic_vluxei_mask_v_nxv8i64_nxv8i64_nxv8i32(<vscale x 8 x i64> %0, ptr %1, <vscale x 8 x i32> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv8i64_nxv8i64_nxv8i32: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e64, m8, ta, mu +; CHECK-NEXT:    vluxei32.v v8, (a0), v16, v0.t +; CHECK-NEXT:    ret +entry: +  %a = call <vscale x 8 x i64> @llvm.riscv.vluxei.mask.nxv8i64.nxv8i32( +    <vscale x 8 x i64> %0, +    ptr %1, +    <vscale x 8 x i32> %2, +    <vscale x 8 x i1> %3, +    iXLen %4, iXLen 1) + +  ret <vscale x 8 x i64> %a +} + +declare <vscale x 1 x half> @llvm.riscv.vluxei.nxv1f16.nxv1i32( +  <vscale x 1 x half>, +  ptr, +  <vscale x 1 x i32>, +  iXLen); + +define <vscale x 1 x half> @intrinsic_vluxei_v_nxv1f16_nxv1f16_nxv1i32(ptr %0, <vscale x 1 x i32> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv1f16_nxv1f16_nxv1i32: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e16, mf4, ta, ma +; CHECK-NEXT:    vluxei32.v v9, (a0), v8 +; CHECK-NEXT:    vmv1r.v v8, v9 +; 
CHECK-NEXT:    ret +entry: +  %a = call <vscale x 1 x half> @llvm.riscv.vluxei.nxv1f16.nxv1i32( +    <vscale x 1 x half> poison, +    ptr %0, +    <vscale x 1 x i32> %1, +    iXLen %2) + +  ret <vscale x 1 x half> %a +} + +declare <vscale x 1 x half> @llvm.riscv.vluxei.mask.nxv1f16.nxv1i32( +  <vscale x 1 x half>, +  ptr, +  <vscale x 1 x i32>, +  <vscale x 1 x i1>, +  iXLen, +  iXLen); + +define <vscale x 1 x half> @intrinsic_vluxei_mask_v_nxv1f16_nxv1f16_nxv1i32(<vscale x 1 x half> %0, ptr %1, <vscale x 1 x i32> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv1f16_nxv1f16_nxv1i32: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e16, mf4, ta, mu +; CHECK-NEXT:    vluxei32.v v8, (a0), v9, v0.t +; CHECK-NEXT:    ret +entry: +  %a = call <vscale x 1 x half> @llvm.riscv.vluxei.mask.nxv1f16.nxv1i32( +    <vscale x 1 x half> %0, +    ptr %1, +    <vscale x 1 x i32> %2, +    <vscale x 1 x i1> %3, +    iXLen %4, iXLen 1) + +  ret <vscale x 1 x half> %a +} + +declare <vscale x 2 x half> @llvm.riscv.vluxei.nxv2f16.nxv2i32( +  <vscale x 2 x half>, +  ptr, +  <vscale x 2 x i32>, +  iXLen); + +define <vscale x 2 x half> @intrinsic_vluxei_v_nxv2f16_nxv2f16_nxv2i32(ptr %0, <vscale x 2 x i32> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv2f16_nxv2f16_nxv2i32: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e16, mf2, ta, ma +; CHECK-NEXT:    vluxei32.v v9, (a0), v8 +; CHECK-NEXT:    vmv1r.v v8, v9 +; CHECK-NEXT:    ret +entry: +  %a = call <vscale x 2 x half> @llvm.riscv.vluxei.nxv2f16.nxv2i32( +    <vscale x 2 x half> poison, +    ptr %0, +    <vscale x 2 x i32> %1, +    iXLen %2) + +  ret <vscale x 2 x half> %a +} + +declare <vscale x 2 x half> @llvm.riscv.vluxei.mask.nxv2f16.nxv2i32( +  <vscale x 2 x half>, +  ptr, +  <vscale x 2 x i32>, +  <vscale x 2 x i1>, +  iXLen, +  iXLen); + +define <vscale x 2 x half> @intrinsic_vluxei_mask_v_nxv2f16_nxv2f16_nxv2i32(<vscale x 2 x half> %0, ptr %1, <vscale x 2 x i32> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv2f16_nxv2f16_nxv2i32: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e16, mf2, ta, mu +; CHECK-NEXT:    vluxei32.v v8, (a0), v9, v0.t +; CHECK-NEXT:    ret +entry: +  %a = call <vscale x 2 x half> @llvm.riscv.vluxei.mask.nxv2f16.nxv2i32( +    <vscale x 2 x half> %0, +    ptr %1, +    <vscale x 2 x i32> %2, +    <vscale x 2 x i1> %3, +    iXLen %4, iXLen 1) + +  ret <vscale x 2 x half> %a +} + +declare <vscale x 4 x half> @llvm.riscv.vluxei.nxv4f16.nxv4i32( +  <vscale x 4 x half>, +  ptr, +  <vscale x 4 x i32>, +  iXLen); + +define <vscale x 4 x half> @intrinsic_vluxei_v_nxv4f16_nxv4f16_nxv4i32(ptr %0, <vscale x 4 x i32> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv4f16_nxv4f16_nxv4i32: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e16, m1, ta, ma +; CHECK-NEXT:    vluxei32.v v10, (a0), v8 +; CHECK-NEXT:    vmv.v.v v8, v10 +; CHECK-NEXT:    ret +entry: +  %a = call <vscale x 4 x half> @llvm.riscv.vluxei.nxv4f16.nxv4i32( +    <vscale x 4 x half> poison, +    ptr %0, +    <vscale x 4 x i32> %1, +    iXLen %2) + +  ret <vscale x 4 x half> %a +} + +declare <vscale x 4 x half> @llvm.riscv.vluxei.mask.nxv4f16.nxv4i32( +  <vscale x 4 x half>, +  ptr, +  <vscale x 4 x i32>, +  <vscale x 4 x i1>, +  iXLen, +  iXLen); + +define <vscale x 4 x half> @intrinsic_vluxei_mask_v_nxv4f16_nxv4f16_nxv4i32(<vscale x 4 x half> %0, ptr %1, <vscale x 4 x i32> %2, <vscale 
x 4 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv4f16_nxv4f16_nxv4i32: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e16, m1, ta, mu +; CHECK-NEXT:    vluxei32.v v8, (a0), v10, v0.t +; CHECK-NEXT:    ret +entry: +  %a = call <vscale x 4 x half> @llvm.riscv.vluxei.mask.nxv4f16.nxv4i32( +    <vscale x 4 x half> %0, +    ptr %1, +    <vscale x 4 x i32> %2, +    <vscale x 4 x i1> %3, +    iXLen %4, iXLen 1) + +  ret <vscale x 4 x half> %a +} + +declare <vscale x 8 x half> @llvm.riscv.vluxei.nxv8f16.nxv8i32( +  <vscale x 8 x half>, +  ptr, +  <vscale x 8 x i32>, +  iXLen); + +define <vscale x 8 x half> @intrinsic_vluxei_v_nxv8f16_nxv8f16_nxv8i32(ptr %0, <vscale x 8 x i32> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv8f16_nxv8f16_nxv8i32: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e16, m2, ta, ma +; CHECK-NEXT:    vluxei32.v v12, (a0), v8 +; CHECK-NEXT:    vmv.v.v v8, v12 +; CHECK-NEXT:    ret +entry: +  %a = call <vscale x 8 x half> @llvm.riscv.vluxei.nxv8f16.nxv8i32( +    <vscale x 8 x half> poison, +    ptr %0, +    <vscale x 8 x i32> %1, +    iXLen %2) + +  ret <vscale x 8 x half> %a +} + +declare <vscale x 8 x half> @llvm.riscv.vluxei.mask.nxv8f16.nxv8i32( +  <vscale x 8 x half>, +  ptr, +  <vscale x 8 x i32>, +  <vscale x 8 x i1>, +  iXLen, +  iXLen); + +define <vscale x 8 x half> @intrinsic_vluxei_mask_v_nxv8f16_nxv8f16_nxv8i32(<vscale x 8 x half> %0, ptr %1, <vscale x 8 x i32> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv8f16_nxv8f16_nxv8i32: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e16, m2, ta, mu +; CHECK-NEXT:    vluxei32.v v8, (a0), v12, v0.t +; CHECK-NEXT:    ret +entry: +  %a = call <vscale x 8 x half> @llvm.riscv.vluxei.mask.nxv8f16.nxv8i32( +    <vscale x 8 x half> %0, +    ptr %1, +    <vscale x 8 x i32> %2, +    <vscale x 8 x i1> %3, +    iXLen %4, iXLen 1) + +  ret <vscale x 8 x half> %a +} + +declare <vscale x 16 x half> @llvm.riscv.vluxei.nxv16f16.nxv16i32( +  <vscale x 16 x half>, +  ptr, +  <vscale x 16 x i32>, +  iXLen); + +define <vscale x 16 x half> @intrinsic_vluxei_v_nxv16f16_nxv16f16_nxv16i32(ptr %0, <vscale x 16 x i32> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv16f16_nxv16f16_nxv16i32: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e16, m4, ta, ma +; CHECK-NEXT:    vluxei32.v v16, (a0), v8 +; CHECK-NEXT:    vmv.v.v v8, v16 +; CHECK-NEXT:    ret +entry: +  %a = call <vscale x 16 x half> @llvm.riscv.vluxei.nxv16f16.nxv16i32( +    <vscale x 16 x half> poison, +    ptr %0, +    <vscale x 16 x i32> %1, +    iXLen %2) + +  ret <vscale x 16 x half> %a +} + +declare <vscale x 16 x half> @llvm.riscv.vluxei.mask.nxv16f16.nxv16i32( +  <vscale x 16 x half>, +  ptr, +  <vscale x 16 x i32>, +  <vscale x 16 x i1>, +  iXLen, +  iXLen); + +define <vscale x 16 x half> @intrinsic_vluxei_mask_v_nxv16f16_nxv16f16_nxv16i32(<vscale x 16 x half> %0, ptr %1, <vscale x 16 x i32> %2, <vscale x 16 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv16f16_nxv16f16_nxv16i32: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e16, m4, ta, mu +; CHECK-NEXT:    vluxei32.v v8, (a0), v16, v0.t +; CHECK-NEXT:    ret +entry: +  %a = call <vscale x 16 x half> @llvm.riscv.vluxei.mask.nxv16f16.nxv16i32( +    <vscale x 16 x half> %0, +    ptr %1, +    <vscale x 16 x i32> %2, +    <vscale x 16 x i1> %3, +    iXLen %4, iXLen 1) + +  ret <vscale x 16 x half> 
%a +} + +declare <vscale x 1 x float> @llvm.riscv.vluxei.nxv1f32.nxv1i32( +  <vscale x 1 x float>, +  ptr, +  <vscale x 1 x i32>, +  iXLen); + +define <vscale x 1 x float> @intrinsic_vluxei_v_nxv1f32_nxv1f32_nxv1i32(ptr %0, <vscale x 1 x i32> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv1f32_nxv1f32_nxv1i32: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e32, mf2, ta, ma +; CHECK-NEXT:    vluxei32.v v8, (a0), v8 +; CHECK-NEXT:    ret +entry: +  %a = call <vscale x 1 x float> @llvm.riscv.vluxei.nxv1f32.nxv1i32( +    <vscale x 1 x float> poison, +    ptr %0, +    <vscale x 1 x i32> %1, +    iXLen %2) + +  ret <vscale x 1 x float> %a +} + +declare <vscale x 1 x float> @llvm.riscv.vluxei.mask.nxv1f32.nxv1i32( +  <vscale x 1 x float>, +  ptr, +  <vscale x 1 x i32>, +  <vscale x 1 x i1>, +  iXLen, +  iXLen); + +define <vscale x 1 x float> @intrinsic_vluxei_mask_v_nxv1f32_nxv1f32_nxv1i32(<vscale x 1 x float> %0, ptr %1, <vscale x 1 x i32> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv1f32_nxv1f32_nxv1i32: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e32, mf2, ta, mu +; CHECK-NEXT:    vluxei32.v v8, (a0), v9, v0.t +; CHECK-NEXT:    ret +entry: +  %a = call <vscale x 1 x float> @llvm.riscv.vluxei.mask.nxv1f32.nxv1i32( +    <vscale x 1 x float> %0, +    ptr %1, +    <vscale x 1 x i32> %2, +    <vscale x 1 x i1> %3, +    iXLen %4, iXLen 1) + +  ret <vscale x 1 x float> %a +} + +declare <vscale x 2 x float> @llvm.riscv.vluxei.nxv2f32.nxv2i32( +  <vscale x 2 x float>, +  ptr, +  <vscale x 2 x i32>, +  iXLen); + +define <vscale x 2 x float> @intrinsic_vluxei_v_nxv2f32_nxv2f32_nxv2i32(ptr %0, <vscale x 2 x i32> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv2f32_nxv2f32_nxv2i32: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e32, m1, ta, ma +; CHECK-NEXT:    vluxei32.v v8, (a0), v8 +; CHECK-NEXT:    ret +entry: +  %a = call <vscale x 2 x float> @llvm.riscv.vluxei.nxv2f32.nxv2i32( +    <vscale x 2 x float> poison, +    ptr %0, +    <vscale x 2 x i32> %1, +    iXLen %2) + +  ret <vscale x 2 x float> %a +} + +declare <vscale x 2 x float> @llvm.riscv.vluxei.mask.nxv2f32.nxv2i32( +  <vscale x 2 x float>, +  ptr, +  <vscale x 2 x i32>, +  <vscale x 2 x i1>, +  iXLen, +  iXLen); + +define <vscale x 2 x float> @intrinsic_vluxei_mask_v_nxv2f32_nxv2f32_nxv2i32(<vscale x 2 x float> %0, ptr %1, <vscale x 2 x i32> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv2f32_nxv2f32_nxv2i32: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e32, m1, ta, mu +; CHECK-NEXT:    vluxei32.v v8, (a0), v9, v0.t +; CHECK-NEXT:    ret +entry: +  %a = call <vscale x 2 x float> @llvm.riscv.vluxei.mask.nxv2f32.nxv2i32( +    <vscale x 2 x float> %0, +    ptr %1, +    <vscale x 2 x i32> %2, +    <vscale x 2 x i1> %3, +    iXLen %4, iXLen 1) + +  ret <vscale x 2 x float> %a +} + +declare <vscale x 4 x float> @llvm.riscv.vluxei.nxv4f32.nxv4i32( +  <vscale x 4 x float>, +  ptr, +  <vscale x 4 x i32>, +  iXLen); + +define <vscale x 4 x float> @intrinsic_vluxei_v_nxv4f32_nxv4f32_nxv4i32(ptr %0, <vscale x 4 x i32> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv4f32_nxv4f32_nxv4i32: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e32, m2, ta, ma +; CHECK-NEXT:    vluxei32.v v8, (a0), v8 +; CHECK-NEXT:    ret +entry: +  %a = call <vscale x 4 x float> @llvm.riscv.vluxei.nxv4f32.nxv4i32( +    <vscale x 4 x 
float> poison, +    ptr %0, +    <vscale x 4 x i32> %1, +    iXLen %2) + +  ret <vscale x 4 x float> %a +} + +declare <vscale x 4 x float> @llvm.riscv.vluxei.mask.nxv4f32.nxv4i32( +  <vscale x 4 x float>, +  ptr, +  <vscale x 4 x i32>, +  <vscale x 4 x i1>, +  iXLen, +  iXLen); + +define <vscale x 4 x float> @intrinsic_vluxei_mask_v_nxv4f32_nxv4f32_nxv4i32(<vscale x 4 x float> %0, ptr %1, <vscale x 4 x i32> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv4f32_nxv4f32_nxv4i32: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e32, m2, ta, mu +; CHECK-NEXT:    vluxei32.v v8, (a0), v10, v0.t +; CHECK-NEXT:    ret +entry: +  %a = call <vscale x 4 x float> @llvm.riscv.vluxei.mask.nxv4f32.nxv4i32( +    <vscale x 4 x float> %0, +    ptr %1, +    <vscale x 4 x i32> %2, +    <vscale x 4 x i1> %3, +    iXLen %4, iXLen 1) + +  ret <vscale x 4 x float> %a +} + +declare <vscale x 8 x float> @llvm.riscv.vluxei.nxv8f32.nxv8i32( +  <vscale x 8 x float>, +  ptr, +  <vscale x 8 x i32>, +  iXLen); + +define <vscale x 8 x float> @intrinsic_vluxei_v_nxv8f32_nxv8f32_nxv8i32(ptr %0, <vscale x 8 x i32> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv8f32_nxv8f32_nxv8i32: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e32, m4, ta, ma +; CHECK-NEXT:    vluxei32.v v8, (a0), v8 +; CHECK-NEXT:    ret +entry: +  %a = call <vscale x 8 x float> @llvm.riscv.vluxei.nxv8f32.nxv8i32( +    <vscale x 8 x float> poison, +    ptr %0, +    <vscale x 8 x i32> %1, +    iXLen %2) + +  ret <vscale x 8 x float> %a +} + +declare <vscale x 8 x float> @llvm.riscv.vluxei.mask.nxv8f32.nxv8i32( +  <vscale x 8 x float>, +  ptr, +  <vscale x 8 x i32>, +  <vscale x 8 x i1>, +  iXLen, +  iXLen); + +define <vscale x 8 x float> @intrinsic_vluxei_mask_v_nxv8f32_nxv8f32_nxv8i32(<vscale x 8 x float> %0, ptr %1, <vscale x 8 x i32> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv8f32_nxv8f32_nxv8i32: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e32, m4, ta, mu +; CHECK-NEXT:    vluxei32.v v8, (a0), v12, v0.t +; CHECK-NEXT:    ret +entry: +  %a = call <vscale x 8 x float> @llvm.riscv.vluxei.mask.nxv8f32.nxv8i32( +    <vscale x 8 x float> %0, +    ptr %1, +    <vscale x 8 x i32> %2, +    <vscale x 8 x i1> %3, +    iXLen %4, iXLen 1) + +  ret <vscale x 8 x float> %a +} + +declare <vscale x 16 x float> @llvm.riscv.vluxei.nxv16f32.nxv16i32( +  <vscale x 16 x float>, +  ptr, +  <vscale x 16 x i32>, +  iXLen); + +define <vscale x 16 x float> @intrinsic_vluxei_v_nxv16f32_nxv16f32_nxv16i32(ptr %0, <vscale x 16 x i32> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv16f32_nxv16f32_nxv16i32: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e32, m8, ta, ma +; CHECK-NEXT:    vluxei32.v v8, (a0), v8 +; CHECK-NEXT:    ret +entry: +  %a = call <vscale x 16 x float> @llvm.riscv.vluxei.nxv16f32.nxv16i32( +    <vscale x 16 x float> poison, +    ptr %0, +    <vscale x 16 x i32> %1, +    iXLen %2) + +  ret <vscale x 16 x float> %a +} + +declare <vscale x 16 x float> @llvm.riscv.vluxei.mask.nxv16f32.nxv16i32( +  <vscale x 16 x float>, +  ptr, +  <vscale x 16 x i32>, +  <vscale x 16 x i1>, +  iXLen, +  iXLen); + +define <vscale x 16 x float> @intrinsic_vluxei_mask_v_nxv16f32_nxv16f32_nxv16i32(<vscale x 16 x float> %0, ptr %1, <vscale x 16 x i32> %2, <vscale x 16 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv16f32_nxv16f32_nxv16i32: +; CHECK:       
# %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e32, m8, ta, mu +; CHECK-NEXT:    vluxei32.v v8, (a0), v16, v0.t +; CHECK-NEXT:    ret +entry: +  %a = call <vscale x 16 x float> @llvm.riscv.vluxei.mask.nxv16f32.nxv16i32( +    <vscale x 16 x float> %0, +    ptr %1, +    <vscale x 16 x i32> %2, +    <vscale x 16 x i1> %3, +    iXLen %4, iXLen 1) + +  ret <vscale x 16 x float> %a +} + +declare <vscale x 1 x double> @llvm.riscv.vluxei.nxv1f64.nxv1i32( +  <vscale x 1 x double>, +  ptr, +  <vscale x 1 x i32>, +  iXLen); + +define <vscale x 1 x double> @intrinsic_vluxei_v_nxv1f64_nxv1f64_nxv1i32(ptr %0, <vscale x 1 x i32> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv1f64_nxv1f64_nxv1i32: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e64, m1, ta, ma +; CHECK-NEXT:    vluxei32.v v9, (a0), v8 +; CHECK-NEXT:    vmv.v.v v8, v9 +; CHECK-NEXT:    ret +entry: +  %a = call <vscale x 1 x double> @llvm.riscv.vluxei.nxv1f64.nxv1i32( +    <vscale x 1 x double> poison, +    ptr %0, +    <vscale x 1 x i32> %1, +    iXLen %2) + +  ret <vscale x 1 x double> %a +} + +declare <vscale x 1 x double> @llvm.riscv.vluxei.mask.nxv1f64.nxv1i32( +  <vscale x 1 x double>, +  ptr, +  <vscale x 1 x i32>, +  <vscale x 1 x i1>, +  iXLen, +  iXLen); + +define <vscale x 1 x double> @intrinsic_vluxei_mask_v_nxv1f64_nxv1f64_nxv1i32(<vscale x 1 x double> %0, ptr %1, <vscale x 1 x i32> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv1f64_nxv1f64_nxv1i32: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e64, m1, ta, mu +; CHECK-NEXT:    vluxei32.v v8, (a0), v9, v0.t +; CHECK-NEXT:    ret +entry: +  %a = call <vscale x 1 x double> @llvm.riscv.vluxei.mask.nxv1f64.nxv1i32( +    <vscale x 1 x double> %0, +    ptr %1, +    <vscale x 1 x i32> %2, +    <vscale x 1 x i1> %3, +    iXLen %4, iXLen 1) + +  ret <vscale x 1 x double> %a +} + +declare <vscale x 2 x double> @llvm.riscv.vluxei.nxv2f64.nxv2i32( +  <vscale x 2 x double>, +  ptr, +  <vscale x 2 x i32>, +  iXLen); + +define <vscale x 2 x double> @intrinsic_vluxei_v_nxv2f64_nxv2f64_nxv2i32(ptr %0, <vscale x 2 x i32> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv2f64_nxv2f64_nxv2i32: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetivli zero, 1, e8, m1, ta, ma +; CHECK-NEXT:    vmv1r.v v10, v8 +; CHECK-NEXT:    vsetvli zero, a1, e64, m2, ta, ma +; CHECK-NEXT:    vluxei32.v v8, (a0), v10 +; CHECK-NEXT:    ret +entry: +  %a = call <vscale x 2 x double> @llvm.riscv.vluxei.nxv2f64.nxv2i32( +    <vscale x 2 x double> poison, +    ptr %0, +    <vscale x 2 x i32> %1, +    iXLen %2) + +  ret <vscale x 2 x double> %a +} + +declare <vscale x 2 x double> @llvm.riscv.vluxei.mask.nxv2f64.nxv2i32( +  <vscale x 2 x double>, +  ptr, +  <vscale x 2 x i32>, +  <vscale x 2 x i1>, +  iXLen, +  iXLen); + +define <vscale x 2 x double> @intrinsic_vluxei_mask_v_nxv2f64_nxv2f64_nxv2i32(<vscale x 2 x double> %0, ptr %1, <vscale x 2 x i32> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv2f64_nxv2f64_nxv2i32: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e64, m2, ta, mu +; CHECK-NEXT:    vluxei32.v v8, (a0), v10, v0.t +; CHECK-NEXT:    ret +entry: +  %a = call <vscale x 2 x double> @llvm.riscv.vluxei.mask.nxv2f64.nxv2i32( +    <vscale x 2 x double> %0, +    ptr %1, +    <vscale x 2 x i32> %2, +    <vscale x 2 x i1> %3, +    iXLen %4, iXLen 1) + +  ret <vscale x 2 x double> %a +} + +declare <vscale x 4 x double> 
@llvm.riscv.vluxei.nxv4f64.nxv4i32( +  <vscale x 4 x double>, +  ptr, +  <vscale x 4 x i32>, +  iXLen); + +define <vscale x 4 x double> @intrinsic_vluxei_v_nxv4f64_nxv4f64_nxv4i32(ptr %0, <vscale x 4 x i32> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv4f64_nxv4f64_nxv4i32: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetivli zero, 1, e8, m1, ta, ma +; CHECK-NEXT:    vmv2r.v v12, v8 +; CHECK-NEXT:    vsetvli zero, a1, e64, m4, ta, ma +; CHECK-NEXT:    vluxei32.v v8, (a0), v12 +; CHECK-NEXT:    ret +entry: +  %a = call <vscale x 4 x double> @llvm.riscv.vluxei.nxv4f64.nxv4i32( +    <vscale x 4 x double> poison, +    ptr %0, +    <vscale x 4 x i32> %1, +    iXLen %2) + +  ret <vscale x 4 x double> %a +} + +declare <vscale x 4 x double> @llvm.riscv.vluxei.mask.nxv4f64.nxv4i32( +  <vscale x 4 x double>, +  ptr, +  <vscale x 4 x i32>, +  <vscale x 4 x i1>, +  iXLen, +  iXLen); + +define <vscale x 4 x double> @intrinsic_vluxei_mask_v_nxv4f64_nxv4f64_nxv4i32(<vscale x 4 x double> %0, ptr %1, <vscale x 4 x i32> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv4f64_nxv4f64_nxv4i32: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e64, m4, ta, mu +; CHECK-NEXT:    vluxei32.v v8, (a0), v12, v0.t +; CHECK-NEXT:    ret +entry: +  %a = call <vscale x 4 x double> @llvm.riscv.vluxei.mask.nxv4f64.nxv4i32( +    <vscale x 4 x double> %0, +    ptr %1, +    <vscale x 4 x i32> %2, +    <vscale x 4 x i1> %3, +    iXLen %4, iXLen 1) + +  ret <vscale x 4 x double> %a +} + +declare <vscale x 8 x double> @llvm.riscv.vluxei.nxv8f64.nxv8i32( +  <vscale x 8 x double>, +  ptr, +  <vscale x 8 x i32>, +  iXLen); + +define <vscale x 8 x double> @intrinsic_vluxei_v_nxv8f64_nxv8f64_nxv8i32(ptr %0, <vscale x 8 x i32> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv8f64_nxv8f64_nxv8i32: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetivli zero, 1, e8, m1, ta, ma +; CHECK-NEXT:    vmv4r.v v16, v8 +; CHECK-NEXT:    vsetvli zero, a1, e64, m8, ta, ma +; CHECK-NEXT:    vluxei32.v v8, (a0), v16 +; CHECK-NEXT:    ret +entry: +  %a = call <vscale x 8 x double> @llvm.riscv.vluxei.nxv8f64.nxv8i32( +    <vscale x 8 x double> poison, +    ptr %0, +    <vscale x 8 x i32> %1, +    iXLen %2) + +  ret <vscale x 8 x double> %a +} + +declare <vscale x 8 x double> @llvm.riscv.vluxei.mask.nxv8f64.nxv8i32( +  <vscale x 8 x double>, +  ptr, +  <vscale x 8 x i32>, +  <vscale x 8 x i1>, +  iXLen, +  iXLen); + +define <vscale x 8 x double> @intrinsic_vluxei_mask_v_nxv8f64_nxv8f64_nxv8i32(<vscale x 8 x double> %0, ptr %1, <vscale x 8 x i32> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv8f64_nxv8f64_nxv8i32: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e64, m8, ta, mu +; CHECK-NEXT:    vluxei32.v v8, (a0), v16, v0.t +; CHECK-NEXT:    ret +entry: +  %a = call <vscale x 8 x double> @llvm.riscv.vluxei.mask.nxv8f64.nxv8i32( +    <vscale x 8 x double> %0, +    ptr %1, +    <vscale x 8 x i32> %2, +    <vscale x 8 x i1> %3, +    iXLen %4, iXLen 1) + +  ret <vscale x 8 x double> %a +} + +declare <vscale x 1 x i8> @llvm.riscv.vluxei.nxv1i8.nxv1i16( +  <vscale x 1 x i8>, +  ptr, +  <vscale x 1 x i16>, +  iXLen); + +define <vscale x 1 x i8> @intrinsic_vluxei_v_nxv1i8_nxv1i8_nxv1i16(ptr %0, <vscale x 1 x i16> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv1i8_nxv1i8_nxv1i16: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e8, mf8, ta, ma +; CHECK-NEXT:  
  vluxei16.v v9, (a0), v8 +; CHECK-NEXT:    vmv1r.v v8, v9 +; CHECK-NEXT:    ret +entry: +  %a = call <vscale x 1 x i8> @llvm.riscv.vluxei.nxv1i8.nxv1i16( +    <vscale x 1 x i8> poison, +    ptr %0, +    <vscale x 1 x i16> %1, +    iXLen %2) + +  ret <vscale x 1 x i8> %a +} + +declare <vscale x 1 x i8> @llvm.riscv.vluxei.mask.nxv1i8.nxv1i16( +  <vscale x 1 x i8>, +  ptr, +  <vscale x 1 x i16>, +  <vscale x 1 x i1>, +  iXLen, +  iXLen); + +define <vscale x 1 x i8> @intrinsic_vluxei_mask_v_nxv1i8_nxv1i8_nxv1i16(<vscale x 1 x i8> %0, ptr %1, <vscale x 1 x i16> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv1i8_nxv1i8_nxv1i16: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e8, mf8, ta, mu +; CHECK-NEXT:    vluxei16.v v8, (a0), v9, v0.t +; CHECK-NEXT:    ret +entry: +  %a = call <vscale x 1 x i8> @llvm.riscv.vluxei.mask.nxv1i8.nxv1i16( +    <vscale x 1 x i8> %0, +    ptr %1, +    <vscale x 1 x i16> %2, +    <vscale x 1 x i1> %3, +    iXLen %4, iXLen 1) + +  ret <vscale x 1 x i8> %a +} + +declare <vscale x 2 x i8> @llvm.riscv.vluxei.nxv2i8.nxv2i16( +  <vscale x 2 x i8>, +  ptr, +  <vscale x 2 x i16>, +  iXLen); + +define <vscale x 2 x i8> @intrinsic_vluxei_v_nxv2i8_nxv2i8_nxv2i16(ptr %0, <vscale x 2 x i16> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv2i8_nxv2i8_nxv2i16: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e8, mf4, ta, ma +; CHECK-NEXT:    vluxei16.v v9, (a0), v8 +; CHECK-NEXT:    vmv1r.v v8, v9 +; CHECK-NEXT:    ret +entry: +  %a = call <vscale x 2 x i8> @llvm.riscv.vluxei.nxv2i8.nxv2i16( +    <vscale x 2 x i8> poison, +    ptr %0, +    <vscale x 2 x i16> %1, +    iXLen %2) + +  ret <vscale x 2 x i8> %a +} + +declare <vscale x 2 x i8> @llvm.riscv.vluxei.mask.nxv2i8.nxv2i16( +  <vscale x 2 x i8>, +  ptr, +  <vscale x 2 x i16>, +  <vscale x 2 x i1>, +  iXLen, +  iXLen); + +define <vscale x 2 x i8> @intrinsic_vluxei_mask_v_nxv2i8_nxv2i8_nxv2i16(<vscale x 2 x i8> %0, ptr %1, <vscale x 2 x i16> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv2i8_nxv2i8_nxv2i16: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e8, mf4, ta, mu +; CHECK-NEXT:    vluxei16.v v8, (a0), v9, v0.t +; CHECK-NEXT:    ret +entry: +  %a = call <vscale x 2 x i8> @llvm.riscv.vluxei.mask.nxv2i8.nxv2i16( +    <vscale x 2 x i8> %0, +    ptr %1, +    <vscale x 2 x i16> %2, +    <vscale x 2 x i1> %3, +    iXLen %4, iXLen 1) + +  ret <vscale x 2 x i8> %a +} + +declare <vscale x 4 x i8> @llvm.riscv.vluxei.nxv4i8.nxv4i16( +  <vscale x 4 x i8>, +  ptr, +  <vscale x 4 x i16>, +  iXLen); + +define <vscale x 4 x i8> @intrinsic_vluxei_v_nxv4i8_nxv4i8_nxv4i16(ptr %0, <vscale x 4 x i16> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv4i8_nxv4i8_nxv4i16: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e8, mf2, ta, ma +; CHECK-NEXT:    vluxei16.v v9, (a0), v8 +; CHECK-NEXT:    vmv1r.v v8, v9 +; CHECK-NEXT:    ret +entry: +  %a = call <vscale x 4 x i8> @llvm.riscv.vluxei.nxv4i8.nxv4i16( +    <vscale x 4 x i8> poison, +    ptr %0, +    <vscale x 4 x i16> %1, +    iXLen %2) + +  ret <vscale x 4 x i8> %a +} + +declare <vscale x 4 x i8> @llvm.riscv.vluxei.mask.nxv4i8.nxv4i16( +  <vscale x 4 x i8>, +  ptr, +  <vscale x 4 x i16>, +  <vscale x 4 x i1>, +  iXLen, +  iXLen); + +define <vscale x 4 x i8> @intrinsic_vluxei_mask_v_nxv4i8_nxv4i8_nxv4i16(<vscale x 4 x i8> %0, ptr %1, <vscale x 4 x i16> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind { 
+; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv4i8_nxv4i8_nxv4i16: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e8, mf2, ta, mu +; CHECK-NEXT:    vluxei16.v v8, (a0), v9, v0.t +; CHECK-NEXT:    ret +entry: +  %a = call <vscale x 4 x i8> @llvm.riscv.vluxei.mask.nxv4i8.nxv4i16( +    <vscale x 4 x i8> %0, +    ptr %1, +    <vscale x 4 x i16> %2, +    <vscale x 4 x i1> %3, +    iXLen %4, iXLen 1) + +  ret <vscale x 4 x i8> %a +} + +declare <vscale x 8 x i8> @llvm.riscv.vluxei.nxv8i8.nxv8i16( +  <vscale x 8 x i8>, +  ptr, +  <vscale x 8 x i16>, +  iXLen); + +define <vscale x 8 x i8> @intrinsic_vluxei_v_nxv8i8_nxv8i8_nxv8i16(ptr %0, <vscale x 8 x i16> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv8i8_nxv8i8_nxv8i16: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e8, m1, ta, ma +; CHECK-NEXT:    vluxei16.v v10, (a0), v8 +; CHECK-NEXT:    vmv.v.v v8, v10 +; CHECK-NEXT:    ret +entry: +  %a = call <vscale x 8 x i8> @llvm.riscv.vluxei.nxv8i8.nxv8i16( +    <vscale x 8 x i8> poison, +    ptr %0, +    <vscale x 8 x i16> %1, +    iXLen %2) + +  ret <vscale x 8 x i8> %a +} + +declare <vscale x 8 x i8> @llvm.riscv.vluxei.mask.nxv8i8.nxv8i16( +  <vscale x 8 x i8>, +  ptr, +  <vscale x 8 x i16>, +  <vscale x 8 x i1>, +  iXLen, +  iXLen); + +define <vscale x 8 x i8> @intrinsic_vluxei_mask_v_nxv8i8_nxv8i8_nxv8i16(<vscale x 8 x i8> %0, ptr %1, <vscale x 8 x i16> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv8i8_nxv8i8_nxv8i16: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e8, m1, ta, mu +; CHECK-NEXT:    vluxei16.v v8, (a0), v10, v0.t +; CHECK-NEXT:    ret +entry: +  %a = call <vscale x 8 x i8> @llvm.riscv.vluxei.mask.nxv8i8.nxv8i16( +    <vscale x 8 x i8> %0, +    ptr %1, +    <vscale x 8 x i16> %2, +    <vscale x 8 x i1> %3, +    iXLen %4, iXLen 1) + +  ret <vscale x 8 x i8> %a +} + +declare <vscale x 16 x i8> @llvm.riscv.vluxei.nxv16i8.nxv16i16( +  <vscale x 16 x i8>, +  ptr, +  <vscale x 16 x i16>, +  iXLen); + +define <vscale x 16 x i8> @intrinsic_vluxei_v_nxv16i8_nxv16i8_nxv16i16(ptr %0, <vscale x 16 x i16> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv16i8_nxv16i8_nxv16i16: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e8, m2, ta, ma +; CHECK-NEXT:    vluxei16.v v12, (a0), v8 +; CHECK-NEXT:    vmv.v.v v8, v12 +; CHECK-NEXT:    ret +entry: +  %a = call <vscale x 16 x i8> @llvm.riscv.vluxei.nxv16i8.nxv16i16( +    <vscale x 16 x i8> poison, +    ptr %0, +    <vscale x 16 x i16> %1, +    iXLen %2) + +  ret <vscale x 16 x i8> %a +} + +declare <vscale x 16 x i8> @llvm.riscv.vluxei.mask.nxv16i8.nxv16i16( +  <vscale x 16 x i8>, +  ptr, +  <vscale x 16 x i16>, +  <vscale x 16 x i1>, +  iXLen, +  iXLen); + +define <vscale x 16 x i8> @intrinsic_vluxei_mask_v_nxv16i8_nxv16i8_nxv16i16(<vscale x 16 x i8> %0, ptr %1, <vscale x 16 x i16> %2, <vscale x 16 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv16i8_nxv16i8_nxv16i16: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e8, m2, ta, mu +; CHECK-NEXT:    vluxei16.v v8, (a0), v12, v0.t +; CHECK-NEXT:    ret +entry: +  %a = call <vscale x 16 x i8> @llvm.riscv.vluxei.mask.nxv16i8.nxv16i16( +    <vscale x 16 x i8> %0, +    ptr %1, +    <vscale x 16 x i16> %2, +    <vscale x 16 x i1> %3, +    iXLen %4, iXLen 1) + +  ret <vscale x 16 x i8> %a +} + +declare <vscale x 32 x i8> @llvm.riscv.vluxei.nxv32i8.nxv32i16( +  <vscale x 32 x i8>, +  ptr, +  <vscale x 32 x 
i16>, +  iXLen); + +define <vscale x 32 x i8> @intrinsic_vluxei_v_nxv32i8_nxv32i8_nxv32i16(ptr %0, <vscale x 32 x i16> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv32i8_nxv32i8_nxv32i16: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e8, m4, ta, ma +; CHECK-NEXT:    vluxei16.v v16, (a0), v8 +; CHECK-NEXT:    vmv.v.v v8, v16 +; CHECK-NEXT:    ret +entry: +  %a = call <vscale x 32 x i8> @llvm.riscv.vluxei.nxv32i8.nxv32i16( +    <vscale x 32 x i8> poison, +    ptr %0, +    <vscale x 32 x i16> %1, +    iXLen %2) + +  ret <vscale x 32 x i8> %a +} + +declare <vscale x 32 x i8> @llvm.riscv.vluxei.mask.nxv32i8.nxv32i16( +  <vscale x 32 x i8>, +  ptr, +  <vscale x 32 x i16>, +  <vscale x 32 x i1>, +  iXLen, +  iXLen); + +define <vscale x 32 x i8> @intrinsic_vluxei_mask_v_nxv32i8_nxv32i8_nxv32i16(<vscale x 32 x i8> %0, ptr %1, <vscale x 32 x i16> %2, <vscale x 32 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv32i8_nxv32i8_nxv32i16: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e8, m4, ta, mu +; CHECK-NEXT:    vluxei16.v v8, (a0), v16, v0.t +; CHECK-NEXT:    ret +entry: +  %a = call <vscale x 32 x i8> @llvm.riscv.vluxei.mask.nxv32i8.nxv32i16( +    <vscale x 32 x i8> %0, +    ptr %1, +    <vscale x 32 x i16> %2, +    <vscale x 32 x i1> %3, +    iXLen %4, iXLen 1) + +  ret <vscale x 32 x i8> %a +} + +declare <vscale x 1 x i16> @llvm.riscv.vluxei.nxv1i16.nxv1i16( +  <vscale x 1 x i16>, +  ptr, +  <vscale x 1 x i16>, +  iXLen); + +define <vscale x 1 x i16> @intrinsic_vluxei_v_nxv1i16_nxv1i16_nxv1i16(ptr %0, <vscale x 1 x i16> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv1i16_nxv1i16_nxv1i16: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e16, mf4, ta, ma +; CHECK-NEXT:    vluxei16.v v8, (a0), v8 +; CHECK-NEXT:    ret +entry: +  %a = call <vscale x 1 x i16> @llvm.riscv.vluxei.nxv1i16.nxv1i16( +    <vscale x 1 x i16> poison, +    ptr %0, +    <vscale x 1 x i16> %1, +    iXLen %2) + +  ret <vscale x 1 x i16> %a +} + +declare <vscale x 1 x i16> @llvm.riscv.vluxei.mask.nxv1i16.nxv1i16( +  <vscale x 1 x i16>, +  ptr, +  <vscale x 1 x i16>, +  <vscale x 1 x i1>, +  iXLen, +  iXLen); + +define <vscale x 1 x i16> @intrinsic_vluxei_mask_v_nxv1i16_nxv1i16_nxv1i16(<vscale x 1 x i16> %0, ptr %1, <vscale x 1 x i16> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv1i16_nxv1i16_nxv1i16: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e16, mf4, ta, mu +; CHECK-NEXT:    vluxei16.v v8, (a0), v9, v0.t +; CHECK-NEXT:    ret +entry: +  %a = call <vscale x 1 x i16> @llvm.riscv.vluxei.mask.nxv1i16.nxv1i16( +    <vscale x 1 x i16> %0, +    ptr %1, +    <vscale x 1 x i16> %2, +    <vscale x 1 x i1> %3, +    iXLen %4, iXLen 1) + +  ret <vscale x 1 x i16> %a +} + +declare <vscale x 2 x i16> @llvm.riscv.vluxei.nxv2i16.nxv2i16( +  <vscale x 2 x i16>, +  ptr, +  <vscale x 2 x i16>, +  iXLen); + +define <vscale x 2 x i16> @intrinsic_vluxei_v_nxv2i16_nxv2i16_nxv2i16(ptr %0, <vscale x 2 x i16> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv2i16_nxv2i16_nxv2i16: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e16, mf2, ta, ma +; CHECK-NEXT:    vluxei16.v v8, (a0), v8 +; CHECK-NEXT:    ret +entry: +  %a = call <vscale x 2 x i16> @llvm.riscv.vluxei.nxv2i16.nxv2i16( +    <vscale x 2 x i16> poison, +    ptr %0, +    <vscale x 2 x i16> %1, +    iXLen %2) + +  ret <vscale x 2 x i16> %a +} + +declare <vscale x 2 x 
i16> @llvm.riscv.vluxei.mask.nxv2i16.nxv2i16( +  <vscale x 2 x i16>, +  ptr, +  <vscale x 2 x i16>, +  <vscale x 2 x i1>, +  iXLen, +  iXLen); + +define <vscale x 2 x i16> @intrinsic_vluxei_mask_v_nxv2i16_nxv2i16_nxv2i16(<vscale x 2 x i16> %0, ptr %1, <vscale x 2 x i16> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv2i16_nxv2i16_nxv2i16: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e16, mf2, ta, mu +; CHECK-NEXT:    vluxei16.v v8, (a0), v9, v0.t +; CHECK-NEXT:    ret +entry: +  %a = call <vscale x 2 x i16> @llvm.riscv.vluxei.mask.nxv2i16.nxv2i16( +    <vscale x 2 x i16> %0, +    ptr %1, +    <vscale x 2 x i16> %2, +    <vscale x 2 x i1> %3, +    iXLen %4, iXLen 1) + +  ret <vscale x 2 x i16> %a +} + +declare <vscale x 4 x i16> @llvm.riscv.vluxei.nxv4i16.nxv4i16( +  <vscale x 4 x i16>, +  ptr, +  <vscale x 4 x i16>, +  iXLen); + +define <vscale x 4 x i16> @intrinsic_vluxei_v_nxv4i16_nxv4i16_nxv4i16(ptr %0, <vscale x 4 x i16> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv4i16_nxv4i16_nxv4i16: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e16, m1, ta, ma +; CHECK-NEXT:    vluxei16.v v8, (a0), v8 +; CHECK-NEXT:    ret +entry: +  %a = call <vscale x 4 x i16> @llvm.riscv.vluxei.nxv4i16.nxv4i16( +    <vscale x 4 x i16> poison, +    ptr %0, +    <vscale x 4 x i16> %1, +    iXLen %2) + +  ret <vscale x 4 x i16> %a +} + +declare <vscale x 4 x i16> @llvm.riscv.vluxei.mask.nxv4i16.nxv4i16( +  <vscale x 4 x i16>, +  ptr, +  <vscale x 4 x i16>, +  <vscale x 4 x i1>, +  iXLen, +  iXLen); + +define <vscale x 4 x i16> @intrinsic_vluxei_mask_v_nxv4i16_nxv4i16_nxv4i16(<vscale x 4 x i16> %0, ptr %1, <vscale x 4 x i16> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv4i16_nxv4i16_nxv4i16: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e16, m1, ta, mu +; CHECK-NEXT:    vluxei16.v v8, (a0), v9, v0.t +; CHECK-NEXT:    ret +entry: +  %a = call <vscale x 4 x i16> @llvm.riscv.vluxei.mask.nxv4i16.nxv4i16( +    <vscale x 4 x i16> %0, +    ptr %1, +    <vscale x 4 x i16> %2, +    <vscale x 4 x i1> %3, +    iXLen %4, iXLen 1) + +  ret <vscale x 4 x i16> %a +} + +declare <vscale x 8 x i16> @llvm.riscv.vluxei.nxv8i16.nxv8i16( +  <vscale x 8 x i16>, +  ptr, +  <vscale x 8 x i16>, +  iXLen); + +define <vscale x 8 x i16> @intrinsic_vluxei_v_nxv8i16_nxv8i16_nxv8i16(ptr %0, <vscale x 8 x i16> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv8i16_nxv8i16_nxv8i16: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e16, m2, ta, ma +; CHECK-NEXT:    vluxei16.v v8, (a0), v8 +; CHECK-NEXT:    ret +entry: +  %a = call <vscale x 8 x i16> @llvm.riscv.vluxei.nxv8i16.nxv8i16( +    <vscale x 8 x i16> poison, +    ptr %0, +    <vscale x 8 x i16> %1, +    iXLen %2) + +  ret <vscale x 8 x i16> %a +} + +declare <vscale x 8 x i16> @llvm.riscv.vluxei.mask.nxv8i16.nxv8i16( +  <vscale x 8 x i16>, +  ptr, +  <vscale x 8 x i16>, +  <vscale x 8 x i1>, +  iXLen, +  iXLen); + +define <vscale x 8 x i16> @intrinsic_vluxei_mask_v_nxv8i16_nxv8i16_nxv8i16(<vscale x 8 x i16> %0, ptr %1, <vscale x 8 x i16> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv8i16_nxv8i16_nxv8i16: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e16, m2, ta, mu +; CHECK-NEXT:    vluxei16.v v8, (a0), v10, v0.t +; CHECK-NEXT:    ret +entry: +  %a = call <vscale x 8 x i16> @llvm.riscv.vluxei.mask.nxv8i16.nxv8i16( +    
<vscale x 8 x i16> %0, +    ptr %1, +    <vscale x 8 x i16> %2, +    <vscale x 8 x i1> %3, +    iXLen %4, iXLen 1) + +  ret <vscale x 8 x i16> %a +} + +declare <vscale x 16 x i16> @llvm.riscv.vluxei.nxv16i16.nxv16i16( +  <vscale x 16 x i16>, +  ptr, +  <vscale x 16 x i16>, +  iXLen); + +define <vscale x 16 x i16> @intrinsic_vluxei_v_nxv16i16_nxv16i16_nxv16i16(ptr %0, <vscale x 16 x i16> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv16i16_nxv16i16_nxv16i16: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e16, m4, ta, ma +; CHECK-NEXT:    vluxei16.v v8, (a0), v8 +; CHECK-NEXT:    ret +entry: +  %a = call <vscale x 16 x i16> @llvm.riscv.vluxei.nxv16i16.nxv16i16( +    <vscale x 16 x i16> poison, +    ptr %0, +    <vscale x 16 x i16> %1, +    iXLen %2) + +  ret <vscale x 16 x i16> %a +} + +declare <vscale x 16 x i16> @llvm.riscv.vluxei.mask.nxv16i16.nxv16i16( +  <vscale x 16 x i16>, +  ptr, +  <vscale x 16 x i16>, +  <vscale x 16 x i1>, +  iXLen, +  iXLen); + +define <vscale x 16 x i16> @intrinsic_vluxei_mask_v_nxv16i16_nxv16i16_nxv16i16(<vscale x 16 x i16> %0, ptr %1, <vscale x 16 x i16> %2, <vscale x 16 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv16i16_nxv16i16_nxv16i16: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e16, m4, ta, mu +; CHECK-NEXT:    vluxei16.v v8, (a0), v12, v0.t +; CHECK-NEXT:    ret +entry: +  %a = call <vscale x 16 x i16> @llvm.riscv.vluxei.mask.nxv16i16.nxv16i16( +    <vscale x 16 x i16> %0, +    ptr %1, +    <vscale x 16 x i16> %2, +    <vscale x 16 x i1> %3, +    iXLen %4, iXLen 1) + +  ret <vscale x 16 x i16> %a +} + +declare <vscale x 32 x i16> @llvm.riscv.vluxei.nxv32i16.nxv32i16( +  <vscale x 32 x i16>, +  ptr, +  <vscale x 32 x i16>, +  iXLen); + +define <vscale x 32 x i16> @intrinsic_vluxei_v_nxv32i16_nxv32i16_nxv32i16(ptr %0, <vscale x 32 x i16> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv32i16_nxv32i16_nxv32i16: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e16, m8, ta, ma +; CHECK-NEXT:    vluxei16.v v8, (a0), v8 +; CHECK-NEXT:    ret +entry: +  %a = call <vscale x 32 x i16> @llvm.riscv.vluxei.nxv32i16.nxv32i16( +    <vscale x 32 x i16> poison, +    ptr %0, +    <vscale x 32 x i16> %1, +    iXLen %2) + +  ret <vscale x 32 x i16> %a +} + +declare <vscale x 32 x i16> @llvm.riscv.vluxei.mask.nxv32i16.nxv32i16( +  <vscale x 32 x i16>, +  ptr, +  <vscale x 32 x i16>, +  <vscale x 32 x i1>, +  iXLen, +  iXLen); + +define <vscale x 32 x i16> @intrinsic_vluxei_mask_v_nxv32i16_nxv32i16_nxv32i16(<vscale x 32 x i16> %0, ptr %1, <vscale x 32 x i16> %2, <vscale x 32 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv32i16_nxv32i16_nxv32i16: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e16, m8, ta, mu +; CHECK-NEXT:    vluxei16.v v8, (a0), v16, v0.t +; CHECK-NEXT:    ret +entry: +  %a = call <vscale x 32 x i16> @llvm.riscv.vluxei.mask.nxv32i16.nxv32i16( +    <vscale x 32 x i16> %0, +    ptr %1, +    <vscale x 32 x i16> %2, +    <vscale x 32 x i1> %3, +    iXLen %4, iXLen 1) + +  ret <vscale x 32 x i16> %a +} + +declare <vscale x 1 x i32> @llvm.riscv.vluxei.nxv1i32.nxv1i16( +  <vscale x 1 x i32>, +  ptr, +  <vscale x 1 x i16>, +  iXLen); + +define <vscale x 1 x i32> @intrinsic_vluxei_v_nxv1i32_nxv1i32_nxv1i16(ptr %0, <vscale x 1 x i16> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv1i32_nxv1i32_nxv1i16: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e32, 
mf2, ta, ma +; CHECK-NEXT:    vluxei16.v v9, (a0), v8 +; CHECK-NEXT:    vmv1r.v v8, v9 +; CHECK-NEXT:    ret +entry: +  %a = call <vscale x 1 x i32> @llvm.riscv.vluxei.nxv1i32.nxv1i16( +    <vscale x 1 x i32> poison, +    ptr %0, +    <vscale x 1 x i16> %1, +    iXLen %2) + +  ret <vscale x 1 x i32> %a +} + +declare <vscale x 1 x i32> @llvm.riscv.vluxei.mask.nxv1i32.nxv1i16( +  <vscale x 1 x i32>, +  ptr, +  <vscale x 1 x i16>, +  <vscale x 1 x i1>, +  iXLen, +  iXLen); + +define <vscale x 1 x i32> @intrinsic_vluxei_mask_v_nxv1i32_nxv1i32_nxv1i16(<vscale x 1 x i32> %0, ptr %1, <vscale x 1 x i16> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv1i32_nxv1i32_nxv1i16: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e32, mf2, ta, mu +; CHECK-NEXT:    vluxei16.v v8, (a0), v9, v0.t +; CHECK-NEXT:    ret +entry: +  %a = call <vscale x 1 x i32> @llvm.riscv.vluxei.mask.nxv1i32.nxv1i16( +    <vscale x 1 x i32> %0, +    ptr %1, +    <vscale x 1 x i16> %2, +    <vscale x 1 x i1> %3, +    iXLen %4, iXLen 1) + +  ret <vscale x 1 x i32> %a +} + +declare <vscale x 2 x i32> @llvm.riscv.vluxei.nxv2i32.nxv2i16( +  <vscale x 2 x i32>, +  ptr, +  <vscale x 2 x i16>, +  iXLen); + +define <vscale x 2 x i32> @intrinsic_vluxei_v_nxv2i32_nxv2i32_nxv2i16(ptr %0, <vscale x 2 x i16> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv2i32_nxv2i32_nxv2i16: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e32, m1, ta, ma +; CHECK-NEXT:    vluxei16.v v9, (a0), v8 +; CHECK-NEXT:    vmv.v.v v8, v9 +; CHECK-NEXT:    ret +entry: +  %a = call <vscale x 2 x i32> @llvm.riscv.vluxei.nxv2i32.nxv2i16( +    <vscale x 2 x i32> poison, +    ptr %0, +    <vscale x 2 x i16> %1, +    iXLen %2) + +  ret <vscale x 2 x i32> %a +} + +declare <vscale x 2 x i32> @llvm.riscv.vluxei.mask.nxv2i32.nxv2i16( +  <vscale x 2 x i32>, +  ptr, +  <vscale x 2 x i16>, +  <vscale x 2 x i1>, +  iXLen, +  iXLen); + +define <vscale x 2 x i32> @intrinsic_vluxei_mask_v_nxv2i32_nxv2i32_nxv2i16(<vscale x 2 x i32> %0, ptr %1, <vscale x 2 x i16> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv2i32_nxv2i32_nxv2i16: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e32, m1, ta, mu +; CHECK-NEXT:    vluxei16.v v8, (a0), v9, v0.t +; CHECK-NEXT:    ret +entry: +  %a = call <vscale x 2 x i32> @llvm.riscv.vluxei.mask.nxv2i32.nxv2i16( +    <vscale x 2 x i32> %0, +    ptr %1, +    <vscale x 2 x i16> %2, +    <vscale x 2 x i1> %3, +    iXLen %4, iXLen 1) + +  ret <vscale x 2 x i32> %a +} + +declare <vscale x 4 x i32> @llvm.riscv.vluxei.nxv4i32.nxv4i16( +  <vscale x 4 x i32>, +  ptr, +  <vscale x 4 x i16>, +  iXLen); + +define <vscale x 4 x i32> @intrinsic_vluxei_v_nxv4i32_nxv4i32_nxv4i16(ptr %0, <vscale x 4 x i16> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv4i32_nxv4i32_nxv4i16: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetivli zero, 1, e8, m1, ta, ma +; CHECK-NEXT:    vmv1r.v v10, v8 +; CHECK-NEXT:    vsetvli zero, a1, e32, m2, ta, ma +; CHECK-NEXT:    vluxei16.v v8, (a0), v10 +; CHECK-NEXT:    ret +entry: +  %a = call <vscale x 4 x i32> @llvm.riscv.vluxei.nxv4i32.nxv4i16( +    <vscale x 4 x i32> poison, +    ptr %0, +    <vscale x 4 x i16> %1, +    iXLen %2) + +  ret <vscale x 4 x i32> %a +} + +declare <vscale x 4 x i32> @llvm.riscv.vluxei.mask.nxv4i32.nxv4i16( +  <vscale x 4 x i32>, +  ptr, +  <vscale x 4 x i16>, +  <vscale x 4 x i1>, +  iXLen, +  iXLen); + +define <vscale x 4 x i32> 
@intrinsic_vluxei_mask_v_nxv4i32_nxv4i32_nxv4i16(<vscale x 4 x i32> %0, ptr %1, <vscale x 4 x i16> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv4i32_nxv4i32_nxv4i16: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e32, m2, ta, mu +; CHECK-NEXT:    vluxei16.v v8, (a0), v10, v0.t +; CHECK-NEXT:    ret +entry: +  %a = call <vscale x 4 x i32> @llvm.riscv.vluxei.mask.nxv4i32.nxv4i16( +    <vscale x 4 x i32> %0, +    ptr %1, +    <vscale x 4 x i16> %2, +    <vscale x 4 x i1> %3, +    iXLen %4, iXLen 1) + +  ret <vscale x 4 x i32> %a +} + +declare <vscale x 8 x i32> @llvm.riscv.vluxei.nxv8i32.nxv8i16( +  <vscale x 8 x i32>, +  ptr, +  <vscale x 8 x i16>, +  iXLen); + +define <vscale x 8 x i32> @intrinsic_vluxei_v_nxv8i32_nxv8i32_nxv8i16(ptr %0, <vscale x 8 x i16> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv8i32_nxv8i32_nxv8i16: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetivli zero, 1, e8, m1, ta, ma +; CHECK-NEXT:    vmv2r.v v12, v8 +; CHECK-NEXT:    vsetvli zero, a1, e32, m4, ta, ma +; CHECK-NEXT:    vluxei16.v v8, (a0), v12 +; CHECK-NEXT:    ret +entry: +  %a = call <vscale x 8 x i32> @llvm.riscv.vluxei.nxv8i32.nxv8i16( +    <vscale x 8 x i32> poison, +    ptr %0, +    <vscale x 8 x i16> %1, +    iXLen %2) + +  ret <vscale x 8 x i32> %a +} + +declare <vscale x 8 x i32> @llvm.riscv.vluxei.mask.nxv8i32.nxv8i16( +  <vscale x 8 x i32>, +  ptr, +  <vscale x 8 x i16>, +  <vscale x 8 x i1>, +  iXLen, +  iXLen); + +define <vscale x 8 x i32> @intrinsic_vluxei_mask_v_nxv8i32_nxv8i32_nxv8i16(<vscale x 8 x i32> %0, ptr %1, <vscale x 8 x i16> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv8i32_nxv8i32_nxv8i16: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e32, m4, ta, mu +; CHECK-NEXT:    vluxei16.v v8, (a0), v12, v0.t +; CHECK-NEXT:    ret +entry: +  %a = call <vscale x 8 x i32> @llvm.riscv.vluxei.mask.nxv8i32.nxv8i16( +    <vscale x 8 x i32> %0, +    ptr %1, +    <vscale x 8 x i16> %2, +    <vscale x 8 x i1> %3, +    iXLen %4, iXLen 1) + +  ret <vscale x 8 x i32> %a +} + +declare <vscale x 16 x i32> @llvm.riscv.vluxei.nxv16i32.nxv16i16( +  <vscale x 16 x i32>, +  ptr, +  <vscale x 16 x i16>, +  iXLen); + +define <vscale x 16 x i32> @intrinsic_vluxei_v_nxv16i32_nxv16i32_nxv16i16(ptr %0, <vscale x 16 x i16> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv16i32_nxv16i32_nxv16i16: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetivli zero, 1, e8, m1, ta, ma +; CHECK-NEXT:    vmv4r.v v16, v8 +; CHECK-NEXT:    vsetvli zero, a1, e32, m8, ta, ma +; CHECK-NEXT:    vluxei16.v v8, (a0), v16 +; CHECK-NEXT:    ret +entry: +  %a = call <vscale x 16 x i32> @llvm.riscv.vluxei.nxv16i32.nxv16i16( +    <vscale x 16 x i32> poison, +    ptr %0, +    <vscale x 16 x i16> %1, +    iXLen %2) + +  ret <vscale x 16 x i32> %a +} + +declare <vscale x 16 x i32> @llvm.riscv.vluxei.mask.nxv16i32.nxv16i16( +  <vscale x 16 x i32>, +  ptr, +  <vscale x 16 x i16>, +  <vscale x 16 x i1>, +  iXLen, +  iXLen); + +define <vscale x 16 x i32> @intrinsic_vluxei_mask_v_nxv16i32_nxv16i32_nxv16i16(<vscale x 16 x i32> %0, ptr %1, <vscale x 16 x i16> %2, <vscale x 16 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv16i32_nxv16i32_nxv16i16: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e32, m8, ta, mu +; CHECK-NEXT:    vluxei16.v v8, (a0), v16, v0.t +; CHECK-NEXT:    ret +entry: +  %a = call <vscale x 16 x i32> 
@llvm.riscv.vluxei.mask.nxv16i32.nxv16i16( +    <vscale x 16 x i32> %0, +    ptr %1, +    <vscale x 16 x i16> %2, +    <vscale x 16 x i1> %3, +    iXLen %4, iXLen 1) + +  ret <vscale x 16 x i32> %a +} + +declare <vscale x 1 x i64> @llvm.riscv.vluxei.nxv1i64.nxv1i16( +  <vscale x 1 x i64>, +  ptr, +  <vscale x 1 x i16>, +  iXLen); + +define <vscale x 1 x i64> @intrinsic_vluxei_v_nxv1i64_nxv1i64_nxv1i16(ptr %0, <vscale x 1 x i16> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv1i64_nxv1i64_nxv1i16: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e64, m1, ta, ma +; CHECK-NEXT:    vluxei16.v v9, (a0), v8 +; CHECK-NEXT:    vmv.v.v v8, v9 +; CHECK-NEXT:    ret +entry: +  %a = call <vscale x 1 x i64> @llvm.riscv.vluxei.nxv1i64.nxv1i16( +    <vscale x 1 x i64> poison, +    ptr %0, +    <vscale x 1 x i16> %1, +    iXLen %2) + +  ret <vscale x 1 x i64> %a +} + +declare <vscale x 1 x i64> @llvm.riscv.vluxei.mask.nxv1i64.nxv1i16( +  <vscale x 1 x i64>, +  ptr, +  <vscale x 1 x i16>, +  <vscale x 1 x i1>, +  iXLen, +  iXLen); + +define <vscale x 1 x i64> @intrinsic_vluxei_mask_v_nxv1i64_nxv1i64_nxv1i16(<vscale x 1 x i64> %0, ptr %1, <vscale x 1 x i16> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv1i64_nxv1i64_nxv1i16: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e64, m1, ta, mu +; CHECK-NEXT:    vluxei16.v v8, (a0), v9, v0.t +; CHECK-NEXT:    ret +entry: +  %a = call <vscale x 1 x i64> @llvm.riscv.vluxei.mask.nxv1i64.nxv1i16( +    <vscale x 1 x i64> %0, +    ptr %1, +    <vscale x 1 x i16> %2, +    <vscale x 1 x i1> %3, +    iXLen %4, iXLen 1) + +  ret <vscale x 1 x i64> %a +} + +declare <vscale x 2 x i64> @llvm.riscv.vluxei.nxv2i64.nxv2i16( +  <vscale x 2 x i64>, +  ptr, +  <vscale x 2 x i16>, +  iXLen); + +define <vscale x 2 x i64> @intrinsic_vluxei_v_nxv2i64_nxv2i64_nxv2i16(ptr %0, <vscale x 2 x i16> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv2i64_nxv2i64_nxv2i16: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetivli zero, 1, e8, m1, ta, ma +; CHECK-NEXT:    vmv1r.v v10, v8 +; CHECK-NEXT:    vsetvli zero, a1, e64, m2, ta, ma +; CHECK-NEXT:    vluxei16.v v8, (a0), v10 +; CHECK-NEXT:    ret +entry: +  %a = call <vscale x 2 x i64> @llvm.riscv.vluxei.nxv2i64.nxv2i16( +    <vscale x 2 x i64> poison, +    ptr %0, +    <vscale x 2 x i16> %1, +    iXLen %2) + +  ret <vscale x 2 x i64> %a +} + +declare <vscale x 2 x i64> @llvm.riscv.vluxei.mask.nxv2i64.nxv2i16( +  <vscale x 2 x i64>, +  ptr, +  <vscale x 2 x i16>, +  <vscale x 2 x i1>, +  iXLen, +  iXLen); + +define <vscale x 2 x i64> @intrinsic_vluxei_mask_v_nxv2i64_nxv2i64_nxv2i16(<vscale x 2 x i64> %0, ptr %1, <vscale x 2 x i16> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv2i64_nxv2i64_nxv2i16: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e64, m2, ta, mu +; CHECK-NEXT:    vluxei16.v v8, (a0), v10, v0.t +; CHECK-NEXT:    ret +entry: +  %a = call <vscale x 2 x i64> @llvm.riscv.vluxei.mask.nxv2i64.nxv2i16( +    <vscale x 2 x i64> %0, +    ptr %1, +    <vscale x 2 x i16> %2, +    <vscale x 2 x i1> %3, +    iXLen %4, iXLen 1) + +  ret <vscale x 2 x i64> %a +} + +declare <vscale x 4 x i64> @llvm.riscv.vluxei.nxv4i64.nxv4i16( +  <vscale x 4 x i64>, +  ptr, +  <vscale x 4 x i16>, +  iXLen); + +define <vscale x 4 x i64> @intrinsic_vluxei_v_nxv4i64_nxv4i64_nxv4i16(ptr %0, <vscale x 4 x i16> %1, iXLen %2) nounwind { +; CHECK-LABEL: 
intrinsic_vluxei_v_nxv4i64_nxv4i64_nxv4i16: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetivli zero, 1, e8, m1, ta, ma +; CHECK-NEXT:    vmv1r.v v12, v8 +; CHECK-NEXT:    vsetvli zero, a1, e64, m4, ta, ma +; CHECK-NEXT:    vluxei16.v v8, (a0), v12 +; CHECK-NEXT:    ret +entry: +  %a = call <vscale x 4 x i64> @llvm.riscv.vluxei.nxv4i64.nxv4i16( +    <vscale x 4 x i64> poison, +    ptr %0, +    <vscale x 4 x i16> %1, +    iXLen %2) + +  ret <vscale x 4 x i64> %a +} + +declare <vscale x 4 x i64> @llvm.riscv.vluxei.mask.nxv4i64.nxv4i16( +  <vscale x 4 x i64>, +  ptr, +  <vscale x 4 x i16>, +  <vscale x 4 x i1>, +  iXLen, +  iXLen); + +define <vscale x 4 x i64> @intrinsic_vluxei_mask_v_nxv4i64_nxv4i64_nxv4i16(<vscale x 4 x i64> %0, ptr %1, <vscale x 4 x i16> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv4i64_nxv4i64_nxv4i16: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e64, m4, ta, mu +; CHECK-NEXT:    vluxei16.v v8, (a0), v12, v0.t +; CHECK-NEXT:    ret +entry: +  %a = call <vscale x 4 x i64> @llvm.riscv.vluxei.mask.nxv4i64.nxv4i16( +    <vscale x 4 x i64> %0, +    ptr %1, +    <vscale x 4 x i16> %2, +    <vscale x 4 x i1> %3, +    iXLen %4, iXLen 1) + +  ret <vscale x 4 x i64> %a +} + +declare <vscale x 8 x i64> @llvm.riscv.vluxei.nxv8i64.nxv8i16( +  <vscale x 8 x i64>, +  ptr, +  <vscale x 8 x i16>, +  iXLen); + +define <vscale x 8 x i64> @intrinsic_vluxei_v_nxv8i64_nxv8i64_nxv8i16(ptr %0, <vscale x 8 x i16> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv8i64_nxv8i64_nxv8i16: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetivli zero, 1, e8, m1, ta, ma +; CHECK-NEXT:    vmv2r.v v16, v8 +; CHECK-NEXT:    vsetvli zero, a1, e64, m8, ta, ma +; CHECK-NEXT:    vluxei16.v v8, (a0), v16 +; CHECK-NEXT:    ret +entry: +  %a = call <vscale x 8 x i64> @llvm.riscv.vluxei.nxv8i64.nxv8i16( +    <vscale x 8 x i64> poison, +    ptr %0, +    <vscale x 8 x i16> %1, +    iXLen %2) + +  ret <vscale x 8 x i64> %a +} + +declare <vscale x 8 x i64> @llvm.riscv.vluxei.mask.nxv8i64.nxv8i16( +  <vscale x 8 x i64>, +  ptr, +  <vscale x 8 x i16>, +  <vscale x 8 x i1>, +  iXLen, +  iXLen); + +define <vscale x 8 x i64> @intrinsic_vluxei_mask_v_nxv8i64_nxv8i64_nxv8i16(<vscale x 8 x i64> %0, ptr %1, <vscale x 8 x i16> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv8i64_nxv8i64_nxv8i16: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e64, m8, ta, mu +; CHECK-NEXT:    vluxei16.v v8, (a0), v16, v0.t +; CHECK-NEXT:    ret +entry: +  %a = call <vscale x 8 x i64> @llvm.riscv.vluxei.mask.nxv8i64.nxv8i16( +    <vscale x 8 x i64> %0, +    ptr %1, +    <vscale x 8 x i16> %2, +    <vscale x 8 x i1> %3, +    iXLen %4, iXLen 1) + +  ret <vscale x 8 x i64> %a +} + +declare <vscale x 1 x half> @llvm.riscv.vluxei.nxv1f16.nxv1i16( +  <vscale x 1 x half>, +  ptr, +  <vscale x 1 x i16>, +  iXLen); + +define <vscale x 1 x half> @intrinsic_vluxei_v_nxv1f16_nxv1f16_nxv1i16(ptr %0, <vscale x 1 x i16> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv1f16_nxv1f16_nxv1i16: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e16, mf4, ta, ma +; CHECK-NEXT:    vluxei16.v v8, (a0), v8 +; CHECK-NEXT:    ret +entry: +  %a = call <vscale x 1 x half> @llvm.riscv.vluxei.nxv1f16.nxv1i16( +    <vscale x 1 x half> poison, +    ptr %0, +    <vscale x 1 x i16> %1, +    iXLen %2) + +  ret <vscale x 1 x half> %a +} + +declare <vscale x 1 x half> 
@llvm.riscv.vluxei.mask.nxv1f16.nxv1i16( +  <vscale x 1 x half>, +  ptr, +  <vscale x 1 x i16>, +  <vscale x 1 x i1>, +  iXLen, +  iXLen); + +define <vscale x 1 x half> @intrinsic_vluxei_mask_v_nxv1f16_nxv1f16_nxv1i16(<vscale x 1 x half> %0, ptr %1, <vscale x 1 x i16> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv1f16_nxv1f16_nxv1i16: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e16, mf4, ta, mu +; CHECK-NEXT:    vluxei16.v v8, (a0), v9, v0.t +; CHECK-NEXT:    ret +entry: +  %a = call <vscale x 1 x half> @llvm.riscv.vluxei.mask.nxv1f16.nxv1i16( +    <vscale x 1 x half> %0, +    ptr %1, +    <vscale x 1 x i16> %2, +    <vscale x 1 x i1> %3, +    iXLen %4, iXLen 1) + +  ret <vscale x 1 x half> %a +} + +declare <vscale x 2 x half> @llvm.riscv.vluxei.nxv2f16.nxv2i16( +  <vscale x 2 x half>, +  ptr, +  <vscale x 2 x i16>, +  iXLen); + +define <vscale x 2 x half> @intrinsic_vluxei_v_nxv2f16_nxv2f16_nxv2i16(ptr %0, <vscale x 2 x i16> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv2f16_nxv2f16_nxv2i16: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e16, mf2, ta, ma +; CHECK-NEXT:    vluxei16.v v8, (a0), v8 +; CHECK-NEXT:    ret +entry: +  %a = call <vscale x 2 x half> @llvm.riscv.vluxei.nxv2f16.nxv2i16( +    <vscale x 2 x half> poison, +    ptr %0, +    <vscale x 2 x i16> %1, +    iXLen %2) + +  ret <vscale x 2 x half> %a +} + +declare <vscale x 2 x half> @llvm.riscv.vluxei.mask.nxv2f16.nxv2i16( +  <vscale x 2 x half>, +  ptr, +  <vscale x 2 x i16>, +  <vscale x 2 x i1>, +  iXLen, +  iXLen); + +define <vscale x 2 x half> @intrinsic_vluxei_mask_v_nxv2f16_nxv2f16_nxv2i16(<vscale x 2 x half> %0, ptr %1, <vscale x 2 x i16> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv2f16_nxv2f16_nxv2i16: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e16, mf2, ta, mu +; CHECK-NEXT:    vluxei16.v v8, (a0), v9, v0.t +; CHECK-NEXT:    ret +entry: +  %a = call <vscale x 2 x half> @llvm.riscv.vluxei.mask.nxv2f16.nxv2i16( +    <vscale x 2 x half> %0, +    ptr %1, +    <vscale x 2 x i16> %2, +    <vscale x 2 x i1> %3, +    iXLen %4, iXLen 1) + +  ret <vscale x 2 x half> %a +} + +declare <vscale x 4 x half> @llvm.riscv.vluxei.nxv4f16.nxv4i16( +  <vscale x 4 x half>, +  ptr, +  <vscale x 4 x i16>, +  iXLen); + +define <vscale x 4 x half> @intrinsic_vluxei_v_nxv4f16_nxv4f16_nxv4i16(ptr %0, <vscale x 4 x i16> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv4f16_nxv4f16_nxv4i16: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e16, m1, ta, ma +; CHECK-NEXT:    vluxei16.v v8, (a0), v8 +; CHECK-NEXT:    ret +entry: +  %a = call <vscale x 4 x half> @llvm.riscv.vluxei.nxv4f16.nxv4i16( +    <vscale x 4 x half> poison, +    ptr %0, +    <vscale x 4 x i16> %1, +    iXLen %2) + +  ret <vscale x 4 x half> %a +} + +declare <vscale x 4 x half> @llvm.riscv.vluxei.mask.nxv4f16.nxv4i16( +  <vscale x 4 x half>, +  ptr, +  <vscale x 4 x i16>, +  <vscale x 4 x i1>, +  iXLen, +  iXLen); + +define <vscale x 4 x half> @intrinsic_vluxei_mask_v_nxv4f16_nxv4f16_nxv4i16(<vscale x 4 x half> %0, ptr %1, <vscale x 4 x i16> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv4f16_nxv4f16_nxv4i16: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e16, m1, ta, mu +; CHECK-NEXT:    vluxei16.v v8, (a0), v9, v0.t +; CHECK-NEXT:    ret +entry: +  %a = call <vscale x 4 x half> 
@llvm.riscv.vluxei.mask.nxv4f16.nxv4i16( +    <vscale x 4 x half> %0, +    ptr %1, +    <vscale x 4 x i16> %2, +    <vscale x 4 x i1> %3, +    iXLen %4, iXLen 1) + +  ret <vscale x 4 x half> %a +} + +declare <vscale x 8 x half> @llvm.riscv.vluxei.nxv8f16.nxv8i16( +  <vscale x 8 x half>, +  ptr, +  <vscale x 8 x i16>, +  iXLen); + +define <vscale x 8 x half> @intrinsic_vluxei_v_nxv8f16_nxv8f16_nxv8i16(ptr %0, <vscale x 8 x i16> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv8f16_nxv8f16_nxv8i16: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e16, m2, ta, ma +; CHECK-NEXT:    vluxei16.v v8, (a0), v8 +; CHECK-NEXT:    ret +entry: +  %a = call <vscale x 8 x half> @llvm.riscv.vluxei.nxv8f16.nxv8i16( +    <vscale x 8 x half> poison, +    ptr %0, +    <vscale x 8 x i16> %1, +    iXLen %2) + +  ret <vscale x 8 x half> %a +} + +declare <vscale x 8 x half> @llvm.riscv.vluxei.mask.nxv8f16.nxv8i16( +  <vscale x 8 x half>, +  ptr, +  <vscale x 8 x i16>, +  <vscale x 8 x i1>, +  iXLen, +  iXLen); + +define <vscale x 8 x half> @intrinsic_vluxei_mask_v_nxv8f16_nxv8f16_nxv8i16(<vscale x 8 x half> %0, ptr %1, <vscale x 8 x i16> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv8f16_nxv8f16_nxv8i16: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e16, m2, ta, mu +; CHECK-NEXT:    vluxei16.v v8, (a0), v10, v0.t +; CHECK-NEXT:    ret +entry: +  %a = call <vscale x 8 x half> @llvm.riscv.vluxei.mask.nxv8f16.nxv8i16( +    <vscale x 8 x half> %0, +    ptr %1, +    <vscale x 8 x i16> %2, +    <vscale x 8 x i1> %3, +    iXLen %4, iXLen 1) + +  ret <vscale x 8 x half> %a +} + +declare <vscale x 16 x half> @llvm.riscv.vluxei.nxv16f16.nxv16i16( +  <vscale x 16 x half>, +  ptr, +  <vscale x 16 x i16>, +  iXLen); + +define <vscale x 16 x half> @intrinsic_vluxei_v_nxv16f16_nxv16f16_nxv16i16(ptr %0, <vscale x 16 x i16> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv16f16_nxv16f16_nxv16i16: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e16, m4, ta, ma +; CHECK-NEXT:    vluxei16.v v8, (a0), v8 +; CHECK-NEXT:    ret +entry: +  %a = call <vscale x 16 x half> @llvm.riscv.vluxei.nxv16f16.nxv16i16( +    <vscale x 16 x half> poison, +    ptr %0, +    <vscale x 16 x i16> %1, +    iXLen %2) + +  ret <vscale x 16 x half> %a +} + +declare <vscale x 16 x half> @llvm.riscv.vluxei.mask.nxv16f16.nxv16i16( +  <vscale x 16 x half>, +  ptr, +  <vscale x 16 x i16>, +  <vscale x 16 x i1>, +  iXLen, +  iXLen); + +define <vscale x 16 x half> @intrinsic_vluxei_mask_v_nxv16f16_nxv16f16_nxv16i16(<vscale x 16 x half> %0, ptr %1, <vscale x 16 x i16> %2, <vscale x 16 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv16f16_nxv16f16_nxv16i16: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e16, m4, ta, mu +; CHECK-NEXT:    vluxei16.v v8, (a0), v12, v0.t +; CHECK-NEXT:    ret +entry: +  %a = call <vscale x 16 x half> @llvm.riscv.vluxei.mask.nxv16f16.nxv16i16( +    <vscale x 16 x half> %0, +    ptr %1, +    <vscale x 16 x i16> %2, +    <vscale x 16 x i1> %3, +    iXLen %4, iXLen 1) + +  ret <vscale x 16 x half> %a +} + +declare <vscale x 32 x half> @llvm.riscv.vluxei.nxv32f16.nxv32i16( +  <vscale x 32 x half>, +  ptr, +  <vscale x 32 x i16>, +  iXLen); + +define <vscale x 32 x half> @intrinsic_vluxei_v_nxv32f16_nxv32f16_nxv32i16(ptr %0, <vscale x 32 x i16> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv32f16_nxv32f16_nxv32i16: +; CHECK:       # %bb.0: # 
%entry +; CHECK-NEXT:    vsetvli zero, a1, e16, m8, ta, ma +; CHECK-NEXT:    vluxei16.v v8, (a0), v8 +; CHECK-NEXT:    ret +entry: +  %a = call <vscale x 32 x half> @llvm.riscv.vluxei.nxv32f16.nxv32i16( +    <vscale x 32 x half> poison, +    ptr %0, +    <vscale x 32 x i16> %1, +    iXLen %2) + +  ret <vscale x 32 x half> %a +} + +declare <vscale x 32 x half> @llvm.riscv.vluxei.mask.nxv32f16.nxv32i16( +  <vscale x 32 x half>, +  ptr, +  <vscale x 32 x i16>, +  <vscale x 32 x i1>, +  iXLen, +  iXLen); + +define <vscale x 32 x half> @intrinsic_vluxei_mask_v_nxv32f16_nxv32f16_nxv32i16(<vscale x 32 x half> %0, ptr %1, <vscale x 32 x i16> %2, <vscale x 32 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv32f16_nxv32f16_nxv32i16: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e16, m8, ta, mu +; CHECK-NEXT:    vluxei16.v v8, (a0), v16, v0.t +; CHECK-NEXT:    ret +entry: +  %a = call <vscale x 32 x half> @llvm.riscv.vluxei.mask.nxv32f16.nxv32i16( +    <vscale x 32 x half> %0, +    ptr %1, +    <vscale x 32 x i16> %2, +    <vscale x 32 x i1> %3, +    iXLen %4, iXLen 1) + +  ret <vscale x 32 x half> %a +} + +declare <vscale x 1 x float> @llvm.riscv.vluxei.nxv1f32.nxv1i16( +  <vscale x 1 x float>, +  ptr, +  <vscale x 1 x i16>, +  iXLen); + +define <vscale x 1 x float> @intrinsic_vluxei_v_nxv1f32_nxv1f32_nxv1i16(ptr %0, <vscale x 1 x i16> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv1f32_nxv1f32_nxv1i16: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e32, mf2, ta, ma +; CHECK-NEXT:    vluxei16.v v9, (a0), v8 +; CHECK-NEXT:    vmv1r.v v8, v9 +; CHECK-NEXT:    ret +entry: +  %a = call <vscale x 1 x float> @llvm.riscv.vluxei.nxv1f32.nxv1i16( +    <vscale x 1 x float> poison, +    ptr %0, +    <vscale x 1 x i16> %1, +    iXLen %2) + +  ret <vscale x 1 x float> %a +} + +declare <vscale x 1 x float> @llvm.riscv.vluxei.mask.nxv1f32.nxv1i16( +  <vscale x 1 x float>, +  ptr, +  <vscale x 1 x i16>, +  <vscale x 1 x i1>, +  iXLen, +  iXLen); + +define <vscale x 1 x float> @intrinsic_vluxei_mask_v_nxv1f32_nxv1f32_nxv1i16(<vscale x 1 x float> %0, ptr %1, <vscale x 1 x i16> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv1f32_nxv1f32_nxv1i16: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e32, mf2, ta, mu +; CHECK-NEXT:    vluxei16.v v8, (a0), v9, v0.t +; CHECK-NEXT:    ret +entry: +  %a = call <vscale x 1 x float> @llvm.riscv.vluxei.mask.nxv1f32.nxv1i16( +    <vscale x 1 x float> %0, +    ptr %1, +    <vscale x 1 x i16> %2, +    <vscale x 1 x i1> %3, +    iXLen %4, iXLen 1) + +  ret <vscale x 1 x float> %a +} + +declare <vscale x 2 x float> @llvm.riscv.vluxei.nxv2f32.nxv2i16( +  <vscale x 2 x float>, +  ptr, +  <vscale x 2 x i16>, +  iXLen); + +define <vscale x 2 x float> @intrinsic_vluxei_v_nxv2f32_nxv2f32_nxv2i16(ptr %0, <vscale x 2 x i16> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv2f32_nxv2f32_nxv2i16: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e32, m1, ta, ma +; CHECK-NEXT:    vluxei16.v v9, (a0), v8 +; CHECK-NEXT:    vmv.v.v v8, v9 +; CHECK-NEXT:    ret +entry: +  %a = call <vscale x 2 x float> @llvm.riscv.vluxei.nxv2f32.nxv2i16( +    <vscale x 2 x float> poison, +    ptr %0, +    <vscale x 2 x i16> %1, +    iXLen %2) + +  ret <vscale x 2 x float> %a +} + +declare <vscale x 2 x float> @llvm.riscv.vluxei.mask.nxv2f32.nxv2i16( +  <vscale x 2 x float>, +  ptr, +  <vscale x 2 x i16>, +  <vscale x 2 x i1>, +  iXLen, 
+  iXLen); + +define <vscale x 2 x float> @intrinsic_vluxei_mask_v_nxv2f32_nxv2f32_nxv2i16(<vscale x 2 x float> %0, ptr %1, <vscale x 2 x i16> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv2f32_nxv2f32_nxv2i16: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e32, m1, ta, mu +; CHECK-NEXT:    vluxei16.v v8, (a0), v9, v0.t +; CHECK-NEXT:    ret +entry: +  %a = call <vscale x 2 x float> @llvm.riscv.vluxei.mask.nxv2f32.nxv2i16( +    <vscale x 2 x float> %0, +    ptr %1, +    <vscale x 2 x i16> %2, +    <vscale x 2 x i1> %3, +    iXLen %4, iXLen 1) + +  ret <vscale x 2 x float> %a +} + +declare <vscale x 4 x float> @llvm.riscv.vluxei.nxv4f32.nxv4i16( +  <vscale x 4 x float>, +  ptr, +  <vscale x 4 x i16>, +  iXLen); + +define <vscale x 4 x float> @intrinsic_vluxei_v_nxv4f32_nxv4f32_nxv4i16(ptr %0, <vscale x 4 x i16> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv4f32_nxv4f32_nxv4i16: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetivli zero, 1, e8, m1, ta, ma +; CHECK-NEXT:    vmv1r.v v10, v8 +; CHECK-NEXT:    vsetvli zero, a1, e32, m2, ta, ma +; CHECK-NEXT:    vluxei16.v v8, (a0), v10 +; CHECK-NEXT:    ret +entry: +  %a = call <vscale x 4 x float> @llvm.riscv.vluxei.nxv4f32.nxv4i16( +    <vscale x 4 x float> poison, +    ptr %0, +    <vscale x 4 x i16> %1, +    iXLen %2) + +  ret <vscale x 4 x float> %a +} + +declare <vscale x 4 x float> @llvm.riscv.vluxei.mask.nxv4f32.nxv4i16( +  <vscale x 4 x float>, +  ptr, +  <vscale x 4 x i16>, +  <vscale x 4 x i1>, +  iXLen, +  iXLen); + +define <vscale x 4 x float> @intrinsic_vluxei_mask_v_nxv4f32_nxv4f32_nxv4i16(<vscale x 4 x float> %0, ptr %1, <vscale x 4 x i16> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv4f32_nxv4f32_nxv4i16: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e32, m2, ta, mu +; CHECK-NEXT:    vluxei16.v v8, (a0), v10, v0.t +; CHECK-NEXT:    ret +entry: +  %a = call <vscale x 4 x float> @llvm.riscv.vluxei.mask.nxv4f32.nxv4i16( +    <vscale x 4 x float> %0, +    ptr %1, +    <vscale x 4 x i16> %2, +    <vscale x 4 x i1> %3, +    iXLen %4, iXLen 1) + +  ret <vscale x 4 x float> %a +} + +declare <vscale x 8 x float> @llvm.riscv.vluxei.nxv8f32.nxv8i16( +  <vscale x 8 x float>, +  ptr, +  <vscale x 8 x i16>, +  iXLen); + +define <vscale x 8 x float> @intrinsic_vluxei_v_nxv8f32_nxv8f32_nxv8i16(ptr %0, <vscale x 8 x i16> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv8f32_nxv8f32_nxv8i16: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetivli zero, 1, e8, m1, ta, ma +; CHECK-NEXT:    vmv2r.v v12, v8 +; CHECK-NEXT:    vsetvli zero, a1, e32, m4, ta, ma +; CHECK-NEXT:    vluxei16.v v8, (a0), v12 +; CHECK-NEXT:    ret +entry: +  %a = call <vscale x 8 x float> @llvm.riscv.vluxei.nxv8f32.nxv8i16( +    <vscale x 8 x float> poison, +    ptr %0, +    <vscale x 8 x i16> %1, +    iXLen %2) + +  ret <vscale x 8 x float> %a +} + +declare <vscale x 8 x float> @llvm.riscv.vluxei.mask.nxv8f32.nxv8i16( +  <vscale x 8 x float>, +  ptr, +  <vscale x 8 x i16>, +  <vscale x 8 x i1>, +  iXLen, +  iXLen); + +define <vscale x 8 x float> @intrinsic_vluxei_mask_v_nxv8f32_nxv8f32_nxv8i16(<vscale x 8 x float> %0, ptr %1, <vscale x 8 x i16> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv8f32_nxv8f32_nxv8i16: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e32, m4, ta, mu +; CHECK-NEXT:    vluxei16.v v8, (a0), v12, v0.t +; 
CHECK-NEXT:    ret +entry: +  %a = call <vscale x 8 x float> @llvm.riscv.vluxei.mask.nxv8f32.nxv8i16( +    <vscale x 8 x float> %0, +    ptr %1, +    <vscale x 8 x i16> %2, +    <vscale x 8 x i1> %3, +    iXLen %4, iXLen 1) + +  ret <vscale x 8 x float> %a +} + +declare <vscale x 16 x float> @llvm.riscv.vluxei.nxv16f32.nxv16i16( +  <vscale x 16 x float>, +  ptr, +  <vscale x 16 x i16>, +  iXLen); + +define <vscale x 16 x float> @intrinsic_vluxei_v_nxv16f32_nxv16f32_nxv16i16(ptr %0, <vscale x 16 x i16> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv16f32_nxv16f32_nxv16i16: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetivli zero, 1, e8, m1, ta, ma +; CHECK-NEXT:    vmv4r.v v16, v8 +; CHECK-NEXT:    vsetvli zero, a1, e32, m8, ta, ma +; CHECK-NEXT:    vluxei16.v v8, (a0), v16 +; CHECK-NEXT:    ret +entry: +  %a = call <vscale x 16 x float> @llvm.riscv.vluxei.nxv16f32.nxv16i16( +    <vscale x 16 x float> poison, +    ptr %0, +    <vscale x 16 x i16> %1, +    iXLen %2) + +  ret <vscale x 16 x float> %a +} + +declare <vscale x 16 x float> @llvm.riscv.vluxei.mask.nxv16f32.nxv16i16( +  <vscale x 16 x float>, +  ptr, +  <vscale x 16 x i16>, +  <vscale x 16 x i1>, +  iXLen, +  iXLen); + +define <vscale x 16 x float> @intrinsic_vluxei_mask_v_nxv16f32_nxv16f32_nxv16i16(<vscale x 16 x float> %0, ptr %1, <vscale x 16 x i16> %2, <vscale x 16 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv16f32_nxv16f32_nxv16i16: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e32, m8, ta, mu +; CHECK-NEXT:    vluxei16.v v8, (a0), v16, v0.t +; CHECK-NEXT:    ret +entry: +  %a = call <vscale x 16 x float> @llvm.riscv.vluxei.mask.nxv16f32.nxv16i16( +    <vscale x 16 x float> %0, +    ptr %1, +    <vscale x 16 x i16> %2, +    <vscale x 16 x i1> %3, +    iXLen %4, iXLen 1) + +  ret <vscale x 16 x float> %a +} + +declare <vscale x 1 x double> @llvm.riscv.vluxei.nxv1f64.nxv1i16( +  <vscale x 1 x double>, +  ptr, +  <vscale x 1 x i16>, +  iXLen); + +define <vscale x 1 x double> @intrinsic_vluxei_v_nxv1f64_nxv1f64_nxv1i16(ptr %0, <vscale x 1 x i16> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv1f64_nxv1f64_nxv1i16: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e64, m1, ta, ma +; CHECK-NEXT:    vluxei16.v v9, (a0), v8 +; CHECK-NEXT:    vmv.v.v v8, v9 +; CHECK-NEXT:    ret +entry: +  %a = call <vscale x 1 x double> @llvm.riscv.vluxei.nxv1f64.nxv1i16( +    <vscale x 1 x double> poison, +    ptr %0, +    <vscale x 1 x i16> %1, +    iXLen %2) + +  ret <vscale x 1 x double> %a +} + +declare <vscale x 1 x double> @llvm.riscv.vluxei.mask.nxv1f64.nxv1i16( +  <vscale x 1 x double>, +  ptr, +  <vscale x 1 x i16>, +  <vscale x 1 x i1>, +  iXLen, +  iXLen); + +define <vscale x 1 x double> @intrinsic_vluxei_mask_v_nxv1f64_nxv1f64_nxv1i16(<vscale x 1 x double> %0, ptr %1, <vscale x 1 x i16> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv1f64_nxv1f64_nxv1i16: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e64, m1, ta, mu +; CHECK-NEXT:    vluxei16.v v8, (a0), v9, v0.t +; CHECK-NEXT:    ret +entry: +  %a = call <vscale x 1 x double> @llvm.riscv.vluxei.mask.nxv1f64.nxv1i16( +    <vscale x 1 x double> %0, +    ptr %1, +    <vscale x 1 x i16> %2, +    <vscale x 1 x i1> %3, +    iXLen %4, iXLen 1) + +  ret <vscale x 1 x double> %a +} + +declare <vscale x 2 x double> @llvm.riscv.vluxei.nxv2f64.nxv2i16( +  <vscale x 2 x double>, +  ptr, +  <vscale x 2 x i16>, +  iXLen); + 
+define <vscale x 2 x double> @intrinsic_vluxei_v_nxv2f64_nxv2f64_nxv2i16(ptr %0, <vscale x 2 x i16> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv2f64_nxv2f64_nxv2i16: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetivli zero, 1, e8, m1, ta, ma +; CHECK-NEXT:    vmv1r.v v10, v8 +; CHECK-NEXT:    vsetvli zero, a1, e64, m2, ta, ma +; CHECK-NEXT:    vluxei16.v v8, (a0), v10 +; CHECK-NEXT:    ret +entry: +  %a = call <vscale x 2 x double> @llvm.riscv.vluxei.nxv2f64.nxv2i16( +    <vscale x 2 x double> poison, +    ptr %0, +    <vscale x 2 x i16> %1, +    iXLen %2) + +  ret <vscale x 2 x double> %a +} + +declare <vscale x 2 x double> @llvm.riscv.vluxei.mask.nxv2f64.nxv2i16( +  <vscale x 2 x double>, +  ptr, +  <vscale x 2 x i16>, +  <vscale x 2 x i1>, +  iXLen, +  iXLen); + +define <vscale x 2 x double> @intrinsic_vluxei_mask_v_nxv2f64_nxv2f64_nxv2i16(<vscale x 2 x double> %0, ptr %1, <vscale x 2 x i16> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv2f64_nxv2f64_nxv2i16: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e64, m2, ta, mu +; CHECK-NEXT:    vluxei16.v v8, (a0), v10, v0.t +; CHECK-NEXT:    ret +entry: +  %a = call <vscale x 2 x double> @llvm.riscv.vluxei.mask.nxv2f64.nxv2i16( +    <vscale x 2 x double> %0, +    ptr %1, +    <vscale x 2 x i16> %2, +    <vscale x 2 x i1> %3, +    iXLen %4, iXLen 1) + +  ret <vscale x 2 x double> %a +} + +declare <vscale x 4 x double> @llvm.riscv.vluxei.nxv4f64.nxv4i16( +  <vscale x 4 x double>, +  ptr, +  <vscale x 4 x i16>, +  iXLen); + +define <vscale x 4 x double> @intrinsic_vluxei_v_nxv4f64_nxv4f64_nxv4i16(ptr %0, <vscale x 4 x i16> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv4f64_nxv4f64_nxv4i16: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetivli zero, 1, e8, m1, ta, ma +; CHECK-NEXT:    vmv1r.v v12, v8 +; CHECK-NEXT:    vsetvli zero, a1, e64, m4, ta, ma +; CHECK-NEXT:    vluxei16.v v8, (a0), v12 +; CHECK-NEXT:    ret +entry: +  %a = call <vscale x 4 x double> @llvm.riscv.vluxei.nxv4f64.nxv4i16( +    <vscale x 4 x double> poison, +    ptr %0, +    <vscale x 4 x i16> %1, +    iXLen %2) + +  ret <vscale x 4 x double> %a +} + +declare <vscale x 4 x double> @llvm.riscv.vluxei.mask.nxv4f64.nxv4i16( +  <vscale x 4 x double>, +  ptr, +  <vscale x 4 x i16>, +  <vscale x 4 x i1>, +  iXLen, +  iXLen); + +define <vscale x 4 x double> @intrinsic_vluxei_mask_v_nxv4f64_nxv4f64_nxv4i16(<vscale x 4 x double> %0, ptr %1, <vscale x 4 x i16> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv4f64_nxv4f64_nxv4i16: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e64, m4, ta, mu +; CHECK-NEXT:    vluxei16.v v8, (a0), v12, v0.t +; CHECK-NEXT:    ret +entry: +  %a = call <vscale x 4 x double> @llvm.riscv.vluxei.mask.nxv4f64.nxv4i16( +    <vscale x 4 x double> %0, +    ptr %1, +    <vscale x 4 x i16> %2, +    <vscale x 4 x i1> %3, +    iXLen %4, iXLen 1) + +  ret <vscale x 4 x double> %a +} + +declare <vscale x 8 x double> @llvm.riscv.vluxei.nxv8f64.nxv8i16( +  <vscale x 8 x double>, +  ptr, +  <vscale x 8 x i16>, +  iXLen); + +define <vscale x 8 x double> @intrinsic_vluxei_v_nxv8f64_nxv8f64_nxv8i16(ptr %0, <vscale x 8 x i16> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv8f64_nxv8f64_nxv8i16: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetivli zero, 1, e8, m1, ta, ma +; CHECK-NEXT:    vmv2r.v v16, v8 +; CHECK-NEXT:    vsetvli zero, a1, e64, m8, ta, ma +; CHECK-NEXT:    
vluxei16.v v8, (a0), v16 +; CHECK-NEXT:    ret +entry: +  %a = call <vscale x 8 x double> @llvm.riscv.vluxei.nxv8f64.nxv8i16( +    <vscale x 8 x double> poison, +    ptr %0, +    <vscale x 8 x i16> %1, +    iXLen %2) + +  ret <vscale x 8 x double> %a +} + +declare <vscale x 8 x double> @llvm.riscv.vluxei.mask.nxv8f64.nxv8i16( +  <vscale x 8 x double>, +  ptr, +  <vscale x 8 x i16>, +  <vscale x 8 x i1>, +  iXLen, +  iXLen); + +define <vscale x 8 x double> @intrinsic_vluxei_mask_v_nxv8f64_nxv8f64_nxv8i16(<vscale x 8 x double> %0, ptr %1, <vscale x 8 x i16> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv8f64_nxv8f64_nxv8i16: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e64, m8, ta, mu +; CHECK-NEXT:    vluxei16.v v8, (a0), v16, v0.t +; CHECK-NEXT:    ret +entry: +  %a = call <vscale x 8 x double> @llvm.riscv.vluxei.mask.nxv8f64.nxv8i16( +    <vscale x 8 x double> %0, +    ptr %1, +    <vscale x 8 x i16> %2, +    <vscale x 8 x i1> %3, +    iXLen %4, iXLen 1) + +  ret <vscale x 8 x double> %a +} + +declare <vscale x 1 x i8> @llvm.riscv.vluxei.nxv1i8.nxv1i8( +  <vscale x 1 x i8>, +  ptr, +  <vscale x 1 x i8>, +  iXLen); + +define <vscale x 1 x i8> @intrinsic_vluxei_v_nxv1i8_nxv1i8_nxv1i8(ptr %0, <vscale x 1 x i8> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv1i8_nxv1i8_nxv1i8: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e8, mf8, ta, ma +; CHECK-NEXT:    vluxei8.v v8, (a0), v8 +; CHECK-NEXT:    ret +entry: +  %a = call <vscale x 1 x i8> @llvm.riscv.vluxei.nxv1i8.nxv1i8( +    <vscale x 1 x i8> poison, +    ptr %0, +    <vscale x 1 x i8> %1, +    iXLen %2) + +  ret <vscale x 1 x i8> %a +} + +declare <vscale x 1 x i8> @llvm.riscv.vluxei.mask.nxv1i8.nxv1i8( +  <vscale x 1 x i8>, +  ptr, +  <vscale x 1 x i8>, +  <vscale x 1 x i1>, +  iXLen, +  iXLen); + +define <vscale x 1 x i8> @intrinsic_vluxei_mask_v_nxv1i8_nxv1i8_nxv1i8(<vscale x 1 x i8> %0, ptr %1, <vscale x 1 x i8> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv1i8_nxv1i8_nxv1i8: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e8, mf8, ta, mu +; CHECK-NEXT:    vluxei8.v v8, (a0), v9, v0.t +; CHECK-NEXT:    ret +entry: +  %a = call <vscale x 1 x i8> @llvm.riscv.vluxei.mask.nxv1i8.nxv1i8( +    <vscale x 1 x i8> %0, +    ptr %1, +    <vscale x 1 x i8> %2, +    <vscale x 1 x i1> %3, +    iXLen %4, iXLen 1) + +  ret <vscale x 1 x i8> %a +} + +declare <vscale x 2 x i8> @llvm.riscv.vluxei.nxv2i8.nxv2i8( +  <vscale x 2 x i8>, +  ptr, +  <vscale x 2 x i8>, +  iXLen); + +define <vscale x 2 x i8> @intrinsic_vluxei_v_nxv2i8_nxv2i8_nxv2i8(ptr %0, <vscale x 2 x i8> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv2i8_nxv2i8_nxv2i8: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e8, mf4, ta, ma +; CHECK-NEXT:    vluxei8.v v8, (a0), v8 +; CHECK-NEXT:    ret +entry: +  %a = call <vscale x 2 x i8> @llvm.riscv.vluxei.nxv2i8.nxv2i8( +    <vscale x 2 x i8> poison, +    ptr %0, +    <vscale x 2 x i8> %1, +    iXLen %2) + +  ret <vscale x 2 x i8> %a +} + +declare <vscale x 2 x i8> @llvm.riscv.vluxei.mask.nxv2i8.nxv2i8( +  <vscale x 2 x i8>, +  ptr, +  <vscale x 2 x i8>, +  <vscale x 2 x i1>, +  iXLen, +  iXLen); + +define <vscale x 2 x i8> @intrinsic_vluxei_mask_v_nxv2i8_nxv2i8_nxv2i8(<vscale x 2 x i8> %0, ptr %1, <vscale x 2 x i8> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv2i8_nxv2i8_nxv2i8: +; CHECK:       # 
%bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e8, mf4, ta, mu +; CHECK-NEXT:    vluxei8.v v8, (a0), v9, v0.t +; CHECK-NEXT:    ret +entry: +  %a = call <vscale x 2 x i8> @llvm.riscv.vluxei.mask.nxv2i8.nxv2i8( +    <vscale x 2 x i8> %0, +    ptr %1, +    <vscale x 2 x i8> %2, +    <vscale x 2 x i1> %3, +    iXLen %4, iXLen 1) + +  ret <vscale x 2 x i8> %a +} + +declare <vscale x 4 x i8> @llvm.riscv.vluxei.nxv4i8.nxv4i8( +  <vscale x 4 x i8>, +  ptr, +  <vscale x 4 x i8>, +  iXLen); + +define <vscale x 4 x i8> @intrinsic_vluxei_v_nxv4i8_nxv4i8_nxv4i8(ptr %0, <vscale x 4 x i8> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv4i8_nxv4i8_nxv4i8: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e8, mf2, ta, ma +; CHECK-NEXT:    vluxei8.v v8, (a0), v8 +; CHECK-NEXT:    ret +entry: +  %a = call <vscale x 4 x i8> @llvm.riscv.vluxei.nxv4i8.nxv4i8( +    <vscale x 4 x i8> poison, +    ptr %0, +    <vscale x 4 x i8> %1, +    iXLen %2) + +  ret <vscale x 4 x i8> %a +} + +declare <vscale x 4 x i8> @llvm.riscv.vluxei.mask.nxv4i8.nxv4i8( +  <vscale x 4 x i8>, +  ptr, +  <vscale x 4 x i8>, +  <vscale x 4 x i1>, +  iXLen, +  iXLen); + +define <vscale x 4 x i8> @intrinsic_vluxei_mask_v_nxv4i8_nxv4i8_nxv4i8(<vscale x 4 x i8> %0, ptr %1, <vscale x 4 x i8> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv4i8_nxv4i8_nxv4i8: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e8, mf2, ta, mu +; CHECK-NEXT:    vluxei8.v v8, (a0), v9, v0.t +; CHECK-NEXT:    ret +entry: +  %a = call <vscale x 4 x i8> @llvm.riscv.vluxei.mask.nxv4i8.nxv4i8( +    <vscale x 4 x i8> %0, +    ptr %1, +    <vscale x 4 x i8> %2, +    <vscale x 4 x i1> %3, +    iXLen %4, iXLen 1) + +  ret <vscale x 4 x i8> %a +} + +declare <vscale x 8 x i8> @llvm.riscv.vluxei.nxv8i8.nxv8i8( +  <vscale x 8 x i8>, +  ptr, +  <vscale x 8 x i8>, +  iXLen); + +define <vscale x 8 x i8> @intrinsic_vluxei_v_nxv8i8_nxv8i8_nxv8i8(ptr %0, <vscale x 8 x i8> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv8i8_nxv8i8_nxv8i8: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e8, m1, ta, ma +; CHECK-NEXT:    vluxei8.v v8, (a0), v8 +; CHECK-NEXT:    ret +entry: +  %a = call <vscale x 8 x i8> @llvm.riscv.vluxei.nxv8i8.nxv8i8( +    <vscale x 8 x i8> poison, +    ptr %0, +    <vscale x 8 x i8> %1, +    iXLen %2) + +  ret <vscale x 8 x i8> %a +} + +declare <vscale x 8 x i8> @llvm.riscv.vluxei.mask.nxv8i8.nxv8i8( +  <vscale x 8 x i8>, +  ptr, +  <vscale x 8 x i8>, +  <vscale x 8 x i1>, +  iXLen, +  iXLen); + +define <vscale x 8 x i8> @intrinsic_vluxei_mask_v_nxv8i8_nxv8i8_nxv8i8(<vscale x 8 x i8> %0, ptr %1, <vscale x 8 x i8> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv8i8_nxv8i8_nxv8i8: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e8, m1, ta, mu +; CHECK-NEXT:    vluxei8.v v8, (a0), v9, v0.t +; CHECK-NEXT:    ret +entry: +  %a = call <vscale x 8 x i8> @llvm.riscv.vluxei.mask.nxv8i8.nxv8i8( +    <vscale x 8 x i8> %0, +    ptr %1, +    <vscale x 8 x i8> %2, +    <vscale x 8 x i1> %3, +    iXLen %4, iXLen 1) + +  ret <vscale x 8 x i8> %a +} + +declare <vscale x 16 x i8> @llvm.riscv.vluxei.nxv16i8.nxv16i8( +  <vscale x 16 x i8>, +  ptr, +  <vscale x 16 x i8>, +  iXLen); + +define <vscale x 16 x i8> @intrinsic_vluxei_v_nxv16i8_nxv16i8_nxv16i8(ptr %0, <vscale x 16 x i8> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv16i8_nxv16i8_nxv16i8: +; CHECK:       # %bb.0: # 
%entry +; CHECK-NEXT:    vsetvli zero, a1, e8, m2, ta, ma +; CHECK-NEXT:    vluxei8.v v8, (a0), v8 +; CHECK-NEXT:    ret +entry: +  %a = call <vscale x 16 x i8> @llvm.riscv.vluxei.nxv16i8.nxv16i8( +    <vscale x 16 x i8> poison, +    ptr %0, +    <vscale x 16 x i8> %1, +    iXLen %2) + +  ret <vscale x 16 x i8> %a +} + +declare <vscale x 16 x i8> @llvm.riscv.vluxei.mask.nxv16i8.nxv16i8( +  <vscale x 16 x i8>, +  ptr, +  <vscale x 16 x i8>, +  <vscale x 16 x i1>, +  iXLen, +  iXLen); + +define <vscale x 16 x i8> @intrinsic_vluxei_mask_v_nxv16i8_nxv16i8_nxv16i8(<vscale x 16 x i8> %0, ptr %1, <vscale x 16 x i8> %2, <vscale x 16 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv16i8_nxv16i8_nxv16i8: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e8, m2, ta, mu +; CHECK-NEXT:    vluxei8.v v8, (a0), v10, v0.t +; CHECK-NEXT:    ret +entry: +  %a = call <vscale x 16 x i8> @llvm.riscv.vluxei.mask.nxv16i8.nxv16i8( +    <vscale x 16 x i8> %0, +    ptr %1, +    <vscale x 16 x i8> %2, +    <vscale x 16 x i1> %3, +    iXLen %4, iXLen 1) + +  ret <vscale x 16 x i8> %a +} + +declare <vscale x 32 x i8> @llvm.riscv.vluxei.nxv32i8.nxv32i8( +  <vscale x 32 x i8>, +  ptr, +  <vscale x 32 x i8>, +  iXLen); + +define <vscale x 32 x i8> @intrinsic_vluxei_v_nxv32i8_nxv32i8_nxv32i8(ptr %0, <vscale x 32 x i8> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv32i8_nxv32i8_nxv32i8: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e8, m4, ta, ma +; CHECK-NEXT:    vluxei8.v v8, (a0), v8 +; CHECK-NEXT:    ret +entry: +  %a = call <vscale x 32 x i8> @llvm.riscv.vluxei.nxv32i8.nxv32i8( +    <vscale x 32 x i8> poison, +    ptr %0, +    <vscale x 32 x i8> %1, +    iXLen %2) + +  ret <vscale x 32 x i8> %a +} + +declare <vscale x 32 x i8> @llvm.riscv.vluxei.mask.nxv32i8.nxv32i8( +  <vscale x 32 x i8>, +  ptr, +  <vscale x 32 x i8>, +  <vscale x 32 x i1>, +  iXLen, +  iXLen); + +define <vscale x 32 x i8> @intrinsic_vluxei_mask_v_nxv32i8_nxv32i8_nxv32i8(<vscale x 32 x i8> %0, ptr %1, <vscale x 32 x i8> %2, <vscale x 32 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv32i8_nxv32i8_nxv32i8: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e8, m4, ta, mu +; CHECK-NEXT:    vluxei8.v v8, (a0), v12, v0.t +; CHECK-NEXT:    ret +entry: +  %a = call <vscale x 32 x i8> @llvm.riscv.vluxei.mask.nxv32i8.nxv32i8( +    <vscale x 32 x i8> %0, +    ptr %1, +    <vscale x 32 x i8> %2, +    <vscale x 32 x i1> %3, +    iXLen %4, iXLen 1) + +  ret <vscale x 32 x i8> %a +} + +declare <vscale x 64 x i8> @llvm.riscv.vluxei.nxv64i8.nxv64i8( +  <vscale x 64 x i8>, +  ptr, +  <vscale x 64 x i8>, +  iXLen); + +define <vscale x 64 x i8> @intrinsic_vluxei_v_nxv64i8_nxv64i8_nxv64i8(ptr %0, <vscale x 64 x i8> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv64i8_nxv64i8_nxv64i8: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e8, m8, ta, ma +; CHECK-NEXT:    vluxei8.v v8, (a0), v8 +; CHECK-NEXT:    ret +entry: +  %a = call <vscale x 64 x i8> @llvm.riscv.vluxei.nxv64i8.nxv64i8( +    <vscale x 64 x i8> poison, +    ptr %0, +    <vscale x 64 x i8> %1, +    iXLen %2) + +  ret <vscale x 64 x i8> %a +} + +declare <vscale x 64 x i8> @llvm.riscv.vluxei.mask.nxv64i8.nxv64i8( +  <vscale x 64 x i8>, +  ptr, +  <vscale x 64 x i8>, +  <vscale x 64 x i1>, +  iXLen, +  iXLen); + +define <vscale x 64 x i8> @intrinsic_vluxei_mask_v_nxv64i8_nxv64i8_nxv64i8(<vscale x 64 x i8> %0, ptr %1, <vscale x 64 x i8> %2, <vscale 
x 64 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv64i8_nxv64i8_nxv64i8: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e8, m8, ta, mu +; CHECK-NEXT:    vluxei8.v v8, (a0), v16, v0.t +; CHECK-NEXT:    ret +entry: +  %a = call <vscale x 64 x i8> @llvm.riscv.vluxei.mask.nxv64i8.nxv64i8( +    <vscale x 64 x i8> %0, +    ptr %1, +    <vscale x 64 x i8> %2, +    <vscale x 64 x i1> %3, +    iXLen %4, iXLen 1) + +  ret <vscale x 64 x i8> %a +} + +declare <vscale x 1 x i16> @llvm.riscv.vluxei.nxv1i16.nxv1i8( +  <vscale x 1 x i16>, +  ptr, +  <vscale x 1 x i8>, +  iXLen); + +define <vscale x 1 x i16> @intrinsic_vluxei_v_nxv1i16_nxv1i16_nxv1i8(ptr %0, <vscale x 1 x i8> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv1i16_nxv1i16_nxv1i8: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e16, mf4, ta, ma +; CHECK-NEXT:    vluxei8.v v9, (a0), v8 +; CHECK-NEXT:    vmv1r.v v8, v9 +; CHECK-NEXT:    ret +entry: +  %a = call <vscale x 1 x i16> @llvm.riscv.vluxei.nxv1i16.nxv1i8( +    <vscale x 1 x i16> poison, +    ptr %0, +    <vscale x 1 x i8> %1, +    iXLen %2) + +  ret <vscale x 1 x i16> %a +} + +declare <vscale x 1 x i16> @llvm.riscv.vluxei.mask.nxv1i16.nxv1i8( +  <vscale x 1 x i16>, +  ptr, +  <vscale x 1 x i8>, +  <vscale x 1 x i1>, +  iXLen, +  iXLen); + +define <vscale x 1 x i16> @intrinsic_vluxei_mask_v_nxv1i16_nxv1i16_nxv1i8(<vscale x 1 x i16> %0, ptr %1, <vscale x 1 x i8> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv1i16_nxv1i16_nxv1i8: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e16, mf4, ta, mu +; CHECK-NEXT:    vluxei8.v v8, (a0), v9, v0.t +; CHECK-NEXT:    ret +entry: +  %a = call <vscale x 1 x i16> @llvm.riscv.vluxei.mask.nxv1i16.nxv1i8( +    <vscale x 1 x i16> %0, +    ptr %1, +    <vscale x 1 x i8> %2, +    <vscale x 1 x i1> %3, +    iXLen %4, iXLen 1) + +  ret <vscale x 1 x i16> %a +} + +declare <vscale x 2 x i16> @llvm.riscv.vluxei.nxv2i16.nxv2i8( +  <vscale x 2 x i16>, +  ptr, +  <vscale x 2 x i8>, +  iXLen); + +define <vscale x 2 x i16> @intrinsic_vluxei_v_nxv2i16_nxv2i16_nxv2i8(ptr %0, <vscale x 2 x i8> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv2i16_nxv2i16_nxv2i8: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e16, mf2, ta, ma +; CHECK-NEXT:    vluxei8.v v9, (a0), v8 +; CHECK-NEXT:    vmv1r.v v8, v9 +; CHECK-NEXT:    ret +entry: +  %a = call <vscale x 2 x i16> @llvm.riscv.vluxei.nxv2i16.nxv2i8( +    <vscale x 2 x i16> poison, +    ptr %0, +    <vscale x 2 x i8> %1, +    iXLen %2) + +  ret <vscale x 2 x i16> %a +} + +declare <vscale x 2 x i16> @llvm.riscv.vluxei.mask.nxv2i16.nxv2i8( +  <vscale x 2 x i16>, +  ptr, +  <vscale x 2 x i8>, +  <vscale x 2 x i1>, +  iXLen, +  iXLen); + +define <vscale x 2 x i16> @intrinsic_vluxei_mask_v_nxv2i16_nxv2i16_nxv2i8(<vscale x 2 x i16> %0, ptr %1, <vscale x 2 x i8> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv2i16_nxv2i16_nxv2i8: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e16, mf2, ta, mu +; CHECK-NEXT:    vluxei8.v v8, (a0), v9, v0.t +; CHECK-NEXT:    ret +entry: +  %a = call <vscale x 2 x i16> @llvm.riscv.vluxei.mask.nxv2i16.nxv2i8( +    <vscale x 2 x i16> %0, +    ptr %1, +    <vscale x 2 x i8> %2, +    <vscale x 2 x i1> %3, +    iXLen %4, iXLen 1) + +  ret <vscale x 2 x i16> %a +} + +declare <vscale x 4 x i16> @llvm.riscv.vluxei.nxv4i16.nxv4i8( +  <vscale x 4 x i16>, +  ptr, +  
<vscale x 4 x i8>, +  iXLen); + +define <vscale x 4 x i16> @intrinsic_vluxei_v_nxv4i16_nxv4i16_nxv4i8(ptr %0, <vscale x 4 x i8> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv4i16_nxv4i16_nxv4i8: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e16, m1, ta, ma +; CHECK-NEXT:    vluxei8.v v9, (a0), v8 +; CHECK-NEXT:    vmv.v.v v8, v9 +; CHECK-NEXT:    ret +entry: +  %a = call <vscale x 4 x i16> @llvm.riscv.vluxei.nxv4i16.nxv4i8( +    <vscale x 4 x i16> poison, +    ptr %0, +    <vscale x 4 x i8> %1, +    iXLen %2) + +  ret <vscale x 4 x i16> %a +} + +declare <vscale x 4 x i16> @llvm.riscv.vluxei.mask.nxv4i16.nxv4i8( +  <vscale x 4 x i16>, +  ptr, +  <vscale x 4 x i8>, +  <vscale x 4 x i1>, +  iXLen, +  iXLen); + +define <vscale x 4 x i16> @intrinsic_vluxei_mask_v_nxv4i16_nxv4i16_nxv4i8(<vscale x 4 x i16> %0, ptr %1, <vscale x 4 x i8> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv4i16_nxv4i16_nxv4i8: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e16, m1, ta, mu +; CHECK-NEXT:    vluxei8.v v8, (a0), v9, v0.t +; CHECK-NEXT:    ret +entry: +  %a = call <vscale x 4 x i16> @llvm.riscv.vluxei.mask.nxv4i16.nxv4i8( +    <vscale x 4 x i16> %0, +    ptr %1, +    <vscale x 4 x i8> %2, +    <vscale x 4 x i1> %3, +    iXLen %4, iXLen 1) + +  ret <vscale x 4 x i16> %a +} + +declare <vscale x 8 x i16> @llvm.riscv.vluxei.nxv8i16.nxv8i8( +  <vscale x 8 x i16>, +  ptr, +  <vscale x 8 x i8>, +  iXLen); + +define <vscale x 8 x i16> @intrinsic_vluxei_v_nxv8i16_nxv8i16_nxv8i8(ptr %0, <vscale x 8 x i8> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv8i16_nxv8i16_nxv8i8: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetivli zero, 1, e8, m1, ta, ma +; CHECK-NEXT:    vmv1r.v v10, v8 +; CHECK-NEXT:    vsetvli zero, a1, e16, m2, ta, ma +; CHECK-NEXT:    vluxei8.v v8, (a0), v10 +; CHECK-NEXT:    ret +entry: +  %a = call <vscale x 8 x i16> @llvm.riscv.vluxei.nxv8i16.nxv8i8( +    <vscale x 8 x i16> poison, +    ptr %0, +    <vscale x 8 x i8> %1, +    iXLen %2) + +  ret <vscale x 8 x i16> %a +} + +declare <vscale x 8 x i16> @llvm.riscv.vluxei.mask.nxv8i16.nxv8i8( +  <vscale x 8 x i16>, +  ptr, +  <vscale x 8 x i8>, +  <vscale x 8 x i1>, +  iXLen, +  iXLen); + +define <vscale x 8 x i16> @intrinsic_vluxei_mask_v_nxv8i16_nxv8i16_nxv8i8(<vscale x 8 x i16> %0, ptr %1, <vscale x 8 x i8> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv8i16_nxv8i16_nxv8i8: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e16, m2, ta, mu +; CHECK-NEXT:    vluxei8.v v8, (a0), v10, v0.t +; CHECK-NEXT:    ret +entry: +  %a = call <vscale x 8 x i16> @llvm.riscv.vluxei.mask.nxv8i16.nxv8i8( +    <vscale x 8 x i16> %0, +    ptr %1, +    <vscale x 8 x i8> %2, +    <vscale x 8 x i1> %3, +    iXLen %4, iXLen 1) + +  ret <vscale x 8 x i16> %a +} + +declare <vscale x 16 x i16> @llvm.riscv.vluxei.nxv16i16.nxv16i8( +  <vscale x 16 x i16>, +  ptr, +  <vscale x 16 x i8>, +  iXLen); + +define <vscale x 16 x i16> @intrinsic_vluxei_v_nxv16i16_nxv16i16_nxv16i8(ptr %0, <vscale x 16 x i8> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv16i16_nxv16i16_nxv16i8: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetivli zero, 1, e8, m1, ta, ma +; CHECK-NEXT:    vmv2r.v v12, v8 +; CHECK-NEXT:    vsetvli zero, a1, e16, m4, ta, ma +; CHECK-NEXT:    vluxei8.v v8, (a0), v12 +; CHECK-NEXT:    ret +entry: +  %a = call <vscale x 16 x i16> @llvm.riscv.vluxei.nxv16i16.nxv16i8( +  
  <vscale x 16 x i16> poison, +    ptr %0, +    <vscale x 16 x i8> %1, +    iXLen %2) + +  ret <vscale x 16 x i16> %a +} + +declare <vscale x 16 x i16> @llvm.riscv.vluxei.mask.nxv16i16.nxv16i8( +  <vscale x 16 x i16>, +  ptr, +  <vscale x 16 x i8>, +  <vscale x 16 x i1>, +  iXLen, +  iXLen); + +define <vscale x 16 x i16> @intrinsic_vluxei_mask_v_nxv16i16_nxv16i16_nxv16i8(<vscale x 16 x i16> %0, ptr %1, <vscale x 16 x i8> %2, <vscale x 16 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv16i16_nxv16i16_nxv16i8: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e16, m4, ta, mu +; CHECK-NEXT:    vluxei8.v v8, (a0), v12, v0.t +; CHECK-NEXT:    ret +entry: +  %a = call <vscale x 16 x i16> @llvm.riscv.vluxei.mask.nxv16i16.nxv16i8( +    <vscale x 16 x i16> %0, +    ptr %1, +    <vscale x 16 x i8> %2, +    <vscale x 16 x i1> %3, +    iXLen %4, iXLen 1) + +  ret <vscale x 16 x i16> %a +} + +declare <vscale x 32 x i16> @llvm.riscv.vluxei.nxv32i16.nxv32i8( +  <vscale x 32 x i16>, +  ptr, +  <vscale x 32 x i8>, +  iXLen); + +define <vscale x 32 x i16> @intrinsic_vluxei_v_nxv32i16_nxv32i16_nxv32i8(ptr %0, <vscale x 32 x i8> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv32i16_nxv32i16_nxv32i8: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetivli zero, 1, e8, m1, ta, ma +; CHECK-NEXT:    vmv4r.v v16, v8 +; CHECK-NEXT:    vsetvli zero, a1, e16, m8, ta, ma +; CHECK-NEXT:    vluxei8.v v8, (a0), v16 +; CHECK-NEXT:    ret +entry: +  %a = call <vscale x 32 x i16> @llvm.riscv.vluxei.nxv32i16.nxv32i8( +    <vscale x 32 x i16> poison, +    ptr %0, +    <vscale x 32 x i8> %1, +    iXLen %2) + +  ret <vscale x 32 x i16> %a +} + +declare <vscale x 32 x i16> @llvm.riscv.vluxei.mask.nxv32i16.nxv32i8( +  <vscale x 32 x i16>, +  ptr, +  <vscale x 32 x i8>, +  <vscale x 32 x i1>, +  iXLen, +  iXLen); + +define <vscale x 32 x i16> @intrinsic_vluxei_mask_v_nxv32i16_nxv32i16_nxv32i8(<vscale x 32 x i16> %0, ptr %1, <vscale x 32 x i8> %2, <vscale x 32 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv32i16_nxv32i16_nxv32i8: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e16, m8, ta, mu +; CHECK-NEXT:    vluxei8.v v8, (a0), v16, v0.t +; CHECK-NEXT:    ret +entry: +  %a = call <vscale x 32 x i16> @llvm.riscv.vluxei.mask.nxv32i16.nxv32i8( +    <vscale x 32 x i16> %0, +    ptr %1, +    <vscale x 32 x i8> %2, +    <vscale x 32 x i1> %3, +    iXLen %4, iXLen 1) + +  ret <vscale x 32 x i16> %a +} + +declare <vscale x 1 x i32> @llvm.riscv.vluxei.nxv1i32.nxv1i8( +  <vscale x 1 x i32>, +  ptr, +  <vscale x 1 x i8>, +  iXLen); + +define <vscale x 1 x i32> @intrinsic_vluxei_v_nxv1i32_nxv1i32_nxv1i8(ptr %0, <vscale x 1 x i8> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv1i32_nxv1i32_nxv1i8: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e32, mf2, ta, ma +; CHECK-NEXT:    vluxei8.v v9, (a0), v8 +; CHECK-NEXT:    vmv1r.v v8, v9 +; CHECK-NEXT:    ret +entry: +  %a = call <vscale x 1 x i32> @llvm.riscv.vluxei.nxv1i32.nxv1i8( +    <vscale x 1 x i32> poison, +    ptr %0, +    <vscale x 1 x i8> %1, +    iXLen %2) + +  ret <vscale x 1 x i32> %a +} + +declare <vscale x 1 x i32> @llvm.riscv.vluxei.mask.nxv1i32.nxv1i8( +  <vscale x 1 x i32>, +  ptr, +  <vscale x 1 x i8>, +  <vscale x 1 x i1>, +  iXLen, +  iXLen); + +define <vscale x 1 x i32> @intrinsic_vluxei_mask_v_nxv1i32_nxv1i32_nxv1i8(<vscale x 1 x i32> %0, ptr %1, <vscale x 1 x i8> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind { +; 
CHECK-LABEL: intrinsic_vluxei_mask_v_nxv1i32_nxv1i32_nxv1i8: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e32, mf2, ta, mu +; CHECK-NEXT:    vluxei8.v v8, (a0), v9, v0.t +; CHECK-NEXT:    ret +entry: +  %a = call <vscale x 1 x i32> @llvm.riscv.vluxei.mask.nxv1i32.nxv1i8( +    <vscale x 1 x i32> %0, +    ptr %1, +    <vscale x 1 x i8> %2, +    <vscale x 1 x i1> %3, +    iXLen %4, iXLen 1) + +  ret <vscale x 1 x i32> %a +} + +declare <vscale x 2 x i32> @llvm.riscv.vluxei.nxv2i32.nxv2i8( +  <vscale x 2 x i32>, +  ptr, +  <vscale x 2 x i8>, +  iXLen); + +define <vscale x 2 x i32> @intrinsic_vluxei_v_nxv2i32_nxv2i32_nxv2i8(ptr %0, <vscale x 2 x i8> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv2i32_nxv2i32_nxv2i8: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e32, m1, ta, ma +; CHECK-NEXT:    vluxei8.v v9, (a0), v8 +; CHECK-NEXT:    vmv.v.v v8, v9 +; CHECK-NEXT:    ret +entry: +  %a = call <vscale x 2 x i32> @llvm.riscv.vluxei.nxv2i32.nxv2i8( +    <vscale x 2 x i32> poison, +    ptr %0, +    <vscale x 2 x i8> %1, +    iXLen %2) + +  ret <vscale x 2 x i32> %a +} + +declare <vscale x 2 x i32> @llvm.riscv.vluxei.mask.nxv2i32.nxv2i8( +  <vscale x 2 x i32>, +  ptr, +  <vscale x 2 x i8>, +  <vscale x 2 x i1>, +  iXLen, +  iXLen); + +define <vscale x 2 x i32> @intrinsic_vluxei_mask_v_nxv2i32_nxv2i32_nxv2i8(<vscale x 2 x i32> %0, ptr %1, <vscale x 2 x i8> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv2i32_nxv2i32_nxv2i8: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e32, m1, ta, mu +; CHECK-NEXT:    vluxei8.v v8, (a0), v9, v0.t +; CHECK-NEXT:    ret +entry: +  %a = call <vscale x 2 x i32> @llvm.riscv.vluxei.mask.nxv2i32.nxv2i8( +    <vscale x 2 x i32> %0, +    ptr %1, +    <vscale x 2 x i8> %2, +    <vscale x 2 x i1> %3, +    iXLen %4, iXLen 1) + +  ret <vscale x 2 x i32> %a +} + +declare <vscale x 4 x i32> @llvm.riscv.vluxei.nxv4i32.nxv4i8( +  <vscale x 4 x i32>, +  ptr, +  <vscale x 4 x i8>, +  iXLen); + +define <vscale x 4 x i32> @intrinsic_vluxei_v_nxv4i32_nxv4i32_nxv4i8(ptr %0, <vscale x 4 x i8> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv4i32_nxv4i32_nxv4i8: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetivli zero, 1, e8, m1, ta, ma +; CHECK-NEXT:    vmv1r.v v10, v8 +; CHECK-NEXT:    vsetvli zero, a1, e32, m2, ta, ma +; CHECK-NEXT:    vluxei8.v v8, (a0), v10 +; CHECK-NEXT:    ret +entry: +  %a = call <vscale x 4 x i32> @llvm.riscv.vluxei.nxv4i32.nxv4i8( +    <vscale x 4 x i32> poison, +    ptr %0, +    <vscale x 4 x i8> %1, +    iXLen %2) + +  ret <vscale x 4 x i32> %a +} + +declare <vscale x 4 x i32> @llvm.riscv.vluxei.mask.nxv4i32.nxv4i8( +  <vscale x 4 x i32>, +  ptr, +  <vscale x 4 x i8>, +  <vscale x 4 x i1>, +  iXLen, +  iXLen); + +define <vscale x 4 x i32> @intrinsic_vluxei_mask_v_nxv4i32_nxv4i32_nxv4i8(<vscale x 4 x i32> %0, ptr %1, <vscale x 4 x i8> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv4i32_nxv4i32_nxv4i8: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e32, m2, ta, mu +; CHECK-NEXT:    vluxei8.v v8, (a0), v10, v0.t +; CHECK-NEXT:    ret +entry: +  %a = call <vscale x 4 x i32> @llvm.riscv.vluxei.mask.nxv4i32.nxv4i8( +    <vscale x 4 x i32> %0, +    ptr %1, +    <vscale x 4 x i8> %2, +    <vscale x 4 x i1> %3, +    iXLen %4, iXLen 1) + +  ret <vscale x 4 x i32> %a +} + +declare <vscale x 8 x i32> @llvm.riscv.vluxei.nxv8i32.nxv8i8( +  <vscale x 8 x i32>, +  
ptr, +  <vscale x 8 x i8>, +  iXLen); + +define <vscale x 8 x i32> @intrinsic_vluxei_v_nxv8i32_nxv8i32_nxv8i8(ptr %0, <vscale x 8 x i8> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv8i32_nxv8i32_nxv8i8: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetivli zero, 1, e8, m1, ta, ma +; CHECK-NEXT:    vmv1r.v v12, v8 +; CHECK-NEXT:    vsetvli zero, a1, e32, m4, ta, ma +; CHECK-NEXT:    vluxei8.v v8, (a0), v12 +; CHECK-NEXT:    ret +entry: +  %a = call <vscale x 8 x i32> @llvm.riscv.vluxei.nxv8i32.nxv8i8( +    <vscale x 8 x i32> poison, +    ptr %0, +    <vscale x 8 x i8> %1, +    iXLen %2) + +  ret <vscale x 8 x i32> %a +} + +declare <vscale x 8 x i32> @llvm.riscv.vluxei.mask.nxv8i32.nxv8i8( +  <vscale x 8 x i32>, +  ptr, +  <vscale x 8 x i8>, +  <vscale x 8 x i1>, +  iXLen, +  iXLen); + +define <vscale x 8 x i32> @intrinsic_vluxei_mask_v_nxv8i32_nxv8i32_nxv8i8(<vscale x 8 x i32> %0, ptr %1, <vscale x 8 x i8> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv8i32_nxv8i32_nxv8i8: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e32, m4, ta, mu +; CHECK-NEXT:    vluxei8.v v8, (a0), v12, v0.t +; CHECK-NEXT:    ret +entry: +  %a = call <vscale x 8 x i32> @llvm.riscv.vluxei.mask.nxv8i32.nxv8i8( +    <vscale x 8 x i32> %0, +    ptr %1, +    <vscale x 8 x i8> %2, +    <vscale x 8 x i1> %3, +    iXLen %4, iXLen 1) + +  ret <vscale x 8 x i32> %a +} + +declare <vscale x 16 x i32> @llvm.riscv.vluxei.nxv16i32.nxv16i8( +  <vscale x 16 x i32>, +  ptr, +  <vscale x 16 x i8>, +  iXLen); + +define <vscale x 16 x i32> @intrinsic_vluxei_v_nxv16i32_nxv16i32_nxv16i8(ptr %0, <vscale x 16 x i8> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv16i32_nxv16i32_nxv16i8: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetivli zero, 1, e8, m1, ta, ma +; CHECK-NEXT:    vmv2r.v v16, v8 +; CHECK-NEXT:    vsetvli zero, a1, e32, m8, ta, ma +; CHECK-NEXT:    vluxei8.v v8, (a0), v16 +; CHECK-NEXT:    ret +entry: +  %a = call <vscale x 16 x i32> @llvm.riscv.vluxei.nxv16i32.nxv16i8( +    <vscale x 16 x i32> poison, +    ptr %0, +    <vscale x 16 x i8> %1, +    iXLen %2) + +  ret <vscale x 16 x i32> %a +} + +declare <vscale x 16 x i32> @llvm.riscv.vluxei.mask.nxv16i32.nxv16i8( +  <vscale x 16 x i32>, +  ptr, +  <vscale x 16 x i8>, +  <vscale x 16 x i1>, +  iXLen, +  iXLen); + +define <vscale x 16 x i32> @intrinsic_vluxei_mask_v_nxv16i32_nxv16i32_nxv16i8(<vscale x 16 x i32> %0, ptr %1, <vscale x 16 x i8> %2, <vscale x 16 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv16i32_nxv16i32_nxv16i8: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e32, m8, ta, mu +; CHECK-NEXT:    vluxei8.v v8, (a0), v16, v0.t +; CHECK-NEXT:    ret +entry: +  %a = call <vscale x 16 x i32> @llvm.riscv.vluxei.mask.nxv16i32.nxv16i8( +    <vscale x 16 x i32> %0, +    ptr %1, +    <vscale x 16 x i8> %2, +    <vscale x 16 x i1> %3, +    iXLen %4, iXLen 1) + +  ret <vscale x 16 x i32> %a +} + +declare <vscale x 1 x i64> @llvm.riscv.vluxei.nxv1i64.nxv1i8( +  <vscale x 1 x i64>, +  ptr, +  <vscale x 1 x i8>, +  iXLen); + +define <vscale x 1 x i64> @intrinsic_vluxei_v_nxv1i64_nxv1i64_nxv1i8(ptr %0, <vscale x 1 x i8> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv1i64_nxv1i64_nxv1i8: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e64, m1, ta, ma +; CHECK-NEXT:    vluxei8.v v9, (a0), v8 +; CHECK-NEXT:    vmv.v.v v8, v9 +; CHECK-NEXT:    ret +entry: +  %a = call <vscale x 1 x i64> 
@llvm.riscv.vluxei.nxv1i64.nxv1i8( +    <vscale x 1 x i64> poison, +    ptr %0, +    <vscale x 1 x i8> %1, +    iXLen %2) + +  ret <vscale x 1 x i64> %a +} + +declare <vscale x 1 x i64> @llvm.riscv.vluxei.mask.nxv1i64.nxv1i8( +  <vscale x 1 x i64>, +  ptr, +  <vscale x 1 x i8>, +  <vscale x 1 x i1>, +  iXLen, +  iXLen); + +define <vscale x 1 x i64> @intrinsic_vluxei_mask_v_nxv1i64_nxv1i64_nxv1i8(<vscale x 1 x i64> %0, ptr %1, <vscale x 1 x i8> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv1i64_nxv1i64_nxv1i8: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e64, m1, ta, mu +; CHECK-NEXT:    vluxei8.v v8, (a0), v9, v0.t +; CHECK-NEXT:    ret +entry: +  %a = call <vscale x 1 x i64> @llvm.riscv.vluxei.mask.nxv1i64.nxv1i8( +    <vscale x 1 x i64> %0, +    ptr %1, +    <vscale x 1 x i8> %2, +    <vscale x 1 x i1> %3, +    iXLen %4, iXLen 1) + +  ret <vscale x 1 x i64> %a +} + +declare <vscale x 2 x i64> @llvm.riscv.vluxei.nxv2i64.nxv2i8( +  <vscale x 2 x i64>, +  ptr, +  <vscale x 2 x i8>, +  iXLen); + +define <vscale x 2 x i64> @intrinsic_vluxei_v_nxv2i64_nxv2i64_nxv2i8(ptr %0, <vscale x 2 x i8> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv2i64_nxv2i64_nxv2i8: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetivli zero, 1, e8, m1, ta, ma +; CHECK-NEXT:    vmv1r.v v10, v8 +; CHECK-NEXT:    vsetvli zero, a1, e64, m2, ta, ma +; CHECK-NEXT:    vluxei8.v v8, (a0), v10 +; CHECK-NEXT:    ret +entry: +  %a = call <vscale x 2 x i64> @llvm.riscv.vluxei.nxv2i64.nxv2i8( +    <vscale x 2 x i64> poison, +    ptr %0, +    <vscale x 2 x i8> %1, +    iXLen %2) + +  ret <vscale x 2 x i64> %a +} + +declare <vscale x 2 x i64> @llvm.riscv.vluxei.mask.nxv2i64.nxv2i8( +  <vscale x 2 x i64>, +  ptr, +  <vscale x 2 x i8>, +  <vscale x 2 x i1>, +  iXLen, +  iXLen); + +define <vscale x 2 x i64> @intrinsic_vluxei_mask_v_nxv2i64_nxv2i64_nxv2i8(<vscale x 2 x i64> %0, ptr %1, <vscale x 2 x i8> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv2i64_nxv2i64_nxv2i8: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e64, m2, ta, mu +; CHECK-NEXT:    vluxei8.v v8, (a0), v10, v0.t +; CHECK-NEXT:    ret +entry: +  %a = call <vscale x 2 x i64> @llvm.riscv.vluxei.mask.nxv2i64.nxv2i8( +    <vscale x 2 x i64> %0, +    ptr %1, +    <vscale x 2 x i8> %2, +    <vscale x 2 x i1> %3, +    iXLen %4, iXLen 1) + +  ret <vscale x 2 x i64> %a +} + +declare <vscale x 4 x i64> @llvm.riscv.vluxei.nxv4i64.nxv4i8( +  <vscale x 4 x i64>, +  ptr, +  <vscale x 4 x i8>, +  iXLen); + +define <vscale x 4 x i64> @intrinsic_vluxei_v_nxv4i64_nxv4i64_nxv4i8(ptr %0, <vscale x 4 x i8> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv4i64_nxv4i64_nxv4i8: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetivli zero, 1, e8, m1, ta, ma +; CHECK-NEXT:    vmv1r.v v12, v8 +; CHECK-NEXT:    vsetvli zero, a1, e64, m4, ta, ma +; CHECK-NEXT:    vluxei8.v v8, (a0), v12 +; CHECK-NEXT:    ret +entry: +  %a = call <vscale x 4 x i64> @llvm.riscv.vluxei.nxv4i64.nxv4i8( +    <vscale x 4 x i64> poison, +    ptr %0, +    <vscale x 4 x i8> %1, +    iXLen %2) + +  ret <vscale x 4 x i64> %a +} + +declare <vscale x 4 x i64> @llvm.riscv.vluxei.mask.nxv4i64.nxv4i8( +  <vscale x 4 x i64>, +  ptr, +  <vscale x 4 x i8>, +  <vscale x 4 x i1>, +  iXLen, +  iXLen); + +define <vscale x 4 x i64> @intrinsic_vluxei_mask_v_nxv4i64_nxv4i64_nxv4i8(<vscale x 4 x i64> %0, ptr %1, <vscale x 4 x i8> %2, <vscale x 4 x i1> %3, iXLen %4) 
nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv4i64_nxv4i64_nxv4i8: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e64, m4, ta, mu +; CHECK-NEXT:    vluxei8.v v8, (a0), v12, v0.t +; CHECK-NEXT:    ret +entry: +  %a = call <vscale x 4 x i64> @llvm.riscv.vluxei.mask.nxv4i64.nxv4i8( +    <vscale x 4 x i64> %0, +    ptr %1, +    <vscale x 4 x i8> %2, +    <vscale x 4 x i1> %3, +    iXLen %4, iXLen 1) + +  ret <vscale x 4 x i64> %a +} + +declare <vscale x 8 x i64> @llvm.riscv.vluxei.nxv8i64.nxv8i8( +  <vscale x 8 x i64>, +  ptr, +  <vscale x 8 x i8>, +  iXLen); + +define <vscale x 8 x i64> @intrinsic_vluxei_v_nxv8i64_nxv8i64_nxv8i8(ptr %0, <vscale x 8 x i8> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv8i64_nxv8i64_nxv8i8: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetivli zero, 1, e8, m1, ta, ma +; CHECK-NEXT:    vmv1r.v v16, v8 +; CHECK-NEXT:    vsetvli zero, a1, e64, m8, ta, ma +; CHECK-NEXT:    vluxei8.v v8, (a0), v16 +; CHECK-NEXT:    ret +entry: +  %a = call <vscale x 8 x i64> @llvm.riscv.vluxei.nxv8i64.nxv8i8( +    <vscale x 8 x i64> poison, +    ptr %0, +    <vscale x 8 x i8> %1, +    iXLen %2) + +  ret <vscale x 8 x i64> %a +} + +declare <vscale x 8 x i64> @llvm.riscv.vluxei.mask.nxv8i64.nxv8i8( +  <vscale x 8 x i64>, +  ptr, +  <vscale x 8 x i8>, +  <vscale x 8 x i1>, +  iXLen, +  iXLen); + +define <vscale x 8 x i64> @intrinsic_vluxei_mask_v_nxv8i64_nxv8i64_nxv8i8(<vscale x 8 x i64> %0, ptr %1, <vscale x 8 x i8> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv8i64_nxv8i64_nxv8i8: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e64, m8, ta, mu +; CHECK-NEXT:    vluxei8.v v8, (a0), v16, v0.t +; CHECK-NEXT:    ret +entry: +  %a = call <vscale x 8 x i64> @llvm.riscv.vluxei.mask.nxv8i64.nxv8i8( +    <vscale x 8 x i64> %0, +    ptr %1, +    <vscale x 8 x i8> %2, +    <vscale x 8 x i1> %3, +    iXLen %4, iXLen 1) + +  ret <vscale x 8 x i64> %a +} + +declare <vscale x 1 x half> @llvm.riscv.vluxei.nxv1f16.nxv1i8( +  <vscale x 1 x half>, +  ptr, +  <vscale x 1 x i8>, +  iXLen); + +define <vscale x 1 x half> @intrinsic_vluxei_v_nxv1f16_nxv1f16_nxv1i8(ptr %0, <vscale x 1 x i8> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv1f16_nxv1f16_nxv1i8: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e16, mf4, ta, ma +; CHECK-NEXT:    vluxei8.v v9, (a0), v8 +; CHECK-NEXT:    vmv1r.v v8, v9 +; CHECK-NEXT:    ret +entry: +  %a = call <vscale x 1 x half> @llvm.riscv.vluxei.nxv1f16.nxv1i8( +    <vscale x 1 x half> poison, +    ptr %0, +    <vscale x 1 x i8> %1, +    iXLen %2) + +  ret <vscale x 1 x half> %a +} + +declare <vscale x 1 x half> @llvm.riscv.vluxei.mask.nxv1f16.nxv1i8( +  <vscale x 1 x half>, +  ptr, +  <vscale x 1 x i8>, +  <vscale x 1 x i1>, +  iXLen, +  iXLen); + +define <vscale x 1 x half> @intrinsic_vluxei_mask_v_nxv1f16_nxv1f16_nxv1i8(<vscale x 1 x half> %0, ptr %1, <vscale x 1 x i8> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv1f16_nxv1f16_nxv1i8: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e16, mf4, ta, mu +; CHECK-NEXT:    vluxei8.v v8, (a0), v9, v0.t +; CHECK-NEXT:    ret +entry: +  %a = call <vscale x 1 x half> @llvm.riscv.vluxei.mask.nxv1f16.nxv1i8( +    <vscale x 1 x half> %0, +    ptr %1, +    <vscale x 1 x i8> %2, +    <vscale x 1 x i1> %3, +    iXLen %4, iXLen 1) + +  ret <vscale x 1 x half> %a +} + +declare <vscale x 2 x half> 
@llvm.riscv.vluxei.nxv2f16.nxv2i8( +  <vscale x 2 x half>, +  ptr, +  <vscale x 2 x i8>, +  iXLen); + +define <vscale x 2 x half> @intrinsic_vluxei_v_nxv2f16_nxv2f16_nxv2i8(ptr %0, <vscale x 2 x i8> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv2f16_nxv2f16_nxv2i8: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e16, mf2, ta, ma +; CHECK-NEXT:    vluxei8.v v9, (a0), v8 +; CHECK-NEXT:    vmv1r.v v8, v9 +; CHECK-NEXT:    ret +entry: +  %a = call <vscale x 2 x half> @llvm.riscv.vluxei.nxv2f16.nxv2i8( +    <vscale x 2 x half> poison, +    ptr %0, +    <vscale x 2 x i8> %1, +    iXLen %2) + +  ret <vscale x 2 x half> %a +} + +declare <vscale x 2 x half> @llvm.riscv.vluxei.mask.nxv2f16.nxv2i8( +  <vscale x 2 x half>, +  ptr, +  <vscale x 2 x i8>, +  <vscale x 2 x i1>, +  iXLen, +  iXLen); + +define <vscale x 2 x half> @intrinsic_vluxei_mask_v_nxv2f16_nxv2f16_nxv2i8(<vscale x 2 x half> %0, ptr %1, <vscale x 2 x i8> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv2f16_nxv2f16_nxv2i8: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e16, mf2, ta, mu +; CHECK-NEXT:    vluxei8.v v8, (a0), v9, v0.t +; CHECK-NEXT:    ret +entry: +  %a = call <vscale x 2 x half> @llvm.riscv.vluxei.mask.nxv2f16.nxv2i8( +    <vscale x 2 x half> %0, +    ptr %1, +    <vscale x 2 x i8> %2, +    <vscale x 2 x i1> %3, +    iXLen %4, iXLen 1) + +  ret <vscale x 2 x half> %a +} + +declare <vscale x 4 x half> @llvm.riscv.vluxei.nxv4f16.nxv4i8( +  <vscale x 4 x half>, +  ptr, +  <vscale x 4 x i8>, +  iXLen); + +define <vscale x 4 x half> @intrinsic_vluxei_v_nxv4f16_nxv4f16_nxv4i8(ptr %0, <vscale x 4 x i8> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv4f16_nxv4f16_nxv4i8: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e16, m1, ta, ma +; CHECK-NEXT:    vluxei8.v v9, (a0), v8 +; CHECK-NEXT:    vmv.v.v v8, v9 +; CHECK-NEXT:    ret +entry: +  %a = call <vscale x 4 x half> @llvm.riscv.vluxei.nxv4f16.nxv4i8( +    <vscale x 4 x half> poison, +    ptr %0, +    <vscale x 4 x i8> %1, +    iXLen %2) + +  ret <vscale x 4 x half> %a +} + +declare <vscale x 4 x half> @llvm.riscv.vluxei.mask.nxv4f16.nxv4i8( +  <vscale x 4 x half>, +  ptr, +  <vscale x 4 x i8>, +  <vscale x 4 x i1>, +  iXLen, +  iXLen); + +define <vscale x 4 x half> @intrinsic_vluxei_mask_v_nxv4f16_nxv4f16_nxv4i8(<vscale x 4 x half> %0, ptr %1, <vscale x 4 x i8> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv4f16_nxv4f16_nxv4i8: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e16, m1, ta, mu +; CHECK-NEXT:    vluxei8.v v8, (a0), v9, v0.t +; CHECK-NEXT:    ret +entry: +  %a = call <vscale x 4 x half> @llvm.riscv.vluxei.mask.nxv4f16.nxv4i8( +    <vscale x 4 x half> %0, +    ptr %1, +    <vscale x 4 x i8> %2, +    <vscale x 4 x i1> %3, +    iXLen %4, iXLen 1) + +  ret <vscale x 4 x half> %a +} + +declare <vscale x 8 x half> @llvm.riscv.vluxei.nxv8f16.nxv8i8( +  <vscale x 8 x half>, +  ptr, +  <vscale x 8 x i8>, +  iXLen); + +define <vscale x 8 x half> @intrinsic_vluxei_v_nxv8f16_nxv8f16_nxv8i8(ptr %0, <vscale x 8 x i8> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv8f16_nxv8f16_nxv8i8: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetivli zero, 1, e8, m1, ta, ma +; CHECK-NEXT:    vmv1r.v v10, v8 +; CHECK-NEXT:    vsetvli zero, a1, e16, m2, ta, ma +; CHECK-NEXT:    vluxei8.v v8, (a0), v10 +; CHECK-NEXT:    ret +entry: +  %a = call <vscale x 8 x half> 
@llvm.riscv.vluxei.nxv8f16.nxv8i8( +    <vscale x 8 x half> poison, +    ptr %0, +    <vscale x 8 x i8> %1, +    iXLen %2) + +  ret <vscale x 8 x half> %a +} + +declare <vscale x 8 x half> @llvm.riscv.vluxei.mask.nxv8f16.nxv8i8( +  <vscale x 8 x half>, +  ptr, +  <vscale x 8 x i8>, +  <vscale x 8 x i1>, +  iXLen, +  iXLen); + +define <vscale x 8 x half> @intrinsic_vluxei_mask_v_nxv8f16_nxv8f16_nxv8i8(<vscale x 8 x half> %0, ptr %1, <vscale x 8 x i8> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv8f16_nxv8f16_nxv8i8: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e16, m2, ta, mu +; CHECK-NEXT:    vluxei8.v v8, (a0), v10, v0.t +; CHECK-NEXT:    ret +entry: +  %a = call <vscale x 8 x half> @llvm.riscv.vluxei.mask.nxv8f16.nxv8i8( +    <vscale x 8 x half> %0, +    ptr %1, +    <vscale x 8 x i8> %2, +    <vscale x 8 x i1> %3, +    iXLen %4, iXLen 1) + +  ret <vscale x 8 x half> %a +} + +declare <vscale x 16 x half> @llvm.riscv.vluxei.nxv16f16.nxv16i8( +  <vscale x 16 x half>, +  ptr, +  <vscale x 16 x i8>, +  iXLen); + +define <vscale x 16 x half> @intrinsic_vluxei_v_nxv16f16_nxv16f16_nxv16i8(ptr %0, <vscale x 16 x i8> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv16f16_nxv16f16_nxv16i8: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetivli zero, 1, e8, m1, ta, ma +; CHECK-NEXT:    vmv2r.v v12, v8 +; CHECK-NEXT:    vsetvli zero, a1, e16, m4, ta, ma +; CHECK-NEXT:    vluxei8.v v8, (a0), v12 +; CHECK-NEXT:    ret +entry: +  %a = call <vscale x 16 x half> @llvm.riscv.vluxei.nxv16f16.nxv16i8( +    <vscale x 16 x half> poison, +    ptr %0, +    <vscale x 16 x i8> %1, +    iXLen %2) + +  ret <vscale x 16 x half> %a +} + +declare <vscale x 16 x half> @llvm.riscv.vluxei.mask.nxv16f16.nxv16i8( +  <vscale x 16 x half>, +  ptr, +  <vscale x 16 x i8>, +  <vscale x 16 x i1>, +  iXLen, +  iXLen); + +define <vscale x 16 x half> @intrinsic_vluxei_mask_v_nxv16f16_nxv16f16_nxv16i8(<vscale x 16 x half> %0, ptr %1, <vscale x 16 x i8> %2, <vscale x 16 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv16f16_nxv16f16_nxv16i8: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e16, m4, ta, mu +; CHECK-NEXT:    vluxei8.v v8, (a0), v12, v0.t +; CHECK-NEXT:    ret +entry: +  %a = call <vscale x 16 x half> @llvm.riscv.vluxei.mask.nxv16f16.nxv16i8( +    <vscale x 16 x half> %0, +    ptr %1, +    <vscale x 16 x i8> %2, +    <vscale x 16 x i1> %3, +    iXLen %4, iXLen 1) + +  ret <vscale x 16 x half> %a +} + +declare <vscale x 32 x half> @llvm.riscv.vluxei.nxv32f16.nxv32i8( +  <vscale x 32 x half>, +  ptr, +  <vscale x 32 x i8>, +  iXLen); + +define <vscale x 32 x half> @intrinsic_vluxei_v_nxv32f16_nxv32f16_nxv32i8(ptr %0, <vscale x 32 x i8> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv32f16_nxv32f16_nxv32i8: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetivli zero, 1, e8, m1, ta, ma +; CHECK-NEXT:    vmv4r.v v16, v8 +; CHECK-NEXT:    vsetvli zero, a1, e16, m8, ta, ma +; CHECK-NEXT:    vluxei8.v v8, (a0), v16 +; CHECK-NEXT:    ret +entry: +  %a = call <vscale x 32 x half> @llvm.riscv.vluxei.nxv32f16.nxv32i8( +    <vscale x 32 x half> poison, +    ptr %0, +    <vscale x 32 x i8> %1, +    iXLen %2) + +  ret <vscale x 32 x half> %a +} + +declare <vscale x 32 x half> @llvm.riscv.vluxei.mask.nxv32f16.nxv32i8( +  <vscale x 32 x half>, +  ptr, +  <vscale x 32 x i8>, +  <vscale x 32 x i1>, +  iXLen, +  iXLen); + +define <vscale x 32 x half> 
@intrinsic_vluxei_mask_v_nxv32f16_nxv32f16_nxv32i8(<vscale x 32 x half> %0, ptr %1, <vscale x 32 x i8> %2, <vscale x 32 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv32f16_nxv32f16_nxv32i8: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e16, m8, ta, mu +; CHECK-NEXT:    vluxei8.v v8, (a0), v16, v0.t +; CHECK-NEXT:    ret +entry: +  %a = call <vscale x 32 x half> @llvm.riscv.vluxei.mask.nxv32f16.nxv32i8( +    <vscale x 32 x half> %0, +    ptr %1, +    <vscale x 32 x i8> %2, +    <vscale x 32 x i1> %3, +    iXLen %4, iXLen 1) + +  ret <vscale x 32 x half> %a +} + +declare <vscale x 1 x float> @llvm.riscv.vluxei.nxv1f32.nxv1i8( +  <vscale x 1 x float>, +  ptr, +  <vscale x 1 x i8>, +  iXLen); + +define <vscale x 1 x float> @intrinsic_vluxei_v_nxv1f32_nxv1f32_nxv1i8(ptr %0, <vscale x 1 x i8> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv1f32_nxv1f32_nxv1i8: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e32, mf2, ta, ma +; CHECK-NEXT:    vluxei8.v v9, (a0), v8 +; CHECK-NEXT:    vmv1r.v v8, v9 +; CHECK-NEXT:    ret +entry: +  %a = call <vscale x 1 x float> @llvm.riscv.vluxei.nxv1f32.nxv1i8( +    <vscale x 1 x float> poison, +    ptr %0, +    <vscale x 1 x i8> %1, +    iXLen %2) + +  ret <vscale x 1 x float> %a +} + +declare <vscale x 1 x float> @llvm.riscv.vluxei.mask.nxv1f32.nxv1i8( +  <vscale x 1 x float>, +  ptr, +  <vscale x 1 x i8>, +  <vscale x 1 x i1>, +  iXLen, +  iXLen); + +define <vscale x 1 x float> @intrinsic_vluxei_mask_v_nxv1f32_nxv1f32_nxv1i8(<vscale x 1 x float> %0, ptr %1, <vscale x 1 x i8> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv1f32_nxv1f32_nxv1i8: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e32, mf2, ta, mu +; CHECK-NEXT:    vluxei8.v v8, (a0), v9, v0.t +; CHECK-NEXT:    ret +entry: +  %a = call <vscale x 1 x float> @llvm.riscv.vluxei.mask.nxv1f32.nxv1i8( +    <vscale x 1 x float> %0, +    ptr %1, +    <vscale x 1 x i8> %2, +    <vscale x 1 x i1> %3, +    iXLen %4, iXLen 1) + +  ret <vscale x 1 x float> %a +} + +declare <vscale x 2 x float> @llvm.riscv.vluxei.nxv2f32.nxv2i8( +  <vscale x 2 x float>, +  ptr, +  <vscale x 2 x i8>, +  iXLen); + +define <vscale x 2 x float> @intrinsic_vluxei_v_nxv2f32_nxv2f32_nxv2i8(ptr %0, <vscale x 2 x i8> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv2f32_nxv2f32_nxv2i8: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e32, m1, ta, ma +; CHECK-NEXT:    vluxei8.v v9, (a0), v8 +; CHECK-NEXT:    vmv.v.v v8, v9 +; CHECK-NEXT:    ret +entry: +  %a = call <vscale x 2 x float> @llvm.riscv.vluxei.nxv2f32.nxv2i8( +    <vscale x 2 x float> poison, +    ptr %0, +    <vscale x 2 x i8> %1, +    iXLen %2) + +  ret <vscale x 2 x float> %a +} + +declare <vscale x 2 x float> @llvm.riscv.vluxei.mask.nxv2f32.nxv2i8( +  <vscale x 2 x float>, +  ptr, +  <vscale x 2 x i8>, +  <vscale x 2 x i1>, +  iXLen, +  iXLen); + +define <vscale x 2 x float> @intrinsic_vluxei_mask_v_nxv2f32_nxv2f32_nxv2i8(<vscale x 2 x float> %0, ptr %1, <vscale x 2 x i8> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv2f32_nxv2f32_nxv2i8: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e32, m1, ta, mu +; CHECK-NEXT:    vluxei8.v v8, (a0), v9, v0.t +; CHECK-NEXT:    ret +entry: +  %a = call <vscale x 2 x float> @llvm.riscv.vluxei.mask.nxv2f32.nxv2i8( +    <vscale x 2 x float> %0, +    ptr %1, +    <vscale x 2 x i8> %2, +    
<vscale x 2 x i1> %3, +    iXLen %4, iXLen 1) + +  ret <vscale x 2 x float> %a +} + +declare <vscale x 4 x float> @llvm.riscv.vluxei.nxv4f32.nxv4i8( +  <vscale x 4 x float>, +  ptr, +  <vscale x 4 x i8>, +  iXLen); + +define <vscale x 4 x float> @intrinsic_vluxei_v_nxv4f32_nxv4f32_nxv4i8(ptr %0, <vscale x 4 x i8> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv4f32_nxv4f32_nxv4i8: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetivli zero, 1, e8, m1, ta, ma +; CHECK-NEXT:    vmv1r.v v10, v8 +; CHECK-NEXT:    vsetvli zero, a1, e32, m2, ta, ma +; CHECK-NEXT:    vluxei8.v v8, (a0), v10 +; CHECK-NEXT:    ret +entry: +  %a = call <vscale x 4 x float> @llvm.riscv.vluxei.nxv4f32.nxv4i8( +    <vscale x 4 x float> poison, +    ptr %0, +    <vscale x 4 x i8> %1, +    iXLen %2) + +  ret <vscale x 4 x float> %a +} + +declare <vscale x 4 x float> @llvm.riscv.vluxei.mask.nxv4f32.nxv4i8( +  <vscale x 4 x float>, +  ptr, +  <vscale x 4 x i8>, +  <vscale x 4 x i1>, +  iXLen, +  iXLen); + +define <vscale x 4 x float> @intrinsic_vluxei_mask_v_nxv4f32_nxv4f32_nxv4i8(<vscale x 4 x float> %0, ptr %1, <vscale x 4 x i8> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv4f32_nxv4f32_nxv4i8: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e32, m2, ta, mu +; CHECK-NEXT:    vluxei8.v v8, (a0), v10, v0.t +; CHECK-NEXT:    ret +entry: +  %a = call <vscale x 4 x float> @llvm.riscv.vluxei.mask.nxv4f32.nxv4i8( +    <vscale x 4 x float> %0, +    ptr %1, +    <vscale x 4 x i8> %2, +    <vscale x 4 x i1> %3, +    iXLen %4, iXLen 1) + +  ret <vscale x 4 x float> %a +} + +declare <vscale x 8 x float> @llvm.riscv.vluxei.nxv8f32.nxv8i8( +  <vscale x 8 x float>, +  ptr, +  <vscale x 8 x i8>, +  iXLen); + +define <vscale x 8 x float> @intrinsic_vluxei_v_nxv8f32_nxv8f32_nxv8i8(ptr %0, <vscale x 8 x i8> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv8f32_nxv8f32_nxv8i8: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetivli zero, 1, e8, m1, ta, ma +; CHECK-NEXT:    vmv1r.v v12, v8 +; CHECK-NEXT:    vsetvli zero, a1, e32, m4, ta, ma +; CHECK-NEXT:    vluxei8.v v8, (a0), v12 +; CHECK-NEXT:    ret +entry: +  %a = call <vscale x 8 x float> @llvm.riscv.vluxei.nxv8f32.nxv8i8( +    <vscale x 8 x float> poison, +    ptr %0, +    <vscale x 8 x i8> %1, +    iXLen %2) + +  ret <vscale x 8 x float> %a +} + +declare <vscale x 8 x float> @llvm.riscv.vluxei.mask.nxv8f32.nxv8i8( +  <vscale x 8 x float>, +  ptr, +  <vscale x 8 x i8>, +  <vscale x 8 x i1>, +  iXLen, +  iXLen); + +define <vscale x 8 x float> @intrinsic_vluxei_mask_v_nxv8f32_nxv8f32_nxv8i8(<vscale x 8 x float> %0, ptr %1, <vscale x 8 x i8> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv8f32_nxv8f32_nxv8i8: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e32, m4, ta, mu +; CHECK-NEXT:    vluxei8.v v8, (a0), v12, v0.t +; CHECK-NEXT:    ret +entry: +  %a = call <vscale x 8 x float> @llvm.riscv.vluxei.mask.nxv8f32.nxv8i8( +    <vscale x 8 x float> %0, +    ptr %1, +    <vscale x 8 x i8> %2, +    <vscale x 8 x i1> %3, +    iXLen %4, iXLen 1) + +  ret <vscale x 8 x float> %a +} + +declare <vscale x 16 x float> @llvm.riscv.vluxei.nxv16f32.nxv16i8( +  <vscale x 16 x float>, +  ptr, +  <vscale x 16 x i8>, +  iXLen); + +define <vscale x 16 x float> @intrinsic_vluxei_v_nxv16f32_nxv16f32_nxv16i8(ptr %0, <vscale x 16 x i8> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv16f32_nxv16f32_nxv16i8: +; CHECK:       # 
%bb.0: # %entry +; CHECK-NEXT:    vsetivli zero, 1, e8, m1, ta, ma +; CHECK-NEXT:    vmv2r.v v16, v8 +; CHECK-NEXT:    vsetvli zero, a1, e32, m8, ta, ma +; CHECK-NEXT:    vluxei8.v v8, (a0), v16 +; CHECK-NEXT:    ret +entry: +  %a = call <vscale x 16 x float> @llvm.riscv.vluxei.nxv16f32.nxv16i8( +    <vscale x 16 x float> poison, +    ptr %0, +    <vscale x 16 x i8> %1, +    iXLen %2) + +  ret <vscale x 16 x float> %a +} + +declare <vscale x 16 x float> @llvm.riscv.vluxei.mask.nxv16f32.nxv16i8( +  <vscale x 16 x float>, +  ptr, +  <vscale x 16 x i8>, +  <vscale x 16 x i1>, +  iXLen, +  iXLen); + +define <vscale x 16 x float> @intrinsic_vluxei_mask_v_nxv16f32_nxv16f32_nxv16i8(<vscale x 16 x float> %0, ptr %1, <vscale x 16 x i8> %2, <vscale x 16 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv16f32_nxv16f32_nxv16i8: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e32, m8, ta, mu +; CHECK-NEXT:    vluxei8.v v8, (a0), v16, v0.t +; CHECK-NEXT:    ret +entry: +  %a = call <vscale x 16 x float> @llvm.riscv.vluxei.mask.nxv16f32.nxv16i8( +    <vscale x 16 x float> %0, +    ptr %1, +    <vscale x 16 x i8> %2, +    <vscale x 16 x i1> %3, +    iXLen %4, iXLen 1) + +  ret <vscale x 16 x float> %a +} + +declare <vscale x 1 x double> @llvm.riscv.vluxei.nxv1f64.nxv1i8( +  <vscale x 1 x double>, +  ptr, +  <vscale x 1 x i8>, +  iXLen); + +define <vscale x 1 x double> @intrinsic_vluxei_v_nxv1f64_nxv1f64_nxv1i8(ptr %0, <vscale x 1 x i8> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv1f64_nxv1f64_nxv1i8: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e64, m1, ta, ma +; CHECK-NEXT:    vluxei8.v v9, (a0), v8 +; CHECK-NEXT:    vmv.v.v v8, v9 +; CHECK-NEXT:    ret +entry: +  %a = call <vscale x 1 x double> @llvm.riscv.vluxei.nxv1f64.nxv1i8( +    <vscale x 1 x double> poison, +    ptr %0, +    <vscale x 1 x i8> %1, +    iXLen %2) + +  ret <vscale x 1 x double> %a +} + +declare <vscale x 1 x double> @llvm.riscv.vluxei.mask.nxv1f64.nxv1i8( +  <vscale x 1 x double>, +  ptr, +  <vscale x 1 x i8>, +  <vscale x 1 x i1>, +  iXLen, +  iXLen); + +define <vscale x 1 x double> @intrinsic_vluxei_mask_v_nxv1f64_nxv1f64_nxv1i8(<vscale x 1 x double> %0, ptr %1, <vscale x 1 x i8> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv1f64_nxv1f64_nxv1i8: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e64, m1, ta, mu +; CHECK-NEXT:    vluxei8.v v8, (a0), v9, v0.t +; CHECK-NEXT:    ret +entry: +  %a = call <vscale x 1 x double> @llvm.riscv.vluxei.mask.nxv1f64.nxv1i8( +    <vscale x 1 x double> %0, +    ptr %1, +    <vscale x 1 x i8> %2, +    <vscale x 1 x i1> %3, +    iXLen %4, iXLen 1) + +  ret <vscale x 1 x double> %a +} + +declare <vscale x 2 x double> @llvm.riscv.vluxei.nxv2f64.nxv2i8( +  <vscale x 2 x double>, +  ptr, +  <vscale x 2 x i8>, +  iXLen); + +define <vscale x 2 x double> @intrinsic_vluxei_v_nxv2f64_nxv2f64_nxv2i8(ptr %0, <vscale x 2 x i8> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv2f64_nxv2f64_nxv2i8: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetivli zero, 1, e8, m1, ta, ma +; CHECK-NEXT:    vmv1r.v v10, v8 +; CHECK-NEXT:    vsetvli zero, a1, e64, m2, ta, ma +; CHECK-NEXT:    vluxei8.v v8, (a0), v10 +; CHECK-NEXT:    ret +entry: +  %a = call <vscale x 2 x double> @llvm.riscv.vluxei.nxv2f64.nxv2i8( +    <vscale x 2 x double> poison, +    ptr %0, +    <vscale x 2 x i8> %1, +    iXLen %2) + +  ret <vscale x 2 x double> %a +} + +declare <vscale x 
2 x double> @llvm.riscv.vluxei.mask.nxv2f64.nxv2i8( +  <vscale x 2 x double>, +  ptr, +  <vscale x 2 x i8>, +  <vscale x 2 x i1>, +  iXLen, +  iXLen); + +define <vscale x 2 x double> @intrinsic_vluxei_mask_v_nxv2f64_nxv2f64_nxv2i8(<vscale x 2 x double> %0, ptr %1, <vscale x 2 x i8> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv2f64_nxv2f64_nxv2i8: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e64, m2, ta, mu +; CHECK-NEXT:    vluxei8.v v8, (a0), v10, v0.t +; CHECK-NEXT:    ret +entry: +  %a = call <vscale x 2 x double> @llvm.riscv.vluxei.mask.nxv2f64.nxv2i8( +    <vscale x 2 x double> %0, +    ptr %1, +    <vscale x 2 x i8> %2, +    <vscale x 2 x i1> %3, +    iXLen %4, iXLen 1) + +  ret <vscale x 2 x double> %a +} + +declare <vscale x 4 x double> @llvm.riscv.vluxei.nxv4f64.nxv4i8( +  <vscale x 4 x double>, +  ptr, +  <vscale x 4 x i8>, +  iXLen); + +define <vscale x 4 x double> @intrinsic_vluxei_v_nxv4f64_nxv4f64_nxv4i8(ptr %0, <vscale x 4 x i8> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv4f64_nxv4f64_nxv4i8: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetivli zero, 1, e8, m1, ta, ma +; CHECK-NEXT:    vmv1r.v v12, v8 +; CHECK-NEXT:    vsetvli zero, a1, e64, m4, ta, ma +; CHECK-NEXT:    vluxei8.v v8, (a0), v12 +; CHECK-NEXT:    ret +entry: +  %a = call <vscale x 4 x double> @llvm.riscv.vluxei.nxv4f64.nxv4i8( +    <vscale x 4 x double> poison, +    ptr %0, +    <vscale x 4 x i8> %1, +    iXLen %2) + +  ret <vscale x 4 x double> %a +} + +declare <vscale x 4 x double> @llvm.riscv.vluxei.mask.nxv4f64.nxv4i8( +  <vscale x 4 x double>, +  ptr, +  <vscale x 4 x i8>, +  <vscale x 4 x i1>, +  iXLen, +  iXLen); + +define <vscale x 4 x double> @intrinsic_vluxei_mask_v_nxv4f64_nxv4f64_nxv4i8(<vscale x 4 x double> %0, ptr %1, <vscale x 4 x i8> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv4f64_nxv4f64_nxv4i8: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e64, m4, ta, mu +; CHECK-NEXT:    vluxei8.v v8, (a0), v12, v0.t +; CHECK-NEXT:    ret +entry: +  %a = call <vscale x 4 x double> @llvm.riscv.vluxei.mask.nxv4f64.nxv4i8( +    <vscale x 4 x double> %0, +    ptr %1, +    <vscale x 4 x i8> %2, +    <vscale x 4 x i1> %3, +    iXLen %4, iXLen 1) + +  ret <vscale x 4 x double> %a +} + +declare <vscale x 8 x double> @llvm.riscv.vluxei.nxv8f64.nxv8i8( +  <vscale x 8 x double>, +  ptr, +  <vscale x 8 x i8>, +  iXLen); + +define <vscale x 8 x double> @intrinsic_vluxei_v_nxv8f64_nxv8f64_nxv8i8(ptr %0, <vscale x 8 x i8> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vluxei_v_nxv8f64_nxv8f64_nxv8i8: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetivli zero, 1, e8, m1, ta, ma +; CHECK-NEXT:    vmv1r.v v16, v8 +; CHECK-NEXT:    vsetvli zero, a1, e64, m8, ta, ma +; CHECK-NEXT:    vluxei8.v v8, (a0), v16 +; CHECK-NEXT:    ret +entry: +  %a = call <vscale x 8 x double> @llvm.riscv.vluxei.nxv8f64.nxv8i8( +    <vscale x 8 x double> poison, +    ptr %0, +    <vscale x 8 x i8> %1, +    iXLen %2) + +  ret <vscale x 8 x double> %a +} + +declare <vscale x 8 x double> @llvm.riscv.vluxei.mask.nxv8f64.nxv8i8( +  <vscale x 8 x double>, +  ptr, +  <vscale x 8 x i8>, +  <vscale x 8 x i1>, +  iXLen, +  iXLen); + +define <vscale x 8 x double> @intrinsic_vluxei_mask_v_nxv8f64_nxv8f64_nxv8i8(<vscale x 8 x double> %0, ptr %1, <vscale x 8 x i8> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vluxei_mask_v_nxv8f64_nxv8f64_nxv8i8: +; CHECK:      
 # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e64, m8, ta, mu +; CHECK-NEXT:    vluxei8.v v8, (a0), v16, v0.t +; CHECK-NEXT:    ret +entry: +  %a = call <vscale x 8 x double> @llvm.riscv.vluxei.mask.nxv8f64.nxv8i8( +    <vscale x 8 x double> %0, +    ptr %1, +    <vscale x 8 x i8> %2, +    <vscale x 8 x i1> %3, +    iXLen %4, iXLen 1) + +  ret <vscale x 8 x double> %a +} diff --git a/llvm/test/CodeGen/RISCV/GlobalISel/rvv/vsoxei-rv64.ll b/llvm/test/CodeGen/RISCV/GlobalISel/rvv/vsoxei-rv64.ll new file mode 100644 index 0000000..4963d91 --- /dev/null +++ b/llvm/test/CodeGen/RISCV/GlobalISel/rvv/vsoxei-rv64.ll @@ -0,0 +1,1293 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=riscv64 -mattr=+v,+zvfhmin,+zvfbfmin -global-isel -verify-machineinstrs \ +; RUN:   < %s | FileCheck %s + +; The intrinsics are not supported with RV32. + +declare void @llvm.riscv.vsoxei.nxv1i8.nxv1i64( +  <vscale x 1 x i8>, +  ptr, +  <vscale x 1 x i64>, +  i64); + +define void @intrinsic_vsoxei_v_nxv1i8_nxv1i8_nxv1i64(<vscale x 1 x i8> %0, ptr %1, <vscale x 1 x i64> %2, i64 %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv1i8_nxv1i8_nxv1i64: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e8, mf8, ta, ma +; CHECK-NEXT:    vsoxei64.v v8, (a0), v9 +; CHECK-NEXT:    ret +entry: +  call void @llvm.riscv.vsoxei.nxv1i8.nxv1i64( +    <vscale x 1 x i8> %0, +    ptr %1, +    <vscale x 1 x i64> %2, +    i64 %3) + +  ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv1i8.nxv1i64( +  <vscale x 1 x i8>, +  ptr, +  <vscale x 1 x i64>, +  <vscale x 1 x i1>, +  i64); + +define void @intrinsic_vsoxei_mask_v_nxv1i8_nxv1i8_nxv1i64(<vscale x 1 x i8> %0, ptr %1, <vscale x 1 x i64> %2, <vscale x 1 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv1i8_nxv1i8_nxv1i64: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e8, mf8, ta, ma +; CHECK-NEXT:    vsoxei64.v v8, (a0), v9, v0.t +; CHECK-NEXT:    ret +entry: +  call void @llvm.riscv.vsoxei.mask.nxv1i8.nxv1i64( +    <vscale x 1 x i8> %0, +    ptr %1, +    <vscale x 1 x i64> %2, +    <vscale x 1 x i1> %3, +    i64 %4) + +  ret void +} + +declare void @llvm.riscv.vsoxei.nxv2i8.nxv2i64( +  <vscale x 2 x i8>, +  ptr, +  <vscale x 2 x i64>, +  i64); + +define void @intrinsic_vsoxei_v_nxv2i8_nxv2i8_nxv2i64(<vscale x 2 x i8> %0, ptr %1, <vscale x 2 x i64> %2, i64 %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv2i8_nxv2i8_nxv2i64: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e8, mf4, ta, ma +; CHECK-NEXT:    vsoxei64.v v8, (a0), v10 +; CHECK-NEXT:    ret +entry: +  call void @llvm.riscv.vsoxei.nxv2i8.nxv2i64( +    <vscale x 2 x i8> %0, +    ptr %1, +    <vscale x 2 x i64> %2, +    i64 %3) + +  ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv2i8.nxv2i64( +  <vscale x 2 x i8>, +  ptr, +  <vscale x 2 x i64>, +  <vscale x 2 x i1>, +  i64); + +define void @intrinsic_vsoxei_mask_v_nxv2i8_nxv2i8_nxv2i64(<vscale x 2 x i8> %0, ptr %1, <vscale x 2 x i64> %2, <vscale x 2 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv2i8_nxv2i8_nxv2i64: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e8, mf4, ta, ma +; CHECK-NEXT:    vsoxei64.v v8, (a0), v10, v0.t +; CHECK-NEXT:    ret +entry: +  call void @llvm.riscv.vsoxei.mask.nxv2i8.nxv2i64( +    <vscale x 2 x i8> %0, +    ptr %1, +    <vscale x 2 x i64> %2, +    <vscale x 2 x i1> %3, +    i64 %4) + +  ret void +} + +declare void 
@llvm.riscv.vsoxei.nxv4i8.nxv4i64( +  <vscale x 4 x i8>, +  ptr, +  <vscale x 4 x i64>, +  i64); + +define void @intrinsic_vsoxei_v_nxv4i8_nxv4i8_nxv4i64(<vscale x 4 x i8> %0, ptr %1, <vscale x 4 x i64> %2, i64 %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv4i8_nxv4i8_nxv4i64: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e8, mf2, ta, ma +; CHECK-NEXT:    vsoxei64.v v8, (a0), v12 +; CHECK-NEXT:    ret +entry: +  call void @llvm.riscv.vsoxei.nxv4i8.nxv4i64( +    <vscale x 4 x i8> %0, +    ptr %1, +    <vscale x 4 x i64> %2, +    i64 %3) + +  ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv4i8.nxv4i64( +  <vscale x 4 x i8>, +  ptr, +  <vscale x 4 x i64>, +  <vscale x 4 x i1>, +  i64); + +define void @intrinsic_vsoxei_mask_v_nxv4i8_nxv4i8_nxv4i64(<vscale x 4 x i8> %0, ptr %1, <vscale x 4 x i64> %2, <vscale x 4 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv4i8_nxv4i8_nxv4i64: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e8, mf2, ta, ma +; CHECK-NEXT:    vsoxei64.v v8, (a0), v12, v0.t +; CHECK-NEXT:    ret +entry: +  call void @llvm.riscv.vsoxei.mask.nxv4i8.nxv4i64( +    <vscale x 4 x i8> %0, +    ptr %1, +    <vscale x 4 x i64> %2, +    <vscale x 4 x i1> %3, +    i64 %4) + +  ret void +} + +declare void @llvm.riscv.vsoxei.nxv8i8.nxv8i64( +  <vscale x 8 x i8>, +  ptr, +  <vscale x 8 x i64>, +  i64); + +define void @intrinsic_vsoxei_v_nxv8i8_nxv8i8_nxv8i64(<vscale x 8 x i8> %0, ptr %1, <vscale x 8 x i64> %2, i64 %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv8i8_nxv8i8_nxv8i64: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e8, m1, ta, ma +; CHECK-NEXT:    vsoxei64.v v8, (a0), v16 +; CHECK-NEXT:    ret +entry: +  call void @llvm.riscv.vsoxei.nxv8i8.nxv8i64( +    <vscale x 8 x i8> %0, +    ptr %1, +    <vscale x 8 x i64> %2, +    i64 %3) + +  ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv8i8.nxv8i64( +  <vscale x 8 x i8>, +  ptr, +  <vscale x 8 x i64>, +  <vscale x 8 x i1>, +  i64); + +define void @intrinsic_vsoxei_mask_v_nxv8i8_nxv8i8_nxv8i64(<vscale x 8 x i8> %0, ptr %1, <vscale x 8 x i64> %2, <vscale x 8 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv8i8_nxv8i8_nxv8i64: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e8, m1, ta, ma +; CHECK-NEXT:    vsoxei64.v v8, (a0), v16, v0.t +; CHECK-NEXT:    ret +entry: +  call void @llvm.riscv.vsoxei.mask.nxv8i8.nxv8i64( +    <vscale x 8 x i8> %0, +    ptr %1, +    <vscale x 8 x i64> %2, +    <vscale x 8 x i1> %3, +    i64 %4) + +  ret void +} + +declare void @llvm.riscv.vsoxei.nxv1i16.nxv1i64( +  <vscale x 1 x i16>, +  ptr, +  <vscale x 1 x i64>, +  i64); + +define void @intrinsic_vsoxei_v_nxv1i16_nxv1i16_nxv1i64(<vscale x 1 x i16> %0, ptr %1, <vscale x 1 x i64> %2, i64 %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv1i16_nxv1i16_nxv1i64: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e16, mf4, ta, ma +; CHECK-NEXT:    vsoxei64.v v8, (a0), v9 +; CHECK-NEXT:    ret +entry: +  call void @llvm.riscv.vsoxei.nxv1i16.nxv1i64( +    <vscale x 1 x i16> %0, +    ptr %1, +    <vscale x 1 x i64> %2, +    i64 %3) + +  ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv1i16.nxv1i64( +  <vscale x 1 x i16>, +  ptr, +  <vscale x 1 x i64>, +  <vscale x 1 x i1>, +  i64); + +define void @intrinsic_vsoxei_mask_v_nxv1i16_nxv1i16_nxv1i64(<vscale x 1 x i16> %0, ptr %1, <vscale x 1 x i64> %2, <vscale x 1 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: 
intrinsic_vsoxei_mask_v_nxv1i16_nxv1i16_nxv1i64: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e16, mf4, ta, ma +; CHECK-NEXT:    vsoxei64.v v8, (a0), v9, v0.t +; CHECK-NEXT:    ret +entry: +  call void @llvm.riscv.vsoxei.mask.nxv1i16.nxv1i64( +    <vscale x 1 x i16> %0, +    ptr %1, +    <vscale x 1 x i64> %2, +    <vscale x 1 x i1> %3, +    i64 %4) + +  ret void +} + +declare void @llvm.riscv.vsoxei.nxv2i16.nxv2i64( +  <vscale x 2 x i16>, +  ptr, +  <vscale x 2 x i64>, +  i64); + +define void @intrinsic_vsoxei_v_nxv2i16_nxv2i16_nxv2i64(<vscale x 2 x i16> %0, ptr %1, <vscale x 2 x i64> %2, i64 %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv2i16_nxv2i16_nxv2i64: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e16, mf2, ta, ma +; CHECK-NEXT:    vsoxei64.v v8, (a0), v10 +; CHECK-NEXT:    ret +entry: +  call void @llvm.riscv.vsoxei.nxv2i16.nxv2i64( +    <vscale x 2 x i16> %0, +    ptr %1, +    <vscale x 2 x i64> %2, +    i64 %3) + +  ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv2i16.nxv2i64( +  <vscale x 2 x i16>, +  ptr, +  <vscale x 2 x i64>, +  <vscale x 2 x i1>, +  i64); + +define void @intrinsic_vsoxei_mask_v_nxv2i16_nxv2i16_nxv2i64(<vscale x 2 x i16> %0, ptr %1, <vscale x 2 x i64> %2, <vscale x 2 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv2i16_nxv2i16_nxv2i64: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e16, mf2, ta, ma +; CHECK-NEXT:    vsoxei64.v v8, (a0), v10, v0.t +; CHECK-NEXT:    ret +entry: +  call void @llvm.riscv.vsoxei.mask.nxv2i16.nxv2i64( +    <vscale x 2 x i16> %0, +    ptr %1, +    <vscale x 2 x i64> %2, +    <vscale x 2 x i1> %3, +    i64 %4) + +  ret void +} + +declare void @llvm.riscv.vsoxei.nxv4i16.nxv4i64( +  <vscale x 4 x i16>, +  ptr, +  <vscale x 4 x i64>, +  i64); + +define void @intrinsic_vsoxei_v_nxv4i16_nxv4i16_nxv4i64(<vscale x 4 x i16> %0, ptr %1, <vscale x 4 x i64> %2, i64 %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv4i16_nxv4i16_nxv4i64: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e16, m1, ta, ma +; CHECK-NEXT:    vsoxei64.v v8, (a0), v12 +; CHECK-NEXT:    ret +entry: +  call void @llvm.riscv.vsoxei.nxv4i16.nxv4i64( +    <vscale x 4 x i16> %0, +    ptr %1, +    <vscale x 4 x i64> %2, +    i64 %3) + +  ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv4i16.nxv4i64( +  <vscale x 4 x i16>, +  ptr, +  <vscale x 4 x i64>, +  <vscale x 4 x i1>, +  i64); + +define void @intrinsic_vsoxei_mask_v_nxv4i16_nxv4i16_nxv4i64(<vscale x 4 x i16> %0, ptr %1, <vscale x 4 x i64> %2, <vscale x 4 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv4i16_nxv4i16_nxv4i64: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e16, m1, ta, ma +; CHECK-NEXT:    vsoxei64.v v8, (a0), v12, v0.t +; CHECK-NEXT:    ret +entry: +  call void @llvm.riscv.vsoxei.mask.nxv4i16.nxv4i64( +    <vscale x 4 x i16> %0, +    ptr %1, +    <vscale x 4 x i64> %2, +    <vscale x 4 x i1> %3, +    i64 %4) + +  ret void +} + +declare void @llvm.riscv.vsoxei.nxv8i16.nxv8i64( +  <vscale x 8 x i16>, +  ptr, +  <vscale x 8 x i64>, +  i64); + +define void @intrinsic_vsoxei_v_nxv8i16_nxv8i16_nxv8i64(<vscale x 8 x i16> %0, ptr %1, <vscale x 8 x i64> %2, i64 %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv8i16_nxv8i16_nxv8i64: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e16, m2, ta, ma +; CHECK-NEXT:    vsoxei64.v v8, (a0), v16 +; CHECK-NEXT:    ret +entry: +  call void 
@llvm.riscv.vsoxei.nxv8i16.nxv8i64( +    <vscale x 8 x i16> %0, +    ptr %1, +    <vscale x 8 x i64> %2, +    i64 %3) + +  ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv8i16.nxv8i64( +  <vscale x 8 x i16>, +  ptr, +  <vscale x 8 x i64>, +  <vscale x 8 x i1>, +  i64); + +define void @intrinsic_vsoxei_mask_v_nxv8i16_nxv8i16_nxv8i64(<vscale x 8 x i16> %0, ptr %1, <vscale x 8 x i64> %2, <vscale x 8 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv8i16_nxv8i16_nxv8i64: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e16, m2, ta, ma +; CHECK-NEXT:    vsoxei64.v v8, (a0), v16, v0.t +; CHECK-NEXT:    ret +entry: +  call void @llvm.riscv.vsoxei.mask.nxv8i16.nxv8i64( +    <vscale x 8 x i16> %0, +    ptr %1, +    <vscale x 8 x i64> %2, +    <vscale x 8 x i1> %3, +    i64 %4) + +  ret void +} + +declare void @llvm.riscv.vsoxei.nxv1i32.nxv1i64( +  <vscale x 1 x i32>, +  ptr, +  <vscale x 1 x i64>, +  i64); + +define void @intrinsic_vsoxei_v_nxv1i32_nxv1i32_nxv1i64(<vscale x 1 x i32> %0, ptr %1, <vscale x 1 x i64> %2, i64 %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv1i32_nxv1i32_nxv1i64: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e32, mf2, ta, ma +; CHECK-NEXT:    vsoxei64.v v8, (a0), v9 +; CHECK-NEXT:    ret +entry: +  call void @llvm.riscv.vsoxei.nxv1i32.nxv1i64( +    <vscale x 1 x i32> %0, +    ptr %1, +    <vscale x 1 x i64> %2, +    i64 %3) + +  ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv1i32.nxv1i64( +  <vscale x 1 x i32>, +  ptr, +  <vscale x 1 x i64>, +  <vscale x 1 x i1>, +  i64); + +define void @intrinsic_vsoxei_mask_v_nxv1i32_nxv1i32_nxv1i64(<vscale x 1 x i32> %0, ptr %1, <vscale x 1 x i64> %2, <vscale x 1 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv1i32_nxv1i32_nxv1i64: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e32, mf2, ta, ma +; CHECK-NEXT:    vsoxei64.v v8, (a0), v9, v0.t +; CHECK-NEXT:    ret +entry: +  call void @llvm.riscv.vsoxei.mask.nxv1i32.nxv1i64( +    <vscale x 1 x i32> %0, +    ptr %1, +    <vscale x 1 x i64> %2, +    <vscale x 1 x i1> %3, +    i64 %4) + +  ret void +} + +declare void @llvm.riscv.vsoxei.nxv2i32.nxv2i64( +  <vscale x 2 x i32>, +  ptr, +  <vscale x 2 x i64>, +  i64); + +define void @intrinsic_vsoxei_v_nxv2i32_nxv2i32_nxv2i64(<vscale x 2 x i32> %0, ptr %1, <vscale x 2 x i64> %2, i64 %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv2i32_nxv2i32_nxv2i64: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e32, m1, ta, ma +; CHECK-NEXT:    vsoxei64.v v8, (a0), v10 +; CHECK-NEXT:    ret +entry: +  call void @llvm.riscv.vsoxei.nxv2i32.nxv2i64( +    <vscale x 2 x i32> %0, +    ptr %1, +    <vscale x 2 x i64> %2, +    i64 %3) + +  ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv2i32.nxv2i64( +  <vscale x 2 x i32>, +  ptr, +  <vscale x 2 x i64>, +  <vscale x 2 x i1>, +  i64); + +define void @intrinsic_vsoxei_mask_v_nxv2i32_nxv2i32_nxv2i64(<vscale x 2 x i32> %0, ptr %1, <vscale x 2 x i64> %2, <vscale x 2 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv2i32_nxv2i32_nxv2i64: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e32, m1, ta, ma +; CHECK-NEXT:    vsoxei64.v v8, (a0), v10, v0.t +; CHECK-NEXT:    ret +entry: +  call void @llvm.riscv.vsoxei.mask.nxv2i32.nxv2i64( +    <vscale x 2 x i32> %0, +    ptr %1, +    <vscale x 2 x i64> %2, +    <vscale x 2 x i1> %3, +    i64 %4) + +  ret void +} + +declare void @llvm.riscv.vsoxei.nxv4i32.nxv4i64( +  
<vscale x 4 x i32>, +  ptr, +  <vscale x 4 x i64>, +  i64); + +define void @intrinsic_vsoxei_v_nxv4i32_nxv4i32_nxv4i64(<vscale x 4 x i32> %0, ptr %1, <vscale x 4 x i64> %2, i64 %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv4i32_nxv4i32_nxv4i64: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e32, m2, ta, ma +; CHECK-NEXT:    vsoxei64.v v8, (a0), v12 +; CHECK-NEXT:    ret +entry: +  call void @llvm.riscv.vsoxei.nxv4i32.nxv4i64( +    <vscale x 4 x i32> %0, +    ptr %1, +    <vscale x 4 x i64> %2, +    i64 %3) + +  ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv4i32.nxv4i64( +  <vscale x 4 x i32>, +  ptr, +  <vscale x 4 x i64>, +  <vscale x 4 x i1>, +  i64); + +define void @intrinsic_vsoxei_mask_v_nxv4i32_nxv4i32_nxv4i64(<vscale x 4 x i32> %0, ptr %1, <vscale x 4 x i64> %2, <vscale x 4 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv4i32_nxv4i32_nxv4i64: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e32, m2, ta, ma +; CHECK-NEXT:    vsoxei64.v v8, (a0), v12, v0.t +; CHECK-NEXT:    ret +entry: +  call void @llvm.riscv.vsoxei.mask.nxv4i32.nxv4i64( +    <vscale x 4 x i32> %0, +    ptr %1, +    <vscale x 4 x i64> %2, +    <vscale x 4 x i1> %3, +    i64 %4) + +  ret void +} + +declare void @llvm.riscv.vsoxei.nxv8i32.nxv8i64( +  <vscale x 8 x i32>, +  ptr, +  <vscale x 8 x i64>, +  i64); + +define void @intrinsic_vsoxei_v_nxv8i32_nxv8i32_nxv8i64(<vscale x 8 x i32> %0, ptr %1, <vscale x 8 x i64> %2, i64 %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv8i32_nxv8i32_nxv8i64: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e32, m4, ta, ma +; CHECK-NEXT:    vsoxei64.v v8, (a0), v16 +; CHECK-NEXT:    ret +entry: +  call void @llvm.riscv.vsoxei.nxv8i32.nxv8i64( +    <vscale x 8 x i32> %0, +    ptr %1, +    <vscale x 8 x i64> %2, +    i64 %3) + +  ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv8i32.nxv8i64( +  <vscale x 8 x i32>, +  ptr, +  <vscale x 8 x i64>, +  <vscale x 8 x i1>, +  i64); + +define void @intrinsic_vsoxei_mask_v_nxv8i32_nxv8i32_nxv8i64(<vscale x 8 x i32> %0, ptr %1, <vscale x 8 x i64> %2, <vscale x 8 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv8i32_nxv8i32_nxv8i64: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e32, m4, ta, ma +; CHECK-NEXT:    vsoxei64.v v8, (a0), v16, v0.t +; CHECK-NEXT:    ret +entry: +  call void @llvm.riscv.vsoxei.mask.nxv8i32.nxv8i64( +    <vscale x 8 x i32> %0, +    ptr %1, +    <vscale x 8 x i64> %2, +    <vscale x 8 x i1> %3, +    i64 %4) + +  ret void +} + +declare void @llvm.riscv.vsoxei.nxv1i64.nxv1i64( +  <vscale x 1 x i64>, +  ptr, +  <vscale x 1 x i64>, +  i64); + +define void @intrinsic_vsoxei_v_nxv1i64_nxv1i64_nxv1i64(<vscale x 1 x i64> %0, ptr %1, <vscale x 1 x i64> %2, i64 %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv1i64_nxv1i64_nxv1i64: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e64, m1, ta, ma +; CHECK-NEXT:    vsoxei64.v v8, (a0), v9 +; CHECK-NEXT:    ret +entry: +  call void @llvm.riscv.vsoxei.nxv1i64.nxv1i64( +    <vscale x 1 x i64> %0, +    ptr %1, +    <vscale x 1 x i64> %2, +    i64 %3) + +  ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv1i64.nxv1i64( +  <vscale x 1 x i64>, +  ptr, +  <vscale x 1 x i64>, +  <vscale x 1 x i1>, +  i64); + +define void @intrinsic_vsoxei_mask_v_nxv1i64_nxv1i64_nxv1i64(<vscale x 1 x i64> %0, ptr %1, <vscale x 1 x i64> %2, <vscale x 1 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: 
intrinsic_vsoxei_mask_v_nxv1i64_nxv1i64_nxv1i64: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e64, m1, ta, ma +; CHECK-NEXT:    vsoxei64.v v8, (a0), v9, v0.t +; CHECK-NEXT:    ret +entry: +  call void @llvm.riscv.vsoxei.mask.nxv1i64.nxv1i64( +    <vscale x 1 x i64> %0, +    ptr %1, +    <vscale x 1 x i64> %2, +    <vscale x 1 x i1> %3, +    i64 %4) + +  ret void +} + +declare void @llvm.riscv.vsoxei.nxv2i64.nxv2i64( +  <vscale x 2 x i64>, +  ptr, +  <vscale x 2 x i64>, +  i64); + +define void @intrinsic_vsoxei_v_nxv2i64_nxv2i64_nxv2i64(<vscale x 2 x i64> %0, ptr %1, <vscale x 2 x i64> %2, i64 %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv2i64_nxv2i64_nxv2i64: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e64, m2, ta, ma +; CHECK-NEXT:    vsoxei64.v v8, (a0), v10 +; CHECK-NEXT:    ret +entry: +  call void @llvm.riscv.vsoxei.nxv2i64.nxv2i64( +    <vscale x 2 x i64> %0, +    ptr %1, +    <vscale x 2 x i64> %2, +    i64 %3) + +  ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv2i64.nxv2i64( +  <vscale x 2 x i64>, +  ptr, +  <vscale x 2 x i64>, +  <vscale x 2 x i1>, +  i64); + +define void @intrinsic_vsoxei_mask_v_nxv2i64_nxv2i64_nxv2i64(<vscale x 2 x i64> %0, ptr %1, <vscale x 2 x i64> %2, <vscale x 2 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv2i64_nxv2i64_nxv2i64: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e64, m2, ta, ma +; CHECK-NEXT:    vsoxei64.v v8, (a0), v10, v0.t +; CHECK-NEXT:    ret +entry: +  call void @llvm.riscv.vsoxei.mask.nxv2i64.nxv2i64( +    <vscale x 2 x i64> %0, +    ptr %1, +    <vscale x 2 x i64> %2, +    <vscale x 2 x i1> %3, +    i64 %4) + +  ret void +} + +declare void @llvm.riscv.vsoxei.nxv4i64.nxv4i64( +  <vscale x 4 x i64>, +  ptr, +  <vscale x 4 x i64>, +  i64); + +define void @intrinsic_vsoxei_v_nxv4i64_nxv4i64_nxv4i64(<vscale x 4 x i64> %0, ptr %1, <vscale x 4 x i64> %2, i64 %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv4i64_nxv4i64_nxv4i64: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e64, m4, ta, ma +; CHECK-NEXT:    vsoxei64.v v8, (a0), v12 +; CHECK-NEXT:    ret +entry: +  call void @llvm.riscv.vsoxei.nxv4i64.nxv4i64( +    <vscale x 4 x i64> %0, +    ptr %1, +    <vscale x 4 x i64> %2, +    i64 %3) + +  ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv4i64.nxv4i64( +  <vscale x 4 x i64>, +  ptr, +  <vscale x 4 x i64>, +  <vscale x 4 x i1>, +  i64); + +define void @intrinsic_vsoxei_mask_v_nxv4i64_nxv4i64_nxv4i64(<vscale x 4 x i64> %0, ptr %1, <vscale x 4 x i64> %2, <vscale x 4 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv4i64_nxv4i64_nxv4i64: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e64, m4, ta, ma +; CHECK-NEXT:    vsoxei64.v v8, (a0), v12, v0.t +; CHECK-NEXT:    ret +entry: +  call void @llvm.riscv.vsoxei.mask.nxv4i64.nxv4i64( +    <vscale x 4 x i64> %0, +    ptr %1, +    <vscale x 4 x i64> %2, +    <vscale x 4 x i1> %3, +    i64 %4) + +  ret void +} + +declare void @llvm.riscv.vsoxei.nxv8i64.nxv8i64( +  <vscale x 8 x i64>, +  ptr, +  <vscale x 8 x i64>, +  i64); + +define void @intrinsic_vsoxei_v_nxv8i64_nxv8i64_nxv8i64(<vscale x 8 x i64> %0, ptr %1, <vscale x 8 x i64> %2, i64 %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv8i64_nxv8i64_nxv8i64: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e64, m8, ta, ma +; CHECK-NEXT:    vsoxei64.v v8, (a0), v16 +; CHECK-NEXT:    ret +entry: +  call void 
@llvm.riscv.vsoxei.nxv8i64.nxv8i64( +    <vscale x 8 x i64> %0, +    ptr %1, +    <vscale x 8 x i64> %2, +    i64 %3) + +  ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv8i64.nxv8i64( +  <vscale x 8 x i64>, +  ptr, +  <vscale x 8 x i64>, +  <vscale x 8 x i1>, +  i64); + +define void @intrinsic_vsoxei_mask_v_nxv8i64_nxv8i64_nxv8i64(<vscale x 8 x i64> %0, ptr %1, <vscale x 8 x i64> %2, <vscale x 8 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv8i64_nxv8i64_nxv8i64: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e64, m8, ta, ma +; CHECK-NEXT:    vsoxei64.v v8, (a0), v16, v0.t +; CHECK-NEXT:    ret +entry: +  call void @llvm.riscv.vsoxei.mask.nxv8i64.nxv8i64( +    <vscale x 8 x i64> %0, +    ptr %1, +    <vscale x 8 x i64> %2, +    <vscale x 8 x i1> %3, +    i64 %4) + +  ret void +} + +declare void @llvm.riscv.vsoxei.nxv1f16.nxv1i64( +  <vscale x 1 x half>, +  ptr, +  <vscale x 1 x i64>, +  i64); + +define void @intrinsic_vsoxei_v_nxv1f16_nxv1f16_nxv1i64(<vscale x 1 x half> %0, ptr %1, <vscale x 1 x i64> %2, i64 %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv1f16_nxv1f16_nxv1i64: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e16, mf4, ta, ma +; CHECK-NEXT:    vsoxei64.v v8, (a0), v9 +; CHECK-NEXT:    ret +entry: +  call void @llvm.riscv.vsoxei.nxv1f16.nxv1i64( +    <vscale x 1 x half> %0, +    ptr %1, +    <vscale x 1 x i64> %2, +    i64 %3) + +  ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv1f16.nxv1i64( +  <vscale x 1 x half>, +  ptr, +  <vscale x 1 x i64>, +  <vscale x 1 x i1>, +  i64); + +define void @intrinsic_vsoxei_mask_v_nxv1f16_nxv1f16_nxv1i64(<vscale x 1 x half> %0, ptr %1, <vscale x 1 x i64> %2, <vscale x 1 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv1f16_nxv1f16_nxv1i64: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e16, mf4, ta, ma +; CHECK-NEXT:    vsoxei64.v v8, (a0), v9, v0.t +; CHECK-NEXT:    ret +entry: +  call void @llvm.riscv.vsoxei.mask.nxv1f16.nxv1i64( +    <vscale x 1 x half> %0, +    ptr %1, +    <vscale x 1 x i64> %2, +    <vscale x 1 x i1> %3, +    i64 %4) + +  ret void +} + +declare void @llvm.riscv.vsoxei.nxv2f16.nxv2i64( +  <vscale x 2 x half>, +  ptr, +  <vscale x 2 x i64>, +  i64); + +define void @intrinsic_vsoxei_v_nxv2f16_nxv2f16_nxv2i64(<vscale x 2 x half> %0, ptr %1, <vscale x 2 x i64> %2, i64 %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv2f16_nxv2f16_nxv2i64: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e16, mf2, ta, ma +; CHECK-NEXT:    vsoxei64.v v8, (a0), v10 +; CHECK-NEXT:    ret +entry: +  call void @llvm.riscv.vsoxei.nxv2f16.nxv2i64( +    <vscale x 2 x half> %0, +    ptr %1, +    <vscale x 2 x i64> %2, +    i64 %3) + +  ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv2f16.nxv2i64( +  <vscale x 2 x half>, +  ptr, +  <vscale x 2 x i64>, +  <vscale x 2 x i1>, +  i64); + +define void @intrinsic_vsoxei_mask_v_nxv2f16_nxv2f16_nxv2i64(<vscale x 2 x half> %0, ptr %1, <vscale x 2 x i64> %2, <vscale x 2 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv2f16_nxv2f16_nxv2i64: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e16, mf2, ta, ma +; CHECK-NEXT:    vsoxei64.v v8, (a0), v10, v0.t +; CHECK-NEXT:    ret +entry: +  call void @llvm.riscv.vsoxei.mask.nxv2f16.nxv2i64( +    <vscale x 2 x half> %0, +    ptr %1, +    <vscale x 2 x i64> %2, +    <vscale x 2 x i1> %3, +    i64 %4) + +  ret void +} + +declare void 
@llvm.riscv.vsoxei.nxv4f16.nxv4i64( +  <vscale x 4 x half>, +  ptr, +  <vscale x 4 x i64>, +  i64); + +define void @intrinsic_vsoxei_v_nxv4f16_nxv4f16_nxv4i64(<vscale x 4 x half> %0, ptr %1, <vscale x 4 x i64> %2, i64 %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv4f16_nxv4f16_nxv4i64: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e16, m1, ta, ma +; CHECK-NEXT:    vsoxei64.v v8, (a0), v12 +; CHECK-NEXT:    ret +entry: +  call void @llvm.riscv.vsoxei.nxv4f16.nxv4i64( +    <vscale x 4 x half> %0, +    ptr %1, +    <vscale x 4 x i64> %2, +    i64 %3) + +  ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv4f16.nxv4i64( +  <vscale x 4 x half>, +  ptr, +  <vscale x 4 x i64>, +  <vscale x 4 x i1>, +  i64); + +define void @intrinsic_vsoxei_mask_v_nxv4f16_nxv4f16_nxv4i64(<vscale x 4 x half> %0, ptr %1, <vscale x 4 x i64> %2, <vscale x 4 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv4f16_nxv4f16_nxv4i64: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e16, m1, ta, ma +; CHECK-NEXT:    vsoxei64.v v8, (a0), v12, v0.t +; CHECK-NEXT:    ret +entry: +  call void @llvm.riscv.vsoxei.mask.nxv4f16.nxv4i64( +    <vscale x 4 x half> %0, +    ptr %1, +    <vscale x 4 x i64> %2, +    <vscale x 4 x i1> %3, +    i64 %4) + +  ret void +} + +declare void @llvm.riscv.vsoxei.nxv8f16.nxv8i64( +  <vscale x 8 x half>, +  ptr, +  <vscale x 8 x i64>, +  i64); + +define void @intrinsic_vsoxei_v_nxv8f16_nxv8f16_nxv8i64(<vscale x 8 x half> %0, ptr %1, <vscale x 8 x i64> %2, i64 %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv8f16_nxv8f16_nxv8i64: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e16, m2, ta, ma +; CHECK-NEXT:    vsoxei64.v v8, (a0), v16 +; CHECK-NEXT:    ret +entry: +  call void @llvm.riscv.vsoxei.nxv8f16.nxv8i64( +    <vscale x 8 x half> %0, +    ptr %1, +    <vscale x 8 x i64> %2, +    i64 %3) + +  ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv8f16.nxv8i64( +  <vscale x 8 x half>, +  ptr, +  <vscale x 8 x i64>, +  <vscale x 8 x i1>, +  i64); + +define void @intrinsic_vsoxei_mask_v_nxv8f16_nxv8f16_nxv8i64(<vscale x 8 x half> %0, ptr %1, <vscale x 8 x i64> %2, <vscale x 8 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv8f16_nxv8f16_nxv8i64: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e16, m2, ta, ma +; CHECK-NEXT:    vsoxei64.v v8, (a0), v16, v0.t +; CHECK-NEXT:    ret +entry: +  call void @llvm.riscv.vsoxei.mask.nxv8f16.nxv8i64( +    <vscale x 8 x half> %0, +    ptr %1, +    <vscale x 8 x i64> %2, +    <vscale x 8 x i1> %3, +    i64 %4) + +  ret void +} + +declare void @llvm.riscv.vsoxei.nxv1f32.nxv1i64( +  <vscale x 1 x float>, +  ptr, +  <vscale x 1 x i64>, +  i64); + +define void @intrinsic_vsoxei_v_nxv1f32_nxv1f32_nxv1i64(<vscale x 1 x float> %0, ptr %1, <vscale x 1 x i64> %2, i64 %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv1f32_nxv1f32_nxv1i64: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e32, mf2, ta, ma +; CHECK-NEXT:    vsoxei64.v v8, (a0), v9 +; CHECK-NEXT:    ret +entry: +  call void @llvm.riscv.vsoxei.nxv1f32.nxv1i64( +    <vscale x 1 x float> %0, +    ptr %1, +    <vscale x 1 x i64> %2, +    i64 %3) + +  ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv1f32.nxv1i64( +  <vscale x 1 x float>, +  ptr, +  <vscale x 1 x i64>, +  <vscale x 1 x i1>, +  i64); + +define void @intrinsic_vsoxei_mask_v_nxv1f32_nxv1f32_nxv1i64(<vscale x 1 x float> %0, ptr %1, <vscale x 1 x i64> %2, <vscale x 1 x i1> %3, 
i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv1f32_nxv1f32_nxv1i64: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e32, mf2, ta, ma +; CHECK-NEXT:    vsoxei64.v v8, (a0), v9, v0.t +; CHECK-NEXT:    ret +entry: +  call void @llvm.riscv.vsoxei.mask.nxv1f32.nxv1i64( +    <vscale x 1 x float> %0, +    ptr %1, +    <vscale x 1 x i64> %2, +    <vscale x 1 x i1> %3, +    i64 %4) + +  ret void +} + +declare void @llvm.riscv.vsoxei.nxv2f32.nxv2i64( +  <vscale x 2 x float>, +  ptr, +  <vscale x 2 x i64>, +  i64); + +define void @intrinsic_vsoxei_v_nxv2f32_nxv2f32_nxv2i64(<vscale x 2 x float> %0, ptr %1, <vscale x 2 x i64> %2, i64 %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv2f32_nxv2f32_nxv2i64: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e32, m1, ta, ma +; CHECK-NEXT:    vsoxei64.v v8, (a0), v10 +; CHECK-NEXT:    ret +entry: +  call void @llvm.riscv.vsoxei.nxv2f32.nxv2i64( +    <vscale x 2 x float> %0, +    ptr %1, +    <vscale x 2 x i64> %2, +    i64 %3) + +  ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv2f32.nxv2i64( +  <vscale x 2 x float>, +  ptr, +  <vscale x 2 x i64>, +  <vscale x 2 x i1>, +  i64); + +define void @intrinsic_vsoxei_mask_v_nxv2f32_nxv2f32_nxv2i64(<vscale x 2 x float> %0, ptr %1, <vscale x 2 x i64> %2, <vscale x 2 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv2f32_nxv2f32_nxv2i64: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e32, m1, ta, ma +; CHECK-NEXT:    vsoxei64.v v8, (a0), v10, v0.t +; CHECK-NEXT:    ret +entry: +  call void @llvm.riscv.vsoxei.mask.nxv2f32.nxv2i64( +    <vscale x 2 x float> %0, +    ptr %1, +    <vscale x 2 x i64> %2, +    <vscale x 2 x i1> %3, +    i64 %4) + +  ret void +} + +declare void @llvm.riscv.vsoxei.nxv4f32.nxv4i64( +  <vscale x 4 x float>, +  ptr, +  <vscale x 4 x i64>, +  i64); + +define void @intrinsic_vsoxei_v_nxv4f32_nxv4f32_nxv4i64(<vscale x 4 x float> %0, ptr %1, <vscale x 4 x i64> %2, i64 %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv4f32_nxv4f32_nxv4i64: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e32, m2, ta, ma +; CHECK-NEXT:    vsoxei64.v v8, (a0), v12 +; CHECK-NEXT:    ret +entry: +  call void @llvm.riscv.vsoxei.nxv4f32.nxv4i64( +    <vscale x 4 x float> %0, +    ptr %1, +    <vscale x 4 x i64> %2, +    i64 %3) + +  ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv4f32.nxv4i64( +  <vscale x 4 x float>, +  ptr, +  <vscale x 4 x i64>, +  <vscale x 4 x i1>, +  i64); + +define void @intrinsic_vsoxei_mask_v_nxv4f32_nxv4f32_nxv4i64(<vscale x 4 x float> %0, ptr %1, <vscale x 4 x i64> %2, <vscale x 4 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv4f32_nxv4f32_nxv4i64: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e32, m2, ta, ma +; CHECK-NEXT:    vsoxei64.v v8, (a0), v12, v0.t +; CHECK-NEXT:    ret +entry: +  call void @llvm.riscv.vsoxei.mask.nxv4f32.nxv4i64( +    <vscale x 4 x float> %0, +    ptr %1, +    <vscale x 4 x i64> %2, +    <vscale x 4 x i1> %3, +    i64 %4) + +  ret void +} + +declare void @llvm.riscv.vsoxei.nxv8f32.nxv8i64( +  <vscale x 8 x float>, +  ptr, +  <vscale x 8 x i64>, +  i64); + +define void @intrinsic_vsoxei_v_nxv8f32_nxv8f32_nxv8i64(<vscale x 8 x float> %0, ptr %1, <vscale x 8 x i64> %2, i64 %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv8f32_nxv8f32_nxv8i64: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e32, m4, ta, ma +; CHECK-NEXT:    vsoxei64.v v8, (a0), v16 +; 
CHECK-NEXT:    ret +entry: +  call void @llvm.riscv.vsoxei.nxv8f32.nxv8i64( +    <vscale x 8 x float> %0, +    ptr %1, +    <vscale x 8 x i64> %2, +    i64 %3) + +  ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv8f32.nxv8i64( +  <vscale x 8 x float>, +  ptr, +  <vscale x 8 x i64>, +  <vscale x 8 x i1>, +  i64); + +define void @intrinsic_vsoxei_mask_v_nxv8f32_nxv8f32_nxv8i64(<vscale x 8 x float> %0, ptr %1, <vscale x 8 x i64> %2, <vscale x 8 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv8f32_nxv8f32_nxv8i64: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e32, m4, ta, ma +; CHECK-NEXT:    vsoxei64.v v8, (a0), v16, v0.t +; CHECK-NEXT:    ret +entry: +  call void @llvm.riscv.vsoxei.mask.nxv8f32.nxv8i64( +    <vscale x 8 x float> %0, +    ptr %1, +    <vscale x 8 x i64> %2, +    <vscale x 8 x i1> %3, +    i64 %4) + +  ret void +} + +declare void @llvm.riscv.vsoxei.nxv1f64.nxv1i64( +  <vscale x 1 x double>, +  ptr, +  <vscale x 1 x i64>, +  i64); + +define void @intrinsic_vsoxei_v_nxv1f64_nxv1f64_nxv1i64(<vscale x 1 x double> %0, ptr %1, <vscale x 1 x i64> %2, i64 %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv1f64_nxv1f64_nxv1i64: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e64, m1, ta, ma +; CHECK-NEXT:    vsoxei64.v v8, (a0), v9 +; CHECK-NEXT:    ret +entry: +  call void @llvm.riscv.vsoxei.nxv1f64.nxv1i64( +    <vscale x 1 x double> %0, +    ptr %1, +    <vscale x 1 x i64> %2, +    i64 %3) + +  ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv1f64.nxv1i64( +  <vscale x 1 x double>, +  ptr, +  <vscale x 1 x i64>, +  <vscale x 1 x i1>, +  i64); + +define void @intrinsic_vsoxei_mask_v_nxv1f64_nxv1f64_nxv1i64(<vscale x 1 x double> %0, ptr %1, <vscale x 1 x i64> %2, <vscale x 1 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv1f64_nxv1f64_nxv1i64: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e64, m1, ta, ma +; CHECK-NEXT:    vsoxei64.v v8, (a0), v9, v0.t +; CHECK-NEXT:    ret +entry: +  call void @llvm.riscv.vsoxei.mask.nxv1f64.nxv1i64( +    <vscale x 1 x double> %0, +    ptr %1, +    <vscale x 1 x i64> %2, +    <vscale x 1 x i1> %3, +    i64 %4) + +  ret void +} + +declare void @llvm.riscv.vsoxei.nxv2f64.nxv2i64( +  <vscale x 2 x double>, +  ptr, +  <vscale x 2 x i64>, +  i64); + +define void @intrinsic_vsoxei_v_nxv2f64_nxv2f64_nxv2i64(<vscale x 2 x double> %0, ptr %1, <vscale x 2 x i64> %2, i64 %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv2f64_nxv2f64_nxv2i64: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e64, m2, ta, ma +; CHECK-NEXT:    vsoxei64.v v8, (a0), v10 +; CHECK-NEXT:    ret +entry: +  call void @llvm.riscv.vsoxei.nxv2f64.nxv2i64( +    <vscale x 2 x double> %0, +    ptr %1, +    <vscale x 2 x i64> %2, +    i64 %3) + +  ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv2f64.nxv2i64( +  <vscale x 2 x double>, +  ptr, +  <vscale x 2 x i64>, +  <vscale x 2 x i1>, +  i64); + +define void @intrinsic_vsoxei_mask_v_nxv2f64_nxv2f64_nxv2i64(<vscale x 2 x double> %0, ptr %1, <vscale x 2 x i64> %2, <vscale x 2 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv2f64_nxv2f64_nxv2i64: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e64, m2, ta, ma +; CHECK-NEXT:    vsoxei64.v v8, (a0), v10, v0.t +; CHECK-NEXT:    ret +entry: +  call void @llvm.riscv.vsoxei.mask.nxv2f64.nxv2i64( +    <vscale x 2 x double> %0, +    ptr %1, +    <vscale x 2 x i64> %2, +    <vscale x 2 x i1> %3, +    
i64 %4) + +  ret void +} + +declare void @llvm.riscv.vsoxei.nxv4f64.nxv4i64( +  <vscale x 4 x double>, +  ptr, +  <vscale x 4 x i64>, +  i64); + +define void @intrinsic_vsoxei_v_nxv4f64_nxv4f64_nxv4i64(<vscale x 4 x double> %0, ptr %1, <vscale x 4 x i64> %2, i64 %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv4f64_nxv4f64_nxv4i64: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e64, m4, ta, ma +; CHECK-NEXT:    vsoxei64.v v8, (a0), v12 +; CHECK-NEXT:    ret +entry: +  call void @llvm.riscv.vsoxei.nxv4f64.nxv4i64( +    <vscale x 4 x double> %0, +    ptr %1, +    <vscale x 4 x i64> %2, +    i64 %3) + +  ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv4f64.nxv4i64( +  <vscale x 4 x double>, +  ptr, +  <vscale x 4 x i64>, +  <vscale x 4 x i1>, +  i64); + +define void @intrinsic_vsoxei_mask_v_nxv4f64_nxv4f64_nxv4i64(<vscale x 4 x double> %0, ptr %1, <vscale x 4 x i64> %2, <vscale x 4 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv4f64_nxv4f64_nxv4i64: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e64, m4, ta, ma +; CHECK-NEXT:    vsoxei64.v v8, (a0), v12, v0.t +; CHECK-NEXT:    ret +entry: +  call void @llvm.riscv.vsoxei.mask.nxv4f64.nxv4i64( +    <vscale x 4 x double> %0, +    ptr %1, +    <vscale x 4 x i64> %2, +    <vscale x 4 x i1> %3, +    i64 %4) + +  ret void +} + +declare void @llvm.riscv.vsoxei.nxv8f64.nxv8i64( +  <vscale x 8 x double>, +  ptr, +  <vscale x 8 x i64>, +  i64); + +define void @intrinsic_vsoxei_v_nxv8f64_nxv8f64_nxv8i64(<vscale x 8 x double> %0, ptr %1, <vscale x 8 x i64> %2, i64 %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv8f64_nxv8f64_nxv8i64: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e64, m8, ta, ma +; CHECK-NEXT:    vsoxei64.v v8, (a0), v16 +; CHECK-NEXT:    ret +entry: +  call void @llvm.riscv.vsoxei.nxv8f64.nxv8i64( +    <vscale x 8 x double> %0, +    ptr %1, +    <vscale x 8 x i64> %2, +    i64 %3) + +  ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv8f64.nxv8i64( +  <vscale x 8 x double>, +  ptr, +  <vscale x 8 x i64>, +  <vscale x 8 x i1>, +  i64); + +define void @intrinsic_vsoxei_mask_v_nxv8f64_nxv8f64_nxv8i64(<vscale x 8 x double> %0, ptr %1, <vscale x 8 x i64> %2, <vscale x 8 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv8f64_nxv8f64_nxv8i64: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e64, m8, ta, ma +; CHECK-NEXT:    vsoxei64.v v8, (a0), v16, v0.t +; CHECK-NEXT:    ret +entry: +  call void @llvm.riscv.vsoxei.mask.nxv8f64.nxv8i64( +    <vscale x 8 x double> %0, +    ptr %1, +    <vscale x 8 x i64> %2, +    <vscale x 8 x i1> %3, +    i64 %4) + +  ret void +} diff --git a/llvm/test/CodeGen/RISCV/GlobalISel/rvv/vsoxei.ll b/llvm/test/CodeGen/RISCV/GlobalISel/rvv/vsoxei.ll new file mode 100644 index 0000000..7ea2e17 --- /dev/null +++ b/llvm/test/CodeGen/RISCV/GlobalISel/rvv/vsoxei.ll @@ -0,0 +1,4881 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: sed 's/iXLen/i32/g' %s | llc -mtriple=riscv32 -mattr=+v,+zvfhmin,+zvfbfmin \ +; RUN:   -global-isel -verify-machineinstrs -target-abi=ilp32d | FileCheck %s +; RUN: sed 's/iXLen/i64/g' %s | llc -mtriple=riscv64 -mattr=+v,+zvfhmin,+zvfbfmin \ +; RUN:   -global-isel -verify-machineinstrs -target-abi=lp64d | FileCheck %s + +declare void @llvm.riscv.vsoxei.nxv1i8.nxv1i32( +  <vscale x 1 x i8>, +  ptr, +  <vscale x 1 x i32>, +  iXLen); + +define void @intrinsic_vsoxei_v_nxv1i8_nxv1i8_nxv1i32(<vscale x 1 x i8> 
%0, ptr %1, <vscale x 1 x i32> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv1i8_nxv1i8_nxv1i32: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e8, mf8, ta, ma +; CHECK-NEXT:    vsoxei32.v v8, (a0), v9 +; CHECK-NEXT:    ret +entry: +  call void @llvm.riscv.vsoxei.nxv1i8.nxv1i32( +    <vscale x 1 x i8> %0, +    ptr %1, +    <vscale x 1 x i32> %2, +    iXLen %3) + +  ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv1i8.nxv1i32( +  <vscale x 1 x i8>, +  ptr, +  <vscale x 1 x i32>, +  <vscale x 1 x i1>, +  iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv1i8_nxv1i8_nxv1i32(<vscale x 1 x i8> %0, ptr %1, <vscale x 1 x i32> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv1i8_nxv1i8_nxv1i32: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e8, mf8, ta, ma +; CHECK-NEXT:    vsoxei32.v v8, (a0), v9, v0.t +; CHECK-NEXT:    ret +entry: +  call void @llvm.riscv.vsoxei.mask.nxv1i8.nxv1i32( +    <vscale x 1 x i8> %0, +    ptr %1, +    <vscale x 1 x i32> %2, +    <vscale x 1 x i1> %3, +    iXLen %4) + +  ret void +} + +declare void @llvm.riscv.vsoxei.nxv2i8.nxv2i32( +  <vscale x 2 x i8>, +  ptr, +  <vscale x 2 x i32>, +  iXLen); + +define void @intrinsic_vsoxei_v_nxv2i8_nxv2i8_nxv2i32(<vscale x 2 x i8> %0, ptr %1, <vscale x 2 x i32> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv2i8_nxv2i8_nxv2i32: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e8, mf4, ta, ma +; CHECK-NEXT:    vsoxei32.v v8, (a0), v9 +; CHECK-NEXT:    ret +entry: +  call void @llvm.riscv.vsoxei.nxv2i8.nxv2i32( +    <vscale x 2 x i8> %0, +    ptr %1, +    <vscale x 2 x i32> %2, +    iXLen %3) + +  ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv2i8.nxv2i32( +  <vscale x 2 x i8>, +  ptr, +  <vscale x 2 x i32>, +  <vscale x 2 x i1>, +  iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv2i8_nxv2i8_nxv2i32(<vscale x 2 x i8> %0, ptr %1, <vscale x 2 x i32> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv2i8_nxv2i8_nxv2i32: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e8, mf4, ta, ma +; CHECK-NEXT:    vsoxei32.v v8, (a0), v9, v0.t +; CHECK-NEXT:    ret +entry: +  call void @llvm.riscv.vsoxei.mask.nxv2i8.nxv2i32( +    <vscale x 2 x i8> %0, +    ptr %1, +    <vscale x 2 x i32> %2, +    <vscale x 2 x i1> %3, +    iXLen %4) + +  ret void +} + +declare void @llvm.riscv.vsoxei.nxv4i8.nxv4i32( +  <vscale x 4 x i8>, +  ptr, +  <vscale x 4 x i32>, +  iXLen); + +define void @intrinsic_vsoxei_v_nxv4i8_nxv4i8_nxv4i32(<vscale x 4 x i8> %0, ptr %1, <vscale x 4 x i32> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv4i8_nxv4i8_nxv4i32: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e8, mf2, ta, ma +; CHECK-NEXT:    vsoxei32.v v8, (a0), v10 +; CHECK-NEXT:    ret +entry: +  call void @llvm.riscv.vsoxei.nxv4i8.nxv4i32( +    <vscale x 4 x i8> %0, +    ptr %1, +    <vscale x 4 x i32> %2, +    iXLen %3) + +  ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv4i8.nxv4i32( +  <vscale x 4 x i8>, +  ptr, +  <vscale x 4 x i32>, +  <vscale x 4 x i1>, +  iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv4i8_nxv4i8_nxv4i32(<vscale x 4 x i8> %0, ptr %1, <vscale x 4 x i32> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv4i8_nxv4i8_nxv4i32: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e8, mf2, ta, ma +; CHECK-NEXT:    vsoxei32.v v8, (a0), v10, v0.t 
+; CHECK-NEXT:    ret +entry: +  call void @llvm.riscv.vsoxei.mask.nxv4i8.nxv4i32( +    <vscale x 4 x i8> %0, +    ptr %1, +    <vscale x 4 x i32> %2, +    <vscale x 4 x i1> %3, +    iXLen %4) + +  ret void +} + +declare void @llvm.riscv.vsoxei.nxv8i8.nxv8i32( +  <vscale x 8 x i8>, +  ptr, +  <vscale x 8 x i32>, +  iXLen); + +define void @intrinsic_vsoxei_v_nxv8i8_nxv8i8_nxv8i32(<vscale x 8 x i8> %0, ptr %1, <vscale x 8 x i32> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv8i8_nxv8i8_nxv8i32: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e8, m1, ta, ma +; CHECK-NEXT:    vsoxei32.v v8, (a0), v12 +; CHECK-NEXT:    ret +entry: +  call void @llvm.riscv.vsoxei.nxv8i8.nxv8i32( +    <vscale x 8 x i8> %0, +    ptr %1, +    <vscale x 8 x i32> %2, +    iXLen %3) + +  ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv8i8.nxv8i32( +  <vscale x 8 x i8>, +  ptr, +  <vscale x 8 x i32>, +  <vscale x 8 x i1>, +  iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv8i8_nxv8i8_nxv8i32(<vscale x 8 x i8> %0, ptr %1, <vscale x 8 x i32> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv8i8_nxv8i8_nxv8i32: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e8, m1, ta, ma +; CHECK-NEXT:    vsoxei32.v v8, (a0), v12, v0.t +; CHECK-NEXT:    ret +entry: +  call void @llvm.riscv.vsoxei.mask.nxv8i8.nxv8i32( +    <vscale x 8 x i8> %0, +    ptr %1, +    <vscale x 8 x i32> %2, +    <vscale x 8 x i1> %3, +    iXLen %4) + +  ret void +} + +declare void @llvm.riscv.vsoxei.nxv16i8.nxv16i32( +  <vscale x 16 x i8>, +  ptr, +  <vscale x 16 x i32>, +  iXLen); + +define void @intrinsic_vsoxei_v_nxv16i8_nxv16i8_nxv16i32(<vscale x 16 x i8> %0, ptr %1, <vscale x 16 x i32> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv16i8_nxv16i8_nxv16i32: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e8, m2, ta, ma +; CHECK-NEXT:    vsoxei32.v v8, (a0), v16 +; CHECK-NEXT:    ret +entry: +  call void @llvm.riscv.vsoxei.nxv16i8.nxv16i32( +    <vscale x 16 x i8> %0, +    ptr %1, +    <vscale x 16 x i32> %2, +    iXLen %3) + +  ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv16i8.nxv16i32( +  <vscale x 16 x i8>, +  ptr, +  <vscale x 16 x i32>, +  <vscale x 16 x i1>, +  iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv16i8_nxv16i8_nxv16i32(<vscale x 16 x i8> %0, ptr %1, <vscale x 16 x i32> %2, <vscale x 16 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv16i8_nxv16i8_nxv16i32: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e8, m2, ta, ma +; CHECK-NEXT:    vsoxei32.v v8, (a0), v16, v0.t +; CHECK-NEXT:    ret +entry: +  call void @llvm.riscv.vsoxei.mask.nxv16i8.nxv16i32( +    <vscale x 16 x i8> %0, +    ptr %1, +    <vscale x 16 x i32> %2, +    <vscale x 16 x i1> %3, +    iXLen %4) + +  ret void +} + +declare void @llvm.riscv.vsoxei.nxv1i16.nxv1i32( +  <vscale x 1 x i16>, +  ptr, +  <vscale x 1 x i32>, +  iXLen); + +define void @intrinsic_vsoxei_v_nxv1i16_nxv1i16_nxv1i32(<vscale x 1 x i16> %0, ptr %1, <vscale x 1 x i32> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv1i16_nxv1i16_nxv1i32: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e16, mf4, ta, ma +; CHECK-NEXT:    vsoxei32.v v8, (a0), v9 +; CHECK-NEXT:    ret +entry: +  call void @llvm.riscv.vsoxei.nxv1i16.nxv1i32( +    <vscale x 1 x i16> %0, +    ptr %1, +    <vscale x 1 x i32> %2, +    iXLen %3) + +  ret void +} + +declare void 
@llvm.riscv.vsoxei.mask.nxv1i16.nxv1i32( +  <vscale x 1 x i16>, +  ptr, +  <vscale x 1 x i32>, +  <vscale x 1 x i1>, +  iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv1i16_nxv1i16_nxv1i32(<vscale x 1 x i16> %0, ptr %1, <vscale x 1 x i32> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv1i16_nxv1i16_nxv1i32: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e16, mf4, ta, ma +; CHECK-NEXT:    vsoxei32.v v8, (a0), v9, v0.t +; CHECK-NEXT:    ret +entry: +  call void @llvm.riscv.vsoxei.mask.nxv1i16.nxv1i32( +    <vscale x 1 x i16> %0, +    ptr %1, +    <vscale x 1 x i32> %2, +    <vscale x 1 x i1> %3, +    iXLen %4) + +  ret void +} + +declare void @llvm.riscv.vsoxei.nxv2i16.nxv2i32( +  <vscale x 2 x i16>, +  ptr, +  <vscale x 2 x i32>, +  iXLen); + +define void @intrinsic_vsoxei_v_nxv2i16_nxv2i16_nxv2i32(<vscale x 2 x i16> %0, ptr %1, <vscale x 2 x i32> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv2i16_nxv2i16_nxv2i32: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e16, mf2, ta, ma +; CHECK-NEXT:    vsoxei32.v v8, (a0), v9 +; CHECK-NEXT:    ret +entry: +  call void @llvm.riscv.vsoxei.nxv2i16.nxv2i32( +    <vscale x 2 x i16> %0, +    ptr %1, +    <vscale x 2 x i32> %2, +    iXLen %3) + +  ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv2i16.nxv2i32( +  <vscale x 2 x i16>, +  ptr, +  <vscale x 2 x i32>, +  <vscale x 2 x i1>, +  iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv2i16_nxv2i16_nxv2i32(<vscale x 2 x i16> %0, ptr %1, <vscale x 2 x i32> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv2i16_nxv2i16_nxv2i32: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e16, mf2, ta, ma +; CHECK-NEXT:    vsoxei32.v v8, (a0), v9, v0.t +; CHECK-NEXT:    ret +entry: +  call void @llvm.riscv.vsoxei.mask.nxv2i16.nxv2i32( +    <vscale x 2 x i16> %0, +    ptr %1, +    <vscale x 2 x i32> %2, +    <vscale x 2 x i1> %3, +    iXLen %4) + +  ret void +} + +declare void @llvm.riscv.vsoxei.nxv4i16.nxv4i32( +  <vscale x 4 x i16>, +  ptr, +  <vscale x 4 x i32>, +  iXLen); + +define void @intrinsic_vsoxei_v_nxv4i16_nxv4i16_nxv4i32(<vscale x 4 x i16> %0, ptr %1, <vscale x 4 x i32> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv4i16_nxv4i16_nxv4i32: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e16, m1, ta, ma +; CHECK-NEXT:    vsoxei32.v v8, (a0), v10 +; CHECK-NEXT:    ret +entry: +  call void @llvm.riscv.vsoxei.nxv4i16.nxv4i32( +    <vscale x 4 x i16> %0, +    ptr %1, +    <vscale x 4 x i32> %2, +    iXLen %3) + +  ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv4i16.nxv4i32( +  <vscale x 4 x i16>, +  ptr, +  <vscale x 4 x i32>, +  <vscale x 4 x i1>, +  iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv4i16_nxv4i16_nxv4i32(<vscale x 4 x i16> %0, ptr %1, <vscale x 4 x i32> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv4i16_nxv4i16_nxv4i32: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e16, m1, ta, ma +; CHECK-NEXT:    vsoxei32.v v8, (a0), v10, v0.t +; CHECK-NEXT:    ret +entry: +  call void @llvm.riscv.vsoxei.mask.nxv4i16.nxv4i32( +    <vscale x 4 x i16> %0, +    ptr %1, +    <vscale x 4 x i32> %2, +    <vscale x 4 x i1> %3, +    iXLen %4) + +  ret void +} + +declare void @llvm.riscv.vsoxei.nxv8i16.nxv8i32( +  <vscale x 8 x i16>, +  ptr, +  <vscale x 8 x i32>, +  iXLen); + +define void 
@intrinsic_vsoxei_v_nxv8i16_nxv8i16_nxv8i32(<vscale x 8 x i16> %0, ptr %1, <vscale x 8 x i32> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv8i16_nxv8i16_nxv8i32: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e16, m2, ta, ma +; CHECK-NEXT:    vsoxei32.v v8, (a0), v12 +; CHECK-NEXT:    ret +entry: +  call void @llvm.riscv.vsoxei.nxv8i16.nxv8i32( +    <vscale x 8 x i16> %0, +    ptr %1, +    <vscale x 8 x i32> %2, +    iXLen %3) + +  ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv8i16.nxv8i32( +  <vscale x 8 x i16>, +  ptr, +  <vscale x 8 x i32>, +  <vscale x 8 x i1>, +  iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv8i16_nxv8i16_nxv8i32(<vscale x 8 x i16> %0, ptr %1, <vscale x 8 x i32> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv8i16_nxv8i16_nxv8i32: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e16, m2, ta, ma +; CHECK-NEXT:    vsoxei32.v v8, (a0), v12, v0.t +; CHECK-NEXT:    ret +entry: +  call void @llvm.riscv.vsoxei.mask.nxv8i16.nxv8i32( +    <vscale x 8 x i16> %0, +    ptr %1, +    <vscale x 8 x i32> %2, +    <vscale x 8 x i1> %3, +    iXLen %4) + +  ret void +} + +declare void @llvm.riscv.vsoxei.nxv16i16.nxv16i32( +  <vscale x 16 x i16>, +  ptr, +  <vscale x 16 x i32>, +  iXLen); + +define void @intrinsic_vsoxei_v_nxv16i16_nxv16i16_nxv16i32(<vscale x 16 x i16> %0, ptr %1, <vscale x 16 x i32> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv16i16_nxv16i16_nxv16i32: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e16, m4, ta, ma +; CHECK-NEXT:    vsoxei32.v v8, (a0), v16 +; CHECK-NEXT:    ret +entry: +  call void @llvm.riscv.vsoxei.nxv16i16.nxv16i32( +    <vscale x 16 x i16> %0, +    ptr %1, +    <vscale x 16 x i32> %2, +    iXLen %3) + +  ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv16i16.nxv16i32( +  <vscale x 16 x i16>, +  ptr, +  <vscale x 16 x i32>, +  <vscale x 16 x i1>, +  iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv16i16_nxv16i16_nxv16i32(<vscale x 16 x i16> %0, ptr %1, <vscale x 16 x i32> %2, <vscale x 16 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv16i16_nxv16i16_nxv16i32: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e16, m4, ta, ma +; CHECK-NEXT:    vsoxei32.v v8, (a0), v16, v0.t +; CHECK-NEXT:    ret +entry: +  call void @llvm.riscv.vsoxei.mask.nxv16i16.nxv16i32( +    <vscale x 16 x i16> %0, +    ptr %1, +    <vscale x 16 x i32> %2, +    <vscale x 16 x i1> %3, +    iXLen %4) + +  ret void +} + +declare void @llvm.riscv.vsoxei.nxv1i32.nxv1i32( +  <vscale x 1 x i32>, +  ptr, +  <vscale x 1 x i32>, +  iXLen); + +define void @intrinsic_vsoxei_v_nxv1i32_nxv1i32_nxv1i32(<vscale x 1 x i32> %0, ptr %1, <vscale x 1 x i32> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv1i32_nxv1i32_nxv1i32: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e32, mf2, ta, ma +; CHECK-NEXT:    vsoxei32.v v8, (a0), v9 +; CHECK-NEXT:    ret +entry: +  call void @llvm.riscv.vsoxei.nxv1i32.nxv1i32( +    <vscale x 1 x i32> %0, +    ptr %1, +    <vscale x 1 x i32> %2, +    iXLen %3) + +  ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv1i32.nxv1i32( +  <vscale x 1 x i32>, +  ptr, +  <vscale x 1 x i32>, +  <vscale x 1 x i1>, +  iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv1i32_nxv1i32_nxv1i32(<vscale x 1 x i32> %0, ptr %1, <vscale x 1 x i32> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: 
intrinsic_vsoxei_mask_v_nxv1i32_nxv1i32_nxv1i32: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e32, mf2, ta, ma +; CHECK-NEXT:    vsoxei32.v v8, (a0), v9, v0.t +; CHECK-NEXT:    ret +entry: +  call void @llvm.riscv.vsoxei.mask.nxv1i32.nxv1i32( +    <vscale x 1 x i32> %0, +    ptr %1, +    <vscale x 1 x i32> %2, +    <vscale x 1 x i1> %3, +    iXLen %4) + +  ret void +} + +declare void @llvm.riscv.vsoxei.nxv2i32.nxv2i32( +  <vscale x 2 x i32>, +  ptr, +  <vscale x 2 x i32>, +  iXLen); + +define void @intrinsic_vsoxei_v_nxv2i32_nxv2i32_nxv2i32(<vscale x 2 x i32> %0, ptr %1, <vscale x 2 x i32> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv2i32_nxv2i32_nxv2i32: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e32, m1, ta, ma +; CHECK-NEXT:    vsoxei32.v v8, (a0), v9 +; CHECK-NEXT:    ret +entry: +  call void @llvm.riscv.vsoxei.nxv2i32.nxv2i32( +    <vscale x 2 x i32> %0, +    ptr %1, +    <vscale x 2 x i32> %2, +    iXLen %3) + +  ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv2i32.nxv2i32( +  <vscale x 2 x i32>, +  ptr, +  <vscale x 2 x i32>, +  <vscale x 2 x i1>, +  iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv2i32_nxv2i32_nxv2i32(<vscale x 2 x i32> %0, ptr %1, <vscale x 2 x i32> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv2i32_nxv2i32_nxv2i32: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e32, m1, ta, ma +; CHECK-NEXT:    vsoxei32.v v8, (a0), v9, v0.t +; CHECK-NEXT:    ret +entry: +  call void @llvm.riscv.vsoxei.mask.nxv2i32.nxv2i32( +    <vscale x 2 x i32> %0, +    ptr %1, +    <vscale x 2 x i32> %2, +    <vscale x 2 x i1> %3, +    iXLen %4) + +  ret void +} + +declare void @llvm.riscv.vsoxei.nxv4i32.nxv4i32( +  <vscale x 4 x i32>, +  ptr, +  <vscale x 4 x i32>, +  iXLen); + +define void @intrinsic_vsoxei_v_nxv4i32_nxv4i32_nxv4i32(<vscale x 4 x i32> %0, ptr %1, <vscale x 4 x i32> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv4i32_nxv4i32_nxv4i32: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e32, m2, ta, ma +; CHECK-NEXT:    vsoxei32.v v8, (a0), v10 +; CHECK-NEXT:    ret +entry: +  call void @llvm.riscv.vsoxei.nxv4i32.nxv4i32( +    <vscale x 4 x i32> %0, +    ptr %1, +    <vscale x 4 x i32> %2, +    iXLen %3) + +  ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv4i32.nxv4i32( +  <vscale x 4 x i32>, +  ptr, +  <vscale x 4 x i32>, +  <vscale x 4 x i1>, +  iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv4i32_nxv4i32_nxv4i32(<vscale x 4 x i32> %0, ptr %1, <vscale x 4 x i32> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv4i32_nxv4i32_nxv4i32: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e32, m2, ta, ma +; CHECK-NEXT:    vsoxei32.v v8, (a0), v10, v0.t +; CHECK-NEXT:    ret +entry: +  call void @llvm.riscv.vsoxei.mask.nxv4i32.nxv4i32( +    <vscale x 4 x i32> %0, +    ptr %1, +    <vscale x 4 x i32> %2, +    <vscale x 4 x i1> %3, +    iXLen %4) + +  ret void +} + +declare void @llvm.riscv.vsoxei.nxv8i32.nxv8i32( +  <vscale x 8 x i32>, +  ptr, +  <vscale x 8 x i32>, +  iXLen); + +define void @intrinsic_vsoxei_v_nxv8i32_nxv8i32_nxv8i32(<vscale x 8 x i32> %0, ptr %1, <vscale x 8 x i32> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv8i32_nxv8i32_nxv8i32: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e32, m4, ta, ma +; CHECK-NEXT:    vsoxei32.v v8, (a0), v12 +; CHECK-NEXT:    ret +entry: +  call void 
@llvm.riscv.vsoxei.nxv8i32.nxv8i32( +    <vscale x 8 x i32> %0, +    ptr %1, +    <vscale x 8 x i32> %2, +    iXLen %3) + +  ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv8i32.nxv8i32( +  <vscale x 8 x i32>, +  ptr, +  <vscale x 8 x i32>, +  <vscale x 8 x i1>, +  iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv8i32_nxv8i32_nxv8i32(<vscale x 8 x i32> %0, ptr %1, <vscale x 8 x i32> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv8i32_nxv8i32_nxv8i32: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e32, m4, ta, ma +; CHECK-NEXT:    vsoxei32.v v8, (a0), v12, v0.t +; CHECK-NEXT:    ret +entry: +  call void @llvm.riscv.vsoxei.mask.nxv8i32.nxv8i32( +    <vscale x 8 x i32> %0, +    ptr %1, +    <vscale x 8 x i32> %2, +    <vscale x 8 x i1> %3, +    iXLen %4) + +  ret void +} + +declare void @llvm.riscv.vsoxei.nxv16i32.nxv16i32( +  <vscale x 16 x i32>, +  ptr, +  <vscale x 16 x i32>, +  iXLen); + +define void @intrinsic_vsoxei_v_nxv16i32_nxv16i32_nxv16i32(<vscale x 16 x i32> %0, ptr %1, <vscale x 16 x i32> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv16i32_nxv16i32_nxv16i32: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e32, m8, ta, ma +; CHECK-NEXT:    vsoxei32.v v8, (a0), v16 +; CHECK-NEXT:    ret +entry: +  call void @llvm.riscv.vsoxei.nxv16i32.nxv16i32( +    <vscale x 16 x i32> %0, +    ptr %1, +    <vscale x 16 x i32> %2, +    iXLen %3) + +  ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv16i32.nxv16i32( +  <vscale x 16 x i32>, +  ptr, +  <vscale x 16 x i32>, +  <vscale x 16 x i1>, +  iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv16i32_nxv16i32_nxv16i32(<vscale x 16 x i32> %0, ptr %1, <vscale x 16 x i32> %2, <vscale x 16 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv16i32_nxv16i32_nxv16i32: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e32, m8, ta, ma +; CHECK-NEXT:    vsoxei32.v v8, (a0), v16, v0.t +; CHECK-NEXT:    ret +entry: +  call void @llvm.riscv.vsoxei.mask.nxv16i32.nxv16i32( +    <vscale x 16 x i32> %0, +    ptr %1, +    <vscale x 16 x i32> %2, +    <vscale x 16 x i1> %3, +    iXLen %4) + +  ret void +} + +declare void @llvm.riscv.vsoxei.nxv1i64.nxv1i32( +  <vscale x 1 x i64>, +  ptr, +  <vscale x 1 x i32>, +  iXLen); + +define void @intrinsic_vsoxei_v_nxv1i64_nxv1i64_nxv1i32(<vscale x 1 x i64> %0, ptr %1, <vscale x 1 x i32> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv1i64_nxv1i64_nxv1i32: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e64, m1, ta, ma +; CHECK-NEXT:    vsoxei32.v v8, (a0), v9 +; CHECK-NEXT:    ret +entry: +  call void @llvm.riscv.vsoxei.nxv1i64.nxv1i32( +    <vscale x 1 x i64> %0, +    ptr %1, +    <vscale x 1 x i32> %2, +    iXLen %3) + +  ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv1i64.nxv1i32( +  <vscale x 1 x i64>, +  ptr, +  <vscale x 1 x i32>, +  <vscale x 1 x i1>, +  iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv1i64_nxv1i64_nxv1i32(<vscale x 1 x i64> %0, ptr %1, <vscale x 1 x i32> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv1i64_nxv1i64_nxv1i32: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e64, m1, ta, ma +; CHECK-NEXT:    vsoxei32.v v8, (a0), v9, v0.t +; CHECK-NEXT:    ret +entry: +  call void @llvm.riscv.vsoxei.mask.nxv1i64.nxv1i32( +    <vscale x 1 x i64> %0, +    ptr %1, +    <vscale x 1 x i32> %2, +    <vscale x 1 x i1> %3, +    iXLen %4) + +  ret 
void +} + +declare void @llvm.riscv.vsoxei.nxv2i64.nxv2i32( +  <vscale x 2 x i64>, +  ptr, +  <vscale x 2 x i32>, +  iXLen); + +define void @intrinsic_vsoxei_v_nxv2i64_nxv2i64_nxv2i32(<vscale x 2 x i64> %0, ptr %1, <vscale x 2 x i32> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv2i64_nxv2i64_nxv2i32: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e64, m2, ta, ma +; CHECK-NEXT:    vsoxei32.v v8, (a0), v10 +; CHECK-NEXT:    ret +entry: +  call void @llvm.riscv.vsoxei.nxv2i64.nxv2i32( +    <vscale x 2 x i64> %0, +    ptr %1, +    <vscale x 2 x i32> %2, +    iXLen %3) + +  ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv2i64.nxv2i32( +  <vscale x 2 x i64>, +  ptr, +  <vscale x 2 x i32>, +  <vscale x 2 x i1>, +  iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv2i64_nxv2i64_nxv2i32(<vscale x 2 x i64> %0, ptr %1, <vscale x 2 x i32> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv2i64_nxv2i64_nxv2i32: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e64, m2, ta, ma +; CHECK-NEXT:    vsoxei32.v v8, (a0), v10, v0.t +; CHECK-NEXT:    ret +entry: +  call void @llvm.riscv.vsoxei.mask.nxv2i64.nxv2i32( +    <vscale x 2 x i64> %0, +    ptr %1, +    <vscale x 2 x i32> %2, +    <vscale x 2 x i1> %3, +    iXLen %4) + +  ret void +} + +declare void @llvm.riscv.vsoxei.nxv4i64.nxv4i32( +  <vscale x 4 x i64>, +  ptr, +  <vscale x 4 x i32>, +  iXLen); + +define void @intrinsic_vsoxei_v_nxv4i64_nxv4i64_nxv4i32(<vscale x 4 x i64> %0, ptr %1, <vscale x 4 x i32> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv4i64_nxv4i64_nxv4i32: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e64, m4, ta, ma +; CHECK-NEXT:    vsoxei32.v v8, (a0), v12 +; CHECK-NEXT:    ret +entry: +  call void @llvm.riscv.vsoxei.nxv4i64.nxv4i32( +    <vscale x 4 x i64> %0, +    ptr %1, +    <vscale x 4 x i32> %2, +    iXLen %3) + +  ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv4i64.nxv4i32( +  <vscale x 4 x i64>, +  ptr, +  <vscale x 4 x i32>, +  <vscale x 4 x i1>, +  iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv4i64_nxv4i64_nxv4i32(<vscale x 4 x i64> %0, ptr %1, <vscale x 4 x i32> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv4i64_nxv4i64_nxv4i32: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e64, m4, ta, ma +; CHECK-NEXT:    vsoxei32.v v8, (a0), v12, v0.t +; CHECK-NEXT:    ret +entry: +  call void @llvm.riscv.vsoxei.mask.nxv4i64.nxv4i32( +    <vscale x 4 x i64> %0, +    ptr %1, +    <vscale x 4 x i32> %2, +    <vscale x 4 x i1> %3, +    iXLen %4) + +  ret void +} + +declare void @llvm.riscv.vsoxei.nxv8i64.nxv8i32( +  <vscale x 8 x i64>, +  ptr, +  <vscale x 8 x i32>, +  iXLen); + +define void @intrinsic_vsoxei_v_nxv8i64_nxv8i64_nxv8i32(<vscale x 8 x i64> %0, ptr %1, <vscale x 8 x i32> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv8i64_nxv8i64_nxv8i32: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e64, m8, ta, ma +; CHECK-NEXT:    vsoxei32.v v8, (a0), v16 +; CHECK-NEXT:    ret +entry: +  call void @llvm.riscv.vsoxei.nxv8i64.nxv8i32( +    <vscale x 8 x i64> %0, +    ptr %1, +    <vscale x 8 x i32> %2, +    iXLen %3) + +  ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv8i64.nxv8i32( +  <vscale x 8 x i64>, +  ptr, +  <vscale x 8 x i32>, +  <vscale x 8 x i1>, +  iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv8i64_nxv8i64_nxv8i32(<vscale x 8 x i64> %0, ptr %1, <vscale x 8 x 
i32> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv8i64_nxv8i64_nxv8i32: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e64, m8, ta, ma +; CHECK-NEXT:    vsoxei32.v v8, (a0), v16, v0.t +; CHECK-NEXT:    ret +entry: +  call void @llvm.riscv.vsoxei.mask.nxv8i64.nxv8i32( +    <vscale x 8 x i64> %0, +    ptr %1, +    <vscale x 8 x i32> %2, +    <vscale x 8 x i1> %3, +    iXLen %4) + +  ret void +} + +declare void @llvm.riscv.vsoxei.nxv1f16.nxv1i32( +  <vscale x 1 x half>, +  ptr, +  <vscale x 1 x i32>, +  iXLen); + +define void @intrinsic_vsoxei_v_nxv1f16_nxv1f16_nxv1i32(<vscale x 1 x half> %0, ptr %1, <vscale x 1 x i32> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv1f16_nxv1f16_nxv1i32: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e16, mf4, ta, ma +; CHECK-NEXT:    vsoxei32.v v8, (a0), v9 +; CHECK-NEXT:    ret +entry: +  call void @llvm.riscv.vsoxei.nxv1f16.nxv1i32( +    <vscale x 1 x half> %0, +    ptr %1, +    <vscale x 1 x i32> %2, +    iXLen %3) + +  ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv1f16.nxv1i32( +  <vscale x 1 x half>, +  ptr, +  <vscale x 1 x i32>, +  <vscale x 1 x i1>, +  iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv1f16_nxv1f16_nxv1i32(<vscale x 1 x half> %0, ptr %1, <vscale x 1 x i32> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv1f16_nxv1f16_nxv1i32: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e16, mf4, ta, ma +; CHECK-NEXT:    vsoxei32.v v8, (a0), v9, v0.t +; CHECK-NEXT:    ret +entry: +  call void @llvm.riscv.vsoxei.mask.nxv1f16.nxv1i32( +    <vscale x 1 x half> %0, +    ptr %1, +    <vscale x 1 x i32> %2, +    <vscale x 1 x i1> %3, +    iXLen %4) + +  ret void +} + +declare void @llvm.riscv.vsoxei.nxv2f16.nxv2i32( +  <vscale x 2 x half>, +  ptr, +  <vscale x 2 x i32>, +  iXLen); + +define void @intrinsic_vsoxei_v_nxv2f16_nxv2f16_nxv2i32(<vscale x 2 x half> %0, ptr %1, <vscale x 2 x i32> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv2f16_nxv2f16_nxv2i32: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e16, mf2, ta, ma +; CHECK-NEXT:    vsoxei32.v v8, (a0), v9 +; CHECK-NEXT:    ret +entry: +  call void @llvm.riscv.vsoxei.nxv2f16.nxv2i32( +    <vscale x 2 x half> %0, +    ptr %1, +    <vscale x 2 x i32> %2, +    iXLen %3) + +  ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv2f16.nxv2i32( +  <vscale x 2 x half>, +  ptr, +  <vscale x 2 x i32>, +  <vscale x 2 x i1>, +  iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv2f16_nxv2f16_nxv2i32(<vscale x 2 x half> %0, ptr %1, <vscale x 2 x i32> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv2f16_nxv2f16_nxv2i32: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e16, mf2, ta, ma +; CHECK-NEXT:    vsoxei32.v v8, (a0), v9, v0.t +; CHECK-NEXT:    ret +entry: +  call void @llvm.riscv.vsoxei.mask.nxv2f16.nxv2i32( +    <vscale x 2 x half> %0, +    ptr %1, +    <vscale x 2 x i32> %2, +    <vscale x 2 x i1> %3, +    iXLen %4) + +  ret void +} + +declare void @llvm.riscv.vsoxei.nxv4f16.nxv4i32( +  <vscale x 4 x half>, +  ptr, +  <vscale x 4 x i32>, +  iXLen); + +define void @intrinsic_vsoxei_v_nxv4f16_nxv4f16_nxv4i32(<vscale x 4 x half> %0, ptr %1, <vscale x 4 x i32> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv4f16_nxv4f16_nxv4i32: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e16, m1, ta, ma +; 
CHECK-NEXT:    vsoxei32.v v8, (a0), v10 +; CHECK-NEXT:    ret +entry: +  call void @llvm.riscv.vsoxei.nxv4f16.nxv4i32( +    <vscale x 4 x half> %0, +    ptr %1, +    <vscale x 4 x i32> %2, +    iXLen %3) + +  ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv4f16.nxv4i32( +  <vscale x 4 x half>, +  ptr, +  <vscale x 4 x i32>, +  <vscale x 4 x i1>, +  iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv4f16_nxv4f16_nxv4i32(<vscale x 4 x half> %0, ptr %1, <vscale x 4 x i32> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv4f16_nxv4f16_nxv4i32: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e16, m1, ta, ma +; CHECK-NEXT:    vsoxei32.v v8, (a0), v10, v0.t +; CHECK-NEXT:    ret +entry: +  call void @llvm.riscv.vsoxei.mask.nxv4f16.nxv4i32( +    <vscale x 4 x half> %0, +    ptr %1, +    <vscale x 4 x i32> %2, +    <vscale x 4 x i1> %3, +    iXLen %4) + +  ret void +} + +declare void @llvm.riscv.vsoxei.nxv8f16.nxv8i32( +  <vscale x 8 x half>, +  ptr, +  <vscale x 8 x i32>, +  iXLen); + +define void @intrinsic_vsoxei_v_nxv8f16_nxv8f16_nxv8i32(<vscale x 8 x half> %0, ptr %1, <vscale x 8 x i32> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv8f16_nxv8f16_nxv8i32: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e16, m2, ta, ma +; CHECK-NEXT:    vsoxei32.v v8, (a0), v12 +; CHECK-NEXT:    ret +entry: +  call void @llvm.riscv.vsoxei.nxv8f16.nxv8i32( +    <vscale x 8 x half> %0, +    ptr %1, +    <vscale x 8 x i32> %2, +    iXLen %3) + +  ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv8f16.nxv8i32( +  <vscale x 8 x half>, +  ptr, +  <vscale x 8 x i32>, +  <vscale x 8 x i1>, +  iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv8f16_nxv8f16_nxv8i32(<vscale x 8 x half> %0, ptr %1, <vscale x 8 x i32> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv8f16_nxv8f16_nxv8i32: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e16, m2, ta, ma +; CHECK-NEXT:    vsoxei32.v v8, (a0), v12, v0.t +; CHECK-NEXT:    ret +entry: +  call void @llvm.riscv.vsoxei.mask.nxv8f16.nxv8i32( +    <vscale x 8 x half> %0, +    ptr %1, +    <vscale x 8 x i32> %2, +    <vscale x 8 x i1> %3, +    iXLen %4) + +  ret void +} + +declare void @llvm.riscv.vsoxei.nxv16f16.nxv16i32( +  <vscale x 16 x half>, +  ptr, +  <vscale x 16 x i32>, +  iXLen); + +define void @intrinsic_vsoxei_v_nxv16f16_nxv16f16_nxv16i32(<vscale x 16 x half> %0, ptr %1, <vscale x 16 x i32> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv16f16_nxv16f16_nxv16i32: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e16, m4, ta, ma +; CHECK-NEXT:    vsoxei32.v v8, (a0), v16 +; CHECK-NEXT:    ret +entry: +  call void @llvm.riscv.vsoxei.nxv16f16.nxv16i32( +    <vscale x 16 x half> %0, +    ptr %1, +    <vscale x 16 x i32> %2, +    iXLen %3) + +  ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv16f16.nxv16i32( +  <vscale x 16 x half>, +  ptr, +  <vscale x 16 x i32>, +  <vscale x 16 x i1>, +  iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv16f16_nxv16f16_nxv16i32(<vscale x 16 x half> %0, ptr %1, <vscale x 16 x i32> %2, <vscale x 16 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv16f16_nxv16f16_nxv16i32: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e16, m4, ta, ma +; CHECK-NEXT:    vsoxei32.v v8, (a0), v16, v0.t +; CHECK-NEXT:    ret +entry: +  call void @llvm.riscv.vsoxei.mask.nxv16f16.nxv16i32( +    <vscale x 16 x 
half> %0, +    ptr %1, +    <vscale x 16 x i32> %2, +    <vscale x 16 x i1> %3, +    iXLen %4) + +  ret void +} + +declare void @llvm.riscv.vsoxei.nxv1f32.nxv1i32( +  <vscale x 1 x float>, +  ptr, +  <vscale x 1 x i32>, +  iXLen); + +define void @intrinsic_vsoxei_v_nxv1f32_nxv1f32_nxv1i32(<vscale x 1 x float> %0, ptr %1, <vscale x 1 x i32> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv1f32_nxv1f32_nxv1i32: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e32, mf2, ta, ma +; CHECK-NEXT:    vsoxei32.v v8, (a0), v9 +; CHECK-NEXT:    ret +entry: +  call void @llvm.riscv.vsoxei.nxv1f32.nxv1i32( +    <vscale x 1 x float> %0, +    ptr %1, +    <vscale x 1 x i32> %2, +    iXLen %3) + +  ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv1f32.nxv1i32( +  <vscale x 1 x float>, +  ptr, +  <vscale x 1 x i32>, +  <vscale x 1 x i1>, +  iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv1f32_nxv1f32_nxv1i32(<vscale x 1 x float> %0, ptr %1, <vscale x 1 x i32> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv1f32_nxv1f32_nxv1i32: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e32, mf2, ta, ma +; CHECK-NEXT:    vsoxei32.v v8, (a0), v9, v0.t +; CHECK-NEXT:    ret +entry: +  call void @llvm.riscv.vsoxei.mask.nxv1f32.nxv1i32( +    <vscale x 1 x float> %0, +    ptr %1, +    <vscale x 1 x i32> %2, +    <vscale x 1 x i1> %3, +    iXLen %4) + +  ret void +} + +declare void @llvm.riscv.vsoxei.nxv2f32.nxv2i32( +  <vscale x 2 x float>, +  ptr, +  <vscale x 2 x i32>, +  iXLen); + +define void @intrinsic_vsoxei_v_nxv2f32_nxv2f32_nxv2i32(<vscale x 2 x float> %0, ptr %1, <vscale x 2 x i32> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv2f32_nxv2f32_nxv2i32: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e32, m1, ta, ma +; CHECK-NEXT:    vsoxei32.v v8, (a0), v9 +; CHECK-NEXT:    ret +entry: +  call void @llvm.riscv.vsoxei.nxv2f32.nxv2i32( +    <vscale x 2 x float> %0, +    ptr %1, +    <vscale x 2 x i32> %2, +    iXLen %3) + +  ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv2f32.nxv2i32( +  <vscale x 2 x float>, +  ptr, +  <vscale x 2 x i32>, +  <vscale x 2 x i1>, +  iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv2f32_nxv2f32_nxv2i32(<vscale x 2 x float> %0, ptr %1, <vscale x 2 x i32> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv2f32_nxv2f32_nxv2i32: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e32, m1, ta, ma +; CHECK-NEXT:    vsoxei32.v v8, (a0), v9, v0.t +; CHECK-NEXT:    ret +entry: +  call void @llvm.riscv.vsoxei.mask.nxv2f32.nxv2i32( +    <vscale x 2 x float> %0, +    ptr %1, +    <vscale x 2 x i32> %2, +    <vscale x 2 x i1> %3, +    iXLen %4) + +  ret void +} + +declare void @llvm.riscv.vsoxei.nxv4f32.nxv4i32( +  <vscale x 4 x float>, +  ptr, +  <vscale x 4 x i32>, +  iXLen); + +define void @intrinsic_vsoxei_v_nxv4f32_nxv4f32_nxv4i32(<vscale x 4 x float> %0, ptr %1, <vscale x 4 x i32> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv4f32_nxv4f32_nxv4i32: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e32, m2, ta, ma +; CHECK-NEXT:    vsoxei32.v v8, (a0), v10 +; CHECK-NEXT:    ret +entry: +  call void @llvm.riscv.vsoxei.nxv4f32.nxv4i32( +    <vscale x 4 x float> %0, +    ptr %1, +    <vscale x 4 x i32> %2, +    iXLen %3) + +  ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv4f32.nxv4i32( +  <vscale x 4 x float>, +  ptr, +  <vscale x 4 x i32>, +  
<vscale x 4 x i1>, +  iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv4f32_nxv4f32_nxv4i32(<vscale x 4 x float> %0, ptr %1, <vscale x 4 x i32> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv4f32_nxv4f32_nxv4i32: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e32, m2, ta, ma +; CHECK-NEXT:    vsoxei32.v v8, (a0), v10, v0.t +; CHECK-NEXT:    ret +entry: +  call void @llvm.riscv.vsoxei.mask.nxv4f32.nxv4i32( +    <vscale x 4 x float> %0, +    ptr %1, +    <vscale x 4 x i32> %2, +    <vscale x 4 x i1> %3, +    iXLen %4) + +  ret void +} + +declare void @llvm.riscv.vsoxei.nxv8f32.nxv8i32( +  <vscale x 8 x float>, +  ptr, +  <vscale x 8 x i32>, +  iXLen); + +define void @intrinsic_vsoxei_v_nxv8f32_nxv8f32_nxv8i32(<vscale x 8 x float> %0, ptr %1, <vscale x 8 x i32> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv8f32_nxv8f32_nxv8i32: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e32, m4, ta, ma +; CHECK-NEXT:    vsoxei32.v v8, (a0), v12 +; CHECK-NEXT:    ret +entry: +  call void @llvm.riscv.vsoxei.nxv8f32.nxv8i32( +    <vscale x 8 x float> %0, +    ptr %1, +    <vscale x 8 x i32> %2, +    iXLen %3) + +  ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv8f32.nxv8i32( +  <vscale x 8 x float>, +  ptr, +  <vscale x 8 x i32>, +  <vscale x 8 x i1>, +  iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv8f32_nxv8f32_nxv8i32(<vscale x 8 x float> %0, ptr %1, <vscale x 8 x i32> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv8f32_nxv8f32_nxv8i32: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e32, m4, ta, ma +; CHECK-NEXT:    vsoxei32.v v8, (a0), v12, v0.t +; CHECK-NEXT:    ret +entry: +  call void @llvm.riscv.vsoxei.mask.nxv8f32.nxv8i32( +    <vscale x 8 x float> %0, +    ptr %1, +    <vscale x 8 x i32> %2, +    <vscale x 8 x i1> %3, +    iXLen %4) + +  ret void +} + +declare void @llvm.riscv.vsoxei.nxv16f32.nxv16i32( +  <vscale x 16 x float>, +  ptr, +  <vscale x 16 x i32>, +  iXLen); + +define void @intrinsic_vsoxei_v_nxv16f32_nxv16f32_nxv16i32(<vscale x 16 x float> %0, ptr %1, <vscale x 16 x i32> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv16f32_nxv16f32_nxv16i32: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e32, m8, ta, ma +; CHECK-NEXT:    vsoxei32.v v8, (a0), v16 +; CHECK-NEXT:    ret +entry: +  call void @llvm.riscv.vsoxei.nxv16f32.nxv16i32( +    <vscale x 16 x float> %0, +    ptr %1, +    <vscale x 16 x i32> %2, +    iXLen %3) + +  ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv16f32.nxv16i32( +  <vscale x 16 x float>, +  ptr, +  <vscale x 16 x i32>, +  <vscale x 16 x i1>, +  iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv16f32_nxv16f32_nxv16i32(<vscale x 16 x float> %0, ptr %1, <vscale x 16 x i32> %2, <vscale x 16 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv16f32_nxv16f32_nxv16i32: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e32, m8, ta, ma +; CHECK-NEXT:    vsoxei32.v v8, (a0), v16, v0.t +; CHECK-NEXT:    ret +entry: +  call void @llvm.riscv.vsoxei.mask.nxv16f32.nxv16i32( +    <vscale x 16 x float> %0, +    ptr %1, +    <vscale x 16 x i32> %2, +    <vscale x 16 x i1> %3, +    iXLen %4) + +  ret void +} + +declare void @llvm.riscv.vsoxei.nxv1f64.nxv1i32( +  <vscale x 1 x double>, +  ptr, +  <vscale x 1 x i32>, +  iXLen); + +define void @intrinsic_vsoxei_v_nxv1f64_nxv1f64_nxv1i32(<vscale x 1 x double> %0, ptr %1, 
<vscale x 1 x i32> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv1f64_nxv1f64_nxv1i32: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e64, m1, ta, ma +; CHECK-NEXT:    vsoxei32.v v8, (a0), v9 +; CHECK-NEXT:    ret +entry: +  call void @llvm.riscv.vsoxei.nxv1f64.nxv1i32( +    <vscale x 1 x double> %0, +    ptr %1, +    <vscale x 1 x i32> %2, +    iXLen %3) + +  ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv1f64.nxv1i32( +  <vscale x 1 x double>, +  ptr, +  <vscale x 1 x i32>, +  <vscale x 1 x i1>, +  iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv1f64_nxv1f64_nxv1i32(<vscale x 1 x double> %0, ptr %1, <vscale x 1 x i32> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv1f64_nxv1f64_nxv1i32: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e64, m1, ta, ma +; CHECK-NEXT:    vsoxei32.v v8, (a0), v9, v0.t +; CHECK-NEXT:    ret +entry: +  call void @llvm.riscv.vsoxei.mask.nxv1f64.nxv1i32( +    <vscale x 1 x double> %0, +    ptr %1, +    <vscale x 1 x i32> %2, +    <vscale x 1 x i1> %3, +    iXLen %4) + +  ret void +} + +declare void @llvm.riscv.vsoxei.nxv2f64.nxv2i32( +  <vscale x 2 x double>, +  ptr, +  <vscale x 2 x i32>, +  iXLen); + +define void @intrinsic_vsoxei_v_nxv2f64_nxv2f64_nxv2i32(<vscale x 2 x double> %0, ptr %1, <vscale x 2 x i32> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv2f64_nxv2f64_nxv2i32: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e64, m2, ta, ma +; CHECK-NEXT:    vsoxei32.v v8, (a0), v10 +; CHECK-NEXT:    ret +entry: +  call void @llvm.riscv.vsoxei.nxv2f64.nxv2i32( +    <vscale x 2 x double> %0, +    ptr %1, +    <vscale x 2 x i32> %2, +    iXLen %3) + +  ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv2f64.nxv2i32( +  <vscale x 2 x double>, +  ptr, +  <vscale x 2 x i32>, +  <vscale x 2 x i1>, +  iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv2f64_nxv2f64_nxv2i32(<vscale x 2 x double> %0, ptr %1, <vscale x 2 x i32> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv2f64_nxv2f64_nxv2i32: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e64, m2, ta, ma +; CHECK-NEXT:    vsoxei32.v v8, (a0), v10, v0.t +; CHECK-NEXT:    ret +entry: +  call void @llvm.riscv.vsoxei.mask.nxv2f64.nxv2i32( +    <vscale x 2 x double> %0, +    ptr %1, +    <vscale x 2 x i32> %2, +    <vscale x 2 x i1> %3, +    iXLen %4) + +  ret void +} + +declare void @llvm.riscv.vsoxei.nxv4f64.nxv4i32( +  <vscale x 4 x double>, +  ptr, +  <vscale x 4 x i32>, +  iXLen); + +define void @intrinsic_vsoxei_v_nxv4f64_nxv4f64_nxv4i32(<vscale x 4 x double> %0, ptr %1, <vscale x 4 x i32> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv4f64_nxv4f64_nxv4i32: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e64, m4, ta, ma +; CHECK-NEXT:    vsoxei32.v v8, (a0), v12 +; CHECK-NEXT:    ret +entry: +  call void @llvm.riscv.vsoxei.nxv4f64.nxv4i32( +    <vscale x 4 x double> %0, +    ptr %1, +    <vscale x 4 x i32> %2, +    iXLen %3) + +  ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv4f64.nxv4i32( +  <vscale x 4 x double>, +  ptr, +  <vscale x 4 x i32>, +  <vscale x 4 x i1>, +  iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv4f64_nxv4f64_nxv4i32(<vscale x 4 x double> %0, ptr %1, <vscale x 4 x i32> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv4f64_nxv4f64_nxv4i32: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    
vsetvli zero, a1, e64, m4, ta, ma +; CHECK-NEXT:    vsoxei32.v v8, (a0), v12, v0.t +; CHECK-NEXT:    ret +entry: +  call void @llvm.riscv.vsoxei.mask.nxv4f64.nxv4i32( +    <vscale x 4 x double> %0, +    ptr %1, +    <vscale x 4 x i32> %2, +    <vscale x 4 x i1> %3, +    iXLen %4) + +  ret void +} + +declare void @llvm.riscv.vsoxei.nxv8f64.nxv8i32( +  <vscale x 8 x double>, +  ptr, +  <vscale x 8 x i32>, +  iXLen); + +define void @intrinsic_vsoxei_v_nxv8f64_nxv8f64_nxv8i32(<vscale x 8 x double> %0, ptr %1, <vscale x 8 x i32> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv8f64_nxv8f64_nxv8i32: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e64, m8, ta, ma +; CHECK-NEXT:    vsoxei32.v v8, (a0), v16 +; CHECK-NEXT:    ret +entry: +  call void @llvm.riscv.vsoxei.nxv8f64.nxv8i32( +    <vscale x 8 x double> %0, +    ptr %1, +    <vscale x 8 x i32> %2, +    iXLen %3) + +  ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv8f64.nxv8i32( +  <vscale x 8 x double>, +  ptr, +  <vscale x 8 x i32>, +  <vscale x 8 x i1>, +  iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv8f64_nxv8f64_nxv8i32(<vscale x 8 x double> %0, ptr %1, <vscale x 8 x i32> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv8f64_nxv8f64_nxv8i32: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e64, m8, ta, ma +; CHECK-NEXT:    vsoxei32.v v8, (a0), v16, v0.t +; CHECK-NEXT:    ret +entry: +  call void @llvm.riscv.vsoxei.mask.nxv8f64.nxv8i32( +    <vscale x 8 x double> %0, +    ptr %1, +    <vscale x 8 x i32> %2, +    <vscale x 8 x i1> %3, +    iXLen %4) + +  ret void +} + +declare void @llvm.riscv.vsoxei.nxv1i8.nxv1i16( +  <vscale x 1 x i8>, +  ptr, +  <vscale x 1 x i16>, +  iXLen); + +define void @intrinsic_vsoxei_v_nxv1i8_nxv1i8_nxv1i16(<vscale x 1 x i8> %0, ptr %1, <vscale x 1 x i16> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv1i8_nxv1i8_nxv1i16: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e8, mf8, ta, ma +; CHECK-NEXT:    vsoxei16.v v8, (a0), v9 +; CHECK-NEXT:    ret +entry: +  call void @llvm.riscv.vsoxei.nxv1i8.nxv1i16( +    <vscale x 1 x i8> %0, +    ptr %1, +    <vscale x 1 x i16> %2, +    iXLen %3) + +  ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv1i8.nxv1i16( +  <vscale x 1 x i8>, +  ptr, +  <vscale x 1 x i16>, +  <vscale x 1 x i1>, +  iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv1i8_nxv1i8_nxv1i16(<vscale x 1 x i8> %0, ptr %1, <vscale x 1 x i16> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv1i8_nxv1i8_nxv1i16: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e8, mf8, ta, ma +; CHECK-NEXT:    vsoxei16.v v8, (a0), v9, v0.t +; CHECK-NEXT:    ret +entry: +  call void @llvm.riscv.vsoxei.mask.nxv1i8.nxv1i16( +    <vscale x 1 x i8> %0, +    ptr %1, +    <vscale x 1 x i16> %2, +    <vscale x 1 x i1> %3, +    iXLen %4) + +  ret void +} + +declare void @llvm.riscv.vsoxei.nxv2i8.nxv2i16( +  <vscale x 2 x i8>, +  ptr, +  <vscale x 2 x i16>, +  iXLen); + +define void @intrinsic_vsoxei_v_nxv2i8_nxv2i8_nxv2i16(<vscale x 2 x i8> %0, ptr %1, <vscale x 2 x i16> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv2i8_nxv2i8_nxv2i16: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e8, mf4, ta, ma +; CHECK-NEXT:    vsoxei16.v v8, (a0), v9 +; CHECK-NEXT:    ret +entry: +  call void @llvm.riscv.vsoxei.nxv2i8.nxv2i16( +    <vscale x 2 x i8> %0, +    ptr %1, +    <vscale x 2 x i16> %2, +    
iXLen %3) + +  ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv2i8.nxv2i16( +  <vscale x 2 x i8>, +  ptr, +  <vscale x 2 x i16>, +  <vscale x 2 x i1>, +  iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv2i8_nxv2i8_nxv2i16(<vscale x 2 x i8> %0, ptr %1, <vscale x 2 x i16> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv2i8_nxv2i8_nxv2i16: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e8, mf4, ta, ma +; CHECK-NEXT:    vsoxei16.v v8, (a0), v9, v0.t +; CHECK-NEXT:    ret +entry: +  call void @llvm.riscv.vsoxei.mask.nxv2i8.nxv2i16( +    <vscale x 2 x i8> %0, +    ptr %1, +    <vscale x 2 x i16> %2, +    <vscale x 2 x i1> %3, +    iXLen %4) + +  ret void +} + +declare void @llvm.riscv.vsoxei.nxv4i8.nxv4i16( +  <vscale x 4 x i8>, +  ptr, +  <vscale x 4 x i16>, +  iXLen); + +define void @intrinsic_vsoxei_v_nxv4i8_nxv4i8_nxv4i16(<vscale x 4 x i8> %0, ptr %1, <vscale x 4 x i16> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv4i8_nxv4i8_nxv4i16: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e8, mf2, ta, ma +; CHECK-NEXT:    vsoxei16.v v8, (a0), v9 +; CHECK-NEXT:    ret +entry: +  call void @llvm.riscv.vsoxei.nxv4i8.nxv4i16( +    <vscale x 4 x i8> %0, +    ptr %1, +    <vscale x 4 x i16> %2, +    iXLen %3) + +  ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv4i8.nxv4i16( +  <vscale x 4 x i8>, +  ptr, +  <vscale x 4 x i16>, +  <vscale x 4 x i1>, +  iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv4i8_nxv4i8_nxv4i16(<vscale x 4 x i8> %0, ptr %1, <vscale x 4 x i16> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv4i8_nxv4i8_nxv4i16: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e8, mf2, ta, ma +; CHECK-NEXT:    vsoxei16.v v8, (a0), v9, v0.t +; CHECK-NEXT:    ret +entry: +  call void @llvm.riscv.vsoxei.mask.nxv4i8.nxv4i16( +    <vscale x 4 x i8> %0, +    ptr %1, +    <vscale x 4 x i16> %2, +    <vscale x 4 x i1> %3, +    iXLen %4) + +  ret void +} + +declare void @llvm.riscv.vsoxei.nxv8i8.nxv8i16( +  <vscale x 8 x i8>, +  ptr, +  <vscale x 8 x i16>, +  iXLen); + +define void @intrinsic_vsoxei_v_nxv8i8_nxv8i8_nxv8i16(<vscale x 8 x i8> %0, ptr %1, <vscale x 8 x i16> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv8i8_nxv8i8_nxv8i16: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e8, m1, ta, ma +; CHECK-NEXT:    vsoxei16.v v8, (a0), v10 +; CHECK-NEXT:    ret +entry: +  call void @llvm.riscv.vsoxei.nxv8i8.nxv8i16( +    <vscale x 8 x i8> %0, +    ptr %1, +    <vscale x 8 x i16> %2, +    iXLen %3) + +  ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv8i8.nxv8i16( +  <vscale x 8 x i8>, +  ptr, +  <vscale x 8 x i16>, +  <vscale x 8 x i1>, +  iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv8i8_nxv8i8_nxv8i16(<vscale x 8 x i8> %0, ptr %1, <vscale x 8 x i16> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv8i8_nxv8i8_nxv8i16: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e8, m1, ta, ma +; CHECK-NEXT:    vsoxei16.v v8, (a0), v10, v0.t +; CHECK-NEXT:    ret +entry: +  call void @llvm.riscv.vsoxei.mask.nxv8i8.nxv8i16( +    <vscale x 8 x i8> %0, +    ptr %1, +    <vscale x 8 x i16> %2, +    <vscale x 8 x i1> %3, +    iXLen %4) + +  ret void +} + +declare void @llvm.riscv.vsoxei.nxv16i8.nxv16i16( +  <vscale x 16 x i8>, +  ptr, +  <vscale x 16 x i16>, +  iXLen); + +define void @intrinsic_vsoxei_v_nxv16i8_nxv16i8_nxv16i16(<vscale 
x 16 x i8> %0, ptr %1, <vscale x 16 x i16> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv16i8_nxv16i8_nxv16i16: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e8, m2, ta, ma +; CHECK-NEXT:    vsoxei16.v v8, (a0), v12 +; CHECK-NEXT:    ret +entry: +  call void @llvm.riscv.vsoxei.nxv16i8.nxv16i16( +    <vscale x 16 x i8> %0, +    ptr %1, +    <vscale x 16 x i16> %2, +    iXLen %3) + +  ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv16i8.nxv16i16( +  <vscale x 16 x i8>, +  ptr, +  <vscale x 16 x i16>, +  <vscale x 16 x i1>, +  iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv16i8_nxv16i8_nxv16i16(<vscale x 16 x i8> %0, ptr %1, <vscale x 16 x i16> %2, <vscale x 16 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv16i8_nxv16i8_nxv16i16: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e8, m2, ta, ma +; CHECK-NEXT:    vsoxei16.v v8, (a0), v12, v0.t +; CHECK-NEXT:    ret +entry: +  call void @llvm.riscv.vsoxei.mask.nxv16i8.nxv16i16( +    <vscale x 16 x i8> %0, +    ptr %1, +    <vscale x 16 x i16> %2, +    <vscale x 16 x i1> %3, +    iXLen %4) + +  ret void +} + +declare void @llvm.riscv.vsoxei.nxv32i8.nxv32i16( +  <vscale x 32 x i8>, +  ptr, +  <vscale x 32 x i16>, +  iXLen); + +define void @intrinsic_vsoxei_v_nxv32i8_nxv32i8_nxv32i16(<vscale x 32 x i8> %0, ptr %1, <vscale x 32 x i16> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv32i8_nxv32i8_nxv32i16: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e8, m4, ta, ma +; CHECK-NEXT:    vsoxei16.v v8, (a0), v16 +; CHECK-NEXT:    ret +entry: +  call void @llvm.riscv.vsoxei.nxv32i8.nxv32i16( +    <vscale x 32 x i8> %0, +    ptr %1, +    <vscale x 32 x i16> %2, +    iXLen %3) + +  ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv32i8.nxv32i16( +  <vscale x 32 x i8>, +  ptr, +  <vscale x 32 x i16>, +  <vscale x 32 x i1>, +  iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv32i8_nxv32i8_nxv32i16(<vscale x 32 x i8> %0, ptr %1, <vscale x 32 x i16> %2, <vscale x 32 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv32i8_nxv32i8_nxv32i16: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e8, m4, ta, ma +; CHECK-NEXT:    vsoxei16.v v8, (a0), v16, v0.t +; CHECK-NEXT:    ret +entry: +  call void @llvm.riscv.vsoxei.mask.nxv32i8.nxv32i16( +    <vscale x 32 x i8> %0, +    ptr %1, +    <vscale x 32 x i16> %2, +    <vscale x 32 x i1> %3, +    iXLen %4) + +  ret void +} + +declare void @llvm.riscv.vsoxei.nxv1i16.nxv1i16( +  <vscale x 1 x i16>, +  ptr, +  <vscale x 1 x i16>, +  iXLen); + +define void @intrinsic_vsoxei_v_nxv1i16_nxv1i16_nxv1i16(<vscale x 1 x i16> %0, ptr %1, <vscale x 1 x i16> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv1i16_nxv1i16_nxv1i16: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e16, mf4, ta, ma +; CHECK-NEXT:    vsoxei16.v v8, (a0), v9 +; CHECK-NEXT:    ret +entry: +  call void @llvm.riscv.vsoxei.nxv1i16.nxv1i16( +    <vscale x 1 x i16> %0, +    ptr %1, +    <vscale x 1 x i16> %2, +    iXLen %3) + +  ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv1i16.nxv1i16( +  <vscale x 1 x i16>, +  ptr, +  <vscale x 1 x i16>, +  <vscale x 1 x i1>, +  iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv1i16_nxv1i16_nxv1i16(<vscale x 1 x i16> %0, ptr %1, <vscale x 1 x i16> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv1i16_nxv1i16_nxv1i16: +; CHECK:       # %bb.0: # %entry +; 
CHECK-NEXT:    vsetvli zero, a1, e16, mf4, ta, ma +; CHECK-NEXT:    vsoxei16.v v8, (a0), v9, v0.t +; CHECK-NEXT:    ret +entry: +  call void @llvm.riscv.vsoxei.mask.nxv1i16.nxv1i16( +    <vscale x 1 x i16> %0, +    ptr %1, +    <vscale x 1 x i16> %2, +    <vscale x 1 x i1> %3, +    iXLen %4) + +  ret void +} + +declare void @llvm.riscv.vsoxei.nxv2i16.nxv2i16( +  <vscale x 2 x i16>, +  ptr, +  <vscale x 2 x i16>, +  iXLen); + +define void @intrinsic_vsoxei_v_nxv2i16_nxv2i16_nxv2i16(<vscale x 2 x i16> %0, ptr %1, <vscale x 2 x i16> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv2i16_nxv2i16_nxv2i16: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e16, mf2, ta, ma +; CHECK-NEXT:    vsoxei16.v v8, (a0), v9 +; CHECK-NEXT:    ret +entry: +  call void @llvm.riscv.vsoxei.nxv2i16.nxv2i16( +    <vscale x 2 x i16> %0, +    ptr %1, +    <vscale x 2 x i16> %2, +    iXLen %3) + +  ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv2i16.nxv2i16( +  <vscale x 2 x i16>, +  ptr, +  <vscale x 2 x i16>, +  <vscale x 2 x i1>, +  iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv2i16_nxv2i16_nxv2i16(<vscale x 2 x i16> %0, ptr %1, <vscale x 2 x i16> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv2i16_nxv2i16_nxv2i16: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e16, mf2, ta, ma +; CHECK-NEXT:    vsoxei16.v v8, (a0), v9, v0.t +; CHECK-NEXT:    ret +entry: +  call void @llvm.riscv.vsoxei.mask.nxv2i16.nxv2i16( +    <vscale x 2 x i16> %0, +    ptr %1, +    <vscale x 2 x i16> %2, +    <vscale x 2 x i1> %3, +    iXLen %4) + +  ret void +} + +declare void @llvm.riscv.vsoxei.nxv4i16.nxv4i16( +  <vscale x 4 x i16>, +  ptr, +  <vscale x 4 x i16>, +  iXLen); + +define void @intrinsic_vsoxei_v_nxv4i16_nxv4i16_nxv4i16(<vscale x 4 x i16> %0, ptr %1, <vscale x 4 x i16> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv4i16_nxv4i16_nxv4i16: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e16, m1, ta, ma +; CHECK-NEXT:    vsoxei16.v v8, (a0), v9 +; CHECK-NEXT:    ret +entry: +  call void @llvm.riscv.vsoxei.nxv4i16.nxv4i16( +    <vscale x 4 x i16> %0, +    ptr %1, +    <vscale x 4 x i16> %2, +    iXLen %3) + +  ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv4i16.nxv4i16( +  <vscale x 4 x i16>, +  ptr, +  <vscale x 4 x i16>, +  <vscale x 4 x i1>, +  iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv4i16_nxv4i16_nxv4i16(<vscale x 4 x i16> %0, ptr %1, <vscale x 4 x i16> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv4i16_nxv4i16_nxv4i16: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e16, m1, ta, ma +; CHECK-NEXT:    vsoxei16.v v8, (a0), v9, v0.t +; CHECK-NEXT:    ret +entry: +  call void @llvm.riscv.vsoxei.mask.nxv4i16.nxv4i16( +    <vscale x 4 x i16> %0, +    ptr %1, +    <vscale x 4 x i16> %2, +    <vscale x 4 x i1> %3, +    iXLen %4) + +  ret void +} + +declare void @llvm.riscv.vsoxei.nxv8i16.nxv8i16( +  <vscale x 8 x i16>, +  ptr, +  <vscale x 8 x i16>, +  iXLen); + +define void @intrinsic_vsoxei_v_nxv8i16_nxv8i16_nxv8i16(<vscale x 8 x i16> %0, ptr %1, <vscale x 8 x i16> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv8i16_nxv8i16_nxv8i16: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e16, m2, ta, ma +; CHECK-NEXT:    vsoxei16.v v8, (a0), v10 +; CHECK-NEXT:    ret +entry: +  call void @llvm.riscv.vsoxei.nxv8i16.nxv8i16( +    <vscale x 8 x i16> %0, +    ptr %1, +    <vscale 
x 8 x i16> %2, +    iXLen %3) + +  ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv8i16.nxv8i16( +  <vscale x 8 x i16>, +  ptr, +  <vscale x 8 x i16>, +  <vscale x 8 x i1>, +  iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv8i16_nxv8i16_nxv8i16(<vscale x 8 x i16> %0, ptr %1, <vscale x 8 x i16> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv8i16_nxv8i16_nxv8i16: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e16, m2, ta, ma +; CHECK-NEXT:    vsoxei16.v v8, (a0), v10, v0.t +; CHECK-NEXT:    ret +entry: +  call void @llvm.riscv.vsoxei.mask.nxv8i16.nxv8i16( +    <vscale x 8 x i16> %0, +    ptr %1, +    <vscale x 8 x i16> %2, +    <vscale x 8 x i1> %3, +    iXLen %4) + +  ret void +} + +declare void @llvm.riscv.vsoxei.nxv16i16.nxv16i16( +  <vscale x 16 x i16>, +  ptr, +  <vscale x 16 x i16>, +  iXLen); + +define void @intrinsic_vsoxei_v_nxv16i16_nxv16i16_nxv16i16(<vscale x 16 x i16> %0, ptr %1, <vscale x 16 x i16> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv16i16_nxv16i16_nxv16i16: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e16, m4, ta, ma +; CHECK-NEXT:    vsoxei16.v v8, (a0), v12 +; CHECK-NEXT:    ret +entry: +  call void @llvm.riscv.vsoxei.nxv16i16.nxv16i16( +    <vscale x 16 x i16> %0, +    ptr %1, +    <vscale x 16 x i16> %2, +    iXLen %3) + +  ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv16i16.nxv16i16( +  <vscale x 16 x i16>, +  ptr, +  <vscale x 16 x i16>, +  <vscale x 16 x i1>, +  iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv16i16_nxv16i16_nxv16i16(<vscale x 16 x i16> %0, ptr %1, <vscale x 16 x i16> %2, <vscale x 16 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv16i16_nxv16i16_nxv16i16: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e16, m4, ta, ma +; CHECK-NEXT:    vsoxei16.v v8, (a0), v12, v0.t +; CHECK-NEXT:    ret +entry: +  call void @llvm.riscv.vsoxei.mask.nxv16i16.nxv16i16( +    <vscale x 16 x i16> %0, +    ptr %1, +    <vscale x 16 x i16> %2, +    <vscale x 16 x i1> %3, +    iXLen %4) + +  ret void +} + +declare void @llvm.riscv.vsoxei.nxv32i16.nxv32i16( +  <vscale x 32 x i16>, +  ptr, +  <vscale x 32 x i16>, +  iXLen); + +define void @intrinsic_vsoxei_v_nxv32i16_nxv32i16_nxv32i16(<vscale x 32 x i16> %0, ptr %1, <vscale x 32 x i16> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv32i16_nxv32i16_nxv32i16: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e16, m8, ta, ma +; CHECK-NEXT:    vsoxei16.v v8, (a0), v16 +; CHECK-NEXT:    ret +entry: +  call void @llvm.riscv.vsoxei.nxv32i16.nxv32i16( +    <vscale x 32 x i16> %0, +    ptr %1, +    <vscale x 32 x i16> %2, +    iXLen %3) + +  ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv32i16.nxv32i16( +  <vscale x 32 x i16>, +  ptr, +  <vscale x 32 x i16>, +  <vscale x 32 x i1>, +  iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv32i16_nxv32i16_nxv32i16(<vscale x 32 x i16> %0, ptr %1, <vscale x 32 x i16> %2, <vscale x 32 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv32i16_nxv32i16_nxv32i16: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e16, m8, ta, ma +; CHECK-NEXT:    vsoxei16.v v8, (a0), v16, v0.t +; CHECK-NEXT:    ret +entry: +  call void @llvm.riscv.vsoxei.mask.nxv32i16.nxv32i16( +    <vscale x 32 x i16> %0, +    ptr %1, +    <vscale x 32 x i16> %2, +    <vscale x 32 x i1> %3, +    iXLen %4) + +  ret void +} + +declare void 
@llvm.riscv.vsoxei.nxv1i32.nxv1i16( +  <vscale x 1 x i32>, +  ptr, +  <vscale x 1 x i16>, +  iXLen); + +define void @intrinsic_vsoxei_v_nxv1i32_nxv1i32_nxv1i16(<vscale x 1 x i32> %0, ptr %1, <vscale x 1 x i16> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv1i32_nxv1i32_nxv1i16: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e32, mf2, ta, ma +; CHECK-NEXT:    vsoxei16.v v8, (a0), v9 +; CHECK-NEXT:    ret +entry: +  call void @llvm.riscv.vsoxei.nxv1i32.nxv1i16( +    <vscale x 1 x i32> %0, +    ptr %1, +    <vscale x 1 x i16> %2, +    iXLen %3) + +  ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv1i32.nxv1i16( +  <vscale x 1 x i32>, +  ptr, +  <vscale x 1 x i16>, +  <vscale x 1 x i1>, +  iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv1i32_nxv1i32_nxv1i16(<vscale x 1 x i32> %0, ptr %1, <vscale x 1 x i16> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv1i32_nxv1i32_nxv1i16: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e32, mf2, ta, ma +; CHECK-NEXT:    vsoxei16.v v8, (a0), v9, v0.t +; CHECK-NEXT:    ret +entry: +  call void @llvm.riscv.vsoxei.mask.nxv1i32.nxv1i16( +    <vscale x 1 x i32> %0, +    ptr %1, +    <vscale x 1 x i16> %2, +    <vscale x 1 x i1> %3, +    iXLen %4) + +  ret void +} + +declare void @llvm.riscv.vsoxei.nxv2i32.nxv2i16( +  <vscale x 2 x i32>, +  ptr, +  <vscale x 2 x i16>, +  iXLen); + +define void @intrinsic_vsoxei_v_nxv2i32_nxv2i32_nxv2i16(<vscale x 2 x i32> %0, ptr %1, <vscale x 2 x i16> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv2i32_nxv2i32_nxv2i16: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e32, m1, ta, ma +; CHECK-NEXT:    vsoxei16.v v8, (a0), v9 +; CHECK-NEXT:    ret +entry: +  call void @llvm.riscv.vsoxei.nxv2i32.nxv2i16( +    <vscale x 2 x i32> %0, +    ptr %1, +    <vscale x 2 x i16> %2, +    iXLen %3) + +  ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv2i32.nxv2i16( +  <vscale x 2 x i32>, +  ptr, +  <vscale x 2 x i16>, +  <vscale x 2 x i1>, +  iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv2i32_nxv2i32_nxv2i16(<vscale x 2 x i32> %0, ptr %1, <vscale x 2 x i16> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv2i32_nxv2i32_nxv2i16: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e32, m1, ta, ma +; CHECK-NEXT:    vsoxei16.v v8, (a0), v9, v0.t +; CHECK-NEXT:    ret +entry: +  call void @llvm.riscv.vsoxei.mask.nxv2i32.nxv2i16( +    <vscale x 2 x i32> %0, +    ptr %1, +    <vscale x 2 x i16> %2, +    <vscale x 2 x i1> %3, +    iXLen %4) + +  ret void +} + +declare void @llvm.riscv.vsoxei.nxv4i32.nxv4i16( +  <vscale x 4 x i32>, +  ptr, +  <vscale x 4 x i16>, +  iXLen); + +define void @intrinsic_vsoxei_v_nxv4i32_nxv4i32_nxv4i16(<vscale x 4 x i32> %0, ptr %1, <vscale x 4 x i16> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv4i32_nxv4i32_nxv4i16: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e32, m2, ta, ma +; CHECK-NEXT:    vsoxei16.v v8, (a0), v10 +; CHECK-NEXT:    ret +entry: +  call void @llvm.riscv.vsoxei.nxv4i32.nxv4i16( +    <vscale x 4 x i32> %0, +    ptr %1, +    <vscale x 4 x i16> %2, +    iXLen %3) + +  ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv4i32.nxv4i16( +  <vscale x 4 x i32>, +  ptr, +  <vscale x 4 x i16>, +  <vscale x 4 x i1>, +  iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv4i32_nxv4i32_nxv4i16(<vscale x 4 x i32> %0, ptr %1, <vscale x 4 x i16> %2, <vscale x 4 x 
i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv4i32_nxv4i32_nxv4i16: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e32, m2, ta, ma +; CHECK-NEXT:    vsoxei16.v v8, (a0), v10, v0.t +; CHECK-NEXT:    ret +entry: +  call void @llvm.riscv.vsoxei.mask.nxv4i32.nxv4i16( +    <vscale x 4 x i32> %0, +    ptr %1, +    <vscale x 4 x i16> %2, +    <vscale x 4 x i1> %3, +    iXLen %4) + +  ret void +} + +declare void @llvm.riscv.vsoxei.nxv8i32.nxv8i16( +  <vscale x 8 x i32>, +  ptr, +  <vscale x 8 x i16>, +  iXLen); + +define void @intrinsic_vsoxei_v_nxv8i32_nxv8i32_nxv8i16(<vscale x 8 x i32> %0, ptr %1, <vscale x 8 x i16> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv8i32_nxv8i32_nxv8i16: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e32, m4, ta, ma +; CHECK-NEXT:    vsoxei16.v v8, (a0), v12 +; CHECK-NEXT:    ret +entry: +  call void @llvm.riscv.vsoxei.nxv8i32.nxv8i16( +    <vscale x 8 x i32> %0, +    ptr %1, +    <vscale x 8 x i16> %2, +    iXLen %3) + +  ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv8i32.nxv8i16( +  <vscale x 8 x i32>, +  ptr, +  <vscale x 8 x i16>, +  <vscale x 8 x i1>, +  iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv8i32_nxv8i32_nxv8i16(<vscale x 8 x i32> %0, ptr %1, <vscale x 8 x i16> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv8i32_nxv8i32_nxv8i16: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e32, m4, ta, ma +; CHECK-NEXT:    vsoxei16.v v8, (a0), v12, v0.t +; CHECK-NEXT:    ret +entry: +  call void @llvm.riscv.vsoxei.mask.nxv8i32.nxv8i16( +    <vscale x 8 x i32> %0, +    ptr %1, +    <vscale x 8 x i16> %2, +    <vscale x 8 x i1> %3, +    iXLen %4) + +  ret void +} + +declare void @llvm.riscv.vsoxei.nxv16i32.nxv16i16( +  <vscale x 16 x i32>, +  ptr, +  <vscale x 16 x i16>, +  iXLen); + +define void @intrinsic_vsoxei_v_nxv16i32_nxv16i32_nxv16i16(<vscale x 16 x i32> %0, ptr %1, <vscale x 16 x i16> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv16i32_nxv16i32_nxv16i16: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e32, m8, ta, ma +; CHECK-NEXT:    vsoxei16.v v8, (a0), v16 +; CHECK-NEXT:    ret +entry: +  call void @llvm.riscv.vsoxei.nxv16i32.nxv16i16( +    <vscale x 16 x i32> %0, +    ptr %1, +    <vscale x 16 x i16> %2, +    iXLen %3) + +  ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv16i32.nxv16i16( +  <vscale x 16 x i32>, +  ptr, +  <vscale x 16 x i16>, +  <vscale x 16 x i1>, +  iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv16i32_nxv16i32_nxv16i16(<vscale x 16 x i32> %0, ptr %1, <vscale x 16 x i16> %2, <vscale x 16 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv16i32_nxv16i32_nxv16i16: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e32, m8, ta, ma +; CHECK-NEXT:    vsoxei16.v v8, (a0), v16, v0.t +; CHECK-NEXT:    ret +entry: +  call void @llvm.riscv.vsoxei.mask.nxv16i32.nxv16i16( +    <vscale x 16 x i32> %0, +    ptr %1, +    <vscale x 16 x i16> %2, +    <vscale x 16 x i1> %3, +    iXLen %4) + +  ret void +} + +declare void @llvm.riscv.vsoxei.nxv1i64.nxv1i16( +  <vscale x 1 x i64>, +  ptr, +  <vscale x 1 x i16>, +  iXLen); + +define void @intrinsic_vsoxei_v_nxv1i64_nxv1i64_nxv1i16(<vscale x 1 x i64> %0, ptr %1, <vscale x 1 x i16> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv1i64_nxv1i64_nxv1i16: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e64, m1, ta, ma +; 
CHECK-NEXT:    vsoxei16.v v8, (a0), v9 +; CHECK-NEXT:    ret +entry: +  call void @llvm.riscv.vsoxei.nxv1i64.nxv1i16( +    <vscale x 1 x i64> %0, +    ptr %1, +    <vscale x 1 x i16> %2, +    iXLen %3) + +  ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv1i64.nxv1i16( +  <vscale x 1 x i64>, +  ptr, +  <vscale x 1 x i16>, +  <vscale x 1 x i1>, +  iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv1i64_nxv1i64_nxv1i16(<vscale x 1 x i64> %0, ptr %1, <vscale x 1 x i16> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv1i64_nxv1i64_nxv1i16: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e64, m1, ta, ma +; CHECK-NEXT:    vsoxei16.v v8, (a0), v9, v0.t +; CHECK-NEXT:    ret +entry: +  call void @llvm.riscv.vsoxei.mask.nxv1i64.nxv1i16( +    <vscale x 1 x i64> %0, +    ptr %1, +    <vscale x 1 x i16> %2, +    <vscale x 1 x i1> %3, +    iXLen %4) + +  ret void +} + +declare void @llvm.riscv.vsoxei.nxv2i64.nxv2i16( +  <vscale x 2 x i64>, +  ptr, +  <vscale x 2 x i16>, +  iXLen); + +define void @intrinsic_vsoxei_v_nxv2i64_nxv2i64_nxv2i16(<vscale x 2 x i64> %0, ptr %1, <vscale x 2 x i16> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv2i64_nxv2i64_nxv2i16: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e64, m2, ta, ma +; CHECK-NEXT:    vsoxei16.v v8, (a0), v10 +; CHECK-NEXT:    ret +entry: +  call void @llvm.riscv.vsoxei.nxv2i64.nxv2i16( +    <vscale x 2 x i64> %0, +    ptr %1, +    <vscale x 2 x i16> %2, +    iXLen %3) + +  ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv2i64.nxv2i16( +  <vscale x 2 x i64>, +  ptr, +  <vscale x 2 x i16>, +  <vscale x 2 x i1>, +  iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv2i64_nxv2i64_nxv2i16(<vscale x 2 x i64> %0, ptr %1, <vscale x 2 x i16> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv2i64_nxv2i64_nxv2i16: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e64, m2, ta, ma +; CHECK-NEXT:    vsoxei16.v v8, (a0), v10, v0.t +; CHECK-NEXT:    ret +entry: +  call void @llvm.riscv.vsoxei.mask.nxv2i64.nxv2i16( +    <vscale x 2 x i64> %0, +    ptr %1, +    <vscale x 2 x i16> %2, +    <vscale x 2 x i1> %3, +    iXLen %4) + +  ret void +} + +declare void @llvm.riscv.vsoxei.nxv4i64.nxv4i16( +  <vscale x 4 x i64>, +  ptr, +  <vscale x 4 x i16>, +  iXLen); + +define void @intrinsic_vsoxei_v_nxv4i64_nxv4i64_nxv4i16(<vscale x 4 x i64> %0, ptr %1, <vscale x 4 x i16> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv4i64_nxv4i64_nxv4i16: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e64, m4, ta, ma +; CHECK-NEXT:    vsoxei16.v v8, (a0), v12 +; CHECK-NEXT:    ret +entry: +  call void @llvm.riscv.vsoxei.nxv4i64.nxv4i16( +    <vscale x 4 x i64> %0, +    ptr %1, +    <vscale x 4 x i16> %2, +    iXLen %3) + +  ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv4i64.nxv4i16( +  <vscale x 4 x i64>, +  ptr, +  <vscale x 4 x i16>, +  <vscale x 4 x i1>, +  iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv4i64_nxv4i64_nxv4i16(<vscale x 4 x i64> %0, ptr %1, <vscale x 4 x i16> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv4i64_nxv4i64_nxv4i16: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e64, m4, ta, ma +; CHECK-NEXT:    vsoxei16.v v8, (a0), v12, v0.t +; CHECK-NEXT:    ret +entry: +  call void @llvm.riscv.vsoxei.mask.nxv4i64.nxv4i16( +    <vscale x 4 x i64> %0, +    ptr %1, +    <vscale x 4 x i16> %2, +    
<vscale x 4 x i1> %3, +    iXLen %4) + +  ret void +} + +declare void @llvm.riscv.vsoxei.nxv8i64.nxv8i16( +  <vscale x 8 x i64>, +  ptr, +  <vscale x 8 x i16>, +  iXLen); + +define void @intrinsic_vsoxei_v_nxv8i64_nxv8i64_nxv8i16(<vscale x 8 x i64> %0, ptr %1, <vscale x 8 x i16> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv8i64_nxv8i64_nxv8i16: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e64, m8, ta, ma +; CHECK-NEXT:    vsoxei16.v v8, (a0), v16 +; CHECK-NEXT:    ret +entry: +  call void @llvm.riscv.vsoxei.nxv8i64.nxv8i16( +    <vscale x 8 x i64> %0, +    ptr %1, +    <vscale x 8 x i16> %2, +    iXLen %3) + +  ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv8i64.nxv8i16( +  <vscale x 8 x i64>, +  ptr, +  <vscale x 8 x i16>, +  <vscale x 8 x i1>, +  iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv8i64_nxv8i64_nxv8i16(<vscale x 8 x i64> %0, ptr %1, <vscale x 8 x i16> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv8i64_nxv8i64_nxv8i16: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e64, m8, ta, ma +; CHECK-NEXT:    vsoxei16.v v8, (a0), v16, v0.t +; CHECK-NEXT:    ret +entry: +  call void @llvm.riscv.vsoxei.mask.nxv8i64.nxv8i16( +    <vscale x 8 x i64> %0, +    ptr %1, +    <vscale x 8 x i16> %2, +    <vscale x 8 x i1> %3, +    iXLen %4) + +  ret void +} + +declare void @llvm.riscv.vsoxei.nxv1f16.nxv1i16( +  <vscale x 1 x half>, +  ptr, +  <vscale x 1 x i16>, +  iXLen); + +define void @intrinsic_vsoxei_v_nxv1f16_nxv1f16_nxv1i16(<vscale x 1 x half> %0, ptr %1, <vscale x 1 x i16> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv1f16_nxv1f16_nxv1i16: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e16, mf4, ta, ma +; CHECK-NEXT:    vsoxei16.v v8, (a0), v9 +; CHECK-NEXT:    ret +entry: +  call void @llvm.riscv.vsoxei.nxv1f16.nxv1i16( +    <vscale x 1 x half> %0, +    ptr %1, +    <vscale x 1 x i16> %2, +    iXLen %3) + +  ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv1f16.nxv1i16( +  <vscale x 1 x half>, +  ptr, +  <vscale x 1 x i16>, +  <vscale x 1 x i1>, +  iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv1f16_nxv1f16_nxv1i16(<vscale x 1 x half> %0, ptr %1, <vscale x 1 x i16> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv1f16_nxv1f16_nxv1i16: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e16, mf4, ta, ma +; CHECK-NEXT:    vsoxei16.v v8, (a0), v9, v0.t +; CHECK-NEXT:    ret +entry: +  call void @llvm.riscv.vsoxei.mask.nxv1f16.nxv1i16( +    <vscale x 1 x half> %0, +    ptr %1, +    <vscale x 1 x i16> %2, +    <vscale x 1 x i1> %3, +    iXLen %4) + +  ret void +} + +declare void @llvm.riscv.vsoxei.nxv2f16.nxv2i16( +  <vscale x 2 x half>, +  ptr, +  <vscale x 2 x i16>, +  iXLen); + +define void @intrinsic_vsoxei_v_nxv2f16_nxv2f16_nxv2i16(<vscale x 2 x half> %0, ptr %1, <vscale x 2 x i16> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv2f16_nxv2f16_nxv2i16: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e16, mf2, ta, ma +; CHECK-NEXT:    vsoxei16.v v8, (a0), v9 +; CHECK-NEXT:    ret +entry: +  call void @llvm.riscv.vsoxei.nxv2f16.nxv2i16( +    <vscale x 2 x half> %0, +    ptr %1, +    <vscale x 2 x i16> %2, +    iXLen %3) + +  ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv2f16.nxv2i16( +  <vscale x 2 x half>, +  ptr, +  <vscale x 2 x i16>, +  <vscale x 2 x i1>, +  iXLen); + +define void 
@intrinsic_vsoxei_mask_v_nxv2f16_nxv2f16_nxv2i16(<vscale x 2 x half> %0, ptr %1, <vscale x 2 x i16> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv2f16_nxv2f16_nxv2i16: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e16, mf2, ta, ma +; CHECK-NEXT:    vsoxei16.v v8, (a0), v9, v0.t +; CHECK-NEXT:    ret +entry: +  call void @llvm.riscv.vsoxei.mask.nxv2f16.nxv2i16( +    <vscale x 2 x half> %0, +    ptr %1, +    <vscale x 2 x i16> %2, +    <vscale x 2 x i1> %3, +    iXLen %4) + +  ret void +} + +declare void @llvm.riscv.vsoxei.nxv4f16.nxv4i16( +  <vscale x 4 x half>, +  ptr, +  <vscale x 4 x i16>, +  iXLen); + +define void @intrinsic_vsoxei_v_nxv4f16_nxv4f16_nxv4i16(<vscale x 4 x half> %0, ptr %1, <vscale x 4 x i16> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv4f16_nxv4f16_nxv4i16: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e16, m1, ta, ma +; CHECK-NEXT:    vsoxei16.v v8, (a0), v9 +; CHECK-NEXT:    ret +entry: +  call void @llvm.riscv.vsoxei.nxv4f16.nxv4i16( +    <vscale x 4 x half> %0, +    ptr %1, +    <vscale x 4 x i16> %2, +    iXLen %3) + +  ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv4f16.nxv4i16( +  <vscale x 4 x half>, +  ptr, +  <vscale x 4 x i16>, +  <vscale x 4 x i1>, +  iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv4f16_nxv4f16_nxv4i16(<vscale x 4 x half> %0, ptr %1, <vscale x 4 x i16> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv4f16_nxv4f16_nxv4i16: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e16, m1, ta, ma +; CHECK-NEXT:    vsoxei16.v v8, (a0), v9, v0.t +; CHECK-NEXT:    ret +entry: +  call void @llvm.riscv.vsoxei.mask.nxv4f16.nxv4i16( +    <vscale x 4 x half> %0, +    ptr %1, +    <vscale x 4 x i16> %2, +    <vscale x 4 x i1> %3, +    iXLen %4) + +  ret void +} + +declare void @llvm.riscv.vsoxei.nxv8f16.nxv8i16( +  <vscale x 8 x half>, +  ptr, +  <vscale x 8 x i16>, +  iXLen); + +define void @intrinsic_vsoxei_v_nxv8f16_nxv8f16_nxv8i16(<vscale x 8 x half> %0, ptr %1, <vscale x 8 x i16> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv8f16_nxv8f16_nxv8i16: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e16, m2, ta, ma +; CHECK-NEXT:    vsoxei16.v v8, (a0), v10 +; CHECK-NEXT:    ret +entry: +  call void @llvm.riscv.vsoxei.nxv8f16.nxv8i16( +    <vscale x 8 x half> %0, +    ptr %1, +    <vscale x 8 x i16> %2, +    iXLen %3) + +  ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv8f16.nxv8i16( +  <vscale x 8 x half>, +  ptr, +  <vscale x 8 x i16>, +  <vscale x 8 x i1>, +  iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv8f16_nxv8f16_nxv8i16(<vscale x 8 x half> %0, ptr %1, <vscale x 8 x i16> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv8f16_nxv8f16_nxv8i16: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e16, m2, ta, ma +; CHECK-NEXT:    vsoxei16.v v8, (a0), v10, v0.t +; CHECK-NEXT:    ret +entry: +  call void @llvm.riscv.vsoxei.mask.nxv8f16.nxv8i16( +    <vscale x 8 x half> %0, +    ptr %1, +    <vscale x 8 x i16> %2, +    <vscale x 8 x i1> %3, +    iXLen %4) + +  ret void +} + +declare void @llvm.riscv.vsoxei.nxv16f16.nxv16i16( +  <vscale x 16 x half>, +  ptr, +  <vscale x 16 x i16>, +  iXLen); + +define void @intrinsic_vsoxei_v_nxv16f16_nxv16f16_nxv16i16(<vscale x 16 x half> %0, ptr %1, <vscale x 16 x i16> %2, iXLen %3) nounwind { +; CHECK-LABEL: 
intrinsic_vsoxei_v_nxv16f16_nxv16f16_nxv16i16: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e16, m4, ta, ma +; CHECK-NEXT:    vsoxei16.v v8, (a0), v12 +; CHECK-NEXT:    ret +entry: +  call void @llvm.riscv.vsoxei.nxv16f16.nxv16i16( +    <vscale x 16 x half> %0, +    ptr %1, +    <vscale x 16 x i16> %2, +    iXLen %3) + +  ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv16f16.nxv16i16( +  <vscale x 16 x half>, +  ptr, +  <vscale x 16 x i16>, +  <vscale x 16 x i1>, +  iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv16f16_nxv16f16_nxv16i16(<vscale x 16 x half> %0, ptr %1, <vscale x 16 x i16> %2, <vscale x 16 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv16f16_nxv16f16_nxv16i16: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e16, m4, ta, ma +; CHECK-NEXT:    vsoxei16.v v8, (a0), v12, v0.t +; CHECK-NEXT:    ret +entry: +  call void @llvm.riscv.vsoxei.mask.nxv16f16.nxv16i16( +    <vscale x 16 x half> %0, +    ptr %1, +    <vscale x 16 x i16> %2, +    <vscale x 16 x i1> %3, +    iXLen %4) + +  ret void +} + +declare void @llvm.riscv.vsoxei.nxv32f16.nxv32i16( +  <vscale x 32 x half>, +  ptr, +  <vscale x 32 x i16>, +  iXLen); + +define void @intrinsic_vsoxei_v_nxv32f16_nxv32f16_nxv32i16(<vscale x 32 x half> %0, ptr %1, <vscale x 32 x i16> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv32f16_nxv32f16_nxv32i16: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e16, m8, ta, ma +; CHECK-NEXT:    vsoxei16.v v8, (a0), v16 +; CHECK-NEXT:    ret +entry: +  call void @llvm.riscv.vsoxei.nxv32f16.nxv32i16( +    <vscale x 32 x half> %0, +    ptr %1, +    <vscale x 32 x i16> %2, +    iXLen %3) + +  ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv32f16.nxv32i16( +  <vscale x 32 x half>, +  ptr, +  <vscale x 32 x i16>, +  <vscale x 32 x i1>, +  iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv32f16_nxv32f16_nxv32i16(<vscale x 32 x half> %0, ptr %1, <vscale x 32 x i16> %2, <vscale x 32 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv32f16_nxv32f16_nxv32i16: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e16, m8, ta, ma +; CHECK-NEXT:    vsoxei16.v v8, (a0), v16, v0.t +; CHECK-NEXT:    ret +entry: +  call void @llvm.riscv.vsoxei.mask.nxv32f16.nxv32i16( +    <vscale x 32 x half> %0, +    ptr %1, +    <vscale x 32 x i16> %2, +    <vscale x 32 x i1> %3, +    iXLen %4) + +  ret void +} + +declare void @llvm.riscv.vsoxei.nxv1f32.nxv1i16( +  <vscale x 1 x float>, +  ptr, +  <vscale x 1 x i16>, +  iXLen); + +define void @intrinsic_vsoxei_v_nxv1f32_nxv1f32_nxv1i16(<vscale x 1 x float> %0, ptr %1, <vscale x 1 x i16> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv1f32_nxv1f32_nxv1i16: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e32, mf2, ta, ma +; CHECK-NEXT:    vsoxei16.v v8, (a0), v9 +; CHECK-NEXT:    ret +entry: +  call void @llvm.riscv.vsoxei.nxv1f32.nxv1i16( +    <vscale x 1 x float> %0, +    ptr %1, +    <vscale x 1 x i16> %2, +    iXLen %3) + +  ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv1f32.nxv1i16( +  <vscale x 1 x float>, +  ptr, +  <vscale x 1 x i16>, +  <vscale x 1 x i1>, +  iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv1f32_nxv1f32_nxv1i16(<vscale x 1 x float> %0, ptr %1, <vscale x 1 x i16> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv1f32_nxv1f32_nxv1i16: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e32, 
mf2, ta, ma +; CHECK-NEXT:    vsoxei16.v v8, (a0), v9, v0.t +; CHECK-NEXT:    ret +entry: +  call void @llvm.riscv.vsoxei.mask.nxv1f32.nxv1i16( +    <vscale x 1 x float> %0, +    ptr %1, +    <vscale x 1 x i16> %2, +    <vscale x 1 x i1> %3, +    iXLen %4) + +  ret void +} + +declare void @llvm.riscv.vsoxei.nxv2f32.nxv2i16( +  <vscale x 2 x float>, +  ptr, +  <vscale x 2 x i16>, +  iXLen); + +define void @intrinsic_vsoxei_v_nxv2f32_nxv2f32_nxv2i16(<vscale x 2 x float> %0, ptr %1, <vscale x 2 x i16> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv2f32_nxv2f32_nxv2i16: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e32, m1, ta, ma +; CHECK-NEXT:    vsoxei16.v v8, (a0), v9 +; CHECK-NEXT:    ret +entry: +  call void @llvm.riscv.vsoxei.nxv2f32.nxv2i16( +    <vscale x 2 x float> %0, +    ptr %1, +    <vscale x 2 x i16> %2, +    iXLen %3) + +  ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv2f32.nxv2i16( +  <vscale x 2 x float>, +  ptr, +  <vscale x 2 x i16>, +  <vscale x 2 x i1>, +  iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv2f32_nxv2f32_nxv2i16(<vscale x 2 x float> %0, ptr %1, <vscale x 2 x i16> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv2f32_nxv2f32_nxv2i16: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e32, m1, ta, ma +; CHECK-NEXT:    vsoxei16.v v8, (a0), v9, v0.t +; CHECK-NEXT:    ret +entry: +  call void @llvm.riscv.vsoxei.mask.nxv2f32.nxv2i16( +    <vscale x 2 x float> %0, +    ptr %1, +    <vscale x 2 x i16> %2, +    <vscale x 2 x i1> %3, +    iXLen %4) + +  ret void +} + +declare void @llvm.riscv.vsoxei.nxv4f32.nxv4i16( +  <vscale x 4 x float>, +  ptr, +  <vscale x 4 x i16>, +  iXLen); + +define void @intrinsic_vsoxei_v_nxv4f32_nxv4f32_nxv4i16(<vscale x 4 x float> %0, ptr %1, <vscale x 4 x i16> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv4f32_nxv4f32_nxv4i16: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e32, m2, ta, ma +; CHECK-NEXT:    vsoxei16.v v8, (a0), v10 +; CHECK-NEXT:    ret +entry: +  call void @llvm.riscv.vsoxei.nxv4f32.nxv4i16( +    <vscale x 4 x float> %0, +    ptr %1, +    <vscale x 4 x i16> %2, +    iXLen %3) + +  ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv4f32.nxv4i16( +  <vscale x 4 x float>, +  ptr, +  <vscale x 4 x i16>, +  <vscale x 4 x i1>, +  iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv4f32_nxv4f32_nxv4i16(<vscale x 4 x float> %0, ptr %1, <vscale x 4 x i16> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv4f32_nxv4f32_nxv4i16: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e32, m2, ta, ma +; CHECK-NEXT:    vsoxei16.v v8, (a0), v10, v0.t +; CHECK-NEXT:    ret +entry: +  call void @llvm.riscv.vsoxei.mask.nxv4f32.nxv4i16( +    <vscale x 4 x float> %0, +    ptr %1, +    <vscale x 4 x i16> %2, +    <vscale x 4 x i1> %3, +    iXLen %4) + +  ret void +} + +declare void @llvm.riscv.vsoxei.nxv8f32.nxv8i16( +  <vscale x 8 x float>, +  ptr, +  <vscale x 8 x i16>, +  iXLen); + +define void @intrinsic_vsoxei_v_nxv8f32_nxv8f32_nxv8i16(<vscale x 8 x float> %0, ptr %1, <vscale x 8 x i16> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv8f32_nxv8f32_nxv8i16: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e32, m4, ta, ma +; CHECK-NEXT:    vsoxei16.v v8, (a0), v12 +; CHECK-NEXT:    ret +entry: +  call void @llvm.riscv.vsoxei.nxv8f32.nxv8i16( +    <vscale x 8 x float> %0, +    ptr %1, +    <vscale x 8 x 
i16> %2, +    iXLen %3) + +  ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv8f32.nxv8i16( +  <vscale x 8 x float>, +  ptr, +  <vscale x 8 x i16>, +  <vscale x 8 x i1>, +  iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv8f32_nxv8f32_nxv8i16(<vscale x 8 x float> %0, ptr %1, <vscale x 8 x i16> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv8f32_nxv8f32_nxv8i16: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e32, m4, ta, ma +; CHECK-NEXT:    vsoxei16.v v8, (a0), v12, v0.t +; CHECK-NEXT:    ret +entry: +  call void @llvm.riscv.vsoxei.mask.nxv8f32.nxv8i16( +    <vscale x 8 x float> %0, +    ptr %1, +    <vscale x 8 x i16> %2, +    <vscale x 8 x i1> %3, +    iXLen %4) + +  ret void +} + +declare void @llvm.riscv.vsoxei.nxv16f32.nxv16i16( +  <vscale x 16 x float>, +  ptr, +  <vscale x 16 x i16>, +  iXLen); + +define void @intrinsic_vsoxei_v_nxv16f32_nxv16f32_nxv16i16(<vscale x 16 x float> %0, ptr %1, <vscale x 16 x i16> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv16f32_nxv16f32_nxv16i16: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e32, m8, ta, ma +; CHECK-NEXT:    vsoxei16.v v8, (a0), v16 +; CHECK-NEXT:    ret +entry: +  call void @llvm.riscv.vsoxei.nxv16f32.nxv16i16( +    <vscale x 16 x float> %0, +    ptr %1, +    <vscale x 16 x i16> %2, +    iXLen %3) + +  ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv16f32.nxv16i16( +  <vscale x 16 x float>, +  ptr, +  <vscale x 16 x i16>, +  <vscale x 16 x i1>, +  iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv16f32_nxv16f32_nxv16i16(<vscale x 16 x float> %0, ptr %1, <vscale x 16 x i16> %2, <vscale x 16 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv16f32_nxv16f32_nxv16i16: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e32, m8, ta, ma +; CHECK-NEXT:    vsoxei16.v v8, (a0), v16, v0.t +; CHECK-NEXT:    ret +entry: +  call void @llvm.riscv.vsoxei.mask.nxv16f32.nxv16i16( +    <vscale x 16 x float> %0, +    ptr %1, +    <vscale x 16 x i16> %2, +    <vscale x 16 x i1> %3, +    iXLen %4) + +  ret void +} + +declare void @llvm.riscv.vsoxei.nxv1f64.nxv1i16( +  <vscale x 1 x double>, +  ptr, +  <vscale x 1 x i16>, +  iXLen); + +define void @intrinsic_vsoxei_v_nxv1f64_nxv1f64_nxv1i16(<vscale x 1 x double> %0, ptr %1, <vscale x 1 x i16> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv1f64_nxv1f64_nxv1i16: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e64, m1, ta, ma +; CHECK-NEXT:    vsoxei16.v v8, (a0), v9 +; CHECK-NEXT:    ret +entry: +  call void @llvm.riscv.vsoxei.nxv1f64.nxv1i16( +    <vscale x 1 x double> %0, +    ptr %1, +    <vscale x 1 x i16> %2, +    iXLen %3) + +  ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv1f64.nxv1i16( +  <vscale x 1 x double>, +  ptr, +  <vscale x 1 x i16>, +  <vscale x 1 x i1>, +  iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv1f64_nxv1f64_nxv1i16(<vscale x 1 x double> %0, ptr %1, <vscale x 1 x i16> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv1f64_nxv1f64_nxv1i16: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e64, m1, ta, ma +; CHECK-NEXT:    vsoxei16.v v8, (a0), v9, v0.t +; CHECK-NEXT:    ret +entry: +  call void @llvm.riscv.vsoxei.mask.nxv1f64.nxv1i16( +    <vscale x 1 x double> %0, +    ptr %1, +    <vscale x 1 x i16> %2, +    <vscale x 1 x i1> %3, +    iXLen %4) + +  ret void +} + +declare void @llvm.riscv.vsoxei.nxv2f64.nxv2i16( + 
 <vscale x 2 x double>, +  ptr, +  <vscale x 2 x i16>, +  iXLen); + +define void @intrinsic_vsoxei_v_nxv2f64_nxv2f64_nxv2i16(<vscale x 2 x double> %0, ptr %1, <vscale x 2 x i16> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv2f64_nxv2f64_nxv2i16: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e64, m2, ta, ma +; CHECK-NEXT:    vsoxei16.v v8, (a0), v10 +; CHECK-NEXT:    ret +entry: +  call void @llvm.riscv.vsoxei.nxv2f64.nxv2i16( +    <vscale x 2 x double> %0, +    ptr %1, +    <vscale x 2 x i16> %2, +    iXLen %3) + +  ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv2f64.nxv2i16( +  <vscale x 2 x double>, +  ptr, +  <vscale x 2 x i16>, +  <vscale x 2 x i1>, +  iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv2f64_nxv2f64_nxv2i16(<vscale x 2 x double> %0, ptr %1, <vscale x 2 x i16> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv2f64_nxv2f64_nxv2i16: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e64, m2, ta, ma +; CHECK-NEXT:    vsoxei16.v v8, (a0), v10, v0.t +; CHECK-NEXT:    ret +entry: +  call void @llvm.riscv.vsoxei.mask.nxv2f64.nxv2i16( +    <vscale x 2 x double> %0, +    ptr %1, +    <vscale x 2 x i16> %2, +    <vscale x 2 x i1> %3, +    iXLen %4) + +  ret void +} + +declare void @llvm.riscv.vsoxei.nxv4f64.nxv4i16( +  <vscale x 4 x double>, +  ptr, +  <vscale x 4 x i16>, +  iXLen); + +define void @intrinsic_vsoxei_v_nxv4f64_nxv4f64_nxv4i16(<vscale x 4 x double> %0, ptr %1, <vscale x 4 x i16> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv4f64_nxv4f64_nxv4i16: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e64, m4, ta, ma +; CHECK-NEXT:    vsoxei16.v v8, (a0), v12 +; CHECK-NEXT:    ret +entry: +  call void @llvm.riscv.vsoxei.nxv4f64.nxv4i16( +    <vscale x 4 x double> %0, +    ptr %1, +    <vscale x 4 x i16> %2, +    iXLen %3) + +  ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv4f64.nxv4i16( +  <vscale x 4 x double>, +  ptr, +  <vscale x 4 x i16>, +  <vscale x 4 x i1>, +  iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv4f64_nxv4f64_nxv4i16(<vscale x 4 x double> %0, ptr %1, <vscale x 4 x i16> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv4f64_nxv4f64_nxv4i16: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e64, m4, ta, ma +; CHECK-NEXT:    vsoxei16.v v8, (a0), v12, v0.t +; CHECK-NEXT:    ret +entry: +  call void @llvm.riscv.vsoxei.mask.nxv4f64.nxv4i16( +    <vscale x 4 x double> %0, +    ptr %1, +    <vscale x 4 x i16> %2, +    <vscale x 4 x i1> %3, +    iXLen %4) + +  ret void +} + +declare void @llvm.riscv.vsoxei.nxv8f64.nxv8i16( +  <vscale x 8 x double>, +  ptr, +  <vscale x 8 x i16>, +  iXLen); + +define void @intrinsic_vsoxei_v_nxv8f64_nxv8f64_nxv8i16(<vscale x 8 x double> %0, ptr %1, <vscale x 8 x i16> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv8f64_nxv8f64_nxv8i16: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e64, m8, ta, ma +; CHECK-NEXT:    vsoxei16.v v8, (a0), v16 +; CHECK-NEXT:    ret +entry: +  call void @llvm.riscv.vsoxei.nxv8f64.nxv8i16( +    <vscale x 8 x double> %0, +    ptr %1, +    <vscale x 8 x i16> %2, +    iXLen %3) + +  ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv8f64.nxv8i16( +  <vscale x 8 x double>, +  ptr, +  <vscale x 8 x i16>, +  <vscale x 8 x i1>, +  iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv8f64_nxv8f64_nxv8i16(<vscale x 8 x double> %0, ptr %1, <vscale x 8 x i16> %2, 
<vscale x 8 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv8f64_nxv8f64_nxv8i16: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e64, m8, ta, ma +; CHECK-NEXT:    vsoxei16.v v8, (a0), v16, v0.t +; CHECK-NEXT:    ret +entry: +  call void @llvm.riscv.vsoxei.mask.nxv8f64.nxv8i16( +    <vscale x 8 x double> %0, +    ptr %1, +    <vscale x 8 x i16> %2, +    <vscale x 8 x i1> %3, +    iXLen %4) + +  ret void +} + +declare void @llvm.riscv.vsoxei.nxv1i8.nxv1i8( +  <vscale x 1 x i8>, +  ptr, +  <vscale x 1 x i8>, +  iXLen); + +define void @intrinsic_vsoxei_v_nxv1i8_nxv1i8_nxv1i8(<vscale x 1 x i8> %0, ptr %1, <vscale x 1 x i8> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv1i8_nxv1i8_nxv1i8: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e8, mf8, ta, ma +; CHECK-NEXT:    vsoxei8.v v8, (a0), v9 +; CHECK-NEXT:    ret +entry: +  call void @llvm.riscv.vsoxei.nxv1i8.nxv1i8( +    <vscale x 1 x i8> %0, +    ptr %1, +    <vscale x 1 x i8> %2, +    iXLen %3) + +  ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv1i8.nxv1i8( +  <vscale x 1 x i8>, +  ptr, +  <vscale x 1 x i8>, +  <vscale x 1 x i1>, +  iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv1i8_nxv1i8_nxv1i8(<vscale x 1 x i8> %0, ptr %1, <vscale x 1 x i8> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv1i8_nxv1i8_nxv1i8: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e8, mf8, ta, ma +; CHECK-NEXT:    vsoxei8.v v8, (a0), v9, v0.t +; CHECK-NEXT:    ret +entry: +  call void @llvm.riscv.vsoxei.mask.nxv1i8.nxv1i8( +    <vscale x 1 x i8> %0, +    ptr %1, +    <vscale x 1 x i8> %2, +    <vscale x 1 x i1> %3, +    iXLen %4) + +  ret void +} + +declare void @llvm.riscv.vsoxei.nxv2i8.nxv2i8( +  <vscale x 2 x i8>, +  ptr, +  <vscale x 2 x i8>, +  iXLen); + +define void @intrinsic_vsoxei_v_nxv2i8_nxv2i8_nxv2i8(<vscale x 2 x i8> %0, ptr %1, <vscale x 2 x i8> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv2i8_nxv2i8_nxv2i8: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e8, mf4, ta, ma +; CHECK-NEXT:    vsoxei8.v v8, (a0), v9 +; CHECK-NEXT:    ret +entry: +  call void @llvm.riscv.vsoxei.nxv2i8.nxv2i8( +    <vscale x 2 x i8> %0, +    ptr %1, +    <vscale x 2 x i8> %2, +    iXLen %3) + +  ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv2i8.nxv2i8( +  <vscale x 2 x i8>, +  ptr, +  <vscale x 2 x i8>, +  <vscale x 2 x i1>, +  iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv2i8_nxv2i8_nxv2i8(<vscale x 2 x i8> %0, ptr %1, <vscale x 2 x i8> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv2i8_nxv2i8_nxv2i8: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e8, mf4, ta, ma +; CHECK-NEXT:    vsoxei8.v v8, (a0), v9, v0.t +; CHECK-NEXT:    ret +entry: +  call void @llvm.riscv.vsoxei.mask.nxv2i8.nxv2i8( +    <vscale x 2 x i8> %0, +    ptr %1, +    <vscale x 2 x i8> %2, +    <vscale x 2 x i1> %3, +    iXLen %4) + +  ret void +} + +declare void @llvm.riscv.vsoxei.nxv4i8.nxv4i8( +  <vscale x 4 x i8>, +  ptr, +  <vscale x 4 x i8>, +  iXLen); + +define void @intrinsic_vsoxei_v_nxv4i8_nxv4i8_nxv4i8(<vscale x 4 x i8> %0, ptr %1, <vscale x 4 x i8> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv4i8_nxv4i8_nxv4i8: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e8, mf2, ta, ma +; CHECK-NEXT:    vsoxei8.v v8, (a0), v9 +; CHECK-NEXT:    ret +entry: +  call void 
@llvm.riscv.vsoxei.nxv4i8.nxv4i8( +    <vscale x 4 x i8> %0, +    ptr %1, +    <vscale x 4 x i8> %2, +    iXLen %3) + +  ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv4i8.nxv4i8( +  <vscale x 4 x i8>, +  ptr, +  <vscale x 4 x i8>, +  <vscale x 4 x i1>, +  iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv4i8_nxv4i8_nxv4i8(<vscale x 4 x i8> %0, ptr %1, <vscale x 4 x i8> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv4i8_nxv4i8_nxv4i8: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e8, mf2, ta, ma +; CHECK-NEXT:    vsoxei8.v v8, (a0), v9, v0.t +; CHECK-NEXT:    ret +entry: +  call void @llvm.riscv.vsoxei.mask.nxv4i8.nxv4i8( +    <vscale x 4 x i8> %0, +    ptr %1, +    <vscale x 4 x i8> %2, +    <vscale x 4 x i1> %3, +    iXLen %4) + +  ret void +} + +declare void @llvm.riscv.vsoxei.nxv8i8.nxv8i8( +  <vscale x 8 x i8>, +  ptr, +  <vscale x 8 x i8>, +  iXLen); + +define void @intrinsic_vsoxei_v_nxv8i8_nxv8i8_nxv8i8(<vscale x 8 x i8> %0, ptr %1, <vscale x 8 x i8> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv8i8_nxv8i8_nxv8i8: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e8, m1, ta, ma +; CHECK-NEXT:    vsoxei8.v v8, (a0), v9 +; CHECK-NEXT:    ret +entry: +  call void @llvm.riscv.vsoxei.nxv8i8.nxv8i8( +    <vscale x 8 x i8> %0, +    ptr %1, +    <vscale x 8 x i8> %2, +    iXLen %3) + +  ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv8i8.nxv8i8( +  <vscale x 8 x i8>, +  ptr, +  <vscale x 8 x i8>, +  <vscale x 8 x i1>, +  iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv8i8_nxv8i8_nxv8i8(<vscale x 8 x i8> %0, ptr %1, <vscale x 8 x i8> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv8i8_nxv8i8_nxv8i8: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e8, m1, ta, ma +; CHECK-NEXT:    vsoxei8.v v8, (a0), v9, v0.t +; CHECK-NEXT:    ret +entry: +  call void @llvm.riscv.vsoxei.mask.nxv8i8.nxv8i8( +    <vscale x 8 x i8> %0, +    ptr %1, +    <vscale x 8 x i8> %2, +    <vscale x 8 x i1> %3, +    iXLen %4) + +  ret void +} + +declare void @llvm.riscv.vsoxei.nxv16i8.nxv16i8( +  <vscale x 16 x i8>, +  ptr, +  <vscale x 16 x i8>, +  iXLen); + +define void @intrinsic_vsoxei_v_nxv16i8_nxv16i8_nxv16i8(<vscale x 16 x i8> %0, ptr %1, <vscale x 16 x i8> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv16i8_nxv16i8_nxv16i8: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e8, m2, ta, ma +; CHECK-NEXT:    vsoxei8.v v8, (a0), v10 +; CHECK-NEXT:    ret +entry: +  call void @llvm.riscv.vsoxei.nxv16i8.nxv16i8( +    <vscale x 16 x i8> %0, +    ptr %1, +    <vscale x 16 x i8> %2, +    iXLen %3) + +  ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv16i8.nxv16i8( +  <vscale x 16 x i8>, +  ptr, +  <vscale x 16 x i8>, +  <vscale x 16 x i1>, +  iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv16i8_nxv16i8_nxv16i8(<vscale x 16 x i8> %0, ptr %1, <vscale x 16 x i8> %2, <vscale x 16 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv16i8_nxv16i8_nxv16i8: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e8, m2, ta, ma +; CHECK-NEXT:    vsoxei8.v v8, (a0), v10, v0.t +; CHECK-NEXT:    ret +entry: +  call void @llvm.riscv.vsoxei.mask.nxv16i8.nxv16i8( +    <vscale x 16 x i8> %0, +    ptr %1, +    <vscale x 16 x i8> %2, +    <vscale x 16 x i1> %3, +    iXLen %4) + +  ret void +} + +declare void @llvm.riscv.vsoxei.nxv32i8.nxv32i8( +  <vscale x 32 x i8>, +  ptr, +  
<vscale x 32 x i8>, +  iXLen); + +define void @intrinsic_vsoxei_v_nxv32i8_nxv32i8_nxv32i8(<vscale x 32 x i8> %0, ptr %1, <vscale x 32 x i8> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv32i8_nxv32i8_nxv32i8: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e8, m4, ta, ma +; CHECK-NEXT:    vsoxei8.v v8, (a0), v12 +; CHECK-NEXT:    ret +entry: +  call void @llvm.riscv.vsoxei.nxv32i8.nxv32i8( +    <vscale x 32 x i8> %0, +    ptr %1, +    <vscale x 32 x i8> %2, +    iXLen %3) + +  ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv32i8.nxv32i8( +  <vscale x 32 x i8>, +  ptr, +  <vscale x 32 x i8>, +  <vscale x 32 x i1>, +  iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv32i8_nxv32i8_nxv32i8(<vscale x 32 x i8> %0, ptr %1, <vscale x 32 x i8> %2, <vscale x 32 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv32i8_nxv32i8_nxv32i8: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e8, m4, ta, ma +; CHECK-NEXT:    vsoxei8.v v8, (a0), v12, v0.t +; CHECK-NEXT:    ret +entry: +  call void @llvm.riscv.vsoxei.mask.nxv32i8.nxv32i8( +    <vscale x 32 x i8> %0, +    ptr %1, +    <vscale x 32 x i8> %2, +    <vscale x 32 x i1> %3, +    iXLen %4) + +  ret void +} + +declare void @llvm.riscv.vsoxei.nxv64i8.nxv64i8( +  <vscale x 64 x i8>, +  ptr, +  <vscale x 64 x i8>, +  iXLen); + +define void @intrinsic_vsoxei_v_nxv64i8_nxv64i8_nxv64i8(<vscale x 64 x i8> %0, ptr %1, <vscale x 64 x i8> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv64i8_nxv64i8_nxv64i8: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e8, m8, ta, ma +; CHECK-NEXT:    vsoxei8.v v8, (a0), v16 +; CHECK-NEXT:    ret +entry: +  call void @llvm.riscv.vsoxei.nxv64i8.nxv64i8( +    <vscale x 64 x i8> %0, +    ptr %1, +    <vscale x 64 x i8> %2, +    iXLen %3) + +  ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv64i8.nxv64i8( +  <vscale x 64 x i8>, +  ptr, +  <vscale x 64 x i8>, +  <vscale x 64 x i1>, +  iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv64i8_nxv64i8_nxv64i8(<vscale x 64 x i8> %0, ptr %1, <vscale x 64 x i8> %2, <vscale x 64 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv64i8_nxv64i8_nxv64i8: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e8, m8, ta, ma +; CHECK-NEXT:    vsoxei8.v v8, (a0), v16, v0.t +; CHECK-NEXT:    ret +entry: +  call void @llvm.riscv.vsoxei.mask.nxv64i8.nxv64i8( +    <vscale x 64 x i8> %0, +    ptr %1, +    <vscale x 64 x i8> %2, +    <vscale x 64 x i1> %3, +    iXLen %4) + +  ret void +} + +declare void @llvm.riscv.vsoxei.nxv1i16.nxv1i8( +  <vscale x 1 x i16>, +  ptr, +  <vscale x 1 x i8>, +  iXLen); + +define void @intrinsic_vsoxei_v_nxv1i16_nxv1i16_nxv1i8(<vscale x 1 x i16> %0, ptr %1, <vscale x 1 x i8> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv1i16_nxv1i16_nxv1i8: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e16, mf4, ta, ma +; CHECK-NEXT:    vsoxei8.v v8, (a0), v9 +; CHECK-NEXT:    ret +entry: +  call void @llvm.riscv.vsoxei.nxv1i16.nxv1i8( +    <vscale x 1 x i16> %0, +    ptr %1, +    <vscale x 1 x i8> %2, +    iXLen %3) + +  ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv1i16.nxv1i8( +  <vscale x 1 x i16>, +  ptr, +  <vscale x 1 x i8>, +  <vscale x 1 x i1>, +  iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv1i16_nxv1i16_nxv1i8(<vscale x 1 x i16> %0, ptr %1, <vscale x 1 x i8> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: 
intrinsic_vsoxei_mask_v_nxv1i16_nxv1i16_nxv1i8: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e16, mf4, ta, ma +; CHECK-NEXT:    vsoxei8.v v8, (a0), v9, v0.t +; CHECK-NEXT:    ret +entry: +  call void @llvm.riscv.vsoxei.mask.nxv1i16.nxv1i8( +    <vscale x 1 x i16> %0, +    ptr %1, +    <vscale x 1 x i8> %2, +    <vscale x 1 x i1> %3, +    iXLen %4) + +  ret void +} + +declare void @llvm.riscv.vsoxei.nxv2i16.nxv2i8( +  <vscale x 2 x i16>, +  ptr, +  <vscale x 2 x i8>, +  iXLen); + +define void @intrinsic_vsoxei_v_nxv2i16_nxv2i16_nxv2i8(<vscale x 2 x i16> %0, ptr %1, <vscale x 2 x i8> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv2i16_nxv2i16_nxv2i8: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e16, mf2, ta, ma +; CHECK-NEXT:    vsoxei8.v v8, (a0), v9 +; CHECK-NEXT:    ret +entry: +  call void @llvm.riscv.vsoxei.nxv2i16.nxv2i8( +    <vscale x 2 x i16> %0, +    ptr %1, +    <vscale x 2 x i8> %2, +    iXLen %3) + +  ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv2i16.nxv2i8( +  <vscale x 2 x i16>, +  ptr, +  <vscale x 2 x i8>, +  <vscale x 2 x i1>, +  iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv2i16_nxv2i16_nxv2i8(<vscale x 2 x i16> %0, ptr %1, <vscale x 2 x i8> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv2i16_nxv2i16_nxv2i8: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e16, mf2, ta, ma +; CHECK-NEXT:    vsoxei8.v v8, (a0), v9, v0.t +; CHECK-NEXT:    ret +entry: +  call void @llvm.riscv.vsoxei.mask.nxv2i16.nxv2i8( +    <vscale x 2 x i16> %0, +    ptr %1, +    <vscale x 2 x i8> %2, +    <vscale x 2 x i1> %3, +    iXLen %4) + +  ret void +} + +declare void @llvm.riscv.vsoxei.nxv4i16.nxv4i8( +  <vscale x 4 x i16>, +  ptr, +  <vscale x 4 x i8>, +  iXLen); + +define void @intrinsic_vsoxei_v_nxv4i16_nxv4i16_nxv4i8(<vscale x 4 x i16> %0, ptr %1, <vscale x 4 x i8> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv4i16_nxv4i16_nxv4i8: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e16, m1, ta, ma +; CHECK-NEXT:    vsoxei8.v v8, (a0), v9 +; CHECK-NEXT:    ret +entry: +  call void @llvm.riscv.vsoxei.nxv4i16.nxv4i8( +    <vscale x 4 x i16> %0, +    ptr %1, +    <vscale x 4 x i8> %2, +    iXLen %3) + +  ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv4i16.nxv4i8( +  <vscale x 4 x i16>, +  ptr, +  <vscale x 4 x i8>, +  <vscale x 4 x i1>, +  iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv4i16_nxv4i16_nxv4i8(<vscale x 4 x i16> %0, ptr %1, <vscale x 4 x i8> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv4i16_nxv4i16_nxv4i8: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e16, m1, ta, ma +; CHECK-NEXT:    vsoxei8.v v8, (a0), v9, v0.t +; CHECK-NEXT:    ret +entry: +  call void @llvm.riscv.vsoxei.mask.nxv4i16.nxv4i8( +    <vscale x 4 x i16> %0, +    ptr %1, +    <vscale x 4 x i8> %2, +    <vscale x 4 x i1> %3, +    iXLen %4) + +  ret void +} + +declare void @llvm.riscv.vsoxei.nxv8i16.nxv8i8( +  <vscale x 8 x i16>, +  ptr, +  <vscale x 8 x i8>, +  iXLen); + +define void @intrinsic_vsoxei_v_nxv8i16_nxv8i16_nxv8i8(<vscale x 8 x i16> %0, ptr %1, <vscale x 8 x i8> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv8i16_nxv8i16_nxv8i8: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e16, m2, ta, ma +; CHECK-NEXT:    vsoxei8.v v8, (a0), v10 +; CHECK-NEXT:    ret +entry: +  call void @llvm.riscv.vsoxei.nxv8i16.nxv8i8( +    
<vscale x 8 x i16> %0, +    ptr %1, +    <vscale x 8 x i8> %2, +    iXLen %3) + +  ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv8i16.nxv8i8( +  <vscale x 8 x i16>, +  ptr, +  <vscale x 8 x i8>, +  <vscale x 8 x i1>, +  iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv8i16_nxv8i16_nxv8i8(<vscale x 8 x i16> %0, ptr %1, <vscale x 8 x i8> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv8i16_nxv8i16_nxv8i8: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e16, m2, ta, ma +; CHECK-NEXT:    vsoxei8.v v8, (a0), v10, v0.t +; CHECK-NEXT:    ret +entry: +  call void @llvm.riscv.vsoxei.mask.nxv8i16.nxv8i8( +    <vscale x 8 x i16> %0, +    ptr %1, +    <vscale x 8 x i8> %2, +    <vscale x 8 x i1> %3, +    iXLen %4) + +  ret void +} + +declare void @llvm.riscv.vsoxei.nxv16i16.nxv16i8( +  <vscale x 16 x i16>, +  ptr, +  <vscale x 16 x i8>, +  iXLen); + +define void @intrinsic_vsoxei_v_nxv16i16_nxv16i16_nxv16i8(<vscale x 16 x i16> %0, ptr %1, <vscale x 16 x i8> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv16i16_nxv16i16_nxv16i8: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e16, m4, ta, ma +; CHECK-NEXT:    vsoxei8.v v8, (a0), v12 +; CHECK-NEXT:    ret +entry: +  call void @llvm.riscv.vsoxei.nxv16i16.nxv16i8( +    <vscale x 16 x i16> %0, +    ptr %1, +    <vscale x 16 x i8> %2, +    iXLen %3) + +  ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv16i16.nxv16i8( +  <vscale x 16 x i16>, +  ptr, +  <vscale x 16 x i8>, +  <vscale x 16 x i1>, +  iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv16i16_nxv16i16_nxv16i8(<vscale x 16 x i16> %0, ptr %1, <vscale x 16 x i8> %2, <vscale x 16 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv16i16_nxv16i16_nxv16i8: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e16, m4, ta, ma +; CHECK-NEXT:    vsoxei8.v v8, (a0), v12, v0.t +; CHECK-NEXT:    ret +entry: +  call void @llvm.riscv.vsoxei.mask.nxv16i16.nxv16i8( +    <vscale x 16 x i16> %0, +    ptr %1, +    <vscale x 16 x i8> %2, +    <vscale x 16 x i1> %3, +    iXLen %4) + +  ret void +} + +declare void @llvm.riscv.vsoxei.nxv32i16.nxv32i8( +  <vscale x 32 x i16>, +  ptr, +  <vscale x 32 x i8>, +  iXLen); + +define void @intrinsic_vsoxei_v_nxv32i16_nxv32i16_nxv32i8(<vscale x 32 x i16> %0, ptr %1, <vscale x 32 x i8> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv32i16_nxv32i16_nxv32i8: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e16, m8, ta, ma +; CHECK-NEXT:    vsoxei8.v v8, (a0), v16 +; CHECK-NEXT:    ret +entry: +  call void @llvm.riscv.vsoxei.nxv32i16.nxv32i8( +    <vscale x 32 x i16> %0, +    ptr %1, +    <vscale x 32 x i8> %2, +    iXLen %3) + +  ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv32i16.nxv32i8( +  <vscale x 32 x i16>, +  ptr, +  <vscale x 32 x i8>, +  <vscale x 32 x i1>, +  iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv32i16_nxv32i16_nxv32i8(<vscale x 32 x i16> %0, ptr %1, <vscale x 32 x i8> %2, <vscale x 32 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv32i16_nxv32i16_nxv32i8: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e16, m8, ta, ma +; CHECK-NEXT:    vsoxei8.v v8, (a0), v16, v0.t +; CHECK-NEXT:    ret +entry: +  call void @llvm.riscv.vsoxei.mask.nxv32i16.nxv32i8( +    <vscale x 32 x i16> %0, +    ptr %1, +    <vscale x 32 x i8> %2, +    <vscale x 32 x i1> %3, +    iXLen %4) + +  ret void +} + +declare void 
@llvm.riscv.vsoxei.nxv1i32.nxv1i8( +  <vscale x 1 x i32>, +  ptr, +  <vscale x 1 x i8>, +  iXLen); + +define void @intrinsic_vsoxei_v_nxv1i32_nxv1i32_nxv1i8(<vscale x 1 x i32> %0, ptr %1, <vscale x 1 x i8> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv1i32_nxv1i32_nxv1i8: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e32, mf2, ta, ma +; CHECK-NEXT:    vsoxei8.v v8, (a0), v9 +; CHECK-NEXT:    ret +entry: +  call void @llvm.riscv.vsoxei.nxv1i32.nxv1i8( +    <vscale x 1 x i32> %0, +    ptr %1, +    <vscale x 1 x i8> %2, +    iXLen %3) + +  ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv1i32.nxv1i8( +  <vscale x 1 x i32>, +  ptr, +  <vscale x 1 x i8>, +  <vscale x 1 x i1>, +  iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv1i32_nxv1i32_nxv1i8(<vscale x 1 x i32> %0, ptr %1, <vscale x 1 x i8> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv1i32_nxv1i32_nxv1i8: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e32, mf2, ta, ma +; CHECK-NEXT:    vsoxei8.v v8, (a0), v9, v0.t +; CHECK-NEXT:    ret +entry: +  call void @llvm.riscv.vsoxei.mask.nxv1i32.nxv1i8( +    <vscale x 1 x i32> %0, +    ptr %1, +    <vscale x 1 x i8> %2, +    <vscale x 1 x i1> %3, +    iXLen %4) + +  ret void +} + +declare void @llvm.riscv.vsoxei.nxv2i32.nxv2i8( +  <vscale x 2 x i32>, +  ptr, +  <vscale x 2 x i8>, +  iXLen); + +define void @intrinsic_vsoxei_v_nxv2i32_nxv2i32_nxv2i8(<vscale x 2 x i32> %0, ptr %1, <vscale x 2 x i8> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv2i32_nxv2i32_nxv2i8: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e32, m1, ta, ma +; CHECK-NEXT:    vsoxei8.v v8, (a0), v9 +; CHECK-NEXT:    ret +entry: +  call void @llvm.riscv.vsoxei.nxv2i32.nxv2i8( +    <vscale x 2 x i32> %0, +    ptr %1, +    <vscale x 2 x i8> %2, +    iXLen %3) + +  ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv2i32.nxv2i8( +  <vscale x 2 x i32>, +  ptr, +  <vscale x 2 x i8>, +  <vscale x 2 x i1>, +  iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv2i32_nxv2i32_nxv2i8(<vscale x 2 x i32> %0, ptr %1, <vscale x 2 x i8> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv2i32_nxv2i32_nxv2i8: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e32, m1, ta, ma +; CHECK-NEXT:    vsoxei8.v v8, (a0), v9, v0.t +; CHECK-NEXT:    ret +entry: +  call void @llvm.riscv.vsoxei.mask.nxv2i32.nxv2i8( +    <vscale x 2 x i32> %0, +    ptr %1, +    <vscale x 2 x i8> %2, +    <vscale x 2 x i1> %3, +    iXLen %4) + +  ret void +} + +declare void @llvm.riscv.vsoxei.nxv4i32.nxv4i8( +  <vscale x 4 x i32>, +  ptr, +  <vscale x 4 x i8>, +  iXLen); + +define void @intrinsic_vsoxei_v_nxv4i32_nxv4i32_nxv4i8(<vscale x 4 x i32> %0, ptr %1, <vscale x 4 x i8> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv4i32_nxv4i32_nxv4i8: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e32, m2, ta, ma +; CHECK-NEXT:    vsoxei8.v v8, (a0), v10 +; CHECK-NEXT:    ret +entry: +  call void @llvm.riscv.vsoxei.nxv4i32.nxv4i8( +    <vscale x 4 x i32> %0, +    ptr %1, +    <vscale x 4 x i8> %2, +    iXLen %3) + +  ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv4i32.nxv4i8( +  <vscale x 4 x i32>, +  ptr, +  <vscale x 4 x i8>, +  <vscale x 4 x i1>, +  iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv4i32_nxv4i32_nxv4i8(<vscale x 4 x i32> %0, ptr %1, <vscale x 4 x i8> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: 
intrinsic_vsoxei_mask_v_nxv4i32_nxv4i32_nxv4i8: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e32, m2, ta, ma +; CHECK-NEXT:    vsoxei8.v v8, (a0), v10, v0.t +; CHECK-NEXT:    ret +entry: +  call void @llvm.riscv.vsoxei.mask.nxv4i32.nxv4i8( +    <vscale x 4 x i32> %0, +    ptr %1, +    <vscale x 4 x i8> %2, +    <vscale x 4 x i1> %3, +    iXLen %4) + +  ret void +} + +declare void @llvm.riscv.vsoxei.nxv8i32.nxv8i8( +  <vscale x 8 x i32>, +  ptr, +  <vscale x 8 x i8>, +  iXLen); + +define void @intrinsic_vsoxei_v_nxv8i32_nxv8i32_nxv8i8(<vscale x 8 x i32> %0, ptr %1, <vscale x 8 x i8> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv8i32_nxv8i32_nxv8i8: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e32, m4, ta, ma +; CHECK-NEXT:    vsoxei8.v v8, (a0), v12 +; CHECK-NEXT:    ret +entry: +  call void @llvm.riscv.vsoxei.nxv8i32.nxv8i8( +    <vscale x 8 x i32> %0, +    ptr %1, +    <vscale x 8 x i8> %2, +    iXLen %3) + +  ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv8i32.nxv8i8( +  <vscale x 8 x i32>, +  ptr, +  <vscale x 8 x i8>, +  <vscale x 8 x i1>, +  iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv8i32_nxv8i32_nxv8i8(<vscale x 8 x i32> %0, ptr %1, <vscale x 8 x i8> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv8i32_nxv8i32_nxv8i8: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e32, m4, ta, ma +; CHECK-NEXT:    vsoxei8.v v8, (a0), v12, v0.t +; CHECK-NEXT:    ret +entry: +  call void @llvm.riscv.vsoxei.mask.nxv8i32.nxv8i8( +    <vscale x 8 x i32> %0, +    ptr %1, +    <vscale x 8 x i8> %2, +    <vscale x 8 x i1> %3, +    iXLen %4) + +  ret void +} + +declare void @llvm.riscv.vsoxei.nxv16i32.nxv16i8( +  <vscale x 16 x i32>, +  ptr, +  <vscale x 16 x i8>, +  iXLen); + +define void @intrinsic_vsoxei_v_nxv16i32_nxv16i32_nxv16i8(<vscale x 16 x i32> %0, ptr %1, <vscale x 16 x i8> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv16i32_nxv16i32_nxv16i8: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e32, m8, ta, ma +; CHECK-NEXT:    vsoxei8.v v8, (a0), v16 +; CHECK-NEXT:    ret +entry: +  call void @llvm.riscv.vsoxei.nxv16i32.nxv16i8( +    <vscale x 16 x i32> %0, +    ptr %1, +    <vscale x 16 x i8> %2, +    iXLen %3) + +  ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv16i32.nxv16i8( +  <vscale x 16 x i32>, +  ptr, +  <vscale x 16 x i8>, +  <vscale x 16 x i1>, +  iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv16i32_nxv16i32_nxv16i8(<vscale x 16 x i32> %0, ptr %1, <vscale x 16 x i8> %2, <vscale x 16 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv16i32_nxv16i32_nxv16i8: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e32, m8, ta, ma +; CHECK-NEXT:    vsoxei8.v v8, (a0), v16, v0.t +; CHECK-NEXT:    ret +entry: +  call void @llvm.riscv.vsoxei.mask.nxv16i32.nxv16i8( +    <vscale x 16 x i32> %0, +    ptr %1, +    <vscale x 16 x i8> %2, +    <vscale x 16 x i1> %3, +    iXLen %4) + +  ret void +} + +declare void @llvm.riscv.vsoxei.nxv1i64.nxv1i8( +  <vscale x 1 x i64>, +  ptr, +  <vscale x 1 x i8>, +  iXLen); + +define void @intrinsic_vsoxei_v_nxv1i64_nxv1i64_nxv1i8(<vscale x 1 x i64> %0, ptr %1, <vscale x 1 x i8> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv1i64_nxv1i64_nxv1i8: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e64, m1, ta, ma +; CHECK-NEXT:    vsoxei8.v v8, (a0), v9 +; CHECK-NEXT:    ret +entry: +  call void 
@llvm.riscv.vsoxei.nxv1i64.nxv1i8( +    <vscale x 1 x i64> %0, +    ptr %1, +    <vscale x 1 x i8> %2, +    iXLen %3) + +  ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv1i64.nxv1i8( +  <vscale x 1 x i64>, +  ptr, +  <vscale x 1 x i8>, +  <vscale x 1 x i1>, +  iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv1i64_nxv1i64_nxv1i8(<vscale x 1 x i64> %0, ptr %1, <vscale x 1 x i8> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv1i64_nxv1i64_nxv1i8: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e64, m1, ta, ma +; CHECK-NEXT:    vsoxei8.v v8, (a0), v9, v0.t +; CHECK-NEXT:    ret +entry: +  call void @llvm.riscv.vsoxei.mask.nxv1i64.nxv1i8( +    <vscale x 1 x i64> %0, +    ptr %1, +    <vscale x 1 x i8> %2, +    <vscale x 1 x i1> %3, +    iXLen %4) + +  ret void +} + +declare void @llvm.riscv.vsoxei.nxv2i64.nxv2i8( +  <vscale x 2 x i64>, +  ptr, +  <vscale x 2 x i8>, +  iXLen); + +define void @intrinsic_vsoxei_v_nxv2i64_nxv2i64_nxv2i8(<vscale x 2 x i64> %0, ptr %1, <vscale x 2 x i8> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv2i64_nxv2i64_nxv2i8: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e64, m2, ta, ma +; CHECK-NEXT:    vsoxei8.v v8, (a0), v10 +; CHECK-NEXT:    ret +entry: +  call void @llvm.riscv.vsoxei.nxv2i64.nxv2i8( +    <vscale x 2 x i64> %0, +    ptr %1, +    <vscale x 2 x i8> %2, +    iXLen %3) + +  ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv2i64.nxv2i8( +  <vscale x 2 x i64>, +  ptr, +  <vscale x 2 x i8>, +  <vscale x 2 x i1>, +  iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv2i64_nxv2i64_nxv2i8(<vscale x 2 x i64> %0, ptr %1, <vscale x 2 x i8> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv2i64_nxv2i64_nxv2i8: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e64, m2, ta, ma +; CHECK-NEXT:    vsoxei8.v v8, (a0), v10, v0.t +; CHECK-NEXT:    ret +entry: +  call void @llvm.riscv.vsoxei.mask.nxv2i64.nxv2i8( +    <vscale x 2 x i64> %0, +    ptr %1, +    <vscale x 2 x i8> %2, +    <vscale x 2 x i1> %3, +    iXLen %4) + +  ret void +} + +declare void @llvm.riscv.vsoxei.nxv4i64.nxv4i8( +  <vscale x 4 x i64>, +  ptr, +  <vscale x 4 x i8>, +  iXLen); + +define void @intrinsic_vsoxei_v_nxv4i64_nxv4i64_nxv4i8(<vscale x 4 x i64> %0, ptr %1, <vscale x 4 x i8> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv4i64_nxv4i64_nxv4i8: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e64, m4, ta, ma +; CHECK-NEXT:    vsoxei8.v v8, (a0), v12 +; CHECK-NEXT:    ret +entry: +  call void @llvm.riscv.vsoxei.nxv4i64.nxv4i8( +    <vscale x 4 x i64> %0, +    ptr %1, +    <vscale x 4 x i8> %2, +    iXLen %3) + +  ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv4i64.nxv4i8( +  <vscale x 4 x i64>, +  ptr, +  <vscale x 4 x i8>, +  <vscale x 4 x i1>, +  iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv4i64_nxv4i64_nxv4i8(<vscale x 4 x i64> %0, ptr %1, <vscale x 4 x i8> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv4i64_nxv4i64_nxv4i8: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e64, m4, ta, ma +; CHECK-NEXT:    vsoxei8.v v8, (a0), v12, v0.t +; CHECK-NEXT:    ret +entry: +  call void @llvm.riscv.vsoxei.mask.nxv4i64.nxv4i8( +    <vscale x 4 x i64> %0, +    ptr %1, +    <vscale x 4 x i8> %2, +    <vscale x 4 x i1> %3, +    iXLen %4) + +  ret void +} + +declare void @llvm.riscv.vsoxei.nxv8i64.nxv8i8( +  <vscale x 8 x 
i64>, +  ptr, +  <vscale x 8 x i8>, +  iXLen); + +define void @intrinsic_vsoxei_v_nxv8i64_nxv8i64_nxv8i8(<vscale x 8 x i64> %0, ptr %1, <vscale x 8 x i8> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv8i64_nxv8i64_nxv8i8: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e64, m8, ta, ma +; CHECK-NEXT:    vsoxei8.v v8, (a0), v16 +; CHECK-NEXT:    ret +entry: +  call void @llvm.riscv.vsoxei.nxv8i64.nxv8i8( +    <vscale x 8 x i64> %0, +    ptr %1, +    <vscale x 8 x i8> %2, +    iXLen %3) + +  ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv8i64.nxv8i8( +  <vscale x 8 x i64>, +  ptr, +  <vscale x 8 x i8>, +  <vscale x 8 x i1>, +  iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv8i64_nxv8i64_nxv8i8(<vscale x 8 x i64> %0, ptr %1, <vscale x 8 x i8> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv8i64_nxv8i64_nxv8i8: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e64, m8, ta, ma +; CHECK-NEXT:    vsoxei8.v v8, (a0), v16, v0.t +; CHECK-NEXT:    ret +entry: +  call void @llvm.riscv.vsoxei.mask.nxv8i64.nxv8i8( +    <vscale x 8 x i64> %0, +    ptr %1, +    <vscale x 8 x i8> %2, +    <vscale x 8 x i1> %3, +    iXLen %4) + +  ret void +} + +declare void @llvm.riscv.vsoxei.nxv1f16.nxv1i8( +  <vscale x 1 x half>, +  ptr, +  <vscale x 1 x i8>, +  iXLen); + +define void @intrinsic_vsoxei_v_nxv1f16_nxv1f16_nxv1i8(<vscale x 1 x half> %0, ptr %1, <vscale x 1 x i8> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv1f16_nxv1f16_nxv1i8: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e16, mf4, ta, ma +; CHECK-NEXT:    vsoxei8.v v8, (a0), v9 +; CHECK-NEXT:    ret +entry: +  call void @llvm.riscv.vsoxei.nxv1f16.nxv1i8( +    <vscale x 1 x half> %0, +    ptr %1, +    <vscale x 1 x i8> %2, +    iXLen %3) + +  ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv1f16.nxv1i8( +  <vscale x 1 x half>, +  ptr, +  <vscale x 1 x i8>, +  <vscale x 1 x i1>, +  iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv1f16_nxv1f16_nxv1i8(<vscale x 1 x half> %0, ptr %1, <vscale x 1 x i8> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv1f16_nxv1f16_nxv1i8: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e16, mf4, ta, ma +; CHECK-NEXT:    vsoxei8.v v8, (a0), v9, v0.t +; CHECK-NEXT:    ret +entry: +  call void @llvm.riscv.vsoxei.mask.nxv1f16.nxv1i8( +    <vscale x 1 x half> %0, +    ptr %1, +    <vscale x 1 x i8> %2, +    <vscale x 1 x i1> %3, +    iXLen %4) + +  ret void +} + +declare void @llvm.riscv.vsoxei.nxv2f16.nxv2i8( +  <vscale x 2 x half>, +  ptr, +  <vscale x 2 x i8>, +  iXLen); + +define void @intrinsic_vsoxei_v_nxv2f16_nxv2f16_nxv2i8(<vscale x 2 x half> %0, ptr %1, <vscale x 2 x i8> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv2f16_nxv2f16_nxv2i8: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e16, mf2, ta, ma +; CHECK-NEXT:    vsoxei8.v v8, (a0), v9 +; CHECK-NEXT:    ret +entry: +  call void @llvm.riscv.vsoxei.nxv2f16.nxv2i8( +    <vscale x 2 x half> %0, +    ptr %1, +    <vscale x 2 x i8> %2, +    iXLen %3) + +  ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv2f16.nxv2i8( +  <vscale x 2 x half>, +  ptr, +  <vscale x 2 x i8>, +  <vscale x 2 x i1>, +  iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv2f16_nxv2f16_nxv2i8(<vscale x 2 x half> %0, ptr %1, <vscale x 2 x i8> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: 
intrinsic_vsoxei_mask_v_nxv2f16_nxv2f16_nxv2i8: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e16, mf2, ta, ma +; CHECK-NEXT:    vsoxei8.v v8, (a0), v9, v0.t +; CHECK-NEXT:    ret +entry: +  call void @llvm.riscv.vsoxei.mask.nxv2f16.nxv2i8( +    <vscale x 2 x half> %0, +    ptr %1, +    <vscale x 2 x i8> %2, +    <vscale x 2 x i1> %3, +    iXLen %4) + +  ret void +} + +declare void @llvm.riscv.vsoxei.nxv4f16.nxv4i8( +  <vscale x 4 x half>, +  ptr, +  <vscale x 4 x i8>, +  iXLen); + +define void @intrinsic_vsoxei_v_nxv4f16_nxv4f16_nxv4i8(<vscale x 4 x half> %0, ptr %1, <vscale x 4 x i8> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv4f16_nxv4f16_nxv4i8: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e16, m1, ta, ma +; CHECK-NEXT:    vsoxei8.v v8, (a0), v9 +; CHECK-NEXT:    ret +entry: +  call void @llvm.riscv.vsoxei.nxv4f16.nxv4i8( +    <vscale x 4 x half> %0, +    ptr %1, +    <vscale x 4 x i8> %2, +    iXLen %3) + +  ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv4f16.nxv4i8( +  <vscale x 4 x half>, +  ptr, +  <vscale x 4 x i8>, +  <vscale x 4 x i1>, +  iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv4f16_nxv4f16_nxv4i8(<vscale x 4 x half> %0, ptr %1, <vscale x 4 x i8> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv4f16_nxv4f16_nxv4i8: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e16, m1, ta, ma +; CHECK-NEXT:    vsoxei8.v v8, (a0), v9, v0.t +; CHECK-NEXT:    ret +entry: +  call void @llvm.riscv.vsoxei.mask.nxv4f16.nxv4i8( +    <vscale x 4 x half> %0, +    ptr %1, +    <vscale x 4 x i8> %2, +    <vscale x 4 x i1> %3, +    iXLen %4) + +  ret void +} + +declare void @llvm.riscv.vsoxei.nxv8f16.nxv8i8( +  <vscale x 8 x half>, +  ptr, +  <vscale x 8 x i8>, +  iXLen); + +define void @intrinsic_vsoxei_v_nxv8f16_nxv8f16_nxv8i8(<vscale x 8 x half> %0, ptr %1, <vscale x 8 x i8> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv8f16_nxv8f16_nxv8i8: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e16, m2, ta, ma +; CHECK-NEXT:    vsoxei8.v v8, (a0), v10 +; CHECK-NEXT:    ret +entry: +  call void @llvm.riscv.vsoxei.nxv8f16.nxv8i8( +    <vscale x 8 x half> %0, +    ptr %1, +    <vscale x 8 x i8> %2, +    iXLen %3) + +  ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv8f16.nxv8i8( +  <vscale x 8 x half>, +  ptr, +  <vscale x 8 x i8>, +  <vscale x 8 x i1>, +  iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv8f16_nxv8f16_nxv8i8(<vscale x 8 x half> %0, ptr %1, <vscale x 8 x i8> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv8f16_nxv8f16_nxv8i8: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e16, m2, ta, ma +; CHECK-NEXT:    vsoxei8.v v8, (a0), v10, v0.t +; CHECK-NEXT:    ret +entry: +  call void @llvm.riscv.vsoxei.mask.nxv8f16.nxv8i8( +    <vscale x 8 x half> %0, +    ptr %1, +    <vscale x 8 x i8> %2, +    <vscale x 8 x i1> %3, +    iXLen %4) + +  ret void +} + +declare void @llvm.riscv.vsoxei.nxv16f16.nxv16i8( +  <vscale x 16 x half>, +  ptr, +  <vscale x 16 x i8>, +  iXLen); + +define void @intrinsic_vsoxei_v_nxv16f16_nxv16f16_nxv16i8(<vscale x 16 x half> %0, ptr %1, <vscale x 16 x i8> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv16f16_nxv16f16_nxv16i8: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e16, m4, ta, ma +; CHECK-NEXT:    vsoxei8.v v8, (a0), v12 +; CHECK-NEXT:    ret +entry: +  call void 
@llvm.riscv.vsoxei.nxv16f16.nxv16i8( +    <vscale x 16 x half> %0, +    ptr %1, +    <vscale x 16 x i8> %2, +    iXLen %3) + +  ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv16f16.nxv16i8( +  <vscale x 16 x half>, +  ptr, +  <vscale x 16 x i8>, +  <vscale x 16 x i1>, +  iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv16f16_nxv16f16_nxv16i8(<vscale x 16 x half> %0, ptr %1, <vscale x 16 x i8> %2, <vscale x 16 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv16f16_nxv16f16_nxv16i8: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e16, m4, ta, ma +; CHECK-NEXT:    vsoxei8.v v8, (a0), v12, v0.t +; CHECK-NEXT:    ret +entry: +  call void @llvm.riscv.vsoxei.mask.nxv16f16.nxv16i8( +    <vscale x 16 x half> %0, +    ptr %1, +    <vscale x 16 x i8> %2, +    <vscale x 16 x i1> %3, +    iXLen %4) + +  ret void +} + +declare void @llvm.riscv.vsoxei.nxv32f16.nxv32i8( +  <vscale x 32 x half>, +  ptr, +  <vscale x 32 x i8>, +  iXLen); + +define void @intrinsic_vsoxei_v_nxv32f16_nxv32f16_nxv32i8(<vscale x 32 x half> %0, ptr %1, <vscale x 32 x i8> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv32f16_nxv32f16_nxv32i8: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e16, m8, ta, ma +; CHECK-NEXT:    vsoxei8.v v8, (a0), v16 +; CHECK-NEXT:    ret +entry: +  call void @llvm.riscv.vsoxei.nxv32f16.nxv32i8( +    <vscale x 32 x half> %0, +    ptr %1, +    <vscale x 32 x i8> %2, +    iXLen %3) + +  ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv32f16.nxv32i8( +  <vscale x 32 x half>, +  ptr, +  <vscale x 32 x i8>, +  <vscale x 32 x i1>, +  iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv32f16_nxv32f16_nxv32i8(<vscale x 32 x half> %0, ptr %1, <vscale x 32 x i8> %2, <vscale x 32 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv32f16_nxv32f16_nxv32i8: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e16, m8, ta, ma +; CHECK-NEXT:    vsoxei8.v v8, (a0), v16, v0.t +; CHECK-NEXT:    ret +entry: +  call void @llvm.riscv.vsoxei.mask.nxv32f16.nxv32i8( +    <vscale x 32 x half> %0, +    ptr %1, +    <vscale x 32 x i8> %2, +    <vscale x 32 x i1> %3, +    iXLen %4) + +  ret void +} + +declare void @llvm.riscv.vsoxei.nxv1f32.nxv1i8( +  <vscale x 1 x float>, +  ptr, +  <vscale x 1 x i8>, +  iXLen); + +define void @intrinsic_vsoxei_v_nxv1f32_nxv1f32_nxv1i8(<vscale x 1 x float> %0, ptr %1, <vscale x 1 x i8> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv1f32_nxv1f32_nxv1i8: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e32, mf2, ta, ma +; CHECK-NEXT:    vsoxei8.v v8, (a0), v9 +; CHECK-NEXT:    ret +entry: +  call void @llvm.riscv.vsoxei.nxv1f32.nxv1i8( +    <vscale x 1 x float> %0, +    ptr %1, +    <vscale x 1 x i8> %2, +    iXLen %3) + +  ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv1f32.nxv1i8( +  <vscale x 1 x float>, +  ptr, +  <vscale x 1 x i8>, +  <vscale x 1 x i1>, +  iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv1f32_nxv1f32_nxv1i8(<vscale x 1 x float> %0, ptr %1, <vscale x 1 x i8> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv1f32_nxv1f32_nxv1i8: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e32, mf2, ta, ma +; CHECK-NEXT:    vsoxei8.v v8, (a0), v9, v0.t +; CHECK-NEXT:    ret +entry: +  call void @llvm.riscv.vsoxei.mask.nxv1f32.nxv1i8( +    <vscale x 1 x float> %0, +    ptr %1, +    <vscale x 1 x i8> %2, +    <vscale x 1 x i1> %3, +    iXLen %4) + +  
ret void +} + +declare void @llvm.riscv.vsoxei.nxv2f32.nxv2i8( +  <vscale x 2 x float>, +  ptr, +  <vscale x 2 x i8>, +  iXLen); + +define void @intrinsic_vsoxei_v_nxv2f32_nxv2f32_nxv2i8(<vscale x 2 x float> %0, ptr %1, <vscale x 2 x i8> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv2f32_nxv2f32_nxv2i8: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e32, m1, ta, ma +; CHECK-NEXT:    vsoxei8.v v8, (a0), v9 +; CHECK-NEXT:    ret +entry: +  call void @llvm.riscv.vsoxei.nxv2f32.nxv2i8( +    <vscale x 2 x float> %0, +    ptr %1, +    <vscale x 2 x i8> %2, +    iXLen %3) + +  ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv2f32.nxv2i8( +  <vscale x 2 x float>, +  ptr, +  <vscale x 2 x i8>, +  <vscale x 2 x i1>, +  iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv2f32_nxv2f32_nxv2i8(<vscale x 2 x float> %0, ptr %1, <vscale x 2 x i8> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv2f32_nxv2f32_nxv2i8: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e32, m1, ta, ma +; CHECK-NEXT:    vsoxei8.v v8, (a0), v9, v0.t +; CHECK-NEXT:    ret +entry: +  call void @llvm.riscv.vsoxei.mask.nxv2f32.nxv2i8( +    <vscale x 2 x float> %0, +    ptr %1, +    <vscale x 2 x i8> %2, +    <vscale x 2 x i1> %3, +    iXLen %4) + +  ret void +} + +declare void @llvm.riscv.vsoxei.nxv4f32.nxv4i8( +  <vscale x 4 x float>, +  ptr, +  <vscale x 4 x i8>, +  iXLen); + +define void @intrinsic_vsoxei_v_nxv4f32_nxv4f32_nxv4i8(<vscale x 4 x float> %0, ptr %1, <vscale x 4 x i8> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv4f32_nxv4f32_nxv4i8: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e32, m2, ta, ma +; CHECK-NEXT:    vsoxei8.v v8, (a0), v10 +; CHECK-NEXT:    ret +entry: +  call void @llvm.riscv.vsoxei.nxv4f32.nxv4i8( +    <vscale x 4 x float> %0, +    ptr %1, +    <vscale x 4 x i8> %2, +    iXLen %3) + +  ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv4f32.nxv4i8( +  <vscale x 4 x float>, +  ptr, +  <vscale x 4 x i8>, +  <vscale x 4 x i1>, +  iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv4f32_nxv4f32_nxv4i8(<vscale x 4 x float> %0, ptr %1, <vscale x 4 x i8> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv4f32_nxv4f32_nxv4i8: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e32, m2, ta, ma +; CHECK-NEXT:    vsoxei8.v v8, (a0), v10, v0.t +; CHECK-NEXT:    ret +entry: +  call void @llvm.riscv.vsoxei.mask.nxv4f32.nxv4i8( +    <vscale x 4 x float> %0, +    ptr %1, +    <vscale x 4 x i8> %2, +    <vscale x 4 x i1> %3, +    iXLen %4) + +  ret void +} + +declare void @llvm.riscv.vsoxei.nxv8f32.nxv8i8( +  <vscale x 8 x float>, +  ptr, +  <vscale x 8 x i8>, +  iXLen); + +define void @intrinsic_vsoxei_v_nxv8f32_nxv8f32_nxv8i8(<vscale x 8 x float> %0, ptr %1, <vscale x 8 x i8> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv8f32_nxv8f32_nxv8i8: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e32, m4, ta, ma +; CHECK-NEXT:    vsoxei8.v v8, (a0), v12 +; CHECK-NEXT:    ret +entry: +  call void @llvm.riscv.vsoxei.nxv8f32.nxv8i8( +    <vscale x 8 x float> %0, +    ptr %1, +    <vscale x 8 x i8> %2, +    iXLen %3) + +  ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv8f32.nxv8i8( +  <vscale x 8 x float>, +  ptr, +  <vscale x 8 x i8>, +  <vscale x 8 x i1>, +  iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv8f32_nxv8f32_nxv8i8(<vscale x 8 x float> %0, ptr %1, <vscale x 8 x i8> 
%2, <vscale x 8 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv8f32_nxv8f32_nxv8i8: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e32, m4, ta, ma +; CHECK-NEXT:    vsoxei8.v v8, (a0), v12, v0.t +; CHECK-NEXT:    ret +entry: +  call void @llvm.riscv.vsoxei.mask.nxv8f32.nxv8i8( +    <vscale x 8 x float> %0, +    ptr %1, +    <vscale x 8 x i8> %2, +    <vscale x 8 x i1> %3, +    iXLen %4) + +  ret void +} + +declare void @llvm.riscv.vsoxei.nxv16f32.nxv16i8( +  <vscale x 16 x float>, +  ptr, +  <vscale x 16 x i8>, +  iXLen); + +define void @intrinsic_vsoxei_v_nxv16f32_nxv16f32_nxv16i8(<vscale x 16 x float> %0, ptr %1, <vscale x 16 x i8> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv16f32_nxv16f32_nxv16i8: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e32, m8, ta, ma +; CHECK-NEXT:    vsoxei8.v v8, (a0), v16 +; CHECK-NEXT:    ret +entry: +  call void @llvm.riscv.vsoxei.nxv16f32.nxv16i8( +    <vscale x 16 x float> %0, +    ptr %1, +    <vscale x 16 x i8> %2, +    iXLen %3) + +  ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv16f32.nxv16i8( +  <vscale x 16 x float>, +  ptr, +  <vscale x 16 x i8>, +  <vscale x 16 x i1>, +  iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv16f32_nxv16f32_nxv16i8(<vscale x 16 x float> %0, ptr %1, <vscale x 16 x i8> %2, <vscale x 16 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv16f32_nxv16f32_nxv16i8: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e32, m8, ta, ma +; CHECK-NEXT:    vsoxei8.v v8, (a0), v16, v0.t +; CHECK-NEXT:    ret +entry: +  call void @llvm.riscv.vsoxei.mask.nxv16f32.nxv16i8( +    <vscale x 16 x float> %0, +    ptr %1, +    <vscale x 16 x i8> %2, +    <vscale x 16 x i1> %3, +    iXLen %4) + +  ret void +} + +declare void @llvm.riscv.vsoxei.nxv1f64.nxv1i8( +  <vscale x 1 x double>, +  ptr, +  <vscale x 1 x i8>, +  iXLen); + +define void @intrinsic_vsoxei_v_nxv1f64_nxv1f64_nxv1i8(<vscale x 1 x double> %0, ptr %1, <vscale x 1 x i8> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv1f64_nxv1f64_nxv1i8: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e64, m1, ta, ma +; CHECK-NEXT:    vsoxei8.v v8, (a0), v9 +; CHECK-NEXT:    ret +entry: +  call void @llvm.riscv.vsoxei.nxv1f64.nxv1i8( +    <vscale x 1 x double> %0, +    ptr %1, +    <vscale x 1 x i8> %2, +    iXLen %3) + +  ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv1f64.nxv1i8( +  <vscale x 1 x double>, +  ptr, +  <vscale x 1 x i8>, +  <vscale x 1 x i1>, +  iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv1f64_nxv1f64_nxv1i8(<vscale x 1 x double> %0, ptr %1, <vscale x 1 x i8> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv1f64_nxv1f64_nxv1i8: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e64, m1, ta, ma +; CHECK-NEXT:    vsoxei8.v v8, (a0), v9, v0.t +; CHECK-NEXT:    ret +entry: +  call void @llvm.riscv.vsoxei.mask.nxv1f64.nxv1i8( +    <vscale x 1 x double> %0, +    ptr %1, +    <vscale x 1 x i8> %2, +    <vscale x 1 x i1> %3, +    iXLen %4) + +  ret void +} + +declare void @llvm.riscv.vsoxei.nxv2f64.nxv2i8( +  <vscale x 2 x double>, +  ptr, +  <vscale x 2 x i8>, +  iXLen); + +define void @intrinsic_vsoxei_v_nxv2f64_nxv2f64_nxv2i8(<vscale x 2 x double> %0, ptr %1, <vscale x 2 x i8> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv2f64_nxv2f64_nxv2i8: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e64, m2, 
ta, ma +; CHECK-NEXT:    vsoxei8.v v8, (a0), v10 +; CHECK-NEXT:    ret +entry: +  call void @llvm.riscv.vsoxei.nxv2f64.nxv2i8( +    <vscale x 2 x double> %0, +    ptr %1, +    <vscale x 2 x i8> %2, +    iXLen %3) + +  ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv2f64.nxv2i8( +  <vscale x 2 x double>, +  ptr, +  <vscale x 2 x i8>, +  <vscale x 2 x i1>, +  iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv2f64_nxv2f64_nxv2i8(<vscale x 2 x double> %0, ptr %1, <vscale x 2 x i8> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv2f64_nxv2f64_nxv2i8: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e64, m2, ta, ma +; CHECK-NEXT:    vsoxei8.v v8, (a0), v10, v0.t +; CHECK-NEXT:    ret +entry: +  call void @llvm.riscv.vsoxei.mask.nxv2f64.nxv2i8( +    <vscale x 2 x double> %0, +    ptr %1, +    <vscale x 2 x i8> %2, +    <vscale x 2 x i1> %3, +    iXLen %4) + +  ret void +} + +declare void @llvm.riscv.vsoxei.nxv4f64.nxv4i8( +  <vscale x 4 x double>, +  ptr, +  <vscale x 4 x i8>, +  iXLen); + +define void @intrinsic_vsoxei_v_nxv4f64_nxv4f64_nxv4i8(<vscale x 4 x double> %0, ptr %1, <vscale x 4 x i8> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv4f64_nxv4f64_nxv4i8: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e64, m4, ta, ma +; CHECK-NEXT:    vsoxei8.v v8, (a0), v12 +; CHECK-NEXT:    ret +entry: +  call void @llvm.riscv.vsoxei.nxv4f64.nxv4i8( +    <vscale x 4 x double> %0, +    ptr %1, +    <vscale x 4 x i8> %2, +    iXLen %3) + +  ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv4f64.nxv4i8( +  <vscale x 4 x double>, +  ptr, +  <vscale x 4 x i8>, +  <vscale x 4 x i1>, +  iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv4f64_nxv4f64_nxv4i8(<vscale x 4 x double> %0, ptr %1, <vscale x 4 x i8> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv4f64_nxv4f64_nxv4i8: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e64, m4, ta, ma +; CHECK-NEXT:    vsoxei8.v v8, (a0), v12, v0.t +; CHECK-NEXT:    ret +entry: +  call void @llvm.riscv.vsoxei.mask.nxv4f64.nxv4i8( +    <vscale x 4 x double> %0, +    ptr %1, +    <vscale x 4 x i8> %2, +    <vscale x 4 x i1> %3, +    iXLen %4) + +  ret void +} + +declare void @llvm.riscv.vsoxei.nxv8f64.nxv8i8( +  <vscale x 8 x double>, +  ptr, +  <vscale x 8 x i8>, +  iXLen); + +define void @intrinsic_vsoxei_v_nxv8f64_nxv8f64_nxv8i8(<vscale x 8 x double> %0, ptr %1, <vscale x 8 x i8> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_v_nxv8f64_nxv8f64_nxv8i8: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e64, m8, ta, ma +; CHECK-NEXT:    vsoxei8.v v8, (a0), v16 +; CHECK-NEXT:    ret +entry: +  call void @llvm.riscv.vsoxei.nxv8f64.nxv8i8( +    <vscale x 8 x double> %0, +    ptr %1, +    <vscale x 8 x i8> %2, +    iXLen %3) + +  ret void +} + +declare void @llvm.riscv.vsoxei.mask.nxv8f64.nxv8i8( +  <vscale x 8 x double>, +  ptr, +  <vscale x 8 x i8>, +  <vscale x 8 x i1>, +  iXLen); + +define void @intrinsic_vsoxei_mask_v_nxv8f64_nxv8f64_nxv8i8(<vscale x 8 x double> %0, ptr %1, <vscale x 8 x i8> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsoxei_mask_v_nxv8f64_nxv8f64_nxv8i8: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e64, m8, ta, ma +; CHECK-NEXT:    vsoxei8.v v8, (a0), v16, v0.t +; CHECK-NEXT:    ret +entry: +  call void @llvm.riscv.vsoxei.mask.nxv8f64.nxv8i8( +    <vscale x 8 x double> %0, +    ptr %1, +    <vscale x 
8 x i8> %2, +    <vscale x 8 x i1> %3, +    iXLen %4) + +  ret void +} diff --git a/llvm/test/CodeGen/RISCV/GlobalISel/rvv/vsuxei-rv64.ll b/llvm/test/CodeGen/RISCV/GlobalISel/rvv/vsuxei-rv64.ll new file mode 100644 index 0000000..9bd272a --- /dev/null +++ b/llvm/test/CodeGen/RISCV/GlobalISel/rvv/vsuxei-rv64.ll @@ -0,0 +1,1310 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=riscv64 -mattr=+v,+zvfh,+zvfbfmin -global-isel -verify-machineinstrs \ +; RUN:   < %s | FileCheck %s + +; The intrinsics are not supported with RV32. + +declare void @llvm.riscv.vsuxei.nxv1i8.nxv1i64( +  <vscale x 1 x i8>, +  ptr, +  <vscale x 1 x i64>, +  i64); + +define void @intrinsic_vsuxei_v_nxv1i8_nxv1i8_nxv1i64(<vscale x 1 x i8> %0, ptr %1, <vscale x 1 x i64> %2, i64 %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv1i8_nxv1i8_nxv1i64: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e8, mf8, ta, ma +; CHECK-NEXT:    vsuxei64.v v8, (a0), v9 +; CHECK-NEXT:    ret +entry: +  call void @llvm.riscv.vsuxei.nxv1i8.nxv1i64( +    <vscale x 1 x i8> %0, +    ptr %1, +    <vscale x 1 x i64> %2, +    i64 %3) + +  ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv1i8.nxv1i64( +  <vscale x 1 x i8>, +  ptr, +  <vscale x 1 x i64>, +  <vscale x 1 x i1>, +  i64); + +define void @intrinsic_vsuxei_mask_v_nxv1i8_nxv1i8_nxv1i64(<vscale x 1 x i8> %0, ptr %1, <vscale x 1 x i64> %2, <vscale x 1 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv1i8_nxv1i8_nxv1i64: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e8, mf8, ta, ma +; CHECK-NEXT:    vsuxei64.v v8, (a0), v9, v0.t +; CHECK-NEXT:    ret +entry: +  call void @llvm.riscv.vsuxei.mask.nxv1i8.nxv1i64( +    <vscale x 1 x i8> %0, +    ptr %1, +    <vscale x 1 x i64> %2, +    <vscale x 1 x i1> %3, +    i64 %4) + +  ret void +} + +define void @intrinsic_vsuxei_allonesmask_v_nxv1i8_nxv1i8_nxv1i64(<vscale x 1 x i8> %0, ptr %1, <vscale x 1 x i64> %2, <vscale x 1 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_allonesmask_v_nxv1i8_nxv1i8_nxv1i64: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e8, mf8, ta, ma +; CHECK-NEXT:    vsuxei64.v v8, (a0), v9 +; CHECK-NEXT:    ret +entry: +  call void @llvm.riscv.vsuxei.mask.nxv1i8.nxv1i64( +    <vscale x 1 x i8> %0, +    ptr %1, +    <vscale x 1 x i64> %2, +    <vscale x 1 x i1> splat (i1 true), +    i64 %4) + +  ret void +} + +declare void @llvm.riscv.vsuxei.nxv2i8.nxv2i64( +  <vscale x 2 x i8>, +  ptr, +  <vscale x 2 x i64>, +  i64); + +define void @intrinsic_vsuxei_v_nxv2i8_nxv2i8_nxv2i64(<vscale x 2 x i8> %0, ptr %1, <vscale x 2 x i64> %2, i64 %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv2i8_nxv2i8_nxv2i64: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e8, mf4, ta, ma +; CHECK-NEXT:    vsuxei64.v v8, (a0), v10 +; CHECK-NEXT:    ret +entry: +  call void @llvm.riscv.vsuxei.nxv2i8.nxv2i64( +    <vscale x 2 x i8> %0, +    ptr %1, +    <vscale x 2 x i64> %2, +    i64 %3) + +  ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv2i8.nxv2i64( +  <vscale x 2 x i8>, +  ptr, +  <vscale x 2 x i64>, +  <vscale x 2 x i1>, +  i64); + +define void @intrinsic_vsuxei_mask_v_nxv2i8_nxv2i8_nxv2i64(<vscale x 2 x i8> %0, ptr %1, <vscale x 2 x i64> %2, <vscale x 2 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv2i8_nxv2i8_nxv2i64: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e8, mf4, ta, ma +; CHECK-NEXT:    vsuxei64.v v8, 
(a0), v10, v0.t +; CHECK-NEXT:    ret +entry: +  call void @llvm.riscv.vsuxei.mask.nxv2i8.nxv2i64( +    <vscale x 2 x i8> %0, +    ptr %1, +    <vscale x 2 x i64> %2, +    <vscale x 2 x i1> %3, +    i64 %4) + +  ret void +} + +declare void @llvm.riscv.vsuxei.nxv4i8.nxv4i64( +  <vscale x 4 x i8>, +  ptr, +  <vscale x 4 x i64>, +  i64); + +define void @intrinsic_vsuxei_v_nxv4i8_nxv4i8_nxv4i64(<vscale x 4 x i8> %0, ptr %1, <vscale x 4 x i64> %2, i64 %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv4i8_nxv4i8_nxv4i64: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e8, mf2, ta, ma +; CHECK-NEXT:    vsuxei64.v v8, (a0), v12 +; CHECK-NEXT:    ret +entry: +  call void @llvm.riscv.vsuxei.nxv4i8.nxv4i64( +    <vscale x 4 x i8> %0, +    ptr %1, +    <vscale x 4 x i64> %2, +    i64 %3) + +  ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv4i8.nxv4i64( +  <vscale x 4 x i8>, +  ptr, +  <vscale x 4 x i64>, +  <vscale x 4 x i1>, +  i64); + +define void @intrinsic_vsuxei_mask_v_nxv4i8_nxv4i8_nxv4i64(<vscale x 4 x i8> %0, ptr %1, <vscale x 4 x i64> %2, <vscale x 4 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv4i8_nxv4i8_nxv4i64: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e8, mf2, ta, ma +; CHECK-NEXT:    vsuxei64.v v8, (a0), v12, v0.t +; CHECK-NEXT:    ret +entry: +  call void @llvm.riscv.vsuxei.mask.nxv4i8.nxv4i64( +    <vscale x 4 x i8> %0, +    ptr %1, +    <vscale x 4 x i64> %2, +    <vscale x 4 x i1> %3, +    i64 %4) + +  ret void +} + +declare void @llvm.riscv.vsuxei.nxv8i8.nxv8i64( +  <vscale x 8 x i8>, +  ptr, +  <vscale x 8 x i64>, +  i64); + +define void @intrinsic_vsuxei_v_nxv8i8_nxv8i8_nxv8i64(<vscale x 8 x i8> %0, ptr %1, <vscale x 8 x i64> %2, i64 %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv8i8_nxv8i8_nxv8i64: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e8, m1, ta, ma +; CHECK-NEXT:    vsuxei64.v v8, (a0), v16 +; CHECK-NEXT:    ret +entry: +  call void @llvm.riscv.vsuxei.nxv8i8.nxv8i64( +    <vscale x 8 x i8> %0, +    ptr %1, +    <vscale x 8 x i64> %2, +    i64 %3) + +  ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv8i8.nxv8i64( +  <vscale x 8 x i8>, +  ptr, +  <vscale x 8 x i64>, +  <vscale x 8 x i1>, +  i64); + +define void @intrinsic_vsuxei_mask_v_nxv8i8_nxv8i8_nxv8i64(<vscale x 8 x i8> %0, ptr %1, <vscale x 8 x i64> %2, <vscale x 8 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv8i8_nxv8i8_nxv8i64: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e8, m1, ta, ma +; CHECK-NEXT:    vsuxei64.v v8, (a0), v16, v0.t +; CHECK-NEXT:    ret +entry: +  call void @llvm.riscv.vsuxei.mask.nxv8i8.nxv8i64( +    <vscale x 8 x i8> %0, +    ptr %1, +    <vscale x 8 x i64> %2, +    <vscale x 8 x i1> %3, +    i64 %4) + +  ret void +} + +declare void @llvm.riscv.vsuxei.nxv1i16.nxv1i64( +  <vscale x 1 x i16>, +  ptr, +  <vscale x 1 x i64>, +  i64); + +define void @intrinsic_vsuxei_v_nxv1i16_nxv1i16_nxv1i64(<vscale x 1 x i16> %0, ptr %1, <vscale x 1 x i64> %2, i64 %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv1i16_nxv1i16_nxv1i64: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e16, mf4, ta, ma +; CHECK-NEXT:    vsuxei64.v v8, (a0), v9 +; CHECK-NEXT:    ret +entry: +  call void @llvm.riscv.vsuxei.nxv1i16.nxv1i64( +    <vscale x 1 x i16> %0, +    ptr %1, +    <vscale x 1 x i64> %2, +    i64 %3) + +  ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv1i16.nxv1i64( +  <vscale x 1 x i16>, +  ptr, +  <vscale x 1 x 
i64>, +  <vscale x 1 x i1>, +  i64); + +define void @intrinsic_vsuxei_mask_v_nxv1i16_nxv1i16_nxv1i64(<vscale x 1 x i16> %0, ptr %1, <vscale x 1 x i64> %2, <vscale x 1 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv1i16_nxv1i16_nxv1i64: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e16, mf4, ta, ma +; CHECK-NEXT:    vsuxei64.v v8, (a0), v9, v0.t +; CHECK-NEXT:    ret +entry: +  call void @llvm.riscv.vsuxei.mask.nxv1i16.nxv1i64( +    <vscale x 1 x i16> %0, +    ptr %1, +    <vscale x 1 x i64> %2, +    <vscale x 1 x i1> %3, +    i64 %4) + +  ret void +} + +declare void @llvm.riscv.vsuxei.nxv2i16.nxv2i64( +  <vscale x 2 x i16>, +  ptr, +  <vscale x 2 x i64>, +  i64); + +define void @intrinsic_vsuxei_v_nxv2i16_nxv2i16_nxv2i64(<vscale x 2 x i16> %0, ptr %1, <vscale x 2 x i64> %2, i64 %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv2i16_nxv2i16_nxv2i64: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e16, mf2, ta, ma +; CHECK-NEXT:    vsuxei64.v v8, (a0), v10 +; CHECK-NEXT:    ret +entry: +  call void @llvm.riscv.vsuxei.nxv2i16.nxv2i64( +    <vscale x 2 x i16> %0, +    ptr %1, +    <vscale x 2 x i64> %2, +    i64 %3) + +  ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv2i16.nxv2i64( +  <vscale x 2 x i16>, +  ptr, +  <vscale x 2 x i64>, +  <vscale x 2 x i1>, +  i64); + +define void @intrinsic_vsuxei_mask_v_nxv2i16_nxv2i16_nxv2i64(<vscale x 2 x i16> %0, ptr %1, <vscale x 2 x i64> %2, <vscale x 2 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv2i16_nxv2i16_nxv2i64: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e16, mf2, ta, ma +; CHECK-NEXT:    vsuxei64.v v8, (a0), v10, v0.t +; CHECK-NEXT:    ret +entry: +  call void @llvm.riscv.vsuxei.mask.nxv2i16.nxv2i64( +    <vscale x 2 x i16> %0, +    ptr %1, +    <vscale x 2 x i64> %2, +    <vscale x 2 x i1> %3, +    i64 %4) + +  ret void +} + +declare void @llvm.riscv.vsuxei.nxv4i16.nxv4i64( +  <vscale x 4 x i16>, +  ptr, +  <vscale x 4 x i64>, +  i64); + +define void @intrinsic_vsuxei_v_nxv4i16_nxv4i16_nxv4i64(<vscale x 4 x i16> %0, ptr %1, <vscale x 4 x i64> %2, i64 %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv4i16_nxv4i16_nxv4i64: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e16, m1, ta, ma +; CHECK-NEXT:    vsuxei64.v v8, (a0), v12 +; CHECK-NEXT:    ret +entry: +  call void @llvm.riscv.vsuxei.nxv4i16.nxv4i64( +    <vscale x 4 x i16> %0, +    ptr %1, +    <vscale x 4 x i64> %2, +    i64 %3) + +  ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv4i16.nxv4i64( +  <vscale x 4 x i16>, +  ptr, +  <vscale x 4 x i64>, +  <vscale x 4 x i1>, +  i64); + +define void @intrinsic_vsuxei_mask_v_nxv4i16_nxv4i16_nxv4i64(<vscale x 4 x i16> %0, ptr %1, <vscale x 4 x i64> %2, <vscale x 4 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv4i16_nxv4i16_nxv4i64: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e16, m1, ta, ma +; CHECK-NEXT:    vsuxei64.v v8, (a0), v12, v0.t +; CHECK-NEXT:    ret +entry: +  call void @llvm.riscv.vsuxei.mask.nxv4i16.nxv4i64( +    <vscale x 4 x i16> %0, +    ptr %1, +    <vscale x 4 x i64> %2, +    <vscale x 4 x i1> %3, +    i64 %4) + +  ret void +} + +declare void @llvm.riscv.vsuxei.nxv8i16.nxv8i64( +  <vscale x 8 x i16>, +  ptr, +  <vscale x 8 x i64>, +  i64); + +define void @intrinsic_vsuxei_v_nxv8i16_nxv8i16_nxv8i64(<vscale x 8 x i16> %0, ptr %1, <vscale x 8 x i64> %2, i64 %3) nounwind { +; CHECK-LABEL: 
intrinsic_vsuxei_v_nxv8i16_nxv8i16_nxv8i64: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e16, m2, ta, ma +; CHECK-NEXT:    vsuxei64.v v8, (a0), v16 +; CHECK-NEXT:    ret +entry: +  call void @llvm.riscv.vsuxei.nxv8i16.nxv8i64( +    <vscale x 8 x i16> %0, +    ptr %1, +    <vscale x 8 x i64> %2, +    i64 %3) + +  ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv8i16.nxv8i64( +  <vscale x 8 x i16>, +  ptr, +  <vscale x 8 x i64>, +  <vscale x 8 x i1>, +  i64); + +define void @intrinsic_vsuxei_mask_v_nxv8i16_nxv8i16_nxv8i64(<vscale x 8 x i16> %0, ptr %1, <vscale x 8 x i64> %2, <vscale x 8 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv8i16_nxv8i16_nxv8i64: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e16, m2, ta, ma +; CHECK-NEXT:    vsuxei64.v v8, (a0), v16, v0.t +; CHECK-NEXT:    ret +entry: +  call void @llvm.riscv.vsuxei.mask.nxv8i16.nxv8i64( +    <vscale x 8 x i16> %0, +    ptr %1, +    <vscale x 8 x i64> %2, +    <vscale x 8 x i1> %3, +    i64 %4) + +  ret void +} + +declare void @llvm.riscv.vsuxei.nxv1i32.nxv1i64( +  <vscale x 1 x i32>, +  ptr, +  <vscale x 1 x i64>, +  i64); + +define void @intrinsic_vsuxei_v_nxv1i32_nxv1i32_nxv1i64(<vscale x 1 x i32> %0, ptr %1, <vscale x 1 x i64> %2, i64 %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv1i32_nxv1i32_nxv1i64: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e32, mf2, ta, ma +; CHECK-NEXT:    vsuxei64.v v8, (a0), v9 +; CHECK-NEXT:    ret +entry: +  call void @llvm.riscv.vsuxei.nxv1i32.nxv1i64( +    <vscale x 1 x i32> %0, +    ptr %1, +    <vscale x 1 x i64> %2, +    i64 %3) + +  ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv1i32.nxv1i64( +  <vscale x 1 x i32>, +  ptr, +  <vscale x 1 x i64>, +  <vscale x 1 x i1>, +  i64); + +define void @intrinsic_vsuxei_mask_v_nxv1i32_nxv1i32_nxv1i64(<vscale x 1 x i32> %0, ptr %1, <vscale x 1 x i64> %2, <vscale x 1 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv1i32_nxv1i32_nxv1i64: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e32, mf2, ta, ma +; CHECK-NEXT:    vsuxei64.v v8, (a0), v9, v0.t +; CHECK-NEXT:    ret +entry: +  call void @llvm.riscv.vsuxei.mask.nxv1i32.nxv1i64( +    <vscale x 1 x i32> %0, +    ptr %1, +    <vscale x 1 x i64> %2, +    <vscale x 1 x i1> %3, +    i64 %4) + +  ret void +} + +declare void @llvm.riscv.vsuxei.nxv2i32.nxv2i64( +  <vscale x 2 x i32>, +  ptr, +  <vscale x 2 x i64>, +  i64); + +define void @intrinsic_vsuxei_v_nxv2i32_nxv2i32_nxv2i64(<vscale x 2 x i32> %0, ptr %1, <vscale x 2 x i64> %2, i64 %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv2i32_nxv2i32_nxv2i64: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e32, m1, ta, ma +; CHECK-NEXT:    vsuxei64.v v8, (a0), v10 +; CHECK-NEXT:    ret +entry: +  call void @llvm.riscv.vsuxei.nxv2i32.nxv2i64( +    <vscale x 2 x i32> %0, +    ptr %1, +    <vscale x 2 x i64> %2, +    i64 %3) + +  ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv2i32.nxv2i64( +  <vscale x 2 x i32>, +  ptr, +  <vscale x 2 x i64>, +  <vscale x 2 x i1>, +  i64); + +define void @intrinsic_vsuxei_mask_v_nxv2i32_nxv2i32_nxv2i64(<vscale x 2 x i32> %0, ptr %1, <vscale x 2 x i64> %2, <vscale x 2 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv2i32_nxv2i32_nxv2i64: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e32, m1, ta, ma +; CHECK-NEXT:    vsuxei64.v v8, (a0), v10, v0.t +; CHECK-NEXT:    ret +entry: +  call void 
@llvm.riscv.vsuxei.mask.nxv2i32.nxv2i64( +    <vscale x 2 x i32> %0, +    ptr %1, +    <vscale x 2 x i64> %2, +    <vscale x 2 x i1> %3, +    i64 %4) + +  ret void +} + +declare void @llvm.riscv.vsuxei.nxv4i32.nxv4i64( +  <vscale x 4 x i32>, +  ptr, +  <vscale x 4 x i64>, +  i64); + +define void @intrinsic_vsuxei_v_nxv4i32_nxv4i32_nxv4i64(<vscale x 4 x i32> %0, ptr %1, <vscale x 4 x i64> %2, i64 %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv4i32_nxv4i32_nxv4i64: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e32, m2, ta, ma +; CHECK-NEXT:    vsuxei64.v v8, (a0), v12 +; CHECK-NEXT:    ret +entry: +  call void @llvm.riscv.vsuxei.nxv4i32.nxv4i64( +    <vscale x 4 x i32> %0, +    ptr %1, +    <vscale x 4 x i64> %2, +    i64 %3) + +  ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv4i32.nxv4i64( +  <vscale x 4 x i32>, +  ptr, +  <vscale x 4 x i64>, +  <vscale x 4 x i1>, +  i64); + +define void @intrinsic_vsuxei_mask_v_nxv4i32_nxv4i32_nxv4i64(<vscale x 4 x i32> %0, ptr %1, <vscale x 4 x i64> %2, <vscale x 4 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv4i32_nxv4i32_nxv4i64: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e32, m2, ta, ma +; CHECK-NEXT:    vsuxei64.v v8, (a0), v12, v0.t +; CHECK-NEXT:    ret +entry: +  call void @llvm.riscv.vsuxei.mask.nxv4i32.nxv4i64( +    <vscale x 4 x i32> %0, +    ptr %1, +    <vscale x 4 x i64> %2, +    <vscale x 4 x i1> %3, +    i64 %4) + +  ret void +} + +declare void @llvm.riscv.vsuxei.nxv8i32.nxv8i64( +  <vscale x 8 x i32>, +  ptr, +  <vscale x 8 x i64>, +  i64); + +define void @intrinsic_vsuxei_v_nxv8i32_nxv8i32_nxv8i64(<vscale x 8 x i32> %0, ptr %1, <vscale x 8 x i64> %2, i64 %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv8i32_nxv8i32_nxv8i64: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e32, m4, ta, ma +; CHECK-NEXT:    vsuxei64.v v8, (a0), v16 +; CHECK-NEXT:    ret +entry: +  call void @llvm.riscv.vsuxei.nxv8i32.nxv8i64( +    <vscale x 8 x i32> %0, +    ptr %1, +    <vscale x 8 x i64> %2, +    i64 %3) + +  ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv8i32.nxv8i64( +  <vscale x 8 x i32>, +  ptr, +  <vscale x 8 x i64>, +  <vscale x 8 x i1>, +  i64); + +define void @intrinsic_vsuxei_mask_v_nxv8i32_nxv8i32_nxv8i64(<vscale x 8 x i32> %0, ptr %1, <vscale x 8 x i64> %2, <vscale x 8 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv8i32_nxv8i32_nxv8i64: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e32, m4, ta, ma +; CHECK-NEXT:    vsuxei64.v v8, (a0), v16, v0.t +; CHECK-NEXT:    ret +entry: +  call void @llvm.riscv.vsuxei.mask.nxv8i32.nxv8i64( +    <vscale x 8 x i32> %0, +    ptr %1, +    <vscale x 8 x i64> %2, +    <vscale x 8 x i1> %3, +    i64 %4) + +  ret void +} + +declare void @llvm.riscv.vsuxei.nxv1i64.nxv1i64( +  <vscale x 1 x i64>, +  ptr, +  <vscale x 1 x i64>, +  i64); + +define void @intrinsic_vsuxei_v_nxv1i64_nxv1i64_nxv1i64(<vscale x 1 x i64> %0, ptr %1, <vscale x 1 x i64> %2, i64 %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv1i64_nxv1i64_nxv1i64: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e64, m1, ta, ma +; CHECK-NEXT:    vsuxei64.v v8, (a0), v9 +; CHECK-NEXT:    ret +entry: +  call void @llvm.riscv.vsuxei.nxv1i64.nxv1i64( +    <vscale x 1 x i64> %0, +    ptr %1, +    <vscale x 1 x i64> %2, +    i64 %3) + +  ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv1i64.nxv1i64( +  <vscale x 1 x i64>, +  ptr, +  <vscale x 1 x i64>, +  <vscale x 
1 x i1>, +  i64); + +define void @intrinsic_vsuxei_mask_v_nxv1i64_nxv1i64_nxv1i64(<vscale x 1 x i64> %0, ptr %1, <vscale x 1 x i64> %2, <vscale x 1 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv1i64_nxv1i64_nxv1i64: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e64, m1, ta, ma +; CHECK-NEXT:    vsuxei64.v v8, (a0), v9, v0.t +; CHECK-NEXT:    ret +entry: +  call void @llvm.riscv.vsuxei.mask.nxv1i64.nxv1i64( +    <vscale x 1 x i64> %0, +    ptr %1, +    <vscale x 1 x i64> %2, +    <vscale x 1 x i1> %3, +    i64 %4) + +  ret void +} + +declare void @llvm.riscv.vsuxei.nxv2i64.nxv2i64( +  <vscale x 2 x i64>, +  ptr, +  <vscale x 2 x i64>, +  i64); + +define void @intrinsic_vsuxei_v_nxv2i64_nxv2i64_nxv2i64(<vscale x 2 x i64> %0, ptr %1, <vscale x 2 x i64> %2, i64 %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv2i64_nxv2i64_nxv2i64: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e64, m2, ta, ma +; CHECK-NEXT:    vsuxei64.v v8, (a0), v10 +; CHECK-NEXT:    ret +entry: +  call void @llvm.riscv.vsuxei.nxv2i64.nxv2i64( +    <vscale x 2 x i64> %0, +    ptr %1, +    <vscale x 2 x i64> %2, +    i64 %3) + +  ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv2i64.nxv2i64( +  <vscale x 2 x i64>, +  ptr, +  <vscale x 2 x i64>, +  <vscale x 2 x i1>, +  i64); + +define void @intrinsic_vsuxei_mask_v_nxv2i64_nxv2i64_nxv2i64(<vscale x 2 x i64> %0, ptr %1, <vscale x 2 x i64> %2, <vscale x 2 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv2i64_nxv2i64_nxv2i64: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e64, m2, ta, ma +; CHECK-NEXT:    vsuxei64.v v8, (a0), v10, v0.t +; CHECK-NEXT:    ret +entry: +  call void @llvm.riscv.vsuxei.mask.nxv2i64.nxv2i64( +    <vscale x 2 x i64> %0, +    ptr %1, +    <vscale x 2 x i64> %2, +    <vscale x 2 x i1> %3, +    i64 %4) + +  ret void +} + +declare void @llvm.riscv.vsuxei.nxv4i64.nxv4i64( +  <vscale x 4 x i64>, +  ptr, +  <vscale x 4 x i64>, +  i64); + +define void @intrinsic_vsuxei_v_nxv4i64_nxv4i64_nxv4i64(<vscale x 4 x i64> %0, ptr %1, <vscale x 4 x i64> %2, i64 %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv4i64_nxv4i64_nxv4i64: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e64, m4, ta, ma +; CHECK-NEXT:    vsuxei64.v v8, (a0), v12 +; CHECK-NEXT:    ret +entry: +  call void @llvm.riscv.vsuxei.nxv4i64.nxv4i64( +    <vscale x 4 x i64> %0, +    ptr %1, +    <vscale x 4 x i64> %2, +    i64 %3) + +  ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv4i64.nxv4i64( +  <vscale x 4 x i64>, +  ptr, +  <vscale x 4 x i64>, +  <vscale x 4 x i1>, +  i64); + +define void @intrinsic_vsuxei_mask_v_nxv4i64_nxv4i64_nxv4i64(<vscale x 4 x i64> %0, ptr %1, <vscale x 4 x i64> %2, <vscale x 4 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv4i64_nxv4i64_nxv4i64: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e64, m4, ta, ma +; CHECK-NEXT:    vsuxei64.v v8, (a0), v12, v0.t +; CHECK-NEXT:    ret +entry: +  call void @llvm.riscv.vsuxei.mask.nxv4i64.nxv4i64( +    <vscale x 4 x i64> %0, +    ptr %1, +    <vscale x 4 x i64> %2, +    <vscale x 4 x i1> %3, +    i64 %4) + +  ret void +} + +declare void @llvm.riscv.vsuxei.nxv8i64.nxv8i64( +  <vscale x 8 x i64>, +  ptr, +  <vscale x 8 x i64>, +  i64); + +define void @intrinsic_vsuxei_v_nxv8i64_nxv8i64_nxv8i64(<vscale x 8 x i64> %0, ptr %1, <vscale x 8 x i64> %2, i64 %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv8i64_nxv8i64_nxv8i64: +; CHECK:   
    # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e64, m8, ta, ma +; CHECK-NEXT:    vsuxei64.v v8, (a0), v16 +; CHECK-NEXT:    ret +entry: +  call void @llvm.riscv.vsuxei.nxv8i64.nxv8i64( +    <vscale x 8 x i64> %0, +    ptr %1, +    <vscale x 8 x i64> %2, +    i64 %3) + +  ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv8i64.nxv8i64( +  <vscale x 8 x i64>, +  ptr, +  <vscale x 8 x i64>, +  <vscale x 8 x i1>, +  i64); + +define void @intrinsic_vsuxei_mask_v_nxv8i64_nxv8i64_nxv8i64(<vscale x 8 x i64> %0, ptr %1, <vscale x 8 x i64> %2, <vscale x 8 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv8i64_nxv8i64_nxv8i64: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e64, m8, ta, ma +; CHECK-NEXT:    vsuxei64.v v8, (a0), v16, v0.t +; CHECK-NEXT:    ret +entry: +  call void @llvm.riscv.vsuxei.mask.nxv8i64.nxv8i64( +    <vscale x 8 x i64> %0, +    ptr %1, +    <vscale x 8 x i64> %2, +    <vscale x 8 x i1> %3, +    i64 %4) + +  ret void +} + +declare void @llvm.riscv.vsuxei.nxv1f16.nxv1i64( +  <vscale x 1 x half>, +  ptr, +  <vscale x 1 x i64>, +  i64); + +define void @intrinsic_vsuxei_v_nxv1f16_nxv1f16_nxv1i64(<vscale x 1 x half> %0, ptr %1, <vscale x 1 x i64> %2, i64 %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv1f16_nxv1f16_nxv1i64: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e16, mf4, ta, ma +; CHECK-NEXT:    vsuxei64.v v8, (a0), v9 +; CHECK-NEXT:    ret +entry: +  call void @llvm.riscv.vsuxei.nxv1f16.nxv1i64( +    <vscale x 1 x half> %0, +    ptr %1, +    <vscale x 1 x i64> %2, +    i64 %3) + +  ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv1f16.nxv1i64( +  <vscale x 1 x half>, +  ptr, +  <vscale x 1 x i64>, +  <vscale x 1 x i1>, +  i64); + +define void @intrinsic_vsuxei_mask_v_nxv1f16_nxv1f16_nxv1i64(<vscale x 1 x half> %0, ptr %1, <vscale x 1 x i64> %2, <vscale x 1 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv1f16_nxv1f16_nxv1i64: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e16, mf4, ta, ma +; CHECK-NEXT:    vsuxei64.v v8, (a0), v9, v0.t +; CHECK-NEXT:    ret +entry: +  call void @llvm.riscv.vsuxei.mask.nxv1f16.nxv1i64( +    <vscale x 1 x half> %0, +    ptr %1, +    <vscale x 1 x i64> %2, +    <vscale x 1 x i1> %3, +    i64 %4) + +  ret void +} + +declare void @llvm.riscv.vsuxei.nxv2f16.nxv2i64( +  <vscale x 2 x half>, +  ptr, +  <vscale x 2 x i64>, +  i64); + +define void @intrinsic_vsuxei_v_nxv2f16_nxv2f16_nxv2i64(<vscale x 2 x half> %0, ptr %1, <vscale x 2 x i64> %2, i64 %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv2f16_nxv2f16_nxv2i64: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e16, mf2, ta, ma +; CHECK-NEXT:    vsuxei64.v v8, (a0), v10 +; CHECK-NEXT:    ret +entry: +  call void @llvm.riscv.vsuxei.nxv2f16.nxv2i64( +    <vscale x 2 x half> %0, +    ptr %1, +    <vscale x 2 x i64> %2, +    i64 %3) + +  ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv2f16.nxv2i64( +  <vscale x 2 x half>, +  ptr, +  <vscale x 2 x i64>, +  <vscale x 2 x i1>, +  i64); + +define void @intrinsic_vsuxei_mask_v_nxv2f16_nxv2f16_nxv2i64(<vscale x 2 x half> %0, ptr %1, <vscale x 2 x i64> %2, <vscale x 2 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv2f16_nxv2f16_nxv2i64: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e16, mf2, ta, ma +; CHECK-NEXT:    vsuxei64.v v8, (a0), v10, v0.t +; CHECK-NEXT:    ret +entry: +  call void @llvm.riscv.vsuxei.mask.nxv2f16.nxv2i64( +    
<vscale x 2 x half> %0, +    ptr %1, +    <vscale x 2 x i64> %2, +    <vscale x 2 x i1> %3, +    i64 %4) + +  ret void +} + +declare void @llvm.riscv.vsuxei.nxv4f16.nxv4i64( +  <vscale x 4 x half>, +  ptr, +  <vscale x 4 x i64>, +  i64); + +define void @intrinsic_vsuxei_v_nxv4f16_nxv4f16_nxv4i64(<vscale x 4 x half> %0, ptr %1, <vscale x 4 x i64> %2, i64 %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv4f16_nxv4f16_nxv4i64: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e16, m1, ta, ma +; CHECK-NEXT:    vsuxei64.v v8, (a0), v12 +; CHECK-NEXT:    ret +entry: +  call void @llvm.riscv.vsuxei.nxv4f16.nxv4i64( +    <vscale x 4 x half> %0, +    ptr %1, +    <vscale x 4 x i64> %2, +    i64 %3) + +  ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv4f16.nxv4i64( +  <vscale x 4 x half>, +  ptr, +  <vscale x 4 x i64>, +  <vscale x 4 x i1>, +  i64); + +define void @intrinsic_vsuxei_mask_v_nxv4f16_nxv4f16_nxv4i64(<vscale x 4 x half> %0, ptr %1, <vscale x 4 x i64> %2, <vscale x 4 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv4f16_nxv4f16_nxv4i64: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e16, m1, ta, ma +; CHECK-NEXT:    vsuxei64.v v8, (a0), v12, v0.t +; CHECK-NEXT:    ret +entry: +  call void @llvm.riscv.vsuxei.mask.nxv4f16.nxv4i64( +    <vscale x 4 x half> %0, +    ptr %1, +    <vscale x 4 x i64> %2, +    <vscale x 4 x i1> %3, +    i64 %4) + +  ret void +} + +declare void @llvm.riscv.vsuxei.nxv8f16.nxv8i64( +  <vscale x 8 x half>, +  ptr, +  <vscale x 8 x i64>, +  i64); + +define void @intrinsic_vsuxei_v_nxv8f16_nxv8f16_nxv8i64(<vscale x 8 x half> %0, ptr %1, <vscale x 8 x i64> %2, i64 %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv8f16_nxv8f16_nxv8i64: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e16, m2, ta, ma +; CHECK-NEXT:    vsuxei64.v v8, (a0), v16 +; CHECK-NEXT:    ret +entry: +  call void @llvm.riscv.vsuxei.nxv8f16.nxv8i64( +    <vscale x 8 x half> %0, +    ptr %1, +    <vscale x 8 x i64> %2, +    i64 %3) + +  ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv8f16.nxv8i64( +  <vscale x 8 x half>, +  ptr, +  <vscale x 8 x i64>, +  <vscale x 8 x i1>, +  i64); + +define void @intrinsic_vsuxei_mask_v_nxv8f16_nxv8f16_nxv8i64(<vscale x 8 x half> %0, ptr %1, <vscale x 8 x i64> %2, <vscale x 8 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv8f16_nxv8f16_nxv8i64: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e16, m2, ta, ma +; CHECK-NEXT:    vsuxei64.v v8, (a0), v16, v0.t +; CHECK-NEXT:    ret +entry: +  call void @llvm.riscv.vsuxei.mask.nxv8f16.nxv8i64( +    <vscale x 8 x half> %0, +    ptr %1, +    <vscale x 8 x i64> %2, +    <vscale x 8 x i1> %3, +    i64 %4) + +  ret void +} + +declare void @llvm.riscv.vsuxei.nxv1f32.nxv1i64( +  <vscale x 1 x float>, +  ptr, +  <vscale x 1 x i64>, +  i64); + +define void @intrinsic_vsuxei_v_nxv1f32_nxv1f32_nxv1i64(<vscale x 1 x float> %0, ptr %1, <vscale x 1 x i64> %2, i64 %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv1f32_nxv1f32_nxv1i64: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e32, mf2, ta, ma +; CHECK-NEXT:    vsuxei64.v v8, (a0), v9 +; CHECK-NEXT:    ret +entry: +  call void @llvm.riscv.vsuxei.nxv1f32.nxv1i64( +    <vscale x 1 x float> %0, +    ptr %1, +    <vscale x 1 x i64> %2, +    i64 %3) + +  ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv1f32.nxv1i64( +  <vscale x 1 x float>, +  ptr, +  <vscale x 1 x i64>, +  <vscale x 1 x i1>, +  i64); + 
+define void @intrinsic_vsuxei_mask_v_nxv1f32_nxv1f32_nxv1i64(<vscale x 1 x float> %0, ptr %1, <vscale x 1 x i64> %2, <vscale x 1 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv1f32_nxv1f32_nxv1i64: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e32, mf2, ta, ma +; CHECK-NEXT:    vsuxei64.v v8, (a0), v9, v0.t +; CHECK-NEXT:    ret +entry: +  call void @llvm.riscv.vsuxei.mask.nxv1f32.nxv1i64( +    <vscale x 1 x float> %0, +    ptr %1, +    <vscale x 1 x i64> %2, +    <vscale x 1 x i1> %3, +    i64 %4) + +  ret void +} + +declare void @llvm.riscv.vsuxei.nxv2f32.nxv2i64( +  <vscale x 2 x float>, +  ptr, +  <vscale x 2 x i64>, +  i64); + +define void @intrinsic_vsuxei_v_nxv2f32_nxv2f32_nxv2i64(<vscale x 2 x float> %0, ptr %1, <vscale x 2 x i64> %2, i64 %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv2f32_nxv2f32_nxv2i64: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e32, m1, ta, ma +; CHECK-NEXT:    vsuxei64.v v8, (a0), v10 +; CHECK-NEXT:    ret +entry: +  call void @llvm.riscv.vsuxei.nxv2f32.nxv2i64( +    <vscale x 2 x float> %0, +    ptr %1, +    <vscale x 2 x i64> %2, +    i64 %3) + +  ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv2f32.nxv2i64( +  <vscale x 2 x float>, +  ptr, +  <vscale x 2 x i64>, +  <vscale x 2 x i1>, +  i64); + +define void @intrinsic_vsuxei_mask_v_nxv2f32_nxv2f32_nxv2i64(<vscale x 2 x float> %0, ptr %1, <vscale x 2 x i64> %2, <vscale x 2 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv2f32_nxv2f32_nxv2i64: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e32, m1, ta, ma +; CHECK-NEXT:    vsuxei64.v v8, (a0), v10, v0.t +; CHECK-NEXT:    ret +entry: +  call void @llvm.riscv.vsuxei.mask.nxv2f32.nxv2i64( +    <vscale x 2 x float> %0, +    ptr %1, +    <vscale x 2 x i64> %2, +    <vscale x 2 x i1> %3, +    i64 %4) + +  ret void +} + +declare void @llvm.riscv.vsuxei.nxv4f32.nxv4i64( +  <vscale x 4 x float>, +  ptr, +  <vscale x 4 x i64>, +  i64); + +define void @intrinsic_vsuxei_v_nxv4f32_nxv4f32_nxv4i64(<vscale x 4 x float> %0, ptr %1, <vscale x 4 x i64> %2, i64 %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv4f32_nxv4f32_nxv4i64: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e32, m2, ta, ma +; CHECK-NEXT:    vsuxei64.v v8, (a0), v12 +; CHECK-NEXT:    ret +entry: +  call void @llvm.riscv.vsuxei.nxv4f32.nxv4i64( +    <vscale x 4 x float> %0, +    ptr %1, +    <vscale x 4 x i64> %2, +    i64 %3) + +  ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv4f32.nxv4i64( +  <vscale x 4 x float>, +  ptr, +  <vscale x 4 x i64>, +  <vscale x 4 x i1>, +  i64); + +define void @intrinsic_vsuxei_mask_v_nxv4f32_nxv4f32_nxv4i64(<vscale x 4 x float> %0, ptr %1, <vscale x 4 x i64> %2, <vscale x 4 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv4f32_nxv4f32_nxv4i64: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e32, m2, ta, ma +; CHECK-NEXT:    vsuxei64.v v8, (a0), v12, v0.t +; CHECK-NEXT:    ret +entry: +  call void @llvm.riscv.vsuxei.mask.nxv4f32.nxv4i64( +    <vscale x 4 x float> %0, +    ptr %1, +    <vscale x 4 x i64> %2, +    <vscale x 4 x i1> %3, +    i64 %4) + +  ret void +} + +declare void @llvm.riscv.vsuxei.nxv8f32.nxv8i64( +  <vscale x 8 x float>, +  ptr, +  <vscale x 8 x i64>, +  i64); + +define void @intrinsic_vsuxei_v_nxv8f32_nxv8f32_nxv8i64(<vscale x 8 x float> %0, ptr %1, <vscale x 8 x i64> %2, i64 %3) nounwind { +; CHECK-LABEL: 
intrinsic_vsuxei_v_nxv8f32_nxv8f32_nxv8i64: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e32, m4, ta, ma +; CHECK-NEXT:    vsuxei64.v v8, (a0), v16 +; CHECK-NEXT:    ret +entry: +  call void @llvm.riscv.vsuxei.nxv8f32.nxv8i64( +    <vscale x 8 x float> %0, +    ptr %1, +    <vscale x 8 x i64> %2, +    i64 %3) + +  ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv8f32.nxv8i64( +  <vscale x 8 x float>, +  ptr, +  <vscale x 8 x i64>, +  <vscale x 8 x i1>, +  i64); + +define void @intrinsic_vsuxei_mask_v_nxv8f32_nxv8f32_nxv8i64(<vscale x 8 x float> %0, ptr %1, <vscale x 8 x i64> %2, <vscale x 8 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv8f32_nxv8f32_nxv8i64: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e32, m4, ta, ma +; CHECK-NEXT:    vsuxei64.v v8, (a0), v16, v0.t +; CHECK-NEXT:    ret +entry: +  call void @llvm.riscv.vsuxei.mask.nxv8f32.nxv8i64( +    <vscale x 8 x float> %0, +    ptr %1, +    <vscale x 8 x i64> %2, +    <vscale x 8 x i1> %3, +    i64 %4) + +  ret void +} + +declare void @llvm.riscv.vsuxei.nxv1f64.nxv1i64( +  <vscale x 1 x double>, +  ptr, +  <vscale x 1 x i64>, +  i64); + +define void @intrinsic_vsuxei_v_nxv1f64_nxv1f64_nxv1i64(<vscale x 1 x double> %0, ptr %1, <vscale x 1 x i64> %2, i64 %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv1f64_nxv1f64_nxv1i64: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e64, m1, ta, ma +; CHECK-NEXT:    vsuxei64.v v8, (a0), v9 +; CHECK-NEXT:    ret +entry: +  call void @llvm.riscv.vsuxei.nxv1f64.nxv1i64( +    <vscale x 1 x double> %0, +    ptr %1, +    <vscale x 1 x i64> %2, +    i64 %3) + +  ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv1f64.nxv1i64( +  <vscale x 1 x double>, +  ptr, +  <vscale x 1 x i64>, +  <vscale x 1 x i1>, +  i64); + +define void @intrinsic_vsuxei_mask_v_nxv1f64_nxv1f64_nxv1i64(<vscale x 1 x double> %0, ptr %1, <vscale x 1 x i64> %2, <vscale x 1 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv1f64_nxv1f64_nxv1i64: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e64, m1, ta, ma +; CHECK-NEXT:    vsuxei64.v v8, (a0), v9, v0.t +; CHECK-NEXT:    ret +entry: +  call void @llvm.riscv.vsuxei.mask.nxv1f64.nxv1i64( +    <vscale x 1 x double> %0, +    ptr %1, +    <vscale x 1 x i64> %2, +    <vscale x 1 x i1> %3, +    i64 %4) + +  ret void +} + +declare void @llvm.riscv.vsuxei.nxv2f64.nxv2i64( +  <vscale x 2 x double>, +  ptr, +  <vscale x 2 x i64>, +  i64); + +define void @intrinsic_vsuxei_v_nxv2f64_nxv2f64_nxv2i64(<vscale x 2 x double> %0, ptr %1, <vscale x 2 x i64> %2, i64 %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv2f64_nxv2f64_nxv2i64: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e64, m2, ta, ma +; CHECK-NEXT:    vsuxei64.v v8, (a0), v10 +; CHECK-NEXT:    ret +entry: +  call void @llvm.riscv.vsuxei.nxv2f64.nxv2i64( +    <vscale x 2 x double> %0, +    ptr %1, +    <vscale x 2 x i64> %2, +    i64 %3) + +  ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv2f64.nxv2i64( +  <vscale x 2 x double>, +  ptr, +  <vscale x 2 x i64>, +  <vscale x 2 x i1>, +  i64); + +define void @intrinsic_vsuxei_mask_v_nxv2f64_nxv2f64_nxv2i64(<vscale x 2 x double> %0, ptr %1, <vscale x 2 x i64> %2, <vscale x 2 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv2f64_nxv2f64_nxv2i64: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e64, m2, ta, ma +; CHECK-NEXT:    vsuxei64.v v8, (a0), v10, v0.t +; 
CHECK-NEXT:    ret +entry: +  call void @llvm.riscv.vsuxei.mask.nxv2f64.nxv2i64( +    <vscale x 2 x double> %0, +    ptr %1, +    <vscale x 2 x i64> %2, +    <vscale x 2 x i1> %3, +    i64 %4) + +  ret void +} + +declare void @llvm.riscv.vsuxei.nxv4f64.nxv4i64( +  <vscale x 4 x double>, +  ptr, +  <vscale x 4 x i64>, +  i64); + +define void @intrinsic_vsuxei_v_nxv4f64_nxv4f64_nxv4i64(<vscale x 4 x double> %0, ptr %1, <vscale x 4 x i64> %2, i64 %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv4f64_nxv4f64_nxv4i64: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e64, m4, ta, ma +; CHECK-NEXT:    vsuxei64.v v8, (a0), v12 +; CHECK-NEXT:    ret +entry: +  call void @llvm.riscv.vsuxei.nxv4f64.nxv4i64( +    <vscale x 4 x double> %0, +    ptr %1, +    <vscale x 4 x i64> %2, +    i64 %3) + +  ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv4f64.nxv4i64( +  <vscale x 4 x double>, +  ptr, +  <vscale x 4 x i64>, +  <vscale x 4 x i1>, +  i64); + +define void @intrinsic_vsuxei_mask_v_nxv4f64_nxv4f64_nxv4i64(<vscale x 4 x double> %0, ptr %1, <vscale x 4 x i64> %2, <vscale x 4 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv4f64_nxv4f64_nxv4i64: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e64, m4, ta, ma +; CHECK-NEXT:    vsuxei64.v v8, (a0), v12, v0.t +; CHECK-NEXT:    ret +entry: +  call void @llvm.riscv.vsuxei.mask.nxv4f64.nxv4i64( +    <vscale x 4 x double> %0, +    ptr %1, +    <vscale x 4 x i64> %2, +    <vscale x 4 x i1> %3, +    i64 %4) + +  ret void +} + +declare void @llvm.riscv.vsuxei.nxv8f64.nxv8i64( +  <vscale x 8 x double>, +  ptr, +  <vscale x 8 x i64>, +  i64); + +define void @intrinsic_vsuxei_v_nxv8f64_nxv8f64_nxv8i64(<vscale x 8 x double> %0, ptr %1, <vscale x 8 x i64> %2, i64 %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv8f64_nxv8f64_nxv8i64: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e64, m8, ta, ma +; CHECK-NEXT:    vsuxei64.v v8, (a0), v16 +; CHECK-NEXT:    ret +entry: +  call void @llvm.riscv.vsuxei.nxv8f64.nxv8i64( +    <vscale x 8 x double> %0, +    ptr %1, +    <vscale x 8 x i64> %2, +    i64 %3) + +  ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv8f64.nxv8i64( +  <vscale x 8 x double>, +  ptr, +  <vscale x 8 x i64>, +  <vscale x 8 x i1>, +  i64); + +define void @intrinsic_vsuxei_mask_v_nxv8f64_nxv8f64_nxv8i64(<vscale x 8 x double> %0, ptr %1, <vscale x 8 x i64> %2, <vscale x 8 x i1> %3, i64 %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv8f64_nxv8f64_nxv8i64: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e64, m8, ta, ma +; CHECK-NEXT:    vsuxei64.v v8, (a0), v16, v0.t +; CHECK-NEXT:    ret +entry: +  call void @llvm.riscv.vsuxei.mask.nxv8f64.nxv8i64( +    <vscale x 8 x double> %0, +    ptr %1, +    <vscale x 8 x i64> %2, +    <vscale x 8 x i1> %3, +    i64 %4) + +  ret void +} diff --git a/llvm/test/CodeGen/RISCV/GlobalISel/rvv/vsuxei.ll b/llvm/test/CodeGen/RISCV/GlobalISel/rvv/vsuxei.ll new file mode 100644 index 0000000..7cd1545 --- /dev/null +++ b/llvm/test/CodeGen/RISCV/GlobalISel/rvv/vsuxei.ll @@ -0,0 +1,4881 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: sed 's/iXLen/i32/g' %s | llc -mtriple=riscv32 -mattr=+v,+zvfhmin,+zvfbfmin \ +; RUN:   -global-isel -verify-machineinstrs -target-abi=ilp32d | FileCheck %s +; RUN: sed 's/iXLen/i64/g' %s | llc -mtriple=riscv64 -mattr=+v,+zvfhmin,+zvfbfmin \ +; RUN:   -global-isel -verify-machineinstrs -target-abi=lp64d | FileCheck %s + 
+declare void @llvm.riscv.vsuxei.nxv1i8.nxv1i32( +  <vscale x 1 x i8>, +  ptr, +  <vscale x 1 x i32>, +  iXLen); + +define void @intrinsic_vsuxei_v_nxv1i8_nxv1i8_nxv1i32(<vscale x 1 x i8> %0, ptr %1, <vscale x 1 x i32> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv1i8_nxv1i8_nxv1i32: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e8, mf8, ta, ma +; CHECK-NEXT:    vsuxei32.v v8, (a0), v9 +; CHECK-NEXT:    ret +entry: +  call void @llvm.riscv.vsuxei.nxv1i8.nxv1i32( +    <vscale x 1 x i8> %0, +    ptr %1, +    <vscale x 1 x i32> %2, +    iXLen %3) + +  ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv1i8.nxv1i32( +  <vscale x 1 x i8>, +  ptr, +  <vscale x 1 x i32>, +  <vscale x 1 x i1>, +  iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv1i8_nxv1i8_nxv1i32(<vscale x 1 x i8> %0, ptr %1, <vscale x 1 x i32> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv1i8_nxv1i8_nxv1i32: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e8, mf8, ta, ma +; CHECK-NEXT:    vsuxei32.v v8, (a0), v9, v0.t +; CHECK-NEXT:    ret +entry: +  call void @llvm.riscv.vsuxei.mask.nxv1i8.nxv1i32( +    <vscale x 1 x i8> %0, +    ptr %1, +    <vscale x 1 x i32> %2, +    <vscale x 1 x i1> %3, +    iXLen %4) + +  ret void +} + +declare void @llvm.riscv.vsuxei.nxv2i8.nxv2i32( +  <vscale x 2 x i8>, +  ptr, +  <vscale x 2 x i32>, +  iXLen); + +define void @intrinsic_vsuxei_v_nxv2i8_nxv2i8_nxv2i32(<vscale x 2 x i8> %0, ptr %1, <vscale x 2 x i32> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv2i8_nxv2i8_nxv2i32: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e8, mf4, ta, ma +; CHECK-NEXT:    vsuxei32.v v8, (a0), v9 +; CHECK-NEXT:    ret +entry: +  call void @llvm.riscv.vsuxei.nxv2i8.nxv2i32( +    <vscale x 2 x i8> %0, +    ptr %1, +    <vscale x 2 x i32> %2, +    iXLen %3) + +  ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv2i8.nxv2i32( +  <vscale x 2 x i8>, +  ptr, +  <vscale x 2 x i32>, +  <vscale x 2 x i1>, +  iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv2i8_nxv2i8_nxv2i32(<vscale x 2 x i8> %0, ptr %1, <vscale x 2 x i32> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv2i8_nxv2i8_nxv2i32: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e8, mf4, ta, ma +; CHECK-NEXT:    vsuxei32.v v8, (a0), v9, v0.t +; CHECK-NEXT:    ret +entry: +  call void @llvm.riscv.vsuxei.mask.nxv2i8.nxv2i32( +    <vscale x 2 x i8> %0, +    ptr %1, +    <vscale x 2 x i32> %2, +    <vscale x 2 x i1> %3, +    iXLen %4) + +  ret void +} + +declare void @llvm.riscv.vsuxei.nxv4i8.nxv4i32( +  <vscale x 4 x i8>, +  ptr, +  <vscale x 4 x i32>, +  iXLen); + +define void @intrinsic_vsuxei_v_nxv4i8_nxv4i8_nxv4i32(<vscale x 4 x i8> %0, ptr %1, <vscale x 4 x i32> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv4i8_nxv4i8_nxv4i32: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e8, mf2, ta, ma +; CHECK-NEXT:    vsuxei32.v v8, (a0), v10 +; CHECK-NEXT:    ret +entry: +  call void @llvm.riscv.vsuxei.nxv4i8.nxv4i32( +    <vscale x 4 x i8> %0, +    ptr %1, +    <vscale x 4 x i32> %2, +    iXLen %3) + +  ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv4i8.nxv4i32( +  <vscale x 4 x i8>, +  ptr, +  <vscale x 4 x i32>, +  <vscale x 4 x i1>, +  iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv4i8_nxv4i8_nxv4i32(<vscale x 4 x i8> %0, ptr %1, <vscale x 4 x i32> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind { +; 
CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv4i8_nxv4i8_nxv4i32: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e8, mf2, ta, ma +; CHECK-NEXT:    vsuxei32.v v8, (a0), v10, v0.t +; CHECK-NEXT:    ret +entry: +  call void @llvm.riscv.vsuxei.mask.nxv4i8.nxv4i32( +    <vscale x 4 x i8> %0, +    ptr %1, +    <vscale x 4 x i32> %2, +    <vscale x 4 x i1> %3, +    iXLen %4) + +  ret void +} + +declare void @llvm.riscv.vsuxei.nxv8i8.nxv8i32( +  <vscale x 8 x i8>, +  ptr, +  <vscale x 8 x i32>, +  iXLen); + +define void @intrinsic_vsuxei_v_nxv8i8_nxv8i8_nxv8i32(<vscale x 8 x i8> %0, ptr %1, <vscale x 8 x i32> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv8i8_nxv8i8_nxv8i32: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e8, m1, ta, ma +; CHECK-NEXT:    vsuxei32.v v8, (a0), v12 +; CHECK-NEXT:    ret +entry: +  call void @llvm.riscv.vsuxei.nxv8i8.nxv8i32( +    <vscale x 8 x i8> %0, +    ptr %1, +    <vscale x 8 x i32> %2, +    iXLen %3) + +  ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv8i8.nxv8i32( +  <vscale x 8 x i8>, +  ptr, +  <vscale x 8 x i32>, +  <vscale x 8 x i1>, +  iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv8i8_nxv8i8_nxv8i32(<vscale x 8 x i8> %0, ptr %1, <vscale x 8 x i32> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv8i8_nxv8i8_nxv8i32: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e8, m1, ta, ma +; CHECK-NEXT:    vsuxei32.v v8, (a0), v12, v0.t +; CHECK-NEXT:    ret +entry: +  call void @llvm.riscv.vsuxei.mask.nxv8i8.nxv8i32( +    <vscale x 8 x i8> %0, +    ptr %1, +    <vscale x 8 x i32> %2, +    <vscale x 8 x i1> %3, +    iXLen %4) + +  ret void +} + +declare void @llvm.riscv.vsuxei.nxv16i8.nxv16i32( +  <vscale x 16 x i8>, +  ptr, +  <vscale x 16 x i32>, +  iXLen); + +define void @intrinsic_vsuxei_v_nxv16i8_nxv16i8_nxv16i32(<vscale x 16 x i8> %0, ptr %1, <vscale x 16 x i32> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv16i8_nxv16i8_nxv16i32: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e8, m2, ta, ma +; CHECK-NEXT:    vsuxei32.v v8, (a0), v16 +; CHECK-NEXT:    ret +entry: +  call void @llvm.riscv.vsuxei.nxv16i8.nxv16i32( +    <vscale x 16 x i8> %0, +    ptr %1, +    <vscale x 16 x i32> %2, +    iXLen %3) + +  ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv16i8.nxv16i32( +  <vscale x 16 x i8>, +  ptr, +  <vscale x 16 x i32>, +  <vscale x 16 x i1>, +  iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv16i8_nxv16i8_nxv16i32(<vscale x 16 x i8> %0, ptr %1, <vscale x 16 x i32> %2, <vscale x 16 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv16i8_nxv16i8_nxv16i32: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e8, m2, ta, ma +; CHECK-NEXT:    vsuxei32.v v8, (a0), v16, v0.t +; CHECK-NEXT:    ret +entry: +  call void @llvm.riscv.vsuxei.mask.nxv16i8.nxv16i32( +    <vscale x 16 x i8> %0, +    ptr %1, +    <vscale x 16 x i32> %2, +    <vscale x 16 x i1> %3, +    iXLen %4) + +  ret void +} + +declare void @llvm.riscv.vsuxei.nxv1i16.nxv1i32( +  <vscale x 1 x i16>, +  ptr, +  <vscale x 1 x i32>, +  iXLen); + +define void @intrinsic_vsuxei_v_nxv1i16_nxv1i16_nxv1i32(<vscale x 1 x i16> %0, ptr %1, <vscale x 1 x i32> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv1i16_nxv1i16_nxv1i32: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e16, mf4, ta, ma +; CHECK-NEXT:    vsuxei32.v v8, (a0), v9 +; CHECK-NEXT:    ret +entry: +  call 
void @llvm.riscv.vsuxei.nxv1i16.nxv1i32( +    <vscale x 1 x i16> %0, +    ptr %1, +    <vscale x 1 x i32> %2, +    iXLen %3) + +  ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv1i16.nxv1i32( +  <vscale x 1 x i16>, +  ptr, +  <vscale x 1 x i32>, +  <vscale x 1 x i1>, +  iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv1i16_nxv1i16_nxv1i32(<vscale x 1 x i16> %0, ptr %1, <vscale x 1 x i32> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv1i16_nxv1i16_nxv1i32: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e16, mf4, ta, ma +; CHECK-NEXT:    vsuxei32.v v8, (a0), v9, v0.t +; CHECK-NEXT:    ret +entry: +  call void @llvm.riscv.vsuxei.mask.nxv1i16.nxv1i32( +    <vscale x 1 x i16> %0, +    ptr %1, +    <vscale x 1 x i32> %2, +    <vscale x 1 x i1> %3, +    iXLen %4) + +  ret void +} + +declare void @llvm.riscv.vsuxei.nxv2i16.nxv2i32( +  <vscale x 2 x i16>, +  ptr, +  <vscale x 2 x i32>, +  iXLen); + +define void @intrinsic_vsuxei_v_nxv2i16_nxv2i16_nxv2i32(<vscale x 2 x i16> %0, ptr %1, <vscale x 2 x i32> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv2i16_nxv2i16_nxv2i32: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e16, mf2, ta, ma +; CHECK-NEXT:    vsuxei32.v v8, (a0), v9 +; CHECK-NEXT:    ret +entry: +  call void @llvm.riscv.vsuxei.nxv2i16.nxv2i32( +    <vscale x 2 x i16> %0, +    ptr %1, +    <vscale x 2 x i32> %2, +    iXLen %3) + +  ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv2i16.nxv2i32( +  <vscale x 2 x i16>, +  ptr, +  <vscale x 2 x i32>, +  <vscale x 2 x i1>, +  iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv2i16_nxv2i16_nxv2i32(<vscale x 2 x i16> %0, ptr %1, <vscale x 2 x i32> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv2i16_nxv2i16_nxv2i32: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e16, mf2, ta, ma +; CHECK-NEXT:    vsuxei32.v v8, (a0), v9, v0.t +; CHECK-NEXT:    ret +entry: +  call void @llvm.riscv.vsuxei.mask.nxv2i16.nxv2i32( +    <vscale x 2 x i16> %0, +    ptr %1, +    <vscale x 2 x i32> %2, +    <vscale x 2 x i1> %3, +    iXLen %4) + +  ret void +} + +declare void @llvm.riscv.vsuxei.nxv4i16.nxv4i32( +  <vscale x 4 x i16>, +  ptr, +  <vscale x 4 x i32>, +  iXLen); + +define void @intrinsic_vsuxei_v_nxv4i16_nxv4i16_nxv4i32(<vscale x 4 x i16> %0, ptr %1, <vscale x 4 x i32> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv4i16_nxv4i16_nxv4i32: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e16, m1, ta, ma +; CHECK-NEXT:    vsuxei32.v v8, (a0), v10 +; CHECK-NEXT:    ret +entry: +  call void @llvm.riscv.vsuxei.nxv4i16.nxv4i32( +    <vscale x 4 x i16> %0, +    ptr %1, +    <vscale x 4 x i32> %2, +    iXLen %3) + +  ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv4i16.nxv4i32( +  <vscale x 4 x i16>, +  ptr, +  <vscale x 4 x i32>, +  <vscale x 4 x i1>, +  iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv4i16_nxv4i16_nxv4i32(<vscale x 4 x i16> %0, ptr %1, <vscale x 4 x i32> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv4i16_nxv4i16_nxv4i32: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e16, m1, ta, ma +; CHECK-NEXT:    vsuxei32.v v8, (a0), v10, v0.t +; CHECK-NEXT:    ret +entry: +  call void @llvm.riscv.vsuxei.mask.nxv4i16.nxv4i32( +    <vscale x 4 x i16> %0, +    ptr %1, +    <vscale x 4 x i32> %2, +    <vscale x 4 x i1> %3, +    iXLen %4) + +  ret void +} + +declare void 
@llvm.riscv.vsuxei.nxv8i16.nxv8i32( +  <vscale x 8 x i16>, +  ptr, +  <vscale x 8 x i32>, +  iXLen); + +define void @intrinsic_vsuxei_v_nxv8i16_nxv8i16_nxv8i32(<vscale x 8 x i16> %0, ptr %1, <vscale x 8 x i32> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv8i16_nxv8i16_nxv8i32: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e16, m2, ta, ma +; CHECK-NEXT:    vsuxei32.v v8, (a0), v12 +; CHECK-NEXT:    ret +entry: +  call void @llvm.riscv.vsuxei.nxv8i16.nxv8i32( +    <vscale x 8 x i16> %0, +    ptr %1, +    <vscale x 8 x i32> %2, +    iXLen %3) + +  ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv8i16.nxv8i32( +  <vscale x 8 x i16>, +  ptr, +  <vscale x 8 x i32>, +  <vscale x 8 x i1>, +  iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv8i16_nxv8i16_nxv8i32(<vscale x 8 x i16> %0, ptr %1, <vscale x 8 x i32> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv8i16_nxv8i16_nxv8i32: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e16, m2, ta, ma +; CHECK-NEXT:    vsuxei32.v v8, (a0), v12, v0.t +; CHECK-NEXT:    ret +entry: +  call void @llvm.riscv.vsuxei.mask.nxv8i16.nxv8i32( +    <vscale x 8 x i16> %0, +    ptr %1, +    <vscale x 8 x i32> %2, +    <vscale x 8 x i1> %3, +    iXLen %4) + +  ret void +} + +declare void @llvm.riscv.vsuxei.nxv16i16.nxv16i32( +  <vscale x 16 x i16>, +  ptr, +  <vscale x 16 x i32>, +  iXLen); + +define void @intrinsic_vsuxei_v_nxv16i16_nxv16i16_nxv16i32(<vscale x 16 x i16> %0, ptr %1, <vscale x 16 x i32> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv16i16_nxv16i16_nxv16i32: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e16, m4, ta, ma +; CHECK-NEXT:    vsuxei32.v v8, (a0), v16 +; CHECK-NEXT:    ret +entry: +  call void @llvm.riscv.vsuxei.nxv16i16.nxv16i32( +    <vscale x 16 x i16> %0, +    ptr %1, +    <vscale x 16 x i32> %2, +    iXLen %3) + +  ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv16i16.nxv16i32( +  <vscale x 16 x i16>, +  ptr, +  <vscale x 16 x i32>, +  <vscale x 16 x i1>, +  iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv16i16_nxv16i16_nxv16i32(<vscale x 16 x i16> %0, ptr %1, <vscale x 16 x i32> %2, <vscale x 16 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv16i16_nxv16i16_nxv16i32: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e16, m4, ta, ma +; CHECK-NEXT:    vsuxei32.v v8, (a0), v16, v0.t +; CHECK-NEXT:    ret +entry: +  call void @llvm.riscv.vsuxei.mask.nxv16i16.nxv16i32( +    <vscale x 16 x i16> %0, +    ptr %1, +    <vscale x 16 x i32> %2, +    <vscale x 16 x i1> %3, +    iXLen %4) + +  ret void +} + +declare void @llvm.riscv.vsuxei.nxv1i32.nxv1i32( +  <vscale x 1 x i32>, +  ptr, +  <vscale x 1 x i32>, +  iXLen); + +define void @intrinsic_vsuxei_v_nxv1i32_nxv1i32_nxv1i32(<vscale x 1 x i32> %0, ptr %1, <vscale x 1 x i32> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv1i32_nxv1i32_nxv1i32: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e32, mf2, ta, ma +; CHECK-NEXT:    vsuxei32.v v8, (a0), v9 +; CHECK-NEXT:    ret +entry: +  call void @llvm.riscv.vsuxei.nxv1i32.nxv1i32( +    <vscale x 1 x i32> %0, +    ptr %1, +    <vscale x 1 x i32> %2, +    iXLen %3) + +  ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv1i32.nxv1i32( +  <vscale x 1 x i32>, +  ptr, +  <vscale x 1 x i32>, +  <vscale x 1 x i1>, +  iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv1i32_nxv1i32_nxv1i32(<vscale x 1 x i32> %0, ptr %1, 
<vscale x 1 x i32> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv1i32_nxv1i32_nxv1i32: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e32, mf2, ta, ma +; CHECK-NEXT:    vsuxei32.v v8, (a0), v9, v0.t +; CHECK-NEXT:    ret +entry: +  call void @llvm.riscv.vsuxei.mask.nxv1i32.nxv1i32( +    <vscale x 1 x i32> %0, +    ptr %1, +    <vscale x 1 x i32> %2, +    <vscale x 1 x i1> %3, +    iXLen %4) + +  ret void +} + +declare void @llvm.riscv.vsuxei.nxv2i32.nxv2i32( +  <vscale x 2 x i32>, +  ptr, +  <vscale x 2 x i32>, +  iXLen); + +define void @intrinsic_vsuxei_v_nxv2i32_nxv2i32_nxv2i32(<vscale x 2 x i32> %0, ptr %1, <vscale x 2 x i32> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv2i32_nxv2i32_nxv2i32: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e32, m1, ta, ma +; CHECK-NEXT:    vsuxei32.v v8, (a0), v9 +; CHECK-NEXT:    ret +entry: +  call void @llvm.riscv.vsuxei.nxv2i32.nxv2i32( +    <vscale x 2 x i32> %0, +    ptr %1, +    <vscale x 2 x i32> %2, +    iXLen %3) + +  ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv2i32.nxv2i32( +  <vscale x 2 x i32>, +  ptr, +  <vscale x 2 x i32>, +  <vscale x 2 x i1>, +  iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv2i32_nxv2i32_nxv2i32(<vscale x 2 x i32> %0, ptr %1, <vscale x 2 x i32> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv2i32_nxv2i32_nxv2i32: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e32, m1, ta, ma +; CHECK-NEXT:    vsuxei32.v v8, (a0), v9, v0.t +; CHECK-NEXT:    ret +entry: +  call void @llvm.riscv.vsuxei.mask.nxv2i32.nxv2i32( +    <vscale x 2 x i32> %0, +    ptr %1, +    <vscale x 2 x i32> %2, +    <vscale x 2 x i1> %3, +    iXLen %4) + +  ret void +} + +declare void @llvm.riscv.vsuxei.nxv4i32.nxv4i32( +  <vscale x 4 x i32>, +  ptr, +  <vscale x 4 x i32>, +  iXLen); + +define void @intrinsic_vsuxei_v_nxv4i32_nxv4i32_nxv4i32(<vscale x 4 x i32> %0, ptr %1, <vscale x 4 x i32> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv4i32_nxv4i32_nxv4i32: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e32, m2, ta, ma +; CHECK-NEXT:    vsuxei32.v v8, (a0), v10 +; CHECK-NEXT:    ret +entry: +  call void @llvm.riscv.vsuxei.nxv4i32.nxv4i32( +    <vscale x 4 x i32> %0, +    ptr %1, +    <vscale x 4 x i32> %2, +    iXLen %3) + +  ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv4i32.nxv4i32( +  <vscale x 4 x i32>, +  ptr, +  <vscale x 4 x i32>, +  <vscale x 4 x i1>, +  iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv4i32_nxv4i32_nxv4i32(<vscale x 4 x i32> %0, ptr %1, <vscale x 4 x i32> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv4i32_nxv4i32_nxv4i32: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e32, m2, ta, ma +; CHECK-NEXT:    vsuxei32.v v8, (a0), v10, v0.t +; CHECK-NEXT:    ret +entry: +  call void @llvm.riscv.vsuxei.mask.nxv4i32.nxv4i32( +    <vscale x 4 x i32> %0, +    ptr %1, +    <vscale x 4 x i32> %2, +    <vscale x 4 x i1> %3, +    iXLen %4) + +  ret void +} + +declare void @llvm.riscv.vsuxei.nxv8i32.nxv8i32( +  <vscale x 8 x i32>, +  ptr, +  <vscale x 8 x i32>, +  iXLen); + +define void @intrinsic_vsuxei_v_nxv8i32_nxv8i32_nxv8i32(<vscale x 8 x i32> %0, ptr %1, <vscale x 8 x i32> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv8i32_nxv8i32_nxv8i32: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e32, m4, ta, ma +; 
CHECK-NEXT:    vsuxei32.v v8, (a0), v12 +; CHECK-NEXT:    ret +entry: +  call void @llvm.riscv.vsuxei.nxv8i32.nxv8i32( +    <vscale x 8 x i32> %0, +    ptr %1, +    <vscale x 8 x i32> %2, +    iXLen %3) + +  ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv8i32.nxv8i32( +  <vscale x 8 x i32>, +  ptr, +  <vscale x 8 x i32>, +  <vscale x 8 x i1>, +  iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv8i32_nxv8i32_nxv8i32(<vscale x 8 x i32> %0, ptr %1, <vscale x 8 x i32> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv8i32_nxv8i32_nxv8i32: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e32, m4, ta, ma +; CHECK-NEXT:    vsuxei32.v v8, (a0), v12, v0.t +; CHECK-NEXT:    ret +entry: +  call void @llvm.riscv.vsuxei.mask.nxv8i32.nxv8i32( +    <vscale x 8 x i32> %0, +    ptr %1, +    <vscale x 8 x i32> %2, +    <vscale x 8 x i1> %3, +    iXLen %4) + +  ret void +} + +declare void @llvm.riscv.vsuxei.nxv16i32.nxv16i32( +  <vscale x 16 x i32>, +  ptr, +  <vscale x 16 x i32>, +  iXLen); + +define void @intrinsic_vsuxei_v_nxv16i32_nxv16i32_nxv16i32(<vscale x 16 x i32> %0, ptr %1, <vscale x 16 x i32> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv16i32_nxv16i32_nxv16i32: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e32, m8, ta, ma +; CHECK-NEXT:    vsuxei32.v v8, (a0), v16 +; CHECK-NEXT:    ret +entry: +  call void @llvm.riscv.vsuxei.nxv16i32.nxv16i32( +    <vscale x 16 x i32> %0, +    ptr %1, +    <vscale x 16 x i32> %2, +    iXLen %3) + +  ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv16i32.nxv16i32( +  <vscale x 16 x i32>, +  ptr, +  <vscale x 16 x i32>, +  <vscale x 16 x i1>, +  iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv16i32_nxv16i32_nxv16i32(<vscale x 16 x i32> %0, ptr %1, <vscale x 16 x i32> %2, <vscale x 16 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv16i32_nxv16i32_nxv16i32: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e32, m8, ta, ma +; CHECK-NEXT:    vsuxei32.v v8, (a0), v16, v0.t +; CHECK-NEXT:    ret +entry: +  call void @llvm.riscv.vsuxei.mask.nxv16i32.nxv16i32( +    <vscale x 16 x i32> %0, +    ptr %1, +    <vscale x 16 x i32> %2, +    <vscale x 16 x i1> %3, +    iXLen %4) + +  ret void +} + +declare void @llvm.riscv.vsuxei.nxv1i64.nxv1i32( +  <vscale x 1 x i64>, +  ptr, +  <vscale x 1 x i32>, +  iXLen); + +define void @intrinsic_vsuxei_v_nxv1i64_nxv1i64_nxv1i32(<vscale x 1 x i64> %0, ptr %1, <vscale x 1 x i32> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv1i64_nxv1i64_nxv1i32: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e64, m1, ta, ma +; CHECK-NEXT:    vsuxei32.v v8, (a0), v9 +; CHECK-NEXT:    ret +entry: +  call void @llvm.riscv.vsuxei.nxv1i64.nxv1i32( +    <vscale x 1 x i64> %0, +    ptr %1, +    <vscale x 1 x i32> %2, +    iXLen %3) + +  ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv1i64.nxv1i32( +  <vscale x 1 x i64>, +  ptr, +  <vscale x 1 x i32>, +  <vscale x 1 x i1>, +  iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv1i64_nxv1i64_nxv1i32(<vscale x 1 x i64> %0, ptr %1, <vscale x 1 x i32> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv1i64_nxv1i64_nxv1i32: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e64, m1, ta, ma +; CHECK-NEXT:    vsuxei32.v v8, (a0), v9, v0.t +; CHECK-NEXT:    ret +entry: +  call void @llvm.riscv.vsuxei.mask.nxv1i64.nxv1i32( +    <vscale x 1 x i64> %0, +    ptr 
%1, +    <vscale x 1 x i32> %2, +    <vscale x 1 x i1> %3, +    iXLen %4) + +  ret void +} + +declare void @llvm.riscv.vsuxei.nxv2i64.nxv2i32( +  <vscale x 2 x i64>, +  ptr, +  <vscale x 2 x i32>, +  iXLen); + +define void @intrinsic_vsuxei_v_nxv2i64_nxv2i64_nxv2i32(<vscale x 2 x i64> %0, ptr %1, <vscale x 2 x i32> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv2i64_nxv2i64_nxv2i32: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e64, m2, ta, ma +; CHECK-NEXT:    vsuxei32.v v8, (a0), v10 +; CHECK-NEXT:    ret +entry: +  call void @llvm.riscv.vsuxei.nxv2i64.nxv2i32( +    <vscale x 2 x i64> %0, +    ptr %1, +    <vscale x 2 x i32> %2, +    iXLen %3) + +  ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv2i64.nxv2i32( +  <vscale x 2 x i64>, +  ptr, +  <vscale x 2 x i32>, +  <vscale x 2 x i1>, +  iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv2i64_nxv2i64_nxv2i32(<vscale x 2 x i64> %0, ptr %1, <vscale x 2 x i32> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv2i64_nxv2i64_nxv2i32: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e64, m2, ta, ma +; CHECK-NEXT:    vsuxei32.v v8, (a0), v10, v0.t +; CHECK-NEXT:    ret +entry: +  call void @llvm.riscv.vsuxei.mask.nxv2i64.nxv2i32( +    <vscale x 2 x i64> %0, +    ptr %1, +    <vscale x 2 x i32> %2, +    <vscale x 2 x i1> %3, +    iXLen %4) + +  ret void +} + +declare void @llvm.riscv.vsuxei.nxv4i64.nxv4i32( +  <vscale x 4 x i64>, +  ptr, +  <vscale x 4 x i32>, +  iXLen); + +define void @intrinsic_vsuxei_v_nxv4i64_nxv4i64_nxv4i32(<vscale x 4 x i64> %0, ptr %1, <vscale x 4 x i32> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv4i64_nxv4i64_nxv4i32: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e64, m4, ta, ma +; CHECK-NEXT:    vsuxei32.v v8, (a0), v12 +; CHECK-NEXT:    ret +entry: +  call void @llvm.riscv.vsuxei.nxv4i64.nxv4i32( +    <vscale x 4 x i64> %0, +    ptr %1, +    <vscale x 4 x i32> %2, +    iXLen %3) + +  ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv4i64.nxv4i32( +  <vscale x 4 x i64>, +  ptr, +  <vscale x 4 x i32>, +  <vscale x 4 x i1>, +  iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv4i64_nxv4i64_nxv4i32(<vscale x 4 x i64> %0, ptr %1, <vscale x 4 x i32> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv4i64_nxv4i64_nxv4i32: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e64, m4, ta, ma +; CHECK-NEXT:    vsuxei32.v v8, (a0), v12, v0.t +; CHECK-NEXT:    ret +entry: +  call void @llvm.riscv.vsuxei.mask.nxv4i64.nxv4i32( +    <vscale x 4 x i64> %0, +    ptr %1, +    <vscale x 4 x i32> %2, +    <vscale x 4 x i1> %3, +    iXLen %4) + +  ret void +} + +declare void @llvm.riscv.vsuxei.nxv8i64.nxv8i32( +  <vscale x 8 x i64>, +  ptr, +  <vscale x 8 x i32>, +  iXLen); + +define void @intrinsic_vsuxei_v_nxv8i64_nxv8i64_nxv8i32(<vscale x 8 x i64> %0, ptr %1, <vscale x 8 x i32> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv8i64_nxv8i64_nxv8i32: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e64, m8, ta, ma +; CHECK-NEXT:    vsuxei32.v v8, (a0), v16 +; CHECK-NEXT:    ret +entry: +  call void @llvm.riscv.vsuxei.nxv8i64.nxv8i32( +    <vscale x 8 x i64> %0, +    ptr %1, +    <vscale x 8 x i32> %2, +    iXLen %3) + +  ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv8i64.nxv8i32( +  <vscale x 8 x i64>, +  ptr, +  <vscale x 8 x i32>, +  <vscale x 8 x i1>, +  iXLen); + +define void 
@intrinsic_vsuxei_mask_v_nxv8i64_nxv8i64_nxv8i32(<vscale x 8 x i64> %0, ptr %1, <vscale x 8 x i32> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv8i64_nxv8i64_nxv8i32: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e64, m8, ta, ma +; CHECK-NEXT:    vsuxei32.v v8, (a0), v16, v0.t +; CHECK-NEXT:    ret +entry: +  call void @llvm.riscv.vsuxei.mask.nxv8i64.nxv8i32( +    <vscale x 8 x i64> %0, +    ptr %1, +    <vscale x 8 x i32> %2, +    <vscale x 8 x i1> %3, +    iXLen %4) + +  ret void +} + +declare void @llvm.riscv.vsuxei.nxv1f16.nxv1i32( +  <vscale x 1 x half>, +  ptr, +  <vscale x 1 x i32>, +  iXLen); + +define void @intrinsic_vsuxei_v_nxv1f16_nxv1f16_nxv1i32(<vscale x 1 x half> %0, ptr %1, <vscale x 1 x i32> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv1f16_nxv1f16_nxv1i32: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e16, mf4, ta, ma +; CHECK-NEXT:    vsuxei32.v v8, (a0), v9 +; CHECK-NEXT:    ret +entry: +  call void @llvm.riscv.vsuxei.nxv1f16.nxv1i32( +    <vscale x 1 x half> %0, +    ptr %1, +    <vscale x 1 x i32> %2, +    iXLen %3) + +  ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv1f16.nxv1i32( +  <vscale x 1 x half>, +  ptr, +  <vscale x 1 x i32>, +  <vscale x 1 x i1>, +  iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv1f16_nxv1f16_nxv1i32(<vscale x 1 x half> %0, ptr %1, <vscale x 1 x i32> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv1f16_nxv1f16_nxv1i32: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e16, mf4, ta, ma +; CHECK-NEXT:    vsuxei32.v v8, (a0), v9, v0.t +; CHECK-NEXT:    ret +entry: +  call void @llvm.riscv.vsuxei.mask.nxv1f16.nxv1i32( +    <vscale x 1 x half> %0, +    ptr %1, +    <vscale x 1 x i32> %2, +    <vscale x 1 x i1> %3, +    iXLen %4) + +  ret void +} + +declare void @llvm.riscv.vsuxei.nxv2f16.nxv2i32( +  <vscale x 2 x half>, +  ptr, +  <vscale x 2 x i32>, +  iXLen); + +define void @intrinsic_vsuxei_v_nxv2f16_nxv2f16_nxv2i32(<vscale x 2 x half> %0, ptr %1, <vscale x 2 x i32> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv2f16_nxv2f16_nxv2i32: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e16, mf2, ta, ma +; CHECK-NEXT:    vsuxei32.v v8, (a0), v9 +; CHECK-NEXT:    ret +entry: +  call void @llvm.riscv.vsuxei.nxv2f16.nxv2i32( +    <vscale x 2 x half> %0, +    ptr %1, +    <vscale x 2 x i32> %2, +    iXLen %3) + +  ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv2f16.nxv2i32( +  <vscale x 2 x half>, +  ptr, +  <vscale x 2 x i32>, +  <vscale x 2 x i1>, +  iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv2f16_nxv2f16_nxv2i32(<vscale x 2 x half> %0, ptr %1, <vscale x 2 x i32> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv2f16_nxv2f16_nxv2i32: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e16, mf2, ta, ma +; CHECK-NEXT:    vsuxei32.v v8, (a0), v9, v0.t +; CHECK-NEXT:    ret +entry: +  call void @llvm.riscv.vsuxei.mask.nxv2f16.nxv2i32( +    <vscale x 2 x half> %0, +    ptr %1, +    <vscale x 2 x i32> %2, +    <vscale x 2 x i1> %3, +    iXLen %4) + +  ret void +} + +declare void @llvm.riscv.vsuxei.nxv4f16.nxv4i32( +  <vscale x 4 x half>, +  ptr, +  <vscale x 4 x i32>, +  iXLen); + +define void @intrinsic_vsuxei_v_nxv4f16_nxv4f16_nxv4i32(<vscale x 4 x half> %0, ptr %1, <vscale x 4 x i32> %2, iXLen %3) nounwind { +; CHECK-LABEL: 
intrinsic_vsuxei_v_nxv4f16_nxv4f16_nxv4i32: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e16, m1, ta, ma +; CHECK-NEXT:    vsuxei32.v v8, (a0), v10 +; CHECK-NEXT:    ret +entry: +  call void @llvm.riscv.vsuxei.nxv4f16.nxv4i32( +    <vscale x 4 x half> %0, +    ptr %1, +    <vscale x 4 x i32> %2, +    iXLen %3) + +  ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv4f16.nxv4i32( +  <vscale x 4 x half>, +  ptr, +  <vscale x 4 x i32>, +  <vscale x 4 x i1>, +  iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv4f16_nxv4f16_nxv4i32(<vscale x 4 x half> %0, ptr %1, <vscale x 4 x i32> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv4f16_nxv4f16_nxv4i32: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e16, m1, ta, ma +; CHECK-NEXT:    vsuxei32.v v8, (a0), v10, v0.t +; CHECK-NEXT:    ret +entry: +  call void @llvm.riscv.vsuxei.mask.nxv4f16.nxv4i32( +    <vscale x 4 x half> %0, +    ptr %1, +    <vscale x 4 x i32> %2, +    <vscale x 4 x i1> %3, +    iXLen %4) + +  ret void +} + +declare void @llvm.riscv.vsuxei.nxv8f16.nxv8i32( +  <vscale x 8 x half>, +  ptr, +  <vscale x 8 x i32>, +  iXLen); + +define void @intrinsic_vsuxei_v_nxv8f16_nxv8f16_nxv8i32(<vscale x 8 x half> %0, ptr %1, <vscale x 8 x i32> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv8f16_nxv8f16_nxv8i32: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e16, m2, ta, ma +; CHECK-NEXT:    vsuxei32.v v8, (a0), v12 +; CHECK-NEXT:    ret +entry: +  call void @llvm.riscv.vsuxei.nxv8f16.nxv8i32( +    <vscale x 8 x half> %0, +    ptr %1, +    <vscale x 8 x i32> %2, +    iXLen %3) + +  ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv8f16.nxv8i32( +  <vscale x 8 x half>, +  ptr, +  <vscale x 8 x i32>, +  <vscale x 8 x i1>, +  iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv8f16_nxv8f16_nxv8i32(<vscale x 8 x half> %0, ptr %1, <vscale x 8 x i32> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv8f16_nxv8f16_nxv8i32: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e16, m2, ta, ma +; CHECK-NEXT:    vsuxei32.v v8, (a0), v12, v0.t +; CHECK-NEXT:    ret +entry: +  call void @llvm.riscv.vsuxei.mask.nxv8f16.nxv8i32( +    <vscale x 8 x half> %0, +    ptr %1, +    <vscale x 8 x i32> %2, +    <vscale x 8 x i1> %3, +    iXLen %4) + +  ret void +} + +declare void @llvm.riscv.vsuxei.nxv16f16.nxv16i32( +  <vscale x 16 x half>, +  ptr, +  <vscale x 16 x i32>, +  iXLen); + +define void @intrinsic_vsuxei_v_nxv16f16_nxv16f16_nxv16i32(<vscale x 16 x half> %0, ptr %1, <vscale x 16 x i32> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv16f16_nxv16f16_nxv16i32: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e16, m4, ta, ma +; CHECK-NEXT:    vsuxei32.v v8, (a0), v16 +; CHECK-NEXT:    ret +entry: +  call void @llvm.riscv.vsuxei.nxv16f16.nxv16i32( +    <vscale x 16 x half> %0, +    ptr %1, +    <vscale x 16 x i32> %2, +    iXLen %3) + +  ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv16f16.nxv16i32( +  <vscale x 16 x half>, +  ptr, +  <vscale x 16 x i32>, +  <vscale x 16 x i1>, +  iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv16f16_nxv16f16_nxv16i32(<vscale x 16 x half> %0, ptr %1, <vscale x 16 x i32> %2, <vscale x 16 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv16f16_nxv16f16_nxv16i32: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e16, m4, ta, ma +; CHECK-NEXT:    
vsuxei32.v v8, (a0), v16, v0.t +; CHECK-NEXT:    ret +entry: +  call void @llvm.riscv.vsuxei.mask.nxv16f16.nxv16i32( +    <vscale x 16 x half> %0, +    ptr %1, +    <vscale x 16 x i32> %2, +    <vscale x 16 x i1> %3, +    iXLen %4) + +  ret void +} + +declare void @llvm.riscv.vsuxei.nxv1f32.nxv1i32( +  <vscale x 1 x float>, +  ptr, +  <vscale x 1 x i32>, +  iXLen); + +define void @intrinsic_vsuxei_v_nxv1f32_nxv1f32_nxv1i32(<vscale x 1 x float> %0, ptr %1, <vscale x 1 x i32> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv1f32_nxv1f32_nxv1i32: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e32, mf2, ta, ma +; CHECK-NEXT:    vsuxei32.v v8, (a0), v9 +; CHECK-NEXT:    ret +entry: +  call void @llvm.riscv.vsuxei.nxv1f32.nxv1i32( +    <vscale x 1 x float> %0, +    ptr %1, +    <vscale x 1 x i32> %2, +    iXLen %3) + +  ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv1f32.nxv1i32( +  <vscale x 1 x float>, +  ptr, +  <vscale x 1 x i32>, +  <vscale x 1 x i1>, +  iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv1f32_nxv1f32_nxv1i32(<vscale x 1 x float> %0, ptr %1, <vscale x 1 x i32> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv1f32_nxv1f32_nxv1i32: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e32, mf2, ta, ma +; CHECK-NEXT:    vsuxei32.v v8, (a0), v9, v0.t +; CHECK-NEXT:    ret +entry: +  call void @llvm.riscv.vsuxei.mask.nxv1f32.nxv1i32( +    <vscale x 1 x float> %0, +    ptr %1, +    <vscale x 1 x i32> %2, +    <vscale x 1 x i1> %3, +    iXLen %4) + +  ret void +} + +declare void @llvm.riscv.vsuxei.nxv2f32.nxv2i32( +  <vscale x 2 x float>, +  ptr, +  <vscale x 2 x i32>, +  iXLen); + +define void @intrinsic_vsuxei_v_nxv2f32_nxv2f32_nxv2i32(<vscale x 2 x float> %0, ptr %1, <vscale x 2 x i32> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv2f32_nxv2f32_nxv2i32: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e32, m1, ta, ma +; CHECK-NEXT:    vsuxei32.v v8, (a0), v9 +; CHECK-NEXT:    ret +entry: +  call void @llvm.riscv.vsuxei.nxv2f32.nxv2i32( +    <vscale x 2 x float> %0, +    ptr %1, +    <vscale x 2 x i32> %2, +    iXLen %3) + +  ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv2f32.nxv2i32( +  <vscale x 2 x float>, +  ptr, +  <vscale x 2 x i32>, +  <vscale x 2 x i1>, +  iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv2f32_nxv2f32_nxv2i32(<vscale x 2 x float> %0, ptr %1, <vscale x 2 x i32> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv2f32_nxv2f32_nxv2i32: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e32, m1, ta, ma +; CHECK-NEXT:    vsuxei32.v v8, (a0), v9, v0.t +; CHECK-NEXT:    ret +entry: +  call void @llvm.riscv.vsuxei.mask.nxv2f32.nxv2i32( +    <vscale x 2 x float> %0, +    ptr %1, +    <vscale x 2 x i32> %2, +    <vscale x 2 x i1> %3, +    iXLen %4) + +  ret void +} + +declare void @llvm.riscv.vsuxei.nxv4f32.nxv4i32( +  <vscale x 4 x float>, +  ptr, +  <vscale x 4 x i32>, +  iXLen); + +define void @intrinsic_vsuxei_v_nxv4f32_nxv4f32_nxv4i32(<vscale x 4 x float> %0, ptr %1, <vscale x 4 x i32> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv4f32_nxv4f32_nxv4i32: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e32, m2, ta, ma +; CHECK-NEXT:    vsuxei32.v v8, (a0), v10 +; CHECK-NEXT:    ret +entry: +  call void @llvm.riscv.vsuxei.nxv4f32.nxv4i32( +    <vscale x 4 x float> %0, +    ptr %1, +    <vscale x 4 x i32> %2, +    iXLen %3) 
+ +  ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv4f32.nxv4i32( +  <vscale x 4 x float>, +  ptr, +  <vscale x 4 x i32>, +  <vscale x 4 x i1>, +  iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv4f32_nxv4f32_nxv4i32(<vscale x 4 x float> %0, ptr %1, <vscale x 4 x i32> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv4f32_nxv4f32_nxv4i32: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e32, m2, ta, ma +; CHECK-NEXT:    vsuxei32.v v8, (a0), v10, v0.t +; CHECK-NEXT:    ret +entry: +  call void @llvm.riscv.vsuxei.mask.nxv4f32.nxv4i32( +    <vscale x 4 x float> %0, +    ptr %1, +    <vscale x 4 x i32> %2, +    <vscale x 4 x i1> %3, +    iXLen %4) + +  ret void +} + +declare void @llvm.riscv.vsuxei.nxv8f32.nxv8i32( +  <vscale x 8 x float>, +  ptr, +  <vscale x 8 x i32>, +  iXLen); + +define void @intrinsic_vsuxei_v_nxv8f32_nxv8f32_nxv8i32(<vscale x 8 x float> %0, ptr %1, <vscale x 8 x i32> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv8f32_nxv8f32_nxv8i32: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e32, m4, ta, ma +; CHECK-NEXT:    vsuxei32.v v8, (a0), v12 +; CHECK-NEXT:    ret +entry: +  call void @llvm.riscv.vsuxei.nxv8f32.nxv8i32( +    <vscale x 8 x float> %0, +    ptr %1, +    <vscale x 8 x i32> %2, +    iXLen %3) + +  ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv8f32.nxv8i32( +  <vscale x 8 x float>, +  ptr, +  <vscale x 8 x i32>, +  <vscale x 8 x i1>, +  iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv8f32_nxv8f32_nxv8i32(<vscale x 8 x float> %0, ptr %1, <vscale x 8 x i32> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv8f32_nxv8f32_nxv8i32: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e32, m4, ta, ma +; CHECK-NEXT:    vsuxei32.v v8, (a0), v12, v0.t +; CHECK-NEXT:    ret +entry: +  call void @llvm.riscv.vsuxei.mask.nxv8f32.nxv8i32( +    <vscale x 8 x float> %0, +    ptr %1, +    <vscale x 8 x i32> %2, +    <vscale x 8 x i1> %3, +    iXLen %4) + +  ret void +} + +declare void @llvm.riscv.vsuxei.nxv16f32.nxv16i32( +  <vscale x 16 x float>, +  ptr, +  <vscale x 16 x i32>, +  iXLen); + +define void @intrinsic_vsuxei_v_nxv16f32_nxv16f32_nxv16i32(<vscale x 16 x float> %0, ptr %1, <vscale x 16 x i32> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv16f32_nxv16f32_nxv16i32: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e32, m8, ta, ma +; CHECK-NEXT:    vsuxei32.v v8, (a0), v16 +; CHECK-NEXT:    ret +entry: +  call void @llvm.riscv.vsuxei.nxv16f32.nxv16i32( +    <vscale x 16 x float> %0, +    ptr %1, +    <vscale x 16 x i32> %2, +    iXLen %3) + +  ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv16f32.nxv16i32( +  <vscale x 16 x float>, +  ptr, +  <vscale x 16 x i32>, +  <vscale x 16 x i1>, +  iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv16f32_nxv16f32_nxv16i32(<vscale x 16 x float> %0, ptr %1, <vscale x 16 x i32> %2, <vscale x 16 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv16f32_nxv16f32_nxv16i32: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e32, m8, ta, ma +; CHECK-NEXT:    vsuxei32.v v8, (a0), v16, v0.t +; CHECK-NEXT:    ret +entry: +  call void @llvm.riscv.vsuxei.mask.nxv16f32.nxv16i32( +    <vscale x 16 x float> %0, +    ptr %1, +    <vscale x 16 x i32> %2, +    <vscale x 16 x i1> %3, +    iXLen %4) + +  ret void +} + +declare void @llvm.riscv.vsuxei.nxv1f64.nxv1i32( +  <vscale x 1 x double>, +  
ptr, +  <vscale x 1 x i32>, +  iXLen); + +define void @intrinsic_vsuxei_v_nxv1f64_nxv1f64_nxv1i32(<vscale x 1 x double> %0, ptr %1, <vscale x 1 x i32> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv1f64_nxv1f64_nxv1i32: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e64, m1, ta, ma +; CHECK-NEXT:    vsuxei32.v v8, (a0), v9 +; CHECK-NEXT:    ret +entry: +  call void @llvm.riscv.vsuxei.nxv1f64.nxv1i32( +    <vscale x 1 x double> %0, +    ptr %1, +    <vscale x 1 x i32> %2, +    iXLen %3) + +  ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv1f64.nxv1i32( +  <vscale x 1 x double>, +  ptr, +  <vscale x 1 x i32>, +  <vscale x 1 x i1>, +  iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv1f64_nxv1f64_nxv1i32(<vscale x 1 x double> %0, ptr %1, <vscale x 1 x i32> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv1f64_nxv1f64_nxv1i32: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e64, m1, ta, ma +; CHECK-NEXT:    vsuxei32.v v8, (a0), v9, v0.t +; CHECK-NEXT:    ret +entry: +  call void @llvm.riscv.vsuxei.mask.nxv1f64.nxv1i32( +    <vscale x 1 x double> %0, +    ptr %1, +    <vscale x 1 x i32> %2, +    <vscale x 1 x i1> %3, +    iXLen %4) + +  ret void +} + +declare void @llvm.riscv.vsuxei.nxv2f64.nxv2i32( +  <vscale x 2 x double>, +  ptr, +  <vscale x 2 x i32>, +  iXLen); + +define void @intrinsic_vsuxei_v_nxv2f64_nxv2f64_nxv2i32(<vscale x 2 x double> %0, ptr %1, <vscale x 2 x i32> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv2f64_nxv2f64_nxv2i32: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e64, m2, ta, ma +; CHECK-NEXT:    vsuxei32.v v8, (a0), v10 +; CHECK-NEXT:    ret +entry: +  call void @llvm.riscv.vsuxei.nxv2f64.nxv2i32( +    <vscale x 2 x double> %0, +    ptr %1, +    <vscale x 2 x i32> %2, +    iXLen %3) + +  ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv2f64.nxv2i32( +  <vscale x 2 x double>, +  ptr, +  <vscale x 2 x i32>, +  <vscale x 2 x i1>, +  iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv2f64_nxv2f64_nxv2i32(<vscale x 2 x double> %0, ptr %1, <vscale x 2 x i32> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv2f64_nxv2f64_nxv2i32: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e64, m2, ta, ma +; CHECK-NEXT:    vsuxei32.v v8, (a0), v10, v0.t +; CHECK-NEXT:    ret +entry: +  call void @llvm.riscv.vsuxei.mask.nxv2f64.nxv2i32( +    <vscale x 2 x double> %0, +    ptr %1, +    <vscale x 2 x i32> %2, +    <vscale x 2 x i1> %3, +    iXLen %4) + +  ret void +} + +declare void @llvm.riscv.vsuxei.nxv4f64.nxv4i32( +  <vscale x 4 x double>, +  ptr, +  <vscale x 4 x i32>, +  iXLen); + +define void @intrinsic_vsuxei_v_nxv4f64_nxv4f64_nxv4i32(<vscale x 4 x double> %0, ptr %1, <vscale x 4 x i32> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv4f64_nxv4f64_nxv4i32: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e64, m4, ta, ma +; CHECK-NEXT:    vsuxei32.v v8, (a0), v12 +; CHECK-NEXT:    ret +entry: +  call void @llvm.riscv.vsuxei.nxv4f64.nxv4i32( +    <vscale x 4 x double> %0, +    ptr %1, +    <vscale x 4 x i32> %2, +    iXLen %3) + +  ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv4f64.nxv4i32( +  <vscale x 4 x double>, +  ptr, +  <vscale x 4 x i32>, +  <vscale x 4 x i1>, +  iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv4f64_nxv4f64_nxv4i32(<vscale x 4 x double> %0, ptr %1, <vscale x 4 x i32> %2, <vscale x 4 x i1> %3, iXLen 
%4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv4f64_nxv4f64_nxv4i32: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e64, m4, ta, ma +; CHECK-NEXT:    vsuxei32.v v8, (a0), v12, v0.t +; CHECK-NEXT:    ret +entry: +  call void @llvm.riscv.vsuxei.mask.nxv4f64.nxv4i32( +    <vscale x 4 x double> %0, +    ptr %1, +    <vscale x 4 x i32> %2, +    <vscale x 4 x i1> %3, +    iXLen %4) + +  ret void +} + +declare void @llvm.riscv.vsuxei.nxv8f64.nxv8i32( +  <vscale x 8 x double>, +  ptr, +  <vscale x 8 x i32>, +  iXLen); + +define void @intrinsic_vsuxei_v_nxv8f64_nxv8f64_nxv8i32(<vscale x 8 x double> %0, ptr %1, <vscale x 8 x i32> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv8f64_nxv8f64_nxv8i32: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e64, m8, ta, ma +; CHECK-NEXT:    vsuxei32.v v8, (a0), v16 +; CHECK-NEXT:    ret +entry: +  call void @llvm.riscv.vsuxei.nxv8f64.nxv8i32( +    <vscale x 8 x double> %0, +    ptr %1, +    <vscale x 8 x i32> %2, +    iXLen %3) + +  ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv8f64.nxv8i32( +  <vscale x 8 x double>, +  ptr, +  <vscale x 8 x i32>, +  <vscale x 8 x i1>, +  iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv8f64_nxv8f64_nxv8i32(<vscale x 8 x double> %0, ptr %1, <vscale x 8 x i32> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv8f64_nxv8f64_nxv8i32: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e64, m8, ta, ma +; CHECK-NEXT:    vsuxei32.v v8, (a0), v16, v0.t +; CHECK-NEXT:    ret +entry: +  call void @llvm.riscv.vsuxei.mask.nxv8f64.nxv8i32( +    <vscale x 8 x double> %0, +    ptr %1, +    <vscale x 8 x i32> %2, +    <vscale x 8 x i1> %3, +    iXLen %4) + +  ret void +} + +declare void @llvm.riscv.vsuxei.nxv1i8.nxv1i16( +  <vscale x 1 x i8>, +  ptr, +  <vscale x 1 x i16>, +  iXLen); + +define void @intrinsic_vsuxei_v_nxv1i8_nxv1i8_nxv1i16(<vscale x 1 x i8> %0, ptr %1, <vscale x 1 x i16> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv1i8_nxv1i8_nxv1i16: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e8, mf8, ta, ma +; CHECK-NEXT:    vsuxei16.v v8, (a0), v9 +; CHECK-NEXT:    ret +entry: +  call void @llvm.riscv.vsuxei.nxv1i8.nxv1i16( +    <vscale x 1 x i8> %0, +    ptr %1, +    <vscale x 1 x i16> %2, +    iXLen %3) + +  ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv1i8.nxv1i16( +  <vscale x 1 x i8>, +  ptr, +  <vscale x 1 x i16>, +  <vscale x 1 x i1>, +  iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv1i8_nxv1i8_nxv1i16(<vscale x 1 x i8> %0, ptr %1, <vscale x 1 x i16> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv1i8_nxv1i8_nxv1i16: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e8, mf8, ta, ma +; CHECK-NEXT:    vsuxei16.v v8, (a0), v9, v0.t +; CHECK-NEXT:    ret +entry: +  call void @llvm.riscv.vsuxei.mask.nxv1i8.nxv1i16( +    <vscale x 1 x i8> %0, +    ptr %1, +    <vscale x 1 x i16> %2, +    <vscale x 1 x i1> %3, +    iXLen %4) + +  ret void +} + +declare void @llvm.riscv.vsuxei.nxv2i8.nxv2i16( +  <vscale x 2 x i8>, +  ptr, +  <vscale x 2 x i16>, +  iXLen); + +define void @intrinsic_vsuxei_v_nxv2i8_nxv2i8_nxv2i16(<vscale x 2 x i8> %0, ptr %1, <vscale x 2 x i16> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv2i8_nxv2i8_nxv2i16: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e8, mf4, ta, ma +; CHECK-NEXT:    vsuxei16.v v8, (a0), v9 +; CHECK-NEXT:    
ret +entry: +  call void @llvm.riscv.vsuxei.nxv2i8.nxv2i16( +    <vscale x 2 x i8> %0, +    ptr %1, +    <vscale x 2 x i16> %2, +    iXLen %3) + +  ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv2i8.nxv2i16( +  <vscale x 2 x i8>, +  ptr, +  <vscale x 2 x i16>, +  <vscale x 2 x i1>, +  iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv2i8_nxv2i8_nxv2i16(<vscale x 2 x i8> %0, ptr %1, <vscale x 2 x i16> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv2i8_nxv2i8_nxv2i16: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e8, mf4, ta, ma +; CHECK-NEXT:    vsuxei16.v v8, (a0), v9, v0.t +; CHECK-NEXT:    ret +entry: +  call void @llvm.riscv.vsuxei.mask.nxv2i8.nxv2i16( +    <vscale x 2 x i8> %0, +    ptr %1, +    <vscale x 2 x i16> %2, +    <vscale x 2 x i1> %3, +    iXLen %4) + +  ret void +} + +declare void @llvm.riscv.vsuxei.nxv4i8.nxv4i16( +  <vscale x 4 x i8>, +  ptr, +  <vscale x 4 x i16>, +  iXLen); + +define void @intrinsic_vsuxei_v_nxv4i8_nxv4i8_nxv4i16(<vscale x 4 x i8> %0, ptr %1, <vscale x 4 x i16> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv4i8_nxv4i8_nxv4i16: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e8, mf2, ta, ma +; CHECK-NEXT:    vsuxei16.v v8, (a0), v9 +; CHECK-NEXT:    ret +entry: +  call void @llvm.riscv.vsuxei.nxv4i8.nxv4i16( +    <vscale x 4 x i8> %0, +    ptr %1, +    <vscale x 4 x i16> %2, +    iXLen %3) + +  ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv4i8.nxv4i16( +  <vscale x 4 x i8>, +  ptr, +  <vscale x 4 x i16>, +  <vscale x 4 x i1>, +  iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv4i8_nxv4i8_nxv4i16(<vscale x 4 x i8> %0, ptr %1, <vscale x 4 x i16> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv4i8_nxv4i8_nxv4i16: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e8, mf2, ta, ma +; CHECK-NEXT:    vsuxei16.v v8, (a0), v9, v0.t +; CHECK-NEXT:    ret +entry: +  call void @llvm.riscv.vsuxei.mask.nxv4i8.nxv4i16( +    <vscale x 4 x i8> %0, +    ptr %1, +    <vscale x 4 x i16> %2, +    <vscale x 4 x i1> %3, +    iXLen %4) + +  ret void +} + +declare void @llvm.riscv.vsuxei.nxv8i8.nxv8i16( +  <vscale x 8 x i8>, +  ptr, +  <vscale x 8 x i16>, +  iXLen); + +define void @intrinsic_vsuxei_v_nxv8i8_nxv8i8_nxv8i16(<vscale x 8 x i8> %0, ptr %1, <vscale x 8 x i16> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv8i8_nxv8i8_nxv8i16: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e8, m1, ta, ma +; CHECK-NEXT:    vsuxei16.v v8, (a0), v10 +; CHECK-NEXT:    ret +entry: +  call void @llvm.riscv.vsuxei.nxv8i8.nxv8i16( +    <vscale x 8 x i8> %0, +    ptr %1, +    <vscale x 8 x i16> %2, +    iXLen %3) + +  ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv8i8.nxv8i16( +  <vscale x 8 x i8>, +  ptr, +  <vscale x 8 x i16>, +  <vscale x 8 x i1>, +  iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv8i8_nxv8i8_nxv8i16(<vscale x 8 x i8> %0, ptr %1, <vscale x 8 x i16> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv8i8_nxv8i8_nxv8i16: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e8, m1, ta, ma +; CHECK-NEXT:    vsuxei16.v v8, (a0), v10, v0.t +; CHECK-NEXT:    ret +entry: +  call void @llvm.riscv.vsuxei.mask.nxv8i8.nxv8i16( +    <vscale x 8 x i8> %0, +    ptr %1, +    <vscale x 8 x i16> %2, +    <vscale x 8 x i1> %3, +    iXLen %4) + +  ret void +} + +declare void @llvm.riscv.vsuxei.nxv16i8.nxv16i16( 
+  <vscale x 16 x i8>, +  ptr, +  <vscale x 16 x i16>, +  iXLen); + +define void @intrinsic_vsuxei_v_nxv16i8_nxv16i8_nxv16i16(<vscale x 16 x i8> %0, ptr %1, <vscale x 16 x i16> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv16i8_nxv16i8_nxv16i16: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e8, m2, ta, ma +; CHECK-NEXT:    vsuxei16.v v8, (a0), v12 +; CHECK-NEXT:    ret +entry: +  call void @llvm.riscv.vsuxei.nxv16i8.nxv16i16( +    <vscale x 16 x i8> %0, +    ptr %1, +    <vscale x 16 x i16> %2, +    iXLen %3) + +  ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv16i8.nxv16i16( +  <vscale x 16 x i8>, +  ptr, +  <vscale x 16 x i16>, +  <vscale x 16 x i1>, +  iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv16i8_nxv16i8_nxv16i16(<vscale x 16 x i8> %0, ptr %1, <vscale x 16 x i16> %2, <vscale x 16 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv16i8_nxv16i8_nxv16i16: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e8, m2, ta, ma +; CHECK-NEXT:    vsuxei16.v v8, (a0), v12, v0.t +; CHECK-NEXT:    ret +entry: +  call void @llvm.riscv.vsuxei.mask.nxv16i8.nxv16i16( +    <vscale x 16 x i8> %0, +    ptr %1, +    <vscale x 16 x i16> %2, +    <vscale x 16 x i1> %3, +    iXLen %4) + +  ret void +} + +declare void @llvm.riscv.vsuxei.nxv32i8.nxv32i16( +  <vscale x 32 x i8>, +  ptr, +  <vscale x 32 x i16>, +  iXLen); + +define void @intrinsic_vsuxei_v_nxv32i8_nxv32i8_nxv32i16(<vscale x 32 x i8> %0, ptr %1, <vscale x 32 x i16> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv32i8_nxv32i8_nxv32i16: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e8, m4, ta, ma +; CHECK-NEXT:    vsuxei16.v v8, (a0), v16 +; CHECK-NEXT:    ret +entry: +  call void @llvm.riscv.vsuxei.nxv32i8.nxv32i16( +    <vscale x 32 x i8> %0, +    ptr %1, +    <vscale x 32 x i16> %2, +    iXLen %3) + +  ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv32i8.nxv32i16( +  <vscale x 32 x i8>, +  ptr, +  <vscale x 32 x i16>, +  <vscale x 32 x i1>, +  iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv32i8_nxv32i8_nxv32i16(<vscale x 32 x i8> %0, ptr %1, <vscale x 32 x i16> %2, <vscale x 32 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv32i8_nxv32i8_nxv32i16: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e8, m4, ta, ma +; CHECK-NEXT:    vsuxei16.v v8, (a0), v16, v0.t +; CHECK-NEXT:    ret +entry: +  call void @llvm.riscv.vsuxei.mask.nxv32i8.nxv32i16( +    <vscale x 32 x i8> %0, +    ptr %1, +    <vscale x 32 x i16> %2, +    <vscale x 32 x i1> %3, +    iXLen %4) + +  ret void +} + +declare void @llvm.riscv.vsuxei.nxv1i16.nxv1i16( +  <vscale x 1 x i16>, +  ptr, +  <vscale x 1 x i16>, +  iXLen); + +define void @intrinsic_vsuxei_v_nxv1i16_nxv1i16_nxv1i16(<vscale x 1 x i16> %0, ptr %1, <vscale x 1 x i16> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv1i16_nxv1i16_nxv1i16: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e16, mf4, ta, ma +; CHECK-NEXT:    vsuxei16.v v8, (a0), v9 +; CHECK-NEXT:    ret +entry: +  call void @llvm.riscv.vsuxei.nxv1i16.nxv1i16( +    <vscale x 1 x i16> %0, +    ptr %1, +    <vscale x 1 x i16> %2, +    iXLen %3) + +  ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv1i16.nxv1i16( +  <vscale x 1 x i16>, +  ptr, +  <vscale x 1 x i16>, +  <vscale x 1 x i1>, +  iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv1i16_nxv1i16_nxv1i16(<vscale x 1 x i16> %0, ptr %1, <vscale x 1 x i16> %2, <vscale x 1 x i1> %3, 
iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv1i16_nxv1i16_nxv1i16: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e16, mf4, ta, ma +; CHECK-NEXT:    vsuxei16.v v8, (a0), v9, v0.t +; CHECK-NEXT:    ret +entry: +  call void @llvm.riscv.vsuxei.mask.nxv1i16.nxv1i16( +    <vscale x 1 x i16> %0, +    ptr %1, +    <vscale x 1 x i16> %2, +    <vscale x 1 x i1> %3, +    iXLen %4) + +  ret void +} + +declare void @llvm.riscv.vsuxei.nxv2i16.nxv2i16( +  <vscale x 2 x i16>, +  ptr, +  <vscale x 2 x i16>, +  iXLen); + +define void @intrinsic_vsuxei_v_nxv2i16_nxv2i16_nxv2i16(<vscale x 2 x i16> %0, ptr %1, <vscale x 2 x i16> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv2i16_nxv2i16_nxv2i16: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e16, mf2, ta, ma +; CHECK-NEXT:    vsuxei16.v v8, (a0), v9 +; CHECK-NEXT:    ret +entry: +  call void @llvm.riscv.vsuxei.nxv2i16.nxv2i16( +    <vscale x 2 x i16> %0, +    ptr %1, +    <vscale x 2 x i16> %2, +    iXLen %3) + +  ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv2i16.nxv2i16( +  <vscale x 2 x i16>, +  ptr, +  <vscale x 2 x i16>, +  <vscale x 2 x i1>, +  iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv2i16_nxv2i16_nxv2i16(<vscale x 2 x i16> %0, ptr %1, <vscale x 2 x i16> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv2i16_nxv2i16_nxv2i16: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e16, mf2, ta, ma +; CHECK-NEXT:    vsuxei16.v v8, (a0), v9, v0.t +; CHECK-NEXT:    ret +entry: +  call void @llvm.riscv.vsuxei.mask.nxv2i16.nxv2i16( +    <vscale x 2 x i16> %0, +    ptr %1, +    <vscale x 2 x i16> %2, +    <vscale x 2 x i1> %3, +    iXLen %4) + +  ret void +} + +declare void @llvm.riscv.vsuxei.nxv4i16.nxv4i16( +  <vscale x 4 x i16>, +  ptr, +  <vscale x 4 x i16>, +  iXLen); + +define void @intrinsic_vsuxei_v_nxv4i16_nxv4i16_nxv4i16(<vscale x 4 x i16> %0, ptr %1, <vscale x 4 x i16> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv4i16_nxv4i16_nxv4i16: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e16, m1, ta, ma +; CHECK-NEXT:    vsuxei16.v v8, (a0), v9 +; CHECK-NEXT:    ret +entry: +  call void @llvm.riscv.vsuxei.nxv4i16.nxv4i16( +    <vscale x 4 x i16> %0, +    ptr %1, +    <vscale x 4 x i16> %2, +    iXLen %3) + +  ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv4i16.nxv4i16( +  <vscale x 4 x i16>, +  ptr, +  <vscale x 4 x i16>, +  <vscale x 4 x i1>, +  iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv4i16_nxv4i16_nxv4i16(<vscale x 4 x i16> %0, ptr %1, <vscale x 4 x i16> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv4i16_nxv4i16_nxv4i16: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e16, m1, ta, ma +; CHECK-NEXT:    vsuxei16.v v8, (a0), v9, v0.t +; CHECK-NEXT:    ret +entry: +  call void @llvm.riscv.vsuxei.mask.nxv4i16.nxv4i16( +    <vscale x 4 x i16> %0, +    ptr %1, +    <vscale x 4 x i16> %2, +    <vscale x 4 x i1> %3, +    iXLen %4) + +  ret void +} + +declare void @llvm.riscv.vsuxei.nxv8i16.nxv8i16( +  <vscale x 8 x i16>, +  ptr, +  <vscale x 8 x i16>, +  iXLen); + +define void @intrinsic_vsuxei_v_nxv8i16_nxv8i16_nxv8i16(<vscale x 8 x i16> %0, ptr %1, <vscale x 8 x i16> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv8i16_nxv8i16_nxv8i16: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e16, m2, ta, ma +; CHECK-NEXT:    vsuxei16.v v8, (a0), v10 +; 
CHECK-NEXT:    ret +entry: +  call void @llvm.riscv.vsuxei.nxv8i16.nxv8i16( +    <vscale x 8 x i16> %0, +    ptr %1, +    <vscale x 8 x i16> %2, +    iXLen %3) + +  ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv8i16.nxv8i16( +  <vscale x 8 x i16>, +  ptr, +  <vscale x 8 x i16>, +  <vscale x 8 x i1>, +  iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv8i16_nxv8i16_nxv8i16(<vscale x 8 x i16> %0, ptr %1, <vscale x 8 x i16> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv8i16_nxv8i16_nxv8i16: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e16, m2, ta, ma +; CHECK-NEXT:    vsuxei16.v v8, (a0), v10, v0.t +; CHECK-NEXT:    ret +entry: +  call void @llvm.riscv.vsuxei.mask.nxv8i16.nxv8i16( +    <vscale x 8 x i16> %0, +    ptr %1, +    <vscale x 8 x i16> %2, +    <vscale x 8 x i1> %3, +    iXLen %4) + +  ret void +} + +declare void @llvm.riscv.vsuxei.nxv16i16.nxv16i16( +  <vscale x 16 x i16>, +  ptr, +  <vscale x 16 x i16>, +  iXLen); + +define void @intrinsic_vsuxei_v_nxv16i16_nxv16i16_nxv16i16(<vscale x 16 x i16> %0, ptr %1, <vscale x 16 x i16> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv16i16_nxv16i16_nxv16i16: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e16, m4, ta, ma +; CHECK-NEXT:    vsuxei16.v v8, (a0), v12 +; CHECK-NEXT:    ret +entry: +  call void @llvm.riscv.vsuxei.nxv16i16.nxv16i16( +    <vscale x 16 x i16> %0, +    ptr %1, +    <vscale x 16 x i16> %2, +    iXLen %3) + +  ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv16i16.nxv16i16( +  <vscale x 16 x i16>, +  ptr, +  <vscale x 16 x i16>, +  <vscale x 16 x i1>, +  iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv16i16_nxv16i16_nxv16i16(<vscale x 16 x i16> %0, ptr %1, <vscale x 16 x i16> %2, <vscale x 16 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv16i16_nxv16i16_nxv16i16: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e16, m4, ta, ma +; CHECK-NEXT:    vsuxei16.v v8, (a0), v12, v0.t +; CHECK-NEXT:    ret +entry: +  call void @llvm.riscv.vsuxei.mask.nxv16i16.nxv16i16( +    <vscale x 16 x i16> %0, +    ptr %1, +    <vscale x 16 x i16> %2, +    <vscale x 16 x i1> %3, +    iXLen %4) + +  ret void +} + +declare void @llvm.riscv.vsuxei.nxv32i16.nxv32i16( +  <vscale x 32 x i16>, +  ptr, +  <vscale x 32 x i16>, +  iXLen); + +define void @intrinsic_vsuxei_v_nxv32i16_nxv32i16_nxv32i16(<vscale x 32 x i16> %0, ptr %1, <vscale x 32 x i16> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv32i16_nxv32i16_nxv32i16: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e16, m8, ta, ma +; CHECK-NEXT:    vsuxei16.v v8, (a0), v16 +; CHECK-NEXT:    ret +entry: +  call void @llvm.riscv.vsuxei.nxv32i16.nxv32i16( +    <vscale x 32 x i16> %0, +    ptr %1, +    <vscale x 32 x i16> %2, +    iXLen %3) + +  ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv32i16.nxv32i16( +  <vscale x 32 x i16>, +  ptr, +  <vscale x 32 x i16>, +  <vscale x 32 x i1>, +  iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv32i16_nxv32i16_nxv32i16(<vscale x 32 x i16> %0, ptr %1, <vscale x 32 x i16> %2, <vscale x 32 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv32i16_nxv32i16_nxv32i16: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e16, m8, ta, ma +; CHECK-NEXT:    vsuxei16.v v8, (a0), v16, v0.t +; CHECK-NEXT:    ret +entry: +  call void @llvm.riscv.vsuxei.mask.nxv32i16.nxv32i16( +    <vscale x 32 x i16> %0, +    ptr %1, +    
<vscale x 32 x i16> %2, +    <vscale x 32 x i1> %3, +    iXLen %4) + +  ret void +} + +declare void @llvm.riscv.vsuxei.nxv1i32.nxv1i16( +  <vscale x 1 x i32>, +  ptr, +  <vscale x 1 x i16>, +  iXLen); + +define void @intrinsic_vsuxei_v_nxv1i32_nxv1i32_nxv1i16(<vscale x 1 x i32> %0, ptr %1, <vscale x 1 x i16> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv1i32_nxv1i32_nxv1i16: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e32, mf2, ta, ma +; CHECK-NEXT:    vsuxei16.v v8, (a0), v9 +; CHECK-NEXT:    ret +entry: +  call void @llvm.riscv.vsuxei.nxv1i32.nxv1i16( +    <vscale x 1 x i32> %0, +    ptr %1, +    <vscale x 1 x i16> %2, +    iXLen %3) + +  ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv1i32.nxv1i16( +  <vscale x 1 x i32>, +  ptr, +  <vscale x 1 x i16>, +  <vscale x 1 x i1>, +  iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv1i32_nxv1i32_nxv1i16(<vscale x 1 x i32> %0, ptr %1, <vscale x 1 x i16> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv1i32_nxv1i32_nxv1i16: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e32, mf2, ta, ma +; CHECK-NEXT:    vsuxei16.v v8, (a0), v9, v0.t +; CHECK-NEXT:    ret +entry: +  call void @llvm.riscv.vsuxei.mask.nxv1i32.nxv1i16( +    <vscale x 1 x i32> %0, +    ptr %1, +    <vscale x 1 x i16> %2, +    <vscale x 1 x i1> %3, +    iXLen %4) + +  ret void +} + +declare void @llvm.riscv.vsuxei.nxv2i32.nxv2i16( +  <vscale x 2 x i32>, +  ptr, +  <vscale x 2 x i16>, +  iXLen); + +define void @intrinsic_vsuxei_v_nxv2i32_nxv2i32_nxv2i16(<vscale x 2 x i32> %0, ptr %1, <vscale x 2 x i16> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv2i32_nxv2i32_nxv2i16: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e32, m1, ta, ma +; CHECK-NEXT:    vsuxei16.v v8, (a0), v9 +; CHECK-NEXT:    ret +entry: +  call void @llvm.riscv.vsuxei.nxv2i32.nxv2i16( +    <vscale x 2 x i32> %0, +    ptr %1, +    <vscale x 2 x i16> %2, +    iXLen %3) + +  ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv2i32.nxv2i16( +  <vscale x 2 x i32>, +  ptr, +  <vscale x 2 x i16>, +  <vscale x 2 x i1>, +  iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv2i32_nxv2i32_nxv2i16(<vscale x 2 x i32> %0, ptr %1, <vscale x 2 x i16> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv2i32_nxv2i32_nxv2i16: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e32, m1, ta, ma +; CHECK-NEXT:    vsuxei16.v v8, (a0), v9, v0.t +; CHECK-NEXT:    ret +entry: +  call void @llvm.riscv.vsuxei.mask.nxv2i32.nxv2i16( +    <vscale x 2 x i32> %0, +    ptr %1, +    <vscale x 2 x i16> %2, +    <vscale x 2 x i1> %3, +    iXLen %4) + +  ret void +} + +declare void @llvm.riscv.vsuxei.nxv4i32.nxv4i16( +  <vscale x 4 x i32>, +  ptr, +  <vscale x 4 x i16>, +  iXLen); + +define void @intrinsic_vsuxei_v_nxv4i32_nxv4i32_nxv4i16(<vscale x 4 x i32> %0, ptr %1, <vscale x 4 x i16> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv4i32_nxv4i32_nxv4i16: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e32, m2, ta, ma +; CHECK-NEXT:    vsuxei16.v v8, (a0), v10 +; CHECK-NEXT:    ret +entry: +  call void @llvm.riscv.vsuxei.nxv4i32.nxv4i16( +    <vscale x 4 x i32> %0, +    ptr %1, +    <vscale x 4 x i16> %2, +    iXLen %3) + +  ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv4i32.nxv4i16( +  <vscale x 4 x i32>, +  ptr, +  <vscale x 4 x i16>, +  <vscale x 4 x i1>, +  iXLen); + +define void 
@intrinsic_vsuxei_mask_v_nxv4i32_nxv4i32_nxv4i16(<vscale x 4 x i32> %0, ptr %1, <vscale x 4 x i16> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv4i32_nxv4i32_nxv4i16: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e32, m2, ta, ma +; CHECK-NEXT:    vsuxei16.v v8, (a0), v10, v0.t +; CHECK-NEXT:    ret +entry: +  call void @llvm.riscv.vsuxei.mask.nxv4i32.nxv4i16( +    <vscale x 4 x i32> %0, +    ptr %1, +    <vscale x 4 x i16> %2, +    <vscale x 4 x i1> %3, +    iXLen %4) + +  ret void +} + +declare void @llvm.riscv.vsuxei.nxv8i32.nxv8i16( +  <vscale x 8 x i32>, +  ptr, +  <vscale x 8 x i16>, +  iXLen); + +define void @intrinsic_vsuxei_v_nxv8i32_nxv8i32_nxv8i16(<vscale x 8 x i32> %0, ptr %1, <vscale x 8 x i16> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv8i32_nxv8i32_nxv8i16: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e32, m4, ta, ma +; CHECK-NEXT:    vsuxei16.v v8, (a0), v12 +; CHECK-NEXT:    ret +entry: +  call void @llvm.riscv.vsuxei.nxv8i32.nxv8i16( +    <vscale x 8 x i32> %0, +    ptr %1, +    <vscale x 8 x i16> %2, +    iXLen %3) + +  ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv8i32.nxv8i16( +  <vscale x 8 x i32>, +  ptr, +  <vscale x 8 x i16>, +  <vscale x 8 x i1>, +  iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv8i32_nxv8i32_nxv8i16(<vscale x 8 x i32> %0, ptr %1, <vscale x 8 x i16> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv8i32_nxv8i32_nxv8i16: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e32, m4, ta, ma +; CHECK-NEXT:    vsuxei16.v v8, (a0), v12, v0.t +; CHECK-NEXT:    ret +entry: +  call void @llvm.riscv.vsuxei.mask.nxv8i32.nxv8i16( +    <vscale x 8 x i32> %0, +    ptr %1, +    <vscale x 8 x i16> %2, +    <vscale x 8 x i1> %3, +    iXLen %4) + +  ret void +} + +declare void @llvm.riscv.vsuxei.nxv16i32.nxv16i16( +  <vscale x 16 x i32>, +  ptr, +  <vscale x 16 x i16>, +  iXLen); + +define void @intrinsic_vsuxei_v_nxv16i32_nxv16i32_nxv16i16(<vscale x 16 x i32> %0, ptr %1, <vscale x 16 x i16> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv16i32_nxv16i32_nxv16i16: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e32, m8, ta, ma +; CHECK-NEXT:    vsuxei16.v v8, (a0), v16 +; CHECK-NEXT:    ret +entry: +  call void @llvm.riscv.vsuxei.nxv16i32.nxv16i16( +    <vscale x 16 x i32> %0, +    ptr %1, +    <vscale x 16 x i16> %2, +    iXLen %3) + +  ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv16i32.nxv16i16( +  <vscale x 16 x i32>, +  ptr, +  <vscale x 16 x i16>, +  <vscale x 16 x i1>, +  iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv16i32_nxv16i32_nxv16i16(<vscale x 16 x i32> %0, ptr %1, <vscale x 16 x i16> %2, <vscale x 16 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv16i32_nxv16i32_nxv16i16: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e32, m8, ta, ma +; CHECK-NEXT:    vsuxei16.v v8, (a0), v16, v0.t +; CHECK-NEXT:    ret +entry: +  call void @llvm.riscv.vsuxei.mask.nxv16i32.nxv16i16( +    <vscale x 16 x i32> %0, +    ptr %1, +    <vscale x 16 x i16> %2, +    <vscale x 16 x i1> %3, +    iXLen %4) + +  ret void +} + +declare void @llvm.riscv.vsuxei.nxv1i64.nxv1i16( +  <vscale x 1 x i64>, +  ptr, +  <vscale x 1 x i16>, +  iXLen); + +define void @intrinsic_vsuxei_v_nxv1i64_nxv1i64_nxv1i16(<vscale x 1 x i64> %0, ptr %1, <vscale x 1 x i16> %2, iXLen %3) nounwind { +; CHECK-LABEL: 
intrinsic_vsuxei_v_nxv1i64_nxv1i64_nxv1i16: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e64, m1, ta, ma +; CHECK-NEXT:    vsuxei16.v v8, (a0), v9 +; CHECK-NEXT:    ret +entry: +  call void @llvm.riscv.vsuxei.nxv1i64.nxv1i16( +    <vscale x 1 x i64> %0, +    ptr %1, +    <vscale x 1 x i16> %2, +    iXLen %3) + +  ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv1i64.nxv1i16( +  <vscale x 1 x i64>, +  ptr, +  <vscale x 1 x i16>, +  <vscale x 1 x i1>, +  iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv1i64_nxv1i64_nxv1i16(<vscale x 1 x i64> %0, ptr %1, <vscale x 1 x i16> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv1i64_nxv1i64_nxv1i16: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e64, m1, ta, ma +; CHECK-NEXT:    vsuxei16.v v8, (a0), v9, v0.t +; CHECK-NEXT:    ret +entry: +  call void @llvm.riscv.vsuxei.mask.nxv1i64.nxv1i16( +    <vscale x 1 x i64> %0, +    ptr %1, +    <vscale x 1 x i16> %2, +    <vscale x 1 x i1> %3, +    iXLen %4) + +  ret void +} + +declare void @llvm.riscv.vsuxei.nxv2i64.nxv2i16( +  <vscale x 2 x i64>, +  ptr, +  <vscale x 2 x i16>, +  iXLen); + +define void @intrinsic_vsuxei_v_nxv2i64_nxv2i64_nxv2i16(<vscale x 2 x i64> %0, ptr %1, <vscale x 2 x i16> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv2i64_nxv2i64_nxv2i16: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e64, m2, ta, ma +; CHECK-NEXT:    vsuxei16.v v8, (a0), v10 +; CHECK-NEXT:    ret +entry: +  call void @llvm.riscv.vsuxei.nxv2i64.nxv2i16( +    <vscale x 2 x i64> %0, +    ptr %1, +    <vscale x 2 x i16> %2, +    iXLen %3) + +  ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv2i64.nxv2i16( +  <vscale x 2 x i64>, +  ptr, +  <vscale x 2 x i16>, +  <vscale x 2 x i1>, +  iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv2i64_nxv2i64_nxv2i16(<vscale x 2 x i64> %0, ptr %1, <vscale x 2 x i16> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv2i64_nxv2i64_nxv2i16: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e64, m2, ta, ma +; CHECK-NEXT:    vsuxei16.v v8, (a0), v10, v0.t +; CHECK-NEXT:    ret +entry: +  call void @llvm.riscv.vsuxei.mask.nxv2i64.nxv2i16( +    <vscale x 2 x i64> %0, +    ptr %1, +    <vscale x 2 x i16> %2, +    <vscale x 2 x i1> %3, +    iXLen %4) + +  ret void +} + +declare void @llvm.riscv.vsuxei.nxv4i64.nxv4i16( +  <vscale x 4 x i64>, +  ptr, +  <vscale x 4 x i16>, +  iXLen); + +define void @intrinsic_vsuxei_v_nxv4i64_nxv4i64_nxv4i16(<vscale x 4 x i64> %0, ptr %1, <vscale x 4 x i16> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv4i64_nxv4i64_nxv4i16: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e64, m4, ta, ma +; CHECK-NEXT:    vsuxei16.v v8, (a0), v12 +; CHECK-NEXT:    ret +entry: +  call void @llvm.riscv.vsuxei.nxv4i64.nxv4i16( +    <vscale x 4 x i64> %0, +    ptr %1, +    <vscale x 4 x i16> %2, +    iXLen %3) + +  ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv4i64.nxv4i16( +  <vscale x 4 x i64>, +  ptr, +  <vscale x 4 x i16>, +  <vscale x 4 x i1>, +  iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv4i64_nxv4i64_nxv4i16(<vscale x 4 x i64> %0, ptr %1, <vscale x 4 x i16> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv4i64_nxv4i64_nxv4i16: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e64, m4, ta, ma +; CHECK-NEXT:    vsuxei16.v v8, (a0), v12, v0.t +; CHECK-NEXT:    ret 
+entry: +  call void @llvm.riscv.vsuxei.mask.nxv4i64.nxv4i16( +    <vscale x 4 x i64> %0, +    ptr %1, +    <vscale x 4 x i16> %2, +    <vscale x 4 x i1> %3, +    iXLen %4) + +  ret void +} + +declare void @llvm.riscv.vsuxei.nxv8i64.nxv8i16( +  <vscale x 8 x i64>, +  ptr, +  <vscale x 8 x i16>, +  iXLen); + +define void @intrinsic_vsuxei_v_nxv8i64_nxv8i64_nxv8i16(<vscale x 8 x i64> %0, ptr %1, <vscale x 8 x i16> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv8i64_nxv8i64_nxv8i16: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e64, m8, ta, ma +; CHECK-NEXT:    vsuxei16.v v8, (a0), v16 +; CHECK-NEXT:    ret +entry: +  call void @llvm.riscv.vsuxei.nxv8i64.nxv8i16( +    <vscale x 8 x i64> %0, +    ptr %1, +    <vscale x 8 x i16> %2, +    iXLen %3) + +  ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv8i64.nxv8i16( +  <vscale x 8 x i64>, +  ptr, +  <vscale x 8 x i16>, +  <vscale x 8 x i1>, +  iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv8i64_nxv8i64_nxv8i16(<vscale x 8 x i64> %0, ptr %1, <vscale x 8 x i16> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv8i64_nxv8i64_nxv8i16: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e64, m8, ta, ma +; CHECK-NEXT:    vsuxei16.v v8, (a0), v16, v0.t +; CHECK-NEXT:    ret +entry: +  call void @llvm.riscv.vsuxei.mask.nxv8i64.nxv8i16( +    <vscale x 8 x i64> %0, +    ptr %1, +    <vscale x 8 x i16> %2, +    <vscale x 8 x i1> %3, +    iXLen %4) + +  ret void +} + +declare void @llvm.riscv.vsuxei.nxv1f16.nxv1i16( +  <vscale x 1 x half>, +  ptr, +  <vscale x 1 x i16>, +  iXLen); + +define void @intrinsic_vsuxei_v_nxv1f16_nxv1f16_nxv1i16(<vscale x 1 x half> %0, ptr %1, <vscale x 1 x i16> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv1f16_nxv1f16_nxv1i16: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e16, mf4, ta, ma +; CHECK-NEXT:    vsuxei16.v v8, (a0), v9 +; CHECK-NEXT:    ret +entry: +  call void @llvm.riscv.vsuxei.nxv1f16.nxv1i16( +    <vscale x 1 x half> %0, +    ptr %1, +    <vscale x 1 x i16> %2, +    iXLen %3) + +  ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv1f16.nxv1i16( +  <vscale x 1 x half>, +  ptr, +  <vscale x 1 x i16>, +  <vscale x 1 x i1>, +  iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv1f16_nxv1f16_nxv1i16(<vscale x 1 x half> %0, ptr %1, <vscale x 1 x i16> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv1f16_nxv1f16_nxv1i16: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e16, mf4, ta, ma +; CHECK-NEXT:    vsuxei16.v v8, (a0), v9, v0.t +; CHECK-NEXT:    ret +entry: +  call void @llvm.riscv.vsuxei.mask.nxv1f16.nxv1i16( +    <vscale x 1 x half> %0, +    ptr %1, +    <vscale x 1 x i16> %2, +    <vscale x 1 x i1> %3, +    iXLen %4) + +  ret void +} + +declare void @llvm.riscv.vsuxei.nxv2f16.nxv2i16( +  <vscale x 2 x half>, +  ptr, +  <vscale x 2 x i16>, +  iXLen); + +define void @intrinsic_vsuxei_v_nxv2f16_nxv2f16_nxv2i16(<vscale x 2 x half> %0, ptr %1, <vscale x 2 x i16> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv2f16_nxv2f16_nxv2i16: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e16, mf2, ta, ma +; CHECK-NEXT:    vsuxei16.v v8, (a0), v9 +; CHECK-NEXT:    ret +entry: +  call void @llvm.riscv.vsuxei.nxv2f16.nxv2i16( +    <vscale x 2 x half> %0, +    ptr %1, +    <vscale x 2 x i16> %2, +    iXLen %3) + +  ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv2f16.nxv2i16( +  
<vscale x 2 x half>, +  ptr, +  <vscale x 2 x i16>, +  <vscale x 2 x i1>, +  iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv2f16_nxv2f16_nxv2i16(<vscale x 2 x half> %0, ptr %1, <vscale x 2 x i16> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv2f16_nxv2f16_nxv2i16: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e16, mf2, ta, ma +; CHECK-NEXT:    vsuxei16.v v8, (a0), v9, v0.t +; CHECK-NEXT:    ret +entry: +  call void @llvm.riscv.vsuxei.mask.nxv2f16.nxv2i16( +    <vscale x 2 x half> %0, +    ptr %1, +    <vscale x 2 x i16> %2, +    <vscale x 2 x i1> %3, +    iXLen %4) + +  ret void +} + +declare void @llvm.riscv.vsuxei.nxv4f16.nxv4i16( +  <vscale x 4 x half>, +  ptr, +  <vscale x 4 x i16>, +  iXLen); + +define void @intrinsic_vsuxei_v_nxv4f16_nxv4f16_nxv4i16(<vscale x 4 x half> %0, ptr %1, <vscale x 4 x i16> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv4f16_nxv4f16_nxv4i16: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e16, m1, ta, ma +; CHECK-NEXT:    vsuxei16.v v8, (a0), v9 +; CHECK-NEXT:    ret +entry: +  call void @llvm.riscv.vsuxei.nxv4f16.nxv4i16( +    <vscale x 4 x half> %0, +    ptr %1, +    <vscale x 4 x i16> %2, +    iXLen %3) + +  ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv4f16.nxv4i16( +  <vscale x 4 x half>, +  ptr, +  <vscale x 4 x i16>, +  <vscale x 4 x i1>, +  iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv4f16_nxv4f16_nxv4i16(<vscale x 4 x half> %0, ptr %1, <vscale x 4 x i16> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv4f16_nxv4f16_nxv4i16: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e16, m1, ta, ma +; CHECK-NEXT:    vsuxei16.v v8, (a0), v9, v0.t +; CHECK-NEXT:    ret +entry: +  call void @llvm.riscv.vsuxei.mask.nxv4f16.nxv4i16( +    <vscale x 4 x half> %0, +    ptr %1, +    <vscale x 4 x i16> %2, +    <vscale x 4 x i1> %3, +    iXLen %4) + +  ret void +} + +declare void @llvm.riscv.vsuxei.nxv8f16.nxv8i16( +  <vscale x 8 x half>, +  ptr, +  <vscale x 8 x i16>, +  iXLen); + +define void @intrinsic_vsuxei_v_nxv8f16_nxv8f16_nxv8i16(<vscale x 8 x half> %0, ptr %1, <vscale x 8 x i16> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv8f16_nxv8f16_nxv8i16: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e16, m2, ta, ma +; CHECK-NEXT:    vsuxei16.v v8, (a0), v10 +; CHECK-NEXT:    ret +entry: +  call void @llvm.riscv.vsuxei.nxv8f16.nxv8i16( +    <vscale x 8 x half> %0, +    ptr %1, +    <vscale x 8 x i16> %2, +    iXLen %3) + +  ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv8f16.nxv8i16( +  <vscale x 8 x half>, +  ptr, +  <vscale x 8 x i16>, +  <vscale x 8 x i1>, +  iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv8f16_nxv8f16_nxv8i16(<vscale x 8 x half> %0, ptr %1, <vscale x 8 x i16> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv8f16_nxv8f16_nxv8i16: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e16, m2, ta, ma +; CHECK-NEXT:    vsuxei16.v v8, (a0), v10, v0.t +; CHECK-NEXT:    ret +entry: +  call void @llvm.riscv.vsuxei.mask.nxv8f16.nxv8i16( +    <vscale x 8 x half> %0, +    ptr %1, +    <vscale x 8 x i16> %2, +    <vscale x 8 x i1> %3, +    iXLen %4) + +  ret void +} + +declare void @llvm.riscv.vsuxei.nxv16f16.nxv16i16( +  <vscale x 16 x half>, +  ptr, +  <vscale x 16 x i16>, +  iXLen); + +define void @intrinsic_vsuxei_v_nxv16f16_nxv16f16_nxv16i16(<vscale x 16 x half> %0, 
ptr %1, <vscale x 16 x i16> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv16f16_nxv16f16_nxv16i16: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e16, m4, ta, ma +; CHECK-NEXT:    vsuxei16.v v8, (a0), v12 +; CHECK-NEXT:    ret +entry: +  call void @llvm.riscv.vsuxei.nxv16f16.nxv16i16( +    <vscale x 16 x half> %0, +    ptr %1, +    <vscale x 16 x i16> %2, +    iXLen %3) + +  ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv16f16.nxv16i16( +  <vscale x 16 x half>, +  ptr, +  <vscale x 16 x i16>, +  <vscale x 16 x i1>, +  iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv16f16_nxv16f16_nxv16i16(<vscale x 16 x half> %0, ptr %1, <vscale x 16 x i16> %2, <vscale x 16 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv16f16_nxv16f16_nxv16i16: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e16, m4, ta, ma +; CHECK-NEXT:    vsuxei16.v v8, (a0), v12, v0.t +; CHECK-NEXT:    ret +entry: +  call void @llvm.riscv.vsuxei.mask.nxv16f16.nxv16i16( +    <vscale x 16 x half> %0, +    ptr %1, +    <vscale x 16 x i16> %2, +    <vscale x 16 x i1> %3, +    iXLen %4) + +  ret void +} + +declare void @llvm.riscv.vsuxei.nxv32f16.nxv32i16( +  <vscale x 32 x half>, +  ptr, +  <vscale x 32 x i16>, +  iXLen); + +define void @intrinsic_vsuxei_v_nxv32f16_nxv32f16_nxv32i16(<vscale x 32 x half> %0, ptr %1, <vscale x 32 x i16> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv32f16_nxv32f16_nxv32i16: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e16, m8, ta, ma +; CHECK-NEXT:    vsuxei16.v v8, (a0), v16 +; CHECK-NEXT:    ret +entry: +  call void @llvm.riscv.vsuxei.nxv32f16.nxv32i16( +    <vscale x 32 x half> %0, +    ptr %1, +    <vscale x 32 x i16> %2, +    iXLen %3) + +  ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv32f16.nxv32i16( +  <vscale x 32 x half>, +  ptr, +  <vscale x 32 x i16>, +  <vscale x 32 x i1>, +  iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv32f16_nxv32f16_nxv32i16(<vscale x 32 x half> %0, ptr %1, <vscale x 32 x i16> %2, <vscale x 32 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv32f16_nxv32f16_nxv32i16: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e16, m8, ta, ma +; CHECK-NEXT:    vsuxei16.v v8, (a0), v16, v0.t +; CHECK-NEXT:    ret +entry: +  call void @llvm.riscv.vsuxei.mask.nxv32f16.nxv32i16( +    <vscale x 32 x half> %0, +    ptr %1, +    <vscale x 32 x i16> %2, +    <vscale x 32 x i1> %3, +    iXLen %4) + +  ret void +} + +declare void @llvm.riscv.vsuxei.nxv1f32.nxv1i16( +  <vscale x 1 x float>, +  ptr, +  <vscale x 1 x i16>, +  iXLen); + +define void @intrinsic_vsuxei_v_nxv1f32_nxv1f32_nxv1i16(<vscale x 1 x float> %0, ptr %1, <vscale x 1 x i16> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv1f32_nxv1f32_nxv1i16: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e32, mf2, ta, ma +; CHECK-NEXT:    vsuxei16.v v8, (a0), v9 +; CHECK-NEXT:    ret +entry: +  call void @llvm.riscv.vsuxei.nxv1f32.nxv1i16( +    <vscale x 1 x float> %0, +    ptr %1, +    <vscale x 1 x i16> %2, +    iXLen %3) + +  ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv1f32.nxv1i16( +  <vscale x 1 x float>, +  ptr, +  <vscale x 1 x i16>, +  <vscale x 1 x i1>, +  iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv1f32_nxv1f32_nxv1i16(<vscale x 1 x float> %0, ptr %1, <vscale x 1 x i16> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv1f32_nxv1f32_nxv1i16: +; 
CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e32, mf2, ta, ma +; CHECK-NEXT:    vsuxei16.v v8, (a0), v9, v0.t +; CHECK-NEXT:    ret +entry: +  call void @llvm.riscv.vsuxei.mask.nxv1f32.nxv1i16( +    <vscale x 1 x float> %0, +    ptr %1, +    <vscale x 1 x i16> %2, +    <vscale x 1 x i1> %3, +    iXLen %4) + +  ret void +} + +declare void @llvm.riscv.vsuxei.nxv2f32.nxv2i16( +  <vscale x 2 x float>, +  ptr, +  <vscale x 2 x i16>, +  iXLen); + +define void @intrinsic_vsuxei_v_nxv2f32_nxv2f32_nxv2i16(<vscale x 2 x float> %0, ptr %1, <vscale x 2 x i16> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv2f32_nxv2f32_nxv2i16: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e32, m1, ta, ma +; CHECK-NEXT:    vsuxei16.v v8, (a0), v9 +; CHECK-NEXT:    ret +entry: +  call void @llvm.riscv.vsuxei.nxv2f32.nxv2i16( +    <vscale x 2 x float> %0, +    ptr %1, +    <vscale x 2 x i16> %2, +    iXLen %3) + +  ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv2f32.nxv2i16( +  <vscale x 2 x float>, +  ptr, +  <vscale x 2 x i16>, +  <vscale x 2 x i1>, +  iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv2f32_nxv2f32_nxv2i16(<vscale x 2 x float> %0, ptr %1, <vscale x 2 x i16> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv2f32_nxv2f32_nxv2i16: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e32, m1, ta, ma +; CHECK-NEXT:    vsuxei16.v v8, (a0), v9, v0.t +; CHECK-NEXT:    ret +entry: +  call void @llvm.riscv.vsuxei.mask.nxv2f32.nxv2i16( +    <vscale x 2 x float> %0, +    ptr %1, +    <vscale x 2 x i16> %2, +    <vscale x 2 x i1> %3, +    iXLen %4) + +  ret void +} + +declare void @llvm.riscv.vsuxei.nxv4f32.nxv4i16( +  <vscale x 4 x float>, +  ptr, +  <vscale x 4 x i16>, +  iXLen); + +define void @intrinsic_vsuxei_v_nxv4f32_nxv4f32_nxv4i16(<vscale x 4 x float> %0, ptr %1, <vscale x 4 x i16> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv4f32_nxv4f32_nxv4i16: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e32, m2, ta, ma +; CHECK-NEXT:    vsuxei16.v v8, (a0), v10 +; CHECK-NEXT:    ret +entry: +  call void @llvm.riscv.vsuxei.nxv4f32.nxv4i16( +    <vscale x 4 x float> %0, +    ptr %1, +    <vscale x 4 x i16> %2, +    iXLen %3) + +  ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv4f32.nxv4i16( +  <vscale x 4 x float>, +  ptr, +  <vscale x 4 x i16>, +  <vscale x 4 x i1>, +  iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv4f32_nxv4f32_nxv4i16(<vscale x 4 x float> %0, ptr %1, <vscale x 4 x i16> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv4f32_nxv4f32_nxv4i16: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e32, m2, ta, ma +; CHECK-NEXT:    vsuxei16.v v8, (a0), v10, v0.t +; CHECK-NEXT:    ret +entry: +  call void @llvm.riscv.vsuxei.mask.nxv4f32.nxv4i16( +    <vscale x 4 x float> %0, +    ptr %1, +    <vscale x 4 x i16> %2, +    <vscale x 4 x i1> %3, +    iXLen %4) + +  ret void +} + +declare void @llvm.riscv.vsuxei.nxv8f32.nxv8i16( +  <vscale x 8 x float>, +  ptr, +  <vscale x 8 x i16>, +  iXLen); + +define void @intrinsic_vsuxei_v_nxv8f32_nxv8f32_nxv8i16(<vscale x 8 x float> %0, ptr %1, <vscale x 8 x i16> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv8f32_nxv8f32_nxv8i16: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e32, m4, ta, ma +; CHECK-NEXT:    vsuxei16.v v8, (a0), v12 +; CHECK-NEXT:    ret +entry: +  call void 
@llvm.riscv.vsuxei.nxv8f32.nxv8i16( +    <vscale x 8 x float> %0, +    ptr %1, +    <vscale x 8 x i16> %2, +    iXLen %3) + +  ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv8f32.nxv8i16( +  <vscale x 8 x float>, +  ptr, +  <vscale x 8 x i16>, +  <vscale x 8 x i1>, +  iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv8f32_nxv8f32_nxv8i16(<vscale x 8 x float> %0, ptr %1, <vscale x 8 x i16> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv8f32_nxv8f32_nxv8i16: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e32, m4, ta, ma +; CHECK-NEXT:    vsuxei16.v v8, (a0), v12, v0.t +; CHECK-NEXT:    ret +entry: +  call void @llvm.riscv.vsuxei.mask.nxv8f32.nxv8i16( +    <vscale x 8 x float> %0, +    ptr %1, +    <vscale x 8 x i16> %2, +    <vscale x 8 x i1> %3, +    iXLen %4) + +  ret void +} + +declare void @llvm.riscv.vsuxei.nxv16f32.nxv16i16( +  <vscale x 16 x float>, +  ptr, +  <vscale x 16 x i16>, +  iXLen); + +define void @intrinsic_vsuxei_v_nxv16f32_nxv16f32_nxv16i16(<vscale x 16 x float> %0, ptr %1, <vscale x 16 x i16> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv16f32_nxv16f32_nxv16i16: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e32, m8, ta, ma +; CHECK-NEXT:    vsuxei16.v v8, (a0), v16 +; CHECK-NEXT:    ret +entry: +  call void @llvm.riscv.vsuxei.nxv16f32.nxv16i16( +    <vscale x 16 x float> %0, +    ptr %1, +    <vscale x 16 x i16> %2, +    iXLen %3) + +  ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv16f32.nxv16i16( +  <vscale x 16 x float>, +  ptr, +  <vscale x 16 x i16>, +  <vscale x 16 x i1>, +  iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv16f32_nxv16f32_nxv16i16(<vscale x 16 x float> %0, ptr %1, <vscale x 16 x i16> %2, <vscale x 16 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv16f32_nxv16f32_nxv16i16: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e32, m8, ta, ma +; CHECK-NEXT:    vsuxei16.v v8, (a0), v16, v0.t +; CHECK-NEXT:    ret +entry: +  call void @llvm.riscv.vsuxei.mask.nxv16f32.nxv16i16( +    <vscale x 16 x float> %0, +    ptr %1, +    <vscale x 16 x i16> %2, +    <vscale x 16 x i1> %3, +    iXLen %4) + +  ret void +} + +declare void @llvm.riscv.vsuxei.nxv1f64.nxv1i16( +  <vscale x 1 x double>, +  ptr, +  <vscale x 1 x i16>, +  iXLen); + +define void @intrinsic_vsuxei_v_nxv1f64_nxv1f64_nxv1i16(<vscale x 1 x double> %0, ptr %1, <vscale x 1 x i16> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv1f64_nxv1f64_nxv1i16: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e64, m1, ta, ma +; CHECK-NEXT:    vsuxei16.v v8, (a0), v9 +; CHECK-NEXT:    ret +entry: +  call void @llvm.riscv.vsuxei.nxv1f64.nxv1i16( +    <vscale x 1 x double> %0, +    ptr %1, +    <vscale x 1 x i16> %2, +    iXLen %3) + +  ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv1f64.nxv1i16( +  <vscale x 1 x double>, +  ptr, +  <vscale x 1 x i16>, +  <vscale x 1 x i1>, +  iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv1f64_nxv1f64_nxv1i16(<vscale x 1 x double> %0, ptr %1, <vscale x 1 x i16> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv1f64_nxv1f64_nxv1i16: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e64, m1, ta, ma +; CHECK-NEXT:    vsuxei16.v v8, (a0), v9, v0.t +; CHECK-NEXT:    ret +entry: +  call void @llvm.riscv.vsuxei.mask.nxv1f64.nxv1i16( +    <vscale x 1 x double> %0, +    ptr %1, +    <vscale x 1 x i16> %2, +    <vscale x 
1 x i1> %3, +    iXLen %4) + +  ret void +} + +declare void @llvm.riscv.vsuxei.nxv2f64.nxv2i16( +  <vscale x 2 x double>, +  ptr, +  <vscale x 2 x i16>, +  iXLen); + +define void @intrinsic_vsuxei_v_nxv2f64_nxv2f64_nxv2i16(<vscale x 2 x double> %0, ptr %1, <vscale x 2 x i16> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv2f64_nxv2f64_nxv2i16: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e64, m2, ta, ma +; CHECK-NEXT:    vsuxei16.v v8, (a0), v10 +; CHECK-NEXT:    ret +entry: +  call void @llvm.riscv.vsuxei.nxv2f64.nxv2i16( +    <vscale x 2 x double> %0, +    ptr %1, +    <vscale x 2 x i16> %2, +    iXLen %3) + +  ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv2f64.nxv2i16( +  <vscale x 2 x double>, +  ptr, +  <vscale x 2 x i16>, +  <vscale x 2 x i1>, +  iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv2f64_nxv2f64_nxv2i16(<vscale x 2 x double> %0, ptr %1, <vscale x 2 x i16> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv2f64_nxv2f64_nxv2i16: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e64, m2, ta, ma +; CHECK-NEXT:    vsuxei16.v v8, (a0), v10, v0.t +; CHECK-NEXT:    ret +entry: +  call void @llvm.riscv.vsuxei.mask.nxv2f64.nxv2i16( +    <vscale x 2 x double> %0, +    ptr %1, +    <vscale x 2 x i16> %2, +    <vscale x 2 x i1> %3, +    iXLen %4) + +  ret void +} + +declare void @llvm.riscv.vsuxei.nxv4f64.nxv4i16( +  <vscale x 4 x double>, +  ptr, +  <vscale x 4 x i16>, +  iXLen); + +define void @intrinsic_vsuxei_v_nxv4f64_nxv4f64_nxv4i16(<vscale x 4 x double> %0, ptr %1, <vscale x 4 x i16> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv4f64_nxv4f64_nxv4i16: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e64, m4, ta, ma +; CHECK-NEXT:    vsuxei16.v v8, (a0), v12 +; CHECK-NEXT:    ret +entry: +  call void @llvm.riscv.vsuxei.nxv4f64.nxv4i16( +    <vscale x 4 x double> %0, +    ptr %1, +    <vscale x 4 x i16> %2, +    iXLen %3) + +  ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv4f64.nxv4i16( +  <vscale x 4 x double>, +  ptr, +  <vscale x 4 x i16>, +  <vscale x 4 x i1>, +  iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv4f64_nxv4f64_nxv4i16(<vscale x 4 x double> %0, ptr %1, <vscale x 4 x i16> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv4f64_nxv4f64_nxv4i16: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e64, m4, ta, ma +; CHECK-NEXT:    vsuxei16.v v8, (a0), v12, v0.t +; CHECK-NEXT:    ret +entry: +  call void @llvm.riscv.vsuxei.mask.nxv4f64.nxv4i16( +    <vscale x 4 x double> %0, +    ptr %1, +    <vscale x 4 x i16> %2, +    <vscale x 4 x i1> %3, +    iXLen %4) + +  ret void +} + +declare void @llvm.riscv.vsuxei.nxv8f64.nxv8i16( +  <vscale x 8 x double>, +  ptr, +  <vscale x 8 x i16>, +  iXLen); + +define void @intrinsic_vsuxei_v_nxv8f64_nxv8f64_nxv8i16(<vscale x 8 x double> %0, ptr %1, <vscale x 8 x i16> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv8f64_nxv8f64_nxv8i16: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e64, m8, ta, ma +; CHECK-NEXT:    vsuxei16.v v8, (a0), v16 +; CHECK-NEXT:    ret +entry: +  call void @llvm.riscv.vsuxei.nxv8f64.nxv8i16( +    <vscale x 8 x double> %0, +    ptr %1, +    <vscale x 8 x i16> %2, +    iXLen %3) + +  ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv8f64.nxv8i16( +  <vscale x 8 x double>, +  ptr, +  <vscale x 8 x i16>, +  <vscale x 8 x i1>, +  iXLen); + +define void 
@intrinsic_vsuxei_mask_v_nxv8f64_nxv8f64_nxv8i16(<vscale x 8 x double> %0, ptr %1, <vscale x 8 x i16> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv8f64_nxv8f64_nxv8i16: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e64, m8, ta, ma +; CHECK-NEXT:    vsuxei16.v v8, (a0), v16, v0.t +; CHECK-NEXT:    ret +entry: +  call void @llvm.riscv.vsuxei.mask.nxv8f64.nxv8i16( +    <vscale x 8 x double> %0, +    ptr %1, +    <vscale x 8 x i16> %2, +    <vscale x 8 x i1> %3, +    iXLen %4) + +  ret void +} + +declare void @llvm.riscv.vsuxei.nxv1i8.nxv1i8( +  <vscale x 1 x i8>, +  ptr, +  <vscale x 1 x i8>, +  iXLen); + +define void @intrinsic_vsuxei_v_nxv1i8_nxv1i8_nxv1i8(<vscale x 1 x i8> %0, ptr %1, <vscale x 1 x i8> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv1i8_nxv1i8_nxv1i8: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e8, mf8, ta, ma +; CHECK-NEXT:    vsuxei8.v v8, (a0), v9 +; CHECK-NEXT:    ret +entry: +  call void @llvm.riscv.vsuxei.nxv1i8.nxv1i8( +    <vscale x 1 x i8> %0, +    ptr %1, +    <vscale x 1 x i8> %2, +    iXLen %3) + +  ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv1i8.nxv1i8( +  <vscale x 1 x i8>, +  ptr, +  <vscale x 1 x i8>, +  <vscale x 1 x i1>, +  iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv1i8_nxv1i8_nxv1i8(<vscale x 1 x i8> %0, ptr %1, <vscale x 1 x i8> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv1i8_nxv1i8_nxv1i8: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e8, mf8, ta, ma +; CHECK-NEXT:    vsuxei8.v v8, (a0), v9, v0.t +; CHECK-NEXT:    ret +entry: +  call void @llvm.riscv.vsuxei.mask.nxv1i8.nxv1i8( +    <vscale x 1 x i8> %0, +    ptr %1, +    <vscale x 1 x i8> %2, +    <vscale x 1 x i1> %3, +    iXLen %4) + +  ret void +} + +declare void @llvm.riscv.vsuxei.nxv2i8.nxv2i8( +  <vscale x 2 x i8>, +  ptr, +  <vscale x 2 x i8>, +  iXLen); + +define void @intrinsic_vsuxei_v_nxv2i8_nxv2i8_nxv2i8(<vscale x 2 x i8> %0, ptr %1, <vscale x 2 x i8> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv2i8_nxv2i8_nxv2i8: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e8, mf4, ta, ma +; CHECK-NEXT:    vsuxei8.v v8, (a0), v9 +; CHECK-NEXT:    ret +entry: +  call void @llvm.riscv.vsuxei.nxv2i8.nxv2i8( +    <vscale x 2 x i8> %0, +    ptr %1, +    <vscale x 2 x i8> %2, +    iXLen %3) + +  ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv2i8.nxv2i8( +  <vscale x 2 x i8>, +  ptr, +  <vscale x 2 x i8>, +  <vscale x 2 x i1>, +  iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv2i8_nxv2i8_nxv2i8(<vscale x 2 x i8> %0, ptr %1, <vscale x 2 x i8> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv2i8_nxv2i8_nxv2i8: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e8, mf4, ta, ma +; CHECK-NEXT:    vsuxei8.v v8, (a0), v9, v0.t +; CHECK-NEXT:    ret +entry: +  call void @llvm.riscv.vsuxei.mask.nxv2i8.nxv2i8( +    <vscale x 2 x i8> %0, +    ptr %1, +    <vscale x 2 x i8> %2, +    <vscale x 2 x i1> %3, +    iXLen %4) + +  ret void +} + +declare void @llvm.riscv.vsuxei.nxv4i8.nxv4i8( +  <vscale x 4 x i8>, +  ptr, +  <vscale x 4 x i8>, +  iXLen); + +define void @intrinsic_vsuxei_v_nxv4i8_nxv4i8_nxv4i8(<vscale x 4 x i8> %0, ptr %1, <vscale x 4 x i8> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv4i8_nxv4i8_nxv4i8: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e8, mf2, ta, ma +; 
CHECK-NEXT:    vsuxei8.v v8, (a0), v9 +; CHECK-NEXT:    ret +entry: +  call void @llvm.riscv.vsuxei.nxv4i8.nxv4i8( +    <vscale x 4 x i8> %0, +    ptr %1, +    <vscale x 4 x i8> %2, +    iXLen %3) + +  ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv4i8.nxv4i8( +  <vscale x 4 x i8>, +  ptr, +  <vscale x 4 x i8>, +  <vscale x 4 x i1>, +  iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv4i8_nxv4i8_nxv4i8(<vscale x 4 x i8> %0, ptr %1, <vscale x 4 x i8> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv4i8_nxv4i8_nxv4i8: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e8, mf2, ta, ma +; CHECK-NEXT:    vsuxei8.v v8, (a0), v9, v0.t +; CHECK-NEXT:    ret +entry: +  call void @llvm.riscv.vsuxei.mask.nxv4i8.nxv4i8( +    <vscale x 4 x i8> %0, +    ptr %1, +    <vscale x 4 x i8> %2, +    <vscale x 4 x i1> %3, +    iXLen %4) + +  ret void +} + +declare void @llvm.riscv.vsuxei.nxv8i8.nxv8i8( +  <vscale x 8 x i8>, +  ptr, +  <vscale x 8 x i8>, +  iXLen); + +define void @intrinsic_vsuxei_v_nxv8i8_nxv8i8_nxv8i8(<vscale x 8 x i8> %0, ptr %1, <vscale x 8 x i8> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv8i8_nxv8i8_nxv8i8: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e8, m1, ta, ma +; CHECK-NEXT:    vsuxei8.v v8, (a0), v9 +; CHECK-NEXT:    ret +entry: +  call void @llvm.riscv.vsuxei.nxv8i8.nxv8i8( +    <vscale x 8 x i8> %0, +    ptr %1, +    <vscale x 8 x i8> %2, +    iXLen %3) + +  ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv8i8.nxv8i8( +  <vscale x 8 x i8>, +  ptr, +  <vscale x 8 x i8>, +  <vscale x 8 x i1>, +  iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv8i8_nxv8i8_nxv8i8(<vscale x 8 x i8> %0, ptr %1, <vscale x 8 x i8> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv8i8_nxv8i8_nxv8i8: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e8, m1, ta, ma +; CHECK-NEXT:    vsuxei8.v v8, (a0), v9, v0.t +; CHECK-NEXT:    ret +entry: +  call void @llvm.riscv.vsuxei.mask.nxv8i8.nxv8i8( +    <vscale x 8 x i8> %0, +    ptr %1, +    <vscale x 8 x i8> %2, +    <vscale x 8 x i1> %3, +    iXLen %4) + +  ret void +} + +declare void @llvm.riscv.vsuxei.nxv16i8.nxv16i8( +  <vscale x 16 x i8>, +  ptr, +  <vscale x 16 x i8>, +  iXLen); + +define void @intrinsic_vsuxei_v_nxv16i8_nxv16i8_nxv16i8(<vscale x 16 x i8> %0, ptr %1, <vscale x 16 x i8> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv16i8_nxv16i8_nxv16i8: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e8, m2, ta, ma +; CHECK-NEXT:    vsuxei8.v v8, (a0), v10 +; CHECK-NEXT:    ret +entry: +  call void @llvm.riscv.vsuxei.nxv16i8.nxv16i8( +    <vscale x 16 x i8> %0, +    ptr %1, +    <vscale x 16 x i8> %2, +    iXLen %3) + +  ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv16i8.nxv16i8( +  <vscale x 16 x i8>, +  ptr, +  <vscale x 16 x i8>, +  <vscale x 16 x i1>, +  iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv16i8_nxv16i8_nxv16i8(<vscale x 16 x i8> %0, ptr %1, <vscale x 16 x i8> %2, <vscale x 16 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv16i8_nxv16i8_nxv16i8: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e8, m2, ta, ma +; CHECK-NEXT:    vsuxei8.v v8, (a0), v10, v0.t +; CHECK-NEXT:    ret +entry: +  call void @llvm.riscv.vsuxei.mask.nxv16i8.nxv16i8( +    <vscale x 16 x i8> %0, +    ptr %1, +    <vscale x 16 x i8> %2, +    <vscale x 16 x i1> %3, +    iXLen %4) + +  ret void +} + 
+declare void @llvm.riscv.vsuxei.nxv32i8.nxv32i8( +  <vscale x 32 x i8>, +  ptr, +  <vscale x 32 x i8>, +  iXLen); + +define void @intrinsic_vsuxei_v_nxv32i8_nxv32i8_nxv32i8(<vscale x 32 x i8> %0, ptr %1, <vscale x 32 x i8> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv32i8_nxv32i8_nxv32i8: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e8, m4, ta, ma +; CHECK-NEXT:    vsuxei8.v v8, (a0), v12 +; CHECK-NEXT:    ret +entry: +  call void @llvm.riscv.vsuxei.nxv32i8.nxv32i8( +    <vscale x 32 x i8> %0, +    ptr %1, +    <vscale x 32 x i8> %2, +    iXLen %3) + +  ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv32i8.nxv32i8( +  <vscale x 32 x i8>, +  ptr, +  <vscale x 32 x i8>, +  <vscale x 32 x i1>, +  iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv32i8_nxv32i8_nxv32i8(<vscale x 32 x i8> %0, ptr %1, <vscale x 32 x i8> %2, <vscale x 32 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv32i8_nxv32i8_nxv32i8: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e8, m4, ta, ma +; CHECK-NEXT:    vsuxei8.v v8, (a0), v12, v0.t +; CHECK-NEXT:    ret +entry: +  call void @llvm.riscv.vsuxei.mask.nxv32i8.nxv32i8( +    <vscale x 32 x i8> %0, +    ptr %1, +    <vscale x 32 x i8> %2, +    <vscale x 32 x i1> %3, +    iXLen %4) + +  ret void +} + +declare void @llvm.riscv.vsuxei.nxv64i8.nxv64i8( +  <vscale x 64 x i8>, +  ptr, +  <vscale x 64 x i8>, +  iXLen); + +define void @intrinsic_vsuxei_v_nxv64i8_nxv64i8_nxv64i8(<vscale x 64 x i8> %0, ptr %1, <vscale x 64 x i8> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv64i8_nxv64i8_nxv64i8: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e8, m8, ta, ma +; CHECK-NEXT:    vsuxei8.v v8, (a0), v16 +; CHECK-NEXT:    ret +entry: +  call void @llvm.riscv.vsuxei.nxv64i8.nxv64i8( +    <vscale x 64 x i8> %0, +    ptr %1, +    <vscale x 64 x i8> %2, +    iXLen %3) + +  ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv64i8.nxv64i8( +  <vscale x 64 x i8>, +  ptr, +  <vscale x 64 x i8>, +  <vscale x 64 x i1>, +  iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv64i8_nxv64i8_nxv64i8(<vscale x 64 x i8> %0, ptr %1, <vscale x 64 x i8> %2, <vscale x 64 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv64i8_nxv64i8_nxv64i8: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e8, m8, ta, ma +; CHECK-NEXT:    vsuxei8.v v8, (a0), v16, v0.t +; CHECK-NEXT:    ret +entry: +  call void @llvm.riscv.vsuxei.mask.nxv64i8.nxv64i8( +    <vscale x 64 x i8> %0, +    ptr %1, +    <vscale x 64 x i8> %2, +    <vscale x 64 x i1> %3, +    iXLen %4) + +  ret void +} + +declare void @llvm.riscv.vsuxei.nxv1i16.nxv1i8( +  <vscale x 1 x i16>, +  ptr, +  <vscale x 1 x i8>, +  iXLen); + +define void @intrinsic_vsuxei_v_nxv1i16_nxv1i16_nxv1i8(<vscale x 1 x i16> %0, ptr %1, <vscale x 1 x i8> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv1i16_nxv1i16_nxv1i8: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e16, mf4, ta, ma +; CHECK-NEXT:    vsuxei8.v v8, (a0), v9 +; CHECK-NEXT:    ret +entry: +  call void @llvm.riscv.vsuxei.nxv1i16.nxv1i8( +    <vscale x 1 x i16> %0, +    ptr %1, +    <vscale x 1 x i8> %2, +    iXLen %3) + +  ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv1i16.nxv1i8( +  <vscale x 1 x i16>, +  ptr, +  <vscale x 1 x i8>, +  <vscale x 1 x i1>, +  iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv1i16_nxv1i16_nxv1i8(<vscale x 1 x i16> %0, ptr %1, <vscale x 1 x i8> %2, <vscale x 1 x 
i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv1i16_nxv1i16_nxv1i8: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e16, mf4, ta, ma +; CHECK-NEXT:    vsuxei8.v v8, (a0), v9, v0.t +; CHECK-NEXT:    ret +entry: +  call void @llvm.riscv.vsuxei.mask.nxv1i16.nxv1i8( +    <vscale x 1 x i16> %0, +    ptr %1, +    <vscale x 1 x i8> %2, +    <vscale x 1 x i1> %3, +    iXLen %4) + +  ret void +} + +declare void @llvm.riscv.vsuxei.nxv2i16.nxv2i8( +  <vscale x 2 x i16>, +  ptr, +  <vscale x 2 x i8>, +  iXLen); + +define void @intrinsic_vsuxei_v_nxv2i16_nxv2i16_nxv2i8(<vscale x 2 x i16> %0, ptr %1, <vscale x 2 x i8> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv2i16_nxv2i16_nxv2i8: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e16, mf2, ta, ma +; CHECK-NEXT:    vsuxei8.v v8, (a0), v9 +; CHECK-NEXT:    ret +entry: +  call void @llvm.riscv.vsuxei.nxv2i16.nxv2i8( +    <vscale x 2 x i16> %0, +    ptr %1, +    <vscale x 2 x i8> %2, +    iXLen %3) + +  ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv2i16.nxv2i8( +  <vscale x 2 x i16>, +  ptr, +  <vscale x 2 x i8>, +  <vscale x 2 x i1>, +  iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv2i16_nxv2i16_nxv2i8(<vscale x 2 x i16> %0, ptr %1, <vscale x 2 x i8> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv2i16_nxv2i16_nxv2i8: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e16, mf2, ta, ma +; CHECK-NEXT:    vsuxei8.v v8, (a0), v9, v0.t +; CHECK-NEXT:    ret +entry: +  call void @llvm.riscv.vsuxei.mask.nxv2i16.nxv2i8( +    <vscale x 2 x i16> %0, +    ptr %1, +    <vscale x 2 x i8> %2, +    <vscale x 2 x i1> %3, +    iXLen %4) + +  ret void +} + +declare void @llvm.riscv.vsuxei.nxv4i16.nxv4i8( +  <vscale x 4 x i16>, +  ptr, +  <vscale x 4 x i8>, +  iXLen); + +define void @intrinsic_vsuxei_v_nxv4i16_nxv4i16_nxv4i8(<vscale x 4 x i16> %0, ptr %1, <vscale x 4 x i8> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv4i16_nxv4i16_nxv4i8: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e16, m1, ta, ma +; CHECK-NEXT:    vsuxei8.v v8, (a0), v9 +; CHECK-NEXT:    ret +entry: +  call void @llvm.riscv.vsuxei.nxv4i16.nxv4i8( +    <vscale x 4 x i16> %0, +    ptr %1, +    <vscale x 4 x i8> %2, +    iXLen %3) + +  ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv4i16.nxv4i8( +  <vscale x 4 x i16>, +  ptr, +  <vscale x 4 x i8>, +  <vscale x 4 x i1>, +  iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv4i16_nxv4i16_nxv4i8(<vscale x 4 x i16> %0, ptr %1, <vscale x 4 x i8> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv4i16_nxv4i16_nxv4i8: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e16, m1, ta, ma +; CHECK-NEXT:    vsuxei8.v v8, (a0), v9, v0.t +; CHECK-NEXT:    ret +entry: +  call void @llvm.riscv.vsuxei.mask.nxv4i16.nxv4i8( +    <vscale x 4 x i16> %0, +    ptr %1, +    <vscale x 4 x i8> %2, +    <vscale x 4 x i1> %3, +    iXLen %4) + +  ret void +} + +declare void @llvm.riscv.vsuxei.nxv8i16.nxv8i8( +  <vscale x 8 x i16>, +  ptr, +  <vscale x 8 x i8>, +  iXLen); + +define void @intrinsic_vsuxei_v_nxv8i16_nxv8i16_nxv8i8(<vscale x 8 x i16> %0, ptr %1, <vscale x 8 x i8> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv8i16_nxv8i16_nxv8i8: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e16, m2, ta, ma +; CHECK-NEXT:    vsuxei8.v v8, (a0), v10 +; CHECK-NEXT:    ret +entry: +  call void 
@llvm.riscv.vsuxei.nxv8i16.nxv8i8( +    <vscale x 8 x i16> %0, +    ptr %1, +    <vscale x 8 x i8> %2, +    iXLen %3) + +  ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv8i16.nxv8i8( +  <vscale x 8 x i16>, +  ptr, +  <vscale x 8 x i8>, +  <vscale x 8 x i1>, +  iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv8i16_nxv8i16_nxv8i8(<vscale x 8 x i16> %0, ptr %1, <vscale x 8 x i8> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv8i16_nxv8i16_nxv8i8: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e16, m2, ta, ma +; CHECK-NEXT:    vsuxei8.v v8, (a0), v10, v0.t +; CHECK-NEXT:    ret +entry: +  call void @llvm.riscv.vsuxei.mask.nxv8i16.nxv8i8( +    <vscale x 8 x i16> %0, +    ptr %1, +    <vscale x 8 x i8> %2, +    <vscale x 8 x i1> %3, +    iXLen %4) + +  ret void +} + +declare void @llvm.riscv.vsuxei.nxv16i16.nxv16i8( +  <vscale x 16 x i16>, +  ptr, +  <vscale x 16 x i8>, +  iXLen); + +define void @intrinsic_vsuxei_v_nxv16i16_nxv16i16_nxv16i8(<vscale x 16 x i16> %0, ptr %1, <vscale x 16 x i8> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv16i16_nxv16i16_nxv16i8: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e16, m4, ta, ma +; CHECK-NEXT:    vsuxei8.v v8, (a0), v12 +; CHECK-NEXT:    ret +entry: +  call void @llvm.riscv.vsuxei.nxv16i16.nxv16i8( +    <vscale x 16 x i16> %0, +    ptr %1, +    <vscale x 16 x i8> %2, +    iXLen %3) + +  ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv16i16.nxv16i8( +  <vscale x 16 x i16>, +  ptr, +  <vscale x 16 x i8>, +  <vscale x 16 x i1>, +  iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv16i16_nxv16i16_nxv16i8(<vscale x 16 x i16> %0, ptr %1, <vscale x 16 x i8> %2, <vscale x 16 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv16i16_nxv16i16_nxv16i8: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e16, m4, ta, ma +; CHECK-NEXT:    vsuxei8.v v8, (a0), v12, v0.t +; CHECK-NEXT:    ret +entry: +  call void @llvm.riscv.vsuxei.mask.nxv16i16.nxv16i8( +    <vscale x 16 x i16> %0, +    ptr %1, +    <vscale x 16 x i8> %2, +    <vscale x 16 x i1> %3, +    iXLen %4) + +  ret void +} + +declare void @llvm.riscv.vsuxei.nxv32i16.nxv32i8( +  <vscale x 32 x i16>, +  ptr, +  <vscale x 32 x i8>, +  iXLen); + +define void @intrinsic_vsuxei_v_nxv32i16_nxv32i16_nxv32i8(<vscale x 32 x i16> %0, ptr %1, <vscale x 32 x i8> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv32i16_nxv32i16_nxv32i8: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e16, m8, ta, ma +; CHECK-NEXT:    vsuxei8.v v8, (a0), v16 +; CHECK-NEXT:    ret +entry: +  call void @llvm.riscv.vsuxei.nxv32i16.nxv32i8( +    <vscale x 32 x i16> %0, +    ptr %1, +    <vscale x 32 x i8> %2, +    iXLen %3) + +  ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv32i16.nxv32i8( +  <vscale x 32 x i16>, +  ptr, +  <vscale x 32 x i8>, +  <vscale x 32 x i1>, +  iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv32i16_nxv32i16_nxv32i8(<vscale x 32 x i16> %0, ptr %1, <vscale x 32 x i8> %2, <vscale x 32 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv32i16_nxv32i16_nxv32i8: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e16, m8, ta, ma +; CHECK-NEXT:    vsuxei8.v v8, (a0), v16, v0.t +; CHECK-NEXT:    ret +entry: +  call void @llvm.riscv.vsuxei.mask.nxv32i16.nxv32i8( +    <vscale x 32 x i16> %0, +    ptr %1, +    <vscale x 32 x i8> %2, +    <vscale x 32 x i1> %3, +    iXLen %4) + +  ret void 
+} + +declare void @llvm.riscv.vsuxei.nxv1i32.nxv1i8( +  <vscale x 1 x i32>, +  ptr, +  <vscale x 1 x i8>, +  iXLen); + +define void @intrinsic_vsuxei_v_nxv1i32_nxv1i32_nxv1i8(<vscale x 1 x i32> %0, ptr %1, <vscale x 1 x i8> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv1i32_nxv1i32_nxv1i8: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e32, mf2, ta, ma +; CHECK-NEXT:    vsuxei8.v v8, (a0), v9 +; CHECK-NEXT:    ret +entry: +  call void @llvm.riscv.vsuxei.nxv1i32.nxv1i8( +    <vscale x 1 x i32> %0, +    ptr %1, +    <vscale x 1 x i8> %2, +    iXLen %3) + +  ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv1i32.nxv1i8( +  <vscale x 1 x i32>, +  ptr, +  <vscale x 1 x i8>, +  <vscale x 1 x i1>, +  iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv1i32_nxv1i32_nxv1i8(<vscale x 1 x i32> %0, ptr %1, <vscale x 1 x i8> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv1i32_nxv1i32_nxv1i8: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e32, mf2, ta, ma +; CHECK-NEXT:    vsuxei8.v v8, (a0), v9, v0.t +; CHECK-NEXT:    ret +entry: +  call void @llvm.riscv.vsuxei.mask.nxv1i32.nxv1i8( +    <vscale x 1 x i32> %0, +    ptr %1, +    <vscale x 1 x i8> %2, +    <vscale x 1 x i1> %3, +    iXLen %4) + +  ret void +} + +declare void @llvm.riscv.vsuxei.nxv2i32.nxv2i8( +  <vscale x 2 x i32>, +  ptr, +  <vscale x 2 x i8>, +  iXLen); + +define void @intrinsic_vsuxei_v_nxv2i32_nxv2i32_nxv2i8(<vscale x 2 x i32> %0, ptr %1, <vscale x 2 x i8> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv2i32_nxv2i32_nxv2i8: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e32, m1, ta, ma +; CHECK-NEXT:    vsuxei8.v v8, (a0), v9 +; CHECK-NEXT:    ret +entry: +  call void @llvm.riscv.vsuxei.nxv2i32.nxv2i8( +    <vscale x 2 x i32> %0, +    ptr %1, +    <vscale x 2 x i8> %2, +    iXLen %3) + +  ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv2i32.nxv2i8( +  <vscale x 2 x i32>, +  ptr, +  <vscale x 2 x i8>, +  <vscale x 2 x i1>, +  iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv2i32_nxv2i32_nxv2i8(<vscale x 2 x i32> %0, ptr %1, <vscale x 2 x i8> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv2i32_nxv2i32_nxv2i8: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e32, m1, ta, ma +; CHECK-NEXT:    vsuxei8.v v8, (a0), v9, v0.t +; CHECK-NEXT:    ret +entry: +  call void @llvm.riscv.vsuxei.mask.nxv2i32.nxv2i8( +    <vscale x 2 x i32> %0, +    ptr %1, +    <vscale x 2 x i8> %2, +    <vscale x 2 x i1> %3, +    iXLen %4) + +  ret void +} + +declare void @llvm.riscv.vsuxei.nxv4i32.nxv4i8( +  <vscale x 4 x i32>, +  ptr, +  <vscale x 4 x i8>, +  iXLen); + +define void @intrinsic_vsuxei_v_nxv4i32_nxv4i32_nxv4i8(<vscale x 4 x i32> %0, ptr %1, <vscale x 4 x i8> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv4i32_nxv4i32_nxv4i8: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e32, m2, ta, ma +; CHECK-NEXT:    vsuxei8.v v8, (a0), v10 +; CHECK-NEXT:    ret +entry: +  call void @llvm.riscv.vsuxei.nxv4i32.nxv4i8( +    <vscale x 4 x i32> %0, +    ptr %1, +    <vscale x 4 x i8> %2, +    iXLen %3) + +  ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv4i32.nxv4i8( +  <vscale x 4 x i32>, +  ptr, +  <vscale x 4 x i8>, +  <vscale x 4 x i1>, +  iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv4i32_nxv4i32_nxv4i8(<vscale x 4 x i32> %0, ptr %1, <vscale x 4 x i8> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind 
{ +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv4i32_nxv4i32_nxv4i8: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e32, m2, ta, ma +; CHECK-NEXT:    vsuxei8.v v8, (a0), v10, v0.t +; CHECK-NEXT:    ret +entry: +  call void @llvm.riscv.vsuxei.mask.nxv4i32.nxv4i8( +    <vscale x 4 x i32> %0, +    ptr %1, +    <vscale x 4 x i8> %2, +    <vscale x 4 x i1> %3, +    iXLen %4) + +  ret void +} + +declare void @llvm.riscv.vsuxei.nxv8i32.nxv8i8( +  <vscale x 8 x i32>, +  ptr, +  <vscale x 8 x i8>, +  iXLen); + +define void @intrinsic_vsuxei_v_nxv8i32_nxv8i32_nxv8i8(<vscale x 8 x i32> %0, ptr %1, <vscale x 8 x i8> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv8i32_nxv8i32_nxv8i8: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e32, m4, ta, ma +; CHECK-NEXT:    vsuxei8.v v8, (a0), v12 +; CHECK-NEXT:    ret +entry: +  call void @llvm.riscv.vsuxei.nxv8i32.nxv8i8( +    <vscale x 8 x i32> %0, +    ptr %1, +    <vscale x 8 x i8> %2, +    iXLen %3) + +  ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv8i32.nxv8i8( +  <vscale x 8 x i32>, +  ptr, +  <vscale x 8 x i8>, +  <vscale x 8 x i1>, +  iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv8i32_nxv8i32_nxv8i8(<vscale x 8 x i32> %0, ptr %1, <vscale x 8 x i8> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv8i32_nxv8i32_nxv8i8: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e32, m4, ta, ma +; CHECK-NEXT:    vsuxei8.v v8, (a0), v12, v0.t +; CHECK-NEXT:    ret +entry: +  call void @llvm.riscv.vsuxei.mask.nxv8i32.nxv8i8( +    <vscale x 8 x i32> %0, +    ptr %1, +    <vscale x 8 x i8> %2, +    <vscale x 8 x i1> %3, +    iXLen %4) + +  ret void +} + +declare void @llvm.riscv.vsuxei.nxv16i32.nxv16i8( +  <vscale x 16 x i32>, +  ptr, +  <vscale x 16 x i8>, +  iXLen); + +define void @intrinsic_vsuxei_v_nxv16i32_nxv16i32_nxv16i8(<vscale x 16 x i32> %0, ptr %1, <vscale x 16 x i8> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv16i32_nxv16i32_nxv16i8: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e32, m8, ta, ma +; CHECK-NEXT:    vsuxei8.v v8, (a0), v16 +; CHECK-NEXT:    ret +entry: +  call void @llvm.riscv.vsuxei.nxv16i32.nxv16i8( +    <vscale x 16 x i32> %0, +    ptr %1, +    <vscale x 16 x i8> %2, +    iXLen %3) + +  ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv16i32.nxv16i8( +  <vscale x 16 x i32>, +  ptr, +  <vscale x 16 x i8>, +  <vscale x 16 x i1>, +  iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv16i32_nxv16i32_nxv16i8(<vscale x 16 x i32> %0, ptr %1, <vscale x 16 x i8> %2, <vscale x 16 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv16i32_nxv16i32_nxv16i8: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e32, m8, ta, ma +; CHECK-NEXT:    vsuxei8.v v8, (a0), v16, v0.t +; CHECK-NEXT:    ret +entry: +  call void @llvm.riscv.vsuxei.mask.nxv16i32.nxv16i8( +    <vscale x 16 x i32> %0, +    ptr %1, +    <vscale x 16 x i8> %2, +    <vscale x 16 x i1> %3, +    iXLen %4) + +  ret void +} + +declare void @llvm.riscv.vsuxei.nxv1i64.nxv1i8( +  <vscale x 1 x i64>, +  ptr, +  <vscale x 1 x i8>, +  iXLen); + +define void @intrinsic_vsuxei_v_nxv1i64_nxv1i64_nxv1i8(<vscale x 1 x i64> %0, ptr %1, <vscale x 1 x i8> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv1i64_nxv1i64_nxv1i8: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e64, m1, ta, ma +; CHECK-NEXT:    vsuxei8.v v8, (a0), v9 +; CHECK-NEXT:    ret +entry: +  
call void @llvm.riscv.vsuxei.nxv1i64.nxv1i8( +    <vscale x 1 x i64> %0, +    ptr %1, +    <vscale x 1 x i8> %2, +    iXLen %3) + +  ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv1i64.nxv1i8( +  <vscale x 1 x i64>, +  ptr, +  <vscale x 1 x i8>, +  <vscale x 1 x i1>, +  iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv1i64_nxv1i64_nxv1i8(<vscale x 1 x i64> %0, ptr %1, <vscale x 1 x i8> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv1i64_nxv1i64_nxv1i8: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e64, m1, ta, ma +; CHECK-NEXT:    vsuxei8.v v8, (a0), v9, v0.t +; CHECK-NEXT:    ret +entry: +  call void @llvm.riscv.vsuxei.mask.nxv1i64.nxv1i8( +    <vscale x 1 x i64> %0, +    ptr %1, +    <vscale x 1 x i8> %2, +    <vscale x 1 x i1> %3, +    iXLen %4) + +  ret void +} + +declare void @llvm.riscv.vsuxei.nxv2i64.nxv2i8( +  <vscale x 2 x i64>, +  ptr, +  <vscale x 2 x i8>, +  iXLen); + +define void @intrinsic_vsuxei_v_nxv2i64_nxv2i64_nxv2i8(<vscale x 2 x i64> %0, ptr %1, <vscale x 2 x i8> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv2i64_nxv2i64_nxv2i8: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e64, m2, ta, ma +; CHECK-NEXT:    vsuxei8.v v8, (a0), v10 +; CHECK-NEXT:    ret +entry: +  call void @llvm.riscv.vsuxei.nxv2i64.nxv2i8( +    <vscale x 2 x i64> %0, +    ptr %1, +    <vscale x 2 x i8> %2, +    iXLen %3) + +  ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv2i64.nxv2i8( +  <vscale x 2 x i64>, +  ptr, +  <vscale x 2 x i8>, +  <vscale x 2 x i1>, +  iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv2i64_nxv2i64_nxv2i8(<vscale x 2 x i64> %0, ptr %1, <vscale x 2 x i8> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv2i64_nxv2i64_nxv2i8: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e64, m2, ta, ma +; CHECK-NEXT:    vsuxei8.v v8, (a0), v10, v0.t +; CHECK-NEXT:    ret +entry: +  call void @llvm.riscv.vsuxei.mask.nxv2i64.nxv2i8( +    <vscale x 2 x i64> %0, +    ptr %1, +    <vscale x 2 x i8> %2, +    <vscale x 2 x i1> %3, +    iXLen %4) + +  ret void +} + +declare void @llvm.riscv.vsuxei.nxv4i64.nxv4i8( +  <vscale x 4 x i64>, +  ptr, +  <vscale x 4 x i8>, +  iXLen); + +define void @intrinsic_vsuxei_v_nxv4i64_nxv4i64_nxv4i8(<vscale x 4 x i64> %0, ptr %1, <vscale x 4 x i8> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv4i64_nxv4i64_nxv4i8: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e64, m4, ta, ma +; CHECK-NEXT:    vsuxei8.v v8, (a0), v12 +; CHECK-NEXT:    ret +entry: +  call void @llvm.riscv.vsuxei.nxv4i64.nxv4i8( +    <vscale x 4 x i64> %0, +    ptr %1, +    <vscale x 4 x i8> %2, +    iXLen %3) + +  ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv4i64.nxv4i8( +  <vscale x 4 x i64>, +  ptr, +  <vscale x 4 x i8>, +  <vscale x 4 x i1>, +  iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv4i64_nxv4i64_nxv4i8(<vscale x 4 x i64> %0, ptr %1, <vscale x 4 x i8> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv4i64_nxv4i64_nxv4i8: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e64, m4, ta, ma +; CHECK-NEXT:    vsuxei8.v v8, (a0), v12, v0.t +; CHECK-NEXT:    ret +entry: +  call void @llvm.riscv.vsuxei.mask.nxv4i64.nxv4i8( +    <vscale x 4 x i64> %0, +    ptr %1, +    <vscale x 4 x i8> %2, +    <vscale x 4 x i1> %3, +    iXLen %4) + +  ret void +} + +declare void @llvm.riscv.vsuxei.nxv8i64.nxv8i8( +  
<vscale x 8 x i64>, +  ptr, +  <vscale x 8 x i8>, +  iXLen); + +define void @intrinsic_vsuxei_v_nxv8i64_nxv8i64_nxv8i8(<vscale x 8 x i64> %0, ptr %1, <vscale x 8 x i8> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv8i64_nxv8i64_nxv8i8: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e64, m8, ta, ma +; CHECK-NEXT:    vsuxei8.v v8, (a0), v16 +; CHECK-NEXT:    ret +entry: +  call void @llvm.riscv.vsuxei.nxv8i64.nxv8i8( +    <vscale x 8 x i64> %0, +    ptr %1, +    <vscale x 8 x i8> %2, +    iXLen %3) + +  ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv8i64.nxv8i8( +  <vscale x 8 x i64>, +  ptr, +  <vscale x 8 x i8>, +  <vscale x 8 x i1>, +  iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv8i64_nxv8i64_nxv8i8(<vscale x 8 x i64> %0, ptr %1, <vscale x 8 x i8> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv8i64_nxv8i64_nxv8i8: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e64, m8, ta, ma +; CHECK-NEXT:    vsuxei8.v v8, (a0), v16, v0.t +; CHECK-NEXT:    ret +entry: +  call void @llvm.riscv.vsuxei.mask.nxv8i64.nxv8i8( +    <vscale x 8 x i64> %0, +    ptr %1, +    <vscale x 8 x i8> %2, +    <vscale x 8 x i1> %3, +    iXLen %4) + +  ret void +} + +declare void @llvm.riscv.vsuxei.nxv1f16.nxv1i8( +  <vscale x 1 x half>, +  ptr, +  <vscale x 1 x i8>, +  iXLen); + +define void @intrinsic_vsuxei_v_nxv1f16_nxv1f16_nxv1i8(<vscale x 1 x half> %0, ptr %1, <vscale x 1 x i8> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv1f16_nxv1f16_nxv1i8: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e16, mf4, ta, ma +; CHECK-NEXT:    vsuxei8.v v8, (a0), v9 +; CHECK-NEXT:    ret +entry: +  call void @llvm.riscv.vsuxei.nxv1f16.nxv1i8( +    <vscale x 1 x half> %0, +    ptr %1, +    <vscale x 1 x i8> %2, +    iXLen %3) + +  ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv1f16.nxv1i8( +  <vscale x 1 x half>, +  ptr, +  <vscale x 1 x i8>, +  <vscale x 1 x i1>, +  iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv1f16_nxv1f16_nxv1i8(<vscale x 1 x half> %0, ptr %1, <vscale x 1 x i8> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv1f16_nxv1f16_nxv1i8: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e16, mf4, ta, ma +; CHECK-NEXT:    vsuxei8.v v8, (a0), v9, v0.t +; CHECK-NEXT:    ret +entry: +  call void @llvm.riscv.vsuxei.mask.nxv1f16.nxv1i8( +    <vscale x 1 x half> %0, +    ptr %1, +    <vscale x 1 x i8> %2, +    <vscale x 1 x i1> %3, +    iXLen %4) + +  ret void +} + +declare void @llvm.riscv.vsuxei.nxv2f16.nxv2i8( +  <vscale x 2 x half>, +  ptr, +  <vscale x 2 x i8>, +  iXLen); + +define void @intrinsic_vsuxei_v_nxv2f16_nxv2f16_nxv2i8(<vscale x 2 x half> %0, ptr %1, <vscale x 2 x i8> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv2f16_nxv2f16_nxv2i8: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e16, mf2, ta, ma +; CHECK-NEXT:    vsuxei8.v v8, (a0), v9 +; CHECK-NEXT:    ret +entry: +  call void @llvm.riscv.vsuxei.nxv2f16.nxv2i8( +    <vscale x 2 x half> %0, +    ptr %1, +    <vscale x 2 x i8> %2, +    iXLen %3) + +  ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv2f16.nxv2i8( +  <vscale x 2 x half>, +  ptr, +  <vscale x 2 x i8>, +  <vscale x 2 x i1>, +  iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv2f16_nxv2f16_nxv2i8(<vscale x 2 x half> %0, ptr %1, <vscale x 2 x i8> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: 
intrinsic_vsuxei_mask_v_nxv2f16_nxv2f16_nxv2i8: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e16, mf2, ta, ma +; CHECK-NEXT:    vsuxei8.v v8, (a0), v9, v0.t +; CHECK-NEXT:    ret +entry: +  call void @llvm.riscv.vsuxei.mask.nxv2f16.nxv2i8( +    <vscale x 2 x half> %0, +    ptr %1, +    <vscale x 2 x i8> %2, +    <vscale x 2 x i1> %3, +    iXLen %4) + +  ret void +} + +declare void @llvm.riscv.vsuxei.nxv4f16.nxv4i8( +  <vscale x 4 x half>, +  ptr, +  <vscale x 4 x i8>, +  iXLen); + +define void @intrinsic_vsuxei_v_nxv4f16_nxv4f16_nxv4i8(<vscale x 4 x half> %0, ptr %1, <vscale x 4 x i8> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv4f16_nxv4f16_nxv4i8: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e16, m1, ta, ma +; CHECK-NEXT:    vsuxei8.v v8, (a0), v9 +; CHECK-NEXT:    ret +entry: +  call void @llvm.riscv.vsuxei.nxv4f16.nxv4i8( +    <vscale x 4 x half> %0, +    ptr %1, +    <vscale x 4 x i8> %2, +    iXLen %3) + +  ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv4f16.nxv4i8( +  <vscale x 4 x half>, +  ptr, +  <vscale x 4 x i8>, +  <vscale x 4 x i1>, +  iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv4f16_nxv4f16_nxv4i8(<vscale x 4 x half> %0, ptr %1, <vscale x 4 x i8> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv4f16_nxv4f16_nxv4i8: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e16, m1, ta, ma +; CHECK-NEXT:    vsuxei8.v v8, (a0), v9, v0.t +; CHECK-NEXT:    ret +entry: +  call void @llvm.riscv.vsuxei.mask.nxv4f16.nxv4i8( +    <vscale x 4 x half> %0, +    ptr %1, +    <vscale x 4 x i8> %2, +    <vscale x 4 x i1> %3, +    iXLen %4) + +  ret void +} + +declare void @llvm.riscv.vsuxei.nxv8f16.nxv8i8( +  <vscale x 8 x half>, +  ptr, +  <vscale x 8 x i8>, +  iXLen); + +define void @intrinsic_vsuxei_v_nxv8f16_nxv8f16_nxv8i8(<vscale x 8 x half> %0, ptr %1, <vscale x 8 x i8> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv8f16_nxv8f16_nxv8i8: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e16, m2, ta, ma +; CHECK-NEXT:    vsuxei8.v v8, (a0), v10 +; CHECK-NEXT:    ret +entry: +  call void @llvm.riscv.vsuxei.nxv8f16.nxv8i8( +    <vscale x 8 x half> %0, +    ptr %1, +    <vscale x 8 x i8> %2, +    iXLen %3) + +  ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv8f16.nxv8i8( +  <vscale x 8 x half>, +  ptr, +  <vscale x 8 x i8>, +  <vscale x 8 x i1>, +  iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv8f16_nxv8f16_nxv8i8(<vscale x 8 x half> %0, ptr %1, <vscale x 8 x i8> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv8f16_nxv8f16_nxv8i8: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e16, m2, ta, ma +; CHECK-NEXT:    vsuxei8.v v8, (a0), v10, v0.t +; CHECK-NEXT:    ret +entry: +  call void @llvm.riscv.vsuxei.mask.nxv8f16.nxv8i8( +    <vscale x 8 x half> %0, +    ptr %1, +    <vscale x 8 x i8> %2, +    <vscale x 8 x i1> %3, +    iXLen %4) + +  ret void +} + +declare void @llvm.riscv.vsuxei.nxv16f16.nxv16i8( +  <vscale x 16 x half>, +  ptr, +  <vscale x 16 x i8>, +  iXLen); + +define void @intrinsic_vsuxei_v_nxv16f16_nxv16f16_nxv16i8(<vscale x 16 x half> %0, ptr %1, <vscale x 16 x i8> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv16f16_nxv16f16_nxv16i8: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e16, m4, ta, ma +; CHECK-NEXT:    vsuxei8.v v8, (a0), v12 +; CHECK-NEXT:    ret +entry: +  call void 
@llvm.riscv.vsuxei.nxv16f16.nxv16i8( +    <vscale x 16 x half> %0, +    ptr %1, +    <vscale x 16 x i8> %2, +    iXLen %3) + +  ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv16f16.nxv16i8( +  <vscale x 16 x half>, +  ptr, +  <vscale x 16 x i8>, +  <vscale x 16 x i1>, +  iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv16f16_nxv16f16_nxv16i8(<vscale x 16 x half> %0, ptr %1, <vscale x 16 x i8> %2, <vscale x 16 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv16f16_nxv16f16_nxv16i8: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e16, m4, ta, ma +; CHECK-NEXT:    vsuxei8.v v8, (a0), v12, v0.t +; CHECK-NEXT:    ret +entry: +  call void @llvm.riscv.vsuxei.mask.nxv16f16.nxv16i8( +    <vscale x 16 x half> %0, +    ptr %1, +    <vscale x 16 x i8> %2, +    <vscale x 16 x i1> %3, +    iXLen %4) + +  ret void +} + +declare void @llvm.riscv.vsuxei.nxv32f16.nxv32i8( +  <vscale x 32 x half>, +  ptr, +  <vscale x 32 x i8>, +  iXLen); + +define void @intrinsic_vsuxei_v_nxv32f16_nxv32f16_nxv32i8(<vscale x 32 x half> %0, ptr %1, <vscale x 32 x i8> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv32f16_nxv32f16_nxv32i8: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e16, m8, ta, ma +; CHECK-NEXT:    vsuxei8.v v8, (a0), v16 +; CHECK-NEXT:    ret +entry: +  call void @llvm.riscv.vsuxei.nxv32f16.nxv32i8( +    <vscale x 32 x half> %0, +    ptr %1, +    <vscale x 32 x i8> %2, +    iXLen %3) + +  ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv32f16.nxv32i8( +  <vscale x 32 x half>, +  ptr, +  <vscale x 32 x i8>, +  <vscale x 32 x i1>, +  iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv32f16_nxv32f16_nxv32i8(<vscale x 32 x half> %0, ptr %1, <vscale x 32 x i8> %2, <vscale x 32 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv32f16_nxv32f16_nxv32i8: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e16, m8, ta, ma +; CHECK-NEXT:    vsuxei8.v v8, (a0), v16, v0.t +; CHECK-NEXT:    ret +entry: +  call void @llvm.riscv.vsuxei.mask.nxv32f16.nxv32i8( +    <vscale x 32 x half> %0, +    ptr %1, +    <vscale x 32 x i8> %2, +    <vscale x 32 x i1> %3, +    iXLen %4) + +  ret void +} + +declare void @llvm.riscv.vsuxei.nxv1f32.nxv1i8( +  <vscale x 1 x float>, +  ptr, +  <vscale x 1 x i8>, +  iXLen); + +define void @intrinsic_vsuxei_v_nxv1f32_nxv1f32_nxv1i8(<vscale x 1 x float> %0, ptr %1, <vscale x 1 x i8> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv1f32_nxv1f32_nxv1i8: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e32, mf2, ta, ma +; CHECK-NEXT:    vsuxei8.v v8, (a0), v9 +; CHECK-NEXT:    ret +entry: +  call void @llvm.riscv.vsuxei.nxv1f32.nxv1i8( +    <vscale x 1 x float> %0, +    ptr %1, +    <vscale x 1 x i8> %2, +    iXLen %3) + +  ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv1f32.nxv1i8( +  <vscale x 1 x float>, +  ptr, +  <vscale x 1 x i8>, +  <vscale x 1 x i1>, +  iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv1f32_nxv1f32_nxv1i8(<vscale x 1 x float> %0, ptr %1, <vscale x 1 x i8> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv1f32_nxv1f32_nxv1i8: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e32, mf2, ta, ma +; CHECK-NEXT:    vsuxei8.v v8, (a0), v9, v0.t +; CHECK-NEXT:    ret +entry: +  call void @llvm.riscv.vsuxei.mask.nxv1f32.nxv1i8( +    <vscale x 1 x float> %0, +    ptr %1, +    <vscale x 1 x i8> %2, +    <vscale x 1 x i1> %3, +    iXLen %4) + +  
ret void +} + +declare void @llvm.riscv.vsuxei.nxv2f32.nxv2i8( +  <vscale x 2 x float>, +  ptr, +  <vscale x 2 x i8>, +  iXLen); + +define void @intrinsic_vsuxei_v_nxv2f32_nxv2f32_nxv2i8(<vscale x 2 x float> %0, ptr %1, <vscale x 2 x i8> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv2f32_nxv2f32_nxv2i8: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e32, m1, ta, ma +; CHECK-NEXT:    vsuxei8.v v8, (a0), v9 +; CHECK-NEXT:    ret +entry: +  call void @llvm.riscv.vsuxei.nxv2f32.nxv2i8( +    <vscale x 2 x float> %0, +    ptr %1, +    <vscale x 2 x i8> %2, +    iXLen %3) + +  ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv2f32.nxv2i8( +  <vscale x 2 x float>, +  ptr, +  <vscale x 2 x i8>, +  <vscale x 2 x i1>, +  iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv2f32_nxv2f32_nxv2i8(<vscale x 2 x float> %0, ptr %1, <vscale x 2 x i8> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv2f32_nxv2f32_nxv2i8: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e32, m1, ta, ma +; CHECK-NEXT:    vsuxei8.v v8, (a0), v9, v0.t +; CHECK-NEXT:    ret +entry: +  call void @llvm.riscv.vsuxei.mask.nxv2f32.nxv2i8( +    <vscale x 2 x float> %0, +    ptr %1, +    <vscale x 2 x i8> %2, +    <vscale x 2 x i1> %3, +    iXLen %4) + +  ret void +} + +declare void @llvm.riscv.vsuxei.nxv4f32.nxv4i8( +  <vscale x 4 x float>, +  ptr, +  <vscale x 4 x i8>, +  iXLen); + +define void @intrinsic_vsuxei_v_nxv4f32_nxv4f32_nxv4i8(<vscale x 4 x float> %0, ptr %1, <vscale x 4 x i8> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv4f32_nxv4f32_nxv4i8: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e32, m2, ta, ma +; CHECK-NEXT:    vsuxei8.v v8, (a0), v10 +; CHECK-NEXT:    ret +entry: +  call void @llvm.riscv.vsuxei.nxv4f32.nxv4i8( +    <vscale x 4 x float> %0, +    ptr %1, +    <vscale x 4 x i8> %2, +    iXLen %3) + +  ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv4f32.nxv4i8( +  <vscale x 4 x float>, +  ptr, +  <vscale x 4 x i8>, +  <vscale x 4 x i1>, +  iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv4f32_nxv4f32_nxv4i8(<vscale x 4 x float> %0, ptr %1, <vscale x 4 x i8> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv4f32_nxv4f32_nxv4i8: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e32, m2, ta, ma +; CHECK-NEXT:    vsuxei8.v v8, (a0), v10, v0.t +; CHECK-NEXT:    ret +entry: +  call void @llvm.riscv.vsuxei.mask.nxv4f32.nxv4i8( +    <vscale x 4 x float> %0, +    ptr %1, +    <vscale x 4 x i8> %2, +    <vscale x 4 x i1> %3, +    iXLen %4) + +  ret void +} + +declare void @llvm.riscv.vsuxei.nxv8f32.nxv8i8( +  <vscale x 8 x float>, +  ptr, +  <vscale x 8 x i8>, +  iXLen); + +define void @intrinsic_vsuxei_v_nxv8f32_nxv8f32_nxv8i8(<vscale x 8 x float> %0, ptr %1, <vscale x 8 x i8> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv8f32_nxv8f32_nxv8i8: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e32, m4, ta, ma +; CHECK-NEXT:    vsuxei8.v v8, (a0), v12 +; CHECK-NEXT:    ret +entry: +  call void @llvm.riscv.vsuxei.nxv8f32.nxv8i8( +    <vscale x 8 x float> %0, +    ptr %1, +    <vscale x 8 x i8> %2, +    iXLen %3) + +  ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv8f32.nxv8i8( +  <vscale x 8 x float>, +  ptr, +  <vscale x 8 x i8>, +  <vscale x 8 x i1>, +  iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv8f32_nxv8f32_nxv8i8(<vscale x 8 x float> %0, ptr %1, <vscale x 8 x i8> 
%2, <vscale x 8 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv8f32_nxv8f32_nxv8i8: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e32, m4, ta, ma +; CHECK-NEXT:    vsuxei8.v v8, (a0), v12, v0.t +; CHECK-NEXT:    ret +entry: +  call void @llvm.riscv.vsuxei.mask.nxv8f32.nxv8i8( +    <vscale x 8 x float> %0, +    ptr %1, +    <vscale x 8 x i8> %2, +    <vscale x 8 x i1> %3, +    iXLen %4) + +  ret void +} + +declare void @llvm.riscv.vsuxei.nxv16f32.nxv16i8( +  <vscale x 16 x float>, +  ptr, +  <vscale x 16 x i8>, +  iXLen); + +define void @intrinsic_vsuxei_v_nxv16f32_nxv16f32_nxv16i8(<vscale x 16 x float> %0, ptr %1, <vscale x 16 x i8> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv16f32_nxv16f32_nxv16i8: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e32, m8, ta, ma +; CHECK-NEXT:    vsuxei8.v v8, (a0), v16 +; CHECK-NEXT:    ret +entry: +  call void @llvm.riscv.vsuxei.nxv16f32.nxv16i8( +    <vscale x 16 x float> %0, +    ptr %1, +    <vscale x 16 x i8> %2, +    iXLen %3) + +  ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv16f32.nxv16i8( +  <vscale x 16 x float>, +  ptr, +  <vscale x 16 x i8>, +  <vscale x 16 x i1>, +  iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv16f32_nxv16f32_nxv16i8(<vscale x 16 x float> %0, ptr %1, <vscale x 16 x i8> %2, <vscale x 16 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv16f32_nxv16f32_nxv16i8: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e32, m8, ta, ma +; CHECK-NEXT:    vsuxei8.v v8, (a0), v16, v0.t +; CHECK-NEXT:    ret +entry: +  call void @llvm.riscv.vsuxei.mask.nxv16f32.nxv16i8( +    <vscale x 16 x float> %0, +    ptr %1, +    <vscale x 16 x i8> %2, +    <vscale x 16 x i1> %3, +    iXLen %4) + +  ret void +} + +declare void @llvm.riscv.vsuxei.nxv1f64.nxv1i8( +  <vscale x 1 x double>, +  ptr, +  <vscale x 1 x i8>, +  iXLen); + +define void @intrinsic_vsuxei_v_nxv1f64_nxv1f64_nxv1i8(<vscale x 1 x double> %0, ptr %1, <vscale x 1 x i8> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv1f64_nxv1f64_nxv1i8: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e64, m1, ta, ma +; CHECK-NEXT:    vsuxei8.v v8, (a0), v9 +; CHECK-NEXT:    ret +entry: +  call void @llvm.riscv.vsuxei.nxv1f64.nxv1i8( +    <vscale x 1 x double> %0, +    ptr %1, +    <vscale x 1 x i8> %2, +    iXLen %3) + +  ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv1f64.nxv1i8( +  <vscale x 1 x double>, +  ptr, +  <vscale x 1 x i8>, +  <vscale x 1 x i1>, +  iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv1f64_nxv1f64_nxv1i8(<vscale x 1 x double> %0, ptr %1, <vscale x 1 x i8> %2, <vscale x 1 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv1f64_nxv1f64_nxv1i8: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e64, m1, ta, ma +; CHECK-NEXT:    vsuxei8.v v8, (a0), v9, v0.t +; CHECK-NEXT:    ret +entry: +  call void @llvm.riscv.vsuxei.mask.nxv1f64.nxv1i8( +    <vscale x 1 x double> %0, +    ptr %1, +    <vscale x 1 x i8> %2, +    <vscale x 1 x i1> %3, +    iXLen %4) + +  ret void +} + +declare void @llvm.riscv.vsuxei.nxv2f64.nxv2i8( +  <vscale x 2 x double>, +  ptr, +  <vscale x 2 x i8>, +  iXLen); + +define void @intrinsic_vsuxei_v_nxv2f64_nxv2f64_nxv2i8(<vscale x 2 x double> %0, ptr %1, <vscale x 2 x i8> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv2f64_nxv2f64_nxv2i8: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e64, m2, 
ta, ma +; CHECK-NEXT:    vsuxei8.v v8, (a0), v10 +; CHECK-NEXT:    ret +entry: +  call void @llvm.riscv.vsuxei.nxv2f64.nxv2i8( +    <vscale x 2 x double> %0, +    ptr %1, +    <vscale x 2 x i8> %2, +    iXLen %3) + +  ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv2f64.nxv2i8( +  <vscale x 2 x double>, +  ptr, +  <vscale x 2 x i8>, +  <vscale x 2 x i1>, +  iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv2f64_nxv2f64_nxv2i8(<vscale x 2 x double> %0, ptr %1, <vscale x 2 x i8> %2, <vscale x 2 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv2f64_nxv2f64_nxv2i8: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e64, m2, ta, ma +; CHECK-NEXT:    vsuxei8.v v8, (a0), v10, v0.t +; CHECK-NEXT:    ret +entry: +  call void @llvm.riscv.vsuxei.mask.nxv2f64.nxv2i8( +    <vscale x 2 x double> %0, +    ptr %1, +    <vscale x 2 x i8> %2, +    <vscale x 2 x i1> %3, +    iXLen %4) + +  ret void +} + +declare void @llvm.riscv.vsuxei.nxv4f64.nxv4i8( +  <vscale x 4 x double>, +  ptr, +  <vscale x 4 x i8>, +  iXLen); + +define void @intrinsic_vsuxei_v_nxv4f64_nxv4f64_nxv4i8(<vscale x 4 x double> %0, ptr %1, <vscale x 4 x i8> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv4f64_nxv4f64_nxv4i8: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e64, m4, ta, ma +; CHECK-NEXT:    vsuxei8.v v8, (a0), v12 +; CHECK-NEXT:    ret +entry: +  call void @llvm.riscv.vsuxei.nxv4f64.nxv4i8( +    <vscale x 4 x double> %0, +    ptr %1, +    <vscale x 4 x i8> %2, +    iXLen %3) + +  ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv4f64.nxv4i8( +  <vscale x 4 x double>, +  ptr, +  <vscale x 4 x i8>, +  <vscale x 4 x i1>, +  iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv4f64_nxv4f64_nxv4i8(<vscale x 4 x double> %0, ptr %1, <vscale x 4 x i8> %2, <vscale x 4 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv4f64_nxv4f64_nxv4i8: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e64, m4, ta, ma +; CHECK-NEXT:    vsuxei8.v v8, (a0), v12, v0.t +; CHECK-NEXT:    ret +entry: +  call void @llvm.riscv.vsuxei.mask.nxv4f64.nxv4i8( +    <vscale x 4 x double> %0, +    ptr %1, +    <vscale x 4 x i8> %2, +    <vscale x 4 x i1> %3, +    iXLen %4) + +  ret void +} + +declare void @llvm.riscv.vsuxei.nxv8f64.nxv8i8( +  <vscale x 8 x double>, +  ptr, +  <vscale x 8 x i8>, +  iXLen); + +define void @intrinsic_vsuxei_v_nxv8f64_nxv8f64_nxv8i8(<vscale x 8 x double> %0, ptr %1, <vscale x 8 x i8> %2, iXLen %3) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_v_nxv8f64_nxv8f64_nxv8i8: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e64, m8, ta, ma +; CHECK-NEXT:    vsuxei8.v v8, (a0), v16 +; CHECK-NEXT:    ret +entry: +  call void @llvm.riscv.vsuxei.nxv8f64.nxv8i8( +    <vscale x 8 x double> %0, +    ptr %1, +    <vscale x 8 x i8> %2, +    iXLen %3) + +  ret void +} + +declare void @llvm.riscv.vsuxei.mask.nxv8f64.nxv8i8( +  <vscale x 8 x double>, +  ptr, +  <vscale x 8 x i8>, +  <vscale x 8 x i1>, +  iXLen); + +define void @intrinsic_vsuxei_mask_v_nxv8f64_nxv8f64_nxv8i8(<vscale x 8 x double> %0, ptr %1, <vscale x 8 x i8> %2, <vscale x 8 x i1> %3, iXLen %4) nounwind { +; CHECK-LABEL: intrinsic_vsuxei_mask_v_nxv8f64_nxv8f64_nxv8i8: +; CHECK:       # %bb.0: # %entry +; CHECK-NEXT:    vsetvli zero, a1, e64, m8, ta, ma +; CHECK-NEXT:    vsuxei8.v v8, (a0), v16, v0.t +; CHECK-NEXT:    ret +entry: +  call void @llvm.riscv.vsuxei.mask.nxv8f64.nxv8i8( +    <vscale x 8 x double> %0, +    ptr %1, +    <vscale x 
8 x i8> %2, +    <vscale x 8 x i1> %3, +    iXLen %4) + +  ret void +} diff --git a/llvm/test/CodeGen/RISCV/features-info.ll b/llvm/test/CodeGen/RISCV/features-info.ll index 988d049..cf44af6 100644 --- a/llvm/test/CodeGen/RISCV/features-info.ll +++ b/llvm/test/CodeGen/RISCV/features-info.ll @@ -137,6 +137,7 @@  ; CHECK-NEXT:   shifted-zextw-fusion             - Enable SLLI+SRLI to be fused when computing (shifted) word zero extension.  ; CHECK-NEXT:   shlcofideleg                     - 'Shlcofideleg' (Delegating LCOFI Interrupts to VS-mode).  ; CHECK-NEXT:   short-forward-branch-i-minmax    - Enable short forward branch optimization for min,max instructions in Zbb. +; CHECK-NEXT:   short-forward-branch-i-mul       - Enable short forward branch optimization for mul instruction.  ; CHECK-NEXT:   short-forward-branch-opt         - Enable short forward branch optimization.  ; CHECK-NEXT:   shtvala                          - 'Shtvala' (htval provides all needed values).  ; CHECK-NEXT:   shvsatpa                         - 'Shvsatpa' (vsatp supports all modes supported by satp). diff --git a/llvm/test/CodeGen/RISCV/replace-with-veclib-sleef-scalable.ll b/llvm/test/CodeGen/RISCV/replace-with-veclib-sleef-scalable.ll index c489bc3..aa63552 100644 --- a/llvm/test/CodeGen/RISCV/replace-with-veclib-sleef-scalable.ll +++ b/llvm/test/CodeGen/RISCV/replace-with-veclib-sleef-scalable.ll @@ -488,5 +488,5 @@ declare <vscale x 2 x double> @llvm.trunc.nxv2f64(<vscale x 2 x double>)  declare <vscale x 4 x float> @llvm.trunc.nxv4f32(<vscale x 4 x float>)  ;.  ; CHECK: attributes #[[ATTR0]] = { "target-features"="+v" } -; CHECK: attributes #[[ATTR1:[0-9]+]] = { nocallback nofree nosync nounwind speculatable willreturn memory(none) "target-features"="+v" } +; CHECK: attributes #[[ATTR1:[0-9]+]] = { nocallback nocreateundeforpoison nofree nosync nounwind speculatable willreturn memory(none) "target-features"="+v" }  ;. diff --git a/llvm/test/CodeGen/RISCV/rv64-stackmap-fp.ll b/llvm/test/CodeGen/RISCV/rv64-stackmap-fp.ll new file mode 100644 index 0000000..bf0a2e5 --- /dev/null +++ b/llvm/test/CodeGen/RISCV/rv64-stackmap-fp.ll @@ -0,0 +1,41 @@ +; RUN: llc -mtriple=riscv64 -mattr=+d,+zfh < %s | FileCheck %s + +; CHECK-LABEL:  .section	.llvm_stackmaps +; CHECK-NEXT:  __LLVM_StackMaps: +; Header +; CHECK-NEXT:   .byte   3 +; CHECK-NEXT:   .byte   0 +; CHECK-NEXT:   .half   0 +; Num Functions +; CHECK-NEXT:   .word   1 +; Num LargeConstants +; CHECK-NEXT:   .word   0 +; Num Callsites +; CHECK-NEXT:   .word   1 + +; Functions and stack size +; CHECK-NEXT:   .quad   liveArgs +; CHECK-NEXT:   .quad   0 +; CHECK-NEXT:   .quad   1 + +; Spilled stack map values. +; +; Verify 3 stack map entries. +; +; CHECK-LABEL:  .word   .L{{.*}}-liveArgs +; CHECK-NEXT:   .half   0 +; CHECK-NEXT:   .half   25 +; +; Check that at least one is a spilled entry from SP. +; Location: Indirect SP + ... 
+; CHECK:        .byte   3 +; CHECK-NEXT:   .byte   0 +; CHECK-NEXT:   .half   8 +; CHECK-NEXT:   .half   2 +; CHECK-NEXT:   .half   0 +; CHECK-NEXT:   .word +define void @liveArgs(double %arg0, double %arg1, double %arg2, double %arg3, double %arg4, double %arg5, double %arg6, double %arg7, double %arg8, double %arg9, double %arg10, double %arg11, double %arg12, double %arg13, double %arg14, double %arg15, double %arg16, double %arg17, double %arg18, double %arg19, double %arg20, double %arg21, double %arg22, double %arg23, half %arg24, half %arg25, half %arg26, half %arg27, half %arg28, bfloat %arg29) { +entry: +  call void (i64, i32, ptr, i32, ...) @llvm.experimental.patchpoint.void(i64 11, i32 28, ptr null, i32 5, double %arg0, double %arg1, double %arg2, double %arg3, double %arg4, double %arg5, double %arg6, double %arg7, double %arg8, double %arg9, double %arg10, double %arg11, double %arg12, double %arg13, double %arg14, double %arg15, double %arg16, double %arg17, double %arg18, double %arg19, double %arg20, double %arg21, double %arg22, double %arg23, half %arg24, half %arg25, half %arg26, half %arg27, half %arg28, bfloat %arg29) +  ret void +} diff --git a/llvm/test/CodeGen/RISCV/rv64-stackmap.ll b/llvm/test/CodeGen/RISCV/rv64-stackmap.ll index c50a0fb3..320a3aa 100644 --- a/llvm/test/CodeGen/RISCV/rv64-stackmap.ll +++ b/llvm/test/CodeGen/RISCV/rv64-stackmap.ll @@ -286,8 +286,8 @@ define void @liveConstant() {  ; CHECK-NEXT:   .half   0  ; CHECK-NEXT:   .half   28  ; -; Check that at least one is a spilled entry from RBP. -; Location: Indirect RBP + ... +; Check that at least one is a spilled entry from SP. +; Location: Indirect SP + ...  ; CHECK:        .byte   3  ; CHECK-NEXT:   .byte   0  ; CHECK-NEXT:   .half   8 @@ -307,7 +307,7 @@ entry:  ; CHECK-NEXT:   .half   0  ; 1 location  ; CHECK-NEXT:   .half   1 -; Loc 0: Direct RBP - ofs +; Loc 0: Direct SP + ofs  ; CHECK-NEXT:   .byte   2  ; CHECK-NEXT:   .byte   0  ; CHECK-NEXT:   .half   8 @@ -320,14 +320,14 @@ entry:  ; CHECK-NEXT:   .half   0  ; 2 locations  ; CHECK-NEXT:   .half   2 -; Loc 0: Direct RBP - ofs +; Loc 0: Direct SP + ofs  ; CHECK-NEXT:   .byte   2  ; CHECK-NEXT:   .byte   0  ; CHECK-NEXT:   .half   8  ; CHECK-NEXT:   .half   2  ; CHECK-NEXT:   .half   0  ; CHECK-NEXT:   .word -; Loc 1: Direct RBP - ofs +; Loc 1: Direct SP + ofs  ; CHECK-NEXT:   .byte   2  ; CHECK-NEXT:   .byte   0  ; CHECK-NEXT:   .half   8 diff --git a/llvm/test/CodeGen/RISCV/short-forward-branch-opt-mul.ll b/llvm/test/CodeGen/RISCV/short-forward-branch-opt-mul.ll new file mode 100644 index 0000000..3f780fd --- /dev/null +++ b/llvm/test/CodeGen/RISCV/short-forward-branch-opt-mul.ll @@ -0,0 +1,156 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6 +; RUN: llc < %s -mtriple=riscv32 -mattr=+m | FileCheck %s --check-prefixes=RV32I-M +; RUN: llc < %s -mtriple=riscv64 -mattr=+m | FileCheck %s --check-prefixes=RV64I-M +; RUN: llc < %s -mtriple=riscv32 -mattr=+m,+short-forward-branch-opt | \ +; RUN:   FileCheck %s --check-prefixes=RV32I-SFB-M +; RUN: llc < %s -mtriple=riscv64 -mattr=+m,+short-forward-branch-opt | \ +; RUN:   FileCheck %s --check-prefixes=RV64I-SFB-M +; RUN: llc < %s -mtriple=riscv32 -mattr=+m,+short-forward-branch-i-mul | \ +; RUN:   FileCheck %s --check-prefixes=RV32I-SFBIMul-M +; RUN: llc < %s -mtriple=riscv64 -mattr=+m,+short-forward-branch-i-mul | \ +; RUN:   FileCheck %s --check-prefixes=RV64I-SFBIMul-M + +define i32 @select_example_mul_i32(i32 %a, i32 %b, i1 zeroext %x, i32 
%y) { +; RV32I-M-LABEL: select_example_mul_i32: +; RV32I-M:       # %bb.0: # %entry +; RV32I-M-NEXT:    beqz a2, .LBB0_2 +; RV32I-M-NEXT:  # %bb.1: +; RV32I-M-NEXT:    mul a1, a0, a3 +; RV32I-M-NEXT:  .LBB0_2: # %entry +; RV32I-M-NEXT:    mv a0, a1 +; RV32I-M-NEXT:    ret +; +; RV64I-M-LABEL: select_example_mul_i32: +; RV64I-M:       # %bb.0: # %entry +; RV64I-M-NEXT:    beqz a2, .LBB0_2 +; RV64I-M-NEXT:  # %bb.1: +; RV64I-M-NEXT:    mulw a1, a0, a3 +; RV64I-M-NEXT:  .LBB0_2: # %entry +; RV64I-M-NEXT:    mv a0, a1 +; RV64I-M-NEXT:    ret +; +; RV32I-SFB-M-LABEL: select_example_mul_i32: +; RV32I-SFB-M:       # %bb.0: # %entry +; RV32I-SFB-M-NEXT:    mul a0, a0, a3 +; RV32I-SFB-M-NEXT:    bnez a2, .LBB0_2 +; RV32I-SFB-M-NEXT:  # %bb.1: # %entry +; RV32I-SFB-M-NEXT:    mv a0, a1 +; RV32I-SFB-M-NEXT:  .LBB0_2: # %entry +; RV32I-SFB-M-NEXT:    ret +; +; RV64I-SFB-M-LABEL: select_example_mul_i32: +; RV64I-SFB-M:       # %bb.0: # %entry +; RV64I-SFB-M-NEXT:    mulw a0, a0, a3 +; RV64I-SFB-M-NEXT:    bnez a2, .LBB0_2 +; RV64I-SFB-M-NEXT:  # %bb.1: # %entry +; RV64I-SFB-M-NEXT:    mv a0, a1 +; RV64I-SFB-M-NEXT:  .LBB0_2: # %entry +; RV64I-SFB-M-NEXT:    ret +; +; RV32I-SFBIMul-M-LABEL: select_example_mul_i32: +; RV32I-SFBIMul-M:       # %bb.0: # %entry +; RV32I-SFBIMul-M-NEXT:    beqz a2, .LBB0_2 +; RV32I-SFBIMul-M-NEXT:  # %bb.1: # %entry +; RV32I-SFBIMul-M-NEXT:    mul a1, a0, a3 +; RV32I-SFBIMul-M-NEXT:  .LBB0_2: # %entry +; RV32I-SFBIMul-M-NEXT:    mv a0, a1 +; RV32I-SFBIMul-M-NEXT:    ret +; +; RV64I-SFBIMul-M-LABEL: select_example_mul_i32: +; RV64I-SFBIMul-M:       # %bb.0: # %entry +; RV64I-SFBIMul-M-NEXT:    mulw a0, a0, a3 +; RV64I-SFBIMul-M-NEXT:    bnez a2, .LBB0_2 +; RV64I-SFBIMul-M-NEXT:  # %bb.1: # %entry +; RV64I-SFBIMul-M-NEXT:    mv a0, a1 +; RV64I-SFBIMul-M-NEXT:  .LBB0_2: # %entry +; RV64I-SFBIMul-M-NEXT:    ret +entry: +  %res = mul i32 %a, %y +  %sel = select i1 %x, i32 %res, i32 %b +  ret i32 %sel +} + +define i64 @select_example_mul_i64(i64 %a, i64 %b, i1 zeroext %x, i64 %y) { +; RV32I-M-LABEL: select_example_mul_i64: +; RV32I-M:       # %bb.0: # %entry +; RV32I-M-NEXT:    beqz a4, .LBB1_2 +; RV32I-M-NEXT:  # %bb.1: +; RV32I-M-NEXT:    mul a2, a0, a6 +; RV32I-M-NEXT:    mulhu a3, a0, a5 +; RV32I-M-NEXT:    mul a1, a1, a5 +; RV32I-M-NEXT:    add a2, a3, a2 +; RV32I-M-NEXT:    add a3, a2, a1 +; RV32I-M-NEXT:    mul a2, a0, a5 +; RV32I-M-NEXT:  .LBB1_2: # %entry +; RV32I-M-NEXT:    mv a0, a2 +; RV32I-M-NEXT:    mv a1, a3 +; RV32I-M-NEXT:    ret +; +; RV64I-M-LABEL: select_example_mul_i64: +; RV64I-M:       # %bb.0: # %entry +; RV64I-M-NEXT:    beqz a2, .LBB1_2 +; RV64I-M-NEXT:  # %bb.1: +; RV64I-M-NEXT:    mul a1, a0, a3 +; RV64I-M-NEXT:  .LBB1_2: # %entry +; RV64I-M-NEXT:    mv a0, a1 +; RV64I-M-NEXT:    ret +; +; RV32I-SFB-M-LABEL: select_example_mul_i64: +; RV32I-SFB-M:       # %bb.0: # %entry +; RV32I-SFB-M-NEXT:    mul a6, a0, a6 +; RV32I-SFB-M-NEXT:    mulhu a7, a0, a5 +; RV32I-SFB-M-NEXT:    mul a1, a1, a5 +; RV32I-SFB-M-NEXT:    mul a0, a0, a5 +; RV32I-SFB-M-NEXT:    add a6, a7, a6 +; RV32I-SFB-M-NEXT:    beqz a4, .LBB1_2 +; RV32I-SFB-M-NEXT:  # %bb.1: # %entry +; RV32I-SFB-M-NEXT:    add a3, a6, a1 +; RV32I-SFB-M-NEXT:  .LBB1_2: # %entry +; RV32I-SFB-M-NEXT:    bnez a4, .LBB1_4 +; RV32I-SFB-M-NEXT:  # %bb.3: # %entry +; RV32I-SFB-M-NEXT:    mv a0, a2 +; RV32I-SFB-M-NEXT:  .LBB1_4: # %entry +; RV32I-SFB-M-NEXT:    mv a1, a3 +; RV32I-SFB-M-NEXT:    ret +; +; RV64I-SFB-M-LABEL: select_example_mul_i64: +; RV64I-SFB-M:       # %bb.0: # %entry +; RV64I-SFB-M-NEXT:    mul a0, 
a0, a3 +; RV64I-SFB-M-NEXT:    bnez a2, .LBB1_2 +; RV64I-SFB-M-NEXT:  # %bb.1: # %entry +; RV64I-SFB-M-NEXT:    mv a0, a1 +; RV64I-SFB-M-NEXT:  .LBB1_2: # %entry +; RV64I-SFB-M-NEXT:    ret +; +; RV32I-SFBIMul-M-LABEL: select_example_mul_i64: +; RV32I-SFBIMul-M:       # %bb.0: # %entry +; RV32I-SFBIMul-M-NEXT:    mul a6, a0, a6 +; RV32I-SFBIMul-M-NEXT:    mulhu a7, a0, a5 +; RV32I-SFBIMul-M-NEXT:    mul a1, a1, a5 +; RV32I-SFBIMul-M-NEXT:    add a6, a7, a6 +; RV32I-SFBIMul-M-NEXT:    beqz a4, .LBB1_2 +; RV32I-SFBIMul-M-NEXT:  # %bb.1: # %entry +; RV32I-SFBIMul-M-NEXT:    add a3, a6, a1 +; RV32I-SFBIMul-M-NEXT:  .LBB1_2: # %entry +; RV32I-SFBIMul-M-NEXT:    beqz a4, .LBB1_4 +; RV32I-SFBIMul-M-NEXT:  # %bb.3: # %entry +; RV32I-SFBIMul-M-NEXT:    mul a2, a0, a5 +; RV32I-SFBIMul-M-NEXT:  .LBB1_4: # %entry +; RV32I-SFBIMul-M-NEXT:    mv a0, a2 +; RV32I-SFBIMul-M-NEXT:    mv a1, a3 +; RV32I-SFBIMul-M-NEXT:    ret +; +; RV64I-SFBIMul-M-LABEL: select_example_mul_i64: +; RV64I-SFBIMul-M:       # %bb.0: # %entry +; RV64I-SFBIMul-M-NEXT:    beqz a2, .LBB1_2 +; RV64I-SFBIMul-M-NEXT:  # %bb.1: # %entry +; RV64I-SFBIMul-M-NEXT:    mul a1, a0, a3 +; RV64I-SFBIMul-M-NEXT:  .LBB1_2: # %entry +; RV64I-SFBIMul-M-NEXT:    mv a0, a1 +; RV64I-SFBIMul-M-NEXT:    ret +entry: +  %res = mul i64 %a, %y +  %sel = select i1 %x, i64 %res, i64 %b +  ret i64 %sel +} + diff --git a/llvm/test/CodeGen/SPIRV/pointers/composite-fun-fix-ptr-arg.ll b/llvm/test/CodeGen/SPIRV/pointers/composite-fun-fix-ptr-arg.ll index 73c46b1..c9b2968 100644 --- a/llvm/test/CodeGen/SPIRV/pointers/composite-fun-fix-ptr-arg.ll +++ b/llvm/test/CodeGen/SPIRV/pointers/composite-fun-fix-ptr-arg.ll @@ -10,6 +10,7 @@  ; CHECK-DAG: %[[#Int8:]] = OpTypeInt 8 0  ; CHECK-DAG: %[[#Half:]] = OpTypeFloat 16 +; CHECK-DAG: %[[#Float:]] = OpTypeFloat 32  ; CHECK-DAG: %[[#Struct:]] = OpTypeStruct %[[#Half]]  ; CHECK-DAG: %[[#Void:]] = OpTypeVoid  ; CHECK-DAG: %[[#PtrInt8:]] = OpTypePointer CrossWorkgroup %[[#Int8:]] @@ -17,12 +18,20 @@  ; CHECK-DAG: %[[#Int64:]] = OpTypeInt 64 0  ; CHECK-DAG: %[[#PtrInt64:]] = OpTypePointer CrossWorkgroup %[[#Int64]]  ; CHECK-DAG: %[[#BarType:]] = OpTypeFunction %[[#Void]] %[[#PtrInt64]] %[[#Struct]] +; CHECK-DAG: %[[#BazType:]] = OpTypeFunction %[[#Void]] %[[#PtrInt8]] %[[#Struct]] %[[#Int8]] %[[#Struct]] %[[#Float]] %[[#Struct]]  ; CHECK: OpFunction %[[#Void]] None %[[#FooType]]  ; CHECK: OpFunctionParameter %[[#PtrInt8]]  ; CHECK: OpFunctionParameter %[[#Struct]]  ; CHECK: OpFunction %[[#Void]] None %[[#BarType]]  ; CHECK: OpFunctionParameter %[[#PtrInt64]]  ; CHECK: OpFunctionParameter %[[#Struct]] +; CHECK: OpFunction %[[#Void]] None %[[#BazType]] +; CHECK: OpFunctionParameter %[[#PtrInt8]] +; CHECK: OpFunctionParameter %[[#Struct]] +; CHECK: OpFunctionParameter %[[#Int8]] +; CHECK: OpFunctionParameter %[[#Struct]] +; CHECK: OpFunctionParameter %[[#Float]] +; CHECK: OpFunctionParameter %[[#Struct]]  %t_half = type { half } @@ -38,4 +47,9 @@ entry:    ret void  } +define spir_kernel void @baz(ptr addrspace(1) %a, %t_half %b, i8 %c, %t_half %d, float %e, %t_half %f) { +entry: +  ret void +} +  declare spir_func %t_half @_Z29__spirv_SpecConstantComposite(half) diff --git a/llvm/test/CodeGen/X86/bittest-big-integer.ll b/llvm/test/CodeGen/X86/bittest-big-integer.ll index 8007d9d..040ae65 100644 --- a/llvm/test/CodeGen/X86/bittest-big-integer.ll +++ b/llvm/test/CodeGen/X86/bittest-big-integer.ll @@ -203,24 +203,14 @@ define i1 @init_eq_i32(ptr %word, i32 %position, i1 zeroext %value) nounwind {  define i1 @test_ne_i64(ptr %word, 
i32 %position) nounwind {  ; X86-LABEL: test_ne_i64:  ; X86:       # %bb.0: -; X86-NEXT:    pushl %esi  ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax -; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx -; X86-NEXT:    movl $1, %edx -; X86-NEXT:    xorl %esi, %esi -; X86-NEXT:    shldl %cl, %edx, %esi -; X86-NEXT:    shll %cl, %edx -; X86-NEXT:    testb $32, %cl -; X86-NEXT:    je .LBB5_2 -; X86-NEXT:  # %bb.1: -; X86-NEXT:    movl %edx, %esi -; X86-NEXT:    xorl %edx, %edx -; X86-NEXT:  .LBB5_2: -; X86-NEXT:    andl 4(%eax), %esi -; X86-NEXT:    andl (%eax), %edx -; X86-NEXT:    orl %esi, %edx -; X86-NEXT:    setne %al -; X86-NEXT:    popl %esi +; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT:    movl %ecx, %edx +; X86-NEXT:    andl $32, %edx +; X86-NEXT:    shrl $3, %edx +; X86-NEXT:    movl (%eax,%edx), %eax +; X86-NEXT:    btl %ecx, %eax +; X86-NEXT:    setb %al  ; X86-NEXT:    retl  ;  ; X64-LABEL: test_ne_i64: @@ -242,38 +232,20 @@ define i1 @test_ne_i64(ptr %word, i32 %position) nounwind {  define i1 @complement_ne_i64(ptr %word, i32 %position) nounwind {  ; X86-LABEL: complement_ne_i64:  ; X86:       # %bb.0: -; X86-NEXT:    pushl %ebp -; X86-NEXT:    pushl %ebx  ; X86-NEXT:    pushl %edi  ; X86-NEXT:    pushl %esi +; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx  ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx -; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx -; X86-NEXT:    movl $1, %eax -; X86-NEXT:    xorl %esi, %esi -; X86-NEXT:    shldl %cl, %eax, %esi -; X86-NEXT:    shll %cl, %eax -; X86-NEXT:    testb $32, %cl -; X86-NEXT:    je .LBB6_2 -; X86-NEXT:  # %bb.1: -; X86-NEXT:    movl %eax, %esi -; X86-NEXT:    xorl %eax, %eax -; X86-NEXT:  .LBB6_2: -; X86-NEXT:    movl (%edx), %ecx -; X86-NEXT:    movl 4(%edx), %edi -; X86-NEXT:    movl %edi, %ebx -; X86-NEXT:    andl %esi, %ebx -; X86-NEXT:    movl %ecx, %ebp -; X86-NEXT:    andl %eax, %ebp -; X86-NEXT:    xorl %esi, %edi -; X86-NEXT:    xorl %eax, %ecx -; X86-NEXT:    orl %ebx, %ebp -; X86-NEXT:    setne %al -; X86-NEXT:    movl %ecx, (%edx) -; X86-NEXT:    movl %edi, 4(%edx) +; X86-NEXT:    movl %edx, %esi +; X86-NEXT:    andl $32, %esi +; X86-NEXT:    shrl $3, %esi +; X86-NEXT:    movl (%ecx,%esi), %edi +; X86-NEXT:    btl %edx, %edi +; X86-NEXT:    setb %al +; X86-NEXT:    btcl %edx, %edi +; X86-NEXT:    movl %edi, (%ecx,%esi)  ; X86-NEXT:    popl %esi  ; X86-NEXT:    popl %edi -; X86-NEXT:    popl %ebx -; X86-NEXT:    popl %ebp  ; X86-NEXT:    retl  ;  ; X64-LABEL: complement_ne_i64: @@ -300,40 +272,20 @@ define i1 @complement_ne_i64(ptr %word, i32 %position) nounwind {  define i1 @reset_eq_i64(ptr %word, i32 %position) nounwind {  ; X86-LABEL: reset_eq_i64:  ; X86:       # %bb.0: -; X86-NEXT:    pushl %ebp -; X86-NEXT:    pushl %ebx  ; X86-NEXT:    pushl %edi  ; X86-NEXT:    pushl %esi +; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx  ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx -; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx -; X86-NEXT:    movl $1, %esi -; X86-NEXT:    xorl %edi, %edi -; X86-NEXT:    shldl %cl, %esi, %edi -; X86-NEXT:    shll %cl, %esi -; X86-NEXT:    testb $32, %cl -; X86-NEXT:    je .LBB7_2 -; X86-NEXT:  # %bb.1: -; X86-NEXT:    movl %esi, %edi -; X86-NEXT:    xorl %esi, %esi -; X86-NEXT:  .LBB7_2: -; X86-NEXT:    movl (%edx), %eax -; X86-NEXT:    movl 4(%edx), %ecx -; X86-NEXT:    movl %ecx, %ebx -; X86-NEXT:    andl %edi, %ebx -; X86-NEXT:    notl %edi -; X86-NEXT:    movl %eax, %ebp -; X86-NEXT:    andl %esi, %ebp -; X86-NEXT:    notl %esi -; X86-NEXT:    andl %ecx, %edi -; X86-NEXT:    andl %eax, %esi -; X86-NEXT:    orl 
%ebx, %ebp -; X86-NEXT:    sete %al -; X86-NEXT:    movl %esi, (%edx) -; X86-NEXT:    movl %edi, 4(%edx) +; X86-NEXT:    movl %edx, %esi +; X86-NEXT:    andl $32, %esi +; X86-NEXT:    shrl $3, %esi +; X86-NEXT:    movl (%ecx,%esi), %edi +; X86-NEXT:    btl %edx, %edi +; X86-NEXT:    setae %al +; X86-NEXT:    btrl %edx, %edi +; X86-NEXT:    movl %edi, (%ecx,%esi)  ; X86-NEXT:    popl %esi  ; X86-NEXT:    popl %edi -; X86-NEXT:    popl %ebx -; X86-NEXT:    popl %ebp  ; X86-NEXT:    retl  ;  ; X64-LABEL: reset_eq_i64: @@ -361,38 +313,20 @@ define i1 @reset_eq_i64(ptr %word, i32 %position) nounwind {  define i1 @set_ne_i64(ptr %word, i32 %position) nounwind {  ; X86-LABEL: set_ne_i64:  ; X86:       # %bb.0: -; X86-NEXT:    pushl %ebp -; X86-NEXT:    pushl %ebx  ; X86-NEXT:    pushl %edi  ; X86-NEXT:    pushl %esi +; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx  ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx -; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx -; X86-NEXT:    movl $1, %eax -; X86-NEXT:    xorl %esi, %esi -; X86-NEXT:    shldl %cl, %eax, %esi -; X86-NEXT:    shll %cl, %eax -; X86-NEXT:    testb $32, %cl -; X86-NEXT:    je .LBB8_2 -; X86-NEXT:  # %bb.1: -; X86-NEXT:    movl %eax, %esi -; X86-NEXT:    xorl %eax, %eax -; X86-NEXT:  .LBB8_2: -; X86-NEXT:    movl (%edx), %ecx -; X86-NEXT:    movl 4(%edx), %edi -; X86-NEXT:    movl %edi, %ebx -; X86-NEXT:    andl %esi, %ebx -; X86-NEXT:    movl %ecx, %ebp -; X86-NEXT:    andl %eax, %ebp -; X86-NEXT:    orl %esi, %edi -; X86-NEXT:    orl %eax, %ecx -; X86-NEXT:    orl %ebx, %ebp -; X86-NEXT:    setne %al -; X86-NEXT:    movl %ecx, (%edx) -; X86-NEXT:    movl %edi, 4(%edx) +; X86-NEXT:    movl %edx, %esi +; X86-NEXT:    andl $32, %esi +; X86-NEXT:    shrl $3, %esi +; X86-NEXT:    movl (%ecx,%esi), %edi +; X86-NEXT:    btl %edx, %edi +; X86-NEXT:    setb %al +; X86-NEXT:    btsl %edx, %edi +; X86-NEXT:    movl %edi, (%ecx,%esi)  ; X86-NEXT:    popl %esi  ; X86-NEXT:    popl %edi -; X86-NEXT:    popl %ebx -; X86-NEXT:    popl %ebp  ; X86-NEXT:    retl  ;  ; X64-LABEL: set_ne_i64: @@ -419,52 +353,26 @@ define i1 @set_ne_i64(ptr %word, i32 %position) nounwind {  define i1 @init_eq_i64(ptr %word, i32 %position, i1 zeroext %value) nounwind {  ; X86-LABEL: init_eq_i64:  ; X86:       # %bb.0: -; X86-NEXT:    pushl %ebp  ; X86-NEXT:    pushl %ebx  ; X86-NEXT:    pushl %edi  ; X86-NEXT:    pushl %esi -; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx -; X86-NEXT:    movl $1, %eax -; X86-NEXT:    xorl %edx, %edx -; X86-NEXT:    shldl %cl, %eax, %edx -; X86-NEXT:    shll %cl, %eax -; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %esi -; X86-NEXT:    xorl %edi, %edi -; X86-NEXT:    shldl %cl, %esi, %edi -; X86-NEXT:    shll %cl, %esi -; X86-NEXT:    testb $32, %cl -; X86-NEXT:    je .LBB9_2 -; X86-NEXT:  # %bb.1: -; X86-NEXT:    movl %eax, %edx -; X86-NEXT:    movl $0, %eax -; X86-NEXT:  .LBB9_2: -; X86-NEXT:    movl %edx, %ebx -; X86-NEXT:    notl %ebx -; X86-NEXT:    movl %eax, %ebp -; X86-NEXT:    notl %ebp -; X86-NEXT:    je .LBB9_4 -; X86-NEXT:  # %bb.3: -; X86-NEXT:    movl %esi, %edi -; X86-NEXT:    xorl %esi, %esi -; X86-NEXT:  .LBB9_4: +; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx  ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT:    movl 4(%ecx), %ecx -; X86-NEXT:    andl %ecx, %edx -; X86-NEXT:    andl %ecx, %ebx +; X86-NEXT:    movl %ecx, %esi +; X86-NEXT:    andl $32, %esi +; X86-NEXT:    shrl $3, %esi +; X86-NEXT:    movl (%edx,%esi), %edi +; X86-NEXT:    btl %ecx, %edi +; X86-NEXT:    setae %al +; X86-NEXT:    btrl %ecx, %edi +; X86-NEXT:    movzbl 
{{[0-9]+}}(%esp), %ebx +; X86-NEXT:    # kill: def $cl killed $cl killed $ecx +; X86-NEXT:    shll %cl, %ebx  ; X86-NEXT:    orl %edi, %ebx -; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi -; X86-NEXT:    movl (%edi), %ecx -; X86-NEXT:    andl %ecx, %eax -; X86-NEXT:    andl %ecx, %ebp -; X86-NEXT:    orl %esi, %ebp -; X86-NEXT:    orl %edx, %eax -; X86-NEXT:    movl %ebp, (%edi) -; X86-NEXT:    movl %ebx, 4(%edi) -; X86-NEXT:    sete %al +; X86-NEXT:    movl %ebx, (%edx,%esi)  ; X86-NEXT:    popl %esi  ; X86-NEXT:    popl %edi  ; X86-NEXT:    popl %ebx -; X86-NEXT:    popl %ebp  ; X86-NEXT:    retl  ;  ; SSE-LABEL: init_eq_i64: @@ -516,101 +424,25 @@ define i1 @init_eq_i64(ptr %word, i32 %position, i1 zeroext %value) nounwind {  define i1 @test_ne_i128(ptr %word, i32 %position) nounwind {  ; X86-LABEL: test_ne_i128:  ; X86:       # %bb.0: -; X86-NEXT:    pushl %ebp -; X86-NEXT:    movl %esp, %ebp -; X86-NEXT:    pushl %ebx -; X86-NEXT:    pushl %edi -; X86-NEXT:    pushl %esi -; X86-NEXT:    andl $-16, %esp -; X86-NEXT:    subl $48, %esp -; X86-NEXT:    movzbl 12(%ebp), %ecx -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $1, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, (%esp) -; X86-NEXT:    movl %ecx, %eax -; X86-NEXT:    shrb $3, %al -; X86-NEXT:    andb $12, %al -; X86-NEXT:    negb %al -; X86-NEXT:    movsbl %al, %esi -; X86-NEXT:    movl 24(%esp,%esi), %edi -; X86-NEXT:    movl 28(%esp,%esi), %eax -; X86-NEXT:    shldl %cl, %edi, %eax -; X86-NEXT:    movl 16(%esp,%esi), %edx -; X86-NEXT:    movl 20(%esp,%esi), %esi -; X86-NEXT:    shldl %cl, %esi, %edi -; X86-NEXT:    shldl %cl, %edx, %esi -; X86-NEXT:    movl 8(%ebp), %ebx -; X86-NEXT:    shll %cl, %edx -; X86-NEXT:    andl 8(%ebx), %edi -; X86-NEXT:    andl (%ebx), %edx -; X86-NEXT:    orl %edi, %edx -; X86-NEXT:    andl 12(%ebx), %eax -; X86-NEXT:    andl 4(%ebx), %esi -; X86-NEXT:    orl %eax, %esi -; X86-NEXT:    orl %edx, %esi -; X86-NEXT:    setne %al -; X86-NEXT:    leal -12(%ebp), %esp -; X86-NEXT:    popl %esi -; X86-NEXT:    popl %edi -; X86-NEXT:    popl %ebx -; X86-NEXT:    popl %ebp +; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax +; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT:    movl %ecx, %edx +; X86-NEXT:    andl $96, %edx +; X86-NEXT:    shrl $3, %edx +; X86-NEXT:    movl (%eax,%edx), %eax +; X86-NEXT:    btl %ecx, %eax +; X86-NEXT:    setb %al  ; X86-NEXT:    retl  ; -; SSE-LABEL: test_ne_i128: -; SSE:       # %bb.0: -; SSE-NEXT:    movl %esi, %ecx -; SSE-NEXT:    movl $1, %eax -; SSE-NEXT:    xorl %edx, %edx -; SSE-NEXT:    shldq %cl, %rax, %rdx -; SSE-NEXT:    xorl %esi, %esi -; SSE-NEXT:    shlq %cl, %rax -; SSE-NEXT:    testb $64, %cl -; SSE-NEXT:    cmovneq %rax, %rdx -; SSE-NEXT:    cmovneq %rsi, %rax -; SSE-NEXT:    andq 8(%rdi), %rdx -; SSE-NEXT:    andq (%rdi), %rax -; SSE-NEXT:    orq %rdx, %rax -; SSE-NEXT:    setne %al -; SSE-NEXT:    retq -; -; AVX2-LABEL: test_ne_i128: -; AVX2:       # %bb.0: -; AVX2-NEXT:    movl %esi, %ecx -; AVX2-NEXT:    xorl %eax, %eax -; AVX2-NEXT:    movl $1, %edx -; AVX2-NEXT:    xorl %esi, %esi -; AVX2-NEXT:    shldq %cl, %rdx, %rsi -; AVX2-NEXT:    shlxq %rcx, %rdx, %rdx -; AVX2-NEXT:    testb $64, %cl -; AVX2-NEXT:    cmovneq %rdx, %rsi -; AVX2-NEXT:    cmovneq %rax, %rdx -; AVX2-NEXT:    andq 8(%rdi), %rsi -; AVX2-NEXT:    andq (%rdi), %rdx -; AVX2-NEXT: 
   orq %rsi, %rdx -; AVX2-NEXT:    setne %al -; AVX2-NEXT:    retq -; -; AVX512-LABEL: test_ne_i128: -; AVX512:       # %bb.0: -; AVX512-NEXT:    movl %esi, %ecx -; AVX512-NEXT:    movl $1, %eax -; AVX512-NEXT:    xorl %edx, %edx -; AVX512-NEXT:    shldq %cl, %rax, %rdx -; AVX512-NEXT:    xorl %esi, %esi -; AVX512-NEXT:    shlxq %rcx, %rax, %rax -; AVX512-NEXT:    testb $64, %cl -; AVX512-NEXT:    cmovneq %rax, %rdx -; AVX512-NEXT:    cmovneq %rsi, %rax -; AVX512-NEXT:    andq 8(%rdi), %rdx -; AVX512-NEXT:    andq (%rdi), %rax -; AVX512-NEXT:    orq %rdx, %rax -; AVX512-NEXT:    setne %al -; AVX512-NEXT:    retq +; X64-LABEL: test_ne_i128: +; X64:       # %bb.0: +; X64-NEXT:    movl %esi, %eax +; X64-NEXT:    andl $96, %eax +; X64-NEXT:    shrl $3, %eax +; X64-NEXT:    movl (%rdi,%rax), %eax +; X64-NEXT:    btl %esi, %eax +; X64-NEXT:    setb %al +; X64-NEXT:    retq    %rem = and i32 %position, 127    %ofs = zext nneg i32 %rem to i128    %bit = shl nuw i128 1, %ofs @@ -623,124 +455,33 @@ define i1 @test_ne_i128(ptr %word, i32 %position) nounwind {  define i1 @complement_ne_i128(ptr %word, i32 %position) nounwind {  ; X86-LABEL: complement_ne_i128:  ; X86:       # %bb.0: -; X86-NEXT:    pushl %ebp -; X86-NEXT:    movl %esp, %ebp -; X86-NEXT:    pushl %ebx  ; X86-NEXT:    pushl %edi  ; X86-NEXT:    pushl %esi -; X86-NEXT:    andl $-16, %esp -; X86-NEXT:    subl $80, %esp -; X86-NEXT:    movzbl 12(%ebp), %ecx -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $1, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl %ecx, %eax -; X86-NEXT:    shrb $3, %al -; X86-NEXT:    andb $12, %al -; X86-NEXT:    negb %al -; X86-NEXT:    movsbl %al, %eax -; X86-NEXT:    movl 56(%esp,%eax), %esi -; X86-NEXT:    movl 60(%esp,%eax), %edx -; X86-NEXT:    shldl %cl, %esi, %edx -; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl 48(%esp,%eax), %edi -; X86-NEXT:    movl 52(%esp,%eax), %ebx -; X86-NEXT:    shldl %cl, %ebx, %esi -; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    shldl %cl, %edi, %ebx -; X86-NEXT:    movl 8(%ebp), %eax -; X86-NEXT:    shll %cl, %edi -; X86-NEXT:    movl %eax, %ecx -; X86-NEXT:    movl 8(%eax), %eax -; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    andl %esi, %eax -; X86-NEXT:    movl (%ecx), %edx -; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl %ecx, %esi -; X86-NEXT:    movl %edx, %ecx -; X86-NEXT:    andl %edi, %ecx -; X86-NEXT:    orl %eax, %ecx -; X86-NEXT:    movl 12(%esi), %edx -; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    andl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload -; X86-NEXT:    movl 4(%esi), %eax -; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    andl %ebx, %eax -; X86-NEXT:    orl %edx, %eax -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X86-NEXT:    xorl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT:    xorl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload -; X86-NEXT:    xorl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload -; X86-NEXT:    xorl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte 
Folded Reload -; X86-NEXT:    orl %ecx, %eax -; X86-NEXT:    movl 8(%ebp), %eax -; X86-NEXT:    movl %edx, 8(%eax) -; X86-NEXT:    movl %esi, 12(%eax) -; X86-NEXT:    movl %edi, (%eax) -; X86-NEXT:    movl %ebx, 4(%eax) -; X86-NEXT:    setne %al -; X86-NEXT:    leal -12(%ebp), %esp +; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx +; X86-NEXT:    movl %edx, %esi +; X86-NEXT:    andl $96, %esi +; X86-NEXT:    shrl $3, %esi +; X86-NEXT:    movl (%ecx,%esi), %edi +; X86-NEXT:    btl %edx, %edi +; X86-NEXT:    setb %al +; X86-NEXT:    btcl %edx, %edi +; X86-NEXT:    movl %edi, (%ecx,%esi)  ; X86-NEXT:    popl %esi  ; X86-NEXT:    popl %edi -; X86-NEXT:    popl %ebx -; X86-NEXT:    popl %ebp  ; X86-NEXT:    retl  ; -; SSE-LABEL: complement_ne_i128: -; SSE:       # %bb.0: -; SSE-NEXT:    movl %esi, %ecx -; SSE-NEXT:    movl $1, %edx -; SSE-NEXT:    xorl %esi, %esi -; SSE-NEXT:    shldq %cl, %rdx, %rsi -; SSE-NEXT:    shlq %cl, %rdx -; SSE-NEXT:    xorl %eax, %eax -; SSE-NEXT:    testb $64, %cl -; SSE-NEXT:    cmovneq %rdx, %rsi -; SSE-NEXT:    cmovneq %rax, %rdx -; SSE-NEXT:    movq (%rdi), %rax -; SSE-NEXT:    movq 8(%rdi), %rcx -; SSE-NEXT:    movq %rcx, %r8 -; SSE-NEXT:    andq %rsi, %r8 -; SSE-NEXT:    movq %rax, %r9 -; SSE-NEXT:    andq %rdx, %r9 -; SSE-NEXT:    xorq %rcx, %rsi -; SSE-NEXT:    xorq %rax, %rdx -; SSE-NEXT:    orq %r8, %r9 -; SSE-NEXT:    setne %al -; SSE-NEXT:    movq %rdx, (%rdi) -; SSE-NEXT:    movq %rsi, 8(%rdi) -; SSE-NEXT:    retq -; -; AVX-LABEL: complement_ne_i128: -; AVX:       # %bb.0: -; AVX-NEXT:    movl %esi, %ecx -; AVX-NEXT:    xorl %eax, %eax -; AVX-NEXT:    movl $1, %edx -; AVX-NEXT:    xorl %esi, %esi -; AVX-NEXT:    shldq %cl, %rdx, %rsi -; AVX-NEXT:    shlxq %rcx, %rdx, %rdx -; AVX-NEXT:    testb $64, %cl -; AVX-NEXT:    cmovneq %rdx, %rsi -; AVX-NEXT:    cmovneq %rax, %rdx -; AVX-NEXT:    movq (%rdi), %rax -; AVX-NEXT:    movq 8(%rdi), %rcx -; AVX-NEXT:    movq %rcx, %r8 -; AVX-NEXT:    andq %rsi, %r8 -; AVX-NEXT:    movq %rax, %r9 -; AVX-NEXT:    andq %rdx, %r9 -; AVX-NEXT:    xorq %rcx, %rsi -; AVX-NEXT:    xorq %rax, %rdx -; AVX-NEXT:    orq %r8, %r9 -; AVX-NEXT:    setne %al -; AVX-NEXT:    movq %rdx, (%rdi) -; AVX-NEXT:    movq %rsi, 8(%rdi) -; AVX-NEXT:    retq +; X64-LABEL: complement_ne_i128: +; X64:       # %bb.0: +; X64-NEXT:    movl %esi, %ecx +; X64-NEXT:    andl $96, %ecx +; X64-NEXT:    shrl $3, %ecx +; X64-NEXT:    movl (%rdi,%rcx), %edx +; X64-NEXT:    btl %esi, %edx +; X64-NEXT:    setb %al +; X64-NEXT:    btcl %esi, %edx +; X64-NEXT:    movl %edx, (%rdi,%rcx) +; X64-NEXT:    retq    %rem = and i32 %position, 127    %ofs = zext nneg i32 %rem to i128    %bit = shl nuw i128 1, %ofs @@ -755,124 +496,33 @@ define i1 @complement_ne_i128(ptr %word, i32 %position) nounwind {  define i1 @reset_eq_i128(ptr %word, i32 %position) nounwind {  ; X86-LABEL: reset_eq_i128:  ; X86:       # %bb.0: -; X86-NEXT:    pushl %ebp -; X86-NEXT:    movl %esp, %ebp -; X86-NEXT:    pushl %ebx  ; X86-NEXT:    pushl %edi  ; X86-NEXT:    pushl %esi -; X86-NEXT:    andl $-16, %esp -; X86-NEXT:    subl $80, %esp -; X86-NEXT:    movzbl 12(%ebp), %ecx -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $1, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl %ecx, %eax -; X86-NEXT:    shrb 
$3, %al -; X86-NEXT:    andb $12, %al -; X86-NEXT:    negb %al -; X86-NEXT:    movsbl %al, %eax -; X86-NEXT:    movl 56(%esp,%eax), %edx -; X86-NEXT:    movl 60(%esp,%eax), %esi -; X86-NEXT:    shldl %cl, %edx, %esi -; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl 48(%esp,%eax), %esi -; X86-NEXT:    movl 52(%esp,%eax), %edi -; X86-NEXT:    shldl %cl, %edi, %edx -; X86-NEXT:    shldl %cl, %esi, %edi -; X86-NEXT:    movl 8(%ebp), %ebx -; X86-NEXT:    shll %cl, %esi -; X86-NEXT:    movl 8(%ebx), %eax -; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    andl %edx, %eax -; X86-NEXT:    movl (%ebx), %ecx -; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    andl %esi, %ecx -; X86-NEXT:    orl %eax, %ecx -; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl 12(%ebx), %eax -; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    andl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload -; X86-NEXT:    movl %edi, %ecx -; X86-NEXT:    movl 4(%ebx), %ebx -; X86-NEXT:    andl %ebx, %edi -; X86-NEXT:    orl %eax, %edi -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT:    notl %eax -; X86-NEXT:    andl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload -; X86-NEXT:    notl %edx -; X86-NEXT:    andl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload -; X86-NEXT:    notl %ecx -; X86-NEXT:    andl %ebx, %ecx -; X86-NEXT:    notl %esi -; X86-NEXT:    andl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload -; X86-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload -; X86-NEXT:    movl 8(%ebp), %edi -; X86-NEXT:    movl %edx, 8(%edi) -; X86-NEXT:    movl %eax, 12(%edi) -; X86-NEXT:    movl %esi, (%edi) -; X86-NEXT:    movl %ecx, 4(%edi) -; X86-NEXT:    sete %al -; X86-NEXT:    leal -12(%ebp), %esp +; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx +; X86-NEXT:    movl %edx, %esi +; X86-NEXT:    andl $96, %esi +; X86-NEXT:    shrl $3, %esi +; X86-NEXT:    movl (%ecx,%esi), %edi +; X86-NEXT:    btl %edx, %edi +; X86-NEXT:    setae %al +; X86-NEXT:    btrl %edx, %edi +; X86-NEXT:    movl %edi, (%ecx,%esi)  ; X86-NEXT:    popl %esi  ; X86-NEXT:    popl %edi -; X86-NEXT:    popl %ebx -; X86-NEXT:    popl %ebp  ; X86-NEXT:    retl  ; -; SSE-LABEL: reset_eq_i128: -; SSE:       # %bb.0: -; SSE-NEXT:    movl %esi, %ecx -; SSE-NEXT:    movl $1, %edx -; SSE-NEXT:    xorl %esi, %esi -; SSE-NEXT:    shldq %cl, %rdx, %rsi -; SSE-NEXT:    xorl %eax, %eax -; SSE-NEXT:    shlq %cl, %rdx -; SSE-NEXT:    testb $64, %cl -; SSE-NEXT:    cmovneq %rdx, %rsi -; SSE-NEXT:    cmovneq %rax, %rdx -; SSE-NEXT:    movq (%rdi), %rax -; SSE-NEXT:    movq 8(%rdi), %rcx -; SSE-NEXT:    movq %rcx, %r8 -; SSE-NEXT:    andq %rsi, %r8 -; SSE-NEXT:    notq %rsi -; SSE-NEXT:    movq %rax, %r9 -; SSE-NEXT:    andq %rdx, %r9 -; SSE-NEXT:    notq %rdx -; SSE-NEXT:    andq %rcx, %rsi -; SSE-NEXT:    andq %rax, %rdx -; SSE-NEXT:    orq %r8, %r9 -; SSE-NEXT:    sete %al -; SSE-NEXT:    movq %rdx, (%rdi) -; SSE-NEXT:    movq %rsi, 8(%rdi) -; SSE-NEXT:    retq -; -; AVX-LABEL: reset_eq_i128: -; AVX:       # %bb.0: -; AVX-NEXT:    movl %esi, %ecx -; AVX-NEXT:    xorl %eax, %eax -; AVX-NEXT:    movl $1, %edx -; AVX-NEXT:    xorl %esi, %esi -; AVX-NEXT:    shldq %cl, %rdx, %rsi -; AVX-NEXT:    shlxq %rcx, %rdx, %rdx -; AVX-NEXT:    testb $64, %cl -; AVX-NEXT:    cmovneq %rdx, %rsi -; AVX-NEXT:    cmovneq %rax, 
%rdx -; AVX-NEXT:    movq (%rdi), %rax -; AVX-NEXT:    movq 8(%rdi), %rcx -; AVX-NEXT:    andnq %rcx, %rsi, %r8 -; AVX-NEXT:    andq %rsi, %rcx -; AVX-NEXT:    andnq %rax, %rdx, %rsi -; AVX-NEXT:    andq %rdx, %rax -; AVX-NEXT:    orq %rcx, %rax -; AVX-NEXT:    sete %al -; AVX-NEXT:    movq %rsi, (%rdi) -; AVX-NEXT:    movq %r8, 8(%rdi) -; AVX-NEXT:    retq +; X64-LABEL: reset_eq_i128: +; X64:       # %bb.0: +; X64-NEXT:    movl %esi, %ecx +; X64-NEXT:    andl $96, %ecx +; X64-NEXT:    shrl $3, %ecx +; X64-NEXT:    movl (%rdi,%rcx), %edx +; X64-NEXT:    btl %esi, %edx +; X64-NEXT:    setae %al +; X64-NEXT:    btrl %esi, %edx +; X64-NEXT:    movl %edx, (%rdi,%rcx) +; X64-NEXT:    retq    %rem = and i32 %position, 127    %ofs = zext nneg i32 %rem to i128    %bit = shl nuw i128 1, %ofs @@ -888,124 +538,33 @@ define i1 @reset_eq_i128(ptr %word, i32 %position) nounwind {  define i1 @set_ne_i128(ptr %word, i32 %position) nounwind {  ; X86-LABEL: set_ne_i128:  ; X86:       # %bb.0: -; X86-NEXT:    pushl %ebp -; X86-NEXT:    movl %esp, %ebp -; X86-NEXT:    pushl %ebx  ; X86-NEXT:    pushl %edi  ; X86-NEXT:    pushl %esi -; X86-NEXT:    andl $-16, %esp -; X86-NEXT:    subl $80, %esp -; X86-NEXT:    movzbl 12(%ebp), %ecx -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $1, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl %ecx, %eax -; X86-NEXT:    shrb $3, %al -; X86-NEXT:    andb $12, %al -; X86-NEXT:    negb %al -; X86-NEXT:    movsbl %al, %eax -; X86-NEXT:    movl 56(%esp,%eax), %esi -; X86-NEXT:    movl 60(%esp,%eax), %edx -; X86-NEXT:    shldl %cl, %esi, %edx -; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl 48(%esp,%eax), %edi -; X86-NEXT:    movl 52(%esp,%eax), %ebx -; X86-NEXT:    shldl %cl, %ebx, %esi -; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    shldl %cl, %edi, %ebx -; X86-NEXT:    movl 8(%ebp), %eax -; X86-NEXT:    shll %cl, %edi -; X86-NEXT:    movl %eax, %ecx -; X86-NEXT:    movl 8(%eax), %eax -; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    andl %esi, %eax -; X86-NEXT:    movl (%ecx), %edx -; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl %ecx, %esi -; X86-NEXT:    movl %edx, %ecx -; X86-NEXT:    andl %edi, %ecx -; X86-NEXT:    orl %eax, %ecx -; X86-NEXT:    movl 12(%esi), %edx -; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    andl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload -; X86-NEXT:    movl 4(%esi), %eax -; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    andl %ebx, %eax -; X86-NEXT:    orl %edx, %eax -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X86-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload -; X86-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload -; X86-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload -; X86-NEXT:    orl %ecx, %eax -; X86-NEXT:    movl 8(%ebp), %eax -; X86-NEXT:    movl %edx, 8(%eax) -; X86-NEXT:    movl %esi, 12(%eax) -; X86-NEXT:    movl %edi, (%eax) -; X86-NEXT:    movl 
%ebx, 4(%eax) -; X86-NEXT:    setne %al -; X86-NEXT:    leal -12(%ebp), %esp +; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx +; X86-NEXT:    movl %edx, %esi +; X86-NEXT:    andl $96, %esi +; X86-NEXT:    shrl $3, %esi +; X86-NEXT:    movl (%ecx,%esi), %edi +; X86-NEXT:    btl %edx, %edi +; X86-NEXT:    setb %al +; X86-NEXT:    btsl %edx, %edi +; X86-NEXT:    movl %edi, (%ecx,%esi)  ; X86-NEXT:    popl %esi  ; X86-NEXT:    popl %edi -; X86-NEXT:    popl %ebx -; X86-NEXT:    popl %ebp  ; X86-NEXT:    retl  ; -; SSE-LABEL: set_ne_i128: -; SSE:       # %bb.0: -; SSE-NEXT:    movl %esi, %ecx -; SSE-NEXT:    movl $1, %edx -; SSE-NEXT:    xorl %esi, %esi -; SSE-NEXT:    shldq %cl, %rdx, %rsi -; SSE-NEXT:    shlq %cl, %rdx -; SSE-NEXT:    xorl %eax, %eax -; SSE-NEXT:    testb $64, %cl -; SSE-NEXT:    cmovneq %rdx, %rsi -; SSE-NEXT:    cmovneq %rax, %rdx -; SSE-NEXT:    movq (%rdi), %rax -; SSE-NEXT:    movq 8(%rdi), %rcx -; SSE-NEXT:    movq %rcx, %r8 -; SSE-NEXT:    andq %rsi, %r8 -; SSE-NEXT:    movq %rax, %r9 -; SSE-NEXT:    andq %rdx, %r9 -; SSE-NEXT:    orq %rcx, %rsi -; SSE-NEXT:    orq %rax, %rdx -; SSE-NEXT:    orq %r8, %r9 -; SSE-NEXT:    setne %al -; SSE-NEXT:    movq %rdx, (%rdi) -; SSE-NEXT:    movq %rsi, 8(%rdi) -; SSE-NEXT:    retq -; -; AVX-LABEL: set_ne_i128: -; AVX:       # %bb.0: -; AVX-NEXT:    movl %esi, %ecx -; AVX-NEXT:    xorl %eax, %eax -; AVX-NEXT:    movl $1, %edx -; AVX-NEXT:    xorl %esi, %esi -; AVX-NEXT:    shldq %cl, %rdx, %rsi -; AVX-NEXT:    shlxq %rcx, %rdx, %rdx -; AVX-NEXT:    testb $64, %cl -; AVX-NEXT:    cmovneq %rdx, %rsi -; AVX-NEXT:    cmovneq %rax, %rdx -; AVX-NEXT:    movq (%rdi), %rax -; AVX-NEXT:    movq 8(%rdi), %rcx -; AVX-NEXT:    movq %rcx, %r8 -; AVX-NEXT:    andq %rsi, %r8 -; AVX-NEXT:    movq %rax, %r9 -; AVX-NEXT:    andq %rdx, %r9 -; AVX-NEXT:    orq %rcx, %rsi -; AVX-NEXT:    orq %rax, %rdx -; AVX-NEXT:    orq %r8, %r9 -; AVX-NEXT:    setne %al -; AVX-NEXT:    movq %rdx, (%rdi) -; AVX-NEXT:    movq %rsi, 8(%rdi) -; AVX-NEXT:    retq +; X64-LABEL: set_ne_i128: +; X64:       # %bb.0: +; X64-NEXT:    movl %esi, %ecx +; X64-NEXT:    andl $96, %ecx +; X64-NEXT:    shrl $3, %ecx +; X64-NEXT:    movl (%rdi,%rcx), %edx +; X64-NEXT:    btl %esi, %edx +; X64-NEXT:    setb %al +; X64-NEXT:    btsl %esi, %edx +; X64-NEXT:    movl %edx, (%rdi,%rcx) +; X64-NEXT:    retq    %rem = and i32 %position, 127    %ofs = zext nneg i32 %rem to i128    %bit = shl nuw i128 1, %ofs @@ -1020,218 +579,55 @@ define i1 @set_ne_i128(ptr %word, i32 %position) nounwind {  define i1 @init_eq_i128(ptr %word, i32 %position, i1 zeroext %value) nounwind {  ; X86-LABEL: init_eq_i128:  ; X86:       # %bb.0: -; X86-NEXT:    pushl %ebp -; X86-NEXT:    movl %esp, %ebp  ; X86-NEXT:    pushl %ebx  ; X86-NEXT:    pushl %edi  ; X86-NEXT:    pushl %esi -; X86-NEXT:    andl $-16, %esp -; X86-NEXT:    subl $128, %esp -; X86-NEXT:    movzbl 12(%ebp), %ecx -; X86-NEXT:    movzbl 16(%ebp), %eax -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $1, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl %ecx, %edx -; X86-NEXT:    shrb $3, %dl -; X86-NEXT:    andb $12, %dl -; X86-NEXT:    negb %dl -; X86-NEXT:    movsbl %dl, %esi -; X86-NEXT:    movl 64(%esp,%esi), %edx -; X86-NEXT:    movl %edx, 
{{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl 68(%esp,%esi), %edx -; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl 72(%esp,%esi), %ebx -; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movzbl %al, %eax -; X86-NEXT:    movl 76(%esp,%esi), %edi -; X86-NEXT:    movl %eax, {{[0-9]+}}(%esp) -; X86-NEXT:    movl %ebx, %eax -; X86-NEXT:    shldl %cl, %edx, %eax -; X86-NEXT:    shldl %cl, %ebx, %edi -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT:    shll %cl, %edx -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl 8(%ebp), %esi -; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl 8(%esi), %ebx -; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    andl %ebx, %eax -; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl (%esi), %ebx -; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    andl %ebx, %edx -; X86-NEXT:    orl %eax, %edx -; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT:    shldl %cl, %edx, %eax -; X86-NEXT:    movl %edi, %esi -; X86-NEXT:    movl 8(%ebp), %ecx -; X86-NEXT:    movl 12(%ecx), %ebx -; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    andl %ebx, %edi -; X86-NEXT:    movl %eax, %ebx -; X86-NEXT:    movl %eax, %edx -; X86-NEXT:    movl 4(%ecx), %ecx -; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    andl %ecx, %ebx -; X86-NEXT:    orl %edi, %ebx -; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT:    notl %ecx -; X86-NEXT:    andl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload -; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT:    movl 100(%esp,%ecx), %edi -; X86-NEXT:    movl 104(%esp,%ecx), %ecx -; X86-NEXT:    movl %ecx, %ebx -; X86-NEXT:    movl %ecx, %eax -; X86-NEXT:    movzbl 12(%ebp), %ecx -; X86-NEXT:    shldl %cl, %edi, %ebx -; X86-NEXT:    orl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT:    notl %esi -; X86-NEXT:    andl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X86-NEXT:    movl 108(%esp,%ebx), %ebx -; X86-NEXT:    shldl %cl, %eax, %ebx -; X86-NEXT:    orl %ebx, %esi -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT:    notl %eax -; X86-NEXT:    andl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X86-NEXT:    movl 96(%esp,%ebx), %ebx -; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx +; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT:    movl %ecx, %esi +; X86-NEXT:    andl $96, %esi +; X86-NEXT:    shrl $3, %esi +; X86-NEXT:    movl (%edx,%esi), %edi +; X86-NEXT:    btl %ecx, 
%edi +; X86-NEXT:    setae %al +; X86-NEXT:    btrl %ecx, %edi +; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ebx +; X86-NEXT:    # kill: def $cl killed $cl killed $ecx  ; X86-NEXT:    shll %cl, %ebx -; X86-NEXT:    orl %ebx, %eax -; X86-NEXT:    notl %edx -; X86-NEXT:    andl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X86-NEXT:    shldl %cl, %ebx, %edi -; X86-NEXT:    orl %edi, %edx -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload -; X86-NEXT:    movl 8(%ebp), %ecx -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X86-NEXT:    movl %edi, 8(%ecx) -; X86-NEXT:    movl %esi, 12(%ecx) -; X86-NEXT:    movl %eax, (%ecx) -; X86-NEXT:    movl %edx, 4(%ecx) -; X86-NEXT:    sete %al -; X86-NEXT:    leal -12(%ebp), %esp +; X86-NEXT:    orl %edi, %ebx +; X86-NEXT:    movl %ebx, (%edx,%esi)  ; X86-NEXT:    popl %esi  ; X86-NEXT:    popl %edi  ; X86-NEXT:    popl %ebx -; X86-NEXT:    popl %ebp  ; X86-NEXT:    retl  ;  ; SSE-LABEL: init_eq_i128:  ; SSE:       # %bb.0:  ; SSE-NEXT:    movl %esi, %ecx -; SSE-NEXT:    movl $1, %esi -; SSE-NEXT:    xorl %r8d, %r8d -; SSE-NEXT:    shldq %cl, %rsi, %r8 -; SSE-NEXT:    shlq %cl, %rsi -; SSE-NEXT:    movl %edx, %eax -; SSE-NEXT:    xorl %edx, %edx -; SSE-NEXT:    shldq %cl, %rax, %rdx -; SSE-NEXT:    shlq %cl, %rax -; SSE-NEXT:    xorl %r9d, %r9d -; SSE-NEXT:    testb $64, %cl -; SSE-NEXT:    cmovneq %rsi, %r8 -; SSE-NEXT:    cmovneq %r9, %rsi -; SSE-NEXT:    cmovneq %rax, %rdx -; SSE-NEXT:    cmovneq %r9, %rax -; SSE-NEXT:    movq (%rdi), %rcx -; SSE-NEXT:    movq 8(%rdi), %r9 -; SSE-NEXT:    movq %r9, %r10 -; SSE-NEXT:    andq %r8, %r10 -; SSE-NEXT:    notq %r8 -; SSE-NEXT:    movq %rcx, %r11 -; SSE-NEXT:    andq %rsi, %r11 -; SSE-NEXT:    notq %rsi -; SSE-NEXT:    andq %r9, %r8 -; SSE-NEXT:    orq %rdx, %r8 -; SSE-NEXT:    andq %rcx, %rsi -; SSE-NEXT:    orq %rax, %rsi -; SSE-NEXT:    orq %r10, %r11 -; SSE-NEXT:    sete %al -; SSE-NEXT:    movq %rsi, (%rdi) -; SSE-NEXT:    movq %r8, 8(%rdi) +; SSE-NEXT:    andl $96, %esi +; SSE-NEXT:    shrl $3, %esi +; SSE-NEXT:    movl (%rdi,%rsi), %r8d +; SSE-NEXT:    btl %ecx, %r8d +; SSE-NEXT:    setae %al +; SSE-NEXT:    shll %cl, %edx +; SSE-NEXT:    btrl %ecx, %r8d +; SSE-NEXT:    orl %r8d, %edx +; SSE-NEXT:    movl %edx, (%rdi,%rsi)  ; SSE-NEXT:    retq  ; -; AVX2-LABEL: init_eq_i128: -; AVX2:       # %bb.0: -; AVX2-NEXT:    movl %esi, %ecx -; AVX2-NEXT:    movl $1, %esi -; AVX2-NEXT:    xorl %eax, %eax -; AVX2-NEXT:    shldq %cl, %rsi, %rax -; AVX2-NEXT:    xorl %r8d, %r8d -; AVX2-NEXT:    movl %edx, %edx -; AVX2-NEXT:    xorl %r9d, %r9d -; AVX2-NEXT:    shldq %cl, %rdx, %r9 -; AVX2-NEXT:    shlxq %rcx, %rsi, %rsi -; AVX2-NEXT:    testb $64, %cl -; AVX2-NEXT:    cmovneq %rsi, %rax -; AVX2-NEXT:    cmovneq %r8, %rsi -; AVX2-NEXT:    shlxq %rcx, %rdx, %rcx -; AVX2-NEXT:    cmovneq %rcx, %r9 -; AVX2-NEXT:    cmovneq %r8, %rcx -; AVX2-NEXT:    movq (%rdi), %rdx -; AVX2-NEXT:    movq 8(%rdi), %r8 -; AVX2-NEXT:    andnq %r8, %rax, %r10 -; AVX2-NEXT:    andq %rax, %r8 -; AVX2-NEXT:    andnq %rdx, %rsi, %r11 -; AVX2-NEXT:    andq %rsi, %rdx -; AVX2-NEXT:    orq %r9, %r10 -; AVX2-NEXT:    orq %rcx, %r11 -; AVX2-NEXT:    orq %r8, %rdx -; AVX2-NEXT:    sete %al -; AVX2-NEXT:    movq %r11, (%rdi) -; AVX2-NEXT:    movq %r10, 8(%rdi) -; AVX2-NEXT:    retq -; -; AVX512-LABEL: init_eq_i128: -; AVX512:       # %bb.0: -; AVX512-NEXT:    movl 
%esi, %ecx -; AVX512-NEXT:    xorl %eax, %eax -; AVX512-NEXT:    movl $1, %esi -; AVX512-NEXT:    xorl %r8d, %r8d -; AVX512-NEXT:    shldq %cl, %rsi, %r8 -; AVX512-NEXT:    shlxq %rcx, %rsi, %rsi -; AVX512-NEXT:    movl %edx, %edx -; AVX512-NEXT:    xorl %r9d, %r9d -; AVX512-NEXT:    shldq %cl, %rdx, %r9 -; AVX512-NEXT:    testb $64, %cl -; AVX512-NEXT:    cmovneq %rsi, %r8 -; AVX512-NEXT:    cmovneq %rax, %rsi -; AVX512-NEXT:    shlxq %rcx, %rdx, %rcx -; AVX512-NEXT:    cmovneq %rcx, %r9 -; AVX512-NEXT:    cmovneq %rax, %rcx -; AVX512-NEXT:    movq (%rdi), %rax -; AVX512-NEXT:    movq 8(%rdi), %rdx -; AVX512-NEXT:    andnq %rdx, %r8, %r10 -; AVX512-NEXT:    andq %r8, %rdx -; AVX512-NEXT:    andnq %rax, %rsi, %r8 -; AVX512-NEXT:    andq %rsi, %rax -; AVX512-NEXT:    orq %r9, %r10 -; AVX512-NEXT:    orq %rcx, %r8 -; AVX512-NEXT:    orq %rdx, %rax -; AVX512-NEXT:    sete %al -; AVX512-NEXT:    movq %r8, (%rdi) -; AVX512-NEXT:    movq %r10, 8(%rdi) -; AVX512-NEXT:    retq +; AVX-LABEL: init_eq_i128: +; AVX:       # %bb.0: +; AVX-NEXT:    movl %esi, %ecx +; AVX-NEXT:    andl $96, %ecx +; AVX-NEXT:    shrl $3, %ecx +; AVX-NEXT:    movl (%rdi,%rcx), %r8d +; AVX-NEXT:    btl %esi, %r8d +; AVX-NEXT:    setae %al +; AVX-NEXT:    btrl %esi, %r8d +; AVX-NEXT:    shlxl %esi, %edx, %edx +; AVX-NEXT:    orl %r8d, %edx +; AVX-NEXT:    movl %edx, (%rdi,%rcx) +; AVX-NEXT:    retq    %rem = and i32 %position, 127    %ofs = zext nneg i32 %rem to i128    %bit = shl nuw i128 1, %ofs @@ -1252,344 +648,25 @@ define i1 @init_eq_i128(ptr %word, i32 %position, i1 zeroext %value) nounwind {  define i1 @test_ne_i512(ptr %word, i32 %position) nounwind {  ; X86-LABEL: test_ne_i512:  ; X86:       # %bb.0: -; X86-NEXT:    pushl %ebp -; X86-NEXT:    movl %esp, %ebp -; X86-NEXT:    pushl %ebx -; X86-NEXT:    pushl %edi -; X86-NEXT:    pushl %esi -; X86-NEXT:    andl $-16, %esp -; X86-NEXT:    subl $224, %esp -; X86-NEXT:    movl 12(%ebp), %ecx -; X86-NEXT:    movl %ecx, %eax -; X86-NEXT:    shrl $3, %eax -; X86-NEXT:    andl $60, %eax -; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    leal {{[0-9]+}}(%esp), %edx -; X86-NEXT:    subl %eax, %edx -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $1, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl 24(%edx), 
%esi -; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl 28(%edx), %eax -; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    andl $31, %ecx -; X86-NEXT:    shldl %cl, %esi, %eax -; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl 56(%edx), %esi -; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl 60(%edx), %eax -; X86-NEXT:    shldl %cl, %esi, %eax -; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl 8(%edx), %esi -; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl 12(%edx), %eax -; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    shldl %cl, %esi, %eax -; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl 40(%edx), %edi -; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl 44(%edx), %eax -; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    shldl %cl, %edi, %eax -; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl 16(%edx), %eax -; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl 20(%edx), %edi -; X86-NEXT:    movl %edi, %ebx -; X86-NEXT:    shldl %cl, %eax, %ebx -; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl 32(%edx), %esi -; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl 36(%edx), %ebx -; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    shldl %cl, %esi, %ebx -; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    shldl %cl, %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT:    movl 52(%edx), %eax -; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT:    movl 4(%edx), %edi -; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X86-NEXT:    shldl %cl, %edi, %esi -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X86-NEXT:    shldl %cl, %edi, %eax -; X86-NEXT:    movl 8(%ebp), %ebx -; X86-NEXT:    andl 40(%ebx), %eax -; X86-NEXT:    andl 8(%ebx), %esi -; X86-NEXT:    orl %eax, %esi -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT:    andl 56(%ebx), %eax -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X86-NEXT:    andl 24(%ebx), %edi -; X86-NEXT:    orl %eax, %edi -; X86-NEXT:    orl %esi, %edi -; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT:    movl %ebx, %edi -; X86-NEXT:    andl 44(%ebx), %eax -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X86-NEXT:    andl 12(%ebx), %esi -; X86-NEXT:    orl %eax, %esi -; X86-NEXT:    movl %esi, %ebx -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X86-NEXT:    andl 60(%edi), %esi -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT:    andl 28(%edi), %eax -; X86-NEXT:    orl %esi, %eax -; X86-NEXT:    orl %ebx, %eax -; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl (%edx), 
%esi -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT:    shldl %cl, %edx, %edi -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT:    shldl %cl, %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT:    negl %edx -; X86-NEXT:    movl 192(%esp,%edx), %edx -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT:    shldl %cl, %edx, %eax -; X86-NEXT:    shldl %cl, %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X86-NEXT:    shldl %cl, %ebx, %edx -; X86-NEXT:    # kill: def $cl killed $cl killed $ecx -; X86-NEXT:    shll %cl, %esi -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT:    movl 8(%ebp), %ebx -; X86-NEXT:    andl 32(%ebx), %ecx -; X86-NEXT:    andl (%ebx), %esi -; X86-NEXT:    orl %ecx, %esi -; X86-NEXT:    andl 16(%ebx), %edi -; X86-NEXT:    andl 48(%ebx), %edx -; X86-NEXT:    orl %edi, %edx -; X86-NEXT:    orl %esi, %edx -; X86-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT:    andl 36(%ebx), %ecx -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X86-NEXT:    andl 4(%ebx), %esi -; X86-NEXT:    orl %ecx, %esi -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT:    andl 20(%ebx), %ecx -; X86-NEXT:    andl 52(%ebx), %eax -; X86-NEXT:    orl %ecx, %eax -; X86-NEXT:    orl %esi, %eax -; X86-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload -; X86-NEXT:    orl %edx, %eax -; X86-NEXT:    setne %al -; X86-NEXT:    leal -12(%ebp), %esp -; X86-NEXT:    popl %esi -; X86-NEXT:    popl %edi -; X86-NEXT:    popl %ebx -; X86-NEXT:    popl %ebp +; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax +; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT:    movl %ecx, %edx +; X86-NEXT:    shrl $3, %edx +; X86-NEXT:    andl $60, %edx +; X86-NEXT:    movl (%eax,%edx), %eax +; X86-NEXT:    btl %ecx, %eax +; X86-NEXT:    setb %al  ; X86-NEXT:    retl  ; -; SSE-LABEL: test_ne_i512: -; SSE:       # %bb.0: -; SSE-NEXT:    pushq %r15 -; SSE-NEXT:    pushq %r14 -; SSE-NEXT:    pushq %rbx -; SSE-NEXT:    xorps %xmm0, %xmm0 -; SSE-NEXT:    movups %xmm0, -{{[0-9]+}}(%rsp) -; SSE-NEXT:    movups %xmm0, -{{[0-9]+}}(%rsp) -; SSE-NEXT:    movups %xmm0, -{{[0-9]+}}(%rsp) -; SSE-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp) -; SSE-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp) -; SSE-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp) -; SSE-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp) -; SSE-NEXT:    movq $0, -{{[0-9]+}}(%rsp) -; SSE-NEXT:    movq $1, -{{[0-9]+}}(%rsp) -; SSE-NEXT:    movl %esi, %ecx -; SSE-NEXT:    andl $63, %ecx -; SSE-NEXT:    shrl $3, %esi -; SSE-NEXT:    andl $56, %esi -; SSE-NEXT:    negl %esi -; SSE-NEXT:    movslq %esi, %rbx -; SSE-NEXT:    movq -48(%rsp,%rbx), %rdx -; SSE-NEXT:    movq -40(%rsp,%rbx), %r14 -; SSE-NEXT:    movq %r14, %rax -; SSE-NEXT:    shldq %cl, %rdx, %rax -; SSE-NEXT:    movq -16(%rsp,%rbx), %r11 -; SSE-NEXT:    movq -8(%rsp,%rbx), %r10 -; SSE-NEXT:    shldq %cl, %r11, %r10 -; SSE-NEXT:    movq -32(%rsp,%rbx), %r9 -; SSE-NEXT:    movq -24(%rsp,%rbx), %r15 -; SSE-NEXT:    movq %r15, %r8 -; SSE-NEXT:    shldq %cl, %r9, %r8 -; SSE-NEXT:    movq -56(%rsp,%rbx), %rsi -; SSE-NEXT:    shldq %cl, %rsi, %rdx -; 
SSE-NEXT:    shldq %cl, %r15, %r11 -; SSE-NEXT:    shldq %cl, %r14, %r9 -; SSE-NEXT:    movq -64(%rsp,%rbx), %rbx -; SSE-NEXT:    shldq %cl, %rbx, %rsi -; SSE-NEXT:    # kill: def $cl killed $cl killed $ecx -; SSE-NEXT:    shlq %cl, %rbx -; SSE-NEXT:    andq 32(%rdi), %r9 -; SSE-NEXT:    andq 48(%rdi), %r11 -; SSE-NEXT:    andq 16(%rdi), %rdx -; SSE-NEXT:    orq %r11, %rdx -; SSE-NEXT:    andq 40(%rdi), %r8 -; SSE-NEXT:    andq 56(%rdi), %r10 -; SSE-NEXT:    andq 24(%rdi), %rax -; SSE-NEXT:    orq %r10, %rax -; SSE-NEXT:    andq (%rdi), %rbx -; SSE-NEXT:    orq %r9, %rbx -; SSE-NEXT:    orq %rdx, %rbx -; SSE-NEXT:    andq 8(%rdi), %rsi -; SSE-NEXT:    orq %r8, %rsi -; SSE-NEXT:    orq %rax, %rsi -; SSE-NEXT:    orq %rbx, %rsi -; SSE-NEXT:    setne %al -; SSE-NEXT:    popq %rbx -; SSE-NEXT:    popq %r14 -; SSE-NEXT:    popq %r15 -; SSE-NEXT:    retq -; -; AVX2-LABEL: test_ne_i512: -; AVX2:       # %bb.0: -; AVX2-NEXT:    pushq %r15 -; AVX2-NEXT:    pushq %r14 -; AVX2-NEXT:    pushq %rbx -; AVX2-NEXT:    vxorps %xmm0, %xmm0, %xmm0 -; AVX2-NEXT:    vmovups %ymm0, -{{[0-9]+}}(%rsp) -; AVX2-NEXT:    vmovups %ymm0, -{{[0-9]+}}(%rsp) -; AVX2-NEXT:    vmovups %ymm0, -{{[0-9]+}}(%rsp) -; AVX2-NEXT:    vmovss {{.*#+}} xmm0 = [1,0,0,0] -; AVX2-NEXT:    vmovups %ymm0, -{{[0-9]+}}(%rsp) -; AVX2-NEXT:    movl %esi, %ecx -; AVX2-NEXT:    andl $63, %ecx -; AVX2-NEXT:    shrl $3, %esi -; AVX2-NEXT:    andl $56, %esi -; AVX2-NEXT:    negl %esi -; AVX2-NEXT:    movslq %esi, %rsi -; AVX2-NEXT:    movq -48(%rsp,%rsi), %rdx -; AVX2-NEXT:    movq -40(%rsp,%rsi), %rbx -; AVX2-NEXT:    movq %rbx, %rax -; AVX2-NEXT:    shldq %cl, %rdx, %rax -; AVX2-NEXT:    movq -16(%rsp,%rsi), %r11 -; AVX2-NEXT:    movq -8(%rsp,%rsi), %r10 -; AVX2-NEXT:    shldq %cl, %r11, %r10 -; AVX2-NEXT:    movq -32(%rsp,%rsi), %r9 -; AVX2-NEXT:    movq -24(%rsp,%rsi), %r14 -; AVX2-NEXT:    movq %r14, %r8 -; AVX2-NEXT:    shldq %cl, %r9, %r8 -; AVX2-NEXT:    movq -64(%rsp,%rsi), %r15 -; AVX2-NEXT:    movq -56(%rsp,%rsi), %rsi -; AVX2-NEXT:    shldq %cl, %rsi, %rdx -; AVX2-NEXT:    shldq %cl, %r14, %r11 -; AVX2-NEXT:    shldq %cl, %rbx, %r9 -; AVX2-NEXT:    shldq %cl, %r15, %rsi -; AVX2-NEXT:    shlxq %rcx, %r15, %rcx -; AVX2-NEXT:    andq 32(%rdi), %r9 -; AVX2-NEXT:    andq 48(%rdi), %r11 -; AVX2-NEXT:    andq 16(%rdi), %rdx -; AVX2-NEXT:    andq 40(%rdi), %r8 -; AVX2-NEXT:    andq 56(%rdi), %r10 -; AVX2-NEXT:    andq 24(%rdi), %rax -; AVX2-NEXT:    orq %r11, %rdx -; AVX2-NEXT:    orq %r10, %rax -; AVX2-NEXT:    andq (%rdi), %rcx -; AVX2-NEXT:    orq %r9, %rcx -; AVX2-NEXT:    orq %rdx, %rcx -; AVX2-NEXT:    andq 8(%rdi), %rsi -; AVX2-NEXT:    orq %r8, %rsi -; AVX2-NEXT:    orq %rax, %rsi -; AVX2-NEXT:    orq %rcx, %rsi -; AVX2-NEXT:    setne %al -; AVX2-NEXT:    popq %rbx -; AVX2-NEXT:    popq %r14 -; AVX2-NEXT:    popq %r15 -; AVX2-NEXT:    vzeroupper -; AVX2-NEXT:    retq -; -; AVX512-LABEL: test_ne_i512: -; AVX512:       # %bb.0: -; AVX512-NEXT:    pushq %r15 -; AVX512-NEXT:    pushq %r14 -; AVX512-NEXT:    pushq %rbx -; AVX512-NEXT:    vxorps %xmm0, %xmm0, %xmm0 -; AVX512-NEXT:    vmovups %ymm0, -{{[0-9]+}}(%rsp) -; AVX512-NEXT:    vmovups %ymm0, -{{[0-9]+}}(%rsp) -; AVX512-NEXT:    vmovups %ymm0, -{{[0-9]+}}(%rsp) -; AVX512-NEXT:    vmovaps {{.*#+}} xmm0 = [1,0,0,0] -; AVX512-NEXT:    vmovups %ymm0, -{{[0-9]+}}(%rsp) -; AVX512-NEXT:    movl %esi, %ecx -; AVX512-NEXT:    andl $63, %ecx -; AVX512-NEXT:    shrl $3, %esi -; AVX512-NEXT:    andl $56, %esi -; AVX512-NEXT:    negl %esi -; AVX512-NEXT:    movslq %esi, %rbx -; AVX512-NEXT:    
movq -48(%rsp,%rbx), %rdx -; AVX512-NEXT:    movq -40(%rsp,%rbx), %r14 -; AVX512-NEXT:    movq %r14, %rax -; AVX512-NEXT:    shldq %cl, %rdx, %rax -; AVX512-NEXT:    movq -16(%rsp,%rbx), %r11 -; AVX512-NEXT:    movq -8(%rsp,%rbx), %r10 -; AVX512-NEXT:    shldq %cl, %r11, %r10 -; AVX512-NEXT:    movq -32(%rsp,%rbx), %r9 -; AVX512-NEXT:    movq -24(%rsp,%rbx), %r15 -; AVX512-NEXT:    movq %r15, %r8 -; AVX512-NEXT:    shldq %cl, %r9, %r8 -; AVX512-NEXT:    movq -56(%rsp,%rbx), %rsi -; AVX512-NEXT:    shldq %cl, %rsi, %rdx -; AVX512-NEXT:    shldq %cl, %r15, %r11 -; AVX512-NEXT:    shldq %cl, %r14, %r9 -; AVX512-NEXT:    movq -64(%rsp,%rbx), %rbx -; AVX512-NEXT:    shldq %cl, %rbx, %rsi -; AVX512-NEXT:    shlxq %rcx, %rbx, %rcx -; AVX512-NEXT:    andq 32(%rdi), %r9 -; AVX512-NEXT:    andq 48(%rdi), %r11 -; AVX512-NEXT:    andq 16(%rdi), %rdx -; AVX512-NEXT:    andq 40(%rdi), %r8 -; AVX512-NEXT:    andq 56(%rdi), %r10 -; AVX512-NEXT:    andq 24(%rdi), %rax -; AVX512-NEXT:    orq %r11, %rdx -; AVX512-NEXT:    orq %r10, %rax -; AVX512-NEXT:    andq (%rdi), %rcx -; AVX512-NEXT:    orq %r9, %rcx -; AVX512-NEXT:    orq %rdx, %rcx -; AVX512-NEXT:    andq 8(%rdi), %rsi -; AVX512-NEXT:    orq %r8, %rsi -; AVX512-NEXT:    orq %rax, %rsi -; AVX512-NEXT:    orq %rcx, %rsi -; AVX512-NEXT:    setne %al -; AVX512-NEXT:    popq %rbx -; AVX512-NEXT:    popq %r14 -; AVX512-NEXT:    popq %r15 -; AVX512-NEXT:    vzeroupper -; AVX512-NEXT:    retq +; X64-LABEL: test_ne_i512: +; X64:       # %bb.0: +; X64-NEXT:    movl %esi, %eax +; X64-NEXT:    shrl $3, %eax +; X64-NEXT:    andl $60, %eax +; X64-NEXT:    movl (%rdi,%rax), %eax +; X64-NEXT:    btl %esi, %eax +; X64-NEXT:    setb %al +; X64-NEXT:    retq    %rem = and i32 %position, 511    %ofs = zext nneg i32 %rem to i512    %bit = shl nuw i512 1, %ofs @@ -1602,572 +679,33 @@ define i1 @test_ne_i512(ptr %word, i32 %position) nounwind {  define i1 @complement_ne_i512(ptr %word, i32 %position) nounwind {  ; X86-LABEL: complement_ne_i512:  ; X86:       # %bb.0: -; X86-NEXT:    pushl %ebp -; X86-NEXT:    movl %esp, %ebp -; X86-NEXT:    pushl %ebx  ; X86-NEXT:    pushl %edi  ; X86-NEXT:    pushl %esi -; X86-NEXT:    andl $-16, %esp -; X86-NEXT:    subl $272, %esp # imm = 0x110 -; X86-NEXT:    movl 12(%ebp), %ecx -; X86-NEXT:    movl %ecx, %eax -; X86-NEXT:    shrl $3, %eax -; X86-NEXT:    andl $60, %eax -; X86-NEXT:    movl %eax, (%esp) # 4-byte Spill -; X86-NEXT:    leal {{[0-9]+}}(%esp), %edx -; X86-NEXT:    subl %eax, %edx -; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $1, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, 
{{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl 24(%edx), %eax -; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl 28(%edx), %esi -; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    andl $31, %ecx -; X86-NEXT:    shldl %cl, %eax, %esi -; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl 56(%edx), %eax -; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl 60(%edx), %esi -; X86-NEXT:    shldl %cl, %eax, %esi -; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl 8(%edx), %eax -; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl 12(%edx), %esi -; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    shldl %cl, %eax, %esi -; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl 40(%edx), %eax -; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl 44(%edx), %esi -; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    shldl %cl, %eax, %esi -; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl 16(%edx), %eax -; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl 20(%edx), %ebx -; X86-NEXT:    movl %ebx, %esi -; X86-NEXT:    shldl %cl, %eax, %esi -; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl 32(%edx), %edi -; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl 36(%edx), %esi -; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    shldl %cl, %edi, %esi -; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    shldl %cl, %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT:    movl 52(%edx), %esi -; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X86-NEXT:    shldl %cl, %esi, %edi -; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl 4(%edx), %eax -; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X86-NEXT:    shldl %cl, %eax, %ebx -; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT:    shldl %cl, %eax, %esi -; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl 8(%ebp), %edx -; X86-NEXT:    movl 40(%edx), %eax -; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    andl %esi, %eax -; X86-NEXT:    movl 8(%edx), %esi -; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    andl %ebx, %esi -; X86-NEXT:    orl %eax, %esi -; X86-NEXT:    movl %edx, %eax -; X86-NEXT:    movl 56(%edx), %ebx -; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    andl %edi, %ebx -; X86-NEXT:    movl 24(%edx), 
%edx -; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    andl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload -; X86-NEXT:    orl %ebx, %edx -; X86-NEXT:    orl %esi, %edx -; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl 44(%eax), %esi -; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    andl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload -; X86-NEXT:    movl 12(%eax), %ebx -; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    andl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload -; X86-NEXT:    orl %esi, %ebx -; X86-NEXT:    movl 60(%eax), %edx -; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl %eax, %esi -; X86-NEXT:    movl %edx, %eax -; X86-NEXT:    andl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload -; X86-NEXT:    movl 28(%esi), %edx -; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    andl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload -; X86-NEXT:    orl %eax, %edx -; X86-NEXT:    orl %ebx, %edx -; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT:    movl (%eax), %edx -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT:    shldl %cl, %eax, %ebx -; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT:    shldl %cl, %eax, %edi -; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl (%esp), %eax # 4-byte Reload -; X86-NEXT:    negl %eax -; X86-NEXT:    movl 240(%esp,%eax), %esi -; X86-NEXT:    shldl %cl, %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT:    shldl %cl, %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT:    shldl %cl, %eax, %esi -; X86-NEXT:    movl %esi, (%esp) # 4-byte Spill -; X86-NEXT:    # kill: def $cl killed $cl killed $ecx -; X86-NEXT:    shll %cl, %edx -; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl 8(%ebp), %esi -; X86-NEXT:    movl 32(%esi), %eax -; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    andl %edi, %eax -; X86-NEXT:    movl (%esi), %ecx -; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    andl %edx, %ecx -; X86-NEXT:    orl %eax, %ecx -; X86-NEXT:    movl 16(%esi), %eax -; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    andl %ebx, %eax -; X86-NEXT:    movl 48(%esi), %edx -; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    andl (%esp), %edx # 4-byte Folded Reload -; X86-NEXT:    orl %eax, %edx -; X86-NEXT:    orl %ecx, %edx -; X86-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload -; X86-NEXT:    movl 36(%esi), %ebx -; X86-NEXT:    movl %ebx, %eax -; X86-NEXT:    andl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload -; X86-NEXT:    movl 4(%esi), %ecx -; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    andl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload -; X86-NEXT:    orl %eax, %ecx -; X86-NEXT:    movl %esi, %eax -; X86-NEXT:    movl 
20(%esi), %edi -; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X86-NEXT:    andl %esi, %edi -; X86-NEXT:    movl 52(%eax), %eax -; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    andl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload -; X86-NEXT:    orl %edi, %eax -; X86-NEXT:    orl %ecx, %eax -; X86-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT:    xorl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT:    xorl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload -; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT:    xorl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT:    xorl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload -; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT:    xorl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT:    xorl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload -; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT:    xorl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT:    xorl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT:    xorl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X86-NEXT:    xorl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X86-NEXT:    xorl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X86-NEXT:    xorl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT:    xorl %ecx, (%esp) # 4-byte Folded Spill -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT:    xorl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT:    xorl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT:    xorl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload -; X86-NEXT:    orl %edx, %eax -; X86-NEXT:    movl 8(%ebp), %edx -; X86-NEXT:    movl %ebx, 60(%edx) -; X86-NEXT:    movl %edi, 56(%edx) -; X86-NEXT:    movl %ecx, 52(%edx) -; X86-NEXT:    movl %esi, 44(%edx) -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT:    movl %eax, 40(%edx) -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT:    movl %eax, 36(%edx) -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT:    movl %eax, 32(%edx) -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT:    movl %eax, 28(%edx) -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT:    movl %eax, 24(%edx) -; 
X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT:    movl %eax, 20(%edx) -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT:    movl %eax, 16(%edx) -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT:    movl %eax, 12(%edx) -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT:    movl %eax, 8(%edx) -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT:    movl %eax, 4(%edx) -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT:    movl %eax, (%edx) -; X86-NEXT:    movl (%esp), %eax # 4-byte Reload -; X86-NEXT:    movl %eax, 48(%edx) -; X86-NEXT:    setne %al -; X86-NEXT:    leal -12(%ebp), %esp +; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx +; X86-NEXT:    movl %edx, %esi +; X86-NEXT:    shrl $3, %esi +; X86-NEXT:    andl $60, %esi +; X86-NEXT:    movl (%ecx,%esi), %edi +; X86-NEXT:    btl %edx, %edi +; X86-NEXT:    setb %al +; X86-NEXT:    btcl %edx, %edi +; X86-NEXT:    movl %edi, (%ecx,%esi)  ; X86-NEXT:    popl %esi  ; X86-NEXT:    popl %edi -; X86-NEXT:    popl %ebx -; X86-NEXT:    popl %ebp  ; X86-NEXT:    retl  ; -; SSE-LABEL: complement_ne_i512: -; SSE:       # %bb.0: -; SSE-NEXT:    pushq %rbp -; SSE-NEXT:    pushq %r15 -; SSE-NEXT:    pushq %r14 -; SSE-NEXT:    pushq %r13 -; SSE-NEXT:    pushq %r12 -; SSE-NEXT:    pushq %rbx -; SSE-NEXT:    subq $56, %rsp -; SSE-NEXT:    xorps %xmm0, %xmm0 -; SSE-NEXT:    movups %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT:    movups %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT:    movups %xmm0, -{{[0-9]+}}(%rsp) -; SSE-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp) -; SSE-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp) -; SSE-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp) -; SSE-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp) -; SSE-NEXT:    movq $0, {{[0-9]+}}(%rsp) -; SSE-NEXT:    movq $1, -{{[0-9]+}}(%rsp) -; SSE-NEXT:    movl %esi, %ecx -; SSE-NEXT:    andl $63, %ecx -; SSE-NEXT:    shrl $3, %esi -; SSE-NEXT:    andl $56, %esi -; SSE-NEXT:    negl %esi -; SSE-NEXT:    movslq %esi, %rbx -; SSE-NEXT:    movq (%rsp,%rbx), %rsi -; SSE-NEXT:    movq 8(%rsp,%rbx), %r14 -; SSE-NEXT:    movq %r14, %rax -; SSE-NEXT:    shldq %cl, %rsi, %rax -; SSE-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT:    movq 32(%rsp,%rbx), %r8 -; SSE-NEXT:    movq 40(%rsp,%rbx), %rbp -; SSE-NEXT:    shldq %cl, %r8, %rbp -; SSE-NEXT:    movq 16(%rsp,%rbx), %r9 -; SSE-NEXT:    movq 24(%rsp,%rbx), %r15 -; SSE-NEXT:    movq %r15, %r10 -; SSE-NEXT:    shldq %cl, %r9, %r10 -; SSE-NEXT:    movq -8(%rsp,%rbx), %r11 -; SSE-NEXT:    shldq %cl, %r11, %rsi -; SSE-NEXT:    shldq %cl, %r15, %r8 -; SSE-NEXT:    shldq %cl, %r14, %r9 -; SSE-NEXT:    movq -16(%rsp,%rbx), %rbx -; SSE-NEXT:    shldq %cl, %rbx, %r11 -; SSE-NEXT:    # kill: def $cl killed $cl killed $ecx -; SSE-NEXT:    shlq %cl, %rbx -; SSE-NEXT:    movq 24(%rdi), %r15 -; SSE-NEXT:    movq %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT:    movq 56(%rdi), %rcx -; SSE-NEXT:    movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT:    movq 16(%rdi), %r12 -; SSE-NEXT:    movq %r12, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT:    movq 48(%rdi), %r13 -; SSE-NEXT:    movq %r13, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT:    andq %r8, %r13 -; SSE-NEXT:    andq %rsi, %r12 -; SSE-NEXT:    orq %r13, %r12 -; SSE-NEXT:    movq %rcx, %r13 -; SSE-NEXT:    andq %rbp, %r13 -; SSE-NEXT:    andq %rax, %r15 -; SSE-NEXT:    orq 
%r13, %r15 -; SSE-NEXT:    movq 32(%rdi), %r14 -; SSE-NEXT:    movq %r14, %rcx -; SSE-NEXT:    andq %r9, %rcx -; SSE-NEXT:    movq (%rdi), %r13 -; SSE-NEXT:    movq %r13, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT:    andq %rbx, %r13 -; SSE-NEXT:    orq %rcx, %r13 -; SSE-NEXT:    orq %r12, %r13 -; SSE-NEXT:    movq 40(%rdi), %rcx -; SSE-NEXT:    movq %rcx, %r12 -; SSE-NEXT:    andq %r10, %r12 -; SSE-NEXT:    movq 8(%rdi), %rdx -; SSE-NEXT:    movq %rdx, %rax -; SSE-NEXT:    andq %r11, %rax -; SSE-NEXT:    orq %r12, %rax -; SSE-NEXT:    orq %r15, %rax -; SSE-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload -; SSE-NEXT:    xorq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Folded Reload -; SSE-NEXT:    xorq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Folded Reload -; SSE-NEXT:    xorq %rcx, %r10 -; SSE-NEXT:    xorq %r14, %r9 -; SSE-NEXT:    xorq {{[-0-9]+}}(%r{{[sb]}}p), %rbp # 8-byte Folded Reload -; SSE-NEXT:    xorq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Folded Reload -; SSE-NEXT:    xorq %rdx, %r11 -; SSE-NEXT:    xorq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Folded Reload -; SSE-NEXT:    orq %r13, %rax -; SSE-NEXT:    movq %r8, 48(%rdi) -; SSE-NEXT:    movq %rbp, 56(%rdi) -; SSE-NEXT:    movq %r9, 32(%rdi) -; SSE-NEXT:    movq %r10, 40(%rdi) -; SSE-NEXT:    movq %rsi, 16(%rdi) -; SSE-NEXT:    movq %r15, 24(%rdi) -; SSE-NEXT:    movq %rbx, (%rdi) -; SSE-NEXT:    movq %r11, 8(%rdi) -; SSE-NEXT:    setne %al -; SSE-NEXT:    addq $56, %rsp -; SSE-NEXT:    popq %rbx -; SSE-NEXT:    popq %r12 -; SSE-NEXT:    popq %r13 -; SSE-NEXT:    popq %r14 -; SSE-NEXT:    popq %r15 -; SSE-NEXT:    popq %rbp -; SSE-NEXT:    retq -; -; AVX2-LABEL: complement_ne_i512: -; AVX2:       # %bb.0: -; AVX2-NEXT:    pushq %rbp -; AVX2-NEXT:    pushq %r15 -; AVX2-NEXT:    pushq %r14 -; AVX2-NEXT:    pushq %r13 -; AVX2-NEXT:    pushq %r12 -; AVX2-NEXT:    pushq %rbx -; AVX2-NEXT:    subq $72, %rsp -; AVX2-NEXT:    vxorps %xmm0, %xmm0, %xmm0 -; AVX2-NEXT:    vmovups %ymm0, {{[0-9]+}}(%rsp) -; AVX2-NEXT:    vmovups %ymm0, -{{[0-9]+}}(%rsp) -; AVX2-NEXT:    vmovups %ymm0, -{{[0-9]+}}(%rsp) -; AVX2-NEXT:    vmovss {{.*#+}} xmm0 = [1,0,0,0] -; AVX2-NEXT:    vmovups %ymm0, (%rsp) -; AVX2-NEXT:    movl %esi, %ecx -; AVX2-NEXT:    andl $63, %ecx -; AVX2-NEXT:    shrl $3, %esi -; AVX2-NEXT:    andl $56, %esi -; AVX2-NEXT:    negl %esi -; AVX2-NEXT:    movslq %esi, %rbx -; AVX2-NEXT:    movq 16(%rsp,%rbx), %rsi -; AVX2-NEXT:    movq 24(%rsp,%rbx), %rbp -; AVX2-NEXT:    movq %rbp, %rax -; AVX2-NEXT:    shldq %cl, %rsi, %rax -; AVX2-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT:    movq 48(%rsp,%rbx), %r8 -; AVX2-NEXT:    movq 56(%rsp,%rbx), %r13 -; AVX2-NEXT:    shldq %cl, %r8, %r13 -; AVX2-NEXT:    movq 32(%rsp,%rbx), %r9 -; AVX2-NEXT:    movq 40(%rsp,%rbx), %r14 -; AVX2-NEXT:    movq %r14, %r10 -; AVX2-NEXT:    shldq %cl, %r9, %r10 -; AVX2-NEXT:    movq 8(%rsp,%rbx), %r11 -; AVX2-NEXT:    shldq %cl, %r11, %rsi -; AVX2-NEXT:    shldq %cl, %r14, %r8 -; AVX2-NEXT:    movq 16(%rdi), %r12 -; AVX2-NEXT:    movq %r12, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT:    movq 48(%rdi), %r14 -; AVX2-NEXT:    movq %r14, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT:    andq %r8, %r14 -; AVX2-NEXT:    andq %rsi, %r12 -; AVX2-NEXT:    orq %r14, %r12 -; AVX2-NEXT:    movq 56(%rdi), %r15 -; AVX2-NEXT:    movq %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT:    andq %r13, %r15 -; AVX2-NEXT:    movq 24(%rdi), %r14 -; AVX2-NEXT:    movq %r14, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; 
AVX2-NEXT:    andq %rax, %r14 -; AVX2-NEXT:    orq %r15, %r14 -; AVX2-NEXT:    shldq %cl, %rbp, %r9 -; AVX2-NEXT:    movq (%rsp,%rbx), %rdx -; AVX2-NEXT:    movq 32(%rdi), %r15 -; AVX2-NEXT:    movq %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT:    andq %r9, %r15 -; AVX2-NEXT:    shlxq %rcx, %rdx, %rax -; AVX2-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT:    movq (%rdi), %rbx -; AVX2-NEXT:    movq %rbx, %rbp -; AVX2-NEXT:    andq %rax, %rbp -; AVX2-NEXT:    orq %r15, %rbp -; AVX2-NEXT:    orq %r12, %rbp -; AVX2-NEXT:    # kill: def $cl killed $cl killed $rcx -; AVX2-NEXT:    shldq %cl, %rdx, %r11 -; AVX2-NEXT:    movq 40(%rdi), %rax -; AVX2-NEXT:    movq %rax, %rcx -; AVX2-NEXT:    andq %r10, %rcx -; AVX2-NEXT:    movq 8(%rdi), %r15 -; AVX2-NEXT:    movq %r15, %r12 -; AVX2-NEXT:    andq %r11, %r12 -; AVX2-NEXT:    orq %rcx, %r12 -; AVX2-NEXT:    orq %r14, %r12 -; AVX2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload -; AVX2-NEXT:    xorq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Folded Reload -; AVX2-NEXT:    xorq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Folded Reload -; AVX2-NEXT:    xorq %rax, %r10 -; AVX2-NEXT:    xorq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Folded Reload -; AVX2-NEXT:    xorq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Folded Reload -; AVX2-NEXT:    xorq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Folded Reload -; AVX2-NEXT:    xorq %r15, %r11 -; AVX2-NEXT:    xorq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Folded Reload -; AVX2-NEXT:    orq %rbp, %r12 -; AVX2-NEXT:    movq %r8, 48(%rdi) -; AVX2-NEXT:    movq %r13, 56(%rdi) -; AVX2-NEXT:    movq %r9, 32(%rdi) -; AVX2-NEXT:    movq %r10, 40(%rdi) -; AVX2-NEXT:    movq %rsi, 16(%rdi) -; AVX2-NEXT:    movq %rcx, 24(%rdi) -; AVX2-NEXT:    movq %rbx, (%rdi) -; AVX2-NEXT:    movq %r11, 8(%rdi) -; AVX2-NEXT:    setne %al -; AVX2-NEXT:    addq $72, %rsp -; AVX2-NEXT:    popq %rbx -; AVX2-NEXT:    popq %r12 -; AVX2-NEXT:    popq %r13 -; AVX2-NEXT:    popq %r14 -; AVX2-NEXT:    popq %r15 -; AVX2-NEXT:    popq %rbp -; AVX2-NEXT:    vzeroupper -; AVX2-NEXT:    retq -; -; AVX512-LABEL: complement_ne_i512: -; AVX512:       # %bb.0: -; AVX512-NEXT:    pushq %rbp -; AVX512-NEXT:    pushq %r15 -; AVX512-NEXT:    pushq %r14 -; AVX512-NEXT:    pushq %r13 -; AVX512-NEXT:    pushq %r12 -; AVX512-NEXT:    pushq %rbx -; AVX512-NEXT:    subq $72, %rsp -; AVX512-NEXT:    vxorps %xmm0, %xmm0, %xmm0 -; AVX512-NEXT:    vmovups %ymm0, {{[0-9]+}}(%rsp) -; AVX512-NEXT:    vmovups %ymm0, -{{[0-9]+}}(%rsp) -; AVX512-NEXT:    vmovups %ymm0, -{{[0-9]+}}(%rsp) -; AVX512-NEXT:    vmovaps {{.*#+}} xmm0 = [1,0,0,0] -; AVX512-NEXT:    vmovups %ymm0, (%rsp) -; AVX512-NEXT:    movl %esi, %ecx -; AVX512-NEXT:    andl $63, %ecx -; AVX512-NEXT:    shrl $3, %esi -; AVX512-NEXT:    andl $56, %esi -; AVX512-NEXT:    negl %esi -; AVX512-NEXT:    movslq %esi, %rbx -; AVX512-NEXT:    movq 16(%rsp,%rbx), %rsi -; AVX512-NEXT:    movq 24(%rsp,%rbx), %rbp -; AVX512-NEXT:    movq %rbp, %rax -; AVX512-NEXT:    shldq %cl, %rsi, %rax -; AVX512-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT:    movq 48(%rsp,%rbx), %r8 -; AVX512-NEXT:    movq 56(%rsp,%rbx), %r13 -; AVX512-NEXT:    shldq %cl, %r8, %r13 -; AVX512-NEXT:    movq 32(%rsp,%rbx), %r9 -; AVX512-NEXT:    movq 40(%rsp,%rbx), %r14 -; AVX512-NEXT:    movq %r14, %r10 -; AVX512-NEXT:    shldq %cl, %r9, %r10 -; AVX512-NEXT:    movq 8(%rsp,%rbx), %r11 -; AVX512-NEXT:    shldq %cl, %r11, %rsi -; AVX512-NEXT:    shldq %cl, %r14, %r8 -; AVX512-NEXT:    movq 16(%rdi), 
%r12 -; AVX512-NEXT:    movq %r12, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT:    movq 48(%rdi), %r14 -; AVX512-NEXT:    movq %r14, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT:    andq %r8, %r14 -; AVX512-NEXT:    andq %rsi, %r12 -; AVX512-NEXT:    orq %r14, %r12 -; AVX512-NEXT:    movq 56(%rdi), %r15 -; AVX512-NEXT:    movq %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT:    andq %r13, %r15 -; AVX512-NEXT:    movq 24(%rdi), %r14 -; AVX512-NEXT:    movq %r14, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT:    andq %rax, %r14 -; AVX512-NEXT:    orq %r15, %r14 -; AVX512-NEXT:    shldq %cl, %rbp, %r9 -; AVX512-NEXT:    movq (%rsp,%rbx), %rdx -; AVX512-NEXT:    movq 32(%rdi), %r15 -; AVX512-NEXT:    movq %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT:    andq %r9, %r15 -; AVX512-NEXT:    shlxq %rcx, %rdx, %rax -; AVX512-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT:    movq (%rdi), %rbx -; AVX512-NEXT:    movq %rbx, %rbp -; AVX512-NEXT:    andq %rax, %rbp -; AVX512-NEXT:    orq %r15, %rbp -; AVX512-NEXT:    orq %r12, %rbp -; AVX512-NEXT:    # kill: def $cl killed $cl killed $rcx -; AVX512-NEXT:    shldq %cl, %rdx, %r11 -; AVX512-NEXT:    movq 40(%rdi), %rax -; AVX512-NEXT:    movq %rax, %rcx -; AVX512-NEXT:    andq %r10, %rcx -; AVX512-NEXT:    movq 8(%rdi), %r15 -; AVX512-NEXT:    movq %r15, %r12 -; AVX512-NEXT:    andq %r11, %r12 -; AVX512-NEXT:    orq %rcx, %r12 -; AVX512-NEXT:    orq %r14, %r12 -; AVX512-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload -; AVX512-NEXT:    xorq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Folded Reload -; AVX512-NEXT:    xorq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Folded Reload -; AVX512-NEXT:    xorq %rax, %r10 -; AVX512-NEXT:    xorq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Folded Reload -; AVX512-NEXT:    xorq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Folded Reload -; AVX512-NEXT:    xorq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Folded Reload -; AVX512-NEXT:    xorq %r15, %r11 -; AVX512-NEXT:    xorq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Folded Reload -; AVX512-NEXT:    orq %rbp, %r12 -; AVX512-NEXT:    movq %r8, 48(%rdi) -; AVX512-NEXT:    movq %r13, 56(%rdi) -; AVX512-NEXT:    movq %r9, 32(%rdi) -; AVX512-NEXT:    movq %r10, 40(%rdi) -; AVX512-NEXT:    movq %rsi, 16(%rdi) -; AVX512-NEXT:    movq %rcx, 24(%rdi) -; AVX512-NEXT:    movq %rbx, (%rdi) -; AVX512-NEXT:    movq %r11, 8(%rdi) -; AVX512-NEXT:    setne %al -; AVX512-NEXT:    addq $72, %rsp -; AVX512-NEXT:    popq %rbx -; AVX512-NEXT:    popq %r12 -; AVX512-NEXT:    popq %r13 -; AVX512-NEXT:    popq %r14 -; AVX512-NEXT:    popq %r15 -; AVX512-NEXT:    popq %rbp -; AVX512-NEXT:    vzeroupper -; AVX512-NEXT:    retq +; X64-LABEL: complement_ne_i512: +; X64:       # %bb.0: +; X64-NEXT:    movl %esi, %ecx +; X64-NEXT:    shrl $3, %ecx +; X64-NEXT:    andl $60, %ecx +; X64-NEXT:    movl (%rdi,%rcx), %edx +; X64-NEXT:    btl %esi, %edx +; X64-NEXT:    setb %al +; X64-NEXT:    btcl %esi, %edx +; X64-NEXT:    movl %edx, (%rdi,%rcx) +; X64-NEXT:    retq    %rem = and i32 %position, 511    %ofs = zext nneg i32 %rem to i512    %bit = shl nuw i512 1, %ofs @@ -2182,606 +720,33 @@ define i1 @complement_ne_i512(ptr %word, i32 %position) nounwind {  define i1 @reset_eq_i512(ptr %word, i32 %position) nounwind {  ; X86-LABEL: reset_eq_i512:  ; X86:       # %bb.0: -; X86-NEXT:    pushl %ebp -; X86-NEXT:    movl %esp, %ebp -; X86-NEXT:    pushl %ebx  ; X86-NEXT:    pushl %edi  ; X86-NEXT:    pushl %esi -; X86-NEXT:    andl $-16, 
%esp -; X86-NEXT:    subl $288, %esp # imm = 0x120 -; X86-NEXT:    movl 12(%ebp), %ecx -; X86-NEXT:    movl %ecx, %eax -; X86-NEXT:    shrl $3, %eax -; X86-NEXT:    andl $60, %eax -; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    leal {{[0-9]+}}(%esp), %edi -; X86-NEXT:    subl %eax, %edi -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $1, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl 4(%edi), %edx -; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl 8(%edi), %eax -; X86-NEXT:    andl $31, %ecx -; X86-NEXT:    movl %eax, %ebx -; X86-NEXT:    shldl %cl, %edx, %ebx -; X86-NEXT:    movl 12(%edi), %edx -; X86-NEXT:    movl %edx, %esi -; X86-NEXT:    shldl %cl, %eax, %esi -; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl 16(%edi), %eax -; X86-NEXT:    movl %eax, %esi -; X86-NEXT:    shldl %cl, %edx, %esi -; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl 20(%edi), %edx -; X86-NEXT:    movl %edx, %esi -; X86-NEXT:    shldl %cl, %eax, %esi -; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl 24(%edi), %eax -; X86-NEXT:    movl %eax, %esi -; X86-NEXT:    shldl %cl, %edx, %esi -; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl 28(%edi), %edx +; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx  ; X86-NEXT:    movl %edx, %esi -; X86-NEXT:    shldl %cl, %eax, %esi -; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl 32(%edi), %eax -; X86-NEXT:    movl %eax, %esi -; X86-NEXT:    shldl %cl, %edx, %esi -; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl 36(%edi), %esi -; X86-NEXT:    movl %esi, %edx -; X86-NEXT:    shldl %cl, %eax, %edx -; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl 40(%edi), %edx -; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    shldl %cl, %esi, %edx -; X86-NEXT:    movl 8(%ebp), %esi -; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl 40(%esi), %eax -; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; 
X86-NEXT:    andl %eax, %edx -; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl 8(%esi), %eax -; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    andl %eax, %ebx -; X86-NEXT:    orl %edx, %ebx -; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl 44(%edi), %edx -; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT:    shldl %cl, %eax, %edx -; X86-NEXT:    movl 52(%edi), %esi -; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl 56(%edi), %ebx -; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    # kill: def $cl killed $cl killed $ecx -; X86-NEXT:    shldl %cl, %esi, %ebx -; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl 8(%ebp), %esi -; X86-NEXT:    movl 56(%esi), %eax -; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    andl %eax, %ebx -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl 24(%esi), %ecx -; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    andl %ecx, %eax -; X86-NEXT:    orl %ebx, %eax -; X86-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload -; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl %esi, %ebx -; X86-NEXT:    movl 44(%esi), %eax -; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    andl %eax, %edx -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl 12(%esi), %ecx -; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    andl %ecx, %eax -; X86-NEXT:    orl %edx, %eax -; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl 60(%edi), %eax -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X86-NEXT:    shldl %cl, %esi, %eax -; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl 60(%ebx), %edx -; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    andl %edx, %eax -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl 28(%ebx), %edx -; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    andl %edx, %esi -; X86-NEXT:    orl %eax, %esi -; X86-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload -; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl (%edi), %esi -; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT:    negl %eax -; X86-NEXT:    movl 256(%esp,%eax), %eax -; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X86-NEXT:    shldl %cl, %edi, %eax -; X86-NEXT:    movl %esi, %edi -; X86-NEXT:    # kill: def $cl killed $cl killed $ecx -; X86-NEXT:    
shll %cl, %edi -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl %ebx, %esi -; X86-NEXT:    movl 32(%ebx), %ecx -; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    andl %ecx, %edx -; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl (%ebx), %ecx -; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    andl %ecx, %edi -; X86-NEXT:    orl %edx, %edi -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl 16(%esi), %ecx -; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    andl %ecx, %ebx -; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl 48(%esi), %ecx -; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    andl %ecx, %eax -; X86-NEXT:    orl %ebx, %eax -; X86-NEXT:    orl %edi, %eax -; X86-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT:    shldl %cl, %edx, %edi -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl 36(%esi), %ebx -; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    andl %ebx, %edx -; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl 4(%esi), %ebx -; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    andl %ebx, %edi -; X86-NEXT:    orl %edx, %edi -; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    # kill: def $cl killed $cl killed $ecx -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X86-NEXT:    shldl %cl, %edi, %edx -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl 20(%esi), %edi -; X86-NEXT:    andl %edi, %ecx -; X86-NEXT:    movl %ecx, %esi -; X86-NEXT:    movl %edx, %ecx -; X86-NEXT:    movl 8(%ebp), %ebx -; X86-NEXT:    movl 52(%ebx), %ebx -; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    andl %ebx, %edx -; X86-NEXT:    orl %esi, %edx -; X86-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload -; X86-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload -; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT:    notl %edx -; X86-NEXT:    andl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload -; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT:    notl %edx -; X86-NEXT:    andl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload -; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT:    notl %edx -; X86-NEXT:    andl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload -; X86-NEXT:    movl %edx, 
{{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X86-NEXT:    notl %ebx -; X86-NEXT:    andl %edi, %ebx -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT:    notl %edx -; X86-NEXT:    andl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload -; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT:    notl %edx -; X86-NEXT:    andl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload -; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT:    notl %edx -; X86-NEXT:    andl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload -; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT:    notl %edx -; X86-NEXT:    andl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload -; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT:    notl %edx -; X86-NEXT:    andl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload -; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT:    notl %edx -; X86-NEXT:    andl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload -; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X86-NEXT:    notl %esi -; X86-NEXT:    andl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT:    notl %edx -; X86-NEXT:    andl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X86-NEXT:    notl %edi -; X86-NEXT:    andl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload -; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X86-NEXT:    notl %edi -; X86-NEXT:    andl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload -; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X86-NEXT:    notl %edi -; X86-NEXT:    andl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload -; X86-NEXT:    notl %ecx -; X86-NEXT:    andl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload -; X86-NEXT:    orl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT:    movl 8(%ebp), %eax -; X86-NEXT:    movl %edx, 60(%eax) -; X86-NEXT:    movl %esi, 56(%eax) -; X86-NEXT:    movl %ecx, 52(%eax) -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT:    movl %ecx, 44(%eax) -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT:    movl %ecx, 40(%eax) -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT:    movl %ecx, 36(%eax) -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT:    movl %ecx, 32(%eax) -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT:    movl %ecx, 28(%eax) -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT:    movl %ecx, 24(%eax) -; X86-NEXT:    movl %ebx, 20(%eax) -; X86-NEXT:    
movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT:    movl %ecx, 16(%eax) -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT:    movl %ecx, 12(%eax) -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT:    movl %ecx, 8(%eax) -; X86-NEXT:    movl %edi, 4(%eax) -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT:    movl %ecx, (%eax) -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT:    movl %ecx, 48(%eax) -; X86-NEXT:    sete %al -; X86-NEXT:    leal -12(%ebp), %esp +; X86-NEXT:    shrl $3, %esi +; X86-NEXT:    andl $60, %esi +; X86-NEXT:    movl (%ecx,%esi), %edi +; X86-NEXT:    btl %edx, %edi +; X86-NEXT:    setae %al +; X86-NEXT:    btrl %edx, %edi +; X86-NEXT:    movl %edi, (%ecx,%esi)  ; X86-NEXT:    popl %esi  ; X86-NEXT:    popl %edi -; X86-NEXT:    popl %ebx -; X86-NEXT:    popl %ebp  ; X86-NEXT:    retl  ; -; SSE-LABEL: reset_eq_i512: -; SSE:       # %bb.0: -; SSE-NEXT:    pushq %rbp -; SSE-NEXT:    pushq %r15 -; SSE-NEXT:    pushq %r14 -; SSE-NEXT:    pushq %r13 -; SSE-NEXT:    pushq %r12 -; SSE-NEXT:    pushq %rbx -; SSE-NEXT:    subq $56, %rsp -; SSE-NEXT:    xorps %xmm0, %xmm0 -; SSE-NEXT:    movups %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT:    movups %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT:    movups %xmm0, -{{[0-9]+}}(%rsp) -; SSE-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp) -; SSE-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp) -; SSE-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp) -; SSE-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp) -; SSE-NEXT:    movq $0, {{[0-9]+}}(%rsp) -; SSE-NEXT:    movq $1, -{{[0-9]+}}(%rsp) -; SSE-NEXT:    movl %esi, %ecx -; SSE-NEXT:    andl $63, %ecx -; SSE-NEXT:    shrl $3, %esi -; SSE-NEXT:    andl $56, %esi -; SSE-NEXT:    negl %esi -; SSE-NEXT:    movslq %esi, %rdx -; SSE-NEXT:    movq (%rsp,%rdx), %r9 -; SSE-NEXT:    movq 8(%rsp,%rdx), %r8 -; SSE-NEXT:    movq %r8, %rsi -; SSE-NEXT:    shldq %cl, %r9, %rsi -; SSE-NEXT:    movq -8(%rsp,%rdx), %rax -; SSE-NEXT:    shldq %cl, %rax, %r9 -; SSE-NEXT:    movq 16(%rsp,%rdx), %r14 -; SSE-NEXT:    movq 24(%rsp,%rdx), %r10 -; SSE-NEXT:    movq %r10, %rbx -; SSE-NEXT:    shldq %cl, %r14, %rbx -; SSE-NEXT:    shldq %cl, %r8, %r14 -; SSE-NEXT:    movq 32(%rsp,%rdx), %r13 -; SSE-NEXT:    movq 40(%rsp,%rdx), %r12 -; SSE-NEXT:    shldq %cl, %r13, %r12 -; SSE-NEXT:    shldq %cl, %r10, %r13 -; SSE-NEXT:    movq -16(%rsp,%rdx), %rdx -; SSE-NEXT:    shldq %cl, %rdx, %rax -; SSE-NEXT:    # kill: def $cl killed $cl killed $ecx -; SSE-NEXT:    shlq %cl, %rdx -; SSE-NEXT:    movq %r13, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT:    movq %r12, %rbp -; SSE-NEXT:    movq %r9, %r15 -; SSE-NEXT:    movq %rsi, %r11 -; SSE-NEXT:    movq 16(%rdi), %r8 -; SSE-NEXT:    movq %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT:    movq 48(%rdi), %rcx -; SSE-NEXT:    movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT:    andq %rcx, %r13 -; SSE-NEXT:    andq %r8, %r9 -; SSE-NEXT:    orq %r13, %r9 -; SSE-NEXT:    movq 56(%rdi), %rcx -; SSE-NEXT:    movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT:    andq %rcx, %r12 -; SSE-NEXT:    movq 24(%rdi), %r10 -; SSE-NEXT:    andq %r10, %rsi -; SSE-NEXT:    orq %r12, %rsi -; SSE-NEXT:    movq %r14, %r13 -; SSE-NEXT:    movq 32(%rdi), %rcx -; SSE-NEXT:    movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT:    andq %rcx, %r14 -; SSE-NEXT:    movq %rdx, %r12 -; SSE-NEXT:    movq (%rdi), %rcx -; SSE-NEXT:    movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 
8-byte Spill -; SSE-NEXT:    andq %rcx, %rdx -; SSE-NEXT:    orq %r14, %rdx -; SSE-NEXT:    orq %r9, %rdx -; SSE-NEXT:    movq %rbx, %r14 -; SSE-NEXT:    movq 40(%rdi), %rcx -; SSE-NEXT:    andq %rcx, %rbx -; SSE-NEXT:    movq %rax, %r9 -; SSE-NEXT:    movq 8(%rdi), %r8 -; SSE-NEXT:    andq %r8, %rax -; SSE-NEXT:    orq %rbx, %rax -; SSE-NEXT:    orq %rsi, %rax -; SSE-NEXT:    notq %r11 -; SSE-NEXT:    andq %r10, %r11 -; SSE-NEXT:    notq %r15 -; SSE-NEXT:    andq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Folded Reload -; SSE-NEXT:    notq %r14 -; SSE-NEXT:    andq %rcx, %r14 -; SSE-NEXT:    notq %r13 -; SSE-NEXT:    andq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Folded Reload -; SSE-NEXT:    notq %rbp -; SSE-NEXT:    andq {{[-0-9]+}}(%r{{[sb]}}p), %rbp # 8-byte Folded Reload -; SSE-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload -; SSE-NEXT:    notq %rcx -; SSE-NEXT:    andq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Folded Reload -; SSE-NEXT:    notq %r9 -; SSE-NEXT:    andq %r8, %r9 -; SSE-NEXT:    notq %r12 -; SSE-NEXT:    andq {{[-0-9]+}}(%r{{[sb]}}p), %r12 # 8-byte Folded Reload -; SSE-NEXT:    orq %rdx, %rax -; SSE-NEXT:    movq %rcx, 48(%rdi) -; SSE-NEXT:    movq %rbp, 56(%rdi) -; SSE-NEXT:    movq %r13, 32(%rdi) -; SSE-NEXT:    movq %r14, 40(%rdi) -; SSE-NEXT:    movq %r15, 16(%rdi) -; SSE-NEXT:    movq %r11, 24(%rdi) -; SSE-NEXT:    movq %r12, (%rdi) -; SSE-NEXT:    movq %r9, 8(%rdi) -; SSE-NEXT:    sete %al -; SSE-NEXT:    addq $56, %rsp -; SSE-NEXT:    popq %rbx -; SSE-NEXT:    popq %r12 -; SSE-NEXT:    popq %r13 -; SSE-NEXT:    popq %r14 -; SSE-NEXT:    popq %r15 -; SSE-NEXT:    popq %rbp -; SSE-NEXT:    retq -; -; AVX2-LABEL: reset_eq_i512: -; AVX2:       # %bb.0: -; AVX2-NEXT:    pushq %rbp -; AVX2-NEXT:    pushq %r15 -; AVX2-NEXT:    pushq %r14 -; AVX2-NEXT:    pushq %r13 -; AVX2-NEXT:    pushq %r12 -; AVX2-NEXT:    pushq %rbx -; AVX2-NEXT:    pushq %rax -; AVX2-NEXT:    vxorps %xmm0, %xmm0, %xmm0 -; AVX2-NEXT:    vmovups %ymm0, -{{[0-9]+}}(%rsp) -; AVX2-NEXT:    vmovups %ymm0, -{{[0-9]+}}(%rsp) -; AVX2-NEXT:    vmovups %ymm0, -{{[0-9]+}}(%rsp) -; AVX2-NEXT:    vmovss {{.*#+}} xmm0 = [1,0,0,0] -; AVX2-NEXT:    vmovups %ymm0, -{{[0-9]+}}(%rsp) -; AVX2-NEXT:    movl %esi, %ecx -; AVX2-NEXT:    andl $63, %ecx -; AVX2-NEXT:    shrl $3, %esi -; AVX2-NEXT:    andl $56, %esi -; AVX2-NEXT:    negl %esi -; AVX2-NEXT:    movslq %esi, %rdx -; AVX2-NEXT:    movq -48(%rsp,%rdx), %r8 -; AVX2-NEXT:    movq -40(%rsp,%rdx), %rbx -; AVX2-NEXT:    movq %rbx, %rax -; AVX2-NEXT:    shldq %cl, %r8, %rax -; AVX2-NEXT:    movq -16(%rsp,%rdx), %r10 -; AVX2-NEXT:    movq -8(%rsp,%rdx), %rsi -; AVX2-NEXT:    shldq %cl, %r10, %rsi -; AVX2-NEXT:    movq -32(%rsp,%rdx), %r11 -; AVX2-NEXT:    movq -24(%rsp,%rdx), %r14 -; AVX2-NEXT:    movq %r14, %r9 -; AVX2-NEXT:    shldq %cl, %r11, %r9 -; AVX2-NEXT:    movq -64(%rsp,%rdx), %r15 -; AVX2-NEXT:    movq -56(%rsp,%rdx), %rdx -; AVX2-NEXT:    shldq %cl, %rdx, %r8 -; AVX2-NEXT:    shldq %cl, %r14, %r10 -; AVX2-NEXT:    shldq %cl, %rbx, %r11 -; AVX2-NEXT:    shldq %cl, %r15, %rdx -; AVX2-NEXT:    shlxq %rcx, %r15, %rcx -; AVX2-NEXT:    movq 24(%rdi), %rbx -; AVX2-NEXT:    movq 56(%rdi), %r14 -; AVX2-NEXT:    movq 16(%rdi), %r15 -; AVX2-NEXT:    movq 48(%rdi), %r13 -; AVX2-NEXT:    movq 32(%rdi), %rbp -; AVX2-NEXT:    andnq %rbp, %r11, %r12 -; AVX2-NEXT:    andq %r11, %rbp -; AVX2-NEXT:    andnq %r13, %r10, %r11 -; AVX2-NEXT:    andq %r10, %r13 -; AVX2-NEXT:    andnq %r15, %r8, %r10 -; AVX2-NEXT:    andq %r8, %r15 -; AVX2-NEXT:    movq 40(%rdi), %r8 -; 
AVX2-NEXT:    orq %r13, %r15 -; AVX2-NEXT:    andnq %r8, %r9, %r13 -; AVX2-NEXT:    andq %r9, %r8 -; AVX2-NEXT:    andnq %r14, %rsi, %r9 -; AVX2-NEXT:    andq %rsi, %r14 -; AVX2-NEXT:    andnq %rbx, %rax, %rsi -; AVX2-NEXT:    andq %rax, %rbx -; AVX2-NEXT:    movq (%rdi), %rax -; AVX2-NEXT:    orq %r14, %rbx -; AVX2-NEXT:    andnq %rax, %rcx, %r14 -; AVX2-NEXT:    andq %rcx, %rax -; AVX2-NEXT:    orq %rbp, %rax -; AVX2-NEXT:    movq 8(%rdi), %rcx -; AVX2-NEXT:    orq %r15, %rax -; AVX2-NEXT:    andnq %rcx, %rdx, %r15 -; AVX2-NEXT:    andq %rdx, %rcx -; AVX2-NEXT:    orq %r8, %rcx -; AVX2-NEXT:    orq %rbx, %rcx -; AVX2-NEXT:    orq %rax, %rcx -; AVX2-NEXT:    movq %r11, 48(%rdi) -; AVX2-NEXT:    movq %r9, 56(%rdi) -; AVX2-NEXT:    movq %r12, 32(%rdi) -; AVX2-NEXT:    movq %r13, 40(%rdi) -; AVX2-NEXT:    movq %r10, 16(%rdi) -; AVX2-NEXT:    movq %rsi, 24(%rdi) -; AVX2-NEXT:    movq %r14, (%rdi) -; AVX2-NEXT:    movq %r15, 8(%rdi) -; AVX2-NEXT:    sete %al -; AVX2-NEXT:    addq $8, %rsp -; AVX2-NEXT:    popq %rbx -; AVX2-NEXT:    popq %r12 -; AVX2-NEXT:    popq %r13 -; AVX2-NEXT:    popq %r14 -; AVX2-NEXT:    popq %r15 -; AVX2-NEXT:    popq %rbp -; AVX2-NEXT:    vzeroupper -; AVX2-NEXT:    retq -; -; AVX512-LABEL: reset_eq_i512: -; AVX512:       # %bb.0: -; AVX512-NEXT:    pushq %rbp -; AVX512-NEXT:    pushq %r15 -; AVX512-NEXT:    pushq %r14 -; AVX512-NEXT:    pushq %r13 -; AVX512-NEXT:    pushq %r12 -; AVX512-NEXT:    pushq %rbx -; AVX512-NEXT:    pushq %rax -; AVX512-NEXT:    vxorps %xmm0, %xmm0, %xmm0 -; AVX512-NEXT:    vmovups %ymm0, -{{[0-9]+}}(%rsp) -; AVX512-NEXT:    vmovups %ymm0, -{{[0-9]+}}(%rsp) -; AVX512-NEXT:    vmovups %ymm0, -{{[0-9]+}}(%rsp) -; AVX512-NEXT:    vmovaps {{.*#+}} xmm0 = [1,0,0,0] -; AVX512-NEXT:    vmovups %ymm0, -{{[0-9]+}}(%rsp) -; AVX512-NEXT:    movl %esi, %ecx -; AVX512-NEXT:    andl $63, %ecx -; AVX512-NEXT:    shrl $3, %esi -; AVX512-NEXT:    andl $56, %esi -; AVX512-NEXT:    negl %esi -; AVX512-NEXT:    movslq %esi, %rbx -; AVX512-NEXT:    movq -48(%rsp,%rbx), %r8 -; AVX512-NEXT:    movq -40(%rsp,%rbx), %r14 -; AVX512-NEXT:    movq %r14, %rax -; AVX512-NEXT:    shldq %cl, %r8, %rax -; AVX512-NEXT:    movq -16(%rsp,%rbx), %r10 -; AVX512-NEXT:    movq -8(%rsp,%rbx), %rsi -; AVX512-NEXT:    shldq %cl, %r10, %rsi -; AVX512-NEXT:    movq -32(%rsp,%rbx), %r11 -; AVX512-NEXT:    movq -24(%rsp,%rbx), %r15 -; AVX512-NEXT:    movq %r15, %r9 -; AVX512-NEXT:    shldq %cl, %r11, %r9 -; AVX512-NEXT:    movq -56(%rsp,%rbx), %rdx -; AVX512-NEXT:    shldq %cl, %rdx, %r8 -; AVX512-NEXT:    shldq %cl, %r15, %r10 -; AVX512-NEXT:    shldq %cl, %r14, %r11 -; AVX512-NEXT:    movq -64(%rsp,%rbx), %rbx -; AVX512-NEXT:    shldq %cl, %rbx, %rdx -; AVX512-NEXT:    shlxq %rcx, %rbx, %rcx -; AVX512-NEXT:    movq 24(%rdi), %rbx -; AVX512-NEXT:    movq 56(%rdi), %r14 -; AVX512-NEXT:    movq 16(%rdi), %r15 -; AVX512-NEXT:    movq 48(%rdi), %r13 -; AVX512-NEXT:    movq 32(%rdi), %rbp -; AVX512-NEXT:    andnq %rbp, %r11, %r12 -; AVX512-NEXT:    andq %r11, %rbp -; AVX512-NEXT:    andnq %r13, %r10, %r11 -; AVX512-NEXT:    andq %r10, %r13 -; AVX512-NEXT:    andnq %r15, %r8, %r10 -; AVX512-NEXT:    andq %r8, %r15 -; AVX512-NEXT:    movq 40(%rdi), %r8 -; AVX512-NEXT:    orq %r13, %r15 -; AVX512-NEXT:    andnq %r8, %r9, %r13 -; AVX512-NEXT:    andq %r9, %r8 -; AVX512-NEXT:    andnq %r14, %rsi, %r9 -; AVX512-NEXT:    andq %rsi, %r14 -; AVX512-NEXT:    andnq %rbx, %rax, %rsi -; AVX512-NEXT:    andq %rax, %rbx -; AVX512-NEXT:    movq (%rdi), %rax -; AVX512-NEXT:    orq %r14, %rbx -; AVX512-NEXT:  
  andnq %rax, %rcx, %r14 -; AVX512-NEXT:    andq %rcx, %rax -; AVX512-NEXT:    orq %rbp, %rax -; AVX512-NEXT:    movq 8(%rdi), %rcx -; AVX512-NEXT:    orq %r15, %rax -; AVX512-NEXT:    andnq %rcx, %rdx, %r15 -; AVX512-NEXT:    andq %rdx, %rcx -; AVX512-NEXT:    orq %r8, %rcx -; AVX512-NEXT:    orq %rbx, %rcx -; AVX512-NEXT:    orq %rax, %rcx -; AVX512-NEXT:    movq %r11, 48(%rdi) -; AVX512-NEXT:    movq %r9, 56(%rdi) -; AVX512-NEXT:    movq %r12, 32(%rdi) -; AVX512-NEXT:    movq %r13, 40(%rdi) -; AVX512-NEXT:    movq %r10, 16(%rdi) -; AVX512-NEXT:    movq %rsi, 24(%rdi) -; AVX512-NEXT:    movq %r14, (%rdi) -; AVX512-NEXT:    movq %r15, 8(%rdi) -; AVX512-NEXT:    sete %al -; AVX512-NEXT:    addq $8, %rsp -; AVX512-NEXT:    popq %rbx -; AVX512-NEXT:    popq %r12 -; AVX512-NEXT:    popq %r13 -; AVX512-NEXT:    popq %r14 -; AVX512-NEXT:    popq %r15 -; AVX512-NEXT:    popq %rbp -; AVX512-NEXT:    vzeroupper -; AVX512-NEXT:    retq +; X64-LABEL: reset_eq_i512: +; X64:       # %bb.0: +; X64-NEXT:    movl %esi, %ecx +; X64-NEXT:    shrl $3, %ecx +; X64-NEXT:    andl $60, %ecx +; X64-NEXT:    movl (%rdi,%rcx), %edx +; X64-NEXT:    btl %esi, %edx +; X64-NEXT:    setae %al +; X64-NEXT:    btrl %esi, %edx +; X64-NEXT:    movl %edx, (%rdi,%rcx) +; X64-NEXT:    retq    %rem = and i32 %position, 511    %ofs = zext nneg i32 %rem to i512    %bit = shl nuw i512 1, %ofs @@ -2797,572 +762,33 @@ define i1 @reset_eq_i512(ptr %word, i32 %position) nounwind {  define i1 @set_ne_i512(ptr %word, i32 %position) nounwind {  ; X86-LABEL: set_ne_i512:  ; X86:       # %bb.0: -; X86-NEXT:    pushl %ebp -; X86-NEXT:    movl %esp, %ebp -; X86-NEXT:    pushl %ebx  ; X86-NEXT:    pushl %edi  ; X86-NEXT:    pushl %esi -; X86-NEXT:    andl $-16, %esp -; X86-NEXT:    subl $272, %esp # imm = 0x110 -; X86-NEXT:    movl 12(%ebp), %ecx -; X86-NEXT:    movl %ecx, %eax -; X86-NEXT:    shrl $3, %eax -; X86-NEXT:    andl $60, %eax -; X86-NEXT:    movl %eax, (%esp) # 4-byte Spill -; X86-NEXT:    leal {{[0-9]+}}(%esp), %edx -; X86-NEXT:    subl %eax, %edx -; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $1, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl 24(%edx), %eax -; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte 
Spill -; X86-NEXT:    movl 28(%edx), %esi -; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    andl $31, %ecx -; X86-NEXT:    shldl %cl, %eax, %esi -; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl 56(%edx), %eax -; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl 60(%edx), %esi -; X86-NEXT:    shldl %cl, %eax, %esi -; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl 8(%edx), %eax -; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl 12(%edx), %esi -; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    shldl %cl, %eax, %esi -; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl 40(%edx), %eax -; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl 44(%edx), %esi -; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    shldl %cl, %eax, %esi -; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl 16(%edx), %eax -; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl 20(%edx), %ebx -; X86-NEXT:    movl %ebx, %esi -; X86-NEXT:    shldl %cl, %eax, %esi -; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl 32(%edx), %edi -; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl 36(%edx), %esi -; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    shldl %cl, %edi, %esi -; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    shldl %cl, %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT:    movl 52(%edx), %esi -; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X86-NEXT:    shldl %cl, %esi, %edi -; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl 4(%edx), %eax -; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X86-NEXT:    shldl %cl, %eax, %ebx -; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT:    shldl %cl, %eax, %esi -; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl 8(%ebp), %edx -; X86-NEXT:    movl 40(%edx), %eax -; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    andl %esi, %eax -; X86-NEXT:    movl 8(%edx), %esi -; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    andl %ebx, %esi -; X86-NEXT:    orl %eax, %esi -; X86-NEXT:    movl %edx, %eax -; X86-NEXT:    movl 56(%edx), %ebx -; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    andl %edi, %ebx -; X86-NEXT:    movl 24(%edx), %edx -; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    andl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload -; X86-NEXT:    orl %ebx, %edx -; X86-NEXT:    orl %esi, %edx -; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl 44(%eax), %esi -; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    andl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded 
Reload -; X86-NEXT:    movl 12(%eax), %ebx -; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    andl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload -; X86-NEXT:    orl %esi, %ebx -; X86-NEXT:    movl 60(%eax), %edx -; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl %eax, %esi -; X86-NEXT:    movl %edx, %eax -; X86-NEXT:    andl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload -; X86-NEXT:    movl 28(%esi), %edx -; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    andl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload -; X86-NEXT:    orl %eax, %edx -; X86-NEXT:    orl %ebx, %edx -; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT:    movl (%eax), %edx -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT:    shldl %cl, %eax, %ebx -; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT:    shldl %cl, %eax, %edi -; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl (%esp), %eax # 4-byte Reload -; X86-NEXT:    negl %eax -; X86-NEXT:    movl 240(%esp,%eax), %esi -; X86-NEXT:    shldl %cl, %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT:    shldl %cl, %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT:    shldl %cl, %eax, %esi -; X86-NEXT:    movl %esi, (%esp) # 4-byte Spill -; X86-NEXT:    # kill: def $cl killed $cl killed $ecx -; X86-NEXT:    shll %cl, %edx -; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl 8(%ebp), %esi -; X86-NEXT:    movl 32(%esi), %eax -; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    andl %edi, %eax -; X86-NEXT:    movl (%esi), %ecx -; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    andl %edx, %ecx -; X86-NEXT:    orl %eax, %ecx -; X86-NEXT:    movl 16(%esi), %eax -; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    andl %ebx, %eax -; X86-NEXT:    movl 48(%esi), %edx -; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    andl (%esp), %edx # 4-byte Folded Reload -; X86-NEXT:    orl %eax, %edx -; X86-NEXT:    orl %ecx, %edx -; X86-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload -; X86-NEXT:    movl 36(%esi), %ebx -; X86-NEXT:    movl %ebx, %eax -; X86-NEXT:    andl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload -; X86-NEXT:    movl 4(%esi), %ecx -; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    andl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload -; X86-NEXT:    orl %eax, %ecx -; X86-NEXT:    movl %esi, %eax -; X86-NEXT:    movl 20(%esi), %edi -; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X86-NEXT:    andl %esi, %edi -; X86-NEXT:    movl 52(%eax), %eax -; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    andl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload -; X86-NEXT:    orl %edi, %eax -; X86-NEXT:    orl %ecx, %eax -; X86-NEXT:    orl 
{{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT:    orl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload -; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT:    orl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload -; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT:    orl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload -; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT:    orl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT:    orl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT:    orl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X86-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X86-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X86-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT:    orl %ecx, (%esp) # 4-byte Folded Spill -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT:    orl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT:    orl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload -; X86-NEXT:    orl %edx, %eax -; X86-NEXT:    movl 8(%ebp), %edx -; X86-NEXT:    movl %ebx, 60(%edx) -; X86-NEXT:    movl %edi, 56(%edx) -; X86-NEXT:    movl %ecx, 52(%edx) -; X86-NEXT:    movl %esi, 44(%edx) -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT:    movl %eax, 40(%edx) -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT:    movl %eax, 36(%edx) -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT:    movl %eax, 32(%edx) -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT:    movl %eax, 28(%edx) -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT:    movl %eax, 24(%edx) -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT:    movl %eax, 20(%edx) -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT:    movl %eax, 16(%edx) -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT:    movl %eax, 12(%edx) -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT:    movl %eax, 8(%edx) -; X86-NEXT:    movl 
{{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT:    movl %eax, 4(%edx) -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT:    movl %eax, (%edx) -; X86-NEXT:    movl (%esp), %eax # 4-byte Reload -; X86-NEXT:    movl %eax, 48(%edx) -; X86-NEXT:    setne %al -; X86-NEXT:    leal -12(%ebp), %esp +; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx +; X86-NEXT:    movl %edx, %esi +; X86-NEXT:    shrl $3, %esi +; X86-NEXT:    andl $60, %esi +; X86-NEXT:    movl (%ecx,%esi), %edi +; X86-NEXT:    btl %edx, %edi +; X86-NEXT:    setb %al +; X86-NEXT:    btsl %edx, %edi +; X86-NEXT:    movl %edi, (%ecx,%esi)  ; X86-NEXT:    popl %esi  ; X86-NEXT:    popl %edi -; X86-NEXT:    popl %ebx -; X86-NEXT:    popl %ebp  ; X86-NEXT:    retl  ; -; SSE-LABEL: set_ne_i512: -; SSE:       # %bb.0: -; SSE-NEXT:    pushq %rbp -; SSE-NEXT:    pushq %r15 -; SSE-NEXT:    pushq %r14 -; SSE-NEXT:    pushq %r13 -; SSE-NEXT:    pushq %r12 -; SSE-NEXT:    pushq %rbx -; SSE-NEXT:    subq $56, %rsp -; SSE-NEXT:    xorps %xmm0, %xmm0 -; SSE-NEXT:    movups %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT:    movups %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT:    movups %xmm0, -{{[0-9]+}}(%rsp) -; SSE-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp) -; SSE-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp) -; SSE-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp) -; SSE-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp) -; SSE-NEXT:    movq $0, {{[0-9]+}}(%rsp) -; SSE-NEXT:    movq $1, -{{[0-9]+}}(%rsp) -; SSE-NEXT:    movl %esi, %ecx -; SSE-NEXT:    andl $63, %ecx -; SSE-NEXT:    shrl $3, %esi -; SSE-NEXT:    andl $56, %esi -; SSE-NEXT:    negl %esi -; SSE-NEXT:    movslq %esi, %rbx -; SSE-NEXT:    movq (%rsp,%rbx), %rsi -; SSE-NEXT:    movq 8(%rsp,%rbx), %r14 -; SSE-NEXT:    movq %r14, %rax -; SSE-NEXT:    shldq %cl, %rsi, %rax -; SSE-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT:    movq 32(%rsp,%rbx), %r8 -; SSE-NEXT:    movq 40(%rsp,%rbx), %rbp -; SSE-NEXT:    shldq %cl, %r8, %rbp -; SSE-NEXT:    movq 16(%rsp,%rbx), %r9 -; SSE-NEXT:    movq 24(%rsp,%rbx), %r15 -; SSE-NEXT:    movq %r15, %r10 -; SSE-NEXT:    shldq %cl, %r9, %r10 -; SSE-NEXT:    movq -8(%rsp,%rbx), %r11 -; SSE-NEXT:    shldq %cl, %r11, %rsi -; SSE-NEXT:    shldq %cl, %r15, %r8 -; SSE-NEXT:    shldq %cl, %r14, %r9 -; SSE-NEXT:    movq -16(%rsp,%rbx), %rbx -; SSE-NEXT:    shldq %cl, %rbx, %r11 -; SSE-NEXT:    # kill: def $cl killed $cl killed $ecx -; SSE-NEXT:    shlq %cl, %rbx -; SSE-NEXT:    movq 24(%rdi), %r15 -; SSE-NEXT:    movq %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT:    movq 56(%rdi), %rcx -; SSE-NEXT:    movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT:    movq 16(%rdi), %r12 -; SSE-NEXT:    movq %r12, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT:    movq 48(%rdi), %r13 -; SSE-NEXT:    movq %r13, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT:    andq %r8, %r13 -; SSE-NEXT:    andq %rsi, %r12 -; SSE-NEXT:    orq %r13, %r12 -; SSE-NEXT:    movq %rcx, %r13 -; SSE-NEXT:    andq %rbp, %r13 -; SSE-NEXT:    andq %rax, %r15 -; SSE-NEXT:    orq %r13, %r15 -; SSE-NEXT:    movq 32(%rdi), %r14 -; SSE-NEXT:    movq %r14, %rcx -; SSE-NEXT:    andq %r9, %rcx -; SSE-NEXT:    movq (%rdi), %r13 -; SSE-NEXT:    movq %r13, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT:    andq %rbx, %r13 -; SSE-NEXT:    orq %rcx, %r13 -; SSE-NEXT:    orq %r12, %r13 -; SSE-NEXT:    movq 40(%rdi), %rcx -; SSE-NEXT:    movq %rcx, %r12 -; SSE-NEXT:    andq %r10, %r12 -; SSE-NEXT:    movq 8(%rdi), %rdx 
-; SSE-NEXT:    movq %rdx, %rax -; SSE-NEXT:    andq %r11, %rax -; SSE-NEXT:    orq %r12, %rax -; SSE-NEXT:    orq %r15, %rax -; SSE-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload -; SSE-NEXT:    orq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Folded Reload -; SSE-NEXT:    orq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Folded Reload -; SSE-NEXT:    orq %rcx, %r10 -; SSE-NEXT:    orq %r14, %r9 -; SSE-NEXT:    orq {{[-0-9]+}}(%r{{[sb]}}p), %rbp # 8-byte Folded Reload -; SSE-NEXT:    orq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Folded Reload -; SSE-NEXT:    orq %rdx, %r11 -; SSE-NEXT:    orq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Folded Reload -; SSE-NEXT:    orq %r13, %rax -; SSE-NEXT:    movq %r8, 48(%rdi) -; SSE-NEXT:    movq %rbp, 56(%rdi) -; SSE-NEXT:    movq %r9, 32(%rdi) -; SSE-NEXT:    movq %r10, 40(%rdi) -; SSE-NEXT:    movq %rsi, 16(%rdi) -; SSE-NEXT:    movq %r15, 24(%rdi) -; SSE-NEXT:    movq %rbx, (%rdi) -; SSE-NEXT:    movq %r11, 8(%rdi) -; SSE-NEXT:    setne %al -; SSE-NEXT:    addq $56, %rsp -; SSE-NEXT:    popq %rbx -; SSE-NEXT:    popq %r12 -; SSE-NEXT:    popq %r13 -; SSE-NEXT:    popq %r14 -; SSE-NEXT:    popq %r15 -; SSE-NEXT:    popq %rbp -; SSE-NEXT:    retq -; -; AVX2-LABEL: set_ne_i512: -; AVX2:       # %bb.0: -; AVX2-NEXT:    pushq %rbp -; AVX2-NEXT:    pushq %r15 -; AVX2-NEXT:    pushq %r14 -; AVX2-NEXT:    pushq %r13 -; AVX2-NEXT:    pushq %r12 -; AVX2-NEXT:    pushq %rbx -; AVX2-NEXT:    subq $72, %rsp -; AVX2-NEXT:    vxorps %xmm0, %xmm0, %xmm0 -; AVX2-NEXT:    vmovups %ymm0, {{[0-9]+}}(%rsp) -; AVX2-NEXT:    vmovups %ymm0, -{{[0-9]+}}(%rsp) -; AVX2-NEXT:    vmovups %ymm0, -{{[0-9]+}}(%rsp) -; AVX2-NEXT:    vmovss {{.*#+}} xmm0 = [1,0,0,0] -; AVX2-NEXT:    vmovups %ymm0, (%rsp) -; AVX2-NEXT:    movl %esi, %ecx -; AVX2-NEXT:    andl $63, %ecx -; AVX2-NEXT:    shrl $3, %esi -; AVX2-NEXT:    andl $56, %esi -; AVX2-NEXT:    negl %esi -; AVX2-NEXT:    movslq %esi, %rbx -; AVX2-NEXT:    movq 16(%rsp,%rbx), %rsi -; AVX2-NEXT:    movq 24(%rsp,%rbx), %rbp -; AVX2-NEXT:    movq %rbp, %rax -; AVX2-NEXT:    shldq %cl, %rsi, %rax -; AVX2-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT:    movq 48(%rsp,%rbx), %r8 -; AVX2-NEXT:    movq 56(%rsp,%rbx), %r13 -; AVX2-NEXT:    shldq %cl, %r8, %r13 -; AVX2-NEXT:    movq 32(%rsp,%rbx), %r9 -; AVX2-NEXT:    movq 40(%rsp,%rbx), %r14 -; AVX2-NEXT:    movq %r14, %r10 -; AVX2-NEXT:    shldq %cl, %r9, %r10 -; AVX2-NEXT:    movq 8(%rsp,%rbx), %r11 -; AVX2-NEXT:    shldq %cl, %r11, %rsi -; AVX2-NEXT:    shldq %cl, %r14, %r8 -; AVX2-NEXT:    movq 16(%rdi), %r12 -; AVX2-NEXT:    movq %r12, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT:    movq 48(%rdi), %r14 -; AVX2-NEXT:    movq %r14, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT:    andq %r8, %r14 -; AVX2-NEXT:    andq %rsi, %r12 -; AVX2-NEXT:    orq %r14, %r12 -; AVX2-NEXT:    movq 56(%rdi), %r15 -; AVX2-NEXT:    movq %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT:    andq %r13, %r15 -; AVX2-NEXT:    movq 24(%rdi), %r14 -; AVX2-NEXT:    movq %r14, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT:    andq %rax, %r14 -; AVX2-NEXT:    orq %r15, %r14 -; AVX2-NEXT:    shldq %cl, %rbp, %r9 -; AVX2-NEXT:    movq (%rsp,%rbx), %rdx -; AVX2-NEXT:    movq 32(%rdi), %r15 -; AVX2-NEXT:    movq %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT:    andq %r9, %r15 -; AVX2-NEXT:    shlxq %rcx, %rdx, %rax -; AVX2-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT:    movq (%rdi), %rbx -; AVX2-NEXT:    movq %rbx, %rbp 
-; AVX2-NEXT:    andq %rax, %rbp -; AVX2-NEXT:    orq %r15, %rbp -; AVX2-NEXT:    orq %r12, %rbp -; AVX2-NEXT:    # kill: def $cl killed $cl killed $rcx -; AVX2-NEXT:    shldq %cl, %rdx, %r11 -; AVX2-NEXT:    movq 40(%rdi), %rax -; AVX2-NEXT:    movq %rax, %rcx -; AVX2-NEXT:    andq %r10, %rcx -; AVX2-NEXT:    movq 8(%rdi), %r15 -; AVX2-NEXT:    movq %r15, %r12 -; AVX2-NEXT:    andq %r11, %r12 -; AVX2-NEXT:    orq %rcx, %r12 -; AVX2-NEXT:    orq %r14, %r12 -; AVX2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload -; AVX2-NEXT:    orq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Folded Reload -; AVX2-NEXT:    orq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Folded Reload -; AVX2-NEXT:    orq %rax, %r10 -; AVX2-NEXT:    orq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Folded Reload -; AVX2-NEXT:    orq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Folded Reload -; AVX2-NEXT:    orq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Folded Reload -; AVX2-NEXT:    orq %r15, %r11 -; AVX2-NEXT:    orq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Folded Reload -; AVX2-NEXT:    orq %rbp, %r12 -; AVX2-NEXT:    movq %r8, 48(%rdi) -; AVX2-NEXT:    movq %r13, 56(%rdi) -; AVX2-NEXT:    movq %r9, 32(%rdi) -; AVX2-NEXT:    movq %r10, 40(%rdi) -; AVX2-NEXT:    movq %rsi, 16(%rdi) -; AVX2-NEXT:    movq %rcx, 24(%rdi) -; AVX2-NEXT:    movq %rbx, (%rdi) -; AVX2-NEXT:    movq %r11, 8(%rdi) -; AVX2-NEXT:    setne %al -; AVX2-NEXT:    addq $72, %rsp -; AVX2-NEXT:    popq %rbx -; AVX2-NEXT:    popq %r12 -; AVX2-NEXT:    popq %r13 -; AVX2-NEXT:    popq %r14 -; AVX2-NEXT:    popq %r15 -; AVX2-NEXT:    popq %rbp -; AVX2-NEXT:    vzeroupper -; AVX2-NEXT:    retq -; -; AVX512-LABEL: set_ne_i512: -; AVX512:       # %bb.0: -; AVX512-NEXT:    pushq %rbp -; AVX512-NEXT:    pushq %r15 -; AVX512-NEXT:    pushq %r14 -; AVX512-NEXT:    pushq %r13 -; AVX512-NEXT:    pushq %r12 -; AVX512-NEXT:    pushq %rbx -; AVX512-NEXT:    subq $72, %rsp -; AVX512-NEXT:    vxorps %xmm0, %xmm0, %xmm0 -; AVX512-NEXT:    vmovups %ymm0, {{[0-9]+}}(%rsp) -; AVX512-NEXT:    vmovups %ymm0, -{{[0-9]+}}(%rsp) -; AVX512-NEXT:    vmovups %ymm0, -{{[0-9]+}}(%rsp) -; AVX512-NEXT:    vmovaps {{.*#+}} xmm0 = [1,0,0,0] -; AVX512-NEXT:    vmovups %ymm0, (%rsp) -; AVX512-NEXT:    movl %esi, %ecx -; AVX512-NEXT:    andl $63, %ecx -; AVX512-NEXT:    shrl $3, %esi -; AVX512-NEXT:    andl $56, %esi -; AVX512-NEXT:    negl %esi -; AVX512-NEXT:    movslq %esi, %rbx -; AVX512-NEXT:    movq 16(%rsp,%rbx), %rsi -; AVX512-NEXT:    movq 24(%rsp,%rbx), %rbp -; AVX512-NEXT:    movq %rbp, %rax -; AVX512-NEXT:    shldq %cl, %rsi, %rax -; AVX512-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT:    movq 48(%rsp,%rbx), %r8 -; AVX512-NEXT:    movq 56(%rsp,%rbx), %r13 -; AVX512-NEXT:    shldq %cl, %r8, %r13 -; AVX512-NEXT:    movq 32(%rsp,%rbx), %r9 -; AVX512-NEXT:    movq 40(%rsp,%rbx), %r14 -; AVX512-NEXT:    movq %r14, %r10 -; AVX512-NEXT:    shldq %cl, %r9, %r10 -; AVX512-NEXT:    movq 8(%rsp,%rbx), %r11 -; AVX512-NEXT:    shldq %cl, %r11, %rsi -; AVX512-NEXT:    shldq %cl, %r14, %r8 -; AVX512-NEXT:    movq 16(%rdi), %r12 -; AVX512-NEXT:    movq %r12, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT:    movq 48(%rdi), %r14 -; AVX512-NEXT:    movq %r14, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT:    andq %r8, %r14 -; AVX512-NEXT:    andq %rsi, %r12 -; AVX512-NEXT:    orq %r14, %r12 -; AVX512-NEXT:    movq 56(%rdi), %r15 -; AVX512-NEXT:    movq %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT:    andq %r13, %r15 -; AVX512-NEXT:    movq 24(%rdi), 
%r14 -; AVX512-NEXT:    movq %r14, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT:    andq %rax, %r14 -; AVX512-NEXT:    orq %r15, %r14 -; AVX512-NEXT:    shldq %cl, %rbp, %r9 -; AVX512-NEXT:    movq (%rsp,%rbx), %rdx -; AVX512-NEXT:    movq 32(%rdi), %r15 -; AVX512-NEXT:    movq %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT:    andq %r9, %r15 -; AVX512-NEXT:    shlxq %rcx, %rdx, %rax -; AVX512-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT:    movq (%rdi), %rbx -; AVX512-NEXT:    movq %rbx, %rbp -; AVX512-NEXT:    andq %rax, %rbp -; AVX512-NEXT:    orq %r15, %rbp -; AVX512-NEXT:    orq %r12, %rbp -; AVX512-NEXT:    # kill: def $cl killed $cl killed $rcx -; AVX512-NEXT:    shldq %cl, %rdx, %r11 -; AVX512-NEXT:    movq 40(%rdi), %rax -; AVX512-NEXT:    movq %rax, %rcx -; AVX512-NEXT:    andq %r10, %rcx -; AVX512-NEXT:    movq 8(%rdi), %r15 -; AVX512-NEXT:    movq %r15, %r12 -; AVX512-NEXT:    andq %r11, %r12 -; AVX512-NEXT:    orq %rcx, %r12 -; AVX512-NEXT:    orq %r14, %r12 -; AVX512-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload -; AVX512-NEXT:    orq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Folded Reload -; AVX512-NEXT:    orq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Folded Reload -; AVX512-NEXT:    orq %rax, %r10 -; AVX512-NEXT:    orq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Folded Reload -; AVX512-NEXT:    orq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Folded Reload -; AVX512-NEXT:    orq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Folded Reload -; AVX512-NEXT:    orq %r15, %r11 -; AVX512-NEXT:    orq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Folded Reload -; AVX512-NEXT:    orq %rbp, %r12 -; AVX512-NEXT:    movq %r8, 48(%rdi) -; AVX512-NEXT:    movq %r13, 56(%rdi) -; AVX512-NEXT:    movq %r9, 32(%rdi) -; AVX512-NEXT:    movq %r10, 40(%rdi) -; AVX512-NEXT:    movq %rsi, 16(%rdi) -; AVX512-NEXT:    movq %rcx, 24(%rdi) -; AVX512-NEXT:    movq %rbx, (%rdi) -; AVX512-NEXT:    movq %r11, 8(%rdi) -; AVX512-NEXT:    setne %al -; AVX512-NEXT:    addq $72, %rsp -; AVX512-NEXT:    popq %rbx -; AVX512-NEXT:    popq %r12 -; AVX512-NEXT:    popq %r13 -; AVX512-NEXT:    popq %r14 -; AVX512-NEXT:    popq %r15 -; AVX512-NEXT:    popq %rbp -; AVX512-NEXT:    vzeroupper -; AVX512-NEXT:    retq +; X64-LABEL: set_ne_i512: +; X64:       # %bb.0: +; X64-NEXT:    movl %esi, %ecx +; X64-NEXT:    shrl $3, %ecx +; X64-NEXT:    andl $60, %ecx +; X64-NEXT:    movl (%rdi,%rcx), %edx +; X64-NEXT:    btl %esi, %edx +; X64-NEXT:    setb %al +; X64-NEXT:    btsl %esi, %edx +; X64-NEXT:    movl %edx, (%rdi,%rcx) +; X64-NEXT:    retq    %rem = and i32 %position, 511    %ofs = zext nneg i32 %rem to i512    %bit = shl nuw i512 1, %ofs @@ -3377,883 +803,55 @@ define i1 @set_ne_i512(ptr %word, i32 %position) nounwind {  define i1 @init_eq_i512(ptr %word, i32 %position, i1 zeroext %value) nounwind {  ; X86-LABEL: init_eq_i512:  ; X86:       # %bb.0: -; X86-NEXT:    pushl %ebp -; X86-NEXT:    movl %esp, %ebp  ; X86-NEXT:    pushl %ebx  ; X86-NEXT:    pushl %edi  ; X86-NEXT:    pushl %esi -; X86-NEXT:    andl $-16, %esp -; X86-NEXT:    subl $432, %esp # imm = 0x1B0 -; X86-NEXT:    movl 12(%ebp), %ecx -; X86-NEXT:    movl %ecx, %edx -; X86-NEXT:    shrl $3, %edx -; X86-NEXT:    andl $60, %edx -; X86-NEXT:    leal {{[0-9]+}}(%esp), %esi -; X86-NEXT:    subl %edx, %esi -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; 
X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $1, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl 56(%esi), %eax -; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl 60(%esi), %eax -; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl 52(%esi), %eax -; X86-NEXT:    movl 48(%esi), %edi -; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl 40(%esi), %edi -; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl 44(%esi), %edi -; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl 36(%esi), %edi -; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl 32(%esi), %edi -; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl 28(%esi), %edi -; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl 24(%esi), %edi -; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl 20(%esi), %edi -; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl 16(%esi), %edi -; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl 12(%esi), %edi -; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl 8(%esi), %edi -; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl (%esi), %edi -; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl 4(%esi), %esi -; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movzbl 16(%ebp), %ebx -; X86-NEXT:    movzbl %bl, %esi -; X86-NEXT:    movl %esi, {{[0-9]+}}(%esp) -; X86-NEXT:    leal {{[0-9]+}}(%esp), %esi -; X86-NEXT:    subl %edx, %esi -; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    andl $31, %ecx -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT:    shldl %cl, %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT:    shldl %cl, %eax, %edx -; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X86-NEXT:    shldl %cl, %ebx, %eax -; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte 
Reload -; X86-NEXT:    movl %edi, %eax -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X86-NEXT:    shldl %cl, %esi, %eax -; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT:    shldl %cl, %edx, %esi -; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT:    shldl %cl, %eax, %edx -; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT:    shldl %cl, %edx, %eax -; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT:    shldl %cl, %eax, %edx -; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT:    shldl %cl, %edx, %eax -; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT:    shldl %cl, %eax, %edx -; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT:    shldl %cl, %edx, %eax -; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X86-NEXT:    shldl %cl, %esi, %edx -; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT:    shldl %cl, %eax, %esi -; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    # kill: def $cl killed $cl killed $ecx -; X86-NEXT:    movl %ebx, %edx -; X86-NEXT:    shldl %cl, %edi, %edx -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl 8(%ebp), %ebx -; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl 48(%ebx), %esi -; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    andl %esi, %edx -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 
4-byte Spill -; X86-NEXT:    movl 16(%ebx), %eax -; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    andl %eax, %esi -; X86-NEXT:    orl %edx, %esi -; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl 40(%ebx), %eax -; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    andl %eax, %edx -; X86-NEXT:    movl %edx, %esi -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl 8(%ebx), %edx -; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    andl %edx, %eax -; X86-NEXT:    orl %esi, %eax -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl 56(%ebx), %esi -; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    andl %esi, %edx -; X86-NEXT:    movl %edx, %edi -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl 24(%ebx), %esi -; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    andl %esi, %edx -; X86-NEXT:    orl %edi, %edx -; X86-NEXT:    orl %eax, %edx -; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl 52(%ebx), %edx -; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    andl %edx, %eax -; X86-NEXT:    movl %eax, %edx -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl 20(%ebx), %esi -; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    andl %esi, %eax -; X86-NEXT:    orl %edx, %eax -; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl 44(%ebx), %edx -; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    andl %edx, %eax -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl 12(%ebx), %esi -; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    andl %esi, %edi -; X86-NEXT:    orl %eax, %edi -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl 60(%ebx), %edx -; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    andl %edx, %esi -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl 28(%ebx), %eax -; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    andl %eax, %edx -; X86-NEXT:    orl %esi, %edx -; X86-NEXT:    orl %edi, %edx -; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT:    movl 
{{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT:    shldl %cl, %eax, %edx -; X86-NEXT:    # kill: def $cl killed $cl killed $ecx -; X86-NEXT:    shll %cl, %eax -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl 32(%ebx), %esi -; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    andl %esi, %ecx -; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl (%ebx), %edi -; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    andl %edi, %eax -; X86-NEXT:    orl %ecx, %eax -; X86-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload -; X86-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload -; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl 36(%ebx), %ecx -; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    andl %ecx, %eax -; X86-NEXT:    movl %eax, %ecx -; X86-NEXT:    movl %edx, %eax -; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl 4(%ebx), %edx -; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    andl %edx, %eax -; X86-NEXT:    orl %ecx, %eax -; X86-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload -; X86-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload -; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT:    notl %edx -; X86-NEXT:    andl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X86-NEXT:    movl 56(%edi), %ebx -; X86-NEXT:    movl 60(%edi), %eax -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT:    shldl %cl, %ebx, %eax -; X86-NEXT:    orl %eax, %edx -; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT:    notl %edx -; X86-NEXT:    andl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload -; X86-NEXT:    movl 52(%edi), %eax -; X86-NEXT:    shldl %cl, %eax, %ebx -; X86-NEXT:    orl %ebx, %edx -; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT:    notl %edx -; X86-NEXT:    andl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload -; X86-NEXT:    movl 48(%edi), %esi -; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    shldl %cl, %esi, %eax -; X86-NEXT:    orl %eax, %edx -; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT:    notl %eax -; X86-NEXT:    andl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload -; X86-NEXT:    movl %eax, %edx -; X86-NEXT:    movl 40(%edi), %ebx -; X86-NEXT:    movl 44(%edi), %eax -; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    shldl %cl, %ebx, %eax -; X86-NEXT:    orl %eax, %edx -; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT: 
   notl %edx -; X86-NEXT:    andl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload -; X86-NEXT:    movl 36(%edi), %eax -; X86-NEXT:    shldl %cl, %eax, %ebx -; X86-NEXT:    orl %ebx, %edx -; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT:    notl %edx -; X86-NEXT:    andl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload -; X86-NEXT:    movl 32(%edi), %ebx -; X86-NEXT:    shldl %cl, %ebx, %eax -; X86-NEXT:    orl %eax, %edx -; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT:    notl %edx -; X86-NEXT:    andl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload -; X86-NEXT:    movl 28(%edi), %eax -; X86-NEXT:    shldl %cl, %eax, %ebx -; X86-NEXT:    orl %ebx, %edx -; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT:    notl %edx -; X86-NEXT:    andl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload -; X86-NEXT:    movl 24(%edi), %ebx -; X86-NEXT:    shldl %cl, %ebx, %eax -; X86-NEXT:    orl %eax, %edx -; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT:    notl %edx -; X86-NEXT:    andl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload -; X86-NEXT:    movl 20(%edi), %eax -; X86-NEXT:    shldl %cl, %eax, %ebx -; X86-NEXT:    orl %ebx, %edx -; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT:    notl %edx -; X86-NEXT:    andl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload -; X86-NEXT:    movl 16(%edi), %ebx -; X86-NEXT:    shldl %cl, %ebx, %eax -; X86-NEXT:    orl %eax, %edx -; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT:    notl %edx -; X86-NEXT:    andl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload -; X86-NEXT:    movl 12(%edi), %eax -; X86-NEXT:    # kill: def $cl killed $cl killed $ecx -; X86-NEXT:    shldl %cl, %eax, %ebx -; X86-NEXT:    orl %ebx, %edx -; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X86-NEXT:    notl %esi -; X86-NEXT:    andl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload -; X86-NEXT:    movl 8(%edi), %edx -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT:    # kill: def $cl killed $cl killed $ecx -; X86-NEXT:    shldl %cl, %edx, %eax -; X86-NEXT:    orl %eax, %esi -; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT:    notl %eax -; X86-NEXT:    andl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload -; X86-NEXT:    movl 4(%edi), %ebx -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT:    shldl %cl, %ebx, %edx -; X86-NEXT:    orl %edx, %eax -; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT:    notl %edx -; X86-NEXT:    andl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload -; X86-NEXT:    # kill: def $cl killed $cl killed $ecx -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT:    movl 
{{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X86-NEXT:    shldl %cl, %esi, %eax -; X86-NEXT:    orl %eax, %edx -; X86-NEXT:    movl %edx, %esi -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT:    notl %edx -; X86-NEXT:    andl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload -; X86-NEXT:    movl (%edi), %eax -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT:    shldl %cl, %eax, %ebx -; X86-NEXT:    orl %ebx, %edx -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X86-NEXT:    notl %edi -; X86-NEXT:    andl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload +; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx +; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT:    movl %ecx, %esi +; X86-NEXT:    shrl $3, %esi +; X86-NEXT:    andl $60, %esi +; X86-NEXT:    movl (%edx,%esi), %edi +; X86-NEXT:    btl %ecx, %edi +; X86-NEXT:    setae %al +; X86-NEXT:    btrl %ecx, %edi +; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ebx  ; X86-NEXT:    # kill: def $cl killed $cl killed $ecx -; X86-NEXT:    shll %cl, %eax -; X86-NEXT:    orl %eax, %edi -; X86-NEXT:    movl %edi, %ecx -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload -; X86-NEXT:    movl 8(%ebp), %eax -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X86-NEXT:    movl %edi, 60(%eax) -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X86-NEXT:    movl %edi, 56(%eax) -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X86-NEXT:    movl %edi, 52(%eax) -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X86-NEXT:    movl %edi, 44(%eax) -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X86-NEXT:    movl %edi, 40(%eax) -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X86-NEXT:    movl %edi, 36(%eax) -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X86-NEXT:    movl %edi, 32(%eax) -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X86-NEXT:    movl %edi, 28(%eax) -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X86-NEXT:    movl %edi, 24(%eax) -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X86-NEXT:    movl %edi, 20(%eax) -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X86-NEXT:    movl %edi, 16(%eax) -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X86-NEXT:    movl %edi, 12(%eax) -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X86-NEXT:    movl %edi, 8(%eax) -; X86-NEXT:    movl %edx, 4(%eax) -; X86-NEXT:    movl %ecx, (%eax) -; X86-NEXT:    movl %esi, 48(%eax) -; X86-NEXT:    sete %al -; X86-NEXT:    leal -12(%ebp), %esp +; X86-NEXT:    shll %cl, %ebx +; X86-NEXT:    orl %edi, %ebx +; X86-NEXT:    movl %ebx, (%edx,%esi)  ; X86-NEXT:    popl %esi  ; X86-NEXT:    popl %edi  ; X86-NEXT:    popl %ebx -; X86-NEXT:    popl %ebp  ; X86-NEXT:    retl  ;  ; SSE-LABEL: init_eq_i512:  ; SSE:       # %bb.0: -; SSE-NEXT:    pushq %rbp -; SSE-NEXT:    pushq %r15 -; SSE-NEXT:    pushq %r14 -; SSE-NEXT:    pushq %r13 -; SSE-NEXT:    pushq %r12 -; SSE-NEXT:    pushq %rbx -; SSE-NEXT:    subq $216, %rsp -; SSE-NEXT:    xorps %xmm0, %xmm0 -; SSE-NEXT:    movups %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT:    movups %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT:    movups %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp) 
-; SSE-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT:    movq $0, {{[0-9]+}}(%rsp) -; SSE-NEXT:    movq $1, {{[0-9]+}}(%rsp)  ; SSE-NEXT:    movl %esi, %ecx -; SSE-NEXT:    andl $63, %ecx  ; SSE-NEXT:    shrl $3, %esi -; SSE-NEXT:    andl $56, %esi -; SSE-NEXT:    negl %esi -; SSE-NEXT:    movslq %esi, %r10 -; SSE-NEXT:    movq 184(%rsp,%r10), %r11 -; SSE-NEXT:    movq 192(%rsp,%r10), %rsi -; SSE-NEXT:    movq %rsi, %r13 -; SSE-NEXT:    shldq %cl, %r11, %r13 -; SSE-NEXT:    movq 200(%rsp,%r10), %r15 -; SSE-NEXT:    shldq %cl, %rsi, %r15 -; SSE-NEXT:    movq 168(%rsp,%r10), %rbx -; SSE-NEXT:    movq 176(%rsp,%r10), %rsi -; SSE-NEXT:    movq %rsi, %r14 -; SSE-NEXT:    shldq %cl, %rbx, %r14 -; SSE-NEXT:    shldq %cl, %rsi, %r11 -; SSE-NEXT:    movq 152(%rsp,%r10), %rax -; SSE-NEXT:    movq 160(%rsp,%r10), %r8 -; SSE-NEXT:    movq %r8, %r12 -; SSE-NEXT:    shldq %cl, %rax, %r12 -; SSE-NEXT:    shldq %cl, %r8, %rbx -; SSE-NEXT:    movq 144(%rsp,%r10), %r9 -; SSE-NEXT:    movq %r9, %r8 -; SSE-NEXT:    shlq %cl, %r8 -; SSE-NEXT:    shldq %cl, %r9, %rax -; SSE-NEXT:    movq %r12, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT:    movq %r13, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT:    movl %edx, %edx -; SSE-NEXT:    movups %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT:    movups %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT:    movups %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT:    movaps %xmm0, (%rsp) -; SSE-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp) -; SSE-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp) -; SSE-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp) -; SSE-NEXT:    movq %rdx, {{[0-9]+}}(%rsp) -; SSE-NEXT:    movq $0, {{[0-9]+}}(%rsp) -; SSE-NEXT:    movq 16(%rdi), %rdx -; SSE-NEXT:    movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT:    movq 48(%rdi), %rsi -; SSE-NEXT:    movq %rsi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT:    andq %rsi, %r13 -; SSE-NEXT:    andq %rdx, %r12 -; SSE-NEXT:    orq %r13, %r12 -; SSE-NEXT:    movq %r15, %rsi -; SSE-NEXT:    movq 56(%rdi), %rdx -; SSE-NEXT:    movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT:    andq %rdx, %r15 -; SSE-NEXT:    movq %rbx, %r13 -; SSE-NEXT:    movq 24(%rdi), %rdx -; SSE-NEXT:    movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT:    andq %rdx, %rbx -; SSE-NEXT:    orq %r15, %rbx -; SSE-NEXT:    movq %r14, %rbp -; SSE-NEXT:    movq 32(%rdi), %rdx -; SSE-NEXT:    movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT:    andq %rdx, %r14 -; SSE-NEXT:    movq %r8, %r15 -; SSE-NEXT:    movq (%rdi), %rdx -; SSE-NEXT:    movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT:    andq %rdx, %r8 -; SSE-NEXT:    orq %r14, %r8 -; SSE-NEXT:    orq %r12, %r8 -; SSE-NEXT:    movq %r11, %r12 -; SSE-NEXT:    movq 40(%rdi), %r9 -; SSE-NEXT:    andq %r9, %r11 -; SSE-NEXT:    movq %rax, %r14 -; SSE-NEXT:    movq 8(%rdi), %rdx -; SSE-NEXT:    movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT:    andq %rdx, %rax -; SSE-NEXT:    orq %r11, %rax -; SSE-NEXT:    orq %rbx, %rax -; SSE-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; SSE-NEXT:    notq %rax -; SSE-NEXT:    andq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Folded Reload -; SSE-NEXT:    movq %rax, %rdx -; SSE-NEXT:    movq 56(%rsp,%r10), %r11 -; SSE-NEXT:    movq 64(%rsp,%r10), %rax -; SSE-NEXT:    movq %rax, %rbx -; SSE-NEXT:    shldq %cl, %r11, %rbx -; SSE-NEXT:    
orq %rbx, %rdx -; SSE-NEXT:    movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT:    notq %rsi -; SSE-NEXT:    movq 72(%rsp,%r10), %rbx -; SSE-NEXT:    shldq %cl, %rax, %rbx -; SSE-NEXT:    andq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Folded Reload -; SSE-NEXT:    orq %rbx, %rsi -; SSE-NEXT:    notq %rbp -; SSE-NEXT:    andq {{[-0-9]+}}(%r{{[sb]}}p), %rbp # 8-byte Folded Reload -; SSE-NEXT:    movq 40(%rsp,%r10), %rax -; SSE-NEXT:    movq 48(%rsp,%r10), %rdx -; SSE-NEXT:    movq %rdx, %rbx -; SSE-NEXT:    shldq %cl, %rax, %rbx -; SSE-NEXT:    orq %rbx, %rbp -; SSE-NEXT:    notq %r12 -; SSE-NEXT:    andq %r9, %r12 -; SSE-NEXT:    shldq %cl, %rdx, %r11 -; SSE-NEXT:    movq 24(%rsp,%r10), %r9 -; SSE-NEXT:    movq 32(%rsp,%r10), %rdx -; SSE-NEXT:    movq %rdx, %rbx -; SSE-NEXT:    shldq %cl, %r9, %rbx -; SSE-NEXT:    orq %r11, %r12 -; SSE-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Reload -; SSE-NEXT:    notq %r11 -; SSE-NEXT:    andq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Folded Reload -; SSE-NEXT:    shldq %cl, %rdx, %rax -; SSE-NEXT:    orq %rbx, %r11 -; SSE-NEXT:    notq %r13 -; SSE-NEXT:    andq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Folded Reload -; SSE-NEXT:    orq %rax, %r13 -; SSE-NEXT:    notq %r15 -; SSE-NEXT:    andq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Folded Reload -; SSE-NEXT:    movq 16(%rsp,%r10), %rax -; SSE-NEXT:    movq %rax, %rdx -; SSE-NEXT:    shlq %cl, %rdx -; SSE-NEXT:    orq %rdx, %r15 -; SSE-NEXT:    notq %r14 -; SSE-NEXT:    andq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Folded Reload -; SSE-NEXT:    # kill: def $cl killed $cl killed $ecx -; SSE-NEXT:    shldq %cl, %rax, %r9 -; SSE-NEXT:    orq %r9, %r14 -; SSE-NEXT:    orq %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill -; SSE-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; SSE-NEXT:    movq %rax, 48(%rdi) -; SSE-NEXT:    movq %rsi, 56(%rdi) -; SSE-NEXT:    movq %rbp, 32(%rdi) -; SSE-NEXT:    movq %r12, 40(%rdi) -; SSE-NEXT:    movq %r11, 16(%rdi) -; SSE-NEXT:    movq %r13, 24(%rdi) -; SSE-NEXT:    movq %r15, (%rdi) -; SSE-NEXT:    movq %r14, 8(%rdi) -; SSE-NEXT:    sete %al -; SSE-NEXT:    addq $216, %rsp -; SSE-NEXT:    popq %rbx -; SSE-NEXT:    popq %r12 -; SSE-NEXT:    popq %r13 -; SSE-NEXT:    popq %r14 -; SSE-NEXT:    popq %r15 -; SSE-NEXT:    popq %rbp +; SSE-NEXT:    andl $60, %esi +; SSE-NEXT:    movl (%rdi,%rsi), %r8d +; SSE-NEXT:    btl %ecx, %r8d +; SSE-NEXT:    setae %al +; SSE-NEXT:    shll %cl, %edx +; SSE-NEXT:    btrl %ecx, %r8d +; SSE-NEXT:    orl %r8d, %edx +; SSE-NEXT:    movl %edx, (%rdi,%rsi)  ; SSE-NEXT:    retq  ; -; AVX2-LABEL: init_eq_i512: -; AVX2:       # %bb.0: -; AVX2-NEXT:    pushq %rbp -; AVX2-NEXT:    pushq %r15 -; AVX2-NEXT:    pushq %r14 -; AVX2-NEXT:    pushq %r13 -; AVX2-NEXT:    pushq %r12 -; AVX2-NEXT:    pushq %rbx -; AVX2-NEXT:    subq $200, %rsp -; AVX2-NEXT:    vxorps %xmm0, %xmm0, %xmm0 -; AVX2-NEXT:    vmovups %ymm0, {{[0-9]+}}(%rsp) -; AVX2-NEXT:    vmovups %ymm0, {{[0-9]+}}(%rsp) -; AVX2-NEXT:    vmovups %ymm0, {{[0-9]+}}(%rsp) -; AVX2-NEXT:    vmovss {{.*#+}} xmm1 = [1,0,0,0] -; AVX2-NEXT:    vmovups %ymm1, {{[0-9]+}}(%rsp) -; AVX2-NEXT:    movl %esi, %r8d -; AVX2-NEXT:    andl $63, %r8d -; AVX2-NEXT:    shrl $3, %esi -; AVX2-NEXT:    andl $56, %esi -; AVX2-NEXT:    negl %esi -; AVX2-NEXT:    movslq %esi, %rsi -; AVX2-NEXT:    movq 144(%rsp,%rsi), %r11 -; AVX2-NEXT:    movq 152(%rsp,%rsi), %r12 -; AVX2-NEXT:    movq %r12, %r10 -; AVX2-NEXT:    movl %r8d, %ecx -; AVX2-NEXT:    shldq %cl, %r11, %r10 -; AVX2-NEXT:    movq 
176(%rsp,%rsi), %r14 -; AVX2-NEXT:    movq 184(%rsp,%rsi), %r9 -; AVX2-NEXT:    shldq %cl, %r14, %r9 -; AVX2-NEXT:    movq 160(%rsp,%rsi), %r15 -; AVX2-NEXT:    movq 168(%rsp,%rsi), %r13 -; AVX2-NEXT:    movq %r13, %rbx -; AVX2-NEXT:    shldq %cl, %r15, %rbx -; AVX2-NEXT:    movq 128(%rsp,%rsi), %rbp -; AVX2-NEXT:    movq %rbp, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT:    movq 136(%rsp,%rsi), %rax -; AVX2-NEXT:    shldq %cl, %rax, %r11 -; AVX2-NEXT:    shldq %cl, %r13, %r14 -; AVX2-NEXT:    shldq %cl, %r12, %r15 -; AVX2-NEXT:    shldq %cl, %rbp, %rax -; AVX2-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT:    movl %edx, %edx -; AVX2-NEXT:    vxorps %xmm1, %xmm1, %xmm1 -; AVX2-NEXT:    vmovups %xmm1, {{[0-9]+}}(%rsp) -; AVX2-NEXT:    vmovups %ymm0, {{[0-9]+}}(%rsp) -; AVX2-NEXT:    vmovups %ymm0, -{{[0-9]+}}(%rsp) -; AVX2-NEXT:    vmovups %ymm0, -{{[0-9]+}}(%rsp) -; AVX2-NEXT:    movq %rdx, (%rsp) -; AVX2-NEXT:    movq $0, {{[0-9]+}}(%rsp) -; AVX2-NEXT:    movq 16(%rdi), %r12 -; AVX2-NEXT:    movq 48(%rdi), %rbp -; AVX2-NEXT:    movq 32(%rdi), %r13 -; AVX2-NEXT:    andnq %r13, %r15, %rax -; AVX2-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT:    andq %r15, %r13 -; AVX2-NEXT:    andnq %rbp, %r14, %rax -; AVX2-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT:    andq %r14, %rbp -; AVX2-NEXT:    andnq %r12, %r11, %rax -; AVX2-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT:    andq %r11, %r12 -; AVX2-NEXT:    movq 40(%rdi), %rax -; AVX2-NEXT:    orq %rbp, %r12 -; AVX2-NEXT:    andnq %rax, %rbx, %rcx -; AVX2-NEXT:    movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT:    movq %rax, %rbp -; AVX2-NEXT:    andq %rbx, %rbp -; AVX2-NEXT:    movq 56(%rdi), %rcx -; AVX2-NEXT:    andnq %rcx, %r9, %rbx -; AVX2-NEXT:    andq %r9, %rcx -; AVX2-NEXT:    movq 24(%rdi), %rax -; AVX2-NEXT:    andnq %rax, %r10, %rdx -; AVX2-NEXT:    movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT:    andq %r10, %rax -; AVX2-NEXT:    orq %rcx, %rax -; AVX2-NEXT:    shlxq %r8, {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Folded Reload -; AVX2-NEXT:    movq (%rdi), %r10 -; AVX2-NEXT:    andnq %r10, %rcx, %r15 -; AVX2-NEXT:    andq %rcx, %r10 -; AVX2-NEXT:    movq 40(%rsp,%rsi), %rdx -; AVX2-NEXT:    movq 48(%rsp,%rsi), %r11 -; AVX2-NEXT:    movq %r11, %r9 -; AVX2-NEXT:    movl %r8d, %ecx -; AVX2-NEXT:    shldq %cl, %rdx, %r9 -; AVX2-NEXT:    orq %r13, %r10 -; AVX2-NEXT:    orq %r12, %r10 -; AVX2-NEXT:    movq 8(%rdi), %r13 -; AVX2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload -; AVX2-NEXT:    andnq %r13, %rcx, %r12 -; AVX2-NEXT:    andq %rcx, %r13 -; AVX2-NEXT:    orq %rbp, %r13 -; AVX2-NEXT:    orq %rax, %r13 -; AVX2-NEXT:    movq 56(%rsp,%rsi), %rax -; AVX2-NEXT:    movl %r8d, %ecx -; AVX2-NEXT:    shldq %cl, %r11, %rax -; AVX2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload -; AVX2-NEXT:    orq %r9, %r14 -; AVX2-NEXT:    orq %rax, %rbx -; AVX2-NEXT:    movq %rbx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT:    movq 24(%rsp,%rsi), %rax -; AVX2-NEXT:    movq 32(%rsp,%rsi), %r9 -; AVX2-NEXT:    movq %r9, %r11 -; AVX2-NEXT:    shldq %cl, %rax, %r11 -; AVX2-NEXT:    shldq %cl, %r9, %rdx -; AVX2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rbp # 8-byte Reload -; AVX2-NEXT:    orq %r11, %rbp -; AVX2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Reload -; AVX2-NEXT:    orq %rdx, %rbx -; AVX2-NEXT:    movq 8(%rsp,%rsi), %rdx -; AVX2-NEXT:    movq 16(%rsp,%rsi), 
%r9 -; AVX2-NEXT:    movq %r9, %r11 -; AVX2-NEXT:    shldq %cl, %rdx, %r11 -; AVX2-NEXT:    shldq %cl, %r9, %rax -; AVX2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Reload -; AVX2-NEXT:    orq %r11, %r9 -; AVX2-NEXT:    movq (%rsp,%rsi), %rsi -; AVX2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Reload -; AVX2-NEXT:    orq %rax, %r11 -; AVX2-NEXT:    shlxq %r8, %rsi, %rax -; AVX2-NEXT:    shldq %cl, %rsi, %rdx -; AVX2-NEXT:    orq %rax, %r15 -; AVX2-NEXT:    orq %rdx, %r12 -; AVX2-NEXT:    orq %r10, %r13 -; AVX2-NEXT:    movq %r14, 48(%rdi) -; AVX2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; AVX2-NEXT:    movq %rax, 56(%rdi) -; AVX2-NEXT:    movq %rbp, 32(%rdi) -; AVX2-NEXT:    movq %rbx, 40(%rdi) -; AVX2-NEXT:    movq %r9, 16(%rdi) -; AVX2-NEXT:    movq %r11, 24(%rdi) -; AVX2-NEXT:    movq %r15, (%rdi) -; AVX2-NEXT:    movq %r12, 8(%rdi) -; AVX2-NEXT:    sete %al -; AVX2-NEXT:    addq $200, %rsp -; AVX2-NEXT:    popq %rbx -; AVX2-NEXT:    popq %r12 -; AVX2-NEXT:    popq %r13 -; AVX2-NEXT:    popq %r14 -; AVX2-NEXT:    popq %r15 -; AVX2-NEXT:    popq %rbp -; AVX2-NEXT:    vzeroupper -; AVX2-NEXT:    retq -; -; AVX512-LABEL: init_eq_i512: -; AVX512:       # %bb.0: -; AVX512-NEXT:    pushq %rbp -; AVX512-NEXT:    pushq %r15 -; AVX512-NEXT:    pushq %r14 -; AVX512-NEXT:    pushq %r13 -; AVX512-NEXT:    pushq %r12 -; AVX512-NEXT:    pushq %rbx -; AVX512-NEXT:    subq $184, %rsp -; AVX512-NEXT:    vxorps %xmm0, %xmm0, %xmm0 -; AVX512-NEXT:    vmovups %ymm0, {{[0-9]+}}(%rsp) -; AVX512-NEXT:    vmovups %ymm0, {{[0-9]+}}(%rsp) -; AVX512-NEXT:    vmovups %ymm0, {{[0-9]+}}(%rsp) -; AVX512-NEXT:    vmovaps {{.*#+}} xmm1 = [1,0,0,0] -; AVX512-NEXT:    vmovups %ymm1, {{[0-9]+}}(%rsp) -; AVX512-NEXT:    movl %esi, %ecx -; AVX512-NEXT:    andl $63, %ecx -; AVX512-NEXT:    shrl $3, %esi -; AVX512-NEXT:    andl $56, %esi -; AVX512-NEXT:    negl %esi -; AVX512-NEXT:    movslq %esi, %rsi -; AVX512-NEXT:    movq 128(%rsp,%rsi), %r10 -; AVX512-NEXT:    movq 136(%rsp,%rsi), %r12 -; AVX512-NEXT:    movq %r12, %rax -; AVX512-NEXT:    shldq %cl, %r10, %rax -; AVX512-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT:    movq 160(%rsp,%rsi), %r14 -; AVX512-NEXT:    movq 168(%rsp,%rsi), %rax -; AVX512-NEXT:    shldq %cl, %r14, %rax -; AVX512-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT:    movq 144(%rsp,%rsi), %r15 -; AVX512-NEXT:    movq 152(%rsp,%rsi), %r11 -; AVX512-NEXT:    movq %r11, %rbx -; AVX512-NEXT:    shldq %cl, %r15, %rbx -; AVX512-NEXT:    movq 120(%rsp,%rsi), %rax -; AVX512-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT:    shldq %cl, %rax, %r10 -; AVX512-NEXT:    shldq %cl, %r11, %r14 -; AVX512-NEXT:    movq %rdi, %r9 -; AVX512-NEXT:    movq 112(%rsp,%rsi), %r11 -; AVX512-NEXT:    shldq %cl, %r12, %r15 -; AVX512-NEXT:    movl %edx, %edx -; AVX512-NEXT:    vxorps %xmm1, %xmm1, %xmm1 -; AVX512-NEXT:    vmovups %xmm1, {{[0-9]+}}(%rsp) -; AVX512-NEXT:    vmovups %ymm0, -{{[0-9]+}}(%rsp) -; AVX512-NEXT:    vmovups %ymm0, -{{[0-9]+}}(%rsp) -; AVX512-NEXT:    vmovups %ymm0, -{{[0-9]+}}(%rsp) -; AVX512-NEXT:    movq %rdx, -{{[0-9]+}}(%rsp) -; AVX512-NEXT:    movq $0, {{[0-9]+}}(%rsp) -; AVX512-NEXT:    movq 16(%rdi), %r12 -; AVX512-NEXT:    movq 48(%rdi), %r13 -; AVX512-NEXT:    movq 32(%rdi), %rbp -; AVX512-NEXT:    andnq %rbp, %r15, %rax -; AVX512-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT:    andq %r15, %rbp -; AVX512-NEXT:    andnq %r13, %r14, %rax -; 
AVX512-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT:    andq %r14, %r13 -; AVX512-NEXT:    andnq %r12, %r10, %rax -; AVX512-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT:    andq %r10, %r12 -; AVX512-NEXT:    movq 40(%rdi), %r8 -; AVX512-NEXT:    orq %r13, %r12 -; AVX512-NEXT:    andnq %r8, %rbx, %rdi -; AVX512-NEXT:    andq %rbx, %r8 -; AVX512-NEXT:    movq 56(%r9), %r13 -; AVX512-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload -; AVX512-NEXT:    andnq %r13, %rdx, %r10 -; AVX512-NEXT:    andq %rdx, %r13 -; AVX512-NEXT:    movq 24(%r9), %rax -; AVX512-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload -; AVX512-NEXT:    andnq %rax, %rdx, %r15 -; AVX512-NEXT:    andq %rdx, %rax -; AVX512-NEXT:    orq %r13, %rax -; AVX512-NEXT:    shlxq %rcx, %r11, %r13 -; AVX512-NEXT:    movq (%r9), %rdx -; AVX512-NEXT:    andnq %rdx, %r13, %r14 -; AVX512-NEXT:    andq %r13, %rdx -; AVX512-NEXT:    orq %rbp, %rdx -; AVX512-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rbp # 8-byte Reload -; AVX512-NEXT:    shldq %cl, %r11, %rbp -; AVX512-NEXT:    orq %r12, %rdx -; AVX512-NEXT:    movq 8(%r9), %r13 -; AVX512-NEXT:    andnq %r13, %rbp, %rbx -; AVX512-NEXT:    andq %rbp, %r13 -; AVX512-NEXT:    orq %r8, %r13 -; AVX512-NEXT:    movq 24(%rsp,%rsi), %r8 -; AVX512-NEXT:    orq %rax, %r13 -; AVX512-NEXT:    movq 32(%rsp,%rsi), %rax -; AVX512-NEXT:    movq %rax, %r12 -; AVX512-NEXT:    shldq %cl, %r8, %r12 -; AVX512-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Reload -; AVX512-NEXT:    orq %r12, %r11 -; AVX512-NEXT:    movq 40(%rsp,%rsi), %r12 -; AVX512-NEXT:    shldq %cl, %rax, %r12 -; AVX512-NEXT:    orq %r12, %r10 -; AVX512-NEXT:    movq %r10, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT:    movq 8(%rsp,%rsi), %rax -; AVX512-NEXT:    movq 16(%rsp,%rsi), %r12 -; AVX512-NEXT:    movq %r12, %rbp -; AVX512-NEXT:    shldq %cl, %rax, %rbp -; AVX512-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload -; AVX512-NEXT:    orq %rbp, %r10 -; AVX512-NEXT:    shldq %cl, %r12, %r8 -; AVX512-NEXT:    orq %r8, %rdi -; AVX512-NEXT:    movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT:    movq -8(%rsp,%rsi), %r8 -; AVX512-NEXT:    movq (%rsp,%rsi), %r12 -; AVX512-NEXT:    movq %r12, %rbp -; AVX512-NEXT:    shldq %cl, %r8, %rbp -; AVX512-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload -; AVX512-NEXT:    orq %rbp, %rdi -; AVX512-NEXT:    movq -16(%rsp,%rsi), %rsi -; AVX512-NEXT:    shldq %cl, %r12, %rax -; AVX512-NEXT:    orq %rax, %r15 -; AVX512-NEXT:    shlxq %rcx, %rsi, %rax -; AVX512-NEXT:    # kill: def $cl killed $cl killed $rcx -; AVX512-NEXT:    shldq %cl, %rsi, %r8 -; AVX512-NEXT:    orq %rax, %r14 -; AVX512-NEXT:    orq %r8, %rbx -; AVX512-NEXT:    orq %rdx, %r13 -; AVX512-NEXT:    movq %r11, 48(%r9) -; AVX512-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; AVX512-NEXT:    movq %rax, 56(%r9) -; AVX512-NEXT:    movq %r10, 32(%r9) -; AVX512-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; AVX512-NEXT:    movq %rax, 40(%r9) -; AVX512-NEXT:    movq %rdi, 16(%r9) -; AVX512-NEXT:    movq %r15, 24(%r9) -; AVX512-NEXT:    movq %r14, (%r9) -; AVX512-NEXT:    movq %rbx, 8(%r9) -; AVX512-NEXT:    sete %al -; AVX512-NEXT:    addq $184, %rsp -; AVX512-NEXT:    popq %rbx -; AVX512-NEXT:    popq %r12 -; AVX512-NEXT:    popq %r13 -; AVX512-NEXT:    popq %r14 -; AVX512-NEXT:    popq %r15 -; AVX512-NEXT:    popq %rbp -; AVX512-NEXT:    vzeroupper -; AVX512-NEXT:    retq +; 
AVX-LABEL: init_eq_i512: +; AVX:       # %bb.0: +; AVX-NEXT:    movl %esi, %ecx +; AVX-NEXT:    shrl $3, %ecx +; AVX-NEXT:    andl $60, %ecx +; AVX-NEXT:    movl (%rdi,%rcx), %r8d +; AVX-NEXT:    btl %esi, %r8d +; AVX-NEXT:    setae %al +; AVX-NEXT:    btrl %esi, %r8d +; AVX-NEXT:    shlxl %esi, %edx, %edx +; AVX-NEXT:    orl %r8d, %edx +; AVX-NEXT:    movl %edx, (%rdi,%rcx) +; AVX-NEXT:    retq    %rem = and i32 %position, 511    %ofs = zext nneg i32 %rem to i512    %bit = shl nuw i512 1, %ofs @@ -4274,2749 +872,25 @@ define i1 @init_eq_i512(ptr %word, i32 %position, i1 zeroext %value) nounwind {  define i1 @test_ne_i4096(ptr %word, i32 %position) nounwind {  ; X86-LABEL: test_ne_i4096:  ; X86:       # %bb.0: -; X86-NEXT:    pushl %ebp -; X86-NEXT:    movl %esp, %ebp -; X86-NEXT:    pushl %ebx -; X86-NEXT:    pushl %edi -; X86-NEXT:    pushl %esi -; X86-NEXT:    andl $-16, %esp -; X86-NEXT:    subl $1792, %esp # imm = 0x700 -; X86-NEXT:    movl 12(%ebp), %ebx -; X86-NEXT:    movl %ebx, %ecx -; X86-NEXT:    shrl $3, %ecx -; X86-NEXT:    andl $508, %ecx # imm = 0x1FC -; X86-NEXT:    leal {{[0-9]+}}(%esp), %esi -; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    subl %ecx, %esi -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    
movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $1, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) 
-; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, 
{{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) -; X86-NEXT:    movl 248(%esi), %eax -; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl 252(%esi), %edx -; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    andl $31, %ebx -; X86-NEXT:    movl %ebx, %ecx -; X86-NEXT:    shldl %cl, %eax, %edx -; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl 504(%esi), %eax -; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl 508(%esi), %edx -; X86-NEXT:    shldl %cl, %eax, %edx -; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl 120(%esi), %eax -; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl 124(%esi), %edx -; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    shldl %cl, %eax, %edx -; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl 376(%esi), %eax -; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl 380(%esi), %edx -; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    shldl %cl, %eax, %edx -; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl 184(%esi), %eax -; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl 188(%esi), %edx -; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    shldl %cl, %eax, %edx -; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl 440(%esi), %edx -; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl 444(%esi), %eax -; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    shldl %cl, %edx, %eax -; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl 56(%esi), %edx -; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl 60(%esi), %eax -; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    shldl %cl, %edx, %eax -; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl 312(%esi), %edx -; X86-NEXT:  
  movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl 316(%esi), %eax -; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    shldl %cl, %edx, %eax -; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl 216(%esi), %edx -; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl 220(%esi), %eax -; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    shldl %cl, %edx, %eax -; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl 472(%esi), %edi -; X86-NEXT:    movl 476(%esi), %eax -; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    shldl %cl, %edi, %eax -; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl 88(%esi), %edx -; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl 92(%esi), %eax -; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    shldl %cl, %edx, %eax -; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl 344(%esi), %edx -; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl 348(%esi), %eax -; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    shldl %cl, %edx, %eax -; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl 152(%esi), %edx -; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl 156(%esi), %eax -; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    shldl %cl, %edx, %eax -; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl 408(%esi), %edx -; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl 412(%esi), %eax -; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    shldl %cl, %edx, %eax -; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl 24(%esi), %edx -; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl 28(%esi), %eax -; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    shldl %cl, %edx, %eax -; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl 280(%esi), %edx -; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl 284(%esi), %eax -; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    shldl %cl, %edx, %eax -; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl 232(%esi), %edx -; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl 236(%esi), %eax -; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    shldl %cl, %edx, %eax -; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl 488(%esi), %edx -; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl 492(%esi), %eax -; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    shldl %cl, %edx, %eax -; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl 104(%esi), %edx -; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl 108(%esi), %eax -; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    shldl 
%cl, %edx, %eax -; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl 360(%esi), %edx -; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl 364(%esi), %eax -; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    shldl %cl, %edx, %eax -; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl 168(%esi), %edx -; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl 172(%esi), %eax -; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    shldl %cl, %edx, %eax -; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl 424(%esi), %edx -; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl 428(%esi), %eax -; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    shldl %cl, %edx, %eax -; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl 40(%esi), %edx -; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl 44(%esi), %eax -; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    shldl %cl, %edx, %eax -; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl 296(%esi), %edx -; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl 300(%esi), %eax -; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    shldl %cl, %edx, %eax -; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl 200(%esi), %edx -; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl 204(%esi), %eax -; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    shldl %cl, %edx, %eax -; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl 456(%esi), %edx -; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl 460(%esi), %eax -; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    shldl %cl, %edx, %eax -; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl 72(%esi), %edx -; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl 76(%esi), %eax -; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    shldl %cl, %edx, %eax -; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl 328(%esi), %edx -; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl 332(%esi), %eax -; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    shldl %cl, %edx, %eax -; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl 136(%esi), %edx -; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl 140(%esi), %eax -; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    shldl %cl, %edx, %eax -; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl 392(%esi), %edx -; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl 396(%esi), %eax -; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    shldl %cl, %edx, %eax -; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl 8(%esi), 
%edx -; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl 12(%esi), %eax -; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    shldl %cl, %edx, %eax -; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl 264(%esi), %edx -; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl 268(%esi), %eax -; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    shldl %cl, %edx, %eax -; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl 240(%esi), %edx -; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl 244(%esi), %eax -; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    shldl %cl, %edx, %eax -; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl 496(%esi), %edx -; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl 500(%esi), %eax -; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    shldl %cl, %edx, %eax -; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl 112(%esi), %edx -; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl 116(%esi), %eax -; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    shldl %cl, %edx, %eax -; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl 368(%esi), %edx -; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl 372(%esi), %eax -; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    shldl %cl, %edx, %eax -; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl 176(%esi), %edx -; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl 180(%esi), %eax -; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    shldl %cl, %edx, %eax -; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl 432(%esi), %edx -; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl 436(%esi), %eax -; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    shldl %cl, %edx, %eax -; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl 48(%esi), %edx -; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl 52(%esi), %eax -; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    shldl %cl, %edx, %eax -; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl 304(%esi), %edx -; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl 308(%esi), %eax -; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    shldl %cl, %edx, %eax -; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl 208(%esi), %edx -; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl 212(%esi), %eax -; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    shldl %cl, %edx, %eax -; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl 464(%esi), %edx -; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl 468(%esi), %eax -; 
X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    shldl %cl, %edx, %eax -; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl 80(%esi), %edx -; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl 84(%esi), %eax -; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    shldl %cl, %edx, %eax -; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl 336(%esi), %edx -; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl 340(%esi), %eax -; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    shldl %cl, %edx, %eax -; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl 144(%esi), %edx -; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl 148(%esi), %eax -; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    shldl %cl, %edx, %eax -; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl 400(%esi), %edx -; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl 404(%esi), %eax -; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    shldl %cl, %edx, %eax -; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl 16(%esi), %edx -; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl 20(%esi), %eax -; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    shldl %cl, %edx, %eax -; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl 272(%esi), %edx -; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl 276(%esi), %eax -; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    shldl %cl, %edx, %eax -; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl 224(%esi), %edx -; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl 228(%esi), %eax -; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    shldl %cl, %edx, %eax -; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl 480(%esi), %edx -; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl 484(%esi), %eax -; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    shldl %cl, %edx, %eax -; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl 96(%esi), %edx -; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl 100(%esi), %eax -; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    shldl %cl, %edx, %eax -; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl 352(%esi), %edx -; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl 356(%esi), %eax -; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    shldl %cl, %edx, %eax -; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl 160(%esi), %edx -; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl 164(%esi), %eax -; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    shldl %cl, %edx, %eax -; X86-NEXT:   
 movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl 416(%esi), %edx -; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl 420(%esi), %eax -; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    shldl %cl, %edx, %eax -; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl 32(%esi), %edx -; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl 36(%esi), %eax -; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    shldl %cl, %edx, %eax -; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl 288(%esi), %edx -; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl 292(%esi), %eax -; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    shldl %cl, %edx, %eax -; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl 192(%esi), %eax -; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl 196(%esi), %edx -; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    shldl %cl, %eax, %edx -; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl 448(%esi), %edx -; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl 452(%esi), %eax -; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    shldl %cl, %edx, %eax -; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl 64(%esi), %edx -; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl 68(%esi), %eax -; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    shldl %cl, %edx, %eax -; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl 320(%esi), %edx -; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl 324(%esi), %eax -; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    shldl %cl, %edx, %eax -; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl 128(%esi), %edx -; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl 132(%esi), %eax -; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    shldl %cl, %edx, %eax -; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl %edi, %edx -; X86-NEXT:    movl 256(%esi), %edi -; X86-NEXT:    movl 260(%esi), %eax -; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    shldl %cl, %edi, %eax -; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X86-NEXT:    shldl %cl, %edi, %eax -; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT:    shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT:    shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT:    shldl %cl, %eax, 
{{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT:    shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT:    shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT:    shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT:    shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT:    shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT:    shldl %cl, %eax, %edx -; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT:    shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT:    shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT:    shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT:    shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT:    shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT:    shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT:    shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT:    shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT:    shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT:    shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT:    shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT:    shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT:    shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT:    shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT:    shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT:    shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT:    shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; 
X86-NEXT:    shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT:    shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT:    movl 388(%esi), %edx -; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    shldl %cl, %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT:    movl 4(%esi), %edx -; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    shldl %cl, %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT:    shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT:    shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT:    shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT:    shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT:    shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT:    shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT:    shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT:    shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT:    shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT:    shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT:    shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT:    shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT:    shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT:    shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT:    shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT:    shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT:    shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT:    shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT:    shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT:    shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; 
X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT:    shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT:    shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT:    shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT:    shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT:    shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT:    shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT:    shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT:    shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT:    shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT:    shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT:    shrdl $1, %eax, %edi -; X86-NEXT:    shrl %eax -; X86-NEXT:    movl %ebx, %edx -; X86-NEXT:    movl %eax, %ebx -; X86-NEXT:    notb %cl -; X86-NEXT:    shrdl %cl, %eax, %edi -; X86-NEXT:    shrl %cl, %ebx -; X86-NEXT:    movb $32, %cl -; X86-NEXT:    testb %cl, %cl -; X86-NEXT:    movl (%esi), %eax -; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl 8(%ebp), %eax -; X86-NEXT:    jne .LBB20_2 -; X86-NEXT:  # %bb.1: -; X86-NEXT:    movl %edi, %ebx -; X86-NEXT:  .LBB20_2: -; X86-NEXT:    movl %edx, %ecx -; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    # kill: def $cl killed $cl killed $ecx -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT:    shll %cl, %edx -; X86-NEXT:    orl %ebx, %edx -; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT:    andl 320(%eax), %ecx -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X86-NEXT:    andl 64(%eax), %esi -; X86-NEXT:    orl %ecx, %esi -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT:    andl 448(%eax), %edx -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT:    andl 192(%eax), %ecx -; X86-NEXT:    orl %edx, %ecx -; X86-NEXT:    orl %esi, %ecx -; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT:    andl 288(%eax), %ecx -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT:    andl 32(%eax), %edx -; X86-NEXT:    orl %ecx, %edx -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT:    andl 416(%eax), %ecx -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X86-NEXT:    andl 160(%eax), %edi -; X86-NEXT:    orl %ecx, %edi -; 
X86-NEXT:    orl %edx, %edi -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT:    andl 352(%eax), %ecx -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X86-NEXT:    andl 96(%eax), %esi -; X86-NEXT:    orl %ecx, %esi -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT:    andl 480(%eax), %edx -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT:    andl 224(%eax), %ecx -; X86-NEXT:    orl %edx, %ecx -; X86-NEXT:    orl %esi, %ecx -; X86-NEXT:    orl %edi, %ecx -; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT:    andl 272(%eax), %ecx -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT:    andl 16(%eax), %edx -; X86-NEXT:    orl %ecx, %edx -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT:    andl 400(%eax), %ecx -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X86-NEXT:    andl 144(%eax), %esi -; X86-NEXT:    orl %ecx, %esi -; X86-NEXT:    orl %edx, %esi -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT:    andl 336(%eax), %ecx -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT:    andl 80(%eax), %edx -; X86-NEXT:    orl %ecx, %edx -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT:    andl 464(%eax), %ecx -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X86-NEXT:    andl 208(%eax), %edi -; X86-NEXT:    orl %ecx, %edi -; X86-NEXT:    orl %edx, %edi -; X86-NEXT:    orl %esi, %edi -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT:    andl 304(%eax), %ecx -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT:    andl 48(%eax), %edx -; X86-NEXT:    orl %ecx, %edx -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT:    andl 432(%eax), %ecx -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X86-NEXT:    andl 176(%eax), %esi -; X86-NEXT:    orl %ecx, %esi -; X86-NEXT:    orl %edx, %esi -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT:    andl 368(%eax), %ecx -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT:    andl 112(%eax), %edx -; X86-NEXT:    orl %ecx, %edx -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT:    andl 496(%eax), %ecx -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X86-NEXT:    andl 240(%eax), %ebx -; X86-NEXT:    orl %ecx, %ebx -; X86-NEXT:    orl %edx, %ebx -; X86-NEXT:    orl %esi, %ebx -; X86-NEXT:    orl %edi, %ebx -; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT:    andl 264(%eax), %ecx -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT:    andl 8(%eax), %edx -; X86-NEXT:    orl %ecx, %edx -; X86-NEXT:    movl %eax, %ebx -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT:    andl 392(%ebx), %eax -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT:    andl 136(%ebx), %ecx -; X86-NEXT:    orl %eax, %ecx -; X86-NEXT:    orl %edx, %ecx -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT:    andl 328(%ebx), %edx -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 
4-byte Reload -; X86-NEXT:    andl 72(%ebx), %eax -; X86-NEXT:    orl %edx, %eax -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X86-NEXT:    andl 456(%ebx), %edi -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X86-NEXT:    andl 200(%ebx), %esi -; X86-NEXT:    orl %edi, %esi -; X86-NEXT:    orl %eax, %esi -; X86-NEXT:    orl %ecx, %esi -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X86-NEXT:    andl 296(%ebx), %edi -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT:    andl 40(%ebx), %eax -; X86-NEXT:    orl %edi, %eax -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X86-NEXT:    andl 424(%ebx), %edi -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT:    andl 168(%ebx), %edx -; X86-NEXT:    orl %edi, %edx -; X86-NEXT:    orl %eax, %edx -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT:    andl 360(%ebx), %ecx -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT:    andl 104(%ebx), %eax -; X86-NEXT:    orl %ecx, %eax -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT:    andl 488(%ebx), %ecx -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X86-NEXT:    andl 232(%ebx), %edi -; X86-NEXT:    orl %ecx, %edi -; X86-NEXT:    orl %eax, %edi -; X86-NEXT:    orl %edx, %edi -; X86-NEXT:    orl %esi, %edi -; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT:    andl 280(%ebx), %eax -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT:    andl 24(%ebx), %ecx -; X86-NEXT:    orl %eax, %ecx -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT:    andl 408(%ebx), %eax -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT:    andl 152(%ebx), %edx -; X86-NEXT:    orl %eax, %edx -; X86-NEXT:    orl %ecx, %edx -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT:    andl 344(%ebx), %eax -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT:    andl 88(%ebx), %ecx -; X86-NEXT:    orl %eax, %ecx -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT:    andl 472(%ebx), %eax -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X86-NEXT:    andl 216(%ebx), %esi -; X86-NEXT:    orl %eax, %esi -; X86-NEXT:    orl %ecx, %esi -; X86-NEXT:    orl %edx, %esi -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT:    andl 312(%ebx), %eax -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT:    andl 56(%ebx), %ecx -; X86-NEXT:    orl %eax, %ecx -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT:    andl 440(%ebx), %eax -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT:    andl 184(%ebx), %edx -; X86-NEXT:    orl %eax, %edx -; X86-NEXT:    orl %ecx, %edx -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT:    andl 376(%ebx), %eax -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT:    andl 120(%ebx), %ecx -; X86-NEXT:    orl %eax, %ecx -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT:    andl 504(%ebx), %eax -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X86-NEXT:    andl 248(%ebx), 
%edi -; X86-NEXT:    orl %eax, %edi -; X86-NEXT:    orl %ecx, %edi -; X86-NEXT:    orl %edx, %edi -; X86-NEXT:    orl %esi, %edi -; X86-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload -; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT:    andl 324(%ebx), %eax -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT:    andl 68(%ebx), %ecx -; X86-NEXT:    orl %eax, %ecx -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT:    andl 452(%ebx), %eax -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT:    andl 196(%ebx), %edx -; X86-NEXT:    orl %eax, %edx -; X86-NEXT:    orl %ecx, %edx -; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT:    andl 292(%ebx), %eax -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT:    andl 36(%ebx), %ecx -; X86-NEXT:    orl %eax, %ecx -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT:    andl 420(%ebx), %eax -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT:    andl 164(%ebx), %edx -; X86-NEXT:    orl %eax, %edx -; X86-NEXT:    orl %ecx, %edx -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT:    andl 356(%ebx), %eax -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT:    andl 100(%ebx), %ecx -; X86-NEXT:    orl %eax, %ecx -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT:    andl 484(%ebx), %eax -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X86-NEXT:    andl 228(%ebx), %esi -; X86-NEXT:    orl %eax, %esi -; X86-NEXT:    orl %ecx, %esi -; X86-NEXT:    orl %edx, %esi -; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT:    andl 276(%ebx), %eax -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT:    andl 20(%ebx), %ecx -; X86-NEXT:    orl %eax, %ecx -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT:    andl 404(%ebx), %eax -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT:    andl 148(%ebx), %edx -; X86-NEXT:    orl %eax, %edx -; X86-NEXT:    orl %ecx, %edx -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT:    andl 340(%ebx), %eax -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT:    andl 84(%ebx), %ecx -; X86-NEXT:    orl %eax, %ecx -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT:    andl 468(%ebx), %eax -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X86-NEXT:    andl 212(%ebx), %esi -; X86-NEXT:    orl %eax, %esi -; X86-NEXT:    orl %ecx, %esi -; X86-NEXT:    orl %edx, %esi -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT:    andl 308(%ebx), %eax -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT:    andl 52(%ebx), %ecx -; X86-NEXT:    orl %eax, %ecx -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT:    andl 436(%ebx), %eax -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT:    andl 180(%ebx), %edx -; X86-NEXT:    orl %eax, %edx -; X86-NEXT:    orl %ecx, %edx -; X86-NEXT:    
movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT:    andl 372(%ebx), %eax -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT:    andl 116(%ebx), %ecx -; X86-NEXT:    orl %eax, %ecx -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT:    andl 500(%ebx), %eax -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X86-NEXT:    andl 244(%ebx), %edi -; X86-NEXT:    orl %eax, %edi -; X86-NEXT:    orl %ecx, %edi -; X86-NEXT:    orl %edx, %edi -; X86-NEXT:    orl %esi, %edi -; X86-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT:    andl 268(%ebx), %eax -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT:    andl 12(%ebx), %ecx -; X86-NEXT:    orl %eax, %ecx -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT:    andl 396(%ebx), %eax -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT:    andl 140(%ebx), %edx -; X86-NEXT:    orl %eax, %edx -; X86-NEXT:    orl %ecx, %edx -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT:    andl 332(%ebx), %eax -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT:    andl 76(%ebx), %ecx -; X86-NEXT:    orl %eax, %ecx -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT:    andl 460(%ebx), %eax -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X86-NEXT:    andl 204(%ebx), %edi -; X86-NEXT:    orl %eax, %edi -; X86-NEXT:    orl %ecx, %edi -; X86-NEXT:    orl %edx, %edi -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT:    andl 300(%ebx), %eax -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT:    andl 44(%ebx), %ecx -; X86-NEXT:    orl %eax, %ecx -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT:    andl 428(%ebx), %eax -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT:    andl 172(%ebx), %edx -; X86-NEXT:    orl %eax, %edx -; X86-NEXT:    orl %ecx, %edx -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT:    andl 364(%ebx), %eax -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT:    andl 108(%ebx), %ecx -; X86-NEXT:    orl %eax, %ecx -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT:    andl 492(%ebx), %eax -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X86-NEXT:    andl 236(%ebx), %esi -; X86-NEXT:    orl %eax, %esi -; X86-NEXT:    orl %ecx, %esi -; X86-NEXT:    orl %edx, %esi -; X86-NEXT:    orl %edi, %esi -; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT:    andl 284(%ebx), %eax -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT:    andl 28(%ebx), %ecx -; X86-NEXT:    orl %eax, %ecx -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT:    andl 412(%ebx), %eax -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X86-NEXT:    andl 156(%ebx), %edi -; X86-NEXT:    orl %eax, %edi -; X86-NEXT:    orl %ecx, %edi -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT:    andl 348(%ebx), %eax -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT:    andl 92(%ebx), 
%edx -; X86-NEXT:    orl %eax, %edx -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT:    andl 476(%ebx), %ecx -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT:    andl 220(%ebx), %eax -; X86-NEXT:    orl %ecx, %eax -; X86-NEXT:    orl %edx, %eax -; X86-NEXT:    orl %edi, %eax -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT:    andl 316(%ebx), %ecx -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT:    andl 60(%ebx), %edx -; X86-NEXT:    orl %ecx, %edx -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT:    andl 444(%ebx), %ecx -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X86-NEXT:    andl 188(%ebx), %edi -; X86-NEXT:    orl %ecx, %edi -; X86-NEXT:    orl %edx, %edi -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT:    andl 380(%ebx), %ecx -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT:    andl 124(%ebx), %edx -; X86-NEXT:    orl %ecx, %edx -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT:    andl 508(%ebx), %ecx -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X86-NEXT:    movl 8(%ebp), %esi -; X86-NEXT:    andl 252(%esi), %ebx -; X86-NEXT:    orl %ecx, %ebx -; X86-NEXT:    orl %edx, %ebx -; X86-NEXT:    orl %edi, %ebx -; X86-NEXT:    orl %eax, %ebx -; X86-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT:    negl %ecx -; X86-NEXT:    movl 1648(%esp,%ecx), %edi -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X86-NEXT:    shldl %cl, %edi, %esi -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT:    shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; X86-NEXT:    # kill: def $cl killed $cl killed $ecx -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT:    shldl %cl, %edx, %edi -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT:    movl 8(%ebp), %edx -; X86-NEXT:    andl 128(%edx), %ecx -; X86-NEXT:    andl 384(%edx), %edi -; X86-NEXT:    orl %ecx, %edi -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT:    # kill: def $cl killed $cl killed $ecx -; X86-NEXT:    shll %cl, %eax -; X86-NEXT:    andl (%edx), %eax -; X86-NEXT:    orl %eax, %edi -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT:    andl 256(%edx), %eax -; X86-NEXT:    orl %eax, %edi -; X86-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload -; X86-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload -; X86-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload -; X86-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT:    andl 260(%edx), %eax -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT:    andl 4(%edx), %ecx -; X86-NEXT:    orl %eax, %ecx -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT:    andl 132(%edx), %eax -; X86-NEXT:    andl 388(%edx), %esi -; X86-NEXT:    orl %eax, %esi -; X86-NEXT:    orl %ecx, %esi -; X86-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload -; X86-NEXT:    orl 
{{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload -; X86-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload -; X86-NEXT:    orl %ebx, %esi -; X86-NEXT:    orl %edi, %esi -; X86-NEXT:    setne %al -; X86-NEXT:    leal -12(%ebp), %esp -; X86-NEXT:    popl %esi -; X86-NEXT:    popl %edi -; X86-NEXT:    popl %ebx -; X86-NEXT:    popl %ebp +; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax +; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT:    movl %ecx, %edx +; X86-NEXT:    andl $4064, %edx # imm = 0xFE0 +; X86-NEXT:    shrl $3, %edx +; X86-NEXT:    movl (%eax,%edx), %eax +; X86-NEXT:    btl %ecx, %eax +; X86-NEXT:    setb %al  ; X86-NEXT:    retl  ; -; SSE-LABEL: test_ne_i4096: -; SSE:       # %bb.0: -; SSE-NEXT:    pushq %rbp -; SSE-NEXT:    pushq %r15 -; SSE-NEXT:    pushq %r14 -; SSE-NEXT:    pushq %r13 -; SSE-NEXT:    pushq %r12 -; SSE-NEXT:    pushq %rbx -; SSE-NEXT:    subq $1576, %rsp # imm = 0x628 -; SSE-NEXT:    movl %esi, %ecx -; SSE-NEXT:    movl %esi, %eax -; SSE-NEXT:    andl $4032, %eax # imm = 0xFC0 -; SSE-NEXT:    xorps %xmm0, %xmm0 -; SSE-NEXT:    movups %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT:    movups %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT:    movups %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT:    movups %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT:    movups %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT:    movups %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT:    movups %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT:    movups %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT:    movups %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT:    movups %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT:    movups %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT:    movups %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT:    movups %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT:    movups %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT:    movups %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT:    movups %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT:    movups %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT:    movups %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT:    movups %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT:    movups %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT:    movups %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT:    movups %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT:    movups %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT:    movups %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT:    movups %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT:    movups %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT:    movups %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT:    movups %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT:    movups %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT:    movups %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT:    movups %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT:    movaps %xmm0, 
{{[0-9]+}}(%rsp) -; SSE-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp) -; SSE-NEXT:    movq $0, {{[0-9]+}}(%rsp) -; SSE-NEXT:    movq $1, {{[0-9]+}}(%rsp) -; SSE-NEXT:    andl $63, %ecx -; SSE-NEXT:    shrl $3, %eax -; SSE-NEXT:    negl %eax -; SSE-NEXT:    movslq %eax, %rsi -; SSE-NEXT:    movq 1296(%rsp,%rsi), %rdx -; SSE-NEXT:    movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT:    movq 1304(%rsp,%rsi), %rax -; SSE-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT:    shldq %cl, %rdx, %rax -; SSE-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT:    movq 1552(%rsp,%rsi), %rdx -; SSE-NEXT:    movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT:    movq 1560(%rsp,%rsi), %rax -; SSE-NEXT:    shldq %cl, %rdx, %rax -; SSE-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT:    movq 1168(%rsp,%rsi), %rdx -; SSE-NEXT:    movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT:    movq 1176(%rsp,%rsi), %rax -; SSE-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT:    shldq %cl, %rdx, %rax -; SSE-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT:    movq 1424(%rsp,%rsi), %rdx -; SSE-NEXT:    movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT:    movq 1432(%rsp,%rsi), %rax -; SSE-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT:    shldq %cl, %rdx, %rax -; SSE-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT:    movq 1232(%rsp,%rsi), %rdx -; SSE-NEXT:    movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT:    movq 1240(%rsp,%rsi), %rax -; SSE-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT:    shldq %cl, %rdx, %rax -; SSE-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT:    movq 1488(%rsp,%rsi), %rdx -; SSE-NEXT:    movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT:    movq 1496(%rsp,%rsi), %rax -; SSE-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT:    shldq %cl, %rdx, %rax -; SSE-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT:    movq 1104(%rsp,%rsi), %rdx -; SSE-NEXT:    movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT:    movq 1112(%rsp,%rsi), %rax -; SSE-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT:    shldq %cl, %rdx, %rax -; SSE-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT:    movq 1360(%rsp,%rsi), %rdx -; SSE-NEXT:    movq %rdx, (%rsp) # 8-byte Spill -; SSE-NEXT:    movq 1368(%rsp,%rsi), %rax -; SSE-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT:    shldq %cl, %rdx, %rax -; SSE-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT:    movq 1264(%rsp,%rsi), %rdx -; SSE-NEXT:    movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT:    movq 1272(%rsp,%rsi), %rax -; SSE-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT:    shldq %cl, %rdx, %rax -; SSE-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT:    movq 
1520(%rsp,%rsi), %rdx -; SSE-NEXT:    movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT:    movq 1528(%rsp,%rsi), %rax -; SSE-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT:    shldq %cl, %rdx, %rax -; SSE-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT:    movq 1136(%rsp,%rsi), %rdx -; SSE-NEXT:    movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT:    movq 1144(%rsp,%rsi), %rax -; SSE-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT:    shldq %cl, %rdx, %rax -; SSE-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT:    movq 1392(%rsp,%rsi), %rdx -; SSE-NEXT:    movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT:    movq 1400(%rsp,%rsi), %rax -; SSE-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT:    shldq %cl, %rdx, %rax -; SSE-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT:    movq 1200(%rsp,%rsi), %rdx -; SSE-NEXT:    movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT:    movq 1208(%rsp,%rsi), %rax -; SSE-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT:    shldq %cl, %rdx, %rax -; SSE-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT:    movq 1456(%rsp,%rsi), %rdx -; SSE-NEXT:    movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT:    movq 1464(%rsp,%rsi), %rax -; SSE-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT:    shldq %cl, %rdx, %rax -; SSE-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT:    movq 1072(%rsp,%rsi), %rdx -; SSE-NEXT:    movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT:    movq 1080(%rsp,%rsi), %rax -; SSE-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT:    shldq %cl, %rdx, %rax -; SSE-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT:    movq 1328(%rsp,%rsi), %rdx -; SSE-NEXT:    movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT:    movq 1336(%rsp,%rsi), %rax -; SSE-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT:    shldq %cl, %rdx, %rax -; SSE-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT:    movq 1280(%rsp,%rsi), %rdx -; SSE-NEXT:    movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT:    movq 1288(%rsp,%rsi), %rax -; SSE-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT:    shldq %cl, %rdx, %rax -; SSE-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT:    movq 1536(%rsp,%rsi), %rdx -; SSE-NEXT:    movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT:    movq 1544(%rsp,%rsi), %rax -; SSE-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT:    shldq %cl, %rdx, %rax -; SSE-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT:    movq 1152(%rsp,%rsi), %rdx -; SSE-NEXT:    movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT:    movq 1160(%rsp,%rsi), %rax -; SSE-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT:    shldq %cl, %rdx, %rax -; SSE-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT:    movq 1408(%rsp,%rsi), %rdx -; SSE-NEXT:    movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT:    movq 1416(%rsp,%rsi), %rax -; SSE-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT:    shldq %cl, %rdx, %rax -; SSE-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill 
-; SSE-NEXT:    movq 1216(%rsp,%rsi), %r11 -; SSE-NEXT:    movq 1224(%rsp,%rsi), %rax -; SSE-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT:    shldq %cl, %r11, %rax -; SSE-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT:    movq 1472(%rsp,%rsi), %rdx -; SSE-NEXT:    movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT:    movq 1480(%rsp,%rsi), %rax -; SSE-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT:    shldq %cl, %rdx, %rax -; SSE-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT:    movq 1088(%rsp,%rsi), %r9 -; SSE-NEXT:    movq 1096(%rsp,%rsi), %rax -; SSE-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT:    shldq %cl, %r9, %rax -; SSE-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT:    movq 1344(%rsp,%rsi), %rdx -; SSE-NEXT:    movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT:    movq 1352(%rsp,%rsi), %rax -; SSE-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT:    shldq %cl, %rdx, %rax -; SSE-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT:    movq 1248(%rsp,%rsi), %rax -; SSE-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT:    movq 1256(%rsp,%rsi), %rdx -; SSE-NEXT:    movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT:    shldq %cl, %rax, %rdx -; SSE-NEXT:    movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT:    movq 1504(%rsp,%rsi), %rdx -; SSE-NEXT:    movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT:    movq 1512(%rsp,%rsi), %rax -; SSE-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT:    shldq %cl, %rdx, %rax -; SSE-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT:    movq 1120(%rsp,%rsi), %rax -; SSE-NEXT:    movq 1128(%rsp,%rsi), %r8 -; SSE-NEXT:    movq %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT:    shldq %cl, %rax, %r8 -; SSE-NEXT:    movq %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT:    movq 1376(%rsp,%rsi), %r13 -; SSE-NEXT:    movq 1384(%rsp,%rsi), %rbx -; SSE-NEXT:    movq %rbx, %r8 -; SSE-NEXT:    shldq %cl, %r13, %r8 -; SSE-NEXT:    movq %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT:    movq 1184(%rsp,%rsi), %rdx -; SSE-NEXT:    movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT:    movq 1192(%rsp,%rsi), %r15 -; SSE-NEXT:    movq %r15, %r14 -; SSE-NEXT:    shldq %cl, %rdx, %r14 -; SSE-NEXT:    movq %r14, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT:    movq 1440(%rsp,%rsi), %r10 -; SSE-NEXT:    movq 1448(%rsp,%rsi), %rdx -; SSE-NEXT:    movq %rdx, %r14 -; SSE-NEXT:    shldq %cl, %r10, %r14 -; SSE-NEXT:    movq %r14, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT:    movq 1312(%rsp,%rsi), %r14 -; SSE-NEXT:    movq %r14, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT:    movq 1320(%rsp,%rsi), %rbp -; SSE-NEXT:    movq %rbp, %r12 -; SSE-NEXT:    shldq %cl, %r14, %r12 -; SSE-NEXT:    movq %r12, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload -; SSE-NEXT:    shldq %cl, %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill -; SSE-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload -; SSE-NEXT:    shldq %cl, %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill -; SSE-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload -; SSE-NEXT:    shldq %cl, %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill -; SSE-NEXT:    movq 
{{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload -; SSE-NEXT:    shldq %cl, %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill -; SSE-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload -; SSE-NEXT:    shldq %cl, %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill -; SSE-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload -; SSE-NEXT:    shldq %cl, %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill -; SSE-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload -; SSE-NEXT:    shldq %cl, %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill -; SSE-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload -; SSE-NEXT:    shldq %cl, %r8, (%rsp) # 8-byte Folded Spill -; SSE-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload -; SSE-NEXT:    shldq %cl, %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill -; SSE-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload -; SSE-NEXT:    shldq %cl, %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill -; SSE-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload -; SSE-NEXT:    shldq %cl, %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill -; SSE-NEXT:    shldq %cl, %rbx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill -; SSE-NEXT:    shldq %cl, %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill -; SSE-NEXT:    shldq %cl, %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill -; SSE-NEXT:    movq 1064(%rsp,%rsi), %rbx -; SSE-NEXT:    shldq %cl, %rbx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill -; SSE-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload -; SSE-NEXT:    shldq %cl, %rbp, %r14 -; SSE-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload -; SSE-NEXT:    shldq %cl, %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill -; SSE-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload -; SSE-NEXT:    shldq %cl, %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill -; SSE-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload -; SSE-NEXT:    shldq %cl, %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill -; SSE-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload -; SSE-NEXT:    shldq %cl, %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill -; SSE-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload -; SSE-NEXT:    shldq %cl, %rdx, %r11 -; SSE-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload -; SSE-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload -; SSE-NEXT:    shldq %cl, %r15, %rdx -; SSE-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload -; SSE-NEXT:    shldq %cl, %r15, %r9 -; SSE-NEXT:    movq %r9, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload -; SSE-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload -; SSE-NEXT:    shldq %cl, %r15, %r8 -; SSE-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rbp # 8-byte Reload -; SSE-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload -; SSE-NEXT:    shldq %cl, %r15, %rbp -; SSE-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload -; SSE-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Reload -; SSE-NEXT:    shldq %cl, %r15, %r9 -; SSE-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload -; SSE-NEXT:    shldq %cl, %r15, %rax -; SSE-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload -; SSE-NEXT:    shldq %cl, %r15, %r13 -; SSE-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload -; SSE-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r12 # 8-byte Reload -; SSE-NEXT:    shldq %cl, %r12, %r15 -; 
SSE-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r12 # 8-byte Reload -; SSE-NEXT:    shldq %cl, %r12, %r10 -; SSE-NEXT:    andq 384(%rdi), %r10 -; SSE-NEXT:    andq 128(%rdi), %r15 -; SSE-NEXT:    andq 320(%rdi), %r13 -; SSE-NEXT:    andq 64(%rdi), %rax -; SSE-NEXT:    orq %r10, %r15 -; SSE-NEXT:    movq %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT:    orq %r13, %rax -; SSE-NEXT:    andq 448(%rdi), %r9 -; SSE-NEXT:    andq 192(%rdi), %rbp -; SSE-NEXT:    orq %r9, %rbp -; SSE-NEXT:    orq %rax, %rbp -; SSE-NEXT:    movq %rbp, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT:    andq 288(%rdi), %r8 -; SSE-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Reload -; SSE-NEXT:    andq 32(%rdi), %r9 -; SSE-NEXT:    andq 416(%rdi), %rdx -; SSE-NEXT:    andq 160(%rdi), %r11 -; SSE-NEXT:    orq %r8, %r9 -; SSE-NEXT:    orq %rdx, %r11 -; SSE-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload -; SSE-NEXT:    andq 352(%rdi), %rdx -; SSE-NEXT:    orq %r9, %r11 -; SSE-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; SSE-NEXT:    andq 96(%rdi), %rax -; SSE-NEXT:    orq %rdx, %rax -; SSE-NEXT:    movq %rax, %rdx -; SSE-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; SSE-NEXT:    andq 480(%rdi), %rax -; SSE-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload -; SSE-NEXT:    andq 224(%rdi), %r8 -; SSE-NEXT:    orq %rax, %r8 -; SSE-NEXT:    orq %rdx, %r8 -; SSE-NEXT:    andq 272(%rdi), %r14 -; SSE-NEXT:    orq %r11, %r8 -; SSE-NEXT:    movq %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; SSE-NEXT:    andq 16(%rdi), %rax -; SSE-NEXT:    orq %r14, %rax -; SSE-NEXT:    movq %rax, %r8 -; SSE-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload -; SSE-NEXT:    andq 400(%rdi), %rdx -; SSE-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; SSE-NEXT:    andq 144(%rdi), %rax -; SSE-NEXT:    orq %rdx, %rax -; SSE-NEXT:    orq %r8, %rax -; SSE-NEXT:    movq %rax, %r8 -; SSE-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Reload -; SSE-NEXT:    andq 336(%rdi), %r9 -; SSE-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; SSE-NEXT:    andq 80(%rdi), %rax -; SSE-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload -; SSE-NEXT:    andq 464(%rdi), %rdx -; SSE-NEXT:    orq %r9, %rax -; SSE-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Reload -; SSE-NEXT:    andq 208(%rdi), %r11 -; SSE-NEXT:    orq %rdx, %r11 -; SSE-NEXT:    orq %rax, %r11 -; SSE-NEXT:    orq %r8, %r11 -; SSE-NEXT:    movq (%rsp), %rdx # 8-byte Reload -; SSE-NEXT:    andq 304(%rdi), %rdx -; SSE-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; SSE-NEXT:    andq 48(%rdi), %rax -; SSE-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Reload -; SSE-NEXT:    andq 432(%rdi), %r9 -; SSE-NEXT:    orq %rdx, %rax -; SSE-NEXT:    movq %rax, %r10 -; SSE-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload -; SSE-NEXT:    andq 176(%rdi), %r8 -; SSE-NEXT:    orq %r9, %r8 -; SSE-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Reload -; SSE-NEXT:    andq 368(%rdi), %r9 -; SSE-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; SSE-NEXT:    andq 112(%rdi), %rax -; SSE-NEXT:    orq %r10, %r8 -; SSE-NEXT:    movq %r8, %r10 -; SSE-NEXT:    orq %r9, %rax -; SSE-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload -; SSE-NEXT:    andq 496(%rdi), %r8 -; SSE-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rbp # 8-byte Reload -; SSE-NEXT:    andq 240(%rdi), 
%rbp -; SSE-NEXT:    orq %r8, %rbp -; SSE-NEXT:    orq %rax, %rbp -; SSE-NEXT:    orq %r10, %rbp -; SSE-NEXT:    orq %r11, %rbp -; SSE-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; SSE-NEXT:    andq 392(%rdi), %rax -; SSE-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r12 # 8-byte Reload -; SSE-NEXT:    andq 136(%rdi), %r12 -; SSE-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload -; SSE-NEXT:    andq 328(%rdi), %rdx -; SSE-NEXT:    orq %rax, %r12 -; SSE-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; SSE-NEXT:    andq 72(%rdi), %rax -; SSE-NEXT:    orq %rdx, %rax -; SSE-NEXT:    movq %rax, %rdx -; SSE-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; SSE-NEXT:    andq 456(%rdi), %rax -; SSE-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Reload -; SSE-NEXT:    andq 200(%rdi), %r13 -; SSE-NEXT:    orq %rax, %r13 -; SSE-NEXT:    orq %rdx, %r13 -; SSE-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload -; SSE-NEXT:    andq 296(%rdi), %rdx -; SSE-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; SSE-NEXT:    andq 40(%rdi), %rax -; SSE-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload -; SSE-NEXT:    andq 424(%rdi), %r8 -; SSE-NEXT:    orq %rdx, %rax -; SSE-NEXT:    movq %rax, %r9 -; SSE-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload -; SSE-NEXT:    andq 168(%rdi), %rdx -; SSE-NEXT:    orq %r8, %rdx -; SSE-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload -; SSE-NEXT:    andq 360(%rdi), %r8 -; SSE-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; SSE-NEXT:    andq 104(%rdi), %rax -; SSE-NEXT:    orq %r9, %rdx -; SSE-NEXT:    orq %r8, %rax -; SSE-NEXT:    movq %rax, %r8 -; SSE-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; SSE-NEXT:    andq 488(%rdi), %rax -; SSE-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload -; SSE-NEXT:    andq 232(%rdi), %r15 -; SSE-NEXT:    orq %rax, %r15 -; SSE-NEXT:    orq %r8, %r15 -; SSE-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload -; SSE-NEXT:    andq 280(%rdi), %r8 -; SSE-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; SSE-NEXT:    andq 24(%rdi), %rax -; SSE-NEXT:    orq %rdx, %r15 -; SSE-NEXT:    orq %r8, %rax -; SSE-NEXT:    movq %rax, %r10 -; SSE-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload -; SSE-NEXT:    andq 408(%rdi), %r8 -; SSE-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; SSE-NEXT:    andq 152(%rdi), %rax -; SSE-NEXT:    orq %r8, %rax -; SSE-NEXT:    orq %r10, %rax -; SSE-NEXT:    movq %rax, %r10 -; SSE-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Reload -; SSE-NEXT:    andq 344(%rdi), %r11 -; SSE-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload -; SSE-NEXT:    andq 88(%rdi), %r8 -; SSE-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; SSE-NEXT:    andq 472(%rdi), %rax -; SSE-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload -; SSE-NEXT:    andq 216(%rdi), %r14 -; SSE-NEXT:    orq %r11, %r8 -; SSE-NEXT:    orq %rax, %r14 -; SSE-NEXT:    orq %r8, %r14 -; SSE-NEXT:    orq %r10, %r14 -; SSE-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Reload -; SSE-NEXT:    andq 312(%rdi), %r11 -; SSE-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload -; SSE-NEXT:    andq 56(%rdi), %r10 -; SSE-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload -; SSE-NEXT:    andq 440(%rdi), %r8 -; SSE-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Reload -; SSE-NEXT:    andq 184(%rdi), %r9 -; 
SSE-NEXT:    orq %r11, %r10 -; SSE-NEXT:    orq %r8, %r9 -; SSE-NEXT:    orq %r10, %r9 -; SSE-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; SSE-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload -; SSE-NEXT:    shldq %cl, %rax, %rdx -; SSE-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload -; SSE-NEXT:    andq 376(%rdi), %r10 -; SSE-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; SSE-NEXT:    andq 120(%rdi), %rax -; SSE-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Reload -; SSE-NEXT:    andq 504(%rdi), %r11 -; SSE-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload -; SSE-NEXT:    andq 248(%rdi), %r8 -; SSE-NEXT:    orq %r10, %rax -; SSE-NEXT:    movq %rax, %r10 -; SSE-NEXT:    orq %r11, %r8 -; SSE-NEXT:    movq 1056(%rsp,%rsi), %rax -; SSE-NEXT:    shldq %cl, %rax, %rbx -; SSE-NEXT:    # kill: def $cl killed $cl killed $ecx -; SSE-NEXT:    shlq %cl, %rax -; SSE-NEXT:    orq %r10, %r8 -; SSE-NEXT:    orq %r9, %r8 -; SSE-NEXT:    andq 256(%rdi), %rdx -; SSE-NEXT:    orq %r14, %r8 -; SSE-NEXT:    andq (%rdi), %rax -; SSE-NEXT:    orq %rdx, %rax -; SSE-NEXT:    orq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Folded Reload -; SSE-NEXT:    orq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Folded Reload -; SSE-NEXT:    orq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Folded Reload -; SSE-NEXT:    orq %rbp, %rax -; SSE-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload -; SSE-NEXT:    andq 264(%rdi), %rcx -; SSE-NEXT:    andq 8(%rdi), %rbx -; SSE-NEXT:    orq %rcx, %rbx -; SSE-NEXT:    orq %r12, %rbx -; SSE-NEXT:    orq %r13, %rbx -; SSE-NEXT:    orq %r15, %rbx -; SSE-NEXT:    orq %r8, %rbx -; SSE-NEXT:    orq %rax, %rbx -; SSE-NEXT:    setne %al -; SSE-NEXT:    addq $1576, %rsp # imm = 0x628 -; SSE-NEXT:    popq %rbx -; SSE-NEXT:    popq %r12 -; SSE-NEXT:    popq %r13 -; SSE-NEXT:    popq %r14 -; SSE-NEXT:    popq %r15 -; SSE-NEXT:    popq %rbp -; SSE-NEXT:    retq -; -; AVX2-LABEL: test_ne_i4096: -; AVX2:       # %bb.0: -; AVX2-NEXT:    pushq %rbp -; AVX2-NEXT:    pushq %r15 -; AVX2-NEXT:    pushq %r14 -; AVX2-NEXT:    pushq %r13 -; AVX2-NEXT:    pushq %r12 -; AVX2-NEXT:    pushq %rbx -; AVX2-NEXT:    subq $1560, %rsp # imm = 0x618 -; AVX2-NEXT:    movl %esi, %ecx -; AVX2-NEXT:    movl %esi, %eax -; AVX2-NEXT:    vxorps %xmm0, %xmm0, %xmm0 -; AVX2-NEXT:    vmovups %ymm0, {{[0-9]+}}(%rsp) -; AVX2-NEXT:    vmovups %ymm0, {{[0-9]+}}(%rsp) -; AVX2-NEXT:    vmovups %ymm0, {{[0-9]+}}(%rsp) -; AVX2-NEXT:    vmovups %ymm0, {{[0-9]+}}(%rsp) -; AVX2-NEXT:    vmovups %ymm0, {{[0-9]+}}(%rsp) -; AVX2-NEXT:    vmovups %ymm0, {{[0-9]+}}(%rsp) -; AVX2-NEXT:    vmovups %ymm0, {{[0-9]+}}(%rsp) -; AVX2-NEXT:    vmovups %ymm0, {{[0-9]+}}(%rsp) -; AVX2-NEXT:    vmovups %ymm0, {{[0-9]+}}(%rsp) -; AVX2-NEXT:    vmovups %ymm0, {{[0-9]+}}(%rsp) -; AVX2-NEXT:    vmovups %ymm0, {{[0-9]+}}(%rsp) -; AVX2-NEXT:    vmovups %ymm0, {{[0-9]+}}(%rsp) -; AVX2-NEXT:    vmovups %ymm0, {{[0-9]+}}(%rsp) -; AVX2-NEXT:    vmovups %ymm0, {{[0-9]+}}(%rsp) -; AVX2-NEXT:    vmovups %ymm0, {{[0-9]+}}(%rsp) -; AVX2-NEXT:    vmovups %ymm0, {{[0-9]+}}(%rsp) -; AVX2-NEXT:    vmovups %ymm0, {{[0-9]+}}(%rsp) -; AVX2-NEXT:    vmovups %ymm0, {{[0-9]+}}(%rsp) -; AVX2-NEXT:    vmovups %ymm0, {{[0-9]+}}(%rsp) -; AVX2-NEXT:    vmovups %ymm0, {{[0-9]+}}(%rsp) -; AVX2-NEXT:    vmovups %ymm0, {{[0-9]+}}(%rsp) -; AVX2-NEXT:    vmovups %ymm0, {{[0-9]+}}(%rsp) -; AVX2-NEXT:    vmovups %ymm0, {{[0-9]+}}(%rsp) -; AVX2-NEXT:    vmovups %ymm0, {{[0-9]+}}(%rsp) -; AVX2-NEXT:    vmovups %ymm0, 
{{[0-9]+}}(%rsp) -; AVX2-NEXT:    vmovups %ymm0, {{[0-9]+}}(%rsp) -; AVX2-NEXT:    vmovups %ymm0, {{[0-9]+}}(%rsp) -; AVX2-NEXT:    vmovups %ymm0, {{[0-9]+}}(%rsp) -; AVX2-NEXT:    vmovups %ymm0, {{[0-9]+}}(%rsp) -; AVX2-NEXT:    vmovups %ymm0, {{[0-9]+}}(%rsp) -; AVX2-NEXT:    andl $4032, %eax # imm = 0xFC0 -; AVX2-NEXT:    vmovups %ymm0, {{[0-9]+}}(%rsp) -; AVX2-NEXT:    vmovss {{.*#+}} xmm0 = [1,0,0,0] -; AVX2-NEXT:    vmovups %ymm0, {{[0-9]+}}(%rsp) -; AVX2-NEXT:    andl $63, %ecx -; AVX2-NEXT:    shrl $3, %eax -; AVX2-NEXT:    negl %eax -; AVX2-NEXT:    movslq %eax, %rsi -; AVX2-NEXT:    movq 1280(%rsp,%rsi), %rdx -; AVX2-NEXT:    movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT:    movq 1288(%rsp,%rsi), %rax -; AVX2-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT:    shldq %cl, %rdx, %rax -; AVX2-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT:    movq 1536(%rsp,%rsi), %rdx -; AVX2-NEXT:    movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT:    movq 1544(%rsp,%rsi), %rax -; AVX2-NEXT:    shldq %cl, %rdx, %rax -; AVX2-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT:    movq 1152(%rsp,%rsi), %rdx -; AVX2-NEXT:    movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT:    movq 1160(%rsp,%rsi), %rax -; AVX2-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT:    shldq %cl, %rdx, %rax -; AVX2-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT:    movq 1408(%rsp,%rsi), %rdx -; AVX2-NEXT:    movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT:    movq 1416(%rsp,%rsi), %rax -; AVX2-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT:    shldq %cl, %rdx, %rax -; AVX2-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT:    movq 1216(%rsp,%rsi), %rdx -; AVX2-NEXT:    movq %rdx, (%rsp) # 8-byte Spill -; AVX2-NEXT:    movq 1224(%rsp,%rsi), %rax -; AVX2-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT:    shldq %cl, %rdx, %rax -; AVX2-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT:    movq 1472(%rsp,%rsi), %rdx -; AVX2-NEXT:    movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT:    movq 1480(%rsp,%rsi), %rax -; AVX2-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT:    shldq %cl, %rdx, %rax -; AVX2-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT:    movq 1088(%rsp,%rsi), %rdx -; AVX2-NEXT:    movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT:    movq 1096(%rsp,%rsi), %rax -; AVX2-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT:    shldq %cl, %rdx, %rax -; AVX2-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT:    movq 1344(%rsp,%rsi), %rdx -; AVX2-NEXT:    movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT:    movq 1352(%rsp,%rsi), %rax -; AVX2-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT:    shldq %cl, %rdx, %rax -; AVX2-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT:    movq 1248(%rsp,%rsi), %rdx -; AVX2-NEXT:    movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT:    movq 1256(%rsp,%rsi), %rax -; AVX2-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT:    shldq %cl, %rdx, %rax -; AVX2-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT:    movq 1504(%rsp,%rsi), %rdx -; AVX2-NEXT:    movq %rdx, 
{{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT:    movq 1512(%rsp,%rsi), %rax -; AVX2-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT:    shldq %cl, %rdx, %rax -; AVX2-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT:    movq 1120(%rsp,%rsi), %rdx -; AVX2-NEXT:    movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT:    movq 1128(%rsp,%rsi), %rax -; AVX2-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT:    shldq %cl, %rdx, %rax -; AVX2-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT:    movq 1376(%rsp,%rsi), %rdx -; AVX2-NEXT:    movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT:    movq 1384(%rsp,%rsi), %rax -; AVX2-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT:    shldq %cl, %rdx, %rax -; AVX2-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT:    movq 1184(%rsp,%rsi), %rdx -; AVX2-NEXT:    movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT:    movq 1192(%rsp,%rsi), %rax -; AVX2-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT:    shldq %cl, %rdx, %rax -; AVX2-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT:    movq 1440(%rsp,%rsi), %rdx -; AVX2-NEXT:    movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT:    movq 1448(%rsp,%rsi), %rax -; AVX2-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT:    shldq %cl, %rdx, %rax -; AVX2-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT:    movq 1056(%rsp,%rsi), %rdx -; AVX2-NEXT:    movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT:    movq 1064(%rsp,%rsi), %rax -; AVX2-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT:    shldq %cl, %rdx, %rax -; AVX2-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT:    movq 1312(%rsp,%rsi), %rdx -; AVX2-NEXT:    movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT:    movq 1320(%rsp,%rsi), %rax -; AVX2-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT:    shldq %cl, %rdx, %rax -; AVX2-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT:    movq 1264(%rsp,%rsi), %rdx -; AVX2-NEXT:    movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT:    movq 1272(%rsp,%rsi), %rax -; AVX2-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT:    shldq %cl, %rdx, %rax -; AVX2-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT:    movq 1520(%rsp,%rsi), %rdx -; AVX2-NEXT:    movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT:    movq 1528(%rsp,%rsi), %rax -; AVX2-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT:    shldq %cl, %rdx, %rax -; AVX2-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT:    movq 1136(%rsp,%rsi), %rdx -; AVX2-NEXT:    movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT:    movq 1144(%rsp,%rsi), %rax -; AVX2-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT:    shldq %cl, %rdx, %rax -; AVX2-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT:    movq 1392(%rsp,%rsi), %rdx -; AVX2-NEXT:    movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT:    movq 1400(%rsp,%rsi), %rax -; AVX2-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT:    shldq %cl, %rdx, %rax -; AVX2-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) 
# 8-byte Spill -; AVX2-NEXT:    movq 1200(%rsp,%rsi), %r11 -; AVX2-NEXT:    movq 1208(%rsp,%rsi), %rax -; AVX2-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT:    shldq %cl, %r11, %rax -; AVX2-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT:    movq 1456(%rsp,%rsi), %rdx -; AVX2-NEXT:    movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT:    movq 1464(%rsp,%rsi), %rax -; AVX2-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT:    shldq %cl, %rdx, %rax -; AVX2-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT:    movq 1072(%rsp,%rsi), %r12 -; AVX2-NEXT:    movq 1080(%rsp,%rsi), %rax -; AVX2-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT:    shldq %cl, %r12, %rax -; AVX2-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT:    movq 1328(%rsp,%rsi), %rdx -; AVX2-NEXT:    movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT:    movq 1336(%rsp,%rsi), %rax -; AVX2-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT:    shldq %cl, %rdx, %rax -; AVX2-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT:    movq 1232(%rsp,%rsi), %rax -; AVX2-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT:    movq 1240(%rsp,%rsi), %rdx -; AVX2-NEXT:    movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT:    shldq %cl, %rax, %rdx -; AVX2-NEXT:    movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT:    movq 1488(%rsp,%rsi), %rbp -; AVX2-NEXT:    movq 1496(%rsp,%rsi), %rax -; AVX2-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT:    shldq %cl, %rbp, %rax -; AVX2-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT:    movq 1104(%rsp,%rsi), %rax -; AVX2-NEXT:    movq 1112(%rsp,%rsi), %rdx -; AVX2-NEXT:    movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT:    shldq %cl, %rax, %rdx -; AVX2-NEXT:    movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT:    movq 1360(%rsp,%rsi), %r10 -; AVX2-NEXT:    movq 1368(%rsp,%rsi), %r8 -; AVX2-NEXT:    movq %r8, %rdx -; AVX2-NEXT:    shldq %cl, %r10, %rdx -; AVX2-NEXT:    movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT:    movq 1168(%rsp,%rsi), %r9 -; AVX2-NEXT:    movq %r9, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT:    movq 1176(%rsp,%rsi), %rbx -; AVX2-NEXT:    movq %rbx, %rdx -; AVX2-NEXT:    shldq %cl, %r9, %rdx -; AVX2-NEXT:    movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT:    movq 1424(%rsp,%rsi), %r9 -; AVX2-NEXT:    movq 1432(%rsp,%rsi), %rdx -; AVX2-NEXT:    movq %rdx, %r14 -; AVX2-NEXT:    shldq %cl, %r9, %r14 -; AVX2-NEXT:    movq %r14, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT:    movq 1296(%rsp,%rsi), %r15 -; AVX2-NEXT:    movq %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT:    movq 1304(%rsp,%rsi), %r14 -; AVX2-NEXT:    movq %r14, %r13 -; AVX2-NEXT:    shldq %cl, %r15, %r13 -; AVX2-NEXT:    movq %r13, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload -; AVX2-NEXT:    shldq %cl, %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill -; AVX2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload -; AVX2-NEXT:    shldq %cl, %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill -; AVX2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload -; AVX2-NEXT:    shldq %cl, %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill -; 
AVX2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload -; AVX2-NEXT:    shldq %cl, %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill -; AVX2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload -; AVX2-NEXT:    shldq %cl, %r15, (%rsp) # 8-byte Folded Spill -; AVX2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload -; AVX2-NEXT:    shldq %cl, %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill -; AVX2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload -; AVX2-NEXT:    shldq %cl, %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill -; AVX2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload -; AVX2-NEXT:    shldq %cl, %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill -; AVX2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload -; AVX2-NEXT:    shldq %cl, %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill -; AVX2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload -; AVX2-NEXT:    shldq %cl, %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill -; AVX2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload -; AVX2-NEXT:    shldq %cl, %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill -; AVX2-NEXT:    shldq %cl, %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill -; AVX2-NEXT:    shldq %cl, %rbx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill -; AVX2-NEXT:    shldq %cl, %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill -; AVX2-NEXT:    movq 1048(%rsp,%rsi), %rdx -; AVX2-NEXT:    shldq %cl, %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill -; AVX2-NEXT:    shldq %cl, %r14, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill -; AVX2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Reload -; AVX2-NEXT:    shldq %cl, %rbx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill -; AVX2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Reload -; AVX2-NEXT:    shldq %cl, %rbx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill -; AVX2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Reload -; AVX2-NEXT:    shldq %cl, %rbx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill -; AVX2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Reload -; AVX2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload -; AVX2-NEXT:    shldq %cl, %r14, %rbx -; AVX2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload -; AVX2-NEXT:    shldq %cl, %r14, %r11 -; AVX2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload -; AVX2-NEXT:    shldq %cl, %r14, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill -; AVX2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload -; AVX2-NEXT:    shldq %cl, %r14, %r12 -; AVX2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload -; AVX2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload -; AVX2-NEXT:    shldq %cl, %r14, %r8 -; AVX2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Reload -; AVX2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload -; AVX2-NEXT:    shldq %cl, %r14, %r13 -; AVX2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload -; AVX2-NEXT:    shldq %cl, %r14, %rbp -; AVX2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload -; AVX2-NEXT:    shldq %cl, %r14, %rax -; AVX2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload -; AVX2-NEXT:    shldq %cl, %r14, %r10 -; AVX2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload -; AVX2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload -; AVX2-NEXT:    shldq %cl, %r15, %r14 -; AVX2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload -; 
AVX2-NEXT:    shldq %cl, %r15, %r9 -; AVX2-NEXT:    andq 384(%rdi), %r9 -; AVX2-NEXT:    andq 128(%rdi), %r14 -; AVX2-NEXT:    andq 320(%rdi), %r10 -; AVX2-NEXT:    orq %r9, %r14 -; AVX2-NEXT:    movq %r14, %r15 -; AVX2-NEXT:    andq 64(%rdi), %rax -; AVX2-NEXT:    orq %r10, %rax -; AVX2-NEXT:    andq 448(%rdi), %rbp -; AVX2-NEXT:    andq 192(%rdi), %r13 -; AVX2-NEXT:    orq %rbp, %r13 -; AVX2-NEXT:    orq %rax, %r13 -; AVX2-NEXT:    movq %r13, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX2-NEXT:    andq 288(%rdi), %r8 -; AVX2-NEXT:    andq 32(%rdi), %r12 -; AVX2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; AVX2-NEXT:    andq 416(%rdi), %rax -; AVX2-NEXT:    orq %r8, %r12 -; AVX2-NEXT:    andq 160(%rdi), %r11 -; AVX2-NEXT:    orq %rax, %r11 -; AVX2-NEXT:    andq 352(%rdi), %rbx -; AVX2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; AVX2-NEXT:    andq 96(%rdi), %rax -; AVX2-NEXT:    orq %r12, %r11 -; AVX2-NEXT:    orq %rbx, %rax -; AVX2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload -; AVX2-NEXT:    andq 480(%rdi), %r10 -; AVX2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Reload -; AVX2-NEXT:    andq 224(%rdi), %r13 -; AVX2-NEXT:    orq %r10, %r13 -; AVX2-NEXT:    orq %rax, %r13 -; AVX2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload -; AVX2-NEXT:    andq 272(%rdi), %r8 -; AVX2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; AVX2-NEXT:    andq 16(%rdi), %rax -; AVX2-NEXT:    orq %r11, %r13 -; AVX2-NEXT:    orq %r8, %rax -; AVX2-NEXT:    movq %rax, %r8 -; AVX2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Reload -; AVX2-NEXT:    andq 400(%rdi), %r9 -; AVX2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; AVX2-NEXT:    andq 144(%rdi), %rax -; AVX2-NEXT:    orq %r9, %rax -; AVX2-NEXT:    orq %r8, %rax -; AVX2-NEXT:    movq %rax, %r9 -; AVX2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload -; AVX2-NEXT:    andq 336(%rdi), %r10 -; AVX2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; AVX2-NEXT:    andq 80(%rdi), %rax -; AVX2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload -; AVX2-NEXT:    andq 464(%rdi), %r8 -; AVX2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Reload -; AVX2-NEXT:    andq 208(%rdi), %r11 -; AVX2-NEXT:    orq %r10, %rax -; AVX2-NEXT:    orq %r8, %r11 -; AVX2-NEXT:    orq %rax, %r11 -; AVX2-NEXT:    orq %r9, %r11 -; AVX2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Reload -; AVX2-NEXT:    andq 304(%rdi), %r9 -; AVX2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload -; AVX2-NEXT:    andq 48(%rdi), %r8 -; AVX2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload -; AVX2-NEXT:    andq 432(%rdi), %r10 -; AVX2-NEXT:    movq (%rsp), %rax # 8-byte Reload -; AVX2-NEXT:    andq 176(%rdi), %rax -; AVX2-NEXT:    orq %r9, %r8 -; AVX2-NEXT:    movq %r8, %r9 -; AVX2-NEXT:    orq %r10, %rax -; AVX2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload -; AVX2-NEXT:    andq 368(%rdi), %r8 -; AVX2-NEXT:    orq %r9, %rax -; AVX2-NEXT:    movq %rax, %r10 -; AVX2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; AVX2-NEXT:    andq 112(%rdi), %rax -; AVX2-NEXT:    orq %r8, %rax -; AVX2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload -; AVX2-NEXT:    andq 496(%rdi), %r8 -; AVX2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Reload -; AVX2-NEXT:    andq 240(%rdi), %r9 -; AVX2-NEXT:    orq %r8, %r9 -; AVX2-NEXT:    orq %rax, %r9 -; AVX2-NEXT:    orq %r10, %r9 -; AVX2-NEXT:    orq %r11, 
%r9 -; AVX2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload -; AVX2-NEXT:    andq 392(%rdi), %r10 -; AVX2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rbp # 8-byte Reload -; AVX2-NEXT:    andq 136(%rdi), %rbp -; AVX2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload -; AVX2-NEXT:    andq 328(%rdi), %r8 -; AVX2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; AVX2-NEXT:    andq 72(%rdi), %rax -; AVX2-NEXT:    orq %r10, %rbp -; AVX2-NEXT:    orq %r8, %rax -; AVX2-NEXT:    movq %rax, %r8 -; AVX2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; AVX2-NEXT:    andq 456(%rdi), %rax -; AVX2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r12 # 8-byte Reload -; AVX2-NEXT:    andq 200(%rdi), %r12 -; AVX2-NEXT:    orq %rax, %r12 -; AVX2-NEXT:    orq %r8, %r12 -; AVX2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload -; AVX2-NEXT:    andq 296(%rdi), %r10 -; AVX2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload -; AVX2-NEXT:    andq 40(%rdi), %r8 -; AVX2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Reload -; AVX2-NEXT:    andq 424(%rdi), %r11 -; AVX2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; AVX2-NEXT:    andq 168(%rdi), %rax -; AVX2-NEXT:    orq %r10, %r8 -; AVX2-NEXT:    movq %r8, %r10 -; AVX2-NEXT:    orq %r11, %rax -; AVX2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload -; AVX2-NEXT:    andq 360(%rdi), %r8 -; AVX2-NEXT:    orq %r10, %rax -; AVX2-NEXT:    movq %rax, %r10 -; AVX2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; AVX2-NEXT:    andq 104(%rdi), %rax -; AVX2-NEXT:    orq %r8, %rax -; AVX2-NEXT:    movq %rax, %r8 -; AVX2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; AVX2-NEXT:    andq 488(%rdi), %rax -; AVX2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload -; AVX2-NEXT:    andq 232(%rdi), %r14 -; AVX2-NEXT:    orq %rax, %r14 -; AVX2-NEXT:    orq %r8, %r14 -; AVX2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload -; AVX2-NEXT:    andq 280(%rdi), %r8 -; AVX2-NEXT:    orq %r10, %r14 -; AVX2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; AVX2-NEXT:    andq 24(%rdi), %rax -; AVX2-NEXT:    orq %r8, %rax -; AVX2-NEXT:    movq %rax, %r10 -; AVX2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload -; AVX2-NEXT:    andq 408(%rdi), %r8 -; AVX2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; AVX2-NEXT:    andq 152(%rdi), %rax -; AVX2-NEXT:    orq %r8, %rax -; AVX2-NEXT:    orq %r10, %rax -; AVX2-NEXT:    movq %rax, %r10 -; AVX2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Reload -; AVX2-NEXT:    andq 344(%rdi), %r11 -; AVX2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload -; AVX2-NEXT:    andq 88(%rdi), %r8 -; AVX2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; AVX2-NEXT:    andq 472(%rdi), %rax -; AVX2-NEXT:    orq %r11, %r8 -; AVX2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Reload -; AVX2-NEXT:    andq 216(%rdi), %rbx -; AVX2-NEXT:    orq %rax, %rbx -; AVX2-NEXT:    orq %r8, %rbx -; AVX2-NEXT:    orq %r10, %rbx -; AVX2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload -; AVX2-NEXT:    andq 312(%rdi), %r8 -; AVX2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; AVX2-NEXT:    andq 56(%rdi), %rax -; AVX2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload -; AVX2-NEXT:    andq 440(%rdi), %r10 -; AVX2-NEXT:    orq %r8, %rax -; AVX2-NEXT:    movq %rax, %r11 -; AVX2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload 
-; AVX2-NEXT:    andq 184(%rdi), %r8 -; AVX2-NEXT:    orq %r10, %r8 -; AVX2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload -; AVX2-NEXT:    andq 376(%rdi), %r10 -; AVX2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; AVX2-NEXT:    andq 120(%rdi), %rax -; AVX2-NEXT:    orq %r11, %r8 -; AVX2-NEXT:    movq %r8, %r11 -; AVX2-NEXT:    orq %r10, %rax -; AVX2-NEXT:    movq %rax, %r10 -; AVX2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload -; AVX2-NEXT:    andq 504(%rdi), %r8 -; AVX2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; AVX2-NEXT:    andq 248(%rdi), %rax -; AVX2-NEXT:    orq %r8, %rax -; AVX2-NEXT:    orq %r10, %rax -; AVX2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload -; AVX2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload -; AVX2-NEXT:    shldq %cl, %r8, %r10 -; AVX2-NEXT:    orq %r11, %rax -; AVX2-NEXT:    movq 1040(%rsp,%rsi), %rsi -; AVX2-NEXT:    orq %rbx, %rax -; AVX2-NEXT:    movq %rax, %r8 -; AVX2-NEXT:    shlxq %rcx, %rsi, %rax -; AVX2-NEXT:    andq 256(%rdi), %r10 -; AVX2-NEXT:    andq (%rdi), %rax -; AVX2-NEXT:    orq %r10, %rax -; AVX2-NEXT:    orq %r15, %rax -; AVX2-NEXT:    orq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Folded Reload -; AVX2-NEXT:    orq %r13, %rax -; AVX2-NEXT:    # kill: def $cl killed $cl killed $rcx -; AVX2-NEXT:    shldq %cl, %rsi, %rdx -; AVX2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload -; AVX2-NEXT:    andq 264(%rdi), %rcx -; AVX2-NEXT:    andq 8(%rdi), %rdx -; AVX2-NEXT:    orq %r9, %rax -; AVX2-NEXT:    orq %rcx, %rdx -; AVX2-NEXT:    orq %rbp, %rdx -; AVX2-NEXT:    orq %r12, %rdx -; AVX2-NEXT:    orq %r14, %rdx -; AVX2-NEXT:    orq %r8, %rdx -; AVX2-NEXT:    orq %rax, %rdx -; AVX2-NEXT:    setne %al -; AVX2-NEXT:    addq $1560, %rsp # imm = 0x618 -; AVX2-NEXT:    popq %rbx -; AVX2-NEXT:    popq %r12 -; AVX2-NEXT:    popq %r13 -; AVX2-NEXT:    popq %r14 -; AVX2-NEXT:    popq %r15 -; AVX2-NEXT:    popq %rbp -; AVX2-NEXT:    vzeroupper -; AVX2-NEXT:    retq -; -; AVX512-LABEL: test_ne_i4096: -; AVX512:       # %bb.0: -; AVX512-NEXT:    pushq %rbp -; AVX512-NEXT:    pushq %r15 -; AVX512-NEXT:    pushq %r14 -; AVX512-NEXT:    pushq %r13 -; AVX512-NEXT:    pushq %r12 -; AVX512-NEXT:    pushq %rbx -; AVX512-NEXT:    subq $1560, %rsp # imm = 0x618 -; AVX512-NEXT:    movl %esi, %ecx -; AVX512-NEXT:    movl %esi, %eax -; AVX512-NEXT:    andl $4032, %eax # imm = 0xFC0 -; AVX512-NEXT:    vxorps %xmm0, %xmm0, %xmm0 -; AVX512-NEXT:    vmovups %ymm0, {{[0-9]+}}(%rsp) -; AVX512-NEXT:    vmovups %ymm0, {{[0-9]+}}(%rsp) -; AVX512-NEXT:    vmovups %ymm0, {{[0-9]+}}(%rsp) -; AVX512-NEXT:    vmovups %ymm0, {{[0-9]+}}(%rsp) -; AVX512-NEXT:    vmovups %ymm0, {{[0-9]+}}(%rsp) -; AVX512-NEXT:    vmovups %ymm0, {{[0-9]+}}(%rsp) -; AVX512-NEXT:    vmovups %ymm0, {{[0-9]+}}(%rsp) -; AVX512-NEXT:    vmovups %ymm0, {{[0-9]+}}(%rsp) -; AVX512-NEXT:    vmovups %ymm0, {{[0-9]+}}(%rsp) -; AVX512-NEXT:    vmovups %ymm0, {{[0-9]+}}(%rsp) -; AVX512-NEXT:    vmovups %ymm0, {{[0-9]+}}(%rsp) -; AVX512-NEXT:    vmovups %ymm0, {{[0-9]+}}(%rsp) -; AVX512-NEXT:    vmovups %ymm0, {{[0-9]+}}(%rsp) -; AVX512-NEXT:    vmovups %ymm0, {{[0-9]+}}(%rsp) -; AVX512-NEXT:    vmovups %ymm0, {{[0-9]+}}(%rsp) -; AVX512-NEXT:    vmovups %ymm0, {{[0-9]+}}(%rsp) -; AVX512-NEXT:    vmovups %ymm0, {{[0-9]+}}(%rsp) -; AVX512-NEXT:    vmovups %ymm0, {{[0-9]+}}(%rsp) -; AVX512-NEXT:    vmovups %ymm0, {{[0-9]+}}(%rsp) -; AVX512-NEXT:    vmovups %ymm0, {{[0-9]+}}(%rsp) -; AVX512-NEXT:    vmovups %ymm0, 
{{[0-9]+}}(%rsp) -; AVX512-NEXT:    vmovups %ymm0, {{[0-9]+}}(%rsp) -; AVX512-NEXT:    vmovups %ymm0, {{[0-9]+}}(%rsp) -; AVX512-NEXT:    vmovups %ymm0, {{[0-9]+}}(%rsp) -; AVX512-NEXT:    vmovups %ymm0, {{[0-9]+}}(%rsp) -; AVX512-NEXT:    vmovups %ymm0, {{[0-9]+}}(%rsp) -; AVX512-NEXT:    vmovups %ymm0, {{[0-9]+}}(%rsp) -; AVX512-NEXT:    vmovups %ymm0, {{[0-9]+}}(%rsp) -; AVX512-NEXT:    vmovups %ymm0, {{[0-9]+}}(%rsp) -; AVX512-NEXT:    vmovups %ymm0, {{[0-9]+}}(%rsp) -; AVX512-NEXT:    vmovups %ymm0, {{[0-9]+}}(%rsp) -; AVX512-NEXT:    vmovaps {{.*#+}} xmm0 = [1,0,0,0] -; AVX512-NEXT:    vmovups %ymm0, {{[0-9]+}}(%rsp) -; AVX512-NEXT:    andl $63, %ecx -; AVX512-NEXT:    shrl $3, %eax -; AVX512-NEXT:    negl %eax -; AVX512-NEXT:    movslq %eax, %rsi -; AVX512-NEXT:    movq 1280(%rsp,%rsi), %rdx -; AVX512-NEXT:    movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT:    movq 1288(%rsp,%rsi), %rax -; AVX512-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT:    shldq %cl, %rdx, %rax -; AVX512-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT:    movq 1536(%rsp,%rsi), %rdx -; AVX512-NEXT:    movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT:    movq 1544(%rsp,%rsi), %rax -; AVX512-NEXT:    shldq %cl, %rdx, %rax -; AVX512-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT:    movq 1152(%rsp,%rsi), %rdx -; AVX512-NEXT:    movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT:    movq 1160(%rsp,%rsi), %rax -; AVX512-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT:    shldq %cl, %rdx, %rax -; AVX512-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT:    movq 1408(%rsp,%rsi), %rdx -; AVX512-NEXT:    movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT:    movq 1416(%rsp,%rsi), %rax -; AVX512-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT:    shldq %cl, %rdx, %rax -; AVX512-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT:    movq 1216(%rsp,%rsi), %rdx -; AVX512-NEXT:    movq %rdx, (%rsp) # 8-byte Spill -; AVX512-NEXT:    movq 1224(%rsp,%rsi), %rax -; AVX512-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT:    shldq %cl, %rdx, %rax -; AVX512-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT:    movq 1472(%rsp,%rsi), %rdx -; AVX512-NEXT:    movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT:    movq 1480(%rsp,%rsi), %rax -; AVX512-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT:    shldq %cl, %rdx, %rax -; AVX512-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT:    movq 1088(%rsp,%rsi), %rdx -; AVX512-NEXT:    movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT:    movq 1096(%rsp,%rsi), %rax -; AVX512-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT:    shldq %cl, %rdx, %rax -; AVX512-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT:    movq 1344(%rsp,%rsi), %rdx -; AVX512-NEXT:    movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT:    movq 1352(%rsp,%rsi), %rax -; AVX512-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT:    shldq %cl, %rdx, %rax -; AVX512-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT:    movq 1248(%rsp,%rsi), %rdx -; AVX512-NEXT:    movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT:    
movq 1256(%rsp,%rsi), %rax -; AVX512-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT:    shldq %cl, %rdx, %rax -; AVX512-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT:    movq 1504(%rsp,%rsi), %rdx -; AVX512-NEXT:    movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT:    movq 1512(%rsp,%rsi), %rax -; AVX512-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT:    shldq %cl, %rdx, %rax -; AVX512-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT:    movq 1120(%rsp,%rsi), %rdx -; AVX512-NEXT:    movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT:    movq 1128(%rsp,%rsi), %rax -; AVX512-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT:    shldq %cl, %rdx, %rax -; AVX512-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT:    movq 1376(%rsp,%rsi), %rdx -; AVX512-NEXT:    movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT:    movq 1384(%rsp,%rsi), %rax -; AVX512-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT:    shldq %cl, %rdx, %rax -; AVX512-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT:    movq 1184(%rsp,%rsi), %rdx -; AVX512-NEXT:    movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT:    movq 1192(%rsp,%rsi), %rax -; AVX512-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT:    shldq %cl, %rdx, %rax -; AVX512-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT:    movq 1440(%rsp,%rsi), %rdx -; AVX512-NEXT:    movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT:    movq 1448(%rsp,%rsi), %rax -; AVX512-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT:    shldq %cl, %rdx, %rax -; AVX512-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT:    movq 1056(%rsp,%rsi), %rdx -; AVX512-NEXT:    movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT:    movq 1064(%rsp,%rsi), %rax -; AVX512-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT:    shldq %cl, %rdx, %rax -; AVX512-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT:    movq 1312(%rsp,%rsi), %rdx -; AVX512-NEXT:    movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT:    movq 1320(%rsp,%rsi), %rax -; AVX512-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT:    shldq %cl, %rdx, %rax -; AVX512-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT:    movq 1264(%rsp,%rsi), %rdx -; AVX512-NEXT:    movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT:    movq 1272(%rsp,%rsi), %rax -; AVX512-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT:    shldq %cl, %rdx, %rax -; AVX512-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT:    movq 1520(%rsp,%rsi), %rdx -; AVX512-NEXT:    movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT:    movq 1528(%rsp,%rsi), %rax -; AVX512-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT:    shldq %cl, %rdx, %rax -; AVX512-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT:    movq 1136(%rsp,%rsi), %rdx -; AVX512-NEXT:    movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT:    movq 1144(%rsp,%rsi), %rax -; AVX512-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT:    shldq 
%cl, %rdx, %rax -; AVX512-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT:    movq 1392(%rsp,%rsi), %rdx -; AVX512-NEXT:    movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT:    movq 1400(%rsp,%rsi), %rax -; AVX512-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT:    shldq %cl, %rdx, %rax -; AVX512-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT:    movq 1200(%rsp,%rsi), %r10 -; AVX512-NEXT:    movq 1208(%rsp,%rsi), %rax -; AVX512-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT:    shldq %cl, %r10, %rax -; AVX512-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT:    movq 1456(%rsp,%rsi), %rdx -; AVX512-NEXT:    movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT:    movq 1464(%rsp,%rsi), %rax -; AVX512-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT:    shldq %cl, %rdx, %rax -; AVX512-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT:    movq 1072(%rsp,%rsi), %r14 -; AVX512-NEXT:    movq 1080(%rsp,%rsi), %rax -; AVX512-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT:    shldq %cl, %r14, %rax -; AVX512-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT:    movq 1328(%rsp,%rsi), %rdx -; AVX512-NEXT:    movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT:    movq 1336(%rsp,%rsi), %rax -; AVX512-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT:    shldq %cl, %rdx, %rax -; AVX512-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT:    movq 1232(%rsp,%rsi), %rax -; AVX512-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT:    movq 1240(%rsp,%rsi), %rdx -; AVX512-NEXT:    movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT:    shldq %cl, %rax, %rdx -; AVX512-NEXT:    movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT:    movq 1488(%rsp,%rsi), %r12 -; AVX512-NEXT:    movq 1496(%rsp,%rsi), %rax -; AVX512-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT:    shldq %cl, %r12, %rax -; AVX512-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT:    movq 1104(%rsp,%rsi), %rax -; AVX512-NEXT:    movq 1112(%rsp,%rsi), %rdx -; AVX512-NEXT:    movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT:    shldq %cl, %rax, %rdx -; AVX512-NEXT:    movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT:    movq 1360(%rsp,%rsi), %r11 -; AVX512-NEXT:    movq 1368(%rsp,%rsi), %rbx -; AVX512-NEXT:    movq %rbx, %rdx -; AVX512-NEXT:    shldq %cl, %r11, %rdx -; AVX512-NEXT:    movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT:    movq 1168(%rsp,%rsi), %r9 -; AVX512-NEXT:    movq %r9, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT:    movq 1176(%rsp,%rsi), %r8 -; AVX512-NEXT:    movq %r8, %rdx -; AVX512-NEXT:    shldq %cl, %r9, %rdx -; AVX512-NEXT:    movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT:    movq 1424(%rsp,%rsi), %r9 -; AVX512-NEXT:    movq 1432(%rsp,%rsi), %rdx -; AVX512-NEXT:    movq %rdx, %r15 -; AVX512-NEXT:    shldq %cl, %r9, %r15 -; AVX512-NEXT:    movq %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT:    movq 1296(%rsp,%rsi), %rbp -; AVX512-NEXT:    movq %rbp, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT:    movq 1304(%rsp,%rsi), %r15 -; AVX512-NEXT:    movq %r15, %r13 -; AVX512-NEXT:    shldq %cl, 
%rbp, %r13 -; AVX512-NEXT:    movq %r13, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Reload -; AVX512-NEXT:    shldq %cl, %r13, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill -; AVX512-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Reload -; AVX512-NEXT:    shldq %cl, %r13, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill -; AVX512-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Reload -; AVX512-NEXT:    shldq %cl, %r13, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill -; AVX512-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Reload -; AVX512-NEXT:    shldq %cl, %r13, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill -; AVX512-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Reload -; AVX512-NEXT:    shldq %cl, %r13, (%rsp) # 8-byte Folded Spill -; AVX512-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Reload -; AVX512-NEXT:    shldq %cl, %r13, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill -; AVX512-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Reload -; AVX512-NEXT:    shldq %cl, %r13, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill -; AVX512-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Reload -; AVX512-NEXT:    shldq %cl, %r13, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill -; AVX512-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Reload -; AVX512-NEXT:    shldq %cl, %r13, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill -; AVX512-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Reload -; AVX512-NEXT:    shldq %cl, %r13, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill -; AVX512-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Reload -; AVX512-NEXT:    shldq %cl, %r13, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill -; AVX512-NEXT:    shldq %cl, %rbx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill -; AVX512-NEXT:    shldq %cl, %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill -; AVX512-NEXT:    shldq %cl, %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill -; AVX512-NEXT:    movq 1048(%rsp,%rsi), %rdx -; AVX512-NEXT:    shldq %cl, %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill -; AVX512-NEXT:    shldq %cl, %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill -; AVX512-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Reload -; AVX512-NEXT:    shldq %cl, %rbx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill -; AVX512-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Reload -; AVX512-NEXT:    shldq %cl, %rbx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill -; AVX512-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Reload -; AVX512-NEXT:    shldq %cl, %rbx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill -; AVX512-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Reload -; AVX512-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload -; AVX512-NEXT:    shldq %cl, %r15, %rbx -; AVX512-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload -; AVX512-NEXT:    shldq %cl, %r15, %r10 -; AVX512-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload -; AVX512-NEXT:    shldq %cl, %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill -; AVX512-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload -; AVX512-NEXT:    shldq %cl, %r15, %r14 -; AVX512-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload -; AVX512-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload -; AVX512-NEXT:    shldq %cl, %r15, %r8 -; AVX512-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Reload -; AVX512-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 
8-byte Reload -; AVX512-NEXT:    shldq %cl, %r15, %r13 -; AVX512-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload -; AVX512-NEXT:    shldq %cl, %r15, %r12 -; AVX512-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload -; AVX512-NEXT:    shldq %cl, %r15, %rax -; AVX512-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload -; AVX512-NEXT:    shldq %cl, %r15, %r11 -; AVX512-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload -; AVX512-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rbp # 8-byte Reload -; AVX512-NEXT:    shldq %cl, %rbp, %r15 -; AVX512-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rbp # 8-byte Reload -; AVX512-NEXT:    shldq %cl, %rbp, %r9 -; AVX512-NEXT:    andq 384(%rdi), %r9 -; AVX512-NEXT:    andq 128(%rdi), %r15 -; AVX512-NEXT:    orq %r9, %r15 -; AVX512-NEXT:    movq %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; AVX512-NEXT:    andq 320(%rdi), %r11 -; AVX512-NEXT:    andq 64(%rdi), %rax -; AVX512-NEXT:    orq %r11, %rax -; AVX512-NEXT:    andq 448(%rdi), %r12 -; AVX512-NEXT:    andq 192(%rdi), %r13 -; AVX512-NEXT:    orq %r12, %r13 -; AVX512-NEXT:    orq %rax, %r13 -; AVX512-NEXT:    andq 288(%rdi), %r8 -; AVX512-NEXT:    andq 32(%rdi), %r14 -; AVX512-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; AVX512-NEXT:    andq 416(%rdi), %rax -; AVX512-NEXT:    orq %r8, %r14 -; AVX512-NEXT:    andq 160(%rdi), %r10 -; AVX512-NEXT:    orq %rax, %r10 -; AVX512-NEXT:    andq 352(%rdi), %rbx -; AVX512-NEXT:    orq %r14, %r10 -; AVX512-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; AVX512-NEXT:    andq 96(%rdi), %rax -; AVX512-NEXT:    orq %rbx, %rax -; AVX512-NEXT:    movq %rax, %r8 -; AVX512-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; AVX512-NEXT:    andq 480(%rdi), %rax -; AVX512-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload -; AVX512-NEXT:    andq 224(%rdi), %r15 -; AVX512-NEXT:    orq %rax, %r15 -; AVX512-NEXT:    orq %r8, %r15 -; AVX512-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload -; AVX512-NEXT:    andq 272(%rdi), %r8 -; AVX512-NEXT:    orq %r10, %r15 -; AVX512-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; AVX512-NEXT:    andq 16(%rdi), %rax -; AVX512-NEXT:    orq %r8, %rax -; AVX512-NEXT:    movq %rax, %r8 -; AVX512-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Reload -; AVX512-NEXT:    andq 400(%rdi), %r9 -; AVX512-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; AVX512-NEXT:    andq 144(%rdi), %rax -; AVX512-NEXT:    orq %r9, %rax -; AVX512-NEXT:    orq %r8, %rax -; AVX512-NEXT:    movq %rax, %r9 -; AVX512-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload -; AVX512-NEXT:    andq 336(%rdi), %r10 -; AVX512-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; AVX512-NEXT:    andq 80(%rdi), %rax -; AVX512-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload -; AVX512-NEXT:    andq 464(%rdi), %r8 -; AVX512-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Reload -; AVX512-NEXT:    andq 208(%rdi), %r11 -; AVX512-NEXT:    orq %r10, %rax -; AVX512-NEXT:    orq %r8, %r11 -; AVX512-NEXT:    orq %rax, %r11 -; AVX512-NEXT:    orq %r9, %r11 -; AVX512-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload -; AVX512-NEXT:    andq 304(%rdi), %r10 -; AVX512-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; AVX512-NEXT:    andq 48(%rdi), %rax -; AVX512-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Reload -; AVX512-NEXT:    andq 432(%rdi), %r9 -; AVX512-NEXT:    movq (%rsp), %r8 # 8-byte 
Reload -; AVX512-NEXT:    andq 176(%rdi), %r8 -; AVX512-NEXT:    orq %r10, %rax -; AVX512-NEXT:    movq %rax, %r10 -; AVX512-NEXT:    orq %r9, %r8 -; AVX512-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Reload -; AVX512-NEXT:    andq 368(%rdi), %r9 -; AVX512-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; AVX512-NEXT:    andq 112(%rdi), %rax -; AVX512-NEXT:    orq %r10, %r8 -; AVX512-NEXT:    movq %r8, %r10 -; AVX512-NEXT:    orq %r9, %rax -; AVX512-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload -; AVX512-NEXT:    andq 496(%rdi), %r8 -; AVX512-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Reload -; AVX512-NEXT:    andq 240(%rdi), %r9 -; AVX512-NEXT:    orq %r8, %r9 -; AVX512-NEXT:    orq %rax, %r9 -; AVX512-NEXT:    orq %r10, %r9 -; AVX512-NEXT:    orq %r11, %r9 -; AVX512-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload -; AVX512-NEXT:    andq 392(%rdi), %r10 -; AVX512-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rbp # 8-byte Reload -; AVX512-NEXT:    andq 136(%rdi), %rbp -; AVX512-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload -; AVX512-NEXT:    andq 328(%rdi), %r8 -; AVX512-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; AVX512-NEXT:    andq 72(%rdi), %rax -; AVX512-NEXT:    orq %r10, %rbp -; AVX512-NEXT:    orq %r8, %rax -; AVX512-NEXT:    movq %rax, %r8 -; AVX512-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; AVX512-NEXT:    andq 456(%rdi), %rax -; AVX512-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r12 # 8-byte Reload -; AVX512-NEXT:    andq 200(%rdi), %r12 -; AVX512-NEXT:    orq %rax, %r12 -; AVX512-NEXT:    orq %r8, %r12 -; AVX512-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload -; AVX512-NEXT:    andq 296(%rdi), %r8 -; AVX512-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; AVX512-NEXT:    andq 40(%rdi), %rax -; AVX512-NEXT:    orq %r8, %rax -; AVX512-NEXT:    movq %rax, %r10 -; AVX512-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload -; AVX512-NEXT:    andq 424(%rdi), %r8 -; AVX512-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; AVX512-NEXT:    andq 168(%rdi), %rax -; AVX512-NEXT:    orq %r8, %rax -; AVX512-NEXT:    orq %r10, %rax -; AVX512-NEXT:    movq %rax, %r10 -; AVX512-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload -; AVX512-NEXT:    andq 360(%rdi), %r8 -; AVX512-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; AVX512-NEXT:    andq 104(%rdi), %rax -; AVX512-NEXT:    orq %r8, %rax -; AVX512-NEXT:    movq %rax, %r8 -; AVX512-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; AVX512-NEXT:    andq 488(%rdi), %rax -; AVX512-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload -; AVX512-NEXT:    andq 232(%rdi), %r14 -; AVX512-NEXT:    orq %rax, %r14 -; AVX512-NEXT:    orq %r8, %r14 -; AVX512-NEXT:    orq %r10, %r14 -; AVX512-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload -; AVX512-NEXT:    andq 280(%rdi), %r8 -; AVX512-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; AVX512-NEXT:    andq 24(%rdi), %rax -; AVX512-NEXT:    orq %r8, %rax -; AVX512-NEXT:    movq %rax, %r10 -; AVX512-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload -; AVX512-NEXT:    andq 408(%rdi), %r8 -; AVX512-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; AVX512-NEXT:    andq 152(%rdi), %rax -; AVX512-NEXT:    orq %r8, %rax -; AVX512-NEXT:    orq %r10, %rax -; AVX512-NEXT:    movq %rax, %r10 -; AVX512-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Reload -; 
AVX512-NEXT:    andq 344(%rdi), %r11 -; AVX512-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload -; AVX512-NEXT:    andq 88(%rdi), %r8 -; AVX512-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; AVX512-NEXT:    andq 472(%rdi), %rax -; AVX512-NEXT:    orq %r11, %r8 -; AVX512-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Reload -; AVX512-NEXT:    andq 216(%rdi), %rbx -; AVX512-NEXT:    orq %rax, %rbx -; AVX512-NEXT:    orq %r8, %rbx -; AVX512-NEXT:    orq %r10, %rbx -; AVX512-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload -; AVX512-NEXT:    andq 312(%rdi), %r10 -; AVX512-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; AVX512-NEXT:    andq 56(%rdi), %rax -; AVX512-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload -; AVX512-NEXT:    andq 440(%rdi), %r8 -; AVX512-NEXT:    orq %r10, %rax -; AVX512-NEXT:    movq %rax, %r10 -; AVX512-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; AVX512-NEXT:    andq 184(%rdi), %rax -; AVX512-NEXT:    orq %r8, %rax -; AVX512-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload -; AVX512-NEXT:    andq 376(%rdi), %r8 -; AVX512-NEXT:    orq %r10, %rax -; AVX512-NEXT:    movq %rax, %r11 -; AVX512-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; AVX512-NEXT:    andq 120(%rdi), %rax -; AVX512-NEXT:    orq %r8, %rax -; AVX512-NEXT:    movq %rax, %r10 -; AVX512-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; AVX512-NEXT:    andq 504(%rdi), %rax -; AVX512-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload -; AVX512-NEXT:    andq 248(%rdi), %r8 -; AVX512-NEXT:    orq %rax, %r8 -; AVX512-NEXT:    orq %r10, %r8 -; AVX512-NEXT:    orq %r11, %r8 -; AVX512-NEXT:    movq 1040(%rsp,%rsi), %rax -; AVX512-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload -; AVX512-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload -; AVX512-NEXT:    shldq %cl, %rsi, %r10 -; AVX512-NEXT:    orq %rbx, %r8 -; AVX512-NEXT:    shlxq %rcx, %rax, %rsi -; AVX512-NEXT:    andq 256(%rdi), %r10 -; AVX512-NEXT:    andq (%rdi), %rsi -; AVX512-NEXT:    orq %r10, %rsi -; AVX512-NEXT:    orq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Folded Reload -; AVX512-NEXT:    orq %r13, %rsi -; AVX512-NEXT:    orq %r15, %rsi -; AVX512-NEXT:    # kill: def $cl killed $cl killed $rcx -; AVX512-NEXT:    shldq %cl, %rax, %rdx -; AVX512-NEXT:    orq %r9, %rsi -; AVX512-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; AVX512-NEXT:    andq 264(%rdi), %rax -; AVX512-NEXT:    andq 8(%rdi), %rdx -; AVX512-NEXT:    orq %rax, %rdx -; AVX512-NEXT:    orq %rbp, %rdx -; AVX512-NEXT:    orq %r12, %rdx -; AVX512-NEXT:    orq %r14, %rdx -; AVX512-NEXT:    orq %r8, %rdx -; AVX512-NEXT:    orq %rsi, %rdx -; AVX512-NEXT:    setne %al -; AVX512-NEXT:    addq $1560, %rsp # imm = 0x618 -; AVX512-NEXT:    popq %rbx -; AVX512-NEXT:    popq %r12 -; AVX512-NEXT:    popq %r13 -; AVX512-NEXT:    popq %r14 -; AVX512-NEXT:    popq %r15 -; AVX512-NEXT:    popq %rbp -; AVX512-NEXT:    vzeroupper -; AVX512-NEXT:    retq +; X64-LABEL: test_ne_i4096: +; X64:       # %bb.0: +; X64-NEXT:    movl %esi, %eax +; X64-NEXT:    andl $4064, %eax # imm = 0xFE0 +; X64-NEXT:    shrl $3, %eax +; X64-NEXT:    movl (%rdi,%rax), %eax +; X64-NEXT:    btl %esi, %eax +; X64-NEXT:    setb %al +; X64-NEXT:    retq    %rem = and i32 %position, 4095    %ofs = zext nneg i32 %rem to i4096    %bit = shl nuw i4096 1, %ofs @@ -7155,14 +1029,73 @@ define i1 @complement_cmpz_i128(ptr %word, i32 %position) nounwind {  define 
i32 @reset_multiload_i128(ptr %word, i32 %position, ptr %p) nounwind {  ; X86-LABEL: reset_multiload_i128:  ; X86:       # %bb.0: +; X86-NEXT:    pushl %ebx +; X86-NEXT:    pushl %edi +; X86-NEXT:    pushl %esi +; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax +; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx +; X86-NEXT:    movl (%eax), %eax +; X86-NEXT:    movl %edx, %esi +; X86-NEXT:    andl $96, %esi +; X86-NEXT:    shrl $3, %esi +; X86-NEXT:    movl (%ecx,%esi), %edi +; X86-NEXT:    movl %edi, %ebx +; X86-NEXT:    btrl %edx, %ebx +; X86-NEXT:    btl %edx, %edi +; X86-NEXT:    movl %ebx, (%ecx,%esi) +; X86-NEXT:    jae .LBB22_2 +; X86-NEXT:  # %bb.1: +; X86-NEXT:    xorl %eax, %eax +; X86-NEXT:  .LBB22_2: +; X86-NEXT:    popl %esi +; X86-NEXT:    popl %edi +; X86-NEXT:    popl %ebx +; X86-NEXT:    retl +; +; X64-LABEL: reset_multiload_i128: +; X64:       # %bb.0: +; X64-NEXT:    movl %esi, %ecx +; X64-NEXT:    andl $96, %ecx +; X64-NEXT:    shrl $3, %ecx +; X64-NEXT:    movl (%rdi,%rcx), %r9d +; X64-NEXT:    movl %r9d, %r8d +; X64-NEXT:    btrl %esi, %r8d +; X64-NEXT:    xorl %eax, %eax +; X64-NEXT:    btl %esi, %r9d +; X64-NEXT:    jb .LBB22_2 +; X64-NEXT:  # %bb.1: +; X64-NEXT:    movl (%rdx), %eax +; X64-NEXT:  .LBB22_2: +; X64-NEXT:    movl %r8d, (%rdi,%rcx) +; X64-NEXT:    retq +  %rem = and i32 %position, 127 +  %ofs = zext nneg i32 %rem to i128 +  %bit = shl nuw i128 1, %ofs +  %mask = xor i128 %bit, -1 +  %ld = load i128, ptr %word +  %sel = load i32, ptr %p +  %test = and i128 %ld, %bit +  %res = and i128 %ld, %mask +  %cmp = icmp eq i128 %test, 0 +  store i128 %res, ptr %word +  %ret = select i1 %cmp, i32 %sel, i32 0 +  ret i32 %ret +} + +; BTC/BT/BTS sequence on same i128 +define i1 @sequence_i128(ptr %word, i32 %pos0, i32 %pos1, i32 %pos2) nounwind { +; X86-LABEL: sequence_i128: +; X86:       # %bb.0:  ; X86-NEXT:    pushl %ebp  ; X86-NEXT:    movl %esp, %ebp  ; X86-NEXT:    pushl %ebx  ; X86-NEXT:    pushl %edi  ; X86-NEXT:    pushl %esi  ; X86-NEXT:    andl $-16, %esp -; X86-NEXT:    subl $80, %esp -; X86-NEXT:    movzbl 12(%ebp), %ecx +; X86-NEXT:    subl $144, %esp +; X86-NEXT:    movb 20(%ebp), %ch +; X86-NEXT:    movb 12(%ebp), %cl  ; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)  ; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)  ; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) @@ -7176,54 +1109,80 @@ define i32 @reset_multiload_i128(ptr %word, i32 %position, ptr %p) nounwind {  ; X86-NEXT:    andb $12, %al  ; X86-NEXT:    negb %al  ; X86-NEXT:    movsbl %al, %eax -; X86-NEXT:    movl 56(%esp,%eax), %esi -; X86-NEXT:    movl 60(%esp,%eax), %edx -; X86-NEXT:    shldl %cl, %esi, %edx -; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    movl 48(%esp,%eax), %edi -; X86-NEXT:    movl 52(%esp,%eax), %eax -; X86-NEXT:    shldl %cl, %eax, %esi +; X86-NEXT:    movl 56(%esp,%eax), %edx +; X86-NEXT:    movl 60(%esp,%eax), %esi +; X86-NEXT:    shldl %cl, %edx, %esi  ; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    shldl %cl, %edi, %eax -; X86-NEXT:    movl 8(%ebp), %ebx +; X86-NEXT:    movl 48(%esp,%eax), %edi +; X86-NEXT:    movl 52(%esp,%eax), %ebx +; X86-NEXT:    shldl %cl, %ebx, %edx +; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT:    shldl %cl, %edi, %ebx  ; X86-NEXT:    shll %cl, %edi -; X86-NEXT:    movl 8(%ebx), %ecx -; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    andl %esi, %ecx -; X86-NEXT:    movl (%ebx), %esi +; X86-NEXT:    movl 
$0, {{[0-9]+}}(%esp) +; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) +; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) +; X86-NEXT:    movl $1, {{[0-9]+}}(%esp) +; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) +; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) +; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) +; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) +; X86-NEXT:    movb %ch, %al +; X86-NEXT:    shrb $3, %al +; X86-NEXT:    andb $12, %al +; X86-NEXT:    negb %al +; X86-NEXT:    movsbl %al, %eax +; X86-NEXT:    movl 84(%esp,%eax), %edx +; X86-NEXT:    movl 88(%esp,%eax), %esi  ; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    andl %edi, %esi -; X86-NEXT:    orl %ecx, %esi -; X86-NEXT:    movl 12(%ebx), %ecx -; X86-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    andl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload -; X86-NEXT:    movl %eax, %edx -; X86-NEXT:    movl 4(%ebx), %ebx -; X86-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT:    andl %ebx, %eax -; X86-NEXT:    orl %ecx, %eax -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT:    notl %ecx -; X86-NEXT:    andl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload -; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X86-NEXT:    notl %ebx -; X86-NEXT:    andl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload -; X86-NEXT:    notl %edx -; X86-NEXT:    andl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload -; X86-NEXT:    notl %edi -; X86-NEXT:    andl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload -; X86-NEXT:    orl %esi, %eax +; X86-NEXT:    movzbl 20(%ebp), %ecx +; X86-NEXT:    shldl %cl, %edx, %esi +; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT:    movl 80(%esp,%eax), %esi +; X86-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT:    movl 92(%esp,%eax), %eax +; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; X86-NEXT:    shldl %cl, %esi, %eax +; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; X86-NEXT:    movl %esi, %eax +; X86-NEXT:    shll %cl, %eax +; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT:    shldl %cl, %esi, %edx +; X86-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT:    movl 8(%ebp), %eax +; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-NEXT:    xorl 8(%eax), %edx +; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; X86-NEXT:    xorl 12(%eax), %esi +; X86-NEXT:    xorl (%eax), %edi +; X86-NEXT:    xorl 4(%eax), %ebx +; X86-NEXT:    movl %esi, {{[0-9]+}}(%esp) +; X86-NEXT:    movl %edx, {{[0-9]+}}(%esp) +; X86-NEXT:    movl %ebx, {{[0-9]+}}(%esp) +; X86-NEXT:    movl %edi, {{[0-9]+}}(%esp) +; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) +; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) +; X86-NEXT:    movl $0, {{[0-9]+}}(%esp) +; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)  ; X86-NEXT:    movl 16(%ebp), %eax -; X86-NEXT:    movl (%eax), %eax -; X86-NEXT:    movl 8(%ebp), %esi -; X86-NEXT:    movl %ebx, 8(%esi) -; X86-NEXT:    movl %ecx, 12(%esi) -; X86-NEXT:    movl %edi, (%esi) -; X86-NEXT:    movl %edx, 4(%esi) -; X86-NEXT:    je .LBB22_2 -; X86-NEXT:  # %bb.1: -; X86-NEXT:    xorl %eax, %eax -; X86-NEXT:  .LBB22_2: +; X86-NEXT:    # kill: def $al killed $al killed $eax +; X86-NEXT:    andb $96, %al +; X86-NEXT:    shrb $3, %al +; X86-NEXT:    movzbl %al, %eax +; X86-NEXT: 
   movl 96(%esp,%eax), %eax +; X86-NEXT:    movl 16(%ebp), %ecx +; X86-NEXT:    btl %ecx, %eax +; X86-NEXT:    setae %al +; X86-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload +; X86-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload +; X86-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload +; X86-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload +; X86-NEXT:    movl 8(%ebp), %ecx +; X86-NEXT:    movl %edx, 8(%ecx) +; X86-NEXT:    movl %esi, 12(%ecx) +; X86-NEXT:    movl %edi, (%ecx) +; X86-NEXT:    movl %ebx, 4(%ecx)  ; X86-NEXT:    leal -12(%ebp), %esp  ; X86-NEXT:    popl %esi  ; X86-NEXT:    popl %edi @@ -7231,73 +1190,129 @@ define i32 @reset_multiload_i128(ptr %word, i32 %position, ptr %p) nounwind {  ; X86-NEXT:    popl %ebp  ; X86-NEXT:    retl  ; -; SSE-LABEL: reset_multiload_i128: +; SSE-LABEL: sequence_i128:  ; SSE:       # %bb.0: +; SSE-NEXT:    movl %ecx, %eax  ; SSE-NEXT:    movl %esi, %ecx -; SSE-NEXT:    movl $1, %esi -; SSE-NEXT:    xorl %r8d, %r8d -; SSE-NEXT:    shldq %cl, %rsi, %r8 -; SSE-NEXT:    xorl %eax, %eax -; SSE-NEXT:    shlq %cl, %rsi +; SSE-NEXT:    movl $1, %r8d +; SSE-NEXT:    xorl %esi, %esi +; SSE-NEXT:    shldq %cl, %r8, %rsi +; SSE-NEXT:    movl $1, %r9d +; SSE-NEXT:    shlq %cl, %r9 +; SSE-NEXT:    xorl %r11d, %r11d  ; SSE-NEXT:    testb $64, %cl -; SSE-NEXT:    cmovneq %rsi, %r8 -; SSE-NEXT:    cmovneq %rax, %rsi -; SSE-NEXT:    movq (%rdi), %rcx -; SSE-NEXT:    movq 8(%rdi), %r9 -; SSE-NEXT:    movq %r9, %r10 -; SSE-NEXT:    andq %r8, %r10 -; SSE-NEXT:    notq %r8 -; SSE-NEXT:    movq %rcx, %r11 -; SSE-NEXT:    andq %rsi, %r11 -; SSE-NEXT:    notq %rsi -; SSE-NEXT:    andq %r9, %r8 -; SSE-NEXT:    andq %rcx, %rsi -; SSE-NEXT:    orq %r10, %r11 -; SSE-NEXT:    jne .LBB22_2 -; SSE-NEXT:  # %bb.1: -; SSE-NEXT:    movl (%rdx), %eax -; SSE-NEXT:  .LBB22_2: -; SSE-NEXT:    movq %rsi, (%rdi) -; SSE-NEXT:    movq %r8, 8(%rdi) -; SSE-NEXT:    # kill: def $eax killed $eax killed $rax +; SSE-NEXT:    cmovneq %r9, %rsi +; SSE-NEXT:    cmovneq %r11, %r9 +; SSE-NEXT:    xorl %r10d, %r10d +; SSE-NEXT:    movl %eax, %ecx +; SSE-NEXT:    shldq %cl, %r8, %r10 +; SSE-NEXT:    shlq %cl, %r8 +; SSE-NEXT:    testb $64, %al +; SSE-NEXT:    cmovneq %r8, %r10 +; SSE-NEXT:    cmovneq %r11, %r8 +; SSE-NEXT:    xorq 8(%rdi), %rsi +; SSE-NEXT:    xorq (%rdi), %r9 +; SSE-NEXT:    movl %edx, %ecx +; SSE-NEXT:    andb $32, %cl +; SSE-NEXT:    movq %r9, %rax +; SSE-NEXT:    shrdq %cl, %rsi, %rax +; SSE-NEXT:    movq %rsi, %r11 +; SSE-NEXT:    shrq %cl, %r11 +; SSE-NEXT:    testb $64, %dl +; SSE-NEXT:    cmoveq %rax, %r11 +; SSE-NEXT:    btl %edx, %r11d +; SSE-NEXT:    setae %al +; SSE-NEXT:    orq %r10, %rsi +; SSE-NEXT:    orq %r8, %r9 +; SSE-NEXT:    movq %r9, (%rdi) +; SSE-NEXT:    movq %rsi, 8(%rdi)  ; SSE-NEXT:    retq  ; -; AVX-LABEL: reset_multiload_i128: -; AVX:       # %bb.0: -; AVX-NEXT:    movl %esi, %ecx -; AVX-NEXT:    movl $1, %esi -; AVX-NEXT:    xorl %r8d, %r8d -; AVX-NEXT:    shldq %cl, %rsi, %r8 -; AVX-NEXT:    xorl %eax, %eax -; AVX-NEXT:    shlxq %rcx, %rsi, %r9 -; AVX-NEXT:    testb $64, %cl -; AVX-NEXT:    cmovneq %r9, %r8 -; AVX-NEXT:    cmovneq %rax, %r9 -; AVX-NEXT:    movq (%rdi), %r10 -; AVX-NEXT:    movq 8(%rdi), %r11 -; AVX-NEXT:    andnq %r11, %r8, %rcx -; AVX-NEXT:    andq %r8, %r11 -; AVX-NEXT:    andnq %r10, %r9, %rsi -; AVX-NEXT:    andq %r9, %r10 -; AVX-NEXT:    orq %r11, %r10 -; AVX-NEXT:    jne .LBB22_2 -; AVX-NEXT:  # %bb.1: -; AVX-NEXT:    movl (%rdx), %eax -; AVX-NEXT:  
.LBB22_2: -; AVX-NEXT:    movq %rsi, (%rdi) -; AVX-NEXT:    movq %rcx, 8(%rdi) -; AVX-NEXT:    # kill: def $eax killed $eax killed $rax -; AVX-NEXT:    retq -  %rem = and i32 %position, 127 -  %ofs = zext nneg i32 %rem to i128 -  %bit = shl nuw i128 1, %ofs -  %mask = xor i128 %bit, -1 +; AVX2-LABEL: sequence_i128: +; AVX2:       # %bb.0: +; AVX2-NEXT:    movl %ecx, %eax +; AVX2-NEXT:    movl %esi, %ecx +; AVX2-NEXT:    xorl %r9d, %r9d +; AVX2-NEXT:    movl $1, %r10d +; AVX2-NEXT:    xorl %esi, %esi +; AVX2-NEXT:    shldq %cl, %r10, %rsi +; AVX2-NEXT:    shlxq %rcx, %r10, %r8 +; AVX2-NEXT:    testb $64, %cl +; AVX2-NEXT:    cmovneq %r8, %rsi +; AVX2-NEXT:    cmovneq %r9, %r8 +; AVX2-NEXT:    xorl %r11d, %r11d +; AVX2-NEXT:    movl %eax, %ecx +; AVX2-NEXT:    shldq %cl, %r10, %r11 +; AVX2-NEXT:    shlxq %rax, %r10, %r10 +; AVX2-NEXT:    testb $64, %al +; AVX2-NEXT:    cmovneq %r10, %r11 +; AVX2-NEXT:    cmovneq %r9, %r10 +; AVX2-NEXT:    xorq 8(%rdi), %rsi +; AVX2-NEXT:    xorq (%rdi), %r8 +; AVX2-NEXT:    movl %edx, %ecx +; AVX2-NEXT:    andb $32, %cl +; AVX2-NEXT:    movq %r8, %rax +; AVX2-NEXT:    shrdq %cl, %rsi, %rax +; AVX2-NEXT:    shrxq %rcx, %rsi, %rcx +; AVX2-NEXT:    testb $64, %dl +; AVX2-NEXT:    cmoveq %rax, %rcx +; AVX2-NEXT:    btl %edx, %ecx +; AVX2-NEXT:    setae %al +; AVX2-NEXT:    orq %r11, %rsi +; AVX2-NEXT:    orq %r10, %r8 +; AVX2-NEXT:    movq %r8, (%rdi) +; AVX2-NEXT:    movq %rsi, 8(%rdi) +; AVX2-NEXT:    retq +; +; AVX512-LABEL: sequence_i128: +; AVX512:       # %bb.0: +; AVX512-NEXT:    movl %ecx, %eax +; AVX512-NEXT:    movl %esi, %ecx +; AVX512-NEXT:    movl $1, %r9d +; AVX512-NEXT:    xorl %esi, %esi +; AVX512-NEXT:    shldq %cl, %r9, %rsi +; AVX512-NEXT:    xorl %r10d, %r10d +; AVX512-NEXT:    shlxq %rcx, %r9, %r8 +; AVX512-NEXT:    testb $64, %cl +; AVX512-NEXT:    cmovneq %r8, %rsi +; AVX512-NEXT:    cmovneq %r10, %r8 +; AVX512-NEXT:    xorl %r11d, %r11d +; AVX512-NEXT:    movl %eax, %ecx +; AVX512-NEXT:    shldq %cl, %r9, %r11 +; AVX512-NEXT:    shlxq %rax, %r9, %r9 +; AVX512-NEXT:    testb $64, %al +; AVX512-NEXT:    cmovneq %r9, %r11 +; AVX512-NEXT:    cmovneq %r10, %r9 +; AVX512-NEXT:    xorq 8(%rdi), %rsi +; AVX512-NEXT:    xorq (%rdi), %r8 +; AVX512-NEXT:    movl %edx, %ecx +; AVX512-NEXT:    andb $32, %cl +; AVX512-NEXT:    movq %r8, %rax +; AVX512-NEXT:    shrdq %cl, %rsi, %rax +; AVX512-NEXT:    shrxq %rcx, %rsi, %rcx +; AVX512-NEXT:    testb $64, %dl +; AVX512-NEXT:    cmoveq %rax, %rcx +; AVX512-NEXT:    btl %edx, %ecx +; AVX512-NEXT:    setae %al +; AVX512-NEXT:    orq %r11, %rsi +; AVX512-NEXT:    orq %r9, %r8 +; AVX512-NEXT:    movq %r8, (%rdi) +; AVX512-NEXT:    movq %rsi, 8(%rdi) +; AVX512-NEXT:    retq +  %rem0 = and i32 %pos0, 127 +  %rem1 = and i32 %pos1, 127 +  %rem2 = and i32 %pos2, 127 +  %ofs0 = zext nneg i32 %rem0 to i128 +  %ofs1 = zext nneg i32 %rem1 to i128 +  %ofs2 = zext nneg i32 %rem2 to i128 +  %bit0 = shl nuw i128 1, %ofs0 +  %bit1 = shl nuw i128 1, %ofs1 +  %bit2 = shl nuw i128 1, %ofs2    %ld = load i128, ptr %word -  %sel = load i32, ptr %p -  %test = and i128 %ld, %bit -  %res = and i128 %ld, %mask -  %cmp = icmp eq i128 %test, 0 -  store i128 %res, ptr %word -  %ret = select i1 %cmp, i32 %sel, i32 0 -  ret i32 %ret +  %res0 = xor i128 %ld, %bit0 +  %test1 = and i128 %res0, %bit1 +  %cmp1 = icmp eq i128 %test1, 0 +  %res2 = or i128 %res0, %bit2 +  store i128 %res2, ptr %word +  ret i1 %cmp1  }  | 
