; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
; RUN: sed 's/iXLen/i32/g' %s | llc -mtriple=riscv32 -mattr=+v,+m \
; RUN:   -verify-machineinstrs | FileCheck %s --check-prefixes=RV32
; RUN: sed 's/iXLen/i64/g' %s | llc -mtriple=riscv64 -mattr=+v,+m -mcpu=sifive-p670 \
; RUN:   -verify-machineinstrs | FileCheck %s --check-prefixes=RV64P670
; RUN: sed 's/iXLen/i64/g' %s | llc -mtriple=riscv64 -mattr=+v,+m -mcpu=spacemit-x60 \
; RUN:   -verify-machineinstrs | FileCheck %s --check-prefixes=RV64X60
; RUN: sed 's/iXLen/i64/g' %s | llc -mtriple=riscv64 -mattr=+v,+m \
; RUN:   -verify-machineinstrs | FileCheck %s --check-prefixes=RV64

; test1: per-row rounding average of two strided byte planes,
; dst[x] = (src1[x] + src2[x] + 1) >> 1, vectorized to vaaddu.vv with
; vxrm set to round-to-nearest-up.
define void @test1(ptr nocapture noundef writeonly %dst, i32 noundef signext %i_dst_stride, ptr nocapture noundef readonly %src1, i32 noundef signext %i_src1_stride, ptr nocapture noundef readonly %src2, i32 noundef signext %i_src2_stride, i32 noundef signext %i_width, i32 noundef signext %i_height) {
; RV32-LABEL: test1:
; RV32:       # %bb.0: # %entry
; RV32-NEXT:    blez a7, .LBB0_17
; RV32-NEXT:  # %bb.1: # %for.cond1.preheader.lr.ph
; RV32-NEXT:    blez a6, .LBB0_17
; RV32-NEXT:  # %bb.2: # %for.cond1.preheader.us.preheader
; RV32-NEXT:    addi t3, a7, -1
; RV32-NEXT:    csrr t2, vlenb
; RV32-NEXT:    slli t1, t2, 1
; RV32-NEXT:    li t4, 32
; RV32-NEXT:    mv t0, t1
; RV32-NEXT:  # %bb.3: # %for.cond1.preheader.us.preheader
; RV32-NEXT:    li t0, 32
; RV32-NEXT:  # %bb.4: # %for.cond1.preheader.us.preheader
; RV32-NEXT:    addi sp, sp, -16
; RV32-NEXT:    .cfi_def_cfa_offset 16
; RV32-NEXT:    sw s0, 12(sp) # 4-byte Folded Spill
; RV32-NEXT:    sw s1, 8(sp) # 4-byte Folded Spill
; RV32-NEXT:    sw s2, 4(sp) # 4-byte Folded Spill
; RV32-NEXT:    sw s3, 0(sp) # 4-byte Folded Spill
; RV32-NEXT:    .cfi_offset s0, -4
; RV32-NEXT:    .cfi_offset s1, -8
; RV32-NEXT:    .cfi_offset s2, -12
; RV32-NEXT:    .cfi_offset s3, -16
; RV32-NEXT:    .cfi_remember_state
; RV32-NEXT:    mul t5, a1, t3
; RV32-NEXT:    add s0, a0, a6
; RV32-NEXT:    mul t6, a3, t3
; RV32-NEXT:    add s2, a2, a6
; RV32-NEXT:    mul s1, a5, t3
; RV32-NEXT:    add s3, a4, a6
; RV32-NEXT:    bltu t4, t1, .LBB0_6
; RV32-NEXT:  # %bb.5: # %for.cond1.preheader.us.preheader
; RV32-NEXT:    li t1, 32
; RV32-NEXT:  .LBB0_6: # %for.cond1.preheader.us.preheader
; RV32-NEXT:    add t3, s0, t5
; RV32-NEXT:    add t6, s2, t6
; RV32-NEXT:    add t4, s3, s1
; RV32-NEXT:    j .LBB0_8
; RV32-NEXT:  # %bb.7: # %for.cond1.preheader.us.preheader
; RV32-NEXT:    mv t1, t0
; RV32-NEXT:  .LBB0_8: # %for.cond1.preheader.us.preheader
; RV32-NEXT:    .cfi_restore_state
; RV32-NEXT:    li t0, 0
; RV32-NEXT:    sltu t5, a0, t6
; RV32-NEXT:    sltu t6, a2, t3
; RV32-NEXT:    and t5, t5, t6
; RV32-NEXT:    sltu t4, a0, t4
; RV32-NEXT:    sltu t3, a4, t3
; RV32-NEXT:    and t3, t4, t3
; RV32-NEXT:    or t4, a1, a3
; RV32-NEXT:    srli t4, t4, 31
; RV32-NEXT:    or t4, t5, t4
; RV32-NEXT:    or t5, a1, a5
; RV32-NEXT:    sltu t1, a6, t1
; RV32-NEXT:    srli t5, t5, 31
; RV32-NEXT:    or t3, t3, t5
; RV32-NEXT:    or t3, t4, t3
; RV32-NEXT:    or t1, t1, t3
; RV32-NEXT:    andi t1, t1, 1
; RV32-NEXT:    slli t2, t2, 1
; RV32-NEXT:    csrwi vxrm, 0
; RV32-NEXT:    j .LBB0_10
; RV32-NEXT:  .LBB0_9: # %for.cond1.for.cond.cleanup3_crit_edge.us
; RV32-NEXT:    # in Loop: Header=BB0_10 Depth=1
; RV32-NEXT:    add a0, a0, a1
; RV32-NEXT:    add a2, a2, a3
; RV32-NEXT:    addi t0, t0, 1
; RV32-NEXT:    add a4, a4, a5
; RV32-NEXT:    beq t0, a7, .LBB0_16
; RV32-NEXT:  .LBB0_10: # %for.cond1.preheader.us
; RV32-NEXT:    # =>This Loop Header: Depth=1
; RV32-NEXT:    # Child Loop BB0_13 Depth 2
; RV32-NEXT:    # Child Loop BB0_15 Depth 2
; RV32-NEXT:    beqz t1, .LBB0_12
; RV32-NEXT:  # %bb.11: # in Loop: Header=BB0_10 Depth=1
; RV32-NEXT:    li t4, 0
; RV32-NEXT:    li t3, 0
; RV32-NEXT:    j .LBB0_15
; RV32-NEXT:  .LBB0_12: # %vector.ph
; RV32-NEXT:    # in Loop: Header=BB0_10 Depth=1
; RV32-NEXT:    li t3, 0
; RV32-NEXT:    neg t4, t2
; RV32-NEXT:    and t4, t4, a6
; RV32-NEXT:    li t6, 0
; RV32-NEXT:    li t5, 0
; RV32-NEXT:    vsetvli s0, zero, e8, m2, ta, ma
; RV32-NEXT:  .LBB0_13: # %vector.body
; RV32-NEXT:    # Parent Loop BB0_10 Depth=1
; RV32-NEXT:    # => This Inner Loop Header: Depth=2
; RV32-NEXT:    add s0, a2, t6
; RV32-NEXT:    add s1, a4, t6
; RV32-NEXT:    vl2r.v v8, (s0)
; RV32-NEXT:    add s0, a0, t6
; RV32-NEXT:    vl2r.v v10, (s1)
; RV32-NEXT:    add s1, t6, t2
; RV32-NEXT:    sltu t6, s1, t6
; RV32-NEXT:    add t5, t5, t6
; RV32-NEXT:    xor t6, s1, t4
; RV32-NEXT:    vaaddu.vv v8, v8, v10
; RV32-NEXT:    or s2, t6, t5
; RV32-NEXT:    vs2r.v v8, (s0)
; RV32-NEXT:    mv t6, s1
; RV32-NEXT:    bnez s2, .LBB0_13
; RV32-NEXT:  # %bb.14: # %middle.block
; RV32-NEXT:    # in Loop: Header=BB0_10 Depth=1
; RV32-NEXT:    beq t4, a6, .LBB0_9
; RV32-NEXT:  .LBB0_15: # %for.body4.us
; RV32-NEXT:    # Parent Loop BB0_10 Depth=1
; RV32-NEXT:    # => This Inner Loop Header: Depth=2
; RV32-NEXT:    add t5, a2, t4
; RV32-NEXT:    add t6, a4, t4
; RV32-NEXT:    add s0, a0, t4
; RV32-NEXT:    lbu t5, 0(t5)
; RV32-NEXT:    lbu t6, 0(t6)
; RV32-NEXT:    addi t4, t4, 1
; RV32-NEXT:    seqz s1, t4
; RV32-NEXT:    add t3, t3, s1
; RV32-NEXT:    add t5, t5, t6
; RV32-NEXT:    xor t6, t4, a6
; RV32-NEXT:    addi t5, t5, 1
; RV32-NEXT:    srli t5, t5, 1
; RV32-NEXT:    or t6, t6, t3
; RV32-NEXT:    sb t5, 0(s0)
; RV32-NEXT:    bnez t6, .LBB0_15
; RV32-NEXT:    j .LBB0_9
; RV32-NEXT:  .LBB0_16:
; RV32-NEXT:    lw s0, 12(sp) # 4-byte Folded Reload
; RV32-NEXT:    lw s1, 8(sp) # 4-byte Folded Reload
; RV32-NEXT:    lw s2, 4(sp) # 4-byte Folded Reload
; RV32-NEXT:    lw s3, 0(sp) # 4-byte Folded Reload
; RV32-NEXT:    .cfi_restore s0
; RV32-NEXT:    .cfi_restore s1
; RV32-NEXT:    .cfi_restore s2
; RV32-NEXT:    .cfi_restore s3
; RV32-NEXT:    addi sp, sp, 16
; RV32-NEXT:    .cfi_def_cfa_offset 0
; RV32-NEXT:  .LBB0_17: # %for.cond.cleanup
; RV32-NEXT:    ret
;
; RV64P670-LABEL: test1:
; RV64P670:       # %bb.0: # %entry
; RV64P670-NEXT:    csrwi vxrm, 0
; RV64P670-NEXT:    blez a7, .LBB0_12
; RV64P670-NEXT:  # %bb.1: # %for.cond1.preheader.lr.ph
; RV64P670-NEXT:    blez a6, .LBB0_12
; RV64P670-NEXT:  # %bb.2: # %for.cond1.preheader.us.preheader
; RV64P670-NEXT:    addi sp, sp, -48
; RV64P670-NEXT:    .cfi_def_cfa_offset 48
; RV64P670-NEXT:    sd s0, 40(sp) # 8-byte Folded Spill
; RV64P670-NEXT:    sd s1, 32(sp) # 8-byte Folded Spill
; RV64P670-NEXT:    sd s2, 24(sp) # 8-byte Folded Spill
; RV64P670-NEXT:    sd s3, 16(sp) # 8-byte Folded Spill
; RV64P670-NEXT:    sd s4, 8(sp) # 8-byte Folded Spill
; RV64P670-NEXT:    .cfi_offset s0, -8
; RV64P670-NEXT:    .cfi_offset s1, -16
; RV64P670-NEXT:    .cfi_offset s2, -24
; RV64P670-NEXT:    .cfi_offset s3, -32
; RV64P670-NEXT:    .cfi_offset s4, -40
; RV64P670-NEXT:    addi s1, a7, -1
; RV64P670-NEXT:    add s0, a0, a6
; RV64P670-NEXT:    li t0, 0
; RV64P670-NEXT:    li t1, 0
; RV64P670-NEXT:    zext.w s1, s1
; RV64P670-NEXT:    mul t2, a1, s1
; RV64P670-NEXT:    add t4, s0, t2
; RV64P670-NEXT:    mul t2, a3, s1
; RV64P670-NEXT:    add s0, a2, a6
; RV64P670-NEXT:    mul s1, a5, s1
; RV64P670-NEXT:    add t3, s0, t2
; RV64P670-NEXT:    add s0, a4, a6
; RV64P670-NEXT:    csrr t2, vlenb
; RV64P670-NEXT:    add t5, s0, s1
; RV64P670-NEXT:    sltu s1, a0, t3
; RV64P670-NEXT:    sltu s0, a2, t4
; RV64P670-NEXT:    slli t3, t2, 1
; RV64P670-NEXT:    and s0, s0, s1
; RV64P670-NEXT:    or s1, a1, a3
; RV64P670-NEXT:    srli s1, s1, 63
; RV64P670-NEXT:    or t6, s0, s1
; RV64P670-NEXT:    sltu s1, a0, t5
; RV64P670-NEXT:    sltu s0, a4, t4
; RV64P670-NEXT:    add t4, a0, a6
; RV64P670-NEXT:    and s0, s0, s1
; RV64P670-NEXT:    or s1, a1, a5
; RV64P670-NEXT:    srli s1, s1, 63
; RV64P670-NEXT:    or s0, s0, s1
; RV64P670-NEXT:    li s1, 32
; RV64P670-NEXT:    maxu s1, t3, s1
; RV64P670-NEXT:    or s0, t6, s0
; RV64P670-NEXT:    sltu s1, a6, s1
; RV64P670-NEXT:    or s0, s0, s1
; RV64P670-NEXT:    andi t5, s0, 1
; RV64P670-NEXT:    j .LBB0_4
; RV64P670-NEXT:  .LBB0_3: # %for.cond1.for.cond.cleanup3_crit_edge.us
; RV64P670-NEXT:    # in Loop: Header=BB0_4 Depth=1
; RV64P670-NEXT:    add a0, a0, a1
; RV64P670-NEXT:    add a2, a2, a3
; RV64P670-NEXT:    add a4, a4, a5
; RV64P670-NEXT:    addiw t1, t1, 1
; RV64P670-NEXT:    addi t0, t0, 1
; RV64P670-NEXT:    beq t1, a7, .LBB0_11
; RV64P670-NEXT:  .LBB0_4: # %for.cond1.preheader.us
; RV64P670-NEXT:    # =>This Loop Header: Depth=1
; RV64P670-NEXT:    # Child Loop BB0_7 Depth 2
; RV64P670-NEXT:    # Child Loop BB0_10 Depth 2
; RV64P670-NEXT:    beqz t5, .LBB0_6
; RV64P670-NEXT:  # %bb.5: # in Loop: Header=BB0_4 Depth=1
; RV64P670-NEXT:    li t6, 0
; RV64P670-NEXT:    j .LBB0_9
; RV64P670-NEXT:  .LBB0_6: # %vector.ph
; RV64P670-NEXT:    # in Loop: Header=BB0_4 Depth=1
; RV64P670-NEXT:    slli s1, t2, 28
; RV64P670-NEXT:    mv s2, a2
; RV64P670-NEXT:    mv s3, a4
; RV64P670-NEXT:    mv s4, a0
; RV64P670-NEXT:    sub s1, s1, t3
; RV64P670-NEXT:    vsetvli s0, zero, e8, m2, ta, ma
; RV64P670-NEXT:    and t6, s1, a6
; RV64P670-NEXT:    mv s1, t6
; RV64P670-NEXT:  .LBB0_7: # %vector.body
; RV64P670-NEXT:    # Parent Loop BB0_4 Depth=1
; RV64P670-NEXT:    # => This Inner Loop Header: Depth=2
; RV64P670-NEXT:    vl2r.v v8, (s2)
; RV64P670-NEXT:    sub s1, s1, t3
; RV64P670-NEXT:    add s2, s2, t3
; RV64P670-NEXT:    vl2r.v v10, (s3)
; RV64P670-NEXT:    add s3, s3, t3
; RV64P670-NEXT:    vaaddu.vv v8, v8, v10
; RV64P670-NEXT:    vs2r.v v8, (s4)
; RV64P670-NEXT:    add s4, s4, t3
; RV64P670-NEXT:    bnez s1, .LBB0_7
; RV64P670-NEXT:  # %bb.8: # %middle.block
; RV64P670-NEXT:    # in Loop: Header=BB0_4 Depth=1
; RV64P670-NEXT:    beq t6, a6, .LBB0_3
; RV64P670-NEXT:  .LBB0_9: # %for.body4.us.preheader
; RV64P670-NEXT:    # in Loop: Header=BB0_4 Depth=1
; RV64P670-NEXT:    mul s2, a1, t0
; RV64P670-NEXT:    add s1, a0, t6
; RV64P670-NEXT:    add s4, a4, t6
; RV64P670-NEXT:    add t6, t6, a2
; RV64P670-NEXT:    add s2, s2, t4
; RV64P670-NEXT:  .LBB0_10: # %for.body4.us
; RV64P670-NEXT:    # Parent Loop BB0_4 Depth=1
; RV64P670-NEXT:    # => This Inner Loop Header: Depth=2
; RV64P670-NEXT:    lbu s3, 0(t6)
; RV64P670-NEXT:    lbu s0, 0(s4)
; RV64P670-NEXT:    addi s4, s4, 1
; RV64P670-NEXT:    addi t6, t6, 1
; RV64P670-NEXT:    add s0, s0, s3
; RV64P670-NEXT:    addi s0, s0, 1
; RV64P670-NEXT:    srli s0, s0, 1
; RV64P670-NEXT:    sb s0, 0(s1)
; RV64P670-NEXT:    addi s1, s1, 1
; RV64P670-NEXT:    bne s1, s2, .LBB0_10
; RV64P670-NEXT:    j .LBB0_3
; RV64P670-NEXT:  .LBB0_11:
; RV64P670-NEXT:    ld s0, 40(sp) # 8-byte Folded Reload
; RV64P670-NEXT:    ld s1, 32(sp) # 8-byte Folded Reload
; RV64P670-NEXT:    ld s2, 24(sp) # 8-byte Folded Reload
; RV64P670-NEXT:    ld s3, 16(sp) # 8-byte Folded Reload
; RV64P670-NEXT:    ld s4, 8(sp) # 8-byte Folded Reload
; RV64P670-NEXT:    .cfi_restore s0
; RV64P670-NEXT:    .cfi_restore s1
; RV64P670-NEXT:    .cfi_restore s2
; RV64P670-NEXT:    .cfi_restore s3
; RV64P670-NEXT:    .cfi_restore s4
; RV64P670-NEXT:    addi sp, sp, 48
; RV64P670-NEXT:    .cfi_def_cfa_offset 0
; RV64P670-NEXT:  .LBB0_12: # %for.cond.cleanup
; RV64P670-NEXT:    ret
;
; RV64X60-LABEL: test1:
; RV64X60:       # %bb.0: # %entry
; RV64X60-NEXT:    csrwi vxrm, 0
; RV64X60-NEXT:    blez a7, .LBB0_12
; RV64X60-NEXT:  # %bb.1: # %for.cond1.preheader.lr.ph
; RV64X60-NEXT:    blez a6, .LBB0_12
; RV64X60-NEXT:  # %bb.2: # %for.cond1.preheader.us.preheader
; RV64X60-NEXT:    addi sp, sp, -48
; RV64X60-NEXT:    .cfi_def_cfa_offset 48
; RV64X60-NEXT:    sd s0, 40(sp) # 8-byte Folded Spill
; RV64X60-NEXT:    sd s1, 32(sp) # 8-byte Folded Spill
; RV64X60-NEXT:    sd s2, 24(sp) # 8-byte Folded Spill
; RV64X60-NEXT:    sd s3, 16(sp) # 8-byte Folded Spill
; RV64X60-NEXT:    sd s4, 8(sp) # 8-byte Folded Spill
; RV64X60-NEXT:    .cfi_offset s0, -8
; RV64X60-NEXT:    .cfi_offset s1, -16
; RV64X60-NEXT:    .cfi_offset s2, -24
; RV64X60-NEXT:    .cfi_offset s3, -32
; RV64X60-NEXT:    .cfi_offset s4, -40
; RV64X60-NEXT:    li t0, 0
; RV64X60-NEXT:    li t1, 0
; RV64X60-NEXT:    addi s1, a7, -1
; RV64X60-NEXT:    zext.w s1, s1
; RV64X60-NEXT:    mul t3, a1, s1
; RV64X60-NEXT:    mul t4, a3, s1
; RV64X60-NEXT:    mul t5, a5, s1
; RV64X60-NEXT:    add s0, a0, a6
; RV64X60-NEXT:    csrr t2, vlenb
; RV64X60-NEXT:    add s1, a2, a6
; RV64X60-NEXT:    add t3, t3, s0
; RV64X60-NEXT:    add s0, a4, a6
; RV64X60-NEXT:    add t4, t4, s1
; RV64X60-NEXT:    li t6, 32
; RV64X60-NEXT:    add t5, t5, s0
; RV64X60-NEXT:    sltu s0, a0, t4
; RV64X60-NEXT:    sltu s1, a2, t3
; RV64X60-NEXT:    and t4, s0, s1
; RV64X60-NEXT:    or s2, a1, a3
; RV64X60-NEXT:    sltu s0, a0, t5
; RV64X60-NEXT:    sltu s1, a4, t3
; RV64X60-NEXT:    srli t3, s2, 63
; RV64X60-NEXT:    and s0, s0, s1
; RV64X60-NEXT:    or s1, a1, a5
; RV64X60-NEXT:    or t4, t4, t3
; RV64X60-NEXT:    slli t3, t2, 1
; RV64X60-NEXT:    srli s1, s1, 63
; RV64X60-NEXT:    or s0, s0, s1
; RV64X60-NEXT:    maxu s1, t3, t6
; RV64X60-NEXT:    or s0, t4, s0
; RV64X60-NEXT:    sltu s1, a6, s1
; RV64X60-NEXT:    or s0, s0, s1
; RV64X60-NEXT:    add t4, a0, a6
; RV64X60-NEXT:    andi t5, s0, 1
; RV64X60-NEXT:    j .LBB0_4
; RV64X60-NEXT:  .LBB0_3: # %for.cond1.for.cond.cleanup3_crit_edge.us
; RV64X60-NEXT:    # in Loop: Header=BB0_4 Depth=1
; RV64X60-NEXT:    add a0, a0, a1
; RV64X60-NEXT:    add a2, a2, a3
; RV64X60-NEXT:    addiw t1, t1, 1
; RV64X60-NEXT:    add a4, a4, a5
; RV64X60-NEXT:    addi t0, t0, 1
; RV64X60-NEXT:    beq t1, a7, .LBB0_11
; RV64X60-NEXT:  .LBB0_4: # %for.cond1.preheader.us
; RV64X60-NEXT:    # =>This Loop Header: Depth=1
; RV64X60-NEXT:    # Child Loop BB0_7 Depth 2
; RV64X60-NEXT:    # Child Loop BB0_10 Depth 2
; RV64X60-NEXT:    beqz t5, .LBB0_6
; RV64X60-NEXT:  # %bb.5: # in Loop: Header=BB0_4 Depth=1
; RV64X60-NEXT:    li t6, 0
; RV64X60-NEXT:    j .LBB0_9
; RV64X60-NEXT:  .LBB0_6: # %vector.ph
; RV64X60-NEXT:    # in Loop: Header=BB0_4 Depth=1
; RV64X60-NEXT:    slli s1, t2, 28
; RV64X60-NEXT:    sub s1, s1, t3
; RV64X60-NEXT:    and t6, s1, a6
; RV64X60-NEXT:    mv s2, a2
; RV64X60-NEXT:    mv s3, a4
; RV64X60-NEXT:    mv s4, a0
; RV64X60-NEXT:    mv s1, t6
; RV64X60-NEXT:    vsetvli s0, zero, e8, m2, ta, ma
; RV64X60-NEXT:  .LBB0_7: # %vector.body
; RV64X60-NEXT:    # Parent Loop BB0_4 Depth=1
; RV64X60-NEXT:    # => This Inner Loop Header: Depth=2
; RV64X60-NEXT:    vl2r.v v8, (s2)
; RV64X60-NEXT:    vl2r.v v10, (s3)
; RV64X60-NEXT:    vaaddu.vv v8, v8, v10
; RV64X60-NEXT:    sub s1, s1, t3
; RV64X60-NEXT:    vs2r.v v8, (s4)
; RV64X60-NEXT:    add s4, s4, t3
; RV64X60-NEXT:    add s3, s3, t3
; RV64X60-NEXT:    add s2, s2, t3
; RV64X60-NEXT:    bnez s1, .LBB0_7
; RV64X60-NEXT:  # %bb.8: # %middle.block
; RV64X60-NEXT:    # in Loop: Header=BB0_4 Depth=1
; RV64X60-NEXT:    beq t6, a6, .LBB0_3
; RV64X60-NEXT:  .LBB0_9: # %for.body4.us.preheader
; RV64X60-NEXT:    # in Loop: Header=BB0_4 Depth=1
; RV64X60-NEXT:    mul s2, a1, t0
; RV64X60-NEXT:    add s0, a0, t6
; RV64X60-NEXT:    add s2, s2, t4
; RV64X60-NEXT:    add s4, a4, t6
; RV64X60-NEXT:    add t6, t6, a2
; RV64X60-NEXT:  .LBB0_10: # %for.body4.us
; RV64X60-NEXT:    # Parent Loop BB0_4 Depth=1
; RV64X60-NEXT:    # => This Inner Loop Header: Depth=2
; RV64X60-NEXT:    lbu s3, 0(t6)
; RV64X60-NEXT:    lbu s1, 0(s4)
; RV64X60-NEXT:    add s1, s1, s3
; RV64X60-NEXT:    addi s1, s1, 1
; RV64X60-NEXT:    srli s1, s1, 1
; RV64X60-NEXT:    sb s1, 0(s0)
; RV64X60-NEXT:    addi s0, s0, 1
; RV64X60-NEXT:    addi s4, s4, 1
; RV64X60-NEXT:    addi t6, t6, 1
; RV64X60-NEXT:    bne s0, s2, .LBB0_10
; RV64X60-NEXT:    j .LBB0_3
; RV64X60-NEXT:  .LBB0_11:
; RV64X60-NEXT:    ld s0, 40(sp) # 8-byte Folded Reload
; RV64X60-NEXT:    ld s1, 32(sp) # 8-byte Folded Reload
; RV64X60-NEXT:    ld s2, 24(sp) # 8-byte Folded Reload
; RV64X60-NEXT:    ld s3, 16(sp) # 8-byte Folded Reload
; RV64X60-NEXT:    ld s4, 8(sp) # 8-byte Folded Reload
; RV64X60-NEXT:    .cfi_restore s0
; RV64X60-NEXT:    .cfi_restore s1
; RV64X60-NEXT:    .cfi_restore s2
; RV64X60-NEXT:    .cfi_restore s3
; RV64X60-NEXT:    .cfi_restore s4
; RV64X60-NEXT:    addi sp, sp, 48
; RV64X60-NEXT:    .cfi_def_cfa_offset 0
; RV64X60-NEXT:  .LBB0_12: # %for.cond.cleanup
; RV64X60-NEXT:    ret
;
; RV64-LABEL: test1:
; RV64:       # %bb.0: # %entry
; RV64-NEXT:    blez a7, .LBB0_14
; RV64-NEXT:  # %bb.1: # %for.cond1.preheader.lr.ph
; RV64-NEXT:    blez a6, .LBB0_14
; RV64-NEXT:  # %bb.2: # %for.cond1.preheader.us.preheader
; RV64-NEXT:    addi sp, sp, -48
; RV64-NEXT:    .cfi_def_cfa_offset 48
; RV64-NEXT:    sd s0, 40(sp) # 8-byte Folded Spill
; RV64-NEXT:    sd s1, 32(sp) # 8-byte Folded Spill
; RV64-NEXT:    sd s2, 24(sp) # 8-byte Folded Spill
; RV64-NEXT:    sd s3, 16(sp) # 8-byte Folded Spill
; RV64-NEXT:    sd s4, 8(sp) # 8-byte Folded Spill
; RV64-NEXT:    .cfi_offset s0, -8
; RV64-NEXT:    .cfi_offset s1, -16
; RV64-NEXT:    .cfi_offset s2, -24
; RV64-NEXT:    .cfi_offset s3, -32
; RV64-NEXT:    .cfi_offset s4, -40
; RV64-NEXT:    addi t1, a7, -1
; RV64-NEXT:    add t5, a0, a6
; RV64-NEXT:    add s0, a2, a6
; RV64-NEXT:    add t6, a4, a6
; RV64-NEXT:    csrr t0, vlenb
; RV64-NEXT:    li t2, 32
; RV64-NEXT:    slli t1, t1, 32
; RV64-NEXT:    srli t3, t1, 32
; RV64-NEXT:    mul t1, a1, t3
; RV64-NEXT:    add t5, t5, t1
; RV64-NEXT:    mul t1, a3, t3
; RV64-NEXT:    add s0, s0, t1
; RV64-NEXT:    slli t1, t0, 1
; RV64-NEXT:    mul t3, a5, t3
; RV64-NEXT:    add t6, t6, t3
; RV64-NEXT:    mv t4, t1
; RV64-NEXT:    bltu t2, t1, .LBB0_4
; RV64-NEXT:  # %bb.3: # %for.cond1.preheader.us.preheader
; RV64-NEXT:    li t4, 32
; RV64-NEXT:  .LBB0_4: # %for.cond1.preheader.us.preheader
; RV64-NEXT:    li t2, 0
; RV64-NEXT:    li t3, 0
; RV64-NEXT:    sltu s0, a0, s0
; RV64-NEXT:    sltu s1, a2, t5
; RV64-NEXT:    and s0, s0, s1
; RV64-NEXT:    sltu t6, a0, t6
; RV64-NEXT:    sltu t5, a4, t5
; RV64-NEXT:    and t5, t6, t5
; RV64-NEXT:    or t6, a1, a3
; RV64-NEXT:    srli t6, t6, 63
; RV64-NEXT:    or t6, s0, t6
; RV64-NEXT:    or s0, a1, a5
; RV64-NEXT:    srli s0, s0, 63
; RV64-NEXT:    or t5, t5, s0
; RV64-NEXT:    sltu s0, a6, t4
; RV64-NEXT:    or t5, t6, t5
; RV64-NEXT:    add t4, a0, a6
; RV64-NEXT:    or t5, s0, t5
; RV64-NEXT:    andi t5, t5, 1
; RV64-NEXT:    csrwi vxrm, 0
; RV64-NEXT:    j .LBB0_6
; RV64-NEXT:  .LBB0_5: # %for.cond1.for.cond.cleanup3_crit_edge.us
; RV64-NEXT:    # in Loop: Header=BB0_6 Depth=1
; RV64-NEXT:    add a0, a0, a1
; RV64-NEXT:    add a2, a2, a3
; RV64-NEXT:    add a4, a4, a5
; RV64-NEXT:    addiw t3, t3, 1
; RV64-NEXT:    addi t2, t2, 1
; RV64-NEXT:    beq t3, a7, .LBB0_13
; RV64-NEXT:  .LBB0_6: # %for.cond1.preheader.us
; RV64-NEXT:    # =>This Loop Header: Depth=1
; RV64-NEXT:    # Child Loop BB0_9 Depth 2
; RV64-NEXT:    # Child Loop BB0_12 Depth 2
; RV64-NEXT:    beqz t5, .LBB0_8
; RV64-NEXT:  # %bb.7: # in Loop: Header=BB0_6 Depth=1
; RV64-NEXT:    li t6, 0
; RV64-NEXT:    j .LBB0_11
; RV64-NEXT:  .LBB0_8: # %vector.ph
; RV64-NEXT:    # in Loop: Header=BB0_6 Depth=1
; RV64-NEXT:    slli t6, t0, 28
; RV64-NEXT:    sub t6, t6, t1
; RV64-NEXT:    and t6, t6, a6
; RV64-NEXT:    mv s0, a2
; RV64-NEXT:    mv s1, a4
; RV64-NEXT:    mv s2, a0
; RV64-NEXT:    mv s3, t6
; RV64-NEXT:    vsetvli s4, zero, e8, m2, ta, ma
; RV64-NEXT:  .LBB0_9: # %vector.body
; RV64-NEXT:    # Parent Loop BB0_6 Depth=1
; RV64-NEXT:    # => This Inner Loop Header: Depth=2
; RV64-NEXT:    vl2r.v v8, (s0)
; RV64-NEXT:    vl2r.v v10, (s1)
; RV64-NEXT:    sub s3, s3, t1
; RV64-NEXT:    add s1, s1, t1
; RV64-NEXT:    vaaddu.vv v8, v8, v10
; RV64-NEXT:    vs2r.v v8, (s2)
; RV64-NEXT:    add s2, s2, t1
; RV64-NEXT:    add s0, s0, t1
; RV64-NEXT:    bnez s3, .LBB0_9
; RV64-NEXT:  # %bb.10: # %middle.block
; RV64-NEXT:    # in Loop: Header=BB0_6 Depth=1
; RV64-NEXT:    beq t6, a6, .LBB0_5
; RV64-NEXT:  .LBB0_11: # %for.body4.us.preheader
; RV64-NEXT:    # in Loop: Header=BB0_6 Depth=1
; RV64-NEXT:    mul s2, a1, t2
; RV64-NEXT:    add s0, a0, t6
; RV64-NEXT:    add s1, a4, t6
; RV64-NEXT:    add s2, t4, s2
; RV64-NEXT:    add t6, a2, t6
; RV64-NEXT:  .LBB0_12: # %for.body4.us
; RV64-NEXT:    # Parent Loop BB0_6 Depth=1
; RV64-NEXT:    # => This Inner Loop Header: Depth=2
; RV64-NEXT:    lbu s3, 0(t6)
; RV64-NEXT:    lbu s4, 0(s1)
; RV64-NEXT:    add s3, s3, s4
; RV64-NEXT:    addi s3, s3, 1
; RV64-NEXT:    srli s3, s3, 1
; RV64-NEXT:    sb s3, 0(s0)
; RV64-NEXT:    addi s0, s0, 1
; RV64-NEXT:    addi s1, s1, 1
; RV64-NEXT:    addi t6, t6, 1
; RV64-NEXT:    bne s0, s2, .LBB0_12
; RV64-NEXT:    j .LBB0_5
; RV64-NEXT:  .LBB0_13:
; RV64-NEXT:    ld s0, 40(sp) # 8-byte Folded Reload
; RV64-NEXT:    ld s1, 32(sp) # 8-byte Folded Reload
; RV64-NEXT:    ld s2, 24(sp) # 8-byte Folded Reload
; RV64-NEXT:    ld s3, 16(sp) # 8-byte Folded Reload
; RV64-NEXT:    ld s4, 8(sp) # 8-byte Folded Reload
; RV64-NEXT:    .cfi_restore s0
; RV64-NEXT:    .cfi_restore s1
; RV64-NEXT:    .cfi_restore s2
; RV64-NEXT:    .cfi_restore s3
; RV64-NEXT:    .cfi_restore s4
; RV64-NEXT:    addi sp, sp, 48
; RV64-NEXT:    .cfi_def_cfa_offset 0
; RV64-NEXT:  .LBB0_14: # %for.cond.cleanup
; RV64-NEXT:    ret
entry:
  %cmp29 = icmp sgt i32 %i_height, 0
  br i1 %cmp29, label %for.cond1.preheader.lr.ph, label %for.cond.cleanup

for.cond1.preheader.lr.ph:                        ; preds = %entry
  %cmp227 = icmp sgt i32 %i_width, 0
  %idx.ext = sext i32 %i_dst_stride to i64
  %idx.ext12 = sext i32 %i_src1_stride to i64
  %idx.ext14 = sext i32 %i_src2_stride to i64
  br i1 %cmp227, label %for.cond1.preheader.us.preheader, label %for.cond.cleanup

for.cond1.preheader.us.preheader:                 ; preds = %for.cond1.preheader.lr.ph
  %wide.trip.count = zext nneg i32 %i_width to i64
  %0 = add nsw i32 %i_height, -1
  %1 = zext i32 %0 to i64
  %2 = mul nsw i64 %idx.ext, %1
  %3 = getelementptr i8, ptr %dst, i64 %2
  %scevgep = getelementptr i8, ptr %3, i64 %wide.trip.count
  %4 = mul nsw i64 %idx.ext12, %1
  %5 = getelementptr i8, ptr %src1, i64 %4
  %scevgep36 = getelementptr i8, ptr %5, i64 %wide.trip.count
  %6 = mul nsw i64 %idx.ext14, %1
  %7 = getelementptr i8, ptr %src2, i64 %6
  %scevgep37 = getelementptr i8, ptr %7, i64 %wide.trip.count
  %8 = tail call i64 @llvm.vscale.i64()
  %9 = shl nuw nsw i64 %8, 4
  %10 = tail call i64 @llvm.umax.i64(i64 %9, i64 32)
  %min.iters.check = icmp ugt i64 %10, %wide.trip.count
  %bound0 = icmp ult ptr %dst, %scevgep36
  %bound1 = icmp ult ptr %src1, %scevgep
  %found.conflict = and i1 %bound0, %bound1
  %11 = or i32 %i_dst_stride, %i_src1_stride
  %12 = icmp slt i32 %11, 0
  %13 = or i1 %found.conflict, %12
  %bound039 = icmp ult ptr %dst, %scevgep37
  %bound140 = icmp ult ptr %src2, %scevgep
  %found.conflict41 = and i1 %bound039, %bound140
  %14 = or i32 %i_dst_stride, %i_src2_stride
  %15 = icmp slt i32 %14, 0
  %16 = or i1 %found.conflict41, %15
  %conflict.rdx = or i1 %13, %16
  br label %for.cond1.preheader.us

for.cond1.preheader.us:                           ; preds = %for.cond1.preheader.us.preheader, %for.cond1.for.cond.cleanup3_crit_edge.us
  %y.033.us = phi i32 [ %inc17.us, %for.cond1.for.cond.cleanup3_crit_edge.us ], [ 0, %for.cond1.preheader.us.preheader ]
  %dst.addr.032.us = phi ptr [ %add.ptr.us, %for.cond1.for.cond.cleanup3_crit_edge.us ], [ %dst, %for.cond1.preheader.us.preheader ]
  %src1.addr.031.us = phi ptr [ %add.ptr13.us, %for.cond1.for.cond.cleanup3_crit_edge.us ], [ %src1, %for.cond1.preheader.us.preheader ]
  %src2.addr.030.us = phi ptr [ %add.ptr15.us, %for.cond1.for.cond.cleanup3_crit_edge.us ], [ %src2, %for.cond1.preheader.us.preheader ]
  %brmerge = select i1 %min.iters.check, i1 true, i1 %conflict.rdx
  br i1 %brmerge, label %for.body4.us.preheader, label %vector.ph

vector.ph:                                        ; preds = %for.cond1.preheader.us
  %17 = tail call i64 @llvm.vscale.i64()
  %.neg = mul nuw nsw i64 %17, 2147483632
  %n.vec = and i64 %.neg, %wide.trip.count
  %18 = tail call i64 @llvm.vscale.i64()
  %19 = shl nuw nsw i64 %18, 4
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %vector.ph
  %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  %20 = getelementptr inbounds i8, ptr %src1.addr.031.us, i64 %index
  %wide.load = load <vscale x 16 x i8>, ptr %20, align 1
  %21 = zext <vscale x 16 x i8> %wide.load to <vscale x 16 x i16>
  %22 = getelementptr inbounds i8, ptr %src2.addr.030.us, i64 %index
  %wide.load44 = load <vscale x 16 x i8>, ptr %22, align 1
  %23 = zext <vscale x 16 x i8> %wide.load44 to <vscale x 16 x i16>
  %24 = add nuw nsw <vscale x 16 x i16> %21, shufflevector (<vscale x 16 x i16> insertelement (<vscale x 16 x i16> poison, i16 1, i64 0), <vscale x 16 x i16> poison, <vscale x 16 x i32> zeroinitializer)
  %25 = add nuw nsw <vscale x 16 x i16> %24, %23
  %26 = lshr <vscale x 16 x i16> %25, shufflevector (<vscale x 16 x i16> insertelement (<vscale x 16 x i16> poison, i16 1, i64 0), <vscale x 16 x i16> poison, <vscale x 16 x i32> zeroinitializer)
  %27 = trunc <vscale x 16 x i16> %26 to <vscale x 16 x i8>
  %28 = getelementptr inbounds i8, ptr %dst.addr.032.us, i64 %index
  store <vscale x 16 x i8> %27, ptr %28, align 1
  %index.next = add nuw i64 %index, %19
  %29 = icmp eq i64 %index.next, %n.vec
  br i1 %29, label %middle.block, label %vector.body

middle.block:                                     ; preds = %vector.body
  %cmp.n = icmp eq i64 %n.vec, %wide.trip.count
  br i1 %cmp.n, label %for.cond1.for.cond.cleanup3_crit_edge.us, label %for.body4.us.preheader

for.body4.us.preheader:                           ; preds = %for.cond1.preheader.us, %middle.block
  %indvars.iv.ph = phi i64 [ 0, %for.cond1.preheader.us ], [ %n.vec, %middle.block ]
  br label %for.body4.us

for.body4.us:                                     ; preds = %for.body4.us.preheader, %for.body4.us
  %indvars.iv = phi i64 [ %indvars.iv.next, %for.body4.us ], [ %indvars.iv.ph, %for.body4.us.preheader ]
  %arrayidx.us = getelementptr inbounds i8, ptr %src1.addr.031.us, i64 %indvars.iv
  %30 = load i8, ptr %arrayidx.us, align 1
  %conv.us = zext i8 %30 to i16
  %arrayidx6.us = getelementptr inbounds i8, ptr %src2.addr.030.us, i64 %indvars.iv
  %31 = load i8, ptr %arrayidx6.us, align 1
  %conv7.us = zext i8 %31 to i16
  %add.us = add nuw nsw i16 %conv.us, 1
  %add8.us = add nuw nsw i16 %add.us, %conv7.us
  %shr.us = lshr i16 %add8.us, 1
  %conv9.us = trunc nuw i16 %shr.us to i8
  %arrayidx11.us = getelementptr inbounds i8, ptr %dst.addr.032.us, i64 %indvars.iv
  store i8 %conv9.us, ptr %arrayidx11.us, align 1
  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
  %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count
  br i1 %exitcond.not, label %for.cond1.for.cond.cleanup3_crit_edge.us, label %for.body4.us

for.cond1.for.cond.cleanup3_crit_edge.us:         ; preds = %for.body4.us, %middle.block
  %add.ptr.us = getelementptr inbounds i8, ptr %dst.addr.032.us, i64 %idx.ext
  %add.ptr13.us = getelementptr inbounds i8, ptr %src1.addr.031.us, i64 %idx.ext12
  %add.ptr15.us = getelementptr inbounds i8, ptr %src2.addr.030.us, i64 %idx.ext14
  %inc17.us = add nuw nsw i32 %y.033.us, 1
  %exitcond35.not = icmp eq i32 %inc17.us, %i_height
  br i1 %exitcond35.not, label %for.cond.cleanup, label %for.cond1.preheader.us

for.cond.cleanup:                                 ; preds = %for.cond1.for.cond.cleanup3_crit_edge.us, %for.cond1.preheader.lr.ph, %entry
  ret void
}

declare i64 @llvm.vscale.i64()
declare i64 @llvm.umax.i64(i64, i64)
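
; A scalar C sketch of the kernel @test1 implements, for reference only (this
; comment is not checked by FileCheck, and the C names below are illustrative):
;
;   void test1(uint8_t *dst, int i_dst_stride,
;              const uint8_t *src1, int i_src1_stride,
;              const uint8_t *src2, int i_src2_stride,
;              int i_width, int i_height) {
;     for (int y = 0; y < i_height; y++) {
;       for (int x = 0; x < i_width; x++)
;         dst[x] = (src1[x] + src2[x] + 1) >> 1;  // rounding byte average
;       dst += i_dst_stride;
;       src1 += i_src1_stride;
;       src2 += i_src2_stride;
;     }
;   }
;
; The add-1-then-lshr-1 idiom in %vector.body is the unsigned rounding-average
; pattern that instruction selection folds into vaaddu.vv; "csrwi vxrm, 0" in
; the checks selects rnu (round-to-nearest-up), which supplies the +1 term.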