; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
; RUN: sed 's/iXLen/i32/g' %s | llc -mtriple=riscv32 -mattr=+v,+m \
; RUN:   -verify-machineinstrs | FileCheck %s --check-prefixes=RV32
; RUN: sed 's/iXLen/i64/g' %s | llc -mtriple=riscv64 -mattr=+v,+m -mcpu=sifive-p670 \
; RUN:   -verify-machineinstrs | FileCheck %s --check-prefixes=RV64P670
; RUN: sed 's/iXLen/i64/g' %s | llc -mtriple=riscv64 -mattr=+v,+m -mcpu=spacemit-x60 \
; RUN:   -verify-machineinstrs | FileCheck %s --check-prefixes=RV64X60
; RUN: sed 's/iXLen/i64/g' %s | llc -mtriple=riscv64 -mattr=+v,+m \
; RUN:   -verify-machineinstrs | FileCheck %s --check-prefixes=RV64

; test1: per-row rounding average of two strided byte planes,
; dst[x] = (src1[x] + src2[x] + 1) >> 1, vectorized to vaaddu.vv with
; vxrm set to round-to-nearest-up.
define void @test1(ptr nocapture noundef writeonly %dst, i32 noundef signext %i_dst_stride, ptr nocapture noundef readonly %src1, i32 noundef signext %i_src1_stride, ptr nocapture noundef readonly %src2, i32 noundef signext %i_src2_stride, i32 noundef signext %i_width, i32 noundef signext %i_height) {
; RV32-LABEL: test1:
; RV32:       # %bb.0: # %entry
; RV32-NEXT:    blez a7, .LBB0_17
; RV32-NEXT:  # %bb.1: # %for.cond1.preheader.lr.ph
; RV32-NEXT:    blez a6, .LBB0_17
; RV32-NEXT:  # %bb.2: # %for.cond1.preheader.us.preheader
; RV32-NEXT:    addi t3, a7, -1
; RV32-NEXT:    csrr t2, vlenb
; RV32-NEXT:    slli t1, t2, 1
; RV32-NEXT:    li t4, 32
; RV32-NEXT:    mv t0, t1
; RV32-NEXT:  # %bb.3: # %for.cond1.preheader.us.preheader
; RV32-NEXT:    li t0, 32
; RV32-NEXT:  # %bb.4: # %for.cond1.preheader.us.preheader
; RV32-NEXT:    addi sp, sp, -16
; RV32-NEXT:    .cfi_def_cfa_offset 16
; RV32-NEXT:    sw s0, 12(sp) # 4-byte Folded Spill
; RV32-NEXT:    sw s1, 8(sp) # 4-byte Folded Spill
; RV32-NEXT:    sw s2, 4(sp) # 4-byte Folded Spill
; RV32-NEXT:    sw s3, 0(sp) # 4-byte Folded Spill
; RV32-NEXT:    .cfi_offset s0, -4
; RV32-NEXT:    .cfi_offset s1, -8
; RV32-NEXT:    .cfi_offset s2, -12
; RV32-NEXT:    .cfi_offset s3, -16
; RV32-NEXT:    .cfi_remember_state
; RV32-NEXT:    mul t5, a1, t3
; RV32-NEXT:    add s0, a0, a6
; RV32-NEXT:    mul t6, a3, t3
; RV32-NEXT:    add s2, a2, a6
; RV32-NEXT:    mul s1, a5, t3
; RV32-NEXT:    add s3, a4, a6
; RV32-NEXT:    bltu t4, t1, .LBB0_6
; RV32-NEXT:  # %bb.5: # %for.cond1.preheader.us.preheader
; RV32-NEXT:    li t1, 32
; RV32-NEXT:  .LBB0_6: # %for.cond1.preheader.us.preheader
; RV32-NEXT:    add t3, s0, t5
; RV32-NEXT:    add t6, s2, t6
; RV32-NEXT:    add t4, s3, s1
; RV32-NEXT:    j .LBB0_8
; RV32-NEXT:  # %bb.7: # %for.cond1.preheader.us.preheader
; RV32-NEXT:    mv t1, t0
; RV32-NEXT:  .LBB0_8: # %for.cond1.preheader.us.preheader
; RV32-NEXT:    .cfi_restore_state
; RV32-NEXT:    li t0, 0
; RV32-NEXT:    sltu t5, a0, t6
; RV32-NEXT:    sltu t6, a2, t3
; RV32-NEXT:    and t5, t5, t6
; RV32-NEXT:    sltu t4, a0, t4
; RV32-NEXT:    sltu t3, a4, t3
; RV32-NEXT:    and t3, t4, t3
; RV32-NEXT:    or t4, a1, a3
; RV32-NEXT:    srli t4, t4, 31
; RV32-NEXT:    or t4, t5, t4
; RV32-NEXT:    or t5, a1, a5
; RV32-NEXT:    sltu t1, a6, t1
; RV32-NEXT:    srli t5, t5, 31
; RV32-NEXT:    or t3, t3, t5
; RV32-NEXT:    or t3, t4, t3
; RV32-NEXT:    or t1, t1, t3
; RV32-NEXT:    andi t1, t1, 1
; RV32-NEXT:    slli t2, t2, 1
; RV32-NEXT:    csrwi vxrm, 0
; RV32-NEXT:    j .LBB0_10
; RV32-NEXT:  .LBB0_9: # %for.cond1.for.cond.cleanup3_crit_edge.us
; RV32-NEXT:    # in Loop: Header=BB0_10 Depth=1
; RV32-NEXT:    add a0, a0, a1
; RV32-NEXT:    add a2, a2, a3
; RV32-NEXT:    addi t0, t0, 1
; RV32-NEXT:    add a4, a4, a5
; RV32-NEXT:    beq t0, a7, .LBB0_16
; RV32-NEXT:  .LBB0_10: # %for.cond1.preheader.us
; RV32-NEXT:    # =>This Loop Header: Depth=1
; RV32-NEXT:    # Child Loop BB0_13 Depth 2
; RV32-NEXT:    # Child Loop BB0_15 Depth 2
; RV32-NEXT:    beqz t1, .LBB0_12
; RV32-NEXT:  # %bb.11: # in Loop: Header=BB0_10 Depth=1
; RV32-NEXT:    li t4, 0
; RV32-NEXT:    li t3, 0
; RV32-NEXT:    j .LBB0_15
; RV32-NEXT:  .LBB0_12: # %vector.ph
; RV32-NEXT:    # in Loop: Header=BB0_10 Depth=1
; RV32-NEXT:    li t3, 0
; RV32-NEXT:    neg t4, t2
; RV32-NEXT:    and t4, t4, a6
; RV32-NEXT:    li t6, 0
; RV32-NEXT:    li t5, 0
; RV32-NEXT:    vsetvli s0, zero, e8, m2, ta, ma
; RV32-NEXT:  .LBB0_13: # %vector.body
; RV32-NEXT:    # Parent Loop BB0_10 Depth=1
; RV32-NEXT:    # => This Inner Loop Header: Depth=2
; RV32-NEXT:    add s0, a2, t6
; RV32-NEXT:    add s1, a4, t6
; RV32-NEXT:    vl2r.v v8, (s0)
; RV32-NEXT:    add s0, a0, t6
; RV32-NEXT:    vl2r.v v10, (s1)
; RV32-NEXT:    add s1, t6, t2
; RV32-NEXT:    sltu t6, s1, t6
; RV32-NEXT:    add t5, t5, t6
; RV32-NEXT:    xor t6, s1, t4
; RV32-NEXT:    vaaddu.vv v8, v8, v10
; RV32-NEXT:    or s2, t6, t5
; RV32-NEXT:    vs2r.v v8, (s0)
; RV32-NEXT:    mv t6, s1
; RV32-NEXT:    bnez s2, .LBB0_13
; RV32-NEXT:  # %bb.14: # %middle.block
; RV32-NEXT:    # in Loop: Header=BB0_10 Depth=1
; RV32-NEXT:    beq t4, a6, .LBB0_9
; RV32-NEXT:  .LBB0_15: # %for.body4.us
; RV32-NEXT:    # Parent Loop BB0_10 Depth=1
; RV32-NEXT:    # => This Inner Loop Header: Depth=2
; RV32-NEXT:    add t5, a2, t4
; RV32-NEXT:    add t6, a4, t4
; RV32-NEXT:    add s0, a0, t4
; RV32-NEXT:    lbu t5, 0(t5)
; RV32-NEXT:    lbu t6, 0(t6)
; RV32-NEXT:    addi t4, t4, 1
; RV32-NEXT:    seqz s1, t4
; RV32-NEXT:    add t3, t3, s1
; RV32-NEXT:    add t5, t5, t6
; RV32-NEXT:    xor t6, t4, a6
; RV32-NEXT:    addi t5, t5, 1
; RV32-NEXT:    srli t5, t5, 1
; RV32-NEXT:    or t6, t6, t3
; RV32-NEXT:    sb t5, 0(s0)
; RV32-NEXT:    bnez t6, .LBB0_15
; RV32-NEXT:    j .LBB0_9
; RV32-NEXT:  .LBB0_16:
; RV32-NEXT:    lw s0, 12(sp) # 4-byte Folded Reload
; RV32-NEXT:    lw s1, 8(sp) # 4-byte Folded Reload
; RV32-NEXT:    lw s2, 4(sp) # 4-byte Folded Reload
; RV32-NEXT:    lw s3, 0(sp) # 4-byte Folded Reload
; RV32-NEXT:    .cfi_restore s0
; RV32-NEXT:    .cfi_restore s1
; RV32-NEXT:    .cfi_restore s2
; RV32-NEXT:    .cfi_restore s3
; RV32-NEXT:    addi sp, sp, 16
; RV32-NEXT:    .cfi_def_cfa_offset 0
; RV32-NEXT:  .LBB0_17: # %for.cond.cleanup
; RV32-NEXT:    ret
;
; RV64P670-LABEL: test1:
; RV64P670:       # %bb.0: # %entry
; RV64P670-NEXT:    csrwi vxrm, 0
; RV64P670-NEXT:    blez a7, .LBB0_12
; RV64P670-NEXT:  # %bb.1: # %for.cond1.preheader.lr.ph
; RV64P670-NEXT:    blez a6, .LBB0_12
; RV64P670-NEXT:  # %bb.2: # %for.cond1.preheader.us.preheader
; RV64P670-NEXT:    addi sp, sp, -48
; RV64P670-NEXT:    .cfi_def_cfa_offset 48
; RV64P670-NEXT:    sd s0, 40(sp) # 8-byte Folded Spill
; RV64P670-NEXT:    sd s1, 32(sp) # 8-byte Folded Spill
; RV64P670-NEXT:    sd s2, 24(sp) # 8-byte Folded Spill
; RV64P670-NEXT:    sd s3, 16(sp) # 8-byte Folded Spill
; RV64P670-NEXT:    sd s4, 8(sp) # 8-byte Folded Spill
; RV64P670-NEXT:    .cfi_offset s0, -8
; RV64P670-NEXT:    .cfi_offset s1, -16
; RV64P670-NEXT:    .cfi_offset s2, -24
; RV64P670-NEXT:    .cfi_offset s3, -32
; RV64P670-NEXT:    .cfi_offset s4, -40
; RV64P670-NEXT:    addi s1, a7, -1
; RV64P670-NEXT:    add s0, a0, a6
; RV64P670-NEXT:    li t0, 0
; RV64P670-NEXT:    li t1, 0
; RV64P670-NEXT:    zext.w s1, s1
; RV64P670-NEXT:    mul t2, a1, s1
; RV64P670-NEXT:    add t4, s0, t2
; RV64P670-NEXT:    mul t2, a3, s1
; RV64P670-NEXT:    add s0, a2, a6
; RV64P670-NEXT:    mul s1, a5, s1
; RV64P670-NEXT:    add t3, s0, t2
; RV64P670-NEXT:    add s0, a4, a6
; RV64P670-NEXT:    csrr t2, vlenb
; RV64P670-NEXT:    add t5, s0, s1
; RV64P670-NEXT:    sltu s1, a0, t3
; RV64P670-NEXT:    sltu s0, a2, t4
; RV64P670-NEXT:    slli t3, t2, 1
; RV64P670-NEXT:    and s0, s0, s1
; RV64P670-NEXT:    or s1, a1, a3
; RV64P670-NEXT:    srli s1, s1, 63
; RV64P670-NEXT:    or t6, s0, s1
; RV64P670-NEXT:    sltu s1, a0, t5
; RV64P670-NEXT:    sltu s0, a4, t4
; RV64P670-NEXT:    add t4, a0, a6
; RV64P670-NEXT:    and s0, s0, s1
; RV64P670-NEXT:    or s1, a1, a5
; RV64P670-NEXT:    srli s1, s1, 63
; RV64P670-NEXT:    or s0, s0, s1
; RV64P670-NEXT:    li s1, 32
; RV64P670-NEXT:    maxu s1, t3, s1
; RV64P670-NEXT:    or s0, t6, s0
; RV64P670-NEXT:    sltu s1, a6, s1
; RV64P670-NEXT:    or s0, s0, s1
; RV64P670-NEXT:    andi t5, s0, 1
; RV64P670-NEXT:    j .LBB0_4
; RV64P670-NEXT:  .LBB0_3: # %for.cond1.for.cond.cleanup3_crit_edge.us
; RV64P670-NEXT:    # in Loop: Header=BB0_4 Depth=1
; RV64P670-NEXT:    add a0, a0, a1
; RV64P670-NEXT:    add a2, a2, a3
; RV64P670-NEXT:    add a4, a4, a5
; RV64P670-NEXT:    addiw t1, t1, 1
; RV64P670-NEXT:    addi t0, t0, 1
; RV64P670-NEXT:    beq t1, a7, .LBB0_11
; RV64P670-NEXT:  .LBB0_4: # %for.cond1.preheader.us
; RV64P670-NEXT:    # =>This Loop Header: Depth=1
; RV64P670-NEXT:    # Child Loop BB0_7 Depth 2
; RV64P670-NEXT:    # Child Loop BB0_10 Depth 2
; RV64P670-NEXT:    beqz t5, .LBB0_6
; RV64P670-NEXT:  # %bb.5: # in Loop: Header=BB0_4 Depth=1
; RV64P670-NEXT:    li t6, 0
; RV64P670-NEXT:    j .LBB0_9
; RV64P670-NEXT:  .LBB0_6: # %vector.ph
; RV64P670-NEXT:    # in Loop: Header=BB0_4 Depth=1
; RV64P670-NEXT:    slli s1, t2, 28
; RV64P670-NEXT:    mv s2, a2
; RV64P670-NEXT:    mv s3, a4
; RV64P670-NEXT:    mv s4, a0
; RV64P670-NEXT:    sub s1, s1, t3
; RV64P670-NEXT:    vsetvli s0, zero, e8, m2, ta, ma
; RV64P670-NEXT:    and t6, s1, a6
; RV64P670-NEXT:    mv s1, t6
; RV64P670-NEXT:  .LBB0_7: # %vector.body
; RV64P670-NEXT:    # Parent Loop BB0_4 Depth=1
; RV64P670-NEXT:    # => This Inner Loop Header: Depth=2
; RV64P670-NEXT:    vl2r.v v8, (s2)
; RV64P670-NEXT:    sub s1, s1, t3
; RV64P670-NEXT:    add s2, s2, t3
; RV64P670-NEXT:    vl2r.v v10, (s3)
; RV64P670-NEXT:    add s3, s3, t3
; RV64P670-NEXT:    vaaddu.vv v8, v8, v10
; RV64P670-NEXT:    vs2r.v v8, (s4)
; RV64P670-NEXT:    add s4, s4, t3
; RV64P670-NEXT:    bnez s1, .LBB0_7
; RV64P670-NEXT:  # %bb.8: # %middle.block
; RV64P670-NEXT:    # in Loop: Header=BB0_4 Depth=1
; RV64P670-NEXT:    beq t6, a6, .LBB0_3
; RV64P670-NEXT:  .LBB0_9: # %for.body4.us.preheader
; RV64P670-NEXT:    # in Loop: Header=BB0_4 Depth=1
; RV64P670-NEXT:    mul s2, a1, t0
; RV64P670-NEXT:    add s1, a0, t6
; RV64P670-NEXT:    add s4, a4, t6
; RV64P670-NEXT:    add t6, t6, a2
; RV64P670-NEXT:    add s2, s2, t4
; RV64P670-NEXT:  .LBB0_10: # %for.body4.us
; RV64P670-NEXT:    # Parent Loop BB0_4 Depth=1
; RV64P670-NEXT:    # => This Inner Loop Header: Depth=2
; RV64P670-NEXT:    lbu s3, 0(t6)
; RV64P670-NEXT:    lbu s0, 0(s4)
; RV64P670-NEXT:    addi s4, s4, 1
; RV64P670-NEXT:    addi t6, t6, 1
; RV64P670-NEXT:    add s0, s0, s3
; RV64P670-NEXT:    addi s0, s0, 1
; RV64P670-NEXT:    srli s0, s0, 1
; RV64P670-NEXT:    sb s0, 0(s1)
; RV64P670-NEXT:    addi s1, s1, 1
; RV64P670-NEXT:    bne s1, s2, .LBB0_10
; RV64P670-NEXT:    j .LBB0_3
; RV64P670-NEXT:  .LBB0_11:
; RV64P670-NEXT:    ld s0, 40(sp) # 8-byte Folded Reload
; RV64P670-NEXT:    ld s1, 32(sp) # 8-byte Folded Reload
; RV64P670-NEXT:    ld s2, 24(sp) # 8-byte Folded Reload
; RV64P670-NEXT:    ld s3, 16(sp) # 8-byte Folded Reload
; RV64P670-NEXT:    ld s4, 8(sp) # 8-byte Folded Reload
; RV64P670-NEXT:    .cfi_restore s0
; RV64P670-NEXT:    .cfi_restore s1
; RV64P670-NEXT:    .cfi_restore s2
; RV64P670-NEXT:    .cfi_restore s3
; RV64P670-NEXT:    .cfi_restore s4
; RV64P670-NEXT:    addi sp, sp, 48
; RV64P670-NEXT:    .cfi_def_cfa_offset 0
; RV64P670-NEXT:  .LBB0_12: # %for.cond.cleanup
; RV64P670-NEXT:    ret
;
; RV64X60-LABEL: test1:
; RV64X60:       # %bb.0: # %entry
; RV64X60-NEXT:    csrwi vxrm, 0
; RV64X60-NEXT:    blez a7, .LBB0_12
; RV64X60-NEXT:  # %bb.1: # %for.cond1.preheader.lr.ph
; RV64X60-NEXT:    blez a6, .LBB0_12
; RV64X60-NEXT:  # %bb.2: # %for.cond1.preheader.us.preheader
; RV64X60-NEXT:    addi sp, sp, -48
; RV64X60-NEXT:    .cfi_def_cfa_offset 48
; RV64X60-NEXT:    sd s0, 40(sp) # 8-byte Folded Spill
; RV64X60-NEXT:    sd s1, 32(sp) # 8-byte Folded Spill
; RV64X60-NEXT:    sd s2, 24(sp) # 8-byte Folded Spill
; RV64X60-NEXT:    sd s3, 16(sp) # 8-byte Folded Spill
; RV64X60-NEXT:    sd s4, 8(sp) # 8-byte Folded Spill
; RV64X60-NEXT:    .cfi_offset s0, -8
; RV64X60-NEXT:    .cfi_offset s1, -16
; RV64X60-NEXT:    .cfi_offset s2, -24
; RV64X60-NEXT:    .cfi_offset s3, -32
; RV64X60-NEXT:    .cfi_offset s4, -40
; RV64X60-NEXT:    li t0, 0
; RV64X60-NEXT:    li t1, 0
; RV64X60-NEXT:    addi s1, a7, -1
; RV64X60-NEXT:    zext.w s1, s1
; RV64X60-NEXT:    mul t3, a1, s1
; RV64X60-NEXT:    mul t4, a3, s1
; RV64X60-NEXT:    mul t5, a5, s1
; RV64X60-NEXT:    add s0, a0, a6
; RV64X60-NEXT:    csrr t2, vlenb
; RV64X60-NEXT:    add s1, a2, a6
; RV64X60-NEXT:    add t3, t3, s0
; RV64X60-NEXT:    add s0, a4, a6
; RV64X60-NEXT:    add t4, t4, s1
; RV64X60-NEXT:    li t6, 32
; RV64X60-NEXT:    add t5, t5, s0
; RV64X60-NEXT:    sltu s0, a0, t4
; RV64X60-NEXT:    sltu s1, a2, t3
; RV64X60-NEXT:    and t4, s0, s1
; RV64X60-NEXT:    or s2, a1, a3
; RV64X60-NEXT:    sltu s0, a0, t5
; RV64X60-NEXT:    sltu s1, a4, t3
; RV64X60-NEXT:    srli t3, s2, 63
; RV64X60-NEXT:    and s0, s0, s1
; RV64X60-NEXT:    or s1, a1, a5
; RV64X60-NEXT:    or t4, t4, t3
; RV64X60-NEXT:    slli t3, t2, 1
; RV64X60-NEXT:    srli s1, s1, 63
; RV64X60-NEXT:    or s0, s0, s1
; RV64X60-NEXT:    maxu s1, t3, t6
; RV64X60-NEXT:    or s0, t4, s0
; RV64X60-NEXT:    sltu s1, a6, s1
; RV64X60-NEXT:    or s0, s0, s1
; RV64X60-NEXT:    add t4, a0, a6
; RV64X60-NEXT:    andi t5, s0, 1
; RV64X60-NEXT:    j .LBB0_4
; RV64X60-NEXT:  .LBB0_3: # %for.cond1.for.cond.cleanup3_crit_edge.us
; RV64X60-NEXT:    # in Loop: Header=BB0_4 Depth=1
; RV64X60-NEXT:    add a0, a0, a1
; RV64X60-NEXT:    add a2, a2, a3
; RV64X60-NEXT:    addiw t1, t1, 1
; RV64X60-NEXT:    add a4, a4, a5
; RV64X60-NEXT:    addi t0, t0, 1
; RV64X60-NEXT:    beq t1, a7, .LBB0_11
; RV64X60-NEXT:  .LBB0_4: # %for.cond1.preheader.us
; RV64X60-NEXT:    # =>This Loop Header: Depth=1
; RV64X60-NEXT:    # Child Loop BB0_7 Depth 2
; RV64X60-NEXT:    # Child Loop BB0_10 Depth 2
; RV64X60-NEXT:    beqz t5, .LBB0_6
; RV64X60-NEXT:  # %bb.5: # in Loop: Header=BB0_4 Depth=1
; RV64X60-NEXT:    li t6, 0
; RV64X60-NEXT:    j .LBB0_9
; RV64X60-NEXT:  .LBB0_6: # %vector.ph
; RV64X60-NEXT:    # in Loop: Header=BB0_4 Depth=1
; RV64X60-NEXT:    slli s1, t2, 28
; RV64X60-NEXT:    sub s1, s1, t3
; RV64X60-NEXT:    and t6, s1, a6
; RV64X60-NEXT:    mv s2, a2
; RV64X60-NEXT:    mv s3, a4
; RV64X60-NEXT:    mv s4, a0
; RV64X60-NEXT:    mv s1, t6
; RV64X60-NEXT:    vsetvli s0, zero, e8, m2, ta, ma
; RV64X60-NEXT:  .LBB0_7: # %vector.body
; RV64X60-NEXT:    # Parent Loop BB0_4 Depth=1
; RV64X60-NEXT:    # => This Inner Loop Header: Depth=2
; RV64X60-NEXT:    vl2r.v v8, (s2)
; RV64X60-NEXT:    vl2r.v v10, (s3)
; RV64X60-NEXT:    vaaddu.vv v8, v8, v10
; RV64X60-NEXT:    sub s1, s1, t3
; RV64X60-NEXT:    vs2r.v v8, (s4)
; RV64X60-NEXT:    add s4, s4, t3
; RV64X60-NEXT:    add s3, s3, t3
; RV64X60-NEXT:    add s2, s2, t3
; RV64X60-NEXT:    bnez s1, .LBB0_7
; RV64X60-NEXT:  # %bb.8: # %middle.block
; RV64X60-NEXT:    # in Loop: Header=BB0_4 Depth=1
; RV64X60-NEXT:    beq t6, a6, .LBB0_3
; RV64X60-NEXT:  .LBB0_9: # %for.body4.us.preheader
; RV64X60-NEXT:    # in Loop: Header=BB0_4 Depth=1
; RV64X60-NEXT:    mul s2, a1, t0
; RV64X60-NEXT:    add s0, a0, t6
; RV64X60-NEXT:    add s2, s2, t4
; RV64X60-NEXT:    add s4, a4, t6
; RV64X60-NEXT:    add t6, t6, a2
; RV64X60-NEXT:  .LBB0_10: # %for.body4.us
; RV64X60-NEXT:    # Parent Loop BB0_4 Depth=1
; RV64X60-NEXT:    # => This Inner Loop Header: Depth=2
; RV64X60-NEXT:    lbu s3, 0(t6)
; RV64X60-NEXT:    lbu s1, 0(s4)
; RV64X60-NEXT:    add s1, s1, s3
; RV64X60-NEXT:    addi s1, s1, 1
; RV64X60-NEXT:    srli s1, s1, 1
; RV64X60-NEXT:    sb s1, 0(s0)
; RV64X60-NEXT:    addi s0, s0, 1
; RV64X60-NEXT:    addi s4, s4, 1
; RV64X60-NEXT:    addi t6, t6, 1
; RV64X60-NEXT:    bne s0, s2, .LBB0_10
; RV64X60-NEXT:    j .LBB0_3
; RV64X60-NEXT:  .LBB0_11:
; RV64X60-NEXT:    ld s0, 40(sp) # 8-byte Folded Reload
; RV64X60-NEXT:    ld s1, 32(sp) # 8-byte Folded Reload
; RV64X60-NEXT:    ld s2, 24(sp) # 8-byte Folded Reload
; RV64X60-NEXT:    ld s3, 16(sp) # 8-byte Folded Reload
; RV64X60-NEXT:    ld s4, 8(sp) # 8-byte Folded Reload
; RV64X60-NEXT:    .cfi_restore s0
; RV64X60-NEXT:    .cfi_restore s1
; RV64X60-NEXT:    .cfi_restore s2
; RV64X60-NEXT:    .cfi_restore s3
; RV64X60-NEXT:    .cfi_restore s4
; RV64X60-NEXT:    addi sp, sp, 48
; RV64X60-NEXT:    .cfi_def_cfa_offset 0
; RV64X60-NEXT:  .LBB0_12: # %for.cond.cleanup
; RV64X60-NEXT:    ret
;
; RV64-LABEL: test1:
; RV64:       # %bb.0: # %entry
; RV64-NEXT:    blez a7, .LBB0_14
; RV64-NEXT:  # %bb.1: # %for.cond1.preheader.lr.ph
; RV64-NEXT:    blez a6, .LBB0_14
; RV64-NEXT:  # %bb.2: # %for.cond1.preheader.us.preheader
; RV64-NEXT:    addi sp, sp, -48
; RV64-NEXT:    .cfi_def_cfa_offset 48
; RV64-NEXT:    sd s0, 40(sp) # 8-byte Folded Spill
; RV64-NEXT:    sd s1, 32(sp) # 8-byte Folded Spill
; RV64-NEXT:    sd s2, 24(sp) # 8-byte Folded Spill
; RV64-NEXT:    sd s3, 16(sp) # 8-byte Folded Spill
; RV64-NEXT:    sd s4, 8(sp) # 8-byte Folded Spill
; RV64-NEXT:    .cfi_offset s0, -8
; RV64-NEXT:    .cfi_offset s1, -16
; RV64-NEXT:    .cfi_offset s2, -24
; RV64-NEXT:    .cfi_offset s3, -32
; RV64-NEXT:    .cfi_offset s4, -40
; RV64-NEXT:    addi t1, a7, -1
; RV64-NEXT:    add t5, a0, a6
; RV64-NEXT:    add s0, a2, a6
; RV64-NEXT:    add t6, a4, a6
; RV64-NEXT:    csrr t0, vlenb
; RV64-NEXT:    li t2, 32
; RV64-NEXT:    slli t1, t1, 32
; RV64-NEXT:    srli t3, t1, 32
; RV64-NEXT:    mul t1, a1, t3
; RV64-NEXT:    add t5, t5, t1
; RV64-NEXT:    mul t1, a3, t3
; RV64-NEXT:    add s0, s0, t1
; RV64-NEXT:    slli t1, t0, 1
; RV64-NEXT:    mul t3, a5, t3
; RV64-NEXT:    add t6, t6, t3
; RV64-NEXT:    mv t4, t1
; RV64-NEXT:    bltu t2, t1, .LBB0_4
; RV64-NEXT:  # %bb.3: # %for.cond1.preheader.us.preheader
; RV64-NEXT:    li t4, 32
; RV64-NEXT:  .LBB0_4: # %for.cond1.preheader.us.preheader
; RV64-NEXT:    li t2, 0
; RV64-NEXT:    li t3, 0
; RV64-NEXT:    sltu s0, a0, s0
; RV64-NEXT:    sltu s1, a2, t5
; RV64-NEXT:    and s0, s0, s1
; RV64-NEXT:    sltu t6, a0, t6
; RV64-NEXT:    sltu t5, a4, t5
; RV64-NEXT:    and t5, t6, t5
; RV64-NEXT:    or t6, a1, a3
; RV64-NEXT:    srli t6, t6, 63
; RV64-NEXT:    or t6, s0, t6
; RV64-NEXT:    or s0, a1, a5
; RV64-NEXT:    srli s0, s0, 63
; RV64-NEXT:    or t5, t5, s0
; RV64-NEXT:    sltu s0, a6, t4
; RV64-NEXT:    or t5, t6, t5
; RV64-NEXT:    add t4, a0, a6
; RV64-NEXT:    or t5, s0, t5
; RV64-NEXT:    andi t5, t5, 1
; RV64-NEXT:    csrwi vxrm, 0
; RV64-NEXT:    j .LBB0_6
; RV64-NEXT:  .LBB0_5: # %for.cond1.for.cond.cleanup3_crit_edge.us
; RV64-NEXT:    # in Loop: Header=BB0_6 Depth=1
; RV64-NEXT:    add a0, a0, a1
; RV64-NEXT:    add a2, a2, a3
; RV64-NEXT:    add a4, a4, a5
; RV64-NEXT:    addiw t3, t3, 1
; RV64-NEXT:    addi t2, t2, 1
; RV64-NEXT:    beq t3, a7, .LBB0_13
; RV64-NEXT:  .LBB0_6: # %for.cond1.preheader.us
; RV64-NEXT:    # =>This Loop Header: Depth=1
; RV64-NEXT:    # Child Loop BB0_9 Depth 2
; RV64-NEXT:    # Child Loop BB0_12 Depth 2
; RV64-NEXT:    beqz t5, .LBB0_8
; RV64-NEXT:  # %bb.7: # in Loop: Header=BB0_6 Depth=1
; RV64-NEXT:    li t6, 0
; RV64-NEXT:    j .LBB0_11
; RV64-NEXT:  .LBB0_8: # %vector.ph
; RV64-NEXT:    # in Loop: Header=BB0_6 Depth=1
; RV64-NEXT:    slli t6, t0, 28
; RV64-NEXT:    sub t6, t6, t1
; RV64-NEXT:    and t6, t6, a6
; RV64-NEXT:    mv s0, a2
; RV64-NEXT:    mv s1, a4
; RV64-NEXT:    mv s2, a0
; RV64-NEXT:    mv s3, t6
; RV64-NEXT:    vsetvli s4, zero, e8, m2, ta, ma
; RV64-NEXT:  .LBB0_9: # %vector.body
; RV64-NEXT:    # Parent Loop BB0_6 Depth=1
; RV64-NEXT:    # => This Inner Loop Header: Depth=2
; RV64-NEXT:    vl2r.v v8, (s0)
; RV64-NEXT:    vl2r.v v10, (s1)
; RV64-NEXT:    sub s3, s3, t1
; RV64-NEXT:    add s1, s1, t1
; RV64-NEXT:    vaaddu.vv v8, v8, v10
; RV64-NEXT:    vs2r.v v8, (s2)
; RV64-NEXT:    add s2, s2, t1
; RV64-NEXT:    add s0, s0, t1
; RV64-NEXT:    bnez s3, .LBB0_9
; RV64-NEXT:  # %bb.10: # %middle.block
; RV64-NEXT:    # in Loop: Header=BB0_6 Depth=1
; RV64-NEXT:    beq t6, a6, .LBB0_5
; RV64-NEXT:  .LBB0_11: # %for.body4.us.preheader
; RV64-NEXT:    # in Loop: Header=BB0_6 Depth=1
; RV64-NEXT:    mul s2, a1, t2
; RV64-NEXT:    add s0, a0, t6
; RV64-NEXT:    add s1, a4, t6
; RV64-NEXT:    add s2, t4, s2
; RV64-NEXT:    add t6, a2, t6
; RV64-NEXT:  .LBB0_12: # %for.body4.us
; RV64-NEXT:    # Parent Loop BB0_6 Depth=1
; RV64-NEXT:    # => This Inner Loop Header: Depth=2
; RV64-NEXT:    lbu s3, 0(t6)
; RV64-NEXT:    lbu s4, 0(s1)
; RV64-NEXT:    add s3, s3, s4
; RV64-NEXT:    addi s3, s3, 1
; RV64-NEXT:    srli s3, s3, 1
; RV64-NEXT:    sb s3, 0(s0)
; RV64-NEXT:    addi s0, s0, 1
; RV64-NEXT:    addi s1, s1, 1
; RV64-NEXT:    addi t6, t6, 1
; RV64-NEXT:    bne s0, s2, .LBB0_12
; RV64-NEXT:    j .LBB0_5
; RV64-NEXT:  .LBB0_13:
; RV64-NEXT:    ld s0, 40(sp) # 8-byte Folded Reload
; RV64-NEXT:    ld s1, 32(sp) # 8-byte Folded Reload
; RV64-NEXT:    ld s2, 24(sp) # 8-byte Folded Reload
; RV64-NEXT:    ld s3, 16(sp) # 8-byte Folded Reload
; RV64-NEXT:    ld s4, 8(sp) # 8-byte Folded Reload
; RV64-NEXT:    .cfi_restore s0
; RV64-NEXT:    .cfi_restore s1
; RV64-NEXT:    .cfi_restore s2
; RV64-NEXT:    .cfi_restore s3
; RV64-NEXT:    .cfi_restore s4
; RV64-NEXT:    addi sp, sp, 48
; RV64-NEXT:    .cfi_def_cfa_offset 0
; RV64-NEXT:  .LBB0_14: # %for.cond.cleanup
; RV64-NEXT:    ret
entry:
  %cmp29 = icmp sgt i32 %i_height, 0
  br i1 %cmp29, label %for.cond1.preheader.lr.ph, label %for.cond.cleanup

for.cond1.preheader.lr.ph:                        ; preds = %entry
  %cmp227 = icmp sgt i32 %i_width, 0
  %idx.ext = sext i32 %i_dst_stride to i64
  %idx.ext12 = sext i32 %i_src1_stride to i64
  %idx.ext14 = sext i32 %i_src2_stride to i64
  br i1 %cmp227, label %for.cond1.preheader.us.preheader, label %for.cond.cleanup

for.cond1.preheader.us.preheader:                 ; preds = %for.cond1.preheader.lr.ph
  %wide.trip.count = zext nneg i32 %i_width to i64
  %0 = add nsw i32 %i_height, -1
  %1 = zext i32 %0 to i64
  %2 = mul nsw i64 %idx.ext, %1
  %3 = getelementptr i8, ptr %dst, i64 %2
  %scevgep = getelementptr i8, ptr %3, i64 %wide.trip.count
  %4 = mul nsw i64 %idx.ext12, %1
  %5 = getelementptr i8, ptr %src1, i64 %4
  %scevgep36 = getelementptr i8, ptr %5, i64 %wide.trip.count
  %6 = mul nsw i64 %idx.ext14, %1
  %7 = getelementptr i8, ptr %src2, i64 %6
  %scevgep37 = getelementptr i8, ptr %7, i64 %wide.trip.count
  %8 = tail call i64 @llvm.vscale.i64()
  %9 = shl nuw nsw i64 %8, 4
  %10 = tail call i64 @llvm.umax.i64(i64 %9, i64 32)
  %min.iters.check = icmp ugt i64 %10, %wide.trip.count
  %bound0 = icmp ult ptr %dst, %scevgep36
  %bound1 = icmp ult ptr %src1, %scevgep
  %found.conflict = and i1 %bound0, %bound1
  %11 = or i32 %i_dst_stride, %i_src1_stride
  %12 = icmp slt i32 %11, 0
  %13 = or i1 %found.conflict, %12
  %bound039 = icmp ult ptr %dst, %scevgep37
  %bound140 = icmp ult ptr %src2, %scevgep
  %found.conflict41 = and i1 %bound039, %bound140
  %14 = or i32 %i_dst_stride, %i_src2_stride
  %15 = icmp slt i32 %14, 0
  %16 = or i1 %found.conflict41, %15
  %conflict.rdx = or i1 %13, %16
  br label %for.cond1.preheader.us

for.cond1.preheader.us:                           ; preds = %for.cond1.preheader.us.preheader, %for.cond1.for.cond.cleanup3_crit_edge.us
  %y.033.us = phi i32 [ %inc17.us, %for.cond1.for.cond.cleanup3_crit_edge.us ], [ 0, %for.cond1.preheader.us.preheader ]
  %dst.addr.032.us = phi ptr [ %add.ptr.us, %for.cond1.for.cond.cleanup3_crit_edge.us ], [ %dst, %for.cond1.preheader.us.preheader ]
  %src1.addr.031.us = phi ptr [ %add.ptr13.us, %for.cond1.for.cond.cleanup3_crit_edge.us ], [ %src1, %for.cond1.preheader.us.preheader ]
  %src2.addr.030.us = phi ptr [ %add.ptr15.us, %for.cond1.for.cond.cleanup3_crit_edge.us ], [ %src2, %for.cond1.preheader.us.preheader ]
  %brmerge = select i1 %min.iters.check, i1 true, i1 %conflict.rdx
  br i1 %brmerge, label %for.body4.us.preheader, label %vector.ph

vector.ph:                                        ; preds = %for.cond1.preheader.us
  %17 = tail call i64 @llvm.vscale.i64()
  %.neg = mul nuw nsw i64 %17, 2147483632
  %n.vec = and i64 %.neg, %wide.trip.count
  %18 = tail call i64 @llvm.vscale.i64()
  %19 = shl nuw nsw i64 %18, 4
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %vector.ph
  %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  %20 = getelementptr inbounds i8, ptr %src1.addr.031.us, i64 %index
  %wide.load = load <vscale x 16 x i8>, ptr %20, align 1
  %21 = zext <vscale x 16 x i8> %wide.load to <vscale x 16 x i16>
  %22 = getelementptr inbounds i8, ptr %src2.addr.030.us, i64 %index
  %wide.load44 = load <vscale x 16 x i8>, ptr %22, align 1
  %23 = zext <vscale x 16 x i8> %wide.load44 to <vscale x 16 x i16>
  %24 = add nuw nsw <vscale x 16 x i16> %21, shufflevector (<vscale x 16 x i16> insertelement (<vscale x 16 x i16> poison, i16 1, i64 0), <vscale x 16 x i16> poison, <vscale x 16 x i32> zeroinitializer)
  %25 = add nuw nsw <vscale x 16 x i16> %24, %23
  %26 = lshr <vscale x 16 x i16> %25, shufflevector (<vscale x 16 x i16> insertelement (<vscale x 16 x i16> poison, i16 1, i64 0), <vscale x 16 x i16> poison, <vscale x 16 x i32> zeroinitializer)
  %27 = trunc <vscale x 16 x i16> %26 to <vscale x 16 x i8>
  %28 = getelementptr inbounds i8, ptr %dst.addr.032.us, i64 %index
  store <vscale x 16 x i8> %27, ptr %28, align 1
  %index.next = add nuw i64 %index, %19
  %29 = icmp eq i64 %index.next, %n.vec
  br i1 %29, label %middle.block, label %vector.body

middle.block:                                     ; preds = %vector.body
  %cmp.n = icmp eq i64 %n.vec, %wide.trip.count
  br i1 %cmp.n, label %for.cond1.for.cond.cleanup3_crit_edge.us, label %for.body4.us.preheader

for.body4.us.preheader:                           ; preds = %for.cond1.preheader.us, %middle.block
  %indvars.iv.ph = phi i64 [ 0, %for.cond1.preheader.us ], [ %n.vec, %middle.block ]
  br label %for.body4.us

for.body4.us:                                     ; preds = %for.body4.us.preheader, %for.body4.us
  %indvars.iv = phi i64 [ %indvars.iv.next, %for.body4.us ], [ %indvars.iv.ph, %for.body4.us.preheader ]
  %arrayidx.us = getelementptr inbounds i8, ptr %src1.addr.031.us, i64 %indvars.iv
  %30 = load i8, ptr %arrayidx.us, align 1
  %conv.us = zext i8 %30 to i16
  %arrayidx6.us = getelementptr inbounds i8, ptr %src2.addr.030.us, i64 %indvars.iv
  %31 = load i8, ptr %arrayidx6.us, align 1
  %conv7.us = zext i8 %31 to i16
  %add.us = add nuw nsw i16 %conv.us, 1
  %add8.us = add nuw nsw i16 %add.us, %conv7.us
  %shr.us = lshr i16 %add8.us, 1
  %conv9.us = trunc nuw i16 %shr.us to i8
  %arrayidx11.us = getelementptr inbounds i8, ptr %dst.addr.032.us, i64 %indvars.iv
  store i8 %conv9.us, ptr %arrayidx11.us, align 1
  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
  %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count
  br i1 %exitcond.not, label %for.cond1.for.cond.cleanup3_crit_edge.us, label %for.body4.us

for.cond1.for.cond.cleanup3_crit_edge.us:         ; preds = %for.body4.us, %middle.block
  %add.ptr.us = getelementptr inbounds i8, ptr %dst.addr.032.us, i64 %idx.ext
  %add.ptr13.us = getelementptr inbounds i8, ptr %src1.addr.031.us, i64 %idx.ext12
  %add.ptr15.us = getelementptr inbounds i8, ptr %src2.addr.030.us, i64 %idx.ext14
  %inc17.us = add nuw nsw i32 %y.033.us, 1
  %exitcond35.not = icmp eq i32 %inc17.us, %i_height
  br i1 %exitcond35.not, label %for.cond.cleanup, label %for.cond1.preheader.us

for.cond.cleanup:                                 ; preds = %for.cond1.for.cond.cleanup3_crit_edge.us, %for.cond1.preheader.lr.ph, %entry
  ret void
}

declare i64 @llvm.vscale.i64()
declare i64 @llvm.umax.i64(i64, i64)
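
; A scalar C sketch of the kernel @test1 implements, for reference only (this
; comment is not checked by FileCheck, and the C names below are illustrative):
;
;   void test1(uint8_t *dst, int i_dst_stride,
;              const uint8_t *src1, int i_src1_stride,
;              const uint8_t *src2, int i_src2_stride,
;              int i_width, int i_height) {
;     for (int y = 0; y < i_height; y++) {
;       for (int x = 0; x < i_width; x++)
;         dst[x] = (src1[x] + src2[x] + 1) >> 1;  // rounding byte average
;       dst += i_dst_stride;
;       src1 += i_src1_stride;
;       src2 += i_src2_stride;
;     }
;   }
;
; The add-1-then-lshr-1 idiom in %vector.body is the unsigned rounding-average
; pattern that instruction selection folds into vaaddu.vv; "csrwi vxrm, 0" in
; the checks selects rnu (round-to-nearest-up), which supplies the +1 term.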