; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=riscv32 -mattr=+m,+v < %s | FileCheck --check-prefix=RV32 %s
; RUN: llc -mtriple=riscv64 -mattr=+m,+v < %s | FileCheck --check-prefix=RV64 %s

; FIXME: We can rematerialize "addi s0, a1, 32" (ideally along the edge
; %do_call -> %exit), and shrink wrap this routine.
define void @vecaddr_straightline(i32 zeroext %a, ptr %p) {
; RV32-LABEL: vecaddr_straightline:
; RV32:       # %bb.0:
; RV32-NEXT:    addi sp, sp, -16
; RV32-NEXT:    .cfi_def_cfa_offset 16
; RV32-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
; RV32-NEXT:    sw s0, 8(sp) # 4-byte Folded Spill
; RV32-NEXT:    .cfi_offset ra, -4
; RV32-NEXT:    .cfi_offset s0, -8
; RV32-NEXT:    addi s0, a1, 32
; RV32-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
; RV32-NEXT:    vle32.v v8, (s0)
; RV32-NEXT:    vadd.vi v8, v8, 1
; RV32-NEXT:    li a1, 57
; RV32-NEXT:    vse32.v v8, (s0)
; RV32-NEXT:    beq a0, a1, .LBB0_2
; RV32-NEXT:  # %bb.1: # %do_call
; RV32-NEXT:    call foo
; RV32-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
; RV32-NEXT:  .LBB0_2: # %exit
; RV32-NEXT:    vle32.v v8, (s0)
; RV32-NEXT:    vadd.vi v8, v8, 1
; RV32-NEXT:    vse32.v v8, (s0)
; RV32-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
; RV32-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
; RV32-NEXT:    .cfi_restore ra
; RV32-NEXT:    .cfi_restore s0
; RV32-NEXT:    addi sp, sp, 16
; RV32-NEXT:    .cfi_def_cfa_offset 0
; RV32-NEXT:    ret
;
; RV64-LABEL: vecaddr_straightline:
; RV64:       # %bb.0:
; RV64-NEXT:    addi sp, sp, -16
; RV64-NEXT:    .cfi_def_cfa_offset 16
; RV64-NEXT:    sd ra, 8(sp) # 8-byte Folded Spill
; RV64-NEXT:    sd s0, 0(sp) # 8-byte Folded Spill
; RV64-NEXT:    .cfi_offset ra, -8
; RV64-NEXT:    .cfi_offset s0, -16
; RV64-NEXT:    addi s0, a1, 32
; RV64-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
; RV64-NEXT:    vle32.v v8, (s0)
; RV64-NEXT:    vadd.vi v8, v8, 1
; RV64-NEXT:    li a1, 57
; RV64-NEXT:    vse32.v v8, (s0)
; RV64-NEXT:    beq a0, a1, .LBB0_2
; RV64-NEXT:  # %bb.1: # %do_call
; RV64-NEXT:    call foo
; RV64-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
; RV64-NEXT:  .LBB0_2: # %exit
; RV64-NEXT:    vle32.v v8, (s0)
; RV64-NEXT:    vadd.vi v8, v8, 1
; RV64-NEXT:    vse32.v v8, (s0)
; RV64-NEXT:    ld ra, 8(sp) # 8-byte Folded Reload
; RV64-NEXT:    ld s0, 0(sp) # 8-byte Folded Reload
; RV64-NEXT:    .cfi_restore ra
; RV64-NEXT:    .cfi_restore s0
; RV64-NEXT:    addi sp, sp, 16
; RV64-NEXT:    .cfi_def_cfa_offset 0
; RV64-NEXT:    ret
  %gep = getelementptr i8, ptr %p, i32 32
  %v1 = load <4 x i32>, ptr %gep
  %v2 = add <4 x i32> %v1, splat (i32 1)
  store <4 x i32> %v2, ptr %gep
  %cmp0 = icmp eq i32 %a, 57
  br i1 %cmp0, label %exit, label %do_call

do_call:
  call i32 @foo()
  br label %exit

exit:
  %v3 = load <4 x i32>, ptr %gep
  %v4 = add <4 x i32> %v3, splat (i32 1)
  store <4 x i32> %v4, ptr %gep
  ret void
}
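
; A hypothetical sketch of the codegen the FIXME above asks for (register
; choices are illustrative assumptions; nothing below is checked by this
; test). With the address rematerialized on the %do_call -> %exit edge,
; the early-exit path never touches s0, so the prologue/epilogue could be
; shrink wrapped into %do_call:
;
;   # %bb.0:                    # no frame setup on this path
;     addi a2, a1, 32           # address in a caller-saved register
;     ... vector load/add/store via (a2) ...
;     beq a0, a1, .LBB0_2       # fast path exits without a frame
;   # %do_call:                 # frame created only when the call happens
;     addi sp, sp, -16
;     sw ra, 12(sp)
;     call foo
;     addi a2, ..., 32          # rematerialized instead of held in s0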

; In this case, the second use is in a loop, so using a callee-saved
; register to avoid a remat is the profitable choice.
; FIXME: We can shrink wrap the frame setup around the loop
; and avoid it along the %bb.0 -> %exit edge.
define void @vecaddr_loop(i32 zeroext %a, ptr %p) {
; RV32-LABEL: vecaddr_loop:
; RV32:       # %bb.0:
; RV32-NEXT:    addi sp, sp, -16
; RV32-NEXT:    .cfi_def_cfa_offset 16
; RV32-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
; RV32-NEXT:    sw s0, 8(sp) # 4-byte Folded Spill
; RV32-NEXT:    .cfi_offset ra, -4
; RV32-NEXT:    .cfi_offset s0, -8
; RV32-NEXT:    addi s0, a1, 32
; RV32-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
; RV32-NEXT:    vle32.v v8, (s0)
; RV32-NEXT:    vadd.vi v8, v8, 1
; RV32-NEXT:    li a1, 57
; RV32-NEXT:    vse32.v v8, (s0)
; RV32-NEXT:    beq a0, a1, .LBB1_2
; RV32-NEXT:  .LBB1_1: # %do_call
; RV32-NEXT:    # =>This Inner Loop Header: Depth=1
; RV32-NEXT:    call foo
; RV32-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
; RV32-NEXT:    vle32.v v8, (s0)
; RV32-NEXT:    vadd.vi v8, v8, 1
; RV32-NEXT:    vse32.v v8, (s0)
; RV32-NEXT:    bnez a0, .LBB1_1
; RV32-NEXT:  .LBB1_2: # %exit
; RV32-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
; RV32-NEXT:    lw s0, 8(sp) # 4-byte Folded Reload
; RV32-NEXT:    .cfi_restore ra
; RV32-NEXT:    .cfi_restore s0
; RV32-NEXT:    addi sp, sp, 16
; RV32-NEXT:    .cfi_def_cfa_offset 0
; RV32-NEXT:    ret
;
; RV64-LABEL: vecaddr_loop:
; RV64:       # %bb.0:
; RV64-NEXT:    addi sp, sp, -16
; RV64-NEXT:    .cfi_def_cfa_offset 16
; RV64-NEXT:    sd ra, 8(sp) # 8-byte Folded Spill
; RV64-NEXT:    sd s0, 0(sp) # 8-byte Folded Spill
; RV64-NEXT:    .cfi_offset ra, -8
; RV64-NEXT:    .cfi_offset s0, -16
; RV64-NEXT:    addi s0, a1, 32
; RV64-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
; RV64-NEXT:    vle32.v v8, (s0)
; RV64-NEXT:    vadd.vi v8, v8, 1
; RV64-NEXT:    li a1, 57
; RV64-NEXT:    vse32.v v8, (s0)
; RV64-NEXT:    beq a0, a1, .LBB1_2
; RV64-NEXT:  .LBB1_1: # %do_call
; RV64-NEXT:    # =>This Inner Loop Header: Depth=1
; RV64-NEXT:    call foo
; RV64-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
; RV64-NEXT:    vle32.v v8, (s0)
; RV64-NEXT:    vadd.vi v8, v8, 1
; RV64-NEXT:    vse32.v v8, (s0)
; RV64-NEXT:    bnez a0, .LBB1_1
; RV64-NEXT:  .LBB1_2: # %exit
; RV64-NEXT:    ld ra, 8(sp) # 8-byte Folded Reload
; RV64-NEXT:    ld s0, 0(sp) # 8-byte Folded Reload
; RV64-NEXT:    .cfi_restore ra
; RV64-NEXT:    .cfi_restore s0
; RV64-NEXT:    addi sp, sp, 16
; RV64-NEXT:    .cfi_def_cfa_offset 0
; RV64-NEXT:    ret
  %gep = getelementptr i8, ptr %p, i32 32
  %v1 = load <4 x i32>, ptr %gep
  %v2 = add <4 x i32> %v1, splat (i32 1)
  store <4 x i32> %v2, ptr %gep
  %cmp0 = icmp eq i32 %a, 57
  br i1 %cmp0, label %exit, label %do_call

do_call:
  %b = call i32 @foo()
  %v3 = load <4 x i32>, ptr %gep
  %v4 = add <4 x i32> %v3, splat (i32 1)
  store <4 x i32> %v4, ptr %gep
  %cmp1 = icmp eq i32 %b, 0
  br i1 %cmp1, label %exit, label %do_call

exit:
  ret void
}

declare zeroext i32 @foo()
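
; A hypothetical sketch of the shrink wrapping the FIXME before
; @vecaddr_loop asks for (illustrative assumptions only; nothing below is
; checked by this test). Keeping the address in s0 across the loop remains
; profitable, but the frame setup could sink past the early exit so the
; %bb.0 -> %exit path stays frame-free:
;
;   # %bb.0:                    # no sp adjustment, no ra/s0 spills
;     ... vector work with the address in a caller-saved register ...
;     beq a0, a1, .LBB1_2       # early exit: returns without a frame
;   # %do_call:                 # prologue only here, before the loop
;     addi sp, sp, -16          # the loop then runs with the address in
;     sw ra, 12(sp)             # s0, and the epilogue executes only on
;     sw s0, 8(sp)              # the loop's exit path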