; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
; RUN: llc -mtriple=aarch64-unknown-linux-gnu < %s | FileCheck %s
; FIXME: Currently, we avoid narrowing this v4i32 load, in the
; hopes of being able to fold the shift, despite it requiring stack
; storage + loads. Ideally, we should narrow here and load the i32
; directly from the variable offset e.g:
;
; add x8, x0, x1, lsl #4
; and x9, x2, #0x3
; ldr w0, [x8, x9, lsl #2]
;
; The AArch64TargetLowering::shouldReduceLoadWidth heuristic should
; probably be updated to choose load-narrowing instead of folding the
; lsl in larger vector cases.
;
define i32 @narrow_load_v4_i32_single_ele_variable_idx(ptr %ptr, i64 %off, i32 %ele) {
; CHECK-LABEL: narrow_load_v4_i32_single_ele_variable_idx:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: sub sp, sp, #16
; CHECK-NEXT: .cfi_def_cfa_offset 16
; CHECK-NEXT: ldr q0, [x0, x1, lsl #4]
; CHECK-NEXT: mov x8, sp
; CHECK-NEXT: // kill: def $w2 killed $w2 def $x2
; CHECK-NEXT: bfi x8, x2, #2, #2
; CHECK-NEXT: str q0, [sp]
; CHECK-NEXT: ldr w0, [x8]
; CHECK-NEXT: add sp, sp, #16
; CHECK-NEXT: ret
;
; Loads a whole <4 x i32> at vector index %off, then extracts the single lane
; selected by the runtime index %ele. As the CHECK lines above document,
; current codegen keeps the full q-register load (folding the lsl #4 into the
; address), spills the vector to the stack, and reloads one word via a
; bfi-built stack address — the round trip described by the FIXME at the top
; of the file.
entry:
%idx = getelementptr inbounds <4 x i32>, ptr %ptr, i64 %off ; &ptr[%off], 16-byte element stride
%x = load <4 x i32>, ptr %idx, align 8 ; note: only 8-byte alignment on a 16-byte vector
%res = extractelement <4 x i32> %x, i32 %ele ; variable (non-constant) lane index
ret i32 %res
}