Diffstat (limited to 'llvm/test')
-rw-r--r--  llvm/test/Analysis/CostModel/AArch64/sve-arith-fp.ll  268
-rw-r--r--  llvm/test/Analysis/DependenceAnalysis/monotonicity-no-wrap-flags.ll  56
-rw-r--r--  llvm/test/CMakeLists.txt  2
-rw-r--r--  llvm/test/CodeGen/AArch64/aarch64-matmul-fp16.ll  14
-rw-r--r--  llvm/test/CodeGen/AArch64/aarch64-matmul-fp32.ll  13
-rw-r--r--  llvm/test/CodeGen/AArch64/sve2p1-fdot.ll  93
-rw-r--r--  llvm/test/CodeGen/AArch64/sve2p1-fixed-length-fdot.ll  230
-rw-r--r--  llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.mfma.gfx90a.ll  2
-rw-r--r--  llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll  941
-rw-r--r--  llvm/test/CodeGen/AMDGPU/amdgpu-simplify-uniform-waterfall.ll  6
-rw-r--r--  llvm/test/CodeGen/AMDGPU/amdgpu-uniform-intrinsic-combine.ll  32
-rw-r--r--  llvm/test/CodeGen/AMDGPU/call-args-inreg-bfloat.ll  130
-rw-r--r--  llvm/test/CodeGen/AMDGPU/call-args-inreg.ll  604
-rw-r--r--  llvm/test/CodeGen/AMDGPU/call-argument-types.ll  3547
-rw-r--r--  llvm/test/CodeGen/AMDGPU/call-c-function.ll  61
-rw-r--r--  llvm/test/CodeGen/AMDGPU/call-constexpr.ll  343
-rw-r--r--  llvm/test/CodeGen/AMDGPU/call-defs-mode-register.ll  95
-rw-r--r--  llvm/test/CodeGen/AMDGPU/call-encoding.ll  6
-rw-r--r--  llvm/test/CodeGen/AMDGPU/call-graph-register-usage.ll  15
-rw-r--r--  llvm/test/CodeGen/AMDGPU/call-preserved-registers.ll  90
-rw-r--r--  llvm/test/CodeGen/AMDGPU/call-return-types.ll  13
-rw-r--r--  llvm/test/CodeGen/AMDGPU/call-skip.ll  112
-rw-r--r--  llvm/test/CodeGen/AMDGPU/call-waitcnt.ll  211
-rw-r--r--  llvm/test/CodeGen/AMDGPU/carryout-selection.ll  4
-rw-r--r--  llvm/test/CodeGen/AMDGPU/expand-scalar-carry-out-select-user.ll  10
-rw-r--r--  llvm/test/CodeGen/AMDGPU/fix-sgpr-copies-wwm.ll  18
-rw-r--r--  llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx90a.ll  8
-rw-r--r--  llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.ll  49
-rw-r--r--  llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readfirstlane.ll  48
-rw-r--r--  llvm/test/CodeGen/AMDGPU/mfma-loop.ll  6
-rw-r--r--  llvm/test/CodeGen/AMDGPU/optimize-compare.mir  178
-rw-r--r--  llvm/test/CodeGen/AMDGPU/rewrite-vgpr-mfma-to-agpr.ll  2
-rw-r--r--  llvm/test/CodeGen/AMDGPU/sdiv64.ll  368
-rw-r--r--  llvm/test/CodeGen/AMDGPU/shlN_add.ll (renamed from llvm/test/CodeGen/AMDGPU/GlobalISel/shlN_add.ll)  372
-rw-r--r--  llvm/test/CodeGen/AMDGPU/spill-vgpr-to-agpr-update-regscavenger.ll  23
-rw-r--r--  llvm/test/CodeGen/AMDGPU/splitkit-getsubrangeformask.ll  198
-rw-r--r--  llvm/test/CodeGen/AMDGPU/srem64.ll  410
-rw-r--r--  llvm/test/CodeGen/AMDGPU/uaddo.ll  6
-rw-r--r--  llvm/test/CodeGen/AMDGPU/udiv64.ll  199
-rw-r--r--  llvm/test/CodeGen/AMDGPU/urem64.ll  296
-rw-r--r--  llvm/test/CodeGen/AMDGPU/usubo.ll  6
-rw-r--r--  llvm/test/CodeGen/PowerPC/vp-ld-st.ll  160
-rw-r--r--  llvm/test/DebugInfo/extradata-node-reference.ll  2
-rw-r--r--  llvm/test/Transforms/InstCombine/sink-dereferenceable-assume.ll  31
-rw-r--r--  llvm/test/Transforms/PhaseOrdering/AArch64/std-find.ll  141
-rw-r--r--  llvm/test/Transforms/PhaseOrdering/X86/addsub-inseltpoison.ll  6
-rw-r--r--  llvm/test/Transforms/PhaseOrdering/X86/addsub.ll  6
-rw-r--r--  llvm/test/Transforms/VectorCombine/X86/extract-fneg-insert.ll  150
-rw-r--r--  llvm/test/lit.cfg.py  10
-rw-r--r--  llvm/test/lit.site.cfg.py.in  1
-rw-r--r--  llvm/test/tools/dsymutil/ARM/typedefs-with-same-name.test  41
-rw-r--r--  llvm/test/tools/dsymutil/Inputs/typedefs-with-same-name.o  bin 0 -> 2080 bytes
-rw-r--r--  llvm/test/tools/dsymutil/X86/DWARFLinkerParallel/odr-fwd-declaration.test  8
-rw-r--r--  llvm/test/tools/llvm-cas/Inputs/oneline  1
-rw-r--r--  llvm/test/tools/llvm-cas/Inputs/oneline-nonewline  1
-rw-r--r--  llvm/test/tools/llvm-cas/action-cache.test  14
-rw-r--r--  llvm/test/tools/llvm-cas/cache.test  14
-rw-r--r--  llvm/test/tools/llvm-cas/dump.test  27
-rw-r--r--  llvm/test/tools/llvm-cas/lit.local.cfg  2
-rw-r--r--  llvm/test/tools/llvm-cas/make-blob.test  41
-rw-r--r--  llvm/test/tools/llvm-cas/make-node.test  37
-rw-r--r--  llvm/test/tools/llvm-cas/print-id.test  13
-rw-r--r--  llvm/test/tools/llvm-cas/validation.test  31
63 files changed, 6263 insertions, 3559 deletions
diff --git a/llvm/test/Analysis/CostModel/AArch64/sve-arith-fp.ll b/llvm/test/Analysis/CostModel/AArch64/sve-arith-fp.ll
index 1c40354..ec848c2 100644
--- a/llvm/test/Analysis/CostModel/AArch64/sve-arith-fp.ll
+++ b/llvm/test/Analysis/CostModel/AArch64/sve-arith-fp.ll
@@ -5,214 +5,214 @@ target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
define void @fadd() {
; CHECK-LABEL: 'fadd'
-; CHECK-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %V4F16 = fadd <vscale x 4 x half> undef, undef
-; CHECK-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %V8F16 = fadd <vscale x 8 x half> undef, undef
-; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:3 SizeLat:1 for: %V16F16 = fadd <vscale x 16 x half> undef, undef
-; CHECK-NEXT: Cost Model: Found costs of Invalid for: %V1F32 = fadd <vscale x 1 x float> undef, undef
-; CHECK-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %V2F32 = fadd <vscale x 2 x float> undef, undef
-; CHECK-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %V4F32 = fadd <vscale x 4 x float> undef, undef
-; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:3 SizeLat:1 for: %V8F32 = fadd <vscale x 8 x float> undef, undef
-; CHECK-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %V2F64 = fadd <vscale x 2 x double> undef, undef
-; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:3 SizeLat:1 for: %V4F64 = fadd <vscale x 4 x double> undef, undef
+; CHECK-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %V4F16 = fadd <vscale x 4 x half> poison, poison
+; CHECK-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %V8F16 = fadd <vscale x 8 x half> poison, poison
+; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:3 SizeLat:1 for: %V16F16 = fadd <vscale x 16 x half> poison, poison
+; CHECK-NEXT: Cost Model: Found costs of Invalid for: %V1F32 = fadd <vscale x 1 x float> poison, poison
+; CHECK-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %V2F32 = fadd <vscale x 2 x float> poison, poison
+; CHECK-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %V4F32 = fadd <vscale x 4 x float> poison, poison
+; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:3 SizeLat:1 for: %V8F32 = fadd <vscale x 8 x float> poison, poison
+; CHECK-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %V2F64 = fadd <vscale x 2 x double> poison, poison
+; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:3 SizeLat:1 for: %V4F64 = fadd <vscale x 4 x double> poison, poison
; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
- %V4F16 = fadd <vscale x 4 x half> undef, undef
- %V8F16 = fadd <vscale x 8 x half> undef, undef
- %V16F16 = fadd <vscale x 16 x half> undef, undef
+ %V4F16 = fadd <vscale x 4 x half> poison, poison
+ %V8F16 = fadd <vscale x 8 x half> poison, poison
+ %V16F16 = fadd <vscale x 16 x half> poison, poison
- %V1F32 = fadd <vscale x 1 x float> undef, undef
- %V2F32 = fadd <vscale x 2 x float> undef, undef
- %V4F32 = fadd <vscale x 4 x float> undef, undef
- %V8F32 = fadd <vscale x 8 x float> undef, undef
+ %V1F32 = fadd <vscale x 1 x float> poison, poison
+ %V2F32 = fadd <vscale x 2 x float> poison, poison
+ %V4F32 = fadd <vscale x 4 x float> poison, poison
+ %V8F32 = fadd <vscale x 8 x float> poison, poison
- %V2F64 = fadd <vscale x 2 x double> undef, undef
- %V4F64 = fadd <vscale x 4 x double> undef, undef
+ %V2F64 = fadd <vscale x 2 x double> poison, poison
+ %V4F64 = fadd <vscale x 4 x double> poison, poison
ret void
}
define void @fsub() {
; CHECK-LABEL: 'fsub'
-; CHECK-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %V4F16 = fsub <vscale x 4 x half> undef, undef
-; CHECK-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %V8F16 = fsub <vscale x 8 x half> undef, undef
-; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:3 SizeLat:1 for: %V16F16 = fsub <vscale x 16 x half> undef, undef
-; CHECK-NEXT: Cost Model: Found costs of Invalid for: %V1F32 = fsub <vscale x 1 x float> undef, undef
-; CHECK-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %V2F32 = fsub <vscale x 2 x float> undef, undef
-; CHECK-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %V4F32 = fsub <vscale x 4 x float> undef, undef
-; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:3 SizeLat:1 for: %V8F32 = fsub <vscale x 8 x float> undef, undef
-; CHECK-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %V2F64 = fsub <vscale x 2 x double> undef, undef
-; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:3 SizeLat:1 for: %V4F64 = fsub <vscale x 4 x double> undef, undef
+; CHECK-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %V4F16 = fsub <vscale x 4 x half> poison, poison
+; CHECK-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %V8F16 = fsub <vscale x 8 x half> poison, poison
+; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:3 SizeLat:1 for: %V16F16 = fsub <vscale x 16 x half> poison, poison
+; CHECK-NEXT: Cost Model: Found costs of Invalid for: %V1F32 = fsub <vscale x 1 x float> poison, poison
+; CHECK-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %V2F32 = fsub <vscale x 2 x float> poison, poison
+; CHECK-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %V4F32 = fsub <vscale x 4 x float> poison, poison
+; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:3 SizeLat:1 for: %V8F32 = fsub <vscale x 8 x float> poison, poison
+; CHECK-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %V2F64 = fsub <vscale x 2 x double> poison, poison
+; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:3 SizeLat:1 for: %V4F64 = fsub <vscale x 4 x double> poison, poison
; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
- %V4F16 = fsub <vscale x 4 x half> undef, undef
- %V8F16 = fsub <vscale x 8 x half> undef, undef
- %V16F16 = fsub <vscale x 16 x half> undef, undef
+ %V4F16 = fsub <vscale x 4 x half> poison, poison
+ %V8F16 = fsub <vscale x 8 x half> poison, poison
+ %V16F16 = fsub <vscale x 16 x half> poison, poison
- %V1F32 = fsub <vscale x 1 x float> undef, undef
- %V2F32 = fsub <vscale x 2 x float> undef, undef
- %V4F32 = fsub <vscale x 4 x float> undef, undef
- %V8F32 = fsub <vscale x 8 x float> undef, undef
+ %V1F32 = fsub <vscale x 1 x float> poison, poison
+ %V2F32 = fsub <vscale x 2 x float> poison, poison
+ %V4F32 = fsub <vscale x 4 x float> poison, poison
+ %V8F32 = fsub <vscale x 8 x float> poison, poison
- %V2F64 = fsub <vscale x 2 x double> undef, undef
- %V4F64 = fsub <vscale x 4 x double> undef, undef
+ %V2F64 = fsub <vscale x 2 x double> poison, poison
+ %V4F64 = fsub <vscale x 4 x double> poison, poison
ret void
}
define void @fneg() {
; CHECK-LABEL: 'fneg'
-; CHECK-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %V2F16 = fneg <vscale x 2 x half> undef
-; CHECK-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %V4F16 = fneg <vscale x 4 x half> undef
-; CHECK-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %V8F16 = fneg <vscale x 8 x half> undef
-; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:3 SizeLat:1 for: %V16F16 = fneg <vscale x 16 x half> undef
-; CHECK-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %V2F32 = fneg <vscale x 2 x float> undef
-; CHECK-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %V4F32 = fneg <vscale x 4 x float> undef
-; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:3 SizeLat:1 for: %V8F32 = fneg <vscale x 8 x float> undef
-; CHECK-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %V2F64 = fneg <vscale x 2 x double> undef
-; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:3 SizeLat:1 for: %V4F64 = fneg <vscale x 4 x double> undef
+; CHECK-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %V2F16 = fneg <vscale x 2 x half> poison
+; CHECK-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %V4F16 = fneg <vscale x 4 x half> poison
+; CHECK-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %V8F16 = fneg <vscale x 8 x half> poison
+; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:3 SizeLat:1 for: %V16F16 = fneg <vscale x 16 x half> poison
+; CHECK-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %V2F32 = fneg <vscale x 2 x float> poison
+; CHECK-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %V4F32 = fneg <vscale x 4 x float> poison
+; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:3 SizeLat:1 for: %V8F32 = fneg <vscale x 8 x float> poison
+; CHECK-NEXT: Cost Model: Found costs of RThru:1 CodeSize:1 Lat:3 SizeLat:1 for: %V2F64 = fneg <vscale x 2 x double> poison
+; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:3 SizeLat:1 for: %V4F64 = fneg <vscale x 4 x double> poison
; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
- %V2F16 = fneg <vscale x 2 x half> undef
- %V4F16 = fneg <vscale x 4 x half> undef
- %V8F16 = fneg <vscale x 8 x half> undef
- %V16F16 = fneg <vscale x 16 x half> undef
+ %V2F16 = fneg <vscale x 2 x half> poison
+ %V4F16 = fneg <vscale x 4 x half> poison
+ %V8F16 = fneg <vscale x 8 x half> poison
+ %V16F16 = fneg <vscale x 16 x half> poison
- %V2F32 = fneg <vscale x 2 x float> undef
- %V4F32 = fneg <vscale x 4 x float> undef
- %V8F32 = fneg <vscale x 8 x float> undef
+ %V2F32 = fneg <vscale x 2 x float> poison
+ %V4F32 = fneg <vscale x 4 x float> poison
+ %V8F32 = fneg <vscale x 8 x float> poison
- %V2F64 = fneg <vscale x 2 x double> undef
- %V4F64 = fneg <vscale x 4 x double> undef
+ %V2F64 = fneg <vscale x 2 x double> poison
+ %V4F64 = fneg <vscale x 4 x double> poison
ret void
}
define void @fmul() {
; CHECK-LABEL: 'fmul'
-; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:3 SizeLat:1 for: %V4F16 = fmul <vscale x 4 x half> undef, undef
-; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:3 SizeLat:1 for: %V8F16 = fmul <vscale x 8 x half> undef, undef
-; CHECK-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:3 SizeLat:1 for: %V16F16 = fmul <vscale x 16 x half> undef, undef
-; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:3 SizeLat:1 for: %V2F32 = fmul <vscale x 2 x float> undef, undef
-; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:3 SizeLat:1 for: %V4F32 = fmul <vscale x 4 x float> undef, undef
-; CHECK-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:3 SizeLat:1 for: %V8F32 = fmul <vscale x 8 x float> undef, undef
-; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:3 SizeLat:1 for: %V2F64 = fmul <vscale x 2 x double> undef, undef
-; CHECK-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:3 SizeLat:1 for: %V4F64 = fmul <vscale x 4 x double> undef, undef
+; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:3 SizeLat:1 for: %V4F16 = fmul <vscale x 4 x half> poison, poison
+; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:3 SizeLat:1 for: %V8F16 = fmul <vscale x 8 x half> poison, poison
+; CHECK-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:3 SizeLat:1 for: %V16F16 = fmul <vscale x 16 x half> poison, poison
+; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:3 SizeLat:1 for: %V2F32 = fmul <vscale x 2 x float> poison, poison
+; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:3 SizeLat:1 for: %V4F32 = fmul <vscale x 4 x float> poison, poison
+; CHECK-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:3 SizeLat:1 for: %V8F32 = fmul <vscale x 8 x float> poison, poison
+; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:3 SizeLat:1 for: %V2F64 = fmul <vscale x 2 x double> poison, poison
+; CHECK-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:3 SizeLat:1 for: %V4F64 = fmul <vscale x 4 x double> poison, poison
; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
- %V4F16 = fmul <vscale x 4 x half> undef, undef
- %V8F16 = fmul <vscale x 8 x half> undef, undef
- %V16F16 = fmul <vscale x 16 x half> undef, undef
+ %V4F16 = fmul <vscale x 4 x half> poison, poison
+ %V8F16 = fmul <vscale x 8 x half> poison, poison
+ %V16F16 = fmul <vscale x 16 x half> poison, poison
- %V2F32 = fmul <vscale x 2 x float> undef, undef
- %V4F32 = fmul <vscale x 4 x float> undef, undef
- %V8F32 = fmul <vscale x 8 x float> undef, undef
+ %V2F32 = fmul <vscale x 2 x float> poison, poison
+ %V4F32 = fmul <vscale x 4 x float> poison, poison
+ %V8F32 = fmul <vscale x 8 x float> poison, poison
- %V2F64 = fmul <vscale x 2 x double> undef, undef
- %V4F64 = fmul <vscale x 4 x double> undef, undef
+ %V2F64 = fmul <vscale x 2 x double> poison, poison
+ %V4F64 = fmul <vscale x 4 x double> poison, poison
ret void
}
define void @fdiv() {
; CHECK-LABEL: 'fdiv'
-; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:4 Lat:4 SizeLat:4 for: %V4F16 = fdiv <vscale x 4 x half> undef, undef
-; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:4 Lat:4 SizeLat:4 for: %V8F16 = fdiv <vscale x 8 x half> undef, undef
-; CHECK-NEXT: Cost Model: Found costs of 4 for: %V16F16 = fdiv <vscale x 16 x half> undef, undef
-; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:4 Lat:4 SizeLat:4 for: %V2F32 = fdiv <vscale x 2 x float> undef, undef
-; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:4 Lat:4 SizeLat:4 for: %V4F32 = fdiv <vscale x 4 x float> undef, undef
-; CHECK-NEXT: Cost Model: Found costs of 4 for: %V8F32 = fdiv <vscale x 8 x float> undef, undef
-; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:4 Lat:4 SizeLat:4 for: %V2F64 = fdiv <vscale x 2 x double> undef, undef
-; CHECK-NEXT: Cost Model: Found costs of 4 for: %V4F64 = fdiv <vscale x 4 x double> undef, undef
+; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:4 Lat:4 SizeLat:4 for: %V4F16 = fdiv <vscale x 4 x half> poison, poison
+; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:4 Lat:4 SizeLat:4 for: %V8F16 = fdiv <vscale x 8 x half> poison, poison
+; CHECK-NEXT: Cost Model: Found costs of 4 for: %V16F16 = fdiv <vscale x 16 x half> poison, poison
+; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:4 Lat:4 SizeLat:4 for: %V2F32 = fdiv <vscale x 2 x float> poison, poison
+; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:4 Lat:4 SizeLat:4 for: %V4F32 = fdiv <vscale x 4 x float> poison, poison
+; CHECK-NEXT: Cost Model: Found costs of 4 for: %V8F32 = fdiv <vscale x 8 x float> poison, poison
+; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:4 Lat:4 SizeLat:4 for: %V2F64 = fdiv <vscale x 2 x double> poison, poison
+; CHECK-NEXT: Cost Model: Found costs of 4 for: %V4F64 = fdiv <vscale x 4 x double> poison, poison
; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
- %V4F16 = fdiv <vscale x 4 x half> undef, undef
- %V8F16 = fdiv <vscale x 8 x half> undef, undef
- %V16F16 = fdiv <vscale x 16 x half> undef, undef
+ %V4F16 = fdiv <vscale x 4 x half> poison, poison
+ %V8F16 = fdiv <vscale x 8 x half> poison, poison
+ %V16F16 = fdiv <vscale x 16 x half> poison, poison
- %V2F32 = fdiv <vscale x 2 x float> undef, undef
- %V4F32 = fdiv <vscale x 4 x float> undef, undef
- %V8F32 = fdiv <vscale x 8 x float> undef, undef
+ %V2F32 = fdiv <vscale x 2 x float> poison, poison
+ %V4F32 = fdiv <vscale x 4 x float> poison, poison
+ %V8F32 = fdiv <vscale x 8 x float> poison, poison
- %V2F64 = fdiv <vscale x 2 x double> undef, undef
- %V4F64 = fdiv <vscale x 4 x double> undef, undef
+ %V2F64 = fdiv <vscale x 2 x double> poison, poison
+ %V4F64 = fdiv <vscale x 4 x double> poison, poison
ret void
}
define void @frem() {
; CHECK-LABEL: 'frem'
-; CHECK-NEXT: Cost Model: Found costs of RThru:Invalid CodeSize:4 Lat:4 SizeLat:4 for: %V4F16 = frem <vscale x 4 x half> undef, undef
-; CHECK-NEXT: Cost Model: Found costs of RThru:Invalid CodeSize:4 Lat:4 SizeLat:4 for: %V8F16 = frem <vscale x 8 x half> undef, undef
-; CHECK-NEXT: Cost Model: Found costs of RThru:Invalid CodeSize:4 Lat:4 SizeLat:4 for: %V16F16 = frem <vscale x 16 x half> undef, undef
-; CHECK-NEXT: Cost Model: Found costs of RThru:Invalid CodeSize:4 Lat:4 SizeLat:4 for: %V2F32 = frem <vscale x 2 x float> undef, undef
-; CHECK-NEXT: Cost Model: Found costs of RThru:Invalid CodeSize:4 Lat:4 SizeLat:4 for: %V4F32 = frem <vscale x 4 x float> undef, undef
-; CHECK-NEXT: Cost Model: Found costs of RThru:Invalid CodeSize:4 Lat:4 SizeLat:4 for: %V8F32 = frem <vscale x 8 x float> undef, undef
-; CHECK-NEXT: Cost Model: Found costs of RThru:Invalid CodeSize:4 Lat:4 SizeLat:4 for: %V2F64 = frem <vscale x 2 x double> undef, undef
-; CHECK-NEXT: Cost Model: Found costs of RThru:Invalid CodeSize:4 Lat:4 SizeLat:4 for: %V4F64 = frem <vscale x 4 x double> undef, undef
+; CHECK-NEXT: Cost Model: Found costs of RThru:Invalid CodeSize:4 Lat:4 SizeLat:4 for: %V4F16 = frem <vscale x 4 x half> poison, poison
+; CHECK-NEXT: Cost Model: Found costs of RThru:Invalid CodeSize:4 Lat:4 SizeLat:4 for: %V8F16 = frem <vscale x 8 x half> poison, poison
+; CHECK-NEXT: Cost Model: Found costs of RThru:Invalid CodeSize:4 Lat:4 SizeLat:4 for: %V16F16 = frem <vscale x 16 x half> poison, poison
+; CHECK-NEXT: Cost Model: Found costs of RThru:Invalid CodeSize:4 Lat:4 SizeLat:4 for: %V2F32 = frem <vscale x 2 x float> poison, poison
+; CHECK-NEXT: Cost Model: Found costs of RThru:Invalid CodeSize:4 Lat:4 SizeLat:4 for: %V4F32 = frem <vscale x 4 x float> poison, poison
+; CHECK-NEXT: Cost Model: Found costs of RThru:Invalid CodeSize:4 Lat:4 SizeLat:4 for: %V8F32 = frem <vscale x 8 x float> poison, poison
+; CHECK-NEXT: Cost Model: Found costs of RThru:Invalid CodeSize:4 Lat:4 SizeLat:4 for: %V2F64 = frem <vscale x 2 x double> poison, poison
+; CHECK-NEXT: Cost Model: Found costs of RThru:Invalid CodeSize:4 Lat:4 SizeLat:4 for: %V4F64 = frem <vscale x 4 x double> poison, poison
; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
- %V4F16 = frem <vscale x 4 x half> undef, undef
- %V8F16 = frem <vscale x 8 x half> undef, undef
- %V16F16 = frem <vscale x 16 x half> undef, undef
+ %V4F16 = frem <vscale x 4 x half> poison, poison
+ %V8F16 = frem <vscale x 8 x half> poison, poison
+ %V16F16 = frem <vscale x 16 x half> poison, poison
- %V2F32 = frem <vscale x 2 x float> undef, undef
- %V4F32 = frem <vscale x 4 x float> undef, undef
- %V8F32 = frem <vscale x 8 x float> undef, undef
+ %V2F32 = frem <vscale x 2 x float> poison, poison
+ %V4F32 = frem <vscale x 4 x float> poison, poison
+ %V8F32 = frem <vscale x 8 x float> poison, poison
- %V2F64 = frem <vscale x 2 x double> undef, undef
- %V4F64 = frem <vscale x 4 x double> undef, undef
+ %V2F64 = frem <vscale x 2 x double> poison, poison
+ %V4F64 = frem <vscale x 4 x double> poison, poison
ret void
}
define void @fma() {
; CHECK-LABEL: 'fma'
-; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:3 SizeLat:1 for: %V4F16 = call <vscale x 4 x half> @llvm.fma.nxv4f16(<vscale x 4 x half> undef, <vscale x 4 x half> undef, <vscale x 4 x half> undef)
-; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:3 SizeLat:1 for: %V8F16 = call <vscale x 8 x half> @llvm.fma.nxv8f16(<vscale x 8 x half> undef, <vscale x 8 x half> undef, <vscale x 8 x half> undef)
-; CHECK-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:3 SizeLat:1 for: %V16F16 = call <vscale x 16 x half> @llvm.fma.nxv16f16(<vscale x 16 x half> undef, <vscale x 16 x half> undef, <vscale x 16 x half> undef)
-; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:3 SizeLat:1 for: %V2F32 = call <vscale x 2 x float> @llvm.fma.nxv2f32(<vscale x 2 x float> undef, <vscale x 2 x float> undef, <vscale x 2 x float> undef)
-; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:3 SizeLat:1 for: %V4F32 = call <vscale x 4 x float> @llvm.fma.nxv4f32(<vscale x 4 x float> undef, <vscale x 4 x float> undef, <vscale x 4 x float> undef)
-; CHECK-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:3 SizeLat:1 for: %V8F32 = call <vscale x 8 x float> @llvm.fma.nxv8f32(<vscale x 8 x float> undef, <vscale x 8 x float> undef, <vscale x 8 x float> undef)
-; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:3 SizeLat:1 for: %V2F64 = call <vscale x 2 x double> @llvm.fma.nxv2f64(<vscale x 2 x double> undef, <vscale x 2 x double> undef, <vscale x 2 x double> undef)
-; CHECK-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:3 SizeLat:1 for: %V4F64 = call <vscale x 4 x double> @llvm.fma.nxv4f64(<vscale x 4 x double> undef, <vscale x 4 x double> undef, <vscale x 4 x double> undef)
+; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:3 SizeLat:1 for: %V4F16 = call <vscale x 4 x half> @llvm.fma.nxv4f16(<vscale x 4 x half> poison, <vscale x 4 x half> poison, <vscale x 4 x half> poison)
+; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:3 SizeLat:1 for: %V8F16 = call <vscale x 8 x half> @llvm.fma.nxv8f16(<vscale x 8 x half> poison, <vscale x 8 x half> poison, <vscale x 8 x half> poison)
+; CHECK-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:3 SizeLat:1 for: %V16F16 = call <vscale x 16 x half> @llvm.fma.nxv16f16(<vscale x 16 x half> poison, <vscale x 16 x half> poison, <vscale x 16 x half> poison)
+; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:3 SizeLat:1 for: %V2F32 = call <vscale x 2 x float> @llvm.fma.nxv2f32(<vscale x 2 x float> poison, <vscale x 2 x float> poison, <vscale x 2 x float> poison)
+; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:3 SizeLat:1 for: %V4F32 = call <vscale x 4 x float> @llvm.fma.nxv4f32(<vscale x 4 x float> poison, <vscale x 4 x float> poison, <vscale x 4 x float> poison)
+; CHECK-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:3 SizeLat:1 for: %V8F32 = call <vscale x 8 x float> @llvm.fma.nxv8f32(<vscale x 8 x float> poison, <vscale x 8 x float> poison, <vscale x 8 x float> poison)
+; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:3 SizeLat:1 for: %V2F64 = call <vscale x 2 x double> @llvm.fma.nxv2f64(<vscale x 2 x double> poison, <vscale x 2 x double> poison, <vscale x 2 x double> poison)
+; CHECK-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:3 SizeLat:1 for: %V4F64 = call <vscale x 4 x double> @llvm.fma.nxv4f64(<vscale x 4 x double> poison, <vscale x 4 x double> poison, <vscale x 4 x double> poison)
; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
- %V4F16 = call <vscale x 4 x half> @llvm.fma.v4f16(<vscale x 4 x half> undef, <vscale x 4 x half> undef, <vscale x 4 x half> undef)
- %V8F16 = call <vscale x 8 x half> @llvm.fma.v8f16(<vscale x 8 x half> undef, <vscale x 8 x half> undef, <vscale x 8 x half> undef)
- %V16F16 = call <vscale x 16 x half> @llvm.fma.v16f16(<vscale x 16 x half> undef, <vscale x 16 x half> undef, <vscale x 16 x half> undef)
+ %V4F16 = call <vscale x 4 x half> @llvm.fma.v4f16(<vscale x 4 x half> poison, <vscale x 4 x half> poison, <vscale x 4 x half> poison)
+ %V8F16 = call <vscale x 8 x half> @llvm.fma.v8f16(<vscale x 8 x half> poison, <vscale x 8 x half> poison, <vscale x 8 x half> poison)
+ %V16F16 = call <vscale x 16 x half> @llvm.fma.v16f16(<vscale x 16 x half> poison, <vscale x 16 x half> poison, <vscale x 16 x half> poison)
- %V2F32 = call <vscale x 2 x float> @llvm.fma.v2f32(<vscale x 2 x float> undef, <vscale x 2 x float> undef, <vscale x 2 x float> undef)
- %V4F32 = call <vscale x 4 x float> @llvm.fma.v4f32(<vscale x 4 x float> undef, <vscale x 4 x float> undef, <vscale x 4 x float> undef)
- %V8F32 = call <vscale x 8 x float> @llvm.fma.v8f32(<vscale x 8 x float> undef, <vscale x 8 x float> undef, <vscale x 8 x float> undef)
+ %V2F32 = call <vscale x 2 x float> @llvm.fma.v2f32(<vscale x 2 x float> poison, <vscale x 2 x float> poison, <vscale x 2 x float> poison)
+ %V4F32 = call <vscale x 4 x float> @llvm.fma.v4f32(<vscale x 4 x float> poison, <vscale x 4 x float> poison, <vscale x 4 x float> poison)
+ %V8F32 = call <vscale x 8 x float> @llvm.fma.v8f32(<vscale x 8 x float> poison, <vscale x 8 x float> poison, <vscale x 8 x float> poison)
- %V2F64 = call <vscale x 2 x double> @llvm.fma.v2f64(<vscale x 2 x double> undef, <vscale x 2 x double> undef, <vscale x 2 x double> undef)
- %V4F64 = call <vscale x 4 x double> @llvm.fma.v4f64(<vscale x 4 x double> undef, <vscale x 4 x double> undef, <vscale x 4 x double> undef)
+ %V2F64 = call <vscale x 2 x double> @llvm.fma.v2f64(<vscale x 2 x double> poison, <vscale x 2 x double> poison, <vscale x 2 x double> poison)
+ %V4F64 = call <vscale x 4 x double> @llvm.fma.v4f64(<vscale x 4 x double> poison, <vscale x 4 x double> poison, <vscale x 4 x double> poison)
ret void
}
define void @fmuladd() {
; CHECK-LABEL: 'fmuladd'
-; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:3 SizeLat:1 for: %V4F16 = call <vscale x 4 x half> @llvm.fmuladd.nxv4f16(<vscale x 4 x half> undef, <vscale x 4 x half> undef, <vscale x 4 x half> undef)
-; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:3 SizeLat:1 for: %V8F16 = call <vscale x 8 x half> @llvm.fmuladd.nxv8f16(<vscale x 8 x half> undef, <vscale x 8 x half> undef, <vscale x 8 x half> undef)
-; CHECK-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:3 SizeLat:1 for: %V16F16 = call <vscale x 16 x half> @llvm.fmuladd.nxv16f16(<vscale x 16 x half> undef, <vscale x 16 x half> undef, <vscale x 16 x half> undef)
-; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:3 SizeLat:1 for: %V2F32 = call <vscale x 2 x float> @llvm.fmuladd.nxv2f32(<vscale x 2 x float> undef, <vscale x 2 x float> undef, <vscale x 2 x float> undef)
-; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:3 SizeLat:1 for: %V4F32 = call <vscale x 4 x float> @llvm.fmuladd.nxv4f32(<vscale x 4 x float> undef, <vscale x 4 x float> undef, <vscale x 4 x float> undef)
-; CHECK-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:3 SizeLat:1 for: %V8F32 = call <vscale x 8 x float> @llvm.fmuladd.nxv8f32(<vscale x 8 x float> undef, <vscale x 8 x float> undef, <vscale x 8 x float> undef)
-; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:3 SizeLat:1 for: %V2F64 = call <vscale x 2 x double> @llvm.fmuladd.nxv2f64(<vscale x 2 x double> undef, <vscale x 2 x double> undef, <vscale x 2 x double> undef)
-; CHECK-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:3 SizeLat:1 for: %V4F64 = call <vscale x 4 x double> @llvm.fmuladd.nxv4f64(<vscale x 4 x double> undef, <vscale x 4 x double> undef, <vscale x 4 x double> undef)
+; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:3 SizeLat:1 for: %V4F16 = call <vscale x 4 x half> @llvm.fmuladd.nxv4f16(<vscale x 4 x half> poison, <vscale x 4 x half> poison, <vscale x 4 x half> poison)
+; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:3 SizeLat:1 for: %V8F16 = call <vscale x 8 x half> @llvm.fmuladd.nxv8f16(<vscale x 8 x half> poison, <vscale x 8 x half> poison, <vscale x 8 x half> poison)
+; CHECK-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:3 SizeLat:1 for: %V16F16 = call <vscale x 16 x half> @llvm.fmuladd.nxv16f16(<vscale x 16 x half> poison, <vscale x 16 x half> poison, <vscale x 16 x half> poison)
+; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:3 SizeLat:1 for: %V2F32 = call <vscale x 2 x float> @llvm.fmuladd.nxv2f32(<vscale x 2 x float> poison, <vscale x 2 x float> poison, <vscale x 2 x float> poison)
+; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:3 SizeLat:1 for: %V4F32 = call <vscale x 4 x float> @llvm.fmuladd.nxv4f32(<vscale x 4 x float> poison, <vscale x 4 x float> poison, <vscale x 4 x float> poison)
+; CHECK-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:3 SizeLat:1 for: %V8F32 = call <vscale x 8 x float> @llvm.fmuladd.nxv8f32(<vscale x 8 x float> poison, <vscale x 8 x float> poison, <vscale x 8 x float> poison)
+; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:1 Lat:3 SizeLat:1 for: %V2F64 = call <vscale x 2 x double> @llvm.fmuladd.nxv2f64(<vscale x 2 x double> poison, <vscale x 2 x double> poison, <vscale x 2 x double> poison)
+; CHECK-NEXT: Cost Model: Found costs of RThru:4 CodeSize:1 Lat:3 SizeLat:1 for: %V4F64 = call <vscale x 4 x double> @llvm.fmuladd.nxv4f64(<vscale x 4 x double> poison, <vscale x 4 x double> poison, <vscale x 4 x double> poison)
; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
;
- %V4F16 = call <vscale x 4 x half> @llvm.fmuladd.v4f16(<vscale x 4 x half> undef, <vscale x 4 x half> undef, <vscale x 4 x half> undef)
- %V8F16 = call <vscale x 8 x half> @llvm.fmuladd.v8f16(<vscale x 8 x half> undef, <vscale x 8 x half> undef, <vscale x 8 x half> undef)
- %V16F16 = call <vscale x 16 x half> @llvm.fmuladd.v16f16(<vscale x 16 x half> undef, <vscale x 16 x half> undef, <vscale x 16 x half> undef)
+ %V4F16 = call <vscale x 4 x half> @llvm.fmuladd.v4f16(<vscale x 4 x half> poison, <vscale x 4 x half> poison, <vscale x 4 x half> poison)
+ %V8F16 = call <vscale x 8 x half> @llvm.fmuladd.v8f16(<vscale x 8 x half> poison, <vscale x 8 x half> poison, <vscale x 8 x half> poison)
+ %V16F16 = call <vscale x 16 x half> @llvm.fmuladd.v16f16(<vscale x 16 x half> poison, <vscale x 16 x half> poison, <vscale x 16 x half> poison)
- %V2F32 = call <vscale x 2 x float> @llvm.fmuladd.v2f32(<vscale x 2 x float> undef, <vscale x 2 x float> undef, <vscale x 2 x float> undef)
- %V4F32 = call <vscale x 4 x float> @llvm.fmuladd.v4f32(<vscale x 4 x float> undef, <vscale x 4 x float> undef, <vscale x 4 x float> undef)
- %V8F32 = call <vscale x 8 x float> @llvm.fmuladd.v8f32(<vscale x 8 x float> undef, <vscale x 8 x float> undef, <vscale x 8 x float> undef)
+ %V2F32 = call <vscale x 2 x float> @llvm.fmuladd.v2f32(<vscale x 2 x float> poison, <vscale x 2 x float> poison, <vscale x 2 x float> poison)
+ %V4F32 = call <vscale x 4 x float> @llvm.fmuladd.v4f32(<vscale x 4 x float> poison, <vscale x 4 x float> poison, <vscale x 4 x float> poison)
+ %V8F32 = call <vscale x 8 x float> @llvm.fmuladd.v8f32(<vscale x 8 x float> poison, <vscale x 8 x float> poison, <vscale x 8 x float> poison)
- %V2F64 = call <vscale x 2 x double> @llvm.fmuladd.v2f64(<vscale x 2 x double> undef, <vscale x 2 x double> undef, <vscale x 2 x double> undef)
- %V4F64 = call <vscale x 4 x double> @llvm.fmuladd.v4f64(<vscale x 4 x double> undef, <vscale x 4 x double> undef, <vscale x 4 x double> undef)
+ %V2F64 = call <vscale x 2 x double> @llvm.fmuladd.v2f64(<vscale x 2 x double> poison, <vscale x 2 x double> poison, <vscale x 2 x double> poison)
+ %V4F64 = call <vscale x 4 x double> @llvm.fmuladd.v4f64(<vscale x 4 x double> poison, <vscale x 4 x double> poison, <vscale x 4 x double> poison)
ret void
}
diff --git a/llvm/test/Analysis/DependenceAnalysis/monotonicity-no-wrap-flags.ll b/llvm/test/Analysis/DependenceAnalysis/monotonicity-no-wrap-flags.ll
index 7411dc9..df42c75 100644
--- a/llvm/test/Analysis/DependenceAnalysis/monotonicity-no-wrap-flags.ll
+++ b/llvm/test/Analysis/DependenceAnalysis/monotonicity-no-wrap-flags.ll
@@ -298,7 +298,8 @@ exit:
}
; The value of the step recurrence is not invariant with respect to the outermost
-; loop (the i-loop).
+; loop (the i-loop). It is theoretically multivariate monotonic by definition,
+; but we cannot handle non-affine addrec for now.
;
; offset_i = 0;
; for (int i = 0; i < 100; i++) {
@@ -312,7 +313,8 @@ define void @step_is_variant(ptr %a) {
; CHECK-NEXT: Monotonicity check:
; CHECK-NEXT: Inst: store i8 0, ptr %idx, align 1
; CHECK-NEXT: Expr: {%offset.i,+,1}<nuw><nsw><%loop.j>
-; CHECK-NEXT: Monotonicity: MultivariateSignedMonotonic
+; CHECK-NEXT: Monotonicity: Unknown
+; CHECK-NEXT: Reason: %offset.i
; CHECK-EMPTY:
; CHECK-NEXT: Src: store i8 0, ptr %idx, align 1 --> Dst: store i8 0, ptr %idx, align 1
; CHECK-NEXT: da analyze - confused!
@@ -346,6 +348,56 @@ exit:
ret void
}
+; The value of the step recurrence is not invariant with respect to the outermost
+; loop (the i-loop); in fact, `offset_i` is not monotonic (it goes 0, -1, 2, 1, 4, 3, ...).
+;
+; offset_i = 0;
+; for (int i = 0; i < 100; i++) {
+; for (int j = 0; j < 100; j++)
+; a[offset_i + j] = 0;
+; offset_i += (i % 2 == 0) ? -1 : 3;
+; }
+;
+define void @step_is_variant2(ptr %a) {
+; CHECK-LABEL: 'step_is_variant2'
+; CHECK-NEXT: Monotonicity check:
+; CHECK-NEXT: Inst: store i8 0, ptr %idx, align 1
+; CHECK-NEXT: Expr: {%offset.i,+,1}<nsw><%loop.j>
+; CHECK-NEXT: Monotonicity: Unknown
+; CHECK-NEXT: Reason: %offset.i
+; CHECK-EMPTY:
+; CHECK-NEXT: Src: store i8 0, ptr %idx, align 1 --> Dst: store i8 0, ptr %idx, align 1
+; CHECK-NEXT: da analyze - confused!
+;
+entry:
+ br label %loop.i.header
+
+loop.i.header:
+ %i = phi i64 [ 0, %entry ], [ %i.inc, %loop.i.latch ]
+ %offset.i = phi i64 [ 0, %entry ], [ %offset.i.next, %loop.i.latch ]
+ %step.i.0 = phi i64 [ -1, %entry ], [ %step.i.1, %loop.i.latch ]
+ %step.i.1 = phi i64 [ 3, %entry ], [ %step.i.0, %loop.i.latch ]
+ br label %loop.j
+
+loop.j:
+ %j = phi i64 [ 0, %loop.i.header ], [ %j.inc, %loop.j ]
+ %offset = add nsw i64 %offset.i, %j
+ %idx = getelementptr inbounds i8, ptr %a, i64 %offset
+ store i8 0, ptr %idx
+ %j.inc = add nsw i64 %j, 1
+ %exitcond.j = icmp eq i64 %j.inc, 100
+ br i1 %exitcond.j, label %loop.i.latch, label %loop.j
+
+loop.i.latch:
+ %i.inc = add nsw i64 %i, 1
+ %offset.i.next = add nsw i64 %offset.i, %step.i.0
+ %exitcond.i = icmp eq i64 %i.inc, 100
+ br i1 %exitcond.i, label %exit, label %loop.i.header
+
+exit:
+ ret void
+}
+
; The AddRec doesn't have nsw flag for the j-loop, since the store may not be
; executed.
;
diff --git a/llvm/test/CMakeLists.txt b/llvm/test/CMakeLists.txt
index f01422e..e547c34 100644
--- a/llvm/test/CMakeLists.txt
+++ b/llvm/test/CMakeLists.txt
@@ -30,6 +30,7 @@ llvm_canonicalize_cmake_booleans(
LLVM_INCLUDE_SPIRV_TOOLS_TESTS
LLVM_APPEND_VC_REV
LLVM_HAS_LOGF128
+ LLVM_ENABLE_ONDISK_CAS
)
configure_lit_site_cfg(
@@ -81,6 +82,7 @@ set(LLVM_TEST_DEPENDS
llvm-bcanalyzer
llvm-bitcode-strip
llvm-c-test
+ llvm-cas
llvm-cat
llvm-cfi-verify
llvm-cgdata
diff --git a/llvm/test/CodeGen/AArch64/aarch64-matmul-fp16.ll b/llvm/test/CodeGen/AArch64/aarch64-matmul-fp16.ll
new file mode 100644
index 0000000..8d1abdd
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/aarch64-matmul-fp16.ll
@@ -0,0 +1,14 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
+; RUN: llc -mtriple aarch64-none-linux-gnu -mattr=+neon,+f8f16mm < %s | FileCheck %s
+; RUN: llc -mtriple aarch64-none-linux-gnu -mattr=+neon,+f8f16mm -global-isel < %s | FileCheck %s
+
+define <8 x half> @fmmla.v8f16.v16i8(<8 x half> %r, <16 x i8> %a, <16 x i8> %b) {
+; CHECK-LABEL: fmmla.v8f16.v16i8:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: fmmla v0.8h, v1.16b, v2.16b
+; CHECK-NEXT: ret
+entry:
+ %vfmmla1.i = tail call <8 x half> @llvm.aarch64.neon.fmmla.v8f16.v16i8(<8 x half> %r, <16 x i8> %a, <16 x i8> %b) #3
+ ret <8 x half> %vfmmla1.i
+}
+
diff --git a/llvm/test/CodeGen/AArch64/aarch64-matmul-fp32.ll b/llvm/test/CodeGen/AArch64/aarch64-matmul-fp32.ll
new file mode 100644
index 0000000..4c33567
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/aarch64-matmul-fp32.ll
@@ -0,0 +1,13 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
+; RUN: llc -mtriple aarch64-none-linux-gnu -mattr=+neon,+f8f32mm < %s | FileCheck %s
+; RUN: llc -mtriple aarch64-none-linux-gnu -mattr=+neon,+f8f32mm -global-isel < %s | FileCheck %s
+
+define <4 x float> @fmmla.v4f32.v16i8(<4 x float> %r, <16 x i8> %a, <16 x i8> %b) {
+; CHECK-LABEL: fmmla.v4f32.v16i8:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: fmmla v0.4s, v1.16b, v2.16b
+; CHECK-NEXT: ret
+entry:
+ %vfmmla1.i = tail call <4 x float> @llvm.aarch64.neon.fmmla.v4f32.v16i8(<4 x float> %r, <16 x i8> %a, <16 x i8> %b) #3
+ ret <4 x float> %vfmmla1.i
+}
diff --git a/llvm/test/CodeGen/AArch64/sve2p1-fdot.ll b/llvm/test/CodeGen/AArch64/sve2p1-fdot.ll
new file mode 100644
index 0000000..9dbe096
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/sve2p1-fdot.ll
@@ -0,0 +1,93 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
+; RUN: llc -mattr=+sve2 < %s | FileCheck %s --check-prefixes=CHECK,SVE2
+; RUN: llc -global-isel -global-isel-abort=2 -mattr=+sve2 < %s | FileCheck %s --check-prefixes=CHECK,SVE2
+; RUN: llc -mattr=+sve2p1 < %s | FileCheck %s --check-prefixes=CHECK,SVE2P1
+; RUN: llc -global-isel -global-isel-abort=2 -mattr=+sve2p1 < %s | FileCheck %s --check-prefixes=CHECK,SVE2P1
+
+target triple = "aarch64-linux-gnu"
+
+define <vscale x 4 x float> @fdot_wide_nxv4f32(<vscale x 4 x float> %acc, <vscale x 8 x half> %a, <vscale x 8 x half> %b) {
+; SVE2-LABEL: fdot_wide_nxv4f32:
+; SVE2: // %bb.0: // %entry
+; SVE2-NEXT: uunpklo z3.s, z1.h
+; SVE2-NEXT: uunpklo z4.s, z2.h
+; SVE2-NEXT: ptrue p0.s
+; SVE2-NEXT: uunpkhi z1.s, z1.h
+; SVE2-NEXT: uunpkhi z2.s, z2.h
+; SVE2-NEXT: fcvt z3.s, p0/m, z3.h
+; SVE2-NEXT: fcvt z4.s, p0/m, z4.h
+; SVE2-NEXT: fcvt z1.s, p0/m, z1.h
+; SVE2-NEXT: fcvt z2.s, p0/m, z2.h
+; SVE2-NEXT: fmul z3.s, z3.s, z4.s
+; SVE2-NEXT: fmul z1.s, z1.s, z2.s
+; SVE2-NEXT: fadd z0.s, z0.s, z3.s
+; SVE2-NEXT: fadd z0.s, z0.s, z1.s
+; SVE2-NEXT: ret
+;
+; SVE2P1-LABEL: fdot_wide_nxv4f32:
+; SVE2P1: // %bb.0: // %entry
+; SVE2P1-NEXT: fdot z0.s, z1.h, z2.h
+; SVE2P1-NEXT: ret
+entry:
+ %a.wide = fpext <vscale x 8 x half> %a to <vscale x 8 x float>
+ %b.wide = fpext <vscale x 8 x half> %b to <vscale x 8 x float>
+ %mult = fmul <vscale x 8 x float> %a.wide, %b.wide
+ %partial.reduce = call <vscale x 4 x float> @llvm.vector.partial.reduce.fadd(<vscale x 4 x float> %acc, <vscale x 8 x float> %mult)
+ ret <vscale x 4 x float> %partial.reduce
+}
+
+define <vscale x 4 x float> @fdot_splat_nxv4f32(<vscale x 4 x float> %acc, <vscale x 8 x half> %a) {
+; SVE2-LABEL: fdot_splat_nxv4f32:
+; SVE2: // %bb.0: // %entry
+; SVE2-NEXT: uunpklo z2.s, z1.h
+; SVE2-NEXT: ptrue p0.s
+; SVE2-NEXT: uunpkhi z1.s, z1.h
+; SVE2-NEXT: fcvt z2.s, p0/m, z2.h
+; SVE2-NEXT: fcvt z1.s, p0/m, z1.h
+; SVE2-NEXT: fadd z0.s, z0.s, z2.s
+; SVE2-NEXT: fadd z0.s, z0.s, z1.s
+; SVE2-NEXT: ret
+;
+; SVE2P1-LABEL: fdot_splat_nxv4f32:
+; SVE2P1: // %bb.0: // %entry
+; SVE2P1-NEXT: fmov z2.h, #1.00000000
+; SVE2P1-NEXT: fdot z0.s, z1.h, z2.h
+; SVE2P1-NEXT: ret
+entry:
+ %a.wide = fpext <vscale x 8 x half> %a to <vscale x 8 x float>
+ %partial.reduce = call <vscale x 4 x float> @llvm.vector.partial.reduce.fadd(<vscale x 4 x float> %acc, <vscale x 8 x float> %a.wide)
+ ret <vscale x 4 x float> %partial.reduce
+}
+
+define <vscale x 8 x half> @partial_reduce_nxv8f16(<vscale x 8 x half> %acc, <vscale x 16 x half> %a) {
+; CHECK-LABEL: partial_reduce_nxv8f16:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: fadd z0.h, z0.h, z1.h
+; CHECK-NEXT: fadd z0.h, z0.h, z2.h
+; CHECK-NEXT: ret
+entry:
+ %partial.reduce = call <vscale x 8 x half> @llvm.vector.partial.reduce.fadd(<vscale x 8 x half> %acc, <vscale x 16 x half> %a)
+ ret <vscale x 8 x half> %partial.reduce
+}
+
+define <vscale x 4 x float> @partial_reduce_nxv4f32(<vscale x 4 x float> %acc, <vscale x 8 x float> %a) {
+; CHECK-LABEL: partial_reduce_nxv4f32:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: fadd z0.s, z0.s, z1.s
+; CHECK-NEXT: fadd z0.s, z0.s, z2.s
+; CHECK-NEXT: ret
+entry:
+ %partial.reduce = call <vscale x 4 x float> @llvm.vector.partial.reduce.fadd(<vscale x 4 x float> %acc, <vscale x 8 x float> %a)
+ ret <vscale x 4 x float> %partial.reduce
+}
+
+define <vscale x 2 x double> @partial_reduce_nxv2f64(<vscale x 2 x double> %acc, <vscale x 4 x double> %a) {
+; CHECK-LABEL: partial_reduce_nxv2f64:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: fadd z0.d, z0.d, z1.d
+; CHECK-NEXT: fadd z0.d, z0.d, z2.d
+; CHECK-NEXT: ret
+entry:
+ %partial.reduce = call <vscale x 2 x double> @llvm.vector.partial.reduce.fadd(<vscale x 2 x double> %acc, <vscale x 4 x double> %a)
+ ret <vscale x 2 x double> %partial.reduce
+}
diff --git a/llvm/test/CodeGen/AArch64/sve2p1-fixed-length-fdot.ll b/llvm/test/CodeGen/AArch64/sve2p1-fixed-length-fdot.ll
new file mode 100644
index 0000000..89216ce
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/sve2p1-fixed-length-fdot.ll
@@ -0,0 +1,230 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
+; RUN: llc -mattr=+sve2 < %s | FileCheck %s --check-prefixes=CHECK,SVE2
+; RUN: llc -mattr=+sve2p1 < %s | FileCheck %s --check-prefixes=CHECK,SVE2P1
+
+target triple = "aarch64-linux-gnu"
+
+define void @fdot_wide_v8f32(ptr %accptr, ptr %aptr, ptr %bptr) vscale_range(2,0) {
+; SVE2-LABEL: fdot_wide_v8f32:
+; SVE2: // %bb.0: // %entry
+; SVE2-NEXT: ptrue p0.s, vl8
+; SVE2-NEXT: mov x8, #8 // =0x8
+; SVE2-NEXT: ld1h { z0.s }, p0/z, [x1]
+; SVE2-NEXT: ld1h { z1.s }, p0/z, [x2]
+; SVE2-NEXT: ld1h { z2.s }, p0/z, [x1, x8, lsl #1]
+; SVE2-NEXT: ld1h { z3.s }, p0/z, [x2, x8, lsl #1]
+; SVE2-NEXT: fcvt z0.s, p0/m, z0.h
+; SVE2-NEXT: fcvt z1.s, p0/m, z1.h
+; SVE2-NEXT: fcvt z2.s, p0/m, z2.h
+; SVE2-NEXT: fcvt z3.s, p0/m, z3.h
+; SVE2-NEXT: fmul z0.s, p0/m, z0.s, z1.s
+; SVE2-NEXT: ld1w { z1.s }, p0/z, [x0]
+; SVE2-NEXT: fmul z2.s, p0/m, z2.s, z3.s
+; SVE2-NEXT: fadd z0.s, p0/m, z0.s, z1.s
+; SVE2-NEXT: fadd z0.s, p0/m, z0.s, z2.s
+; SVE2-NEXT: st1w { z0.s }, p0, [x0]
+; SVE2-NEXT: ret
+;
+; SVE2P1-LABEL: fdot_wide_v8f32:
+; SVE2P1: // %bb.0: // %entry
+; SVE2P1-NEXT: ptrue p0.s, vl8
+; SVE2P1-NEXT: ptrue p1.h, vl16
+; SVE2P1-NEXT: ld1w { z0.s }, p0/z, [x0]
+; SVE2P1-NEXT: ld1h { z1.h }, p1/z, [x1]
+; SVE2P1-NEXT: ld1h { z2.h }, p1/z, [x2]
+; SVE2P1-NEXT: fdot z0.s, z1.h, z2.h
+; SVE2P1-NEXT: st1w { z0.s }, p0, [x0]
+; SVE2P1-NEXT: ret
+entry:
+ %acc = load <8 x float>, ptr %accptr
+ %a = load <16 x half>, ptr %aptr
+ %b = load <16 x half>, ptr %bptr
+ %a.wide = fpext <16 x half> %a to <16 x float>
+ %b.wide = fpext <16 x half> %b to <16 x float>
+ %mult = fmul <16 x float> %a.wide, %b.wide
+ %partial.reduce = call <8 x float> @llvm.vector.partial.reduce.fadd(<8 x float> %acc, <16 x float> %mult)
+ store <8 x float> %partial.reduce, ptr %accptr
+ ret void
+}
+
+define void @fdot_wide_v16f32(ptr %accptr, ptr %aptr, ptr %bptr) vscale_range(4,0) {
+; SVE2-LABEL: fdot_wide_v16f32:
+; SVE2: // %bb.0: // %entry
+; SVE2-NEXT: ptrue p0.s, vl16
+; SVE2-NEXT: mov x8, #16 // =0x10
+; SVE2-NEXT: ld1h { z0.s }, p0/z, [x1]
+; SVE2-NEXT: ld1h { z1.s }, p0/z, [x2]
+; SVE2-NEXT: ld1h { z2.s }, p0/z, [x1, x8, lsl #1]
+; SVE2-NEXT: ld1h { z3.s }, p0/z, [x2, x8, lsl #1]
+; SVE2-NEXT: fcvt z0.s, p0/m, z0.h
+; SVE2-NEXT: fcvt z1.s, p0/m, z1.h
+; SVE2-NEXT: fcvt z2.s, p0/m, z2.h
+; SVE2-NEXT: fcvt z3.s, p0/m, z3.h
+; SVE2-NEXT: fmul z0.s, p0/m, z0.s, z1.s
+; SVE2-NEXT: ld1w { z1.s }, p0/z, [x0]
+; SVE2-NEXT: fmul z2.s, p0/m, z2.s, z3.s
+; SVE2-NEXT: fadd z0.s, p0/m, z0.s, z1.s
+; SVE2-NEXT: fadd z0.s, p0/m, z0.s, z2.s
+; SVE2-NEXT: st1w { z0.s }, p0, [x0]
+; SVE2-NEXT: ret
+;
+; SVE2P1-LABEL: fdot_wide_v16f32:
+; SVE2P1: // %bb.0: // %entry
+; SVE2P1-NEXT: ptrue p0.s, vl16
+; SVE2P1-NEXT: ptrue p1.h, vl32
+; SVE2P1-NEXT: ld1w { z0.s }, p0/z, [x0]
+; SVE2P1-NEXT: ld1h { z1.h }, p1/z, [x1]
+; SVE2P1-NEXT: ld1h { z2.h }, p1/z, [x2]
+; SVE2P1-NEXT: fdot z0.s, z1.h, z2.h
+; SVE2P1-NEXT: st1w { z0.s }, p0, [x0]
+; SVE2P1-NEXT: ret
+entry:
+ %acc = load <16 x float>, ptr %accptr
+ %a = load <32 x half>, ptr %aptr
+ %b = load <32 x half>, ptr %bptr
+ %a.wide = fpext <32 x half> %a to <32 x float>
+ %b.wide = fpext <32 x half> %b to <32 x float>
+ %mult = fmul <32 x float> %a.wide, %b.wide
+ %partial.reduce = call <16 x float> @llvm.vector.partial.reduce.fadd(<16 x float> %acc, <32 x float> %mult)
+ store <16 x float> %partial.reduce, ptr %accptr
+ ret void
+}
+
+define void @fdot_wide_v32f32(ptr %accptr, ptr %aptr, ptr %bptr) vscale_range(8,0) {
+; SVE2-LABEL: fdot_wide_v32f32:
+; SVE2: // %bb.0: // %entry
+; SVE2-NEXT: ptrue p0.s, vl32
+; SVE2-NEXT: mov x8, #32 // =0x20
+; SVE2-NEXT: ld1h { z0.s }, p0/z, [x1]
+; SVE2-NEXT: ld1h { z1.s }, p0/z, [x2]
+; SVE2-NEXT: ld1h { z2.s }, p0/z, [x1, x8, lsl #1]
+; SVE2-NEXT: ld1h { z3.s }, p0/z, [x2, x8, lsl #1]
+; SVE2-NEXT: fcvt z0.s, p0/m, z0.h
+; SVE2-NEXT: fcvt z1.s, p0/m, z1.h
+; SVE2-NEXT: fcvt z2.s, p0/m, z2.h
+; SVE2-NEXT: fcvt z3.s, p0/m, z3.h
+; SVE2-NEXT: fmul z0.s, p0/m, z0.s, z1.s
+; SVE2-NEXT: ld1w { z1.s }, p0/z, [x0]
+; SVE2-NEXT: fmul z2.s, p0/m, z2.s, z3.s
+; SVE2-NEXT: fadd z0.s, p0/m, z0.s, z1.s
+; SVE2-NEXT: fadd z0.s, p0/m, z0.s, z2.s
+; SVE2-NEXT: st1w { z0.s }, p0, [x0]
+; SVE2-NEXT: ret
+;
+; SVE2P1-LABEL: fdot_wide_v32f32:
+; SVE2P1: // %bb.0: // %entry
+; SVE2P1-NEXT: ptrue p0.s, vl32
+; SVE2P1-NEXT: ptrue p1.h, vl64
+; SVE2P1-NEXT: ld1w { z0.s }, p0/z, [x0]
+; SVE2P1-NEXT: ld1h { z1.h }, p1/z, [x1]
+; SVE2P1-NEXT: ld1h { z2.h }, p1/z, [x2]
+; SVE2P1-NEXT: fdot z0.s, z1.h, z2.h
+; SVE2P1-NEXT: st1w { z0.s }, p0, [x0]
+; SVE2P1-NEXT: ret
+entry:
+ %acc = load <32 x float>, ptr %accptr
+ %a = load <64 x half>, ptr %aptr
+ %b = load <64 x half>, ptr %bptr
+ %a.wide = fpext <64 x half> %a to <64 x float>
+ %b.wide = fpext <64 x half> %b to <64 x float>
+ %mult = fmul <64 x float> %a.wide, %b.wide
+ %partial.reduce = call <32 x float> @llvm.vector.partial.reduce.fadd(<32 x float> %acc, <64 x float> %mult)
+ store <32 x float> %partial.reduce, ptr %accptr
+ ret void
+}
+
+define void @fdot_wide_v64f32(ptr %accptr, ptr %aptr, ptr %bptr) vscale_range(16,0) {
+; SVE2-LABEL: fdot_wide_v64f32:
+; SVE2: // %bb.0: // %entry
+; SVE2-NEXT: ptrue p0.s, vl64
+; SVE2-NEXT: mov x8, #64 // =0x40
+; SVE2-NEXT: ld1h { z0.s }, p0/z, [x1]
+; SVE2-NEXT: ld1h { z1.s }, p0/z, [x2]
+; SVE2-NEXT: ld1h { z2.s }, p0/z, [x1, x8, lsl #1]
+; SVE2-NEXT: ld1h { z3.s }, p0/z, [x2, x8, lsl #1]
+; SVE2-NEXT: fcvt z0.s, p0/m, z0.h
+; SVE2-NEXT: fcvt z1.s, p0/m, z1.h
+; SVE2-NEXT: fcvt z2.s, p0/m, z2.h
+; SVE2-NEXT: fcvt z3.s, p0/m, z3.h
+; SVE2-NEXT: fmul z0.s, p0/m, z0.s, z1.s
+; SVE2-NEXT: ld1w { z1.s }, p0/z, [x0]
+; SVE2-NEXT: fmul z2.s, p0/m, z2.s, z3.s
+; SVE2-NEXT: fadd z0.s, p0/m, z0.s, z1.s
+; SVE2-NEXT: fadd z0.s, p0/m, z0.s, z2.s
+; SVE2-NEXT: st1w { z0.s }, p0, [x0]
+; SVE2-NEXT: ret
+;
+; SVE2P1-LABEL: fdot_wide_v64f32:
+; SVE2P1: // %bb.0: // %entry
+; SVE2P1-NEXT: ptrue p0.s, vl64
+; SVE2P1-NEXT: ptrue p1.h, vl128
+; SVE2P1-NEXT: ld1w { z0.s }, p0/z, [x0]
+; SVE2P1-NEXT: ld1h { z1.h }, p1/z, [x1]
+; SVE2P1-NEXT: ld1h { z2.h }, p1/z, [x2]
+; SVE2P1-NEXT: fdot z0.s, z1.h, z2.h
+; SVE2P1-NEXT: st1w { z0.s }, p0, [x0]
+; SVE2P1-NEXT: ret
+entry:
+ %acc = load <64 x float>, ptr %accptr
+ %a = load <128 x half>, ptr %aptr
+ %b = load <128 x half>, ptr %bptr
+ %a.wide = fpext <128 x half> %a to <128 x float>
+ %b.wide = fpext <128 x half> %b to <128 x float>
+ %mult = fmul <128 x float> %a.wide, %b.wide
+ %partial.reduce = call <64 x float> @llvm.vector.partial.reduce.fadd(<64 x float> %acc, <128 x float> %mult)
+ store <64 x float> %partial.reduce, ptr %accptr
+ ret void
+}
+
+define <4 x float> @fixed_fdot_wide(<4 x float> %acc, <8 x half> %a, <8 x half> %b) {
+; CHECK-LABEL: fixed_fdot_wide:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: fcvtl v3.4s, v1.4h
+; CHECK-NEXT: fcvtl v4.4s, v2.4h
+; CHECK-NEXT: fcvtl2 v1.4s, v1.8h
+; CHECK-NEXT: fcvtl2 v2.4s, v2.8h
+; CHECK-NEXT: fmul v3.4s, v3.4s, v4.4s
+; CHECK-NEXT: fmul v1.4s, v1.4s, v2.4s
+; CHECK-NEXT: fadd v0.4s, v0.4s, v3.4s
+; CHECK-NEXT: fadd v0.4s, v0.4s, v1.4s
+; CHECK-NEXT: ret
+entry:
+ %a.wide = fpext <8 x half> %a to <8 x float>
+ %b.wide = fpext <8 x half> %b to <8 x float>
+ %mult = fmul <8 x float> %a.wide, %b.wide
+ %partial.reduce = call <4 x float> @llvm.vector.partial.reduce.fadd(<4 x float> %acc, <8 x float> %mult)
+ ret <4 x float> %partial.reduce
+}
+
+define <8 x half> @partial_reduce_half(<8 x half> %acc, <16 x half> %a) {
+; CHECK-LABEL: partial_reduce_half:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: fadd v0.8h, v0.8h, v1.8h
+; CHECK-NEXT: fadd v0.8h, v0.8h, v2.8h
+; CHECK-NEXT: ret
+entry:
+ %partial.reduce = call <8 x half> @llvm.vector.partial.reduce.fadd(<8 x half> %acc, <16 x half> %a)
+ ret <8 x half> %partial.reduce
+}
+
+define <4 x float> @partial_reduce_float(<4 x float> %acc, <8 x float> %a) {
+; CHECK-LABEL: partial_reduce_float:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: fadd v0.4s, v0.4s, v1.4s
+; CHECK-NEXT: fadd v0.4s, v0.4s, v2.4s
+; CHECK-NEXT: ret
+entry:
+ %partial.reduce = call <4 x float> @llvm.vector.partial.reduce.fadd(<4 x float> %acc, <8 x float> %a)
+ ret <4 x float> %partial.reduce
+}
+
+define <2 x double> @partial_reduce_double(<2 x double> %acc, <4 x double> %a) {
+; CHECK-LABEL: partial_reduce_double:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: fadd v0.2d, v0.2d, v1.2d
+; CHECK-NEXT: fadd v0.2d, v0.2d, v2.2d
+; CHECK-NEXT: ret
+entry:
+ %partial.reduce = call <2 x double> @llvm.vector.partial.reduce.fadd(<2 x double> %acc, <4 x double> %a)
+ ret <2 x double> %partial.reduce
+}
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.mfma.gfx90a.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.mfma.gfx90a.ll
index 5720b88..cc21305 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.mfma.gfx90a.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.mfma.gfx90a.ll
@@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx90a < %s | FileCheck --check-prefixes=GCN %s
+; RUN: llc -verify-machineinstrs -global-isel -mtriple=amdgcn -mcpu=gfx90a < %s | FileCheck --check-prefixes=GCN %s
declare <32 x float> @llvm.amdgcn.mfma.f32.32x32x4bf16.1k(<4 x i16>, <4 x i16>, <32 x float>, i32, i32, i32)
declare <16 x float> @llvm.amdgcn.mfma.f32.16x16x4bf16.1k(<4 x i16>, <4 x i16>, <16 x float>, i32, i32, i32)
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll
index 51df8c3..54b1554 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll
@@ -7772,7 +7772,6 @@ define amdgpu_kernel void @sdiv_i64_pow2_shl_denom(ptr addrspace(1) %out, i64 %x
; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xd
; GFX6-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x9
; GFX6-NEXT: s_mov_b32 s3, 0xf000
-; GFX6-NEXT: s_mov_b32 s2, -1
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
; GFX6-NEXT: s_lshl_b64 s[0:1], 0x1000, s0
; GFX6-NEXT: s_ashr_i32 s8, s1, 31
@@ -7782,8 +7781,8 @@ define amdgpu_kernel void @sdiv_i64_pow2_shl_denom(ptr addrspace(1) %out, i64 %x
; GFX6-NEXT: s_xor_b64 s[10:11], s[0:1], s[8:9]
; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s10
; GFX6-NEXT: v_cvt_f32_u32_e32 v1, s11
-; GFX6-NEXT: s_sub_u32 s12, 0, s10
-; GFX6-NEXT: s_subb_u32 s13, 0, s11
+; GFX6-NEXT: s_sub_u32 s0, 0, s10
+; GFX6-NEXT: s_subb_u32 s1, 0, s11
; GFX6-NEXT: v_madmk_f32 v0, v1, 0x4f800000, v0
; GFX6-NEXT: v_rcp_f32_e32 v0, v0
; GFX6-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0
@@ -7792,128 +7791,121 @@ define amdgpu_kernel void @sdiv_i64_pow2_shl_denom(ptr addrspace(1) %out, i64 %x
; GFX6-NEXT: v_madmk_f32 v0, v1, 0xcf800000, v0
; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0
; GFX6-NEXT: v_cvt_u32_f32_e32 v1, v1
-; GFX6-NEXT: v_mul_hi_u32 v2, s12, v0
-; GFX6-NEXT: v_readfirstlane_b32 s14, v1
-; GFX6-NEXT: v_readfirstlane_b32 s0, v0
-; GFX6-NEXT: s_mul_i32 s1, s12, s14
-; GFX6-NEXT: v_readfirstlane_b32 s17, v2
-; GFX6-NEXT: s_mul_i32 s15, s13, s0
-; GFX6-NEXT: s_mul_i32 s16, s12, s0
-; GFX6-NEXT: s_add_i32 s1, s17, s1
-; GFX6-NEXT: v_mul_hi_u32 v3, v0, s16
-; GFX6-NEXT: s_add_i32 s1, s1, s15
-; GFX6-NEXT: v_mul_hi_u32 v0, v0, s1
-; GFX6-NEXT: v_mul_hi_u32 v4, v1, s16
-; GFX6-NEXT: v_readfirstlane_b32 s15, v3
-; GFX6-NEXT: s_mul_i32 s17, s0, s1
-; GFX6-NEXT: v_mul_hi_u32 v1, v1, s1
-; GFX6-NEXT: s_add_u32 s15, s15, s17
-; GFX6-NEXT: v_readfirstlane_b32 s17, v0
-; GFX6-NEXT: s_addc_u32 s17, 0, s17
-; GFX6-NEXT: s_mul_i32 s16, s14, s16
-; GFX6-NEXT: v_readfirstlane_b32 s18, v4
-; GFX6-NEXT: s_add_u32 s15, s15, s16
-; GFX6-NEXT: s_addc_u32 s15, s17, s18
-; GFX6-NEXT: v_readfirstlane_b32 s16, v1
-; GFX6-NEXT: s_addc_u32 s16, s16, 0
-; GFX6-NEXT: s_mul_i32 s1, s14, s1
-; GFX6-NEXT: s_add_u32 s1, s15, s1
-; GFX6-NEXT: s_addc_u32 s15, 0, s16
-; GFX6-NEXT: s_add_u32 s16, s0, s1
-; GFX6-NEXT: v_mov_b32_e32 v0, s16
-; GFX6-NEXT: v_mul_hi_u32 v0, s12, v0
-; GFX6-NEXT: s_cselect_b64 s[0:1], -1, 0
-; GFX6-NEXT: s_or_b32 s0, s0, s1
-; GFX6-NEXT: s_addc_u32 s14, s14, s15
-; GFX6-NEXT: s_mul_i32 s0, s12, s14
-; GFX6-NEXT: v_readfirstlane_b32 s1, v0
-; GFX6-NEXT: s_add_i32 s0, s1, s0
-; GFX6-NEXT: s_mul_i32 s13, s13, s16
-; GFX6-NEXT: s_mul_i32 s1, s12, s16
-; GFX6-NEXT: s_add_i32 s0, s0, s13
-; GFX6-NEXT: v_mov_b32_e32 v2, s1
-; GFX6-NEXT: v_mov_b32_e32 v0, s0
-; GFX6-NEXT: v_mul_hi_u32 v3, s14, v2
-; GFX6-NEXT: v_mul_hi_u32 v2, s16, v2
-; GFX6-NEXT: v_mul_hi_u32 v1, s14, v0
-; GFX6-NEXT: v_mul_hi_u32 v0, s16, v0
-; GFX6-NEXT: s_mul_i32 s13, s16, s0
-; GFX6-NEXT: v_readfirstlane_b32 s17, v2
-; GFX6-NEXT: s_add_u32 s13, s17, s13
-; GFX6-NEXT: v_readfirstlane_b32 s15, v0
-; GFX6-NEXT: s_mul_i32 s1, s14, s1
-; GFX6-NEXT: s_addc_u32 s15, 0, s15
-; GFX6-NEXT: v_readfirstlane_b32 s12, v3
-; GFX6-NEXT: s_add_u32 s1, s13, s1
-; GFX6-NEXT: s_addc_u32 s1, s15, s12
+; GFX6-NEXT: v_mul_hi_u32 v2, s0, v0
; GFX6-NEXT: v_readfirstlane_b32 s12, v1
-; GFX6-NEXT: s_addc_u32 s12, s12, 0
-; GFX6-NEXT: s_mul_i32 s0, s14, s0
-; GFX6-NEXT: s_add_u32 s0, s1, s0
-; GFX6-NEXT: s_addc_u32 s12, 0, s12
-; GFX6-NEXT: s_add_u32 s15, s16, s0
-; GFX6-NEXT: s_cselect_b64 s[0:1], -1, 0
-; GFX6-NEXT: s_or_b32 s0, s0, s1
-; GFX6-NEXT: s_addc_u32 s14, s14, s12
+; GFX6-NEXT: v_readfirstlane_b32 s2, v0
+; GFX6-NEXT: s_mul_i32 s13, s0, s12
+; GFX6-NEXT: v_readfirstlane_b32 s16, v2
+; GFX6-NEXT: s_mul_i32 s14, s1, s2
+; GFX6-NEXT: s_mul_i32 s15, s0, s2
+; GFX6-NEXT: s_add_i32 s13, s16, s13
+; GFX6-NEXT: v_mul_hi_u32 v3, v0, s15
+; GFX6-NEXT: s_add_i32 s13, s13, s14
+; GFX6-NEXT: v_mul_hi_u32 v0, v0, s13
+; GFX6-NEXT: v_mul_hi_u32 v4, v1, s15
+; GFX6-NEXT: v_readfirstlane_b32 s14, v3
+; GFX6-NEXT: s_mul_i32 s16, s2, s13
+; GFX6-NEXT: v_mul_hi_u32 v1, v1, s13
+; GFX6-NEXT: s_add_u32 s14, s14, s16
+; GFX6-NEXT: v_readfirstlane_b32 s16, v0
+; GFX6-NEXT: s_mul_i32 s15, s12, s15
+; GFX6-NEXT: s_addc_u32 s16, 0, s16
+; GFX6-NEXT: v_readfirstlane_b32 s17, v4
+; GFX6-NEXT: s_add_u32 s14, s14, s15
+; GFX6-NEXT: s_addc_u32 s14, s16, s17
+; GFX6-NEXT: v_readfirstlane_b32 s15, v1
+; GFX6-NEXT: s_addc_u32 s15, s15, 0
+; GFX6-NEXT: s_mul_i32 s13, s12, s13
+; GFX6-NEXT: s_add_u32 s13, s14, s13
+; GFX6-NEXT: s_addc_u32 s14, 0, s15
+; GFX6-NEXT: s_add_u32 s13, s2, s13
+; GFX6-NEXT: v_mov_b32_e32 v0, s13
+; GFX6-NEXT: v_mul_hi_u32 v0, s0, v0
+; GFX6-NEXT: s_addc_u32 s12, s12, s14
+; GFX6-NEXT: s_mul_i32 s14, s0, s12
+; GFX6-NEXT: s_mul_i32 s1, s1, s13
+; GFX6-NEXT: v_readfirstlane_b32 s15, v0
+; GFX6-NEXT: s_add_i32 s14, s15, s14
+; GFX6-NEXT: s_mul_i32 s0, s0, s13
+; GFX6-NEXT: s_add_i32 s1, s14, s1
+; GFX6-NEXT: v_mov_b32_e32 v2, s0
+; GFX6-NEXT: v_mov_b32_e32 v0, s1
+; GFX6-NEXT: v_mul_hi_u32 v3, s12, v2
+; GFX6-NEXT: v_mul_hi_u32 v2, s13, v2
+; GFX6-NEXT: v_mul_hi_u32 v1, s12, v0
+; GFX6-NEXT: v_mul_hi_u32 v0, s13, v0
+; GFX6-NEXT: s_mul_i32 s15, s13, s1
+; GFX6-NEXT: v_readfirstlane_b32 s17, v2
+; GFX6-NEXT: s_add_u32 s15, s17, s15
+; GFX6-NEXT: v_readfirstlane_b32 s16, v0
+; GFX6-NEXT: s_mul_i32 s0, s12, s0
+; GFX6-NEXT: s_addc_u32 s16, 0, s16
+; GFX6-NEXT: v_readfirstlane_b32 s14, v3
+; GFX6-NEXT: s_add_u32 s0, s15, s0
+; GFX6-NEXT: s_addc_u32 s0, s16, s14
+; GFX6-NEXT: v_readfirstlane_b32 s14, v1
+; GFX6-NEXT: s_addc_u32 s14, s14, 0
+; GFX6-NEXT: s_mul_i32 s1, s12, s1
+; GFX6-NEXT: s_add_u32 s0, s0, s1
+; GFX6-NEXT: s_addc_u32 s1, 0, s14
+; GFX6-NEXT: s_add_u32 s14, s13, s0
+; GFX6-NEXT: s_addc_u32 s15, s12, s1
; GFX6-NEXT: s_ashr_i32 s12, s7, 31
; GFX6-NEXT: s_add_u32 s0, s6, s12
; GFX6-NEXT: s_mov_b32 s13, s12
; GFX6-NEXT: s_addc_u32 s1, s7, s12
; GFX6-NEXT: s_xor_b64 s[6:7], s[0:1], s[12:13]
-; GFX6-NEXT: v_mov_b32_e32 v0, s14
+; GFX6-NEXT: v_mov_b32_e32 v0, s15
; GFX6-NEXT: v_mul_hi_u32 v1, s6, v0
-; GFX6-NEXT: v_mov_b32_e32 v2, s15
+; GFX6-NEXT: v_mov_b32_e32 v2, s14
; GFX6-NEXT: v_mul_hi_u32 v3, s6, v2
; GFX6-NEXT: s_mov_b32 s0, s4
; GFX6-NEXT: v_readfirstlane_b32 s4, v1
; GFX6-NEXT: v_mul_hi_u32 v1, s7, v2
-; GFX6-NEXT: s_mul_i32 s1, s6, s14
+; GFX6-NEXT: s_mul_i32 s1, s6, s15
; GFX6-NEXT: v_readfirstlane_b32 s16, v3
; GFX6-NEXT: v_mul_hi_u32 v0, s7, v0
; GFX6-NEXT: s_add_u32 s1, s16, s1
; GFX6-NEXT: s_addc_u32 s4, 0, s4
-; GFX6-NEXT: s_mul_i32 s15, s7, s15
+; GFX6-NEXT: s_mul_i32 s14, s7, s14
; GFX6-NEXT: v_readfirstlane_b32 s16, v1
-; GFX6-NEXT: s_add_u32 s1, s1, s15
+; GFX6-NEXT: s_add_u32 s1, s1, s14
; GFX6-NEXT: s_addc_u32 s1, s4, s16
; GFX6-NEXT: v_readfirstlane_b32 s4, v0
; GFX6-NEXT: s_addc_u32 s4, s4, 0
-; GFX6-NEXT: s_mul_i32 s14, s7, s14
-; GFX6-NEXT: s_add_u32 s16, s1, s14
-; GFX6-NEXT: v_mov_b32_e32 v0, s16
+; GFX6-NEXT: s_mul_i32 s14, s7, s15
+; GFX6-NEXT: s_add_u32 s14, s1, s14
+; GFX6-NEXT: v_mov_b32_e32 v0, s14
; GFX6-NEXT: v_mul_hi_u32 v0, s10, v0
-; GFX6-NEXT: s_addc_u32 s17, 0, s4
+; GFX6-NEXT: s_addc_u32 s15, 0, s4
; GFX6-NEXT: s_mov_b32 s1, s5
-; GFX6-NEXT: s_mul_i32 s4, s10, s17
+; GFX6-NEXT: s_mul_i32 s4, s10, s15
; GFX6-NEXT: v_readfirstlane_b32 s5, v0
; GFX6-NEXT: s_add_i32 s4, s5, s4
-; GFX6-NEXT: s_mul_i32 s5, s11, s16
-; GFX6-NEXT: s_add_i32 s18, s4, s5
-; GFX6-NEXT: s_sub_i32 s14, s7, s18
-; GFX6-NEXT: s_mul_i32 s4, s10, s16
+; GFX6-NEXT: s_mul_i32 s5, s11, s14
+; GFX6-NEXT: s_add_i32 s16, s4, s5
+; GFX6-NEXT: s_sub_i32 s17, s7, s16
+; GFX6-NEXT: s_mul_i32 s4, s10, s14
; GFX6-NEXT: s_sub_u32 s6, s6, s4
; GFX6-NEXT: s_cselect_b64 s[4:5], -1, 0
-; GFX6-NEXT: s_or_b32 s15, s4, s5
-; GFX6-NEXT: s_subb_u32 s19, s14, s11
-; GFX6-NEXT: s_sub_u32 s20, s6, s10
-; GFX6-NEXT: s_cselect_b64 s[14:15], -1, 0
-; GFX6-NEXT: s_or_b32 s14, s14, s15
-; GFX6-NEXT: s_subb_u32 s14, s19, 0
-; GFX6-NEXT: s_cmp_ge_u32 s14, s11
-; GFX6-NEXT: s_cselect_b32 s15, -1, 0
-; GFX6-NEXT: s_cmp_ge_u32 s20, s10
+; GFX6-NEXT: s_subb_u32 s17, s17, s11
+; GFX6-NEXT: s_sub_u32 s18, s6, s10
+; GFX6-NEXT: s_subb_u32 s17, s17, 0
+; GFX6-NEXT: s_cmp_ge_u32 s17, s11
; GFX6-NEXT: s_cselect_b32 s19, -1, 0
-; GFX6-NEXT: s_cmp_eq_u32 s14, s11
-; GFX6-NEXT: s_cselect_b32 s14, s19, s15
-; GFX6-NEXT: s_add_u32 s15, s16, 1
-; GFX6-NEXT: s_addc_u32 s19, s17, 0
-; GFX6-NEXT: s_add_u32 s20, s16, 2
-; GFX6-NEXT: s_addc_u32 s21, s17, 0
-; GFX6-NEXT: s_cmp_lg_u32 s14, 0
-; GFX6-NEXT: s_cselect_b32 s14, s20, s15
-; GFX6-NEXT: s_cselect_b32 s15, s21, s19
+; GFX6-NEXT: s_cmp_ge_u32 s18, s10
+; GFX6-NEXT: s_cselect_b32 s18, -1, 0
+; GFX6-NEXT: s_cmp_eq_u32 s17, s11
+; GFX6-NEXT: s_cselect_b32 s17, s18, s19
+; GFX6-NEXT: s_add_u32 s18, s14, 1
+; GFX6-NEXT: s_addc_u32 s19, s15, 0
+; GFX6-NEXT: s_add_u32 s20, s14, 2
+; GFX6-NEXT: s_addc_u32 s21, s15, 0
+; GFX6-NEXT: s_cmp_lg_u32 s17, 0
+; GFX6-NEXT: s_cselect_b32 s17, s20, s18
+; GFX6-NEXT: s_cselect_b32 s18, s21, s19
; GFX6-NEXT: s_or_b32 s4, s4, s5
-; GFX6-NEXT: s_subb_u32 s4, s7, s18
+; GFX6-NEXT: s_subb_u32 s4, s7, s16
; GFX6-NEXT: s_cmp_ge_u32 s4, s11
; GFX6-NEXT: s_cselect_b32 s5, -1, 0
; GFX6-NEXT: s_cmp_ge_u32 s6, s10
@@ -7921,13 +7913,14 @@ define amdgpu_kernel void @sdiv_i64_pow2_shl_denom(ptr addrspace(1) %out, i64 %x
; GFX6-NEXT: s_cmp_eq_u32 s4, s11
; GFX6-NEXT: s_cselect_b32 s4, s6, s5
; GFX6-NEXT: s_cmp_lg_u32 s4, 0
-; GFX6-NEXT: s_cselect_b32 s5, s15, s17
-; GFX6-NEXT: s_cselect_b32 s4, s14, s16
+; GFX6-NEXT: s_cselect_b32 s5, s18, s15
+; GFX6-NEXT: s_cselect_b32 s4, s17, s14
; GFX6-NEXT: s_xor_b64 s[6:7], s[12:13], s[8:9]
; GFX6-NEXT: s_xor_b64 s[4:5], s[4:5], s[6:7]
; GFX6-NEXT: s_sub_u32 s4, s4, s6
; GFX6-NEXT: s_subb_u32 s5, s5, s7
; GFX6-NEXT: v_mov_b32_e32 v0, s4
+; GFX6-NEXT: s_mov_b32 s2, -1
; GFX6-NEXT: v_mov_b32_e32 v1, s5
; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX6-NEXT: s_endpgm
@@ -8278,8 +8271,8 @@ define amdgpu_kernel void @sdiv_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x
; GFX6-NEXT: s_xor_b64 s[6:7], s[6:7], s[2:3]
; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s6
; GFX6-NEXT: v_cvt_f32_u32_e32 v1, s7
-; GFX6-NEXT: s_sub_u32 s14, 0, s6
-; GFX6-NEXT: s_subb_u32 s15, 0, s7
+; GFX6-NEXT: s_sub_u32 s12, 0, s6
+; GFX6-NEXT: s_subb_u32 s13, 0, s7
; GFX6-NEXT: v_mac_f32_e32 v0, 0x4f800000, v1
; GFX6-NEXT: v_rcp_f32_e32 v0, v0
; GFX6-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0
@@ -8288,69 +8281,65 @@ define amdgpu_kernel void @sdiv_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x
; GFX6-NEXT: v_mac_f32_e32 v0, 0xcf800000, v1
; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0
; GFX6-NEXT: v_cvt_u32_f32_e32 v1, v1
-; GFX6-NEXT: v_mul_hi_u32 v2, s14, v0
-; GFX6-NEXT: v_readfirstlane_b32 s16, v1
-; GFX6-NEXT: v_readfirstlane_b32 s12, v0
-; GFX6-NEXT: s_mul_i32 s13, s14, s16
+; GFX6-NEXT: v_mul_hi_u32 v2, s12, v0
+; GFX6-NEXT: v_readfirstlane_b32 s14, v1
+; GFX6-NEXT: v_readfirstlane_b32 s15, v0
+; GFX6-NEXT: s_mul_i32 s16, s12, s14
; GFX6-NEXT: v_readfirstlane_b32 s19, v2
-; GFX6-NEXT: s_mul_i32 s17, s15, s12
-; GFX6-NEXT: s_mul_i32 s18, s14, s12
-; GFX6-NEXT: s_add_i32 s13, s19, s13
+; GFX6-NEXT: s_mul_i32 s17, s13, s15
+; GFX6-NEXT: s_mul_i32 s18, s12, s15
+; GFX6-NEXT: s_add_i32 s16, s19, s16
; GFX6-NEXT: v_mul_hi_u32 v3, v0, s18
-; GFX6-NEXT: s_add_i32 s13, s13, s17
-; GFX6-NEXT: v_mul_hi_u32 v0, v0, s13
+; GFX6-NEXT: s_add_i32 s16, s16, s17
+; GFX6-NEXT: v_mul_hi_u32 v0, v0, s16
; GFX6-NEXT: v_mul_hi_u32 v4, v1, s18
; GFX6-NEXT: v_readfirstlane_b32 s17, v3
-; GFX6-NEXT: s_mul_i32 s20, s12, s13
-; GFX6-NEXT: v_mul_hi_u32 v1, v1, s13
+; GFX6-NEXT: s_mul_i32 s20, s15, s16
+; GFX6-NEXT: v_mul_hi_u32 v1, v1, s16
; GFX6-NEXT: s_add_u32 s17, s17, s20
; GFX6-NEXT: v_readfirstlane_b32 s20, v0
-; GFX6-NEXT: s_mul_i32 s18, s16, s18
+; GFX6-NEXT: s_mul_i32 s18, s14, s18
; GFX6-NEXT: s_addc_u32 s20, 0, s20
; GFX6-NEXT: v_readfirstlane_b32 s19, v4
; GFX6-NEXT: s_add_u32 s17, s17, s18
; GFX6-NEXT: s_addc_u32 s17, s20, s19
; GFX6-NEXT: v_readfirstlane_b32 s18, v1
; GFX6-NEXT: s_addc_u32 s18, s18, 0
-; GFX6-NEXT: s_mul_i32 s13, s16, s13
-; GFX6-NEXT: s_add_u32 s13, s17, s13
+; GFX6-NEXT: s_mul_i32 s16, s14, s16
+; GFX6-NEXT: s_add_u32 s16, s17, s16
; GFX6-NEXT: s_addc_u32 s17, 0, s18
-; GFX6-NEXT: s_add_u32 s18, s12, s13
-; GFX6-NEXT: v_mov_b32_e32 v0, s18
-; GFX6-NEXT: v_mul_hi_u32 v0, s14, v0
-; GFX6-NEXT: s_cselect_b64 s[12:13], -1, 0
-; GFX6-NEXT: s_or_b32 s12, s12, s13
-; GFX6-NEXT: s_addc_u32 s16, s16, s17
-; GFX6-NEXT: s_mul_i32 s12, s14, s16
-; GFX6-NEXT: v_readfirstlane_b32 s13, v0
-; GFX6-NEXT: s_add_i32 s12, s13, s12
-; GFX6-NEXT: s_mul_i32 s15, s15, s18
-; GFX6-NEXT: s_mul_i32 s13, s14, s18
-; GFX6-NEXT: s_add_i32 s12, s12, s15
-; GFX6-NEXT: v_mov_b32_e32 v2, s13
-; GFX6-NEXT: v_mov_b32_e32 v0, s12
-; GFX6-NEXT: v_mul_hi_u32 v3, s16, v2
-; GFX6-NEXT: v_mul_hi_u32 v2, s18, v2
-; GFX6-NEXT: v_mul_hi_u32 v1, s16, v0
-; GFX6-NEXT: v_mul_hi_u32 v0, s18, v0
-; GFX6-NEXT: s_mul_i32 s15, s18, s12
-; GFX6-NEXT: v_readfirstlane_b32 s19, v2
-; GFX6-NEXT: s_add_u32 s15, s19, s15
+; GFX6-NEXT: s_add_u32 s15, s15, s16
+; GFX6-NEXT: v_mov_b32_e32 v0, s15
+; GFX6-NEXT: v_mul_hi_u32 v0, s12, v0
+; GFX6-NEXT: s_addc_u32 s14, s14, s17
+; GFX6-NEXT: s_mul_i32 s16, s12, s14
+; GFX6-NEXT: s_mul_i32 s13, s13, s15
; GFX6-NEXT: v_readfirstlane_b32 s17, v0
-; GFX6-NEXT: s_mul_i32 s13, s16, s13
-; GFX6-NEXT: s_addc_u32 s17, 0, s17
-; GFX6-NEXT: v_readfirstlane_b32 s14, v3
-; GFX6-NEXT: s_add_u32 s13, s15, s13
-; GFX6-NEXT: s_addc_u32 s13, s17, s14
-; GFX6-NEXT: v_readfirstlane_b32 s14, v1
-; GFX6-NEXT: s_addc_u32 s14, s14, 0
-; GFX6-NEXT: s_mul_i32 s12, s16, s12
-; GFX6-NEXT: s_add_u32 s12, s13, s12
-; GFX6-NEXT: s_addc_u32 s14, 0, s14
-; GFX6-NEXT: s_add_u32 s15, s18, s12
-; GFX6-NEXT: s_cselect_b64 s[12:13], -1, 0
-; GFX6-NEXT: s_or_b32 s12, s12, s13
-; GFX6-NEXT: s_addc_u32 s14, s16, s14
+; GFX6-NEXT: s_add_i32 s16, s17, s16
+; GFX6-NEXT: s_mul_i32 s12, s12, s15
+; GFX6-NEXT: s_add_i32 s13, s16, s13
+; GFX6-NEXT: v_mov_b32_e32 v2, s12
+; GFX6-NEXT: v_mov_b32_e32 v0, s13
+; GFX6-NEXT: v_mul_hi_u32 v3, s14, v2
+; GFX6-NEXT: v_mul_hi_u32 v2, s15, v2
+; GFX6-NEXT: v_mul_hi_u32 v1, s14, v0
+; GFX6-NEXT: v_mul_hi_u32 v0, s15, v0
+; GFX6-NEXT: s_mul_i32 s17, s15, s13
+; GFX6-NEXT: v_readfirstlane_b32 s19, v2
+; GFX6-NEXT: s_add_u32 s17, s19, s17
+; GFX6-NEXT: v_readfirstlane_b32 s18, v0
+; GFX6-NEXT: s_mul_i32 s12, s14, s12
+; GFX6-NEXT: s_addc_u32 s18, 0, s18
+; GFX6-NEXT: v_readfirstlane_b32 s16, v3
+; GFX6-NEXT: s_add_u32 s12, s17, s12
+; GFX6-NEXT: s_addc_u32 s12, s18, s16
+; GFX6-NEXT: v_readfirstlane_b32 s16, v1
+; GFX6-NEXT: s_addc_u32 s16, s16, 0
+; GFX6-NEXT: s_mul_i32 s13, s14, s13
+; GFX6-NEXT: s_add_u32 s12, s12, s13
+; GFX6-NEXT: s_addc_u32 s13, 0, s16
+; GFX6-NEXT: s_add_u32 s15, s15, s12
+; GFX6-NEXT: s_addc_u32 s14, s14, s13
; GFX6-NEXT: s_ashr_i32 s12, s9, 31
; GFX6-NEXT: s_add_u32 s8, s8, s12
; GFX6-NEXT: s_mov_b32 s13, s12
@@ -8374,40 +8363,37 @@ define amdgpu_kernel void @sdiv_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x
; GFX6-NEXT: v_readfirstlane_b32 s16, v0
; GFX6-NEXT: s_addc_u32 s16, s16, 0
; GFX6-NEXT: s_mul_i32 s14, s9, s14
-; GFX6-NEXT: s_add_u32 s18, s15, s14
-; GFX6-NEXT: v_mov_b32_e32 v0, s18
+; GFX6-NEXT: s_add_u32 s17, s15, s14
+; GFX6-NEXT: v_mov_b32_e32 v0, s17
; GFX6-NEXT: v_mul_hi_u32 v0, s6, v0
-; GFX6-NEXT: s_addc_u32 s19, 0, s16
-; GFX6-NEXT: s_mul_i32 s14, s6, s19
+; GFX6-NEXT: s_addc_u32 s16, 0, s16
+; GFX6-NEXT: s_mul_i32 s14, s6, s16
; GFX6-NEXT: v_readfirstlane_b32 s15, v0
; GFX6-NEXT: s_add_i32 s14, s15, s14
-; GFX6-NEXT: s_mul_i32 s15, s7, s18
-; GFX6-NEXT: s_add_i32 s20, s14, s15
-; GFX6-NEXT: s_sub_i32 s16, s9, s20
-; GFX6-NEXT: s_mul_i32 s14, s6, s18
+; GFX6-NEXT: s_mul_i32 s15, s7, s17
+; GFX6-NEXT: s_add_i32 s18, s14, s15
+; GFX6-NEXT: s_sub_i32 s19, s9, s18
+; GFX6-NEXT: s_mul_i32 s14, s6, s17
; GFX6-NEXT: s_sub_u32 s8, s8, s14
; GFX6-NEXT: s_cselect_b64 s[14:15], -1, 0
-; GFX6-NEXT: s_or_b32 s17, s14, s15
-; GFX6-NEXT: s_subb_u32 s21, s16, s7
-; GFX6-NEXT: s_sub_u32 s22, s8, s6
-; GFX6-NEXT: s_cselect_b64 s[16:17], -1, 0
-; GFX6-NEXT: s_or_b32 s16, s16, s17
-; GFX6-NEXT: s_subb_u32 s16, s21, 0
-; GFX6-NEXT: s_cmp_ge_u32 s16, s7
-; GFX6-NEXT: s_cselect_b32 s17, -1, 0
-; GFX6-NEXT: s_cmp_ge_u32 s22, s6
+; GFX6-NEXT: s_subb_u32 s19, s19, s7
+; GFX6-NEXT: s_sub_u32 s20, s8, s6
+; GFX6-NEXT: s_subb_u32 s19, s19, 0
+; GFX6-NEXT: s_cmp_ge_u32 s19, s7
; GFX6-NEXT: s_cselect_b32 s21, -1, 0
-; GFX6-NEXT: s_cmp_eq_u32 s16, s7
-; GFX6-NEXT: s_cselect_b32 s16, s21, s17
-; GFX6-NEXT: s_add_u32 s17, s18, 1
-; GFX6-NEXT: s_addc_u32 s21, s19, 0
-; GFX6-NEXT: s_add_u32 s22, s18, 2
-; GFX6-NEXT: s_addc_u32 s23, s19, 0
-; GFX6-NEXT: s_cmp_lg_u32 s16, 0
-; GFX6-NEXT: s_cselect_b32 s16, s22, s17
-; GFX6-NEXT: s_cselect_b32 s17, s23, s21
+; GFX6-NEXT: s_cmp_ge_u32 s20, s6
+; GFX6-NEXT: s_cselect_b32 s20, -1, 0
+; GFX6-NEXT: s_cmp_eq_u32 s19, s7
+; GFX6-NEXT: s_cselect_b32 s19, s20, s21
+; GFX6-NEXT: s_add_u32 s20, s17, 1
+; GFX6-NEXT: s_addc_u32 s21, s16, 0
+; GFX6-NEXT: s_add_u32 s22, s17, 2
+; GFX6-NEXT: s_addc_u32 s23, s16, 0
+; GFX6-NEXT: s_cmp_lg_u32 s19, 0
+; GFX6-NEXT: s_cselect_b32 s19, s22, s20
+; GFX6-NEXT: s_cselect_b32 s20, s23, s21
; GFX6-NEXT: s_or_b32 s14, s14, s15
-; GFX6-NEXT: s_subb_u32 s9, s9, s20
+; GFX6-NEXT: s_subb_u32 s9, s9, s18
; GFX6-NEXT: s_cmp_ge_u32 s9, s7
; GFX6-NEXT: s_cselect_b32 s14, -1, 0
; GFX6-NEXT: s_cmp_ge_u32 s8, s6
@@ -8415,12 +8401,12 @@ define amdgpu_kernel void @sdiv_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x
; GFX6-NEXT: s_cmp_eq_u32 s9, s7
; GFX6-NEXT: s_cselect_b32 s6, s6, s14
; GFX6-NEXT: s_cmp_lg_u32 s6, 0
-; GFX6-NEXT: s_cselect_b32 s7, s17, s19
-; GFX6-NEXT: s_cselect_b32 s6, s16, s18
+; GFX6-NEXT: s_cselect_b32 s7, s20, s16
+; GFX6-NEXT: s_cselect_b32 s6, s19, s17
; GFX6-NEXT: s_xor_b64 s[2:3], s[12:13], s[2:3]
; GFX6-NEXT: s_xor_b64 s[6:7], s[6:7], s[2:3]
-; GFX6-NEXT: s_sub_u32 s16, s6, s2
-; GFX6-NEXT: s_subb_u32 s17, s7, s3
+; GFX6-NEXT: s_sub_u32 s14, s6, s2
+; GFX6-NEXT: s_subb_u32 s15, s7, s3
; GFX6-NEXT: s_ashr_i32 s6, s1, 31
; GFX6-NEXT: s_add_u32 s0, s0, s6
; GFX6-NEXT: s_mov_b32 s7, s6
@@ -8428,8 +8414,8 @@ define amdgpu_kernel void @sdiv_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x
; GFX6-NEXT: s_xor_b64 s[8:9], s[0:1], s[6:7]
; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s8
; GFX6-NEXT: v_cvt_f32_u32_e32 v1, s9
-; GFX6-NEXT: s_sub_u32 s12, 0, s8
-; GFX6-NEXT: s_subb_u32 s13, 0, s9
+; GFX6-NEXT: s_sub_u32 s2, 0, s8
+; GFX6-NEXT: s_subb_u32 s3, 0, s9
; GFX6-NEXT: v_mac_f32_e32 v0, 0x4f800000, v1
; GFX6-NEXT: v_rcp_f32_e32 v0, v0
; GFX6-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0
@@ -8438,128 +8424,121 @@ define amdgpu_kernel void @sdiv_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x
; GFX6-NEXT: v_mac_f32_e32 v0, 0xcf800000, v1
; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0
; GFX6-NEXT: v_cvt_u32_f32_e32 v1, v1
-; GFX6-NEXT: v_mul_hi_u32 v2, s12, v0
-; GFX6-NEXT: v_readfirstlane_b32 s14, v1
-; GFX6-NEXT: v_readfirstlane_b32 s2, v0
-; GFX6-NEXT: s_mul_i32 s1, s12, s14
-; GFX6-NEXT: v_readfirstlane_b32 s3, v2
-; GFX6-NEXT: s_mul_i32 s0, s13, s2
-; GFX6-NEXT: s_add_i32 s1, s3, s1
-; GFX6-NEXT: s_add_i32 s3, s1, s0
-; GFX6-NEXT: s_mul_i32 s15, s12, s2
-; GFX6-NEXT: v_mul_hi_u32 v2, v0, s3
-; GFX6-NEXT: v_mul_hi_u32 v0, v0, s15
-; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
-; GFX6-NEXT: s_mul_i32 s4, s2, s3
-; GFX6-NEXT: v_readfirstlane_b32 s5, v2
+; GFX6-NEXT: v_mul_hi_u32 v2, s2, v0
+; GFX6-NEXT: v_readfirstlane_b32 s12, v1
+; GFX6-NEXT: v_readfirstlane_b32 s0, v0
+; GFX6-NEXT: s_mul_i32 s13, s2, s12
+; GFX6-NEXT: v_readfirstlane_b32 s16, v2
+; GFX6-NEXT: s_mul_i32 s1, s3, s0
+; GFX6-NEXT: s_add_i32 s13, s16, s13
+; GFX6-NEXT: s_add_i32 s13, s13, s1
+; GFX6-NEXT: s_mul_i32 s1, s2, s0
+; GFX6-NEXT: v_mul_hi_u32 v2, v0, s13
+; GFX6-NEXT: v_mul_hi_u32 v0, v0, s1
+; GFX6-NEXT: s_mul_i32 s16, s0, s13
+; GFX6-NEXT: v_readfirstlane_b32 s17, v2
; GFX6-NEXT: v_readfirstlane_b32 s18, v0
-; GFX6-NEXT: v_mul_hi_u32 v0, v1, s15
-; GFX6-NEXT: v_mul_hi_u32 v1, v1, s3
-; GFX6-NEXT: s_add_u32 s4, s18, s4
-; GFX6-NEXT: s_addc_u32 s5, 0, s5
-; GFX6-NEXT: s_mul_i32 s15, s14, s15
+; GFX6-NEXT: v_mul_hi_u32 v0, v1, s1
+; GFX6-NEXT: v_mul_hi_u32 v1, v1, s13
+; GFX6-NEXT: s_add_u32 s16, s18, s16
+; GFX6-NEXT: s_addc_u32 s17, 0, s17
+; GFX6-NEXT: s_mul_i32 s1, s12, s1
; GFX6-NEXT: v_readfirstlane_b32 s18, v0
-; GFX6-NEXT: s_add_u32 s4, s4, s15
-; GFX6-NEXT: s_addc_u32 s4, s5, s18
-; GFX6-NEXT: v_readfirstlane_b32 s5, v1
-; GFX6-NEXT: s_addc_u32 s5, s5, 0
-; GFX6-NEXT: s_mul_i32 s3, s14, s3
-; GFX6-NEXT: s_add_u32 s3, s4, s3
-; GFX6-NEXT: s_addc_u32 s4, 0, s5
-; GFX6-NEXT: s_add_u32 s5, s2, s3
-; GFX6-NEXT: v_mov_b32_e32 v0, s5
-; GFX6-NEXT: v_mul_hi_u32 v0, s12, v0
-; GFX6-NEXT: s_cselect_b64 s[2:3], -1, 0
-; GFX6-NEXT: s_or_b32 s2, s2, s3
-; GFX6-NEXT: s_addc_u32 s4, s14, s4
-; GFX6-NEXT: s_mul_i32 s2, s12, s4
-; GFX6-NEXT: v_readfirstlane_b32 s3, v0
-; GFX6-NEXT: s_add_i32 s2, s3, s2
-; GFX6-NEXT: s_mul_i32 s13, s13, s5
-; GFX6-NEXT: s_mul_i32 s3, s12, s5
-; GFX6-NEXT: s_add_i32 s2, s2, s13
-; GFX6-NEXT: v_mov_b32_e32 v2, s3
-; GFX6-NEXT: v_mov_b32_e32 v0, s2
+; GFX6-NEXT: s_add_u32 s1, s16, s1
+; GFX6-NEXT: s_addc_u32 s1, s17, s18
+; GFX6-NEXT: v_readfirstlane_b32 s16, v1
+; GFX6-NEXT: s_addc_u32 s16, s16, 0
+; GFX6-NEXT: s_mul_i32 s13, s12, s13
+; GFX6-NEXT: s_add_u32 s1, s1, s13
+; GFX6-NEXT: s_addc_u32 s13, 0, s16
+; GFX6-NEXT: s_add_u32 s16, s0, s1
+; GFX6-NEXT: v_mov_b32_e32 v0, s16
+; GFX6-NEXT: v_mul_hi_u32 v0, s2, v0
+; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
+; GFX6-NEXT: s_addc_u32 s4, s12, s13
+; GFX6-NEXT: s_mul_i32 s5, s2, s4
+; GFX6-NEXT: v_readfirstlane_b32 s12, v0
+; GFX6-NEXT: s_add_i32 s5, s12, s5
+; GFX6-NEXT: s_mul_i32 s3, s3, s16
+; GFX6-NEXT: s_mul_i32 s2, s2, s16
+; GFX6-NEXT: s_add_i32 s3, s5, s3
+; GFX6-NEXT: v_mov_b32_e32 v2, s2
+; GFX6-NEXT: v_mov_b32_e32 v0, s3
; GFX6-NEXT: v_mul_hi_u32 v3, s4, v2
-; GFX6-NEXT: v_mul_hi_u32 v2, s5, v2
+; GFX6-NEXT: v_mul_hi_u32 v2, s16, v2
; GFX6-NEXT: v_mul_hi_u32 v1, s4, v0
-; GFX6-NEXT: v_mul_hi_u32 v0, s5, v0
-; GFX6-NEXT: s_mul_i32 s13, s5, s2
-; GFX6-NEXT: v_readfirstlane_b32 s15, v2
-; GFX6-NEXT: s_add_u32 s13, s15, s13
-; GFX6-NEXT: v_readfirstlane_b32 s14, v0
-; GFX6-NEXT: s_mul_i32 s3, s4, s3
-; GFX6-NEXT: s_addc_u32 s14, 0, s14
-; GFX6-NEXT: v_readfirstlane_b32 s12, v3
-; GFX6-NEXT: s_add_u32 s3, s13, s3
-; GFX6-NEXT: s_addc_u32 s3, s14, s12
-; GFX6-NEXT: v_readfirstlane_b32 s12, v1
-; GFX6-NEXT: s_addc_u32 s12, s12, 0
+; GFX6-NEXT: v_mul_hi_u32 v0, s16, v0
+; GFX6-NEXT: s_mul_i32 s12, s16, s3
+; GFX6-NEXT: v_readfirstlane_b32 s17, v2
+; GFX6-NEXT: s_add_u32 s12, s17, s12
+; GFX6-NEXT: v_readfirstlane_b32 s13, v0
; GFX6-NEXT: s_mul_i32 s2, s4, s2
-; GFX6-NEXT: s_add_u32 s2, s3, s2
-; GFX6-NEXT: s_addc_u32 s12, 0, s12
-; GFX6-NEXT: s_add_u32 s13, s5, s2
-; GFX6-NEXT: s_cselect_b64 s[2:3], -1, 0
-; GFX6-NEXT: s_or_b32 s2, s2, s3
-; GFX6-NEXT: s_addc_u32 s12, s4, s12
+; GFX6-NEXT: s_addc_u32 s13, 0, s13
+; GFX6-NEXT: v_readfirstlane_b32 s5, v3
+; GFX6-NEXT: s_add_u32 s2, s12, s2
+; GFX6-NEXT: s_addc_u32 s2, s13, s5
+; GFX6-NEXT: v_readfirstlane_b32 s5, v1
+; GFX6-NEXT: s_addc_u32 s5, s5, 0
+; GFX6-NEXT: s_mul_i32 s3, s4, s3
+; GFX6-NEXT: s_add_u32 s2, s2, s3
+; GFX6-NEXT: s_addc_u32 s3, 0, s5
+; GFX6-NEXT: s_add_u32 s12, s16, s2
+; GFX6-NEXT: s_addc_u32 s13, s4, s3
; GFX6-NEXT: s_ashr_i32 s4, s11, 31
; GFX6-NEXT: s_add_u32 s2, s10, s4
; GFX6-NEXT: s_mov_b32 s5, s4
; GFX6-NEXT: s_addc_u32 s3, s11, s4
; GFX6-NEXT: s_xor_b64 s[10:11], s[2:3], s[4:5]
-; GFX6-NEXT: v_mov_b32_e32 v0, s12
+; GFX6-NEXT: v_mov_b32_e32 v0, s13
; GFX6-NEXT: v_mul_hi_u32 v1, s10, v0
-; GFX6-NEXT: v_mov_b32_e32 v2, s13
+; GFX6-NEXT: v_mov_b32_e32 v2, s12
; GFX6-NEXT: v_mul_hi_u32 v3, s10, v2
-; GFX6-NEXT: s_mul_i32 s2, s10, s12
-; GFX6-NEXT: v_readfirstlane_b32 s14, v1
+; GFX6-NEXT: s_mul_i32 s2, s10, s13
+; GFX6-NEXT: v_readfirstlane_b32 s16, v1
; GFX6-NEXT: v_mul_hi_u32 v1, s11, v2
-; GFX6-NEXT: v_readfirstlane_b32 s15, v3
+; GFX6-NEXT: v_readfirstlane_b32 s17, v3
; GFX6-NEXT: v_mul_hi_u32 v0, s11, v0
-; GFX6-NEXT: s_add_u32 s2, s15, s2
-; GFX6-NEXT: s_addc_u32 s14, 0, s14
-; GFX6-NEXT: s_mul_i32 s13, s11, s13
-; GFX6-NEXT: v_readfirstlane_b32 s15, v1
-; GFX6-NEXT: s_add_u32 s2, s2, s13
-; GFX6-NEXT: s_addc_u32 s2, s14, s15
-; GFX6-NEXT: v_readfirstlane_b32 s13, v0
-; GFX6-NEXT: s_addc_u32 s13, s13, 0
+; GFX6-NEXT: s_add_u32 s2, s17, s2
+; GFX6-NEXT: s_addc_u32 s16, 0, s16
; GFX6-NEXT: s_mul_i32 s12, s11, s12
-; GFX6-NEXT: s_add_u32 s18, s2, s12
-; GFX6-NEXT: v_mov_b32_e32 v0, s18
+; GFX6-NEXT: v_readfirstlane_b32 s17, v1
+; GFX6-NEXT: s_add_u32 s2, s2, s12
+; GFX6-NEXT: s_addc_u32 s2, s16, s17
+; GFX6-NEXT: v_readfirstlane_b32 s12, v0
+; GFX6-NEXT: s_addc_u32 s12, s12, 0
+; GFX6-NEXT: s_mul_i32 s13, s11, s13
+; GFX6-NEXT: s_add_u32 s16, s2, s13
+; GFX6-NEXT: v_mov_b32_e32 v0, s16
; GFX6-NEXT: v_mul_hi_u32 v0, s8, v0
-; GFX6-NEXT: s_addc_u32 s19, 0, s13
-; GFX6-NEXT: s_mul_i32 s12, s8, s19
+; GFX6-NEXT: s_addc_u32 s17, 0, s12
+; GFX6-NEXT: s_mul_i32 s12, s8, s17
; GFX6-NEXT: s_mov_b32 s3, 0xf000
; GFX6-NEXT: v_readfirstlane_b32 s13, v0
; GFX6-NEXT: s_add_i32 s12, s13, s12
-; GFX6-NEXT: s_mul_i32 s13, s9, s18
-; GFX6-NEXT: s_add_i32 s20, s12, s13
-; GFX6-NEXT: s_sub_i32 s14, s11, s20
-; GFX6-NEXT: s_mul_i32 s12, s8, s18
+; GFX6-NEXT: s_mul_i32 s13, s9, s16
+; GFX6-NEXT: s_add_i32 s18, s12, s13
+; GFX6-NEXT: s_sub_i32 s19, s11, s18
+; GFX6-NEXT: s_mul_i32 s12, s8, s16
; GFX6-NEXT: s_sub_u32 s10, s10, s12
; GFX6-NEXT: s_cselect_b64 s[12:13], -1, 0
-; GFX6-NEXT: s_or_b32 s15, s12, s13
-; GFX6-NEXT: s_subb_u32 s21, s14, s9
-; GFX6-NEXT: s_sub_u32 s22, s10, s8
-; GFX6-NEXT: s_cselect_b64 s[14:15], -1, 0
-; GFX6-NEXT: s_or_b32 s14, s14, s15
-; GFX6-NEXT: s_subb_u32 s14, s21, 0
-; GFX6-NEXT: s_cmp_ge_u32 s14, s9
-; GFX6-NEXT: s_cselect_b32 s15, -1, 0
-; GFX6-NEXT: s_cmp_ge_u32 s22, s8
+; GFX6-NEXT: s_subb_u32 s19, s19, s9
+; GFX6-NEXT: s_sub_u32 s20, s10, s8
+; GFX6-NEXT: s_subb_u32 s19, s19, 0
+; GFX6-NEXT: s_cmp_ge_u32 s19, s9
; GFX6-NEXT: s_cselect_b32 s21, -1, 0
-; GFX6-NEXT: s_cmp_eq_u32 s14, s9
-; GFX6-NEXT: s_cselect_b32 s14, s21, s15
-; GFX6-NEXT: s_add_u32 s15, s18, 1
-; GFX6-NEXT: s_addc_u32 s21, s19, 0
-; GFX6-NEXT: s_add_u32 s22, s18, 2
-; GFX6-NEXT: s_addc_u32 s23, s19, 0
-; GFX6-NEXT: s_cmp_lg_u32 s14, 0
-; GFX6-NEXT: s_cselect_b32 s14, s22, s15
-; GFX6-NEXT: s_cselect_b32 s15, s23, s21
+; GFX6-NEXT: s_cmp_ge_u32 s20, s8
+; GFX6-NEXT: s_cselect_b32 s20, -1, 0
+; GFX6-NEXT: s_cmp_eq_u32 s19, s9
+; GFX6-NEXT: s_cselect_b32 s19, s20, s21
+; GFX6-NEXT: s_add_u32 s20, s16, 1
+; GFX6-NEXT: s_addc_u32 s21, s17, 0
+; GFX6-NEXT: s_add_u32 s22, s16, 2
+; GFX6-NEXT: s_addc_u32 s23, s17, 0
+; GFX6-NEXT: s_cmp_lg_u32 s19, 0
+; GFX6-NEXT: s_cselect_b32 s19, s22, s20
+; GFX6-NEXT: s_cselect_b32 s20, s23, s21
; GFX6-NEXT: s_or_b32 s12, s12, s13
-; GFX6-NEXT: s_subb_u32 s11, s11, s20
+; GFX6-NEXT: s_subb_u32 s11, s11, s18
; GFX6-NEXT: s_cmp_ge_u32 s11, s9
; GFX6-NEXT: s_cselect_b32 s12, -1, 0
; GFX6-NEXT: s_cmp_ge_u32 s10, s8
@@ -8567,15 +8546,15 @@ define amdgpu_kernel void @sdiv_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x
; GFX6-NEXT: s_cmp_eq_u32 s11, s9
; GFX6-NEXT: s_cselect_b32 s8, s8, s12
; GFX6-NEXT: s_cmp_lg_u32 s8, 0
-; GFX6-NEXT: s_cselect_b32 s9, s15, s19
-; GFX6-NEXT: s_cselect_b32 s8, s14, s18
+; GFX6-NEXT: s_cselect_b32 s9, s20, s17
+; GFX6-NEXT: s_cselect_b32 s8, s19, s16
; GFX6-NEXT: s_xor_b64 s[4:5], s[4:5], s[6:7]
; GFX6-NEXT: s_xor_b64 s[6:7], s[8:9], s[4:5]
; GFX6-NEXT: s_sub_u32 s4, s6, s4
; GFX6-NEXT: s_subb_u32 s5, s7, s5
; GFX6-NEXT: s_mov_b32 s2, -1
-; GFX6-NEXT: v_mov_b32_e32 v0, s16
-; GFX6-NEXT: v_mov_b32_e32 v1, s17
+; GFX6-NEXT: v_mov_b32_e32 v0, s14
+; GFX6-NEXT: v_mov_b32_e32 v1, s15
; GFX6-NEXT: v_mov_b32_e32 v2, s4
; GFX6-NEXT: v_mov_b32_e32 v3, s5
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
@@ -9015,105 +8994,100 @@ define amdgpu_kernel void @srem_i64_pow2_shl_denom(ptr addrspace(1) %out, i64 %x
; GFX6-NEXT: s_xor_b64 s[8:9], s[0:1], s[2:3]
; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s8
; GFX6-NEXT: v_cvt_f32_u32_e32 v1, s9
-; GFX6-NEXT: s_sub_u32 s10, 0, s8
-; GFX6-NEXT: s_subb_u32 s11, 0, s9
+; GFX6-NEXT: s_sub_u32 s0, 0, s8
+; GFX6-NEXT: s_subb_u32 s1, 0, s9
; GFX6-NEXT: s_mov_b32 s3, 0xf000
; GFX6-NEXT: v_madmk_f32 v0, v1, 0x4f800000, v0
; GFX6-NEXT: v_rcp_f32_e32 v0, v0
-; GFX6-NEXT: s_mov_b32 s2, -1
; GFX6-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0
; GFX6-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0
; GFX6-NEXT: v_trunc_f32_e32 v1, v1
; GFX6-NEXT: v_madmk_f32 v0, v1, 0xcf800000, v0
; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0
; GFX6-NEXT: v_cvt_u32_f32_e32 v1, v1
-; GFX6-NEXT: v_mul_hi_u32 v2, s10, v0
-; GFX6-NEXT: v_readfirstlane_b32 s12, v1
-; GFX6-NEXT: v_readfirstlane_b32 s0, v0
-; GFX6-NEXT: s_mul_i32 s1, s10, s12
-; GFX6-NEXT: v_readfirstlane_b32 s15, v2
-; GFX6-NEXT: s_mul_i32 s13, s11, s0
-; GFX6-NEXT: s_mul_i32 s14, s10, s0
-; GFX6-NEXT: s_add_i32 s1, s15, s1
-; GFX6-NEXT: v_mul_hi_u32 v3, v0, s14
-; GFX6-NEXT: s_add_i32 s1, s1, s13
-; GFX6-NEXT: v_mul_hi_u32 v0, v0, s1
-; GFX6-NEXT: v_mul_hi_u32 v4, v1, s14
-; GFX6-NEXT: v_readfirstlane_b32 s13, v3
-; GFX6-NEXT: s_mul_i32 s15, s0, s1
-; GFX6-NEXT: v_mul_hi_u32 v1, v1, s1
-; GFX6-NEXT: s_add_u32 s13, s13, s15
-; GFX6-NEXT: v_readfirstlane_b32 s15, v0
-; GFX6-NEXT: s_addc_u32 s15, 0, s15
-; GFX6-NEXT: s_mul_i32 s14, s12, s14
-; GFX6-NEXT: v_readfirstlane_b32 s16, v4
-; GFX6-NEXT: s_add_u32 s13, s13, s14
-; GFX6-NEXT: s_addc_u32 s13, s15, s16
-; GFX6-NEXT: v_readfirstlane_b32 s14, v1
-; GFX6-NEXT: s_addc_u32 s14, s14, 0
-; GFX6-NEXT: s_mul_i32 s1, s12, s1
-; GFX6-NEXT: s_add_u32 s1, s13, s1
-; GFX6-NEXT: s_addc_u32 s13, 0, s14
-; GFX6-NEXT: s_add_u32 s14, s0, s1
-; GFX6-NEXT: v_mov_b32_e32 v0, s14
-; GFX6-NEXT: v_mul_hi_u32 v0, s10, v0
-; GFX6-NEXT: s_cselect_b64 s[0:1], -1, 0
-; GFX6-NEXT: s_or_b32 s0, s0, s1
-; GFX6-NEXT: s_addc_u32 s12, s12, s13
-; GFX6-NEXT: s_mul_i32 s0, s10, s12
-; GFX6-NEXT: v_readfirstlane_b32 s1, v0
-; GFX6-NEXT: s_add_i32 s0, s1, s0
-; GFX6-NEXT: s_mul_i32 s11, s11, s14
-; GFX6-NEXT: s_mul_i32 s1, s10, s14
-; GFX6-NEXT: s_add_i32 s0, s0, s11
-; GFX6-NEXT: v_mov_b32_e32 v2, s1
-; GFX6-NEXT: v_mov_b32_e32 v0, s0
-; GFX6-NEXT: v_mul_hi_u32 v3, s12, v2
-; GFX6-NEXT: v_mul_hi_u32 v2, s14, v2
-; GFX6-NEXT: v_mul_hi_u32 v1, s12, v0
-; GFX6-NEXT: v_mul_hi_u32 v0, s14, v0
-; GFX6-NEXT: s_mul_i32 s11, s14, s0
-; GFX6-NEXT: v_readfirstlane_b32 s15, v2
-; GFX6-NEXT: s_add_u32 s11, s15, s11
-; GFX6-NEXT: v_readfirstlane_b32 s13, v0
-; GFX6-NEXT: s_mul_i32 s1, s12, s1
-; GFX6-NEXT: s_addc_u32 s13, 0, s13
-; GFX6-NEXT: v_readfirstlane_b32 s10, v3
-; GFX6-NEXT: s_add_u32 s1, s11, s1
-; GFX6-NEXT: s_addc_u32 s1, s13, s10
+; GFX6-NEXT: v_mul_hi_u32 v2, s0, v0
; GFX6-NEXT: v_readfirstlane_b32 s10, v1
-; GFX6-NEXT: s_addc_u32 s10, s10, 0
-; GFX6-NEXT: s_mul_i32 s0, s12, s0
-; GFX6-NEXT: s_add_u32 s0, s1, s0
-; GFX6-NEXT: s_addc_u32 s10, 0, s10
-; GFX6-NEXT: s_add_u32 s13, s14, s0
-; GFX6-NEXT: s_cselect_b64 s[0:1], -1, 0
-; GFX6-NEXT: s_or_b32 s0, s0, s1
-; GFX6-NEXT: s_addc_u32 s12, s12, s10
+; GFX6-NEXT: v_readfirstlane_b32 s2, v0
+; GFX6-NEXT: s_mul_i32 s11, s0, s10
+; GFX6-NEXT: v_readfirstlane_b32 s14, v2
+; GFX6-NEXT: s_mul_i32 s12, s1, s2
+; GFX6-NEXT: s_mul_i32 s13, s0, s2
+; GFX6-NEXT: s_add_i32 s11, s14, s11
+; GFX6-NEXT: v_mul_hi_u32 v3, v0, s13
+; GFX6-NEXT: s_add_i32 s11, s11, s12
+; GFX6-NEXT: v_mul_hi_u32 v0, v0, s11
+; GFX6-NEXT: v_mul_hi_u32 v4, v1, s13
+; GFX6-NEXT: v_readfirstlane_b32 s12, v3
+; GFX6-NEXT: s_mul_i32 s14, s2, s11
+; GFX6-NEXT: v_mul_hi_u32 v1, v1, s11
+; GFX6-NEXT: s_add_u32 s12, s12, s14
+; GFX6-NEXT: v_readfirstlane_b32 s14, v0
+; GFX6-NEXT: s_mul_i32 s13, s10, s13
+; GFX6-NEXT: s_addc_u32 s14, 0, s14
+; GFX6-NEXT: v_readfirstlane_b32 s15, v4
+; GFX6-NEXT: s_add_u32 s12, s12, s13
+; GFX6-NEXT: s_addc_u32 s12, s14, s15
+; GFX6-NEXT: v_readfirstlane_b32 s13, v1
+; GFX6-NEXT: s_addc_u32 s13, s13, 0
+; GFX6-NEXT: s_mul_i32 s11, s10, s11
+; GFX6-NEXT: s_add_u32 s11, s12, s11
+; GFX6-NEXT: s_addc_u32 s12, 0, s13
+; GFX6-NEXT: s_add_u32 s11, s2, s11
+; GFX6-NEXT: v_mov_b32_e32 v0, s11
+; GFX6-NEXT: v_mul_hi_u32 v0, s0, v0
+; GFX6-NEXT: s_addc_u32 s10, s10, s12
+; GFX6-NEXT: s_mul_i32 s12, s0, s10
+; GFX6-NEXT: s_mul_i32 s1, s1, s11
+; GFX6-NEXT: v_readfirstlane_b32 s13, v0
+; GFX6-NEXT: s_add_i32 s12, s13, s12
+; GFX6-NEXT: s_mul_i32 s0, s0, s11
+; GFX6-NEXT: s_add_i32 s1, s12, s1
+; GFX6-NEXT: v_mov_b32_e32 v2, s0
+; GFX6-NEXT: v_mov_b32_e32 v0, s1
+; GFX6-NEXT: v_mul_hi_u32 v3, s10, v2
+; GFX6-NEXT: v_mul_hi_u32 v2, s11, v2
+; GFX6-NEXT: v_mul_hi_u32 v1, s10, v0
+; GFX6-NEXT: v_mul_hi_u32 v0, s11, v0
+; GFX6-NEXT: s_mul_i32 s13, s11, s1
+; GFX6-NEXT: v_readfirstlane_b32 s15, v2
+; GFX6-NEXT: s_add_u32 s13, s15, s13
+; GFX6-NEXT: v_readfirstlane_b32 s14, v0
+; GFX6-NEXT: s_mul_i32 s0, s10, s0
+; GFX6-NEXT: s_addc_u32 s14, 0, s14
+; GFX6-NEXT: v_readfirstlane_b32 s12, v3
+; GFX6-NEXT: s_add_u32 s0, s13, s0
+; GFX6-NEXT: s_addc_u32 s0, s14, s12
+; GFX6-NEXT: v_readfirstlane_b32 s12, v1
+; GFX6-NEXT: s_addc_u32 s12, s12, 0
+; GFX6-NEXT: s_mul_i32 s1, s10, s1
+; GFX6-NEXT: s_add_u32 s0, s0, s1
+; GFX6-NEXT: s_addc_u32 s1, 0, s12
+; GFX6-NEXT: s_add_u32 s12, s11, s0
+; GFX6-NEXT: s_addc_u32 s13, s10, s1
; GFX6-NEXT: s_ashr_i32 s10, s7, 31
; GFX6-NEXT: s_add_u32 s0, s6, s10
; GFX6-NEXT: s_mov_b32 s11, s10
; GFX6-NEXT: s_addc_u32 s1, s7, s10
; GFX6-NEXT: s_xor_b64 s[6:7], s[0:1], s[10:11]
-; GFX6-NEXT: v_mov_b32_e32 v0, s12
+; GFX6-NEXT: v_mov_b32_e32 v0, s13
; GFX6-NEXT: v_mul_hi_u32 v1, s6, v0
-; GFX6-NEXT: v_mov_b32_e32 v2, s13
+; GFX6-NEXT: v_mov_b32_e32 v2, s12
; GFX6-NEXT: v_mul_hi_u32 v3, s6, v2
; GFX6-NEXT: s_mov_b32 s0, s4
; GFX6-NEXT: v_readfirstlane_b32 s4, v1
; GFX6-NEXT: v_mul_hi_u32 v1, s7, v2
-; GFX6-NEXT: s_mul_i32 s1, s6, s12
+; GFX6-NEXT: s_mul_i32 s1, s6, s13
; GFX6-NEXT: v_readfirstlane_b32 s14, v3
; GFX6-NEXT: v_mul_hi_u32 v0, s7, v0
; GFX6-NEXT: s_add_u32 s1, s14, s1
; GFX6-NEXT: s_addc_u32 s4, 0, s4
-; GFX6-NEXT: s_mul_i32 s13, s7, s13
+; GFX6-NEXT: s_mul_i32 s12, s7, s12
; GFX6-NEXT: v_readfirstlane_b32 s14, v1
-; GFX6-NEXT: s_add_u32 s1, s1, s13
+; GFX6-NEXT: s_add_u32 s1, s1, s12
; GFX6-NEXT: s_addc_u32 s1, s4, s14
; GFX6-NEXT: v_readfirstlane_b32 s4, v0
; GFX6-NEXT: s_addc_u32 s4, s4, 0
-; GFX6-NEXT: s_mul_i32 s12, s7, s12
+; GFX6-NEXT: s_mul_i32 s12, s7, s13
; GFX6-NEXT: s_add_u32 s12, s1, s12
; GFX6-NEXT: v_mov_b32_e32 v0, s12
; GFX6-NEXT: v_mul_hi_u32 v0, s8, v0
@@ -9128,11 +9102,9 @@ define amdgpu_kernel void @srem_i64_pow2_shl_denom(ptr addrspace(1) %out, i64 %x
; GFX6-NEXT: s_mul_i32 s4, s8, s12
; GFX6-NEXT: s_sub_u32 s6, s6, s4
; GFX6-NEXT: s_cselect_b64 s[4:5], -1, 0
-; GFX6-NEXT: s_or_b32 s12, s4, s5
; GFX6-NEXT: s_subb_u32 s15, s13, s9
; GFX6-NEXT: s_sub_u32 s16, s6, s8
; GFX6-NEXT: s_cselect_b64 s[12:13], -1, 0
-; GFX6-NEXT: s_or_b32 s17, s12, s13
; GFX6-NEXT: s_subb_u32 s17, s15, 0
; GFX6-NEXT: s_cmp_ge_u32 s17, s9
; GFX6-NEXT: s_cselect_b32 s18, -1, 0
@@ -9141,13 +9113,11 @@ define amdgpu_kernel void @srem_i64_pow2_shl_denom(ptr addrspace(1) %out, i64 %x
; GFX6-NEXT: s_cmp_eq_u32 s17, s9
; GFX6-NEXT: s_cselect_b32 s18, s19, s18
; GFX6-NEXT: s_or_b32 s12, s12, s13
-; GFX6-NEXT: s_subb_u32 s15, s15, s9
-; GFX6-NEXT: s_sub_u32 s19, s16, s8
-; GFX6-NEXT: s_cselect_b64 s[12:13], -1, 0
-; GFX6-NEXT: s_or_b32 s12, s12, s13
-; GFX6-NEXT: s_subb_u32 s12, s15, 0
+; GFX6-NEXT: s_subb_u32 s12, s15, s9
+; GFX6-NEXT: s_sub_u32 s13, s16, s8
+; GFX6-NEXT: s_subb_u32 s12, s12, 0
; GFX6-NEXT: s_cmp_lg_u32 s18, 0
-; GFX6-NEXT: s_cselect_b32 s13, s19, s16
+; GFX6-NEXT: s_cselect_b32 s13, s13, s16
; GFX6-NEXT: s_cselect_b32 s12, s12, s17
; GFX6-NEXT: s_or_b32 s4, s4, s5
; GFX6-NEXT: s_subb_u32 s4, s7, s14
@@ -9164,6 +9134,7 @@ define amdgpu_kernel void @srem_i64_pow2_shl_denom(ptr addrspace(1) %out, i64 %x
; GFX6-NEXT: s_sub_u32 s4, s4, s10
; GFX6-NEXT: s_subb_u32 s5, s5, s10
; GFX6-NEXT: v_mov_b32_e32 v0, s4
+; GFX6-NEXT: s_mov_b32 s2, -1
; GFX6-NEXT: v_mov_b32_e32 v1, s5
; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX6-NEXT: s_endpgm
@@ -9405,8 +9376,8 @@ define amdgpu_kernel void @srem_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x
; GFX6-NEXT: s_xor_b64 s[2:3], s[2:3], s[6:7]
; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s2
; GFX6-NEXT: v_cvt_f32_u32_e32 v1, s3
-; GFX6-NEXT: s_sub_u32 s12, 0, s2
-; GFX6-NEXT: s_subb_u32 s13, 0, s3
+; GFX6-NEXT: s_sub_u32 s6, 0, s2
+; GFX6-NEXT: s_subb_u32 s7, 0, s3
; GFX6-NEXT: v_mac_f32_e32 v0, 0x4f800000, v1
; GFX6-NEXT: v_rcp_f32_e32 v0, v0
; GFX6-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0
@@ -9415,69 +9386,65 @@ define amdgpu_kernel void @srem_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x
; GFX6-NEXT: v_mac_f32_e32 v0, 0xcf800000, v1
; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0
; GFX6-NEXT: v_cvt_u32_f32_e32 v1, v1
-; GFX6-NEXT: v_mul_hi_u32 v2, s12, v0
-; GFX6-NEXT: v_readfirstlane_b32 s14, v1
-; GFX6-NEXT: v_readfirstlane_b32 s6, v0
-; GFX6-NEXT: s_mul_i32 s7, s12, s14
+; GFX6-NEXT: v_mul_hi_u32 v2, s6, v0
+; GFX6-NEXT: v_readfirstlane_b32 s12, v1
+; GFX6-NEXT: v_readfirstlane_b32 s13, v0
+; GFX6-NEXT: s_mul_i32 s14, s6, s12
; GFX6-NEXT: v_readfirstlane_b32 s17, v2
-; GFX6-NEXT: s_mul_i32 s15, s13, s6
-; GFX6-NEXT: s_mul_i32 s16, s12, s6
-; GFX6-NEXT: s_add_i32 s7, s17, s7
+; GFX6-NEXT: s_mul_i32 s15, s7, s13
+; GFX6-NEXT: s_mul_i32 s16, s6, s13
+; GFX6-NEXT: s_add_i32 s14, s17, s14
; GFX6-NEXT: v_mul_hi_u32 v3, v0, s16
-; GFX6-NEXT: s_add_i32 s7, s7, s15
-; GFX6-NEXT: v_mul_hi_u32 v0, v0, s7
+; GFX6-NEXT: s_add_i32 s14, s14, s15
+; GFX6-NEXT: v_mul_hi_u32 v0, v0, s14
; GFX6-NEXT: v_mul_hi_u32 v4, v1, s16
; GFX6-NEXT: v_readfirstlane_b32 s15, v3
-; GFX6-NEXT: s_mul_i32 s18, s6, s7
-; GFX6-NEXT: v_mul_hi_u32 v1, v1, s7
+; GFX6-NEXT: s_mul_i32 s18, s13, s14
+; GFX6-NEXT: v_mul_hi_u32 v1, v1, s14
; GFX6-NEXT: s_add_u32 s15, s15, s18
; GFX6-NEXT: v_readfirstlane_b32 s18, v0
-; GFX6-NEXT: s_mul_i32 s16, s14, s16
+; GFX6-NEXT: s_mul_i32 s16, s12, s16
; GFX6-NEXT: s_addc_u32 s18, 0, s18
; GFX6-NEXT: v_readfirstlane_b32 s17, v4
; GFX6-NEXT: s_add_u32 s15, s15, s16
; GFX6-NEXT: s_addc_u32 s15, s18, s17
; GFX6-NEXT: v_readfirstlane_b32 s16, v1
; GFX6-NEXT: s_addc_u32 s16, s16, 0
-; GFX6-NEXT: s_mul_i32 s7, s14, s7
-; GFX6-NEXT: s_add_u32 s7, s15, s7
+; GFX6-NEXT: s_mul_i32 s14, s12, s14
+; GFX6-NEXT: s_add_u32 s14, s15, s14
; GFX6-NEXT: s_addc_u32 s15, 0, s16
-; GFX6-NEXT: s_add_u32 s16, s6, s7
-; GFX6-NEXT: v_mov_b32_e32 v0, s16
-; GFX6-NEXT: v_mul_hi_u32 v0, s12, v0
-; GFX6-NEXT: s_cselect_b64 s[6:7], -1, 0
-; GFX6-NEXT: s_or_b32 s6, s6, s7
-; GFX6-NEXT: s_addc_u32 s14, s14, s15
-; GFX6-NEXT: s_mul_i32 s6, s12, s14
-; GFX6-NEXT: v_readfirstlane_b32 s7, v0
-; GFX6-NEXT: s_add_i32 s6, s7, s6
-; GFX6-NEXT: s_mul_i32 s13, s13, s16
-; GFX6-NEXT: s_mul_i32 s7, s12, s16
-; GFX6-NEXT: s_add_i32 s6, s6, s13
-; GFX6-NEXT: v_mov_b32_e32 v2, s7
-; GFX6-NEXT: v_mov_b32_e32 v0, s6
-; GFX6-NEXT: v_mul_hi_u32 v3, s14, v2
-; GFX6-NEXT: v_mul_hi_u32 v2, s16, v2
-; GFX6-NEXT: v_mul_hi_u32 v1, s14, v0
-; GFX6-NEXT: v_mul_hi_u32 v0, s16, v0
-; GFX6-NEXT: s_mul_i32 s13, s16, s6
-; GFX6-NEXT: v_readfirstlane_b32 s17, v2
-; GFX6-NEXT: s_add_u32 s13, s17, s13
+; GFX6-NEXT: s_add_u32 s13, s13, s14
+; GFX6-NEXT: v_mov_b32_e32 v0, s13
+; GFX6-NEXT: v_mul_hi_u32 v0, s6, v0
+; GFX6-NEXT: s_addc_u32 s12, s12, s15
+; GFX6-NEXT: s_mul_i32 s14, s6, s12
+; GFX6-NEXT: s_mul_i32 s7, s7, s13
; GFX6-NEXT: v_readfirstlane_b32 s15, v0
-; GFX6-NEXT: s_mul_i32 s7, s14, s7
-; GFX6-NEXT: s_addc_u32 s15, 0, s15
-; GFX6-NEXT: v_readfirstlane_b32 s12, v3
-; GFX6-NEXT: s_add_u32 s7, s13, s7
-; GFX6-NEXT: s_addc_u32 s7, s15, s12
-; GFX6-NEXT: v_readfirstlane_b32 s12, v1
-; GFX6-NEXT: s_addc_u32 s12, s12, 0
-; GFX6-NEXT: s_mul_i32 s6, s14, s6
-; GFX6-NEXT: s_add_u32 s6, s7, s6
-; GFX6-NEXT: s_addc_u32 s12, 0, s12
-; GFX6-NEXT: s_add_u32 s13, s16, s6
-; GFX6-NEXT: s_cselect_b64 s[6:7], -1, 0
-; GFX6-NEXT: s_or_b32 s6, s6, s7
-; GFX6-NEXT: s_addc_u32 s12, s14, s12
+; GFX6-NEXT: s_add_i32 s14, s15, s14
+; GFX6-NEXT: s_mul_i32 s6, s6, s13
+; GFX6-NEXT: s_add_i32 s7, s14, s7
+; GFX6-NEXT: v_mov_b32_e32 v2, s6
+; GFX6-NEXT: v_mov_b32_e32 v0, s7
+; GFX6-NEXT: v_mul_hi_u32 v3, s12, v2
+; GFX6-NEXT: v_mul_hi_u32 v2, s13, v2
+; GFX6-NEXT: v_mul_hi_u32 v1, s12, v0
+; GFX6-NEXT: v_mul_hi_u32 v0, s13, v0
+; GFX6-NEXT: s_mul_i32 s15, s13, s7
+; GFX6-NEXT: v_readfirstlane_b32 s17, v2
+; GFX6-NEXT: s_add_u32 s15, s17, s15
+; GFX6-NEXT: v_readfirstlane_b32 s16, v0
+; GFX6-NEXT: s_mul_i32 s6, s12, s6
+; GFX6-NEXT: s_addc_u32 s16, 0, s16
+; GFX6-NEXT: v_readfirstlane_b32 s14, v3
+; GFX6-NEXT: s_add_u32 s6, s15, s6
+; GFX6-NEXT: s_addc_u32 s6, s16, s14
+; GFX6-NEXT: v_readfirstlane_b32 s14, v1
+; GFX6-NEXT: s_addc_u32 s14, s14, 0
+; GFX6-NEXT: s_mul_i32 s7, s12, s7
+; GFX6-NEXT: s_add_u32 s6, s6, s7
+; GFX6-NEXT: s_addc_u32 s7, 0, s14
+; GFX6-NEXT: s_add_u32 s13, s13, s6
+; GFX6-NEXT: s_addc_u32 s12, s12, s7
; GFX6-NEXT: s_ashr_i32 s6, s9, 31
; GFX6-NEXT: s_add_u32 s8, s8, s6
; GFX6-NEXT: s_mov_b32 s7, s6
@@ -9514,11 +9481,9 @@ define amdgpu_kernel void @srem_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x
; GFX6-NEXT: s_mul_i32 s12, s2, s12
; GFX6-NEXT: s_sub_u32 s8, s8, s12
; GFX6-NEXT: s_cselect_b64 s[12:13], -1, 0
-; GFX6-NEXT: s_or_b32 s15, s12, s13
; GFX6-NEXT: s_subb_u32 s17, s14, s3
; GFX6-NEXT: s_sub_u32 s18, s8, s2
; GFX6-NEXT: s_cselect_b64 s[14:15], -1, 0
-; GFX6-NEXT: s_or_b32 s19, s14, s15
; GFX6-NEXT: s_subb_u32 s19, s17, 0
; GFX6-NEXT: s_cmp_ge_u32 s19, s3
; GFX6-NEXT: s_cselect_b32 s20, -1, 0
@@ -9527,13 +9492,11 @@ define amdgpu_kernel void @srem_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x
; GFX6-NEXT: s_cmp_eq_u32 s19, s3
; GFX6-NEXT: s_cselect_b32 s20, s21, s20
; GFX6-NEXT: s_or_b32 s14, s14, s15
-; GFX6-NEXT: s_subb_u32 s17, s17, s3
-; GFX6-NEXT: s_sub_u32 s21, s18, s2
-; GFX6-NEXT: s_cselect_b64 s[14:15], -1, 0
-; GFX6-NEXT: s_or_b32 s14, s14, s15
-; GFX6-NEXT: s_subb_u32 s14, s17, 0
+; GFX6-NEXT: s_subb_u32 s14, s17, s3
+; GFX6-NEXT: s_sub_u32 s15, s18, s2
+; GFX6-NEXT: s_subb_u32 s14, s14, 0
; GFX6-NEXT: s_cmp_lg_u32 s20, 0
-; GFX6-NEXT: s_cselect_b32 s15, s21, s18
+; GFX6-NEXT: s_cselect_b32 s15, s15, s18
; GFX6-NEXT: s_cselect_b32 s14, s14, s19
; GFX6-NEXT: s_or_b32 s12, s12, s13
; GFX6-NEXT: s_subb_u32 s9, s9, s16
@@ -9556,8 +9519,8 @@ define amdgpu_kernel void @srem_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x
; GFX6-NEXT: s_xor_b64 s[6:7], s[0:1], s[2:3]
; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s6
; GFX6-NEXT: v_cvt_f32_u32_e32 v1, s7
-; GFX6-NEXT: s_sub_u32 s8, 0, s6
-; GFX6-NEXT: s_subb_u32 s9, 0, s7
+; GFX6-NEXT: s_sub_u32 s2, 0, s6
+; GFX6-NEXT: s_subb_u32 s3, 0, s7
; GFX6-NEXT: v_mac_f32_e32 v0, 0x4f800000, v1
; GFX6-NEXT: v_rcp_f32_e32 v0, v0
; GFX6-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0
@@ -9566,70 +9529,66 @@ define amdgpu_kernel void @srem_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x
; GFX6-NEXT: v_mac_f32_e32 v0, 0xcf800000, v1
; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0
; GFX6-NEXT: v_cvt_u32_f32_e32 v1, v1
-; GFX6-NEXT: v_mul_hi_u32 v2, s8, v0
-; GFX6-NEXT: v_readfirstlane_b32 s12, v1
-; GFX6-NEXT: v_readfirstlane_b32 s2, v0
-; GFX6-NEXT: s_mul_i32 s1, s8, s12
-; GFX6-NEXT: v_readfirstlane_b32 s3, v2
-; GFX6-NEXT: s_mul_i32 s0, s9, s2
-; GFX6-NEXT: s_add_i32 s1, s3, s1
-; GFX6-NEXT: s_add_i32 s3, s1, s0
-; GFX6-NEXT: s_mul_i32 s13, s8, s2
-; GFX6-NEXT: v_mul_hi_u32 v2, v0, s3
-; GFX6-NEXT: v_mul_hi_u32 v0, v0, s13
-; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
-; GFX6-NEXT: s_mul_i32 s4, s2, s3
-; GFX6-NEXT: v_readfirstlane_b32 s5, v2
+; GFX6-NEXT: v_mul_hi_u32 v2, s2, v0
+; GFX6-NEXT: v_readfirstlane_b32 s8, v1
+; GFX6-NEXT: v_readfirstlane_b32 s0, v0
+; GFX6-NEXT: s_mul_i32 s9, s2, s8
+; GFX6-NEXT: v_readfirstlane_b32 s12, v2
+; GFX6-NEXT: s_mul_i32 s1, s3, s0
+; GFX6-NEXT: s_add_i32 s9, s12, s9
+; GFX6-NEXT: s_add_i32 s9, s9, s1
+; GFX6-NEXT: s_mul_i32 s1, s2, s0
+; GFX6-NEXT: v_mul_hi_u32 v2, v0, s9
+; GFX6-NEXT: v_mul_hi_u32 v0, v0, s1
+; GFX6-NEXT: s_mul_i32 s12, s0, s9
+; GFX6-NEXT: v_readfirstlane_b32 s13, v2
; GFX6-NEXT: v_readfirstlane_b32 s16, v0
-; GFX6-NEXT: v_mul_hi_u32 v0, v1, s13
-; GFX6-NEXT: v_mul_hi_u32 v1, v1, s3
-; GFX6-NEXT: s_add_u32 s4, s16, s4
-; GFX6-NEXT: s_addc_u32 s5, 0, s5
-; GFX6-NEXT: s_mul_i32 s13, s12, s13
+; GFX6-NEXT: v_mul_hi_u32 v0, v1, s1
+; GFX6-NEXT: v_mul_hi_u32 v1, v1, s9
+; GFX6-NEXT: s_add_u32 s12, s16, s12
+; GFX6-NEXT: s_addc_u32 s13, 0, s13
+; GFX6-NEXT: s_mul_i32 s1, s8, s1
; GFX6-NEXT: v_readfirstlane_b32 s16, v0
-; GFX6-NEXT: s_add_u32 s4, s4, s13
-; GFX6-NEXT: s_addc_u32 s4, s5, s16
-; GFX6-NEXT: v_readfirstlane_b32 s5, v1
-; GFX6-NEXT: s_addc_u32 s5, s5, 0
-; GFX6-NEXT: s_mul_i32 s3, s12, s3
-; GFX6-NEXT: s_add_u32 s3, s4, s3
-; GFX6-NEXT: s_addc_u32 s4, 0, s5
-; GFX6-NEXT: s_add_u32 s5, s2, s3
-; GFX6-NEXT: v_mov_b32_e32 v0, s5
-; GFX6-NEXT: v_mul_hi_u32 v0, s8, v0
-; GFX6-NEXT: s_cselect_b64 s[2:3], -1, 0
-; GFX6-NEXT: s_or_b32 s2, s2, s3
-; GFX6-NEXT: s_addc_u32 s4, s12, s4
-; GFX6-NEXT: s_mul_i32 s2, s8, s4
-; GFX6-NEXT: v_readfirstlane_b32 s3, v0
-; GFX6-NEXT: s_add_i32 s2, s3, s2
-; GFX6-NEXT: s_mul_i32 s9, s9, s5
-; GFX6-NEXT: s_mul_i32 s3, s8, s5
-; GFX6-NEXT: s_add_i32 s2, s2, s9
-; GFX6-NEXT: v_mov_b32_e32 v2, s3
-; GFX6-NEXT: v_mov_b32_e32 v0, s2
+; GFX6-NEXT: s_add_u32 s1, s12, s1
+; GFX6-NEXT: s_addc_u32 s1, s13, s16
+; GFX6-NEXT: v_readfirstlane_b32 s12, v1
+; GFX6-NEXT: s_addc_u32 s12, s12, 0
+; GFX6-NEXT: s_mul_i32 s9, s8, s9
+; GFX6-NEXT: s_add_u32 s1, s1, s9
+; GFX6-NEXT: s_addc_u32 s9, 0, s12
+; GFX6-NEXT: s_add_u32 s12, s0, s1
+; GFX6-NEXT: v_mov_b32_e32 v0, s12
+; GFX6-NEXT: v_mul_hi_u32 v0, s2, v0
+; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9
+; GFX6-NEXT: s_addc_u32 s4, s8, s9
+; GFX6-NEXT: s_mul_i32 s5, s2, s4
+; GFX6-NEXT: v_readfirstlane_b32 s8, v0
+; GFX6-NEXT: s_add_i32 s5, s8, s5
+; GFX6-NEXT: s_mul_i32 s3, s3, s12
+; GFX6-NEXT: s_mul_i32 s2, s2, s12
+; GFX6-NEXT: s_add_i32 s3, s5, s3
+; GFX6-NEXT: v_mov_b32_e32 v2, s2
+; GFX6-NEXT: v_mov_b32_e32 v0, s3
; GFX6-NEXT: v_mul_hi_u32 v3, s4, v2
-; GFX6-NEXT: v_mul_hi_u32 v2, s5, v2
+; GFX6-NEXT: v_mul_hi_u32 v2, s12, v2
; GFX6-NEXT: v_mul_hi_u32 v1, s4, v0
-; GFX6-NEXT: v_mul_hi_u32 v0, s5, v0
-; GFX6-NEXT: s_mul_i32 s9, s5, s2
+; GFX6-NEXT: v_mul_hi_u32 v0, s12, v0
+; GFX6-NEXT: s_mul_i32 s8, s12, s3
; GFX6-NEXT: v_readfirstlane_b32 s13, v2
-; GFX6-NEXT: s_add_u32 s9, s13, s9
-; GFX6-NEXT: v_readfirstlane_b32 s12, v0
-; GFX6-NEXT: s_mul_i32 s3, s4, s3
-; GFX6-NEXT: s_addc_u32 s12, 0, s12
-; GFX6-NEXT: v_readfirstlane_b32 s8, v3
-; GFX6-NEXT: s_add_u32 s3, s9, s3
-; GFX6-NEXT: s_addc_u32 s3, s12, s8
-; GFX6-NEXT: v_readfirstlane_b32 s8, v1
-; GFX6-NEXT: s_addc_u32 s8, s8, 0
+; GFX6-NEXT: s_add_u32 s8, s13, s8
+; GFX6-NEXT: v_readfirstlane_b32 s9, v0
; GFX6-NEXT: s_mul_i32 s2, s4, s2
-; GFX6-NEXT: s_add_u32 s2, s3, s2
-; GFX6-NEXT: s_addc_u32 s8, 0, s8
-; GFX6-NEXT: s_add_u32 s12, s5, s2
-; GFX6-NEXT: s_cselect_b64 s[2:3], -1, 0
-; GFX6-NEXT: s_or_b32 s2, s2, s3
-; GFX6-NEXT: s_addc_u32 s13, s4, s8
+; GFX6-NEXT: s_addc_u32 s9, 0, s9
+; GFX6-NEXT: v_readfirstlane_b32 s5, v3
+; GFX6-NEXT: s_add_u32 s2, s8, s2
+; GFX6-NEXT: s_addc_u32 s2, s9, s5
+; GFX6-NEXT: v_readfirstlane_b32 s5, v1
+; GFX6-NEXT: s_addc_u32 s5, s5, 0
+; GFX6-NEXT: s_mul_i32 s3, s4, s3
+; GFX6-NEXT: s_add_u32 s2, s2, s3
+; GFX6-NEXT: s_addc_u32 s3, 0, s5
+; GFX6-NEXT: s_add_u32 s12, s12, s2
+; GFX6-NEXT: s_addc_u32 s13, s4, s3
; GFX6-NEXT: s_ashr_i32 s4, s11, 31
; GFX6-NEXT: s_add_u32 s2, s10, s4
; GFX6-NEXT: s_mov_b32 s5, s4
@@ -9667,11 +9626,9 @@ define amdgpu_kernel void @srem_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x
; GFX6-NEXT: s_mul_i32 s10, s6, s11
; GFX6-NEXT: s_sub_u32 s8, s8, s10
; GFX6-NEXT: s_cselect_b64 s[10:11], -1, 0
-; GFX6-NEXT: s_or_b32 s13, s10, s11
; GFX6-NEXT: s_subb_u32 s17, s12, s7
; GFX6-NEXT: s_sub_u32 s18, s8, s6
; GFX6-NEXT: s_cselect_b64 s[12:13], -1, 0
-; GFX6-NEXT: s_or_b32 s19, s12, s13
; GFX6-NEXT: s_subb_u32 s19, s17, 0
; GFX6-NEXT: s_cmp_ge_u32 s19, s7
; GFX6-NEXT: s_cselect_b32 s20, -1, 0
@@ -9680,13 +9637,11 @@ define amdgpu_kernel void @srem_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x
; GFX6-NEXT: s_cmp_eq_u32 s19, s7
; GFX6-NEXT: s_cselect_b32 s20, s21, s20
; GFX6-NEXT: s_or_b32 s12, s12, s13
-; GFX6-NEXT: s_subb_u32 s17, s17, s7
-; GFX6-NEXT: s_sub_u32 s21, s18, s6
-; GFX6-NEXT: s_cselect_b64 s[12:13], -1, 0
-; GFX6-NEXT: s_or_b32 s12, s12, s13
-; GFX6-NEXT: s_subb_u32 s12, s17, 0
+; GFX6-NEXT: s_subb_u32 s12, s17, s7
+; GFX6-NEXT: s_sub_u32 s13, s18, s6
+; GFX6-NEXT: s_subb_u32 s12, s12, 0
; GFX6-NEXT: s_cmp_lg_u32 s20, 0
-; GFX6-NEXT: s_cselect_b32 s13, s21, s18
+; GFX6-NEXT: s_cselect_b32 s13, s13, s18
; GFX6-NEXT: s_cselect_b32 s12, s12, s19
; GFX6-NEXT: s_or_b32 s10, s10, s11
; GFX6-NEXT: s_subb_u32 s9, s9, s16
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-uniform-waterfall.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-uniform-waterfall.ll
index c962c05..5d79696 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-uniform-waterfall.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-uniform-waterfall.ll
@@ -239,7 +239,8 @@ define protected amdgpu_kernel void @trivial_uniform_waterfall(ptr addrspace(1)
; PASS-CHECK-NEXT: [[IS_DONE:%.*]] = icmp eq i64 [[BALLOT]], 0
; PASS-CHECK-NEXT: br i1 [[TMP0]], label %[[EXIT:.*]], label %[[IF:.*]]
; PASS-CHECK: [[IF]]:
-; PASS-CHECK-NEXT: [[IS_FIRST_ACTIVE_ID:%.*]] = icmp eq i32 0, 0
+; PASS-CHECK-NEXT: [[FIRST_ACTIVE_ID:%.*]] = tail call noundef i32 @llvm.amdgcn.readfirstlane.i32(i32 0)
+; PASS-CHECK-NEXT: [[IS_FIRST_ACTIVE_ID:%.*]] = icmp eq i32 0, [[FIRST_ACTIVE_ID]]
; PASS-CHECK-NEXT: br i1 [[IS_FIRST_ACTIVE_ID]], label %[[WORK:.*]], label %[[TAIL]]
; PASS-CHECK: [[WORK]]:
; PASS-CHECK-NEXT: store i32 5, ptr addrspace(1) [[OUT]], align 4
@@ -308,7 +309,8 @@ define protected amdgpu_kernel void @uniform_waterfall(ptr addrspace(1) %out, i3
; PASS-CHECK-NEXT: [[IS_DONE:%.*]] = icmp eq i64 [[BALLOT]], 0
; PASS-CHECK-NEXT: br i1 [[TMP0]], label %[[EXIT:.*]], label %[[IF:.*]]
; PASS-CHECK: [[IF]]:
-; PASS-CHECK-NEXT: [[IS_FIRST_ACTIVE_ID:%.*]] = icmp eq i32 [[MYMASK]], [[MYMASK]]
+; PASS-CHECK-NEXT: [[FIRST_ACTIVE_ID:%.*]] = tail call noundef i32 @llvm.amdgcn.readfirstlane.i32(i32 [[MYMASK]])
+; PASS-CHECK-NEXT: [[IS_FIRST_ACTIVE_ID:%.*]] = icmp eq i32 [[MYMASK]], [[FIRST_ACTIVE_ID]]
; PASS-CHECK-NEXT: br i1 [[IS_FIRST_ACTIVE_ID]], label %[[WORK:.*]], label %[[TAIL]]
; PASS-CHECK: [[WORK]]:
; PASS-CHECK-NEXT: store i32 5, ptr addrspace(1) [[OUT]], align 4
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-uniform-intrinsic-combine.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-uniform-intrinsic-combine.ll
index a7e828c..402ccd9 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgpu-uniform-intrinsic-combine.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-uniform-intrinsic-combine.ll
@@ -248,12 +248,14 @@ define amdgpu_kernel void @readfirstlane_constant(ptr addrspace(1) %out) {
;
; PASS-CHECK-LABEL: define amdgpu_kernel void @readfirstlane_constant(
; PASS-CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] {
-; PASS-CHECK-NEXT: store i32 7, ptr addrspace(1) [[OUT]], align 4
+; PASS-CHECK-NEXT: [[V:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 7)
+; PASS-CHECK-NEXT: store i32 [[V]], ptr addrspace(1) [[OUT]], align 4
; PASS-CHECK-NEXT: ret void
;
; DCE-CHECK-LABEL: define amdgpu_kernel void @readfirstlane_constant(
; DCE-CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] {
-; DCE-CHECK-NEXT: store i32 7, ptr addrspace(1) [[OUT]], align 4
+; DCE-CHECK-NEXT: [[V:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 7)
+; DCE-CHECK-NEXT: store i32 [[V]], ptr addrspace(1) [[OUT]], align 4
; DCE-CHECK-NEXT: ret void
;
%v = call i32 @llvm.amdgcn.readfirstlane(i32 7)
@@ -269,12 +271,14 @@ define amdgpu_kernel void @readfirstlane_with_argument(ptr addrspace(1) %out, i3
;
; PASS-CHECK-LABEL: define amdgpu_kernel void @readfirstlane_with_argument(
; PASS-CHECK-SAME: ptr addrspace(1) [[OUT:%.*]], i32 [[SRC0:%.*]]) #[[ATTR0]] {
-; PASS-CHECK-NEXT: store i32 [[SRC0]], ptr addrspace(1) [[OUT]], align 4
+; PASS-CHECK-NEXT: [[V:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[SRC0]])
+; PASS-CHECK-NEXT: store i32 [[V]], ptr addrspace(1) [[OUT]], align 4
; PASS-CHECK-NEXT: ret void
;
; DCE-CHECK-LABEL: define amdgpu_kernel void @readfirstlane_with_argument(
; DCE-CHECK-SAME: ptr addrspace(1) [[OUT:%.*]], i32 [[SRC0:%.*]]) #[[ATTR0]] {
-; DCE-CHECK-NEXT: store i32 [[SRC0]], ptr addrspace(1) [[OUT]], align 4
+; DCE-CHECK-NEXT: [[V:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[SRC0]])
+; DCE-CHECK-NEXT: store i32 [[V]], ptr addrspace(1) [[OUT]], align 4
; DCE-CHECK-NEXT: ret void
;
%v = call i32 @llvm.amdgcn.readfirstlane(i32 %src0)
@@ -360,12 +364,16 @@ define amdgpu_kernel void @readfirstlane_with_readfirstlane(ptr addrspace(1) %ou
;
; PASS-CHECK-LABEL: define amdgpu_kernel void @readfirstlane_with_readfirstlane(
; PASS-CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] {
-; PASS-CHECK-NEXT: store i32 5, ptr addrspace(1) [[OUT]], align 4
+; PASS-CHECK-NEXT: [[V1:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 5)
+; PASS-CHECK-NEXT: [[V2:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[V1]])
+; PASS-CHECK-NEXT: store i32 [[V2]], ptr addrspace(1) [[OUT]], align 4
; PASS-CHECK-NEXT: ret void
;
; DCE-CHECK-LABEL: define amdgpu_kernel void @readfirstlane_with_readfirstlane(
; DCE-CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] {
-; DCE-CHECK-NEXT: store i32 5, ptr addrspace(1) [[OUT]], align 4
+; DCE-CHECK-NEXT: [[V1:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 5)
+; DCE-CHECK-NEXT: [[V2:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[V1]])
+; DCE-CHECK-NEXT: store i32 [[V2]], ptr addrspace(1) [[OUT]], align 4
; DCE-CHECK-NEXT: ret void
;
%v1 = call i32 @llvm.amdgcn.readfirstlane(i32 5)
@@ -388,7 +396,8 @@ define amdgpu_kernel void @readfirstlane_with_readlane(ptr addrspace(1) %out) {
; PASS-CHECK-NEXT: [[TIDX:%.*]] = call i32 @llvm.amdgcn.workitem.id.x()
; PASS-CHECK-NEXT: [[TIDY:%.*]] = call i32 @llvm.amdgcn.workitem.id.y()
; PASS-CHECK-NEXT: [[V1:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[TIDX]], i32 [[TIDY]])
-; PASS-CHECK-NEXT: store i32 [[V1]], ptr addrspace(1) [[OUT]], align 4
+; PASS-CHECK-NEXT: [[V2:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[V1]])
+; PASS-CHECK-NEXT: store i32 [[V2]], ptr addrspace(1) [[OUT]], align 4
; PASS-CHECK-NEXT: ret void
;
; DCE-CHECK-LABEL: define amdgpu_kernel void @readfirstlane_with_readlane(
@@ -396,7 +405,8 @@ define amdgpu_kernel void @readfirstlane_with_readlane(ptr addrspace(1) %out) {
; DCE-CHECK-NEXT: [[TIDX:%.*]] = call i32 @llvm.amdgcn.workitem.id.x()
; DCE-CHECK-NEXT: [[TIDY:%.*]] = call i32 @llvm.amdgcn.workitem.id.y()
; DCE-CHECK-NEXT: [[V1:%.*]] = call i32 @llvm.amdgcn.readlane.i32(i32 [[TIDX]], i32 [[TIDY]])
-; DCE-CHECK-NEXT: store i32 [[V1]], ptr addrspace(1) [[OUT]], align 4
+; DCE-CHECK-NEXT: [[V2:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[V1]])
+; DCE-CHECK-NEXT: store i32 [[V2]], ptr addrspace(1) [[OUT]], align 4
; DCE-CHECK-NEXT: ret void
;
%tidx = call i32 @llvm.amdgcn.workitem.id.x()
@@ -537,13 +547,15 @@ define amdgpu_kernel void @readfirstlane_random(ptr addrspace(1) %out) {
; PASS-CHECK-LABEL: define amdgpu_kernel void @readfirstlane_random(
; PASS-CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] {
; PASS-CHECK-NEXT: [[RANDOM:%.*]] = xor i32 123, 456
-; PASS-CHECK-NEXT: store i32 [[RANDOM]], ptr addrspace(1) [[OUT]], align 4
+; PASS-CHECK-NEXT: [[V:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[RANDOM]])
+; PASS-CHECK-NEXT: store i32 [[V]], ptr addrspace(1) [[OUT]], align 4
; PASS-CHECK-NEXT: ret void
;
; DCE-CHECK-LABEL: define amdgpu_kernel void @readfirstlane_random(
; DCE-CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] {
; DCE-CHECK-NEXT: [[RANDOM:%.*]] = xor i32 123, 456
-; DCE-CHECK-NEXT: store i32 [[RANDOM]], ptr addrspace(1) [[OUT]], align 4
+; DCE-CHECK-NEXT: [[V:%.*]] = call i32 @llvm.amdgcn.readfirstlane.i32(i32 [[RANDOM]])
+; DCE-CHECK-NEXT: store i32 [[V]], ptr addrspace(1) [[OUT]], align 4
; DCE-CHECK-NEXT: ret void
;
%random = xor i32 123, 456
diff --git a/llvm/test/CodeGen/AMDGPU/call-args-inreg-bfloat.ll b/llvm/test/CodeGen/AMDGPU/call-args-inreg-bfloat.ll
new file mode 100644
index 0000000..04e4724
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/call-args-inreg-bfloat.ll
@@ -0,0 +1,130 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -enable-var-scope -check-prefix=GFX9 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 < %s | FileCheck -enable-var-scope -check-prefix=GFX11 %s
+
+; This file is split out from call-args-inreg.ll because GlobalISel does not yet support the bfloat type.
+; Ideally, the two files should be merged once that support lands.
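+;
+; A sketch of that eventual merge, assuming GlobalISel gains bfloat support: this file would simply
+; pick up the GISEL RUN lines that call-args-inreg.ll already uses, e.g.:
+;   llc -global-isel=1 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -enable-var-scope -check-prefixes=GFX9,GISEL %s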
+
+declare hidden void @external_void_func_bf16_inreg(bfloat inreg) #0
+declare hidden void @external_void_func_v2bf16_inreg(<2 x bfloat> inreg) #0
+
+define void @test_call_external_void_func_bf16_inreg(bfloat inreg %arg) #0 {
+; GFX9-LABEL: test_call_external_void_func_bf16_inreg:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: s_mov_b32 s17, s33
+; GFX9-NEXT: s_mov_b32 s33, s32
+; GFX9-NEXT: s_or_saveexec_b64 s[18:19], -1
+; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
+; GFX9-NEXT: s_mov_b64 exec, s[18:19]
+; GFX9-NEXT: v_writelane_b32 v40, s17, 2
+; GFX9-NEXT: s_addk_i32 s32, 0x400
+; GFX9-NEXT: v_writelane_b32 v40, s30, 0
+; GFX9-NEXT: s_getpc_b64 s[18:19]
+; GFX9-NEXT: s_add_u32 s18, s18, external_void_func_bf16_inreg@rel32@lo+4
+; GFX9-NEXT: s_addc_u32 s19, s19, external_void_func_bf16_inreg@rel32@hi+12
+; GFX9-NEXT: s_mov_b32 s0, s16
+; GFX9-NEXT: v_writelane_b32 v40, s31, 1
+; GFX9-NEXT: s_swappc_b64 s[30:31], s[18:19]
+; GFX9-NEXT: v_readlane_b32 s31, v40, 1
+; GFX9-NEXT: v_readlane_b32 s30, v40, 0
+; GFX9-NEXT: s_mov_b32 s32, s33
+; GFX9-NEXT: v_readlane_b32 s4, v40, 2
+; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1
+; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
+; GFX9-NEXT: s_mov_b64 exec, s[6:7]
+; GFX9-NEXT: s_mov_b32 s33, s4
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: test_call_external_void_func_bf16_inreg:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: s_mov_b32 s1, s33
+; GFX11-NEXT: s_mov_b32 s33, s32
+; GFX11-NEXT: s_or_saveexec_b32 s2, -1
+; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
+; GFX11-NEXT: s_mov_b32 exec_lo, s2
+; GFX11-NEXT: v_writelane_b32 v40, s1, 2
+; GFX11-NEXT: s_add_i32 s32, s32, 16
+; GFX11-NEXT: s_getpc_b64 s[2:3]
+; GFX11-NEXT: s_add_u32 s2, s2, external_void_func_bf16_inreg@rel32@lo+4
+; GFX11-NEXT: s_addc_u32 s3, s3, external_void_func_bf16_inreg@rel32@hi+12
+; GFX11-NEXT: v_writelane_b32 v40, s30, 0
+; GFX11-NEXT: v_writelane_b32 v40, s31, 1
+; GFX11-NEXT: s_swappc_b64 s[30:31], s[2:3]
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_readlane_b32 s31, v40, 1
+; GFX11-NEXT: v_readlane_b32 s30, v40, 0
+; GFX11-NEXT: s_mov_b32 s32, s33
+; GFX11-NEXT: v_readlane_b32 s0, v40, 2
+; GFX11-NEXT: s_or_saveexec_b32 s1, -1
+; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
+; GFX11-NEXT: s_mov_b32 exec_lo, s1
+; GFX11-NEXT: s_mov_b32 s33, s0
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+ call void @external_void_func_bf16_inreg(bfloat inreg %arg)
+ ret void
+}
+
+define void @test_call_external_void_func_v2bf16_inreg(<2 x bfloat> inreg %arg) #0 {
+; GFX9-LABEL: test_call_external_void_func_v2bf16_inreg:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: s_mov_b32 s17, s33
+; GFX9-NEXT: s_mov_b32 s33, s32
+; GFX9-NEXT: s_or_saveexec_b64 s[18:19], -1
+; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
+; GFX9-NEXT: s_mov_b64 exec, s[18:19]
+; GFX9-NEXT: v_writelane_b32 v40, s17, 2
+; GFX9-NEXT: s_addk_i32 s32, 0x400
+; GFX9-NEXT: v_writelane_b32 v40, s30, 0
+; GFX9-NEXT: s_getpc_b64 s[18:19]
+; GFX9-NEXT: s_add_u32 s18, s18, external_void_func_v2bf16_inreg@rel32@lo+4
+; GFX9-NEXT: s_addc_u32 s19, s19, external_void_func_v2bf16_inreg@rel32@hi+12
+; GFX9-NEXT: s_mov_b32 s0, s16
+; GFX9-NEXT: v_writelane_b32 v40, s31, 1
+; GFX9-NEXT: s_swappc_b64 s[30:31], s[18:19]
+; GFX9-NEXT: v_readlane_b32 s31, v40, 1
+; GFX9-NEXT: v_readlane_b32 s30, v40, 0
+; GFX9-NEXT: s_mov_b32 s32, s33
+; GFX9-NEXT: v_readlane_b32 s4, v40, 2
+; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1
+; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
+; GFX9-NEXT: s_mov_b64 exec, s[6:7]
+; GFX9-NEXT: s_mov_b32 s33, s4
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: test_call_external_void_func_v2bf16_inreg:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: s_mov_b32 s1, s33
+; GFX11-NEXT: s_mov_b32 s33, s32
+; GFX11-NEXT: s_or_saveexec_b32 s2, -1
+; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
+; GFX11-NEXT: s_mov_b32 exec_lo, s2
+; GFX11-NEXT: v_writelane_b32 v40, s1, 2
+; GFX11-NEXT: s_add_i32 s32, s32, 16
+; GFX11-NEXT: s_getpc_b64 s[2:3]
+; GFX11-NEXT: s_add_u32 s2, s2, external_void_func_v2bf16_inreg@rel32@lo+4
+; GFX11-NEXT: s_addc_u32 s3, s3, external_void_func_v2bf16_inreg@rel32@hi+12
+; GFX11-NEXT: v_writelane_b32 v40, s30, 0
+; GFX11-NEXT: v_writelane_b32 v40, s31, 1
+; GFX11-NEXT: s_swappc_b64 s[30:31], s[2:3]
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_readlane_b32 s31, v40, 1
+; GFX11-NEXT: v_readlane_b32 s30, v40, 0
+; GFX11-NEXT: s_mov_b32 s32, s33
+; GFX11-NEXT: v_readlane_b32 s0, v40, 2
+; GFX11-NEXT: s_or_saveexec_b32 s1, -1
+; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
+; GFX11-NEXT: s_mov_b32 exec_lo, s1
+; GFX11-NEXT: s_mov_b32 s33, s0
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+ call void @external_void_func_v2bf16_inreg(<2 x bfloat> inreg %arg)
+ ret void
+}
+
diff --git a/llvm/test/CodeGen/AMDGPU/call-args-inreg.ll b/llvm/test/CodeGen/AMDGPU/call-args-inreg.ll
index d1cede6..f96007a 100644
--- a/llvm/test/CodeGen/AMDGPU/call-args-inreg.ll
+++ b/llvm/test/CodeGen/AMDGPU/call-args-inreg.ll
@@ -1,6 +1,8 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -enable-var-scope -check-prefix=GFX9 %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 < %s | FileCheck -enable-var-scope -check-prefix=GFX11 %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -enable-var-scope -check-prefixes=GFX9,SDAG %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 < %s | FileCheck -enable-var-scope -check-prefixes=GFX11,SDAG %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -enable-var-scope -check-prefixes=GFX9,GISEL %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 < %s | FileCheck -enable-var-scope -check-prefixes=GFX11,GISEL %s
declare hidden void @external_void_func_i8_inreg(i8 inreg) #0
declare hidden void @external_void_func_i16_inreg(i32 inreg) #0
@@ -12,11 +14,9 @@ declare hidden void @external_void_func_v4i32_inreg(<4 x i32> inreg) #0
declare hidden void @external_void_func_v8i32_inreg(<8 x i32> inreg) #0
declare hidden void @external_void_func_v16i32_inreg(<16 x i32> inreg) #0
declare hidden void @external_void_func_f16_inreg(half inreg) #0
-declare hidden void @external_void_func_bf16_inreg(bfloat inreg) #0
declare hidden void @external_void_func_f32_inreg(float inreg) #0
declare hidden void @external_void_func_f64_inreg(double inreg) #0
declare hidden void @external_void_func_v2f16_inreg(<2 x half> inreg) #0
-declare hidden void @external_void_func_v2bf16_inreg(<2 x bfloat> inreg) #0
declare hidden void @external_void_func_v3f16_inreg(<3 x half> inreg) #0
declare hidden void @external_void_func_v4f16_inreg(<4 x half> inreg) #0
@@ -212,35 +212,6 @@ define void @test_call_external_void_func_i32_inreg(i32 inreg %arg) #0 {
}
define void @test_call_external_void_func_i64_inreg(i64 inreg %arg) #0 {
-; GFX9-LABEL: test_call_external_void_func_i64_inreg:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: s_mov_b32 s18, s33
-; GFX9-NEXT: s_mov_b32 s33, s32
-; GFX9-NEXT: s_or_saveexec_b64 s[20:21], -1
-; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
-; GFX9-NEXT: s_mov_b64 exec, s[20:21]
-; GFX9-NEXT: v_writelane_b32 v40, s18, 2
-; GFX9-NEXT: s_addk_i32 s32, 0x400
-; GFX9-NEXT: v_writelane_b32 v40, s30, 0
-; GFX9-NEXT: s_getpc_b64 s[18:19]
-; GFX9-NEXT: s_add_u32 s18, s18, external_void_func_i64_inreg@rel32@lo+4
-; GFX9-NEXT: s_addc_u32 s19, s19, external_void_func_i64_inreg@rel32@hi+12
-; GFX9-NEXT: s_mov_b32 s1, s17
-; GFX9-NEXT: s_mov_b32 s0, s16
-; GFX9-NEXT: v_writelane_b32 v40, s31, 1
-; GFX9-NEXT: s_swappc_b64 s[30:31], s[18:19]
-; GFX9-NEXT: v_readlane_b32 s31, v40, 1
-; GFX9-NEXT: v_readlane_b32 s30, v40, 0
-; GFX9-NEXT: s_mov_b32 s32, s33
-; GFX9-NEXT: v_readlane_b32 s4, v40, 2
-; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1
-; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
-; GFX9-NEXT: s_mov_b64 exec, s[6:7]
-; GFX9-NEXT: s_mov_b32 s33, s4
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: s_setpc_b64 s[30:31]
-;
; GFX11-LABEL: test_call_external_void_func_i64_inreg:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -273,35 +244,6 @@ define void @test_call_external_void_func_i64_inreg(i64 inreg %arg) #0 {
}
define void @test_call_external_void_func_v2i32_inreg(<2 x i32> inreg %arg) #0 {
-; GFX9-LABEL: test_call_external_void_func_v2i32_inreg:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: s_mov_b32 s18, s33
-; GFX9-NEXT: s_mov_b32 s33, s32
-; GFX9-NEXT: s_or_saveexec_b64 s[20:21], -1
-; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
-; GFX9-NEXT: s_mov_b64 exec, s[20:21]
-; GFX9-NEXT: v_writelane_b32 v40, s18, 2
-; GFX9-NEXT: s_addk_i32 s32, 0x400
-; GFX9-NEXT: v_writelane_b32 v40, s30, 0
-; GFX9-NEXT: s_getpc_b64 s[18:19]
-; GFX9-NEXT: s_add_u32 s18, s18, external_void_func_v2i32_inreg@rel32@lo+4
-; GFX9-NEXT: s_addc_u32 s19, s19, external_void_func_v2i32_inreg@rel32@hi+12
-; GFX9-NEXT: s_mov_b32 s1, s17
-; GFX9-NEXT: s_mov_b32 s0, s16
-; GFX9-NEXT: v_writelane_b32 v40, s31, 1
-; GFX9-NEXT: s_swappc_b64 s[30:31], s[18:19]
-; GFX9-NEXT: v_readlane_b32 s31, v40, 1
-; GFX9-NEXT: v_readlane_b32 s30, v40, 0
-; GFX9-NEXT: s_mov_b32 s32, s33
-; GFX9-NEXT: v_readlane_b32 s4, v40, 2
-; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1
-; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
-; GFX9-NEXT: s_mov_b64 exec, s[6:7]
-; GFX9-NEXT: s_mov_b32 s33, s4
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: s_setpc_b64 s[30:31]
-;
; GFX11-LABEL: test_call_external_void_func_v2i32_inreg:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -334,36 +276,6 @@ define void @test_call_external_void_func_v2i32_inreg(<2 x i32> inreg %arg) #0 {
}
define void @test_call_external_void_func_v3i32_inreg(<3 x i32> inreg %arg) #0 {
-; GFX9-LABEL: test_call_external_void_func_v3i32_inreg:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: s_mov_b32 s19, s33
-; GFX9-NEXT: s_mov_b32 s33, s32
-; GFX9-NEXT: s_or_saveexec_b64 s[20:21], -1
-; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
-; GFX9-NEXT: s_mov_b64 exec, s[20:21]
-; GFX9-NEXT: v_writelane_b32 v40, s19, 2
-; GFX9-NEXT: s_addk_i32 s32, 0x400
-; GFX9-NEXT: v_writelane_b32 v40, s30, 0
-; GFX9-NEXT: s_getpc_b64 s[20:21]
-; GFX9-NEXT: s_add_u32 s20, s20, external_void_func_v3i32_inreg@rel32@lo+4
-; GFX9-NEXT: s_addc_u32 s21, s21, external_void_func_v3i32_inreg@rel32@hi+12
-; GFX9-NEXT: s_mov_b32 s2, s18
-; GFX9-NEXT: s_mov_b32 s1, s17
-; GFX9-NEXT: s_mov_b32 s0, s16
-; GFX9-NEXT: v_writelane_b32 v40, s31, 1
-; GFX9-NEXT: s_swappc_b64 s[30:31], s[20:21]
-; GFX9-NEXT: v_readlane_b32 s31, v40, 1
-; GFX9-NEXT: v_readlane_b32 s30, v40, 0
-; GFX9-NEXT: s_mov_b32 s32, s33
-; GFX9-NEXT: v_readlane_b32 s4, v40, 2
-; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1
-; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
-; GFX9-NEXT: s_mov_b64 exec, s[6:7]
-; GFX9-NEXT: s_mov_b32 s33, s4
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: s_setpc_b64 s[30:31]
-;
; GFX11-LABEL: test_call_external_void_func_v3i32_inreg:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -396,37 +308,6 @@ define void @test_call_external_void_func_v3i32_inreg(<3 x i32> inreg %arg) #0 {
}
define void @test_call_external_void_func_v4i32_inreg(<4 x i32> inreg %arg) #0 {
-; GFX9-LABEL: test_call_external_void_func_v4i32_inreg:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: s_mov_b32 s20, s33
-; GFX9-NEXT: s_mov_b32 s33, s32
-; GFX9-NEXT: s_or_saveexec_b64 s[22:23], -1
-; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
-; GFX9-NEXT: s_mov_b64 exec, s[22:23]
-; GFX9-NEXT: v_writelane_b32 v40, s20, 2
-; GFX9-NEXT: s_addk_i32 s32, 0x400
-; GFX9-NEXT: v_writelane_b32 v40, s30, 0
-; GFX9-NEXT: s_getpc_b64 s[20:21]
-; GFX9-NEXT: s_add_u32 s20, s20, external_void_func_v4i32_inreg@rel32@lo+4
-; GFX9-NEXT: s_addc_u32 s21, s21, external_void_func_v4i32_inreg@rel32@hi+12
-; GFX9-NEXT: s_mov_b32 s3, s19
-; GFX9-NEXT: s_mov_b32 s2, s18
-; GFX9-NEXT: s_mov_b32 s1, s17
-; GFX9-NEXT: s_mov_b32 s0, s16
-; GFX9-NEXT: v_writelane_b32 v40, s31, 1
-; GFX9-NEXT: s_swappc_b64 s[30:31], s[20:21]
-; GFX9-NEXT: v_readlane_b32 s31, v40, 1
-; GFX9-NEXT: v_readlane_b32 s30, v40, 0
-; GFX9-NEXT: s_mov_b32 s32, s33
-; GFX9-NEXT: v_readlane_b32 s4, v40, 2
-; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1
-; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
-; GFX9-NEXT: s_mov_b64 exec, s[6:7]
-; GFX9-NEXT: s_mov_b32 s33, s4
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: s_setpc_b64 s[30:31]
-;
; GFX11-LABEL: test_call_external_void_func_v4i32_inreg:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -459,41 +340,6 @@ define void @test_call_external_void_func_v4i32_inreg(<4 x i32> inreg %arg) #0 {
}
define void @test_call_external_void_func_v8i32_inreg(<8 x i32> inreg %arg) #0 {
-; GFX9-LABEL: test_call_external_void_func_v8i32_inreg:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: s_mov_b32 s24, s33
-; GFX9-NEXT: s_mov_b32 s33, s32
-; GFX9-NEXT: s_or_saveexec_b64 s[26:27], -1
-; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
-; GFX9-NEXT: s_mov_b64 exec, s[26:27]
-; GFX9-NEXT: v_writelane_b32 v40, s24, 2
-; GFX9-NEXT: s_addk_i32 s32, 0x400
-; GFX9-NEXT: v_writelane_b32 v40, s30, 0
-; GFX9-NEXT: s_getpc_b64 s[24:25]
-; GFX9-NEXT: s_add_u32 s24, s24, external_void_func_v8i32_inreg@rel32@lo+4
-; GFX9-NEXT: s_addc_u32 s25, s25, external_void_func_v8i32_inreg@rel32@hi+12
-; GFX9-NEXT: s_mov_b32 s3, s19
-; GFX9-NEXT: s_mov_b32 s2, s18
-; GFX9-NEXT: s_mov_b32 s1, s17
-; GFX9-NEXT: s_mov_b32 s0, s16
-; GFX9-NEXT: s_mov_b32 s16, s20
-; GFX9-NEXT: s_mov_b32 s17, s21
-; GFX9-NEXT: s_mov_b32 s18, s22
-; GFX9-NEXT: s_mov_b32 s19, s23
-; GFX9-NEXT: v_writelane_b32 v40, s31, 1
-; GFX9-NEXT: s_swappc_b64 s[30:31], s[24:25]
-; GFX9-NEXT: v_readlane_b32 s31, v40, 1
-; GFX9-NEXT: v_readlane_b32 s30, v40, 0
-; GFX9-NEXT: s_mov_b32 s32, s33
-; GFX9-NEXT: v_readlane_b32 s4, v40, 2
-; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1
-; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
-; GFX9-NEXT: s_mov_b64 exec, s[6:7]
-; GFX9-NEXT: s_mov_b32 s33, s4
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: s_setpc_b64 s[30:31]
-;
; GFX11-LABEL: test_call_external_void_func_v8i32_inreg:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -585,66 +431,6 @@ define void @test_call_external_void_func_f16_inreg(half inreg %arg) #0 {
ret void
}
-define void @test_call_external_void_func_bf16_inreg(bfloat inreg %arg) #0 {
-; GFX9-LABEL: test_call_external_void_func_bf16_inreg:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: s_mov_b32 s17, s33
-; GFX9-NEXT: s_mov_b32 s33, s32
-; GFX9-NEXT: s_or_saveexec_b64 s[18:19], -1
-; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
-; GFX9-NEXT: s_mov_b64 exec, s[18:19]
-; GFX9-NEXT: v_writelane_b32 v40, s17, 2
-; GFX9-NEXT: s_addk_i32 s32, 0x400
-; GFX9-NEXT: v_writelane_b32 v40, s30, 0
-; GFX9-NEXT: s_getpc_b64 s[18:19]
-; GFX9-NEXT: s_add_u32 s18, s18, external_void_func_bf16_inreg@rel32@lo+4
-; GFX9-NEXT: s_addc_u32 s19, s19, external_void_func_bf16_inreg@rel32@hi+12
-; GFX9-NEXT: s_mov_b32 s0, s16
-; GFX9-NEXT: v_writelane_b32 v40, s31, 1
-; GFX9-NEXT: s_swappc_b64 s[30:31], s[18:19]
-; GFX9-NEXT: v_readlane_b32 s31, v40, 1
-; GFX9-NEXT: v_readlane_b32 s30, v40, 0
-; GFX9-NEXT: s_mov_b32 s32, s33
-; GFX9-NEXT: v_readlane_b32 s4, v40, 2
-; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1
-; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
-; GFX9-NEXT: s_mov_b64 exec, s[6:7]
-; GFX9-NEXT: s_mov_b32 s33, s4
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX11-LABEL: test_call_external_void_func_bf16_inreg:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: s_mov_b32 s1, s33
-; GFX11-NEXT: s_mov_b32 s33, s32
-; GFX11-NEXT: s_or_saveexec_b32 s2, -1
-; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
-; GFX11-NEXT: s_mov_b32 exec_lo, s2
-; GFX11-NEXT: v_writelane_b32 v40, s1, 2
-; GFX11-NEXT: s_add_i32 s32, s32, 16
-; GFX11-NEXT: s_getpc_b64 s[2:3]
-; GFX11-NEXT: s_add_u32 s2, s2, external_void_func_bf16_inreg@rel32@lo+4
-; GFX11-NEXT: s_addc_u32 s3, s3, external_void_func_bf16_inreg@rel32@hi+12
-; GFX11-NEXT: v_writelane_b32 v40, s30, 0
-; GFX11-NEXT: v_writelane_b32 v40, s31, 1
-; GFX11-NEXT: s_swappc_b64 s[30:31], s[2:3]
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_readlane_b32 s31, v40, 1
-; GFX11-NEXT: v_readlane_b32 s30, v40, 0
-; GFX11-NEXT: s_mov_b32 s32, s33
-; GFX11-NEXT: v_readlane_b32 s0, v40, 2
-; GFX11-NEXT: s_or_saveexec_b32 s1, -1
-; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
-; GFX11-NEXT: s_mov_b32 exec_lo, s1
-; GFX11-NEXT: s_mov_b32 s33, s0
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: s_setpc_b64 s[30:31]
- call void @external_void_func_bf16_inreg(bfloat inreg %arg)
- ret void
-}
-
define void @test_call_external_void_func_f32_inreg(float inreg %arg) #0 {
; GFX9-LABEL: test_call_external_void_func_f32_inreg:
; GFX9: ; %bb.0:
@@ -706,35 +492,6 @@ define void @test_call_external_void_func_f32_inreg(float inreg %arg) #0 {
}
define void @test_call_external_void_func_f64_inreg(double inreg %arg) #0 {
-; GFX9-LABEL: test_call_external_void_func_f64_inreg:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: s_mov_b32 s18, s33
-; GFX9-NEXT: s_mov_b32 s33, s32
-; GFX9-NEXT: s_or_saveexec_b64 s[20:21], -1
-; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
-; GFX9-NEXT: s_mov_b64 exec, s[20:21]
-; GFX9-NEXT: v_writelane_b32 v40, s18, 2
-; GFX9-NEXT: s_addk_i32 s32, 0x400
-; GFX9-NEXT: v_writelane_b32 v40, s30, 0
-; GFX9-NEXT: s_getpc_b64 s[18:19]
-; GFX9-NEXT: s_add_u32 s18, s18, external_void_func_f64_inreg@rel32@lo+4
-; GFX9-NEXT: s_addc_u32 s19, s19, external_void_func_f64_inreg@rel32@hi+12
-; GFX9-NEXT: s_mov_b32 s1, s17
-; GFX9-NEXT: s_mov_b32 s0, s16
-; GFX9-NEXT: v_writelane_b32 v40, s31, 1
-; GFX9-NEXT: s_swappc_b64 s[30:31], s[18:19]
-; GFX9-NEXT: v_readlane_b32 s31, v40, 1
-; GFX9-NEXT: v_readlane_b32 s30, v40, 0
-; GFX9-NEXT: s_mov_b32 s32, s33
-; GFX9-NEXT: v_readlane_b32 s4, v40, 2
-; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1
-; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
-; GFX9-NEXT: s_mov_b64 exec, s[6:7]
-; GFX9-NEXT: s_mov_b32 s33, s4
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: s_setpc_b64 s[30:31]
-;
; GFX11-LABEL: test_call_external_void_func_f64_inreg:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -826,97 +583,7 @@ define void @test_call_external_void_func_v2f16_inreg(<2 x half> inreg %arg) #0
ret void
}
-
-define void @test_call_external_void_func_v2bf16_inreg(<2 x bfloat> inreg %arg) #0 {
-; GFX9-LABEL: test_call_external_void_func_v2bf16_inreg:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: s_mov_b32 s17, s33
-; GFX9-NEXT: s_mov_b32 s33, s32
-; GFX9-NEXT: s_or_saveexec_b64 s[18:19], -1
-; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
-; GFX9-NEXT: s_mov_b64 exec, s[18:19]
-; GFX9-NEXT: v_writelane_b32 v40, s17, 2
-; GFX9-NEXT: s_addk_i32 s32, 0x400
-; GFX9-NEXT: v_writelane_b32 v40, s30, 0
-; GFX9-NEXT: s_getpc_b64 s[18:19]
-; GFX9-NEXT: s_add_u32 s18, s18, external_void_func_v2bf16_inreg@rel32@lo+4
-; GFX9-NEXT: s_addc_u32 s19, s19, external_void_func_v2bf16_inreg@rel32@hi+12
-; GFX9-NEXT: s_mov_b32 s0, s16
-; GFX9-NEXT: v_writelane_b32 v40, s31, 1
-; GFX9-NEXT: s_swappc_b64 s[30:31], s[18:19]
-; GFX9-NEXT: v_readlane_b32 s31, v40, 1
-; GFX9-NEXT: v_readlane_b32 s30, v40, 0
-; GFX9-NEXT: s_mov_b32 s32, s33
-; GFX9-NEXT: v_readlane_b32 s4, v40, 2
-; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1
-; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
-; GFX9-NEXT: s_mov_b64 exec, s[6:7]
-; GFX9-NEXT: s_mov_b32 s33, s4
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX11-LABEL: test_call_external_void_func_v2bf16_inreg:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: s_mov_b32 s1, s33
-; GFX11-NEXT: s_mov_b32 s33, s32
-; GFX11-NEXT: s_or_saveexec_b32 s2, -1
-; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill
-; GFX11-NEXT: s_mov_b32 exec_lo, s2
-; GFX11-NEXT: v_writelane_b32 v40, s1, 2
-; GFX11-NEXT: s_add_i32 s32, s32, 16
-; GFX11-NEXT: s_getpc_b64 s[2:3]
-; GFX11-NEXT: s_add_u32 s2, s2, external_void_func_v2bf16_inreg@rel32@lo+4
-; GFX11-NEXT: s_addc_u32 s3, s3, external_void_func_v2bf16_inreg@rel32@hi+12
-; GFX11-NEXT: v_writelane_b32 v40, s30, 0
-; GFX11-NEXT: v_writelane_b32 v40, s31, 1
-; GFX11-NEXT: s_swappc_b64 s[30:31], s[2:3]
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_readlane_b32 s31, v40, 1
-; GFX11-NEXT: v_readlane_b32 s30, v40, 0
-; GFX11-NEXT: s_mov_b32 s32, s33
-; GFX11-NEXT: v_readlane_b32 s0, v40, 2
-; GFX11-NEXT: s_or_saveexec_b32 s1, -1
-; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
-; GFX11-NEXT: s_mov_b32 exec_lo, s1
-; GFX11-NEXT: s_mov_b32 s33, s0
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: s_setpc_b64 s[30:31]
- call void @external_void_func_v2bf16_inreg(<2 x bfloat> inreg %arg)
- ret void
-}
-
define void @test_call_external_void_func_v3f16_inreg(<3 x half> inreg %arg) #0 {
-; GFX9-LABEL: test_call_external_void_func_v3f16_inreg:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: s_mov_b32 s18, s33
-; GFX9-NEXT: s_mov_b32 s33, s32
-; GFX9-NEXT: s_or_saveexec_b64 s[20:21], -1
-; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
-; GFX9-NEXT: s_mov_b64 exec, s[20:21]
-; GFX9-NEXT: v_writelane_b32 v40, s18, 2
-; GFX9-NEXT: s_addk_i32 s32, 0x400
-; GFX9-NEXT: v_writelane_b32 v40, s30, 0
-; GFX9-NEXT: s_getpc_b64 s[18:19]
-; GFX9-NEXT: s_add_u32 s18, s18, external_void_func_v3f16_inreg@rel32@lo+4
-; GFX9-NEXT: s_addc_u32 s19, s19, external_void_func_v3f16_inreg@rel32@hi+12
-; GFX9-NEXT: s_mov_b32 s1, s17
-; GFX9-NEXT: s_mov_b32 s0, s16
-; GFX9-NEXT: v_writelane_b32 v40, s31, 1
-; GFX9-NEXT: s_swappc_b64 s[30:31], s[18:19]
-; GFX9-NEXT: v_readlane_b32 s31, v40, 1
-; GFX9-NEXT: v_readlane_b32 s30, v40, 0
-; GFX9-NEXT: s_mov_b32 s32, s33
-; GFX9-NEXT: v_readlane_b32 s4, v40, 2
-; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1
-; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
-; GFX9-NEXT: s_mov_b64 exec, s[6:7]
-; GFX9-NEXT: s_mov_b32 s33, s4
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: s_setpc_b64 s[30:31]
-;
; GFX11-LABEL: test_call_external_void_func_v3f16_inreg:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -949,35 +616,6 @@ define void @test_call_external_void_func_v3f16_inreg(<3 x half> inreg %arg) #0
}
define void @test_call_external_void_func_v4f16_inreg(<4 x half> inreg %arg) #0 {
-; GFX9-LABEL: test_call_external_void_func_v4f16_inreg:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: s_mov_b32 s18, s33
-; GFX9-NEXT: s_mov_b32 s33, s32
-; GFX9-NEXT: s_or_saveexec_b64 s[20:21], -1
-; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
-; GFX9-NEXT: s_mov_b64 exec, s[20:21]
-; GFX9-NEXT: v_writelane_b32 v40, s18, 2
-; GFX9-NEXT: s_addk_i32 s32, 0x400
-; GFX9-NEXT: v_writelane_b32 v40, s30, 0
-; GFX9-NEXT: s_getpc_b64 s[18:19]
-; GFX9-NEXT: s_add_u32 s18, s18, external_void_func_v4f16_inreg@rel32@lo+4
-; GFX9-NEXT: s_addc_u32 s19, s19, external_void_func_v4f16_inreg@rel32@hi+12
-; GFX9-NEXT: s_mov_b32 s1, s17
-; GFX9-NEXT: s_mov_b32 s0, s16
-; GFX9-NEXT: v_writelane_b32 v40, s31, 1
-; GFX9-NEXT: s_swappc_b64 s[30:31], s[18:19]
-; GFX9-NEXT: v_readlane_b32 s31, v40, 1
-; GFX9-NEXT: v_readlane_b32 s30, v40, 0
-; GFX9-NEXT: s_mov_b32 s32, s33
-; GFX9-NEXT: v_readlane_b32 s4, v40, 2
-; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1
-; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
-; GFX9-NEXT: s_mov_b64 exec, s[6:7]
-; GFX9-NEXT: s_mov_b32 s33, s4
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: s_setpc_b64 s[30:31]
-;
; GFX11-LABEL: test_call_external_void_func_v4f16_inreg:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1010,35 +648,6 @@ define void @test_call_external_void_func_v4f16_inreg(<4 x half> inreg %arg) #0
}
define void @test_call_external_void_func_p0_inreg(ptr inreg %arg) #0 {
-; GFX9-LABEL: test_call_external_void_func_p0_inreg:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: s_mov_b32 s18, s33
-; GFX9-NEXT: s_mov_b32 s33, s32
-; GFX9-NEXT: s_or_saveexec_b64 s[20:21], -1
-; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
-; GFX9-NEXT: s_mov_b64 exec, s[20:21]
-; GFX9-NEXT: v_writelane_b32 v40, s18, 2
-; GFX9-NEXT: s_addk_i32 s32, 0x400
-; GFX9-NEXT: v_writelane_b32 v40, s30, 0
-; GFX9-NEXT: s_getpc_b64 s[18:19]
-; GFX9-NEXT: s_add_u32 s18, s18, external_void_func_p0_inreg@rel32@lo+4
-; GFX9-NEXT: s_addc_u32 s19, s19, external_void_func_p0_inreg@rel32@hi+12
-; GFX9-NEXT: s_mov_b32 s1, s17
-; GFX9-NEXT: s_mov_b32 s0, s16
-; GFX9-NEXT: v_writelane_b32 v40, s31, 1
-; GFX9-NEXT: s_swappc_b64 s[30:31], s[18:19]
-; GFX9-NEXT: v_readlane_b32 s31, v40, 1
-; GFX9-NEXT: v_readlane_b32 s30, v40, 0
-; GFX9-NEXT: s_mov_b32 s32, s33
-; GFX9-NEXT: v_readlane_b32 s4, v40, 2
-; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1
-; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
-; GFX9-NEXT: s_mov_b64 exec, s[6:7]
-; GFX9-NEXT: s_mov_b32 s33, s4
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: s_setpc_b64 s[30:31]
-;
; GFX11-LABEL: test_call_external_void_func_p0_inreg:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1071,35 +680,6 @@ define void @test_call_external_void_func_p0_inreg(ptr inreg %arg) #0 {
}
define void @test_call_external_void_func_p1_inreg(ptr addrspace(1) inreg %arg) #0 {
-; GFX9-LABEL: test_call_external_void_func_p1_inreg:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: s_mov_b32 s18, s33
-; GFX9-NEXT: s_mov_b32 s33, s32
-; GFX9-NEXT: s_or_saveexec_b64 s[20:21], -1
-; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
-; GFX9-NEXT: s_mov_b64 exec, s[20:21]
-; GFX9-NEXT: v_writelane_b32 v40, s18, 2
-; GFX9-NEXT: s_addk_i32 s32, 0x400
-; GFX9-NEXT: v_writelane_b32 v40, s30, 0
-; GFX9-NEXT: s_getpc_b64 s[18:19]
-; GFX9-NEXT: s_add_u32 s18, s18, external_void_func_p1_inreg@rel32@lo+4
-; GFX9-NEXT: s_addc_u32 s19, s19, external_void_func_p1_inreg@rel32@hi+12
-; GFX9-NEXT: s_mov_b32 s1, s17
-; GFX9-NEXT: s_mov_b32 s0, s16
-; GFX9-NEXT: v_writelane_b32 v40, s31, 1
-; GFX9-NEXT: s_swappc_b64 s[30:31], s[18:19]
-; GFX9-NEXT: v_readlane_b32 s31, v40, 1
-; GFX9-NEXT: v_readlane_b32 s30, v40, 0
-; GFX9-NEXT: s_mov_b32 s32, s33
-; GFX9-NEXT: v_readlane_b32 s4, v40, 2
-; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1
-; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
-; GFX9-NEXT: s_mov_b64 exec, s[6:7]
-; GFX9-NEXT: s_mov_b32 s33, s4
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: s_setpc_b64 s[30:31]
-;
; GFX11-LABEL: test_call_external_void_func_p1_inreg:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1192,37 +772,6 @@ define void @test_call_external_void_func_p3_inreg(ptr addrspace(3) inreg %arg)
}
define void @test_call_external_void_func_v2p1_inreg(<2 x ptr addrspace(1)> inreg %arg) #0 {
-; GFX9-LABEL: test_call_external_void_func_v2p1_inreg:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: s_mov_b32 s20, s33
-; GFX9-NEXT: s_mov_b32 s33, s32
-; GFX9-NEXT: s_or_saveexec_b64 s[22:23], -1
-; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
-; GFX9-NEXT: s_mov_b64 exec, s[22:23]
-; GFX9-NEXT: v_writelane_b32 v40, s20, 2
-; GFX9-NEXT: s_addk_i32 s32, 0x400
-; GFX9-NEXT: v_writelane_b32 v40, s30, 0
-; GFX9-NEXT: s_getpc_b64 s[20:21]
-; GFX9-NEXT: s_add_u32 s20, s20, external_void_func_v2p1_inreg@rel32@lo+4
-; GFX9-NEXT: s_addc_u32 s21, s21, external_void_func_v2p1_inreg@rel32@hi+12
-; GFX9-NEXT: s_mov_b32 s3, s19
-; GFX9-NEXT: s_mov_b32 s2, s18
-; GFX9-NEXT: s_mov_b32 s1, s17
-; GFX9-NEXT: s_mov_b32 s0, s16
-; GFX9-NEXT: v_writelane_b32 v40, s31, 1
-; GFX9-NEXT: s_swappc_b64 s[30:31], s[20:21]
-; GFX9-NEXT: v_readlane_b32 s31, v40, 1
-; GFX9-NEXT: v_readlane_b32 s30, v40, 0
-; GFX9-NEXT: s_mov_b32 s32, s33
-; GFX9-NEXT: v_readlane_b32 s4, v40, 2
-; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1
-; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
-; GFX9-NEXT: s_mov_b64 exec, s[6:7]
-; GFX9-NEXT: s_mov_b32 s33, s4
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: s_setpc_b64 s[30:31]
-;
; GFX11-LABEL: test_call_external_void_func_v2p1_inreg:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1255,35 +804,6 @@ define void @test_call_external_void_func_v2p1_inreg(<2 x ptr addrspace(1)> inre
}
define void @test_call_external_void_func_v2p5_inreg(<2 x ptr addrspace(5)> inreg %arg) #0 {
-; GFX9-LABEL: test_call_external_void_func_v2p5_inreg:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: s_mov_b32 s18, s33
-; GFX9-NEXT: s_mov_b32 s33, s32
-; GFX9-NEXT: s_or_saveexec_b64 s[20:21], -1
-; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
-; GFX9-NEXT: s_mov_b64 exec, s[20:21]
-; GFX9-NEXT: v_writelane_b32 v40, s18, 2
-; GFX9-NEXT: s_addk_i32 s32, 0x400
-; GFX9-NEXT: v_writelane_b32 v40, s30, 0
-; GFX9-NEXT: s_getpc_b64 s[18:19]
-; GFX9-NEXT: s_add_u32 s18, s18, external_void_func_v2p5_inreg@rel32@lo+4
-; GFX9-NEXT: s_addc_u32 s19, s19, external_void_func_v2p5_inreg@rel32@hi+12
-; GFX9-NEXT: s_mov_b32 s1, s17
-; GFX9-NEXT: s_mov_b32 s0, s16
-; GFX9-NEXT: v_writelane_b32 v40, s31, 1
-; GFX9-NEXT: s_swappc_b64 s[30:31], s[18:19]
-; GFX9-NEXT: v_readlane_b32 s31, v40, 1
-; GFX9-NEXT: v_readlane_b32 s30, v40, 0
-; GFX9-NEXT: s_mov_b32 s32, s33
-; GFX9-NEXT: v_readlane_b32 s4, v40, 2
-; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1
-; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
-; GFX9-NEXT: s_mov_b64 exec, s[6:7]
-; GFX9-NEXT: s_mov_b32 s33, s4
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: s_setpc_b64 s[30:31]
-;
; GFX11-LABEL: test_call_external_void_func_v2p5_inreg:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1316,38 +836,6 @@ define void @test_call_external_void_func_v2p5_inreg(<2 x ptr addrspace(5)> inre
}
define void @test_call_external_void_func_i64_inreg_i32_inreg_i64_inreg(i64 inreg %arg0, i32 inreg %arg1, i64 inreg %arg2) #0 {
-; GFX9-LABEL: test_call_external_void_func_i64_inreg_i32_inreg_i64_inreg:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: s_mov_b32 s21, s33
-; GFX9-NEXT: s_mov_b32 s33, s32
-; GFX9-NEXT: s_or_saveexec_b64 s[22:23], -1
-; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
-; GFX9-NEXT: s_mov_b64 exec, s[22:23]
-; GFX9-NEXT: v_writelane_b32 v40, s21, 2
-; GFX9-NEXT: s_addk_i32 s32, 0x400
-; GFX9-NEXT: v_writelane_b32 v40, s30, 0
-; GFX9-NEXT: s_getpc_b64 s[22:23]
-; GFX9-NEXT: s_add_u32 s22, s22, external_void_func_i64_inreg_i32_inreg_i64_inreg@rel32@lo+4
-; GFX9-NEXT: s_addc_u32 s23, s23, external_void_func_i64_inreg_i32_inreg_i64_inreg@rel32@hi+12
-; GFX9-NEXT: s_mov_b32 s3, s19
-; GFX9-NEXT: s_mov_b32 s2, s18
-; GFX9-NEXT: s_mov_b32 s1, s17
-; GFX9-NEXT: s_mov_b32 s0, s16
-; GFX9-NEXT: s_mov_b32 s16, s20
-; GFX9-NEXT: v_writelane_b32 v40, s31, 1
-; GFX9-NEXT: s_swappc_b64 s[30:31], s[22:23]
-; GFX9-NEXT: v_readlane_b32 s31, v40, 1
-; GFX9-NEXT: v_readlane_b32 s30, v40, 0
-; GFX9-NEXT: s_mov_b32 s32, s33
-; GFX9-NEXT: v_readlane_b32 s4, v40, 2
-; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1
-; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
-; GFX9-NEXT: s_mov_b64 exec, s[6:7]
-; GFX9-NEXT: s_mov_b32 s33, s4
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: s_setpc_b64 s[30:31]
-;
; GFX11-LABEL: test_call_external_void_func_i64_inreg_i32_inreg_i64_inreg:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1380,46 +868,6 @@ define void @test_call_external_void_func_i64_inreg_i32_inreg_i64_inreg(i64 inre
}
define void @test_call_external_void_func_a15i32_inreg([13 x i32] inreg %arg0) #0 {
-; GFX9-LABEL: test_call_external_void_func_a15i32_inreg:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: s_mov_b32 s29, s33
-; GFX9-NEXT: s_mov_b32 s33, s32
-; GFX9-NEXT: s_or_saveexec_b64 s[40:41], -1
-; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
-; GFX9-NEXT: s_mov_b64 exec, s[40:41]
-; GFX9-NEXT: v_writelane_b32 v40, s29, 2
-; GFX9-NEXT: s_addk_i32 s32, 0x400
-; GFX9-NEXT: v_writelane_b32 v40, s30, 0
-; GFX9-NEXT: s_getpc_b64 s[40:41]
-; GFX9-NEXT: s_add_u32 s40, s40, external_void_func_a15i32_inreg@rel32@lo+4
-; GFX9-NEXT: s_addc_u32 s41, s41, external_void_func_a15i32_inreg@rel32@hi+12
-; GFX9-NEXT: s_mov_b32 s3, s19
-; GFX9-NEXT: s_mov_b32 s2, s18
-; GFX9-NEXT: s_mov_b32 s1, s17
-; GFX9-NEXT: s_mov_b32 s0, s16
-; GFX9-NEXT: s_mov_b32 s16, s20
-; GFX9-NEXT: s_mov_b32 s17, s21
-; GFX9-NEXT: s_mov_b32 s18, s22
-; GFX9-NEXT: s_mov_b32 s19, s23
-; GFX9-NEXT: s_mov_b32 s20, s24
-; GFX9-NEXT: s_mov_b32 s21, s25
-; GFX9-NEXT: s_mov_b32 s22, s26
-; GFX9-NEXT: s_mov_b32 s23, s27
-; GFX9-NEXT: s_mov_b32 s24, s28
-; GFX9-NEXT: v_writelane_b32 v40, s31, 1
-; GFX9-NEXT: s_swappc_b64 s[30:31], s[40:41]
-; GFX9-NEXT: v_readlane_b32 s31, v40, 1
-; GFX9-NEXT: v_readlane_b32 s30, v40, 0
-; GFX9-NEXT: s_mov_b32 s32, s33
-; GFX9-NEXT: v_readlane_b32 s4, v40, 2
-; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1
-; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
-; GFX9-NEXT: s_mov_b64 exec, s[6:7]
-; GFX9-NEXT: s_mov_b32 s33, s4
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: s_setpc_b64 s[30:31]
-;
; GFX11-LABEL: test_call_external_void_func_a15i32_inreg:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1454,47 +902,6 @@ define void @test_call_external_void_func_a15i32_inreg([13 x i32] inreg %arg0) #
; FIXME: This should also fail
define void @test_call_external_void_func_a15i32_inreg_i32_inreg([13 x i32] inreg %arg0, i32 inreg %arg1) #1 {
-; GFX9-LABEL: test_call_external_void_func_a15i32_inreg_i32_inreg:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: s_mov_b32 s21, s33
-; GFX9-NEXT: s_mov_b32 s33, s32
-; GFX9-NEXT: s_or_saveexec_b64 s[22:23], -1
-; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill
-; GFX9-NEXT: s_mov_b64 exec, s[22:23]
-; GFX9-NEXT: v_writelane_b32 v40, s21, 2
-; GFX9-NEXT: s_addk_i32 s32, 0x400
-; GFX9-NEXT: v_writelane_b32 v40, s30, 0
-; GFX9-NEXT: s_getpc_b64 s[22:23]
-; GFX9-NEXT: s_add_u32 s22, s22, external_void_func_a15i32_inreg_i32_inreg__noimplicit@rel32@lo+4
-; GFX9-NEXT: s_addc_u32 s23, s23, external_void_func_a15i32_inreg_i32_inreg__noimplicit@rel32@hi+12
-; GFX9-NEXT: s_mov_b32 s3, s7
-; GFX9-NEXT: s_mov_b32 s2, s6
-; GFX9-NEXT: s_mov_b32 s1, s5
-; GFX9-NEXT: s_mov_b32 s0, s4
-; GFX9-NEXT: s_mov_b32 s4, s8
-; GFX9-NEXT: s_mov_b32 s5, s9
-; GFX9-NEXT: s_mov_b32 s6, s10
-; GFX9-NEXT: s_mov_b32 s7, s11
-; GFX9-NEXT: s_mov_b32 s8, s15
-; GFX9-NEXT: s_mov_b32 s9, s16
-; GFX9-NEXT: s_mov_b32 s10, s17
-; GFX9-NEXT: s_mov_b32 s11, s18
-; GFX9-NEXT: s_mov_b32 s15, s19
-; GFX9-NEXT: s_mov_b32 s16, s20
-; GFX9-NEXT: v_writelane_b32 v40, s31, 1
-; GFX9-NEXT: s_swappc_b64 s[30:31], s[22:23]
-; GFX9-NEXT: v_readlane_b32 s31, v40, 1
-; GFX9-NEXT: v_readlane_b32 s30, v40, 0
-; GFX9-NEXT: s_mov_b32 s32, s33
-; GFX9-NEXT: v_readlane_b32 s4, v40, 2
-; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1
-; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
-; GFX9-NEXT: s_mov_b64 exec, s[6:7]
-; GFX9-NEXT: s_mov_b32 s33, s4
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: s_setpc_b64 s[30:31]
-;
; GFX11-LABEL: test_call_external_void_func_a15i32_inreg_i32_inreg:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1529,3 +936,6 @@ define void @test_call_external_void_func_a15i32_inreg_i32_inreg([13 x i32] inre
attributes #0 = { nounwind }
attributes #1 = { nounwind "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-queue-ptr" "amdgpu-no-work-group-id-x" "amdgpu-no-work-group-id-y" "amdgpu-no-work-group-id-z" "amdgpu-no-work-item-id-x" "amdgpu-no-work-item-id-y" "amdgpu-no-work-item-id-z" }
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; GISEL: {{.*}}
+; SDAG: {{.*}}
diff --git a/llvm/test/CodeGen/AMDGPU/call-argument-types.ll b/llvm/test/CodeGen/AMDGPU/call-argument-types.ll
index 832e43f..c407f76 100644
--- a/llvm/test/CodeGen/AMDGPU/call-argument-types.ll
+++ b/llvm/test/CodeGen/AMDGPU/call-argument-types.ll
@@ -1,10 +1,11 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
; RUN: llc -mtriple=amdgcn -mcpu=fiji -mattr=-flat-for-global -amdgpu-scalarize-global-loads=0 < %s | FileCheck -enable-var-scope -check-prefixes=VI %s
; RUN: llc -mtriple=amdgcn -mcpu=hawaii -amdgpu-scalarize-global-loads=0 < %s | FileCheck -enable-var-scope -check-prefixes=CI %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -amdgpu-scalarize-global-loads=0 < %s | FileCheck -enable-var-scope -check-prefixes=GFX9 %s
+; RUN: llc -mtriple=amdgcn -global-isel=0 -mcpu=gfx900 -mattr=-flat-for-global -amdgpu-scalarize-global-loads=0 < %s | FileCheck -enable-var-scope -check-prefixes=GFX9,SDAG %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -mattr=-flat-for-global -amdgpu-scalarize-global-loads=0 < %s | FileCheck -enable-var-scope -check-prefixes=GFX11,GFX11-TRUE16 %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -mattr=-flat-for-global -amdgpu-scalarize-global-loads=0 < %s | FileCheck -enable-var-scope -check-prefixes=GFX11,GFX11-FAKE16 %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=fiji -mattr=-flat-for-global -amdgpu-scalarize-global-loads=0 < %s | FileCheck -enable-var-scope -check-prefixes=HSA %s
+; RUN: llc -mtriple=amdgcn -global-isel=1 -mcpu=gfx900 -mattr=-flat-for-global -amdgpu-scalarize-global-loads=0 < %s | FileCheck -enable-var-scope -check-prefixes=GFX9,GISEL %s
declare hidden void @external_void_func_i1(i1) #0
declare hidden void @external_void_func_i1_signext(i1 signext) #0
@@ -100,24 +101,24 @@ define amdgpu_kernel void @test_call_external_void_func_i1_imm() #0 {
; CI-NEXT: s_swappc_b64 s[30:31], s[4:5]
; CI-NEXT: s_endpgm
;
-; GFX9-LABEL: test_call_external_void_func_i1_imm:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0
-; GFX9-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1
-; GFX9-NEXT: s_mov_b32 s38, -1
-; GFX9-NEXT: s_mov_b32 s39, 0xe00000
-; GFX9-NEXT: s_add_u32 s36, s36, s3
-; GFX9-NEXT: s_addc_u32 s37, s37, 0
-; GFX9-NEXT: s_mov_b64 s[6:7], s[0:1]
-; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37]
-; GFX9-NEXT: s_getpc_b64 s[4:5]
-; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_i1@rel32@lo+4
-; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_i1@rel32@hi+12
-; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39]
-; GFX9-NEXT: v_mov_b32_e32 v0, 1
-; GFX9-NEXT: s_mov_b32 s32, 0
-; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5]
-; GFX9-NEXT: s_endpgm
+; SDAG-LABEL: test_call_external_void_func_i1_imm:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0
+; SDAG-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1
+; SDAG-NEXT: s_mov_b32 s38, -1
+; SDAG-NEXT: s_mov_b32 s39, 0xe00000
+; SDAG-NEXT: s_add_u32 s36, s36, s3
+; SDAG-NEXT: s_addc_u32 s37, s37, 0
+; SDAG-NEXT: s_mov_b64 s[6:7], s[0:1]
+; SDAG-NEXT: s_mov_b64 s[0:1], s[36:37]
+; SDAG-NEXT: s_getpc_b64 s[4:5]
+; SDAG-NEXT: s_add_u32 s4, s4, external_void_func_i1@rel32@lo+4
+; SDAG-NEXT: s_addc_u32 s5, s5, external_void_func_i1@rel32@hi+12
+; SDAG-NEXT: s_mov_b64 s[2:3], s[38:39]
+; SDAG-NEXT: v_mov_b32_e32 v0, 1
+; SDAG-NEXT: s_mov_b32 s32, 0
+; SDAG-NEXT: s_swappc_b64 s[30:31], s[4:5]
+; SDAG-NEXT: s_endpgm
;
; GFX11-LABEL: test_call_external_void_func_i1_imm:
; GFX11: ; %bb.0:
@@ -145,6 +146,25 @@ define amdgpu_kernel void @test_call_external_void_func_i1_imm() #0 {
; HSA-NEXT: s_mov_b32 s32, 0
; HSA-NEXT: s_swappc_b64 s[30:31], s[8:9]
; HSA-NEXT: s_endpgm
+;
+; GISEL-LABEL: test_call_external_void_func_i1_imm:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0
+; GISEL-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1
+; GISEL-NEXT: s_mov_b32 s38, -1
+; GISEL-NEXT: s_mov_b32 s39, 0xe00000
+; GISEL-NEXT: s_add_u32 s36, s36, s3
+; GISEL-NEXT: s_addc_u32 s37, s37, 0
+; GISEL-NEXT: s_mov_b64 s[6:7], s[0:1]
+; GISEL-NEXT: s_mov_b64 s[0:1], s[36:37]
+; GISEL-NEXT: s_getpc_b64 s[4:5]
+; GISEL-NEXT: s_add_u32 s4, s4, external_void_func_i1@rel32@lo+4
+; GISEL-NEXT: s_addc_u32 s5, s5, external_void_func_i1@rel32@hi+12
+; GISEL-NEXT: v_mov_b32_e32 v0, 1
+; GISEL-NEXT: s_mov_b64 s[2:3], s[38:39]
+; GISEL-NEXT: s_mov_b32 s32, 0
+; GISEL-NEXT: s_swappc_b64 s[30:31], s[4:5]
+; GISEL-NEXT: s_endpgm
call void @external_void_func_i1(i1 true)
ret void
}
@@ -196,28 +216,28 @@ define amdgpu_kernel void @test_call_external_void_func_i1_signext(i32) #0 {
; CI-NEXT: s_swappc_b64 s[30:31], s[4:5]
; CI-NEXT: s_endpgm
;
-; GFX9-LABEL: test_call_external_void_func_i1_signext:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_mov_b32 s3, 0xf000
-; GFX9-NEXT: s_mov_b32 s2, -1
-; GFX9-NEXT: buffer_load_ubyte v0, off, s[0:3], 0 glc
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0
-; GFX9-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1
-; GFX9-NEXT: s_mov_b32 s38, -1
-; GFX9-NEXT: s_mov_b32 s39, 0xe00000
-; GFX9-NEXT: s_add_u32 s36, s36, s5
-; GFX9-NEXT: s_addc_u32 s37, s37, 0
-; GFX9-NEXT: s_mov_b64 s[6:7], s[0:1]
-; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37]
-; GFX9-NEXT: s_getpc_b64 s[4:5]
-; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_i1_signext@rel32@lo+4
-; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_i1_signext@rel32@hi+12
-; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39]
-; GFX9-NEXT: s_mov_b32 s32, 0
-; GFX9-NEXT: v_bfe_i32 v0, v0, 0, 1
-; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5]
-; GFX9-NEXT: s_endpgm
+; SDAG-LABEL: test_call_external_void_func_i1_signext:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_mov_b32 s3, 0xf000
+; SDAG-NEXT: s_mov_b32 s2, -1
+; SDAG-NEXT: buffer_load_ubyte v0, off, s[0:3], 0 glc
+; SDAG-NEXT: s_waitcnt vmcnt(0)
+; SDAG-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0
+; SDAG-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1
+; SDAG-NEXT: s_mov_b32 s38, -1
+; SDAG-NEXT: s_mov_b32 s39, 0xe00000
+; SDAG-NEXT: s_add_u32 s36, s36, s5
+; SDAG-NEXT: s_addc_u32 s37, s37, 0
+; SDAG-NEXT: s_mov_b64 s[6:7], s[0:1]
+; SDAG-NEXT: s_mov_b64 s[0:1], s[36:37]
+; SDAG-NEXT: s_getpc_b64 s[4:5]
+; SDAG-NEXT: s_add_u32 s4, s4, external_void_func_i1_signext@rel32@lo+4
+; SDAG-NEXT: s_addc_u32 s5, s5, external_void_func_i1_signext@rel32@hi+12
+; SDAG-NEXT: s_mov_b64 s[2:3], s[38:39]
+; SDAG-NEXT: s_mov_b32 s32, 0
+; SDAG-NEXT: v_bfe_i32 v0, v0, 0, 1
+; SDAG-NEXT: s_swappc_b64 s[30:31], s[4:5]
+; SDAG-NEXT: s_endpgm
;
; GFX11-LABEL: test_call_external_void_func_i1_signext:
; GFX11: ; %bb.0:
@@ -253,6 +273,29 @@ define amdgpu_kernel void @test_call_external_void_func_i1_signext(i32) #0 {
; HSA-NEXT: v_bfe_i32 v0, v0, 0, 1
; HSA-NEXT: s_swappc_b64 s[30:31], s[8:9]
; HSA-NEXT: s_endpgm
+;
+; GISEL-LABEL: test_call_external_void_func_i1_signext:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_mov_b32 s2, -1
+; GISEL-NEXT: s_mov_b32 s3, 0xf000
+; GISEL-NEXT: buffer_load_ubyte v0, off, s[0:3], 0 glc
+; GISEL-NEXT: s_waitcnt vmcnt(0)
+; GISEL-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0
+; GISEL-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1
+; GISEL-NEXT: s_mov_b32 s38, -1
+; GISEL-NEXT: s_mov_b32 s39, 0xe00000
+; GISEL-NEXT: s_add_u32 s36, s36, s5
+; GISEL-NEXT: s_addc_u32 s37, s37, 0
+; GISEL-NEXT: s_mov_b64 s[6:7], s[0:1]
+; GISEL-NEXT: s_mov_b64 s[0:1], s[36:37]
+; GISEL-NEXT: s_getpc_b64 s[4:5]
+; GISEL-NEXT: s_add_u32 s4, s4, external_void_func_i1_signext@rel32@lo+4
+; GISEL-NEXT: s_addc_u32 s5, s5, external_void_func_i1_signext@rel32@hi+12
+; GISEL-NEXT: s_mov_b64 s[2:3], s[38:39]
+; GISEL-NEXT: s_mov_b32 s32, 0
+; GISEL-NEXT: v_bfe_i32 v0, v0, 0, 1
+; GISEL-NEXT: s_swappc_b64 s[30:31], s[4:5]
+; GISEL-NEXT: s_endpgm
%var = load volatile i1, ptr addrspace(1) poison
call void @external_void_func_i1_signext(i1 signext %var)
ret void
@@ -306,28 +349,28 @@ define amdgpu_kernel void @test_call_external_void_func_i1_zeroext(i32) #0 {
; CI-NEXT: s_swappc_b64 s[30:31], s[4:5]
; CI-NEXT: s_endpgm
;
-; GFX9-LABEL: test_call_external_void_func_i1_zeroext:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_mov_b32 s3, 0xf000
-; GFX9-NEXT: s_mov_b32 s2, -1
-; GFX9-NEXT: buffer_load_ubyte v0, off, s[0:3], 0 glc
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0
-; GFX9-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1
-; GFX9-NEXT: s_mov_b32 s38, -1
-; GFX9-NEXT: s_mov_b32 s39, 0xe00000
-; GFX9-NEXT: s_add_u32 s36, s36, s5
-; GFX9-NEXT: s_addc_u32 s37, s37, 0
-; GFX9-NEXT: s_mov_b64 s[6:7], s[0:1]
-; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37]
-; GFX9-NEXT: s_getpc_b64 s[4:5]
-; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_i1_zeroext@rel32@lo+4
-; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_i1_zeroext@rel32@hi+12
-; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39]
-; GFX9-NEXT: s_mov_b32 s32, 0
-; GFX9-NEXT: v_and_b32_e32 v0, 1, v0
-; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5]
-; GFX9-NEXT: s_endpgm
+; SDAG-LABEL: test_call_external_void_func_i1_zeroext:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_mov_b32 s3, 0xf000
+; SDAG-NEXT: s_mov_b32 s2, -1
+; SDAG-NEXT: buffer_load_ubyte v0, off, s[0:3], 0 glc
+; SDAG-NEXT: s_waitcnt vmcnt(0)
+; SDAG-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0
+; SDAG-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1
+; SDAG-NEXT: s_mov_b32 s38, -1
+; SDAG-NEXT: s_mov_b32 s39, 0xe00000
+; SDAG-NEXT: s_add_u32 s36, s36, s5
+; SDAG-NEXT: s_addc_u32 s37, s37, 0
+; SDAG-NEXT: s_mov_b64 s[6:7], s[0:1]
+; SDAG-NEXT: s_mov_b64 s[0:1], s[36:37]
+; SDAG-NEXT: s_getpc_b64 s[4:5]
+; SDAG-NEXT: s_add_u32 s4, s4, external_void_func_i1_zeroext@rel32@lo+4
+; SDAG-NEXT: s_addc_u32 s5, s5, external_void_func_i1_zeroext@rel32@hi+12
+; SDAG-NEXT: s_mov_b64 s[2:3], s[38:39]
+; SDAG-NEXT: s_mov_b32 s32, 0
+; SDAG-NEXT: v_and_b32_e32 v0, 1, v0
+; SDAG-NEXT: s_swappc_b64 s[30:31], s[4:5]
+; SDAG-NEXT: s_endpgm
;
; GFX11-LABEL: test_call_external_void_func_i1_zeroext:
; GFX11: ; %bb.0:
@@ -363,6 +406,29 @@ define amdgpu_kernel void @test_call_external_void_func_i1_zeroext(i32) #0 {
; HSA-NEXT: v_and_b32_e32 v0, 1, v0
; HSA-NEXT: s_swappc_b64 s[30:31], s[8:9]
; HSA-NEXT: s_endpgm
+;
+; GISEL-LABEL: test_call_external_void_func_i1_zeroext:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_mov_b32 s2, -1
+; GISEL-NEXT: s_mov_b32 s3, 0xf000
+; GISEL-NEXT: buffer_load_ubyte v0, off, s[0:3], 0 glc
+; GISEL-NEXT: s_waitcnt vmcnt(0)
+; GISEL-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0
+; GISEL-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1
+; GISEL-NEXT: s_mov_b32 s38, -1
+; GISEL-NEXT: s_mov_b32 s39, 0xe00000
+; GISEL-NEXT: s_add_u32 s36, s36, s5
+; GISEL-NEXT: s_addc_u32 s37, s37, 0
+; GISEL-NEXT: s_mov_b64 s[6:7], s[0:1]
+; GISEL-NEXT: s_mov_b64 s[0:1], s[36:37]
+; GISEL-NEXT: s_getpc_b64 s[4:5]
+; GISEL-NEXT: s_add_u32 s4, s4, external_void_func_i1_zeroext@rel32@lo+4
+; GISEL-NEXT: s_addc_u32 s5, s5, external_void_func_i1_zeroext@rel32@hi+12
+; GISEL-NEXT: s_mov_b64 s[2:3], s[38:39]
+; GISEL-NEXT: s_mov_b32 s32, 0
+; GISEL-NEXT: v_and_b32_e32 v0, 1, v0
+; GISEL-NEXT: s_swappc_b64 s[30:31], s[4:5]
+; GISEL-NEXT: s_endpgm
%var = load volatile i1, ptr addrspace(1) poison
call void @external_void_func_i1_zeroext(i1 zeroext %var)
ret void
@@ -407,24 +473,24 @@ define amdgpu_kernel void @test_call_external_void_func_i8_imm(i32) #0 {
; CI-NEXT: s_swappc_b64 s[30:31], s[4:5]
; CI-NEXT: s_endpgm
;
-; GFX9-LABEL: test_call_external_void_func_i8_imm:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0
-; GFX9-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1
-; GFX9-NEXT: s_mov_b32 s38, -1
-; GFX9-NEXT: s_mov_b32 s39, 0xe00000
-; GFX9-NEXT: s_add_u32 s36, s36, s5
-; GFX9-NEXT: s_addc_u32 s37, s37, 0
-; GFX9-NEXT: s_mov_b64 s[6:7], s[0:1]
-; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37]
-; GFX9-NEXT: s_getpc_b64 s[4:5]
-; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_i8@rel32@lo+4
-; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_i8@rel32@hi+12
-; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39]
-; GFX9-NEXT: v_mov_b32_e32 v0, 0x7b
-; GFX9-NEXT: s_mov_b32 s32, 0
-; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5]
-; GFX9-NEXT: s_endpgm
+; SDAG-LABEL: test_call_external_void_func_i8_imm:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0
+; SDAG-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1
+; SDAG-NEXT: s_mov_b32 s38, -1
+; SDAG-NEXT: s_mov_b32 s39, 0xe00000
+; SDAG-NEXT: s_add_u32 s36, s36, s5
+; SDAG-NEXT: s_addc_u32 s37, s37, 0
+; SDAG-NEXT: s_mov_b64 s[6:7], s[0:1]
+; SDAG-NEXT: s_mov_b64 s[0:1], s[36:37]
+; SDAG-NEXT: s_getpc_b64 s[4:5]
+; SDAG-NEXT: s_add_u32 s4, s4, external_void_func_i8@rel32@lo+4
+; SDAG-NEXT: s_addc_u32 s5, s5, external_void_func_i8@rel32@hi+12
+; SDAG-NEXT: s_mov_b64 s[2:3], s[38:39]
+; SDAG-NEXT: v_mov_b32_e32 v0, 0x7b
+; SDAG-NEXT: s_mov_b32 s32, 0
+; SDAG-NEXT: s_swappc_b64 s[30:31], s[4:5]
+; SDAG-NEXT: s_endpgm
;
; GFX11-TRUE16-LABEL: test_call_external_void_func_i8_imm:
; GFX11-TRUE16: ; %bb.0:
@@ -463,6 +529,25 @@ define amdgpu_kernel void @test_call_external_void_func_i8_imm(i32) #0 {
; HSA-NEXT: s_mov_b32 s32, 0
; HSA-NEXT: s_swappc_b64 s[30:31], s[8:9]
; HSA-NEXT: s_endpgm
+;
+; GISEL-LABEL: test_call_external_void_func_i8_imm:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0
+; GISEL-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1
+; GISEL-NEXT: s_mov_b32 s38, -1
+; GISEL-NEXT: s_mov_b32 s39, 0xe00000
+; GISEL-NEXT: s_add_u32 s36, s36, s5
+; GISEL-NEXT: s_addc_u32 s37, s37, 0
+; GISEL-NEXT: s_mov_b64 s[6:7], s[0:1]
+; GISEL-NEXT: s_mov_b64 s[0:1], s[36:37]
+; GISEL-NEXT: s_getpc_b64 s[4:5]
+; GISEL-NEXT: s_add_u32 s4, s4, external_void_func_i8@rel32@lo+4
+; GISEL-NEXT: s_addc_u32 s5, s5, external_void_func_i8@rel32@hi+12
+; GISEL-NEXT: v_mov_b32_e32 v0, 0x7b
+; GISEL-NEXT: s_mov_b64 s[2:3], s[38:39]
+; GISEL-NEXT: s_mov_b32 s32, 0
+; GISEL-NEXT: s_swappc_b64 s[30:31], s[4:5]
+; GISEL-NEXT: s_endpgm
call void @external_void_func_i8(i8 123)
ret void
}
@@ -513,27 +598,27 @@ define amdgpu_kernel void @test_call_external_void_func_i8_signext(i32) #0 {
; CI-NEXT: s_swappc_b64 s[30:31], s[4:5]
; CI-NEXT: s_endpgm
;
-; GFX9-LABEL: test_call_external_void_func_i8_signext:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_mov_b32 s3, 0xf000
-; GFX9-NEXT: s_mov_b32 s2, -1
-; GFX9-NEXT: buffer_load_sbyte v0, off, s[0:3], 0 glc
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0
-; GFX9-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1
-; GFX9-NEXT: s_mov_b32 s38, -1
-; GFX9-NEXT: s_mov_b32 s39, 0xe00000
-; GFX9-NEXT: s_add_u32 s36, s36, s5
-; GFX9-NEXT: s_addc_u32 s37, s37, 0
-; GFX9-NEXT: s_mov_b64 s[6:7], s[0:1]
-; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37]
-; GFX9-NEXT: s_getpc_b64 s[4:5]
-; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_i8_signext@rel32@lo+4
-; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_i8_signext@rel32@hi+12
-; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39]
-; GFX9-NEXT: s_mov_b32 s32, 0
-; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5]
-; GFX9-NEXT: s_endpgm
+; SDAG-LABEL: test_call_external_void_func_i8_signext:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_mov_b32 s3, 0xf000
+; SDAG-NEXT: s_mov_b32 s2, -1
+; SDAG-NEXT: buffer_load_sbyte v0, off, s[0:3], 0 glc
+; SDAG-NEXT: s_waitcnt vmcnt(0)
+; SDAG-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0
+; SDAG-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1
+; SDAG-NEXT: s_mov_b32 s38, -1
+; SDAG-NEXT: s_mov_b32 s39, 0xe00000
+; SDAG-NEXT: s_add_u32 s36, s36, s5
+; SDAG-NEXT: s_addc_u32 s37, s37, 0
+; SDAG-NEXT: s_mov_b64 s[6:7], s[0:1]
+; SDAG-NEXT: s_mov_b64 s[0:1], s[36:37]
+; SDAG-NEXT: s_getpc_b64 s[4:5]
+; SDAG-NEXT: s_add_u32 s4, s4, external_void_func_i8_signext@rel32@lo+4
+; SDAG-NEXT: s_addc_u32 s5, s5, external_void_func_i8_signext@rel32@hi+12
+; SDAG-NEXT: s_mov_b64 s[2:3], s[38:39]
+; SDAG-NEXT: s_mov_b32 s32, 0
+; SDAG-NEXT: s_swappc_b64 s[30:31], s[4:5]
+; SDAG-NEXT: s_endpgm
;
; GFX11-LABEL: test_call_external_void_func_i8_signext:
; GFX11: ; %bb.0:
@@ -567,6 +652,28 @@ define amdgpu_kernel void @test_call_external_void_func_i8_signext(i32) #0 {
; HSA-NEXT: s_mov_b32 s32, 0
; HSA-NEXT: s_swappc_b64 s[30:31], s[8:9]
; HSA-NEXT: s_endpgm
+;
+; GISEL-LABEL: test_call_external_void_func_i8_signext:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_mov_b32 s2, -1
+; GISEL-NEXT: s_mov_b32 s3, 0xf000
+; GISEL-NEXT: buffer_load_sbyte v0, off, s[0:3], 0 glc
+; GISEL-NEXT: s_waitcnt vmcnt(0)
+; GISEL-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0
+; GISEL-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1
+; GISEL-NEXT: s_mov_b32 s38, -1
+; GISEL-NEXT: s_mov_b32 s39, 0xe00000
+; GISEL-NEXT: s_add_u32 s36, s36, s5
+; GISEL-NEXT: s_addc_u32 s37, s37, 0
+; GISEL-NEXT: s_mov_b64 s[6:7], s[0:1]
+; GISEL-NEXT: s_mov_b64 s[0:1], s[36:37]
+; GISEL-NEXT: s_getpc_b64 s[4:5]
+; GISEL-NEXT: s_add_u32 s4, s4, external_void_func_i8_signext@rel32@lo+4
+; GISEL-NEXT: s_addc_u32 s5, s5, external_void_func_i8_signext@rel32@hi+12
+; GISEL-NEXT: s_mov_b64 s[2:3], s[38:39]
+; GISEL-NEXT: s_mov_b32 s32, 0
+; GISEL-NEXT: s_swappc_b64 s[30:31], s[4:5]
+; GISEL-NEXT: s_endpgm
%var = load volatile i8, ptr addrspace(1) poison
call void @external_void_func_i8_signext(i8 signext %var)
ret void
@@ -617,27 +724,27 @@ define amdgpu_kernel void @test_call_external_void_func_i8_zeroext(i32) #0 {
; CI-NEXT: s_swappc_b64 s[30:31], s[4:5]
; CI-NEXT: s_endpgm
;
-; GFX9-LABEL: test_call_external_void_func_i8_zeroext:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_mov_b32 s3, 0xf000
-; GFX9-NEXT: s_mov_b32 s2, -1
-; GFX9-NEXT: buffer_load_ubyte v0, off, s[0:3], 0 glc
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0
-; GFX9-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1
-; GFX9-NEXT: s_mov_b32 s38, -1
-; GFX9-NEXT: s_mov_b32 s39, 0xe00000
-; GFX9-NEXT: s_add_u32 s36, s36, s5
-; GFX9-NEXT: s_addc_u32 s37, s37, 0
-; GFX9-NEXT: s_mov_b64 s[6:7], s[0:1]
-; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37]
-; GFX9-NEXT: s_getpc_b64 s[4:5]
-; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_i8_zeroext@rel32@lo+4
-; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_i8_zeroext@rel32@hi+12
-; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39]
-; GFX9-NEXT: s_mov_b32 s32, 0
-; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5]
-; GFX9-NEXT: s_endpgm
+; SDAG-LABEL: test_call_external_void_func_i8_zeroext:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_mov_b32 s3, 0xf000
+; SDAG-NEXT: s_mov_b32 s2, -1
+; SDAG-NEXT: buffer_load_ubyte v0, off, s[0:3], 0 glc
+; SDAG-NEXT: s_waitcnt vmcnt(0)
+; SDAG-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0
+; SDAG-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1
+; SDAG-NEXT: s_mov_b32 s38, -1
+; SDAG-NEXT: s_mov_b32 s39, 0xe00000
+; SDAG-NEXT: s_add_u32 s36, s36, s5
+; SDAG-NEXT: s_addc_u32 s37, s37, 0
+; SDAG-NEXT: s_mov_b64 s[6:7], s[0:1]
+; SDAG-NEXT: s_mov_b64 s[0:1], s[36:37]
+; SDAG-NEXT: s_getpc_b64 s[4:5]
+; SDAG-NEXT: s_add_u32 s4, s4, external_void_func_i8_zeroext@rel32@lo+4
+; SDAG-NEXT: s_addc_u32 s5, s5, external_void_func_i8_zeroext@rel32@hi+12
+; SDAG-NEXT: s_mov_b64 s[2:3], s[38:39]
+; SDAG-NEXT: s_mov_b32 s32, 0
+; SDAG-NEXT: s_swappc_b64 s[30:31], s[4:5]
+; SDAG-NEXT: s_endpgm
;
; GFX11-LABEL: test_call_external_void_func_i8_zeroext:
; GFX11: ; %bb.0:
@@ -671,6 +778,28 @@ define amdgpu_kernel void @test_call_external_void_func_i8_zeroext(i32) #0 {
; HSA-NEXT: s_mov_b32 s32, 0
; HSA-NEXT: s_swappc_b64 s[30:31], s[8:9]
; HSA-NEXT: s_endpgm
+;
+; GISEL-LABEL: test_call_external_void_func_i8_zeroext:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_mov_b32 s2, -1
+; GISEL-NEXT: s_mov_b32 s3, 0xf000
+; GISEL-NEXT: buffer_load_ubyte v0, off, s[0:3], 0 glc
+; GISEL-NEXT: s_waitcnt vmcnt(0)
+; GISEL-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0
+; GISEL-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1
+; GISEL-NEXT: s_mov_b32 s38, -1
+; GISEL-NEXT: s_mov_b32 s39, 0xe00000
+; GISEL-NEXT: s_add_u32 s36, s36, s5
+; GISEL-NEXT: s_addc_u32 s37, s37, 0
+; GISEL-NEXT: s_mov_b64 s[6:7], s[0:1]
+; GISEL-NEXT: s_mov_b64 s[0:1], s[36:37]
+; GISEL-NEXT: s_getpc_b64 s[4:5]
+; GISEL-NEXT: s_add_u32 s4, s4, external_void_func_i8_zeroext@rel32@lo+4
+; GISEL-NEXT: s_addc_u32 s5, s5, external_void_func_i8_zeroext@rel32@hi+12
+; GISEL-NEXT: s_mov_b64 s[2:3], s[38:39]
+; GISEL-NEXT: s_mov_b32 s32, 0
+; GISEL-NEXT: s_swappc_b64 s[30:31], s[4:5]
+; GISEL-NEXT: s_endpgm
%var = load volatile i8, ptr addrspace(1) poison
call void @external_void_func_i8_zeroext(i8 zeroext %var)
ret void
@@ -715,24 +844,24 @@ define amdgpu_kernel void @test_call_external_void_func_i16_imm() #0 {
; CI-NEXT: s_swappc_b64 s[30:31], s[4:5]
; CI-NEXT: s_endpgm
;
-; GFX9-LABEL: test_call_external_void_func_i16_imm:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0
-; GFX9-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1
-; GFX9-NEXT: s_mov_b32 s38, -1
-; GFX9-NEXT: s_mov_b32 s39, 0xe00000
-; GFX9-NEXT: s_add_u32 s36, s36, s3
-; GFX9-NEXT: s_addc_u32 s37, s37, 0
-; GFX9-NEXT: s_mov_b64 s[6:7], s[0:1]
-; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37]
-; GFX9-NEXT: s_getpc_b64 s[4:5]
-; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_i16@rel32@lo+4
-; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_i16@rel32@hi+12
-; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39]
-; GFX9-NEXT: v_mov_b32_e32 v0, 0x7b
-; GFX9-NEXT: s_mov_b32 s32, 0
-; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5]
-; GFX9-NEXT: s_endpgm
+; SDAG-LABEL: test_call_external_void_func_i16_imm:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0
+; SDAG-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1
+; SDAG-NEXT: s_mov_b32 s38, -1
+; SDAG-NEXT: s_mov_b32 s39, 0xe00000
+; SDAG-NEXT: s_add_u32 s36, s36, s3
+; SDAG-NEXT: s_addc_u32 s37, s37, 0
+; SDAG-NEXT: s_mov_b64 s[6:7], s[0:1]
+; SDAG-NEXT: s_mov_b64 s[0:1], s[36:37]
+; SDAG-NEXT: s_getpc_b64 s[4:5]
+; SDAG-NEXT: s_add_u32 s4, s4, external_void_func_i16@rel32@lo+4
+; SDAG-NEXT: s_addc_u32 s5, s5, external_void_func_i16@rel32@hi+12
+; SDAG-NEXT: s_mov_b64 s[2:3], s[38:39]
+; SDAG-NEXT: v_mov_b32_e32 v0, 0x7b
+; SDAG-NEXT: s_mov_b32 s32, 0
+; SDAG-NEXT: s_swappc_b64 s[30:31], s[4:5]
+; SDAG-NEXT: s_endpgm
;
; GFX11-TRUE16-LABEL: test_call_external_void_func_i16_imm:
; GFX11-TRUE16: ; %bb.0:
@@ -771,6 +900,25 @@ define amdgpu_kernel void @test_call_external_void_func_i16_imm() #0 {
; HSA-NEXT: s_mov_b32 s32, 0
; HSA-NEXT: s_swappc_b64 s[30:31], s[8:9]
; HSA-NEXT: s_endpgm
+;
+; GISEL-LABEL: test_call_external_void_func_i16_imm:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0
+; GISEL-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1
+; GISEL-NEXT: s_mov_b32 s38, -1
+; GISEL-NEXT: s_mov_b32 s39, 0xe00000
+; GISEL-NEXT: s_add_u32 s36, s36, s3
+; GISEL-NEXT: s_addc_u32 s37, s37, 0
+; GISEL-NEXT: s_mov_b64 s[6:7], s[0:1]
+; GISEL-NEXT: s_mov_b64 s[0:1], s[36:37]
+; GISEL-NEXT: s_getpc_b64 s[4:5]
+; GISEL-NEXT: s_add_u32 s4, s4, external_void_func_i16@rel32@lo+4
+; GISEL-NEXT: s_addc_u32 s5, s5, external_void_func_i16@rel32@hi+12
+; GISEL-NEXT: v_mov_b32_e32 v0, 0x7b
+; GISEL-NEXT: s_mov_b64 s[2:3], s[38:39]
+; GISEL-NEXT: s_mov_b32 s32, 0
+; GISEL-NEXT: s_swappc_b64 s[30:31], s[4:5]
+; GISEL-NEXT: s_endpgm
call void @external_void_func_i16(i16 123)
ret void
}
@@ -820,27 +968,27 @@ define amdgpu_kernel void @test_call_external_void_func_i16_signext(i32) #0 {
; CI-NEXT: s_swappc_b64 s[30:31], s[4:5]
; CI-NEXT: s_endpgm
;
-; GFX9-LABEL: test_call_external_void_func_i16_signext:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_mov_b32 s3, 0xf000
-; GFX9-NEXT: s_mov_b32 s2, -1
-; GFX9-NEXT: buffer_load_sshort v0, off, s[0:3], 0 glc
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0
-; GFX9-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1
-; GFX9-NEXT: s_mov_b32 s38, -1
-; GFX9-NEXT: s_mov_b32 s39, 0xe00000
-; GFX9-NEXT: s_add_u32 s36, s36, s5
-; GFX9-NEXT: s_addc_u32 s37, s37, 0
-; GFX9-NEXT: s_mov_b64 s[6:7], s[0:1]
-; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37]
-; GFX9-NEXT: s_getpc_b64 s[4:5]
-; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_i16_signext@rel32@lo+4
-; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_i16_signext@rel32@hi+12
-; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39]
-; GFX9-NEXT: s_mov_b32 s32, 0
-; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5]
-; GFX9-NEXT: s_endpgm
+; SDAG-LABEL: test_call_external_void_func_i16_signext:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_mov_b32 s3, 0xf000
+; SDAG-NEXT: s_mov_b32 s2, -1
+; SDAG-NEXT: buffer_load_sshort v0, off, s[0:3], 0 glc
+; SDAG-NEXT: s_waitcnt vmcnt(0)
+; SDAG-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0
+; SDAG-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1
+; SDAG-NEXT: s_mov_b32 s38, -1
+; SDAG-NEXT: s_mov_b32 s39, 0xe00000
+; SDAG-NEXT: s_add_u32 s36, s36, s5
+; SDAG-NEXT: s_addc_u32 s37, s37, 0
+; SDAG-NEXT: s_mov_b64 s[6:7], s[0:1]
+; SDAG-NEXT: s_mov_b64 s[0:1], s[36:37]
+; SDAG-NEXT: s_getpc_b64 s[4:5]
+; SDAG-NEXT: s_add_u32 s4, s4, external_void_func_i16_signext@rel32@lo+4
+; SDAG-NEXT: s_addc_u32 s5, s5, external_void_func_i16_signext@rel32@hi+12
+; SDAG-NEXT: s_mov_b64 s[2:3], s[38:39]
+; SDAG-NEXT: s_mov_b32 s32, 0
+; SDAG-NEXT: s_swappc_b64 s[30:31], s[4:5]
+; SDAG-NEXT: s_endpgm
;
; GFX11-LABEL: test_call_external_void_func_i16_signext:
; GFX11: ; %bb.0:
@@ -874,6 +1022,28 @@ define amdgpu_kernel void @test_call_external_void_func_i16_signext(i32) #0 {
; HSA-NEXT: s_mov_b32 s32, 0
; HSA-NEXT: s_swappc_b64 s[30:31], s[8:9]
; HSA-NEXT: s_endpgm
+;
+; GISEL-LABEL: test_call_external_void_func_i16_signext:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_mov_b32 s2, -1
+; GISEL-NEXT: s_mov_b32 s3, 0xf000
+; GISEL-NEXT: buffer_load_sshort v0, off, s[0:3], 0 glc
+; GISEL-NEXT: s_waitcnt vmcnt(0)
+; GISEL-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0
+; GISEL-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1
+; GISEL-NEXT: s_mov_b32 s38, -1
+; GISEL-NEXT: s_mov_b32 s39, 0xe00000
+; GISEL-NEXT: s_add_u32 s36, s36, s5
+; GISEL-NEXT: s_addc_u32 s37, s37, 0
+; GISEL-NEXT: s_mov_b64 s[6:7], s[0:1]
+; GISEL-NEXT: s_mov_b64 s[0:1], s[36:37]
+; GISEL-NEXT: s_getpc_b64 s[4:5]
+; GISEL-NEXT: s_add_u32 s4, s4, external_void_func_i16_signext@rel32@lo+4
+; GISEL-NEXT: s_addc_u32 s5, s5, external_void_func_i16_signext@rel32@hi+12
+; GISEL-NEXT: s_mov_b64 s[2:3], s[38:39]
+; GISEL-NEXT: s_mov_b32 s32, 0
+; GISEL-NEXT: s_swappc_b64 s[30:31], s[4:5]
+; GISEL-NEXT: s_endpgm
%var = load volatile i16, ptr addrspace(1) poison
call void @external_void_func_i16_signext(i16 signext %var)
ret void
@@ -924,27 +1094,27 @@ define amdgpu_kernel void @test_call_external_void_func_i16_zeroext(i32) #0 {
; CI-NEXT: s_swappc_b64 s[30:31], s[4:5]
; CI-NEXT: s_endpgm
;
-; GFX9-LABEL: test_call_external_void_func_i16_zeroext:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_mov_b32 s3, 0xf000
-; GFX9-NEXT: s_mov_b32 s2, -1
-; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], 0 glc
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0
-; GFX9-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1
-; GFX9-NEXT: s_mov_b32 s38, -1
-; GFX9-NEXT: s_mov_b32 s39, 0xe00000
-; GFX9-NEXT: s_add_u32 s36, s36, s5
-; GFX9-NEXT: s_addc_u32 s37, s37, 0
-; GFX9-NEXT: s_mov_b64 s[6:7], s[0:1]
-; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37]
-; GFX9-NEXT: s_getpc_b64 s[4:5]
-; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_i16_zeroext@rel32@lo+4
-; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_i16_zeroext@rel32@hi+12
-; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39]
-; GFX9-NEXT: s_mov_b32 s32, 0
-; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5]
-; GFX9-NEXT: s_endpgm
+; SDAG-LABEL: test_call_external_void_func_i16_zeroext:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_mov_b32 s3, 0xf000
+; SDAG-NEXT: s_mov_b32 s2, -1
+; SDAG-NEXT: buffer_load_ushort v0, off, s[0:3], 0 glc
+; SDAG-NEXT: s_waitcnt vmcnt(0)
+; SDAG-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0
+; SDAG-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1
+; SDAG-NEXT: s_mov_b32 s38, -1
+; SDAG-NEXT: s_mov_b32 s39, 0xe00000
+; SDAG-NEXT: s_add_u32 s36, s36, s5
+; SDAG-NEXT: s_addc_u32 s37, s37, 0
+; SDAG-NEXT: s_mov_b64 s[6:7], s[0:1]
+; SDAG-NEXT: s_mov_b64 s[0:1], s[36:37]
+; SDAG-NEXT: s_getpc_b64 s[4:5]
+; SDAG-NEXT: s_add_u32 s4, s4, external_void_func_i16_zeroext@rel32@lo+4
+; SDAG-NEXT: s_addc_u32 s5, s5, external_void_func_i16_zeroext@rel32@hi+12
+; SDAG-NEXT: s_mov_b64 s[2:3], s[38:39]
+; SDAG-NEXT: s_mov_b32 s32, 0
+; SDAG-NEXT: s_swappc_b64 s[30:31], s[4:5]
+; SDAG-NEXT: s_endpgm
;
; GFX11-LABEL: test_call_external_void_func_i16_zeroext:
; GFX11: ; %bb.0:
@@ -978,6 +1148,28 @@ define amdgpu_kernel void @test_call_external_void_func_i16_zeroext(i32) #0 {
; HSA-NEXT: s_mov_b32 s32, 0
; HSA-NEXT: s_swappc_b64 s[30:31], s[8:9]
; HSA-NEXT: s_endpgm
+;
+; GISEL-LABEL: test_call_external_void_func_i16_zeroext:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_mov_b32 s2, -1
+; GISEL-NEXT: s_mov_b32 s3, 0xf000
+; GISEL-NEXT: buffer_load_ushort v0, off, s[0:3], 0 glc
+; GISEL-NEXT: s_waitcnt vmcnt(0)
+; GISEL-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0
+; GISEL-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1
+; GISEL-NEXT: s_mov_b32 s38, -1
+; GISEL-NEXT: s_mov_b32 s39, 0xe00000
+; GISEL-NEXT: s_add_u32 s36, s36, s5
+; GISEL-NEXT: s_addc_u32 s37, s37, 0
+; GISEL-NEXT: s_mov_b64 s[6:7], s[0:1]
+; GISEL-NEXT: s_mov_b64 s[0:1], s[36:37]
+; GISEL-NEXT: s_getpc_b64 s[4:5]
+; GISEL-NEXT: s_add_u32 s4, s4, external_void_func_i16_zeroext@rel32@lo+4
+; GISEL-NEXT: s_addc_u32 s5, s5, external_void_func_i16_zeroext@rel32@hi+12
+; GISEL-NEXT: s_mov_b64 s[2:3], s[38:39]
+; GISEL-NEXT: s_mov_b32 s32, 0
+; GISEL-NEXT: s_swappc_b64 s[30:31], s[4:5]
+; GISEL-NEXT: s_endpgm
%var = load volatile i16, ptr addrspace(1) poison
call void @external_void_func_i16_zeroext(i16 zeroext %var)
ret void
@@ -1022,24 +1214,24 @@ define amdgpu_kernel void @test_call_external_void_func_i32_imm(i32) #0 {
; CI-NEXT: s_swappc_b64 s[30:31], s[4:5]
; CI-NEXT: s_endpgm
;
-; GFX9-LABEL: test_call_external_void_func_i32_imm:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0
-; GFX9-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1
-; GFX9-NEXT: s_mov_b32 s38, -1
-; GFX9-NEXT: s_mov_b32 s39, 0xe00000
-; GFX9-NEXT: s_add_u32 s36, s36, s5
-; GFX9-NEXT: s_addc_u32 s37, s37, 0
-; GFX9-NEXT: s_mov_b64 s[6:7], s[0:1]
-; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37]
-; GFX9-NEXT: s_getpc_b64 s[4:5]
-; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_i32@rel32@lo+4
-; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_i32@rel32@hi+12
-; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39]
-; GFX9-NEXT: v_mov_b32_e32 v0, 42
-; GFX9-NEXT: s_mov_b32 s32, 0
-; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5]
-; GFX9-NEXT: s_endpgm
+; SDAG-LABEL: test_call_external_void_func_i32_imm:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0
+; SDAG-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1
+; SDAG-NEXT: s_mov_b32 s38, -1
+; SDAG-NEXT: s_mov_b32 s39, 0xe00000
+; SDAG-NEXT: s_add_u32 s36, s36, s5
+; SDAG-NEXT: s_addc_u32 s37, s37, 0
+; SDAG-NEXT: s_mov_b64 s[6:7], s[0:1]
+; SDAG-NEXT: s_mov_b64 s[0:1], s[36:37]
+; SDAG-NEXT: s_getpc_b64 s[4:5]
+; SDAG-NEXT: s_add_u32 s4, s4, external_void_func_i32@rel32@lo+4
+; SDAG-NEXT: s_addc_u32 s5, s5, external_void_func_i32@rel32@hi+12
+; SDAG-NEXT: s_mov_b64 s[2:3], s[38:39]
+; SDAG-NEXT: v_mov_b32_e32 v0, 42
+; SDAG-NEXT: s_mov_b32 s32, 0
+; SDAG-NEXT: s_swappc_b64 s[30:31], s[4:5]
+; SDAG-NEXT: s_endpgm
;
; GFX11-LABEL: test_call_external_void_func_i32_imm:
; GFX11: ; %bb.0:
@@ -1067,6 +1259,25 @@ define amdgpu_kernel void @test_call_external_void_func_i32_imm(i32) #0 {
; HSA-NEXT: s_mov_b32 s32, 0
; HSA-NEXT: s_swappc_b64 s[30:31], s[8:9]
; HSA-NEXT: s_endpgm
+;
+; GISEL-LABEL: test_call_external_void_func_i32_imm:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0
+; GISEL-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1
+; GISEL-NEXT: s_mov_b32 s38, -1
+; GISEL-NEXT: s_mov_b32 s39, 0xe00000
+; GISEL-NEXT: s_add_u32 s36, s36, s5
+; GISEL-NEXT: s_addc_u32 s37, s37, 0
+; GISEL-NEXT: s_mov_b64 s[6:7], s[0:1]
+; GISEL-NEXT: s_mov_b64 s[0:1], s[36:37]
+; GISEL-NEXT: s_getpc_b64 s[4:5]
+; GISEL-NEXT: s_add_u32 s4, s4, external_void_func_i32@rel32@lo+4
+; GISEL-NEXT: s_addc_u32 s5, s5, external_void_func_i32@rel32@hi+12
+; GISEL-NEXT: v_mov_b32_e32 v0, 42
+; GISEL-NEXT: s_mov_b64 s[2:3], s[38:39]
+; GISEL-NEXT: s_mov_b32 s32, 0
+; GISEL-NEXT: s_swappc_b64 s[30:31], s[4:5]
+; GISEL-NEXT: s_endpgm
call void @external_void_func_i32(i32 42)
ret void
}
@@ -1112,25 +1323,25 @@ define amdgpu_kernel void @test_call_external_void_func_i64_imm() #0 {
; CI-NEXT: s_swappc_b64 s[30:31], s[4:5]
; CI-NEXT: s_endpgm
;
-; GFX9-LABEL: test_call_external_void_func_i64_imm:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0
-; GFX9-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1
-; GFX9-NEXT: s_mov_b32 s38, -1
-; GFX9-NEXT: s_mov_b32 s39, 0xe00000
-; GFX9-NEXT: s_add_u32 s36, s36, s3
-; GFX9-NEXT: s_addc_u32 s37, s37, 0
-; GFX9-NEXT: s_mov_b64 s[6:7], s[0:1]
-; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37]
-; GFX9-NEXT: s_getpc_b64 s[4:5]
-; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_i64@rel32@lo+4
-; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_i64@rel32@hi+12
-; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39]
-; GFX9-NEXT: v_mov_b32_e32 v0, 0x7b
-; GFX9-NEXT: v_mov_b32_e32 v1, 0
-; GFX9-NEXT: s_mov_b32 s32, 0
-; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5]
-; GFX9-NEXT: s_endpgm
+; SDAG-LABEL: test_call_external_void_func_i64_imm:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0
+; SDAG-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1
+; SDAG-NEXT: s_mov_b32 s38, -1
+; SDAG-NEXT: s_mov_b32 s39, 0xe00000
+; SDAG-NEXT: s_add_u32 s36, s36, s3
+; SDAG-NEXT: s_addc_u32 s37, s37, 0
+; SDAG-NEXT: s_mov_b64 s[6:7], s[0:1]
+; SDAG-NEXT: s_mov_b64 s[0:1], s[36:37]
+; SDAG-NEXT: s_getpc_b64 s[4:5]
+; SDAG-NEXT: s_add_u32 s4, s4, external_void_func_i64@rel32@lo+4
+; SDAG-NEXT: s_addc_u32 s5, s5, external_void_func_i64@rel32@hi+12
+; SDAG-NEXT: s_mov_b64 s[2:3], s[38:39]
+; SDAG-NEXT: v_mov_b32_e32 v0, 0x7b
+; SDAG-NEXT: v_mov_b32_e32 v1, 0
+; SDAG-NEXT: s_mov_b32 s32, 0
+; SDAG-NEXT: s_swappc_b64 s[30:31], s[4:5]
+; SDAG-NEXT: s_endpgm
;
; GFX11-LABEL: test_call_external_void_func_i64_imm:
; GFX11: ; %bb.0:
@@ -1159,6 +1370,26 @@ define amdgpu_kernel void @test_call_external_void_func_i64_imm() #0 {
; HSA-NEXT: s_mov_b32 s32, 0
; HSA-NEXT: s_swappc_b64 s[30:31], s[8:9]
; HSA-NEXT: s_endpgm
+;
+; GISEL-LABEL: test_call_external_void_func_i64_imm:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0
+; GISEL-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1
+; GISEL-NEXT: s_mov_b32 s38, -1
+; GISEL-NEXT: s_mov_b32 s39, 0xe00000
+; GISEL-NEXT: s_add_u32 s36, s36, s3
+; GISEL-NEXT: s_addc_u32 s37, s37, 0
+; GISEL-NEXT: s_mov_b64 s[6:7], s[0:1]
+; GISEL-NEXT: s_mov_b64 s[0:1], s[36:37]
+; GISEL-NEXT: s_getpc_b64 s[4:5]
+; GISEL-NEXT: s_add_u32 s4, s4, external_void_func_i64@rel32@lo+4
+; GISEL-NEXT: s_addc_u32 s5, s5, external_void_func_i64@rel32@hi+12
+; GISEL-NEXT: v_mov_b32_e32 v0, 0x7b
+; GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GISEL-NEXT: s_mov_b64 s[2:3], s[38:39]
+; GISEL-NEXT: s_mov_b32 s32, 0
+; GISEL-NEXT: s_swappc_b64 s[30:31], s[4:5]
+; GISEL-NEXT: s_endpgm
call void @external_void_func_i64(i64 123)
ret void
}
@@ -1208,27 +1439,27 @@ define amdgpu_kernel void @test_call_external_void_func_v2i64() #0 {
; CI-NEXT: s_swappc_b64 s[30:31], s[4:5]
; CI-NEXT: s_endpgm
;
-; GFX9-LABEL: test_call_external_void_func_v2i64:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0
-; GFX9-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1
-; GFX9-NEXT: s_mov_b32 s38, -1
-; GFX9-NEXT: s_mov_b32 s39, 0xe00000
-; GFX9-NEXT: s_add_u32 s36, s36, s3
-; GFX9-NEXT: s_mov_b64 s[6:7], s[0:1]
-; GFX9-NEXT: s_mov_b64 s[0:1], 0
-; GFX9-NEXT: s_mov_b32 s3, 0xf000
-; GFX9-NEXT: s_mov_b32 s2, -1
-; GFX9-NEXT: buffer_load_dwordx4 v[0:3], off, s[0:3], 0
-; GFX9-NEXT: s_addc_u32 s37, s37, 0
-; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37]
-; GFX9-NEXT: s_getpc_b64 s[4:5]
-; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_v2i64@rel32@lo+4
-; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_v2i64@rel32@hi+12
-; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39]
-; GFX9-NEXT: s_mov_b32 s32, 0
-; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5]
-; GFX9-NEXT: s_endpgm
+; SDAG-LABEL: test_call_external_void_func_v2i64:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0
+; SDAG-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1
+; SDAG-NEXT: s_mov_b32 s38, -1
+; SDAG-NEXT: s_mov_b32 s39, 0xe00000
+; SDAG-NEXT: s_add_u32 s36, s36, s3
+; SDAG-NEXT: s_mov_b64 s[6:7], s[0:1]
+; SDAG-NEXT: s_mov_b64 s[0:1], 0
+; SDAG-NEXT: s_mov_b32 s3, 0xf000
+; SDAG-NEXT: s_mov_b32 s2, -1
+; SDAG-NEXT: buffer_load_dwordx4 v[0:3], off, s[0:3], 0
+; SDAG-NEXT: s_addc_u32 s37, s37, 0
+; SDAG-NEXT: s_mov_b64 s[0:1], s[36:37]
+; SDAG-NEXT: s_getpc_b64 s[4:5]
+; SDAG-NEXT: s_add_u32 s4, s4, external_void_func_v2i64@rel32@lo+4
+; SDAG-NEXT: s_addc_u32 s5, s5, external_void_func_v2i64@rel32@hi+12
+; SDAG-NEXT: s_mov_b64 s[2:3], s[38:39]
+; SDAG-NEXT: s_mov_b32 s32, 0
+; SDAG-NEXT: s_swappc_b64 s[30:31], s[4:5]
+; SDAG-NEXT: s_endpgm
;
; GFX11-LABEL: test_call_external_void_func_v2i64:
; GFX11: ; %bb.0:
@@ -1262,6 +1493,31 @@ define amdgpu_kernel void @test_call_external_void_func_v2i64() #0 {
; HSA-NEXT: s_mov_b32 s32, 0
; HSA-NEXT: s_swappc_b64 s[30:31], s[8:9]
; HSA-NEXT: s_endpgm
+;
+; GISEL-LABEL: test_call_external_void_func_v2i64:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0
+; GISEL-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1
+; GISEL-NEXT: s_mov_b32 s38, -1
+; GISEL-NEXT: s_mov_b32 s39, 0xe00000
+; GISEL-NEXT: s_mov_b64 s[6:7], s[0:1]
+; GISEL-NEXT: s_mov_b64 s[0:1], 0
+; GISEL-NEXT: s_add_u32 s36, s36, s3
+; GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GISEL-NEXT: s_addc_u32 s37, s37, 0
+; GISEL-NEXT: s_getpc_b64 s[4:5]
+; GISEL-NEXT: s_add_u32 s4, s4, external_void_func_v2i64@rel32@lo+4
+; GISEL-NEXT: s_addc_u32 s5, s5, external_void_func_v2i64@rel32@hi+12
+; GISEL-NEXT: s_mov_b32 s32, 0
+; GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GISEL-NEXT: v_mov_b32_e32 v0, s0
+; GISEL-NEXT: v_mov_b32_e32 v1, s1
+; GISEL-NEXT: v_mov_b32_e32 v2, s2
+; GISEL-NEXT: v_mov_b32_e32 v3, s3
+; GISEL-NEXT: s_mov_b64 s[0:1], s[36:37]
+; GISEL-NEXT: s_mov_b64 s[2:3], s[38:39]
+; GISEL-NEXT: s_swappc_b64 s[30:31], s[4:5]
+; GISEL-NEXT: s_endpgm
%val = load <2 x i64>, ptr addrspace(1) null
call void @external_void_func_v2i64(<2 x i64> %val)
ret void
@@ -1312,27 +1568,27 @@ define amdgpu_kernel void @test_call_external_void_func_v2i64_imm() #0 {
; CI-NEXT: s_swappc_b64 s[30:31], s[4:5]
; CI-NEXT: s_endpgm
;
-; GFX9-LABEL: test_call_external_void_func_v2i64_imm:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0
-; GFX9-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1
-; GFX9-NEXT: s_mov_b32 s38, -1
-; GFX9-NEXT: s_mov_b32 s39, 0xe00000
-; GFX9-NEXT: s_add_u32 s36, s36, s3
-; GFX9-NEXT: s_addc_u32 s37, s37, 0
-; GFX9-NEXT: s_mov_b64 s[6:7], s[0:1]
-; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37]
-; GFX9-NEXT: s_getpc_b64 s[4:5]
-; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_v2i64@rel32@lo+4
-; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_v2i64@rel32@hi+12
-; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39]
-; GFX9-NEXT: v_mov_b32_e32 v0, 1
-; GFX9-NEXT: v_mov_b32_e32 v1, 2
-; GFX9-NEXT: v_mov_b32_e32 v2, 3
-; GFX9-NEXT: v_mov_b32_e32 v3, 4
-; GFX9-NEXT: s_mov_b32 s32, 0
-; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5]
-; GFX9-NEXT: s_endpgm
+; SDAG-LABEL: test_call_external_void_func_v2i64_imm:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0
+; SDAG-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1
+; SDAG-NEXT: s_mov_b32 s38, -1
+; SDAG-NEXT: s_mov_b32 s39, 0xe00000
+; SDAG-NEXT: s_add_u32 s36, s36, s3
+; SDAG-NEXT: s_addc_u32 s37, s37, 0
+; SDAG-NEXT: s_mov_b64 s[6:7], s[0:1]
+; SDAG-NEXT: s_mov_b64 s[0:1], s[36:37]
+; SDAG-NEXT: s_getpc_b64 s[4:5]
+; SDAG-NEXT: s_add_u32 s4, s4, external_void_func_v2i64@rel32@lo+4
+; SDAG-NEXT: s_addc_u32 s5, s5, external_void_func_v2i64@rel32@hi+12
+; SDAG-NEXT: s_mov_b64 s[2:3], s[38:39]
+; SDAG-NEXT: v_mov_b32_e32 v0, 1
+; SDAG-NEXT: v_mov_b32_e32 v1, 2
+; SDAG-NEXT: v_mov_b32_e32 v2, 3
+; SDAG-NEXT: v_mov_b32_e32 v3, 4
+; SDAG-NEXT: s_mov_b32 s32, 0
+; SDAG-NEXT: s_swappc_b64 s[30:31], s[4:5]
+; SDAG-NEXT: s_endpgm
;
; GFX11-LABEL: test_call_external_void_func_v2i64_imm:
; GFX11: ; %bb.0:
@@ -1364,6 +1620,28 @@ define amdgpu_kernel void @test_call_external_void_func_v2i64_imm() #0 {
; HSA-NEXT: s_mov_b32 s32, 0
; HSA-NEXT: s_swappc_b64 s[30:31], s[8:9]
; HSA-NEXT: s_endpgm
+;
+; GISEL-LABEL: test_call_external_void_func_v2i64_imm:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0
+; GISEL-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1
+; GISEL-NEXT: s_mov_b32 s38, -1
+; GISEL-NEXT: s_mov_b32 s39, 0xe00000
+; GISEL-NEXT: s_add_u32 s36, s36, s3
+; GISEL-NEXT: s_addc_u32 s37, s37, 0
+; GISEL-NEXT: s_mov_b64 s[6:7], s[0:1]
+; GISEL-NEXT: s_mov_b64 s[0:1], s[36:37]
+; GISEL-NEXT: s_getpc_b64 s[4:5]
+; GISEL-NEXT: s_add_u32 s4, s4, external_void_func_v2i64@rel32@lo+4
+; GISEL-NEXT: s_addc_u32 s5, s5, external_void_func_v2i64@rel32@hi+12
+; GISEL-NEXT: v_mov_b32_e32 v0, 1
+; GISEL-NEXT: v_mov_b32_e32 v1, 2
+; GISEL-NEXT: v_mov_b32_e32 v2, 3
+; GISEL-NEXT: v_mov_b32_e32 v3, 4
+; GISEL-NEXT: s_mov_b64 s[2:3], s[38:39]
+; GISEL-NEXT: s_mov_b32 s32, 0
+; GISEL-NEXT: s_swappc_b64 s[30:31], s[4:5]
+; GISEL-NEXT: s_endpgm
call void @external_void_func_v2i64(<2 x i64> <i64 8589934593, i64 17179869187>)
ret void
}
@@ -1417,29 +1695,29 @@ define amdgpu_kernel void @test_call_external_void_func_v3i64() #0 {
; CI-NEXT: s_swappc_b64 s[30:31], s[4:5]
; CI-NEXT: s_endpgm
;
-; GFX9-LABEL: test_call_external_void_func_v3i64:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0
-; GFX9-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1
-; GFX9-NEXT: s_mov_b32 s38, -1
-; GFX9-NEXT: s_mov_b32 s39, 0xe00000
-; GFX9-NEXT: s_add_u32 s36, s36, s3
-; GFX9-NEXT: s_mov_b64 s[6:7], s[0:1]
-; GFX9-NEXT: s_mov_b64 s[0:1], 0
-; GFX9-NEXT: s_mov_b32 s3, 0xf000
-; GFX9-NEXT: s_mov_b32 s2, -1
-; GFX9-NEXT: buffer_load_dwordx4 v[0:3], off, s[0:3], 0
-; GFX9-NEXT: s_addc_u32 s37, s37, 0
-; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37]
-; GFX9-NEXT: s_getpc_b64 s[4:5]
-; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_v3i64@rel32@lo+4
-; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_v3i64@rel32@hi+12
-; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39]
-; GFX9-NEXT: v_mov_b32_e32 v4, 1
-; GFX9-NEXT: v_mov_b32_e32 v5, 2
-; GFX9-NEXT: s_mov_b32 s32, 0
-; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5]
-; GFX9-NEXT: s_endpgm
+; SDAG-LABEL: test_call_external_void_func_v3i64:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0
+; SDAG-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1
+; SDAG-NEXT: s_mov_b32 s38, -1
+; SDAG-NEXT: s_mov_b32 s39, 0xe00000
+; SDAG-NEXT: s_add_u32 s36, s36, s3
+; SDAG-NEXT: s_mov_b64 s[6:7], s[0:1]
+; SDAG-NEXT: s_mov_b64 s[0:1], 0
+; SDAG-NEXT: s_mov_b32 s3, 0xf000
+; SDAG-NEXT: s_mov_b32 s2, -1
+; SDAG-NEXT: buffer_load_dwordx4 v[0:3], off, s[0:3], 0
+; SDAG-NEXT: s_addc_u32 s37, s37, 0
+; SDAG-NEXT: s_mov_b64 s[0:1], s[36:37]
+; SDAG-NEXT: s_getpc_b64 s[4:5]
+; SDAG-NEXT: s_add_u32 s4, s4, external_void_func_v3i64@rel32@lo+4
+; SDAG-NEXT: s_addc_u32 s5, s5, external_void_func_v3i64@rel32@hi+12
+; SDAG-NEXT: s_mov_b64 s[2:3], s[38:39]
+; SDAG-NEXT: v_mov_b32_e32 v4, 1
+; SDAG-NEXT: v_mov_b32_e32 v5, 2
+; SDAG-NEXT: s_mov_b32 s32, 0
+; SDAG-NEXT: s_swappc_b64 s[30:31], s[4:5]
+; SDAG-NEXT: s_endpgm
;
; GFX11-LABEL: test_call_external_void_func_v3i64:
; GFX11: ; %bb.0:
@@ -1476,6 +1754,33 @@ define amdgpu_kernel void @test_call_external_void_func_v3i64() #0 {
; HSA-NEXT: s_mov_b32 s32, 0
; HSA-NEXT: s_swappc_b64 s[30:31], s[8:9]
; HSA-NEXT: s_endpgm
+;
+; GISEL-LABEL: test_call_external_void_func_v3i64:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0
+; GISEL-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1
+; GISEL-NEXT: s_mov_b32 s38, -1
+; GISEL-NEXT: s_mov_b32 s39, 0xe00000
+; GISEL-NEXT: s_mov_b64 s[6:7], s[0:1]
+; GISEL-NEXT: s_mov_b64 s[0:1], 0
+; GISEL-NEXT: s_add_u32 s36, s36, s3
+; GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GISEL-NEXT: s_addc_u32 s37, s37, 0
+; GISEL-NEXT: s_getpc_b64 s[4:5]
+; GISEL-NEXT: s_add_u32 s4, s4, external_void_func_v3i64@rel32@lo+4
+; GISEL-NEXT: s_addc_u32 s5, s5, external_void_func_v3i64@rel32@hi+12
+; GISEL-NEXT: v_mov_b32_e32 v4, 1
+; GISEL-NEXT: v_mov_b32_e32 v5, 2
+; GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GISEL-NEXT: v_mov_b32_e32 v0, s0
+; GISEL-NEXT: v_mov_b32_e32 v1, s1
+; GISEL-NEXT: v_mov_b32_e32 v2, s2
+; GISEL-NEXT: v_mov_b32_e32 v3, s3
+; GISEL-NEXT: s_mov_b64 s[0:1], s[36:37]
+; GISEL-NEXT: s_mov_b64 s[2:3], s[38:39]
+; GISEL-NEXT: s_mov_b32 s32, 0
+; GISEL-NEXT: s_swappc_b64 s[30:31], s[4:5]
+; GISEL-NEXT: s_endpgm
%load = load <2 x i64>, ptr addrspace(1) null
  %val = shufflevector <2 x i64> %load, <2 x i64> <i64 8589934593, i64 poison>, <3 x i32> <i32 0, i32 1, i32 2>
  call void @external_void_func_v3i64(<3 x i64> %val)
@@ -1536,31 +1841,31 @@ define amdgpu_kernel void @test_call_external_void_func_v4i64() #0 {
; CI-NEXT: s_swappc_b64 s[30:31], s[4:5]
; CI-NEXT: s_endpgm
;
-; GFX9-LABEL: test_call_external_void_func_v4i64:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0
-; GFX9-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1
-; GFX9-NEXT: s_mov_b32 s38, -1
-; GFX9-NEXT: s_mov_b32 s39, 0xe00000
-; GFX9-NEXT: s_add_u32 s36, s36, s3
-; GFX9-NEXT: s_mov_b64 s[6:7], s[0:1]
-; GFX9-NEXT: s_mov_b64 s[0:1], 0
-; GFX9-NEXT: s_mov_b32 s3, 0xf000
-; GFX9-NEXT: s_mov_b32 s2, -1
-; GFX9-NEXT: buffer_load_dwordx4 v[0:3], off, s[0:3], 0
-; GFX9-NEXT: s_addc_u32 s37, s37, 0
-; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37]
-; GFX9-NEXT: s_getpc_b64 s[4:5]
-; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_v4i64@rel32@lo+4
-; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_v4i64@rel32@hi+12
-; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39]
-; GFX9-NEXT: v_mov_b32_e32 v4, 1
-; GFX9-NEXT: v_mov_b32_e32 v5, 2
-; GFX9-NEXT: v_mov_b32_e32 v6, 3
-; GFX9-NEXT: v_mov_b32_e32 v7, 4
-; GFX9-NEXT: s_mov_b32 s32, 0
-; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5]
-; GFX9-NEXT: s_endpgm
+; SDAG-LABEL: test_call_external_void_func_v4i64:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0
+; SDAG-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1
+; SDAG-NEXT: s_mov_b32 s38, -1
+; SDAG-NEXT: s_mov_b32 s39, 0xe00000
+; SDAG-NEXT: s_add_u32 s36, s36, s3
+; SDAG-NEXT: s_mov_b64 s[6:7], s[0:1]
+; SDAG-NEXT: s_mov_b64 s[0:1], 0
+; SDAG-NEXT: s_mov_b32 s3, 0xf000
+; SDAG-NEXT: s_mov_b32 s2, -1
+; SDAG-NEXT: buffer_load_dwordx4 v[0:3], off, s[0:3], 0
+; SDAG-NEXT: s_addc_u32 s37, s37, 0
+; SDAG-NEXT: s_mov_b64 s[0:1], s[36:37]
+; SDAG-NEXT: s_getpc_b64 s[4:5]
+; SDAG-NEXT: s_add_u32 s4, s4, external_void_func_v4i64@rel32@lo+4
+; SDAG-NEXT: s_addc_u32 s5, s5, external_void_func_v4i64@rel32@hi+12
+; SDAG-NEXT: s_mov_b64 s[2:3], s[38:39]
+; SDAG-NEXT: v_mov_b32_e32 v4, 1
+; SDAG-NEXT: v_mov_b32_e32 v5, 2
+; SDAG-NEXT: v_mov_b32_e32 v6, 3
+; SDAG-NEXT: v_mov_b32_e32 v7, 4
+; SDAG-NEXT: s_mov_b32 s32, 0
+; SDAG-NEXT: s_swappc_b64 s[30:31], s[4:5]
+; SDAG-NEXT: s_endpgm
;
; GFX11-LABEL: test_call_external_void_func_v4i64:
; GFX11: ; %bb.0:
@@ -1600,6 +1905,35 @@ define amdgpu_kernel void @test_call_external_void_func_v4i64() #0 {
; HSA-NEXT: s_mov_b32 s32, 0
; HSA-NEXT: s_swappc_b64 s[30:31], s[8:9]
; HSA-NEXT: s_endpgm
+;
+; GISEL-LABEL: test_call_external_void_func_v4i64:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0
+; GISEL-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1
+; GISEL-NEXT: s_mov_b32 s38, -1
+; GISEL-NEXT: s_mov_b32 s39, 0xe00000
+; GISEL-NEXT: s_mov_b64 s[6:7], s[0:1]
+; GISEL-NEXT: s_mov_b64 s[0:1], 0
+; GISEL-NEXT: s_add_u32 s36, s36, s3
+; GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GISEL-NEXT: s_addc_u32 s37, s37, 0
+; GISEL-NEXT: s_getpc_b64 s[4:5]
+; GISEL-NEXT: s_add_u32 s4, s4, external_void_func_v4i64@rel32@lo+4
+; GISEL-NEXT: s_addc_u32 s5, s5, external_void_func_v4i64@rel32@hi+12
+; GISEL-NEXT: v_mov_b32_e32 v4, 1
+; GISEL-NEXT: v_mov_b32_e32 v5, 2
+; GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GISEL-NEXT: v_mov_b32_e32 v0, s0
+; GISEL-NEXT: v_mov_b32_e32 v1, s1
+; GISEL-NEXT: v_mov_b32_e32 v2, s2
+; GISEL-NEXT: v_mov_b32_e32 v3, s3
+; GISEL-NEXT: s_mov_b64 s[0:1], s[36:37]
+; GISEL-NEXT: v_mov_b32_e32 v6, 3
+; GISEL-NEXT: v_mov_b32_e32 v7, 4
+; GISEL-NEXT: s_mov_b64 s[2:3], s[38:39]
+; GISEL-NEXT: s_mov_b32 s32, 0
+; GISEL-NEXT: s_swappc_b64 s[30:31], s[4:5]
+; GISEL-NEXT: s_endpgm
%load = load <2 x i64>, ptr addrspace(1) null
%val = shufflevector <2 x i64> %load, <2 x i64> <i64 8589934593, i64 17179869187>, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
call void @external_void_func_v4i64(<4 x i64> %val)
@@ -1645,24 +1979,24 @@ define amdgpu_kernel void @test_call_external_void_func_f16_imm() #0 {
; CI-NEXT: s_swappc_b64 s[30:31], s[4:5]
; CI-NEXT: s_endpgm
;
-; GFX9-LABEL: test_call_external_void_func_f16_imm:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0
-; GFX9-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1
-; GFX9-NEXT: s_mov_b32 s38, -1
-; GFX9-NEXT: s_mov_b32 s39, 0xe00000
-; GFX9-NEXT: s_add_u32 s36, s36, s3
-; GFX9-NEXT: s_addc_u32 s37, s37, 0
-; GFX9-NEXT: s_mov_b64 s[6:7], s[0:1]
-; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37]
-; GFX9-NEXT: s_getpc_b64 s[4:5]
-; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_f16@rel32@lo+4
-; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_f16@rel32@hi+12
-; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39]
-; GFX9-NEXT: v_mov_b32_e32 v0, 0x4400
-; GFX9-NEXT: s_mov_b32 s32, 0
-; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5]
-; GFX9-NEXT: s_endpgm
+; SDAG-LABEL: test_call_external_void_func_f16_imm:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0
+; SDAG-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1
+; SDAG-NEXT: s_mov_b32 s38, -1
+; SDAG-NEXT: s_mov_b32 s39, 0xe00000
+; SDAG-NEXT: s_add_u32 s36, s36, s3
+; SDAG-NEXT: s_addc_u32 s37, s37, 0
+; SDAG-NEXT: s_mov_b64 s[6:7], s[0:1]
+; SDAG-NEXT: s_mov_b64 s[0:1], s[36:37]
+; SDAG-NEXT: s_getpc_b64 s[4:5]
+; SDAG-NEXT: s_add_u32 s4, s4, external_void_func_f16@rel32@lo+4
+; SDAG-NEXT: s_addc_u32 s5, s5, external_void_func_f16@rel32@hi+12
+; SDAG-NEXT: s_mov_b64 s[2:3], s[38:39]
+; SDAG-NEXT: v_mov_b32_e32 v0, 0x4400
+; SDAG-NEXT: s_mov_b32 s32, 0
+; SDAG-NEXT: s_swappc_b64 s[30:31], s[4:5]
+; SDAG-NEXT: s_endpgm
;
; GFX11-TRUE16-LABEL: test_call_external_void_func_f16_imm:
; GFX11-TRUE16: ; %bb.0:
@@ -1701,6 +2035,25 @@ define amdgpu_kernel void @test_call_external_void_func_f16_imm() #0 {
; HSA-NEXT: s_mov_b32 s32, 0
; HSA-NEXT: s_swappc_b64 s[30:31], s[8:9]
; HSA-NEXT: s_endpgm
+;
+; GISEL-LABEL: test_call_external_void_func_f16_imm:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0
+; GISEL-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1
+; GISEL-NEXT: s_mov_b32 s38, -1
+; GISEL-NEXT: s_mov_b32 s39, 0xe00000
+; GISEL-NEXT: s_add_u32 s36, s36, s3
+; GISEL-NEXT: s_addc_u32 s37, s37, 0
+; GISEL-NEXT: s_mov_b64 s[6:7], s[0:1]
+; GISEL-NEXT: s_mov_b64 s[0:1], s[36:37]
+; GISEL-NEXT: s_getpc_b64 s[4:5]
+; GISEL-NEXT: s_add_u32 s4, s4, external_void_func_f16@rel32@lo+4
+; GISEL-NEXT: s_addc_u32 s5, s5, external_void_func_f16@rel32@hi+12
+; GISEL-NEXT: v_mov_b32_e32 v0, 0x4400
+; GISEL-NEXT: s_mov_b64 s[2:3], s[38:39]
+; GISEL-NEXT: s_mov_b32 s32, 0
+; GISEL-NEXT: s_swappc_b64 s[30:31], s[4:5]
+; GISEL-NEXT: s_endpgm
call void @external_void_func_f16(half 4.0)
ret void
}
@@ -1744,24 +2097,24 @@ define amdgpu_kernel void @test_call_external_void_func_f32_imm() #0 {
; CI-NEXT: s_swappc_b64 s[30:31], s[4:5]
; CI-NEXT: s_endpgm
;
-; GFX9-LABEL: test_call_external_void_func_f32_imm:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0
-; GFX9-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1
-; GFX9-NEXT: s_mov_b32 s38, -1
-; GFX9-NEXT: s_mov_b32 s39, 0xe00000
-; GFX9-NEXT: s_add_u32 s36, s36, s3
-; GFX9-NEXT: s_addc_u32 s37, s37, 0
-; GFX9-NEXT: s_mov_b64 s[6:7], s[0:1]
-; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37]
-; GFX9-NEXT: s_getpc_b64 s[4:5]
-; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_f32@rel32@lo+4
-; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_f32@rel32@hi+12
-; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39]
-; GFX9-NEXT: v_mov_b32_e32 v0, 4.0
-; GFX9-NEXT: s_mov_b32 s32, 0
-; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5]
-; GFX9-NEXT: s_endpgm
+; SDAG-LABEL: test_call_external_void_func_f32_imm:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0
+; SDAG-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1
+; SDAG-NEXT: s_mov_b32 s38, -1
+; SDAG-NEXT: s_mov_b32 s39, 0xe00000
+; SDAG-NEXT: s_add_u32 s36, s36, s3
+; SDAG-NEXT: s_addc_u32 s37, s37, 0
+; SDAG-NEXT: s_mov_b64 s[6:7], s[0:1]
+; SDAG-NEXT: s_mov_b64 s[0:1], s[36:37]
+; SDAG-NEXT: s_getpc_b64 s[4:5]
+; SDAG-NEXT: s_add_u32 s4, s4, external_void_func_f32@rel32@lo+4
+; SDAG-NEXT: s_addc_u32 s5, s5, external_void_func_f32@rel32@hi+12
+; SDAG-NEXT: s_mov_b64 s[2:3], s[38:39]
+; SDAG-NEXT: v_mov_b32_e32 v0, 4.0
+; SDAG-NEXT: s_mov_b32 s32, 0
+; SDAG-NEXT: s_swappc_b64 s[30:31], s[4:5]
+; SDAG-NEXT: s_endpgm
;
; GFX11-LABEL: test_call_external_void_func_f32_imm:
; GFX11: ; %bb.0:
@@ -1789,6 +2142,25 @@ define amdgpu_kernel void @test_call_external_void_func_f32_imm() #0 {
; HSA-NEXT: s_mov_b32 s32, 0
; HSA-NEXT: s_swappc_b64 s[30:31], s[8:9]
; HSA-NEXT: s_endpgm
+;
+; GISEL-LABEL: test_call_external_void_func_f32_imm:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0
+; GISEL-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1
+; GISEL-NEXT: s_mov_b32 s38, -1
+; GISEL-NEXT: s_mov_b32 s39, 0xe00000
+; GISEL-NEXT: s_add_u32 s36, s36, s3
+; GISEL-NEXT: s_addc_u32 s37, s37, 0
+; GISEL-NEXT: s_mov_b64 s[6:7], s[0:1]
+; GISEL-NEXT: s_mov_b64 s[0:1], s[36:37]
+; GISEL-NEXT: s_getpc_b64 s[4:5]
+; GISEL-NEXT: s_add_u32 s4, s4, external_void_func_f32@rel32@lo+4
+; GISEL-NEXT: s_addc_u32 s5, s5, external_void_func_f32@rel32@hi+12
+; GISEL-NEXT: v_mov_b32_e32 v0, 4.0
+; GISEL-NEXT: s_mov_b64 s[2:3], s[38:39]
+; GISEL-NEXT: s_mov_b32 s32, 0
+; GISEL-NEXT: s_swappc_b64 s[30:31], s[4:5]
+; GISEL-NEXT: s_endpgm
call void @external_void_func_f32(float 4.0)
ret void
}
@@ -1834,25 +2206,25 @@ define amdgpu_kernel void @test_call_external_void_func_v2f32_imm() #0 {
; CI-NEXT: s_swappc_b64 s[30:31], s[4:5]
; CI-NEXT: s_endpgm
;
-; GFX9-LABEL: test_call_external_void_func_v2f32_imm:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0
-; GFX9-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1
-; GFX9-NEXT: s_mov_b32 s38, -1
-; GFX9-NEXT: s_mov_b32 s39, 0xe00000
-; GFX9-NEXT: s_add_u32 s36, s36, s3
-; GFX9-NEXT: s_addc_u32 s37, s37, 0
-; GFX9-NEXT: s_mov_b64 s[6:7], s[0:1]
-; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37]
-; GFX9-NEXT: s_getpc_b64 s[4:5]
-; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_v2f32@rel32@lo+4
-; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_v2f32@rel32@hi+12
-; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39]
-; GFX9-NEXT: v_mov_b32_e32 v0, 1.0
-; GFX9-NEXT: v_mov_b32_e32 v1, 2.0
-; GFX9-NEXT: s_mov_b32 s32, 0
-; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5]
-; GFX9-NEXT: s_endpgm
+; SDAG-LABEL: test_call_external_void_func_v2f32_imm:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0
+; SDAG-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1
+; SDAG-NEXT: s_mov_b32 s38, -1
+; SDAG-NEXT: s_mov_b32 s39, 0xe00000
+; SDAG-NEXT: s_add_u32 s36, s36, s3
+; SDAG-NEXT: s_addc_u32 s37, s37, 0
+; SDAG-NEXT: s_mov_b64 s[6:7], s[0:1]
+; SDAG-NEXT: s_mov_b64 s[0:1], s[36:37]
+; SDAG-NEXT: s_getpc_b64 s[4:5]
+; SDAG-NEXT: s_add_u32 s4, s4, external_void_func_v2f32@rel32@lo+4
+; SDAG-NEXT: s_addc_u32 s5, s5, external_void_func_v2f32@rel32@hi+12
+; SDAG-NEXT: s_mov_b64 s[2:3], s[38:39]
+; SDAG-NEXT: v_mov_b32_e32 v0, 1.0
+; SDAG-NEXT: v_mov_b32_e32 v1, 2.0
+; SDAG-NEXT: s_mov_b32 s32, 0
+; SDAG-NEXT: s_swappc_b64 s[30:31], s[4:5]
+; SDAG-NEXT: s_endpgm
;
; GFX11-LABEL: test_call_external_void_func_v2f32_imm:
; GFX11: ; %bb.0:
@@ -1881,6 +2253,26 @@ define amdgpu_kernel void @test_call_external_void_func_v2f32_imm() #0 {
; HSA-NEXT: s_mov_b32 s32, 0
; HSA-NEXT: s_swappc_b64 s[30:31], s[8:9]
; HSA-NEXT: s_endpgm
+;
+; GISEL-LABEL: test_call_external_void_func_v2f32_imm:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0
+; GISEL-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1
+; GISEL-NEXT: s_mov_b32 s38, -1
+; GISEL-NEXT: s_mov_b32 s39, 0xe00000
+; GISEL-NEXT: s_add_u32 s36, s36, s3
+; GISEL-NEXT: s_addc_u32 s37, s37, 0
+; GISEL-NEXT: s_mov_b64 s[6:7], s[0:1]
+; GISEL-NEXT: s_mov_b64 s[0:1], s[36:37]
+; GISEL-NEXT: s_getpc_b64 s[4:5]
+; GISEL-NEXT: s_add_u32 s4, s4, external_void_func_v2f32@rel32@lo+4
+; GISEL-NEXT: s_addc_u32 s5, s5, external_void_func_v2f32@rel32@hi+12
+; GISEL-NEXT: v_mov_b32_e32 v0, 1.0
+; GISEL-NEXT: v_mov_b32_e32 v1, 2.0
+; GISEL-NEXT: s_mov_b64 s[2:3], s[38:39]
+; GISEL-NEXT: s_mov_b32 s32, 0
+; GISEL-NEXT: s_swappc_b64 s[30:31], s[4:5]
+; GISEL-NEXT: s_endpgm
call void @external_void_func_v2f32(<2 x float> <float 1.0, float 2.0>)
ret void
}
@@ -1928,26 +2320,26 @@ define amdgpu_kernel void @test_call_external_void_func_v3f32_imm() #0 {
; CI-NEXT: s_swappc_b64 s[30:31], s[4:5]
; CI-NEXT: s_endpgm
;
-; GFX9-LABEL: test_call_external_void_func_v3f32_imm:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0
-; GFX9-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1
-; GFX9-NEXT: s_mov_b32 s38, -1
-; GFX9-NEXT: s_mov_b32 s39, 0xe00000
-; GFX9-NEXT: s_add_u32 s36, s36, s3
-; GFX9-NEXT: s_addc_u32 s37, s37, 0
-; GFX9-NEXT: s_mov_b64 s[6:7], s[0:1]
-; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37]
-; GFX9-NEXT: s_getpc_b64 s[4:5]
-; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_v3f32@rel32@lo+4
-; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_v3f32@rel32@hi+12
-; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39]
-; GFX9-NEXT: v_mov_b32_e32 v0, 1.0
-; GFX9-NEXT: v_mov_b32_e32 v1, 2.0
-; GFX9-NEXT: v_mov_b32_e32 v2, 4.0
-; GFX9-NEXT: s_mov_b32 s32, 0
-; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5]
-; GFX9-NEXT: s_endpgm
+; SDAG-LABEL: test_call_external_void_func_v3f32_imm:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0
+; SDAG-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1
+; SDAG-NEXT: s_mov_b32 s38, -1
+; SDAG-NEXT: s_mov_b32 s39, 0xe00000
+; SDAG-NEXT: s_add_u32 s36, s36, s3
+; SDAG-NEXT: s_addc_u32 s37, s37, 0
+; SDAG-NEXT: s_mov_b64 s[6:7], s[0:1]
+; SDAG-NEXT: s_mov_b64 s[0:1], s[36:37]
+; SDAG-NEXT: s_getpc_b64 s[4:5]
+; SDAG-NEXT: s_add_u32 s4, s4, external_void_func_v3f32@rel32@lo+4
+; SDAG-NEXT: s_addc_u32 s5, s5, external_void_func_v3f32@rel32@hi+12
+; SDAG-NEXT: s_mov_b64 s[2:3], s[38:39]
+; SDAG-NEXT: v_mov_b32_e32 v0, 1.0
+; SDAG-NEXT: v_mov_b32_e32 v1, 2.0
+; SDAG-NEXT: v_mov_b32_e32 v2, 4.0
+; SDAG-NEXT: s_mov_b32 s32, 0
+; SDAG-NEXT: s_swappc_b64 s[30:31], s[4:5]
+; SDAG-NEXT: s_endpgm
;
; GFX11-LABEL: test_call_external_void_func_v3f32_imm:
; GFX11: ; %bb.0:
@@ -1978,6 +2370,27 @@ define amdgpu_kernel void @test_call_external_void_func_v3f32_imm() #0 {
; HSA-NEXT: s_mov_b32 s32, 0
; HSA-NEXT: s_swappc_b64 s[30:31], s[8:9]
; HSA-NEXT: s_endpgm
+;
+; GISEL-LABEL: test_call_external_void_func_v3f32_imm:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0
+; GISEL-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1
+; GISEL-NEXT: s_mov_b32 s38, -1
+; GISEL-NEXT: s_mov_b32 s39, 0xe00000
+; GISEL-NEXT: s_add_u32 s36, s36, s3
+; GISEL-NEXT: s_addc_u32 s37, s37, 0
+; GISEL-NEXT: s_mov_b64 s[6:7], s[0:1]
+; GISEL-NEXT: s_mov_b64 s[0:1], s[36:37]
+; GISEL-NEXT: s_getpc_b64 s[4:5]
+; GISEL-NEXT: s_add_u32 s4, s4, external_void_func_v3f32@rel32@lo+4
+; GISEL-NEXT: s_addc_u32 s5, s5, external_void_func_v3f32@rel32@hi+12
+; GISEL-NEXT: v_mov_b32_e32 v0, 1.0
+; GISEL-NEXT: v_mov_b32_e32 v1, 2.0
+; GISEL-NEXT: v_mov_b32_e32 v2, 4.0
+; GISEL-NEXT: s_mov_b64 s[2:3], s[38:39]
+; GISEL-NEXT: s_mov_b32 s32, 0
+; GISEL-NEXT: s_swappc_b64 s[30:31], s[4:5]
+; GISEL-NEXT: s_endpgm
call void @external_void_func_v3f32(<3 x float> <float 1.0, float 2.0, float 4.0>)
ret void
}
@@ -2029,28 +2442,28 @@ define amdgpu_kernel void @test_call_external_void_func_v5f32_imm() #0 {
; CI-NEXT: s_swappc_b64 s[30:31], s[4:5]
; CI-NEXT: s_endpgm
;
-; GFX9-LABEL: test_call_external_void_func_v5f32_imm:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0
-; GFX9-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1
-; GFX9-NEXT: s_mov_b32 s38, -1
-; GFX9-NEXT: s_mov_b32 s39, 0xe00000
-; GFX9-NEXT: s_add_u32 s36, s36, s3
-; GFX9-NEXT: s_addc_u32 s37, s37, 0
-; GFX9-NEXT: s_mov_b64 s[6:7], s[0:1]
-; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37]
-; GFX9-NEXT: s_getpc_b64 s[4:5]
-; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_v5f32@rel32@lo+4
-; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_v5f32@rel32@hi+12
-; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39]
-; GFX9-NEXT: v_mov_b32_e32 v0, 1.0
-; GFX9-NEXT: v_mov_b32_e32 v1, 2.0
-; GFX9-NEXT: v_mov_b32_e32 v2, 4.0
-; GFX9-NEXT: v_mov_b32_e32 v3, -1.0
-; GFX9-NEXT: v_mov_b32_e32 v4, 0.5
-; GFX9-NEXT: s_mov_b32 s32, 0
-; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5]
-; GFX9-NEXT: s_endpgm
+; SDAG-LABEL: test_call_external_void_func_v5f32_imm:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0
+; SDAG-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1
+; SDAG-NEXT: s_mov_b32 s38, -1
+; SDAG-NEXT: s_mov_b32 s39, 0xe00000
+; SDAG-NEXT: s_add_u32 s36, s36, s3
+; SDAG-NEXT: s_addc_u32 s37, s37, 0
+; SDAG-NEXT: s_mov_b64 s[6:7], s[0:1]
+; SDAG-NEXT: s_mov_b64 s[0:1], s[36:37]
+; SDAG-NEXT: s_getpc_b64 s[4:5]
+; SDAG-NEXT: s_add_u32 s4, s4, external_void_func_v5f32@rel32@lo+4
+; SDAG-NEXT: s_addc_u32 s5, s5, external_void_func_v5f32@rel32@hi+12
+; SDAG-NEXT: s_mov_b64 s[2:3], s[38:39]
+; SDAG-NEXT: v_mov_b32_e32 v0, 1.0
+; SDAG-NEXT: v_mov_b32_e32 v1, 2.0
+; SDAG-NEXT: v_mov_b32_e32 v2, 4.0
+; SDAG-NEXT: v_mov_b32_e32 v3, -1.0
+; SDAG-NEXT: v_mov_b32_e32 v4, 0.5
+; SDAG-NEXT: s_mov_b32 s32, 0
+; SDAG-NEXT: s_swappc_b64 s[30:31], s[4:5]
+; SDAG-NEXT: s_endpgm
;
; GFX11-LABEL: test_call_external_void_func_v5f32_imm:
; GFX11: ; %bb.0:
@@ -2084,6 +2497,29 @@ define amdgpu_kernel void @test_call_external_void_func_v5f32_imm() #0 {
; HSA-NEXT: s_mov_b32 s32, 0
; HSA-NEXT: s_swappc_b64 s[30:31], s[8:9]
; HSA-NEXT: s_endpgm
+;
+; GISEL-LABEL: test_call_external_void_func_v5f32_imm:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0
+; GISEL-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1
+; GISEL-NEXT: s_mov_b32 s38, -1
+; GISEL-NEXT: s_mov_b32 s39, 0xe00000
+; GISEL-NEXT: s_add_u32 s36, s36, s3
+; GISEL-NEXT: s_addc_u32 s37, s37, 0
+; GISEL-NEXT: s_mov_b64 s[6:7], s[0:1]
+; GISEL-NEXT: s_mov_b64 s[0:1], s[36:37]
+; GISEL-NEXT: s_getpc_b64 s[4:5]
+; GISEL-NEXT: s_add_u32 s4, s4, external_void_func_v5f32@rel32@lo+4
+; GISEL-NEXT: s_addc_u32 s5, s5, external_void_func_v5f32@rel32@hi+12
+; GISEL-NEXT: v_mov_b32_e32 v0, 1.0
+; GISEL-NEXT: v_mov_b32_e32 v1, 2.0
+; GISEL-NEXT: v_mov_b32_e32 v2, 4.0
+; GISEL-NEXT: v_mov_b32_e32 v3, -1.0
+; GISEL-NEXT: v_mov_b32_e32 v4, 0.5
+; GISEL-NEXT: s_mov_b64 s[2:3], s[38:39]
+; GISEL-NEXT: s_mov_b32 s32, 0
+; GISEL-NEXT: s_swappc_b64 s[30:31], s[4:5]
+; GISEL-NEXT: s_endpgm
call void @external_void_func_v5f32(<5 x float> <float 1.0, float 2.0, float 4.0, float -1.0, float 0.5>)
ret void
}
@@ -2129,25 +2565,25 @@ define amdgpu_kernel void @test_call_external_void_func_f64_imm() #0 {
; CI-NEXT: s_swappc_b64 s[30:31], s[4:5]
; CI-NEXT: s_endpgm
;
-; GFX9-LABEL: test_call_external_void_func_f64_imm:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0
-; GFX9-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1
-; GFX9-NEXT: s_mov_b32 s38, -1
-; GFX9-NEXT: s_mov_b32 s39, 0xe00000
-; GFX9-NEXT: s_add_u32 s36, s36, s3
-; GFX9-NEXT: s_addc_u32 s37, s37, 0
-; GFX9-NEXT: s_mov_b64 s[6:7], s[0:1]
-; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37]
-; GFX9-NEXT: s_getpc_b64 s[4:5]
-; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_f64@rel32@lo+4
-; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_f64@rel32@hi+12
-; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39]
-; GFX9-NEXT: v_mov_b32_e32 v0, 0
-; GFX9-NEXT: v_mov_b32_e32 v1, 0x40100000
-; GFX9-NEXT: s_mov_b32 s32, 0
-; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5]
-; GFX9-NEXT: s_endpgm
+; SDAG-LABEL: test_call_external_void_func_f64_imm:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0
+; SDAG-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1
+; SDAG-NEXT: s_mov_b32 s38, -1
+; SDAG-NEXT: s_mov_b32 s39, 0xe00000
+; SDAG-NEXT: s_add_u32 s36, s36, s3
+; SDAG-NEXT: s_addc_u32 s37, s37, 0
+; SDAG-NEXT: s_mov_b64 s[6:7], s[0:1]
+; SDAG-NEXT: s_mov_b64 s[0:1], s[36:37]
+; SDAG-NEXT: s_getpc_b64 s[4:5]
+; SDAG-NEXT: s_add_u32 s4, s4, external_void_func_f64@rel32@lo+4
+; SDAG-NEXT: s_addc_u32 s5, s5, external_void_func_f64@rel32@hi+12
+; SDAG-NEXT: s_mov_b64 s[2:3], s[38:39]
+; SDAG-NEXT: v_mov_b32_e32 v0, 0
+; SDAG-NEXT: v_mov_b32_e32 v1, 0x40100000
+; SDAG-NEXT: s_mov_b32 s32, 0
+; SDAG-NEXT: s_swappc_b64 s[30:31], s[4:5]
+; SDAG-NEXT: s_endpgm
;
; GFX11-LABEL: test_call_external_void_func_f64_imm:
; GFX11: ; %bb.0:
@@ -2176,6 +2612,26 @@ define amdgpu_kernel void @test_call_external_void_func_f64_imm() #0 {
; HSA-NEXT: s_mov_b32 s32, 0
; HSA-NEXT: s_swappc_b64 s[30:31], s[8:9]
; HSA-NEXT: s_endpgm
+;
+; GISEL-LABEL: test_call_external_void_func_f64_imm:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0
+; GISEL-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1
+; GISEL-NEXT: s_mov_b32 s38, -1
+; GISEL-NEXT: s_mov_b32 s39, 0xe00000
+; GISEL-NEXT: s_add_u32 s36, s36, s3
+; GISEL-NEXT: s_addc_u32 s37, s37, 0
+; GISEL-NEXT: s_mov_b64 s[6:7], s[0:1]
+; GISEL-NEXT: s_mov_b64 s[0:1], s[36:37]
+; GISEL-NEXT: s_getpc_b64 s[4:5]
+; GISEL-NEXT: s_add_u32 s4, s4, external_void_func_f64@rel32@lo+4
+; GISEL-NEXT: s_addc_u32 s5, s5, external_void_func_f64@rel32@hi+12
+; GISEL-NEXT: v_mov_b32_e32 v0, 0
+; GISEL-NEXT: v_mov_b32_e32 v1, 0x40100000
+; GISEL-NEXT: s_mov_b64 s[2:3], s[38:39]
+; GISEL-NEXT: s_mov_b32 s32, 0
+; GISEL-NEXT: s_swappc_b64 s[30:31], s[4:5]
+; GISEL-NEXT: s_endpgm
call void @external_void_func_f64(double 4.0)
ret void
}
@@ -2225,27 +2681,27 @@ define amdgpu_kernel void @test_call_external_void_func_v2f64_imm() #0 {
; CI-NEXT: s_swappc_b64 s[30:31], s[4:5]
; CI-NEXT: s_endpgm
;
-; GFX9-LABEL: test_call_external_void_func_v2f64_imm:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0
-; GFX9-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1
-; GFX9-NEXT: s_mov_b32 s38, -1
-; GFX9-NEXT: s_mov_b32 s39, 0xe00000
-; GFX9-NEXT: s_add_u32 s36, s36, s3
-; GFX9-NEXT: s_addc_u32 s37, s37, 0
-; GFX9-NEXT: s_mov_b64 s[6:7], s[0:1]
-; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37]
-; GFX9-NEXT: s_getpc_b64 s[4:5]
-; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_v2f64@rel32@lo+4
-; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_v2f64@rel32@hi+12
-; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39]
-; GFX9-NEXT: v_mov_b32_e32 v0, 0
-; GFX9-NEXT: v_mov_b32_e32 v1, 2.0
-; GFX9-NEXT: v_mov_b32_e32 v2, 0
-; GFX9-NEXT: v_mov_b32_e32 v3, 0x40100000
-; GFX9-NEXT: s_mov_b32 s32, 0
-; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5]
-; GFX9-NEXT: s_endpgm
+; SDAG-LABEL: test_call_external_void_func_v2f64_imm:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0
+; SDAG-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1
+; SDAG-NEXT: s_mov_b32 s38, -1
+; SDAG-NEXT: s_mov_b32 s39, 0xe00000
+; SDAG-NEXT: s_add_u32 s36, s36, s3
+; SDAG-NEXT: s_addc_u32 s37, s37, 0
+; SDAG-NEXT: s_mov_b64 s[6:7], s[0:1]
+; SDAG-NEXT: s_mov_b64 s[0:1], s[36:37]
+; SDAG-NEXT: s_getpc_b64 s[4:5]
+; SDAG-NEXT: s_add_u32 s4, s4, external_void_func_v2f64@rel32@lo+4
+; SDAG-NEXT: s_addc_u32 s5, s5, external_void_func_v2f64@rel32@hi+12
+; SDAG-NEXT: s_mov_b64 s[2:3], s[38:39]
+; SDAG-NEXT: v_mov_b32_e32 v0, 0
+; SDAG-NEXT: v_mov_b32_e32 v1, 2.0
+; SDAG-NEXT: v_mov_b32_e32 v2, 0
+; SDAG-NEXT: v_mov_b32_e32 v3, 0x40100000
+; SDAG-NEXT: s_mov_b32 s32, 0
+; SDAG-NEXT: s_swappc_b64 s[30:31], s[4:5]
+; SDAG-NEXT: s_endpgm
;
; GFX11-LABEL: test_call_external_void_func_v2f64_imm:
; GFX11: ; %bb.0:
@@ -2277,6 +2733,28 @@ define amdgpu_kernel void @test_call_external_void_func_v2f64_imm() #0 {
; HSA-NEXT: s_mov_b32 s32, 0
; HSA-NEXT: s_swappc_b64 s[30:31], s[8:9]
; HSA-NEXT: s_endpgm
+;
+; GISEL-LABEL: test_call_external_void_func_v2f64_imm:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0
+; GISEL-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1
+; GISEL-NEXT: s_mov_b32 s38, -1
+; GISEL-NEXT: s_mov_b32 s39, 0xe00000
+; GISEL-NEXT: s_add_u32 s36, s36, s3
+; GISEL-NEXT: s_addc_u32 s37, s37, 0
+; GISEL-NEXT: s_mov_b64 s[6:7], s[0:1]
+; GISEL-NEXT: s_mov_b64 s[0:1], s[36:37]
+; GISEL-NEXT: s_getpc_b64 s[4:5]
+; GISEL-NEXT: s_add_u32 s4, s4, external_void_func_v2f64@rel32@lo+4
+; GISEL-NEXT: s_addc_u32 s5, s5, external_void_func_v2f64@rel32@hi+12
+; GISEL-NEXT: v_mov_b32_e32 v0, 0
+; GISEL-NEXT: v_mov_b32_e32 v1, 2.0
+; GISEL-NEXT: v_mov_b32_e32 v2, 0
+; GISEL-NEXT: v_mov_b32_e32 v3, 0x40100000
+; GISEL-NEXT: s_mov_b64 s[2:3], s[38:39]
+; GISEL-NEXT: s_mov_b32 s32, 0
+; GISEL-NEXT: s_swappc_b64 s[30:31], s[4:5]
+; GISEL-NEXT: s_endpgm
call void @external_void_func_v2f64(<2 x double> <double 2.0, double 4.0>)
ret void
}
@@ -2330,29 +2808,29 @@ define amdgpu_kernel void @test_call_external_void_func_v3f64_imm() #0 {
; CI-NEXT: s_swappc_b64 s[30:31], s[4:5]
; CI-NEXT: s_endpgm
;
-; GFX9-LABEL: test_call_external_void_func_v3f64_imm:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0
-; GFX9-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1
-; GFX9-NEXT: s_mov_b32 s38, -1
-; GFX9-NEXT: s_mov_b32 s39, 0xe00000
-; GFX9-NEXT: s_add_u32 s36, s36, s3
-; GFX9-NEXT: s_addc_u32 s37, s37, 0
-; GFX9-NEXT: s_mov_b64 s[6:7], s[0:1]
-; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37]
-; GFX9-NEXT: s_getpc_b64 s[4:5]
-; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_v3f64@rel32@lo+4
-; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_v3f64@rel32@hi+12
-; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39]
-; GFX9-NEXT: v_mov_b32_e32 v0, 0
-; GFX9-NEXT: v_mov_b32_e32 v1, 2.0
-; GFX9-NEXT: v_mov_b32_e32 v2, 0
-; GFX9-NEXT: v_mov_b32_e32 v3, 0x40100000
-; GFX9-NEXT: v_mov_b32_e32 v4, 0
-; GFX9-NEXT: v_mov_b32_e32 v5, 0x40200000
-; GFX9-NEXT: s_mov_b32 s32, 0
-; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5]
-; GFX9-NEXT: s_endpgm
+; SDAG-LABEL: test_call_external_void_func_v3f64_imm:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0
+; SDAG-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1
+; SDAG-NEXT: s_mov_b32 s38, -1
+; SDAG-NEXT: s_mov_b32 s39, 0xe00000
+; SDAG-NEXT: s_add_u32 s36, s36, s3
+; SDAG-NEXT: s_addc_u32 s37, s37, 0
+; SDAG-NEXT: s_mov_b64 s[6:7], s[0:1]
+; SDAG-NEXT: s_mov_b64 s[0:1], s[36:37]
+; SDAG-NEXT: s_getpc_b64 s[4:5]
+; SDAG-NEXT: s_add_u32 s4, s4, external_void_func_v3f64@rel32@lo+4
+; SDAG-NEXT: s_addc_u32 s5, s5, external_void_func_v3f64@rel32@hi+12
+; SDAG-NEXT: s_mov_b64 s[2:3], s[38:39]
+; SDAG-NEXT: v_mov_b32_e32 v0, 0
+; SDAG-NEXT: v_mov_b32_e32 v1, 2.0
+; SDAG-NEXT: v_mov_b32_e32 v2, 0
+; SDAG-NEXT: v_mov_b32_e32 v3, 0x40100000
+; SDAG-NEXT: v_mov_b32_e32 v4, 0
+; SDAG-NEXT: v_mov_b32_e32 v5, 0x40200000
+; SDAG-NEXT: s_mov_b32 s32, 0
+; SDAG-NEXT: s_swappc_b64 s[30:31], s[4:5]
+; SDAG-NEXT: s_endpgm
;
; GFX11-LABEL: test_call_external_void_func_v3f64_imm:
; GFX11: ; %bb.0:
@@ -2387,6 +2865,30 @@ define amdgpu_kernel void @test_call_external_void_func_v3f64_imm() #0 {
; HSA-NEXT: s_mov_b32 s32, 0
; HSA-NEXT: s_swappc_b64 s[30:31], s[8:9]
; HSA-NEXT: s_endpgm
+;
+; GISEL-LABEL: test_call_external_void_func_v3f64_imm:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0
+; GISEL-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1
+; GISEL-NEXT: s_mov_b32 s38, -1
+; GISEL-NEXT: s_mov_b32 s39, 0xe00000
+; GISEL-NEXT: s_add_u32 s36, s36, s3
+; GISEL-NEXT: s_addc_u32 s37, s37, 0
+; GISEL-NEXT: s_mov_b64 s[6:7], s[0:1]
+; GISEL-NEXT: s_mov_b64 s[0:1], s[36:37]
+; GISEL-NEXT: s_getpc_b64 s[4:5]
+; GISEL-NEXT: s_add_u32 s4, s4, external_void_func_v3f64@rel32@lo+4
+; GISEL-NEXT: s_addc_u32 s5, s5, external_void_func_v3f64@rel32@hi+12
+; GISEL-NEXT: v_mov_b32_e32 v0, 0
+; GISEL-NEXT: v_mov_b32_e32 v1, 2.0
+; GISEL-NEXT: v_mov_b32_e32 v2, 0
+; GISEL-NEXT: v_mov_b32_e32 v3, 0x40100000
+; GISEL-NEXT: v_mov_b32_e32 v4, 0
+; GISEL-NEXT: v_mov_b32_e32 v5, 0x40200000
+; GISEL-NEXT: s_mov_b64 s[2:3], s[38:39]
+; GISEL-NEXT: s_mov_b32 s32, 0
+; GISEL-NEXT: s_swappc_b64 s[30:31], s[4:5]
+; GISEL-NEXT: s_endpgm
call void @external_void_func_v3f64(<3 x double> <double 2.0, double 4.0, double 8.0>)
ret void
}
@@ -2436,26 +2938,26 @@ define amdgpu_kernel void @test_call_external_void_func_v2i16() #0 {
; CI-NEXT: s_swappc_b64 s[30:31], s[4:5]
; CI-NEXT: s_endpgm
;
-; GFX9-LABEL: test_call_external_void_func_v2i16:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0
-; GFX9-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1
-; GFX9-NEXT: s_mov_b32 s38, -1
-; GFX9-NEXT: s_mov_b32 s39, 0xe00000
-; GFX9-NEXT: s_add_u32 s36, s36, s3
-; GFX9-NEXT: s_mov_b32 s3, 0xf000
-; GFX9-NEXT: s_mov_b32 s2, -1
-; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], 0
-; GFX9-NEXT: s_addc_u32 s37, s37, 0
-; GFX9-NEXT: s_mov_b64 s[6:7], s[0:1]
-; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37]
-; GFX9-NEXT: s_getpc_b64 s[4:5]
-; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_v2i16@rel32@lo+4
-; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_v2i16@rel32@hi+12
-; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39]
-; GFX9-NEXT: s_mov_b32 s32, 0
-; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5]
-; GFX9-NEXT: s_endpgm
+; SDAG-LABEL: test_call_external_void_func_v2i16:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0
+; SDAG-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1
+; SDAG-NEXT: s_mov_b32 s38, -1
+; SDAG-NEXT: s_mov_b32 s39, 0xe00000
+; SDAG-NEXT: s_add_u32 s36, s36, s3
+; SDAG-NEXT: s_mov_b32 s3, 0xf000
+; SDAG-NEXT: s_mov_b32 s2, -1
+; SDAG-NEXT: buffer_load_dword v0, off, s[0:3], 0
+; SDAG-NEXT: s_addc_u32 s37, s37, 0
+; SDAG-NEXT: s_mov_b64 s[6:7], s[0:1]
+; SDAG-NEXT: s_mov_b64 s[0:1], s[36:37]
+; SDAG-NEXT: s_getpc_b64 s[4:5]
+; SDAG-NEXT: s_add_u32 s4, s4, external_void_func_v2i16@rel32@lo+4
+; SDAG-NEXT: s_addc_u32 s5, s5, external_void_func_v2i16@rel32@hi+12
+; SDAG-NEXT: s_mov_b64 s[2:3], s[38:39]
+; SDAG-NEXT: s_mov_b32 s32, 0
+; SDAG-NEXT: s_swappc_b64 s[30:31], s[4:5]
+; SDAG-NEXT: s_endpgm
;
; GFX11-LABEL: test_call_external_void_func_v2i16:
; GFX11: ; %bb.0:
@@ -2487,6 +2989,27 @@ define amdgpu_kernel void @test_call_external_void_func_v2i16() #0 {
; HSA-NEXT: s_mov_b32 s32, 0
; HSA-NEXT: s_swappc_b64 s[30:31], s[8:9]
; HSA-NEXT: s_endpgm
+;
+; GISEL-LABEL: test_call_external_void_func_v2i16:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0
+; GISEL-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1
+; GISEL-NEXT: s_mov_b32 s38, -1
+; GISEL-NEXT: s_load_dword s8, s[0:1], 0x0
+; GISEL-NEXT: s_mov_b32 s39, 0xe00000
+; GISEL-NEXT: s_add_u32 s36, s36, s3
+; GISEL-NEXT: s_addc_u32 s37, s37, 0
+; GISEL-NEXT: s_mov_b64 s[6:7], s[0:1]
+; GISEL-NEXT: s_mov_b64 s[0:1], s[36:37]
+; GISEL-NEXT: s_getpc_b64 s[4:5]
+; GISEL-NEXT: s_add_u32 s4, s4, external_void_func_v2i16@rel32@lo+4
+; GISEL-NEXT: s_addc_u32 s5, s5, external_void_func_v2i16@rel32@hi+12
+; GISEL-NEXT: s_mov_b64 s[2:3], s[38:39]
+; GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GISEL-NEXT: v_mov_b32_e32 v0, s8
+; GISEL-NEXT: s_mov_b32 s32, 0
+; GISEL-NEXT: s_swappc_b64 s[30:31], s[4:5]
+; GISEL-NEXT: s_endpgm
%val = load <2 x i16>, ptr addrspace(1) poison
call void @external_void_func_v2i16(<2 x i16> %val)
ret void
@@ -2539,26 +3062,26 @@ define amdgpu_kernel void @test_call_external_void_func_v3i16() #0 {
; CI-NEXT: s_swappc_b64 s[30:31], s[4:5]
; CI-NEXT: s_endpgm
;
-; GFX9-LABEL: test_call_external_void_func_v3i16:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0
-; GFX9-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1
-; GFX9-NEXT: s_mov_b32 s38, -1
-; GFX9-NEXT: s_mov_b32 s39, 0xe00000
-; GFX9-NEXT: s_add_u32 s36, s36, s3
-; GFX9-NEXT: s_mov_b32 s3, 0xf000
-; GFX9-NEXT: s_mov_b32 s2, -1
-; GFX9-NEXT: buffer_load_dwordx2 v[0:1], off, s[0:3], 0
-; GFX9-NEXT: s_addc_u32 s37, s37, 0
-; GFX9-NEXT: s_mov_b64 s[6:7], s[0:1]
-; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37]
-; GFX9-NEXT: s_getpc_b64 s[4:5]
-; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_v3i16@rel32@lo+4
-; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_v3i16@rel32@hi+12
-; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39]
-; GFX9-NEXT: s_mov_b32 s32, 0
-; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5]
-; GFX9-NEXT: s_endpgm
+; SDAG-LABEL: test_call_external_void_func_v3i16:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0
+; SDAG-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1
+; SDAG-NEXT: s_mov_b32 s38, -1
+; SDAG-NEXT: s_mov_b32 s39, 0xe00000
+; SDAG-NEXT: s_add_u32 s36, s36, s3
+; SDAG-NEXT: s_mov_b32 s3, 0xf000
+; SDAG-NEXT: s_mov_b32 s2, -1
+; SDAG-NEXT: buffer_load_dwordx2 v[0:1], off, s[0:3], 0
+; SDAG-NEXT: s_addc_u32 s37, s37, 0
+; SDAG-NEXT: s_mov_b64 s[6:7], s[0:1]
+; SDAG-NEXT: s_mov_b64 s[0:1], s[36:37]
+; SDAG-NEXT: s_getpc_b64 s[4:5]
+; SDAG-NEXT: s_add_u32 s4, s4, external_void_func_v3i16@rel32@lo+4
+; SDAG-NEXT: s_addc_u32 s5, s5, external_void_func_v3i16@rel32@hi+12
+; SDAG-NEXT: s_mov_b64 s[2:3], s[38:39]
+; SDAG-NEXT: s_mov_b32 s32, 0
+; SDAG-NEXT: s_swappc_b64 s[30:31], s[4:5]
+; SDAG-NEXT: s_endpgm
;
; GFX11-LABEL: test_call_external_void_func_v3i16:
; GFX11: ; %bb.0:
@@ -2590,6 +3113,28 @@ define amdgpu_kernel void @test_call_external_void_func_v3i16() #0 {
; HSA-NEXT: s_mov_b32 s32, 0
; HSA-NEXT: s_swappc_b64 s[30:31], s[8:9]
; HSA-NEXT: s_endpgm
+;
+; GISEL-LABEL: test_call_external_void_func_v3i16:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0
+; GISEL-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1
+; GISEL-NEXT: s_mov_b64 s[6:7], s[0:1]
+; GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GISEL-NEXT: s_mov_b32 s38, -1
+; GISEL-NEXT: s_mov_b32 s39, 0xe00000
+; GISEL-NEXT: s_add_u32 s36, s36, s3
+; GISEL-NEXT: s_addc_u32 s37, s37, 0
+; GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GISEL-NEXT: v_mov_b32_e32 v0, s0
+; GISEL-NEXT: v_mov_b32_e32 v1, s1
+; GISEL-NEXT: s_mov_b64 s[0:1], s[36:37]
+; GISEL-NEXT: s_getpc_b64 s[4:5]
+; GISEL-NEXT: s_add_u32 s4, s4, external_void_func_v3i16@rel32@lo+4
+; GISEL-NEXT: s_addc_u32 s5, s5, external_void_func_v3i16@rel32@hi+12
+; GISEL-NEXT: s_mov_b64 s[2:3], s[38:39]
+; GISEL-NEXT: s_mov_b32 s32, 0
+; GISEL-NEXT: s_swappc_b64 s[30:31], s[4:5]
+; GISEL-NEXT: s_endpgm
%val = load <3 x i16>, ptr addrspace(1) poison
call void @external_void_func_v3i16(<3 x i16> %val)
ret void
@@ -2643,26 +3188,26 @@ define amdgpu_kernel void @test_call_external_void_func_v3f16() #0 {
; CI-NEXT: s_swappc_b64 s[30:31], s[4:5]
; CI-NEXT: s_endpgm
;
-; GFX9-LABEL: test_call_external_void_func_v3f16:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0
-; GFX9-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1
-; GFX9-NEXT: s_mov_b32 s38, -1
-; GFX9-NEXT: s_mov_b32 s39, 0xe00000
-; GFX9-NEXT: s_add_u32 s36, s36, s3
-; GFX9-NEXT: s_mov_b32 s3, 0xf000
-; GFX9-NEXT: s_mov_b32 s2, -1
-; GFX9-NEXT: buffer_load_dwordx2 v[0:1], off, s[0:3], 0
-; GFX9-NEXT: s_addc_u32 s37, s37, 0
-; GFX9-NEXT: s_mov_b64 s[6:7], s[0:1]
-; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37]
-; GFX9-NEXT: s_getpc_b64 s[4:5]
-; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_v3f16@rel32@lo+4
-; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_v3f16@rel32@hi+12
-; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39]
-; GFX9-NEXT: s_mov_b32 s32, 0
-; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5]
-; GFX9-NEXT: s_endpgm
+; SDAG-LABEL: test_call_external_void_func_v3f16:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0
+; SDAG-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1
+; SDAG-NEXT: s_mov_b32 s38, -1
+; SDAG-NEXT: s_mov_b32 s39, 0xe00000
+; SDAG-NEXT: s_add_u32 s36, s36, s3
+; SDAG-NEXT: s_mov_b32 s3, 0xf000
+; SDAG-NEXT: s_mov_b32 s2, -1
+; SDAG-NEXT: buffer_load_dwordx2 v[0:1], off, s[0:3], 0
+; SDAG-NEXT: s_addc_u32 s37, s37, 0
+; SDAG-NEXT: s_mov_b64 s[6:7], s[0:1]
+; SDAG-NEXT: s_mov_b64 s[0:1], s[36:37]
+; SDAG-NEXT: s_getpc_b64 s[4:5]
+; SDAG-NEXT: s_add_u32 s4, s4, external_void_func_v3f16@rel32@lo+4
+; SDAG-NEXT: s_addc_u32 s5, s5, external_void_func_v3f16@rel32@hi+12
+; SDAG-NEXT: s_mov_b64 s[2:3], s[38:39]
+; SDAG-NEXT: s_mov_b32 s32, 0
+; SDAG-NEXT: s_swappc_b64 s[30:31], s[4:5]
+; SDAG-NEXT: s_endpgm
;
; GFX11-LABEL: test_call_external_void_func_v3f16:
; GFX11: ; %bb.0:
@@ -2694,6 +3239,28 @@ define amdgpu_kernel void @test_call_external_void_func_v3f16() #0 {
; HSA-NEXT: s_mov_b32 s32, 0
; HSA-NEXT: s_swappc_b64 s[30:31], s[8:9]
; HSA-NEXT: s_endpgm
+;
+; GISEL-LABEL: test_call_external_void_func_v3f16:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0
+; GISEL-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1
+; GISEL-NEXT: s_mov_b64 s[6:7], s[0:1]
+; GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GISEL-NEXT: s_mov_b32 s38, -1
+; GISEL-NEXT: s_mov_b32 s39, 0xe00000
+; GISEL-NEXT: s_add_u32 s36, s36, s3
+; GISEL-NEXT: s_addc_u32 s37, s37, 0
+; GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GISEL-NEXT: v_mov_b32_e32 v0, s0
+; GISEL-NEXT: v_mov_b32_e32 v1, s1
+; GISEL-NEXT: s_mov_b64 s[0:1], s[36:37]
+; GISEL-NEXT: s_getpc_b64 s[4:5]
+; GISEL-NEXT: s_add_u32 s4, s4, external_void_func_v3f16@rel32@lo+4
+; GISEL-NEXT: s_addc_u32 s5, s5, external_void_func_v3f16@rel32@hi+12
+; GISEL-NEXT: s_mov_b64 s[2:3], s[38:39]
+; GISEL-NEXT: s_mov_b32 s32, 0
+; GISEL-NEXT: s_swappc_b64 s[30:31], s[4:5]
+; GISEL-NEXT: s_endpgm
%val = load <3 x half>, ptr addrspace(1) poison
call void @external_void_func_v3f16(<3 x half> %val)
ret void
@@ -2741,25 +3308,25 @@ define amdgpu_kernel void @test_call_external_void_func_v3i16_imm() #0 {
; CI-NEXT: s_swappc_b64 s[30:31], s[4:5]
; CI-NEXT: s_endpgm
;
-; GFX9-LABEL: test_call_external_void_func_v3i16_imm:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0
-; GFX9-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1
-; GFX9-NEXT: s_mov_b32 s38, -1
-; GFX9-NEXT: s_mov_b32 s39, 0xe00000
-; GFX9-NEXT: s_add_u32 s36, s36, s3
-; GFX9-NEXT: s_addc_u32 s37, s37, 0
-; GFX9-NEXT: s_mov_b64 s[6:7], s[0:1]
-; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37]
-; GFX9-NEXT: s_getpc_b64 s[4:5]
-; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_v3i16@rel32@lo+4
-; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_v3i16@rel32@hi+12
-; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39]
-; GFX9-NEXT: v_mov_b32_e32 v0, 0x20001
-; GFX9-NEXT: v_mov_b32_e32 v1, 3
-; GFX9-NEXT: s_mov_b32 s32, 0
-; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5]
-; GFX9-NEXT: s_endpgm
+; SDAG-LABEL: test_call_external_void_func_v3i16_imm:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0
+; SDAG-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1
+; SDAG-NEXT: s_mov_b32 s38, -1
+; SDAG-NEXT: s_mov_b32 s39, 0xe00000
+; SDAG-NEXT: s_add_u32 s36, s36, s3
+; SDAG-NEXT: s_addc_u32 s37, s37, 0
+; SDAG-NEXT: s_mov_b64 s[6:7], s[0:1]
+; SDAG-NEXT: s_mov_b64 s[0:1], s[36:37]
+; SDAG-NEXT: s_getpc_b64 s[4:5]
+; SDAG-NEXT: s_add_u32 s4, s4, external_void_func_v3i16@rel32@lo+4
+; SDAG-NEXT: s_addc_u32 s5, s5, external_void_func_v3i16@rel32@hi+12
+; SDAG-NEXT: s_mov_b64 s[2:3], s[38:39]
+; SDAG-NEXT: v_mov_b32_e32 v0, 0x20001
+; SDAG-NEXT: v_mov_b32_e32 v1, 3
+; SDAG-NEXT: s_mov_b32 s32, 0
+; SDAG-NEXT: s_swappc_b64 s[30:31], s[4:5]
+; SDAG-NEXT: s_endpgm
;
; GFX11-LABEL: test_call_external_void_func_v3i16_imm:
; GFX11: ; %bb.0:
@@ -2788,6 +3355,26 @@ define amdgpu_kernel void @test_call_external_void_func_v3i16_imm() #0 {
; HSA-NEXT: s_mov_b32 s32, 0
; HSA-NEXT: s_swappc_b64 s[30:31], s[8:9]
; HSA-NEXT: s_endpgm
+;
+; GISEL-LABEL: test_call_external_void_func_v3i16_imm:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0
+; GISEL-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1
+; GISEL-NEXT: s_mov_b32 s38, -1
+; GISEL-NEXT: s_mov_b32 s39, 0xe00000
+; GISEL-NEXT: s_add_u32 s36, s36, s3
+; GISEL-NEXT: s_addc_u32 s37, s37, 0
+; GISEL-NEXT: s_mov_b64 s[6:7], s[0:1]
+; GISEL-NEXT: s_mov_b64 s[0:1], s[36:37]
+; GISEL-NEXT: s_getpc_b64 s[4:5]
+; GISEL-NEXT: s_add_u32 s4, s4, external_void_func_v3i16@rel32@lo+4
+; GISEL-NEXT: s_addc_u32 s5, s5, external_void_func_v3i16@rel32@hi+12
+; GISEL-NEXT: v_mov_b32_e32 v0, 0x20001
+; GISEL-NEXT: v_mov_b32_e32 v1, 3
+; GISEL-NEXT: s_mov_b64 s[2:3], s[38:39]
+; GISEL-NEXT: s_mov_b32 s32, 0
+; GISEL-NEXT: s_swappc_b64 s[30:31], s[4:5]
+; GISEL-NEXT: s_endpgm
call void @external_void_func_v3i16(<3 x i16> <i16 1, i16 2, i16 3>)
ret void
}
@@ -2834,25 +3421,25 @@ define amdgpu_kernel void @test_call_external_void_func_v3f16_imm() #0 {
; CI-NEXT: s_swappc_b64 s[30:31], s[4:5]
; CI-NEXT: s_endpgm
;
-; GFX9-LABEL: test_call_external_void_func_v3f16_imm:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0
-; GFX9-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1
-; GFX9-NEXT: s_mov_b32 s38, -1
-; GFX9-NEXT: s_mov_b32 s39, 0xe00000
-; GFX9-NEXT: s_add_u32 s36, s36, s3
-; GFX9-NEXT: s_addc_u32 s37, s37, 0
-; GFX9-NEXT: s_mov_b64 s[6:7], s[0:1]
-; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37]
-; GFX9-NEXT: s_getpc_b64 s[4:5]
-; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_v3f16@rel32@lo+4
-; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_v3f16@rel32@hi+12
-; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39]
-; GFX9-NEXT: v_mov_b32_e32 v0, 0x40003c00
-; GFX9-NEXT: v_mov_b32_e32 v1, 0x4400
-; GFX9-NEXT: s_mov_b32 s32, 0
-; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5]
-; GFX9-NEXT: s_endpgm
+; SDAG-LABEL: test_call_external_void_func_v3f16_imm:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0
+; SDAG-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1
+; SDAG-NEXT: s_mov_b32 s38, -1
+; SDAG-NEXT: s_mov_b32 s39, 0xe00000
+; SDAG-NEXT: s_add_u32 s36, s36, s3
+; SDAG-NEXT: s_addc_u32 s37, s37, 0
+; SDAG-NEXT: s_mov_b64 s[6:7], s[0:1]
+; SDAG-NEXT: s_mov_b64 s[0:1], s[36:37]
+; SDAG-NEXT: s_getpc_b64 s[4:5]
+; SDAG-NEXT: s_add_u32 s4, s4, external_void_func_v3f16@rel32@lo+4
+; SDAG-NEXT: s_addc_u32 s5, s5, external_void_func_v3f16@rel32@hi+12
+; SDAG-NEXT: s_mov_b64 s[2:3], s[38:39]
+; SDAG-NEXT: v_mov_b32_e32 v0, 0x40003c00
+; SDAG-NEXT: v_mov_b32_e32 v1, 0x4400
+; SDAG-NEXT: s_mov_b32 s32, 0
+; SDAG-NEXT: s_swappc_b64 s[30:31], s[4:5]
+; SDAG-NEXT: s_endpgm
;
; GFX11-LABEL: test_call_external_void_func_v3f16_imm:
; GFX11: ; %bb.0:
@@ -2882,6 +3469,26 @@ define amdgpu_kernel void @test_call_external_void_func_v3f16_imm() #0 {
; HSA-NEXT: s_mov_b32 s32, 0
; HSA-NEXT: s_swappc_b64 s[30:31], s[8:9]
; HSA-NEXT: s_endpgm
+;
+; GISEL-LABEL: test_call_external_void_func_v3f16_imm:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0
+; GISEL-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1
+; GISEL-NEXT: s_mov_b32 s38, -1
+; GISEL-NEXT: s_mov_b32 s39, 0xe00000
+; GISEL-NEXT: s_add_u32 s36, s36, s3
+; GISEL-NEXT: s_addc_u32 s37, s37, 0
+; GISEL-NEXT: s_mov_b64 s[6:7], s[0:1]
+; GISEL-NEXT: s_mov_b64 s[0:1], s[36:37]
+; GISEL-NEXT: s_getpc_b64 s[4:5]
+; GISEL-NEXT: s_add_u32 s4, s4, external_void_func_v3f16@rel32@lo+4
+; GISEL-NEXT: s_addc_u32 s5, s5, external_void_func_v3f16@rel32@hi+12
+; GISEL-NEXT: v_mov_b32_e32 v0, 0x40003c00
+; GISEL-NEXT: v_mov_b32_e32 v1, 0x4400
+; GISEL-NEXT: s_mov_b64 s[2:3], s[38:39]
+; GISEL-NEXT: s_mov_b32 s32, 0
+; GISEL-NEXT: s_swappc_b64 s[30:31], s[4:5]
+; GISEL-NEXT: s_endpgm
call void @external_void_func_v3f16(<3 x half> <half 1.0, half 2.0, half 4.0>)
ret void
}
@@ -2934,26 +3541,26 @@ define amdgpu_kernel void @test_call_external_void_func_v4i16() #0 {
; CI-NEXT: s_swappc_b64 s[30:31], s[4:5]
; CI-NEXT: s_endpgm
;
-; GFX9-LABEL: test_call_external_void_func_v4i16:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0
-; GFX9-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1
-; GFX9-NEXT: s_mov_b32 s38, -1
-; GFX9-NEXT: s_mov_b32 s39, 0xe00000
-; GFX9-NEXT: s_add_u32 s36, s36, s3
-; GFX9-NEXT: s_mov_b32 s3, 0xf000
-; GFX9-NEXT: s_mov_b32 s2, -1
-; GFX9-NEXT: buffer_load_dwordx2 v[0:1], off, s[0:3], 0
-; GFX9-NEXT: s_addc_u32 s37, s37, 0
-; GFX9-NEXT: s_mov_b64 s[6:7], s[0:1]
-; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37]
-; GFX9-NEXT: s_getpc_b64 s[4:5]
-; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_v4i16@rel32@lo+4
-; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_v4i16@rel32@hi+12
-; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39]
-; GFX9-NEXT: s_mov_b32 s32, 0
-; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5]
-; GFX9-NEXT: s_endpgm
+; SDAG-LABEL: test_call_external_void_func_v4i16:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0
+; SDAG-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1
+; SDAG-NEXT: s_mov_b32 s38, -1
+; SDAG-NEXT: s_mov_b32 s39, 0xe00000
+; SDAG-NEXT: s_add_u32 s36, s36, s3
+; SDAG-NEXT: s_mov_b32 s3, 0xf000
+; SDAG-NEXT: s_mov_b32 s2, -1
+; SDAG-NEXT: buffer_load_dwordx2 v[0:1], off, s[0:3], 0
+; SDAG-NEXT: s_addc_u32 s37, s37, 0
+; SDAG-NEXT: s_mov_b64 s[6:7], s[0:1]
+; SDAG-NEXT: s_mov_b64 s[0:1], s[36:37]
+; SDAG-NEXT: s_getpc_b64 s[4:5]
+; SDAG-NEXT: s_add_u32 s4, s4, external_void_func_v4i16@rel32@lo+4
+; SDAG-NEXT: s_addc_u32 s5, s5, external_void_func_v4i16@rel32@hi+12
+; SDAG-NEXT: s_mov_b64 s[2:3], s[38:39]
+; SDAG-NEXT: s_mov_b32 s32, 0
+; SDAG-NEXT: s_swappc_b64 s[30:31], s[4:5]
+; SDAG-NEXT: s_endpgm
;
; GFX11-LABEL: test_call_external_void_func_v4i16:
; GFX11: ; %bb.0:
@@ -2985,6 +3592,28 @@ define amdgpu_kernel void @test_call_external_void_func_v4i16() #0 {
; HSA-NEXT: s_mov_b32 s32, 0
; HSA-NEXT: s_swappc_b64 s[30:31], s[8:9]
; HSA-NEXT: s_endpgm
+;
+; GISEL-LABEL: test_call_external_void_func_v4i16:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0
+; GISEL-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1
+; GISEL-NEXT: s_mov_b64 s[6:7], s[0:1]
+; GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GISEL-NEXT: s_mov_b32 s38, -1
+; GISEL-NEXT: s_mov_b32 s39, 0xe00000
+; GISEL-NEXT: s_add_u32 s36, s36, s3
+; GISEL-NEXT: s_addc_u32 s37, s37, 0
+; GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GISEL-NEXT: v_mov_b32_e32 v0, s0
+; GISEL-NEXT: v_mov_b32_e32 v1, s1
+; GISEL-NEXT: s_mov_b64 s[0:1], s[36:37]
+; GISEL-NEXT: s_getpc_b64 s[4:5]
+; GISEL-NEXT: s_add_u32 s4, s4, external_void_func_v4i16@rel32@lo+4
+; GISEL-NEXT: s_addc_u32 s5, s5, external_void_func_v4i16@rel32@hi+12
+; GISEL-NEXT: s_mov_b64 s[2:3], s[38:39]
+; GISEL-NEXT: s_mov_b32 s32, 0
+; GISEL-NEXT: s_swappc_b64 s[30:31], s[4:5]
+; GISEL-NEXT: s_endpgm
%val = load <4 x i16>, ptr addrspace(1) poison
call void @external_void_func_v4i16(<4 x i16> %val)
ret void
@@ -3033,25 +3662,25 @@ define amdgpu_kernel void @test_call_external_void_func_v4i16_imm() #0 {
; CI-NEXT: s_swappc_b64 s[30:31], s[4:5]
; CI-NEXT: s_endpgm
;
-; GFX9-LABEL: test_call_external_void_func_v4i16_imm:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0
-; GFX9-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1
-; GFX9-NEXT: s_mov_b32 s38, -1
-; GFX9-NEXT: s_mov_b32 s39, 0xe00000
-; GFX9-NEXT: s_add_u32 s36, s36, s3
-; GFX9-NEXT: s_addc_u32 s37, s37, 0
-; GFX9-NEXT: s_mov_b64 s[6:7], s[0:1]
-; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37]
-; GFX9-NEXT: s_getpc_b64 s[4:5]
-; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_v4i16@rel32@lo+4
-; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_v4i16@rel32@hi+12
-; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39]
-; GFX9-NEXT: v_mov_b32_e32 v0, 0x20001
-; GFX9-NEXT: v_mov_b32_e32 v1, 0x40003
-; GFX9-NEXT: s_mov_b32 s32, 0
-; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5]
-; GFX9-NEXT: s_endpgm
+; SDAG-LABEL: test_call_external_void_func_v4i16_imm:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0
+; SDAG-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1
+; SDAG-NEXT: s_mov_b32 s38, -1
+; SDAG-NEXT: s_mov_b32 s39, 0xe00000
+; SDAG-NEXT: s_add_u32 s36, s36, s3
+; SDAG-NEXT: s_addc_u32 s37, s37, 0
+; SDAG-NEXT: s_mov_b64 s[6:7], s[0:1]
+; SDAG-NEXT: s_mov_b64 s[0:1], s[36:37]
+; SDAG-NEXT: s_getpc_b64 s[4:5]
+; SDAG-NEXT: s_add_u32 s4, s4, external_void_func_v4i16@rel32@lo+4
+; SDAG-NEXT: s_addc_u32 s5, s5, external_void_func_v4i16@rel32@hi+12
+; SDAG-NEXT: s_mov_b64 s[2:3], s[38:39]
+; SDAG-NEXT: v_mov_b32_e32 v0, 0x20001
+; SDAG-NEXT: v_mov_b32_e32 v1, 0x40003
+; SDAG-NEXT: s_mov_b32 s32, 0
+; SDAG-NEXT: s_swappc_b64 s[30:31], s[4:5]
+; SDAG-NEXT: s_endpgm
;
; GFX11-LABEL: test_call_external_void_func_v4i16_imm:
; GFX11: ; %bb.0:
@@ -3081,6 +3710,26 @@ define amdgpu_kernel void @test_call_external_void_func_v4i16_imm() #0 {
; HSA-NEXT: s_mov_b32 s32, 0
; HSA-NEXT: s_swappc_b64 s[30:31], s[8:9]
; HSA-NEXT: s_endpgm
+;
+; GISEL-LABEL: test_call_external_void_func_v4i16_imm:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0
+; GISEL-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1
+; GISEL-NEXT: s_mov_b32 s38, -1
+; GISEL-NEXT: s_mov_b32 s39, 0xe00000
+; GISEL-NEXT: s_add_u32 s36, s36, s3
+; GISEL-NEXT: s_addc_u32 s37, s37, 0
+; GISEL-NEXT: s_mov_b64 s[6:7], s[0:1]
+; GISEL-NEXT: s_mov_b64 s[0:1], s[36:37]
+; GISEL-NEXT: s_getpc_b64 s[4:5]
+; GISEL-NEXT: s_add_u32 s4, s4, external_void_func_v4i16@rel32@lo+4
+; GISEL-NEXT: s_addc_u32 s5, s5, external_void_func_v4i16@rel32@hi+12
+; GISEL-NEXT: v_mov_b32_e32 v0, 0x20001
+; GISEL-NEXT: v_mov_b32_e32 v1, 0x40003
+; GISEL-NEXT: s_mov_b64 s[2:3], s[38:39]
+; GISEL-NEXT: s_mov_b32 s32, 0
+; GISEL-NEXT: s_swappc_b64 s[30:31], s[4:5]
+; GISEL-NEXT: s_endpgm
call void @external_void_func_v4i16(<4 x i16> <i16 1, i16 2, i16 3, i16 4>)
ret void
}
@@ -3132,26 +3781,26 @@ define amdgpu_kernel void @test_call_external_void_func_v2f16() #0 {
; CI-NEXT: s_swappc_b64 s[30:31], s[4:5]
; CI-NEXT: s_endpgm
;
-; GFX9-LABEL: test_call_external_void_func_v2f16:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0
-; GFX9-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1
-; GFX9-NEXT: s_mov_b32 s38, -1
-; GFX9-NEXT: s_mov_b32 s39, 0xe00000
-; GFX9-NEXT: s_add_u32 s36, s36, s3
-; GFX9-NEXT: s_mov_b32 s3, 0xf000
-; GFX9-NEXT: s_mov_b32 s2, -1
-; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], 0
-; GFX9-NEXT: s_addc_u32 s37, s37, 0
-; GFX9-NEXT: s_mov_b64 s[6:7], s[0:1]
-; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37]
-; GFX9-NEXT: s_getpc_b64 s[4:5]
-; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_v2f16@rel32@lo+4
-; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_v2f16@rel32@hi+12
-; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39]
-; GFX9-NEXT: s_mov_b32 s32, 0
-; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5]
-; GFX9-NEXT: s_endpgm
+; SDAG-LABEL: test_call_external_void_func_v2f16:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0
+; SDAG-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1
+; SDAG-NEXT: s_mov_b32 s38, -1
+; SDAG-NEXT: s_mov_b32 s39, 0xe00000
+; SDAG-NEXT: s_add_u32 s36, s36, s3
+; SDAG-NEXT: s_mov_b32 s3, 0xf000
+; SDAG-NEXT: s_mov_b32 s2, -1
+; SDAG-NEXT: buffer_load_dword v0, off, s[0:3], 0
+; SDAG-NEXT: s_addc_u32 s37, s37, 0
+; SDAG-NEXT: s_mov_b64 s[6:7], s[0:1]
+; SDAG-NEXT: s_mov_b64 s[0:1], s[36:37]
+; SDAG-NEXT: s_getpc_b64 s[4:5]
+; SDAG-NEXT: s_add_u32 s4, s4, external_void_func_v2f16@rel32@lo+4
+; SDAG-NEXT: s_addc_u32 s5, s5, external_void_func_v2f16@rel32@hi+12
+; SDAG-NEXT: s_mov_b64 s[2:3], s[38:39]
+; SDAG-NEXT: s_mov_b32 s32, 0
+; SDAG-NEXT: s_swappc_b64 s[30:31], s[4:5]
+; SDAG-NEXT: s_endpgm
;
; GFX11-LABEL: test_call_external_void_func_v2f16:
; GFX11: ; %bb.0:
@@ -3183,6 +3832,27 @@ define amdgpu_kernel void @test_call_external_void_func_v2f16() #0 {
; HSA-NEXT: s_mov_b32 s32, 0
; HSA-NEXT: s_swappc_b64 s[30:31], s[8:9]
; HSA-NEXT: s_endpgm
+;
+; GISEL-LABEL: test_call_external_void_func_v2f16:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0
+; GISEL-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1
+; GISEL-NEXT: s_mov_b32 s38, -1
+; GISEL-NEXT: s_load_dword s8, s[0:1], 0x0
+; GISEL-NEXT: s_mov_b32 s39, 0xe00000
+; GISEL-NEXT: s_add_u32 s36, s36, s3
+; GISEL-NEXT: s_addc_u32 s37, s37, 0
+; GISEL-NEXT: s_mov_b64 s[6:7], s[0:1]
+; GISEL-NEXT: s_mov_b64 s[0:1], s[36:37]
+; GISEL-NEXT: s_getpc_b64 s[4:5]
+; GISEL-NEXT: s_add_u32 s4, s4, external_void_func_v2f16@rel32@lo+4
+; GISEL-NEXT: s_addc_u32 s5, s5, external_void_func_v2f16@rel32@hi+12
+; GISEL-NEXT: s_mov_b64 s[2:3], s[38:39]
+; GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GISEL-NEXT: v_mov_b32_e32 v0, s8
+; GISEL-NEXT: s_mov_b32 s32, 0
+; GISEL-NEXT: s_swappc_b64 s[30:31], s[4:5]
+; GISEL-NEXT: s_endpgm
%val = load <2 x half>, ptr addrspace(1) poison
call void @external_void_func_v2f16(<2 x half> %val)
ret void
@@ -3231,26 +3901,26 @@ define amdgpu_kernel void @test_call_external_void_func_v2i32() #0 {
; CI-NEXT: s_swappc_b64 s[30:31], s[4:5]
; CI-NEXT: s_endpgm
;
-; GFX9-LABEL: test_call_external_void_func_v2i32:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0
-; GFX9-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1
-; GFX9-NEXT: s_mov_b32 s38, -1
-; GFX9-NEXT: s_mov_b32 s39, 0xe00000
-; GFX9-NEXT: s_add_u32 s36, s36, s3
-; GFX9-NEXT: s_mov_b32 s3, 0xf000
-; GFX9-NEXT: s_mov_b32 s2, -1
-; GFX9-NEXT: buffer_load_dwordx2 v[0:1], off, s[0:3], 0
-; GFX9-NEXT: s_addc_u32 s37, s37, 0
-; GFX9-NEXT: s_mov_b64 s[6:7], s[0:1]
-; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37]
-; GFX9-NEXT: s_getpc_b64 s[4:5]
-; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_v2i32@rel32@lo+4
-; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_v2i32@rel32@hi+12
-; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39]
-; GFX9-NEXT: s_mov_b32 s32, 0
-; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5]
-; GFX9-NEXT: s_endpgm
+; SDAG-LABEL: test_call_external_void_func_v2i32:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0
+; SDAG-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1
+; SDAG-NEXT: s_mov_b32 s38, -1
+; SDAG-NEXT: s_mov_b32 s39, 0xe00000
+; SDAG-NEXT: s_add_u32 s36, s36, s3
+; SDAG-NEXT: s_mov_b32 s3, 0xf000
+; SDAG-NEXT: s_mov_b32 s2, -1
+; SDAG-NEXT: buffer_load_dwordx2 v[0:1], off, s[0:3], 0
+; SDAG-NEXT: s_addc_u32 s37, s37, 0
+; SDAG-NEXT: s_mov_b64 s[6:7], s[0:1]
+; SDAG-NEXT: s_mov_b64 s[0:1], s[36:37]
+; SDAG-NEXT: s_getpc_b64 s[4:5]
+; SDAG-NEXT: s_add_u32 s4, s4, external_void_func_v2i32@rel32@lo+4
+; SDAG-NEXT: s_addc_u32 s5, s5, external_void_func_v2i32@rel32@hi+12
+; SDAG-NEXT: s_mov_b64 s[2:3], s[38:39]
+; SDAG-NEXT: s_mov_b32 s32, 0
+; SDAG-NEXT: s_swappc_b64 s[30:31], s[4:5]
+; SDAG-NEXT: s_endpgm
;
; GFX11-LABEL: test_call_external_void_func_v2i32:
; GFX11: ; %bb.0:
@@ -3282,6 +3952,28 @@ define amdgpu_kernel void @test_call_external_void_func_v2i32() #0 {
; HSA-NEXT: s_mov_b32 s32, 0
; HSA-NEXT: s_swappc_b64 s[30:31], s[8:9]
; HSA-NEXT: s_endpgm
+;
+; GISEL-LABEL: test_call_external_void_func_v2i32:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0
+; GISEL-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1
+; GISEL-NEXT: s_mov_b64 s[6:7], s[0:1]
+; GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GISEL-NEXT: s_mov_b32 s38, -1
+; GISEL-NEXT: s_mov_b32 s39, 0xe00000
+; GISEL-NEXT: s_add_u32 s36, s36, s3
+; GISEL-NEXT: s_addc_u32 s37, s37, 0
+; GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GISEL-NEXT: v_mov_b32_e32 v0, s0
+; GISEL-NEXT: v_mov_b32_e32 v1, s1
+; GISEL-NEXT: s_mov_b64 s[0:1], s[36:37]
+; GISEL-NEXT: s_getpc_b64 s[4:5]
+; GISEL-NEXT: s_add_u32 s4, s4, external_void_func_v2i32@rel32@lo+4
+; GISEL-NEXT: s_addc_u32 s5, s5, external_void_func_v2i32@rel32@hi+12
+; GISEL-NEXT: s_mov_b64 s[2:3], s[38:39]
+; GISEL-NEXT: s_mov_b32 s32, 0
+; GISEL-NEXT: s_swappc_b64 s[30:31], s[4:5]
+; GISEL-NEXT: s_endpgm
%val = load <2 x i32>, ptr addrspace(1) poison
call void @external_void_func_v2i32(<2 x i32> %val)
ret void
@@ -3328,25 +4020,25 @@ define amdgpu_kernel void @test_call_external_void_func_v2i32_imm() #0 {
; CI-NEXT: s_swappc_b64 s[30:31], s[4:5]
; CI-NEXT: s_endpgm
;
-; GFX9-LABEL: test_call_external_void_func_v2i32_imm:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0
-; GFX9-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1
-; GFX9-NEXT: s_mov_b32 s38, -1
-; GFX9-NEXT: s_mov_b32 s39, 0xe00000
-; GFX9-NEXT: s_add_u32 s36, s36, s3
-; GFX9-NEXT: s_addc_u32 s37, s37, 0
-; GFX9-NEXT: s_mov_b64 s[6:7], s[0:1]
-; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37]
-; GFX9-NEXT: s_getpc_b64 s[4:5]
-; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_v2i32@rel32@lo+4
-; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_v2i32@rel32@hi+12
-; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39]
-; GFX9-NEXT: v_mov_b32_e32 v0, 1
-; GFX9-NEXT: v_mov_b32_e32 v1, 2
-; GFX9-NEXT: s_mov_b32 s32, 0
-; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5]
-; GFX9-NEXT: s_endpgm
+; SDAG-LABEL: test_call_external_void_func_v2i32_imm:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0
+; SDAG-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1
+; SDAG-NEXT: s_mov_b32 s38, -1
+; SDAG-NEXT: s_mov_b32 s39, 0xe00000
+; SDAG-NEXT: s_add_u32 s36, s36, s3
+; SDAG-NEXT: s_addc_u32 s37, s37, 0
+; SDAG-NEXT: s_mov_b64 s[6:7], s[0:1]
+; SDAG-NEXT: s_mov_b64 s[0:1], s[36:37]
+; SDAG-NEXT: s_getpc_b64 s[4:5]
+; SDAG-NEXT: s_add_u32 s4, s4, external_void_func_v2i32@rel32@lo+4
+; SDAG-NEXT: s_addc_u32 s5, s5, external_void_func_v2i32@rel32@hi+12
+; SDAG-NEXT: s_mov_b64 s[2:3], s[38:39]
+; SDAG-NEXT: v_mov_b32_e32 v0, 1
+; SDAG-NEXT: v_mov_b32_e32 v1, 2
+; SDAG-NEXT: s_mov_b32 s32, 0
+; SDAG-NEXT: s_swappc_b64 s[30:31], s[4:5]
+; SDAG-NEXT: s_endpgm
;
; GFX11-LABEL: test_call_external_void_func_v2i32_imm:
; GFX11: ; %bb.0:
@@ -3375,6 +4067,26 @@ define amdgpu_kernel void @test_call_external_void_func_v2i32_imm() #0 {
; HSA-NEXT: s_mov_b32 s32, 0
; HSA-NEXT: s_swappc_b64 s[30:31], s[8:9]
; HSA-NEXT: s_endpgm
+;
+; GISEL-LABEL: test_call_external_void_func_v2i32_imm:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0
+; GISEL-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1
+; GISEL-NEXT: s_mov_b32 s38, -1
+; GISEL-NEXT: s_mov_b32 s39, 0xe00000
+; GISEL-NEXT: s_add_u32 s36, s36, s3
+; GISEL-NEXT: s_addc_u32 s37, s37, 0
+; GISEL-NEXT: s_mov_b64 s[6:7], s[0:1]
+; GISEL-NEXT: s_mov_b64 s[0:1], s[36:37]
+; GISEL-NEXT: s_getpc_b64 s[4:5]
+; GISEL-NEXT: s_add_u32 s4, s4, external_void_func_v2i32@rel32@lo+4
+; GISEL-NEXT: s_addc_u32 s5, s5, external_void_func_v2i32@rel32@hi+12
+; GISEL-NEXT: v_mov_b32_e32 v0, 1
+; GISEL-NEXT: v_mov_b32_e32 v1, 2
+; GISEL-NEXT: s_mov_b64 s[2:3], s[38:39]
+; GISEL-NEXT: s_mov_b32 s32, 0
+; GISEL-NEXT: s_swappc_b64 s[30:31], s[4:5]
+; GISEL-NEXT: s_endpgm
call void @external_void_func_v2i32(<2 x i32> <i32 1, i32 2>)
ret void
}
@@ -3422,26 +4134,26 @@ define amdgpu_kernel void @test_call_external_void_func_v3i32_imm(i32) #0 {
; CI-NEXT: s_swappc_b64 s[30:31], s[4:5]
; CI-NEXT: s_endpgm
;
-; GFX9-LABEL: test_call_external_void_func_v3i32_imm:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0
-; GFX9-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1
-; GFX9-NEXT: s_mov_b32 s38, -1
-; GFX9-NEXT: s_mov_b32 s39, 0xe00000
-; GFX9-NEXT: s_add_u32 s36, s36, s5
-; GFX9-NEXT: s_addc_u32 s37, s37, 0
-; GFX9-NEXT: s_mov_b64 s[6:7], s[0:1]
-; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37]
-; GFX9-NEXT: s_getpc_b64 s[4:5]
-; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_v3i32@rel32@lo+4
-; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_v3i32@rel32@hi+12
-; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39]
-; GFX9-NEXT: v_mov_b32_e32 v0, 3
-; GFX9-NEXT: v_mov_b32_e32 v1, 4
-; GFX9-NEXT: v_mov_b32_e32 v2, 5
-; GFX9-NEXT: s_mov_b32 s32, 0
-; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5]
-; GFX9-NEXT: s_endpgm
+; SDAG-LABEL: test_call_external_void_func_v3i32_imm:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0
+; SDAG-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1
+; SDAG-NEXT: s_mov_b32 s38, -1
+; SDAG-NEXT: s_mov_b32 s39, 0xe00000
+; SDAG-NEXT: s_add_u32 s36, s36, s5
+; SDAG-NEXT: s_addc_u32 s37, s37, 0
+; SDAG-NEXT: s_mov_b64 s[6:7], s[0:1]
+; SDAG-NEXT: s_mov_b64 s[0:1], s[36:37]
+; SDAG-NEXT: s_getpc_b64 s[4:5]
+; SDAG-NEXT: s_add_u32 s4, s4, external_void_func_v3i32@rel32@lo+4
+; SDAG-NEXT: s_addc_u32 s5, s5, external_void_func_v3i32@rel32@hi+12
+; SDAG-NEXT: s_mov_b64 s[2:3], s[38:39]
+; SDAG-NEXT: v_mov_b32_e32 v0, 3
+; SDAG-NEXT: v_mov_b32_e32 v1, 4
+; SDAG-NEXT: v_mov_b32_e32 v2, 5
+; SDAG-NEXT: s_mov_b32 s32, 0
+; SDAG-NEXT: s_swappc_b64 s[30:31], s[4:5]
+; SDAG-NEXT: s_endpgm
;
; GFX11-LABEL: test_call_external_void_func_v3i32_imm:
; GFX11: ; %bb.0:
@@ -3472,6 +4184,27 @@ define amdgpu_kernel void @test_call_external_void_func_v3i32_imm(i32) #0 {
; HSA-NEXT: s_mov_b32 s32, 0
; HSA-NEXT: s_swappc_b64 s[30:31], s[8:9]
; HSA-NEXT: s_endpgm
+;
+; GISEL-LABEL: test_call_external_void_func_v3i32_imm:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0
+; GISEL-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1
+; GISEL-NEXT: s_mov_b32 s38, -1
+; GISEL-NEXT: s_mov_b32 s39, 0xe00000
+; GISEL-NEXT: s_add_u32 s36, s36, s5
+; GISEL-NEXT: s_addc_u32 s37, s37, 0
+; GISEL-NEXT: s_mov_b64 s[6:7], s[0:1]
+; GISEL-NEXT: s_mov_b64 s[0:1], s[36:37]
+; GISEL-NEXT: s_getpc_b64 s[4:5]
+; GISEL-NEXT: s_add_u32 s4, s4, external_void_func_v3i32@rel32@lo+4
+; GISEL-NEXT: s_addc_u32 s5, s5, external_void_func_v3i32@rel32@hi+12
+; GISEL-NEXT: v_mov_b32_e32 v0, 3
+; GISEL-NEXT: v_mov_b32_e32 v1, 4
+; GISEL-NEXT: v_mov_b32_e32 v2, 5
+; GISEL-NEXT: s_mov_b64 s[2:3], s[38:39]
+; GISEL-NEXT: s_mov_b32 s32, 0
+; GISEL-NEXT: s_swappc_b64 s[30:31], s[4:5]
+; GISEL-NEXT: s_endpgm
call void @external_void_func_v3i32(<3 x i32> <i32 3, i32 4, i32 5>)
ret void
}
@@ -3521,27 +4254,27 @@ define amdgpu_kernel void @test_call_external_void_func_v3i32_i32(i32) #0 {
; CI-NEXT: s_swappc_b64 s[30:31], s[4:5]
; CI-NEXT: s_endpgm
;
-; GFX9-LABEL: test_call_external_void_func_v3i32_i32:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0
-; GFX9-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1
-; GFX9-NEXT: s_mov_b32 s38, -1
-; GFX9-NEXT: s_mov_b32 s39, 0xe00000
-; GFX9-NEXT: s_add_u32 s36, s36, s5
-; GFX9-NEXT: s_addc_u32 s37, s37, 0
-; GFX9-NEXT: s_mov_b64 s[6:7], s[0:1]
-; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37]
-; GFX9-NEXT: s_getpc_b64 s[4:5]
-; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_v3i32_i32@rel32@lo+4
-; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_v3i32_i32@rel32@hi+12
-; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39]
-; GFX9-NEXT: v_mov_b32_e32 v0, 3
-; GFX9-NEXT: v_mov_b32_e32 v1, 4
-; GFX9-NEXT: v_mov_b32_e32 v2, 5
-; GFX9-NEXT: v_mov_b32_e32 v3, 6
-; GFX9-NEXT: s_mov_b32 s32, 0
-; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5]
-; GFX9-NEXT: s_endpgm
+; SDAG-LABEL: test_call_external_void_func_v3i32_i32:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0
+; SDAG-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1
+; SDAG-NEXT: s_mov_b32 s38, -1
+; SDAG-NEXT: s_mov_b32 s39, 0xe00000
+; SDAG-NEXT: s_add_u32 s36, s36, s5
+; SDAG-NEXT: s_addc_u32 s37, s37, 0
+; SDAG-NEXT: s_mov_b64 s[6:7], s[0:1]
+; SDAG-NEXT: s_mov_b64 s[0:1], s[36:37]
+; SDAG-NEXT: s_getpc_b64 s[4:5]
+; SDAG-NEXT: s_add_u32 s4, s4, external_void_func_v3i32_i32@rel32@lo+4
+; SDAG-NEXT: s_addc_u32 s5, s5, external_void_func_v3i32_i32@rel32@hi+12
+; SDAG-NEXT: s_mov_b64 s[2:3], s[38:39]
+; SDAG-NEXT: v_mov_b32_e32 v0, 3
+; SDAG-NEXT: v_mov_b32_e32 v1, 4
+; SDAG-NEXT: v_mov_b32_e32 v2, 5
+; SDAG-NEXT: v_mov_b32_e32 v3, 6
+; SDAG-NEXT: s_mov_b32 s32, 0
+; SDAG-NEXT: s_swappc_b64 s[30:31], s[4:5]
+; SDAG-NEXT: s_endpgm
;
; GFX11-LABEL: test_call_external_void_func_v3i32_i32:
; GFX11: ; %bb.0:
@@ -3573,6 +4306,28 @@ define amdgpu_kernel void @test_call_external_void_func_v3i32_i32(i32) #0 {
; HSA-NEXT: s_mov_b32 s32, 0
; HSA-NEXT: s_swappc_b64 s[30:31], s[8:9]
; HSA-NEXT: s_endpgm
+;
+; GISEL-LABEL: test_call_external_void_func_v3i32_i32:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0
+; GISEL-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1
+; GISEL-NEXT: s_mov_b32 s38, -1
+; GISEL-NEXT: s_mov_b32 s39, 0xe00000
+; GISEL-NEXT: s_add_u32 s36, s36, s5
+; GISEL-NEXT: s_addc_u32 s37, s37, 0
+; GISEL-NEXT: s_mov_b64 s[6:7], s[0:1]
+; GISEL-NEXT: s_mov_b64 s[0:1], s[36:37]
+; GISEL-NEXT: s_getpc_b64 s[4:5]
+; GISEL-NEXT: s_add_u32 s4, s4, external_void_func_v3i32_i32@rel32@lo+4
+; GISEL-NEXT: s_addc_u32 s5, s5, external_void_func_v3i32_i32@rel32@hi+12
+; GISEL-NEXT: v_mov_b32_e32 v0, 3
+; GISEL-NEXT: v_mov_b32_e32 v1, 4
+; GISEL-NEXT: v_mov_b32_e32 v2, 5
+; GISEL-NEXT: v_mov_b32_e32 v3, 6
+; GISEL-NEXT: s_mov_b64 s[2:3], s[38:39]
+; GISEL-NEXT: s_mov_b32 s32, 0
+; GISEL-NEXT: s_swappc_b64 s[30:31], s[4:5]
+; GISEL-NEXT: s_endpgm
call void @external_void_func_v3i32_i32(<3 x i32> <i32 3, i32 4, i32 5>, i32 6)
ret void
}
@@ -3620,26 +4375,26 @@ define amdgpu_kernel void @test_call_external_void_func_v4i32() #0 {
; CI-NEXT: s_swappc_b64 s[30:31], s[4:5]
; CI-NEXT: s_endpgm
;
-; GFX9-LABEL: test_call_external_void_func_v4i32:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0
-; GFX9-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1
-; GFX9-NEXT: s_mov_b32 s38, -1
-; GFX9-NEXT: s_mov_b32 s39, 0xe00000
-; GFX9-NEXT: s_add_u32 s36, s36, s3
-; GFX9-NEXT: s_mov_b32 s3, 0xf000
-; GFX9-NEXT: s_mov_b32 s2, -1
-; GFX9-NEXT: buffer_load_dwordx4 v[0:3], off, s[0:3], 0
-; GFX9-NEXT: s_addc_u32 s37, s37, 0
-; GFX9-NEXT: s_mov_b64 s[6:7], s[0:1]
-; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37]
-; GFX9-NEXT: s_getpc_b64 s[4:5]
-; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_v4i32@rel32@lo+4
-; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_v4i32@rel32@hi+12
-; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39]
-; GFX9-NEXT: s_mov_b32 s32, 0
-; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5]
-; GFX9-NEXT: s_endpgm
+; SDAG-LABEL: test_call_external_void_func_v4i32:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0
+; SDAG-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1
+; SDAG-NEXT: s_mov_b32 s38, -1
+; SDAG-NEXT: s_mov_b32 s39, 0xe00000
+; SDAG-NEXT: s_add_u32 s36, s36, s3
+; SDAG-NEXT: s_mov_b32 s3, 0xf000
+; SDAG-NEXT: s_mov_b32 s2, -1
+; SDAG-NEXT: buffer_load_dwordx4 v[0:3], off, s[0:3], 0
+; SDAG-NEXT: s_addc_u32 s37, s37, 0
+; SDAG-NEXT: s_mov_b64 s[6:7], s[0:1]
+; SDAG-NEXT: s_mov_b64 s[0:1], s[36:37]
+; SDAG-NEXT: s_getpc_b64 s[4:5]
+; SDAG-NEXT: s_add_u32 s4, s4, external_void_func_v4i32@rel32@lo+4
+; SDAG-NEXT: s_addc_u32 s5, s5, external_void_func_v4i32@rel32@hi+12
+; SDAG-NEXT: s_mov_b64 s[2:3], s[38:39]
+; SDAG-NEXT: s_mov_b32 s32, 0
+; SDAG-NEXT: s_swappc_b64 s[30:31], s[4:5]
+; SDAG-NEXT: s_endpgm
;
; GFX11-LABEL: test_call_external_void_func_v4i32:
; GFX11: ; %bb.0:
@@ -3671,6 +4426,30 @@ define amdgpu_kernel void @test_call_external_void_func_v4i32() #0 {
; HSA-NEXT: s_mov_b32 s32, 0
; HSA-NEXT: s_swappc_b64 s[30:31], s[8:9]
; HSA-NEXT: s_endpgm
+;
+; GISEL-LABEL: test_call_external_void_func_v4i32:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0
+; GISEL-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1
+; GISEL-NEXT: s_mov_b32 s38, -1
+; GISEL-NEXT: s_mov_b32 s39, 0xe00000
+; GISEL-NEXT: s_add_u32 s36, s36, s3
+; GISEL-NEXT: s_mov_b64 s[6:7], s[0:1]
+; GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GISEL-NEXT: s_addc_u32 s37, s37, 0
+; GISEL-NEXT: s_getpc_b64 s[4:5]
+; GISEL-NEXT: s_add_u32 s4, s4, external_void_func_v4i32@rel32@lo+4
+; GISEL-NEXT: s_addc_u32 s5, s5, external_void_func_v4i32@rel32@hi+12
+; GISEL-NEXT: s_mov_b32 s32, 0
+; GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GISEL-NEXT: v_mov_b32_e32 v0, s0
+; GISEL-NEXT: v_mov_b32_e32 v1, s1
+; GISEL-NEXT: v_mov_b32_e32 v2, s2
+; GISEL-NEXT: v_mov_b32_e32 v3, s3
+; GISEL-NEXT: s_mov_b64 s[0:1], s[36:37]
+; GISEL-NEXT: s_mov_b64 s[2:3], s[38:39]
+; GISEL-NEXT: s_swappc_b64 s[30:31], s[4:5]
+; GISEL-NEXT: s_endpgm
%val = load <4 x i32>, ptr addrspace(1) poison
call void @external_void_func_v4i32(<4 x i32> %val)
ret void
@@ -3721,27 +4500,27 @@ define amdgpu_kernel void @test_call_external_void_func_v4i32_imm() #0 {
; CI-NEXT: s_swappc_b64 s[30:31], s[4:5]
; CI-NEXT: s_endpgm
;
-; GFX9-LABEL: test_call_external_void_func_v4i32_imm:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0
-; GFX9-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1
-; GFX9-NEXT: s_mov_b32 s38, -1
-; GFX9-NEXT: s_mov_b32 s39, 0xe00000
-; GFX9-NEXT: s_add_u32 s36, s36, s3
-; GFX9-NEXT: s_addc_u32 s37, s37, 0
-; GFX9-NEXT: s_mov_b64 s[6:7], s[0:1]
-; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37]
-; GFX9-NEXT: s_getpc_b64 s[4:5]
-; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_v4i32@rel32@lo+4
-; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_v4i32@rel32@hi+12
-; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39]
-; GFX9-NEXT: v_mov_b32_e32 v0, 1
-; GFX9-NEXT: v_mov_b32_e32 v1, 2
-; GFX9-NEXT: v_mov_b32_e32 v2, 3
-; GFX9-NEXT: v_mov_b32_e32 v3, 4
-; GFX9-NEXT: s_mov_b32 s32, 0
-; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5]
-; GFX9-NEXT: s_endpgm
+; SDAG-LABEL: test_call_external_void_func_v4i32_imm:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0
+; SDAG-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1
+; SDAG-NEXT: s_mov_b32 s38, -1
+; SDAG-NEXT: s_mov_b32 s39, 0xe00000
+; SDAG-NEXT: s_add_u32 s36, s36, s3
+; SDAG-NEXT: s_addc_u32 s37, s37, 0
+; SDAG-NEXT: s_mov_b64 s[6:7], s[0:1]
+; SDAG-NEXT: s_mov_b64 s[0:1], s[36:37]
+; SDAG-NEXT: s_getpc_b64 s[4:5]
+; SDAG-NEXT: s_add_u32 s4, s4, external_void_func_v4i32@rel32@lo+4
+; SDAG-NEXT: s_addc_u32 s5, s5, external_void_func_v4i32@rel32@hi+12
+; SDAG-NEXT: s_mov_b64 s[2:3], s[38:39]
+; SDAG-NEXT: v_mov_b32_e32 v0, 1
+; SDAG-NEXT: v_mov_b32_e32 v1, 2
+; SDAG-NEXT: v_mov_b32_e32 v2, 3
+; SDAG-NEXT: v_mov_b32_e32 v3, 4
+; SDAG-NEXT: s_mov_b32 s32, 0
+; SDAG-NEXT: s_swappc_b64 s[30:31], s[4:5]
+; SDAG-NEXT: s_endpgm
;
; GFX11-LABEL: test_call_external_void_func_v4i32_imm:
; GFX11: ; %bb.0:
@@ -3773,6 +4552,28 @@ define amdgpu_kernel void @test_call_external_void_func_v4i32_imm() #0 {
; HSA-NEXT: s_mov_b32 s32, 0
; HSA-NEXT: s_swappc_b64 s[30:31], s[8:9]
; HSA-NEXT: s_endpgm
+;
+; GISEL-LABEL: test_call_external_void_func_v4i32_imm:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0
+; GISEL-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1
+; GISEL-NEXT: s_mov_b32 s38, -1
+; GISEL-NEXT: s_mov_b32 s39, 0xe00000
+; GISEL-NEXT: s_add_u32 s36, s36, s3
+; GISEL-NEXT: s_addc_u32 s37, s37, 0
+; GISEL-NEXT: s_mov_b64 s[6:7], s[0:1]
+; GISEL-NEXT: s_mov_b64 s[0:1], s[36:37]
+; GISEL-NEXT: s_getpc_b64 s[4:5]
+; GISEL-NEXT: s_add_u32 s4, s4, external_void_func_v4i32@rel32@lo+4
+; GISEL-NEXT: s_addc_u32 s5, s5, external_void_func_v4i32@rel32@hi+12
+; GISEL-NEXT: v_mov_b32_e32 v0, 1
+; GISEL-NEXT: v_mov_b32_e32 v1, 2
+; GISEL-NEXT: v_mov_b32_e32 v2, 3
+; GISEL-NEXT: v_mov_b32_e32 v3, 4
+; GISEL-NEXT: s_mov_b64 s[2:3], s[38:39]
+; GISEL-NEXT: s_mov_b32 s32, 0
+; GISEL-NEXT: s_swappc_b64 s[30:31], s[4:5]
+; GISEL-NEXT: s_endpgm
call void @external_void_func_v4i32(<4 x i32> <i32 1, i32 2, i32 3, i32 4>)
ret void
}
@@ -3824,28 +4625,28 @@ define amdgpu_kernel void @test_call_external_void_func_v5i32_imm() #0 {
; CI-NEXT: s_swappc_b64 s[30:31], s[4:5]
; CI-NEXT: s_endpgm
;
-; GFX9-LABEL: test_call_external_void_func_v5i32_imm:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0
-; GFX9-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1
-; GFX9-NEXT: s_mov_b32 s38, -1
-; GFX9-NEXT: s_mov_b32 s39, 0xe00000
-; GFX9-NEXT: s_add_u32 s36, s36, s3
-; GFX9-NEXT: s_addc_u32 s37, s37, 0
-; GFX9-NEXT: s_mov_b64 s[6:7], s[0:1]
-; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37]
-; GFX9-NEXT: s_getpc_b64 s[4:5]
-; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_v5i32@rel32@lo+4
-; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_v5i32@rel32@hi+12
-; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39]
-; GFX9-NEXT: v_mov_b32_e32 v0, 1
-; GFX9-NEXT: v_mov_b32_e32 v1, 2
-; GFX9-NEXT: v_mov_b32_e32 v2, 3
-; GFX9-NEXT: v_mov_b32_e32 v3, 4
-; GFX9-NEXT: v_mov_b32_e32 v4, 5
-; GFX9-NEXT: s_mov_b32 s32, 0
-; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5]
-; GFX9-NEXT: s_endpgm
+; SDAG-LABEL: test_call_external_void_func_v5i32_imm:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0
+; SDAG-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1
+; SDAG-NEXT: s_mov_b32 s38, -1
+; SDAG-NEXT: s_mov_b32 s39, 0xe00000
+; SDAG-NEXT: s_add_u32 s36, s36, s3
+; SDAG-NEXT: s_addc_u32 s37, s37, 0
+; SDAG-NEXT: s_mov_b64 s[6:7], s[0:1]
+; SDAG-NEXT: s_mov_b64 s[0:1], s[36:37]
+; SDAG-NEXT: s_getpc_b64 s[4:5]
+; SDAG-NEXT: s_add_u32 s4, s4, external_void_func_v5i32@rel32@lo+4
+; SDAG-NEXT: s_addc_u32 s5, s5, external_void_func_v5i32@rel32@hi+12
+; SDAG-NEXT: s_mov_b64 s[2:3], s[38:39]
+; SDAG-NEXT: v_mov_b32_e32 v0, 1
+; SDAG-NEXT: v_mov_b32_e32 v1, 2
+; SDAG-NEXT: v_mov_b32_e32 v2, 3
+; SDAG-NEXT: v_mov_b32_e32 v3, 4
+; SDAG-NEXT: v_mov_b32_e32 v4, 5
+; SDAG-NEXT: s_mov_b32 s32, 0
+; SDAG-NEXT: s_swappc_b64 s[30:31], s[4:5]
+; SDAG-NEXT: s_endpgm
;
; GFX11-LABEL: test_call_external_void_func_v5i32_imm:
; GFX11: ; %bb.0:
@@ -3879,6 +4680,29 @@ define amdgpu_kernel void @test_call_external_void_func_v5i32_imm() #0 {
; HSA-NEXT: s_mov_b32 s32, 0
; HSA-NEXT: s_swappc_b64 s[30:31], s[8:9]
; HSA-NEXT: s_endpgm
+;
+; GISEL-LABEL: test_call_external_void_func_v5i32_imm:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0
+; GISEL-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1
+; GISEL-NEXT: s_mov_b32 s38, -1
+; GISEL-NEXT: s_mov_b32 s39, 0xe00000
+; GISEL-NEXT: s_add_u32 s36, s36, s3
+; GISEL-NEXT: s_addc_u32 s37, s37, 0
+; GISEL-NEXT: s_mov_b64 s[6:7], s[0:1]
+; GISEL-NEXT: s_mov_b64 s[0:1], s[36:37]
+; GISEL-NEXT: s_getpc_b64 s[4:5]
+; GISEL-NEXT: s_add_u32 s4, s4, external_void_func_v5i32@rel32@lo+4
+; GISEL-NEXT: s_addc_u32 s5, s5, external_void_func_v5i32@rel32@hi+12
+; GISEL-NEXT: v_mov_b32_e32 v0, 1
+; GISEL-NEXT: v_mov_b32_e32 v1, 2
+; GISEL-NEXT: v_mov_b32_e32 v2, 3
+; GISEL-NEXT: v_mov_b32_e32 v3, 4
+; GISEL-NEXT: v_mov_b32_e32 v4, 5
+; GISEL-NEXT: s_mov_b64 s[2:3], s[38:39]
+; GISEL-NEXT: s_mov_b32 s32, 0
+; GISEL-NEXT: s_swappc_b64 s[30:31], s[4:5]
+; GISEL-NEXT: s_endpgm
call void @external_void_func_v5i32(<5 x i32> <i32 1, i32 2, i32 3, i32 4, i32 5>)
ret void
}
@@ -3932,29 +4756,29 @@ define amdgpu_kernel void @test_call_external_void_func_v8i32() #0 {
; CI-NEXT: s_swappc_b64 s[30:31], s[4:5]
; CI-NEXT: s_endpgm
;
-; GFX9-LABEL: test_call_external_void_func_v8i32:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0
-; GFX9-NEXT: s_mov_b64 s[6:7], s[0:1]
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
-; GFX9-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1
-; GFX9-NEXT: s_mov_b32 s38, -1
-; GFX9-NEXT: s_mov_b32 s39, 0xe00000
-; GFX9-NEXT: s_add_u32 s36, s36, s3
-; GFX9-NEXT: s_mov_b32 s3, 0xf000
-; GFX9-NEXT: s_mov_b32 s2, -1
-; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: buffer_load_dwordx4 v[0:3], off, s[0:3], 0
-; GFX9-NEXT: buffer_load_dwordx4 v[4:7], off, s[0:3], 0 offset:16
-; GFX9-NEXT: s_addc_u32 s37, s37, 0
-; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37]
-; GFX9-NEXT: s_getpc_b64 s[4:5]
-; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_v8i32@rel32@lo+4
-; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_v8i32@rel32@hi+12
-; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39]
-; GFX9-NEXT: s_mov_b32 s32, 0
-; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5]
-; GFX9-NEXT: s_endpgm
+; SDAG-LABEL: test_call_external_void_func_v8i32:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0
+; SDAG-NEXT: s_mov_b64 s[6:7], s[0:1]
+; SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
+; SDAG-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1
+; SDAG-NEXT: s_mov_b32 s38, -1
+; SDAG-NEXT: s_mov_b32 s39, 0xe00000
+; SDAG-NEXT: s_add_u32 s36, s36, s3
+; SDAG-NEXT: s_mov_b32 s3, 0xf000
+; SDAG-NEXT: s_mov_b32 s2, -1
+; SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; SDAG-NEXT: buffer_load_dwordx4 v[0:3], off, s[0:3], 0
+; SDAG-NEXT: buffer_load_dwordx4 v[4:7], off, s[0:3], 0 offset:16
+; SDAG-NEXT: s_addc_u32 s37, s37, 0
+; SDAG-NEXT: s_mov_b64 s[0:1], s[36:37]
+; SDAG-NEXT: s_getpc_b64 s[4:5]
+; SDAG-NEXT: s_add_u32 s4, s4, external_void_func_v8i32@rel32@lo+4
+; SDAG-NEXT: s_addc_u32 s5, s5, external_void_func_v8i32@rel32@hi+12
+; SDAG-NEXT: s_mov_b64 s[2:3], s[38:39]
+; SDAG-NEXT: s_mov_b32 s32, 0
+; SDAG-NEXT: s_swappc_b64 s[30:31], s[4:5]
+; SDAG-NEXT: s_endpgm
;
; GFX11-LABEL: test_call_external_void_func_v8i32:
; GFX11: ; %bb.0:
@@ -3993,6 +4817,36 @@ define amdgpu_kernel void @test_call_external_void_func_v8i32() #0 {
; HSA-NEXT: s_mov_b32 s32, 0
; HSA-NEXT: s_swappc_b64 s[30:31], s[8:9]
; HSA-NEXT: s_endpgm
+;
+; GISEL-LABEL: test_call_external_void_func_v8i32:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_mov_b64 s[6:7], s[0:1]
+; GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GISEL-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0
+; GISEL-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1
+; GISEL-NEXT: s_mov_b32 s38, -1
+; GISEL-NEXT: s_mov_b32 s39, 0xe00000
+; GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GISEL-NEXT: s_load_dwordx8 s[8:15], s[0:1], 0x0
+; GISEL-NEXT: s_add_u32 s36, s36, s3
+; GISEL-NEXT: s_addc_u32 s37, s37, 0
+; GISEL-NEXT: s_mov_b64 s[0:1], s[36:37]
+; GISEL-NEXT: s_getpc_b64 s[4:5]
+; GISEL-NEXT: s_add_u32 s4, s4, external_void_func_v8i32@rel32@lo+4
+; GISEL-NEXT: s_addc_u32 s5, s5, external_void_func_v8i32@rel32@hi+12
+; GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GISEL-NEXT: v_mov_b32_e32 v0, s8
+; GISEL-NEXT: v_mov_b32_e32 v1, s9
+; GISEL-NEXT: v_mov_b32_e32 v2, s10
+; GISEL-NEXT: v_mov_b32_e32 v3, s11
+; GISEL-NEXT: v_mov_b32_e32 v4, s12
+; GISEL-NEXT: v_mov_b32_e32 v5, s13
+; GISEL-NEXT: v_mov_b32_e32 v6, s14
+; GISEL-NEXT: v_mov_b32_e32 v7, s15
+; GISEL-NEXT: s_mov_b64 s[2:3], s[38:39]
+; GISEL-NEXT: s_mov_b32 s32, 0
+; GISEL-NEXT: s_swappc_b64 s[30:31], s[4:5]
+; GISEL-NEXT: s_endpgm
%ptr = load ptr addrspace(1), ptr addrspace(4) poison
%val = load <8 x i32>, ptr addrspace(1) %ptr
call void @external_void_func_v8i32(<8 x i32> %val)
@@ -4052,31 +4906,31 @@ define amdgpu_kernel void @test_call_external_void_func_v8i32_imm() #0 {
; CI-NEXT: s_swappc_b64 s[30:31], s[4:5]
; CI-NEXT: s_endpgm
;
-; GFX9-LABEL: test_call_external_void_func_v8i32_imm:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0
-; GFX9-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1
-; GFX9-NEXT: s_mov_b32 s38, -1
-; GFX9-NEXT: s_mov_b32 s39, 0xe00000
-; GFX9-NEXT: s_add_u32 s36, s36, s3
-; GFX9-NEXT: s_addc_u32 s37, s37, 0
-; GFX9-NEXT: s_mov_b64 s[6:7], s[0:1]
-; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37]
-; GFX9-NEXT: s_getpc_b64 s[4:5]
-; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_v8i32@rel32@lo+4
-; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_v8i32@rel32@hi+12
-; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39]
-; GFX9-NEXT: v_mov_b32_e32 v0, 1
-; GFX9-NEXT: v_mov_b32_e32 v1, 2
-; GFX9-NEXT: v_mov_b32_e32 v2, 3
-; GFX9-NEXT: v_mov_b32_e32 v3, 4
-; GFX9-NEXT: v_mov_b32_e32 v4, 5
-; GFX9-NEXT: v_mov_b32_e32 v5, 6
-; GFX9-NEXT: v_mov_b32_e32 v6, 7
-; GFX9-NEXT: v_mov_b32_e32 v7, 8
-; GFX9-NEXT: s_mov_b32 s32, 0
-; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5]
-; GFX9-NEXT: s_endpgm
+; SDAG-LABEL: test_call_external_void_func_v8i32_imm:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0
+; SDAG-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1
+; SDAG-NEXT: s_mov_b32 s38, -1
+; SDAG-NEXT: s_mov_b32 s39, 0xe00000
+; SDAG-NEXT: s_add_u32 s36, s36, s3
+; SDAG-NEXT: s_addc_u32 s37, s37, 0
+; SDAG-NEXT: s_mov_b64 s[6:7], s[0:1]
+; SDAG-NEXT: s_mov_b64 s[0:1], s[36:37]
+; SDAG-NEXT: s_getpc_b64 s[4:5]
+; SDAG-NEXT: s_add_u32 s4, s4, external_void_func_v8i32@rel32@lo+4
+; SDAG-NEXT: s_addc_u32 s5, s5, external_void_func_v8i32@rel32@hi+12
+; SDAG-NEXT: s_mov_b64 s[2:3], s[38:39]
+; SDAG-NEXT: v_mov_b32_e32 v0, 1
+; SDAG-NEXT: v_mov_b32_e32 v1, 2
+; SDAG-NEXT: v_mov_b32_e32 v2, 3
+; SDAG-NEXT: v_mov_b32_e32 v3, 4
+; SDAG-NEXT: v_mov_b32_e32 v4, 5
+; SDAG-NEXT: v_mov_b32_e32 v5, 6
+; SDAG-NEXT: v_mov_b32_e32 v6, 7
+; SDAG-NEXT: v_mov_b32_e32 v7, 8
+; SDAG-NEXT: s_mov_b32 s32, 0
+; SDAG-NEXT: s_swappc_b64 s[30:31], s[4:5]
+; SDAG-NEXT: s_endpgm
;
; GFX11-LABEL: test_call_external_void_func_v8i32_imm:
; GFX11: ; %bb.0:
@@ -4114,6 +4968,32 @@ define amdgpu_kernel void @test_call_external_void_func_v8i32_imm() #0 {
; HSA-NEXT: s_mov_b32 s32, 0
; HSA-NEXT: s_swappc_b64 s[30:31], s[8:9]
; HSA-NEXT: s_endpgm
+;
+; GISEL-LABEL: test_call_external_void_func_v8i32_imm:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0
+; GISEL-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1
+; GISEL-NEXT: s_mov_b32 s38, -1
+; GISEL-NEXT: s_mov_b32 s39, 0xe00000
+; GISEL-NEXT: s_add_u32 s36, s36, s3
+; GISEL-NEXT: s_addc_u32 s37, s37, 0
+; GISEL-NEXT: s_mov_b64 s[6:7], s[0:1]
+; GISEL-NEXT: s_mov_b64 s[0:1], s[36:37]
+; GISEL-NEXT: s_getpc_b64 s[4:5]
+; GISEL-NEXT: s_add_u32 s4, s4, external_void_func_v8i32@rel32@lo+4
+; GISEL-NEXT: s_addc_u32 s5, s5, external_void_func_v8i32@rel32@hi+12
+; GISEL-NEXT: v_mov_b32_e32 v0, 1
+; GISEL-NEXT: v_mov_b32_e32 v1, 2
+; GISEL-NEXT: v_mov_b32_e32 v2, 3
+; GISEL-NEXT: v_mov_b32_e32 v3, 4
+; GISEL-NEXT: v_mov_b32_e32 v4, 5
+; GISEL-NEXT: v_mov_b32_e32 v5, 6
+; GISEL-NEXT: v_mov_b32_e32 v6, 7
+; GISEL-NEXT: v_mov_b32_e32 v7, 8
+; GISEL-NEXT: s_mov_b64 s[2:3], s[38:39]
+; GISEL-NEXT: s_mov_b32 s32, 0
+; GISEL-NEXT: s_swappc_b64 s[30:31], s[4:5]
+; GISEL-NEXT: s_endpgm
call void @external_void_func_v8i32(<8 x i32> <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8>)
ret void
}
@@ -4171,31 +5051,31 @@ define amdgpu_kernel void @test_call_external_void_func_v16i32() #0 {
; CI-NEXT: s_swappc_b64 s[30:31], s[4:5]
; CI-NEXT: s_endpgm
;
-; GFX9-LABEL: test_call_external_void_func_v16i32:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0
-; GFX9-NEXT: s_mov_b64 s[6:7], s[0:1]
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
-; GFX9-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1
-; GFX9-NEXT: s_mov_b32 s38, -1
-; GFX9-NEXT: s_mov_b32 s39, 0xe00000
-; GFX9-NEXT: s_add_u32 s36, s36, s3
-; GFX9-NEXT: s_mov_b32 s3, 0xf000
-; GFX9-NEXT: s_mov_b32 s2, -1
-; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: buffer_load_dwordx4 v[0:3], off, s[0:3], 0
-; GFX9-NEXT: buffer_load_dwordx4 v[4:7], off, s[0:3], 0 offset:16
-; GFX9-NEXT: buffer_load_dwordx4 v[8:11], off, s[0:3], 0 offset:32
-; GFX9-NEXT: buffer_load_dwordx4 v[12:15], off, s[0:3], 0 offset:48
-; GFX9-NEXT: s_addc_u32 s37, s37, 0
-; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37]
-; GFX9-NEXT: s_getpc_b64 s[4:5]
-; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_v16i32@rel32@lo+4
-; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_v16i32@rel32@hi+12
-; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39]
-; GFX9-NEXT: s_mov_b32 s32, 0
-; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5]
-; GFX9-NEXT: s_endpgm
+; SDAG-LABEL: test_call_external_void_func_v16i32:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0
+; SDAG-NEXT: s_mov_b64 s[6:7], s[0:1]
+; SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
+; SDAG-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1
+; SDAG-NEXT: s_mov_b32 s38, -1
+; SDAG-NEXT: s_mov_b32 s39, 0xe00000
+; SDAG-NEXT: s_add_u32 s36, s36, s3
+; SDAG-NEXT: s_mov_b32 s3, 0xf000
+; SDAG-NEXT: s_mov_b32 s2, -1
+; SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; SDAG-NEXT: buffer_load_dwordx4 v[0:3], off, s[0:3], 0
+; SDAG-NEXT: buffer_load_dwordx4 v[4:7], off, s[0:3], 0 offset:16
+; SDAG-NEXT: buffer_load_dwordx4 v[8:11], off, s[0:3], 0 offset:32
+; SDAG-NEXT: buffer_load_dwordx4 v[12:15], off, s[0:3], 0 offset:48
+; SDAG-NEXT: s_addc_u32 s37, s37, 0
+; SDAG-NEXT: s_mov_b64 s[0:1], s[36:37]
+; SDAG-NEXT: s_getpc_b64 s[4:5]
+; SDAG-NEXT: s_add_u32 s4, s4, external_void_func_v16i32@rel32@lo+4
+; SDAG-NEXT: s_addc_u32 s5, s5, external_void_func_v16i32@rel32@hi+12
+; SDAG-NEXT: s_mov_b64 s[2:3], s[38:39]
+; SDAG-NEXT: s_mov_b32 s32, 0
+; SDAG-NEXT: s_swappc_b64 s[30:31], s[4:5]
+; SDAG-NEXT: s_endpgm
;
; GFX11-LABEL: test_call_external_void_func_v16i32:
; GFX11: ; %bb.0:
@@ -4238,6 +5118,44 @@ define amdgpu_kernel void @test_call_external_void_func_v16i32() #0 {
; HSA-NEXT: s_mov_b32 s32, 0
; HSA-NEXT: s_swappc_b64 s[30:31], s[8:9]
; HSA-NEXT: s_endpgm
+;
+; GISEL-LABEL: test_call_external_void_func_v16i32:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_mov_b64 s[6:7], s[0:1]
+; GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GISEL-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0
+; GISEL-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1
+; GISEL-NEXT: s_mov_b32 s38, -1
+; GISEL-NEXT: s_mov_b32 s39, 0xe00000
+; GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GISEL-NEXT: s_load_dwordx16 s[8:23], s[0:1], 0x0
+; GISEL-NEXT: s_add_u32 s36, s36, s3
+; GISEL-NEXT: s_addc_u32 s37, s37, 0
+; GISEL-NEXT: s_mov_b64 s[0:1], s[36:37]
+; GISEL-NEXT: s_getpc_b64 s[4:5]
+; GISEL-NEXT: s_add_u32 s4, s4, external_void_func_v16i32@rel32@lo+4
+; GISEL-NEXT: s_addc_u32 s5, s5, external_void_func_v16i32@rel32@hi+12
+; GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GISEL-NEXT: v_mov_b32_e32 v0, s8
+; GISEL-NEXT: v_mov_b32_e32 v1, s9
+; GISEL-NEXT: v_mov_b32_e32 v2, s10
+; GISEL-NEXT: v_mov_b32_e32 v3, s11
+; GISEL-NEXT: v_mov_b32_e32 v4, s12
+; GISEL-NEXT: v_mov_b32_e32 v5, s13
+; GISEL-NEXT: v_mov_b32_e32 v6, s14
+; GISEL-NEXT: v_mov_b32_e32 v7, s15
+; GISEL-NEXT: v_mov_b32_e32 v8, s16
+; GISEL-NEXT: v_mov_b32_e32 v9, s17
+; GISEL-NEXT: v_mov_b32_e32 v10, s18
+; GISEL-NEXT: v_mov_b32_e32 v11, s19
+; GISEL-NEXT: v_mov_b32_e32 v12, s20
+; GISEL-NEXT: v_mov_b32_e32 v13, s21
+; GISEL-NEXT: v_mov_b32_e32 v14, s22
+; GISEL-NEXT: v_mov_b32_e32 v15, s23
+; GISEL-NEXT: s_mov_b64 s[2:3], s[38:39]
+; GISEL-NEXT: s_mov_b32 s32, 0
+; GISEL-NEXT: s_swappc_b64 s[30:31], s[4:5]
+; GISEL-NEXT: s_endpgm
%ptr = load ptr addrspace(1), ptr addrspace(4) poison
%val = load <16 x i32>, ptr addrspace(1) %ptr
call void @external_void_func_v16i32(<16 x i32> %val)
@@ -4309,37 +5227,37 @@ define amdgpu_kernel void @test_call_external_void_func_v32i32() #0 {
; CI-NEXT: s_swappc_b64 s[30:31], s[8:9]
; CI-NEXT: s_endpgm
;
-; GFX9-LABEL: test_call_external_void_func_v32i32:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
-; GFX9-NEXT: s_mov_b32 s7, 0xf000
-; GFX9-NEXT: s_mov_b32 s6, -1
-; GFX9-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0
-; GFX9-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1
-; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: buffer_load_dwordx4 v[24:27], off, s[4:7], 0 offset:96
-; GFX9-NEXT: buffer_load_dwordx4 v[28:31], off, s[4:7], 0 offset:112
-; GFX9-NEXT: buffer_load_dwordx4 v[0:3], off, s[4:7], 0
-; GFX9-NEXT: buffer_load_dwordx4 v[4:7], off, s[4:7], 0 offset:16
-; GFX9-NEXT: buffer_load_dwordx4 v[8:11], off, s[4:7], 0 offset:32
-; GFX9-NEXT: buffer_load_dwordx4 v[12:15], off, s[4:7], 0 offset:48
-; GFX9-NEXT: buffer_load_dwordx4 v[16:19], off, s[4:7], 0 offset:64
-; GFX9-NEXT: buffer_load_dwordx4 v[20:23], off, s[4:7], 0 offset:80
-; GFX9-NEXT: s_mov_b32 s38, -1
-; GFX9-NEXT: s_mov_b32 s39, 0xe00000
-; GFX9-NEXT: s_add_u32 s36, s36, s3
-; GFX9-NEXT: s_addc_u32 s37, s37, 0
-; GFX9-NEXT: s_mov_b64 s[6:7], s[0:1]
-; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37]
-; GFX9-NEXT: s_mov_b32 s32, 0
-; GFX9-NEXT: s_getpc_b64 s[8:9]
-; GFX9-NEXT: s_add_u32 s8, s8, external_void_func_v32i32@rel32@lo+4
-; GFX9-NEXT: s_addc_u32 s9, s9, external_void_func_v32i32@rel32@hi+12
-; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39]
-; GFX9-NEXT: s_waitcnt vmcnt(6)
-; GFX9-NEXT: buffer_store_dword v31, off, s[36:39], s32
-; GFX9-NEXT: s_swappc_b64 s[30:31], s[8:9]
-; GFX9-NEXT: s_endpgm
+; SDAG-LABEL: test_call_external_void_func_v32i32:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; SDAG-NEXT: s_mov_b32 s7, 0xf000
+; SDAG-NEXT: s_mov_b32 s6, -1
+; SDAG-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0
+; SDAG-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1
+; SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; SDAG-NEXT: buffer_load_dwordx4 v[24:27], off, s[4:7], 0 offset:96
+; SDAG-NEXT: buffer_load_dwordx4 v[28:31], off, s[4:7], 0 offset:112
+; SDAG-NEXT: buffer_load_dwordx4 v[0:3], off, s[4:7], 0
+; SDAG-NEXT: buffer_load_dwordx4 v[4:7], off, s[4:7], 0 offset:16
+; SDAG-NEXT: buffer_load_dwordx4 v[8:11], off, s[4:7], 0 offset:32
+; SDAG-NEXT: buffer_load_dwordx4 v[12:15], off, s[4:7], 0 offset:48
+; SDAG-NEXT: buffer_load_dwordx4 v[16:19], off, s[4:7], 0 offset:64
+; SDAG-NEXT: buffer_load_dwordx4 v[20:23], off, s[4:7], 0 offset:80
+; SDAG-NEXT: s_mov_b32 s38, -1
+; SDAG-NEXT: s_mov_b32 s39, 0xe00000
+; SDAG-NEXT: s_add_u32 s36, s36, s3
+; SDAG-NEXT: s_addc_u32 s37, s37, 0
+; SDAG-NEXT: s_mov_b64 s[6:7], s[0:1]
+; SDAG-NEXT: s_mov_b64 s[0:1], s[36:37]
+; SDAG-NEXT: s_mov_b32 s32, 0
+; SDAG-NEXT: s_getpc_b64 s[8:9]
+; SDAG-NEXT: s_add_u32 s8, s8, external_void_func_v32i32@rel32@lo+4
+; SDAG-NEXT: s_addc_u32 s9, s9, external_void_func_v32i32@rel32@hi+12
+; SDAG-NEXT: s_mov_b64 s[2:3], s[38:39]
+; SDAG-NEXT: s_waitcnt vmcnt(6)
+; SDAG-NEXT: buffer_store_dword v31, off, s[36:39], s32
+; SDAG-NEXT: s_swappc_b64 s[30:31], s[8:9]
+; SDAG-NEXT: s_endpgm
;
; GFX11-LABEL: test_call_external_void_func_v32i32:
; GFX11: ; %bb.0:
@@ -4394,6 +5312,62 @@ define amdgpu_kernel void @test_call_external_void_func_v32i32() #0 {
; HSA-NEXT: buffer_store_dword v31, off, s[0:3], s32
; HSA-NEXT: s_swappc_b64 s[30:31], s[12:13]
; HSA-NEXT: s_endpgm
+;
+; GISEL-LABEL: test_call_external_void_func_v32i32:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_mov_b64 s[6:7], s[0:1]
+; GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GISEL-NEXT: s_mov_b32 s52, SCRATCH_RSRC_DWORD0
+; GISEL-NEXT: s_mov_b32 s53, SCRATCH_RSRC_DWORD1
+; GISEL-NEXT: s_mov_b32 s54, -1
+; GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GISEL-NEXT: s_load_dwordx16 s[8:23], s[0:1], 0x40
+; GISEL-NEXT: s_load_dwordx16 s[36:51], s[0:1], 0x0
+; GISEL-NEXT: s_mov_b32 s55, 0xe00000
+; GISEL-NEXT: s_add_u32 s52, s52, s3
+; GISEL-NEXT: s_addc_u32 s53, s53, 0
+; GISEL-NEXT: s_mov_b32 s32, 0
+; GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GISEL-NEXT: v_mov_b32_e32 v0, s23
+; GISEL-NEXT: s_mov_b64 s[0:1], s[52:53]
+; GISEL-NEXT: s_getpc_b64 s[4:5]
+; GISEL-NEXT: s_add_u32 s4, s4, external_void_func_v32i32@rel32@lo+4
+; GISEL-NEXT: s_addc_u32 s5, s5, external_void_func_v32i32@rel32@hi+12
+; GISEL-NEXT: buffer_store_dword v0, off, s[52:55], s32
+; GISEL-NEXT: v_mov_b32_e32 v0, s36
+; GISEL-NEXT: v_mov_b32_e32 v1, s37
+; GISEL-NEXT: v_mov_b32_e32 v2, s38
+; GISEL-NEXT: v_mov_b32_e32 v3, s39
+; GISEL-NEXT: v_mov_b32_e32 v4, s40
+; GISEL-NEXT: v_mov_b32_e32 v5, s41
+; GISEL-NEXT: v_mov_b32_e32 v6, s42
+; GISEL-NEXT: v_mov_b32_e32 v7, s43
+; GISEL-NEXT: v_mov_b32_e32 v8, s44
+; GISEL-NEXT: v_mov_b32_e32 v9, s45
+; GISEL-NEXT: v_mov_b32_e32 v10, s46
+; GISEL-NEXT: v_mov_b32_e32 v11, s47
+; GISEL-NEXT: v_mov_b32_e32 v12, s48
+; GISEL-NEXT: v_mov_b32_e32 v13, s49
+; GISEL-NEXT: v_mov_b32_e32 v14, s50
+; GISEL-NEXT: v_mov_b32_e32 v15, s51
+; GISEL-NEXT: v_mov_b32_e32 v16, s8
+; GISEL-NEXT: v_mov_b32_e32 v17, s9
+; GISEL-NEXT: v_mov_b32_e32 v18, s10
+; GISEL-NEXT: v_mov_b32_e32 v19, s11
+; GISEL-NEXT: s_mov_b64 s[2:3], s[54:55]
+; GISEL-NEXT: v_mov_b32_e32 v20, s12
+; GISEL-NEXT: v_mov_b32_e32 v21, s13
+; GISEL-NEXT: v_mov_b32_e32 v22, s14
+; GISEL-NEXT: v_mov_b32_e32 v23, s15
+; GISEL-NEXT: v_mov_b32_e32 v24, s16
+; GISEL-NEXT: v_mov_b32_e32 v25, s17
+; GISEL-NEXT: v_mov_b32_e32 v26, s18
+; GISEL-NEXT: v_mov_b32_e32 v27, s19
+; GISEL-NEXT: v_mov_b32_e32 v28, s20
+; GISEL-NEXT: v_mov_b32_e32 v29, s21
+; GISEL-NEXT: v_mov_b32_e32 v30, s22
+; GISEL-NEXT: s_swappc_b64 s[30:31], s[4:5]
+; GISEL-NEXT: s_endpgm
%ptr = load ptr addrspace(1), ptr addrspace(4) poison
%val = load <32 x i32>, ptr addrspace(1) %ptr
call void @external_void_func_v32i32(<32 x i32> %val)
@@ -4471,40 +5445,40 @@ define amdgpu_kernel void @test_call_external_void_func_v32i32_i32(i32) #0 {
; CI-NEXT: s_swappc_b64 s[30:31], s[4:5]
; CI-NEXT: s_endpgm
;
-; GFX9-LABEL: test_call_external_void_func_v32i32_i32:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0
-; GFX9-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1
-; GFX9-NEXT: s_mov_b32 s38, -1
-; GFX9-NEXT: s_mov_b32 s39, 0xe00000
-; GFX9-NEXT: s_add_u32 s36, s36, s5
-; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
-; GFX9-NEXT: s_mov_b32 s7, 0xf000
-; GFX9-NEXT: s_mov_b32 s6, -1
-; GFX9-NEXT: s_addc_u32 s37, s37, 0
-; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: buffer_load_dword v32, off, s[4:7], 0
-; GFX9-NEXT: buffer_load_dwordx4 v[28:31], off, s[4:7], 0 offset:112
-; GFX9-NEXT: buffer_load_dwordx4 v[0:3], off, s[4:7], 0
-; GFX9-NEXT: buffer_load_dwordx4 v[4:7], off, s[4:7], 0 offset:16
-; GFX9-NEXT: buffer_load_dwordx4 v[8:11], off, s[4:7], 0 offset:32
-; GFX9-NEXT: buffer_load_dwordx4 v[12:15], off, s[4:7], 0 offset:48
-; GFX9-NEXT: buffer_load_dwordx4 v[16:19], off, s[4:7], 0 offset:64
-; GFX9-NEXT: buffer_load_dwordx4 v[20:23], off, s[4:7], 0 offset:80
-; GFX9-NEXT: buffer_load_dwordx4 v[24:27], off, s[4:7], 0 offset:96
-; GFX9-NEXT: s_mov_b64 s[6:7], s[0:1]
-; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37]
-; GFX9-NEXT: s_mov_b32 s32, 0
-; GFX9-NEXT: s_getpc_b64 s[4:5]
-; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_v32i32_i32@rel32@lo+4
-; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_v32i32_i32@rel32@hi+12
-; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39]
-; GFX9-NEXT: s_waitcnt vmcnt(8)
-; GFX9-NEXT: buffer_store_dword v32, off, s[36:39], s32 offset:4
-; GFX9-NEXT: s_waitcnt vmcnt(8)
-; GFX9-NEXT: buffer_store_dword v31, off, s[36:39], s32
-; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5]
-; GFX9-NEXT: s_endpgm
+; SDAG-LABEL: test_call_external_void_func_v32i32_i32:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0
+; SDAG-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1
+; SDAG-NEXT: s_mov_b32 s38, -1
+; SDAG-NEXT: s_mov_b32 s39, 0xe00000
+; SDAG-NEXT: s_add_u32 s36, s36, s5
+; SDAG-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; SDAG-NEXT: s_mov_b32 s7, 0xf000
+; SDAG-NEXT: s_mov_b32 s6, -1
+; SDAG-NEXT: s_addc_u32 s37, s37, 0
+; SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; SDAG-NEXT: buffer_load_dword v32, off, s[4:7], 0
+; SDAG-NEXT: buffer_load_dwordx4 v[28:31], off, s[4:7], 0 offset:112
+; SDAG-NEXT: buffer_load_dwordx4 v[0:3], off, s[4:7], 0
+; SDAG-NEXT: buffer_load_dwordx4 v[4:7], off, s[4:7], 0 offset:16
+; SDAG-NEXT: buffer_load_dwordx4 v[8:11], off, s[4:7], 0 offset:32
+; SDAG-NEXT: buffer_load_dwordx4 v[12:15], off, s[4:7], 0 offset:48
+; SDAG-NEXT: buffer_load_dwordx4 v[16:19], off, s[4:7], 0 offset:64
+; SDAG-NEXT: buffer_load_dwordx4 v[20:23], off, s[4:7], 0 offset:80
+; SDAG-NEXT: buffer_load_dwordx4 v[24:27], off, s[4:7], 0 offset:96
+; SDAG-NEXT: s_mov_b64 s[6:7], s[0:1]
+; SDAG-NEXT: s_mov_b64 s[0:1], s[36:37]
+; SDAG-NEXT: s_mov_b32 s32, 0
+; SDAG-NEXT: s_getpc_b64 s[4:5]
+; SDAG-NEXT: s_add_u32 s4, s4, external_void_func_v32i32_i32@rel32@lo+4
+; SDAG-NEXT: s_addc_u32 s5, s5, external_void_func_v32i32_i32@rel32@hi+12
+; SDAG-NEXT: s_mov_b64 s[2:3], s[38:39]
+; SDAG-NEXT: s_waitcnt vmcnt(8)
+; SDAG-NEXT: buffer_store_dword v32, off, s[36:39], s32 offset:4
+; SDAG-NEXT: s_waitcnt vmcnt(8)
+; SDAG-NEXT: buffer_store_dword v31, off, s[36:39], s32
+; SDAG-NEXT: s_swappc_b64 s[30:31], s[4:5]
+; SDAG-NEXT: s_endpgm
;
; GFX11-LABEL: test_call_external_void_func_v32i32_i32:
; GFX11: ; %bb.0:
@@ -4566,6 +5540,67 @@ define amdgpu_kernel void @test_call_external_void_func_v32i32_i32(i32) #0 {
; HSA-NEXT: buffer_store_dword v31, off, s[0:3], s32
; HSA-NEXT: s_swappc_b64 s[30:31], s[8:9]
; HSA-NEXT: s_endpgm
+;
+; GISEL-LABEL: test_call_external_void_func_v32i32_i32:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_mov_b64 s[6:7], s[0:1]
+; GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GISEL-NEXT: s_mov_b32 s52, SCRATCH_RSRC_DWORD0
+; GISEL-NEXT: s_mov_b32 s53, SCRATCH_RSRC_DWORD1
+; GISEL-NEXT: s_mov_b32 s54, -1
+; GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GISEL-NEXT: s_load_dwordx16 s[8:23], s[0:1], 0x40
+; GISEL-NEXT: s_load_dwordx16 s[36:51], s[0:1], 0x0
+; GISEL-NEXT: s_load_dword s2, s[0:1], 0x0
+; GISEL-NEXT: s_mov_b32 s55, 0xe00000
+; GISEL-NEXT: s_add_u32 s52, s52, s5
+; GISEL-NEXT: s_mov_b32 s32, 0
+; GISEL-NEXT: s_addc_u32 s53, s53, 0
+; GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GISEL-NEXT: v_mov_b32_e32 v0, s2
+; GISEL-NEXT: ; kill: killed $sgpr0_sgpr1
+; GISEL-NEXT: ; kill: killed $sgpr0_sgpr1
+; GISEL-NEXT: buffer_store_dword v0, off, s[52:55], s32 offset:4
+; GISEL-NEXT: v_mov_b32_e32 v0, s23
+; GISEL-NEXT: s_mov_b64 s[0:1], s[52:53]
+; GISEL-NEXT: s_getpc_b64 s[4:5]
+; GISEL-NEXT: s_add_u32 s4, s4, external_void_func_v32i32_i32@rel32@lo+4
+; GISEL-NEXT: s_addc_u32 s5, s5, external_void_func_v32i32_i32@rel32@hi+12
+; GISEL-NEXT: buffer_store_dword v0, off, s[52:55], s32
+; GISEL-NEXT: v_mov_b32_e32 v0, s36
+; GISEL-NEXT: v_mov_b32_e32 v1, s37
+; GISEL-NEXT: v_mov_b32_e32 v2, s38
+; GISEL-NEXT: v_mov_b32_e32 v3, s39
+; GISEL-NEXT: v_mov_b32_e32 v4, s40
+; GISEL-NEXT: v_mov_b32_e32 v5, s41
+; GISEL-NEXT: v_mov_b32_e32 v6, s42
+; GISEL-NEXT: v_mov_b32_e32 v7, s43
+; GISEL-NEXT: v_mov_b32_e32 v8, s44
+; GISEL-NEXT: v_mov_b32_e32 v9, s45
+; GISEL-NEXT: v_mov_b32_e32 v10, s46
+; GISEL-NEXT: v_mov_b32_e32 v11, s47
+; GISEL-NEXT: v_mov_b32_e32 v12, s48
+; GISEL-NEXT: v_mov_b32_e32 v13, s49
+; GISEL-NEXT: v_mov_b32_e32 v14, s50
+; GISEL-NEXT: v_mov_b32_e32 v15, s51
+; GISEL-NEXT: v_mov_b32_e32 v16, s8
+; GISEL-NEXT: v_mov_b32_e32 v17, s9
+; GISEL-NEXT: v_mov_b32_e32 v18, s10
+; GISEL-NEXT: v_mov_b32_e32 v19, s11
+; GISEL-NEXT: s_mov_b64 s[2:3], s[54:55]
+; GISEL-NEXT: v_mov_b32_e32 v20, s12
+; GISEL-NEXT: v_mov_b32_e32 v21, s13
+; GISEL-NEXT: v_mov_b32_e32 v22, s14
+; GISEL-NEXT: v_mov_b32_e32 v23, s15
+; GISEL-NEXT: v_mov_b32_e32 v24, s16
+; GISEL-NEXT: v_mov_b32_e32 v25, s17
+; GISEL-NEXT: v_mov_b32_e32 v26, s18
+; GISEL-NEXT: v_mov_b32_e32 v27, s19
+; GISEL-NEXT: v_mov_b32_e32 v28, s20
+; GISEL-NEXT: v_mov_b32_e32 v29, s21
+; GISEL-NEXT: v_mov_b32_e32 v30, s22
+; GISEL-NEXT: s_swappc_b64 s[30:31], s[4:5]
+; GISEL-NEXT: s_endpgm
%ptr0 = load ptr addrspace(1), ptr addrspace(4) poison
%val0 = load <32 x i32>, ptr addrspace(1) %ptr0
%val1 = load i32, ptr addrspace(1) poison
@@ -4622,29 +5657,29 @@ define amdgpu_kernel void @test_call_external_i32_func_i32_imm(ptr addrspace(1)
; CI-NEXT: s_waitcnt vmcnt(0)
; CI-NEXT: s_endpgm
;
-; GFX9-LABEL: test_call_external_i32_func_i32_imm:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0
-; GFX9-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1
-; GFX9-NEXT: s_mov_b32 s50, -1
-; GFX9-NEXT: s_mov_b32 s51, 0xe00000
-; GFX9-NEXT: s_add_u32 s48, s48, s5
-; GFX9-NEXT: s_load_dwordx2 s[36:37], s[2:3], 0x24
-; GFX9-NEXT: s_addc_u32 s49, s49, 0
-; GFX9-NEXT: s_mov_b64 s[6:7], s[0:1]
-; GFX9-NEXT: s_mov_b64 s[0:1], s[48:49]
-; GFX9-NEXT: s_getpc_b64 s[4:5]
-; GFX9-NEXT: s_add_u32 s4, s4, external_i32_func_i32@rel32@lo+4
-; GFX9-NEXT: s_addc_u32 s5, s5, external_i32_func_i32@rel32@hi+12
-; GFX9-NEXT: s_mov_b64 s[2:3], s[50:51]
-; GFX9-NEXT: v_mov_b32_e32 v0, 42
-; GFX9-NEXT: s_mov_b32 s32, 0
-; GFX9-NEXT: s_mov_b32 s39, 0xf000
-; GFX9-NEXT: s_mov_b32 s38, -1
-; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5]
-; GFX9-NEXT: buffer_store_dword v0, off, s[36:39], 0
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: s_endpgm
+; SDAG-LABEL: test_call_external_i32_func_i32_imm:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0
+; SDAG-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1
+; SDAG-NEXT: s_mov_b32 s50, -1
+; SDAG-NEXT: s_mov_b32 s51, 0xe00000
+; SDAG-NEXT: s_add_u32 s48, s48, s5
+; SDAG-NEXT: s_load_dwordx2 s[36:37], s[2:3], 0x24
+; SDAG-NEXT: s_addc_u32 s49, s49, 0
+; SDAG-NEXT: s_mov_b64 s[6:7], s[0:1]
+; SDAG-NEXT: s_mov_b64 s[0:1], s[48:49]
+; SDAG-NEXT: s_getpc_b64 s[4:5]
+; SDAG-NEXT: s_add_u32 s4, s4, external_i32_func_i32@rel32@lo+4
+; SDAG-NEXT: s_addc_u32 s5, s5, external_i32_func_i32@rel32@hi+12
+; SDAG-NEXT: s_mov_b64 s[2:3], s[50:51]
+; SDAG-NEXT: v_mov_b32_e32 v0, 42
+; SDAG-NEXT: s_mov_b32 s32, 0
+; SDAG-NEXT: s_mov_b32 s39, 0xf000
+; SDAG-NEXT: s_mov_b32 s38, -1
+; SDAG-NEXT: s_swappc_b64 s[30:31], s[4:5]
+; SDAG-NEXT: buffer_store_dword v0, off, s[36:39], 0
+; SDAG-NEXT: s_waitcnt vmcnt(0)
+; SDAG-NEXT: s_endpgm
;
; GFX11-LABEL: test_call_external_i32_func_i32_imm:
; GFX11: ; %bb.0:
@@ -4682,6 +5717,30 @@ define amdgpu_kernel void @test_call_external_i32_func_i32_imm(ptr addrspace(1)
; HSA-NEXT: buffer_store_dword v0, off, s[36:39], 0
; HSA-NEXT: s_waitcnt vmcnt(0)
; HSA-NEXT: s_endpgm
+;
+; GISEL-LABEL: test_call_external_i32_func_i32_imm:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0
+; GISEL-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1
+; GISEL-NEXT: s_mov_b32 s50, -1
+; GISEL-NEXT: s_mov_b32 s51, 0xe00000
+; GISEL-NEXT: s_add_u32 s48, s48, s5
+; GISEL-NEXT: s_load_dwordx2 s[36:37], s[2:3], 0x24
+; GISEL-NEXT: s_addc_u32 s49, s49, 0
+; GISEL-NEXT: s_mov_b64 s[6:7], s[0:1]
+; GISEL-NEXT: s_mov_b64 s[0:1], s[48:49]
+; GISEL-NEXT: s_getpc_b64 s[4:5]
+; GISEL-NEXT: s_add_u32 s4, s4, external_i32_func_i32@rel32@lo+4
+; GISEL-NEXT: s_addc_u32 s5, s5, external_i32_func_i32@rel32@hi+12
+; GISEL-NEXT: v_mov_b32_e32 v0, 42
+; GISEL-NEXT: s_mov_b64 s[2:3], s[50:51]
+; GISEL-NEXT: s_mov_b32 s32, 0
+; GISEL-NEXT: s_swappc_b64 s[30:31], s[4:5]
+; GISEL-NEXT: s_mov_b32 s38, -1
+; GISEL-NEXT: s_mov_b32 s39, 0xf000
+; GISEL-NEXT: buffer_store_dword v0, off, s[36:39], 0
+; GISEL-NEXT: s_waitcnt vmcnt(0)
+; GISEL-NEXT: s_endpgm
%val = call i32 @external_i32_func_i32(i32 42)
store volatile i32 %val, ptr addrspace(1) %out
ret void
@@ -4736,29 +5795,29 @@ define amdgpu_kernel void @test_call_external_void_func_struct_i8_i32() #0 {
; CI-NEXT: s_swappc_b64 s[30:31], s[4:5]
; CI-NEXT: s_endpgm
;
-; GFX9-LABEL: test_call_external_void_func_struct_i8_i32:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0
-; GFX9-NEXT: s_mov_b64 s[6:7], s[0:1]
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
-; GFX9-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1
-; GFX9-NEXT: s_mov_b32 s38, -1
-; GFX9-NEXT: s_mov_b32 s39, 0xe00000
-; GFX9-NEXT: s_add_u32 s36, s36, s3
-; GFX9-NEXT: s_mov_b32 s3, 0xf000
-; GFX9-NEXT: s_mov_b32 s2, -1
-; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: buffer_load_ubyte v0, off, s[0:3], 0
-; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], 0 offset:4
-; GFX9-NEXT: s_addc_u32 s37, s37, 0
-; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37]
-; GFX9-NEXT: s_getpc_b64 s[4:5]
-; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_struct_i8_i32@rel32@lo+4
-; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_struct_i8_i32@rel32@hi+12
-; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39]
-; GFX9-NEXT: s_mov_b32 s32, 0
-; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5]
-; GFX9-NEXT: s_endpgm
+; SDAG-LABEL: test_call_external_void_func_struct_i8_i32:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0
+; SDAG-NEXT: s_mov_b64 s[6:7], s[0:1]
+; SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
+; SDAG-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1
+; SDAG-NEXT: s_mov_b32 s38, -1
+; SDAG-NEXT: s_mov_b32 s39, 0xe00000
+; SDAG-NEXT: s_add_u32 s36, s36, s3
+; SDAG-NEXT: s_mov_b32 s3, 0xf000
+; SDAG-NEXT: s_mov_b32 s2, -1
+; SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; SDAG-NEXT: buffer_load_ubyte v0, off, s[0:3], 0
+; SDAG-NEXT: buffer_load_dword v1, off, s[0:3], 0 offset:4
+; SDAG-NEXT: s_addc_u32 s37, s37, 0
+; SDAG-NEXT: s_mov_b64 s[0:1], s[36:37]
+; SDAG-NEXT: s_getpc_b64 s[4:5]
+; SDAG-NEXT: s_add_u32 s4, s4, external_void_func_struct_i8_i32@rel32@lo+4
+; SDAG-NEXT: s_addc_u32 s5, s5, external_void_func_struct_i8_i32@rel32@hi+12
+; SDAG-NEXT: s_mov_b64 s[2:3], s[38:39]
+; SDAG-NEXT: s_mov_b32 s32, 0
+; SDAG-NEXT: s_swappc_b64 s[30:31], s[4:5]
+; SDAG-NEXT: s_endpgm
;
; GFX11-LABEL: test_call_external_void_func_struct_i8_i32:
; GFX11: ; %bb.0:
@@ -4797,6 +5856,30 @@ define amdgpu_kernel void @test_call_external_void_func_struct_i8_i32() #0 {
; HSA-NEXT: s_mov_b32 s32, 0
; HSA-NEXT: s_swappc_b64 s[30:31], s[8:9]
; HSA-NEXT: s_endpgm
+;
+; GISEL-LABEL: test_call_external_void_func_struct_i8_i32:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_mov_b64 s[6:7], s[0:1]
+; GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GISEL-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0
+; GISEL-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1
+; GISEL-NEXT: s_mov_b32 s38, -1
+; GISEL-NEXT: s_mov_b32 s39, 0xe00000
+; GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GISEL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GISEL-NEXT: s_add_u32 s36, s36, s3
+; GISEL-NEXT: s_addc_u32 s37, s37, 0
+; GISEL-NEXT: s_mov_b64 s[0:1], s[36:37]
+; GISEL-NEXT: s_getpc_b64 s[8:9]
+; GISEL-NEXT: s_add_u32 s8, s8, external_void_func_struct_i8_i32@rel32@lo+4
+; GISEL-NEXT: s_addc_u32 s9, s9, external_void_func_struct_i8_i32@rel32@hi+12
+; GISEL-NEXT: s_mov_b64 s[2:3], s[38:39]
+; GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GISEL-NEXT: v_mov_b32_e32 v0, s4
+; GISEL-NEXT: v_mov_b32_e32 v1, s5
+; GISEL-NEXT: s_mov_b32 s32, 0
+; GISEL-NEXT: s_swappc_b64 s[30:31], s[8:9]
+; GISEL-NEXT: s_endpgm
%ptr0 = load ptr addrspace(1), ptr addrspace(4) poison
%val = load { i8, i32 }, ptr addrspace(1) %ptr0
call void @external_void_func_struct_i8_i32({ i8, i32 } %val)
@@ -4860,34 +5943,34 @@ define amdgpu_kernel void @test_call_external_void_func_byval_struct_i8_i32() #0
; CI-NEXT: s_swappc_b64 s[30:31], s[4:5]
; CI-NEXT: s_endpgm
;
-; GFX9-LABEL: test_call_external_void_func_byval_struct_i8_i32:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0
-; GFX9-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1
-; GFX9-NEXT: s_mov_b32 s38, -1
-; GFX9-NEXT: s_mov_b32 s39, 0xe00000
-; GFX9-NEXT: s_add_u32 s36, s36, s3
-; GFX9-NEXT: s_addc_u32 s37, s37, 0
-; GFX9-NEXT: v_mov_b32_e32 v0, 3
-; GFX9-NEXT: buffer_store_byte v0, off, s[36:39], 0
-; GFX9-NEXT: v_mov_b32_e32 v0, 8
-; GFX9-NEXT: buffer_store_dword v0, off, s[36:39], 0 offset:4
-; GFX9-NEXT: buffer_load_dword v0, off, s[36:39], 0 offset:4
-; GFX9-NEXT: s_nop 0
-; GFX9-NEXT: buffer_load_dword v1, off, s[36:39], 0
-; GFX9-NEXT: s_mov_b64 s[6:7], s[0:1]
-; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37]
-; GFX9-NEXT: s_movk_i32 s32, 0x400
-; GFX9-NEXT: s_getpc_b64 s[4:5]
-; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_byval_struct_i8_i32@rel32@lo+4
-; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_byval_struct_i8_i32@rel32@hi+12
-; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39]
-; GFX9-NEXT: s_waitcnt vmcnt(1)
-; GFX9-NEXT: buffer_store_dword v0, off, s[36:39], s32 offset:4
-; GFX9-NEXT: s_waitcnt vmcnt(1)
-; GFX9-NEXT: buffer_store_dword v1, off, s[36:39], s32
-; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5]
-; GFX9-NEXT: s_endpgm
+; SDAG-LABEL: test_call_external_void_func_byval_struct_i8_i32:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0
+; SDAG-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1
+; SDAG-NEXT: s_mov_b32 s38, -1
+; SDAG-NEXT: s_mov_b32 s39, 0xe00000
+; SDAG-NEXT: s_add_u32 s36, s36, s3
+; SDAG-NEXT: s_addc_u32 s37, s37, 0
+; SDAG-NEXT: v_mov_b32_e32 v0, 3
+; SDAG-NEXT: buffer_store_byte v0, off, s[36:39], 0
+; SDAG-NEXT: v_mov_b32_e32 v0, 8
+; SDAG-NEXT: buffer_store_dword v0, off, s[36:39], 0 offset:4
+; SDAG-NEXT: buffer_load_dword v0, off, s[36:39], 0 offset:4
+; SDAG-NEXT: s_nop 0
+; SDAG-NEXT: buffer_load_dword v1, off, s[36:39], 0
+; SDAG-NEXT: s_mov_b64 s[6:7], s[0:1]
+; SDAG-NEXT: s_mov_b64 s[0:1], s[36:37]
+; SDAG-NEXT: s_movk_i32 s32, 0x400
+; SDAG-NEXT: s_getpc_b64 s[4:5]
+; SDAG-NEXT: s_add_u32 s4, s4, external_void_func_byval_struct_i8_i32@rel32@lo+4
+; SDAG-NEXT: s_addc_u32 s5, s5, external_void_func_byval_struct_i8_i32@rel32@hi+12
+; SDAG-NEXT: s_mov_b64 s[2:3], s[38:39]
+; SDAG-NEXT: s_waitcnt vmcnt(1)
+; SDAG-NEXT: buffer_store_dword v0, off, s[36:39], s32 offset:4
+; SDAG-NEXT: s_waitcnt vmcnt(1)
+; SDAG-NEXT: buffer_store_dword v1, off, s[36:39], s32
+; SDAG-NEXT: s_swappc_b64 s[30:31], s[4:5]
+; SDAG-NEXT: s_endpgm
;
; GFX11-TRUE16-LABEL: test_call_external_void_func_byval_struct_i8_i32:
; GFX11-TRUE16: ; %bb.0:
@@ -4948,6 +6031,35 @@ define amdgpu_kernel void @test_call_external_void_func_byval_struct_i8_i32() #0
; HSA-NEXT: buffer_store_dword v1, off, s[0:3], s32
; HSA-NEXT: s_swappc_b64 s[30:31], s[8:9]
; HSA-NEXT: s_endpgm
+;
+; GISEL-LABEL: test_call_external_void_func_byval_struct_i8_i32:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0
+; GISEL-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1
+; GISEL-NEXT: s_mov_b32 s38, -1
+; GISEL-NEXT: s_mov_b32 s39, 0xe00000
+; GISEL-NEXT: s_add_u32 s36, s36, s3
+; GISEL-NEXT: s_addc_u32 s37, s37, 0
+; GISEL-NEXT: v_mov_b32_e32 v0, 3
+; GISEL-NEXT: buffer_store_byte v0, off, s[36:39], 0
+; GISEL-NEXT: v_mov_b32_e32 v0, 8
+; GISEL-NEXT: buffer_store_dword v0, off, s[36:39], 0 offset:4
+; GISEL-NEXT: buffer_load_dword v0, off, s[36:39], 0
+; GISEL-NEXT: s_nop 0
+; GISEL-NEXT: buffer_load_dword v1, off, s[36:39], 0 offset:4
+; GISEL-NEXT: s_mov_b64 s[6:7], s[0:1]
+; GISEL-NEXT: s_mov_b64 s[0:1], s[36:37]
+; GISEL-NEXT: s_movk_i32 s32, 0x400
+; GISEL-NEXT: s_getpc_b64 s[4:5]
+; GISEL-NEXT: s_add_u32 s4, s4, external_void_func_byval_struct_i8_i32@rel32@lo+4
+; GISEL-NEXT: s_addc_u32 s5, s5, external_void_func_byval_struct_i8_i32@rel32@hi+12
+; GISEL-NEXT: s_mov_b64 s[2:3], s[38:39]
+; GISEL-NEXT: s_waitcnt vmcnt(1)
+; GISEL-NEXT: buffer_store_dword v0, off, s[36:39], s32
+; GISEL-NEXT: s_waitcnt vmcnt(1)
+; GISEL-NEXT: buffer_store_dword v1, off, s[36:39], s32 offset:4
+; GISEL-NEXT: s_swappc_b64 s[30:31], s[4:5]
+; GISEL-NEXT: s_endpgm
%val = alloca { i8, i32 }, align 8, addrspace(5)
%gep0 = getelementptr inbounds { i8, i32 }, ptr addrspace(5) %val, i32 0, i32 0
%gep1 = getelementptr inbounds { i8, i32 }, ptr addrspace(5) %val, i32 0, i32 1
@@ -5034,44 +6146,44 @@ define amdgpu_kernel void @test_call_external_void_func_sret_struct_i8_i32_byval
; CI-NEXT: s_waitcnt vmcnt(0)
; CI-NEXT: s_endpgm
;
-; GFX9-LABEL: test_call_external_void_func_sret_struct_i8_i32_byval_struct_i8_i32:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0
-; GFX9-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1
-; GFX9-NEXT: s_mov_b32 s38, -1
-; GFX9-NEXT: s_mov_b32 s39, 0xe00000
-; GFX9-NEXT: s_add_u32 s36, s36, s5
-; GFX9-NEXT: s_addc_u32 s37, s37, 0
-; GFX9-NEXT: v_mov_b32_e32 v0, 3
-; GFX9-NEXT: buffer_store_byte v0, off, s[36:39], 0
-; GFX9-NEXT: v_mov_b32_e32 v0, 8
-; GFX9-NEXT: buffer_store_dword v0, off, s[36:39], 0 offset:4
-; GFX9-NEXT: buffer_load_dword v0, off, s[36:39], 0 offset:4
-; GFX9-NEXT: s_nop 0
-; GFX9-NEXT: buffer_load_dword v1, off, s[36:39], 0
-; GFX9-NEXT: s_movk_i32 s32, 0x800
-; GFX9-NEXT: s_mov_b64 s[6:7], s[0:1]
-; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37]
-; GFX9-NEXT: s_getpc_b64 s[4:5]
-; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_sret_struct_i8_i32_byval_struct_i8_i32@rel32@lo+4
-; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_sret_struct_i8_i32_byval_struct_i8_i32@rel32@hi+12
-; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39]
-; GFX9-NEXT: s_waitcnt vmcnt(1)
-; GFX9-NEXT: buffer_store_dword v0, off, s[36:39], s32 offset:4
-; GFX9-NEXT: s_waitcnt vmcnt(1)
-; GFX9-NEXT: buffer_store_dword v1, off, s[36:39], s32
-; GFX9-NEXT: v_mov_b32_e32 v0, 8
-; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5]
-; GFX9-NEXT: buffer_load_ubyte v0, off, s[36:39], 0 offset:8
-; GFX9-NEXT: buffer_load_dword v1, off, s[36:39], 0 offset:12
-; GFX9-NEXT: s_mov_b32 s3, 0xf000
-; GFX9-NEXT: s_mov_b32 s2, -1
-; GFX9-NEXT: s_waitcnt vmcnt(1)
-; GFX9-NEXT: buffer_store_byte v0, off, s[0:3], 0
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], 0
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: s_endpgm
+; SDAG-LABEL: test_call_external_void_func_sret_struct_i8_i32_byval_struct_i8_i32:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0
+; SDAG-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1
+; SDAG-NEXT: s_mov_b32 s38, -1
+; SDAG-NEXT: s_mov_b32 s39, 0xe00000
+; SDAG-NEXT: s_add_u32 s36, s36, s5
+; SDAG-NEXT: s_addc_u32 s37, s37, 0
+; SDAG-NEXT: v_mov_b32_e32 v0, 3
+; SDAG-NEXT: buffer_store_byte v0, off, s[36:39], 0
+; SDAG-NEXT: v_mov_b32_e32 v0, 8
+; SDAG-NEXT: buffer_store_dword v0, off, s[36:39], 0 offset:4
+; SDAG-NEXT: buffer_load_dword v0, off, s[36:39], 0 offset:4
+; SDAG-NEXT: s_nop 0
+; SDAG-NEXT: buffer_load_dword v1, off, s[36:39], 0
+; SDAG-NEXT: s_movk_i32 s32, 0x800
+; SDAG-NEXT: s_mov_b64 s[6:7], s[0:1]
+; SDAG-NEXT: s_mov_b64 s[0:1], s[36:37]
+; SDAG-NEXT: s_getpc_b64 s[4:5]
+; SDAG-NEXT: s_add_u32 s4, s4, external_void_func_sret_struct_i8_i32_byval_struct_i8_i32@rel32@lo+4
+; SDAG-NEXT: s_addc_u32 s5, s5, external_void_func_sret_struct_i8_i32_byval_struct_i8_i32@rel32@hi+12
+; SDAG-NEXT: s_mov_b64 s[2:3], s[38:39]
+; SDAG-NEXT: s_waitcnt vmcnt(1)
+; SDAG-NEXT: buffer_store_dword v0, off, s[36:39], s32 offset:4
+; SDAG-NEXT: s_waitcnt vmcnt(1)
+; SDAG-NEXT: buffer_store_dword v1, off, s[36:39], s32
+; SDAG-NEXT: v_mov_b32_e32 v0, 8
+; SDAG-NEXT: s_swappc_b64 s[30:31], s[4:5]
+; SDAG-NEXT: buffer_load_ubyte v0, off, s[36:39], 0 offset:8
+; SDAG-NEXT: buffer_load_dword v1, off, s[36:39], 0 offset:12
+; SDAG-NEXT: s_mov_b32 s3, 0xf000
+; SDAG-NEXT: s_mov_b32 s2, -1
+; SDAG-NEXT: s_waitcnt vmcnt(1)
+; SDAG-NEXT: buffer_store_byte v0, off, s[0:3], 0
+; SDAG-NEXT: s_waitcnt vmcnt(0)
+; SDAG-NEXT: buffer_store_dword v1, off, s[0:3], 0
+; SDAG-NEXT: s_waitcnt vmcnt(0)
+; SDAG-NEXT: s_endpgm
;
; GFX11-TRUE16-LABEL: test_call_external_void_func_sret_struct_i8_i32_byval_struct_i8_i32:
; GFX11-TRUE16: ; %bb.0:
@@ -5170,6 +6282,45 @@ define amdgpu_kernel void @test_call_external_void_func_sret_struct_i8_i32_byval
; HSA-NEXT: buffer_store_dword v1, off, s[4:7], 0
; HSA-NEXT: s_waitcnt vmcnt(0)
; HSA-NEXT: s_endpgm
+;
+; GISEL-LABEL: test_call_external_void_func_sret_struct_i8_i32_byval_struct_i8_i32:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0
+; GISEL-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1
+; GISEL-NEXT: s_mov_b32 s38, -1
+; GISEL-NEXT: s_mov_b32 s39, 0xe00000
+; GISEL-NEXT: s_add_u32 s36, s36, s5
+; GISEL-NEXT: s_addc_u32 s37, s37, 0
+; GISEL-NEXT: v_mov_b32_e32 v0, 3
+; GISEL-NEXT: buffer_store_byte v0, off, s[36:39], 0
+; GISEL-NEXT: v_mov_b32_e32 v0, 8
+; GISEL-NEXT: buffer_store_dword v0, off, s[36:39], 0 offset:4
+; GISEL-NEXT: buffer_load_dword v0, off, s[36:39], 0
+; GISEL-NEXT: s_nop 0
+; GISEL-NEXT: buffer_load_dword v1, off, s[36:39], 0 offset:4
+; GISEL-NEXT: s_movk_i32 s32, 0x800
+; GISEL-NEXT: s_mov_b64 s[6:7], s[0:1]
+; GISEL-NEXT: s_mov_b64 s[0:1], s[36:37]
+; GISEL-NEXT: s_getpc_b64 s[4:5]
+; GISEL-NEXT: s_add_u32 s4, s4, external_void_func_sret_struct_i8_i32_byval_struct_i8_i32@rel32@lo+4
+; GISEL-NEXT: s_addc_u32 s5, s5, external_void_func_sret_struct_i8_i32_byval_struct_i8_i32@rel32@hi+12
+; GISEL-NEXT: s_mov_b64 s[2:3], s[38:39]
+; GISEL-NEXT: s_waitcnt vmcnt(1)
+; GISEL-NEXT: buffer_store_dword v0, off, s[36:39], s32
+; GISEL-NEXT: s_waitcnt vmcnt(1)
+; GISEL-NEXT: buffer_store_dword v1, off, s[36:39], s32 offset:4
+; GISEL-NEXT: v_mov_b32_e32 v0, 8
+; GISEL-NEXT: s_swappc_b64 s[30:31], s[4:5]
+; GISEL-NEXT: buffer_load_ubyte v0, off, s[36:39], 0 offset:8
+; GISEL-NEXT: buffer_load_dword v1, off, s[36:39], 0 offset:12
+; GISEL-NEXT: s_mov_b32 s2, -1
+; GISEL-NEXT: s_mov_b32 s3, 0xf000
+; GISEL-NEXT: s_waitcnt vmcnt(1)
+; GISEL-NEXT: buffer_store_byte v0, off, s[0:3], 0
+; GISEL-NEXT: s_waitcnt vmcnt(0)
+; GISEL-NEXT: buffer_store_dword v1, off, s[0:3], 0
+; GISEL-NEXT: s_waitcnt vmcnt(0)
+; GISEL-NEXT: s_endpgm
%in.val = alloca { i8, i32 }, align 8, addrspace(5)
%out.val = alloca { i8, i32 }, align 8, addrspace(5)
%in.gep0 = getelementptr inbounds { i8, i32 }, ptr addrspace(5) %in.val, i32 0, i32 0
@@ -5272,47 +6423,47 @@ define amdgpu_kernel void @test_call_external_void_func_v16i8() #0 {
; CI-NEXT: s_swappc_b64 s[30:31], s[4:5]
; CI-NEXT: s_endpgm
;
-; GFX9-LABEL: test_call_external_void_func_v16i8:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0
-; GFX9-NEXT: s_mov_b64 s[6:7], s[0:1]
-; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
-; GFX9-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1
-; GFX9-NEXT: s_mov_b32 s38, -1
-; GFX9-NEXT: s_mov_b32 s39, 0xe00000
-; GFX9-NEXT: s_add_u32 s36, s36, s3
-; GFX9-NEXT: s_mov_b32 s3, 0xf000
-; GFX9-NEXT: s_mov_b32 s2, -1
-; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: buffer_load_dwordx4 v[0:3], off, s[0:3], 0
-; GFX9-NEXT: s_addc_u32 s37, s37, 0
-; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37]
-; GFX9-NEXT: s_getpc_b64 s[4:5]
-; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_v16i8@rel32@lo+4
-; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_v16i8@rel32@hi+12
-; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39]
-; GFX9-NEXT: s_mov_b32 s32, 0
-; GFX9-NEXT: s_waitcnt vmcnt(0)
-; GFX9-NEXT: v_lshrrev_b32_e32 v16, 8, v0
-; GFX9-NEXT: v_lshrrev_b32_e32 v17, 16, v0
-; GFX9-NEXT: v_lshrrev_b32_e32 v18, 24, v0
-; GFX9-NEXT: v_lshrrev_b32_e32 v5, 8, v1
-; GFX9-NEXT: v_lshrrev_b32_e32 v6, 16, v1
-; GFX9-NEXT: v_lshrrev_b32_e32 v7, 24, v1
-; GFX9-NEXT: v_lshrrev_b32_e32 v9, 8, v2
-; GFX9-NEXT: v_lshrrev_b32_e32 v10, 16, v2
-; GFX9-NEXT: v_lshrrev_b32_e32 v11, 24, v2
-; GFX9-NEXT: v_lshrrev_b32_e32 v13, 8, v3
-; GFX9-NEXT: v_lshrrev_b32_e32 v14, 16, v3
-; GFX9-NEXT: v_lshrrev_b32_e32 v15, 24, v3
-; GFX9-NEXT: v_mov_b32_e32 v4, v1
-; GFX9-NEXT: v_mov_b32_e32 v8, v2
-; GFX9-NEXT: v_mov_b32_e32 v12, v3
-; GFX9-NEXT: v_mov_b32_e32 v1, v16
-; GFX9-NEXT: v_mov_b32_e32 v2, v17
-; GFX9-NEXT: v_mov_b32_e32 v3, v18
-; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5]
-; GFX9-NEXT: s_endpgm
+; SDAG-LABEL: test_call_external_void_func_v16i8:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0
+; SDAG-NEXT: s_mov_b64 s[6:7], s[0:1]
+; SDAG-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
+; SDAG-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1
+; SDAG-NEXT: s_mov_b32 s38, -1
+; SDAG-NEXT: s_mov_b32 s39, 0xe00000
+; SDAG-NEXT: s_add_u32 s36, s36, s3
+; SDAG-NEXT: s_mov_b32 s3, 0xf000
+; SDAG-NEXT: s_mov_b32 s2, -1
+; SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; SDAG-NEXT: buffer_load_dwordx4 v[0:3], off, s[0:3], 0
+; SDAG-NEXT: s_addc_u32 s37, s37, 0
+; SDAG-NEXT: s_mov_b64 s[0:1], s[36:37]
+; SDAG-NEXT: s_getpc_b64 s[4:5]
+; SDAG-NEXT: s_add_u32 s4, s4, external_void_func_v16i8@rel32@lo+4
+; SDAG-NEXT: s_addc_u32 s5, s5, external_void_func_v16i8@rel32@hi+12
+; SDAG-NEXT: s_mov_b64 s[2:3], s[38:39]
+; SDAG-NEXT: s_mov_b32 s32, 0
+; SDAG-NEXT: s_waitcnt vmcnt(0)
+; SDAG-NEXT: v_lshrrev_b32_e32 v16, 8, v0
+; SDAG-NEXT: v_lshrrev_b32_e32 v17, 16, v0
+; SDAG-NEXT: v_lshrrev_b32_e32 v18, 24, v0
+; SDAG-NEXT: v_lshrrev_b32_e32 v5, 8, v1
+; SDAG-NEXT: v_lshrrev_b32_e32 v6, 16, v1
+; SDAG-NEXT: v_lshrrev_b32_e32 v7, 24, v1
+; SDAG-NEXT: v_lshrrev_b32_e32 v9, 8, v2
+; SDAG-NEXT: v_lshrrev_b32_e32 v10, 16, v2
+; SDAG-NEXT: v_lshrrev_b32_e32 v11, 24, v2
+; SDAG-NEXT: v_lshrrev_b32_e32 v13, 8, v3
+; SDAG-NEXT: v_lshrrev_b32_e32 v14, 16, v3
+; SDAG-NEXT: v_lshrrev_b32_e32 v15, 24, v3
+; SDAG-NEXT: v_mov_b32_e32 v4, v1
+; SDAG-NEXT: v_mov_b32_e32 v8, v2
+; SDAG-NEXT: v_mov_b32_e32 v12, v3
+; SDAG-NEXT: v_mov_b32_e32 v1, v16
+; SDAG-NEXT: v_mov_b32_e32 v2, v17
+; SDAG-NEXT: v_mov_b32_e32 v3, v18
+; SDAG-NEXT: s_swappc_b64 s[30:31], s[4:5]
+; SDAG-NEXT: s_endpgm
;
; GFX11-LABEL: test_call_external_void_func_v16i8:
; GFX11: ; %bb.0:
@@ -5384,6 +6535,56 @@ define amdgpu_kernel void @test_call_external_void_func_v16i8() #0 {
; HSA-NEXT: v_mov_b32_e32 v3, v18
; HSA-NEXT: s_swappc_b64 s[30:31], s[8:9]
; HSA-NEXT: s_endpgm
+;
+; GISEL-LABEL: test_call_external_void_func_v16i8:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_mov_b64 s[6:7], s[0:1]
+; GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GISEL-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0
+; GISEL-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1
+; GISEL-NEXT: s_mov_b32 s38, -1
+; GISEL-NEXT: s_mov_b32 s39, 0xe00000
+; GISEL-NEXT: s_add_u32 s36, s36, s3
+; GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GISEL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; GISEL-NEXT: s_addc_u32 s37, s37, 0
+; GISEL-NEXT: s_mov_b32 s32, 0
+; GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GISEL-NEXT: s_lshr_b32 s8, s0, 8
+; GISEL-NEXT: s_lshr_b32 s9, s0, 16
+; GISEL-NEXT: s_lshr_b32 s10, s0, 24
+; GISEL-NEXT: s_lshr_b32 s11, s1, 8
+; GISEL-NEXT: s_lshr_b32 s12, s1, 16
+; GISEL-NEXT: s_lshr_b32 s13, s1, 24
+; GISEL-NEXT: s_lshr_b32 s14, s2, 8
+; GISEL-NEXT: s_lshr_b32 s15, s2, 16
+; GISEL-NEXT: s_lshr_b32 s16, s2, 24
+; GISEL-NEXT: s_lshr_b32 s17, s3, 8
+; GISEL-NEXT: s_lshr_b32 s18, s3, 16
+; GISEL-NEXT: s_lshr_b32 s19, s3, 24
+; GISEL-NEXT: v_mov_b32_e32 v0, s0
+; GISEL-NEXT: v_mov_b32_e32 v4, s1
+; GISEL-NEXT: v_mov_b32_e32 v8, s2
+; GISEL-NEXT: v_mov_b32_e32 v12, s3
+; GISEL-NEXT: s_mov_b64 s[0:1], s[36:37]
+; GISEL-NEXT: s_getpc_b64 s[4:5]
+; GISEL-NEXT: s_add_u32 s4, s4, external_void_func_v16i8@rel32@lo+4
+; GISEL-NEXT: s_addc_u32 s5, s5, external_void_func_v16i8@rel32@hi+12
+; GISEL-NEXT: v_mov_b32_e32 v1, s8
+; GISEL-NEXT: v_mov_b32_e32 v2, s9
+; GISEL-NEXT: v_mov_b32_e32 v3, s10
+; GISEL-NEXT: v_mov_b32_e32 v5, s11
+; GISEL-NEXT: v_mov_b32_e32 v6, s12
+; GISEL-NEXT: v_mov_b32_e32 v7, s13
+; GISEL-NEXT: v_mov_b32_e32 v9, s14
+; GISEL-NEXT: v_mov_b32_e32 v10, s15
+; GISEL-NEXT: v_mov_b32_e32 v11, s16
+; GISEL-NEXT: v_mov_b32_e32 v13, s17
+; GISEL-NEXT: v_mov_b32_e32 v14, s18
+; GISEL-NEXT: v_mov_b32_e32 v15, s19
+; GISEL-NEXT: s_mov_b64 s[2:3], s[38:39]
+; GISEL-NEXT: s_swappc_b64 s[30:31], s[4:5]
+; GISEL-NEXT: s_endpgm
%ptr = load ptr addrspace(1), ptr addrspace(4) poison
%val = load <16 x i8>, ptr addrspace(1) %ptr
call void @external_void_func_v16i8(<16 x i8> %val)
@@ -5509,64 +6710,64 @@ define amdgpu_kernel void @stack_passed_arg_alignment_v32i32_f64(<32 x i32> %val
; CI-NEXT: s_swappc_b64 s[30:31], s[4:5]
; CI-NEXT: s_endpgm
;
-; GFX9-LABEL: stack_passed_arg_alignment_v32i32_f64:
-; GFX9: ; %bb.0: ; %entry
-; GFX9-NEXT: s_mov_b32 s52, SCRATCH_RSRC_DWORD0
-; GFX9-NEXT: s_mov_b32 s53, SCRATCH_RSRC_DWORD1
-; GFX9-NEXT: s_mov_b32 s54, -1
-; GFX9-NEXT: s_mov_b32 s55, 0xe00000
-; GFX9-NEXT: s_add_u32 s52, s52, s5
-; GFX9-NEXT: s_load_dwordx16 s[8:23], s[2:3], 0x64
-; GFX9-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xa4
-; GFX9-NEXT: s_load_dwordx16 s[36:51], s[2:3], 0x24
-; GFX9-NEXT: s_mov_b32 s32, 0
-; GFX9-NEXT: s_addc_u32 s53, s53, 0
-; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v0, s23
-; GFX9-NEXT: buffer_store_dword v0, off, s[52:55], s32
-; GFX9-NEXT: v_mov_b32_e32 v0, s4
-; GFX9-NEXT: buffer_store_dword v0, off, s[52:55], s32 offset:4
-; GFX9-NEXT: v_mov_b32_e32 v0, s5
-; GFX9-NEXT: s_mov_b64 s[6:7], s[0:1]
-; GFX9-NEXT: s_mov_b64 s[0:1], s[52:53]
-; GFX9-NEXT: buffer_store_dword v0, off, s[52:55], s32 offset:8
-; GFX9-NEXT: s_getpc_b64 s[4:5]
-; GFX9-NEXT: s_add_u32 s4, s4, stack_passed_f64_arg@rel32@lo+4
-; GFX9-NEXT: s_addc_u32 s5, s5, stack_passed_f64_arg@rel32@hi+12
-; GFX9-NEXT: s_mov_b64 s[2:3], s[54:55]
-; GFX9-NEXT: v_mov_b32_e32 v0, s36
-; GFX9-NEXT: v_mov_b32_e32 v1, s37
-; GFX9-NEXT: v_mov_b32_e32 v2, s38
-; GFX9-NEXT: v_mov_b32_e32 v3, s39
-; GFX9-NEXT: v_mov_b32_e32 v4, s40
-; GFX9-NEXT: v_mov_b32_e32 v5, s41
-; GFX9-NEXT: v_mov_b32_e32 v6, s42
-; GFX9-NEXT: v_mov_b32_e32 v7, s43
-; GFX9-NEXT: v_mov_b32_e32 v8, s44
-; GFX9-NEXT: v_mov_b32_e32 v9, s45
-; GFX9-NEXT: v_mov_b32_e32 v10, s46
-; GFX9-NEXT: v_mov_b32_e32 v11, s47
-; GFX9-NEXT: v_mov_b32_e32 v12, s48
-; GFX9-NEXT: v_mov_b32_e32 v13, s49
-; GFX9-NEXT: v_mov_b32_e32 v14, s50
-; GFX9-NEXT: v_mov_b32_e32 v15, s51
-; GFX9-NEXT: v_mov_b32_e32 v16, s8
-; GFX9-NEXT: v_mov_b32_e32 v17, s9
-; GFX9-NEXT: v_mov_b32_e32 v18, s10
-; GFX9-NEXT: v_mov_b32_e32 v19, s11
-; GFX9-NEXT: v_mov_b32_e32 v20, s12
-; GFX9-NEXT: v_mov_b32_e32 v21, s13
-; GFX9-NEXT: v_mov_b32_e32 v22, s14
-; GFX9-NEXT: v_mov_b32_e32 v23, s15
-; GFX9-NEXT: v_mov_b32_e32 v24, s16
-; GFX9-NEXT: v_mov_b32_e32 v25, s17
-; GFX9-NEXT: v_mov_b32_e32 v26, s18
-; GFX9-NEXT: v_mov_b32_e32 v27, s19
-; GFX9-NEXT: v_mov_b32_e32 v28, s20
-; GFX9-NEXT: v_mov_b32_e32 v29, s21
-; GFX9-NEXT: v_mov_b32_e32 v30, s22
-; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5]
-; GFX9-NEXT: s_endpgm
+; SDAG-LABEL: stack_passed_arg_alignment_v32i32_f64:
+; SDAG: ; %bb.0: ; %entry
+; SDAG-NEXT: s_mov_b32 s52, SCRATCH_RSRC_DWORD0
+; SDAG-NEXT: s_mov_b32 s53, SCRATCH_RSRC_DWORD1
+; SDAG-NEXT: s_mov_b32 s54, -1
+; SDAG-NEXT: s_mov_b32 s55, 0xe00000
+; SDAG-NEXT: s_add_u32 s52, s52, s5
+; SDAG-NEXT: s_load_dwordx16 s[8:23], s[2:3], 0x64
+; SDAG-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xa4
+; SDAG-NEXT: s_load_dwordx16 s[36:51], s[2:3], 0x24
+; SDAG-NEXT: s_mov_b32 s32, 0
+; SDAG-NEXT: s_addc_u32 s53, s53, 0
+; SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; SDAG-NEXT: v_mov_b32_e32 v0, s23
+; SDAG-NEXT: buffer_store_dword v0, off, s[52:55], s32
+; SDAG-NEXT: v_mov_b32_e32 v0, s4
+; SDAG-NEXT: buffer_store_dword v0, off, s[52:55], s32 offset:4
+; SDAG-NEXT: v_mov_b32_e32 v0, s5
+; SDAG-NEXT: s_mov_b64 s[6:7], s[0:1]
+; SDAG-NEXT: s_mov_b64 s[0:1], s[52:53]
+; SDAG-NEXT: buffer_store_dword v0, off, s[52:55], s32 offset:8
+; SDAG-NEXT: s_getpc_b64 s[4:5]
+; SDAG-NEXT: s_add_u32 s4, s4, stack_passed_f64_arg@rel32@lo+4
+; SDAG-NEXT: s_addc_u32 s5, s5, stack_passed_f64_arg@rel32@hi+12
+; SDAG-NEXT: s_mov_b64 s[2:3], s[54:55]
+; SDAG-NEXT: v_mov_b32_e32 v0, s36
+; SDAG-NEXT: v_mov_b32_e32 v1, s37
+; SDAG-NEXT: v_mov_b32_e32 v2, s38
+; SDAG-NEXT: v_mov_b32_e32 v3, s39
+; SDAG-NEXT: v_mov_b32_e32 v4, s40
+; SDAG-NEXT: v_mov_b32_e32 v5, s41
+; SDAG-NEXT: v_mov_b32_e32 v6, s42
+; SDAG-NEXT: v_mov_b32_e32 v7, s43
+; SDAG-NEXT: v_mov_b32_e32 v8, s44
+; SDAG-NEXT: v_mov_b32_e32 v9, s45
+; SDAG-NEXT: v_mov_b32_e32 v10, s46
+; SDAG-NEXT: v_mov_b32_e32 v11, s47
+; SDAG-NEXT: v_mov_b32_e32 v12, s48
+; SDAG-NEXT: v_mov_b32_e32 v13, s49
+; SDAG-NEXT: v_mov_b32_e32 v14, s50
+; SDAG-NEXT: v_mov_b32_e32 v15, s51
+; SDAG-NEXT: v_mov_b32_e32 v16, s8
+; SDAG-NEXT: v_mov_b32_e32 v17, s9
+; SDAG-NEXT: v_mov_b32_e32 v18, s10
+; SDAG-NEXT: v_mov_b32_e32 v19, s11
+; SDAG-NEXT: v_mov_b32_e32 v20, s12
+; SDAG-NEXT: v_mov_b32_e32 v21, s13
+; SDAG-NEXT: v_mov_b32_e32 v22, s14
+; SDAG-NEXT: v_mov_b32_e32 v23, s15
+; SDAG-NEXT: v_mov_b32_e32 v24, s16
+; SDAG-NEXT: v_mov_b32_e32 v25, s17
+; SDAG-NEXT: v_mov_b32_e32 v26, s18
+; SDAG-NEXT: v_mov_b32_e32 v27, s19
+; SDAG-NEXT: v_mov_b32_e32 v28, s20
+; SDAG-NEXT: v_mov_b32_e32 v29, s21
+; SDAG-NEXT: v_mov_b32_e32 v30, s22
+; SDAG-NEXT: s_swappc_b64 s[30:31], s[4:5]
+; SDAG-NEXT: s_endpgm
;
; GFX11-LABEL: stack_passed_arg_alignment_v32i32_f64:
; GFX11: ; %bb.0: ; %entry
@@ -5662,6 +6863,65 @@ define amdgpu_kernel void @stack_passed_arg_alignment_v32i32_f64(<32 x i32> %val
; HSA-NEXT: v_mov_b32_e32 v30, s22
; HSA-NEXT: s_swappc_b64 s[30:31], s[24:25]
; HSA-NEXT: s_endpgm
+;
+; GISEL-LABEL: stack_passed_arg_alignment_v32i32_f64:
+; GISEL: ; %bb.0: ; %entry
+; GISEL-NEXT: s_mov_b32 s52, SCRATCH_RSRC_DWORD0
+; GISEL-NEXT: s_mov_b32 s53, SCRATCH_RSRC_DWORD1
+; GISEL-NEXT: s_mov_b32 s54, -1
+; GISEL-NEXT: s_mov_b64 s[6:7], s[0:1]
+; GISEL-NEXT: s_load_dwordx16 s[8:23], s[2:3], 0x64
+; GISEL-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xa4
+; GISEL-NEXT: s_load_dwordx16 s[36:51], s[2:3], 0x24
+; GISEL-NEXT: s_mov_b32 s55, 0xe00000
+; GISEL-NEXT: s_add_u32 s52, s52, s5
+; GISEL-NEXT: s_mov_b32 s32, 0
+; GISEL-NEXT: s_addc_u32 s53, s53, 0
+; GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GISEL-NEXT: v_mov_b32_e32 v0, s23
+; GISEL-NEXT: buffer_store_dword v0, off, s[52:55], s32
+; GISEL-NEXT: v_mov_b32_e32 v0, s0
+; GISEL-NEXT: buffer_store_dword v0, off, s[52:55], s32 offset:4
+; GISEL-NEXT: v_mov_b32_e32 v0, s1
+; GISEL-NEXT: s_mov_b64 s[0:1], s[52:53]
+; GISEL-NEXT: buffer_store_dword v0, off, s[52:55], s32 offset:8
+; GISEL-NEXT: s_getpc_b64 s[4:5]
+; GISEL-NEXT: s_add_u32 s4, s4, stack_passed_f64_arg@rel32@lo+4
+; GISEL-NEXT: s_addc_u32 s5, s5, stack_passed_f64_arg@rel32@hi+12
+; GISEL-NEXT: v_mov_b32_e32 v0, s36
+; GISEL-NEXT: v_mov_b32_e32 v1, s37
+; GISEL-NEXT: v_mov_b32_e32 v2, s38
+; GISEL-NEXT: v_mov_b32_e32 v3, s39
+; GISEL-NEXT: v_mov_b32_e32 v4, s40
+; GISEL-NEXT: v_mov_b32_e32 v5, s41
+; GISEL-NEXT: v_mov_b32_e32 v6, s42
+; GISEL-NEXT: v_mov_b32_e32 v7, s43
+; GISEL-NEXT: v_mov_b32_e32 v8, s44
+; GISEL-NEXT: v_mov_b32_e32 v9, s45
+; GISEL-NEXT: v_mov_b32_e32 v10, s46
+; GISEL-NEXT: v_mov_b32_e32 v11, s47
+; GISEL-NEXT: v_mov_b32_e32 v12, s48
+; GISEL-NEXT: v_mov_b32_e32 v13, s49
+; GISEL-NEXT: v_mov_b32_e32 v14, s50
+; GISEL-NEXT: v_mov_b32_e32 v15, s51
+; GISEL-NEXT: v_mov_b32_e32 v16, s8
+; GISEL-NEXT: v_mov_b32_e32 v17, s9
+; GISEL-NEXT: v_mov_b32_e32 v18, s10
+; GISEL-NEXT: v_mov_b32_e32 v19, s11
+; GISEL-NEXT: s_mov_b64 s[2:3], s[54:55]
+; GISEL-NEXT: v_mov_b32_e32 v20, s12
+; GISEL-NEXT: v_mov_b32_e32 v21, s13
+; GISEL-NEXT: v_mov_b32_e32 v22, s14
+; GISEL-NEXT: v_mov_b32_e32 v23, s15
+; GISEL-NEXT: v_mov_b32_e32 v24, s16
+; GISEL-NEXT: v_mov_b32_e32 v25, s17
+; GISEL-NEXT: v_mov_b32_e32 v26, s18
+; GISEL-NEXT: v_mov_b32_e32 v27, s19
+; GISEL-NEXT: v_mov_b32_e32 v28, s20
+; GISEL-NEXT: v_mov_b32_e32 v29, s21
+; GISEL-NEXT: v_mov_b32_e32 v30, s22
+; GISEL-NEXT: s_swappc_b64 s[30:31], s[4:5]
+; GISEL-NEXT: s_endpgm
entry:
call void @stack_passed_f64_arg(<32 x i32> %val, double %tmp)
ret void
@@ -5702,22 +6962,22 @@ define void @tail_call_byval_align16(<32 x i32> %val, double %tmp) #0 {
; CI-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:16
; CI-NEXT: s_setpc_b64 s[4:5]
;
-; GFX9-LABEL: tail_call_byval_align16:
-; GFX9: ; %bb.0: ; %entry
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:28
-; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32
-; GFX9-NEXT: s_getpc_b64 s[4:5]
-; GFX9-NEXT: s_add_u32 s4, s4, byval_align16_f64_arg@rel32@lo+4
-; GFX9-NEXT: s_addc_u32 s5, s5, byval_align16_f64_arg@rel32@hi+12
-; GFX9-NEXT: s_waitcnt vmcnt(1)
-; GFX9-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:20
-; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:24
-; GFX9-NEXT: s_waitcnt vmcnt(2)
-; GFX9-NEXT: buffer_store_dword v32, off, s[0:3], s32
-; GFX9-NEXT: s_waitcnt vmcnt(1)
-; GFX9-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:16
-; GFX9-NEXT: s_setpc_b64 s[4:5]
+; SDAG-LABEL: tail_call_byval_align16:
+; SDAG: ; %bb.0: ; %entry
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:28
+; SDAG-NEXT: buffer_load_dword v32, off, s[0:3], s32
+; SDAG-NEXT: s_getpc_b64 s[4:5]
+; SDAG-NEXT: s_add_u32 s4, s4, byval_align16_f64_arg@rel32@lo+4
+; SDAG-NEXT: s_addc_u32 s5, s5, byval_align16_f64_arg@rel32@hi+12
+; SDAG-NEXT: s_waitcnt vmcnt(1)
+; SDAG-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:20
+; SDAG-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:24
+; SDAG-NEXT: s_waitcnt vmcnt(2)
+; SDAG-NEXT: buffer_store_dword v32, off, s[0:3], s32
+; SDAG-NEXT: s_waitcnt vmcnt(1)
+; SDAG-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:16
+; SDAG-NEXT: s_setpc_b64 s[4:5]
;
; GFX11-LABEL: tail_call_byval_align16:
; GFX11: ; %bb.0: ; %entry
@@ -5749,6 +7009,23 @@ define void @tail_call_byval_align16(<32 x i32> %val, double %tmp) #0 {
; HSA-NEXT: s_waitcnt vmcnt(1)
; HSA-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:16
; HSA-NEXT: s_setpc_b64 s[4:5]
+;
+; GISEL-LABEL: tail_call_byval_align16:
+; GISEL: ; %bb.0: ; %entry
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: buffer_load_dword v31, off, s[0:3], s32
+; GISEL-NEXT: s_getpc_b64 s[4:5]
+; GISEL-NEXT: s_add_u32 s4, s4, byval_align16_f64_arg@rel32@lo+4
+; GISEL-NEXT: s_addc_u32 s5, s5, byval_align16_f64_arg@rel32@hi+12
+; GISEL-NEXT: s_waitcnt vmcnt(0)
+; GISEL-NEXT: buffer_store_dword v31, off, s[0:3], s32
+; GISEL-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:28
+; GISEL-NEXT: s_waitcnt vmcnt(0)
+; GISEL-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:20
+; GISEL-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:24
+; GISEL-NEXT: s_waitcnt vmcnt(0)
+; GISEL-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:16
+; GISEL-NEXT: s_setpc_b64 s[4:5]
entry:
%alloca = alloca double, align 8, addrspace(5)
tail call void @byval_align16_f64_arg(<32 x i32> %val, ptr addrspace(5) byval(double) align 16 %alloca)
diff --git a/llvm/test/CodeGen/AMDGPU/call-c-function.ll b/llvm/test/CodeGen/AMDGPU/call-c-function.ll
index e1bb3ea..4fbc727 100644
--- a/llvm/test/CodeGen/AMDGPU/call-c-function.ll
+++ b/llvm/test/CodeGen/AMDGPU/call-c-function.ll
@@ -1,21 +1,68 @@
; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
-; RUN: llc -global-isel=0 -stop-after=finalize-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 -o - %s | FileCheck -enable-var-scope %s
+; RUN: llc -global-isel=0 -stop-after=finalize-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 -o - %s | FileCheck -check-prefix=SDAG -enable-var-scope %s
+; RUN: llc -global-isel=1 -stop-after=finalize-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 -o - %s | FileCheck -check-prefix=GISEL -enable-var-scope %s
; Test that we don't explode on calls from shaders to functions with the C calling convention.
define amdgpu_ps void @amdgpu_ps_call_default_cc() {
- ; CHECK-LABEL: name: amdgpu_ps_call_default_cc
- ; CHECK: bb.0.main_body:
- ; CHECK-NEXT: S_ENDPGM 0
+ ; SDAG-LABEL: name: amdgpu_ps_call_default_cc
+ ; SDAG: bb.0.main_body:
+ ; SDAG-NEXT: S_ENDPGM 0
+ ;
+ ; GISEL-LABEL: name: amdgpu_ps_call_default_cc
+ ; GISEL: bb.1.main_body:
+ ; GISEL-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc, implicit-def $sgpr32, implicit $sgpr32
+ ; GISEL-NEXT: [[DEF:%[0-9]+]]:sreg_64 = IMPLICIT_DEF
+ ; GISEL-NEXT: [[DEF1:%[0-9]+]]:sreg_64 = IMPLICIT_DEF
+ ; GISEL-NEXT: [[DEF2:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
+ ; GISEL-NEXT: [[COPY:%[0-9]+]]:sgpr_128 = COPY $sgpr96_sgpr97_sgpr98_sgpr99
+ ; GISEL-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY]]
+ ; GISEL-NEXT: $sgpr4_sgpr5 = COPY [[DEF]]
+ ; GISEL-NEXT: $sgpr6_sgpr7 = COPY [[DEF]]
+ ; GISEL-NEXT: [[S_MOV_B:%[0-9]+]]:sreg_64 = S_MOV_B64_IMM_PSEUDO 0
+ ; GISEL-NEXT: $sgpr8_sgpr9 = COPY [[S_MOV_B]]
+ ; GISEL-NEXT: $sgpr10_sgpr11 = COPY [[DEF1]]
+ ; GISEL-NEXT: $sgpr12 = COPY [[DEF2]]
+ ; GISEL-NEXT: $sgpr13 = COPY [[DEF2]]
+ ; GISEL-NEXT: $sgpr14 = COPY [[DEF2]]
+ ; GISEL-NEXT: $sgpr15 = COPY [[DEF2]]
+ ; GISEL-NEXT: $vgpr31 = COPY [[DEF2]]
+ ; GISEL-NEXT: [[S_MOV_B1:%[0-9]+]]:sreg_64 = S_MOV_B64_IMM_PSEUDO 0
+ ; GISEL-NEXT: $sgpr30_sgpr31 = noconvergent SI_CALL [[S_MOV_B1]], 0, csr_amdgpu, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $vgpr31
+ ; GISEL-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $scc, implicit-def $sgpr32, implicit $sgpr32
+ ; GISEL-NEXT: S_ENDPGM 0
main_body:
call void null()
ret void
}
define amdgpu_gfx void @amdgpu_gfx_call_default_cc() {
- ; CHECK-LABEL: name: amdgpu_gfx_call_default_cc
- ; CHECK: bb.0.main_body:
- ; CHECK-NEXT: SI_RETURN
+ ; SDAG-LABEL: name: amdgpu_gfx_call_default_cc
+ ; SDAG: bb.0.main_body:
+ ; SDAG-NEXT: SI_RETURN
+ ;
+ ; GISEL-LABEL: name: amdgpu_gfx_call_default_cc
+ ; GISEL: bb.1.main_body:
+ ; GISEL-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc, implicit-def $sgpr32, implicit $sgpr32
+ ; GISEL-NEXT: [[DEF:%[0-9]+]]:sreg_64 = IMPLICIT_DEF
+ ; GISEL-NEXT: [[DEF1:%[0-9]+]]:sreg_64 = IMPLICIT_DEF
+ ; GISEL-NEXT: [[DEF2:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
+ ; GISEL-NEXT: [[COPY:%[0-9]+]]:sgpr_128 = COPY $sgpr0_sgpr1_sgpr2_sgpr3
+ ; GISEL-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY]]
+ ; GISEL-NEXT: $sgpr4_sgpr5 = COPY [[DEF]]
+ ; GISEL-NEXT: $sgpr6_sgpr7 = COPY [[DEF]]
+ ; GISEL-NEXT: [[S_MOV_B:%[0-9]+]]:sreg_64 = S_MOV_B64_IMM_PSEUDO 0
+ ; GISEL-NEXT: $sgpr8_sgpr9 = COPY [[S_MOV_B]]
+ ; GISEL-NEXT: $sgpr10_sgpr11 = COPY [[DEF1]]
+ ; GISEL-NEXT: $sgpr12 = COPY [[DEF2]]
+ ; GISEL-NEXT: $sgpr13 = COPY [[DEF2]]
+ ; GISEL-NEXT: $sgpr14 = COPY [[DEF2]]
+ ; GISEL-NEXT: $sgpr15 = COPY [[DEF2]]
+ ; GISEL-NEXT: $vgpr31 = COPY [[DEF2]]
+ ; GISEL-NEXT: [[S_MOV_B1:%[0-9]+]]:sreg_64 = S_MOV_B64_IMM_PSEUDO 0
+ ; GISEL-NEXT: $sgpr30_sgpr31 = noconvergent SI_CALL [[S_MOV_B1]], 0, csr_amdgpu, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $vgpr31
+ ; GISEL-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $scc, implicit-def $sgpr32, implicit $sgpr32
+ ; GISEL-NEXT: SI_RETURN
main_body:
call void null()
ret void
diff --git a/llvm/test/CodeGen/AMDGPU/call-constexpr.ll b/llvm/test/CodeGen/AMDGPU/call-constexpr.ll
index 5f324df..fe0b018 100644
--- a/llvm/test/CodeGen/AMDGPU/call-constexpr.ll
+++ b/llvm/test/CodeGen/AMDGPU/call-constexpr.ll
@@ -1,84 +1,341 @@
-; RUN: llc -mtriple=amdgcn-amd-amdhsa < %s | FileCheck -check-prefix=GCN %s
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
+; RUN: llc -global-isel=0 -mtriple=amdgcn-amd-amdhsa < %s | FileCheck -check-prefixes=GCN,SDAG %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn-amd-amdhsa < %s | FileCheck -check-prefixes=GCN,GISEL %s
-; GCN-LABEL: {{^}}test_bitcast_return_type_noinline:
-; GCN: s_getpc_b64
-; GCN: s_add_u32 s{{[0-9]+}}, s{{[0-9]+}}, ret_i32_noinline@rel32@lo+4
-; GCN: s_addc_u32 s{{[0-9]+}}, s{{[0-9]+}}, ret_i32_noinline@rel32@hi+12
-; GCN: s_swappc_b64
define amdgpu_kernel void @test_bitcast_return_type_noinline() #0 {
+; SDAG-LABEL: test_bitcast_return_type_noinline:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_mov_b32 s32, 0
+; SDAG-NEXT: s_mov_b32 flat_scratch_lo, s13
+; SDAG-NEXT: s_add_i32 s12, s12, s17
+; SDAG-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; SDAG-NEXT: s_add_u32 s0, s0, s17
+; SDAG-NEXT: s_addc_u32 s1, s1, 0
+; SDAG-NEXT: s_mov_b32 s13, s15
+; SDAG-NEXT: s_mov_b32 s12, s14
+; SDAG-NEXT: s_getpc_b64 s[18:19]
+; SDAG-NEXT: s_add_u32 s18, s18, ret_i32_noinline@rel32@lo+4
+; SDAG-NEXT: s_addc_u32 s19, s19, ret_i32_noinline@rel32@hi+12
+; SDAG-NEXT: v_lshlrev_b32_e32 v2, 20, v2
+; SDAG-NEXT: v_lshlrev_b32_e32 v1, 10, v1
+; SDAG-NEXT: v_or_b32_e32 v0, v0, v1
+; SDAG-NEXT: v_or_b32_e32 v31, v0, v2
+; SDAG-NEXT: s_mov_b32 s14, s16
+; SDAG-NEXT: s_swappc_b64 s[30:31], s[18:19]
+; SDAG-NEXT: v_add_f32_e32 v0, 1.0, v0
+; SDAG-NEXT: flat_store_dword v[0:1], v0
+; SDAG-NEXT: s_waitcnt vmcnt(0)
+; SDAG-NEXT: s_endpgm
+;
+; GISEL-LABEL: test_bitcast_return_type_noinline:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_mov_b32 s32, 0
+; GISEL-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GISEL-NEXT: s_add_i32 s12, s12, s17
+; GISEL-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; GISEL-NEXT: s_add_u32 s0, s0, s17
+; GISEL-NEXT: s_addc_u32 s1, s1, 0
+; GISEL-NEXT: s_mov_b32 s13, s15
+; GISEL-NEXT: s_mov_b32 s12, s14
+; GISEL-NEXT: v_lshlrev_b32_e32 v1, 10, v1
+; GISEL-NEXT: v_lshlrev_b32_e32 v2, 20, v2
+; GISEL-NEXT: v_or_b32_e32 v0, v0, v1
+; GISEL-NEXT: v_or_b32_e32 v31, v0, v2
+; GISEL-NEXT: s_getpc_b64 s[18:19]
+; GISEL-NEXT: s_add_u32 s18, s18, ret_i32_noinline@rel32@lo+4
+; GISEL-NEXT: s_addc_u32 s19, s19, ret_i32_noinline@rel32@hi+12
+; GISEL-NEXT: s_mov_b32 s14, s16
+; GISEL-NEXT: s_swappc_b64 s[30:31], s[18:19]
+; GISEL-NEXT: v_add_f32_e32 v0, 1.0, v0
+; GISEL-NEXT: flat_store_dword v[0:1], v0
+; GISEL-NEXT: s_waitcnt vmcnt(0)
+; GISEL-NEXT: s_endpgm
%val = call float @ret_i32_noinline()
%op = fadd float %val, 1.0
store volatile float %op, ptr addrspace(1) poison
ret void
}
-; GCN-LABEL: {{^}}test_bitcast_return_type_alwaysinline:
-; GCN: s_swappc_b64
define amdgpu_kernel void @test_bitcast_return_type_alwaysinline() #0 {
+; SDAG-LABEL: test_bitcast_return_type_alwaysinline:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_mov_b32 s32, 0
+; SDAG-NEXT: s_mov_b32 flat_scratch_lo, s13
+; SDAG-NEXT: s_add_i32 s12, s12, s17
+; SDAG-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; SDAG-NEXT: s_add_u32 s0, s0, s17
+; SDAG-NEXT: s_addc_u32 s1, s1, 0
+; SDAG-NEXT: s_mov_b32 s13, s15
+; SDAG-NEXT: s_mov_b32 s12, s14
+; SDAG-NEXT: s_getpc_b64 s[18:19]
+; SDAG-NEXT: s_add_u32 s18, s18, ret_i32_alwaysinline@rel32@lo+4
+; SDAG-NEXT: s_addc_u32 s19, s19, ret_i32_alwaysinline@rel32@hi+12
+; SDAG-NEXT: v_lshlrev_b32_e32 v2, 20, v2
+; SDAG-NEXT: v_lshlrev_b32_e32 v1, 10, v1
+; SDAG-NEXT: v_or_b32_e32 v0, v0, v1
+; SDAG-NEXT: v_or_b32_e32 v31, v0, v2
+; SDAG-NEXT: s_mov_b32 s14, s16
+; SDAG-NEXT: s_swappc_b64 s[30:31], s[18:19]
+; SDAG-NEXT: v_add_f32_e32 v0, 1.0, v0
+; SDAG-NEXT: flat_store_dword v[0:1], v0
+; SDAG-NEXT: s_waitcnt vmcnt(0)
+; SDAG-NEXT: s_endpgm
+;
+; GISEL-LABEL: test_bitcast_return_type_alwaysinline:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_mov_b32 s32, 0
+; GISEL-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GISEL-NEXT: s_add_i32 s12, s12, s17
+; GISEL-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; GISEL-NEXT: s_add_u32 s0, s0, s17
+; GISEL-NEXT: s_addc_u32 s1, s1, 0
+; GISEL-NEXT: s_mov_b32 s13, s15
+; GISEL-NEXT: s_mov_b32 s12, s14
+; GISEL-NEXT: v_lshlrev_b32_e32 v1, 10, v1
+; GISEL-NEXT: v_lshlrev_b32_e32 v2, 20, v2
+; GISEL-NEXT: v_or_b32_e32 v0, v0, v1
+; GISEL-NEXT: v_or_b32_e32 v31, v0, v2
+; GISEL-NEXT: s_getpc_b64 s[18:19]
+; GISEL-NEXT: s_add_u32 s18, s18, ret_i32_alwaysinline@rel32@lo+4
+; GISEL-NEXT: s_addc_u32 s19, s19, ret_i32_alwaysinline@rel32@hi+12
+; GISEL-NEXT: s_mov_b32 s14, s16
+; GISEL-NEXT: s_swappc_b64 s[30:31], s[18:19]
+; GISEL-NEXT: v_add_f32_e32 v0, 1.0, v0
+; GISEL-NEXT: flat_store_dword v[0:1], v0
+; GISEL-NEXT: s_waitcnt vmcnt(0)
+; GISEL-NEXT: s_endpgm
%val = call float @ret_i32_alwaysinline()
%op = fadd float %val, 1.0
store volatile float %op, ptr addrspace(1) poison
ret void
}
-; GCN-LABEL: {{^}}test_bitcast_argument_type:
-; GCN: s_getpc_b64
-; GCN: s_add_u32 s{{[0-9]+}}, s{{[0-9]+}}, ident_i32@rel32@lo+4
-; GCN: s_addc_u32 s{{[0-9]+}}, s{{[0-9]+}}, ident_i32@rel32@hi+12
-; GCN: s_swappc_b64
define amdgpu_kernel void @test_bitcast_argument_type() #0 {
+; SDAG-LABEL: test_bitcast_argument_type:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_mov_b32 s32, 0
+; SDAG-NEXT: s_mov_b32 flat_scratch_lo, s13
+; SDAG-NEXT: s_add_i32 s12, s12, s17
+; SDAG-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; SDAG-NEXT: s_add_u32 s0, s0, s17
+; SDAG-NEXT: s_addc_u32 s1, s1, 0
+; SDAG-NEXT: s_mov_b32 s13, s15
+; SDAG-NEXT: s_mov_b32 s12, s14
+; SDAG-NEXT: s_getpc_b64 s[18:19]
+; SDAG-NEXT: s_add_u32 s18, s18, ident_i32@rel32@lo+4
+; SDAG-NEXT: s_addc_u32 s19, s19, ident_i32@rel32@hi+12
+; SDAG-NEXT: v_lshlrev_b32_e32 v2, 20, v2
+; SDAG-NEXT: v_lshlrev_b32_e32 v1, 10, v1
+; SDAG-NEXT: v_or_b32_e32 v0, v0, v1
+; SDAG-NEXT: v_or_b32_e32 v31, v0, v2
+; SDAG-NEXT: v_mov_b32_e32 v0, 2.0
+; SDAG-NEXT: s_mov_b32 s14, s16
+; SDAG-NEXT: s_swappc_b64 s[30:31], s[18:19]
+; SDAG-NEXT: v_add_i32_e32 v0, vcc, 1, v0
+; SDAG-NEXT: flat_store_dword v[0:1], v0
+; SDAG-NEXT: s_waitcnt vmcnt(0)
+; SDAG-NEXT: s_endpgm
+;
+; GISEL-LABEL: test_bitcast_argument_type:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_mov_b32 s32, 0
+; GISEL-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GISEL-NEXT: s_add_i32 s12, s12, s17
+; GISEL-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; GISEL-NEXT: s_add_u32 s0, s0, s17
+; GISEL-NEXT: s_addc_u32 s1, s1, 0
+; GISEL-NEXT: s_mov_b32 s13, s15
+; GISEL-NEXT: s_mov_b32 s12, s14
+; GISEL-NEXT: v_lshlrev_b32_e32 v1, 10, v1
+; GISEL-NEXT: v_lshlrev_b32_e32 v2, 20, v2
+; GISEL-NEXT: s_getpc_b64 s[18:19]
+; GISEL-NEXT: s_add_u32 s18, s18, ident_i32@rel32@lo+4
+; GISEL-NEXT: s_addc_u32 s19, s19, ident_i32@rel32@hi+12
+; GISEL-NEXT: v_or_b32_e32 v0, v0, v1
+; GISEL-NEXT: v_or_b32_e32 v31, v0, v2
+; GISEL-NEXT: v_mov_b32_e32 v0, 2.0
+; GISEL-NEXT: s_mov_b32 s14, s16
+; GISEL-NEXT: s_swappc_b64 s[30:31], s[18:19]
+; GISEL-NEXT: v_add_i32_e32 v0, vcc, 1, v0
+; GISEL-NEXT: flat_store_dword v[0:1], v0
+; GISEL-NEXT: s_waitcnt vmcnt(0)
+; GISEL-NEXT: s_endpgm
%val = call i32 @ident_i32(float 2.0)
%op = add i32 %val, 1
store volatile i32 %op, ptr addrspace(1) poison
ret void
}
-; GCN-LABEL: {{^}}test_bitcast_argument_and_return_types:
-; GCN: s_getpc_b64
-; GCN: s_add_u32 s{{[0-9]+}}, s{{[0-9]+}}, ident_i32@rel32@lo+4
-; GCN: s_addc_u32 s{{[0-9]+}}, s{{[0-9]+}}, ident_i32@rel32@hi+12
-; GCN: s_swappc_b64
define amdgpu_kernel void @test_bitcast_argument_and_return_types() #0 {
+; SDAG-LABEL: test_bitcast_argument_and_return_types:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_mov_b32 s32, 0
+; SDAG-NEXT: s_mov_b32 flat_scratch_lo, s13
+; SDAG-NEXT: s_add_i32 s12, s12, s17
+; SDAG-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; SDAG-NEXT: s_add_u32 s0, s0, s17
+; SDAG-NEXT: s_addc_u32 s1, s1, 0
+; SDAG-NEXT: s_mov_b32 s13, s15
+; SDAG-NEXT: s_mov_b32 s12, s14
+; SDAG-NEXT: s_getpc_b64 s[18:19]
+; SDAG-NEXT: s_add_u32 s18, s18, ident_i32@rel32@lo+4
+; SDAG-NEXT: s_addc_u32 s19, s19, ident_i32@rel32@hi+12
+; SDAG-NEXT: v_lshlrev_b32_e32 v2, 20, v2
+; SDAG-NEXT: v_lshlrev_b32_e32 v1, 10, v1
+; SDAG-NEXT: v_or_b32_e32 v0, v0, v1
+; SDAG-NEXT: v_or_b32_e32 v31, v0, v2
+; SDAG-NEXT: v_mov_b32_e32 v0, 2.0
+; SDAG-NEXT: s_mov_b32 s14, s16
+; SDAG-NEXT: s_swappc_b64 s[30:31], s[18:19]
+; SDAG-NEXT: v_add_f32_e32 v0, 1.0, v0
+; SDAG-NEXT: flat_store_dword v[0:1], v0
+; SDAG-NEXT: s_waitcnt vmcnt(0)
+; SDAG-NEXT: s_endpgm
+;
+; GISEL-LABEL: test_bitcast_argument_and_return_types:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_mov_b32 s32, 0
+; GISEL-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GISEL-NEXT: s_add_i32 s12, s12, s17
+; GISEL-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; GISEL-NEXT: s_add_u32 s0, s0, s17
+; GISEL-NEXT: s_addc_u32 s1, s1, 0
+; GISEL-NEXT: s_mov_b32 s13, s15
+; GISEL-NEXT: s_mov_b32 s12, s14
+; GISEL-NEXT: v_lshlrev_b32_e32 v1, 10, v1
+; GISEL-NEXT: v_lshlrev_b32_e32 v2, 20, v2
+; GISEL-NEXT: s_getpc_b64 s[18:19]
+; GISEL-NEXT: s_add_u32 s18, s18, ident_i32@rel32@lo+4
+; GISEL-NEXT: s_addc_u32 s19, s19, ident_i32@rel32@hi+12
+; GISEL-NEXT: v_or_b32_e32 v0, v0, v1
+; GISEL-NEXT: v_or_b32_e32 v31, v0, v2
+; GISEL-NEXT: v_mov_b32_e32 v0, 2.0
+; GISEL-NEXT: s_mov_b32 s14, s16
+; GISEL-NEXT: s_swappc_b64 s[30:31], s[18:19]
+; GISEL-NEXT: v_add_f32_e32 v0, 1.0, v0
+; GISEL-NEXT: flat_store_dword v[0:1], v0
+; GISEL-NEXT: s_waitcnt vmcnt(0)
+; GISEL-NEXT: s_endpgm
%val = call float @ident_i32(float 2.0)
%op = fadd float %val, 1.0
store volatile float %op, ptr addrspace(1) poison
ret void
}
-; GCN-LABEL: {{^}}use_workitem_id_x:
-; GCN: s_waitcnt
-; GCN-NEXT: v_and_b32_e32 [[TMP:v[0-9]+]], 0x3ff, v31
-; GCN-NEXT: v_add_i32_e32 v0, vcc, [[TMP]], v0
-; GCN-NEXT: s_setpc_b64
define hidden i32 @use_workitem_id_x(i32 %arg0) #3 {
+; GCN-LABEL: use_workitem_id_x:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_and_b32_e32 v1, 0x3ff, v31
+; GCN-NEXT: v_add_i32_e32 v0, vcc, v1, v0
+; GCN-NEXT: s_setpc_b64 s[30:31]
%id = call i32 @llvm.amdgcn.workitem.id.x()
%op = add i32 %id, %arg0
ret i32 %op
}
-; GCN-LABEL: {{^}}test_bitcast_use_workitem_id_x:
-; GCN: v_mov_b32_e32 v31, v0
-; GCN: s_getpc_b64
-; GCN: s_add_u32 s{{[0-9]+}}, s{{[0-9]+}}, use_workitem_id_x@rel32@lo+4
-; GCN: s_addc_u32 s{{[0-9]+}}, s{{[0-9]+}}, use_workitem_id_x@rel32@hi+12
-; GCN: v_mov_b32_e32 v0, 9
-; GCN: s_swappc_b64
-; GCN: v_add_f32_e32
define amdgpu_kernel void @test_bitcast_use_workitem_id_x() #3 {
+; SDAG-LABEL: test_bitcast_use_workitem_id_x:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_mov_b32 s32, 0
+; SDAG-NEXT: s_mov_b32 flat_scratch_lo, s13
+; SDAG-NEXT: s_add_i32 s12, s12, s17
+; SDAG-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; SDAG-NEXT: s_add_u32 s0, s0, s17
+; SDAG-NEXT: s_addc_u32 s1, s1, 0
+; SDAG-NEXT: s_mov_b32 s13, s15
+; SDAG-NEXT: s_mov_b32 s12, s14
+; SDAG-NEXT: v_mov_b32_e32 v31, v0
+; SDAG-NEXT: s_getpc_b64 s[18:19]
+; SDAG-NEXT: s_add_u32 s18, s18, use_workitem_id_x@rel32@lo+4
+; SDAG-NEXT: s_addc_u32 s19, s19, use_workitem_id_x@rel32@hi+12
+; SDAG-NEXT: v_mov_b32_e32 v0, 9
+; SDAG-NEXT: s_mov_b32 s14, s16
+; SDAG-NEXT: s_swappc_b64 s[30:31], s[18:19]
+; SDAG-NEXT: v_add_f32_e32 v0, 1.0, v0
+; SDAG-NEXT: flat_store_dword v[0:1], v0
+; SDAG-NEXT: s_waitcnt vmcnt(0)
+; SDAG-NEXT: s_endpgm
+;
+; GISEL-LABEL: test_bitcast_use_workitem_id_x:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_mov_b32 s32, 0
+; GISEL-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GISEL-NEXT: s_add_i32 s12, s12, s17
+; GISEL-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; GISEL-NEXT: s_add_u32 s0, s0, s17
+; GISEL-NEXT: s_addc_u32 s1, s1, 0
+; GISEL-NEXT: v_mov_b32_e32 v31, v0
+; GISEL-NEXT: s_mov_b32 s13, s15
+; GISEL-NEXT: s_mov_b32 s12, s14
+; GISEL-NEXT: s_getpc_b64 s[18:19]
+; GISEL-NEXT: s_add_u32 s18, s18, use_workitem_id_x@rel32@lo+4
+; GISEL-NEXT: s_addc_u32 s19, s19, use_workitem_id_x@rel32@hi+12
+; GISEL-NEXT: v_mov_b32_e32 v0, 9
+; GISEL-NEXT: s_mov_b32 s14, s16
+; GISEL-NEXT: s_swappc_b64 s[30:31], s[18:19]
+; GISEL-NEXT: v_add_f32_e32 v0, 1.0, v0
+; GISEL-NEXT: flat_store_dword v[0:1], v0
+; GISEL-NEXT: s_waitcnt vmcnt(0)
+; GISEL-NEXT: s_endpgm
%val = call float @use_workitem_id_x(i32 9)
%op = fadd float %val, 1.0
store volatile float %op, ptr addrspace(1) poison
ret void
}
-; GCN-LABEL: {{^}}test_invoke:
-; GCN: s_getpc_b64
-; GCN: s_add_u32 s{{[0-9]+}}, s{{[0-9]+}}, ident_i32@rel32@lo+4
-; GCN: s_addc_u32 s{{[0-9]+}}, s{{[0-9]+}}, ident_i32@rel32@hi+12
-; GCN: s_swappc_b64
@_ZTIi = external global ptr
declare i32 @__gxx_personality_v0(...)
define amdgpu_kernel void @test_invoke() #0 personality ptr @__gxx_personality_v0 {
+; SDAG-LABEL: test_invoke:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_mov_b32 s32, 0
+; SDAG-NEXT: s_mov_b32 flat_scratch_lo, s13
+; SDAG-NEXT: s_add_i32 s12, s12, s17
+; SDAG-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; SDAG-NEXT: s_add_u32 s0, s0, s17
+; SDAG-NEXT: s_addc_u32 s1, s1, 0
+; SDAG-NEXT: s_mov_b32 s13, s15
+; SDAG-NEXT: s_mov_b32 s12, s14
+; SDAG-NEXT: s_getpc_b64 s[18:19]
+; SDAG-NEXT: s_add_u32 s18, s18, ident_i32@rel32@lo+4
+; SDAG-NEXT: s_addc_u32 s19, s19, ident_i32@rel32@hi+12
+; SDAG-NEXT: v_lshlrev_b32_e32 v2, 20, v2
+; SDAG-NEXT: v_lshlrev_b32_e32 v1, 10, v1
+; SDAG-NEXT: v_or_b32_e32 v0, v0, v1
+; SDAG-NEXT: v_or_b32_e32 v31, v0, v2
+; SDAG-NEXT: v_mov_b32_e32 v0, 2.0
+; SDAG-NEXT: s_mov_b32 s14, s16
+; SDAG-NEXT: s_swappc_b64 s[30:31], s[18:19]
+; SDAG-NEXT: v_add_f32_e32 v0, 1.0, v0
+; SDAG-NEXT: flat_store_dword v[0:1], v0
+; SDAG-NEXT: s_waitcnt vmcnt(0)
+; SDAG-NEXT: s_endpgm
+;
+; GISEL-LABEL: test_invoke:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_mov_b32 s32, 0
+; GISEL-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GISEL-NEXT: s_add_i32 s12, s12, s17
+; GISEL-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; GISEL-NEXT: s_add_u32 s0, s0, s17
+; GISEL-NEXT: s_addc_u32 s1, s1, 0
+; GISEL-NEXT: s_mov_b32 s13, s15
+; GISEL-NEXT: s_mov_b32 s12, s14
+; GISEL-NEXT: v_lshlrev_b32_e32 v1, 10, v1
+; GISEL-NEXT: v_lshlrev_b32_e32 v2, 20, v2
+; GISEL-NEXT: s_getpc_b64 s[18:19]
+; GISEL-NEXT: s_add_u32 s18, s18, ident_i32@rel32@lo+4
+; GISEL-NEXT: s_addc_u32 s19, s19, ident_i32@rel32@hi+12
+; GISEL-NEXT: v_or_b32_e32 v0, v0, v1
+; GISEL-NEXT: v_or_b32_e32 v31, v0, v2
+; GISEL-NEXT: v_mov_b32_e32 v0, 2.0
+; GISEL-NEXT: s_mov_b32 s14, s16
+; GISEL-NEXT: s_swappc_b64 s[30:31], s[18:19]
+; GISEL-NEXT: v_add_f32_e32 v0, 1.0, v0
+; GISEL-NEXT: flat_store_dword v[0:1], v0
+; GISEL-NEXT: s_waitcnt vmcnt(0)
+; GISEL-NEXT: s_endpgm
%val = invoke float @ident_i32(float 2.0)
to label %continue unwind label %broken
@@ -96,14 +353,28 @@ continue:
; arguments before we lower any calls to them.
define hidden i32 @ret_i32_noinline() #0 {
+; GCN-LABEL: ret_i32_noinline:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v0, 4
+; GCN-NEXT: s_setpc_b64 s[30:31]
ret i32 4
}
define hidden i32 @ret_i32_alwaysinline() #1 {
+; GCN-LABEL: ret_i32_alwaysinline:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_mov_b32_e32 v0, 4
+; GCN-NEXT: s_setpc_b64 s[30:31]
ret i32 4
}
define hidden i32 @ident_i32(i32 %i) #0 {
+; GCN-LABEL: ident_i32:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: s_setpc_b64 s[30:31]
ret i32 %i
}
diff --git a/llvm/test/CodeGen/AMDGPU/call-defs-mode-register.ll b/llvm/test/CodeGen/AMDGPU/call-defs-mode-register.ll
index ffe536d..4b5a49f 100644
--- a/llvm/test/CodeGen/AMDGPU/call-defs-mode-register.ll
+++ b/llvm/test/CodeGen/AMDGPU/call-defs-mode-register.ll
@@ -1,5 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -simplify-mir -stop-after=finalize-isel < %s | FileCheck %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -simplify-mir -stop-after=finalize-isel < %s | FileCheck -check-prefixes=SDAG %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -simplify-mir -stop-after=finalize-isel < %s | FileCheck -check-prefixes=GISEL %s
; Check that call / asm get an implicit-def $mode added to them in
; strictfp functions.
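; Concretely, the shapes being checked look like this (lifted from the SDAG
; checks below; shown here only as an illustration):
;   $sgpr30_sgpr31 = SI_CALL killed [[...]], @maybe_defs_mode, csr_amdgpu, ..., implicit-def $mode
;   INLINEASM &"; maybe defs mode", 1 /* sideeffect attdialect */, implicit-def $mode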
@@ -7,46 +8,80 @@
declare protected void @maybe_defs_mode() #0
define float @call_changes_mode(float %x, float %y) #0 {
- ; CHECK-LABEL: name: call_changes_mode
- ; CHECK: bb.0 (%ir-block.0):
- ; CHECK-NEXT: liveins: $vgpr0, $vgpr1
- ; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr1
- ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; CHECK-NEXT: ADJCALLSTACKUP 0, 0, implicit-def dead $scc, implicit-def $sgpr32, implicit $sgpr32
- ; CHECK-NEXT: [[SI_PC_ADD_REL_OFFSET:%[0-9]+]]:sreg_64 = SI_PC_ADD_REL_OFFSET target-flags(amdgpu-rel32-lo) @maybe_defs_mode, target-flags(amdgpu-rel32-hi) @maybe_defs_mode, implicit-def dead $scc
- ; CHECK-NEXT: [[COPY2:%[0-9]+]]:sgpr_128 = COPY $sgpr0_sgpr1_sgpr2_sgpr3
- ; CHECK-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY2]]
- ; CHECK-NEXT: $sgpr30_sgpr31 = SI_CALL killed [[SI_PC_ADD_REL_OFFSET]], @maybe_defs_mode, csr_amdgpu, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit-def $mode
- ; CHECK-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def dead $scc, implicit-def $sgpr32, implicit $sgpr32
- ; CHECK-NEXT: [[V_ADD_F32_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, [[COPY1]], 0, [[COPY]], 0, 0, implicit $mode, implicit $exec
- ; CHECK-NEXT: $vgpr0 = COPY [[V_ADD_F32_e64_]]
- ; CHECK-NEXT: SI_RETURN implicit $vgpr0
+ ; SDAG-LABEL: name: call_changes_mode
+ ; SDAG: bb.0 (%ir-block.0):
+ ; SDAG-NEXT: liveins: $vgpr0, $vgpr1
+ ; SDAG-NEXT: {{ $}}
+ ; SDAG-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; SDAG-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; SDAG-NEXT: ADJCALLSTACKUP 0, 0, implicit-def dead $scc, implicit-def $sgpr32, implicit $sgpr32
+ ; SDAG-NEXT: [[SI_PC_ADD_REL_OFFSET:%[0-9]+]]:sreg_64 = SI_PC_ADD_REL_OFFSET target-flags(amdgpu-rel32-lo) @maybe_defs_mode, target-flags(amdgpu-rel32-hi) @maybe_defs_mode, implicit-def dead $scc
+ ; SDAG-NEXT: [[COPY2:%[0-9]+]]:sgpr_128 = COPY $sgpr0_sgpr1_sgpr2_sgpr3
+ ; SDAG-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY2]]
+ ; SDAG-NEXT: $sgpr30_sgpr31 = SI_CALL killed [[SI_PC_ADD_REL_OFFSET]], @maybe_defs_mode, csr_amdgpu, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit-def $mode
+ ; SDAG-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def dead $scc, implicit-def $sgpr32, implicit $sgpr32
+ ; SDAG-NEXT: [[V_ADD_F32_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, [[COPY1]], 0, [[COPY]], 0, 0, implicit $mode, implicit $exec
+ ; SDAG-NEXT: $vgpr0 = COPY [[V_ADD_F32_e64_]]
+ ; SDAG-NEXT: SI_RETURN implicit $vgpr0
+ ;
+ ; GISEL-LABEL: name: call_changes_mode
+ ; GISEL: bb.1 (%ir-block.0):
+ ; GISEL-NEXT: liveins: $vgpr0, $vgpr1
+ ; GISEL-NEXT: {{ $}}
+ ; GISEL-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GISEL-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GISEL-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc, implicit-def $sgpr32, implicit $sgpr32
+ ; GISEL-NEXT: [[COPY2:%[0-9]+]]:sgpr_128 = COPY $sgpr0_sgpr1_sgpr2_sgpr3
+ ; GISEL-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY2]]
+ ; GISEL-NEXT: [[SI_PC_ADD_REL_OFFSET:%[0-9]+]]:sreg_64 = SI_PC_ADD_REL_OFFSET target-flags(amdgpu-rel32-lo) @maybe_defs_mode, target-flags(amdgpu-rel32-hi) @maybe_defs_mode, implicit-def $scc
+ ; GISEL-NEXT: $sgpr30_sgpr31 = noconvergent SI_CALL [[SI_PC_ADD_REL_OFFSET]], @maybe_defs_mode, csr_amdgpu, implicit $sgpr0_sgpr1_sgpr2_sgpr3
+ ; GISEL-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $scc, implicit-def $sgpr32, implicit $sgpr32
+ ; GISEL-NEXT: [[V_ADD_F32_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, [[COPY]], 0, [[COPY1]], 0, 0, implicit $mode, implicit $exec
+ ; GISEL-NEXT: $vgpr0 = COPY [[V_ADD_F32_e64_]]
+ ; GISEL-NEXT: SI_RETURN implicit $vgpr0
call void @maybe_defs_mode()
%val = call float @llvm.experimental.constrained.fadd.f32(float %x, float %y, metadata !"round.dynamic", metadata !"fpexcept.ignore")
ret float %val
}
define void @tail_call_changes_mode() #0 {
- ; CHECK-LABEL: name: tail_call_changes_mode
- ; CHECK: bb.0 (%ir-block.0):
- ; CHECK-NEXT: [[SI_PC_ADD_REL_OFFSET:%[0-9]+]]:ccr_sgpr_64 = SI_PC_ADD_REL_OFFSET target-flags(amdgpu-rel32-lo) @maybe_defs_mode, target-flags(amdgpu-rel32-hi) @maybe_defs_mode, implicit-def dead $scc
- ; CHECK-NEXT: SI_TCRETURN killed [[SI_PC_ADD_REL_OFFSET]], @maybe_defs_mode, 0, csr_amdgpu, implicit-def $mode
+ ; SDAG-LABEL: name: tail_call_changes_mode
+ ; SDAG: bb.0 (%ir-block.0):
+ ; SDAG-NEXT: [[SI_PC_ADD_REL_OFFSET:%[0-9]+]]:ccr_sgpr_64 = SI_PC_ADD_REL_OFFSET target-flags(amdgpu-rel32-lo) @maybe_defs_mode, target-flags(amdgpu-rel32-hi) @maybe_defs_mode, implicit-def dead $scc
+ ; SDAG-NEXT: SI_TCRETURN killed [[SI_PC_ADD_REL_OFFSET]], @maybe_defs_mode, 0, csr_amdgpu, implicit-def $mode
+ ;
+ ; GISEL-LABEL: name: tail_call_changes_mode
+ ; GISEL: bb.1 (%ir-block.0):
+ ; GISEL-NEXT: [[COPY:%[0-9]+]]:sgpr_128 = COPY $sgpr0_sgpr1_sgpr2_sgpr3
+ ; GISEL-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY]]
+ ; GISEL-NEXT: [[SI_PC_ADD_REL_OFFSET:%[0-9]+]]:ccr_sgpr_64 = SI_PC_ADD_REL_OFFSET target-flags(amdgpu-rel32-lo) @maybe_defs_mode, target-flags(amdgpu-rel32-hi) @maybe_defs_mode, implicit-def $scc
+ ; GISEL-NEXT: SI_TCRETURN [[SI_PC_ADD_REL_OFFSET]], @maybe_defs_mode, 0, csr_amdgpu, implicit $sgpr0_sgpr1_sgpr2_sgpr3
tail call void @maybe_defs_mode()
ret void
}
define float @asm_changes_mode(float %x, float %y) #0 {
- ; CHECK-LABEL: name: asm_changes_mode
- ; CHECK: bb.0 (%ir-block.0):
- ; CHECK-NEXT: liveins: $vgpr0, $vgpr1
- ; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr1
- ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; CHECK-NEXT: INLINEASM &"; maybe defs mode", 1 /* sideeffect attdialect */, implicit-def $mode
- ; CHECK-NEXT: [[V_ADD_F32_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, [[COPY1]], 0, [[COPY]], 0, 0, implicit $mode, implicit $exec
- ; CHECK-NEXT: $vgpr0 = COPY [[V_ADD_F32_e64_]]
- ; CHECK-NEXT: SI_RETURN implicit $vgpr0
+ ; SDAG-LABEL: name: asm_changes_mode
+ ; SDAG: bb.0 (%ir-block.0):
+ ; SDAG-NEXT: liveins: $vgpr0, $vgpr1
+ ; SDAG-NEXT: {{ $}}
+ ; SDAG-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; SDAG-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; SDAG-NEXT: INLINEASM &"; maybe defs mode", 1 /* sideeffect attdialect */, implicit-def $mode
+ ; SDAG-NEXT: [[V_ADD_F32_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, [[COPY1]], 0, [[COPY]], 0, 0, implicit $mode, implicit $exec
+ ; SDAG-NEXT: $vgpr0 = COPY [[V_ADD_F32_e64_]]
+ ; SDAG-NEXT: SI_RETURN implicit $vgpr0
+ ;
+ ; GISEL-LABEL: name: asm_changes_mode
+ ; GISEL: bb.1 (%ir-block.0):
+ ; GISEL-NEXT: liveins: $vgpr0, $vgpr1
+ ; GISEL-NEXT: {{ $}}
+ ; GISEL-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GISEL-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GISEL-NEXT: INLINEASM &"; maybe defs mode", 1 /* sideeffect attdialect */, implicit-def $mode
+ ; GISEL-NEXT: [[V_ADD_F32_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F32_e64 0, [[COPY]], 0, [[COPY1]], 0, 0, implicit $mode, implicit $exec
+ ; GISEL-NEXT: $vgpr0 = COPY [[V_ADD_F32_e64_]]
+ ; GISEL-NEXT: SI_RETURN implicit $vgpr0
call void asm sideeffect "; maybe defs mode", ""()
%val = call float @llvm.experimental.constrained.fadd.f32(float %x, float %y, metadata !"round.dynamic", metadata !"fpexcept.ignore")
ret float %val
diff --git a/llvm/test/CodeGen/AMDGPU/call-encoding.ll b/llvm/test/CodeGen/AMDGPU/call-encoding.ll
index 6954c34..6c36c242 100644
--- a/llvm/test/CodeGen/AMDGPU/call-encoding.ll
+++ b/llvm/test/CodeGen/AMDGPU/call-encoding.ll
@@ -1,5 +1,7 @@
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=fiji -filetype=obj < %s | llvm-objdump --triple=amdgcn--amdhsa --mcpu=fiji -d - | FileCheck --check-prefix=GCN %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -filetype=obj < %s | llvm-objdump --triple=amdgcn--amdhsa --mcpu=gfx900 -d - | FileCheck --check-prefix=GCN %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn-amd-amdhsa -mcpu=fiji -filetype=obj < %s | llvm-objdump --triple=amdgcn--amdhsa --mcpu=fiji -d - | FileCheck --check-prefix=GCN %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -filetype=obj < %s | llvm-objdump --triple=amdgcn--amdhsa --mcpu=gfx900 -d - | FileCheck --check-prefix=GCN %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn-amd-amdhsa -mcpu=fiji -filetype=obj < %s | llvm-objdump --triple=amdgcn--amdhsa --mcpu=fiji -d - | FileCheck --check-prefix=GCN %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -filetype=obj < %s | llvm-objdump --triple=amdgcn--amdhsa --mcpu=gfx900 -d - | FileCheck --check-prefix=GCN %s
; XUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=hawaii -filetype=obj < %s | llvm-objdump --triple=amdgcn--amdhsa --mcpu=hawaii -d - | FileCheck --check-prefixes=GCN,CI %s
; GCN: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
diff --git a/llvm/test/CodeGen/AMDGPU/call-graph-register-usage.ll b/llvm/test/CodeGen/AMDGPU/call-graph-register-usage.ll
index 4df1049..b250227 100644
--- a/llvm/test/CodeGen/AMDGPU/call-graph-register-usage.ll
+++ b/llvm/test/CodeGen/AMDGPU/call-graph-register-usage.ll
@@ -1,8 +1,13 @@
-; RUN: sed 's/CODE_OBJECT_VERSION/400/g' %s | llc -mtriple=amdgcn-amd-amdhsa -enable-ipra=0 | FileCheck -check-prefixes=GCN,CI %s
-; RUN: sed 's/CODE_OBJECT_VERSION/500/g' %s | llc -mtriple=amdgcn-amd-amdhsa -enable-ipra=0 | FileCheck -check-prefixes=GCN-V5 %s
-; RUN: sed 's/CODE_OBJECT_VERSION/600/g' %s | llc -mtriple=amdgcn-amd-amdhsa -enable-ipra=0 | FileCheck -check-prefixes=GCN-V5 %s
-; RUN: sed 's/CODE_OBJECT_VERSION/400/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=fiji -enable-ipra=0 | FileCheck -check-prefixes=GCN,VI,VI-NOBUG %s
-; RUN: sed 's/CODE_OBJECT_VERSION/400/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=iceland -enable-ipra=0 | FileCheck -check-prefixes=GCN,VI,VI-BUG %s
+; RUN: sed 's/CODE_OBJECT_VERSION/400/g' %s | llc -global-isel=0 -mtriple=amdgcn-amd-amdhsa -enable-ipra=0 | FileCheck -check-prefixes=GCN,CI %s
+; RUN: sed 's/CODE_OBJECT_VERSION/500/g' %s | llc -global-isel=0 -mtriple=amdgcn-amd-amdhsa -enable-ipra=0 | FileCheck -check-prefixes=GCN-V5 %s
+; RUN: sed 's/CODE_OBJECT_VERSION/600/g' %s | llc -global-isel=0 -mtriple=amdgcn-amd-amdhsa -enable-ipra=0 | FileCheck -check-prefixes=GCN-V5 %s
+; RUN: sed 's/CODE_OBJECT_VERSION/400/g' %s | llc -global-isel=0 -mtriple=amdgcn-amd-amdhsa -mcpu=fiji -enable-ipra=0 | FileCheck -check-prefixes=GCN,VI,VI-NOBUG %s
+; RUN: sed 's/CODE_OBJECT_VERSION/400/g' %s | llc -global-isel=0 -mtriple=amdgcn-amd-amdhsa -mcpu=iceland -enable-ipra=0 | FileCheck -check-prefixes=GCN,VI,VI-BUG %s
+; RUN: sed 's/CODE_OBJECT_VERSION/400/g' %s | llc -global-isel=1 -mtriple=amdgcn-amd-amdhsa -enable-ipra=0 | FileCheck -check-prefixes=GCN,CI %s
+; RUN: sed 's/CODE_OBJECT_VERSION/500/g' %s | llc -global-isel=1 -mtriple=amdgcn-amd-amdhsa -enable-ipra=0 | FileCheck -check-prefixes=GCN-V5 %s
+; RUN: sed 's/CODE_OBJECT_VERSION/600/g' %s | llc -global-isel=1 -mtriple=amdgcn-amd-amdhsa -enable-ipra=0 | FileCheck -check-prefixes=GCN-V5 %s
+; RUN: sed 's/CODE_OBJECT_VERSION/400/g' %s | llc -global-isel=1 -mtriple=amdgcn-amd-amdhsa -mcpu=fiji -enable-ipra=0 | FileCheck -check-prefixes=GCN,VI,VI-NOBUG %s
+; RUN: sed 's/CODE_OBJECT_VERSION/400/g' %s | llc -global-isel=1 -mtriple=amdgcn-amd-amdhsa -mcpu=iceland -enable-ipra=0 | FileCheck -check-prefixes=GCN,VI,VI-BUG %s
; Make sure to run a GPU with the SGPR allocation bug.
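; (In the runs above, iceland is the VI-BUG target and fiji is VI-NOBUG.)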
diff --git a/llvm/test/CodeGen/AMDGPU/call-preserved-registers.ll b/llvm/test/CodeGen/AMDGPU/call-preserved-registers.ll
index 61a195f..aed1079 100644
--- a/llvm/test/CodeGen/AMDGPU/call-preserved-registers.ll
+++ b/llvm/test/CodeGen/AMDGPU/call-preserved-registers.ll
@@ -1,8 +1,12 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=fiji -enable-ipra=0 < %s | FileCheck -check-prefixes=GCN,MUBUF %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=hawaii -enable-ipra=0 < %s | FileCheck -check-prefixes=GCN,MUBUF %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -enable-ipra=0 < %s | FileCheck -check-prefixes=GCN,MUBUF %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -enable-ipra=0 -mattr=+enable-flat-scratch < %s | FileCheck -check-prefixes=GCN,FLATSCR %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn-amd-amdhsa -mcpu=fiji -enable-ipra=0 < %s | FileCheck -check-prefixes=GCN,MUBUF,SDAG %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn-amd-amdhsa -mcpu=hawaii -enable-ipra=0 < %s | FileCheck -check-prefixes=GCN,MUBUF,SDAG %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -enable-ipra=0 < %s | FileCheck -check-prefixes=GCN,MUBUF,SDAG %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -enable-ipra=0 -mattr=+enable-flat-scratch < %s | FileCheck -check-prefixes=GCN,FLATSCR,SDAG %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn-amd-amdhsa -mcpu=fiji -enable-ipra=0 < %s | FileCheck -check-prefixes=GCN,MUBUF,GISEL %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn-amd-amdhsa -mcpu=hawaii -enable-ipra=0 < %s | FileCheck -check-prefixes=GCN,MUBUF,GISEL %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -enable-ipra=0 < %s | FileCheck -check-prefixes=GCN,MUBUF,GISEL %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -enable-ipra=0 -mattr=+enable-flat-scratch < %s | FileCheck -check-prefixes=GCN,FLATSCR,GISEL %s
declare hidden void @external_void_func_void() #3
@@ -223,41 +227,6 @@ define hidden void @void_func_void_clobber_vcc() #2 {
}
define amdgpu_kernel void @test_call_void_func_void_clobber_vcc(ptr addrspace(1) %out) #0 {
-; FLATSCR-LABEL: test_call_void_func_void_clobber_vcc:
-; FLATSCR: ; %bb.0:
-; FLATSCR-NEXT: s_add_u32 flat_scratch_lo, s8, s13
-; FLATSCR-NEXT: s_addc_u32 flat_scratch_hi, s9, 0
-; FLATSCR-NEXT: s_add_u32 s8, s4, 8
-; FLATSCR-NEXT: s_addc_u32 s9, s5, 0
-; FLATSCR-NEXT: v_lshlrev_b32_e32 v2, 20, v2
-; FLATSCR-NEXT: v_lshlrev_b32_e32 v1, 10, v1
-; FLATSCR-NEXT: s_mov_b32 s14, s12
-; FLATSCR-NEXT: s_mov_b32 s13, s11
-; FLATSCR-NEXT: s_mov_b32 s12, s10
-; FLATSCR-NEXT: s_mov_b64 s[10:11], s[6:7]
-; FLATSCR-NEXT: s_getpc_b64 s[16:17]
-; FLATSCR-NEXT: s_add_u32 s16, s16, void_func_void_clobber_vcc@rel32@lo+4
-; FLATSCR-NEXT: s_addc_u32 s17, s17, void_func_void_clobber_vcc@rel32@hi+12
-; FLATSCR-NEXT: v_or3_b32 v31, v0, v1, v2
-; FLATSCR-NEXT: s_mov_b64 s[4:5], s[0:1]
-; FLATSCR-NEXT: s_mov_b64 s[6:7], s[2:3]
-; FLATSCR-NEXT: s_mov_b32 s32, 0
-; FLATSCR-NEXT: ;;#ASMSTART
-; FLATSCR-NEXT: ; def vcc
-; FLATSCR-NEXT: ;;#ASMEND
-; FLATSCR-NEXT: s_mov_b64 s[34:35], vcc
-; FLATSCR-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; FLATSCR-NEXT: global_load_dword v0, v[0:1], off glc
-; FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; FLATSCR-NEXT: s_mov_b64 vcc, s[34:35]
-; FLATSCR-NEXT: global_load_dword v0, v[0:1], off glc
-; FLATSCR-NEXT: s_waitcnt vmcnt(0)
-; FLATSCR-NEXT: ; kill: killed $vgpr0_vgpr1
-; FLATSCR-NEXT: ; kill: killed $vgpr0_vgpr1
-; FLATSCR-NEXT: ;;#ASMSTART
-; FLATSCR-NEXT: ; use vcc
-; FLATSCR-NEXT: ;;#ASMEND
-; FLATSCR-NEXT: s_endpgm
%vcc = call i64 asm sideeffect "; def $0", "={vcc}"()
call void @void_func_void_clobber_vcc()
%val0 = load volatile i32, ptr addrspace(1) poison
@@ -463,51 +432,11 @@ define hidden void @void_func_void_clobber_s34() #2 {
}
define amdgpu_kernel void @test_call_void_func_void_clobber_s33() #0 {
-; FLATSCR-LABEL: test_call_void_func_void_clobber_s33:
-; FLATSCR: ; %bb.0:
-; FLATSCR-NEXT: s_add_u32 flat_scratch_lo, s8, s13
-; FLATSCR-NEXT: s_addc_u32 flat_scratch_hi, s9, 0
-; FLATSCR-NEXT: v_lshlrev_b32_e32 v2, 20, v2
-; FLATSCR-NEXT: v_lshlrev_b32_e32 v1, 10, v1
-; FLATSCR-NEXT: s_mov_b32 s14, s12
-; FLATSCR-NEXT: s_mov_b32 s13, s11
-; FLATSCR-NEXT: s_mov_b32 s12, s10
-; FLATSCR-NEXT: s_mov_b64 s[10:11], s[6:7]
-; FLATSCR-NEXT: s_mov_b64 s[8:9], s[4:5]
-; FLATSCR-NEXT: s_getpc_b64 s[16:17]
-; FLATSCR-NEXT: s_add_u32 s16, s16, void_func_void_clobber_s33@rel32@lo+4
-; FLATSCR-NEXT: s_addc_u32 s17, s17, void_func_void_clobber_s33@rel32@hi+12
-; FLATSCR-NEXT: v_or3_b32 v31, v0, v1, v2
-; FLATSCR-NEXT: s_mov_b64 s[4:5], s[0:1]
-; FLATSCR-NEXT: s_mov_b64 s[6:7], s[2:3]
-; FLATSCR-NEXT: s_mov_b32 s32, 0
-; FLATSCR-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; FLATSCR-NEXT: s_endpgm
call void @void_func_void_clobber_s33()
ret void
}
define amdgpu_kernel void @test_call_void_func_void_clobber_s34() #0 {
-; FLATSCR-LABEL: test_call_void_func_void_clobber_s34:
-; FLATSCR: ; %bb.0:
-; FLATSCR-NEXT: s_add_u32 flat_scratch_lo, s8, s13
-; FLATSCR-NEXT: s_addc_u32 flat_scratch_hi, s9, 0
-; FLATSCR-NEXT: v_lshlrev_b32_e32 v2, 20, v2
-; FLATSCR-NEXT: v_lshlrev_b32_e32 v1, 10, v1
-; FLATSCR-NEXT: s_mov_b32 s14, s12
-; FLATSCR-NEXT: s_mov_b32 s13, s11
-; FLATSCR-NEXT: s_mov_b32 s12, s10
-; FLATSCR-NEXT: s_mov_b64 s[10:11], s[6:7]
-; FLATSCR-NEXT: s_mov_b64 s[8:9], s[4:5]
-; FLATSCR-NEXT: s_getpc_b64 s[16:17]
-; FLATSCR-NEXT: s_add_u32 s16, s16, void_func_void_clobber_s34@rel32@lo+4
-; FLATSCR-NEXT: s_addc_u32 s17, s17, void_func_void_clobber_s34@rel32@hi+12
-; FLATSCR-NEXT: v_or3_b32 v31, v0, v1, v2
-; FLATSCR-NEXT: s_mov_b64 s[4:5], s[0:1]
-; FLATSCR-NEXT: s_mov_b64 s[6:7], s[2:3]
-; FLATSCR-NEXT: s_mov_b32 s32, 0
-; FLATSCR-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; FLATSCR-NEXT: s_endpgm
call void @void_func_void_clobber_s34()
ret void
}
@@ -748,3 +677,6 @@ attributes #0 = { nounwind }
attributes #1 = { nounwind readnone }
attributes #2 = { nounwind noinline }
attributes #3 = { nounwind "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-cluster-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-cluster-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-cluster-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" }
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; GISEL: {{.*}}
+; SDAG: {{.*}}
diff --git a/llvm/test/CodeGen/AMDGPU/call-return-types.ll b/llvm/test/CodeGen/AMDGPU/call-return-types.ll
index c0f74fd..21c3696 100644
--- a/llvm/test/CodeGen/AMDGPU/call-return-types.ll
+++ b/llvm/test/CodeGen/AMDGPU/call-return-types.ll
@@ -1,7 +1,12 @@
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=fiji -mattr=-flat-for-global < %s | FileCheck -check-prefixes=GCN,GFX89 %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=hawaii < %s | FileCheck -check-prefixes=GCN,GFX7 %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -mattr=-flat-for-global < %s | FileCheck -check-prefixes=GCN,GFX89 %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -mattr=-flat-for-global < %s | FileCheck -check-prefixes=GCN,GFX11 %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn-amd-amdhsa -mcpu=fiji -mattr=-flat-for-global < %s | FileCheck -check-prefixes=GCN,GFX89 %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn-amd-amdhsa -mcpu=hawaii < %s | FileCheck -check-prefixes=GCN,GFX7 %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -mattr=-flat-for-global < %s | FileCheck -check-prefixes=GCN,GFX89 %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -mattr=-flat-for-global < %s | FileCheck -check-prefixes=GCN,GFX11 %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn-amd-amdhsa -mcpu=fiji -mattr=-flat-for-global < %s | FileCheck -check-prefixes=GCN,GFX89 %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn-amd-amdhsa -mcpu=hawaii < %s | FileCheck -check-prefixes=GCN,GFX7 %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -mattr=-flat-for-global < %s | FileCheck -check-prefixes=GCN,GFX89 %s
+
; Ideally, we would also like to test GlobalISel with gfx1100, but this is currently blocked on llvm-project#166501.
declare void @external_void_func_void() #0
diff --git a/llvm/test/CodeGen/AMDGPU/call-skip.ll b/llvm/test/CodeGen/AMDGPU/call-skip.ll
index ea2bba1..e2ca278 100644
--- a/llvm/test/CodeGen/AMDGPU/call-skip.ll
+++ b/llvm/test/CodeGen/AMDGPU/call-skip.ll
@@ -1,4 +1,6 @@
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=hawaii < %s | FileCheck -enable-var-scope -check-prefix=GCN %s
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -global-isel=0 -mcpu=hawaii < %s | FileCheck -enable-var-scope -check-prefixes=GCN,SDAG %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -global-isel=1 -mcpu=hawaii < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GISEL %s
; A call should be skipped if all lanes are zero, since we don't know
; what side effects should be avoided inside the call.
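; Concretely, the call ends up guarded by an exec-mask test, as in the checks
; below:
;   s_and_saveexec_b64 s[16:17], vcc
;   s_cbranch_execz .LBB1_2
;   ...
;   s_swappc_b64 s[30:31], s[18:19]
; .LBB1_2: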
@@ -6,12 +8,37 @@ define hidden void @func() #1 {
ret void
}
-; GCN-LABEL: {{^}}if_call:
-; GCN: s_and_saveexec_b64
-; GCN-NEXT: s_cbranch_execz [[END:.LBB[0-9]+_[0-9]+]]
-; GCN: s_swappc_b64
-; GCN: [[END]]:
define void @if_call(i32 %flag) #0 {
+; GCN-LABEL: if_call:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: s_mov_b32 s20, s33
+; GCN-NEXT: s_mov_b32 s33, s32
+; GCN-NEXT: s_xor_saveexec_b64 s[16:17], -1
+; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s33 ; 4-byte Folded Spill
+; GCN-NEXT: s_mov_b64 exec, s[16:17]
+; GCN-NEXT: v_writelane_b32 v1, s30, 0
+; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GCN-NEXT: s_addk_i32 s32, 0x400
+; GCN-NEXT: v_writelane_b32 v1, s31, 1
+; GCN-NEXT: s_and_saveexec_b64 s[16:17], vcc
+; GCN-NEXT: s_cbranch_execz .LBB1_2
+; GCN-NEXT: ; %bb.1: ; %call
+; GCN-NEXT: s_getpc_b64 s[18:19]
+; GCN-NEXT: s_add_u32 s18, s18, func@rel32@lo+4
+; GCN-NEXT: s_addc_u32 s19, s19, func@rel32@hi+12
+; GCN-NEXT: s_swappc_b64 s[30:31], s[18:19]
+; GCN-NEXT: .LBB1_2: ; %end
+; GCN-NEXT: s_or_b64 exec, exec, s[16:17]
+; GCN-NEXT: v_readlane_b32 s31, v1, 1
+; GCN-NEXT: v_readlane_b32 s30, v1, 0
+; GCN-NEXT: s_mov_b32 s32, s33
+; GCN-NEXT: s_xor_saveexec_b64 s[4:5], -1
+; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s33 ; 4-byte Folded Reload
+; GCN-NEXT: s_mov_b64 exec, s[4:5]
+; GCN-NEXT: s_mov_b32 s33, s20
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: s_setpc_b64 s[30:31]
%cc = icmp eq i32 %flag, 0
br i1 %cc, label %call, label %end
@@ -23,12 +50,20 @@ end:
ret void
}
-; GCN-LABEL: {{^}}if_asm:
-; GCN: s_and_saveexec_b64
-; GCN-NEXT: s_cbranch_execz [[END:.LBB[0-9]+_[0-9]+]]
-; GCN: ; sample asm
-; GCN: [[END]]:
define void @if_asm(i32 %flag) #0 {
+; GCN-LABEL: if_asm:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GCN-NEXT: s_cbranch_execz .LBB2_2
+; GCN-NEXT: ; %bb.1: ; %call
+; GCN-NEXT: ;;#ASMSTART
+; GCN-NEXT: ; sample asm
+; GCN-NEXT: ;;#ASMEND
+; GCN-NEXT: .LBB2_2: ; %end
+; GCN-NEXT: s_or_b64 exec, exec, s[4:5]
+; GCN-NEXT: s_setpc_b64 s[30:31]
%cc = icmp eq i32 %flag, 0
br i1 %cc, label %call, label %end
@@ -40,11 +75,58 @@ end:
ret void
}
-; GCN-LABEL: {{^}}if_call_kernel:
-; GCN: s_and_saveexec_b64
-; GCN-NEXT: s_cbranch_execz .LBB3_2
-; GCN: s_swappc_b64
define amdgpu_kernel void @if_call_kernel() #0 {
+; SDAG-LABEL: if_call_kernel:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_add_i32 s12, s12, s17
+; SDAG-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; SDAG-NEXT: s_add_u32 s0, s0, s17
+; SDAG-NEXT: s_addc_u32 s1, s1, 0
+; SDAG-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; SDAG-NEXT: s_mov_b32 s32, 0
+; SDAG-NEXT: s_mov_b32 flat_scratch_lo, s13
+; SDAG-NEXT: s_and_saveexec_b64 s[12:13], vcc
+; SDAG-NEXT: s_cbranch_execz .LBB3_2
+; SDAG-NEXT: ; %bb.1: ; %call
+; SDAG-NEXT: v_lshlrev_b32_e32 v1, 10, v1
+; SDAG-NEXT: v_lshlrev_b32_e32 v2, 20, v2
+; SDAG-NEXT: v_or_b32_e32 v0, v0, v1
+; SDAG-NEXT: s_getpc_b64 s[18:19]
+; SDAG-NEXT: s_add_u32 s18, s18, func@rel32@lo+4
+; SDAG-NEXT: s_addc_u32 s19, s19, func@rel32@hi+12
+; SDAG-NEXT: v_or_b32_e32 v31, v0, v2
+; SDAG-NEXT: s_mov_b32 s12, s14
+; SDAG-NEXT: s_mov_b32 s13, s15
+; SDAG-NEXT: s_mov_b32 s14, s16
+; SDAG-NEXT: s_swappc_b64 s[30:31], s[18:19]
+; SDAG-NEXT: .LBB3_2: ; %end
+; SDAG-NEXT: s_endpgm
+;
+; GISEL-LABEL: if_call_kernel:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_add_i32 s12, s12, s17
+; GISEL-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; GISEL-NEXT: s_add_u32 s0, s0, s17
+; GISEL-NEXT: s_addc_u32 s1, s1, 0
+; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GISEL-NEXT: s_mov_b32 s32, 0
+; GISEL-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GISEL-NEXT: s_and_saveexec_b64 s[12:13], vcc
+; GISEL-NEXT: s_cbranch_execz .LBB3_2
+; GISEL-NEXT: ; %bb.1: ; %call
+; GISEL-NEXT: v_lshlrev_b32_e32 v1, 10, v1
+; GISEL-NEXT: v_or_b32_e32 v0, v0, v1
+; GISEL-NEXT: v_lshlrev_b32_e32 v1, 20, v2
+; GISEL-NEXT: s_getpc_b64 s[18:19]
+; GISEL-NEXT: s_add_u32 s18, s18, func@rel32@lo+4
+; GISEL-NEXT: s_addc_u32 s19, s19, func@rel32@hi+12
+; GISEL-NEXT: v_or_b32_e32 v31, v0, v1
+; GISEL-NEXT: s_mov_b32 s12, s14
+; GISEL-NEXT: s_mov_b32 s13, s15
+; GISEL-NEXT: s_mov_b32 s14, s16
+; GISEL-NEXT: s_swappc_b64 s[30:31], s[18:19]
+; GISEL-NEXT: .LBB3_2: ; %end
+; GISEL-NEXT: s_endpgm
%id = call i32 @llvm.amdgcn.workitem.id.x()
%cc = icmp eq i32 %id, 0
br i1 %cc, label %call, label %end
diff --git a/llvm/test/CodeGen/AMDGPU/call-waitcnt.ll b/llvm/test/CodeGen/AMDGPU/call-waitcnt.ll
index 675acd0..a52942c 100644
--- a/llvm/test/CodeGen/AMDGPU/call-waitcnt.ll
+++ b/llvm/test/CodeGen/AMDGPU/call-waitcnt.ll
@@ -1,5 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -enable-var-scope -check-prefix=GCN %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -enable-var-scope -check-prefixes=GCN,SDAG %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GISEL %s
; Load argument depends on waitcnt which should be skipped.
define amdgpu_kernel void @call_memory_arg_load(ptr addrspace(3) %ptr, i32) #0 {
@@ -27,24 +28,43 @@ define amdgpu_kernel void @call_memory_arg_load(ptr addrspace(3) %ptr, i32) #0 {
; Memory waitcnt with no register dependence on the call
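; That is, the only wait before the call is the s_waitcnt lgkmcnt(0) for the
; pointer feeding the store; s_swappc_b64 is then issued without waiting on
; the store itself (see the checks below).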
define amdgpu_kernel void @call_memory_no_dep(ptr addrspace(1) %ptr, i32) #0 {
-; GCN-LABEL: call_memory_no_dep:
-; GCN: ; %bb.0:
-; GCN-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x0
-; GCN-NEXT: s_add_u32 flat_scratch_lo, s8, s11
-; GCN-NEXT: s_addc_u32 flat_scratch_hi, s9, 0
-; GCN-NEXT: s_add_u32 s0, s0, s11
-; GCN-NEXT: s_addc_u32 s1, s1, 0
-; GCN-NEXT: v_mov_b32_e32 v0, 0
-; GCN-NEXT: s_getpc_b64 s[8:9]
-; GCN-NEXT: s_add_u32 s8, s8, func@rel32@lo+4
-; GCN-NEXT: s_addc_u32 s9, s9, func@rel32@hi+12
-; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: global_store_dword v0, v0, s[6:7]
-; GCN-NEXT: s_mov_b64 s[6:7], s[4:5]
-; GCN-NEXT: v_mov_b32_e32 v0, 0
-; GCN-NEXT: s_mov_b32 s32, 0
-; GCN-NEXT: s_swappc_b64 s[30:31], s[8:9]
-; GCN-NEXT: s_endpgm
+; SDAG-LABEL: call_memory_no_dep:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x0
+; SDAG-NEXT: s_add_u32 flat_scratch_lo, s8, s11
+; SDAG-NEXT: s_addc_u32 flat_scratch_hi, s9, 0
+; SDAG-NEXT: s_add_u32 s0, s0, s11
+; SDAG-NEXT: s_addc_u32 s1, s1, 0
+; SDAG-NEXT: v_mov_b32_e32 v0, 0
+; SDAG-NEXT: s_getpc_b64 s[8:9]
+; SDAG-NEXT: s_add_u32 s8, s8, func@rel32@lo+4
+; SDAG-NEXT: s_addc_u32 s9, s9, func@rel32@hi+12
+; SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; SDAG-NEXT: global_store_dword v0, v0, s[6:7]
+; SDAG-NEXT: s_mov_b64 s[6:7], s[4:5]
+; SDAG-NEXT: v_mov_b32_e32 v0, 0
+; SDAG-NEXT: s_mov_b32 s32, 0
+; SDAG-NEXT: s_swappc_b64 s[30:31], s[8:9]
+; SDAG-NEXT: s_endpgm
+;
+; GISEL-LABEL: call_memory_no_dep:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_load_dwordx2 s[6:7], s[6:7], 0x0
+; GISEL-NEXT: s_add_u32 flat_scratch_lo, s8, s11
+; GISEL-NEXT: s_addc_u32 flat_scratch_hi, s9, 0
+; GISEL-NEXT: s_add_u32 s0, s0, s11
+; GISEL-NEXT: s_addc_u32 s1, s1, 0
+; GISEL-NEXT: v_mov_b32_e32 v0, 0
+; GISEL-NEXT: s_getpc_b64 s[8:9]
+; GISEL-NEXT: s_add_u32 s8, s8, func@rel32@lo+4
+; GISEL-NEXT: s_addc_u32 s9, s9, func@rel32@hi+12
+; GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GISEL-NEXT: global_store_dword v0, v0, s[6:7]
+; GISEL-NEXT: v_mov_b32_e32 v0, 0
+; GISEL-NEXT: s_mov_b64 s[6:7], s[4:5]
+; GISEL-NEXT: s_mov_b32 s32, 0
+; GISEL-NEXT: s_swappc_b64 s[30:31], s[8:9]
+; GISEL-NEXT: s_endpgm
store i32 0, ptr addrspace(1) %ptr
call void @func(i32 0)
ret void
@@ -52,46 +72,82 @@ define amdgpu_kernel void @call_memory_no_dep(ptr addrspace(1) %ptr, i32) #0 {
; Should not wait after the call before memory
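; That is, global_store_dword follows s_swappc_b64 directly, with no
; intervening s_waitcnt (see the checks below).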
define amdgpu_kernel void @call_no_wait_after_call(ptr addrspace(1) %ptr, i32) #0 {
-; GCN-LABEL: call_no_wait_after_call:
-; GCN: ; %bb.0:
-; GCN-NEXT: s_add_u32 flat_scratch_lo, s8, s11
-; GCN-NEXT: s_addc_u32 flat_scratch_hi, s9, 0
-; GCN-NEXT: s_load_dwordx2 s[34:35], s[6:7], 0x0
-; GCN-NEXT: s_add_u32 s0, s0, s11
-; GCN-NEXT: s_addc_u32 s1, s1, 0
-; GCN-NEXT: s_getpc_b64 s[8:9]
-; GCN-NEXT: s_add_u32 s8, s8, func@rel32@lo+4
-; GCN-NEXT: s_addc_u32 s9, s9, func@rel32@hi+12
-; GCN-NEXT: s_mov_b64 s[6:7], s[4:5]
-; GCN-NEXT: v_mov_b32_e32 v0, 0
-; GCN-NEXT: s_mov_b32 s32, 0
-; GCN-NEXT: v_mov_b32_e32 v40, 0
-; GCN-NEXT: s_swappc_b64 s[30:31], s[8:9]
-; GCN-NEXT: global_store_dword v40, v40, s[34:35]
-; GCN-NEXT: s_endpgm
+; SDAG-LABEL: call_no_wait_after_call:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_add_u32 flat_scratch_lo, s8, s11
+; SDAG-NEXT: s_addc_u32 flat_scratch_hi, s9, 0
+; SDAG-NEXT: s_load_dwordx2 s[34:35], s[6:7], 0x0
+; SDAG-NEXT: s_add_u32 s0, s0, s11
+; SDAG-NEXT: s_addc_u32 s1, s1, 0
+; SDAG-NEXT: s_getpc_b64 s[8:9]
+; SDAG-NEXT: s_add_u32 s8, s8, func@rel32@lo+4
+; SDAG-NEXT: s_addc_u32 s9, s9, func@rel32@hi+12
+; SDAG-NEXT: s_mov_b64 s[6:7], s[4:5]
+; SDAG-NEXT: v_mov_b32_e32 v0, 0
+; SDAG-NEXT: s_mov_b32 s32, 0
+; SDAG-NEXT: v_mov_b32_e32 v40, 0
+; SDAG-NEXT: s_swappc_b64 s[30:31], s[8:9]
+; SDAG-NEXT: global_store_dword v40, v40, s[34:35]
+; SDAG-NEXT: s_endpgm
+;
+; GISEL-LABEL: call_no_wait_after_call:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_add_u32 flat_scratch_lo, s8, s11
+; GISEL-NEXT: s_addc_u32 flat_scratch_hi, s9, 0
+; GISEL-NEXT: s_load_dwordx2 s[34:35], s[6:7], 0x0
+; GISEL-NEXT: s_add_u32 s0, s0, s11
+; GISEL-NEXT: s_addc_u32 s1, s1, 0
+; GISEL-NEXT: s_getpc_b64 s[8:9]
+; GISEL-NEXT: s_add_u32 s8, s8, func@rel32@lo+4
+; GISEL-NEXT: s_addc_u32 s9, s9, func@rel32@hi+12
+; GISEL-NEXT: v_mov_b32_e32 v0, 0
+; GISEL-NEXT: s_mov_b64 s[6:7], s[4:5]
+; GISEL-NEXT: s_mov_b32 s32, 0
+; GISEL-NEXT: s_swappc_b64 s[30:31], s[8:9]
+; GISEL-NEXT: v_mov_b32_e32 v0, 0
+; GISEL-NEXT: global_store_dword v0, v0, s[34:35]
+; GISEL-NEXT: s_endpgm
call void @func(i32 0)
store i32 0, ptr addrspace(1) %ptr
ret void
}
define amdgpu_kernel void @call_no_wait_after_call_return_val(ptr addrspace(1) %ptr, i32) #0 {
-; GCN-LABEL: call_no_wait_after_call_return_val:
-; GCN: ; %bb.0:
-; GCN-NEXT: s_add_u32 flat_scratch_lo, s8, s11
-; GCN-NEXT: s_addc_u32 flat_scratch_hi, s9, 0
-; GCN-NEXT: s_load_dwordx2 s[34:35], s[6:7], 0x0
-; GCN-NEXT: s_add_u32 s0, s0, s11
-; GCN-NEXT: s_addc_u32 s1, s1, 0
-; GCN-NEXT: s_getpc_b64 s[8:9]
-; GCN-NEXT: s_add_u32 s8, s8, func.return@rel32@lo+4
-; GCN-NEXT: s_addc_u32 s9, s9, func.return@rel32@hi+12
-; GCN-NEXT: s_mov_b64 s[6:7], s[4:5]
-; GCN-NEXT: v_mov_b32_e32 v0, 0
-; GCN-NEXT: s_mov_b32 s32, 0
-; GCN-NEXT: v_mov_b32_e32 v40, 0
-; GCN-NEXT: s_swappc_b64 s[30:31], s[8:9]
-; GCN-NEXT: global_store_dword v40, v0, s[34:35]
-; GCN-NEXT: s_endpgm
+; SDAG-LABEL: call_no_wait_after_call_return_val:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_add_u32 flat_scratch_lo, s8, s11
+; SDAG-NEXT: s_addc_u32 flat_scratch_hi, s9, 0
+; SDAG-NEXT: s_load_dwordx2 s[34:35], s[6:7], 0x0
+; SDAG-NEXT: s_add_u32 s0, s0, s11
+; SDAG-NEXT: s_addc_u32 s1, s1, 0
+; SDAG-NEXT: s_getpc_b64 s[8:9]
+; SDAG-NEXT: s_add_u32 s8, s8, func.return@rel32@lo+4
+; SDAG-NEXT: s_addc_u32 s9, s9, func.return@rel32@hi+12
+; SDAG-NEXT: s_mov_b64 s[6:7], s[4:5]
+; SDAG-NEXT: v_mov_b32_e32 v0, 0
+; SDAG-NEXT: s_mov_b32 s32, 0
+; SDAG-NEXT: v_mov_b32_e32 v40, 0
+; SDAG-NEXT: s_swappc_b64 s[30:31], s[8:9]
+; SDAG-NEXT: global_store_dword v40, v0, s[34:35]
+; SDAG-NEXT: s_endpgm
+;
+; GISEL-LABEL: call_no_wait_after_call_return_val:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_add_u32 flat_scratch_lo, s8, s11
+; GISEL-NEXT: s_addc_u32 flat_scratch_hi, s9, 0
+; GISEL-NEXT: s_load_dwordx2 s[34:35], s[6:7], 0x0
+; GISEL-NEXT: s_add_u32 s0, s0, s11
+; GISEL-NEXT: s_addc_u32 s1, s1, 0
+; GISEL-NEXT: s_getpc_b64 s[8:9]
+; GISEL-NEXT: s_add_u32 s8, s8, func.return@rel32@lo+4
+; GISEL-NEXT: s_addc_u32 s9, s9, func.return@rel32@hi+12
+; GISEL-NEXT: v_mov_b32_e32 v0, 0
+; GISEL-NEXT: s_mov_b64 s[6:7], s[4:5]
+; GISEL-NEXT: s_mov_b32 s32, 0
+; GISEL-NEXT: s_swappc_b64 s[30:31], s[8:9]
+; GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GISEL-NEXT: global_store_dword v1, v0, s[34:35]
+; GISEL-NEXT: s_endpgm
%rv = call i32 @func.return(i32 0)
store i32 %rv, ptr addrspace(1) %ptr
ret void
@@ -99,22 +155,39 @@ define amdgpu_kernel void @call_no_wait_after_call_return_val(ptr addrspace(1) %
; Need to wait for the address dependency
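; The callee address is s_load'ed from the GOT, so an s_waitcnt lgkmcnt(0)
; must land before the s_swappc_b64 (see the checks below).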
define amdgpu_kernel void @call_got_load(ptr addrspace(1) %ptr, i32) #0 {
-; GCN-LABEL: call_got_load:
-; GCN: ; %bb.0:
-; GCN-NEXT: s_add_u32 flat_scratch_lo, s8, s11
-; GCN-NEXT: s_addc_u32 flat_scratch_hi, s9, 0
-; GCN-NEXT: s_add_u32 s0, s0, s11
-; GCN-NEXT: s_addc_u32 s1, s1, 0
-; GCN-NEXT: s_getpc_b64 s[6:7]
-; GCN-NEXT: s_add_u32 s6, s6, got.func@gotpcrel32@lo+4
-; GCN-NEXT: s_addc_u32 s7, s7, got.func@gotpcrel32@hi+12
-; GCN-NEXT: s_load_dwordx2 s[8:9], s[6:7], 0x0
-; GCN-NEXT: s_mov_b64 s[6:7], s[4:5]
-; GCN-NEXT: v_mov_b32_e32 v0, 0
-; GCN-NEXT: s_mov_b32 s32, 0
-; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: s_swappc_b64 s[30:31], s[8:9]
-; GCN-NEXT: s_endpgm
+; SDAG-LABEL: call_got_load:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_add_u32 flat_scratch_lo, s8, s11
+; SDAG-NEXT: s_addc_u32 flat_scratch_hi, s9, 0
+; SDAG-NEXT: s_add_u32 s0, s0, s11
+; SDAG-NEXT: s_addc_u32 s1, s1, 0
+; SDAG-NEXT: s_getpc_b64 s[6:7]
+; SDAG-NEXT: s_add_u32 s6, s6, got.func@gotpcrel32@lo+4
+; SDAG-NEXT: s_addc_u32 s7, s7, got.func@gotpcrel32@hi+12
+; SDAG-NEXT: s_load_dwordx2 s[8:9], s[6:7], 0x0
+; SDAG-NEXT: s_mov_b64 s[6:7], s[4:5]
+; SDAG-NEXT: v_mov_b32_e32 v0, 0
+; SDAG-NEXT: s_mov_b32 s32, 0
+; SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; SDAG-NEXT: s_swappc_b64 s[30:31], s[8:9]
+; SDAG-NEXT: s_endpgm
+;
+; GISEL-LABEL: call_got_load:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_add_u32 flat_scratch_lo, s8, s11
+; GISEL-NEXT: s_addc_u32 flat_scratch_hi, s9, 0
+; GISEL-NEXT: s_add_u32 s0, s0, s11
+; GISEL-NEXT: s_addc_u32 s1, s1, 0
+; GISEL-NEXT: s_getpc_b64 s[6:7]
+; GISEL-NEXT: s_add_u32 s6, s6, got.func@gotpcrel32@lo+4
+; GISEL-NEXT: s_addc_u32 s7, s7, got.func@gotpcrel32@hi+12
+; GISEL-NEXT: s_load_dwordx2 s[8:9], s[6:7], 0x0
+; GISEL-NEXT: v_mov_b32_e32 v0, 0
+; GISEL-NEXT: s_mov_b64 s[6:7], s[4:5]
+; GISEL-NEXT: s_mov_b32 s32, 0
+; GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GISEL-NEXT: s_swappc_b64 s[30:31], s[8:9]
+; GISEL-NEXT: s_endpgm
call void @got.func(i32 0)
ret void
}
diff --git a/llvm/test/CodeGen/AMDGPU/carryout-selection.ll b/llvm/test/CodeGen/AMDGPU/carryout-selection.ll
index b96de17..8d05317 100644
--- a/llvm/test/CodeGen/AMDGPU/carryout-selection.ll
+++ b/llvm/test/CodeGen/AMDGPU/carryout-selection.ll
@@ -702,8 +702,6 @@ define amdgpu_kernel void @suaddo64(ptr addrspace(1) %out, ptr addrspace(1) %car
; CISI-NEXT: s_mov_b32 s10, -1
; CISI-NEXT: s_waitcnt lgkmcnt(0)
; CISI-NEXT: s_add_u32 s4, s4, s6
-; CISI-NEXT: s_cselect_b64 s[12:13], -1, 0
-; CISI-NEXT: s_or_b32 s6, s12, s13
; CISI-NEXT: s_addc_u32 s5, s5, s7
; CISI-NEXT: s_mov_b32 s8, s0
; CISI-NEXT: s_mov_b32 s9, s1
@@ -1674,8 +1672,6 @@ define amdgpu_kernel void @susubo64(ptr addrspace(1) %out, ptr addrspace(1) %car
; CISI-NEXT: s_mov_b32 s10, -1
; CISI-NEXT: s_waitcnt lgkmcnt(0)
; CISI-NEXT: s_sub_u32 s4, s4, s6
-; CISI-NEXT: s_cselect_b64 s[12:13], -1, 0
-; CISI-NEXT: s_or_b32 s6, s12, s13
; CISI-NEXT: s_subb_u32 s5, s5, s7
; CISI-NEXT: s_mov_b32 s8, s0
; CISI-NEXT: s_mov_b32 s9, s1
diff --git a/llvm/test/CodeGen/AMDGPU/expand-scalar-carry-out-select-user.ll b/llvm/test/CodeGen/AMDGPU/expand-scalar-carry-out-select-user.ll
index dbdea8e..71af21a1 100644
--- a/llvm/test/CodeGen/AMDGPU/expand-scalar-carry-out-select-user.ll
+++ b/llvm/test/CodeGen/AMDGPU/expand-scalar-carry-out-select-user.ll
@@ -12,8 +12,6 @@ define i32 @s_add_co_select_user() {
; GFX7-NEXT: s_load_dword s6, s[4:5], 0x0
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_add_u32 s7, s6, s6
-; GFX7-NEXT: s_cselect_b64 s[4:5], -1, 0
-; GFX7-NEXT: s_or_b32 s4, s4, s5
; GFX7-NEXT: s_addc_u32 s8, s6, 0
; GFX7-NEXT: s_cselect_b64 s[4:5], -1, 0
; GFX7-NEXT: s_and_b64 s[4:5], s[4:5], exec
@@ -88,15 +86,13 @@ bb:
define amdgpu_kernel void @s_add_co_br_user(i32 %i) {
; GFX7-LABEL: s_add_co_br_user:
; GFX7: ; %bb.0: ; %bb
-; GFX7-NEXT: s_load_dword s2, s[8:9], 0x0
+; GFX7-NEXT: s_load_dword s0, s[8:9], 0x0
; GFX7-NEXT: s_add_i32 s12, s12, s17
; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
-; GFX7-NEXT: s_add_u32 s0, s2, s2
-; GFX7-NEXT: s_cselect_b64 s[0:1], -1, 0
-; GFX7-NEXT: s_or_b32 s0, s0, s1
-; GFX7-NEXT: s_addc_u32 s0, s2, 0
+; GFX7-NEXT: s_add_u32 s1, s0, s0
+; GFX7-NEXT: s_addc_u32 s0, s0, 0
; GFX7-NEXT: s_cselect_b64 s[0:1], -1, 0
; GFX7-NEXT: s_andn2_b64 vcc, exec, s[0:1]
; GFX7-NEXT: s_cbranch_vccnz .LBB1_2
diff --git a/llvm/test/CodeGen/AMDGPU/fix-sgpr-copies-wwm.ll b/llvm/test/CodeGen/AMDGPU/fix-sgpr-copies-wwm.ll
index b8f084d..db32135 100644
--- a/llvm/test/CodeGen/AMDGPU/fix-sgpr-copies-wwm.ll
+++ b/llvm/test/CodeGen/AMDGPU/fix-sgpr-copies-wwm.ll
@@ -4,14 +4,24 @@
define amdgpu_gs i32 @main() {
; CHECK-LABEL: main:
; CHECK: ; %bb.0: ; %bb
+; CHECK-NEXT: s_bitcmp1_b32 0, 0
; CHECK-NEXT: s_mov_b32 s0, 0
-; CHECK-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; CHECK-NEXT: s_cselect_b32 s1, -1, 0
+; CHECK-NEXT: s_or_saveexec_b32 s2, -1
+; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s1
+; CHECK-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; CHECK-NEXT: v_readfirstlane_b32 s1, v0
+; CHECK-NEXT: s_mov_b32 exec_lo, s2
+; CHECK-NEXT: s_or_b32 s0, s0, s1
+; CHECK-NEXT: s_wait_alu 0xfffe
; CHECK-NEXT: s_bitcmp1_b32 s0, 0
; CHECK-NEXT: s_cselect_b32 s0, -1, 0
+; CHECK-NEXT: s_wait_alu 0xfffe
; CHECK-NEXT: s_xor_b32 s0, s0, -1
-; CHECK-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0
-; CHECK-NEXT: v_readfirstlane_b32 s0, v0
+; CHECK-NEXT: s_wait_alu 0xfffe
+; CHECK-NEXT: v_cndmask_b32_e64 v1, 0, 1, s0
+; CHECK-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; CHECK-NEXT: v_readfirstlane_b32 s0, v1
; CHECK-NEXT: s_wait_alu 0xf1ff
; CHECK-NEXT: ; return to shader part epilog
bb:
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx90a.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx90a.ll
index 22bc62ac..679b289 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx90a.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx90a.ll
@@ -1,8 +1,8 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
-; RUN: llc -mtriple=amdgcn -mcpu=gfx90a < %s | FileCheck -enable-var-scope --check-prefixes=GCN,GFX90A %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx942 < %s | FileCheck -enable-var-scope --check-prefixes=GCN,GFX942 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx90a -amdgpu-mfma-vgpr-form < %s | FileCheck -enable-var-scope --check-prefixes=VGPR,GFX90A-VGPR %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx942 -amdgpu-mfma-vgpr-form < %s | FileCheck -enable-var-scope --check-prefixes=VGPR,GFX942-VGPR %s
+; RUN: llc -verify-machineinstrs -mtriple=amdgcn -mcpu=gfx90a < %s | FileCheck -enable-var-scope --check-prefixes=GCN,GFX90A %s
+; RUN: llc -verify-machineinstrs -mtriple=amdgcn -mcpu=gfx942 < %s | FileCheck -enable-var-scope --check-prefixes=GCN,GFX942 %s
+; RUN: llc -verify-machineinstrs -mtriple=amdgcn -mcpu=gfx90a -amdgpu-mfma-vgpr-form < %s | FileCheck -enable-var-scope --check-prefixes=VGPR,GFX90A-VGPR %s
+; RUN: llc -verify-machineinstrs -mtriple=amdgcn -mcpu=gfx942 -amdgpu-mfma-vgpr-form < %s | FileCheck -enable-var-scope --check-prefixes=VGPR,GFX942-VGPR %s
declare <32 x float> @llvm.amdgcn.mfma.f32.32x32x4bf16.1k(<4 x i16>, <4 x i16>, <32 x float>, i32, i32, i32)
declare <16 x float> @llvm.amdgcn.mfma.f32.16x16x4bf16.1k(<4 x i16>, <4 x i16>, <16 x float>, i32, i32, i32)
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.ll
index 7e30af9..e7d7f87 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.ll
@@ -1,9 +1,9 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
-; RUN: llc -mtriple=amdgcn -mcpu=gfx908 < %s | FileCheck -enable-var-scope --check-prefixes=GCN,NOLIT-SRCC,GFX908,GFX908_A %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx908 -mattr=-mfma-inline-literal-bug < %s | FileCheck -enable-var-scope --check-prefixes=GCN,LIT-SRCC,GFX908,GFX908_A %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx90a < %s | FileCheck -enable-var-scope --check-prefixes=GCN,GFX90A,GFX908_A,GFX90A_42 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx942 < %s | FileCheck -enable-var-scope --check-prefixes=GCN,GFX942,GFX90A_42 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx942 -amdgpu-mfma-vgpr-form < %s | FileCheck -enable-var-scope --check-prefix=GFX942-VGPR %s
+; RUN: llc -verify-machineinstrs -mtriple=amdgcn -mcpu=gfx908 < %s | FileCheck -enable-var-scope --check-prefixes=GCN,NOLIT-SRCC,GFX908,GFX908_A %s
+; RUN: llc -verify-machineinstrs -mtriple=amdgcn -mcpu=gfx908 -mattr=-mfma-inline-literal-bug < %s | FileCheck -enable-var-scope --check-prefixes=GCN,LIT-SRCC,GFX908,GFX908_A %s
+; RUN: llc -verify-machineinstrs -mtriple=amdgcn -mcpu=gfx90a < %s | FileCheck -enable-var-scope --check-prefixes=GCN,GFX90A,GFX908_A,GFX90A_42 %s
+; RUN: llc -verify-machineinstrs -mtriple=amdgcn -mcpu=gfx942 < %s | FileCheck -enable-var-scope --check-prefixes=GCN,GFX942,GFX90A_42 %s
+; RUN: llc -verify-machineinstrs -mtriple=amdgcn -mcpu=gfx942 -amdgpu-mfma-vgpr-form < %s | FileCheck -enable-var-scope --check-prefix=GFX942-VGPR %s
declare <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float, float, <32 x float>, i32, i32, i32)
declare <16 x float> @llvm.amdgcn.mfma.f32.16x16x1f32(float, float, <16 x float>, i32, i32, i32)
@@ -3186,13 +3186,14 @@ define amdgpu_kernel void @test_mfma_i32_16x16x4i8_splatimm_src2_64(ptr addrspac
;
; GFX942-VGPR-LABEL: test_mfma_i32_16x16x4i8_splatimm_src2_64:
; GFX942-VGPR: ; %bb.0: ; %bb
-; GFX942-VGPR-NEXT: v_mov_b32_e32 v0, 1
-; GFX942-VGPR-NEXT: v_mov_b32_e32 v1, 2
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v16, 1
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v17, 2
; GFX942-VGPR-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX942-VGPR-NEXT: s_nop 0
+; GFX942-VGPR-NEXT: v_mfma_i32_16x16x4_4b_i8 v[0:15], v16, v17, 64 cbsz:1 abid:2 blgp:3
; GFX942-VGPR-NEXT: v_mov_b32_e32 v16, 0
-; GFX942-VGPR-NEXT: v_mfma_i32_16x16x4_4b_i8 v[0:15], v0, v1, 64 cbsz:1 abid:2 blgp:3
; GFX942-VGPR-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-VGPR-NEXT: s_nop 9
+; GFX942-VGPR-NEXT: s_nop 8
; GFX942-VGPR-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] offset:48
; GFX942-VGPR-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] offset:32
; GFX942-VGPR-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:16
@@ -4538,13 +4539,14 @@ define amdgpu_kernel void @test_mfma_f32_16x16x1f32_imm_splat(ptr addrspace(1) %
;
; GFX942-VGPR-LABEL: test_mfma_f32_16x16x1f32_imm_splat:
; GFX942-VGPR: ; %bb.0: ; %bb
-; GFX942-VGPR-NEXT: v_mov_b32_e32 v0, 1.0
-; GFX942-VGPR-NEXT: v_mov_b32_e32 v1, 2.0
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v16, 1.0
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v17, 2.0
; GFX942-VGPR-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX942-VGPR-NEXT: s_nop 0
+; GFX942-VGPR-NEXT: v_mfma_f32_16x16x1_4b_f32 v[0:15], v16, v17, 1.0
; GFX942-VGPR-NEXT: v_mov_b32_e32 v16, 0
-; GFX942-VGPR-NEXT: v_mfma_f32_16x16x1_4b_f32 v[0:15], v0, v1, 1.0
; GFX942-VGPR-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-VGPR-NEXT: s_nop 8
+; GFX942-VGPR-NEXT: s_nop 7
; GFX942-VGPR-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] offset:48
; GFX942-VGPR-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] offset:32
; GFX942-VGPR-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:16
@@ -4689,15 +4691,16 @@ define amdgpu_kernel void @test_mfma_f32_32x32x8f16_imm_splat(ptr addrspace(1) %
;
; GFX942-VGPR-LABEL: test_mfma_f32_32x32x8f16_imm_splat:
; GFX942-VGPR: ; %bb.0: ; %bb
-; GFX942-VGPR-NEXT: v_mov_b32_e32 v0, 0x3c003c00
-; GFX942-VGPR-NEXT: v_mov_b32_e32 v1, v0
-; GFX942-VGPR-NEXT: v_mov_b32_e32 v2, 0x40004000
-; GFX942-VGPR-NEXT: v_mov_b32_e32 v3, v2
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v16, 0x3c003c00
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v17, v16
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v18, 0x40004000
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v19, v18
; GFX942-VGPR-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX942-VGPR-NEXT: s_nop 0
+; GFX942-VGPR-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[16:17], v[18:19], 1.0
; GFX942-VGPR-NEXT: v_mov_b32_e32 v16, 0
-; GFX942-VGPR-NEXT: v_mfma_f32_32x32x8_f16 v[0:15], v[0:1], v[2:3], 1.0
; GFX942-VGPR-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942-VGPR-NEXT: s_nop 9
+; GFX942-VGPR-NEXT: s_nop 8
; GFX942-VGPR-NEXT: global_store_dwordx4 v16, v[12:15], s[0:1] offset:48
; GFX942-VGPR-NEXT: global_store_dwordx4 v16, v[8:11], s[0:1] offset:32
; GFX942-VGPR-NEXT: global_store_dwordx4 v16, v[4:7], s[0:1] offset:16
@@ -4908,14 +4911,14 @@ define amdgpu_kernel void @test_mfma_f32_32x32x1f32_imm_splat(ptr addrspace(1) %
;
; GFX942-VGPR-LABEL: test_mfma_f32_32x32x1f32_imm_splat:
; GFX942-VGPR: ; %bb.0: ; %bb
-; GFX942-VGPR-NEXT: v_mov_b32_e32 v0, 1.0
-; GFX942-VGPR-NEXT: v_mov_b32_e32 v1, 2.0
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v32, 1.0
+; GFX942-VGPR-NEXT: v_mov_b32_e32 v33, 2.0
; GFX942-VGPR-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX942-VGPR-NEXT: s_nop 0
+; GFX942-VGPR-NEXT: v_mfma_f32_32x32x1_2b_f32 v[0:31], v32, v33, 0
; GFX942-VGPR-NEXT: v_mov_b32_e32 v32, 0
-; GFX942-VGPR-NEXT: v_mfma_f32_32x32x1_2b_f32 v[0:31], v0, v1, 0
; GFX942-VGPR-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-VGPR-NEXT: s_nop 15
-; GFX942-VGPR-NEXT: s_nop 0
; GFX942-VGPR-NEXT: global_store_dwordx4 v32, v[28:31], s[0:1] offset:112
; GFX942-VGPR-NEXT: global_store_dwordx4 v32, v[24:27], s[0:1] offset:96
; GFX942-VGPR-NEXT: global_store_dwordx4 v32, v[20:23], s[0:1] offset:80
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readfirstlane.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readfirstlane.ll
index 02d2990..d1ba892 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readfirstlane.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readfirstlane.ll
@@ -396,7 +396,8 @@ define amdgpu_kernel void @test_readfirstlane_imm_f64(ptr addrspace(1) %out) {
;
; CHECK-GISEL-LABEL: test_readfirstlane_imm_f64:
; CHECK-GISEL: ; %bb.0:
-; CHECK-GISEL-NEXT: s_mov_b64 s[0:1], 0x4040000000000000
+; CHECK-GISEL-NEXT: s_mov_b32 s0, 0
+; CHECK-GISEL-NEXT: s_mov_b32 s1, 0x40400000
; CHECK-GISEL-NEXT: ;;#ASMSTART
; CHECK-GISEL-NEXT: ; use s[0:1]
; CHECK-GISEL-NEXT: ;;#ASMEND
@@ -455,13 +456,14 @@ define amdgpu_kernel void @test_readfirstlane_imm_fold_i64(ptr addrspace(1) %out
; CHECK-GISEL-LABEL: test_readfirstlane_imm_fold_i64:
; CHECK-GISEL: ; %bb.0:
; CHECK-GISEL-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; CHECK-GISEL-NEXT: s_mov_b64 s[2:3], 32
; CHECK-GISEL-NEXT: s_add_i32 s12, s12, s17
-; CHECK-GISEL-NEXT: v_mov_b32_e32 v0, 32
+; CHECK-GISEL-NEXT: v_mov_b32_e32 v0, s2
; CHECK-GISEL-NEXT: s_mov_b32 flat_scratch_lo, s13
-; CHECK-GISEL-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CHECK-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-GISEL-NEXT: v_mov_b32_e32 v3, s1
-; CHECK-GISEL-NEXT: v_mov_b32_e32 v1, 0
+; CHECK-GISEL-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; CHECK-GISEL-NEXT: v_mov_b32_e32 v1, s3
; CHECK-GISEL-NEXT: v_mov_b32_e32 v2, s0
; CHECK-GISEL-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; CHECK-GISEL-NEXT: s_endpgm
@@ -488,13 +490,15 @@ define amdgpu_kernel void @test_readfirstlane_imm_fold_f64(ptr addrspace(1) %out
; CHECK-GISEL-LABEL: test_readfirstlane_imm_fold_f64:
; CHECK-GISEL: ; %bb.0:
; CHECK-GISEL-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; CHECK-GISEL-NEXT: s_mov_b32 s2, 0
; CHECK-GISEL-NEXT: s_add_i32 s12, s12, s17
-; CHECK-GISEL-NEXT: v_mov_b32_e32 v0, 0
-; CHECK-GISEL-NEXT: s_mov_b32 flat_scratch_lo, s13
-; CHECK-GISEL-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; CHECK-GISEL-NEXT: s_mov_b32 s3, 0x40400000
+; CHECK-GISEL-NEXT: v_mov_b32_e32 v0, s2
; CHECK-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-GISEL-NEXT: v_mov_b32_e32 v3, s1
-; CHECK-GISEL-NEXT: v_mov_b32_e32 v1, 0x40400000
+; CHECK-GISEL-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CHECK-GISEL-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; CHECK-GISEL-NEXT: v_mov_b32_e32 v1, s3
; CHECK-GISEL-NEXT: v_mov_b32_e32 v2, s0
; CHECK-GISEL-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; CHECK-GISEL-NEXT: s_endpgm
@@ -584,17 +588,17 @@ define amdgpu_kernel void @test_readfirstlane_copy_from_sgpr_i64(ptr addrspace(1
; CHECK-SDAG: ; %bb.0:
; CHECK-SDAG-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; CHECK-SDAG-NEXT: s_add_i32 s12, s12, s17
+; CHECK-SDAG-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CHECK-SDAG-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CHECK-SDAG-NEXT: ;;#ASMSTART
; CHECK-SDAG-NEXT: s_mov_b64 s[2:3], 0
; CHECK-SDAG-NEXT: ;;#ASMEND
-; CHECK-SDAG-NEXT: v_mov_b32_e32 v2, s2
-; CHECK-SDAG-NEXT: s_mov_b32 flat_scratch_lo, s13
; CHECK-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-SDAG-NEXT: v_mov_b32_e32 v0, s0
-; CHECK-SDAG-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
-; CHECK-SDAG-NEXT: v_mov_b32_e32 v1, s1
-; CHECK-SDAG-NEXT: v_mov_b32_e32 v3, s3
-; CHECK-SDAG-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
+; CHECK-SDAG-NEXT: v_mov_b32_e32 v3, s1
+; CHECK-SDAG-NEXT: v_mov_b32_e32 v0, s2
+; CHECK-SDAG-NEXT: v_mov_b32_e32 v1, s3
+; CHECK-SDAG-NEXT: v_mov_b32_e32 v2, s0
+; CHECK-SDAG-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; CHECK-SDAG-NEXT: s_endpgm
;
; CHECK-GISEL-LABEL: test_readfirstlane_copy_from_sgpr_i64:
@@ -624,17 +628,17 @@ define amdgpu_kernel void @test_readfirstlane_copy_from_sgpr_f64(ptr addrspace(1
; CHECK-SDAG: ; %bb.0:
; CHECK-SDAG-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; CHECK-SDAG-NEXT: s_add_i32 s12, s12, s17
+; CHECK-SDAG-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CHECK-SDAG-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CHECK-SDAG-NEXT: ;;#ASMSTART
; CHECK-SDAG-NEXT: s_mov_b64 s[2:3], 0
; CHECK-SDAG-NEXT: ;;#ASMEND
-; CHECK-SDAG-NEXT: v_mov_b32_e32 v2, s2
-; CHECK-SDAG-NEXT: s_mov_b32 flat_scratch_lo, s13
; CHECK-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-SDAG-NEXT: v_mov_b32_e32 v0, s0
-; CHECK-SDAG-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
-; CHECK-SDAG-NEXT: v_mov_b32_e32 v1, s1
-; CHECK-SDAG-NEXT: v_mov_b32_e32 v3, s3
-; CHECK-SDAG-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
+; CHECK-SDAG-NEXT: v_mov_b32_e32 v3, s1
+; CHECK-SDAG-NEXT: v_mov_b32_e32 v0, s2
+; CHECK-SDAG-NEXT: v_mov_b32_e32 v1, s3
+; CHECK-SDAG-NEXT: v_mov_b32_e32 v2, s0
+; CHECK-SDAG-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; CHECK-SDAG-NEXT: s_endpgm
;
; CHECK-GISEL-LABEL: test_readfirstlane_copy_from_sgpr_f64:
diff --git a/llvm/test/CodeGen/AMDGPU/mfma-loop.ll b/llvm/test/CodeGen/AMDGPU/mfma-loop.ll
index 4bb6538..e330c72 100644
--- a/llvm/test/CodeGen/AMDGPU/mfma-loop.ll
+++ b/llvm/test/CodeGen/AMDGPU/mfma-loop.ll
@@ -1,7 +1,7 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
-; RUN: llc -mtriple=amdgcn -mcpu=gfx908 < %s | FileCheck -enable-var-scope -check-prefixes=GFX908 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx90a < %s | FileCheck -enable-var-scope -check-prefixes=GFX90A %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx942 < %s | FileCheck -enable-var-scope -check-prefixes=GFX942 %s
+; RUN: llc -verify-machineinstrs -mtriple=amdgcn -mcpu=gfx908 < %s | FileCheck -enable-var-scope -check-prefixes=GFX908 %s
+; RUN: llc -verify-machineinstrs -mtriple=amdgcn -mcpu=gfx90a < %s | FileCheck -enable-var-scope -check-prefixes=GFX90A %s
+; RUN: llc -verify-machineinstrs -mtriple=amdgcn -mcpu=gfx942 < %s | FileCheck -enable-var-scope -check-prefixes=GFX942 %s
; Check that we do not copy agprs to vgprs and back inside the loop.
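; (Such round trips would appear as v_accvgpr_read_b32 / v_accvgpr_write_b32
; pairs in the loop body.)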
diff --git a/llvm/test/CodeGen/AMDGPU/optimize-compare.mir b/llvm/test/CodeGen/AMDGPU/optimize-compare.mir
index fba42c4..fa452f3 100644
--- a/llvm/test/CodeGen/AMDGPU/optimize-compare.mir
+++ b/llvm/test/CodeGen/AMDGPU/optimize-compare.mir
@@ -2277,3 +2277,181 @@ body: |
S_ENDPGM 0
...
+
+---
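+# The s_or_b32 of the two halves of the s_cselect result feeds an
+# s_cmp_lg_u32 with 0; both are redundant and should fold away, leaving the
+# branch on the original scc.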
+name: s_cselect_b64_s_or_b32_s_cmp_lg_u32_0x00000000
+body: |
+ ; GCN-LABEL: name: s_cselect_b64_s_or_b32_s_cmp_lg_u32_0x00000000
+ ; GCN: bb.0:
+ ; GCN-NEXT: successors: %bb.1(0x40000000), %bb.2(0x40000000)
+ ; GCN-NEXT: liveins: $sgpr0_sgpr1, $vgpr0_vgpr1
+ ; GCN-NEXT: {{ $}}
+ ; GCN-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+ ; GCN-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY [[DEF]]
+ ; GCN-NEXT: S_CMP_LG_U32 [[COPY]], 0, implicit-def $scc
+ ; GCN-NEXT: [[S_CSELECT_B64_:%[0-9]+]]:sreg_64_xexec = S_CSELECT_B64 -1, 0, implicit $scc
+ ; GCN-NEXT: [[COPY1:%[0-9]+]]:sreg_32_xm0_xexec = COPY [[S_CSELECT_B64_]].sub0
+ ; GCN-NEXT: [[COPY2:%[0-9]+]]:sreg_32_xm0_xexec = COPY [[S_CSELECT_B64_]].sub1
+ ; GCN-NEXT: S_CBRANCH_SCC0 %bb.2, implicit $scc
+ ; GCN-NEXT: S_BRANCH %bb.1
+ ; GCN-NEXT: {{ $}}
+ ; GCN-NEXT: bb.1:
+ ; GCN-NEXT: successors: %bb.2(0x80000000)
+ ; GCN-NEXT: {{ $}}
+ ; GCN-NEXT: bb.2:
+ ; GCN-NEXT: S_ENDPGM 0
+ bb.0:
+ successors: %bb.1(0x40000000), %bb.2(0x40000000)
+ liveins: $sgpr0_sgpr1, $vgpr0_vgpr1
+ %0:vgpr_32 = IMPLICIT_DEF
+ %2:sreg_32 = COPY %0
+ S_CMP_LG_U32 %2, 0, implicit-def $scc
+ %31:sreg_64_xexec = S_CSELECT_B64 -1, 0, implicit $scc
+ %40:sreg_32_xm0_xexec = COPY %31.sub0:sreg_64_xexec
+ %41:sreg_32_xm0_xexec = COPY %31.sub1:sreg_64_xexec
+ %sgpr4:sreg_32 = S_OR_B32 %40:sreg_32_xm0_xexec, %41:sreg_32_xm0_xexec, implicit-def $scc
+ S_CMP_LG_U32 %sgpr4, 0, implicit-def $scc
+ S_CBRANCH_SCC0 %bb.2, implicit $scc
+ S_BRANCH %bb.1
+
+ bb.1:
+ successors: %bb.2(0x80000000)
+
+ bb.2:
+ S_ENDPGM 0
+
+...
+---
+# Do not delete s_or_b32 because of an intervening def of scc
+name: s_cselect_b64_s_or_b32_s_cmp_lg_u32_0x00000000_cant_optimize_intervening
+body: |
+ ; GCN-LABEL: name: s_cselect_b64_s_or_b32_s_cmp_lg_u32_0x00000000_cant_optimize_intervening
+ ; GCN: bb.0:
+ ; GCN-NEXT: successors: %bb.1(0x40000000), %bb.2(0x40000000)
+ ; GCN-NEXT: liveins: $sgpr0_sgpr1, $vgpr0_vgpr1
+ ; GCN-NEXT: {{ $}}
+ ; GCN-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+ ; GCN-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY [[DEF]]
+ ; GCN-NEXT: S_CMP_LG_U32 [[COPY]], 0, implicit-def $scc
+ ; GCN-NEXT: [[S_CSELECT_B64_:%[0-9]+]]:sreg_64_xexec = S_CSELECT_B64 -1, 0, implicit $scc
+ ; GCN-NEXT: S_CMP_LG_U32 [[COPY]], 0, implicit-def $scc
+ ; GCN-NEXT: [[COPY1:%[0-9]+]]:sreg_32_xm0_xexec = COPY [[S_CSELECT_B64_]].sub0
+ ; GCN-NEXT: [[COPY2:%[0-9]+]]:sreg_32_xm0_xexec = COPY [[S_CSELECT_B64_]].sub1
+ ; GCN-NEXT: %sgpr4:sreg_32 = S_OR_B32 [[COPY1]], [[COPY2]], implicit-def $scc
+ ; GCN-NEXT: S_CBRANCH_SCC0 %bb.2, implicit $scc
+ ; GCN-NEXT: S_BRANCH %bb.1
+ ; GCN-NEXT: {{ $}}
+ ; GCN-NEXT: bb.1:
+ ; GCN-NEXT: successors: %bb.2(0x80000000)
+ ; GCN-NEXT: {{ $}}
+ ; GCN-NEXT: bb.2:
+ ; GCN-NEXT: S_ENDPGM 0
+ bb.0:
+ successors: %bb.1(0x40000000), %bb.2(0x40000000)
+ liveins: $sgpr0_sgpr1, $vgpr0_vgpr1
+ %0:vgpr_32 = IMPLICIT_DEF
+ %2:sreg_32 = COPY %0
+ S_CMP_LG_U32 %2, 0, implicit-def $scc
+ %31:sreg_64_xexec = S_CSELECT_B64 -1, 0, implicit $scc
+ S_CMP_LG_U32 %2, 0, implicit-def $scc
+ %40:sreg_32_xm0_xexec = COPY %31.sub0:sreg_64_xexec
+ %41:sreg_32_xm0_xexec = COPY %31.sub1:sreg_64_xexec
+ %sgpr4:sreg_32 = S_OR_B32 %40:sreg_32_xm0_xexec, %41:sreg_32_xm0_xexec, implicit-def $scc
+ S_CMP_LG_U32 %sgpr4, 0, implicit-def $scc
+ S_CBRANCH_SCC0 %bb.2, implicit $scc
+ S_BRANCH %bb.1
+
+ bb.1:
+ successors: %bb.2(0x80000000)
+
+ bb.2:
+ S_ENDPGM 0
+
+...
+
+---
+# Do not delete s_or_b32 since both operands are sub1.
+name: s_cselect_b64_s_or_b32_s_cmp_lg_u32_0x00000000_cant_optimize
+body: |
+ ; GCN-LABEL: name: s_cselect_b64_s_or_b32_s_cmp_lg_u32_0x00000000_cant_optimize
+ ; GCN: bb.0:
+ ; GCN-NEXT: successors: %bb.1(0x40000000), %bb.2(0x40000000)
+ ; GCN-NEXT: liveins: $sgpr0_sgpr1, $vgpr0_vgpr1
+ ; GCN-NEXT: {{ $}}
+ ; GCN-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+ ; GCN-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY [[DEF]]
+ ; GCN-NEXT: S_CMP_LG_U32 [[COPY]], 0, implicit-def $scc
+ ; GCN-NEXT: [[S_CSELECT_B64_:%[0-9]+]]:sreg_64_xexec = S_CSELECT_B64 1, 0, implicit $scc
+ ; GCN-NEXT: [[COPY1:%[0-9]+]]:sreg_32_xm0_xexec = COPY [[S_CSELECT_B64_]].sub1
+ ; GCN-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY [[S_CSELECT_B64_]].sub1
+ ; GCN-NEXT: %sgpr4:sreg_32 = S_OR_B32 [[COPY1]], [[COPY2]], implicit-def $scc
+ ; GCN-NEXT: S_CBRANCH_SCC0 %bb.2, implicit $scc
+ ; GCN-NEXT: S_BRANCH %bb.1
+ ; GCN-NEXT: {{ $}}
+ ; GCN-NEXT: bb.1:
+ ; GCN-NEXT: successors: %bb.2(0x80000000)
+ ; GCN-NEXT: {{ $}}
+ ; GCN-NEXT: bb.2:
+ ; GCN-NEXT: S_ENDPGM 0
+ bb.0:
+ successors: %bb.1(0x40000000), %bb.2(0x40000000)
+ liveins: $sgpr0_sgpr1, $vgpr0_vgpr1
+ %0:vgpr_32 = IMPLICIT_DEF
+ %2:sreg_32 = COPY %0
+ S_CMP_LG_U32 %2, 0, implicit-def $scc
+ %31:sreg_64_xexec = S_CSELECT_B64 1, 0, implicit $scc
+ %40:sreg_32_xm0_xexec = COPY %31.sub1:sreg_64_xexec
+ %41:sreg_32 = COPY %31.sub1:sreg_64_xexec
+ %sgpr4:sreg_32 = S_OR_B32 %40:sreg_32_xm0_xexec, %41:sreg_32, implicit-def $scc
+ S_CMP_LG_U32 %sgpr4, 0, implicit-def $scc
+ S_CBRANCH_SCC0 %bb.2, implicit $scc
+ S_BRANCH %bb.1
+
+ bb.1:
+ successors: %bb.2(0x80000000)
+
+ bb.2:
+ S_ENDPGM 0
+
+...
+
+---
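+# The trailing s_cmp_lg_u32 0 is still folded away (s_or_b32 already sets
+# $scc), but the s_or_b32 itself is kept: its undef operands cannot be
+# traced back to the s_cselect_b64.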
+name: s_cselect_b64_undef_s_or_b32_s_cmp_lg_u32_0x00000000
+body: |
+ ; GCN-LABEL: name: s_cselect_b64_undef_s_or_b32_s_cmp_lg_u32_0x00000000
+ ; GCN: bb.0:
+ ; GCN-NEXT: successors: %bb.1(0x40000000), %bb.2(0x40000000)
+ ; GCN-NEXT: liveins: $sgpr0_sgpr1, $vgpr0_vgpr1
+ ; GCN-NEXT: {{ $}}
+ ; GCN-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+ ; GCN-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY [[DEF]]
+ ; GCN-NEXT: S_CMP_LG_U32 [[COPY]], 0, implicit-def $scc
+ ; GCN-NEXT: [[S_CSELECT_B64_:%[0-9]+]]:sreg_64_xexec = S_CSELECT_B64 -1, 0, implicit $scc
+ ; GCN-NEXT: %sgpr4:sreg_32 = S_OR_B32 undef %4:sreg_32_xm0_xexec, undef %5:sreg_32_xm0_xexec, implicit-def $scc
+ ; GCN-NEXT: S_CBRANCH_SCC0 %bb.2, implicit $scc
+ ; GCN-NEXT: S_BRANCH %bb.1
+ ; GCN-NEXT: {{ $}}
+ ; GCN-NEXT: bb.1:
+ ; GCN-NEXT: successors: %bb.2(0x80000000)
+ ; GCN-NEXT: {{ $}}
+ ; GCN-NEXT: bb.2:
+ ; GCN-NEXT: S_ENDPGM 0
+ bb.0:
+ successors: %bb.1(0x40000000), %bb.2(0x40000000)
+ liveins: $sgpr0_sgpr1, $vgpr0_vgpr1
+ %0:vgpr_32 = IMPLICIT_DEF
+ %2:sreg_32 = COPY %0
+ S_CMP_LG_U32 %2, 0, implicit-def $scc
+ %31:sreg_64_xexec = S_CSELECT_B64 -1, 0, implicit $scc
+ %sgpr4:sreg_32 = S_OR_B32 undef %40:sreg_32_xm0_xexec, undef %41:sreg_32_xm0_xexec, implicit-def $scc
+ S_CMP_LG_U32 %sgpr4, 0, implicit-def $scc
+ S_CBRANCH_SCC0 %bb.2, implicit $scc
+ S_BRANCH %bb.1
+
+ bb.1:
+ successors: %bb.2(0x80000000)
+
+ bb.2:
+ S_ENDPGM 0
+
+...
diff --git a/llvm/test/CodeGen/AMDGPU/rewrite-vgpr-mfma-to-agpr.ll b/llvm/test/CodeGen/AMDGPU/rewrite-vgpr-mfma-to-agpr.ll
index 8803f3a..fc79916 100644
--- a/llvm/test/CodeGen/AMDGPU/rewrite-vgpr-mfma-to-agpr.ll
+++ b/llvm/test/CodeGen/AMDGPU/rewrite-vgpr-mfma-to-agpr.ll
@@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
-; RUN: llc -mcpu=gfx942 -amdgpu-mfma-vgpr-form < %s | FileCheck %s
+; RUN: llc -verify-machineinstrs -mcpu=gfx942 -amdgpu-mfma-vgpr-form < %s | FileCheck %s
target triple = "amdgcn-amd-amdhsa"
diff --git a/llvm/test/CodeGen/AMDGPU/sdiv64.ll b/llvm/test/CodeGen/AMDGPU/sdiv64.ll
index 71f5a94..74a6d7f 100644
--- a/llvm/test/CodeGen/AMDGPU/sdiv64.ll
+++ b/llvm/test/CodeGen/AMDGPU/sdiv64.ll
@@ -8,7 +8,6 @@ define amdgpu_kernel void @s_test_sdiv(ptr addrspace(1) %out, i64 %x, i64 %y) {
; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xd
; GCN-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x9
; GCN-NEXT: s_mov_b32 s3, 0xf000
-; GCN-NEXT: s_mov_b32 s2, -1
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_ashr_i32 s8, s1, 31
; GCN-NEXT: s_add_u32 s0, s0, s8
@@ -17,8 +16,8 @@ define amdgpu_kernel void @s_test_sdiv(ptr addrspace(1) %out, i64 %x, i64 %y) {
; GCN-NEXT: s_xor_b64 s[10:11], s[0:1], s[8:9]
; GCN-NEXT: v_cvt_f32_u32_e32 v0, s10
; GCN-NEXT: v_cvt_f32_u32_e32 v1, s11
-; GCN-NEXT: s_sub_u32 s12, 0, s10
-; GCN-NEXT: s_subb_u32 s13, 0, s11
+; GCN-NEXT: s_sub_u32 s0, 0, s10
+; GCN-NEXT: s_subb_u32 s1, 0, s11
; GCN-NEXT: v_madmk_f32 v0, v1, 0x4f800000, v0
; GCN-NEXT: v_rcp_f32_e32 v0, v0
; GCN-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0
@@ -27,128 +26,121 @@ define amdgpu_kernel void @s_test_sdiv(ptr addrspace(1) %out, i64 %x, i64 %y) {
; GCN-NEXT: v_madmk_f32 v0, v1, 0xcf800000, v0
; GCN-NEXT: v_cvt_u32_f32_e32 v0, v0
; GCN-NEXT: v_cvt_u32_f32_e32 v1, v1
-; GCN-NEXT: v_mul_hi_u32 v2, s12, v0
-; GCN-NEXT: v_readfirstlane_b32 s14, v1
-; GCN-NEXT: v_readfirstlane_b32 s0, v0
-; GCN-NEXT: s_mul_i32 s1, s12, s14
-; GCN-NEXT: v_readfirstlane_b32 s17, v2
-; GCN-NEXT: s_mul_i32 s15, s13, s0
-; GCN-NEXT: s_mul_i32 s16, s12, s0
-; GCN-NEXT: s_add_i32 s1, s17, s1
-; GCN-NEXT: v_mul_hi_u32 v3, v0, s16
-; GCN-NEXT: s_add_i32 s1, s1, s15
-; GCN-NEXT: v_mul_hi_u32 v0, v0, s1
-; GCN-NEXT: v_mul_hi_u32 v4, v1, s16
-; GCN-NEXT: v_readfirstlane_b32 s15, v3
-; GCN-NEXT: s_mul_i32 s17, s0, s1
-; GCN-NEXT: v_mul_hi_u32 v1, v1, s1
-; GCN-NEXT: s_add_u32 s15, s15, s17
-; GCN-NEXT: v_readfirstlane_b32 s17, v0
-; GCN-NEXT: s_addc_u32 s17, 0, s17
-; GCN-NEXT: s_mul_i32 s16, s14, s16
-; GCN-NEXT: v_readfirstlane_b32 s18, v4
-; GCN-NEXT: s_add_u32 s15, s15, s16
-; GCN-NEXT: s_addc_u32 s15, s17, s18
-; GCN-NEXT: v_readfirstlane_b32 s16, v1
-; GCN-NEXT: s_addc_u32 s16, s16, 0
-; GCN-NEXT: s_mul_i32 s1, s14, s1
-; GCN-NEXT: s_add_u32 s1, s15, s1
-; GCN-NEXT: s_addc_u32 s15, 0, s16
-; GCN-NEXT: s_add_u32 s16, s0, s1
-; GCN-NEXT: v_mov_b32_e32 v0, s16
-; GCN-NEXT: v_mul_hi_u32 v0, s12, v0
-; GCN-NEXT: s_cselect_b64 s[0:1], -1, 0
-; GCN-NEXT: s_or_b32 s0, s0, s1
-; GCN-NEXT: s_addc_u32 s14, s14, s15
-; GCN-NEXT: s_mul_i32 s0, s12, s14
-; GCN-NEXT: v_readfirstlane_b32 s1, v0
-; GCN-NEXT: s_add_i32 s0, s1, s0
-; GCN-NEXT: s_mul_i32 s13, s13, s16
-; GCN-NEXT: s_mul_i32 s1, s12, s16
-; GCN-NEXT: s_add_i32 s0, s0, s13
-; GCN-NEXT: v_mov_b32_e32 v2, s1
-; GCN-NEXT: v_mov_b32_e32 v0, s0
-; GCN-NEXT: v_mul_hi_u32 v3, s14, v2
-; GCN-NEXT: v_mul_hi_u32 v2, s16, v2
-; GCN-NEXT: v_mul_hi_u32 v1, s14, v0
-; GCN-NEXT: v_mul_hi_u32 v0, s16, v0
-; GCN-NEXT: s_mul_i32 s13, s16, s0
-; GCN-NEXT: v_readfirstlane_b32 s17, v2
-; GCN-NEXT: s_add_u32 s13, s17, s13
-; GCN-NEXT: v_readfirstlane_b32 s15, v0
-; GCN-NEXT: s_mul_i32 s1, s14, s1
-; GCN-NEXT: s_addc_u32 s15, 0, s15
-; GCN-NEXT: v_readfirstlane_b32 s12, v3
-; GCN-NEXT: s_add_u32 s1, s13, s1
-; GCN-NEXT: s_addc_u32 s1, s15, s12
+; GCN-NEXT: v_mul_hi_u32 v2, s0, v0
; GCN-NEXT: v_readfirstlane_b32 s12, v1
-; GCN-NEXT: s_addc_u32 s12, s12, 0
-; GCN-NEXT: s_mul_i32 s0, s14, s0
-; GCN-NEXT: s_add_u32 s0, s1, s0
-; GCN-NEXT: s_addc_u32 s12, 0, s12
-; GCN-NEXT: s_add_u32 s15, s16, s0
-; GCN-NEXT: s_cselect_b64 s[0:1], -1, 0
-; GCN-NEXT: s_or_b32 s0, s0, s1
-; GCN-NEXT: s_addc_u32 s14, s14, s12
+; GCN-NEXT: v_readfirstlane_b32 s2, v0
+; GCN-NEXT: s_mul_i32 s13, s0, s12
+; GCN-NEXT: v_readfirstlane_b32 s16, v2
+; GCN-NEXT: s_mul_i32 s14, s1, s2
+; GCN-NEXT: s_mul_i32 s15, s0, s2
+; GCN-NEXT: s_add_i32 s13, s16, s13
+; GCN-NEXT: v_mul_hi_u32 v3, v0, s15
+; GCN-NEXT: s_add_i32 s13, s13, s14
+; GCN-NEXT: v_mul_hi_u32 v0, v0, s13
+; GCN-NEXT: v_mul_hi_u32 v4, v1, s15
+; GCN-NEXT: v_readfirstlane_b32 s14, v3
+; GCN-NEXT: s_mul_i32 s16, s2, s13
+; GCN-NEXT: v_mul_hi_u32 v1, v1, s13
+; GCN-NEXT: s_add_u32 s14, s14, s16
+; GCN-NEXT: v_readfirstlane_b32 s16, v0
+; GCN-NEXT: s_mul_i32 s15, s12, s15
+; GCN-NEXT: s_addc_u32 s16, 0, s16
+; GCN-NEXT: v_readfirstlane_b32 s17, v4
+; GCN-NEXT: s_add_u32 s14, s14, s15
+; GCN-NEXT: s_addc_u32 s14, s16, s17
+; GCN-NEXT: v_readfirstlane_b32 s15, v1
+; GCN-NEXT: s_addc_u32 s15, s15, 0
+; GCN-NEXT: s_mul_i32 s13, s12, s13
+; GCN-NEXT: s_add_u32 s13, s14, s13
+; GCN-NEXT: s_addc_u32 s14, 0, s15
+; GCN-NEXT: s_add_u32 s13, s2, s13
+; GCN-NEXT: v_mov_b32_e32 v0, s13
+; GCN-NEXT: v_mul_hi_u32 v0, s0, v0
+; GCN-NEXT: s_addc_u32 s12, s12, s14
+; GCN-NEXT: s_mul_i32 s14, s0, s12
+; GCN-NEXT: s_mul_i32 s1, s1, s13
+; GCN-NEXT: v_readfirstlane_b32 s15, v0
+; GCN-NEXT: s_add_i32 s14, s15, s14
+; GCN-NEXT: s_mul_i32 s0, s0, s13
+; GCN-NEXT: s_add_i32 s1, s14, s1
+; GCN-NEXT: v_mov_b32_e32 v2, s0
+; GCN-NEXT: v_mov_b32_e32 v0, s1
+; GCN-NEXT: v_mul_hi_u32 v3, s12, v2
+; GCN-NEXT: v_mul_hi_u32 v2, s13, v2
+; GCN-NEXT: v_mul_hi_u32 v1, s12, v0
+; GCN-NEXT: v_mul_hi_u32 v0, s13, v0
+; GCN-NEXT: s_mul_i32 s15, s13, s1
+; GCN-NEXT: v_readfirstlane_b32 s17, v2
+; GCN-NEXT: s_add_u32 s15, s17, s15
+; GCN-NEXT: v_readfirstlane_b32 s16, v0
+; GCN-NEXT: s_mul_i32 s0, s12, s0
+; GCN-NEXT: s_addc_u32 s16, 0, s16
+; GCN-NEXT: v_readfirstlane_b32 s14, v3
+; GCN-NEXT: s_add_u32 s0, s15, s0
+; GCN-NEXT: s_addc_u32 s0, s16, s14
+; GCN-NEXT: v_readfirstlane_b32 s14, v1
+; GCN-NEXT: s_addc_u32 s14, s14, 0
+; GCN-NEXT: s_mul_i32 s1, s12, s1
+; GCN-NEXT: s_add_u32 s0, s0, s1
+; GCN-NEXT: s_addc_u32 s1, 0, s14
+; GCN-NEXT: s_add_u32 s14, s13, s0
+; GCN-NEXT: s_addc_u32 s15, s12, s1
; GCN-NEXT: s_ashr_i32 s12, s7, 31
; GCN-NEXT: s_add_u32 s0, s6, s12
; GCN-NEXT: s_mov_b32 s13, s12
; GCN-NEXT: s_addc_u32 s1, s7, s12
; GCN-NEXT: s_xor_b64 s[6:7], s[0:1], s[12:13]
-; GCN-NEXT: v_mov_b32_e32 v0, s14
+; GCN-NEXT: v_mov_b32_e32 v0, s15
; GCN-NEXT: v_mul_hi_u32 v1, s6, v0
-; GCN-NEXT: v_mov_b32_e32 v2, s15
+; GCN-NEXT: v_mov_b32_e32 v2, s14
; GCN-NEXT: v_mul_hi_u32 v3, s6, v2
; GCN-NEXT: s_mov_b32 s0, s4
; GCN-NEXT: v_readfirstlane_b32 s4, v1
; GCN-NEXT: v_mul_hi_u32 v1, s7, v2
-; GCN-NEXT: s_mul_i32 s1, s6, s14
+; GCN-NEXT: s_mul_i32 s1, s6, s15
; GCN-NEXT: v_readfirstlane_b32 s16, v3
; GCN-NEXT: v_mul_hi_u32 v0, s7, v0
; GCN-NEXT: s_add_u32 s1, s16, s1
; GCN-NEXT: s_addc_u32 s4, 0, s4
-; GCN-NEXT: s_mul_i32 s15, s7, s15
+; GCN-NEXT: s_mul_i32 s14, s7, s14
; GCN-NEXT: v_readfirstlane_b32 s16, v1
-; GCN-NEXT: s_add_u32 s1, s1, s15
+; GCN-NEXT: s_add_u32 s1, s1, s14
; GCN-NEXT: s_addc_u32 s1, s4, s16
; GCN-NEXT: v_readfirstlane_b32 s4, v0
; GCN-NEXT: s_addc_u32 s4, s4, 0
-; GCN-NEXT: s_mul_i32 s14, s7, s14
-; GCN-NEXT: s_add_u32 s16, s1, s14
-; GCN-NEXT: v_mov_b32_e32 v0, s16
+; GCN-NEXT: s_mul_i32 s14, s7, s15
+; GCN-NEXT: s_add_u32 s14, s1, s14
+; GCN-NEXT: v_mov_b32_e32 v0, s14
; GCN-NEXT: v_mul_hi_u32 v0, s10, v0
-; GCN-NEXT: s_addc_u32 s17, 0, s4
+; GCN-NEXT: s_addc_u32 s15, 0, s4
; GCN-NEXT: s_mov_b32 s1, s5
-; GCN-NEXT: s_mul_i32 s4, s10, s17
+; GCN-NEXT: s_mul_i32 s4, s10, s15
; GCN-NEXT: v_readfirstlane_b32 s5, v0
; GCN-NEXT: s_add_i32 s4, s5, s4
-; GCN-NEXT: s_mul_i32 s5, s11, s16
-; GCN-NEXT: s_add_i32 s18, s4, s5
-; GCN-NEXT: s_sub_i32 s14, s7, s18
-; GCN-NEXT: s_mul_i32 s4, s10, s16
+; GCN-NEXT: s_mul_i32 s5, s11, s14
+; GCN-NEXT: s_add_i32 s16, s4, s5
+; GCN-NEXT: s_sub_i32 s17, s7, s16
+; GCN-NEXT: s_mul_i32 s4, s10, s14
; GCN-NEXT: s_sub_u32 s6, s6, s4
; GCN-NEXT: s_cselect_b64 s[4:5], -1, 0
-; GCN-NEXT: s_or_b32 s15, s4, s5
-; GCN-NEXT: s_subb_u32 s19, s14, s11
-; GCN-NEXT: s_sub_u32 s20, s6, s10
-; GCN-NEXT: s_cselect_b64 s[14:15], -1, 0
-; GCN-NEXT: s_or_b32 s14, s14, s15
-; GCN-NEXT: s_subb_u32 s14, s19, 0
-; GCN-NEXT: s_cmp_ge_u32 s14, s11
-; GCN-NEXT: s_cselect_b32 s15, -1, 0
-; GCN-NEXT: s_cmp_ge_u32 s20, s10
+; GCN-NEXT: s_subb_u32 s17, s17, s11
+; GCN-NEXT: s_sub_u32 s18, s6, s10
+; GCN-NEXT: s_subb_u32 s17, s17, 0
+; GCN-NEXT: s_cmp_ge_u32 s17, s11
; GCN-NEXT: s_cselect_b32 s19, -1, 0
-; GCN-NEXT: s_cmp_eq_u32 s14, s11
-; GCN-NEXT: s_cselect_b32 s14, s19, s15
-; GCN-NEXT: s_add_u32 s15, s16, 1
-; GCN-NEXT: s_addc_u32 s19, s17, 0
-; GCN-NEXT: s_add_u32 s20, s16, 2
-; GCN-NEXT: s_addc_u32 s21, s17, 0
-; GCN-NEXT: s_cmp_lg_u32 s14, 0
-; GCN-NEXT: s_cselect_b32 s14, s20, s15
-; GCN-NEXT: s_cselect_b32 s15, s21, s19
+; GCN-NEXT: s_cmp_ge_u32 s18, s10
+; GCN-NEXT: s_cselect_b32 s18, -1, 0
+; GCN-NEXT: s_cmp_eq_u32 s17, s11
+; GCN-NEXT: s_cselect_b32 s17, s18, s19
+; GCN-NEXT: s_add_u32 s18, s14, 1
+; GCN-NEXT: s_addc_u32 s19, s15, 0
+; GCN-NEXT: s_add_u32 s20, s14, 2
+; GCN-NEXT: s_addc_u32 s21, s15, 0
+; GCN-NEXT: s_cmp_lg_u32 s17, 0
+; GCN-NEXT: s_cselect_b32 s17, s20, s18
+; GCN-NEXT: s_cselect_b32 s18, s21, s19
; GCN-NEXT: s_or_b32 s4, s4, s5
-; GCN-NEXT: s_subb_u32 s4, s7, s18
+; GCN-NEXT: s_subb_u32 s4, s7, s16
; GCN-NEXT: s_cmp_ge_u32 s4, s11
; GCN-NEXT: s_cselect_b32 s5, -1, 0
; GCN-NEXT: s_cmp_ge_u32 s6, s10
@@ -156,13 +148,14 @@ define amdgpu_kernel void @s_test_sdiv(ptr addrspace(1) %out, i64 %x, i64 %y) {
; GCN-NEXT: s_cmp_eq_u32 s4, s11
; GCN-NEXT: s_cselect_b32 s4, s6, s5
; GCN-NEXT: s_cmp_lg_u32 s4, 0
-; GCN-NEXT: s_cselect_b32 s5, s15, s17
-; GCN-NEXT: s_cselect_b32 s4, s14, s16
+; GCN-NEXT: s_cselect_b32 s5, s18, s15
+; GCN-NEXT: s_cselect_b32 s4, s17, s14
; GCN-NEXT: s_xor_b64 s[6:7], s[12:13], s[8:9]
; GCN-NEXT: s_xor_b64 s[4:5], s[4:5], s[6:7]
; GCN-NEXT: s_sub_u32 s4, s4, s6
; GCN-NEXT: s_subb_u32 s5, s5, s7
; GCN-NEXT: v_mov_b32_e32 v0, s4
+; GCN-NEXT: s_mov_b32 s2, -1
; GCN-NEXT: v_mov_b32_e32 v1, s5
; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GCN-NEXT: s_endpgm
@@ -202,8 +195,6 @@ define amdgpu_kernel void @s_test_sdiv(ptr addrspace(1) %out, i64 %x, i64 %y) {
; GCN-IR-NEXT: s_cbranch_vccz .LBB0_5
; GCN-IR-NEXT: ; %bb.1: ; %udiv-bb1
; GCN-IR-NEXT: s_add_u32 s18, s16, 1
-; GCN-IR-NEXT: s_cselect_b64 s[10:11], -1, 0
-; GCN-IR-NEXT: s_or_b32 s10, s10, s11
; GCN-IR-NEXT: s_addc_u32 s10, s17, 0
; GCN-IR-NEXT: s_cselect_b64 s[10:11], -1, 0
; GCN-IR-NEXT: s_sub_i32 s16, 63, s16
@@ -235,8 +226,6 @@ define amdgpu_kernel void @s_test_sdiv(ptr addrspace(1) %out, i64 %x, i64 %y) {
; GCN-IR-NEXT: s_sub_u32 s16, s16, s20
; GCN-IR-NEXT: s_subb_u32 s17, s17, s21
; GCN-IR-NEXT: s_add_u32 s14, s14, 1
-; GCN-IR-NEXT: s_cselect_b64 s[20:21], -1, 0
-; GCN-IR-NEXT: s_or_b32 s20, s20, s21
; GCN-IR-NEXT: s_addc_u32 s15, s15, 0
; GCN-IR-NEXT: s_cselect_b64 s[20:21], -1, 0
; GCN-IR-NEXT: s_mov_b64 s[12:13], s[8:9]
@@ -1150,8 +1139,7 @@ define amdgpu_kernel void @s_test_sdiv_k_num_i64(ptr addrspace(1) %out, i64 %x)
; GCN-NEXT: v_cvt_f32_u32_e32 v0, s6
; GCN-NEXT: v_cvt_f32_u32_e32 v1, s7
; GCN-NEXT: s_sub_u32 s2, 0, s6
-; GCN-NEXT: s_subb_u32 s10, 0, s7
-; GCN-NEXT: s_mov_b32 s3, 0xf000
+; GCN-NEXT: s_subb_u32 s8, 0, s7
; GCN-NEXT: v_madmk_f32 v0, v1, 0x4f800000, v0
; GCN-NEXT: v_rcp_f32_e32 v0, v0
; GCN-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0
@@ -1161,115 +1149,109 @@ define amdgpu_kernel void @s_test_sdiv_k_num_i64(ptr addrspace(1) %out, i64 %x)
; GCN-NEXT: v_cvt_u32_f32_e32 v0, v0
; GCN-NEXT: v_cvt_u32_f32_e32 v1, v1
; GCN-NEXT: v_mul_hi_u32 v2, s2, v0
-; GCN-NEXT: v_readfirstlane_b32 s11, v1
-; GCN-NEXT: v_readfirstlane_b32 s8, v0
-; GCN-NEXT: s_mul_i32 s9, s2, s11
-; GCN-NEXT: v_readfirstlane_b32 s14, v2
-; GCN-NEXT: s_mul_i32 s12, s10, s8
-; GCN-NEXT: s_mul_i32 s13, s2, s8
-; GCN-NEXT: s_add_i32 s9, s14, s9
-; GCN-NEXT: v_mul_hi_u32 v3, v0, s13
-; GCN-NEXT: s_add_i32 s9, s9, s12
-; GCN-NEXT: v_mul_hi_u32 v0, v0, s9
-; GCN-NEXT: v_mul_hi_u32 v4, v1, s13
-; GCN-NEXT: v_readfirstlane_b32 s12, v3
-; GCN-NEXT: s_mul_i32 s15, s8, s9
-; GCN-NEXT: v_mul_hi_u32 v1, v1, s9
-; GCN-NEXT: s_add_u32 s12, s12, s15
-; GCN-NEXT: v_readfirstlane_b32 s15, v0
-; GCN-NEXT: s_mul_i32 s13, s11, s13
-; GCN-NEXT: s_addc_u32 s15, 0, s15
-; GCN-NEXT: v_readfirstlane_b32 s14, v4
-; GCN-NEXT: s_add_u32 s12, s12, s13
-; GCN-NEXT: s_addc_u32 s12, s15, s14
-; GCN-NEXT: v_readfirstlane_b32 s13, v1
-; GCN-NEXT: s_addc_u32 s13, s13, 0
-; GCN-NEXT: s_mul_i32 s9, s11, s9
-; GCN-NEXT: s_add_u32 s9, s12, s9
-; GCN-NEXT: s_addc_u32 s12, 0, s13
-; GCN-NEXT: s_add_u32 s13, s8, s9
-; GCN-NEXT: v_mov_b32_e32 v0, s13
+; GCN-NEXT: v_readfirstlane_b32 s9, v1
+; GCN-NEXT: v_readfirstlane_b32 s3, v0
+; GCN-NEXT: s_mul_i32 s10, s2, s9
+; GCN-NEXT: v_readfirstlane_b32 s13, v2
+; GCN-NEXT: s_mul_i32 s11, s8, s3
+; GCN-NEXT: s_mul_i32 s12, s2, s3
+; GCN-NEXT: s_add_i32 s10, s13, s10
+; GCN-NEXT: v_mul_hi_u32 v3, v0, s12
+; GCN-NEXT: s_add_i32 s10, s10, s11
+; GCN-NEXT: v_mul_hi_u32 v0, v0, s10
+; GCN-NEXT: v_mul_hi_u32 v4, v1, s12
+; GCN-NEXT: v_readfirstlane_b32 s11, v3
+; GCN-NEXT: v_mul_hi_u32 v1, v1, s10
+; GCN-NEXT: s_mul_i32 s14, s3, s10
+; GCN-NEXT: s_add_u32 s11, s11, s14
+; GCN-NEXT: v_readfirstlane_b32 s14, v0
+; GCN-NEXT: s_mul_i32 s12, s9, s12
+; GCN-NEXT: s_addc_u32 s14, 0, s14
+; GCN-NEXT: v_readfirstlane_b32 s13, v4
+; GCN-NEXT: s_add_u32 s11, s11, s12
+; GCN-NEXT: v_readfirstlane_b32 s15, v1
+; GCN-NEXT: s_addc_u32 s11, s14, s13
+; GCN-NEXT: s_addc_u32 s12, s15, 0
+; GCN-NEXT: s_mul_i32 s10, s9, s10
+; GCN-NEXT: s_add_u32 s10, s11, s10
+; GCN-NEXT: s_addc_u32 s11, 0, s12
+; GCN-NEXT: s_add_u32 s10, s3, s10
+; GCN-NEXT: v_mov_b32_e32 v0, s10
; GCN-NEXT: v_mul_hi_u32 v0, s2, v0
-; GCN-NEXT: s_cselect_b64 s[8:9], -1, 0
-; GCN-NEXT: s_or_b32 s8, s8, s9
-; GCN-NEXT: s_addc_u32 s11, s11, s12
-; GCN-NEXT: s_mul_i32 s8, s2, s11
-; GCN-NEXT: v_readfirstlane_b32 s9, v0
-; GCN-NEXT: s_add_i32 s8, s9, s8
-; GCN-NEXT: s_mul_i32 s10, s10, s13
-; GCN-NEXT: s_mul_i32 s2, s2, s13
-; GCN-NEXT: s_add_i32 s8, s8, s10
+; GCN-NEXT: s_addc_u32 s9, s9, s11
+; GCN-NEXT: s_mul_i32 s11, s2, s9
+; GCN-NEXT: s_mul_i32 s8, s8, s10
+; GCN-NEXT: v_readfirstlane_b32 s12, v0
+; GCN-NEXT: s_add_i32 s11, s12, s11
+; GCN-NEXT: s_mul_i32 s2, s2, s10
+; GCN-NEXT: s_add_i32 s8, s11, s8
; GCN-NEXT: v_mov_b32_e32 v2, s2
; GCN-NEXT: v_mov_b32_e32 v0, s8
-; GCN-NEXT: v_mul_hi_u32 v3, s11, v2
-; GCN-NEXT: v_mul_hi_u32 v2, s13, v2
-; GCN-NEXT: v_mul_hi_u32 v1, s11, v0
-; GCN-NEXT: v_mul_hi_u32 v0, s13, v0
-; GCN-NEXT: s_mul_i32 s10, s13, s8
+; GCN-NEXT: v_mul_hi_u32 v3, s9, v2
+; GCN-NEXT: v_mul_hi_u32 v2, s10, v2
+; GCN-NEXT: v_mul_hi_u32 v1, s9, v0
+; GCN-NEXT: v_mul_hi_u32 v0, s10, v0
+; GCN-NEXT: s_mul_i32 s12, s10, s8
; GCN-NEXT: v_readfirstlane_b32 s14, v2
-; GCN-NEXT: s_add_u32 s10, s14, s10
-; GCN-NEXT: v_readfirstlane_b32 s12, v0
-; GCN-NEXT: s_mul_i32 s2, s11, s2
-; GCN-NEXT: s_addc_u32 s12, 0, s12
-; GCN-NEXT: v_readfirstlane_b32 s9, v3
-; GCN-NEXT: s_add_u32 s2, s10, s2
-; GCN-NEXT: s_addc_u32 s2, s12, s9
-; GCN-NEXT: v_readfirstlane_b32 s9, v1
-; GCN-NEXT: s_addc_u32 s9, s9, 0
-; GCN-NEXT: s_mul_i32 s8, s11, s8
+; GCN-NEXT: s_add_u32 s12, s14, s12
+; GCN-NEXT: v_readfirstlane_b32 s13, v0
+; GCN-NEXT: s_mul_i32 s2, s9, s2
+; GCN-NEXT: s_addc_u32 s13, 0, s13
+; GCN-NEXT: v_readfirstlane_b32 s11, v3
+; GCN-NEXT: s_add_u32 s2, s12, s2
+; GCN-NEXT: s_addc_u32 s2, s13, s11
+; GCN-NEXT: v_readfirstlane_b32 s11, v1
+; GCN-NEXT: s_addc_u32 s11, s11, 0
+; GCN-NEXT: s_mul_i32 s8, s9, s8
; GCN-NEXT: s_add_u32 s2, s2, s8
-; GCN-NEXT: s_addc_u32 s10, 0, s9
-; GCN-NEXT: s_add_u32 s2, s13, s2
-; GCN-NEXT: s_cselect_b64 s[8:9], -1, 0
-; GCN-NEXT: s_or_b32 s8, s8, s9
-; GCN-NEXT: s_addc_u32 s8, s11, s10
+; GCN-NEXT: s_addc_u32 s8, 0, s11
+; GCN-NEXT: s_add_u32 s2, s10, s2
+; GCN-NEXT: s_addc_u32 s8, s9, s8
; GCN-NEXT: v_mul_hi_u32 v1, s2, 24
; GCN-NEXT: v_mul_hi_u32 v0, s8, 24
; GCN-NEXT: s_mul_i32 s8, s8, 24
-; GCN-NEXT: s_mov_b32 s2, -1
+; GCN-NEXT: s_mov_b32 s3, 0xf000
; GCN-NEXT: v_readfirstlane_b32 s10, v1
; GCN-NEXT: v_readfirstlane_b32 s9, v0
; GCN-NEXT: s_add_u32 s8, s10, s8
-; GCN-NEXT: s_addc_u32 s12, 0, s9
-; GCN-NEXT: v_mov_b32_e32 v0, s12
+; GCN-NEXT: s_addc_u32 s10, 0, s9
+; GCN-NEXT: v_mov_b32_e32 v0, s10
; GCN-NEXT: v_mul_hi_u32 v0, s6, v0
-; GCN-NEXT: s_mul_i32 s8, s7, s12
+; GCN-NEXT: s_mul_i32 s8, s7, s10
+; GCN-NEXT: s_mov_b32 s2, -1
; GCN-NEXT: v_readfirstlane_b32 s9, v0
-; GCN-NEXT: s_add_i32 s13, s9, s8
-; GCN-NEXT: s_sub_i32 s10, 0, s13
-; GCN-NEXT: s_mul_i32 s8, s6, s12
-; GCN-NEXT: s_sub_u32 s14, 24, s8
+; GCN-NEXT: s_add_i32 s11, s9, s8
+; GCN-NEXT: s_sub_i32 s12, 0, s11
+; GCN-NEXT: s_mul_i32 s8, s6, s10
+; GCN-NEXT: s_sub_u32 s13, 24, s8
; GCN-NEXT: s_cselect_b64 s[8:9], -1, 0
-; GCN-NEXT: s_or_b32 s11, s8, s9
-; GCN-NEXT: s_subb_u32 s15, s10, s7
-; GCN-NEXT: s_sub_u32 s16, s14, s6
-; GCN-NEXT: s_cselect_b64 s[10:11], -1, 0
-; GCN-NEXT: s_or_b32 s10, s10, s11
-; GCN-NEXT: s_subb_u32 s10, s15, 0
-; GCN-NEXT: s_cmp_ge_u32 s10, s7
-; GCN-NEXT: s_cselect_b32 s11, -1, 0
-; GCN-NEXT: s_cmp_ge_u32 s16, s6
+; GCN-NEXT: s_subb_u32 s12, s12, s7
+; GCN-NEXT: s_sub_u32 s14, s13, s6
+; GCN-NEXT: s_subb_u32 s12, s12, 0
+; GCN-NEXT: s_cmp_ge_u32 s12, s7
; GCN-NEXT: s_cselect_b32 s15, -1, 0
-; GCN-NEXT: s_cmp_eq_u32 s10, s7
-; GCN-NEXT: s_cselect_b32 s10, s15, s11
-; GCN-NEXT: s_add_u32 s11, s12, 1
+; GCN-NEXT: s_cmp_ge_u32 s14, s6
+; GCN-NEXT: s_cselect_b32 s14, -1, 0
+; GCN-NEXT: s_cmp_eq_u32 s12, s7
+; GCN-NEXT: s_cselect_b32 s12, s14, s15
+; GCN-NEXT: s_add_u32 s14, s10, 1
; GCN-NEXT: s_addc_u32 s15, 0, 0
-; GCN-NEXT: s_add_u32 s16, s12, 2
+; GCN-NEXT: s_add_u32 s16, s10, 2
; GCN-NEXT: s_addc_u32 s17, 0, 0
-; GCN-NEXT: s_cmp_lg_u32 s10, 0
-; GCN-NEXT: s_cselect_b32 s10, s16, s11
-; GCN-NEXT: s_cselect_b32 s11, s17, s15
+; GCN-NEXT: s_cmp_lg_u32 s12, 0
+; GCN-NEXT: s_cselect_b32 s12, s16, s14
+; GCN-NEXT: s_cselect_b32 s14, s17, s15
; GCN-NEXT: s_or_b32 s8, s8, s9
-; GCN-NEXT: s_subb_u32 s8, 0, s13
+; GCN-NEXT: s_subb_u32 s8, 0, s11
; GCN-NEXT: s_cmp_ge_u32 s8, s7
; GCN-NEXT: s_cselect_b32 s9, -1, 0
-; GCN-NEXT: s_cmp_ge_u32 s14, s6
+; GCN-NEXT: s_cmp_ge_u32 s13, s6
; GCN-NEXT: s_cselect_b32 s6, -1, 0
; GCN-NEXT: s_cmp_eq_u32 s8, s7
; GCN-NEXT: s_cselect_b32 s6, s6, s9
; GCN-NEXT: s_cmp_lg_u32 s6, 0
-; GCN-NEXT: s_cselect_b32 s7, s11, 0
-; GCN-NEXT: s_cselect_b32 s6, s10, s12
+; GCN-NEXT: s_cselect_b32 s7, s14, 0
+; GCN-NEXT: s_cselect_b32 s6, s12, s10
; GCN-NEXT: s_xor_b64 s[6:7], s[6:7], s[4:5]
; GCN-NEXT: s_sub_u32 s6, s6, s4
; GCN-NEXT: s_subb_u32 s7, s7, s4
@@ -1303,8 +1285,6 @@ define amdgpu_kernel void @s_test_sdiv_k_num_i64(ptr addrspace(1) %out, i64 %x)
; GCN-IR-NEXT: s_cbranch_vccz .LBB10_5
; GCN-IR-NEXT: ; %bb.1: ; %udiv-bb1
; GCN-IR-NEXT: s_add_u32 s12, s10, 1
-; GCN-IR-NEXT: s_cselect_b64 s[8:9], -1, 0
-; GCN-IR-NEXT: s_or_b32 s8, s8, s9
; GCN-IR-NEXT: s_addc_u32 s8, s11, 0
; GCN-IR-NEXT: s_cselect_b64 s[8:9], -1, 0
; GCN-IR-NEXT: s_sub_i32 s10, 63, s10
@@ -1335,8 +1315,6 @@ define amdgpu_kernel void @s_test_sdiv_k_num_i64(ptr addrspace(1) %out, i64 %x)
; GCN-IR-NEXT: s_sub_u32 s12, s12, s18
; GCN-IR-NEXT: s_subb_u32 s13, s13, s19
; GCN-IR-NEXT: s_add_u32 s16, s16, 1
-; GCN-IR-NEXT: s_cselect_b64 s[18:19], -1, 0
-; GCN-IR-NEXT: s_or_b32 s18, s18, s19
; GCN-IR-NEXT: s_addc_u32 s17, s17, 0
; GCN-IR-NEXT: s_cselect_b64 s[18:19], -1, 0
; GCN-IR-NEXT: s_mov_b64 s[10:11], s[6:7]
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/shlN_add.ll b/llvm/test/CodeGen/AMDGPU/shlN_add.ll
index 9f4a6f2..3e507a0 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/shlN_add.ll
+++ b/llvm/test/CodeGen/AMDGPU/shlN_add.ll
@@ -1,4 +1,9 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefixes=GCN,GFX9-SDAG %s
+; RUN: llc -mtriple=amdgcn -mcpu=fiji < %s | FileCheck -check-prefixes=GCN,GFX8-SDAG %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 < %s | FileCheck -check-prefixes=GCN,GFX10-SDAG %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 < %s | FileCheck -check-prefixes=GCN,GFX10-SDAG %s
+
; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefixes=GCN,GFX9 %s
; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn -mcpu=fiji < %s | FileCheck -check-prefixes=GCN,GFX8 %s
; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn -mcpu=gfx1010 < %s | FileCheck -check-prefixes=GCN,GFX10 %s
@@ -7,6 +12,24 @@
; Test gfx9+ s_shl[1-4]_add_u32 pattern matching
define amdgpu_ps i32 @s_shl1_add_u32(i32 inreg %src0, i32 inreg %src1) {
+; GFX9-SDAG-LABEL: s_shl1_add_u32:
+; GFX9-SDAG: ; %bb.0:
+; GFX9-SDAG-NEXT: s_lshl_b32 s0, s0, 1
+; GFX9-SDAG-NEXT: s_add_i32 s0, s0, s1
+; GFX9-SDAG-NEXT: ; return to shader part epilog
+;
+; GFX8-SDAG-LABEL: s_shl1_add_u32:
+; GFX8-SDAG: ; %bb.0:
+; GFX8-SDAG-NEXT: s_lshl_b32 s0, s0, 1
+; GFX8-SDAG-NEXT: s_add_i32 s0, s0, s1
+; GFX8-SDAG-NEXT: ; return to shader part epilog
+;
+; GFX10-SDAG-LABEL: s_shl1_add_u32:
+; GFX10-SDAG: ; %bb.0:
+; GFX10-SDAG-NEXT: s_lshl_b32 s0, s0, 1
+; GFX10-SDAG-NEXT: s_add_i32 s0, s0, s1
+; GFX10-SDAG-NEXT: ; return to shader part epilog
+;
; GFX9-LABEL: s_shl1_add_u32:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_lshl1_add_u32 s0, s0, s1
@@ -28,6 +51,24 @@ define amdgpu_ps i32 @s_shl1_add_u32(i32 inreg %src0, i32 inreg %src1) {
}
define amdgpu_ps i32 @s_shl2_add_u32(i32 inreg %src0, i32 inreg %src1) {
+; GFX9-SDAG-LABEL: s_shl2_add_u32:
+; GFX9-SDAG: ; %bb.0:
+; GFX9-SDAG-NEXT: s_lshl_b32 s0, s0, 2
+; GFX9-SDAG-NEXT: s_add_i32 s0, s0, s1
+; GFX9-SDAG-NEXT: ; return to shader part epilog
+;
+; GFX8-SDAG-LABEL: s_shl2_add_u32:
+; GFX8-SDAG: ; %bb.0:
+; GFX8-SDAG-NEXT: s_lshl_b32 s0, s0, 2
+; GFX8-SDAG-NEXT: s_add_i32 s0, s0, s1
+; GFX8-SDAG-NEXT: ; return to shader part epilog
+;
+; GFX10-SDAG-LABEL: s_shl2_add_u32:
+; GFX10-SDAG: ; %bb.0:
+; GFX10-SDAG-NEXT: s_lshl_b32 s0, s0, 2
+; GFX10-SDAG-NEXT: s_add_i32 s0, s0, s1
+; GFX10-SDAG-NEXT: ; return to shader part epilog
+;
; GFX9-LABEL: s_shl2_add_u32:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_lshl2_add_u32 s0, s0, s1
@@ -49,6 +90,24 @@ define amdgpu_ps i32 @s_shl2_add_u32(i32 inreg %src0, i32 inreg %src1) {
}
define amdgpu_ps i32 @s_shl3_add_u32(i32 inreg %src0, i32 inreg %src1) {
+; GFX9-SDAG-LABEL: s_shl3_add_u32:
+; GFX9-SDAG: ; %bb.0:
+; GFX9-SDAG-NEXT: s_lshl_b32 s0, s0, 3
+; GFX9-SDAG-NEXT: s_add_i32 s0, s0, s1
+; GFX9-SDAG-NEXT: ; return to shader part epilog
+;
+; GFX8-SDAG-LABEL: s_shl3_add_u32:
+; GFX8-SDAG: ; %bb.0:
+; GFX8-SDAG-NEXT: s_lshl_b32 s0, s0, 3
+; GFX8-SDAG-NEXT: s_add_i32 s0, s0, s1
+; GFX8-SDAG-NEXT: ; return to shader part epilog
+;
+; GFX10-SDAG-LABEL: s_shl3_add_u32:
+; GFX10-SDAG: ; %bb.0:
+; GFX10-SDAG-NEXT: s_lshl_b32 s0, s0, 3
+; GFX10-SDAG-NEXT: s_add_i32 s0, s0, s1
+; GFX10-SDAG-NEXT: ; return to shader part epilog
+;
; GFX9-LABEL: s_shl3_add_u32:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_lshl3_add_u32 s0, s0, s1
@@ -70,6 +129,24 @@ define amdgpu_ps i32 @s_shl3_add_u32(i32 inreg %src0, i32 inreg %src1) {
}
define amdgpu_ps i32 @s_shl4_add_u32(i32 inreg %src0, i32 inreg %src1) {
+; GFX9-SDAG-LABEL: s_shl4_add_u32:
+; GFX9-SDAG: ; %bb.0:
+; GFX9-SDAG-NEXT: s_lshl_b32 s0, s0, 4
+; GFX9-SDAG-NEXT: s_add_i32 s0, s0, s1
+; GFX9-SDAG-NEXT: ; return to shader part epilog
+;
+; GFX8-SDAG-LABEL: s_shl4_add_u32:
+; GFX8-SDAG: ; %bb.0:
+; GFX8-SDAG-NEXT: s_lshl_b32 s0, s0, 4
+; GFX8-SDAG-NEXT: s_add_i32 s0, s0, s1
+; GFX8-SDAG-NEXT: ; return to shader part epilog
+;
+; GFX10-SDAG-LABEL: s_shl4_add_u32:
+; GFX10-SDAG: ; %bb.0:
+; GFX10-SDAG-NEXT: s_lshl_b32 s0, s0, 4
+; GFX10-SDAG-NEXT: s_add_i32 s0, s0, s1
+; GFX10-SDAG-NEXT: ; return to shader part epilog
+;
; GFX9-LABEL: s_shl4_add_u32:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_lshl4_add_u32 s0, s0, s1
@@ -102,6 +179,25 @@ define amdgpu_ps i32 @s_shl5_add_u32(i32 inreg %src0, i32 inreg %src1) {
}
define i32 @v_shl1_add_u32(i32 %src0, i32 %src1) {
+; GFX9-SDAG-LABEL: v_shl1_add_u32:
+; GFX9-SDAG: ; %bb.0:
+; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-SDAG-NEXT: v_lshl_add_u32 v0, v0, 1, v1
+; GFX9-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8-SDAG-LABEL: v_shl1_add_u32:
+; GFX8-SDAG: ; %bb.0:
+; GFX8-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-SDAG-NEXT: v_lshlrev_b32_e32 v0, 1, v0
+; GFX8-SDAG-NEXT: v_add_u32_e32 v0, vcc, v0, v1
+; GFX8-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-SDAG-LABEL: v_shl1_add_u32:
+; GFX10-SDAG: ; %bb.0:
+; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-SDAG-NEXT: v_lshl_add_u32 v0, v0, 1, v1
+; GFX10-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
; GFX9-LABEL: v_shl1_add_u32:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -126,6 +222,25 @@ define i32 @v_shl1_add_u32(i32 %src0, i32 %src1) {
}
define i32 @v_shl2_add_u32(i32 %src0, i32 %src1) {
+; GFX9-SDAG-LABEL: v_shl2_add_u32:
+; GFX9-SDAG: ; %bb.0:
+; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-SDAG-NEXT: v_lshl_add_u32 v0, v0, 2, v1
+; GFX9-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8-SDAG-LABEL: v_shl2_add_u32:
+; GFX8-SDAG: ; %bb.0:
+; GFX8-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX8-SDAG-NEXT: v_add_u32_e32 v0, vcc, v0, v1
+; GFX8-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-SDAG-LABEL: v_shl2_add_u32:
+; GFX10-SDAG: ; %bb.0:
+; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-SDAG-NEXT: v_lshl_add_u32 v0, v0, 2, v1
+; GFX10-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
; GFX9-LABEL: v_shl2_add_u32:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -150,6 +265,25 @@ define i32 @v_shl2_add_u32(i32 %src0, i32 %src1) {
}
define i32 @v_shl3_add_u32(i32 %src0, i32 %src1) {
+; GFX9-SDAG-LABEL: v_shl3_add_u32:
+; GFX9-SDAG: ; %bb.0:
+; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-SDAG-NEXT: v_lshl_add_u32 v0, v0, 3, v1
+; GFX9-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8-SDAG-LABEL: v_shl3_add_u32:
+; GFX8-SDAG: ; %bb.0:
+; GFX8-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-SDAG-NEXT: v_lshlrev_b32_e32 v0, 3, v0
+; GFX8-SDAG-NEXT: v_add_u32_e32 v0, vcc, v0, v1
+; GFX8-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-SDAG-LABEL: v_shl3_add_u32:
+; GFX10-SDAG: ; %bb.0:
+; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-SDAG-NEXT: v_lshl_add_u32 v0, v0, 3, v1
+; GFX10-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
; GFX9-LABEL: v_shl3_add_u32:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -174,6 +308,25 @@ define i32 @v_shl3_add_u32(i32 %src0, i32 %src1) {
}
define i32 @v_shl4_add_u32(i32 %src0, i32 %src1) {
+; GFX9-SDAG-LABEL: v_shl4_add_u32:
+; GFX9-SDAG: ; %bb.0:
+; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-SDAG-NEXT: v_lshl_add_u32 v0, v0, 4, v1
+; GFX9-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8-SDAG-LABEL: v_shl4_add_u32:
+; GFX8-SDAG: ; %bb.0:
+; GFX8-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-SDAG-NEXT: v_lshlrev_b32_e32 v0, 4, v0
+; GFX8-SDAG-NEXT: v_add_u32_e32 v0, vcc, v0, v1
+; GFX8-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-SDAG-LABEL: v_shl4_add_u32:
+; GFX10-SDAG: ; %bb.0:
+; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-SDAG-NEXT: v_lshl_add_u32 v0, v0, 4, v1
+; GFX10-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
; GFX9-LABEL: v_shl4_add_u32:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -198,6 +351,25 @@ define i32 @v_shl4_add_u32(i32 %src0, i32 %src1) {
}
define i32 @v_shl5_add_u32(i32 %src0, i32 %src1) {
+; GFX9-SDAG-LABEL: v_shl5_add_u32:
+; GFX9-SDAG: ; %bb.0:
+; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-SDAG-NEXT: v_lshl_add_u32 v0, v0, 5, v1
+; GFX9-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8-SDAG-LABEL: v_shl5_add_u32:
+; GFX8-SDAG: ; %bb.0:
+; GFX8-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-SDAG-NEXT: v_lshlrev_b32_e32 v0, 5, v0
+; GFX8-SDAG-NEXT: v_add_u32_e32 v0, vcc, v0, v1
+; GFX8-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-SDAG-LABEL: v_shl5_add_u32:
+; GFX10-SDAG: ; %bb.0:
+; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-SDAG-NEXT: v_lshl_add_u32 v0, v0, 5, v1
+; GFX10-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
; GFX9-LABEL: v_shl5_add_u32:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -224,6 +396,22 @@ define i32 @v_shl5_add_u32(i32 %src0, i32 %src1) {
; FIXME: Use v_lshl_add_u32
; shift is scalar, but add is vector.
define amdgpu_ps float @shl1_add_u32_vgpr1(i32 inreg %src0, i32 %src1) {
+; GFX9-SDAG-LABEL: shl1_add_u32_vgpr1:
+; GFX9-SDAG: ; %bb.0:
+; GFX9-SDAG-NEXT: v_lshl_add_u32 v0, s0, 1, v0
+; GFX9-SDAG-NEXT: ; return to shader part epilog
+;
+; GFX8-SDAG-LABEL: shl1_add_u32_vgpr1:
+; GFX8-SDAG: ; %bb.0:
+; GFX8-SDAG-NEXT: s_lshl_b32 s0, s0, 1
+; GFX8-SDAG-NEXT: v_add_u32_e32 v0, vcc, s0, v0
+; GFX8-SDAG-NEXT: ; return to shader part epilog
+;
+; GFX10-SDAG-LABEL: shl1_add_u32_vgpr1:
+; GFX10-SDAG: ; %bb.0:
+; GFX10-SDAG-NEXT: v_lshl_add_u32 v0, s0, 1, v0
+; GFX10-SDAG-NEXT: ; return to shader part epilog
+;
; GFX9-LABEL: shl1_add_u32_vgpr1:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_lshl_b32 s0, s0, 1
@@ -248,6 +436,22 @@ define amdgpu_ps float @shl1_add_u32_vgpr1(i32 inreg %src0, i32 %src1) {
}
define amdgpu_ps float @shl2_add_u32_vgpr1(i32 inreg %src0, i32 %src1) {
+; GFX9-SDAG-LABEL: shl2_add_u32_vgpr1:
+; GFX9-SDAG: ; %bb.0:
+; GFX9-SDAG-NEXT: v_lshl_add_u32 v0, s0, 2, v0
+; GFX9-SDAG-NEXT: ; return to shader part epilog
+;
+; GFX8-SDAG-LABEL: shl2_add_u32_vgpr1:
+; GFX8-SDAG: ; %bb.0:
+; GFX8-SDAG-NEXT: s_lshl_b32 s0, s0, 2
+; GFX8-SDAG-NEXT: v_add_u32_e32 v0, vcc, s0, v0
+; GFX8-SDAG-NEXT: ; return to shader part epilog
+;
+; GFX10-SDAG-LABEL: shl2_add_u32_vgpr1:
+; GFX10-SDAG: ; %bb.0:
+; GFX10-SDAG-NEXT: v_lshl_add_u32 v0, s0, 2, v0
+; GFX10-SDAG-NEXT: ; return to shader part epilog
+;
; GFX9-LABEL: shl2_add_u32_vgpr1:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_lshl_b32 s0, s0, 2
@@ -272,6 +476,22 @@ define amdgpu_ps float @shl2_add_u32_vgpr1(i32 inreg %src0, i32 %src1) {
}
define amdgpu_ps float @shl3_add_u32_vgpr1(i32 inreg %src0, i32 %src1) {
+; GFX9-SDAG-LABEL: shl3_add_u32_vgpr1:
+; GFX9-SDAG: ; %bb.0:
+; GFX9-SDAG-NEXT: v_lshl_add_u32 v0, s0, 3, v0
+; GFX9-SDAG-NEXT: ; return to shader part epilog
+;
+; GFX8-SDAG-LABEL: shl3_add_u32_vgpr1:
+; GFX8-SDAG: ; %bb.0:
+; GFX8-SDAG-NEXT: s_lshl_b32 s0, s0, 3
+; GFX8-SDAG-NEXT: v_add_u32_e32 v0, vcc, s0, v0
+; GFX8-SDAG-NEXT: ; return to shader part epilog
+;
+; GFX10-SDAG-LABEL: shl3_add_u32_vgpr1:
+; GFX10-SDAG: ; %bb.0:
+; GFX10-SDAG-NEXT: v_lshl_add_u32 v0, s0, 3, v0
+; GFX10-SDAG-NEXT: ; return to shader part epilog
+;
; GFX9-LABEL: shl3_add_u32_vgpr1:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_lshl_b32 s0, s0, 3
@@ -296,6 +516,22 @@ define amdgpu_ps float @shl3_add_u32_vgpr1(i32 inreg %src0, i32 %src1) {
}
define amdgpu_ps float @shl4_add_u32_vgpr1(i32 inreg %src0, i32 %src1) {
+; GFX9-SDAG-LABEL: shl4_add_u32_vgpr1:
+; GFX9-SDAG: ; %bb.0:
+; GFX9-SDAG-NEXT: v_lshl_add_u32 v0, s0, 4, v0
+; GFX9-SDAG-NEXT: ; return to shader part epilog
+;
+; GFX8-SDAG-LABEL: shl4_add_u32_vgpr1:
+; GFX8-SDAG: ; %bb.0:
+; GFX8-SDAG-NEXT: s_lshl_b32 s0, s0, 4
+; GFX8-SDAG-NEXT: v_add_u32_e32 v0, vcc, s0, v0
+; GFX8-SDAG-NEXT: ; return to shader part epilog
+;
+; GFX10-SDAG-LABEL: shl4_add_u32_vgpr1:
+; GFX10-SDAG: ; %bb.0:
+; GFX10-SDAG-NEXT: v_lshl_add_u32 v0, s0, 4, v0
+; GFX10-SDAG-NEXT: ; return to shader part epilog
+;
; GFX9-LABEL: shl4_add_u32_vgpr1:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_lshl_b32 s0, s0, 4
@@ -320,6 +556,22 @@ define amdgpu_ps float @shl4_add_u32_vgpr1(i32 inreg %src0, i32 %src1) {
}
define amdgpu_ps float @shl5_add_u32_vgpr1(i32 inreg %src0, i32 %src1) {
+; GFX9-SDAG-LABEL: shl5_add_u32_vgpr1:
+; GFX9-SDAG: ; %bb.0:
+; GFX9-SDAG-NEXT: v_lshl_add_u32 v0, s0, 5, v0
+; GFX9-SDAG-NEXT: ; return to shader part epilog
+;
+; GFX8-SDAG-LABEL: shl5_add_u32_vgpr1:
+; GFX8-SDAG: ; %bb.0:
+; GFX8-SDAG-NEXT: s_lshl_b32 s0, s0, 5
+; GFX8-SDAG-NEXT: v_add_u32_e32 v0, vcc, s0, v0
+; GFX8-SDAG-NEXT: ; return to shader part epilog
+;
+; GFX10-SDAG-LABEL: shl5_add_u32_vgpr1:
+; GFX10-SDAG: ; %bb.0:
+; GFX10-SDAG-NEXT: v_lshl_add_u32 v0, s0, 5, v0
+; GFX10-SDAG-NEXT: ; return to shader part epilog
+;
; GFX9-LABEL: shl5_add_u32_vgpr1:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_lshl_b32 s0, s0, 5
@@ -344,6 +596,30 @@ define amdgpu_ps float @shl5_add_u32_vgpr1(i32 inreg %src0, i32 %src1) {
}
define amdgpu_ps <2 x i32> @s_shl1_add_u32_v2(<2 x i32> inreg %src0, <2 x i32> inreg %src1) {
+; GFX9-SDAG-LABEL: s_shl1_add_u32_v2:
+; GFX9-SDAG: ; %bb.0:
+; GFX9-SDAG-NEXT: s_lshl_b32 s0, s0, 1
+; GFX9-SDAG-NEXT: s_lshl_b32 s1, s1, 1
+; GFX9-SDAG-NEXT: s_add_i32 s1, s1, s3
+; GFX9-SDAG-NEXT: s_add_i32 s0, s0, s2
+; GFX9-SDAG-NEXT: ; return to shader part epilog
+;
+; GFX8-SDAG-LABEL: s_shl1_add_u32_v2:
+; GFX8-SDAG: ; %bb.0:
+; GFX8-SDAG-NEXT: s_lshl_b32 s0, s0, 1
+; GFX8-SDAG-NEXT: s_lshl_b32 s1, s1, 1
+; GFX8-SDAG-NEXT: s_add_i32 s1, s1, s3
+; GFX8-SDAG-NEXT: s_add_i32 s0, s0, s2
+; GFX8-SDAG-NEXT: ; return to shader part epilog
+;
+; GFX10-SDAG-LABEL: s_shl1_add_u32_v2:
+; GFX10-SDAG: ; %bb.0:
+; GFX10-SDAG-NEXT: s_lshl_b32 s0, s0, 1
+; GFX10-SDAG-NEXT: s_lshl_b32 s1, s1, 1
+; GFX10-SDAG-NEXT: s_add_i32 s0, s0, s2
+; GFX10-SDAG-NEXT: s_add_i32 s1, s1, s3
+; GFX10-SDAG-NEXT: ; return to shader part epilog
+;
; GFX9-LABEL: s_shl1_add_u32_v2:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_lshl1_add_u32 s0, s0, s2
@@ -369,6 +645,30 @@ define amdgpu_ps <2 x i32> @s_shl1_add_u32_v2(<2 x i32> inreg %src0, <2 x i32> i
}
define amdgpu_ps <2 x i32> @s_shl2_add_u32_v2(<2 x i32> inreg %src0, <2 x i32> inreg %src1) {
+; GFX9-SDAG-LABEL: s_shl2_add_u32_v2:
+; GFX9-SDAG: ; %bb.0:
+; GFX9-SDAG-NEXT: s_lshl_b32 s0, s0, 2
+; GFX9-SDAG-NEXT: s_lshl_b32 s1, s1, 2
+; GFX9-SDAG-NEXT: s_add_i32 s1, s1, s3
+; GFX9-SDAG-NEXT: s_add_i32 s0, s0, s2
+; GFX9-SDAG-NEXT: ; return to shader part epilog
+;
+; GFX8-SDAG-LABEL: s_shl2_add_u32_v2:
+; GFX8-SDAG: ; %bb.0:
+; GFX8-SDAG-NEXT: s_lshl_b32 s0, s0, 2
+; GFX8-SDAG-NEXT: s_lshl_b32 s1, s1, 2
+; GFX8-SDAG-NEXT: s_add_i32 s1, s1, s3
+; GFX8-SDAG-NEXT: s_add_i32 s0, s0, s2
+; GFX8-SDAG-NEXT: ; return to shader part epilog
+;
+; GFX10-SDAG-LABEL: s_shl2_add_u32_v2:
+; GFX10-SDAG: ; %bb.0:
+; GFX10-SDAG-NEXT: s_lshl_b32 s0, s0, 2
+; GFX10-SDAG-NEXT: s_lshl_b32 s1, s1, 2
+; GFX10-SDAG-NEXT: s_add_i32 s0, s0, s2
+; GFX10-SDAG-NEXT: s_add_i32 s1, s1, s3
+; GFX10-SDAG-NEXT: ; return to shader part epilog
+;
; GFX9-LABEL: s_shl2_add_u32_v2:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_lshl2_add_u32 s0, s0, s2
@@ -394,6 +694,30 @@ define amdgpu_ps <2 x i32> @s_shl2_add_u32_v2(<2 x i32> inreg %src0, <2 x i32> i
}
define amdgpu_ps <2 x i32> @s_shl3_add_u32_v2(<2 x i32> inreg %src0, <2 x i32> inreg %src1) {
+; GFX9-SDAG-LABEL: s_shl3_add_u32_v2:
+; GFX9-SDAG: ; %bb.0:
+; GFX9-SDAG-NEXT: s_lshl_b32 s0, s0, 3
+; GFX9-SDAG-NEXT: s_lshl_b32 s1, s1, 3
+; GFX9-SDAG-NEXT: s_add_i32 s1, s1, s3
+; GFX9-SDAG-NEXT: s_add_i32 s0, s0, s2
+; GFX9-SDAG-NEXT: ; return to shader part epilog
+;
+; GFX8-SDAG-LABEL: s_shl3_add_u32_v2:
+; GFX8-SDAG: ; %bb.0:
+; GFX8-SDAG-NEXT: s_lshl_b32 s0, s0, 3
+; GFX8-SDAG-NEXT: s_lshl_b32 s1, s1, 3
+; GFX8-SDAG-NEXT: s_add_i32 s1, s1, s3
+; GFX8-SDAG-NEXT: s_add_i32 s0, s0, s2
+; GFX8-SDAG-NEXT: ; return to shader part epilog
+;
+; GFX10-SDAG-LABEL: s_shl3_add_u32_v2:
+; GFX10-SDAG: ; %bb.0:
+; GFX10-SDAG-NEXT: s_lshl_b32 s0, s0, 3
+; GFX10-SDAG-NEXT: s_lshl_b32 s1, s1, 3
+; GFX10-SDAG-NEXT: s_add_i32 s0, s0, s2
+; GFX10-SDAG-NEXT: s_add_i32 s1, s1, s3
+; GFX10-SDAG-NEXT: ; return to shader part epilog
+;
; GFX9-LABEL: s_shl3_add_u32_v2:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_lshl3_add_u32 s0, s0, s2
@@ -419,6 +743,30 @@ define amdgpu_ps <2 x i32> @s_shl3_add_u32_v2(<2 x i32> inreg %src0, <2 x i32> i
}
define amdgpu_ps <2 x i32> @s_shl4_add_u32_v2(<2 x i32> inreg %src0, <2 x i32> inreg %src1) {
+; GFX9-SDAG-LABEL: s_shl4_add_u32_v2:
+; GFX9-SDAG: ; %bb.0:
+; GFX9-SDAG-NEXT: s_lshl_b32 s0, s0, 4
+; GFX9-SDAG-NEXT: s_lshl_b32 s1, s1, 4
+; GFX9-SDAG-NEXT: s_add_i32 s1, s1, s3
+; GFX9-SDAG-NEXT: s_add_i32 s0, s0, s2
+; GFX9-SDAG-NEXT: ; return to shader part epilog
+;
+; GFX8-SDAG-LABEL: s_shl4_add_u32_v2:
+; GFX8-SDAG: ; %bb.0:
+; GFX8-SDAG-NEXT: s_lshl_b32 s0, s0, 4
+; GFX8-SDAG-NEXT: s_lshl_b32 s1, s1, 4
+; GFX8-SDAG-NEXT: s_add_i32 s1, s1, s3
+; GFX8-SDAG-NEXT: s_add_i32 s0, s0, s2
+; GFX8-SDAG-NEXT: ; return to shader part epilog
+;
+; GFX10-SDAG-LABEL: s_shl4_add_u32_v2:
+; GFX10-SDAG: ; %bb.0:
+; GFX10-SDAG-NEXT: s_lshl_b32 s0, s0, 4
+; GFX10-SDAG-NEXT: s_lshl_b32 s1, s1, 4
+; GFX10-SDAG-NEXT: s_add_i32 s0, s0, s2
+; GFX10-SDAG-NEXT: s_add_i32 s1, s1, s3
+; GFX10-SDAG-NEXT: ; return to shader part epilog
+;
; GFX9-LABEL: s_shl4_add_u32_v2:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_lshl4_add_u32 s0, s0, s2
@@ -444,6 +792,30 @@ define amdgpu_ps <2 x i32> @s_shl4_add_u32_v2(<2 x i32> inreg %src0, <2 x i32> i
}
define amdgpu_ps <2 x i32> @s_shl_2_4_add_u32_v2(<2 x i32> inreg %src0, <2 x i32> inreg %src1) {
+; GFX9-SDAG-LABEL: s_shl_2_4_add_u32_v2:
+; GFX9-SDAG: ; %bb.0:
+; GFX9-SDAG-NEXT: s_lshl_b32 s0, s0, 2
+; GFX9-SDAG-NEXT: s_lshl_b32 s1, s1, 4
+; GFX9-SDAG-NEXT: s_add_i32 s1, s1, s3
+; GFX9-SDAG-NEXT: s_add_i32 s0, s0, s2
+; GFX9-SDAG-NEXT: ; return to shader part epilog
+;
+; GFX8-SDAG-LABEL: s_shl_2_4_add_u32_v2:
+; GFX8-SDAG: ; %bb.0:
+; GFX8-SDAG-NEXT: s_lshl_b32 s0, s0, 2
+; GFX8-SDAG-NEXT: s_lshl_b32 s1, s1, 4
+; GFX8-SDAG-NEXT: s_add_i32 s1, s1, s3
+; GFX8-SDAG-NEXT: s_add_i32 s0, s0, s2
+; GFX8-SDAG-NEXT: ; return to shader part epilog
+;
+; GFX10-SDAG-LABEL: s_shl_2_4_add_u32_v2:
+; GFX10-SDAG: ; %bb.0:
+; GFX10-SDAG-NEXT: s_lshl_b32 s0, s0, 2
+; GFX10-SDAG-NEXT: s_lshl_b32 s1, s1, 4
+; GFX10-SDAG-NEXT: s_add_i32 s0, s0, s2
+; GFX10-SDAG-NEXT: s_add_i32 s1, s1, s3
+; GFX10-SDAG-NEXT: ; return to shader part epilog
+;
; GFX9-LABEL: s_shl_2_4_add_u32_v2:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_lshl2_add_u32 s0, s0, s2
diff --git a/llvm/test/CodeGen/AMDGPU/spill-vgpr-to-agpr-update-regscavenger.ll b/llvm/test/CodeGen/AMDGPU/spill-vgpr-to-agpr-update-regscavenger.ll
index ef96944..586579f 100644
--- a/llvm/test/CodeGen/AMDGPU/spill-vgpr-to-agpr-update-regscavenger.ll
+++ b/llvm/test/CodeGen/AMDGPU/spill-vgpr-to-agpr-update-regscavenger.ll
@@ -20,33 +20,38 @@ define void @test() {
; CHECK-NEXT: ; in Loop: Header=BB0_1 Depth=1
; CHECK-NEXT: .LBB0_3: ; %bb.3
; CHECK-NEXT: ; in Loop: Header=BB0_1 Depth=1
+; CHECK-NEXT: ; implicit-def: $sgpr4
+; CHECK-NEXT: v_mov_b32_e32 v0, s4
+; CHECK-NEXT: v_readfirstlane_b32 s6, v0
; CHECK-NEXT: s_mov_b64 s[4:5], -1
+; CHECK-NEXT: s_mov_b32 s7, 0
+; CHECK-NEXT: s_cmp_eq_u32 s6, s7
; CHECK-NEXT: ; implicit-def: $vgpr1 : SGPR spill to VGPR lane
; CHECK-NEXT: v_writelane_b32 v1, s4, 0
; CHECK-NEXT: v_writelane_b32 v1, s5, 1
-; CHECK-NEXT: s_or_saveexec_b64 s[8:9], -1
-; CHECK-NEXT: s_nop 0
+; CHECK-NEXT: s_mov_b64 s[10:11], exec
+; CHECK-NEXT: s_mov_b64 exec, -1
; CHECK-NEXT: v_accvgpr_write_b32 a0, v1 ; Reload Reuse
-; CHECK-NEXT: s_mov_b64 exec, s[8:9]
+; CHECK-NEXT: s_mov_b64 exec, s[10:11]
; CHECK-NEXT: s_cbranch_scc1 .LBB0_5
; CHECK-NEXT: ; %bb.4: ; %bb.4
; CHECK-NEXT: ; in Loop: Header=BB0_1 Depth=1
-; CHECK-NEXT: s_or_saveexec_b64 s[8:9], -1
+; CHECK-NEXT: s_or_saveexec_b64 s[10:11], -1
; CHECK-NEXT: v_accvgpr_read_b32 v1, a0 ; Reload Reuse
-; CHECK-NEXT: s_mov_b64 exec, s[8:9]
+; CHECK-NEXT: s_mov_b64 exec, s[10:11]
; CHECK-NEXT: s_mov_b64 s[4:5], 0
; CHECK-NEXT: v_writelane_b32 v1, s4, 0
; CHECK-NEXT: v_writelane_b32 v1, s5, 1
-; CHECK-NEXT: s_or_saveexec_b64 s[8:9], -1
+; CHECK-NEXT: s_or_saveexec_b64 s[10:11], -1
; CHECK-NEXT: s_nop 0
; CHECK-NEXT: v_accvgpr_write_b32 a0, v1 ; Reload Reuse
-; CHECK-NEXT: s_mov_b64 exec, s[8:9]
+; CHECK-NEXT: s_mov_b64 exec, s[10:11]
; CHECK-NEXT: .LBB0_5: ; %Flow
; CHECK-NEXT: ; in Loop: Header=BB0_1 Depth=1
-; CHECK-NEXT: s_or_saveexec_b64 s[8:9], -1
+; CHECK-NEXT: s_or_saveexec_b64 s[10:11], -1
; CHECK-NEXT: s_nop 0
; CHECK-NEXT: v_accvgpr_read_b32 v1, a0 ; Reload Reuse
-; CHECK-NEXT: s_mov_b64 exec, s[8:9]
+; CHECK-NEXT: s_mov_b64 exec, s[10:11]
; CHECK-NEXT: v_readlane_b32 s4, v1, 0
; CHECK-NEXT: v_readlane_b32 s5, v1, 1
; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5]
diff --git a/llvm/test/CodeGen/AMDGPU/splitkit-getsubrangeformask.ll b/llvm/test/CodeGen/AMDGPU/splitkit-getsubrangeformask.ll
index 364598f..5aafb0f 100644
--- a/llvm/test/CodeGen/AMDGPU/splitkit-getsubrangeformask.ll
+++ b/llvm/test/CodeGen/AMDGPU/splitkit-getsubrangeformask.ll
@@ -31,8 +31,8 @@ define amdgpu_gs void @_amdgpu_gs_main(i32 inreg %primShaderTableAddrLow, <31 x
; CHECK-NEXT: [[COPY13:%[0-9]+]]:sgpr_32 = COPY $sgpr10
; CHECK-NEXT: [[COPY14:%[0-9]+]]:sgpr_32 = COPY $sgpr8
; CHECK-NEXT: undef [[S_LOAD_DWORDX2_IMM:%[0-9]+]].sub0_sub1:sgpr_128 = S_LOAD_DWORDX2_IMM [[COPY]], 232, 0 :: (invariant load (s64) from %ir.39, addrspace 4)
- ; CHECK-NEXT: [[S_BUFFER_LOAD_DWORD_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM undef %117:sgpr_128, 0, 0 :: (dereferenceable invariant load (s32))
- ; CHECK-NEXT: KILL undef %117:sgpr_128
+ ; CHECK-NEXT: [[S_BUFFER_LOAD_DWORD_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM undef %125:sgpr_128, 0, 0 :: (dereferenceable invariant load (s32))
+ ; CHECK-NEXT: KILL undef %125:sgpr_128
; CHECK-NEXT: [[S_LSHL_B32_:%[0-9]+]]:sreg_32 = S_LSHL_B32 [[COPY5]], 4, implicit-def dead $scc
; CHECK-NEXT: [[S_LSHL_B32_1:%[0-9]+]]:sreg_32 = S_LSHL_B32 [[COPY4]], 4, implicit-def dead $scc
; CHECK-NEXT: [[S_LSHL_B32_2:%[0-9]+]]:sreg_32 = S_LSHL_B32 [[COPY3]], 4, implicit-def dead $scc
@@ -44,85 +44,87 @@ define amdgpu_gs void @_amdgpu_gs_main(i32 inreg %primShaderTableAddrLow, <31 x
; CHECK-NEXT: [[S_SUB_I32_1:%[0-9]+]]:sreg_32 = S_SUB_I32 [[S_BUFFER_LOAD_DWORD_IMM]], 30, implicit-def dead $scc
; CHECK-NEXT: undef [[S_ADD_U32_:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY6]], [[S_LSHL_B32_2]], implicit-def $scc
; CHECK-NEXT: [[S_ADD_U32_:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %54:sreg_32, [[S_ASHR_I32_2]], implicit-def dead $scc, implicit $scc
- ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_]], 16, 0 :: (invariant load (s128) from %ir.71, addrspace 4)
+ ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_]], 16, 0 :: (invariant load (s128) from %ir.81, addrspace 4)
; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM1:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM undef %74:sreg_64, 0, 0 :: (invariant load (s128) from `ptr addrspace(4) poison`, addrspace 4)
- ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM2:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_]], 64, 0 :: (invariant load (s128) from %ir.88, addrspace 4)
; CHECK-NEXT: KILL undef %74:sreg_64
; CHECK-NEXT: KILL [[S_ADD_U32_]].sub0, [[S_ADD_U32_]].sub1
; CHECK-NEXT: [[S_BUFFER_LOAD_DWORD_IMM1:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM [[S_LOAD_DWORDX4_IMM]], 0, 0 :: (dereferenceable invariant load (s32))
; CHECK-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
; CHECK-NEXT: undef [[S_MOV_B32_:%[0-9]+]].sub1:sgpr_128 = S_MOV_B32 0
- ; CHECK-NEXT: [[BUFFER_LOAD_DWORD_OFFSET:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET undef %112:sgpr_128, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8)
- ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], undef %87:sgpr_128, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8)
+ ; CHECK-NEXT: [[BUFFER_LOAD_DWORD_OFFSET:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET undef %118:sgpr_128, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8)
+ ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], undef %89:sgpr_128, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8)
; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN1:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM1]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8)
- ; CHECK-NEXT: KILL undef %112:sgpr_128
- ; CHECK-NEXT: KILL undef %87:sgpr_128
+ ; CHECK-NEXT: KILL undef %89:sgpr_128
+ ; CHECK-NEXT: KILL undef %118:sgpr_128
; CHECK-NEXT: [[S_SUB_I32_2:%[0-9]+]]:sreg_32 = S_SUB_I32 [[S_BUFFER_LOAD_DWORD_IMM1]], 31, implicit-def dead $scc
; CHECK-NEXT: undef [[S_ADD_U32_1:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY6]], [[S_LSHL_B32_]], implicit-def $scc
; CHECK-NEXT: [[S_ADD_U32_1:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %54:sreg_32, [[S_ASHR_I32_]], implicit-def dead $scc, implicit $scc
; CHECK-NEXT: undef [[S_ADD_U32_2:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY6]], [[S_LSHL_B32_1]], implicit-def $scc
; CHECK-NEXT: [[S_ADD_U32_2:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %54:sreg_32, [[S_ASHR_I32_1]], implicit-def dead $scc, implicit $scc
- ; CHECK-NEXT: [[S_ASHR_I32_3:%[0-9]+]]:sreg_32_xm0 = S_ASHR_I32 undef %148:sreg_32, 31, implicit-def dead $scc
- ; CHECK-NEXT: undef [[S_ADD_U32_3:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY6]], undef %148:sreg_32, implicit-def $scc
- ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM3:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_1]], 64, 0 :: (invariant load (s128) from %ir.77, addrspace 4)
- ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM4:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_2]], 64, 0 :: (invariant load (s128) from %ir.83, addrspace 4)
- ; CHECK-NEXT: KILL [[S_ADD_U32_2]].sub0, [[S_ADD_U32_2]].sub1
+ ; CHECK-NEXT: undef [[S_ADD_U32_3:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY6]], [[S_LSHL_B32_2]], implicit-def $scc
+ ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM2:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_1]], 64, 0 :: (invariant load (s128) from %ir.87, addrspace 4)
+ ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM3:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_2]], 64, 0 :: (invariant load (s128) from %ir.93, addrspace 4)
; CHECK-NEXT: KILL [[S_ADD_U32_1]].sub0, [[S_ADD_U32_1]].sub1
- ; CHECK-NEXT: [[S_ADD_U32_3:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %54:sreg_32, [[S_ASHR_I32_3]], implicit-def dead $scc, implicit $scc
- ; CHECK-NEXT: undef [[S_ADD_U32_4:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY7]].sub0, [[S_LSHL_B32_]], implicit-def $scc
- ; CHECK-NEXT: [[S_ADD_U32_4:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %51:sreg_32, [[S_ASHR_I32_]], implicit-def dead $scc, implicit $scc
- ; CHECK-NEXT: undef [[S_ADD_U32_5:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY7]].sub0, [[S_LSHL_B32_1]], implicit-def $scc
- ; CHECK-NEXT: [[S_ADD_U32_5:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %51:sreg_32, [[S_ASHR_I32_1]], implicit-def dead $scc, implicit $scc
- ; CHECK-NEXT: undef [[S_ADD_U32_6:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY7]].sub0, undef %148:sreg_32, implicit-def $scc
- ; CHECK-NEXT: [[S_ADD_U32_6:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %51:sreg_32, [[S_ASHR_I32_3]], implicit-def dead $scc, implicit $scc
- ; CHECK-NEXT: undef [[S_ADD_U32_7:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY7]].sub0, [[S_LSHL_B32_2]], implicit-def $scc
- ; CHECK-NEXT: [[S_ADD_U32_7:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %51:sreg_32, [[S_ASHR_I32_2]], implicit-def dead $scc, implicit $scc
- ; CHECK-NEXT: undef [[S_ADD_U32_8:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY8]], [[S_LSHL_B32_]], implicit-def $scc
- ; CHECK-NEXT: [[S_ADD_U32_8:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %48:sreg_32, [[S_ASHR_I32_]], implicit-def dead $scc, implicit $scc
- ; CHECK-NEXT: undef [[S_ADD_U32_9:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY9]], [[S_LSHL_B32_1]], implicit-def $scc
- ; CHECK-NEXT: [[S_ADD_U32_9:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %45:sreg_32, [[S_ASHR_I32_1]], implicit-def dead $scc, implicit $scc
- ; CHECK-NEXT: undef [[S_ADD_U32_10:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY9]], [[S_LSHL_B32_2]], implicit-def $scc
- ; CHECK-NEXT: [[S_ADD_U32_10:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %45:sreg_32, [[S_ASHR_I32_2]], implicit-def dead $scc, implicit $scc
+ ; CHECK-NEXT: KILL [[S_ADD_U32_2]].sub0, [[S_ADD_U32_2]].sub1
+ ; CHECK-NEXT: [[S_ADD_U32_3:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %54:sreg_32, [[S_ASHR_I32_2]], implicit-def dead $scc, implicit $scc
+ ; CHECK-NEXT: [[S_ASHR_I32_3:%[0-9]+]]:sreg_32_xm0 = S_ASHR_I32 undef %169:sreg_32, 31, implicit-def dead $scc
+ ; CHECK-NEXT: undef [[S_ADD_U32_4:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY6]], undef %169:sreg_32, implicit-def $scc
+ ; CHECK-NEXT: [[S_ADD_U32_4:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %54:sreg_32, [[S_ASHR_I32_3]], implicit-def dead $scc, implicit $scc
+ ; CHECK-NEXT: undef [[S_ADD_U32_5:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY7]].sub0, [[S_LSHL_B32_]], implicit-def $scc
+ ; CHECK-NEXT: [[S_ADD_U32_5:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %51:sreg_32, [[S_ASHR_I32_]], implicit-def dead $scc, implicit $scc
+ ; CHECK-NEXT: undef [[S_ADD_U32_6:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY7]].sub0, [[S_LSHL_B32_1]], implicit-def $scc
+ ; CHECK-NEXT: [[S_ADD_U32_6:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %51:sreg_32, [[S_ASHR_I32_1]], implicit-def dead $scc, implicit $scc
+ ; CHECK-NEXT: undef [[S_ADD_U32_7:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY7]].sub0, undef %169:sreg_32, implicit-def $scc
+ ; CHECK-NEXT: [[S_ADD_U32_7:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %51:sreg_32, [[S_ASHR_I32_3]], implicit-def dead $scc, implicit $scc
+ ; CHECK-NEXT: undef [[S_ADD_U32_8:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY7]].sub0, [[S_LSHL_B32_2]], implicit-def $scc
+ ; CHECK-NEXT: [[S_ADD_U32_8:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %51:sreg_32, [[S_ASHR_I32_2]], implicit-def dead $scc, implicit $scc
+ ; CHECK-NEXT: undef [[S_ADD_U32_9:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY8]], [[S_LSHL_B32_]], implicit-def $scc
+ ; CHECK-NEXT: [[S_ADD_U32_9:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %48:sreg_32, [[S_ASHR_I32_]], implicit-def dead $scc, implicit $scc
+ ; CHECK-NEXT: undef [[S_ADD_U32_10:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY9]], [[S_LSHL_B32_1]], implicit-def $scc
+ ; CHECK-NEXT: [[S_ADD_U32_10:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %45:sreg_32, [[S_ASHR_I32_1]], implicit-def dead $scc, implicit $scc
+ ; CHECK-NEXT: undef [[S_ADD_U32_11:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY9]], [[S_LSHL_B32_2]], implicit-def $scc
+ ; CHECK-NEXT: [[S_ADD_U32_11:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %45:sreg_32, [[S_ASHR_I32_2]], implicit-def dead $scc, implicit $scc
; CHECK-NEXT: [[S_ADD_I32_:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_LSHL_B32_]], 16, implicit-def dead $scc
; CHECK-NEXT: [[S_ADD_I32_1:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_LSHL_B32_2]], 16, implicit-def dead $scc
; CHECK-NEXT: [[S_BUFFER_LOAD_DWORD_SGPR_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_SGPR_IMM [[S_MOV_B32_]], [[S_ADD_I32_]], 0, 0 :: (dereferenceable invariant load (s32))
- ; CHECK-NEXT: [[S_BUFFER_LOAD_DWORD_SGPR_IMM1:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_SGPR_IMM [[S_MOV_B32_]], undef %279:sreg_32, 0, 0 :: (dereferenceable invariant load (s32))
+ ; CHECK-NEXT: [[S_BUFFER_LOAD_DWORD_SGPR_IMM1:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_SGPR_IMM [[S_MOV_B32_]], undef %302:sreg_32, 0, 0 :: (dereferenceable invariant load (s32))
; CHECK-NEXT: [[S_BUFFER_LOAD_DWORD_SGPR_IMM2:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_SGPR_IMM [[S_MOV_B32_]], [[S_ADD_I32_1]], 0, 0 :: (dereferenceable invariant load (s32))
; CHECK-NEXT: [[S_BUFFER_LOAD_DWORD_IMM2:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM [[S_MOV_B32_]], 16, 0 :: (dereferenceable invariant load (s32))
- ; CHECK-NEXT: [[S_BUFFER_LOAD_DWORD_SGPR_IMM3:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_SGPR_IMM undef %334:sgpr_128, undef %335:sreg_32, 0, 0 :: (dereferenceable invariant load (s32))
- ; CHECK-NEXT: [[S_BUFFER_LOAD_DWORD_IMM3:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM undef %345:sgpr_128, 16, 0 :: (dereferenceable invariant load (s32))
- ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM5:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_3]], 64, 0 :: (invariant load (s128) from %ir.95, addrspace 4)
- ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM6:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_4]], 0, 0 :: (invariant load (s128) from %ir.100, addrspace 4)
- ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM7:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_5]], 0, 0 :: (invariant load (s128) from %ir.105, addrspace 4)
- ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM8:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_6]], 0, 0 :: (invariant load (s128) from %ir.112, addrspace 4)
- ; CHECK-NEXT: [[S_BUFFER_LOAD_DWORD_SGPR_IMM4:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_SGPR_IMM undef %329:sgpr_128, [[S_ADD_I32_]], 0, 0 :: (dereferenceable invariant load (s32))
- ; CHECK-NEXT: [[S_BUFFER_LOAD_DWORD_SGPR_IMM5:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_SGPR_IMM undef %340:sgpr_128, [[S_ADD_I32_1]], 0, 0 :: (dereferenceable invariant load (s32))
- ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN2:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM3]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8)
- ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN3:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM4]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8)
+ ; CHECK-NEXT: [[S_BUFFER_LOAD_DWORD_SGPR_IMM3:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_SGPR_IMM undef %357:sgpr_128, undef %358:sreg_32, 0, 0 :: (dereferenceable invariant load (s32))
+ ; CHECK-NEXT: [[S_BUFFER_LOAD_DWORD_IMM3:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM undef %368:sgpr_128, 16, 0 :: (dereferenceable invariant load (s32))
+ ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM4:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_3]], 64, 0 :: (invariant load (s128) from %ir.99, addrspace 4)
+ ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM5:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_4]], 64, 0 :: (invariant load (s128) from %ir.107, addrspace 4)
+ ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM6:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_5]], 0, 0 :: (invariant load (s128) from %ir.112, addrspace 4)
+ ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM7:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_6]], 0, 0 :: (invariant load (s128) from %ir.117, addrspace 4)
+ ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM8:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_7]], 0, 0 :: (invariant load (s128) from %ir.124, addrspace 4)
+ ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN2:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM2]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8)
+ ; CHECK-NEXT: [[S_BUFFER_LOAD_DWORD_SGPR_IMM4:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_SGPR_IMM undef %352:sgpr_128, [[S_ADD_I32_]], 0, 0 :: (dereferenceable invariant load (s32))
+ ; CHECK-NEXT: [[S_BUFFER_LOAD_DWORD_SGPR_IMM5:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_SGPR_IMM undef %363:sgpr_128, [[S_ADD_I32_1]], 0, 0 :: (dereferenceable invariant load (s32))
+ ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN3:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM3]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8)
; CHECK-NEXT: [[S_ADD_I32_2:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_SGPR_IMM]], -98, implicit-def dead $scc
; CHECK-NEXT: [[S_ADD_I32_3:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_SGPR_IMM1]], -114, implicit-def dead $scc
; CHECK-NEXT: [[S_ADD_I32_4:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_SGPR_IMM2]], -130, implicit-def dead $scc
; CHECK-NEXT: [[S_ADD_I32_5:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_IMM2]], -178, implicit-def dead $scc
- ; CHECK-NEXT: undef [[S_ADD_U32_11:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY10]], [[S_LSHL_B32_]], implicit-def $scc
- ; CHECK-NEXT: [[S_ADD_U32_11:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %42:sreg_32, [[S_ASHR_I32_]], implicit-def dead $scc, implicit $scc
- ; CHECK-NEXT: undef [[S_ADD_U32_12:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY11]], [[S_LSHL_B32_]], implicit-def $scc
- ; CHECK-NEXT: [[S_ADD_U32_12:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %39:sreg_32, [[S_ASHR_I32_]], implicit-def dead $scc, implicit $scc
- ; CHECK-NEXT: undef [[S_ADD_U32_13:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY11]], [[S_LSHL_B32_1]], implicit-def $scc
- ; CHECK-NEXT: [[S_ADD_U32_13:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %39:sreg_32, [[S_ASHR_I32_1]], implicit-def dead $scc, implicit $scc
- ; CHECK-NEXT: undef [[S_ADD_U32_14:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY11]], [[S_LSHL_B32_2]], implicit-def $scc
- ; CHECK-NEXT: [[S_ADD_U32_14:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %39:sreg_32, [[S_ASHR_I32_2]], implicit-def dead $scc, implicit $scc
+ ; CHECK-NEXT: undef [[S_ADD_U32_12:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY10]], [[S_LSHL_B32_]], implicit-def $scc
+ ; CHECK-NEXT: [[S_ADD_U32_12:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %42:sreg_32, [[S_ASHR_I32_]], implicit-def dead $scc, implicit $scc
+ ; CHECK-NEXT: undef [[S_ADD_U32_13:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY11]], [[S_LSHL_B32_]], implicit-def $scc
+ ; CHECK-NEXT: [[S_ADD_U32_13:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %39:sreg_32, [[S_ASHR_I32_]], implicit-def dead $scc, implicit $scc
+ ; CHECK-NEXT: undef [[S_ADD_U32_14:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY11]], [[S_LSHL_B32_1]], implicit-def $scc
+ ; CHECK-NEXT: [[S_ADD_U32_14:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %39:sreg_32, [[S_ASHR_I32_1]], implicit-def dead $scc, implicit $scc
+ ; CHECK-NEXT: undef [[S_ADD_U32_15:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY11]], [[S_LSHL_B32_2]], implicit-def $scc
+ ; CHECK-NEXT: [[S_ADD_U32_15:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %39:sreg_32, [[S_ASHR_I32_2]], implicit-def dead $scc, implicit $scc
; CHECK-NEXT: [[S_LSHL_B32_3:%[0-9]+]]:sreg_32 = S_LSHL_B32 [[COPY12]], 4, implicit-def dead $scc
- ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN4:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM2]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8)
+ ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN4:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM4]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8)
; CHECK-NEXT: [[S_ADD_I32_6:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_LSHL_B32_3]], 16, implicit-def dead $scc
- ; CHECK-NEXT: [[S_BUFFER_LOAD_DWORD_SGPR_IMM6:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_SGPR_IMM undef %361:sgpr_128, [[S_ADD_I32_6]], 0, 0 :: (dereferenceable invariant load (s32))
+ ; CHECK-NEXT: [[S_BUFFER_LOAD_DWORD_SGPR_IMM6:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_SGPR_IMM undef %384:sgpr_128, [[S_ADD_I32_6]], 0, 0 :: (dereferenceable invariant load (s32))
; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN5:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM5]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8)
- ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM9:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_4]], 224, 0 :: (invariant load (s128) from %ir.117, addrspace 4)
- ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM10:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[COPY7]], 224, 0 :: (invariant load (s128) from %ir.133, addrspace 4)
- ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM11:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_4]], 576, 0 :: (invariant load (s128) from %ir.138, addrspace 4)
+ ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM9:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_5]], 224, 0 :: (invariant load (s128) from %ir.129, addrspace 4)
+ ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM10:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[COPY7]], 224, 0 :: (invariant load (s128) from %ir.145, addrspace 4)
+ ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM11:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_5]], 576, 0 :: (invariant load (s128) from %ir.150, addrspace 4)
; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN6:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM6]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8)
- ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM12:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_5]], 224, 0 :: (invariant load (s128) from %ir.122, addrspace 4)
- ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM13:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_6]], 576, 0 :: (invariant load (s128) from %ir.150, addrspace 4)
- ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM14:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_7]], 224, 0 :: (invariant load (s128) from %ir.128, addrspace 4)
+ ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM12:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_6]], 224, 0 :: (invariant load (s128) from %ir.134, addrspace 4)
+ ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM13:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_7]], 576, 0 :: (invariant load (s128) from %ir.162, addrspace 4)
+ ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM14:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_8]], 224, 0 :: (invariant load (s128) from %ir.140, addrspace 4)
; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN7:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM7]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8)
; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN8:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM8]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8)
; CHECK-NEXT: [[S_ADD_I32_7:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_SGPR_IMM4]], -217, implicit-def dead $scc
@@ -133,49 +135,49 @@ define amdgpu_gs void @_amdgpu_gs_main(i32 inreg %primShaderTableAddrLow, <31 x
; CHECK-NEXT: [[S_ADD_I32_12:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_SGPR_IMM3]], -329, implicit-def dead $scc
; CHECK-NEXT: [[S_ADD_I32_13:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_SGPR_IMM3]], -345, implicit-def dead $scc
; CHECK-NEXT: [[S_ADD_I32_14:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_SGPR_IMM6]], -441, implicit-def dead $scc
- ; CHECK-NEXT: undef [[S_ADD_U32_15:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY2]], [[S_LSHL_B32_2]], implicit-def $scc
- ; CHECK-NEXT: [[S_ADD_U32_15:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %36:sreg_32, [[S_ASHR_I32_2]], implicit-def dead $scc, implicit $scc
+ ; CHECK-NEXT: undef [[S_ADD_U32_16:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY2]], [[S_LSHL_B32_2]], implicit-def $scc
+ ; CHECK-NEXT: [[S_ADD_U32_16:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %36:sreg_32, [[S_ASHR_I32_2]], implicit-def dead $scc, implicit $scc
; CHECK-NEXT: [[S_LSHL_B32_4:%[0-9]+]]:sreg_32 = S_LSHL_B32 [[COPY13]], 4, implicit-def dead $scc
; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN9:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM9]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8)
; CHECK-NEXT: [[S_ASHR_I32_4:%[0-9]+]]:sreg_32_xm0 = S_ASHR_I32 [[S_LSHL_B32_4]], 31, implicit-def dead $scc
- ; CHECK-NEXT: undef [[S_ADD_U32_16:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY2]], [[S_LSHL_B32_4]], implicit-def $scc
- ; CHECK-NEXT: [[S_ADD_U32_16:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %36:sreg_32, [[S_ASHR_I32_4]], implicit-def dead $scc, implicit $scc
+ ; CHECK-NEXT: undef [[S_ADD_U32_17:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY2]], [[S_LSHL_B32_4]], implicit-def $scc
+ ; CHECK-NEXT: [[S_ADD_U32_17:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %36:sreg_32, [[S_ASHR_I32_4]], implicit-def dead $scc, implicit $scc
; CHECK-NEXT: [[S_LSHL_B32_5:%[0-9]+]]:sreg_32 = S_LSHL_B32 [[COPY5]], 3, implicit-def dead $scc
; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN10:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM12]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8)
; CHECK-NEXT: [[S_ASHR_I32_5:%[0-9]+]]:sreg_32_xm0 = S_ASHR_I32 [[S_LSHL_B32_5]], 31, implicit-def dead $scc
- ; CHECK-NEXT: undef [[S_ADD_U32_17:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY]].sub0, [[S_LSHL_B32_5]], implicit-def $scc
- ; CHECK-NEXT: [[S_ADD_U32_17:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %57:sreg_32, [[S_ASHR_I32_5]], implicit-def dead $scc, implicit $scc
- ; CHECK-NEXT: [[S_LOAD_DWORD_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM [[S_ADD_U32_17]], 168, 0 :: (invariant load (s32) from %ir.260, align 8, addrspace 4)
- ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM15:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_7]], 576, 0 :: (invariant load (s128) from %ir.145, addrspace 4)
+ ; CHECK-NEXT: undef [[S_ADD_U32_18:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY]].sub0, [[S_LSHL_B32_5]], implicit-def $scc
+ ; CHECK-NEXT: [[S_ADD_U32_18:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %57:sreg_32, [[S_ASHR_I32_5]], implicit-def dead $scc, implicit $scc
+ ; CHECK-NEXT: [[S_LOAD_DWORD_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM [[S_ADD_U32_18]], 168, 0 :: (invariant load (s32) from %ir.273, align 8, addrspace 4)
+ ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM15:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_8]], 576, 0 :: (invariant load (s128) from %ir.157, addrspace 4)
; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN11:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM14]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8)
; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN12:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM10]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8)
; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN13:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM11]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8)
; CHECK-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]].sub3:sgpr_128 = S_MOV_B32 553734060
; CHECK-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]].sub2:sgpr_128 = S_MOV_B32 -1
; CHECK-NEXT: [[COPY15:%[0-9]+]]:sgpr_128 = COPY [[S_LOAD_DWORDX2_IMM]]
- ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM16:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_8]], 0, 0 :: (invariant load (s128) from %ir.158, addrspace 4)
+ ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM16:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_9]], 0, 0 :: (invariant load (s128) from %ir.170, addrspace 4)
; CHECK-NEXT: [[COPY15:%[0-9]+]].sub1:sgpr_128 = COPY [[S_MOV_B32_]].sub1
; CHECK-NEXT: [[COPY15:%[0-9]+]].sub0:sgpr_128 = COPY [[S_LOAD_DWORD_IMM]]
; CHECK-NEXT: [[S_BUFFER_LOAD_DWORD_IMM4:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM [[COPY15]], 0, 0 :: (dereferenceable invariant load (s32))
; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN14:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM15]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8)
; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN15:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM13]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8)
- ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM17:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_9]], 0, 0 :: (invariant load (s128) from %ir.166, addrspace 4)
- ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM18:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_10]], 0, 0 :: (invariant load (s128) from %ir.171, addrspace 4)
+ ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM17:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_10]], 0, 0 :: (invariant load (s128) from %ir.178, addrspace 4)
+ ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM18:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_11]], 0, 0 :: (invariant load (s128) from %ir.183, addrspace 4)
; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN16:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM16]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8)
; CHECK-NEXT: [[S_LSHL_B32_6:%[0-9]+]]:sreg_32 = S_LSHL_B32 [[COPY4]], 3, implicit-def dead $scc
; CHECK-NEXT: [[BUFFER_LOAD_DWORD_OFFSET1:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET [[S_LOAD_DWORDX4_IMM1]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8)
; CHECK-NEXT: [[S_ASHR_I32_6:%[0-9]+]]:sreg_32_xm0 = S_ASHR_I32 [[S_LSHL_B32_6]], 31, implicit-def dead $scc
; CHECK-NEXT: [[S_ADD_I32_15:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_IMM4]], -467, implicit-def dead $scc
- ; CHECK-NEXT: undef [[S_ADD_U32_18:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY]].sub0, [[S_LSHL_B32_6]], implicit-def $scc
- ; CHECK-NEXT: [[S_ADD_U32_18:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %57:sreg_32, [[S_ASHR_I32_6]], implicit-def dead $scc, implicit $scc
- ; CHECK-NEXT: [[S_LOAD_DWORDX2_IMM1:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[S_ADD_U32_18]], 168, 0 :: (invariant load (s64) from %ir.269, addrspace 4)
+ ; CHECK-NEXT: undef [[S_ADD_U32_19:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY]].sub0, [[S_LSHL_B32_6]], implicit-def $scc
+ ; CHECK-NEXT: [[S_ADD_U32_19:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %57:sreg_32, [[S_ASHR_I32_6]], implicit-def dead $scc, implicit $scc
+ ; CHECK-NEXT: [[S_LOAD_DWORDX2_IMM1:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[S_ADD_U32_19]], 168, 0 :: (invariant load (s64) from %ir.282, addrspace 4)
; CHECK-NEXT: [[BUFFER_LOAD_DWORD_OFFSET2:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET [[S_LOAD_DWORDX4_IMM17]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8)
; CHECK-NEXT: [[BUFFER_LOAD_DWORD_OFFSET3:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET [[S_LOAD_DWORDX4_IMM18]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8)
- ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM19:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_11]], 0, 0 :: (invariant load (s128) from %ir.193, addrspace 4)
- ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM20:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_12]], 0, 0 :: (invariant load (s128) from %ir.199, addrspace 4)
+ ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM19:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_12]], 0, 0 :: (invariant load (s128) from %ir.205, addrspace 4)
+ ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM20:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_13]], 0, 0 :: (invariant load (s128) from %ir.211, addrspace 4)
; CHECK-NEXT: [[COPY16:%[0-9]+]]:sgpr_128 = COPY [[S_LOAD_DWORDX2_IMM]]
- ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM21:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_13]], 0, 0 :: (invariant load (s128) from %ir.204, addrspace 4)
- ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM22:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_14]], 0, 0 :: (invariant load (s128) from %ir.209, addrspace 4)
+ ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM21:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_14]], 0, 0 :: (invariant load (s128) from %ir.216, addrspace 4)
+ ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM22:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_15]], 0, 0 :: (invariant load (s128) from %ir.221, addrspace 4)
; CHECK-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32 = S_AND_B32 [[S_LOAD_DWORDX2_IMM1]].sub1, 65535, implicit-def dead $scc
; CHECK-NEXT: [[COPY16:%[0-9]+]].sub0:sgpr_128 = COPY [[S_LOAD_DWORDX2_IMM1]].sub0
; CHECK-NEXT: [[COPY16:%[0-9]+]].sub1:sgpr_128 = COPY [[S_AND_B32_]]
@@ -187,30 +189,30 @@ define amdgpu_gs void @_amdgpu_gs_main(i32 inreg %primShaderTableAddrLow, <31 x
; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN20:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM22]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8)
; CHECK-NEXT: [[S_ASHR_I32_7:%[0-9]+]]:sreg_32_xm0 = S_ASHR_I32 [[S_LSHL_B32_7]], 31, implicit-def dead $scc
; CHECK-NEXT: [[S_ADD_I32_16:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_IMM5]], -468, implicit-def dead $scc
- ; CHECK-NEXT: undef [[S_ADD_U32_19:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY]].sub0, [[S_LSHL_B32_7]], implicit-def $scc
- ; CHECK-NEXT: [[S_ADD_U32_19:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %57:sreg_32, [[S_ASHR_I32_7]], implicit-def dead $scc, implicit $scc
- ; CHECK-NEXT: [[S_LOAD_DWORDX2_IMM2:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[S_ADD_U32_19]], 168, 0 :: (invariant load (s64) from %ir.280, addrspace 4)
+ ; CHECK-NEXT: undef [[S_ADD_U32_20:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY]].sub0, [[S_LSHL_B32_7]], implicit-def $scc
+ ; CHECK-NEXT: [[S_ADD_U32_20:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %57:sreg_32, [[S_ASHR_I32_7]], implicit-def dead $scc, implicit $scc
+ ; CHECK-NEXT: [[S_LOAD_DWORDX2_IMM2:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[S_ADD_U32_20]], 168, 0 :: (invariant load (s64) from %ir.293, addrspace 4)
; CHECK-NEXT: [[COPY17:%[0-9]+]]:sgpr_128 = COPY [[S_LOAD_DWORDX2_IMM]]
; CHECK-NEXT: [[S_AND_B32_1:%[0-9]+]]:sreg_32 = S_AND_B32 [[S_LOAD_DWORDX2_IMM2]].sub1, 65535, implicit-def dead $scc
; CHECK-NEXT: [[COPY17:%[0-9]+]].sub0:sgpr_128 = COPY [[S_LOAD_DWORDX2_IMM2]].sub0
; CHECK-NEXT: [[COPY17:%[0-9]+]].sub1:sgpr_128 = COPY [[S_AND_B32_1]]
; CHECK-NEXT: [[S_BUFFER_LOAD_DWORD_IMM6:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM [[COPY17]], 0, 0 :: (dereferenceable invariant load (s32))
- ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM23:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_15]], 160, 0 :: (invariant load (s128) from %ir.244, addrspace 4)
- ; CHECK-NEXT: [[S_LOAD_DWORD_IMM1:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM undef %443:sreg_64, 0, 0 :: (invariant load (s32) from `ptr addrspace(4) poison`, addrspace 4)
- ; CHECK-NEXT: KILL [[S_ADD_U32_15]].sub0, [[S_ADD_U32_15]].sub1
+ ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM23:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_16]], 160, 0 :: (invariant load (s128) from %ir.256, addrspace 4)
+ ; CHECK-NEXT: [[S_LOAD_DWORD_IMM1:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM undef %470:sreg_64, 0, 0 :: (invariant load (s32) from `ptr addrspace(4) poison`, addrspace 4)
+ ; CHECK-NEXT: KILL [[S_ADD_U32_16]].sub0, [[S_ADD_U32_16]].sub1
+ ; CHECK-NEXT: KILL undef %470:sreg_64
; CHECK-NEXT: KILL [[COPY17]].sub0_sub1_sub2, [[COPY17]].sub3
- ; CHECK-NEXT: KILL undef %443:sreg_64
; CHECK-NEXT: [[S_LSHL_B32_8:%[0-9]+]]:sreg_32 = S_LSHL_B32 [[COPY14]], 3, implicit-def dead $scc
- ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM24:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_16]], 160, 0 :: (invariant load (s128) from %ir.252, addrspace 4)
+ ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM24:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_17]], 160, 0 :: (invariant load (s128) from %ir.265, addrspace 4)
; CHECK-NEXT: [[S_ASHR_I32_8:%[0-9]+]]:sreg_32_xm0 = S_ASHR_I32 [[S_LSHL_B32_8]], 31, implicit-def dead $scc
; CHECK-NEXT: [[S_ADD_I32_17:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_IMM6]], -469, implicit-def dead $scc
- ; CHECK-NEXT: undef [[S_ADD_U32_20:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY]].sub0, [[S_LSHL_B32_8]], implicit-def $scc
- ; CHECK-NEXT: [[S_ADD_U32_20:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %57:sreg_32, [[S_ASHR_I32_8]], implicit-def dead $scc, implicit $scc
- ; CHECK-NEXT: [[S_LOAD_DWORD_IMM2:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM [[S_ADD_U32_20]], 168, 0 :: (invariant load (s32) from %ir.291, align 8, addrspace 4)
+ ; CHECK-NEXT: undef [[S_ADD_U32_21:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY]].sub0, [[S_LSHL_B32_8]], implicit-def $scc
+ ; CHECK-NEXT: [[S_ADD_U32_21:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %57:sreg_32, [[S_ASHR_I32_8]], implicit-def dead $scc, implicit $scc
+ ; CHECK-NEXT: [[S_LOAD_DWORD_IMM2:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM [[S_ADD_U32_21]], 168, 0 :: (invariant load (s32) from %ir.305, align 8, addrspace 4)
; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN21:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM23]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8)
; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN22:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM24]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8)
- ; CHECK-NEXT: KILL [[S_LOAD_DWORDX4_IMM23]]
; CHECK-NEXT: KILL [[S_LOAD_DWORDX4_IMM24]]
+ ; CHECK-NEXT: KILL [[S_LOAD_DWORDX4_IMM23]]
; CHECK-NEXT: [[S_AND_B32_2:%[0-9]+]]:sreg_32 = S_AND_B32 [[S_LOAD_DWORD_IMM1]], 65535, implicit-def dead $scc
; CHECK-NEXT: [[COPY18:%[0-9]+]]:sgpr_128 = COPY [[S_LOAD_DWORDX2_IMM]]
; CHECK-NEXT: [[COPY18:%[0-9]+]].sub1:sgpr_128 = COPY [[S_AND_B32_2]]
@@ -222,22 +224,22 @@ define amdgpu_gs void @_amdgpu_gs_main(i32 inreg %primShaderTableAddrLow, <31 x
; CHECK-NEXT: [[S_ADD_I32_21:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_SGPR_IMM3]], -507, implicit-def dead $scc
; CHECK-NEXT: [[S_ADD_I32_22:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_SGPR_IMM3]], -539, implicit-def dead $scc
; CHECK-NEXT: [[S_ADD_I32_23:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_IMM7]], -473, implicit-def dead $scc
- ; CHECK-NEXT: undef [[S_ADD_U32_21:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY1]], [[S_LSHL_B32_]], implicit-def $scc
- ; CHECK-NEXT: [[S_ADD_U32_21:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %33:sreg_32, [[S_ASHR_I32_]], implicit-def dead $scc, implicit $scc
- ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM25:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_21]], 96, 0 :: (invariant load (s128) from %ir.309, addrspace 4)
- ; CHECK-NEXT: undef [[S_ADD_U32_22:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY1]], [[S_LSHL_B32_1]], implicit-def $scc
- ; CHECK-NEXT: [[S_ADD_U32_22:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %33:sreg_32, [[S_ASHR_I32_1]], implicit-def dead $scc, implicit $scc
- ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM26:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_22]], 96, 0 :: (invariant load (s128) from %ir.315, addrspace 4)
- ; CHECK-NEXT: undef [[S_ADD_U32_23:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY1]], [[S_LSHL_B32_2]], implicit-def $scc
- ; CHECK-NEXT: [[S_ADD_U32_23:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %33:sreg_32, [[S_ASHR_I32_2]], implicit-def dead $scc, implicit $scc
- ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM27:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_23]], 96, 0 :: (invariant load (s128) from %ir.321, addrspace 4)
+ ; CHECK-NEXT: undef [[S_ADD_U32_22:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY1]], [[S_LSHL_B32_]], implicit-def $scc
+ ; CHECK-NEXT: [[S_ADD_U32_22:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %33:sreg_32, [[S_ASHR_I32_]], implicit-def dead $scc, implicit $scc
+ ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM25:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_22]], 96, 0 :: (invariant load (s128) from %ir.323, addrspace 4)
+ ; CHECK-NEXT: undef [[S_ADD_U32_23:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY1]], [[S_LSHL_B32_1]], implicit-def $scc
+ ; CHECK-NEXT: [[S_ADD_U32_23:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %33:sreg_32, [[S_ASHR_I32_1]], implicit-def dead $scc, implicit $scc
+ ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM26:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_23]], 96, 0 :: (invariant load (s128) from %ir.329, addrspace 4)
+ ; CHECK-NEXT: undef [[S_ADD_U32_24:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY1]], [[S_LSHL_B32_2]], implicit-def $scc
+ ; CHECK-NEXT: [[S_ADD_U32_24:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %33:sreg_32, [[S_ASHR_I32_2]], implicit-def dead $scc, implicit $scc
+ ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM27:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_24]], 96, 0 :: (invariant load (s128) from %ir.335, addrspace 4)
; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN23:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM25]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8)
; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN24:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM26]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8)
; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN25:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM27]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8)
+ ; CHECK-NEXT: KILL [[S_LOAD_DWORDX4_IMM27]]
; CHECK-NEXT: KILL [[S_LOAD_DWORDX4_IMM25]]
- ; CHECK-NEXT: KILL [[S_LOAD_DWORDX4_IMM26]]
; CHECK-NEXT: KILL [[V_MOV_B32_e32_]]
- ; CHECK-NEXT: KILL [[S_LOAD_DWORDX4_IMM27]]
+ ; CHECK-NEXT: KILL [[S_LOAD_DWORDX4_IMM26]]
; CHECK-NEXT: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 -2, [[BUFFER_LOAD_FORMAT_X_IDXEN]], 0, implicit $exec
; CHECK-NEXT: [[V_ADD_U32_e64_1:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 -1, [[BUFFER_LOAD_FORMAT_X_IDXEN1]], 0, implicit $exec
; CHECK-NEXT: [[V_ADD_U32_e64_2:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 -3, [[BUFFER_LOAD_FORMAT_X_IDXEN]], 0, implicit $exec
@@ -349,13 +351,13 @@ define amdgpu_gs void @_amdgpu_gs_main(i32 inreg %primShaderTableAddrLow, <31 x
; CHECK-NEXT: [[V_OR_B32_e64_64:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[V_OR_B32_e64_63]], [[V_ADD_U32_e64_28]], implicit $exec
; CHECK-NEXT: [[V_ADD_U32_e64_30:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 -593, [[BUFFER_LOAD_FORMAT_X_IDXEN]], 0, implicit $exec
; CHECK-NEXT: [[V_OR_B32_e64_65:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[V_OR_B32_e64_64]], [[V_ADD_U32_e64_29]], implicit $exec
- ; CHECK-NEXT: [[S_LOAD_DWORDX8_IMM:%[0-9]+]]:sgpr_256 = S_LOAD_DWORDX8_IMM undef %516:sreg_64, 0, 0 :: (invariant load (s256) from `ptr addrspace(4) poison`, addrspace 4)
+ ; CHECK-NEXT: [[S_LOAD_DWORDX8_IMM:%[0-9]+]]:sgpr_256 = S_LOAD_DWORDX8_IMM undef %543:sreg_64, 0, 0 :: (invariant load (s256) from `ptr addrspace(4) poison`, addrspace 4)
; CHECK-NEXT: [[V_OR_B32_e64_66:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[V_OR_B32_e64_65]], [[V_ADD_U32_e64_30]], implicit $exec
; CHECK-NEXT: [[S_ADD_I32_24:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_IMM8]], -594, implicit-def dead $scc
; CHECK-NEXT: [[V_OR_B32_e64_67:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[S_ADD_I32_24]], [[V_OR_B32_e64_66]], implicit $exec
; CHECK-NEXT: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U32_e64 0, [[V_OR_B32_e64_67]], implicit $exec
; CHECK-NEXT: undef [[V_CNDMASK_B32_e64_:%[0-9]+]].sub3:vreg_128 = V_CNDMASK_B32_e64 0, 0, 0, 1, [[V_CMP_EQ_U32_e64_]], implicit $exec
- ; CHECK-NEXT: IMAGE_STORE_V4_V2_nsa_gfx10 [[V_CNDMASK_B32_e64_]], undef %530:vgpr_32, undef %532:vgpr_32, [[S_LOAD_DWORDX8_IMM]], 15, 1, -1, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store (s128), addrspace 8)
+ ; CHECK-NEXT: IMAGE_STORE_V4_V2_nsa_gfx10 [[V_CNDMASK_B32_e64_]], undef %557:vgpr_32, undef %559:vgpr_32, [[S_LOAD_DWORDX8_IMM]], 15, 1, -1, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store (s128), addrspace 8)
; CHECK-NEXT: S_ENDPGM 0
.expVert:
%0 = extractelement <31 x i32> %userData, i64 2
diff --git a/llvm/test/CodeGen/AMDGPU/srem64.ll b/llvm/test/CodeGen/AMDGPU/srem64.ll
index ea9bb04..862e2dd 100644
--- a/llvm/test/CodeGen/AMDGPU/srem64.ll
+++ b/llvm/test/CodeGen/AMDGPU/srem64.ll
@@ -8,12 +8,11 @@ define amdgpu_kernel void @s_test_srem(ptr addrspace(1) %out, i64 %x, i64 %y) {
; GCN-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0xd
; GCN-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x9
; GCN-NEXT: s_mov_b32 s3, 0xf000
-; GCN-NEXT: s_mov_b32 s2, -1
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: v_cvt_f32_u32_e32 v0, s8
; GCN-NEXT: v_cvt_f32_u32_e32 v1, s9
-; GCN-NEXT: s_sub_u32 s10, 0, s8
-; GCN-NEXT: s_subb_u32 s11, 0, s9
+; GCN-NEXT: s_sub_u32 s0, 0, s8
+; GCN-NEXT: s_subb_u32 s1, 0, s9
; GCN-NEXT: v_madmk_f32 v0, v1, 0x4f800000, v0
; GCN-NEXT: v_rcp_f32_e32 v0, v0
; GCN-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0
@@ -22,69 +21,65 @@ define amdgpu_kernel void @s_test_srem(ptr addrspace(1) %out, i64 %x, i64 %y) {
; GCN-NEXT: v_madmk_f32 v0, v1, 0xcf800000, v0
; GCN-NEXT: v_cvt_u32_f32_e32 v0, v0
; GCN-NEXT: v_cvt_u32_f32_e32 v1, v1
-; GCN-NEXT: v_mul_hi_u32 v2, s10, v0
-; GCN-NEXT: v_readfirstlane_b32 s12, v1
-; GCN-NEXT: v_readfirstlane_b32 s0, v0
-; GCN-NEXT: s_mul_i32 s1, s10, s12
-; GCN-NEXT: v_readfirstlane_b32 s15, v2
-; GCN-NEXT: s_mul_i32 s13, s11, s0
-; GCN-NEXT: s_mul_i32 s14, s10, s0
-; GCN-NEXT: s_add_i32 s1, s15, s1
-; GCN-NEXT: v_mul_hi_u32 v3, v0, s14
-; GCN-NEXT: s_add_i32 s1, s1, s13
-; GCN-NEXT: v_mul_hi_u32 v0, v0, s1
-; GCN-NEXT: v_mul_hi_u32 v4, v1, s14
-; GCN-NEXT: v_readfirstlane_b32 s13, v3
-; GCN-NEXT: s_mul_i32 s15, s0, s1
-; GCN-NEXT: v_mul_hi_u32 v1, v1, s1
-; GCN-NEXT: s_add_u32 s13, s13, s15
+; GCN-NEXT: v_mul_hi_u32 v2, s0, v0
+; GCN-NEXT: v_readfirstlane_b32 s10, v1
+; GCN-NEXT: v_readfirstlane_b32 s2, v0
+; GCN-NEXT: s_mul_i32 s11, s0, s10
+; GCN-NEXT: v_readfirstlane_b32 s14, v2
+; GCN-NEXT: s_mul_i32 s12, s1, s2
+; GCN-NEXT: s_mul_i32 s13, s0, s2
+; GCN-NEXT: s_add_i32 s11, s14, s11
+; GCN-NEXT: v_mul_hi_u32 v3, v0, s13
+; GCN-NEXT: s_add_i32 s11, s11, s12
+; GCN-NEXT: v_mul_hi_u32 v0, v0, s11
+; GCN-NEXT: v_mul_hi_u32 v4, v1, s13
+; GCN-NEXT: v_readfirstlane_b32 s12, v3
+; GCN-NEXT: s_mul_i32 s15, s2, s11
+; GCN-NEXT: v_mul_hi_u32 v1, v1, s11
+; GCN-NEXT: s_add_u32 s12, s12, s15
; GCN-NEXT: v_readfirstlane_b32 s15, v0
-; GCN-NEXT: s_mul_i32 s14, s12, s14
+; GCN-NEXT: s_mul_i32 s13, s10, s13
; GCN-NEXT: s_addc_u32 s15, 0, s15
-; GCN-NEXT: v_readfirstlane_b32 s16, v4
-; GCN-NEXT: s_add_u32 s13, s13, s14
-; GCN-NEXT: s_addc_u32 s13, s15, s16
-; GCN-NEXT: v_readfirstlane_b32 s14, v1
-; GCN-NEXT: s_addc_u32 s14, s14, 0
-; GCN-NEXT: s_mul_i32 s1, s12, s1
-; GCN-NEXT: s_add_u32 s1, s13, s1
-; GCN-NEXT: s_addc_u32 s13, 0, s14
-; GCN-NEXT: s_add_u32 s14, s0, s1
-; GCN-NEXT: v_mov_b32_e32 v0, s14
-; GCN-NEXT: v_mul_hi_u32 v0, s10, v0
-; GCN-NEXT: s_cselect_b64 s[0:1], -1, 0
-; GCN-NEXT: s_or_b32 s0, s0, s1
-; GCN-NEXT: s_addc_u32 s12, s12, s13
-; GCN-NEXT: s_mul_i32 s0, s10, s12
-; GCN-NEXT: v_readfirstlane_b32 s1, v0
-; GCN-NEXT: s_add_i32 s0, s1, s0
-; GCN-NEXT: s_mul_i32 s11, s11, s14
-; GCN-NEXT: s_mul_i32 s1, s10, s14
-; GCN-NEXT: s_add_i32 s0, s0, s11
-; GCN-NEXT: v_mov_b32_e32 v2, s1
-; GCN-NEXT: v_mov_b32_e32 v0, s0
-; GCN-NEXT: v_mul_hi_u32 v3, s12, v2
-; GCN-NEXT: v_mul_hi_u32 v2, s14, v2
-; GCN-NEXT: v_mul_hi_u32 v1, s12, v0
-; GCN-NEXT: v_mul_hi_u32 v0, s14, v0
-; GCN-NEXT: s_mul_i32 s11, s14, s0
-; GCN-NEXT: v_readfirstlane_b32 s15, v2
-; GCN-NEXT: s_add_u32 s11, s15, s11
+; GCN-NEXT: v_readfirstlane_b32 s14, v4
+; GCN-NEXT: s_add_u32 s12, s12, s13
+; GCN-NEXT: s_addc_u32 s12, s15, s14
+; GCN-NEXT: v_readfirstlane_b32 s13, v1
+; GCN-NEXT: s_addc_u32 s13, s13, 0
+; GCN-NEXT: s_mul_i32 s11, s10, s11
+; GCN-NEXT: s_add_u32 s11, s12, s11
+; GCN-NEXT: s_addc_u32 s12, 0, s13
+; GCN-NEXT: s_add_u32 s11, s2, s11
+; GCN-NEXT: v_mov_b32_e32 v0, s11
+; GCN-NEXT: v_mul_hi_u32 v0, s0, v0
+; GCN-NEXT: s_addc_u32 s10, s10, s12
+; GCN-NEXT: s_mul_i32 s12, s0, s10
+; GCN-NEXT: s_mul_i32 s1, s1, s11
; GCN-NEXT: v_readfirstlane_b32 s13, v0
-; GCN-NEXT: s_mul_i32 s1, s12, s1
-; GCN-NEXT: s_addc_u32 s13, 0, s13
-; GCN-NEXT: v_readfirstlane_b32 s10, v3
-; GCN-NEXT: s_add_u32 s1, s11, s1
-; GCN-NEXT: s_addc_u32 s1, s13, s10
-; GCN-NEXT: v_readfirstlane_b32 s10, v1
-; GCN-NEXT: s_addc_u32 s10, s10, 0
-; GCN-NEXT: s_mul_i32 s0, s12, s0
-; GCN-NEXT: s_add_u32 s0, s1, s0
-; GCN-NEXT: s_addc_u32 s10, 0, s10
-; GCN-NEXT: s_add_u32 s11, s14, s0
-; GCN-NEXT: s_cselect_b64 s[0:1], -1, 0
-; GCN-NEXT: s_or_b32 s0, s0, s1
-; GCN-NEXT: s_addc_u32 s1, s12, s10
+; GCN-NEXT: s_add_i32 s12, s13, s12
+; GCN-NEXT: s_mul_i32 s0, s0, s11
+; GCN-NEXT: s_add_i32 s1, s12, s1
+; GCN-NEXT: v_mov_b32_e32 v2, s0
+; GCN-NEXT: v_mov_b32_e32 v0, s1
+; GCN-NEXT: v_mul_hi_u32 v3, s10, v2
+; GCN-NEXT: v_mul_hi_u32 v2, s11, v2
+; GCN-NEXT: v_mul_hi_u32 v1, s10, v0
+; GCN-NEXT: v_mul_hi_u32 v0, s11, v0
+; GCN-NEXT: s_mul_i32 s13, s11, s1
+; GCN-NEXT: v_readfirstlane_b32 s15, v2
+; GCN-NEXT: s_add_u32 s13, s15, s13
+; GCN-NEXT: v_readfirstlane_b32 s14, v0
+; GCN-NEXT: s_mul_i32 s0, s10, s0
+; GCN-NEXT: s_addc_u32 s14, 0, s14
+; GCN-NEXT: v_readfirstlane_b32 s12, v3
+; GCN-NEXT: s_add_u32 s0, s13, s0
+; GCN-NEXT: s_addc_u32 s0, s14, s12
+; GCN-NEXT: v_readfirstlane_b32 s12, v1
+; GCN-NEXT: s_addc_u32 s12, s12, 0
+; GCN-NEXT: s_mul_i32 s1, s10, s1
+; GCN-NEXT: s_add_u32 s0, s0, s1
+; GCN-NEXT: s_addc_u32 s1, 0, s12
+; GCN-NEXT: s_add_u32 s11, s11, s0
+; GCN-NEXT: s_addc_u32 s1, s10, s1
; GCN-NEXT: v_mov_b32_e32 v0, s1
; GCN-NEXT: v_mul_hi_u32 v1, s6, v0
; GCN-NEXT: v_mov_b32_e32 v2, s11
@@ -118,11 +113,9 @@ define amdgpu_kernel void @s_test_srem(ptr addrspace(1) %out, i64 %x, i64 %y) {
; GCN-NEXT: s_mul_i32 s4, s8, s4
; GCN-NEXT: s_sub_u32 s6, s6, s4
; GCN-NEXT: s_cselect_b64 s[4:5], -1, 0
-; GCN-NEXT: s_or_b32 s11, s4, s5
; GCN-NEXT: s_subb_u32 s13, s10, s9
; GCN-NEXT: s_sub_u32 s14, s6, s8
; GCN-NEXT: s_cselect_b64 s[10:11], -1, 0
-; GCN-NEXT: s_or_b32 s15, s10, s11
; GCN-NEXT: s_subb_u32 s15, s13, 0
; GCN-NEXT: s_cmp_ge_u32 s15, s9
; GCN-NEXT: s_cselect_b32 s16, -1, 0
@@ -131,13 +124,11 @@ define amdgpu_kernel void @s_test_srem(ptr addrspace(1) %out, i64 %x, i64 %y) {
; GCN-NEXT: s_cmp_eq_u32 s15, s9
; GCN-NEXT: s_cselect_b32 s16, s17, s16
; GCN-NEXT: s_or_b32 s10, s10, s11
-; GCN-NEXT: s_subb_u32 s13, s13, s9
-; GCN-NEXT: s_sub_u32 s17, s14, s8
-; GCN-NEXT: s_cselect_b64 s[10:11], -1, 0
-; GCN-NEXT: s_or_b32 s10, s10, s11
-; GCN-NEXT: s_subb_u32 s10, s13, 0
+; GCN-NEXT: s_subb_u32 s10, s13, s9
+; GCN-NEXT: s_sub_u32 s11, s14, s8
+; GCN-NEXT: s_subb_u32 s10, s10, 0
; GCN-NEXT: s_cmp_lg_u32 s16, 0
-; GCN-NEXT: s_cselect_b32 s11, s17, s14
+; GCN-NEXT: s_cselect_b32 s11, s11, s14
; GCN-NEXT: s_cselect_b32 s10, s10, s15
; GCN-NEXT: s_or_b32 s4, s4, s5
; GCN-NEXT: s_subb_u32 s4, s7, s12
@@ -150,6 +141,7 @@ define amdgpu_kernel void @s_test_srem(ptr addrspace(1) %out, i64 %x, i64 %y) {
; GCN-NEXT: s_cmp_lg_u32 s5, 0
; GCN-NEXT: s_cselect_b32 s4, s10, s4
; GCN-NEXT: s_cselect_b32 s5, s11, s6
+; GCN-NEXT: s_mov_b32 s2, -1
; GCN-NEXT: v_mov_b32_e32 v0, s5
; GCN-NEXT: v_mov_b32_e32 v1, s4
; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
@@ -180,8 +172,6 @@ define amdgpu_kernel void @s_test_srem(ptr addrspace(1) %out, i64 %x, i64 %y) {
; GCN-IR-NEXT: s_cbranch_vccz .LBB0_5
; GCN-IR-NEXT: ; %bb.1: ; %udiv-bb1
; GCN-IR-NEXT: s_add_u32 s14, s12, 1
-; GCN-IR-NEXT: s_cselect_b64 s[8:9], -1, 0
-; GCN-IR-NEXT: s_or_b32 s8, s8, s9
; GCN-IR-NEXT: s_addc_u32 s8, s13, 0
; GCN-IR-NEXT: s_cselect_b64 s[8:9], -1, 0
; GCN-IR-NEXT: s_sub_i32 s12, 63, s12
@@ -213,8 +203,6 @@ define amdgpu_kernel void @s_test_srem(ptr addrspace(1) %out, i64 %x, i64 %y) {
; GCN-IR-NEXT: s_sub_u32 s12, s12, s18
; GCN-IR-NEXT: s_subb_u32 s13, s13, s19
; GCN-IR-NEXT: s_add_u32 s16, s16, 1
-; GCN-IR-NEXT: s_cselect_b64 s[18:19], -1, 0
-; GCN-IR-NEXT: s_or_b32 s18, s18, s19
; GCN-IR-NEXT: s_addc_u32 s17, s17, 0
; GCN-IR-NEXT: s_cselect_b64 s[18:19], -1, 0
; GCN-IR-NEXT: s_mov_b64 s[10:11], s[4:5]
@@ -968,81 +956,76 @@ define amdgpu_kernel void @s_test_srem33_64(ptr addrspace(1) %out, i64 %x, i64 %
; GCN-NEXT: s_xor_b64 s[4:5], s[2:3], s[4:5]
; GCN-NEXT: v_cvt_f32_u32_e32 v0, s4
; GCN-NEXT: v_cvt_f32_u32_e32 v1, s5
-; GCN-NEXT: s_sub_u32 s10, 0, s4
-; GCN-NEXT: s_subb_u32 s11, 0, s5
+; GCN-NEXT: s_sub_u32 s8, 0, s4
+; GCN-NEXT: s_subb_u32 s9, 0, s5
; GCN-NEXT: s_mov_b32 s3, 0xf000
; GCN-NEXT: v_madmk_f32 v0, v1, 0x4f800000, v0
; GCN-NEXT: v_rcp_f32_e32 v0, v0
-; GCN-NEXT: s_mov_b32 s2, -1
; GCN-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0
; GCN-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0
; GCN-NEXT: v_trunc_f32_e32 v1, v1
; GCN-NEXT: v_madmk_f32 v0, v1, 0xcf800000, v0
; GCN-NEXT: v_cvt_u32_f32_e32 v0, v0
; GCN-NEXT: v_cvt_u32_f32_e32 v1, v1
-; GCN-NEXT: v_mul_hi_u32 v2, s10, v0
-; GCN-NEXT: v_readfirstlane_b32 s12, v1
-; GCN-NEXT: v_readfirstlane_b32 s8, v0
-; GCN-NEXT: s_mul_i32 s9, s10, s12
-; GCN-NEXT: v_readfirstlane_b32 s15, v2
-; GCN-NEXT: s_mul_i32 s13, s11, s8
-; GCN-NEXT: s_mul_i32 s14, s10, s8
-; GCN-NEXT: s_add_i32 s9, s15, s9
-; GCN-NEXT: v_mul_hi_u32 v3, v0, s14
-; GCN-NEXT: s_add_i32 s9, s9, s13
-; GCN-NEXT: v_mul_hi_u32 v0, v0, s9
-; GCN-NEXT: v_mul_hi_u32 v4, v1, s14
-; GCN-NEXT: v_readfirstlane_b32 s13, v3
-; GCN-NEXT: s_mul_i32 s15, s8, s9
-; GCN-NEXT: s_add_u32 s13, s13, s15
-; GCN-NEXT: v_readfirstlane_b32 s15, v0
-; GCN-NEXT: v_mul_hi_u32 v0, v1, s9
-; GCN-NEXT: s_addc_u32 s15, 0, s15
-; GCN-NEXT: s_mul_i32 s14, s12, s14
-; GCN-NEXT: v_readfirstlane_b32 s16, v4
-; GCN-NEXT: s_add_u32 s13, s13, s14
-; GCN-NEXT: s_addc_u32 s13, s15, s16
+; GCN-NEXT: v_mul_hi_u32 v2, s8, v0
+; GCN-NEXT: v_readfirstlane_b32 s10, v1
+; GCN-NEXT: v_readfirstlane_b32 s2, v0
+; GCN-NEXT: s_mul_i32 s11, s8, s10
+; GCN-NEXT: v_readfirstlane_b32 s14, v2
+; GCN-NEXT: s_mul_i32 s12, s9, s2
+; GCN-NEXT: s_mul_i32 s13, s8, s2
+; GCN-NEXT: s_add_i32 s11, s14, s11
+; GCN-NEXT: v_mul_hi_u32 v3, v0, s13
+; GCN-NEXT: s_add_i32 s11, s11, s12
+; GCN-NEXT: v_mul_hi_u32 v0, v0, s11
+; GCN-NEXT: v_mul_hi_u32 v4, v1, s13
+; GCN-NEXT: v_readfirstlane_b32 s12, v3
+; GCN-NEXT: s_mul_i32 s14, s2, s11
+; GCN-NEXT: v_mul_hi_u32 v1, v1, s11
+; GCN-NEXT: s_add_u32 s12, s12, s14
; GCN-NEXT: v_readfirstlane_b32 s14, v0
-; GCN-NEXT: s_addc_u32 s14, s14, 0
-; GCN-NEXT: s_mul_i32 s9, s12, s9
-; GCN-NEXT: s_add_u32 s9, s13, s9
-; GCN-NEXT: s_addc_u32 s13, 0, s14
-; GCN-NEXT: s_add_u32 s14, s8, s9
-; GCN-NEXT: v_mov_b32_e32 v0, s14
-; GCN-NEXT: v_mul_hi_u32 v0, s10, v0
-; GCN-NEXT: s_cselect_b64 s[8:9], -1, 0
-; GCN-NEXT: s_or_b32 s8, s8, s9
-; GCN-NEXT: s_addc_u32 s12, s12, s13
-; GCN-NEXT: s_mul_i32 s8, s10, s12
-; GCN-NEXT: v_readfirstlane_b32 s9, v0
-; GCN-NEXT: s_add_i32 s8, s9, s8
-; GCN-NEXT: s_mul_i32 s11, s11, s14
-; GCN-NEXT: s_mul_i32 s9, s10, s14
-; GCN-NEXT: s_add_i32 s8, s8, s11
-; GCN-NEXT: v_mov_b32_e32 v2, s9
-; GCN-NEXT: v_mov_b32_e32 v0, s8
-; GCN-NEXT: v_mul_hi_u32 v3, s12, v2
-; GCN-NEXT: v_mul_hi_u32 v2, s14, v2
-; GCN-NEXT: v_mul_hi_u32 v1, s12, v0
-; GCN-NEXT: v_mul_hi_u32 v0, s14, v0
-; GCN-NEXT: s_mul_i32 s11, s14, s8
-; GCN-NEXT: v_readfirstlane_b32 s15, v2
-; GCN-NEXT: s_add_u32 s11, s15, s11
+; GCN-NEXT: s_addc_u32 s14, 0, s14
+; GCN-NEXT: s_mul_i32 s13, s10, s13
+; GCN-NEXT: v_readfirstlane_b32 s15, v4
+; GCN-NEXT: s_add_u32 s12, s12, s13
+; GCN-NEXT: s_addc_u32 s12, s14, s15
+; GCN-NEXT: v_readfirstlane_b32 s13, v1
+; GCN-NEXT: s_addc_u32 s13, s13, 0
+; GCN-NEXT: s_mul_i32 s11, s10, s11
+; GCN-NEXT: s_add_u32 s11, s12, s11
+; GCN-NEXT: s_addc_u32 s12, 0, s13
+; GCN-NEXT: s_add_u32 s11, s2, s11
+; GCN-NEXT: v_mov_b32_e32 v0, s11
+; GCN-NEXT: v_mul_hi_u32 v0, s8, v0
+; GCN-NEXT: s_addc_u32 s10, s10, s12
+; GCN-NEXT: s_mul_i32 s12, s8, s10
+; GCN-NEXT: s_mul_i32 s9, s9, s11
; GCN-NEXT: v_readfirstlane_b32 s13, v0
-; GCN-NEXT: s_mul_i32 s9, s12, s9
-; GCN-NEXT: s_addc_u32 s13, 0, s13
-; GCN-NEXT: v_readfirstlane_b32 s10, v3
-; GCN-NEXT: s_add_u32 s9, s11, s9
-; GCN-NEXT: s_addc_u32 s9, s13, s10
-; GCN-NEXT: v_readfirstlane_b32 s10, v1
-; GCN-NEXT: s_addc_u32 s10, s10, 0
-; GCN-NEXT: s_mul_i32 s8, s12, s8
-; GCN-NEXT: s_add_u32 s8, s9, s8
-; GCN-NEXT: s_addc_u32 s10, 0, s10
-; GCN-NEXT: s_add_u32 s11, s14, s8
-; GCN-NEXT: s_cselect_b64 s[8:9], -1, 0
-; GCN-NEXT: s_or_b32 s8, s8, s9
-; GCN-NEXT: s_addc_u32 s10, s12, s10
+; GCN-NEXT: s_add_i32 s12, s13, s12
+; GCN-NEXT: s_mul_i32 s8, s8, s11
+; GCN-NEXT: s_add_i32 s9, s12, s9
+; GCN-NEXT: v_mov_b32_e32 v2, s8
+; GCN-NEXT: v_mov_b32_e32 v0, s9
+; GCN-NEXT: v_mul_hi_u32 v3, s10, v2
+; GCN-NEXT: v_mul_hi_u32 v2, s11, v2
+; GCN-NEXT: v_mul_hi_u32 v1, s10, v0
+; GCN-NEXT: v_mul_hi_u32 v0, s11, v0
+; GCN-NEXT: s_mul_i32 s13, s11, s9
+; GCN-NEXT: v_readfirstlane_b32 s15, v2
+; GCN-NEXT: s_add_u32 s13, s15, s13
+; GCN-NEXT: v_readfirstlane_b32 s14, v0
+; GCN-NEXT: s_mul_i32 s8, s10, s8
+; GCN-NEXT: s_addc_u32 s14, 0, s14
+; GCN-NEXT: v_readfirstlane_b32 s12, v3
+; GCN-NEXT: s_add_u32 s8, s13, s8
+; GCN-NEXT: s_addc_u32 s8, s14, s12
+; GCN-NEXT: v_readfirstlane_b32 s12, v1
+; GCN-NEXT: s_addc_u32 s12, s12, 0
+; GCN-NEXT: s_mul_i32 s9, s10, s9
+; GCN-NEXT: s_add_u32 s8, s8, s9
+; GCN-NEXT: s_addc_u32 s9, 0, s12
+; GCN-NEXT: s_add_u32 s11, s11, s8
+; GCN-NEXT: s_addc_u32 s10, s10, s9
; GCN-NEXT: s_ashr_i32 s8, s7, 31
; GCN-NEXT: s_add_u32 s6, s6, s8
; GCN-NEXT: s_mov_b32 s9, s8
@@ -1071,6 +1054,7 @@ define amdgpu_kernel void @s_test_srem33_64(ptr addrspace(1) %out, i64 %x, i64 %
; GCN-NEXT: v_mul_hi_u32 v0, s4, v0
; GCN-NEXT: s_addc_u32 s11, 0, s12
; GCN-NEXT: s_mul_i32 s11, s4, s11
+; GCN-NEXT: s_mov_b32 s2, -1
; GCN-NEXT: v_readfirstlane_b32 s12, v0
; GCN-NEXT: s_add_i32 s11, s12, s11
; GCN-NEXT: s_mul_i32 s12, s5, s10
@@ -1079,11 +1063,9 @@ define amdgpu_kernel void @s_test_srem33_64(ptr addrspace(1) %out, i64 %x, i64 %
; GCN-NEXT: s_mul_i32 s10, s4, s10
; GCN-NEXT: s_sub_u32 s6, s6, s10
; GCN-NEXT: s_cselect_b64 s[10:11], -1, 0
-; GCN-NEXT: s_or_b32 s13, s10, s11
; GCN-NEXT: s_subb_u32 s15, s12, s5
; GCN-NEXT: s_sub_u32 s16, s6, s4
; GCN-NEXT: s_cselect_b64 s[12:13], -1, 0
-; GCN-NEXT: s_or_b32 s17, s12, s13
; GCN-NEXT: s_subb_u32 s17, s15, 0
; GCN-NEXT: s_cmp_ge_u32 s17, s5
; GCN-NEXT: s_cselect_b32 s18, -1, 0
@@ -1092,13 +1074,11 @@ define amdgpu_kernel void @s_test_srem33_64(ptr addrspace(1) %out, i64 %x, i64 %
; GCN-NEXT: s_cmp_eq_u32 s17, s5
; GCN-NEXT: s_cselect_b32 s18, s19, s18
; GCN-NEXT: s_or_b32 s12, s12, s13
-; GCN-NEXT: s_subb_u32 s15, s15, s5
-; GCN-NEXT: s_sub_u32 s19, s16, s4
-; GCN-NEXT: s_cselect_b64 s[12:13], -1, 0
-; GCN-NEXT: s_or_b32 s12, s12, s13
-; GCN-NEXT: s_subb_u32 s12, s15, 0
+; GCN-NEXT: s_subb_u32 s12, s15, s5
+; GCN-NEXT: s_sub_u32 s13, s16, s4
+; GCN-NEXT: s_subb_u32 s12, s12, 0
; GCN-NEXT: s_cmp_lg_u32 s18, 0
-; GCN-NEXT: s_cselect_b32 s13, s19, s16
+; GCN-NEXT: s_cselect_b32 s13, s13, s16
; GCN-NEXT: s_cselect_b32 s12, s12, s17
; GCN-NEXT: s_or_b32 s10, s10, s11
; GCN-NEXT: s_subb_u32 s7, s7, s14
@@ -1156,8 +1136,6 @@ define amdgpu_kernel void @s_test_srem33_64(ptr addrspace(1) %out, i64 %x, i64 %
; GCN-IR-NEXT: s_cbranch_vccz .LBB8_5
; GCN-IR-NEXT: ; %bb.1: ; %udiv-bb1
; GCN-IR-NEXT: s_add_u32 s16, s14, 1
-; GCN-IR-NEXT: s_cselect_b64 s[10:11], -1, 0
-; GCN-IR-NEXT: s_or_b32 s10, s10, s11
; GCN-IR-NEXT: s_addc_u32 s10, s15, 0
; GCN-IR-NEXT: s_cselect_b64 s[10:11], -1, 0
; GCN-IR-NEXT: s_sub_i32 s14, 63, s14
@@ -1189,8 +1167,6 @@ define amdgpu_kernel void @s_test_srem33_64(ptr addrspace(1) %out, i64 %x, i64 %
; GCN-IR-NEXT: s_sub_u32 s14, s14, s20
; GCN-IR-NEXT: s_subb_u32 s15, s15, s21
; GCN-IR-NEXT: s_add_u32 s18, s18, 1
-; GCN-IR-NEXT: s_cselect_b64 s[20:21], -1, 0
-; GCN-IR-NEXT: s_or_b32 s20, s20, s21
; GCN-IR-NEXT: s_addc_u32 s19, s19, 0
; GCN-IR-NEXT: s_cselect_b64 s[20:21], -1, 0
; GCN-IR-NEXT: s_mov_b64 s[12:13], s[2:3]
@@ -1316,8 +1292,7 @@ define amdgpu_kernel void @s_test_srem_k_num_i64(ptr addrspace(1) %out, i64 %x)
; GCN-NEXT: v_cvt_f32_u32_e32 v0, s4
; GCN-NEXT: v_cvt_f32_u32_e32 v1, s5
; GCN-NEXT: s_sub_u32 s2, 0, s4
-; GCN-NEXT: s_subb_u32 s8, 0, s5
-; GCN-NEXT: s_mov_b32 s3, 0xf000
+; GCN-NEXT: s_subb_u32 s6, 0, s5
; GCN-NEXT: v_madmk_f32 v0, v1, 0x4f800000, v0
; GCN-NEXT: v_rcp_f32_e32 v0, v0
; GCN-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0
@@ -1327,72 +1302,68 @@ define amdgpu_kernel void @s_test_srem_k_num_i64(ptr addrspace(1) %out, i64 %x)
; GCN-NEXT: v_cvt_u32_f32_e32 v0, v0
; GCN-NEXT: v_cvt_u32_f32_e32 v1, v1
; GCN-NEXT: v_mul_hi_u32 v2, s2, v0
-; GCN-NEXT: v_readfirstlane_b32 s9, v1
-; GCN-NEXT: v_readfirstlane_b32 s6, v0
-; GCN-NEXT: s_mul_i32 s7, s2, s9
-; GCN-NEXT: v_readfirstlane_b32 s12, v2
-; GCN-NEXT: s_mul_i32 s10, s8, s6
-; GCN-NEXT: s_mul_i32 s11, s2, s6
-; GCN-NEXT: s_add_i32 s7, s12, s7
-; GCN-NEXT: v_mul_hi_u32 v3, v0, s11
-; GCN-NEXT: s_add_i32 s7, s7, s10
-; GCN-NEXT: v_mul_hi_u32 v0, v0, s7
-; GCN-NEXT: v_mul_hi_u32 v4, v1, s11
-; GCN-NEXT: v_readfirstlane_b32 s10, v3
-; GCN-NEXT: s_mul_i32 s13, s6, s7
-; GCN-NEXT: v_mul_hi_u32 v1, v1, s7
-; GCN-NEXT: s_add_u32 s10, s10, s13
-; GCN-NEXT: v_readfirstlane_b32 s13, v0
-; GCN-NEXT: s_mul_i32 s11, s9, s11
-; GCN-NEXT: s_addc_u32 s13, 0, s13
-; GCN-NEXT: v_readfirstlane_b32 s12, v4
-; GCN-NEXT: s_add_u32 s10, s10, s11
-; GCN-NEXT: s_addc_u32 s10, s13, s12
-; GCN-NEXT: v_readfirstlane_b32 s11, v1
-; GCN-NEXT: s_addc_u32 s11, s11, 0
-; GCN-NEXT: s_mul_i32 s7, s9, s7
-; GCN-NEXT: s_add_u32 s7, s10, s7
-; GCN-NEXT: s_addc_u32 s10, 0, s11
-; GCN-NEXT: s_add_u32 s11, s6, s7
-; GCN-NEXT: v_mov_b32_e32 v0, s11
+; GCN-NEXT: v_readfirstlane_b32 s7, v1
+; GCN-NEXT: v_readfirstlane_b32 s3, v0
+; GCN-NEXT: s_mul_i32 s8, s2, s7
+; GCN-NEXT: v_readfirstlane_b32 s11, v2
+; GCN-NEXT: s_mul_i32 s9, s6, s3
+; GCN-NEXT: s_mul_i32 s10, s2, s3
+; GCN-NEXT: s_add_i32 s8, s11, s8
+; GCN-NEXT: v_mul_hi_u32 v3, v0, s10
+; GCN-NEXT: s_add_i32 s8, s8, s9
+; GCN-NEXT: v_mul_hi_u32 v0, v0, s8
+; GCN-NEXT: v_mul_hi_u32 v4, v1, s10
+; GCN-NEXT: v_readfirstlane_b32 s9, v3
+; GCN-NEXT: v_mul_hi_u32 v1, v1, s8
+; GCN-NEXT: s_mul_i32 s12, s3, s8
+; GCN-NEXT: s_add_u32 s9, s9, s12
+; GCN-NEXT: v_readfirstlane_b32 s12, v0
+; GCN-NEXT: s_mul_i32 s10, s7, s10
+; GCN-NEXT: s_addc_u32 s12, 0, s12
+; GCN-NEXT: v_readfirstlane_b32 s11, v4
+; GCN-NEXT: s_add_u32 s9, s9, s10
+; GCN-NEXT: v_readfirstlane_b32 s13, v1
+; GCN-NEXT: s_addc_u32 s9, s12, s11
+; GCN-NEXT: s_addc_u32 s10, s13, 0
+; GCN-NEXT: s_mul_i32 s8, s7, s8
+; GCN-NEXT: s_add_u32 s8, s9, s8
+; GCN-NEXT: s_addc_u32 s9, 0, s10
+; GCN-NEXT: s_add_u32 s8, s3, s8
+; GCN-NEXT: v_mov_b32_e32 v0, s8
; GCN-NEXT: v_mul_hi_u32 v0, s2, v0
-; GCN-NEXT: s_cselect_b64 s[6:7], -1, 0
-; GCN-NEXT: s_or_b32 s6, s6, s7
-; GCN-NEXT: s_addc_u32 s9, s9, s10
-; GCN-NEXT: s_mul_i32 s6, s2, s9
-; GCN-NEXT: v_readfirstlane_b32 s7, v0
-; GCN-NEXT: s_add_i32 s6, s7, s6
-; GCN-NEXT: s_mul_i32 s8, s8, s11
-; GCN-NEXT: s_mul_i32 s2, s2, s11
-; GCN-NEXT: s_add_i32 s6, s6, s8
+; GCN-NEXT: s_addc_u32 s7, s7, s9
+; GCN-NEXT: s_mul_i32 s9, s2, s7
+; GCN-NEXT: s_mul_i32 s6, s6, s8
+; GCN-NEXT: v_readfirstlane_b32 s10, v0
+; GCN-NEXT: s_add_i32 s9, s10, s9
+; GCN-NEXT: s_mul_i32 s2, s2, s8
+; GCN-NEXT: s_add_i32 s6, s9, s6
; GCN-NEXT: v_mov_b32_e32 v2, s2
; GCN-NEXT: v_mov_b32_e32 v0, s6
-; GCN-NEXT: v_mul_hi_u32 v3, s9, v2
-; GCN-NEXT: v_mul_hi_u32 v2, s11, v2
-; GCN-NEXT: v_mul_hi_u32 v1, s9, v0
-; GCN-NEXT: v_mul_hi_u32 v0, s11, v0
-; GCN-NEXT: s_mul_i32 s8, s11, s6
+; GCN-NEXT: v_mul_hi_u32 v3, s7, v2
+; GCN-NEXT: v_mul_hi_u32 v2, s8, v2
+; GCN-NEXT: v_mul_hi_u32 v1, s7, v0
+; GCN-NEXT: v_mul_hi_u32 v0, s8, v0
+; GCN-NEXT: s_mul_i32 s10, s8, s6
; GCN-NEXT: v_readfirstlane_b32 s12, v2
-; GCN-NEXT: s_add_u32 s8, s12, s8
-; GCN-NEXT: v_readfirstlane_b32 s10, v0
-; GCN-NEXT: s_mul_i32 s2, s9, s2
-; GCN-NEXT: s_addc_u32 s10, 0, s10
-; GCN-NEXT: v_readfirstlane_b32 s7, v3
-; GCN-NEXT: s_add_u32 s2, s8, s2
-; GCN-NEXT: s_addc_u32 s2, s10, s7
-; GCN-NEXT: v_readfirstlane_b32 s7, v1
-; GCN-NEXT: s_addc_u32 s7, s7, 0
-; GCN-NEXT: s_mul_i32 s6, s9, s6
+; GCN-NEXT: s_add_u32 s10, s12, s10
+; GCN-NEXT: v_readfirstlane_b32 s11, v0
+; GCN-NEXT: s_mul_i32 s2, s7, s2
+; GCN-NEXT: s_addc_u32 s11, 0, s11
+; GCN-NEXT: v_readfirstlane_b32 s9, v3
+; GCN-NEXT: s_add_u32 s2, s10, s2
+; GCN-NEXT: s_addc_u32 s2, s11, s9
+; GCN-NEXT: v_readfirstlane_b32 s9, v1
+; GCN-NEXT: s_addc_u32 s9, s9, 0
+; GCN-NEXT: s_mul_i32 s6, s7, s6
; GCN-NEXT: s_add_u32 s2, s2, s6
-; GCN-NEXT: s_addc_u32 s8, 0, s7
-; GCN-NEXT: s_add_u32 s2, s11, s2
-; GCN-NEXT: s_cselect_b64 s[6:7], -1, 0
-; GCN-NEXT: s_or_b32 s6, s6, s7
-; GCN-NEXT: s_addc_u32 s6, s9, s8
+; GCN-NEXT: s_addc_u32 s6, 0, s9
+; GCN-NEXT: s_add_u32 s2, s8, s2
+; GCN-NEXT: s_addc_u32 s6, s7, s6
; GCN-NEXT: v_mul_hi_u32 v1, s2, 24
; GCN-NEXT: v_mul_hi_u32 v0, s6, 24
; GCN-NEXT: s_mul_i32 s6, s6, 24
-; GCN-NEXT: s_mov_b32 s2, -1
+; GCN-NEXT: s_mov_b32 s3, 0xf000
; GCN-NEXT: v_readfirstlane_b32 s8, v1
; GCN-NEXT: v_readfirstlane_b32 s7, v0
; GCN-NEXT: s_add_u32 s6, s8, s6
@@ -1401,16 +1372,15 @@ define amdgpu_kernel void @s_test_srem_k_num_i64(ptr addrspace(1) %out, i64 %x)
; GCN-NEXT: v_mul_hi_u32 v0, s4, v0
; GCN-NEXT: s_mul_i32 s7, s5, s6
; GCN-NEXT: s_mul_i32 s6, s4, s6
+; GCN-NEXT: s_mov_b32 s2, -1
; GCN-NEXT: v_readfirstlane_b32 s8, v0
; GCN-NEXT: s_add_i32 s10, s8, s7
; GCN-NEXT: s_sub_i32 s8, 0, s10
; GCN-NEXT: s_sub_u32 s11, 24, s6
; GCN-NEXT: s_cselect_b64 s[6:7], -1, 0
-; GCN-NEXT: s_or_b32 s9, s6, s7
; GCN-NEXT: s_subb_u32 s12, s8, s5
; GCN-NEXT: s_sub_u32 s13, s11, s4
; GCN-NEXT: s_cselect_b64 s[8:9], -1, 0
-; GCN-NEXT: s_or_b32 s14, s8, s9
; GCN-NEXT: s_subb_u32 s14, s12, 0
; GCN-NEXT: s_cmp_ge_u32 s14, s5
; GCN-NEXT: s_cselect_b32 s15, -1, 0
@@ -1419,13 +1389,11 @@ define amdgpu_kernel void @s_test_srem_k_num_i64(ptr addrspace(1) %out, i64 %x)
; GCN-NEXT: s_cmp_eq_u32 s14, s5
; GCN-NEXT: s_cselect_b32 s15, s16, s15
; GCN-NEXT: s_or_b32 s8, s8, s9
-; GCN-NEXT: s_subb_u32 s12, s12, s5
-; GCN-NEXT: s_sub_u32 s16, s13, s4
-; GCN-NEXT: s_cselect_b64 s[8:9], -1, 0
-; GCN-NEXT: s_or_b32 s8, s8, s9
-; GCN-NEXT: s_subb_u32 s8, s12, 0
+; GCN-NEXT: s_subb_u32 s8, s12, s5
+; GCN-NEXT: s_sub_u32 s9, s13, s4
+; GCN-NEXT: s_subb_u32 s8, s8, 0
; GCN-NEXT: s_cmp_lg_u32 s15, 0
-; GCN-NEXT: s_cselect_b32 s9, s16, s13
+; GCN-NEXT: s_cselect_b32 s9, s9, s13
; GCN-NEXT: s_cselect_b32 s8, s8, s14
; GCN-NEXT: s_or_b32 s6, s6, s7
; GCN-NEXT: s_subb_u32 s6, 0, s10
@@ -1468,8 +1436,6 @@ define amdgpu_kernel void @s_test_srem_k_num_i64(ptr addrspace(1) %out, i64 %x)
; GCN-IR-NEXT: s_cbranch_vccz .LBB10_5
; GCN-IR-NEXT: ; %bb.1: ; %udiv-bb1
; GCN-IR-NEXT: s_add_u32 s8, s2, 1
-; GCN-IR-NEXT: s_cselect_b64 s[10:11], -1, 0
-; GCN-IR-NEXT: s_or_b32 s9, s10, s11
; GCN-IR-NEXT: s_addc_u32 s3, s3, 0
; GCN-IR-NEXT: s_cselect_b64 s[10:11], -1, 0
; GCN-IR-NEXT: s_sub_i32 s2, 63, s2
@@ -1500,8 +1466,6 @@ define amdgpu_kernel void @s_test_srem_k_num_i64(ptr addrspace(1) %out, i64 %x)
; GCN-IR-NEXT: s_sub_u32 s10, s10, s16
; GCN-IR-NEXT: s_subb_u32 s11, s11, s17
; GCN-IR-NEXT: s_add_u32 s14, s14, 1
-; GCN-IR-NEXT: s_cselect_b64 s[16:17], -1, 0
-; GCN-IR-NEXT: s_or_b32 s16, s16, s17
; GCN-IR-NEXT: s_addc_u32 s15, s15, 0
; GCN-IR-NEXT: s_cselect_b64 s[16:17], -1, 0
; GCN-IR-NEXT: s_mov_b64 s[8:9], s[6:7]
diff --git a/llvm/test/CodeGen/AMDGPU/uaddo.ll b/llvm/test/CodeGen/AMDGPU/uaddo.ll
index bdd22f25..b000fae 100644
--- a/llvm/test/CodeGen/AMDGPU/uaddo.ll
+++ b/llvm/test/CodeGen/AMDGPU/uaddo.ll
@@ -15,10 +15,8 @@ define amdgpu_kernel void @s_uaddo_i64_zext(ptr addrspace(1) %out, i64 %a, i64 %
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_mov_b32 s4, s0
; SI-NEXT: s_add_u32 s2, s2, s8
-; SI-NEXT: s_mov_b32 s5, s1
-; SI-NEXT: s_cselect_b64 s[0:1], -1, 0
-; SI-NEXT: s_or_b32 s0, s0, s1
; SI-NEXT: s_addc_u32 s3, s3, s9
+; SI-NEXT: s_mov_b32 s5, s1
; SI-NEXT: s_cselect_b64 s[0:1], -1, 0
; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
; SI-NEXT: v_mov_b32_e32 v1, s3
@@ -433,8 +431,6 @@ define amdgpu_kernel void @s_uaddo_i64(ptr addrspace(1) %out, ptr addrspace(1) %
; SI-NEXT: s_mov_b32 s10, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_add_u32 s4, s4, s6
-; SI-NEXT: s_cselect_b64 s[12:13], -1, 0
-; SI-NEXT: s_or_b32 s6, s12, s13
; SI-NEXT: s_addc_u32 s5, s5, s7
; SI-NEXT: s_mov_b32 s8, s0
; SI-NEXT: s_mov_b32 s9, s1
diff --git a/llvm/test/CodeGen/AMDGPU/udiv64.ll b/llvm/test/CodeGen/AMDGPU/udiv64.ll
index fd461ac..775483c 100644
--- a/llvm/test/CodeGen/AMDGPU/udiv64.ll
+++ b/llvm/test/CodeGen/AMDGPU/udiv64.ll
@@ -146,8 +146,6 @@ define amdgpu_kernel void @s_test_udiv_i64(ptr addrspace(1) %out, i64 %x, i64 %y
; GCN-IR-NEXT: s_cbranch_vccz .LBB0_5
; GCN-IR-NEXT: ; %bb.1: ; %udiv-bb1
; GCN-IR-NEXT: s_add_u32 s14, s12, 1
-; GCN-IR-NEXT: s_cselect_b64 s[8:9], -1, 0
-; GCN-IR-NEXT: s_or_b32 s8, s8, s9
; GCN-IR-NEXT: s_addc_u32 s8, s13, 0
; GCN-IR-NEXT: s_cselect_b64 s[8:9], -1, 0
; GCN-IR-NEXT: s_sub_i32 s12, 63, s12
@@ -179,8 +177,6 @@ define amdgpu_kernel void @s_test_udiv_i64(ptr addrspace(1) %out, i64 %x, i64 %y
; GCN-IR-NEXT: s_sub_u32 s12, s12, s16
; GCN-IR-NEXT: s_subb_u32 s13, s13, s17
; GCN-IR-NEXT: s_add_u32 s10, s10, 1
-; GCN-IR-NEXT: s_cselect_b64 s[16:17], -1, 0
-; GCN-IR-NEXT: s_or_b32 s16, s16, s17
; GCN-IR-NEXT: s_addc_u32 s11, s11, 0
; GCN-IR-NEXT: s_cselect_b64 s[16:17], -1, 0
; GCN-IR-NEXT: s_mov_b64 s[2:3], s[4:5]
@@ -786,12 +782,11 @@ define amdgpu_kernel void @s_test_udiv_k_num_i64(ptr addrspace(1) %out, i64 %x)
; GCN-LABEL: s_test_udiv_k_num_i64:
; GCN: ; %bb.0:
; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
-; GCN-NEXT: s_mov_b32 s7, 0xf000
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: v_cvt_f32_u32_e32 v0, s2
; GCN-NEXT: v_cvt_f32_u32_e32 v1, s3
-; GCN-NEXT: s_sub_u32 s6, 0, s2
-; GCN-NEXT: s_subb_u32 s8, 0, s3
+; GCN-NEXT: s_sub_u32 s4, 0, s2
+; GCN-NEXT: s_subb_u32 s5, 0, s3
; GCN-NEXT: v_madmk_f32 v0, v1, 0x4f800000, v0
; GCN-NEXT: v_rcp_f32_e32 v0, v0
; GCN-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0
@@ -800,118 +795,112 @@ define amdgpu_kernel void @s_test_udiv_k_num_i64(ptr addrspace(1) %out, i64 %x)
; GCN-NEXT: v_madmk_f32 v0, v1, 0xcf800000, v0
; GCN-NEXT: v_cvt_u32_f32_e32 v0, v0
; GCN-NEXT: v_cvt_u32_f32_e32 v1, v1
-; GCN-NEXT: v_mul_hi_u32 v2, s6, v0
+; GCN-NEXT: v_mul_hi_u32 v2, s4, v0
+; GCN-NEXT: v_readfirstlane_b32 s6, v1
+; GCN-NEXT: v_readfirstlane_b32 s7, v0
+; GCN-NEXT: s_mul_i32 s8, s4, s6
+; GCN-NEXT: v_readfirstlane_b32 s11, v2
+; GCN-NEXT: s_mul_i32 s9, s5, s7
+; GCN-NEXT: s_mul_i32 s10, s4, s7
+; GCN-NEXT: s_add_i32 s8, s11, s8
+; GCN-NEXT: v_mul_hi_u32 v3, v0, s10
+; GCN-NEXT: s_add_i32 s8, s8, s9
+; GCN-NEXT: v_mul_hi_u32 v0, v0, s8
+; GCN-NEXT: v_mul_hi_u32 v4, v1, s10
+; GCN-NEXT: v_readfirstlane_b32 s9, v3
+; GCN-NEXT: s_mul_i32 s12, s7, s8
+; GCN-NEXT: v_mul_hi_u32 v1, v1, s8
+; GCN-NEXT: s_add_u32 s9, s9, s12
+; GCN-NEXT: v_readfirstlane_b32 s12, v0
+; GCN-NEXT: s_mul_i32 s10, s6, s10
+; GCN-NEXT: s_addc_u32 s12, 0, s12
+; GCN-NEXT: v_readfirstlane_b32 s11, v4
+; GCN-NEXT: s_add_u32 s9, s9, s10
+; GCN-NEXT: v_readfirstlane_b32 s13, v1
+; GCN-NEXT: s_addc_u32 s9, s12, s11
+; GCN-NEXT: s_mul_i32 s8, s6, s8
+; GCN-NEXT: s_addc_u32 s10, s13, 0
+; GCN-NEXT: s_add_u32 s8, s9, s8
+; GCN-NEXT: s_addc_u32 s9, 0, s10
+; GCN-NEXT: s_add_u32 s8, s7, s8
+; GCN-NEXT: v_mov_b32_e32 v0, s8
+; GCN-NEXT: v_mul_hi_u32 v0, s4, v0
+; GCN-NEXT: s_addc_u32 s6, s6, s9
+; GCN-NEXT: s_mul_i32 s9, s4, s6
+; GCN-NEXT: s_mul_i32 s5, s5, s8
+; GCN-NEXT: v_readfirstlane_b32 s10, v0
+; GCN-NEXT: s_add_i32 s9, s10, s9
+; GCN-NEXT: s_mul_i32 s4, s4, s8
+; GCN-NEXT: s_add_i32 s5, s9, s5
+; GCN-NEXT: v_mov_b32_e32 v2, s4
+; GCN-NEXT: v_mov_b32_e32 v0, s5
+; GCN-NEXT: v_mul_hi_u32 v3, s6, v2
+; GCN-NEXT: v_mul_hi_u32 v2, s8, v2
+; GCN-NEXT: v_mul_hi_u32 v1, s6, v0
+; GCN-NEXT: v_mul_hi_u32 v0, s8, v0
+; GCN-NEXT: s_mul_i32 s10, s8, s5
+; GCN-NEXT: v_readfirstlane_b32 s12, v2
+; GCN-NEXT: s_add_u32 s10, s12, s10
+; GCN-NEXT: v_readfirstlane_b32 s11, v0
+; GCN-NEXT: s_mul_i32 s4, s6, s4
+; GCN-NEXT: s_addc_u32 s11, 0, s11
+; GCN-NEXT: v_readfirstlane_b32 s9, v3
+; GCN-NEXT: s_add_u32 s4, s10, s4
+; GCN-NEXT: s_addc_u32 s4, s11, s9
; GCN-NEXT: v_readfirstlane_b32 s9, v1
+; GCN-NEXT: s_addc_u32 s9, s9, 0
+; GCN-NEXT: s_mul_i32 s5, s6, s5
+; GCN-NEXT: s_add_u32 s4, s4, s5
+; GCN-NEXT: s_addc_u32 s5, 0, s9
+; GCN-NEXT: s_add_u32 s4, s8, s4
+; GCN-NEXT: s_addc_u32 s5, s6, s5
+; GCN-NEXT: v_mul_hi_u32 v1, s4, 24
+; GCN-NEXT: v_mul_hi_u32 v0, s5, 24
+; GCN-NEXT: s_mul_i32 s5, s5, 24
+; GCN-NEXT: s_mov_b32 s7, 0xf000
+; GCN-NEXT: v_readfirstlane_b32 s8, v1
; GCN-NEXT: v_readfirstlane_b32 s4, v0
-; GCN-NEXT: s_mul_i32 s5, s6, s9
-; GCN-NEXT: v_readfirstlane_b32 s12, v2
-; GCN-NEXT: s_mul_i32 s10, s8, s4
-; GCN-NEXT: s_mul_i32 s11, s6, s4
-; GCN-NEXT: s_add_i32 s5, s12, s5
-; GCN-NEXT: v_mul_hi_u32 v3, v0, s11
-; GCN-NEXT: s_add_i32 s5, s5, s10
-; GCN-NEXT: v_mul_hi_u32 v0, v0, s5
-; GCN-NEXT: v_mul_hi_u32 v4, v1, s11
-; GCN-NEXT: v_readfirstlane_b32 s10, v3
-; GCN-NEXT: v_mul_hi_u32 v1, v1, s5
-; GCN-NEXT: s_mul_i32 s13, s4, s5
-; GCN-NEXT: s_add_u32 s10, s10, s13
-; GCN-NEXT: v_readfirstlane_b32 s13, v0
-; GCN-NEXT: s_mul_i32 s11, s9, s11
-; GCN-NEXT: s_addc_u32 s13, 0, s13
-; GCN-NEXT: v_readfirstlane_b32 s12, v4
-; GCN-NEXT: s_add_u32 s10, s10, s11
-; GCN-NEXT: v_readfirstlane_b32 s14, v1
-; GCN-NEXT: s_addc_u32 s10, s13, s12
-; GCN-NEXT: s_addc_u32 s11, s14, 0
-; GCN-NEXT: s_mul_i32 s5, s9, s5
-; GCN-NEXT: s_add_u32 s5, s10, s5
-; GCN-NEXT: s_addc_u32 s10, 0, s11
-; GCN-NEXT: s_add_u32 s11, s4, s5
-; GCN-NEXT: v_mov_b32_e32 v0, s11
-; GCN-NEXT: v_mul_hi_u32 v0, s6, v0
-; GCN-NEXT: s_cselect_b64 s[4:5], -1, 0
-; GCN-NEXT: s_or_b32 s4, s4, s5
-; GCN-NEXT: s_addc_u32 s9, s9, s10
-; GCN-NEXT: s_mul_i32 s4, s6, s9
-; GCN-NEXT: v_readfirstlane_b32 s5, v0
-; GCN-NEXT: s_add_i32 s4, s5, s4
-; GCN-NEXT: s_mul_i32 s8, s8, s11
-; GCN-NEXT: s_mul_i32 s5, s6, s11
-; GCN-NEXT: s_add_i32 s4, s4, s8
-; GCN-NEXT: v_mov_b32_e32 v2, s5
-; GCN-NEXT: v_mov_b32_e32 v0, s4
-; GCN-NEXT: v_mul_hi_u32 v3, s9, v2
-; GCN-NEXT: v_mul_hi_u32 v2, s11, v2
-; GCN-NEXT: v_mul_hi_u32 v1, s9, v0
-; GCN-NEXT: v_mul_hi_u32 v0, s11, v0
-; GCN-NEXT: s_mul_i32 s8, s11, s4
-; GCN-NEXT: v_readfirstlane_b32 s12, v2
-; GCN-NEXT: s_add_u32 s8, s12, s8
-; GCN-NEXT: v_readfirstlane_b32 s10, v0
-; GCN-NEXT: s_mul_i32 s5, s9, s5
-; GCN-NEXT: s_addc_u32 s10, 0, s10
-; GCN-NEXT: v_readfirstlane_b32 s6, v3
; GCN-NEXT: s_add_u32 s5, s8, s5
-; GCN-NEXT: s_addc_u32 s5, s10, s6
-; GCN-NEXT: v_readfirstlane_b32 s6, v1
-; GCN-NEXT: s_addc_u32 s6, s6, 0
-; GCN-NEXT: s_mul_i32 s4, s9, s4
-; GCN-NEXT: s_add_u32 s4, s5, s4
-; GCN-NEXT: s_addc_u32 s6, 0, s6
-; GCN-NEXT: s_add_u32 s8, s11, s4
-; GCN-NEXT: s_cselect_b64 s[4:5], -1, 0
-; GCN-NEXT: s_or_b32 s4, s4, s5
-; GCN-NEXT: s_addc_u32 s4, s9, s6
-; GCN-NEXT: v_mul_hi_u32 v1, s8, 24
-; GCN-NEXT: v_mul_hi_u32 v0, s4, 24
-; GCN-NEXT: s_mul_i32 s4, s4, 24
-; GCN-NEXT: s_mov_b32 s6, -1
-; GCN-NEXT: v_readfirstlane_b32 s8, v1
-; GCN-NEXT: v_readfirstlane_b32 s5, v0
-; GCN-NEXT: s_add_u32 s4, s8, s4
-; GCN-NEXT: s_addc_u32 s10, 0, s5
-; GCN-NEXT: v_mov_b32_e32 v0, s10
+; GCN-NEXT: s_addc_u32 s8, 0, s4
+; GCN-NEXT: v_mov_b32_e32 v0, s8
; GCN-NEXT: v_mul_hi_u32 v0, s2, v0
; GCN-NEXT: s_mov_b32 s4, s0
; GCN-NEXT: s_mov_b32 s5, s1
-; GCN-NEXT: s_mul_i32 s0, s3, s10
+; GCN-NEXT: s_mul_i32 s0, s3, s8
; GCN-NEXT: v_readfirstlane_b32 s1, v0
-; GCN-NEXT: s_add_i32 s11, s1, s0
-; GCN-NEXT: s_sub_i32 s8, 0, s11
-; GCN-NEXT: s_mul_i32 s0, s2, s10
-; GCN-NEXT: s_sub_u32 s12, 24, s0
+; GCN-NEXT: s_add_i32 s9, s1, s0
+; GCN-NEXT: s_sub_i32 s10, 0, s9
+; GCN-NEXT: s_mul_i32 s0, s2, s8
+; GCN-NEXT: s_sub_u32 s11, 24, s0
; GCN-NEXT: s_cselect_b64 s[0:1], -1, 0
-; GCN-NEXT: s_or_b32 s9, s0, s1
-; GCN-NEXT: s_subb_u32 s13, s8, s3
-; GCN-NEXT: s_sub_u32 s14, s12, s2
-; GCN-NEXT: s_cselect_b64 s[8:9], -1, 0
-; GCN-NEXT: s_or_b32 s8, s8, s9
-; GCN-NEXT: s_subb_u32 s8, s13, 0
-; GCN-NEXT: s_cmp_ge_u32 s8, s3
-; GCN-NEXT: s_cselect_b32 s9, -1, 0
-; GCN-NEXT: s_cmp_ge_u32 s14, s2
+; GCN-NEXT: s_subb_u32 s10, s10, s3
+; GCN-NEXT: s_sub_u32 s12, s11, s2
+; GCN-NEXT: s_subb_u32 s10, s10, 0
+; GCN-NEXT: s_cmp_ge_u32 s10, s3
; GCN-NEXT: s_cselect_b32 s13, -1, 0
-; GCN-NEXT: s_cmp_eq_u32 s8, s3
-; GCN-NEXT: s_cselect_b32 s8, s13, s9
-; GCN-NEXT: s_add_u32 s9, s10, 1
+; GCN-NEXT: s_cmp_ge_u32 s12, s2
+; GCN-NEXT: s_cselect_b32 s12, -1, 0
+; GCN-NEXT: s_cmp_eq_u32 s10, s3
+; GCN-NEXT: s_cselect_b32 s10, s12, s13
+; GCN-NEXT: s_add_u32 s12, s8, 1
; GCN-NEXT: s_addc_u32 s13, 0, 0
-; GCN-NEXT: s_add_u32 s14, s10, 2
+; GCN-NEXT: s_add_u32 s14, s8, 2
; GCN-NEXT: s_addc_u32 s15, 0, 0
-; GCN-NEXT: s_cmp_lg_u32 s8, 0
-; GCN-NEXT: s_cselect_b32 s8, s14, s9
-; GCN-NEXT: s_cselect_b32 s9, s15, s13
+; GCN-NEXT: s_cmp_lg_u32 s10, 0
+; GCN-NEXT: s_cselect_b32 s10, s14, s12
+; GCN-NEXT: s_cselect_b32 s12, s15, s13
; GCN-NEXT: s_or_b32 s0, s0, s1
-; GCN-NEXT: s_subb_u32 s0, 0, s11
+; GCN-NEXT: s_subb_u32 s0, 0, s9
; GCN-NEXT: s_cmp_ge_u32 s0, s3
; GCN-NEXT: s_cselect_b32 s1, -1, 0
-; GCN-NEXT: s_cmp_ge_u32 s12, s2
+; GCN-NEXT: s_cmp_ge_u32 s11, s2
; GCN-NEXT: s_cselect_b32 s2, -1, 0
; GCN-NEXT: s_cmp_eq_u32 s0, s3
; GCN-NEXT: s_cselect_b32 s0, s2, s1
; GCN-NEXT: s_cmp_lg_u32 s0, 0
-; GCN-NEXT: s_cselect_b32 s0, s9, 0
-; GCN-NEXT: s_cselect_b32 s1, s8, s10
+; GCN-NEXT: s_cselect_b32 s0, s12, 0
+; GCN-NEXT: s_cselect_b32 s1, s10, s8
+; GCN-NEXT: s_mov_b32 s6, -1
; GCN-NEXT: v_mov_b32_e32 v0, s1
; GCN-NEXT: v_mov_b32_e32 v1, s0
; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
@@ -937,8 +926,6 @@ define amdgpu_kernel void @s_test_udiv_k_num_i64(ptr addrspace(1) %out, i64 %x)
; GCN-IR-NEXT: s_cbranch_vccz .LBB8_5
; GCN-IR-NEXT: ; %bb.1: ; %udiv-bb1
; GCN-IR-NEXT: s_add_u32 s10, s8, 1
-; GCN-IR-NEXT: s_cselect_b64 s[6:7], -1, 0
-; GCN-IR-NEXT: s_or_b32 s6, s6, s7
; GCN-IR-NEXT: s_addc_u32 s6, s9, 0
; GCN-IR-NEXT: s_cselect_b64 s[6:7], -1, 0
; GCN-IR-NEXT: s_sub_i32 s8, 63, s8
@@ -969,8 +956,6 @@ define amdgpu_kernel void @s_test_udiv_k_num_i64(ptr addrspace(1) %out, i64 %x)
; GCN-IR-NEXT: s_sub_u32 s10, s10, s16
; GCN-IR-NEXT: s_subb_u32 s11, s11, s17
; GCN-IR-NEXT: s_add_u32 s14, s14, 1
-; GCN-IR-NEXT: s_cselect_b64 s[16:17], -1, 0
-; GCN-IR-NEXT: s_or_b32 s16, s16, s17
; GCN-IR-NEXT: s_addc_u32 s15, s15, 0
; GCN-IR-NEXT: s_cselect_b64 s[16:17], -1, 0
; GCN-IR-NEXT: s_mov_b64 s[8:9], s[4:5]
@@ -1307,8 +1292,6 @@ define amdgpu_kernel void @s_test_udiv_k_den_i64(ptr addrspace(1) %out, i64 %x)
; GCN-IR-NEXT: s_cbranch_vccz .LBB11_5
; GCN-IR-NEXT: ; %bb.1: ; %udiv-bb1
; GCN-IR-NEXT: s_add_u32 s11, s8, 1
-; GCN-IR-NEXT: s_cselect_b64 s[6:7], -1, 0
-; GCN-IR-NEXT: s_or_b32 s6, s6, s7
; GCN-IR-NEXT: s_addc_u32 s6, s9, 0
; GCN-IR-NEXT: s_cselect_b64 s[6:7], -1, 0
; GCN-IR-NEXT: s_sub_i32 s8, 63, s8
@@ -1336,8 +1319,6 @@ define amdgpu_kernel void @s_test_udiv_k_den_i64(ptr addrspace(1) %out, i64 %x)
; GCN-IR-NEXT: s_sub_u32 s2, s2, s8
; GCN-IR-NEXT: s_subb_u32 s3, s3, 0
; GCN-IR-NEXT: s_add_u32 s10, s10, 1
-; GCN-IR-NEXT: s_cselect_b64 s[12:13], -1, 0
-; GCN-IR-NEXT: s_or_b32 s12, s12, s13
; GCN-IR-NEXT: s_addc_u32 s11, s11, 0
; GCN-IR-NEXT: s_cselect_b64 s[12:13], -1, 0
; GCN-IR-NEXT: s_mov_b64 s[8:9], s[4:5]
diff --git a/llvm/test/CodeGen/AMDGPU/urem64.ll b/llvm/test/CodeGen/AMDGPU/urem64.ll
index 137dc1f..28e6627 100644
--- a/llvm/test/CodeGen/AMDGPU/urem64.ll
+++ b/llvm/test/CodeGen/AMDGPU/urem64.ll
@@ -8,12 +8,11 @@ define amdgpu_kernel void @s_test_urem_i64(ptr addrspace(1) %out, i64 %x, i64 %y
; GCN-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0xd
; GCN-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x9
; GCN-NEXT: s_mov_b32 s3, 0xf000
-; GCN-NEXT: s_mov_b32 s2, -1
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: v_cvt_f32_u32_e32 v0, s8
; GCN-NEXT: v_cvt_f32_u32_e32 v1, s9
-; GCN-NEXT: s_sub_u32 s10, 0, s8
-; GCN-NEXT: s_subb_u32 s11, 0, s9
+; GCN-NEXT: s_sub_u32 s0, 0, s8
+; GCN-NEXT: s_subb_u32 s1, 0, s9
; GCN-NEXT: v_madmk_f32 v0, v1, 0x4f800000, v0
; GCN-NEXT: v_rcp_f32_e32 v0, v0
; GCN-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0
@@ -22,69 +21,65 @@ define amdgpu_kernel void @s_test_urem_i64(ptr addrspace(1) %out, i64 %x, i64 %y
; GCN-NEXT: v_madmk_f32 v0, v1, 0xcf800000, v0
; GCN-NEXT: v_cvt_u32_f32_e32 v0, v0
; GCN-NEXT: v_cvt_u32_f32_e32 v1, v1
-; GCN-NEXT: v_mul_hi_u32 v2, s10, v0
-; GCN-NEXT: v_readfirstlane_b32 s12, v1
-; GCN-NEXT: v_readfirstlane_b32 s0, v0
-; GCN-NEXT: s_mul_i32 s1, s10, s12
-; GCN-NEXT: v_readfirstlane_b32 s15, v2
-; GCN-NEXT: s_mul_i32 s13, s11, s0
-; GCN-NEXT: s_mul_i32 s14, s10, s0
-; GCN-NEXT: s_add_i32 s1, s15, s1
-; GCN-NEXT: v_mul_hi_u32 v3, v0, s14
-; GCN-NEXT: s_add_i32 s1, s1, s13
-; GCN-NEXT: v_mul_hi_u32 v0, v0, s1
-; GCN-NEXT: v_mul_hi_u32 v4, v1, s14
-; GCN-NEXT: v_readfirstlane_b32 s13, v3
-; GCN-NEXT: s_mul_i32 s15, s0, s1
-; GCN-NEXT: v_mul_hi_u32 v1, v1, s1
-; GCN-NEXT: s_add_u32 s13, s13, s15
+; GCN-NEXT: v_mul_hi_u32 v2, s0, v0
+; GCN-NEXT: v_readfirstlane_b32 s10, v1
+; GCN-NEXT: v_readfirstlane_b32 s2, v0
+; GCN-NEXT: s_mul_i32 s11, s0, s10
+; GCN-NEXT: v_readfirstlane_b32 s14, v2
+; GCN-NEXT: s_mul_i32 s12, s1, s2
+; GCN-NEXT: s_mul_i32 s13, s0, s2
+; GCN-NEXT: s_add_i32 s11, s14, s11
+; GCN-NEXT: v_mul_hi_u32 v3, v0, s13
+; GCN-NEXT: s_add_i32 s11, s11, s12
+; GCN-NEXT: v_mul_hi_u32 v0, v0, s11
+; GCN-NEXT: v_mul_hi_u32 v4, v1, s13
+; GCN-NEXT: v_readfirstlane_b32 s12, v3
+; GCN-NEXT: s_mul_i32 s15, s2, s11
+; GCN-NEXT: v_mul_hi_u32 v1, v1, s11
+; GCN-NEXT: s_add_u32 s12, s12, s15
; GCN-NEXT: v_readfirstlane_b32 s15, v0
-; GCN-NEXT: s_mul_i32 s14, s12, s14
+; GCN-NEXT: s_mul_i32 s13, s10, s13
; GCN-NEXT: s_addc_u32 s15, 0, s15
-; GCN-NEXT: v_readfirstlane_b32 s16, v4
-; GCN-NEXT: s_add_u32 s13, s13, s14
-; GCN-NEXT: s_addc_u32 s13, s15, s16
-; GCN-NEXT: v_readfirstlane_b32 s14, v1
-; GCN-NEXT: s_addc_u32 s14, s14, 0
-; GCN-NEXT: s_mul_i32 s1, s12, s1
-; GCN-NEXT: s_add_u32 s1, s13, s1
-; GCN-NEXT: s_addc_u32 s13, 0, s14
-; GCN-NEXT: s_add_u32 s14, s0, s1
-; GCN-NEXT: v_mov_b32_e32 v0, s14
-; GCN-NEXT: v_mul_hi_u32 v0, s10, v0
-; GCN-NEXT: s_cselect_b64 s[0:1], -1, 0
-; GCN-NEXT: s_or_b32 s0, s0, s1
-; GCN-NEXT: s_addc_u32 s12, s12, s13
-; GCN-NEXT: s_mul_i32 s0, s10, s12
-; GCN-NEXT: v_readfirstlane_b32 s1, v0
-; GCN-NEXT: s_add_i32 s0, s1, s0
-; GCN-NEXT: s_mul_i32 s11, s11, s14
-; GCN-NEXT: s_mul_i32 s1, s10, s14
-; GCN-NEXT: s_add_i32 s0, s0, s11
-; GCN-NEXT: v_mov_b32_e32 v2, s1
-; GCN-NEXT: v_mov_b32_e32 v0, s0
-; GCN-NEXT: v_mul_hi_u32 v3, s12, v2
-; GCN-NEXT: v_mul_hi_u32 v2, s14, v2
-; GCN-NEXT: v_mul_hi_u32 v1, s12, v0
-; GCN-NEXT: v_mul_hi_u32 v0, s14, v0
-; GCN-NEXT: s_mul_i32 s11, s14, s0
-; GCN-NEXT: v_readfirstlane_b32 s15, v2
-; GCN-NEXT: s_add_u32 s11, s15, s11
+; GCN-NEXT: v_readfirstlane_b32 s14, v4
+; GCN-NEXT: s_add_u32 s12, s12, s13
+; GCN-NEXT: s_addc_u32 s12, s15, s14
+; GCN-NEXT: v_readfirstlane_b32 s13, v1
+; GCN-NEXT: s_addc_u32 s13, s13, 0
+; GCN-NEXT: s_mul_i32 s11, s10, s11
+; GCN-NEXT: s_add_u32 s11, s12, s11
+; GCN-NEXT: s_addc_u32 s12, 0, s13
+; GCN-NEXT: s_add_u32 s11, s2, s11
+; GCN-NEXT: v_mov_b32_e32 v0, s11
+; GCN-NEXT: v_mul_hi_u32 v0, s0, v0
+; GCN-NEXT: s_addc_u32 s10, s10, s12
+; GCN-NEXT: s_mul_i32 s12, s0, s10
+; GCN-NEXT: s_mul_i32 s1, s1, s11
; GCN-NEXT: v_readfirstlane_b32 s13, v0
-; GCN-NEXT: s_mul_i32 s1, s12, s1
-; GCN-NEXT: s_addc_u32 s13, 0, s13
-; GCN-NEXT: v_readfirstlane_b32 s10, v3
-; GCN-NEXT: s_add_u32 s1, s11, s1
-; GCN-NEXT: s_addc_u32 s1, s13, s10
-; GCN-NEXT: v_readfirstlane_b32 s10, v1
-; GCN-NEXT: s_addc_u32 s10, s10, 0
-; GCN-NEXT: s_mul_i32 s0, s12, s0
-; GCN-NEXT: s_add_u32 s0, s1, s0
-; GCN-NEXT: s_addc_u32 s10, 0, s10
-; GCN-NEXT: s_add_u32 s11, s14, s0
-; GCN-NEXT: s_cselect_b64 s[0:1], -1, 0
-; GCN-NEXT: s_or_b32 s0, s0, s1
-; GCN-NEXT: s_addc_u32 s1, s12, s10
+; GCN-NEXT: s_add_i32 s12, s13, s12
+; GCN-NEXT: s_mul_i32 s0, s0, s11
+; GCN-NEXT: s_add_i32 s1, s12, s1
+; GCN-NEXT: v_mov_b32_e32 v2, s0
+; GCN-NEXT: v_mov_b32_e32 v0, s1
+; GCN-NEXT: v_mul_hi_u32 v3, s10, v2
+; GCN-NEXT: v_mul_hi_u32 v2, s11, v2
+; GCN-NEXT: v_mul_hi_u32 v1, s10, v0
+; GCN-NEXT: v_mul_hi_u32 v0, s11, v0
+; GCN-NEXT: s_mul_i32 s13, s11, s1
+; GCN-NEXT: v_readfirstlane_b32 s15, v2
+; GCN-NEXT: s_add_u32 s13, s15, s13
+; GCN-NEXT: v_readfirstlane_b32 s14, v0
+; GCN-NEXT: s_mul_i32 s0, s10, s0
+; GCN-NEXT: s_addc_u32 s14, 0, s14
+; GCN-NEXT: v_readfirstlane_b32 s12, v3
+; GCN-NEXT: s_add_u32 s0, s13, s0
+; GCN-NEXT: s_addc_u32 s0, s14, s12
+; GCN-NEXT: v_readfirstlane_b32 s12, v1
+; GCN-NEXT: s_addc_u32 s12, s12, 0
+; GCN-NEXT: s_mul_i32 s1, s10, s1
+; GCN-NEXT: s_add_u32 s0, s0, s1
+; GCN-NEXT: s_addc_u32 s1, 0, s12
+; GCN-NEXT: s_add_u32 s11, s11, s0
+; GCN-NEXT: s_addc_u32 s1, s10, s1
; GCN-NEXT: v_mov_b32_e32 v0, s1
; GCN-NEXT: v_mul_hi_u32 v1, s6, v0
; GCN-NEXT: v_mov_b32_e32 v2, s11
@@ -118,11 +113,9 @@ define amdgpu_kernel void @s_test_urem_i64(ptr addrspace(1) %out, i64 %x, i64 %y
; GCN-NEXT: s_mul_i32 s4, s8, s4
; GCN-NEXT: s_sub_u32 s6, s6, s4
; GCN-NEXT: s_cselect_b64 s[4:5], -1, 0
-; GCN-NEXT: s_or_b32 s11, s4, s5
; GCN-NEXT: s_subb_u32 s13, s10, s9
; GCN-NEXT: s_sub_u32 s14, s6, s8
; GCN-NEXT: s_cselect_b64 s[10:11], -1, 0
-; GCN-NEXT: s_or_b32 s15, s10, s11
; GCN-NEXT: s_subb_u32 s15, s13, 0
; GCN-NEXT: s_cmp_ge_u32 s15, s9
; GCN-NEXT: s_cselect_b32 s16, -1, 0
@@ -131,13 +124,11 @@ define amdgpu_kernel void @s_test_urem_i64(ptr addrspace(1) %out, i64 %x, i64 %y
; GCN-NEXT: s_cmp_eq_u32 s15, s9
; GCN-NEXT: s_cselect_b32 s16, s17, s16
; GCN-NEXT: s_or_b32 s10, s10, s11
-; GCN-NEXT: s_subb_u32 s13, s13, s9
-; GCN-NEXT: s_sub_u32 s17, s14, s8
-; GCN-NEXT: s_cselect_b64 s[10:11], -1, 0
-; GCN-NEXT: s_or_b32 s10, s10, s11
-; GCN-NEXT: s_subb_u32 s10, s13, 0
+; GCN-NEXT: s_subb_u32 s10, s13, s9
+; GCN-NEXT: s_sub_u32 s11, s14, s8
+; GCN-NEXT: s_subb_u32 s10, s10, 0
; GCN-NEXT: s_cmp_lg_u32 s16, 0
-; GCN-NEXT: s_cselect_b32 s11, s17, s14
+; GCN-NEXT: s_cselect_b32 s11, s11, s14
; GCN-NEXT: s_cselect_b32 s10, s10, s15
; GCN-NEXT: s_or_b32 s4, s4, s5
; GCN-NEXT: s_subb_u32 s4, s7, s12
@@ -150,6 +141,7 @@ define amdgpu_kernel void @s_test_urem_i64(ptr addrspace(1) %out, i64 %x, i64 %y
; GCN-NEXT: s_cmp_lg_u32 s5, 0
; GCN-NEXT: s_cselect_b32 s4, s10, s4
; GCN-NEXT: s_cselect_b32 s5, s11, s6
+; GCN-NEXT: s_mov_b32 s2, -1
; GCN-NEXT: v_mov_b32_e32 v0, s5
; GCN-NEXT: v_mov_b32_e32 v1, s4
; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
@@ -180,8 +172,6 @@ define amdgpu_kernel void @s_test_urem_i64(ptr addrspace(1) %out, i64 %x, i64 %y
; GCN-IR-NEXT: s_cbranch_vccz .LBB0_5
; GCN-IR-NEXT: ; %bb.1: ; %udiv-bb1
; GCN-IR-NEXT: s_add_u32 s14, s12, 1
-; GCN-IR-NEXT: s_cselect_b64 s[8:9], -1, 0
-; GCN-IR-NEXT: s_or_b32 s8, s8, s9
; GCN-IR-NEXT: s_addc_u32 s8, s13, 0
; GCN-IR-NEXT: s_cselect_b64 s[8:9], -1, 0
; GCN-IR-NEXT: s_sub_i32 s12, 63, s12
@@ -213,8 +203,6 @@ define amdgpu_kernel void @s_test_urem_i64(ptr addrspace(1) %out, i64 %x, i64 %y
; GCN-IR-NEXT: s_sub_u32 s12, s12, s18
; GCN-IR-NEXT: s_subb_u32 s13, s13, s19
; GCN-IR-NEXT: s_add_u32 s16, s16, 1
-; GCN-IR-NEXT: s_cselect_b64 s[18:19], -1, 0
-; GCN-IR-NEXT: s_or_b32 s18, s18, s19
; GCN-IR-NEXT: s_addc_u32 s17, s17, 0
; GCN-IR-NEXT: s_cselect_b64 s[18:19], -1, 0
; GCN-IR-NEXT: s_mov_b64 s[10:11], s[4:5]
@@ -803,12 +791,11 @@ define amdgpu_kernel void @s_test_urem_k_num_i64(ptr addrspace(1) %out, i64 %x)
; GCN-LABEL: s_test_urem_k_num_i64:
; GCN: ; %bb.0:
; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9
-; GCN-NEXT: s_mov_b32 s7, 0xf000
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: v_cvt_f32_u32_e32 v0, s2
; GCN-NEXT: v_cvt_f32_u32_e32 v1, s3
-; GCN-NEXT: s_sub_u32 s6, 0, s2
-; GCN-NEXT: s_subb_u32 s8, 0, s3
+; GCN-NEXT: s_sub_u32 s4, 0, s2
+; GCN-NEXT: s_subb_u32 s5, 0, s3
; GCN-NEXT: v_madmk_f32 v0, v1, 0x4f800000, v0
; GCN-NEXT: v_rcp_f32_e32 v0, v0
; GCN-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0
@@ -817,77 +804,73 @@ define amdgpu_kernel void @s_test_urem_k_num_i64(ptr addrspace(1) %out, i64 %x)
; GCN-NEXT: v_madmk_f32 v0, v1, 0xcf800000, v0
; GCN-NEXT: v_cvt_u32_f32_e32 v0, v0
; GCN-NEXT: v_cvt_u32_f32_e32 v1, v1
-; GCN-NEXT: v_mul_hi_u32 v2, s6, v0
+; GCN-NEXT: v_mul_hi_u32 v2, s4, v0
+; GCN-NEXT: v_readfirstlane_b32 s6, v1
+; GCN-NEXT: v_readfirstlane_b32 s7, v0
+; GCN-NEXT: s_mul_i32 s8, s4, s6
+; GCN-NEXT: v_readfirstlane_b32 s11, v2
+; GCN-NEXT: s_mul_i32 s9, s5, s7
+; GCN-NEXT: s_mul_i32 s10, s4, s7
+; GCN-NEXT: s_add_i32 s8, s11, s8
+; GCN-NEXT: v_mul_hi_u32 v3, v0, s10
+; GCN-NEXT: s_add_i32 s8, s8, s9
+; GCN-NEXT: v_mul_hi_u32 v0, v0, s8
+; GCN-NEXT: v_mul_hi_u32 v4, v1, s10
+; GCN-NEXT: v_readfirstlane_b32 s9, v3
+; GCN-NEXT: s_mul_i32 s12, s7, s8
+; GCN-NEXT: v_mul_hi_u32 v1, v1, s8
+; GCN-NEXT: s_add_u32 s9, s9, s12
+; GCN-NEXT: v_readfirstlane_b32 s12, v0
+; GCN-NEXT: s_mul_i32 s10, s6, s10
+; GCN-NEXT: s_addc_u32 s12, 0, s12
+; GCN-NEXT: v_readfirstlane_b32 s11, v4
+; GCN-NEXT: s_add_u32 s9, s9, s10
+; GCN-NEXT: v_readfirstlane_b32 s13, v1
+; GCN-NEXT: s_addc_u32 s9, s12, s11
+; GCN-NEXT: s_mul_i32 s8, s6, s8
+; GCN-NEXT: s_addc_u32 s10, s13, 0
+; GCN-NEXT: s_add_u32 s8, s9, s8
+; GCN-NEXT: s_addc_u32 s9, 0, s10
+; GCN-NEXT: s_add_u32 s8, s7, s8
+; GCN-NEXT: v_mov_b32_e32 v0, s8
+; GCN-NEXT: v_mul_hi_u32 v0, s4, v0
+; GCN-NEXT: s_addc_u32 s6, s6, s9
+; GCN-NEXT: s_mul_i32 s9, s4, s6
+; GCN-NEXT: s_mul_i32 s5, s5, s8
+; GCN-NEXT: v_readfirstlane_b32 s10, v0
+; GCN-NEXT: s_add_i32 s9, s10, s9
+; GCN-NEXT: s_mul_i32 s4, s4, s8
+; GCN-NEXT: s_add_i32 s5, s9, s5
+; GCN-NEXT: v_mov_b32_e32 v2, s4
+; GCN-NEXT: v_mov_b32_e32 v0, s5
+; GCN-NEXT: v_mul_hi_u32 v3, s6, v2
+; GCN-NEXT: v_mul_hi_u32 v2, s8, v2
+; GCN-NEXT: v_mul_hi_u32 v1, s6, v0
+; GCN-NEXT: v_mul_hi_u32 v0, s8, v0
+; GCN-NEXT: s_mul_i32 s10, s8, s5
+; GCN-NEXT: v_readfirstlane_b32 s12, v2
+; GCN-NEXT: s_add_u32 s10, s12, s10
+; GCN-NEXT: v_readfirstlane_b32 s11, v0
+; GCN-NEXT: s_mul_i32 s4, s6, s4
+; GCN-NEXT: s_addc_u32 s11, 0, s11
+; GCN-NEXT: v_readfirstlane_b32 s9, v3
+; GCN-NEXT: s_add_u32 s4, s10, s4
+; GCN-NEXT: s_addc_u32 s4, s11, s9
; GCN-NEXT: v_readfirstlane_b32 s9, v1
+; GCN-NEXT: s_addc_u32 s9, s9, 0
+; GCN-NEXT: s_mul_i32 s5, s6, s5
+; GCN-NEXT: s_add_u32 s4, s4, s5
+; GCN-NEXT: s_addc_u32 s5, 0, s9
+; GCN-NEXT: s_add_u32 s4, s8, s4
+; GCN-NEXT: s_addc_u32 s5, s6, s5
+; GCN-NEXT: v_mul_hi_u32 v1, s4, 24
+; GCN-NEXT: v_mul_hi_u32 v0, s5, 24
+; GCN-NEXT: s_mul_i32 s5, s5, 24
+; GCN-NEXT: s_mov_b32 s7, 0xf000
+; GCN-NEXT: v_readfirstlane_b32 s8, v1
; GCN-NEXT: v_readfirstlane_b32 s4, v0
-; GCN-NEXT: s_mul_i32 s5, s6, s9
-; GCN-NEXT: v_readfirstlane_b32 s12, v2
-; GCN-NEXT: s_mul_i32 s10, s8, s4
-; GCN-NEXT: s_mul_i32 s11, s6, s4
-; GCN-NEXT: s_add_i32 s5, s12, s5
-; GCN-NEXT: v_mul_hi_u32 v3, v0, s11
-; GCN-NEXT: s_add_i32 s5, s5, s10
-; GCN-NEXT: v_mul_hi_u32 v0, v0, s5
-; GCN-NEXT: v_mul_hi_u32 v4, v1, s11
-; GCN-NEXT: v_readfirstlane_b32 s10, v3
-; GCN-NEXT: v_mul_hi_u32 v1, v1, s5
-; GCN-NEXT: s_mul_i32 s13, s4, s5
-; GCN-NEXT: s_add_u32 s10, s10, s13
-; GCN-NEXT: v_readfirstlane_b32 s13, v0
-; GCN-NEXT: s_mul_i32 s11, s9, s11
-; GCN-NEXT: s_addc_u32 s13, 0, s13
-; GCN-NEXT: v_readfirstlane_b32 s12, v4
-; GCN-NEXT: s_add_u32 s10, s10, s11
-; GCN-NEXT: v_readfirstlane_b32 s14, v1
-; GCN-NEXT: s_addc_u32 s10, s13, s12
-; GCN-NEXT: s_addc_u32 s11, s14, 0
-; GCN-NEXT: s_mul_i32 s5, s9, s5
-; GCN-NEXT: s_add_u32 s5, s10, s5
-; GCN-NEXT: s_addc_u32 s10, 0, s11
-; GCN-NEXT: s_add_u32 s11, s4, s5
-; GCN-NEXT: v_mov_b32_e32 v0, s11
-; GCN-NEXT: v_mul_hi_u32 v0, s6, v0
-; GCN-NEXT: s_cselect_b64 s[4:5], -1, 0
-; GCN-NEXT: s_or_b32 s4, s4, s5
-; GCN-NEXT: s_addc_u32 s9, s9, s10
-; GCN-NEXT: s_mul_i32 s4, s6, s9
-; GCN-NEXT: v_readfirstlane_b32 s5, v0
-; GCN-NEXT: s_add_i32 s4, s5, s4
-; GCN-NEXT: s_mul_i32 s8, s8, s11
-; GCN-NEXT: s_mul_i32 s5, s6, s11
-; GCN-NEXT: s_add_i32 s4, s4, s8
-; GCN-NEXT: v_mov_b32_e32 v2, s5
-; GCN-NEXT: v_mov_b32_e32 v0, s4
-; GCN-NEXT: v_mul_hi_u32 v3, s9, v2
-; GCN-NEXT: v_mul_hi_u32 v2, s11, v2
-; GCN-NEXT: v_mul_hi_u32 v1, s9, v0
-; GCN-NEXT: v_mul_hi_u32 v0, s11, v0
-; GCN-NEXT: s_mul_i32 s8, s11, s4
-; GCN-NEXT: v_readfirstlane_b32 s12, v2
-; GCN-NEXT: s_add_u32 s8, s12, s8
-; GCN-NEXT: v_readfirstlane_b32 s10, v0
-; GCN-NEXT: s_mul_i32 s5, s9, s5
-; GCN-NEXT: s_addc_u32 s10, 0, s10
-; GCN-NEXT: v_readfirstlane_b32 s6, v3
; GCN-NEXT: s_add_u32 s5, s8, s5
-; GCN-NEXT: s_addc_u32 s5, s10, s6
-; GCN-NEXT: v_readfirstlane_b32 s6, v1
-; GCN-NEXT: s_addc_u32 s6, s6, 0
-; GCN-NEXT: s_mul_i32 s4, s9, s4
-; GCN-NEXT: s_add_u32 s4, s5, s4
-; GCN-NEXT: s_addc_u32 s6, 0, s6
-; GCN-NEXT: s_add_u32 s8, s11, s4
-; GCN-NEXT: s_cselect_b64 s[4:5], -1, 0
-; GCN-NEXT: s_or_b32 s4, s4, s5
-; GCN-NEXT: s_addc_u32 s4, s9, s6
-; GCN-NEXT: v_mul_hi_u32 v1, s8, 24
-; GCN-NEXT: v_mul_hi_u32 v0, s4, 24
-; GCN-NEXT: s_mul_i32 s4, s4, 24
-; GCN-NEXT: s_mov_b32 s6, -1
-; GCN-NEXT: v_readfirstlane_b32 s8, v1
-; GCN-NEXT: v_readfirstlane_b32 s5, v0
-; GCN-NEXT: s_add_u32 s4, s8, s4
-; GCN-NEXT: s_addc_u32 s8, 0, s5
+; GCN-NEXT: s_addc_u32 s8, 0, s4
; GCN-NEXT: v_mov_b32_e32 v0, s8
; GCN-NEXT: v_mul_hi_u32 v0, s2, v0
; GCN-NEXT: s_mov_b32 s4, s0
@@ -899,11 +882,9 @@ define amdgpu_kernel void @s_test_urem_k_num_i64(ptr addrspace(1) %out, i64 %x)
; GCN-NEXT: s_mul_i32 s0, s2, s8
; GCN-NEXT: s_sub_u32 s11, 24, s0
; GCN-NEXT: s_cselect_b64 s[0:1], -1, 0
-; GCN-NEXT: s_or_b32 s8, s0, s1
; GCN-NEXT: s_subb_u32 s12, s9, s3
; GCN-NEXT: s_sub_u32 s13, s11, s2
; GCN-NEXT: s_cselect_b64 s[8:9], -1, 0
-; GCN-NEXT: s_or_b32 s14, s8, s9
; GCN-NEXT: s_subb_u32 s14, s12, 0
; GCN-NEXT: s_cmp_ge_u32 s14, s3
; GCN-NEXT: s_cselect_b32 s15, -1, 0
@@ -912,13 +893,11 @@ define amdgpu_kernel void @s_test_urem_k_num_i64(ptr addrspace(1) %out, i64 %x)
; GCN-NEXT: s_cmp_eq_u32 s14, s3
; GCN-NEXT: s_cselect_b32 s15, s16, s15
; GCN-NEXT: s_or_b32 s8, s8, s9
-; GCN-NEXT: s_subb_u32 s12, s12, s3
-; GCN-NEXT: s_sub_u32 s16, s13, s2
-; GCN-NEXT: s_cselect_b64 s[8:9], -1, 0
-; GCN-NEXT: s_or_b32 s8, s8, s9
-; GCN-NEXT: s_subb_u32 s8, s12, 0
+; GCN-NEXT: s_subb_u32 s8, s12, s3
+; GCN-NEXT: s_sub_u32 s9, s13, s2
+; GCN-NEXT: s_subb_u32 s8, s8, 0
; GCN-NEXT: s_cmp_lg_u32 s15, 0
-; GCN-NEXT: s_cselect_b32 s9, s16, s13
+; GCN-NEXT: s_cselect_b32 s9, s9, s13
; GCN-NEXT: s_cselect_b32 s8, s8, s14
; GCN-NEXT: s_or_b32 s0, s0, s1
; GCN-NEXT: s_subb_u32 s0, 0, s10
@@ -931,6 +910,7 @@ define amdgpu_kernel void @s_test_urem_k_num_i64(ptr addrspace(1) %out, i64 %x)
; GCN-NEXT: s_cmp_lg_u32 s1, 0
; GCN-NEXT: s_cselect_b32 s0, s8, s0
; GCN-NEXT: s_cselect_b32 s1, s9, s11
+; GCN-NEXT: s_mov_b32 s6, -1
; GCN-NEXT: v_mov_b32_e32 v0, s1
; GCN-NEXT: v_mov_b32_e32 v1, s0
; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
@@ -956,8 +936,6 @@ define amdgpu_kernel void @s_test_urem_k_num_i64(ptr addrspace(1) %out, i64 %x)
; GCN-IR-NEXT: s_cbranch_vccz .LBB6_5
; GCN-IR-NEXT: ; %bb.1: ; %udiv-bb1
; GCN-IR-NEXT: s_add_u32 s10, s8, 1
-; GCN-IR-NEXT: s_cselect_b64 s[6:7], -1, 0
-; GCN-IR-NEXT: s_or_b32 s6, s6, s7
; GCN-IR-NEXT: s_addc_u32 s6, s9, 0
; GCN-IR-NEXT: s_cselect_b64 s[6:7], -1, 0
; GCN-IR-NEXT: s_sub_i32 s8, 63, s8
@@ -988,8 +966,6 @@ define amdgpu_kernel void @s_test_urem_k_num_i64(ptr addrspace(1) %out, i64 %x)
; GCN-IR-NEXT: s_sub_u32 s10, s10, s16
; GCN-IR-NEXT: s_subb_u32 s11, s11, s17
; GCN-IR-NEXT: s_add_u32 s14, s14, 1
-; GCN-IR-NEXT: s_cselect_b64 s[16:17], -1, 0
-; GCN-IR-NEXT: s_or_b32 s16, s16, s17
; GCN-IR-NEXT: s_addc_u32 s15, s15, 0
; GCN-IR-NEXT: s_cselect_b64 s[16:17], -1, 0
; GCN-IR-NEXT: s_mov_b64 s[8:9], s[4:5]
@@ -1077,8 +1053,6 @@ define amdgpu_kernel void @s_test_urem_k_den_i64(ptr addrspace(1) %out, i64 %x)
; GCN-IR-NEXT: s_cbranch_vccz .LBB7_5
; GCN-IR-NEXT: ; %bb.1: ; %udiv-bb1
; GCN-IR-NEXT: s_add_u32 s11, s8, 1
-; GCN-IR-NEXT: s_cselect_b64 s[6:7], -1, 0
-; GCN-IR-NEXT: s_or_b32 s6, s6, s7
; GCN-IR-NEXT: s_addc_u32 s6, s9, 0
; GCN-IR-NEXT: s_cselect_b64 s[6:7], -1, 0
; GCN-IR-NEXT: s_sub_i32 s8, 63, s8
@@ -1106,8 +1080,6 @@ define amdgpu_kernel void @s_test_urem_k_den_i64(ptr addrspace(1) %out, i64 %x)
; GCN-IR-NEXT: s_sub_u32 s8, s8, s10
; GCN-IR-NEXT: s_subb_u32 s9, s9, 0
; GCN-IR-NEXT: s_add_u32 s12, s12, 1
-; GCN-IR-NEXT: s_cselect_b64 s[14:15], -1, 0
-; GCN-IR-NEXT: s_or_b32 s14, s14, s15
; GCN-IR-NEXT: s_addc_u32 s13, s13, 0
; GCN-IR-NEXT: s_cselect_b64 s[14:15], -1, 0
; GCN-IR-NEXT: s_mov_b64 s[10:11], s[4:5]
diff --git a/llvm/test/CodeGen/AMDGPU/usubo.ll b/llvm/test/CodeGen/AMDGPU/usubo.ll
index e8db647..8a54ad3 100644
--- a/llvm/test/CodeGen/AMDGPU/usubo.ll
+++ b/llvm/test/CodeGen/AMDGPU/usubo.ll
@@ -15,10 +15,8 @@ define amdgpu_kernel void @s_usubo_i64_zext(ptr addrspace(1) %out, i64 %a, i64 %
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_mov_b32 s4, s0
; SI-NEXT: s_sub_u32 s2, s2, s8
-; SI-NEXT: s_mov_b32 s5, s1
-; SI-NEXT: s_cselect_b64 s[0:1], -1, 0
-; SI-NEXT: s_or_b32 s0, s0, s1
; SI-NEXT: s_subb_u32 s3, s3, s9
+; SI-NEXT: s_mov_b32 s5, s1
; SI-NEXT: s_cselect_b64 s[0:1], -1, 0
; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
; SI-NEXT: v_mov_b32_e32 v1, s3
@@ -432,8 +430,6 @@ define amdgpu_kernel void @s_usubo_i64(ptr addrspace(1) %out, ptr addrspace(1) %
; SI-NEXT: s_mov_b32 s10, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_sub_u32 s4, s4, s6
-; SI-NEXT: s_cselect_b64 s[12:13], -1, 0
-; SI-NEXT: s_or_b32 s6, s12, s13
; SI-NEXT: s_subb_u32 s5, s5, s7
; SI-NEXT: s_mov_b32 s8, s0
; SI-NEXT: s_mov_b32 s9, s1
diff --git a/llvm/test/CodeGen/PowerPC/vp-ld-st.ll b/llvm/test/CodeGen/PowerPC/vp-ld-st.ll
new file mode 100644
index 0000000..f0f9943
--- /dev/null
+++ b/llvm/test/CodeGen/PowerPC/vp-ld-st.ll
@@ -0,0 +1,160 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
+; RUN: llc -verify-machineinstrs -mcpu=pwr10 \
+; RUN: -mtriple=powerpc64le-unknown-unknown < %s | FileCheck %s
+; RUN: llc -verify-machineinstrs -mcpu=future \
+; RUN:   -mtriple=powerpc64le-unknown-unknown < %s | FileCheck --check-prefix=FUTURE %s
+
+; RUN: llc -verify-machineinstrs -mcpu=pwr10 \
+; RUN: -mtriple=powerpc64-unknown-unknown < %s | FileCheck %s
+; RUN: llc -verify-machineinstrs -mcpu=future \
+; RUN: -mtriple=powerpc64-unknown-unknown < %s | FileCheck --check-prefix=FUTURE %s
+
+; Function Attrs: nounwind readnone
+define void @stxvl1(<16 x i8> %a, ptr %b, i64 %c) {
+; CHECK-LABEL: stxvl1:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: sldi 3, 6, 56
+; CHECK-NEXT: stxvl 34, 5, 3
+; CHECK-NEXT: blr
+;
+; FUTURE-LABEL: stxvl1:
+; FUTURE: # %bb.0: # %entry
+; FUTURE-NEXT: stxvrl 34, 5, 6
+; FUTURE-NEXT: blr
+entry:
+ %cconv = trunc i64 %c to i32
+ tail call void @llvm.vp.store.v16i8.p0(<16 x i8> %a, ptr %b, <16 x i1> splat (i1 true), i32 %cconv)
+ ret void
+}
+
+; Function Attrs: nounwind readnone
+define void @stxvl2(<8 x i16> %a, ptr %b, i64 %c) {
+; CHECK-LABEL: stxvl2:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: sldi 3, 6, 57
+; CHECK-NEXT: stxvl 34, 5, 3
+; CHECK-NEXT: blr
+;
+; FUTURE-LABEL: stxvl2:
+; FUTURE: # %bb.0: # %entry
+; FUTURE-NEXT: sldi 3, 6, 1
+; FUTURE-NEXT: stxvrl 34, 5, 3
+; FUTURE-NEXT: blr
+entry:
+ %cconv = trunc i64 %c to i32
+ tail call void @llvm.vp.store.v8i16.p0(<8 x i16> %a, ptr %b, <8 x i1> splat (i1 true), i32 %cconv)
+ ret void
+}
+
+; Function Attrs: nounwind readnone
+define void @stxvl4(<4 x i32> %a, ptr %b, i64 %c) {
+; CHECK-LABEL: stxvl4:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: sldi 3, 6, 58
+; CHECK-NEXT: stxvl 34, 5, 3
+; CHECK-NEXT: blr
+;
+; FUTURE-LABEL: stxvl4:
+; FUTURE: # %bb.0: # %entry
+; FUTURE-NEXT: sldi 3, 6, 2
+; FUTURE-NEXT: stxvrl 34, 5, 3
+; FUTURE-NEXT: blr
+entry:
+ %cconv = trunc i64 %c to i32
+ tail call void @llvm.vp.store.v4i32.p0(<4 x i32> %a, ptr %b, <4 x i1> splat (i1 true), i32 %cconv)
+ ret void
+}
+
+; Function Attrs: nounwind readnone
+define void @stxvl8(<2 x i64> %a, ptr %b, i64 %c) {
+; CHECK-LABEL: stxvl8:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: sldi 3, 6, 59
+; CHECK-NEXT: stxvl 34, 5, 3
+; CHECK-NEXT: blr
+;
+; FUTURE-LABEL: stxvl8:
+; FUTURE: # %bb.0: # %entry
+; FUTURE-NEXT: sldi 3, 6, 3
+; FUTURE-NEXT: stxvrl 34, 5, 3
+; FUTURE-NEXT: blr
+entry:
+ %cconv = trunc i64 %c to i32
+ tail call void @llvm.vp.store.v2i64.p0(<2 x i64> %a, ptr %b, <2 x i1> splat (i1 true), i32 %cconv)
+ ret void
+}
+
+; Function Attrs: nounwind readnone
+define <16 x i8> @lxvl1(ptr %a, i64 %b) {
+; CHECK-LABEL: lxvl1:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: sldi 4, 4, 56
+; CHECK-NEXT: lxvl 34, 3, 4
+; CHECK-NEXT: blr
+;
+; FUTURE-LABEL: lxvl1:
+; FUTURE: # %bb.0: # %entry
+; FUTURE-NEXT: lxvrl 34, 3, 4
+; FUTURE-NEXT: blr
+entry:
+ %bconv = trunc i64 %b to i32
+ %0 = tail call <16 x i8> @llvm.vp.load.v16i8.p0(ptr %a, <16 x i1> splat (i1 true), i32 %bconv)
+ ret <16 x i8> %0
+}
+
+; Function Attrs: nounwind readnone
+define <8 x i16> @lxvl2(ptr %a, i64 %b) {
+; CHECK-LABEL: lxvl2:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: sldi 4, 4, 57
+; CHECK-NEXT: lxvl 34, 3, 4
+; CHECK-NEXT: blr
+;
+; FUTURE-LABEL: lxvl2:
+; FUTURE: # %bb.0: # %entry
+; FUTURE-NEXT: sldi 4, 4, 1
+; FUTURE-NEXT: lxvrl 34, 3, 4
+; FUTURE-NEXT: blr
+entry:
+ %bconv = trunc i64 %b to i32
+ %0 = tail call <8 x i16> @llvm.vp.load.v8i16.p0(ptr %a, <8 x i1> splat (i1 true), i32 %bconv)
+ ret <8 x i16> %0
+}
+
+; Function Attrs: nounwind readnone
+define <4 x i32> @lxvl4(ptr %a, i64 %b) {
+; CHECK-LABEL: lxvl4:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: sldi 4, 4, 58
+; CHECK-NEXT: lxvl 34, 3, 4
+; CHECK-NEXT: blr
+;
+; FUTURE-LABEL: lxvl4:
+; FUTURE: # %bb.0: # %entry
+; FUTURE-NEXT: sldi 4, 4, 2
+; FUTURE-NEXT: lxvrl 34, 3, 4
+; FUTURE-NEXT: blr
+entry:
+ %bconv = trunc i64 %b to i32
+ %0 = tail call <4 x i32> @llvm.vp.load.v4i32.p0(ptr %a, <4 x i1> splat (i1 true), i32 %bconv)
+ ret <4 x i32> %0
+}
+
+; Function Attrs: nounwind readnone
+define <2 x i64> @lxvl8(ptr %a, i64 %b) {
+; CHECK-LABEL: lxvl8:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: sldi 4, 4, 59
+; CHECK-NEXT: lxvl 34, 3, 4
+; CHECK-NEXT: blr
+;
+; FUTURE-LABEL: lxvl8:
+; FUTURE: # %bb.0: # %entry
+; FUTURE-NEXT: sldi 4, 4, 3
+; FUTURE-NEXT: lxvrl 34, 3, 4
+; FUTURE-NEXT: blr
+entry:
+ %bconv = trunc i64 %b to i32
+ %0 = tail call <2 x i64> @llvm.vp.load.v2i64.p0(ptr %a, <2 x i1> splat (i1 true), i32 %bconv)
+ ret <2 x i64> %0
+}
diff --git a/llvm/test/DebugInfo/extradata-node-reference.ll b/llvm/test/DebugInfo/extradata-node-reference.ll
index 0ec9312..1881756 100644
--- a/llvm/test/DebugInfo/extradata-node-reference.ll
+++ b/llvm/test/DebugInfo/extradata-node-reference.ll
@@ -29,7 +29,7 @@
!1 = !DIFile(filename: "test.cpp", directory: ".")
!2 = !{i32 2, !"Debug Info Version", i32 3}
!3 = !{i32 1, !"wchar_size", i32 4}
-!4 = !{i32 2, !"Dwarf Version", i32 5}
+!4 = !{i32 2, !"Dwarf Version", i32 4}
!8 = !{!9, !16, !24, !35}
; extraData node definitions
diff --git a/llvm/test/Transforms/InstCombine/sink-dereferenceable-assume.ll b/llvm/test/Transforms/InstCombine/sink-dereferenceable-assume.ll
new file mode 100644
index 0000000..9531323
--- /dev/null
+++ b/llvm/test/Transforms/InstCombine/sink-dereferenceable-assume.ll
@@ -0,0 +1,31 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6
+; RUN: opt -p instcombine -S %s | FileCheck %s
+
+define i64 @test_sink_with_dereferenceable_assume(ptr %p, ptr %q, i1 %cond) {
+; CHECK-LABEL: define i64 @test_sink_with_dereferenceable_assume(
+; CHECK-SAME: ptr [[P:%.*]], ptr [[Q:%.*]], i1 [[COND:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: br i1 [[COND]], label %[[THEN:.*]], label %[[ELSE:.*]]
+; CHECK: [[THEN]]:
+; CHECK-NEXT: [[Q_INT:%.*]] = ptrtoint ptr [[Q]] to i64
+; CHECK-NEXT: [[P_INT:%.*]] = ptrtoint ptr [[P]] to i64
+; CHECK-NEXT: [[DIFF:%.*]] = sub i64 [[Q_INT]], [[P_INT]]
+; CHECK-NEXT: ret i64 [[DIFF]]
+; CHECK: [[ELSE]]:
+; CHECK-NEXT: ret i64 0
+;
+entry:
+ %p_int = ptrtoint ptr %p to i64
+ %q_int = ptrtoint ptr %q to i64
+ %diff = sub i64 %q_int, %p_int
+ call void @llvm.assume(i1 true) [ "dereferenceable"(ptr %p, i64 %diff) ]
+ br i1 %cond, label %then, label %else
+
+then:
+ ret i64 %diff
+
+else:
+ ret i64 0
+}
+
+declare void @llvm.assume(i1 noundef)
diff --git a/llvm/test/Transforms/PhaseOrdering/AArch64/std-find.ll b/llvm/test/Transforms/PhaseOrdering/AArch64/std-find.ll
index 338d925..33e3e83 100644
--- a/llvm/test/Transforms/PhaseOrdering/AArch64/std-find.ll
+++ b/llvm/test/Transforms/PhaseOrdering/AArch64/std-find.ll
@@ -49,10 +49,10 @@ entry:
call void @llvm.assume(i1 true) [ "dereferenceable"(ptr %0, i64 256) ]
%start.ptr = load ptr, ptr %first, align 8
%1 = load i64, ptr %first, align 8
- %coerce.val.pi.i = add i64 %1, 256
- %coerce.val.ip = inttoptr i64 %coerce.val.pi.i to ptr
- %cmp.not6.i.i = icmp eq ptr %start.ptr, %coerce.val.ip
- br i1 %cmp.not6.i.i, label %return, label %loop.ph
+ %coerce.val.p = add i64 %1, 256
+ %coerce.val.ip = inttoptr i64 %coerce.val.p to ptr
+ %ec6. = icmp eq ptr %start.ptr, %coerce.val.ip
+ br i1 %ec6., label %return, label %loop.ph
loop.ph:
%2 = load i16, ptr %s.addr, align 2
@@ -61,13 +61,13 @@ loop.ph:
loop.header:
%ptr.iv = phi ptr [ %start.ptr, %loop.ph ], [ %ptr.iv.next, %loop.latch ]
%3 = load i16, ptr %ptr.iv, align 2
- %cmp2.i.i = icmp eq i16 %3, %2
- br i1 %cmp2.i.i, label %return, label %loop.latch
+ %cmp2. = icmp eq i16 %3, %2
+ br i1 %cmp2., label %return, label %loop.latch
loop.latch:
%ptr.iv.next = getelementptr inbounds nuw i8, ptr %ptr.iv, i64 2
- %cmp.not.i.i = icmp eq ptr %ptr.iv.next, %coerce.val.ip
- br i1 %cmp.not.i.i, label %return, label %loop.header
+ %ec. = icmp eq ptr %ptr.iv.next, %coerce.val.ip
+ br i1 %ec., label %return, label %loop.header
return:
%merge = phi ptr [ %start.ptr, %entry ], [ %coerce.val.ip, %loop.latch ], [ %ptr.iv, %loop.header ]
@@ -103,10 +103,10 @@ entry:
%0 = load ptr, ptr %first, align 8
%start.ptr = load ptr, ptr %first, align 8
%1 = load i64, ptr %first, align 8
- %coerce.val.pi.i = add i64 %1, 256
- %coerce.val.ip = inttoptr i64 %coerce.val.pi.i to ptr
- %cmp.not6.i.i = icmp eq ptr %start.ptr, %coerce.val.ip
- br i1 %cmp.not6.i.i, label %return, label %loop.ph
+ %coerce.val.p = add i64 %1, 256
+ %coerce.val.ip = inttoptr i64 %coerce.val.p to ptr
+ %ec6. = icmp eq ptr %start.ptr, %coerce.val.ip
+ br i1 %ec6., label %return, label %loop.ph
loop.ph:
%2 = load i16, ptr %s.addr, align 2
@@ -115,13 +115,13 @@ loop.ph:
loop.header:
%ptr.iv = phi ptr [ %start.ptr, %loop.ph ], [ %ptr.iv.next, %loop.latch ]
%3 = load i16, ptr %ptr.iv, align 2
- %cmp2.i.i = icmp eq i16 %3, %2
- br i1 %cmp2.i.i, label %return, label %loop.latch
+ %cmp2. = icmp eq i16 %3, %2
+ br i1 %cmp2., label %return, label %loop.latch
loop.latch:
%ptr.iv.next = getelementptr inbounds nuw i8, ptr %ptr.iv, i64 2
- %cmp.not.i.i = icmp eq ptr %ptr.iv.next, %coerce.val.ip
- br i1 %cmp.not.i.i, label %return, label %loop.header
+ %ec. = icmp eq ptr %ptr.iv.next, %coerce.val.ip
+ br i1 %ec., label %return, label %loop.header
return:
%merge = phi ptr [ %start.ptr, %entry ], [ %coerce.val.ip, %loop.latch ], [ %ptr.iv, %loop.header ]
@@ -129,9 +129,118 @@ return:
ret i64 %res
}
+define ptr @std_find_caller(ptr noundef %first, ptr noundef %last) {
+; CHECK-LABEL: define noundef ptr @std_find_caller(
+; CHECK-SAME: ptr noundef [[FIRST:%.*]], ptr noundef [[LAST:%.*]]) local_unnamed_addr #[[ATTR0]] {
+; CHECK-NEXT: [[ENTRY:.*]]:
+; CHECK-NEXT: call void @llvm.assume(i1 true) [ "align"(ptr [[FIRST]], i64 2) ]
+; CHECK-NEXT: call void @llvm.assume(i1 true) [ "align"(ptr [[LAST]], i64 2) ]
+; CHECK-NEXT: [[PRE_I:%.*]] = icmp eq ptr [[FIRST]], [[LAST]]
+; CHECK-NEXT: br i1 [[PRE_I]], label %[[STD_FIND_GENERIC_IMPL_EXIT:.*]], label %[[LOOP_HEADER_I_PREHEADER:.*]]
+; CHECK: [[LOOP_HEADER_I_PREHEADER]]:
+; CHECK-NEXT: [[LAST2:%.*]] = ptrtoint ptr [[LAST]] to i64
+; CHECK-NEXT: [[FIRST3:%.*]] = ptrtoint ptr [[FIRST]] to i64
+; CHECK-NEXT: [[LAST_I64:%.*]] = ptrtoint ptr [[LAST]] to i64
+; CHECK-NEXT: [[FIRST1:%.*]] = ptrtoint ptr [[FIRST]] to i64
+; CHECK-NEXT: [[PTR_SUB:%.*]] = sub i64 [[LAST_I64]], [[FIRST1]]
+; CHECK-NEXT: [[SCEVGEP:%.*]] = getelementptr i8, ptr [[FIRST]], i64 [[PTR_SUB]]
+; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[LAST2]], -2
+; CHECK-NEXT: [[TMP1:%.*]] = sub i64 [[TMP0]], [[FIRST3]]
+; CHECK-NEXT: [[TMP2:%.*]] = lshr exact i64 [[TMP1]], 1
+; CHECK-NEXT: [[TMP3:%.*]] = add nuw i64 [[TMP2]], 1
+; CHECK-NEXT: [[XTRAITER:%.*]] = and i64 [[TMP3]], 3
+; CHECK-NEXT: [[TMP4:%.*]] = and i64 [[TMP1]], 6
+; CHECK-NEXT: [[LCMP_MOD_NOT:%.*]] = icmp eq i64 [[TMP4]], 6
+; CHECK-NEXT: br i1 [[LCMP_MOD_NOT]], label %[[LOOP_HEADER_I_PROL_LOOPEXIT:.*]], label %[[LOOP_HEADER_I_PROL:.*]]
+; CHECK: [[LOOP_HEADER_I_PROL]]:
+; CHECK-NEXT: [[PTR_IV_I_PROL:%.*]] = phi ptr [ [[PTR_IV_NEXT_I_PROL:%.*]], %[[LOOP_LATCH_I_PROL:.*]] ], [ [[FIRST]], %[[LOOP_HEADER_I_PREHEADER]] ]
+; CHECK-NEXT: [[PROL_ITER:%.*]] = phi i64 [ [[PROL_ITER_NEXT:%.*]], %[[LOOP_LATCH_I_PROL]] ], [ 0, %[[LOOP_HEADER_I_PREHEADER]] ]
+; CHECK-NEXT: [[L_I_PROL:%.*]] = load i16, ptr [[PTR_IV_I_PROL]], align 2
+; CHECK-NEXT: [[C_1_I_PROL:%.*]] = icmp eq i16 [[L_I_PROL]], 1
+; CHECK-NEXT: br i1 [[C_1_I_PROL]], label %[[STD_FIND_GENERIC_IMPL_EXIT]], label %[[LOOP_LATCH_I_PROL]]
+; CHECK: [[LOOP_LATCH_I_PROL]]:
+; CHECK-NEXT: [[PTR_IV_NEXT_I_PROL]] = getelementptr inbounds nuw i8, ptr [[PTR_IV_I_PROL]], i64 2
+; CHECK-NEXT: [[PROL_ITER_NEXT]] = add i64 [[PROL_ITER]], 1
+; CHECK-NEXT: [[PROL_ITER_CMP_NOT:%.*]] = icmp eq i64 [[PROL_ITER_NEXT]], [[XTRAITER]]
+; CHECK-NEXT: br i1 [[PROL_ITER_CMP_NOT]], label %[[LOOP_HEADER_I_PROL_LOOPEXIT]], label %[[LOOP_HEADER_I_PROL]], !llvm.loop [[LOOP3:![0-9]+]]
+; CHECK: [[LOOP_HEADER_I_PROL_LOOPEXIT]]:
+; CHECK-NEXT: [[PTR_IV_I_UNR:%.*]] = phi ptr [ [[FIRST]], %[[LOOP_HEADER_I_PREHEADER]] ], [ [[PTR_IV_NEXT_I_PROL]], %[[LOOP_LATCH_I_PROL]] ]
+; CHECK-NEXT: [[TMP5:%.*]] = icmp ult i64 [[TMP1]], 6
+; CHECK-NEXT: br i1 [[TMP5]], label %[[STD_FIND_GENERIC_IMPL_EXIT]], label %[[LOOP_HEADER_I:.*]]
+; CHECK: [[LOOP_HEADER_I]]:
+; CHECK-NEXT: [[PTR_IV_I:%.*]] = phi ptr [ [[PTR_IV_NEXT_I_3:%.*]], %[[LOOP_LATCH_I_3:.*]] ], [ [[PTR_IV_I_UNR]], %[[LOOP_HEADER_I_PROL_LOOPEXIT]] ]
+; CHECK-NEXT: [[L_I:%.*]] = load i16, ptr [[PTR_IV_I]], align 2
+; CHECK-NEXT: [[C_1_I:%.*]] = icmp eq i16 [[L_I]], 1
+; CHECK-NEXT: br i1 [[C_1_I]], label %[[STD_FIND_GENERIC_IMPL_EXIT]], label %[[LOOP_LATCH_I:.*]]
+; CHECK: [[LOOP_LATCH_I]]:
+; CHECK-NEXT: [[PTR_IV_NEXT_I:%.*]] = getelementptr inbounds nuw i8, ptr [[PTR_IV_I]], i64 2
+; CHECK-NEXT: [[L_I_1:%.*]] = load i16, ptr [[PTR_IV_NEXT_I]], align 2
+; CHECK-NEXT: [[C_1_I_1:%.*]] = icmp eq i16 [[L_I_1]], 1
+; CHECK-NEXT: br i1 [[C_1_I_1]], label %[[STD_FIND_GENERIC_IMPL_EXIT_LOOPEXIT_UNR_LCSSA_LOOPEXIT_SPLIT_LOOP_EXIT11:.*]], label %[[LOOP_LATCH_I_1:.*]]
+; CHECK: [[LOOP_LATCH_I_1]]:
+; CHECK-NEXT: [[PTR_IV_NEXT_I_1:%.*]] = getelementptr inbounds nuw i8, ptr [[PTR_IV_I]], i64 4
+; CHECK-NEXT: [[L_I_2:%.*]] = load i16, ptr [[PTR_IV_NEXT_I_1]], align 2
+; CHECK-NEXT: [[C_1_I_2:%.*]] = icmp eq i16 [[L_I_2]], 1
+; CHECK-NEXT: br i1 [[C_1_I_2]], label %[[STD_FIND_GENERIC_IMPL_EXIT_LOOPEXIT_UNR_LCSSA_LOOPEXIT_SPLIT_LOOP_EXIT9:.*]], label %[[LOOP_LATCH_I_2:.*]]
+; CHECK: [[LOOP_LATCH_I_2]]:
+; CHECK-NEXT: [[PTR_IV_NEXT_I_2:%.*]] = getelementptr inbounds nuw i8, ptr [[PTR_IV_I]], i64 6
+; CHECK-NEXT: [[L_I_3:%.*]] = load i16, ptr [[PTR_IV_NEXT_I_2]], align 2
+; CHECK-NEXT: [[C_1_I_3:%.*]] = icmp eq i16 [[L_I_3]], 1
+; CHECK-NEXT: br i1 [[C_1_I_3]], label %[[STD_FIND_GENERIC_IMPL_EXIT_LOOPEXIT_UNR_LCSSA_LOOPEXIT_SPLIT_LOOP_EXIT7:.*]], label %[[LOOP_LATCH_I_3]]
+; CHECK: [[LOOP_LATCH_I_3]]:
+; CHECK-NEXT: [[PTR_IV_NEXT_I_3]] = getelementptr inbounds nuw i8, ptr [[PTR_IV_I]], i64 8
+; CHECK-NEXT: [[EC_I_3:%.*]] = icmp eq ptr [[PTR_IV_NEXT_I_3]], [[LAST]]
+; CHECK-NEXT: br i1 [[EC_I_3]], label %[[STD_FIND_GENERIC_IMPL_EXIT]], label %[[LOOP_HEADER_I]]
+; CHECK: [[STD_FIND_GENERIC_IMPL_EXIT_LOOPEXIT_UNR_LCSSA_LOOPEXIT_SPLIT_LOOP_EXIT7]]:
+; CHECK-NEXT: [[PTR_IV_NEXT_I_2_LE:%.*]] = getelementptr inbounds nuw i8, ptr [[PTR_IV_I]], i64 6
+; CHECK-NEXT: br label %[[STD_FIND_GENERIC_IMPL_EXIT]]
+; CHECK: [[STD_FIND_GENERIC_IMPL_EXIT_LOOPEXIT_UNR_LCSSA_LOOPEXIT_SPLIT_LOOP_EXIT9]]:
+; CHECK-NEXT: [[PTR_IV_NEXT_I_1_LE:%.*]] = getelementptr inbounds nuw i8, ptr [[PTR_IV_I]], i64 4
+; CHECK-NEXT: br label %[[STD_FIND_GENERIC_IMPL_EXIT]]
+; CHECK: [[STD_FIND_GENERIC_IMPL_EXIT_LOOPEXIT_UNR_LCSSA_LOOPEXIT_SPLIT_LOOP_EXIT11]]:
+; CHECK-NEXT: [[PTR_IV_NEXT_I_LE:%.*]] = getelementptr inbounds nuw i8, ptr [[PTR_IV_I]], i64 2
+; CHECK-NEXT: br label %[[STD_FIND_GENERIC_IMPL_EXIT]]
+; CHECK: [[STD_FIND_GENERIC_IMPL_EXIT]]:
+; CHECK-NEXT: [[RES_I:%.*]] = phi ptr [ [[FIRST]], %[[ENTRY]] ], [ [[SCEVGEP]], %[[LOOP_HEADER_I_PROL_LOOPEXIT]] ], [ [[PTR_IV_NEXT_I_2_LE]], %[[STD_FIND_GENERIC_IMPL_EXIT_LOOPEXIT_UNR_LCSSA_LOOPEXIT_SPLIT_LOOP_EXIT7]] ], [ [[PTR_IV_NEXT_I_1_LE]], %[[STD_FIND_GENERIC_IMPL_EXIT_LOOPEXIT_UNR_LCSSA_LOOPEXIT_SPLIT_LOOP_EXIT9]] ], [ [[PTR_IV_NEXT_I_LE]], %[[STD_FIND_GENERIC_IMPL_EXIT_LOOPEXIT_UNR_LCSSA_LOOPEXIT_SPLIT_LOOP_EXIT11]] ], [ [[SCEVGEP]], %[[LOOP_LATCH_I_3]] ], [ [[PTR_IV_I]], %[[LOOP_HEADER_I]] ], [ [[PTR_IV_I_PROL]], %[[LOOP_HEADER_I_PROL]] ]
+; CHECK-NEXT: ret ptr [[RES_I]]
+;
+entry:
+ %last.i64 = ptrtoint ptr %last to i64
+ %first.i64 = ptrtoint ptr %first to i64
+ %ptr.sub = sub i64 %last.i64, %first.i64
+ call void @llvm.assume(i1 true) [ "align"(ptr %first, i64 2) ]
+ call void @llvm.assume(i1 true) [ "align"(ptr %last, i64 2) ]
+ call void @llvm.assume(i1 true) [ "dereferenceable"(ptr %first, i64 %ptr.sub) ]
+ %call = call noundef ptr @std_find_generic_impl(ptr noundef nonnull %first, ptr noundef %last, i16 noundef signext 1)
+ ret ptr %call
+}
+
+define linkonce_odr noundef ptr @std_find_generic_impl(ptr noundef %first, ptr noundef %last, i16 noundef %value) {
+entry:
+ %pre = icmp eq ptr %first, %last
+ br i1 %pre, label %exit, label %loop.header
+
+loop.header:
+ %ptr.iv = phi ptr [ %ptr.iv.next, %loop.latch ], [ %first, %entry ]
+ %l = load i16, ptr %ptr.iv, align 2
+ %c.1 = icmp eq i16 %l, %value
+ br i1 %c.1, label %exit, label %loop.latch
+
+loop.latch:
+ %ptr.iv.next = getelementptr inbounds nuw i8, ptr %ptr.iv, i64 2
+ %ec = icmp eq ptr %ptr.iv.next, %last
+ br i1 %ec, label %exit, label %loop.header
+
+exit:
+ %res = phi ptr [ %first, %entry ], [ %ptr.iv, %loop.header ], [ %ptr.iv.next, %loop.latch ]
+ ret ptr %res
+}
+
declare void @llvm.assume(i1 noundef)
;.
; CHECK: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]}
; CHECK: [[META1]] = !{!"llvm.loop.isvectorized", i32 1}
; CHECK: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"}
+; CHECK: [[LOOP3]] = distinct !{[[LOOP3]], [[META4:![0-9]+]]}
+; CHECK: [[META4]] = !{!"llvm.loop.unroll.disable"}
;.
diff --git a/llvm/test/Transforms/PhaseOrdering/X86/addsub-inseltpoison.ll b/llvm/test/Transforms/PhaseOrdering/X86/addsub-inseltpoison.ll
index 2c1d73e..9f3244d 100644
--- a/llvm/test/Transforms/PhaseOrdering/X86/addsub-inseltpoison.ll
+++ b/llvm/test/Transforms/PhaseOrdering/X86/addsub-inseltpoison.ll
@@ -498,11 +498,9 @@ define void @add_aggregate_store(<2 x float> %a0, <2 x float> %a1, <2 x float> %
; PR58139
define <2 x double> @_mm_complexmult_pd_naive(<2 x double> %a, <2 x double> %b) {
; SSE-LABEL: @_mm_complexmult_pd_naive(
-; SSE-NEXT: [[B1:%.*]] = extractelement <2 x double> [[B:%.*]], i64 1
-; SSE-NEXT: [[TMP1:%.*]] = fneg double [[B1]]
; SSE-NEXT: [[TMP2:%.*]] = shufflevector <2 x double> [[A:%.*]], <2 x double> poison, <2 x i32> <i32 1, i32 1>
-; SSE-NEXT: [[TMP3:%.*]] = shufflevector <2 x double> [[B]], <2 x double> poison, <2 x i32> <i32 poison, i32 0>
-; SSE-NEXT: [[TMP4:%.*]] = insertelement <2 x double> [[TMP3]], double [[TMP1]], i64 0
+; SSE-NEXT: [[TMP3:%.*]] = fneg <2 x double> [[B:%.*]]
+; SSE-NEXT: [[TMP4:%.*]] = shufflevector <2 x double> [[TMP3]], <2 x double> [[B]], <2 x i32> <i32 1, i32 2>
; SSE-NEXT: [[TMP5:%.*]] = fmul <2 x double> [[TMP2]], [[TMP4]]
; SSE-NEXT: [[TMP6:%.*]] = shufflevector <2 x double> [[A]], <2 x double> poison, <2 x i32> zeroinitializer
; SSE-NEXT: [[TMP7:%.*]] = tail call <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[TMP6]], <2 x double> [[B]], <2 x double> [[TMP5]])
diff --git a/llvm/test/Transforms/PhaseOrdering/X86/addsub.ll b/llvm/test/Transforms/PhaseOrdering/X86/addsub.ll
index fa6403f..de64bf2 100644
--- a/llvm/test/Transforms/PhaseOrdering/X86/addsub.ll
+++ b/llvm/test/Transforms/PhaseOrdering/X86/addsub.ll
@@ -502,11 +502,9 @@ define void @add_aggregate_store(<2 x float> %a0, <2 x float> %a1, <2 x float> %
; PR58139
define <2 x double> @_mm_complexmult_pd_naive(<2 x double> %a, <2 x double> %b) {
; SSE-LABEL: @_mm_complexmult_pd_naive(
-; SSE-NEXT: [[B1:%.*]] = extractelement <2 x double> [[B:%.*]], i64 1
-; SSE-NEXT: [[TMP1:%.*]] = fneg double [[B1]]
; SSE-NEXT: [[TMP2:%.*]] = shufflevector <2 x double> [[A:%.*]], <2 x double> poison, <2 x i32> <i32 1, i32 1>
-; SSE-NEXT: [[TMP3:%.*]] = shufflevector <2 x double> [[B]], <2 x double> poison, <2 x i32> <i32 poison, i32 0>
-; SSE-NEXT: [[TMP4:%.*]] = insertelement <2 x double> [[TMP3]], double [[TMP1]], i64 0
+; SSE-NEXT: [[TMP3:%.*]] = fneg <2 x double> [[B:%.*]]
+; SSE-NEXT: [[TMP4:%.*]] = shufflevector <2 x double> [[TMP3]], <2 x double> [[B]], <2 x i32> <i32 1, i32 2>
; SSE-NEXT: [[TMP5:%.*]] = fmul <2 x double> [[TMP2]], [[TMP4]]
; SSE-NEXT: [[TMP6:%.*]] = shufflevector <2 x double> [[A]], <2 x double> poison, <2 x i32> zeroinitializer
; SSE-NEXT: [[TMP7:%.*]] = tail call <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[TMP6]], <2 x double> [[B]], <2 x double> [[TMP5]])
diff --git a/llvm/test/Transforms/VectorCombine/X86/extract-fneg-insert.ll b/llvm/test/Transforms/VectorCombine/X86/extract-fneg-insert.ll
index 5358e04..88fcf35 100644
--- a/llvm/test/Transforms/VectorCombine/X86/extract-fneg-insert.ll
+++ b/llvm/test/Transforms/VectorCombine/X86/extract-fneg-insert.ll
@@ -58,6 +58,19 @@ define <4 x float> @ext2_v2f32v4f32(<2 x float> %x, <4 x float> %y) {
ret <4 x float> %r
}
+define <2 x float> @ext2_v4f32v2f32(<4 x float> %x, <2 x float> %y) {
+; CHECK-LABEL: @ext2_v4f32v2f32(
+; CHECK-NEXT: [[TMP1:%.*]] = fneg <4 x float> [[X:%.*]]
+; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> poison, <2 x i32> <i32 poison, i32 3>
+; CHECK-NEXT: [[R:%.*]] = shufflevector <2 x float> [[Y:%.*]], <2 x float> [[TMP2]], <2 x i32> <i32 0, i32 3>
+; CHECK-NEXT: ret <2 x float> [[R]]
+;
+ %e = extractelement <4 x float> %x, i32 3
+ %n = fneg float %e
+ %r = insertelement <2 x float> %y, float %n, i32 1
+ ret <2 x float> %r
+}
+
; Eliminating extract/insert is still profitable. Flags propagate.
define <2 x double> @ext1_v2f64(<2 x double> %x, <2 x double> %y) {
@@ -73,17 +86,11 @@ define <2 x double> @ext1_v2f64(<2 x double> %x, <2 x double> %y) {
}
define <4 x double> @ext1_v2f64v4f64(<2 x double> %x, <4 x double> %y) {
-; SSE-LABEL: @ext1_v2f64v4f64(
-; SSE-NEXT: [[E:%.*]] = extractelement <2 x double> [[X:%.*]], i32 1
-; SSE-NEXT: [[N:%.*]] = fneg nsz double [[E]]
-; SSE-NEXT: [[R:%.*]] = insertelement <4 x double> [[Y:%.*]], double [[N]], i32 1
-; SSE-NEXT: ret <4 x double> [[R]]
-;
-; AVX-LABEL: @ext1_v2f64v4f64(
-; AVX-NEXT: [[TMP1:%.*]] = fneg nsz <2 x double> [[X:%.*]]
-; AVX-NEXT: [[TMP2:%.*]] = shufflevector <2 x double> [[TMP1]], <2 x double> poison, <4 x i32> <i32 poison, i32 1, i32 poison, i32 poison>
-; AVX-NEXT: [[R:%.*]] = shufflevector <4 x double> [[Y:%.*]], <4 x double> [[TMP2]], <4 x i32> <i32 0, i32 5, i32 2, i32 3>
-; AVX-NEXT: ret <4 x double> [[R]]
+; CHECK-LABEL: @ext1_v2f64v4f64(
+; CHECK-NEXT: [[TMP1:%.*]] = fneg nsz <2 x double> [[X:%.*]]
+; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <2 x double> [[TMP1]], <2 x double> poison, <4 x i32> <i32 poison, i32 1, i32 poison, i32 poison>
+; CHECK-NEXT: [[R:%.*]] = shufflevector <4 x double> [[Y:%.*]], <4 x double> [[TMP2]], <4 x i32> <i32 0, i32 5, i32 2, i32 3>
+; CHECK-NEXT: ret <4 x double> [[R]]
;
%e = extractelement <2 x double> %x, i32 1
%n = fneg nsz double %e
@@ -91,6 +98,19 @@ define <4 x double> @ext1_v2f64v4f64(<2 x double> %x, <4 x double> %y) {
ret <4 x double> %r
}
+define <2 x double> @ext1_v4f64v2f64(<4 x double> %x, <2 x double> %y) {
+; CHECK-LABEL: @ext1_v4f64v2f64(
+; CHECK-NEXT: [[TMP1:%.*]] = fneg nsz <4 x double> [[X:%.*]]
+; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x double> [[TMP1]], <4 x double> poison, <2 x i32> <i32 poison, i32 3>
+; CHECK-NEXT: [[R:%.*]] = shufflevector <2 x double> [[Y:%.*]], <2 x double> [[TMP2]], <2 x i32> <i32 0, i32 3>
+; CHECK-NEXT: ret <2 x double> [[R]]
+;
+ %e = extractelement <4 x double> %x, i32 3
+ %n = fneg nsz double %e
+ %r = insertelement <2 x double> %y, double %n, i32 1
+ ret <2 x double> %r
+}
+
define <8 x float> @ext7_v8f32(<8 x float> %x, <8 x float> %y) {
; CHECK-LABEL: @ext7_v8f32(
; CHECK-NEXT: [[TMP1:%.*]] = fneg <8 x float> [[X:%.*]]
@@ -105,9 +125,9 @@ define <8 x float> @ext7_v8f32(<8 x float> %x, <8 x float> %y) {
define <8 x float> @ext7_v4f32v8f32(<4 x float> %x, <8 x float> %y) {
; CHECK-LABEL: @ext7_v4f32v8f32(
-; CHECK-NEXT: [[E:%.*]] = extractelement <4 x float> [[X:%.*]], i32 3
-; CHECK-NEXT: [[N:%.*]] = fneg float [[E]]
-; CHECK-NEXT: [[R:%.*]] = insertelement <8 x float> [[Y:%.*]], float [[N]], i32 7
+; CHECK-NEXT: [[TMP1:%.*]] = fneg <4 x float> [[X:%.*]]
+; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> poison, <8 x i32> <i32 poison, i32 poison, i32 poison, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT: [[R:%.*]] = shufflevector <8 x float> [[Y:%.*]], <8 x float> [[TMP2]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 11>
; CHECK-NEXT: ret <8 x float> [[R]]
;
%e = extractelement <4 x float> %x, i32 3
@@ -116,6 +136,19 @@ define <8 x float> @ext7_v4f32v8f32(<4 x float> %x, <8 x float> %y) {
ret <8 x float> %r
}
+define <4 x float> @ext7_v8f32v4f32(<8 x float> %x, <4 x float> %y) {
+; CHECK-LABEL: @ext7_v8f32v4f32(
+; CHECK-NEXT: [[TMP1:%.*]] = fneg <8 x float> [[X:%.*]]
+; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> poison, <4 x i32> <i32 poison, i32 poison, i32 poison, i32 7>
+; CHECK-NEXT: [[R:%.*]] = shufflevector <4 x float> [[Y:%.*]], <4 x float> [[TMP2]], <4 x i32> <i32 0, i32 1, i32 2, i32 7>
+; CHECK-NEXT: ret <4 x float> [[R]]
+;
+ %e = extractelement <8 x float> %x, i32 7
+ %n = fneg float %e
+ %r = insertelement <4 x float> %y, float %n, i32 3
+ ret <4 x float> %r
+}
+
; Same as above with an extra use of the extracted element.
define <8 x float> @ext7_v8f32_use1(<8 x float> %x, <8 x float> %y) {
@@ -141,12 +174,20 @@ define <8 x float> @ext7_v8f32_use1(<8 x float> %x, <8 x float> %y) {
}
define <8 x float> @ext7_v4f32v8f32_use1(<4 x float> %x, <8 x float> %y) {
-; CHECK-LABEL: @ext7_v4f32v8f32_use1(
-; CHECK-NEXT: [[E:%.*]] = extractelement <4 x float> [[X:%.*]], i32 3
-; CHECK-NEXT: call void @use(float [[E]])
-; CHECK-NEXT: [[N:%.*]] = fneg float [[E]]
-; CHECK-NEXT: [[R:%.*]] = insertelement <8 x float> [[Y:%.*]], float [[N]], i32 3
-; CHECK-NEXT: ret <8 x float> [[R]]
+; SSE-LABEL: @ext7_v4f32v8f32_use1(
+; SSE-NEXT: [[E:%.*]] = extractelement <4 x float> [[X:%.*]], i32 3
+; SSE-NEXT: call void @use(float [[E]])
+; SSE-NEXT: [[TMP1:%.*]] = fneg <4 x float> [[X]]
+; SSE-NEXT: [[TMP2:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> poison, <8 x i32> <i32 poison, i32 poison, i32 poison, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
+; SSE-NEXT: [[R:%.*]] = shufflevector <8 x float> [[Y:%.*]], <8 x float> [[TMP2]], <8 x i32> <i32 0, i32 1, i32 2, i32 11, i32 4, i32 5, i32 6, i32 7>
+; SSE-NEXT: ret <8 x float> [[R]]
+;
+; AVX-LABEL: @ext7_v4f32v8f32_use1(
+; AVX-NEXT: [[E:%.*]] = extractelement <4 x float> [[X:%.*]], i32 3
+; AVX-NEXT: call void @use(float [[E]])
+; AVX-NEXT: [[N:%.*]] = fneg float [[E]]
+; AVX-NEXT: [[R:%.*]] = insertelement <8 x float> [[Y:%.*]], float [[N]], i32 3
+; AVX-NEXT: ret <8 x float> [[R]]
;
%e = extractelement <4 x float> %x, i32 3
call void @use(float %e)
@@ -155,6 +196,29 @@ define <8 x float> @ext7_v4f32v8f32_use1(<4 x float> %x, <8 x float> %y) {
ret <8 x float> %r
}
+define <4 x float> @ext7_v8f32v4f32_use1(<8 x float> %x, <4 x float> %y) {
+; SSE-LABEL: @ext7_v8f32v4f32_use1(
+; SSE-NEXT: [[E:%.*]] = extractelement <8 x float> [[X:%.*]], i32 7
+; SSE-NEXT: call void @use(float [[E]])
+; SSE-NEXT: [[TMP1:%.*]] = fneg <8 x float> [[X]]
+; SSE-NEXT: [[TMP2:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> poison, <4 x i32> <i32 poison, i32 poison, i32 poison, i32 7>
+; SSE-NEXT: [[R:%.*]] = shufflevector <4 x float> [[Y:%.*]], <4 x float> [[TMP2]], <4 x i32> <i32 0, i32 1, i32 2, i32 7>
+; SSE-NEXT: ret <4 x float> [[R]]
+;
+; AVX-LABEL: @ext7_v8f32v4f32_use1(
+; AVX-NEXT: [[E:%.*]] = extractelement <8 x float> [[X:%.*]], i32 7
+; AVX-NEXT: call void @use(float [[E]])
+; AVX-NEXT: [[N:%.*]] = fneg float [[E]]
+; AVX-NEXT: [[R:%.*]] = insertelement <4 x float> [[Y:%.*]], float [[N]], i32 3
+; AVX-NEXT: ret <4 x float> [[R]]
+;
+ %e = extractelement <8 x float> %x, i32 7
+ call void @use(float %e)
+ %n = fneg float %e
+ %r = insertelement <4 x float> %y, float %n, i32 3
+ ret <4 x float> %r
+}
+
; Negative test - the transform is likely not profitable if the fneg has another use.
define <8 x float> @ext7_v8f32_use2(<8 x float> %x, <8 x float> %y) {
@@ -187,6 +251,21 @@ define <8 x float> @ext7_v4f32v8f32_use2(<4 x float> %x, <8 x float> %y) {
ret <8 x float> %r
}
+define <4 x float> @ext7_v8f32v4f32_use2(<8 x float> %x, <4 x float> %y) {
+; CHECK-LABEL: @ext7_v8f32v4f32_use2(
+; CHECK-NEXT: [[E:%.*]] = extractelement <8 x float> [[X:%.*]], i32 7
+; CHECK-NEXT: [[N:%.*]] = fneg float [[E]]
+; CHECK-NEXT: call void @use(float [[N]])
+; CHECK-NEXT: [[R:%.*]] = insertelement <4 x float> [[Y:%.*]], float [[N]], i32 3
+; CHECK-NEXT: ret <4 x float> [[R]]
+;
+ %e = extractelement <8 x float> %x, i32 7
+ %n = fneg float %e
+ call void @use(float %n)
+ %r = insertelement <4 x float> %y, float %n, i32 3
+ ret <4 x float> %r
+}
+
; Negative test - can't convert variable index to a shuffle.
define <2 x double> @ext_index_var_v2f64(<2 x double> %x, <2 x double> %y, i32 %index) {
@@ -215,14 +294,10 @@ define <4 x double> @ext_index_var_v2f64v4f64(<2 x double> %x, <4 x double> %y,
ret <4 x double> %r
}
-; Negative test - require same extract/insert index for simple shuffle.
-; TODO: We could handle this by adjusting the cost calculation.
-
define <2 x double> @ext1_v2f64_ins0(<2 x double> %x, <2 x double> %y) {
; CHECK-LABEL: @ext1_v2f64_ins0(
-; CHECK-NEXT: [[E:%.*]] = extractelement <2 x double> [[X:%.*]], i32 1
-; CHECK-NEXT: [[N:%.*]] = fneg nsz double [[E]]
-; CHECK-NEXT: [[R:%.*]] = insertelement <2 x double> [[Y:%.*]], double [[N]], i32 0
+; CHECK-NEXT: [[TMP1:%.*]] = fneg nsz <2 x double> [[X:%.*]]
+; CHECK-NEXT: [[R:%.*]] = shufflevector <2 x double> [[Y:%.*]], <2 x double> [[TMP1]], <2 x i32> <i32 3, i32 1>
; CHECK-NEXT: ret <2 x double> [[R]]
;
%e = extractelement <2 x double> %x, i32 1
@@ -231,12 +306,11 @@ define <2 x double> @ext1_v2f64_ins0(<2 x double> %x, <2 x double> %y) {
ret <2 x double> %r
}
-; Negative test - extract from an index greater than the vector width of the destination
define <2 x double> @ext3_v4f64v2f64(<4 x double> %x, <2 x double> %y) {
; CHECK-LABEL: @ext3_v4f64v2f64(
-; CHECK-NEXT: [[E:%.*]] = extractelement <4 x double> [[X:%.*]], i32 3
-; CHECK-NEXT: [[N:%.*]] = fneg nsz double [[E]]
-; CHECK-NEXT: [[R:%.*]] = insertelement <2 x double> [[Y:%.*]], double [[N]], i32 1
+; CHECK-NEXT: [[TMP1:%.*]] = fneg nsz <4 x double> [[X:%.*]]
+; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x double> [[TMP1]], <4 x double> poison, <2 x i32> <i32 poison, i32 3>
+; CHECK-NEXT: [[R:%.*]] = shufflevector <2 x double> [[Y:%.*]], <2 x double> [[TMP2]], <2 x i32> <i32 0, i32 3>
; CHECK-NEXT: ret <2 x double> [[R]]
;
%e = extractelement <4 x double> %x, i32 3
@@ -246,11 +320,17 @@ define <2 x double> @ext3_v4f64v2f64(<4 x double> %x, <2 x double> %y) {
}
define <4 x double> @ext1_v2f64v4f64_ins0(<2 x double> %x, <4 x double> %y) {
-; CHECK-LABEL: @ext1_v2f64v4f64_ins0(
-; CHECK-NEXT: [[E:%.*]] = extractelement <2 x double> [[X:%.*]], i32 1
-; CHECK-NEXT: [[N:%.*]] = fneg nsz double [[E]]
-; CHECK-NEXT: [[R:%.*]] = insertelement <4 x double> [[Y:%.*]], double [[N]], i32 0
-; CHECK-NEXT: ret <4 x double> [[R]]
+; SSE-LABEL: @ext1_v2f64v4f64_ins0(
+; SSE-NEXT: [[TMP1:%.*]] = fneg nsz <2 x double> [[X:%.*]]
+; SSE-NEXT: [[TMP2:%.*]] = shufflevector <2 x double> [[TMP1]], <2 x double> poison, <4 x i32> <i32 poison, i32 1, i32 poison, i32 poison>
+; SSE-NEXT: [[R:%.*]] = shufflevector <4 x double> [[Y:%.*]], <4 x double> [[TMP2]], <4 x i32> <i32 5, i32 1, i32 2, i32 3>
+; SSE-NEXT: ret <4 x double> [[R]]
+;
+; AVX-LABEL: @ext1_v2f64v4f64_ins0(
+; AVX-NEXT: [[E:%.*]] = extractelement <2 x double> [[X:%.*]], i32 1
+; AVX-NEXT: [[N:%.*]] = fneg nsz double [[E]]
+; AVX-NEXT: [[R:%.*]] = insertelement <4 x double> [[Y:%.*]], double [[N]], i32 0
+; AVX-NEXT: ret <4 x double> [[R]]
;
%e = extractelement <2 x double> %x, i32 1
%n = fneg nsz double %e
diff --git a/llvm/test/lit.cfg.py b/llvm/test/lit.cfg.py
index 35ea8b8..725ddb87 100644
--- a/llvm/test/lit.cfg.py
+++ b/llvm/test/lit.cfg.py
@@ -233,6 +233,7 @@ tools.extend(
"llvm-addr2line",
"llvm-bcanalyzer",
"llvm-bitcode-strip",
+ "llvm-cas",
"llvm-cgdata",
"llvm-config",
"llvm-cov",
@@ -796,10 +797,19 @@ if config.have_opt_viewer_modules:
if config.expensive_checks:
config.available_features.add("expensive_checks")
+if config.have_ondisk_cas:
+ config.available_features.add("ondisk_cas")
+
if "MemoryWithOrigins" in config.llvm_use_sanitizer:
config.available_features.add("use_msan_with_origins")
+# Restrict the size of the on-disk CAS for tests. This allows testing in
+# constrained environments (e.g. small TMPDIR). It also prevents leaving
+# behind large files on file systems that do not support sparse files if a test
+# crashes before resizing the file.
+config.environment["LLVM_CAS_MAX_MAPPING_SIZE"] = "%d" % (100 * 1024 * 1024)
+
# Some tools support an environment variable "OBJECT_MODE" on AIX OS, which
# controls the kind of objects they will support. If there is no "OBJECT_MODE"
# environment variable specified, the default behaviour is to support 32-bit
diff --git a/llvm/test/lit.site.cfg.py.in b/llvm/test/lit.site.cfg.py.in
index 973e0ec9..c5cb716 100644
--- a/llvm/test/lit.site.cfg.py.in
+++ b/llvm/test/lit.site.cfg.py.in
@@ -66,6 +66,7 @@ config.spirv_tools_tests = @LLVM_INCLUDE_SPIRV_TOOLS_TESTS@
config.have_vc_rev = @LLVM_APPEND_VC_REV@
config.force_vc_rev = "@LLVM_FORCE_VC_REVISION@"
config.has_logf128 = @LLVM_HAS_LOGF128@
+config.have_ondisk_cas = @LLVM_ENABLE_ONDISK_CAS@
import lit.llvm
lit.llvm.initialize(lit_config, config)
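The @LLVM_ENABLE_ONDISK_CAS@ substitution is resolved at CMake configure time,
so a build that wants the new llvm-cas tests to run would be configured with
something like the following (a sketch: the cache-variable name comes from the
substitution above, the rest of the command line is illustrative):

  cmake -G Ninja -DLLVM_ENABLE_ONDISK_CAS=ON ../llvm

When it is disabled, config.have_ondisk_cas is false and the lit.local.cfg
added below marks the entire llvm-cas test directory unsupported.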
diff --git a/llvm/test/tools/dsymutil/ARM/typedefs-with-same-name.test b/llvm/test/tools/dsymutil/ARM/typedefs-with-same-name.test
new file mode 100644
index 0000000..cef40b4
--- /dev/null
+++ b/llvm/test/tools/dsymutil/ARM/typedefs-with-same-name.test
@@ -0,0 +1,41 @@
+#RUN: dsymutil --linker=parallel -f -oso-prepend-path=%p/../Inputs/ -y %s -o %t.dwarf
+#RUN: llvm-dwarfdump %t.dwarf | FileCheck %s
+
+# There should be two typedef DIEs named "BarInt" in the resulting .dwarf file.
+# The second should refer to the first, which in turn refers to "Foo<int>".
+# CHECK: 0x[[FIRST_BARINT_ADDR:[0-9a-f]*]]: DW_TAG_typedef
+# CHECK-NEXT: DW_AT_type (0x{{([[:xdigit:]]*)}} "Foo<int>")
+# CHECK-NEXT: DW_AT_name ("BarInt")
+# CHECK: 0x{{([[:xdigit:]]*)}}: DW_TAG_typedef
+# CHECK-NEXT: DW_AT_type (0x[[FIRST_BARINT_ADDR]] "BarInt")
+# CHECK-NEXT: DW_AT_name ("BarInt")
+
+# Source:
+#
+# template <typename T> struct Foo;
+# typedef Foo<int> BarInt;
+# template <typename T>
+# struct [[clang::preferred_name(BarInt)]] Foo{};
+# int main() {
+# BarInt barInt;
+# return 0;
+# }
+#
+# Compile with:
+#
+# $ clang++ -g -O0 -c typedefs-with-same-name.cpp -o typedefs-with-same-name.o
+#
+# To generate the debug map:
+#
+# $ clang++ typedefs-with-same-name.o -o typedefs-with-same-name
+# $ dsymutil -dump-debug-map typedefs-with-same-name
+
+---
+triple: 'arm64-apple-darwin'
+objects:
+ - filename: '/typedefs-with-same-name.o'
+ timestamp: 1762438746
+ type: 102
+ symbols:
+ - { sym: _main, objAddr: 0x0, binAddr: 0x100000360, size: 0x14 }
+...
diff --git a/llvm/test/tools/dsymutil/Inputs/typedefs-with-same-name.o b/llvm/test/tools/dsymutil/Inputs/typedefs-with-same-name.o
new file mode 100644
index 0000000..6cc47c1
--- /dev/null
+++ b/llvm/test/tools/dsymutil/Inputs/typedefs-with-same-name.o
Binary files differ
diff --git a/llvm/test/tools/dsymutil/X86/DWARFLinkerParallel/odr-fwd-declaration.test b/llvm/test/tools/dsymutil/X86/DWARFLinkerParallel/odr-fwd-declaration.test
index d028194..fd15ce3 100644
--- a/llvm/test/tools/dsymutil/X86/DWARFLinkerParallel/odr-fwd-declaration.test
+++ b/llvm/test/tools/dsymutil/X86/DWARFLinkerParallel/odr-fwd-declaration.test
@@ -35,14 +35,14 @@ void foo() { Sptrptr ptr1 = 0; }
// CHECK: DW_TAG_member
// CHECK-NEXT: DW_AT_name{{.*}}"field"
-// CHECK: 0x[[TYPEDEF_PTR_S]]: DW_TAG_typedef
-// CHECK-NEXT: DW_AT_type{{.*}}{0x[[PTR_S]]} "S *"
-// CHECK-NEXT: DW_AT_name{{.*}}"Sptr"
-
// CHECK: 0x[[TYPEDEF_PTR_PTR_S:[a-f0-9]*]]: DW_TAG_typedef
// CHECK-NEXT: DW_AT_type{{.*}}{0x[[PTR_PTR_S]]} "Sptr *"
// CHECK-NEXT: DW_AT_name{{.*}}"Sptrptr"
+// CHECK: 0x[[TYPEDEF_PTR_S]]: DW_TAG_typedef
+// CHECK-NEXT: DW_AT_type{{.*}}{0x[[PTR_S]]} "S *"
+// CHECK-NEXT: DW_AT_name{{.*}}"Sptr"
+
// First we confirm that first compile unit properly references type.
//
// CHECK: DW_TAG_compile_unit
diff --git a/llvm/test/tools/llvm-cas/Inputs/oneline b/llvm/test/tools/llvm-cas/Inputs/oneline
new file mode 100644
index 0000000..d95f3ad
--- /dev/null
+++ b/llvm/test/tools/llvm-cas/Inputs/oneline
@@ -0,0 +1 @@
+content
diff --git a/llvm/test/tools/llvm-cas/Inputs/oneline-nonewline b/llvm/test/tools/llvm-cas/Inputs/oneline-nonewline
new file mode 100644
index 0000000..6b584e8
--- /dev/null
+++ b/llvm/test/tools/llvm-cas/Inputs/oneline-nonewline
@@ -0,0 +1 @@
+content \ No newline at end of file
diff --git a/llvm/test/tools/llvm-cas/action-cache.test b/llvm/test/tools/llvm-cas/action-cache.test
new file mode 100644
index 0000000..fcb212c
--- /dev/null
+++ b/llvm/test/tools/llvm-cas/action-cache.test
@@ -0,0 +1,14 @@
+RUN: rm -rf %t %t.cas
+RUN: mkdir %t
+
+RUN: llvm-cas --cas %t.cas --make-blob \
+RUN: --data %S/Inputs/oneline >%t/oneline.casid
+RUN: llvm-cas --cas %t.cas --make-blob \
+RUN: --data %S/Inputs/oneline-nonewline >%t/oneline-nonewline.casid
+
+RUN: llvm-cas --cas %t.cas --put-cache-key @%t/oneline.casid @%t/oneline-nonewline.casid
+RUN: llvm-cas --cas %t.cas --get-cache-result @%t/oneline.casid > %t/result.casid
+RUN: diff %t/oneline-nonewline.casid %t/result.casid
+
+RUN: not llvm-cas --cas %t.cas --get-cache-result @%t/oneline-nonewline.casid 2>&1 | FileCheck %s
+CHECK: result not found
diff --git a/llvm/test/tools/llvm-cas/cache.test b/llvm/test/tools/llvm-cas/cache.test
new file mode 100644
index 0000000..f0ce691
--- /dev/null
+++ b/llvm/test/tools/llvm-cas/cache.test
@@ -0,0 +1,14 @@
+RUN: rm -rf %t %t.cas
+RUN: mkdir %t
+
+RUN: llvm-cas --cas %t.cas --make-blob \
+RUN: --data /dev/null > %t/empty.casid
+RUN: echo "abc" | \
+RUN: llvm-cas --cas %t.cas --make-blob \
+RUN: --data - >%t/abc.casid
+
+RUN: llvm-cas --cas %t.cas --put-cache-key @%t/abc.casid @%t/empty.casid
+RUN: llvm-cas --cas %t.cas --get-cache-result @%t/abc.casid > %t/empty2.casid
+RUN: diff %t/empty.casid %t/empty2.casid
+
+RUN: not llvm-cas --cas %t.cas --get-cache-result @%t/empty.casid
diff --git a/llvm/test/tools/llvm-cas/dump.test b/llvm/test/tools/llvm-cas/dump.test
new file mode 100644
index 0000000..f23bac6c
--- /dev/null
+++ b/llvm/test/tools/llvm-cas/dump.test
@@ -0,0 +1,27 @@
+RUN: rm -rf %t
+RUN: mkdir %t
+
+RUN: llvm-cas --cas %t/cas --make-blob \
+RUN: --data - </dev/null
+
+RUN: llvm-cas --cas %t/cas --make-blob \
+RUN: --data %s
+
+RUN: llvm-cas --cas %t/cas --dump | FileCheck %s
+
+// Check the dump format.
+CHECK: index:
+CHECK-NEXT: hash-num-bits=
+CHECK-NEXT: root addr=
+// It should have at least one index.
+CHECK-NEXT: - index=
+
+// Two records, one per ingested blob.
+CHECK: record
+CHECK-NEXT: - addr=
+CHECK-NEXT: - addr=
+
+// Both should be small enough to live in the data pool.
+CHECK: pool:
+CHECK-NEXT: - addr=
+CHECK-NEXT: - addr=
diff --git a/llvm/test/tools/llvm-cas/lit.local.cfg b/llvm/test/tools/llvm-cas/lit.local.cfg
new file mode 100644
index 0000000..379945b
--- /dev/null
+++ b/llvm/test/tools/llvm-cas/lit.local.cfg
@@ -0,0 +1,2 @@
+if not config.have_ondisk_cas:
+ config.unsupported = True
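This directory-wide guard complements the ondisk_cas feature added to
lit.cfg.py earlier in this patch; an individual test elsewhere in the tree
could opt in with a standard lit requirement instead (a minimal sketch, not
part of this patch):

  REQUIRES: ondisk_cas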
diff --git a/llvm/test/tools/llvm-cas/make-blob.test b/llvm/test/tools/llvm-cas/make-blob.test
new file mode 100644
index 0000000..532a3a3
--- /dev/null
+++ b/llvm/test/tools/llvm-cas/make-blob.test
@@ -0,0 +1,41 @@
+RUN: rm -rf %t %t.cas
+RUN: mkdir %t
+
+RUN: llvm-cas --cas %t.cas --make-blob \
+RUN: --data - </dev/null >%t/empty.casid
+RUN: sed -e 's,^.,CHECK: ,' <%t/empty.casid >%t/empty.check
+RUN: llvm-cas --cas %t.cas --make-blob \
+RUN: --data /dev/null | FileCheck %t/empty.check
+RUN: echo "abc" | \
+RUN: llvm-cas --cas %t.cas --make-blob \
+RUN: --data - >%t/abc.casid
+RUN: llvm-cas --cas %t.cas --make-blob \
+RUN: --data %S/Inputs/oneline >%t/oneline.casid
+RUN: llvm-cas --cas %t.cas --make-blob \
+RUN: --data %S/Inputs/oneline-nonewline >%t/oneline-nonewline.casid
+
+RUN: llvm-cas --cas %t.cas --cat-node-data @%t/empty.casid |\
+RUN: FileCheck %s -check-prefix CHECK-EMPTY -allow-empty
+CHECK-EMPTY-NOT: {{.}}
+
+RUN: llvm-cas --cas %t.cas --cat-node-data @%t/abc.casid |\
+RUN: FileCheck %s -check-prefix CHECK-ABC
+CHECK-ABC: abc
+
+RUN: llvm-cas --cas %t.cas --cat-node-data @%t/oneline-nonewline.casid |\
+RUN: FileCheck %s -check-prefix CHECK-ONELINE
+RUN: llvm-cas --cas %t.cas --cat-node-data @%t/oneline.casid |\
+RUN: FileCheck %s -check-prefix CHECK-ONELINE
+CHECK-ONELINE: content
+
+# Double-check newlines.
+RUN: llvm-cas --cas %t.cas --cat-node-data @%t/oneline-nonewline.casid \
+RUN: >%t/oneline-nonewline
+RUN: diff %S/Inputs/oneline-nonewline %t/oneline-nonewline
+RUN: llvm-cas --cas %t.cas --cat-node-data @%t/oneline.casid \
+RUN: >%t/oneline
+RUN: diff %S/Inputs/oneline %t/oneline
+
+# Validate
+RUN: llvm-cas --cas %t.cas --validate-object @%t/oneline-nonewline.casid
+RUN: llvm-cas --cas %t.cas --validate-object @%t/oneline.casid
diff --git a/llvm/test/tools/llvm-cas/make-node.test b/llvm/test/tools/llvm-cas/make-node.test
new file mode 100644
index 0000000..de548af
--- /dev/null
+++ b/llvm/test/tools/llvm-cas/make-node.test
@@ -0,0 +1,37 @@
+RUN: rm -rf %t
+RUN: mkdir %t
+
+# Make some empty objects.
+RUN: llvm-cas --cas %t/cas --make-node \
+RUN: --data - </dev/null >%t/empty.casid
+
+RUN: llvm-cas --cas %t/cas --cat-node-data @%t/empty.casid |\
+RUN: FileCheck %s -check-prefix CHECK-EMPTY -allow-empty
+RUN: llvm-cas --cas %t/cas --ls-node-refs @%t/empty.casid |\
+RUN: FileCheck %s -check-prefix CHECK-EMPTY -allow-empty
+CHECK-EMPTY-NOT: {{.}}
+
+# Make a complex object that references existing ones: a blob plus other
+# objects, with one of them referenced twice to be sure duplicate references
+# don't get deduped.
+RUN: llvm-cas --cas %t/cas --make-blob --data /dev/null \
+RUN: >%t/empty-blob.casid
+RUN: cat %t/empty.casid %t/empty.casid %t/empty-blob.casid \
+RUN: >%t/complex.refs
+RUN: cat %t/complex.refs | sed -e 's,^.,CHECK: ,' > %t/complex.check
+RUN: llvm-cas --cas %t/cas --make-node \
+RUN: --data %S/Inputs/oneline @%t/complex.refs \
+RUN: >%t/complex.casid
+RUN: llvm-cas --cas %t/cas --cat-node-data \
+RUN: @%t/complex.casid | FileCheck %s -check-prefix COMPLEX-DATA
+RUN: llvm-cas --cas %t/cas --ls-node-refs @%t/complex.casid |\
+RUN: FileCheck %t/complex.check
+COMPLEX-DATA: content
+
+RUN: llvm-cas --cas %t/cas --validate-object @%t/complex.casid
+
+# Import into a new CAS from the existing one.
+RUN: llvm-cas --cas %t/new-cas --upstream-cas %t/cas --import @%t/complex.casid
+RUN: llvm-cas --cas %t/new-cas --cat-node-data \
+RUN: @%t/complex.casid | FileCheck %s -check-prefix COMPLEX-DATA
+RUN: llvm-cas --cas %t/new-cas --validate
diff --git a/llvm/test/tools/llvm-cas/print-id.test b/llvm/test/tools/llvm-cas/print-id.test
new file mode 100644
index 0000000..5a2efd5
--- /dev/null
+++ b/llvm/test/tools/llvm-cas/print-id.test
@@ -0,0 +1,13 @@
+RUN: rm -rf %t
+RUN: mkdir %t
+
+RUN: llvm-cas --cas %t/cas --make-blob --data %S/Inputs/oneline > %t/id
+
+# Confirm that the ID has the right prefix and is well-formed, and that
+# there's nothing else on the line.
+RUN: FileCheck %s --match-full-lines --strict-whitespace <%t/id
+CHECK:llvmcas://{{[a-z0-9]+}}
+
+# Confirm that a trailing newline follows the ID.
+RUN: wc -l <%t/id | FileCheck %s -check-prefix=NEWLINE
+NEWLINE: 1
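The printed llvmcas:// string is the object's content-addressed ID and can be
fed back to any command that names an object, either inline or through the
@file indirection used throughout these tests (a sketch reusing the flags
exercised in make-blob.test above; the blob was made from Inputs/oneline,
whose data is "content"):

  RUN: llvm-cas --cas %t/cas --cat-node-data @%t/id | FileCheck %s -check-prefix DATA
  DATA: content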
diff --git a/llvm/test/tools/llvm-cas/validation.test b/llvm/test/tools/llvm-cas/validation.test
new file mode 100644
index 0000000..13f24f0
--- /dev/null
+++ b/llvm/test/tools/llvm-cas/validation.test
@@ -0,0 +1,31 @@
+RUN: rm -rf %t
+RUN: mkdir %t
+
+# Ingest a blob that just fits inside the CAS data pool to make sure validation passes.
+RUN: truncate -s 7 %t/file
+RUN: cat %t/file | \
+RUN: llvm-cas --cas %t/cas --make-blob \
+RUN: --data -
+RUN: llvm-cas --cas %t/cas --validate --check-hash
+
+RUN: llvm-cas --cas %t/cas --validate
+RUN: llvm-cas --cas %t/cas --validate --check-hash
+
+RUN: rm %t/cas/v1.1/data.v1
+RUN: not llvm-cas --cas %t/cas --validate
+RUN: not llvm-cas --cas %t/cas --validate --check-hash
+
+RUN: mkdir %t/ac
+
+RUN: llvm-cas --cas %t/ac --make-blob \
+RUN: --data /dev/null > %t/empty.casid
+RUN: echo "abc" | \
+RUN: llvm-cas --cas %t/ac --make-blob \
+RUN: --data - >%t/abc.casid
+
+RUN: llvm-cas --cas %t/ac --put-cache-key @%t/abc.casid @%t/empty.casid
+RUN: llvm-cas --cas %t/ac --validate
+# Note: records are 40 bytes (32 hash bytes + an 8-byte value), so trim the
+# last allocated record, leaving it invalid.
+RUN: truncate -s -40 %t/ac/v1.1/actions.v1
+RUN: not llvm-cas --cas %t/ac --validate