diff options
author | Matt Arsenault <Matthew.Arsenault@amd.com> | 2023-06-20 06:19:08 -0400 |
---|---|---|
committer | Matt Arsenault <Matthew.Arsenault@amd.com> | 2023-07-25 07:54:11 -0400 |
commit | 47b3ada432f8afee9723a4b3d27b3efbef34dedf (patch) | |
tree | 151f331eecc6640c66d5ea70030d494cd0067c61 /llvm | |
parent | 71be91eba96d80d15689e4f516141c533c3c086d (diff) | |
download | llvm-47b3ada432f8afee9723a4b3d27b3efbef34dedf.zip llvm-47b3ada432f8afee9723a4b3d27b3efbef34dedf.tar.gz llvm-47b3ada432f8afee9723a4b3d27b3efbef34dedf.tar.bz2 |
AMDGPU: Add more sqrt f64 lowering tests
Almost all permutations of the flags are potentially relevant.
Diffstat (limited to 'llvm')
-rw-r--r-- | llvm/test/CodeGen/AMDGPU/fsqrt.f64.ll | 218 | ||||
-rw-r--r-- | llvm/test/CodeGen/AMDGPU/rsq.f64.ll | 2812 |
2 files changed, 2727 insertions, 303 deletions
diff --git a/llvm/test/CodeGen/AMDGPU/fsqrt.f64.ll b/llvm/test/CodeGen/AMDGPU/fsqrt.f64.ll index dbf3871..6239393 100644 --- a/llvm/test/CodeGen/AMDGPU/fsqrt.f64.ll +++ b/llvm/test/CodeGen/AMDGPU/fsqrt.f64.ll @@ -1,9 +1,9 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -global-isel=0 -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s -; RUN: llc -global-isel=0 -march=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s +; RUN: llc -global-isel=0 -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,SDAG %s +; RUN: llc -global-isel=0 -march=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,SDAG %s -; RUN: llc -global-isel=1 -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s -; RUN: llc -global-isel=1 -march=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s +; RUN: llc -global-isel=1 -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GISEL %s +; RUN: llc -global-isel=1 -march=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GISEL %s define double @v_sqrt_f64(double %x) { ; GCN-LABEL: v_sqrt_f64: @@ -115,9 +115,219 @@ define amdgpu_ps <2 x i32> @s_sqrt_f64_ninf(double inreg %x) { ret <2 x i32> %insert.1 } +define amdgpu_ps <2 x i32> @s_sqrt_f64_afn(double inreg %x) { +; GCN-LABEL: s_sqrt_f64_afn: +; GCN: ; %bb.0: +; GCN-NEXT: v_sqrt_f64_e32 v[0:1], s[0:1] +; GCN-NEXT: v_readfirstlane_b32 s0, v0 +; GCN-NEXT: v_readfirstlane_b32 s1, v1 +; GCN-NEXT: ; return to shader part epilog + %result = call afn double @llvm.sqrt.f64(double %x) + %cast = bitcast double %result to <2 x i32> + %cast.0 = extractelement <2 x i32> %cast, i32 0 + %cast.1 = extractelement <2 x i32> %cast, i32 1 + %lane.0 = call i32 @llvm.amdgcn.readfirstlane(i32 %cast.0) + %lane.1 = call i32 @llvm.amdgcn.readfirstlane(i32 %cast.1) + %insert.0 = insertelement <2 x i32> poison, i32 %lane.0, i32 0 + %insert.1 = insertelement <2 x i32> %insert.0, i32 %lane.1, i32 1 + ret <2 x i32> %insert.1 +} + +define amdgpu_ps <2 x i32> @s_sqrt_f64_afn_nnan_ninf(double inreg %x) { +; GCN-LABEL: s_sqrt_f64_afn_nnan_ninf: +; GCN: ; %bb.0: +; GCN-NEXT: v_sqrt_f64_e32 v[0:1], s[0:1] +; GCN-NEXT: v_readfirstlane_b32 s0, v0 +; GCN-NEXT: v_readfirstlane_b32 s1, v1 +; GCN-NEXT: ; return to shader part epilog + %result = call afn nnan ninf double @llvm.sqrt.f64(double %x) + %cast = bitcast double %result to <2 x i32> + %cast.0 = extractelement <2 x i32> %cast, i32 0 + %cast.1 = extractelement <2 x i32> %cast, i32 1 + %lane.0 = call i32 @llvm.amdgcn.readfirstlane(i32 %cast.0) + %lane.1 = call i32 @llvm.amdgcn.readfirstlane(i32 %cast.1) + %insert.0 = insertelement <2 x i32> poison, i32 %lane.0, i32 0 + %insert.1 = insertelement <2 x i32> %insert.0, i32 %lane.1, i32 1 + ret <2 x i32> %insert.1 +} + +define double @v_sqrt_f64_nsz(double %x) { +; GCN-LABEL: v_sqrt_f64_nsz: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_sqrt_f64_e32 v[0:1], v[0:1] +; GCN-NEXT: s_setpc_b64 s[30:31] + %result = call nsz double @llvm.sqrt.f64(double %x) + ret double %result +} + +define double @v_sqrt_f64_nnan_ninf(double %x) { +; GCN-LABEL: v_sqrt_f64_nnan_ninf: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_sqrt_f64_e32 v[0:1], v[0:1] +; GCN-NEXT: s_setpc_b64 s[30:31] + %result = call nnan ninf double @llvm.sqrt.f64(double %x) + ret double %result +} + +define double @v_sqrt_f64_nnan_ninf_nsz(double %x) { +; GCN-LABEL: v_sqrt_f64_nnan_ninf_nsz: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_sqrt_f64_e32 v[0:1], v[0:1] +; GCN-NEXT: s_setpc_b64 s[30:31] + %result = call nnan ninf nsz double @llvm.sqrt.f64(double %x) + ret double %result +} + +define double @v_sqrt_f64_afn(double %x) { +; GCN-LABEL: v_sqrt_f64_afn: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_sqrt_f64_e32 v[0:1], v[0:1] +; GCN-NEXT: s_setpc_b64 s[30:31] + %result = call afn double @llvm.sqrt.f64(double %x) + ret double %result +} + +define double @v_sqrt_f64_afn_nsz(double %x) { +; GCN-LABEL: v_sqrt_f64_afn_nsz: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_sqrt_f64_e32 v[0:1], v[0:1] +; GCN-NEXT: s_setpc_b64 s[30:31] + %result = call afn nsz double @llvm.sqrt.f64(double %x) + ret double %result +} + +define <2 x double> @v_sqrt_v2f64_afn(<2 x double> %x) { +; GCN-LABEL: v_sqrt_v2f64_afn: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_sqrt_f64_e32 v[0:1], v[0:1] +; GCN-NEXT: v_sqrt_f64_e32 v[2:3], v[2:3] +; GCN-NEXT: s_setpc_b64 s[30:31] + %result = call afn <2 x double> @llvm.sqrt.v2f64(<2 x double> %x) + ret <2 x double> %result +} + +define double @v_sqrt_f64_afn_nnan(double %x) { +; GCN-LABEL: v_sqrt_f64_afn_nnan: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_sqrt_f64_e32 v[0:1], v[0:1] +; GCN-NEXT: s_setpc_b64 s[30:31] + %result = call afn nnan double @llvm.sqrt.f64(double %x) + ret double %result +} + +define double @v_sqrt_f64_fabs_afn_ninf(double %x) { +; GCN-LABEL: v_sqrt_f64_fabs_afn_ninf: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_sqrt_f64_e64 v[0:1], |v[0:1]| +; GCN-NEXT: s_setpc_b64 s[30:31] + %fabs = call double @llvm.fabs.f64(double %x) + %result = call afn ninf double @llvm.sqrt.f64(double %fabs) + ret double %result +} + +define double @v_sqrt_f64_afn_nnan_ninf(double %x) { +; GCN-LABEL: v_sqrt_f64_afn_nnan_ninf: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_sqrt_f64_e32 v[0:1], v[0:1] +; GCN-NEXT: s_setpc_b64 s[30:31] + %result = call afn nnan ninf double @llvm.sqrt.f64(double %x) + ret double %result +} + +define <2 x double> @v_sqrt_v2f64_afn_nnan_ninf(<2 x double> %x) { +; GCN-LABEL: v_sqrt_v2f64_afn_nnan_ninf: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_sqrt_f64_e32 v[0:1], v[0:1] +; GCN-NEXT: v_sqrt_f64_e32 v[2:3], v[2:3] +; GCN-NEXT: s_setpc_b64 s[30:31] + %result = call afn nnan ninf <2 x double> @llvm.sqrt.v2f64(<2 x double> %x) + ret <2 x double> %result +} + +define double @v_sqrt_f64_afn_nnan_ninf_nsz(double %x) { +; GCN-LABEL: v_sqrt_f64_afn_nnan_ninf_nsz: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_sqrt_f64_e32 v[0:1], v[0:1] +; GCN-NEXT: s_setpc_b64 s[30:31] + %result = call afn nnan ninf nsz double @llvm.sqrt.f64(double %x) + ret double %result +} + +define double @v_sqrt_f64__approx_func_fp_math(double %x) #2 { +; GCN-LABEL: v_sqrt_f64__approx_func_fp_math: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_sqrt_f64_e32 v[0:1], v[0:1] +; GCN-NEXT: s_setpc_b64 s[30:31] + %result = call nsz double @llvm.sqrt.f64(double %x) + ret double %result +} + +define double @v_sqrt_f64__enough_unsafe_attrs(double %x) #3 { +; GCN-LABEL: v_sqrt_f64__enough_unsafe_attrs: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_sqrt_f64_e32 v[0:1], v[0:1] +; GCN-NEXT: s_setpc_b64 s[30:31] + %result = call nsz double @llvm.sqrt.f64(double %x) + ret double %result +} + +define double @v_sqrt_f64__unsafe_attr(double %x) #4 { +; GCN-LABEL: v_sqrt_f64__unsafe_attr: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_sqrt_f64_e32 v[0:1], v[0:1] +; GCN-NEXT: s_setpc_b64 s[30:31] + %result = call nsz double @llvm.sqrt.f64(double %x) + ret double %result +} + +define <2 x double> @v_sqrt_v2f64(<2 x double> %x) { +; GCN-LABEL: v_sqrt_v2f64: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_sqrt_f64_e32 v[0:1], v[0:1] +; GCN-NEXT: v_sqrt_f64_e32 v[2:3], v[2:3] +; GCN-NEXT: s_setpc_b64 s[30:31] + %result = call <2 x double> @llvm.sqrt.v2f64(<2 x double> %x) + ret <2 x double> %result +} + +define <3 x double> @v_sqrt_v3f64(<3 x double> %x) { +; GCN-LABEL: v_sqrt_v3f64: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_sqrt_f64_e32 v[0:1], v[0:1] +; GCN-NEXT: v_sqrt_f64_e32 v[2:3], v[2:3] +; GCN-NEXT: v_sqrt_f64_e32 v[4:5], v[4:5] +; GCN-NEXT: s_setpc_b64 s[30:31] + %result = call <3 x double> @llvm.sqrt.v3f64(<3 x double> %x) + ret <3 x double> %result +} + declare double @llvm.fabs.f64(double) #0 declare double @llvm.sqrt.f64(double) #0 +declare <2 x double> @llvm.sqrt.v2f64(<2 x double>) #0 +declare <3 x double> @llvm.sqrt.v3f64(<3 x double>) #0 declare i32 @llvm.amdgcn.readfirstlane(i32) #1 attributes #0 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) } attributes #1 = { convergent nounwind willreturn memory(none) } +attributes #2 = { "approx-func-fp-math"="true" } +attributes #3 = { "approx-func-fp-math"="true" "no-nans-fp-math"="true" "no-infs-fp-math"="true" } +attributes #4 = { "unsafe-fp-math"="true" } +;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: +; GISEL: {{.*}} +; SDAG: {{.*}} diff --git a/llvm/test/CodeGen/AMDGPU/rsq.f64.ll b/llvm/test/CodeGen/AMDGPU/rsq.f64.ll index 93a8869..a20aaac 100644 --- a/llvm/test/CodeGen/AMDGPU/rsq.f64.ll +++ b/llvm/test/CodeGen/AMDGPU/rsq.f64.ll @@ -1,302 +1,2516 @@ -; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 3 -; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tahiti -enable-unsafe-fp-math < %s | FileCheck -check-prefixes=GCN,GCN-UNSAFE,SI,SI-UNSAFE %s -; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tahiti < %s | FileCheck -check-prefixes=GCN,GCN-SAFE,SI,SI-SAFE %s - -; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=hawaii -enable-unsafe-fp-math < %s | FileCheck -check-prefixes=GCN,GCN-UNSAFE,CI,CI-UNSAFE %s -; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=hawaii < %s | FileCheck -check-prefixes=GCN,GCN-SAFE,CI,CI-SAFE %s - -declare i32 @llvm.amdgcn.workitem.id.x() nounwind readnone -declare double @llvm.sqrt.f64(double) nounwind readnone - - -define amdgpu_kernel void @rsq_f64(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) #0 { -; GCN-UNSAFE-LABEL: rsq_f64: -; GCN-UNSAFE: ; %bb.0: -; GCN-UNSAFE-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 -; GCN-UNSAFE-NEXT: s_mov_b32 s7, 0xf000 -; GCN-UNSAFE-NEXT: s_mov_b32 s6, -1 -; GCN-UNSAFE-NEXT: s_mov_b32 s10, s6 -; GCN-UNSAFE-NEXT: s_mov_b32 s11, s7 -; GCN-UNSAFE-NEXT: s_waitcnt lgkmcnt(0) -; GCN-UNSAFE-NEXT: s_mov_b32 s8, s2 -; GCN-UNSAFE-NEXT: s_mov_b32 s9, s3 -; GCN-UNSAFE-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0 -; GCN-UNSAFE-NEXT: s_mov_b32 s4, s0 -; GCN-UNSAFE-NEXT: s_mov_b32 s5, s1 -; GCN-UNSAFE-NEXT: s_waitcnt vmcnt(0) -; GCN-UNSAFE-NEXT: v_sqrt_f64_e32 v[0:1], v[0:1] -; GCN-UNSAFE-NEXT: v_rcp_f64_e32 v[2:3], v[0:1] -; GCN-UNSAFE-NEXT: v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0 -; GCN-UNSAFE-NEXT: v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3] -; GCN-UNSAFE-NEXT: v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0 -; GCN-UNSAFE-NEXT: v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3] -; GCN-UNSAFE-NEXT: v_fma_f64 v[0:1], -v[0:1], v[2:3], 1.0 -; GCN-UNSAFE-NEXT: v_fma_f64 v[0:1], v[0:1], v[2:3], v[2:3] -; GCN-UNSAFE-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 -; GCN-UNSAFE-NEXT: s_endpgm -; -; SI-SAFE-LABEL: rsq_f64: -; SI-SAFE: ; %bb.0: -; SI-SAFE-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; SI-SAFE-NEXT: s_mov_b32 s3, 0xf000 -; SI-SAFE-NEXT: s_mov_b32 s2, -1 -; SI-SAFE-NEXT: s_mov_b32 s10, s2 -; SI-SAFE-NEXT: s_mov_b32 s11, s3 -; SI-SAFE-NEXT: s_waitcnt lgkmcnt(0) -; SI-SAFE-NEXT: s_mov_b32 s8, s6 -; SI-SAFE-NEXT: s_mov_b32 s9, s7 -; SI-SAFE-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0 -; SI-SAFE-NEXT: s_waitcnt vmcnt(0) -; SI-SAFE-NEXT: v_sqrt_f64_e32 v[0:1], v[0:1] -; SI-SAFE-NEXT: v_div_scale_f64 v[2:3], s[0:1], v[0:1], v[0:1], 1.0 -; SI-SAFE-NEXT: v_rcp_f64_e32 v[4:5], v[2:3] -; SI-SAFE-NEXT: v_cmp_eq_u32_e32 vcc, v1, v3 -; SI-SAFE-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0 -; SI-SAFE-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5] -; SI-SAFE-NEXT: v_div_scale_f64 v[6:7], s[0:1], 1.0, v[0:1], 1.0 -; SI-SAFE-NEXT: v_fma_f64 v[8:9], -v[2:3], v[4:5], 1.0 -; SI-SAFE-NEXT: s_mov_b32 s0, 0x3ff00000 -; SI-SAFE-NEXT: v_fma_f64 v[4:5], v[4:5], v[8:9], v[4:5] -; SI-SAFE-NEXT: v_cmp_eq_u32_e64 s[0:1], s0, v7 -; SI-SAFE-NEXT: v_mul_f64 v[8:9], v[6:7], v[4:5] -; SI-SAFE-NEXT: s_xor_b64 vcc, s[0:1], vcc -; SI-SAFE-NEXT: v_fma_f64 v[2:3], -v[2:3], v[8:9], v[6:7] -; SI-SAFE-NEXT: s_mov_b32 s0, s4 -; SI-SAFE-NEXT: v_div_fmas_f64 v[2:3], v[2:3], v[4:5], v[8:9] -; SI-SAFE-NEXT: s_mov_b32 s1, s5 -; SI-SAFE-NEXT: v_div_fixup_f64 v[0:1], v[2:3], v[0:1], 1.0 -; SI-SAFE-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 -; SI-SAFE-NEXT: s_endpgm -; -; CI-SAFE-LABEL: rsq_f64: -; CI-SAFE: ; %bb.0: -; CI-SAFE-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 -; CI-SAFE-NEXT: s_mov_b32 s7, 0xf000 -; CI-SAFE-NEXT: s_mov_b32 s6, -1 -; CI-SAFE-NEXT: s_mov_b32 s10, s6 -; CI-SAFE-NEXT: s_mov_b32 s11, s7 -; CI-SAFE-NEXT: s_waitcnt lgkmcnt(0) -; CI-SAFE-NEXT: s_mov_b32 s8, s2 -; CI-SAFE-NEXT: s_mov_b32 s9, s3 -; CI-SAFE-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0 -; CI-SAFE-NEXT: s_mov_b32 s4, s0 -; CI-SAFE-NEXT: s_mov_b32 s5, s1 -; CI-SAFE-NEXT: s_waitcnt vmcnt(0) -; CI-SAFE-NEXT: v_sqrt_f64_e32 v[0:1], v[0:1] -; CI-SAFE-NEXT: v_div_scale_f64 v[2:3], s[2:3], v[0:1], v[0:1], 1.0 -; CI-SAFE-NEXT: v_div_scale_f64 v[8:9], vcc, 1.0, v[0:1], 1.0 -; CI-SAFE-NEXT: v_rcp_f64_e32 v[4:5], v[2:3] -; CI-SAFE-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0 -; CI-SAFE-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5] -; CI-SAFE-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0 -; CI-SAFE-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5] -; CI-SAFE-NEXT: v_mul_f64 v[6:7], v[8:9], v[4:5] -; CI-SAFE-NEXT: v_fma_f64 v[2:3], -v[2:3], v[6:7], v[8:9] -; CI-SAFE-NEXT: v_div_fmas_f64 v[2:3], v[2:3], v[4:5], v[6:7] -; CI-SAFE-NEXT: v_div_fixup_f64 v[0:1], v[2:3], v[0:1], 1.0 -; CI-SAFE-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 -; CI-SAFE-NEXT: s_endpgm - %val = load double, ptr addrspace(1) %in, align 4 - %sqrt = call double @llvm.sqrt.f64(double %val) nounwind readnone - %div = fdiv double 1.0, %sqrt - store double %div, ptr addrspace(1) %out, align 4 - ret void -} - -define amdgpu_kernel void @neg_rsq_f64(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) #0 { -; GCN-UNSAFE-LABEL: neg_rsq_f64: -; GCN-UNSAFE: ; %bb.0: -; GCN-UNSAFE-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 -; GCN-UNSAFE-NEXT: s_mov_b32 s7, 0xf000 -; GCN-UNSAFE-NEXT: s_mov_b32 s6, -1 -; GCN-UNSAFE-NEXT: s_mov_b32 s10, s6 -; GCN-UNSAFE-NEXT: s_mov_b32 s11, s7 -; GCN-UNSAFE-NEXT: s_waitcnt lgkmcnt(0) -; GCN-UNSAFE-NEXT: s_mov_b32 s8, s2 -; GCN-UNSAFE-NEXT: s_mov_b32 s9, s3 -; GCN-UNSAFE-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0 -; GCN-UNSAFE-NEXT: s_mov_b32 s4, s0 -; GCN-UNSAFE-NEXT: s_mov_b32 s5, s1 -; GCN-UNSAFE-NEXT: s_waitcnt vmcnt(0) -; GCN-UNSAFE-NEXT: v_sqrt_f64_e32 v[0:1], v[0:1] -; GCN-UNSAFE-NEXT: v_rcp_f64_e32 v[2:3], v[0:1] -; GCN-UNSAFE-NEXT: v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0 -; GCN-UNSAFE-NEXT: v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3] -; GCN-UNSAFE-NEXT: v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0 -; GCN-UNSAFE-NEXT: v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3] -; GCN-UNSAFE-NEXT: v_mul_f64 v[4:5], v[2:3], -1.0 -; GCN-UNSAFE-NEXT: v_fma_f64 v[0:1], -v[0:1], v[4:5], -1.0 -; GCN-UNSAFE-NEXT: v_fma_f64 v[0:1], v[0:1], v[2:3], v[4:5] -; GCN-UNSAFE-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 -; GCN-UNSAFE-NEXT: s_endpgm -; -; SI-SAFE-LABEL: neg_rsq_f64: -; SI-SAFE: ; %bb.0: -; SI-SAFE-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; SI-SAFE-NEXT: s_mov_b32 s3, 0xf000 -; SI-SAFE-NEXT: s_mov_b32 s2, -1 -; SI-SAFE-NEXT: s_mov_b32 s10, s2 -; SI-SAFE-NEXT: s_mov_b32 s11, s3 -; SI-SAFE-NEXT: s_waitcnt lgkmcnt(0) -; SI-SAFE-NEXT: s_mov_b32 s8, s6 -; SI-SAFE-NEXT: s_mov_b32 s9, s7 -; SI-SAFE-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0 -; SI-SAFE-NEXT: s_waitcnt vmcnt(0) -; SI-SAFE-NEXT: v_sqrt_f64_e32 v[0:1], v[0:1] -; SI-SAFE-NEXT: v_div_scale_f64 v[2:3], s[0:1], v[0:1], v[0:1], -1.0 -; SI-SAFE-NEXT: v_rcp_f64_e32 v[4:5], v[2:3] -; SI-SAFE-NEXT: v_cmp_eq_u32_e32 vcc, v1, v3 -; SI-SAFE-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0 -; SI-SAFE-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5] -; SI-SAFE-NEXT: v_div_scale_f64 v[6:7], s[0:1], -1.0, v[0:1], -1.0 -; SI-SAFE-NEXT: v_fma_f64 v[8:9], -v[2:3], v[4:5], 1.0 -; SI-SAFE-NEXT: s_mov_b32 s0, 0xbff00000 -; SI-SAFE-NEXT: v_fma_f64 v[4:5], v[4:5], v[8:9], v[4:5] -; SI-SAFE-NEXT: v_cmp_eq_u32_e64 s[0:1], s0, v7 -; SI-SAFE-NEXT: v_mul_f64 v[8:9], v[6:7], v[4:5] -; SI-SAFE-NEXT: s_xor_b64 vcc, s[0:1], vcc -; SI-SAFE-NEXT: v_fma_f64 v[2:3], -v[2:3], v[8:9], v[6:7] -; SI-SAFE-NEXT: s_mov_b32 s0, s4 -; SI-SAFE-NEXT: v_div_fmas_f64 v[2:3], v[2:3], v[4:5], v[8:9] -; SI-SAFE-NEXT: s_mov_b32 s1, s5 -; SI-SAFE-NEXT: v_div_fixup_f64 v[0:1], v[2:3], v[0:1], -1.0 -; SI-SAFE-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 -; SI-SAFE-NEXT: s_endpgm -; -; CI-SAFE-LABEL: neg_rsq_f64: -; CI-SAFE: ; %bb.0: -; CI-SAFE-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 -; CI-SAFE-NEXT: s_mov_b32 s7, 0xf000 -; CI-SAFE-NEXT: s_mov_b32 s6, -1 -; CI-SAFE-NEXT: s_mov_b32 s10, s6 -; CI-SAFE-NEXT: s_mov_b32 s11, s7 -; CI-SAFE-NEXT: s_waitcnt lgkmcnt(0) -; CI-SAFE-NEXT: s_mov_b32 s8, s2 -; CI-SAFE-NEXT: s_mov_b32 s9, s3 -; CI-SAFE-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0 -; CI-SAFE-NEXT: s_mov_b32 s4, s0 -; CI-SAFE-NEXT: s_mov_b32 s5, s1 -; CI-SAFE-NEXT: s_waitcnt vmcnt(0) -; CI-SAFE-NEXT: v_sqrt_f64_e32 v[0:1], v[0:1] -; CI-SAFE-NEXT: v_div_scale_f64 v[2:3], s[2:3], v[0:1], v[0:1], -1.0 -; CI-SAFE-NEXT: v_div_scale_f64 v[8:9], vcc, -1.0, v[0:1], -1.0 -; CI-SAFE-NEXT: v_rcp_f64_e32 v[4:5], v[2:3] -; CI-SAFE-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0 -; CI-SAFE-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5] -; CI-SAFE-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0 -; CI-SAFE-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5] -; CI-SAFE-NEXT: v_mul_f64 v[6:7], v[8:9], v[4:5] -; CI-SAFE-NEXT: v_fma_f64 v[2:3], -v[2:3], v[6:7], v[8:9] -; CI-SAFE-NEXT: v_div_fmas_f64 v[2:3], v[2:3], v[4:5], v[6:7] -; CI-SAFE-NEXT: v_div_fixup_f64 v[0:1], v[2:3], v[0:1], -1.0 -; CI-SAFE-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 -; CI-SAFE-NEXT: s_endpgm - %val = load double, ptr addrspace(1) %in, align 4 - %sqrt = call double @llvm.sqrt.f64(double %val) - %div = fdiv double -1.0, %sqrt - store double %div, ptr addrspace(1) %out, align 4 - ret void -} - -define amdgpu_kernel void @neg_rsq_neg_f64(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) #0 { -; GCN-UNSAFE-LABEL: neg_rsq_neg_f64: -; GCN-UNSAFE: ; %bb.0: -; GCN-UNSAFE-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 -; GCN-UNSAFE-NEXT: s_mov_b32 s7, 0xf000 -; GCN-UNSAFE-NEXT: s_mov_b32 s6, -1 -; GCN-UNSAFE-NEXT: s_mov_b32 s10, s6 -; GCN-UNSAFE-NEXT: s_mov_b32 s11, s7 -; GCN-UNSAFE-NEXT: s_waitcnt lgkmcnt(0) -; GCN-UNSAFE-NEXT: s_mov_b32 s8, s2 -; GCN-UNSAFE-NEXT: s_mov_b32 s9, s3 -; GCN-UNSAFE-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0 -; GCN-UNSAFE-NEXT: s_mov_b32 s4, s0 -; GCN-UNSAFE-NEXT: s_mov_b32 s5, s1 -; GCN-UNSAFE-NEXT: s_waitcnt vmcnt(0) -; GCN-UNSAFE-NEXT: v_sqrt_f64_e64 v[0:1], -v[0:1] -; GCN-UNSAFE-NEXT: v_rcp_f64_e32 v[2:3], v[0:1] -; GCN-UNSAFE-NEXT: v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0 -; GCN-UNSAFE-NEXT: v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3] -; GCN-UNSAFE-NEXT: v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0 -; GCN-UNSAFE-NEXT: v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3] -; GCN-UNSAFE-NEXT: v_mul_f64 v[4:5], v[2:3], -1.0 -; GCN-UNSAFE-NEXT: v_fma_f64 v[0:1], -v[0:1], v[4:5], -1.0 -; GCN-UNSAFE-NEXT: v_fma_f64 v[0:1], v[0:1], v[2:3], v[4:5] -; GCN-UNSAFE-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 -; GCN-UNSAFE-NEXT: s_endpgm -; -; SI-SAFE-LABEL: neg_rsq_neg_f64: -; SI-SAFE: ; %bb.0: -; SI-SAFE-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; SI-SAFE-NEXT: s_mov_b32 s3, 0xf000 -; SI-SAFE-NEXT: s_mov_b32 s2, -1 -; SI-SAFE-NEXT: s_mov_b32 s10, s2 -; SI-SAFE-NEXT: s_mov_b32 s11, s3 -; SI-SAFE-NEXT: s_waitcnt lgkmcnt(0) -; SI-SAFE-NEXT: s_mov_b32 s8, s6 -; SI-SAFE-NEXT: s_mov_b32 s9, s7 -; SI-SAFE-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0 -; SI-SAFE-NEXT: s_waitcnt vmcnt(0) -; SI-SAFE-NEXT: v_sqrt_f64_e64 v[0:1], -v[0:1] -; SI-SAFE-NEXT: v_div_scale_f64 v[2:3], s[0:1], v[0:1], v[0:1], -1.0 -; SI-SAFE-NEXT: v_rcp_f64_e32 v[4:5], v[2:3] -; SI-SAFE-NEXT: v_cmp_eq_u32_e32 vcc, v1, v3 -; SI-SAFE-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0 -; SI-SAFE-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5] -; SI-SAFE-NEXT: v_div_scale_f64 v[6:7], s[0:1], -1.0, v[0:1], -1.0 -; SI-SAFE-NEXT: v_fma_f64 v[8:9], -v[2:3], v[4:5], 1.0 -; SI-SAFE-NEXT: s_mov_b32 s0, 0xbff00000 -; SI-SAFE-NEXT: v_fma_f64 v[4:5], v[4:5], v[8:9], v[4:5] -; SI-SAFE-NEXT: v_cmp_eq_u32_e64 s[0:1], s0, v7 -; SI-SAFE-NEXT: v_mul_f64 v[8:9], v[6:7], v[4:5] -; SI-SAFE-NEXT: s_xor_b64 vcc, s[0:1], vcc -; SI-SAFE-NEXT: v_fma_f64 v[2:3], -v[2:3], v[8:9], v[6:7] -; SI-SAFE-NEXT: s_mov_b32 s0, s4 -; SI-SAFE-NEXT: v_div_fmas_f64 v[2:3], v[2:3], v[4:5], v[8:9] -; SI-SAFE-NEXT: s_mov_b32 s1, s5 -; SI-SAFE-NEXT: v_div_fixup_f64 v[0:1], v[2:3], v[0:1], -1.0 -; SI-SAFE-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 -; SI-SAFE-NEXT: s_endpgm -; -; CI-SAFE-LABEL: neg_rsq_neg_f64: -; CI-SAFE: ; %bb.0: -; CI-SAFE-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 -; CI-SAFE-NEXT: s_mov_b32 s7, 0xf000 -; CI-SAFE-NEXT: s_mov_b32 s6, -1 -; CI-SAFE-NEXT: s_mov_b32 s10, s6 -; CI-SAFE-NEXT: s_mov_b32 s11, s7 -; CI-SAFE-NEXT: s_waitcnt lgkmcnt(0) -; CI-SAFE-NEXT: s_mov_b32 s8, s2 -; CI-SAFE-NEXT: s_mov_b32 s9, s3 -; CI-SAFE-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0 -; CI-SAFE-NEXT: s_mov_b32 s4, s0 -; CI-SAFE-NEXT: s_mov_b32 s5, s1 -; CI-SAFE-NEXT: s_waitcnt vmcnt(0) -; CI-SAFE-NEXT: v_sqrt_f64_e64 v[0:1], -v[0:1] -; CI-SAFE-NEXT: v_div_scale_f64 v[2:3], s[2:3], v[0:1], v[0:1], -1.0 -; CI-SAFE-NEXT: v_div_scale_f64 v[8:9], vcc, -1.0, v[0:1], -1.0 -; CI-SAFE-NEXT: v_rcp_f64_e32 v[4:5], v[2:3] -; CI-SAFE-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0 -; CI-SAFE-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5] -; CI-SAFE-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0 -; CI-SAFE-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5] -; CI-SAFE-NEXT: v_mul_f64 v[6:7], v[8:9], v[4:5] -; CI-SAFE-NEXT: v_fma_f64 v[2:3], -v[2:3], v[6:7], v[8:9] -; CI-SAFE-NEXT: v_div_fmas_f64 v[2:3], v[2:3], v[4:5], v[6:7] -; CI-SAFE-NEXT: v_div_fixup_f64 v[0:1], v[2:3], v[0:1], -1.0 -; CI-SAFE-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 -; CI-SAFE-NEXT: s_endpgm - %val = load double, ptr addrspace(1) %in, align 4 - %val.fneg = fneg double %val - %sqrt = call double @llvm.sqrt.f64(double %val.fneg) - %div = fdiv double -1.0, %sqrt - store double %div, ptr addrspace(1) %out, align 4 - ret void +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2 +; RUN: llc -global-isel=0 -march=amdgcn -mcpu=tahiti < %s | FileCheck -check-prefixes=GCN,SDAG,SI-SDAG %s +; RUN: llc -global-isel=1 -march=amdgcn -mcpu=tahiti < %s | FileCheck -check-prefixes=GCN,GISEL,SI-GISEL %s + +; RUN: llc -global-isel=0 -march=amdgcn -mcpu=fiji < %s | FileCheck -check-prefixes=GCN,SDAG,VI-SDAG %s +; RUN: llc -global-isel=1 -march=amdgcn -mcpu=fiji < %s | FileCheck -check-prefixes=GCN,GISEL,VI-GISEL %s + +declare i32 @llvm.amdgcn.workitem.id.x() +declare i32 @llvm.amdgcn.readfirstlane(i32) +declare double @llvm.sqrt.f64(double) +declare <2 x double> @llvm.sqrt.v2f64(<2 x double>) +declare double @llvm.amdgcn.sqrt.f64(double) +declare double @llvm.fabs.f64(double) + +define amdgpu_ps <2 x i32> @s_rsq_f64(double inreg %x) { +; SI-SDAG-LABEL: s_rsq_f64: +; SI-SDAG: ; %bb.0: +; SI-SDAG-NEXT: v_sqrt_f64_e32 v[0:1], s[0:1] +; SI-SDAG-NEXT: s_mov_b32 s2, 0x3ff00000 +; SI-SDAG-NEXT: v_div_scale_f64 v[2:3], s[0:1], v[0:1], v[0:1], 1.0 +; SI-SDAG-NEXT: v_rcp_f64_e32 v[4:5], v[2:3] +; SI-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, v1, v3 +; SI-SDAG-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0 +; SI-SDAG-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5] +; SI-SDAG-NEXT: v_div_scale_f64 v[6:7], s[0:1], 1.0, v[0:1], 1.0 +; SI-SDAG-NEXT: v_fma_f64 v[8:9], -v[2:3], v[4:5], 1.0 +; SI-SDAG-NEXT: v_fma_f64 v[4:5], v[4:5], v[8:9], v[4:5] +; SI-SDAG-NEXT: v_cmp_eq_u32_e64 s[0:1], s2, v7 +; SI-SDAG-NEXT: v_mul_f64 v[8:9], v[6:7], v[4:5] +; SI-SDAG-NEXT: s_xor_b64 vcc, s[0:1], vcc +; SI-SDAG-NEXT: v_fma_f64 v[2:3], -v[2:3], v[8:9], v[6:7] +; SI-SDAG-NEXT: v_div_fmas_f64 v[2:3], v[2:3], v[4:5], v[8:9] +; SI-SDAG-NEXT: v_div_fixup_f64 v[0:1], v[2:3], v[0:1], 1.0 +; SI-SDAG-NEXT: v_readfirstlane_b32 s0, v0 +; SI-SDAG-NEXT: v_readfirstlane_b32 s1, v1 +; SI-SDAG-NEXT: ; return to shader part epilog +; +; SI-GISEL-LABEL: s_rsq_f64: +; SI-GISEL: ; %bb.0: +; SI-GISEL-NEXT: v_sqrt_f64_e32 v[0:1], s[0:1] +; SI-GISEL-NEXT: v_mov_b32_e32 v10, 0x3ff00000 +; SI-GISEL-NEXT: v_div_scale_f64 v[2:3], s[0:1], v[0:1], v[0:1], 1.0 +; SI-GISEL-NEXT: v_div_scale_f64 v[8:9], s[0:1], 1.0, v[0:1], 1.0 +; SI-GISEL-NEXT: v_rcp_f64_e32 v[4:5], v[2:3] +; SI-GISEL-NEXT: v_cmp_eq_u32_e64 s[0:1], v1, v3 +; SI-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v10, v9 +; SI-GISEL-NEXT: s_xor_b64 vcc, vcc, s[0:1] +; SI-GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0 +; SI-GISEL-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5] +; SI-GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0 +; SI-GISEL-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5] +; SI-GISEL-NEXT: v_mul_f64 v[6:7], v[8:9], v[4:5] +; SI-GISEL-NEXT: v_fma_f64 v[2:3], -v[2:3], v[6:7], v[8:9] +; SI-GISEL-NEXT: v_div_fmas_f64 v[2:3], v[2:3], v[4:5], v[6:7] +; SI-GISEL-NEXT: v_div_fixup_f64 v[0:1], v[2:3], v[0:1], 1.0 +; SI-GISEL-NEXT: v_readfirstlane_b32 s0, v0 +; SI-GISEL-NEXT: v_readfirstlane_b32 s1, v1 +; SI-GISEL-NEXT: ; return to shader part epilog +; +; VI-SDAG-LABEL: s_rsq_f64: +; VI-SDAG: ; %bb.0: +; VI-SDAG-NEXT: v_sqrt_f64_e32 v[0:1], s[0:1] +; VI-SDAG-NEXT: v_div_scale_f64 v[2:3], s[0:1], v[0:1], v[0:1], 1.0 +; VI-SDAG-NEXT: v_rcp_f64_e32 v[4:5], v[2:3] +; VI-SDAG-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0 +; VI-SDAG-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5] +; VI-SDAG-NEXT: v_div_scale_f64 v[6:7], vcc, 1.0, v[0:1], 1.0 +; VI-SDAG-NEXT: v_fma_f64 v[8:9], -v[2:3], v[4:5], 1.0 +; VI-SDAG-NEXT: v_fma_f64 v[4:5], v[4:5], v[8:9], v[4:5] +; VI-SDAG-NEXT: v_mul_f64 v[8:9], v[6:7], v[4:5] +; VI-SDAG-NEXT: v_fma_f64 v[2:3], -v[2:3], v[8:9], v[6:7] +; VI-SDAG-NEXT: v_div_fmas_f64 v[2:3], v[2:3], v[4:5], v[8:9] +; VI-SDAG-NEXT: v_div_fixup_f64 v[0:1], v[2:3], v[0:1], 1.0 +; VI-SDAG-NEXT: v_readfirstlane_b32 s0, v0 +; VI-SDAG-NEXT: v_readfirstlane_b32 s1, v1 +; VI-SDAG-NEXT: ; return to shader part epilog +; +; VI-GISEL-LABEL: s_rsq_f64: +; VI-GISEL: ; %bb.0: +; VI-GISEL-NEXT: v_sqrt_f64_e32 v[0:1], s[0:1] +; VI-GISEL-NEXT: v_div_scale_f64 v[2:3], s[0:1], v[0:1], v[0:1], 1.0 +; VI-GISEL-NEXT: v_rcp_f64_e32 v[4:5], v[2:3] +; VI-GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0 +; VI-GISEL-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5] +; VI-GISEL-NEXT: v_div_scale_f64 v[6:7], vcc, 1.0, v[0:1], 1.0 +; VI-GISEL-NEXT: v_fma_f64 v[8:9], -v[2:3], v[4:5], 1.0 +; VI-GISEL-NEXT: v_fma_f64 v[4:5], v[4:5], v[8:9], v[4:5] +; VI-GISEL-NEXT: v_mul_f64 v[8:9], v[6:7], v[4:5] +; VI-GISEL-NEXT: v_fma_f64 v[2:3], -v[2:3], v[8:9], v[6:7] +; VI-GISEL-NEXT: v_div_fmas_f64 v[2:3], v[2:3], v[4:5], v[8:9] +; VI-GISEL-NEXT: v_div_fixup_f64 v[0:1], v[2:3], v[0:1], 1.0 +; VI-GISEL-NEXT: v_readfirstlane_b32 s0, v0 +; VI-GISEL-NEXT: v_readfirstlane_b32 s1, v1 +; VI-GISEL-NEXT: ; return to shader part epilog + %rsq = call contract double @llvm.sqrt.f64(double %x) + %result = fdiv contract double 1.0, %rsq + %cast = bitcast double %result to <2 x i32> + %cast.0 = extractelement <2 x i32> %cast, i32 0 + %cast.1 = extractelement <2 x i32> %cast, i32 1 + %lane.0 = call i32 @llvm.amdgcn.readfirstlane(i32 %cast.0) + %lane.1 = call i32 @llvm.amdgcn.readfirstlane(i32 %cast.1) + %insert.0 = insertelement <2 x i32> poison, i32 %lane.0, i32 0 + %insert.1 = insertelement <2 x i32> %insert.0, i32 %lane.1, i32 1 + ret <2 x i32> %insert.1 +} + +define amdgpu_ps <2 x i32> @s_rsq_f64_fabs(double inreg %x) { +; SI-SDAG-LABEL: s_rsq_f64_fabs: +; SI-SDAG: ; %bb.0: +; SI-SDAG-NEXT: v_sqrt_f64_e64 v[0:1], |s[0:1]| +; SI-SDAG-NEXT: s_mov_b32 s2, 0x3ff00000 +; SI-SDAG-NEXT: v_div_scale_f64 v[2:3], s[0:1], v[0:1], v[0:1], 1.0 +; SI-SDAG-NEXT: v_rcp_f64_e32 v[4:5], v[2:3] +; SI-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, v1, v3 +; SI-SDAG-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0 +; SI-SDAG-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5] +; SI-SDAG-NEXT: v_div_scale_f64 v[6:7], s[0:1], 1.0, v[0:1], 1.0 +; SI-SDAG-NEXT: v_fma_f64 v[8:9], -v[2:3], v[4:5], 1.0 +; SI-SDAG-NEXT: v_fma_f64 v[4:5], v[4:5], v[8:9], v[4:5] +; SI-SDAG-NEXT: v_cmp_eq_u32_e64 s[0:1], s2, v7 +; SI-SDAG-NEXT: v_mul_f64 v[8:9], v[6:7], v[4:5] +; SI-SDAG-NEXT: s_xor_b64 vcc, s[0:1], vcc +; SI-SDAG-NEXT: v_fma_f64 v[2:3], -v[2:3], v[8:9], v[6:7] +; SI-SDAG-NEXT: v_div_fmas_f64 v[2:3], v[2:3], v[4:5], v[8:9] +; SI-SDAG-NEXT: v_div_fixup_f64 v[0:1], v[2:3], v[0:1], 1.0 +; SI-SDAG-NEXT: v_readfirstlane_b32 s0, v0 +; SI-SDAG-NEXT: v_readfirstlane_b32 s1, v1 +; SI-SDAG-NEXT: ; return to shader part epilog +; +; SI-GISEL-LABEL: s_rsq_f64_fabs: +; SI-GISEL: ; %bb.0: +; SI-GISEL-NEXT: v_sqrt_f64_e64 v[0:1], |s[0:1]| +; SI-GISEL-NEXT: v_mov_b32_e32 v10, 0x3ff00000 +; SI-GISEL-NEXT: v_div_scale_f64 v[2:3], s[0:1], v[0:1], v[0:1], 1.0 +; SI-GISEL-NEXT: v_div_scale_f64 v[8:9], s[0:1], 1.0, v[0:1], 1.0 +; SI-GISEL-NEXT: v_rcp_f64_e32 v[4:5], v[2:3] +; SI-GISEL-NEXT: v_cmp_eq_u32_e64 s[0:1], v1, v3 +; SI-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v10, v9 +; SI-GISEL-NEXT: s_xor_b64 vcc, vcc, s[0:1] +; SI-GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0 +; SI-GISEL-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5] +; SI-GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0 +; SI-GISEL-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5] +; SI-GISEL-NEXT: v_mul_f64 v[6:7], v[8:9], v[4:5] +; SI-GISEL-NEXT: v_fma_f64 v[2:3], -v[2:3], v[6:7], v[8:9] +; SI-GISEL-NEXT: v_div_fmas_f64 v[2:3], v[2:3], v[4:5], v[6:7] +; SI-GISEL-NEXT: v_div_fixup_f64 v[0:1], v[2:3], v[0:1], 1.0 +; SI-GISEL-NEXT: v_readfirstlane_b32 s0, v0 +; SI-GISEL-NEXT: v_readfirstlane_b32 s1, v1 +; SI-GISEL-NEXT: ; return to shader part epilog +; +; VI-SDAG-LABEL: s_rsq_f64_fabs: +; VI-SDAG: ; %bb.0: +; VI-SDAG-NEXT: v_sqrt_f64_e64 v[0:1], |s[0:1]| +; VI-SDAG-NEXT: v_div_scale_f64 v[2:3], s[0:1], v[0:1], v[0:1], 1.0 +; VI-SDAG-NEXT: v_rcp_f64_e32 v[4:5], v[2:3] +; VI-SDAG-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0 +; VI-SDAG-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5] +; VI-SDAG-NEXT: v_div_scale_f64 v[6:7], vcc, 1.0, v[0:1], 1.0 +; VI-SDAG-NEXT: v_fma_f64 v[8:9], -v[2:3], v[4:5], 1.0 +; VI-SDAG-NEXT: v_fma_f64 v[4:5], v[4:5], v[8:9], v[4:5] +; VI-SDAG-NEXT: v_mul_f64 v[8:9], v[6:7], v[4:5] +; VI-SDAG-NEXT: v_fma_f64 v[2:3], -v[2:3], v[8:9], v[6:7] +; VI-SDAG-NEXT: v_div_fmas_f64 v[2:3], v[2:3], v[4:5], v[8:9] +; VI-SDAG-NEXT: v_div_fixup_f64 v[0:1], v[2:3], v[0:1], 1.0 +; VI-SDAG-NEXT: v_readfirstlane_b32 s0, v0 +; VI-SDAG-NEXT: v_readfirstlane_b32 s1, v1 +; VI-SDAG-NEXT: ; return to shader part epilog +; +; VI-GISEL-LABEL: s_rsq_f64_fabs: +; VI-GISEL: ; %bb.0: +; VI-GISEL-NEXT: v_sqrt_f64_e64 v[0:1], |s[0:1]| +; VI-GISEL-NEXT: v_div_scale_f64 v[2:3], s[0:1], v[0:1], v[0:1], 1.0 +; VI-GISEL-NEXT: v_rcp_f64_e32 v[4:5], v[2:3] +; VI-GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0 +; VI-GISEL-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5] +; VI-GISEL-NEXT: v_div_scale_f64 v[6:7], vcc, 1.0, v[0:1], 1.0 +; VI-GISEL-NEXT: v_fma_f64 v[8:9], -v[2:3], v[4:5], 1.0 +; VI-GISEL-NEXT: v_fma_f64 v[4:5], v[4:5], v[8:9], v[4:5] +; VI-GISEL-NEXT: v_mul_f64 v[8:9], v[6:7], v[4:5] +; VI-GISEL-NEXT: v_fma_f64 v[2:3], -v[2:3], v[8:9], v[6:7] +; VI-GISEL-NEXT: v_div_fmas_f64 v[2:3], v[2:3], v[4:5], v[8:9] +; VI-GISEL-NEXT: v_div_fixup_f64 v[0:1], v[2:3], v[0:1], 1.0 +; VI-GISEL-NEXT: v_readfirstlane_b32 s0, v0 +; VI-GISEL-NEXT: v_readfirstlane_b32 s1, v1 +; VI-GISEL-NEXT: ; return to shader part epilog + %fabs.x = call double @llvm.fabs.f64(double %x) + %rsq = call contract double @llvm.sqrt.f64(double %fabs.x) + %result = fdiv contract double 1.0, %rsq + %cast = bitcast double %result to <2 x i32> + %cast.0 = extractelement <2 x i32> %cast, i32 0 + %cast.1 = extractelement <2 x i32> %cast, i32 1 + %lane.0 = call i32 @llvm.amdgcn.readfirstlane(i32 %cast.0) + %lane.1 = call i32 @llvm.amdgcn.readfirstlane(i32 %cast.1) + %insert.0 = insertelement <2 x i32> poison, i32 %lane.0, i32 0 + %insert.1 = insertelement <2 x i32> %insert.0, i32 %lane.1, i32 1 + ret <2 x i32> %insert.1 +} + +define amdgpu_ps <2 x i32> @s_neg_rsq_f64(double inreg %x) { +; SI-SDAG-LABEL: s_neg_rsq_f64: +; SI-SDAG: ; %bb.0: +; SI-SDAG-NEXT: v_sqrt_f64_e32 v[0:1], s[0:1] +; SI-SDAG-NEXT: s_mov_b32 s2, 0xbff00000 +; SI-SDAG-NEXT: v_div_scale_f64 v[2:3], s[0:1], v[0:1], v[0:1], -1.0 +; SI-SDAG-NEXT: v_rcp_f64_e32 v[4:5], v[2:3] +; SI-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, v1, v3 +; SI-SDAG-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0 +; SI-SDAG-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5] +; SI-SDAG-NEXT: v_div_scale_f64 v[6:7], s[0:1], -1.0, v[0:1], -1.0 +; SI-SDAG-NEXT: v_fma_f64 v[8:9], -v[2:3], v[4:5], 1.0 +; SI-SDAG-NEXT: v_fma_f64 v[4:5], v[4:5], v[8:9], v[4:5] +; SI-SDAG-NEXT: v_cmp_eq_u32_e64 s[0:1], s2, v7 +; SI-SDAG-NEXT: v_mul_f64 v[8:9], v[6:7], v[4:5] +; SI-SDAG-NEXT: s_xor_b64 vcc, s[0:1], vcc +; SI-SDAG-NEXT: v_fma_f64 v[2:3], -v[2:3], v[8:9], v[6:7] +; SI-SDAG-NEXT: v_div_fmas_f64 v[2:3], v[2:3], v[4:5], v[8:9] +; SI-SDAG-NEXT: v_div_fixup_f64 v[0:1], v[2:3], v[0:1], -1.0 +; SI-SDAG-NEXT: v_readfirstlane_b32 s0, v0 +; SI-SDAG-NEXT: v_readfirstlane_b32 s1, v1 +; SI-SDAG-NEXT: ; return to shader part epilog +; +; SI-GISEL-LABEL: s_neg_rsq_f64: +; SI-GISEL: ; %bb.0: +; SI-GISEL-NEXT: v_sqrt_f64_e32 v[0:1], s[0:1] +; SI-GISEL-NEXT: v_mov_b32_e32 v10, 0xbff00000 +; SI-GISEL-NEXT: v_div_scale_f64 v[2:3], s[0:1], v[0:1], v[0:1], -1.0 +; SI-GISEL-NEXT: v_div_scale_f64 v[8:9], s[0:1], -1.0, v[0:1], -1.0 +; SI-GISEL-NEXT: v_rcp_f64_e32 v[4:5], v[2:3] +; SI-GISEL-NEXT: v_cmp_eq_u32_e64 s[0:1], v1, v3 +; SI-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v10, v9 +; SI-GISEL-NEXT: s_xor_b64 vcc, vcc, s[0:1] +; SI-GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0 +; SI-GISEL-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5] +; SI-GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0 +; SI-GISEL-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5] +; SI-GISEL-NEXT: v_mul_f64 v[6:7], v[8:9], v[4:5] +; SI-GISEL-NEXT: v_fma_f64 v[2:3], -v[2:3], v[6:7], v[8:9] +; SI-GISEL-NEXT: v_div_fmas_f64 v[2:3], v[2:3], v[4:5], v[6:7] +; SI-GISEL-NEXT: v_div_fixup_f64 v[0:1], v[2:3], v[0:1], -1.0 +; SI-GISEL-NEXT: v_readfirstlane_b32 s0, v0 +; SI-GISEL-NEXT: v_readfirstlane_b32 s1, v1 +; SI-GISEL-NEXT: ; return to shader part epilog +; +; VI-SDAG-LABEL: s_neg_rsq_f64: +; VI-SDAG: ; %bb.0: +; VI-SDAG-NEXT: v_sqrt_f64_e32 v[0:1], s[0:1] +; VI-SDAG-NEXT: v_div_scale_f64 v[2:3], s[0:1], v[0:1], v[0:1], -1.0 +; VI-SDAG-NEXT: v_rcp_f64_e32 v[4:5], v[2:3] +; VI-SDAG-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0 +; VI-SDAG-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5] +; VI-SDAG-NEXT: v_div_scale_f64 v[6:7], vcc, -1.0, v[0:1], -1.0 +; VI-SDAG-NEXT: v_fma_f64 v[8:9], -v[2:3], v[4:5], 1.0 +; VI-SDAG-NEXT: v_fma_f64 v[4:5], v[4:5], v[8:9], v[4:5] +; VI-SDAG-NEXT: v_mul_f64 v[8:9], v[6:7], v[4:5] +; VI-SDAG-NEXT: v_fma_f64 v[2:3], -v[2:3], v[8:9], v[6:7] +; VI-SDAG-NEXT: v_div_fmas_f64 v[2:3], v[2:3], v[4:5], v[8:9] +; VI-SDAG-NEXT: v_div_fixup_f64 v[0:1], v[2:3], v[0:1], -1.0 +; VI-SDAG-NEXT: v_readfirstlane_b32 s0, v0 +; VI-SDAG-NEXT: v_readfirstlane_b32 s1, v1 +; VI-SDAG-NEXT: ; return to shader part epilog +; +; VI-GISEL-LABEL: s_neg_rsq_f64: +; VI-GISEL: ; %bb.0: +; VI-GISEL-NEXT: v_sqrt_f64_e32 v[0:1], s[0:1] +; VI-GISEL-NEXT: v_div_scale_f64 v[2:3], s[0:1], v[0:1], v[0:1], -1.0 +; VI-GISEL-NEXT: v_rcp_f64_e32 v[4:5], v[2:3] +; VI-GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0 +; VI-GISEL-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5] +; VI-GISEL-NEXT: v_div_scale_f64 v[6:7], vcc, -1.0, v[0:1], -1.0 +; VI-GISEL-NEXT: v_fma_f64 v[8:9], -v[2:3], v[4:5], 1.0 +; VI-GISEL-NEXT: v_fma_f64 v[4:5], v[4:5], v[8:9], v[4:5] +; VI-GISEL-NEXT: v_mul_f64 v[8:9], v[6:7], v[4:5] +; VI-GISEL-NEXT: v_fma_f64 v[2:3], -v[2:3], v[8:9], v[6:7] +; VI-GISEL-NEXT: v_div_fmas_f64 v[2:3], v[2:3], v[4:5], v[8:9] +; VI-GISEL-NEXT: v_div_fixup_f64 v[0:1], v[2:3], v[0:1], -1.0 +; VI-GISEL-NEXT: v_readfirstlane_b32 s0, v0 +; VI-GISEL-NEXT: v_readfirstlane_b32 s1, v1 +; VI-GISEL-NEXT: ; return to shader part epilog + %rsq = call contract double @llvm.sqrt.f64(double %x) + %result = fdiv contract double -1.0, %rsq + %cast = bitcast double %result to <2 x i32> + %cast.0 = extractelement <2 x i32> %cast, i32 0 + %cast.1 = extractelement <2 x i32> %cast, i32 1 + %lane.0 = call i32 @llvm.amdgcn.readfirstlane(i32 %cast.0) + %lane.1 = call i32 @llvm.amdgcn.readfirstlane(i32 %cast.1) + %insert.0 = insertelement <2 x i32> poison, i32 %lane.0, i32 0 + %insert.1 = insertelement <2 x i32> %insert.0, i32 %lane.1, i32 1 + ret <2 x i32> %insert.1 +} + +define amdgpu_ps <2 x i32> @s_neg_rsq_neg_f64(double inreg %x) { +; SI-SDAG-LABEL: s_neg_rsq_neg_f64: +; SI-SDAG: ; %bb.0: +; SI-SDAG-NEXT: v_sqrt_f64_e64 v[0:1], -s[0:1] +; SI-SDAG-NEXT: s_mov_b32 s2, 0xbff00000 +; SI-SDAG-NEXT: v_div_scale_f64 v[2:3], s[0:1], v[0:1], v[0:1], -1.0 +; SI-SDAG-NEXT: v_rcp_f64_e32 v[4:5], v[2:3] +; SI-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, v1, v3 +; SI-SDAG-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0 +; SI-SDAG-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5] +; SI-SDAG-NEXT: v_div_scale_f64 v[6:7], s[0:1], -1.0, v[0:1], -1.0 +; SI-SDAG-NEXT: v_fma_f64 v[8:9], -v[2:3], v[4:5], 1.0 +; SI-SDAG-NEXT: v_fma_f64 v[4:5], v[4:5], v[8:9], v[4:5] +; SI-SDAG-NEXT: v_cmp_eq_u32_e64 s[0:1], s2, v7 +; SI-SDAG-NEXT: v_mul_f64 v[8:9], v[6:7], v[4:5] +; SI-SDAG-NEXT: s_xor_b64 vcc, s[0:1], vcc +; SI-SDAG-NEXT: v_fma_f64 v[2:3], -v[2:3], v[8:9], v[6:7] +; SI-SDAG-NEXT: v_div_fmas_f64 v[2:3], v[2:3], v[4:5], v[8:9] +; SI-SDAG-NEXT: v_div_fixup_f64 v[0:1], v[2:3], v[0:1], -1.0 +; SI-SDAG-NEXT: v_readfirstlane_b32 s0, v0 +; SI-SDAG-NEXT: v_readfirstlane_b32 s1, v1 +; SI-SDAG-NEXT: ; return to shader part epilog +; +; SI-GISEL-LABEL: s_neg_rsq_neg_f64: +; SI-GISEL: ; %bb.0: +; SI-GISEL-NEXT: v_sqrt_f64_e64 v[0:1], -s[0:1] +; SI-GISEL-NEXT: v_mov_b32_e32 v10, 0xbff00000 +; SI-GISEL-NEXT: v_div_scale_f64 v[2:3], s[0:1], v[0:1], v[0:1], -1.0 +; SI-GISEL-NEXT: v_div_scale_f64 v[8:9], s[0:1], -1.0, v[0:1], -1.0 +; SI-GISEL-NEXT: v_rcp_f64_e32 v[4:5], v[2:3] +; SI-GISEL-NEXT: v_cmp_eq_u32_e64 s[0:1], v1, v3 +; SI-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v10, v9 +; SI-GISEL-NEXT: s_xor_b64 vcc, vcc, s[0:1] +; SI-GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0 +; SI-GISEL-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5] +; SI-GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0 +; SI-GISEL-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5] +; SI-GISEL-NEXT: v_mul_f64 v[6:7], v[8:9], v[4:5] +; SI-GISEL-NEXT: v_fma_f64 v[2:3], -v[2:3], v[6:7], v[8:9] +; SI-GISEL-NEXT: v_div_fmas_f64 v[2:3], v[2:3], v[4:5], v[6:7] +; SI-GISEL-NEXT: v_div_fixup_f64 v[0:1], v[2:3], v[0:1], -1.0 +; SI-GISEL-NEXT: v_readfirstlane_b32 s0, v0 +; SI-GISEL-NEXT: v_readfirstlane_b32 s1, v1 +; SI-GISEL-NEXT: ; return to shader part epilog +; +; VI-SDAG-LABEL: s_neg_rsq_neg_f64: +; VI-SDAG: ; %bb.0: +; VI-SDAG-NEXT: v_sqrt_f64_e64 v[0:1], -s[0:1] +; VI-SDAG-NEXT: v_div_scale_f64 v[2:3], s[0:1], v[0:1], v[0:1], -1.0 +; VI-SDAG-NEXT: v_rcp_f64_e32 v[4:5], v[2:3] +; VI-SDAG-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0 +; VI-SDAG-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5] +; VI-SDAG-NEXT: v_div_scale_f64 v[6:7], vcc, -1.0, v[0:1], -1.0 +; VI-SDAG-NEXT: v_fma_f64 v[8:9], -v[2:3], v[4:5], 1.0 +; VI-SDAG-NEXT: v_fma_f64 v[4:5], v[4:5], v[8:9], v[4:5] +; VI-SDAG-NEXT: v_mul_f64 v[8:9], v[6:7], v[4:5] +; VI-SDAG-NEXT: v_fma_f64 v[2:3], -v[2:3], v[8:9], v[6:7] +; VI-SDAG-NEXT: v_div_fmas_f64 v[2:3], v[2:3], v[4:5], v[8:9] +; VI-SDAG-NEXT: v_div_fixup_f64 v[0:1], v[2:3], v[0:1], -1.0 +; VI-SDAG-NEXT: v_readfirstlane_b32 s0, v0 +; VI-SDAG-NEXT: v_readfirstlane_b32 s1, v1 +; VI-SDAG-NEXT: ; return to shader part epilog +; +; VI-GISEL-LABEL: s_neg_rsq_neg_f64: +; VI-GISEL: ; %bb.0: +; VI-GISEL-NEXT: v_sqrt_f64_e64 v[0:1], -s[0:1] +; VI-GISEL-NEXT: v_div_scale_f64 v[2:3], s[0:1], v[0:1], v[0:1], -1.0 +; VI-GISEL-NEXT: v_rcp_f64_e32 v[4:5], v[2:3] +; VI-GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0 +; VI-GISEL-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5] +; VI-GISEL-NEXT: v_div_scale_f64 v[6:7], vcc, -1.0, v[0:1], -1.0 +; VI-GISEL-NEXT: v_fma_f64 v[8:9], -v[2:3], v[4:5], 1.0 +; VI-GISEL-NEXT: v_fma_f64 v[4:5], v[4:5], v[8:9], v[4:5] +; VI-GISEL-NEXT: v_mul_f64 v[8:9], v[6:7], v[4:5] +; VI-GISEL-NEXT: v_fma_f64 v[2:3], -v[2:3], v[8:9], v[6:7] +; VI-GISEL-NEXT: v_div_fmas_f64 v[2:3], v[2:3], v[4:5], v[8:9] +; VI-GISEL-NEXT: v_div_fixup_f64 v[0:1], v[2:3], v[0:1], -1.0 +; VI-GISEL-NEXT: v_readfirstlane_b32 s0, v0 +; VI-GISEL-NEXT: v_readfirstlane_b32 s1, v1 +; VI-GISEL-NEXT: ; return to shader part epilog + %x.neg = fneg double %x + %rsq = call contract double @llvm.sqrt.f64(double %x.neg) + %result = fdiv contract double -1.0, %rsq + %cast = bitcast double %result to <2 x i32> + %cast.0 = extractelement <2 x i32> %cast, i32 0 + %cast.1 = extractelement <2 x i32> %cast, i32 1 + %lane.0 = call i32 @llvm.amdgcn.readfirstlane(i32 %cast.0) + %lane.1 = call i32 @llvm.amdgcn.readfirstlane(i32 %cast.1) + %insert.0 = insertelement <2 x i32> poison, i32 %lane.0, i32 0 + %insert.1 = insertelement <2 x i32> %insert.0, i32 %lane.1, i32 1 + ret <2 x i32> %insert.1 +} + +define double @v_rsq_f64(double %x) { +; SI-SDAG-LABEL: v_rsq_f64: +; SI-SDAG: ; %bb.0: +; SI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-SDAG-NEXT: v_sqrt_f64_e32 v[0:1], v[0:1] +; SI-SDAG-NEXT: s_mov_b32 s6, 0x3ff00000 +; SI-SDAG-NEXT: v_div_scale_f64 v[2:3], s[4:5], v[0:1], v[0:1], 1.0 +; SI-SDAG-NEXT: v_rcp_f64_e32 v[4:5], v[2:3] +; SI-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, v1, v3 +; SI-SDAG-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0 +; SI-SDAG-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5] +; SI-SDAG-NEXT: v_div_scale_f64 v[6:7], s[4:5], 1.0, v[0:1], 1.0 +; SI-SDAG-NEXT: v_fma_f64 v[8:9], -v[2:3], v[4:5], 1.0 +; SI-SDAG-NEXT: v_fma_f64 v[4:5], v[4:5], v[8:9], v[4:5] +; SI-SDAG-NEXT: v_cmp_eq_u32_e64 s[4:5], s6, v7 +; SI-SDAG-NEXT: v_mul_f64 v[8:9], v[6:7], v[4:5] +; SI-SDAG-NEXT: s_xor_b64 vcc, s[4:5], vcc +; SI-SDAG-NEXT: v_fma_f64 v[2:3], -v[2:3], v[8:9], v[6:7] +; SI-SDAG-NEXT: v_div_fmas_f64 v[2:3], v[2:3], v[4:5], v[8:9] +; SI-SDAG-NEXT: v_div_fixup_f64 v[0:1], v[2:3], v[0:1], 1.0 +; SI-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; SI-GISEL-LABEL: v_rsq_f64: +; SI-GISEL: ; %bb.0: +; SI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-GISEL-NEXT: v_sqrt_f64_e32 v[0:1], v[0:1] +; SI-GISEL-NEXT: v_mov_b32_e32 v10, 0x3ff00000 +; SI-GISEL-NEXT: v_div_scale_f64 v[2:3], s[4:5], v[0:1], v[0:1], 1.0 +; SI-GISEL-NEXT: v_div_scale_f64 v[8:9], s[4:5], 1.0, v[0:1], 1.0 +; SI-GISEL-NEXT: v_rcp_f64_e32 v[4:5], v[2:3] +; SI-GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v1, v3 +; SI-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v10, v9 +; SI-GISEL-NEXT: s_xor_b64 vcc, vcc, s[4:5] +; SI-GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0 +; SI-GISEL-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5] +; SI-GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0 +; SI-GISEL-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5] +; SI-GISEL-NEXT: v_mul_f64 v[6:7], v[8:9], v[4:5] +; SI-GISEL-NEXT: v_fma_f64 v[2:3], -v[2:3], v[6:7], v[8:9] +; SI-GISEL-NEXT: v_div_fmas_f64 v[2:3], v[2:3], v[4:5], v[6:7] +; SI-GISEL-NEXT: v_div_fixup_f64 v[0:1], v[2:3], v[0:1], 1.0 +; SI-GISEL-NEXT: s_setpc_b64 s[30:31] +; +; VI-SDAG-LABEL: v_rsq_f64: +; VI-SDAG: ; %bb.0: +; VI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-SDAG-NEXT: v_sqrt_f64_e32 v[0:1], v[0:1] +; VI-SDAG-NEXT: v_div_scale_f64 v[2:3], s[4:5], v[0:1], v[0:1], 1.0 +; VI-SDAG-NEXT: v_rcp_f64_e32 v[4:5], v[2:3] +; VI-SDAG-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0 +; VI-SDAG-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5] +; VI-SDAG-NEXT: v_div_scale_f64 v[6:7], vcc, 1.0, v[0:1], 1.0 +; VI-SDAG-NEXT: v_fma_f64 v[8:9], -v[2:3], v[4:5], 1.0 +; VI-SDAG-NEXT: v_fma_f64 v[4:5], v[4:5], v[8:9], v[4:5] +; VI-SDAG-NEXT: v_mul_f64 v[8:9], v[6:7], v[4:5] +; VI-SDAG-NEXT: v_fma_f64 v[2:3], -v[2:3], v[8:9], v[6:7] +; VI-SDAG-NEXT: v_div_fmas_f64 v[2:3], v[2:3], v[4:5], v[8:9] +; VI-SDAG-NEXT: v_div_fixup_f64 v[0:1], v[2:3], v[0:1], 1.0 +; VI-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; VI-GISEL-LABEL: v_rsq_f64: +; VI-GISEL: ; %bb.0: +; VI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-GISEL-NEXT: v_sqrt_f64_e32 v[0:1], v[0:1] +; VI-GISEL-NEXT: v_div_scale_f64 v[2:3], s[4:5], v[0:1], v[0:1], 1.0 +; VI-GISEL-NEXT: v_rcp_f64_e32 v[4:5], v[2:3] +; VI-GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0 +; VI-GISEL-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5] +; VI-GISEL-NEXT: v_div_scale_f64 v[6:7], vcc, 1.0, v[0:1], 1.0 +; VI-GISEL-NEXT: v_fma_f64 v[8:9], -v[2:3], v[4:5], 1.0 +; VI-GISEL-NEXT: v_fma_f64 v[4:5], v[4:5], v[8:9], v[4:5] +; VI-GISEL-NEXT: v_mul_f64 v[8:9], v[6:7], v[4:5] +; VI-GISEL-NEXT: v_fma_f64 v[2:3], -v[2:3], v[8:9], v[6:7] +; VI-GISEL-NEXT: v_div_fmas_f64 v[2:3], v[2:3], v[4:5], v[8:9] +; VI-GISEL-NEXT: v_div_fixup_f64 v[0:1], v[2:3], v[0:1], 1.0 +; VI-GISEL-NEXT: s_setpc_b64 s[30:31] + %sqrt = call contract double @llvm.sqrt.f64(double %x) + %rsq = fdiv contract double 1.0, %sqrt + ret double %rsq +} + +define double @v_rsq_f64_fabs(double %x) { +; SI-SDAG-LABEL: v_rsq_f64_fabs: +; SI-SDAG: ; %bb.0: +; SI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-SDAG-NEXT: v_sqrt_f64_e64 v[0:1], |v[0:1]| +; SI-SDAG-NEXT: s_mov_b32 s6, 0x3ff00000 +; SI-SDAG-NEXT: v_div_scale_f64 v[2:3], s[4:5], v[0:1], v[0:1], 1.0 +; SI-SDAG-NEXT: v_rcp_f64_e32 v[4:5], v[2:3] +; SI-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, v1, v3 +; SI-SDAG-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0 +; SI-SDAG-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5] +; SI-SDAG-NEXT: v_div_scale_f64 v[6:7], s[4:5], 1.0, v[0:1], 1.0 +; SI-SDAG-NEXT: v_fma_f64 v[8:9], -v[2:3], v[4:5], 1.0 +; SI-SDAG-NEXT: v_fma_f64 v[4:5], v[4:5], v[8:9], v[4:5] +; SI-SDAG-NEXT: v_cmp_eq_u32_e64 s[4:5], s6, v7 +; SI-SDAG-NEXT: v_mul_f64 v[8:9], v[6:7], v[4:5] +; SI-SDAG-NEXT: s_xor_b64 vcc, s[4:5], vcc +; SI-SDAG-NEXT: v_fma_f64 v[2:3], -v[2:3], v[8:9], v[6:7] +; SI-SDAG-NEXT: v_div_fmas_f64 v[2:3], v[2:3], v[4:5], v[8:9] +; SI-SDAG-NEXT: v_div_fixup_f64 v[0:1], v[2:3], v[0:1], 1.0 +; SI-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; SI-GISEL-LABEL: v_rsq_f64_fabs: +; SI-GISEL: ; %bb.0: +; SI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-GISEL-NEXT: v_sqrt_f64_e64 v[0:1], |v[0:1]| +; SI-GISEL-NEXT: v_mov_b32_e32 v10, 0x3ff00000 +; SI-GISEL-NEXT: v_div_scale_f64 v[2:3], s[4:5], v[0:1], v[0:1], 1.0 +; SI-GISEL-NEXT: v_div_scale_f64 v[8:9], s[4:5], 1.0, v[0:1], 1.0 +; SI-GISEL-NEXT: v_rcp_f64_e32 v[4:5], v[2:3] +; SI-GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v1, v3 +; SI-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v10, v9 +; SI-GISEL-NEXT: s_xor_b64 vcc, vcc, s[4:5] +; SI-GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0 +; SI-GISEL-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5] +; SI-GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0 +; SI-GISEL-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5] +; SI-GISEL-NEXT: v_mul_f64 v[6:7], v[8:9], v[4:5] +; SI-GISEL-NEXT: v_fma_f64 v[2:3], -v[2:3], v[6:7], v[8:9] +; SI-GISEL-NEXT: v_div_fmas_f64 v[2:3], v[2:3], v[4:5], v[6:7] +; SI-GISEL-NEXT: v_div_fixup_f64 v[0:1], v[2:3], v[0:1], 1.0 +; SI-GISEL-NEXT: s_setpc_b64 s[30:31] +; +; VI-SDAG-LABEL: v_rsq_f64_fabs: +; VI-SDAG: ; %bb.0: +; VI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-SDAG-NEXT: v_sqrt_f64_e64 v[0:1], |v[0:1]| +; VI-SDAG-NEXT: v_div_scale_f64 v[2:3], s[4:5], v[0:1], v[0:1], 1.0 +; VI-SDAG-NEXT: v_rcp_f64_e32 v[4:5], v[2:3] +; VI-SDAG-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0 +; VI-SDAG-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5] +; VI-SDAG-NEXT: v_div_scale_f64 v[6:7], vcc, 1.0, v[0:1], 1.0 +; VI-SDAG-NEXT: v_fma_f64 v[8:9], -v[2:3], v[4:5], 1.0 +; VI-SDAG-NEXT: v_fma_f64 v[4:5], v[4:5], v[8:9], v[4:5] +; VI-SDAG-NEXT: v_mul_f64 v[8:9], v[6:7], v[4:5] +; VI-SDAG-NEXT: v_fma_f64 v[2:3], -v[2:3], v[8:9], v[6:7] +; VI-SDAG-NEXT: v_div_fmas_f64 v[2:3], v[2:3], v[4:5], v[8:9] +; VI-SDAG-NEXT: v_div_fixup_f64 v[0:1], v[2:3], v[0:1], 1.0 +; VI-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; VI-GISEL-LABEL: v_rsq_f64_fabs: +; VI-GISEL: ; %bb.0: +; VI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-GISEL-NEXT: v_sqrt_f64_e64 v[0:1], |v[0:1]| +; VI-GISEL-NEXT: v_div_scale_f64 v[2:3], s[4:5], v[0:1], v[0:1], 1.0 +; VI-GISEL-NEXT: v_rcp_f64_e32 v[4:5], v[2:3] +; VI-GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0 +; VI-GISEL-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5] +; VI-GISEL-NEXT: v_div_scale_f64 v[6:7], vcc, 1.0, v[0:1], 1.0 +; VI-GISEL-NEXT: v_fma_f64 v[8:9], -v[2:3], v[4:5], 1.0 +; VI-GISEL-NEXT: v_fma_f64 v[4:5], v[4:5], v[8:9], v[4:5] +; VI-GISEL-NEXT: v_mul_f64 v[8:9], v[6:7], v[4:5] +; VI-GISEL-NEXT: v_fma_f64 v[2:3], -v[2:3], v[8:9], v[6:7] +; VI-GISEL-NEXT: v_div_fmas_f64 v[2:3], v[2:3], v[4:5], v[8:9] +; VI-GISEL-NEXT: v_div_fixup_f64 v[0:1], v[2:3], v[0:1], 1.0 +; VI-GISEL-NEXT: s_setpc_b64 s[30:31] + %fabs.x = call double @llvm.fabs.f64(double %x) + %sqrt = call contract double @llvm.sqrt.f64(double %fabs.x) + %rsq = fdiv contract double 1.0, %sqrt + ret double %rsq +} + +define double @v_rsq_f64_missing_contract0(double %x) { +; SI-SDAG-LABEL: v_rsq_f64_missing_contract0: +; SI-SDAG: ; %bb.0: +; SI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-SDAG-NEXT: v_sqrt_f64_e32 v[0:1], v[0:1] +; SI-SDAG-NEXT: s_mov_b32 s6, 0x3ff00000 +; SI-SDAG-NEXT: v_div_scale_f64 v[2:3], s[4:5], v[0:1], v[0:1], 1.0 +; SI-SDAG-NEXT: v_rcp_f64_e32 v[4:5], v[2:3] +; SI-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, v1, v3 +; SI-SDAG-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0 +; SI-SDAG-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5] +; SI-SDAG-NEXT: v_div_scale_f64 v[6:7], s[4:5], 1.0, v[0:1], 1.0 +; SI-SDAG-NEXT: v_fma_f64 v[8:9], -v[2:3], v[4:5], 1.0 +; SI-SDAG-NEXT: v_fma_f64 v[4:5], v[4:5], v[8:9], v[4:5] +; SI-SDAG-NEXT: v_cmp_eq_u32_e64 s[4:5], s6, v7 +; SI-SDAG-NEXT: v_mul_f64 v[8:9], v[6:7], v[4:5] +; SI-SDAG-NEXT: s_xor_b64 vcc, s[4:5], vcc +; SI-SDAG-NEXT: v_fma_f64 v[2:3], -v[2:3], v[8:9], v[6:7] +; SI-SDAG-NEXT: v_div_fmas_f64 v[2:3], v[2:3], v[4:5], v[8:9] +; SI-SDAG-NEXT: v_div_fixup_f64 v[0:1], v[2:3], v[0:1], 1.0 +; SI-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; SI-GISEL-LABEL: v_rsq_f64_missing_contract0: +; SI-GISEL: ; %bb.0: +; SI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-GISEL-NEXT: v_sqrt_f64_e32 v[0:1], v[0:1] +; SI-GISEL-NEXT: v_mov_b32_e32 v10, 0x3ff00000 +; SI-GISEL-NEXT: v_div_scale_f64 v[2:3], s[4:5], v[0:1], v[0:1], 1.0 +; SI-GISEL-NEXT: v_div_scale_f64 v[8:9], s[4:5], 1.0, v[0:1], 1.0 +; SI-GISEL-NEXT: v_rcp_f64_e32 v[4:5], v[2:3] +; SI-GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v1, v3 +; SI-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v10, v9 +; SI-GISEL-NEXT: s_xor_b64 vcc, vcc, s[4:5] +; SI-GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0 +; SI-GISEL-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5] +; SI-GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0 +; SI-GISEL-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5] +; SI-GISEL-NEXT: v_mul_f64 v[6:7], v[8:9], v[4:5] +; SI-GISEL-NEXT: v_fma_f64 v[2:3], -v[2:3], v[6:7], v[8:9] +; SI-GISEL-NEXT: v_div_fmas_f64 v[2:3], v[2:3], v[4:5], v[6:7] +; SI-GISEL-NEXT: v_div_fixup_f64 v[0:1], v[2:3], v[0:1], 1.0 +; SI-GISEL-NEXT: s_setpc_b64 s[30:31] +; +; VI-SDAG-LABEL: v_rsq_f64_missing_contract0: +; VI-SDAG: ; %bb.0: +; VI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-SDAG-NEXT: v_sqrt_f64_e32 v[0:1], v[0:1] +; VI-SDAG-NEXT: v_div_scale_f64 v[2:3], s[4:5], v[0:1], v[0:1], 1.0 +; VI-SDAG-NEXT: v_rcp_f64_e32 v[4:5], v[2:3] +; VI-SDAG-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0 +; VI-SDAG-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5] +; VI-SDAG-NEXT: v_div_scale_f64 v[6:7], vcc, 1.0, v[0:1], 1.0 +; VI-SDAG-NEXT: v_fma_f64 v[8:9], -v[2:3], v[4:5], 1.0 +; VI-SDAG-NEXT: v_fma_f64 v[4:5], v[4:5], v[8:9], v[4:5] +; VI-SDAG-NEXT: v_mul_f64 v[8:9], v[6:7], v[4:5] +; VI-SDAG-NEXT: v_fma_f64 v[2:3], -v[2:3], v[8:9], v[6:7] +; VI-SDAG-NEXT: v_div_fmas_f64 v[2:3], v[2:3], v[4:5], v[8:9] +; VI-SDAG-NEXT: v_div_fixup_f64 v[0:1], v[2:3], v[0:1], 1.0 +; VI-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; VI-GISEL-LABEL: v_rsq_f64_missing_contract0: +; VI-GISEL: ; %bb.0: +; VI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-GISEL-NEXT: v_sqrt_f64_e32 v[0:1], v[0:1] +; VI-GISEL-NEXT: v_div_scale_f64 v[2:3], s[4:5], v[0:1], v[0:1], 1.0 +; VI-GISEL-NEXT: v_rcp_f64_e32 v[4:5], v[2:3] +; VI-GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0 +; VI-GISEL-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5] +; VI-GISEL-NEXT: v_div_scale_f64 v[6:7], vcc, 1.0, v[0:1], 1.0 +; VI-GISEL-NEXT: v_fma_f64 v[8:9], -v[2:3], v[4:5], 1.0 +; VI-GISEL-NEXT: v_fma_f64 v[4:5], v[4:5], v[8:9], v[4:5] +; VI-GISEL-NEXT: v_mul_f64 v[8:9], v[6:7], v[4:5] +; VI-GISEL-NEXT: v_fma_f64 v[2:3], -v[2:3], v[8:9], v[6:7] +; VI-GISEL-NEXT: v_div_fmas_f64 v[2:3], v[2:3], v[4:5], v[8:9] +; VI-GISEL-NEXT: v_div_fixup_f64 v[0:1], v[2:3], v[0:1], 1.0 +; VI-GISEL-NEXT: s_setpc_b64 s[30:31] + %sqrt = call double @llvm.sqrt.f64(double %x) + %rsq = fdiv contract double 1.0, %sqrt + ret double %rsq +} + +define double @v_rsq_f64_missing_contract1(double %x) { +; SI-SDAG-LABEL: v_rsq_f64_missing_contract1: +; SI-SDAG: ; %bb.0: +; SI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-SDAG-NEXT: v_sqrt_f64_e32 v[0:1], v[0:1] +; SI-SDAG-NEXT: s_mov_b32 s6, 0x3ff00000 +; SI-SDAG-NEXT: v_div_scale_f64 v[2:3], s[4:5], v[0:1], v[0:1], 1.0 +; SI-SDAG-NEXT: v_rcp_f64_e32 v[4:5], v[2:3] +; SI-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, v1, v3 +; SI-SDAG-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0 +; SI-SDAG-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5] +; SI-SDAG-NEXT: v_div_scale_f64 v[6:7], s[4:5], 1.0, v[0:1], 1.0 +; SI-SDAG-NEXT: v_fma_f64 v[8:9], -v[2:3], v[4:5], 1.0 +; SI-SDAG-NEXT: v_fma_f64 v[4:5], v[4:5], v[8:9], v[4:5] +; SI-SDAG-NEXT: v_cmp_eq_u32_e64 s[4:5], s6, v7 +; SI-SDAG-NEXT: v_mul_f64 v[8:9], v[6:7], v[4:5] +; SI-SDAG-NEXT: s_xor_b64 vcc, s[4:5], vcc +; SI-SDAG-NEXT: v_fma_f64 v[2:3], -v[2:3], v[8:9], v[6:7] +; SI-SDAG-NEXT: v_div_fmas_f64 v[2:3], v[2:3], v[4:5], v[8:9] +; SI-SDAG-NEXT: v_div_fixup_f64 v[0:1], v[2:3], v[0:1], 1.0 +; SI-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; SI-GISEL-LABEL: v_rsq_f64_missing_contract1: +; SI-GISEL: ; %bb.0: +; SI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-GISEL-NEXT: v_sqrt_f64_e32 v[0:1], v[0:1] +; SI-GISEL-NEXT: v_mov_b32_e32 v10, 0x3ff00000 +; SI-GISEL-NEXT: v_div_scale_f64 v[2:3], s[4:5], v[0:1], v[0:1], 1.0 +; SI-GISEL-NEXT: v_div_scale_f64 v[8:9], s[4:5], 1.0, v[0:1], 1.0 +; SI-GISEL-NEXT: v_rcp_f64_e32 v[4:5], v[2:3] +; SI-GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v1, v3 +; SI-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v10, v9 +; SI-GISEL-NEXT: s_xor_b64 vcc, vcc, s[4:5] +; SI-GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0 +; SI-GISEL-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5] +; SI-GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0 +; SI-GISEL-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5] +; SI-GISEL-NEXT: v_mul_f64 v[6:7], v[8:9], v[4:5] +; SI-GISEL-NEXT: v_fma_f64 v[2:3], -v[2:3], v[6:7], v[8:9] +; SI-GISEL-NEXT: v_div_fmas_f64 v[2:3], v[2:3], v[4:5], v[6:7] +; SI-GISEL-NEXT: v_div_fixup_f64 v[0:1], v[2:3], v[0:1], 1.0 +; SI-GISEL-NEXT: s_setpc_b64 s[30:31] +; +; VI-SDAG-LABEL: v_rsq_f64_missing_contract1: +; VI-SDAG: ; %bb.0: +; VI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-SDAG-NEXT: v_sqrt_f64_e32 v[0:1], v[0:1] +; VI-SDAG-NEXT: v_div_scale_f64 v[2:3], s[4:5], v[0:1], v[0:1], 1.0 +; VI-SDAG-NEXT: v_rcp_f64_e32 v[4:5], v[2:3] +; VI-SDAG-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0 +; VI-SDAG-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5] +; VI-SDAG-NEXT: v_div_scale_f64 v[6:7], vcc, 1.0, v[0:1], 1.0 +; VI-SDAG-NEXT: v_fma_f64 v[8:9], -v[2:3], v[4:5], 1.0 +; VI-SDAG-NEXT: v_fma_f64 v[4:5], v[4:5], v[8:9], v[4:5] +; VI-SDAG-NEXT: v_mul_f64 v[8:9], v[6:7], v[4:5] +; VI-SDAG-NEXT: v_fma_f64 v[2:3], -v[2:3], v[8:9], v[6:7] +; VI-SDAG-NEXT: v_div_fmas_f64 v[2:3], v[2:3], v[4:5], v[8:9] +; VI-SDAG-NEXT: v_div_fixup_f64 v[0:1], v[2:3], v[0:1], 1.0 +; VI-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; VI-GISEL-LABEL: v_rsq_f64_missing_contract1: +; VI-GISEL: ; %bb.0: +; VI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-GISEL-NEXT: v_sqrt_f64_e32 v[0:1], v[0:1] +; VI-GISEL-NEXT: v_div_scale_f64 v[2:3], s[4:5], v[0:1], v[0:1], 1.0 +; VI-GISEL-NEXT: v_rcp_f64_e32 v[4:5], v[2:3] +; VI-GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0 +; VI-GISEL-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5] +; VI-GISEL-NEXT: v_div_scale_f64 v[6:7], vcc, 1.0, v[0:1], 1.0 +; VI-GISEL-NEXT: v_fma_f64 v[8:9], -v[2:3], v[4:5], 1.0 +; VI-GISEL-NEXT: v_fma_f64 v[4:5], v[4:5], v[8:9], v[4:5] +; VI-GISEL-NEXT: v_mul_f64 v[8:9], v[6:7], v[4:5] +; VI-GISEL-NEXT: v_fma_f64 v[2:3], -v[2:3], v[8:9], v[6:7] +; VI-GISEL-NEXT: v_div_fmas_f64 v[2:3], v[2:3], v[4:5], v[8:9] +; VI-GISEL-NEXT: v_div_fixup_f64 v[0:1], v[2:3], v[0:1], 1.0 +; VI-GISEL-NEXT: s_setpc_b64 s[30:31] + %sqrt = call contract double @llvm.sqrt.f64(double %x) + %rsq = fdiv double 1.0, %sqrt + ret double %rsq +} + +define double @v_neg_rsq_f64(double %x) { +; SI-SDAG-LABEL: v_neg_rsq_f64: +; SI-SDAG: ; %bb.0: +; SI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-SDAG-NEXT: v_sqrt_f64_e32 v[0:1], v[0:1] +; SI-SDAG-NEXT: s_mov_b32 s6, 0xbff00000 +; SI-SDAG-NEXT: v_div_scale_f64 v[2:3], s[4:5], v[0:1], v[0:1], -1.0 +; SI-SDAG-NEXT: v_rcp_f64_e32 v[4:5], v[2:3] +; SI-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, v1, v3 +; SI-SDAG-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0 +; SI-SDAG-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5] +; SI-SDAG-NEXT: v_div_scale_f64 v[6:7], s[4:5], -1.0, v[0:1], -1.0 +; SI-SDAG-NEXT: v_fma_f64 v[8:9], -v[2:3], v[4:5], 1.0 +; SI-SDAG-NEXT: v_fma_f64 v[4:5], v[4:5], v[8:9], v[4:5] +; SI-SDAG-NEXT: v_cmp_eq_u32_e64 s[4:5], s6, v7 +; SI-SDAG-NEXT: v_mul_f64 v[8:9], v[6:7], v[4:5] +; SI-SDAG-NEXT: s_xor_b64 vcc, s[4:5], vcc +; SI-SDAG-NEXT: v_fma_f64 v[2:3], -v[2:3], v[8:9], v[6:7] +; SI-SDAG-NEXT: v_div_fmas_f64 v[2:3], v[2:3], v[4:5], v[8:9] +; SI-SDAG-NEXT: v_div_fixup_f64 v[0:1], v[2:3], v[0:1], -1.0 +; SI-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; SI-GISEL-LABEL: v_neg_rsq_f64: +; SI-GISEL: ; %bb.0: +; SI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-GISEL-NEXT: v_sqrt_f64_e32 v[0:1], v[0:1] +; SI-GISEL-NEXT: v_mov_b32_e32 v10, 0xbff00000 +; SI-GISEL-NEXT: v_div_scale_f64 v[2:3], s[4:5], v[0:1], v[0:1], -1.0 +; SI-GISEL-NEXT: v_div_scale_f64 v[8:9], s[4:5], -1.0, v[0:1], -1.0 +; SI-GISEL-NEXT: v_rcp_f64_e32 v[4:5], v[2:3] +; SI-GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v1, v3 +; SI-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v10, v9 +; SI-GISEL-NEXT: s_xor_b64 vcc, vcc, s[4:5] +; SI-GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0 +; SI-GISEL-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5] +; SI-GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0 +; SI-GISEL-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5] +; SI-GISEL-NEXT: v_mul_f64 v[6:7], v[8:9], v[4:5] +; SI-GISEL-NEXT: v_fma_f64 v[2:3], -v[2:3], v[6:7], v[8:9] +; SI-GISEL-NEXT: v_div_fmas_f64 v[2:3], v[2:3], v[4:5], v[6:7] +; SI-GISEL-NEXT: v_div_fixup_f64 v[0:1], v[2:3], v[0:1], -1.0 +; SI-GISEL-NEXT: s_setpc_b64 s[30:31] +; +; VI-SDAG-LABEL: v_neg_rsq_f64: +; VI-SDAG: ; %bb.0: +; VI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-SDAG-NEXT: v_sqrt_f64_e32 v[0:1], v[0:1] +; VI-SDAG-NEXT: v_div_scale_f64 v[2:3], s[4:5], v[0:1], v[0:1], -1.0 +; VI-SDAG-NEXT: v_rcp_f64_e32 v[4:5], v[2:3] +; VI-SDAG-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0 +; VI-SDAG-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5] +; VI-SDAG-NEXT: v_div_scale_f64 v[6:7], vcc, -1.0, v[0:1], -1.0 +; VI-SDAG-NEXT: v_fma_f64 v[8:9], -v[2:3], v[4:5], 1.0 +; VI-SDAG-NEXT: v_fma_f64 v[4:5], v[4:5], v[8:9], v[4:5] +; VI-SDAG-NEXT: v_mul_f64 v[8:9], v[6:7], v[4:5] +; VI-SDAG-NEXT: v_fma_f64 v[2:3], -v[2:3], v[8:9], v[6:7] +; VI-SDAG-NEXT: v_div_fmas_f64 v[2:3], v[2:3], v[4:5], v[8:9] +; VI-SDAG-NEXT: v_div_fixup_f64 v[0:1], v[2:3], v[0:1], -1.0 +; VI-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; VI-GISEL-LABEL: v_neg_rsq_f64: +; VI-GISEL: ; %bb.0: +; VI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-GISEL-NEXT: v_sqrt_f64_e32 v[0:1], v[0:1] +; VI-GISEL-NEXT: v_div_scale_f64 v[2:3], s[4:5], v[0:1], v[0:1], -1.0 +; VI-GISEL-NEXT: v_rcp_f64_e32 v[4:5], v[2:3] +; VI-GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0 +; VI-GISEL-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5] +; VI-GISEL-NEXT: v_div_scale_f64 v[6:7], vcc, -1.0, v[0:1], -1.0 +; VI-GISEL-NEXT: v_fma_f64 v[8:9], -v[2:3], v[4:5], 1.0 +; VI-GISEL-NEXT: v_fma_f64 v[4:5], v[4:5], v[8:9], v[4:5] +; VI-GISEL-NEXT: v_mul_f64 v[8:9], v[6:7], v[4:5] +; VI-GISEL-NEXT: v_fma_f64 v[2:3], -v[2:3], v[8:9], v[6:7] +; VI-GISEL-NEXT: v_div_fmas_f64 v[2:3], v[2:3], v[4:5], v[8:9] +; VI-GISEL-NEXT: v_div_fixup_f64 v[0:1], v[2:3], v[0:1], -1.0 +; VI-GISEL-NEXT: s_setpc_b64 s[30:31] + %sqrt = call contract double @llvm.sqrt.f64(double %x) + %rsq = fdiv contract double -1.0, %sqrt + ret double %rsq +} + +define <2 x double> @v_rsq_v2f64(<2 x double> %x) { +; SI-SDAG-LABEL: v_rsq_v2f64: +; SI-SDAG: ; %bb.0: +; SI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-SDAG-NEXT: v_sqrt_f64_e32 v[0:1], v[0:1] +; SI-SDAG-NEXT: v_sqrt_f64_e32 v[2:3], v[2:3] +; SI-SDAG-NEXT: s_mov_b32 s6, 0x3ff00000 +; SI-SDAG-NEXT: v_div_scale_f64 v[4:5], s[4:5], v[0:1], v[0:1], 1.0 +; SI-SDAG-NEXT: v_div_scale_f64 v[8:9], s[4:5], v[2:3], v[2:3], 1.0 +; SI-SDAG-NEXT: v_rcp_f64_e32 v[6:7], v[4:5] +; SI-SDAG-NEXT: v_div_scale_f64 v[12:13], s[4:5], 1.0, v[0:1], 1.0 +; SI-SDAG-NEXT: v_div_scale_f64 v[18:19], s[4:5], 1.0, v[2:3], 1.0 +; SI-SDAG-NEXT: v_fma_f64 v[10:11], -v[4:5], v[6:7], 1.0 +; SI-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, v1, v5 +; SI-SDAG-NEXT: v_fma_f64 v[6:7], v[6:7], v[10:11], v[6:7] +; SI-SDAG-NEXT: v_rcp_f64_e32 v[10:11], v[8:9] +; SI-SDAG-NEXT: v_fma_f64 v[14:15], -v[4:5], v[6:7], 1.0 +; SI-SDAG-NEXT: v_cmp_eq_u32_e64 s[4:5], s6, v13 +; SI-SDAG-NEXT: v_fma_f64 v[6:7], v[6:7], v[14:15], v[6:7] +; SI-SDAG-NEXT: v_fma_f64 v[14:15], -v[8:9], v[10:11], 1.0 +; SI-SDAG-NEXT: v_mul_f64 v[16:17], v[12:13], v[6:7] +; SI-SDAG-NEXT: v_fma_f64 v[10:11], v[10:11], v[14:15], v[10:11] +; SI-SDAG-NEXT: v_fma_f64 v[4:5], -v[4:5], v[16:17], v[12:13] +; SI-SDAG-NEXT: v_fma_f64 v[14:15], -v[8:9], v[10:11], 1.0 +; SI-SDAG-NEXT: s_xor_b64 vcc, s[4:5], vcc +; SI-SDAG-NEXT: v_fma_f64 v[10:11], v[10:11], v[14:15], v[10:11] +; SI-SDAG-NEXT: v_div_fmas_f64 v[4:5], v[4:5], v[6:7], v[16:17] +; SI-SDAG-NEXT: v_mul_f64 v[12:13], v[18:19], v[10:11] +; SI-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, v3, v9 +; SI-SDAG-NEXT: v_fma_f64 v[6:7], -v[8:9], v[12:13], v[18:19] +; SI-SDAG-NEXT: v_cmp_eq_u32_e64 s[4:5], s6, v19 +; SI-SDAG-NEXT: s_xor_b64 vcc, s[4:5], vcc +; SI-SDAG-NEXT: v_div_fixup_f64 v[0:1], v[4:5], v[0:1], 1.0 +; SI-SDAG-NEXT: v_div_fmas_f64 v[6:7], v[6:7], v[10:11], v[12:13] +; SI-SDAG-NEXT: v_div_fixup_f64 v[2:3], v[6:7], v[2:3], 1.0 +; SI-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; SI-GISEL-LABEL: v_rsq_v2f64: +; SI-GISEL: ; %bb.0: +; SI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-GISEL-NEXT: v_sqrt_f64_e32 v[0:1], v[0:1] +; SI-GISEL-NEXT: v_sqrt_f64_e32 v[2:3], v[2:3] +; SI-GISEL-NEXT: v_mov_b32_e32 v20, 0x3ff00000 +; SI-GISEL-NEXT: v_div_scale_f64 v[4:5], s[4:5], v[0:1], v[0:1], 1.0 +; SI-GISEL-NEXT: v_div_scale_f64 v[8:9], s[4:5], v[2:3], v[2:3], 1.0 +; SI-GISEL-NEXT: v_rcp_f64_e32 v[6:7], v[4:5] +; SI-GISEL-NEXT: v_div_scale_f64 v[12:13], s[4:5], 1.0, v[0:1], 1.0 +; SI-GISEL-NEXT: v_div_scale_f64 v[18:19], s[4:5], 1.0, v[2:3], 1.0 +; SI-GISEL-NEXT: v_fma_f64 v[10:11], -v[4:5], v[6:7], 1.0 +; SI-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v20, v13 +; SI-GISEL-NEXT: v_fma_f64 v[6:7], v[6:7], v[10:11], v[6:7] +; SI-GISEL-NEXT: v_rcp_f64_e32 v[10:11], v[8:9] +; SI-GISEL-NEXT: v_fma_f64 v[14:15], -v[4:5], v[6:7], 1.0 +; SI-GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v1, v5 +; SI-GISEL-NEXT: v_fma_f64 v[6:7], v[6:7], v[14:15], v[6:7] +; SI-GISEL-NEXT: v_fma_f64 v[14:15], -v[8:9], v[10:11], 1.0 +; SI-GISEL-NEXT: v_mul_f64 v[16:17], v[12:13], v[6:7] +; SI-GISEL-NEXT: v_fma_f64 v[10:11], v[10:11], v[14:15], v[10:11] +; SI-GISEL-NEXT: v_fma_f64 v[12:13], -v[4:5], v[16:17], v[12:13] +; SI-GISEL-NEXT: v_fma_f64 v[14:15], -v[8:9], v[10:11], 1.0 +; SI-GISEL-NEXT: s_xor_b64 vcc, vcc, s[4:5] +; SI-GISEL-NEXT: v_fma_f64 v[4:5], v[10:11], v[14:15], v[10:11] +; SI-GISEL-NEXT: v_div_fmas_f64 v[6:7], v[12:13], v[6:7], v[16:17] +; SI-GISEL-NEXT: v_mul_f64 v[10:11], v[18:19], v[4:5] +; SI-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v20, v19 +; SI-GISEL-NEXT: v_fma_f64 v[12:13], -v[8:9], v[10:11], v[18:19] +; SI-GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v3, v9 +; SI-GISEL-NEXT: s_xor_b64 vcc, vcc, s[4:5] +; SI-GISEL-NEXT: v_div_fixup_f64 v[0:1], v[6:7], v[0:1], 1.0 +; SI-GISEL-NEXT: v_div_fmas_f64 v[4:5], v[12:13], v[4:5], v[10:11] +; SI-GISEL-NEXT: v_div_fixup_f64 v[2:3], v[4:5], v[2:3], 1.0 +; SI-GISEL-NEXT: s_setpc_b64 s[30:31] +; +; VI-SDAG-LABEL: v_rsq_v2f64: +; VI-SDAG: ; %bb.0: +; VI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-SDAG-NEXT: v_sqrt_f64_e32 v[0:1], v[0:1] +; VI-SDAG-NEXT: v_sqrt_f64_e32 v[2:3], v[2:3] +; VI-SDAG-NEXT: v_div_scale_f64 v[4:5], s[4:5], v[0:1], v[0:1], 1.0 +; VI-SDAG-NEXT: v_div_scale_f64 v[6:7], s[4:5], v[2:3], v[2:3], 1.0 +; VI-SDAG-NEXT: v_div_scale_f64 v[16:17], s[4:5], 1.0, v[2:3], 1.0 +; VI-SDAG-NEXT: v_rcp_f64_e32 v[8:9], v[4:5] +; VI-SDAG-NEXT: v_rcp_f64_e32 v[10:11], v[6:7] +; VI-SDAG-NEXT: v_fma_f64 v[12:13], -v[4:5], v[8:9], 1.0 +; VI-SDAG-NEXT: v_fma_f64 v[14:15], -v[6:7], v[10:11], 1.0 +; VI-SDAG-NEXT: v_fma_f64 v[8:9], v[8:9], v[12:13], v[8:9] +; VI-SDAG-NEXT: v_div_scale_f64 v[12:13], vcc, 1.0, v[0:1], 1.0 +; VI-SDAG-NEXT: v_fma_f64 v[10:11], v[10:11], v[14:15], v[10:11] +; VI-SDAG-NEXT: v_fma_f64 v[14:15], -v[4:5], v[8:9], 1.0 +; VI-SDAG-NEXT: v_fma_f64 v[18:19], -v[6:7], v[10:11], 1.0 +; VI-SDAG-NEXT: v_fma_f64 v[8:9], v[8:9], v[14:15], v[8:9] +; VI-SDAG-NEXT: v_fma_f64 v[10:11], v[10:11], v[18:19], v[10:11] +; VI-SDAG-NEXT: v_mul_f64 v[14:15], v[12:13], v[8:9] +; VI-SDAG-NEXT: v_mul_f64 v[18:19], v[16:17], v[10:11] +; VI-SDAG-NEXT: v_fma_f64 v[4:5], -v[4:5], v[14:15], v[12:13] +; VI-SDAG-NEXT: v_fma_f64 v[6:7], -v[6:7], v[18:19], v[16:17] +; VI-SDAG-NEXT: v_div_fmas_f64 v[4:5], v[4:5], v[8:9], v[14:15] +; VI-SDAG-NEXT: s_mov_b64 vcc, s[4:5] +; VI-SDAG-NEXT: v_div_fmas_f64 v[6:7], v[6:7], v[10:11], v[18:19] +; VI-SDAG-NEXT: v_div_fixup_f64 v[0:1], v[4:5], v[0:1], 1.0 +; VI-SDAG-NEXT: v_div_fixup_f64 v[2:3], v[6:7], v[2:3], 1.0 +; VI-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; VI-GISEL-LABEL: v_rsq_v2f64: +; VI-GISEL: ; %bb.0: +; VI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-GISEL-NEXT: v_sqrt_f64_e32 v[0:1], v[0:1] +; VI-GISEL-NEXT: v_sqrt_f64_e32 v[2:3], v[2:3] +; VI-GISEL-NEXT: v_div_scale_f64 v[4:5], s[4:5], v[0:1], v[0:1], 1.0 +; VI-GISEL-NEXT: v_div_scale_f64 v[6:7], s[4:5], v[2:3], v[2:3], 1.0 +; VI-GISEL-NEXT: v_div_scale_f64 v[16:17], s[4:5], 1.0, v[2:3], 1.0 +; VI-GISEL-NEXT: v_rcp_f64_e32 v[8:9], v[4:5] +; VI-GISEL-NEXT: v_rcp_f64_e32 v[10:11], v[6:7] +; VI-GISEL-NEXT: v_fma_f64 v[12:13], -v[4:5], v[8:9], 1.0 +; VI-GISEL-NEXT: v_fma_f64 v[14:15], -v[6:7], v[10:11], 1.0 +; VI-GISEL-NEXT: v_fma_f64 v[8:9], v[8:9], v[12:13], v[8:9] +; VI-GISEL-NEXT: v_div_scale_f64 v[12:13], vcc, 1.0, v[0:1], 1.0 +; VI-GISEL-NEXT: v_fma_f64 v[10:11], v[10:11], v[14:15], v[10:11] +; VI-GISEL-NEXT: v_fma_f64 v[14:15], -v[4:5], v[8:9], 1.0 +; VI-GISEL-NEXT: v_fma_f64 v[18:19], -v[6:7], v[10:11], 1.0 +; VI-GISEL-NEXT: v_fma_f64 v[8:9], v[8:9], v[14:15], v[8:9] +; VI-GISEL-NEXT: v_fma_f64 v[10:11], v[10:11], v[18:19], v[10:11] +; VI-GISEL-NEXT: v_mul_f64 v[14:15], v[12:13], v[8:9] +; VI-GISEL-NEXT: v_mul_f64 v[18:19], v[16:17], v[10:11] +; VI-GISEL-NEXT: v_fma_f64 v[4:5], -v[4:5], v[14:15], v[12:13] +; VI-GISEL-NEXT: v_fma_f64 v[6:7], -v[6:7], v[18:19], v[16:17] +; VI-GISEL-NEXT: v_div_fmas_f64 v[4:5], v[4:5], v[8:9], v[14:15] +; VI-GISEL-NEXT: s_mov_b64 vcc, s[4:5] +; VI-GISEL-NEXT: v_div_fmas_f64 v[6:7], v[6:7], v[10:11], v[18:19] +; VI-GISEL-NEXT: v_div_fixup_f64 v[0:1], v[4:5], v[0:1], 1.0 +; VI-GISEL-NEXT: v_div_fixup_f64 v[2:3], v[6:7], v[2:3], 1.0 +; VI-GISEL-NEXT: s_setpc_b64 s[30:31] + %sqrt = call <2 x double> @llvm.sqrt.v2f64(<2 x double> %x) + %rsq = fdiv <2 x double> <double 1.0, double 1.0>, %sqrt + ret <2 x double> %rsq +} + +define <2 x double> @v_neg_rsq_v2f64(<2 x double> %x) { +; SI-SDAG-LABEL: v_neg_rsq_v2f64: +; SI-SDAG: ; %bb.0: +; SI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-SDAG-NEXT: v_sqrt_f64_e32 v[0:1], v[0:1] +; SI-SDAG-NEXT: v_sqrt_f64_e32 v[2:3], v[2:3] +; SI-SDAG-NEXT: s_mov_b32 s6, 0xbff00000 +; SI-SDAG-NEXT: v_div_scale_f64 v[4:5], s[4:5], v[0:1], v[0:1], -1.0 +; SI-SDAG-NEXT: v_div_scale_f64 v[8:9], s[4:5], v[2:3], v[2:3], -1.0 +; SI-SDAG-NEXT: v_rcp_f64_e32 v[6:7], v[4:5] +; SI-SDAG-NEXT: v_div_scale_f64 v[12:13], s[4:5], -1.0, v[0:1], -1.0 +; SI-SDAG-NEXT: v_div_scale_f64 v[18:19], s[4:5], -1.0, v[2:3], -1.0 +; SI-SDAG-NEXT: v_fma_f64 v[10:11], -v[4:5], v[6:7], 1.0 +; SI-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, v1, v5 +; SI-SDAG-NEXT: v_fma_f64 v[6:7], v[6:7], v[10:11], v[6:7] +; SI-SDAG-NEXT: v_rcp_f64_e32 v[10:11], v[8:9] +; SI-SDAG-NEXT: v_fma_f64 v[14:15], -v[4:5], v[6:7], 1.0 +; SI-SDAG-NEXT: v_cmp_eq_u32_e64 s[4:5], s6, v13 +; SI-SDAG-NEXT: v_fma_f64 v[6:7], v[6:7], v[14:15], v[6:7] +; SI-SDAG-NEXT: v_fma_f64 v[14:15], -v[8:9], v[10:11], 1.0 +; SI-SDAG-NEXT: v_mul_f64 v[16:17], v[12:13], v[6:7] +; SI-SDAG-NEXT: v_fma_f64 v[10:11], v[10:11], v[14:15], v[10:11] +; SI-SDAG-NEXT: v_fma_f64 v[4:5], -v[4:5], v[16:17], v[12:13] +; SI-SDAG-NEXT: v_fma_f64 v[14:15], -v[8:9], v[10:11], 1.0 +; SI-SDAG-NEXT: s_xor_b64 vcc, s[4:5], vcc +; SI-SDAG-NEXT: v_fma_f64 v[10:11], v[10:11], v[14:15], v[10:11] +; SI-SDAG-NEXT: v_div_fmas_f64 v[4:5], v[4:5], v[6:7], v[16:17] +; SI-SDAG-NEXT: v_mul_f64 v[12:13], v[18:19], v[10:11] +; SI-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, v3, v9 +; SI-SDAG-NEXT: v_fma_f64 v[6:7], -v[8:9], v[12:13], v[18:19] +; SI-SDAG-NEXT: v_cmp_eq_u32_e64 s[4:5], s6, v19 +; SI-SDAG-NEXT: s_xor_b64 vcc, s[4:5], vcc +; SI-SDAG-NEXT: v_div_fixup_f64 v[0:1], v[4:5], v[0:1], -1.0 +; SI-SDAG-NEXT: v_div_fmas_f64 v[6:7], v[6:7], v[10:11], v[12:13] +; SI-SDAG-NEXT: v_div_fixup_f64 v[2:3], v[6:7], v[2:3], -1.0 +; SI-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; SI-GISEL-LABEL: v_neg_rsq_v2f64: +; SI-GISEL: ; %bb.0: +; SI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-GISEL-NEXT: v_sqrt_f64_e32 v[0:1], v[0:1] +; SI-GISEL-NEXT: v_sqrt_f64_e32 v[2:3], v[2:3] +; SI-GISEL-NEXT: v_mov_b32_e32 v20, 0xbff00000 +; SI-GISEL-NEXT: v_div_scale_f64 v[4:5], s[4:5], v[0:1], v[0:1], -1.0 +; SI-GISEL-NEXT: v_div_scale_f64 v[8:9], s[4:5], v[2:3], v[2:3], -1.0 +; SI-GISEL-NEXT: v_rcp_f64_e32 v[6:7], v[4:5] +; SI-GISEL-NEXT: v_div_scale_f64 v[12:13], s[4:5], -1.0, v[0:1], -1.0 +; SI-GISEL-NEXT: v_div_scale_f64 v[18:19], s[4:5], -1.0, v[2:3], -1.0 +; SI-GISEL-NEXT: v_fma_f64 v[10:11], -v[4:5], v[6:7], 1.0 +; SI-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v20, v13 +; SI-GISEL-NEXT: v_fma_f64 v[6:7], v[6:7], v[10:11], v[6:7] +; SI-GISEL-NEXT: v_rcp_f64_e32 v[10:11], v[8:9] +; SI-GISEL-NEXT: v_fma_f64 v[14:15], -v[4:5], v[6:7], 1.0 +; SI-GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v1, v5 +; SI-GISEL-NEXT: v_fma_f64 v[6:7], v[6:7], v[14:15], v[6:7] +; SI-GISEL-NEXT: v_fma_f64 v[14:15], -v[8:9], v[10:11], 1.0 +; SI-GISEL-NEXT: v_mul_f64 v[16:17], v[12:13], v[6:7] +; SI-GISEL-NEXT: v_fma_f64 v[10:11], v[10:11], v[14:15], v[10:11] +; SI-GISEL-NEXT: v_fma_f64 v[12:13], -v[4:5], v[16:17], v[12:13] +; SI-GISEL-NEXT: v_fma_f64 v[14:15], -v[8:9], v[10:11], 1.0 +; SI-GISEL-NEXT: s_xor_b64 vcc, vcc, s[4:5] +; SI-GISEL-NEXT: v_fma_f64 v[4:5], v[10:11], v[14:15], v[10:11] +; SI-GISEL-NEXT: v_div_fmas_f64 v[6:7], v[12:13], v[6:7], v[16:17] +; SI-GISEL-NEXT: v_mul_f64 v[10:11], v[18:19], v[4:5] +; SI-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v20, v19 +; SI-GISEL-NEXT: v_fma_f64 v[12:13], -v[8:9], v[10:11], v[18:19] +; SI-GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v3, v9 +; SI-GISEL-NEXT: s_xor_b64 vcc, vcc, s[4:5] +; SI-GISEL-NEXT: v_div_fixup_f64 v[0:1], v[6:7], v[0:1], -1.0 +; SI-GISEL-NEXT: v_div_fmas_f64 v[4:5], v[12:13], v[4:5], v[10:11] +; SI-GISEL-NEXT: v_div_fixup_f64 v[2:3], v[4:5], v[2:3], -1.0 +; SI-GISEL-NEXT: s_setpc_b64 s[30:31] +; +; VI-SDAG-LABEL: v_neg_rsq_v2f64: +; VI-SDAG: ; %bb.0: +; VI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-SDAG-NEXT: v_sqrt_f64_e32 v[0:1], v[0:1] +; VI-SDAG-NEXT: v_sqrt_f64_e32 v[2:3], v[2:3] +; VI-SDAG-NEXT: v_div_scale_f64 v[4:5], s[4:5], v[0:1], v[0:1], -1.0 +; VI-SDAG-NEXT: v_div_scale_f64 v[6:7], s[4:5], v[2:3], v[2:3], -1.0 +; VI-SDAG-NEXT: v_div_scale_f64 v[16:17], s[4:5], -1.0, v[2:3], -1.0 +; VI-SDAG-NEXT: v_rcp_f64_e32 v[8:9], v[4:5] +; VI-SDAG-NEXT: v_rcp_f64_e32 v[10:11], v[6:7] +; VI-SDAG-NEXT: v_fma_f64 v[12:13], -v[4:5], v[8:9], 1.0 +; VI-SDAG-NEXT: v_fma_f64 v[14:15], -v[6:7], v[10:11], 1.0 +; VI-SDAG-NEXT: v_fma_f64 v[8:9], v[8:9], v[12:13], v[8:9] +; VI-SDAG-NEXT: v_div_scale_f64 v[12:13], vcc, -1.0, v[0:1], -1.0 +; VI-SDAG-NEXT: v_fma_f64 v[10:11], v[10:11], v[14:15], v[10:11] +; VI-SDAG-NEXT: v_fma_f64 v[14:15], -v[4:5], v[8:9], 1.0 +; VI-SDAG-NEXT: v_fma_f64 v[18:19], -v[6:7], v[10:11], 1.0 +; VI-SDAG-NEXT: v_fma_f64 v[8:9], v[8:9], v[14:15], v[8:9] +; VI-SDAG-NEXT: v_fma_f64 v[10:11], v[10:11], v[18:19], v[10:11] +; VI-SDAG-NEXT: v_mul_f64 v[14:15], v[12:13], v[8:9] +; VI-SDAG-NEXT: v_mul_f64 v[18:19], v[16:17], v[10:11] +; VI-SDAG-NEXT: v_fma_f64 v[4:5], -v[4:5], v[14:15], v[12:13] +; VI-SDAG-NEXT: v_fma_f64 v[6:7], -v[6:7], v[18:19], v[16:17] +; VI-SDAG-NEXT: v_div_fmas_f64 v[4:5], v[4:5], v[8:9], v[14:15] +; VI-SDAG-NEXT: s_mov_b64 vcc, s[4:5] +; VI-SDAG-NEXT: v_div_fmas_f64 v[6:7], v[6:7], v[10:11], v[18:19] +; VI-SDAG-NEXT: v_div_fixup_f64 v[0:1], v[4:5], v[0:1], -1.0 +; VI-SDAG-NEXT: v_div_fixup_f64 v[2:3], v[6:7], v[2:3], -1.0 +; VI-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; VI-GISEL-LABEL: v_neg_rsq_v2f64: +; VI-GISEL: ; %bb.0: +; VI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-GISEL-NEXT: v_sqrt_f64_e32 v[0:1], v[0:1] +; VI-GISEL-NEXT: v_sqrt_f64_e32 v[2:3], v[2:3] +; VI-GISEL-NEXT: v_div_scale_f64 v[4:5], s[4:5], v[0:1], v[0:1], -1.0 +; VI-GISEL-NEXT: v_div_scale_f64 v[6:7], s[4:5], v[2:3], v[2:3], -1.0 +; VI-GISEL-NEXT: v_div_scale_f64 v[16:17], s[4:5], -1.0, v[2:3], -1.0 +; VI-GISEL-NEXT: v_rcp_f64_e32 v[8:9], v[4:5] +; VI-GISEL-NEXT: v_rcp_f64_e32 v[10:11], v[6:7] +; VI-GISEL-NEXT: v_fma_f64 v[12:13], -v[4:5], v[8:9], 1.0 +; VI-GISEL-NEXT: v_fma_f64 v[14:15], -v[6:7], v[10:11], 1.0 +; VI-GISEL-NEXT: v_fma_f64 v[8:9], v[8:9], v[12:13], v[8:9] +; VI-GISEL-NEXT: v_div_scale_f64 v[12:13], vcc, -1.0, v[0:1], -1.0 +; VI-GISEL-NEXT: v_fma_f64 v[10:11], v[10:11], v[14:15], v[10:11] +; VI-GISEL-NEXT: v_fma_f64 v[14:15], -v[4:5], v[8:9], 1.0 +; VI-GISEL-NEXT: v_fma_f64 v[18:19], -v[6:7], v[10:11], 1.0 +; VI-GISEL-NEXT: v_fma_f64 v[8:9], v[8:9], v[14:15], v[8:9] +; VI-GISEL-NEXT: v_fma_f64 v[10:11], v[10:11], v[18:19], v[10:11] +; VI-GISEL-NEXT: v_mul_f64 v[14:15], v[12:13], v[8:9] +; VI-GISEL-NEXT: v_mul_f64 v[18:19], v[16:17], v[10:11] +; VI-GISEL-NEXT: v_fma_f64 v[4:5], -v[4:5], v[14:15], v[12:13] +; VI-GISEL-NEXT: v_fma_f64 v[6:7], -v[6:7], v[18:19], v[16:17] +; VI-GISEL-NEXT: v_div_fmas_f64 v[4:5], v[4:5], v[8:9], v[14:15] +; VI-GISEL-NEXT: s_mov_b64 vcc, s[4:5] +; VI-GISEL-NEXT: v_div_fmas_f64 v[6:7], v[6:7], v[10:11], v[18:19] +; VI-GISEL-NEXT: v_div_fixup_f64 v[0:1], v[4:5], v[0:1], -1.0 +; VI-GISEL-NEXT: v_div_fixup_f64 v[2:3], v[6:7], v[2:3], -1.0 +; VI-GISEL-NEXT: s_setpc_b64 s[30:31] + %sqrt = call <2 x double> @llvm.sqrt.v2f64(<2 x double> %x) + %rsq = fdiv <2 x double> <double -1.0, double -1.0>, %sqrt + ret <2 x double> %rsq +} + +define <2 x double> @v_neg_rsq_v2f64_poisonelt(<2 x double> %x) { +; SI-SDAG-LABEL: v_neg_rsq_v2f64_poisonelt: +; SI-SDAG: ; %bb.0: +; SI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-SDAG-NEXT: v_sqrt_f64_e32 v[0:1], v[0:1] +; SI-SDAG-NEXT: s_mov_b32 s6, 0xbff00000 +; SI-SDAG-NEXT: v_div_scale_f64 v[2:3], s[4:5], v[0:1], v[0:1], -1.0 +; SI-SDAG-NEXT: v_rcp_f64_e32 v[4:5], v[2:3] +; SI-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, v1, v3 +; SI-SDAG-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0 +; SI-SDAG-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5] +; SI-SDAG-NEXT: v_div_scale_f64 v[6:7], s[4:5], -1.0, v[0:1], -1.0 +; SI-SDAG-NEXT: v_fma_f64 v[8:9], -v[2:3], v[4:5], 1.0 +; SI-SDAG-NEXT: v_fma_f64 v[4:5], v[4:5], v[8:9], v[4:5] +; SI-SDAG-NEXT: v_cmp_eq_u32_e64 s[4:5], s6, v7 +; SI-SDAG-NEXT: v_mul_f64 v[8:9], v[6:7], v[4:5] +; SI-SDAG-NEXT: s_xor_b64 vcc, s[4:5], vcc +; SI-SDAG-NEXT: v_fma_f64 v[2:3], -v[2:3], v[8:9], v[6:7] +; SI-SDAG-NEXT: v_div_fmas_f64 v[2:3], v[2:3], v[4:5], v[8:9] +; SI-SDAG-NEXT: v_div_fixup_f64 v[0:1], v[2:3], v[0:1], -1.0 +; SI-SDAG-NEXT: v_mov_b32_e32 v2, 0 +; SI-SDAG-NEXT: v_mov_b32_e32 v3, 0x7ff80000 +; SI-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; SI-GISEL-LABEL: v_neg_rsq_v2f64_poisonelt: +; SI-GISEL: ; %bb.0: +; SI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-GISEL-NEXT: v_sqrt_f64_e32 v[0:1], v[0:1] +; SI-GISEL-NEXT: v_sqrt_f64_e32 v[2:3], v[2:3] +; SI-GISEL-NEXT: v_mov_b32_e32 v16, 0xbff00000 +; SI-GISEL-NEXT: v_div_scale_f64 v[4:5], s[4:5], v[0:1], v[0:1], -1.0 +; SI-GISEL-NEXT: v_div_scale_f64 v[8:9], s[4:5], v[2:3], v[2:3], s[4:5] +; SI-GISEL-NEXT: v_rcp_f64_e32 v[6:7], v[4:5] +; SI-GISEL-NEXT: v_div_scale_f64 v[12:13], s[4:5], -1.0, v[0:1], -1.0 +; SI-GISEL-NEXT: v_div_scale_f64 v[18:19], s[4:5], s[4:5], v[2:3], s[4:5] +; SI-GISEL-NEXT: v_fma_f64 v[10:11], -v[4:5], v[6:7], 1.0 +; SI-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v16, v13 +; SI-GISEL-NEXT: v_fma_f64 v[6:7], v[6:7], v[10:11], v[6:7] +; SI-GISEL-NEXT: v_rcp_f64_e32 v[10:11], v[8:9] +; SI-GISEL-NEXT: v_fma_f64 v[14:15], -v[4:5], v[6:7], 1.0 +; SI-GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v1, v5 +; SI-GISEL-NEXT: v_fma_f64 v[6:7], v[6:7], v[14:15], v[6:7] +; SI-GISEL-NEXT: v_fma_f64 v[14:15], -v[8:9], v[10:11], 1.0 +; SI-GISEL-NEXT: v_mul_f64 v[16:17], v[12:13], v[6:7] +; SI-GISEL-NEXT: v_fma_f64 v[10:11], v[10:11], v[14:15], v[10:11] +; SI-GISEL-NEXT: v_fma_f64 v[12:13], -v[4:5], v[16:17], v[12:13] +; SI-GISEL-NEXT: v_fma_f64 v[14:15], -v[8:9], v[10:11], 1.0 +; SI-GISEL-NEXT: s_xor_b64 vcc, vcc, s[4:5] +; SI-GISEL-NEXT: v_fma_f64 v[4:5], v[10:11], v[14:15], v[10:11] +; SI-GISEL-NEXT: v_div_fmas_f64 v[6:7], v[12:13], v[6:7], v[16:17] +; SI-GISEL-NEXT: v_mul_f64 v[10:11], v[18:19], v[4:5] +; SI-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, s4, v19 +; SI-GISEL-NEXT: v_fma_f64 v[12:13], -v[8:9], v[10:11], v[18:19] +; SI-GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v3, v9 +; SI-GISEL-NEXT: s_xor_b64 vcc, vcc, s[4:5] +; SI-GISEL-NEXT: v_div_fixup_f64 v[0:1], v[6:7], v[0:1], -1.0 +; SI-GISEL-NEXT: v_div_fmas_f64 v[4:5], v[12:13], v[4:5], v[10:11] +; SI-GISEL-NEXT: v_div_fixup_f64 v[2:3], v[4:5], v[2:3], s[4:5] +; SI-GISEL-NEXT: s_setpc_b64 s[30:31] +; +; VI-SDAG-LABEL: v_neg_rsq_v2f64_poisonelt: +; VI-SDAG: ; %bb.0: +; VI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-SDAG-NEXT: v_sqrt_f64_e32 v[0:1], v[0:1] +; VI-SDAG-NEXT: v_div_scale_f64 v[2:3], s[4:5], v[0:1], v[0:1], -1.0 +; VI-SDAG-NEXT: v_rcp_f64_e32 v[4:5], v[2:3] +; VI-SDAG-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0 +; VI-SDAG-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5] +; VI-SDAG-NEXT: v_div_scale_f64 v[6:7], vcc, -1.0, v[0:1], -1.0 +; VI-SDAG-NEXT: v_fma_f64 v[8:9], -v[2:3], v[4:5], 1.0 +; VI-SDAG-NEXT: v_fma_f64 v[4:5], v[4:5], v[8:9], v[4:5] +; VI-SDAG-NEXT: v_mul_f64 v[8:9], v[6:7], v[4:5] +; VI-SDAG-NEXT: v_fma_f64 v[2:3], -v[2:3], v[8:9], v[6:7] +; VI-SDAG-NEXT: v_div_fmas_f64 v[2:3], v[2:3], v[4:5], v[8:9] +; VI-SDAG-NEXT: v_div_fixup_f64 v[0:1], v[2:3], v[0:1], -1.0 +; VI-SDAG-NEXT: v_mov_b32_e32 v2, 0 +; VI-SDAG-NEXT: v_mov_b32_e32 v3, 0x7ff80000 +; VI-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; VI-GISEL-LABEL: v_neg_rsq_v2f64_poisonelt: +; VI-GISEL: ; %bb.0: +; VI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-GISEL-NEXT: v_sqrt_f64_e32 v[0:1], v[0:1] +; VI-GISEL-NEXT: v_sqrt_f64_e32 v[2:3], v[2:3] +; VI-GISEL-NEXT: v_div_scale_f64 v[4:5], s[4:5], v[0:1], v[0:1], -1.0 +; VI-GISEL-NEXT: v_div_scale_f64 v[6:7], s[4:5], v[2:3], v[2:3], s[4:5] +; VI-GISEL-NEXT: v_div_scale_f64 v[16:17], s[4:5], s[4:5], v[2:3], s[4:5] +; VI-GISEL-NEXT: v_rcp_f64_e32 v[8:9], v[4:5] +; VI-GISEL-NEXT: v_rcp_f64_e32 v[10:11], v[6:7] +; VI-GISEL-NEXT: v_fma_f64 v[12:13], -v[4:5], v[8:9], 1.0 +; VI-GISEL-NEXT: v_fma_f64 v[14:15], -v[6:7], v[10:11], 1.0 +; VI-GISEL-NEXT: v_fma_f64 v[8:9], v[8:9], v[12:13], v[8:9] +; VI-GISEL-NEXT: v_div_scale_f64 v[12:13], vcc, -1.0, v[0:1], -1.0 +; VI-GISEL-NEXT: v_fma_f64 v[10:11], v[10:11], v[14:15], v[10:11] +; VI-GISEL-NEXT: v_fma_f64 v[14:15], -v[4:5], v[8:9], 1.0 +; VI-GISEL-NEXT: v_fma_f64 v[18:19], -v[6:7], v[10:11], 1.0 +; VI-GISEL-NEXT: v_fma_f64 v[8:9], v[8:9], v[14:15], v[8:9] +; VI-GISEL-NEXT: v_fma_f64 v[10:11], v[10:11], v[18:19], v[10:11] +; VI-GISEL-NEXT: v_mul_f64 v[14:15], v[12:13], v[8:9] +; VI-GISEL-NEXT: v_mul_f64 v[18:19], v[16:17], v[10:11] +; VI-GISEL-NEXT: v_fma_f64 v[4:5], -v[4:5], v[14:15], v[12:13] +; VI-GISEL-NEXT: v_fma_f64 v[6:7], -v[6:7], v[18:19], v[16:17] +; VI-GISEL-NEXT: v_div_fmas_f64 v[4:5], v[4:5], v[8:9], v[14:15] +; VI-GISEL-NEXT: s_mov_b64 vcc, s[4:5] +; VI-GISEL-NEXT: v_div_fmas_f64 v[6:7], v[6:7], v[10:11], v[18:19] +; VI-GISEL-NEXT: v_div_fixup_f64 v[0:1], v[4:5], v[0:1], -1.0 +; VI-GISEL-NEXT: v_div_fixup_f64 v[2:3], v[6:7], v[2:3], s[4:5] +; VI-GISEL-NEXT: s_setpc_b64 s[30:31] + %sqrt = call <2 x double> @llvm.sqrt.v2f64(<2 x double> %x) + %rsq = fdiv <2 x double> <double -1.0, double poison>, %sqrt + ret <2 x double> %rsq +} + +define <2 x double> @v_neg_pos_rsq_v2f64(<2 x double> %x) { +; SI-SDAG-LABEL: v_neg_pos_rsq_v2f64: +; SI-SDAG: ; %bb.0: +; SI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-SDAG-NEXT: v_sqrt_f64_e32 v[0:1], v[0:1] +; SI-SDAG-NEXT: v_sqrt_f64_e32 v[2:3], v[2:3] +; SI-SDAG-NEXT: s_mov_b32 s6, 0xbff00000 +; SI-SDAG-NEXT: v_div_scale_f64 v[4:5], s[4:5], v[0:1], v[0:1], -1.0 +; SI-SDAG-NEXT: v_div_scale_f64 v[8:9], s[4:5], v[2:3], v[2:3], 1.0 +; SI-SDAG-NEXT: v_rcp_f64_e32 v[6:7], v[4:5] +; SI-SDAG-NEXT: v_div_scale_f64 v[12:13], s[4:5], -1.0, v[0:1], -1.0 +; SI-SDAG-NEXT: v_div_scale_f64 v[18:19], s[4:5], 1.0, v[2:3], 1.0 +; SI-SDAG-NEXT: v_fma_f64 v[10:11], -v[4:5], v[6:7], 1.0 +; SI-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, v1, v5 +; SI-SDAG-NEXT: v_fma_f64 v[6:7], v[6:7], v[10:11], v[6:7] +; SI-SDAG-NEXT: v_rcp_f64_e32 v[10:11], v[8:9] +; SI-SDAG-NEXT: v_fma_f64 v[14:15], -v[4:5], v[6:7], 1.0 +; SI-SDAG-NEXT: v_cmp_eq_u32_e64 s[4:5], s6, v13 +; SI-SDAG-NEXT: v_fma_f64 v[6:7], v[6:7], v[14:15], v[6:7] +; SI-SDAG-NEXT: v_fma_f64 v[14:15], -v[8:9], v[10:11], 1.0 +; SI-SDAG-NEXT: v_mul_f64 v[16:17], v[12:13], v[6:7] +; SI-SDAG-NEXT: v_fma_f64 v[10:11], v[10:11], v[14:15], v[10:11] +; SI-SDAG-NEXT: v_fma_f64 v[4:5], -v[4:5], v[16:17], v[12:13] +; SI-SDAG-NEXT: v_fma_f64 v[14:15], -v[8:9], v[10:11], 1.0 +; SI-SDAG-NEXT: s_xor_b64 vcc, s[4:5], vcc +; SI-SDAG-NEXT: v_fma_f64 v[10:11], v[10:11], v[14:15], v[10:11] +; SI-SDAG-NEXT: s_mov_b32 s4, 0x3ff00000 +; SI-SDAG-NEXT: v_mul_f64 v[12:13], v[18:19], v[10:11] +; SI-SDAG-NEXT: v_div_fmas_f64 v[4:5], v[4:5], v[6:7], v[16:17] +; SI-SDAG-NEXT: v_fma_f64 v[6:7], -v[8:9], v[12:13], v[18:19] +; SI-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, v3, v9 +; SI-SDAG-NEXT: v_cmp_eq_u32_e64 s[4:5], s4, v19 +; SI-SDAG-NEXT: s_xor_b64 vcc, s[4:5], vcc +; SI-SDAG-NEXT: v_div_fixup_f64 v[0:1], v[4:5], v[0:1], -1.0 +; SI-SDAG-NEXT: s_nop 0 +; SI-SDAG-NEXT: v_div_fmas_f64 v[6:7], v[6:7], v[10:11], v[12:13] +; SI-SDAG-NEXT: v_div_fixup_f64 v[2:3], v[6:7], v[2:3], 1.0 +; SI-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; SI-GISEL-LABEL: v_neg_pos_rsq_v2f64: +; SI-GISEL: ; %bb.0: +; SI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-GISEL-NEXT: v_sqrt_f64_e32 v[0:1], v[0:1] +; SI-GISEL-NEXT: v_sqrt_f64_e32 v[2:3], v[2:3] +; SI-GISEL-NEXT: v_mov_b32_e32 v16, 0xbff00000 +; SI-GISEL-NEXT: v_div_scale_f64 v[4:5], s[4:5], v[0:1], v[0:1], -1.0 +; SI-GISEL-NEXT: v_div_scale_f64 v[8:9], s[4:5], v[2:3], v[2:3], 1.0 +; SI-GISEL-NEXT: v_rcp_f64_e32 v[6:7], v[4:5] +; SI-GISEL-NEXT: v_div_scale_f64 v[12:13], s[4:5], -1.0, v[0:1], -1.0 +; SI-GISEL-NEXT: v_div_scale_f64 v[18:19], s[4:5], 1.0, v[2:3], 1.0 +; SI-GISEL-NEXT: v_fma_f64 v[10:11], -v[4:5], v[6:7], 1.0 +; SI-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v16, v13 +; SI-GISEL-NEXT: v_fma_f64 v[6:7], v[6:7], v[10:11], v[6:7] +; SI-GISEL-NEXT: v_rcp_f64_e32 v[10:11], v[8:9] +; SI-GISEL-NEXT: v_fma_f64 v[14:15], -v[4:5], v[6:7], 1.0 +; SI-GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v1, v5 +; SI-GISEL-NEXT: v_fma_f64 v[6:7], v[6:7], v[14:15], v[6:7] +; SI-GISEL-NEXT: v_fma_f64 v[14:15], -v[8:9], v[10:11], 1.0 +; SI-GISEL-NEXT: v_mul_f64 v[16:17], v[12:13], v[6:7] +; SI-GISEL-NEXT: v_fma_f64 v[10:11], v[10:11], v[14:15], v[10:11] +; SI-GISEL-NEXT: v_fma_f64 v[12:13], -v[4:5], v[16:17], v[12:13] +; SI-GISEL-NEXT: v_fma_f64 v[14:15], -v[8:9], v[10:11], 1.0 +; SI-GISEL-NEXT: s_xor_b64 vcc, vcc, s[4:5] +; SI-GISEL-NEXT: v_fma_f64 v[4:5], v[10:11], v[14:15], v[10:11] +; SI-GISEL-NEXT: v_div_fmas_f64 v[6:7], v[12:13], v[6:7], v[16:17] +; SI-GISEL-NEXT: v_mul_f64 v[10:11], v[18:19], v[4:5] +; SI-GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v3, v9 +; SI-GISEL-NEXT: v_fma_f64 v[12:13], -v[8:9], v[10:11], v[18:19] +; SI-GISEL-NEXT: v_mov_b32_e32 v8, 0x3ff00000 +; SI-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v8, v19 +; SI-GISEL-NEXT: s_xor_b64 vcc, vcc, s[4:5] +; SI-GISEL-NEXT: v_div_fixup_f64 v[0:1], v[6:7], v[0:1], -1.0 +; SI-GISEL-NEXT: s_nop 1 +; SI-GISEL-NEXT: v_div_fmas_f64 v[4:5], v[12:13], v[4:5], v[10:11] +; SI-GISEL-NEXT: v_div_fixup_f64 v[2:3], v[4:5], v[2:3], 1.0 +; SI-GISEL-NEXT: s_setpc_b64 s[30:31] +; +; VI-SDAG-LABEL: v_neg_pos_rsq_v2f64: +; VI-SDAG: ; %bb.0: +; VI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-SDAG-NEXT: v_sqrt_f64_e32 v[0:1], v[0:1] +; VI-SDAG-NEXT: v_sqrt_f64_e32 v[2:3], v[2:3] +; VI-SDAG-NEXT: v_div_scale_f64 v[4:5], s[4:5], v[0:1], v[0:1], -1.0 +; VI-SDAG-NEXT: v_div_scale_f64 v[6:7], s[4:5], v[2:3], v[2:3], 1.0 +; VI-SDAG-NEXT: v_div_scale_f64 v[16:17], s[4:5], 1.0, v[2:3], 1.0 +; VI-SDAG-NEXT: v_rcp_f64_e32 v[8:9], v[4:5] +; VI-SDAG-NEXT: v_rcp_f64_e32 v[10:11], v[6:7] +; VI-SDAG-NEXT: v_fma_f64 v[12:13], -v[4:5], v[8:9], 1.0 +; VI-SDAG-NEXT: v_fma_f64 v[14:15], -v[6:7], v[10:11], 1.0 +; VI-SDAG-NEXT: v_fma_f64 v[8:9], v[8:9], v[12:13], v[8:9] +; VI-SDAG-NEXT: v_div_scale_f64 v[12:13], vcc, -1.0, v[0:1], -1.0 +; VI-SDAG-NEXT: v_fma_f64 v[10:11], v[10:11], v[14:15], v[10:11] +; VI-SDAG-NEXT: v_fma_f64 v[14:15], -v[4:5], v[8:9], 1.0 +; VI-SDAG-NEXT: v_fma_f64 v[18:19], -v[6:7], v[10:11], 1.0 +; VI-SDAG-NEXT: v_fma_f64 v[8:9], v[8:9], v[14:15], v[8:9] +; VI-SDAG-NEXT: v_fma_f64 v[10:11], v[10:11], v[18:19], v[10:11] +; VI-SDAG-NEXT: v_mul_f64 v[14:15], v[12:13], v[8:9] +; VI-SDAG-NEXT: v_mul_f64 v[18:19], v[16:17], v[10:11] +; VI-SDAG-NEXT: v_fma_f64 v[4:5], -v[4:5], v[14:15], v[12:13] +; VI-SDAG-NEXT: v_fma_f64 v[6:7], -v[6:7], v[18:19], v[16:17] +; VI-SDAG-NEXT: v_div_fmas_f64 v[4:5], v[4:5], v[8:9], v[14:15] +; VI-SDAG-NEXT: s_mov_b64 vcc, s[4:5] +; VI-SDAG-NEXT: v_div_fmas_f64 v[6:7], v[6:7], v[10:11], v[18:19] +; VI-SDAG-NEXT: v_div_fixup_f64 v[0:1], v[4:5], v[0:1], -1.0 +; VI-SDAG-NEXT: v_div_fixup_f64 v[2:3], v[6:7], v[2:3], 1.0 +; VI-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; VI-GISEL-LABEL: v_neg_pos_rsq_v2f64: +; VI-GISEL: ; %bb.0: +; VI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-GISEL-NEXT: v_sqrt_f64_e32 v[0:1], v[0:1] +; VI-GISEL-NEXT: v_sqrt_f64_e32 v[2:3], v[2:3] +; VI-GISEL-NEXT: v_div_scale_f64 v[4:5], s[4:5], v[0:1], v[0:1], -1.0 +; VI-GISEL-NEXT: v_div_scale_f64 v[6:7], s[4:5], v[2:3], v[2:3], 1.0 +; VI-GISEL-NEXT: v_div_scale_f64 v[16:17], s[4:5], 1.0, v[2:3], 1.0 +; VI-GISEL-NEXT: v_rcp_f64_e32 v[8:9], v[4:5] +; VI-GISEL-NEXT: v_rcp_f64_e32 v[10:11], v[6:7] +; VI-GISEL-NEXT: v_fma_f64 v[12:13], -v[4:5], v[8:9], 1.0 +; VI-GISEL-NEXT: v_fma_f64 v[14:15], -v[6:7], v[10:11], 1.0 +; VI-GISEL-NEXT: v_fma_f64 v[8:9], v[8:9], v[12:13], v[8:9] +; VI-GISEL-NEXT: v_div_scale_f64 v[12:13], vcc, -1.0, v[0:1], -1.0 +; VI-GISEL-NEXT: v_fma_f64 v[10:11], v[10:11], v[14:15], v[10:11] +; VI-GISEL-NEXT: v_fma_f64 v[14:15], -v[4:5], v[8:9], 1.0 +; VI-GISEL-NEXT: v_fma_f64 v[18:19], -v[6:7], v[10:11], 1.0 +; VI-GISEL-NEXT: v_fma_f64 v[8:9], v[8:9], v[14:15], v[8:9] +; VI-GISEL-NEXT: v_fma_f64 v[10:11], v[10:11], v[18:19], v[10:11] +; VI-GISEL-NEXT: v_mul_f64 v[14:15], v[12:13], v[8:9] +; VI-GISEL-NEXT: v_mul_f64 v[18:19], v[16:17], v[10:11] +; VI-GISEL-NEXT: v_fma_f64 v[4:5], -v[4:5], v[14:15], v[12:13] +; VI-GISEL-NEXT: v_fma_f64 v[6:7], -v[6:7], v[18:19], v[16:17] +; VI-GISEL-NEXT: v_div_fmas_f64 v[4:5], v[4:5], v[8:9], v[14:15] +; VI-GISEL-NEXT: s_mov_b64 vcc, s[4:5] +; VI-GISEL-NEXT: v_div_fmas_f64 v[6:7], v[6:7], v[10:11], v[18:19] +; VI-GISEL-NEXT: v_div_fixup_f64 v[0:1], v[4:5], v[0:1], -1.0 +; VI-GISEL-NEXT: v_div_fixup_f64 v[2:3], v[6:7], v[2:3], 1.0 +; VI-GISEL-NEXT: s_setpc_b64 s[30:31] + %sqrt = call <2 x double> @llvm.sqrt.v2f64(<2 x double> %x) + %rsq = fdiv <2 x double> <double -1.0, double 1.0>, %sqrt + ret <2 x double> %rsq +} + +define double @v_rsq_f64_fneg_fabs(double %x) { +; SI-SDAG-LABEL: v_rsq_f64_fneg_fabs: +; SI-SDAG: ; %bb.0: +; SI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-SDAG-NEXT: v_sqrt_f64_e64 v[0:1], -|v[0:1]| +; SI-SDAG-NEXT: s_mov_b32 s6, 0x3ff00000 +; SI-SDAG-NEXT: v_div_scale_f64 v[2:3], s[4:5], v[0:1], v[0:1], 1.0 +; SI-SDAG-NEXT: v_rcp_f64_e32 v[4:5], v[2:3] +; SI-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, v1, v3 +; SI-SDAG-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0 +; SI-SDAG-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5] +; SI-SDAG-NEXT: v_div_scale_f64 v[6:7], s[4:5], 1.0, v[0:1], 1.0 +; SI-SDAG-NEXT: v_fma_f64 v[8:9], -v[2:3], v[4:5], 1.0 +; SI-SDAG-NEXT: v_fma_f64 v[4:5], v[4:5], v[8:9], v[4:5] +; SI-SDAG-NEXT: v_cmp_eq_u32_e64 s[4:5], s6, v7 +; SI-SDAG-NEXT: v_mul_f64 v[8:9], v[6:7], v[4:5] +; SI-SDAG-NEXT: s_xor_b64 vcc, s[4:5], vcc +; SI-SDAG-NEXT: v_fma_f64 v[2:3], -v[2:3], v[8:9], v[6:7] +; SI-SDAG-NEXT: v_div_fmas_f64 v[2:3], v[2:3], v[4:5], v[8:9] +; SI-SDAG-NEXT: v_div_fixup_f64 v[0:1], v[2:3], v[0:1], 1.0 +; SI-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; SI-GISEL-LABEL: v_rsq_f64_fneg_fabs: +; SI-GISEL: ; %bb.0: +; SI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-GISEL-NEXT: v_sqrt_f64_e64 v[0:1], -|v[0:1]| +; SI-GISEL-NEXT: v_mov_b32_e32 v10, 0x3ff00000 +; SI-GISEL-NEXT: v_div_scale_f64 v[2:3], s[4:5], v[0:1], v[0:1], 1.0 +; SI-GISEL-NEXT: v_div_scale_f64 v[8:9], s[4:5], 1.0, v[0:1], 1.0 +; SI-GISEL-NEXT: v_rcp_f64_e32 v[4:5], v[2:3] +; SI-GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v1, v3 +; SI-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v10, v9 +; SI-GISEL-NEXT: s_xor_b64 vcc, vcc, s[4:5] +; SI-GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0 +; SI-GISEL-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5] +; SI-GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0 +; SI-GISEL-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5] +; SI-GISEL-NEXT: v_mul_f64 v[6:7], v[8:9], v[4:5] +; SI-GISEL-NEXT: v_fma_f64 v[2:3], -v[2:3], v[6:7], v[8:9] +; SI-GISEL-NEXT: v_div_fmas_f64 v[2:3], v[2:3], v[4:5], v[6:7] +; SI-GISEL-NEXT: v_div_fixup_f64 v[0:1], v[2:3], v[0:1], 1.0 +; SI-GISEL-NEXT: s_setpc_b64 s[30:31] +; +; VI-SDAG-LABEL: v_rsq_f64_fneg_fabs: +; VI-SDAG: ; %bb.0: +; VI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-SDAG-NEXT: v_sqrt_f64_e64 v[0:1], -|v[0:1]| +; VI-SDAG-NEXT: v_div_scale_f64 v[2:3], s[4:5], v[0:1], v[0:1], 1.0 +; VI-SDAG-NEXT: v_rcp_f64_e32 v[4:5], v[2:3] +; VI-SDAG-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0 +; VI-SDAG-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5] +; VI-SDAG-NEXT: v_div_scale_f64 v[6:7], vcc, 1.0, v[0:1], 1.0 +; VI-SDAG-NEXT: v_fma_f64 v[8:9], -v[2:3], v[4:5], 1.0 +; VI-SDAG-NEXT: v_fma_f64 v[4:5], v[4:5], v[8:9], v[4:5] +; VI-SDAG-NEXT: v_mul_f64 v[8:9], v[6:7], v[4:5] +; VI-SDAG-NEXT: v_fma_f64 v[2:3], -v[2:3], v[8:9], v[6:7] +; VI-SDAG-NEXT: v_div_fmas_f64 v[2:3], v[2:3], v[4:5], v[8:9] +; VI-SDAG-NEXT: v_div_fixup_f64 v[0:1], v[2:3], v[0:1], 1.0 +; VI-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; VI-GISEL-LABEL: v_rsq_f64_fneg_fabs: +; VI-GISEL: ; %bb.0: +; VI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-GISEL-NEXT: v_sqrt_f64_e64 v[0:1], -|v[0:1]| +; VI-GISEL-NEXT: v_div_scale_f64 v[2:3], s[4:5], v[0:1], v[0:1], 1.0 +; VI-GISEL-NEXT: v_rcp_f64_e32 v[4:5], v[2:3] +; VI-GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0 +; VI-GISEL-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5] +; VI-GISEL-NEXT: v_div_scale_f64 v[6:7], vcc, 1.0, v[0:1], 1.0 +; VI-GISEL-NEXT: v_fma_f64 v[8:9], -v[2:3], v[4:5], 1.0 +; VI-GISEL-NEXT: v_fma_f64 v[4:5], v[4:5], v[8:9], v[4:5] +; VI-GISEL-NEXT: v_mul_f64 v[8:9], v[6:7], v[4:5] +; VI-GISEL-NEXT: v_fma_f64 v[2:3], -v[2:3], v[8:9], v[6:7] +; VI-GISEL-NEXT: v_div_fmas_f64 v[2:3], v[2:3], v[4:5], v[8:9] +; VI-GISEL-NEXT: v_div_fixup_f64 v[0:1], v[2:3], v[0:1], 1.0 +; VI-GISEL-NEXT: s_setpc_b64 s[30:31] + %fabs = call double @llvm.fabs.f64(double %x) + %fneg.fabs = fneg double %fabs + %sqrt = call contract double @llvm.sqrt.f64(double %fneg.fabs) + %rsq = fdiv contract double 1.0, %sqrt + ret double %rsq +} + +define double @v_rsq_f64__afn_sqrt(double %x) { +; SI-SDAG-LABEL: v_rsq_f64__afn_sqrt: +; SI-SDAG: ; %bb.0: +; SI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-SDAG-NEXT: v_sqrt_f64_e32 v[0:1], v[0:1] +; SI-SDAG-NEXT: s_mov_b32 s6, 0x3ff00000 +; SI-SDAG-NEXT: v_div_scale_f64 v[2:3], s[4:5], v[0:1], v[0:1], 1.0 +; SI-SDAG-NEXT: v_rcp_f64_e32 v[4:5], v[2:3] +; SI-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, v1, v3 +; SI-SDAG-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0 +; SI-SDAG-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5] +; SI-SDAG-NEXT: v_div_scale_f64 v[6:7], s[4:5], 1.0, v[0:1], 1.0 +; SI-SDAG-NEXT: v_fma_f64 v[8:9], -v[2:3], v[4:5], 1.0 +; SI-SDAG-NEXT: v_fma_f64 v[4:5], v[4:5], v[8:9], v[4:5] +; SI-SDAG-NEXT: v_cmp_eq_u32_e64 s[4:5], s6, v7 +; SI-SDAG-NEXT: v_mul_f64 v[8:9], v[6:7], v[4:5] +; SI-SDAG-NEXT: s_xor_b64 vcc, s[4:5], vcc +; SI-SDAG-NEXT: v_fma_f64 v[2:3], -v[2:3], v[8:9], v[6:7] +; SI-SDAG-NEXT: v_div_fmas_f64 v[2:3], v[2:3], v[4:5], v[8:9] +; SI-SDAG-NEXT: v_div_fixup_f64 v[0:1], v[2:3], v[0:1], 1.0 +; SI-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; SI-GISEL-LABEL: v_rsq_f64__afn_sqrt: +; SI-GISEL: ; %bb.0: +; SI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-GISEL-NEXT: v_sqrt_f64_e32 v[0:1], v[0:1] +; SI-GISEL-NEXT: v_mov_b32_e32 v10, 0x3ff00000 +; SI-GISEL-NEXT: v_div_scale_f64 v[2:3], s[4:5], v[0:1], v[0:1], 1.0 +; SI-GISEL-NEXT: v_div_scale_f64 v[8:9], s[4:5], 1.0, v[0:1], 1.0 +; SI-GISEL-NEXT: v_rcp_f64_e32 v[4:5], v[2:3] +; SI-GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v1, v3 +; SI-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v10, v9 +; SI-GISEL-NEXT: s_xor_b64 vcc, vcc, s[4:5] +; SI-GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0 +; SI-GISEL-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5] +; SI-GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0 +; SI-GISEL-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5] +; SI-GISEL-NEXT: v_mul_f64 v[6:7], v[8:9], v[4:5] +; SI-GISEL-NEXT: v_fma_f64 v[2:3], -v[2:3], v[6:7], v[8:9] +; SI-GISEL-NEXT: v_div_fmas_f64 v[2:3], v[2:3], v[4:5], v[6:7] +; SI-GISEL-NEXT: v_div_fixup_f64 v[0:1], v[2:3], v[0:1], 1.0 +; SI-GISEL-NEXT: s_setpc_b64 s[30:31] +; +; VI-SDAG-LABEL: v_rsq_f64__afn_sqrt: +; VI-SDAG: ; %bb.0: +; VI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-SDAG-NEXT: v_sqrt_f64_e32 v[0:1], v[0:1] +; VI-SDAG-NEXT: v_div_scale_f64 v[2:3], s[4:5], v[0:1], v[0:1], 1.0 +; VI-SDAG-NEXT: v_rcp_f64_e32 v[4:5], v[2:3] +; VI-SDAG-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0 +; VI-SDAG-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5] +; VI-SDAG-NEXT: v_div_scale_f64 v[6:7], vcc, 1.0, v[0:1], 1.0 +; VI-SDAG-NEXT: v_fma_f64 v[8:9], -v[2:3], v[4:5], 1.0 +; VI-SDAG-NEXT: v_fma_f64 v[4:5], v[4:5], v[8:9], v[4:5] +; VI-SDAG-NEXT: v_mul_f64 v[8:9], v[6:7], v[4:5] +; VI-SDAG-NEXT: v_fma_f64 v[2:3], -v[2:3], v[8:9], v[6:7] +; VI-SDAG-NEXT: v_div_fmas_f64 v[2:3], v[2:3], v[4:5], v[8:9] +; VI-SDAG-NEXT: v_div_fixup_f64 v[0:1], v[2:3], v[0:1], 1.0 +; VI-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; VI-GISEL-LABEL: v_rsq_f64__afn_sqrt: +; VI-GISEL: ; %bb.0: +; VI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-GISEL-NEXT: v_sqrt_f64_e32 v[0:1], v[0:1] +; VI-GISEL-NEXT: v_div_scale_f64 v[2:3], s[4:5], v[0:1], v[0:1], 1.0 +; VI-GISEL-NEXT: v_rcp_f64_e32 v[4:5], v[2:3] +; VI-GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0 +; VI-GISEL-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5] +; VI-GISEL-NEXT: v_div_scale_f64 v[6:7], vcc, 1.0, v[0:1], 1.0 +; VI-GISEL-NEXT: v_fma_f64 v[8:9], -v[2:3], v[4:5], 1.0 +; VI-GISEL-NEXT: v_fma_f64 v[4:5], v[4:5], v[8:9], v[4:5] +; VI-GISEL-NEXT: v_mul_f64 v[8:9], v[6:7], v[4:5] +; VI-GISEL-NEXT: v_fma_f64 v[2:3], -v[2:3], v[8:9], v[6:7] +; VI-GISEL-NEXT: v_div_fmas_f64 v[2:3], v[2:3], v[4:5], v[8:9] +; VI-GISEL-NEXT: v_div_fixup_f64 v[0:1], v[2:3], v[0:1], 1.0 +; VI-GISEL-NEXT: s_setpc_b64 s[30:31] + %sqrt = call contract afn double @llvm.sqrt.f64(double %x) + %rsq = fdiv contract double 1.0, %sqrt + ret double %rsq +} + +define double @v_rsq_f64__afn_fdiv(double %x) { +; SDAG-LABEL: v_rsq_f64__afn_fdiv: +; SDAG: ; %bb.0: +; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-NEXT: v_sqrt_f64_e32 v[0:1], v[0:1] +; SDAG-NEXT: v_rcp_f64_e32 v[2:3], v[0:1] +; SDAG-NEXT: v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0 +; SDAG-NEXT: v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3] +; SDAG-NEXT: v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0 +; SDAG-NEXT: v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3] +; SDAG-NEXT: v_fma_f64 v[0:1], -v[0:1], v[2:3], 1.0 +; SDAG-NEXT: v_fma_f64 v[0:1], v[0:1], v[2:3], v[2:3] +; SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-LABEL: v_rsq_f64__afn_fdiv: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-NEXT: v_sqrt_f64_e32 v[2:3], v[0:1] +; GISEL-NEXT: v_rsq_f64_e32 v[0:1], v[0:1] +; GISEL-NEXT: v_fma_f64 v[4:5], -v[2:3], v[0:1], 1.0 +; GISEL-NEXT: v_fma_f64 v[0:1], v[4:5], v[0:1], v[0:1] +; GISEL-NEXT: v_fma_f64 v[4:5], -v[2:3], v[0:1], 1.0 +; GISEL-NEXT: v_fma_f64 v[0:1], v[4:5], v[0:1], v[0:1] +; GISEL-NEXT: v_mul_f64 v[4:5], 1.0, v[0:1] +; GISEL-NEXT: v_fma_f64 v[2:3], -v[2:3], v[4:5], 1.0 +; GISEL-NEXT: v_fma_f64 v[0:1], v[2:3], v[0:1], v[4:5] +; GISEL-NEXT: s_setpc_b64 s[30:31] + %sqrt = call contract double @llvm.sqrt.f64(double %x) + %rsq = fdiv contract afn double 1.0, %sqrt + ret double %rsq +} + +define double @v_rsq_f64__afn(double %x) { +; SDAG-LABEL: v_rsq_f64__afn: +; SDAG: ; %bb.0: +; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-NEXT: v_sqrt_f64_e32 v[0:1], v[0:1] +; SDAG-NEXT: v_rcp_f64_e32 v[2:3], v[0:1] +; SDAG-NEXT: v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0 +; SDAG-NEXT: v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3] +; SDAG-NEXT: v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0 +; SDAG-NEXT: v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3] +; SDAG-NEXT: v_fma_f64 v[0:1], -v[0:1], v[2:3], 1.0 +; SDAG-NEXT: v_fma_f64 v[0:1], v[0:1], v[2:3], v[2:3] +; SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-LABEL: v_rsq_f64__afn: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-NEXT: v_sqrt_f64_e32 v[2:3], v[0:1] +; GISEL-NEXT: v_rsq_f64_e32 v[0:1], v[0:1] +; GISEL-NEXT: v_fma_f64 v[4:5], -v[2:3], v[0:1], 1.0 +; GISEL-NEXT: v_fma_f64 v[0:1], v[4:5], v[0:1], v[0:1] +; GISEL-NEXT: v_fma_f64 v[4:5], -v[2:3], v[0:1], 1.0 +; GISEL-NEXT: v_fma_f64 v[0:1], v[4:5], v[0:1], v[0:1] +; GISEL-NEXT: v_mul_f64 v[4:5], 1.0, v[0:1] +; GISEL-NEXT: v_fma_f64 v[2:3], -v[2:3], v[4:5], 1.0 +; GISEL-NEXT: v_fma_f64 v[0:1], v[2:3], v[0:1], v[4:5] +; GISEL-NEXT: s_setpc_b64 s[30:31] + %sqrt = call contract afn double @llvm.sqrt.f64(double %x) + %rsq = fdiv contract afn double 1.0, %sqrt + ret double %rsq +} + +define double @v_neg_rsq_f64__afn(double %x) { +; SDAG-LABEL: v_neg_rsq_f64__afn: +; SDAG: ; %bb.0: +; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-NEXT: v_sqrt_f64_e32 v[0:1], v[0:1] +; SDAG-NEXT: v_rcp_f64_e32 v[2:3], v[0:1] +; SDAG-NEXT: v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0 +; SDAG-NEXT: v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3] +; SDAG-NEXT: v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0 +; SDAG-NEXT: v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3] +; SDAG-NEXT: v_mul_f64 v[4:5], v[2:3], -1.0 +; SDAG-NEXT: v_fma_f64 v[0:1], -v[0:1], v[4:5], -1.0 +; SDAG-NEXT: v_fma_f64 v[0:1], v[0:1], v[2:3], v[4:5] +; SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-LABEL: v_neg_rsq_f64__afn: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-NEXT: v_sqrt_f64_e32 v[2:3], v[0:1] +; GISEL-NEXT: v_rsq_f64_e32 v[0:1], v[0:1] +; GISEL-NEXT: v_fma_f64 v[4:5], -v[2:3], v[0:1], 1.0 +; GISEL-NEXT: v_fma_f64 v[0:1], v[4:5], v[0:1], v[0:1] +; GISEL-NEXT: v_fma_f64 v[4:5], -v[2:3], v[0:1], 1.0 +; GISEL-NEXT: v_fma_f64 v[0:1], v[4:5], v[0:1], v[0:1] +; GISEL-NEXT: v_mul_f64 v[4:5], -1.0, v[0:1] +; GISEL-NEXT: v_fma_f64 v[2:3], -v[2:3], v[4:5], -1.0 +; GISEL-NEXT: v_fma_f64 v[0:1], v[2:3], v[0:1], v[4:5] +; GISEL-NEXT: s_setpc_b64 s[30:31] + %sqrt = call contract afn double @llvm.sqrt.f64(double %x) + %rsq = fdiv contract afn double -1.0, %sqrt + ret double %rsq +} + +define double @v_rsq_f64__afn_ninf(double %x) { +; SDAG-LABEL: v_rsq_f64__afn_ninf: +; SDAG: ; %bb.0: +; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-NEXT: v_sqrt_f64_e32 v[0:1], v[0:1] +; SDAG-NEXT: v_rcp_f64_e32 v[2:3], v[0:1] +; SDAG-NEXT: v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0 +; SDAG-NEXT: v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3] +; SDAG-NEXT: v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0 +; SDAG-NEXT: v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3] +; SDAG-NEXT: v_fma_f64 v[0:1], -v[0:1], v[2:3], 1.0 +; SDAG-NEXT: v_fma_f64 v[0:1], v[0:1], v[2:3], v[2:3] +; SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-LABEL: v_rsq_f64__afn_ninf: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-NEXT: v_sqrt_f64_e32 v[2:3], v[0:1] +; GISEL-NEXT: v_rsq_f64_e32 v[0:1], v[0:1] +; GISEL-NEXT: v_fma_f64 v[4:5], -v[2:3], v[0:1], 1.0 +; GISEL-NEXT: v_fma_f64 v[0:1], v[4:5], v[0:1], v[0:1] +; GISEL-NEXT: v_fma_f64 v[4:5], -v[2:3], v[0:1], 1.0 +; GISEL-NEXT: v_fma_f64 v[0:1], v[4:5], v[0:1], v[0:1] +; GISEL-NEXT: v_mul_f64 v[4:5], 1.0, v[0:1] +; GISEL-NEXT: v_fma_f64 v[2:3], -v[2:3], v[4:5], 1.0 +; GISEL-NEXT: v_fma_f64 v[0:1], v[2:3], v[0:1], v[4:5] +; GISEL-NEXT: s_setpc_b64 s[30:31] + %sqrt = call contract afn ninf double @llvm.sqrt.f64(double %x) + %rsq = fdiv contract afn ninf double 1.0, %sqrt + ret double %rsq +} + +define double @v_rsq_f64__afn_nnan(double %x) { +; SDAG-LABEL: v_rsq_f64__afn_nnan: +; SDAG: ; %bb.0: +; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-NEXT: v_sqrt_f64_e32 v[0:1], v[0:1] +; SDAG-NEXT: v_rcp_f64_e32 v[2:3], v[0:1] +; SDAG-NEXT: v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0 +; SDAG-NEXT: v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3] +; SDAG-NEXT: v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0 +; SDAG-NEXT: v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3] +; SDAG-NEXT: v_fma_f64 v[0:1], -v[0:1], v[2:3], 1.0 +; SDAG-NEXT: v_fma_f64 v[0:1], v[0:1], v[2:3], v[2:3] +; SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-LABEL: v_rsq_f64__afn_nnan: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-NEXT: v_sqrt_f64_e32 v[2:3], v[0:1] +; GISEL-NEXT: v_rsq_f64_e32 v[0:1], v[0:1] +; GISEL-NEXT: v_fma_f64 v[4:5], -v[2:3], v[0:1], 1.0 +; GISEL-NEXT: v_fma_f64 v[0:1], v[4:5], v[0:1], v[0:1] +; GISEL-NEXT: v_fma_f64 v[4:5], -v[2:3], v[0:1], 1.0 +; GISEL-NEXT: v_fma_f64 v[0:1], v[4:5], v[0:1], v[0:1] +; GISEL-NEXT: v_mul_f64 v[4:5], 1.0, v[0:1] +; GISEL-NEXT: v_fma_f64 v[2:3], -v[2:3], v[4:5], 1.0 +; GISEL-NEXT: v_fma_f64 v[0:1], v[2:3], v[0:1], v[4:5] +; GISEL-NEXT: s_setpc_b64 s[30:31] + %sqrt = call contract afn nnan double @llvm.sqrt.f64(double %x) + %rsq = fdiv contract afn nnan double 1.0, %sqrt + ret double %rsq +} + +define double @v_rsq_f64__afn_nnan_ninf(double %x) { +; SDAG-LABEL: v_rsq_f64__afn_nnan_ninf: +; SDAG: ; %bb.0: +; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-NEXT: v_sqrt_f64_e32 v[0:1], v[0:1] +; SDAG-NEXT: v_rcp_f64_e32 v[2:3], v[0:1] +; SDAG-NEXT: v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0 +; SDAG-NEXT: v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3] +; SDAG-NEXT: v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0 +; SDAG-NEXT: v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3] +; SDAG-NEXT: v_fma_f64 v[0:1], -v[0:1], v[2:3], 1.0 +; SDAG-NEXT: v_fma_f64 v[0:1], v[0:1], v[2:3], v[2:3] +; SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-LABEL: v_rsq_f64__afn_nnan_ninf: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-NEXT: v_sqrt_f64_e32 v[2:3], v[0:1] +; GISEL-NEXT: v_rsq_f64_e32 v[0:1], v[0:1] +; GISEL-NEXT: v_fma_f64 v[4:5], -v[2:3], v[0:1], 1.0 +; GISEL-NEXT: v_fma_f64 v[0:1], v[4:5], v[0:1], v[0:1] +; GISEL-NEXT: v_fma_f64 v[4:5], -v[2:3], v[0:1], 1.0 +; GISEL-NEXT: v_fma_f64 v[0:1], v[4:5], v[0:1], v[0:1] +; GISEL-NEXT: v_mul_f64 v[4:5], 1.0, v[0:1] +; GISEL-NEXT: v_fma_f64 v[2:3], -v[2:3], v[4:5], 1.0 +; GISEL-NEXT: v_fma_f64 v[0:1], v[2:3], v[0:1], v[4:5] +; GISEL-NEXT: s_setpc_b64 s[30:31] + %sqrt = call contract afn nnan ninf double @llvm.sqrt.f64(double %x) + %rsq = fdiv contract afn nnan ninf double 1.0, %sqrt + ret double %rsq +} + +define double @v_neg_rsq_f64__afn_nnan_ninf(double %x) { +; SDAG-LABEL: v_neg_rsq_f64__afn_nnan_ninf: +; SDAG: ; %bb.0: +; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-NEXT: v_sqrt_f64_e32 v[0:1], v[0:1] +; SDAG-NEXT: v_rcp_f64_e32 v[2:3], v[0:1] +; SDAG-NEXT: v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0 +; SDAG-NEXT: v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3] +; SDAG-NEXT: v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0 +; SDAG-NEXT: v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3] +; SDAG-NEXT: v_mul_f64 v[4:5], v[2:3], -1.0 +; SDAG-NEXT: v_fma_f64 v[0:1], -v[0:1], v[4:5], -1.0 +; SDAG-NEXT: v_fma_f64 v[0:1], v[0:1], v[2:3], v[4:5] +; SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-LABEL: v_neg_rsq_f64__afn_nnan_ninf: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-NEXT: v_sqrt_f64_e32 v[2:3], v[0:1] +; GISEL-NEXT: v_rsq_f64_e32 v[0:1], v[0:1] +; GISEL-NEXT: v_fma_f64 v[4:5], -v[2:3], v[0:1], 1.0 +; GISEL-NEXT: v_fma_f64 v[0:1], v[4:5], v[0:1], v[0:1] +; GISEL-NEXT: v_fma_f64 v[4:5], -v[2:3], v[0:1], 1.0 +; GISEL-NEXT: v_fma_f64 v[0:1], v[4:5], v[0:1], v[0:1] +; GISEL-NEXT: v_mul_f64 v[4:5], -1.0, v[0:1] +; GISEL-NEXT: v_fma_f64 v[2:3], -v[2:3], v[4:5], -1.0 +; GISEL-NEXT: v_fma_f64 v[0:1], v[2:3], v[0:1], v[4:5] +; GISEL-NEXT: s_setpc_b64 s[30:31] + %sqrt = call contract afn nnan ninf double @llvm.sqrt.f64(double %x) + %rsq = fdiv contract afn nnan ninf double -1.0, %sqrt + ret double %rsq +} + +define double @v_rsq_f64__nnan_ninf(double %x) { +; SI-SDAG-LABEL: v_rsq_f64__nnan_ninf: +; SI-SDAG: ; %bb.0: +; SI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-SDAG-NEXT: v_sqrt_f64_e32 v[0:1], v[0:1] +; SI-SDAG-NEXT: s_mov_b32 s6, 0x3ff00000 +; SI-SDAG-NEXT: v_div_scale_f64 v[2:3], s[4:5], v[0:1], v[0:1], 1.0 +; SI-SDAG-NEXT: v_rcp_f64_e32 v[4:5], v[2:3] +; SI-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, v1, v3 +; SI-SDAG-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0 +; SI-SDAG-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5] +; SI-SDAG-NEXT: v_div_scale_f64 v[6:7], s[4:5], 1.0, v[0:1], 1.0 +; SI-SDAG-NEXT: v_fma_f64 v[8:9], -v[2:3], v[4:5], 1.0 +; SI-SDAG-NEXT: v_fma_f64 v[4:5], v[4:5], v[8:9], v[4:5] +; SI-SDAG-NEXT: v_cmp_eq_u32_e64 s[4:5], s6, v7 +; SI-SDAG-NEXT: v_mul_f64 v[8:9], v[6:7], v[4:5] +; SI-SDAG-NEXT: s_xor_b64 vcc, s[4:5], vcc +; SI-SDAG-NEXT: v_fma_f64 v[2:3], -v[2:3], v[8:9], v[6:7] +; SI-SDAG-NEXT: v_div_fmas_f64 v[2:3], v[2:3], v[4:5], v[8:9] +; SI-SDAG-NEXT: v_div_fixup_f64 v[0:1], v[2:3], v[0:1], 1.0 +; SI-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; SI-GISEL-LABEL: v_rsq_f64__nnan_ninf: +; SI-GISEL: ; %bb.0: +; SI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-GISEL-NEXT: v_sqrt_f64_e32 v[0:1], v[0:1] +; SI-GISEL-NEXT: v_mov_b32_e32 v10, 0x3ff00000 +; SI-GISEL-NEXT: v_div_scale_f64 v[2:3], s[4:5], v[0:1], v[0:1], 1.0 +; SI-GISEL-NEXT: v_div_scale_f64 v[8:9], s[4:5], 1.0, v[0:1], 1.0 +; SI-GISEL-NEXT: v_rcp_f64_e32 v[4:5], v[2:3] +; SI-GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v1, v3 +; SI-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v10, v9 +; SI-GISEL-NEXT: s_xor_b64 vcc, vcc, s[4:5] +; SI-GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0 +; SI-GISEL-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5] +; SI-GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0 +; SI-GISEL-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5] +; SI-GISEL-NEXT: v_mul_f64 v[6:7], v[8:9], v[4:5] +; SI-GISEL-NEXT: v_fma_f64 v[2:3], -v[2:3], v[6:7], v[8:9] +; SI-GISEL-NEXT: v_div_fmas_f64 v[2:3], v[2:3], v[4:5], v[6:7] +; SI-GISEL-NEXT: v_div_fixup_f64 v[0:1], v[2:3], v[0:1], 1.0 +; SI-GISEL-NEXT: s_setpc_b64 s[30:31] +; +; VI-SDAG-LABEL: v_rsq_f64__nnan_ninf: +; VI-SDAG: ; %bb.0: +; VI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-SDAG-NEXT: v_sqrt_f64_e32 v[0:1], v[0:1] +; VI-SDAG-NEXT: v_div_scale_f64 v[2:3], s[4:5], v[0:1], v[0:1], 1.0 +; VI-SDAG-NEXT: v_rcp_f64_e32 v[4:5], v[2:3] +; VI-SDAG-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0 +; VI-SDAG-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5] +; VI-SDAG-NEXT: v_div_scale_f64 v[6:7], vcc, 1.0, v[0:1], 1.0 +; VI-SDAG-NEXT: v_fma_f64 v[8:9], -v[2:3], v[4:5], 1.0 +; VI-SDAG-NEXT: v_fma_f64 v[4:5], v[4:5], v[8:9], v[4:5] +; VI-SDAG-NEXT: v_mul_f64 v[8:9], v[6:7], v[4:5] +; VI-SDAG-NEXT: v_fma_f64 v[2:3], -v[2:3], v[8:9], v[6:7] +; VI-SDAG-NEXT: v_div_fmas_f64 v[2:3], v[2:3], v[4:5], v[8:9] +; VI-SDAG-NEXT: v_div_fixup_f64 v[0:1], v[2:3], v[0:1], 1.0 +; VI-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; VI-GISEL-LABEL: v_rsq_f64__nnan_ninf: +; VI-GISEL: ; %bb.0: +; VI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-GISEL-NEXT: v_sqrt_f64_e32 v[0:1], v[0:1] +; VI-GISEL-NEXT: v_div_scale_f64 v[2:3], s[4:5], v[0:1], v[0:1], 1.0 +; VI-GISEL-NEXT: v_rcp_f64_e32 v[4:5], v[2:3] +; VI-GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0 +; VI-GISEL-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5] +; VI-GISEL-NEXT: v_div_scale_f64 v[6:7], vcc, 1.0, v[0:1], 1.0 +; VI-GISEL-NEXT: v_fma_f64 v[8:9], -v[2:3], v[4:5], 1.0 +; VI-GISEL-NEXT: v_fma_f64 v[4:5], v[4:5], v[8:9], v[4:5] +; VI-GISEL-NEXT: v_mul_f64 v[8:9], v[6:7], v[4:5] +; VI-GISEL-NEXT: v_fma_f64 v[2:3], -v[2:3], v[8:9], v[6:7] +; VI-GISEL-NEXT: v_div_fmas_f64 v[2:3], v[2:3], v[4:5], v[8:9] +; VI-GISEL-NEXT: v_div_fixup_f64 v[0:1], v[2:3], v[0:1], 1.0 +; VI-GISEL-NEXT: s_setpc_b64 s[30:31] + %sqrt = call contract nnan ninf double @llvm.sqrt.f64(double %x) + %rsq = fdiv contract nnan ninf double 1.0, %sqrt + ret double %rsq +} + +define <2 x double> @v_rsq_v2f64__afn_nnan_ninf(<2 x double> %x) { +; SDAG-LABEL: v_rsq_v2f64__afn_nnan_ninf: +; SDAG: ; %bb.0: +; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-NEXT: v_sqrt_f64_e32 v[0:1], v[0:1] +; SDAG-NEXT: v_sqrt_f64_e32 v[2:3], v[2:3] +; SDAG-NEXT: v_rcp_f64_e32 v[4:5], v[0:1] +; SDAG-NEXT: v_rcp_f64_e32 v[6:7], v[2:3] +; SDAG-NEXT: v_fma_f64 v[8:9], -v[0:1], v[4:5], 1.0 +; SDAG-NEXT: v_fma_f64 v[10:11], -v[2:3], v[6:7], 1.0 +; SDAG-NEXT: v_fma_f64 v[4:5], v[8:9], v[4:5], v[4:5] +; SDAG-NEXT: v_fma_f64 v[6:7], v[10:11], v[6:7], v[6:7] +; SDAG-NEXT: v_fma_f64 v[8:9], -v[0:1], v[4:5], 1.0 +; SDAG-NEXT: v_fma_f64 v[10:11], -v[2:3], v[6:7], 1.0 +; SDAG-NEXT: v_fma_f64 v[4:5], v[8:9], v[4:5], v[4:5] +; SDAG-NEXT: v_fma_f64 v[6:7], v[10:11], v[6:7], v[6:7] +; SDAG-NEXT: v_fma_f64 v[0:1], -v[0:1], v[4:5], 1.0 +; SDAG-NEXT: v_fma_f64 v[2:3], -v[2:3], v[6:7], 1.0 +; SDAG-NEXT: v_fma_f64 v[0:1], v[0:1], v[4:5], v[4:5] +; SDAG-NEXT: v_fma_f64 v[2:3], v[2:3], v[6:7], v[6:7] +; SDAG-NEXT: s_setpc_b64 s[30:31] +; +; SI-GISEL-LABEL: v_rsq_v2f64__afn_nnan_ninf: +; SI-GISEL: ; %bb.0: +; SI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-GISEL-NEXT: v_sqrt_f64_e32 v[4:5], v[0:1] +; SI-GISEL-NEXT: v_sqrt_f64_e32 v[6:7], v[2:3] +; SI-GISEL-NEXT: v_rsq_f64_e32 v[0:1], v[0:1] +; SI-GISEL-NEXT: v_rsq_f64_e32 v[2:3], v[2:3] +; SI-GISEL-NEXT: v_fma_f64 v[8:9], -v[4:5], v[0:1], 1.0 +; SI-GISEL-NEXT: v_fma_f64 v[10:11], -v[6:7], v[2:3], 1.0 +; SI-GISEL-NEXT: v_fma_f64 v[0:1], v[8:9], v[0:1], v[0:1] +; SI-GISEL-NEXT: v_fma_f64 v[2:3], v[10:11], v[2:3], v[2:3] +; SI-GISEL-NEXT: v_fma_f64 v[8:9], -v[4:5], v[0:1], 1.0 +; SI-GISEL-NEXT: v_fma_f64 v[10:11], -v[6:7], v[2:3], 1.0 +; SI-GISEL-NEXT: v_fma_f64 v[0:1], v[8:9], v[0:1], v[0:1] +; SI-GISEL-NEXT: v_fma_f64 v[2:3], v[10:11], v[2:3], v[2:3] +; SI-GISEL-NEXT: v_mul_f64 v[8:9], 1.0, v[0:1] +; SI-GISEL-NEXT: v_mul_f64 v[10:11], 1.0, v[2:3] +; SI-GISEL-NEXT: v_fma_f64 v[4:5], -v[4:5], v[8:9], 1.0 +; SI-GISEL-NEXT: v_fma_f64 v[6:7], -v[6:7], v[10:11], 1.0 +; SI-GISEL-NEXT: v_fma_f64 v[0:1], v[4:5], v[0:1], v[8:9] +; SI-GISEL-NEXT: v_fma_f64 v[2:3], v[6:7], v[2:3], v[10:11] +; SI-GISEL-NEXT: s_setpc_b64 s[30:31] +; +; VI-GISEL-LABEL: v_rsq_v2f64__afn_nnan_ninf: +; VI-GISEL: ; %bb.0: +; VI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-GISEL-NEXT: v_sqrt_f64_e32 v[4:5], v[0:1] +; VI-GISEL-NEXT: v_rsq_f64_e32 v[0:1], v[0:1] +; VI-GISEL-NEXT: v_sqrt_f64_e32 v[6:7], v[2:3] +; VI-GISEL-NEXT: v_rsq_f64_e32 v[2:3], v[2:3] +; VI-GISEL-NEXT: v_fma_f64 v[8:9], -v[4:5], v[0:1], 1.0 +; VI-GISEL-NEXT: v_fma_f64 v[10:11], -v[6:7], v[2:3], 1.0 +; VI-GISEL-NEXT: v_fma_f64 v[0:1], v[8:9], v[0:1], v[0:1] +; VI-GISEL-NEXT: v_fma_f64 v[2:3], v[10:11], v[2:3], v[2:3] +; VI-GISEL-NEXT: v_fma_f64 v[8:9], -v[4:5], v[0:1], 1.0 +; VI-GISEL-NEXT: v_fma_f64 v[10:11], -v[6:7], v[2:3], 1.0 +; VI-GISEL-NEXT: v_fma_f64 v[0:1], v[8:9], v[0:1], v[0:1] +; VI-GISEL-NEXT: v_fma_f64 v[2:3], v[10:11], v[2:3], v[2:3] +; VI-GISEL-NEXT: v_mul_f64 v[8:9], 1.0, v[0:1] +; VI-GISEL-NEXT: v_mul_f64 v[10:11], 1.0, v[2:3] +; VI-GISEL-NEXT: v_fma_f64 v[4:5], -v[4:5], v[8:9], 1.0 +; VI-GISEL-NEXT: v_fma_f64 v[6:7], -v[6:7], v[10:11], 1.0 +; VI-GISEL-NEXT: v_fma_f64 v[0:1], v[4:5], v[0:1], v[8:9] +; VI-GISEL-NEXT: v_fma_f64 v[2:3], v[6:7], v[2:3], v[10:11] +; VI-GISEL-NEXT: s_setpc_b64 s[30:31] + %sqrt = call contract afn nnan ninf <2 x double> @llvm.sqrt.v2f64(<2 x double> %x) + %rsq = fdiv contract afn nnan ninf <2 x double> <double 1.0, double 1.0>, %sqrt + ret <2 x double> %rsq +} + +define amdgpu_ps <2 x i32> @s_rsq_f64_unsafe(double inreg %x) #0 { +; SDAG-LABEL: s_rsq_f64_unsafe: +; SDAG: ; %bb.0: +; SDAG-NEXT: v_sqrt_f64_e32 v[0:1], s[0:1] +; SDAG-NEXT: v_rcp_f64_e32 v[2:3], v[0:1] +; SDAG-NEXT: v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0 +; SDAG-NEXT: v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3] +; SDAG-NEXT: v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0 +; SDAG-NEXT: v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3] +; SDAG-NEXT: v_fma_f64 v[0:1], -v[0:1], v[2:3], 1.0 +; SDAG-NEXT: v_fma_f64 v[0:1], v[0:1], v[2:3], v[2:3] +; SDAG-NEXT: v_readfirstlane_b32 s0, v0 +; SDAG-NEXT: v_readfirstlane_b32 s1, v1 +; SDAG-NEXT: ; return to shader part epilog +; +; GISEL-LABEL: s_rsq_f64_unsafe: +; GISEL: ; %bb.0: +; GISEL-NEXT: v_sqrt_f64_e32 v[0:1], s[0:1] +; GISEL-NEXT: v_rsq_f64_e32 v[2:3], s[0:1] +; GISEL-NEXT: v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0 +; GISEL-NEXT: v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3] +; GISEL-NEXT: v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0 +; GISEL-NEXT: v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3] +; GISEL-NEXT: v_mul_f64 v[4:5], 1.0, v[2:3] +; GISEL-NEXT: v_fma_f64 v[0:1], -v[0:1], v[4:5], 1.0 +; GISEL-NEXT: v_fma_f64 v[0:1], v[0:1], v[2:3], v[4:5] +; GISEL-NEXT: v_readfirstlane_b32 s0, v0 +; GISEL-NEXT: v_readfirstlane_b32 s1, v1 +; GISEL-NEXT: ; return to shader part epilog + %rsq = call contract double @llvm.sqrt.f64(double %x) + %result = fdiv contract double 1.0, %rsq + %cast = bitcast double %result to <2 x i32> + %cast.0 = extractelement <2 x i32> %cast, i32 0 + %cast.1 = extractelement <2 x i32> %cast, i32 1 + %lane.0 = call i32 @llvm.amdgcn.readfirstlane(i32 %cast.0) + %lane.1 = call i32 @llvm.amdgcn.readfirstlane(i32 %cast.1) + %insert.0 = insertelement <2 x i32> poison, i32 %lane.0, i32 0 + %insert.1 = insertelement <2 x i32> %insert.0, i32 %lane.1, i32 1 + ret <2 x i32> %insert.1 +} + +define double @v_rsq_f64_unsafe(double %x) #0 { +; SDAG-LABEL: v_rsq_f64_unsafe: +; SDAG: ; %bb.0: +; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-NEXT: v_sqrt_f64_e32 v[0:1], v[0:1] +; SDAG-NEXT: v_rcp_f64_e32 v[2:3], v[0:1] +; SDAG-NEXT: v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0 +; SDAG-NEXT: v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3] +; SDAG-NEXT: v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0 +; SDAG-NEXT: v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3] +; SDAG-NEXT: v_fma_f64 v[0:1], -v[0:1], v[2:3], 1.0 +; SDAG-NEXT: v_fma_f64 v[0:1], v[0:1], v[2:3], v[2:3] +; SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-LABEL: v_rsq_f64_unsafe: +; GISEL: ; %bb.0: +; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GISEL-NEXT: v_sqrt_f64_e32 v[2:3], v[0:1] +; GISEL-NEXT: v_rsq_f64_e32 v[0:1], v[0:1] +; GISEL-NEXT: v_fma_f64 v[4:5], -v[2:3], v[0:1], 1.0 +; GISEL-NEXT: v_fma_f64 v[0:1], v[4:5], v[0:1], v[0:1] +; GISEL-NEXT: v_fma_f64 v[4:5], -v[2:3], v[0:1], 1.0 +; GISEL-NEXT: v_fma_f64 v[0:1], v[4:5], v[0:1], v[0:1] +; GISEL-NEXT: v_mul_f64 v[4:5], 1.0, v[0:1] +; GISEL-NEXT: v_fma_f64 v[2:3], -v[2:3], v[4:5], 1.0 +; GISEL-NEXT: v_fma_f64 v[0:1], v[2:3], v[0:1], v[4:5] +; GISEL-NEXT: s_setpc_b64 s[30:31] + %sqrt = call double @llvm.sqrt.f64(double %x) + %rsq = fdiv double 1.0, %sqrt + ret double %rsq +} + +define double @v_rsq_amdgcn_sqrt_f64(double %x) { +; SI-SDAG-LABEL: v_rsq_amdgcn_sqrt_f64: +; SI-SDAG: ; %bb.0: +; SI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-SDAG-NEXT: v_sqrt_f64_e32 v[0:1], v[0:1] +; SI-SDAG-NEXT: s_mov_b32 s6, 0x3ff00000 +; SI-SDAG-NEXT: v_div_scale_f64 v[2:3], s[4:5], v[0:1], v[0:1], 1.0 +; SI-SDAG-NEXT: v_rcp_f64_e32 v[4:5], v[2:3] +; SI-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, v1, v3 +; SI-SDAG-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0 +; SI-SDAG-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5] +; SI-SDAG-NEXT: v_div_scale_f64 v[6:7], s[4:5], 1.0, v[0:1], 1.0 +; SI-SDAG-NEXT: v_fma_f64 v[8:9], -v[2:3], v[4:5], 1.0 +; SI-SDAG-NEXT: v_fma_f64 v[4:5], v[4:5], v[8:9], v[4:5] +; SI-SDAG-NEXT: v_cmp_eq_u32_e64 s[4:5], s6, v7 +; SI-SDAG-NEXT: v_mul_f64 v[8:9], v[6:7], v[4:5] +; SI-SDAG-NEXT: s_xor_b64 vcc, s[4:5], vcc +; SI-SDAG-NEXT: v_fma_f64 v[2:3], -v[2:3], v[8:9], v[6:7] +; SI-SDAG-NEXT: v_div_fmas_f64 v[2:3], v[2:3], v[4:5], v[8:9] +; SI-SDAG-NEXT: v_div_fixup_f64 v[0:1], v[2:3], v[0:1], 1.0 +; SI-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; SI-GISEL-LABEL: v_rsq_amdgcn_sqrt_f64: +; SI-GISEL: ; %bb.0: +; SI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-GISEL-NEXT: v_sqrt_f64_e32 v[0:1], v[0:1] +; SI-GISEL-NEXT: v_mov_b32_e32 v10, 0x3ff00000 +; SI-GISEL-NEXT: v_div_scale_f64 v[2:3], s[4:5], v[0:1], v[0:1], 1.0 +; SI-GISEL-NEXT: v_div_scale_f64 v[8:9], s[4:5], 1.0, v[0:1], 1.0 +; SI-GISEL-NEXT: v_rcp_f64_e32 v[4:5], v[2:3] +; SI-GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v1, v3 +; SI-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v10, v9 +; SI-GISEL-NEXT: s_xor_b64 vcc, vcc, s[4:5] +; SI-GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0 +; SI-GISEL-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5] +; SI-GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0 +; SI-GISEL-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5] +; SI-GISEL-NEXT: v_mul_f64 v[6:7], v[8:9], v[4:5] +; SI-GISEL-NEXT: v_fma_f64 v[2:3], -v[2:3], v[6:7], v[8:9] +; SI-GISEL-NEXT: v_div_fmas_f64 v[2:3], v[2:3], v[4:5], v[6:7] +; SI-GISEL-NEXT: v_div_fixup_f64 v[0:1], v[2:3], v[0:1], 1.0 +; SI-GISEL-NEXT: s_setpc_b64 s[30:31] +; +; VI-SDAG-LABEL: v_rsq_amdgcn_sqrt_f64: +; VI-SDAG: ; %bb.0: +; VI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-SDAG-NEXT: v_sqrt_f64_e32 v[0:1], v[0:1] +; VI-SDAG-NEXT: v_div_scale_f64 v[2:3], s[4:5], v[0:1], v[0:1], 1.0 +; VI-SDAG-NEXT: v_rcp_f64_e32 v[4:5], v[2:3] +; VI-SDAG-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0 +; VI-SDAG-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5] +; VI-SDAG-NEXT: v_div_scale_f64 v[6:7], vcc, 1.0, v[0:1], 1.0 +; VI-SDAG-NEXT: v_fma_f64 v[8:9], -v[2:3], v[4:5], 1.0 +; VI-SDAG-NEXT: v_fma_f64 v[4:5], v[4:5], v[8:9], v[4:5] +; VI-SDAG-NEXT: v_mul_f64 v[8:9], v[6:7], v[4:5] +; VI-SDAG-NEXT: v_fma_f64 v[2:3], -v[2:3], v[8:9], v[6:7] +; VI-SDAG-NEXT: v_div_fmas_f64 v[2:3], v[2:3], v[4:5], v[8:9] +; VI-SDAG-NEXT: v_div_fixup_f64 v[0:1], v[2:3], v[0:1], 1.0 +; VI-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; VI-GISEL-LABEL: v_rsq_amdgcn_sqrt_f64: +; VI-GISEL: ; %bb.0: +; VI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-GISEL-NEXT: v_sqrt_f64_e32 v[0:1], v[0:1] +; VI-GISEL-NEXT: v_div_scale_f64 v[2:3], s[4:5], v[0:1], v[0:1], 1.0 +; VI-GISEL-NEXT: v_rcp_f64_e32 v[4:5], v[2:3] +; VI-GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0 +; VI-GISEL-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5] +; VI-GISEL-NEXT: v_div_scale_f64 v[6:7], vcc, 1.0, v[0:1], 1.0 +; VI-GISEL-NEXT: v_fma_f64 v[8:9], -v[2:3], v[4:5], 1.0 +; VI-GISEL-NEXT: v_fma_f64 v[4:5], v[4:5], v[8:9], v[4:5] +; VI-GISEL-NEXT: v_mul_f64 v[8:9], v[6:7], v[4:5] +; VI-GISEL-NEXT: v_fma_f64 v[2:3], -v[2:3], v[8:9], v[6:7] +; VI-GISEL-NEXT: v_div_fmas_f64 v[2:3], v[2:3], v[4:5], v[8:9] +; VI-GISEL-NEXT: v_div_fixup_f64 v[0:1], v[2:3], v[0:1], 1.0 +; VI-GISEL-NEXT: s_setpc_b64 s[30:31] + %sqrt = call contract double @llvm.amdgcn.sqrt.f64(double %x) + %rsq = fdiv contract double 1.0, %sqrt + ret double %rsq } + +define double @v_neg_rsq_amdgcn_sqrt_f64(double %x) { +; SI-SDAG-LABEL: v_neg_rsq_amdgcn_sqrt_f64: +; SI-SDAG: ; %bb.0: +; SI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-SDAG-NEXT: v_sqrt_f64_e32 v[0:1], v[0:1] +; SI-SDAG-NEXT: s_mov_b32 s6, 0xbff00000 +; SI-SDAG-NEXT: v_div_scale_f64 v[2:3], s[4:5], v[0:1], v[0:1], -1.0 +; SI-SDAG-NEXT: v_rcp_f64_e32 v[4:5], v[2:3] +; SI-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, v1, v3 +; SI-SDAG-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0 +; SI-SDAG-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5] +; SI-SDAG-NEXT: v_div_scale_f64 v[6:7], s[4:5], -1.0, v[0:1], -1.0 +; SI-SDAG-NEXT: v_fma_f64 v[8:9], -v[2:3], v[4:5], 1.0 +; SI-SDAG-NEXT: v_fma_f64 v[4:5], v[4:5], v[8:9], v[4:5] +; SI-SDAG-NEXT: v_cmp_eq_u32_e64 s[4:5], s6, v7 +; SI-SDAG-NEXT: v_mul_f64 v[8:9], v[6:7], v[4:5] +; SI-SDAG-NEXT: s_xor_b64 vcc, s[4:5], vcc +; SI-SDAG-NEXT: v_fma_f64 v[2:3], -v[2:3], v[8:9], v[6:7] +; SI-SDAG-NEXT: v_div_fmas_f64 v[2:3], v[2:3], v[4:5], v[8:9] +; SI-SDAG-NEXT: v_div_fixup_f64 v[0:1], v[2:3], v[0:1], -1.0 +; SI-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; SI-GISEL-LABEL: v_neg_rsq_amdgcn_sqrt_f64: +; SI-GISEL: ; %bb.0: +; SI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-GISEL-NEXT: v_sqrt_f64_e32 v[0:1], v[0:1] +; SI-GISEL-NEXT: v_mov_b32_e32 v10, 0xbff00000 +; SI-GISEL-NEXT: v_div_scale_f64 v[2:3], s[4:5], v[0:1], v[0:1], -1.0 +; SI-GISEL-NEXT: v_div_scale_f64 v[8:9], s[4:5], -1.0, v[0:1], -1.0 +; SI-GISEL-NEXT: v_rcp_f64_e32 v[4:5], v[2:3] +; SI-GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v1, v3 +; SI-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v10, v9 +; SI-GISEL-NEXT: s_xor_b64 vcc, vcc, s[4:5] +; SI-GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0 +; SI-GISEL-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5] +; SI-GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0 +; SI-GISEL-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5] +; SI-GISEL-NEXT: v_mul_f64 v[6:7], v[8:9], v[4:5] +; SI-GISEL-NEXT: v_fma_f64 v[2:3], -v[2:3], v[6:7], v[8:9] +; SI-GISEL-NEXT: v_div_fmas_f64 v[2:3], v[2:3], v[4:5], v[6:7] +; SI-GISEL-NEXT: v_div_fixup_f64 v[0:1], v[2:3], v[0:1], -1.0 +; SI-GISEL-NEXT: s_setpc_b64 s[30:31] +; +; VI-SDAG-LABEL: v_neg_rsq_amdgcn_sqrt_f64: +; VI-SDAG: ; %bb.0: +; VI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-SDAG-NEXT: v_sqrt_f64_e32 v[0:1], v[0:1] +; VI-SDAG-NEXT: v_div_scale_f64 v[2:3], s[4:5], v[0:1], v[0:1], -1.0 +; VI-SDAG-NEXT: v_rcp_f64_e32 v[4:5], v[2:3] +; VI-SDAG-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0 +; VI-SDAG-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5] +; VI-SDAG-NEXT: v_div_scale_f64 v[6:7], vcc, -1.0, v[0:1], -1.0 +; VI-SDAG-NEXT: v_fma_f64 v[8:9], -v[2:3], v[4:5], 1.0 +; VI-SDAG-NEXT: v_fma_f64 v[4:5], v[4:5], v[8:9], v[4:5] +; VI-SDAG-NEXT: v_mul_f64 v[8:9], v[6:7], v[4:5] +; VI-SDAG-NEXT: v_fma_f64 v[2:3], -v[2:3], v[8:9], v[6:7] +; VI-SDAG-NEXT: v_div_fmas_f64 v[2:3], v[2:3], v[4:5], v[8:9] +; VI-SDAG-NEXT: v_div_fixup_f64 v[0:1], v[2:3], v[0:1], -1.0 +; VI-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; VI-GISEL-LABEL: v_neg_rsq_amdgcn_sqrt_f64: +; VI-GISEL: ; %bb.0: +; VI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-GISEL-NEXT: v_sqrt_f64_e32 v[0:1], v[0:1] +; VI-GISEL-NEXT: v_div_scale_f64 v[2:3], s[4:5], v[0:1], v[0:1], -1.0 +; VI-GISEL-NEXT: v_rcp_f64_e32 v[4:5], v[2:3] +; VI-GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0 +; VI-GISEL-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5] +; VI-GISEL-NEXT: v_div_scale_f64 v[6:7], vcc, -1.0, v[0:1], -1.0 +; VI-GISEL-NEXT: v_fma_f64 v[8:9], -v[2:3], v[4:5], 1.0 +; VI-GISEL-NEXT: v_fma_f64 v[4:5], v[4:5], v[8:9], v[4:5] +; VI-GISEL-NEXT: v_mul_f64 v[8:9], v[6:7], v[4:5] +; VI-GISEL-NEXT: v_fma_f64 v[2:3], -v[2:3], v[8:9], v[6:7] +; VI-GISEL-NEXT: v_div_fmas_f64 v[2:3], v[2:3], v[4:5], v[8:9] +; VI-GISEL-NEXT: v_div_fixup_f64 v[0:1], v[2:3], v[0:1], -1.0 +; VI-GISEL-NEXT: s_setpc_b64 s[30:31] + %sqrt = call contract double @llvm.amdgcn.sqrt.f64(double %x) + %rsq = fdiv contract double -1.0, %sqrt + ret double %rsq +} + +define amdgpu_ps <2 x i32> @s_rsq_amdgcn_sqrt_f64(double inreg %x) { +; SI-SDAG-LABEL: s_rsq_amdgcn_sqrt_f64: +; SI-SDAG: ; %bb.0: +; SI-SDAG-NEXT: v_sqrt_f64_e32 v[0:1], s[0:1] +; SI-SDAG-NEXT: s_mov_b32 s2, 0x3ff00000 +; SI-SDAG-NEXT: v_div_scale_f64 v[2:3], s[0:1], v[0:1], v[0:1], 1.0 +; SI-SDAG-NEXT: v_rcp_f64_e32 v[4:5], v[2:3] +; SI-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, v1, v3 +; SI-SDAG-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0 +; SI-SDAG-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5] +; SI-SDAG-NEXT: v_div_scale_f64 v[6:7], s[0:1], 1.0, v[0:1], 1.0 +; SI-SDAG-NEXT: v_fma_f64 v[8:9], -v[2:3], v[4:5], 1.0 +; SI-SDAG-NEXT: v_fma_f64 v[4:5], v[4:5], v[8:9], v[4:5] +; SI-SDAG-NEXT: v_cmp_eq_u32_e64 s[0:1], s2, v7 +; SI-SDAG-NEXT: v_mul_f64 v[8:9], v[6:7], v[4:5] +; SI-SDAG-NEXT: s_xor_b64 vcc, s[0:1], vcc +; SI-SDAG-NEXT: v_fma_f64 v[2:3], -v[2:3], v[8:9], v[6:7] +; SI-SDAG-NEXT: v_div_fmas_f64 v[2:3], v[2:3], v[4:5], v[8:9] +; SI-SDAG-NEXT: v_div_fixup_f64 v[0:1], v[2:3], v[0:1], 1.0 +; SI-SDAG-NEXT: v_readfirstlane_b32 s0, v0 +; SI-SDAG-NEXT: v_readfirstlane_b32 s1, v1 +; SI-SDAG-NEXT: ; return to shader part epilog +; +; SI-GISEL-LABEL: s_rsq_amdgcn_sqrt_f64: +; SI-GISEL: ; %bb.0: +; SI-GISEL-NEXT: v_sqrt_f64_e32 v[0:1], s[0:1] +; SI-GISEL-NEXT: v_mov_b32_e32 v10, 0x3ff00000 +; SI-GISEL-NEXT: v_div_scale_f64 v[2:3], s[0:1], v[0:1], v[0:1], 1.0 +; SI-GISEL-NEXT: v_div_scale_f64 v[8:9], s[0:1], 1.0, v[0:1], 1.0 +; SI-GISEL-NEXT: v_rcp_f64_e32 v[4:5], v[2:3] +; SI-GISEL-NEXT: v_cmp_eq_u32_e64 s[0:1], v1, v3 +; SI-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v10, v9 +; SI-GISEL-NEXT: s_xor_b64 vcc, vcc, s[0:1] +; SI-GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0 +; SI-GISEL-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5] +; SI-GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0 +; SI-GISEL-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5] +; SI-GISEL-NEXT: v_mul_f64 v[6:7], v[8:9], v[4:5] +; SI-GISEL-NEXT: v_fma_f64 v[2:3], -v[2:3], v[6:7], v[8:9] +; SI-GISEL-NEXT: v_div_fmas_f64 v[2:3], v[2:3], v[4:5], v[6:7] +; SI-GISEL-NEXT: v_div_fixup_f64 v[0:1], v[2:3], v[0:1], 1.0 +; SI-GISEL-NEXT: v_readfirstlane_b32 s0, v0 +; SI-GISEL-NEXT: v_readfirstlane_b32 s1, v1 +; SI-GISEL-NEXT: ; return to shader part epilog +; +; VI-SDAG-LABEL: s_rsq_amdgcn_sqrt_f64: +; VI-SDAG: ; %bb.0: +; VI-SDAG-NEXT: v_sqrt_f64_e32 v[0:1], s[0:1] +; VI-SDAG-NEXT: v_div_scale_f64 v[2:3], s[0:1], v[0:1], v[0:1], 1.0 +; VI-SDAG-NEXT: v_rcp_f64_e32 v[4:5], v[2:3] +; VI-SDAG-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0 +; VI-SDAG-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5] +; VI-SDAG-NEXT: v_div_scale_f64 v[6:7], vcc, 1.0, v[0:1], 1.0 +; VI-SDAG-NEXT: v_fma_f64 v[8:9], -v[2:3], v[4:5], 1.0 +; VI-SDAG-NEXT: v_fma_f64 v[4:5], v[4:5], v[8:9], v[4:5] +; VI-SDAG-NEXT: v_mul_f64 v[8:9], v[6:7], v[4:5] +; VI-SDAG-NEXT: v_fma_f64 v[2:3], -v[2:3], v[8:9], v[6:7] +; VI-SDAG-NEXT: v_div_fmas_f64 v[2:3], v[2:3], v[4:5], v[8:9] +; VI-SDAG-NEXT: v_div_fixup_f64 v[0:1], v[2:3], v[0:1], 1.0 +; VI-SDAG-NEXT: v_readfirstlane_b32 s0, v0 +; VI-SDAG-NEXT: v_readfirstlane_b32 s1, v1 +; VI-SDAG-NEXT: ; return to shader part epilog +; +; VI-GISEL-LABEL: s_rsq_amdgcn_sqrt_f64: +; VI-GISEL: ; %bb.0: +; VI-GISEL-NEXT: v_sqrt_f64_e32 v[0:1], s[0:1] +; VI-GISEL-NEXT: v_div_scale_f64 v[2:3], s[0:1], v[0:1], v[0:1], 1.0 +; VI-GISEL-NEXT: v_rcp_f64_e32 v[4:5], v[2:3] +; VI-GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0 +; VI-GISEL-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5] +; VI-GISEL-NEXT: v_div_scale_f64 v[6:7], vcc, 1.0, v[0:1], 1.0 +; VI-GISEL-NEXT: v_fma_f64 v[8:9], -v[2:3], v[4:5], 1.0 +; VI-GISEL-NEXT: v_fma_f64 v[4:5], v[4:5], v[8:9], v[4:5] +; VI-GISEL-NEXT: v_mul_f64 v[8:9], v[6:7], v[4:5] +; VI-GISEL-NEXT: v_fma_f64 v[2:3], -v[2:3], v[8:9], v[6:7] +; VI-GISEL-NEXT: v_div_fmas_f64 v[2:3], v[2:3], v[4:5], v[8:9] +; VI-GISEL-NEXT: v_div_fixup_f64 v[0:1], v[2:3], v[0:1], 1.0 +; VI-GISEL-NEXT: v_readfirstlane_b32 s0, v0 +; VI-GISEL-NEXT: v_readfirstlane_b32 s1, v1 +; VI-GISEL-NEXT: ; return to shader part epilog + %rsq = call contract double @llvm.amdgcn.sqrt.f64(double %x) + %result = fdiv contract double 1.0, %rsq + %cast = bitcast double %result to <2 x i32> + %cast.0 = extractelement <2 x i32> %cast, i32 0 + %cast.1 = extractelement <2 x i32> %cast, i32 1 + %lane.0 = call i32 @llvm.amdgcn.readfirstlane(i32 %cast.0) + %lane.1 = call i32 @llvm.amdgcn.readfirstlane(i32 %cast.1) + %insert.0 = insertelement <2 x i32> poison, i32 %lane.0, i32 0 + %insert.1 = insertelement <2 x i32> %insert.0, i32 %lane.1, i32 1 + ret <2 x i32> %insert.1 +} + +define double @v_div_contract_sqrt_f64(double %x, double %y) { +; SI-SDAG-LABEL: v_div_contract_sqrt_f64: +; SI-SDAG: ; %bb.0: +; SI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-SDAG-NEXT: v_sqrt_f64_e32 v[2:3], v[2:3] +; SI-SDAG-NEXT: v_div_scale_f64 v[4:5], s[4:5], v[2:3], v[2:3], v[0:1] +; SI-SDAG-NEXT: v_rcp_f64_e32 v[6:7], v[4:5] +; SI-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; SI-SDAG-NEXT: v_fma_f64 v[8:9], -v[4:5], v[6:7], 1.0 +; SI-SDAG-NEXT: v_fma_f64 v[6:7], v[6:7], v[8:9], v[6:7] +; SI-SDAG-NEXT: v_div_scale_f64 v[8:9], s[4:5], v[0:1], v[2:3], v[0:1] +; SI-SDAG-NEXT: v_fma_f64 v[10:11], -v[4:5], v[6:7], 1.0 +; SI-SDAG-NEXT: v_fma_f64 v[6:7], v[6:7], v[10:11], v[6:7] +; SI-SDAG-NEXT: v_cmp_eq_u32_e64 s[4:5], v1, v9 +; SI-SDAG-NEXT: v_mul_f64 v[10:11], v[8:9], v[6:7] +; SI-SDAG-NEXT: s_xor_b64 vcc, s[4:5], vcc +; SI-SDAG-NEXT: v_fma_f64 v[4:5], -v[4:5], v[10:11], v[8:9] +; SI-SDAG-NEXT: v_div_fmas_f64 v[4:5], v[4:5], v[6:7], v[10:11] +; SI-SDAG-NEXT: v_div_fixup_f64 v[0:1], v[4:5], v[2:3], v[0:1] +; SI-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; SI-GISEL-LABEL: v_div_contract_sqrt_f64: +; SI-GISEL: ; %bb.0: +; SI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-GISEL-NEXT: v_sqrt_f64_e32 v[2:3], v[2:3] +; SI-GISEL-NEXT: v_div_scale_f64 v[4:5], s[4:5], v[2:3], v[2:3], v[0:1] +; SI-GISEL-NEXT: v_div_scale_f64 v[10:11], s[4:5], v[0:1], v[2:3], v[0:1] +; SI-GISEL-NEXT: v_rcp_f64_e32 v[6:7], v[4:5] +; SI-GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v3, v5 +; SI-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v1, v11 +; SI-GISEL-NEXT: s_xor_b64 vcc, vcc, s[4:5] +; SI-GISEL-NEXT: v_fma_f64 v[8:9], -v[4:5], v[6:7], 1.0 +; SI-GISEL-NEXT: v_fma_f64 v[6:7], v[6:7], v[8:9], v[6:7] +; SI-GISEL-NEXT: v_fma_f64 v[8:9], -v[4:5], v[6:7], 1.0 +; SI-GISEL-NEXT: v_fma_f64 v[6:7], v[6:7], v[8:9], v[6:7] +; SI-GISEL-NEXT: v_mul_f64 v[8:9], v[10:11], v[6:7] +; SI-GISEL-NEXT: v_fma_f64 v[4:5], -v[4:5], v[8:9], v[10:11] +; SI-GISEL-NEXT: v_div_fmas_f64 v[4:5], v[4:5], v[6:7], v[8:9] +; SI-GISEL-NEXT: v_div_fixup_f64 v[0:1], v[4:5], v[2:3], v[0:1] +; SI-GISEL-NEXT: s_setpc_b64 s[30:31] +; +; VI-SDAG-LABEL: v_div_contract_sqrt_f64: +; VI-SDAG: ; %bb.0: +; VI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-SDAG-NEXT: v_sqrt_f64_e32 v[2:3], v[2:3] +; VI-SDAG-NEXT: v_div_scale_f64 v[4:5], s[4:5], v[2:3], v[2:3], v[0:1] +; VI-SDAG-NEXT: v_rcp_f64_e32 v[6:7], v[4:5] +; VI-SDAG-NEXT: v_fma_f64 v[8:9], -v[4:5], v[6:7], 1.0 +; VI-SDAG-NEXT: v_fma_f64 v[6:7], v[6:7], v[8:9], v[6:7] +; VI-SDAG-NEXT: v_div_scale_f64 v[8:9], vcc, v[0:1], v[2:3], v[0:1] +; VI-SDAG-NEXT: v_fma_f64 v[10:11], -v[4:5], v[6:7], 1.0 +; VI-SDAG-NEXT: v_fma_f64 v[6:7], v[6:7], v[10:11], v[6:7] +; VI-SDAG-NEXT: v_mul_f64 v[10:11], v[8:9], v[6:7] +; VI-SDAG-NEXT: v_fma_f64 v[4:5], -v[4:5], v[10:11], v[8:9] +; VI-SDAG-NEXT: v_div_fmas_f64 v[4:5], v[4:5], v[6:7], v[10:11] +; VI-SDAG-NEXT: v_div_fixup_f64 v[0:1], v[4:5], v[2:3], v[0:1] +; VI-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; VI-GISEL-LABEL: v_div_contract_sqrt_f64: +; VI-GISEL: ; %bb.0: +; VI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-GISEL-NEXT: v_sqrt_f64_e32 v[2:3], v[2:3] +; VI-GISEL-NEXT: v_div_scale_f64 v[4:5], s[4:5], v[2:3], v[2:3], v[0:1] +; VI-GISEL-NEXT: v_rcp_f64_e32 v[6:7], v[4:5] +; VI-GISEL-NEXT: v_fma_f64 v[8:9], -v[4:5], v[6:7], 1.0 +; VI-GISEL-NEXT: v_fma_f64 v[6:7], v[6:7], v[8:9], v[6:7] +; VI-GISEL-NEXT: v_div_scale_f64 v[8:9], vcc, v[0:1], v[2:3], v[0:1] +; VI-GISEL-NEXT: v_fma_f64 v[10:11], -v[4:5], v[6:7], 1.0 +; VI-GISEL-NEXT: v_fma_f64 v[6:7], v[6:7], v[10:11], v[6:7] +; VI-GISEL-NEXT: v_mul_f64 v[10:11], v[8:9], v[6:7] +; VI-GISEL-NEXT: v_fma_f64 v[4:5], -v[4:5], v[10:11], v[8:9] +; VI-GISEL-NEXT: v_div_fmas_f64 v[4:5], v[4:5], v[6:7], v[10:11] +; VI-GISEL-NEXT: v_div_fixup_f64 v[0:1], v[4:5], v[2:3], v[0:1] +; VI-GISEL-NEXT: s_setpc_b64 s[30:31] + %sqrt = call contract double @llvm.sqrt.f64(double %y) + %rsq = fdiv contract double %x, %sqrt + ret double %rsq +} + +define double @v_div_arcp_sqrt_f64(double %x, double %y) { +; SI-SDAG-LABEL: v_div_arcp_sqrt_f64: +; SI-SDAG: ; %bb.0: +; SI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-SDAG-NEXT: v_sqrt_f64_e32 v[2:3], v[2:3] +; SI-SDAG-NEXT: v_div_scale_f64 v[4:5], s[4:5], v[2:3], v[2:3], v[0:1] +; SI-SDAG-NEXT: v_rcp_f64_e32 v[6:7], v[4:5] +; SI-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; SI-SDAG-NEXT: v_fma_f64 v[8:9], -v[4:5], v[6:7], 1.0 +; SI-SDAG-NEXT: v_fma_f64 v[6:7], v[6:7], v[8:9], v[6:7] +; SI-SDAG-NEXT: v_div_scale_f64 v[8:9], s[4:5], v[0:1], v[2:3], v[0:1] +; SI-SDAG-NEXT: v_fma_f64 v[10:11], -v[4:5], v[6:7], 1.0 +; SI-SDAG-NEXT: v_fma_f64 v[6:7], v[6:7], v[10:11], v[6:7] +; SI-SDAG-NEXT: v_cmp_eq_u32_e64 s[4:5], v1, v9 +; SI-SDAG-NEXT: v_mul_f64 v[10:11], v[8:9], v[6:7] +; SI-SDAG-NEXT: s_xor_b64 vcc, s[4:5], vcc +; SI-SDAG-NEXT: v_fma_f64 v[4:5], -v[4:5], v[10:11], v[8:9] +; SI-SDAG-NEXT: v_div_fmas_f64 v[4:5], v[4:5], v[6:7], v[10:11] +; SI-SDAG-NEXT: v_div_fixup_f64 v[0:1], v[4:5], v[2:3], v[0:1] +; SI-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; SI-GISEL-LABEL: v_div_arcp_sqrt_f64: +; SI-GISEL: ; %bb.0: +; SI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-GISEL-NEXT: v_sqrt_f64_e32 v[2:3], v[2:3] +; SI-GISEL-NEXT: v_div_scale_f64 v[4:5], s[4:5], v[2:3], v[2:3], v[0:1] +; SI-GISEL-NEXT: v_div_scale_f64 v[10:11], s[4:5], v[0:1], v[2:3], v[0:1] +; SI-GISEL-NEXT: v_rcp_f64_e32 v[6:7], v[4:5] +; SI-GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v3, v5 +; SI-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v1, v11 +; SI-GISEL-NEXT: s_xor_b64 vcc, vcc, s[4:5] +; SI-GISEL-NEXT: v_fma_f64 v[8:9], -v[4:5], v[6:7], 1.0 +; SI-GISEL-NEXT: v_fma_f64 v[6:7], v[6:7], v[8:9], v[6:7] +; SI-GISEL-NEXT: v_fma_f64 v[8:9], -v[4:5], v[6:7], 1.0 +; SI-GISEL-NEXT: v_fma_f64 v[6:7], v[6:7], v[8:9], v[6:7] +; SI-GISEL-NEXT: v_mul_f64 v[8:9], v[10:11], v[6:7] +; SI-GISEL-NEXT: v_fma_f64 v[4:5], -v[4:5], v[8:9], v[10:11] +; SI-GISEL-NEXT: v_div_fmas_f64 v[4:5], v[4:5], v[6:7], v[8:9] +; SI-GISEL-NEXT: v_div_fixup_f64 v[0:1], v[4:5], v[2:3], v[0:1] +; SI-GISEL-NEXT: s_setpc_b64 s[30:31] +; +; VI-SDAG-LABEL: v_div_arcp_sqrt_f64: +; VI-SDAG: ; %bb.0: +; VI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-SDAG-NEXT: v_sqrt_f64_e32 v[2:3], v[2:3] +; VI-SDAG-NEXT: v_div_scale_f64 v[4:5], s[4:5], v[2:3], v[2:3], v[0:1] +; VI-SDAG-NEXT: v_rcp_f64_e32 v[6:7], v[4:5] +; VI-SDAG-NEXT: v_fma_f64 v[8:9], -v[4:5], v[6:7], 1.0 +; VI-SDAG-NEXT: v_fma_f64 v[6:7], v[6:7], v[8:9], v[6:7] +; VI-SDAG-NEXT: v_div_scale_f64 v[8:9], vcc, v[0:1], v[2:3], v[0:1] +; VI-SDAG-NEXT: v_fma_f64 v[10:11], -v[4:5], v[6:7], 1.0 +; VI-SDAG-NEXT: v_fma_f64 v[6:7], v[6:7], v[10:11], v[6:7] +; VI-SDAG-NEXT: v_mul_f64 v[10:11], v[8:9], v[6:7] +; VI-SDAG-NEXT: v_fma_f64 v[4:5], -v[4:5], v[10:11], v[8:9] +; VI-SDAG-NEXT: v_div_fmas_f64 v[4:5], v[4:5], v[6:7], v[10:11] +; VI-SDAG-NEXT: v_div_fixup_f64 v[0:1], v[4:5], v[2:3], v[0:1] +; VI-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; VI-GISEL-LABEL: v_div_arcp_sqrt_f64: +; VI-GISEL: ; %bb.0: +; VI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-GISEL-NEXT: v_sqrt_f64_e32 v[2:3], v[2:3] +; VI-GISEL-NEXT: v_div_scale_f64 v[4:5], s[4:5], v[2:3], v[2:3], v[0:1] +; VI-GISEL-NEXT: v_rcp_f64_e32 v[6:7], v[4:5] +; VI-GISEL-NEXT: v_fma_f64 v[8:9], -v[4:5], v[6:7], 1.0 +; VI-GISEL-NEXT: v_fma_f64 v[6:7], v[6:7], v[8:9], v[6:7] +; VI-GISEL-NEXT: v_div_scale_f64 v[8:9], vcc, v[0:1], v[2:3], v[0:1] +; VI-GISEL-NEXT: v_fma_f64 v[10:11], -v[4:5], v[6:7], 1.0 +; VI-GISEL-NEXT: v_fma_f64 v[6:7], v[6:7], v[10:11], v[6:7] +; VI-GISEL-NEXT: v_mul_f64 v[10:11], v[8:9], v[6:7] +; VI-GISEL-NEXT: v_fma_f64 v[4:5], -v[4:5], v[10:11], v[8:9] +; VI-GISEL-NEXT: v_div_fmas_f64 v[4:5], v[4:5], v[6:7], v[10:11] +; VI-GISEL-NEXT: v_div_fixup_f64 v[0:1], v[4:5], v[2:3], v[0:1] +; VI-GISEL-NEXT: s_setpc_b64 s[30:31] + %sqrt = call double @llvm.sqrt.f64(double %y) + %rsq = fdiv arcp double %x, %sqrt + ret double %rsq +} + +define double @v_div_contract_arcp_sqrt_f64(double %x, double %y) { +; SI-SDAG-LABEL: v_div_contract_arcp_sqrt_f64: +; SI-SDAG: ; %bb.0: +; SI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-SDAG-NEXT: v_sqrt_f64_e32 v[2:3], v[2:3] +; SI-SDAG-NEXT: v_div_scale_f64 v[4:5], s[4:5], v[2:3], v[2:3], v[0:1] +; SI-SDAG-NEXT: v_rcp_f64_e32 v[6:7], v[4:5] +; SI-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; SI-SDAG-NEXT: v_fma_f64 v[8:9], -v[4:5], v[6:7], 1.0 +; SI-SDAG-NEXT: v_fma_f64 v[6:7], v[6:7], v[8:9], v[6:7] +; SI-SDAG-NEXT: v_div_scale_f64 v[8:9], s[4:5], v[0:1], v[2:3], v[0:1] +; SI-SDAG-NEXT: v_fma_f64 v[10:11], -v[4:5], v[6:7], 1.0 +; SI-SDAG-NEXT: v_fma_f64 v[6:7], v[6:7], v[10:11], v[6:7] +; SI-SDAG-NEXT: v_cmp_eq_u32_e64 s[4:5], v1, v9 +; SI-SDAG-NEXT: v_mul_f64 v[10:11], v[8:9], v[6:7] +; SI-SDAG-NEXT: s_xor_b64 vcc, s[4:5], vcc +; SI-SDAG-NEXT: v_fma_f64 v[4:5], -v[4:5], v[10:11], v[8:9] +; SI-SDAG-NEXT: v_div_fmas_f64 v[4:5], v[4:5], v[6:7], v[10:11] +; SI-SDAG-NEXT: v_div_fixup_f64 v[0:1], v[4:5], v[2:3], v[0:1] +; SI-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; SI-GISEL-LABEL: v_div_contract_arcp_sqrt_f64: +; SI-GISEL: ; %bb.0: +; SI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-GISEL-NEXT: v_sqrt_f64_e32 v[2:3], v[2:3] +; SI-GISEL-NEXT: v_div_scale_f64 v[4:5], s[4:5], v[2:3], v[2:3], v[0:1] +; SI-GISEL-NEXT: v_div_scale_f64 v[10:11], s[4:5], v[0:1], v[2:3], v[0:1] +; SI-GISEL-NEXT: v_rcp_f64_e32 v[6:7], v[4:5] +; SI-GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v3, v5 +; SI-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v1, v11 +; SI-GISEL-NEXT: s_xor_b64 vcc, vcc, s[4:5] +; SI-GISEL-NEXT: v_fma_f64 v[8:9], -v[4:5], v[6:7], 1.0 +; SI-GISEL-NEXT: v_fma_f64 v[6:7], v[6:7], v[8:9], v[6:7] +; SI-GISEL-NEXT: v_fma_f64 v[8:9], -v[4:5], v[6:7], 1.0 +; SI-GISEL-NEXT: v_fma_f64 v[6:7], v[6:7], v[8:9], v[6:7] +; SI-GISEL-NEXT: v_mul_f64 v[8:9], v[10:11], v[6:7] +; SI-GISEL-NEXT: v_fma_f64 v[4:5], -v[4:5], v[8:9], v[10:11] +; SI-GISEL-NEXT: v_div_fmas_f64 v[4:5], v[4:5], v[6:7], v[8:9] +; SI-GISEL-NEXT: v_div_fixup_f64 v[0:1], v[4:5], v[2:3], v[0:1] +; SI-GISEL-NEXT: s_setpc_b64 s[30:31] +; +; VI-SDAG-LABEL: v_div_contract_arcp_sqrt_f64: +; VI-SDAG: ; %bb.0: +; VI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-SDAG-NEXT: v_sqrt_f64_e32 v[2:3], v[2:3] +; VI-SDAG-NEXT: v_div_scale_f64 v[4:5], s[4:5], v[2:3], v[2:3], v[0:1] +; VI-SDAG-NEXT: v_rcp_f64_e32 v[6:7], v[4:5] +; VI-SDAG-NEXT: v_fma_f64 v[8:9], -v[4:5], v[6:7], 1.0 +; VI-SDAG-NEXT: v_fma_f64 v[6:7], v[6:7], v[8:9], v[6:7] +; VI-SDAG-NEXT: v_div_scale_f64 v[8:9], vcc, v[0:1], v[2:3], v[0:1] +; VI-SDAG-NEXT: v_fma_f64 v[10:11], -v[4:5], v[6:7], 1.0 +; VI-SDAG-NEXT: v_fma_f64 v[6:7], v[6:7], v[10:11], v[6:7] +; VI-SDAG-NEXT: v_mul_f64 v[10:11], v[8:9], v[6:7] +; VI-SDAG-NEXT: v_fma_f64 v[4:5], -v[4:5], v[10:11], v[8:9] +; VI-SDAG-NEXT: v_div_fmas_f64 v[4:5], v[4:5], v[6:7], v[10:11] +; VI-SDAG-NEXT: v_div_fixup_f64 v[0:1], v[4:5], v[2:3], v[0:1] +; VI-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; VI-GISEL-LABEL: v_div_contract_arcp_sqrt_f64: +; VI-GISEL: ; %bb.0: +; VI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-GISEL-NEXT: v_sqrt_f64_e32 v[2:3], v[2:3] +; VI-GISEL-NEXT: v_div_scale_f64 v[4:5], s[4:5], v[2:3], v[2:3], v[0:1] +; VI-GISEL-NEXT: v_rcp_f64_e32 v[6:7], v[4:5] +; VI-GISEL-NEXT: v_fma_f64 v[8:9], -v[4:5], v[6:7], 1.0 +; VI-GISEL-NEXT: v_fma_f64 v[6:7], v[6:7], v[8:9], v[6:7] +; VI-GISEL-NEXT: v_div_scale_f64 v[8:9], vcc, v[0:1], v[2:3], v[0:1] +; VI-GISEL-NEXT: v_fma_f64 v[10:11], -v[4:5], v[6:7], 1.0 +; VI-GISEL-NEXT: v_fma_f64 v[6:7], v[6:7], v[10:11], v[6:7] +; VI-GISEL-NEXT: v_mul_f64 v[10:11], v[8:9], v[6:7] +; VI-GISEL-NEXT: v_fma_f64 v[4:5], -v[4:5], v[10:11], v[8:9] +; VI-GISEL-NEXT: v_div_fmas_f64 v[4:5], v[4:5], v[6:7], v[10:11] +; VI-GISEL-NEXT: v_div_fixup_f64 v[0:1], v[4:5], v[2:3], v[0:1] +; VI-GISEL-NEXT: s_setpc_b64 s[30:31] + %sqrt = call contract double @llvm.sqrt.f64(double %y) + %rsq = fdiv contract arcp double %x, %sqrt + ret double %rsq +} + +define double @v_div_const_contract_sqrt_f64(double %x) { +; SI-SDAG-LABEL: v_div_const_contract_sqrt_f64: +; SI-SDAG: ; %bb.0: +; SI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-SDAG-NEXT: v_sqrt_f64_e32 v[0:1], v[0:1] +; SI-SDAG-NEXT: s_mov_b32 s6, 0 +; SI-SDAG-NEXT: s_mov_b32 s7, 0x40700000 +; SI-SDAG-NEXT: v_div_scale_f64 v[2:3], s[4:5], v[0:1], v[0:1], s[6:7] +; SI-SDAG-NEXT: v_rcp_f64_e32 v[4:5], v[2:3] +; SI-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, v1, v3 +; SI-SDAG-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0 +; SI-SDAG-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5] +; SI-SDAG-NEXT: v_div_scale_f64 v[6:7], s[4:5], s[6:7], v[0:1], s[6:7] +; SI-SDAG-NEXT: v_fma_f64 v[8:9], -v[2:3], v[4:5], 1.0 +; SI-SDAG-NEXT: v_fma_f64 v[4:5], v[4:5], v[8:9], v[4:5] +; SI-SDAG-NEXT: v_cmp_eq_u32_e64 s[4:5], s7, v7 +; SI-SDAG-NEXT: v_mul_f64 v[8:9], v[6:7], v[4:5] +; SI-SDAG-NEXT: s_xor_b64 vcc, s[4:5], vcc +; SI-SDAG-NEXT: v_fma_f64 v[2:3], -v[2:3], v[8:9], v[6:7] +; SI-SDAG-NEXT: v_div_fmas_f64 v[2:3], v[2:3], v[4:5], v[8:9] +; SI-SDAG-NEXT: v_div_fixup_f64 v[0:1], v[2:3], v[0:1], s[6:7] +; SI-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; SI-GISEL-LABEL: v_div_const_contract_sqrt_f64: +; SI-GISEL: ; %bb.0: +; SI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-GISEL-NEXT: v_sqrt_f64_e32 v[0:1], v[0:1] +; SI-GISEL-NEXT: s_mov_b32 s6, 0 +; SI-GISEL-NEXT: s_mov_b32 s7, 0x40700000 +; SI-GISEL-NEXT: v_mov_b32_e32 v10, 0x40700000 +; SI-GISEL-NEXT: v_div_scale_f64 v[2:3], s[4:5], v[0:1], v[0:1], s[6:7] +; SI-GISEL-NEXT: v_div_scale_f64 v[8:9], s[4:5], s[6:7], v[0:1], s[6:7] +; SI-GISEL-NEXT: v_rcp_f64_e32 v[4:5], v[2:3] +; SI-GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v1, v3 +; SI-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v10, v9 +; SI-GISEL-NEXT: s_xor_b64 vcc, vcc, s[4:5] +; SI-GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0 +; SI-GISEL-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5] +; SI-GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0 +; SI-GISEL-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5] +; SI-GISEL-NEXT: v_mul_f64 v[6:7], v[8:9], v[4:5] +; SI-GISEL-NEXT: v_fma_f64 v[2:3], -v[2:3], v[6:7], v[8:9] +; SI-GISEL-NEXT: v_div_fmas_f64 v[2:3], v[2:3], v[4:5], v[6:7] +; SI-GISEL-NEXT: v_div_fixup_f64 v[0:1], v[2:3], v[0:1], s[6:7] +; SI-GISEL-NEXT: s_setpc_b64 s[30:31] +; +; VI-SDAG-LABEL: v_div_const_contract_sqrt_f64: +; VI-SDAG: ; %bb.0: +; VI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-SDAG-NEXT: v_sqrt_f64_e32 v[0:1], v[0:1] +; VI-SDAG-NEXT: s_mov_b32 s4, 0 +; VI-SDAG-NEXT: s_mov_b32 s5, 0x40700000 +; VI-SDAG-NEXT: v_div_scale_f64 v[2:3], s[6:7], v[0:1], v[0:1], s[4:5] +; VI-SDAG-NEXT: v_rcp_f64_e32 v[4:5], v[2:3] +; VI-SDAG-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0 +; VI-SDAG-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5] +; VI-SDAG-NEXT: v_div_scale_f64 v[6:7], vcc, s[4:5], v[0:1], s[4:5] +; VI-SDAG-NEXT: v_fma_f64 v[8:9], -v[2:3], v[4:5], 1.0 +; VI-SDAG-NEXT: v_fma_f64 v[4:5], v[4:5], v[8:9], v[4:5] +; VI-SDAG-NEXT: v_mul_f64 v[8:9], v[6:7], v[4:5] +; VI-SDAG-NEXT: v_fma_f64 v[2:3], -v[2:3], v[8:9], v[6:7] +; VI-SDAG-NEXT: v_div_fmas_f64 v[2:3], v[2:3], v[4:5], v[8:9] +; VI-SDAG-NEXT: v_div_fixup_f64 v[0:1], v[2:3], v[0:1], s[4:5] +; VI-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; VI-GISEL-LABEL: v_div_const_contract_sqrt_f64: +; VI-GISEL: ; %bb.0: +; VI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-GISEL-NEXT: v_sqrt_f64_e32 v[0:1], v[0:1] +; VI-GISEL-NEXT: s_mov_b32 s4, 0 +; VI-GISEL-NEXT: s_mov_b32 s5, 0x40700000 +; VI-GISEL-NEXT: v_div_scale_f64 v[2:3], s[6:7], v[0:1], v[0:1], s[4:5] +; VI-GISEL-NEXT: v_rcp_f64_e32 v[4:5], v[2:3] +; VI-GISEL-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0 +; VI-GISEL-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5] +; VI-GISEL-NEXT: v_div_scale_f64 v[6:7], vcc, s[4:5], v[0:1], s[4:5] +; VI-GISEL-NEXT: v_fma_f64 v[8:9], -v[2:3], v[4:5], 1.0 +; VI-GISEL-NEXT: v_fma_f64 v[4:5], v[4:5], v[8:9], v[4:5] +; VI-GISEL-NEXT: v_mul_f64 v[8:9], v[6:7], v[4:5] +; VI-GISEL-NEXT: v_fma_f64 v[2:3], -v[2:3], v[8:9], v[6:7] +; VI-GISEL-NEXT: v_div_fmas_f64 v[2:3], v[2:3], v[4:5], v[8:9] +; VI-GISEL-NEXT: v_div_fixup_f64 v[0:1], v[2:3], v[0:1], s[4:5] +; VI-GISEL-NEXT: s_setpc_b64 s[30:31] + %sqrt = call contract double @llvm.sqrt.f64(double %x) + %rsq = fdiv contract double 256.0, %sqrt + ret double %rsq +} + +attributes #0 = { "unsafe-fp-math"="true" } ;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: -; CI: {{.*}} -; CI-UNSAFE: {{.*}} ; GCN: {{.*}} -; GCN-SAFE: {{.*}} -; SI: {{.*}} -; SI-UNSAFE: {{.*}} |