From e3fd8f83a801b1918508c7c0a71cc31bc95ad4d2 Mon Sep 17 00:00:00 2001
From: Matt Arsenault
Date: Sun, 20 Nov 2022 08:40:25 -0800
Subject: AMDGPU: Correctly expand f64 sqrt intrinsic

rocm-device-libs and llpc were avoiding using f64 sqrt intrinsics in
favor of their own expansions. Port the expansion into the backend. Both
of these users should be updated to call the intrinsic instead.

The library and llpc expansions are slightly different. llpc uses an
ldexp to do the scale; the library uses a multiply.

Use ldexp to do the scale instead of the multiply. I believe v_ldexp_f64
and v_mul_f64 are always the same number of cycles, but it's cheaper to
materialize the 32-bit integer constant than the 64-bit double constant.

The libraries have another fast version of sqrt which will be handled
separately.

I am tempted to do this in an IR expansion instead. In the IR we could
take advantage of computeKnownFPClass to avoid the 0-or-inf argument
check.
---
 llvm/include/llvm/CodeGen/GlobalISel/MachineIRBuilder.h | 7 +++++++
 1 file changed, 7 insertions(+)

(limited to 'llvm/include')

diff --git a/llvm/include/llvm/CodeGen/GlobalISel/MachineIRBuilder.h b/llvm/include/llvm/CodeGen/GlobalISel/MachineIRBuilder.h
index a1ff764..5341b57 100644
--- a/llvm/include/llvm/CodeGen/GlobalISel/MachineIRBuilder.h
+++ b/llvm/include/llvm/CodeGen/GlobalISel/MachineIRBuilder.h
@@ -1181,6 +1181,13 @@ public:
                                 const SrcOp &Op0, const SrcOp &Op1,
                                 std::optional<unsigned> Flags = std::nullopt);
 
+  /// Build and insert a \p Res = G_IS_FPCLASS \p Src, \p Mask
+  MachineInstrBuilder buildIsFPClass(const DstOp &Res, const SrcOp &Src,
+                                     unsigned Mask) {
+    return buildInstr(TargetOpcode::G_IS_FPCLASS, {Res},
+                      {Src, SrcOp(static_cast<int64_t>(Mask))});
+  }
+
   /// Build and insert a \p Res = G_SELECT \p Tst, \p Op0, \p Op1
   ///
   /// \pre setBasicBlock or setMI must have been called.
-- 
cgit v1.1