//===- X86LegalizerInfo.cpp --------------------------------------*- C++ -*-==//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
/// \file
/// This file implements the targeting of the MachineLegalizer class for X86.
/// \todo This should be generated by TableGen.
//===----------------------------------------------------------------------===//

#include "X86LegalizerInfo.h"
#include "X86Subtarget.h"
#include "X86TargetMachine.h"
#include "llvm/CodeGen/GlobalISel/GenericMachineInstrs.h"
#include "llvm/CodeGen/GlobalISel/LegalizerHelper.h"
#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
#include "llvm/CodeGen/MachineConstantPool.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/TargetOpcodes.h"
#include "llvm/CodeGen/ValueTypes.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/IntrinsicsX86.h"
#include "llvm/IR/Type.h"

using namespace llvm;
using namespace TargetOpcode;
using namespace LegalizeActions;
using namespace LegalityPredicates;

X86LegalizerInfo::X86LegalizerInfo(const X86Subtarget &STI,
                                   const X86TargetMachine &TM)
    : Subtarget(STI) {

  bool Is64Bit = Subtarget.is64Bit();
  bool HasCMOV = Subtarget.canUseCMOV();
  bool HasSSE1 = Subtarget.hasSSE1();
  bool HasSSE2 = Subtarget.hasSSE2();
  bool HasSSE41 = Subtarget.hasSSE41();
  bool HasAVX = Subtarget.hasAVX();
  bool HasAVX2 = Subtarget.hasAVX2();
  bool HasAVX512 = Subtarget.hasAVX512();
  bool HasVLX = Subtarget.hasVLX();
  bool HasDQI = Subtarget.hasAVX512() && Subtarget.hasDQI();
  bool HasBWI = Subtarget.hasAVX512() && Subtarget.hasBWI();
  bool UseX87 = !Subtarget.useSoftFloat() && Subtarget.hasX87();
  bool HasPOPCNT = Subtarget.hasPOPCNT();
  bool HasLZCNT = Subtarget.hasLZCNT();
  bool HasBMI = Subtarget.hasBMI();

  const LLT p0 = LLT::pointer(0, TM.getPointerSizeInBits(0));
  const LLT s1 = LLT::scalar(1);
  const LLT s8 = LLT::scalar(8);
  const LLT s16 = LLT::scalar(16);
  const LLT s32 = LLT::scalar(32);
  const LLT s64 = LLT::scalar(64);
  const LLT s80 = LLT::scalar(80);
  const LLT s128 = LLT::scalar(128);
  const LLT sMaxScalar = Subtarget.is64Bit() ? s64 : s32;
  const LLT v2s32 = LLT::fixed_vector(2, 32);
  const LLT v4s8 = LLT::fixed_vector(4, 8);

  const LLT v16s8 = LLT::fixed_vector(16, 8);
  const LLT v8s16 = LLT::fixed_vector(8, 16);
  const LLT v4s32 = LLT::fixed_vector(4, 32);
  const LLT v2s64 = LLT::fixed_vector(2, 64);
  const LLT v2p0 = LLT::fixed_vector(2, p0);

  const LLT v32s8 = LLT::fixed_vector(32, 8);
  const LLT v16s16 = LLT::fixed_vector(16, 16);
  const LLT v8s32 = LLT::fixed_vector(8, 32);
  const LLT v4s64 = LLT::fixed_vector(4, 64);
  const LLT v4p0 = LLT::fixed_vector(4, p0);

  const LLT v64s8 = LLT::fixed_vector(64, 8);
  const LLT v32s16 = LLT::fixed_vector(32, 16);
  const LLT v16s32 = LLT::fixed_vector(16, 32);
  const LLT v8s64 = LLT::fixed_vector(8, 64);

  const LLT s8MaxVector = HasAVX512 ? v64s8 : HasAVX ? v32s8 : v16s8;
  const LLT s16MaxVector = HasAVX512 ? v32s16 : HasAVX ? v16s16 : v8s16;
  const LLT s32MaxVector = HasAVX512 ? v16s32 : HasAVX ? v8s32 : v4s32;
  const LLT s64MaxVector = HasAVX512 ? v8s64 : HasAVX ? v4s64 : v2s64;
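  // sMaxScalar is the widest scalar that fits in a general-purpose register
  // (s64 in 64-bit mode, s32 otherwise); the sNMaxVector types correspond to
  // the widest available vector register: ZMM with AVX-512, YMM with AVX,
  // otherwise XMM.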
  // todo: AVX512 bool vector predicate types

  // implicit/constants
  // 32/64-bits needs support for s64/s128 to handle cases:
  // s64 = EXTEND (G_IMPLICIT_DEF s32) -> s64 = G_IMPLICIT_DEF
  // s128 = EXTEND (G_IMPLICIT_DEF s32/s64) -> s128 = G_IMPLICIT_DEF
  getActionDefinitionsBuilder(G_IMPLICIT_DEF)
      .legalFor({p0, s1, s8, s16, s32, s64})
      .legalFor(Is64Bit, {s128});

  getActionDefinitionsBuilder(G_CONSTANT)
      .legalFor({p0, s8, s16, s32})
      .legalFor(Is64Bit, {s64})
      .widenScalarToNextPow2(0, /*Min=*/8)
      .clampScalar(0, s8, sMaxScalar);

  getActionDefinitionsBuilder({G_LROUND, G_LLROUND, G_FCOS, G_FCOSH, G_FACOS,
                               G_FSIN, G_FSINH, G_FASIN, G_FTAN, G_FTANH,
                               G_FATAN, G_FATAN2, G_FPOW, G_FEXP, G_FEXP2,
                               G_FEXP10, G_FLOG, G_FLOG2, G_FLOG10, G_FPOWI,
                               G_FSINCOS, G_FCEIL, G_FFLOOR})
      .libcall();

  getActionDefinitionsBuilder(G_FSQRT)
      .legalFor(HasSSE1 || UseX87, {s32})
      .legalFor(HasSSE2 || UseX87, {s64})
      .legalFor(UseX87, {s80});

  getActionDefinitionsBuilder({G_GET_ROUNDING, G_SET_ROUNDING})
      .customFor({s32});

  // merge/unmerge
  for (unsigned Op : {G_MERGE_VALUES, G_UNMERGE_VALUES}) {
    unsigned BigTyIdx = Op == G_MERGE_VALUES ? 0 : 1;
    unsigned LitTyIdx = Op == G_MERGE_VALUES ? 1 : 0;
    getActionDefinitionsBuilder(Op)
        .widenScalarToNextPow2(LitTyIdx, /*Min=*/8)
        .widenScalarToNextPow2(BigTyIdx, /*Min=*/16)
        .minScalar(LitTyIdx, s8)
        .minScalar(BigTyIdx, s32)
        .legalIf([=](const LegalityQuery &Q) {
          switch (Q.Types[BigTyIdx].getSizeInBits()) {
          case 16:
          case 32:
          case 64:
          case 128:
          case 256:
          case 512:
            break;
          default:
            return false;
          }
          switch (Q.Types[LitTyIdx].getSizeInBits()) {
          case 8:
          case 16:
          case 32:
          case 64:
          case 128:
          case 256:
            return true;
          default:
            return false;
          }
        });
  }

  getActionDefinitionsBuilder({G_UMIN, G_UMAX, G_SMIN, G_SMAX})
      .widenScalarToNextPow2(0, /*Min=*/32)
      .lower();

  // integer addition/subtraction
  getActionDefinitionsBuilder({G_ADD, G_SUB})
      .legalFor({s8, s16, s32})
      .legalFor(Is64Bit, {s64})
      .legalFor(HasSSE2, {v16s8, v8s16, v4s32, v2s64})
      .legalFor(HasAVX2, {v32s8, v16s16, v8s32, v4s64})
      .legalFor(HasAVX512, {v16s32, v8s64})
      .legalFor(HasBWI, {v64s8, v32s16})
      .clampMinNumElements(0, s8, 16)
      .clampMinNumElements(0, s16, 8)
      .clampMinNumElements(0, s32, 4)
      .clampMinNumElements(0, s64, 2)
      .clampMaxNumElements(0, s8, HasBWI ? 64 : (HasAVX2 ? 32 : 16))
      .clampMaxNumElements(0, s16, HasBWI ? 32 : (HasAVX2 ? 16 : 8))
      .clampMaxNumElements(0, s32, HasAVX512 ? 16 : (HasAVX2 ? 8 : 4))
      .clampMaxNumElements(0, s64, HasAVX512 ? 8 : (HasAVX2 ? 4 : 2))
      .widenScalarToNextPow2(0, /*Min=*/32)
      .clampScalar(0, s8, sMaxScalar)
      .scalarize(0);

  getActionDefinitionsBuilder({G_UADDE, G_UADDO, G_USUBE, G_USUBO})
      .legalFor({{s8, s1}, {s16, s1}, {s32, s1}})
      .legalFor(Is64Bit, {{s64, s1}})
      .widenScalarToNextPow2(0, /*Min=*/32)
      .clampScalar(0, s8, sMaxScalar)
      .clampScalar(1, s1, s1)
      .scalarize(0);
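  // Note: x86 has no vector byte multiply instruction, which is why there are
  // no s8 vector entries in the G_MUL rules below; byte-element vectors end up
  // being scalarized.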
  // integer multiply
  getActionDefinitionsBuilder(G_MUL)
      .legalFor({s8, s16, s32})
      .legalFor(Is64Bit, {s64})
      .legalFor(HasSSE2, {v8s16})
      .legalFor(HasSSE41, {v4s32})
      .legalFor(HasAVX2, {v16s16, v8s32})
      .legalFor(HasAVX512, {v16s32})
      .legalFor(HasDQI, {v8s64})
      .legalFor(HasDQI && HasVLX, {v2s64, v4s64})
      .legalFor(HasBWI, {v32s16})
      .clampMinNumElements(0, s16, 8)
      .clampMinNumElements(0, s32, 4)
      .clampMinNumElements(0, s64, HasVLX ? 2 : 8)
      .clampMaxNumElements(0, s16, HasBWI ? 32 : (HasAVX2 ? 16 : 8))
      .clampMaxNumElements(0, s32, HasAVX512 ? 16 : (HasAVX2 ? 8 : 4))
      .clampMaxNumElements(0, s64, 8)
      .widenScalarToNextPow2(0, /*Min=*/32)
      .clampScalar(0, s8, sMaxScalar)
      .scalarize(0);

  getActionDefinitionsBuilder({G_SMULH, G_UMULH})
      .legalFor({s8, s16, s32})
      .legalFor(Is64Bit, {s64})
      .widenScalarToNextPow2(0, /*Min=*/32)
      .clampScalar(0, s8, sMaxScalar)
      .scalarize(0);

  // integer divisions
  getActionDefinitionsBuilder({G_SDIV, G_SREM, G_UDIV, G_UREM})
      .legalFor({s8, s16, s32})
      .legalFor(Is64Bit, {s64})
      .libcallFor({s64})
      .clampScalar(0, s8, sMaxScalar);

  // integer shifts
  getActionDefinitionsBuilder({G_SHL, G_LSHR, G_ASHR})
      .legalFor({{s8, s8}, {s16, s8}, {s32, s8}})
      .legalFor(Is64Bit, {{s64, s8}})
      .clampScalar(0, s8, sMaxScalar)
      .clampScalar(1, s8, s8);

  // integer logic
  getActionDefinitionsBuilder({G_AND, G_OR, G_XOR})
      .legalFor({s8, s16, s32})
      .legalFor(Is64Bit, {s64})
      .legalFor(HasSSE2, {v16s8, v8s16, v4s32, v2s64})
      .legalFor(HasAVX, {v32s8, v16s16, v8s32, v4s64})
      .legalFor(HasAVX512, {v64s8, v32s16, v16s32, v8s64})
      .clampMinNumElements(0, s8, 16)
      .clampMinNumElements(0, s16, 8)
      .clampMinNumElements(0, s32, 4)
      .clampMinNumElements(0, s64, 2)
      .clampMaxNumElements(0, s8, HasAVX512 ? 64 : (HasAVX ? 32 : 16))
      .clampMaxNumElements(0, s16, HasAVX512 ? 32 : (HasAVX ? 16 : 8))
      .clampMaxNumElements(0, s32, HasAVX512 ? 16 : (HasAVX ? 8 : 4))
      .clampMaxNumElements(0, s64, HasAVX512 ? 8 : (HasAVX ? 4 : 2))
      .widenScalarToNextPow2(0, /*Min=*/32)
      .clampScalar(0, s8, sMaxScalar)
      .scalarize(0);

  // integer comparison
  const std::initializer_list<LLT> IntTypes32 = {s8, s16, s32, p0};
  const std::initializer_list<LLT> IntTypes64 = {s8, s16, s32, s64, p0};

  getActionDefinitionsBuilder(G_ICMP)
      .legalForCartesianProduct({s8}, Is64Bit ? IntTypes64 : IntTypes32)
      .clampScalar(0, s8, s8)
      .clampScalar(1, s8, sMaxScalar);

  // bswap
  getActionDefinitionsBuilder(G_BSWAP)
      .legalFor({s32})
      .legalFor(Is64Bit, {s64})
      .widenScalarToNextPow2(0, /*Min=*/32)
      .clampScalar(0, s32, sMaxScalar);

  // popcount
  getActionDefinitionsBuilder(G_CTPOP)
      .legalFor(HasPOPCNT, {{s16, s16}, {s32, s32}})
      .legalFor(HasPOPCNT && Is64Bit, {{s64, s64}})
      .widenScalarToNextPow2(1, /*Min=*/16)
      .clampScalar(1, s16, sMaxScalar)
      .scalarSameSizeAs(0, 1);

  // count leading zeros (LZCNT)
  getActionDefinitionsBuilder(G_CTLZ)
      .legalFor(HasLZCNT, {{s16, s16}, {s32, s32}})
      .legalFor(HasLZCNT && Is64Bit, {{s64, s64}})
      .widenScalarToNextPow2(1, /*Min=*/16)
      .clampScalar(1, s16, sMaxScalar)
      .scalarSameSizeAs(0, 1);

  // count trailing zeros
  getActionDefinitionsBuilder(G_CTTZ_ZERO_UNDEF)
      .legalFor({{s16, s16}, {s32, s32}})
      .legalFor(Is64Bit, {{s64, s64}})
      .widenScalarToNextPow2(1, /*Min=*/16)
      .clampScalar(1, s16, sMaxScalar)
      .scalarSameSizeAs(0, 1);

  getActionDefinitionsBuilder(G_CTTZ)
      .legalFor(HasBMI, {{s16, s16}, {s32, s32}})
      .legalFor(HasBMI && Is64Bit, {{s64, s64}})
      .widenScalarToNextPow2(1, /*Min=*/16)
      .clampScalar(1, s16, sMaxScalar)
      .scalarSameSizeAs(0, 1);
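  // G_CTTZ_ZERO_UNDEF can always be selected to BSF, which leaves the
  // destination undefined for a zero input, whereas plain G_CTTZ needs BMI's
  // TZCNT (and G_CTLZ needs LZCNT) to get a defined result for zero.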
  // control flow
  getActionDefinitionsBuilder(G_PHI)
      .legalFor({s8, s16, s32, p0})
      .legalFor(UseX87, {s80})
      .legalFor(Is64Bit, {s64})
      .legalFor(HasSSE1, {v16s8, v8s16, v4s32, v2s64})
      .legalFor(HasAVX, {v32s8, v16s16, v8s32, v4s64})
      .legalFor(HasAVX512, {v64s8, v32s16, v16s32, v8s64})
      .clampMinNumElements(0, s8, 16)
      .clampMinNumElements(0, s16, 8)
      .clampMinNumElements(0, s32, 4)
      .clampMinNumElements(0, s64, 2)
      .clampMaxNumElements(0, s8, HasAVX512 ? 64 : (HasAVX ? 32 : 16))
      .clampMaxNumElements(0, s16, HasAVX512 ? 32 : (HasAVX ? 16 : 8))
      .clampMaxNumElements(0, s32, HasAVX512 ? 16 : (HasAVX ? 8 : 4))
      .clampMaxNumElements(0, s64, HasAVX512 ? 8 : (HasAVX ? 4 : 2))
      .widenScalarToNextPow2(0, /*Min=*/32)
      .clampScalar(0, s8, sMaxScalar)
      .scalarize(0);

  getActionDefinitionsBuilder(G_BRCOND).legalFor({s1});

  // pointer handling
  const std::initializer_list<LLT> PtrTypes32 = {s1, s8, s16, s32};
  const std::initializer_list<LLT> PtrTypes64 = {s1, s8, s16, s32, s64};

  getActionDefinitionsBuilder(G_PTRTOINT)
      .legalForCartesianProduct(Is64Bit ? PtrTypes64 : PtrTypes32, {p0})
      .maxScalar(0, sMaxScalar)
      .widenScalarToNextPow2(0, /*Min*/ 8);

  getActionDefinitionsBuilder(G_INTTOPTR).legalFor({{p0, sMaxScalar}});

  getActionDefinitionsBuilder(G_CONSTANT_POOL).legalFor({p0});

  getActionDefinitionsBuilder(G_PTR_ADD)
      .legalFor({{p0, s32}})
      .legalFor(Is64Bit, {{p0, s64}})
      .widenScalarToNextPow2(1, /*Min*/ 32)
      .clampScalar(1, s32, sMaxScalar);

  getActionDefinitionsBuilder({G_FRAME_INDEX, G_GLOBAL_VALUE}).legalFor({p0});

  // load/store: add more corner cases
  for (unsigned Op : {G_LOAD, G_STORE}) {
    auto &Action = getActionDefinitionsBuilder(Op);
    Action.legalForTypesWithMemDesc({{s8, p0, s8, 1},
                                     {s16, p0, s16, 1},
                                     {s32, p0, s32, 1},
                                     {s80, p0, s80, 1},
                                     {p0, p0, p0, 1},
                                     {v4s8, p0, v4s8, 1}});
    if (Is64Bit)
      Action.legalForTypesWithMemDesc(
          {{s64, p0, s64, 1}, {v2s32, p0, v2s32, 1}});

    if (HasSSE1)
      Action.legalForTypesWithMemDesc({{v4s32, p0, v4s32, 1}});
    if (HasSSE2)
      Action.legalForTypesWithMemDesc({{v16s8, p0, v16s8, 1},
                                       {v8s16, p0, v8s16, 1},
                                       {v2s64, p0, v2s64, 1},
                                       {v2p0, p0, v2p0, 1}});
    if (HasAVX)
      Action.legalForTypesWithMemDesc({{v32s8, p0, v32s8, 1},
                                       {v16s16, p0, v16s16, 1},
                                       {v8s32, p0, v8s32, 1},
                                       {v4s64, p0, v4s64, 1},
                                       {v4p0, p0, v4p0, 1}});
    if (HasAVX512)
      Action.legalForTypesWithMemDesc({{v64s8, p0, v64s8, 1},
                                       {v32s16, p0, v32s16, 1},
                                       {v16s32, p0, v16s32, 1},
                                       {v8s64, p0, v8s64, 1}});

    // X86 supports extending loads but not stores for GPRs
    if (Op == G_LOAD) {
      Action.legalForTypesWithMemDesc({{s8, p0, s1, 1},
                                       {s16, p0, s8, 1},
                                       {s32, p0, s8, 1},
                                       {s32, p0, s16, 1}});
      if (Is64Bit)
        Action.legalForTypesWithMemDesc(
            {{s64, p0, s8, 1}, {s64, p0, s16, 1}, {s64, p0, s32, 1}});
    } else {
      // Stores whose value type does not match the memory type are handled in
      // legalizeNarrowingStore().
      Action.customIf([=](const LegalityQuery &Query) {
        return Query.Types[0] != Query.MMODescrs[0].MemoryTy;
      });
    }
    Action.widenScalarToNextPow2(0, /*Min=*/8)
        .clampScalar(0, s8, sMaxScalar)
        .scalarize(0);
  }

  for (unsigned Op : {G_SEXTLOAD, G_ZEXTLOAD}) {
    auto &Action = getActionDefinitionsBuilder(Op);
    Action.legalForTypesWithMemDesc(
        {{s16, p0, s8, 1}, {s32, p0, s8, 1}, {s32, p0, s16, 1}});
    if (Is64Bit)
      Action.legalForTypesWithMemDesc(
          {{s64, p0, s8, 1}, {s64, p0, s16, 1}, {s64, p0, s32, 1}});
    // TODO - SSE41/AVX2/AVX512F/AVX512BW vector extensions
  }

  // sext, zext, and anyext
  getActionDefinitionsBuilder(G_ANYEXT)
      .legalFor({s8, s16, s32, s128})
      .legalFor(Is64Bit, {s64})
      .widenScalarToNextPow2(0, /*Min=*/8)
      .clampScalar(0, s8, sMaxScalar)
      .widenScalarToNextPow2(1, /*Min=*/8)
      .clampScalar(1, s8, sMaxScalar)
      .scalarize(0);

  getActionDefinitionsBuilder({G_SEXT, G_ZEXT})
      .legalFor({s8, s16, s32})
      .legalFor(Is64Bit, {s64})
      .widenScalarToNextPow2(0, /*Min=*/8)
      .clampScalar(0, s8, sMaxScalar)
      .widenScalarToNextPow2(1, /*Min=*/8)
      .clampScalar(1, s8, sMaxScalar)
      .scalarize(0);

  getActionDefinitionsBuilder(G_SEXT_INREG).lower();

  // fp constants
  getActionDefinitionsBuilder(G_FCONSTANT)
      .legalFor({s32, s64})
      .legalFor(UseX87, {s80});

  // fp arithmetic
  getActionDefinitionsBuilder({G_FADD, G_FSUB, G_FMUL, G_FDIV})
      .legalFor({s32, s64})
      .legalFor(HasSSE1, {v4s32})
      .legalFor(HasSSE2, {v2s64})
      .legalFor(HasAVX, {v8s32, v4s64})
      .legalFor(HasAVX512, {v16s32, v8s64})
      .legalFor(UseX87, {s80});
  getActionDefinitionsBuilder(G_FABS)
      .legalFor(UseX87, {s80})
      .legalFor(UseX87 && !Is64Bit, {s64})
      .lower();

  // fp comparison
  getActionDefinitionsBuilder(G_FCMP)
      .legalFor(HasSSE1 || UseX87, {s8, s32})
      .legalFor(HasSSE2 || UseX87, {s8, s64})
      .legalFor(UseX87, {s8, s80})
      .clampScalar(0, s8, s8)
      .clampScalar(1, s32, HasSSE2 ? s64 : s32)
      .widenScalarToNextPow2(1);

  // fp conversions
  getActionDefinitionsBuilder(G_FPEXT)
      .legalFor(HasSSE2, {{s64, s32}})
      .legalFor(HasAVX, {{v4s64, v4s32}})
      .legalFor(HasAVX512, {{v8s64, v8s32}});

  getActionDefinitionsBuilder(G_FPTRUNC)
      .legalFor(HasSSE2, {{s32, s64}})
      .legalFor(HasAVX, {{v4s32, v4s64}})
      .legalFor(HasAVX512, {{v8s32, v8s64}});

  getActionDefinitionsBuilder(G_SITOFP)
      .legalFor(HasSSE1, {{s32, s32}})
      .legalFor(HasSSE1 && Is64Bit, {{s32, s64}})
      .legalFor(HasSSE2, {{s64, s32}})
      .legalFor(HasSSE2 && Is64Bit, {{s64, s64}})
      .clampScalar(1, (UseX87 && !HasSSE1) ? s16 : s32, sMaxScalar)
      .widenScalarToNextPow2(1)
      .customForCartesianProduct(UseX87, {s32, s64, s80}, {s16, s32, s64})
      .clampScalar(0, s32, HasSSE2 ? s64 : s32)
      .widenScalarToNextPow2(0);

  getActionDefinitionsBuilder(G_FPTOSI)
      .legalFor(HasSSE1, {{s32, s32}})
      .legalFor(HasSSE1 && Is64Bit, {{s64, s32}})
      .legalFor(HasSSE2, {{s32, s64}})
      .legalFor(HasSSE2 && Is64Bit, {{s64, s64}})
      .clampScalar(0, (UseX87 && !HasSSE1) ? s16 : s32, sMaxScalar)
      .widenScalarToNextPow2(0)
      .customForCartesianProduct(UseX87, {s16, s32, s64}, {s32, s64, s80})
      .clampScalar(1, s32, HasSSE2 ? s64 : s32)
      .widenScalarToNextPow2(1);
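  // The remaining x87 combinations (e.g. s80, or s16 sources without SSE) are
  // custom-legalized through a stack slot and G_FILD/G_FIST; see
  // legalizeSITOFP() and legalizeFPTOSI() below.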
  // For G_UITOFP and G_FPTOUI without AVX512, we have to custom-legalize types
  // <= s32 manually. Otherwise, in the custom handler there is no way to tell
  // whether s32 is the original type (which we need to promote to s64) or s32
  // was produced by widening (and we shouldn't widen it to s64 again).
  //
  // For AVX512 we simply widen types as there is a direct mapping from opcodes
  // to asm instructions.
  getActionDefinitionsBuilder(G_UITOFP)
      .legalFor(HasAVX512, {{s32, s32}, {s32, s64}, {s64, s32}, {s64, s64}})
      .customIf([=](const LegalityQuery &Query) {
        return !HasAVX512 &&
               ((HasSSE1 && typeIs(0, s32)(Query)) ||
                (HasSSE2 && typeIs(0, s64)(Query))) &&
               scalarNarrowerThan(1, Is64Bit ? 64 : 32)(Query);
      })
      .lowerIf([=](const LegalityQuery &Query) {
        // Lower conversions from s64
        return !HasAVX512 &&
               ((HasSSE1 && typeIs(0, s32)(Query)) ||
                (HasSSE2 && typeIs(0, s64)(Query))) &&
               (Is64Bit && typeIs(1, s64)(Query));
      })
      .clampScalar(0, s32, HasSSE2 ? s64 : s32)
      .widenScalarToNextPow2(0)
      .clampScalar(1, s32, sMaxScalar)
      .widenScalarToNextPow2(1);

  getActionDefinitionsBuilder(G_FPTOUI)
      .legalFor(HasAVX512, {{s32, s32}, {s32, s64}, {s64, s32}, {s64, s64}})
      .customIf([=](const LegalityQuery &Query) {
        return !HasAVX512 &&
               ((HasSSE1 && typeIs(1, s32)(Query)) ||
                (HasSSE2 && typeIs(1, s64)(Query))) &&
               scalarNarrowerThan(0, Is64Bit ? 64 : 32)(Query);
      })
      // TODO: replace with customized legalization using
      // specifics of cvttsd2si. The selection of this node requires
      // a vector type. Either G_SCALAR_TO_VECTOR is needed or more advanced
      // support of G_BUILD_VECTOR/G_INSERT_VECTOR_ELT is required beforehand.
      .lowerIf([=](const LegalityQuery &Query) {
        return !HasAVX512 &&
               ((HasSSE1 && typeIs(1, s32)(Query)) ||
                (HasSSE2 && typeIs(1, s64)(Query))) &&
               (Is64Bit && typeIs(0, s64)(Query));
      })
      .clampScalar(0, s32, sMaxScalar)
      .widenScalarToNextPow2(0)
      .clampScalar(1, s32, HasSSE2 ? s64 : s32)
      .widenScalarToNextPow2(1);

  // vector ops
  getActionDefinitionsBuilder(G_BUILD_VECTOR)
      .customIf([=](const LegalityQuery &Query) {
        return (HasSSE1 && typeInSet(0, {v4s32})(Query)) ||
               (HasSSE2 && typeInSet(0, {v2s64, v8s16, v16s8})(Query)) ||
               (HasAVX && typeInSet(0, {v4s64, v8s32, v16s16, v32s8})(Query)) ||
               (HasAVX512 && typeInSet(0, {v8s64, v16s32, v32s16, v64s8})(Query));
      })
      .clampNumElements(0, v16s8, s8MaxVector)
      .clampNumElements(0, v8s16, s16MaxVector)
      .clampNumElements(0, v4s32, s32MaxVector)
      .clampNumElements(0, v2s64, s64MaxVector)
      .moreElementsToNextPow2(0);

  getActionDefinitionsBuilder({G_EXTRACT, G_INSERT})
      .legalIf([=](const LegalityQuery &Query) {
        unsigned SubIdx = Query.Opcode == G_EXTRACT ? 0 : 1;
        unsigned FullIdx = Query.Opcode == G_EXTRACT ? 1 : 0;
        return (HasAVX && typePairInSet(SubIdx, FullIdx,
                                        {{v16s8, v32s8},
                                         {v8s16, v16s16},
                                         {v4s32, v8s32},
                                         {v2s64, v4s64}})(Query)) ||
               (HasAVX512 && typePairInSet(SubIdx, FullIdx,
                                           {{v16s8, v64s8},
                                            {v32s8, v64s8},
                                            {v8s16, v32s16},
                                            {v16s16, v32s16},
                                            {v4s32, v16s32},
                                            {v8s32, v16s32},
                                            {v2s64, v8s64},
                                            {v4s64, v8s64}})(Query));
      });

  // todo: only permit dst types up to max legal vector register size?
  getActionDefinitionsBuilder(G_CONCAT_VECTORS)
      .legalFor(HasSSE1, {{v32s8, v16s8},
                          {v16s16, v8s16},
                          {v8s32, v4s32},
                          {v4s64, v2s64}})
      .legalFor(HasAVX, {{v64s8, v16s8},
                         {v64s8, v32s8},
                         {v32s16, v8s16},
                         {v32s16, v16s16},
                         {v16s32, v4s32},
                         {v16s32, v8s32},
                         {v8s64, v2s64},
                         {v8s64, v4s64}});

  // todo: vectors and address spaces
  getActionDefinitionsBuilder(G_SELECT)
      .legalFor({{s8, s32}, {s16, s32}, {s32, s32}, {s64, s32}, {p0, s32}})
      .widenScalarToNextPow2(0, /*Min=*/8)
      .clampScalar(0, HasCMOV ? s16 : s8, sMaxScalar)
      .clampScalar(1, s32, s32);

  // memory intrinsics
  getActionDefinitionsBuilder({G_MEMCPY, G_MEMMOVE, G_MEMSET}).libcall();

  getActionDefinitionsBuilder({G_DYN_STACKALLOC, G_STACKSAVE, G_STACKRESTORE})
      .lower();

  // fp intrinsics
  getActionDefinitionsBuilder({G_INTRINSIC_ROUNDEVEN, G_INTRINSIC_TRUNC})
      .scalarize(0)
      .minScalar(0, LLT::scalar(32))
      .libcall();

  getActionDefinitionsBuilder({G_FREEZE, G_CONSTANT_FOLD_BARRIER})
      .legalFor({s8, s16, s32, s64, p0})
      .widenScalarToNextPow2(0, /*Min=*/8)
      .clampScalar(0, s8, sMaxScalar);

  getLegacyLegalizerInfo().computeTables();
  verify(*STI.getInstrInfo());
}
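// legalizeCustom() dispatches the custom cases registered in the constructor
// above to the per-opcode helpers that follow.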
bool X86LegalizerInfo::legalizeCustom(LegalizerHelper &Helper, MachineInstr &MI,
                                      LostDebugLocObserver &LocObserver) const {
  MachineIRBuilder &MIRBuilder = Helper.MIRBuilder;
  MachineRegisterInfo &MRI = *MIRBuilder.getMRI();
  switch (MI.getOpcode()) {
  default:
    // No idea what to do.
    return false;
  case TargetOpcode::G_BUILD_VECTOR:
    return legalizeBuildVector(MI, MRI, Helper);
  case TargetOpcode::G_FPTOUI:
    return legalizeFPTOUI(MI, MRI, Helper);
  case TargetOpcode::G_UITOFP:
    return legalizeUITOFP(MI, MRI, Helper);
  case TargetOpcode::G_STORE:
    return legalizeNarrowingStore(MI, MRI, Helper);
  case TargetOpcode::G_SITOFP:
    return legalizeSITOFP(MI, MRI, Helper);
  case TargetOpcode::G_FPTOSI:
    return legalizeFPTOSI(MI, MRI, Helper);
  case TargetOpcode::G_GET_ROUNDING:
    return legalizeGETROUNDING(MI, MRI, Helper);
  case TargetOpcode::G_SET_ROUNDING:
    return legalizeSETROUNDING(MI, MRI, Helper);
  }
  llvm_unreachable("expected switch to return");
}

bool X86LegalizerInfo::legalizeSITOFP(MachineInstr &MI,
                                      MachineRegisterInfo &MRI,
                                      LegalizerHelper &Helper) const {
  MachineIRBuilder &MIRBuilder = Helper.MIRBuilder;
  MachineFunction &MF = *MI.getMF();
  auto [Dst, DstTy, Src, SrcTy] = MI.getFirst2RegLLTs();

  assert((SrcTy.getSizeInBits() == 16 || SrcTy.getSizeInBits() == 32 ||
          SrcTy.getSizeInBits() == 64) &&
         "Unexpected source type for SITOFP in X87 mode.");

  TypeSize MemSize = SrcTy.getSizeInBytes();
  MachinePointerInfo PtrInfo;
  Align Alignment = Helper.getStackTemporaryAlignment(SrcTy);
  auto SlotPointer = Helper.createStackTemporary(MemSize, Alignment, PtrInfo);
  MachineMemOperand *StoreMMO = MF.getMachineMemOperand(
      PtrInfo, MachineMemOperand::MOStore, MemSize, Align(MemSize));

  // Spill the integer value to the stack slot; G_FILD then loads it from
  // there and converts it on the FPU stack.
  MIRBuilder.buildStore(Src, SlotPointer, *StoreMMO);

  MachineMemOperand *LoadMMO = MF.getMachineMemOperand(
      PtrInfo, MachineMemOperand::MOLoad, MemSize, Align(MemSize));
  MIRBuilder.buildInstr(X86::G_FILD)
      .addDef(Dst)
      .addUse(SlotPointer.getReg(0))
      .addMemOperand(LoadMMO);

  MI.eraseFromParent();
  return true;
}

bool X86LegalizerInfo::legalizeFPTOSI(MachineInstr &MI,
                                      MachineRegisterInfo &MRI,
                                      LegalizerHelper &Helper) const {
  MachineFunction &MF = *MI.getMF();
  MachineIRBuilder &MIRBuilder = Helper.MIRBuilder;
  auto [Dst, DstTy, Src, SrcTy] = MI.getFirst2RegLLTs();

  TypeSize MemSize = DstTy.getSizeInBytes();
  MachinePointerInfo PtrInfo;
  Align Alignment = Helper.getStackTemporaryAlignment(DstTy);
  auto SlotPointer = Helper.createStackTemporary(MemSize, Alignment, PtrInfo);
  MachineMemOperand *StoreMMO = MF.getMachineMemOperand(
      PtrInfo, MachineMemOperand::MOStore, MemSize, Align(MemSize));

  // G_FIST converts the x87 value and stores the integer result to the stack
  // slot, from which the final result is reloaded.
  MIRBuilder.buildInstr(X86::G_FIST)
      .addUse(Src)
      .addUse(SlotPointer.getReg(0))
      .addMemOperand(StoreMMO);

  MIRBuilder.buildLoad(Dst, SlotPointer, PtrInfo, Align(MemSize));
  MI.eraseFromParent();
  return true;
}
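// Turn a G_BUILD_VECTOR whose sources are all constants (or implicit-def) into
// a load from a constant-pool entry holding the materialized vector. Build
// vectors with any non-constant source are reported as not legalized here.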
bool X86LegalizerInfo::legalizeBuildVector(MachineInstr &MI,
                                           MachineRegisterInfo &MRI,
                                           LegalizerHelper &Helper) const {
  MachineIRBuilder &MIRBuilder = Helper.MIRBuilder;
  const auto &BuildVector = cast<GBuildVector>(MI);
  Register Dst = BuildVector.getReg(0);
  LLT DstTy = MRI.getType(Dst);
  MachineFunction &MF = MIRBuilder.getMF();
  LLVMContext &Ctx = MF.getFunction().getContext();
  uint64_t DstTySize = DstTy.getScalarSizeInBits();

  SmallVector<Constant *> CstIdxs;
  for (unsigned i = 0; i < BuildVector.getNumSources(); ++i) {
    Register Source = BuildVector.getSourceReg(i);

    auto ValueAndReg = getIConstantVRegValWithLookThrough(Source, MRI);
    if (ValueAndReg) {
      CstIdxs.emplace_back(ConstantInt::get(Ctx, ValueAndReg->Value));
      continue;
    }

    auto FPValueAndReg = getFConstantVRegValWithLookThrough(Source, MRI);
    if (FPValueAndReg) {
      CstIdxs.emplace_back(ConstantFP::get(Ctx, FPValueAndReg->Value));
      continue;
    }

    if (getOpcodeDef<GImplicitDef>(Source, MRI)) {
      CstIdxs.emplace_back(UndefValue::get(Type::getIntNTy(Ctx, DstTySize)));
      continue;
    }
    return false;
  }

  Constant *ConstVal = ConstantVector::get(CstIdxs);

  const DataLayout &DL = MIRBuilder.getDataLayout();
  unsigned AddrSpace = DL.getDefaultGlobalsAddressSpace();
  Align Alignment(DL.getABITypeAlign(ConstVal->getType()));
  auto Addr = MIRBuilder.buildConstantPool(
      LLT::pointer(AddrSpace, DL.getPointerSizeInBits(AddrSpace)),
      MF.getConstantPool()->getConstantPoolIndex(ConstVal, Alignment));
  MachineMemOperand *MMO =
      MF.getMachineMemOperand(MachinePointerInfo::getConstantPool(MF),
                              MachineMemOperand::MOLoad, DstTy, Alignment);

  MIRBuilder.buildLoad(Dst, Addr, *MMO);
  MI.eraseFromParent();
  return true;
}

bool X86LegalizerInfo::legalizeFPTOUI(MachineInstr &MI,
                                      MachineRegisterInfo &MRI,
                                      LegalizerHelper &Helper) const {
  MachineIRBuilder &MIRBuilder = Helper.MIRBuilder;
  auto [Dst, DstTy, Src, SrcTy] = MI.getFirst2RegLLTs();
  unsigned DstSizeInBits = DstTy.getScalarSizeInBits();
  const LLT s32 = LLT::scalar(32);
  const LLT s64 = LLT::scalar(64);

  // Simply reuse FPTOSI when it is possible to widen the type: the unsigned
  // range of the narrow destination fits in the signed range of the wider
  // type, so converting to the wider signed type and truncating yields the
  // same bits.
  if (DstSizeInBits <= 32) {
    auto Casted = MIRBuilder.buildFPTOSI(DstTy == s32 ? s64 : s32, Src);
    MIRBuilder.buildTrunc(Dst, Casted);
    MI.eraseFromParent();
    return true;
  }

  return false;
}

bool X86LegalizerInfo::legalizeUITOFP(MachineInstr &MI,
                                      MachineRegisterInfo &MRI,
                                      LegalizerHelper &Helper) const {
  MachineIRBuilder &MIRBuilder = Helper.MIRBuilder;
  auto [Dst, DstTy, Src, SrcTy] = MI.getFirst2RegLLTs();
  const LLT s32 = LLT::scalar(32);
  const LLT s64 = LLT::scalar(64);

  // Simply reuse SITOFP when it is possible to widen the type: after
  // zero-extension the value is non-negative, so the signed conversion
  // produces the same result.
  if (SrcTy.getSizeInBits() <= 32) {
    auto Ext = MIRBuilder.buildZExt(SrcTy == s32 ? s64 : s32, Src);
    MIRBuilder.buildSITOFP(Dst, Ext);
    MI.eraseFromParent();
    return true;
  }

  return false;
}

bool X86LegalizerInfo::legalizeNarrowingStore(MachineInstr &MI,
                                              MachineRegisterInfo &MRI,
                                              LegalizerHelper &Helper) const {
  auto &Store = cast<GStore>(MI);
  MachineIRBuilder &MIRBuilder = Helper.MIRBuilder;
  MachineMemOperand &MMO = **Store.memoperands_begin();
  MachineFunction &MF = MIRBuilder.getMF();
  LLT ValTy = MRI.getType(Store.getValueReg());
  auto *NewMMO = MF.getMachineMemOperand(&MMO, MMO.getPointerInfo(), ValTy);

  Helper.Observer.changingInstr(Store);
  Store.setMemRefs(MF, {NewMMO});
  Helper.Observer.changedInstr(Store);
  return true;
}

bool X86LegalizerInfo::legalizeGETROUNDING(MachineInstr &MI,
                                           MachineRegisterInfo &MRI,
                                           LegalizerHelper &Helper) const {
  /*
    The rounding mode is in bits 11:10 of FPSR, and has the following
    settings:
      00 Round to nearest
      01 Round to -inf
      10 Round to +inf
      11 Round to 0

    GET_ROUNDING, on the other hand, expects the following:
      -1 Undefined
       0 Round to 0
       1 Round to nearest
       2 Round to +inf
       3 Round to -inf

    To perform the conversion, we use a packed lookup table of the four 2-bit
    values that we can index by FPSR[11:10]
      0x2d --> (0b00,10,11,01) --> (0,2,3,1) >> FPSR[11:10]

      (0x2d >> ((FPSR >> 9) & 6)) & 3
  */
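  // Worked through for each mode:
  //   FPSR[11:10] = 00 (nearest) -> (0x2d >> 0) & 3 = 1
  //   FPSR[11:10] = 01 (-inf)    -> (0x2d >> 2) & 3 = 3
  //   FPSR[11:10] = 10 (+inf)    -> (0x2d >> 4) & 3 = 2
  //   FPSR[11:10] = 11 (to zero) -> (0x2d >> 6) & 3 = 0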
  MachineIRBuilder &MIRBuilder = Helper.MIRBuilder;
  MachineFunction &MF = MIRBuilder.getMF();
  Register Dst = MI.getOperand(0).getReg();
  LLT DstTy = MRI.getType(Dst);
  const LLT s8 = LLT::scalar(8);
  const LLT s16 = LLT::scalar(16);
  const LLT s32 = LLT::scalar(32);

  // Save FP Control Word to stack slot
  int MemSize = 2;
  Align Alignment = Align(2);
  MachinePointerInfo PtrInfo;
  auto StackTemp = Helper.createStackTemporary(TypeSize::getFixed(MemSize),
                                               Alignment, PtrInfo);
  Register StackPtr = StackTemp.getReg(0);

  auto StoreMMO = MF.getMachineMemOperand(PtrInfo, MachineMemOperand::MOStore,
                                          MemSize, Alignment);

  // Store FP Control Word to stack slot using G_FNSTCW16
  MIRBuilder.buildInstr(X86::G_FNSTCW16)
      .addUse(StackPtr)
      .addMemOperand(StoreMMO);

  // Load FP Control Word from stack slot
  auto LoadMMO = MF.getMachineMemOperand(PtrInfo, MachineMemOperand::MOLoad,
                                         MemSize, Alignment);

  auto CWD32 =
      MIRBuilder.buildZExt(s32, MIRBuilder.buildLoad(s16, StackPtr, *LoadMMO));
  auto Shifted8 = MIRBuilder.buildTrunc(
      s8, MIRBuilder.buildLShr(s32, CWD32, MIRBuilder.buildConstant(s8, 9)));
  auto Masked32 = MIRBuilder.buildZExt(
      s32, MIRBuilder.buildAnd(s8, Shifted8, MIRBuilder.buildConstant(s8, 6)));

  // LUT is a packed lookup table (0x2d) used to map the 2-bit x87 FPU rounding
  // mode (from bits 11:10 of the control word) to the values expected by
  // GET_ROUNDING. The mapping is performed by shifting LUT right by the
  // extracted rounding mode and masking the result with 3 to obtain the final
  // value.
  auto LUT = MIRBuilder.buildConstant(s32, 0x2d);
  auto LUTShifted = MIRBuilder.buildLShr(s32, LUT, Masked32);
  auto RetVal =
      MIRBuilder.buildAnd(s32, LUTShifted, MIRBuilder.buildConstant(s32, 3));
  auto RetValTrunc = MIRBuilder.buildZExtOrTrunc(DstTy, RetVal);

  MIRBuilder.buildCopy(Dst, RetValTrunc);
  MI.eraseFromParent();
  return true;
}

bool X86LegalizerInfo::legalizeSETROUNDING(MachineInstr &MI,
                                           MachineRegisterInfo &MRI,
                                           LegalizerHelper &Helper) const {
  MachineIRBuilder &MIRBuilder = Helper.MIRBuilder;
  MachineFunction &MF = MIRBuilder.getMF();
  Register Src = MI.getOperand(0).getReg();
  const LLT s8 = LLT::scalar(8);
  const LLT s16 = LLT::scalar(16);
  const LLT s32 = LLT::scalar(32);

  // Allocate stack slot for control word and MXCSR (4 bytes).
  int MemSize = 4;
  Align Alignment = Align(4);
  MachinePointerInfo PtrInfo;
  auto StackTemp = Helper.createStackTemporary(TypeSize::getFixed(MemSize),
                                               Alignment, PtrInfo);
  Register StackPtr = StackTemp.getReg(0);

  auto StoreMMO =
      MF.getMachineMemOperand(PtrInfo, MachineMemOperand::MOStore, 2, Align(2));
  MIRBuilder.buildInstr(X86::G_FNSTCW16)
      .addUse(StackPtr)
      .addMemOperand(StoreMMO);

  auto LoadMMO =
      MF.getMachineMemOperand(PtrInfo, MachineMemOperand::MOLoad, 2, Align(2));
  auto CWD16 = MIRBuilder.buildLoad(s16, StackPtr, *LoadMMO);

  // Clear RM field (bits 11:10)
  auto ClearedCWD =
      MIRBuilder.buildAnd(s16, CWD16, MIRBuilder.buildConstant(s16, 0xf3ff));

  // Check if Src is a constant
  auto *SrcDef = MRI.getVRegDef(Src);
  Register RMBits;
  Register MXCSRRMBits;
  if (SrcDef && SrcDef->getOpcode() == TargetOpcode::G_CONSTANT) {
    uint64_t RM = getIConstantFromReg(Src, MRI).getZExtValue();
    int FieldVal = X86::getRoundingModeX86(RM);
    if (FieldVal == X86::rmInvalid) {
      FieldVal = X86::rmToNearest;

      LLVMContext &C = MF.getFunction().getContext();
      C.diagnose(DiagnosticInfoUnsupported(
          MF.getFunction(), "rounding mode is not supported by X86 hardware",
          DiagnosticLocation(MI.getDebugLoc()), DS_Error));
      return false;
    }
    FieldVal = FieldVal << 3;
    RMBits = MIRBuilder.buildConstant(s16, FieldVal).getReg(0);
    MXCSRRMBits = MIRBuilder.buildConstant(s32, FieldVal).getReg(0);
  } else {
    // Convert Src (rounding mode) to bits for control word
    //   (0xc9 << (2 * Src + 4)) & 0xc00
    // which maps the llvm.set.rounding encoding onto the RC field:
    //   Src = 0 (to zero)    -> RC = 0b11 (0xc00)
    //   Src = 1 (to nearest) -> RC = 0b00 (0x000)
    //   Src = 2 (upward)     -> RC = 0b10 (0x800)
    //   Src = 3 (downward)   -> RC = 0b01 (0x400)
    auto Src32 = MIRBuilder.buildZExtOrTrunc(s32, Src);
    auto ShiftAmt = MIRBuilder.buildAdd(
        s32, MIRBuilder.buildShl(s32, Src32, MIRBuilder.buildConstant(s32, 1)),
        MIRBuilder.buildConstant(s32, 4));
    auto ShiftAmt8 = MIRBuilder.buildTrunc(s8, ShiftAmt);
    auto Shifted = MIRBuilder.buildShl(s16, MIRBuilder.buildConstant(s16, 0xc9),
                                       ShiftAmt8);
    RMBits =
        MIRBuilder.buildAnd(s16, Shifted, MIRBuilder.buildConstant(s16, 0xc00))
            .getReg(0);

    // For non-constant case, we still need to compute MXCSR bits dynamically
    auto RMBits32 = MIRBuilder.buildZExt(s32, RMBits);
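    // The MXCSR rounding-control field lives in bits 14:13, i.e. the x87
    // control-word RC bits (11:10) shifted left by 3.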
    MXCSRRMBits =
        MIRBuilder.buildShl(s32, RMBits32, MIRBuilder.buildConstant(s32, 3))
            .getReg(0);
  }

  // Update rounding mode bits
  auto NewCWD =
      MIRBuilder.buildOr(s16, ClearedCWD, RMBits, MachineInstr::Disjoint);

  // Store new FP Control Word to stack
  auto StoreNewMMO =
      MF.getMachineMemOperand(PtrInfo, MachineMemOperand::MOStore, 2, Align(2));
  MIRBuilder.buildStore(NewCWD, StackPtr, *StoreNewMMO);

  // Load FP control word from the slot using G_FLDCW16
  auto LoadNewMMO =
      MF.getMachineMemOperand(PtrInfo, MachineMemOperand::MOLoad, 2, Align(2));
  MIRBuilder.buildInstr(X86::G_FLDCW16)
      .addUse(StackPtr)
      .addMemOperand(LoadNewMMO);

  if (Subtarget.hasSSE1()) {
    // Store MXCSR to stack (use STMXCSR)
    auto StoreMXCSRMMO = MF.getMachineMemOperand(
        PtrInfo, MachineMemOperand::MOStore, 4, Align(4));
    MIRBuilder.buildInstr(TargetOpcode::G_INTRINSIC_W_SIDE_EFFECTS)
        .addIntrinsicID(Intrinsic::x86_sse_stmxcsr)
        .addUse(StackPtr)
        .addMemOperand(StoreMXCSRMMO);

    // Load MXCSR from stack
    auto LoadMXCSRMMO = MF.getMachineMemOperand(
        PtrInfo, MachineMemOperand::MOLoad, 4, Align(4));
    auto MXCSR = MIRBuilder.buildLoad(s32, StackPtr, *LoadMXCSRMMO);

    // Clear RM field (bits 14:13)
    auto ClearedMXCSR = MIRBuilder.buildAnd(
        s32, MXCSR, MIRBuilder.buildConstant(s32, 0xffff9fff));

    // Update rounding mode bits
    auto NewMXCSR = MIRBuilder.buildOr(s32, ClearedMXCSR, MXCSRRMBits);

    // Store new MXCSR to stack
    auto StoreNewMXCSRMMO = MF.getMachineMemOperand(
        PtrInfo, MachineMemOperand::MOStore, 4, Align(4));
    MIRBuilder.buildStore(NewMXCSR, StackPtr, *StoreNewMXCSRMMO);

    // Load MXCSR from stack (use LDMXCSR)
    auto LoadNewMXCSRMMO = MF.getMachineMemOperand(
        PtrInfo, MachineMemOperand::MOLoad, 4, Align(4));
    MIRBuilder.buildInstr(TargetOpcode::G_INTRINSIC_W_SIDE_EFFECTS)
        .addIntrinsicID(Intrinsic::x86_sse_ldmxcsr)
        .addUse(StackPtr)
        .addMemOperand(LoadNewMXCSRMMO);
  }

  MI.eraseFromParent();
  return true;
}

bool X86LegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper,
                                         MachineInstr &MI) const {
  return true;
}