Diffstat (limited to 'llvm/lib/Target/X86/X86ISelLowering.cpp')
-rw-r--r--  llvm/lib/Target/X86/X86ISelLowering.cpp  207
1 file changed, 121 insertions, 86 deletions
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index b97b508..168e041 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -2572,8 +2572,8 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
}
// Combine sin / cos into _sincos_stret if it is available.
- setOperationAction(ISD::FSINCOS, MVT::f64, Custom);
- setOperationAction(ISD::FSINCOS, MVT::f32, Custom);
+ setOperationAction(ISD::FSINCOS, MVT::f64, Expand);
+ setOperationAction(ISD::FSINCOS, MVT::f32, Expand);
if (Subtarget.isTargetWin64()) {
setOperationAction(ISD::SDIV, MVT::i128, Custom);
@@ -30908,6 +30908,63 @@ static SDValue LowerShift(SDValue Op, const X86Subtarget &Subtarget,
return DAG.getNode(X86ISD::PACKUS, dl, VT, LoR, HiR);
}
+ if (VT == MVT::v64i8 && Subtarget.canExtendTo512BW()) {
+ // On AVX512BW, we can use variable 16-bit shifts to implement variable
+ // 8-bit shifts. For this, we split the input into two vectors, RLo and RHi.
+ // The i-th lane of RLo contains the (2*i)-th lane of R, and the i-th lane
+ // of RHi contains the (2*i+1)-th lane of R. After shifting, these vectors
+ // can efficiently be merged together using a masked move.
+ MVT ExtVT = MVT::v32i16;
+
+ SDValue RLo, RHi;
+ // Isolate the low and high byte lanes of Amt: mask away the odd byte lanes
+ // to form AmtLo, and shift Amt right by 8 bits to form AmtHi.
+ SDValue AmtLo = DAG.getNode(ISD::AND, dl, ExtVT, DAG.getBitcast(ExtVT, Amt),
+ DAG.getConstant(0x00ff, dl, ExtVT));
+ SDValue AmtHi = getTargetVShiftByConstNode(
+ X86ISD::VSRLI, dl, ExtVT, DAG.getBitcast(ExtVT, Amt), 8, DAG);
+ switch (Opc) {
+ case ISD::SHL:
+ // Because we shift left, no bits from the high half can influence the low
+ // half, so we don't need to mask RLo. We do however need to mask RHi, to
+ // prevent high bits of an even lane overflowing into low bits of an odd
+ // lane.
+ RLo = DAG.getBitcast(ExtVT, R);
+ RHi = DAG.getNode(ISD::AND, dl, ExtVT, RLo,
+ DAG.getConstant(0xff00, dl, ExtVT));
+ break;
+ case ISD::SRL:
+ // Same idea as above, but this time we need to make sure no low bits of
+ // an odd lane can overflow into high bits of an even lane.
+ RHi = DAG.getBitcast(ExtVT, R);
+ RLo = DAG.getNode(ISD::AND, dl, ExtVT, RHi,
+ DAG.getConstant(0x00ff, dl, ExtVT));
+ break;
+ case ISD::SRA:
+ // For arithmetic right shifts, we want to sign extend each even lane of R
+ // such that the upper half of the corresponding lane of RLo is 0 or -1
+ // depending on the sign bit of the original lane. We do this using 2
+ // immediate shifts.
+ RHi = DAG.getBitcast(ExtVT, R);
+ RLo = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, ExtVT, RHi, 8, DAG);
+ RLo = getTargetVShiftByConstNode(X86ISD::VSRAI, dl, ExtVT, RLo, 8, DAG);
+ break;
+ default:
+ llvm_unreachable("Unexpected Shift Op");
+ }
+
+ SDValue ShiftedLo =
+ DAG.getBitcast(VT, DAG.getNode(Opc, dl, ExtVT, RLo, AmtLo));
+ SDValue ShiftedHi =
+ DAG.getBitcast(VT, DAG.getNode(Opc, dl, ExtVT, RHi, AmtHi));
+
+ // To merge the shifted vectors back together, we select even lanes
+ // from ShiftedLo and odd lanes from ShiftedHi.
+ SDValue SelectMask = DAG.getBitcast(
+ MVT::v64i1, DAG.getConstant(0x5555555555555555, dl, MVT::i64));
+ return DAG.getSelect(dl, VT, SelectMask, ShiftedLo, ShiftedHi);
+ }
+
if (VT == MVT::v16i8 ||
(VT == MVT::v32i8 && Subtarget.hasInt256() && !Subtarget.hasXOP()) ||
(VT == MVT::v64i8 && Subtarget.hasBWI())) {
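As an illustration of the lane-splitting trick in the hunk above, here is a minimal scalar sketch of the SHL case in plain C++: one uint16_t models a single 16-bit lane holding two 8-bit lanes, and the final masked select is modelled by taking the even byte from ShiftedLo and the odd byte from ShiftedHi. Function and variable names are invented for the sketch, and per-byte shift amounts are assumed to be less than 8.

```cpp
#include <cstdint>
#include <cstdio>

// Variable left shift of two packed 8-bit lanes via one 16-bit lane each.
// Mirrors the SHL case above: RLo needs no masking, RHi masks off the even
// byte so its bits cannot spill into the odd byte's result.
static uint16_t shlTwoBytes(uint16_t R, uint16_t Amt) {
  uint16_t AmtLo = Amt & 0x00ff;              // amount for the even byte
  uint16_t AmtHi = Amt >> 8;                  // amount for the odd byte
  uint16_t RLo = R;                           // even byte, unmasked
  uint16_t RHi = R & 0xff00;                  // odd byte, even byte masked off
  uint16_t ShiftedLo = (uint16_t)(RLo << AmtLo);
  uint16_t ShiftedHi = (uint16_t)(RHi << AmtHi);
  // The masked move: even byte from ShiftedLo, odd byte from ShiftedHi.
  return (uint16_t)((ShiftedLo & 0x00ff) | (ShiftedHi & 0xff00));
}

int main() {
  // Even byte 0x41 << 1 = 0x82, odd byte 0x83 << 4 = 0x30 (truncated to 8 bits).
  std::printf("%04x\n", shlTwoBytes(0x8341, 0x0401)); // prints 3082
  return 0;
}
```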
@@ -33004,61 +33061,6 @@ static SDValue LowerADDSUBO_CARRY(SDValue Op, SelectionDAG &DAG) {
return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(), Sum, SetCC);
}
-static SDValue LowerFSINCOS(SDValue Op, const X86Subtarget &Subtarget,
- SelectionDAG &DAG) {
- const TargetLowering &TLI = DAG.getTargetLoweringInfo();
- SDValue Arg = Op.getOperand(0);
- EVT ArgVT = Arg.getValueType();
- bool isF64 = ArgVT == MVT::f64;
-
- RTLIB::Libcall LC = isF64 ? RTLIB::SINCOS_STRET_F64 : RTLIB::SINCOS_STRET_F32;
- const char *LibcallName = TLI.getLibcallName(LC);
- if (!LibcallName)
- return SDValue();
-
- assert(Subtarget.isTargetDarwin() && Subtarget.is64Bit());
-
- // For MacOSX, we want to call an alternative entry point: __sincos_stret,
- // which returns the values as { float, float } (in XMM0) or
- // { double, double } (which is returned in XMM0, XMM1).
- SDLoc dl(Op);
- Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
-
- TargetLowering::ArgListTy Args;
- Args.emplace_back(Arg, ArgTy);
-
- // Only optimize x86_64 for now. i386 is a bit messy. For f32,
- // the small struct {f32, f32} is returned in (eax, edx). For f64,
- // the results are returned via SRet in memory.
- SDValue Callee =
- DAG.getExternalSymbol(LibcallName, TLI.getPointerTy(DAG.getDataLayout()));
-
- Type *RetTy = isF64 ? (Type *)StructType::get(ArgTy, ArgTy)
- : (Type *)FixedVectorType::get(ArgTy, 2);
-
- TargetLowering::CallLoweringInfo CLI(DAG);
- CLI.setDebugLoc(dl)
- .setChain(DAG.getEntryNode())
- .setLibCallee(CallingConv::C, RetTy, Callee, std::move(Args))
- .setIsPostTypeLegalization();
-
- std::pair<SDValue, SDValue> CallResult = TLI.LowerCallTo(CLI);
-
- if (isF64)
- // Returned in xmm0 and xmm1.
- return CallResult.first;
-
- // Returned in bits 0:31 and 32:64 xmm0.
- SDValue SinVal =
- DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ArgVT, CallResult.first,
- DAG.getVectorIdxConstant(0, dl));
- SDValue CosVal =
- DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ArgVT, CallResult.first,
- DAG.getVectorIdxConstant(1, dl));
- SDVTList Tys = DAG.getVTList(ArgVT, ArgVT);
- return DAG.getNode(ISD::MERGE_VALUES, dl, Tys, SinVal, CosVal);
-}
-
/// Widen a vector input to a vector of NVT. The
/// input vector must have the same element type as NVT.
static SDValue ExtendToType(SDValue InOp, MVT NVT, SelectionDAG &DAG,
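For context on what the deleted LowerFSINCOS built, here is an illustrative C++-level declaration of the __sincos_stret contract, using only what the removed comments state (Darwin x86-64); the struct and field names are made up for the sketch. With the operation action switched to Expand above, FSINCOS is presumably left to the generic libcall expansion instead of this hand-rolled call lowering.

```cpp
// Illustrative view of the call the removed lowering emitted. Per the deleted
// comments, the f64 variant returns { double, double } in XMM0/XMM1, and the
// f32 variant returns both floats packed in XMM0 (modelled in the deleted
// code as a <2 x float> and split with two EXTRACT_VECTOR_ELTs).
struct SinCosF64 {
  double Sin; // XMM0
  double Cos; // XMM1
};

extern "C" SinCosF64 __sincos_stret(double X); // f64 entry point on Darwin
```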
@@ -33663,7 +33665,6 @@ SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
case ISD::ABDS:
case ISD::ABDU: return LowerABD(Op, Subtarget, DAG);
case ISD::AVGCEILU: return LowerAVG(Op, Subtarget, DAG);
- case ISD::FSINCOS: return LowerFSINCOS(Op, Subtarget, DAG);
case ISD::MLOAD: return LowerMLOAD(Op, Subtarget, DAG);
case ISD::MSTORE: return LowerMSTORE(Op, Subtarget, DAG);
case ISD::MGATHER: return LowerMGATHER(Op, Subtarget, DAG);
@@ -53349,40 +53350,44 @@ static SDValue combineMaskedStore(SDNode *N, SelectionDAG &DAG,
}
// Look for a RMW operation that only touches one bit of a larger than legal
-// type and fold it to a BTC/BTR/BTS pattern acting on a single i32 sub value.
+// type and fold it to a BTC/BTR/BTS or bit insertion pattern acting on a single
+// i32 sub value.
static SDValue narrowBitOpRMW(StoreSDNode *St, const SDLoc &DL,
SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
using namespace SDPatternMatch;
-
- // Only handle normal stores and its chain was a matching normal load.
- auto *Ld = dyn_cast<LoadSDNode>(St->getChain());
- if (!ISD::isNormalStore(St) || !St->isSimple() || !Ld ||
- !ISD::isNormalLoad(Ld) || !Ld->isSimple() ||
- Ld->getBasePtr() != St->getBasePtr() ||
- Ld->getOffset() != St->getOffset())
- return SDValue();
-
- SDValue LoadVal(Ld, 0);
SDValue StoredVal = St->getValue();
EVT VT = StoredVal.getValueType();
- // Only narrow larger than legal scalar integers.
- if (!VT.isScalarInteger() ||
+ // Only narrow normal stores of larger than legal scalar integers.
+ if (!ISD::isNormalStore(St) || !St->isSimple() || !VT.isScalarInteger() ||
VT.getSizeInBits() <= (Subtarget.is64Bit() ? 64 : 32))
return SDValue();
// BTR: X & ~(1 << ShAmt)
// BTS: X | (1 << ShAmt)
// BTC: X ^ (1 << ShAmt)
- SDValue ShAmt;
- if (!StoredVal.hasOneUse() ||
- !(sd_match(StoredVal, m_And(m_Specific(LoadVal),
+ //
+ // BitInsert: (X & ~(1 << ShAmt)) | (InsertBit << ShAmt)
+ SDValue SrcVal, InsertBit, ShAmt;
+ if (!(sd_match(StoredVal, m_And(m_Value(SrcVal),
m_Not(m_Shl(m_One(), m_Value(ShAmt))))) ||
sd_match(StoredVal,
- m_Or(m_Specific(LoadVal), m_Shl(m_One(), m_Value(ShAmt)))) ||
+ m_Or(m_Value(SrcVal), m_Shl(m_One(), m_Value(ShAmt)))) ||
sd_match(StoredVal,
- m_Xor(m_Specific(LoadVal), m_Shl(m_One(), m_Value(ShAmt))))))
+ m_Xor(m_Value(SrcVal), m_Shl(m_One(), m_Value(ShAmt)))) ||
+ sd_match(
+ StoredVal,
+ m_Or(m_And(m_Value(SrcVal), m_Not(m_Shl(m_One(), m_Value(ShAmt)))),
+ m_Shl(m_Value(InsertBit), m_Deferred(ShAmt))))))
+ return SDValue();
+
+ // SrcVal must be a matching normal load further up the chain.
+ auto *Ld = dyn_cast<LoadSDNode>(SrcVal);
+ if (!Ld || !ISD::isNormalLoad(Ld) || !Ld->isSimple() ||
+ Ld->getBasePtr() != St->getBasePtr() ||
+ Ld->getOffset() != St->getOffset() ||
+ !St->getChain().reachesChainWithoutSideEffects(SDValue(Ld, 1)))
return SDValue();
// Ensure the shift amount is in bounds.
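A minimal sketch of the four store-value forms the rewritten matcher accepts, written in plain C++ with unsigned __int128 standing in for the larger-than-legal integer (helper names are illustrative). In each form only bit ShAmt of SrcVal can change, so only one i32 block of the stored value can differ from the loaded value.

```cpp
using u128 = unsigned __int128; // stand-in for a larger-than-legal type

// BTR: X & ~(1 << ShAmt) -- clear one bit.
static u128 btr(u128 X, unsigned ShAmt) { return X & ~((u128)1 << ShAmt); }
// BTS: X | (1 << ShAmt) -- set one bit.
static u128 bts(u128 X, unsigned ShAmt) { return X | ((u128)1 << ShAmt); }
// BTC: X ^ (1 << ShAmt) -- toggle one bit.
static u128 btc(u128 X, unsigned ShAmt) { return X ^ ((u128)1 << ShAmt); }
// BitInsert: (X & ~(1 << ShAmt)) | (InsertBit << ShAmt), InsertBit in {0, 1}.
static u128 bitInsert(u128 X, u128 InsertBit, unsigned ShAmt) {
  return (X & ~((u128)1 << ShAmt)) | (InsertBit << ShAmt);
}
```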
@@ -53390,6 +53395,13 @@ static SDValue narrowBitOpRMW(StoreSDNode *St, const SDLoc &DL,
if (KnownAmt.getMaxValue().uge(VT.getSizeInBits()))
return SDValue();
+ // If we're inserting a bit then it must be the LSB.
+ if (InsertBit) {
+ KnownBits KnownInsert = DAG.computeKnownBits(InsertBit);
+ if (KnownInsert.countMinLeadingZeros() < (VT.getSizeInBits() - 1))
+ return SDValue();
+ }
+
// Split the shift into an alignment shift that moves the active i32 block to
// the bottom bits for truncation and a modulo shift that can act on the i32.
EVT AmtVT = ShAmt.getValueType();
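Worked numbers for the split above, assuming an i128 store and ShAmt = 75; the byte offset at the end is an assumption about the PtrByteOfs computation mentioned a few lines below, derived from AlignAmt.

```cpp
#include <cstdio>

int main() {
  unsigned ShAmt = 75;               // bit touched in the i128 value
  unsigned AlignAmt = ShAmt & ~31u;  // ShAmt & -32 -> 64: start of the active i32 block
  unsigned ModuloAmt = ShAmt & 31u;  // ShAmt & 31  -> 11: bit index within that block
  unsigned ByteOfs = AlignAmt / 8;   // assumed PtrByteOfs -> 8
  std::printf("%u %u %u\n", AlignAmt, ModuloAmt, ByteOfs); // 64 11 8
  return 0;
}
```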
@@ -53397,6 +53409,7 @@ static SDValue narrowBitOpRMW(StoreSDNode *St, const SDLoc &DL,
DAG.getSignedConstant(-32LL, DL, AmtVT));
SDValue ModuloAmt =
DAG.getNode(ISD::AND, DL, AmtVT, ShAmt, DAG.getConstant(31, DL, AmtVT));
+ ModuloAmt = DAG.getZExtOrTrunc(ModuloAmt, DL, MVT::i8);
// Compute the byte offset for the i32 block that is changed by the RMW.
// combineTruncate will adjust the load for us in a similar way.
@@ -53408,18 +53421,39 @@ static SDValue narrowBitOpRMW(StoreSDNode *St, const SDLoc &DL,
SDNodeFlags::NoUnsignedWrap);
// Reconstruct the BTC/BTR/BTS pattern for the i32 block and store.
- SDValue X = DAG.getNode(ISD::SRL, DL, VT, LoadVal, AlignAmt);
+ SDValue X = DAG.getNode(ISD::SRL, DL, VT, SrcVal, AlignAmt);
X = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, X);
- SDValue Mask =
- DAG.getNode(ISD::SHL, DL, MVT::i32, DAG.getConstant(1, DL, MVT::i32),
- DAG.getZExtOrTrunc(ModuloAmt, DL, MVT::i8));
- if (StoredVal.getOpcode() == ISD::AND)
- Mask = DAG.getNOT(DL, Mask, MVT::i32);
+ SDValue Mask = DAG.getNode(ISD::SHL, DL, MVT::i32,
+ DAG.getConstant(1, DL, MVT::i32), ModuloAmt);
- SDValue Res = DAG.getNode(StoredVal.getOpcode(), DL, MVT::i32, X, Mask);
- return DAG.getStore(St->getChain(), DL, Res, NewPtr, St->getPointerInfo(),
- Align(), St->getMemOperand()->getFlags());
+ SDValue Res;
+ if (InsertBit) {
+ SDValue BitMask =
+ DAG.getNode(ISD::SHL, DL, MVT::i32,
+ DAG.getZExtOrTrunc(InsertBit, DL, MVT::i32), ModuloAmt);
+ Res =
+ DAG.getNode(ISD::AND, DL, MVT::i32, X, DAG.getNOT(DL, Mask, MVT::i32));
+ Res = DAG.getNode(ISD::OR, DL, MVT::i32, Res, BitMask);
+ } else {
+ if (StoredVal.getOpcode() == ISD::AND)
+ Mask = DAG.getNOT(DL, Mask, MVT::i32);
+ Res = DAG.getNode(StoredVal.getOpcode(), DL, MVT::i32, X, Mask);
+ }
+
+ SDValue NewStore =
+ DAG.getStore(St->getChain(), DL, Res, NewPtr,
+ MachinePointerInfo(St->getPointerInfo().getAddrSpace()),
+ Align(), St->getMemOperand()->getFlags());
+
+ // If there are other uses of StoredVal, replace with a new load of the
+ // whole (updated) value.
+ if (!StoredVal.hasOneUse()) {
+ SDValue NewLoad =
+ DAG.getLoad(VT, DL, NewStore, Ld->getBasePtr(), Ld->getMemOperand());
+ DAG.ReplaceAllUsesWith(StoredVal, NewLoad);
+ }
+ return NewStore;
}
static SDValue combineStore(SDNode *N, SelectionDAG &DAG,
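Putting the pieces together, here is a scalar sketch of what the new bit-insertion path amounts to in memory terms: plain C++ on a little-endian buffer (as on x86), with names invented for the illustration rather than the actual SelectionDAG nodes.

```cpp
#include <cstdint>
#include <cstring>

// Narrowed RMW for (X & ~(1 << ShAmt)) | (InsertBit << ShAmt) on an i128
// value at Ptr, with InsertBit known to be 0 or 1: only the affected i32
// block is loaded, updated and stored.
static void narrowedBitInsert(unsigned char *Ptr, unsigned ShAmt,
                              uint32_t InsertBit) {
  unsigned AlignAmt = ShAmt & ~31u;  // selects the i32 block
  unsigned ModuloAmt = ShAmt & 31u;  // bit position inside the block
  unsigned ByteOfs = AlignAmt / 8;   // little-endian byte offset of the block

  uint32_t X;
  std::memcpy(&X, Ptr + ByteOfs, sizeof(X));    // narrowed i32 load
  uint32_t Mask = 1u << ModuloAmt;
  X = (X & ~Mask) | (InsertBit << ModuloAmt);   // clear the bit, OR in the new one
  std::memcpy(Ptr + ByteOfs, &X, sizeof(X));    // narrowed i32 store
  // If the original wide stored value had other users, the combine feeds them
  // from a fresh full-width load ordered after this narrowed store.
}
```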
@@ -54606,7 +54640,8 @@ static SDValue combineTruncate(SDNode *N, SelectionDAG &DAG,
SDValue NewPtr = DAG.getMemBasePlusOffset(
Ld->getBasePtr(), PtrByteOfs, DL, SDNodeFlags::NoUnsignedWrap);
SDValue NewLoad =
- DAG.getLoad(VT, DL, Ld->getChain(), NewPtr, Ld->getPointerInfo(),
+ DAG.getLoad(VT, DL, Ld->getChain(), NewPtr,
+ MachinePointerInfo(Ld->getPointerInfo().getAddrSpace()),
Align(), Ld->getMemOperand()->getFlags());
DAG.makeEquivalentMemoryOrdering(Ld, NewLoad);
return NewLoad;