aboutsummaryrefslogtreecommitdiff
path: root/llvm/lib/Target/X86/X86ISelLowering.cpp
diff options
context:
space:
mode:
Diffstat (limited to 'llvm/lib/Target/X86/X86ISelLowering.cpp')
-rw-r--r--llvm/lib/Target/X86/X86ISelLowering.cpp145
1 files changed, 139 insertions, 6 deletions
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 133406b..6edf018 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -33034,12 +33034,13 @@ static SDValue LowerFSINCOS(SDValue Op, const X86Subtarget &Subtarget,
DAG.getExternalSymbol(LibcallName, TLI.getPointerTy(DAG.getDataLayout()));
Type *RetTy = isF64 ? (Type *)StructType::get(ArgTy, ArgTy)
- : (Type *)FixedVectorType::get(ArgTy, 4);
+ : (Type *)FixedVectorType::get(ArgTy, 2);
TargetLowering::CallLoweringInfo CLI(DAG);
CLI.setDebugLoc(dl)
.setChain(DAG.getEntryNode())
- .setLibCallee(CallingConv::C, RetTy, Callee, std::move(Args));
+ .setLibCallee(CallingConv::C, RetTy, Callee, std::move(Args))
+ .setIsPostTypeLegalization();
std::pair<SDValue, SDValue> CallResult = TLI.LowerCallTo(CLI);
@@ -53347,6 +53348,103 @@ static SDValue combineMaskedStore(SDNode *N, SelectionDAG &DAG,
return SDValue();
}
+// Look for a RMW operation that only touches one bit of a larger than legal
+// type and fold it to a BTC/BTR/BTS or bit insertion pattern acting on a single
+// i32 sub value.
+static SDValue narrowBitOpRMW(StoreSDNode *St, const SDLoc &DL,
+ SelectionDAG &DAG,
+ const X86Subtarget &Subtarget) {
+ using namespace SDPatternMatch;
+ SDValue StoredVal = St->getValue();
+ EVT VT = StoredVal.getValueType();
+
+ // Only narrow normal stores of larger than legal scalar integers.
+ if (!ISD::isNormalStore(St) || !St->isSimple() || !VT.isScalarInteger() ||
+ VT.getSizeInBits() <= (Subtarget.is64Bit() ? 64 : 32))
+ return SDValue();
+
+ // BTR: X & ~(1 << ShAmt)
+ // BTS: X | (1 << ShAmt)
+ // BTC: X ^ (1 << ShAmt)
+ //
+ // BitInsert: (X & ~(1 << ShAmt)) | (InsertBit << ShAmt)
+ SDValue SrcVal, InsertBit, ShAmt;
+ if (!StoredVal.hasOneUse() ||
+ !(sd_match(StoredVal, m_And(m_Value(SrcVal),
+ m_Not(m_Shl(m_One(), m_Value(ShAmt))))) ||
+ sd_match(StoredVal,
+ m_Or(m_Value(SrcVal), m_Shl(m_One(), m_Value(ShAmt)))) ||
+ sd_match(StoredVal,
+ m_Xor(m_Value(SrcVal), m_Shl(m_One(), m_Value(ShAmt)))) ||
+ sd_match(
+ StoredVal,
+ m_Or(m_And(m_Value(SrcVal), m_Not(m_Shl(m_One(), m_Value(ShAmt)))),
+ m_Shl(m_Value(InsertBit), m_Deferred(ShAmt))))))
+ return SDValue();
+
+ // SrcVal must be a matching normal load further up the chain.
+ auto *Ld = dyn_cast<LoadSDNode>(SrcVal);
+ if (!Ld || !ISD::isNormalLoad(Ld) || !Ld->isSimple() ||
+ Ld->getBasePtr() != St->getBasePtr() ||
+ Ld->getOffset() != St->getOffset() ||
+ !St->getChain().reachesChainWithoutSideEffects(SDValue(Ld, 1)))
+ return SDValue();
+
+ // Ensure the shift amount is in bounds.
+ KnownBits KnownAmt = DAG.computeKnownBits(ShAmt);
+ if (KnownAmt.getMaxValue().uge(VT.getSizeInBits()))
+ return SDValue();
+
+ // If we're inserting a bit then it must be the LSB.
+ if (InsertBit) {
+ KnownBits KnownInsert = DAG.computeKnownBits(InsertBit);
+ if (KnownInsert.countMinLeadingZeros() < (VT.getSizeInBits() - 1))
+ return SDValue();
+ }
+
+ // Split the shift into an alignment shift that moves the active i32 block to
+ // the bottom bits for truncation and a modulo shift that can act on the i32.
+ EVT AmtVT = ShAmt.getValueType();
+ SDValue AlignAmt = DAG.getNode(ISD::AND, DL, AmtVT, ShAmt,
+ DAG.getSignedConstant(-32LL, DL, AmtVT));
+ SDValue ModuloAmt =
+ DAG.getNode(ISD::AND, DL, AmtVT, ShAmt, DAG.getConstant(31, DL, AmtVT));
+ ModuloAmt = DAG.getZExtOrTrunc(ModuloAmt, DL, MVT::i8);
+
+ // Compute the byte offset for the i32 block that is changed by the RMW.
+ // combineTruncate will adjust the load for us in a similar way.
+ EVT PtrVT = St->getBasePtr().getValueType();
+ SDValue PtrBitOfs = DAG.getZExtOrTrunc(AlignAmt, DL, PtrVT);
+ SDValue PtrByteOfs = DAG.getNode(ISD::SRL, DL, PtrVT, PtrBitOfs,
+ DAG.getShiftAmountConstant(3, PtrVT, DL));
+ SDValue NewPtr = DAG.getMemBasePlusOffset(St->getBasePtr(), PtrByteOfs, DL,
+ SDNodeFlags::NoUnsignedWrap);
+
+ // Reconstruct the BTC/BTR/BTS pattern for the i32 block and store.
+ SDValue X = DAG.getNode(ISD::SRL, DL, VT, SrcVal, AlignAmt);
+ X = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, X);
+
+ SDValue Mask = DAG.getNode(ISD::SHL, DL, MVT::i32,
+ DAG.getConstant(1, DL, MVT::i32), ModuloAmt);
+
+ SDValue Res;
+ if (InsertBit) {
+ SDValue BitMask =
+ DAG.getNode(ISD::SHL, DL, MVT::i32,
+ DAG.getZExtOrTrunc(InsertBit, DL, MVT::i32), ModuloAmt);
+ Res =
+ DAG.getNode(ISD::AND, DL, MVT::i32, X, DAG.getNOT(DL, Mask, MVT::i32));
+ Res = DAG.getNode(ISD::OR, DL, MVT::i32, Res, BitMask);
+ } else {
+ if (StoredVal.getOpcode() == ISD::AND)
+ Mask = DAG.getNOT(DL, Mask, MVT::i32);
+ Res = DAG.getNode(StoredVal.getOpcode(), DL, MVT::i32, X, Mask);
+ }
+
+ return DAG.getStore(St->getChain(), DL, Res, NewPtr, St->getPointerInfo(),
+ Align(), St->getMemOperand()->getFlags());
+}
+
static SDValue combineStore(SDNode *N, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI,
const X86Subtarget &Subtarget) {
@@ -53573,6 +53671,9 @@ static SDValue combineStore(SDNode *N, SelectionDAG &DAG,
}
}
+ if (SDValue R = narrowBitOpRMW(St, dl, DAG, Subtarget))
+ return R;
+
// Convert store(cmov(load(p), x, CC), p) to cstore(x, p, CC)
// store(cmov(x, load(p), CC), p) to cstore(x, p, InvertCC)
if ((VT == MVT::i16 || VT == MVT::i32 || VT == MVT::i64) &&
@@ -54505,8 +54606,9 @@ static SDValue combineTruncate(SDNode *N, SelectionDAG &DAG,
// truncation, see if we can convert the shift into a pointer offset instead.
// Limit this to normal (non-ext) scalar integer loads.
if (SrcVT.isScalarInteger() && Src.getOpcode() == ISD::SRL &&
- Src.hasOneUse() && Src.getOperand(0).hasOneUse() &&
- ISD::isNormalLoad(Src.getOperand(0).getNode())) {
+ Src.hasOneUse() && ISD::isNormalLoad(Src.getOperand(0).getNode()) &&
+ (Src.getOperand(0).hasOneUse() ||
+ !DAG.getTargetLoweringInfo().isOperationLegal(ISD::LOAD, SrcVT))) {
auto *Ld = cast<LoadSDNode>(Src.getOperand(0));
if (Ld->isSimple() && VT.isByteSized() &&
isPowerOf2_64(VT.getSizeInBits())) {
@@ -54529,8 +54631,7 @@ static SDValue combineTruncate(SDNode *N, SelectionDAG &DAG,
SDValue NewLoad =
DAG.getLoad(VT, DL, Ld->getChain(), NewPtr, Ld->getPointerInfo(),
Align(), Ld->getMemOperand()->getFlags());
- DAG.ReplaceAllUsesOfValueWith(Src.getOperand(0).getValue(1),
- NewLoad.getValue(1));
+ DAG.makeEquivalentMemoryOrdering(Ld, NewLoad);
return NewLoad;
}
}
@@ -56306,6 +56407,7 @@ static SDValue combineAVX512SetCCToKMOV(EVT VT, SDValue Op0, ISD::CondCode CC,
static SDValue combineSetCC(SDNode *N, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI,
const X86Subtarget &Subtarget) {
+ using namespace SDPatternMatch;
const ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(2))->get();
const SDValue LHS = N->getOperand(0);
const SDValue RHS = N->getOperand(1);
@@ -56364,6 +56466,37 @@ static SDValue combineSetCC(SDNode *N, SelectionDAG &DAG,
if (SDValue AndN = MatchAndCmpEq(RHS, LHS))
return DAG.getSetCC(DL, VT, AndN, DAG.getConstant(0, DL, OpVT), CC);
+ // If we're performing a bit test on a larger than legal type, attempt
+ // to (aligned) shift down the value to the bottom 32-bits and then
+ // perform the bittest on the i32 value.
+ // ICMP_ZERO(AND(X,SHL(1,IDX)))
+ // --> ICMP_ZERO(AND(TRUNC(SRL(X,AND(IDX,-32))),SHL(1,AND(IDX,31))))
+ if (isNullConstant(RHS) &&
+ OpVT.getScalarSizeInBits() > (Subtarget.is64Bit() ? 64 : 32)) {
+ SDValue X, ShAmt;
+ if (sd_match(LHS, m_OneUse(m_And(m_Value(X),
+ m_Shl(m_One(), m_Value(ShAmt)))))) {
+ // Only attempt this if the shift amount is known to be in bounds.
+ KnownBits KnownAmt = DAG.computeKnownBits(ShAmt);
+ if (KnownAmt.getMaxValue().ult(OpVT.getScalarSizeInBits())) {
+ EVT AmtVT = ShAmt.getValueType();
+ SDValue AlignAmt =
+ DAG.getNode(ISD::AND, DL, AmtVT, ShAmt,
+ DAG.getSignedConstant(-32LL, DL, AmtVT));
+ SDValue ModuloAmt = DAG.getNode(ISD::AND, DL, AmtVT, ShAmt,
+ DAG.getConstant(31, DL, AmtVT));
+ SDValue Mask = DAG.getNode(
+ ISD::SHL, DL, MVT::i32, DAG.getConstant(1, DL, MVT::i32),
+ DAG.getZExtOrTrunc(ModuloAmt, DL, MVT::i8));
+ X = DAG.getNode(ISD::SRL, DL, OpVT, X, AlignAmt);
+ X = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, X);
+ X = DAG.getNode(ISD::AND, DL, MVT::i32, X, Mask);
+ return DAG.getSetCC(DL, VT, X, DAG.getConstant(0, DL, MVT::i32),
+ CC);
+ }
+ }
+ }
+
// cmpeq(trunc(x),C) --> cmpeq(x,C)
// cmpne(trunc(x),C) --> cmpne(x,C)
// iff x upper bits are zero.