diff options
author | Sanjay Patel <spatel@rotateright.com> | 2022-02-07 10:12:12 -0500 |
---|---|---|
committer | Sanjay Patel <spatel@rotateright.com> | 2022-02-07 10:12:12 -0500 |
commit | 40a50f8701a99a063a9950fc0a41f46934e4e160 (patch) | |
tree | d897b4b68951f4a37ec156e2c8cedd513ee157a9 /llvm/lib | |
parent | 3c33b20eaaef296a1da4044fc6add0c5e3ccae55 (diff) | |
download | llvm-40a50f8701a99a063a9950fc0a41f46934e4e160.zip llvm-40a50f8701a99a063a9950fc0a41f46934e4e160.tar.gz llvm-40a50f8701a99a063a9950fc0a41f46934e4e160.tar.bz2 |
[x86] avoid false dependency stall on 'sbb' with same source reg
This is effectively inverting the transform added with D116804
because the downside of the false dependency of something like
"sbb %eax, %eax" is much greater than the upside of eliminating
a zeroing instruction on (all?) Intel CPUs.
Differential Revision: https://reviews.llvm.org/D118843
Diffstat (limited to 'llvm/lib')
-rw-r--r-- | llvm/lib/Target/X86/X86.td | 13 |
-rw-r--r-- | llvm/lib/Target/X86/X86ISelDAGToDAG.cpp | 42 |
-rw-r--r-- | llvm/lib/Target/X86/X86Subtarget.h | 5 |
3 files changed, 43 insertions, 17 deletions
diff --git a/llvm/lib/Target/X86/X86.td b/llvm/lib/Target/X86/X86.td index 8e87481..2a23e99 100644 --- a/llvm/lib/Target/X86/X86.td +++ b/llvm/lib/Target/X86/X86.td @@ -445,6 +445,10 @@ def TuningLZCNTFalseDeps : SubtargetFeature<"false-deps-lzcnt-tzcnt", "HasLZCNTFalseDeps", "true", "LZCNT/TZCNT have a false dependency on dest register">; +def TuningSBBDepBreaking : SubtargetFeature<"sbb-dep-breaking", + "HasSBBDepBreaking", "true", + "SBB with same register has no source dependency">; + // On recent X86 (port bound) processors, its preferable to combine to a single shuffle // using a variable mask over multiple fixed shuffles. def TuningFastVariableCrossLaneShuffle @@ -1032,6 +1036,7 @@ def ProcessorFeatures { Feature64Bit]; list<SubtargetFeature> BarcelonaTuning = [TuningFastScalarShiftMasks, TuningSlowSHLD, + TuningSBBDepBreaking, TuningInsertVZEROUPPER]; // Bobcat @@ -1053,6 +1058,7 @@ def ProcessorFeatures { TuningFastScalarShiftMasks, TuningFastVectorShiftMasks, TuningSlowSHLD, + TuningSBBDepBreaking, TuningInsertVZEROUPPER]; // Jaguar @@ -1072,6 +1078,7 @@ def ProcessorFeatures { TuningFastScalarShiftMasks, TuningFastVectorShiftMasks, TuningFastMOVBE, + TuningSBBDepBreaking, TuningSlowSHLD]; list<SubtargetFeature> BtVer2Features = !listconcat(BtVer1Features, BtVer2AdditionalFeatures); @@ -1099,6 +1106,7 @@ def ProcessorFeatures { TuningFast11ByteNOP, TuningFastScalarShiftMasks, TuningBranchFusion, + TuningSBBDepBreaking, TuningInsertVZEROUPPER]; // PileDriver @@ -1174,6 +1182,7 @@ def ProcessorFeatures { TuningFastScalarShiftMasks, TuningFastMOVBE, TuningSlowSHLD, + TuningSBBDepBreaking, TuningInsertVZEROUPPER]; list<SubtargetFeature> ZN2AdditionalFeatures = [FeatureCLWB, FeatureRDPID, @@ -1445,7 +1454,7 @@ foreach P = ["k8", "opteron", "athlon64", "athlon-fx"] in { def : Proc<P, [FeatureX87, FeatureCMPXCHG8B, FeatureSSE2, Feature3DNowA, FeatureFXSR, FeatureNOPL, Feature64Bit, FeatureCMOV], [TuningFastScalarShiftMasks, TuningSlowSHLD, TuningSlowUAMem16, - 
TuningInsertVZEROUPPER]>; + TuningSBBDepBreaking, TuningInsertVZEROUPPER]>; } foreach P = ["k8-sse3", "opteron-sse3", "athlon64-sse3"] in { @@ -1453,7 +1462,7 @@ foreach P = ["k8-sse3", "opteron-sse3", "athlon64-sse3"] in { FeatureFXSR, FeatureNOPL, FeatureCMPXCHG16B, FeatureCMOV, Feature64Bit], [TuningFastScalarShiftMasks, TuningSlowSHLD, TuningSlowUAMem16, - TuningInsertVZEROUPPER]>; + TuningSBBDepBreaking, TuningInsertVZEROUPPER]>; } foreach P = ["amdfam10", "barcelona"] in { diff --git a/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp b/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp index 0c3cfaa..0d697f4 100644 --- a/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp +++ b/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp @@ -464,8 +464,13 @@ namespace { } // Copy flags to the EFLAGS register and glue it to next node. - SDValue EFLAGS = CurDAG->getCopyToReg( - CurDAG->getEntryNode(), dl, X86::EFLAGS, N->getOperand(2), SDValue()); + unsigned Opcode = N->getOpcode(); + assert(Opcode == X86ISD::SBB || Opcode == X86ISD::SETCC_CARRY && + "Unexpected opcode for SBB materialization"); + unsigned FlagOpIndex = Opcode == X86ISD::SBB ? 2 : 1; + SDValue EFLAGS = + CurDAG->getCopyToReg(CurDAG->getEntryNode(), dl, X86::EFLAGS, + N->getOperand(FlagOpIndex), SDValue()); // Create a 64-bit instruction if the result is 64-bits otherwise use the // 32-bit version. @@ -5801,21 +5806,28 @@ void X86DAGToDAGISel::Select(SDNode *Node) { break; case X86ISD::SETCC_CARRY: { - // We have to do this manually because tblgen will put the eflags copy in - // the wrong place if we use an extract_subreg in the pattern. MVT VT = Node->getSimpleValueType(0); + SDValue Result; + if (Subtarget->hasSBBDepBreaking()) { + // We have to do this manually because tblgen will put the eflags copy in + // the wrong place if we use an extract_subreg in the pattern. + // Copy flags to the EFLAGS register and glue it to next node. 
+ SDValue EFLAGS = + CurDAG->getCopyToReg(CurDAG->getEntryNode(), dl, X86::EFLAGS, + Node->getOperand(1), SDValue()); - // Copy flags to the EFLAGS register and glue it to next node. - SDValue EFLAGS = - CurDAG->getCopyToReg(CurDAG->getEntryNode(), dl, X86::EFLAGS, - Node->getOperand(1), SDValue()); - - // Create a 64-bit instruction if the result is 64-bits otherwise use the - // 32-bit version. - unsigned Opc = VT == MVT::i64 ? X86::SETB_C64r : X86::SETB_C32r; - MVT SetVT = VT == MVT::i64 ? MVT::i64 : MVT::i32; - SDValue Result = SDValue( - CurDAG->getMachineNode(Opc, dl, SetVT, EFLAGS, EFLAGS.getValue(1)), 0); + // Create a 64-bit instruction if the result is 64-bits otherwise use the + // 32-bit version. + unsigned Opc = VT == MVT::i64 ? X86::SETB_C64r : X86::SETB_C32r; + MVT SetVT = VT == MVT::i64 ? MVT::i64 : MVT::i32; + Result = SDValue( + CurDAG->getMachineNode(Opc, dl, SetVT, EFLAGS, EFLAGS.getValue(1)), + 0); + } else { + // The target does not recognize sbb with the same reg operand as a + // no-source idiom, so we explicitly zero the input values. + Result = getSBBZero(Node); + } // For less than 32-bits we need to extract from the 32-bit node. if (VT == MVT::i8 || VT == MVT::i16) { diff --git a/llvm/lib/Target/X86/X86Subtarget.h b/llvm/lib/Target/X86/X86Subtarget.h index 5d773f0..d1ff944 100644 --- a/llvm/lib/Target/X86/X86Subtarget.h +++ b/llvm/lib/Target/X86/X86Subtarget.h @@ -246,6 +246,10 @@ class X86Subtarget final : public X86GenSubtargetInfo { /// True if LZCNT/TZCNT instructions have a false dependency on the destination register. bool HasLZCNTFalseDeps = false; + /// True if an SBB instruction with same source register is recognized as + /// having no dependency on that register. + bool HasSBBDepBreaking = false; + /// True if its preferable to combine to a single cross-lane shuffle /// using a variable mask over multiple fixed shuffles. 
bool HasFastVariableCrossLaneShuffle = false; @@ -719,6 +723,7 @@ public: bool useLeaForSP() const { return UseLeaForSP; } bool hasPOPCNTFalseDeps() const { return HasPOPCNTFalseDeps; } bool hasLZCNTFalseDeps() const { return HasLZCNTFalseDeps; } + bool hasSBBDepBreaking() const { return HasSBBDepBreaking; } bool hasFastVariableCrossLaneShuffle() const { return HasFastVariableCrossLaneShuffle; } |