aboutsummaryrefslogtreecommitdiff
path: root/llvm/lib
diff options
context:
space:
mode:
authorSanjay Patel <spatel@rotateright.com>2022-02-07 10:12:12 -0500
committerSanjay Patel <spatel@rotateright.com>2022-02-07 10:12:12 -0500
commit40a50f8701a99a063a9950fc0a41f46934e4e160 (patch)
treed897b4b68951f4a37ec156e2c8cedd513ee157a9 /llvm/lib
parent3c33b20eaaef296a1da4044fc6add0c5e3ccae55 (diff)
downloadllvm-40a50f8701a99a063a9950fc0a41f46934e4e160.zip
llvm-40a50f8701a99a063a9950fc0a41f46934e4e160.tar.gz
llvm-40a50f8701a99a063a9950fc0a41f46934e4e160.tar.bz2
[x86] avoid false dependency stall on 'sbb' with same source reg
This is effectively inverting the transform added with D116804 because the downside of the false dependency of something like "sbb %eax, %eax" is much greater than the upside of eliminating a zeroing instruction on (all?) Intel CPUs. Differential Revision: https://reviews.llvm.org/D118843
Diffstat (limited to 'llvm/lib')
-rw-r--r--llvm/lib/Target/X86/X86.td13
-rw-r--r--llvm/lib/Target/X86/X86ISelDAGToDAG.cpp42
-rw-r--r--llvm/lib/Target/X86/X86Subtarget.h5
3 files changed, 43 insertions, 17 deletions
diff --git a/llvm/lib/Target/X86/X86.td b/llvm/lib/Target/X86/X86.td
index 8e87481..2a23e99 100644
--- a/llvm/lib/Target/X86/X86.td
+++ b/llvm/lib/Target/X86/X86.td
@@ -445,6 +445,10 @@ def TuningLZCNTFalseDeps : SubtargetFeature<"false-deps-lzcnt-tzcnt",
"HasLZCNTFalseDeps", "true",
"LZCNT/TZCNT have a false dependency on dest register">;
+def TuningSBBDepBreaking : SubtargetFeature<"sbb-dep-breaking",
+ "HasSBBDepBreaking", "true",
+ "SBB with same register has no source dependency">;
+
// On recent X86 (port bound) processors, it's preferable to combine to a single shuffle
// using a variable mask over multiple fixed shuffles.
def TuningFastVariableCrossLaneShuffle
@@ -1032,6 +1036,7 @@ def ProcessorFeatures {
Feature64Bit];
list<SubtargetFeature> BarcelonaTuning = [TuningFastScalarShiftMasks,
TuningSlowSHLD,
+ TuningSBBDepBreaking,
TuningInsertVZEROUPPER];
// Bobcat
@@ -1053,6 +1058,7 @@ def ProcessorFeatures {
TuningFastScalarShiftMasks,
TuningFastVectorShiftMasks,
TuningSlowSHLD,
+ TuningSBBDepBreaking,
TuningInsertVZEROUPPER];
// Jaguar
@@ -1072,6 +1078,7 @@ def ProcessorFeatures {
TuningFastScalarShiftMasks,
TuningFastVectorShiftMasks,
TuningFastMOVBE,
+ TuningSBBDepBreaking,
TuningSlowSHLD];
list<SubtargetFeature> BtVer2Features =
!listconcat(BtVer1Features, BtVer2AdditionalFeatures);
@@ -1099,6 +1106,7 @@ def ProcessorFeatures {
TuningFast11ByteNOP,
TuningFastScalarShiftMasks,
TuningBranchFusion,
+ TuningSBBDepBreaking,
TuningInsertVZEROUPPER];
// PileDriver
@@ -1174,6 +1182,7 @@ def ProcessorFeatures {
TuningFastScalarShiftMasks,
TuningFastMOVBE,
TuningSlowSHLD,
+ TuningSBBDepBreaking,
TuningInsertVZEROUPPER];
list<SubtargetFeature> ZN2AdditionalFeatures = [FeatureCLWB,
FeatureRDPID,
@@ -1445,7 +1454,7 @@ foreach P = ["k8", "opteron", "athlon64", "athlon-fx"] in {
def : Proc<P, [FeatureX87, FeatureCMPXCHG8B, FeatureSSE2, Feature3DNowA,
FeatureFXSR, FeatureNOPL, Feature64Bit, FeatureCMOV],
[TuningFastScalarShiftMasks, TuningSlowSHLD, TuningSlowUAMem16,
- TuningInsertVZEROUPPER]>;
+ TuningSBBDepBreaking, TuningInsertVZEROUPPER]>;
}
foreach P = ["k8-sse3", "opteron-sse3", "athlon64-sse3"] in {
@@ -1453,7 +1462,7 @@ foreach P = ["k8-sse3", "opteron-sse3", "athlon64-sse3"] in {
FeatureFXSR, FeatureNOPL, FeatureCMPXCHG16B, FeatureCMOV,
Feature64Bit],
[TuningFastScalarShiftMasks, TuningSlowSHLD, TuningSlowUAMem16,
- TuningInsertVZEROUPPER]>;
+ TuningSBBDepBreaking, TuningInsertVZEROUPPER]>;
}
foreach P = ["amdfam10", "barcelona"] in {
diff --git a/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp b/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp
index 0c3cfaa..0d697f4 100644
--- a/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp
+++ b/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp
@@ -464,8 +464,13 @@ namespace {
}
// Copy flags to the EFLAGS register and glue it to next node.
- SDValue EFLAGS = CurDAG->getCopyToReg(
- CurDAG->getEntryNode(), dl, X86::EFLAGS, N->getOperand(2), SDValue());
+ unsigned Opcode = N->getOpcode();
+ assert((Opcode == X86ISD::SBB || Opcode == X86ISD::SETCC_CARRY) &&
+ "Unexpected opcode for SBB materialization");
+ unsigned FlagOpIndex = Opcode == X86ISD::SBB ? 2 : 1;
+ SDValue EFLAGS =
+ CurDAG->getCopyToReg(CurDAG->getEntryNode(), dl, X86::EFLAGS,
+ N->getOperand(FlagOpIndex), SDValue());
// Create a 64-bit instruction if the result is 64-bits otherwise use the
// 32-bit version.
@@ -5801,21 +5806,28 @@ void X86DAGToDAGISel::Select(SDNode *Node) {
break;
case X86ISD::SETCC_CARRY: {
- // We have to do this manually because tblgen will put the eflags copy in
- // the wrong place if we use an extract_subreg in the pattern.
MVT VT = Node->getSimpleValueType(0);
+ SDValue Result;
+ if (Subtarget->hasSBBDepBreaking()) {
+ // We have to do this manually because tblgen will put the eflags copy in
+ // the wrong place if we use an extract_subreg in the pattern.
+ // Copy flags to the EFLAGS register and glue it to next node.
+ SDValue EFLAGS =
+ CurDAG->getCopyToReg(CurDAG->getEntryNode(), dl, X86::EFLAGS,
+ Node->getOperand(1), SDValue());
- // Copy flags to the EFLAGS register and glue it to next node.
- SDValue EFLAGS =
- CurDAG->getCopyToReg(CurDAG->getEntryNode(), dl, X86::EFLAGS,
- Node->getOperand(1), SDValue());
-
- // Create a 64-bit instruction if the result is 64-bits otherwise use the
- // 32-bit version.
- unsigned Opc = VT == MVT::i64 ? X86::SETB_C64r : X86::SETB_C32r;
- MVT SetVT = VT == MVT::i64 ? MVT::i64 : MVT::i32;
- SDValue Result = SDValue(
- CurDAG->getMachineNode(Opc, dl, SetVT, EFLAGS, EFLAGS.getValue(1)), 0);
+ // Create a 64-bit instruction if the result is 64-bits otherwise use the
+ // 32-bit version.
+ unsigned Opc = VT == MVT::i64 ? X86::SETB_C64r : X86::SETB_C32r;
+ MVT SetVT = VT == MVT::i64 ? MVT::i64 : MVT::i32;
+ Result = SDValue(
+ CurDAG->getMachineNode(Opc, dl, SetVT, EFLAGS, EFLAGS.getValue(1)),
+ 0);
+ } else {
+ // The target does not recognize sbb with the same reg operand as a
+ // no-source idiom, so we explicitly zero the input values.
+ Result = getSBBZero(Node);
+ }
// For less than 32-bits we need to extract from the 32-bit node.
if (VT == MVT::i8 || VT == MVT::i16) {
diff --git a/llvm/lib/Target/X86/X86Subtarget.h b/llvm/lib/Target/X86/X86Subtarget.h
index 5d773f0..d1ff944 100644
--- a/llvm/lib/Target/X86/X86Subtarget.h
+++ b/llvm/lib/Target/X86/X86Subtarget.h
@@ -246,6 +246,10 @@ class X86Subtarget final : public X86GenSubtargetInfo {
/// True if LZCNT/TZCNT instructions have a false dependency on the destination register.
bool HasLZCNTFalseDeps = false;
+ /// True if an SBB instruction with same source register is recognized as
+ /// having no dependency on that register.
+ bool HasSBBDepBreaking = false;
+
/// True if it's preferable to combine to a single cross-lane shuffle
/// using a variable mask over multiple fixed shuffles.
bool HasFastVariableCrossLaneShuffle = false;
@@ -719,6 +723,7 @@ public:
bool useLeaForSP() const { return UseLeaForSP; }
bool hasPOPCNTFalseDeps() const { return HasPOPCNTFalseDeps; }
bool hasLZCNTFalseDeps() const { return HasLZCNTFalseDeps; }
+ bool hasSBBDepBreaking() const { return HasSBBDepBreaking; }
bool hasFastVariableCrossLaneShuffle() const {
return HasFastVariableCrossLaneShuffle;
}