-rw-r--r--  llvm/lib/Target/X86/X86FlagsCopyLowering.cpp  207
-rw-r--r--  llvm/test/CodeGen/X86/O0-pipeline.ll             1
-rw-r--r--  llvm/test/CodeGen/X86/copy-eflags.ll           108
3 files changed, 234 insertions, 82 deletions
diff --git a/llvm/lib/Target/X86/X86FlagsCopyLowering.cpp b/llvm/lib/Target/X86/X86FlagsCopyLowering.cpp
index a101f6b..1fd1c70 100644
--- a/llvm/lib/Target/X86/X86FlagsCopyLowering.cpp
+++ b/llvm/lib/Target/X86/X86FlagsCopyLowering.cpp
@@ -36,6 +36,7 @@
#include "llvm/ADT/Statistic.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineConstantPool.h"
+#include "llvm/CodeGen/MachineDominators.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstr.h"
@@ -98,6 +99,7 @@ private:
const X86InstrInfo *TII;
const TargetRegisterInfo *TRI;
const TargetRegisterClass *PromoteRC;
+ MachineDominatorTree *MDT;
CondRegArray collectCondsInRegs(MachineBasicBlock &MBB,
MachineInstr &CopyDefI);
@@ -145,6 +147,7 @@ FunctionPass *llvm::createX86FlagsCopyLoweringPass() {
char X86FlagsCopyLoweringPass::ID = 0;
void X86FlagsCopyLoweringPass::getAnalysisUsage(AnalysisUsage &AU) const {
+ AU.addRequired<MachineDominatorTree>();
MachineFunctionPass::getAnalysisUsage(AU);
}
@@ -342,6 +345,7 @@ bool X86FlagsCopyLoweringPass::runOnMachineFunction(MachineFunction &MF) {
MRI = &MF.getRegInfo();
TII = Subtarget.getInstrInfo();
TRI = Subtarget.getRegisterInfo();
+ MDT = &getAnalysis<MachineDominatorTree>();
PromoteRC = &X86::GR8RegClass;
if (MF.begin() == MF.end())
@@ -416,103 +420,142 @@ bool X86FlagsCopyLoweringPass::runOnMachineFunction(MachineFunction &MF) {
// of these up front instead.
CondRegArray CondRegs = collectCondsInRegs(TestMBB, CopyDefI);
- for (auto MII = std::next(CopyI->getIterator()), MIE = MBB.instr_end();
- MII != MIE;) {
- MachineInstr &MI = *MII++;
- MachineOperand *FlagUse = MI.findRegisterUseOperand(X86::EFLAGS);
- if (!FlagUse) {
- if (MI.findRegisterDefOperand(X86::EFLAGS)) {
- // If EFLAGS are defined, it's as-if they were killed. We can stop
- // scanning here.
- //
- // NB!!! Many instructions only modify some flags. LLVM currently
- // models this as clobbering all flags, but if that ever changes this
- // will need to be carefully updated to handle that more complex
- // logic.
+ // Collect the basic blocks we need to scan. Typically this will just be
+ // a single basic block, but we may have to scan multiple blocks if the
+ // EFLAGS copy is live into successors.
+ SmallVector<MachineBasicBlock *, 2> Blocks;
+ SmallPtrSet<MachineBasicBlock *, 2> VisitedBlocks;
+ Blocks.push_back(&MBB);
+ VisitedBlocks.insert(&MBB);
+
+ do {
+ MachineBasicBlock &UseMBB = *Blocks.pop_back_val();
+
+ // We currently don't do any PHI insertion and so we require that the
+ // test basic block dominates all of the use basic blocks.
+ //
+ // We could in theory do PHI insertion here if it becomes useful by just
+ // taking undef values in along every edge that we don't trace this
+ // EFLAGS copy along. This isn't as bad as fully general PHI insertion,
+ // but still seems like a great deal of complexity.
+ //
+ // Because it is theoretically possible that some earlier MI pass or
+ // other lowering transformation could induce this to happen, we do
+ // a hard check even in non-debug builds here.
+ if (&TestMBB != &UseMBB && !MDT->dominates(&TestMBB, &UseMBB)) {
+ DEBUG({
+ dbgs() << "ERROR: Encountered use that is not dominated by our test "
+ "basic block! Rewriting this would require inserting PHI "
+ "nodes to track the flag state across the CFG.\n\nTest "
+ "block:\n";
+ TestMBB.dump();
+ dbgs() << "Use block:\n";
+ UseMBB.dump();
+ });
+ report_fatal_error("Cannot lower EFLAGS copy when original copy def "
+ "does not dominate all uses.");
+ }
+
+ for (auto MII = &UseMBB == &MBB ? std::next(CopyI->getIterator())
+ : UseMBB.instr_begin(),
+ MIE = UseMBB.instr_end();
+ MII != MIE;) {
+ MachineInstr &MI = *MII++;
+ MachineOperand *FlagUse = MI.findRegisterUseOperand(X86::EFLAGS);
+ if (!FlagUse) {
+ if (MI.findRegisterDefOperand(X86::EFLAGS)) {
+ // If EFLAGS are defined, it's as-if they were killed. We can stop
+ // scanning here.
+ //
+ // NB!!! Many instructions only modify some flags. LLVM currently
+ // models this as clobbering all flags, but if that ever changes
+ // this will need to be carefully updated to handle that more
+ // complex logic.
+ FlagsKilled = true;
+ break;
+ }
+ continue;
+ }
+
+ DEBUG(dbgs() << " Rewriting use: "; MI.dump());
+
+ // Check the kill flag before we rewrite as that may change it.
+ if (FlagUse->isKill())
FlagsKilled = true;
+
+ // Once we encounter a branch, the rest of the instructions must also be
+ // branches. We can't rewrite in place here, so we handle them below.
+ //
+ // Note that we don't have to handle tail calls here, even conditional
+ // tail calls, as those are not introduced into the X86 MI until post-RA
+ // branch folding or block placement. As a consequence, we get to deal
+ // with the simpler formulation of conditional branches followed by tail
+ // calls.
+ if (X86::getCondFromBranchOpc(MI.getOpcode()) != X86::COND_INVALID) {
+ auto JmpIt = MI.getIterator();
+ do {
+ JmpIs.push_back(&*JmpIt);
+ ++JmpIt;
+ } while (JmpIt != UseMBB.instr_end() &&
+ X86::getCondFromBranchOpc(JmpIt->getOpcode()) !=
+ X86::COND_INVALID);
break;
}
- continue;
- }
- DEBUG(dbgs() << " Rewriting use: "; MI.dump());
-
- // Check the kill flag before we rewrite as that may change it.
- if (FlagUse->isKill())
- FlagsKilled = true;
+ // Otherwise we can just rewrite in-place.
+ if (X86::getCondFromCMovOpc(MI.getOpcode()) != X86::COND_INVALID) {
+ rewriteCMov(TestMBB, TestPos, TestLoc, MI, *FlagUse, CondRegs);
+ } else if (X86::getCondFromSETOpc(MI.getOpcode()) !=
+ X86::COND_INVALID) {
+ rewriteSetCC(TestMBB, TestPos, TestLoc, MI, *FlagUse, CondRegs);
+ } else if (MI.getOpcode() == TargetOpcode::COPY) {
+ rewriteCopy(MI, *FlagUse, CopyDefI);
+ } else {
+ // We assume that arithmetic instructions that use flags also def
+ // them.
+ assert(MI.findRegisterDefOperand(X86::EFLAGS) &&
+ "Expected a def of EFLAGS for this instruction!");
+
+ // NB!!! Several arithmetic instructions only *partially* update
+ // flags. Theoretically, we could generate MI code sequences that
+ // would rely on this fact and observe different flags independently.
+ // But currently LLVM models all of these instructions as clobbering
+ // all the flags in an undef way. We rely on that to simplify the
+ // logic.
+ FlagsKilled = true;
- // Once we encounter a branch, the rest of the instructions must also be
- // branches. We can't rewrite in place here, so we handle them below.
- //
- // Note that we don't have to handle tail calls here, even conditional
- // tail calls, as those are not introduced into the X86 MI until post-RA
- // branch folding or black placement. As a consequence, we get to deal
- // with the simpler formulation of conditional branches followed by tail
- // calls.
- if (X86::getCondFromBranchOpc(MI.getOpcode()) != X86::COND_INVALID) {
- auto JmpIt = MI.getIterator();
- do {
- JmpIs.push_back(&*JmpIt);
- ++JmpIt;
- } while (JmpIt != MBB.instr_end() &&
- X86::getCondFromBranchOpc(JmpIt->getOpcode()) !=
- X86::COND_INVALID);
- break;
- }
+ rewriteArithmetic(TestMBB, TestPos, TestLoc, MI, *FlagUse, CondRegs);
+ break;
+ }
- // Otherwise we can just rewrite in-place.
- if (X86::getCondFromCMovOpc(MI.getOpcode()) != X86::COND_INVALID) {
- rewriteCMov(TestMBB, TestPos, TestLoc, MI, *FlagUse, CondRegs);
- } else if (X86::getCondFromSETOpc(MI.getOpcode()) != X86::COND_INVALID) {
- rewriteSetCC(TestMBB, TestPos, TestLoc, MI, *FlagUse, CondRegs);
- } else if (MI.getOpcode() == TargetOpcode::COPY) {
- rewriteCopy(MI, *FlagUse, CopyDefI);
- } else {
- // We assume that arithmetic instructions that use flags also def them.
- assert(MI.findRegisterDefOperand(X86::EFLAGS) &&
- "Expected a def of EFLAGS for this instruction!");
-
- // NB!!! Several arithmetic instructions only *partially* update
- // flags. Theoretically, we could generate MI code sequences that
- // would rely on this fact and observe different flags independently.
- // But currently LLVM models all of these instructions as clobbering
- // all the flags in an undef way. We rely on that to simplify the
- // logic.
- FlagsKilled = true;
-
- rewriteArithmetic(TestMBB, TestPos, TestLoc, MI, *FlagUse, CondRegs);
- break;
+ // If this was the last use of the flags, we're done.
+ if (FlagsKilled)
+ break;
}
- // If this was the last use of the flags, we're done.
+ // If the flags were killed, we're done with this block.
if (FlagsKilled)
break;
- }
- // If we didn't find a kill (or equivalent) check that the flags don't
- // live-out of the basic block. Currently we don't support lowering copies
- // of flags that live out in this fashion.
- if (!FlagsKilled &&
- llvm::any_of(MBB.successors(), [](MachineBasicBlock *SuccMBB) {
- return SuccMBB->isLiveIn(X86::EFLAGS);
- })) {
- DEBUG({
- dbgs() << "ERROR: Found a copied EFLAGS live-out from basic block:\n"
- << "----\n";
- MBB.dump();
- dbgs() << "----\n"
- << "ERROR: Cannot lower this EFLAGS copy!\n";
- });
- report_fatal_error(
- "Cannot lower EFLAGS copy that lives out of a basic block!");
- }
+ // Otherwise we need to scan successors for ones where the flags are
+ // live-in and queue those up for processing.
+ for (MachineBasicBlock *SuccMBB : UseMBB.successors())
+ if (SuccMBB->isLiveIn(X86::EFLAGS) &&
+ VisitedBlocks.insert(SuccMBB).second)
+ Blocks.push_back(SuccMBB);
+ } while (!Blocks.empty());
// Now rewrite the jumps that use the flags. These we handle specially
- // because if there are multiple jumps we'll have to do surgery on the CFG.
+ // because if there are multiple jumps in a single basic block we'll have
+ // to do surgery on the CFG.
+ MachineBasicBlock *LastJmpMBB = nullptr;
for (MachineInstr *JmpI : JmpIs) {
- // Past the first jump we need to split the blocks apart.
- if (JmpI != JmpIs.front())
+ // Past the first jump within a basic block we need to split the blocks
+ // apart.
+ if (JmpI->getParent() == LastJmpMBB)
splitBlock(*JmpI->getParent(), *JmpI, *TII);
+ else
+ LastJmpMBB = JmpI->getParent();
rewriteCondJmp(TestMBB, TestPos, TestLoc, *JmpI, CondRegs);
}
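
The core of the hunk above is a worklist traversal: scan the block containing the copy first, then every successor the flags are live into, visiting each block once and issuing a hard error for any use block the test block does not dominate. Below is a minimal standalone sketch of that traversal pattern under stated assumptions: a toy Block struct and a stubbed dominates() stand in for MachineBasicBlock and MachineDominatorTree, and collectUseBlocks is an illustrative name, not part of the pass.

#include <cassert>
#include <set>
#include <vector>

struct Block {
  int ID;
  bool FlagsLiveIn;
  std::vector<Block *> Succs;
};

// Stub for MDT->dominates(&TestMBB, &UseMBB); the real pass queries the
// MachineDominatorTree analysis it now declares via addRequired<>.
static bool dominates(const Block &Test, const Block &Use) {
  (void)Test;
  (void)Use;
  return true; // assume dominance holds in this toy example
}

// Returns the blocks that would be scanned for EFLAGS uses, in visit order.
static std::vector<Block *> collectUseBlocks(Block &TestB, Block &CopyB) {
  std::vector<Block *> Worklist{&CopyB};
  std::set<Block *> Visited{&CopyB};
  std::vector<Block *> Order;

  while (!Worklist.empty()) {
    Block *UseB = Worklist.back();
    Worklist.pop_back();

    // Mirrors the hard check in the patch: no PHI insertion is done, so the
    // test block must dominate every block we rewrite in.
    assert((UseB == &TestB || dominates(TestB, *UseB)) &&
           "test block must dominate all use blocks");
    Order.push_back(UseB);

    // Queue each successor the flags are live into, at most once.
    for (Block *Succ : UseB->Succs)
      if (Succ->FlagsLiveIn && Visited.insert(Succ).second)
        Worklist.push_back(Succ);
  }
  return Order;
}

int main() {
  Block A{0, false, {}}, B{1, true, {}}, C{2, false, {}};
  A.Succs = {&B, &C}; // flags copied in A are live into B but not C
  std::vector<Block *> Order = collectUseBlocks(A, A);
  return Order.size() == 2 ? 0 : 1; // scans A then B
}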
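
The jump-rewriting change at the end of the same hunk reduces to a small grouping rule: jumps are processed in order, and a block split is requested only for the second and later jumps that share a parent block with the previous one. A minimal sketch of that rule follows, with integers standing in for parent blocks and recorded strings standing in for splitBlock/rewriteCondJmp; Jmp and planRewrites are hypothetical names for illustration only.

#include <string>
#include <vector>

// Each conditional jump is identified here only by the block containing it.
struct Jmp { int ParentBlock; };

// Mirrors the LastJmpMBB logic: only jumps past the first one within a given
// block force a split; every jump is then rewritten against the test block.
static std::vector<std::string> planRewrites(const std::vector<Jmp> &JmpIs) {
  std::vector<std::string> Actions;
  int LastJmpBlock = -1; // stands in for MachineBasicBlock *LastJmpMBB = nullptr
  for (const Jmp &J : JmpIs) {
    if (J.ParentBlock == LastJmpBlock)
      Actions.push_back("split block " + std::to_string(J.ParentBlock));
    else
      LastJmpBlock = J.ParentBlock;
    Actions.push_back("rewrite jump in block " + std::to_string(J.ParentBlock));
  }
  return Actions;
}

int main() {
  // Two jumps in block 0 and one in block 1: only the second jump in block 0
  // triggers a split, matching the new per-block behaviour of the pass.
  std::vector<std::string> Plan = planRewrites({{0}, {0}, {1}});
  return Plan.size() == 4 ? 0 : 1;
}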
diff --git a/llvm/test/CodeGen/X86/O0-pipeline.ll b/llvm/test/CodeGen/X86/O0-pipeline.ll
index f1f0d34..53707cb 100644
--- a/llvm/test/CodeGen/X86/O0-pipeline.ll
+++ b/llvm/test/CodeGen/X86/O0-pipeline.ll
@@ -37,6 +37,7 @@
; CHECK-NEXT: X86 PIC Global Base Reg Initialization
; CHECK-NEXT: Expand ISel Pseudo-instructions
; CHECK-NEXT: Local Stack Slot Allocation
+; CHECK-NEXT: MachineDominator Tree Construction
; CHECK-NEXT: X86 EFLAGS copy lowering
; CHECK-NEXT: X86 WinAlloca Expander
; CHECK-NEXT: Eliminate PHI nodes for register allocation
diff --git a/llvm/test/CodeGen/X86/copy-eflags.ll b/llvm/test/CodeGen/X86/copy-eflags.ll
index 07d13fe..9844799 100644
--- a/llvm/test/CodeGen/X86/copy-eflags.ll
+++ b/llvm/test/CodeGen/X86/copy-eflags.ll
@@ -196,3 +196,111 @@ else:
tail call void @external_b()
ret void
}
+
+; Test a function that gets special select lowering into CFG with copied EFLAGS
+; threaded across the CFG. This requires our EFLAGS copy rewriting to handle
+; cross-block rewrites in at least some narrow cases.
+define void @PR37100(i8 %arg1, i16 %arg2, i64 %arg3, i8 %arg4, i8* %ptr1, i32* %ptr2) {
+; X32-LABEL: PR37100:
+; X32: # %bb.0: # %bb
+; X32-NEXT: pushl %ebp
+; X32-NEXT: .cfi_def_cfa_offset 8
+; X32-NEXT: pushl %ebx
+; X32-NEXT: .cfi_def_cfa_offset 12
+; X32-NEXT: pushl %edi
+; X32-NEXT: .cfi_def_cfa_offset 16
+; X32-NEXT: pushl %esi
+; X32-NEXT: .cfi_def_cfa_offset 20
+; X32-NEXT: .cfi_offset %esi, -20
+; X32-NEXT: .cfi_offset %edi, -16
+; X32-NEXT: .cfi_offset %ebx, -12
+; X32-NEXT: .cfi_offset %ebp, -8
+; X32-NEXT: movl {{[0-9]+}}(%esp), %esi
+; X32-NEXT: movl {{[0-9]+}}(%esp), %edi
+; X32-NEXT: movl {{[0-9]+}}(%esp), %ebx
+; X32-NEXT: movb {{[0-9]+}}(%esp), %ch
+; X32-NEXT: movb {{[0-9]+}}(%esp), %cl
+; X32-NEXT: jmp .LBB3_1
+; X32-NEXT: .p2align 4, 0x90
+; X32-NEXT: .LBB3_5: # %bb1
+; X32-NEXT: # in Loop: Header=BB3_1 Depth=1
+; X32-NEXT: xorl %eax, %eax
+; X32-NEXT: xorl %edx, %edx
+; X32-NEXT: idivl %ebp
+; X32-NEXT: .LBB3_1: # %bb1
+; X32-NEXT: # =>This Inner Loop Header: Depth=1
+; X32-NEXT: movsbl %cl, %eax
+; X32-NEXT: movl %eax, %edx
+; X32-NEXT: sarl $31, %edx
+; X32-NEXT: cmpl %eax, %esi
+; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT: sbbl %edx, %eax
+; X32-NEXT: setl %al
+; X32-NEXT: setl %dl
+; X32-NEXT: movzbl %dl, %ebp
+; X32-NEXT: negl %ebp
+; X32-NEXT: testb $-1, %al
+; X32-NEXT: jne .LBB3_3
+; X32-NEXT: # %bb.2: # %bb1
+; X32-NEXT: # in Loop: Header=BB3_1 Depth=1
+; X32-NEXT: movb %ch, %cl
+; X32-NEXT: .LBB3_3: # %bb1
+; X32-NEXT: # in Loop: Header=BB3_1 Depth=1
+; X32-NEXT: movb %cl, (%ebx)
+; X32-NEXT: movl (%edi), %edx
+; X32-NEXT: testb $-1, %al
+; X32-NEXT: jne .LBB3_5
+; X32-NEXT: # %bb.4: # %bb1
+; X32-NEXT: # in Loop: Header=BB3_1 Depth=1
+; X32-NEXT: movl %edx, %ebp
+; X32-NEXT: jmp .LBB3_5
+;
+; X64-LABEL: PR37100:
+; X64: # %bb.0: # %bb
+; X64-NEXT: movq %rdx, %r10
+; X64-NEXT: jmp .LBB3_1
+; X64-NEXT: .p2align 4, 0x90
+; X64-NEXT: .LBB3_5: # %bb1
+; X64-NEXT: # in Loop: Header=BB3_1 Depth=1
+; X64-NEXT: xorl %eax, %eax
+; X64-NEXT: xorl %edx, %edx
+; X64-NEXT: idivl %esi
+; X64-NEXT: .LBB3_1: # %bb1
+; X64-NEXT: # =>This Inner Loop Header: Depth=1
+; X64-NEXT: movsbq %dil, %rax
+; X64-NEXT: xorl %esi, %esi
+; X64-NEXT: cmpq %rax, %r10
+; X64-NEXT: setl %sil
+; X64-NEXT: negl %esi
+; X64-NEXT: cmpq %rax, %r10
+; X64-NEXT: jl .LBB3_3
+; X64-NEXT: # %bb.2: # %bb1
+; X64-NEXT: # in Loop: Header=BB3_1 Depth=1
+; X64-NEXT: movl %ecx, %edi
+; X64-NEXT: .LBB3_3: # %bb1
+; X64-NEXT: # in Loop: Header=BB3_1 Depth=1
+; X64-NEXT: movb %dil, (%r8)
+; X64-NEXT: jl .LBB3_5
+; X64-NEXT: # %bb.4: # %bb1
+; X64-NEXT: # in Loop: Header=BB3_1 Depth=1
+; X64-NEXT: movl (%r9), %esi
+; X64-NEXT: jmp .LBB3_5
+bb:
+ br label %bb1
+
+bb1:
+ %tmp = phi i8 [ %tmp8, %bb1 ], [ %arg1, %bb ]
+ %tmp2 = phi i16 [ %tmp12, %bb1 ], [ %arg2, %bb ]
+ %tmp3 = icmp sgt i16 %tmp2, 7
+ %tmp4 = select i1 %tmp3, i16 %tmp2, i16 7
+ %tmp5 = sext i8 %tmp to i64
+ %tmp6 = icmp slt i64 %arg3, %tmp5
+ %tmp7 = sext i1 %tmp6 to i32
+ %tmp8 = select i1 %tmp6, i8 %tmp, i8 %arg4
+ store volatile i8 %tmp8, i8* %ptr1
+ %tmp9 = load volatile i32, i32* %ptr2
+ %tmp10 = select i1 %tmp6, i32 %tmp7, i32 %tmp9
+ %tmp11 = srem i32 0, %tmp10
+ %tmp12 = trunc i32 %tmp11 to i16
+ br label %bb1
+}