diff options
author | Sanjay Patel <spatel@rotateright.com> | 2017-06-08 16:53:18 +0000 |
---|---|---|
committer | Sanjay Patel <spatel@rotateright.com> | 2017-06-08 16:53:18 +0000 |
commit | e7c5041c2ae10b4da0526d5abdc5cbcd647f1b45 (patch) | |
tree | 3761f6a721bd3583f07f40fb2fdb3bd7fe9767c0 /llvm/lib/CodeGen/CodeGenPrepare.cpp | |
parent | 8cb1d0931f2749a9731e1a59f869a509baa83640 (diff) | |
download | llvm-e7c5041c2ae10b4da0526d5abdc5cbcd647f1b45.zip llvm-e7c5041c2ae10b4da0526d5abdc5cbcd647f1b45.tar.gz llvm-e7c5041c2ae10b4da0526d5abdc5cbcd647f1b45.tar.bz2 |
[CGP / PowerPC] avoid multi-block overhead for simple memcmp expansion
The test diff for PowerPC shows we can better optimize if this case is one block.
For x86, there's would be a substantial difference if CGP expansion was enabled because branches are assumed
cheap and SDAG can't optimize across blocks.
Instead of this:
_cmp_eq8:
movq (%rdi), %rax
cmpq (%rsi), %rax
je LBB23_1
## BB#2: ## %res_block
movl $1, %ecx
jmp LBB23_3
LBB23_1:
xorl %ecx, %ecx
LBB23_3: ## %endblock
xorl %eax, %eax
testl %ecx, %ecx
sete %al
retq
We get this:
cmp_eq8:
movq (%rdi), %rcx
xorl %eax, %eax
cmpq (%rsi), %rcx
sete %al
retq
And that matches the optimal codegen that we get from the current expansion in SelectionDAGBuilder::visitMemCmpCall().
If this looks right, then I just need to confirm that vector-sized expansion will work from here, and we can enable
CGP memcmp() expansion for x86. Ie, we'll bypass the power-of-2 special cases currently optimized in SDAG because we
can lower the IR produced here optimally.
Differential Revision: https://reviews.llvm.org/D34005
llvm-svn: 304987
Diffstat (limited to 'llvm/lib/CodeGen/CodeGenPrepare.cpp')
-rw-r--r-- | llvm/lib/CodeGen/CodeGenPrepare.cpp | 63 |
1 files changed, 42 insertions, 21 deletions
diff --git a/llvm/lib/CodeGen/CodeGenPrepare.cpp b/llvm/lib/CodeGen/CodeGenPrepare.cpp index e797f1a..b9e0e21 100644 --- a/llvm/lib/CodeGen/CodeGenPrepare.cpp +++ b/llvm/lib/CodeGen/CodeGenPrepare.cpp @@ -1675,6 +1675,7 @@ class MemCmpExpansion { void emitLoadCompareByteBlock(unsigned Index, int GEPIndex); void emitMemCmpResultBlock(bool IsLittleEndian); Value *getMemCmpExpansionZeroCase(unsigned Size, bool IsLittleEndian); + Value *getMemCmpEqZeroOneBlock(unsigned Size); unsigned getLoadSize(unsigned Size); unsigned getNumLoads(unsigned Size); @@ -1699,31 +1700,35 @@ MemCmpExpansion::MemCmpExpansion(CallInst *CI, uint64_t Size, unsigned MaxLoadSize, unsigned LoadsPerBlock) : CI(CI), MaxLoadSize(MaxLoadSize), NumLoadsPerBlock(LoadsPerBlock) { - IRBuilder<> Builder(CI->getContext()); - BasicBlock *StartBlock = CI->getParent(); - EndBlock = StartBlock->splitBasicBlock(CI, "endblock"); - setupEndBlockPHINodes(); + // A memcmp with zero-comparison with only one block of load and compare does + // not need to set up any extra blocks. This case could be handled in the DAG, + // but since we have all of the machinery to flexibly expand any memcpy here, + // we choose to handle this case too to avoid fragmented lowering. IsUsedForZeroCmp = isOnlyUsedInZeroEqualityComparison(CI); - - // Calculate how many load compare blocks are required for an expansion of - // given Size. NumBlocks = calculateNumBlocks(Size); - createResultBlock(); + if (!IsUsedForZeroCmp || NumBlocks != 1) { + BasicBlock *StartBlock = CI->getParent(); + EndBlock = StartBlock->splitBasicBlock(CI, "endblock"); + setupEndBlockPHINodes(); + createResultBlock(); + + // If return value of memcmp is not used in a zero equality, we need to + // calculate which source was larger. The calculation requires the + // two loaded source values of each load compare block. + // These will be saved in the phi nodes created by setupResultBlockPHINodes. + if (!IsUsedForZeroCmp) + setupResultBlockPHINodes(); - // If return value of memcmp is not used in a zero equality, we need to - // calculate which source was larger. The calculation requires the - // two loaded source values of each load compare block. - // These will be saved in the phi nodes created by setupResultBlockPHINodes. - if (!IsUsedForZeroCmp) - setupResultBlockPHINodes(); + // Create the number of required load compare basic blocks. + createLoadCmpBlocks(); - // Create the number of required load compare basic blocks. - createLoadCmpBlocks(); + // Update the terminator added by splitBasicBlock to branch to the first + // LoadCmpBlock. + StartBlock->getTerminator()->setSuccessor(0, LoadCmpBlocks[0]); + } - // Update the terminator added by splitBasicBlock to branch to the first - // LoadCmpBlock. + IRBuilder<> Builder(CI->getContext()); Builder.SetCurrentDebugLocation(CI->getDebugLoc()); - StartBlock->getTerminator()->setSuccessor(0, LoadCmpBlocks[0]); } void MemCmpExpansion::createLoadCmpBlocks() { @@ -1810,7 +1815,12 @@ Value *MemCmpExpansion::getCompareLoadPairs(unsigned Index, unsigned Size, unsigned NumLoadsRemaining = getNumLoads(RemainingBytes); unsigned NumLoads = std::min(NumLoadsRemaining, NumLoadsPerBlock); - Builder.SetInsertPoint(LoadCmpBlocks[Index]); + // For a single-block expansion, start inserting before the memcmp call. + if (LoadCmpBlocks.empty()) + Builder.SetInsertPoint(CI); + else + Builder.SetInsertPoint(LoadCmpBlocks[Index]); + Value *Cmp = nullptr; for (unsigned i = 0; i < NumLoads; ++i) { unsigned LoadSize = getLoadSize(RemainingBytes); @@ -2071,11 +2081,22 @@ Value *MemCmpExpansion::getMemCmpExpansionZeroCase(unsigned Size, return PhiRes; } +/// A memcmp expansion that compares equality with 0 and only has one block of +/// load and compare can bypass the compare, branch, and phi IR that is required +/// in the general case. +Value *MemCmpExpansion::getMemCmpEqZeroOneBlock(unsigned Size) { + unsigned NumBytesProcessed = 0; + IRBuilder<> Builder(CI->getContext()); + Value *Cmp = getCompareLoadPairs(0, Size, NumBytesProcessed, Builder); + return Builder.CreateZExt(Cmp, Type::getInt32Ty(CI->getContext())); +} + // This function expands the memcmp call into an inline expansion and returns // the memcmp result. Value *MemCmpExpansion::getMemCmpExpansion(uint64_t Size, bool IsLittleEndian) { if (IsUsedForZeroCmp) - return getMemCmpExpansionZeroCase(Size, IsLittleEndian); + return NumBlocks == 1 ? getMemCmpEqZeroOneBlock(Size) : + getMemCmpExpansionZeroCase(Size, IsLittleEndian); // This loop calls emitLoadCompareBlock for comparing Size bytes of the two // memcmp sources. It starts with loading using the maximum load size set by |