aboutsummaryrefslogtreecommitdiff
path: root/llvm/lib/Analysis/VectorUtils.cpp
diff options
context:
space:
mode:
authorJames Molloy <james.molloy@arm.com>2015-10-12 12:34:45 +0000
committerJames Molloy <james.molloy@arm.com>2015-10-12 12:34:45 +0000
commit55d633bd602358c68ada16c9197ae444d363507d (patch)
treed4bdbca6ec273df345d17af5458f066e94a2fe98 /llvm/lib/Analysis/VectorUtils.cpp
parentfb677dfd73c02bce99fabbdff2fab1e98acb32c0 (diff)
downloadllvm-55d633bd602358c68ada16c9197ae444d363507d.zip
llvm-55d633bd602358c68ada16c9197ae444d363507d.tar.gz
llvm-55d633bd602358c68ada16c9197ae444d363507d.tar.bz2
[LoopVectorize] Shrink integer operations into the smallest type possible
C semantics force sub-int-sized values (e.g. i8, i16) to be promoted to int type (e.g. i32) whenever arithmetic is performed on them. For targets with native i8 or i16 operations, usually InstCombine can shrink the arithmetic type down again. However InstCombine refuses to create illegal types, so for targets without i8 or i16 registers, the lengthening and shrinking remains. Most SIMD ISAs (e.g. NEON) however support vectors of i8 or i16 even when their scalar equivalents do not, so during vectorization it is important to remove these lengthens and truncates when deciding the profitability of vectorization. The algorithm this uses starts at truncs and icmps, trawling their use-def chains until they terminate or instructions outside the loop are found (or unsafe instructions like inttoptr casts are found). If the use-def chains starting from different root instructions (truncs/icmps) meet, they are unioned. The demanded bits of each node in the graph are ORed together to form an overall mask of the demanded bits in the entire graph. The minimum bitwidth that graph can be truncated to is the bitwidth minus the number of leading zeroes in the overall mask. The intention is that this algorithm should "first do no harm", so it will never insert extra cast instructions. This is why the use-def graphs are unioned, so that subgraphs with different minimum bitwidths do not need casts inserted between them. This algorithm works hard to reduce compile time impact. DemandedBits are only queried if there are extends of illegal types and if a truncate to an illegal type is seen. In the general case, this results in a simple linear scan of the instructions in the loop. No non-noise compile time impact was seen on a clang bootstrap build. llvm-svn: 250032
Diffstat (limited to 'llvm/lib/Analysis/VectorUtils.cpp')
-rw-r--r--llvm/lib/Analysis/VectorUtils.cpp130
1 files changed, 130 insertions, 0 deletions
diff --git a/llvm/lib/Analysis/VectorUtils.cpp b/llvm/lib/Analysis/VectorUtils.cpp
index 9372085..4153c84 100644
--- a/llvm/lib/Analysis/VectorUtils.cpp
+++ b/llvm/lib/Analysis/VectorUtils.cpp
@@ -11,9 +11,12 @@
//
//===----------------------------------------------------------------------===//
+#include "llvm/ADT/EquivalenceClasses.h"
+#include "llvm/Analysis/DemandedBits.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/ScalarEvolutionExpressions.h"
#include "llvm/Analysis/ScalarEvolution.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/Analysis/VectorUtils.h"
#include "llvm/IR/GetElementPtrTypeIterator.h"
#include "llvm/IR/PatternMatch.h"
@@ -434,3 +437,130 @@ llvm::Value *llvm::getSplatValue(Value *V) {
return InsertEltInst->getOperand(1);
}
+
+DenseMap<Instruction*, uint64_t> llvm::computeMinimumValueSizes(
+ ArrayRef<BasicBlock*> Blocks, DemandedBits &DB,
+ const TargetTransformInfo *TTI) {
+
+ // DemandedBits will give us every value's live-out bits. But we want
+ // to ensure no extra casts would need to be inserted, so every DAG
+ // of connected values must have the same minimum bitwidth.
+ EquivalenceClasses<Value*> ECs;
+ SmallVector<Value*,16> Worklist;
+ SmallPtrSet<Value*,4> Roots;
+ SmallPtrSet<Value*,16> Visited;
+ DenseMap<Value*,uint64_t> DBits;
+ SmallPtrSet<Instruction*,4> InstructionSet;
+ DenseMap<Instruction*, uint64_t> MinBWs;
+
+ // Determine the roots. We work bottom-up, from truncs or icmps.
+ bool SeenExtFromIllegalType = false;
+ for (auto *BB : Blocks)
+ for (auto &I : *BB) {
+ InstructionSet.insert(&I);
+
+ if (TTI && (isa<ZExtInst>(&I) || isa<SExtInst>(&I)) &&
+ !TTI->isTypeLegal(I.getOperand(0)->getType()))
+ SeenExtFromIllegalType = true;
+
+ // Only deal with non-vector integers up to 64-bits wide.
+ if ((isa<TruncInst>(&I) || isa<ICmpInst>(&I)) &&
+ !I.getType()->isVectorTy() &&
+ I.getOperand(0)->getType()->getScalarSizeInBits() <= 64) {
+ // Don't make work for ourselves. If we know the loaded type is legal,
+ // don't add it to the worklist.
+ if (TTI && isa<TruncInst>(&I) && TTI->isTypeLegal(I.getType()))
+ continue;
+
+ Worklist.push_back(&I);
+ Roots.insert(&I);
+ }
+ }
+ // Early exit.
+ if (Worklist.empty() || (TTI && !SeenExtFromIllegalType))
+ return MinBWs;
+
+ // Now proceed breadth-first, unioning values together.
+ while (!Worklist.empty()) {
+ Value *Val = Worklist.pop_back_val();
+ Value *Leader = ECs.getOrInsertLeaderValue(Val);
+
+ if (Visited.count(Val))
+ continue;
+ Visited.insert(Val);
+
+ // Non-instructions terminate a chain successfully.
+ if (!isa<Instruction>(Val))
+ continue;
+ Instruction *I = cast<Instruction>(Val);
+
+ // If we encounter a type that is larger than 64 bits, we can't represent
+ // it so bail out.
+ if (DB.getDemandedBits(I).getBitWidth() > 64)
+ return DenseMap<Instruction*,uint64_t>();
+
+ uint64_t V = DB.getDemandedBits(I).getZExtValue();
+ DBits[Leader] |= V;
+
+ // Casts, loads and instructions outside of our range terminate a chain
+ // successfully.
+ if (isa<SExtInst>(I) || isa<ZExtInst>(I) || isa<LoadInst>(I) ||
+ !InstructionSet.count(I))
+ continue;
+
+ // Unsafe casts terminate a chain unsuccessfully. We can't do anything
+ // useful with bitcasts, ptrtoints or inttoptrs and it'd be unsafe to
+ // transform anything that relies on them.
+ if (isa<BitCastInst>(I) || isa<PtrToIntInst>(I) || isa<IntToPtrInst>(I) ||
+ !I->getType()->isIntegerTy()) {
+ DBits[Leader] |= ~0ULL;
+ continue;
+ }
+
+ // We don't modify the types of PHIs. Reductions will already have been
+ // truncated if possible, and inductions' sizes will have been chosen by
+ // indvars.
+ if (isa<PHINode>(I))
+ continue;
+
+ if (DBits[Leader] == ~0ULL)
+ // All bits demanded, no point continuing.
+ continue;
+
+ for (Value *O : cast<User>(I)->operands()) {
+ ECs.unionSets(Leader, O);
+ Worklist.push_back(O);
+ }
+ }
+
+ // Now we've discovered all values, walk them to see if there are
+ // any users we didn't see. If there are, we can't optimize that
+ // chain.
+ for (auto &I : DBits)
+ for (auto *U : I.first->users())
+ if (U->getType()->isIntegerTy() && DBits.count(U) == 0)
+ DBits[ECs.getOrInsertLeaderValue(I.first)] |= ~0ULL;
+
+ for (auto I = ECs.begin(), E = ECs.end(); I != E; ++I) {
+ uint64_t LeaderDemandedBits = 0;
+ for (auto MI = ECs.member_begin(I), ME = ECs.member_end(); MI != ME; ++MI)
+ LeaderDemandedBits |= DBits[*MI];
+
+ uint64_t MinBW = (sizeof(LeaderDemandedBits) * 8) -
+ llvm::countLeadingZeros(LeaderDemandedBits);
+ // Round up to a power of 2
+ if (!isPowerOf2_64((uint64_t)MinBW))
+ MinBW = NextPowerOf2(MinBW);
+ for (auto MI = ECs.member_begin(I), ME = ECs.member_end(); MI != ME; ++MI) {
+ if (!isa<Instruction>(*MI))
+ continue;
+ Type *Ty = (*MI)->getType();
+ if (Roots.count(*MI))
+ Ty = cast<Instruction>(*MI)->getOperand(0)->getType();
+ if (MinBW < Ty->getScalarSizeInBits())
+ MinBWs[cast<Instruction>(*MI)] = MinBW;
+ }
+ }
+
+ return MinBWs;
+}