author    Yingwei Zheng <dtcxzyw2333@gmail.com>    2025-09-02 21:41:02 +0800
committer GitHub <noreply@github.com>              2025-09-02 21:41:02 +0800
commit    89f53af3fffed3e41167fbb7bc10d4885cd97c7f (patch)
tree      419fad91cc9717e3f87b108c9e3fc9895273da3e
parent    417bdb6672b891000bfa1ec3613074acf03f2616 (diff)
[ConstraintElim] Use constraints from bounded memory accesses (#155253)
This patch removes bound checks that are dominated by bounded memory accesses.
For example, if we have an array `int A[5]` and an access `A[idx]` executes
successfully, we know that `idx u< 5` holds after the load.

Compile-time impact (+0.1%):
https://llvm-compile-time-tracker.com/compare.php?from=f0e9bba024d44b55d54b02025623ce4a3ba5a37c&to=5227b08a4a514159ec524d1b1ca18ed8ab5407df&stat=instructions%3Au

llvm-opt-benchmark: https://github.com/dtcxzyw/llvm-opt-benchmark/pull/2709

Proof: https://alive2.llvm.org/ce/z/JEyjA2
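As a source-level illustration of the pattern above (a minimal, hypothetical C
sketch, not part of the patch; the fold only applies when the lowered GEP carries
the `nuw` flag that getConstraintFromMemoryAccess requires):

    /* Hypothetical example. After A[idx] has executed successfully,
     * ConstraintElimination can record the fact idx u< 5, so the dominated
     * bound check below folds to true. */
    static int A[5];

    int read_then_check(unsigned idx) {
      int v = A[idx];   /* bounded access: implies idx u< 5 afterwards */
      if (idx < 5)      /* dominated check, now provably true */
        return v;
      return -1;
    }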
-rw-r--r--  llvm/lib/Transforms/Scalar/ConstraintElimination.cpp                           | 112
-rw-r--r--  llvm/test/Transforms/ConstraintElimination/implied-by-bounded-memory-access.ll | 373
2 files changed, 478 insertions, 7 deletions
diff --git a/llvm/lib/Transforms/Scalar/ConstraintElimination.cpp b/llvm/lib/Transforms/Scalar/ConstraintElimination.cpp
index 1ddb8ae..1b4d8c7 100644
--- a/llvm/lib/Transforms/Scalar/ConstraintElimination.cpp
+++ b/llvm/lib/Transforms/Scalar/ConstraintElimination.cpp
@@ -19,9 +19,11 @@
#include "llvm/Analysis/ConstraintSystem.h"
#include "llvm/Analysis/GlobalsModRef.h"
#include "llvm/Analysis/LoopInfo.h"
+#include "llvm/Analysis/MemoryBuiltins.h"
#include "llvm/Analysis/OptimizationRemarkEmitter.h"
#include "llvm/Analysis/ScalarEvolution.h"
#include "llvm/Analysis/ScalarEvolutionExpressions.h"
+#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/DebugInfo.h"
@@ -170,10 +172,12 @@ struct State {
DominatorTree &DT;
LoopInfo &LI;
ScalarEvolution &SE;
+ TargetLibraryInfo &TLI;
SmallVector<FactOrCheck, 64> WorkList;
- State(DominatorTree &DT, LoopInfo &LI, ScalarEvolution &SE)
- : DT(DT), LI(LI), SE(SE) {}
+ State(DominatorTree &DT, LoopInfo &LI, ScalarEvolution &SE,
+ TargetLibraryInfo &TLI)
+ : DT(DT), LI(LI), SE(SE), TLI(TLI) {}
/// Process block \p BB and add known facts to work-list.
void addInfoFor(BasicBlock &BB);
@@ -1109,10 +1113,50 @@ void State::addInfoForInductions(BasicBlock &BB) {
}
}
+static bool getConstraintFromMemoryAccess(GetElementPtrInst &GEP,
+ uint64_t AccessSize,
+ CmpPredicate &Pred, Value *&A,
+ Value *&B, const DataLayout &DL,
+ const TargetLibraryInfo &TLI) {
+ auto Offset = collectOffsets(cast<GEPOperator>(GEP), DL);
+ if (!Offset.NW.hasNoUnsignedWrap())
+ return false;
+
+ if (Offset.VariableOffsets.size() != 1)
+ return false;
+
+ ObjectSizeOpts Opts;
+ // Workaround for gep inbounds, ptr null, idx.
+ Opts.NullIsUnknownSize = true;
+ // Be conservative since we are not clear on whether an out of bounds access
+ // to the padding is UB or not.
+ Opts.RoundToAlign = true;
+ std::optional<TypeSize> Size =
+ getBaseObjectSize(Offset.BasePtr, DL, &TLI, Opts);
+ if (!Size || Size->isScalable())
+ return false;
+
+ // Index * Scale + ConstOffset + AccessSize <= AllocSize
+ // With nuw flag, we know that the index addition doesn't have unsigned wrap.
+ // If (AllocSize - (ConstOffset + AccessSize)) wraps around, there is no valid
+ // value for Index.
+ uint64_t BitWidth = Offset.ConstantOffset.getBitWidth();
+ auto &[Index, Scale] = Offset.VariableOffsets.front();
+ APInt MaxIndex = (APInt(BitWidth, Size->getFixedValue() - AccessSize,
+ /*isSigned=*/false, /*implicitTrunc=*/true) -
+ Offset.ConstantOffset)
+ .udiv(Scale);
+ Pred = ICmpInst::ICMP_ULE;
+ A = Index;
+ B = ConstantInt::get(Index->getType(), MaxIndex);
+ return true;
+}
+
void State::addInfoFor(BasicBlock &BB) {
addInfoForInductions(BB);
+ auto &DL = BB.getDataLayout();
- // True as long as long as the current instruction is guaranteed to execute.
+ // True as long as the current instruction is guaranteed to execute.
bool GuaranteedToExecute = true;
// Queue conditions and assumes.
for (Instruction &I : BB) {
@@ -1127,6 +1171,38 @@ void State::addInfoFor(BasicBlock &BB) {
continue;
}
+ auto AddFactFromMemoryAccess = [&](Value *Ptr, Type *AccessType) {
+ auto *GEP = dyn_cast<GetElementPtrInst>(Ptr);
+ if (!GEP)
+ return;
+ TypeSize AccessSize = DL.getTypeStoreSize(AccessType);
+ if (!AccessSize.isFixed())
+ return;
+ if (GuaranteedToExecute) {
+ CmpPredicate Pred;
+ Value *A, *B;
+ if (getConstraintFromMemoryAccess(*GEP, AccessSize.getFixedValue(),
+ Pred, A, B, DL, TLI)) {
+ // The memory access is guaranteed to execute when BB is entered,
+ // hence the constraint holds on entry to BB.
+ WorkList.emplace_back(FactOrCheck::getConditionFact(
+ DT.getNode(I.getParent()), Pred, A, B));
+ }
+ } else {
+ WorkList.emplace_back(
+ FactOrCheck::getInstFact(DT.getNode(I.getParent()), &I));
+ }
+ };
+
+ if (auto *LI = dyn_cast<LoadInst>(&I)) {
+ if (!LI->isVolatile())
+ AddFactFromMemoryAccess(LI->getPointerOperand(), LI->getAccessType());
+ }
+ if (auto *SI = dyn_cast<StoreInst>(&I)) {
+ if (!SI->isVolatile())
+ AddFactFromMemoryAccess(SI->getPointerOperand(), SI->getAccessType());
+ }
+
auto *II = dyn_cast<IntrinsicInst>(&I);
Intrinsic::ID ID = II ? II->getIntrinsicID() : Intrinsic::not_intrinsic;
switch (ID) {
@@ -1420,7 +1496,7 @@ static std::optional<bool> checkCondition(CmpInst::Predicate Pred, Value *A,
LLVM_DEBUG(dbgs() << "Checking " << *CheckInst << "\n");
auto R = Info.getConstraintForSolving(Pred, A, B);
- if (R.empty() || !R.isValid(Info)){
+ if (R.empty() || !R.isValid(Info)) {
LLVM_DEBUG(dbgs() << " failed to decompose condition\n");
return std::nullopt;
}
@@ -1785,12 +1861,13 @@ tryToSimplifyOverflowMath(IntrinsicInst *II, ConstraintInfo &Info,
static bool eliminateConstraints(Function &F, DominatorTree &DT, LoopInfo &LI,
ScalarEvolution &SE,
- OptimizationRemarkEmitter &ORE) {
+ OptimizationRemarkEmitter &ORE,
+ TargetLibraryInfo &TLI) {
bool Changed = false;
DT.updateDFSNumbers();
SmallVector<Value *> FunctionArgs(llvm::make_pointer_range(F.args()));
ConstraintInfo Info(F.getDataLayout(), FunctionArgs);
- State S(DT, LI, SE);
+ State S(DT, LI, SE, TLI);
std::unique_ptr<Module> ReproducerModule(
DumpReproducers ? new Module(F.getName(), F.getContext()) : nullptr);
@@ -1960,6 +2037,26 @@ static bool eliminateConstraints(Function &F, DominatorTree &DT, LoopInfo &LI,
}
continue;
}
+
+ auto &DL = F.getDataLayout();
+ auto AddFactsAboutIndices = [&](Value *Ptr, Type *AccessType) {
+ CmpPredicate Pred;
+ Value *A, *B;
+ if (getConstraintFromMemoryAccess(
+ *cast<GetElementPtrInst>(Ptr),
+ DL.getTypeStoreSize(AccessType).getFixedValue(), Pred, A, B, DL,
+ TLI))
+ AddFact(Pred, A, B);
+ };
+
+ if (auto *LI = dyn_cast<LoadInst>(CB.Inst)) {
+ AddFactsAboutIndices(LI->getPointerOperand(), LI->getAccessType());
+ continue;
+ }
+ if (auto *SI = dyn_cast<StoreInst>(CB.Inst)) {
+ AddFactsAboutIndices(SI->getPointerOperand(), SI->getAccessType());
+ continue;
+ }
}
Value *A = nullptr, *B = nullptr;
@@ -2018,7 +2115,8 @@ PreservedAnalyses ConstraintEliminationPass::run(Function &F,
auto &LI = AM.getResult<LoopAnalysis>(F);
auto &SE = AM.getResult<ScalarEvolutionAnalysis>(F);
auto &ORE = AM.getResult<OptimizationRemarkEmitterAnalysis>(F);
- if (!eliminateConstraints(F, DT, LI, SE, ORE))
+ auto &TLI = AM.getResult<TargetLibraryAnalysis>(F);
+ if (!eliminateConstraints(F, DT, LI, SE, ORE, TLI))
return PreservedAnalyses::all();
PreservedAnalyses PA;
diff --git a/llvm/test/Transforms/ConstraintElimination/implied-by-bounded-memory-access.ll b/llvm/test/Transforms/ConstraintElimination/implied-by-bounded-memory-access.ll
new file mode 100644
index 0000000..8e3862b
--- /dev/null
+++ b/llvm/test/Transforms/ConstraintElimination/implied-by-bounded-memory-access.ll
@@ -0,0 +1,373 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt -passes=constraint-elimination -S %s | FileCheck %s
+
+@g = private unnamed_addr constant [5 x i8] c"test\00"
+@g_overaligned = private unnamed_addr constant [5 x i8] c"test\00", align 8
+@g_external = external global [5 x i8]
+
+declare void @free(ptr allocptr noundef captures(none)) mustprogress nounwind willreturn allockind("free") memory(argmem: readwrite, inaccessiblemem: readwrite) "alloc-family"="malloc"
+declare ptr @malloc(i64) mustprogress nofree nounwind willreturn allockind("alloc,uninitialized") allocsize(0) memory(inaccessiblemem: readwrite) "alloc-family"="malloc"
+declare void @may_not_return(i1)
+
+define i8 @load_global(i64 %idx) {
+; CHECK-LABEL: define i8 @load_global(
+; CHECK-SAME: i64 [[IDX:%.*]]) {
+; CHECK-NEXT: [[GEP:%.*]] = getelementptr nuw i8, ptr @g, i64 [[IDX]]
+; CHECK-NEXT: [[LOAD:%.*]] = load i8, ptr [[GEP]], align 1
+; CHECK-NEXT: [[ZEXT:%.*]] = zext i1 true to i8
+; CHECK-NEXT: [[ADD:%.*]] = add i8 [[LOAD]], [[ZEXT]]
+; CHECK-NEXT: ret i8 [[ADD]]
+;
+ %gep = getelementptr nuw i8, ptr @g, i64 %idx
+ %load = load i8, ptr %gep
+ %cmp = icmp ult i64 %idx, 5
+ %zext = zext i1 %cmp to i8
+ %add = add i8 %load, %zext
+ ret i8 %add
+}
+
+define i8 @load_global_const_offset(i64 %idx) {
+; CHECK-LABEL: define i8 @load_global_const_offset(
+; CHECK-SAME: i64 [[IDX:%.*]]) {
+; CHECK-NEXT: [[GEP1:%.*]] = getelementptr nuw i8, ptr @g, i64 1
+; CHECK-NEXT: [[GEP:%.*]] = getelementptr nuw i8, ptr [[GEP1]], i64 [[IDX]]
+; CHECK-NEXT: [[LOAD:%.*]] = load i8, ptr [[GEP]], align 1
+; CHECK-NEXT: [[ZEXT:%.*]] = zext i1 true to i8
+; CHECK-NEXT: [[ADD:%.*]] = add i8 [[LOAD]], [[ZEXT]]
+; CHECK-NEXT: ret i8 [[ADD]]
+;
+ %gep1 = getelementptr nuw i8, ptr @g, i64 1
+ %gep = getelementptr nuw i8, ptr %gep1, i64 %idx
+ %load = load i8, ptr %gep
+ %cmp = icmp ult i64 %idx, 4
+ %zext = zext i1 %cmp to i8
+ %add = add i8 %load, %zext
+ ret i8 %add
+}
+
+define i8 @load_global_atomic(i64 %idx) {
+; CHECK-LABEL: define i8 @load_global_atomic(
+; CHECK-SAME: i64 [[IDX:%.*]]) {
+; CHECK-NEXT: [[GEP:%.*]] = getelementptr nuw i8, ptr @g, i64 [[IDX]]
+; CHECK-NEXT: [[LOAD:%.*]] = load atomic i8, ptr [[GEP]] unordered, align 1
+; CHECK-NEXT: [[ZEXT:%.*]] = zext i1 true to i8
+; CHECK-NEXT: [[ADD:%.*]] = add i8 [[LOAD]], [[ZEXT]]
+; CHECK-NEXT: ret i8 [[ADD]]
+;
+ %gep = getelementptr nuw i8, ptr @g, i64 %idx
+ %load = load atomic i8, ptr %gep unordered, align 1
+ %cmp = icmp ult i64 %idx, 5
+ %zext = zext i1 %cmp to i8
+ %add = add i8 %load, %zext
+ ret i8 %add
+}
+
+define i1 @store_global(i64 %idx) {
+; CHECK-LABEL: define i1 @store_global(
+; CHECK-SAME: i64 [[IDX:%.*]]) {
+; CHECK-NEXT: [[GEP:%.*]] = getelementptr nuw i8, ptr @g, i64 [[IDX]]
+; CHECK-NEXT: store i8 0, ptr [[GEP]], align 1
+; CHECK-NEXT: ret i1 true
+;
+ %gep = getelementptr nuw i8, ptr @g, i64 %idx
+ store i8 0, ptr %gep
+ %cmp = icmp ult i64 %idx, 5
+ ret i1 %cmp
+}
+
+define i1 @store_global_atomic(i64 %idx) {
+; CHECK-LABEL: define i1 @store_global_atomic(
+; CHECK-SAME: i64 [[IDX:%.*]]) {
+; CHECK-NEXT: [[GEP:%.*]] = getelementptr nuw i8, ptr @g, i64 [[IDX]]
+; CHECK-NEXT: store atomic i8 0, ptr [[GEP]] release, align 1
+; CHECK-NEXT: ret i1 true
+;
+ %gep = getelementptr nuw i8, ptr @g, i64 %idx
+ store atomic i8 0, ptr %gep release, align 1
+ %cmp = icmp ult i64 %idx, 5
+ ret i1 %cmp
+}
+
+define i8 @load_byval(ptr byval([5 x i8]) %p, i64 %idx) {
+; CHECK-LABEL: define i8 @load_byval(
+; CHECK-SAME: ptr byval([5 x i8]) [[P:%.*]], i64 [[IDX:%.*]]) {
+; CHECK-NEXT: [[GEP:%.*]] = getelementptr nuw i8, ptr [[P]], i64 [[IDX]]
+; CHECK-NEXT: [[LOAD:%.*]] = load i8, ptr [[GEP]], align 1
+; CHECK-NEXT: [[ZEXT:%.*]] = zext i1 true to i8
+; CHECK-NEXT: [[ADD:%.*]] = add i8 [[LOAD]], [[ZEXT]]
+; CHECK-NEXT: ret i8 [[ADD]]
+;
+ %gep = getelementptr nuw i8, ptr %p, i64 %idx
+ %load = load i8, ptr %gep
+ %cmp = icmp ult i64 %idx, 5
+ %zext = zext i1 %cmp to i8
+ %add = add i8 %load, %zext
+ ret i8 %add
+}
+
+define i8 @load_alloca(i64 %idx) {
+; CHECK-LABEL: define i8 @load_alloca(
+; CHECK-SAME: i64 [[IDX:%.*]]) {
+; CHECK-NEXT: [[ALLOC:%.*]] = alloca [5 x i8], align 1
+; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr [[ALLOC]], ptr @g, i64 5, i1 false)
+; CHECK-NEXT: [[GEP:%.*]] = getelementptr nuw i8, ptr [[ALLOC]], i64 [[IDX]]
+; CHECK-NEXT: [[LOAD:%.*]] = load i8, ptr [[GEP]], align 1
+; CHECK-NEXT: [[ZEXT:%.*]] = zext i1 true to i8
+; CHECK-NEXT: [[ADD:%.*]] = add i8 [[LOAD]], [[ZEXT]]
+; CHECK-NEXT: ret i8 [[ADD]]
+;
+ %alloc = alloca [5 x i8], align 1
+ call void @llvm.memcpy.p0.p0.i64(ptr %alloc, ptr @g, i64 5, i1 false)
+ %gep = getelementptr nuw i8, ptr %alloc, i64 %idx
+ %load = load i8, ptr %gep
+ %cmp = icmp ult i64 %idx, 5
+ %zext = zext i1 %cmp to i8
+ %add = add i8 %load, %zext
+ ret i8 %add
+}
+
+define i8 @load_malloc(i64 %idx) {
+; CHECK-LABEL: define i8 @load_malloc(
+; CHECK-SAME: i64 [[IDX:%.*]]) {
+; CHECK-NEXT: [[ALLOC:%.*]] = call ptr @malloc(i64 5)
+; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr [[ALLOC]], ptr @g, i64 5, i1 false)
+; CHECK-NEXT: [[GEP:%.*]] = getelementptr nuw i8, ptr [[ALLOC]], i64 [[IDX]]
+; CHECK-NEXT: [[LOAD:%.*]] = load i8, ptr [[GEP]], align 1
+; CHECK-NEXT: [[ZEXT:%.*]] = zext i1 true to i8
+; CHECK-NEXT: [[ADD:%.*]] = add i8 [[LOAD]], [[ZEXT]]
+; CHECK-NEXT: call void @free(ptr [[ALLOC]])
+; CHECK-NEXT: ret i8 [[ADD]]
+;
+ %alloc = call ptr @malloc(i64 5)
+ call void @llvm.memcpy.p0.p0.i64(ptr %alloc, ptr @g, i64 5, i1 false)
+ %gep = getelementptr nuw i8, ptr %alloc, i64 %idx
+ %load = load i8, ptr %gep
+ %cmp = icmp ult i64 %idx, 5
+ %zext = zext i1 %cmp to i8
+ %add = add i8 %load, %zext
+ call void @free(ptr %alloc)
+ ret i8 %add
+}
+
+define i32 @load_byval_i32(ptr byval([10 x i8]) %p, i64 %idx) {
+; CHECK-LABEL: define i32 @load_byval_i32(
+; CHECK-SAME: ptr byval([10 x i8]) [[P:%.*]], i64 [[IDX:%.*]]) {
+; CHECK-NEXT: [[GEP:%.*]] = getelementptr nuw i8, ptr [[P]], i64 [[IDX]]
+; CHECK-NEXT: [[LOAD:%.*]] = load i32, ptr [[GEP]], align 4
+; CHECK-NEXT: [[ZEXT:%.*]] = zext i1 true to i32
+; CHECK-NEXT: [[ADD:%.*]] = add i32 [[LOAD]], [[ZEXT]]
+; CHECK-NEXT: ret i32 [[ADD]]
+;
+ %gep = getelementptr nuw i8, ptr %p, i64 %idx
+ %load = load i32, ptr %gep
+ %cmp = icmp ult i64 %idx, 7
+ %zext = zext i1 %cmp to i32
+ %add = add i32 %load, %zext
+ ret i32 %add
+}
+
+define i8 @load_global_may_noreturn_dom_bb(i64 %idx) {
+; CHECK-LABEL: define i8 @load_global_may_noreturn_dom_bb(
+; CHECK-SAME: i64 [[IDX:%.*]]) {
+; CHECK-NEXT: [[GEP:%.*]] = getelementptr nuw i8, ptr @g, i64 [[IDX]]
+; CHECK-NEXT: [[CMP1:%.*]] = icmp ult i64 [[IDX]], 5
+; CHECK-NEXT: call void @may_not_return(i1 [[CMP1]])
+; CHECK-NEXT: [[LOAD:%.*]] = load i8, ptr [[GEP]], align 1
+; CHECK-NEXT: br label %[[NEXT:.*]]
+; CHECK: [[NEXT]]:
+; CHECK-NEXT: [[ZEXT:%.*]] = zext i1 true to i8
+; CHECK-NEXT: [[ADD:%.*]] = add i8 [[LOAD]], [[ZEXT]]
+; CHECK-NEXT: ret i8 [[ADD]]
+;
+ %gep = getelementptr nuw i8, ptr @g, i64 %idx
+ %cmp1 = icmp ult i64 %idx, 5
+ call void @may_not_return(i1 %cmp1) ; %cmp1 should not be simplified.
+ %load = load i8, ptr %gep
+ br label %next
+
+next:
+ %cmp2 = icmp ult i64 %idx, 5
+ %zext = zext i1 %cmp2 to i8
+ %add = add i8 %load, %zext
+ ret i8 %add
+}
+
+; Negative tests.
+
+define i8 @load_global_overaligned(i64 %idx) {
+; CHECK-LABEL: define i8 @load_global_overaligned(
+; CHECK-SAME: i64 [[IDX:%.*]]) {
+; CHECK-NEXT: [[GEP:%.*]] = getelementptr nuw i8, ptr @g_overaligned, i64 [[IDX]]
+; CHECK-NEXT: [[LOAD:%.*]] = load i8, ptr [[GEP]], align 1
+; CHECK-NEXT: [[CMP:%.*]] = icmp ult i64 [[IDX]], 5
+; CHECK-NEXT: [[ZEXT:%.*]] = zext i1 [[CMP]] to i8
+; CHECK-NEXT: [[ADD:%.*]] = add i8 [[LOAD]], [[ZEXT]]
+; CHECK-NEXT: ret i8 [[ADD]]
+;
+ %gep = getelementptr nuw i8, ptr @g_overaligned, i64 %idx
+ %load = load i8, ptr %gep
+ %cmp = icmp ult i64 %idx, 5
+ %zext = zext i1 %cmp to i8
+ %add = add i8 %load, %zext
+ ret i8 %add
+}
+
+define i8 @load_global_external(i64 %idx) {
+; CHECK-LABEL: define i8 @load_global_external(
+; CHECK-SAME: i64 [[IDX:%.*]]) {
+; CHECK-NEXT: [[GEP:%.*]] = getelementptr nuw i8, ptr @g_external, i64 [[IDX]]
+; CHECK-NEXT: [[LOAD:%.*]] = load i8, ptr [[GEP]], align 1
+; CHECK-NEXT: [[CMP:%.*]] = icmp ult i64 [[IDX]], 5
+; CHECK-NEXT: [[ZEXT:%.*]] = zext i1 [[CMP]] to i8
+; CHECK-NEXT: [[ADD:%.*]] = add i8 [[LOAD]], [[ZEXT]]
+; CHECK-NEXT: ret i8 [[ADD]]
+;
+ %gep = getelementptr nuw i8, ptr @g_external, i64 %idx
+ %load = load i8, ptr %gep
+ %cmp = icmp ult i64 %idx, 5
+ %zext = zext i1 %cmp to i8
+ %add = add i8 %load, %zext
+ ret i8 %add
+}
+
+define i8 @load_from_non_gep(ptr %p, i64 %idx) {
+; CHECK-LABEL: define i8 @load_from_non_gep(
+; CHECK-SAME: ptr [[P:%.*]], i64 [[IDX:%.*]]) {
+; CHECK-NEXT: [[LOAD:%.*]] = load i8, ptr [[P]], align 1
+; CHECK-NEXT: [[CMP:%.*]] = icmp ult i64 [[IDX]], 5
+; CHECK-NEXT: [[ZEXT:%.*]] = zext i1 [[CMP]] to i8
+; CHECK-NEXT: [[ADD:%.*]] = add i8 [[LOAD]], [[ZEXT]]
+; CHECK-NEXT: ret i8 [[ADD]]
+;
+ %load = load i8, ptr %p
+ %cmp = icmp ult i64 %idx, 5
+ %zext = zext i1 %cmp to i8
+ %add = add i8 %load, %zext
+ ret i8 %add
+}
+
+define i8 @load_global_multi_indices(i64 %idx1, i64 %idx2) {
+; CHECK-LABEL: define i8 @load_global_multi_indices(
+; CHECK-SAME: i64 [[IDX1:%.*]], i64 [[IDX2:%.*]]) {
+; CHECK-NEXT: [[GEP1:%.*]] = getelementptr nuw i8, ptr @g, i64 [[IDX1]]
+; CHECK-NEXT: [[GEP2:%.*]] = getelementptr nuw i8, ptr [[GEP1]], i64 [[IDX2]]
+; CHECK-NEXT: [[LOAD:%.*]] = load i8, ptr [[GEP2]], align 1
+; CHECK-NEXT: [[CMP:%.*]] = icmp ult i64 [[IDX1]], 5
+; CHECK-NEXT: [[ZEXT:%.*]] = zext i1 [[CMP]] to i8
+; CHECK-NEXT: [[ADD:%.*]] = add i8 [[LOAD]], [[ZEXT]]
+; CHECK-NEXT: ret i8 [[ADD]]
+;
+ %gep1 = getelementptr nuw i8, ptr @g, i64 %idx1
+ %gep2 = getelementptr nuw i8, ptr %gep1, i64 %idx2
+ %load = load i8, ptr %gep2
+ %cmp = icmp ult i64 %idx1, 5
+ %zext = zext i1 %cmp to i8
+ %add = add i8 %load, %zext
+ ret i8 %add
+}
+
+define i8 @load_global_without_nuw(i64 %idx) {
+; CHECK-LABEL: define i8 @load_global_without_nuw(
+; CHECK-SAME: i64 [[IDX:%.*]]) {
+; CHECK-NEXT: [[GEP:%.*]] = getelementptr i8, ptr @g, i64 [[IDX]]
+; CHECK-NEXT: [[LOAD:%.*]] = load i8, ptr [[GEP]], align 1
+; CHECK-NEXT: [[CMP:%.*]] = icmp ult i64 [[IDX]], 5
+; CHECK-NEXT: [[ZEXT:%.*]] = zext i1 [[CMP]] to i8
+; CHECK-NEXT: [[ADD:%.*]] = add i8 [[LOAD]], [[ZEXT]]
+; CHECK-NEXT: ret i8 [[ADD]]
+;
+ %gep = getelementptr i8, ptr @g, i64 %idx
+ %load = load i8, ptr %gep
+ %cmp = icmp ult i64 %idx, 5
+ %zext = zext i1 %cmp to i8
+ %add = add i8 %load, %zext
+ ret i8 %add
+}
+
+define i32 @load_byval_i32_smaller_range(ptr byval([10 x i8]) %p, i64 %idx) {
+; CHECK-LABEL: define i32 @load_byval_i32_smaller_range(
+; CHECK-SAME: ptr byval([10 x i8]) [[P:%.*]], i64 [[IDX:%.*]]) {
+; CHECK-NEXT: [[GEP:%.*]] = getelementptr nuw i8, ptr [[P]], i64 [[IDX]]
+; CHECK-NEXT: [[LOAD:%.*]] = load i32, ptr [[GEP]], align 4
+; CHECK-NEXT: [[CMP:%.*]] = icmp ult i64 [[IDX]], 6
+; CHECK-NEXT: [[ZEXT:%.*]] = zext i1 [[CMP]] to i32
+; CHECK-NEXT: [[ADD:%.*]] = add i32 [[LOAD]], [[ZEXT]]
+; CHECK-NEXT: ret i32 [[ADD]]
+;
+ %gep = getelementptr nuw i8, ptr %p, i64 %idx
+ %load = load i32, ptr %gep
+ %cmp = icmp ult i64 %idx, 6
+ %zext = zext i1 %cmp to i32
+ %add = add i32 %load, %zext
+ ret i32 %add
+}
+
+define i8 @load_global_volatile(i64 %idx) {
+; CHECK-LABEL: define i8 @load_global_volatile(
+; CHECK-SAME: i64 [[IDX:%.*]]) {
+; CHECK-NEXT: [[GEP:%.*]] = getelementptr nuw i8, ptr @g, i64 [[IDX]]
+; CHECK-NEXT: [[LOAD:%.*]] = load volatile i8, ptr [[GEP]], align 1
+; CHECK-NEXT: [[CMP:%.*]] = icmp ult i64 [[IDX]], 5
+; CHECK-NEXT: [[ZEXT:%.*]] = zext i1 [[CMP]] to i8
+; CHECK-NEXT: [[ADD:%.*]] = add i8 [[LOAD]], [[ZEXT]]
+; CHECK-NEXT: ret i8 [[ADD]]
+;
+ %gep = getelementptr nuw i8, ptr @g, i64 %idx
+ %load = load volatile i8, ptr %gep
+ %cmp = icmp ult i64 %idx, 5
+ %zext = zext i1 %cmp to i8
+ %add = add i8 %load, %zext
+ ret i8 %add
+}
+
+define i1 @store_global_volatile(i64 %idx) {
+; CHECK-LABEL: define i1 @store_global_volatile(
+; CHECK-SAME: i64 [[IDX:%.*]]) {
+; CHECK-NEXT: [[GEP:%.*]] = getelementptr nuw i8, ptr @g, i64 [[IDX]]
+; CHECK-NEXT: store volatile i8 0, ptr [[GEP]], align 1
+; CHECK-NEXT: [[CMP:%.*]] = icmp ult i64 [[IDX]], 5
+; CHECK-NEXT: ret i1 [[CMP]]
+;
+ %gep = getelementptr nuw i8, ptr @g, i64 %idx
+ store volatile i8 0, ptr %gep
+ %cmp = icmp ult i64 %idx, 5
+ ret i1 %cmp
+}
+
+define i8 @load_global_vscale(i64 %idx) {
+; CHECK-LABEL: define i8 @load_global_vscale(
+; CHECK-SAME: i64 [[IDX:%.*]]) {
+; CHECK-NEXT: [[GEP:%.*]] = getelementptr nuw i8, ptr @g, i64 [[IDX]]
+; CHECK-NEXT: [[LOAD:%.*]] = load <vscale x 1 x i8>, ptr [[GEP]], align 1
+; CHECK-NEXT: [[EXT:%.*]] = extractelement <vscale x 1 x i8> [[LOAD]], i64 0
+; CHECK-NEXT: [[CMP:%.*]] = icmp ult i64 [[IDX]], 5
+; CHECK-NEXT: [[ZEXT:%.*]] = zext i1 [[CMP]] to i8
+; CHECK-NEXT: [[ADD:%.*]] = add i8 [[EXT]], [[ZEXT]]
+; CHECK-NEXT: ret i8 [[ADD]]
+;
+ %gep = getelementptr nuw i8, ptr @g, i64 %idx
+ %load = load <vscale x 1 x i8>, ptr %gep
+ %ext = extractelement <vscale x 1 x i8> %load, i64 0
+ %cmp = icmp ult i64 %idx, 5
+ %zext = zext i1 %cmp to i8
+ %add = add i8 %ext, %zext
+ ret i8 %add
+}
+
+define i8 @load_from_null(i64 %idx) {
+; CHECK-LABEL: define i8 @load_from_null(
+; CHECK-SAME: i64 [[IDX:%.*]]) {
+; CHECK-NEXT: [[GEP:%.*]] = getelementptr nuw i8, ptr null, i64 [[IDX]]
+; CHECK-NEXT: [[LOAD:%.*]] = load i8, ptr [[GEP]], align 1
+; CHECK-NEXT: [[CMP:%.*]] = icmp ult i64 [[IDX]], 5
+; CHECK-NEXT: [[ZEXT:%.*]] = zext i1 [[CMP]] to i8
+; CHECK-NEXT: [[ADD:%.*]] = add i8 [[LOAD]], [[ZEXT]]
+; CHECK-NEXT: ret i8 [[ADD]]
+;
+ %gep = getelementptr nuw i8, ptr null, i64 %idx
+ %load = load i8, ptr %gep
+ %cmp = icmp ult i64 %idx, 5
+ %zext = zext i1 %cmp to i8
+ %add = add i8 %load, %zext
+ ret i8 %add
+}
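
To make the derived bound concrete, here is the arithmetic getConstraintFromMemoryAccess
performs for the load_byval_i32 test above (values taken from that test; the names follow
the comment in the patch): AllocSize = 10, AccessSize = 4, ConstOffset = 0, Scale = 1, so

    MaxIndex = (AllocSize - AccessSize - ConstOffset) udiv Scale
             = (10 - 4 - 0) udiv 1
             = 6    ==>  recorded fact: %idx u<= 6

This is why the dominated check `icmp ult i64 %idx, 7` in load_byval_i32 folds to true,
while the stricter `icmp ult i64 %idx, 6` in load_byval_i32_smaller_range is left alone.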