aboutsummaryrefslogtreecommitdiff
path: root/llvm
diff options
context:
space:
mode:
Diffstat (limited to 'llvm')
-rw-r--r--llvm/include/llvm/ExecutionEngine/Orc/Shared/SymbolFilter.h173
-rw-r--r--llvm/include/llvm/ExecutionEngine/Orc/TargetProcess/LibraryResolver.h511
-rw-r--r--llvm/include/llvm/ExecutionEngine/Orc/TargetProcess/LibraryScanner.h474
-rw-r--r--llvm/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp20
-rw-r--r--llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h2
-rw-r--r--llvm/lib/DebugInfo/CodeView/LazyRandomTypeCollection.cpp3
-rw-r--r--llvm/lib/ExecutionEngine/Orc/TargetProcess/CMakeLists.txt4
-rw-r--r--llvm/lib/ExecutionEngine/Orc/TargetProcess/LibraryResolver.cpp370
-rw-r--r--llvm/lib/ExecutionEngine/Orc/TargetProcess/LibraryScanner.cpp1161
-rw-r--r--llvm/lib/Support/Windows/Signals.inc7
-rw-r--r--llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp28
-rw-r--r--llvm/lib/Target/CSKY/CSKYISelLowering.cpp2
-rw-r--r--llvm/lib/Target/M68k/MCTargetDesc/M68kAsmBackend.cpp2
-rw-r--r--llvm/lib/Target/X86/X86ISelLowering.cpp156
-rw-r--r--llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h3
-rw-r--r--llvm/lib/Transforms/Vectorize/LoopVectorize.cpp18
-rw-r--r--llvm/lib/Transforms/Vectorize/VPlan.h24
-rw-r--r--llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp7
-rw-r--r--llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp48
-rw-r--r--llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp16
-rw-r--r--llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.abs.ll199
-rw-r--r--llvm/test/CodeGen/AMDGPU/wait-xcnt.mir176
-rw-r--r--llvm/test/CodeGen/LoongArch/lasx/ctpop-ctlz.ll63
-rw-r--r--llvm/test/CodeGen/LoongArch/lsx/ctpop-ctlz.ll63
-rw-r--r--llvm/test/CodeGen/LoongArch/sink-fold-addi.ll758
-rw-r--r--llvm/test/CodeGen/RISCV/rv64-stackmap.ll108
-rw-r--r--llvm/test/CodeGen/X86/bittest-big-integer.ll6886
-rwxr-xr-xllvm/test/DebugInfo/PDB/Native/pdb-native-index-overflow.test13
-rw-r--r--llvm/test/Transforms/InstCombine/vec_extract_var_elt-inseltpoison.ll26
-rw-r--r--llvm/test/Transforms/InstCombine/vec_extract_var_elt.ll73
-rw-r--r--llvm/test/tools/UpdateTestChecks/update_test_checks/Inputs/check_empty.ll29
-rw-r--r--llvm/test/tools/UpdateTestChecks/update_test_checks/Inputs/check_empty.ll.expected57
-rw-r--r--llvm/test/tools/UpdateTestChecks/update_test_checks/Inputs/switch_case.ll.expected36
-rw-r--r--llvm/test/tools/UpdateTestChecks/update_test_checks/check_empty.test3
-rw-r--r--llvm/unittests/ExecutionEngine/Orc/CMakeLists.txt1
-rw-r--r--llvm/unittests/ExecutionEngine/Orc/Inputs/A/A_linux.yaml460
-rw-r--r--llvm/unittests/ExecutionEngine/Orc/Inputs/A/A_macho.yaml723
-rw-r--r--llvm/unittests/ExecutionEngine/Orc/Inputs/B/B_linux.yaml460
-rw-r--r--llvm/unittests/ExecutionEngine/Orc/Inputs/B/B_macho.yaml723
-rw-r--r--llvm/unittests/ExecutionEngine/Orc/Inputs/C/C_linux.yaml450
-rw-r--r--llvm/unittests/ExecutionEngine/Orc/Inputs/C/C_macho.yaml870
-rw-r--r--llvm/unittests/ExecutionEngine/Orc/Inputs/Z/Z_linux.yaml460
-rw-r--r--llvm/unittests/ExecutionEngine/Orc/Inputs/Z/Z_macho.yaml723
-rw-r--r--llvm/unittests/ExecutionEngine/Orc/LibraryResolverTest.cpp764
-rw-r--r--llvm/unittests/Transforms/Vectorize/VPlanVerifierTest.cpp15
-rw-r--r--llvm/utils/UpdateTestChecks/common.py23
46 files changed, 16413 insertions, 778 deletions
diff --git a/llvm/include/llvm/ExecutionEngine/Orc/Shared/SymbolFilter.h b/llvm/include/llvm/ExecutionEngine/Orc/Shared/SymbolFilter.h
new file mode 100644
index 0000000..5170893
--- /dev/null
+++ b/llvm/include/llvm/ExecutionEngine/Orc/Shared/SymbolFilter.h
@@ -0,0 +1,173 @@
+//===- SymbolFilter.h - Utilities for Symbol Filtering ---------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_EXECUTIONENGINE_ORC_SHARED_SYMBOLFILTER_H
+#define LLVM_EXECUTIONENGINE_ORC_SHARED_SYMBOLFILTER_H
+
+#include "llvm/ExecutionEngine/Orc/Shared/SimplePackedSerialization.h"
+
+#include <cmath>
+#include <type_traits>
+#include <vector>
+
+namespace llvm {
+namespace orc {
+
+namespace shared {
+using SPSBloomFilter =
+ SPSTuple<bool, uint32_t, uint32_t, uint32_t, SPSSequence<uint64_t>>;
+}
+
+class BloomFilter {
+public:
+ using HashFunc = std::function<uint32_t(StringRef)>;
+
+ BloomFilter() = default;
+ BloomFilter(BloomFilter &&) noexcept = default;
+ BloomFilter &operator=(BloomFilter &&) noexcept = default;
+ BloomFilter(const BloomFilter &) = delete;
+ BloomFilter &operator=(const BloomFilter &) = delete;
+
+ BloomFilter(uint32_t SymbolCount, float FalsePositiveRate, HashFunc hashFn)
+ : HashFn(std::move(hashFn)) {
+ initialize(SymbolCount, FalsePositiveRate);
+ }
+ bool isInitialized() const { return Initialized; }
+
+ void add(StringRef Sym) {
+ assert(Initialized);
+ addHash(HashFn(Sym));
+ }
+
+ bool mayContain(StringRef Sym) const {
+ return !isEmpty() && testHash(HashFn(Sym));
+ }
+
+ bool isEmpty() const { return SymbolCount == 0; }
+
+private:
+ friend class shared::SPSSerializationTraits<shared::SPSBloomFilter,
+ BloomFilter>;
+ static constexpr uint32_t BitsPerEntry = 64;
+
+ bool Initialized = false;
+ uint32_t SymbolCount = 0;
+ uint32_t BloomSize = 0;
+ uint32_t BloomShift = 0;
+ std::vector<uint64_t> BloomTable;
+ HashFunc HashFn;
+
+ void initialize(uint32_t SymCount, float FalsePositiveRate) {
+ assert(SymCount > 0);
+ SymbolCount = SymCount;
+ Initialized = true;
+
+ float ln2 = std::log(2.0f);
+ float M = -1.0f * SymbolCount * std::log(FalsePositiveRate) / (ln2 * ln2);
+ BloomSize = static_cast<uint32_t>(std::ceil(M / BitsPerEntry));
+ BloomShift = std::min(6u, log2ceil(SymbolCount));
+ BloomTable.resize(BloomSize, 0);
+ }
+
+ void addHash(uint32_t Hash) {
+ uint32_t Hash2 = Hash >> BloomShift;
+ uint32_t N = (Hash / BitsPerEntry) % BloomSize;
+ uint64_t Mask =
+ (1ULL << (Hash % BitsPerEntry)) | (1ULL << (Hash2 % BitsPerEntry));
+ BloomTable[N] |= Mask;
+ }
+
+ bool testHash(uint32_t Hash) const {
+ uint32_t Hash2 = Hash >> BloomShift;
+ uint32_t N = (Hash / BitsPerEntry) % BloomSize;
+ uint64_t Mask =
+ (1ULL << (Hash % BitsPerEntry)) | (1ULL << (Hash2 % BitsPerEntry));
+ return (BloomTable[N] & Mask) == Mask;
+ }
+
+ static constexpr uint32_t log2ceil(uint32_t V) {
+ return V <= 1 ? 0 : 32 - countl_zero(V - 1);
+ }
+};
+
+class BloomFilterBuilder {
+public:
+ using HashFunc = BloomFilter::HashFunc;
+
+ BloomFilterBuilder() = default;
+
+ BloomFilterBuilder &setFalsePositiveRate(float Rate) {
+ assert(Rate > 0.0f && Rate < 1.0f);
+ FalsePositiveRate = Rate;
+ return *this;
+ }
+
+ BloomFilterBuilder &setHashFunction(HashFunc Fn) {
+ HashFn = std::move(Fn);
+ return *this;
+ }
+
+ BloomFilter build(ArrayRef<StringRef> Symbols) const {
+ assert(!Symbols.empty() && "Cannot build filter from empty symbol list.");
+ BloomFilter F(static_cast<uint32_t>(Symbols.size()), FalsePositiveRate,
+ HashFn);
+ for (const auto &Sym : Symbols)
+ F.add(Sym);
+
+ return F;
+ }
+
+private:
+ float FalsePositiveRate = 0.02f;
+ HashFunc HashFn = [](StringRef S) -> uint32_t {
+ uint32_t H = 5381;
+ for (char C : S)
+ H = ((H << 5) + H) + static_cast<uint8_t>(C); // H * 33 + C
+ return H;
+ };
+};
+
+namespace shared {
+
+template <> class SPSSerializationTraits<SPSBloomFilter, BloomFilter> {
+public:
+ static size_t size(const BloomFilter &Filter) {
+ return SPSBloomFilter::AsArgList::size(
+ Filter.Initialized, Filter.SymbolCount, Filter.BloomSize,
+ Filter.BloomShift, Filter.BloomTable);
+ }
+
+ static bool serialize(SPSOutputBuffer &OB, const BloomFilter &Filter) {
+ return SPSBloomFilter::AsArgList::serialize(
+ OB, Filter.Initialized, Filter.SymbolCount, Filter.BloomSize,
+ Filter.BloomShift, Filter.BloomTable);
+ }
+
+ static bool deserialize(SPSInputBuffer &IB, BloomFilter &Filter) {
+ bool IsInitialized;
+ uint32_t SymbolCount = 0, BloomSize = 0, BloomShift = 0;
+ std::vector<uint64_t> BloomTable;
+
+ if (!SPSBloomFilter::AsArgList::deserialize(
+ IB, IsInitialized, SymbolCount, BloomSize, BloomShift, BloomTable))
+ return false;
+
+ Filter.Initialized = IsInitialized;
+ Filter.SymbolCount = SymbolCount;
+ Filter.BloomSize = BloomSize;
+ Filter.BloomShift = BloomShift;
+ Filter.BloomTable = std::move(BloomTable);
+
+ return true;
+ }
+};
+
+} // end namespace shared
+} // end namespace orc
+} // end namespace llvm
+#endif // LLVM_EXECUTIONENGINE_ORC_SHARED_SYMBOLFILTER_H
diff --git a/llvm/include/llvm/ExecutionEngine/Orc/TargetProcess/LibraryResolver.h b/llvm/include/llvm/ExecutionEngine/Orc/TargetProcess/LibraryResolver.h
new file mode 100644
index 0000000..9182995
--- /dev/null
+++ b/llvm/include/llvm/ExecutionEngine/Orc/TargetProcess/LibraryResolver.h
@@ -0,0 +1,511 @@
+//===- LibraryResolver.h - Automatic Library Symbol Resolution -*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file provides support for automatically searching symbols across
+// dynamic libraries that have not yet been loaded.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_EXECUTIONENGINE_ORC_TARGETPROCESS_LIBRARYRESOLVER_H
+#define LLVM_EXECUTIONENGINE_ORC_TARGETPROCESS_LIBRARYRESOLVER_H
+
+#include "llvm/ADT/FunctionExtras.h"
+#include "llvm/ExecutionEngine/Orc/Shared/SymbolFilter.h"
+#include "llvm/ExecutionEngine/Orc/TargetProcess/LibraryScanner.h"
+#include "llvm/Support/Path.h"
+
+#include <atomic>
+#include <shared_mutex>
+#include <unordered_map>
+
+namespace llvm {
+namespace orc {
+
+/// Manages library metadata and state for symbol resolution.
+///
+/// Tracks libraries by load state and kind (user/system), and stores
+/// associated Bloom filters and hash maps to speed up symbol lookups.
+/// Thread-safe for concurrent access.
+class LibraryManager {
+public:
+ enum class LibState : uint8_t { Unloaded = 0, Loaded = 1, Queried = 2 };
+
+ class LibraryInfo {
+ public:
+ LibraryInfo(const LibraryInfo &) = delete;
+ LibraryInfo &operator=(const LibraryInfo &) = delete;
+
+ LibraryInfo(std::string FilePath, LibState S, PathType K,
+ std::optional<BloomFilter> Filter = std::nullopt)
+ : FilePath(std::move(FilePath)), S(S), K(K), Filter(std::move(Filter)) {
+ }
+
+ StringRef getBasePath() const { return sys::path::parent_path(FilePath); }
+ StringRef getFileName() const { return sys::path::filename(FilePath); }
+
+ std::string getFullPath() const { return FilePath; }
+
+ void setFilter(BloomFilter F) {
+ std::lock_guard<std::shared_mutex> Lock(Mtx);
+ if (Filter)
+ return;
+ Filter.emplace(std::move(F));
+ }
+
+ void ensureFilterBuilt(const BloomFilterBuilder &FB,
+ ArrayRef<StringRef> Symbols) {
+ std::lock_guard<std::shared_mutex> Lock(Mtx);
+ if (Filter)
+ return;
+ Filter.emplace(FB.build(Symbols));
+ }
+
+ bool mayContain(StringRef Symbol) const {
+ assert(hasFilter());
+ std::shared_lock<std::shared_mutex> Lock(Mtx);
+ return Filter->mayContain(Symbol);
+ }
+
+ bool hasFilter() const {
+ std::shared_lock<std::shared_mutex> Lock(Mtx);
+ return Filter.has_value();
+ }
+
+ LibState getState() const { return S.load(); }
+ PathType getKind() const { return K; }
+
+ void setState(LibState s) { S.store(s); }
+
+ bool operator==(const LibraryInfo &other) const {
+ return FilePath == other.FilePath;
+ }
+
+ private:
+ std::string FilePath;
+ std::atomic<LibState> S;
+ PathType K;
+ std::optional<BloomFilter> Filter;
+ mutable std::shared_mutex Mtx;
+ };
+
+ /// A read-only view of libraries filtered by state and kind.
+ ///
+ /// Lets you loop over only the libraries in a map that match a given State
+ /// and PathType.
+ class FilteredView {
+ public:
+ using Map = StringMap<std::shared_ptr<LibraryInfo>>;
+ using Iterator = typename Map::const_iterator;
+ class FilterIterator {
+ public:
+ FilterIterator(Iterator it_, Iterator end_, LibState S, PathType K)
+ : it(it_), end(end_), S(S), K(K) {
+ advance();
+ }
+
+ bool operator!=(const FilterIterator &other) const {
+ return it != other.it;
+ }
+
+ const std::shared_ptr<LibraryInfo> &operator*() const {
+ return it->second;
+ }
+
+ FilterIterator &operator++() {
+ ++it;
+ advance();
+ return *this;
+ }
+
+ private:
+ void advance() {
+ for (; it != end; ++it)
+ if (it->second->getState() == S && it->second->getKind() == K)
+ break;
+ }
+ Iterator it;
+ Iterator end;
+ LibState S;
+ PathType K;
+ };
+ FilteredView(Iterator begin, Iterator end, LibState s, PathType k)
+ : mapBegin(begin), mapEnd(end), state(s), kind(k) {}
+
+ FilterIterator begin() const {
+ return FilterIterator(mapBegin, mapEnd, state, kind);
+ }
+
+ FilterIterator end() const {
+ return FilterIterator(mapEnd, mapEnd, state, kind);
+ }
+
+ private:
+ Iterator mapBegin;
+ Iterator mapEnd;
+ LibState state;
+ PathType kind;
+ };
+
+private:
+ StringMap<std::shared_ptr<LibraryInfo>> Libraries;
+ mutable std::shared_mutex Mtx;
+
+public:
+ using LibraryVisitor = std::function<bool(const LibraryInfo &)>;
+
+ LibraryManager() = default;
+ ~LibraryManager() = default;
+
+ bool addLibrary(std::string Path, PathType Kind,
+ std::optional<BloomFilter> Filter = std::nullopt) {
+ std::unique_lock<std::shared_mutex> Lock(Mtx);
+ if (Libraries.count(Path) > 0)
+ return false;
+ Libraries.insert({std::move(Path),
+ std::make_shared<LibraryInfo>(Path, LibState::Unloaded,
+ Kind, std::move(Filter))});
+ return true;
+ }
+
+ bool hasLibrary(StringRef Path) const {
+ std::shared_lock<std::shared_mutex> Lock(Mtx);
+ if (Libraries.count(Path) > 0)
+ return true;
+ return false;
+ }
+
+ void removeLibrary(StringRef Path) {
+ std::unique_lock<std::shared_mutex> Lock(Mtx);
+ auto I = Libraries.find(Path);
+ if (I == Libraries.end())
+ return;
+ Libraries.erase(I);
+ }
+
+ void markLoaded(StringRef Path) {
+ std::unique_lock<std::shared_mutex> Lock(Mtx);
+ if (auto It = Libraries.find(Path); It != Libraries.end())
+ It->second->setState(LibState::Loaded);
+ }
+
+ void markQueried(StringRef Path) {
+ std::unique_lock<std::shared_mutex> Lock(Mtx);
+ if (auto It = Libraries.find(Path); It != Libraries.end())
+ It->second->setState(LibState::Queried);
+ }
+
+ std::shared_ptr<LibraryInfo> getLibrary(StringRef Path) {
+ std::shared_lock<std::shared_mutex> Lock(Mtx);
+ if (auto It = Libraries.find(Path); It != Libraries.end())
+ return It->second;
+ return nullptr;
+ }
+
+ FilteredView getView(LibState S, PathType K) const {
+ std::shared_lock<std::shared_mutex> Lock(Mtx);
+ return FilteredView(Libraries.begin(), Libraries.end(), S, K);
+ }
+
+ void forEachLibrary(const LibraryVisitor &visitor) const {
+ std::unique_lock<std::shared_mutex> Lock(Mtx);
+ for (const auto &[_, entry] : Libraries) {
+ if (!visitor(*entry))
+ break;
+ }
+ }
+
+ bool isLoaded(StringRef Path) const {
+ std::unique_lock<std::shared_mutex> Lock(Mtx);
+ if (auto It = Libraries.find(Path.str()); It != Libraries.end())
+ return It->second->getState() == LibState::Loaded;
+ return false;
+ }
+
+ bool isQueried(StringRef Path) const {
+ std::unique_lock<std::shared_mutex> Lock(Mtx);
+ if (auto It = Libraries.find(Path.str()); It != Libraries.end())
+ return It->second->getState() == LibState::Queried;
+ return false;
+ }
+
+ void clear() {
+ std::unique_lock<std::shared_mutex> Lock(Mtx);
+ Libraries.clear();
+ }
+};
+
+using LibraryInfo = LibraryManager::LibraryInfo;
+
+struct SearchPlanEntry {
+ LibraryManager::LibState State; // Loaded, Queried, Unloaded
+ PathType Type; // User, System
+};
+
+struct SearchPolicy {
+ std::vector<SearchPlanEntry> Plan;
+
+ static SearchPolicy defaultPlan() {
+ return {{{LibraryManager::LibState::Loaded, PathType::User},
+ {LibraryManager::LibState::Queried, PathType::User},
+ {LibraryManager::LibState::Unloaded, PathType::User},
+ {LibraryManager::LibState::Loaded, PathType::System},
+ {LibraryManager::LibState::Queried, PathType::System},
+ {LibraryManager::LibState::Unloaded, PathType::System}}};
+ }
+};
+
+struct SymbolEnumeratorOptions {
+ enum Filter : uint32_t {
+ None = 0,
+ IgnoreUndefined = 1 << 0,
+ IgnoreWeak = 1 << 1,
+ IgnoreIndirect = 1 << 2,
+ IgnoreHidden = 1 << 3,
+ IgnoreNonGlobal = 1 << 4
+ };
+
+ static SymbolEnumeratorOptions defaultOptions() {
+ return {Filter::IgnoreUndefined | Filter::IgnoreWeak |
+ Filter::IgnoreIndirect};
+ }
+ uint32_t FilterFlags = Filter::None;
+};
+
+struct SearchConfig {
+ SearchPolicy Policy;
+ SymbolEnumeratorOptions Options;
+
+ SearchConfig()
+ : Policy(SearchPolicy::defaultPlan()), // default plan
+ Options(SymbolEnumeratorOptions::defaultOptions()) {}
+};
+
+/// Scans libraries and resolves Symbols across user and system paths.
+///
+/// Supports symbol enumeration and filtering via SymbolEnumerator, and tracks
+/// symbol resolution results through SymbolQuery. Thread-safe and uses
+/// LibraryScanHelper for efficient path resolution and caching.
+class LibraryResolver {
+ friend class LibraryResolutionDriver;
+
+public:
+ class SymbolEnumerator {
+ public:
+ enum class EnumerateResult { Continue, Stop, Error };
+
+ using OnEachSymbolFn = std::function<EnumerateResult(StringRef Sym)>;
+
+ static bool enumerateSymbols(StringRef Path, OnEachSymbolFn OnEach,
+ const SymbolEnumeratorOptions &Opts);
+ };
+
+ /// Tracks a set of symbols and the libraries where they are resolved.
+ ///
+ /// SymbolQuery is used to keep track of which symbols have been resolved
+ /// to which libraries. It supports concurrent read/write access using a
+ /// shared mutex, allowing multiple readers or a single writer at a time.
+ class SymbolQuery {
+ public:
+ /// Holds the result for a single symbol.
+ struct Result {
+ std::string Name;
+ std::string ResolvedLibPath;
+ };
+
+ private:
+ mutable std::shared_mutex Mtx;
+ StringMap<Result> Results;
+ std::atomic<size_t> ResolvedCount = 0;
+
+ public:
+ explicit SymbolQuery(const std::vector<std::string> &Symbols) {
+ for (const auto &s : Symbols) {
+ if (!Results.contains(s))
+ Results.insert({s, Result{s, ""}});
+ }
+ }
+
+ SmallVector<StringRef> getUnresolvedSymbols() const {
+ SmallVector<StringRef> Unresolved;
+ std::shared_lock<std::shared_mutex> Lock(Mtx);
+ for (const auto &[name, res] : Results) {
+ if (res.ResolvedLibPath.empty())
+ Unresolved.push_back(name);
+ }
+ return Unresolved;
+ }
+
+ void resolve(StringRef Sym, const std::string &LibPath) {
+ std::unique_lock<std::shared_mutex> Lock(Mtx);
+ auto It = Results.find(Sym);
+ if (It != Results.end() && It->second.ResolvedLibPath.empty()) {
+ It->second.ResolvedLibPath = LibPath;
+ ResolvedCount.fetch_add(1, std::memory_order_relaxed);
+ }
+ }
+
+ bool allResolved() const {
+ return ResolvedCount.load(std::memory_order_relaxed) == Results.size();
+ }
+
+ bool hasUnresolved() const {
+ return ResolvedCount.load(std::memory_order_relaxed) < Results.size();
+ }
+
+ std::optional<StringRef> getResolvedLib(StringRef Sym) const {
+ std::shared_lock<std::shared_mutex> Lock(Mtx);
+ auto It = Results.find(Sym);
+ if (It != Results.end() && !It->second.ResolvedLibPath.empty())
+ return StringRef(It->second.ResolvedLibPath);
+ return std::nullopt;
+ }
+
+ bool isResolved(StringRef Sym) const {
+ std::shared_lock<std::shared_mutex> Lock(Mtx);
+ auto It = Results.find(Sym.str());
+ return It != Results.end() && !It->second.ResolvedLibPath.empty();
+ }
+
+ std::vector<const Result *> getAllResults() const {
+ std::shared_lock<std::shared_mutex> Lock(Mtx);
+ std::vector<const Result *> Out;
+ Out.reserve(Results.size());
+ for (const auto &[_, res] : Results)
+ Out.push_back(&res);
+ return Out;
+ }
+ };
+
+ struct Setup {
+ std::vector<std::string> BasePaths;
+ std::shared_ptr<LibraryPathCache> Cache;
+ std::shared_ptr<PathResolver> PResolver;
+
+ size_t ScanBatchSize = 0;
+
+ LibraryScanner::ShouldScanFn ShouldScanCall = [](StringRef) {
+ return true;
+ };
+
+ BloomFilterBuilder FilterBuilder = BloomFilterBuilder();
+
+ static Setup
+ create(std::vector<std::string> BasePaths,
+ std::shared_ptr<LibraryPathCache> existingCache = nullptr,
+ std::shared_ptr<PathResolver> existingResolver = nullptr,
+ LibraryScanner::ShouldScanFn customShouldScan = nullptr) {
+ Setup S;
+ S.BasePaths = std::move(BasePaths);
+
+ S.Cache =
+ existingCache ? existingCache : std::make_shared<LibraryPathCache>();
+
+ S.PResolver = existingResolver ? existingResolver
+ : std::make_shared<PathResolver>(S.Cache);
+
+ if (customShouldScan)
+ S.ShouldScanCall = std::move(customShouldScan);
+
+ return S;
+ }
+ };
+
+ LibraryResolver() = delete;
+ explicit LibraryResolver(const Setup &S);
+ ~LibraryResolver() = default;
+
+ using OnSearchComplete = unique_function<void(SymbolQuery &)>;
+
+ void dump() {
+ int i = 0;
+ LibMgr.forEachLibrary([&](const LibraryInfo &Lib) -> bool {
+ dbgs() << ++i << ". Library Path : " << Lib.getFullPath() << " -> \n\t\t:"
+ << " ({Type : ("
+ << (Lib.getKind() == PathType::User ? "User" : "System")
+ << ") }, { State : "
+ << (Lib.getState() == LibraryManager::LibState::Loaded
+ ? "Loaded"
+ : "Unloaded")
+ << "})\n";
+ return true;
+ });
+ }
+
+ void searchSymbolsInLibraries(std::vector<std::string> &SymList,
+ OnSearchComplete OnComplete,
+ const SearchConfig &Config = SearchConfig());
+
+private:
+ bool scanLibrariesIfNeeded(PathType K, size_t BatchSize = 0);
+ void resolveSymbolsInLibrary(LibraryInfo &Lib, SymbolQuery &Q,
+ const SymbolEnumeratorOptions &Opts);
+ bool
+ symbolExistsInLibrary(const LibraryInfo &Lib, StringRef Sym,
+ std::vector<std::string> *MatchedSymbols = nullptr);
+
+ bool symbolExistsInLibrary(const LibraryInfo &Lib, StringRef SymName,
+ std::vector<std::string> *AllSymbols,
+ const SymbolEnumeratorOptions &Opts);
+
+ std::shared_ptr<LibraryPathCache> LibPathCache;
+ std::shared_ptr<PathResolver> LibPathResolver;
+ LibraryScanHelper ScanHelper;
+ BloomFilterBuilder FB;
+ LibraryManager LibMgr;
+ LibraryScanner::ShouldScanFn ShouldScanCall;
+ size_t scanBatchSize;
+};
+
+using SymbolEnumerator = LibraryResolver::SymbolEnumerator;
+using SymbolQuery = LibraryResolver::SymbolQuery;
+using EnumerateResult = SymbolEnumerator::EnumerateResult;
+
+class LibraryResolutionDriver {
+public:
+ static std::unique_ptr<LibraryResolutionDriver>
+ create(const LibraryResolver::Setup &S);
+
+ void addScanPath(const std::string &Path, PathType Kind);
+ bool markLibraryLoaded(StringRef Path);
+ bool markLibraryUnLoaded(StringRef Path);
+ bool isLibraryLoaded(StringRef Path) const {
+ return LR->LibMgr.isLoaded(Path);
+ }
+
+ void resetAll() {
+ LR->LibMgr.clear();
+ LR->ScanHelper.resetToScan();
+ LR->LibPathCache->clear();
+ }
+
+ void scanAll(size_t BatchSize = 0) {
+ LR->scanLibrariesIfNeeded(PathType::User, BatchSize);
+ LR->scanLibrariesIfNeeded(PathType::System, BatchSize);
+ }
+
+ void scan(PathType PK, size_t BatchSize = 0) {
+ LR->scanLibrariesIfNeeded(PK, BatchSize);
+ }
+
+ void resolveSymbols(std::vector<std::string> Symbols,
+ LibraryResolver::OnSearchComplete OnCompletion,
+ const SearchConfig &Config = SearchConfig());
+
+ ~LibraryResolutionDriver() = default;
+
+private:
+ LibraryResolutionDriver(std::unique_ptr<LibraryResolver> L)
+ : LR(std::move(L)) {}
+
+ std::unique_ptr<LibraryResolver> LR;
+};
+
+} // end namespace orc
+} // end namespace llvm
+
+#endif // LLVM_EXECUTIONENGINE_ORC_TARGETPROCESS_LIBRARYRESOLVER_H
diff --git a/llvm/include/llvm/ExecutionEngine/Orc/TargetProcess/LibraryScanner.h b/llvm/include/llvm/ExecutionEngine/Orc/TargetProcess/LibraryScanner.h
new file mode 100644
index 0000000..d1c2013
--- /dev/null
+++ b/llvm/include/llvm/ExecutionEngine/Orc/TargetProcess/LibraryScanner.h
@@ -0,0 +1,474 @@
+//===- LibraryScanner.h - Scanner for Shared Libraries ---------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file provides functionality for scanning dynamic (shared) libraries.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_EXECUTIONENGINE_ORC_TARGETPROCESS_LIBRARYSCANNER_H
+#define LLVM_EXECUTIONENGINE_ORC_TARGETPROCESS_LIBRARYSCANNER_H
+
+#include "llvm/ADT/FunctionExtras.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/ADT/StringSet.h"
+#include "llvm/Object/ObjectFile.h"
+#include "llvm/Support/Allocator.h"
+#include "llvm/Support/Error.h"
+#include "llvm/Support/StringSaver.h"
+
+#include <atomic>
+#include <mutex>
+#include <queue>
+#include <shared_mutex>
+#include <string>
+#include <unordered_map>
+#include <unordered_set>
+
+namespace llvm {
+namespace orc {
+
+class LibraryManager;
+
+class LibraryPathCache {
+ friend class PathResolver;
+
+public:
+ LibraryPathCache() = default;
+
+ void clear(bool isRealPathCache = false) {
+ std::unique_lock<std::shared_mutex> lock(Mtx);
+ Seen.clear();
+ if (isRealPathCache) {
+ RealPathCache.clear();
+#ifndef _WIN32
+ ReadlinkCache.clear();
+ LstatCache.clear();
+#endif
+ }
+ }
+
+ void markSeen(const std::string &CanonPath) {
+ std::unique_lock<std::shared_mutex> lock(Mtx);
+ Seen.insert(CanonPath);
+ }
+
+ bool hasSeen(StringRef CanonPath) const {
+ std::shared_lock<std::shared_mutex> lock(Mtx);
+ return Seen.contains(CanonPath);
+ }
+
+ bool hasSeenOrMark(StringRef CanonPath) {
+ std::string s = CanonPath.str();
+ {
+ std::shared_lock<std::shared_mutex> lock(Mtx);
+ if (Seen.contains(s))
+ return true;
+ }
+ {
+ std::unique_lock<std::shared_mutex> lock(Mtx);
+ Seen.insert(s);
+ }
+ return false;
+ }
+
+private:
+ mutable std::shared_mutex Mtx;
+
+ struct PathInfo {
+ std::string canonicalPath;
+ std::error_code ErrnoCode;
+ };
+
+ void insert_realpath(StringRef Path, const PathInfo &Info) {
+ std::unique_lock<std::shared_mutex> lock(Mtx);
+ RealPathCache.insert({Path, Info});
+ }
+
+ std::optional<PathInfo> read_realpath(StringRef Path) const {
+ std::shared_lock<std::shared_mutex> lock(Mtx);
+ auto It = RealPathCache.find(Path);
+ if (It != RealPathCache.end())
+ return It->second;
+
+ return std::nullopt;
+ }
+
+ StringSet<> Seen;
+ StringMap<PathInfo> RealPathCache;
+
+#ifndef _WIN32
+ StringMap<std::string> ReadlinkCache;
+ StringMap<mode_t> LstatCache;
+
+ void insert_link(StringRef Path, const std::string &s) {
+ std::unique_lock<std::shared_mutex> lock(Mtx);
+ ReadlinkCache.insert({Path, s});
+ }
+
+ std::optional<std::string> read_link(StringRef Path) const {
+ std::shared_lock<std::shared_mutex> lock(Mtx);
+ auto It = ReadlinkCache.find(Path);
+ if (It != ReadlinkCache.end())
+ return It->second;
+
+ return std::nullopt;
+ }
+
+ void insert_lstat(StringRef Path, mode_t m) {
+ std::unique_lock<std::shared_mutex> lock(Mtx);
+ LstatCache.insert({Path, m});
+ }
+
+ std::optional<mode_t> read_lstat(StringRef Path) const {
+ std::shared_lock<std::shared_mutex> lock(Mtx);
+ auto It = LstatCache.find(Path);
+ if (It != LstatCache.end())
+ return It->second;
+
+ return std::nullopt;
+ }
+
+#endif
+};
+
+/// Resolves file system paths with optional caching of results.
+///
+/// Supports lstat, readlink, and realpath operations. Can resolve paths
+/// relative to a base and handle symbolic links. Caches results to reduce
+/// repeated system calls when enabled.
+class PathResolver {
+private:
+ std::shared_ptr<LibraryPathCache> LibPathCache;
+
+public:
+ PathResolver(std::shared_ptr<LibraryPathCache> cache)
+ : LibPathCache(std::move(cache)) {}
+
+ std::optional<std::string> resolve(StringRef Path, std::error_code &ec) {
+ return realpathCached(Path, ec);
+ }
+#ifndef _WIN32
+ mode_t lstatCached(StringRef Path);
+ std::optional<std::string> readlinkCached(StringRef Path);
+#endif
+ std::optional<std::string> realpathCached(StringRef Path, std::error_code &ec,
+ StringRef base = "",
+ bool baseIsResolved = false,
+ long symloopLevel = 40);
+};
+
+/// Performs placeholder substitution in dynamic library paths.
+///
+/// Configures known placeholders (like @loader_path) and replaces them
+/// in input paths with their resolved values.
+class DylibSubstitutor {
+public:
+ void configure(StringRef loaderPath);
+
+ std::string substitute(StringRef input) const {
+ for (const auto &[ph, value] : Placeholders) {
+ if (input.starts_with_insensitive(ph))
+ return (Twine(value) + input.drop_front(ph.size())).str();
+ }
+ return input.str();
+ }
+
+private:
+ StringMap<std::string> Placeholders;
+};
+
+/// Validates and normalizes dynamic library paths.
+///
+/// Uses a `PathResolver` to resolve paths to their canonical form and
+/// checks whether they point to valid shared libraries.
+class DylibPathValidator {
+public:
+ DylibPathValidator(PathResolver &PR) : LibPathResolver(PR) {}
+
+ static bool isSharedLibrary(StringRef Path);
+
+ std::optional<std::string> normalize(StringRef Path) const {
+ std::error_code ec;
+ auto real = LibPathResolver.resolve(Path, ec);
+ if (!real || ec)
+ return std::nullopt;
+
+ return real;
+ }
+
+ /// Validate the given path as a shared library.
+ std::optional<std::string> validate(StringRef Path) const {
+ auto realOpt = normalize(Path);
+ if (!realOpt)
+ return std::nullopt;
+
+ if (!isSharedLibrary(*realOpt))
+ return std::nullopt;
+
+ return realOpt;
+ }
+
+private:
+ PathResolver &LibPathResolver;
+};
+
+enum class SearchPathType {
+ RPath,
+ UsrOrSys,
+ RunPath,
+};
+
+struct SearchPathConfig {
+ ArrayRef<StringRef> Paths;
+ SearchPathType type;
+};
+
+class SearchPathResolver {
+public:
+ SearchPathResolver(const SearchPathConfig &Cfg,
+ StringRef PlaceholderPrefix = "")
+ : Kind(Cfg.type), PlaceholderPrefix(PlaceholderPrefix) {
+ for (auto &path : Cfg.Paths)
+ Paths.emplace_back(path.str());
+ }
+
+ std::optional<std::string> resolve(StringRef libStem,
+ const DylibSubstitutor &Subst,
+ DylibPathValidator &Validator) const;
+ SearchPathType searchPathType() const { return Kind; }
+
+private:
+ std::vector<std::string> Paths;
+ SearchPathType Kind;
+ std::string PlaceholderPrefix;
+};
+
+class DylibResolverImpl {
+public:
+ DylibResolverImpl(DylibSubstitutor Substitutor, DylibPathValidator &Validator,
+ std::vector<SearchPathResolver> Resolvers)
+ : Substitutor(std::move(Substitutor)), Validator(Validator),
+ Resolvers(std::move(Resolvers)) {}
+
+ std::optional<std::string> resolve(StringRef Stem,
+ bool VariateLibStem = false) const;
+
+private:
+ std::optional<std::string> tryWithExtensions(StringRef libstem) const;
+
+ DylibSubstitutor Substitutor;
+ DylibPathValidator &Validator;
+ std::vector<SearchPathResolver> Resolvers;
+};
+
+class DylibResolver {
+public:
+ DylibResolver(DylibPathValidator &Validator) : Validator(Validator) {}
+
+ void configure(StringRef loaderPath,
+ ArrayRef<SearchPathConfig> SearchPathCfg) {
+ DylibSubstitutor Substitutor;
+ Substitutor.configure(loaderPath);
+
+ std::vector<SearchPathResolver> Resolvers;
+ for (const auto &cfg : SearchPathCfg) {
+ Resolvers.emplace_back(cfg,
+ cfg.type == SearchPathType::RPath ? "@rpath" : "");
+ }
+
+ impl_ = std::make_unique<DylibResolverImpl>(
+ std::move(Substitutor), Validator, std::move(Resolvers));
+ }
+
+ std::optional<std::string> resolve(StringRef libStem,
+ bool VariateLibStem = false) const {
+ if (!impl_)
+ return std::nullopt;
+ return impl_->resolve(libStem, VariateLibStem);
+ }
+
+ static std::string resolvelinkerFlag(StringRef libStem,
+ StringRef loaderPath) {
+ DylibSubstitutor Substitutor;
+ Substitutor.configure(loaderPath);
+ return Substitutor.substitute(libStem);
+ }
+
+private:
+ DylibPathValidator &Validator;
+ std::unique_ptr<DylibResolverImpl> impl_;
+};
+
+enum class PathType : uint8_t { User, System, Unknown };
+
+enum class ScanState : uint8_t { NotScanned, Scanning, Scanned };
+
+struct LibrarySearchPath {
+ std::string BasePath; // Canonical base directory path
+ PathType Kind; // User or System
+ std::atomic<ScanState> State;
+
+ LibrarySearchPath(std::string Base, PathType K)
+ : BasePath(std::move(Base)), Kind(K), State(ScanState::NotScanned) {}
+};
+
+/// Scans and tracks libraries for symbol resolution.
+///
+/// Maintains a list of library paths to scan, caches scanned units,
+/// and resolves paths canonically for consistent tracking.
+class LibraryScanHelper {
+public:
+ explicit LibraryScanHelper(const std::vector<std::string> &SPaths,
+ std::shared_ptr<LibraryPathCache> LibPathCache,
+ std::shared_ptr<PathResolver> LibPathResolver)
+ : LibPathCache(std::move(LibPathCache)),
+ LibPathResolver(std::move(LibPathResolver)) {
+ DEBUG_WITH_TYPE(
+ "orc", dbgs() << "LibraryScanHelper::LibraryScanHelper: base paths : "
+ << SPaths.size() << "\n";);
+ for (const auto &p : SPaths)
+ addBasePath(p);
+ }
+
+ void
+ addBasePath(const std::string &P,
+ PathType Kind =
+ PathType::Unknown); // Add a canonical directory for scanning
+ std::vector<std::shared_ptr<LibrarySearchPath>>
+ getNextBatch(PathType Kind, size_t batchSize);
+
+ bool leftToScan(PathType K) const;
+ void resetToScan();
+
+ bool isTrackedBasePath(StringRef P) const;
+ std::vector<std::shared_ptr<LibrarySearchPath>> getAllUnits() const;
+
+ SmallVector<StringRef> getSearchPaths() const {
+ SmallVector<StringRef> SearchPaths;
+ for (const auto &[_, SP] : LibSearchPaths)
+ SearchPaths.push_back(SP->BasePath);
+ return SearchPaths;
+ }
+
+ PathResolver &getPathResolver() const { return *LibPathResolver; }
+
+ LibraryPathCache &getCache() const { return *LibPathCache; }
+
+ bool hasSeenOrMark(StringRef P) const {
+ return LibPathCache->hasSeenOrMark(P);
+ }
+
+ std::optional<std::string> resolve(StringRef P, std::error_code &ec) const {
+ return LibPathResolver->resolve(P.str(), ec);
+ }
+
+private:
+ std::string resolveCanonical(StringRef P, std::error_code &ec) const;
+ PathType classifyKind(StringRef P) const;
+
+ mutable std::shared_mutex Mtx;
+ std::shared_ptr<LibraryPathCache> LibPathCache;
+ std::shared_ptr<PathResolver> LibPathResolver;
+
+ StringMap<std::shared_ptr<LibrarySearchPath>>
+ LibSearchPaths; // key: canonical path
+ std::deque<StringRef> UnscannedUsr;
+ std::deque<StringRef> UnscannedSys;
+};
+
+/// Loads an object file and provides access to it.
+///
+/// Owns the underlying `ObjectFile` and ensures it is valid.
+/// Any errors encountered during construction are stored and
+/// returned when attempting to access the file.
+class ObjectFileLoader {
+public:
+ /// Construct an object file loader from the given path.
+ explicit ObjectFileLoader(StringRef Path) {
+ auto ObjOrErr = loadObjectFileWithOwnership(Path);
+ if (ObjOrErr)
+ Obj = std::move(*ObjOrErr);
+ else {
+ consumeError(std::move(Err));
+ Err = ObjOrErr.takeError();
+ }
+ }
+
+ ObjectFileLoader(const ObjectFileLoader &) = delete;
+ ObjectFileLoader &operator=(const ObjectFileLoader &) = delete;
+
+ ObjectFileLoader(ObjectFileLoader &&) = default;
+ ObjectFileLoader &operator=(ObjectFileLoader &&) = default;
+
+ /// Get the loaded object file, or return an error if loading failed.
+ Expected<object::ObjectFile &> getObjectFile() {
+ if (Err)
+ return std::move(Err);
+ return *Obj.getBinary();
+ }
+
+ static bool isArchitectureCompatible(const object::ObjectFile &Obj);
+
+private:
+ object::OwningBinary<object::ObjectFile> Obj;
+ Error Err = Error::success();
+
+ static Expected<object::OwningBinary<object::ObjectFile>>
+ loadObjectFileWithOwnership(StringRef FilePath);
+};
+
+/// Scans libraries, resolves dependencies, and registers them.
+class LibraryScanner {
+public:
+ using ShouldScanFn = std::function<bool(StringRef)>;
+
+ LibraryScanner(
+ LibraryScanHelper &H, LibraryManager &LibMgr,
+ ShouldScanFn ShouldScanCall = [](StringRef path) { return true; })
+ : ScanHelper(H), LibMgr(LibMgr),
+ ShouldScanCall(std::move(ShouldScanCall)) {}
+
+ void scanNext(PathType Kind, size_t batchSize = 1);
+
+ /// Dependency info for a library.
+ struct LibraryDepsInfo {
+ llvm::BumpPtrAllocator Alloc;
+ llvm::StringSaver Saver{Alloc};
+
+ SmallVector<StringRef, 2> rpath;
+ SmallVector<StringRef, 2> runPath;
+ SmallVector<StringRef, 4> deps;
+ bool isPIE = false;
+
+ void addRPath(StringRef s) { rpath.push_back(Saver.save(s)); }
+
+ void addRunPath(StringRef s) { runPath.push_back(Saver.save(s)); }
+
+ void addDep(StringRef s) { deps.push_back(Saver.save(s)); }
+ };
+
+private:
+ LibraryScanHelper &ScanHelper;
+ LibraryManager &LibMgr;
+ ShouldScanFn ShouldScanCall;
+
+ std::optional<std::string> shouldScan(StringRef FilePath);
+ Expected<LibraryDepsInfo> extractDeps(StringRef FilePath);
+
+ void handleLibrary(StringRef P, PathType K, int level = 1);
+
+ void scanBaseDir(std::shared_ptr<LibrarySearchPath> U);
+};
+
+using LibraryDepsInfo = LibraryScanner::LibraryDepsInfo;
+
+} // end namespace orc
+} // end namespace llvm
+
+#endif // LLVM_EXECUTIONENGINE_ORC_TARGETPROCESS_LIBRARYSCANNER_H
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp
index bf1abfe..58983cb 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp
@@ -1172,6 +1172,12 @@ bool DAGTypeLegalizer::SoftenFloatOperand(SDNode *N, unsigned OpNo) {
case ISD::FAKE_USE:
Res = SoftenFloatOp_FAKE_USE(N);
break;
+ case ISD::STACKMAP:
+ Res = SoftenFloatOp_STACKMAP(N, OpNo);
+ break;
+ case ISD::PATCHPOINT:
+ Res = SoftenFloatOp_PATCHPOINT(N, OpNo);
+ break;
}
// If the result is null, the sub-method took care of registering results etc.
@@ -1512,6 +1518,20 @@ SDValue DAGTypeLegalizer::SoftenFloatOp_FAKE_USE(SDNode *N) {
N->getOperand(0), Op1);
}
+SDValue DAGTypeLegalizer::SoftenFloatOp_STACKMAP(SDNode *N, unsigned OpNo) {
+ assert(OpNo > 1); // Because the first two arguments are guaranteed legal.
+ SmallVector<SDValue> NewOps(N->ops());
+ NewOps[OpNo] = GetSoftenedFloat(NewOps[OpNo]);
+ return SDValue(DAG.UpdateNodeOperands(N, NewOps), 0);
+}
+
+SDValue DAGTypeLegalizer::SoftenFloatOp_PATCHPOINT(SDNode *N, unsigned OpNo) {
+ assert(OpNo >= 7);
+ SmallVector<SDValue> NewOps(N->ops());
+ NewOps[OpNo] = GetSoftenedFloat(NewOps[OpNo]);
+ return SDValue(DAG.UpdateNodeOperands(N, NewOps), 0);
+}
+
//===----------------------------------------------------------------------===//
// Float Result Expansion
//===----------------------------------------------------------------------===//
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h b/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h
index 9656a30..ede522e 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h
@@ -658,6 +658,8 @@ private:
SDValue SoftenFloatOp_ATOMIC_STORE(SDNode *N, unsigned OpNo);
SDValue SoftenFloatOp_FCOPYSIGN(SDNode *N);
SDValue SoftenFloatOp_FAKE_USE(SDNode *N);
+ SDValue SoftenFloatOp_STACKMAP(SDNode *N, unsigned OpNo);
+ SDValue SoftenFloatOp_PATCHPOINT(SDNode *N, unsigned OpNo);
//===--------------------------------------------------------------------===//
// Float Expansion Support: LegalizeFloatTypes.cpp
diff --git a/llvm/lib/DebugInfo/CodeView/LazyRandomTypeCollection.cpp b/llvm/lib/DebugInfo/CodeView/LazyRandomTypeCollection.cpp
index 6c23ba8..23ab534 100644
--- a/llvm/lib/DebugInfo/CodeView/LazyRandomTypeCollection.cpp
+++ b/llvm/lib/DebugInfo/CodeView/LazyRandomTypeCollection.cpp
@@ -102,7 +102,8 @@ std::optional<CVType> LazyRandomTypeCollection::tryGetType(TypeIndex Index) {
return std::nullopt;
}
- assert(contains(Index));
+ if (!contains(Index))
+ return std::nullopt;
return Records[Index.toArrayIndex()].Type;
}
diff --git a/llvm/lib/ExecutionEngine/Orc/TargetProcess/CMakeLists.txt b/llvm/lib/ExecutionEngine/Orc/TargetProcess/CMakeLists.txt
index 9275586..ca8192b 100644
--- a/llvm/lib/ExecutionEngine/Orc/TargetProcess/CMakeLists.txt
+++ b/llvm/lib/ExecutionEngine/Orc/TargetProcess/CMakeLists.txt
@@ -16,9 +16,11 @@ add_llvm_component_library(LLVMOrcTargetProcess
ExecutorSharedMemoryMapperService.cpp
DefaultHostBootstrapValues.cpp
ExecutorResolver.cpp
+ LibraryResolver.cpp
JITLoaderGDB.cpp
JITLoaderPerf.cpp
JITLoaderVTune.cpp
+ LibraryScanner.cpp
OrcRTBootstrap.cpp
RegisterEHFrames.cpp
SimpleExecutorDylibManager.cpp
@@ -36,6 +38,8 @@ add_llvm_component_library(LLVMOrcTargetProcess
LINK_COMPONENTS
${intel_jit_profiling}
+ BinaryFormat
+ Object
OrcShared
Support
TargetParser
diff --git a/llvm/lib/ExecutionEngine/Orc/TargetProcess/LibraryResolver.cpp b/llvm/lib/ExecutionEngine/Orc/TargetProcess/LibraryResolver.cpp
new file mode 100644
index 0000000..35da82a
--- /dev/null
+++ b/llvm/lib/ExecutionEngine/Orc/TargetProcess/LibraryResolver.cpp
@@ -0,0 +1,370 @@
+//===- LibraryResolver.cpp - Library Resolution of Unresolved Symbols ---===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// Library resolution impl for unresolved symbols
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/ExecutionEngine/Orc/TargetProcess/LibraryResolver.h"
+#include "llvm/ExecutionEngine/Orc/TargetProcess/LibraryScanner.h"
+
+#include "llvm/ADT/StringSet.h"
+
+#include "llvm/BinaryFormat/MachO.h"
+#include "llvm/Object/COFF.h"
+#include "llvm/Object/ELF.h"
+#include "llvm/Object/ELFObjectFile.h"
+#include "llvm/Object/MachO.h"
+#include "llvm/Object/ObjectFile.h"
+#include "llvm/Support/Error.h"
+
+#include <mutex>
+#include <thread>
+
+#define DEBUG_TYPE "orc-resolver"
+
+namespace llvm::orc {
+
+LibraryResolver::LibraryResolver(const LibraryResolver::Setup &S)
+ : LibPathCache(S.Cache ? S.Cache : std::make_shared<LibraryPathCache>()),
+ LibPathResolver(S.PResolver
+ ? S.PResolver
+ : std::make_shared<PathResolver>(LibPathCache)),
+ ScanHelper(S.BasePaths, LibPathCache, LibPathResolver),
+ FB(S.FilterBuilder), LibMgr(),
+ ShouldScanCall(S.ShouldScanCall ? S.ShouldScanCall
+ : [](StringRef) -> bool { return true; }),
+ scanBatchSize(S.ScanBatchSize) {
+
+ if (ScanHelper.getAllUnits().empty()) {
+ LLVM_DEBUG(dbgs() << "Warning: No base paths provided for scanning.\n");
+ }
+}
+
+std::unique_ptr<LibraryResolutionDriver>
+LibraryResolutionDriver::create(const LibraryResolver::Setup &S) {
+ auto LR = std::make_unique<LibraryResolver>(S);
+ return std::unique_ptr<LibraryResolutionDriver>(
+ new LibraryResolutionDriver(std::move(LR)));
+}
+
+void LibraryResolutionDriver::addScanPath(const std::string &Path, PathType K) {
+ LR->ScanHelper.addBasePath(Path, K);
+}
+
+bool LibraryResolutionDriver::markLibraryLoaded(StringRef Path) {
+ auto Lib = LR->LibMgr.getLibrary(Path);
+ if (!Lib)
+ return false;
+
+ Lib->setState(LibraryManager::LibState::Loaded);
+
+ return true;
+}
+
+bool LibraryResolutionDriver::markLibraryUnLoaded(StringRef Path) {
+ auto Lib = LR->LibMgr.getLibrary(Path);
+ if (!Lib)
+ return false;
+
+ Lib->setState(LibraryManager::LibState::Unloaded);
+
+ return true;
+}
+
+void LibraryResolutionDriver::resolveSymbols(
+ std::vector<std::string> Syms,
+ LibraryResolver::OnSearchComplete OnCompletion,
+ const SearchConfig &Config) {
+ LR->searchSymbolsInLibraries(Syms, std::move(OnCompletion), Config);
+}
+
+static bool shouldIgnoreSymbol(const object::SymbolRef &Sym,
+ uint32_t IgnoreFlags) {
+ Expected<uint32_t> FlagsOrErr = Sym.getFlags();
+ if (!FlagsOrErr) {
+ consumeError(FlagsOrErr.takeError());
+ return true;
+ }
+
+ uint32_t Flags = *FlagsOrErr;
+
+ using Filter = SymbolEnumeratorOptions;
+ if ((IgnoreFlags & Filter::IgnoreUndefined) &&
+ (Flags & object::SymbolRef::SF_Undefined))
+ return true;
+ if ((IgnoreFlags & Filter::IgnoreIndirect) &&
+ (Flags & object::SymbolRef::SF_Indirect))
+ return true;
+ if ((IgnoreFlags & Filter::IgnoreWeak) &&
+ (Flags & object::SymbolRef::SF_Weak))
+ return true;
+
+ return false;
+}
+
+bool SymbolEnumerator::enumerateSymbols(StringRef Path, OnEachSymbolFn OnEach,
+ const SymbolEnumeratorOptions &Opts) {
+ if (Path.empty())
+ return false;
+
+ ObjectFileLoader ObjLoader(Path);
+
+ auto ObjOrErr = ObjLoader.getObjectFile();
+ if (!ObjOrErr) {
+ std::string ErrMsg;
+ handleAllErrors(ObjOrErr.takeError(),
+ [&](const ErrorInfoBase &EIB) { ErrMsg = EIB.message(); });
+ LLVM_DEBUG(dbgs() << "Failed loading object file: " << Path
+ << "\nError: " << ErrMsg << "\n");
+ return false;
+ }
+
+ object::ObjectFile *Obj = &ObjOrErr.get();
+
+ auto processSymbolRange =
+ [&](object::ObjectFile::symbol_iterator_range Range) -> EnumerateResult {
+ for (const auto &Sym : Range) {
+ if (shouldIgnoreSymbol(Sym, Opts.FilterFlags))
+ continue;
+
+ auto NameOrErr = Sym.getName();
+ if (!NameOrErr) {
+ consumeError(NameOrErr.takeError());
+ continue;
+ }
+
+ StringRef Name = *NameOrErr;
+ if (Name.empty())
+ continue;
+
+ EnumerateResult Res = OnEach(Name);
+ if (Res != EnumerateResult::Continue)
+ return Res;
+ }
+ return EnumerateResult::Continue;
+ };
+
+ EnumerateResult Res = processSymbolRange(Obj->symbols());
+ if (Res != EnumerateResult::Continue)
+ return Res == EnumerateResult::Stop;
+
+ if (Obj->isELF()) {
+ const auto *ElfObj = cast<object::ELFObjectFileBase>(Obj);
+ Res = processSymbolRange(ElfObj->getDynamicSymbolIterators());
+ if (Res != EnumerateResult::Continue)
+ return Res == EnumerateResult::Stop;
+ } else if (Obj->isCOFF()) {
+ const auto *CoffObj = cast<object::COFFObjectFile>(Obj);
+ for (auto I = CoffObj->export_directory_begin(),
+ E = CoffObj->export_directory_end();
+ I != E; ++I) {
+ StringRef Name;
+ if (I->getSymbolName(Name))
+ continue;
+ if (Name.empty())
+ continue;
+
+ EnumerateResult Res = OnEach(Name);
+ if (Res != EnumerateResult::Continue)
+ return Res == EnumerateResult::Stop;
+ }
+ } else if (Obj->isMachO()) {
+ }
+
+ return true;
+}
+
+class SymbolSearchContext {
+public:
+ SymbolSearchContext(SymbolQuery &Q) : Q(Q) {}
+
+ bool hasSearched(LibraryInfo *Lib) const { return Searched.count(Lib); }
+
+ void markSearched(LibraryInfo *Lib) { Searched.insert(Lib); }
+
+ inline bool allResolved() const { return Q.allResolved(); }
+
+ SymbolQuery &query() { return Q; }
+
+private:
+ SymbolQuery &Q;
+ DenseSet<LibraryInfo *> Searched;
+};
+
+void LibraryResolver::resolveSymbolsInLibrary(
+ LibraryInfo &Lib, SymbolQuery &UnresolvedSymbols,
+ const SymbolEnumeratorOptions &Opts) {
+ LLVM_DEBUG(dbgs() << "Checking unresolved symbols "
+ << " in library : " << Lib.getFileName() << "\n";);
+ StringSet<> DiscoveredSymbols;
+
+ if (!UnresolvedSymbols.hasUnresolved()) {
+ LLVM_DEBUG(dbgs() << "Skipping library: " << Lib.getFullPath()
+ << " — unresolved symbols exist.\n";);
+ return;
+ }
+
+ bool HasEnumerated = false;
+ auto enumerateSymbolsIfNeeded = [&]() {
+ if (HasEnumerated)
+ return;
+
+ HasEnumerated = true;
+
+ LLVM_DEBUG(dbgs() << "Enumerating symbols in library: " << Lib.getFullPath()
+ << "\n";);
+ SymbolEnumerator::enumerateSymbols(
+ Lib.getFullPath(),
+ [&](StringRef sym) {
+ DiscoveredSymbols.insert(sym);
+ return EnumerateResult::Continue;
+ },
+ Opts);
+
+ if (DiscoveredSymbols.empty()) {
+ LLVM_DEBUG(dbgs() << " No symbols and remove library : "
+ << Lib.getFullPath() << "\n";);
+ LibMgr.removeLibrary(Lib.getFullPath());
+ return;
+ }
+ };
+
+ if (!Lib.hasFilter()) {
+ LLVM_DEBUG(dbgs() << "Building filter for library: " << Lib.getFullPath()
+ << "\n";);
+ enumerateSymbolsIfNeeded();
+ SmallVector<StringRef> SymbolVec;
+ SymbolVec.reserve(DiscoveredSymbols.size());
+ for (const auto &KV : DiscoveredSymbols)
+ SymbolVec.push_back(KV.first());
+
+ Lib.ensureFilterBuilt(FB, SymbolVec);
+ LLVM_DEBUG({
+ dbgs() << "DiscoveredSymbols : " << DiscoveredSymbols.size() << "\n";
+ for (const auto &KV : DiscoveredSymbols)
+ dbgs() << "DiscoveredSymbols : " << KV.first() << "\n";
+ });
+ }
+
+ const auto &Unresolved = UnresolvedSymbols.getUnresolvedSymbols();
+ bool HadAnySym = false;
+ LLVM_DEBUG(dbgs() << "Total unresolved symbols : " << Unresolved.size()
+ << "\n";);
+ for (const auto &Sym : Unresolved) {
+ if (Lib.mayContain(Sym)) {
+ LLVM_DEBUG(dbgs() << "Checking symbol '" << Sym
+ << "' in library: " << Lib.getFullPath() << "\n";);
+ enumerateSymbolsIfNeeded();
+ if (DiscoveredSymbols.count(Sym) > 0) {
+ LLVM_DEBUG(dbgs() << " Resolved symbol: " << Sym
+ << " in library: " << Lib.getFullPath() << "\n";);
+ UnresolvedSymbols.resolve(Sym, Lib.getFullPath());
+ HadAnySym = true;
+ }
+ }
+ }
+
+ using LibraryState = LibraryManager::LibState;
+ if (HadAnySym && Lib.getState() != LibraryState::Loaded)
+ Lib.setState(LibraryState::Queried);
+}
+
+void LibraryResolver::searchSymbolsInLibraries(
+ std::vector<std::string> &SymbolList, OnSearchComplete OnComplete,
+ const SearchConfig &Config) {
+ SymbolQuery Q(SymbolList);
+
+ using LibraryState = LibraryManager::LibState;
+ using LibraryType = PathType;
+ auto tryResolveFrom = [&](LibraryState S, LibraryType K) {
+ LLVM_DEBUG(dbgs() << "Trying resolve from state=" << static_cast<int>(S)
+ << " type=" << static_cast<int>(K) << "\n";);
+
+ SymbolSearchContext Ctx(Q);
+ while (!Ctx.allResolved()) {
+
+ for (auto &Lib : LibMgr.getView(S, K)) {
+ if (Ctx.hasSearched(Lib.get()))
+ continue;
+
+ // can use Async here?
+ resolveSymbolsInLibrary(*Lib, Ctx.query(), Config.Options);
+ Ctx.markSearched(Lib.get());
+
+ if (Ctx.allResolved())
+ return;
+ }
+
+ if (Ctx.allResolved())
+ return;
+
+ if (!scanLibrariesIfNeeded(K, scanBatchSize))
+ break; // no more new libs to scan
+ }
+ };
+
+ for (const auto &[St, Ty] : Config.Policy.Plan) {
+ tryResolveFrom(St, Ty);
+ if (Q.allResolved())
+ break;
+ }
+
+ // done:
+ LLVM_DEBUG({
+ dbgs() << "Search complete.\n";
+ for (const auto &r : Q.getAllResults())
+ dbgs() << "Resolved Symbol:" << r->Name << " -> " << r->ResolvedLibPath
+ << "\n";
+ });
+
+ OnComplete(Q);
+}
+
+bool LibraryResolver::scanLibrariesIfNeeded(PathType PK, size_t BatchSize) {
+ LLVM_DEBUG(dbgs() << "LibraryResolver::scanLibrariesIfNeeded: Scanning for "
+ << (PK == PathType::User ? "User" : "System")
+ << " libraries\n";);
+ if (!ScanHelper.leftToScan(PK))
+ return false;
+
+ LibraryScanner Scanner(ScanHelper, LibMgr, ShouldScanCall);
+ Scanner.scanNext(PK, BatchSize);
+ return true;
+}
+
+bool LibraryResolver::symbolExistsInLibrary(const LibraryInfo &Lib,
+ StringRef SymName,
+ std::vector<std::string> *AllSyms) {
+ SymbolEnumeratorOptions Opts;
+ return symbolExistsInLibrary(Lib, SymName, AllSyms, Opts);
+}
+
+bool LibraryResolver::symbolExistsInLibrary(
+ const LibraryInfo &Lib, StringRef SymName,
+ std::vector<std::string> *AllSyms, const SymbolEnumeratorOptions &Opts) {
+ bool Found = false;
+
+ SymbolEnumerator::enumerateSymbols(
+ Lib.getFullPath(),
+ [&](StringRef Sym) {
+ if (AllSyms)
+ AllSyms->emplace_back(Sym.str());
+
+ if (Sym == SymName) {
+ Found = true;
+ }
+
+ return EnumerateResult::Continue;
+ },
+ Opts);
+
+ return Found;
+}
+
+} // end namespace llvm::orc
diff --git a/llvm/lib/ExecutionEngine/Orc/TargetProcess/LibraryScanner.cpp b/llvm/lib/ExecutionEngine/Orc/TargetProcess/LibraryScanner.cpp
new file mode 100644
index 0000000..d93f686
--- /dev/null
+++ b/llvm/lib/ExecutionEngine/Orc/TargetProcess/LibraryScanner.cpp
@@ -0,0 +1,1161 @@
+//===- LibraryScanner.cpp - Provide Library Scanning Implementation ----===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/ExecutionEngine/Orc/TargetProcess/LibraryScanner.h"
+#include "llvm/ExecutionEngine/Orc/TargetProcess/LibraryResolver.h"
+
+#include "llvm/ADT/StringExtras.h"
+#include "llvm/Object/COFF.h"
+#include "llvm/Object/ELF.h"
+#include "llvm/Object/ELFObjectFile.h"
+#include "llvm/Object/ELFTypes.h"
+#include "llvm/Object/MachO.h"
+#include "llvm/Object/MachOUniversal.h"
+#include "llvm/Object/ObjectFile.h"
+#include "llvm/Support/Error.h"
+#include "llvm/Support/FileSystem.h"
+#include "llvm/Support/MemoryBuffer.h"
+#include "llvm/Support/Path.h"
+#include "llvm/Support/Program.h"
+#include "llvm/TargetParser/Host.h"
+#include "llvm/TargetParser/Triple.h"
+
+#ifdef LLVM_ON_UNIX
+#include <sys/stat.h>
+#include <unistd.h>
+#endif // LLVM_ON_UNIX
+
+#ifdef __APPLE__
+#include <sys/stat.h>
+#undef LC_LOAD_DYLIB
+#undef LC_RPATH
+#endif // __APPLE__
+
+#define DEBUG_TYPE "orc-scanner"
+
+namespace llvm::orc {
+
+void handleError(Error Err, StringRef context = "") {
+ consumeError(handleErrors(std::move(Err), [&](const ErrorInfoBase &EIB) {
+ dbgs() << "LLVM Error";
+ if (!context.empty())
+ dbgs() << " [" << context << "]";
+ dbgs() << ": " << EIB.message() << "\n";
+ }));
+}
+
+bool ObjectFileLoader::isArchitectureCompatible(const object::ObjectFile &Obj) {
+ Triple HostTriple(sys::getDefaultTargetTriple());
+ Triple ObjTriple = Obj.makeTriple();
+
+ LLVM_DEBUG({
+ dbgs() << "Host triple: " << HostTriple.str()
+ << ", Object triple: " << ObjTriple.str() << "\n";
+ });
+
+ if (ObjTriple.getArch() != Triple::UnknownArch &&
+ HostTriple.getArch() != ObjTriple.getArch())
+ return false;
+
+ if (ObjTriple.getOS() != Triple::UnknownOS &&
+ HostTriple.getOS() != ObjTriple.getOS())
+ return false;
+
+ if (ObjTriple.getEnvironment() != Triple::UnknownEnvironment &&
+ HostTriple.getEnvironment() != Triple::UnknownEnvironment &&
+ HostTriple.getEnvironment() != ObjTriple.getEnvironment())
+ return false;
+
+ return true;
+}
+
+Expected<object::OwningBinary<object::ObjectFile>>
+ObjectFileLoader::loadObjectFileWithOwnership(StringRef FilePath) {
+ LLVM_DEBUG(dbgs() << "ObjectFileLoader: Attempting to open file " << FilePath
+ << "\n";);
+ auto BinOrErr = object::createBinary(FilePath);
+ if (!BinOrErr) {
+ LLVM_DEBUG(dbgs() << "ObjectFileLoader: Failed to open file " << FilePath
+ << "\n";);
+ return BinOrErr.takeError();
+ }
+
+ LLVM_DEBUG(dbgs() << "ObjectFileLoader: Successfully opened file " << FilePath
+ << "\n";);
+
+ auto OwningBin = BinOrErr->takeBinary();
+ object::Binary *Bin = OwningBin.first.get();
+
+ if (Bin->isArchive()) {
+ LLVM_DEBUG(dbgs() << "ObjectFileLoader: File is an archive, not supported: "
+ << FilePath << "\n";);
+ return createStringError(std::errc::invalid_argument,
+ "Archive files are not supported: %s",
+ FilePath.str().c_str());
+ }
+
+#if defined(__APPLE__)
+ if (auto *UB = dyn_cast<object::MachOUniversalBinary>(Bin)) {
+ LLVM_DEBUG(dbgs() << "ObjectFileLoader: Detected Mach-O universal binary: "
+ << FilePath << "\n";);
+ for (auto ObjForArch : UB->objects()) {
+ auto ObjOrErr = ObjForArch.getAsObjectFile();
+ if (!ObjOrErr) {
+ LLVM_DEBUG(
+ dbgs()
+ << "ObjectFileLoader: Skipping invalid architecture slice\n";);
+
+ consumeError(ObjOrErr.takeError());
+ continue;
+ }
+
+ std::unique_ptr<object::ObjectFile> Obj = std::move(ObjOrErr.get());
+ if (isArchitectureCompatible(*Obj)) {
+ LLVM_DEBUG(
+ dbgs() << "ObjectFileLoader: Found compatible object slice\n";);
+
+ return object::OwningBinary<object::ObjectFile>(
+ std::move(Obj), std::move(OwningBin.second));
+
+ } else {
+ LLVM_DEBUG(dbgs() << "ObjectFileLoader: Incompatible architecture "
+ "slice skipped\n";);
+ }
+ }
+ LLVM_DEBUG(dbgs() << "ObjectFileLoader: No compatible slices found in "
+ "universal binary\n";);
+ return createStringError(inconvertibleErrorCode(),
+ "No compatible object found in fat binary: %s",
+ FilePath.str().c_str());
+ }
+#endif
+
+ auto ObjOrErr =
+ object::ObjectFile::createObjectFile(Bin->getMemoryBufferRef());
+ if (!ObjOrErr) {
+ LLVM_DEBUG(dbgs() << "ObjectFileLoader: Failed to create object file\n";);
+ return ObjOrErr.takeError();
+ }
+ LLVM_DEBUG(dbgs() << "ObjectFileLoader: Detected object file\n";);
+
+ std::unique_ptr<object::ObjectFile> Obj = std::move(*ObjOrErr);
+ if (!isArchitectureCompatible(*Obj)) {
+ LLVM_DEBUG(dbgs() << "ObjectFileLoader: Incompatible architecture: "
+ << FilePath << "\n";);
+ return createStringError(inconvertibleErrorCode(),
+ "Incompatible object file: %s",
+ FilePath.str().c_str());
+ }
+
+ LLVM_DEBUG(dbgs() << "ObjectFileLoader: Object file is compatible\n";);
+
+ return object::OwningBinary<object::ObjectFile>(std::move(Obj),
+ std::move(OwningBin.second));
+}
+
+template <class ELFT>
+bool isELFSharedLibrary(const object::ELFFile<ELFT> &ELFObj) {
+ if (ELFObj.getHeader().e_type != ELF::ET_DYN)
+ return false;
+
+ auto PHOrErr = ELFObj.program_headers();
+ if (!PHOrErr) {
+ consumeError(PHOrErr.takeError());
+ return true;
+ }
+
+ for (auto Phdr : *PHOrErr) {
+ if (Phdr.p_type == ELF::PT_INTERP)
+ return false;
+ }
+
+ return true;
+}
+
+bool isSharedLibraryObject(object::ObjectFile &Obj) {
+ if (Obj.isELF()) {
+ if (auto *ELF32LE = dyn_cast<object::ELF32LEObjectFile>(&Obj))
+ return isELFSharedLibrary(ELF32LE->getELFFile());
+ if (auto *ELF64LE = dyn_cast<object::ELF64LEObjectFile>(&Obj))
+ return isELFSharedLibrary(ELF64LE->getELFFile());
+ if (auto *ELF32BE = dyn_cast<object::ELF32BEObjectFile>(&Obj))
+ return isELFSharedLibrary(ELF32BE->getELFFile());
+ if (auto *ELF64BE = dyn_cast<object::ELF64BEObjectFile>(&Obj))
+ return isELFSharedLibrary(ELF64BE->getELFFile());
+ } else if (Obj.isMachO()) {
+ const object::MachOObjectFile *MachO =
+ dyn_cast<object::MachOObjectFile>(&Obj);
+ if (!MachO) {
+ LLVM_DEBUG(dbgs() << "Failed to cast to MachOObjectFile.\n";);
+ return false;
+ }
+ LLVM_DEBUG({
+ bool Result =
+ MachO->getHeader().filetype == MachO::HeaderFileType::MH_DYLIB;
+ dbgs() << "Mach-O filetype: " << MachO->getHeader().filetype
+ << " (MH_DYLIB == " << MachO::HeaderFileType::MH_DYLIB
+ << "), shared: " << Result << "\n";
+ });
+
+ return MachO->getHeader().filetype == MachO::HeaderFileType::MH_DYLIB;
+ } else if (Obj.isCOFF()) {
+ const object::COFFObjectFile *coff = dyn_cast<object::COFFObjectFile>(&Obj);
+ if (!coff)
+ return false;
+ return coff->getCharacteristics() & COFF::IMAGE_FILE_DLL;
+ } else {
+ LLVM_DEBUG(dbgs() << "Binary is not an ObjectFile.\n";);
+ }
+
+ return false;
+}
+
+bool DylibPathValidator::isSharedLibrary(StringRef Path) {
+ LLVM_DEBUG(dbgs() << "Checking if path is a shared library: " << Path
+ << "\n";);
+
+ auto FileType = sys::fs::get_file_type(Path, /*Follow*/ true);
+ if (FileType != sys::fs::file_type::regular_file) {
+ LLVM_DEBUG(dbgs() << "File type is not a regular file for path: " << Path
+ << "\n";);
+ return false;
+ }
+
+ file_magic MagicCode;
+ identify_magic(Path, MagicCode);
+
+ // Skip archives.
+ if (MagicCode == file_magic::archive)
+ return false;
+
+ // Universal binary handling.
+#if defined(__APPLE__)
+ if (MagicCode == file_magic::macho_universal_binary) {
+ ObjectFileLoader ObjLoader(Path);
+ auto ObjOrErr = ObjLoader.getObjectFile();
+ if (!ObjOrErr) {
+ consumeError(ObjOrErr.takeError());
+ return false;
+ }
+ return isSharedLibraryObject(ObjOrErr.get());
+ }
+#endif
+
+ // Object file inspection for PE/COFF, ELF, and Mach-O
+ bool NeedsObjectInspection =
+#if defined(_WIN32)
+ (MagicCode == file_magic::pecoff_executable);
+#elif defined(__APPLE__)
+ (MagicCode == file_magic::macho_fixed_virtual_memory_shared_lib ||
+ MagicCode == file_magic::macho_dynamically_linked_shared_lib ||
+ MagicCode == file_magic::macho_dynamically_linked_shared_lib_stub);
+#elif defined(LLVM_ON_UNIX)
+#ifdef __CYGWIN__
+ (MagicCode == file_magic::pecoff_executable);
+#else
+ (MagicCode == file_magic::elf_shared_object);
+#endif
+#else
+#error "Unsupported platform."
+#endif
+
+ if (NeedsObjectInspection) {
+ ObjectFileLoader ObjLoader(Path);
+ auto ObjOrErr = ObjLoader.getObjectFile();
+ if (!ObjOrErr) {
+ consumeError(ObjOrErr.takeError());
+ return false;
+ }
+ return isSharedLibraryObject(ObjOrErr.get());
+ }
+
+ LLVM_DEBUG(dbgs() << "Path is not identified as a shared library: " << Path
+ << "\n";);
+ return false;
+}
+
+void DylibSubstitutor::configure(StringRef LoaderPath) {
+ SmallString<512> ExecPath(sys::fs::getMainExecutable(nullptr, nullptr));
+ sys::path::remove_filename(ExecPath);
+
+ SmallString<512> LoaderDir;
+ if (LoaderPath.empty()) {
+ LoaderDir = ExecPath;
+ } else {
+ LoaderDir = LoaderPath.str();
+ if (!sys::fs::is_directory(LoaderPath))
+ sys::path::remove_filename(LoaderDir);
+ }
+
+#ifdef __APPLE__
+ Placeholders["@loader_path"] = std::string(LoaderDir);
+ Placeholders["@executable_path"] = std::string(ExecPath);
+#else
+ Placeholders["$origin"] = std::string(LoaderDir);
+#endif
+}
+
+std::optional<std::string>
+SearchPathResolver::resolve(StringRef Stem, const DylibSubstitutor &Subst,
+ DylibPathValidator &Validator) const {
+ for (const auto &SP : Paths) {
+ std::string Base = Subst.substitute(SP);
+
+ SmallString<512> FullPath(Base);
+ if (!PlaceholderPrefix.empty() &&
+ Stem.starts_with_insensitive(PlaceholderPrefix))
+ FullPath.append(Stem.drop_front(PlaceholderPrefix.size()));
+ else
+ sys::path::append(FullPath, Stem);
+
+ LLVM_DEBUG(dbgs() << "SearchPathResolver::resolve FullPath = " << FullPath
+ << "\n";);
+
+ if (auto Valid = Validator.validate(FullPath.str()))
+ return Valid;
+ }
+
+ return std::nullopt;
+}
+
+std::optional<std::string>
+DylibResolverImpl::tryWithExtensions(StringRef LibStem) const {
+ LLVM_DEBUG(dbgs() << "tryWithExtensions: baseName = " << LibStem << "\n";);
+ SmallVector<SmallString<256>, 8> Candidates;
+
+ // Add extensions by platform
+#if defined(__APPLE__)
+ Candidates.emplace_back(LibStem);
+ Candidates.back() += ".dylib";
+#elif defined(_WIN32)
+ Candidates.emplace_back(LibStem);
+ Candidates.back() += ".dll";
+#else
+ Candidates.emplace_back(LibStem);
+ Candidates.back() += ".so";
+#endif
+
+ // Optionally try "lib" prefix if not already there
+ StringRef FileName = sys::path::filename(LibStem);
+ StringRef Base = sys::path::parent_path(LibStem);
+ if (!FileName.starts_with("lib")) {
+ SmallString<256> WithPrefix(Base);
+ if (!WithPrefix.empty())
+ sys::path::append(WithPrefix, ""); // ensure separator if needed
+ WithPrefix += "lib";
+ WithPrefix += FileName;
+
+#if defined(__APPLE__)
+ WithPrefix += ".dylib";
+#elif defined(_WIN32)
+ WithPrefix += ".dll";
+#else
+ WithPrefix += ".so";
+#endif
+
+ Candidates.push_back(std::move(WithPrefix));
+ }
+
+ LLVM_DEBUG({
+ dbgs() << " Candidates to try:\n";
+ for (const auto &C : Candidates)
+ dbgs() << " " << C << "\n";
+ });
+
+ // Try all variants using tryAllPaths
+ for (const auto &Name : Candidates) {
+
+ LLVM_DEBUG(dbgs() << " Trying candidate: " << Name << "\n";);
+
+ for (const auto &R : Resolvers) {
+ if (auto Res = R.resolve(Name, Substitutor, Validator))
+ return Res;
+ }
+ }
+
+ LLVM_DEBUG(dbgs() << " -> No candidate Resolved.\n";);
+
+ return std::nullopt;
+}
+
+std::optional<std::string>
+DylibResolverImpl::resolve(StringRef LibStem, bool VariateLibStem) const {
+ LLVM_DEBUG(dbgs() << "Resolving library stem: " << LibStem << "\n";);
+
+ // If it is an absolute path, don't try iterate over the paths.
+ if (sys::path::is_absolute(LibStem)) {
+ LLVM_DEBUG(dbgs() << " -> Absolute path detected.\n";);
+ return Validator.validate(LibStem);
+ }
+
+ if (!LibStem.starts_with_insensitive("@rpath")) {
+ if (auto norm = Validator.validate(Substitutor.substitute(LibStem))) {
+ LLVM_DEBUG(dbgs() << " -> Resolved after substitution: " << *norm
+ << "\n";);
+
+ return norm;
+ }
+ }
+
+ for (const auto &R : Resolvers) {
+ LLVM_DEBUG(dbgs() << " -> Resolving via search path ... \n";);
+ if (auto Result = R.resolve(LibStem, Substitutor, Validator)) {
+ LLVM_DEBUG(dbgs() << " -> Resolved via search path: " << *Result
+ << "\n";);
+
+ return Result;
+ }
+ }
+
+ // Expand libStem with paths, extensions, etc.
+ // std::string foundName;
+ if (VariateLibStem) {
+ LLVM_DEBUG(dbgs() << " -> Trying with extensions...\n";);
+
+ if (auto Norm = tryWithExtensions(LibStem)) {
+ LLVM_DEBUG(dbgs() << " -> Resolved via tryWithExtensions: " << *Norm
+ << "\n";);
+
+ return Norm;
+ }
+ }
+
+ LLVM_DEBUG(dbgs() << " -> Could not resolve: " << LibStem << "\n";);
+
+ return std::nullopt;
+}
+
+#ifndef _WIN32
+mode_t PathResolver::lstatCached(StringRef Path) {
+ // If already cached - retun cached result
+ if (auto Cache = LibPathCache->read_lstat(Path))
+ return *Cache;
+
+ // Not cached: perform lstat and store
+ struct stat buf{};
+ mode_t st_mode = (lstat(Path.str().c_str(), &buf) == -1) ? 0 : buf.st_mode;
+
+ LibPathCache->insert_lstat(Path, st_mode);
+
+ return st_mode;
+}
+
+std::optional<std::string> PathResolver::readlinkCached(StringRef Path) {
+ // If already cached - retun cached result
+ if (auto Cache = LibPathCache->read_link(Path))
+ return Cache;
+
+ // If result not in cache - call system function and cache result
+ char buf[PATH_MAX];
+ ssize_t len;
+ if ((len = readlink(Path.str().c_str(), buf, sizeof(buf))) != -1) {
+ buf[len] = '\0';
+ std::string s(buf);
+ LibPathCache->insert_link(Path, s);
+ return s;
+ }
+ return std::nullopt;
+}
+
+void createComponent(StringRef Path, StringRef BasePath, bool BaseIsResolved,
+ SmallVector<StringRef, 16> &Component) {
+ StringRef Separator = sys::path::get_separator();
+ if (!BaseIsResolved) {
+ if (Path[0] == '~' &&
+ (Path.size() == 1 || sys::path::is_separator(Path[1]))) {
+ static SmallString<128> HomeP;
+ if (HomeP.str().empty())
+ sys::path::home_directory(HomeP);
+ StringRef(HomeP).split(Component, Separator, /*MaxSplit*/ -1,
+ /*KeepEmpty*/ false);
+ } else if (BasePath.empty()) {
+ static SmallString<256> CurrentPath;
+ if (CurrentPath.str().empty())
+ sys::fs::current_path(CurrentPath);
+ StringRef(CurrentPath)
+ .split(Component, Separator, /*MaxSplit*/ -1, /*KeepEmpty*/ false);
+ } else {
+ BasePath.split(Component, Separator, /*MaxSplit*/ -1,
+ /*KeepEmpty*/ false);
+ }
+ }
+
+ Path.split(Component, Separator, /*MaxSplit*/ -1, /*KeepEmpty*/ false);
+}
+
+void normalizePathSegments(SmallVector<StringRef, 16> &PathParts) {
+ SmallVector<StringRef, 16> NormalizedPath;
+ for (auto &Part : PathParts) {
+ if (Part == ".") {
+ continue;
+ } else if (Part == "..") {
+ if (!NormalizedPath.empty() && NormalizedPath.back() != "..") {
+ NormalizedPath.pop_back();
+ } else {
+ NormalizedPath.push_back("..");
+ }
+ } else {
+ NormalizedPath.push_back(Part);
+ }
+ }
+ PathParts.swap(NormalizedPath);
+}
+#endif
+
+std::optional<std::string> PathResolver::realpathCached(StringRef Path,
+ std::error_code &EC,
+ StringRef Base,
+ bool BaseIsResolved,
+ long SymLoopLevel) {
+ EC.clear();
+
+ if (Path.empty()) {
+ EC = std::make_error_code(std::errc::no_such_file_or_directory);
+ LLVM_DEBUG(dbgs() << "PathResolver::realpathCached: Empty path\n";);
+
+ return std::nullopt;
+ }
+
+ if (SymLoopLevel <= 0) {
+ EC = std::make_error_code(std::errc::too_many_symbolic_link_levels);
+ LLVM_DEBUG(
+ dbgs() << "PathResolver::realpathCached: Too many Symlink levels: "
+ << Path << "\n";);
+
+ return std::nullopt;
+ }
+
+ // If already cached - retun cached result
+ bool isRelative = sys::path::is_relative(Path);
+ if (!isRelative) {
+ if (auto Cached = LibPathCache->read_realpath(Path)) {
+ EC = Cached->ErrnoCode;
+ if (EC) {
+ LLVM_DEBUG(dbgs() << "PathResolver::realpathCached: Cached (error) for "
+ << Path << "\n";);
+ } else {
+ LLVM_DEBUG(
+ dbgs() << "PathResolver::realpathCached: Cached (success) for "
+ << Path << " => " << Cached->canonicalPath << "\n";);
+ }
+ return Cached->canonicalPath.empty()
+ ? std::nullopt
+ : std::make_optional(Cached->canonicalPath);
+ }
+ }
+
+ LLVM_DEBUG(dbgs() << "PathResolver::realpathCached: Resolving path: " << Path
+ << "\n";);
+
+ // If result not in cache - call system function and cache result
+
+ StringRef Separator(sys::path::get_separator());
+ SmallString<256> Resolved(Separator);
+#ifndef _WIN32
+ SmallVector<StringRef, 16> Components;
+
+ if (isRelative) {
+ if (BaseIsResolved) {
+ Resolved.assign(Base);
+ LLVM_DEBUG(dbgs() << " Using Resolved base: " << Base << "\n";);
+ }
+ createComponent(Path, Base, BaseIsResolved, Components);
+ } else {
+ Path.split(Components, Separator, /*MaxSplit*/ -1, /*KeepEmpty*/ false);
+ }
+
+ normalizePathSegments(Components);
+ LLVM_DEBUG({
+ for (auto &C : Components)
+ dbgs() << " " << C << " ";
+
+ dbgs() << "\n";
+ });
+
+ // Handle path list items
+ for (const auto &Component : Components) {
+ if (Component == ".")
+ continue;
+ if (Component == "..") {
+ // collapse "a/b/../c" to "a/c"
+ size_t S = Resolved.rfind(Separator);
+ if (S != llvm::StringRef::npos)
+ Resolved.resize(S);
+ if (Resolved.empty())
+ Resolved = Separator;
+ continue;
+ }
+
+ size_t oldSize = Resolved.size();
+ sys::path::append(Resolved, Component);
+ const char *ResolvedPath = Resolved.c_str();
+ LLVM_DEBUG(dbgs() << " Processing Component: " << Component << " => "
+ << ResolvedPath << "\n";);
+ mode_t st_mode = lstatCached(ResolvedPath);
+
+ if (S_ISLNK(st_mode)) {
+ LLVM_DEBUG(dbgs() << " Found symlink: " << ResolvedPath << "\n";);
+
+ auto SymlinkOpt = readlinkCached(ResolvedPath);
+ if (!SymlinkOpt) {
+ EC = std::make_error_code(std::errc::no_such_file_or_directory);
+ LibPathCache->insert_realpath(Path, LibraryPathCache::PathInfo{"", EC});
+ LLVM_DEBUG(dbgs() << " Failed to read symlink: " << ResolvedPath
+ << "\n";);
+
+ return std::nullopt;
+ }
+
+ StringRef Symlink = *SymlinkOpt;
+ LLVM_DEBUG(dbgs() << " Symlink points to: " << Symlink << "\n";);
+
+ std::string resolvedBase = "";
+ if (sys::path::is_relative(Symlink)) {
+ Resolved.resize(oldSize);
+ resolvedBase = Resolved.str().str();
+ }
+
+ auto RealSymlink =
+ realpathCached(Symlink, EC, resolvedBase,
+ /*BaseIsResolved=*/true, SymLoopLevel - 1);
+ if (!RealSymlink) {
+ LibPathCache->insert_realpath(Path, LibraryPathCache::PathInfo{"", EC});
+ LLVM_DEBUG(dbgs() << " Failed to resolve symlink target: " << Symlink
+ << "\n";);
+
+ return std::nullopt;
+ }
+
+ Resolved.assign(*RealSymlink);
+ LLVM_DEBUG(dbgs() << " Symlink Resolved to: " << Resolved << "\n";);
+
+ } else if (st_mode == 0) {
+ EC = std::make_error_code(std::errc::no_such_file_or_directory);
+ LibPathCache->insert_realpath(Path, LibraryPathCache::PathInfo{"", EC});
+ LLVM_DEBUG(dbgs() << " Component does not exist: " << ResolvedPath
+ << "\n";);
+
+ return std::nullopt;
+ }
+ }
+#else
+ EC = sys::fs::real_path(Path, Resolved); // Windows fallback
+#endif
+
+ std::string Canonical = Resolved.str().str();
+ {
+ LibPathCache->insert_realpath(Path, LibraryPathCache::PathInfo{
+ Canonical,
+ std::error_code() // success
+ });
+ }
+ LLVM_DEBUG(dbgs() << "PathResolver::realpathCached: Final Resolved: " << Path
+ << " => " << Canonical << "\n";);
+ return Canonical;
+}
+
+void LibraryScanHelper::addBasePath(const std::string &Path, PathType K) {
+ std::error_code EC;
+ std::string Canon = resolveCanonical(Path, EC);
+ if (EC) {
+ LLVM_DEBUG(
+ dbgs()
+ << "LibraryScanHelper::addBasePath: Failed to canonicalize path: "
+ << Path << "\n";);
+ return;
+ }
+ std::unique_lock<std::shared_mutex> Lock(Mtx);
+ if (LibSearchPaths.count(Canon)) {
+ LLVM_DEBUG(dbgs() << "LibraryScanHelper::addBasePath: Already added: "
+ << Canon << "\n";);
+ return;
+ }
+ K = K == PathType::Unknown ? classifyKind(Canon) : K;
+ auto SP = std::make_shared<LibrarySearchPath>(Canon, K);
+ LibSearchPaths[Canon] = SP;
+
+ if (K == PathType::User) {
+ LLVM_DEBUG(dbgs() << "LibraryScanHelper::addBasePath: Added User path: "
+ << Canon << "\n";);
+ UnscannedUsr.push_back(StringRef(SP->BasePath));
+ } else {
+ LLVM_DEBUG(dbgs() << "LibraryScanHelper::addBasePath: Added System path: "
+ << Canon << "\n";);
+ UnscannedSys.push_back(StringRef(SP->BasePath));
+ }
+}
+
+std::vector<std::shared_ptr<LibrarySearchPath>>
+LibraryScanHelper::getNextBatch(PathType K, size_t BatchSize) {
+ std::vector<std::shared_ptr<LibrarySearchPath>> Result;
+ auto &Queue = (K == PathType::User) ? UnscannedUsr : UnscannedSys;
+
+ std::unique_lock<std::shared_mutex> Lock(Mtx);
+
+ while (!Queue.empty() && (BatchSize == 0 || Result.size() < BatchSize)) {
+ StringRef Base = Queue.front();
+ auto It = LibSearchPaths.find(Base);
+ if (It != LibSearchPaths.end()) {
+ auto &SP = It->second;
+ ScanState Expected = ScanState::NotScanned;
+ if (SP->State.compare_exchange_strong(Expected, ScanState::Scanning)) {
+ Result.push_back(SP);
+ }
+ }
+ Queue.pop_front();
+ }
+
+ return Result;
+}
+
+bool LibraryScanHelper::isTrackedBasePath(StringRef Path) const {
+ std::error_code EC;
+ std::string Canon = resolveCanonical(Path, EC);
+ if (EC)
+ return false;
+
+ std::shared_lock<std::shared_mutex> Lock(Mtx);
+ return LibSearchPaths.count(Canon) > 0;
+}
+
+bool LibraryScanHelper::leftToScan(PathType K) const {
+ std::shared_lock<std::shared_mutex> Lock(Mtx);
+ for (const auto &KV : LibSearchPaths) {
+ const auto &SP = KV.second;
+ if (SP->Kind == K && SP->State == ScanState::NotScanned)
+ return true;
+ }
+ return false;
+}
+
+void LibraryScanHelper::resetToScan() {
+ std::shared_lock<std::shared_mutex> Lock(Mtx);
+
+ for (auto &[_, SP] : LibSearchPaths) {
+ ScanState Expected = ScanState::Scanned;
+
+ if (!SP->State.compare_exchange_strong(Expected, ScanState::NotScanned))
+ continue;
+
+ auto &TargetList =
+ (SP->Kind == PathType::User) ? UnscannedUsr : UnscannedSys;
+ TargetList.emplace_back(SP->BasePath);
+ }
+}
+
+std::vector<std::shared_ptr<LibrarySearchPath>>
+LibraryScanHelper::getAllUnits() const {
+ std::shared_lock<std::shared_mutex> Lock(Mtx);
+ std::vector<std::shared_ptr<LibrarySearchPath>> Result;
+ Result.reserve(LibSearchPaths.size());
+ for (const auto &[_, SP] : LibSearchPaths) {
+ Result.push_back(SP);
+ }
+ return Result;
+}
+
+std::string LibraryScanHelper::resolveCanonical(StringRef Path,
+ std::error_code &EC) const {
+ auto Canon = LibPathResolver->resolve(Path, EC);
+ return EC ? Path.str() : *Canon;
+}
+
+PathType LibraryScanHelper::classifyKind(StringRef Path) const {
+ // Detect home directory
+ const char *Home = getenv("HOME");
+ if (Home && Path.find(Home) == 0)
+ return PathType::User;
+
+ static const std::array<std::string, 5> UserPrefixes = {
+ "/usr/local", // often used by users for manual installs
+ "/opt/homebrew", // common on macOS
+ "/opt/local", // MacPorts
+ "/home", // Linux home dirs
+ "/Users", // macOS user dirs
+ };
+
+ for (const auto &Prefix : UserPrefixes) {
+ if (Path.find(Prefix) == 0)
+ return PathType::User;
+ }
+
+ return PathType::System;
+}
+
+Expected<LibraryDepsInfo> parseMachODeps(const object::MachOObjectFile &Obj) {
+ LibraryDepsInfo Libdeps;
+ LLVM_DEBUG(dbgs() << "Parsing Mach-O dependencies...\n";);
+ for (const auto &Command : Obj.load_commands()) {
+ switch (Command.C.cmd) {
+ case MachO::LC_LOAD_DYLIB: {
+ MachO::dylib_command dylibCmd = Obj.getDylibIDLoadCommand(Command);
+ const char *name = Command.Ptr + dylibCmd.dylib.name;
+ Libdeps.addDep(name);
+ LLVM_DEBUG(dbgs() << " Found LC_LOAD_DYLIB: " << name << "\n";);
+ } break;
+ case MachO::LC_LOAD_WEAK_DYLIB:
+ case MachO::LC_REEXPORT_DYLIB:
+ case MachO::LC_LOAD_UPWARD_DYLIB:
+ case MachO::LC_LAZY_LOAD_DYLIB:
+ break;
+ case MachO::LC_RPATH: {
+ // Extract RPATH
+ MachO::rpath_command rpathCmd = Obj.getRpathCommand(Command);
+ const char *rpath = Command.Ptr + rpathCmd.path;
+ LLVM_DEBUG(dbgs() << " Found LC_RPATH: " << rpath << "\n";);
+
+ SmallVector<StringRef, 4> RawPaths;
+ SplitString(StringRef(rpath), RawPaths,
+ sys::EnvPathSeparator == ':' ? ":" : ";");
+
+ for (const auto &raw : RawPaths) {
+ Libdeps.addRPath(raw.str()); // Convert to std::string
+ LLVM_DEBUG(dbgs() << " Parsed RPATH entry: " << raw << "\n";);
+ }
+ break;
+ }
+ }
+ }
+
+ return Expected<LibraryDepsInfo>(std::move(Libdeps));
+}
+
+template <class ELFT>
+static Expected<StringRef> getDynamicStrTab(const object::ELFFile<ELFT> &Elf) {
+ auto DynamicEntriesOrError = Elf.dynamicEntries();
+ if (!DynamicEntriesOrError)
+ return DynamicEntriesOrError.takeError();
+
+ for (const typename ELFT::Dyn &Dyn : *DynamicEntriesOrError) {
+ if (Dyn.d_tag == ELF::DT_STRTAB) {
+ auto MappedAddrOrError = Elf.toMappedAddr(Dyn.getPtr());
+ if (!MappedAddrOrError)
+ return MappedAddrOrError.takeError();
+ return StringRef(reinterpret_cast<const char *>(*MappedAddrOrError));
+ }
+ }
+
+ // If the dynamic segment is not present, we fall back on the sections.
+ auto SectionsOrError = Elf.sections();
+ if (!SectionsOrError)
+ return SectionsOrError.takeError();
+
+ for (const typename ELFT::Shdr &Sec : *SectionsOrError) {
+ if (Sec.sh_type == ELF::SHT_DYNSYM)
+ return Elf.getStringTableForSymtab(Sec);
+ }
+
+ return make_error<StringError>("dynamic string table not found",
+ inconvertibleErrorCode());
+}
+
+template <typename ELFT>
+Expected<LibraryDepsInfo> parseELF(const object::ELFFile<ELFT> &Elf) {
+ LibraryDepsInfo Deps;
+ Expected<StringRef> StrTabOrErr = getDynamicStrTab(Elf);
+ if (!StrTabOrErr)
+ return StrTabOrErr.takeError();
+
+ const char *Data = StrTabOrErr->data();
+
+ auto DynamicEntriesOrError = Elf.dynamicEntries();
+ if (!DynamicEntriesOrError) {
+ return DynamicEntriesOrError.takeError();
+ }
+
+ for (const typename ELFT::Dyn &Dyn : *DynamicEntriesOrError) {
+ switch (Dyn.d_tag) {
+ case ELF::DT_NEEDED:
+ Deps.addDep(Data + Dyn.d_un.d_val);
+ break;
+ case ELF::DT_RPATH: {
+ SmallVector<StringRef, 4> RawPaths;
+ SplitString(Data + Dyn.d_un.d_val, RawPaths,
+ sys::EnvPathSeparator == ':' ? ":" : ";");
+ for (const auto &raw : RawPaths)
+ Deps.addRPath(raw.str());
+ break;
+ }
+ case ELF::DT_RUNPATH: {
+ SmallVector<StringRef, 4> RawPaths;
+ SplitString(Data + Dyn.d_un.d_val, RawPaths,
+ sys::EnvPathSeparator == ':' ? ":" : ";");
+ for (const auto &raw : RawPaths)
+ Deps.addRunPath(raw.str());
+ break;
+ }
+ case ELF::DT_FLAGS_1:
+ // Check if this is not a pie executable.
+ if (Dyn.d_un.d_val & ELF::DF_1_PIE)
+ Deps.isPIE = true;
+ break;
+ // (Dyn.d_tag == ELF::DT_NULL) continue;
+ // (Dyn.d_tag == ELF::DT_AUXILIARY || Dyn.d_tag == ELF::DT_FILTER)
+ default:
+ break;
+ }
+ }
+
+ return Expected<LibraryDepsInfo>(std::move(Deps));
+}
+
+Expected<LibraryDepsInfo> parseELFDeps(const object::ELFObjectFileBase &Obj) {
+ using namespace object;
+ LLVM_DEBUG(dbgs() << "parseELFDeps: Detected ELF object\n";);
+ if (const auto *ELF = dyn_cast<ELF32LEObjectFile>(&Obj))
+ return parseELF(ELF->getELFFile());
+ else if (const auto *ELF = dyn_cast<ELF32BEObjectFile>(&Obj))
+ return parseELF(ELF->getELFFile());
+ else if (const auto *ELF = dyn_cast<ELF64LEObjectFile>(&Obj))
+ return parseELF(ELF->getELFFile());
+ else if (const auto *ELF = dyn_cast<ELF64BEObjectFile>(&Obj))
+ return parseELF(ELF->getELFFile());
+
+ LLVM_DEBUG(dbgs() << "parseELFDeps: Unknown ELF format\n";);
+ return createStringError(std::errc::not_supported, "Unknown ELF format");
+}
+
+Expected<LibraryDepsInfo> LibraryScanner::extractDeps(StringRef FilePath) {
+ LLVM_DEBUG(dbgs() << "extractDeps: Attempting to open file " << FilePath
+ << "\n";);
+
+ ObjectFileLoader ObjLoader(FilePath);
+ auto ObjOrErr = ObjLoader.getObjectFile();
+ if (!ObjOrErr) {
+ LLVM_DEBUG(dbgs() << "extractDeps: Failed to open " << FilePath << "\n";);
+ return ObjOrErr.takeError();
+ }
+
+ object::ObjectFile *Obj = &ObjOrErr.get();
+
+ if (auto *elfObj = dyn_cast<object::ELFObjectFileBase>(Obj)) {
+ LLVM_DEBUG(dbgs() << "extractDeps: File " << FilePath
+ << " is an ELF object\n";);
+
+ return parseELFDeps(*elfObj);
+ }
+
+ if (auto *macho = dyn_cast<object::MachOObjectFile>(Obj)) {
+ LLVM_DEBUG(dbgs() << "extractDeps: File " << FilePath
+ << " is a Mach-O object\n";);
+ return parseMachODeps(*macho);
+ }
+
+ if (Obj->isCOFF()) {
+ // TODO: COFF support
+ return LibraryDepsInfo();
+ }
+
+ LLVM_DEBUG(dbgs() << "extractDeps: Unsupported binary format for file "
+ << FilePath << "\n";);
+ return createStringError(inconvertibleErrorCode(),
+ "Unsupported binary format: %s",
+ FilePath.str().c_str());
+}
+
+std::optional<std::string> LibraryScanner::shouldScan(StringRef FilePath) {
+ std::error_code EC;
+
+ LLVM_DEBUG(dbgs() << "[shouldScan] Checking: " << FilePath << "\n";);
+
+ // [1] Check file existence early
+ if (!sys::fs::exists(FilePath)) {
+ LLVM_DEBUG(dbgs() << " -> Skipped: file does not exist.\n";);
+
+ return std::nullopt;
+ }
+
+ // [2] Resolve to canonical path
+ auto CanonicalPathOpt = ScanHelper.resolve(FilePath, EC);
+ if (EC || !CanonicalPathOpt) {
+ LLVM_DEBUG(dbgs() << " -> Skipped: failed to resolve path (EC="
+ << EC.message() << ").\n";);
+
+ return std::nullopt;
+ }
+
+ const std::string &CanonicalPath = *CanonicalPathOpt;
+ LLVM_DEBUG(dbgs() << " -> Canonical path: " << CanonicalPath << "\n");
+
+ // [3] Check if it's a directory — skip directories
+ if (sys::fs::is_directory(CanonicalPath)) {
+ LLVM_DEBUG(dbgs() << " -> Skipped: path is a directory.\n";);
+
+ return std::nullopt;
+ }
+
+ // [4] Skip if it's not a shared library.
+ if (!DylibPathValidator::isSharedLibrary(CanonicalPath)) {
+ LLVM_DEBUG(dbgs() << " -> Skipped: not a shared library.\n";);
+ return std::nullopt;
+ }
+
+ // [5] Skip if we've already seen this path (via cache)
+ if (ScanHelper.hasSeenOrMark(CanonicalPath)) {
+ LLVM_DEBUG(dbgs() << " -> Skipped: already seen.\n";);
+
+ return std::nullopt;
+ }
+
+ // [6] Already tracked in LibraryManager?
+ if (LibMgr.hasLibrary(CanonicalPath)) {
+ LLVM_DEBUG(dbgs() << " -> Skipped: already tracked by LibraryManager.\n";);
+
+ return std::nullopt;
+ }
+
+ // [7] Run user-defined hook (default: always true)
+ if (!ShouldScanCall(CanonicalPath)) {
+ LLVM_DEBUG(dbgs() << " -> Skipped: user-defined hook rejected.\n";);
+
+ return std::nullopt;
+ }
+
+ LLVM_DEBUG(dbgs() << " -> Accepted: ready to scan " << CanonicalPath
+ << "\n";);
+ return CanonicalPath;
+}
+
+void LibraryScanner::handleLibrary(StringRef FilePath, PathType K, int level) {
+ LLVM_DEBUG(dbgs() << "LibraryScanner::handleLibrary: Scanning: " << FilePath
+ << ", level=" << level << "\n";);
+ auto CanonPathOpt = shouldScan(FilePath);
+ if (!CanonPathOpt) {
+ LLVM_DEBUG(dbgs() << " Skipped (shouldScan returned false): " << FilePath
+ << "\n";);
+
+ return;
+ }
+ const std::string CanonicalPath = *CanonPathOpt;
+
+ auto DepsOrErr = extractDeps(CanonicalPath);
+ if (!DepsOrErr) {
+ LLVM_DEBUG(dbgs() << " Failed to extract deps for: " << CanonicalPath
+ << "\n";);
+ handleError(DepsOrErr.takeError());
+ return;
+ }
+
+ LibraryDepsInfo &Deps = *DepsOrErr;
+
+ LLVM_DEBUG({
+ dbgs() << " Found deps : \n";
+ for (const auto &dep : Deps.deps)
+ dbgs() << " : " << dep << "\n";
+ dbgs() << " Found @rpath : " << Deps.rpath.size() << "\n";
+ for (const auto &r : Deps.rpath)
+ dbgs() << " : " << r << "\n";
+ dbgs() << " Found @runpath : \n";
+ for (const auto &r : Deps.runPath)
+ dbgs() << " : " << r << "\n";
+ });
+
+ if (Deps.isPIE && level == 0) {
+ LLVM_DEBUG(dbgs() << " Skipped PIE executable at top level: "
+ << CanonicalPath << "\n";);
+
+ return;
+ }
+
+ bool Added = LibMgr.addLibrary(CanonicalPath, K);
+ if (!Added) {
+ LLVM_DEBUG(dbgs() << " Already added: " << CanonicalPath << "\n";);
+ return;
+ }
+
+ // Heuristic 1: No RPATH/RUNPATH, skip deps
+ if (Deps.rpath.empty() && Deps.runPath.empty()) {
+ LLVM_DEBUG(
+ dbgs() << "LibraryScanner::handleLibrary: Skipping deps (Heuristic1): "
+ << CanonicalPath << "\n";);
+ return;
+ }
+
+ // Heuristic 2: All RPATH and RUNPATH already tracked
+ auto allTracked = [&](const auto &Paths) {
+ LLVM_DEBUG(dbgs() << " Checking : " << Paths.size() << "\n";);
+ return std::all_of(Paths.begin(), Paths.end(), [&](StringRef P) {
+ LLVM_DEBUG(dbgs() << " Checking isTrackedBasePath : " << P << "\n";);
+ return ScanHelper.isTrackedBasePath(
+ DylibResolver::resolvelinkerFlag(P, CanonicalPath));
+ });
+ };
+
+ if (allTracked(Deps.rpath) && allTracked(Deps.runPath)) {
+ LLVM_DEBUG(
+ dbgs() << "LibraryScanner::handleLibrary: Skipping deps (Heuristic2): "
+ << CanonicalPath << "\n";);
+ return;
+ }
+
+ DylibPathValidator Validator(ScanHelper.getPathResolver());
+ DylibResolver Resolver(Validator);
+ Resolver.configure(CanonicalPath,
+ {{Deps.rpath, SearchPathType::RPath},
+ {ScanHelper.getSearchPaths(), SearchPathType::UsrOrSys},
+ {Deps.runPath, SearchPathType::RunPath}});
+ for (StringRef Dep : Deps.deps) {
+ LLVM_DEBUG(dbgs() << " Resolving dep: " << Dep << "\n";);
+ auto DepFullOpt = Resolver.resolve(Dep);
+ if (!DepFullOpt) {
+ LLVM_DEBUG(dbgs() << " Failed to resolve dep: " << Dep << "\n";);
+
+ continue;
+ }
+ LLVM_DEBUG(dbgs() << " Resolved dep to: " << *DepFullOpt << "\n";);
+
+ handleLibrary(*DepFullOpt, K, level + 1);
+ }
+}
+
+void LibraryScanner::scanBaseDir(std::shared_ptr<LibrarySearchPath> SP) {
+ if (!sys::fs::is_directory(SP->BasePath) || SP->BasePath.empty()) {
+ LLVM_DEBUG(
+ dbgs() << "LibraryScanner::scanBaseDir: Invalid or empty basePath: "
+ << SP->BasePath << "\n";);
+ return;
+ }
+
+ LLVM_DEBUG(dbgs() << "LibraryScanner::scanBaseDir: Scanning directory: "
+ << SP->BasePath << "\n";);
+ std::error_code EC;
+
+ SP->State.store(ScanState::Scanning);
+
+ for (sys::fs::directory_iterator It(SP->BasePath, EC), end; It != end && !EC;
+ It.increment(EC)) {
+ auto Entry = *It;
+ if (!Entry.status())
+ continue;
+
+ auto Status = *Entry.status();
+ if (sys::fs::is_regular_file(Status) || sys::fs::is_symlink_file(Status)) {
+ LLVM_DEBUG(dbgs() << " Found file: " << Entry.path() << "\n";);
+ // async support ?
+ handleLibrary(Entry.path(), SP->Kind);
+ }
+ }
+
+ SP->State.store(ScanState::Scanned);
+}
+
+void LibraryScanner::scanNext(PathType K, size_t BatchSize) {
+ LLVM_DEBUG(dbgs() << "LibraryScanner::scanNext: Scanning next batch of size "
+ << BatchSize << " for kind "
+ << (K == PathType::User ? "User" : "System") << "\n";);
+
+ auto SearchPaths = ScanHelper.getNextBatch(K, BatchSize);
+ for (auto &SP : SearchPaths) {
+ LLVM_DEBUG(dbgs() << " Scanning unit with basePath: " << SP->BasePath
+ << "\n";);
+
+ scanBaseDir(SP);
+ }
+}
+
+} // end namespace llvm::orc
diff --git a/llvm/lib/Support/Windows/Signals.inc b/llvm/lib/Support/Windows/Signals.inc
index 648d6a5..da68994 100644
--- a/llvm/lib/Support/Windows/Signals.inc
+++ b/llvm/lib/Support/Windows/Signals.inc
@@ -421,8 +421,13 @@ bool sys::RemoveFileOnSignal(StringRef Filename, std::string *ErrMsg) {
return true;
}
- if (FilesToRemove == NULL)
+ if (FilesToRemove == NULL) {
FilesToRemove = new std::vector<std::string>;
+ std::atexit([]() {
+ delete FilesToRemove;
+ FilesToRemove = NULL;
+ });
+ }
FilesToRemove->push_back(std::string(Filename));
diff --git a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
index 6dcbced..b7fa899 100644
--- a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
@@ -1288,18 +1288,38 @@ void WaitcntBrackets::applyWaitcnt(InstCounterType T, unsigned Count) {
}
void WaitcntBrackets::applyXcnt(const AMDGPU::Waitcnt &Wait) {
+ // On entry to a block with multiple predescessors, there may
+ // be pending SMEM and VMEM events active at the same time.
+ // In such cases, only clear one active event at a time.
+ auto applyPendingXcntGroup = [this](unsigned E) {
+ unsigned LowerBound = getScoreLB(X_CNT);
+ applyWaitcnt(X_CNT, 0);
+ PendingEvents |= (1 << E);
+ setScoreLB(X_CNT, LowerBound);
+ };
+
// Wait on XCNT is redundant if we are already waiting for a load to complete.
// SMEM can return out of order, so only omit XCNT wait if we are waiting till
// zero.
- if (Wait.KmCnt == 0 && hasPendingEvent(SMEM_GROUP))
- return applyWaitcnt(X_CNT, 0);
+ if (Wait.KmCnt == 0 && hasPendingEvent(SMEM_GROUP)) {
+ if (hasPendingEvent(VMEM_GROUP))
+ applyPendingXcntGroup(VMEM_GROUP);
+ else
+ applyWaitcnt(X_CNT, 0);
+ return;
+ }
// If we have pending store we cannot optimize XCnt because we do not wait for
// stores. VMEM loads retun in order, so if we only have loads XCnt is
// decremented to the same number as LOADCnt.
if (Wait.LoadCnt != ~0u && hasPendingEvent(VMEM_GROUP) &&
- !hasPendingEvent(STORE_CNT))
- return applyWaitcnt(X_CNT, std::min(Wait.XCnt, Wait.LoadCnt));
+ !hasPendingEvent(STORE_CNT)) {
+ if (hasPendingEvent(SMEM_GROUP))
+ applyPendingXcntGroup(SMEM_GROUP);
+ else
+ applyWaitcnt(X_CNT, std::min(Wait.XCnt, Wait.LoadCnt));
+ return;
+ }
applyWaitcnt(X_CNT, Wait.XCnt);
}
diff --git a/llvm/lib/Target/CSKY/CSKYISelLowering.cpp b/llvm/lib/Target/CSKY/CSKYISelLowering.cpp
index e5b4f6e..ab4ee55 100644
--- a/llvm/lib/Target/CSKY/CSKYISelLowering.cpp
+++ b/llvm/lib/Target/CSKY/CSKYISelLowering.cpp
@@ -889,7 +889,7 @@ CSKYTargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
.Cases("{t9}", "{bsp}", CSKY::R25)
.Case("{r26}", CSKY::R26)
.Case("{r27}", CSKY::R27)
- .Cases("{gb}", "{rgb}", "{rdb}", CSKY::R28)
+ .Cases({"{gb}", "{rgb}", "{rdb}"}, CSKY::R28)
.Cases("{tb}", "{rtb}", CSKY::R29)
.Case("{svbr}", CSKY::R30)
.Case("{tls}", CSKY::R31)
diff --git a/llvm/lib/Target/M68k/MCTargetDesc/M68kAsmBackend.cpp b/llvm/lib/Target/M68k/MCTargetDesc/M68kAsmBackend.cpp
index fe83dc6..51bafe4 100644
--- a/llvm/lib/Target/M68k/MCTargetDesc/M68kAsmBackend.cpp
+++ b/llvm/lib/Target/M68k/MCTargetDesc/M68kAsmBackend.cpp
@@ -49,7 +49,7 @@ public:
M68kAsmBackend(const Target &T, const MCSubtargetInfo &STI)
: MCAsmBackend(llvm::endianness::big),
Allows32BitBranch(llvm::StringSwitch<bool>(STI.getCPU())
- .CasesLower("m68020", "m68030", "m68040", true)
+ .CasesLower({"m68020", "m68030", "m68040"}, true)
.Default(false)) {}
void applyFixup(const MCFragment &, const MCFixup &, const MCValue &,
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 007074c..133406b 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -22861,6 +22861,13 @@ static SDValue combineVectorSizedSetCCEquality(EVT VT, SDValue X, SDValue Y,
if (!OpVT.isScalarInteger() || OpSize < 128)
return SDValue();
+ // Don't do this if we're not supposed to use the FPU.
+ bool NoImplicitFloatOps =
+ DAG.getMachineFunction().getFunction().hasFnAttribute(
+ Attribute::NoImplicitFloat);
+ if (Subtarget.useSoftFloat() || NoImplicitFloatOps)
+ return SDValue();
+
// Ignore a comparison with zero because that gets special treatment in
// EmitTest(). But make an exception for the special case of a pair of
// logically-combined vector-sized operands compared to zero. This pattern may
@@ -22883,13 +22890,9 @@ static SDValue combineVectorSizedSetCCEquality(EVT VT, SDValue X, SDValue Y,
// Use XOR (plus OR) and PTEST after SSE4.1 for 128/256-bit operands.
// Use PCMPNEQ (plus OR) and KORTEST for 512-bit operands.
// Otherwise use PCMPEQ (plus AND) and mask testing.
- bool NoImplicitFloatOps =
- DAG.getMachineFunction().getFunction().hasFnAttribute(
- Attribute::NoImplicitFloat);
- if (!Subtarget.useSoftFloat() && !NoImplicitFloatOps &&
- ((OpSize == 128 && Subtarget.hasSSE2()) ||
- (OpSize == 256 && Subtarget.hasAVX()) ||
- (OpSize == 512 && Subtarget.useAVX512Regs()))) {
+ if ((OpSize == 128 && Subtarget.hasSSE2()) ||
+ (OpSize == 256 && Subtarget.hasAVX()) ||
+ (OpSize == 512 && Subtarget.useAVX512Regs())) {
bool HasPT = Subtarget.hasSSE41();
// PTEST and MOVMSK are slow on Knights Landing and Knights Mill and widened
@@ -53344,105 +53347,6 @@ static SDValue combineMaskedStore(SDNode *N, SelectionDAG &DAG,
return SDValue();
}
-// Look for a RMW operation that only touches one bit of a larger than legal
-// type and fold it to a BTC/BTR/BTS or bit insertion pattern acting on a single
-// i32 sub value.
-static SDValue narrowBitOpRMW(StoreSDNode *St, const SDLoc &DL,
- SelectionDAG &DAG,
- const X86Subtarget &Subtarget) {
- using namespace SDPatternMatch;
-
- // Only handle normal stores and its chain was a matching normal load.
- auto *Ld = dyn_cast<LoadSDNode>(St->getChain());
- if (!ISD::isNormalStore(St) || !St->isSimple() || !Ld ||
- !ISD::isNormalLoad(Ld) || !Ld->isSimple() ||
- Ld->getBasePtr() != St->getBasePtr() ||
- Ld->getOffset() != St->getOffset())
- return SDValue();
-
- SDValue LoadVal(Ld, 0);
- SDValue StoredVal = St->getValue();
- EVT VT = StoredVal.getValueType();
-
- // Only narrow larger than legal scalar integers.
- if (!VT.isScalarInteger() ||
- VT.getSizeInBits() <= (Subtarget.is64Bit() ? 64 : 32))
- return SDValue();
-
- // BTR: X & ~(1 << ShAmt)
- // BTS: X | (1 << ShAmt)
- // BTC: X ^ (1 << ShAmt)
- //
- // BitInsert: (X & ~(1 << ShAmt)) | (InsertBit << ShAmt)
- SDValue InsertBit, ShAmt;
- if (!StoredVal.hasOneUse() ||
- !(sd_match(StoredVal, m_And(m_Specific(LoadVal),
- m_Not(m_Shl(m_One(), m_Value(ShAmt))))) ||
- sd_match(StoredVal,
- m_Or(m_Specific(LoadVal), m_Shl(m_One(), m_Value(ShAmt)))) ||
- sd_match(StoredVal,
- m_Xor(m_Specific(LoadVal), m_Shl(m_One(), m_Value(ShAmt)))) ||
- sd_match(StoredVal,
- m_Or(m_And(m_Specific(LoadVal),
- m_Not(m_Shl(m_One(), m_Value(ShAmt)))),
- m_Shl(m_Value(InsertBit), m_Deferred(ShAmt))))))
- return SDValue();
-
- // Ensure the shift amount is in bounds.
- KnownBits KnownAmt = DAG.computeKnownBits(ShAmt);
- if (KnownAmt.getMaxValue().uge(VT.getSizeInBits()))
- return SDValue();
-
- // If we're inserting a bit then it must be the LSB.
- if (InsertBit) {
- KnownBits KnownInsert = DAG.computeKnownBits(InsertBit);
- if (KnownInsert.countMinLeadingZeros() < (VT.getSizeInBits() - 1))
- return SDValue();
- }
-
- // Split the shift into an alignment shift that moves the active i32 block to
- // the bottom bits for truncation and a modulo shift that can act on the i32.
- EVT AmtVT = ShAmt.getValueType();
- SDValue AlignAmt = DAG.getNode(ISD::AND, DL, AmtVT, ShAmt,
- DAG.getSignedConstant(-32LL, DL, AmtVT));
- SDValue ModuloAmt =
- DAG.getNode(ISD::AND, DL, AmtVT, ShAmt, DAG.getConstant(31, DL, AmtVT));
- ModuloAmt = DAG.getZExtOrTrunc(ModuloAmt, DL, MVT::i8);
-
- // Compute the byte offset for the i32 block that is changed by the RMW.
- // combineTruncate will adjust the load for us in a similar way.
- EVT PtrVT = St->getBasePtr().getValueType();
- SDValue PtrBitOfs = DAG.getZExtOrTrunc(AlignAmt, DL, PtrVT);
- SDValue PtrByteOfs = DAG.getNode(ISD::SRL, DL, PtrVT, PtrBitOfs,
- DAG.getShiftAmountConstant(3, PtrVT, DL));
- SDValue NewPtr = DAG.getMemBasePlusOffset(St->getBasePtr(), PtrByteOfs, DL,
- SDNodeFlags::NoUnsignedWrap);
-
- // Reconstruct the BTC/BTR/BTS pattern for the i32 block and store.
- SDValue X = DAG.getNode(ISD::SRL, DL, VT, LoadVal, AlignAmt);
- X = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, X);
-
- SDValue Mask = DAG.getNode(ISD::SHL, DL, MVT::i32,
- DAG.getConstant(1, DL, MVT::i32), ModuloAmt);
-
- SDValue Res;
- if (InsertBit) {
- SDValue BitMask =
- DAG.getNode(ISD::SHL, DL, MVT::i32,
- DAG.getZExtOrTrunc(InsertBit, DL, MVT::i32), ModuloAmt);
- Res =
- DAG.getNode(ISD::AND, DL, MVT::i32, X, DAG.getNOT(DL, Mask, MVT::i32));
- Res = DAG.getNode(ISD::OR, DL, MVT::i32, Res, BitMask);
- } else {
- if (StoredVal.getOpcode() == ISD::AND)
- Mask = DAG.getNOT(DL, Mask, MVT::i32);
- Res = DAG.getNode(StoredVal.getOpcode(), DL, MVT::i32, X, Mask);
- }
-
- return DAG.getStore(St->getChain(), DL, Res, NewPtr, St->getPointerInfo(),
- Align(), St->getMemOperand()->getFlags());
-}
-
static SDValue combineStore(SDNode *N, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI,
const X86Subtarget &Subtarget) {
@@ -53669,9 +53573,6 @@ static SDValue combineStore(SDNode *N, SelectionDAG &DAG,
}
}
- if (SDValue R = narrowBitOpRMW(St, dl, DAG, Subtarget))
- return R;
-
// Convert store(cmov(load(p), x, CC), p) to cstore(x, p, CC)
// store(cmov(x, load(p), CC), p) to cstore(x, p, InvertCC)
if ((VT == MVT::i16 || VT == MVT::i32 || VT == MVT::i64) &&
@@ -54604,9 +54505,8 @@ static SDValue combineTruncate(SDNode *N, SelectionDAG &DAG,
// truncation, see if we can convert the shift into a pointer offset instead.
// Limit this to normal (non-ext) scalar integer loads.
if (SrcVT.isScalarInteger() && Src.getOpcode() == ISD::SRL &&
- Src.hasOneUse() && ISD::isNormalLoad(Src.getOperand(0).getNode()) &&
- (Src.getOperand(0).hasOneUse() ||
- !DAG.getTargetLoweringInfo().isOperationLegal(ISD::LOAD, SrcVT))) {
+ Src.hasOneUse() && Src.getOperand(0).hasOneUse() &&
+ ISD::isNormalLoad(Src.getOperand(0).getNode())) {
auto *Ld = cast<LoadSDNode>(Src.getOperand(0));
if (Ld->isSimple() && VT.isByteSized() &&
isPowerOf2_64(VT.getSizeInBits())) {
@@ -56406,7 +56306,6 @@ static SDValue combineAVX512SetCCToKMOV(EVT VT, SDValue Op0, ISD::CondCode CC,
static SDValue combineSetCC(SDNode *N, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI,
const X86Subtarget &Subtarget) {
- using namespace SDPatternMatch;
const ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(2))->get();
const SDValue LHS = N->getOperand(0);
const SDValue RHS = N->getOperand(1);
@@ -56465,37 +56364,6 @@ static SDValue combineSetCC(SDNode *N, SelectionDAG &DAG,
if (SDValue AndN = MatchAndCmpEq(RHS, LHS))
return DAG.getSetCC(DL, VT, AndN, DAG.getConstant(0, DL, OpVT), CC);
- // If we're performing a bit test on a larger than legal type, attempt
- // to (aligned) shift down the value to the bottom 32-bits and then
- // perform the bittest on the i32 value.
- // ICMP_ZERO(AND(X,SHL(1,IDX)))
- // --> ICMP_ZERO(AND(TRUNC(SRL(X,AND(IDX,-32))),SHL(1,AND(IDX,31))))
- if (isNullConstant(RHS) &&
- OpVT.getScalarSizeInBits() > (Subtarget.is64Bit() ? 64 : 32)) {
- SDValue X, ShAmt;
- if (sd_match(LHS, m_OneUse(m_And(m_Value(X),
- m_Shl(m_One(), m_Value(ShAmt)))))) {
- // Only attempt this if the shift amount is known to be in bounds.
- KnownBits KnownAmt = DAG.computeKnownBits(ShAmt);
- if (KnownAmt.getMaxValue().ult(OpVT.getScalarSizeInBits())) {
- EVT AmtVT = ShAmt.getValueType();
- SDValue AlignAmt =
- DAG.getNode(ISD::AND, DL, AmtVT, ShAmt,
- DAG.getSignedConstant(-32LL, DL, AmtVT));
- SDValue ModuloAmt = DAG.getNode(ISD::AND, DL, AmtVT, ShAmt,
- DAG.getConstant(31, DL, AmtVT));
- SDValue Mask = DAG.getNode(
- ISD::SHL, DL, MVT::i32, DAG.getConstant(1, DL, MVT::i32),
- DAG.getZExtOrTrunc(ModuloAmt, DL, MVT::i8));
- X = DAG.getNode(ISD::SRL, DL, OpVT, X, AlignAmt);
- X = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, X);
- X = DAG.getNode(ISD::AND, DL, MVT::i32, X, Mask);
- return DAG.getSetCC(DL, VT, X, DAG.getConstant(0, DL, MVT::i32),
- CC);
- }
- }
- }
-
// cmpeq(trunc(x),C) --> cmpeq(x,C)
// cmpne(trunc(x),C) --> cmpne(x,C)
// iff x upper bits are zero.
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h b/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
index 3fed003..5298728 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
@@ -280,8 +280,7 @@ public:
VPValue *createElementCount(Type *Ty, ElementCount EC) {
VPlan &Plan = *getInsertBlock()->getPlan();
- VPValue *RuntimeEC =
- Plan.getOrAddLiveIn(ConstantInt::get(Ty, EC.getKnownMinValue()));
+ VPValue *RuntimeEC = Plan.getConstantInt(Ty, EC.getKnownMinValue());
if (EC.isScalable()) {
VPValue *VScale = createNaryOp(VPInstruction::VScale, {}, Ty);
RuntimeEC = EC.getKnownMinValue() == 1
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 25bf49d..e5c3f17 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -7752,8 +7752,7 @@ VPWidenRecipe *VPRecipeBuilder::tryToWiden(Instruction *I,
if (CM.isPredicatedInst(I)) {
SmallVector<VPValue *> Ops(Operands);
VPValue *Mask = getBlockInMask(Builder.getInsertBlock());
- VPValue *One =
- Plan.getOrAddLiveIn(ConstantInt::get(I->getType(), 1u, false));
+ VPValue *One = Plan.getConstantInt(I->getType(), 1u);
auto *SafeRHS = Builder.createSelect(Mask, Ops[1], One, I->getDebugLoc());
Ops[1] = SafeRHS;
return new VPWidenRecipe(*I, Ops);
@@ -7806,11 +7805,10 @@ VPWidenRecipe *VPRecipeBuilder::tryToWiden(Instruction *I,
}
case Instruction::ExtractValue: {
SmallVector<VPValue *> NewOps(Operands);
- Type *I32Ty = IntegerType::getInt32Ty(I->getContext());
auto *EVI = cast<ExtractValueInst>(I);
assert(EVI->getNumIndices() == 1 && "Expected one extractvalue index");
unsigned Idx = EVI->getIndices()[0];
- NewOps.push_back(Plan.getOrAddLiveIn(ConstantInt::get(I32Ty, Idx, false)));
+ NewOps.push_back(Plan.getConstantInt(32, Idx));
return new VPWidenRecipe(*I, NewOps);
}
};
@@ -8179,8 +8177,7 @@ VPRecipeBuilder::tryToCreatePartialReduction(Instruction *Reduction,
"Expected an ADD or SUB operation for predicated partial "
"reductions (because the neutral element in the mask is zero)!");
Cond = getBlockInMask(Builder.getInsertBlock());
- VPValue *Zero =
- Plan.getOrAddLiveIn(ConstantInt::get(Reduction->getType(), 0));
+ VPValue *Zero = Plan.getConstantInt(Reduction->getType(), 0);
BinOp = Builder.createSelect(Cond, BinOp, Zero, Reduction->getDebugLoc());
}
return new VPPartialReductionRecipe(ReductionOpcode, Accumulator, BinOp, Cond,
@@ -8643,7 +8640,7 @@ void LoopVectorizationPlanner::adjustRecipesForReductions(
} else if (PhiR->isInLoop() && Kind == RecurKind::AddChainWithSubs &&
CurrentLinkI->getOpcode() == Instruction::Sub) {
Type *PhiTy = PhiR->getUnderlyingValue()->getType();
- auto *Zero = Plan->getOrAddLiveIn(ConstantInt::get(PhiTy, 0));
+ auto *Zero = Plan->getConstantInt(PhiTy, 0);
VPWidenRecipe *Sub = new VPWidenRecipe(
Instruction::Sub, {Zero, CurrentLink->getOperand(1)}, {},
VPIRMetadata(), CurrentLinkI->getDebugLoc());
@@ -8857,8 +8854,7 @@ void LoopVectorizationPlanner::adjustRecipesForReductions(
ToDelete.push_back(Select);
// Convert the reduction phi to operate on bools.
- PhiR->setOperand(0, Plan->getOrAddLiveIn(ConstantInt::getFalse(
- OrigLoop->getHeader()->getContext())));
+ PhiR->setOperand(0, Plan->getFalse());
continue;
}
@@ -8880,9 +8876,7 @@ void LoopVectorizationPlanner::adjustRecipesForReductions(
unsigned ScaleFactor =
RecipeBuilder.getScalingForReduction(RdxDesc.getLoopExitInstr())
.value_or(1);
- Type *I32Ty = IntegerType::getInt32Ty(PhiTy->getContext());
- auto *ScaleFactorVPV =
- Plan->getOrAddLiveIn(ConstantInt::get(I32Ty, ScaleFactor));
+ auto *ScaleFactorVPV = Plan->getConstantInt(32, ScaleFactor);
VPValue *StartV = PHBuilder.createNaryOp(
VPInstruction::ReductionStartVector,
{PhiR->getStartValue(), Iden, ScaleFactorVPV},
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
index 1504acf..08c9c15 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -4393,15 +4393,25 @@ public:
}
/// Return a VPValue wrapping i1 true.
- VPValue *getTrue() {
- LLVMContext &Ctx = getContext();
- return getOrAddLiveIn(ConstantInt::getTrue(Ctx));
- }
+ VPValue *getTrue() { return getConstantInt(1, 1); }
/// Return a VPValue wrapping i1 false.
- VPValue *getFalse() {
- LLVMContext &Ctx = getContext();
- return getOrAddLiveIn(ConstantInt::getFalse(Ctx));
+ VPValue *getFalse() { return getConstantInt(1, 0); }
+
+ /// Return a VPValue wrapping a ConstantInt with the given type and value.
+ VPValue *getConstantInt(Type *Ty, uint64_t Val, bool IsSigned = false) {
+ return getOrAddLiveIn(ConstantInt::get(Ty, Val, IsSigned));
+ }
+
+ /// Return a VPValue wrapping a ConstantInt with the given bitwidth and value.
+ VPValue *getConstantInt(unsigned BitWidth, uint64_t Val,
+ bool IsSigned = false) {
+ return getConstantInt(APInt(BitWidth, Val, IsSigned));
+ }
+
+ /// Return a VPValue wrapping a ConstantInt with the given APInt value.
+ VPValue *getConstantInt(const APInt &Val) {
+ return getOrAddLiveIn(ConstantInt::get(getContext(), Val));
}
/// Return the live-in VPValue for \p V, if there is one or nullptr otherwise.
diff --git a/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp b/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp
index 65688a3..1a66d20 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanConstruction.cpp
@@ -612,8 +612,7 @@ void VPlanTransforms::addMiddleCheck(VPlan &Plan,
if (!RequiresScalarEpilogueCheck)
Cmp = Plan.getFalse();
else if (TailFolded)
- Cmp = Plan.getOrAddLiveIn(
- ConstantInt::getTrue(IntegerType::getInt1Ty(Plan.getContext())));
+ Cmp = Plan.getTrue();
else
Cmp = Builder.createICmp(CmpInst::ICMP_EQ, Plan.getTripCount(),
&Plan.getVectorTripCount(), LatchDL, "cmp.n");
@@ -712,8 +711,8 @@ void VPlanTransforms::addMinimumIterationCheck(
// additional overflow check is required before entering the vector loop.
// Get the maximum unsigned value for the type.
- VPValue *MaxUIntTripCount = Plan.getOrAddLiveIn(ConstantInt::get(
- TripCountTy, cast<IntegerType>(TripCountTy)->getMask()));
+ VPValue *MaxUIntTripCount =
+ Plan.getConstantInt(cast<IntegerType>(TripCountTy)->getMask());
VPValue *DistanceToMax = Builder.createNaryOp(
Instruction::Sub, {MaxUIntTripCount, TripCountVPV},
DebugLoc::getUnknown());
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index d491d56..6a8231b 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -699,8 +699,7 @@ static void legalizeAndOptimizeInductions(VPlan &Plan) {
continue;
const InductionDescriptor &ID = PtrIV->getInductionDescriptor();
- VPValue *StartV =
- Plan.getOrAddLiveIn(ConstantInt::get(ID.getStep()->getType(), 0));
+ VPValue *StartV = Plan.getConstantInt(ID.getStep()->getType(), 0);
VPValue *StepV = PtrIV->getOperand(1);
VPScalarIVStepsRecipe *Steps = createScalarIVSteps(
Plan, InductionDescriptor::IK_IntInduction, Instruction::Add, nullptr,
@@ -836,7 +835,7 @@ static VPValue *optimizeEarlyExitInductionUser(VPlan &Plan,
// changed it means the exit is using the incremented value, so we need to
// add the step.
if (Incoming != WideIV) {
- VPValue *One = Plan.getOrAddLiveIn(ConstantInt::get(CanonicalIVType, 1));
+ VPValue *One = Plan.getConstantInt(CanonicalIVType, 1);
EndValue = B.createNaryOp(Instruction::Add, {EndValue, One}, DL);
}
@@ -882,7 +881,7 @@ static VPValue *optimizeLatchExitInductionUser(
return B.createNaryOp(Instruction::Sub, {EndValue, Step}, {}, "ind.escape");
if (ScalarTy->isPointerTy()) {
Type *StepTy = TypeInfo.inferScalarType(Step);
- auto *Zero = Plan.getOrAddLiveIn(ConstantInt::get(StepTy, 0));
+ auto *Zero = Plan.getConstantInt(StepTy, 0);
return B.createPtrAdd(EndValue,
B.createNaryOp(Instruction::Sub, {Zero, Step}),
DebugLoc::getUnknown(), "ind.escape");
@@ -1574,9 +1573,9 @@ static bool optimizeVectorInductionWidthForTCAndVFUF(VPlan &Plan,
continue;
// Update IV operands and comparison bound to use new narrower type.
- auto *NewStart = Plan.getOrAddLiveIn(ConstantInt::get(NewIVTy, 0));
+ auto *NewStart = Plan.getConstantInt(NewIVTy, 0);
WideIV->setStartValue(NewStart);
- auto *NewStep = Plan.getOrAddLiveIn(ConstantInt::get(NewIVTy, 1));
+ auto *NewStep = Plan.getConstantInt(NewIVTy, 1);
WideIV->setStepValue(NewStep);
auto *NewBTC = new VPWidenCastRecipe(
@@ -1695,8 +1694,7 @@ static bool tryToReplaceALMWithWideALM(VPlan &Plan, ElementCount VF,
// When using wide lane masks, the return type of the get.active.lane.mask
// intrinsic is VF x UF (last operand).
- VPValue *ALMMultiplier =
- Plan.getOrAddLiveIn(ConstantInt::get(IntegerType::getInt64Ty(Ctx), UF));
+ VPValue *ALMMultiplier = Plan.getConstantInt(64, UF);
EntryALM->setOperand(2, ALMMultiplier);
LoopALM->setOperand(2, ALMMultiplier);
@@ -2403,7 +2401,7 @@ static VPActiveLaneMaskPHIRecipe *addVPLaneMaskPhiAndUpdateExitBranch(
// Create the active lane mask instruction in the VPlan preheader.
VPValue *ALMMultiplier =
- Plan.getOrAddLiveIn(ConstantInt::get(TopRegion->getCanonicalIVType(), 1));
+ Plan.getConstantInt(TopRegion->getCanonicalIVType(), 1);
auto *EntryALM = Builder.createNaryOp(VPInstruction::ActiveLaneMask,
{EntryIncrement, TC, ALMMultiplier}, DL,
"active.lane.mask.entry");
@@ -2790,8 +2788,7 @@ void VPlanTransforms::addExplicitVectorLength(
if (MaxSafeElements) {
// Support for MaxSafeDist for correct loop emission.
- VPValue *AVLSafe =
- Plan.getOrAddLiveIn(ConstantInt::get(CanIVTy, *MaxSafeElements));
+ VPValue *AVLSafe = Plan.getConstantInt(CanIVTy, *MaxSafeElements);
VPValue *Cmp = Builder.createICmp(ICmpInst::ICMP_ULT, AVL, AVLSafe);
AVL = Builder.createSelect(Cmp, AVL, AVLSafe, DebugLoc::getUnknown(),
"safe_avl");
@@ -2904,9 +2901,8 @@ void VPlanTransforms::canonicalizeEVLLoops(VPlan &Plan) {
Type *AVLTy = VPTypeAnalysis(Plan).inferScalarType(AVLNext);
VPBuilder Builder(LatchExitingBr);
- VPValue *Cmp =
- Builder.createICmp(CmpInst::ICMP_EQ, AVLNext,
- Plan.getOrAddLiveIn(ConstantInt::getNullValue(AVLTy)));
+ VPValue *Cmp = Builder.createICmp(CmpInst::ICMP_EQ, AVLNext,
+ Plan.getConstantInt(AVLTy, 0));
Builder.createNaryOp(VPInstruction::BranchOnCond, Cmp);
LatchExitingBr->eraseFromParent();
}
@@ -2930,8 +2926,7 @@ void VPlanTransforms::replaceSymbolicStrides(
// Only handle constant strides for now.
continue;
- auto *CI =
- Plan.getOrAddLiveIn(ConstantInt::get(Stride->getType(), *StrideConst));
+ auto *CI = Plan.getConstantInt(*StrideConst);
if (VPValue *StrideVPV = Plan.getLiveIn(StrideV))
StrideVPV->replaceUsesWithIf(CI, CanUseVersionedStride);
@@ -2946,7 +2941,7 @@ void VPlanTransforms::replaceSymbolicStrides(
unsigned BW = U->getType()->getScalarSizeInBits();
APInt C =
isa<SExtInst>(U) ? StrideConst->sext(BW) : StrideConst->zext(BW);
- VPValue *CI = Plan.getOrAddLiveIn(ConstantInt::get(U->getType(), C));
+ VPValue *CI = Plan.getConstantInt(C);
StrideVPV->replaceUsesWithIf(CI, CanUseVersionedStride);
}
RewriteMap[StrideV] = PSE.getSCEV(StrideV);
@@ -3125,8 +3120,7 @@ void VPlanTransforms::createInterleaveGroups(
DL.getTypeAllocSize(getLoadStoreType(IRInsertPos)) *
IG->getIndex(IRInsertPos),
/*IsSigned=*/true);
- VPValue *OffsetVPV =
- Plan.getOrAddLiveIn(ConstantInt::get(Plan.getContext(), -Offset));
+ VPValue *OffsetVPV = Plan.getConstantInt(-Offset);
VPBuilder B(InsertPos);
Addr = B.createNoWrapPtrAdd(InsertPos->getAddr(), OffsetVPV, NW);
}
@@ -3867,8 +3861,7 @@ void VPlanTransforms::materializeBackedgeTakenCount(VPlan &Plan,
VPBuilder Builder(VectorPH, VectorPH->begin());
auto *TCTy = VPTypeAnalysis(Plan).inferScalarType(Plan.getTripCount());
auto *TCMO = Builder.createNaryOp(
- Instruction::Sub,
- {Plan.getTripCount(), Plan.getOrAddLiveIn(ConstantInt::get(TCTy, 1))},
+ Instruction::Sub, {Plan.getTripCount(), Plan.getConstantInt(TCTy, 1)},
DebugLoc::getCompilerGenerated(), "trip.count.minus.1");
BTC->replaceAllUsesWith(TCMO);
}
@@ -3993,9 +3986,8 @@ void VPlanTransforms::materializeVectorTripCount(VPlan &Plan,
if (TailByMasking) {
TC = Builder.createNaryOp(
Instruction::Add,
- {TC, Builder.createNaryOp(
- Instruction::Sub,
- {Step, Plan.getOrAddLiveIn(ConstantInt::get(TCTy, 1))})},
+ {TC, Builder.createNaryOp(Instruction::Sub,
+ {Step, Plan.getConstantInt(TCTy, 1)})},
DebugLoc::getCompilerGenerated(), "n.rnd.up");
}
@@ -4017,8 +4009,8 @@ void VPlanTransforms::materializeVectorTripCount(VPlan &Plan,
if (RequiresScalarEpilogue) {
assert(!TailByMasking &&
"requiring scalar epilogue is not supported with fail folding");
- VPValue *IsZero = Builder.createICmp(
- CmpInst::ICMP_EQ, R, Plan.getOrAddLiveIn(ConstantInt::get(TCTy, 0)));
+ VPValue *IsZero =
+ Builder.createICmp(CmpInst::ICMP_EQ, R, Plan.getConstantInt(TCTy, 0));
R = Builder.createSelect(IsZero, Step, R);
}
@@ -4056,7 +4048,7 @@ void VPlanTransforms::materializeVFAndVFxUF(VPlan &Plan, VPBasicBlock *VectorPH,
}
VF.replaceAllUsesWith(RuntimeVF);
- VPValue *UF = Plan.getOrAddLiveIn(ConstantInt::get(TCTy, Plan.getUF()));
+ VPValue *UF = Plan.getConstantInt(TCTy, Plan.getUF());
VPValue *MulByUF = Builder.createNaryOp(Instruction::Mul, {RuntimeVF, UF});
VFxUF.replaceAllUsesWith(MulByUF);
}
@@ -4346,7 +4338,7 @@ void VPlanTransforms::narrowInterleaveGroups(VPlan &Plan, ElementCount VF,
} else {
Inc->setOperand(1, UF);
Plan.getVF().replaceAllUsesWith(
- Plan.getOrAddLiveIn(ConstantInt::get(CanIV->getScalarType(), 1)));
+ Plan.getConstantInt(CanIV->getScalarType(), 1));
}
removeDeadRecipes(Plan);
}
diff --git a/llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp b/llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp
index f15113c..d6a0028 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp
@@ -68,9 +68,9 @@ class UnrollState {
void unrollWidenInductionByUF(VPWidenInductionRecipe *IV,
VPBasicBlock::iterator InsertPtForPhi);
- VPValue *getConstantVPV(unsigned Part) {
+ VPValue *getConstantInt(unsigned Part) {
Type *CanIVIntTy = Plan.getVectorLoopRegion()->getCanonicalIVType();
- return Plan.getOrAddLiveIn(ConstantInt::get(CanIVIntTy, Part));
+ return Plan.getConstantInt(CanIVIntTy, Part);
}
public:
@@ -137,7 +137,7 @@ void UnrollState::unrollReplicateRegionByUF(VPRegionBlock *VPR) {
for (const auto &[PartIR, Part0R] : zip(*PartIVPBB, *Part0VPBB)) {
remapOperands(&PartIR, Part);
if (auto *ScalarIVSteps = dyn_cast<VPScalarIVStepsRecipe>(&PartIR)) {
- ScalarIVSteps->addOperand(getConstantVPV(Part));
+ ScalarIVSteps->addOperand(getConstantInt(Part));
}
addRecipeForPart(&Part0R, &PartIR, Part);
@@ -249,7 +249,7 @@ void UnrollState::unrollHeaderPHIByUF(VPHeaderPHIRecipe *R,
for (unsigned Part = 1; Part != UF; ++Part)
VPV2Parts[VPI][Part - 1] = StartV;
}
- Copy->addOperand(getConstantVPV(Part));
+ Copy->addOperand(getConstantInt(Part));
} else {
assert(isa<VPActiveLaneMaskPHIRecipe>(R) &&
"unexpected header phi recipe not needing unrolled part");
@@ -318,7 +318,7 @@ void UnrollState::unrollRecipeByUF(VPRecipeBase &R) {
VPVectorPointerRecipe, VPVectorEndPointerRecipe>(Copy) ||
match(Copy,
m_VPInstruction<VPInstruction::CanonicalIVIncrementForPart>()))
- Copy->addOperand(getConstantVPV(Part));
+ Copy->addOperand(getConstantInt(Part));
if (isa<VPVectorPointerRecipe, VPVectorEndPointerRecipe>(R))
Copy->setOperand(0, R.getOperand(0));
@@ -474,8 +474,7 @@ cloneForLane(VPlan &Plan, VPBuilder &Builder, Type *IdxTy,
if (LaneDefs != Def2LaneDefs.end())
return LaneDefs->second[Lane.getKnownLane()];
- VPValue *Idx =
- Plan.getOrAddLiveIn(ConstantInt::get(IdxTy, Lane.getKnownLane()));
+ VPValue *Idx = Plan.getConstantInt(IdxTy, Lane.getKnownLane());
return Builder.createNaryOp(Instruction::ExtractElement, {Op, Idx});
}
@@ -509,8 +508,7 @@ cloneForLane(VPlan &Plan, VPBuilder &Builder, Type *IdxTy,
cast<VPInstruction>(Op)->getOperand(Lane.getKnownLane()));
continue;
}
- VPValue *Idx =
- Plan.getOrAddLiveIn(ConstantInt::get(IdxTy, Lane.getKnownLane()));
+ VPValue *Idx = Plan.getConstantInt(IdxTy, Lane.getKnownLane());
VPValue *Ext = Builder.createNaryOp(Instruction::ExtractElement, {Op, Idx});
NewOps.push_back(Ext);
}
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.abs.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.abs.ll
index 02d0e52..6facdfd 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.abs.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.abs.ll
@@ -104,109 +104,110 @@ define amdgpu_cs <4 x i32> @abs_sgpr_v4i32(<4 x i32> inreg %arg) {
ret <4 x i32> %res
}
-define amdgpu_cs i16 @abs_vgpr_i16(i16 %arg) {
+define i16 @abs_vgpr_i16(i16 %arg) {
; GFX6-LABEL: abs_vgpr_i16:
; GFX6: ; %bb.0:
+; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-NEXT: v_bfe_i32 v0, v0, 0, 16
; GFX6-NEXT: v_sub_i32_e32 v1, vcc, 0, v0
; GFX6-NEXT: v_max_i32_e32 v0, v0, v1
-; GFX6-NEXT: v_readfirstlane_b32 s0, v0
-; GFX6-NEXT: ; return to shader part epilog
+; GFX6-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: abs_vgpr_i16:
; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_sub_u16_e32 v1, 0, v0
; GFX8-NEXT: v_max_i16_e32 v0, v0, v1
-; GFX8-NEXT: v_readfirstlane_b32 s0, v0
-; GFX8-NEXT: ; return to shader part epilog
+; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: abs_vgpr_i16:
; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_sub_nc_u16 v1, 0, v0
; GFX10-NEXT: v_max_i16 v0, v0, v1
-; GFX10-NEXT: v_readfirstlane_b32 s0, v0
-; GFX10-NEXT: ; return to shader part epilog
+; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX1250-LABEL: abs_vgpr_i16:
; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
; GFX1250-NEXT: v_sub_nc_u16 v1, 0, v0
-; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1250-NEXT: v_max_i16 v0, v0, v1
-; GFX1250-NEXT: v_readfirstlane_b32 s0, v0
-; GFX1250-NEXT: ; return to shader part epilog
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
%res = call i16 @llvm.abs.i16(i16 %arg, i1 false)
ret i16 %res
}
-define amdgpu_cs i32 @abs_vgpr_i32(i32 %arg) {
+define i32 @abs_vgpr_i32(i32 %arg) {
; GFX6-LABEL: abs_vgpr_i32:
; GFX6: ; %bb.0:
+; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-NEXT: v_sub_i32_e32 v1, vcc, 0, v0
; GFX6-NEXT: v_max_i32_e32 v0, v0, v1
-; GFX6-NEXT: v_readfirstlane_b32 s0, v0
-; GFX6-NEXT: ; return to shader part epilog
+; GFX6-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: abs_vgpr_i32:
; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_sub_u32_e32 v1, vcc, 0, v0
; GFX8-NEXT: v_max_i32_e32 v0, v0, v1
-; GFX8-NEXT: v_readfirstlane_b32 s0, v0
-; GFX8-NEXT: ; return to shader part epilog
+; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: abs_vgpr_i32:
; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_sub_nc_u32_e32 v1, 0, v0
; GFX10-NEXT: v_max_i32_e32 v0, v0, v1
-; GFX10-NEXT: v_readfirstlane_b32 s0, v0
-; GFX10-NEXT: ; return to shader part epilog
+; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX1250-LABEL: abs_vgpr_i32:
; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
; GFX1250-NEXT: v_sub_nc_u32_e32 v1, 0, v0
-; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1250-NEXT: v_max_i32_e32 v0, v0, v1
-; GFX1250-NEXT: v_readfirstlane_b32 s0, v0
-; GFX1250-NEXT: ; return to shader part epilog
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
%res = call i32 @llvm.abs.i32(i32 %arg, i1 false)
ret i32 %res
}
-define amdgpu_cs i64 @abs_vgpr_i64(i64 %arg) {
+define i64 @abs_vgpr_i64(i64 %arg) {
; GFX6-LABEL: abs_vgpr_i64:
; GFX6: ; %bb.0:
+; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-NEXT: v_ashrrev_i32_e32 v2, 31, v1
; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v2
; GFX6-NEXT: v_addc_u32_e32 v1, vcc, v1, v2, vcc
; GFX6-NEXT: v_xor_b32_e32 v0, v0, v2
; GFX6-NEXT: v_xor_b32_e32 v1, v1, v2
-; GFX6-NEXT: v_readfirstlane_b32 s0, v0
-; GFX6-NEXT: v_readfirstlane_b32 s1, v1
-; GFX6-NEXT: ; return to shader part epilog
+; GFX6-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: abs_vgpr_i64:
; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_ashrrev_i32_e32 v2, 31, v1
; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v2
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, v1, v2, vcc
; GFX8-NEXT: v_xor_b32_e32 v0, v0, v2
; GFX8-NEXT: v_xor_b32_e32 v1, v1, v2
-; GFX8-NEXT: v_readfirstlane_b32 s0, v0
-; GFX8-NEXT: v_readfirstlane_b32 s1, v1
-; GFX8-NEXT: ; return to shader part epilog
+; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: abs_vgpr_i64:
; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_ashrrev_i32_e32 v2, 31, v1
; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2
; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v1, v2, vcc_lo
; GFX10-NEXT: v_xor_b32_e32 v0, v0, v2
; GFX10-NEXT: v_xor_b32_e32 v1, v1, v2
-; GFX10-NEXT: v_readfirstlane_b32 s0, v0
-; GFX10-NEXT: v_readfirstlane_b32 s1, v1
-; GFX10-NEXT: ; return to shader part epilog
+; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX1250-LABEL: abs_vgpr_i64:
; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
; GFX1250-NEXT: v_ashrrev_i32_e32 v2, 31, v1
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-NEXT: v_mov_b32_e32 v3, v2
@@ -214,17 +215,15 @@ define amdgpu_cs i64 @abs_vgpr_i64(i64 %arg) {
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1250-NEXT: v_xor_b32_e32 v0, v0, v2
; GFX1250-NEXT: v_xor_b32_e32 v1, v1, v2
-; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1250-NEXT: v_readfirstlane_b32 s0, v0
-; GFX1250-NEXT: v_readfirstlane_b32 s1, v1
-; GFX1250-NEXT: ; return to shader part epilog
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
%res = call i64 @llvm.abs.i64(i64 %arg, i1 false)
ret i64 %res
}
-define amdgpu_cs <4 x i32> @abs_vgpr_v4i32(<4 x i32> %arg) {
+define <4 x i32> @abs_vgpr_v4i32(<4 x i32> %arg) {
; GFX6-LABEL: abs_vgpr_v4i32:
; GFX6: ; %bb.0:
+; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-NEXT: v_sub_i32_e32 v4, vcc, 0, v0
; GFX6-NEXT: v_max_i32_e32 v0, v0, v4
; GFX6-NEXT: v_sub_i32_e32 v4, vcc, 0, v1
@@ -233,14 +232,11 @@ define amdgpu_cs <4 x i32> @abs_vgpr_v4i32(<4 x i32> %arg) {
; GFX6-NEXT: v_max_i32_e32 v2, v2, v4
; GFX6-NEXT: v_sub_i32_e32 v4, vcc, 0, v3
; GFX6-NEXT: v_max_i32_e32 v3, v3, v4
-; GFX6-NEXT: v_readfirstlane_b32 s0, v0
-; GFX6-NEXT: v_readfirstlane_b32 s1, v1
-; GFX6-NEXT: v_readfirstlane_b32 s2, v2
-; GFX6-NEXT: v_readfirstlane_b32 s3, v3
-; GFX6-NEXT: ; return to shader part epilog
+; GFX6-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: abs_vgpr_v4i32:
; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_sub_u32_e32 v4, vcc, 0, v0
; GFX8-NEXT: v_max_i32_e32 v0, v0, v4
; GFX8-NEXT: v_sub_u32_e32 v4, vcc, 0, v1
@@ -249,14 +245,11 @@ define amdgpu_cs <4 x i32> @abs_vgpr_v4i32(<4 x i32> %arg) {
; GFX8-NEXT: v_max_i32_e32 v2, v2, v4
; GFX8-NEXT: v_sub_u32_e32 v4, vcc, 0, v3
; GFX8-NEXT: v_max_i32_e32 v3, v3, v4
-; GFX8-NEXT: v_readfirstlane_b32 s0, v0
-; GFX8-NEXT: v_readfirstlane_b32 s1, v1
-; GFX8-NEXT: v_readfirstlane_b32 s2, v2
-; GFX8-NEXT: v_readfirstlane_b32 s3, v3
-; GFX8-NEXT: ; return to shader part epilog
+; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: abs_vgpr_v4i32:
; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_sub_nc_u32_e32 v4, 0, v0
; GFX10-NEXT: v_sub_nc_u32_e32 v5, 0, v1
; GFX10-NEXT: v_sub_nc_u32_e32 v6, 0, v2
@@ -265,14 +258,12 @@ define amdgpu_cs <4 x i32> @abs_vgpr_v4i32(<4 x i32> %arg) {
; GFX10-NEXT: v_max_i32_e32 v1, v1, v5
; GFX10-NEXT: v_max_i32_e32 v2, v2, v6
; GFX10-NEXT: v_max_i32_e32 v3, v3, v7
-; GFX10-NEXT: v_readfirstlane_b32 s0, v0
-; GFX10-NEXT: v_readfirstlane_b32 s1, v1
-; GFX10-NEXT: v_readfirstlane_b32 s2, v2
-; GFX10-NEXT: v_readfirstlane_b32 s3, v3
-; GFX10-NEXT: ; return to shader part epilog
+; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX1250-LABEL: abs_vgpr_v4i32:
; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
; GFX1250-NEXT: v_dual_sub_nc_u32 v4, 0, v0 :: v_dual_sub_nc_u32 v5, 0, v1
; GFX1250-NEXT: v_dual_sub_nc_u32 v6, 0, v2 :: v_dual_sub_nc_u32 v7, 0, v3
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
@@ -281,13 +272,7 @@ define amdgpu_cs <4 x i32> @abs_vgpr_v4i32(<4 x i32> %arg) {
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX1250-NEXT: v_max_i32_e32 v2, v2, v6
; GFX1250-NEXT: v_max_i32_e32 v3, v3, v7
-; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX1250-NEXT: v_readfirstlane_b32 s0, v0
-; GFX1250-NEXT: v_readfirstlane_b32 s1, v1
-; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX1250-NEXT: v_readfirstlane_b32 s2, v2
-; GFX1250-NEXT: v_readfirstlane_b32 s3, v3
-; GFX1250-NEXT: ; return to shader part epilog
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
%res = call <4 x i32> @llvm.abs.v4i32(<4 x i32> %arg, i1 false)
ret <4 x i32> %res
}
@@ -304,44 +289,43 @@ define amdgpu_cs <2 x i8> @abs_sgpr_v2i8(<2 x i8> inreg %arg) {
ret <2 x i8> %res
}
-define amdgpu_cs <2 x i8> @abs_vgpr_v2i8(<2 x i8> %arg) {
+define <2 x i8> @abs_vgpr_v2i8(<2 x i8> %arg) {
; GFX6-LABEL: abs_vgpr_v2i8:
; GFX6: ; %bb.0:
+; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-NEXT: v_bfe_i32 v0, v0, 0, 8
; GFX6-NEXT: v_sub_i32_e32 v2, vcc, 0, v0
; GFX6-NEXT: v_bfe_i32 v1, v1, 0, 8
; GFX6-NEXT: v_max_i32_e32 v0, v0, v2
; GFX6-NEXT: v_sub_i32_e32 v2, vcc, 0, v1
; GFX6-NEXT: v_max_i32_e32 v1, v1, v2
-; GFX6-NEXT: v_readfirstlane_b32 s0, v0
-; GFX6-NEXT: v_readfirstlane_b32 s1, v1
-; GFX6-NEXT: ; return to shader part epilog
+; GFX6-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: abs_vgpr_v2i8:
; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v2, 0
; GFX8-NEXT: v_sub_u16_sdwa v3, v2, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; GFX8-NEXT: v_sub_u16_sdwa v2, v2, sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; GFX8-NEXT: v_max_i16_sdwa v0, sext(v0), v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX8-NEXT: v_max_i16_sdwa v1, sext(v1), v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX8-NEXT: v_readfirstlane_b32 s0, v0
-; GFX8-NEXT: v_readfirstlane_b32 s1, v1
-; GFX8-NEXT: ; return to shader part epilog
+; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: abs_vgpr_v2i8:
; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_bfe_i32 v0, v0, 0, 8
; GFX10-NEXT: v_bfe_i32 v1, v1, 0, 8
; GFX10-NEXT: v_sub_nc_u16 v2, 0, v0
; GFX10-NEXT: v_sub_nc_u16 v3, 0, v1
; GFX10-NEXT: v_max_i16 v0, v0, v2
; GFX10-NEXT: v_max_i16 v1, v1, v3
-; GFX10-NEXT: v_readfirstlane_b32 s0, v0
-; GFX10-NEXT: v_readfirstlane_b32 s1, v1
-; GFX10-NEXT: ; return to shader part epilog
+; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX1250-LABEL: abs_vgpr_v2i8:
; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
; GFX1250-NEXT: v_bfe_i32 v0, v0, 0, 8
; GFX1250-NEXT: v_bfe_i32 v1, v1, 0, 8
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
@@ -350,10 +334,7 @@ define amdgpu_cs <2 x i8> @abs_vgpr_v2i8(<2 x i8> %arg) {
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1250-NEXT: v_max_i16 v0, v0, v2
; GFX1250-NEXT: v_max_i16 v1, v1, v3
-; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1250-NEXT: v_readfirstlane_b32 s0, v0
-; GFX1250-NEXT: v_readfirstlane_b32 s1, v1
-; GFX1250-NEXT: ; return to shader part epilog
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
%res = call <2 x i8> @llvm.abs.v2i8(<2 x i8> %arg, i1 false)
ret <2 x i8> %res
}
@@ -372,9 +353,10 @@ define amdgpu_cs <3 x i8> @abs_sgpr_v3i8(<3 x i8> inreg %arg) {
ret <3 x i8> %res
}
-define amdgpu_cs <3 x i8> @abs_vgpr_v3i8(<3 x i8> %arg) {
+define <3 x i8> @abs_vgpr_v3i8(<3 x i8> %arg) {
; GFX6-LABEL: abs_vgpr_v3i8:
; GFX6: ; %bb.0:
+; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-NEXT: v_bfe_i32 v0, v0, 0, 8
; GFX6-NEXT: v_sub_i32_e32 v3, vcc, 0, v0
; GFX6-NEXT: v_bfe_i32 v1, v1, 0, 8
@@ -384,13 +366,11 @@ define amdgpu_cs <3 x i8> @abs_vgpr_v3i8(<3 x i8> %arg) {
; GFX6-NEXT: v_max_i32_e32 v1, v1, v3
; GFX6-NEXT: v_sub_i32_e32 v3, vcc, 0, v2
; GFX6-NEXT: v_max_i32_e32 v2, v2, v3
-; GFX6-NEXT: v_readfirstlane_b32 s0, v0
-; GFX6-NEXT: v_readfirstlane_b32 s1, v1
-; GFX6-NEXT: v_readfirstlane_b32 s2, v2
-; GFX6-NEXT: ; return to shader part epilog
+; GFX6-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: abs_vgpr_v3i8:
; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v3, 0
; GFX8-NEXT: v_sub_u16_sdwa v4, v3, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; GFX8-NEXT: v_max_i16_sdwa v0, sext(v0), v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
@@ -398,13 +378,11 @@ define amdgpu_cs <3 x i8> @abs_vgpr_v3i8(<3 x i8> %arg) {
; GFX8-NEXT: v_sub_u16_sdwa v3, v3, sext(v2) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
; GFX8-NEXT: v_max_i16_sdwa v1, sext(v1), v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
; GFX8-NEXT: v_max_i16_sdwa v2, sext(v2), v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX8-NEXT: v_readfirstlane_b32 s0, v0
-; GFX8-NEXT: v_readfirstlane_b32 s1, v1
-; GFX8-NEXT: v_readfirstlane_b32 s2, v2
-; GFX8-NEXT: ; return to shader part epilog
+; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: abs_vgpr_v3i8:
; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_bfe_i32 v0, v0, 0, 8
; GFX10-NEXT: v_bfe_i32 v1, v1, 0, 8
; GFX10-NEXT: v_bfe_i32 v2, v2, 0, 8
@@ -414,13 +392,12 @@ define amdgpu_cs <3 x i8> @abs_vgpr_v3i8(<3 x i8> %arg) {
; GFX10-NEXT: v_max_i16 v0, v0, v3
; GFX10-NEXT: v_max_i16 v1, v1, v4
; GFX10-NEXT: v_max_i16 v2, v2, v5
-; GFX10-NEXT: v_readfirstlane_b32 s0, v0
-; GFX10-NEXT: v_readfirstlane_b32 s1, v1
-; GFX10-NEXT: v_readfirstlane_b32 s2, v2
-; GFX10-NEXT: ; return to shader part epilog
+; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX1250-LABEL: abs_vgpr_v3i8:
; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
; GFX1250-NEXT: v_bfe_i32 v0, v0, 0, 8
; GFX1250-NEXT: v_bfe_i32 v1, v1, 0, 8
; GFX1250-NEXT: v_bfe_i32 v2, v2, 0, 8
@@ -433,12 +410,7 @@ define amdgpu_cs <3 x i8> @abs_vgpr_v3i8(<3 x i8> %arg) {
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX1250-NEXT: v_max_i16 v1, v1, v4
; GFX1250-NEXT: v_max_i16 v2, v2, v5
-; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX1250-NEXT: v_readfirstlane_b32 s0, v0
-; GFX1250-NEXT: v_readfirstlane_b32 s1, v1
-; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3)
-; GFX1250-NEXT: v_readfirstlane_b32 s2, v2
-; GFX1250-NEXT: ; return to shader part epilog
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
%res = call <3 x i8> @llvm.abs.v3i8(<3 x i8> %arg, i1 false)
ret <3 x i8> %res
}
@@ -485,44 +457,44 @@ define amdgpu_cs <2 x i16> @abs_sgpr_v2i16(<2 x i16> inreg %arg) {
ret <2 x i16> %res
}
-define amdgpu_cs <2 x i16> @abs_vgpr_v2i16(<2 x i16> %arg) {
+define <2 x i16> @abs_vgpr_v2i16(<2 x i16> %arg) {
; GFX6-LABEL: abs_vgpr_v2i16:
; GFX6: ; %bb.0:
+; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-NEXT: v_bfe_i32 v0, v0, 0, 16
; GFX6-NEXT: v_sub_i32_e32 v2, vcc, 0, v0
; GFX6-NEXT: v_bfe_i32 v1, v1, 0, 16
; GFX6-NEXT: v_max_i32_e32 v0, v0, v2
; GFX6-NEXT: v_sub_i32_e32 v2, vcc, 0, v1
; GFX6-NEXT: v_max_i32_e32 v1, v1, v2
-; GFX6-NEXT: v_readfirstlane_b32 s0, v0
-; GFX6-NEXT: v_readfirstlane_b32 s1, v1
-; GFX6-NEXT: ; return to shader part epilog
+; GFX6-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: abs_vgpr_v2i16:
; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v2, 0
; GFX8-NEXT: v_sub_u16_e32 v1, 0, v0
; GFX8-NEXT: v_sub_u16_sdwa v2, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; GFX8-NEXT: v_max_i16_e32 v1, v0, v1
; GFX8-NEXT: v_max_i16_sdwa v0, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; GFX8-NEXT: v_or_b32_e32 v0, v1, v0
-; GFX8-NEXT: v_readfirstlane_b32 s0, v0
-; GFX8-NEXT: ; return to shader part epilog
+; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: abs_vgpr_v2i16:
; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_pk_sub_i16 v1, 0, v0
; GFX10-NEXT: v_pk_max_i16 v0, v0, v1
-; GFX10-NEXT: v_readfirstlane_b32 s0, v0
-; GFX10-NEXT: ; return to shader part epilog
+; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX1250-LABEL: abs_vgpr_v2i16:
; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
; GFX1250-NEXT: v_pk_sub_i16 v1, 0, v0
-; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1250-NEXT: v_pk_max_i16 v0, v0, v1
-; GFX1250-NEXT: v_readfirstlane_b32 s0, v0
-; GFX1250-NEXT: ; return to shader part epilog
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
%res = call <2 x i16> @llvm.abs.v2i16(<2 x i16> %arg, i1 false)
ret <2 x i16> %res
}
@@ -576,9 +548,10 @@ define amdgpu_cs <3 x i16> @abs_sgpr_v3i16(<3 x i16> inreg %arg) {
ret <3 x i16> %res
}
-define amdgpu_cs <3 x i16> @abs_vgpr_v3i16(<3 x i16> %arg) {
+define <3 x i16> @abs_vgpr_v3i16(<3 x i16> %arg) {
; GFX6-LABEL: abs_vgpr_v3i16:
; GFX6: ; %bb.0:
+; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-NEXT: v_bfe_i32 v0, v0, 0, 16
; GFX6-NEXT: v_sub_i32_e32 v3, vcc, 0, v0
; GFX6-NEXT: v_bfe_i32 v1, v1, 0, 16
@@ -588,13 +561,11 @@ define amdgpu_cs <3 x i16> @abs_vgpr_v3i16(<3 x i16> %arg) {
; GFX6-NEXT: v_max_i32_e32 v1, v1, v3
; GFX6-NEXT: v_sub_i32_e32 v3, vcc, 0, v2
; GFX6-NEXT: v_max_i32_e32 v2, v2, v3
-; GFX6-NEXT: v_readfirstlane_b32 s0, v0
-; GFX6-NEXT: v_readfirstlane_b32 s1, v1
-; GFX6-NEXT: v_readfirstlane_b32 s2, v2
-; GFX6-NEXT: ; return to shader part epilog
+; GFX6-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: abs_vgpr_v3i16:
; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v3, 0
; GFX8-NEXT: v_sub_u16_e32 v2, 0, v0
; GFX8-NEXT: v_sub_u16_sdwa v3, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
@@ -603,31 +574,27 @@ define amdgpu_cs <3 x i16> @abs_vgpr_v3i16(<3 x i16> %arg) {
; GFX8-NEXT: v_max_i16_sdwa v0, v0, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; GFX8-NEXT: v_or_b32_e32 v0, v2, v0
; GFX8-NEXT: v_max_i16_e32 v1, v1, v4
-; GFX8-NEXT: v_readfirstlane_b32 s0, v0
-; GFX8-NEXT: v_readfirstlane_b32 s1, v1
-; GFX8-NEXT: ; return to shader part epilog
+; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: abs_vgpr_v3i16:
; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_pk_sub_i16 v2, 0, v0
; GFX10-NEXT: v_sub_nc_u16 v3, 0, v1
; GFX10-NEXT: v_pk_max_i16 v0, v0, v2
; GFX10-NEXT: v_max_i16 v1, v1, v3
-; GFX10-NEXT: v_readfirstlane_b32 s0, v0
-; GFX10-NEXT: v_readfirstlane_b32 s1, v1
-; GFX10-NEXT: ; return to shader part epilog
+; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX1250-LABEL: abs_vgpr_v3i16:
; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
; GFX1250-NEXT: v_pk_sub_i16 v2, 0, v0
; GFX1250-NEXT: v_sub_nc_u16 v3, 0, v1
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1250-NEXT: v_pk_max_i16 v0, v0, v2
; GFX1250-NEXT: v_max_i16 v1, v1, v3
-; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1250-NEXT: v_readfirstlane_b32 s0, v0
-; GFX1250-NEXT: v_readfirstlane_b32 s1, v1
-; GFX1250-NEXT: ; return to shader part epilog
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
%res = call <3 x i16> @llvm.abs.v3i16(<3 x i16> %arg, i1 false)
ret <3 x i16> %res
}
diff --git a/llvm/test/CodeGen/AMDGPU/wait-xcnt.mir b/llvm/test/CodeGen/AMDGPU/wait-xcnt.mir
index 1b8e126..a1381ec 100644
--- a/llvm/test/CodeGen/AMDGPU/wait-xcnt.mir
+++ b/llvm/test/CodeGen/AMDGPU/wait-xcnt.mir
@@ -945,7 +945,6 @@ body: |
$vgpr0 = V_MOV_B32_e32 0, implicit $exec
...
-# FIXME: Missing S_WAIT_XCNT before overwriting vgpr0.
---
name: wait_kmcnt_with_outstanding_vmem_2
tracksRegLiveness: true
@@ -971,6 +970,7 @@ body: |
; GCN-NEXT: {{ $}}
; GCN-NEXT: S_WAIT_KMCNT 0
; GCN-NEXT: $sgpr2 = S_MOV_B32 $sgpr2
+ ; GCN-NEXT: S_WAIT_XCNT 0
; GCN-NEXT: $vgpr0 = V_MOV_B32_e32 0, implicit $exec
bb.0:
liveins: $vgpr0_vgpr1, $sgpr0_sgpr1, $scc
@@ -986,6 +986,180 @@ body: |
...
---
+name: wait_kmcnt_and_wait_loadcnt
+tracksRegLiveness: true
+machineFunctionInfo:
+ isEntryFunction: true
+body: |
+ ; GCN-LABEL: name: wait_kmcnt_and_wait_loadcnt
+ ; GCN: bb.0:
+ ; GCN-NEXT: successors: %bb.2(0x40000000), %bb.1(0x40000000)
+ ; GCN-NEXT: liveins: $vgpr0_vgpr1, $sgpr0_sgpr1, $scc
+ ; GCN-NEXT: {{ $}}
+ ; GCN-NEXT: $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 0, 0
+ ; GCN-NEXT: S_CBRANCH_SCC1 %bb.2, implicit $scc
+ ; GCN-NEXT: {{ $}}
+ ; GCN-NEXT: bb.1:
+ ; GCN-NEXT: successors: %bb.2(0x80000000)
+ ; GCN-NEXT: liveins: $vgpr0_vgpr1, $sgpr2
+ ; GCN-NEXT: {{ $}}
+ ; GCN-NEXT: $vgpr2 = GLOBAL_LOAD_DWORD $vgpr0_vgpr1, 0, 0, implicit $exec
+ ; GCN-NEXT: {{ $}}
+ ; GCN-NEXT: bb.2:
+ ; GCN-NEXT: liveins: $sgpr2
+ ; GCN-NEXT: {{ $}}
+ ; GCN-NEXT: S_WAIT_KMCNT 0
+ ; GCN-NEXT: $sgpr2 = S_MOV_B32 $sgpr2
+ ; GCN-NEXT: S_WAIT_LOADCNT 0
+ ; GCN-NEXT: $vgpr2 = V_MOV_B32_e32 0, implicit $exec
+ bb.0:
+ liveins: $vgpr0_vgpr1, $sgpr0_sgpr1, $scc
+ $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 0, 0
+ S_CBRANCH_SCC1 %bb.2, implicit $scc
+ bb.1:
+ liveins: $vgpr0_vgpr1, $sgpr2
+ $vgpr2 = GLOBAL_LOAD_DWORD $vgpr0_vgpr1, 0, 0, implicit $exec
+ bb.2:
+ liveins: $sgpr2
+ $sgpr2 = S_MOV_B32 $sgpr2
+ $vgpr2 = V_MOV_B32_e32 0, implicit $exec
+...
+
+---
+name: implicit_handling_of_pending_vmem_group
+tracksRegLiveness: true
+machineFunctionInfo:
+ isEntryFunction: true
+body: |
+ ; GCN-LABEL: name: implicit_handling_of_pending_vmem_group
+ ; GCN: bb.0:
+ ; GCN-NEXT: successors: %bb.2(0x40000000), %bb.1(0x40000000)
+ ; GCN-NEXT: liveins: $vgpr0_vgpr1, $sgpr0_sgpr1, $scc
+ ; GCN-NEXT: {{ $}}
+ ; GCN-NEXT: $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 0, 0
+ ; GCN-NEXT: S_CBRANCH_SCC1 %bb.2, implicit $scc
+ ; GCN-NEXT: {{ $}}
+ ; GCN-NEXT: bb.1:
+ ; GCN-NEXT: successors: %bb.2(0x80000000)
+ ; GCN-NEXT: liveins: $vgpr0_vgpr1, $sgpr2
+ ; GCN-NEXT: {{ $}}
+ ; GCN-NEXT: $vgpr2 = GLOBAL_LOAD_DWORD $vgpr0_vgpr1, 0, 0, implicit $exec
+ ; GCN-NEXT: {{ $}}
+ ; GCN-NEXT: bb.2:
+ ; GCN-NEXT: liveins: $sgpr0_sgpr1, $sgpr2
+ ; GCN-NEXT: {{ $}}
+ ; GCN-NEXT: S_WAIT_KMCNT 0
+ ; GCN-NEXT: $sgpr2 = S_MOV_B32 $sgpr2
+ ; GCN-NEXT: $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 0, 0
+ ; GCN-NEXT: $vgpr0 = V_MOV_B32_e32 0, implicit $exec
+ ; GCN-NEXT: S_WAIT_XCNT 0
+ ; GCN-NEXT: $sgpr0 = S_MOV_B32 $sgpr0
+ bb.0:
+ liveins: $vgpr0_vgpr1, $sgpr0_sgpr1, $scc
+ $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 0, 0
+ S_CBRANCH_SCC1 %bb.2, implicit $scc
+ bb.1:
+ liveins: $vgpr0_vgpr1, $sgpr2
+ $vgpr2 = GLOBAL_LOAD_DWORD $vgpr0_vgpr1, 0, 0, implicit $exec
+ bb.2:
+ liveins: $sgpr0_sgpr1, $sgpr2
+ $sgpr2 = S_MOV_B32 $sgpr2
+ $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 0, 0
+ $vgpr0 = V_MOV_B32_e32 0, implicit $exec
+ $sgpr0 = S_MOV_B32 $sgpr0
+...
+
+---
+name: pending_vmem_event_between_block
+tracksRegLiveness: true
+machineFunctionInfo:
+ isEntryFunction: true
+body: |
+ ; GCN-LABEL: name: pending_vmem_event_between_block
+ ; GCN: bb.0:
+ ; GCN-NEXT: successors: %bb.2(0x40000000), %bb.1(0x40000000)
+ ; GCN-NEXT: liveins: $vgpr0_vgpr1, $sgpr0_sgpr1, $scc
+ ; GCN-NEXT: {{ $}}
+ ; GCN-NEXT: $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 0, 0
+ ; GCN-NEXT: S_CBRANCH_SCC1 %bb.2, implicit $scc
+ ; GCN-NEXT: {{ $}}
+ ; GCN-NEXT: bb.1:
+ ; GCN-NEXT: successors: %bb.2(0x80000000)
+ ; GCN-NEXT: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3, $sgpr2
+ ; GCN-NEXT: {{ $}}
+ ; GCN-NEXT: $vgpr4 = GLOBAL_LOAD_DWORD $vgpr0_vgpr1, 0, 0, implicit $exec
+ ; GCN-NEXT: $vgpr5 = GLOBAL_LOAD_DWORD $vgpr2_vgpr3, 0, 0, implicit $exec
+ ; GCN-NEXT: {{ $}}
+ ; GCN-NEXT: bb.2:
+ ; GCN-NEXT: liveins: $sgpr0_sgpr1, $sgpr2, $vgpr2
+ ; GCN-NEXT: {{ $}}
+ ; GCN-NEXT: S_WAIT_KMCNT 0
+ ; GCN-NEXT: $sgpr2 = S_MOV_B32 $sgpr2
+ ; GCN-NEXT: S_WAIT_XCNT 1
+ ; GCN-NEXT: $vgpr1 = V_MOV_B32_e32 0, implicit $exec
+ ; GCN-NEXT: S_WAIT_XCNT 0
+ ; GCN-NEXT: $vgpr2 = V_MOV_B32_e32 0, implicit $exec
+ ; GCN-NEXT: $sgpr0 = S_MOV_B32 $sgpr0
+ bb.0:
+ liveins: $vgpr0_vgpr1, $sgpr0_sgpr1, $scc
+ $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 0, 0
+ S_CBRANCH_SCC1 %bb.2, implicit $scc
+ bb.1:
+ liveins: $vgpr0_vgpr1, $vgpr2_vgpr3, $sgpr2
+ $vgpr4 = GLOBAL_LOAD_DWORD $vgpr0_vgpr1, 0, 0, implicit $exec
+ $vgpr5 = GLOBAL_LOAD_DWORD $vgpr2_vgpr3, 0, 0, implicit $exec
+ bb.2:
+ liveins: $sgpr0_sgpr1, $sgpr2, $vgpr2
+ $sgpr2 = S_MOV_B32 $sgpr2
+ $vgpr1 = V_MOV_B32_e32 0, implicit $exec
+ $vgpr2 = V_MOV_B32_e32 0, implicit $exec
+ $sgpr0 = S_MOV_B32 $sgpr0
+...
+
+---
+name: flushing_vmem_cnt_on_block_entry
+tracksRegLiveness: true
+machineFunctionInfo:
+ isEntryFunction: true
+body: |
+ ; GCN-LABEL: name: flushing_vmem_cnt_on_block_entry
+ ; GCN: bb.0:
+ ; GCN-NEXT: successors: %bb.2(0x40000000), %bb.1(0x40000000)
+ ; GCN-NEXT: liveins: $vgpr0_vgpr1, $sgpr0_sgpr1, $scc
+ ; GCN-NEXT: {{ $}}
+ ; GCN-NEXT: $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 0, 0
+ ; GCN-NEXT: S_CBRANCH_SCC1 %bb.2, implicit $scc
+ ; GCN-NEXT: {{ $}}
+ ; GCN-NEXT: bb.1:
+ ; GCN-NEXT: successors: %bb.2(0x80000000)
+ ; GCN-NEXT: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3, $sgpr2
+ ; GCN-NEXT: {{ $}}
+ ; GCN-NEXT: $vgpr4 = GLOBAL_LOAD_DWORD $vgpr0_vgpr1, 0, 0, implicit $exec
+ ; GCN-NEXT: $vgpr5 = GLOBAL_LOAD_DWORD $vgpr2_vgpr3, 0, 0, implicit $exec
+ ; GCN-NEXT: {{ $}}
+ ; GCN-NEXT: bb.2:
+ ; GCN-NEXT: liveins: $sgpr0_sgpr1, $sgpr2, $vgpr2
+ ; GCN-NEXT: {{ $}}
+ ; GCN-NEXT: S_WAIT_XCNT 0
+ ; GCN-NEXT: $vgpr1 = V_MOV_B32_e32 0, implicit $exec
+ ; GCN-NEXT: $vgpr2 = V_MOV_B32_e32 0, implicit $exec
+ ; GCN-NEXT: $sgpr0 = S_MOV_B32 $sgpr0
+ bb.0:
+ liveins: $vgpr0_vgpr1, $sgpr0_sgpr1, $scc
+ $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 0, 0
+ S_CBRANCH_SCC1 %bb.2, implicit $scc
+ bb.1:
+ liveins: $vgpr0_vgpr1, $vgpr2_vgpr3, $sgpr2
+ $vgpr4 = GLOBAL_LOAD_DWORD $vgpr0_vgpr1, 0, 0, implicit $exec
+ $vgpr5 = GLOBAL_LOAD_DWORD $vgpr2_vgpr3, 0, 0, implicit $exec
+ bb.2:
+ liveins: $sgpr0_sgpr1, $sgpr2, $vgpr2
+ $vgpr1 = V_MOV_B32_e32 0, implicit $exec
+ $vgpr2 = V_MOV_B32_e32 0, implicit $exec
+ $sgpr0 = S_MOV_B32 $sgpr0
+...
+
+---
name: wait_loadcnt_with_outstanding_smem
tracksRegLiveness: true
machineFunctionInfo:
diff --git a/llvm/test/CodeGen/LoongArch/lasx/ctpop-ctlz.ll b/llvm/test/CodeGen/LoongArch/lasx/ctpop-ctlz.ll
index ba2118f..b3155c9 100644
--- a/llvm/test/CodeGen/LoongArch/lasx/ctpop-ctlz.ll
+++ b/llvm/test/CodeGen/LoongArch/lasx/ctpop-ctlz.ll
@@ -106,6 +106,69 @@ define void @ctlz_v4i64(ptr %src, ptr %dst) nounwind {
ret void
}
+define void @not_ctlz_v32i8(ptr %src, ptr %dst) nounwind {
+; CHECK-LABEL: not_ctlz_v32i8:
+; CHECK: # %bb.0:
+; CHECK-NEXT: xvld $xr0, $a0, 0
+; CHECK-NEXT: xvxori.b $xr0, $xr0, 255
+; CHECK-NEXT: xvclz.b $xr0, $xr0
+; CHECK-NEXT: xvst $xr0, $a1, 0
+; CHECK-NEXT: ret
+ %v = load <32 x i8>, ptr %src
+ %neg = xor <32 x i8> %v, <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>
+ %res = call <32 x i8> @llvm.ctlz.v32i8(<32 x i8> %neg, i1 false)
+ store <32 x i8> %res, ptr %dst
+ ret void
+}
+
+define void @not_ctlz_v16i16(ptr %src, ptr %dst) nounwind {
+; CHECK-LABEL: not_ctlz_v16i16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: xvld $xr0, $a0, 0
+; CHECK-NEXT: xvrepli.b $xr1, -1
+; CHECK-NEXT: xvxor.v $xr0, $xr0, $xr1
+; CHECK-NEXT: xvclz.h $xr0, $xr0
+; CHECK-NEXT: xvst $xr0, $a1, 0
+; CHECK-NEXT: ret
+ %v = load <16 x i16>, ptr %src
+ %neg = xor <16 x i16> %v, <i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1>
+ %res = call <16 x i16> @llvm.ctlz.v16i16(<16 x i16> %neg, i1 false)
+ store <16 x i16> %res, ptr %dst
+ ret void
+}
+
+define void @not_ctlz_v8i32(ptr %src, ptr %dst) nounwind {
+; CHECK-LABEL: not_ctlz_v8i32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: xvld $xr0, $a0, 0
+; CHECK-NEXT: xvrepli.b $xr1, -1
+; CHECK-NEXT: xvxor.v $xr0, $xr0, $xr1
+; CHECK-NEXT: xvclz.w $xr0, $xr0
+; CHECK-NEXT: xvst $xr0, $a1, 0
+; CHECK-NEXT: ret
+ %v = load <8 x i32>, ptr %src
+ %neg = xor <8 x i32> %v, <i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1>
+ %res = call <8 x i32> @llvm.ctlz.v8i32(<8 x i32> %neg, i1 false)
+ store <8 x i32> %res, ptr %dst
+ ret void
+}
+
+define void @not_ctlz_v4i64(ptr %src, ptr %dst) nounwind {
+; CHECK-LABEL: not_ctlz_v4i64:
+; CHECK: # %bb.0:
+; CHECK-NEXT: xvld $xr0, $a0, 0
+; CHECK-NEXT: xvrepli.b $xr1, -1
+; CHECK-NEXT: xvxor.v $xr0, $xr0, $xr1
+; CHECK-NEXT: xvclz.d $xr0, $xr0
+; CHECK-NEXT: xvst $xr0, $a1, 0
+; CHECK-NEXT: ret
+ %v = load <4 x i64>, ptr %src
+ %neg = xor <4 x i64> %v, <i64 -1, i64 -1, i64 -1, i64 -1>
+ %res = call <4 x i64> @llvm.ctlz.v4i64(<4 x i64> %neg, i1 false)
+ store <4 x i64> %res, ptr %dst
+ ret void
+}
+
declare <32 x i8> @llvm.ctpop.v32i8(<32 x i8>)
declare <16 x i16> @llvm.ctpop.v16i16(<16 x i16>)
declare <8 x i32> @llvm.ctpop.v8i32(<8 x i32>)
diff --git a/llvm/test/CodeGen/LoongArch/lsx/ctpop-ctlz.ll b/llvm/test/CodeGen/LoongArch/lsx/ctpop-ctlz.ll
index a9a38e8..6ac7d51 100644
--- a/llvm/test/CodeGen/LoongArch/lsx/ctpop-ctlz.ll
+++ b/llvm/test/CodeGen/LoongArch/lsx/ctpop-ctlz.ll
@@ -106,6 +106,69 @@ define void @ctlz_v2i64(ptr %src, ptr %dst) nounwind {
ret void
}
+define void @not_ctlz_v16i8(ptr %src, ptr %dst) nounwind {
+; CHECK-LABEL: not_ctlz_v16i8:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vld $vr0, $a0, 0
+; CHECK-NEXT: vxori.b $vr0, $vr0, 255
+; CHECK-NEXT: vclz.b $vr0, $vr0
+; CHECK-NEXT: vst $vr0, $a1, 0
+; CHECK-NEXT: ret
+ %v = load <16 x i8>, ptr %src
+ %neg = xor <16 x i8> %v, <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>
+ %res = call <16 x i8> @llvm.ctlz.v16i8(<16 x i8> %neg, i1 false)
+ store <16 x i8> %res, ptr %dst
+ ret void
+}
+
+define void @not_ctlz_v8i16(ptr %src, ptr %dst) nounwind {
+; CHECK-LABEL: not_ctlz_v8i16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vld $vr0, $a0, 0
+; CHECK-NEXT: vrepli.b $vr1, -1
+; CHECK-NEXT: vxor.v $vr0, $vr0, $vr1
+; CHECK-NEXT: vclz.h $vr0, $vr0
+; CHECK-NEXT: vst $vr0, $a1, 0
+; CHECK-NEXT: ret
+ %v = load <8 x i16>, ptr %src
+ %neg = xor <8 x i16> %v, <i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1>
+ %res = call <8 x i16> @llvm.ctlz.v8i16(<8 x i16> %neg, i1 false)
+ store <8 x i16> %res, ptr %dst
+ ret void
+}
+
+define void @not_ctlz_v4i32(ptr %src, ptr %dst) nounwind {
+; CHECK-LABEL: not_ctlz_v4i32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vld $vr0, $a0, 0
+; CHECK-NEXT: vrepli.b $vr1, -1
+; CHECK-NEXT: vxor.v $vr0, $vr0, $vr1
+; CHECK-NEXT: vclz.w $vr0, $vr0
+; CHECK-NEXT: vst $vr0, $a1, 0
+; CHECK-NEXT: ret
+ %v = load <4 x i32>, ptr %src
+ %neg = xor <4 x i32> %v, <i32 -1, i32 -1, i32 -1, i32 -1>
+ %res = call <4 x i32> @llvm.ctlz.v4i32(<4 x i32> %neg, i1 false)
+ store <4 x i32> %res, ptr %dst
+ ret void
+}
+
+define void @not_ctlz_v2i64(ptr %src, ptr %dst) nounwind {
+; CHECK-LABEL: not_ctlz_v2i64:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vld $vr0, $a0, 0
+; CHECK-NEXT: vrepli.b $vr1, -1
+; CHECK-NEXT: vxor.v $vr0, $vr0, $vr1
+; CHECK-NEXT: vclz.d $vr0, $vr0
+; CHECK-NEXT: vst $vr0, $a1, 0
+; CHECK-NEXT: ret
+ %v = load <2 x i64>, ptr %src
+ %neg = xor <2 x i64> %v, <i64 -1, i64 -1>
+ %res = call <2 x i64> @llvm.ctlz.v2i64(<2 x i64> %neg, i1 false)
+ store <2 x i64> %res, ptr %dst
+ ret void
+}
+
declare <16 x i8> @llvm.ctpop.v16i8(<16 x i8>)
declare <8 x i16> @llvm.ctpop.v8i16(<8 x i16>)
declare <4 x i32> @llvm.ctpop.v4i32(<4 x i32>)
diff --git a/llvm/test/CodeGen/LoongArch/sink-fold-addi.ll b/llvm/test/CodeGen/LoongArch/sink-fold-addi.ll
new file mode 100644
index 0000000..9a806a1
--- /dev/null
+++ b/llvm/test/CodeGen/LoongArch/sink-fold-addi.ll
@@ -0,0 +1,758 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
+; RUN: llc --mtriple=loongarch32 --mattr=+32s,+lasx --verify-machineinstrs < %s \
+; RUN: | FileCheck --check-prefix=LA32 %s
+; RUN: llc --mtriple=loongarch64 --mattr=+lasx --verify-machineinstrs < %s \
+; RUN: | FileCheck --check-prefix=LA64 %s
+
+%struct.S = type { i64, i64, i8 }
+%struct.F = type { float, double, float }
+%struct.V = type { <4 x i32>, <4 x i32>, <16 x i16> }
+
+define void @sink_fold_i64(i64 %k, i64 %n, ptr %a) nounwind {
+; LA32-LABEL: sink_fold_i64:
+; LA32: # %bb.0: # %entry
+; LA32-NEXT: addi.w $sp, $sp, -48
+; LA32-NEXT: st.w $ra, $sp, 44 # 4-byte Folded Spill
+; LA32-NEXT: st.w $fp, $sp, 40 # 4-byte Folded Spill
+; LA32-NEXT: st.w $s0, $sp, 36 # 4-byte Folded Spill
+; LA32-NEXT: st.w $s1, $sp, 32 # 4-byte Folded Spill
+; LA32-NEXT: st.w $s2, $sp, 28 # 4-byte Folded Spill
+; LA32-NEXT: st.w $s3, $sp, 24 # 4-byte Folded Spill
+; LA32-NEXT: st.w $s4, $sp, 20 # 4-byte Folded Spill
+; LA32-NEXT: st.w $s5, $sp, 16 # 4-byte Folded Spill
+; LA32-NEXT: st.w $s6, $sp, 12 # 4-byte Folded Spill
+; LA32-NEXT: move $s0, $a3
+; LA32-NEXT: move $s1, $a2
+; LA32-NEXT: slli.w $a1, $a0, 4
+; LA32-NEXT: alsl.w $a0, $a0, $a1, 3
+; LA32-NEXT: add.w $a0, $a4, $a0
+; LA32-NEXT: sltui $a1, $a3, 1
+; LA32-NEXT: slti $a2, $a3, 0
+; LA32-NEXT: masknez $a2, $a2, $a1
+; LA32-NEXT: sltui $a3, $s1, 1
+; LA32-NEXT: maskeqz $a1, $a3, $a1
+; LA32-NEXT: or $a1, $a1, $a2
+; LA32-NEXT: addi.w $s2, $a0, 8
+; LA32-NEXT: bnez $a1, .LBB0_3
+; LA32-NEXT: # %bb.1: # %for.body.preheader
+; LA32-NEXT: move $fp, $a4
+; LA32-NEXT: move $s4, $zero
+; LA32-NEXT: move $s5, $zero
+; LA32-NEXT: move $s3, $zero
+; LA32-NEXT: move $s6, $zero
+; LA32-NEXT: .p2align 4, , 16
+; LA32-NEXT: .LBB0_2: # %for.body
+; LA32-NEXT: # =>This Inner Loop Header: Depth=1
+; LA32-NEXT: move $a0, $fp
+; LA32-NEXT: bl f
+; LA32-NEXT: ld.w $a0, $s2, 4
+; LA32-NEXT: ld.w $a1, $s2, 0
+; LA32-NEXT: add.w $a0, $a0, $s6
+; LA32-NEXT: add.w $s3, $a1, $s3
+; LA32-NEXT: sltu $a1, $s3, $a1
+; LA32-NEXT: addi.w $s4, $s4, 1
+; LA32-NEXT: sltui $a2, $s4, 1
+; LA32-NEXT: add.w $s5, $s5, $a2
+; LA32-NEXT: xor $a2, $s4, $s1
+; LA32-NEXT: xor $a3, $s5, $s0
+; LA32-NEXT: or $a2, $a2, $a3
+; LA32-NEXT: add.w $s6, $a0, $a1
+; LA32-NEXT: bnez $a2, .LBB0_2
+; LA32-NEXT: b .LBB0_4
+; LA32-NEXT: .LBB0_3:
+; LA32-NEXT: move $s3, $zero
+; LA32-NEXT: move $s6, $zero
+; LA32-NEXT: .LBB0_4: # %for.cond.cleanup
+; LA32-NEXT: st.w $s3, $s2, 0
+; LA32-NEXT: st.w $s6, $s2, 4
+; LA32-NEXT: ld.w $s6, $sp, 12 # 4-byte Folded Reload
+; LA32-NEXT: ld.w $s5, $sp, 16 # 4-byte Folded Reload
+; LA32-NEXT: ld.w $s4, $sp, 20 # 4-byte Folded Reload
+; LA32-NEXT: ld.w $s3, $sp, 24 # 4-byte Folded Reload
+; LA32-NEXT: ld.w $s2, $sp, 28 # 4-byte Folded Reload
+; LA32-NEXT: ld.w $s1, $sp, 32 # 4-byte Folded Reload
+; LA32-NEXT: ld.w $s0, $sp, 36 # 4-byte Folded Reload
+; LA32-NEXT: ld.w $fp, $sp, 40 # 4-byte Folded Reload
+; LA32-NEXT: ld.w $ra, $sp, 44 # 4-byte Folded Reload
+; LA32-NEXT: addi.w $sp, $sp, 48
+; LA32-NEXT: ret
+;
+; LA64-LABEL: sink_fold_i64:
+; LA64: # %bb.0: # %entry
+; LA64-NEXT: addi.d $sp, $sp, -48
+; LA64-NEXT: st.d $ra, $sp, 40 # 8-byte Folded Spill
+; LA64-NEXT: st.d $fp, $sp, 32 # 8-byte Folded Spill
+; LA64-NEXT: st.d $s0, $sp, 24 # 8-byte Folded Spill
+; LA64-NEXT: st.d $s1, $sp, 16 # 8-byte Folded Spill
+; LA64-NEXT: st.d $s2, $sp, 8 # 8-byte Folded Spill
+; LA64-NEXT: move $s0, $a1
+; LA64-NEXT: slli.d $a1, $a0, 4
+; LA64-NEXT: alsl.d $a0, $a0, $a1, 3
+; LA64-NEXT: add.d $a0, $a2, $a0
+; LA64-NEXT: addi.d $s1, $a0, 8
+; LA64-NEXT: blez $s0, .LBB0_3
+; LA64-NEXT: # %bb.1: # %for.body.preheader
+; LA64-NEXT: move $fp, $a2
+; LA64-NEXT: move $s2, $zero
+; LA64-NEXT: .p2align 4, , 16
+; LA64-NEXT: .LBB0_2: # %for.body
+; LA64-NEXT: # =>This Inner Loop Header: Depth=1
+; LA64-NEXT: move $a0, $fp
+; LA64-NEXT: pcaddu18i $ra, %call36(f)
+; LA64-NEXT: jirl $ra, $ra, 0
+; LA64-NEXT: ld.d $a0, $s1, 0
+; LA64-NEXT: addi.d $s0, $s0, -1
+; LA64-NEXT: add.d $s2, $a0, $s2
+; LA64-NEXT: bnez $s0, .LBB0_2
+; LA64-NEXT: b .LBB0_4
+; LA64-NEXT: .LBB0_3:
+; LA64-NEXT: move $s2, $zero
+; LA64-NEXT: .LBB0_4: # %for.cond.cleanup
+; LA64-NEXT: st.d $s2, $s1, 0
+; LA64-NEXT: ld.d $s2, $sp, 8 # 8-byte Folded Reload
+; LA64-NEXT: ld.d $s1, $sp, 16 # 8-byte Folded Reload
+; LA64-NEXT: ld.d $s0, $sp, 24 # 8-byte Folded Reload
+; LA64-NEXT: ld.d $fp, $sp, 32 # 8-byte Folded Reload
+; LA64-NEXT: ld.d $ra, $sp, 40 # 8-byte Folded Reload
+; LA64-NEXT: addi.d $sp, $sp, 48
+; LA64-NEXT: ret
+entry:
+ %y = getelementptr inbounds %struct.S, ptr %a, i64 %k, i32 1
+ %cmp4 = icmp sgt i64 %n, 0
+ br i1 %cmp4, label %for.body, label %for.cond.cleanup
+
+for.body: ; preds = %entry, %for.body
+ %i.06 = phi i64 [ 0, %entry ], [ %inc, %for.body ]
+ %s.05 = phi i64 [ 0, %entry ], [ %add, %for.body ]
+ call void @f(ptr %a)
+ %0 = load i64, ptr %y
+ %add = add nsw i64 %0, %s.05
+ %inc = add nuw nsw i64 %i.06, 1
+ %exitcond.not = icmp eq i64 %inc, %n
+ br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
+
+for.cond.cleanup: ; preds = %for.body, %entry
+ %s.0.lcssa = phi i64 [ 0, %entry ], [ %add, %for.body ]
+ store i64 %s.0.lcssa, ptr %y
+ ret void
+}
+
+define void @sink_fold_f32(i64 %k, i64 %n, ptr %a) nounwind {
+; LA32-LABEL: sink_fold_f32:
+; LA32: # %bb.0: # %entry
+; LA32-NEXT: addi.w $sp, $sp, -48
+; LA32-NEXT: st.w $ra, $sp, 44 # 4-byte Folded Spill
+; LA32-NEXT: st.w $fp, $sp, 40 # 4-byte Folded Spill
+; LA32-NEXT: st.w $s0, $sp, 36 # 4-byte Folded Spill
+; LA32-NEXT: st.w $s1, $sp, 32 # 4-byte Folded Spill
+; LA32-NEXT: st.w $s2, $sp, 28 # 4-byte Folded Spill
+; LA32-NEXT: st.w $s3, $sp, 24 # 4-byte Folded Spill
+; LA32-NEXT: st.w $s4, $sp, 20 # 4-byte Folded Spill
+; LA32-NEXT: fst.d $fs0, $sp, 8 # 8-byte Folded Spill
+; LA32-NEXT: move $s0, $a3
+; LA32-NEXT: move $s1, $a2
+; LA32-NEXT: slli.w $a1, $a0, 4
+; LA32-NEXT: alsl.w $a0, $a0, $a1, 3
+; LA32-NEXT: add.w $a0, $a4, $a0
+; LA32-NEXT: sltui $a1, $a3, 1
+; LA32-NEXT: slti $a2, $a3, 0
+; LA32-NEXT: masknez $a2, $a2, $a1
+; LA32-NEXT: sltui $a3, $s1, 1
+; LA32-NEXT: maskeqz $a1, $a3, $a1
+; LA32-NEXT: or $a1, $a1, $a2
+; LA32-NEXT: addi.w $s2, $a0, 16
+; LA32-NEXT: bnez $a1, .LBB1_3
+; LA32-NEXT: # %bb.1: # %for.body.preheader
+; LA32-NEXT: move $fp, $a4
+; LA32-NEXT: move $s3, $zero
+; LA32-NEXT: move $s4, $zero
+; LA32-NEXT: movgr2fr.w $fs0, $zero
+; LA32-NEXT: .p2align 4, , 16
+; LA32-NEXT: .LBB1_2: # %for.body
+; LA32-NEXT: # =>This Inner Loop Header: Depth=1
+; LA32-NEXT: move $a0, $fp
+; LA32-NEXT: bl f
+; LA32-NEXT: fld.s $fa0, $s2, 0
+; LA32-NEXT: addi.w $s3, $s3, 1
+; LA32-NEXT: sltui $a0, $s3, 1
+; LA32-NEXT: add.w $s4, $s4, $a0
+; LA32-NEXT: xor $a0, $s3, $s1
+; LA32-NEXT: xor $a1, $s4, $s0
+; LA32-NEXT: or $a0, $a0, $a1
+; LA32-NEXT: fadd.s $fs0, $fa0, $fs0
+; LA32-NEXT: bnez $a0, .LBB1_2
+; LA32-NEXT: b .LBB1_4
+; LA32-NEXT: .LBB1_3:
+; LA32-NEXT: movgr2fr.w $fs0, $zero
+; LA32-NEXT: .LBB1_4: # %for.cond.cleanup
+; LA32-NEXT: fst.s $fs0, $s2, 0
+; LA32-NEXT: fld.d $fs0, $sp, 8 # 8-byte Folded Reload
+; LA32-NEXT: ld.w $s4, $sp, 20 # 4-byte Folded Reload
+; LA32-NEXT: ld.w $s3, $sp, 24 # 4-byte Folded Reload
+; LA32-NEXT: ld.w $s2, $sp, 28 # 4-byte Folded Reload
+; LA32-NEXT: ld.w $s1, $sp, 32 # 4-byte Folded Reload
+; LA32-NEXT: ld.w $s0, $sp, 36 # 4-byte Folded Reload
+; LA32-NEXT: ld.w $fp, $sp, 40 # 4-byte Folded Reload
+; LA32-NEXT: ld.w $ra, $sp, 44 # 4-byte Folded Reload
+; LA32-NEXT: addi.w $sp, $sp, 48
+; LA32-NEXT: ret
+;
+; LA64-LABEL: sink_fold_f32:
+; LA64: # %bb.0: # %entry
+; LA64-NEXT: addi.d $sp, $sp, -48
+; LA64-NEXT: st.d $ra, $sp, 40 # 8-byte Folded Spill
+; LA64-NEXT: st.d $fp, $sp, 32 # 8-byte Folded Spill
+; LA64-NEXT: st.d $s0, $sp, 24 # 8-byte Folded Spill
+; LA64-NEXT: st.d $s1, $sp, 16 # 8-byte Folded Spill
+; LA64-NEXT: fst.d $fs0, $sp, 8 # 8-byte Folded Spill
+; LA64-NEXT: move $s0, $a1
+; LA64-NEXT: slli.d $a1, $a0, 4
+; LA64-NEXT: alsl.d $a0, $a0, $a1, 3
+; LA64-NEXT: add.d $a0, $a2, $a0
+; LA64-NEXT: addi.d $s1, $a0, 16
+; LA64-NEXT: blez $s0, .LBB1_3
+; LA64-NEXT: # %bb.1: # %for.body.preheader
+; LA64-NEXT: move $fp, $a2
+; LA64-NEXT: movgr2fr.w $fs0, $zero
+; LA64-NEXT: .p2align 4, , 16
+; LA64-NEXT: .LBB1_2: # %for.body
+; LA64-NEXT: # =>This Inner Loop Header: Depth=1
+; LA64-NEXT: move $a0, $fp
+; LA64-NEXT: pcaddu18i $ra, %call36(f)
+; LA64-NEXT: jirl $ra, $ra, 0
+; LA64-NEXT: fld.s $fa0, $s1, 0
+; LA64-NEXT: addi.d $s0, $s0, -1
+; LA64-NEXT: fadd.s $fs0, $fa0, $fs0
+; LA64-NEXT: bnez $s0, .LBB1_2
+; LA64-NEXT: b .LBB1_4
+; LA64-NEXT: .LBB1_3:
+; LA64-NEXT: movgr2fr.w $fs0, $zero
+; LA64-NEXT: .LBB1_4: # %for.cond.cleanup
+; LA64-NEXT: fst.s $fs0, $s1, 0
+; LA64-NEXT: fld.d $fs0, $sp, 8 # 8-byte Folded Reload
+; LA64-NEXT: ld.d $s1, $sp, 16 # 8-byte Folded Reload
+; LA64-NEXT: ld.d $s0, $sp, 24 # 8-byte Folded Reload
+; LA64-NEXT: ld.d $fp, $sp, 32 # 8-byte Folded Reload
+; LA64-NEXT: ld.d $ra, $sp, 40 # 8-byte Folded Reload
+; LA64-NEXT: addi.d $sp, $sp, 48
+; LA64-NEXT: ret
+entry:
+ %y = getelementptr inbounds %struct.F, ptr %a, i64 %k, i32 2
+ %cmp4 = icmp sgt i64 %n, 0
+ br i1 %cmp4, label %for.body, label %for.cond.cleanup
+
+for.body: ; preds = %entry, %for.body
+ %i.06 = phi i64 [ 0, %entry ], [ %inc, %for.body ]
+ %s.05 = phi float [ 0.0, %entry ], [ %add, %for.body ]
+ call void @f(ptr %a)
+ %0 = load float, ptr %y
+ %add = fadd float %0, %s.05
+ %inc = add nuw nsw i64 %i.06, 1
+ %exitcond.not = icmp eq i64 %inc, %n
+ br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
+
+for.cond.cleanup: ; preds = %for.body, %entry
+ %s.0.lcssa = phi float [ 0.0, %entry ], [ %add, %for.body ]
+ store float %s.0.lcssa, ptr %y
+ ret void
+}
+
+define void @sink_fold_v4i32(i64 %k, i64 %n, ptr %a) nounwind {
+; LA32-LABEL: sink_fold_v4i32:
+; LA32: # %bb.0: # %entry
+; LA32-NEXT: addi.w $sp, $sp, -48
+; LA32-NEXT: st.w $ra, $sp, 44 # 4-byte Folded Spill
+; LA32-NEXT: st.w $fp, $sp, 40 # 4-byte Folded Spill
+; LA32-NEXT: st.w $s0, $sp, 36 # 4-byte Folded Spill
+; LA32-NEXT: st.w $s1, $sp, 32 # 4-byte Folded Spill
+; LA32-NEXT: st.w $s2, $sp, 28 # 4-byte Folded Spill
+; LA32-NEXT: st.w $s3, $sp, 24 # 4-byte Folded Spill
+; LA32-NEXT: st.w $s4, $sp, 20 # 4-byte Folded Spill
+; LA32-NEXT: move $s0, $a3
+; LA32-NEXT: move $s1, $a2
+; LA32-NEXT: slli.w $a0, $a0, 6
+; LA32-NEXT: add.w $a0, $a4, $a0
+; LA32-NEXT: sltui $a1, $a3, 1
+; LA32-NEXT: slti $a2, $a3, 0
+; LA32-NEXT: masknez $a2, $a2, $a1
+; LA32-NEXT: sltui $a3, $s1, 1
+; LA32-NEXT: maskeqz $a1, $a3, $a1
+; LA32-NEXT: or $a1, $a1, $a2
+; LA32-NEXT: addi.w $s2, $a0, 16
+; LA32-NEXT: bnez $a1, .LBB2_3
+; LA32-NEXT: # %bb.1: # %for.body.preheader
+; LA32-NEXT: move $fp, $a4
+; LA32-NEXT: move $s3, $zero
+; LA32-NEXT: move $s4, $zero
+; LA32-NEXT: vrepli.b $vr0, 0
+; LA32-NEXT: .p2align 4, , 16
+; LA32-NEXT: .LBB2_2: # %for.body
+; LA32-NEXT: # =>This Inner Loop Header: Depth=1
+; LA32-NEXT: vst $vr0, $sp, 0 # 16-byte Folded Spill
+; LA32-NEXT: move $a0, $fp
+; LA32-NEXT: bl f
+; LA32-NEXT: vld $vr0, $s2, 0
+; LA32-NEXT: addi.w $s3, $s3, 1
+; LA32-NEXT: sltui $a0, $s3, 1
+; LA32-NEXT: add.w $s4, $s4, $a0
+; LA32-NEXT: xor $a0, $s3, $s1
+; LA32-NEXT: xor $a1, $s4, $s0
+; LA32-NEXT: or $a0, $a0, $a1
+; LA32-NEXT: vld $vr1, $sp, 0 # 16-byte Folded Reload
+; LA32-NEXT: vadd.w $vr1, $vr0, $vr1
+; LA32-NEXT: vst $vr1, $sp, 0 # 16-byte Folded Spill
+; LA32-NEXT: vld $vr0, $sp, 0 # 16-byte Folded Reload
+; LA32-NEXT: bnez $a0, .LBB2_2
+; LA32-NEXT: b .LBB2_4
+; LA32-NEXT: .LBB2_3:
+; LA32-NEXT: vrepli.b $vr0, 0
+; LA32-NEXT: .LBB2_4: # %for.cond.cleanup
+; LA32-NEXT: vst $vr0, $s2, 0
+; LA32-NEXT: ld.w $s4, $sp, 20 # 4-byte Folded Reload
+; LA32-NEXT: ld.w $s3, $sp, 24 # 4-byte Folded Reload
+; LA32-NEXT: ld.w $s2, $sp, 28 # 4-byte Folded Reload
+; LA32-NEXT: ld.w $s1, $sp, 32 # 4-byte Folded Reload
+; LA32-NEXT: ld.w $s0, $sp, 36 # 4-byte Folded Reload
+; LA32-NEXT: ld.w $fp, $sp, 40 # 4-byte Folded Reload
+; LA32-NEXT: ld.w $ra, $sp, 44 # 4-byte Folded Reload
+; LA32-NEXT: addi.w $sp, $sp, 48
+; LA32-NEXT: ret
+;
+; LA64-LABEL: sink_fold_v4i32:
+; LA64: # %bb.0: # %entry
+; LA64-NEXT: addi.d $sp, $sp, -48
+; LA64-NEXT: st.d $ra, $sp, 40 # 8-byte Folded Spill
+; LA64-NEXT: st.d $fp, $sp, 32 # 8-byte Folded Spill
+; LA64-NEXT: st.d $s0, $sp, 24 # 8-byte Folded Spill
+; LA64-NEXT: st.d $s1, $sp, 16 # 8-byte Folded Spill
+; LA64-NEXT: slli.d $a0, $a0, 6
+; LA64-NEXT: add.d $a0, $a2, $a0
+; LA64-NEXT: addi.d $s1, $a0, 16
+; LA64-NEXT: blez $a1, .LBB2_3
+; LA64-NEXT: # %bb.1: # %for.body.preheader
+; LA64-NEXT: move $fp, $a2
+; LA64-NEXT: move $s0, $a1
+; LA64-NEXT: vrepli.b $vr0, 0
+; LA64-NEXT: .p2align 4, , 16
+; LA64-NEXT: .LBB2_2: # %for.body
+; LA64-NEXT: # =>This Inner Loop Header: Depth=1
+; LA64-NEXT: vst $vr0, $sp, 0 # 16-byte Folded Spill
+; LA64-NEXT: move $a0, $fp
+; LA64-NEXT: pcaddu18i $ra, %call36(f)
+; LA64-NEXT: jirl $ra, $ra, 0
+; LA64-NEXT: vld $vr0, $s1, 0
+; LA64-NEXT: addi.d $s0, $s0, -1
+; LA64-NEXT: vld $vr1, $sp, 0 # 16-byte Folded Reload
+; LA64-NEXT: vadd.w $vr1, $vr0, $vr1
+; LA64-NEXT: vst $vr1, $sp, 0 # 16-byte Folded Spill
+; LA64-NEXT: vld $vr0, $sp, 0 # 16-byte Folded Reload
+; LA64-NEXT: bnez $s0, .LBB2_2
+; LA64-NEXT: b .LBB2_4
+; LA64-NEXT: .LBB2_3:
+; LA64-NEXT: vrepli.b $vr0, 0
+; LA64-NEXT: .LBB2_4: # %for.cond.cleanup
+; LA64-NEXT: vst $vr0, $s1, 0
+; LA64-NEXT: ld.d $s1, $sp, 16 # 8-byte Folded Reload
+; LA64-NEXT: ld.d $s0, $sp, 24 # 8-byte Folded Reload
+; LA64-NEXT: ld.d $fp, $sp, 32 # 8-byte Folded Reload
+; LA64-NEXT: ld.d $ra, $sp, 40 # 8-byte Folded Reload
+; LA64-NEXT: addi.d $sp, $sp, 48
+; LA64-NEXT: ret
+entry:
+ %y = getelementptr inbounds %struct.V, ptr %a, i64 %k, i32 1
+ %cmp = icmp sgt i64 %n, 0
+ br i1 %cmp, label %for.body, label %for.cond.cleanup
+
+for.body: ; preds = %entry, %for.body
+ %i.0 = phi i64 [ 0, %entry ], [ %inc, %for.body ]
+ %sum.0 = phi <4 x i32> [ zeroinitializer, %entry ], [ %addv, %for.body ]
+ call void @f(ptr %a)
+ %v = load <4 x i32>, ptr %y
+ %addv = add <4 x i32> %v, %sum.0
+ %inc = add nuw nsw i64 %i.0, 1
+ %exitcond = icmp eq i64 %inc, %n
+ br i1 %exitcond, label %for.cond.cleanup, label %for.body
+
+for.cond.cleanup: ; preds = %for.body, %entry
+ %sum.lcssa = phi <4 x i32> [ zeroinitializer, %entry ], [ %addv, %for.body ]
+ store <4 x i32> %sum.lcssa, ptr %y
+ ret void
+}
+
+define void @sink_fold_v16i16(i64 %k, i64 %n, ptr %a) nounwind {
+; LA32-LABEL: sink_fold_v16i16:
+; LA32: # %bb.0: # %entry
+; LA32-NEXT: addi.w $sp, $sp, -80
+; LA32-NEXT: st.w $ra, $sp, 76 # 4-byte Folded Spill
+; LA32-NEXT: st.w $fp, $sp, 72 # 4-byte Folded Spill
+; LA32-NEXT: st.w $s0, $sp, 68 # 4-byte Folded Spill
+; LA32-NEXT: st.w $s1, $sp, 64 # 4-byte Folded Spill
+; LA32-NEXT: st.w $s2, $sp, 60 # 4-byte Folded Spill
+; LA32-NEXT: st.w $s3, $sp, 56 # 4-byte Folded Spill
+; LA32-NEXT: st.w $s4, $sp, 52 # 4-byte Folded Spill
+; LA32-NEXT: move $s0, $a3
+; LA32-NEXT: move $s1, $a2
+; LA32-NEXT: slli.w $a0, $a0, 6
+; LA32-NEXT: add.w $a0, $a4, $a0
+; LA32-NEXT: sltui $a1, $a3, 1
+; LA32-NEXT: slti $a2, $a3, 0
+; LA32-NEXT: masknez $a2, $a2, $a1
+; LA32-NEXT: sltui $a3, $s1, 1
+; LA32-NEXT: maskeqz $a1, $a3, $a1
+; LA32-NEXT: or $a1, $a1, $a2
+; LA32-NEXT: addi.w $s2, $a0, 32
+; LA32-NEXT: bnez $a1, .LBB3_3
+; LA32-NEXT: # %bb.1: # %for.body.preheader
+; LA32-NEXT: move $fp, $a4
+; LA32-NEXT: move $s3, $zero
+; LA32-NEXT: move $s4, $zero
+; LA32-NEXT: xvrepli.b $xr0, 0
+; LA32-NEXT: .p2align 4, , 16
+; LA32-NEXT: .LBB3_2: # %for.body
+; LA32-NEXT: # =>This Inner Loop Header: Depth=1
+; LA32-NEXT: xvst $xr0, $sp, 16 # 32-byte Folded Spill
+; LA32-NEXT: move $a0, $fp
+; LA32-NEXT: bl f
+; LA32-NEXT: xvld $xr0, $s2, 0
+; LA32-NEXT: addi.w $s3, $s3, 1
+; LA32-NEXT: sltui $a0, $s3, 1
+; LA32-NEXT: add.w $s4, $s4, $a0
+; LA32-NEXT: xor $a0, $s3, $s1
+; LA32-NEXT: xor $a1, $s4, $s0
+; LA32-NEXT: or $a0, $a0, $a1
+; LA32-NEXT: xvld $xr1, $sp, 16 # 32-byte Folded Reload
+; LA32-NEXT: xvadd.h $xr1, $xr0, $xr1
+; LA32-NEXT: xvst $xr1, $sp, 16 # 32-byte Folded Spill
+; LA32-NEXT: xvld $xr0, $sp, 16 # 32-byte Folded Reload
+; LA32-NEXT: bnez $a0, .LBB3_2
+; LA32-NEXT: b .LBB3_4
+; LA32-NEXT: .LBB3_3:
+; LA32-NEXT: xvrepli.b $xr0, 0
+; LA32-NEXT: .LBB3_4: # %for.cond.cleanup
+; LA32-NEXT: xvst $xr0, $s2, 0
+; LA32-NEXT: ld.w $s4, $sp, 52 # 4-byte Folded Reload
+; LA32-NEXT: ld.w $s3, $sp, 56 # 4-byte Folded Reload
+; LA32-NEXT: ld.w $s2, $sp, 60 # 4-byte Folded Reload
+; LA32-NEXT: ld.w $s1, $sp, 64 # 4-byte Folded Reload
+; LA32-NEXT: ld.w $s0, $sp, 68 # 4-byte Folded Reload
+; LA32-NEXT: ld.w $fp, $sp, 72 # 4-byte Folded Reload
+; LA32-NEXT: ld.w $ra, $sp, 76 # 4-byte Folded Reload
+; LA32-NEXT: addi.w $sp, $sp, 80
+; LA32-NEXT: ret
+;
+; LA64-LABEL: sink_fold_v16i16:
+; LA64: # %bb.0: # %entry
+; LA64-NEXT: addi.d $sp, $sp, -80
+; LA64-NEXT: st.d $ra, $sp, 72 # 8-byte Folded Spill
+; LA64-NEXT: st.d $fp, $sp, 64 # 8-byte Folded Spill
+; LA64-NEXT: st.d $s0, $sp, 56 # 8-byte Folded Spill
+; LA64-NEXT: st.d $s1, $sp, 48 # 8-byte Folded Spill
+; LA64-NEXT: slli.d $a0, $a0, 6
+; LA64-NEXT: add.d $a0, $a2, $a0
+; LA64-NEXT: addi.d $s1, $a0, 32
+; LA64-NEXT: blez $a1, .LBB3_3
+; LA64-NEXT: # %bb.1: # %for.body.preheader
+; LA64-NEXT: move $fp, $a2
+; LA64-NEXT: move $s0, $a1
+; LA64-NEXT: xvrepli.b $xr0, 0
+; LA64-NEXT: .p2align 4, , 16
+; LA64-NEXT: .LBB3_2: # %for.body
+; LA64-NEXT: # =>This Inner Loop Header: Depth=1
+; LA64-NEXT: xvst $xr0, $sp, 16 # 32-byte Folded Spill
+; LA64-NEXT: move $a0, $fp
+; LA64-NEXT: pcaddu18i $ra, %call36(f)
+; LA64-NEXT: jirl $ra, $ra, 0
+; LA64-NEXT: xvld $xr0, $s1, 0
+; LA64-NEXT: addi.d $s0, $s0, -1
+; LA64-NEXT: xvld $xr1, $sp, 16 # 32-byte Folded Reload
+; LA64-NEXT: xvadd.h $xr1, $xr0, $xr1
+; LA64-NEXT: xvst $xr1, $sp, 16 # 32-byte Folded Spill
+; LA64-NEXT: xvld $xr0, $sp, 16 # 32-byte Folded Reload
+; LA64-NEXT: bnez $s0, .LBB3_2
+; LA64-NEXT: b .LBB3_4
+; LA64-NEXT: .LBB3_3:
+; LA64-NEXT: xvrepli.b $xr0, 0
+; LA64-NEXT: .LBB3_4: # %for.cond.cleanup
+; LA64-NEXT: xvst $xr0, $s1, 0
+; LA64-NEXT: ld.d $s1, $sp, 48 # 8-byte Folded Reload
+; LA64-NEXT: ld.d $s0, $sp, 56 # 8-byte Folded Reload
+; LA64-NEXT: ld.d $fp, $sp, 64 # 8-byte Folded Reload
+; LA64-NEXT: ld.d $ra, $sp, 72 # 8-byte Folded Reload
+; LA64-NEXT: addi.d $sp, $sp, 80
+; LA64-NEXT: ret
+entry:
+ %y = getelementptr inbounds %struct.V, ptr %a, i64 %k, i32 2
+ %cmp = icmp sgt i64 %n, 0
+ br i1 %cmp, label %for.body, label %for.cond.cleanup
+
+for.body: ; preds = %entry, %for.body
+ %i.0 = phi i64 [ 0, %entry ], [ %inc, %for.body ]
+ %sum.0 = phi <16 x i16> [ zeroinitializer, %entry ], [ %addv, %for.body ]
+ call void @f(ptr %a)
+ %v = load <16 x i16>, ptr %y
+ %addv = add <16 x i16> %v, %sum.0
+ %inc = add nuw nsw i64 %i.0, 1
+ %exitcond = icmp eq i64 %inc, %n
+ br i1 %exitcond, label %for.cond.cleanup, label %for.body
+
+for.cond.cleanup: ; preds = %for.body, %entry
+ %sum.lcssa = phi <16 x i16> [ zeroinitializer, %entry ], [ %addv, %for.body ]
+ store <16 x i16> %sum.lcssa, ptr %y
+ ret void
+}
+
+define void @sink_fold_extracti8(i64 %k, i64 %n, ptr %a) nounwind {
+; LA32-LABEL: sink_fold_extracti8:
+; LA32: # %bb.0: # %entry
+; LA32-NEXT: addi.w $sp, $sp, -48
+; LA32-NEXT: st.w $ra, $sp, 44 # 4-byte Folded Spill
+; LA32-NEXT: st.w $fp, $sp, 40 # 4-byte Folded Spill
+; LA32-NEXT: st.w $s0, $sp, 36 # 4-byte Folded Spill
+; LA32-NEXT: st.w $s1, $sp, 32 # 4-byte Folded Spill
+; LA32-NEXT: st.w $s2, $sp, 28 # 4-byte Folded Spill
+; LA32-NEXT: st.w $s3, $sp, 24 # 4-byte Folded Spill
+; LA32-NEXT: st.w $s4, $sp, 20 # 4-byte Folded Spill
+; LA32-NEXT: move $s0, $a3
+; LA32-NEXT: move $s1, $a2
+; LA32-NEXT: slli.w $a1, $a0, 4
+; LA32-NEXT: alsl.w $a0, $a0, $a1, 3
+; LA32-NEXT: add.w $a0, $a4, $a0
+; LA32-NEXT: sltui $a1, $a3, 1
+; LA32-NEXT: slti $a2, $a3, 0
+; LA32-NEXT: masknez $a2, $a2, $a1
+; LA32-NEXT: sltui $a3, $s1, 1
+; LA32-NEXT: maskeqz $a1, $a3, $a1
+; LA32-NEXT: or $a1, $a1, $a2
+; LA32-NEXT: addi.w $s2, $a0, 16
+; LA32-NEXT: bnez $a1, .LBB4_3
+; LA32-NEXT: # %bb.1: # %for.body.preheader
+; LA32-NEXT: move $fp, $a4
+; LA32-NEXT: move $s3, $zero
+; LA32-NEXT: move $s4, $zero
+; LA32-NEXT: vrepli.b $vr0, 0
+; LA32-NEXT: .p2align 4, , 16
+; LA32-NEXT: .LBB4_2: # %for.body
+; LA32-NEXT: # =>This Inner Loop Header: Depth=1
+; LA32-NEXT: vst $vr0, $sp, 0 # 16-byte Folded Spill
+; LA32-NEXT: move $a0, $fp
+; LA32-NEXT: bl f
+; LA32-NEXT: vldrepl.b $vr0, $s2, 0
+; LA32-NEXT: addi.w $s3, $s3, 1
+; LA32-NEXT: sltui $a0, $s3, 1
+; LA32-NEXT: add.w $s4, $s4, $a0
+; LA32-NEXT: xor $a0, $s3, $s1
+; LA32-NEXT: xor $a1, $s4, $s0
+; LA32-NEXT: or $a0, $a0, $a1
+; LA32-NEXT: vld $vr1, $sp, 0 # 16-byte Folded Reload
+; LA32-NEXT: vadd.b $vr1, $vr0, $vr1
+; LA32-NEXT: vst $vr1, $sp, 0 # 16-byte Folded Spill
+; LA32-NEXT: vld $vr0, $sp, 0 # 16-byte Folded Reload
+; LA32-NEXT: bnez $a0, .LBB4_2
+; LA32-NEXT: b .LBB4_4
+; LA32-NEXT: .LBB4_3:
+; LA32-NEXT: vrepli.b $vr0, 0
+; LA32-NEXT: .LBB4_4: # %for.cond.cleanup
+; LA32-NEXT: vstelm.b $vr0, $s2, 0, 1
+; LA32-NEXT: ld.w $s4, $sp, 20 # 4-byte Folded Reload
+; LA32-NEXT: ld.w $s3, $sp, 24 # 4-byte Folded Reload
+; LA32-NEXT: ld.w $s2, $sp, 28 # 4-byte Folded Reload
+; LA32-NEXT: ld.w $s1, $sp, 32 # 4-byte Folded Reload
+; LA32-NEXT: ld.w $s0, $sp, 36 # 4-byte Folded Reload
+; LA32-NEXT: ld.w $fp, $sp, 40 # 4-byte Folded Reload
+; LA32-NEXT: ld.w $ra, $sp, 44 # 4-byte Folded Reload
+; LA32-NEXT: addi.w $sp, $sp, 48
+; LA32-NEXT: ret
+;
+; LA64-LABEL: sink_fold_extracti8:
+; LA64: # %bb.0: # %entry
+; LA64-NEXT: addi.d $sp, $sp, -48
+; LA64-NEXT: st.d $ra, $sp, 40 # 8-byte Folded Spill
+; LA64-NEXT: st.d $fp, $sp, 32 # 8-byte Folded Spill
+; LA64-NEXT: st.d $s0, $sp, 24 # 8-byte Folded Spill
+; LA64-NEXT: st.d $s1, $sp, 16 # 8-byte Folded Spill
+; LA64-NEXT: move $s0, $a1
+; LA64-NEXT: slli.d $a1, $a0, 4
+; LA64-NEXT: alsl.d $a0, $a0, $a1, 3
+; LA64-NEXT: add.d $a0, $a2, $a0
+; LA64-NEXT: addi.d $s1, $a0, 16
+; LA64-NEXT: blez $s0, .LBB4_3
+; LA64-NEXT: # %bb.1: # %for.body.preheader
+; LA64-NEXT: move $fp, $a2
+; LA64-NEXT: vrepli.b $vr0, 0
+; LA64-NEXT: .p2align 4, , 16
+; LA64-NEXT: .LBB4_2: # %for.body
+; LA64-NEXT: # =>This Inner Loop Header: Depth=1
+; LA64-NEXT: vst $vr0, $sp, 0 # 16-byte Folded Spill
+; LA64-NEXT: move $a0, $fp
+; LA64-NEXT: pcaddu18i $ra, %call36(f)
+; LA64-NEXT: jirl $ra, $ra, 0
+; LA64-NEXT: vldrepl.b $vr0, $s1, 0
+; LA64-NEXT: addi.d $s0, $s0, -1
+; LA64-NEXT: vld $vr1, $sp, 0 # 16-byte Folded Reload
+; LA64-NEXT: vadd.b $vr1, $vr0, $vr1
+; LA64-NEXT: vst $vr1, $sp, 0 # 16-byte Folded Spill
+; LA64-NEXT: vld $vr0, $sp, 0 # 16-byte Folded Reload
+; LA64-NEXT: bnez $s0, .LBB4_2
+; LA64-NEXT: b .LBB4_4
+; LA64-NEXT: .LBB4_3:
+; LA64-NEXT: vrepli.b $vr0, 0
+; LA64-NEXT: .LBB4_4: # %for.cond.cleanup
+; LA64-NEXT: vstelm.b $vr0, $s1, 0, 1
+; LA64-NEXT: ld.d $s1, $sp, 16 # 8-byte Folded Reload
+; LA64-NEXT: ld.d $s0, $sp, 24 # 8-byte Folded Reload
+; LA64-NEXT: ld.d $fp, $sp, 32 # 8-byte Folded Reload
+; LA64-NEXT: ld.d $ra, $sp, 40 # 8-byte Folded Reload
+; LA64-NEXT: addi.d $sp, $sp, 48
+; LA64-NEXT: ret
+entry:
+ %y = getelementptr inbounds %struct.S, ptr %a, i64 %k, i32 2
+ %cmp = icmp sgt i64 %n, 0
+ br i1 %cmp, label %for.body, label %for.cond.cleanup
+
+for.body: ; preds = %entry, %for.body
+ %i.0 = phi i64 [ 0, %entry ], [ %inc, %for.body ]
+ %sum.0 = phi <16 x i8> [ zeroinitializer, %entry ], [ %addv, %for.body ]
+ call void @f(ptr %a)
+ %e = load i8, ptr %y
+ %ins0 = insertelement <16 x i8> poison, i8 %e, i32 0
+ %v = shufflevector <16 x i8> %ins0, <16 x i8> poison, <16 x i32> zeroinitializer
+ %addv = add <16 x i8> %v, %sum.0
+ %inc = add nuw nsw i64 %i.0, 1
+ %exitcond = icmp eq i64 %inc, %n
+ br i1 %exitcond, label %for.cond.cleanup, label %for.body
+
+for.cond.cleanup: ; preds = %for.body, %entry
+ %sum.lcssa = phi <16 x i8> [ zeroinitializer, %entry ], [ %addv, %for.body ]
+ %res = extractelement <16 x i8> %sum.lcssa, i32 1
+ store i8 %res, ptr %y
+ ret void
+}
+
+define void @sink_fold_extractf64(i64 %k, i64 %n, ptr %a) nounwind {
+; LA32-LABEL: sink_fold_extractf64:
+; LA32: # %bb.0: # %entry
+; LA32-NEXT: addi.w $sp, $sp, -80
+; LA32-NEXT: st.w $ra, $sp, 76 # 4-byte Folded Spill
+; LA32-NEXT: st.w $fp, $sp, 72 # 4-byte Folded Spill
+; LA32-NEXT: st.w $s0, $sp, 68 # 4-byte Folded Spill
+; LA32-NEXT: st.w $s1, $sp, 64 # 4-byte Folded Spill
+; LA32-NEXT: st.w $s2, $sp, 60 # 4-byte Folded Spill
+; LA32-NEXT: st.w $s3, $sp, 56 # 4-byte Folded Spill
+; LA32-NEXT: st.w $s4, $sp, 52 # 4-byte Folded Spill
+; LA32-NEXT: move $s0, $a3
+; LA32-NEXT: move $s1, $a2
+; LA32-NEXT: slli.w $a1, $a0, 4
+; LA32-NEXT: alsl.w $a0, $a0, $a1, 3
+; LA32-NEXT: add.w $a0, $a4, $a0
+; LA32-NEXT: sltui $a1, $a3, 1
+; LA32-NEXT: slti $a2, $a3, 0
+; LA32-NEXT: masknez $a2, $a2, $a1
+; LA32-NEXT: sltui $a3, $s1, 1
+; LA32-NEXT: maskeqz $a1, $a3, $a1
+; LA32-NEXT: or $a1, $a1, $a2
+; LA32-NEXT: addi.w $s2, $a0, 8
+; LA32-NEXT: bnez $a1, .LBB5_3
+; LA32-NEXT: # %bb.1: # %for.body.preheader
+; LA32-NEXT: move $fp, $a4
+; LA32-NEXT: move $s3, $zero
+; LA32-NEXT: move $s4, $zero
+; LA32-NEXT: xvrepli.b $xr0, 0
+; LA32-NEXT: .p2align 4, , 16
+; LA32-NEXT: .LBB5_2: # %for.body
+; LA32-NEXT: # =>This Inner Loop Header: Depth=1
+; LA32-NEXT: xvst $xr0, $sp, 16 # 32-byte Folded Spill
+; LA32-NEXT: move $a0, $fp
+; LA32-NEXT: bl f
+; LA32-NEXT: xvldrepl.d $xr0, $s2, 0
+; LA32-NEXT: addi.w $s3, $s3, 1
+; LA32-NEXT: sltui $a0, $s3, 1
+; LA32-NEXT: add.w $s4, $s4, $a0
+; LA32-NEXT: xor $a0, $s3, $s1
+; LA32-NEXT: xor $a1, $s4, $s0
+; LA32-NEXT: or $a0, $a0, $a1
+; LA32-NEXT: xvld $xr1, $sp, 16 # 32-byte Folded Reload
+; LA32-NEXT: xvfadd.d $xr1, $xr0, $xr1
+; LA32-NEXT: xvst $xr1, $sp, 16 # 32-byte Folded Spill
+; LA32-NEXT: xvld $xr0, $sp, 16 # 32-byte Folded Reload
+; LA32-NEXT: bnez $a0, .LBB5_2
+; LA32-NEXT: b .LBB5_4
+; LA32-NEXT: .LBB5_3:
+; LA32-NEXT: xvrepli.b $xr0, 0
+; LA32-NEXT: .LBB5_4: # %for.cond.cleanup
+; LA32-NEXT: xvstelm.d $xr0, $s2, 0, 1
+; LA32-NEXT: ld.w $s4, $sp, 52 # 4-byte Folded Reload
+; LA32-NEXT: ld.w $s3, $sp, 56 # 4-byte Folded Reload
+; LA32-NEXT: ld.w $s2, $sp, 60 # 4-byte Folded Reload
+; LA32-NEXT: ld.w $s1, $sp, 64 # 4-byte Folded Reload
+; LA32-NEXT: ld.w $s0, $sp, 68 # 4-byte Folded Reload
+; LA32-NEXT: ld.w $fp, $sp, 72 # 4-byte Folded Reload
+; LA32-NEXT: ld.w $ra, $sp, 76 # 4-byte Folded Reload
+; LA32-NEXT: addi.w $sp, $sp, 80
+; LA32-NEXT: ret
+;
+; LA64-LABEL: sink_fold_extractf64:
+; LA64: # %bb.0: # %entry
+; LA64-NEXT: addi.d $sp, $sp, -80
+; LA64-NEXT: st.d $ra, $sp, 72 # 8-byte Folded Spill
+; LA64-NEXT: st.d $fp, $sp, 64 # 8-byte Folded Spill
+; LA64-NEXT: st.d $s0, $sp, 56 # 8-byte Folded Spill
+; LA64-NEXT: st.d $s1, $sp, 48 # 8-byte Folded Spill
+; LA64-NEXT: move $s0, $a1
+; LA64-NEXT: slli.d $a1, $a0, 4
+; LA64-NEXT: alsl.d $a0, $a0, $a1, 3
+; LA64-NEXT: add.d $a0, $a2, $a0
+; LA64-NEXT: addi.d $s1, $a0, 8
+; LA64-NEXT: blez $s0, .LBB5_3
+; LA64-NEXT: # %bb.1: # %for.body.preheader
+; LA64-NEXT: move $fp, $a2
+; LA64-NEXT: xvrepli.b $xr0, 0
+; LA64-NEXT: .p2align 4, , 16
+; LA64-NEXT: .LBB5_2: # %for.body
+; LA64-NEXT: # =>This Inner Loop Header: Depth=1
+; LA64-NEXT: xvst $xr0, $sp, 16 # 32-byte Folded Spill
+; LA64-NEXT: move $a0, $fp
+; LA64-NEXT: pcaddu18i $ra, %call36(f)
+; LA64-NEXT: jirl $ra, $ra, 0
+; LA64-NEXT: xvldrepl.d $xr0, $s1, 0
+; LA64-NEXT: addi.d $s0, $s0, -1
+; LA64-NEXT: xvld $xr1, $sp, 16 # 32-byte Folded Reload
+; LA64-NEXT: xvfadd.d $xr1, $xr0, $xr1
+; LA64-NEXT: xvst $xr1, $sp, 16 # 32-byte Folded Spill
+; LA64-NEXT: xvld $xr0, $sp, 16 # 32-byte Folded Reload
+; LA64-NEXT: bnez $s0, .LBB5_2
+; LA64-NEXT: b .LBB5_4
+; LA64-NEXT: .LBB5_3:
+; LA64-NEXT: xvrepli.b $xr0, 0
+; LA64-NEXT: .LBB5_4: # %for.cond.cleanup
+; LA64-NEXT: xvstelm.d $xr0, $s1, 0, 1
+; LA64-NEXT: ld.d $s1, $sp, 48 # 8-byte Folded Reload
+; LA64-NEXT: ld.d $s0, $sp, 56 # 8-byte Folded Reload
+; LA64-NEXT: ld.d $fp, $sp, 64 # 8-byte Folded Reload
+; LA64-NEXT: ld.d $ra, $sp, 72 # 8-byte Folded Reload
+; LA64-NEXT: addi.d $sp, $sp, 80
+; LA64-NEXT: ret
+entry:
+ %y = getelementptr inbounds %struct.F, ptr %a, i64 %k, i32 1
+ %cmp = icmp sgt i64 %n, 0
+ br i1 %cmp, label %for.body, label %for.cond.cleanup
+
+for.body: ; preds = %entry, %for.body
+ %i.0 = phi i64 [ 0, %entry ], [ %inc, %for.body ]
+ %sum.0 = phi <4 x double> [ zeroinitializer, %entry ], [ %addv, %for.body ]
+ call void @f(ptr %a)
+ %e = load double, ptr %y
+ %ins0 = insertelement <4 x double> poison, double %e, i32 0
+ %v = shufflevector <4 x double> %ins0, <4 x double> poison, <4 x i32> zeroinitializer
+ %addv = fadd <4 x double> %v, %sum.0
+ %inc = add nuw nsw i64 %i.0, 1
+ %exitcond = icmp eq i64 %inc, %n
+ br i1 %exitcond, label %for.cond.cleanup, label %for.body
+
+for.cond.cleanup: ; preds = %for.body, %entry
+ %sum.lcssa = phi <4 x double> [ zeroinitializer, %entry ], [ %addv, %for.body ]
+ %res = extractelement <4 x double> %sum.lcssa, i32 1
+ store double %res, ptr %y
+ ret void
+}
+
+declare void @f(ptr)
diff --git a/llvm/test/CodeGen/RISCV/rv64-stackmap.ll b/llvm/test/CodeGen/RISCV/rv64-stackmap.ll
index 9aefa90..c50a0fb3 100644
--- a/llvm/test/CodeGen/RISCV/rv64-stackmap.ll
+++ b/llvm/test/CodeGen/RISCV/rv64-stackmap.ll
@@ -7,11 +7,11 @@
; CHECK-NEXT: .byte 0
; CHECK-NEXT: .half 0
; Num Functions
-; CHECK-NEXT: .word 12
+; CHECK-NEXT: .word 13
; Num LargeConstants
-; CHECK-NEXT: .word 2
+; CHECK-NEXT: .word 3
; Num Callsites
-; CHECK-NEXT: .word 16
+; CHECK-NEXT: .word 17
; Functions and stack size
; CHECK-NEXT: .quad constantargs
@@ -50,10 +50,14 @@
; CHECK-NEXT: .quad needsStackRealignment
; CHECK-NEXT: .quad -1
; CHECK-NEXT: .quad 1
+; CHECK-NEXT: .quad floats
+; CHECK-NEXT: .quad 32
+; CHECK-NEXT: .quad 1
; Num LargeConstants
; CHECK-NEXT: .quad 4294967295
; CHECK-NEXT: .quad 4294967296
+; CHECK-NEXT: .quad 4609434218613702656
; Constant arguments
;
@@ -379,6 +383,104 @@ define void @needsStackRealignment() {
}
declare void @escape_values(...)
+; CHECK-LABEL: .word .L{{.*}}-floats
+; CHECK-NEXT: .half 0
+; Num Locations
+; CHECK-NEXT: .half 12
+; Loc 0: constant float as constant integer
+; CHECK-NEXT: .byte 4
+; CHECK-NEXT: .byte 0
+; CHECK-NEXT: .half 8
+; CHECK-NEXT: .half 0
+; CHECK-NEXT: .half 0
+; CHECK-NEXT: .word
+; Loc 1: constant double as large constant integer
+; CHECK-NEXT: .byte 5
+; CHECK-NEXT: .byte 0
+; CHECK-NEXT: .half 8
+; CHECK-NEXT: .half 0
+; CHECK-NEXT: .half 0
+; CHECK-NEXT: .word
+; Loc 2: constant half as constant integer
+; CHECK-NEXT: .byte 4
+; CHECK-NEXT: .byte 0
+; CHECK-NEXT: .half 8
+; CHECK-NEXT: .half 0
+; CHECK-NEXT: .half 0
+; CHECK-NEXT: .word
+; Loc 3: constant bfloat as constant integer
+; CHECK-NEXT: .byte 4
+; CHECK-NEXT: .byte 0
+; CHECK-NEXT: .half 8
+; CHECK-NEXT: .half 0
+; CHECK-NEXT: .half 0
+; CHECK-NEXT: .word
+; Loc 4: float value in X register
+; CHECK-NEXT: .byte 1
+; CHECK-NEXT: .byte 0
+; CHECK-NEXT: .half 8
+; CHECK-NEXT: .half 10
+; CHECK-NEXT: .half 0
+; CHECK-NEXT: .word
+; Loc 5: double value in X register
+; CHECK-NEXT: .byte 1
+; CHECK-NEXT: .byte 0
+; CHECK-NEXT: .half 8
+; CHECK-NEXT: .half 11
+; CHECK-NEXT: .half 0
+; CHECK-NEXT: .word
+; Loc 6: half value in X register
+; CHECK-NEXT: .byte 1
+; CHECK-NEXT: .byte 0
+; CHECK-NEXT: .half 8
+; CHECK-NEXT: .half 12
+; CHECK-NEXT: .half 0
+; CHECK-NEXT: .word
+; Loc 7: bfloat value in X register
+; CHECK-NEXT: .byte 1
+; CHECK-NEXT: .byte 0
+; CHECK-NEXT: .half 8
+; CHECK-NEXT: .half 13
+; CHECK-NEXT: .half 0
+; CHECK-NEXT: .word
+; Loc 8: float on stack
+; CHECK-NEXT: .byte 2
+; CHECK-NEXT: .byte 0
+; CHECK-NEXT: .half 8
+; CHECK-NEXT: .half 2
+; CHECK-NEXT: .half 0
+; CHECK-NEXT: .word
+; Loc 9: double on stack
+; CHECK-NEXT: .byte 2
+; CHECK-NEXT: .byte 0
+; CHECK-NEXT: .half 8
+; CHECK-NEXT: .half 2
+; CHECK-NEXT: .half 0
+; CHECK-NEXT: .word
+; Loc 10: half on stack
+; CHECK-NEXT: .byte 2
+; CHECK-NEXT: .byte 0
+; CHECK-NEXT: .half 8
+; CHECK-NEXT: .half 2
+; CHECK-NEXT: .half 0
+; CHECK-NEXT: .word
+; Loc 11: bfloat on stack
+; CHECK-NEXT: .byte 2
+; CHECK-NEXT: .byte 0
+; CHECK-NEXT: .half 8
+; CHECK-NEXT: .half 2
+; CHECK-NEXT: .half 0
+; CHECK-NEXT: .word
+define void @floats(float %f, double %g, half %h, bfloat %i) {
+ %ff = alloca float
+ %gg = alloca double
+ %hh = alloca half
+ %ii = alloca bfloat
+ call void (i64, i32, ...) @llvm.experimental.stackmap(i64 888, i32 0, float 1.25,
+ double 1.5, half 1.5, bfloat 1.5, float %f, double %g, half %h, bfloat %i, ptr %ff, ptr %gg, ptr %hh, ptr %ii)
+ ret void
+}
+
declare void @llvm.experimental.stackmap(i64, i32, ...)
declare void @llvm.experimental.patchpoint.void(i64, i32, ptr, i32, ...)
declare i64 @llvm.experimental.patchpoint.i64(i64, i32, ptr, i32, ...)
diff --git a/llvm/test/CodeGen/X86/bittest-big-integer.ll b/llvm/test/CodeGen/X86/bittest-big-integer.ll
index dffe900..8007d9d 100644
--- a/llvm/test/CodeGen/X86/bittest-big-integer.ll
+++ b/llvm/test/CodeGen/X86/bittest-big-integer.ll
@@ -203,14 +203,24 @@ define i1 @init_eq_i32(ptr %word, i32 %position, i1 zeroext %value) nounwind {
define i1 @test_ne_i64(ptr %word, i32 %position) nounwind {
; X86-LABEL: test_ne_i64:
; X86: # %bb.0:
+; X86-NEXT: pushl %esi
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: movl %ecx, %edx
-; X86-NEXT: andl $32, %edx
-; X86-NEXT: shrl $3, %edx
-; X86-NEXT: movl (%eax,%edx), %eax
-; X86-NEXT: btl %ecx, %eax
-; X86-NEXT: setb %al
+; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: movl $1, %edx
+; X86-NEXT: xorl %esi, %esi
+; X86-NEXT: shldl %cl, %edx, %esi
+; X86-NEXT: shll %cl, %edx
+; X86-NEXT: testb $32, %cl
+; X86-NEXT: je .LBB5_2
+; X86-NEXT: # %bb.1:
+; X86-NEXT: movl %edx, %esi
+; X86-NEXT: xorl %edx, %edx
+; X86-NEXT: .LBB5_2:
+; X86-NEXT: andl 4(%eax), %esi
+; X86-NEXT: andl (%eax), %edx
+; X86-NEXT: orl %esi, %edx
+; X86-NEXT: setne %al
+; X86-NEXT: popl %esi
; X86-NEXT: retl
;
; X64-LABEL: test_ne_i64:
@@ -232,20 +242,38 @@ define i1 @test_ne_i64(ptr %word, i32 %position) nounwind {
define i1 @complement_ne_i64(ptr %word, i32 %position) nounwind {
; X86-LABEL: complement_ne_i64:
; X86: # %bb.0:
+; X86-NEXT: pushl %ebp
+; X86-NEXT: pushl %ebx
; X86-NEXT: pushl %edi
; X86-NEXT: pushl %esi
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT: movl %edx, %esi
-; X86-NEXT: andl $32, %esi
-; X86-NEXT: shrl $3, %esi
-; X86-NEXT: movl (%ecx,%esi), %edi
-; X86-NEXT: btl %edx, %edi
-; X86-NEXT: setb %al
-; X86-NEXT: btcl %edx, %edi
-; X86-NEXT: movl %edi, (%ecx,%esi)
+; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: movl $1, %eax
+; X86-NEXT: xorl %esi, %esi
+; X86-NEXT: shldl %cl, %eax, %esi
+; X86-NEXT: shll %cl, %eax
+; X86-NEXT: testb $32, %cl
+; X86-NEXT: je .LBB6_2
+; X86-NEXT: # %bb.1:
+; X86-NEXT: movl %eax, %esi
+; X86-NEXT: xorl %eax, %eax
+; X86-NEXT: .LBB6_2:
+; X86-NEXT: movl (%edx), %ecx
+; X86-NEXT: movl 4(%edx), %edi
+; X86-NEXT: movl %edi, %ebx
+; X86-NEXT: andl %esi, %ebx
+; X86-NEXT: movl %ecx, %ebp
+; X86-NEXT: andl %eax, %ebp
+; X86-NEXT: xorl %esi, %edi
+; X86-NEXT: xorl %eax, %ecx
+; X86-NEXT: orl %ebx, %ebp
+; X86-NEXT: setne %al
+; X86-NEXT: movl %ecx, (%edx)
+; X86-NEXT: movl %edi, 4(%edx)
; X86-NEXT: popl %esi
; X86-NEXT: popl %edi
+; X86-NEXT: popl %ebx
+; X86-NEXT: popl %ebp
; X86-NEXT: retl
;
; X64-LABEL: complement_ne_i64:
@@ -272,20 +300,40 @@ define i1 @complement_ne_i64(ptr %word, i32 %position) nounwind {
define i1 @reset_eq_i64(ptr %word, i32 %position) nounwind {
; X86-LABEL: reset_eq_i64:
; X86: # %bb.0:
+; X86-NEXT: pushl %ebp
+; X86-NEXT: pushl %ebx
; X86-NEXT: pushl %edi
; X86-NEXT: pushl %esi
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT: movl %edx, %esi
-; X86-NEXT: andl $32, %esi
-; X86-NEXT: shrl $3, %esi
-; X86-NEXT: movl (%ecx,%esi), %edi
-; X86-NEXT: btl %edx, %edi
-; X86-NEXT: setae %al
-; X86-NEXT: btrl %edx, %edi
-; X86-NEXT: movl %edi, (%ecx,%esi)
+; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: movl $1, %esi
+; X86-NEXT: xorl %edi, %edi
+; X86-NEXT: shldl %cl, %esi, %edi
+; X86-NEXT: shll %cl, %esi
+; X86-NEXT: testb $32, %cl
+; X86-NEXT: je .LBB7_2
+; X86-NEXT: # %bb.1:
+; X86-NEXT: movl %esi, %edi
+; X86-NEXT: xorl %esi, %esi
+; X86-NEXT: .LBB7_2:
+; X86-NEXT: movl (%edx), %eax
+; X86-NEXT: movl 4(%edx), %ecx
+; X86-NEXT: movl %ecx, %ebx
+; X86-NEXT: andl %edi, %ebx
+; X86-NEXT: notl %edi
+; X86-NEXT: movl %eax, %ebp
+; X86-NEXT: andl %esi, %ebp
+; X86-NEXT: notl %esi
+; X86-NEXT: andl %ecx, %edi
+; X86-NEXT: andl %eax, %esi
+; X86-NEXT: orl %ebx, %ebp
+; X86-NEXT: sete %al
+; X86-NEXT: movl %esi, (%edx)
+; X86-NEXT: movl %edi, 4(%edx)
; X86-NEXT: popl %esi
; X86-NEXT: popl %edi
+; X86-NEXT: popl %ebx
+; X86-NEXT: popl %ebp
; X86-NEXT: retl
;
; X64-LABEL: reset_eq_i64:
@@ -313,20 +361,38 @@ define i1 @reset_eq_i64(ptr %word, i32 %position) nounwind {
define i1 @set_ne_i64(ptr %word, i32 %position) nounwind {
; X86-LABEL: set_ne_i64:
; X86: # %bb.0:
+; X86-NEXT: pushl %ebp
+; X86-NEXT: pushl %ebx
; X86-NEXT: pushl %edi
; X86-NEXT: pushl %esi
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT: movl %edx, %esi
-; X86-NEXT: andl $32, %esi
-; X86-NEXT: shrl $3, %esi
-; X86-NEXT: movl (%ecx,%esi), %edi
-; X86-NEXT: btl %edx, %edi
-; X86-NEXT: setb %al
-; X86-NEXT: btsl %edx, %edi
-; X86-NEXT: movl %edi, (%ecx,%esi)
+; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: movl $1, %eax
+; X86-NEXT: xorl %esi, %esi
+; X86-NEXT: shldl %cl, %eax, %esi
+; X86-NEXT: shll %cl, %eax
+; X86-NEXT: testb $32, %cl
+; X86-NEXT: je .LBB8_2
+; X86-NEXT: # %bb.1:
+; X86-NEXT: movl %eax, %esi
+; X86-NEXT: xorl %eax, %eax
+; X86-NEXT: .LBB8_2:
+; X86-NEXT: movl (%edx), %ecx
+; X86-NEXT: movl 4(%edx), %edi
+; X86-NEXT: movl %edi, %ebx
+; X86-NEXT: andl %esi, %ebx
+; X86-NEXT: movl %ecx, %ebp
+; X86-NEXT: andl %eax, %ebp
+; X86-NEXT: orl %esi, %edi
+; X86-NEXT: orl %eax, %ecx
+; X86-NEXT: orl %ebx, %ebp
+; X86-NEXT: setne %al
+; X86-NEXT: movl %ecx, (%edx)
+; X86-NEXT: movl %edi, 4(%edx)
; X86-NEXT: popl %esi
; X86-NEXT: popl %edi
+; X86-NEXT: popl %ebx
+; X86-NEXT: popl %ebp
; X86-NEXT: retl
;
; X64-LABEL: set_ne_i64:
@@ -353,26 +419,52 @@ define i1 @set_ne_i64(ptr %word, i32 %position) nounwind {
define i1 @init_eq_i64(ptr %word, i32 %position, i1 zeroext %value) nounwind {
; X86-LABEL: init_eq_i64:
; X86: # %bb.0:
+; X86-NEXT: pushl %ebp
; X86-NEXT: pushl %ebx
; X86-NEXT: pushl %edi
; X86-NEXT: pushl %esi
-; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: movl $1, %eax
+; X86-NEXT: xorl %edx, %edx
+; X86-NEXT: shldl %cl, %eax, %edx
+; X86-NEXT: shll %cl, %eax
+; X86-NEXT: movzbl {{[0-9]+}}(%esp), %esi
+; X86-NEXT: xorl %edi, %edi
+; X86-NEXT: shldl %cl, %esi, %edi
+; X86-NEXT: shll %cl, %esi
+; X86-NEXT: testb $32, %cl
+; X86-NEXT: je .LBB9_2
+; X86-NEXT: # %bb.1:
+; X86-NEXT: movl %eax, %edx
+; X86-NEXT: movl $0, %eax
+; X86-NEXT: .LBB9_2:
+; X86-NEXT: movl %edx, %ebx
+; X86-NEXT: notl %ebx
+; X86-NEXT: movl %eax, %ebp
+; X86-NEXT: notl %ebp
+; X86-NEXT: je .LBB9_4
+; X86-NEXT: # %bb.3:
+; X86-NEXT: movl %esi, %edi
+; X86-NEXT: xorl %esi, %esi
+; X86-NEXT: .LBB9_4:
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: movl %ecx, %esi
-; X86-NEXT: andl $32, %esi
-; X86-NEXT: shrl $3, %esi
-; X86-NEXT: movl (%edx,%esi), %edi
-; X86-NEXT: btl %ecx, %edi
-; X86-NEXT: setae %al
-; X86-NEXT: btrl %ecx, %edi
-; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ebx
-; X86-NEXT: # kill: def $cl killed $cl killed $ecx
-; X86-NEXT: shll %cl, %ebx
+; X86-NEXT: movl 4(%ecx), %ecx
+; X86-NEXT: andl %ecx, %edx
+; X86-NEXT: andl %ecx, %ebx
; X86-NEXT: orl %edi, %ebx
-; X86-NEXT: movl %ebx, (%edx,%esi)
+; X86-NEXT: movl {{[0-9]+}}(%esp), %edi
+; X86-NEXT: movl (%edi), %ecx
+; X86-NEXT: andl %ecx, %eax
+; X86-NEXT: andl %ecx, %ebp
+; X86-NEXT: orl %esi, %ebp
+; X86-NEXT: orl %edx, %eax
+; X86-NEXT: movl %ebp, (%edi)
+; X86-NEXT: movl %ebx, 4(%edi)
+; X86-NEXT: sete %al
; X86-NEXT: popl %esi
; X86-NEXT: popl %edi
; X86-NEXT: popl %ebx
+; X86-NEXT: popl %ebp
; X86-NEXT: retl
;
; SSE-LABEL: init_eq_i64:
@@ -424,25 +516,101 @@ define i1 @init_eq_i64(ptr %word, i32 %position, i1 zeroext %value) nounwind {
define i1 @test_ne_i128(ptr %word, i32 %position) nounwind {
; X86-LABEL: test_ne_i128:
; X86: # %bb.0:
-; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: movl %ecx, %edx
-; X86-NEXT: andl $96, %edx
-; X86-NEXT: shrl $3, %edx
-; X86-NEXT: movl (%eax,%edx), %eax
-; X86-NEXT: btl %ecx, %eax
-; X86-NEXT: setb %al
+; X86-NEXT: pushl %ebp
+; X86-NEXT: movl %esp, %ebp
+; X86-NEXT: pushl %ebx
+; X86-NEXT: pushl %edi
+; X86-NEXT: pushl %esi
+; X86-NEXT: andl $-16, %esp
+; X86-NEXT: subl $48, %esp
+; X86-NEXT: movzbl 12(%ebp), %ecx
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $1, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, (%esp)
+; X86-NEXT: movl %ecx, %eax
+; X86-NEXT: shrb $3, %al
+; X86-NEXT: andb $12, %al
+; X86-NEXT: negb %al
+; X86-NEXT: movsbl %al, %esi
+; X86-NEXT: movl 24(%esp,%esi), %edi
+; X86-NEXT: movl 28(%esp,%esi), %eax
+; X86-NEXT: shldl %cl, %edi, %eax
+; X86-NEXT: movl 16(%esp,%esi), %edx
+; X86-NEXT: movl 20(%esp,%esi), %esi
+; X86-NEXT: shldl %cl, %esi, %edi
+; X86-NEXT: shldl %cl, %edx, %esi
+; X86-NEXT: movl 8(%ebp), %ebx
+; X86-NEXT: shll %cl, %edx
+; X86-NEXT: andl 8(%ebx), %edi
+; X86-NEXT: andl (%ebx), %edx
+; X86-NEXT: orl %edi, %edx
+; X86-NEXT: andl 12(%ebx), %eax
+; X86-NEXT: andl 4(%ebx), %esi
+; X86-NEXT: orl %eax, %esi
+; X86-NEXT: orl %edx, %esi
+; X86-NEXT: setne %al
+; X86-NEXT: leal -12(%ebp), %esp
+; X86-NEXT: popl %esi
+; X86-NEXT: popl %edi
+; X86-NEXT: popl %ebx
+; X86-NEXT: popl %ebp
; X86-NEXT: retl
;
-; X64-LABEL: test_ne_i128:
-; X64: # %bb.0:
-; X64-NEXT: movl %esi, %eax
-; X64-NEXT: andl $96, %eax
-; X64-NEXT: shrl $3, %eax
-; X64-NEXT: movl (%rdi,%rax), %eax
-; X64-NEXT: btl %esi, %eax
-; X64-NEXT: setb %al
-; X64-NEXT: retq
+; SSE-LABEL: test_ne_i128:
+; SSE: # %bb.0:
+; SSE-NEXT: movl %esi, %ecx
+; SSE-NEXT: movl $1, %eax
+; SSE-NEXT: xorl %edx, %edx
+; SSE-NEXT: shldq %cl, %rax, %rdx
+; SSE-NEXT: xorl %esi, %esi
+; SSE-NEXT: shlq %cl, %rax
+; SSE-NEXT: testb $64, %cl
+; SSE-NEXT: cmovneq %rax, %rdx
+; SSE-NEXT: cmovneq %rsi, %rax
+; SSE-NEXT: andq 8(%rdi), %rdx
+; SSE-NEXT: andq (%rdi), %rax
+; SSE-NEXT: orq %rdx, %rax
+; SSE-NEXT: setne %al
+; SSE-NEXT: retq
+;
+; AVX2-LABEL: test_ne_i128:
+; AVX2: # %bb.0:
+; AVX2-NEXT: movl %esi, %ecx
+; AVX2-NEXT: xorl %eax, %eax
+; AVX2-NEXT: movl $1, %edx
+; AVX2-NEXT: xorl %esi, %esi
+; AVX2-NEXT: shldq %cl, %rdx, %rsi
+; AVX2-NEXT: shlxq %rcx, %rdx, %rdx
+; AVX2-NEXT: testb $64, %cl
+; AVX2-NEXT: cmovneq %rdx, %rsi
+; AVX2-NEXT: cmovneq %rax, %rdx
+; AVX2-NEXT: andq 8(%rdi), %rsi
+; AVX2-NEXT: andq (%rdi), %rdx
+; AVX2-NEXT: orq %rsi, %rdx
+; AVX2-NEXT: setne %al
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: test_ne_i128:
+; AVX512: # %bb.0:
+; AVX512-NEXT: movl %esi, %ecx
+; AVX512-NEXT: movl $1, %eax
+; AVX512-NEXT: xorl %edx, %edx
+; AVX512-NEXT: shldq %cl, %rax, %rdx
+; AVX512-NEXT: xorl %esi, %esi
+; AVX512-NEXT: shlxq %rcx, %rax, %rax
+; AVX512-NEXT: testb $64, %cl
+; AVX512-NEXT: cmovneq %rax, %rdx
+; AVX512-NEXT: cmovneq %rsi, %rax
+; AVX512-NEXT: andq 8(%rdi), %rdx
+; AVX512-NEXT: andq (%rdi), %rax
+; AVX512-NEXT: orq %rdx, %rax
+; AVX512-NEXT: setne %al
+; AVX512-NEXT: retq
%rem = and i32 %position, 127
%ofs = zext nneg i32 %rem to i128
%bit = shl nuw i128 1, %ofs
@@ -455,33 +623,124 @@ define i1 @test_ne_i128(ptr %word, i32 %position) nounwind {
define i1 @complement_ne_i128(ptr %word, i32 %position) nounwind {
; X86-LABEL: complement_ne_i128:
; X86: # %bb.0:
+; X86-NEXT: pushl %ebp
+; X86-NEXT: movl %esp, %ebp
+; X86-NEXT: pushl %ebx
; X86-NEXT: pushl %edi
; X86-NEXT: pushl %esi
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT: movl %edx, %esi
-; X86-NEXT: andl $96, %esi
-; X86-NEXT: shrl $3, %esi
-; X86-NEXT: movl (%ecx,%esi), %edi
-; X86-NEXT: btl %edx, %edi
-; X86-NEXT: setb %al
-; X86-NEXT: btcl %edx, %edi
-; X86-NEXT: movl %edi, (%ecx,%esi)
+; X86-NEXT: andl $-16, %esp
+; X86-NEXT: subl $80, %esp
+; X86-NEXT: movzbl 12(%ebp), %ecx
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $1, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl %ecx, %eax
+; X86-NEXT: shrb $3, %al
+; X86-NEXT: andb $12, %al
+; X86-NEXT: negb %al
+; X86-NEXT: movsbl %al, %eax
+; X86-NEXT: movl 56(%esp,%eax), %esi
+; X86-NEXT: movl 60(%esp,%eax), %edx
+; X86-NEXT: shldl %cl, %esi, %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 48(%esp,%eax), %edi
+; X86-NEXT: movl 52(%esp,%eax), %ebx
+; X86-NEXT: shldl %cl, %ebx, %esi
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: shldl %cl, %edi, %ebx
+; X86-NEXT: movl 8(%ebp), %eax
+; X86-NEXT: shll %cl, %edi
+; X86-NEXT: movl %eax, %ecx
+; X86-NEXT: movl 8(%eax), %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: andl %esi, %eax
+; X86-NEXT: movl (%ecx), %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl %ecx, %esi
+; X86-NEXT: movl %edx, %ecx
+; X86-NEXT: andl %edi, %ecx
+; X86-NEXT: orl %eax, %ecx
+; X86-NEXT: movl 12(%esi), %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X86-NEXT: movl 4(%esi), %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: andl %ebx, %eax
+; X86-NEXT: orl %edx, %eax
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT: xorl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT: xorl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X86-NEXT: xorl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
+; X86-NEXT: xorl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X86-NEXT: orl %ecx, %eax
+; X86-NEXT: movl 8(%ebp), %eax
+; X86-NEXT: movl %edx, 8(%eax)
+; X86-NEXT: movl %esi, 12(%eax)
+; X86-NEXT: movl %edi, (%eax)
+; X86-NEXT: movl %ebx, 4(%eax)
+; X86-NEXT: setne %al
+; X86-NEXT: leal -12(%ebp), %esp
; X86-NEXT: popl %esi
; X86-NEXT: popl %edi
+; X86-NEXT: popl %ebx
+; X86-NEXT: popl %ebp
; X86-NEXT: retl
;
-; X64-LABEL: complement_ne_i128:
-; X64: # %bb.0:
-; X64-NEXT: movl %esi, %ecx
-; X64-NEXT: andl $96, %ecx
-; X64-NEXT: shrl $3, %ecx
-; X64-NEXT: movl (%rdi,%rcx), %edx
-; X64-NEXT: btl %esi, %edx
-; X64-NEXT: setb %al
-; X64-NEXT: btcl %esi, %edx
-; X64-NEXT: movl %edx, (%rdi,%rcx)
-; X64-NEXT: retq
+; SSE-LABEL: complement_ne_i128:
+; SSE: # %bb.0:
+; SSE-NEXT: movl %esi, %ecx
+; SSE-NEXT: movl $1, %edx
+; SSE-NEXT: xorl %esi, %esi
+; SSE-NEXT: shldq %cl, %rdx, %rsi
+; SSE-NEXT: shlq %cl, %rdx
+; SSE-NEXT: xorl %eax, %eax
+; SSE-NEXT: testb $64, %cl
+; SSE-NEXT: cmovneq %rdx, %rsi
+; SSE-NEXT: cmovneq %rax, %rdx
+; SSE-NEXT: movq (%rdi), %rax
+; SSE-NEXT: movq 8(%rdi), %rcx
+; SSE-NEXT: movq %rcx, %r8
+; SSE-NEXT: andq %rsi, %r8
+; SSE-NEXT: movq %rax, %r9
+; SSE-NEXT: andq %rdx, %r9
+; SSE-NEXT: xorq %rcx, %rsi
+; SSE-NEXT: xorq %rax, %rdx
+; SSE-NEXT: orq %r8, %r9
+; SSE-NEXT: setne %al
+; SSE-NEXT: movq %rdx, (%rdi)
+; SSE-NEXT: movq %rsi, 8(%rdi)
+; SSE-NEXT: retq
+;
+; AVX-LABEL: complement_ne_i128:
+; AVX: # %bb.0:
+; AVX-NEXT: movl %esi, %ecx
+; AVX-NEXT: xorl %eax, %eax
+; AVX-NEXT: movl $1, %edx
+; AVX-NEXT: xorl %esi, %esi
+; AVX-NEXT: shldq %cl, %rdx, %rsi
+; AVX-NEXT: shlxq %rcx, %rdx, %rdx
+; AVX-NEXT: testb $64, %cl
+; AVX-NEXT: cmovneq %rdx, %rsi
+; AVX-NEXT: cmovneq %rax, %rdx
+; AVX-NEXT: movq (%rdi), %rax
+; AVX-NEXT: movq 8(%rdi), %rcx
+; AVX-NEXT: movq %rcx, %r8
+; AVX-NEXT: andq %rsi, %r8
+; AVX-NEXT: movq %rax, %r9
+; AVX-NEXT: andq %rdx, %r9
+; AVX-NEXT: xorq %rcx, %rsi
+; AVX-NEXT: xorq %rax, %rdx
+; AVX-NEXT: orq %r8, %r9
+; AVX-NEXT: setne %al
+; AVX-NEXT: movq %rdx, (%rdi)
+; AVX-NEXT: movq %rsi, 8(%rdi)
+; AVX-NEXT: retq
%rem = and i32 %position, 127
%ofs = zext nneg i32 %rem to i128
%bit = shl nuw i128 1, %ofs
@@ -496,33 +755,124 @@ define i1 @complement_ne_i128(ptr %word, i32 %position) nounwind {
define i1 @reset_eq_i128(ptr %word, i32 %position) nounwind {
; X86-LABEL: reset_eq_i128:
; X86: # %bb.0:
+; X86-NEXT: pushl %ebp
+; X86-NEXT: movl %esp, %ebp
+; X86-NEXT: pushl %ebx
; X86-NEXT: pushl %edi
; X86-NEXT: pushl %esi
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT: movl %edx, %esi
-; X86-NEXT: andl $96, %esi
-; X86-NEXT: shrl $3, %esi
-; X86-NEXT: movl (%ecx,%esi), %edi
-; X86-NEXT: btl %edx, %edi
-; X86-NEXT: setae %al
-; X86-NEXT: btrl %edx, %edi
-; X86-NEXT: movl %edi, (%ecx,%esi)
+; X86-NEXT: andl $-16, %esp
+; X86-NEXT: subl $80, %esp
+; X86-NEXT: movzbl 12(%ebp), %ecx
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $1, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl %ecx, %eax
+; X86-NEXT: shrb $3, %al
+; X86-NEXT: andb $12, %al
+; X86-NEXT: negb %al
+; X86-NEXT: movsbl %al, %eax
+; X86-NEXT: movl 56(%esp,%eax), %edx
+; X86-NEXT: movl 60(%esp,%eax), %esi
+; X86-NEXT: shldl %cl, %edx, %esi
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 48(%esp,%eax), %esi
+; X86-NEXT: movl 52(%esp,%eax), %edi
+; X86-NEXT: shldl %cl, %edi, %edx
+; X86-NEXT: shldl %cl, %esi, %edi
+; X86-NEXT: movl 8(%ebp), %ebx
+; X86-NEXT: shll %cl, %esi
+; X86-NEXT: movl 8(%ebx), %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: andl %edx, %eax
+; X86-NEXT: movl (%ebx), %ecx
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: andl %esi, %ecx
+; X86-NEXT: orl %eax, %ecx
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 12(%ebx), %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-NEXT: movl %edi, %ecx
+; X86-NEXT: movl 4(%ebx), %ebx
+; X86-NEXT: andl %ebx, %edi
+; X86-NEXT: orl %eax, %edi
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: notl %eax
+; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-NEXT: notl %edx
+; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X86-NEXT: notl %ecx
+; X86-NEXT: andl %ebx, %ecx
+; X86-NEXT: notl %esi
+; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X86-NEXT: movl 8(%ebp), %edi
+; X86-NEXT: movl %edx, 8(%edi)
+; X86-NEXT: movl %eax, 12(%edi)
+; X86-NEXT: movl %esi, (%edi)
+; X86-NEXT: movl %ecx, 4(%edi)
+; X86-NEXT: sete %al
+; X86-NEXT: leal -12(%ebp), %esp
; X86-NEXT: popl %esi
; X86-NEXT: popl %edi
+; X86-NEXT: popl %ebx
+; X86-NEXT: popl %ebp
; X86-NEXT: retl
;
-; X64-LABEL: reset_eq_i128:
-; X64: # %bb.0:
-; X64-NEXT: movl %esi, %ecx
-; X64-NEXT: andl $96, %ecx
-; X64-NEXT: shrl $3, %ecx
-; X64-NEXT: movl (%rdi,%rcx), %edx
-; X64-NEXT: btl %esi, %edx
-; X64-NEXT: setae %al
-; X64-NEXT: btrl %esi, %edx
-; X64-NEXT: movl %edx, (%rdi,%rcx)
-; X64-NEXT: retq
+; SSE-LABEL: reset_eq_i128:
+; SSE: # %bb.0:
+; SSE-NEXT: movl %esi, %ecx
+; SSE-NEXT: movl $1, %edx
+; SSE-NEXT: xorl %esi, %esi
+; SSE-NEXT: shldq %cl, %rdx, %rsi
+; SSE-NEXT: xorl %eax, %eax
+; SSE-NEXT: shlq %cl, %rdx
+; SSE-NEXT: testb $64, %cl
+; SSE-NEXT: cmovneq %rdx, %rsi
+; SSE-NEXT: cmovneq %rax, %rdx
+; SSE-NEXT: movq (%rdi), %rax
+; SSE-NEXT: movq 8(%rdi), %rcx
+; SSE-NEXT: movq %rcx, %r8
+; SSE-NEXT: andq %rsi, %r8
+; SSE-NEXT: notq %rsi
+; SSE-NEXT: movq %rax, %r9
+; SSE-NEXT: andq %rdx, %r9
+; SSE-NEXT: notq %rdx
+; SSE-NEXT: andq %rcx, %rsi
+; SSE-NEXT: andq %rax, %rdx
+; SSE-NEXT: orq %r8, %r9
+; SSE-NEXT: sete %al
+; SSE-NEXT: movq %rdx, (%rdi)
+; SSE-NEXT: movq %rsi, 8(%rdi)
+; SSE-NEXT: retq
+;
+; AVX-LABEL: reset_eq_i128:
+; AVX: # %bb.0:
+; AVX-NEXT: movl %esi, %ecx
+; AVX-NEXT: xorl %eax, %eax
+; AVX-NEXT: movl $1, %edx
+; AVX-NEXT: xorl %esi, %esi
+; AVX-NEXT: shldq %cl, %rdx, %rsi
+; AVX-NEXT: shlxq %rcx, %rdx, %rdx
+; AVX-NEXT: testb $64, %cl
+; AVX-NEXT: cmovneq %rdx, %rsi
+; AVX-NEXT: cmovneq %rax, %rdx
+; AVX-NEXT: movq (%rdi), %rax
+; AVX-NEXT: movq 8(%rdi), %rcx
+; AVX-NEXT: andnq %rcx, %rsi, %r8
+; AVX-NEXT: andq %rsi, %rcx
+; AVX-NEXT: andnq %rax, %rdx, %rsi
+; AVX-NEXT: andq %rdx, %rax
+; AVX-NEXT: orq %rcx, %rax
+; AVX-NEXT: sete %al
+; AVX-NEXT: movq %rsi, (%rdi)
+; AVX-NEXT: movq %r8, 8(%rdi)
+; AVX-NEXT: retq
%rem = and i32 %position, 127
%ofs = zext nneg i32 %rem to i128
%bit = shl nuw i128 1, %ofs
@@ -538,33 +888,124 @@ define i1 @reset_eq_i128(ptr %word, i32 %position) nounwind {
define i1 @set_ne_i128(ptr %word, i32 %position) nounwind {
; X86-LABEL: set_ne_i128:
; X86: # %bb.0:
+; X86-NEXT: pushl %ebp
+; X86-NEXT: movl %esp, %ebp
+; X86-NEXT: pushl %ebx
; X86-NEXT: pushl %edi
; X86-NEXT: pushl %esi
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT: movl %edx, %esi
-; X86-NEXT: andl $96, %esi
-; X86-NEXT: shrl $3, %esi
-; X86-NEXT: movl (%ecx,%esi), %edi
-; X86-NEXT: btl %edx, %edi
-; X86-NEXT: setb %al
-; X86-NEXT: btsl %edx, %edi
-; X86-NEXT: movl %edi, (%ecx,%esi)
+; X86-NEXT: andl $-16, %esp
+; X86-NEXT: subl $80, %esp
+; X86-NEXT: movzbl 12(%ebp), %ecx
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $1, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl %ecx, %eax
+; X86-NEXT: shrb $3, %al
+; X86-NEXT: andb $12, %al
+; X86-NEXT: negb %al
+; X86-NEXT: movsbl %al, %eax
+; X86-NEXT: movl 56(%esp,%eax), %esi
+; X86-NEXT: movl 60(%esp,%eax), %edx
+; X86-NEXT: shldl %cl, %esi, %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 48(%esp,%eax), %edi
+; X86-NEXT: movl 52(%esp,%eax), %ebx
+; X86-NEXT: shldl %cl, %ebx, %esi
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: shldl %cl, %edi, %ebx
+; X86-NEXT: movl 8(%ebp), %eax
+; X86-NEXT: shll %cl, %edi
+; X86-NEXT: movl %eax, %ecx
+; X86-NEXT: movl 8(%eax), %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: andl %esi, %eax
+; X86-NEXT: movl (%ecx), %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl %ecx, %esi
+; X86-NEXT: movl %edx, %ecx
+; X86-NEXT: andl %edi, %ecx
+; X86-NEXT: orl %eax, %ecx
+; X86-NEXT: movl 12(%esi), %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X86-NEXT: movl 4(%esi), %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: andl %ebx, %eax
+; X86-NEXT: orl %edx, %eax
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
+; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X86-NEXT: orl %ecx, %eax
+; X86-NEXT: movl 8(%ebp), %eax
+; X86-NEXT: movl %edx, 8(%eax)
+; X86-NEXT: movl %esi, 12(%eax)
+; X86-NEXT: movl %edi, (%eax)
+; X86-NEXT: movl %ebx, 4(%eax)
+; X86-NEXT: setne %al
+; X86-NEXT: leal -12(%ebp), %esp
; X86-NEXT: popl %esi
; X86-NEXT: popl %edi
+; X86-NEXT: popl %ebx
+; X86-NEXT: popl %ebp
; X86-NEXT: retl
;
-; X64-LABEL: set_ne_i128:
-; X64: # %bb.0:
-; X64-NEXT: movl %esi, %ecx
-; X64-NEXT: andl $96, %ecx
-; X64-NEXT: shrl $3, %ecx
-; X64-NEXT: movl (%rdi,%rcx), %edx
-; X64-NEXT: btl %esi, %edx
-; X64-NEXT: setb %al
-; X64-NEXT: btsl %esi, %edx
-; X64-NEXT: movl %edx, (%rdi,%rcx)
-; X64-NEXT: retq
+; SSE-LABEL: set_ne_i128:
+; SSE: # %bb.0:
+; SSE-NEXT: movl %esi, %ecx
+; SSE-NEXT: movl $1, %edx
+; SSE-NEXT: xorl %esi, %esi
+; SSE-NEXT: shldq %cl, %rdx, %rsi
+; SSE-NEXT: shlq %cl, %rdx
+; SSE-NEXT: xorl %eax, %eax
+; SSE-NEXT: testb $64, %cl
+; SSE-NEXT: cmovneq %rdx, %rsi
+; SSE-NEXT: cmovneq %rax, %rdx
+; SSE-NEXT: movq (%rdi), %rax
+; SSE-NEXT: movq 8(%rdi), %rcx
+; SSE-NEXT: movq %rcx, %r8
+; SSE-NEXT: andq %rsi, %r8
+; SSE-NEXT: movq %rax, %r9
+; SSE-NEXT: andq %rdx, %r9
+; SSE-NEXT: orq %rcx, %rsi
+; SSE-NEXT: orq %rax, %rdx
+; SSE-NEXT: orq %r8, %r9
+; SSE-NEXT: setne %al
+; SSE-NEXT: movq %rdx, (%rdi)
+; SSE-NEXT: movq %rsi, 8(%rdi)
+; SSE-NEXT: retq
+;
+; AVX-LABEL: set_ne_i128:
+; AVX: # %bb.0:
+; AVX-NEXT: movl %esi, %ecx
+; AVX-NEXT: xorl %eax, %eax
+; AVX-NEXT: movl $1, %edx
+; AVX-NEXT: xorl %esi, %esi
+; AVX-NEXT: shldq %cl, %rdx, %rsi
+; AVX-NEXT: shlxq %rcx, %rdx, %rdx
+; AVX-NEXT: testb $64, %cl
+; AVX-NEXT: cmovneq %rdx, %rsi
+; AVX-NEXT: cmovneq %rax, %rdx
+; AVX-NEXT: movq (%rdi), %rax
+; AVX-NEXT: movq 8(%rdi), %rcx
+; AVX-NEXT: movq %rcx, %r8
+; AVX-NEXT: andq %rsi, %r8
+; AVX-NEXT: movq %rax, %r9
+; AVX-NEXT: andq %rdx, %r9
+; AVX-NEXT: orq %rcx, %rsi
+; AVX-NEXT: orq %rax, %rdx
+; AVX-NEXT: orq %r8, %r9
+; AVX-NEXT: setne %al
+; AVX-NEXT: movq %rdx, (%rdi)
+; AVX-NEXT: movq %rsi, 8(%rdi)
+; AVX-NEXT: retq
%rem = and i32 %position, 127
%ofs = zext nneg i32 %rem to i128
%bit = shl nuw i128 1, %ofs
@@ -579,55 +1020,218 @@ define i1 @set_ne_i128(ptr %word, i32 %position) nounwind {
define i1 @init_eq_i128(ptr %word, i32 %position, i1 zeroext %value) nounwind {
; X86-LABEL: init_eq_i128:
; X86: # %bb.0:
+; X86-NEXT: pushl %ebp
+; X86-NEXT: movl %esp, %ebp
; X86-NEXT: pushl %ebx
; X86-NEXT: pushl %edi
; X86-NEXT: pushl %esi
-; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: movl %ecx, %esi
-; X86-NEXT: andl $96, %esi
-; X86-NEXT: shrl $3, %esi
-; X86-NEXT: movl (%edx,%esi), %edi
-; X86-NEXT: btl %ecx, %edi
-; X86-NEXT: setae %al
-; X86-NEXT: btrl %ecx, %edi
-; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ebx
-; X86-NEXT: # kill: def $cl killed $cl killed $ecx
-; X86-NEXT: shll %cl, %ebx
+; X86-NEXT: andl $-16, %esp
+; X86-NEXT: subl $128, %esp
+; X86-NEXT: movzbl 12(%ebp), %ecx
+; X86-NEXT: movzbl 16(%ebp), %eax
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $1, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl %ecx, %edx
+; X86-NEXT: shrb $3, %dl
+; X86-NEXT: andb $12, %dl
+; X86-NEXT: negb %dl
+; X86-NEXT: movsbl %dl, %esi
+; X86-NEXT: movl 64(%esp,%esi), %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 68(%esp,%esi), %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 72(%esp,%esi), %ebx
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movzbl %al, %eax
+; X86-NEXT: movl 76(%esp,%esi), %edi
+; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT: movl %ebx, %eax
+; X86-NEXT: shldl %cl, %edx, %eax
+; X86-NEXT: shldl %cl, %ebx, %edi
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT: shll %cl, %edx
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl 8(%ebp), %esi
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 8(%esi), %ebx
+; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: andl %ebx, %eax
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl (%esi), %ebx
+; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: andl %ebx, %edx
+; X86-NEXT: orl %eax, %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT: shldl %cl, %edx, %eax
+; X86-NEXT: movl %edi, %esi
+; X86-NEXT: movl 8(%ebp), %ecx
+; X86-NEXT: movl 12(%ecx), %ebx
+; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: andl %ebx, %edi
+; X86-NEXT: movl %eax, %ebx
+; X86-NEXT: movl %eax, %edx
+; X86-NEXT: movl 4(%ecx), %ecx
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: andl %ecx, %ebx
; X86-NEXT: orl %edi, %ebx
-; X86-NEXT: movl %ebx, (%edx,%esi)
+; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: notl %ecx
+; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: movl 100(%esp,%ecx), %edi
+; X86-NEXT: movl 104(%esp,%ecx), %ecx
+; X86-NEXT: movl %ecx, %ebx
+; X86-NEXT: movl %ecx, %eax
+; X86-NEXT: movzbl 12(%ebp), %ecx
+; X86-NEXT: shldl %cl, %edi, %ebx
+; X86-NEXT: orl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: notl %esi
+; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT: movl 108(%esp,%ebx), %ebx
+; X86-NEXT: shldl %cl, %eax, %ebx
+; X86-NEXT: orl %ebx, %esi
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: notl %eax
+; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT: movl 96(%esp,%ebx), %ebx
+; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: shll %cl, %ebx
+; X86-NEXT: orl %ebx, %eax
+; X86-NEXT: notl %edx
+; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT: shldl %cl, %ebx, %edi
+; X86-NEXT: orl %edi, %edx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X86-NEXT: movl 8(%ebp), %ecx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT: movl %edi, 8(%ecx)
+; X86-NEXT: movl %esi, 12(%ecx)
+; X86-NEXT: movl %eax, (%ecx)
+; X86-NEXT: movl %edx, 4(%ecx)
+; X86-NEXT: sete %al
+; X86-NEXT: leal -12(%ebp), %esp
; X86-NEXT: popl %esi
; X86-NEXT: popl %edi
; X86-NEXT: popl %ebx
+; X86-NEXT: popl %ebp
; X86-NEXT: retl
;
; SSE-LABEL: init_eq_i128:
; SSE: # %bb.0:
; SSE-NEXT: movl %esi, %ecx
-; SSE-NEXT: andl $96, %esi
-; SSE-NEXT: shrl $3, %esi
-; SSE-NEXT: movl (%rdi,%rsi), %r8d
-; SSE-NEXT: btl %ecx, %r8d
-; SSE-NEXT: setae %al
-; SSE-NEXT: shll %cl, %edx
-; SSE-NEXT: btrl %ecx, %r8d
-; SSE-NEXT: orl %r8d, %edx
-; SSE-NEXT: movl %edx, (%rdi,%rsi)
+; SSE-NEXT: movl $1, %esi
+; SSE-NEXT: xorl %r8d, %r8d
+; SSE-NEXT: shldq %cl, %rsi, %r8
+; SSE-NEXT: shlq %cl, %rsi
+; SSE-NEXT: movl %edx, %eax
+; SSE-NEXT: xorl %edx, %edx
+; SSE-NEXT: shldq %cl, %rax, %rdx
+; SSE-NEXT: shlq %cl, %rax
+; SSE-NEXT: xorl %r9d, %r9d
+; SSE-NEXT: testb $64, %cl
+; SSE-NEXT: cmovneq %rsi, %r8
+; SSE-NEXT: cmovneq %r9, %rsi
+; SSE-NEXT: cmovneq %rax, %rdx
+; SSE-NEXT: cmovneq %r9, %rax
+; SSE-NEXT: movq (%rdi), %rcx
+; SSE-NEXT: movq 8(%rdi), %r9
+; SSE-NEXT: movq %r9, %r10
+; SSE-NEXT: andq %r8, %r10
+; SSE-NEXT: notq %r8
+; SSE-NEXT: movq %rcx, %r11
+; SSE-NEXT: andq %rsi, %r11
+; SSE-NEXT: notq %rsi
+; SSE-NEXT: andq %r9, %r8
+; SSE-NEXT: orq %rdx, %r8
+; SSE-NEXT: andq %rcx, %rsi
+; SSE-NEXT: orq %rax, %rsi
+; SSE-NEXT: orq %r10, %r11
+; SSE-NEXT: sete %al
+; SSE-NEXT: movq %rsi, (%rdi)
+; SSE-NEXT: movq %r8, 8(%rdi)
; SSE-NEXT: retq
;
-; AVX-LABEL: init_eq_i128:
-; AVX: # %bb.0:
-; AVX-NEXT: movl %esi, %ecx
-; AVX-NEXT: andl $96, %ecx
-; AVX-NEXT: shrl $3, %ecx
-; AVX-NEXT: movl (%rdi,%rcx), %r8d
-; AVX-NEXT: btl %esi, %r8d
-; AVX-NEXT: setae %al
-; AVX-NEXT: btrl %esi, %r8d
-; AVX-NEXT: shlxl %esi, %edx, %edx
-; AVX-NEXT: orl %r8d, %edx
-; AVX-NEXT: movl %edx, (%rdi,%rcx)
-; AVX-NEXT: retq
+; AVX2-LABEL: init_eq_i128:
+; AVX2: # %bb.0:
+; AVX2-NEXT: movl %esi, %ecx
+; AVX2-NEXT: movl $1, %esi
+; AVX2-NEXT: xorl %eax, %eax
+; AVX2-NEXT: shldq %cl, %rsi, %rax
+; AVX2-NEXT: xorl %r8d, %r8d
+; AVX2-NEXT: movl %edx, %edx
+; AVX2-NEXT: xorl %r9d, %r9d
+; AVX2-NEXT: shldq %cl, %rdx, %r9
+; AVX2-NEXT: shlxq %rcx, %rsi, %rsi
+; AVX2-NEXT: testb $64, %cl
+; AVX2-NEXT: cmovneq %rsi, %rax
+; AVX2-NEXT: cmovneq %r8, %rsi
+; AVX2-NEXT: shlxq %rcx, %rdx, %rcx
+; AVX2-NEXT: cmovneq %rcx, %r9
+; AVX2-NEXT: cmovneq %r8, %rcx
+; AVX2-NEXT: movq (%rdi), %rdx
+; AVX2-NEXT: movq 8(%rdi), %r8
+; AVX2-NEXT: andnq %r8, %rax, %r10
+; AVX2-NEXT: andq %rax, %r8
+; AVX2-NEXT: andnq %rdx, %rsi, %r11
+; AVX2-NEXT: andq %rsi, %rdx
+; AVX2-NEXT: orq %r9, %r10
+; AVX2-NEXT: orq %rcx, %r11
+; AVX2-NEXT: orq %r8, %rdx
+; AVX2-NEXT: sete %al
+; AVX2-NEXT: movq %r11, (%rdi)
+; AVX2-NEXT: movq %r10, 8(%rdi)
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: init_eq_i128:
+; AVX512: # %bb.0:
+; AVX512-NEXT: movl %esi, %ecx
+; AVX512-NEXT: xorl %eax, %eax
+; AVX512-NEXT: movl $1, %esi
+; AVX512-NEXT: xorl %r8d, %r8d
+; AVX512-NEXT: shldq %cl, %rsi, %r8
+; AVX512-NEXT: shlxq %rcx, %rsi, %rsi
+; AVX512-NEXT: movl %edx, %edx
+; AVX512-NEXT: xorl %r9d, %r9d
+; AVX512-NEXT: shldq %cl, %rdx, %r9
+; AVX512-NEXT: testb $64, %cl
+; AVX512-NEXT: cmovneq %rsi, %r8
+; AVX512-NEXT: cmovneq %rax, %rsi
+; AVX512-NEXT: shlxq %rcx, %rdx, %rcx
+; AVX512-NEXT: cmovneq %rcx, %r9
+; AVX512-NEXT: cmovneq %rax, %rcx
+; AVX512-NEXT: movq (%rdi), %rax
+; AVX512-NEXT: movq 8(%rdi), %rdx
+; AVX512-NEXT: andnq %rdx, %r8, %r10
+; AVX512-NEXT: andq %r8, %rdx
+; AVX512-NEXT: andnq %rax, %rsi, %r8
+; AVX512-NEXT: andq %rsi, %rax
+; AVX512-NEXT: orq %r9, %r10
+; AVX512-NEXT: orq %rcx, %r8
+; AVX512-NEXT: orq %rdx, %rax
+; AVX512-NEXT: sete %al
+; AVX512-NEXT: movq %r8, (%rdi)
+; AVX512-NEXT: movq %r10, 8(%rdi)
+; AVX512-NEXT: retq
%rem = and i32 %position, 127
%ofs = zext nneg i32 %rem to i128
%bit = shl nuw i128 1, %ofs
@@ -648,25 +1252,344 @@ define i1 @init_eq_i128(ptr %word, i32 %position, i1 zeroext %value) nounwind {
define i1 @test_ne_i512(ptr %word, i32 %position) nounwind {
; X86-LABEL: test_ne_i512:
; X86: # %bb.0:
-; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: movl %ecx, %edx
-; X86-NEXT: shrl $3, %edx
-; X86-NEXT: andl $60, %edx
-; X86-NEXT: movl (%eax,%edx), %eax
-; X86-NEXT: btl %ecx, %eax
-; X86-NEXT: setb %al
+; X86-NEXT: pushl %ebp
+; X86-NEXT: movl %esp, %ebp
+; X86-NEXT: pushl %ebx
+; X86-NEXT: pushl %edi
+; X86-NEXT: pushl %esi
+; X86-NEXT: andl $-16, %esp
+; X86-NEXT: subl $224, %esp
+; X86-NEXT: movl 12(%ebp), %ecx
+; X86-NEXT: movl %ecx, %eax
+; X86-NEXT: shrl $3, %eax
+; X86-NEXT: andl $60, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: leal {{[0-9]+}}(%esp), %edx
+; X86-NEXT: subl %eax, %edx
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $1, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl 24(%edx), %esi
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 28(%edx), %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: andl $31, %ecx
+; X86-NEXT: shldl %cl, %esi, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 56(%edx), %esi
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 60(%edx), %eax
+; X86-NEXT: shldl %cl, %esi, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 8(%edx), %esi
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 12(%edx), %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: shldl %cl, %esi, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 40(%edx), %edi
+; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 44(%edx), %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: shldl %cl, %edi, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 16(%edx), %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 20(%edx), %edi
+; X86-NEXT: movl %edi, %ebx
+; X86-NEXT: shldl %cl, %eax, %ebx
+; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 32(%edx), %esi
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 36(%edx), %ebx
+; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: shldl %cl, %esi, %ebx
+; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: shldl %cl, %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: movl 52(%edx), %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: movl 4(%edx), %edi
+; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT: shldl %cl, %edi, %esi
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT: shldl %cl, %edi, %eax
+; X86-NEXT: movl 8(%ebp), %ebx
+; X86-NEXT: andl 40(%ebx), %eax
+; X86-NEXT: andl 8(%ebx), %esi
+; X86-NEXT: orl %eax, %esi
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: andl 56(%ebx), %eax
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT: andl 24(%ebx), %edi
+; X86-NEXT: orl %eax, %edi
+; X86-NEXT: orl %esi, %edi
+; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: movl %ebx, %edi
+; X86-NEXT: andl 44(%ebx), %eax
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT: andl 12(%ebx), %esi
+; X86-NEXT: orl %eax, %esi
+; X86-NEXT: movl %esi, %ebx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT: andl 60(%edi), %esi
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: andl 28(%edi), %eax
+; X86-NEXT: orl %esi, %eax
+; X86-NEXT: orl %ebx, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl (%edx), %esi
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT: shldl %cl, %edx, %edi
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT: shldl %cl, %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT: negl %edx
+; X86-NEXT: movl 192(%esp,%edx), %edx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: shldl %cl, %edx, %eax
+; X86-NEXT: shldl %cl, %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT: shldl %cl, %ebx, %edx
+; X86-NEXT: # kill: def $cl killed $cl killed $ecx
+; X86-NEXT: shll %cl, %esi
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: movl 8(%ebp), %ebx
+; X86-NEXT: andl 32(%ebx), %ecx
+; X86-NEXT: andl (%ebx), %esi
+; X86-NEXT: orl %ecx, %esi
+; X86-NEXT: andl 16(%ebx), %edi
+; X86-NEXT: andl 48(%ebx), %edx
+; X86-NEXT: orl %edi, %edx
+; X86-NEXT: orl %esi, %edx
+; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: andl 36(%ebx), %ecx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT: andl 4(%ebx), %esi
+; X86-NEXT: orl %ecx, %esi
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: andl 20(%ebx), %ecx
+; X86-NEXT: andl 52(%ebx), %eax
+; X86-NEXT: orl %ecx, %eax
+; X86-NEXT: orl %esi, %eax
+; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-NEXT: orl %edx, %eax
+; X86-NEXT: setne %al
+; X86-NEXT: leal -12(%ebp), %esp
+; X86-NEXT: popl %esi
+; X86-NEXT: popl %edi
+; X86-NEXT: popl %ebx
+; X86-NEXT: popl %ebp
; X86-NEXT: retl
;
-; X64-LABEL: test_ne_i512:
-; X64: # %bb.0:
-; X64-NEXT: movl %esi, %eax
-; X64-NEXT: shrl $3, %eax
-; X64-NEXT: andl $60, %eax
-; X64-NEXT: movl (%rdi,%rax), %eax
-; X64-NEXT: btl %esi, %eax
-; X64-NEXT: setb %al
-; X64-NEXT: retq
+; SSE-LABEL: test_ne_i512:
+; SSE: # %bb.0:
+; SSE-NEXT: pushq %r15
+; SSE-NEXT: pushq %r14
+; SSE-NEXT: pushq %rbx
+; SSE-NEXT: xorps %xmm0, %xmm0
+; SSE-NEXT: movups %xmm0, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: movups %xmm0, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: movups %xmm0, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: movq $0, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: movq $1, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: movl %esi, %ecx
+; SSE-NEXT: andl $63, %ecx
+; SSE-NEXT: shrl $3, %esi
+; SSE-NEXT: andl $56, %esi
+; SSE-NEXT: negl %esi
+; SSE-NEXT: movslq %esi, %rbx
+; SSE-NEXT: movq -48(%rsp,%rbx), %rdx
+; SSE-NEXT: movq -40(%rsp,%rbx), %r14
+; SSE-NEXT: movq %r14, %rax
+; SSE-NEXT: shldq %cl, %rdx, %rax
+; SSE-NEXT: movq -16(%rsp,%rbx), %r11
+; SSE-NEXT: movq -8(%rsp,%rbx), %r10
+; SSE-NEXT: shldq %cl, %r11, %r10
+; SSE-NEXT: movq -32(%rsp,%rbx), %r9
+; SSE-NEXT: movq -24(%rsp,%rbx), %r15
+; SSE-NEXT: movq %r15, %r8
+; SSE-NEXT: shldq %cl, %r9, %r8
+; SSE-NEXT: movq -56(%rsp,%rbx), %rsi
+; SSE-NEXT: shldq %cl, %rsi, %rdx
+; SSE-NEXT: shldq %cl, %r15, %r11
+; SSE-NEXT: shldq %cl, %r14, %r9
+; SSE-NEXT: movq -64(%rsp,%rbx), %rbx
+; SSE-NEXT: shldq %cl, %rbx, %rsi
+; SSE-NEXT: # kill: def $cl killed $cl killed $ecx
+; SSE-NEXT: shlq %cl, %rbx
+; SSE-NEXT: andq 32(%rdi), %r9
+; SSE-NEXT: andq 48(%rdi), %r11
+; SSE-NEXT: andq 16(%rdi), %rdx
+; SSE-NEXT: orq %r11, %rdx
+; SSE-NEXT: andq 40(%rdi), %r8
+; SSE-NEXT: andq 56(%rdi), %r10
+; SSE-NEXT: andq 24(%rdi), %rax
+; SSE-NEXT: orq %r10, %rax
+; SSE-NEXT: andq (%rdi), %rbx
+; SSE-NEXT: orq %r9, %rbx
+; SSE-NEXT: orq %rdx, %rbx
+; SSE-NEXT: andq 8(%rdi), %rsi
+; SSE-NEXT: orq %r8, %rsi
+; SSE-NEXT: orq %rax, %rsi
+; SSE-NEXT: orq %rbx, %rsi
+; SSE-NEXT: setne %al
+; SSE-NEXT: popq %rbx
+; SSE-NEXT: popq %r14
+; SSE-NEXT: popq %r15
+; SSE-NEXT: retq
+;
+; AVX2-LABEL: test_ne_i512:
+; AVX2: # %bb.0:
+; AVX2-NEXT: pushq %r15
+; AVX2-NEXT: pushq %r14
+; AVX2-NEXT: pushq %rbx
+; AVX2-NEXT: vxorps %xmm0, %xmm0, %xmm0
+; AVX2-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovss {{.*#+}} xmm0 = [1,0,0,0]
+; AVX2-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; AVX2-NEXT: movl %esi, %ecx
+; AVX2-NEXT: andl $63, %ecx
+; AVX2-NEXT: shrl $3, %esi
+; AVX2-NEXT: andl $56, %esi
+; AVX2-NEXT: negl %esi
+; AVX2-NEXT: movslq %esi, %rsi
+; AVX2-NEXT: movq -48(%rsp,%rsi), %rdx
+; AVX2-NEXT: movq -40(%rsp,%rsi), %rbx
+; AVX2-NEXT: movq %rbx, %rax
+; AVX2-NEXT: shldq %cl, %rdx, %rax
+; AVX2-NEXT: movq -16(%rsp,%rsi), %r11
+; AVX2-NEXT: movq -8(%rsp,%rsi), %r10
+; AVX2-NEXT: shldq %cl, %r11, %r10
+; AVX2-NEXT: movq -32(%rsp,%rsi), %r9
+; AVX2-NEXT: movq -24(%rsp,%rsi), %r14
+; AVX2-NEXT: movq %r14, %r8
+; AVX2-NEXT: shldq %cl, %r9, %r8
+; AVX2-NEXT: movq -64(%rsp,%rsi), %r15
+; AVX2-NEXT: movq -56(%rsp,%rsi), %rsi
+; AVX2-NEXT: shldq %cl, %rsi, %rdx
+; AVX2-NEXT: shldq %cl, %r14, %r11
+; AVX2-NEXT: shldq %cl, %rbx, %r9
+; AVX2-NEXT: shldq %cl, %r15, %rsi
+; AVX2-NEXT: shlxq %rcx, %r15, %rcx
+; AVX2-NEXT: andq 32(%rdi), %r9
+; AVX2-NEXT: andq 48(%rdi), %r11
+; AVX2-NEXT: andq 16(%rdi), %rdx
+; AVX2-NEXT: andq 40(%rdi), %r8
+; AVX2-NEXT: andq 56(%rdi), %r10
+; AVX2-NEXT: andq 24(%rdi), %rax
+; AVX2-NEXT: orq %r11, %rdx
+; AVX2-NEXT: orq %r10, %rax
+; AVX2-NEXT: andq (%rdi), %rcx
+; AVX2-NEXT: orq %r9, %rcx
+; AVX2-NEXT: orq %rdx, %rcx
+; AVX2-NEXT: andq 8(%rdi), %rsi
+; AVX2-NEXT: orq %r8, %rsi
+; AVX2-NEXT: orq %rax, %rsi
+; AVX2-NEXT: orq %rcx, %rsi
+; AVX2-NEXT: setne %al
+; AVX2-NEXT: popq %rbx
+; AVX2-NEXT: popq %r14
+; AVX2-NEXT: popq %r15
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: test_ne_i512:
+; AVX512: # %bb.0:
+; AVX512-NEXT: pushq %r15
+; AVX512-NEXT: pushq %r14
+; AVX512-NEXT: pushq %rbx
+; AVX512-NEXT: vxorps %xmm0, %xmm0, %xmm0
+; AVX512-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; AVX512-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; AVX512-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; AVX512-NEXT: vmovaps {{.*#+}} xmm0 = [1,0,0,0]
+; AVX512-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; AVX512-NEXT: movl %esi, %ecx
+; AVX512-NEXT: andl $63, %ecx
+; AVX512-NEXT: shrl $3, %esi
+; AVX512-NEXT: andl $56, %esi
+; AVX512-NEXT: negl %esi
+; AVX512-NEXT: movslq %esi, %rbx
+; AVX512-NEXT: movq -48(%rsp,%rbx), %rdx
+; AVX512-NEXT: movq -40(%rsp,%rbx), %r14
+; AVX512-NEXT: movq %r14, %rax
+; AVX512-NEXT: shldq %cl, %rdx, %rax
+; AVX512-NEXT: movq -16(%rsp,%rbx), %r11
+; AVX512-NEXT: movq -8(%rsp,%rbx), %r10
+; AVX512-NEXT: shldq %cl, %r11, %r10
+; AVX512-NEXT: movq -32(%rsp,%rbx), %r9
+; AVX512-NEXT: movq -24(%rsp,%rbx), %r15
+; AVX512-NEXT: movq %r15, %r8
+; AVX512-NEXT: shldq %cl, %r9, %r8
+; AVX512-NEXT: movq -56(%rsp,%rbx), %rsi
+; AVX512-NEXT: shldq %cl, %rsi, %rdx
+; AVX512-NEXT: shldq %cl, %r15, %r11
+; AVX512-NEXT: shldq %cl, %r14, %r9
+; AVX512-NEXT: movq -64(%rsp,%rbx), %rbx
+; AVX512-NEXT: shldq %cl, %rbx, %rsi
+; AVX512-NEXT: shlxq %rcx, %rbx, %rcx
+; AVX512-NEXT: andq 32(%rdi), %r9
+; AVX512-NEXT: andq 48(%rdi), %r11
+; AVX512-NEXT: andq 16(%rdi), %rdx
+; AVX512-NEXT: andq 40(%rdi), %r8
+; AVX512-NEXT: andq 56(%rdi), %r10
+; AVX512-NEXT: andq 24(%rdi), %rax
+; AVX512-NEXT: orq %r11, %rdx
+; AVX512-NEXT: orq %r10, %rax
+; AVX512-NEXT: andq (%rdi), %rcx
+; AVX512-NEXT: orq %r9, %rcx
+; AVX512-NEXT: orq %rdx, %rcx
+; AVX512-NEXT: andq 8(%rdi), %rsi
+; AVX512-NEXT: orq %r8, %rsi
+; AVX512-NEXT: orq %rax, %rsi
+; AVX512-NEXT: orq %rcx, %rsi
+; AVX512-NEXT: setne %al
+; AVX512-NEXT: popq %rbx
+; AVX512-NEXT: popq %r14
+; AVX512-NEXT: popq %r15
+; AVX512-NEXT: vzeroupper
+; AVX512-NEXT: retq
%rem = and i32 %position, 511
%ofs = zext nneg i32 %rem to i512
%bit = shl nuw i512 1, %ofs
@@ -679,33 +1602,572 @@ define i1 @test_ne_i512(ptr %word, i32 %position) nounwind {
define i1 @complement_ne_i512(ptr %word, i32 %position) nounwind {
; X86-LABEL: complement_ne_i512:
; X86: # %bb.0:
+; X86-NEXT: pushl %ebp
+; X86-NEXT: movl %esp, %ebp
+; X86-NEXT: pushl %ebx
; X86-NEXT: pushl %edi
; X86-NEXT: pushl %esi
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT: movl %edx, %esi
-; X86-NEXT: shrl $3, %esi
-; X86-NEXT: andl $60, %esi
-; X86-NEXT: movl (%ecx,%esi), %edi
-; X86-NEXT: btl %edx, %edi
-; X86-NEXT: setb %al
-; X86-NEXT: btcl %edx, %edi
-; X86-NEXT: movl %edi, (%ecx,%esi)
+; X86-NEXT: andl $-16, %esp
+; X86-NEXT: subl $272, %esp # imm = 0x110
+; X86-NEXT: movl 12(%ebp), %ecx
+; X86-NEXT: movl %ecx, %eax
+; X86-NEXT: shrl $3, %eax
+; X86-NEXT: andl $60, %eax
+; X86-NEXT: movl %eax, (%esp) # 4-byte Spill
+; X86-NEXT: leal {{[0-9]+}}(%esp), %edx
+; X86-NEXT: subl %eax, %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $1, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl 24(%edx), %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 28(%edx), %esi
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: andl $31, %ecx
+; X86-NEXT: shldl %cl, %eax, %esi
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 56(%edx), %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 60(%edx), %esi
+; X86-NEXT: shldl %cl, %eax, %esi
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 8(%edx), %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 12(%edx), %esi
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: shldl %cl, %eax, %esi
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 40(%edx), %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 44(%edx), %esi
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: shldl %cl, %eax, %esi
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 16(%edx), %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 20(%edx), %ebx
+; X86-NEXT: movl %ebx, %esi
+; X86-NEXT: shldl %cl, %eax, %esi
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 32(%edx), %edi
+; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 36(%edx), %esi
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: shldl %cl, %edi, %esi
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: shldl %cl, %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: movl 52(%edx), %esi
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT: shldl %cl, %esi, %edi
+; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 4(%edx), %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT: shldl %cl, %eax, %ebx
+; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: shldl %cl, %eax, %esi
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 8(%ebp), %edx
+; X86-NEXT: movl 40(%edx), %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: andl %esi, %eax
+; X86-NEXT: movl 8(%edx), %esi
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: andl %ebx, %esi
+; X86-NEXT: orl %eax, %esi
+; X86-NEXT: movl %edx, %eax
+; X86-NEXT: movl 56(%edx), %ebx
+; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: andl %edi, %ebx
+; X86-NEXT: movl 24(%edx), %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X86-NEXT: orl %ebx, %edx
+; X86-NEXT: orl %esi, %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 44(%eax), %esi
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X86-NEXT: movl 12(%eax), %ebx
+; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
+; X86-NEXT: orl %esi, %ebx
+; X86-NEXT: movl 60(%eax), %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl %eax, %esi
+; X86-NEXT: movl %edx, %eax
+; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-NEXT: movl 28(%esi), %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X86-NEXT: orl %eax, %edx
+; X86-NEXT: orl %ebx, %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: movl (%eax), %edx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: shldl %cl, %eax, %ebx
+; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: shldl %cl, %eax, %edi
+; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl (%esp), %eax # 4-byte Reload
+; X86-NEXT: negl %eax
+; X86-NEXT: movl 240(%esp,%eax), %esi
+; X86-NEXT: shldl %cl, %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: shldl %cl, %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: shldl %cl, %eax, %esi
+; X86-NEXT: movl %esi, (%esp) # 4-byte Spill
+; X86-NEXT: # kill: def $cl killed $cl killed $ecx
+; X86-NEXT: shll %cl, %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 8(%ebp), %esi
+; X86-NEXT: movl 32(%esi), %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: andl %edi, %eax
+; X86-NEXT: movl (%esi), %ecx
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: andl %edx, %ecx
+; X86-NEXT: orl %eax, %ecx
+; X86-NEXT: movl 16(%esi), %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: andl %ebx, %eax
+; X86-NEXT: movl 48(%esi), %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: andl (%esp), %edx # 4-byte Folded Reload
+; X86-NEXT: orl %eax, %edx
+; X86-NEXT: orl %ecx, %edx
+; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X86-NEXT: movl 36(%esi), %ebx
+; X86-NEXT: movl %ebx, %eax
+; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-NEXT: movl 4(%esi), %ecx
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X86-NEXT: orl %eax, %ecx
+; X86-NEXT: movl %esi, %eax
+; X86-NEXT: movl 20(%esi), %edi
+; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT: andl %esi, %edi
+; X86-NEXT: movl 52(%eax), %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-NEXT: orl %edi, %eax
+; X86-NEXT: orl %ecx, %eax
+; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: xorl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: xorl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: xorl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: xorl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: xorl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: xorl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: xorl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: xorl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: xorl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT: xorl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT: xorl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT: xorl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: xorl %ecx, (%esp) # 4-byte Folded Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: xorl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: xorl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: xorl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X86-NEXT: orl %edx, %eax
+; X86-NEXT: movl 8(%ebp), %edx
+; X86-NEXT: movl %ebx, 60(%edx)
+; X86-NEXT: movl %edi, 56(%edx)
+; X86-NEXT: movl %ecx, 52(%edx)
+; X86-NEXT: movl %esi, 44(%edx)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: movl %eax, 40(%edx)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: movl %eax, 36(%edx)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: movl %eax, 32(%edx)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: movl %eax, 28(%edx)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: movl %eax, 24(%edx)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: movl %eax, 20(%edx)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: movl %eax, 16(%edx)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: movl %eax, 12(%edx)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: movl %eax, 8(%edx)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: movl %eax, 4(%edx)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: movl %eax, (%edx)
+; X86-NEXT: movl (%esp), %eax # 4-byte Reload
+; X86-NEXT: movl %eax, 48(%edx)
+; X86-NEXT: setne %al
+; X86-NEXT: leal -12(%ebp), %esp
; X86-NEXT: popl %esi
; X86-NEXT: popl %edi
+; X86-NEXT: popl %ebx
+; X86-NEXT: popl %ebp
; X86-NEXT: retl
;
-; X64-LABEL: complement_ne_i512:
-; X64: # %bb.0:
-; X64-NEXT: movl %esi, %ecx
-; X64-NEXT: shrl $3, %ecx
-; X64-NEXT: andl $60, %ecx
-; X64-NEXT: movl (%rdi,%rcx), %edx
-; X64-NEXT: btl %esi, %edx
-; X64-NEXT: setb %al
-; X64-NEXT: btcl %esi, %edx
-; X64-NEXT: movl %edx, (%rdi,%rcx)
-; X64-NEXT: retq
+; SSE-LABEL: complement_ne_i512:
+; SSE: # %bb.0:
+; SSE-NEXT: pushq %rbp
+; SSE-NEXT: pushq %r15
+; SSE-NEXT: pushq %r14
+; SSE-NEXT: pushq %r13
+; SSE-NEXT: pushq %r12
+; SSE-NEXT: pushq %rbx
+; SSE-NEXT: subq $56, %rsp
+; SSE-NEXT: xorps %xmm0, %xmm0
+; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp)
+; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp)
+; SSE-NEXT: movups %xmm0, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: movq $0, {{[0-9]+}}(%rsp)
+; SSE-NEXT: movq $1, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: movl %esi, %ecx
+; SSE-NEXT: andl $63, %ecx
+; SSE-NEXT: shrl $3, %esi
+; SSE-NEXT: andl $56, %esi
+; SSE-NEXT: negl %esi
+; SSE-NEXT: movslq %esi, %rbx
+; SSE-NEXT: movq (%rsp,%rbx), %rsi
+; SSE-NEXT: movq 8(%rsp,%rbx), %r14
+; SSE-NEXT: movq %r14, %rax
+; SSE-NEXT: shldq %cl, %rsi, %rax
+; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE-NEXT: movq 32(%rsp,%rbx), %r8
+; SSE-NEXT: movq 40(%rsp,%rbx), %rbp
+; SSE-NEXT: shldq %cl, %r8, %rbp
+; SSE-NEXT: movq 16(%rsp,%rbx), %r9
+; SSE-NEXT: movq 24(%rsp,%rbx), %r15
+; SSE-NEXT: movq %r15, %r10
+; SSE-NEXT: shldq %cl, %r9, %r10
+; SSE-NEXT: movq -8(%rsp,%rbx), %r11
+; SSE-NEXT: shldq %cl, %r11, %rsi
+; SSE-NEXT: shldq %cl, %r15, %r8
+; SSE-NEXT: shldq %cl, %r14, %r9
+; SSE-NEXT: movq -16(%rsp,%rbx), %rbx
+; SSE-NEXT: shldq %cl, %rbx, %r11
+; SSE-NEXT: # kill: def $cl killed $cl killed $ecx
+; SSE-NEXT: shlq %cl, %rbx
+; SSE-NEXT: movq 24(%rdi), %r15
+; SSE-NEXT: movq %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE-NEXT: movq 56(%rdi), %rcx
+; SSE-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE-NEXT: movq 16(%rdi), %r12
+; SSE-NEXT: movq %r12, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE-NEXT: movq 48(%rdi), %r13
+; SSE-NEXT: movq %r13, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE-NEXT: andq %r8, %r13
+; SSE-NEXT: andq %rsi, %r12
+; SSE-NEXT: orq %r13, %r12
+; SSE-NEXT: movq %rcx, %r13
+; SSE-NEXT: andq %rbp, %r13
+; SSE-NEXT: andq %rax, %r15
+; SSE-NEXT: orq %r13, %r15
+; SSE-NEXT: movq 32(%rdi), %r14
+; SSE-NEXT: movq %r14, %rcx
+; SSE-NEXT: andq %r9, %rcx
+; SSE-NEXT: movq (%rdi), %r13
+; SSE-NEXT: movq %r13, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE-NEXT: andq %rbx, %r13
+; SSE-NEXT: orq %rcx, %r13
+; SSE-NEXT: orq %r12, %r13
+; SSE-NEXT: movq 40(%rdi), %rcx
+; SSE-NEXT: movq %rcx, %r12
+; SSE-NEXT: andq %r10, %r12
+; SSE-NEXT: movq 8(%rdi), %rdx
+; SSE-NEXT: movq %rdx, %rax
+; SSE-NEXT: andq %r11, %rax
+; SSE-NEXT: orq %r12, %rax
+; SSE-NEXT: orq %r15, %rax
+; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload
+; SSE-NEXT: xorq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Folded Reload
+; SSE-NEXT: xorq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Folded Reload
+; SSE-NEXT: xorq %rcx, %r10
+; SSE-NEXT: xorq %r14, %r9
+; SSE-NEXT: xorq {{[-0-9]+}}(%r{{[sb]}}p), %rbp # 8-byte Folded Reload
+; SSE-NEXT: xorq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Folded Reload
+; SSE-NEXT: xorq %rdx, %r11
+; SSE-NEXT: xorq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Folded Reload
+; SSE-NEXT: orq %r13, %rax
+; SSE-NEXT: movq %r8, 48(%rdi)
+; SSE-NEXT: movq %rbp, 56(%rdi)
+; SSE-NEXT: movq %r9, 32(%rdi)
+; SSE-NEXT: movq %r10, 40(%rdi)
+; SSE-NEXT: movq %rsi, 16(%rdi)
+; SSE-NEXT: movq %r15, 24(%rdi)
+; SSE-NEXT: movq %rbx, (%rdi)
+; SSE-NEXT: movq %r11, 8(%rdi)
+; SSE-NEXT: setne %al
+; SSE-NEXT: addq $56, %rsp
+; SSE-NEXT: popq %rbx
+; SSE-NEXT: popq %r12
+; SSE-NEXT: popq %r13
+; SSE-NEXT: popq %r14
+; SSE-NEXT: popq %r15
+; SSE-NEXT: popq %rbp
+; SSE-NEXT: retq
+;
+; AVX2-LABEL: complement_ne_i512:
+; AVX2: # %bb.0:
+; AVX2-NEXT: pushq %rbp
+; AVX2-NEXT: pushq %r15
+; AVX2-NEXT: pushq %r14
+; AVX2-NEXT: pushq %r13
+; AVX2-NEXT: pushq %r12
+; AVX2-NEXT: pushq %rbx
+; AVX2-NEXT: subq $72, %rsp
+; AVX2-NEXT: vxorps %xmm0, %xmm0, %xmm0
+; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovss {{.*#+}} xmm0 = [1,0,0,0]
+; AVX2-NEXT: vmovups %ymm0, (%rsp)
+; AVX2-NEXT: movl %esi, %ecx
+; AVX2-NEXT: andl $63, %ecx
+; AVX2-NEXT: shrl $3, %esi
+; AVX2-NEXT: andl $56, %esi
+; AVX2-NEXT: negl %esi
+; AVX2-NEXT: movslq %esi, %rbx
+; AVX2-NEXT: movq 16(%rsp,%rbx), %rsi
+; AVX2-NEXT: movq 24(%rsp,%rbx), %rbp
+; AVX2-NEXT: movq %rbp, %rax
+; AVX2-NEXT: shldq %cl, %rsi, %rax
+; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX2-NEXT: movq 48(%rsp,%rbx), %r8
+; AVX2-NEXT: movq 56(%rsp,%rbx), %r13
+; AVX2-NEXT: shldq %cl, %r8, %r13
+; AVX2-NEXT: movq 32(%rsp,%rbx), %r9
+; AVX2-NEXT: movq 40(%rsp,%rbx), %r14
+; AVX2-NEXT: movq %r14, %r10
+; AVX2-NEXT: shldq %cl, %r9, %r10
+; AVX2-NEXT: movq 8(%rsp,%rbx), %r11
+; AVX2-NEXT: shldq %cl, %r11, %rsi
+; AVX2-NEXT: shldq %cl, %r14, %r8
+; AVX2-NEXT: movq 16(%rdi), %r12
+; AVX2-NEXT: movq %r12, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX2-NEXT: movq 48(%rdi), %r14
+; AVX2-NEXT: movq %r14, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX2-NEXT: andq %r8, %r14
+; AVX2-NEXT: andq %rsi, %r12
+; AVX2-NEXT: orq %r14, %r12
+; AVX2-NEXT: movq 56(%rdi), %r15
+; AVX2-NEXT: movq %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX2-NEXT: andq %r13, %r15
+; AVX2-NEXT: movq 24(%rdi), %r14
+; AVX2-NEXT: movq %r14, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX2-NEXT: andq %rax, %r14
+; AVX2-NEXT: orq %r15, %r14
+; AVX2-NEXT: shldq %cl, %rbp, %r9
+; AVX2-NEXT: movq (%rsp,%rbx), %rdx
+; AVX2-NEXT: movq 32(%rdi), %r15
+; AVX2-NEXT: movq %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX2-NEXT: andq %r9, %r15
+; AVX2-NEXT: shlxq %rcx, %rdx, %rax
+; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX2-NEXT: movq (%rdi), %rbx
+; AVX2-NEXT: movq %rbx, %rbp
+; AVX2-NEXT: andq %rax, %rbp
+; AVX2-NEXT: orq %r15, %rbp
+; AVX2-NEXT: orq %r12, %rbp
+; AVX2-NEXT: # kill: def $cl killed $cl killed $rcx
+; AVX2-NEXT: shldq %cl, %rdx, %r11
+; AVX2-NEXT: movq 40(%rdi), %rax
+; AVX2-NEXT: movq %rax, %rcx
+; AVX2-NEXT: andq %r10, %rcx
+; AVX2-NEXT: movq 8(%rdi), %r15
+; AVX2-NEXT: movq %r15, %r12
+; AVX2-NEXT: andq %r11, %r12
+; AVX2-NEXT: orq %rcx, %r12
+; AVX2-NEXT: orq %r14, %r12
+; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
+; AVX2-NEXT: xorq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Folded Reload
+; AVX2-NEXT: xorq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Folded Reload
+; AVX2-NEXT: xorq %rax, %r10
+; AVX2-NEXT: xorq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Folded Reload
+; AVX2-NEXT: xorq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Folded Reload
+; AVX2-NEXT: xorq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Folded Reload
+; AVX2-NEXT: xorq %r15, %r11
+; AVX2-NEXT: xorq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Folded Reload
+; AVX2-NEXT: orq %rbp, %r12
+; AVX2-NEXT: movq %r8, 48(%rdi)
+; AVX2-NEXT: movq %r13, 56(%rdi)
+; AVX2-NEXT: movq %r9, 32(%rdi)
+; AVX2-NEXT: movq %r10, 40(%rdi)
+; AVX2-NEXT: movq %rsi, 16(%rdi)
+; AVX2-NEXT: movq %rcx, 24(%rdi)
+; AVX2-NEXT: movq %rbx, (%rdi)
+; AVX2-NEXT: movq %r11, 8(%rdi)
+; AVX2-NEXT: setne %al
+; AVX2-NEXT: addq $72, %rsp
+; AVX2-NEXT: popq %rbx
+; AVX2-NEXT: popq %r12
+; AVX2-NEXT: popq %r13
+; AVX2-NEXT: popq %r14
+; AVX2-NEXT: popq %r15
+; AVX2-NEXT: popq %rbp
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: complement_ne_i512:
+; AVX512: # %bb.0:
+; AVX512-NEXT: pushq %rbp
+; AVX512-NEXT: pushq %r15
+; AVX512-NEXT: pushq %r14
+; AVX512-NEXT: pushq %r13
+; AVX512-NEXT: pushq %r12
+; AVX512-NEXT: pushq %rbx
+; AVX512-NEXT: subq $72, %rsp
+; AVX512-NEXT: vxorps %xmm0, %xmm0, %xmm0
+; AVX512-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; AVX512-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; AVX512-NEXT: vmovaps {{.*#+}} xmm0 = [1,0,0,0]
+; AVX512-NEXT: vmovups %ymm0, (%rsp)
+; AVX512-NEXT: movl %esi, %ecx
+; AVX512-NEXT: andl $63, %ecx
+; AVX512-NEXT: shrl $3, %esi
+; AVX512-NEXT: andl $56, %esi
+; AVX512-NEXT: negl %esi
+; AVX512-NEXT: movslq %esi, %rbx
+; AVX512-NEXT: movq 16(%rsp,%rbx), %rsi
+; AVX512-NEXT: movq 24(%rsp,%rbx), %rbp
+; AVX512-NEXT: movq %rbp, %rax
+; AVX512-NEXT: shldq %cl, %rsi, %rax
+; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX512-NEXT: movq 48(%rsp,%rbx), %r8
+; AVX512-NEXT: movq 56(%rsp,%rbx), %r13
+; AVX512-NEXT: shldq %cl, %r8, %r13
+; AVX512-NEXT: movq 32(%rsp,%rbx), %r9
+; AVX512-NEXT: movq 40(%rsp,%rbx), %r14
+; AVX512-NEXT: movq %r14, %r10
+; AVX512-NEXT: shldq %cl, %r9, %r10
+; AVX512-NEXT: movq 8(%rsp,%rbx), %r11
+; AVX512-NEXT: shldq %cl, %r11, %rsi
+; AVX512-NEXT: shldq %cl, %r14, %r8
+; AVX512-NEXT: movq 16(%rdi), %r12
+; AVX512-NEXT: movq %r12, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX512-NEXT: movq 48(%rdi), %r14
+; AVX512-NEXT: movq %r14, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX512-NEXT: andq %r8, %r14
+; AVX512-NEXT: andq %rsi, %r12
+; AVX512-NEXT: orq %r14, %r12
+; AVX512-NEXT: movq 56(%rdi), %r15
+; AVX512-NEXT: movq %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX512-NEXT: andq %r13, %r15
+; AVX512-NEXT: movq 24(%rdi), %r14
+; AVX512-NEXT: movq %r14, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX512-NEXT: andq %rax, %r14
+; AVX512-NEXT: orq %r15, %r14
+; AVX512-NEXT: shldq %cl, %rbp, %r9
+; AVX512-NEXT: movq (%rsp,%rbx), %rdx
+; AVX512-NEXT: movq 32(%rdi), %r15
+; AVX512-NEXT: movq %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX512-NEXT: andq %r9, %r15
+; AVX512-NEXT: shlxq %rcx, %rdx, %rax
+; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX512-NEXT: movq (%rdi), %rbx
+; AVX512-NEXT: movq %rbx, %rbp
+; AVX512-NEXT: andq %rax, %rbp
+; AVX512-NEXT: orq %r15, %rbp
+; AVX512-NEXT: orq %r12, %rbp
+; AVX512-NEXT: # kill: def $cl killed $cl killed $rcx
+; AVX512-NEXT: shldq %cl, %rdx, %r11
+; AVX512-NEXT: movq 40(%rdi), %rax
+; AVX512-NEXT: movq %rax, %rcx
+; AVX512-NEXT: andq %r10, %rcx
+; AVX512-NEXT: movq 8(%rdi), %r15
+; AVX512-NEXT: movq %r15, %r12
+; AVX512-NEXT: andq %r11, %r12
+; AVX512-NEXT: orq %rcx, %r12
+; AVX512-NEXT: orq %r14, %r12
+; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
+; AVX512-NEXT: xorq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Folded Reload
+; AVX512-NEXT: xorq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Folded Reload
+; AVX512-NEXT: xorq %rax, %r10
+; AVX512-NEXT: xorq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Folded Reload
+; AVX512-NEXT: xorq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Folded Reload
+; AVX512-NEXT: xorq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Folded Reload
+; AVX512-NEXT: xorq %r15, %r11
+; AVX512-NEXT: xorq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Folded Reload
+; AVX512-NEXT: orq %rbp, %r12
+; AVX512-NEXT: movq %r8, 48(%rdi)
+; AVX512-NEXT: movq %r13, 56(%rdi)
+; AVX512-NEXT: movq %r9, 32(%rdi)
+; AVX512-NEXT: movq %r10, 40(%rdi)
+; AVX512-NEXT: movq %rsi, 16(%rdi)
+; AVX512-NEXT: movq %rcx, 24(%rdi)
+; AVX512-NEXT: movq %rbx, (%rdi)
+; AVX512-NEXT: movq %r11, 8(%rdi)
+; AVX512-NEXT: setne %al
+; AVX512-NEXT: addq $72, %rsp
+; AVX512-NEXT: popq %rbx
+; AVX512-NEXT: popq %r12
+; AVX512-NEXT: popq %r13
+; AVX512-NEXT: popq %r14
+; AVX512-NEXT: popq %r15
+; AVX512-NEXT: popq %rbp
+; AVX512-NEXT: vzeroupper
+; AVX512-NEXT: retq
%rem = and i32 %position, 511
%ofs = zext nneg i32 %rem to i512
%bit = shl nuw i512 1, %ofs
@@ -720,33 +2182,606 @@ define i1 @complement_ne_i512(ptr %word, i32 %position) nounwind {
define i1 @reset_eq_i512(ptr %word, i32 %position) nounwind {
; X86-LABEL: reset_eq_i512:
; X86: # %bb.0:
+; X86-NEXT: pushl %ebp
+; X86-NEXT: movl %esp, %ebp
+; X86-NEXT: pushl %ebx
; X86-NEXT: pushl %edi
; X86-NEXT: pushl %esi
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT: andl $-16, %esp
+; X86-NEXT: subl $288, %esp # imm = 0x120
+; X86-NEXT: movl 12(%ebp), %ecx
+; X86-NEXT: movl %ecx, %eax
+; X86-NEXT: shrl $3, %eax
+; X86-NEXT: andl $60, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: leal {{[0-9]+}}(%esp), %edi
+; X86-NEXT: subl %eax, %edi
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $1, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl 4(%edi), %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 8(%edi), %eax
+; X86-NEXT: andl $31, %ecx
+; X86-NEXT: movl %eax, %ebx
+; X86-NEXT: shldl %cl, %edx, %ebx
+; X86-NEXT: movl 12(%edi), %edx
; X86-NEXT: movl %edx, %esi
-; X86-NEXT: shrl $3, %esi
-; X86-NEXT: andl $60, %esi
-; X86-NEXT: movl (%ecx,%esi), %edi
-; X86-NEXT: btl %edx, %edi
-; X86-NEXT: setae %al
-; X86-NEXT: btrl %edx, %edi
-; X86-NEXT: movl %edi, (%ecx,%esi)
+; X86-NEXT: shldl %cl, %eax, %esi
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 16(%edi), %eax
+; X86-NEXT: movl %eax, %esi
+; X86-NEXT: shldl %cl, %edx, %esi
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 20(%edi), %edx
+; X86-NEXT: movl %edx, %esi
+; X86-NEXT: shldl %cl, %eax, %esi
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 24(%edi), %eax
+; X86-NEXT: movl %eax, %esi
+; X86-NEXT: shldl %cl, %edx, %esi
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 28(%edi), %edx
+; X86-NEXT: movl %edx, %esi
+; X86-NEXT: shldl %cl, %eax, %esi
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 32(%edi), %eax
+; X86-NEXT: movl %eax, %esi
+; X86-NEXT: shldl %cl, %edx, %esi
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 36(%edi), %esi
+; X86-NEXT: movl %esi, %edx
+; X86-NEXT: shldl %cl, %eax, %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 40(%edi), %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: shldl %cl, %esi, %edx
+; X86-NEXT: movl 8(%ebp), %esi
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 40(%esi), %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: andl %eax, %edx
+; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 8(%esi), %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: andl %eax, %ebx
+; X86-NEXT: orl %edx, %ebx
+; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 44(%edi), %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: shldl %cl, %eax, %edx
+; X86-NEXT: movl 52(%edi), %esi
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 56(%edi), %ebx
+; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: # kill: def $cl killed $cl killed $ecx
+; X86-NEXT: shldl %cl, %esi, %ebx
+; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 8(%ebp), %esi
+; X86-NEXT: movl 56(%esi), %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: andl %eax, %ebx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 24(%esi), %ecx
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: andl %ecx, %eax
+; X86-NEXT: orl %ebx, %eax
+; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl %esi, %ebx
+; X86-NEXT: movl 44(%esi), %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: andl %eax, %edx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 12(%esi), %ecx
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: andl %ecx, %eax
+; X86-NEXT: orl %edx, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 60(%edi), %eax
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT: shldl %cl, %esi, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 60(%ebx), %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: andl %edx, %eax
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 28(%ebx), %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: andl %edx, %esi
+; X86-NEXT: orl %eax, %esi
+; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl (%edi), %esi
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: negl %eax
+; X86-NEXT: movl 256(%esp,%eax), %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT: shldl %cl, %edi, %eax
+; X86-NEXT: movl %esi, %edi
+; X86-NEXT: # kill: def $cl killed $cl killed $ecx
+; X86-NEXT: shll %cl, %edi
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl %ebx, %esi
+; X86-NEXT: movl 32(%ebx), %ecx
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: andl %ecx, %edx
+; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl (%ebx), %ecx
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: andl %ecx, %edi
+; X86-NEXT: orl %edx, %edi
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 16(%esi), %ecx
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: andl %ecx, %ebx
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 48(%esi), %ecx
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: andl %ecx, %eax
+; X86-NEXT: orl %ebx, %eax
+; X86-NEXT: orl %edi, %eax
+; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT: shldl %cl, %edx, %edi
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 36(%esi), %ebx
+; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: andl %ebx, %edx
+; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 4(%esi), %ebx
+; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: andl %ebx, %edi
+; X86-NEXT: orl %edx, %edi
+; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: # kill: def $cl killed $cl killed $ecx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT: shldl %cl, %edi, %edx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 20(%esi), %edi
+; X86-NEXT: andl %edi, %ecx
+; X86-NEXT: movl %ecx, %esi
+; X86-NEXT: movl %edx, %ecx
+; X86-NEXT: movl 8(%ebp), %ebx
+; X86-NEXT: movl 52(%ebx), %ebx
+; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: andl %ebx, %edx
+; X86-NEXT: orl %esi, %edx
+; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT: notl %edx
+; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT: notl %edx
+; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT: notl %edx
+; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT: notl %ebx
+; X86-NEXT: andl %edi, %ebx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT: notl %edx
+; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT: notl %edx
+; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT: notl %edx
+; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT: notl %edx
+; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT: notl %edx
+; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT: notl %edx
+; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT: notl %esi
+; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT: notl %edx
+; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT: notl %edi
+; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT: notl %edi
+; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT: notl %edi
+; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X86-NEXT: notl %ecx
+; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X86-NEXT: orl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: movl 8(%ebp), %eax
+; X86-NEXT: movl %edx, 60(%eax)
+; X86-NEXT: movl %esi, 56(%eax)
+; X86-NEXT: movl %ecx, 52(%eax)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: movl %ecx, 44(%eax)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: movl %ecx, 40(%eax)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: movl %ecx, 36(%eax)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: movl %ecx, 32(%eax)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: movl %ecx, 28(%eax)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: movl %ecx, 24(%eax)
+; X86-NEXT: movl %ebx, 20(%eax)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: movl %ecx, 16(%eax)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: movl %ecx, 12(%eax)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: movl %ecx, 8(%eax)
+; X86-NEXT: movl %edi, 4(%eax)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: movl %ecx, (%eax)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: movl %ecx, 48(%eax)
+; X86-NEXT: sete %al
+; X86-NEXT: leal -12(%ebp), %esp
; X86-NEXT: popl %esi
; X86-NEXT: popl %edi
+; X86-NEXT: popl %ebx
+; X86-NEXT: popl %ebp
; X86-NEXT: retl
;
-; X64-LABEL: reset_eq_i512:
-; X64: # %bb.0:
-; X64-NEXT: movl %esi, %ecx
-; X64-NEXT: shrl $3, %ecx
-; X64-NEXT: andl $60, %ecx
-; X64-NEXT: movl (%rdi,%rcx), %edx
-; X64-NEXT: btl %esi, %edx
-; X64-NEXT: setae %al
-; X64-NEXT: btrl %esi, %edx
-; X64-NEXT: movl %edx, (%rdi,%rcx)
-; X64-NEXT: retq
+; SSE-LABEL: reset_eq_i512:
+; SSE: # %bb.0:
+; SSE-NEXT: pushq %rbp
+; SSE-NEXT: pushq %r15
+; SSE-NEXT: pushq %r14
+; SSE-NEXT: pushq %r13
+; SSE-NEXT: pushq %r12
+; SSE-NEXT: pushq %rbx
+; SSE-NEXT: subq $56, %rsp
+; SSE-NEXT: xorps %xmm0, %xmm0
+; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp)
+; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp)
+; SSE-NEXT: movups %xmm0, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: movq $0, {{[0-9]+}}(%rsp)
+; SSE-NEXT: movq $1, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: movl %esi, %ecx
+; SSE-NEXT: andl $63, %ecx
+; SSE-NEXT: shrl $3, %esi
+; SSE-NEXT: andl $56, %esi
+; SSE-NEXT: negl %esi
+; SSE-NEXT: movslq %esi, %rdx
+; SSE-NEXT: movq (%rsp,%rdx), %r9
+; SSE-NEXT: movq 8(%rsp,%rdx), %r8
+; SSE-NEXT: movq %r8, %rsi
+; SSE-NEXT: shldq %cl, %r9, %rsi
+; SSE-NEXT: movq -8(%rsp,%rdx), %rax
+; SSE-NEXT: shldq %cl, %rax, %r9
+; SSE-NEXT: movq 16(%rsp,%rdx), %r14
+; SSE-NEXT: movq 24(%rsp,%rdx), %r10
+; SSE-NEXT: movq %r10, %rbx
+; SSE-NEXT: shldq %cl, %r14, %rbx
+; SSE-NEXT: shldq %cl, %r8, %r14
+; SSE-NEXT: movq 32(%rsp,%rdx), %r13
+; SSE-NEXT: movq 40(%rsp,%rdx), %r12
+; SSE-NEXT: shldq %cl, %r13, %r12
+; SSE-NEXT: shldq %cl, %r10, %r13
+; SSE-NEXT: movq -16(%rsp,%rdx), %rdx
+; SSE-NEXT: shldq %cl, %rdx, %rax
+; SSE-NEXT: # kill: def $cl killed $cl killed $ecx
+; SSE-NEXT: shlq %cl, %rdx
+; SSE-NEXT: movq %r13, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE-NEXT: movq %r12, %rbp
+; SSE-NEXT: movq %r9, %r15
+; SSE-NEXT: movq %rsi, %r11
+; SSE-NEXT: movq 16(%rdi), %r8
+; SSE-NEXT: movq %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE-NEXT: movq 48(%rdi), %rcx
+; SSE-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE-NEXT: andq %rcx, %r13
+; SSE-NEXT: andq %r8, %r9
+; SSE-NEXT: orq %r13, %r9
+; SSE-NEXT: movq 56(%rdi), %rcx
+; SSE-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE-NEXT: andq %rcx, %r12
+; SSE-NEXT: movq 24(%rdi), %r10
+; SSE-NEXT: andq %r10, %rsi
+; SSE-NEXT: orq %r12, %rsi
+; SSE-NEXT: movq %r14, %r13
+; SSE-NEXT: movq 32(%rdi), %rcx
+; SSE-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE-NEXT: andq %rcx, %r14
+; SSE-NEXT: movq %rdx, %r12
+; SSE-NEXT: movq (%rdi), %rcx
+; SSE-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE-NEXT: andq %rcx, %rdx
+; SSE-NEXT: orq %r14, %rdx
+; SSE-NEXT: orq %r9, %rdx
+; SSE-NEXT: movq %rbx, %r14
+; SSE-NEXT: movq 40(%rdi), %rcx
+; SSE-NEXT: andq %rcx, %rbx
+; SSE-NEXT: movq %rax, %r9
+; SSE-NEXT: movq 8(%rdi), %r8
+; SSE-NEXT: andq %r8, %rax
+; SSE-NEXT: orq %rbx, %rax
+; SSE-NEXT: orq %rsi, %rax
+; SSE-NEXT: notq %r11
+; SSE-NEXT: andq %r10, %r11
+; SSE-NEXT: notq %r15
+; SSE-NEXT: andq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Folded Reload
+; SSE-NEXT: notq %r14
+; SSE-NEXT: andq %rcx, %r14
+; SSE-NEXT: notq %r13
+; SSE-NEXT: andq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Folded Reload
+; SSE-NEXT: notq %rbp
+; SSE-NEXT: andq {{[-0-9]+}}(%r{{[sb]}}p), %rbp # 8-byte Folded Reload
+; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
+; SSE-NEXT: notq %rcx
+; SSE-NEXT: andq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Folded Reload
+; SSE-NEXT: notq %r9
+; SSE-NEXT: andq %r8, %r9
+; SSE-NEXT: notq %r12
+; SSE-NEXT: andq {{[-0-9]+}}(%r{{[sb]}}p), %r12 # 8-byte Folded Reload
+; SSE-NEXT: orq %rdx, %rax
+; SSE-NEXT: movq %rcx, 48(%rdi)
+; SSE-NEXT: movq %rbp, 56(%rdi)
+; SSE-NEXT: movq %r13, 32(%rdi)
+; SSE-NEXT: movq %r14, 40(%rdi)
+; SSE-NEXT: movq %r15, 16(%rdi)
+; SSE-NEXT: movq %r11, 24(%rdi)
+; SSE-NEXT: movq %r12, (%rdi)
+; SSE-NEXT: movq %r9, 8(%rdi)
+; SSE-NEXT: sete %al
+; SSE-NEXT: addq $56, %rsp
+; SSE-NEXT: popq %rbx
+; SSE-NEXT: popq %r12
+; SSE-NEXT: popq %r13
+; SSE-NEXT: popq %r14
+; SSE-NEXT: popq %r15
+; SSE-NEXT: popq %rbp
+; SSE-NEXT: retq
+;
+; AVX2-LABEL: reset_eq_i512:
+; AVX2: # %bb.0:
+; AVX2-NEXT: pushq %rbp
+; AVX2-NEXT: pushq %r15
+; AVX2-NEXT: pushq %r14
+; AVX2-NEXT: pushq %r13
+; AVX2-NEXT: pushq %r12
+; AVX2-NEXT: pushq %rbx
+; AVX2-NEXT: pushq %rax
+; AVX2-NEXT: vxorps %xmm0, %xmm0, %xmm0
+; AVX2-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovss {{.*#+}} xmm0 = [1,0,0,0]
+; AVX2-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; AVX2-NEXT: movl %esi, %ecx
+; AVX2-NEXT: andl $63, %ecx
+; AVX2-NEXT: shrl $3, %esi
+; AVX2-NEXT: andl $56, %esi
+; AVX2-NEXT: negl %esi
+; AVX2-NEXT: movslq %esi, %rdx
+; AVX2-NEXT: movq -48(%rsp,%rdx), %r8
+; AVX2-NEXT: movq -40(%rsp,%rdx), %rbx
+; AVX2-NEXT: movq %rbx, %rax
+; AVX2-NEXT: shldq %cl, %r8, %rax
+; AVX2-NEXT: movq -16(%rsp,%rdx), %r10
+; AVX2-NEXT: movq -8(%rsp,%rdx), %rsi
+; AVX2-NEXT: shldq %cl, %r10, %rsi
+; AVX2-NEXT: movq -32(%rsp,%rdx), %r11
+; AVX2-NEXT: movq -24(%rsp,%rdx), %r14
+; AVX2-NEXT: movq %r14, %r9
+; AVX2-NEXT: shldq %cl, %r11, %r9
+; AVX2-NEXT: movq -64(%rsp,%rdx), %r15
+; AVX2-NEXT: movq -56(%rsp,%rdx), %rdx
+; AVX2-NEXT: shldq %cl, %rdx, %r8
+; AVX2-NEXT: shldq %cl, %r14, %r10
+; AVX2-NEXT: shldq %cl, %rbx, %r11
+; AVX2-NEXT: shldq %cl, %r15, %rdx
+; AVX2-NEXT: shlxq %rcx, %r15, %rcx
+; AVX2-NEXT: movq 24(%rdi), %rbx
+; AVX2-NEXT: movq 56(%rdi), %r14
+; AVX2-NEXT: movq 16(%rdi), %r15
+; AVX2-NEXT: movq 48(%rdi), %r13
+; AVX2-NEXT: movq 32(%rdi), %rbp
+; AVX2-NEXT: andnq %rbp, %r11, %r12
+; AVX2-NEXT: andq %r11, %rbp
+; AVX2-NEXT: andnq %r13, %r10, %r11
+; AVX2-NEXT: andq %r10, %r13
+; AVX2-NEXT: andnq %r15, %r8, %r10
+; AVX2-NEXT: andq %r8, %r15
+; AVX2-NEXT: movq 40(%rdi), %r8
+; AVX2-NEXT: orq %r13, %r15
+; AVX2-NEXT: andnq %r8, %r9, %r13
+; AVX2-NEXT: andq %r9, %r8
+; AVX2-NEXT: andnq %r14, %rsi, %r9
+; AVX2-NEXT: andq %rsi, %r14
+; AVX2-NEXT: andnq %rbx, %rax, %rsi
+; AVX2-NEXT: andq %rax, %rbx
+; AVX2-NEXT: movq (%rdi), %rax
+; AVX2-NEXT: orq %r14, %rbx
+; AVX2-NEXT: andnq %rax, %rcx, %r14
+; AVX2-NEXT: andq %rcx, %rax
+; AVX2-NEXT: orq %rbp, %rax
+; AVX2-NEXT: movq 8(%rdi), %rcx
+; AVX2-NEXT: orq %r15, %rax
+; AVX2-NEXT: andnq %rcx, %rdx, %r15
+; AVX2-NEXT: andq %rdx, %rcx
+; AVX2-NEXT: orq %r8, %rcx
+; AVX2-NEXT: orq %rbx, %rcx
+; AVX2-NEXT: orq %rax, %rcx
+; AVX2-NEXT: movq %r11, 48(%rdi)
+; AVX2-NEXT: movq %r9, 56(%rdi)
+; AVX2-NEXT: movq %r12, 32(%rdi)
+; AVX2-NEXT: movq %r13, 40(%rdi)
+; AVX2-NEXT: movq %r10, 16(%rdi)
+; AVX2-NEXT: movq %rsi, 24(%rdi)
+; AVX2-NEXT: movq %r14, (%rdi)
+; AVX2-NEXT: movq %r15, 8(%rdi)
+; AVX2-NEXT: sete %al
+; AVX2-NEXT: addq $8, %rsp
+; AVX2-NEXT: popq %rbx
+; AVX2-NEXT: popq %r12
+; AVX2-NEXT: popq %r13
+; AVX2-NEXT: popq %r14
+; AVX2-NEXT: popq %r15
+; AVX2-NEXT: popq %rbp
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: reset_eq_i512:
+; AVX512: # %bb.0:
+; AVX512-NEXT: pushq %rbp
+; AVX512-NEXT: pushq %r15
+; AVX512-NEXT: pushq %r14
+; AVX512-NEXT: pushq %r13
+; AVX512-NEXT: pushq %r12
+; AVX512-NEXT: pushq %rbx
+; AVX512-NEXT: pushq %rax
+; AVX512-NEXT: vxorps %xmm0, %xmm0, %xmm0
+; AVX512-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; AVX512-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; AVX512-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; AVX512-NEXT: vmovaps {{.*#+}} xmm0 = [1,0,0,0]
+; AVX512-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; AVX512-NEXT: movl %esi, %ecx
+; AVX512-NEXT: andl $63, %ecx
+; AVX512-NEXT: shrl $3, %esi
+; AVX512-NEXT: andl $56, %esi
+; AVX512-NEXT: negl %esi
+; AVX512-NEXT: movslq %esi, %rbx
+; AVX512-NEXT: movq -48(%rsp,%rbx), %r8
+; AVX512-NEXT: movq -40(%rsp,%rbx), %r14
+; AVX512-NEXT: movq %r14, %rax
+; AVX512-NEXT: shldq %cl, %r8, %rax
+; AVX512-NEXT: movq -16(%rsp,%rbx), %r10
+; AVX512-NEXT: movq -8(%rsp,%rbx), %rsi
+; AVX512-NEXT: shldq %cl, %r10, %rsi
+; AVX512-NEXT: movq -32(%rsp,%rbx), %r11
+; AVX512-NEXT: movq -24(%rsp,%rbx), %r15
+; AVX512-NEXT: movq %r15, %r9
+; AVX512-NEXT: shldq %cl, %r11, %r9
+; AVX512-NEXT: movq -56(%rsp,%rbx), %rdx
+; AVX512-NEXT: shldq %cl, %rdx, %r8
+; AVX512-NEXT: shldq %cl, %r15, %r10
+; AVX512-NEXT: shldq %cl, %r14, %r11
+; AVX512-NEXT: movq -64(%rsp,%rbx), %rbx
+; AVX512-NEXT: shldq %cl, %rbx, %rdx
+; AVX512-NEXT: shlxq %rcx, %rbx, %rcx
+; AVX512-NEXT: movq 24(%rdi), %rbx
+; AVX512-NEXT: movq 56(%rdi), %r14
+; AVX512-NEXT: movq 16(%rdi), %r15
+; AVX512-NEXT: movq 48(%rdi), %r13
+; AVX512-NEXT: movq 32(%rdi), %rbp
+; AVX512-NEXT: andnq %rbp, %r11, %r12
+; AVX512-NEXT: andq %r11, %rbp
+; AVX512-NEXT: andnq %r13, %r10, %r11
+; AVX512-NEXT: andq %r10, %r13
+; AVX512-NEXT: andnq %r15, %r8, %r10
+; AVX512-NEXT: andq %r8, %r15
+; AVX512-NEXT: movq 40(%rdi), %r8
+; AVX512-NEXT: orq %r13, %r15
+; AVX512-NEXT: andnq %r8, %r9, %r13
+; AVX512-NEXT: andq %r9, %r8
+; AVX512-NEXT: andnq %r14, %rsi, %r9
+; AVX512-NEXT: andq %rsi, %r14
+; AVX512-NEXT: andnq %rbx, %rax, %rsi
+; AVX512-NEXT: andq %rax, %rbx
+; AVX512-NEXT: movq (%rdi), %rax
+; AVX512-NEXT: orq %r14, %rbx
+; AVX512-NEXT: andnq %rax, %rcx, %r14
+; AVX512-NEXT: andq %rcx, %rax
+; AVX512-NEXT: orq %rbp, %rax
+; AVX512-NEXT: movq 8(%rdi), %rcx
+; AVX512-NEXT: orq %r15, %rax
+; AVX512-NEXT: andnq %rcx, %rdx, %r15
+; AVX512-NEXT: andq %rdx, %rcx
+; AVX512-NEXT: orq %r8, %rcx
+; AVX512-NEXT: orq %rbx, %rcx
+; AVX512-NEXT: orq %rax, %rcx
+; AVX512-NEXT: movq %r11, 48(%rdi)
+; AVX512-NEXT: movq %r9, 56(%rdi)
+; AVX512-NEXT: movq %r12, 32(%rdi)
+; AVX512-NEXT: movq %r13, 40(%rdi)
+; AVX512-NEXT: movq %r10, 16(%rdi)
+; AVX512-NEXT: movq %rsi, 24(%rdi)
+; AVX512-NEXT: movq %r14, (%rdi)
+; AVX512-NEXT: movq %r15, 8(%rdi)
+; AVX512-NEXT: sete %al
+; AVX512-NEXT: addq $8, %rsp
+; AVX512-NEXT: popq %rbx
+; AVX512-NEXT: popq %r12
+; AVX512-NEXT: popq %r13
+; AVX512-NEXT: popq %r14
+; AVX512-NEXT: popq %r15
+; AVX512-NEXT: popq %rbp
+; AVX512-NEXT: vzeroupper
+; AVX512-NEXT: retq
%rem = and i32 %position, 511
%ofs = zext nneg i32 %rem to i512
%bit = shl nuw i512 1, %ofs
@@ -762,33 +2797,572 @@ define i1 @reset_eq_i512(ptr %word, i32 %position) nounwind {
define i1 @set_ne_i512(ptr %word, i32 %position) nounwind {
; X86-LABEL: set_ne_i512:
; X86: # %bb.0:
+; X86-NEXT: pushl %ebp
+; X86-NEXT: movl %esp, %ebp
+; X86-NEXT: pushl %ebx
; X86-NEXT: pushl %edi
; X86-NEXT: pushl %esi
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT: movl %edx, %esi
-; X86-NEXT: shrl $3, %esi
-; X86-NEXT: andl $60, %esi
-; X86-NEXT: movl (%ecx,%esi), %edi
-; X86-NEXT: btl %edx, %edi
-; X86-NEXT: setb %al
-; X86-NEXT: btsl %edx, %edi
-; X86-NEXT: movl %edi, (%ecx,%esi)
+; X86-NEXT: andl $-16, %esp
+; X86-NEXT: subl $272, %esp # imm = 0x110
+; X86-NEXT: movl 12(%ebp), %ecx
+; X86-NEXT: movl %ecx, %eax
+; X86-NEXT: shrl $3, %eax
+; X86-NEXT: andl $60, %eax
+; X86-NEXT: movl %eax, (%esp) # 4-byte Spill
+; X86-NEXT: leal {{[0-9]+}}(%esp), %edx
+; X86-NEXT: subl %eax, %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $1, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl 24(%edx), %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 28(%edx), %esi
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: andl $31, %ecx
+; X86-NEXT: shldl %cl, %eax, %esi
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 56(%edx), %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 60(%edx), %esi
+; X86-NEXT: shldl %cl, %eax, %esi
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 8(%edx), %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 12(%edx), %esi
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: shldl %cl, %eax, %esi
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 40(%edx), %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 44(%edx), %esi
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: shldl %cl, %eax, %esi
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 16(%edx), %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 20(%edx), %ebx
+; X86-NEXT: movl %ebx, %esi
+; X86-NEXT: shldl %cl, %eax, %esi
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 32(%edx), %edi
+; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 36(%edx), %esi
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: shldl %cl, %edi, %esi
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: shldl %cl, %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: movl 52(%edx), %esi
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT: shldl %cl, %esi, %edi
+; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 4(%edx), %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT: shldl %cl, %eax, %ebx
+; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: shldl %cl, %eax, %esi
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 8(%ebp), %edx
+; X86-NEXT: movl 40(%edx), %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: andl %esi, %eax
+; X86-NEXT: movl 8(%edx), %esi
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: andl %ebx, %esi
+; X86-NEXT: orl %eax, %esi
+; X86-NEXT: movl %edx, %eax
+; X86-NEXT: movl 56(%edx), %ebx
+; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: andl %edi, %ebx
+; X86-NEXT: movl 24(%edx), %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X86-NEXT: orl %ebx, %edx
+; X86-NEXT: orl %esi, %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 44(%eax), %esi
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X86-NEXT: movl 12(%eax), %ebx
+; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
+; X86-NEXT: orl %esi, %ebx
+; X86-NEXT: movl 60(%eax), %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl %eax, %esi
+; X86-NEXT: movl %edx, %eax
+; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-NEXT: movl 28(%esi), %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X86-NEXT: orl %eax, %edx
+; X86-NEXT: orl %ebx, %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: movl (%eax), %edx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: shldl %cl, %eax, %ebx
+; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: shldl %cl, %eax, %edi
+; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl (%esp), %eax # 4-byte Reload
+; X86-NEXT: negl %eax
+; X86-NEXT: movl 240(%esp,%eax), %esi
+; X86-NEXT: shldl %cl, %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: shldl %cl, %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: shldl %cl, %eax, %esi
+; X86-NEXT: movl %esi, (%esp) # 4-byte Spill
+; X86-NEXT: # kill: def $cl killed $cl killed $ecx
+; X86-NEXT: shll %cl, %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 8(%ebp), %esi
+; X86-NEXT: movl 32(%esi), %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: andl %edi, %eax
+; X86-NEXT: movl (%esi), %ecx
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: andl %edx, %ecx
+; X86-NEXT: orl %eax, %ecx
+; X86-NEXT: movl 16(%esi), %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: andl %ebx, %eax
+; X86-NEXT: movl 48(%esi), %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: andl (%esp), %edx # 4-byte Folded Reload
+; X86-NEXT: orl %eax, %edx
+; X86-NEXT: orl %ecx, %edx
+; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X86-NEXT: movl 36(%esi), %ebx
+; X86-NEXT: movl %ebx, %eax
+; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-NEXT: movl 4(%esi), %ecx
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X86-NEXT: orl %eax, %ecx
+; X86-NEXT: movl %esi, %eax
+; X86-NEXT: movl 20(%esi), %edi
+; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT: andl %esi, %edi
+; X86-NEXT: movl 52(%eax), %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-NEXT: orl %edi, %eax
+; X86-NEXT: orl %ecx, %eax
+; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: orl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: orl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: orl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: orl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: orl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: orl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: orl %ecx, (%esp) # 4-byte Folded Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: orl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: orl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X86-NEXT: orl %edx, %eax
+; X86-NEXT: movl 8(%ebp), %edx
+; X86-NEXT: movl %ebx, 60(%edx)
+; X86-NEXT: movl %edi, 56(%edx)
+; X86-NEXT: movl %ecx, 52(%edx)
+; X86-NEXT: movl %esi, 44(%edx)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: movl %eax, 40(%edx)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: movl %eax, 36(%edx)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: movl %eax, 32(%edx)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: movl %eax, 28(%edx)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: movl %eax, 24(%edx)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: movl %eax, 20(%edx)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: movl %eax, 16(%edx)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: movl %eax, 12(%edx)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: movl %eax, 8(%edx)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: movl %eax, 4(%edx)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: movl %eax, (%edx)
+; X86-NEXT: movl (%esp), %eax # 4-byte Reload
+; X86-NEXT: movl %eax, 48(%edx)
+; X86-NEXT: setne %al
+; X86-NEXT: leal -12(%ebp), %esp
; X86-NEXT: popl %esi
; X86-NEXT: popl %edi
+; X86-NEXT: popl %ebx
+; X86-NEXT: popl %ebp
; X86-NEXT: retl
;
-; X64-LABEL: set_ne_i512:
-; X64: # %bb.0:
-; X64-NEXT: movl %esi, %ecx
-; X64-NEXT: shrl $3, %ecx
-; X64-NEXT: andl $60, %ecx
-; X64-NEXT: movl (%rdi,%rcx), %edx
-; X64-NEXT: btl %esi, %edx
-; X64-NEXT: setb %al
-; X64-NEXT: btsl %esi, %edx
-; X64-NEXT: movl %edx, (%rdi,%rcx)
-; X64-NEXT: retq
+; SSE-LABEL: set_ne_i512:
+; SSE: # %bb.0:
+; SSE-NEXT: pushq %rbp
+; SSE-NEXT: pushq %r15
+; SSE-NEXT: pushq %r14
+; SSE-NEXT: pushq %r13
+; SSE-NEXT: pushq %r12
+; SSE-NEXT: pushq %rbx
+; SSE-NEXT: subq $56, %rsp
+; SSE-NEXT: xorps %xmm0, %xmm0
+; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp)
+; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp)
+; SSE-NEXT: movups %xmm0, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: movq $0, {{[0-9]+}}(%rsp)
+; SSE-NEXT: movq $1, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: movl %esi, %ecx
+; SSE-NEXT: andl $63, %ecx
+; SSE-NEXT: shrl $3, %esi
+; SSE-NEXT: andl $56, %esi
+; SSE-NEXT: negl %esi
+; SSE-NEXT: movslq %esi, %rbx
+; SSE-NEXT: movq (%rsp,%rbx), %rsi
+; SSE-NEXT: movq 8(%rsp,%rbx), %r14
+; SSE-NEXT: movq %r14, %rax
+; SSE-NEXT: shldq %cl, %rsi, %rax
+; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE-NEXT: movq 32(%rsp,%rbx), %r8
+; SSE-NEXT: movq 40(%rsp,%rbx), %rbp
+; SSE-NEXT: shldq %cl, %r8, %rbp
+; SSE-NEXT: movq 16(%rsp,%rbx), %r9
+; SSE-NEXT: movq 24(%rsp,%rbx), %r15
+; SSE-NEXT: movq %r15, %r10
+; SSE-NEXT: shldq %cl, %r9, %r10
+; SSE-NEXT: movq -8(%rsp,%rbx), %r11
+; SSE-NEXT: shldq %cl, %r11, %rsi
+; SSE-NEXT: shldq %cl, %r15, %r8
+; SSE-NEXT: shldq %cl, %r14, %r9
+; SSE-NEXT: movq -16(%rsp,%rbx), %rbx
+; SSE-NEXT: shldq %cl, %rbx, %r11
+; SSE-NEXT: # kill: def $cl killed $cl killed $ecx
+; SSE-NEXT: shlq %cl, %rbx
+; SSE-NEXT: movq 24(%rdi), %r15
+; SSE-NEXT: movq %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE-NEXT: movq 56(%rdi), %rcx
+; SSE-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE-NEXT: movq 16(%rdi), %r12
+; SSE-NEXT: movq %r12, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE-NEXT: movq 48(%rdi), %r13
+; SSE-NEXT: movq %r13, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE-NEXT: andq %r8, %r13
+; SSE-NEXT: andq %rsi, %r12
+; SSE-NEXT: orq %r13, %r12
+; SSE-NEXT: movq %rcx, %r13
+; SSE-NEXT: andq %rbp, %r13
+; SSE-NEXT: andq %rax, %r15
+; SSE-NEXT: orq %r13, %r15
+; SSE-NEXT: movq 32(%rdi), %r14
+; SSE-NEXT: movq %r14, %rcx
+; SSE-NEXT: andq %r9, %rcx
+; SSE-NEXT: movq (%rdi), %r13
+; SSE-NEXT: movq %r13, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE-NEXT: andq %rbx, %r13
+; SSE-NEXT: orq %rcx, %r13
+; SSE-NEXT: orq %r12, %r13
+; SSE-NEXT: movq 40(%rdi), %rcx
+; SSE-NEXT: movq %rcx, %r12
+; SSE-NEXT: andq %r10, %r12
+; SSE-NEXT: movq 8(%rdi), %rdx
+; SSE-NEXT: movq %rdx, %rax
+; SSE-NEXT: andq %r11, %rax
+; SSE-NEXT: orq %r12, %rax
+; SSE-NEXT: orq %r15, %rax
+; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload
+; SSE-NEXT: orq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Folded Reload
+; SSE-NEXT: orq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Folded Reload
+; SSE-NEXT: orq %rcx, %r10
+; SSE-NEXT: orq %r14, %r9
+; SSE-NEXT: orq {{[-0-9]+}}(%r{{[sb]}}p), %rbp # 8-byte Folded Reload
+; SSE-NEXT: orq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Folded Reload
+; SSE-NEXT: orq %rdx, %r11
+; SSE-NEXT: orq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Folded Reload
+; SSE-NEXT: orq %r13, %rax
+; SSE-NEXT: movq %r8, 48(%rdi)
+; SSE-NEXT: movq %rbp, 56(%rdi)
+; SSE-NEXT: movq %r9, 32(%rdi)
+; SSE-NEXT: movq %r10, 40(%rdi)
+; SSE-NEXT: movq %rsi, 16(%rdi)
+; SSE-NEXT: movq %r15, 24(%rdi)
+; SSE-NEXT: movq %rbx, (%rdi)
+; SSE-NEXT: movq %r11, 8(%rdi)
+; SSE-NEXT: setne %al
+; SSE-NEXT: addq $56, %rsp
+; SSE-NEXT: popq %rbx
+; SSE-NEXT: popq %r12
+; SSE-NEXT: popq %r13
+; SSE-NEXT: popq %r14
+; SSE-NEXT: popq %r15
+; SSE-NEXT: popq %rbp
+; SSE-NEXT: retq
+;
+; AVX2-LABEL: set_ne_i512:
+; AVX2: # %bb.0:
+; AVX2-NEXT: pushq %rbp
+; AVX2-NEXT: pushq %r15
+; AVX2-NEXT: pushq %r14
+; AVX2-NEXT: pushq %r13
+; AVX2-NEXT: pushq %r12
+; AVX2-NEXT: pushq %rbx
+; AVX2-NEXT: subq $72, %rsp
+; AVX2-NEXT: vxorps %xmm0, %xmm0, %xmm0
+; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovss {{.*#+}} xmm0 = [1,0,0,0]
+; AVX2-NEXT: vmovups %ymm0, (%rsp)
+; AVX2-NEXT: movl %esi, %ecx
+; AVX2-NEXT: andl $63, %ecx
+; AVX2-NEXT: shrl $3, %esi
+; AVX2-NEXT: andl $56, %esi
+; AVX2-NEXT: negl %esi
+; AVX2-NEXT: movslq %esi, %rbx
+; AVX2-NEXT: movq 16(%rsp,%rbx), %rsi
+; AVX2-NEXT: movq 24(%rsp,%rbx), %rbp
+; AVX2-NEXT: movq %rbp, %rax
+; AVX2-NEXT: shldq %cl, %rsi, %rax
+; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX2-NEXT: movq 48(%rsp,%rbx), %r8
+; AVX2-NEXT: movq 56(%rsp,%rbx), %r13
+; AVX2-NEXT: shldq %cl, %r8, %r13
+; AVX2-NEXT: movq 32(%rsp,%rbx), %r9
+; AVX2-NEXT: movq 40(%rsp,%rbx), %r14
+; AVX2-NEXT: movq %r14, %r10
+; AVX2-NEXT: shldq %cl, %r9, %r10
+; AVX2-NEXT: movq 8(%rsp,%rbx), %r11
+; AVX2-NEXT: shldq %cl, %r11, %rsi
+; AVX2-NEXT: shldq %cl, %r14, %r8
+; AVX2-NEXT: movq 16(%rdi), %r12
+; AVX2-NEXT: movq %r12, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX2-NEXT: movq 48(%rdi), %r14
+; AVX2-NEXT: movq %r14, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX2-NEXT: andq %r8, %r14
+; AVX2-NEXT: andq %rsi, %r12
+; AVX2-NEXT: orq %r14, %r12
+; AVX2-NEXT: movq 56(%rdi), %r15
+; AVX2-NEXT: movq %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX2-NEXT: andq %r13, %r15
+; AVX2-NEXT: movq 24(%rdi), %r14
+; AVX2-NEXT: movq %r14, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX2-NEXT: andq %rax, %r14
+; AVX2-NEXT: orq %r15, %r14
+; AVX2-NEXT: shldq %cl, %rbp, %r9
+; AVX2-NEXT: movq (%rsp,%rbx), %rdx
+; AVX2-NEXT: movq 32(%rdi), %r15
+; AVX2-NEXT: movq %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX2-NEXT: andq %r9, %r15
+; AVX2-NEXT: shlxq %rcx, %rdx, %rax
+; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX2-NEXT: movq (%rdi), %rbx
+; AVX2-NEXT: movq %rbx, %rbp
+; AVX2-NEXT: andq %rax, %rbp
+; AVX2-NEXT: orq %r15, %rbp
+; AVX2-NEXT: orq %r12, %rbp
+; AVX2-NEXT: # kill: def $cl killed $cl killed $rcx
+; AVX2-NEXT: shldq %cl, %rdx, %r11
+; AVX2-NEXT: movq 40(%rdi), %rax
+; AVX2-NEXT: movq %rax, %rcx
+; AVX2-NEXT: andq %r10, %rcx
+; AVX2-NEXT: movq 8(%rdi), %r15
+; AVX2-NEXT: movq %r15, %r12
+; AVX2-NEXT: andq %r11, %r12
+; AVX2-NEXT: orq %rcx, %r12
+; AVX2-NEXT: orq %r14, %r12
+; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
+; AVX2-NEXT: orq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Folded Reload
+; AVX2-NEXT: orq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Folded Reload
+; AVX2-NEXT: orq %rax, %r10
+; AVX2-NEXT: orq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Folded Reload
+; AVX2-NEXT: orq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Folded Reload
+; AVX2-NEXT: orq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Folded Reload
+; AVX2-NEXT: orq %r15, %r11
+; AVX2-NEXT: orq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Folded Reload
+; AVX2-NEXT: orq %rbp, %r12
+; AVX2-NEXT: movq %r8, 48(%rdi)
+; AVX2-NEXT: movq %r13, 56(%rdi)
+; AVX2-NEXT: movq %r9, 32(%rdi)
+; AVX2-NEXT: movq %r10, 40(%rdi)
+; AVX2-NEXT: movq %rsi, 16(%rdi)
+; AVX2-NEXT: movq %rcx, 24(%rdi)
+; AVX2-NEXT: movq %rbx, (%rdi)
+; AVX2-NEXT: movq %r11, 8(%rdi)
+; AVX2-NEXT: setne %al
+; AVX2-NEXT: addq $72, %rsp
+; AVX2-NEXT: popq %rbx
+; AVX2-NEXT: popq %r12
+; AVX2-NEXT: popq %r13
+; AVX2-NEXT: popq %r14
+; AVX2-NEXT: popq %r15
+; AVX2-NEXT: popq %rbp
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: set_ne_i512:
+; AVX512: # %bb.0:
+; AVX512-NEXT: pushq %rbp
+; AVX512-NEXT: pushq %r15
+; AVX512-NEXT: pushq %r14
+; AVX512-NEXT: pushq %r13
+; AVX512-NEXT: pushq %r12
+; AVX512-NEXT: pushq %rbx
+; AVX512-NEXT: subq $72, %rsp
+; AVX512-NEXT: vxorps %xmm0, %xmm0, %xmm0
+; AVX512-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; AVX512-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; AVX512-NEXT: vmovaps {{.*#+}} xmm0 = [1,0,0,0]
+; AVX512-NEXT: vmovups %ymm0, (%rsp)
+; AVX512-NEXT: movl %esi, %ecx
+; AVX512-NEXT: andl $63, %ecx
+; AVX512-NEXT: shrl $3, %esi
+; AVX512-NEXT: andl $56, %esi
+; AVX512-NEXT: negl %esi
+; AVX512-NEXT: movslq %esi, %rbx
+; AVX512-NEXT: movq 16(%rsp,%rbx), %rsi
+; AVX512-NEXT: movq 24(%rsp,%rbx), %rbp
+; AVX512-NEXT: movq %rbp, %rax
+; AVX512-NEXT: shldq %cl, %rsi, %rax
+; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX512-NEXT: movq 48(%rsp,%rbx), %r8
+; AVX512-NEXT: movq 56(%rsp,%rbx), %r13
+; AVX512-NEXT: shldq %cl, %r8, %r13
+; AVX512-NEXT: movq 32(%rsp,%rbx), %r9
+; AVX512-NEXT: movq 40(%rsp,%rbx), %r14
+; AVX512-NEXT: movq %r14, %r10
+; AVX512-NEXT: shldq %cl, %r9, %r10
+; AVX512-NEXT: movq 8(%rsp,%rbx), %r11
+; AVX512-NEXT: shldq %cl, %r11, %rsi
+; AVX512-NEXT: shldq %cl, %r14, %r8
+; AVX512-NEXT: movq 16(%rdi), %r12
+; AVX512-NEXT: movq %r12, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX512-NEXT: movq 48(%rdi), %r14
+; AVX512-NEXT: movq %r14, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX512-NEXT: andq %r8, %r14
+; AVX512-NEXT: andq %rsi, %r12
+; AVX512-NEXT: orq %r14, %r12
+; AVX512-NEXT: movq 56(%rdi), %r15
+; AVX512-NEXT: movq %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX512-NEXT: andq %r13, %r15
+; AVX512-NEXT: movq 24(%rdi), %r14
+; AVX512-NEXT: movq %r14, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX512-NEXT: andq %rax, %r14
+; AVX512-NEXT: orq %r15, %r14
+; AVX512-NEXT: shldq %cl, %rbp, %r9
+; AVX512-NEXT: movq (%rsp,%rbx), %rdx
+; AVX512-NEXT: movq 32(%rdi), %r15
+; AVX512-NEXT: movq %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX512-NEXT: andq %r9, %r15
+; AVX512-NEXT: shlxq %rcx, %rdx, %rax
+; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX512-NEXT: movq (%rdi), %rbx
+; AVX512-NEXT: movq %rbx, %rbp
+; AVX512-NEXT: andq %rax, %rbp
+; AVX512-NEXT: orq %r15, %rbp
+; AVX512-NEXT: orq %r12, %rbp
+; AVX512-NEXT: # kill: def $cl killed $cl killed $rcx
+; AVX512-NEXT: shldq %cl, %rdx, %r11
+; AVX512-NEXT: movq 40(%rdi), %rax
+; AVX512-NEXT: movq %rax, %rcx
+; AVX512-NEXT: andq %r10, %rcx
+; AVX512-NEXT: movq 8(%rdi), %r15
+; AVX512-NEXT: movq %r15, %r12
+; AVX512-NEXT: andq %r11, %r12
+; AVX512-NEXT: orq %rcx, %r12
+; AVX512-NEXT: orq %r14, %r12
+; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
+; AVX512-NEXT: orq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Folded Reload
+; AVX512-NEXT: orq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Folded Reload
+; AVX512-NEXT: orq %rax, %r10
+; AVX512-NEXT: orq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Folded Reload
+; AVX512-NEXT: orq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Folded Reload
+; AVX512-NEXT: orq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Folded Reload
+; AVX512-NEXT: orq %r15, %r11
+; AVX512-NEXT: orq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Folded Reload
+; AVX512-NEXT: orq %rbp, %r12
+; AVX512-NEXT: movq %r8, 48(%rdi)
+; AVX512-NEXT: movq %r13, 56(%rdi)
+; AVX512-NEXT: movq %r9, 32(%rdi)
+; AVX512-NEXT: movq %r10, 40(%rdi)
+; AVX512-NEXT: movq %rsi, 16(%rdi)
+; AVX512-NEXT: movq %rcx, 24(%rdi)
+; AVX512-NEXT: movq %rbx, (%rdi)
+; AVX512-NEXT: movq %r11, 8(%rdi)
+; AVX512-NEXT: setne %al
+; AVX512-NEXT: addq $72, %rsp
+; AVX512-NEXT: popq %rbx
+; AVX512-NEXT: popq %r12
+; AVX512-NEXT: popq %r13
+; AVX512-NEXT: popq %r14
+; AVX512-NEXT: popq %r15
+; AVX512-NEXT: popq %rbp
+; AVX512-NEXT: vzeroupper
+; AVX512-NEXT: retq
%rem = and i32 %position, 511
%ofs = zext nneg i32 %rem to i512
%bit = shl nuw i512 1, %ofs
@@ -803,55 +3377,883 @@ define i1 @set_ne_i512(ptr %word, i32 %position) nounwind {
define i1 @init_eq_i512(ptr %word, i32 %position, i1 zeroext %value) nounwind {
; X86-LABEL: init_eq_i512:
; X86: # %bb.0:
+; X86-NEXT: pushl %ebp
+; X86-NEXT: movl %esp, %ebp
; X86-NEXT: pushl %ebx
; X86-NEXT: pushl %edi
; X86-NEXT: pushl %esi
-; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: movl %ecx, %esi
-; X86-NEXT: shrl $3, %esi
-; X86-NEXT: andl $60, %esi
-; X86-NEXT: movl (%edx,%esi), %edi
-; X86-NEXT: btl %ecx, %edi
-; X86-NEXT: setae %al
-; X86-NEXT: btrl %ecx, %edi
-; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ebx
+; X86-NEXT: andl $-16, %esp
+; X86-NEXT: subl $432, %esp # imm = 0x1B0
+; X86-NEXT: movl 12(%ebp), %ecx
+; X86-NEXT: movl %ecx, %edx
+; X86-NEXT: shrl $3, %edx
+; X86-NEXT: andl $60, %edx
+; X86-NEXT: leal {{[0-9]+}}(%esp), %esi
+; X86-NEXT: subl %edx, %esi
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $1, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl 56(%esi), %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 60(%esi), %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 52(%esi), %eax
+; X86-NEXT: movl 48(%esi), %edi
+; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 40(%esi), %edi
+; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 44(%esi), %edi
+; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 36(%esi), %edi
+; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 32(%esi), %edi
+; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 28(%esi), %edi
+; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 24(%esi), %edi
+; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 20(%esi), %edi
+; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 16(%esi), %edi
+; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 12(%esi), %edi
+; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 8(%esi), %edi
+; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl (%esi), %edi
+; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 4(%esi), %esi
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movzbl 16(%ebp), %ebx
+; X86-NEXT: movzbl %bl, %esi
+; X86-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; X86-NEXT: leal {{[0-9]+}}(%esp), %esi
+; X86-NEXT: subl %edx, %esi
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: andl $31, %ecx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT: shldl %cl, %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: shldl %cl, %eax, %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT: shldl %cl, %ebx, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT: movl %edi, %eax
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT: shldl %cl, %esi, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT: shldl %cl, %edx, %esi
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: shldl %cl, %eax, %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT: shldl %cl, %edx, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: shldl %cl, %eax, %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT: shldl %cl, %edx, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: shldl %cl, %eax, %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT: shldl %cl, %edx, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT: shldl %cl, %esi, %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: shldl %cl, %eax, %esi
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: # kill: def $cl killed $cl killed $ecx
-; X86-NEXT: shll %cl, %ebx
-; X86-NEXT: orl %edi, %ebx
-; X86-NEXT: movl %ebx, (%edx,%esi)
+; X86-NEXT: movl %ebx, %edx
+; X86-NEXT: shldl %cl, %edi, %edx
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl 8(%ebp), %ebx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 48(%ebx), %esi
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: andl %esi, %edx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 16(%ebx), %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: andl %eax, %esi
+; X86-NEXT: orl %edx, %esi
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 40(%ebx), %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: andl %eax, %edx
+; X86-NEXT: movl %edx, %esi
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 8(%ebx), %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: andl %edx, %eax
+; X86-NEXT: orl %esi, %eax
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 56(%ebx), %esi
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: andl %esi, %edx
+; X86-NEXT: movl %edx, %edi
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 24(%ebx), %esi
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: andl %esi, %edx
+; X86-NEXT: orl %edi, %edx
+; X86-NEXT: orl %eax, %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 52(%ebx), %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: andl %edx, %eax
+; X86-NEXT: movl %eax, %edx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 20(%ebx), %esi
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: andl %esi, %eax
+; X86-NEXT: orl %edx, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 44(%ebx), %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: andl %edx, %eax
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 12(%ebx), %esi
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: andl %esi, %edi
+; X86-NEXT: orl %eax, %edi
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 60(%ebx), %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: andl %edx, %esi
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 28(%ebx), %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: andl %eax, %edx
+; X86-NEXT: orl %esi, %edx
+; X86-NEXT: orl %edi, %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT: shldl %cl, %eax, %edx
+; X86-NEXT: # kill: def $cl killed $cl killed $ecx
+; X86-NEXT: shll %cl, %eax
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 32(%ebx), %esi
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: andl %esi, %ecx
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl (%ebx), %edi
+; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: andl %edi, %eax
+; X86-NEXT: orl %ecx, %eax
+; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 36(%ebx), %ecx
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: andl %ecx, %eax
+; X86-NEXT: movl %eax, %ecx
+; X86-NEXT: movl %edx, %eax
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 4(%ebx), %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: andl %edx, %eax
+; X86-NEXT: orl %ecx, %eax
+; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT: notl %edx
+; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT: movl 56(%edi), %ebx
+; X86-NEXT: movl 60(%edi), %eax
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: shldl %cl, %ebx, %eax
+; X86-NEXT: orl %eax, %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT: notl %edx
+; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X86-NEXT: movl 52(%edi), %eax
+; X86-NEXT: shldl %cl, %eax, %ebx
+; X86-NEXT: orl %ebx, %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT: notl %edx
+; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X86-NEXT: movl 48(%edi), %esi
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: shldl %cl, %esi, %eax
+; X86-NEXT: orl %eax, %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: notl %eax
+; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-NEXT: movl %eax, %edx
+; X86-NEXT: movl 40(%edi), %ebx
+; X86-NEXT: movl 44(%edi), %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: shldl %cl, %ebx, %eax
+; X86-NEXT: orl %eax, %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT: notl %edx
+; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X86-NEXT: movl 36(%edi), %eax
+; X86-NEXT: shldl %cl, %eax, %ebx
+; X86-NEXT: orl %ebx, %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT: notl %edx
+; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X86-NEXT: movl 32(%edi), %ebx
+; X86-NEXT: shldl %cl, %ebx, %eax
+; X86-NEXT: orl %eax, %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT: notl %edx
+; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X86-NEXT: movl 28(%edi), %eax
+; X86-NEXT: shldl %cl, %eax, %ebx
+; X86-NEXT: orl %ebx, %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT: notl %edx
+; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X86-NEXT: movl 24(%edi), %ebx
+; X86-NEXT: shldl %cl, %ebx, %eax
+; X86-NEXT: orl %eax, %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT: notl %edx
+; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X86-NEXT: movl 20(%edi), %eax
+; X86-NEXT: shldl %cl, %eax, %ebx
+; X86-NEXT: orl %ebx, %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT: notl %edx
+; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X86-NEXT: movl 16(%edi), %ebx
+; X86-NEXT: shldl %cl, %ebx, %eax
+; X86-NEXT: orl %eax, %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT: notl %edx
+; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X86-NEXT: movl 12(%edi), %eax
+; X86-NEXT: # kill: def $cl killed $cl killed $ecx
+; X86-NEXT: shldl %cl, %eax, %ebx
+; X86-NEXT: orl %ebx, %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT: notl %esi
+; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X86-NEXT: movl 8(%edi), %edx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: # kill: def $cl killed $cl killed $ecx
+; X86-NEXT: shldl %cl, %edx, %eax
+; X86-NEXT: orl %eax, %esi
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: notl %eax
+; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-NEXT: movl 4(%edi), %ebx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: shldl %cl, %ebx, %edx
+; X86-NEXT: orl %edx, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT: notl %edx
+; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X86-NEXT: # kill: def $cl killed $cl killed $ecx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT: shldl %cl, %esi, %eax
+; X86-NEXT: orl %eax, %edx
+; X86-NEXT: movl %edx, %esi
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT: notl %edx
+; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X86-NEXT: movl (%edi), %eax
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: shldl %cl, %eax, %ebx
+; X86-NEXT: orl %ebx, %edx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT: notl %edi
+; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X86-NEXT: # kill: def $cl killed $cl killed $ecx
+; X86-NEXT: shll %cl, %eax
+; X86-NEXT: orl %eax, %edi
+; X86-NEXT: movl %edi, %ecx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-NEXT: movl 8(%ebp), %eax
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT: movl %edi, 60(%eax)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT: movl %edi, 56(%eax)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT: movl %edi, 52(%eax)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT: movl %edi, 44(%eax)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT: movl %edi, 40(%eax)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT: movl %edi, 36(%eax)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT: movl %edi, 32(%eax)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT: movl %edi, 28(%eax)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT: movl %edi, 24(%eax)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT: movl %edi, 20(%eax)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT: movl %edi, 16(%eax)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT: movl %edi, 12(%eax)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT: movl %edi, 8(%eax)
+; X86-NEXT: movl %edx, 4(%eax)
+; X86-NEXT: movl %ecx, (%eax)
+; X86-NEXT: movl %esi, 48(%eax)
+; X86-NEXT: sete %al
+; X86-NEXT: leal -12(%ebp), %esp
; X86-NEXT: popl %esi
; X86-NEXT: popl %edi
; X86-NEXT: popl %ebx
+; X86-NEXT: popl %ebp
; X86-NEXT: retl
;
; SSE-LABEL: init_eq_i512:
; SSE: # %bb.0:
+; SSE-NEXT: pushq %rbp
+; SSE-NEXT: pushq %r15
+; SSE-NEXT: pushq %r14
+; SSE-NEXT: pushq %r13
+; SSE-NEXT: pushq %r12
+; SSE-NEXT: pushq %rbx
+; SSE-NEXT: subq $216, %rsp
+; SSE-NEXT: xorps %xmm0, %xmm0
+; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp)
+; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp)
+; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp)
+; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE-NEXT: movq $0, {{[0-9]+}}(%rsp)
+; SSE-NEXT: movq $1, {{[0-9]+}}(%rsp)
; SSE-NEXT: movl %esi, %ecx
+; SSE-NEXT: andl $63, %ecx
; SSE-NEXT: shrl $3, %esi
-; SSE-NEXT: andl $60, %esi
-; SSE-NEXT: movl (%rdi,%rsi), %r8d
-; SSE-NEXT: btl %ecx, %r8d
-; SSE-NEXT: setae %al
-; SSE-NEXT: shll %cl, %edx
-; SSE-NEXT: btrl %ecx, %r8d
-; SSE-NEXT: orl %r8d, %edx
-; SSE-NEXT: movl %edx, (%rdi,%rsi)
+; SSE-NEXT: andl $56, %esi
+; SSE-NEXT: negl %esi
+; SSE-NEXT: movslq %esi, %r10
+; SSE-NEXT: movq 184(%rsp,%r10), %r11
+; SSE-NEXT: movq 192(%rsp,%r10), %rsi
+; SSE-NEXT: movq %rsi, %r13
+; SSE-NEXT: shldq %cl, %r11, %r13
+; SSE-NEXT: movq 200(%rsp,%r10), %r15
+; SSE-NEXT: shldq %cl, %rsi, %r15
+; SSE-NEXT: movq 168(%rsp,%r10), %rbx
+; SSE-NEXT: movq 176(%rsp,%r10), %rsi
+; SSE-NEXT: movq %rsi, %r14
+; SSE-NEXT: shldq %cl, %rbx, %r14
+; SSE-NEXT: shldq %cl, %rsi, %r11
+; SSE-NEXT: movq 152(%rsp,%r10), %rax
+; SSE-NEXT: movq 160(%rsp,%r10), %r8
+; SSE-NEXT: movq %r8, %r12
+; SSE-NEXT: shldq %cl, %rax, %r12
+; SSE-NEXT: shldq %cl, %r8, %rbx
+; SSE-NEXT: movq 144(%rsp,%r10), %r9
+; SSE-NEXT: movq %r9, %r8
+; SSE-NEXT: shlq %cl, %r8
+; SSE-NEXT: shldq %cl, %r9, %rax
+; SSE-NEXT: movq %r12, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE-NEXT: movq %r13, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE-NEXT: movl %edx, %edx
+; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp)
+; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp)
+; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp)
+; SSE-NEXT: movaps %xmm0, (%rsp)
+; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; SSE-NEXT: movq %rdx, {{[0-9]+}}(%rsp)
+; SSE-NEXT: movq $0, {{[0-9]+}}(%rsp)
+; SSE-NEXT: movq 16(%rdi), %rdx
+; SSE-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE-NEXT: movq 48(%rdi), %rsi
+; SSE-NEXT: movq %rsi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE-NEXT: andq %rsi, %r13
+; SSE-NEXT: andq %rdx, %r12
+; SSE-NEXT: orq %r13, %r12
+; SSE-NEXT: movq %r15, %rsi
+; SSE-NEXT: movq 56(%rdi), %rdx
+; SSE-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE-NEXT: andq %rdx, %r15
+; SSE-NEXT: movq %rbx, %r13
+; SSE-NEXT: movq 24(%rdi), %rdx
+; SSE-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE-NEXT: andq %rdx, %rbx
+; SSE-NEXT: orq %r15, %rbx
+; SSE-NEXT: movq %r14, %rbp
+; SSE-NEXT: movq 32(%rdi), %rdx
+; SSE-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE-NEXT: andq %rdx, %r14
+; SSE-NEXT: movq %r8, %r15
+; SSE-NEXT: movq (%rdi), %rdx
+; SSE-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE-NEXT: andq %rdx, %r8
+; SSE-NEXT: orq %r14, %r8
+; SSE-NEXT: orq %r12, %r8
+; SSE-NEXT: movq %r11, %r12
+; SSE-NEXT: movq 40(%rdi), %r9
+; SSE-NEXT: andq %r9, %r11
+; SSE-NEXT: movq %rax, %r14
+; SSE-NEXT: movq 8(%rdi), %rdx
+; SSE-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE-NEXT: andq %rdx, %rax
+; SSE-NEXT: orq %r11, %rax
+; SSE-NEXT: orq %rbx, %rax
+; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; SSE-NEXT: notq %rax
+; SSE-NEXT: andq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Folded Reload
+; SSE-NEXT: movq %rax, %rdx
+; SSE-NEXT: movq 56(%rsp,%r10), %r11
+; SSE-NEXT: movq 64(%rsp,%r10), %rax
+; SSE-NEXT: movq %rax, %rbx
+; SSE-NEXT: shldq %cl, %r11, %rbx
+; SSE-NEXT: orq %rbx, %rdx
+; SSE-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE-NEXT: notq %rsi
+; SSE-NEXT: movq 72(%rsp,%r10), %rbx
+; SSE-NEXT: shldq %cl, %rax, %rbx
+; SSE-NEXT: andq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Folded Reload
+; SSE-NEXT: orq %rbx, %rsi
+; SSE-NEXT: notq %rbp
+; SSE-NEXT: andq {{[-0-9]+}}(%r{{[sb]}}p), %rbp # 8-byte Folded Reload
+; SSE-NEXT: movq 40(%rsp,%r10), %rax
+; SSE-NEXT: movq 48(%rsp,%r10), %rdx
+; SSE-NEXT: movq %rdx, %rbx
+; SSE-NEXT: shldq %cl, %rax, %rbx
+; SSE-NEXT: orq %rbx, %rbp
+; SSE-NEXT: notq %r12
+; SSE-NEXT: andq %r9, %r12
+; SSE-NEXT: shldq %cl, %rdx, %r11
+; SSE-NEXT: movq 24(%rsp,%r10), %r9
+; SSE-NEXT: movq 32(%rsp,%r10), %rdx
+; SSE-NEXT: movq %rdx, %rbx
+; SSE-NEXT: shldq %cl, %r9, %rbx
+; SSE-NEXT: orq %r11, %r12
+; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Reload
+; SSE-NEXT: notq %r11
+; SSE-NEXT: andq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Folded Reload
+; SSE-NEXT: shldq %cl, %rdx, %rax
+; SSE-NEXT: orq %rbx, %r11
+; SSE-NEXT: notq %r13
+; SSE-NEXT: andq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Folded Reload
+; SSE-NEXT: orq %rax, %r13
+; SSE-NEXT: notq %r15
+; SSE-NEXT: andq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Folded Reload
+; SSE-NEXT: movq 16(%rsp,%r10), %rax
+; SSE-NEXT: movq %rax, %rdx
+; SSE-NEXT: shlq %cl, %rdx
+; SSE-NEXT: orq %rdx, %r15
+; SSE-NEXT: notq %r14
+; SSE-NEXT: andq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Folded Reload
+; SSE-NEXT: # kill: def $cl killed $cl killed $ecx
+; SSE-NEXT: shldq %cl, %rax, %r9
+; SSE-NEXT: orq %r9, %r14
+; SSE-NEXT: orq %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
+; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; SSE-NEXT: movq %rax, 48(%rdi)
+; SSE-NEXT: movq %rsi, 56(%rdi)
+; SSE-NEXT: movq %rbp, 32(%rdi)
+; SSE-NEXT: movq %r12, 40(%rdi)
+; SSE-NEXT: movq %r11, 16(%rdi)
+; SSE-NEXT: movq %r13, 24(%rdi)
+; SSE-NEXT: movq %r15, (%rdi)
+; SSE-NEXT: movq %r14, 8(%rdi)
+; SSE-NEXT: sete %al
+; SSE-NEXT: addq $216, %rsp
+; SSE-NEXT: popq %rbx
+; SSE-NEXT: popq %r12
+; SSE-NEXT: popq %r13
+; SSE-NEXT: popq %r14
+; SSE-NEXT: popq %r15
+; SSE-NEXT: popq %rbp
; SSE-NEXT: retq
;
-; AVX-LABEL: init_eq_i512:
-; AVX: # %bb.0:
-; AVX-NEXT: movl %esi, %ecx
-; AVX-NEXT: shrl $3, %ecx
-; AVX-NEXT: andl $60, %ecx
-; AVX-NEXT: movl (%rdi,%rcx), %r8d
-; AVX-NEXT: btl %esi, %r8d
-; AVX-NEXT: setae %al
-; AVX-NEXT: btrl %esi, %r8d
-; AVX-NEXT: shlxl %esi, %edx, %edx
-; AVX-NEXT: orl %r8d, %edx
-; AVX-NEXT: movl %edx, (%rdi,%rcx)
-; AVX-NEXT: retq
+; AVX2-LABEL: init_eq_i512:
+; AVX2: # %bb.0:
+; AVX2-NEXT: pushq %rbp
+; AVX2-NEXT: pushq %r15
+; AVX2-NEXT: pushq %r14
+; AVX2-NEXT: pushq %r13
+; AVX2-NEXT: pushq %r12
+; AVX2-NEXT: pushq %rbx
+; AVX2-NEXT: subq $200, %rsp
+; AVX2-NEXT: vxorps %xmm0, %xmm0, %xmm0
+; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovss {{.*#+}} xmm1 = [1,0,0,0]
+; AVX2-NEXT: vmovups %ymm1, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: movl %esi, %r8d
+; AVX2-NEXT: andl $63, %r8d
+; AVX2-NEXT: shrl $3, %esi
+; AVX2-NEXT: andl $56, %esi
+; AVX2-NEXT: negl %esi
+; AVX2-NEXT: movslq %esi, %rsi
+; AVX2-NEXT: movq 144(%rsp,%rsi), %r11
+; AVX2-NEXT: movq 152(%rsp,%rsi), %r12
+; AVX2-NEXT: movq %r12, %r10
+; AVX2-NEXT: movl %r8d, %ecx
+; AVX2-NEXT: shldq %cl, %r11, %r10
+; AVX2-NEXT: movq 176(%rsp,%rsi), %r14
+; AVX2-NEXT: movq 184(%rsp,%rsi), %r9
+; AVX2-NEXT: shldq %cl, %r14, %r9
+; AVX2-NEXT: movq 160(%rsp,%rsi), %r15
+; AVX2-NEXT: movq 168(%rsp,%rsi), %r13
+; AVX2-NEXT: movq %r13, %rbx
+; AVX2-NEXT: shldq %cl, %r15, %rbx
+; AVX2-NEXT: movq 128(%rsp,%rsi), %rbp
+; AVX2-NEXT: movq %rbp, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX2-NEXT: movq 136(%rsp,%rsi), %rax
+; AVX2-NEXT: shldq %cl, %rax, %r11
+; AVX2-NEXT: shldq %cl, %r13, %r14
+; AVX2-NEXT: shldq %cl, %r12, %r15
+; AVX2-NEXT: shldq %cl, %rbp, %rax
+; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX2-NEXT: movl %edx, %edx
+; AVX2-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; AVX2-NEXT: vmovups %xmm1, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; AVX2-NEXT: movq %rdx, (%rsp)
+; AVX2-NEXT: movq $0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: movq 16(%rdi), %r12
+; AVX2-NEXT: movq 48(%rdi), %rbp
+; AVX2-NEXT: movq 32(%rdi), %r13
+; AVX2-NEXT: andnq %r13, %r15, %rax
+; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX2-NEXT: andq %r15, %r13
+; AVX2-NEXT: andnq %rbp, %r14, %rax
+; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX2-NEXT: andq %r14, %rbp
+; AVX2-NEXT: andnq %r12, %r11, %rax
+; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX2-NEXT: andq %r11, %r12
+; AVX2-NEXT: movq 40(%rdi), %rax
+; AVX2-NEXT: orq %rbp, %r12
+; AVX2-NEXT: andnq %rax, %rbx, %rcx
+; AVX2-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX2-NEXT: movq %rax, %rbp
+; AVX2-NEXT: andq %rbx, %rbp
+; AVX2-NEXT: movq 56(%rdi), %rcx
+; AVX2-NEXT: andnq %rcx, %r9, %rbx
+; AVX2-NEXT: andq %r9, %rcx
+; AVX2-NEXT: movq 24(%rdi), %rax
+; AVX2-NEXT: andnq %rax, %r10, %rdx
+; AVX2-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX2-NEXT: andq %r10, %rax
+; AVX2-NEXT: orq %rcx, %rax
+; AVX2-NEXT: shlxq %r8, {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Folded Reload
+; AVX2-NEXT: movq (%rdi), %r10
+; AVX2-NEXT: andnq %r10, %rcx, %r15
+; AVX2-NEXT: andq %rcx, %r10
+; AVX2-NEXT: movq 40(%rsp,%rsi), %rdx
+; AVX2-NEXT: movq 48(%rsp,%rsi), %r11
+; AVX2-NEXT: movq %r11, %r9
+; AVX2-NEXT: movl %r8d, %ecx
+; AVX2-NEXT: shldq %cl, %rdx, %r9
+; AVX2-NEXT: orq %r13, %r10
+; AVX2-NEXT: orq %r12, %r10
+; AVX2-NEXT: movq 8(%rdi), %r13
+; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
+; AVX2-NEXT: andnq %r13, %rcx, %r12
+; AVX2-NEXT: andq %rcx, %r13
+; AVX2-NEXT: orq %rbp, %r13
+; AVX2-NEXT: orq %rax, %r13
+; AVX2-NEXT: movq 56(%rsp,%rsi), %rax
+; AVX2-NEXT: movl %r8d, %ecx
+; AVX2-NEXT: shldq %cl, %r11, %rax
+; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload
+; AVX2-NEXT: orq %r9, %r14
+; AVX2-NEXT: orq %rax, %rbx
+; AVX2-NEXT: movq %rbx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX2-NEXT: movq 24(%rsp,%rsi), %rax
+; AVX2-NEXT: movq 32(%rsp,%rsi), %r9
+; AVX2-NEXT: movq %r9, %r11
+; AVX2-NEXT: shldq %cl, %rax, %r11
+; AVX2-NEXT: shldq %cl, %r9, %rdx
+; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbp # 8-byte Reload
+; AVX2-NEXT: orq %r11, %rbp
+; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Reload
+; AVX2-NEXT: orq %rdx, %rbx
+; AVX2-NEXT: movq 8(%rsp,%rsi), %rdx
+; AVX2-NEXT: movq 16(%rsp,%rsi), %r9
+; AVX2-NEXT: movq %r9, %r11
+; AVX2-NEXT: shldq %cl, %rdx, %r11
+; AVX2-NEXT: shldq %cl, %r9, %rax
+; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Reload
+; AVX2-NEXT: orq %r11, %r9
+; AVX2-NEXT: movq (%rsp,%rsi), %rsi
+; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Reload
+; AVX2-NEXT: orq %rax, %r11
+; AVX2-NEXT: shlxq %r8, %rsi, %rax
+; AVX2-NEXT: shldq %cl, %rsi, %rdx
+; AVX2-NEXT: orq %rax, %r15
+; AVX2-NEXT: orq %rdx, %r12
+; AVX2-NEXT: orq %r10, %r13
+; AVX2-NEXT: movq %r14, 48(%rdi)
+; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; AVX2-NEXT: movq %rax, 56(%rdi)
+; AVX2-NEXT: movq %rbp, 32(%rdi)
+; AVX2-NEXT: movq %rbx, 40(%rdi)
+; AVX2-NEXT: movq %r9, 16(%rdi)
+; AVX2-NEXT: movq %r11, 24(%rdi)
+; AVX2-NEXT: movq %r15, (%rdi)
+; AVX2-NEXT: movq %r12, 8(%rdi)
+; AVX2-NEXT: sete %al
+; AVX2-NEXT: addq $200, %rsp
+; AVX2-NEXT: popq %rbx
+; AVX2-NEXT: popq %r12
+; AVX2-NEXT: popq %r13
+; AVX2-NEXT: popq %r14
+; AVX2-NEXT: popq %r15
+; AVX2-NEXT: popq %rbp
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: init_eq_i512:
+; AVX512: # %bb.0:
+; AVX512-NEXT: pushq %rbp
+; AVX512-NEXT: pushq %r15
+; AVX512-NEXT: pushq %r14
+; AVX512-NEXT: pushq %r13
+; AVX512-NEXT: pushq %r12
+; AVX512-NEXT: pushq %rbx
+; AVX512-NEXT: subq $184, %rsp
+; AVX512-NEXT: vxorps %xmm0, %xmm0, %xmm0
+; AVX512-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: vmovaps {{.*#+}} xmm1 = [1,0,0,0]
+; AVX512-NEXT: vmovups %ymm1, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: movl %esi, %ecx
+; AVX512-NEXT: andl $63, %ecx
+; AVX512-NEXT: shrl $3, %esi
+; AVX512-NEXT: andl $56, %esi
+; AVX512-NEXT: negl %esi
+; AVX512-NEXT: movslq %esi, %rsi
+; AVX512-NEXT: movq 128(%rsp,%rsi), %r10
+; AVX512-NEXT: movq 136(%rsp,%rsi), %r12
+; AVX512-NEXT: movq %r12, %rax
+; AVX512-NEXT: shldq %cl, %r10, %rax
+; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX512-NEXT: movq 160(%rsp,%rsi), %r14
+; AVX512-NEXT: movq 168(%rsp,%rsi), %rax
+; AVX512-NEXT: shldq %cl, %r14, %rax
+; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX512-NEXT: movq 144(%rsp,%rsi), %r15
+; AVX512-NEXT: movq 152(%rsp,%rsi), %r11
+; AVX512-NEXT: movq %r11, %rbx
+; AVX512-NEXT: shldq %cl, %r15, %rbx
+; AVX512-NEXT: movq 120(%rsp,%rsi), %rax
+; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX512-NEXT: shldq %cl, %rax, %r10
+; AVX512-NEXT: shldq %cl, %r11, %r14
+; AVX512-NEXT: movq %rdi, %r9
+; AVX512-NEXT: movq 112(%rsp,%rsi), %r11
+; AVX512-NEXT: shldq %cl, %r12, %r15
+; AVX512-NEXT: movl %edx, %edx
+; AVX512-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; AVX512-NEXT: vmovups %xmm1, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; AVX512-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; AVX512-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; AVX512-NEXT: movq %rdx, -{{[0-9]+}}(%rsp)
+; AVX512-NEXT: movq $0, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: movq 16(%rdi), %r12
+; AVX512-NEXT: movq 48(%rdi), %r13
+; AVX512-NEXT: movq 32(%rdi), %rbp
+; AVX512-NEXT: andnq %rbp, %r15, %rax
+; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX512-NEXT: andq %r15, %rbp
+; AVX512-NEXT: andnq %r13, %r14, %rax
+; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX512-NEXT: andq %r14, %r13
+; AVX512-NEXT: andnq %r12, %r10, %rax
+; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX512-NEXT: andq %r10, %r12
+; AVX512-NEXT: movq 40(%rdi), %r8
+; AVX512-NEXT: orq %r13, %r12
+; AVX512-NEXT: andnq %r8, %rbx, %rdi
+; AVX512-NEXT: andq %rbx, %r8
+; AVX512-NEXT: movq 56(%r9), %r13
+; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
+; AVX512-NEXT: andnq %r13, %rdx, %r10
+; AVX512-NEXT: andq %rdx, %r13
+; AVX512-NEXT: movq 24(%r9), %rax
+; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
+; AVX512-NEXT: andnq %rax, %rdx, %r15
+; AVX512-NEXT: andq %rdx, %rax
+; AVX512-NEXT: orq %r13, %rax
+; AVX512-NEXT: shlxq %rcx, %r11, %r13
+; AVX512-NEXT: movq (%r9), %rdx
+; AVX512-NEXT: andnq %rdx, %r13, %r14
+; AVX512-NEXT: andq %r13, %rdx
+; AVX512-NEXT: orq %rbp, %rdx
+; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbp # 8-byte Reload
+; AVX512-NEXT: shldq %cl, %r11, %rbp
+; AVX512-NEXT: orq %r12, %rdx
+; AVX512-NEXT: movq 8(%r9), %r13
+; AVX512-NEXT: andnq %r13, %rbp, %rbx
+; AVX512-NEXT: andq %rbp, %r13
+; AVX512-NEXT: orq %r8, %r13
+; AVX512-NEXT: movq 24(%rsp,%rsi), %r8
+; AVX512-NEXT: orq %rax, %r13
+; AVX512-NEXT: movq 32(%rsp,%rsi), %rax
+; AVX512-NEXT: movq %rax, %r12
+; AVX512-NEXT: shldq %cl, %r8, %r12
+; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Reload
+; AVX512-NEXT: orq %r12, %r11
+; AVX512-NEXT: movq 40(%rsp,%rsi), %r12
+; AVX512-NEXT: shldq %cl, %rax, %r12
+; AVX512-NEXT: orq %r12, %r10
+; AVX512-NEXT: movq %r10, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX512-NEXT: movq 8(%rsp,%rsi), %rax
+; AVX512-NEXT: movq 16(%rsp,%rsi), %r12
+; AVX512-NEXT: movq %r12, %rbp
+; AVX512-NEXT: shldq %cl, %rax, %rbp
+; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload
+; AVX512-NEXT: orq %rbp, %r10
+; AVX512-NEXT: shldq %cl, %r12, %r8
+; AVX512-NEXT: orq %r8, %rdi
+; AVX512-NEXT: movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX512-NEXT: movq -8(%rsp,%rsi), %r8
+; AVX512-NEXT: movq (%rsp,%rsi), %r12
+; AVX512-NEXT: movq %r12, %rbp
+; AVX512-NEXT: shldq %cl, %r8, %rbp
+; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload
+; AVX512-NEXT: orq %rbp, %rdi
+; AVX512-NEXT: movq -16(%rsp,%rsi), %rsi
+; AVX512-NEXT: shldq %cl, %r12, %rax
+; AVX512-NEXT: orq %rax, %r15
+; AVX512-NEXT: shlxq %rcx, %rsi, %rax
+; AVX512-NEXT: # kill: def $cl killed $cl killed $rcx
+; AVX512-NEXT: shldq %cl, %rsi, %r8
+; AVX512-NEXT: orq %rax, %r14
+; AVX512-NEXT: orq %r8, %rbx
+; AVX512-NEXT: orq %rdx, %r13
+; AVX512-NEXT: movq %r11, 48(%r9)
+; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; AVX512-NEXT: movq %rax, 56(%r9)
+; AVX512-NEXT: movq %r10, 32(%r9)
+; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; AVX512-NEXT: movq %rax, 40(%r9)
+; AVX512-NEXT: movq %rdi, 16(%r9)
+; AVX512-NEXT: movq %r15, 24(%r9)
+; AVX512-NEXT: movq %r14, (%r9)
+; AVX512-NEXT: movq %rbx, 8(%r9)
+; AVX512-NEXT: sete %al
+; AVX512-NEXT: addq $184, %rsp
+; AVX512-NEXT: popq %rbx
+; AVX512-NEXT: popq %r12
+; AVX512-NEXT: popq %r13
+; AVX512-NEXT: popq %r14
+; AVX512-NEXT: popq %r15
+; AVX512-NEXT: popq %rbp
+; AVX512-NEXT: vzeroupper
+; AVX512-NEXT: retq
%rem = and i32 %position, 511
%ofs = zext nneg i32 %rem to i512
%bit = shl nuw i512 1, %ofs
@@ -872,25 +4274,2749 @@ define i1 @init_eq_i512(ptr %word, i32 %position, i1 zeroext %value) nounwind {
define i1 @test_ne_i4096(ptr %word, i32 %position) nounwind {
; X86-LABEL: test_ne_i4096:
; X86: # %bb.0:
-; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: movl %ecx, %edx
-; X86-NEXT: andl $4064, %edx # imm = 0xFE0
-; X86-NEXT: shrl $3, %edx
-; X86-NEXT: movl (%eax,%edx), %eax
-; X86-NEXT: btl %ecx, %eax
-; X86-NEXT: setb %al
+; X86-NEXT: pushl %ebp
+; X86-NEXT: movl %esp, %ebp
+; X86-NEXT: pushl %ebx
+; X86-NEXT: pushl %edi
+; X86-NEXT: pushl %esi
+; X86-NEXT: andl $-16, %esp
+; X86-NEXT: subl $1792, %esp # imm = 0x700
+; X86-NEXT: movl 12(%ebp), %ebx
+; X86-NEXT: movl %ebx, %ecx
+; X86-NEXT: shrl $3, %ecx
+; X86-NEXT: andl $508, %ecx # imm = 0x1FC
+; X86-NEXT: leal {{[0-9]+}}(%esp), %esi
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: subl %ecx, %esi
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $1, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT: movl 248(%esi), %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 252(%esi), %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: andl $31, %ebx
+; X86-NEXT: movl %ebx, %ecx
+; X86-NEXT: shldl %cl, %eax, %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 504(%esi), %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 508(%esi), %edx
+; X86-NEXT: shldl %cl, %eax, %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 120(%esi), %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 124(%esi), %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: shldl %cl, %eax, %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 376(%esi), %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 380(%esi), %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: shldl %cl, %eax, %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 184(%esi), %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 188(%esi), %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: shldl %cl, %eax, %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 440(%esi), %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 444(%esi), %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: shldl %cl, %edx, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 56(%esi), %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 60(%esi), %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: shldl %cl, %edx, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 312(%esi), %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 316(%esi), %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: shldl %cl, %edx, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 216(%esi), %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 220(%esi), %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: shldl %cl, %edx, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 472(%esi), %edi
+; X86-NEXT: movl 476(%esi), %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: shldl %cl, %edi, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 88(%esi), %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 92(%esi), %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: shldl %cl, %edx, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 344(%esi), %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 348(%esi), %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: shldl %cl, %edx, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 152(%esi), %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 156(%esi), %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: shldl %cl, %edx, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 408(%esi), %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 412(%esi), %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: shldl %cl, %edx, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 24(%esi), %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 28(%esi), %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: shldl %cl, %edx, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 280(%esi), %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 284(%esi), %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: shldl %cl, %edx, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 232(%esi), %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 236(%esi), %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: shldl %cl, %edx, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 488(%esi), %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 492(%esi), %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: shldl %cl, %edx, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 104(%esi), %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 108(%esi), %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: shldl %cl, %edx, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 360(%esi), %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 364(%esi), %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: shldl %cl, %edx, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 168(%esi), %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 172(%esi), %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: shldl %cl, %edx, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 424(%esi), %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 428(%esi), %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: shldl %cl, %edx, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 40(%esi), %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 44(%esi), %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: shldl %cl, %edx, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 296(%esi), %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 300(%esi), %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: shldl %cl, %edx, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 200(%esi), %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 204(%esi), %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: shldl %cl, %edx, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 456(%esi), %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 460(%esi), %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: shldl %cl, %edx, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 72(%esi), %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 76(%esi), %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: shldl %cl, %edx, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 328(%esi), %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 332(%esi), %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: shldl %cl, %edx, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 136(%esi), %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 140(%esi), %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: shldl %cl, %edx, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 392(%esi), %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 396(%esi), %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: shldl %cl, %edx, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 8(%esi), %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 12(%esi), %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: shldl %cl, %edx, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 264(%esi), %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 268(%esi), %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: shldl %cl, %edx, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 240(%esi), %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 244(%esi), %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: shldl %cl, %edx, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 496(%esi), %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 500(%esi), %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: shldl %cl, %edx, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 112(%esi), %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 116(%esi), %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: shldl %cl, %edx, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 368(%esi), %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 372(%esi), %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: shldl %cl, %edx, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 176(%esi), %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 180(%esi), %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: shldl %cl, %edx, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 432(%esi), %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 436(%esi), %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: shldl %cl, %edx, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 48(%esi), %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 52(%esi), %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: shldl %cl, %edx, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 304(%esi), %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 308(%esi), %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: shldl %cl, %edx, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 208(%esi), %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 212(%esi), %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: shldl %cl, %edx, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 464(%esi), %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 468(%esi), %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: shldl %cl, %edx, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 80(%esi), %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 84(%esi), %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: shldl %cl, %edx, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 336(%esi), %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 340(%esi), %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: shldl %cl, %edx, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 144(%esi), %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 148(%esi), %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: shldl %cl, %edx, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 400(%esi), %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 404(%esi), %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: shldl %cl, %edx, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 16(%esi), %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 20(%esi), %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: shldl %cl, %edx, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 272(%esi), %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 276(%esi), %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: shldl %cl, %edx, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 224(%esi), %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 228(%esi), %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: shldl %cl, %edx, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 480(%esi), %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 484(%esi), %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: shldl %cl, %edx, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 96(%esi), %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 100(%esi), %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: shldl %cl, %edx, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 352(%esi), %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 356(%esi), %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: shldl %cl, %edx, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 160(%esi), %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 164(%esi), %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: shldl %cl, %edx, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 416(%esi), %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 420(%esi), %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: shldl %cl, %edx, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 32(%esi), %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 36(%esi), %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: shldl %cl, %edx, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 288(%esi), %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 292(%esi), %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: shldl %cl, %edx, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 192(%esi), %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 196(%esi), %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: shldl %cl, %eax, %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 448(%esi), %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 452(%esi), %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: shldl %cl, %edx, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 64(%esi), %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 68(%esi), %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: shldl %cl, %edx, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 320(%esi), %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 324(%esi), %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: shldl %cl, %edx, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 128(%esi), %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 132(%esi), %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: shldl %cl, %edx, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl %edi, %edx
+; X86-NEXT: movl 256(%esi), %edi
+; X86-NEXT: movl 260(%esi), %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: shldl %cl, %edi, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT: shldl %cl, %edi, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: shldl %cl, %eax, %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: movl 388(%esi), %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: shldl %cl, %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: movl 4(%esi), %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: shldl %cl, %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: shrdl $1, %eax, %edi
+; X86-NEXT: shrl %eax
+; X86-NEXT: movl %ebx, %edx
+; X86-NEXT: movl %eax, %ebx
+; X86-NEXT: notb %cl
+; X86-NEXT: shrdl %cl, %eax, %edi
+; X86-NEXT: shrl %cl, %ebx
+; X86-NEXT: movb $32, %cl
+; X86-NEXT: testb %cl, %cl
+; X86-NEXT: movl (%esi), %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 8(%ebp), %eax
+; X86-NEXT: jne .LBB20_2
+; X86-NEXT: # %bb.1:
+; X86-NEXT: movl %edi, %ebx
+; X86-NEXT: .LBB20_2:
+; X86-NEXT: movl %edx, %ecx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: # kill: def $cl killed $cl killed $ecx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT: shll %cl, %edx
+; X86-NEXT: orl %ebx, %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: andl 320(%eax), %ecx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT: andl 64(%eax), %esi
+; X86-NEXT: orl %ecx, %esi
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT: andl 448(%eax), %edx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: andl 192(%eax), %ecx
+; X86-NEXT: orl %edx, %ecx
+; X86-NEXT: orl %esi, %ecx
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: andl 288(%eax), %ecx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT: andl 32(%eax), %edx
+; X86-NEXT: orl %ecx, %edx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: andl 416(%eax), %ecx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT: andl 160(%eax), %edi
+; X86-NEXT: orl %ecx, %edi
+; X86-NEXT: orl %edx, %edi
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: andl 352(%eax), %ecx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT: andl 96(%eax), %esi
+; X86-NEXT: orl %ecx, %esi
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT: andl 480(%eax), %edx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: andl 224(%eax), %ecx
+; X86-NEXT: orl %edx, %ecx
+; X86-NEXT: orl %esi, %ecx
+; X86-NEXT: orl %edi, %ecx
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: andl 272(%eax), %ecx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT: andl 16(%eax), %edx
+; X86-NEXT: orl %ecx, %edx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: andl 400(%eax), %ecx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT: andl 144(%eax), %esi
+; X86-NEXT: orl %ecx, %esi
+; X86-NEXT: orl %edx, %esi
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: andl 336(%eax), %ecx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT: andl 80(%eax), %edx
+; X86-NEXT: orl %ecx, %edx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: andl 464(%eax), %ecx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT: andl 208(%eax), %edi
+; X86-NEXT: orl %ecx, %edi
+; X86-NEXT: orl %edx, %edi
+; X86-NEXT: orl %esi, %edi
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: andl 304(%eax), %ecx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT: andl 48(%eax), %edx
+; X86-NEXT: orl %ecx, %edx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: andl 432(%eax), %ecx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT: andl 176(%eax), %esi
+; X86-NEXT: orl %ecx, %esi
+; X86-NEXT: orl %edx, %esi
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: andl 368(%eax), %ecx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT: andl 112(%eax), %edx
+; X86-NEXT: orl %ecx, %edx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: andl 496(%eax), %ecx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT: andl 240(%eax), %ebx
+; X86-NEXT: orl %ecx, %ebx
+; X86-NEXT: orl %edx, %ebx
+; X86-NEXT: orl %esi, %ebx
+; X86-NEXT: orl %edi, %ebx
+; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: andl 264(%eax), %ecx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT: andl 8(%eax), %edx
+; X86-NEXT: orl %ecx, %edx
+; X86-NEXT: movl %eax, %ebx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: andl 392(%ebx), %eax
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: andl 136(%ebx), %ecx
+; X86-NEXT: orl %eax, %ecx
+; X86-NEXT: orl %edx, %ecx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT: andl 328(%ebx), %edx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: andl 72(%ebx), %eax
+; X86-NEXT: orl %edx, %eax
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT: andl 456(%ebx), %edi
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT: andl 200(%ebx), %esi
+; X86-NEXT: orl %edi, %esi
+; X86-NEXT: orl %eax, %esi
+; X86-NEXT: orl %ecx, %esi
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT: andl 296(%ebx), %edi
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: andl 40(%ebx), %eax
+; X86-NEXT: orl %edi, %eax
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT: andl 424(%ebx), %edi
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT: andl 168(%ebx), %edx
+; X86-NEXT: orl %edi, %edx
+; X86-NEXT: orl %eax, %edx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: andl 360(%ebx), %ecx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: andl 104(%ebx), %eax
+; X86-NEXT: orl %ecx, %eax
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: andl 488(%ebx), %ecx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT: andl 232(%ebx), %edi
+; X86-NEXT: orl %ecx, %edi
+; X86-NEXT: orl %eax, %edi
+; X86-NEXT: orl %edx, %edi
+; X86-NEXT: orl %esi, %edi
+; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: andl 280(%ebx), %eax
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: andl 24(%ebx), %ecx
+; X86-NEXT: orl %eax, %ecx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: andl 408(%ebx), %eax
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT: andl 152(%ebx), %edx
+; X86-NEXT: orl %eax, %edx
+; X86-NEXT: orl %ecx, %edx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: andl 344(%ebx), %eax
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: andl 88(%ebx), %ecx
+; X86-NEXT: orl %eax, %ecx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: andl 472(%ebx), %eax
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT: andl 216(%ebx), %esi
+; X86-NEXT: orl %eax, %esi
+; X86-NEXT: orl %ecx, %esi
+; X86-NEXT: orl %edx, %esi
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: andl 312(%ebx), %eax
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: andl 56(%ebx), %ecx
+; X86-NEXT: orl %eax, %ecx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: andl 440(%ebx), %eax
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT: andl 184(%ebx), %edx
+; X86-NEXT: orl %eax, %edx
+; X86-NEXT: orl %ecx, %edx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: andl 376(%ebx), %eax
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: andl 120(%ebx), %ecx
+; X86-NEXT: orl %eax, %ecx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: andl 504(%ebx), %eax
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT: andl 248(%ebx), %edi
+; X86-NEXT: orl %eax, %edi
+; X86-NEXT: orl %ecx, %edi
+; X86-NEXT: orl %edx, %edi
+; X86-NEXT: orl %esi, %edi
+; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: andl 324(%ebx), %eax
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: andl 68(%ebx), %ecx
+; X86-NEXT: orl %eax, %ecx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: andl 452(%ebx), %eax
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT: andl 196(%ebx), %edx
+; X86-NEXT: orl %eax, %edx
+; X86-NEXT: orl %ecx, %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: andl 292(%ebx), %eax
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: andl 36(%ebx), %ecx
+; X86-NEXT: orl %eax, %ecx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: andl 420(%ebx), %eax
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT: andl 164(%ebx), %edx
+; X86-NEXT: orl %eax, %edx
+; X86-NEXT: orl %ecx, %edx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: andl 356(%ebx), %eax
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: andl 100(%ebx), %ecx
+; X86-NEXT: orl %eax, %ecx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: andl 484(%ebx), %eax
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT: andl 228(%ebx), %esi
+; X86-NEXT: orl %eax, %esi
+; X86-NEXT: orl %ecx, %esi
+; X86-NEXT: orl %edx, %esi
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: andl 276(%ebx), %eax
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: andl 20(%ebx), %ecx
+; X86-NEXT: orl %eax, %ecx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: andl 404(%ebx), %eax
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT: andl 148(%ebx), %edx
+; X86-NEXT: orl %eax, %edx
+; X86-NEXT: orl %ecx, %edx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: andl 340(%ebx), %eax
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: andl 84(%ebx), %ecx
+; X86-NEXT: orl %eax, %ecx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: andl 468(%ebx), %eax
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT: andl 212(%ebx), %esi
+; X86-NEXT: orl %eax, %esi
+; X86-NEXT: orl %ecx, %esi
+; X86-NEXT: orl %edx, %esi
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: andl 308(%ebx), %eax
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: andl 52(%ebx), %ecx
+; X86-NEXT: orl %eax, %ecx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: andl 436(%ebx), %eax
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT: andl 180(%ebx), %edx
+; X86-NEXT: orl %eax, %edx
+; X86-NEXT: orl %ecx, %edx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: andl 372(%ebx), %eax
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: andl 116(%ebx), %ecx
+; X86-NEXT: orl %eax, %ecx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: andl 500(%ebx), %eax
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT: andl 244(%ebx), %edi
+; X86-NEXT: orl %eax, %edi
+; X86-NEXT: orl %ecx, %edi
+; X86-NEXT: orl %edx, %edi
+; X86-NEXT: orl %esi, %edi
+; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: andl 268(%ebx), %eax
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: andl 12(%ebx), %ecx
+; X86-NEXT: orl %eax, %ecx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: andl 396(%ebx), %eax
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT: andl 140(%ebx), %edx
+; X86-NEXT: orl %eax, %edx
+; X86-NEXT: orl %ecx, %edx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: andl 332(%ebx), %eax
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: andl 76(%ebx), %ecx
+; X86-NEXT: orl %eax, %ecx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: andl 460(%ebx), %eax
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT: andl 204(%ebx), %edi
+; X86-NEXT: orl %eax, %edi
+; X86-NEXT: orl %ecx, %edi
+; X86-NEXT: orl %edx, %edi
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: andl 300(%ebx), %eax
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: andl 44(%ebx), %ecx
+; X86-NEXT: orl %eax, %ecx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: andl 428(%ebx), %eax
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT: andl 172(%ebx), %edx
+; X86-NEXT: orl %eax, %edx
+; X86-NEXT: orl %ecx, %edx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: andl 364(%ebx), %eax
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: andl 108(%ebx), %ecx
+; X86-NEXT: orl %eax, %ecx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: andl 492(%ebx), %eax
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT: andl 236(%ebx), %esi
+; X86-NEXT: orl %eax, %esi
+; X86-NEXT: orl %ecx, %esi
+; X86-NEXT: orl %edx, %esi
+; X86-NEXT: orl %edi, %esi
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: andl 284(%ebx), %eax
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: andl 28(%ebx), %ecx
+; X86-NEXT: orl %eax, %ecx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: andl 412(%ebx), %eax
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT: andl 156(%ebx), %edi
+; X86-NEXT: orl %eax, %edi
+; X86-NEXT: orl %ecx, %edi
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: andl 348(%ebx), %eax
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT: andl 92(%ebx), %edx
+; X86-NEXT: orl %eax, %edx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: andl 476(%ebx), %ecx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: andl 220(%ebx), %eax
+; X86-NEXT: orl %ecx, %eax
+; X86-NEXT: orl %edx, %eax
+; X86-NEXT: orl %edi, %eax
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: andl 316(%ebx), %ecx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT: andl 60(%ebx), %edx
+; X86-NEXT: orl %ecx, %edx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: andl 444(%ebx), %ecx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT: andl 188(%ebx), %edi
+; X86-NEXT: orl %ecx, %edi
+; X86-NEXT: orl %edx, %edi
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: andl 380(%ebx), %ecx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT: andl 124(%ebx), %edx
+; X86-NEXT: orl %ecx, %edx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: andl 508(%ebx), %ecx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT: movl 8(%ebp), %esi
+; X86-NEXT: andl 252(%esi), %ebx
+; X86-NEXT: orl %ecx, %ebx
+; X86-NEXT: orl %edx, %ebx
+; X86-NEXT: orl %edi, %ebx
+; X86-NEXT: orl %eax, %ebx
+; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: negl %ecx
+; X86-NEXT: movl 1648(%esp,%ecx), %edi
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT: shldl %cl, %edi, %esi
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: shldl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: # kill: def $cl killed $cl killed $ecx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT: shldl %cl, %edx, %edi
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: movl 8(%ebp), %edx
+; X86-NEXT: andl 128(%edx), %ecx
+; X86-NEXT: andl 384(%edx), %edi
+; X86-NEXT: orl %ecx, %edi
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: # kill: def $cl killed $cl killed $ecx
+; X86-NEXT: shll %cl, %eax
+; X86-NEXT: andl (%edx), %eax
+; X86-NEXT: orl %eax, %edi
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: andl 256(%edx), %eax
+; X86-NEXT: orl %eax, %edi
+; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: andl 260(%edx), %eax
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: andl 4(%edx), %ecx
+; X86-NEXT: orl %eax, %ecx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: andl 132(%edx), %eax
+; X86-NEXT: andl 388(%edx), %esi
+; X86-NEXT: orl %eax, %esi
+; X86-NEXT: orl %ecx, %esi
+; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X86-NEXT: orl %ebx, %esi
+; X86-NEXT: orl %edi, %esi
+; X86-NEXT: setne %al
+; X86-NEXT: leal -12(%ebp), %esp
+; X86-NEXT: popl %esi
+; X86-NEXT: popl %edi
+; X86-NEXT: popl %ebx
+; X86-NEXT: popl %ebp
; X86-NEXT: retl
;
-; X64-LABEL: test_ne_i4096:
-; X64: # %bb.0:
-; X64-NEXT: movl %esi, %eax
-; X64-NEXT: andl $4064, %eax # imm = 0xFE0
-; X64-NEXT: shrl $3, %eax
-; X64-NEXT: movl (%rdi,%rax), %eax
-; X64-NEXT: btl %esi, %eax
-; X64-NEXT: setb %al
-; X64-NEXT: retq
+; SSE-LABEL: test_ne_i4096:
+; SSE: # %bb.0:
+; SSE-NEXT: pushq %rbp
+; SSE-NEXT: pushq %r15
+; SSE-NEXT: pushq %r14
+; SSE-NEXT: pushq %r13
+; SSE-NEXT: pushq %r12
+; SSE-NEXT: pushq %rbx
+; SSE-NEXT: subq $1576, %rsp # imm = 0x628
+; SSE-NEXT: movl %esi, %ecx
+; SSE-NEXT: movl %esi, %eax
+; SSE-NEXT: andl $4032, %eax # imm = 0xFC0
+; SSE-NEXT: xorps %xmm0, %xmm0
+; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp)
+; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp)
+; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp)
+; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp)
+; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp)
+; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp)
+; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp)
+; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp)
+; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp)
+; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp)
+; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp)
+; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp)
+; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp)
+; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp)
+; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp)
+; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp)
+; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp)
+; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp)
+; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp)
+; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp)
+; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp)
+; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp)
+; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp)
+; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp)
+; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp)
+; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp)
+; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp)
+; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp)
+; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp)
+; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp)
+; SSE-NEXT: movups %xmm0, {{[0-9]+}}(%rsp)
+; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp)
+; SSE-NEXT: movq $0, {{[0-9]+}}(%rsp)
+; SSE-NEXT: movq $1, {{[0-9]+}}(%rsp)
+; SSE-NEXT: andl $63, %ecx
+; SSE-NEXT: shrl $3, %eax
+; SSE-NEXT: negl %eax
+; SSE-NEXT: movslq %eax, %rsi
+; SSE-NEXT: movq 1296(%rsp,%rsi), %rdx
+; SSE-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE-NEXT: movq 1304(%rsp,%rsi), %rax
+; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE-NEXT: shldq %cl, %rdx, %rax
+; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE-NEXT: movq 1552(%rsp,%rsi), %rdx
+; SSE-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE-NEXT: movq 1560(%rsp,%rsi), %rax
+; SSE-NEXT: shldq %cl, %rdx, %rax
+; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE-NEXT: movq 1168(%rsp,%rsi), %rdx
+; SSE-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE-NEXT: movq 1176(%rsp,%rsi), %rax
+; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE-NEXT: shldq %cl, %rdx, %rax
+; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE-NEXT: movq 1424(%rsp,%rsi), %rdx
+; SSE-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE-NEXT: movq 1432(%rsp,%rsi), %rax
+; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE-NEXT: shldq %cl, %rdx, %rax
+; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE-NEXT: movq 1232(%rsp,%rsi), %rdx
+; SSE-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE-NEXT: movq 1240(%rsp,%rsi), %rax
+; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE-NEXT: shldq %cl, %rdx, %rax
+; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE-NEXT: movq 1488(%rsp,%rsi), %rdx
+; SSE-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE-NEXT: movq 1496(%rsp,%rsi), %rax
+; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE-NEXT: shldq %cl, %rdx, %rax
+; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE-NEXT: movq 1104(%rsp,%rsi), %rdx
+; SSE-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE-NEXT: movq 1112(%rsp,%rsi), %rax
+; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE-NEXT: shldq %cl, %rdx, %rax
+; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE-NEXT: movq 1360(%rsp,%rsi), %rdx
+; SSE-NEXT: movq %rdx, (%rsp) # 8-byte Spill
+; SSE-NEXT: movq 1368(%rsp,%rsi), %rax
+; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE-NEXT: shldq %cl, %rdx, %rax
+; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE-NEXT: movq 1264(%rsp,%rsi), %rdx
+; SSE-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE-NEXT: movq 1272(%rsp,%rsi), %rax
+; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE-NEXT: shldq %cl, %rdx, %rax
+; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE-NEXT: movq 1520(%rsp,%rsi), %rdx
+; SSE-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE-NEXT: movq 1528(%rsp,%rsi), %rax
+; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE-NEXT: shldq %cl, %rdx, %rax
+; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE-NEXT: movq 1136(%rsp,%rsi), %rdx
+; SSE-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE-NEXT: movq 1144(%rsp,%rsi), %rax
+; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE-NEXT: shldq %cl, %rdx, %rax
+; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE-NEXT: movq 1392(%rsp,%rsi), %rdx
+; SSE-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE-NEXT: movq 1400(%rsp,%rsi), %rax
+; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE-NEXT: shldq %cl, %rdx, %rax
+; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE-NEXT: movq 1200(%rsp,%rsi), %rdx
+; SSE-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE-NEXT: movq 1208(%rsp,%rsi), %rax
+; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE-NEXT: shldq %cl, %rdx, %rax
+; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE-NEXT: movq 1456(%rsp,%rsi), %rdx
+; SSE-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE-NEXT: movq 1464(%rsp,%rsi), %rax
+; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE-NEXT: shldq %cl, %rdx, %rax
+; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE-NEXT: movq 1072(%rsp,%rsi), %rdx
+; SSE-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE-NEXT: movq 1080(%rsp,%rsi), %rax
+; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE-NEXT: shldq %cl, %rdx, %rax
+; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE-NEXT: movq 1328(%rsp,%rsi), %rdx
+; SSE-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE-NEXT: movq 1336(%rsp,%rsi), %rax
+; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE-NEXT: shldq %cl, %rdx, %rax
+; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE-NEXT: movq 1280(%rsp,%rsi), %rdx
+; SSE-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE-NEXT: movq 1288(%rsp,%rsi), %rax
+; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE-NEXT: shldq %cl, %rdx, %rax
+; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE-NEXT: movq 1536(%rsp,%rsi), %rdx
+; SSE-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE-NEXT: movq 1544(%rsp,%rsi), %rax
+; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE-NEXT: shldq %cl, %rdx, %rax
+; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE-NEXT: movq 1152(%rsp,%rsi), %rdx
+; SSE-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE-NEXT: movq 1160(%rsp,%rsi), %rax
+; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE-NEXT: shldq %cl, %rdx, %rax
+; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE-NEXT: movq 1408(%rsp,%rsi), %rdx
+; SSE-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE-NEXT: movq 1416(%rsp,%rsi), %rax
+; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE-NEXT: shldq %cl, %rdx, %rax
+; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE-NEXT: movq 1216(%rsp,%rsi), %r11
+; SSE-NEXT: movq 1224(%rsp,%rsi), %rax
+; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE-NEXT: shldq %cl, %r11, %rax
+; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE-NEXT: movq 1472(%rsp,%rsi), %rdx
+; SSE-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE-NEXT: movq 1480(%rsp,%rsi), %rax
+; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE-NEXT: shldq %cl, %rdx, %rax
+; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE-NEXT: movq 1088(%rsp,%rsi), %r9
+; SSE-NEXT: movq 1096(%rsp,%rsi), %rax
+; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE-NEXT: shldq %cl, %r9, %rax
+; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE-NEXT: movq 1344(%rsp,%rsi), %rdx
+; SSE-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE-NEXT: movq 1352(%rsp,%rsi), %rax
+; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE-NEXT: shldq %cl, %rdx, %rax
+; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE-NEXT: movq 1248(%rsp,%rsi), %rax
+; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE-NEXT: movq 1256(%rsp,%rsi), %rdx
+; SSE-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE-NEXT: shldq %cl, %rax, %rdx
+; SSE-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE-NEXT: movq 1504(%rsp,%rsi), %rdx
+; SSE-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE-NEXT: movq 1512(%rsp,%rsi), %rax
+; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE-NEXT: shldq %cl, %rdx, %rax
+; SSE-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE-NEXT: movq 1120(%rsp,%rsi), %rax
+; SSE-NEXT: movq 1128(%rsp,%rsi), %r8
+; SSE-NEXT: movq %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE-NEXT: shldq %cl, %rax, %r8
+; SSE-NEXT: movq %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE-NEXT: movq 1376(%rsp,%rsi), %r13
+; SSE-NEXT: movq 1384(%rsp,%rsi), %rbx
+; SSE-NEXT: movq %rbx, %r8
+; SSE-NEXT: shldq %cl, %r13, %r8
+; SSE-NEXT: movq %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE-NEXT: movq 1184(%rsp,%rsi), %rdx
+; SSE-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE-NEXT: movq 1192(%rsp,%rsi), %r15
+; SSE-NEXT: movq %r15, %r14
+; SSE-NEXT: shldq %cl, %rdx, %r14
+; SSE-NEXT: movq %r14, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE-NEXT: movq 1440(%rsp,%rsi), %r10
+; SSE-NEXT: movq 1448(%rsp,%rsi), %rdx
+; SSE-NEXT: movq %rdx, %r14
+; SSE-NEXT: shldq %cl, %r10, %r14
+; SSE-NEXT: movq %r14, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE-NEXT: movq 1312(%rsp,%rsi), %r14
+; SSE-NEXT: movq %r14, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE-NEXT: movq 1320(%rsp,%rsi), %rbp
+; SSE-NEXT: movq %rbp, %r12
+; SSE-NEXT: shldq %cl, %r14, %r12
+; SSE-NEXT: movq %r12, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
+; SSE-NEXT: shldq %cl, %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
+; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
+; SSE-NEXT: shldq %cl, %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
+; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
+; SSE-NEXT: shldq %cl, %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
+; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
+; SSE-NEXT: shldq %cl, %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
+; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
+; SSE-NEXT: shldq %cl, %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
+; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
+; SSE-NEXT: shldq %cl, %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
+; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
+; SSE-NEXT: shldq %cl, %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
+; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
+; SSE-NEXT: shldq %cl, %r8, (%rsp) # 8-byte Folded Spill
+; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
+; SSE-NEXT: shldq %cl, %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
+; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
+; SSE-NEXT: shldq %cl, %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
+; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
+; SSE-NEXT: shldq %cl, %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
+; SSE-NEXT: shldq %cl, %rbx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
+; SSE-NEXT: shldq %cl, %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
+; SSE-NEXT: shldq %cl, %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
+; SSE-NEXT: movq 1064(%rsp,%rsi), %rbx
+; SSE-NEXT: shldq %cl, %rbx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
+; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload
+; SSE-NEXT: shldq %cl, %rbp, %r14
+; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload
+; SSE-NEXT: shldq %cl, %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
+; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
+; SSE-NEXT: shldq %cl, %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
+; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
+; SSE-NEXT: shldq %cl, %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
+; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
+; SSE-NEXT: shldq %cl, %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
+; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
+; SSE-NEXT: shldq %cl, %rdx, %r11
+; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
+; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload
+; SSE-NEXT: shldq %cl, %r15, %rdx
+; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload
+; SSE-NEXT: shldq %cl, %r15, %r9
+; SSE-NEXT: movq %r9, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
+; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload
+; SSE-NEXT: shldq %cl, %r15, %r8
+; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbp # 8-byte Reload
+; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload
+; SSE-NEXT: shldq %cl, %r15, %rbp
+; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload
+; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Reload
+; SSE-NEXT: shldq %cl, %r15, %r9
+; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload
+; SSE-NEXT: shldq %cl, %r15, %rax
+; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload
+; SSE-NEXT: shldq %cl, %r15, %r13
+; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload
+; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r12 # 8-byte Reload
+; SSE-NEXT: shldq %cl, %r12, %r15
+; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r12 # 8-byte Reload
+; SSE-NEXT: shldq %cl, %r12, %r10
+; SSE-NEXT: andq 384(%rdi), %r10
+; SSE-NEXT: andq 128(%rdi), %r15
+; SSE-NEXT: andq 320(%rdi), %r13
+; SSE-NEXT: andq 64(%rdi), %rax
+; SSE-NEXT: orq %r10, %r15
+; SSE-NEXT: movq %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE-NEXT: orq %r13, %rax
+; SSE-NEXT: andq 448(%rdi), %r9
+; SSE-NEXT: andq 192(%rdi), %rbp
+; SSE-NEXT: orq %r9, %rbp
+; SSE-NEXT: orq %rax, %rbp
+; SSE-NEXT: movq %rbp, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE-NEXT: andq 288(%rdi), %r8
+; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Reload
+; SSE-NEXT: andq 32(%rdi), %r9
+; SSE-NEXT: andq 416(%rdi), %rdx
+; SSE-NEXT: andq 160(%rdi), %r11
+; SSE-NEXT: orq %r8, %r9
+; SSE-NEXT: orq %rdx, %r11
+; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
+; SSE-NEXT: andq 352(%rdi), %rdx
+; SSE-NEXT: orq %r9, %r11
+; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; SSE-NEXT: andq 96(%rdi), %rax
+; SSE-NEXT: orq %rdx, %rax
+; SSE-NEXT: movq %rax, %rdx
+; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; SSE-NEXT: andq 480(%rdi), %rax
+; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
+; SSE-NEXT: andq 224(%rdi), %r8
+; SSE-NEXT: orq %rax, %r8
+; SSE-NEXT: orq %rdx, %r8
+; SSE-NEXT: andq 272(%rdi), %r14
+; SSE-NEXT: orq %r11, %r8
+; SSE-NEXT: movq %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; SSE-NEXT: andq 16(%rdi), %rax
+; SSE-NEXT: orq %r14, %rax
+; SSE-NEXT: movq %rax, %r8
+; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
+; SSE-NEXT: andq 400(%rdi), %rdx
+; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; SSE-NEXT: andq 144(%rdi), %rax
+; SSE-NEXT: orq %rdx, %rax
+; SSE-NEXT: orq %r8, %rax
+; SSE-NEXT: movq %rax, %r8
+; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Reload
+; SSE-NEXT: andq 336(%rdi), %r9
+; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; SSE-NEXT: andq 80(%rdi), %rax
+; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
+; SSE-NEXT: andq 464(%rdi), %rdx
+; SSE-NEXT: orq %r9, %rax
+; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Reload
+; SSE-NEXT: andq 208(%rdi), %r11
+; SSE-NEXT: orq %rdx, %r11
+; SSE-NEXT: orq %rax, %r11
+; SSE-NEXT: orq %r8, %r11
+; SSE-NEXT: movq (%rsp), %rdx # 8-byte Reload
+; SSE-NEXT: andq 304(%rdi), %rdx
+; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; SSE-NEXT: andq 48(%rdi), %rax
+; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Reload
+; SSE-NEXT: andq 432(%rdi), %r9
+; SSE-NEXT: orq %rdx, %rax
+; SSE-NEXT: movq %rax, %r10
+; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
+; SSE-NEXT: andq 176(%rdi), %r8
+; SSE-NEXT: orq %r9, %r8
+; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Reload
+; SSE-NEXT: andq 368(%rdi), %r9
+; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; SSE-NEXT: andq 112(%rdi), %rax
+; SSE-NEXT: orq %r10, %r8
+; SSE-NEXT: movq %r8, %r10
+; SSE-NEXT: orq %r9, %rax
+; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
+; SSE-NEXT: andq 496(%rdi), %r8
+; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbp # 8-byte Reload
+; SSE-NEXT: andq 240(%rdi), %rbp
+; SSE-NEXT: orq %r8, %rbp
+; SSE-NEXT: orq %rax, %rbp
+; SSE-NEXT: orq %r10, %rbp
+; SSE-NEXT: orq %r11, %rbp
+; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; SSE-NEXT: andq 392(%rdi), %rax
+; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r12 # 8-byte Reload
+; SSE-NEXT: andq 136(%rdi), %r12
+; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
+; SSE-NEXT: andq 328(%rdi), %rdx
+; SSE-NEXT: orq %rax, %r12
+; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; SSE-NEXT: andq 72(%rdi), %rax
+; SSE-NEXT: orq %rdx, %rax
+; SSE-NEXT: movq %rax, %rdx
+; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; SSE-NEXT: andq 456(%rdi), %rax
+; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Reload
+; SSE-NEXT: andq 200(%rdi), %r13
+; SSE-NEXT: orq %rax, %r13
+; SSE-NEXT: orq %rdx, %r13
+; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
+; SSE-NEXT: andq 296(%rdi), %rdx
+; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; SSE-NEXT: andq 40(%rdi), %rax
+; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
+; SSE-NEXT: andq 424(%rdi), %r8
+; SSE-NEXT: orq %rdx, %rax
+; SSE-NEXT: movq %rax, %r9
+; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
+; SSE-NEXT: andq 168(%rdi), %rdx
+; SSE-NEXT: orq %r8, %rdx
+; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
+; SSE-NEXT: andq 360(%rdi), %r8
+; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; SSE-NEXT: andq 104(%rdi), %rax
+; SSE-NEXT: orq %r9, %rdx
+; SSE-NEXT: orq %r8, %rax
+; SSE-NEXT: movq %rax, %r8
+; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; SSE-NEXT: andq 488(%rdi), %rax
+; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload
+; SSE-NEXT: andq 232(%rdi), %r15
+; SSE-NEXT: orq %rax, %r15
+; SSE-NEXT: orq %r8, %r15
+; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
+; SSE-NEXT: andq 280(%rdi), %r8
+; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; SSE-NEXT: andq 24(%rdi), %rax
+; SSE-NEXT: orq %rdx, %r15
+; SSE-NEXT: orq %r8, %rax
+; SSE-NEXT: movq %rax, %r10
+; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
+; SSE-NEXT: andq 408(%rdi), %r8
+; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; SSE-NEXT: andq 152(%rdi), %rax
+; SSE-NEXT: orq %r8, %rax
+; SSE-NEXT: orq %r10, %rax
+; SSE-NEXT: movq %rax, %r10
+; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Reload
+; SSE-NEXT: andq 344(%rdi), %r11
+; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
+; SSE-NEXT: andq 88(%rdi), %r8
+; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; SSE-NEXT: andq 472(%rdi), %rax
+; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload
+; SSE-NEXT: andq 216(%rdi), %r14
+; SSE-NEXT: orq %r11, %r8
+; SSE-NEXT: orq %rax, %r14
+; SSE-NEXT: orq %r8, %r14
+; SSE-NEXT: orq %r10, %r14
+; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Reload
+; SSE-NEXT: andq 312(%rdi), %r11
+; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload
+; SSE-NEXT: andq 56(%rdi), %r10
+; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
+; SSE-NEXT: andq 440(%rdi), %r8
+; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Reload
+; SSE-NEXT: andq 184(%rdi), %r9
+; SSE-NEXT: orq %r11, %r10
+; SSE-NEXT: orq %r8, %r9
+; SSE-NEXT: orq %r10, %r9
+; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
+; SSE-NEXT: shldq %cl, %rax, %rdx
+; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload
+; SSE-NEXT: andq 376(%rdi), %r10
+; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; SSE-NEXT: andq 120(%rdi), %rax
+; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Reload
+; SSE-NEXT: andq 504(%rdi), %r11
+; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
+; SSE-NEXT: andq 248(%rdi), %r8
+; SSE-NEXT: orq %r10, %rax
+; SSE-NEXT: movq %rax, %r10
+; SSE-NEXT: orq %r11, %r8
+; SSE-NEXT: movq 1056(%rsp,%rsi), %rax
+; SSE-NEXT: shldq %cl, %rax, %rbx
+; SSE-NEXT: # kill: def $cl killed $cl killed $ecx
+; SSE-NEXT: shlq %cl, %rax
+; SSE-NEXT: orq %r10, %r8
+; SSE-NEXT: orq %r9, %r8
+; SSE-NEXT: andq 256(%rdi), %rdx
+; SSE-NEXT: orq %r14, %r8
+; SSE-NEXT: andq (%rdi), %rax
+; SSE-NEXT: orq %rdx, %rax
+; SSE-NEXT: orq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Folded Reload
+; SSE-NEXT: orq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Folded Reload
+; SSE-NEXT: orq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Folded Reload
+; SSE-NEXT: orq %rbp, %rax
+; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
+; SSE-NEXT: andq 264(%rdi), %rcx
+; SSE-NEXT: andq 8(%rdi), %rbx
+; SSE-NEXT: orq %rcx, %rbx
+; SSE-NEXT: orq %r12, %rbx
+; SSE-NEXT: orq %r13, %rbx
+; SSE-NEXT: orq %r15, %rbx
+; SSE-NEXT: orq %r8, %rbx
+; SSE-NEXT: orq %rax, %rbx
+; SSE-NEXT: setne %al
+; SSE-NEXT: addq $1576, %rsp # imm = 0x628
+; SSE-NEXT: popq %rbx
+; SSE-NEXT: popq %r12
+; SSE-NEXT: popq %r13
+; SSE-NEXT: popq %r14
+; SSE-NEXT: popq %r15
+; SSE-NEXT: popq %rbp
+; SSE-NEXT: retq
+;
+; AVX2-LABEL: test_ne_i4096:
+; AVX2: # %bb.0:
+; AVX2-NEXT: pushq %rbp
+; AVX2-NEXT: pushq %r15
+; AVX2-NEXT: pushq %r14
+; AVX2-NEXT: pushq %r13
+; AVX2-NEXT: pushq %r12
+; AVX2-NEXT: pushq %rbx
+; AVX2-NEXT: subq $1560, %rsp # imm = 0x618
+; AVX2-NEXT: movl %esi, %ecx
+; AVX2-NEXT: movl %esi, %eax
+; AVX2-NEXT: vxorps %xmm0, %xmm0, %xmm0
+; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: andl $4032, %eax # imm = 0xFC0
+; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovss {{.*#+}} xmm0 = [1,0,0,0]
+; AVX2-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
+; AVX2-NEXT: andl $63, %ecx
+; AVX2-NEXT: shrl $3, %eax
+; AVX2-NEXT: negl %eax
+; AVX2-NEXT: movslq %eax, %rsi
+; AVX2-NEXT: movq 1280(%rsp,%rsi), %rdx
+; AVX2-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX2-NEXT: movq 1288(%rsp,%rsi), %rax
+; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX2-NEXT: shldq %cl, %rdx, %rax
+; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX2-NEXT: movq 1536(%rsp,%rsi), %rdx
+; AVX2-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX2-NEXT: movq 1544(%rsp,%rsi), %rax
+; AVX2-NEXT: shldq %cl, %rdx, %rax
+; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX2-NEXT: movq 1152(%rsp,%rsi), %rdx
+; AVX2-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX2-NEXT: movq 1160(%rsp,%rsi), %rax
+; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX2-NEXT: shldq %cl, %rdx, %rax
+; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX2-NEXT: movq 1408(%rsp,%rsi), %rdx
+; AVX2-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX2-NEXT: movq 1416(%rsp,%rsi), %rax
+; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX2-NEXT: shldq %cl, %rdx, %rax
+; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX2-NEXT: movq 1216(%rsp,%rsi), %rdx
+; AVX2-NEXT: movq %rdx, (%rsp) # 8-byte Spill
+; AVX2-NEXT: movq 1224(%rsp,%rsi), %rax
+; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX2-NEXT: shldq %cl, %rdx, %rax
+; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX2-NEXT: movq 1472(%rsp,%rsi), %rdx
+; AVX2-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX2-NEXT: movq 1480(%rsp,%rsi), %rax
+; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX2-NEXT: shldq %cl, %rdx, %rax
+; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX2-NEXT: movq 1088(%rsp,%rsi), %rdx
+; AVX2-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX2-NEXT: movq 1096(%rsp,%rsi), %rax
+; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX2-NEXT: shldq %cl, %rdx, %rax
+; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX2-NEXT: movq 1344(%rsp,%rsi), %rdx
+; AVX2-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX2-NEXT: movq 1352(%rsp,%rsi), %rax
+; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX2-NEXT: shldq %cl, %rdx, %rax
+; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX2-NEXT: movq 1248(%rsp,%rsi), %rdx
+; AVX2-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX2-NEXT: movq 1256(%rsp,%rsi), %rax
+; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX2-NEXT: shldq %cl, %rdx, %rax
+; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX2-NEXT: movq 1504(%rsp,%rsi), %rdx
+; AVX2-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX2-NEXT: movq 1512(%rsp,%rsi), %rax
+; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX2-NEXT: shldq %cl, %rdx, %rax
+; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX2-NEXT: movq 1120(%rsp,%rsi), %rdx
+; AVX2-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX2-NEXT: movq 1128(%rsp,%rsi), %rax
+; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX2-NEXT: shldq %cl, %rdx, %rax
+; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX2-NEXT: movq 1376(%rsp,%rsi), %rdx
+; AVX2-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX2-NEXT: movq 1384(%rsp,%rsi), %rax
+; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX2-NEXT: shldq %cl, %rdx, %rax
+; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX2-NEXT: movq 1184(%rsp,%rsi), %rdx
+; AVX2-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX2-NEXT: movq 1192(%rsp,%rsi), %rax
+; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX2-NEXT: shldq %cl, %rdx, %rax
+; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX2-NEXT: movq 1440(%rsp,%rsi), %rdx
+; AVX2-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX2-NEXT: movq 1448(%rsp,%rsi), %rax
+; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX2-NEXT: shldq %cl, %rdx, %rax
+; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX2-NEXT: movq 1056(%rsp,%rsi), %rdx
+; AVX2-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX2-NEXT: movq 1064(%rsp,%rsi), %rax
+; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX2-NEXT: shldq %cl, %rdx, %rax
+; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX2-NEXT: movq 1312(%rsp,%rsi), %rdx
+; AVX2-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX2-NEXT: movq 1320(%rsp,%rsi), %rax
+; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX2-NEXT: shldq %cl, %rdx, %rax
+; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX2-NEXT: movq 1264(%rsp,%rsi), %rdx
+; AVX2-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX2-NEXT: movq 1272(%rsp,%rsi), %rax
+; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX2-NEXT: shldq %cl, %rdx, %rax
+; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX2-NEXT: movq 1520(%rsp,%rsi), %rdx
+; AVX2-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX2-NEXT: movq 1528(%rsp,%rsi), %rax
+; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX2-NEXT: shldq %cl, %rdx, %rax
+; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX2-NEXT: movq 1136(%rsp,%rsi), %rdx
+; AVX2-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX2-NEXT: movq 1144(%rsp,%rsi), %rax
+; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX2-NEXT: shldq %cl, %rdx, %rax
+; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX2-NEXT: movq 1392(%rsp,%rsi), %rdx
+; AVX2-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX2-NEXT: movq 1400(%rsp,%rsi), %rax
+; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX2-NEXT: shldq %cl, %rdx, %rax
+; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX2-NEXT: movq 1200(%rsp,%rsi), %r11
+; AVX2-NEXT: movq 1208(%rsp,%rsi), %rax
+; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX2-NEXT: shldq %cl, %r11, %rax
+; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX2-NEXT: movq 1456(%rsp,%rsi), %rdx
+; AVX2-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX2-NEXT: movq 1464(%rsp,%rsi), %rax
+; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX2-NEXT: shldq %cl, %rdx, %rax
+; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX2-NEXT: movq 1072(%rsp,%rsi), %r12
+; AVX2-NEXT: movq 1080(%rsp,%rsi), %rax
+; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX2-NEXT: shldq %cl, %r12, %rax
+; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX2-NEXT: movq 1328(%rsp,%rsi), %rdx
+; AVX2-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX2-NEXT: movq 1336(%rsp,%rsi), %rax
+; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX2-NEXT: shldq %cl, %rdx, %rax
+; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX2-NEXT: movq 1232(%rsp,%rsi), %rax
+; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX2-NEXT: movq 1240(%rsp,%rsi), %rdx
+; AVX2-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX2-NEXT: shldq %cl, %rax, %rdx
+; AVX2-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX2-NEXT: movq 1488(%rsp,%rsi), %rbp
+; AVX2-NEXT: movq 1496(%rsp,%rsi), %rax
+; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX2-NEXT: shldq %cl, %rbp, %rax
+; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX2-NEXT: movq 1104(%rsp,%rsi), %rax
+; AVX2-NEXT: movq 1112(%rsp,%rsi), %rdx
+; AVX2-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX2-NEXT: shldq %cl, %rax, %rdx
+; AVX2-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX2-NEXT: movq 1360(%rsp,%rsi), %r10
+; AVX2-NEXT: movq 1368(%rsp,%rsi), %r8
+; AVX2-NEXT: movq %r8, %rdx
+; AVX2-NEXT: shldq %cl, %r10, %rdx
+; AVX2-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX2-NEXT: movq 1168(%rsp,%rsi), %r9
+; AVX2-NEXT: movq %r9, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX2-NEXT: movq 1176(%rsp,%rsi), %rbx
+; AVX2-NEXT: movq %rbx, %rdx
+; AVX2-NEXT: shldq %cl, %r9, %rdx
+; AVX2-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX2-NEXT: movq 1424(%rsp,%rsi), %r9
+; AVX2-NEXT: movq 1432(%rsp,%rsi), %rdx
+; AVX2-NEXT: movq %rdx, %r14
+; AVX2-NEXT: shldq %cl, %r9, %r14
+; AVX2-NEXT: movq %r14, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX2-NEXT: movq 1296(%rsp,%rsi), %r15
+; AVX2-NEXT: movq %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX2-NEXT: movq 1304(%rsp,%rsi), %r14
+; AVX2-NEXT: movq %r14, %r13
+; AVX2-NEXT: shldq %cl, %r15, %r13
+; AVX2-NEXT: movq %r13, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload
+; AVX2-NEXT: shldq %cl, %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
+; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload
+; AVX2-NEXT: shldq %cl, %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
+; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload
+; AVX2-NEXT: shldq %cl, %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
+; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload
+; AVX2-NEXT: shldq %cl, %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
+; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload
+; AVX2-NEXT: shldq %cl, %r15, (%rsp) # 8-byte Folded Spill
+; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload
+; AVX2-NEXT: shldq %cl, %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
+; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload
+; AVX2-NEXT: shldq %cl, %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
+; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload
+; AVX2-NEXT: shldq %cl, %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
+; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload
+; AVX2-NEXT: shldq %cl, %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
+; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload
+; AVX2-NEXT: shldq %cl, %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
+; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload
+; AVX2-NEXT: shldq %cl, %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
+; AVX2-NEXT: shldq %cl, %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
+; AVX2-NEXT: shldq %cl, %rbx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
+; AVX2-NEXT: shldq %cl, %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
+; AVX2-NEXT: movq 1048(%rsp,%rsi), %rdx
+; AVX2-NEXT: shldq %cl, %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
+; AVX2-NEXT: shldq %cl, %r14, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
+; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Reload
+; AVX2-NEXT: shldq %cl, %rbx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
+; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Reload
+; AVX2-NEXT: shldq %cl, %rbx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
+; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Reload
+; AVX2-NEXT: shldq %cl, %rbx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
+; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Reload
+; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload
+; AVX2-NEXT: shldq %cl, %r14, %rbx
+; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload
+; AVX2-NEXT: shldq %cl, %r14, %r11
+; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload
+; AVX2-NEXT: shldq %cl, %r14, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
+; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload
+; AVX2-NEXT: shldq %cl, %r14, %r12
+; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
+; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload
+; AVX2-NEXT: shldq %cl, %r14, %r8
+; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Reload
+; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload
+; AVX2-NEXT: shldq %cl, %r14, %r13
+; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload
+; AVX2-NEXT: shldq %cl, %r14, %rbp
+; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload
+; AVX2-NEXT: shldq %cl, %r14, %rax
+; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload
+; AVX2-NEXT: shldq %cl, %r14, %r10
+; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload
+; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload
+; AVX2-NEXT: shldq %cl, %r15, %r14
+; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload
+; AVX2-NEXT: shldq %cl, %r15, %r9
+; AVX2-NEXT: andq 384(%rdi), %r9
+; AVX2-NEXT: andq 128(%rdi), %r14
+; AVX2-NEXT: andq 320(%rdi), %r10
+; AVX2-NEXT: orq %r9, %r14
+; AVX2-NEXT: movq %r14, %r15
+; AVX2-NEXT: andq 64(%rdi), %rax
+; AVX2-NEXT: orq %r10, %rax
+; AVX2-NEXT: andq 448(%rdi), %rbp
+; AVX2-NEXT: andq 192(%rdi), %r13
+; AVX2-NEXT: orq %rbp, %r13
+; AVX2-NEXT: orq %rax, %r13
+; AVX2-NEXT: movq %r13, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX2-NEXT: andq 288(%rdi), %r8
+; AVX2-NEXT: andq 32(%rdi), %r12
+; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; AVX2-NEXT: andq 416(%rdi), %rax
+; AVX2-NEXT: orq %r8, %r12
+; AVX2-NEXT: andq 160(%rdi), %r11
+; AVX2-NEXT: orq %rax, %r11
+; AVX2-NEXT: andq 352(%rdi), %rbx
+; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; AVX2-NEXT: andq 96(%rdi), %rax
+; AVX2-NEXT: orq %r12, %r11
+; AVX2-NEXT: orq %rbx, %rax
+; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload
+; AVX2-NEXT: andq 480(%rdi), %r10
+; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Reload
+; AVX2-NEXT: andq 224(%rdi), %r13
+; AVX2-NEXT: orq %r10, %r13
+; AVX2-NEXT: orq %rax, %r13
+; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
+; AVX2-NEXT: andq 272(%rdi), %r8
+; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; AVX2-NEXT: andq 16(%rdi), %rax
+; AVX2-NEXT: orq %r11, %r13
+; AVX2-NEXT: orq %r8, %rax
+; AVX2-NEXT: movq %rax, %r8
+; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Reload
+; AVX2-NEXT: andq 400(%rdi), %r9
+; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; AVX2-NEXT: andq 144(%rdi), %rax
+; AVX2-NEXT: orq %r9, %rax
+; AVX2-NEXT: orq %r8, %rax
+; AVX2-NEXT: movq %rax, %r9
+; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload
+; AVX2-NEXT: andq 336(%rdi), %r10
+; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; AVX2-NEXT: andq 80(%rdi), %rax
+; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
+; AVX2-NEXT: andq 464(%rdi), %r8
+; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Reload
+; AVX2-NEXT: andq 208(%rdi), %r11
+; AVX2-NEXT: orq %r10, %rax
+; AVX2-NEXT: orq %r8, %r11
+; AVX2-NEXT: orq %rax, %r11
+; AVX2-NEXT: orq %r9, %r11
+; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Reload
+; AVX2-NEXT: andq 304(%rdi), %r9
+; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
+; AVX2-NEXT: andq 48(%rdi), %r8
+; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload
+; AVX2-NEXT: andq 432(%rdi), %r10
+; AVX2-NEXT: movq (%rsp), %rax # 8-byte Reload
+; AVX2-NEXT: andq 176(%rdi), %rax
+; AVX2-NEXT: orq %r9, %r8
+; AVX2-NEXT: movq %r8, %r9
+; AVX2-NEXT: orq %r10, %rax
+; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
+; AVX2-NEXT: andq 368(%rdi), %r8
+; AVX2-NEXT: orq %r9, %rax
+; AVX2-NEXT: movq %rax, %r10
+; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; AVX2-NEXT: andq 112(%rdi), %rax
+; AVX2-NEXT: orq %r8, %rax
+; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
+; AVX2-NEXT: andq 496(%rdi), %r8
+; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Reload
+; AVX2-NEXT: andq 240(%rdi), %r9
+; AVX2-NEXT: orq %r8, %r9
+; AVX2-NEXT: orq %rax, %r9
+; AVX2-NEXT: orq %r10, %r9
+; AVX2-NEXT: orq %r11, %r9
+; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload
+; AVX2-NEXT: andq 392(%rdi), %r10
+; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbp # 8-byte Reload
+; AVX2-NEXT: andq 136(%rdi), %rbp
+; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
+; AVX2-NEXT: andq 328(%rdi), %r8
+; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; AVX2-NEXT: andq 72(%rdi), %rax
+; AVX2-NEXT: orq %r10, %rbp
+; AVX2-NEXT: orq %r8, %rax
+; AVX2-NEXT: movq %rax, %r8
+; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; AVX2-NEXT: andq 456(%rdi), %rax
+; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r12 # 8-byte Reload
+; AVX2-NEXT: andq 200(%rdi), %r12
+; AVX2-NEXT: orq %rax, %r12
+; AVX2-NEXT: orq %r8, %r12
+; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload
+; AVX2-NEXT: andq 296(%rdi), %r10
+; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
+; AVX2-NEXT: andq 40(%rdi), %r8
+; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Reload
+; AVX2-NEXT: andq 424(%rdi), %r11
+; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; AVX2-NEXT: andq 168(%rdi), %rax
+; AVX2-NEXT: orq %r10, %r8
+; AVX2-NEXT: movq %r8, %r10
+; AVX2-NEXT: orq %r11, %rax
+; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
+; AVX2-NEXT: andq 360(%rdi), %r8
+; AVX2-NEXT: orq %r10, %rax
+; AVX2-NEXT: movq %rax, %r10
+; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; AVX2-NEXT: andq 104(%rdi), %rax
+; AVX2-NEXT: orq %r8, %rax
+; AVX2-NEXT: movq %rax, %r8
+; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; AVX2-NEXT: andq 488(%rdi), %rax
+; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload
+; AVX2-NEXT: andq 232(%rdi), %r14
+; AVX2-NEXT: orq %rax, %r14
+; AVX2-NEXT: orq %r8, %r14
+; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
+; AVX2-NEXT: andq 280(%rdi), %r8
+; AVX2-NEXT: orq %r10, %r14
+; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; AVX2-NEXT: andq 24(%rdi), %rax
+; AVX2-NEXT: orq %r8, %rax
+; AVX2-NEXT: movq %rax, %r10
+; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
+; AVX2-NEXT: andq 408(%rdi), %r8
+; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; AVX2-NEXT: andq 152(%rdi), %rax
+; AVX2-NEXT: orq %r8, %rax
+; AVX2-NEXT: orq %r10, %rax
+; AVX2-NEXT: movq %rax, %r10
+; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Reload
+; AVX2-NEXT: andq 344(%rdi), %r11
+; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
+; AVX2-NEXT: andq 88(%rdi), %r8
+; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; AVX2-NEXT: andq 472(%rdi), %rax
+; AVX2-NEXT: orq %r11, %r8
+; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Reload
+; AVX2-NEXT: andq 216(%rdi), %rbx
+; AVX2-NEXT: orq %rax, %rbx
+; AVX2-NEXT: orq %r8, %rbx
+; AVX2-NEXT: orq %r10, %rbx
+; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
+; AVX2-NEXT: andq 312(%rdi), %r8
+; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; AVX2-NEXT: andq 56(%rdi), %rax
+; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload
+; AVX2-NEXT: andq 440(%rdi), %r10
+; AVX2-NEXT: orq %r8, %rax
+; AVX2-NEXT: movq %rax, %r11
+; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
+; AVX2-NEXT: andq 184(%rdi), %r8
+; AVX2-NEXT: orq %r10, %r8
+; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload
+; AVX2-NEXT: andq 376(%rdi), %r10
+; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; AVX2-NEXT: andq 120(%rdi), %rax
+; AVX2-NEXT: orq %r11, %r8
+; AVX2-NEXT: movq %r8, %r11
+; AVX2-NEXT: orq %r10, %rax
+; AVX2-NEXT: movq %rax, %r10
+; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
+; AVX2-NEXT: andq 504(%rdi), %r8
+; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; AVX2-NEXT: andq 248(%rdi), %rax
+; AVX2-NEXT: orq %r8, %rax
+; AVX2-NEXT: orq %r10, %rax
+; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
+; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload
+; AVX2-NEXT: shldq %cl, %r8, %r10
+; AVX2-NEXT: orq %r11, %rax
+; AVX2-NEXT: movq 1040(%rsp,%rsi), %rsi
+; AVX2-NEXT: orq %rbx, %rax
+; AVX2-NEXT: movq %rax, %r8
+; AVX2-NEXT: shlxq %rcx, %rsi, %rax
+; AVX2-NEXT: andq 256(%rdi), %r10
+; AVX2-NEXT: andq (%rdi), %rax
+; AVX2-NEXT: orq %r10, %rax
+; AVX2-NEXT: orq %r15, %rax
+; AVX2-NEXT: orq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Folded Reload
+; AVX2-NEXT: orq %r13, %rax
+; AVX2-NEXT: # kill: def $cl killed $cl killed $rcx
+; AVX2-NEXT: shldq %cl, %rsi, %rdx
+; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
+; AVX2-NEXT: andq 264(%rdi), %rcx
+; AVX2-NEXT: andq 8(%rdi), %rdx
+; AVX2-NEXT: orq %r9, %rax
+; AVX2-NEXT: orq %rcx, %rdx
+; AVX2-NEXT: orq %rbp, %rdx
+; AVX2-NEXT: orq %r12, %rdx
+; AVX2-NEXT: orq %r14, %rdx
+; AVX2-NEXT: orq %r8, %rdx
+; AVX2-NEXT: orq %rax, %rdx
+; AVX2-NEXT: setne %al
+; AVX2-NEXT: addq $1560, %rsp # imm = 0x618
+; AVX2-NEXT: popq %rbx
+; AVX2-NEXT: popq %r12
+; AVX2-NEXT: popq %r13
+; AVX2-NEXT: popq %r14
+; AVX2-NEXT: popq %r15
+; AVX2-NEXT: popq %rbp
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: test_ne_i4096:
+; AVX512: # %bb.0:
+; AVX512-NEXT: pushq %rbp
+; AVX512-NEXT: pushq %r15
+; AVX512-NEXT: pushq %r14
+; AVX512-NEXT: pushq %r13
+; AVX512-NEXT: pushq %r12
+; AVX512-NEXT: pushq %rbx
+; AVX512-NEXT: subq $1560, %rsp # imm = 0x618
+; AVX512-NEXT: movl %esi, %ecx
+; AVX512-NEXT: movl %esi, %eax
+; AVX512-NEXT: andl $4032, %eax # imm = 0xFC0
+; AVX512-NEXT: vxorps %xmm0, %xmm0, %xmm0
+; AVX512-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: vmovaps {{.*#+}} xmm0 = [1,0,0,0]
+; AVX512-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
+; AVX512-NEXT: andl $63, %ecx
+; AVX512-NEXT: shrl $3, %eax
+; AVX512-NEXT: negl %eax
+; AVX512-NEXT: movslq %eax, %rsi
+; AVX512-NEXT: movq 1280(%rsp,%rsi), %rdx
+; AVX512-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX512-NEXT: movq 1288(%rsp,%rsi), %rax
+; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX512-NEXT: shldq %cl, %rdx, %rax
+; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX512-NEXT: movq 1536(%rsp,%rsi), %rdx
+; AVX512-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX512-NEXT: movq 1544(%rsp,%rsi), %rax
+; AVX512-NEXT: shldq %cl, %rdx, %rax
+; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX512-NEXT: movq 1152(%rsp,%rsi), %rdx
+; AVX512-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX512-NEXT: movq 1160(%rsp,%rsi), %rax
+; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX512-NEXT: shldq %cl, %rdx, %rax
+; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX512-NEXT: movq 1408(%rsp,%rsi), %rdx
+; AVX512-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX512-NEXT: movq 1416(%rsp,%rsi), %rax
+; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX512-NEXT: shldq %cl, %rdx, %rax
+; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX512-NEXT: movq 1216(%rsp,%rsi), %rdx
+; AVX512-NEXT: movq %rdx, (%rsp) # 8-byte Spill
+; AVX512-NEXT: movq 1224(%rsp,%rsi), %rax
+; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX512-NEXT: shldq %cl, %rdx, %rax
+; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX512-NEXT: movq 1472(%rsp,%rsi), %rdx
+; AVX512-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX512-NEXT: movq 1480(%rsp,%rsi), %rax
+; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX512-NEXT: shldq %cl, %rdx, %rax
+; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX512-NEXT: movq 1088(%rsp,%rsi), %rdx
+; AVX512-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX512-NEXT: movq 1096(%rsp,%rsi), %rax
+; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX512-NEXT: shldq %cl, %rdx, %rax
+; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX512-NEXT: movq 1344(%rsp,%rsi), %rdx
+; AVX512-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX512-NEXT: movq 1352(%rsp,%rsi), %rax
+; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX512-NEXT: shldq %cl, %rdx, %rax
+; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX512-NEXT: movq 1248(%rsp,%rsi), %rdx
+; AVX512-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX512-NEXT: movq 1256(%rsp,%rsi), %rax
+; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX512-NEXT: shldq %cl, %rdx, %rax
+; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX512-NEXT: movq 1504(%rsp,%rsi), %rdx
+; AVX512-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX512-NEXT: movq 1512(%rsp,%rsi), %rax
+; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX512-NEXT: shldq %cl, %rdx, %rax
+; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX512-NEXT: movq 1120(%rsp,%rsi), %rdx
+; AVX512-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX512-NEXT: movq 1128(%rsp,%rsi), %rax
+; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX512-NEXT: shldq %cl, %rdx, %rax
+; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX512-NEXT: movq 1376(%rsp,%rsi), %rdx
+; AVX512-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX512-NEXT: movq 1384(%rsp,%rsi), %rax
+; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX512-NEXT: shldq %cl, %rdx, %rax
+; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX512-NEXT: movq 1184(%rsp,%rsi), %rdx
+; AVX512-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX512-NEXT: movq 1192(%rsp,%rsi), %rax
+; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX512-NEXT: shldq %cl, %rdx, %rax
+; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX512-NEXT: movq 1440(%rsp,%rsi), %rdx
+; AVX512-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX512-NEXT: movq 1448(%rsp,%rsi), %rax
+; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX512-NEXT: shldq %cl, %rdx, %rax
+; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX512-NEXT: movq 1056(%rsp,%rsi), %rdx
+; AVX512-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX512-NEXT: movq 1064(%rsp,%rsi), %rax
+; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX512-NEXT: shldq %cl, %rdx, %rax
+; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX512-NEXT: movq 1312(%rsp,%rsi), %rdx
+; AVX512-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX512-NEXT: movq 1320(%rsp,%rsi), %rax
+; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX512-NEXT: shldq %cl, %rdx, %rax
+; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX512-NEXT: movq 1264(%rsp,%rsi), %rdx
+; AVX512-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX512-NEXT: movq 1272(%rsp,%rsi), %rax
+; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX512-NEXT: shldq %cl, %rdx, %rax
+; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX512-NEXT: movq 1520(%rsp,%rsi), %rdx
+; AVX512-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX512-NEXT: movq 1528(%rsp,%rsi), %rax
+; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX512-NEXT: shldq %cl, %rdx, %rax
+; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX512-NEXT: movq 1136(%rsp,%rsi), %rdx
+; AVX512-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX512-NEXT: movq 1144(%rsp,%rsi), %rax
+; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX512-NEXT: shldq %cl, %rdx, %rax
+; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX512-NEXT: movq 1392(%rsp,%rsi), %rdx
+; AVX512-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX512-NEXT: movq 1400(%rsp,%rsi), %rax
+; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX512-NEXT: shldq %cl, %rdx, %rax
+; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX512-NEXT: movq 1200(%rsp,%rsi), %r10
+; AVX512-NEXT: movq 1208(%rsp,%rsi), %rax
+; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX512-NEXT: shldq %cl, %r10, %rax
+; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX512-NEXT: movq 1456(%rsp,%rsi), %rdx
+; AVX512-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX512-NEXT: movq 1464(%rsp,%rsi), %rax
+; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX512-NEXT: shldq %cl, %rdx, %rax
+; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX512-NEXT: movq 1072(%rsp,%rsi), %r14
+; AVX512-NEXT: movq 1080(%rsp,%rsi), %rax
+; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX512-NEXT: shldq %cl, %r14, %rax
+; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX512-NEXT: movq 1328(%rsp,%rsi), %rdx
+; AVX512-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX512-NEXT: movq 1336(%rsp,%rsi), %rax
+; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX512-NEXT: shldq %cl, %rdx, %rax
+; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX512-NEXT: movq 1232(%rsp,%rsi), %rax
+; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX512-NEXT: movq 1240(%rsp,%rsi), %rdx
+; AVX512-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX512-NEXT: shldq %cl, %rax, %rdx
+; AVX512-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX512-NEXT: movq 1488(%rsp,%rsi), %r12
+; AVX512-NEXT: movq 1496(%rsp,%rsi), %rax
+; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX512-NEXT: shldq %cl, %r12, %rax
+; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX512-NEXT: movq 1104(%rsp,%rsi), %rax
+; AVX512-NEXT: movq 1112(%rsp,%rsi), %rdx
+; AVX512-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX512-NEXT: shldq %cl, %rax, %rdx
+; AVX512-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX512-NEXT: movq 1360(%rsp,%rsi), %r11
+; AVX512-NEXT: movq 1368(%rsp,%rsi), %rbx
+; AVX512-NEXT: movq %rbx, %rdx
+; AVX512-NEXT: shldq %cl, %r11, %rdx
+; AVX512-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX512-NEXT: movq 1168(%rsp,%rsi), %r9
+; AVX512-NEXT: movq %r9, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX512-NEXT: movq 1176(%rsp,%rsi), %r8
+; AVX512-NEXT: movq %r8, %rdx
+; AVX512-NEXT: shldq %cl, %r9, %rdx
+; AVX512-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX512-NEXT: movq 1424(%rsp,%rsi), %r9
+; AVX512-NEXT: movq 1432(%rsp,%rsi), %rdx
+; AVX512-NEXT: movq %rdx, %r15
+; AVX512-NEXT: shldq %cl, %r9, %r15
+; AVX512-NEXT: movq %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX512-NEXT: movq 1296(%rsp,%rsi), %rbp
+; AVX512-NEXT: movq %rbp, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX512-NEXT: movq 1304(%rsp,%rsi), %r15
+; AVX512-NEXT: movq %r15, %r13
+; AVX512-NEXT: shldq %cl, %rbp, %r13
+; AVX512-NEXT: movq %r13, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Reload
+; AVX512-NEXT: shldq %cl, %r13, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
+; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Reload
+; AVX512-NEXT: shldq %cl, %r13, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
+; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Reload
+; AVX512-NEXT: shldq %cl, %r13, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
+; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Reload
+; AVX512-NEXT: shldq %cl, %r13, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
+; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Reload
+; AVX512-NEXT: shldq %cl, %r13, (%rsp) # 8-byte Folded Spill
+; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Reload
+; AVX512-NEXT: shldq %cl, %r13, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
+; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Reload
+; AVX512-NEXT: shldq %cl, %r13, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
+; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Reload
+; AVX512-NEXT: shldq %cl, %r13, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
+; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Reload
+; AVX512-NEXT: shldq %cl, %r13, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
+; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Reload
+; AVX512-NEXT: shldq %cl, %r13, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
+; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Reload
+; AVX512-NEXT: shldq %cl, %r13, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
+; AVX512-NEXT: shldq %cl, %rbx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
+; AVX512-NEXT: shldq %cl, %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
+; AVX512-NEXT: shldq %cl, %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
+; AVX512-NEXT: movq 1048(%rsp,%rsi), %rdx
+; AVX512-NEXT: shldq %cl, %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
+; AVX512-NEXT: shldq %cl, %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
+; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Reload
+; AVX512-NEXT: shldq %cl, %rbx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
+; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Reload
+; AVX512-NEXT: shldq %cl, %rbx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
+; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Reload
+; AVX512-NEXT: shldq %cl, %rbx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
+; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Reload
+; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload
+; AVX512-NEXT: shldq %cl, %r15, %rbx
+; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload
+; AVX512-NEXT: shldq %cl, %r15, %r10
+; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload
+; AVX512-NEXT: shldq %cl, %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
+; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload
+; AVX512-NEXT: shldq %cl, %r15, %r14
+; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
+; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload
+; AVX512-NEXT: shldq %cl, %r15, %r8
+; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Reload
+; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload
+; AVX512-NEXT: shldq %cl, %r15, %r13
+; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload
+; AVX512-NEXT: shldq %cl, %r15, %r12
+; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload
+; AVX512-NEXT: shldq %cl, %r15, %rax
+; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload
+; AVX512-NEXT: shldq %cl, %r15, %r11
+; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload
+; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbp # 8-byte Reload
+; AVX512-NEXT: shldq %cl, %rbp, %r15
+; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbp # 8-byte Reload
+; AVX512-NEXT: shldq %cl, %rbp, %r9
+; AVX512-NEXT: andq 384(%rdi), %r9
+; AVX512-NEXT: andq 128(%rdi), %r15
+; AVX512-NEXT: orq %r9, %r15
+; AVX512-NEXT: movq %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX512-NEXT: andq 320(%rdi), %r11
+; AVX512-NEXT: andq 64(%rdi), %rax
+; AVX512-NEXT: orq %r11, %rax
+; AVX512-NEXT: andq 448(%rdi), %r12
+; AVX512-NEXT: andq 192(%rdi), %r13
+; AVX512-NEXT: orq %r12, %r13
+; AVX512-NEXT: orq %rax, %r13
+; AVX512-NEXT: andq 288(%rdi), %r8
+; AVX512-NEXT: andq 32(%rdi), %r14
+; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; AVX512-NEXT: andq 416(%rdi), %rax
+; AVX512-NEXT: orq %r8, %r14
+; AVX512-NEXT: andq 160(%rdi), %r10
+; AVX512-NEXT: orq %rax, %r10
+; AVX512-NEXT: andq 352(%rdi), %rbx
+; AVX512-NEXT: orq %r14, %r10
+; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; AVX512-NEXT: andq 96(%rdi), %rax
+; AVX512-NEXT: orq %rbx, %rax
+; AVX512-NEXT: movq %rax, %r8
+; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; AVX512-NEXT: andq 480(%rdi), %rax
+; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload
+; AVX512-NEXT: andq 224(%rdi), %r15
+; AVX512-NEXT: orq %rax, %r15
+; AVX512-NEXT: orq %r8, %r15
+; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
+; AVX512-NEXT: andq 272(%rdi), %r8
+; AVX512-NEXT: orq %r10, %r15
+; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; AVX512-NEXT: andq 16(%rdi), %rax
+; AVX512-NEXT: orq %r8, %rax
+; AVX512-NEXT: movq %rax, %r8
+; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Reload
+; AVX512-NEXT: andq 400(%rdi), %r9
+; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; AVX512-NEXT: andq 144(%rdi), %rax
+; AVX512-NEXT: orq %r9, %rax
+; AVX512-NEXT: orq %r8, %rax
+; AVX512-NEXT: movq %rax, %r9
+; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload
+; AVX512-NEXT: andq 336(%rdi), %r10
+; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; AVX512-NEXT: andq 80(%rdi), %rax
+; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
+; AVX512-NEXT: andq 464(%rdi), %r8
+; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Reload
+; AVX512-NEXT: andq 208(%rdi), %r11
+; AVX512-NEXT: orq %r10, %rax
+; AVX512-NEXT: orq %r8, %r11
+; AVX512-NEXT: orq %rax, %r11
+; AVX512-NEXT: orq %r9, %r11
+; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload
+; AVX512-NEXT: andq 304(%rdi), %r10
+; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; AVX512-NEXT: andq 48(%rdi), %rax
+; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Reload
+; AVX512-NEXT: andq 432(%rdi), %r9
+; AVX512-NEXT: movq (%rsp), %r8 # 8-byte Reload
+; AVX512-NEXT: andq 176(%rdi), %r8
+; AVX512-NEXT: orq %r10, %rax
+; AVX512-NEXT: movq %rax, %r10
+; AVX512-NEXT: orq %r9, %r8
+; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Reload
+; AVX512-NEXT: andq 368(%rdi), %r9
+; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; AVX512-NEXT: andq 112(%rdi), %rax
+; AVX512-NEXT: orq %r10, %r8
+; AVX512-NEXT: movq %r8, %r10
+; AVX512-NEXT: orq %r9, %rax
+; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
+; AVX512-NEXT: andq 496(%rdi), %r8
+; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Reload
+; AVX512-NEXT: andq 240(%rdi), %r9
+; AVX512-NEXT: orq %r8, %r9
+; AVX512-NEXT: orq %rax, %r9
+; AVX512-NEXT: orq %r10, %r9
+; AVX512-NEXT: orq %r11, %r9
+; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload
+; AVX512-NEXT: andq 392(%rdi), %r10
+; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbp # 8-byte Reload
+; AVX512-NEXT: andq 136(%rdi), %rbp
+; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
+; AVX512-NEXT: andq 328(%rdi), %r8
+; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; AVX512-NEXT: andq 72(%rdi), %rax
+; AVX512-NEXT: orq %r10, %rbp
+; AVX512-NEXT: orq %r8, %rax
+; AVX512-NEXT: movq %rax, %r8
+; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; AVX512-NEXT: andq 456(%rdi), %rax
+; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r12 # 8-byte Reload
+; AVX512-NEXT: andq 200(%rdi), %r12
+; AVX512-NEXT: orq %rax, %r12
+; AVX512-NEXT: orq %r8, %r12
+; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
+; AVX512-NEXT: andq 296(%rdi), %r8
+; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; AVX512-NEXT: andq 40(%rdi), %rax
+; AVX512-NEXT: orq %r8, %rax
+; AVX512-NEXT: movq %rax, %r10
+; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
+; AVX512-NEXT: andq 424(%rdi), %r8
+; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; AVX512-NEXT: andq 168(%rdi), %rax
+; AVX512-NEXT: orq %r8, %rax
+; AVX512-NEXT: orq %r10, %rax
+; AVX512-NEXT: movq %rax, %r10
+; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
+; AVX512-NEXT: andq 360(%rdi), %r8
+; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; AVX512-NEXT: andq 104(%rdi), %rax
+; AVX512-NEXT: orq %r8, %rax
+; AVX512-NEXT: movq %rax, %r8
+; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; AVX512-NEXT: andq 488(%rdi), %rax
+; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload
+; AVX512-NEXT: andq 232(%rdi), %r14
+; AVX512-NEXT: orq %rax, %r14
+; AVX512-NEXT: orq %r8, %r14
+; AVX512-NEXT: orq %r10, %r14
+; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
+; AVX512-NEXT: andq 280(%rdi), %r8
+; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; AVX512-NEXT: andq 24(%rdi), %rax
+; AVX512-NEXT: orq %r8, %rax
+; AVX512-NEXT: movq %rax, %r10
+; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
+; AVX512-NEXT: andq 408(%rdi), %r8
+; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; AVX512-NEXT: andq 152(%rdi), %rax
+; AVX512-NEXT: orq %r8, %rax
+; AVX512-NEXT: orq %r10, %rax
+; AVX512-NEXT: movq %rax, %r10
+; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Reload
+; AVX512-NEXT: andq 344(%rdi), %r11
+; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
+; AVX512-NEXT: andq 88(%rdi), %r8
+; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; AVX512-NEXT: andq 472(%rdi), %rax
+; AVX512-NEXT: orq %r11, %r8
+; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Reload
+; AVX512-NEXT: andq 216(%rdi), %rbx
+; AVX512-NEXT: orq %rax, %rbx
+; AVX512-NEXT: orq %r8, %rbx
+; AVX512-NEXT: orq %r10, %rbx
+; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload
+; AVX512-NEXT: andq 312(%rdi), %r10
+; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; AVX512-NEXT: andq 56(%rdi), %rax
+; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
+; AVX512-NEXT: andq 440(%rdi), %r8
+; AVX512-NEXT: orq %r10, %rax
+; AVX512-NEXT: movq %rax, %r10
+; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; AVX512-NEXT: andq 184(%rdi), %rax
+; AVX512-NEXT: orq %r8, %rax
+; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
+; AVX512-NEXT: andq 376(%rdi), %r8
+; AVX512-NEXT: orq %r10, %rax
+; AVX512-NEXT: movq %rax, %r11
+; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; AVX512-NEXT: andq 120(%rdi), %rax
+; AVX512-NEXT: orq %r8, %rax
+; AVX512-NEXT: movq %rax, %r10
+; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; AVX512-NEXT: andq 504(%rdi), %rax
+; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
+; AVX512-NEXT: andq 248(%rdi), %r8
+; AVX512-NEXT: orq %rax, %r8
+; AVX512-NEXT: orq %r10, %r8
+; AVX512-NEXT: orq %r11, %r8
+; AVX512-NEXT: movq 1040(%rsp,%rsi), %rax
+; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload
+; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload
+; AVX512-NEXT: shldq %cl, %rsi, %r10
+; AVX512-NEXT: orq %rbx, %r8
+; AVX512-NEXT: shlxq %rcx, %rax, %rsi
+; AVX512-NEXT: andq 256(%rdi), %r10
+; AVX512-NEXT: andq (%rdi), %rsi
+; AVX512-NEXT: orq %r10, %rsi
+; AVX512-NEXT: orq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Folded Reload
+; AVX512-NEXT: orq %r13, %rsi
+; AVX512-NEXT: orq %r15, %rsi
+; AVX512-NEXT: # kill: def $cl killed $cl killed $rcx
+; AVX512-NEXT: shldq %cl, %rax, %rdx
+; AVX512-NEXT: orq %r9, %rsi
+; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; AVX512-NEXT: andq 264(%rdi), %rax
+; AVX512-NEXT: andq 8(%rdi), %rdx
+; AVX512-NEXT: orq %rax, %rdx
+; AVX512-NEXT: orq %rbp, %rdx
+; AVX512-NEXT: orq %r12, %rdx
+; AVX512-NEXT: orq %r14, %rdx
+; AVX512-NEXT: orq %r8, %rdx
+; AVX512-NEXT: orq %rsi, %rdx
+; AVX512-NEXT: setne %al
+; AVX512-NEXT: addq $1560, %rsp # imm = 0x618
+; AVX512-NEXT: popq %rbx
+; AVX512-NEXT: popq %r12
+; AVX512-NEXT: popq %r13
+; AVX512-NEXT: popq %r14
+; AVX512-NEXT: popq %r15
+; AVX512-NEXT: popq %rbp
+; AVX512-NEXT: vzeroupper
+; AVX512-NEXT: retq
%rem = and i32 %position, 4095
%ofs = zext nneg i32 %rem to i4096
%bit = shl nuw i4096 1, %ofs
@@ -1035,8 +7161,8 @@ define i32 @reset_multiload_i128(ptr %word, i32 %position, ptr %p) nounwind {
; X86-NEXT: pushl %edi
; X86-NEXT: pushl %esi
; X86-NEXT: andl $-16, %esp
-; X86-NEXT: subl $64, %esp
-; X86-NEXT: movl 12(%ebp), %ecx
+; X86-NEXT: subl $80, %esp
+; X86-NEXT: movzbl 12(%ebp), %ecx
; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
@@ -1049,41 +7175,52 @@ define i32 @reset_multiload_i128(ptr %word, i32 %position, ptr %p) nounwind {
; X86-NEXT: shrb $3, %al
; X86-NEXT: andb $12, %al
; X86-NEXT: negb %al
-; X86-NEXT: movsbl %al, %edi
-; X86-NEXT: movl 36(%esp,%edi), %edx
-; X86-NEXT: movl 40(%esp,%edi), %ebx
-; X86-NEXT: movl %ebx, %esi
-; X86-NEXT: shldl %cl, %edx, %esi
-; X86-NEXT: movl 32(%esp,%edi), %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 44(%esp,%edi), %edi
-; X86-NEXT: shldl %cl, %ebx, %edi
-; X86-NEXT: movl %eax, %ebx
-; X86-NEXT: # kill: def $cl killed $cl killed $ecx
-; X86-NEXT: shll %cl, %ebx
-; X86-NEXT: notl %ebx
-; X86-NEXT: movl 16(%ebp), %eax
-; X86-NEXT: movl (%eax), %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 12(%ebp), %eax
-; X86-NEXT: andl $96, %eax
-; X86-NEXT: shrl $3, %eax
-; X86-NEXT: movl 8(%ebp), %ecx
-; X86-NEXT: movl (%ecx,%eax), %eax
-; X86-NEXT: andl %ebx, (%ecx)
-; X86-NEXT: movl 12(%ebp), %ecx
+; X86-NEXT: movsbl %al, %eax
+; X86-NEXT: movl 56(%esp,%eax), %esi
+; X86-NEXT: movl 60(%esp,%eax), %edx
+; X86-NEXT: shldl %cl, %esi, %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 48(%esp,%eax), %edi
+; X86-NEXT: movl 52(%esp,%eax), %eax
+; X86-NEXT: shldl %cl, %eax, %esi
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: shldl %cl, %edi, %eax
+; X86-NEXT: movl 8(%ebp), %ebx
+; X86-NEXT: shll %cl, %edi
+; X86-NEXT: movl 8(%ebx), %ecx
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: andl %esi, %ecx
+; X86-NEXT: movl (%ebx), %esi
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: andl %edi, %esi
+; X86-NEXT: orl %ecx, %esi
+; X86-NEXT: movl 12(%ebx), %ecx
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X86-NEXT: movl %eax, %edx
+; X86-NEXT: movl 4(%ebx), %ebx
+; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: andl %ebx, %eax
+; X86-NEXT: orl %ecx, %eax
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: notl %ecx
+; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NEXT: shldl %cl, %ebx, %edx
+; X86-NEXT: notl %ebx
+; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
; X86-NEXT: notl %edx
-; X86-NEXT: movl 8(%ebp), %ebx
-; X86-NEXT: andl %edx, 4(%ebx)
-; X86-NEXT: notl %esi
-; X86-NEXT: andl %esi, 8(%ebx)
+; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
; X86-NEXT: notl %edi
-; X86-NEXT: andl %edi, 12(%ebx)
-; X86-NEXT: btl %ecx, %eax
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: jae .LBB22_2
+; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X86-NEXT: orl %esi, %eax
+; X86-NEXT: movl 16(%ebp), %eax
+; X86-NEXT: movl (%eax), %eax
+; X86-NEXT: movl 8(%ebp), %esi
+; X86-NEXT: movl %ebx, 8(%esi)
+; X86-NEXT: movl %ecx, 12(%esi)
+; X86-NEXT: movl %edi, (%esi)
+; X86-NEXT: movl %edx, 4(%esi)
+; X86-NEXT: je .LBB22_2
; X86-NEXT: # %bb.1:
; X86-NEXT: xorl %eax, %eax
; X86-NEXT: .LBB22_2:
@@ -1105,75 +7242,52 @@ define i32 @reset_multiload_i128(ptr %word, i32 %position, ptr %p) nounwind {
; SSE-NEXT: testb $64, %cl
; SSE-NEXT: cmovneq %rsi, %r8
; SSE-NEXT: cmovneq %rax, %rsi
+; SSE-NEXT: movq (%rdi), %rcx
+; SSE-NEXT: movq 8(%rdi), %r9
+; SSE-NEXT: movq %r9, %r10
+; SSE-NEXT: andq %r8, %r10
; SSE-NEXT: notq %r8
+; SSE-NEXT: movq %rcx, %r11
+; SSE-NEXT: andq %rsi, %r11
; SSE-NEXT: notq %rsi
-; SSE-NEXT: movl %ecx, %r9d
-; SSE-NEXT: andl $96, %r9d
-; SSE-NEXT: shrl $3, %r9d
-; SSE-NEXT: movl (%rdi,%r9), %r9d
-; SSE-NEXT: btl %ecx, %r9d
-; SSE-NEXT: jb .LBB22_2
+; SSE-NEXT: andq %r9, %r8
+; SSE-NEXT: andq %rcx, %rsi
+; SSE-NEXT: orq %r10, %r11
+; SSE-NEXT: jne .LBB22_2
; SSE-NEXT: # %bb.1:
; SSE-NEXT: movl (%rdx), %eax
; SSE-NEXT: .LBB22_2:
-; SSE-NEXT: andq %r8, 8(%rdi)
-; SSE-NEXT: andq %rsi, (%rdi)
+; SSE-NEXT: movq %rsi, (%rdi)
+; SSE-NEXT: movq %r8, 8(%rdi)
; SSE-NEXT: # kill: def $eax killed $eax killed $rax
; SSE-NEXT: retq
;
-; AVX2-LABEL: reset_multiload_i128:
-; AVX2: # %bb.0:
-; AVX2-NEXT: movl %esi, %ecx
-; AVX2-NEXT: xorl %eax, %eax
-; AVX2-NEXT: movl $1, %r8d
-; AVX2-NEXT: xorl %esi, %esi
-; AVX2-NEXT: shldq %cl, %r8, %rsi
-; AVX2-NEXT: shlxq %rcx, %r8, %r8
-; AVX2-NEXT: testb $64, %cl
-; AVX2-NEXT: cmovneq %r8, %rsi
-; AVX2-NEXT: cmovneq %rax, %r8
-; AVX2-NEXT: notq %rsi
-; AVX2-NEXT: notq %r8
-; AVX2-NEXT: movl %ecx, %r9d
-; AVX2-NEXT: andl $96, %r9d
-; AVX2-NEXT: shrl $3, %r9d
-; AVX2-NEXT: movl (%rdi,%r9), %r9d
-; AVX2-NEXT: btl %ecx, %r9d
-; AVX2-NEXT: jb .LBB22_2
-; AVX2-NEXT: # %bb.1:
-; AVX2-NEXT: movl (%rdx), %eax
-; AVX2-NEXT: .LBB22_2:
-; AVX2-NEXT: andq %rsi, 8(%rdi)
-; AVX2-NEXT: andq %r8, (%rdi)
-; AVX2-NEXT: # kill: def $eax killed $eax killed $rax
-; AVX2-NEXT: retq
-;
-; AVX512-LABEL: reset_multiload_i128:
-; AVX512: # %bb.0:
-; AVX512-NEXT: movl %esi, %ecx
-; AVX512-NEXT: movl $1, %r8d
-; AVX512-NEXT: xorl %esi, %esi
-; AVX512-NEXT: shldq %cl, %r8, %rsi
-; AVX512-NEXT: xorl %eax, %eax
-; AVX512-NEXT: shlxq %rcx, %r8, %r8
-; AVX512-NEXT: testb $64, %cl
-; AVX512-NEXT: cmovneq %r8, %rsi
-; AVX512-NEXT: cmovneq %rax, %r8
-; AVX512-NEXT: notq %rsi
-; AVX512-NEXT: notq %r8
-; AVX512-NEXT: movl %ecx, %r9d
-; AVX512-NEXT: andl $96, %r9d
-; AVX512-NEXT: shrl $3, %r9d
-; AVX512-NEXT: movl (%rdi,%r9), %r9d
-; AVX512-NEXT: btl %ecx, %r9d
-; AVX512-NEXT: jb .LBB22_2
-; AVX512-NEXT: # %bb.1:
-; AVX512-NEXT: movl (%rdx), %eax
-; AVX512-NEXT: .LBB22_2:
-; AVX512-NEXT: andq %rsi, 8(%rdi)
-; AVX512-NEXT: andq %r8, (%rdi)
-; AVX512-NEXT: # kill: def $eax killed $eax killed $rax
-; AVX512-NEXT: retq
+; AVX-LABEL: reset_multiload_i128:
+; AVX: # %bb.0:
+; AVX-NEXT: movl %esi, %ecx
+; AVX-NEXT: movl $1, %esi
+; AVX-NEXT: xorl %r8d, %r8d
+; AVX-NEXT: shldq %cl, %rsi, %r8
+; AVX-NEXT: xorl %eax, %eax
+; AVX-NEXT: shlxq %rcx, %rsi, %r9
+; AVX-NEXT: testb $64, %cl
+; AVX-NEXT: cmovneq %r9, %r8
+; AVX-NEXT: cmovneq %rax, %r9
+; AVX-NEXT: movq (%rdi), %r10
+; AVX-NEXT: movq 8(%rdi), %r11
+; AVX-NEXT: andnq %r11, %r8, %rcx
+; AVX-NEXT: andq %r8, %r11
+; AVX-NEXT: andnq %r10, %r9, %rsi
+; AVX-NEXT: andq %r9, %r10
+; AVX-NEXT: orq %r11, %r10
+; AVX-NEXT: jne .LBB22_2
+; AVX-NEXT: # %bb.1:
+; AVX-NEXT: movl (%rdx), %eax
+; AVX-NEXT: .LBB22_2:
+; AVX-NEXT: movq %rsi, (%rdi)
+; AVX-NEXT: movq %rcx, 8(%rdi)
+; AVX-NEXT: # kill: def $eax killed $eax killed $rax
+; AVX-NEXT: retq
%rem = and i32 %position, 127
%ofs = zext nneg i32 %rem to i128
%bit = shl nuw i128 1, %ofs
diff --git a/llvm/test/DebugInfo/PDB/Native/pdb-native-index-overflow.test b/llvm/test/DebugInfo/PDB/Native/pdb-native-index-overflow.test
new file mode 100755
index 0000000..aa3f6dc
--- /dev/null
+++ b/llvm/test/DebugInfo/PDB/Native/pdb-native-index-overflow.test
@@ -0,0 +1,13 @@
+; Test that the native PDB reader isn't crashed by index value bigger than
+; number of types in TPI or IPI stream
+; RUN: llvm-pdbutil dump %p/../Inputs/empty.pdb --type-index=20000000\
+; RUN: | FileCheck -check-prefixes=TYPES,NOT_FOUND %s
+; RUN: llvm-pdbutil dump %p/../Inputs/empty.pdb --id-index=20000000\
+; RUN: | FileCheck -check-prefixes=IDS,NOT_FOUND %s
+
+TYPES: Types (TPI Stream)
+IDS: Types (IPI Stream)
+NOT_FOUND:============================================================
+NOT_FOUND: Showing 1 records.
+NOT_FOUND: Type 0x1312D00 doesn't exist in TPI stream
+
diff --git a/llvm/test/Transforms/InstCombine/vec_extract_var_elt-inseltpoison.ll b/llvm/test/Transforms/InstCombine/vec_extract_var_elt-inseltpoison.ll
deleted file mode 100644
index 9fcac80..0000000
--- a/llvm/test/Transforms/InstCombine/vec_extract_var_elt-inseltpoison.ll
+++ /dev/null
@@ -1,26 +0,0 @@
-; RUN: opt < %s -passes=instcombine -S | FileCheck %s
-
-define void @test (float %b, ptr %p) {
-; CHECK: extractelement
-; CHECK: fptosi
- %1 = load <8 x float> , ptr %p
- %2 = bitcast <8 x float> %1 to <8 x i32>
- %3 = bitcast <8 x i32> %2 to <8 x float>
- %a = fptosi <8 x float> %3 to <8 x i32>
- %4 = fptosi float %b to i32
- %5 = add i32 %4, -2
- %6 = extractelement <8 x i32> %a, i32 %5
- %7 = insertelement <8 x i32> poison, i32 %6, i32 7
- %8 = sitofp <8 x i32> %7 to <8 x float>
- store <8 x float> %8, ptr %p
- ret void
-}
-
-; PR18600
-define i32 @test2(i32 %i) {
- %e = extractelement <4 x i32> bitcast (<2 x i64> <i64 1, i64 2> to <4 x i32>), i32 %i
- ret i32 %e
-
-; CHECK-LABEL: @test2
-; CHECK: extractelement
-}
diff --git a/llvm/test/Transforms/InstCombine/vec_extract_var_elt.ll b/llvm/test/Transforms/InstCombine/vec_extract_var_elt.ll
index 32bf4da..205b4b8 100644
--- a/llvm/test/Transforms/InstCombine/vec_extract_var_elt.ll
+++ b/llvm/test/Transforms/InstCombine/vec_extract_var_elt.ll
@@ -1,26 +1,81 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6
; RUN: opt < %s -passes=instcombine -S | FileCheck %s
-define void @test (float %b, ptr %p) {
-; CHECK: extractelement
-; CHECK: fptosi
- %1 = load <8 x float> , ptr %p
+define void @test_poison(float %b, ptr %p) {
+; CHECK-LABEL: define void @test_poison(
+; CHECK-SAME: float [[B:%.*]], ptr [[P:%.*]]) {
+; CHECK-NEXT: [[TMP1:%.*]] = load <8 x float>, ptr [[P]], align 32
+; CHECK-NEXT: [[TMP2:%.*]] = fptosi float [[B]] to i32
+; CHECK-NEXT: [[TMP3:%.*]] = add i32 [[TMP2]], -2
+; CHECK-NEXT: [[TMP4:%.*]] = extractelement <8 x float> [[TMP1]], i32 [[TMP3]]
+; CHECK-NEXT: [[TMP5:%.*]] = fptosi float [[TMP4]] to i32
+; CHECK-NEXT: [[TMP6:%.*]] = insertelement <8 x i32> poison, i32 [[TMP5]], i64 7
+; CHECK-NEXT: [[TMP7:%.*]] = sitofp <8 x i32> [[TMP6]] to <8 x float>
+; CHECK-NEXT: store <8 x float> [[TMP7]], ptr [[P]], align 32
+; CHECK-NEXT: ret void
+;
+ %1 = load <8 x float>, ptr %p
%2 = bitcast <8 x float> %1 to <8 x i32>
%3 = bitcast <8 x i32> %2 to <8 x float>
%a = fptosi <8 x float> %3 to <8 x i32>
%4 = fptosi float %b to i32
%5 = add i32 %4, -2
%6 = extractelement <8 x i32> %a, i32 %5
- %7 = insertelement <8 x i32> undef, i32 %6, i32 7
+ %7 = insertelement <8 x i32> poison, i32 %6, i32 7
%8 = sitofp <8 x i32> %7 to <8 x float>
store <8 x float> %8, ptr %p
- ret void
+ ret void
}
; PR18600
-define i32 @test2(i32 %i) {
+define i32 @test_bitcast(i32 %i) {
+; CHECK-LABEL: define i32 @test_bitcast(
+; CHECK-SAME: i32 [[I:%.*]]) {
+; CHECK-NEXT: [[E:%.*]] = extractelement <4 x i32> <i32 1, i32 0, i32 2, i32 0>, i32 [[I]]
+; CHECK-NEXT: ret i32 [[E]]
+;
%e = extractelement <4 x i32> bitcast (<2 x i64> <i64 1, i64 2> to <4 x i32>), i32 %i
ret i32 %e
+}
+
+declare void @use(i32)
-; CHECK-LABEL: @test2
-; CHECK: extractelement
+define void @test_loop(<4 x float> %in) {
+; CHECK-LABEL: define void @test_loop(
+; CHECK-SAME: <4 x float> [[IN:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*]]:
+; CHECK-NEXT: [[R:%.*]] = call <4 x float> @llvm.x86.sse41.round.ps(<4 x float> [[IN]], i32 9)
+; CHECK-NEXT: br label %[[LOOP:.*]]
+; CHECK: [[LOOP]]:
+; CHECK-NEXT: [[I:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ [[NEXT:%.*]], %[[LATCH:.*]] ]
+; CHECK-NEXT: [[COND:%.*]] = icmp samesign ult i32 [[I]], 4
+; CHECK-NEXT: br i1 [[COND]], label %[[BODY:.*]], label %[[DONE:.*]]
+; CHECK: [[BODY]]:
+; CHECK-NEXT: [[TMP0:%.*]] = extractelement <4 x float> [[R]], i32 [[I]]
+; CHECK-NEXT: [[ELEM:%.*]] = fptosi float [[TMP0]] to i32
+; CHECK-NEXT: call void @use(i32 [[ELEM]])
+; CHECK-NEXT: br label %[[LATCH]]
+; CHECK: [[LATCH]]:
+; CHECK-NEXT: [[NEXT]] = add nuw nsw i32 [[I]], 1
+; CHECK-NEXT: br label %[[LOOP]]
+; CHECK: [[DONE]]:
+; CHECK-NEXT: ret void
+;
+entry:
+ %r = call <4 x float> @llvm.x86.sse41.round.ps(<4 x float> %in, i32 9)
+ %vi = fptosi <4 x float> %r to <4 x i32>
+ br label %loop
+loop:
+ %i = phi i32 [ 0, %entry ], [ %next, %latch ]
+ %cond = icmp ult i32 %i, 4
+ br i1 %cond, label %body, label %done
+body:
+ %elem = extractelement <4 x i32> %vi, i32 %i
+ call void @use(i32 %elem)
+ br label %latch
+latch:
+ %next = add i32 %i, 1
+ br label %loop
+done:
+ ret void
}
diff --git a/llvm/test/tools/UpdateTestChecks/update_test_checks/Inputs/check_empty.ll b/llvm/test/tools/UpdateTestChecks/update_test_checks/Inputs/check_empty.ll
new file mode 100644
index 0000000..bfd216d
--- /dev/null
+++ b/llvm/test/tools/UpdateTestChecks/update_test_checks/Inputs/check_empty.ll
@@ -0,0 +1,29 @@
+; RUN: opt < %s -S | FileCheck %s
+
+; Test whether UTC checks empty lines instead of skipping them.
+define i32 @test(i32 %x) {
+entry:
+ br label %block1
+
+block1:
+ %cmp = icmp eq i32 %x, 0
+ br i1 %cmp, label %block2, label %exit1
+
+block2:
+ br i1 %cmp, label %block3, label %exit2
+
+block3:
+ br i1 %cmp, label %exit3, label %exit4
+
+exit1:
+ ret i32 0
+
+exit2:
+ ret i32 %x
+
+exit3:
+ ret i32 %x
+
+exit4:
+ ret i32 %x
+}
diff --git a/llvm/test/tools/UpdateTestChecks/update_test_checks/Inputs/check_empty.ll.expected b/llvm/test/tools/UpdateTestChecks/update_test_checks/Inputs/check_empty.ll.expected
new file mode 100644
index 0000000..c5f822d
--- /dev/null
+++ b/llvm/test/tools/UpdateTestChecks/update_test_checks/Inputs/check_empty.ll.expected
@@ -0,0 +1,57 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 7
+; RUN: opt < %s -S | FileCheck %s
+
+; Test whether UTC checks empty lines instead of skipping them.
+define i32 @test(i32 %x) {
+; CHECK-LABEL: define i32 @test(
+; CHECK-SAME: i32 [[X:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: br label %[[BLOCK1:.*]]
+; CHECK-EMPTY:
+; CHECK-NEXT: [[BLOCK1]]:
+; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 [[X]], 0
+; CHECK-NEXT: br i1 [[CMP]], label %[[BLOCK2:.*]], label %[[EXIT1:.*]]
+; CHECK-EMPTY:
+; CHECK-NEXT: [[BLOCK2]]:
+; CHECK-NEXT: br i1 [[CMP]], label %[[BLOCK3:.*]], label %[[EXIT2:.*]]
+; CHECK-EMPTY:
+; CHECK-NEXT: [[BLOCK3]]:
+; CHECK-NEXT: br i1 [[CMP]], label %[[EXIT3:.*]], label %[[EXIT4:.*]]
+; CHECK-EMPTY:
+; CHECK-NEXT: [[EXIT1]]:
+; CHECK-NEXT: ret i32 0
+; CHECK-EMPTY:
+; CHECK-NEXT: [[EXIT2]]:
+; CHECK-NEXT: ret i32 [[X]]
+; CHECK-EMPTY:
+; CHECK-NEXT: [[EXIT3]]:
+; CHECK-NEXT: ret i32 [[X]]
+; CHECK-EMPTY:
+; CHECK-NEXT: [[EXIT4]]:
+; CHECK-NEXT: ret i32 [[X]]
+;
+entry:
+ br label %block1
+
+block1:
+ %cmp = icmp eq i32 %x, 0
+ br i1 %cmp, label %block2, label %exit1
+
+block2:
+ br i1 %cmp, label %block3, label %exit2
+
+block3:
+ br i1 %cmp, label %exit3, label %exit4
+
+exit1:
+ ret i32 0
+
+exit2:
+ ret i32 %x
+
+exit3:
+ ret i32 %x
+
+exit4:
+ ret i32 %x
+}
diff --git a/llvm/test/tools/UpdateTestChecks/update_test_checks/Inputs/switch_case.ll.expected b/llvm/test/tools/UpdateTestChecks/update_test_checks/Inputs/switch_case.ll.expected
index b1977e7..8cab0bb 100644
--- a/llvm/test/tools/UpdateTestChecks/update_test_checks/Inputs/switch_case.ll.expected
+++ b/llvm/test/tools/UpdateTestChecks/update_test_checks/Inputs/switch_case.ll.expected
@@ -12,13 +12,17 @@ define i8 @testi8(i8 %x) {
; CHECK-NEXT: i8 2, label %[[CASE3:.*]]
; CHECK-NEXT: i8 3, label %[[CASE3]]
; CHECK-NEXT: ]
-; CHECK: [[DEFAULT]]:
+; CHECK-EMPTY:
+; CHECK-NEXT: [[DEFAULT]]:
; CHECK-NEXT: ret i8 0
-; CHECK: [[CASE1]]:
+; CHECK-EMPTY:
+; CHECK-NEXT: [[CASE1]]:
; CHECK-NEXT: ret i8 1
-; CHECK: [[CASE2]]:
+; CHECK-EMPTY:
+; CHECK-NEXT: [[CASE2]]:
; CHECK-NEXT: ret i8 2
-; CHECK: [[CASE3]]:
+; CHECK-EMPTY:
+; CHECK-NEXT: [[CASE3]]:
; CHECK-NEXT: ret i8 3
;
switch i8 %x, label %default [
@@ -46,13 +50,17 @@ define i32 @testi32(i32 %x) {
; CHECK-NEXT: i32 2, label %[[CASE3:.*]]
; CHECK-NEXT: i32 3, label %[[CASE3]]
; CHECK-NEXT: ]
-; CHECK: [[DEFAULT]]:
+; CHECK-EMPTY:
+; CHECK-NEXT: [[DEFAULT]]:
; CHECK-NEXT: ret i32 0
-; CHECK: [[CASE1]]:
+; CHECK-EMPTY:
+; CHECK-NEXT: [[CASE1]]:
; CHECK-NEXT: ret i32 1
-; CHECK: [[CASE2]]:
+; CHECK-EMPTY:
+; CHECK-NEXT: [[CASE2]]:
; CHECK-NEXT: ret i32 2
-; CHECK: [[CASE3]]:
+; CHECK-EMPTY:
+; CHECK-NEXT: [[CASE3]]:
; CHECK-NEXT: ret i32 3
;
switch i32 %x, label %default [
@@ -80,13 +88,17 @@ define i128 @testi128(i128 %x) {
; CHECK-NEXT: i128 2, label %[[CASE3:.*]]
; CHECK-NEXT: i128 3, label %[[CASE3]]
; CHECK-NEXT: ]
-; CHECK: [[DEFAULT]]:
+; CHECK-EMPTY:
+; CHECK-NEXT: [[DEFAULT]]:
; CHECK-NEXT: ret i128 0
-; CHECK: [[CASE1]]:
+; CHECK-EMPTY:
+; CHECK-NEXT: [[CASE1]]:
; CHECK-NEXT: ret i128 1
-; CHECK: [[CASE2]]:
+; CHECK-EMPTY:
+; CHECK-NEXT: [[CASE2]]:
; CHECK-NEXT: ret i128 2
-; CHECK: [[CASE3]]:
+; CHECK-EMPTY:
+; CHECK-NEXT: [[CASE3]]:
; CHECK-NEXT: ret i128 3
;
switch i128 %x, label %default [
diff --git a/llvm/test/tools/UpdateTestChecks/update_test_checks/check_empty.test b/llvm/test/tools/UpdateTestChecks/update_test_checks/check_empty.test
new file mode 100644
index 0000000..670bda2
--- /dev/null
+++ b/llvm/test/tools/UpdateTestChecks/update_test_checks/check_empty.test
@@ -0,0 +1,3 @@
+## test whether the UTC generates CHECK-EMPTY for blank lines
+# RUN: cp -f %S/Inputs/check_empty.ll %t.ll && %update_test_checks %t.ll --version 7
+# RUN: diff -u %t.ll %S/Inputs/check_empty.ll.expected
diff --git a/llvm/unittests/ExecutionEngine/Orc/CMakeLists.txt b/llvm/unittests/ExecutionEngine/Orc/CMakeLists.txt
index b06aa25..7b563d7 100644
--- a/llvm/unittests/ExecutionEngine/Orc/CMakeLists.txt
+++ b/llvm/unittests/ExecutionEngine/Orc/CMakeLists.txt
@@ -26,6 +26,7 @@ add_llvm_unittest(OrcJITTests
IndirectionUtilsTest.cpp
JITTargetMachineBuilderTest.cpp
LazyCallThroughAndReexportsTest.cpp
+ LibraryResolverTest.cpp
LookupAndRecordAddrsTest.cpp
MachOPlatformTest.cpp
MapperJITLinkMemoryManagerTest.cpp
diff --git a/llvm/unittests/ExecutionEngine/Orc/Inputs/A/A_linux.yaml b/llvm/unittests/ExecutionEngine/Orc/Inputs/A/A_linux.yaml
new file mode 100644
index 0000000..afd1d9e
--- /dev/null
+++ b/llvm/unittests/ExecutionEngine/Orc/Inputs/A/A_linux.yaml
@@ -0,0 +1,460 @@
+--- !ELF
+FileHeader:
+ Class: ELFCLASS64
+ Data: ELFDATA2LSB
+ Type: ET_DYN
+ Machine: EM_X86_64
+ProgramHeaders:
+ - Type: PT_LOAD
+ Flags: [ PF_R ]
+ FirstSec: .note.gnu.property
+ LastSec: .rela.plt
+ Align: 0x1000
+ Offset: 0x0
+ - Type: PT_LOAD
+ Flags: [ PF_X, PF_R ]
+ FirstSec: .init
+ LastSec: .fini
+ VAddr: 0x1000
+ Align: 0x1000
+ Offset: 0x1000
+ - Type: PT_LOAD
+ Flags: [ PF_R ]
+ FirstSec: .rodata
+ LastSec: .eh_frame
+ VAddr: 0x2000
+ Align: 0x1000
+ Offset: 0x2000
+ - Type: PT_LOAD
+ Flags: [ PF_W, PF_R ]
+ FirstSec: .init_array
+ LastSec: .bss
+ VAddr: 0x3E10
+ Align: 0x1000
+ Offset: 0x2E10
+ - Type: PT_DYNAMIC
+ Flags: [ PF_W, PF_R ]
+ FirstSec: .dynamic
+ LastSec: .dynamic
+ VAddr: 0x3E20
+ Align: 0x8
+ Offset: 0x2E20
+ - Type: PT_NOTE
+ Flags: [ PF_R ]
+ FirstSec: .note.gnu.property
+ LastSec: .note.gnu.property
+ VAddr: 0x2A8
+ Align: 0x8
+ Offset: 0x2A8
+ - Type: PT_NOTE
+ Flags: [ PF_R ]
+ FirstSec: .note.gnu.build-id
+ LastSec: .note.gnu.build-id
+ VAddr: 0x2C8
+ Align: 0x4
+ Offset: 0x2C8
+ - Type: PT_GNU_PROPERTY
+ Flags: [ PF_R ]
+ FirstSec: .note.gnu.property
+ LastSec: .note.gnu.property
+ VAddr: 0x2A8
+ Align: 0x8
+ Offset: 0x2A8
+ - Type: PT_GNU_EH_FRAME
+ Flags: [ PF_R ]
+ FirstSec: .eh_frame_hdr
+ LastSec: .eh_frame_hdr
+ VAddr: 0x2010
+ Align: 0x4
+ Offset: 0x2010
+ - Type: PT_GNU_STACK
+ Flags: [ PF_W, PF_R ]
+ Align: 0x10
+ Offset: 0x0
+ - Type: PT_GNU_RELRO
+ Flags: [ PF_R ]
+ FirstSec: .init_array
+ LastSec: .got
+ VAddr: 0x3E10
+ Offset: 0x2E10
+Sections:
+ - Name: .note.gnu.property
+ Type: SHT_NOTE
+ Flags: [ SHF_ALLOC ]
+ Address: 0x2A8
+ AddressAlign: 0x8
+ Notes:
+ - Name: GNU
+ Desc: 020000C0040000000300000000000000
+ Type: NT_GNU_PROPERTY_TYPE_0
+ - Name: .note.gnu.build-id
+ Type: SHT_NOTE
+ Flags: [ SHF_ALLOC ]
+ Address: 0x2C8
+ AddressAlign: 0x4
+ Notes:
+ - Name: GNU
+ Desc: 73604396C95840D5C380A0950F085A778F94EE7C
+ Type: NT_PRPSINFO
+ - Name: .gnu.hash
+ Type: SHT_GNU_HASH
+ Flags: [ SHF_ALLOC ]
+ Address: 0x2F0
+ Link: .dynsym
+ AddressAlign: 0x8
+ Header:
+ SymNdx: 0x6
+ Shift2: 0x6
+ BloomFilter: [ 0x400000080000 ]
+ HashBuckets: [ 0x0, 0x6 ]
+ HashValues: [ 0x7C9DCB93 ]
+ - Name: .dynsym
+ Type: SHT_DYNSYM
+ Flags: [ SHF_ALLOC ]
+ Address: 0x318
+ Link: .dynstr
+ AddressAlign: 0x8
+ - Name: .dynstr
+ Type: SHT_STRTAB
+ Flags: [ SHF_ALLOC ]
+ Address: 0x3C0
+ AddressAlign: 0x1
+ - Name: .gnu.version
+ Type: SHT_GNU_versym
+ Flags: [ SHF_ALLOC ]
+ Address: 0x436
+ Link: .dynsym
+ AddressAlign: 0x2
+ Entries: [ 0, 1, 2, 1, 1, 2, 1 ]
+ - Name: .gnu.version_r
+ Type: SHT_GNU_verneed
+ Flags: [ SHF_ALLOC ]
+ Address: 0x448
+ Link: .dynstr
+ AddressAlign: 0x8
+ Dependencies:
+ - Version: 1
+ File: libc.so.6
+ Entries:
+ - Name: GLIBC_2.2.5
+ Hash: 157882997
+ Flags: 0
+ Other: 2
+ - Name: .rela.dyn
+ Type: SHT_RELA
+ Flags: [ SHF_ALLOC ]
+ Address: 0x468
+ Link: .dynsym
+ AddressAlign: 0x8
+ Relocations:
+ - Offset: 0x3E10
+ Type: R_X86_64_RELATIVE
+ Addend: 4368
+ - Offset: 0x3E18
+ Type: R_X86_64_RELATIVE
+ Addend: 4304
+ - Offset: 0x4020
+ Type: R_X86_64_RELATIVE
+ Addend: 16416
+ - Offset: 0x3FE0
+ Symbol: _ITM_deregisterTMCloneTable
+ Type: R_X86_64_GLOB_DAT
+ - Offset: 0x3FE8
+ Symbol: __gmon_start__
+ Type: R_X86_64_GLOB_DAT
+ - Offset: 0x3FF0
+ Symbol: _ITM_registerTMCloneTable
+ Type: R_X86_64_GLOB_DAT
+ - Offset: 0x3FF8
+ Symbol: __cxa_finalize
+ Type: R_X86_64_GLOB_DAT
+ - Name: .rela.plt
+ Type: SHT_RELA
+ Flags: [ SHF_ALLOC, SHF_INFO_LINK ]
+ Address: 0x510
+ Link: .dynsym
+ AddressAlign: 0x8
+ Info: .got.plt
+ Relocations:
+ - Offset: 0x4018
+ Symbol: puts
+ Type: R_X86_64_JUMP_SLOT
+ - Name: .init
+ Type: SHT_PROGBITS
+ Flags: [ SHF_ALLOC, SHF_EXECINSTR ]
+ Address: 0x1000
+ AddressAlign: 0x4
+ Offset: 0x1000
+ Content: F30F1EFA4883EC08488B05D92F00004885C07402FFD04883C408C3
+ - Name: .plt
+ Type: SHT_PROGBITS
+ Flags: [ SHF_ALLOC, SHF_EXECINSTR ]
+ Address: 0x1020
+ AddressAlign: 0x10
+ EntSize: 0x10
+ Content: FF35E22F0000F2FF25E32F00000F1F00F30F1EFA6800000000F2E9E1FFFFFF90
+ - Name: .plt.got
+ Type: SHT_PROGBITS
+ Flags: [ SHF_ALLOC, SHF_EXECINSTR ]
+ Address: 0x1040
+ AddressAlign: 0x10
+ EntSize: 0x10
+ Content: F30F1EFAF2FF25AD2F00000F1F440000
+ - Name: .plt.sec
+ Type: SHT_PROGBITS
+ Flags: [ SHF_ALLOC, SHF_EXECINSTR ]
+ Address: 0x1050
+ AddressAlign: 0x10
+ EntSize: 0x10
+ Content: F30F1EFAF2FF25BD2F00000F1F440000
+ - Name: .text
+ Type: SHT_PROGBITS
+ Flags: [ SHF_ALLOC, SHF_EXECINSTR ]
+ Address: 0x1060
+ AddressAlign: 0x10
+ Content: 488D3DC12F0000488D05BA2F00004839F87415488B05662F00004885C07409FFE00F1F8000000000C30F1F8000000000488D3D912F0000488D358A2F00004829FE4889F048C1EE3F48C1F8034801C648D1FE7414488B05352F00004885C07408FFE0660F1F440000C30F1F8000000000F30F1EFA803D4D2F000000752B5548833D122F0000004889E5740C488B3D2E2F0000E849FFFFFFE864FFFFFFC605252F0000015DC30F1F00C30F1F8000000000F30F1EFAE977FFFFFFF30F1EFA554889E5488D05D80E00004889C7E820FFFFFF905DC3
+ - Name: .fini
+ Type: SHT_PROGBITS
+ Flags: [ SHF_ALLOC, SHF_EXECINSTR ]
+ Address: 0x1134
+ AddressAlign: 0x4
+ Content: F30F1EFA4883EC084883C408C3
+ - Name: .rodata
+ Type: SHT_PROGBITS
+ Flags: [ SHF_ALLOC ]
+ Address: 0x2000
+ AddressAlign: 0x1
+ Offset: 0x2000
+ Content: 48656C6C6F2066726F6D204100
+ - Name: .eh_frame_hdr
+ Type: SHT_PROGBITS
+ Flags: [ SHF_ALLOC ]
+ Address: 0x2010
+ AddressAlign: 0x4
+ Content: 011B033B2C0000000400000010F0FFFF4800000030F0FFFF7000000040F0FFFF8800000009F1FFFFA0000000
+ - Name: .eh_frame
+ Type: SHT_PROGBITS
+ Flags: [ SHF_ALLOC ]
+ Address: 0x2040
+ AddressAlign: 0x8
+ Content: 1400000000000000017A5200017810011B0C070890010000240000001C000000C0EFFFFF20000000000E10460E184A0F0B770880003F1A3A2A332422000000001400000044000000B8EFFFFF100000000000000000000000140000005C000000B0EFFFFF1000000000000000000000001C0000007400000061F0FFFF1A00000000450E108602430D06510C070800000000000000
+ - Name: .init_array
+ Type: SHT_INIT_ARRAY
+ Flags: [ SHF_WRITE, SHF_ALLOC ]
+ Address: 0x3E10
+ AddressAlign: 0x8
+ EntSize: 0x8
+ Offset: 0x2E10
+ Content: '1011000000000000'
+ - Name: .fini_array
+ Type: SHT_FINI_ARRAY
+ Flags: [ SHF_WRITE, SHF_ALLOC ]
+ Address: 0x3E18
+ AddressAlign: 0x8
+ EntSize: 0x8
+ Content: D010000000000000
+ - Name: .dynamic
+ Type: SHT_DYNAMIC
+ Flags: [ SHF_WRITE, SHF_ALLOC ]
+ Address: 0x3E20
+ Link: .dynstr
+ AddressAlign: 0x8
+ Entries:
+ - Tag: DT_NEEDED
+ Value: 0x5F
+ - Tag: DT_INIT
+ Value: 0x1000
+ - Tag: DT_FINI
+ Value: 0x1134
+ - Tag: DT_INIT_ARRAY
+ Value: 0x3E10
+ - Tag: DT_INIT_ARRAYSZ
+ Value: 0x8
+ - Tag: DT_FINI_ARRAY
+ Value: 0x3E18
+ - Tag: DT_FINI_ARRAYSZ
+ Value: 0x8
+ - Tag: DT_GNU_HASH
+ Value: 0x2F0
+ - Tag: DT_STRTAB
+ Value: 0x3C0
+ - Tag: DT_SYMTAB
+ Value: 0x318
+ - Tag: DT_STRSZ
+ Value: 0x75
+ - Tag: DT_SYMENT
+ Value: 0x18
+ - Tag: DT_PLTGOT
+ Value: 0x4000
+ - Tag: DT_PLTRELSZ
+ Value: 0x18
+ - Tag: DT_PLTREL
+ Value: 0x7
+ - Tag: DT_JMPREL
+ Value: 0x510
+ - Tag: DT_RELA
+ Value: 0x468
+ - Tag: DT_RELASZ
+ Value: 0xA8
+ - Tag: DT_RELAENT
+ Value: 0x18
+ - Tag: DT_VERNEED
+ Value: 0x448
+ - Tag: DT_VERNEEDNUM
+ Value: 0x1
+ - Tag: DT_VERSYM
+ Value: 0x436
+ - Tag: DT_RELACOUNT
+ Value: 0x3
+ - Tag: DT_NULL
+ Value: 0x0
+ - Tag: DT_NULL
+ Value: 0x0
+ - Tag: DT_NULL
+ Value: 0x0
+ - Tag: DT_NULL
+ Value: 0x0
+ - Tag: DT_NULL
+ Value: 0x0
+ - Name: .got
+ Type: SHT_PROGBITS
+ Flags: [ SHF_WRITE, SHF_ALLOC ]
+ Address: 0x3FE0
+ AddressAlign: 0x8
+ EntSize: 0x8
+ Content: '0000000000000000000000000000000000000000000000000000000000000000'
+ - Name: .got.plt
+ Type: SHT_PROGBITS
+ Flags: [ SHF_WRITE, SHF_ALLOC ]
+ Address: 0x4000
+ AddressAlign: 0x8
+ EntSize: 0x8
+ Content: '203E000000000000000000000000000000000000000000003010000000000000'
+ - Name: .data
+ Type: SHT_PROGBITS
+ Flags: [ SHF_WRITE, SHF_ALLOC ]
+ Address: 0x4020
+ AddressAlign: 0x8
+ Content: '2040000000000000'
+ - Name: .bss
+ Type: SHT_NOBITS
+ Flags: [ SHF_WRITE, SHF_ALLOC ]
+ Address: 0x4028
+ AddressAlign: 0x1
+ Size: 0x8
+ - Name: .comment
+ Type: SHT_PROGBITS
+ Flags: [ SHF_MERGE, SHF_STRINGS ]
+ AddressAlign: 0x1
+ EntSize: 0x1
+ Content: 4743433A20285562756E74752031312E342E302D317562756E7475317E32322E30342E32292031312E342E3000
+Symbols:
+ - Name: crtstuff.c
+ Type: STT_FILE
+ Index: SHN_ABS
+ - Name: deregister_tm_clones
+ Type: STT_FUNC
+ Section: .text
+ Value: 0x1060
+ - Name: register_tm_clones
+ Type: STT_FUNC
+ Section: .text
+ Value: 0x1090
+ - Name: __do_global_dtors_aux
+ Type: STT_FUNC
+ Section: .text
+ Value: 0x10D0
+ - Name: completed.0
+ Type: STT_OBJECT
+ Section: .bss
+ Value: 0x4028
+ Size: 0x1
+ - Name: __do_global_dtors_aux_fini_array_entry
+ Type: STT_OBJECT
+ Section: .fini_array
+ Value: 0x3E18
+ - Name: frame_dummy
+ Type: STT_FUNC
+ Section: .text
+ Value: 0x1110
+ - Name: __frame_dummy_init_array_entry
+ Type: STT_OBJECT
+ Section: .init_array
+ Value: 0x3E10
+ - Name: libA.c
+ Type: STT_FILE
+ Index: SHN_ABS
+ - Name: 'crtstuff.c (1)'
+ Type: STT_FILE
+ Index: SHN_ABS
+ - Name: __FRAME_END__
+ Type: STT_OBJECT
+ Section: .eh_frame
+ Value: 0x20D0
+ - Type: STT_FILE
+ Index: SHN_ABS
+ - Name: _fini
+ Type: STT_FUNC
+ Section: .fini
+ Value: 0x1134
+ - Name: __dso_handle
+ Type: STT_OBJECT
+ Section: .data
+ Value: 0x4020
+ - Name: _DYNAMIC
+ Type: STT_OBJECT
+ Section: .dynamic
+ Value: 0x3E20
+ - Name: __GNU_EH_FRAME_HDR
+ Section: .eh_frame_hdr
+ Value: 0x2010
+ - Name: __TMC_END__
+ Type: STT_OBJECT
+ Section: .data
+ Value: 0x4028
+ - Name: _GLOBAL_OFFSET_TABLE_
+ Type: STT_OBJECT
+ Section: .got.plt
+ Value: 0x4000
+ - Name: _init
+ Type: STT_FUNC
+ Section: .init
+ Value: 0x1000
+ - Name: _ITM_deregisterTMCloneTable
+ Binding: STB_WEAK
+ - Name: 'puts@GLIBC_2.2.5'
+ Type: STT_FUNC
+ Binding: STB_GLOBAL
+ - Name: sayA
+ Type: STT_FUNC
+ Section: .text
+ Binding: STB_GLOBAL
+ Value: 0x1119
+ Size: 0x1A
+ - Name: __gmon_start__
+ Binding: STB_WEAK
+ - Name: _ITM_registerTMCloneTable
+ Binding: STB_WEAK
+ - Name: '__cxa_finalize@GLIBC_2.2.5'
+ Type: STT_FUNC
+ Binding: STB_WEAK
+DynamicSymbols:
+ - Name: _ITM_deregisterTMCloneTable
+ Binding: STB_WEAK
+ - Name: puts
+ Type: STT_FUNC
+ Binding: STB_GLOBAL
+ - Name: __gmon_start__
+ Binding: STB_WEAK
+ - Name: _ITM_registerTMCloneTable
+ Binding: STB_WEAK
+ - Name: __cxa_finalize
+ Type: STT_FUNC
+ Binding: STB_WEAK
+ - Name: sayA
+ Type: STT_FUNC
+ Section: .text
+ Binding: STB_GLOBAL
+ Value: 0x1119
+ Size: 0x1A
+...
diff --git a/llvm/unittests/ExecutionEngine/Orc/Inputs/A/A_macho.yaml b/llvm/unittests/ExecutionEngine/Orc/Inputs/A/A_macho.yaml
new file mode 100644
index 0000000..2e851a90
--- /dev/null
+++ b/llvm/unittests/ExecutionEngine/Orc/Inputs/A/A_macho.yaml
@@ -0,0 +1,723 @@
+--- !fat-mach-o
+FatHeader:
+ magic: 0xCAFEBABE
+ nfat_arch: 3
+FatArchs:
+ - cputype: 0x1000007
+ cpusubtype: 0x3
+ offset: 0x1000
+ size: 8376
+ align: 12
+ - cputype: 0x100000C
+ cpusubtype: 0x0
+ offset: 0x4000
+ size: 33376
+ align: 14
+ - cputype: 0x100000C
+ cpusubtype: 0x80000002
+ offset: 0x10000
+ size: 33376
+ align: 14
+Slices:
+ - !mach-o
+ FileHeader:
+ magic: 0xFEEDFACF
+ cputype: 0x1000007
+ cpusubtype: 0x3
+ filetype: 0x6
+ ncmds: 14
+ sizeofcmds: 960
+ flags: 0x100085
+ reserved: 0x0
+ LoadCommands:
+ - cmd: LC_SEGMENT_64
+ cmdsize: 392
+ segname: __TEXT
+ vmaddr: 0
+ vmsize: 4096
+ fileoff: 0
+ filesize: 4096
+ maxprot: 5
+ initprot: 5
+ nsects: 4
+ flags: 0
+ Sections:
+ - sectname: __text
+ segname: __TEXT
+ addr: 0xF80
+ size: 20
+ offset: 0xF80
+ align: 4
+ reloff: 0x0
+ nreloc: 0
+ flags: 0x80000400
+ reserved1: 0x0
+ reserved2: 0x0
+ reserved3: 0x0
+ content: 554889E5488D3D0F000000B000E8020000005DC3
+ - sectname: __stubs
+ segname: __TEXT
+ addr: 0xF94
+ size: 6
+ offset: 0xF94
+ align: 1
+ reloff: 0x0
+ nreloc: 0
+ flags: 0x80000408
+ reserved1: 0x0
+ reserved2: 0x6
+ reserved3: 0x0
+ content: FF2566000000
+ - sectname: __cstring
+ segname: __TEXT
+ addr: 0xF9A
+ size: 14
+ offset: 0xF9A
+ align: 0
+ reloff: 0x0
+ nreloc: 0
+ flags: 0x2
+ reserved1: 0x0
+ reserved2: 0x0
+ reserved3: 0x0
+ content: 48656C6C6F2066726F6D20410A00
+ - sectname: __unwind_info
+ segname: __TEXT
+ addr: 0xFA8
+ size: 88
+ offset: 0xFA8
+ align: 2
+ reloff: 0x0
+ nreloc: 0
+ flags: 0x0
+ reserved1: 0x0
+ reserved2: 0x0
+ reserved3: 0x0
+ content: 010000001C000000000000001C000000000000001C00000002000000800F00004000000040000000940F00000000000040000000000000000000000000000000030000000C00010010000100000000000000000100000000
+ - cmd: LC_SEGMENT_64
+ cmdsize: 152
+ segname: __DATA_CONST
+ vmaddr: 4096
+ vmsize: 4096
+ fileoff: 4096
+ filesize: 4096
+ maxprot: 3
+ initprot: 3
+ nsects: 1
+ flags: 16
+ Sections:
+ - sectname: __got
+ segname: __DATA_CONST
+ addr: 0x1000
+ size: 8
+ offset: 0x1000
+ align: 3
+ reloff: 0x0
+ nreloc: 0
+ flags: 0x6
+ reserved1: 0x1
+ reserved2: 0x0
+ reserved3: 0x0
+ content: '0000000000000080'
+ - cmd: LC_SEGMENT_64
+ cmdsize: 72
+ segname: __LINKEDIT
+ vmaddr: 8192
+ vmsize: 4096
+ fileoff: 8192
+ filesize: 184
+ maxprot: 1
+ initprot: 1
+ nsects: 0
+ flags: 0
+ - cmd: LC_ID_DYLIB
+ cmdsize: 48
+ dylib:
+ name: 24
+ timestamp: 1
+ current_version: 0
+ compatibility_version: 0
+ Content: '@rpath/libA.dylib'
+ ZeroPadBytes: 7
+ - cmd: LC_DYLD_CHAINED_FIXUPS
+ cmdsize: 16
+ dataoff: 8192
+ datasize: 96
+ - cmd: LC_DYLD_EXPORTS_TRIE
+ cmdsize: 16
+ dataoff: 8288
+ datasize: 24
+ - cmd: LC_SYMTAB
+ cmdsize: 24
+ symoff: 8320
+ nsyms: 2
+ stroff: 8360
+ strsize: 16
+ - cmd: LC_DYSYMTAB
+ cmdsize: 80
+ ilocalsym: 0
+ nlocalsym: 0
+ iextdefsym: 0
+ nextdefsym: 1
+ iundefsym: 1
+ nundefsym: 1
+ tocoff: 0
+ ntoc: 0
+ modtaboff: 0
+ nmodtab: 0
+ extrefsymoff: 0
+ nextrefsyms: 0
+ indirectsymoff: 8352
+ nindirectsyms: 2
+ extreloff: 0
+ nextrel: 0
+ locreloff: 0
+ nlocrel: 0
+ - cmd: LC_UUID
+ cmdsize: 24
+ uuid: ADFFA141-C3EE-37CD-B1E7-906D69F81BCB
+ - cmd: LC_BUILD_VERSION
+ cmdsize: 32
+ platform: 1
+ minos: 983040
+ sdk: 983552
+ ntools: 1
+ Tools:
+ - tool: 3
+ version: 73074435
+ - cmd: LC_SOURCE_VERSION
+ cmdsize: 16
+ version: 0
+ - cmd: LC_LOAD_DYLIB
+ cmdsize: 56
+ dylib:
+ name: 24
+ timestamp: 2
+ current_version: 88539136
+ compatibility_version: 65536
+ Content: '/usr/lib/libSystem.B.dylib'
+ ZeroPadBytes: 6
+ - cmd: LC_FUNCTION_STARTS
+ cmdsize: 16
+ dataoff: 8312
+ datasize: 8
+ - cmd: LC_DATA_IN_CODE
+ cmdsize: 16
+ dataoff: 8320
+ datasize: 0
+ LinkEditData:
+ ExportTrie:
+ TerminalSize: 0
+ NodeOffset: 0
+ Name: ''
+ Flags: 0x0
+ Address: 0x0
+ Other: 0x0
+ ImportName: ''
+ Children:
+ - TerminalSize: 3
+ NodeOffset: 13
+ Name: _sayA
+ Flags: 0x0
+ Address: 0xF80
+ Other: 0x0
+ ImportName: ''
+ NameList:
+ - n_strx: 2
+ n_type: 0xF
+ n_sect: 1
+ n_desc: 0
+ n_value: 3968
+ - n_strx: 8
+ n_type: 0x1
+ n_sect: 0
+ n_desc: 256
+ n_value: 0
+ StringTable:
+ - ' '
+ - _sayA
+ - _printf
+ IndirectSymbols: [ 0x1, 0x1 ]
+ FunctionStarts: [ 0xF80 ]
+ ChainedFixups: [ 0x0, 0x0, 0x0, 0x0, 0x20, 0x0, 0x0, 0x0, 0x48,
+ 0x0, 0x0, 0x0, 0x50, 0x0, 0x0, 0x0, 0x1, 0x0,
+ 0x0, 0x0, 0x1, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0,
+ 0x0, 0x0, 0x0, 0x0, 0x3, 0x0, 0x0, 0x0, 0x0, 0x0,
+ 0x0, 0x0, 0x10, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0,
+ 0x0, 0x18, 0x0, 0x0, 0x0, 0x0, 0x10, 0x6, 0x0,
+ 0x0, 0x10, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0,
+ 0x0, 0x0, 0x0, 0x1, 0x0, 0x0, 0x0, 0x1, 0x2, 0x0,
+ 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x5F, 0x70, 0x72,
+ 0x69, 0x6E, 0x74, 0x66, 0x0, 0x0, 0x0, 0x0, 0x0,
+ 0x0, 0x0, 0x0 ]
+ - !mach-o
+ FileHeader:
+ magic: 0xFEEDFACF
+ cputype: 0x100000C
+ cpusubtype: 0x0
+ filetype: 0x6
+ ncmds: 15
+ sizeofcmds: 976
+ flags: 0x100085
+ reserved: 0x0
+ LoadCommands:
+ - cmd: LC_SEGMENT_64
+ cmdsize: 392
+ segname: __TEXT
+ vmaddr: 0
+ vmsize: 16384
+ fileoff: 0
+ filesize: 16384
+ maxprot: 5
+ initprot: 5
+ nsects: 4
+ flags: 0
+ Sections:
+ - sectname: __text
+ segname: __TEXT
+ addr: 0x3F70
+ size: 28
+ offset: 0x3F70
+ align: 2
+ reloff: 0x0
+ nreloc: 0
+ flags: 0x80000400
+ reserved1: 0x0
+ reserved2: 0x0
+ reserved3: 0x0
+ content: FD7BBFA9FD0300910000009000603E9103000094FD7BC1A8C0035FD6
+ - sectname: __stubs
+ segname: __TEXT
+ addr: 0x3F8C
+ size: 12
+ offset: 0x3F8C
+ align: 2
+ reloff: 0x0
+ nreloc: 0
+ flags: 0x80000408
+ reserved1: 0x0
+ reserved2: 0xC
+ reserved3: 0x0
+ content: 100000B0100240F900021FD6
+ - sectname: __cstring
+ segname: __TEXT
+ addr: 0x3F98
+ size: 14
+ offset: 0x3F98
+ align: 0
+ reloff: 0x0
+ nreloc: 0
+ flags: 0x2
+ reserved1: 0x0
+ reserved2: 0x0
+ reserved3: 0x0
+ content: 48656C6C6F2066726F6D20410A00
+ - sectname: __unwind_info
+ segname: __TEXT
+ addr: 0x3FA8
+ size: 88
+ offset: 0x3FA8
+ align: 2
+ reloff: 0x0
+ nreloc: 0
+ flags: 0x0
+ reserved1: 0x0
+ reserved2: 0x0
+ reserved3: 0x0
+ content: 010000001C000000000000001C000000000000001C00000002000000703F000040000000400000008C3F00000000000040000000000000000000000000000000030000000C00010010000100000000000000000400000000
+ - cmd: LC_SEGMENT_64
+ cmdsize: 152
+ segname: __DATA_CONST
+ vmaddr: 16384
+ vmsize: 16384
+ fileoff: 16384
+ filesize: 16384
+ maxprot: 3
+ initprot: 3
+ nsects: 1
+ flags: 16
+ Sections:
+ - sectname: __got
+ segname: __DATA_CONST
+ addr: 0x4000
+ size: 8
+ offset: 0x4000
+ align: 3
+ reloff: 0x0
+ nreloc: 0
+ flags: 0x6
+ reserved1: 0x1
+ reserved2: 0x0
+ reserved3: 0x0
+ content: '0000000000000080'
+ - cmd: LC_SEGMENT_64
+ cmdsize: 72
+ segname: __LINKEDIT
+ vmaddr: 32768
+ vmsize: 16384
+ fileoff: 32768
+ filesize: 608
+ maxprot: 1
+ initprot: 1
+ nsects: 0
+ flags: 0
+ - cmd: LC_ID_DYLIB
+ cmdsize: 48
+ dylib:
+ name: 24
+ timestamp: 1
+ current_version: 0
+ compatibility_version: 0
+ Content: '@rpath/libA.dylib'
+ ZeroPadBytes: 7
+ - cmd: LC_DYLD_CHAINED_FIXUPS
+ cmdsize: 16
+ dataoff: 32768
+ datasize: 96
+ - cmd: LC_DYLD_EXPORTS_TRIE
+ cmdsize: 16
+ dataoff: 32864
+ datasize: 24
+ - cmd: LC_SYMTAB
+ cmdsize: 24
+ symoff: 32896
+ nsyms: 2
+ stroff: 32936
+ strsize: 16
+ - cmd: LC_DYSYMTAB
+ cmdsize: 80
+ ilocalsym: 0
+ nlocalsym: 0
+ iextdefsym: 0
+ nextdefsym: 1
+ iundefsym: 1
+ nundefsym: 1
+ tocoff: 0
+ ntoc: 0
+ modtaboff: 0
+ nmodtab: 0
+ extrefsymoff: 0
+ nextrefsyms: 0
+ indirectsymoff: 32928
+ nindirectsyms: 2
+ extreloff: 0
+ nextrel: 0
+ locreloff: 0
+ nlocrel: 0
+ - cmd: LC_UUID
+ cmdsize: 24
+ uuid: C45227E0-C6C0-3137-969B-36AABF9D5487
+ - cmd: LC_BUILD_VERSION
+ cmdsize: 32
+ platform: 1
+ minos: 983040
+ sdk: 983552
+ ntools: 1
+ Tools:
+ - tool: 3
+ version: 73074435
+ - cmd: LC_SOURCE_VERSION
+ cmdsize: 16
+ version: 0
+ - cmd: LC_LOAD_DYLIB
+ cmdsize: 56
+ dylib:
+ name: 24
+ timestamp: 2
+ current_version: 88539136
+ compatibility_version: 65536
+ Content: '/usr/lib/libSystem.B.dylib'
+ ZeroPadBytes: 6
+ - cmd: LC_FUNCTION_STARTS
+ cmdsize: 16
+ dataoff: 32888
+ datasize: 8
+ - cmd: LC_DATA_IN_CODE
+ cmdsize: 16
+ dataoff: 32896
+ datasize: 0
+ - cmd: LC_CODE_SIGNATURE
+ cmdsize: 16
+ dataoff: 32960
+ datasize: 416
+ LinkEditData:
+ ExportTrie:
+ TerminalSize: 0
+ NodeOffset: 0
+ Name: ''
+ Flags: 0x0
+ Address: 0x0
+ Other: 0x0
+ ImportName: ''
+ Children:
+ - TerminalSize: 3
+ NodeOffset: 13
+ Name: _sayA
+ Flags: 0x0
+ Address: 0x3F70
+ Other: 0x0
+ ImportName: ''
+ NameList:
+ - n_strx: 2
+ n_type: 0xF
+ n_sect: 1
+ n_desc: 0
+ n_value: 16240
+ - n_strx: 8
+ n_type: 0x1
+ n_sect: 0
+ n_desc: 256
+ n_value: 0
+ StringTable:
+ - ' '
+ - _sayA
+ - _printf
+ IndirectSymbols: [ 0x1, 0x1 ]
+ FunctionStarts: [ 0x3F70 ]
+ ChainedFixups: [ 0x0, 0x0, 0x0, 0x0, 0x20, 0x0, 0x0, 0x0, 0x48,
+ 0x0, 0x0, 0x0, 0x50, 0x0, 0x0, 0x0, 0x1, 0x0,
+ 0x0, 0x0, 0x1, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0,
+ 0x0, 0x0, 0x0, 0x0, 0x3, 0x0, 0x0, 0x0, 0x0, 0x0,
+ 0x0, 0x0, 0x10, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0,
+ 0x0, 0x18, 0x0, 0x0, 0x0, 0x0, 0x40, 0x6, 0x0,
+ 0x0, 0x40, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0,
+ 0x0, 0x0, 0x0, 0x1, 0x0, 0x0, 0x0, 0x1, 0x2, 0x0,
+ 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x5F, 0x70, 0x72,
+ 0x69, 0x6E, 0x74, 0x66, 0x0, 0x0, 0x0, 0x0, 0x0,
+ 0x0, 0x0, 0x0 ]
+ - !mach-o
+ FileHeader:
+ magic: 0xFEEDFACF
+ cputype: 0x100000C
+ cpusubtype: 0x80000002
+ filetype: 0x6
+ ncmds: 15
+ sizeofcmds: 976
+ flags: 0x100085
+ reserved: 0x0
+ LoadCommands:
+ - cmd: LC_SEGMENT_64
+ cmdsize: 392
+ segname: __TEXT
+ vmaddr: 0
+ vmsize: 16384
+ fileoff: 0
+ filesize: 16384
+ maxprot: 5
+ initprot: 5
+ nsects: 4
+ flags: 0
+ Sections:
+ - sectname: __text
+ segname: __TEXT
+ addr: 0x3F68
+ size: 32
+ offset: 0x3F68
+ align: 2
+ reloff: 0x0
+ nreloc: 0
+ flags: 0x80000400
+ reserved1: 0x0
+ reserved2: 0x0
+ reserved3: 0x0
+ content: 7F2303D5FD7BBFA9FD0300910000009000603E9103000094FD7BC1A8FF0F5FD6
+ - sectname: __auth_stubs
+ segname: __TEXT
+ addr: 0x3F88
+ size: 16
+ offset: 0x3F88
+ align: 2
+ reloff: 0x0
+ nreloc: 0
+ flags: 0x80000408
+ reserved1: 0x0
+ reserved2: 0x10
+ reserved3: 0x0
+ content: 110000B031020091300240F9110A1FD7
+ - sectname: __cstring
+ segname: __TEXT
+ addr: 0x3F98
+ size: 14
+ offset: 0x3F98
+ align: 0
+ reloff: 0x0
+ nreloc: 0
+ flags: 0x2
+ reserved1: 0x0
+ reserved2: 0x0
+ reserved3: 0x0
+ content: 48656C6C6F2066726F6D20410A00
+ - sectname: __unwind_info
+ segname: __TEXT
+ addr: 0x3FA8
+ size: 88
+ offset: 0x3FA8
+ align: 2
+ reloff: 0x0
+ nreloc: 0
+ flags: 0x0
+ reserved1: 0x0
+ reserved2: 0x0
+ reserved3: 0x0
+ content: 010000001C000000000000001C000000000000001C00000002000000683F00004000000040000000883F00000000000040000000000000000000000000000000030000000C00010010000100000000000000000400000000
+ - cmd: LC_SEGMENT_64
+ cmdsize: 152
+ segname: __DATA_CONST
+ vmaddr: 16384
+ vmsize: 16384
+ fileoff: 16384
+ filesize: 16384
+ maxprot: 3
+ initprot: 3
+ nsects: 1
+ flags: 16
+ Sections:
+ - sectname: __auth_got
+ segname: __DATA_CONST
+ addr: 0x4000
+ size: 8
+ offset: 0x4000
+ align: 3
+ reloff: 0x0
+ nreloc: 0
+ flags: 0x6
+ reserved1: 0x1
+ reserved2: 0x0
+ reserved3: 0x0
+ content: 00000000000001C0
+ - cmd: LC_SEGMENT_64
+ cmdsize: 72
+ segname: __LINKEDIT
+ vmaddr: 32768
+ vmsize: 16384
+ fileoff: 32768
+ filesize: 608
+ maxprot: 1
+ initprot: 1
+ nsects: 0
+ flags: 0
+ - cmd: LC_ID_DYLIB
+ cmdsize: 48
+ dylib:
+ name: 24
+ timestamp: 1
+ current_version: 0
+ compatibility_version: 0
+ Content: '@rpath/libA.dylib'
+ ZeroPadBytes: 7
+ - cmd: LC_DYLD_CHAINED_FIXUPS
+ cmdsize: 16
+ dataoff: 32768
+ datasize: 96
+ - cmd: LC_DYLD_EXPORTS_TRIE
+ cmdsize: 16
+ dataoff: 32864
+ datasize: 24
+ - cmd: LC_SYMTAB
+ cmdsize: 24
+ symoff: 32896
+ nsyms: 2
+ stroff: 32936
+ strsize: 16
+ - cmd: LC_DYSYMTAB
+ cmdsize: 80
+ ilocalsym: 0
+ nlocalsym: 0
+ iextdefsym: 0
+ nextdefsym: 1
+ iundefsym: 1
+ nundefsym: 1
+ tocoff: 0
+ ntoc: 0
+ modtaboff: 0
+ nmodtab: 0
+ extrefsymoff: 0
+ nextrefsyms: 0
+ indirectsymoff: 32928
+ nindirectsyms: 2
+ extreloff: 0
+ nextrel: 0
+ locreloff: 0
+ nlocrel: 0
+ - cmd: LC_UUID
+ cmdsize: 24
+ uuid: C9DC00C2-E721-365C-9C2D-E9FDB7C838BB
+ - cmd: LC_BUILD_VERSION
+ cmdsize: 32
+ platform: 1
+ minos: 983040
+ sdk: 983552
+ ntools: 1
+ Tools:
+ - tool: 3
+ version: 73074435
+ - cmd: LC_SOURCE_VERSION
+ cmdsize: 16
+ version: 0
+ - cmd: LC_LOAD_DYLIB
+ cmdsize: 56
+ dylib:
+ name: 24
+ timestamp: 2
+ current_version: 88539136
+ compatibility_version: 65536
+ Content: '/usr/lib/libSystem.B.dylib'
+ ZeroPadBytes: 6
+ - cmd: LC_FUNCTION_STARTS
+ cmdsize: 16
+ dataoff: 32888
+ datasize: 8
+ - cmd: LC_DATA_IN_CODE
+ cmdsize: 16
+ dataoff: 32896
+ datasize: 0
+ - cmd: LC_CODE_SIGNATURE
+ cmdsize: 16
+ dataoff: 32960
+ datasize: 416
+ LinkEditData:
+ ExportTrie:
+ TerminalSize: 0
+ NodeOffset: 0
+ Name: ''
+ Flags: 0x0
+ Address: 0x0
+ Other: 0x0
+ ImportName: ''
+ Children:
+ - TerminalSize: 3
+ NodeOffset: 13
+ Name: _sayA
+ Flags: 0x0
+ Address: 0x3F68
+ Other: 0x0
+ ImportName: ''
+ NameList:
+ - n_strx: 2
+ n_type: 0xF
+ n_sect: 1
+ n_desc: 0
+ n_value: 16232
+ - n_strx: 8
+ n_type: 0x1
+ n_sect: 0
+ n_desc: 256
+ n_value: 0
+ StringTable:
+ - ' '
+ - _sayA
+ - _printf
+ IndirectSymbols: [ 0x1, 0x1 ]
+ FunctionStarts: [ 0x3F68 ]
+ ChainedFixups: [ 0x0, 0x0, 0x0, 0x0, 0x20, 0x0, 0x0, 0x0, 0x48,
+ 0x0, 0x0, 0x0, 0x50, 0x0, 0x0, 0x0, 0x1, 0x0,
+ 0x0, 0x0, 0x1, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0,
+ 0x0, 0x0, 0x0, 0x0, 0x3, 0x0, 0x0, 0x0, 0x0, 0x0,
+ 0x0, 0x0, 0x10, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0,
+ 0x0, 0x18, 0x0, 0x0, 0x0, 0x0, 0x40, 0xC, 0x0,
+ 0x0, 0x40, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0,
+ 0x0, 0x0, 0x0, 0x1, 0x0, 0x0, 0x0, 0x1, 0x2, 0x0,
+ 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x5F, 0x70, 0x72,
+ 0x69, 0x6E, 0x74, 0x66, 0x0, 0x0, 0x0, 0x0, 0x0,
+ 0x0, 0x0, 0x0 ]
+...
diff --git a/llvm/unittests/ExecutionEngine/Orc/Inputs/B/B_linux.yaml b/llvm/unittests/ExecutionEngine/Orc/Inputs/B/B_linux.yaml
new file mode 100644
index 0000000..fe4393e
--- /dev/null
+++ b/llvm/unittests/ExecutionEngine/Orc/Inputs/B/B_linux.yaml
@@ -0,0 +1,460 @@
+--- !ELF
+FileHeader:
+ Class: ELFCLASS64
+ Data: ELFDATA2LSB
+ Type: ET_DYN
+ Machine: EM_X86_64
+ProgramHeaders:
+ - Type: PT_LOAD
+ Flags: [ PF_R ]
+ FirstSec: .note.gnu.property
+ LastSec: .rela.plt
+ Align: 0x1000
+ Offset: 0x0
+ - Type: PT_LOAD
+ Flags: [ PF_X, PF_R ]
+ FirstSec: .init
+ LastSec: .fini
+ VAddr: 0x1000
+ Align: 0x1000
+ Offset: 0x1000
+ - Type: PT_LOAD
+ Flags: [ PF_R ]
+ FirstSec: .rodata
+ LastSec: .eh_frame
+ VAddr: 0x2000
+ Align: 0x1000
+ Offset: 0x2000
+ - Type: PT_LOAD
+ Flags: [ PF_W, PF_R ]
+ FirstSec: .init_array
+ LastSec: .bss
+ VAddr: 0x3E10
+ Align: 0x1000
+ Offset: 0x2E10
+ - Type: PT_DYNAMIC
+ Flags: [ PF_W, PF_R ]
+ FirstSec: .dynamic
+ LastSec: .dynamic
+ VAddr: 0x3E20
+ Align: 0x8
+ Offset: 0x2E20
+ - Type: PT_NOTE
+ Flags: [ PF_R ]
+ FirstSec: .note.gnu.property
+ LastSec: .note.gnu.property
+ VAddr: 0x2A8
+ Align: 0x8
+ Offset: 0x2A8
+ - Type: PT_NOTE
+ Flags: [ PF_R ]
+ FirstSec: .note.gnu.build-id
+ LastSec: .note.gnu.build-id
+ VAddr: 0x2C8
+ Align: 0x4
+ Offset: 0x2C8
+ - Type: PT_GNU_PROPERTY
+ Flags: [ PF_R ]
+ FirstSec: .note.gnu.property
+ LastSec: .note.gnu.property
+ VAddr: 0x2A8
+ Align: 0x8
+ Offset: 0x2A8
+ - Type: PT_GNU_EH_FRAME
+ Flags: [ PF_R ]
+ FirstSec: .eh_frame_hdr
+ LastSec: .eh_frame_hdr
+ VAddr: 0x2010
+ Align: 0x4
+ Offset: 0x2010
+ - Type: PT_GNU_STACK
+ Flags: [ PF_W, PF_R ]
+ Align: 0x10
+ Offset: 0x0
+ - Type: PT_GNU_RELRO
+ Flags: [ PF_R ]
+ FirstSec: .init_array
+ LastSec: .got
+ VAddr: 0x3E10
+ Offset: 0x2E10
+Sections:
+ - Name: .note.gnu.property
+ Type: SHT_NOTE
+ Flags: [ SHF_ALLOC ]
+ Address: 0x2A8
+ AddressAlign: 0x8
+ Notes:
+ - Name: GNU
+ Desc: 020000C0040000000300000000000000
+ Type: NT_GNU_PROPERTY_TYPE_0
+ - Name: .note.gnu.build-id
+ Type: SHT_NOTE
+ Flags: [ SHF_ALLOC ]
+ Address: 0x2C8
+ AddressAlign: 0x4
+ Notes:
+ - Name: GNU
+ Desc: 6337F7C1BF21A1DE17630C55602EB4CAC50435BB
+ Type: NT_PRPSINFO
+ - Name: .gnu.hash
+ Type: SHT_GNU_HASH
+ Flags: [ SHF_ALLOC ]
+ Address: 0x2F0
+ Link: .dynsym
+ AddressAlign: 0x8
+ Header:
+ SymNdx: 0x6
+ Shift2: 0x6
+ BloomFilter: [ 0x400000100000 ]
+ HashBuckets: [ 0x6, 0x0 ]
+ HashValues: [ 0x7C9DCB95 ]
+ - Name: .dynsym
+ Type: SHT_DYNSYM
+ Flags: [ SHF_ALLOC ]
+ Address: 0x318
+ Link: .dynstr
+ AddressAlign: 0x8
+ - Name: .dynstr
+ Type: SHT_STRTAB
+ Flags: [ SHF_ALLOC ]
+ Address: 0x3C0
+ AddressAlign: 0x1
+ - Name: .gnu.version
+ Type: SHT_GNU_versym
+ Flags: [ SHF_ALLOC ]
+ Address: 0x436
+ Link: .dynsym
+ AddressAlign: 0x2
+ Entries: [ 0, 1, 2, 1, 1, 2, 1 ]
+ - Name: .gnu.version_r
+ Type: SHT_GNU_verneed
+ Flags: [ SHF_ALLOC ]
+ Address: 0x448
+ Link: .dynstr
+ AddressAlign: 0x8
+ Dependencies:
+ - Version: 1
+ File: libc.so.6
+ Entries:
+ - Name: GLIBC_2.2.5
+ Hash: 157882997
+ Flags: 0
+ Other: 2
+ - Name: .rela.dyn
+ Type: SHT_RELA
+ Flags: [ SHF_ALLOC ]
+ Address: 0x468
+ Link: .dynsym
+ AddressAlign: 0x8
+ Relocations:
+ - Offset: 0x3E10
+ Type: R_X86_64_RELATIVE
+ Addend: 4368
+ - Offset: 0x3E18
+ Type: R_X86_64_RELATIVE
+ Addend: 4304
+ - Offset: 0x4020
+ Type: R_X86_64_RELATIVE
+ Addend: 16416
+ - Offset: 0x3FE0
+ Symbol: _ITM_deregisterTMCloneTable
+ Type: R_X86_64_GLOB_DAT
+ - Offset: 0x3FE8
+ Symbol: __gmon_start__
+ Type: R_X86_64_GLOB_DAT
+ - Offset: 0x3FF0
+ Symbol: _ITM_registerTMCloneTable
+ Type: R_X86_64_GLOB_DAT
+ - Offset: 0x3FF8
+ Symbol: __cxa_finalize
+ Type: R_X86_64_GLOB_DAT
+ - Name: .rela.plt
+ Type: SHT_RELA
+ Flags: [ SHF_ALLOC, SHF_INFO_LINK ]
+ Address: 0x510
+ Link: .dynsym
+ AddressAlign: 0x8
+ Info: .got.plt
+ Relocations:
+ - Offset: 0x4018
+ Symbol: puts
+ Type: R_X86_64_JUMP_SLOT
+ - Name: .init
+ Type: SHT_PROGBITS
+ Flags: [ SHF_ALLOC, SHF_EXECINSTR ]
+ Address: 0x1000
+ AddressAlign: 0x4
+ Offset: 0x1000
+ Content: F30F1EFA4883EC08488B05D92F00004885C07402FFD04883C408C3
+ - Name: .plt
+ Type: SHT_PROGBITS
+ Flags: [ SHF_ALLOC, SHF_EXECINSTR ]
+ Address: 0x1020
+ AddressAlign: 0x10
+ EntSize: 0x10
+ Content: FF35E22F0000F2FF25E32F00000F1F00F30F1EFA6800000000F2E9E1FFFFFF90
+ - Name: .plt.got
+ Type: SHT_PROGBITS
+ Flags: [ SHF_ALLOC, SHF_EXECINSTR ]
+ Address: 0x1040
+ AddressAlign: 0x10
+ EntSize: 0x10
+ Content: F30F1EFAF2FF25AD2F00000F1F440000
+ - Name: .plt.sec
+ Type: SHT_PROGBITS
+ Flags: [ SHF_ALLOC, SHF_EXECINSTR ]
+ Address: 0x1050
+ AddressAlign: 0x10
+ EntSize: 0x10
+ Content: F30F1EFAF2FF25BD2F00000F1F440000
+ - Name: .text
+ Type: SHT_PROGBITS
+ Flags: [ SHF_ALLOC, SHF_EXECINSTR ]
+ Address: 0x1060
+ AddressAlign: 0x10
+ Content: 488D3DC12F0000488D05BA2F00004839F87415488B05662F00004885C07409FFE00F1F8000000000C30F1F8000000000488D3D912F0000488D358A2F00004829FE4889F048C1EE3F48C1F8034801C648D1FE7414488B05352F00004885C07408FFE0660F1F440000C30F1F8000000000F30F1EFA803D4D2F000000752B5548833D122F0000004889E5740C488B3D2E2F0000E849FFFFFFE864FFFFFFC605252F0000015DC30F1F00C30F1F8000000000F30F1EFAE977FFFFFFF30F1EFA554889E5488D05D80E00004889C7E820FFFFFF905DC3
+ - Name: .fini
+ Type: SHT_PROGBITS
+ Flags: [ SHF_ALLOC, SHF_EXECINSTR ]
+ Address: 0x1134
+ AddressAlign: 0x4
+ Content: F30F1EFA4883EC084883C408C3
+ - Name: .rodata
+ Type: SHT_PROGBITS
+ Flags: [ SHF_ALLOC ]
+ Address: 0x2000
+ AddressAlign: 0x1
+ Offset: 0x2000
+ Content: 48656C6C6F2066726F6D204200
+ - Name: .eh_frame_hdr
+ Type: SHT_PROGBITS
+ Flags: [ SHF_ALLOC ]
+ Address: 0x2010
+ AddressAlign: 0x4
+ Content: 011B033B2C0000000400000010F0FFFF4800000030F0FFFF7000000040F0FFFF8800000009F1FFFFA0000000
+ - Name: .eh_frame
+ Type: SHT_PROGBITS
+ Flags: [ SHF_ALLOC ]
+ Address: 0x2040
+ AddressAlign: 0x8
+ Content: 1400000000000000017A5200017810011B0C070890010000240000001C000000C0EFFFFF20000000000E10460E184A0F0B770880003F1A3A2A332422000000001400000044000000B8EFFFFF100000000000000000000000140000005C000000B0EFFFFF1000000000000000000000001C0000007400000061F0FFFF1A00000000450E108602430D06510C070800000000000000
+ - Name: .init_array
+ Type: SHT_INIT_ARRAY
+ Flags: [ SHF_WRITE, SHF_ALLOC ]
+ Address: 0x3E10
+ AddressAlign: 0x8
+ EntSize: 0x8
+ Offset: 0x2E10
+ Content: '1011000000000000'
+ - Name: .fini_array
+ Type: SHT_FINI_ARRAY
+ Flags: [ SHF_WRITE, SHF_ALLOC ]
+ Address: 0x3E18
+ AddressAlign: 0x8
+ EntSize: 0x8
+ Content: D010000000000000
+ - Name: .dynamic
+ Type: SHT_DYNAMIC
+ Flags: [ SHF_WRITE, SHF_ALLOC ]
+ Address: 0x3E20
+ Link: .dynstr
+ AddressAlign: 0x8
+ Entries:
+ - Tag: DT_NEEDED
+ Value: 0x5F
+ - Tag: DT_INIT
+ Value: 0x1000
+ - Tag: DT_FINI
+ Value: 0x1134
+ - Tag: DT_INIT_ARRAY
+ Value: 0x3E10
+ - Tag: DT_INIT_ARRAYSZ
+ Value: 0x8
+ - Tag: DT_FINI_ARRAY
+ Value: 0x3E18
+ - Tag: DT_FINI_ARRAYSZ
+ Value: 0x8
+ - Tag: DT_GNU_HASH
+ Value: 0x2F0
+ - Tag: DT_STRTAB
+ Value: 0x3C0
+ - Tag: DT_SYMTAB
+ Value: 0x318
+ - Tag: DT_STRSZ
+ Value: 0x75
+ - Tag: DT_SYMENT
+ Value: 0x18
+ - Tag: DT_PLTGOT
+ Value: 0x4000
+ - Tag: DT_PLTRELSZ
+ Value: 0x18
+ - Tag: DT_PLTREL
+ Value: 0x7
+ - Tag: DT_JMPREL
+ Value: 0x510
+ - Tag: DT_RELA
+ Value: 0x468
+ - Tag: DT_RELASZ
+ Value: 0xA8
+ - Tag: DT_RELAENT
+ Value: 0x18
+ - Tag: DT_VERNEED
+ Value: 0x448
+ - Tag: DT_VERNEEDNUM
+ Value: 0x1
+ - Tag: DT_VERSYM
+ Value: 0x436
+ - Tag: DT_RELACOUNT
+ Value: 0x3
+ - Tag: DT_NULL
+ Value: 0x0
+ - Tag: DT_NULL
+ Value: 0x0
+ - Tag: DT_NULL
+ Value: 0x0
+ - Tag: DT_NULL
+ Value: 0x0
+ - Tag: DT_NULL
+ Value: 0x0
+ - Name: .got
+ Type: SHT_PROGBITS
+ Flags: [ SHF_WRITE, SHF_ALLOC ]
+ Address: 0x3FE0
+ AddressAlign: 0x8
+ EntSize: 0x8
+ Content: '0000000000000000000000000000000000000000000000000000000000000000'
+ - Name: .got.plt
+ Type: SHT_PROGBITS
+ Flags: [ SHF_WRITE, SHF_ALLOC ]
+ Address: 0x4000
+ AddressAlign: 0x8
+ EntSize: 0x8
+ Content: '203E000000000000000000000000000000000000000000003010000000000000'
+ - Name: .data
+ Type: SHT_PROGBITS
+ Flags: [ SHF_WRITE, SHF_ALLOC ]
+ Address: 0x4020
+ AddressAlign: 0x8
+ Content: '2040000000000000'
+ - Name: .bss
+ Type: SHT_NOBITS
+ Flags: [ SHF_WRITE, SHF_ALLOC ]
+ Address: 0x4028
+ AddressAlign: 0x1
+ Size: 0x8
+ - Name: .comment
+ Type: SHT_PROGBITS
+ Flags: [ SHF_MERGE, SHF_STRINGS ]
+ AddressAlign: 0x1
+ EntSize: 0x1
+ Content: 4743433A20285562756E74752031312E342E302D317562756E7475317E32322E30342E32292031312E342E3000
+Symbols:
+ - Name: crtstuff.c
+ Type: STT_FILE
+ Index: SHN_ABS
+ - Name: deregister_tm_clones
+ Type: STT_FUNC
+ Section: .text
+ Value: 0x1060
+ - Name: register_tm_clones
+ Type: STT_FUNC
+ Section: .text
+ Value: 0x1090
+ - Name: __do_global_dtors_aux
+ Type: STT_FUNC
+ Section: .text
+ Value: 0x10D0
+ - Name: completed.0
+ Type: STT_OBJECT
+ Section: .bss
+ Value: 0x4028
+ Size: 0x1
+ - Name: __do_global_dtors_aux_fini_array_entry
+ Type: STT_OBJECT
+ Section: .fini_array
+ Value: 0x3E18
+ - Name: frame_dummy
+ Type: STT_FUNC
+ Section: .text
+ Value: 0x1110
+ - Name: __frame_dummy_init_array_entry
+ Type: STT_OBJECT
+ Section: .init_array
+ Value: 0x3E10
+ - Name: libB.c
+ Type: STT_FILE
+ Index: SHN_ABS
+ - Name: 'crtstuff.c (1)'
+ Type: STT_FILE
+ Index: SHN_ABS
+ - Name: __FRAME_END__
+ Type: STT_OBJECT
+ Section: .eh_frame
+ Value: 0x20D0
+ - Type: STT_FILE
+ Index: SHN_ABS
+ - Name: _fini
+ Type: STT_FUNC
+ Section: .fini
+ Value: 0x1134
+ - Name: __dso_handle
+ Type: STT_OBJECT
+ Section: .data
+ Value: 0x4020
+ - Name: _DYNAMIC
+ Type: STT_OBJECT
+ Section: .dynamic
+ Value: 0x3E20
+ - Name: __GNU_EH_FRAME_HDR
+ Section: .eh_frame_hdr
+ Value: 0x2010
+ - Name: __TMC_END__
+ Type: STT_OBJECT
+ Section: .data
+ Value: 0x4028
+ - Name: _GLOBAL_OFFSET_TABLE_
+ Type: STT_OBJECT
+ Section: .got.plt
+ Value: 0x4000
+ - Name: _init
+ Type: STT_FUNC
+ Section: .init
+ Value: 0x1000
+ - Name: _ITM_deregisterTMCloneTable
+ Binding: STB_WEAK
+ - Name: 'puts@GLIBC_2.2.5'
+ Type: STT_FUNC
+ Binding: STB_GLOBAL
+ - Name: __gmon_start__
+ Binding: STB_WEAK
+ - Name: sayB
+ Type: STT_FUNC
+ Section: .text
+ Binding: STB_GLOBAL
+ Value: 0x1119
+ Size: 0x1A
+ - Name: _ITM_registerTMCloneTable
+ Binding: STB_WEAK
+ - Name: '__cxa_finalize@GLIBC_2.2.5'
+ Type: STT_FUNC
+ Binding: STB_WEAK
+DynamicSymbols:
+ - Name: _ITM_deregisterTMCloneTable
+ Binding: STB_WEAK
+ - Name: puts
+ Type: STT_FUNC
+ Binding: STB_GLOBAL
+ - Name: __gmon_start__
+ Binding: STB_WEAK
+ - Name: _ITM_registerTMCloneTable
+ Binding: STB_WEAK
+ - Name: __cxa_finalize
+ Type: STT_FUNC
+ Binding: STB_WEAK
+ - Name: sayB
+ Type: STT_FUNC
+ Section: .text
+ Binding: STB_GLOBAL
+ Value: 0x1119
+ Size: 0x1A
+...
diff --git a/llvm/unittests/ExecutionEngine/Orc/Inputs/B/B_macho.yaml b/llvm/unittests/ExecutionEngine/Orc/Inputs/B/B_macho.yaml
new file mode 100644
index 0000000..3d57c4f
--- /dev/null
+++ b/llvm/unittests/ExecutionEngine/Orc/Inputs/B/B_macho.yaml
@@ -0,0 +1,723 @@
+--- !fat-mach-o
+FatHeader:
+ magic: 0xCAFEBABE
+ nfat_arch: 3
+FatArchs:
+ - cputype: 0x1000007
+ cpusubtype: 0x3
+ offset: 0x1000
+ size: 8376
+ align: 12
+ - cputype: 0x100000C
+ cpusubtype: 0x0
+ offset: 0x4000
+ size: 33376
+ align: 14
+ - cputype: 0x100000C
+ cpusubtype: 0x80000002
+ offset: 0x10000
+ size: 33376
+ align: 14
+Slices:
+ - !mach-o
+ FileHeader:
+ magic: 0xFEEDFACF
+ cputype: 0x1000007
+ cpusubtype: 0x3
+ filetype: 0x6
+ ncmds: 14
+ sizeofcmds: 960
+ flags: 0x100085
+ reserved: 0x0
+ LoadCommands:
+ - cmd: LC_SEGMENT_64
+ cmdsize: 392
+ segname: __TEXT
+ vmaddr: 0
+ vmsize: 4096
+ fileoff: 0
+ filesize: 4096
+ maxprot: 5
+ initprot: 5
+ nsects: 4
+ flags: 0
+ Sections:
+ - sectname: __text
+ segname: __TEXT
+ addr: 0xF80
+ size: 20
+ offset: 0xF80
+ align: 4
+ reloff: 0x0
+ nreloc: 0
+ flags: 0x80000400
+ reserved1: 0x0
+ reserved2: 0x0
+ reserved3: 0x0
+ content: 554889E5488D3D0F000000B000E8020000005DC3
+ - sectname: __stubs
+ segname: __TEXT
+ addr: 0xF94
+ size: 6
+ offset: 0xF94
+ align: 1
+ reloff: 0x0
+ nreloc: 0
+ flags: 0x80000408
+ reserved1: 0x0
+ reserved2: 0x6
+ reserved3: 0x0
+ content: FF2566000000
+ - sectname: __cstring
+ segname: __TEXT
+ addr: 0xF9A
+ size: 14
+ offset: 0xF9A
+ align: 0
+ reloff: 0x0
+ nreloc: 0
+ flags: 0x2
+ reserved1: 0x0
+ reserved2: 0x0
+ reserved3: 0x0
+ content: 48656C6C6F2066726F6D20420A00
+ - sectname: __unwind_info
+ segname: __TEXT
+ addr: 0xFA8
+ size: 88
+ offset: 0xFA8
+ align: 2
+ reloff: 0x0
+ nreloc: 0
+ flags: 0x0
+ reserved1: 0x0
+ reserved2: 0x0
+ reserved3: 0x0
+ content: 010000001C000000000000001C000000000000001C00000002000000800F00004000000040000000940F00000000000040000000000000000000000000000000030000000C00010010000100000000000000000100000000
+ - cmd: LC_SEGMENT_64
+ cmdsize: 152
+ segname: __DATA_CONST
+ vmaddr: 4096
+ vmsize: 4096
+ fileoff: 4096
+ filesize: 4096
+ maxprot: 3
+ initprot: 3
+ nsects: 1
+ flags: 16
+ Sections:
+ - sectname: __got
+ segname: __DATA_CONST
+ addr: 0x1000
+ size: 8
+ offset: 0x1000
+ align: 3
+ reloff: 0x0
+ nreloc: 0
+ flags: 0x6
+ reserved1: 0x1
+ reserved2: 0x0
+ reserved3: 0x0
+ content: '0000000000000080'
+ - cmd: LC_SEGMENT_64
+ cmdsize: 72
+ segname: __LINKEDIT
+ vmaddr: 8192
+ vmsize: 4096
+ fileoff: 8192
+ filesize: 184
+ maxprot: 1
+ initprot: 1
+ nsects: 0
+ flags: 0
+ - cmd: LC_ID_DYLIB
+ cmdsize: 48
+ dylib:
+ name: 24
+ timestamp: 1
+ current_version: 0
+ compatibility_version: 0
+ Content: '@rpath/libB.dylib'
+ ZeroPadBytes: 7
+ - cmd: LC_DYLD_CHAINED_FIXUPS
+ cmdsize: 16
+ dataoff: 8192
+ datasize: 96
+ - cmd: LC_DYLD_EXPORTS_TRIE
+ cmdsize: 16
+ dataoff: 8288
+ datasize: 24
+ - cmd: LC_SYMTAB
+ cmdsize: 24
+ symoff: 8320
+ nsyms: 2
+ stroff: 8360
+ strsize: 16
+ - cmd: LC_DYSYMTAB
+ cmdsize: 80
+ ilocalsym: 0
+ nlocalsym: 0
+ iextdefsym: 0
+ nextdefsym: 1
+ iundefsym: 1
+ nundefsym: 1
+ tocoff: 0
+ ntoc: 0
+ modtaboff: 0
+ nmodtab: 0
+ extrefsymoff: 0
+ nextrefsyms: 0
+ indirectsymoff: 8352
+ nindirectsyms: 2
+ extreloff: 0
+ nextrel: 0
+ locreloff: 0
+ nlocrel: 0
+ - cmd: LC_UUID
+ cmdsize: 24
+ uuid: 88B60B3C-13D3-3D7E-AEED-5F3E991FDF08
+ - cmd: LC_BUILD_VERSION
+ cmdsize: 32
+ platform: 1
+ minos: 983040
+ sdk: 983552
+ ntools: 1
+ Tools:
+ - tool: 3
+ version: 73074435
+ - cmd: LC_SOURCE_VERSION
+ cmdsize: 16
+ version: 0
+ - cmd: LC_LOAD_DYLIB
+ cmdsize: 56
+ dylib:
+ name: 24
+ timestamp: 2
+ current_version: 88539136
+ compatibility_version: 65536
+ Content: '/usr/lib/libSystem.B.dylib'
+ ZeroPadBytes: 6
+ - cmd: LC_FUNCTION_STARTS
+ cmdsize: 16
+ dataoff: 8312
+ datasize: 8
+ - cmd: LC_DATA_IN_CODE
+ cmdsize: 16
+ dataoff: 8320
+ datasize: 0
+ LinkEditData:
+ ExportTrie:
+ TerminalSize: 0
+ NodeOffset: 0
+ Name: ''
+ Flags: 0x0
+ Address: 0x0
+ Other: 0x0
+ ImportName: ''
+ Children:
+ - TerminalSize: 3
+ NodeOffset: 13
+ Name: _sayB
+ Flags: 0x0
+ Address: 0xF80
+ Other: 0x0
+ ImportName: ''
+ NameList:
+ - n_strx: 2
+ n_type: 0xF
+ n_sect: 1
+ n_desc: 0
+ n_value: 3968
+ - n_strx: 8
+ n_type: 0x1
+ n_sect: 0
+ n_desc: 256
+ n_value: 0
+ StringTable:
+ - ' '
+ - _sayB
+ - _printf
+ IndirectSymbols: [ 0x1, 0x1 ]
+ FunctionStarts: [ 0xF80 ]
+ ChainedFixups: [ 0x0, 0x0, 0x0, 0x0, 0x20, 0x0, 0x0, 0x0, 0x48,
+ 0x0, 0x0, 0x0, 0x50, 0x0, 0x0, 0x0, 0x1, 0x0,
+ 0x0, 0x0, 0x1, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0,
+ 0x0, 0x0, 0x0, 0x0, 0x3, 0x0, 0x0, 0x0, 0x0, 0x0,
+ 0x0, 0x0, 0x10, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0,
+ 0x0, 0x18, 0x0, 0x0, 0x0, 0x0, 0x10, 0x6, 0x0,
+ 0x0, 0x10, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0,
+ 0x0, 0x0, 0x0, 0x1, 0x0, 0x0, 0x0, 0x1, 0x2, 0x0,
+ 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x5F, 0x70, 0x72,
+ 0x69, 0x6E, 0x74, 0x66, 0x0, 0x0, 0x0, 0x0, 0x0,
+ 0x0, 0x0, 0x0 ]
+ - !mach-o
+ FileHeader:
+ magic: 0xFEEDFACF
+ cputype: 0x100000C
+ cpusubtype: 0x0
+ filetype: 0x6
+ ncmds: 15
+ sizeofcmds: 976
+ flags: 0x100085
+ reserved: 0x0
+ LoadCommands:
+ - cmd: LC_SEGMENT_64
+ cmdsize: 392
+ segname: __TEXT
+ vmaddr: 0
+ vmsize: 16384
+ fileoff: 0
+ filesize: 16384
+ maxprot: 5
+ initprot: 5
+ nsects: 4
+ flags: 0
+ Sections:
+ - sectname: __text
+ segname: __TEXT
+ addr: 0x3F70
+ size: 28
+ offset: 0x3F70
+ align: 2
+ reloff: 0x0
+ nreloc: 0
+ flags: 0x80000400
+ reserved1: 0x0
+ reserved2: 0x0
+ reserved3: 0x0
+ content: FD7BBFA9FD0300910000009000603E9103000094FD7BC1A8C0035FD6
+ - sectname: __stubs
+ segname: __TEXT
+ addr: 0x3F8C
+ size: 12
+ offset: 0x3F8C
+ align: 2
+ reloff: 0x0
+ nreloc: 0
+ flags: 0x80000408
+ reserved1: 0x0
+ reserved2: 0xC
+ reserved3: 0x0
+ content: 100000B0100240F900021FD6
+ - sectname: __cstring
+ segname: __TEXT
+ addr: 0x3F98
+ size: 14
+ offset: 0x3F98
+ align: 0
+ reloff: 0x0
+ nreloc: 0
+ flags: 0x2
+ reserved1: 0x0
+ reserved2: 0x0
+ reserved3: 0x0
+ content: 48656C6C6F2066726F6D20420A00
+ - sectname: __unwind_info
+ segname: __TEXT
+ addr: 0x3FA8
+ size: 88
+ offset: 0x3FA8
+ align: 2
+ reloff: 0x0
+ nreloc: 0
+ flags: 0x0
+ reserved1: 0x0
+ reserved2: 0x0
+ reserved3: 0x0
+ content: 010000001C000000000000001C000000000000001C00000002000000703F000040000000400000008C3F00000000000040000000000000000000000000000000030000000C00010010000100000000000000000400000000
+ - cmd: LC_SEGMENT_64
+ cmdsize: 152
+ segname: __DATA_CONST
+ vmaddr: 16384
+ vmsize: 16384
+ fileoff: 16384
+ filesize: 16384
+ maxprot: 3
+ initprot: 3
+ nsects: 1
+ flags: 16
+ Sections:
+ - sectname: __got
+ segname: __DATA_CONST
+ addr: 0x4000
+ size: 8
+ offset: 0x4000
+ align: 3
+ reloff: 0x0
+ nreloc: 0
+ flags: 0x6
+ reserved1: 0x1
+ reserved2: 0x0
+ reserved3: 0x0
+ content: '0000000000000080'
+ - cmd: LC_SEGMENT_64
+ cmdsize: 72
+ segname: __LINKEDIT
+ vmaddr: 32768
+ vmsize: 16384
+ fileoff: 32768
+ filesize: 608
+ maxprot: 1
+ initprot: 1
+ nsects: 0
+ flags: 0
+ - cmd: LC_ID_DYLIB
+ cmdsize: 48
+ dylib:
+ name: 24
+ timestamp: 1
+ current_version: 0
+ compatibility_version: 0
+ Content: '@rpath/libB.dylib'
+ ZeroPadBytes: 7
+ - cmd: LC_DYLD_CHAINED_FIXUPS
+ cmdsize: 16
+ dataoff: 32768
+ datasize: 96
+ - cmd: LC_DYLD_EXPORTS_TRIE
+ cmdsize: 16
+ dataoff: 32864
+ datasize: 24
+ - cmd: LC_SYMTAB
+ cmdsize: 24
+ symoff: 32896
+ nsyms: 2
+ stroff: 32936
+ strsize: 16
+ - cmd: LC_DYSYMTAB
+ cmdsize: 80
+ ilocalsym: 0
+ nlocalsym: 0
+ iextdefsym: 0
+ nextdefsym: 1
+ iundefsym: 1
+ nundefsym: 1
+ tocoff: 0
+ ntoc: 0
+ modtaboff: 0
+ nmodtab: 0
+ extrefsymoff: 0
+ nextrefsyms: 0
+ indirectsymoff: 32928
+ nindirectsyms: 2
+ extreloff: 0
+ nextrel: 0
+ locreloff: 0
+ nlocrel: 0
+ - cmd: LC_UUID
+ cmdsize: 24
+ uuid: 90C3787A-22E1-35AE-9284-97A4842F88AF
+ - cmd: LC_BUILD_VERSION
+ cmdsize: 32
+ platform: 1
+ minos: 983040
+ sdk: 983552
+ ntools: 1
+ Tools:
+ - tool: 3
+ version: 73074435
+ - cmd: LC_SOURCE_VERSION
+ cmdsize: 16
+ version: 0
+ - cmd: LC_LOAD_DYLIB
+ cmdsize: 56
+ dylib:
+ name: 24
+ timestamp: 2
+ current_version: 88539136
+ compatibility_version: 65536
+ Content: '/usr/lib/libSystem.B.dylib'
+ ZeroPadBytes: 6
+ - cmd: LC_FUNCTION_STARTS
+ cmdsize: 16
+ dataoff: 32888
+ datasize: 8
+ - cmd: LC_DATA_IN_CODE
+ cmdsize: 16
+ dataoff: 32896
+ datasize: 0
+ - cmd: LC_CODE_SIGNATURE
+ cmdsize: 16
+ dataoff: 32960
+ datasize: 416
+ LinkEditData:
+ ExportTrie:
+ TerminalSize: 0
+ NodeOffset: 0
+ Name: ''
+ Flags: 0x0
+ Address: 0x0
+ Other: 0x0
+ ImportName: ''
+ Children:
+ - TerminalSize: 3
+ NodeOffset: 13
+ Name: _sayB
+ Flags: 0x0
+ Address: 0x3F70
+ Other: 0x0
+ ImportName: ''
+ NameList:
+ - n_strx: 2
+ n_type: 0xF
+ n_sect: 1
+ n_desc: 0
+ n_value: 16240
+ - n_strx: 8
+ n_type: 0x1
+ n_sect: 0
+ n_desc: 256
+ n_value: 0
+ StringTable:
+ - ' '
+ - _sayB
+ - _printf
+ IndirectSymbols: [ 0x1, 0x1 ]
+ FunctionStarts: [ 0x3F70 ]
+ ChainedFixups: [ 0x0, 0x0, 0x0, 0x0, 0x20, 0x0, 0x0, 0x0, 0x48,
+ 0x0, 0x0, 0x0, 0x50, 0x0, 0x0, 0x0, 0x1, 0x0,
+ 0x0, 0x0, 0x1, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0,
+ 0x0, 0x0, 0x0, 0x0, 0x3, 0x0, 0x0, 0x0, 0x0, 0x0,
+ 0x0, 0x0, 0x10, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0,
+ 0x0, 0x18, 0x0, 0x0, 0x0, 0x0, 0x40, 0x6, 0x0,
+ 0x0, 0x40, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0,
+ 0x0, 0x0, 0x0, 0x1, 0x0, 0x0, 0x0, 0x1, 0x2, 0x0,
+ 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x5F, 0x70, 0x72,
+ 0x69, 0x6E, 0x74, 0x66, 0x0, 0x0, 0x0, 0x0, 0x0,
+ 0x0, 0x0, 0x0 ]
+ - !mach-o
+ FileHeader:
+ magic: 0xFEEDFACF
+ cputype: 0x100000C
+ cpusubtype: 0x80000002
+ filetype: 0x6
+ ncmds: 15
+ sizeofcmds: 976
+ flags: 0x100085
+ reserved: 0x0
+ LoadCommands:
+ - cmd: LC_SEGMENT_64
+ cmdsize: 392
+ segname: __TEXT
+ vmaddr: 0
+ vmsize: 16384
+ fileoff: 0
+ filesize: 16384
+ maxprot: 5
+ initprot: 5
+ nsects: 4
+ flags: 0
+ Sections:
+ - sectname: __text
+ segname: __TEXT
+ addr: 0x3F68
+ size: 32
+ offset: 0x3F68
+ align: 2
+ reloff: 0x0
+ nreloc: 0
+ flags: 0x80000400
+ reserved1: 0x0
+ reserved2: 0x0
+ reserved3: 0x0
+ content: 7F2303D5FD7BBFA9FD0300910000009000603E9103000094FD7BC1A8FF0F5FD6
+ - sectname: __auth_stubs
+ segname: __TEXT
+ addr: 0x3F88
+ size: 16
+ offset: 0x3F88
+ align: 2
+ reloff: 0x0
+ nreloc: 0
+ flags: 0x80000408
+ reserved1: 0x0
+ reserved2: 0x10
+ reserved3: 0x0
+ content: 110000B031020091300240F9110A1FD7
+ - sectname: __cstring
+ segname: __TEXT
+ addr: 0x3F98
+ size: 14
+ offset: 0x3F98
+ align: 0
+ reloff: 0x0
+ nreloc: 0
+ flags: 0x2
+ reserved1: 0x0
+ reserved2: 0x0
+ reserved3: 0x0
+ content: 48656C6C6F2066726F6D20420A00
+ - sectname: __unwind_info
+ segname: __TEXT
+ addr: 0x3FA8
+ size: 88
+ offset: 0x3FA8
+ align: 2
+ reloff: 0x0
+ nreloc: 0
+ flags: 0x0
+ reserved1: 0x0
+ reserved2: 0x0
+ reserved3: 0x0
+ content: 010000001C000000000000001C000000000000001C00000002000000683F00004000000040000000883F00000000000040000000000000000000000000000000030000000C00010010000100000000000000000400000000
+ - cmd: LC_SEGMENT_64
+ cmdsize: 152
+ segname: __DATA_CONST
+ vmaddr: 16384
+ vmsize: 16384
+ fileoff: 16384
+ filesize: 16384
+ maxprot: 3
+ initprot: 3
+ nsects: 1
+ flags: 16
+ Sections:
+ - sectname: __auth_got
+ segname: __DATA_CONST
+ addr: 0x4000
+ size: 8
+ offset: 0x4000
+ align: 3
+ reloff: 0x0
+ nreloc: 0
+ flags: 0x6
+ reserved1: 0x1
+ reserved2: 0x0
+ reserved3: 0x0
+ content: 00000000000001C0
+ - cmd: LC_SEGMENT_64
+ cmdsize: 72
+ segname: __LINKEDIT
+ vmaddr: 32768
+ vmsize: 16384
+ fileoff: 32768
+ filesize: 608
+ maxprot: 1
+ initprot: 1
+ nsects: 0
+ flags: 0
+ - cmd: LC_ID_DYLIB
+ cmdsize: 48
+ dylib:
+ name: 24
+ timestamp: 1
+ current_version: 0
+ compatibility_version: 0
+ Content: '@rpath/libB.dylib'
+ ZeroPadBytes: 7
+ - cmd: LC_DYLD_CHAINED_FIXUPS
+ cmdsize: 16
+ dataoff: 32768
+ datasize: 96
+ - cmd: LC_DYLD_EXPORTS_TRIE
+ cmdsize: 16
+ dataoff: 32864
+ datasize: 24
+ - cmd: LC_SYMTAB
+ cmdsize: 24
+ symoff: 32896
+ nsyms: 2
+ stroff: 32936
+ strsize: 16
+ - cmd: LC_DYSYMTAB
+ cmdsize: 80
+ ilocalsym: 0
+ nlocalsym: 0
+ iextdefsym: 0
+ nextdefsym: 1
+ iundefsym: 1
+ nundefsym: 1
+ tocoff: 0
+ ntoc: 0
+ modtaboff: 0
+ nmodtab: 0
+ extrefsymoff: 0
+ nextrefsyms: 0
+ indirectsymoff: 32928
+ nindirectsyms: 2
+ extreloff: 0
+ nextrel: 0
+ locreloff: 0
+ nlocrel: 0
+ - cmd: LC_UUID
+ cmdsize: 24
+ uuid: 76B41B3A-00EC-388B-A432-478A96772CC4
+ - cmd: LC_BUILD_VERSION
+ cmdsize: 32
+ platform: 1
+ minos: 983040
+ sdk: 983552
+ ntools: 1
+ Tools:
+ - tool: 3
+ version: 73074435
+ - cmd: LC_SOURCE_VERSION
+ cmdsize: 16
+ version: 0
+ - cmd: LC_LOAD_DYLIB
+ cmdsize: 56
+ dylib:
+ name: 24
+ timestamp: 2
+ current_version: 88539136
+ compatibility_version: 65536
+ Content: '/usr/lib/libSystem.B.dylib'
+ ZeroPadBytes: 6
+ - cmd: LC_FUNCTION_STARTS
+ cmdsize: 16
+ dataoff: 32888
+ datasize: 8
+ - cmd: LC_DATA_IN_CODE
+ cmdsize: 16
+ dataoff: 32896
+ datasize: 0
+ - cmd: LC_CODE_SIGNATURE
+ cmdsize: 16
+ dataoff: 32960
+ datasize: 416
+ LinkEditData:
+ ExportTrie:
+ TerminalSize: 0
+ NodeOffset: 0
+ Name: ''
+ Flags: 0x0
+ Address: 0x0
+ Other: 0x0
+ ImportName: ''
+ Children:
+ - TerminalSize: 3
+ NodeOffset: 13
+ Name: _sayB
+ Flags: 0x0
+ Address: 0x3F68
+ Other: 0x0
+ ImportName: ''
+ NameList:
+ - n_strx: 2
+ n_type: 0xF
+ n_sect: 1
+ n_desc: 0
+ n_value: 16232
+ - n_strx: 8
+ n_type: 0x1
+ n_sect: 0
+ n_desc: 256
+ n_value: 0
+ StringTable:
+ - ' '
+ - _sayB
+ - _printf
+ IndirectSymbols: [ 0x1, 0x1 ]
+ FunctionStarts: [ 0x3F68 ]
+ ChainedFixups: [ 0x0, 0x0, 0x0, 0x0, 0x20, 0x0, 0x0, 0x0, 0x48,
+ 0x0, 0x0, 0x0, 0x50, 0x0, 0x0, 0x0, 0x1, 0x0,
+ 0x0, 0x0, 0x1, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0,
+ 0x0, 0x0, 0x0, 0x0, 0x3, 0x0, 0x0, 0x0, 0x0, 0x0,
+ 0x0, 0x0, 0x10, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0,
+ 0x0, 0x18, 0x0, 0x0, 0x0, 0x0, 0x40, 0xC, 0x0,
+ 0x0, 0x40, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0,
+ 0x0, 0x0, 0x0, 0x1, 0x0, 0x0, 0x0, 0x1, 0x2, 0x0,
+ 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x5F, 0x70, 0x72,
+ 0x69, 0x6E, 0x74, 0x66, 0x0, 0x0, 0x0, 0x0, 0x0,
+ 0x0, 0x0, 0x0 ]
+...
diff --git a/llvm/unittests/ExecutionEngine/Orc/Inputs/C/C_linux.yaml b/llvm/unittests/ExecutionEngine/Orc/Inputs/C/C_linux.yaml
new file mode 100644
index 0000000..3fabf9a
--- /dev/null
+++ b/llvm/unittests/ExecutionEngine/Orc/Inputs/C/C_linux.yaml
@@ -0,0 +1,450 @@
+--- !ELF
+FileHeader:
+ Class: ELFCLASS64
+ Data: ELFDATA2LSB
+ Type: ET_DYN
+ Machine: EM_X86_64
+ProgramHeaders:
+ - Type: PT_LOAD
+ Flags: [ PF_R ]
+ FirstSec: .note.gnu.property
+ LastSec: .rela.plt
+ Align: 0x1000
+ Offset: 0x0
+ - Type: PT_LOAD
+ Flags: [ PF_X, PF_R ]
+ FirstSec: .init
+ LastSec: .fini
+ VAddr: 0x1000
+ Align: 0x1000
+ Offset: 0x1000
+ - Type: PT_LOAD
+ Flags: [ PF_R ]
+ FirstSec: .eh_frame_hdr
+ LastSec: .eh_frame
+ VAddr: 0x2000
+ Align: 0x1000
+ Offset: 0x2000
+ - Type: PT_LOAD
+ Flags: [ PF_W, PF_R ]
+ FirstSec: .init_array
+ LastSec: .bss
+ VAddr: 0x3E10
+ Align: 0x1000
+ Offset: 0x2E10
+ - Type: PT_DYNAMIC
+ Flags: [ PF_W, PF_R ]
+ FirstSec: .dynamic
+ LastSec: .dynamic
+ VAddr: 0x3E20
+ Align: 0x8
+ Offset: 0x2E20
+ - Type: PT_NOTE
+ Flags: [ PF_R ]
+ FirstSec: .note.gnu.property
+ LastSec: .note.gnu.property
+ VAddr: 0x2A8
+ Align: 0x8
+ Offset: 0x2A8
+ - Type: PT_NOTE
+ Flags: [ PF_R ]
+ FirstSec: .note.gnu.build-id
+ LastSec: .note.gnu.build-id
+ VAddr: 0x2C8
+ Align: 0x4
+ Offset: 0x2C8
+ - Type: PT_GNU_PROPERTY
+ Flags: [ PF_R ]
+ FirstSec: .note.gnu.property
+ LastSec: .note.gnu.property
+ VAddr: 0x2A8
+ Align: 0x8
+ Offset: 0x2A8
+ - Type: PT_GNU_EH_FRAME
+ Flags: [ PF_R ]
+ FirstSec: .eh_frame_hdr
+ LastSec: .eh_frame_hdr
+ VAddr: 0x2000
+ Align: 0x4
+ Offset: 0x2000
+ - Type: PT_GNU_STACK
+ Flags: [ PF_W, PF_R ]
+ Align: 0x10
+ Offset: 0x0
+ - Type: PT_GNU_RELRO
+ Flags: [ PF_R ]
+ FirstSec: .init_array
+ LastSec: .got
+ VAddr: 0x3E10
+ Offset: 0x2E10
+Sections:
+ - Name: .note.gnu.property
+ Type: SHT_NOTE
+ Flags: [ SHF_ALLOC ]
+ Address: 0x2A8
+ AddressAlign: 0x8
+ Notes:
+ - Name: GNU
+ Desc: 020000C0040000000300000000000000
+ Type: NT_GNU_PROPERTY_TYPE_0
+ - Name: .note.gnu.build-id
+ Type: SHT_NOTE
+ Flags: [ SHF_ALLOC ]
+ Address: 0x2C8
+ AddressAlign: 0x4
+ Notes:
+ - Name: GNU
+ Desc: 0318D63E46BF31CEFF90D5C7F0475D9F78676EC8
+ Type: NT_PRPSINFO
+ - Name: .gnu.hash
+ Type: SHT_GNU_HASH
+ Flags: [ SHF_ALLOC ]
+ Address: 0x2F0
+ Link: .dynsym
+ AddressAlign: 0x8
+ Header:
+ SymNdx: 0x8
+ Shift2: 0x6
+ BloomFilter: [ 0x400000200000 ]
+ HashBuckets: [ 0x0, 0x8 ]
+ HashValues: [ 0x7C9DCB95 ]
+ - Name: .dynsym
+ Type: SHT_DYNSYM
+ Flags: [ SHF_ALLOC ]
+ Address: 0x318
+ Link: .dynstr
+ AddressAlign: 0x8
+ - Name: .dynstr
+ Type: SHT_STRTAB
+ Flags: [ SHF_ALLOC ]
+ Address: 0x3F0
+ AddressAlign: 0x1
+ Content: "6C6962412E736F006C6962422E736F006C69625A2E736F00244F524947494E2F2E2E2F413A244F524947494E2F2E2E2F423A244F524947494E2F2E2E2F5A"
+ - Name: .rela.dyn
+ Type: SHT_RELA
+ Flags: [ SHF_ALLOC ]
+ Address: 0x498
+ Link: .dynsym
+ AddressAlign: 0x8
+ Relocations:
+ - Offset: 0x3E10
+ Type: R_X86_64_RELATIVE
+ Addend: 4432
+ - Offset: 0x3E18
+ Type: R_X86_64_RELATIVE
+ Addend: 4368
+ - Offset: 0x4030
+ Type: R_X86_64_RELATIVE
+ Addend: 16432
+ - Offset: 0x3FE0
+ Symbol: __cxa_finalize
+ Type: R_X86_64_GLOB_DAT
+ - Offset: 0x3FE8
+ Symbol: _ITM_registerTMCloneTable
+ Type: R_X86_64_GLOB_DAT
+ - Offset: 0x3FF0
+ Symbol: _ITM_deregisterTMCloneTable
+ Type: R_X86_64_GLOB_DAT
+ - Offset: 0x3FF8
+ Symbol: __gmon_start__
+ Type: R_X86_64_GLOB_DAT
+ - Name: .rela.plt
+ Type: SHT_RELA
+ Flags: [ SHF_ALLOC, SHF_INFO_LINK ]
+ Address: 0x540
+ Link: .dynsym
+ AddressAlign: 0x8
+ Info: .got.plt
+ Relocations:
+ - Offset: 0x4018
+ Symbol: sayA
+ Type: R_X86_64_JUMP_SLOT
+ - Offset: 0x4020
+ Symbol: sayB
+ Type: R_X86_64_JUMP_SLOT
+ - Offset: 0x4028
+ Symbol: sayZ
+ Type: R_X86_64_JUMP_SLOT
+ - Name: .init
+ Type: SHT_PROGBITS
+ Flags: [ SHF_ALLOC, SHF_EXECINSTR ]
+ Address: 0x1000
+ AddressAlign: 0x4
+ Offset: 0x1000
+ Content: F30F1EFA4883EC08488B05E92F00004885C07402FFD04883C408C3
+ - Name: .plt
+ Type: SHT_PROGBITS
+ Flags: [ SHF_ALLOC, SHF_EXECINSTR ]
+ Address: 0x1020
+ AddressAlign: 0x10
+ EntSize: 0x10
+ Content: FF35E22F0000F2FF25E32F00000F1F00F30F1EFA6800000000F2E9E1FFFFFF90F30F1EFA6801000000F2E9D1FFFFFF90F30F1EFA6802000000F2E9C1FFFFFF90
+ - Name: .plt.got
+ Type: SHT_PROGBITS
+ Flags: [ SHF_ALLOC, SHF_EXECINSTR ]
+ Address: 0x1060
+ AddressAlign: 0x10
+ EntSize: 0x10
+ Content: F30F1EFAF2FF25752F00000F1F440000
+ - Name: .plt.sec
+ Type: SHT_PROGBITS
+ Flags: [ SHF_ALLOC, SHF_EXECINSTR ]
+ Address: 0x1070
+ AddressAlign: 0x10
+ EntSize: 0x10
+ Content: F30F1EFAF2FF259D2F00000F1F440000F30F1EFAF2FF25952F00000F1F440000F30F1EFAF2FF258D2F00000F1F440000
+ - Name: .text
+ Type: SHT_PROGBITS
+ Flags: [ SHF_ALLOC, SHF_EXECINSTR ]
+ Address: 0x10A0
+ AddressAlign: 0x10
+ Content: 488D3D912F0000488D058A2F00004839F87415488B05362F00004885C07409FFE00F1F8000000000C30F1F8000000000488D3D612F0000488D355A2F00004829FE4889F048C1EE3F48C1F8034801C648D1FE7414488B05ED2E00004885C07408FFE0660F1F440000C30F1F8000000000F30F1EFA803D1D2F000000752B5548833DBA2E0000004889E5740C488B3DFE2E0000E829FFFFFFE864FFFFFFC605F52E0000015DC30F1F00C30F1F8000000000F30F1EFAE977FFFFFFF30F1EFA554889E5B800000000E805FFFFFFB800000000E80BFFFFFFB800000000E811FFFFFF905DC3
+ - Name: .fini
+ Type: SHT_PROGBITS
+ Flags: [ SHF_ALLOC, SHF_EXECINSTR ]
+ Address: 0x1184
+ AddressAlign: 0x4
+ Content: F30F1EFA4883EC084883C408C3
+ - Name: .eh_frame_hdr
+ Type: SHT_PROGBITS
+ Flags: [ SHF_ALLOC ]
+ Address: 0x2000
+ AddressAlign: 0x4
+ Offset: 0x2000
+ Content: 011B033B2C0000000400000020F0FFFF4800000060F0FFFF7000000070F0FFFF8800000059F1FFFFA0000000
+ - Name: .eh_frame
+ Type: SHT_PROGBITS
+ Flags: [ SHF_ALLOC ]
+ Address: 0x2030
+ AddressAlign: 0x8
+ Content: 1400000000000000017A5200017810011B0C070890010000240000001C000000D0EFFFFF40000000000E10460E184A0F0B770880003F1A3A2A332422000000001400000044000000E8EFFFFF100000000000000000000000140000005C000000E0EFFFFF3000000000000000000000001C00000074000000B1F0FFFF2900000000450E108602430D06600C070800000000000000
+ - Name: .init_array
+ Type: SHT_INIT_ARRAY
+ Flags: [ SHF_WRITE, SHF_ALLOC ]
+ Address: 0x3E10
+ AddressAlign: 0x8
+ EntSize: 0x8
+ Offset: 0x2E10
+ Content: '5011000000000000'
+ - Name: .fini_array
+ Type: SHT_FINI_ARRAY
+ Flags: [ SHF_WRITE, SHF_ALLOC ]
+ Address: 0x3E18
+ AddressAlign: 0x8
+ EntSize: 0x8
+ Content: '1011000000000000'
+ - Name: .dynamic
+ Type: SHT_DYNAMIC
+ Flags: [ SHF_WRITE, SHF_ALLOC ]
+ Address: 0x3E20
+ Link: .dynstr
+ AddressAlign: 0x8
+ Entries:
+ - Tag: DT_NEEDED
+ Value: 0x0
+ - Tag: DT_NEEDED
+ Value: 0x8
+ - Tag: DT_NEEDED
+ Value: 0x10
+ - Tag: DT_RUNPATH
+ Value: 0x18
+ - Tag: DT_INIT
+ Value: 0x1000
+ - Tag: DT_FINI
+ Value: 0x1184
+ - Tag: DT_INIT_ARRAY
+ Value: 0x3E10
+ - Tag: DT_INIT_ARRAYSZ
+ Value: 0x8
+ - Tag: DT_FINI_ARRAY
+ Value: 0x3E18
+ - Tag: DT_FINI_ARRAYSZ
+ Value: 0x8
+ - Tag: DT_GNU_HASH
+ Value: 0x2F0
+ - Tag: DT_STRTAB
+ Value: 0x3F0
+ - Tag: DT_SYMTAB
+ Value: 0x318
+ - Tag: DT_STRSZ
+ Value: 0xA8
+ - Tag: DT_SYMENT
+ Value: 0x18
+ - Tag: DT_PLTGOT
+ Value: 0x4000
+ - Tag: DT_PLTRELSZ
+ Value: 0x48
+ - Tag: DT_PLTREL
+ Value: 0x7
+ - Tag: DT_JMPREL
+ Value: 0x540
+ - Tag: DT_RELA
+ Value: 0x498
+ - Tag: DT_RELASZ
+ Value: 0xA8
+ - Tag: DT_RELAENT
+ Value: 0x18
+ - Tag: DT_RELACOUNT
+ Value: 0x3
+ - Tag: DT_NULL
+ Value: 0x0
+ - Tag: DT_NULL
+ Value: 0x0
+ - Tag: DT_NULL
+ Value: 0x0
+ - Tag: DT_NULL
+ Value: 0x0
+ - Tag: DT_NULL
+ Value: 0x0
+ - Name: .got
+ Type: SHT_PROGBITS
+ Flags: [ SHF_WRITE, SHF_ALLOC ]
+ Address: 0x3FE0
+ AddressAlign: 0x8
+ EntSize: 0x8
+ Content: '0000000000000000000000000000000000000000000000000000000000000000'
+ - Name: .got.plt
+ Type: SHT_PROGBITS
+ Flags: [ SHF_WRITE, SHF_ALLOC ]
+ Address: 0x4000
+ AddressAlign: 0x8
+ EntSize: 0x8
+ Content: '203E00000000000000000000000000000000000000000000301000000000000040100000000000005010000000000000'
+ - Name: .data
+ Type: SHT_PROGBITS
+ Flags: [ SHF_WRITE, SHF_ALLOC ]
+ Address: 0x4030
+ AddressAlign: 0x8
+ Content: '3040000000000000'
+ - Name: .bss
+ Type: SHT_NOBITS
+ Flags: [ SHF_WRITE, SHF_ALLOC ]
+ Address: 0x4038
+ AddressAlign: 0x1
+ Size: 0x8
+ - Name: .comment
+ Type: SHT_PROGBITS
+ Flags: [ SHF_MERGE, SHF_STRINGS ]
+ AddressAlign: 0x1
+ EntSize: 0x1
+ Content: 4743433A20285562756E74752031312E342E302D317562756E7475317E32322E30342E32292031312E342E3000
+Symbols:
+ - Name: crtstuff.c
+ Type: STT_FILE
+ Index: SHN_ABS
+ - Name: deregister_tm_clones
+ Type: STT_FUNC
+ Section: .text
+ Value: 0x10A0
+ - Name: register_tm_clones
+ Type: STT_FUNC
+ Section: .text
+ Value: 0x10D0
+ - Name: __do_global_dtors_aux
+ Type: STT_FUNC
+ Section: .text
+ Value: 0x1110
+ - Name: completed.0
+ Type: STT_OBJECT
+ Section: .bss
+ Value: 0x4038
+ Size: 0x1
+ - Name: __do_global_dtors_aux_fini_array_entry
+ Type: STT_OBJECT
+ Section: .fini_array
+ Value: 0x3E18
+ - Name: frame_dummy
+ Type: STT_FUNC
+ Section: .text
+ Value: 0x1150
+ - Name: __frame_dummy_init_array_entry
+ Type: STT_OBJECT
+ Section: .init_array
+ Value: 0x3E10
+ - Name: libC.c
+ Type: STT_FILE
+ Index: SHN_ABS
+ - Name: 'crtstuff.c (1)'
+ Type: STT_FILE
+ Index: SHN_ABS
+ - Name: __FRAME_END__
+ Type: STT_OBJECT
+ Section: .eh_frame
+ Value: 0x20C0
+ - Type: STT_FILE
+ Index: SHN_ABS
+ - Name: _DYNAMIC
+ Type: STT_OBJECT
+ Section: .dynamic
+ Value: 0x3E20
+ - Name: __TMC_END__
+ Type: STT_OBJECT
+ Section: .data
+ Value: 0x4038
+ - Name: __dso_handle
+ Type: STT_OBJECT
+ Section: .data
+ Value: 0x4030
+ - Name: _init
+ Type: STT_FUNC
+ Section: .init
+ Value: 0x1000
+ - Name: __GNU_EH_FRAME_HDR
+ Section: .eh_frame_hdr
+ Value: 0x2000
+ - Name: _fini
+ Type: STT_FUNC
+ Section: .fini
+ Value: 0x1184
+ - Name: _GLOBAL_OFFSET_TABLE_
+ Type: STT_OBJECT
+ Section: .got.plt
+ Value: 0x4000
+ - Name: __cxa_finalize
+ Binding: STB_WEAK
+ - Name: sayC
+ Type: STT_FUNC
+ Section: .text
+ Binding: STB_GLOBAL
+ Value: 0x1159
+ Size: 0x29
+ - Name: _ITM_registerTMCloneTable
+ Binding: STB_WEAK
+ - Name: _ITM_deregisterTMCloneTable
+ Binding: STB_WEAK
+ - Name: sayA
+ Type: STT_FUNC
+ Binding: STB_GLOBAL
+ - Name: sayB
+ Type: STT_FUNC
+ Binding: STB_GLOBAL
+ - Name: sayZ
+ Type: STT_FUNC
+ Binding: STB_GLOBAL
+ - Name: __gmon_start__
+ Binding: STB_WEAK
+DynamicSymbols:
+ - Name: __cxa_finalize
+ Binding: STB_WEAK
+ - Name: _ITM_registerTMCloneTable
+ Binding: STB_WEAK
+ - Name: _ITM_deregisterTMCloneTable
+ Binding: STB_WEAK
+ - Name: sayA
+ Type: STT_FUNC
+ Binding: STB_GLOBAL
+ - Name: sayB
+ Type: STT_FUNC
+ Binding: STB_GLOBAL
+ - Name: sayZ
+ Type: STT_FUNC
+ Binding: STB_GLOBAL
+ - Name: __gmon_start__
+ Binding: STB_WEAK
+ - Name: sayC
+ Type: STT_FUNC
+ Section: .text
+ Binding: STB_GLOBAL
+ Value: 0x1159
+ Size: 0x29
+...
diff --git a/llvm/unittests/ExecutionEngine/Orc/Inputs/C/C_macho.yaml b/llvm/unittests/ExecutionEngine/Orc/Inputs/C/C_macho.yaml
new file mode 100644
index 0000000..ba33483
--- /dev/null
+++ b/llvm/unittests/ExecutionEngine/Orc/Inputs/C/C_macho.yaml
@@ -0,0 +1,870 @@
+--- !fat-mach-o
+FatHeader:
+ magic: 0xCAFEBABE
+ nfat_arch: 3
+FatArchs:
+ - cputype: 0x1000007
+ cpusubtype: 0x3
+ offset: 0x1000
+ size: 8456
+ align: 12
+ - cputype: 0x100000C
+ cpusubtype: 0x0
+ offset: 0x4000
+ size: 33456
+ align: 14
+ - cputype: 0x100000C
+ cpusubtype: 0x80000002
+ offset: 0x10000
+ size: 33456
+ align: 14
+Slices:
+ - !mach-o
+ FileHeader:
+ magic: 0xFEEDFACF
+ cputype: 0x1000007
+ cpusubtype: 0x3
+ filetype: 0x6
+ ncmds: 20
+ sizeofcmds: 1120
+ flags: 0x100085
+ reserved: 0x0
+ LoadCommands:
+ - cmd: LC_SEGMENT_64
+ cmdsize: 312
+ segname: __TEXT
+ vmaddr: 0
+ vmsize: 4096
+ fileoff: 0
+ filesize: 4096
+ maxprot: 5
+ initprot: 5
+ nsects: 3
+ flags: 0
+ Sections:
+ - sectname: __text
+ segname: __TEXT
+ addr: 0xF70
+ size: 27
+ offset: 0xF70
+ align: 4
+ reloff: 0x0
+ nreloc: 0
+ flags: 0x80000400
+ reserved1: 0x0
+ reserved2: 0x0
+ reserved3: 0x0
+ content: 554889E5B000E811000000B000E810000000B000E80F0000005DC3
+ - sectname: __stubs
+ segname: __TEXT
+ addr: 0xF8C
+ size: 18
+ offset: 0xF8C
+ align: 1
+ reloff: 0x0
+ nreloc: 0
+ flags: 0x80000408
+ reserved1: 0x0
+ reserved2: 0x6
+ reserved3: 0x0
+ content: FF256E000000FF2570000000FF2572000000
+ - sectname: __unwind_info
+ segname: __TEXT
+ addr: 0xFA0
+ size: 88
+ offset: 0xFA0
+ align: 2
+ reloff: 0x0
+ nreloc: 0
+ flags: 0x0
+ reserved1: 0x0
+ reserved2: 0x0
+ reserved3: 0x0
+ content: 010000001C000000000000001C000000000000001C00000002000000700F000040000000400000008B0F00000000000040000000000000000000000000000000030000000C00010010000100000000000000000100000000
+ - cmd: LC_SEGMENT_64
+ cmdsize: 152
+ segname: __DATA_CONST
+ vmaddr: 4096
+ vmsize: 4096
+ fileoff: 4096
+ filesize: 4096
+ maxprot: 3
+ initprot: 3
+ nsects: 1
+ flags: 16
+ Sections:
+ - sectname: __got
+ segname: __DATA_CONST
+ addr: 0x1000
+ size: 24
+ offset: 0x1000
+ align: 3
+ reloff: 0x0
+ nreloc: 0
+ flags: 0x6
+ reserved1: 0x3
+ reserved2: 0x0
+ reserved3: 0x0
+ content: '000000000000108001000000000010800200000000000080'
+ - cmd: LC_SEGMENT_64
+ cmdsize: 72
+ segname: __LINKEDIT
+ vmaddr: 8192
+ vmsize: 4096
+ fileoff: 8192
+ filesize: 264
+ maxprot: 1
+ initprot: 1
+ nsects: 0
+ flags: 0
+ - cmd: LC_ID_DYLIB
+ cmdsize: 48
+ dylib:
+ name: 24
+ timestamp: 1
+ current_version: 0
+ compatibility_version: 0
+ Content: '@rpath/libC.dylib'
+ ZeroPadBytes: 7
+ - cmd: LC_DYLD_CHAINED_FIXUPS
+ cmdsize: 16
+ dataoff: 8192
+ datasize: 112
+ - cmd: LC_DYLD_EXPORTS_TRIE
+ cmdsize: 16
+ dataoff: 8304
+ datasize: 24
+ - cmd: LC_SYMTAB
+ cmdsize: 24
+ symoff: 8336
+ nsyms: 4
+ stroff: 8424
+ strsize: 32
+ - cmd: LC_DYSYMTAB
+ cmdsize: 80
+ ilocalsym: 0
+ nlocalsym: 0
+ iextdefsym: 0
+ nextdefsym: 1
+ iundefsym: 1
+ nundefsym: 3
+ tocoff: 0
+ ntoc: 0
+ modtaboff: 0
+ nmodtab: 0
+ extrefsymoff: 0
+ nextrefsyms: 0
+ indirectsymoff: 8400
+ nindirectsyms: 6
+ extreloff: 0
+ nextrel: 0
+ locreloff: 0
+ nlocrel: 0
+ - cmd: LC_UUID
+ cmdsize: 24
+ uuid: 2AA1F9E9-F250-366F-B382-51A91DE06BED
+ - cmd: LC_BUILD_VERSION
+ cmdsize: 32
+ platform: 1
+ minos: 983040
+ sdk: 983552
+ ntools: 1
+ Tools:
+ - tool: 3
+ version: 73074435
+ - cmd: LC_SOURCE_VERSION
+ cmdsize: 16
+ version: 0
+ - cmd: LC_LOAD_DYLIB
+ cmdsize: 48
+ dylib:
+ name: 24
+ timestamp: 2
+ current_version: 0
+ compatibility_version: 0
+ Content: '@rpath/libA.dylib'
+ ZeroPadBytes: 7
+ - cmd: LC_LOAD_DYLIB
+ cmdsize: 48
+ dylib:
+ name: 24
+ timestamp: 2
+ current_version: 0
+ compatibility_version: 0
+ Content: '@rpath/libB.dylib'
+ ZeroPadBytes: 7
+ - cmd: LC_LOAD_DYLIB
+ cmdsize: 48
+ dylib:
+ name: 24
+ timestamp: 2
+ current_version: 0
+ compatibility_version: 0
+ Content: '@rpath/libZ.dylib'
+ ZeroPadBytes: 7
+ - cmd: LC_LOAD_DYLIB
+ cmdsize: 56
+ dylib:
+ name: 24
+ timestamp: 2
+ current_version: 88539136
+ compatibility_version: 65536
+ Content: '/usr/lib/libSystem.B.dylib'
+ ZeroPadBytes: 6
+ - cmd: LC_RPATH
+ cmdsize: 32
+ path: 12
+ Content: '@loader_path/../A'
+ ZeroPadBytes: 3
+ - cmd: LC_RPATH
+ cmdsize: 32
+ path: 12
+ Content: '@loader_path/../B'
+ ZeroPadBytes: 3
+ - cmd: LC_RPATH
+ cmdsize: 32
+ path: 12
+ Content: '@loader_path/../Z'
+ ZeroPadBytes: 3
+ - cmd: LC_FUNCTION_STARTS
+ cmdsize: 16
+ dataoff: 8328
+ datasize: 8
+ - cmd: LC_DATA_IN_CODE
+ cmdsize: 16
+ dataoff: 8336
+ datasize: 0
+ LinkEditData:
+ ExportTrie:
+ TerminalSize: 0
+ NodeOffset: 0
+ Name: ''
+ Flags: 0x0
+ Address: 0x0
+ Other: 0x0
+ ImportName: ''
+ Children:
+ - TerminalSize: 3
+ NodeOffset: 13
+ Name: _sayC
+ Flags: 0x0
+ Address: 0xF70
+ Other: 0x0
+ ImportName: ''
+ NameList:
+ - n_strx: 2
+ n_type: 0xF
+ n_sect: 1
+ n_desc: 0
+ n_value: 3952
+ - n_strx: 8
+ n_type: 0x1
+ n_sect: 0
+ n_desc: 256
+ n_value: 0
+ - n_strx: 14
+ n_type: 0x1
+ n_sect: 0
+ n_desc: 512
+ n_value: 0
+ - n_strx: 20
+ n_type: 0x1
+ n_sect: 0
+ n_desc: 768
+ n_value: 0
+ StringTable:
+ - ' '
+ - _sayC
+ - _sayA
+ - _sayB
+ - _sayZ
+ - ''
+ - ''
+ - ''
+ - ''
+ - ''
+ - ''
+ IndirectSymbols: [ 0x1, 0x2, 0x3, 0x1, 0x2, 0x3 ]
+ FunctionStarts: [ 0xF70 ]
+ ChainedFixups: [ 0x0, 0x0, 0x0, 0x0, 0x20, 0x0, 0x0, 0x0, 0x48,
+ 0x0, 0x0, 0x0, 0x58, 0x0, 0x0, 0x0, 0x3, 0x0,
+ 0x0, 0x0, 0x1, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0,
+ 0x0, 0x0, 0x0, 0x0, 0x3, 0x0, 0x0, 0x0, 0x0, 0x0,
+ 0x0, 0x0, 0x10, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0,
+ 0x0, 0x18, 0x0, 0x0, 0x0, 0x0, 0x10, 0x6, 0x0,
+ 0x0, 0x10, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0,
+ 0x0, 0x0, 0x0, 0x1, 0x0, 0x0, 0x0, 0x1, 0x2, 0x0,
+ 0x0, 0x2, 0xE, 0x0, 0x0, 0x3, 0x1A, 0x0, 0x0,
+ 0x0, 0x0, 0x0, 0x0, 0x0, 0x5F, 0x73, 0x61, 0x79,
+ 0x41, 0x0, 0x5F, 0x73, 0x61, 0x79, 0x42, 0x0,
+ 0x5F, 0x73, 0x61, 0x79, 0x5A, 0x0, 0x0, 0x0, 0x0,
+ 0x0, 0x0 ]
+ - !mach-o
+ FileHeader:
+ magic: 0xFEEDFACF
+ cputype: 0x100000C
+ cpusubtype: 0x0
+ filetype: 0x6
+ ncmds: 21
+ sizeofcmds: 1136
+ flags: 0x100085
+ reserved: 0x0
+ LoadCommands:
+ - cmd: LC_SEGMENT_64
+ cmdsize: 312
+ segname: __TEXT
+ vmaddr: 0
+ vmsize: 16384
+ fileoff: 0
+ filesize: 16384
+ maxprot: 5
+ initprot: 5
+ nsects: 3
+ flags: 0
+ Sections:
+ - sectname: __text
+ segname: __TEXT
+ addr: 0x3F68
+ size: 28
+ offset: 0x3F68
+ align: 2
+ reloff: 0x0
+ nreloc: 0
+ flags: 0x80000400
+ reserved1: 0x0
+ reserved2: 0x0
+ reserved3: 0x0
+ content: FD7BBFA9FD030091050000940700009409000094FD7BC1A8C0035FD6
+ - sectname: __stubs
+ segname: __TEXT
+ addr: 0x3F84
+ size: 36
+ offset: 0x3F84
+ align: 2
+ reloff: 0x0
+ nreloc: 0
+ flags: 0x80000408
+ reserved1: 0x0
+ reserved2: 0xC
+ reserved3: 0x0
+ content: 100000B0100240F900021FD6100000B0100640F900021FD6100000B0100A40F900021FD6
+ - sectname: __unwind_info
+ segname: __TEXT
+ addr: 0x3FA8
+ size: 88
+ offset: 0x3FA8
+ align: 2
+ reloff: 0x0
+ nreloc: 0
+ flags: 0x0
+ reserved1: 0x0
+ reserved2: 0x0
+ reserved3: 0x0
+ content: 010000001C000000000000001C000000000000001C00000002000000683F00004000000040000000843F00000000000040000000000000000000000000000000030000000C00010010000100000000000000000400000000
+ - cmd: LC_SEGMENT_64
+ cmdsize: 152
+ segname: __DATA_CONST
+ vmaddr: 16384
+ vmsize: 16384
+ fileoff: 16384
+ filesize: 16384
+ maxprot: 3
+ initprot: 3
+ nsects: 1
+ flags: 16
+ Sections:
+ - sectname: __got
+ segname: __DATA_CONST
+ addr: 0x4000
+ size: 24
+ offset: 0x4000
+ align: 3
+ reloff: 0x0
+ nreloc: 0
+ flags: 0x6
+ reserved1: 0x3
+ reserved2: 0x0
+ reserved3: 0x0
+ content: '000000000000108001000000000010800200000000000080'
+ - cmd: LC_SEGMENT_64
+ cmdsize: 72
+ segname: __LINKEDIT
+ vmaddr: 32768
+ vmsize: 16384
+ fileoff: 32768
+ filesize: 688
+ maxprot: 1
+ initprot: 1
+ nsects: 0
+ flags: 0
+ - cmd: LC_ID_DYLIB
+ cmdsize: 48
+ dylib:
+ name: 24
+ timestamp: 1
+ current_version: 0
+ compatibility_version: 0
+ Content: '@rpath/libC.dylib'
+ ZeroPadBytes: 7
+ - cmd: LC_DYLD_CHAINED_FIXUPS
+ cmdsize: 16
+ dataoff: 32768
+ datasize: 112
+ - cmd: LC_DYLD_EXPORTS_TRIE
+ cmdsize: 16
+ dataoff: 32880
+ datasize: 24
+ - cmd: LC_SYMTAB
+ cmdsize: 24
+ symoff: 32912
+ nsyms: 4
+ stroff: 33000
+ strsize: 32
+ - cmd: LC_DYSYMTAB
+ cmdsize: 80
+ ilocalsym: 0
+ nlocalsym: 0
+ iextdefsym: 0
+ nextdefsym: 1
+ iundefsym: 1
+ nundefsym: 3
+ tocoff: 0
+ ntoc: 0
+ modtaboff: 0
+ nmodtab: 0
+ extrefsymoff: 0
+ nextrefsyms: 0
+ indirectsymoff: 32976
+ nindirectsyms: 6
+ extreloff: 0
+ nextrel: 0
+ locreloff: 0
+ nlocrel: 0
+ - cmd: LC_UUID
+ cmdsize: 24
+ uuid: 02B69690-925D-35EE-A8AB-6D99813D2A16
+ - cmd: LC_BUILD_VERSION
+ cmdsize: 32
+ platform: 1
+ minos: 983040
+ sdk: 983552
+ ntools: 1
+ Tools:
+ - tool: 3
+ version: 73074435
+ - cmd: LC_SOURCE_VERSION
+ cmdsize: 16
+ version: 0
+ - cmd: LC_LOAD_DYLIB
+ cmdsize: 48
+ dylib:
+ name: 24
+ timestamp: 2
+ current_version: 0
+ compatibility_version: 0
+ Content: '@rpath/libA.dylib'
+ ZeroPadBytes: 7
+ - cmd: LC_LOAD_DYLIB
+ cmdsize: 48
+ dylib:
+ name: 24
+ timestamp: 2
+ current_version: 0
+ compatibility_version: 0
+ Content: '@rpath/libB.dylib'
+ ZeroPadBytes: 7
+ - cmd: LC_LOAD_DYLIB
+ cmdsize: 48
+ dylib:
+ name: 24
+ timestamp: 2
+ current_version: 0
+ compatibility_version: 0
+ Content: '@rpath/libZ.dylib'
+ ZeroPadBytes: 7
+ - cmd: LC_LOAD_DYLIB
+ cmdsize: 56
+ dylib:
+ name: 24
+ timestamp: 2
+ current_version: 88539136
+ compatibility_version: 65536
+ Content: '/usr/lib/libSystem.B.dylib'
+ ZeroPadBytes: 6
+ - cmd: LC_RPATH
+ cmdsize: 32
+ path: 12
+ Content: '@loader_path/../A'
+ ZeroPadBytes: 3
+ - cmd: LC_RPATH
+ cmdsize: 32
+ path: 12
+ Content: '@loader_path/../B'
+ ZeroPadBytes: 3
+ - cmd: LC_RPATH
+ cmdsize: 32
+ path: 12
+ Content: '@loader_path/../Z'
+ ZeroPadBytes: 3
+ - cmd: LC_FUNCTION_STARTS
+ cmdsize: 16
+ dataoff: 32904
+ datasize: 8
+ - cmd: LC_DATA_IN_CODE
+ cmdsize: 16
+ dataoff: 32912
+ datasize: 0
+ - cmd: LC_CODE_SIGNATURE
+ cmdsize: 16
+ dataoff: 33040
+ datasize: 416
+ LinkEditData:
+ ExportTrie:
+ TerminalSize: 0
+ NodeOffset: 0
+ Name: ''
+ Flags: 0x0
+ Address: 0x0
+ Other: 0x0
+ ImportName: ''
+ Children:
+ - TerminalSize: 3
+ NodeOffset: 13
+ Name: _sayC
+ Flags: 0x0
+ Address: 0x3F68
+ Other: 0x0
+ ImportName: ''
+ NameList:
+ - n_strx: 2
+ n_type: 0xF
+ n_sect: 1
+ n_desc: 0
+ n_value: 16232
+ - n_strx: 8
+ n_type: 0x1
+ n_sect: 0
+ n_desc: 256
+ n_value: 0
+ - n_strx: 14
+ n_type: 0x1
+ n_sect: 0
+ n_desc: 512
+ n_value: 0
+ - n_strx: 20
+ n_type: 0x1
+ n_sect: 0
+ n_desc: 768
+ n_value: 0
+ StringTable:
+ - ' '
+ - _sayC
+ - _sayA
+ - _sayB
+ - _sayZ
+ - ''
+ - ''
+ - ''
+ - ''
+ - ''
+ - ''
+ IndirectSymbols: [ 0x1, 0x2, 0x3, 0x1, 0x2, 0x3 ]
+ FunctionStarts: [ 0x3F68 ]
+ ChainedFixups: [ 0x0, 0x0, 0x0, 0x0, 0x20, 0x0, 0x0, 0x0, 0x48,
+ 0x0, 0x0, 0x0, 0x58, 0x0, 0x0, 0x0, 0x3, 0x0,
+ 0x0, 0x0, 0x1, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0,
+ 0x0, 0x0, 0x0, 0x0, 0x3, 0x0, 0x0, 0x0, 0x0, 0x0,
+ 0x0, 0x0, 0x10, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0,
+ 0x0, 0x18, 0x0, 0x0, 0x0, 0x0, 0x40, 0x6, 0x0,
+ 0x0, 0x40, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0,
+ 0x0, 0x0, 0x0, 0x1, 0x0, 0x0, 0x0, 0x1, 0x2, 0x0,
+ 0x0, 0x2, 0xE, 0x0, 0x0, 0x3, 0x1A, 0x0, 0x0,
+ 0x0, 0x0, 0x0, 0x0, 0x0, 0x5F, 0x73, 0x61, 0x79,
+ 0x41, 0x0, 0x5F, 0x73, 0x61, 0x79, 0x42, 0x0,
+ 0x5F, 0x73, 0x61, 0x79, 0x5A, 0x0, 0x0, 0x0, 0x0,
+ 0x0, 0x0 ]
+ - !mach-o
+ FileHeader:
+ magic: 0xFEEDFACF
+ cputype: 0x100000C
+ cpusubtype: 0x80000002
+ filetype: 0x6
+ ncmds: 21
+ sizeofcmds: 1136
+ flags: 0x100085
+ reserved: 0x0
+ LoadCommands:
+ - cmd: LC_SEGMENT_64
+ cmdsize: 312
+ segname: __TEXT
+ vmaddr: 0
+ vmsize: 16384
+ fileoff: 0
+ filesize: 16384
+ maxprot: 5
+ initprot: 5
+ nsects: 3
+ flags: 0
+ Sections:
+ - sectname: __text
+ segname: __TEXT
+ addr: 0x3F58
+ size: 32
+ offset: 0x3F58
+ align: 2
+ reloff: 0x0
+ nreloc: 0
+ flags: 0x80000400
+ reserved1: 0x0
+ reserved2: 0x0
+ reserved3: 0x0
+ content: 7F2303D5FD7BBFA9FD03009105000094080000940B000094FD7BC1A8FF0F5FD6
+ - sectname: __auth_stubs
+ segname: __TEXT
+ addr: 0x3F78
+ size: 48
+ offset: 0x3F78
+ align: 2
+ reloff: 0x0
+ nreloc: 0
+ flags: 0x80000408
+ reserved1: 0x0
+ reserved2: 0x10
+ reserved3: 0x0
+ content: 110000B031020091300240F9110A1FD7110000B031220091300240F9110A1FD7110000B031420091300240F9110A1FD7
+ - sectname: __unwind_info
+ segname: __TEXT
+ addr: 0x3FA8
+ size: 88
+ offset: 0x3FA8
+ align: 2
+ reloff: 0x0
+ nreloc: 0
+ flags: 0x0
+ reserved1: 0x0
+ reserved2: 0x0
+ reserved3: 0x0
+ content: 010000001C000000000000001C000000000000001C00000002000000583F00004000000040000000783F00000000000040000000000000000000000000000000030000000C00010010000100000000000000000400000000
+ - cmd: LC_SEGMENT_64
+ cmdsize: 152
+ segname: __DATA_CONST
+ vmaddr: 16384
+ vmsize: 16384
+ fileoff: 16384
+ filesize: 16384
+ maxprot: 3
+ initprot: 3
+ nsects: 1
+ flags: 16
+ Sections:
+ - sectname: __auth_got
+ segname: __DATA_CONST
+ addr: 0x4000
+ size: 24
+ offset: 0x4000
+ align: 3
+ reloff: 0x0
+ nreloc: 0
+ flags: 0x6
+ reserved1: 0x3
+ reserved2: 0x0
+ reserved3: 0x0
+ content: 00000000000009C001000000000009C002000000000001C0
+ - cmd: LC_SEGMENT_64
+ cmdsize: 72
+ segname: __LINKEDIT
+ vmaddr: 32768
+ vmsize: 16384
+ fileoff: 32768
+ filesize: 688
+ maxprot: 1
+ initprot: 1
+ nsects: 0
+ flags: 0
+ - cmd: LC_ID_DYLIB
+ cmdsize: 48
+ dylib:
+ name: 24
+ timestamp: 1
+ current_version: 0
+ compatibility_version: 0
+ Content: '@rpath/libC.dylib'
+ ZeroPadBytes: 7
+ - cmd: LC_DYLD_CHAINED_FIXUPS
+ cmdsize: 16
+ dataoff: 32768
+ datasize: 112
+ - cmd: LC_DYLD_EXPORTS_TRIE
+ cmdsize: 16
+ dataoff: 32880
+ datasize: 24
+ - cmd: LC_SYMTAB
+ cmdsize: 24
+ symoff: 32912
+ nsyms: 4
+ stroff: 33000
+ strsize: 32
+ - cmd: LC_DYSYMTAB
+ cmdsize: 80
+ ilocalsym: 0
+ nlocalsym: 0
+ iextdefsym: 0
+ nextdefsym: 1
+ iundefsym: 1
+ nundefsym: 3
+ tocoff: 0
+ ntoc: 0
+ modtaboff: 0
+ nmodtab: 0
+ extrefsymoff: 0
+ nextrefsyms: 0
+ indirectsymoff: 32976
+ nindirectsyms: 6
+ extreloff: 0
+ nextrel: 0
+ locreloff: 0
+ nlocrel: 0
+ - cmd: LC_UUID
+ cmdsize: 24
+ uuid: F54076AA-8888-3DED-8BDF-BC7FB3E6FE8A
+ - cmd: LC_BUILD_VERSION
+ cmdsize: 32
+ platform: 1
+ minos: 983040
+ sdk: 983552
+ ntools: 1
+ Tools:
+ - tool: 3
+ version: 73074435
+ - cmd: LC_SOURCE_VERSION
+ cmdsize: 16
+ version: 0
+ - cmd: LC_LOAD_DYLIB
+ cmdsize: 48
+ dylib:
+ name: 24
+ timestamp: 2
+ current_version: 0
+ compatibility_version: 0
+ Content: '@rpath/libA.dylib'
+ ZeroPadBytes: 7
+ - cmd: LC_LOAD_DYLIB
+ cmdsize: 48
+ dylib:
+ name: 24
+ timestamp: 2
+ current_version: 0
+ compatibility_version: 0
+ Content: '@rpath/libB.dylib'
+ ZeroPadBytes: 7
+ - cmd: LC_LOAD_DYLIB
+ cmdsize: 48
+ dylib:
+ name: 24
+ timestamp: 2
+ current_version: 0
+ compatibility_version: 0
+ Content: '@rpath/libZ.dylib'
+ ZeroPadBytes: 7
+ - cmd: LC_LOAD_DYLIB
+ cmdsize: 56
+ dylib:
+ name: 24
+ timestamp: 2
+ current_version: 88539136
+ compatibility_version: 65536
+ Content: '/usr/lib/libSystem.B.dylib'
+ ZeroPadBytes: 6
+ - cmd: LC_RPATH
+ cmdsize: 32
+ path: 12
+ Content: '@loader_path/../A'
+ ZeroPadBytes: 3
+ - cmd: LC_RPATH
+ cmdsize: 32
+ path: 12
+ Content: '@loader_path/../B'
+ ZeroPadBytes: 3
+ - cmd: LC_RPATH
+ cmdsize: 32
+ path: 12
+ Content: '@loader_path/../Z'
+ ZeroPadBytes: 3
+ - cmd: LC_FUNCTION_STARTS
+ cmdsize: 16
+ dataoff: 32904
+ datasize: 8
+ - cmd: LC_DATA_IN_CODE
+ cmdsize: 16
+ dataoff: 32912
+ datasize: 0
+ - cmd: LC_CODE_SIGNATURE
+ cmdsize: 16
+ dataoff: 33040
+ datasize: 416
+ LinkEditData:
+ ExportTrie:
+ TerminalSize: 0
+ NodeOffset: 0
+ Name: ''
+ Flags: 0x0
+ Address: 0x0
+ Other: 0x0
+ ImportName: ''
+ Children:
+ - TerminalSize: 3
+ NodeOffset: 13
+ Name: _sayC
+ Flags: 0x0
+ Address: 0x3F58
+ Other: 0x0
+ ImportName: ''
+ NameList:
+ - n_strx: 2
+ n_type: 0xF
+ n_sect: 1
+ n_desc: 0
+ n_value: 16216
+ - n_strx: 8
+ n_type: 0x1
+ n_sect: 0
+ n_desc: 256
+ n_value: 0
+ - n_strx: 14
+ n_type: 0x1
+ n_sect: 0
+ n_desc: 512
+ n_value: 0
+ - n_strx: 20
+ n_type: 0x1
+ n_sect: 0
+ n_desc: 768
+ n_value: 0
+ StringTable:
+ - ' '
+ - _sayC
+ - _sayA
+ - _sayB
+ - _sayZ
+ - ''
+ - ''
+ - ''
+ - ''
+ - ''
+ - ''
+ IndirectSymbols: [ 0x1, 0x2, 0x3, 0x1, 0x2, 0x3 ]
+ FunctionStarts: [ 0x3F58 ]
+ ChainedFixups: [ 0x0, 0x0, 0x0, 0x0, 0x20, 0x0, 0x0, 0x0, 0x48,
+ 0x0, 0x0, 0x0, 0x58, 0x0, 0x0, 0x0, 0x3, 0x0,
+ 0x0, 0x0, 0x1, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0,
+ 0x0, 0x0, 0x0, 0x0, 0x3, 0x0, 0x0, 0x0, 0x0, 0x0,
+ 0x0, 0x0, 0x10, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0,
+ 0x0, 0x18, 0x0, 0x0, 0x0, 0x0, 0x40, 0xC, 0x0,
+ 0x0, 0x40, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0,
+ 0x0, 0x0, 0x0, 0x1, 0x0, 0x0, 0x0, 0x1, 0x2, 0x0,
+ 0x0, 0x2, 0xE, 0x0, 0x0, 0x3, 0x1A, 0x0, 0x0,
+ 0x0, 0x0, 0x0, 0x0, 0x0, 0x5F, 0x73, 0x61, 0x79,
+ 0x41, 0x0, 0x5F, 0x73, 0x61, 0x79, 0x42, 0x0,
+ 0x5F, 0x73, 0x61, 0x79, 0x5A, 0x0, 0x0, 0x0, 0x0,
+ 0x0, 0x0 ]
+...
diff --git a/llvm/unittests/ExecutionEngine/Orc/Inputs/Z/Z_linux.yaml b/llvm/unittests/ExecutionEngine/Orc/Inputs/Z/Z_linux.yaml
new file mode 100644
index 0000000..5561f29
--- /dev/null
+++ b/llvm/unittests/ExecutionEngine/Orc/Inputs/Z/Z_linux.yaml
@@ -0,0 +1,460 @@
+--- !ELF
+FileHeader:
+ Class: ELFCLASS64
+ Data: ELFDATA2LSB
+ Type: ET_DYN
+ Machine: EM_X86_64
+ProgramHeaders:
+ - Type: PT_LOAD
+ Flags: [ PF_R ]
+ FirstSec: .note.gnu.property
+ LastSec: .rela.plt
+ Align: 0x1000
+ Offset: 0x0
+ - Type: PT_LOAD
+ Flags: [ PF_X, PF_R ]
+ FirstSec: .init
+ LastSec: .fini
+ VAddr: 0x1000
+ Align: 0x1000
+ Offset: 0x1000
+ - Type: PT_LOAD
+ Flags: [ PF_R ]
+ FirstSec: .rodata
+ LastSec: .eh_frame
+ VAddr: 0x2000
+ Align: 0x1000
+ Offset: 0x2000
+ - Type: PT_LOAD
+ Flags: [ PF_W, PF_R ]
+ FirstSec: .init_array
+ LastSec: .bss
+ VAddr: 0x3E10
+ Align: 0x1000
+ Offset: 0x2E10
+ - Type: PT_DYNAMIC
+ Flags: [ PF_W, PF_R ]
+ FirstSec: .dynamic
+ LastSec: .dynamic
+ VAddr: 0x3E20
+ Align: 0x8
+ Offset: 0x2E20
+ - Type: PT_NOTE
+ Flags: [ PF_R ]
+ FirstSec: .note.gnu.property
+ LastSec: .note.gnu.property
+ VAddr: 0x2A8
+ Align: 0x8
+ Offset: 0x2A8
+ - Type: PT_NOTE
+ Flags: [ PF_R ]
+ FirstSec: .note.gnu.build-id
+ LastSec: .note.gnu.build-id
+ VAddr: 0x2C8
+ Align: 0x4
+ Offset: 0x2C8
+ - Type: PT_GNU_PROPERTY
+ Flags: [ PF_R ]
+ FirstSec: .note.gnu.property
+ LastSec: .note.gnu.property
+ VAddr: 0x2A8
+ Align: 0x8
+ Offset: 0x2A8
+ - Type: PT_GNU_EH_FRAME
+ Flags: [ PF_R ]
+ FirstSec: .eh_frame_hdr
+ LastSec: .eh_frame_hdr
+ VAddr: 0x2010
+ Align: 0x4
+ Offset: 0x2010
+ - Type: PT_GNU_STACK
+ Flags: [ PF_W, PF_R ]
+ Align: 0x10
+ Offset: 0x0
+ - Type: PT_GNU_RELRO
+ Flags: [ PF_R ]
+ FirstSec: .init_array
+ LastSec: .got
+ VAddr: 0x3E10
+ Offset: 0x2E10
+Sections:
+ - Name: .note.gnu.property
+ Type: SHT_NOTE
+ Flags: [ SHF_ALLOC ]
+ Address: 0x2A8
+ AddressAlign: 0x8
+ Notes:
+ - Name: GNU
+ Desc: 020000C0040000000300000000000000
+ Type: NT_GNU_PROPERTY_TYPE_0
+ - Name: .note.gnu.build-id
+ Type: SHT_NOTE
+ Flags: [ SHF_ALLOC ]
+ Address: 0x2C8
+ AddressAlign: 0x4
+ Notes:
+ - Name: GNU
+ Desc: 640A4A3AC0DF6BA3DAC3B51CCD727245117E0B30
+ Type: NT_PRPSINFO
+ - Name: .gnu.hash
+ Type: SHT_GNU_HASH
+ Flags: [ SHF_ALLOC ]
+ Address: 0x2F0
+ Link: .dynsym
+ AddressAlign: 0x8
+ Header:
+ SymNdx: 0x6
+ Shift2: 0x6
+ BloomFilter: [ 0x500000000000 ]
+ HashBuckets: [ 0x6, 0x0 ]
+ HashValues: [ 0x7C9DCBAD ]
+ - Name: .dynsym
+ Type: SHT_DYNSYM
+ Flags: [ SHF_ALLOC ]
+ Address: 0x318
+ Link: .dynstr
+ AddressAlign: 0x8
+ - Name: .dynstr
+ Type: SHT_STRTAB
+ Flags: [ SHF_ALLOC ]
+ Address: 0x3C0
+ AddressAlign: 0x1
+ - Name: .gnu.version
+ Type: SHT_GNU_versym
+ Flags: [ SHF_ALLOC ]
+ Address: 0x436
+ Link: .dynsym
+ AddressAlign: 0x2
+ Entries: [ 0, 1, 2, 1, 1, 2, 1 ]
+ - Name: .gnu.version_r
+ Type: SHT_GNU_verneed
+ Flags: [ SHF_ALLOC ]
+ Address: 0x448
+ Link: .dynstr
+ AddressAlign: 0x8
+ Dependencies:
+ - Version: 1
+ File: libc.so.6
+ Entries:
+ - Name: GLIBC_2.2.5
+ Hash: 157882997
+ Flags: 0
+ Other: 2
+ - Name: .rela.dyn
+ Type: SHT_RELA
+ Flags: [ SHF_ALLOC ]
+ Address: 0x468
+ Link: .dynsym
+ AddressAlign: 0x8
+ Relocations:
+ - Offset: 0x3E10
+ Type: R_X86_64_RELATIVE
+ Addend: 4368
+ - Offset: 0x3E18
+ Type: R_X86_64_RELATIVE
+ Addend: 4304
+ - Offset: 0x4020
+ Type: R_X86_64_RELATIVE
+ Addend: 16416
+ - Offset: 0x3FE0
+ Symbol: _ITM_deregisterTMCloneTable
+ Type: R_X86_64_GLOB_DAT
+ - Offset: 0x3FE8
+ Symbol: __gmon_start__
+ Type: R_X86_64_GLOB_DAT
+ - Offset: 0x3FF0
+ Symbol: _ITM_registerTMCloneTable
+ Type: R_X86_64_GLOB_DAT
+ - Offset: 0x3FF8
+ Symbol: __cxa_finalize
+ Type: R_X86_64_GLOB_DAT
+ - Name: .rela.plt
+ Type: SHT_RELA
+ Flags: [ SHF_ALLOC, SHF_INFO_LINK ]
+ Address: 0x510
+ Link: .dynsym
+ AddressAlign: 0x8
+ Info: .got.plt
+ Relocations:
+ - Offset: 0x4018
+ Symbol: puts
+ Type: R_X86_64_JUMP_SLOT
+ - Name: .init
+ Type: SHT_PROGBITS
+ Flags: [ SHF_ALLOC, SHF_EXECINSTR ]
+ Address: 0x1000
+ AddressAlign: 0x4
+ Offset: 0x1000
+ Content: F30F1EFA4883EC08488B05D92F00004885C07402FFD04883C408C3
+ - Name: .plt
+ Type: SHT_PROGBITS
+ Flags: [ SHF_ALLOC, SHF_EXECINSTR ]
+ Address: 0x1020
+ AddressAlign: 0x10
+ EntSize: 0x10
+ Content: FF35E22F0000F2FF25E32F00000F1F00F30F1EFA6800000000F2E9E1FFFFFF90
+ - Name: .plt.got
+ Type: SHT_PROGBITS
+ Flags: [ SHF_ALLOC, SHF_EXECINSTR ]
+ Address: 0x1040
+ AddressAlign: 0x10
+ EntSize: 0x10
+ Content: F30F1EFAF2FF25AD2F00000F1F440000
+ - Name: .plt.sec
+ Type: SHT_PROGBITS
+ Flags: [ SHF_ALLOC, SHF_EXECINSTR ]
+ Address: 0x1050
+ AddressAlign: 0x10
+ EntSize: 0x10
+ Content: F30F1EFAF2FF25BD2F00000F1F440000
+ - Name: .text
+ Type: SHT_PROGBITS
+ Flags: [ SHF_ALLOC, SHF_EXECINSTR ]
+ Address: 0x1060
+ AddressAlign: 0x10
+ Content: 488D3DC12F0000488D05BA2F00004839F87415488B05662F00004885C07409FFE00F1F8000000000C30F1F8000000000488D3D912F0000488D358A2F00004829FE4889F048C1EE3F48C1F8034801C648D1FE7414488B05352F00004885C07408FFE0660F1F440000C30F1F8000000000F30F1EFA803D4D2F000000752B5548833D122F0000004889E5740C488B3D2E2F0000E849FFFFFFE864FFFFFFC605252F0000015DC30F1F00C30F1F8000000000F30F1EFAE977FFFFFFF30F1EFA554889E5488D05D80E00004889C7E820FFFFFF905DC3
+ - Name: .fini
+ Type: SHT_PROGBITS
+ Flags: [ SHF_ALLOC, SHF_EXECINSTR ]
+ Address: 0x1134
+ AddressAlign: 0x4
+ Content: F30F1EFA4883EC084883C408C3
+ - Name: .rodata
+ Type: SHT_PROGBITS
+ Flags: [ SHF_ALLOC ]
+ Address: 0x2000
+ AddressAlign: 0x1
+ Offset: 0x2000
+ Content: 48656C6C6F2066726F6D205A00
+ - Name: .eh_frame_hdr
+ Type: SHT_PROGBITS
+ Flags: [ SHF_ALLOC ]
+ Address: 0x2010
+ AddressAlign: 0x4
+ Content: 011B033B2C0000000400000010F0FFFF4800000030F0FFFF7000000040F0FFFF8800000009F1FFFFA0000000
+ - Name: .eh_frame
+ Type: SHT_PROGBITS
+ Flags: [ SHF_ALLOC ]
+ Address: 0x2040
+ AddressAlign: 0x8
+ Content: 1400000000000000017A5200017810011B0C070890010000240000001C000000C0EFFFFF20000000000E10460E184A0F0B770880003F1A3A2A332422000000001400000044000000B8EFFFFF100000000000000000000000140000005C000000B0EFFFFF1000000000000000000000001C0000007400000061F0FFFF1A00000000450E108602430D06510C070800000000000000
+ - Name: .init_array
+ Type: SHT_INIT_ARRAY
+ Flags: [ SHF_WRITE, SHF_ALLOC ]
+ Address: 0x3E10
+ AddressAlign: 0x8
+ EntSize: 0x8
+ Offset: 0x2E10
+ Content: '1011000000000000'
+ - Name: .fini_array
+ Type: SHT_FINI_ARRAY
+ Flags: [ SHF_WRITE, SHF_ALLOC ]
+ Address: 0x3E18
+ AddressAlign: 0x8
+ EntSize: 0x8
+ Content: D010000000000000
+ - Name: .dynamic
+ Type: SHT_DYNAMIC
+ Flags: [ SHF_WRITE, SHF_ALLOC ]
+ Address: 0x3E20
+ Link: .dynstr
+ AddressAlign: 0x8
+ Entries:
+ - Tag: DT_NEEDED
+ Value: 0x5F
+ - Tag: DT_INIT
+ Value: 0x1000
+ - Tag: DT_FINI
+ Value: 0x1134
+ - Tag: DT_INIT_ARRAY
+ Value: 0x3E10
+ - Tag: DT_INIT_ARRAYSZ
+ Value: 0x8
+ - Tag: DT_FINI_ARRAY
+ Value: 0x3E18
+ - Tag: DT_FINI_ARRAYSZ
+ Value: 0x8
+ - Tag: DT_GNU_HASH
+ Value: 0x2F0
+ - Tag: DT_STRTAB
+ Value: 0x3C0
+ - Tag: DT_SYMTAB
+ Value: 0x318
+ - Tag: DT_STRSZ
+ Value: 0x75
+ - Tag: DT_SYMENT
+ Value: 0x18
+ - Tag: DT_PLTGOT
+ Value: 0x4000
+ - Tag: DT_PLTRELSZ
+ Value: 0x18
+ - Tag: DT_PLTREL
+ Value: 0x7
+ - Tag: DT_JMPREL
+ Value: 0x510
+ - Tag: DT_RELA
+ Value: 0x468
+ - Tag: DT_RELASZ
+ Value: 0xA8
+ - Tag: DT_RELAENT
+ Value: 0x18
+ - Tag: DT_VERNEED
+ Value: 0x448
+ - Tag: DT_VERNEEDNUM
+ Value: 0x1
+ - Tag: DT_VERSYM
+ Value: 0x436
+ - Tag: DT_RELACOUNT
+ Value: 0x3
+ - Tag: DT_NULL
+ Value: 0x0
+ - Tag: DT_NULL
+ Value: 0x0
+ - Tag: DT_NULL
+ Value: 0x0
+ - Tag: DT_NULL
+ Value: 0x0
+ - Tag: DT_NULL
+ Value: 0x0
+ - Name: .got
+ Type: SHT_PROGBITS
+ Flags: [ SHF_WRITE, SHF_ALLOC ]
+ Address: 0x3FE0
+ AddressAlign: 0x8
+ EntSize: 0x8
+ Content: '0000000000000000000000000000000000000000000000000000000000000000'
+ - Name: .got.plt
+ Type: SHT_PROGBITS
+ Flags: [ SHF_WRITE, SHF_ALLOC ]
+ Address: 0x4000
+ AddressAlign: 0x8
+ EntSize: 0x8
+ Content: '203E000000000000000000000000000000000000000000003010000000000000'
+ - Name: .data
+ Type: SHT_PROGBITS
+ Flags: [ SHF_WRITE, SHF_ALLOC ]
+ Address: 0x4020
+ AddressAlign: 0x8
+ Content: '2040000000000000'
+ - Name: .bss
+ Type: SHT_NOBITS
+ Flags: [ SHF_WRITE, SHF_ALLOC ]
+ Address: 0x4028
+ AddressAlign: 0x1
+ Size: 0x8
+ - Name: .comment
+ Type: SHT_PROGBITS
+ Flags: [ SHF_MERGE, SHF_STRINGS ]
+ AddressAlign: 0x1
+ EntSize: 0x1
+ Content: 4743433A20285562756E74752031312E342E302D317562756E7475317E32322E30342E32292031312E342E3000
+Symbols:
+ - Name: crtstuff.c
+ Type: STT_FILE
+ Index: SHN_ABS
+ - Name: deregister_tm_clones
+ Type: STT_FUNC
+ Section: .text
+ Value: 0x1060
+ - Name: register_tm_clones
+ Type: STT_FUNC
+ Section: .text
+ Value: 0x1090
+ - Name: __do_global_dtors_aux
+ Type: STT_FUNC
+ Section: .text
+ Value: 0x10D0
+ - Name: completed.0
+ Type: STT_OBJECT
+ Section: .bss
+ Value: 0x4028
+ Size: 0x1
+ - Name: __do_global_dtors_aux_fini_array_entry
+ Type: STT_OBJECT
+ Section: .fini_array
+ Value: 0x3E18
+ - Name: frame_dummy
+ Type: STT_FUNC
+ Section: .text
+ Value: 0x1110
+ - Name: __frame_dummy_init_array_entry
+ Type: STT_OBJECT
+ Section: .init_array
+ Value: 0x3E10
+ - Name: libZ.c
+ Type: STT_FILE
+ Index: SHN_ABS
+ - Name: 'crtstuff.c (1)'
+ Type: STT_FILE
+ Index: SHN_ABS
+ - Name: __FRAME_END__
+ Type: STT_OBJECT
+ Section: .eh_frame
+ Value: 0x20D0
+ - Type: STT_FILE
+ Index: SHN_ABS
+ - Name: _fini
+ Type: STT_FUNC
+ Section: .fini
+ Value: 0x1134
+ - Name: __dso_handle
+ Type: STT_OBJECT
+ Section: .data
+ Value: 0x4020
+ - Name: _DYNAMIC
+ Type: STT_OBJECT
+ Section: .dynamic
+ Value: 0x3E20
+ - Name: __GNU_EH_FRAME_HDR
+ Section: .eh_frame_hdr
+ Value: 0x2010
+ - Name: __TMC_END__
+ Type: STT_OBJECT
+ Section: .data
+ Value: 0x4028
+ - Name: _GLOBAL_OFFSET_TABLE_
+ Type: STT_OBJECT
+ Section: .got.plt
+ Value: 0x4000
+ - Name: _init
+ Type: STT_FUNC
+ Section: .init
+ Value: 0x1000
+ - Name: _ITM_deregisterTMCloneTable
+ Binding: STB_WEAK
+ - Name: 'puts@GLIBC_2.2.5'
+ Type: STT_FUNC
+ Binding: STB_GLOBAL
+ - Name: __gmon_start__
+ Binding: STB_WEAK
+ - Name: sayZ
+ Type: STT_FUNC
+ Section: .text
+ Binding: STB_GLOBAL
+ Value: 0x1119
+ Size: 0x1A
+ - Name: _ITM_registerTMCloneTable
+ Binding: STB_WEAK
+ - Name: '__cxa_finalize@GLIBC_2.2.5'
+ Type: STT_FUNC
+ Binding: STB_WEAK
+DynamicSymbols:
+ - Name: _ITM_deregisterTMCloneTable
+ Binding: STB_WEAK
+ - Name: puts
+ Type: STT_FUNC
+ Binding: STB_GLOBAL
+ - Name: __gmon_start__
+ Binding: STB_WEAK
+ - Name: _ITM_registerTMCloneTable
+ Binding: STB_WEAK
+ - Name: __cxa_finalize
+ Type: STT_FUNC
+ Binding: STB_WEAK
+ - Name: sayZ
+ Type: STT_FUNC
+ Section: .text
+ Binding: STB_GLOBAL
+ Value: 0x1119
+ Size: 0x1A
+...
diff --git a/llvm/unittests/ExecutionEngine/Orc/Inputs/Z/Z_macho.yaml b/llvm/unittests/ExecutionEngine/Orc/Inputs/Z/Z_macho.yaml
new file mode 100644
index 0000000..c0c1826
--- /dev/null
+++ b/llvm/unittests/ExecutionEngine/Orc/Inputs/Z/Z_macho.yaml
@@ -0,0 +1,723 @@
+--- !fat-mach-o
+FatHeader:
+ magic: 0xCAFEBABE
+ nfat_arch: 3
+FatArchs:
+ - cputype: 0x1000007
+ cpusubtype: 0x3
+ offset: 0x1000
+ size: 8376
+ align: 12
+ - cputype: 0x100000C
+ cpusubtype: 0x0
+ offset: 0x4000
+ size: 33376
+ align: 14
+ - cputype: 0x100000C
+ cpusubtype: 0x80000002
+ offset: 0x10000
+ size: 33376
+ align: 14
+Slices:
+ - !mach-o
+ FileHeader:
+ magic: 0xFEEDFACF
+ cputype: 0x1000007
+ cpusubtype: 0x3
+ filetype: 0x6
+ ncmds: 14
+ sizeofcmds: 960
+ flags: 0x100085
+ reserved: 0x0
+ LoadCommands:
+ - cmd: LC_SEGMENT_64
+ cmdsize: 392
+ segname: __TEXT
+ vmaddr: 0
+ vmsize: 4096
+ fileoff: 0
+ filesize: 4096
+ maxprot: 5
+ initprot: 5
+ nsects: 4
+ flags: 0
+ Sections:
+ - sectname: __text
+ segname: __TEXT
+ addr: 0xF80
+ size: 20
+ offset: 0xF80
+ align: 4
+ reloff: 0x0
+ nreloc: 0
+ flags: 0x80000400
+ reserved1: 0x0
+ reserved2: 0x0
+ reserved3: 0x0
+ content: 554889E5488D3D0F000000B000E8020000005DC3
+ - sectname: __stubs
+ segname: __TEXT
+ addr: 0xF94
+ size: 6
+ offset: 0xF94
+ align: 1
+ reloff: 0x0
+ nreloc: 0
+ flags: 0x80000408
+ reserved1: 0x0
+ reserved2: 0x6
+ reserved3: 0x0
+ content: FF2566000000
+ - sectname: __cstring
+ segname: __TEXT
+ addr: 0xF9A
+ size: 14
+ offset: 0xF9A
+ align: 0
+ reloff: 0x0
+ nreloc: 0
+ flags: 0x2
+ reserved1: 0x0
+ reserved2: 0x0
+ reserved3: 0x0
+ content: 48656C6C6F2066726F6D205A0A00
+ - sectname: __unwind_info
+ segname: __TEXT
+ addr: 0xFA8
+ size: 88
+ offset: 0xFA8
+ align: 2
+ reloff: 0x0
+ nreloc: 0
+ flags: 0x0
+ reserved1: 0x0
+ reserved2: 0x0
+ reserved3: 0x0
+ content: 010000001C000000000000001C000000000000001C00000002000000800F00004000000040000000940F00000000000040000000000000000000000000000000030000000C00010010000100000000000000000100000000
+ - cmd: LC_SEGMENT_64
+ cmdsize: 152
+ segname: __DATA_CONST
+ vmaddr: 4096
+ vmsize: 4096
+ fileoff: 4096
+ filesize: 4096
+ maxprot: 3
+ initprot: 3
+ nsects: 1
+ flags: 16
+ Sections:
+ - sectname: __got
+ segname: __DATA_CONST
+ addr: 0x1000
+ size: 8
+ offset: 0x1000
+ align: 3
+ reloff: 0x0
+ nreloc: 0
+ flags: 0x6
+ reserved1: 0x1
+ reserved2: 0x0
+ reserved3: 0x0
+ content: '0000000000000080'
+ - cmd: LC_SEGMENT_64
+ cmdsize: 72
+ segname: __LINKEDIT
+ vmaddr: 8192
+ vmsize: 4096
+ fileoff: 8192
+ filesize: 184
+ maxprot: 1
+ initprot: 1
+ nsects: 0
+ flags: 0
+ - cmd: LC_ID_DYLIB
+ cmdsize: 48
+ dylib:
+ name: 24
+ timestamp: 1
+ current_version: 0
+ compatibility_version: 0
+ Content: '@rpath/libZ.dylib'
+ ZeroPadBytes: 7
+ - cmd: LC_DYLD_CHAINED_FIXUPS
+ cmdsize: 16
+ dataoff: 8192
+ datasize: 96
+ - cmd: LC_DYLD_EXPORTS_TRIE
+ cmdsize: 16
+ dataoff: 8288
+ datasize: 24
+ - cmd: LC_SYMTAB
+ cmdsize: 24
+ symoff: 8320
+ nsyms: 2
+ stroff: 8360
+ strsize: 16
+ - cmd: LC_DYSYMTAB
+ cmdsize: 80
+ ilocalsym: 0
+ nlocalsym: 0
+ iextdefsym: 0
+ nextdefsym: 1
+ iundefsym: 1
+ nundefsym: 1
+ tocoff: 0
+ ntoc: 0
+ modtaboff: 0
+ nmodtab: 0
+ extrefsymoff: 0
+ nextrefsyms: 0
+ indirectsymoff: 8352
+ nindirectsyms: 2
+ extreloff: 0
+ nextrel: 0
+ locreloff: 0
+ nlocrel: 0
+ - cmd: LC_UUID
+ cmdsize: 24
+ uuid: 399E203C-FF9A-3B80-872C-85F3A759A78B
+ - cmd: LC_BUILD_VERSION
+ cmdsize: 32
+ platform: 1
+ minos: 983040
+ sdk: 983552
+ ntools: 1
+ Tools:
+ - tool: 3
+ version: 73074435
+ - cmd: LC_SOURCE_VERSION
+ cmdsize: 16
+ version: 0
+ - cmd: LC_LOAD_DYLIB
+ cmdsize: 56
+ dylib:
+ name: 24
+ timestamp: 2
+ current_version: 88539136
+ compatibility_version: 65536
+ Content: '/usr/lib/libSystem.B.dylib'
+ ZeroPadBytes: 6
+ - cmd: LC_FUNCTION_STARTS
+ cmdsize: 16
+ dataoff: 8312
+ datasize: 8
+ - cmd: LC_DATA_IN_CODE
+ cmdsize: 16
+ dataoff: 8320
+ datasize: 0
+ LinkEditData:
+ ExportTrie:
+ TerminalSize: 0
+ NodeOffset: 0
+ Name: ''
+ Flags: 0x0
+ Address: 0x0
+ Other: 0x0
+ ImportName: ''
+ Children:
+ - TerminalSize: 3
+ NodeOffset: 13
+ Name: _sayZ
+ Flags: 0x0
+ Address: 0xF80
+ Other: 0x0
+ ImportName: ''
+ NameList:
+ - n_strx: 2
+ n_type: 0xF
+ n_sect: 1
+ n_desc: 0
+ n_value: 3968
+ - n_strx: 8
+ n_type: 0x1
+ n_sect: 0
+ n_desc: 256
+ n_value: 0
+ StringTable:
+ - ' '
+ - _sayZ
+ - _printf
+ IndirectSymbols: [ 0x1, 0x1 ]
+ FunctionStarts: [ 0xF80 ]
+ ChainedFixups: [ 0x0, 0x0, 0x0, 0x0, 0x20, 0x0, 0x0, 0x0, 0x48,
+ 0x0, 0x0, 0x0, 0x50, 0x0, 0x0, 0x0, 0x1, 0x0,
+ 0x0, 0x0, 0x1, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0,
+ 0x0, 0x0, 0x0, 0x0, 0x3, 0x0, 0x0, 0x0, 0x0, 0x0,
+ 0x0, 0x0, 0x10, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0,
+ 0x0, 0x18, 0x0, 0x0, 0x0, 0x0, 0x10, 0x6, 0x0,
+ 0x0, 0x10, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0,
+ 0x0, 0x0, 0x0, 0x1, 0x0, 0x0, 0x0, 0x1, 0x2, 0x0,
+ 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x5F, 0x70, 0x72,
+ 0x69, 0x6E, 0x74, 0x66, 0x0, 0x0, 0x0, 0x0, 0x0,
+ 0x0, 0x0, 0x0 ]
+ - !mach-o
+ FileHeader:
+ magic: 0xFEEDFACF
+ cputype: 0x100000C
+ cpusubtype: 0x0
+ filetype: 0x6
+ ncmds: 15
+ sizeofcmds: 976
+ flags: 0x100085
+ reserved: 0x0
+ LoadCommands:
+ - cmd: LC_SEGMENT_64
+ cmdsize: 392
+ segname: __TEXT
+ vmaddr: 0
+ vmsize: 16384
+ fileoff: 0
+ filesize: 16384
+ maxprot: 5
+ initprot: 5
+ nsects: 4
+ flags: 0
+ Sections:
+ - sectname: __text
+ segname: __TEXT
+ addr: 0x3F70
+ size: 28
+ offset: 0x3F70
+ align: 2
+ reloff: 0x0
+ nreloc: 0
+ flags: 0x80000400
+ reserved1: 0x0
+ reserved2: 0x0
+ reserved3: 0x0
+ content: FD7BBFA9FD0300910000009000603E9103000094FD7BC1A8C0035FD6
+ - sectname: __stubs
+ segname: __TEXT
+ addr: 0x3F8C
+ size: 12
+ offset: 0x3F8C
+ align: 2
+ reloff: 0x0
+ nreloc: 0
+ flags: 0x80000408
+ reserved1: 0x0
+ reserved2: 0xC
+ reserved3: 0x0
+ content: 100000B0100240F900021FD6
+ - sectname: __cstring
+ segname: __TEXT
+ addr: 0x3F98
+ size: 14
+ offset: 0x3F98
+ align: 0
+ reloff: 0x0
+ nreloc: 0
+ flags: 0x2
+ reserved1: 0x0
+ reserved2: 0x0
+ reserved3: 0x0
+ content: 48656C6C6F2066726F6D205A0A00
+ - sectname: __unwind_info
+ segname: __TEXT
+ addr: 0x3FA8
+ size: 88
+ offset: 0x3FA8
+ align: 2
+ reloff: 0x0
+ nreloc: 0
+ flags: 0x0
+ reserved1: 0x0
+ reserved2: 0x0
+ reserved3: 0x0
+ content: 010000001C000000000000001C000000000000001C00000002000000703F000040000000400000008C3F00000000000040000000000000000000000000000000030000000C00010010000100000000000000000400000000
+ - cmd: LC_SEGMENT_64
+ cmdsize: 152
+ segname: __DATA_CONST
+ vmaddr: 16384
+ vmsize: 16384
+ fileoff: 16384
+ filesize: 16384
+ maxprot: 3
+ initprot: 3
+ nsects: 1
+ flags: 16
+ Sections:
+ - sectname: __got
+ segname: __DATA_CONST
+ addr: 0x4000
+ size: 8
+ offset: 0x4000
+ align: 3
+ reloff: 0x0
+ nreloc: 0
+ flags: 0x6
+ reserved1: 0x1
+ reserved2: 0x0
+ reserved3: 0x0
+ content: '0000000000000080'
+ - cmd: LC_SEGMENT_64
+ cmdsize: 72
+ segname: __LINKEDIT
+ vmaddr: 32768
+ vmsize: 16384
+ fileoff: 32768
+ filesize: 608
+ maxprot: 1
+ initprot: 1
+ nsects: 0
+ flags: 0
+ - cmd: LC_ID_DYLIB
+ cmdsize: 48
+ dylib:
+ name: 24
+ timestamp: 1
+ current_version: 0
+ compatibility_version: 0
+ Content: '@rpath/libZ.dylib'
+ ZeroPadBytes: 7
+ - cmd: LC_DYLD_CHAINED_FIXUPS
+ cmdsize: 16
+ dataoff: 32768
+ datasize: 96
+ - cmd: LC_DYLD_EXPORTS_TRIE
+ cmdsize: 16
+ dataoff: 32864
+ datasize: 24
+ - cmd: LC_SYMTAB
+ cmdsize: 24
+ symoff: 32896
+ nsyms: 2
+ stroff: 32936
+ strsize: 16
+ - cmd: LC_DYSYMTAB
+ cmdsize: 80
+ ilocalsym: 0
+ nlocalsym: 0
+ iextdefsym: 0
+ nextdefsym: 1
+ iundefsym: 1
+ nundefsym: 1
+ tocoff: 0
+ ntoc: 0
+ modtaboff: 0
+ nmodtab: 0
+ extrefsymoff: 0
+ nextrefsyms: 0
+ indirectsymoff: 32928
+ nindirectsyms: 2
+ extreloff: 0
+ nextrel: 0
+ locreloff: 0
+ nlocrel: 0
+ - cmd: LC_UUID
+ cmdsize: 24
+ uuid: 6E8E78AF-EDB2-3830-BE1E-013390302CC5
+ - cmd: LC_BUILD_VERSION
+ cmdsize: 32
+ platform: 1
+ minos: 983040
+ sdk: 983552
+ ntools: 1
+ Tools:
+ - tool: 3
+ version: 73074435
+ - cmd: LC_SOURCE_VERSION
+ cmdsize: 16
+ version: 0
+ - cmd: LC_LOAD_DYLIB
+ cmdsize: 56
+ dylib:
+ name: 24
+ timestamp: 2
+ current_version: 88539136
+ compatibility_version: 65536
+ Content: '/usr/lib/libSystem.B.dylib'
+ ZeroPadBytes: 6
+ - cmd: LC_FUNCTION_STARTS
+ cmdsize: 16
+ dataoff: 32888
+ datasize: 8
+ - cmd: LC_DATA_IN_CODE
+ cmdsize: 16
+ dataoff: 32896
+ datasize: 0
+ - cmd: LC_CODE_SIGNATURE
+ cmdsize: 16
+ dataoff: 32960
+ datasize: 416
+ LinkEditData:
+ ExportTrie:
+ TerminalSize: 0
+ NodeOffset: 0
+ Name: ''
+ Flags: 0x0
+ Address: 0x0
+ Other: 0x0
+ ImportName: ''
+ Children:
+ - TerminalSize: 3
+ NodeOffset: 13
+ Name: _sayZ
+ Flags: 0x0
+ Address: 0x3F70
+ Other: 0x0
+ ImportName: ''
+ NameList:
+ - n_strx: 2
+ n_type: 0xF
+ n_sect: 1
+ n_desc: 0
+ n_value: 16240
+ - n_strx: 8
+ n_type: 0x1
+ n_sect: 0
+ n_desc: 256
+ n_value: 0
+ StringTable:
+ - ' '
+ - _sayZ
+ - _printf
+ IndirectSymbols: [ 0x1, 0x1 ]
+ FunctionStarts: [ 0x3F70 ]
+ ChainedFixups: [ 0x0, 0x0, 0x0, 0x0, 0x20, 0x0, 0x0, 0x0, 0x48,
+ 0x0, 0x0, 0x0, 0x50, 0x0, 0x0, 0x0, 0x1, 0x0,
+ 0x0, 0x0, 0x1, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0,
+ 0x0, 0x0, 0x0, 0x0, 0x3, 0x0, 0x0, 0x0, 0x0, 0x0,
+ 0x0, 0x0, 0x10, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0,
+ 0x0, 0x18, 0x0, 0x0, 0x0, 0x0, 0x40, 0x6, 0x0,
+ 0x0, 0x40, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0,
+ 0x0, 0x0, 0x0, 0x1, 0x0, 0x0, 0x0, 0x1, 0x2, 0x0,
+ 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x5F, 0x70, 0x72,
+ 0x69, 0x6E, 0x74, 0x66, 0x0, 0x0, 0x0, 0x0, 0x0,
+ 0x0, 0x0, 0x0 ]
+ - !mach-o
+ FileHeader:
+ magic: 0xFEEDFACF
+ cputype: 0x100000C
+ cpusubtype: 0x80000002
+ filetype: 0x6
+ ncmds: 15
+ sizeofcmds: 976
+ flags: 0x100085
+ reserved: 0x0
+ LoadCommands:
+ - cmd: LC_SEGMENT_64
+ cmdsize: 392
+ segname: __TEXT
+ vmaddr: 0
+ vmsize: 16384
+ fileoff: 0
+ filesize: 16384
+ maxprot: 5
+ initprot: 5
+ nsects: 4
+ flags: 0
+ Sections:
+ - sectname: __text
+ segname: __TEXT
+ addr: 0x3F68
+ size: 32
+ offset: 0x3F68
+ align: 2
+ reloff: 0x0
+ nreloc: 0
+ flags: 0x80000400
+ reserved1: 0x0
+ reserved2: 0x0
+ reserved3: 0x0
+ content: 7F2303D5FD7BBFA9FD0300910000009000603E9103000094FD7BC1A8FF0F5FD6
+ - sectname: __auth_stubs
+ segname: __TEXT
+ addr: 0x3F88
+ size: 16
+ offset: 0x3F88
+ align: 2
+ reloff: 0x0
+ nreloc: 0
+ flags: 0x80000408
+ reserved1: 0x0
+ reserved2: 0x10
+ reserved3: 0x0
+ content: 110000B031020091300240F9110A1FD7
+ - sectname: __cstring
+ segname: __TEXT
+ addr: 0x3F98
+ size: 14
+ offset: 0x3F98
+ align: 0
+ reloff: 0x0
+ nreloc: 0
+ flags: 0x2
+ reserved1: 0x0
+ reserved2: 0x0
+ reserved3: 0x0
+ content: 48656C6C6F2066726F6D205A0A00
+ - sectname: __unwind_info
+ segname: __TEXT
+ addr: 0x3FA8
+ size: 88
+ offset: 0x3FA8
+ align: 2
+ reloff: 0x0
+ nreloc: 0
+ flags: 0x0
+ reserved1: 0x0
+ reserved2: 0x0
+ reserved3: 0x0
+ content: 010000001C000000000000001C000000000000001C00000002000000683F00004000000040000000883F00000000000040000000000000000000000000000000030000000C00010010000100000000000000000400000000
+ - cmd: LC_SEGMENT_64
+ cmdsize: 152
+ segname: __DATA_CONST
+ vmaddr: 16384
+ vmsize: 16384
+ fileoff: 16384
+ filesize: 16384
+ maxprot: 3
+ initprot: 3
+ nsects: 1
+ flags: 16
+ Sections:
+ - sectname: __auth_got
+ segname: __DATA_CONST
+ addr: 0x4000
+ size: 8
+ offset: 0x4000
+ align: 3
+ reloff: 0x0
+ nreloc: 0
+ flags: 0x6
+ reserved1: 0x1
+ reserved2: 0x0
+ reserved3: 0x0
+ content: 00000000000001C0
+ - cmd: LC_SEGMENT_64
+ cmdsize: 72
+ segname: __LINKEDIT
+ vmaddr: 32768
+ vmsize: 16384
+ fileoff: 32768
+ filesize: 608
+ maxprot: 1
+ initprot: 1
+ nsects: 0
+ flags: 0
+ - cmd: LC_ID_DYLIB
+ cmdsize: 48
+ dylib:
+ name: 24
+ timestamp: 1
+ current_version: 0
+ compatibility_version: 0
+ Content: '@rpath/libZ.dylib'
+ ZeroPadBytes: 7
+ - cmd: LC_DYLD_CHAINED_FIXUPS
+ cmdsize: 16
+ dataoff: 32768
+ datasize: 96
+ - cmd: LC_DYLD_EXPORTS_TRIE
+ cmdsize: 16
+ dataoff: 32864
+ datasize: 24
+ - cmd: LC_SYMTAB
+ cmdsize: 24
+ symoff: 32896
+ nsyms: 2
+ stroff: 32936
+ strsize: 16
+ - cmd: LC_DYSYMTAB
+ cmdsize: 80
+ ilocalsym: 0
+ nlocalsym: 0
+ iextdefsym: 0
+ nextdefsym: 1
+ iundefsym: 1
+ nundefsym: 1
+ tocoff: 0
+ ntoc: 0
+ modtaboff: 0
+ nmodtab: 0
+ extrefsymoff: 0
+ nextrefsyms: 0
+ indirectsymoff: 32928
+ nindirectsyms: 2
+ extreloff: 0
+ nextrel: 0
+ locreloff: 0
+ nlocrel: 0
+ - cmd: LC_UUID
+ cmdsize: 24
+ uuid: E74F368D-238F-31FA-BF40-FA2964FED986
+ - cmd: LC_BUILD_VERSION
+ cmdsize: 32
+ platform: 1
+ minos: 983040
+ sdk: 983552
+ ntools: 1
+ Tools:
+ - tool: 3
+ version: 73074435
+ - cmd: LC_SOURCE_VERSION
+ cmdsize: 16
+ version: 0
+ - cmd: LC_LOAD_DYLIB
+ cmdsize: 56
+ dylib:
+ name: 24
+ timestamp: 2
+ current_version: 88539136
+ compatibility_version: 65536
+ Content: '/usr/lib/libSystem.B.dylib'
+ ZeroPadBytes: 6
+ - cmd: LC_FUNCTION_STARTS
+ cmdsize: 16
+ dataoff: 32888
+ datasize: 8
+ - cmd: LC_DATA_IN_CODE
+ cmdsize: 16
+ dataoff: 32896
+ datasize: 0
+ - cmd: LC_CODE_SIGNATURE
+ cmdsize: 16
+ dataoff: 32960
+ datasize: 416
+ LinkEditData:
+ ExportTrie:
+ TerminalSize: 0
+ NodeOffset: 0
+ Name: ''
+ Flags: 0x0
+ Address: 0x0
+ Other: 0x0
+ ImportName: ''
+ Children:
+ - TerminalSize: 3
+ NodeOffset: 13
+ Name: _sayZ
+ Flags: 0x0
+ Address: 0x3F68
+ Other: 0x0
+ ImportName: ''
+ NameList:
+ - n_strx: 2
+ n_type: 0xF
+ n_sect: 1
+ n_desc: 0
+ n_value: 16232
+ - n_strx: 8
+ n_type: 0x1
+ n_sect: 0
+ n_desc: 256
+ n_value: 0
+ StringTable:
+ - ' '
+ - _sayZ
+ - _printf
+ IndirectSymbols: [ 0x1, 0x1 ]
+ FunctionStarts: [ 0x3F68 ]
+ ChainedFixups: [ 0x0, 0x0, 0x0, 0x0, 0x20, 0x0, 0x0, 0x0, 0x48,
+ 0x0, 0x0, 0x0, 0x50, 0x0, 0x0, 0x0, 0x1, 0x0,
+ 0x0, 0x0, 0x1, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0,
+ 0x0, 0x0, 0x0, 0x0, 0x3, 0x0, 0x0, 0x0, 0x0, 0x0,
+ 0x0, 0x0, 0x10, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0,
+ 0x0, 0x18, 0x0, 0x0, 0x0, 0x0, 0x40, 0xC, 0x0,
+ 0x0, 0x40, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0,
+ 0x0, 0x0, 0x0, 0x1, 0x0, 0x0, 0x0, 0x1, 0x2, 0x0,
+ 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x5F, 0x70, 0x72,
+ 0x69, 0x6E, 0x74, 0x66, 0x0, 0x0, 0x0, 0x0, 0x0,
+ 0x0, 0x0, 0x0 ]
+...
diff --git a/llvm/unittests/ExecutionEngine/Orc/LibraryResolverTest.cpp b/llvm/unittests/ExecutionEngine/Orc/LibraryResolverTest.cpp
new file mode 100644
index 0000000..2a396da
--- /dev/null
+++ b/llvm/unittests/ExecutionEngine/Orc/LibraryResolverTest.cpp
@@ -0,0 +1,764 @@
+//===- LibraryResolverTest.cpp - Unit tests for LibraryResolver -===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/ExecutionEngine/Orc/TargetProcess/LibraryResolver.h"
+#include "llvm/ExecutionEngine/Orc/JITTargetMachineBuilder.h"
+#include "llvm/ExecutionEngine/Orc/TargetProcess/LibraryScanner.h"
+#include "llvm/ObjectYAML/MachOYAML.h"
+#include "llvm/ObjectYAML/yaml2obj.h"
+#include "llvm/Support/FileSystem.h"
+#include "llvm/Support/MemoryBuffer.h"
+#include "llvm/Support/Path.h"
+#include "llvm/Support/YAMLParser.h"
+#include "llvm/Support/YAMLTraits.h"
+#include "llvm/Support/raw_ostream.h"
+
+#include "llvm/Testing/Support/SupportHelpers.h"
+
+#include "gtest/gtest.h"
+
+#include <algorithm>
+#include <optional>
+#include <string>
+#include <vector>
+
+using namespace llvm;
+using namespace llvm::orc;
+
+// Disabled due to test setup issue — YAML to shared library creation seems
+// invalid on some build bots. (PR #165360) Not related to code logic.
+#if 0
+// TODO: Add COFF (Windows) support for these tests.
+// this facility also works correctly on Windows (COFF),
+// so we should eventually enable and run these tests for that platform as well.
+namespace {
+
+#if defined(__APPLE__)
+constexpr const char *ext = ".dylib";
+#elif defined(_WIN32)
+constexpr const char *ext = ".dll";
+#else
+constexpr const char *ext = ".so";
+#endif
+
+bool EnvReady = false;
+
+Triple getTargetTriple() {
+ auto JTMB = JITTargetMachineBuilder::detectHost();
+ if (!JTMB) {
+ consumeError(JTMB.takeError());
+ return Triple();
+ }
+ return JTMB->getTargetTriple();
+}
+
+static bool CheckHostSupport() {
+ auto Triple = getTargetTriple();
+ // TODO: Extend support to COFF (Windows) once test setup and YAML conversion
+ // are verified.
+ if (!Triple.isOSBinFormatMachO() &&
+ !(Triple.isOSBinFormatELF() && Triple.getArch() == Triple::x86_64))
+ return false;
+
+ return true;
+}
+
+std::string getYamlFilePlatformExt() {
+ auto Triple = getTargetTriple();
+ if (Triple.isOSBinFormatMachO())
+ return "_macho";
+ else if (Triple.isOSBinFormatELF())
+ return "_linux";
+
+ return "";
+}
+
+unsigned getYamlDocNum() {
+ // auto Triple = getTargetTriple();
+ // if (Triple.isOSBinFormatELF())
+ // return 1;
+
+ return 1;
+}
+
+class LibraryTestEnvironment : public ::testing::Environment {
+ std::vector<std::string> CreatedDylibsDir;
+ std::vector<std::string> CreatedDylibs;
+ SmallVector<char, 128> DirPath;
+
+public:
+ void SetUp() override {
+ if (!CheckHostSupport()) {
+ EnvReady = false;
+ return;
+ }
+
+ StringRef ThisFile = __FILE__;
+ SmallVector<char, 128> InputDirPath(ThisFile.begin(), ThisFile.end());
+ sys::path::remove_filename(InputDirPath);
+ sys::path::append(InputDirPath, "Inputs");
+ if (!sys::fs::exists(InputDirPath))
+ return;
+
+ SmallString<128> UniqueDir;
+ sys::path::append(UniqueDir, InputDirPath);
+ std::error_code EC = sys::fs::createUniqueDirectory(UniqueDir, DirPath);
+
+ if (EC)
+ return;
+
+ // given yamlPath + DylibPath, validate + convert
+ auto processYamlToDylib = [&](const SmallVector<char, 128> &YamlPath,
+ const SmallVector<char, 128> &DylibPath,
+ unsigned DocNum) -> bool {
+ if (!sys::fs::exists(YamlPath)) {
+ errs() << "YAML file missing: "
+ << StringRef(YamlPath.data(), YamlPath.size()) << "\n";
+ EnvReady = false;
+ return false;
+ }
+
+ auto BufOrErr = MemoryBuffer::getFile(YamlPath);
+ if (!BufOrErr) {
+ errs() << "Failed to read "
+ << StringRef(YamlPath.data(), YamlPath.size()) << ": "
+ << BufOrErr.getError().message() << "\n";
+ EnvReady = false;
+ return false;
+ }
+
+ yaml::Input yin(BufOrErr->get()->getBuffer());
+ std::error_code EC;
+ raw_fd_ostream outFile(StringRef(DylibPath.data(), DylibPath.size()), EC,
+ sys::fs::OF_None);
+
+ if (EC) {
+ errs() << "Failed to open "
+ << StringRef(DylibPath.data(), DylibPath.size())
+ << " for writing: " << EC.message() << "\n";
+ EnvReady = false;
+ return false;
+ }
+
+ if (!yaml::convertYAML(
+ yin, outFile,
+ [](const Twine &M) {
+ // Handle or ignore errors here
+ errs() << "Yaml Error :" << M << "\n";
+ },
+ DocNum)) {
+ errs() << "Failed to convert "
+ << StringRef(YamlPath.data(), YamlPath.size()) << " to "
+ << StringRef(DylibPath.data(), DylibPath.size()) << "\n";
+ EnvReady = false;
+ return false;
+ }
+
+ CreatedDylibsDir.push_back(std::string(sys::path::parent_path(
+ StringRef(DylibPath.data(), DylibPath.size()))));
+ CreatedDylibs.push_back(std::string(DylibPath.begin(), DylibPath.end()));
+ return true;
+ };
+
+ std::vector<const char *> LibDirs = {"Z", "A", "B", "C"};
+
+ unsigned DocNum = getYamlDocNum();
+ std::string YamlPltExt = getYamlFilePlatformExt();
+ for (const auto &LibdirName : LibDirs) {
+ // YAML path
+ SmallVector<char, 128> YamlPath(InputDirPath.begin(), InputDirPath.end());
+ SmallVector<char, 128> YamlFileName;
+ YamlFileName.append(LibdirName, LibdirName + strlen(LibdirName));
+ YamlFileName.append(YamlPltExt.begin(), YamlPltExt.end());
+ sys::path::append(YamlPath, LibdirName, YamlFileName);
+ sys::path::replace_extension(YamlPath, ".yaml");
+
+ // dylib path
+ SmallVector<char, 128> DylibPath(DirPath.begin(), DirPath.end());
+ SmallVector<char, 128> DylibFileName;
+ StringRef prefix("lib");
+ DylibFileName.append(prefix.begin(), prefix.end());
+ DylibFileName.append(LibdirName, LibdirName + strlen(LibdirName));
+
+ sys::path::append(DylibPath, LibdirName);
+ if (!sys::fs::exists(DylibPath)) {
+ auto EC = sys::fs::create_directory(DylibPath);
+ if (EC)
+ return;
+ }
+ sys::path::append(DylibPath, DylibFileName);
+ sys::path::replace_extension(DylibPath, ext);
+ if (!processYamlToDylib(YamlPath, DylibPath, DocNum))
+ return;
+ }
+
+ EnvReady = true;
+ }
+
+ void TearDown() override { sys::fs::remove_directories(DirPath); }
+
+ std::string getBaseDir() const {
+ return std::string(DirPath.begin(), DirPath.end());
+ }
+
+ std::vector<std::string> getDylibPaths() const { return CreatedDylibs; }
+};
+
+static LibraryTestEnvironment *GlobalEnv =
+ static_cast<LibraryTestEnvironment *>(
+ ::testing::AddGlobalTestEnvironment(new LibraryTestEnvironment()));
+
+inline std::string libPath(const std::string &BaseDir,
+ const std::string &name) {
+#if defined(__APPLE__)
+ return BaseDir + "/" + name + ".dylib";
+#elif defined(_WIN32)
+ return BaseDir + "/" + name + ".dll";
+#else
+ return BaseDir + "/" + name + ".so";
+#endif
+}
+
+inline std::string withext(const std::string &lib) {
+ SmallString<128> P(lib);
+ sys::path::replace_extension(P, ext);
+ return P.str().str();
+}
+
+inline std::string platformSymbolName(const std::string &name) {
+#if defined(__APPLE__)
+ return "_" + name; // macOS prepends underscore
+#else
+ return name;
+#endif
+}
+
+struct TestLibrary {
+ std::string path;
+ std::vector<std::string> Syms;
+};
+
+class LibraryResolverIT : public ::testing::Test {
+protected:
+ std::string BaseDir;
+ std::unordered_map<std::string, TestLibrary> libs;
+
+ void addLib(const std::string &name) {
+ SmallString<512> path;
+ std::error_code EC =
+ sys::fs::real_path(libPath(BaseDir, name + "/lib" + name), path);
+ if (EC || path.empty() || !sys::fs::exists(path))
+ GTEST_SKIP();
+ libs[name] = {path.str().str(), {platformSymbolName("say" + name)}};
+ }
+
+ void SetUp() override {
+ if (!EnvReady || GlobalEnv == nullptr)
+ GTEST_SKIP() << "Skipping test: environment setup failed.";
+
+ {
+ SmallString<512> path;
+ std::error_code EC = sys::fs::real_path(GlobalEnv->getBaseDir(), path);
+ if (path.empty() || EC)
+ GTEST_SKIP() << "Base directory resolution failed: " << EC.message();
+ BaseDir = path.str().str();
+ }
+
+ for (const auto &P : GlobalEnv->getDylibPaths()) {
+ if (!sys::fs::exists(P))
+ GTEST_SKIP() << "Missing dylib path: " << P;
+ }
+
+ const std::vector<std::string> libNames = {"A", "B", "C", "Z"};
+ for (const auto &name : libNames)
+ addLib(name);
+
+ if (!EnvReady)
+ GTEST_SKIP() << "Skipping test: environment setup failed.";
+ }
+
+ const std::vector<std::string> &sym(const std::string &key) {
+ return libs[key].Syms;
+ }
+ const std::string &lib(const std::string &key) { return libs[key].path; }
+ const std::string libdir(const std::string &key) {
+ SmallString<512> P(libs[key].path);
+ sys::path::remove_filename(P);
+ return P.str().str();
+ }
+ const std::string libname(const std::string &key) {
+ return sys::path::filename(libs[key].path).str();
+ }
+};
+
+// Helper: allow either "sayA" or "_sayA" depending on how your
+// SymbolEnumerator reports.
+static bool matchesEitherUnderscore(const std::string &got,
+ const std::string &bare) {
+ return got == bare || got == ("_" + bare);
+}
+
+// Helper: normalize path ending check (we only care that it resolved to the
+// right dylib)
+static bool endsWith(const std::string &s, const std::string &suffix) {
+ if (s.size() < suffix.size())
+ return false;
+ return std::equal(suffix.rbegin(), suffix.rend(), s.rbegin());
+}
+
+TEST_F(LibraryResolverIT, EnumerateSymbols_ExportsOnly_DefaultFlags) {
+ const std::string libC = lib("C");
+ SymbolEnumeratorOptions Opts = SymbolEnumeratorOptions::defaultOptions();
+
+ std::vector<std::string> seen;
+ auto onEach = [&](llvm::StringRef sym) -> EnumerateResult {
+ seen.emplace_back(sym.str());
+ return EnumerateResult::Continue;
+ };
+
+ ASSERT_TRUE(SymbolEnumerator::enumerateSymbols(libC, onEach, Opts));
+
+ // sayC is exported, others are undefined → only sayC expected
+ EXPECT_TRUE(any_of(seen, [&](const std::string &s) {
+ return matchesEitherUnderscore(s, "sayC");
+ }));
+ EXPECT_FALSE(any_of(seen, [&](const std::string &s) {
+ return matchesEitherUnderscore(s, "sayA");
+ }));
+ EXPECT_FALSE(any_of(seen, [&](const std::string &s) {
+ return matchesEitherUnderscore(s, "sayB");
+ }));
+}
+
+TEST_F(LibraryResolverIT, EnumerateSymbols_IncludesUndefineds) {
+ const std::string libC = lib("C");
+
+ SymbolEnumeratorOptions Opts;
+ Opts.FilterFlags =
+ SymbolEnumeratorOptions::IgnoreWeak |
+ SymbolEnumeratorOptions::IgnoreIndirect; // no IgnoreUndefined
+
+ std::vector<std::string> seen;
+ auto onEach = [&](llvm::StringRef sym) -> EnumerateResult {
+ seen.emplace_back(sym.str());
+ return EnumerateResult::Continue;
+ };
+
+ ASSERT_TRUE(SymbolEnumerator::enumerateSymbols(libC, onEach, Opts));
+
+ // Now we should see both sayC (export) and the undefined refs sayA, sayB,
+ // sayZ
+ EXPECT_TRUE(any_of(seen, [&](const std::string &s) {
+ return matchesEitherUnderscore(s, "sayC");
+ }));
+ EXPECT_TRUE(any_of(seen, [&](const std::string &s) {
+ return matchesEitherUnderscore(s, "sayA");
+ }));
+ EXPECT_TRUE(any_of(seen, [&](const std::string &s) {
+ return matchesEitherUnderscore(s, "sayB");
+ }));
+}
+
+// Full resolution via LibraryResolutionDriver/LibraryResolver ---
+TEST_F(LibraryResolverIT, DriverResolvesSymbolsToCorrectLibraries) {
+ // Create the resolver from real base paths (our fixtures dir)
+ auto Stup = LibraryResolver::Setup::create({BaseDir});
+
+ // Full system behavior: no mocks
+ auto Driver = LibraryResolutionDriver::create(Stup);
+ ASSERT_NE(Driver, nullptr);
+
+ // Tell the Driver about the scan path kinds (User/System) as your
+ // production code expects.
+ Driver->addScanPath(libdir("A"), PathType::User);
+ Driver->addScanPath(libdir("B"), PathType::User);
+ Driver->addScanPath(libdir("Z"), PathType::User);
+
+ // Symbols to resolve (bare names; class handles underscore differences
+ // internally)
+ std::vector<std::string> Syms = {platformSymbolName("sayA"),
+ platformSymbolName("sayB"),
+ platformSymbolName("sayZ")};
+
+ bool CallbackRan = false;
+ Driver->resolveSymbols(Syms, [&](SymbolQuery &Q) {
+ CallbackRan = true;
+
+ // sayA should resolve to A.dylib
+ {
+ auto lib = Q.getResolvedLib(platformSymbolName("sayA"));
+ ASSERT_TRUE(lib.has_value()) << "sayA should be resolved";
+ EXPECT_TRUE(endsWith(lib->str(), libname("A")))
+ << "sayA resolved to: " << lib->str();
+ }
+
+ // sayB should resolve to B.dylib
+ {
+ auto lib = Q.getResolvedLib(platformSymbolName("sayB"));
+ ASSERT_TRUE(lib.has_value()) << "sayB should be resolved";
+ EXPECT_TRUE(endsWith(lib->str(), libname("B")))
+ << "sayB resolved to: " << lib->str();
+ }
+
+ // sayZ should resolve to B.dylib
+ {
+ auto lib = Q.getResolvedLib(platformSymbolName("sayZ"));
+ ASSERT_TRUE(lib.has_value()) << "sayZ should be resolved";
+ EXPECT_TRUE(endsWith(lib->str(), libname("Z")))
+ << "sayZ resolved to: " << lib->str();
+ }
+
+ EXPECT_TRUE(Q.allResolved());
+ });
+
+ EXPECT_TRUE(CallbackRan);
+}
+
+// stress SymbolQuery with the real resolve flow
+// And resolve libC dependency libA, libB, libZ ---
+TEST_F(LibraryResolverIT, ResolveManySymbols) {
+ auto Stup = LibraryResolver::Setup::create({BaseDir});
+ auto Driver = LibraryResolutionDriver::create(Stup);
+ ASSERT_NE(Driver, nullptr);
+ Driver->addScanPath(libdir("C"), PathType::User);
+
+ // Many duplicates to provoke concurrent updates inside SymbolQuery
+ std::vector<std::string> Syms = {
+ platformSymbolName("sayA"), platformSymbolName("sayB"),
+ platformSymbolName("sayA"), platformSymbolName("sayB"),
+ platformSymbolName("sayZ"), platformSymbolName("sayZ"),
+ platformSymbolName("sayZ"), platformSymbolName("sayZ"),
+ platformSymbolName("sayA"), platformSymbolName("sayB"),
+ platformSymbolName("sayA"), platformSymbolName("sayB")};
+
+ bool CallbackRan = false;
+ Driver->resolveSymbols(Syms, [&](SymbolQuery &Q) {
+ CallbackRan = true;
+ EXPECT_TRUE(Q.isResolved(platformSymbolName("sayA")));
+ EXPECT_TRUE(Q.isResolved(platformSymbolName("sayB")));
+ EXPECT_TRUE(Q.isResolved(platformSymbolName("sayZ")));
+
+ auto A = Q.getResolvedLib(platformSymbolName("sayA"));
+ auto B = Q.getResolvedLib(platformSymbolName("sayB"));
+ auto Z = Q.getResolvedLib(platformSymbolName("sayZ"));
+ ASSERT_TRUE(A.has_value());
+ ASSERT_TRUE(B.has_value());
+ ASSERT_TRUE(Z.has_value());
+ EXPECT_TRUE(endsWith(A->str(), libname("A")));
+ EXPECT_TRUE(endsWith(B->str(), libname("B")));
+ EXPECT_TRUE(endsWith(Z->str(), libname("Z")));
+ EXPECT_TRUE(Q.allResolved());
+ });
+
+ EXPECT_TRUE(CallbackRan);
+}
+
+TEST_F(LibraryResolverIT, ScanAndResolveDependencyGraph) {
+ auto LibPathCache = std::make_shared<LibraryPathCache>();
+ auto PResolver = std::make_shared<PathResolver>(LibPathCache);
+ LibraryScanHelper ScanH({}, LibPathCache, PResolver);
+
+ ScanH.addBasePath(libdir("C"), PathType::User);
+
+ LibraryManager LibMgr;
+ LibraryScanner Scanner(ScanH, LibMgr);
+
+ Scanner.scanNext(PathType::User, 0);
+
+ size_t numLibs = 0;
+ LibMgr.forEachLibrary([&](const LibraryInfo &L) {
+ numLibs++;
+ return true;
+ });
+
+ EXPECT_GT(numLibs, 0u) << "Expected at least one library scanned";
+
+ // Validate that each scanned library path is resolvable
+ std::error_code EC;
+ LibMgr.forEachLibrary([&](const LibraryInfo &L) {
+ auto R = PResolver->resolve(L.getFullPath(), EC);
+ EXPECT_TRUE(R.has_value());
+ EXPECT_FALSE(EC);
+ return true;
+ });
+}
+
+TEST_F(LibraryResolverIT, ScanEmptyPath) {
+ auto LibPathCache = std::make_shared<LibraryPathCache>();
+ auto PResolver = std::make_shared<PathResolver>(LibPathCache);
+ LibraryScanHelper ScanH({}, LibPathCache, PResolver);
+
+ ScanH.addBasePath("/tmp/empty", PathType::User);
+
+ LibraryManager LibMgr;
+ LibraryScanner Scanner(ScanH, LibMgr);
+
+ Scanner.scanNext(PathType::User, 0);
+
+ size_t count = 0;
+ LibMgr.forEachLibrary([&](const LibraryInfo &) {
+ count++;
+ return true;
+ });
+ EXPECT_EQ(count, 0u);
+}
+
+TEST_F(LibraryResolverIT, PathResolverResolvesKnownPaths) {
+ auto LibPathCache = std::make_shared<LibraryPathCache>();
+ auto PResolver = std::make_shared<PathResolver>(LibPathCache);
+
+ std::error_code EC;
+ auto Missing = PResolver->resolve("temp/foo/bar", EC);
+ EXPECT_FALSE(Missing.has_value()) << "Unexpectedly resolved a bogus path";
+ EXPECT_TRUE(EC) << "Expected error resolving path";
+
+ auto DirPath = PResolver->resolve(BaseDir, EC);
+ ASSERT_TRUE(DirPath.has_value());
+ EXPECT_FALSE(EC) << "Expected no error resolving path";
+ EXPECT_EQ(*DirPath, BaseDir);
+
+ auto DylibPath = PResolver->resolve(lib("C"), EC);
+ ASSERT_TRUE(DylibPath.has_value());
+ EXPECT_FALSE(EC) << "Expected no error resolving path";
+ EXPECT_EQ(*DylibPath, lib("C"));
+}
+
+TEST_F(LibraryResolverIT, PathResolverNormalizesDotAndDotDot) {
+ auto LibPathCache = std::make_shared<LibraryPathCache>();
+ auto PResolver = std::make_shared<PathResolver>(LibPathCache);
+
+ std::error_code EC;
+
+ // e.g. BaseDir + "/./C/../C/C.dylib" → BaseDir + "/C.dylib"
+ std::string Messy = BaseDir + "/C/./../C/./libC" + ext;
+ auto Resolved = PResolver->resolve(Messy, EC);
+ ASSERT_TRUE(Resolved.has_value());
+ EXPECT_FALSE(EC);
+ EXPECT_EQ(*Resolved, lib("C")) << "Expected realpath to collapse . and ..";
+}
+
+#if !defined(_WIN32)
+TEST_F(LibraryResolverIT, PathResolverFollowsSymlinks) {
+ auto LibPathCache = std::make_shared<LibraryPathCache>();
+ auto PResolver = std::make_shared<PathResolver>(LibPathCache);
+
+ std::error_code EC;
+
+ // Create a symlink temp -> BaseDir (only if filesystem allows it)
+ std::string linkName = BaseDir + withext("/link_to_C");
+ std::string target = lib("C");
+ if (::symlink(target.c_str(), linkName.c_str()) != 0)
+ GTEST_SKIP() << "Failed to create symlink: " << strerror(errno);
+
+ auto resolved = PResolver->resolve(linkName, EC);
+ ASSERT_TRUE(resolved.has_value());
+ EXPECT_FALSE(EC);
+ EXPECT_EQ(*resolved, target);
+
+ (void)::unlink(linkName.c_str()); // cleanup
+}
+
+TEST_F(LibraryResolverIT, PathResolverCachesResults) {
+ auto LibPathCache = std::make_shared<LibraryPathCache>();
+ auto PResolver = std::make_shared<PathResolver>(LibPathCache);
+
+ SmallString<128> TmpDylib;
+ std::error_code EC;
+ EC = sys::fs::createUniqueFile(withext("A-copy"), TmpDylib);
+ if (EC)
+ GTEST_SKIP() << "Failed to create temp dylib" << EC.message();
+
+ EC = sys::fs::copy_file(lib("A"), TmpDylib);
+ if (EC)
+ GTEST_SKIP() << "Failed to copy libA: " << EC.message();
+ EC.clear();
+
+ // First resolve -> should populate LibPathCache
+ auto first = PResolver->resolve(TmpDylib, EC);
+ ASSERT_TRUE(first.has_value());
+
+ // Forcefully remove the file from disk
+ (void)::unlink(TmpDylib.c_str());
+
+ // Second resolve -> should still succeed from LibPathCache
+ auto second = PResolver->resolve(TmpDylib, EC);
+ EXPECT_TRUE(second.has_value());
+ EXPECT_EQ(*second, *first);
+}
+#endif
+
+TEST_F(LibraryResolverIT, LoaderPathSubstitutionAndResolve) {
+ auto LibPathCache = std::make_shared<LibraryPathCache>();
+ auto PResolver = std::make_shared<PathResolver>(LibPathCache);
+
+ DylibSubstitutor substitutor;
+ substitutor.configure(libdir("C"));
+#if defined(__APPLE__)
+ // Substitute @loader_path with BaseDir
+ std::string substituted =
+ substitutor.substitute(withext("@loader_path/libC"));
+#elif defined(__linux__)
+ // Substitute $origin with BaseDir
+ std::string substituted = substitutor.substitute(withext("$ORIGIN/libC"));
+#endif
+ ASSERT_FALSE(substituted.empty());
+ EXPECT_EQ(substituted, lib("C"));
+
+ // Now try resolving the substituted path
+ std::error_code EC;
+ auto resolved = PResolver->resolve(substituted, EC);
+ ASSERT_TRUE(resolved.has_value()) << "Expected to resolve substituted dylib";
+ EXPECT_EQ(*resolved, lib("C"));
+ EXPECT_FALSE(EC) << "Expected no error resolving substituted dylib";
+}
+
+TEST_F(LibraryResolverIT, ResolveFromUsrOrSystemPaths) {
+ auto LibPathCache = std::make_shared<LibraryPathCache>();
+ auto PResolver = std::make_shared<PathResolver>(LibPathCache);
+
+ DylibPathValidator validator(*PResolver);
+
+ std::vector<std::string> Paths = {"/foo/bar/", "temp/foo", libdir("C"),
+ libdir("A"), libdir("B"), libdir("Z")};
+
+ SmallVector<StringRef> P(Paths.begin(), Paths.end());
+
+ DylibResolver Resolver(validator);
+ Resolver.configure("", {{P, SearchPathType::UsrOrSys}});
+
+ // Check "C"
+ auto ValOptC = Resolver.resolve("libC", true);
+ EXPECT_TRUE(ValOptC.has_value());
+ EXPECT_EQ(*ValOptC, lib("C"));
+
+ auto ValOptCdylib = Resolver.resolve(withext("libC"));
+ EXPECT_TRUE(ValOptCdylib.has_value());
+ EXPECT_EQ(*ValOptCdylib, lib("C"));
+
+ // Check "A"
+ auto ValOptA = Resolver.resolve("libA", true);
+ EXPECT_TRUE(ValOptA.has_value());
+ EXPECT_EQ(*ValOptA, lib("A"));
+
+ auto ValOptAdylib = Resolver.resolve(withext("libA"));
+ EXPECT_TRUE(ValOptAdylib.has_value());
+ EXPECT_EQ(*ValOptAdylib, lib("A"));
+
+ // Check "B"
+ auto ValOptB = Resolver.resolve("libB", true);
+ EXPECT_TRUE(ValOptB.has_value());
+ EXPECT_EQ(*ValOptB, lib("B"));
+
+ auto ValOptBdylib = Resolver.resolve(withext("libB"));
+ EXPECT_TRUE(ValOptBdylib.has_value());
+ EXPECT_EQ(*ValOptBdylib, lib("B"));
+
+ // Check "Z"
+ auto ValOptZ = Resolver.resolve("libZ", true);
+ EXPECT_TRUE(ValOptZ.has_value());
+ EXPECT_EQ(*ValOptZ, lib("Z"));
+
+ auto ValOptZdylib = Resolver.resolve(withext("libZ"));
+ EXPECT_TRUE(ValOptZdylib.has_value());
+ EXPECT_EQ(*ValOptZdylib, lib("Z"));
+}
+
+#if defined(__APPLE__)
+TEST_F(LibraryResolverIT, ResolveViaLoaderPathAndRPathSubstitution) {
+ auto LibPathCache = std::make_shared<LibraryPathCache>();
+ auto PResolver = std::make_shared<PathResolver>(LibPathCache);
+
+ DylibPathValidator validator(*PResolver);
+
+ std::vector<std::string> Paths = {"@loader_path/../A", "@loader_path/../B",
+ "@loader_path/../C", "@loader_path/../Z"};
+
+ SmallVector<StringRef> P(Paths.begin(), Paths.end());
+
+ DylibResolver Resolver(validator);
+
+ // Use only RPath config
+ Resolver.configure(lib("C"), {{P, SearchPathType::RPath}});
+
+ // --- Check A ---
+ auto ValOptA = Resolver.resolve("@rpath/libA", true);
+ EXPECT_TRUE(ValOptA.has_value());
+ EXPECT_EQ(*ValOptA, lib("A"));
+
+ auto ValOptAdylib = Resolver.resolve(withext("@rpath/libA"));
+ EXPECT_TRUE(ValOptAdylib.has_value());
+ EXPECT_EQ(*ValOptAdylib, lib("A"));
+
+ // --- Check B ---
+ auto ValOptB = Resolver.resolve("@rpath/libB", true);
+ EXPECT_TRUE(ValOptB.has_value());
+ EXPECT_EQ(*ValOptB, lib("B"));
+
+ auto ValOptBdylib = Resolver.resolve(withext("@rpath/libB"));
+ EXPECT_TRUE(ValOptBdylib.has_value());
+ EXPECT_EQ(*ValOptBdylib, lib("B"));
+
+ // --- Check Z ---
+ auto ValOptZ = Resolver.resolve("@rpath/libZ", true);
+ EXPECT_TRUE(ValOptZ.has_value());
+ EXPECT_EQ(*ValOptZ, lib("Z"));
+
+ auto ValOptZdylib = Resolver.resolve(withext("@rpath/libZ"));
+ EXPECT_TRUE(ValOptZdylib.has_value());
+ EXPECT_EQ(*ValOptZdylib, lib("Z"));
+}
+#endif
+
+#if defined(__linux__)
+TEST_F(LibraryResolverIT, ResolveViaOriginAndRPathSubstitution) {
+ auto LibPathCache = std::make_shared<LibraryPathCache>();
+ auto PResolver = std::make_shared<PathResolver>(LibPathCache);
+
+ DylibPathValidator validator(*PResolver);
+
+ // On Linux, $ORIGIN works like @loader_path
+ std::vector<std::string> Paths = {"$ORIGIN/../A", "$ORIGIN/../B",
+ "$ORIGIN/../C", "$ORIGIN/../Z"};
+
+ SmallVector<StringRef> P(Paths.begin(), Paths.end());
+
+ DylibResolver Resolver(validator);
+
+ // Use only RPath config
+ Resolver.configure(lib("C"), {{P, SearchPathType::RunPath}});
+
+ // --- Check A ---
+ auto ValOptA = Resolver.resolve("libA", true);
+ EXPECT_TRUE(ValOptA.has_value());
+ EXPECT_EQ(*ValOptA, lib("A"));
+
+ auto valOptASO = Resolver.resolve(withext("libA"));
+ EXPECT_TRUE(valOptASO.has_value());
+ EXPECT_EQ(*valOptASO, lib("A"));
+
+ // --- Check B ---
+ auto ValOptB = Resolver.resolve("libB", true);
+ EXPECT_TRUE(ValOptB.has_value());
+ EXPECT_EQ(*ValOptB, lib("B"));
+
+ auto valOptBSO = Resolver.resolve(withext("libB"));
+ EXPECT_TRUE(valOptBSO.has_value());
+ EXPECT_EQ(*valOptBSO, lib("B"));
+
+ // --- Check Z ---
+ auto ValOptZ = Resolver.resolve("libZ", true);
+ EXPECT_TRUE(ValOptZ.has_value());
+ EXPECT_EQ(*ValOptZ, lib("Z"));
+
+ auto valOptZSO = Resolver.resolve(withext("libZ"));
+ EXPECT_TRUE(valOptZSO.has_value());
+ EXPECT_EQ(*valOptZSO, lib("Z"));
+}
+#endif
+} // namespace
+#endif // defined(__APPLE__)
diff --git a/llvm/unittests/Transforms/Vectorize/VPlanVerifierTest.cpp b/llvm/unittests/Transforms/Vectorize/VPlanVerifierTest.cpp
index 50ad4d5..4680282 100644
--- a/llvm/unittests/Transforms/Vectorize/VPlanVerifierTest.cpp
+++ b/llvm/unittests/Transforms/Vectorize/VPlanVerifierTest.cpp
@@ -21,7 +21,7 @@ using VPVerifierTest = VPlanTestBase;
namespace {
TEST_F(VPVerifierTest, VPInstructionUseBeforeDefSameBB) {
VPlan &Plan = getPlan();
- VPValue *Zero = Plan.getOrAddLiveIn(ConstantInt::get(Type::getInt32Ty(C), 0));
+ VPValue *Zero = Plan.getConstantInt(32, 0);
VPInstruction *DefI = new VPInstruction(Instruction::Add, {Zero});
VPInstruction *UseI = new VPInstruction(Instruction::Sub, {DefI});
auto *CanIV = new VPCanonicalIVPHIRecipe(Zero, {});
@@ -56,7 +56,7 @@ TEST_F(VPVerifierTest, VPInstructionUseBeforeDefSameBB) {
TEST_F(VPVerifierTest, VPInstructionUseBeforeDefDifferentBB) {
VPlan &Plan = getPlan();
- VPValue *Zero = Plan.getOrAddLiveIn(ConstantInt::get(Type::getInt32Ty(C), 0));
+ VPValue *Zero = Plan.getConstantInt(32, 0);
VPInstruction *DefI = new VPInstruction(Instruction::Add, {Zero});
VPInstruction *UseI = new VPInstruction(Instruction::Sub, {DefI});
auto *CanIV = new VPCanonicalIVPHIRecipe(Zero, {});
@@ -184,7 +184,7 @@ TEST_F(VPVerifierTest, VPPhiIncomingValueDoesntDominateIncomingBlock) {
TEST_F(VPVerifierTest, DuplicateSuccessorsOutsideRegion) {
VPlan &Plan = getPlan();
- VPValue *Zero = Plan.getOrAddLiveIn(ConstantInt::get(Type::getInt32Ty(C), 0));
+ VPValue *Zero = Plan.getConstantInt(32, 0);
VPInstruction *I1 = new VPInstruction(Instruction::Add, {Zero});
auto *CanIV = new VPCanonicalIVPHIRecipe(Zero, {});
VPInstruction *BranchOnCond =
@@ -218,7 +218,7 @@ TEST_F(VPVerifierTest, DuplicateSuccessorsOutsideRegion) {
TEST_F(VPVerifierTest, DuplicateSuccessorsInsideRegion) {
VPlan &Plan = getPlan();
- VPValue *Zero = Plan.getOrAddLiveIn(ConstantInt::get(Type::getInt32Ty(C), 0));
+ VPValue *Zero = Plan.getConstantInt(32, 0);
VPInstruction *I1 = new VPInstruction(Instruction::Add, {Zero});
auto *CanIV = new VPCanonicalIVPHIRecipe(Zero, {});
VPInstruction *BranchOnCond =
@@ -259,7 +259,7 @@ TEST_F(VPVerifierTest, BlockOutsideRegionWithParent) {
VPBasicBlock *VPBB1 = Plan.getEntry();
VPBasicBlock *VPBB2 = Plan.createVPBasicBlock("");
- VPValue *Zero = Plan.getOrAddLiveIn(ConstantInt::get(Type::getInt32Ty(C), 0));
+ VPValue *Zero = Plan.getConstantInt(32, 0);
auto *CanIV = new VPCanonicalIVPHIRecipe(Zero, {});
VPBB2->appendRecipe(CanIV);
@@ -288,7 +288,7 @@ TEST_F(VPVerifierTest, BlockOutsideRegionWithParent) {
TEST_F(VPVerifierTest, NonHeaderPHIInHeader) {
VPlan &Plan = getPlan();
- VPValue *Zero = Plan.getOrAddLiveIn(ConstantInt::get(Type::getInt32Ty(C), 0));
+ VPValue *Zero = Plan.getConstantInt(32, 0);
auto *CanIV = new VPCanonicalIVPHIRecipe(Zero, {});
auto *BranchOnCond = new VPInstruction(VPInstruction::BranchOnCond, {CanIV});
@@ -351,8 +351,7 @@ TEST_F(VPIRVerifierTest, testVerifyIRPhi) {
BasicBlock *LoopHeader = F->getEntryBlock().getSingleSuccessor();
auto Plan = buildVPlan(LoopHeader);
- Plan->getExitBlocks()[0]->front().addOperand(
- Plan->getOrAddLiveIn(ConstantInt::get(Type::getInt32Ty(*Ctx), 0)));
+ Plan->getExitBlocks()[0]->front().addOperand(Plan->getConstantInt(32, 0));
#if GTEST_HAS_STREAM_REDIRECTION
::testing::internal::CaptureStderr();
diff --git a/llvm/utils/UpdateTestChecks/common.py b/llvm/utils/UpdateTestChecks/common.py
index 119303c..2dad16a 100644
--- a/llvm/utils/UpdateTestChecks/common.py
+++ b/llvm/utils/UpdateTestChecks/common.py
@@ -29,7 +29,7 @@ Version changelog:
'none' and 'all'. 'smart' is the default.
5: Basic block labels are matched by FileCheck expressions
6: The semantics of TBAA checks has been incorporated in the check lines.
-7: Indent switch-cases correctly.
+7: Indent switch-cases correctly; CHECK-EMPTY instead of skipping blank lines.
"""
DEFAULT_VERSION = 6
@@ -2280,6 +2280,14 @@ def add_checks(
# For IR output, change all defs to FileCheck variables, so we're immune
# to variable naming fashions.
else:
+ if ginfo.get_version() >= 7:
+ # Record the indices of blank lines in the function body preemptively.
+ blank_line_indices = {
+ i for i, line in enumerate(func_body) if line.strip() == ""
+ }
+ else:
+ blank_line_indices = set()
+
func_body = generalize_check_lines(
func_body,
ginfo,
@@ -2305,9 +2313,18 @@ def add_checks(
is_blank_line = False
- for func_line in func_body:
+ for idx, func_line in enumerate(func_body):
if func_line.strip() == "":
- is_blank_line = True
+ # We should distinguish if the line is a 'fake' blank line generated by
+ # generalize_check_lines removing comments.
+ # Fortunately, generalize_check_lines does not change the index of each line,
+ # we can record the indices of blank lines preemptively.
+ if idx in blank_line_indices:
+ output_lines.append(
+ "{} {}-EMPTY:".format(comment_marker, checkprefix)
+ )
+ else:
+ is_blank_line = True
continue
if not check_inst_comments:
# Do not waste time checking IR comments unless necessary.