diff options
566 files changed, 25499 insertions, 21850 deletions
diff --git a/.github/workflows/containers/github-action-ci/stage1.Dockerfile b/.github/workflows/containers/github-action-ci/stage1.Dockerfile index fbc4548..8c6bcf4 100644 --- a/.github/workflows/containers/github-action-ci/stage1.Dockerfile +++ b/.github/workflows/containers/github-action-ci/stage1.Dockerfile @@ -37,7 +37,7 @@ RUN cmake -B ./build -G Ninja ./llvm \ -DLLVM_ENABLE_RUNTIMES="compiler-rt" \ -DCMAKE_INSTALL_PREFIX="$LLVM_SYSROOT" \ -DLLVM_ENABLE_PROJECTS="bolt;clang;lld;clang-tools-extra" \ - -DLLVM_DISTRIBUTION_COMPONENTS="lld;compiler-rt;clang-format" \ + -DLLVM_DISTRIBUTION_COMPONENTS="lld;compiler-rt;clang-format;scan-build" \ -DCLANG_DEFAULT_LINKER="lld" \ -DBOOTSTRAP_CLANG_PGO_TRAINING_DATA_SOURCE_DIR=/llvm-project-llvmorg-$LLVM_VERSION/llvm diff --git a/bolt/include/bolt/Core/GDBIndex.h b/bolt/include/bolt/Core/GDBIndex.h new file mode 100644 index 0000000..6604c2a --- /dev/null +++ b/bolt/include/bolt/Core/GDBIndex.h @@ -0,0 +1,61 @@ +//===-- bolt/Core/GDBIndex.h - GDB Index support ----------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +/// +/// This file contains declaration of classes required for generation of +/// .gdb_index section. +/// +//===----------------------------------------------------------------------===// + +#ifndef BOLT_CORE_GDB_INDEX_H +#define BOLT_CORE_GDB_INDEX_H + +#include "bolt/Core/BinaryContext.h" +#include <vector> + +namespace llvm { +namespace bolt { + +class GDBIndex { +public: + /// Contains information about TU so we can write out correct entries in GDB + /// index. 
+ struct GDBIndexTUEntry { + uint64_t UnitOffset; + uint64_t TypeHash; + uint64_t TypeDIERelativeOffset; + }; + +private: + BinaryContext &BC; + + /// Entries for GDB Index Types CU List. + using GDBIndexTUEntryType = std::vector<GDBIndexTUEntry>; + GDBIndexTUEntryType GDBIndexTUEntryVector; + +public: + GDBIndex(BinaryContext &BC) : BC(BC) {} + + std::mutex GDBIndexMutex; + + /// Adds an GDBIndexTUEntry if .gdb_index section exists. + void addGDBTypeUnitEntry(const GDBIndexTUEntry &&Entry); + + /// Rewrite .gdb_index section if present. + void updateGdbIndexSection(const CUOffsetMap &CUMap, const uint32_t NumCUs, + DebugARangesSectionWriter &ARangesSectionWriter); + + /// Returns all entries needed for Types CU list. + const GDBIndexTUEntryType &getGDBIndexTUEntryVector() const { + return GDBIndexTUEntryVector; + } +}; + +} // namespace bolt +} // namespace llvm + +#endif diff --git a/bolt/lib/Core/CMakeLists.txt b/bolt/lib/Core/CMakeLists.txt index 441df9f..873cf67 100644 --- a/bolt/lib/Core/CMakeLists.txt +++ b/bolt/lib/Core/CMakeLists.txt @@ -25,6 +25,7 @@ add_llvm_library(LLVMBOLTCore DynoStats.cpp Exceptions.cpp FunctionLayout.cpp + GDBIndex.cpp HashUtilities.cpp JumpTable.cpp MCPlusBuilder.cpp diff --git a/bolt/lib/Core/GDBIndex.cpp b/bolt/lib/Core/GDBIndex.cpp new file mode 100644 index 0000000..9e6d241 --- /dev/null +++ b/bolt/lib/Core/GDBIndex.cpp @@ -0,0 +1,185 @@ +//===- bolt/Core/GDBIndex.cpp - GDB Index support ------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "bolt/Core/GDBIndex.h" + +using namespace llvm::bolt; +using namespace llvm::support::endian; + +void GDBIndex::addGDBTypeUnitEntry(const GDBIndexTUEntry &&Entry) { + std::lock_guard<std::mutex> Lock(GDBIndexMutex); + if (!BC.getGdbIndexSection()) + return; + GDBIndexTUEntryVector.emplace_back(Entry); +} + +void GDBIndex::updateGdbIndexSection( + const CUOffsetMap &CUMap, const uint32_t NumCUs, + DebugARangesSectionWriter &ARangesSectionWriter) { + if (!BC.getGdbIndexSection()) + return; + + // See https://sourceware.org/gdb/onlinedocs/gdb/Index-Section-Format.html + // for .gdb_index section format. + + StringRef GdbIndexContents = BC.getGdbIndexSection()->getContents(); + + const char *Data = GdbIndexContents.data(); + + // Parse the header. + const uint32_t Version = read32le(Data); + if (Version != 7 && Version != 8) { + errs() << "BOLT-ERROR: can only process .gdb_index versions 7 and 8\n"; + exit(1); + } + + // Some .gdb_index generators use file offsets while others use section + // offsets. Hence we can only rely on offsets relative to each other, + // and ignore their absolute values. + const uint32_t CUListOffset = read32le(Data + 4); + const uint32_t CUTypesOffset = read32le(Data + 8); + const uint32_t AddressTableOffset = read32le(Data + 12); + const uint32_t SymbolTableOffset = read32le(Data + 16); + const uint32_t ConstantPoolOffset = read32le(Data + 20); + Data += 24; + + // Map CUs offsets to indices and verify existing index table. 
+ std::map<uint32_t, uint32_t> OffsetToIndexMap; + const uint32_t CUListSize = CUTypesOffset - CUListOffset; + const uint32_t TUListSize = AddressTableOffset - CUTypesOffset; + const unsigned NUmCUsEncoded = CUListSize / 16; + unsigned MaxDWARFVersion = BC.DwCtx->getMaxVersion(); + unsigned NumDWARF5TUs = + getGDBIndexTUEntryVector().size() - BC.DwCtx->getNumTypeUnits(); + bool SkipTypeUnits = false; + // For DWARF5 Types are in .debug_info. + // LLD doesn't generate Types CU List, and in CU list offset + // only includes CUs. + // GDB 11+ includes only CUs in CU list and generates Types + // list. + // GDB 9 includes CUs and TUs in CU list and generates TYpes + // list. The NumCUs is CUs + TUs, so need to modify the check. + // For split-dwarf + // GDB-11, DWARF5: TU units from dwo are not included. + // GDB-11, DWARF4: TU units from dwo are included. + if (MaxDWARFVersion >= 5) + SkipTypeUnits = !TUListSize ? true + : ((NUmCUsEncoded + NumDWARF5TUs) == + BC.DwCtx->getNumCompileUnits()); + + if (!((CUListSize == NumCUs * 16) || + (CUListSize == (NumCUs + NumDWARF5TUs) * 16))) { + errs() << "BOLT-ERROR: .gdb_index: CU count mismatch\n"; + exit(1); + } + DenseSet<uint64_t> OriginalOffsets; + for (unsigned Index = 0, Units = BC.DwCtx->getNumCompileUnits(); + Index < Units; ++Index) { + const DWARFUnit *CU = BC.DwCtx->getUnitAtIndex(Index); + if (SkipTypeUnits && CU->isTypeUnit()) + continue; + const uint64_t Offset = read64le(Data); + Data += 16; + if (CU->getOffset() != Offset) { + errs() << "BOLT-ERROR: .gdb_index CU offset mismatch\n"; + exit(1); + } + + OriginalOffsets.insert(Offset); + OffsetToIndexMap[Offset] = Index; + } + + // Ignore old address table. + const uint32_t OldAddressTableSize = SymbolTableOffset - AddressTableOffset; + // Move Data to the beginning of symbol table. + Data += SymbolTableOffset - CUTypesOffset; + + // Calculate the size of the new address table. 
+ uint32_t NewAddressTableSize = 0; + for (const auto &CURangesPair : ARangesSectionWriter.getCUAddressRanges()) { + const SmallVector<DebugAddressRange, 2> &Ranges = CURangesPair.second; + NewAddressTableSize += Ranges.size() * 20; + } + + // Difference between old and new table (and section) sizes. + // Could be negative. + int32_t Delta = NewAddressTableSize - OldAddressTableSize; + + size_t NewGdbIndexSize = GdbIndexContents.size() + Delta; + + // Free'd by ExecutableFileMemoryManager. + auto *NewGdbIndexContents = new uint8_t[NewGdbIndexSize]; + uint8_t *Buffer = NewGdbIndexContents; + + write32le(Buffer, Version); + write32le(Buffer + 4, CUListOffset); + write32le(Buffer + 8, CUTypesOffset); + write32le(Buffer + 12, AddressTableOffset); + write32le(Buffer + 16, SymbolTableOffset + Delta); + write32le(Buffer + 20, ConstantPoolOffset + Delta); + Buffer += 24; + + using MapEntry = std::pair<uint32_t, CUInfo>; + std::vector<MapEntry> CUVector(CUMap.begin(), CUMap.end()); + // Need to sort since we write out all of TUs in .debug_info before CUs. + std::sort(CUVector.begin(), CUVector.end(), + [](const MapEntry &E1, const MapEntry &E2) -> bool { + return E1.second.Offset < E2.second.Offset; + }); + // Writing out CU List <Offset, Size> + for (auto &CUInfo : CUVector) { + // Skipping TU for DWARF5 when they are not included in CU list. + if (!OriginalOffsets.count(CUInfo.first)) + continue; + write64le(Buffer, CUInfo.second.Offset); + // Length encoded in CU doesn't contain first 4 bytes that encode length. + write64le(Buffer + 8, CUInfo.second.Length + 4); + Buffer += 16; + } + + // Rewrite TU CU List, since abbrevs can be different. + // Entry example: + // 0: offset = 0x00000000, type_offset = 0x0000001e, type_signature = + // 0x418503b8111e9a7b Spec says " triplet, the first value is the CU offset, + // the second value is the type offset in the CU, and the third value is the + // type signature" Looking at what is being generated by gdb-add-index. 
The + // first entry is TU offset, second entry is offset from it, and third entry + // is the type signature. + if (TUListSize) + for (const GDBIndexTUEntry &Entry : getGDBIndexTUEntryVector()) { + write64le(Buffer, Entry.UnitOffset); + write64le(Buffer + 8, Entry.TypeDIERelativeOffset); + write64le(Buffer + 16, Entry.TypeHash); + Buffer += sizeof(GDBIndexTUEntry); + } + + // Generate new address table. + for (const std::pair<const uint64_t, DebugAddressRangesVector> &CURangesPair : + ARangesSectionWriter.getCUAddressRanges()) { + const uint32_t CUIndex = OffsetToIndexMap[CURangesPair.first]; + const DebugAddressRangesVector &Ranges = CURangesPair.second; + for (const DebugAddressRange &Range : Ranges) { + write64le(Buffer, Range.LowPC); + write64le(Buffer + 8, Range.HighPC); + write32le(Buffer + 16, CUIndex); + Buffer += 20; + } + } + + const size_t TrailingSize = + GdbIndexContents.data() + GdbIndexContents.size() - Data; + assert(Buffer + TrailingSize == NewGdbIndexContents + NewGdbIndexSize && + "size calculation error"); + + // Copy over the rest of the original data. + memcpy(Buffer, Data, TrailingSize); + + // Register the new section. 
+ BC.registerOrUpdateNoteSection(".gdb_index", NewGdbIndexContents, + NewGdbIndexSize); +} diff --git a/clang-tools-extra/clang-tidy/misc/CMakeLists.txt b/clang-tools-extra/clang-tidy/misc/CMakeLists.txt index 35e29b9..36fcd8f 100644 --- a/clang-tools-extra/clang-tidy/misc/CMakeLists.txt +++ b/clang-tools-extra/clang-tidy/misc/CMakeLists.txt @@ -43,7 +43,6 @@ add_clang_library(clangTidyMiscModule UseAnonymousNamespaceCheck.cpp LINK_LIBS - clangAnalysis clangTidy clangTidyUtils diff --git a/clang-tools-extra/clang-tidy/readability/ContainerSizeEmptyCheck.cpp b/clang-tools-extra/clang-tidy/readability/ContainerSizeEmptyCheck.cpp index bbc1b47..bf7a847 100644 --- a/clang-tools-extra/clang-tidy/readability/ContainerSizeEmptyCheck.cpp +++ b/clang-tools-extra/clang-tidy/readability/ContainerSizeEmptyCheck.cpp @@ -96,9 +96,14 @@ AST_MATCHER(QualType, isIntegralType) { AST_MATCHER_P(UserDefinedLiteral, hasLiteral, clang::ast_matchers::internal::Matcher<Expr>, InnerMatcher) { - if (const Expr *CookedLiteral = Node.getCookedLiteral()) { + const UserDefinedLiteral::LiteralOperatorKind LOK = + Node.getLiteralOperatorKind(); + if (LOK == UserDefinedLiteral::LOK_Template || + LOK == UserDefinedLiteral::LOK_Raw) + return false; + + if (const Expr *CookedLiteral = Node.getCookedLiteral()) return InnerMatcher.matches(*CookedLiteral, Finder, Builder); - } return false; } diff --git a/clang-tools-extra/clang-tidy/readability/RedundantMemberInitCheck.cpp b/clang-tools-extra/clang-tidy/readability/RedundantMemberInitCheck.cpp index 015347e..601ff44 100644 --- a/clang-tools-extra/clang-tidy/readability/RedundantMemberInitCheck.cpp +++ b/clang-tools-extra/clang-tidy/readability/RedundantMemberInitCheck.cpp @@ -41,25 +41,35 @@ void RedundantMemberInitCheck::storeOptions(ClangTidyOptions::OptionMap &Opts) { void RedundantMemberInitCheck::registerMatchers(MatchFinder *Finder) { auto ConstructorMatcher = - cxxConstructExpr(argumentCountIs(0), - 
hasDeclaration(cxxConstructorDecl(ofClass(cxxRecordDecl( - unless(isTriviallyDefaultConstructible())))))) + cxxConstructExpr( + argumentCountIs(0), + hasDeclaration(cxxConstructorDecl( + ofClass(cxxRecordDecl(unless(isTriviallyDefaultConstructible())) + .bind("class"))))) .bind("construct"); + auto HasUnionAsParent = hasParent(recordDecl(isUnion())); + + auto HasTypeEqualToConstructorClass = hasType(qualType( + hasCanonicalType(qualType(hasDeclaration(equalsBoundNode("class")))))); + Finder->addMatcher( cxxConstructorDecl( unless(isDelegatingConstructor()), ofClass(unless(isUnion())), forEachConstructorInitializer( - cxxCtorInitializer(withInitializer(ConstructorMatcher), - unless(forField(fieldDecl( - anyOf(hasType(isConstQualified()), - hasParent(recordDecl(isUnion()))))))) + cxxCtorInitializer( + withInitializer(ConstructorMatcher), + anyOf(isBaseInitializer(), + forField(fieldDecl(unless(hasType(isConstQualified())), + unless(HasUnionAsParent), + HasTypeEqualToConstructorClass)))) .bind("init"))) .bind("constructor"), this); Finder->addMatcher(fieldDecl(hasInClassInitializer(ConstructorMatcher), - unless(hasParent(recordDecl(isUnion())))) + HasTypeEqualToConstructorClass, + unless(HasUnionAsParent)) .bind("field"), this); } diff --git a/clang-tools-extra/docs/ReleaseNotes.rst b/clang-tools-extra/docs/ReleaseNotes.rst index 33b65ca..661b2b1 100644 --- a/clang-tools-extra/docs/ReleaseNotes.rst +++ b/clang-tools-extra/docs/ReleaseNotes.rst @@ -376,6 +376,7 @@ Changes in existing checks - Improved :doc:`readability-container-size-empty <clang-tidy/checks/readability/container-size-empty>` check to prevent false positives when utilizing ``size`` or ``length`` methods that accept parameter. + Fixed crash when facing template user defined literals. 
- Improved :doc:`readability-duplicate-include <clang-tidy/checks/readability/duplicate-include>` check by excluding include @@ -403,6 +404,11 @@ Changes in existing checks <clang-tidy/checks/readability/redundant-inline-specifier>` check to properly emit warnings for static data member with an in-class initializer. +- Improved :doc:`readability-redundant-member-init + <clang-tidy/checks/readability/redundant-member-init>` check to avoid + false-positives when type of the member does not match the type of the + initializer. + - Improved :doc:`readability-static-accessed-through-instance <clang-tidy/checks/readability/static-accessed-through-instance>` check to support calls to overloaded operators as base expression and provide fixes to diff --git a/clang-tools-extra/test/clang-tidy/checkers/readability/container-size-empty.cpp b/clang-tools-extra/test/clang-tidy/checkers/readability/container-size-empty.cpp index ecaf97f..4675527 100644 --- a/clang-tools-extra/test/clang-tidy/checkers/readability/container-size-empty.cpp +++ b/clang-tools-extra/test/clang-tidy/checkers/readability/container-size-empty.cpp @@ -889,3 +889,9 @@ namespace PR88203 { // CHECK-FIXES: {{^ }}if (s.empty()) {}{{$}} } } + +namespace PR94454 { + template <char...> + int operator""_ci() { return 0; } + auto eq = 0_ci == 0; +} diff --git a/clang-tools-extra/test/clang-tidy/checkers/readability/redundant-member-init.cpp b/clang-tools-extra/test/clang-tidy/checkers/readability/redundant-member-init.cpp index 17b2714..6f18a60 100644 --- a/clang-tools-extra/test/clang-tidy/checkers/readability/redundant-member-init.cpp +++ b/clang-tools-extra/test/clang-tidy/checkers/readability/redundant-member-init.cpp @@ -302,3 +302,19 @@ struct D7 { D7<int> d7i; D7<S> d7s; + +struct SS { + SS() = default; + SS(S s) : s(s) {} + + S s; +}; + +struct D8 { + SS ss = S(); +}; + +struct D9 { + D9() : ss(S()) {} + SS ss; +}; diff --git a/clang/docs/LanguageExtensions.rst b/clang/docs/LanguageExtensions.rst index 
46f99d0..a49e412 100644 --- a/clang/docs/LanguageExtensions.rst +++ b/clang/docs/LanguageExtensions.rst @@ -4016,6 +4016,30 @@ Note that the `size` argument must be a compile time constant. Note that this intrinsic cannot yet be called in a ``constexpr`` context. +``__is_bitwise_cloneable`` +-------------------------- + +A type trait is used to check whether a type can be safely copied by memcpy. + +**Syntax**: + +.. code-block:: c++ + + bool __is_bitwise_cloneable(Type) + +**Description**: + +Objects of bitwise cloneable types can be bitwise copied by memcpy/memmove. The +Clang compiler warrants that this behavior is well defined, and won't be +broken by compiler optimizations and sanitizers. + +For implicit-lifetime types, the lifetime of the new object is implicitly +started after the copy. For other types (e.g., classes with virtual methods), +the lifetime isn't started, and using the object results in undefined behavior +according to the C++ Standard. + +This builtin can be used in constant expressions. + Atomic Min/Max builtins with memory ordering -------------------------------------------- diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst index 69ac081..b9c9070 100644 --- a/clang/docs/ReleaseNotes.rst +++ b/clang/docs/ReleaseNotes.rst @@ -340,6 +340,9 @@ Non-comprehensive list of changes in this release ``-Winvalid-constexpr`` is not enabled for the function definition, which should result in mild compile-time performance improvements. +- Added ``__is_bitwise_cloneable`` which is used to check whether a type + can be safely copied by memcpy/memmove. 
+ New Compiler Flags ------------------ - ``-fsanitize=implicit-bitfield-conversion`` checks implicit truncation and diff --git a/clang/include/clang/AST/OpenACCClause.h b/clang/include/clang/AST/OpenACCClause.h index a4c82cd..ea1ffbc 100644 --- a/clang/include/clang/AST/OpenACCClause.h +++ b/clang/include/clang/AST/OpenACCClause.h @@ -867,7 +867,7 @@ public: case OpenACCClauseKind::CLAUSE_NAME: \ Visit##CLAUSE_NAME##Clause(*cast<OpenACC##CLAUSE_NAME##Clause>(C)); \ return; -#define CLAUSE_ALIAS(ALIAS_NAME, CLAUSE_NAME) \ +#define CLAUSE_ALIAS(ALIAS_NAME, CLAUSE_NAME, DEPRECATED) \ case OpenACCClauseKind::ALIAS_NAME: \ Visit##CLAUSE_NAME##Clause(*cast<OpenACC##CLAUSE_NAME##Clause>(C)); \ return; diff --git a/clang/include/clang/AST/Type.h b/clang/include/clang/AST/Type.h index 263b632df..9eb3f6c 100644 --- a/clang/include/clang/AST/Type.h +++ b/clang/include/clang/AST/Type.h @@ -1120,6 +1120,20 @@ public: /// Return true if this is a trivially copyable type (C++0x [basic.types]p9) bool isTriviallyCopyableType(const ASTContext &Context) const; + /// Return true if the type is safe to bitwise copy using memcpy/memmove. + /// + /// This is an extension in clang: bitwise cloneable types act as trivially + /// copyable types, meaning their underlying bytes can be safely copied by + /// memcpy or memmove. After the copy, the destination object has the same + /// object representation. + /// + /// However, there are cases where it is not safe to copy: + /// - When sanitizers, such as AddressSanitizer, add padding with poison, + /// which can cause issues if those poisoned padding bits are accessed. + /// - Types with Objective-C lifetimes, where specific runtime + /// semantics may not be preserved during a bitwise copy. 
+ bool isBitwiseCloneableType(const ASTContext &Context) const; + /// Return true if this is a trivially copyable type bool isTriviallyCopyConstructibleType(const ASTContext &Context) const; diff --git a/clang/include/clang/Basic/BuiltinsAMDGPU.def b/clang/include/clang/Basic/BuiltinsAMDGPU.def index 433c779..9e6800e 100644 --- a/clang/include/clang/Basic/BuiltinsAMDGPU.def +++ b/clang/include/clang/Basic/BuiltinsAMDGPU.def @@ -240,7 +240,7 @@ TARGET_BUILTIN(__builtin_amdgcn_flat_atomic_fadd_v2bf16, "V2sV2s*0V2s", "t", "at TARGET_BUILTIN(__builtin_amdgcn_global_atomic_fadd_v2bf16, "V2sV2s*1V2s", "t", "atomic-global-pk-add-bf16-inst") TARGET_BUILTIN(__builtin_amdgcn_ds_atomic_fadd_v2bf16, "V2sV2s*3V2s", "t", "atomic-ds-pk-add-16-insts") TARGET_BUILTIN(__builtin_amdgcn_ds_atomic_fadd_v2f16, "V2hV2h*3V2h", "t", "atomic-ds-pk-add-16-insts") -TARGET_BUILTIN(__builtin_amdgcn_global_load_lds, "vv*1v*3UiiUi", "t", "gfx940-insts") +TARGET_BUILTIN(__builtin_amdgcn_global_load_lds, "vv*1v*3IUiIiIUi", "t", "gfx940-insts") //===----------------------------------------------------------------------===// // Deep learning builtins. 
diff --git a/clang/include/clang/Basic/Cuda.h b/clang/include/clang/Basic/Cuda.h index d15171d..0d5e38e 100644 --- a/clang/include/clang/Basic/Cuda.h +++ b/clang/include/clang/Basic/Cuda.h @@ -124,6 +124,7 @@ enum class CudaArch { GFX1103, GFX1150, GFX1151, + GFX1152, GFX12_GENERIC, GFX1200, GFX1201, diff --git a/clang/include/clang/Basic/DiagnosticSemaKinds.td b/clang/include/clang/Basic/DiagnosticSemaKinds.td index 8774514..9f0b6f5 100644 --- a/clang/include/clang/Basic/DiagnosticSemaKinds.td +++ b/clang/include/clang/Basic/DiagnosticSemaKinds.td @@ -10082,6 +10082,12 @@ def warn_new_dangling_initializer_list : Warning< "the allocated initializer list}0 " "will be destroyed at the end of the full-expression">, InGroup<DanglingInitializerList>; +def warn_unsupported_lifetime_extension : Warning< + "lifetime extension of " + "%select{temporary|backing array of initializer list}0 created " + "by aggregate initialization using a default member initializer " + "is not yet supported; lifetime of %select{temporary|backing array}0 " + "will end at the end of the full-expression">, InGroup<Dangling>; // For non-floating point, expressions of the form x == x or x != x // should result in a warning, since these always evaluate to a constant. 
diff --git a/clang/include/clang/Basic/OpenACCClauses.def b/clang/include/clang/Basic/OpenACCClauses.def index 53f4cd1..85f4859 100644 --- a/clang/include/clang/Basic/OpenACCClauses.def +++ b/clang/include/clang/Basic/OpenACCClauses.def @@ -15,31 +15,31 @@ // // VISIT_CLAUSE(CLAUSE_NAME) // -// CLAUSE_ALIAS(ALIAS_NAME, CLAUSE_NAME) +// CLAUSE_ALIAS(ALIAS_NAME, CLAUSE_NAME, DEPRECATED) #ifndef CLAUSE_ALIAS -#define CLAUSE_ALIAS(ALIAS_NAME, CLAUSE_NAME) +#define CLAUSE_ALIAS(ALIAS_NAME, CLAUSE_NAME, DEPRECATED) #endif VISIT_CLAUSE(Auto) VISIT_CLAUSE(Async) VISIT_CLAUSE(Attach) VISIT_CLAUSE(Copy) -CLAUSE_ALIAS(PCopy, Copy) -CLAUSE_ALIAS(PresentOrCopy, Copy) +CLAUSE_ALIAS(PCopy, Copy, true) +CLAUSE_ALIAS(PresentOrCopy, Copy, true) VISIT_CLAUSE(CopyIn) -CLAUSE_ALIAS(PCopyIn, CopyIn) -CLAUSE_ALIAS(PresentOrCopyIn, CopyIn) +CLAUSE_ALIAS(PCopyIn, CopyIn, true) +CLAUSE_ALIAS(PresentOrCopyIn, CopyIn, true) VISIT_CLAUSE(CopyOut) -CLAUSE_ALIAS(PCopyOut, CopyOut) -CLAUSE_ALIAS(PresentOrCopyOut, CopyOut) +CLAUSE_ALIAS(PCopyOut, CopyOut, true) +CLAUSE_ALIAS(PresentOrCopyOut, CopyOut, true) VISIT_CLAUSE(Create) -CLAUSE_ALIAS(PCreate, Create) -CLAUSE_ALIAS(PresentOrCreate, Create) +CLAUSE_ALIAS(PCreate, Create, true) +CLAUSE_ALIAS(PresentOrCreate, Create, true) VISIT_CLAUSE(Default) VISIT_CLAUSE(DevicePtr) VISIT_CLAUSE(DeviceType) -CLAUSE_ALIAS(DType, DeviceType) +CLAUSE_ALIAS(DType, DeviceType, false) VISIT_CLAUSE(FirstPrivate) VISIT_CLAUSE(If) VISIT_CLAUSE(Independent) diff --git a/clang/include/clang/Basic/TokenKinds.def b/clang/include/clang/Basic/TokenKinds.def index b5a0e9d..9c4b174 100644 --- a/clang/include/clang/Basic/TokenKinds.def +++ b/clang/include/clang/Basic/TokenKinds.def @@ -542,6 +542,8 @@ TYPE_TRAIT_2(__reference_converts_from_temporary, ReferenceConvertsFromTemporary // is not exposed to users. 
TYPE_TRAIT_2(/*EmptySpellingName*/, IsDeducible, KEYCXX) +TYPE_TRAIT_1(__is_bitwise_cloneable, IsBitwiseCloneable, KEYALL) + // Embarcadero Expression Traits EXPRESSION_TRAIT(__is_lvalue_expr, IsLValueExpr, KEYCXX) EXPRESSION_TRAIT(__is_rvalue_expr, IsRValueExpr, KEYCXX) diff --git a/clang/include/clang/Basic/riscv_vector.td b/clang/include/clang/Basic/riscv_vector.td index cca4367..a0820e2 100644 --- a/clang/include/clang/Basic/riscv_vector.td +++ b/clang/include/clang/Basic/riscv_vector.td @@ -2637,7 +2637,8 @@ let UnMaskedPolicyScheme = HasPassthruOperand in { defm vbrev : RVVOutBuiltinSetZvbb; defm vclz : RVVOutBuiltinSetZvbb; defm vctz : RVVOutBuiltinSetZvbb; - defm vcpopv : RVVOutBuiltinSetZvbb; + let IRName = "vcpopv", MaskedIRName = "vcpopv_mask" in + defm vcpop : RVVOutBuiltinSetZvbb; let OverloadedName = "vwsll" in defm vwsll : RVVSignedWidenBinBuiltinSetVwsll; } diff --git a/clang/include/clang/Lex/DependencyDirectivesScanner.h b/clang/include/clang/Lex/DependencyDirectivesScanner.h index 2f8354d..0e11590 100644 --- a/clang/include/clang/Lex/DependencyDirectivesScanner.h +++ b/clang/include/clang/Lex/DependencyDirectivesScanner.h @@ -17,7 +17,6 @@ #ifndef LLVM_CLANG_LEX_DEPENDENCYDIRECTIVESSCANNER_H #define LLVM_CLANG_LEX_DEPENDENCYDIRECTIVESSCANNER_H -#include "clang/Basic/LangOptions.h" #include "clang/Basic/SourceLocation.h" #include "llvm/ADT/ArrayRef.h" @@ -118,7 +117,7 @@ struct Directive { bool scanSourceForDependencyDirectives( StringRef Input, SmallVectorImpl<dependency_directives_scan::Token> &Tokens, SmallVectorImpl<dependency_directives_scan::Directive> &Directives, - const LangOptions &LangOpts, DiagnosticsEngine *Diags = nullptr, + DiagnosticsEngine *Diags = nullptr, SourceLocation InputSourceLoc = SourceLocation()); /// Print the previously scanned dependency directives as minimized source text. 
diff --git a/clang/include/clang/Tooling/DependencyScanning/DependencyScanningFilesystem.h b/clang/include/clang/Tooling/DependencyScanning/DependencyScanningFilesystem.h index 9dc2006..f7b4510 100644 --- a/clang/include/clang/Tooling/DependencyScanning/DependencyScanningFilesystem.h +++ b/clang/include/clang/Tooling/DependencyScanning/DependencyScanningFilesystem.h @@ -363,8 +363,7 @@ public: /// /// Returns true if the directive tokens are populated for this file entry, /// false if not (i.e. this entry is not a file or its scan fails). - bool ensureDirectiveTokensArePopulated(EntryRef Entry, - const LangOptions &LangOpts); + bool ensureDirectiveTokensArePopulated(EntryRef Entry); /// Check whether \p Path exists. By default checks cached result of \c /// status(), and falls back on FS if unable to do so. diff --git a/clang/lib/AST/CMakeLists.txt b/clang/lib/AST/CMakeLists.txt index 3faefb5..a5d3dac 100644 --- a/clang/lib/AST/CMakeLists.txt +++ b/clang/lib/AST/CMakeLists.txt @@ -87,6 +87,7 @@ add_clang_library(clangAST Interp/Record.cpp Interp/Source.cpp Interp/State.cpp + Interp/MemberPointer.cpp Interp/InterpShared.cpp ItaniumCXXABI.cpp ItaniumMangle.cpp diff --git a/clang/lib/AST/Interp/ByteCodeExprGen.cpp b/clang/lib/AST/Interp/ByteCodeExprGen.cpp index 3671c41..d124248 100644 --- a/clang/lib/AST/Interp/ByteCodeExprGen.cpp +++ b/clang/lib/AST/Interp/ByteCodeExprGen.cpp @@ -100,6 +100,35 @@ bool ByteCodeExprGen<Emitter>::VisitCastExpr(const CastExpr *CE) { return this->emitMemcpy(CE); } + case CK_DerivedToBaseMemberPointer: { + assert(classifyPrim(CE->getType()) == PT_MemberPtr); + assert(classifyPrim(SubExpr->getType()) == PT_MemberPtr); + const auto *FromMP = SubExpr->getType()->getAs<MemberPointerType>(); + const auto *ToMP = CE->getType()->getAs<MemberPointerType>(); + + unsigned DerivedOffset = collectBaseOffset(QualType(ToMP->getClass(), 0), + QualType(FromMP->getClass(), 0)); + + if (!this->visit(SubExpr)) + return false; + + return 
this->emitGetMemberPtrBasePop(DerivedOffset, CE); + } + + case CK_BaseToDerivedMemberPointer: { + assert(classifyPrim(CE) == PT_MemberPtr); + assert(classifyPrim(SubExpr) == PT_MemberPtr); + const auto *FromMP = SubExpr->getType()->getAs<MemberPointerType>(); + const auto *ToMP = CE->getType()->getAs<MemberPointerType>(); + + unsigned DerivedOffset = collectBaseOffset(QualType(FromMP->getClass(), 0), + QualType(ToMP->getClass(), 0)); + + if (!this->visit(SubExpr)) + return false; + return this->emitGetMemberPtrBasePop(-DerivedOffset, CE); + } + case CK_UncheckedDerivedToBase: case CK_DerivedToBase: { if (!this->visit(SubExpr)) @@ -187,7 +216,8 @@ bool ByteCodeExprGen<Emitter>::VisitCastExpr(const CastExpr *CE) { return this->emitCastFloatingIntegral(*ToT, CE); } - case CK_NullToPointer: { + case CK_NullToPointer: + case CK_NullToMemberPointer: { if (DiscardResult) return true; @@ -326,7 +356,8 @@ bool ByteCodeExprGen<Emitter>::VisitCastExpr(const CastExpr *CE) { return this->emitCast(*FromT, *ToT, CE); } - case CK_PointerToBoolean: { + case CK_PointerToBoolean: + case CK_MemberPointerToBoolean: { PrimType PtrT = classifyPrim(SubExpr->getType()); // Just emit p != nullptr for this. @@ -534,8 +565,23 @@ bool ByteCodeExprGen<Emitter>::VisitBinaryOperator(const BinaryOperator *BO) { BO->isComparisonOp()) return this->emitComplexComparison(LHS, RHS, BO); - if (BO->isPtrMemOp()) - return this->visit(RHS); + if (BO->isPtrMemOp()) { + if (!this->visit(LHS)) + return false; + + if (!this->visit(RHS)) + return false; + + if (!this->emitToMemberPtr(BO)) + return false; + + if (classifyPrim(BO) == PT_MemberPtr) + return true; + + if (!this->emitCastMemberPtrPtr(BO)) + return false; + return DiscardResult ? this->emitPopPtr(BO) : true; + } // Typecheck the args. 
std::optional<PrimType> LT = classify(LHS->getType()); @@ -2773,6 +2819,8 @@ bool ByteCodeExprGen<Emitter>::visitZeroInitializer(PrimType T, QualType QT, return this->emitNullPtr(nullptr, E); case PT_FnPtr: return this->emitNullFnPtr(nullptr, E); + case PT_MemberPtr: + return this->emitNullMemberPtr(nullptr, E); case PT_Float: { return this->emitConstFloat(APFloat::getZero(Ctx.getFloatSemantics(QT)), E); } @@ -2875,6 +2923,7 @@ bool ByteCodeExprGen<Emitter>::emitConst(T Value, PrimType Ty, const Expr *E) { return this->emitConstBool(Value, E); case PT_Ptr: case PT_FnPtr: + case PT_MemberPtr: case PT_Float: case PT_IntAP: case PT_IntAPS: @@ -3188,7 +3237,7 @@ bool ByteCodeExprGen<Emitter>::visitAPValueInitializer(const APValue &Val, const APValue &F = Val.getStructField(I); const Record::Field *RF = R->getField(I); - if (F.isInt()) { + if (F.isInt() || F.isLValue()) { PrimType T = classifyPrim(RF->Decl->getType()); if (!this->visitAPValue(F, T, E)) return false; @@ -3308,10 +3357,27 @@ bool ByteCodeExprGen<Emitter>::VisitCallExpr(const CallExpr *E) { } } + std::optional<unsigned> CalleeOffset; // Add the (optional, implicit) This pointer. if (const auto *MC = dyn_cast<CXXMemberCallExpr>(E)) { - if (!this->visit(MC->getImplicitObjectArgument())) + if (!FuncDecl && classifyPrim(E->getCallee()) == PT_MemberPtr) { + // If we end up creating a CallPtr op for this, we need the base of the + // member pointer as the instance pointer, and later extract the function + // decl as the function pointer. 
+ const Expr *Callee = E->getCallee(); + CalleeOffset = + this->allocateLocalPrimitive(Callee, PT_MemberPtr, true, false); + if (!this->visit(Callee)) + return false; + if (!this->emitSetLocal(PT_MemberPtr, *CalleeOffset, E)) + return false; + if (!this->emitGetLocal(PT_MemberPtr, *CalleeOffset, E)) + return false; + if (!this->emitGetMemberPtrBase(E)) + return false; + } else if (!this->visit(MC->getImplicitObjectArgument())) { return false; + } } llvm::BitVector NonNullArgs = collectNonNullArgs(FuncDecl, Args); @@ -3380,11 +3446,22 @@ bool ByteCodeExprGen<Emitter>::VisitCallExpr(const CallExpr *E) { for (unsigned I = 0, N = E->getNumArgs(); I != N; ++I) ArgSize += align(primSize(classify(E->getArg(I)).value_or(PT_Ptr))); - if (!this->visit(E->getCallee())) - return false; + // Get the callee, either from a member pointer saved in CalleeOffset, + // or by just visiting the Callee expr. + if (CalleeOffset) { + if (!this->emitGetLocal(PT_MemberPtr, *CalleeOffset, E)) + return false; + if (!this->emitGetMemberPtrDecl(E)) + return false; + if (!this->emitCallPtr(ArgSize, E, E)) + return false; + } else { + if (!this->visit(E->getCallee())) + return false; - if (!this->emitCallPtr(ArgSize, E, E)) - return false; + if (!this->emitCallPtr(ArgSize, E, E)) + return false; + } } // Cleanup for discarded return values. @@ -3623,6 +3700,11 @@ bool ByteCodeExprGen<Emitter>::VisitUnaryOperator(const UnaryOperator *E) { return false; return DiscardResult ? this->emitPop(*T, E) : true; case UO_AddrOf: // &x + if (E->getType()->isMemberPointerType()) { + // C++11 [expr.unary.op]p3 has very strict rules on how the address of a + // member can be formed. + return this->emitGetMemberPtr(cast<DeclRefExpr>(SubExpr)->getDecl(), E); + } // We should already have a pointer when we get here. 
return this->delegate(SubExpr); case UO_Deref: // *x diff --git a/clang/lib/AST/Interp/Context.cpp b/clang/lib/AST/Interp/Context.cpp index b0b22b0..98d1837 100644 --- a/clang/lib/AST/Interp/Context.cpp +++ b/clang/lib/AST/Interp/Context.cpp @@ -163,8 +163,12 @@ std::optional<PrimType> Context::classify(QualType T) const { if (T->isFloatingType()) return PT_Float; + if (T->isSpecificBuiltinType(BuiltinType::BoundMember) || + T->isMemberPointerType()) + return PT_MemberPtr; + if (T->isFunctionPointerType() || T->isFunctionReferenceType() || - T->isFunctionType() || T->isSpecificBuiltinType(BuiltinType::BoundMember)) + T->isFunctionType()) return PT_FnPtr; if (T->isReferenceType() || T->isPointerType() || @@ -177,9 +181,6 @@ std::optional<PrimType> Context::classify(QualType T) const { if (const auto *DT = dyn_cast<DecltypeType>(T)) return classify(DT->getUnderlyingType()); - if (const auto *DT = dyn_cast<MemberPointerType>(T)) - return classify(DT->getPointeeType()); - return std::nullopt; } @@ -292,10 +293,12 @@ unsigned Context::collectBaseOffset(const RecordDecl *BaseDecl, } if (CurDecl == FinalDecl) break; - - // break; } assert(OffsetSum > 0); return OffsetSum; } + +const Record *Context::getRecord(const RecordDecl *D) const { + return P->getOrCreateRecord(D); +} diff --git a/clang/lib/AST/Interp/Context.h b/clang/lib/AST/Interp/Context.h index 360e949..c78dc9a 100644 --- a/clang/lib/AST/Interp/Context.h +++ b/clang/lib/AST/Interp/Context.h @@ -107,6 +107,8 @@ public: unsigned collectBaseOffset(const RecordDecl *BaseDecl, const RecordDecl *DerivedDecl) const; + const Record *getRecord(const RecordDecl *D) const; + private: /// Runs a function. 
bool Run(State &Parent, const Function *Func, APValue &Result); diff --git a/clang/lib/AST/Interp/Descriptor.cpp b/clang/lib/AST/Interp/Descriptor.cpp index 746b765..d20ab13 100644 --- a/clang/lib/AST/Interp/Descriptor.cpp +++ b/clang/lib/AST/Interp/Descriptor.cpp @@ -11,6 +11,7 @@ #include "Floating.h" #include "FunctionPointer.h" #include "IntegralAP.h" +#include "MemberPointer.h" #include "Pointer.h" #include "PrimType.h" #include "Record.h" diff --git a/clang/lib/AST/Interp/Disasm.cpp b/clang/lib/AST/Interp/Disasm.cpp index 3f8a92e..0ab84d1 100644 --- a/clang/lib/AST/Interp/Disasm.cpp +++ b/clang/lib/AST/Interp/Disasm.cpp @@ -19,6 +19,7 @@ #include "Integral.h" #include "IntegralAP.h" #include "InterpFrame.h" +#include "MemberPointer.h" #include "Opcode.h" #include "PrimType.h" #include "Program.h" @@ -122,6 +123,8 @@ static const char *primTypeToString(PrimType T) { return "Ptr"; case PT_FnPtr: return "FnPtr"; + case PT_MemberPtr: + return "MemberPtr"; } llvm_unreachable("Unhandled PrimType"); } diff --git a/clang/lib/AST/Interp/Function.cpp b/clang/lib/AST/Interp/Function.cpp index 1d04998..00f5a1f 100644 --- a/clang/lib/AST/Interp/Function.cpp +++ b/clang/lib/AST/Interp/Function.cpp @@ -40,7 +40,8 @@ SourceInfo Function::getSource(CodePtr PC) const { unsigned Offset = PC - getCodeBegin(); using Elem = std::pair<unsigned, SourceInfo>; auto It = llvm::lower_bound(SrcMap, Elem{Offset, {}}, llvm::less_first()); - assert(It != SrcMap.end()); + if (It == SrcMap.end()) + return SrcMap.back().second; return It->second; } diff --git a/clang/lib/AST/Interp/Interp.cpp b/clang/lib/AST/Interp/Interp.cpp index 145fa65..49015b1 100644 --- a/clang/lib/AST/Interp/Interp.cpp +++ b/clang/lib/AST/Interp/Interp.cpp @@ -373,6 +373,26 @@ bool CheckSubobject(InterpState &S, CodePtr OpPC, const Pointer &Ptr, return false; } +bool CheckDowncast(InterpState &S, CodePtr OpPC, const Pointer &Ptr, + uint32_t Offset) { + uint32_t MinOffset = Ptr.getDeclDesc()->getMetadataSize(); + 
uint32_t PtrOffset = Ptr.getByteOffset(); + + // We subtract Offset from PtrOffset. The result must be at least + // MinOffset. + if (Offset < PtrOffset && (PtrOffset - Offset) >= MinOffset) + return true; + + const auto *E = cast<CastExpr>(S.Current->getExpr(OpPC)); + QualType TargetQT = E->getType()->getPointeeType(); + QualType MostDerivedQT = Ptr.getDeclPtr().getType(); + + S.CCEDiag(E, diag::note_constexpr_invalid_downcast) + << MostDerivedQT << TargetQT; + + return false; +} + bool CheckConst(InterpState &S, CodePtr OpPC, const Pointer &Ptr) { assert(Ptr.isLive() && "Pointer is not live"); if (!Ptr.isConst()) @@ -493,10 +513,12 @@ bool CheckStore(InterpState &S, CodePtr OpPC, const Pointer &Ptr) { bool CheckInvoke(InterpState &S, CodePtr OpPC, const Pointer &Ptr) { if (!CheckLive(S, OpPC, Ptr, AK_MemberCall)) return false; - if (!CheckExtern(S, OpPC, Ptr)) - return false; - if (!CheckRange(S, OpPC, Ptr, AK_MemberCall)) - return false; + if (!Ptr.isDummy()) { + if (!CheckExtern(S, OpPC, Ptr)) + return false; + if (!CheckRange(S, OpPC, Ptr, AK_MemberCall)) + return false; + } return true; } @@ -516,7 +538,7 @@ bool CheckCallable(InterpState &S, CodePtr OpPC, const Function *F) { return false; } - if (!F->isConstexpr()) { + if (!F->isConstexpr() || !F->hasBody()) { const SourceLocation &Loc = S.Current->getLocation(OpPC); if (S.getLangOpts().CPlusPlus11) { const FunctionDecl *DiagDecl = F->getDecl(); @@ -550,9 +572,10 @@ bool CheckCallable(InterpState &S, CodePtr OpPC, const Function *F) { S.checkingPotentialConstantExpression()) return false; - // If the declaration is defined _and_ declared 'constexpr', the below - // diagnostic doesn't add anything useful. - if (DiagDecl->isDefined() && DiagDecl->isConstexpr()) + // If the declaration is defined, declared 'constexpr' _and_ has a body, + // the below diagnostic doesn't add anything useful. 
+ if (DiagDecl->isDefined() && DiagDecl->isConstexpr() && + DiagDecl->hasBody()) return false; S.FFDiag(Loc, diag::note_constexpr_invalid_function, 1) diff --git a/clang/lib/AST/Interp/Interp.h b/clang/lib/AST/Interp/Interp.h index eca1792..98caea5 100644 --- a/clang/lib/AST/Interp/Interp.h +++ b/clang/lib/AST/Interp/Interp.h @@ -20,6 +20,7 @@ #include "InterpFrame.h" #include "InterpStack.h" #include "InterpState.h" +#include "MemberPointer.h" #include "Opcode.h" #include "PrimType.h" #include "Program.h" @@ -75,6 +76,11 @@ bool CheckRange(InterpState &S, CodePtr OpPC, const Pointer &Ptr, bool CheckSubobject(InterpState &S, CodePtr OpPC, const Pointer &Ptr, CheckSubobjectKind CSK); +/// Checks if the dowcast using the given offset is possible with the given +/// pointer. +bool CheckDowncast(InterpState &S, CodePtr OpPC, const Pointer &Ptr, + uint32_t Offset); + /// Checks if a pointer points to const storage. bool CheckConst(InterpState &S, CodePtr OpPC, const Pointer &Ptr); @@ -725,6 +731,9 @@ using CompareFn = llvm::function_ref<bool(ComparisonCategoryResult)>; template <typename T> bool CmpHelper(InterpState &S, CodePtr OpPC, CompareFn Fn) { + assert((!std::is_same_v<T, MemberPointer>) && + "Non-equality comparisons on member pointer types should already be " + "rejected in Sema."); using BoolT = PrimConv<PT_Bool>::T; const T &RHS = S.Stk.pop<T>(); const T &LHS = S.Stk.pop<T>(); @@ -834,6 +843,47 @@ inline bool CmpHelperEQ<Pointer>(InterpState &S, CodePtr OpPC, CompareFn Fn) { } } +template <> +inline bool CmpHelperEQ<MemberPointer>(InterpState &S, CodePtr OpPC, + CompareFn Fn) { + const auto &RHS = S.Stk.pop<MemberPointer>(); + const auto &LHS = S.Stk.pop<MemberPointer>(); + + // If either operand is a pointer to a weak function, the comparison is not + // constant. 
+ for (const auto &MP : {LHS, RHS}) { + if (const CXXMethodDecl *MD = MP.getMemberFunction(); MD && MD->isWeak()) { + const SourceInfo &Loc = S.Current->getSource(OpPC); + S.FFDiag(Loc, diag::note_constexpr_mem_pointer_weak_comparison) << MD; + return false; + } + } + + // C++11 [expr.eq]p2: + // If both operands are null, they compare equal. Otherwise if only one is + // null, they compare unequal. + if (LHS.isZero() && RHS.isZero()) { + S.Stk.push<Boolean>(Fn(ComparisonCategoryResult::Equal)); + return true; + } + if (LHS.isZero() || RHS.isZero()) { + S.Stk.push<Boolean>(Fn(ComparisonCategoryResult::Unordered)); + return true; + } + + // We cannot compare against virtual declarations at compile time. + for (const auto &MP : {LHS, RHS}) { + if (const CXXMethodDecl *MD = MP.getMemberFunction(); + MD && MD->isVirtual()) { + const SourceInfo &Loc = S.Current->getSource(OpPC); + S.CCEDiag(Loc, diag::note_constexpr_compare_virtual_mem_ptr) << MD; + } + } + + S.Stk.push<Boolean>(Boolean::from(Fn(LHS.compare(RHS)))); + return true; +} + template <PrimType Name, class T = typename PrimConv<Name>::T> bool EQ(InterpState &S, CodePtr OpPC) { return CmpHelperEQ<T>(S, OpPC, [](ComparisonCategoryResult R) { @@ -1300,6 +1350,9 @@ inline bool GetPtrDerivedPop(InterpState &S, CodePtr OpPC, uint32_t Off) { return false; if (!CheckSubobject(S, OpPC, Ptr, CSK_Derived)) return false; + if (!CheckDowncast(S, OpPC, Ptr, Off)) + return false; + S.Stk.push<Pointer>(Ptr.atFieldSub(Off)); return true; } @@ -1324,6 +1377,12 @@ inline bool GetPtrBasePop(InterpState &S, CodePtr OpPC, uint32_t Off) { return true; } +inline bool GetMemberPtrBasePop(InterpState &S, CodePtr OpPC, int32_t Off) { + const auto &Ptr = S.Stk.pop<MemberPointer>(); + S.Stk.push<MemberPointer>(Ptr.atInstanceBase(Off)); + return true; +} + inline bool GetPtrThisBase(InterpState &S, CodePtr OpPC, uint32_t Off) { if (S.checkingPotentialConstantExpression()) return false; @@ -1532,6 +1591,24 @@ inline bool Memcpy(InterpState 
&S, CodePtr OpPC) { return DoMemcpy(S, OpPC, Src, Dest); } +inline bool ToMemberPtr(InterpState &S, CodePtr OpPC) { + const auto &Member = S.Stk.pop<MemberPointer>(); + const auto &Base = S.Stk.pop<Pointer>(); + + S.Stk.push<MemberPointer>(Member.takeInstance(Base)); + return true; +} + +inline bool CastMemberPtrPtr(InterpState &S, CodePtr OpPC) { + const auto &MP = S.Stk.pop<MemberPointer>(); + + if (std::optional<Pointer> Ptr = MP.toPointer(S.Ctx)) { + S.Stk.push<Pointer>(*Ptr); + return true; + } + return false; +} + //===----------------------------------------------------------------------===// // AddOffset, SubOffset //===----------------------------------------------------------------------===// @@ -1696,8 +1773,10 @@ inline bool SubPtr(InterpState &S, CodePtr OpPC) { return true; } - T A = T::from(LHS.getIndex()); - T B = T::from(RHS.getIndex()); + T A = LHS.isElementPastEnd() ? T::from(LHS.getNumElems()) + : T::from(LHS.getIndex()); + T B = RHS.isElementPastEnd() ? T::from(RHS.getNumElems()) + : T::from(RHS.getIndex()); return AddSubMulHelper<T, T::sub, std::minus>(S, OpPC, A.bitWidth(), A, B); } @@ -2115,7 +2194,7 @@ inline bool ArrayDecay(InterpState &S, CodePtr OpPC) { if (!CheckRange(S, OpPC, Ptr, CSK_ArrayToPointer)) return false; - if (!Ptr.isUnknownSizeArray() || Ptr.isDummy()) { + if (Ptr.isRoot() || !Ptr.isUnknownSizeArray() || Ptr.isDummy()) { S.Stk.push<Pointer>(Ptr.atIndex(0)); return true; } @@ -2329,6 +2408,28 @@ inline bool GetIntPtr(InterpState &S, CodePtr OpPC, const Descriptor *Desc) { return true; } +inline bool GetMemberPtr(InterpState &S, CodePtr OpPC, const Decl *D) { + S.Stk.push<MemberPointer>(D); + return true; +} + +inline bool GetMemberPtrBase(InterpState &S, CodePtr OpPC) { + const auto &MP = S.Stk.pop<MemberPointer>(); + + S.Stk.push<Pointer>(MP.getBase()); + return true; +} + +inline bool GetMemberPtrDecl(InterpState &S, CodePtr OpPC) { + const auto &MP = S.Stk.pop<MemberPointer>(); + + const auto *FD = 
cast<FunctionDecl>(MP.getDecl()); + const auto *Func = S.getContext().getOrCreateFunction(FD); + + S.Stk.push<FunctionPointer>(Func); + return true; +} + /// Just emit a diagnostic. The expression that caused emission of this /// op is not valid in a constant context. inline bool Invalid(InterpState &S, CodePtr OpPC) { diff --git a/clang/lib/AST/Interp/InterpFrame.cpp b/clang/lib/AST/Interp/InterpFrame.cpp index 51b0bd5..54ccf90 100644 --- a/clang/lib/AST/Interp/InterpFrame.cpp +++ b/clang/lib/AST/Interp/InterpFrame.cpp @@ -12,6 +12,7 @@ #include "Function.h" #include "InterpStack.h" #include "InterpState.h" +#include "MemberPointer.h" #include "Pointer.h" #include "PrimType.h" #include "Program.h" diff --git a/clang/lib/AST/Interp/InterpStack.cpp b/clang/lib/AST/Interp/InterpStack.cpp index 91fe40f..c702474 100644 --- a/clang/lib/AST/Interp/InterpStack.cpp +++ b/clang/lib/AST/Interp/InterpStack.cpp @@ -10,6 +10,7 @@ #include "Boolean.h" #include "Floating.h" #include "Integral.h" +#include "MemberPointer.h" #include "Pointer.h" #include <cassert> #include <cstdlib> diff --git a/clang/lib/AST/Interp/InterpStack.h b/clang/lib/AST/Interp/InterpStack.h index 3fd0f63..9d85503 100644 --- a/clang/lib/AST/Interp/InterpStack.h +++ b/clang/lib/AST/Interp/InterpStack.h @@ -15,6 +15,7 @@ #include "FunctionPointer.h" #include "IntegralAP.h" +#include "MemberPointer.h" #include "PrimType.h" #include <memory> #include <vector> @@ -188,6 +189,8 @@ private: return PT_IntAP; else if constexpr (std::is_same_v<T, IntegralAP<false>>) return PT_IntAP; + else if constexpr (std::is_same_v<T, MemberPointer>) + return PT_MemberPtr; llvm_unreachable("unknown type push()'ed into InterpStack"); } diff --git a/clang/lib/AST/Interp/MemberPointer.cpp b/clang/lib/AST/Interp/MemberPointer.cpp new file mode 100644 index 0000000..96f6364 --- /dev/null +++ b/clang/lib/AST/Interp/MemberPointer.cpp @@ -0,0 +1,76 @@ +//===------------------------- MemberPointer.cpp ----------------*- C++ -*-===// +// 
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "MemberPointer.h" +#include "Context.h" +#include "FunctionPointer.h" +#include "Program.h" +#include "Record.h" + +namespace clang { +namespace interp { + +std::optional<Pointer> MemberPointer::toPointer(const Context &Ctx) const { + if (!Dcl || isa<FunctionDecl>(Dcl)) + return Base; + const FieldDecl *FD = cast<FieldDecl>(Dcl); + assert(FD); + + if (!Base.isBlockPointer()) + return std::nullopt; + + Pointer CastedBase = + (PtrOffset < 0 ? Base.atField(-PtrOffset) : Base.atFieldSub(PtrOffset)); + + const Record *BaseRecord = CastedBase.getRecord(); + if (!BaseRecord) + return std::nullopt; + + assert(BaseRecord); + if (FD->getParent() == BaseRecord->getDecl()) + return CastedBase.atField(BaseRecord->getField(FD)->Offset); + + const RecordDecl *FieldParent = FD->getParent(); + const Record *FieldRecord = Ctx.getRecord(FieldParent); + + unsigned Offset = 0; + Offset += FieldRecord->getField(FD)->Offset; + Offset += CastedBase.block()->getDescriptor()->getMetadataSize(); + + if (Offset > CastedBase.block()->getSize()) + return std::nullopt; + + if (const RecordDecl *BaseDecl = Base.getDeclPtr().getRecord()->getDecl(); + BaseDecl != FieldParent) + Offset += Ctx.collectBaseOffset(FieldParent, BaseDecl); + + if (Offset > CastedBase.block()->getSize()) + return std::nullopt; + + assert(Offset <= CastedBase.block()->getSize()); + return Pointer(const_cast<Block *>(Base.block()), Offset, Offset); +} + +FunctionPointer MemberPointer::toFunctionPointer(const Context &Ctx) const { + return FunctionPointer(Ctx.getProgram().getFunction(cast<FunctionDecl>(Dcl))); +} + +APValue MemberPointer::toAPValue() const { + if (isZero()) + return APValue(static_cast<ValueDecl *>(nullptr), 
/*IsDerivedMember=*/false, + /*Path=*/{}); + + if (hasBase()) + return Base.toAPValue(); + + return APValue(cast<ValueDecl>(getDecl()), /*IsDerivedMember=*/false, + /*Path=*/{}); +} + +} // namespace interp +} // namespace clang diff --git a/clang/lib/AST/Interp/MemberPointer.h b/clang/lib/AST/Interp/MemberPointer.h new file mode 100644 index 0000000..5c61f6a --- /dev/null +++ b/clang/lib/AST/Interp/MemberPointer.h @@ -0,0 +1,112 @@ +//===------------------------- MemberPointer.h ------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_CLANG_AST_INTERP_MEMBER_POINTER_H +#define LLVM_CLANG_AST_INTERP_MEMBER_POINTER_H + +#include "Pointer.h" +#include <optional> + +namespace clang { +class ASTContext; +namespace interp { + +class Context; +class FunctionPointer; + +class MemberPointer final { +private: + Pointer Base; + const Decl *Dcl = nullptr; + int32_t PtrOffset = 0; + + MemberPointer(Pointer Base, const Decl *Dcl, int32_t PtrOffset) + : Base(Base), Dcl(Dcl), PtrOffset(PtrOffset) {} + +public: + MemberPointer() = default; + MemberPointer(Pointer Base, const Decl *Dcl) : Base(Base), Dcl(Dcl) {} + MemberPointer(uint32_t Address, const Descriptor *D) { + // We only reach this for Address == 0, when creating a null member pointer. 
+ assert(Address == 0); + } + + MemberPointer(const Decl *D) : Dcl(D) { + assert((isa<FieldDecl, IndirectFieldDecl, CXXMethodDecl>(D))); + } + + uint64_t getIntegerRepresentation() const { + assert( + false && + "getIntegerRepresentation() shouldn't be reachable for MemberPointers"); + return 17; + } + + std::optional<Pointer> toPointer(const Context &Ctx) const; + + FunctionPointer toFunctionPointer(const Context &Ctx) const; + + Pointer getBase() const { + if (PtrOffset < 0) + return Base.atField(-PtrOffset); + return Base.atFieldSub(PtrOffset); + } + bool isMemberFunctionPointer() const { + return isa_and_nonnull<CXXMethodDecl>(Dcl); + } + const CXXMethodDecl *getMemberFunction() const { + return dyn_cast_if_present<CXXMethodDecl>(Dcl); + } + const FieldDecl *getField() const { + return dyn_cast_if_present<FieldDecl>(Dcl); + } + + bool hasDecl() const { return Dcl; } + const Decl *getDecl() const { return Dcl; } + + MemberPointer atInstanceBase(unsigned Offset) const { + if (Base.isZero()) + return MemberPointer(Base, Dcl, Offset); + return MemberPointer(this->Base, Dcl, Offset + PtrOffset); + } + + MemberPointer takeInstance(Pointer Instance) const { + assert(this->Base.isZero()); + return MemberPointer(Instance, this->Dcl, this->PtrOffset); + } + + APValue toAPValue() const; + + bool isZero() const { return Base.isZero() && !Dcl; } + bool hasBase() const { return !Base.isZero(); } + + void print(llvm::raw_ostream &OS) const { + OS << "MemberPtr(" << Base << " " << (void *)Dcl << " + " << PtrOffset + << ")"; + } + + std::string toDiagnosticString(const ASTContext &Ctx) const { + return "FIXME"; + } + + ComparisonCategoryResult compare(const MemberPointer &RHS) const { + if (this->Dcl == RHS.Dcl) + return ComparisonCategoryResult::Equal; + return ComparisonCategoryResult::Unordered; + } +}; + +inline llvm::raw_ostream &operator<<(llvm::raw_ostream &OS, MemberPointer FP) { + FP.print(OS); + return OS; +} + +} // namespace interp +} // namespace clang + +#endif 
diff --git a/clang/lib/AST/Interp/Opcodes.td b/clang/lib/AST/Interp/Opcodes.td index cfbd7f93..cb4f299 100644 --- a/clang/lib/AST/Interp/Opcodes.td +++ b/clang/lib/AST/Interp/Opcodes.td @@ -30,6 +30,7 @@ def IntAPS : Type; def Float : Type; def Ptr : Type; def FnPtr : Type; +def MemberPtr : Type; //===----------------------------------------------------------------------===// // Types transferred to the interpreter. @@ -61,6 +62,7 @@ def ArgOffsetOfExpr : ArgType { let Name = "const OffsetOfExpr *"; } def ArgDeclRef : ArgType { let Name = "const DeclRefExpr *"; } def ArgDesc : ArgType { let Name = "const Descriptor *"; } def ArgCCI : ArgType { let Name = "const ComparisonCategoryInfo *"; } +def ArgDecl : ArgType { let Name = "const Decl*"; } //===----------------------------------------------------------------------===// // Classes of types instructions operate on. @@ -93,7 +95,7 @@ def AluTypeClass : TypeClass { } def PtrTypeClass : TypeClass { - let Types = [Ptr, FnPtr]; + let Types = [Ptr, FnPtr, MemberPtr]; } def BoolTypeClass : TypeClass { @@ -208,7 +210,6 @@ def CallBI : Opcode { def CallPtr : Opcode { let Args = [ArgUint32, ArgCallExpr]; - let Types = []; } def CallVar : Opcode { @@ -327,6 +328,11 @@ def GetPtrBasePop : Opcode { // Offset of field, which is a base. let Args = [ArgUint32]; } +def GetMemberPtrBasePop : Opcode { + // Offset of field, which is a base. + let Args = [ArgSint32]; +} + def FinishInitPop : Opcode; def FinishInit : Opcode; @@ -751,6 +757,14 @@ def CheckNonNullArg : Opcode { def Memcpy : Opcode; +def ToMemberPtr : Opcode; +def CastMemberPtrPtr : Opcode; +def GetMemberPtr : Opcode { + let Args = [ArgDecl]; +} +def GetMemberPtrBase : Opcode; +def GetMemberPtrDecl : Opcode; + //===----------------------------------------------------------------------===// // Debugging. 
//===----------------------------------------------------------------------===// diff --git a/clang/lib/AST/Interp/Pointer.cpp b/clang/lib/AST/Interp/Pointer.cpp index 252f7ea..a60b4d2 100644 --- a/clang/lib/AST/Interp/Pointer.cpp +++ b/clang/lib/AST/Interp/Pointer.cpp @@ -13,6 +13,7 @@ #include "Function.h" #include "Integral.h" #include "InterpBlock.h" +#include "MemberPointer.h" #include "PrimType.h" #include "Record.h" diff --git a/clang/lib/AST/Interp/Pointer.h b/clang/lib/AST/Interp/Pointer.h index 93ca754..c6e4f4d 100644 --- a/clang/lib/AST/Interp/Pointer.h +++ b/clang/lib/AST/Interp/Pointer.h @@ -620,6 +620,7 @@ public: private: friend class Block; friend class DeadBlock; + friend class MemberPointer; friend struct InitMap; Pointer(Block *Pointee, unsigned Base, uint64_t Offset); diff --git a/clang/lib/AST/Interp/PrimType.cpp b/clang/lib/AST/Interp/PrimType.cpp index 9b96dcf..3054e67 100644 --- a/clang/lib/AST/Interp/PrimType.cpp +++ b/clang/lib/AST/Interp/PrimType.cpp @@ -11,6 +11,7 @@ #include "Floating.h" #include "FunctionPointer.h" #include "IntegralAP.h" +#include "MemberPointer.h" #include "Pointer.h" using namespace clang; diff --git a/clang/lib/AST/Interp/PrimType.h b/clang/lib/AST/Interp/PrimType.h index 604fb5d..20fb5e8 100644 --- a/clang/lib/AST/Interp/PrimType.h +++ b/clang/lib/AST/Interp/PrimType.h @@ -25,6 +25,7 @@ class Pointer; class Boolean; class Floating; class FunctionPointer; +class MemberPointer; template <bool Signed> class IntegralAP; template <unsigned Bits, bool Signed> class Integral; @@ -44,10 +45,11 @@ enum PrimType : unsigned { PT_Float = 11, PT_Ptr = 12, PT_FnPtr = 13, + PT_MemberPtr = 14, }; inline constexpr bool isPtrType(PrimType T) { - return T == PT_Ptr || T == PT_FnPtr; + return T == PT_Ptr || T == PT_FnPtr || T == PT_MemberPtr; } enum class CastKind : uint8_t { @@ -91,6 +93,9 @@ template <> struct PrimConv<PT_Ptr> { using T = Pointer; }; template <> struct PrimConv<PT_FnPtr> { using T = FunctionPointer; }; +template <> 
struct PrimConv<PT_MemberPtr> { + using T = MemberPointer; +}; /// Returns the size of a primitive type in bytes. size_t primSize(PrimType Type); @@ -131,6 +136,7 @@ static inline bool aligned(const void *P) { TYPE_SWITCH_CASE(PT_Bool, B) \ TYPE_SWITCH_CASE(PT_Ptr, B) \ TYPE_SWITCH_CASE(PT_FnPtr, B) \ + TYPE_SWITCH_CASE(PT_MemberPtr, B) \ } \ } while (0) diff --git a/clang/lib/AST/OpenACCClause.cpp b/clang/lib/AST/OpenACCClause.cpp index 403ce9a..95089a9 100644 --- a/clang/lib/AST/OpenACCClause.cpp +++ b/clang/lib/AST/OpenACCClause.cpp @@ -104,7 +104,7 @@ OpenACCClause::child_range OpenACCClause::children() { #define VISIT_CLAUSE(CLAUSE_NAME) \ case OpenACCClauseKind::CLAUSE_NAME: \ return cast<OpenACC##CLAUSE_NAME##Clause>(this)->children(); -#define CLAUSE_ALIAS(ALIAS_NAME, CLAUSE_NAME) \ +#define CLAUSE_ALIAS(ALIAS_NAME, CLAUSE_NAME, DEPRECATED) \ case OpenACCClauseKind::ALIAS_NAME: \ return cast<OpenACC##CLAUSE_NAME##Clause>(this)->children(); diff --git a/clang/lib/AST/ParentMap.cpp b/clang/lib/AST/ParentMap.cpp index 534793b..3d6a1cc 100644 --- a/clang/lib/AST/ParentMap.cpp +++ b/clang/lib/AST/ParentMap.cpp @@ -97,22 +97,6 @@ static void BuildParentMap(MapTy& M, Stmt* S, BuildParentMap(M, SubStmt, OVMode); } break; - case Stmt::CXXDefaultArgExprClass: - if (auto *Arg = dyn_cast<CXXDefaultArgExpr>(S)) { - if (Arg->hasRewrittenInit()) { - M[Arg->getExpr()] = S; - BuildParentMap(M, Arg->getExpr(), OVMode); - } - } - break; - case Stmt::CXXDefaultInitExprClass: - if (auto *Init = dyn_cast<CXXDefaultInitExpr>(S)) { - if (Init->hasRewrittenInit()) { - M[Init->getExpr()] = S; - BuildParentMap(M, Init->getExpr(), OVMode); - } - } - break; default: for (Stmt *SubStmt : S->children()) { if (SubStmt) { diff --git a/clang/lib/AST/TextNodeDumper.cpp b/clang/lib/AST/TextNodeDumper.cpp index 8baccee..1076dcd 100644 --- a/clang/lib/AST/TextNodeDumper.cpp +++ b/clang/lib/AST/TextNodeDumper.cpp @@ -958,6 +958,9 @@ void TextNodeDumper::dumpTemplateArgument(const 
TemplateArgument &TA) { } OS << " '" << Str << "'"; + if (!Context) + return; + if (TemplateArgument CanonTA = Context->getCanonicalTemplateArgument(TA); !CanonTA.structurallyEquals(TA)) { llvm::SmallString<128> CanonStr; @@ -1139,15 +1142,17 @@ void TextNodeDumper::dumpTemplateName(TemplateName TN, StringRef Label) { } OS << " '" << Str << "'"; - if (TemplateName CanonTN = Context->getCanonicalTemplateName(TN); - CanonTN != TN) { - llvm::SmallString<128> CanonStr; - { - llvm::raw_svector_ostream SS(CanonStr); - CanonTN.print(SS, PrintPolicy); + if (Context) { + if (TemplateName CanonTN = Context->getCanonicalTemplateName(TN); + CanonTN != TN) { + llvm::SmallString<128> CanonStr; + { + llvm::raw_svector_ostream SS(CanonStr); + CanonTN.print(SS, PrintPolicy); + } + if (CanonStr != Str) + OS << ":'" << CanonStr << "'"; } - if (CanonStr != Str) - OS << ":'" << CanonStr << "'"; } } dumpBareTemplateName(TN); diff --git a/clang/lib/AST/Type.cpp b/clang/lib/AST/Type.cpp index 2097b29..33acae2 100644 --- a/clang/lib/AST/Type.cpp +++ b/clang/lib/AST/Type.cpp @@ -2749,6 +2749,43 @@ bool QualType::isTriviallyCopyableType(const ASTContext &Context) const { /*IsCopyConstructible=*/false); } +// FIXME: each call will trigger a full computation, cache the result. +bool QualType::isBitwiseCloneableType(const ASTContext &Context) const { + auto CanonicalType = getCanonicalType(); + if (CanonicalType.hasNonTrivialObjCLifetime()) + return false; + if (CanonicalType->isArrayType()) + return Context.getBaseElementType(CanonicalType) + .isBitwiseCloneableType(Context); + + if (CanonicalType->isIncompleteType()) + return false; + const auto *RD = CanonicalType->getAsRecordDecl(); // struct/union/class + if (!RD) + return true; + + // Never allow memcpy when we're adding poisoned padding bits to the struct. + // Accessing these posioned bits will trigger false alarms on + // SanitizeAddressFieldPadding etc. 
+ if (RD->mayInsertExtraPadding()) + return false; + + for (auto *const Field : RD->fields()) { + if (!Field->getType().isBitwiseCloneableType(Context)) + return false; + } + + if (const auto *CXXRD = dyn_cast<CXXRecordDecl>(RD)) { + for (auto Base : CXXRD->bases()) + if (!Base.getType().isBitwiseCloneableType(Context)) + return false; + for (auto VBase : CXXRD->vbases()) + if (!VBase.getType().isBitwiseCloneableType(Context)) + return false; + } + return true; +} + bool QualType::isTriviallyCopyConstructibleType( const ASTContext &Context) const { return isTriviallyCopyableTypeImpl(*this, Context, @@ -4444,7 +4481,6 @@ static CachedProperties computeCachedProperties(const Type *T) { #define NON_CANONICAL_UNLESS_DEPENDENT_TYPE(Class,Base) case Type::Class: #include "clang/AST/TypeNodes.inc" // Treat instantiation-dependent types as external. - if (!T->isInstantiationDependentType()) T->dump(); assert(T->isInstantiationDependentType()); return CachedProperties(Linkage::External, false); diff --git a/clang/lib/Analysis/CFG.cpp b/clang/lib/Analysis/CFG.cpp index 0231725..64e6155 100644 --- a/clang/lib/Analysis/CFG.cpp +++ b/clang/lib/Analysis/CFG.cpp @@ -556,10 +556,6 @@ public: private: // Visitors to walk an AST and construct the CFG. 
- CFGBlock *VisitCXXDefaultArgExpr(CXXDefaultArgExpr *Default, - AddStmtChoice asc); - CFGBlock *VisitCXXDefaultInitExpr(CXXDefaultInitExpr *Default, - AddStmtChoice asc); CFGBlock *VisitInitListExpr(InitListExpr *ILE, AddStmtChoice asc); CFGBlock *VisitAddrLabelExpr(AddrLabelExpr *A, AddStmtChoice asc); CFGBlock *VisitAttributedStmt(AttributedStmt *A, AddStmtChoice asc); @@ -2258,10 +2254,16 @@ CFGBlock *CFGBuilder::Visit(Stmt * S, AddStmtChoice asc, asc, ExternallyDestructed); case Stmt::CXXDefaultArgExprClass: - return VisitCXXDefaultArgExpr(cast<CXXDefaultArgExpr>(S), asc); - case Stmt::CXXDefaultInitExprClass: - return VisitCXXDefaultInitExpr(cast<CXXDefaultInitExpr>(S), asc); + // FIXME: The expression inside a CXXDefaultArgExpr is owned by the + // called function's declaration, not by the caller. If we simply add + // this expression to the CFG, we could end up with the same Expr + // appearing multiple times (PR13385). + // + // It's likewise possible for multiple CXXDefaultInitExprs for the same + // expression to be used in the same function (through aggregate + // initialization). + return VisitStmt(S, asc); case Stmt::CXXBindTemporaryExprClass: return VisitCXXBindTemporaryExpr(cast<CXXBindTemporaryExpr>(S), asc); @@ -2431,40 +2433,6 @@ CFGBlock *CFGBuilder::VisitChildren(Stmt *S) { return B; } -CFGBlock *CFGBuilder::VisitCXXDefaultArgExpr(CXXDefaultArgExpr *Arg, - AddStmtChoice asc) { - if (Arg->hasRewrittenInit()) { - if (asc.alwaysAdd(*this, Arg)) { - autoCreateBlock(); - appendStmt(Block, Arg); - } - return VisitStmt(Arg->getExpr(), asc); - } - - // We can't add the default argument if it's not rewritten because the - // expression inside a CXXDefaultArgExpr is owned by the called function's - // declaration, not by the caller, we could end up with the same expression - // appearing multiple times. 
- return VisitStmt(Arg, asc); -} - -CFGBlock *CFGBuilder::VisitCXXDefaultInitExpr(CXXDefaultInitExpr *Init, - AddStmtChoice asc) { - if (Init->hasRewrittenInit()) { - if (asc.alwaysAdd(*this, Init)) { - autoCreateBlock(); - appendStmt(Block, Init); - } - return VisitStmt(Init->getExpr(), asc); - } - - // We can't add the default initializer if it's not rewritten because multiple - // CXXDefaultInitExprs for the same sub-expression to be used in the same - // function (through aggregate initialization). we could end up with the same - // expression appearing multiple times. - return VisitStmt(Init, asc); -} - CFGBlock *CFGBuilder::VisitInitListExpr(InitListExpr *ILE, AddStmtChoice asc) { if (asc.alwaysAdd(*this, ILE)) { autoCreateBlock(); diff --git a/clang/lib/Basic/Cuda.cpp b/clang/lib/Basic/Cuda.cpp index e2609b9..1d96a92 100644 --- a/clang/lib/Basic/Cuda.cpp +++ b/clang/lib/Basic/Cuda.cpp @@ -144,6 +144,7 @@ static const CudaArchToStringMap arch_names[] = { GFX(1103), // gfx1103 GFX(1150), // gfx1150 GFX(1151), // gfx1151 + GFX(1152), // gfx1152 {CudaArch::GFX12_GENERIC, "gfx12-generic", "compute_amdgcn"}, GFX(1200), // gfx1200 GFX(1201), // gfx1201 diff --git a/clang/lib/Basic/Targets/LoongArch.h b/clang/lib/Basic/Targets/LoongArch.h index 6857284..5fc2234 100644 --- a/clang/lib/Basic/Targets/LoongArch.h +++ b/clang/lib/Basic/Targets/LoongArch.h @@ -133,7 +133,7 @@ public: LongWidth = LongAlign = PointerWidth = PointerAlign = 64; IntMaxType = Int64Type = SignedLong; HasUnalignedAccess = true; - resetDataLayout("e-m:e-p:64:64-i64:64-i128:128-n64-S128"); + resetDataLayout("e-m:e-p:64:64-i64:64-i128:128-n32:64-S128"); // TODO: select appropriate ABI. 
setABI("lp64d"); } diff --git a/clang/lib/Basic/Targets/NVPTX.cpp b/clang/lib/Basic/Targets/NVPTX.cpp index fc6ef11..ff7d2f1 100644 --- a/clang/lib/Basic/Targets/NVPTX.cpp +++ b/clang/lib/Basic/Targets/NVPTX.cpp @@ -228,6 +228,7 @@ void NVPTXTargetInfo::getTargetDefines(const LangOptions &Opts, case CudaArch::GFX1103: case CudaArch::GFX1150: case CudaArch::GFX1151: + case CudaArch::GFX1152: case CudaArch::GFX12_GENERIC: case CudaArch::GFX1200: case CudaArch::GFX1201: diff --git a/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp b/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp index 191bd75..6e9a1ba 100644 --- a/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp +++ b/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp @@ -3537,6 +3537,7 @@ void CGOpenMPRuntimeGPU::processRequiresDirective( case CudaArch::GFX1103: case CudaArch::GFX1150: case CudaArch::GFX1151: + case CudaArch::GFX1152: case CudaArch::GFX12_GENERIC: case CudaArch::GFX1200: case CudaArch::GFX1201: diff --git a/clang/lib/CodeGen/Targets/AMDGPU.cpp b/clang/lib/CodeGen/Targets/AMDGPU.cpp index d1ff8b4..057f6ef 100644 --- a/clang/lib/CodeGen/Targets/AMDGPU.cpp +++ b/clang/lib/CodeGen/Targets/AMDGPU.cpp @@ -120,7 +120,11 @@ void AMDGPUABIInfo::computeInfo(CGFunctionInfo &FI) const { Address AMDGPUABIInfo::EmitVAArg(CodeGenFunction &CGF, Address VAListAddr, QualType Ty) const { - llvm_unreachable("AMDGPU does not support varargs"); + const bool IsIndirect = false; + const bool AllowHigherAlign = false; + return emitVoidPtrVAArg(CGF, VAListAddr, Ty, IsIndirect, + getContext().getTypeInfoInChars(Ty), + CharUnits::fromQuantity(4), AllowHigherAlign); } ABIArgInfo AMDGPUABIInfo::classifyReturnType(QualType RetTy) const { diff --git a/clang/lib/ExtractAPI/Serialization/SymbolGraphSerializer.cpp b/clang/lib/ExtractAPI/Serialization/SymbolGraphSerializer.cpp index 08e711ca..6e56ee5 100644 --- a/clang/lib/ExtractAPI/Serialization/SymbolGraphSerializer.cpp +++ b/clang/lib/ExtractAPI/Serialization/SymbolGraphSerializer.cpp @@ -31,7 +31,6 @@ using 
namespace clang; using namespace clang::extractapi; using namespace llvm; -using namespace llvm::json; namespace { @@ -1036,9 +1035,9 @@ void SymbolGraphSerializer::serializeGraphToStream( ExtendedModule &&EM) { Object Root = serializeGraph(ModuleName, std::move(EM)); if (Options.Compact) - OS << formatv("{0}", Value(std::move(Root))) << "\n"; + OS << formatv("{0}", json::Value(std::move(Root))) << "\n"; else - OS << formatv("{0:2}", Value(std::move(Root))) << "\n"; + OS << formatv("{0:2}", json::Value(std::move(Root))) << "\n"; } void SymbolGraphSerializer::serializeMainSymbolGraph( diff --git a/clang/lib/Format/UnwrappedLineParser.cpp b/clang/lib/Format/UnwrappedLineParser.cpp index d6061c2..eb96b54 100644 --- a/clang/lib/Format/UnwrappedLineParser.cpp +++ b/clang/lib/Format/UnwrappedLineParser.cpp @@ -1181,10 +1181,10 @@ void UnwrappedLineParser::parsePPDefine() { Line->InMacroBody = true; if (Style.SkipMacroDefinitionBody) { - do { + while (!eof()) { FormatTok->Finalized = true; - nextToken(); - } while (!eof()); + FormatTok = Tokens->getNextToken(); + } addUnwrappedLine(); return; } diff --git a/clang/lib/Frontend/FrontendActions.cpp b/clang/lib/Frontend/FrontendActions.cpp index 1812b85..4f06432 100644 --- a/clang/lib/Frontend/FrontendActions.cpp +++ b/clang/lib/Frontend/FrontendActions.cpp @@ -1169,8 +1169,8 @@ void PrintDependencyDirectivesSourceMinimizerAction::ExecuteAction() { llvm::SmallVector<dependency_directives_scan::Token, 16> Tokens; llvm::SmallVector<dependency_directives_scan::Directive, 32> Directives; if (scanSourceForDependencyDirectives( - FromFile.getBuffer(), Tokens, Directives, CI.getLangOpts(), - &CI.getDiagnostics(), SM.getLocForStartOfFile(SM.getMainFileID()))) { + FromFile.getBuffer(), Tokens, Directives, &CI.getDiagnostics(), + SM.getLocForStartOfFile(SM.getMainFileID()))) { assert(CI.getDiagnostics().hasErrorOccurred() && "no errors reported for failure"); diff --git a/clang/lib/Interpreter/IncrementalParser.cpp 
b/clang/lib/Interpreter/IncrementalParser.cpp index 5bc8385..a8d0294 100644 --- a/clang/lib/Interpreter/IncrementalParser.cpp +++ b/clang/lib/Interpreter/IncrementalParser.cpp @@ -413,7 +413,8 @@ void IncrementalParser::CleanUpPTU(PartialTranslationUnit &PTU) { if (!ND) continue; // Check if we need to clean up the IdResolver chain. - if (ND->getDeclName().getFETokenInfo()) + if (ND->getDeclName().getFETokenInfo() && !D->getLangOpts().ObjC && + !D->getLangOpts().CPlusPlus) getCI()->getSema().IdResolver.RemoveDecl(ND); } } diff --git a/clang/lib/Interpreter/Interpreter.cpp b/clang/lib/Interpreter/Interpreter.cpp index 683f87e..7a95278 100644 --- a/clang/lib/Interpreter/Interpreter.cpp +++ b/clang/lib/Interpreter/Interpreter.cpp @@ -42,6 +42,9 @@ #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/raw_ostream.h" #include "llvm/TargetParser/Host.h" + +#include <cstdarg> + using namespace clang; // FIXME: Figure out how to unify with namespace init_convenience from @@ -270,14 +273,10 @@ Interpreter::~Interpreter() { // can't find the precise resource directory in unittests so we have to hard // code them. 
const char *const Runtimes = R"( + #define __CLANG_REPL__ 1 #ifdef __cplusplus + #define EXTERN_C extern "C" void *__clang_Interpreter_SetValueWithAlloc(void*, void*, void*); - void __clang_Interpreter_SetValueNoAlloc(void*, void*, void*); - void __clang_Interpreter_SetValueNoAlloc(void*, void*, void*, void*); - void __clang_Interpreter_SetValueNoAlloc(void*, void*, void*, float); - void __clang_Interpreter_SetValueNoAlloc(void*, void*, void*, double); - void __clang_Interpreter_SetValueNoAlloc(void*, void*, void*, long double); - void __clang_Interpreter_SetValueNoAlloc(void*,void*,void*,unsigned long long); struct __clang_Interpreter_NewTag{} __ci_newtag; void* operator new(__SIZE_TYPE__, void* __p, __clang_Interpreter_NewTag) noexcept; template <class T, class = T (*)() /*disable for arrays*/> @@ -289,7 +288,11 @@ const char *const Runtimes = R"( void __clang_Interpreter_SetValueCopyArr(const T (*Src)[N], void* Placement, unsigned long Size) { __clang_Interpreter_SetValueCopyArr(Src[0], Placement, Size); } +#else + #define EXTERN_C extern #endif // __cplusplus + + EXTERN_C void __clang_Interpreter_SetValueNoAlloc(void *This, void *OutVal, void *OpaqueType, ...); )"; llvm::Expected<std::unique_ptr<Interpreter>> @@ -588,15 +591,17 @@ std::unique_ptr<RuntimeInterfaceBuilder> Interpreter::FindRuntimeInterface() { if (!LookupInterface(ValuePrintingInfo[NoAlloc], MagicRuntimeInterface[NoAlloc])) return nullptr; - if (!LookupInterface(ValuePrintingInfo[WithAlloc], - MagicRuntimeInterface[WithAlloc])) - return nullptr; - if (!LookupInterface(ValuePrintingInfo[CopyArray], - MagicRuntimeInterface[CopyArray])) - return nullptr; - if (!LookupInterface(ValuePrintingInfo[NewTag], - MagicRuntimeInterface[NewTag])) - return nullptr; + if (Ctx.getLangOpts().CPlusPlus) { + if (!LookupInterface(ValuePrintingInfo[WithAlloc], + MagicRuntimeInterface[WithAlloc])) + return nullptr; + if (!LookupInterface(ValuePrintingInfo[CopyArray], + MagicRuntimeInterface[CopyArray])) + return 
nullptr; + if (!LookupInterface(ValuePrintingInfo[NewTag], + MagicRuntimeInterface[NewTag])) + return nullptr; + } return createInProcessRuntimeInterfaceBuilder(*this, Ctx, S); } @@ -855,69 +860,81 @@ __clang_Interpreter_SetValueWithAlloc(void *This, void *OutVal, return VRef.getPtr(); } -// Pointers, lvalue struct that can take as a reference. -REPL_EXTERNAL_VISIBILITY void -__clang_Interpreter_SetValueNoAlloc(void *This, void *OutVal, void *OpaqueType, - void *Val) { +extern "C" void REPL_EXTERNAL_VISIBILITY __clang_Interpreter_SetValueNoAlloc( + void *This, void *OutVal, void *OpaqueType, ...) { Value &VRef = *(Value *)OutVal; - VRef = Value(static_cast<Interpreter *>(This), OpaqueType); - VRef.setPtr(Val); -} + Interpreter *I = static_cast<Interpreter *>(This); + VRef = Value(I, OpaqueType); + if (VRef.isVoid()) + return; -REPL_EXTERNAL_VISIBILITY void -__clang_Interpreter_SetValueNoAlloc(void *This, void *OutVal, - void *OpaqueType) { - Value &VRef = *(Value *)OutVal; - VRef = Value(static_cast<Interpreter *>(This), OpaqueType); -} + va_list args; + va_start(args, /*last named param*/ OpaqueType); -static void SetValueDataBasedOnQualType(Value &V, unsigned long long Data) { - QualType QT = V.getType(); - if (const auto *ET = QT->getAs<EnumType>()) - QT = ET->getDecl()->getIntegerType(); - - switch (QT->castAs<BuiltinType>()->getKind()) { - default: - llvm_unreachable("unknown type kind!"); -#define X(type, name) \ - case BuiltinType::name: \ - V.set##name(Data); \ - break; - REPL_BUILTIN_TYPES -#undef X + QualType QT = VRef.getType(); + if (VRef.getKind() == Value::K_PtrOrObj) { + VRef.setPtr(va_arg(args, void *)); + } else { + if (const auto *ET = QT->getAs<EnumType>()) + QT = ET->getDecl()->getIntegerType(); + switch (QT->castAs<BuiltinType>()->getKind()) { + default: + llvm_unreachable("unknown type kind!"); + break; + // Types shorter than int are resolved as int, else va_arg has UB. 
+ case BuiltinType::Bool: + VRef.setBool(va_arg(args, int)); + break; + case BuiltinType::Char_S: + VRef.setChar_S(va_arg(args, int)); + break; + case BuiltinType::SChar: + VRef.setSChar(va_arg(args, int)); + break; + case BuiltinType::Char_U: + VRef.setChar_U(va_arg(args, unsigned)); + break; + case BuiltinType::UChar: + VRef.setUChar(va_arg(args, unsigned)); + break; + case BuiltinType::Short: + VRef.setShort(va_arg(args, int)); + break; + case BuiltinType::UShort: + VRef.setUShort(va_arg(args, unsigned)); + break; + case BuiltinType::Int: + VRef.setInt(va_arg(args, int)); + break; + case BuiltinType::UInt: + VRef.setUInt(va_arg(args, unsigned)); + break; + case BuiltinType::Long: + VRef.setLong(va_arg(args, long)); + break; + case BuiltinType::ULong: + VRef.setULong(va_arg(args, unsigned long)); + break; + case BuiltinType::LongLong: + VRef.setLongLong(va_arg(args, long long)); + break; + case BuiltinType::ULongLong: + VRef.setULongLong(va_arg(args, unsigned long long)); + break; + // Types shorter than double are resolved as double, else va_arg has UB. + case BuiltinType::Float: + VRef.setFloat(va_arg(args, double)); + break; + case BuiltinType::Double: + VRef.setDouble(va_arg(args, double)); + break; + case BuiltinType::LongDouble: + VRef.setLongDouble(va_arg(args, long double)); + break; + // See REPL_BUILTIN_TYPES. 
+ } } -} - -REPL_EXTERNAL_VISIBILITY void -__clang_Interpreter_SetValueNoAlloc(void *This, void *OutVal, void *OpaqueType, - unsigned long long Val) { - Value &VRef = *(Value *)OutVal; - VRef = Value(static_cast<Interpreter *>(This), OpaqueType); - SetValueDataBasedOnQualType(VRef, Val); -} - -REPL_EXTERNAL_VISIBILITY void -__clang_Interpreter_SetValueNoAlloc(void *This, void *OutVal, void *OpaqueType, - float Val) { - Value &VRef = *(Value *)OutVal; - VRef = Value(static_cast<Interpreter *>(This), OpaqueType); - VRef.setFloat(Val); -} - -REPL_EXTERNAL_VISIBILITY void -__clang_Interpreter_SetValueNoAlloc(void *This, void *OutVal, void *OpaqueType, - double Val) { - Value &VRef = *(Value *)OutVal; - VRef = Value(static_cast<Interpreter *>(This), OpaqueType); - VRef.setDouble(Val); -} - -REPL_EXTERNAL_VISIBILITY void -__clang_Interpreter_SetValueNoAlloc(void *This, void *OutVal, void *OpaqueType, - long double Val) { - Value &VRef = *(Value *)OutVal; - VRef = Value(static_cast<Interpreter *>(This), OpaqueType); - VRef.setLongDouble(Val); + va_end(args); } // A trampoline to work around the fact that operator placement new cannot diff --git a/clang/lib/Lex/DependencyDirectivesScanner.cpp b/clang/lib/Lex/DependencyDirectivesScanner.cpp index fda54d3..0971daa 100644 --- a/clang/lib/Lex/DependencyDirectivesScanner.cpp +++ b/clang/lib/Lex/DependencyDirectivesScanner.cpp @@ -62,17 +62,14 @@ struct DirectiveWithTokens { struct Scanner { Scanner(StringRef Input, SmallVectorImpl<dependency_directives_scan::Token> &Tokens, - DiagnosticsEngine *Diags, SourceLocation InputSourceLoc, - const LangOptions &LangOpts) + DiagnosticsEngine *Diags, SourceLocation InputSourceLoc) : Input(Input), Tokens(Tokens), Diags(Diags), - InputSourceLoc(InputSourceLoc), - LangOpts(getLangOptsForDepScanning(LangOpts)), - TheLexer(InputSourceLoc, this->LangOpts, Input.begin(), Input.begin(), + InputSourceLoc(InputSourceLoc), LangOpts(getLangOptsForDepScanning()), + TheLexer(InputSourceLoc, LangOpts, 
Input.begin(), Input.begin(), Input.end()) {} - static LangOptions - getLangOptsForDepScanning(const LangOptions &invocationLangOpts) { - LangOptions LangOpts(invocationLangOpts); + static LangOptions getLangOptsForDepScanning() { + LangOptions LangOpts; // Set the lexer to use 'tok::at' for '@', instead of 'tok::unknown'. LangOpts.ObjC = true; LangOpts.LineComment = true; @@ -703,7 +700,7 @@ bool Scanner::lex_Pragma(const char *&First, const char *const End) { SmallVector<dependency_directives_scan::Token> DiscardTokens; const char *Begin = Buffer.c_str(); Scanner PragmaScanner{StringRef(Begin, Buffer.size()), DiscardTokens, Diags, - InputSourceLoc, LangOptions()}; + InputSourceLoc}; PragmaScanner.TheLexer.setParsingPreprocessorDirective(true); if (PragmaScanner.lexPragma(Begin, Buffer.end())) @@ -953,10 +950,9 @@ bool Scanner::scan(SmallVectorImpl<Directive> &Directives) { bool clang::scanSourceForDependencyDirectives( StringRef Input, SmallVectorImpl<dependency_directives_scan::Token> &Tokens, - SmallVectorImpl<Directive> &Directives, const LangOptions &LangOpts, - DiagnosticsEngine *Diags, SourceLocation InputSourceLoc) { - return Scanner(Input, Tokens, Diags, InputSourceLoc, LangOpts) - .scan(Directives); + SmallVectorImpl<Directive> &Directives, DiagnosticsEngine *Diags, + SourceLocation InputSourceLoc) { + return Scanner(Input, Tokens, Diags, InputSourceLoc).scan(Directives); } void clang::printDependencyDirectivesAsSource( diff --git a/clang/lib/Parse/ParseStmt.cpp b/clang/lib/Parse/ParseStmt.cpp index c252032..16a5b74 100644 --- a/clang/lib/Parse/ParseStmt.cpp +++ b/clang/lib/Parse/ParseStmt.cpp @@ -571,11 +571,8 @@ StmtResult Parser::ParseExprStatement(ParsedStmtContext StmtCtx) { } Token *CurTok = nullptr; - // If the semicolon is missing at the end of REPL input, consider if - // we want to do value printing. Note this is only enabled in C++ mode - // since part of the implementation requires C++ language features. 
// Note we shouldn't eat the token since the callback needs it. - if (Tok.is(tok::annot_repl_input_end) && Actions.getLangOpts().CPlusPlus) + if (Tok.is(tok::annot_repl_input_end)) CurTok = &Tok; else // Otherwise, eat the semicolon. diff --git a/clang/lib/Sema/Scope.cpp b/clang/lib/Sema/Scope.cpp index c08073e..5bc7e79 100644 --- a/clang/lib/Sema/Scope.cpp +++ b/clang/lib/Sema/Scope.cpp @@ -228,7 +228,11 @@ void Scope::dumpImpl(raw_ostream &OS) const { {CompoundStmtScope, "CompoundStmtScope"}, {ClassInheritanceScope, "ClassInheritanceScope"}, {CatchScope, "CatchScope"}, + {ConditionVarScope, "ConditionVarScope"}, + {OpenMPOrderClauseScope, "OpenMPOrderClauseScope"}, + {LambdaScope, "LambdaScope"}, {OpenACCComputeConstructScope, "OpenACCComputeConstructScope"}, + {TypeAliasScope, "TypeAliasScope"}, {FriendScope, "FriendScope"}, }; diff --git a/clang/lib/Sema/SemaAMDGPU.cpp b/clang/lib/Sema/SemaAMDGPU.cpp index c446cc1..d11bc9e 100644 --- a/clang/lib/Sema/SemaAMDGPU.cpp +++ b/clang/lib/Sema/SemaAMDGPU.cpp @@ -31,9 +31,9 @@ bool SemaAMDGPU::CheckAMDGCNBuiltinFunctionCall(unsigned BuiltinID, constexpr const int SizeIdx = 2; llvm::APSInt Size; Expr *ArgExpr = TheCall->getArg(SizeIdx); - ExprResult R = SemaRef.VerifyIntegerConstantExpression(ArgExpr, &Size); - if (R.isInvalid()) - return true; + [[maybe_unused]] ExprResult R = + SemaRef.VerifyIntegerConstantExpression(ArgExpr, &Size); + assert(!R.isInvalid()); switch (Size.getSExtValue()) { case 1: case 2: diff --git a/clang/lib/Sema/SemaDecl.cpp b/clang/lib/Sema/SemaDecl.cpp index a6734ef..4b9b735 100644 --- a/clang/lib/Sema/SemaDecl.cpp +++ b/clang/lib/Sema/SemaDecl.cpp @@ -2288,7 +2288,8 @@ void Sema::ActOnPopScope(SourceLocation Loc, Scope *S) { // Partial translation units that are created in incremental processing must // not clean up the IdResolver because PTUs should take into account the // declarations that came from previous PTUs. 
- if (!PP.isIncrementalProcessingEnabled() || getLangOpts().ObjC) + if (!PP.isIncrementalProcessingEnabled() || getLangOpts().ObjC || + getLangOpts().CPlusPlus) IdResolver.RemoveDecl(D); // Warn on it if we are shadowing a declaration. diff --git a/clang/lib/Sema/SemaExpr.cpp b/clang/lib/Sema/SemaExpr.cpp index fb5ca19..76145f2 100644 --- a/clang/lib/Sema/SemaExpr.cpp +++ b/clang/lib/Sema/SemaExpr.cpp @@ -5572,9 +5572,10 @@ ExprResult Sema::BuildCXXDefaultArgExpr(SourceLocation CallLoc, Res = Immediate.TransformInitializer(Param->getInit(), /*NotCopy=*/false); }); - if (Res.isUsable()) - Res = ConvertParamDefaultArgument(Param, Res.get(), - Res.get()->getBeginLoc()); + if (Res.isInvalid()) + return ExprError(); + Res = ConvertParamDefaultArgument(Param, Res.get(), + Res.get()->getBeginLoc()); if (Res.isInvalid()) return ExprError(); Init = Res.get(); @@ -5608,10 +5609,9 @@ ExprResult Sema::BuildCXXDefaultInitExpr(SourceLocation Loc, FieldDecl *Field) { InitializationContext.emplace(Loc, Field, CurContext); Expr *Init = nullptr; - bool HasRewrittenInit = false; bool NestedDefaultChecking = isCheckingDefaultArgumentOrInitializer(); - bool InLifetimeExtendingContext = isInLifetimeExtendingContext(); + EnterExpressionEvaluationContext EvalContext( *this, ExpressionEvaluationContext::PotentiallyEvaluated, Field); @@ -5646,36 +5646,19 @@ ExprResult Sema::BuildCXXDefaultInitExpr(SourceLocation Loc, FieldDecl *Field) { ImmediateCallVisitor V(getASTContext()); if (!NestedDefaultChecking) V.TraverseDecl(Field); - - // CWG1815 - // Support lifetime extension of temporary created by aggregate - // initialization using a default member initializer. We should always rebuild - // the initializer if it contains any temporaries (if the initializer - // expression is an ExprWithCleanups). Then make sure the normal lifetime - // extension code recurses into the default initializer and does lifetime - // extension when warranted. 
- bool ContainsAnyTemporaries = - isa_and_present<ExprWithCleanups>(Field->getInClassInitializer()); - if (V.HasImmediateCalls || InLifetimeExtendingContext || - ContainsAnyTemporaries) { - HasRewrittenInit = true; + if (V.HasImmediateCalls) { ExprEvalContexts.back().DelayedDefaultInitializationContext = {Loc, Field, CurContext}; ExprEvalContexts.back().IsCurrentlyCheckingDefaultArgumentOrInitializer = NestedDefaultChecking; - // Pass down lifetime extending flag, and collect temporaries in - // CreateMaterializeTemporaryExpr when we rewrite the call argument. - keepInLifetimeExtendingContext(); + EnsureImmediateInvocationInDefaultArgs Immediate(*this); ExprResult Res; - - // Rebuild CXXDefaultInitExpr might cause diagnostics. - SFINAETrap Trap(*this); runWithSufficientStackSpace(Loc, [&] { Res = Immediate.TransformInitializer(Field->getInClassInitializer(), /*CXXDirectInit=*/false); }); - if (Res.isUsable()) + if (!Res.isInvalid()) Res = ConvertMemberDefaultInitExpression(Field, Res.get(), Loc); if (Res.isInvalid()) { Field->setInvalidDecl(); @@ -5702,7 +5685,7 @@ ExprResult Sema::BuildCXXDefaultInitExpr(SourceLocation Loc, FieldDecl *Field) { return CXXDefaultInitExpr::Create(Context, InitializationContext->Loc, Field, InitializationContext->Context, - HasRewrittenInit ? 
Init : nullptr); + Init); } // DR1351: diff --git a/clang/lib/Sema/SemaExprCXX.cpp b/clang/lib/Sema/SemaExprCXX.cpp index 4487c61..cf461a6 100644 --- a/clang/lib/Sema/SemaExprCXX.cpp +++ b/clang/lib/Sema/SemaExprCXX.cpp @@ -1555,6 +1555,9 @@ Sema::BuildCXXTypeConstructExpr(TypeSourceInfo *TInfo, bool ListInitialization) { QualType Ty = TInfo->getType(); SourceLocation TyBeginLoc = TInfo->getTypeLoc().getBeginLoc(); + + assert((!ListInitialization || Exprs.size() == 1) && + "List initialization must have exactly one expression."); SourceRange FullRange = SourceRange(TyBeginLoc, RParenOrBraceLoc); InitializedEntity Entity = @@ -5126,6 +5129,7 @@ static bool CheckUnaryTypeTraitTypeCompleteness(Sema &S, TypeTrait UTT, case UTT_IsStandardLayout: case UTT_IsPOD: case UTT_IsLiteral: + case UTT_IsBitwiseCloneable: // By analogy, is_trivially_relocatable and is_trivially_equality_comparable // impose the same constraints. case UTT_IsTriviallyRelocatable: @@ -5619,6 +5623,8 @@ static bool EvaluateUnaryTypeTrait(Sema &Self, TypeTrait UTT, return C.hasUniqueObjectRepresentations(T); case UTT_IsTriviallyRelocatable: return T.isTriviallyRelocatableType(C); + case UTT_IsBitwiseCloneable: + return T.isBitwiseCloneableType(C); case UTT_IsReferenceable: return T.isReferenceable(); case UTT_CanPassInRegs: diff --git a/clang/lib/Sema/SemaInit.cpp b/clang/lib/Sema/SemaInit.cpp index 9ed3e8a..ed8b226 100644 --- a/clang/lib/Sema/SemaInit.cpp +++ b/clang/lib/Sema/SemaInit.cpp @@ -8063,6 +8063,11 @@ static void visitLocalsRetainedByInitializer(IndirectLocalPath &Path, enum PathLifetimeKind { /// Lifetime-extend along this path. Extend, + /// We should lifetime-extend, but we don't because (due to technical + /// limitations) we can't. This happens for default member initializers, + /// which we don't clone for every use, so we don't have a unique + /// MaterializeTemporaryExpr to update. + ShouldExtend, /// Do not lifetime extend along this path. 
NoExtend }; @@ -8074,7 +8079,7 @@ shouldLifetimeExtendThroughPath(const IndirectLocalPath &Path) { PathLifetimeKind Kind = PathLifetimeKind::Extend; for (auto Elem : Path) { if (Elem.Kind == IndirectLocalPathEntry::DefaultInit) - Kind = PathLifetimeKind::Extend; + Kind = PathLifetimeKind::ShouldExtend; else if (Elem.Kind != IndirectLocalPathEntry::LambdaCaptureInit) return PathLifetimeKind::NoExtend; } @@ -8194,6 +8199,18 @@ void Sema::checkInitializerLifetime(const InitializedEntity &Entity, ExtendingEntity->allocateManglingNumber()); // Also visit the temporaries lifetime-extended by this initializer. return true; + + case PathLifetimeKind::ShouldExtend: + // We're supposed to lifetime-extend the temporary along this path (per + // the resolution of DR1815), but we don't support that yet. + // + // FIXME: Properly handle this situation. Perhaps the easiest approach + // would be to clone the initializer expression on each use that would + // lifetime extend its temporaries. + Diag(DiagLoc, diag::warn_unsupported_lifetime_extension) + << RK << DiagRange; + break; + case PathLifetimeKind::NoExtend: // If the path goes through the initialization of a variable or field, // it can't possibly reach a temporary created in this full-expression. 
diff --git a/clang/lib/Sema/SemaOpenACC.cpp b/clang/lib/Sema/SemaOpenACC.cpp index cdb60d4..97586a0 100644 --- a/clang/lib/Sema/SemaOpenACC.cpp +++ b/clang/lib/Sema/SemaOpenACC.cpp @@ -424,615 +424,736 @@ bool checkValidAfterDeviceType( S.Diag(DeviceTypeClause.getBeginLoc(), diag::note_acc_previous_clause_here); return true; } -} // namespace -SemaOpenACC::SemaOpenACC(Sema &S) : SemaBase(S) {} +class SemaOpenACCClauseVisitor { + SemaOpenACC &SemaRef; + ASTContext &Ctx; + ArrayRef<const OpenACCClause *> ExistingClauses; + bool NotImplemented = false; -SemaOpenACC::AssociatedStmtRAII::AssociatedStmtRAII(SemaOpenACC &S, - OpenACCDirectiveKind DK) - : SemaRef(S), WasInsideComputeConstruct(S.InsideComputeConstruct), - DirKind(DK) { - // Compute constructs end up taking their 'loop'. - if (DirKind == OpenACCDirectiveKind::Parallel || - DirKind == OpenACCDirectiveKind::Serial || - DirKind == OpenACCDirectiveKind::Kernels) { - SemaRef.InsideComputeConstruct = true; - SemaRef.ParentlessLoopConstructs.swap(ParentlessLoopConstructs); + OpenACCClause *isNotImplemented() { + NotImplemented = true; + return nullptr; } -} -SemaOpenACC::AssociatedStmtRAII::~AssociatedStmtRAII() { - SemaRef.InsideComputeConstruct = WasInsideComputeConstruct; - if (DirKind == OpenACCDirectiveKind::Parallel || - DirKind == OpenACCDirectiveKind::Serial || - DirKind == OpenACCDirectiveKind::Kernels) { - assert(SemaRef.ParentlessLoopConstructs.empty() && - "Didn't consume loop construct list?"); - SemaRef.ParentlessLoopConstructs.swap(ParentlessLoopConstructs); - } -} +public: + SemaOpenACCClauseVisitor(SemaOpenACC &S, + ArrayRef<const OpenACCClause *> ExistingClauses) + : SemaRef(S), Ctx(S.getASTContext()), ExistingClauses(ExistingClauses) {} + // Once we've implemented everything, we shouldn't need this infrastructure. + // But in the meantime, we use this to help decide whether the clause was + // handled for this directive. 
+ bool diagNotImplemented() { return NotImplemented; } + + OpenACCClause *Visit(SemaOpenACC::OpenACCParsedClause &Clause) { + switch (Clause.getClauseKind()) { + case OpenACCClauseKind::Gang: + case OpenACCClauseKind::Worker: + case OpenACCClauseKind::Vector: { + // TODO OpenACC: These are only implemented enough for the 'seq' diagnostic, + // otherwise treats itself as unimplemented. When we implement these, we + // can remove them from here. -OpenACCClause * -SemaOpenACC::ActOnClause(ArrayRef<const OpenACCClause *> ExistingClauses, - OpenACCParsedClause &Clause) { - if (Clause.getClauseKind() == OpenACCClauseKind::Invalid) - return nullptr; + // OpenACC 3.3 2.9: + // A 'gang', 'worker', or 'vector' clause may not appear if a 'seq' clause + // appears. + const auto *Itr = + llvm::find_if(ExistingClauses, llvm::IsaPred<OpenACCSeqClause>); - // Diagnose that we don't support this clause on this directive. - if (!doesClauseApplyToDirective(Clause.getDirectiveKind(), - Clause.getClauseKind())) { - Diag(Clause.getBeginLoc(), diag::err_acc_clause_appertainment) - << Clause.getDirectiveKind() << Clause.getClauseKind(); - return nullptr; + if (Itr != ExistingClauses.end()) { + SemaRef.Diag(Clause.getBeginLoc(), diag::err_acc_clause_cannot_combine) + << Clause.getClauseKind() << (*Itr)->getClauseKind(); + SemaRef.Diag((*Itr)->getBeginLoc(), diag::note_acc_previous_clause_here); + } + return isNotImplemented(); } - if (const auto *DevTypeClause = - llvm::find_if(ExistingClauses, - [&](const OpenACCClause *C) { - return isa<OpenACCDeviceTypeClause>(C); - }); - DevTypeClause != ExistingClauses.end()) { - if (checkValidAfterDeviceType( - *this, *cast<OpenACCDeviceTypeClause>(*DevTypeClause), Clause)) - return nullptr; +#define VISIT_CLAUSE(CLAUSE_NAME) \ + case OpenACCClauseKind::CLAUSE_NAME: \ + return Visit##CLAUSE_NAME##Clause(Clause); +#define CLAUSE_ALIAS(ALIAS, CLAUSE_NAME, DEPRECATED) \ + case OpenACCClauseKind::ALIAS: \ + if (DEPRECATED) \ + 
SemaRef.Diag(Clause.getBeginLoc(), diag::warn_acc_deprecated_alias_name) \ + << Clause.getClauseKind() << OpenACCClauseKind::CLAUSE_NAME; \ + return Visit##CLAUSE_NAME##Clause(Clause); +#include "clang/Basic/OpenACCClauses.def" + default: + return isNotImplemented(); + } + llvm_unreachable("Invalid clause kind"); } - switch (Clause.getClauseKind()) { - case OpenACCClauseKind::Default: { - // Restrictions only properly implemented on 'compute' constructs, and - // 'compute' constructs are the only construct that can do anything with - // this yet, so skip/treat as unimplemented in this case. - if (!isOpenACCComputeDirectiveKind(Clause.getDirectiveKind())) - break; +#define VISIT_CLAUSE(CLAUSE_NAME) \ + OpenACCClause *Visit##CLAUSE_NAME##Clause( \ + SemaOpenACC::OpenACCParsedClause &Clause); +#include "clang/Basic/OpenACCClauses.def" +}; - // Don't add an invalid clause to the AST. - if (Clause.getDefaultClauseKind() == OpenACCDefaultClauseKind::Invalid) - return nullptr; - - // OpenACC 3.3, Section 2.5.4: - // At most one 'default' clause may appear, and it must have a value of - // either 'none' or 'present'. - // Second half of the sentence is diagnosed during parsing. - if (checkAlreadyHasClauseOfKind(*this, ExistingClauses, Clause)) - return nullptr; - - return OpenACCDefaultClause::Create( - getASTContext(), Clause.getDefaultClauseKind(), Clause.getBeginLoc(), - Clause.getLParenLoc(), Clause.getEndLoc()); - } +OpenACCClause *SemaOpenACCClauseVisitor::VisitDefaultClause( + SemaOpenACC::OpenACCParsedClause &Clause) { + // Restrictions only properly implemented on 'compute' constructs, and + // 'compute' constructs are the only construct that can do anything with + // this yet, so skip/treat as unimplemented in this case. + if (!isOpenACCComputeDirectiveKind(Clause.getDirectiveKind())) + return isNotImplemented(); + + // Don't add an invalid clause to the AST. 
+ if (Clause.getDefaultClauseKind() == OpenACCDefaultClauseKind::Invalid) + return nullptr; - case OpenACCClauseKind::If: { - // Restrictions only properly implemented on 'compute' constructs, and - // 'compute' constructs are the only construct that can do anything with - // this yet, so skip/treat as unimplemented in this case. - if (!isOpenACCComputeDirectiveKind(Clause.getDirectiveKind())) - break; + // OpenACC 3.3, Section 2.5.4: + // At most one 'default' clause may appear, and it must have a value of + // either 'none' or 'present'. + // Second half of the sentence is diagnosed during parsing. + if (checkAlreadyHasClauseOfKind(SemaRef, ExistingClauses, Clause)) + return nullptr; - // There is no prose in the standard that says duplicates aren't allowed, - // but this diagnostic is present in other compilers, as well as makes - // sense. - if (checkAlreadyHasClauseOfKind(*this, ExistingClauses, Clause)) - return nullptr; + return OpenACCDefaultClause::Create( + Ctx, Clause.getDefaultClauseKind(), Clause.getBeginLoc(), + Clause.getLParenLoc(), Clause.getEndLoc()); +} - // The parser has ensured that we have a proper condition expr, so there - // isn't really much to do here. +OpenACCClause *SemaOpenACCClauseVisitor::VisitIfClause( + SemaOpenACC::OpenACCParsedClause &Clause) { + // Restrictions only properly implemented on 'compute' constructs, and + // 'compute' constructs are the only construct that can do anything with + // this yet, so skip/treat as unimplemented in this case. + if (!isOpenACCComputeDirectiveKind(Clause.getDirectiveKind())) + return isNotImplemented(); + + // There is no prose in the standard that says duplicates aren't allowed, + // but this diagnostic is present in other compilers, as well as makes + // sense. + if (checkAlreadyHasClauseOfKind(SemaRef, ExistingClauses, Clause)) + return nullptr; - // If the 'if' clause is true, it makes the 'self' clause have no effect, - // diagnose that here. 
- // TODO OpenACC: When we add these two to other constructs, we might not - // want to warn on this (for example, 'update'). - const auto *Itr = - llvm::find_if(ExistingClauses, llvm::IsaPred<OpenACCSelfClause>); - if (Itr != ExistingClauses.end()) { - Diag(Clause.getBeginLoc(), diag::warn_acc_if_self_conflict); - Diag((*Itr)->getBeginLoc(), diag::note_acc_previous_clause_here); - } + // The parser has ensured that we have a proper condition expr, so there + // isn't really much to do here. - return OpenACCIfClause::Create( - getASTContext(), Clause.getBeginLoc(), Clause.getLParenLoc(), - Clause.getConditionExpr(), Clause.getEndLoc()); + // If the 'if' clause is true, it makes the 'self' clause have no effect, + // diagnose that here. + // TODO OpenACC: When we add these two to other constructs, we might not + // want to warn on this (for example, 'update'). + const auto *Itr = + llvm::find_if(ExistingClauses, llvm::IsaPred<OpenACCSelfClause>); + if (Itr != ExistingClauses.end()) { + SemaRef.Diag(Clause.getBeginLoc(), diag::warn_acc_if_self_conflict); + SemaRef.Diag((*Itr)->getBeginLoc(), diag::note_acc_previous_clause_here); } - case OpenACCClauseKind::Self: { - // Restrictions only properly implemented on 'compute' constructs, and - // 'compute' constructs are the only construct that can do anything with - // this yet, so skip/treat as unimplemented in this case. - if (!isOpenACCComputeDirectiveKind(Clause.getDirectiveKind())) - break; - - // TODO OpenACC: When we implement this for 'update', this takes a - // 'var-list' instead of a condition expression, so semantics/handling has - // to happen differently here. - - // There is no prose in the standard that says duplicates aren't allowed, - // but this diagnostic is present in other compilers, as well as makes - // sense. 
- if (checkAlreadyHasClauseOfKind(*this, ExistingClauses, Clause)) - return nullptr; + return OpenACCIfClause::Create(Ctx, Clause.getBeginLoc(), + Clause.getLParenLoc(), + Clause.getConditionExpr(), Clause.getEndLoc()); +} - // If the 'if' clause is true, it makes the 'self' clause have no effect, - // diagnose that here. - // TODO OpenACC: When we add these two to other constructs, we might not - // want to warn on this (for example, 'update'). - const auto *Itr = - llvm::find_if(ExistingClauses, llvm::IsaPred<OpenACCIfClause>); - if (Itr != ExistingClauses.end()) { - Diag(Clause.getBeginLoc(), diag::warn_acc_if_self_conflict); - Diag((*Itr)->getBeginLoc(), diag::note_acc_previous_clause_here); - } +OpenACCClause *SemaOpenACCClauseVisitor::VisitSelfClause( + SemaOpenACC::OpenACCParsedClause &Clause) { + // Restrictions only properly implemented on 'compute' constructs, and + // 'compute' constructs are the only construct that can do anything with + // this yet, so skip/treat as unimplemented in this case. + if (!isOpenACCComputeDirectiveKind(Clause.getDirectiveKind())) + return isNotImplemented(); + + // TODO OpenACC: When we implement this for 'update', this takes a + // 'var-list' instead of a condition expression, so semantics/handling has + // to happen differently here. + + // There is no prose in the standard that says duplicates aren't allowed, + // but this diagnostic is present in other compilers, as well as makes + // sense. + if (checkAlreadyHasClauseOfKind(SemaRef, ExistingClauses, Clause)) + return nullptr; - return OpenACCSelfClause::Create( - getASTContext(), Clause.getBeginLoc(), Clause.getLParenLoc(), - Clause.getConditionExpr(), Clause.getEndLoc()); + // If the 'if' clause is true, it makes the 'self' clause have no effect, + // diagnose that here. + // TODO OpenACC: When we add these two to other constructs, we might not + // want to warn on this (for example, 'update'). 
+ const auto *Itr = + llvm::find_if(ExistingClauses, llvm::IsaPred<OpenACCIfClause>); + if (Itr != ExistingClauses.end()) { + SemaRef.Diag(Clause.getBeginLoc(), diag::warn_acc_if_self_conflict); + SemaRef.Diag((*Itr)->getBeginLoc(), diag::note_acc_previous_clause_here); } - case OpenACCClauseKind::NumGangs: { - // Restrictions only properly implemented on 'compute' constructs, and - // 'compute' constructs are the only construct that can do anything with - // this yet, so skip/treat as unimplemented in this case. - if (!isOpenACCComputeDirectiveKind(Clause.getDirectiveKind())) - break; + return OpenACCSelfClause::Create( + Ctx, Clause.getBeginLoc(), Clause.getLParenLoc(), + Clause.getConditionExpr(), Clause.getEndLoc()); +} - // There is no prose in the standard that says duplicates aren't allowed, - // but this diagnostic is present in other compilers, as well as makes - // sense. - if (checkAlreadyHasClauseOfKind(*this, ExistingClauses, Clause)) - return nullptr; +OpenACCClause *SemaOpenACCClauseVisitor::VisitNumGangsClause( + SemaOpenACC::OpenACCParsedClause &Clause) { + // Restrictions only properly implemented on 'compute' constructs, and + // 'compute' constructs are the only construct that can do anything with + // this yet, so skip/treat as unimplemented in this case. + if (!isOpenACCComputeDirectiveKind(Clause.getDirectiveKind())) + return isNotImplemented(); + + // There is no prose in the standard that says duplicates aren't allowed, + // but this diagnostic is present in other compilers, as well as makes + // sense. + if (checkAlreadyHasClauseOfKind(SemaRef, ExistingClauses, Clause)) + return nullptr; - if (Clause.getIntExprs().empty()) - Diag(Clause.getBeginLoc(), diag::err_acc_num_gangs_num_args) - << /*NoArgs=*/0; - - unsigned MaxArgs = - (Clause.getDirectiveKind() == OpenACCDirectiveKind::Parallel || - Clause.getDirectiveKind() == OpenACCDirectiveKind::ParallelLoop) - ? 
3 - : 1; - if (Clause.getIntExprs().size() > MaxArgs) - Diag(Clause.getBeginLoc(), diag::err_acc_num_gangs_num_args) - << /*NoArgs=*/1 << Clause.getDirectiveKind() << MaxArgs + // num_gangs requires at least 1 int expr in all forms. Diagnose here, but + // allow us to continue, an empty clause might be useful for future + // diagnostics. + if (Clause.getIntExprs().empty()) + SemaRef.Diag(Clause.getBeginLoc(), diag::err_acc_num_gangs_num_args) + << /*NoArgs=*/0; + + unsigned MaxArgs = + (Clause.getDirectiveKind() == OpenACCDirectiveKind::Parallel || + Clause.getDirectiveKind() == OpenACCDirectiveKind::ParallelLoop) + ? 3 + : 1; + // The max number of args differs between parallel and other constructs. + // Again, allow us to continue for the purposes of future diagnostics. + if (Clause.getIntExprs().size() > MaxArgs) + SemaRef.Diag(Clause.getBeginLoc(), diag::err_acc_num_gangs_num_args) + << /*NoArgs=*/1 << Clause.getDirectiveKind() << MaxArgs + << Clause.getIntExprs().size(); + + // OpenACC 3.3 Section 2.5.4: + // A reduction clause may not appear on a parallel construct with a + // num_gangs clause that has more than one argument. + if (Clause.getDirectiveKind() == OpenACCDirectiveKind::Parallel && + Clause.getIntExprs().size() > 1) { + auto *Parallel = + llvm::find_if(ExistingClauses, llvm::IsaPred<OpenACCReductionClause>); + + if (Parallel != ExistingClauses.end()) { + SemaRef.Diag(Clause.getBeginLoc(), + diag::err_acc_reduction_num_gangs_conflict) << Clause.getIntExprs().size(); - - // OpenACC 3.3 Section 2.5.4: - // A reduction clause may not appear on a parallel construct with a - // num_gangs clause that has more than one argument. 
- if (Clause.getDirectiveKind() == OpenACCDirectiveKind::Parallel && - Clause.getIntExprs().size() > 1) { - auto *Parallel = - llvm::find_if(ExistingClauses, llvm::IsaPred<OpenACCReductionClause>); - - if (Parallel != ExistingClauses.end()) { - Diag(Clause.getBeginLoc(), diag::err_acc_reduction_num_gangs_conflict) - << Clause.getIntExprs().size(); - Diag((*Parallel)->getBeginLoc(), diag::note_acc_previous_clause_here); - return nullptr; - } + SemaRef.Diag((*Parallel)->getBeginLoc(), + diag::note_acc_previous_clause_here); + return nullptr; } - - // Create the AST node for the clause even if the number of expressions is - // incorrect. - return OpenACCNumGangsClause::Create( - getASTContext(), Clause.getBeginLoc(), Clause.getLParenLoc(), - Clause.getIntExprs(), Clause.getEndLoc()); - break; } - case OpenACCClauseKind::NumWorkers: { - // Restrictions only properly implemented on 'compute' constructs, and - // 'compute' constructs are the only construct that can do anything with - // this yet, so skip/treat as unimplemented in this case. - if (!isOpenACCComputeDirectiveKind(Clause.getDirectiveKind())) - break; + return OpenACCNumGangsClause::Create( + Ctx, Clause.getBeginLoc(), Clause.getLParenLoc(), Clause.getIntExprs(), + Clause.getEndLoc()); +} - // There is no prose in the standard that says duplicates aren't allowed, - // but this diagnostic is present in other compilers, as well as makes - // sense. - if (checkAlreadyHasClauseOfKind(*this, ExistingClauses, Clause)) - return nullptr; +OpenACCClause *SemaOpenACCClauseVisitor::VisitNumWorkersClause( + SemaOpenACC::OpenACCParsedClause &Clause) { + // Restrictions only properly implemented on 'compute' constructs, and + // 'compute' constructs are the only construct that can do anything with + // this yet, so skip/treat as unimplemented in this case. 
+ if (!isOpenACCComputeDirectiveKind(Clause.getDirectiveKind())) + return isNotImplemented(); + + // There is no prose in the standard that says duplicates aren't allowed, + // but this diagnostic is present in other compilers, as well as makes + // sense. + if (checkAlreadyHasClauseOfKind(SemaRef, ExistingClauses, Clause)) + return nullptr; - assert(Clause.getIntExprs().size() == 1 && - "Invalid number of expressions for NumWorkers"); - return OpenACCNumWorkersClause::Create( - getASTContext(), Clause.getBeginLoc(), Clause.getLParenLoc(), - Clause.getIntExprs()[0], Clause.getEndLoc()); - } - case OpenACCClauseKind::VectorLength: { - // Restrictions only properly implemented on 'compute' constructs, and - // 'compute' constructs are the only construct that can do anything with - // this yet, so skip/treat as unimplemented in this case. - if (!isOpenACCComputeDirectiveKind(Clause.getDirectiveKind())) - break; + assert(Clause.getIntExprs().size() == 1 && + "Invalid number of expressions for NumWorkers"); + return OpenACCNumWorkersClause::Create( + Ctx, Clause.getBeginLoc(), Clause.getLParenLoc(), Clause.getIntExprs()[0], + Clause.getEndLoc()); +} - // There is no prose in the standard that says duplicates aren't allowed, - // but this diagnostic is present in other compilers, as well as makes - // sense. - if (checkAlreadyHasClauseOfKind(*this, ExistingClauses, Clause)) - return nullptr; +OpenACCClause *SemaOpenACCClauseVisitor::VisitVectorLengthClause( + SemaOpenACC::OpenACCParsedClause &Clause) { + // Restrictions only properly implemented on 'compute' constructs, and + // 'compute' constructs are the only construct that can do anything with + // this yet, so skip/treat as unimplemented in this case. + if (!isOpenACCComputeDirectiveKind(Clause.getDirectiveKind())) + return isNotImplemented(); + + // There is no prose in the standard that says duplicates aren't allowed, + // but this diagnostic is present in other compilers, as well as makes + // sense. 
+ if (checkAlreadyHasClauseOfKind(SemaRef, ExistingClauses, Clause)) + return nullptr; - assert(Clause.getIntExprs().size() == 1 && - "Invalid number of expressions for VectorLength"); - return OpenACCVectorLengthClause::Create( - getASTContext(), Clause.getBeginLoc(), Clause.getLParenLoc(), - Clause.getIntExprs()[0], Clause.getEndLoc()); - } - case OpenACCClauseKind::Async: { - // Restrictions only properly implemented on 'compute' constructs, and - // 'compute' constructs are the only construct that can do anything with - // this yet, so skip/treat as unimplemented in this case. - if (!isOpenACCComputeDirectiveKind(Clause.getDirectiveKind())) - break; + assert(Clause.getIntExprs().size() == 1 && + "Invalid number of expressions for NumWorkers"); + return OpenACCVectorLengthClause::Create( + Ctx, Clause.getBeginLoc(), Clause.getLParenLoc(), Clause.getIntExprs()[0], + Clause.getEndLoc()); +} - // There is no prose in the standard that says duplicates aren't allowed, - // but this diagnostic is present in other compilers, as well as makes - // sense. - if (checkAlreadyHasClauseOfKind(*this, ExistingClauses, Clause)) - return nullptr; +OpenACCClause *SemaOpenACCClauseVisitor::VisitAsyncClause( + SemaOpenACC::OpenACCParsedClause &Clause) { + // Restrictions only properly implemented on 'compute' constructs, and + // 'compute' constructs are the only construct that can do anything with + // this yet, so skip/treat as unimplemented in this case. + if (!isOpenACCComputeDirectiveKind(Clause.getDirectiveKind())) + return isNotImplemented(); + + // There is no prose in the standard that says duplicates aren't allowed, + // but this diagnostic is present in other compilers, as well as makes + // sense. 
+ if (checkAlreadyHasClauseOfKind(SemaRef, ExistingClauses, Clause)) + return nullptr; - assert(Clause.getNumIntExprs() < 2 && - "Invalid number of expressions for Async"); + assert(Clause.getNumIntExprs() < 2 && + "Invalid number of expressions for Async"); + return OpenACCAsyncClause::Create( + Ctx, Clause.getBeginLoc(), Clause.getLParenLoc(), + Clause.getNumIntExprs() != 0 ? Clause.getIntExprs()[0] : nullptr, + Clause.getEndLoc()); +} - return OpenACCAsyncClause::Create( - getASTContext(), Clause.getBeginLoc(), Clause.getLParenLoc(), - Clause.getNumIntExprs() != 0 ? Clause.getIntExprs()[0] : nullptr, - Clause.getEndLoc()); - } - case OpenACCClauseKind::Private: { - // Restrictions only properly implemented on 'compute' constructs, and - // 'compute' constructs are the only construct that can do anything with - // this yet, so skip/treat as unimplemented in this case. - if (!isOpenACCComputeDirectiveKind(Clause.getDirectiveKind())) - break; +OpenACCClause *SemaOpenACCClauseVisitor::VisitPrivateClause( + SemaOpenACC::OpenACCParsedClause &Clause) { + // Restrictions only properly implemented on 'compute' and 'loop' + // constructs, and 'compute'/'loop' constructs are the only construct that + // can do anything with this yet, so skip/treat as unimplemented in this + // case. + if (!isOpenACCComputeDirectiveKind(Clause.getDirectiveKind()) && + Clause.getDirectiveKind() != OpenACCDirectiveKind::Loop) + return isNotImplemented(); + + // ActOnVar ensured that everything is a valid variable reference, so there + // really isn't anything to do here. GCC does some duplicate-finding, though + // it isn't apparent in the standard where this is justified. + + return OpenACCPrivateClause::Create(Ctx, Clause.getBeginLoc(), + Clause.getLParenLoc(), + Clause.getVarList(), Clause.getEndLoc()); +} - // ActOnVar ensured that everything is a valid variable reference, so there - // really isn't anything to do here. 
GCC does some duplicate-finding, though - // it isn't apparent in the standard where this is justified. +OpenACCClause *SemaOpenACCClauseVisitor::VisitFirstPrivateClause( + SemaOpenACC::OpenACCParsedClause &Clause) { + // Restrictions only properly implemented on 'compute' constructs, and + // 'compute' constructs are the only construct that can do anything with + // this yet, so skip/treat as unimplemented in this case. + if (!isOpenACCComputeDirectiveKind(Clause.getDirectiveKind())) + return isNotImplemented(); + + // ActOnVar ensured that everything is a valid variable reference, so there + // really isn't anything to do here. GCC does some duplicate-finding, though + // it isn't apparent in the standard where this is justified. + + return OpenACCFirstPrivateClause::Create( + Ctx, Clause.getBeginLoc(), Clause.getLParenLoc(), Clause.getVarList(), + Clause.getEndLoc()); +} - return OpenACCPrivateClause::Create( - getASTContext(), Clause.getBeginLoc(), Clause.getLParenLoc(), - Clause.getVarList(), Clause.getEndLoc()); - } - case OpenACCClauseKind::FirstPrivate: { - // Restrictions only properly implemented on 'compute' constructs, and - // 'compute' constructs are the only construct that can do anything with - // this yet, so skip/treat as unimplemented in this case. - if (!isOpenACCComputeDirectiveKind(Clause.getDirectiveKind())) - break; +OpenACCClause *SemaOpenACCClauseVisitor::VisitNoCreateClause( + SemaOpenACC::OpenACCParsedClause &Clause) { + // Restrictions only properly implemented on 'compute' constructs, and + // 'compute' constructs are the only construct that can do anything with + // this yet, so skip/treat as unimplemented in this case. + if (!isOpenACCComputeDirectiveKind(Clause.getDirectiveKind())) + return isNotImplemented(); + // ActOnVar ensured that everything is a valid variable reference, so there + // really isn't anything to do here. GCC does some duplicate-finding, though + // it isn't apparent in the standard where this is justified. 
+ + return OpenACCNoCreateClause::Create(Ctx, Clause.getBeginLoc(), + Clause.getLParenLoc(), + Clause.getVarList(), Clause.getEndLoc()); +} - // ActOnVar ensured that everything is a valid variable reference, so there - // really isn't anything to do here. GCC does some duplicate-finding, though - // it isn't apparent in the standard where this is justified. +OpenACCClause *SemaOpenACCClauseVisitor::VisitPresentClause( + SemaOpenACC::OpenACCParsedClause &Clause) { + // Restrictions only properly implemented on 'compute' constructs, and + // 'compute' constructs are the only construct that can do anything with + // this yet, so skip/treat as unimplemented in this case. + if (!isOpenACCComputeDirectiveKind(Clause.getDirectiveKind())) + return isNotImplemented(); + // ActOnVar ensured that everything is a valid variable reference, so there + // really isn't anything to do here. GCC does some duplicate-finding, though + // it isn't apparent in the standard where this is justified. + + return OpenACCPresentClause::Create(Ctx, Clause.getBeginLoc(), + Clause.getLParenLoc(), + Clause.getVarList(), Clause.getEndLoc()); +} - return OpenACCFirstPrivateClause::Create( - getASTContext(), Clause.getBeginLoc(), Clause.getLParenLoc(), - Clause.getVarList(), Clause.getEndLoc()); - } - case OpenACCClauseKind::NoCreate: { - // Restrictions only properly implemented on 'compute' constructs, and - // 'compute' constructs are the only construct that can do anything with - // this yet, so skip/treat as unimplemented in this case. - if (!isOpenACCComputeDirectiveKind(Clause.getDirectiveKind())) - break; +OpenACCClause *SemaOpenACCClauseVisitor::VisitCopyClause( + SemaOpenACC::OpenACCParsedClause &Clause) { + // Restrictions only properly implemented on 'compute' constructs, and + // 'compute' constructs are the only construct that can do anything with + // this yet, so skip/treat as unimplemented in this case. 
+ if (!isOpenACCComputeDirectiveKind(Clause.getDirectiveKind())) + return isNotImplemented(); + // ActOnVar ensured that everything is a valid variable reference, so there + // really isn't anything to do here. GCC does some duplicate-finding, though + // it isn't apparent in the standard where this is justified. + + return OpenACCCopyClause::Create( + Ctx, Clause.getClauseKind(), Clause.getBeginLoc(), Clause.getLParenLoc(), + Clause.getVarList(), Clause.getEndLoc()); +} - // ActOnVar ensured that everything is a valid variable reference, so there - // really isn't anything to do here. GCC does some duplicate-finding, though - // it isn't apparent in the standard where this is justified. +OpenACCClause *SemaOpenACCClauseVisitor::VisitCopyInClause( + SemaOpenACC::OpenACCParsedClause &Clause) { + // Restrictions only properly implemented on 'compute' constructs, and + // 'compute' constructs are the only construct that can do anything with + // this yet, so skip/treat as unimplemented in this case. + if (!isOpenACCComputeDirectiveKind(Clause.getDirectiveKind())) + return isNotImplemented(); + // ActOnVar ensured that everything is a valid variable reference, so there + // really isn't anything to do here. GCC does some duplicate-finding, though + // it isn't apparent in the standard where this is justified. + + return OpenACCCopyInClause::Create( + Ctx, Clause.getClauseKind(), Clause.getBeginLoc(), Clause.getLParenLoc(), + Clause.isReadOnly(), Clause.getVarList(), Clause.getEndLoc()); +} - return OpenACCNoCreateClause::Create( - getASTContext(), Clause.getBeginLoc(), Clause.getLParenLoc(), - Clause.getVarList(), Clause.getEndLoc()); - } - case OpenACCClauseKind::Present: { - // Restrictions only properly implemented on 'compute' constructs, and - // 'compute' constructs are the only construct that can do anything with - // this yet, so skip/treat as unimplemented in this case. 
- if (!isOpenACCComputeDirectiveKind(Clause.getDirectiveKind())) - break; +OpenACCClause *SemaOpenACCClauseVisitor::VisitCopyOutClause( + SemaOpenACC::OpenACCParsedClause &Clause) { + // Restrictions only properly implemented on 'compute' constructs, and + // 'compute' constructs are the only construct that can do anything with + // this yet, so skip/treat as unimplemented in this case. + if (!isOpenACCComputeDirectiveKind(Clause.getDirectiveKind())) + return isNotImplemented(); + // ActOnVar ensured that everything is a valid variable reference, so there + // really isn't anything to do here. GCC does some duplicate-finding, though + // it isn't apparent in the standard where this is justified. + + return OpenACCCopyOutClause::Create( + Ctx, Clause.getClauseKind(), Clause.getBeginLoc(), Clause.getLParenLoc(), + Clause.isZero(), Clause.getVarList(), Clause.getEndLoc()); +} - // ActOnVar ensured that everything is a valid variable reference, so there - // really isn't anything to do here. GCC does some duplicate-finding, though - // it isn't apparent in the standard where this is justified. +OpenACCClause *SemaOpenACCClauseVisitor::VisitCreateClause( + SemaOpenACC::OpenACCParsedClause &Clause) { + // Restrictions only properly implemented on 'compute' constructs, and + // 'compute' constructs are the only construct that can do anything with + // this yet, so skip/treat as unimplemented in this case. + if (!isOpenACCComputeDirectiveKind(Clause.getDirectiveKind())) + return isNotImplemented(); + // ActOnVar ensured that everything is a valid variable reference, so there + // really isn't anything to do here. GCC does some duplicate-finding, though + // it isn't apparent in the standard where this is justified. 
+ + return OpenACCCreateClause::Create( + Ctx, Clause.getClauseKind(), Clause.getBeginLoc(), Clause.getLParenLoc(), + Clause.isZero(), Clause.getVarList(), Clause.getEndLoc()); +} - return OpenACCPresentClause::Create( - getASTContext(), Clause.getBeginLoc(), Clause.getLParenLoc(), - Clause.getVarList(), Clause.getEndLoc()); - } - case OpenACCClauseKind::PresentOrCopy: - case OpenACCClauseKind::PCopy: - Diag(Clause.getBeginLoc(), diag::warn_acc_deprecated_alias_name) - << Clause.getClauseKind() << OpenACCClauseKind::Copy; - LLVM_FALLTHROUGH; - case OpenACCClauseKind::Copy: { - // Restrictions only properly implemented on 'compute' constructs, and - // 'compute' constructs are the only construct that can do anything with - // this yet, so skip/treat as unimplemented in this case. - if (!isOpenACCComputeDirectiveKind(Clause.getDirectiveKind())) - break; +OpenACCClause *SemaOpenACCClauseVisitor::VisitAttachClause( + SemaOpenACC::OpenACCParsedClause &Clause) { + // Restrictions only properly implemented on 'compute' constructs, and + // 'compute' constructs are the only construct that can do anything with + // this yet, so skip/treat as unimplemented in this case. + if (!isOpenACCComputeDirectiveKind(Clause.getDirectiveKind())) + return isNotImplemented(); + + // ActOnVar ensured that everything is a valid variable reference, but we + // still have to make sure it is a pointer type. 
+ llvm::SmallVector<Expr *> VarList{Clause.getVarList().begin(), + Clause.getVarList().end()}; + VarList.erase(std::remove_if(VarList.begin(), VarList.end(), + [&](Expr *E) { + return SemaRef.CheckVarIsPointerType( + OpenACCClauseKind::Attach, E); + }), + VarList.end()); + Clause.setVarListDetails(VarList, + /*IsReadOnly=*/false, /*IsZero=*/false); + return OpenACCAttachClause::Create(Ctx, Clause.getBeginLoc(), + Clause.getLParenLoc(), Clause.getVarList(), + Clause.getEndLoc()); +} - // ActOnVar ensured that everything is a valid variable reference, so there - // really isn't anything to do here. GCC does some duplicate-finding, though - // it isn't apparent in the standard where this is justified. +OpenACCClause *SemaOpenACCClauseVisitor::VisitDevicePtrClause( + SemaOpenACC::OpenACCParsedClause &Clause) { + // Restrictions only properly implemented on 'compute' constructs, and + // 'compute' constructs are the only construct that can do anything with + // this yet, so skip/treat as unimplemented in this case. + if (!isOpenACCComputeDirectiveKind(Clause.getDirectiveKind())) + return isNotImplemented(); + + // ActOnVar ensured that everything is a valid variable reference, but we + // still have to make sure it is a pointer type. 
+ llvm::SmallVector<Expr *> VarList{Clause.getVarList().begin(), + Clause.getVarList().end()}; + VarList.erase(std::remove_if(VarList.begin(), VarList.end(), + [&](Expr *E) { + return SemaRef.CheckVarIsPointerType( + OpenACCClauseKind::DevicePtr, E); + }), + VarList.end()); + Clause.setVarListDetails(VarList, + /*IsReadOnly=*/false, /*IsZero=*/false); + + return OpenACCDevicePtrClause::Create( + Ctx, Clause.getBeginLoc(), Clause.getLParenLoc(), Clause.getVarList(), + Clause.getEndLoc()); +} - return OpenACCCopyClause::Create( - getASTContext(), Clause.getClauseKind(), Clause.getBeginLoc(), - Clause.getLParenLoc(), Clause.getVarList(), Clause.getEndLoc()); - } - case OpenACCClauseKind::PresentOrCopyIn: - case OpenACCClauseKind::PCopyIn: - Diag(Clause.getBeginLoc(), diag::warn_acc_deprecated_alias_name) - << Clause.getClauseKind() << OpenACCClauseKind::CopyIn; - LLVM_FALLTHROUGH; - case OpenACCClauseKind::CopyIn: { - // Restrictions only properly implemented on 'compute' constructs, and - // 'compute' constructs are the only construct that can do anything with - // this yet, so skip/treat as unimplemented in this case. - if (!isOpenACCComputeDirectiveKind(Clause.getDirectiveKind())) - break; +OpenACCClause *SemaOpenACCClauseVisitor::VisitWaitClause( + SemaOpenACC::OpenACCParsedClause &Clause) { + // Restrictions only properly implemented on 'compute' constructs, and + // 'compute' constructs are the only construct that can do anything with + // this yet, so skip/treat as unimplemented in this case. + if (!isOpenACCComputeDirectiveKind(Clause.getDirectiveKind())) + return isNotImplemented(); + + return OpenACCWaitClause::Create( + Ctx, Clause.getBeginLoc(), Clause.getLParenLoc(), Clause.getDevNumExpr(), + Clause.getQueuesLoc(), Clause.getQueueIdExprs(), Clause.getEndLoc()); +} - // ActOnVar ensured that everything is a valid variable reference, so there - // really isn't anything to do here. 
GCC does some duplicate-finding, though - // it isn't apparent in the standard where this is justified. +OpenACCClause *SemaOpenACCClauseVisitor::VisitDeviceTypeClause( + SemaOpenACC::OpenACCParsedClause &Clause) { + // Restrictions only properly implemented on 'compute' and 'loop' + // constructs, and 'compute'/'loop' constructs are the only construct that + // can do anything with this yet, so skip/treat as unimplemented in this + // case. + if (!isOpenACCComputeDirectiveKind(Clause.getDirectiveKind()) && + Clause.getDirectiveKind() != OpenACCDirectiveKind::Loop) + return isNotImplemented(); + + // TODO OpenACC: Once we get enough of the CodeGen implemented that we have + // a source for the list of valid architectures, we need to warn on unknown + // identifiers here. + + return OpenACCDeviceTypeClause::Create( + Ctx, Clause.getClauseKind(), Clause.getBeginLoc(), Clause.getLParenLoc(), + Clause.getDeviceTypeArchitectures(), Clause.getEndLoc()); +} - return OpenACCCopyInClause::Create( - getASTContext(), Clause.getClauseKind(), Clause.getBeginLoc(), - Clause.getLParenLoc(), Clause.isReadOnly(), Clause.getVarList(), - Clause.getEndLoc()); +OpenACCClause *SemaOpenACCClauseVisitor::VisitAutoClause( + SemaOpenACC::OpenACCParsedClause &Clause) { + // Restrictions only properly implemented on 'loop' constructs, and it is + // the only construct that can do anything with this, so skip/treat as + // unimplemented for the combined constructs. + if (Clause.getDirectiveKind() != OpenACCDirectiveKind::Loop) + return isNotImplemented(); + + // OpenACC 3.3 2.9: + // Only one of the seq, independent, and auto clauses may appear. 
+ const auto *Itr = + llvm::find_if(ExistingClauses, + llvm::IsaPred<OpenACCIndependentClause, OpenACCSeqClause>); + if (Itr != ExistingClauses.end()) { + SemaRef.Diag(Clause.getBeginLoc(), diag::err_acc_loop_spec_conflict) + << Clause.getClauseKind() << Clause.getDirectiveKind(); + SemaRef.Diag((*Itr)->getBeginLoc(), diag::note_acc_previous_clause_here); + return nullptr; } - case OpenACCClauseKind::PresentOrCopyOut: - case OpenACCClauseKind::PCopyOut: - Diag(Clause.getBeginLoc(), diag::warn_acc_deprecated_alias_name) - << Clause.getClauseKind() << OpenACCClauseKind::CopyOut; - LLVM_FALLTHROUGH; - case OpenACCClauseKind::CopyOut: { - // Restrictions only properly implemented on 'compute' constructs, and - // 'compute' constructs are the only construct that can do anything with - // this yet, so skip/treat as unimplemented in this case. - if (!isOpenACCComputeDirectiveKind(Clause.getDirectiveKind())) - break; - // ActOnVar ensured that everything is a valid variable reference, so there - // really isn't anything to do here. GCC does some duplicate-finding, though - // it isn't apparent in the standard where this is justified. + return OpenACCAutoClause::Create(Ctx, Clause.getBeginLoc(), + Clause.getEndLoc()); +} - return OpenACCCopyOutClause::Create( - getASTContext(), Clause.getClauseKind(), Clause.getBeginLoc(), - Clause.getLParenLoc(), Clause.isZero(), Clause.getVarList(), - Clause.getEndLoc()); +OpenACCClause *SemaOpenACCClauseVisitor::VisitIndependentClause( + SemaOpenACC::OpenACCParsedClause &Clause) { + // Restrictions only properly implemented on 'loop' constructs, and it is + // the only construct that can do anything with this, so skip/treat as + // unimplemented for the combined constructs. + if (Clause.getDirectiveKind() != OpenACCDirectiveKind::Loop) + return isNotImplemented(); + + // OpenACC 3.3 2.9: + // Only one of the seq, independent, and auto clauses may appear. 
+ const auto *Itr = llvm::find_if( + ExistingClauses, llvm::IsaPred<OpenACCAutoClause, OpenACCSeqClause>); + if (Itr != ExistingClauses.end()) { + SemaRef.Diag(Clause.getBeginLoc(), diag::err_acc_loop_spec_conflict) + << Clause.getClauseKind() << Clause.getDirectiveKind(); + SemaRef.Diag((*Itr)->getBeginLoc(), diag::note_acc_previous_clause_here); + return nullptr; } - case OpenACCClauseKind::PresentOrCreate: - case OpenACCClauseKind::PCreate: - Diag(Clause.getBeginLoc(), diag::warn_acc_deprecated_alias_name) - << Clause.getClauseKind() << OpenACCClauseKind::Create; - LLVM_FALLTHROUGH; - case OpenACCClauseKind::Create: { - // Restrictions only properly implemented on 'compute' constructs, and - // 'compute' constructs are the only construct that can do anything with - // this yet, so skip/treat as unimplemented in this case. - if (!isOpenACCComputeDirectiveKind(Clause.getDirectiveKind())) - break; - - // ActOnVar ensured that everything is a valid variable reference, so there - // really isn't anything to do here. GCC does some duplicate-finding, though - // it isn't apparent in the standard where this is justified. - return OpenACCCreateClause::Create(getASTContext(), Clause.getClauseKind(), - Clause.getBeginLoc(), - Clause.getLParenLoc(), Clause.isZero(), - Clause.getVarList(), Clause.getEndLoc()); - } - case OpenACCClauseKind::Attach: { - // Restrictions only properly implemented on 'compute' constructs, and - // 'compute' constructs are the only construct that can do anything with - // this yet, so skip/treat as unimplemented in this case. - if (!isOpenACCComputeDirectiveKind(Clause.getDirectiveKind())) - break; + return OpenACCIndependentClause::Create(Ctx, Clause.getBeginLoc(), + Clause.getEndLoc()); +} - // ActOnVar ensured that everything is a valid variable reference, but we - // still have to make sure it is a pointer type. 
- llvm::SmallVector<Expr *> VarList{Clause.getVarList().begin(), - Clause.getVarList().end()}; - VarList.erase(std::remove_if(VarList.begin(), VarList.end(), [&](Expr *E) { - return CheckVarIsPointerType(OpenACCClauseKind::Attach, E); - }), VarList.end()); - Clause.setVarListDetails(VarList, - /*IsReadOnly=*/false, /*IsZero=*/false); - - return OpenACCAttachClause::Create(getASTContext(), Clause.getBeginLoc(), - Clause.getLParenLoc(), - Clause.getVarList(), Clause.getEndLoc()); +OpenACCClause *SemaOpenACCClauseVisitor::VisitSeqClause( + SemaOpenACC::OpenACCParsedClause &Clause) { + // Restrictions only properly implemented on 'loop' constructs, and it is + // the only construct that can do anything with this, so skip/treat as + // unimplemented for the combined constructs. + if (Clause.getDirectiveKind() != OpenACCDirectiveKind::Loop) + return isNotImplemented(); + + // OpenACC 3.3 2.9: + // Only one of the seq, independent, and auto clauses may appear. + const auto *Itr = + llvm::find_if(ExistingClauses, + llvm::IsaPred<OpenACCAutoClause, OpenACCIndependentClause>); + if (Itr != ExistingClauses.end()) { + SemaRef.Diag(Clause.getBeginLoc(), diag::err_acc_loop_spec_conflict) + << Clause.getClauseKind() << Clause.getDirectiveKind(); + SemaRef.Diag((*Itr)->getBeginLoc(), diag::note_acc_previous_clause_here); + return nullptr; } - case OpenACCClauseKind::DevicePtr: { - // Restrictions only properly implemented on 'compute' constructs, and - // 'compute' constructs are the only construct that can do anything with - // this yet, so skip/treat as unimplemented in this case. - if (!isOpenACCComputeDirectiveKind(Clause.getDirectiveKind())) - break; - // ActOnVar ensured that everything is a valid variable reference, but we - // still have to make sure it is a pointer type. 
- llvm::SmallVector<Expr *> VarList{Clause.getVarList().begin(), - Clause.getVarList().end()}; - VarList.erase(std::remove_if(VarList.begin(), VarList.end(), [&](Expr *E) { - return CheckVarIsPointerType(OpenACCClauseKind::DevicePtr, E); - }), VarList.end()); - Clause.setVarListDetails(VarList, - /*IsReadOnly=*/false, /*IsZero=*/false); - - return OpenACCDevicePtrClause::Create( - getASTContext(), Clause.getBeginLoc(), Clause.getLParenLoc(), - Clause.getVarList(), Clause.getEndLoc()); - } - case OpenACCClauseKind::Wait: { - // Restrictions only properly implemented on 'compute' constructs, and - // 'compute' constructs are the only construct that can do anything with - // this yet, so skip/treat as unimplemented in this case. - if (!isOpenACCComputeDirectiveKind(Clause.getDirectiveKind())) - break; + // OpenACC 3.3 2.9: + // A 'gang', 'worker', or 'vector' clause may not appear if a 'seq' clause + // appears. + Itr = llvm::find_if(ExistingClauses, + llvm::IsaPred<OpenACCGangClause, OpenACCWorkerClause, + OpenACCVectorClause>); - return OpenACCWaitClause::Create( - getASTContext(), Clause.getBeginLoc(), Clause.getLParenLoc(), - Clause.getDevNumExpr(), Clause.getQueuesLoc(), Clause.getQueueIdExprs(), - Clause.getEndLoc()); + if (Itr != ExistingClauses.end()) { + SemaRef.Diag(Clause.getBeginLoc(), diag::err_acc_clause_cannot_combine) + << Clause.getClauseKind() << (*Itr)->getClauseKind(); + SemaRef.Diag((*Itr)->getBeginLoc(), diag::note_acc_previous_clause_here); + return nullptr; } - case OpenACCClauseKind::DType: - case OpenACCClauseKind::DeviceType: { - // Restrictions only properly implemented on 'compute' and 'loop' - // constructs, and 'compute'/'loop' constructs are the only construct that - // can do anything with this yet, so skip/treat as unimplemented in this - // case. 
- if (!isOpenACCComputeDirectiveKind(Clause.getDirectiveKind()) && - Clause.getDirectiveKind() != OpenACCDirectiveKind::Loop) - break; - // TODO OpenACC: Once we get enough of the CodeGen implemented that we have - // a source for the list of valid architectures, we need to warn on unknown - // identifiers here. - - return OpenACCDeviceTypeClause::Create( - getASTContext(), Clause.getClauseKind(), Clause.getBeginLoc(), - Clause.getLParenLoc(), Clause.getDeviceTypeArchitectures(), - Clause.getEndLoc()); - } - case OpenACCClauseKind::Auto: { - // Restrictions only properly implemented on 'loop' constructs, and it is - // the only construct that can do anything with this, so skip/treat as - // unimplemented for the combined constructs. - if (Clause.getDirectiveKind() != OpenACCDirectiveKind::Loop) - break; + // TODO OpenACC: 2.9 ~ line 2010 specifies that the associated loop has some + // restrictions when there is a 'seq' clause in place. We probably need to + // implement that. + return OpenACCSeqClause::Create(Ctx, Clause.getBeginLoc(), + Clause.getEndLoc()); +} - // OpenACC 3.3 2.9: - // Only one of the seq, independent, and auto clauses may appear. - const auto *Itr = llvm::find_if( - ExistingClauses, - llvm::IsaPred<OpenACCIndependentClause, OpenACCSeqClause>); - if (Itr != ExistingClauses.end()) { - Diag(Clause.getBeginLoc(), diag::err_acc_loop_spec_conflict) - << Clause.getClauseKind() << Clause.getDirectiveKind(); - Diag((*Itr)->getBeginLoc(), diag::note_acc_previous_clause_here); +OpenACCClause *SemaOpenACCClauseVisitor::VisitReductionClause( + SemaOpenACC::OpenACCParsedClause &Clause) { + // Restrictions only properly implemented on 'compute' constructs, and + // 'compute' constructs are the only construct that can do anything with + // this yet, so skip/treat as unimplemented in this case. 
+ if (!isOpenACCComputeDirectiveKind(Clause.getDirectiveKind())) + return isNotImplemented(); + + // OpenACC 3.3 Section 2.5.4: + // A reduction clause may not appear on a parallel construct with a + // num_gangs clause that has more than one argument. + if (Clause.getDirectiveKind() == OpenACCDirectiveKind::Parallel) { + auto NumGangsClauses = llvm::make_filter_range( + ExistingClauses, llvm::IsaPred<OpenACCNumGangsClause>); + + for (auto *NGC : NumGangsClauses) { + unsigned NumExprs = + cast<OpenACCNumGangsClause>(NGC)->getIntExprs().size(); + + if (NumExprs > 1) { + SemaRef.Diag(Clause.getBeginLoc(), + diag::err_acc_reduction_num_gangs_conflict) + << NumExprs; + SemaRef.Diag(NGC->getBeginLoc(), diag::note_acc_previous_clause_here); + return nullptr; + } } - - return OpenACCAutoClause::Create(getASTContext(), Clause.getBeginLoc(), - Clause.getEndLoc()); } - case OpenACCClauseKind::Independent: { - // Restrictions only properly implemented on 'loop' constructs, and it is - // the only construct that can do anything with this, so skip/treat as - // unimplemented for the combined constructs. - if (Clause.getDirectiveKind() != OpenACCDirectiveKind::Loop) - break; - // OpenACC 3.3 2.9: - // Only one of the seq, independent, and auto clauses may appear. 
- const auto *Itr = llvm::find_if( - ExistingClauses, llvm::IsaPred<OpenACCAutoClause, OpenACCSeqClause>); - if (Itr != ExistingClauses.end()) { - Diag(Clause.getBeginLoc(), diag::err_acc_loop_spec_conflict) - << Clause.getClauseKind() << Clause.getDirectiveKind(); - Diag((*Itr)->getBeginLoc(), diag::note_acc_previous_clause_here); - } + SmallVector<Expr *> ValidVars; + + for (Expr *Var : Clause.getVarList()) { + ExprResult Res = SemaRef.CheckReductionVar(Var); - return OpenACCIndependentClause::Create( - getASTContext(), Clause.getBeginLoc(), Clause.getEndLoc()); + if (Res.isUsable()) + ValidVars.push_back(Res.get()); } - case OpenACCClauseKind::Seq: { - // Restrictions only properly implemented on 'loop' constructs, and it is - // the only construct that can do anything with this, so skip/treat as - // unimplemented for the combined constructs. - if (Clause.getDirectiveKind() != OpenACCDirectiveKind::Loop) - break; - // OpenACC 3.3 2.9: - // Only one of the seq, independent, and auto clauses may appear. - const auto *Itr = llvm::find_if( - ExistingClauses, - llvm::IsaPred<OpenACCAutoClause, OpenACCIndependentClause>); - if (Itr != ExistingClauses.end()) { - Diag(Clause.getBeginLoc(), diag::err_acc_loop_spec_conflict) - << Clause.getClauseKind() << Clause.getDirectiveKind(); - Diag((*Itr)->getBeginLoc(), diag::note_acc_previous_clause_here); - } + return OpenACCReductionClause::Create( + Ctx, Clause.getBeginLoc(), Clause.getLParenLoc(), Clause.getReductionOp(), + ValidVars, Clause.getEndLoc()); +} - // OpenACC 3.3 2.9: - // A 'gang', 'worker', or 'vector' clause may not appear if a 'seq' clause - // appears. 
- Itr = llvm::find_if(ExistingClauses, - llvm::IsaPred<OpenACCGangClause, OpenACCWorkerClause, - OpenACCVectorClause>); +} // namespace - if (Itr != ExistingClauses.end()) { - Diag(Clause.getBeginLoc(), diag::err_acc_clause_cannot_combine) - << Clause.getClauseKind() << (*Itr)->getClauseKind(); - Diag((*Itr)->getBeginLoc(), diag::note_acc_previous_clause_here); - } +SemaOpenACC::SemaOpenACC(Sema &S) : SemaBase(S) {} - // TODO OpenACC: 2.9 ~ line 2010 specifies that the associated loop has some - // restrictions when there is a 'seq' clause in place. We probably need to - // implement that. - return OpenACCSeqClause::Create(getASTContext(), Clause.getBeginLoc(), - Clause.getEndLoc()); +SemaOpenACC::AssociatedStmtRAII::AssociatedStmtRAII(SemaOpenACC &S, + OpenACCDirectiveKind DK) + : SemaRef(S), WasInsideComputeConstruct(S.InsideComputeConstruct), + DirKind(DK) { + // Compute constructs end up taking their 'loop'. + if (DirKind == OpenACCDirectiveKind::Parallel || + DirKind == OpenACCDirectiveKind::Serial || + DirKind == OpenACCDirectiveKind::Kernels) { + SemaRef.InsideComputeConstruct = true; + SemaRef.ParentlessLoopConstructs.swap(ParentlessLoopConstructs); } - case OpenACCClauseKind::Gang: - case OpenACCClauseKind::Worker: - case OpenACCClauseKind::Vector: { - // OpenACC 3.3 2.9: - // A 'gang', 'worker', or 'vector' clause may not appear if a 'seq' clause - // appears. - const auto *Itr = - llvm::find_if(ExistingClauses, llvm::IsaPred<OpenACCSeqClause>); +} - if (Itr != ExistingClauses.end()) { - Diag(Clause.getBeginLoc(), diag::err_acc_clause_cannot_combine) - << Clause.getClauseKind() << (*Itr)->getClauseKind(); - Diag((*Itr)->getBeginLoc(), diag::note_acc_previous_clause_here); - } - // Not yet implemented, so immediately drop to the 'not yet implemented' - // diagnostic. 
- break; +SemaOpenACC::AssociatedStmtRAII::~AssociatedStmtRAII() { + SemaRef.InsideComputeConstruct = WasInsideComputeConstruct; + if (DirKind == OpenACCDirectiveKind::Parallel || + DirKind == OpenACCDirectiveKind::Serial || + DirKind == OpenACCDirectiveKind::Kernels) { + assert(SemaRef.ParentlessLoopConstructs.empty() && + "Didn't consume loop construct list?"); + SemaRef.ParentlessLoopConstructs.swap(ParentlessLoopConstructs); } - case OpenACCClauseKind::Reduction: { - // Restrictions only properly implemented on 'compute' constructs, and - // 'compute' constructs are the only construct that can do anything with - // this yet, so skip/treat as unimplemented in this case. - if (!isOpenACCComputeDirectiveKind(Clause.getDirectiveKind())) - break; - - // OpenACC 3.3 Section 2.5.4: - // A reduction clause may not appear on a parallel construct with a - // num_gangs clause that has more than one argument. - if (Clause.getDirectiveKind() == OpenACCDirectiveKind::Parallel) { - auto NumGangsClauses = llvm::make_filter_range( - ExistingClauses, llvm::IsaPred<OpenACCNumGangsClause>); - - for (auto *NGC : NumGangsClauses) { - unsigned NumExprs = - cast<OpenACCNumGangsClause>(NGC)->getIntExprs().size(); - - if (NumExprs > 1) { - Diag(Clause.getBeginLoc(), diag::err_acc_reduction_num_gangs_conflict) - << NumExprs; - Diag(NGC->getBeginLoc(), diag::note_acc_previous_clause_here); - return nullptr; - } - } - } - - SmallVector<Expr *> ValidVars; - - for (Expr *Var : Clause.getVarList()) { - ExprResult Res = CheckReductionVar(Var); +} - if (Res.isUsable()) - ValidVars.push_back(Res.get()); - } +OpenACCClause * +SemaOpenACC::ActOnClause(ArrayRef<const OpenACCClause *> ExistingClauses, + OpenACCParsedClause &Clause) { + if (Clause.getClauseKind() == OpenACCClauseKind::Invalid) + return nullptr; - return OpenACCReductionClause::Create( - getASTContext(), Clause.getBeginLoc(), Clause.getLParenLoc(), - Clause.getReductionOp(), ValidVars, Clause.getEndLoc()); + // Diagnose that we don't 
support this clause on this directive. + if (!doesClauseApplyToDirective(Clause.getDirectiveKind(), + Clause.getClauseKind())) { + Diag(Clause.getBeginLoc(), diag::err_acc_clause_appertainment) + << Clause.getDirectiveKind() << Clause.getClauseKind(); + return nullptr; } - default: - break; + + if (const auto *DevTypeClause = + llvm::find_if(ExistingClauses, + [&](const OpenACCClause *C) { + return isa<OpenACCDeviceTypeClause>(C); + }); + DevTypeClause != ExistingClauses.end()) { + if (checkValidAfterDeviceType( + *this, *cast<OpenACCDeviceTypeClause>(*DevTypeClause), Clause)) + return nullptr; } - Diag(Clause.getBeginLoc(), diag::warn_acc_clause_unimplemented) - << Clause.getClauseKind(); - return nullptr; + SemaOpenACCClauseVisitor Visitor{*this, ExistingClauses}; + OpenACCClause *Result = Visitor.Visit(Clause); + assert((!Result || Result->getClauseKind() == Clause.getClauseKind()) && + "Created wrong clause?"); + + if (Visitor.diagNotImplemented()) + Diag(Clause.getBeginLoc(), diag::warn_acc_clause_unimplemented) + << Clause.getClauseKind(); + + return Result; + + // switch (Clause.getClauseKind()) { + // case OpenACCClauseKind::PresentOrCopy: + // case OpenACCClauseKind::PCopy: + // Diag(Clause.getBeginLoc(), diag::warn_acc_deprecated_alias_name) + // << Clause.getClauseKind() << OpenACCClauseKind::Copy; + // LLVM_FALLTHROUGH; + // case OpenACCClauseKind::PresentOrCreate: + // case OpenACCClauseKind::PCreate: + // Diag(Clause.getBeginLoc(), diag::warn_acc_deprecated_alias_name) + // << Clause.getClauseKind() << OpenACCClauseKind::Create; + // LLVM_FALLTHROUGH; + // + // + // + // + // case OpenACCClauseKind::DType: + // + // + // + // + // + // + // + // + // case OpenACCClauseKind::Gang: + // case OpenACCClauseKind::Worker: + // case OpenACCClauseKind::Vector: { + // // OpenACC 3.3 2.9: + // // A 'gang', 'worker', or 'vector' clause may not appear if a 'seq' + // clause + // // appears. 
+ // const auto *Itr = + // llvm::find_if(ExistingClauses, llvm::IsaPred<OpenACCSeqClause>); + // + // if (Itr != ExistingClauses.end()) { + // Diag(Clause.getBeginLoc(), diag::err_acc_clause_cannot_combine) + // << Clause.getClauseKind() << (*Itr)->getClauseKind(); + // Diag((*Itr)->getBeginLoc(), diag::note_acc_previous_clause_here); + // } + // // Not yet implemented, so immediately drop to the 'not yet implemented' + // // diagnostic. + // break; + // } + // */ + } /// OpenACC 3.3 section 2.5.15: diff --git a/clang/lib/Sema/TreeTransform.h b/clang/lib/Sema/TreeTransform.h index 95dd356..3bfda09 100644 --- a/clang/lib/Sema/TreeTransform.h +++ b/clang/lib/Sema/TreeTransform.h @@ -14172,13 +14172,6 @@ TreeTransform<Derived>::TransformCXXTemporaryObjectExpr( if (TransformExprs(E->getArgs(), E->getNumArgs(), true, Args, &ArgumentChanged)) return ExprError(); - - if (E->isListInitialization() && !E->isStdInitListInitialization()) { - ExprResult Res = RebuildInitList(E->getBeginLoc(), Args, E->getEndLoc()); - if (Res.isInvalid()) - return ExprError(); - Args = {Res.get()}; - } } if (!getDerived().AlwaysRebuild() && @@ -14190,9 +14183,12 @@ TreeTransform<Derived>::TransformCXXTemporaryObjectExpr( return SemaRef.MaybeBindToTemporary(E); } + // FIXME: We should just pass E->isListInitialization(), but we're not + // prepared to handle list-initialization without a child InitListExpr. 
SourceLocation LParenLoc = T->getTypeLoc().getEndLoc(); return getDerived().RebuildCXXTemporaryObjectExpr( - T, LParenLoc, Args, E->getEndLoc(), E->isListInitialization()); + T, LParenLoc, Args, E->getEndLoc(), + /*ListInitialization=*/LParenLoc.isInvalid()); } template<typename Derived> diff --git a/clang/lib/StaticAnalyzer/Core/ExprEngine.cpp b/clang/lib/StaticAnalyzer/Core/ExprEngine.cpp index 290d966..197d673 100644 --- a/clang/lib/StaticAnalyzer/Core/ExprEngine.cpp +++ b/clang/lib/StaticAnalyzer/Core/ExprEngine.cpp @@ -1971,45 +1971,33 @@ void ExprEngine::Visit(const Stmt *S, ExplodedNode *Pred, ExplodedNodeSet Tmp; StmtNodeBuilder Bldr2(PreVisit, Tmp, *currBldrCtx); - bool HasRewrittenInit = false; - const Expr *ArgE = nullptr; - if (const auto *DefE = dyn_cast<CXXDefaultArgExpr>(S)) { + const Expr *ArgE; + if (const auto *DefE = dyn_cast<CXXDefaultArgExpr>(S)) ArgE = DefE->getExpr(); - HasRewrittenInit = DefE->hasRewrittenInit(); - } else if (const auto *DefE = dyn_cast<CXXDefaultInitExpr>(S)) { + else if (const auto *DefE = dyn_cast<CXXDefaultInitExpr>(S)) ArgE = DefE->getExpr(); - HasRewrittenInit = DefE->hasRewrittenInit(); - } else + else llvm_unreachable("unknown constant wrapper kind"); - if (HasRewrittenInit) { - for (auto *N : PreVisit) { - ProgramStateRef state = N->getState(); - const LocationContext *LCtx = N->getLocationContext(); - state = state->BindExpr(S, LCtx, state->getSVal(ArgE, LCtx)); - Bldr2.generateNode(S, N, state); - } - } else { - // If it's not rewritten, the contents of these expressions are not - // actually part of the current function, so we fall back to constant - // evaluation. 
- bool IsTemporary = false; - if (const auto *MTE = dyn_cast<MaterializeTemporaryExpr>(ArgE)) { - ArgE = MTE->getSubExpr(); - IsTemporary = true; - } - - std::optional<SVal> ConstantVal = svalBuilder.getConstantVal(ArgE); - const LocationContext *LCtx = Pred->getLocationContext(); - for (auto *I : PreVisit) { - ProgramStateRef State = I->getState(); - State = State->BindExpr(S, LCtx, ConstantVal.value_or(UnknownVal())); - if (IsTemporary) - State = createTemporaryRegionIfNeeded(State, LCtx, cast<Expr>(S), - cast<Expr>(S)); + bool IsTemporary = false; + if (const auto *MTE = dyn_cast<MaterializeTemporaryExpr>(ArgE)) { + ArgE = MTE->getSubExpr(); + IsTemporary = true; + } - Bldr2.generateNode(S, I, State); - } + std::optional<SVal> ConstantVal = svalBuilder.getConstantVal(ArgE); + if (!ConstantVal) + ConstantVal = UnknownVal(); + + const LocationContext *LCtx = Pred->getLocationContext(); + for (const auto I : PreVisit) { + ProgramStateRef State = I->getState(); + State = State->BindExpr(S, LCtx, *ConstantVal); + if (IsTemporary) + State = createTemporaryRegionIfNeeded(State, LCtx, + cast<Expr>(S), + cast<Expr>(S)); + Bldr2.generateNode(S, I, State); } getCheckerManager().runCheckersForPostStmt(Dst, Tmp, S, *this); diff --git a/clang/lib/Tooling/DependencyScanning/DependencyScanningFilesystem.cpp b/clang/lib/Tooling/DependencyScanning/DependencyScanningFilesystem.cpp index 66a2f6e..0cab17a 100644 --- a/clang/lib/Tooling/DependencyScanning/DependencyScanningFilesystem.cpp +++ b/clang/lib/Tooling/DependencyScanning/DependencyScanningFilesystem.cpp @@ -42,7 +42,7 @@ DependencyScanningWorkerFilesystem::readFile(StringRef Filename) { } bool DependencyScanningWorkerFilesystem::ensureDirectiveTokensArePopulated( - EntryRef Ref, const LangOptions &LangOpts) { + EntryRef Ref) { auto &Entry = Ref.Entry; if (Entry.isError() || Entry.isDirectory()) @@ -66,7 +66,7 @@ bool DependencyScanningWorkerFilesystem::ensureDirectiveTokensArePopulated( // dependencies. 
if (scanSourceForDependencyDirectives(Contents->Original->getBuffer(), Contents->DepDirectiveTokens, - Directives, LangOpts)) { + Directives)) { Contents->DepDirectiveTokens.clear(); // FIXME: Propagate the diagnostic if desired by the client. Contents->DepDirectives.store(new std::optional<DependencyDirectivesTy>()); diff --git a/clang/lib/Tooling/DependencyScanning/DependencyScanningWorker.cpp b/clang/lib/Tooling/DependencyScanning/DependencyScanningWorker.cpp index 07e1960d..0f82f22 100644 --- a/clang/lib/Tooling/DependencyScanning/DependencyScanningWorker.cpp +++ b/clang/lib/Tooling/DependencyScanning/DependencyScanningWorker.cpp @@ -364,12 +364,11 @@ public: // Use the dependency scanning optimized file system if requested to do so. if (DepFS) ScanInstance.getPreprocessorOpts().DependencyDirectivesForFile = - [LocalDepFS = DepFS, - &LangOpts = ScanInstance.getLangOpts()](FileEntryRef File) + [LocalDepFS = DepFS](FileEntryRef File) -> std::optional<ArrayRef<dependency_directives_scan::Directive>> { if (llvm::ErrorOr<EntryRef> Entry = LocalDepFS->getOrCreateFileSystemEntry(File.getName())) - if (LocalDepFS->ensureDirectiveTokensArePopulated(*Entry, LangOpts)) + if (LocalDepFS->ensureDirectiveTokensArePopulated(*Entry)) return Entry->getDirectiveTokens(); return std::nullopt; }; diff --git a/clang/test/AST/Interp/arrays.cpp b/clang/test/AST/Interp/arrays.cpp index dd5064d..6f6fca8 100644 --- a/clang/test/AST/Interp/arrays.cpp +++ b/clang/test/AST/Interp/arrays.cpp @@ -609,3 +609,17 @@ namespace ArrayMemberAccess { bool cond = a->x; } } + +namespace OnePastEndSub { + struct A {}; + constexpr A a[3][3]; + constexpr int diff2 = &a[1][3] - &a[1][0]; /// Used to crash. +} + +static int same_entity_2[3]; +constexpr int *get2() { + // This is a redeclaration of the same entity, even though it doesn't + // inherit the type of the prior declaration. 
+ extern int same_entity_2[]; + return same_entity_2; +} diff --git a/clang/test/AST/Interp/cxx23.cpp b/clang/test/AST/Interp/cxx23.cpp index c91d52c..1efd784 100644 --- a/clang/test/AST/Interp/cxx23.cpp +++ b/clang/test/AST/Interp/cxx23.cpp @@ -178,3 +178,25 @@ namespace ExplicitLambdaThis { }; static_assert(f()); } + +namespace std { + struct strong_ordering { + int n; + constexpr operator int() const { return n; } + static const strong_ordering less, equal, greater; + }; + constexpr strong_ordering strong_ordering::less = {-1}; + constexpr strong_ordering strong_ordering::equal = {0}; + constexpr strong_ordering strong_ordering::greater = {1}; +} + +namespace UndefinedThreeWay { + struct A { + friend constexpr std::strong_ordering operator<=>(const A&, const A&) = default; // all-note {{declared here}} + }; + + constexpr std::strong_ordering operator<=>(const A&, const A&) noexcept; + constexpr std::strong_ordering (*test_a_threeway)(const A&, const A&) = &operator<=>; + static_assert(!(*test_a_threeway)(A(), A())); // all-error {{static assertion expression is not an integral constant expression}} \ + // all-note {{undefined function 'operator<=>' cannot be used in a constant expression}} +} diff --git a/clang/test/AST/Interp/eval-order.cpp b/clang/test/AST/Interp/eval-order.cpp index aaf2b74..7a7ce6a 100644 --- a/clang/test/AST/Interp/eval-order.cpp +++ b/clang/test/AST/Interp/eval-order.cpp @@ -71,8 +71,8 @@ namespace EvalOrder { // Rules 1 and 2 have no effect ('b' is not an expression). 
// Rule 3: a->*b - // SEQ(A(ud).*B(&UserDefined::n)); FIXME - // SEQ(A(&ud)->*B(&UserDefined::n)); FIXME + SEQ(A(ud).*B(&UserDefined::n)); + SEQ(A(&ud)->*B(&UserDefined::n)); // Rule 4: a(b1, b2, b3) SEQ(A(f)(B(1), B(2), B(3))); // expected-error {{not an integral constant expression}} FIXME \ diff --git a/clang/test/AST/Interp/literals.cpp b/clang/test/AST/Interp/literals.cpp index c160be0..5a29013 100644 --- a/clang/test/AST/Interp/literals.cpp +++ b/clang/test/AST/Interp/literals.cpp @@ -66,7 +66,12 @@ namespace ScalarTypes { First = 0, }; static_assert(getScalar<E>() == First, ""); - /// FIXME: Member pointers. + + struct S { + int v; + }; + constexpr int S::* MemberPtr = &S::v; + static_assert(getScalar<decltype(MemberPtr)>() == nullptr, ""); #if __cplusplus >= 201402L constexpr void Void(int n) { @@ -1204,7 +1209,7 @@ namespace incdecbool { constexpr int externvar1() { // both-error {{never produces a constant expression}} extern char arr[]; // ref-note {{declared here}} return arr[0]; // ref-note {{read of non-constexpr variable 'arr'}} \ - // expected-note {{array-to-pointer decay of array member without known bound is not supported}} + // expected-note {{indexing of array without known bound}} } #endif diff --git a/clang/test/AST/Interp/memberpointers.cpp b/clang/test/AST/Interp/memberpointers.cpp new file mode 100644 index 0000000..54d73fe --- /dev/null +++ b/clang/test/AST/Interp/memberpointers.cpp @@ -0,0 +1,197 @@ +// RUN: %clang_cc1 -std=c++14 -fexperimental-new-constant-interpreter -verify=expected,both %s +// RUN: %clang_cc1 -std=c++14 -verify=ref,both %s + +namespace MemberPointers { + struct A { + constexpr A(int n) : n(n) {} + int n; + constexpr int f() const { return n + 3; } + }; + + constexpr A a(7); + static_assert(A(5).*&A::n == 5, ""); + static_assert((&a)->*&A::n == 7, ""); + static_assert((A(8).*&A::f)() == 11, ""); + static_assert(((&a)->*&A::f)() == 10, ""); + + struct B : A { + constexpr B(int n, int m) : A(n), m(m) {} + int m; + 
constexpr int g() const { return n + m + 1; } + }; + constexpr B b(9, 13); + static_assert(B(4, 11).*&A::n == 4, ""); + static_assert(B(4, 11).*&B::m == 11, ""); + static_assert(B(4, 11).m == 11, ""); + static_assert(B(4, 11).*(int(A::*))&B::m == 11, ""); + static_assert(B(4, 11).*&B::m == 11, ""); + static_assert((&b)->*&A::n == 9, ""); + static_assert((&b)->*&B::m == 13, ""); + static_assert((&b)->*(int(A::*))&B::m == 13, ""); + static_assert((B(4, 11).*&A::f)() == 7, ""); + static_assert((B(4, 11).*&B::g)() == 16, ""); + + static_assert((B(4, 11).*(int(A::*)() const)&B::g)() == 16, ""); + + static_assert(((&b)->*&A::f)() == 12, ""); + static_assert(((&b)->*&B::g)() == 23, ""); + static_assert(((&b)->*(int(A::*)()const)&B::g)() == 23, ""); + + + struct S { + constexpr S(int m, int n, int (S::*pf)() const, int S::*pn) : + m(m), n(n), pf(pf), pn(pn) {} + constexpr S() : m(), n(), pf(&S::f), pn(&S::n) {} + + constexpr int f() const { return this->*pn; } + virtual int g() const; + + int m, n; + int (S::*pf)() const; + int S::*pn; + }; + + constexpr int S::*pm = &S::m; + constexpr int S::*pn = &S::n; + + constexpr int (S::*pf)() const = &S::f; + constexpr int (S::*pg)() const = &S::g; + + constexpr S s(2, 5, &S::f, &S::m); + + static_assert((s.*&S::f)() == 2, ""); + static_assert((s.*s.pf)() == 2, ""); + + static_assert(pf == &S::f, ""); + + static_assert(pf == s.*&S::pf, ""); + + static_assert(pm == &S::m, ""); + static_assert(pm != pn, ""); + static_assert(s.pn != pn, ""); + static_assert(s.pn == pm, ""); + static_assert(pg != nullptr, ""); + static_assert(pf != nullptr, ""); + static_assert((int S::*)nullptr == nullptr, ""); + static_assert(pg == pg, ""); // both-error {{constant expression}} \ + // both-note {{comparison of pointer to virtual member function 'g' has unspecified value}} + static_assert(pf != pg, ""); // both-error {{constant expression}} \ + // both-note {{comparison of pointer to virtual member function 'g' has unspecified value}} + + template<int 
n> struct T : T<n-1> { const int X = n;}; + template<> struct T<0> { int n; char k;}; + template<> struct T<30> : T<29> { int m; }; + + T<17> t17; + T<30> t30; + + constexpr int (T<15>::*deepm) = (int(T<10>::*))&T<30>::m; + constexpr int (T<10>::*deepn) = &T<0>::n; + constexpr char (T<10>::*deepk) = &T<0>::k; + + static_assert(&(t17.*deepn) == &t17.n, ""); + static_assert(&(t17.*deepk) == &t17.k, ""); + static_assert(deepn == &T<2>::n, ""); + + constexpr int *pgood = &(t30.*deepm); + constexpr int *pbad = &(t17.*deepm); // both-error {{constant expression}} + static_assert(&(t30.*deepm) == &t30.m, ""); + + static_assert(deepm == &T<50>::m, ""); + static_assert(deepm != deepn, ""); + + constexpr T<5> *p17_5 = &t17; + constexpr T<13> *p17_13 = (T<13>*)p17_5; + constexpr T<23> *p17_23 = (T<23>*)p17_13; // both-error {{constant expression}} \ + // both-note {{cannot cast object of dynamic type 'T<17>' to type 'T<23>'}} + constexpr T<18> *p17_18 = (T<18>*)p17_13; // both-error {{constant expression}} \ + // both-note {{cannot cast object of dynamic type 'T<17>' to type 'T<18>'}} + static_assert(&(p17_5->*(int(T<0>::*))deepn) == &t17.n, ""); + static_assert(&(p17_5->*(int(T<0>::*))deepn), ""); + + + static_assert(&(p17_13->*deepn) == &t17.n, ""); + constexpr int *pbad2 = &(p17_13->*(int(T<9>::*))deepm); // both-error {{constant expression}} + + constexpr T<5> *p30_5 = &t30; + constexpr T<23> *p30_23 = (T<23>*)p30_5; + constexpr T<13> *p30_13 = p30_23; + static_assert(&(p30_13->*deepn) == &t30.n, ""); + static_assert(&(p30_23->*deepn) == &t30.n, ""); + static_assert(&(p30_5->*(int(T<3>::*))deepn) == &t30.n, ""); + + static_assert(&(p30_5->*(int(T<2>::*))deepm) == &t30.m, ""); + static_assert(&(((T<17>*)p30_13)->*deepm) == &t30.m, ""); + static_assert(&(p30_23->*deepm) == &t30.m, ""); + + + /// Added tests not from constant-expression-cxx11.cpp + static_assert(pm, ""); + static_assert(!((int S::*)nullptr), ""); + constexpr int S::*pk = nullptr; + static_assert(!pk, ""); +} 
+ +namespace test3 { + struct nsCSSRect { + }; + static int nsCSSRect::* sides; + nsCSSRect dimenX; + void ParseBoxCornerRadii(int y) { + switch (y) { + } + int& x = dimenX.*sides; + } +} + +void foo() { + class X; + void (X::*d) (); + d = nullptr; /// This calls in the constant interpreter. +} + +namespace { + struct A { int n; }; + struct B { int n; }; + struct C : A, B {}; + struct D { double d; C c; }; + const int &&u = static_cast<B&&>(0, ((D&&)D{}).*&D::c).n; // both-warning {{left operand of comma operator has no effect}} +} + +/// From SemaTemplate/instantiate-member-pointers.cpp +namespace { + struct Y { + int x; + }; + + template<typename T, typename Class, T Class::*Ptr> + struct X3 { + X3<T, Class, Ptr> &operator=(const T& value) { + return *this; + } + }; + + typedef int Y::*IntMember; + template<IntMember Member> + struct X4 { + X3<int, Y, Member> member; + int &getMember(Y& y) { return y.*Member; } + }; + + int &get_X4(X4<&Y::x> x4, Y& y) { + return x4.getMember(y); + } +} + +/// From test/CXX/basic/basic.def.odr/p2.cpp +namespace { + void use(int); + struct S { int x; int f() const; }; + constexpr S *ps = nullptr; + S *const &psr = ps; + + void test() { + use(ps->*&S::x); + use(psr->*&S::x); + } +} diff --git a/clang/test/AST/ast-dump-default-init-json.cpp b/clang/test/AST/ast-dump-default-init-json.cpp index f4949a9..1058b4e 100644 --- a/clang/test/AST/ast-dump-default-init-json.cpp +++ b/clang/test/AST/ast-dump-default-init-json.cpp @@ -789,10 +789,10 @@ void test() { // CHECK-NEXT: "valueCategory": "lvalue", // CHECK-NEXT: "extendingDecl": { // CHECK-NEXT: "id": "0x{{.*}}", -// CHECK-NEXT: "kind": "VarDecl", -// CHECK-NEXT: "name": "b", +// CHECK-NEXT: "kind": "FieldDecl", +// CHECK-NEXT: "name": "a", // CHECK-NEXT: "type": { -// CHECK-NEXT: "qualType": "B" +// CHECK-NEXT: "qualType": "const A &" // CHECK-NEXT: } // CHECK-NEXT: }, // CHECK-NEXT: "storageDuration": "automatic", diff --git a/clang/test/AST/ast-dump-default-init.cpp 
b/clang/test/AST/ast-dump-default-init.cpp index 26864fb..15b29f0 100644 --- a/clang/test/AST/ast-dump-default-init.cpp +++ b/clang/test/AST/ast-dump-default-init.cpp @@ -13,7 +13,7 @@ void test() { } // CHECK: -CXXDefaultInitExpr 0x{{[^ ]*}} <{{.*}}> 'const A' lvalue has rewritten init // CHECK-NEXT: `-ExprWithCleanups 0x{{[^ ]*}} <{{.*}}> 'const A' lvalue -// CHECK-NEXT: `-MaterializeTemporaryExpr 0x{{[^ ]*}} <{{.*}}> 'const A' lvalue extended by Var 0x{{[^ ]*}} 'b' 'B' +// CHECK-NEXT: `-MaterializeTemporaryExpr 0x{{[^ ]*}} <{{.*}}> 'const A' lvalue extended by Field 0x{{[^ ]*}} 'a' 'const A &' // CHECK-NEXT: `-ImplicitCastExpr 0x{{[^ ]*}} <{{.*}}> 'const A' <NoOp> // CHECK-NEXT: `-CXXFunctionalCastExpr 0x{{[^ ]*}} <{{.*}}> 'A' functional cast to A <NoOp> // CHECK-NEXT: `-InitListExpr 0x{{[^ ]*}} <{{.*}}> 'A' diff --git a/clang/test/AST/ast-print-openacc-loop-construct.cpp b/clang/test/AST/ast-print-openacc-loop-construct.cpp index 519825b..cde302a 100644 --- a/clang/test/AST/ast-print-openacc-loop-construct.cpp +++ b/clang/test/AST/ast-print-openacc-loop-construct.cpp @@ -48,4 +48,13 @@ void foo() { // CHECK-NEXT: ; #pragma acc loop auto for(;;); + + int i; + float array[5]; + +// CHECK: #pragma acc loop private(i, array[1], array, array[1:2]) +// CHECK-NEXT: for (;;) +// CHECK-NEXT: ; +#pragma acc loop private(i, array[1], array, array[1:2]) + for(;;); } diff --git a/clang/test/Analysis/cxx-uninitialized-object.cpp b/clang/test/Analysis/cxx-uninitialized-object.cpp index aee0dae..e3fa8ae 100644 --- a/clang/test/Analysis/cxx-uninitialized-object.cpp +++ b/clang/test/Analysis/cxx-uninitialized-object.cpp @@ -1114,27 +1114,27 @@ void fCXX11MemberInitTest1() { CXX11MemberInitTest1(); } -#ifdef PEDANTIC struct CXX11MemberInitTest2 { struct RecordType { - int a; // expected-note {{uninitialized field 'this->a'}} - int b; // expected-note {{uninitialized field 'this->b'}} + // TODO: we'd expect the note: {{uninitialized field 'this->rec.a'}} + int a; // no-note + // 
TODO: we'd expect the note: {{uninitialized field 'this->rec.b'}} + int b; // no-note RecordType(int) {} }; - RecordType rec = RecordType(int()); // expected-warning {{2 uninitialized fields}} + RecordType rec = RecordType(int()); int dontGetFilteredByNonPedanticMode = 0; CXX11MemberInitTest2() {} }; void fCXX11MemberInitTest2() { + // TODO: we'd expect the warning: {{2 uninitializeds field}} CXX11MemberInitTest2(); // no-warning } -#endif // PEDANTIC - //===----------------------------------------------------------------------===// // "Esoteric" primitive type tests. //===----------------------------------------------------------------------===// diff --git a/clang/test/Analysis/lifetime-extended-regions.cpp b/clang/test/Analysis/lifetime-extended-regions.cpp index 524f4e0..4e98bd4 100644 --- a/clang/test/Analysis/lifetime-extended-regions.cpp +++ b/clang/test/Analysis/lifetime-extended-regions.cpp @@ -120,11 +120,11 @@ void aggregateWithReferences() { clang_analyzer_dump(viaReference); // expected-warning-re {{&lifetime_extended_object{RefAggregate, viaReference, S{{[0-9]+}}} }} clang_analyzer_dump(viaReference.rx); // expected-warning-re {{&lifetime_extended_object{int, viaReference, S{{[0-9]+}}} }} clang_analyzer_dump(viaReference.ry); // expected-warning-re {{&lifetime_extended_object{Composite, viaReference, S{{[0-9]+}}} }} - - // The lifetime lifetime of object bound to reference members of aggregates, - // that are created from default member initializer was extended. 
- RefAggregate defaultInitExtended{i}; - clang_analyzer_dump(defaultInitExtended.ry); // expected-warning-re {{&lifetime_extended_object{Composite, defaultInitExtended, S{{[0-9]+}}} }} + + // clang does not currently implement extending lifetime of object bound to reference members of aggregates, + // that are created from default member initializer (see `warn_unsupported_lifetime_extension` from `-Wdangling`) + RefAggregate defaultInitExtended{i}; // clang-bug does not extend `Composite` + clang_analyzer_dump(defaultInitExtended.ry); // expected-warning {{Unknown }} } void lambda() { diff --git a/clang/test/CXX/drs/cwg16xx.cpp b/clang/test/CXX/drs/cwg16xx.cpp index 82ef871..cf6b45c 100644 --- a/clang/test/CXX/drs/cwg16xx.cpp +++ b/clang/test/CXX/drs/cwg16xx.cpp @@ -483,6 +483,8 @@ namespace cwg1696 { // cwg1696: 7 const A &a = A(); // #cwg1696-D1-a }; D1 d1 = {}; // #cwg1696-d1 + // since-cxx14-warning@-1 {{lifetime extension of temporary created by aggregate initialization using a default member initializer is not yet supported; lifetime of temporary will end at the end of the full-expression}} + // since-cxx14-note@#cwg1696-D1-a {{initializing field 'a' with default member initializer}} struct D2 { const A &a = A(); // #cwg1696-D2-a diff --git a/clang/test/CXX/drs/cwg18xx.cpp b/clang/test/CXX/drs/cwg18xx.cpp index 054ce5a..323e56f 100644 --- a/clang/test/CXX/drs/cwg18xx.cpp +++ b/clang/test/CXX/drs/cwg18xx.cpp @@ -206,28 +206,19 @@ namespace cwg1814 { // cwg1814: yes #endif } -namespace cwg1815 { // cwg1815: 19 +namespace cwg1815 { // cwg1815: no #if __cplusplus >= 201402L - struct A { int &&r = 0; }; + // FIXME: needs codegen test + struct A { int &&r = 0; }; // #cwg1815-A A a = {}; + // since-cxx14-warning@-1 {{lifetime extension of temporary created by aggregate initialization using a default member initializer is not yet supported; lifetime of temporary will end at the end of the full-expression}} FIXME + // since-cxx14-note@#cwg1815-A {{initializing field 
'r' with default member initializer}} struct B { int &&r = 0; }; // #cwg1815-B // since-cxx14-error@-1 {{reference member 'r' binds to a temporary object whose lifetime would be shorter than the lifetime of the constructed object}} // since-cxx14-note@#cwg1815-B {{initializing field 'r' with default member initializer}} // since-cxx14-note@#cwg1815-b {{in implicit default constructor for 'cwg1815::B' first required here}} B b; // #cwg1815-b - -#if __cplusplus >= 201703L - struct C { const int &r = 0; }; - constexpr C c = {}; // OK, since cwg1815 - static_assert(c.r == 0); - - constexpr int f() { - A a = {}; // OK, since cwg1815 - return a.r; - } - static_assert(f() == 0); -#endif #endif } diff --git a/clang/test/CXX/special/class.temporary/p6.cpp b/clang/test/CXX/special/class.temporary/p6.cpp index a6d2adf..5554363 100644 --- a/clang/test/CXX/special/class.temporary/p6.cpp +++ b/clang/test/CXX/special/class.temporary/p6.cpp @@ -269,40 +269,6 @@ void init_capture_init_list() { // CHECK: } } -void check_dr1815() { // dr1815: yes -#if __cplusplus >= 201402L - - struct A { - int &&r = 0; - ~A() {} - }; - - struct B { - A &&a = A{}; - ~B() {} - }; - B a = {}; - - // CHECK: call {{.*}}block_scope_begin_function - extern void block_scope_begin_function(); - extern void block_scope_end_function(); - block_scope_begin_function(); - { - // CHECK: call void @_ZZ12check_dr1815vEN1BD1Ev - // CHECK: call void @_ZZ12check_dr1815vEN1AD1Ev - B b = {}; - } - // CHECK: call {{.*}}block_scope_end_function - block_scope_end_function(); - - // CHECK: call {{.*}}some_other_function - extern void some_other_function(); - some_other_function(); - // CHECK: call void @_ZZ12check_dr1815vEN1BD1Ev - // CHECK: call void @_ZZ12check_dr1815vEN1AD1Ev -#endif -} - namespace P2718R0 { namespace basic { template <typename E> using T2 = std::list<E>; diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vcpopv.c 
b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vcpopv.c index 13748be..b87b225 100644 --- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vcpopv.c +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/non-overloaded/vcpopv.c @@ -16,399 +16,399 @@ #include <riscv_vector.h> -// CHECK-LABEL: @test_vcpopv_v_u8mf8( +// CHECK-LABEL: @test_vcpop_v_u8mf8( // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = call <vscale x 1 x i8> @llvm.riscv.vcpopv.nxv1i8.i64(<vscale x 1 x i8> poison, <vscale x 1 x i8> [[VS2:%.*]], i64 [[VL:%.*]]) // CHECK-NEXT: ret <vscale x 1 x i8> [[TMP0]] // -vuint8mf8_t test_vcpopv_v_u8mf8(vuint8mf8_t vs2, size_t vl) { - return __riscv_vcpopv_v_u8mf8(vs2, vl); +vuint8mf8_t test_vcpop_v_u8mf8(vuint8mf8_t vs2, size_t vl) { + return __riscv_vcpop_v_u8mf8(vs2, vl); } -// CHECK-LABEL: @test_vcpopv_v_u8mf4( +// CHECK-LABEL: @test_vcpop_v_u8mf4( // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = call <vscale x 2 x i8> @llvm.riscv.vcpopv.nxv2i8.i64(<vscale x 2 x i8> poison, <vscale x 2 x i8> [[VS2:%.*]], i64 [[VL:%.*]]) // CHECK-NEXT: ret <vscale x 2 x i8> [[TMP0]] // -vuint8mf4_t test_vcpopv_v_u8mf4(vuint8mf4_t vs2, size_t vl) { - return __riscv_vcpopv_v_u8mf4(vs2, vl); +vuint8mf4_t test_vcpop_v_u8mf4(vuint8mf4_t vs2, size_t vl) { + return __riscv_vcpop_v_u8mf4(vs2, vl); } -// CHECK-LABEL: @test_vcpopv_v_u8mf2( +// CHECK-LABEL: @test_vcpop_v_u8mf2( // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = call <vscale x 4 x i8> @llvm.riscv.vcpopv.nxv4i8.i64(<vscale x 4 x i8> poison, <vscale x 4 x i8> [[VS2:%.*]], i64 [[VL:%.*]]) // CHECK-NEXT: ret <vscale x 4 x i8> [[TMP0]] // -vuint8mf2_t test_vcpopv_v_u8mf2(vuint8mf2_t vs2, size_t vl) { - return __riscv_vcpopv_v_u8mf2(vs2, vl); +vuint8mf2_t test_vcpop_v_u8mf2(vuint8mf2_t vs2, size_t vl) { + return __riscv_vcpop_v_u8mf2(vs2, vl); } -// CHECK-LABEL: @test_vcpopv_v_u8m1( +// CHECK-LABEL: @test_vcpop_v_u8m1( // CHECK-NEXT: entry: 
// CHECK-NEXT: [[TMP0:%.*]] = call <vscale x 8 x i8> @llvm.riscv.vcpopv.nxv8i8.i64(<vscale x 8 x i8> poison, <vscale x 8 x i8> [[VS2:%.*]], i64 [[VL:%.*]]) // CHECK-NEXT: ret <vscale x 8 x i8> [[TMP0]] // -vuint8m1_t test_vcpopv_v_u8m1(vuint8m1_t vs2, size_t vl) { - return __riscv_vcpopv_v_u8m1(vs2, vl); +vuint8m1_t test_vcpop_v_u8m1(vuint8m1_t vs2, size_t vl) { + return __riscv_vcpop_v_u8m1(vs2, vl); } -// CHECK-LABEL: @test_vcpopv_v_u8m2( +// CHECK-LABEL: @test_vcpop_v_u8m2( // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = call <vscale x 16 x i8> @llvm.riscv.vcpopv.nxv16i8.i64(<vscale x 16 x i8> poison, <vscale x 16 x i8> [[VS2:%.*]], i64 [[VL:%.*]]) // CHECK-NEXT: ret <vscale x 16 x i8> [[TMP0]] // -vuint8m2_t test_vcpopv_v_u8m2(vuint8m2_t vs2, size_t vl) { - return __riscv_vcpopv_v_u8m2(vs2, vl); +vuint8m2_t test_vcpop_v_u8m2(vuint8m2_t vs2, size_t vl) { + return __riscv_vcpop_v_u8m2(vs2, vl); } -// CHECK-LABEL: @test_vcpopv_v_u8m4( +// CHECK-LABEL: @test_vcpop_v_u8m4( // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = call <vscale x 32 x i8> @llvm.riscv.vcpopv.nxv32i8.i64(<vscale x 32 x i8> poison, <vscale x 32 x i8> [[VS2:%.*]], i64 [[VL:%.*]]) // CHECK-NEXT: ret <vscale x 32 x i8> [[TMP0]] // -vuint8m4_t test_vcpopv_v_u8m4(vuint8m4_t vs2, size_t vl) { - return __riscv_vcpopv_v_u8m4(vs2, vl); +vuint8m4_t test_vcpop_v_u8m4(vuint8m4_t vs2, size_t vl) { + return __riscv_vcpop_v_u8m4(vs2, vl); } -// CHECK-LABEL: @test_vcpopv_v_u8m8( +// CHECK-LABEL: @test_vcpop_v_u8m8( // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = call <vscale x 64 x i8> @llvm.riscv.vcpopv.nxv64i8.i64(<vscale x 64 x i8> poison, <vscale x 64 x i8> [[VS2:%.*]], i64 [[VL:%.*]]) // CHECK-NEXT: ret <vscale x 64 x i8> [[TMP0]] // -vuint8m8_t test_vcpopv_v_u8m8(vuint8m8_t vs2, size_t vl) { - return __riscv_vcpopv_v_u8m8(vs2, vl); +vuint8m8_t test_vcpop_v_u8m8(vuint8m8_t vs2, size_t vl) { + return __riscv_vcpop_v_u8m8(vs2, vl); } -// CHECK-LABEL: @test_vcpopv_v_u16mf4( +// CHECK-LABEL: 
@test_vcpop_v_u16mf4( // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = call <vscale x 1 x i16> @llvm.riscv.vcpopv.nxv1i16.i64(<vscale x 1 x i16> poison, <vscale x 1 x i16> [[VS2:%.*]], i64 [[VL:%.*]]) // CHECK-NEXT: ret <vscale x 1 x i16> [[TMP0]] // -vuint16mf4_t test_vcpopv_v_u16mf4(vuint16mf4_t vs2, size_t vl) { - return __riscv_vcpopv_v_u16mf4(vs2, vl); +vuint16mf4_t test_vcpop_v_u16mf4(vuint16mf4_t vs2, size_t vl) { + return __riscv_vcpop_v_u16mf4(vs2, vl); } -// CHECK-LABEL: @test_vcpopv_v_u16mf2( +// CHECK-LABEL: @test_vcpop_v_u16mf2( // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = call <vscale x 2 x i16> @llvm.riscv.vcpopv.nxv2i16.i64(<vscale x 2 x i16> poison, <vscale x 2 x i16> [[VS2:%.*]], i64 [[VL:%.*]]) // CHECK-NEXT: ret <vscale x 2 x i16> [[TMP0]] // -vuint16mf2_t test_vcpopv_v_u16mf2(vuint16mf2_t vs2, size_t vl) { - return __riscv_vcpopv_v_u16mf2(vs2, vl); +vuint16mf2_t test_vcpop_v_u16mf2(vuint16mf2_t vs2, size_t vl) { + return __riscv_vcpop_v_u16mf2(vs2, vl); } -// CHECK-LABEL: @test_vcpopv_v_u16m1( +// CHECK-LABEL: @test_vcpop_v_u16m1( // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = call <vscale x 4 x i16> @llvm.riscv.vcpopv.nxv4i16.i64(<vscale x 4 x i16> poison, <vscale x 4 x i16> [[VS2:%.*]], i64 [[VL:%.*]]) // CHECK-NEXT: ret <vscale x 4 x i16> [[TMP0]] // -vuint16m1_t test_vcpopv_v_u16m1(vuint16m1_t vs2, size_t vl) { - return __riscv_vcpopv_v_u16m1(vs2, vl); +vuint16m1_t test_vcpop_v_u16m1(vuint16m1_t vs2, size_t vl) { + return __riscv_vcpop_v_u16m1(vs2, vl); } -// CHECK-LABEL: @test_vcpopv_v_u16m2( +// CHECK-LABEL: @test_vcpop_v_u16m2( // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = call <vscale x 8 x i16> @llvm.riscv.vcpopv.nxv8i16.i64(<vscale x 8 x i16> poison, <vscale x 8 x i16> [[VS2:%.*]], i64 [[VL:%.*]]) // CHECK-NEXT: ret <vscale x 8 x i16> [[TMP0]] // -vuint16m2_t test_vcpopv_v_u16m2(vuint16m2_t vs2, size_t vl) { - return __riscv_vcpopv_v_u16m2(vs2, vl); +vuint16m2_t test_vcpop_v_u16m2(vuint16m2_t vs2, size_t vl) { + 
return __riscv_vcpop_v_u16m2(vs2, vl); } -// CHECK-LABEL: @test_vcpopv_v_u16m4( +// CHECK-LABEL: @test_vcpop_v_u16m4( // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = call <vscale x 16 x i16> @llvm.riscv.vcpopv.nxv16i16.i64(<vscale x 16 x i16> poison, <vscale x 16 x i16> [[VS2:%.*]], i64 [[VL:%.*]]) // CHECK-NEXT: ret <vscale x 16 x i16> [[TMP0]] // -vuint16m4_t test_vcpopv_v_u16m4(vuint16m4_t vs2, size_t vl) { - return __riscv_vcpopv_v_u16m4(vs2, vl); +vuint16m4_t test_vcpop_v_u16m4(vuint16m4_t vs2, size_t vl) { + return __riscv_vcpop_v_u16m4(vs2, vl); } -// CHECK-LABEL: @test_vcpopv_v_u16m8( +// CHECK-LABEL: @test_vcpop_v_u16m8( // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = call <vscale x 32 x i16> @llvm.riscv.vcpopv.nxv32i16.i64(<vscale x 32 x i16> poison, <vscale x 32 x i16> [[VS2:%.*]], i64 [[VL:%.*]]) // CHECK-NEXT: ret <vscale x 32 x i16> [[TMP0]] // -vuint16m8_t test_vcpopv_v_u16m8(vuint16m8_t vs2, size_t vl) { - return __riscv_vcpopv_v_u16m8(vs2, vl); +vuint16m8_t test_vcpop_v_u16m8(vuint16m8_t vs2, size_t vl) { + return __riscv_vcpop_v_u16m8(vs2, vl); } -// CHECK-LABEL: @test_vcpopv_v_u32mf2( +// CHECK-LABEL: @test_vcpop_v_u32mf2( // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = call <vscale x 1 x i32> @llvm.riscv.vcpopv.nxv1i32.i64(<vscale x 1 x i32> poison, <vscale x 1 x i32> [[VS2:%.*]], i64 [[VL:%.*]]) // CHECK-NEXT: ret <vscale x 1 x i32> [[TMP0]] // -vuint32mf2_t test_vcpopv_v_u32mf2(vuint32mf2_t vs2, size_t vl) { - return __riscv_vcpopv_v_u32mf2(vs2, vl); +vuint32mf2_t test_vcpop_v_u32mf2(vuint32mf2_t vs2, size_t vl) { + return __riscv_vcpop_v_u32mf2(vs2, vl); } -// CHECK-LABEL: @test_vcpopv_v_u32m1( +// CHECK-LABEL: @test_vcpop_v_u32m1( // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = call <vscale x 2 x i32> @llvm.riscv.vcpopv.nxv2i32.i64(<vscale x 2 x i32> poison, <vscale x 2 x i32> [[VS2:%.*]], i64 [[VL:%.*]]) // CHECK-NEXT: ret <vscale x 2 x i32> [[TMP0]] // -vuint32m1_t test_vcpopv_v_u32m1(vuint32m1_t vs2, size_t vl) { - return 
__riscv_vcpopv_v_u32m1(vs2, vl); +vuint32m1_t test_vcpop_v_u32m1(vuint32m1_t vs2, size_t vl) { + return __riscv_vcpop_v_u32m1(vs2, vl); } -// CHECK-LABEL: @test_vcpopv_v_u32m2( +// CHECK-LABEL: @test_vcpop_v_u32m2( // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = call <vscale x 4 x i32> @llvm.riscv.vcpopv.nxv4i32.i64(<vscale x 4 x i32> poison, <vscale x 4 x i32> [[VS2:%.*]], i64 [[VL:%.*]]) // CHECK-NEXT: ret <vscale x 4 x i32> [[TMP0]] // -vuint32m2_t test_vcpopv_v_u32m2(vuint32m2_t vs2, size_t vl) { - return __riscv_vcpopv_v_u32m2(vs2, vl); +vuint32m2_t test_vcpop_v_u32m2(vuint32m2_t vs2, size_t vl) { + return __riscv_vcpop_v_u32m2(vs2, vl); } -// CHECK-LABEL: @test_vcpopv_v_u32m4( +// CHECK-LABEL: @test_vcpop_v_u32m4( // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = call <vscale x 8 x i32> @llvm.riscv.vcpopv.nxv8i32.i64(<vscale x 8 x i32> poison, <vscale x 8 x i32> [[VS2:%.*]], i64 [[VL:%.*]]) // CHECK-NEXT: ret <vscale x 8 x i32> [[TMP0]] // -vuint32m4_t test_vcpopv_v_u32m4(vuint32m4_t vs2, size_t vl) { - return __riscv_vcpopv_v_u32m4(vs2, vl); +vuint32m4_t test_vcpop_v_u32m4(vuint32m4_t vs2, size_t vl) { + return __riscv_vcpop_v_u32m4(vs2, vl); } -// CHECK-LABEL: @test_vcpopv_v_u32m8( +// CHECK-LABEL: @test_vcpop_v_u32m8( // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = call <vscale x 16 x i32> @llvm.riscv.vcpopv.nxv16i32.i64(<vscale x 16 x i32> poison, <vscale x 16 x i32> [[VS2:%.*]], i64 [[VL:%.*]]) // CHECK-NEXT: ret <vscale x 16 x i32> [[TMP0]] // -vuint32m8_t test_vcpopv_v_u32m8(vuint32m8_t vs2, size_t vl) { - return __riscv_vcpopv_v_u32m8(vs2, vl); +vuint32m8_t test_vcpop_v_u32m8(vuint32m8_t vs2, size_t vl) { + return __riscv_vcpop_v_u32m8(vs2, vl); } -// CHECK-LABEL: @test_vcpopv_v_u64m1( +// CHECK-LABEL: @test_vcpop_v_u64m1( // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = call <vscale x 1 x i64> @llvm.riscv.vcpopv.nxv1i64.i64(<vscale x 1 x i64> poison, <vscale x 1 x i64> [[VS2:%.*]], i64 [[VL:%.*]]) // CHECK-NEXT: ret <vscale x 1 x i64> 
[[TMP0]] // -vuint64m1_t test_vcpopv_v_u64m1(vuint64m1_t vs2, size_t vl) { - return __riscv_vcpopv_v_u64m1(vs2, vl); +vuint64m1_t test_vcpop_v_u64m1(vuint64m1_t vs2, size_t vl) { + return __riscv_vcpop_v_u64m1(vs2, vl); } -// CHECK-LABEL: @test_vcpopv_v_u64m2( +// CHECK-LABEL: @test_vcpop_v_u64m2( // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = call <vscale x 2 x i64> @llvm.riscv.vcpopv.nxv2i64.i64(<vscale x 2 x i64> poison, <vscale x 2 x i64> [[VS2:%.*]], i64 [[VL:%.*]]) // CHECK-NEXT: ret <vscale x 2 x i64> [[TMP0]] // -vuint64m2_t test_vcpopv_v_u64m2(vuint64m2_t vs2, size_t vl) { - return __riscv_vcpopv_v_u64m2(vs2, vl); +vuint64m2_t test_vcpop_v_u64m2(vuint64m2_t vs2, size_t vl) { + return __riscv_vcpop_v_u64m2(vs2, vl); } -// CHECK-LABEL: @test_vcpopv_v_u64m4( +// CHECK-LABEL: @test_vcpop_v_u64m4( // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = call <vscale x 4 x i64> @llvm.riscv.vcpopv.nxv4i64.i64(<vscale x 4 x i64> poison, <vscale x 4 x i64> [[VS2:%.*]], i64 [[VL:%.*]]) // CHECK-NEXT: ret <vscale x 4 x i64> [[TMP0]] // -vuint64m4_t test_vcpopv_v_u64m4(vuint64m4_t vs2, size_t vl) { - return __riscv_vcpopv_v_u64m4(vs2, vl); +vuint64m4_t test_vcpop_v_u64m4(vuint64m4_t vs2, size_t vl) { + return __riscv_vcpop_v_u64m4(vs2, vl); } -// CHECK-LABEL: @test_vcpopv_v_u64m8( +// CHECK-LABEL: @test_vcpop_v_u64m8( // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = call <vscale x 8 x i64> @llvm.riscv.vcpopv.nxv8i64.i64(<vscale x 8 x i64> poison, <vscale x 8 x i64> [[VS2:%.*]], i64 [[VL:%.*]]) // CHECK-NEXT: ret <vscale x 8 x i64> [[TMP0]] // -vuint64m8_t test_vcpopv_v_u64m8(vuint64m8_t vs2, size_t vl) { - return __riscv_vcpopv_v_u64m8(vs2, vl); +vuint64m8_t test_vcpop_v_u64m8(vuint64m8_t vs2, size_t vl) { + return __riscv_vcpop_v_u64m8(vs2, vl); } -// CHECK-LABEL: @test_vcpopv_v_u8mf8_m( +// CHECK-LABEL: @test_vcpop_v_u8mf8_m( // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = call <vscale x 1 x i8> @llvm.riscv.vcpopv.mask.nxv1i8.i64(<vscale x 1 x i8> poison, 
<vscale x 1 x i8> [[VS2:%.*]], <vscale x 1 x i1> [[MASK:%.*]], i64 [[VL:%.*]], i64 3) // CHECK-NEXT: ret <vscale x 1 x i8> [[TMP0]] // -vuint8mf8_t test_vcpopv_v_u8mf8_m(vbool64_t mask, vuint8mf8_t vs2, size_t vl) { - return __riscv_vcpopv_v_u8mf8_m(mask, vs2, vl); +vuint8mf8_t test_vcpop_v_u8mf8_m(vbool64_t mask, vuint8mf8_t vs2, size_t vl) { + return __riscv_vcpop_v_u8mf8_m(mask, vs2, vl); } -// CHECK-LABEL: @test_vcpopv_v_u8mf4_m( +// CHECK-LABEL: @test_vcpop_v_u8mf4_m( // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = call <vscale x 2 x i8> @llvm.riscv.vcpopv.mask.nxv2i8.i64(<vscale x 2 x i8> poison, <vscale x 2 x i8> [[VS2:%.*]], <vscale x 2 x i1> [[MASK:%.*]], i64 [[VL:%.*]], i64 3) // CHECK-NEXT: ret <vscale x 2 x i8> [[TMP0]] // -vuint8mf4_t test_vcpopv_v_u8mf4_m(vbool32_t mask, vuint8mf4_t vs2, size_t vl) { - return __riscv_vcpopv_v_u8mf4_m(mask, vs2, vl); +vuint8mf4_t test_vcpop_v_u8mf4_m(vbool32_t mask, vuint8mf4_t vs2, size_t vl) { + return __riscv_vcpop_v_u8mf4_m(mask, vs2, vl); } -// CHECK-LABEL: @test_vcpopv_v_u8mf2_m( +// CHECK-LABEL: @test_vcpop_v_u8mf2_m( // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = call <vscale x 4 x i8> @llvm.riscv.vcpopv.mask.nxv4i8.i64(<vscale x 4 x i8> poison, <vscale x 4 x i8> [[VS2:%.*]], <vscale x 4 x i1> [[MASK:%.*]], i64 [[VL:%.*]], i64 3) // CHECK-NEXT: ret <vscale x 4 x i8> [[TMP0]] // -vuint8mf2_t test_vcpopv_v_u8mf2_m(vbool16_t mask, vuint8mf2_t vs2, size_t vl) { - return __riscv_vcpopv_v_u8mf2_m(mask, vs2, vl); +vuint8mf2_t test_vcpop_v_u8mf2_m(vbool16_t mask, vuint8mf2_t vs2, size_t vl) { + return __riscv_vcpop_v_u8mf2_m(mask, vs2, vl); } -// CHECK-LABEL: @test_vcpopv_v_u8m1_m( +// CHECK-LABEL: @test_vcpop_v_u8m1_m( // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = call <vscale x 8 x i8> @llvm.riscv.vcpopv.mask.nxv8i8.i64(<vscale x 8 x i8> poison, <vscale x 8 x i8> [[VS2:%.*]], <vscale x 8 x i1> [[MASK:%.*]], i64 [[VL:%.*]], i64 3) // CHECK-NEXT: ret <vscale x 8 x i8> [[TMP0]] // -vuint8m1_t 
test_vcpopv_v_u8m1_m(vbool8_t mask, vuint8m1_t vs2, size_t vl) { - return __riscv_vcpopv_v_u8m1_m(mask, vs2, vl); +vuint8m1_t test_vcpop_v_u8m1_m(vbool8_t mask, vuint8m1_t vs2, size_t vl) { + return __riscv_vcpop_v_u8m1_m(mask, vs2, vl); } -// CHECK-LABEL: @test_vcpopv_v_u8m2_m( +// CHECK-LABEL: @test_vcpop_v_u8m2_m( // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = call <vscale x 16 x i8> @llvm.riscv.vcpopv.mask.nxv16i8.i64(<vscale x 16 x i8> poison, <vscale x 16 x i8> [[VS2:%.*]], <vscale x 16 x i1> [[MASK:%.*]], i64 [[VL:%.*]], i64 3) // CHECK-NEXT: ret <vscale x 16 x i8> [[TMP0]] // -vuint8m2_t test_vcpopv_v_u8m2_m(vbool4_t mask, vuint8m2_t vs2, size_t vl) { - return __riscv_vcpopv_v_u8m2_m(mask, vs2, vl); +vuint8m2_t test_vcpop_v_u8m2_m(vbool4_t mask, vuint8m2_t vs2, size_t vl) { + return __riscv_vcpop_v_u8m2_m(mask, vs2, vl); } -// CHECK-LABEL: @test_vcpopv_v_u8m4_m( +// CHECK-LABEL: @test_vcpop_v_u8m4_m( // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = call <vscale x 32 x i8> @llvm.riscv.vcpopv.mask.nxv32i8.i64(<vscale x 32 x i8> poison, <vscale x 32 x i8> [[VS2:%.*]], <vscale x 32 x i1> [[MASK:%.*]], i64 [[VL:%.*]], i64 3) // CHECK-NEXT: ret <vscale x 32 x i8> [[TMP0]] // -vuint8m4_t test_vcpopv_v_u8m4_m(vbool2_t mask, vuint8m4_t vs2, size_t vl) { - return __riscv_vcpopv_v_u8m4_m(mask, vs2, vl); +vuint8m4_t test_vcpop_v_u8m4_m(vbool2_t mask, vuint8m4_t vs2, size_t vl) { + return __riscv_vcpop_v_u8m4_m(mask, vs2, vl); } -// CHECK-LABEL: @test_vcpopv_v_u8m8_m( +// CHECK-LABEL: @test_vcpop_v_u8m8_m( // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = call <vscale x 64 x i8> @llvm.riscv.vcpopv.mask.nxv64i8.i64(<vscale x 64 x i8> poison, <vscale x 64 x i8> [[VS2:%.*]], <vscale x 64 x i1> [[MASK:%.*]], i64 [[VL:%.*]], i64 3) // CHECK-NEXT: ret <vscale x 64 x i8> [[TMP0]] // -vuint8m8_t test_vcpopv_v_u8m8_m(vbool1_t mask, vuint8m8_t vs2, size_t vl) { - return __riscv_vcpopv_v_u8m8_m(mask, vs2, vl); +vuint8m8_t test_vcpop_v_u8m8_m(vbool1_t mask, vuint8m8_t 
vs2, size_t vl) { + return __riscv_vcpop_v_u8m8_m(mask, vs2, vl); } -// CHECK-LABEL: @test_vcpopv_v_u16mf4_m( +// CHECK-LABEL: @test_vcpop_v_u16mf4_m( // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = call <vscale x 1 x i16> @llvm.riscv.vcpopv.mask.nxv1i16.i64(<vscale x 1 x i16> poison, <vscale x 1 x i16> [[VS2:%.*]], <vscale x 1 x i1> [[MASK:%.*]], i64 [[VL:%.*]], i64 3) // CHECK-NEXT: ret <vscale x 1 x i16> [[TMP0]] // -vuint16mf4_t test_vcpopv_v_u16mf4_m(vbool64_t mask, vuint16mf4_t vs2, size_t vl) { - return __riscv_vcpopv_v_u16mf4_m(mask, vs2, vl); +vuint16mf4_t test_vcpop_v_u16mf4_m(vbool64_t mask, vuint16mf4_t vs2, size_t vl) { + return __riscv_vcpop_v_u16mf4_m(mask, vs2, vl); } -// CHECK-LABEL: @test_vcpopv_v_u16mf2_m( +// CHECK-LABEL: @test_vcpop_v_u16mf2_m( // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = call <vscale x 2 x i16> @llvm.riscv.vcpopv.mask.nxv2i16.i64(<vscale x 2 x i16> poison, <vscale x 2 x i16> [[VS2:%.*]], <vscale x 2 x i1> [[MASK:%.*]], i64 [[VL:%.*]], i64 3) // CHECK-NEXT: ret <vscale x 2 x i16> [[TMP0]] // -vuint16mf2_t test_vcpopv_v_u16mf2_m(vbool32_t mask, vuint16mf2_t vs2, size_t vl) { - return __riscv_vcpopv_v_u16mf2_m(mask, vs2, vl); +vuint16mf2_t test_vcpop_v_u16mf2_m(vbool32_t mask, vuint16mf2_t vs2, size_t vl) { + return __riscv_vcpop_v_u16mf2_m(mask, vs2, vl); } -// CHECK-LABEL: @test_vcpopv_v_u16m1_m( +// CHECK-LABEL: @test_vcpop_v_u16m1_m( // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = call <vscale x 4 x i16> @llvm.riscv.vcpopv.mask.nxv4i16.i64(<vscale x 4 x i16> poison, <vscale x 4 x i16> [[VS2:%.*]], <vscale x 4 x i1> [[MASK:%.*]], i64 [[VL:%.*]], i64 3) // CHECK-NEXT: ret <vscale x 4 x i16> [[TMP0]] // -vuint16m1_t test_vcpopv_v_u16m1_m(vbool16_t mask, vuint16m1_t vs2, size_t vl) { - return __riscv_vcpopv_v_u16m1_m(mask, vs2, vl); +vuint16m1_t test_vcpop_v_u16m1_m(vbool16_t mask, vuint16m1_t vs2, size_t vl) { + return __riscv_vcpop_v_u16m1_m(mask, vs2, vl); } -// CHECK-LABEL: @test_vcpopv_v_u16m2_m( +// 
CHECK-LABEL: @test_vcpop_v_u16m2_m( // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = call <vscale x 8 x i16> @llvm.riscv.vcpopv.mask.nxv8i16.i64(<vscale x 8 x i16> poison, <vscale x 8 x i16> [[VS2:%.*]], <vscale x 8 x i1> [[MASK:%.*]], i64 [[VL:%.*]], i64 3) // CHECK-NEXT: ret <vscale x 8 x i16> [[TMP0]] // -vuint16m2_t test_vcpopv_v_u16m2_m(vbool8_t mask, vuint16m2_t vs2, size_t vl) { - return __riscv_vcpopv_v_u16m2_m(mask, vs2, vl); +vuint16m2_t test_vcpop_v_u16m2_m(vbool8_t mask, vuint16m2_t vs2, size_t vl) { + return __riscv_vcpop_v_u16m2_m(mask, vs2, vl); } -// CHECK-LABEL: @test_vcpopv_v_u16m4_m( +// CHECK-LABEL: @test_vcpop_v_u16m4_m( // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = call <vscale x 16 x i16> @llvm.riscv.vcpopv.mask.nxv16i16.i64(<vscale x 16 x i16> poison, <vscale x 16 x i16> [[VS2:%.*]], <vscale x 16 x i1> [[MASK:%.*]], i64 [[VL:%.*]], i64 3) // CHECK-NEXT: ret <vscale x 16 x i16> [[TMP0]] // -vuint16m4_t test_vcpopv_v_u16m4_m(vbool4_t mask, vuint16m4_t vs2, size_t vl) { - return __riscv_vcpopv_v_u16m4_m(mask, vs2, vl); +vuint16m4_t test_vcpop_v_u16m4_m(vbool4_t mask, vuint16m4_t vs2, size_t vl) { + return __riscv_vcpop_v_u16m4_m(mask, vs2, vl); } -// CHECK-LABEL: @test_vcpopv_v_u16m8_m( +// CHECK-LABEL: @test_vcpop_v_u16m8_m( // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = call <vscale x 32 x i16> @llvm.riscv.vcpopv.mask.nxv32i16.i64(<vscale x 32 x i16> poison, <vscale x 32 x i16> [[VS2:%.*]], <vscale x 32 x i1> [[MASK:%.*]], i64 [[VL:%.*]], i64 3) // CHECK-NEXT: ret <vscale x 32 x i16> [[TMP0]] // -vuint16m8_t test_vcpopv_v_u16m8_m(vbool2_t mask, vuint16m8_t vs2, size_t vl) { - return __riscv_vcpopv_v_u16m8_m(mask, vs2, vl); +vuint16m8_t test_vcpop_v_u16m8_m(vbool2_t mask, vuint16m8_t vs2, size_t vl) { + return __riscv_vcpop_v_u16m8_m(mask, vs2, vl); } -// CHECK-LABEL: @test_vcpopv_v_u32mf2_m( +// CHECK-LABEL: @test_vcpop_v_u32mf2_m( // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = call <vscale x 1 x i32> 
@llvm.riscv.vcpopv.mask.nxv1i32.i64(<vscale x 1 x i32> poison, <vscale x 1 x i32> [[VS2:%.*]], <vscale x 1 x i1> [[MASK:%.*]], i64 [[VL:%.*]], i64 3) // CHECK-NEXT: ret <vscale x 1 x i32> [[TMP0]] // -vuint32mf2_t test_vcpopv_v_u32mf2_m(vbool64_t mask, vuint32mf2_t vs2, size_t vl) { - return __riscv_vcpopv_v_u32mf2_m(mask, vs2, vl); +vuint32mf2_t test_vcpop_v_u32mf2_m(vbool64_t mask, vuint32mf2_t vs2, size_t vl) { + return __riscv_vcpop_v_u32mf2_m(mask, vs2, vl); } -// CHECK-LABEL: @test_vcpopv_v_u32m1_m( +// CHECK-LABEL: @test_vcpop_v_u32m1_m( // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = call <vscale x 2 x i32> @llvm.riscv.vcpopv.mask.nxv2i32.i64(<vscale x 2 x i32> poison, <vscale x 2 x i32> [[VS2:%.*]], <vscale x 2 x i1> [[MASK:%.*]], i64 [[VL:%.*]], i64 3) // CHECK-NEXT: ret <vscale x 2 x i32> [[TMP0]] // -vuint32m1_t test_vcpopv_v_u32m1_m(vbool32_t mask, vuint32m1_t vs2, size_t vl) { - return __riscv_vcpopv_v_u32m1_m(mask, vs2, vl); +vuint32m1_t test_vcpop_v_u32m1_m(vbool32_t mask, vuint32m1_t vs2, size_t vl) { + return __riscv_vcpop_v_u32m1_m(mask, vs2, vl); } -// CHECK-LABEL: @test_vcpopv_v_u32m2_m( +// CHECK-LABEL: @test_vcpop_v_u32m2_m( // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = call <vscale x 4 x i32> @llvm.riscv.vcpopv.mask.nxv4i32.i64(<vscale x 4 x i32> poison, <vscale x 4 x i32> [[VS2:%.*]], <vscale x 4 x i1> [[MASK:%.*]], i64 [[VL:%.*]], i64 3) // CHECK-NEXT: ret <vscale x 4 x i32> [[TMP0]] // -vuint32m2_t test_vcpopv_v_u32m2_m(vbool16_t mask, vuint32m2_t vs2, size_t vl) { - return __riscv_vcpopv_v_u32m2_m(mask, vs2, vl); +vuint32m2_t test_vcpop_v_u32m2_m(vbool16_t mask, vuint32m2_t vs2, size_t vl) { + return __riscv_vcpop_v_u32m2_m(mask, vs2, vl); } -// CHECK-LABEL: @test_vcpopv_v_u32m4_m( +// CHECK-LABEL: @test_vcpop_v_u32m4_m( // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = call <vscale x 8 x i32> @llvm.riscv.vcpopv.mask.nxv8i32.i64(<vscale x 8 x i32> poison, <vscale x 8 x i32> [[VS2:%.*]], <vscale x 8 x i1> [[MASK:%.*]], i64 
[[VL:%.*]], i64 3) // CHECK-NEXT: ret <vscale x 8 x i32> [[TMP0]] // -vuint32m4_t test_vcpopv_v_u32m4_m(vbool8_t mask, vuint32m4_t vs2, size_t vl) { - return __riscv_vcpopv_v_u32m4_m(mask, vs2, vl); +vuint32m4_t test_vcpop_v_u32m4_m(vbool8_t mask, vuint32m4_t vs2, size_t vl) { + return __riscv_vcpop_v_u32m4_m(mask, vs2, vl); } -// CHECK-LABEL: @test_vcpopv_v_u32m8_m( +// CHECK-LABEL: @test_vcpop_v_u32m8_m( // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = call <vscale x 16 x i32> @llvm.riscv.vcpopv.mask.nxv16i32.i64(<vscale x 16 x i32> poison, <vscale x 16 x i32> [[VS2:%.*]], <vscale x 16 x i1> [[MASK:%.*]], i64 [[VL:%.*]], i64 3) // CHECK-NEXT: ret <vscale x 16 x i32> [[TMP0]] // -vuint32m8_t test_vcpopv_v_u32m8_m(vbool4_t mask, vuint32m8_t vs2, size_t vl) { - return __riscv_vcpopv_v_u32m8_m(mask, vs2, vl); +vuint32m8_t test_vcpop_v_u32m8_m(vbool4_t mask, vuint32m8_t vs2, size_t vl) { + return __riscv_vcpop_v_u32m8_m(mask, vs2, vl); } -// CHECK-LABEL: @test_vcpopv_v_u64m1_m( +// CHECK-LABEL: @test_vcpop_v_u64m1_m( // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = call <vscale x 1 x i64> @llvm.riscv.vcpopv.mask.nxv1i64.i64(<vscale x 1 x i64> poison, <vscale x 1 x i64> [[VS2:%.*]], <vscale x 1 x i1> [[MASK:%.*]], i64 [[VL:%.*]], i64 3) // CHECK-NEXT: ret <vscale x 1 x i64> [[TMP0]] // -vuint64m1_t test_vcpopv_v_u64m1_m(vbool64_t mask, vuint64m1_t vs2, size_t vl) { - return __riscv_vcpopv_v_u64m1_m(mask, vs2, vl); +vuint64m1_t test_vcpop_v_u64m1_m(vbool64_t mask, vuint64m1_t vs2, size_t vl) { + return __riscv_vcpop_v_u64m1_m(mask, vs2, vl); } -// CHECK-LABEL: @test_vcpopv_v_u64m2_m( +// CHECK-LABEL: @test_vcpop_v_u64m2_m( // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = call <vscale x 2 x i64> @llvm.riscv.vcpopv.mask.nxv2i64.i64(<vscale x 2 x i64> poison, <vscale x 2 x i64> [[VS2:%.*]], <vscale x 2 x i1> [[MASK:%.*]], i64 [[VL:%.*]], i64 3) // CHECK-NEXT: ret <vscale x 2 x i64> [[TMP0]] // -vuint64m2_t test_vcpopv_v_u64m2_m(vbool32_t mask, vuint64m2_t vs2, 
size_t vl) { - return __riscv_vcpopv_v_u64m2_m(mask, vs2, vl); +vuint64m2_t test_vcpop_v_u64m2_m(vbool32_t mask, vuint64m2_t vs2, size_t vl) { + return __riscv_vcpop_v_u64m2_m(mask, vs2, vl); } -// CHECK-LABEL: @test_vcpopv_v_u64m4_m( +// CHECK-LABEL: @test_vcpop_v_u64m4_m( // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = call <vscale x 4 x i64> @llvm.riscv.vcpopv.mask.nxv4i64.i64(<vscale x 4 x i64> poison, <vscale x 4 x i64> [[VS2:%.*]], <vscale x 4 x i1> [[MASK:%.*]], i64 [[VL:%.*]], i64 3) // CHECK-NEXT: ret <vscale x 4 x i64> [[TMP0]] // -vuint64m4_t test_vcpopv_v_u64m4_m(vbool16_t mask, vuint64m4_t vs2, size_t vl) { - return __riscv_vcpopv_v_u64m4_m(mask, vs2, vl); +vuint64m4_t test_vcpop_v_u64m4_m(vbool16_t mask, vuint64m4_t vs2, size_t vl) { + return __riscv_vcpop_v_u64m4_m(mask, vs2, vl); } -// CHECK-LABEL: @test_vcpopv_v_u64m8_m( +// CHECK-LABEL: @test_vcpop_v_u64m8_m( // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = call <vscale x 8 x i64> @llvm.riscv.vcpopv.mask.nxv8i64.i64(<vscale x 8 x i64> poison, <vscale x 8 x i64> [[VS2:%.*]], <vscale x 8 x i1> [[MASK:%.*]], i64 [[VL:%.*]], i64 3) // CHECK-NEXT: ret <vscale x 8 x i64> [[TMP0]] // -vuint64m8_t test_vcpopv_v_u64m8_m(vbool8_t mask, vuint64m8_t vs2, size_t vl) { - return __riscv_vcpopv_v_u64m8_m(mask, vs2, vl); +vuint64m8_t test_vcpop_v_u64m8_m(vbool8_t mask, vuint64m8_t vs2, size_t vl) { + return __riscv_vcpop_v_u64m8_m(mask, vs2, vl); } diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vcpopv.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vcpopv.c index adb0ac9..5625b19 100644 --- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vcpopv.c +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/non-policy/overloaded/vcpopv.c @@ -16,399 +16,399 @@ #include <riscv_vector.h> -// CHECK-LABEL: @test_vcpopv_v_u8mf8( +// CHECK-LABEL: @test_vcpop_v_u8mf8( // CHECK-NEXT: entry: // CHECK-NEXT: 
[[TMP0:%.*]] = call <vscale x 1 x i8> @llvm.riscv.vcpopv.nxv1i8.i64(<vscale x 1 x i8> poison, <vscale x 1 x i8> [[VS2:%.*]], i64 [[VL:%.*]]) // CHECK-NEXT: ret <vscale x 1 x i8> [[TMP0]] // -vuint8mf8_t test_vcpopv_v_u8mf8(vuint8mf8_t vs2, size_t vl) { - return __riscv_vcpopv(vs2, vl); +vuint8mf8_t test_vcpop_v_u8mf8(vuint8mf8_t vs2, size_t vl) { + return __riscv_vcpop(vs2, vl); } -// CHECK-LABEL: @test_vcpopv_v_u8mf4( +// CHECK-LABEL: @test_vcpop_v_u8mf4( // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = call <vscale x 2 x i8> @llvm.riscv.vcpopv.nxv2i8.i64(<vscale x 2 x i8> poison, <vscale x 2 x i8> [[VS2:%.*]], i64 [[VL:%.*]]) // CHECK-NEXT: ret <vscale x 2 x i8> [[TMP0]] // -vuint8mf4_t test_vcpopv_v_u8mf4(vuint8mf4_t vs2, size_t vl) { - return __riscv_vcpopv(vs2, vl); +vuint8mf4_t test_vcpop_v_u8mf4(vuint8mf4_t vs2, size_t vl) { + return __riscv_vcpop(vs2, vl); } -// CHECK-LABEL: @test_vcpopv_v_u8mf2( +// CHECK-LABEL: @test_vcpop_v_u8mf2( // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = call <vscale x 4 x i8> @llvm.riscv.vcpopv.nxv4i8.i64(<vscale x 4 x i8> poison, <vscale x 4 x i8> [[VS2:%.*]], i64 [[VL:%.*]]) // CHECK-NEXT: ret <vscale x 4 x i8> [[TMP0]] // -vuint8mf2_t test_vcpopv_v_u8mf2(vuint8mf2_t vs2, size_t vl) { - return __riscv_vcpopv(vs2, vl); +vuint8mf2_t test_vcpop_v_u8mf2(vuint8mf2_t vs2, size_t vl) { + return __riscv_vcpop(vs2, vl); } -// CHECK-LABEL: @test_vcpopv_v_u8m1( +// CHECK-LABEL: @test_vcpop_v_u8m1( // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = call <vscale x 8 x i8> @llvm.riscv.vcpopv.nxv8i8.i64(<vscale x 8 x i8> poison, <vscale x 8 x i8> [[VS2:%.*]], i64 [[VL:%.*]]) // CHECK-NEXT: ret <vscale x 8 x i8> [[TMP0]] // -vuint8m1_t test_vcpopv_v_u8m1(vuint8m1_t vs2, size_t vl) { - return __riscv_vcpopv(vs2, vl); +vuint8m1_t test_vcpop_v_u8m1(vuint8m1_t vs2, size_t vl) { + return __riscv_vcpop(vs2, vl); } -// CHECK-LABEL: @test_vcpopv_v_u8m2( +// CHECK-LABEL: @test_vcpop_v_u8m2( // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = 
call <vscale x 16 x i8> @llvm.riscv.vcpopv.nxv16i8.i64(<vscale x 16 x i8> poison, <vscale x 16 x i8> [[VS2:%.*]], i64 [[VL:%.*]]) // CHECK-NEXT: ret <vscale x 16 x i8> [[TMP0]] // -vuint8m2_t test_vcpopv_v_u8m2(vuint8m2_t vs2, size_t vl) { - return __riscv_vcpopv(vs2, vl); +vuint8m2_t test_vcpop_v_u8m2(vuint8m2_t vs2, size_t vl) { + return __riscv_vcpop(vs2, vl); } -// CHECK-LABEL: @test_vcpopv_v_u8m4( +// CHECK-LABEL: @test_vcpop_v_u8m4( // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = call <vscale x 32 x i8> @llvm.riscv.vcpopv.nxv32i8.i64(<vscale x 32 x i8> poison, <vscale x 32 x i8> [[VS2:%.*]], i64 [[VL:%.*]]) // CHECK-NEXT: ret <vscale x 32 x i8> [[TMP0]] // -vuint8m4_t test_vcpopv_v_u8m4(vuint8m4_t vs2, size_t vl) { - return __riscv_vcpopv(vs2, vl); +vuint8m4_t test_vcpop_v_u8m4(vuint8m4_t vs2, size_t vl) { + return __riscv_vcpop(vs2, vl); } -// CHECK-LABEL: @test_vcpopv_v_u8m8( +// CHECK-LABEL: @test_vcpop_v_u8m8( // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = call <vscale x 64 x i8> @llvm.riscv.vcpopv.nxv64i8.i64(<vscale x 64 x i8> poison, <vscale x 64 x i8> [[VS2:%.*]], i64 [[VL:%.*]]) // CHECK-NEXT: ret <vscale x 64 x i8> [[TMP0]] // -vuint8m8_t test_vcpopv_v_u8m8(vuint8m8_t vs2, size_t vl) { - return __riscv_vcpopv(vs2, vl); +vuint8m8_t test_vcpop_v_u8m8(vuint8m8_t vs2, size_t vl) { + return __riscv_vcpop(vs2, vl); } -// CHECK-LABEL: @test_vcpopv_v_u16mf4( +// CHECK-LABEL: @test_vcpop_v_u16mf4( // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = call <vscale x 1 x i16> @llvm.riscv.vcpopv.nxv1i16.i64(<vscale x 1 x i16> poison, <vscale x 1 x i16> [[VS2:%.*]], i64 [[VL:%.*]]) // CHECK-NEXT: ret <vscale x 1 x i16> [[TMP0]] // -vuint16mf4_t test_vcpopv_v_u16mf4(vuint16mf4_t vs2, size_t vl) { - return __riscv_vcpopv(vs2, vl); +vuint16mf4_t test_vcpop_v_u16mf4(vuint16mf4_t vs2, size_t vl) { + return __riscv_vcpop(vs2, vl); } -// CHECK-LABEL: @test_vcpopv_v_u16mf2( +// CHECK-LABEL: @test_vcpop_v_u16mf2( // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = 
call <vscale x 2 x i16> @llvm.riscv.vcpopv.nxv2i16.i64(<vscale x 2 x i16> poison, <vscale x 2 x i16> [[VS2:%.*]], i64 [[VL:%.*]]) // CHECK-NEXT: ret <vscale x 2 x i16> [[TMP0]] // -vuint16mf2_t test_vcpopv_v_u16mf2(vuint16mf2_t vs2, size_t vl) { - return __riscv_vcpopv(vs2, vl); +vuint16mf2_t test_vcpop_v_u16mf2(vuint16mf2_t vs2, size_t vl) { + return __riscv_vcpop(vs2, vl); } -// CHECK-LABEL: @test_vcpopv_v_u16m1( +// CHECK-LABEL: @test_vcpop_v_u16m1( // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = call <vscale x 4 x i16> @llvm.riscv.vcpopv.nxv4i16.i64(<vscale x 4 x i16> poison, <vscale x 4 x i16> [[VS2:%.*]], i64 [[VL:%.*]]) // CHECK-NEXT: ret <vscale x 4 x i16> [[TMP0]] // -vuint16m1_t test_vcpopv_v_u16m1(vuint16m1_t vs2, size_t vl) { - return __riscv_vcpopv(vs2, vl); +vuint16m1_t test_vcpop_v_u16m1(vuint16m1_t vs2, size_t vl) { + return __riscv_vcpop(vs2, vl); } -// CHECK-LABEL: @test_vcpopv_v_u16m2( +// CHECK-LABEL: @test_vcpop_v_u16m2( // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = call <vscale x 8 x i16> @llvm.riscv.vcpopv.nxv8i16.i64(<vscale x 8 x i16> poison, <vscale x 8 x i16> [[VS2:%.*]], i64 [[VL:%.*]]) // CHECK-NEXT: ret <vscale x 8 x i16> [[TMP0]] // -vuint16m2_t test_vcpopv_v_u16m2(vuint16m2_t vs2, size_t vl) { - return __riscv_vcpopv(vs2, vl); +vuint16m2_t test_vcpop_v_u16m2(vuint16m2_t vs2, size_t vl) { + return __riscv_vcpop(vs2, vl); } -// CHECK-LABEL: @test_vcpopv_v_u16m4( +// CHECK-LABEL: @test_vcpop_v_u16m4( // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = call <vscale x 16 x i16> @llvm.riscv.vcpopv.nxv16i16.i64(<vscale x 16 x i16> poison, <vscale x 16 x i16> [[VS2:%.*]], i64 [[VL:%.*]]) // CHECK-NEXT: ret <vscale x 16 x i16> [[TMP0]] // -vuint16m4_t test_vcpopv_v_u16m4(vuint16m4_t vs2, size_t vl) { - return __riscv_vcpopv(vs2, vl); +vuint16m4_t test_vcpop_v_u16m4(vuint16m4_t vs2, size_t vl) { + return __riscv_vcpop(vs2, vl); } -// CHECK-LABEL: @test_vcpopv_v_u16m8( +// CHECK-LABEL: @test_vcpop_v_u16m8( // CHECK-NEXT: entry: // 
CHECK-NEXT: [[TMP0:%.*]] = call <vscale x 32 x i16> @llvm.riscv.vcpopv.nxv32i16.i64(<vscale x 32 x i16> poison, <vscale x 32 x i16> [[VS2:%.*]], i64 [[VL:%.*]]) // CHECK-NEXT: ret <vscale x 32 x i16> [[TMP0]] // -vuint16m8_t test_vcpopv_v_u16m8(vuint16m8_t vs2, size_t vl) { - return __riscv_vcpopv(vs2, vl); +vuint16m8_t test_vcpop_v_u16m8(vuint16m8_t vs2, size_t vl) { + return __riscv_vcpop(vs2, vl); } -// CHECK-LABEL: @test_vcpopv_v_u32mf2( +// CHECK-LABEL: @test_vcpop_v_u32mf2( // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = call <vscale x 1 x i32> @llvm.riscv.vcpopv.nxv1i32.i64(<vscale x 1 x i32> poison, <vscale x 1 x i32> [[VS2:%.*]], i64 [[VL:%.*]]) // CHECK-NEXT: ret <vscale x 1 x i32> [[TMP0]] // -vuint32mf2_t test_vcpopv_v_u32mf2(vuint32mf2_t vs2, size_t vl) { - return __riscv_vcpopv(vs2, vl); +vuint32mf2_t test_vcpop_v_u32mf2(vuint32mf2_t vs2, size_t vl) { + return __riscv_vcpop(vs2, vl); } -// CHECK-LABEL: @test_vcpopv_v_u32m1( +// CHECK-LABEL: @test_vcpop_v_u32m1( // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = call <vscale x 2 x i32> @llvm.riscv.vcpopv.nxv2i32.i64(<vscale x 2 x i32> poison, <vscale x 2 x i32> [[VS2:%.*]], i64 [[VL:%.*]]) // CHECK-NEXT: ret <vscale x 2 x i32> [[TMP0]] // -vuint32m1_t test_vcpopv_v_u32m1(vuint32m1_t vs2, size_t vl) { - return __riscv_vcpopv(vs2, vl); +vuint32m1_t test_vcpop_v_u32m1(vuint32m1_t vs2, size_t vl) { + return __riscv_vcpop(vs2, vl); } -// CHECK-LABEL: @test_vcpopv_v_u32m2( +// CHECK-LABEL: @test_vcpop_v_u32m2( // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = call <vscale x 4 x i32> @llvm.riscv.vcpopv.nxv4i32.i64(<vscale x 4 x i32> poison, <vscale x 4 x i32> [[VS2:%.*]], i64 [[VL:%.*]]) // CHECK-NEXT: ret <vscale x 4 x i32> [[TMP0]] // -vuint32m2_t test_vcpopv_v_u32m2(vuint32m2_t vs2, size_t vl) { - return __riscv_vcpopv(vs2, vl); +vuint32m2_t test_vcpop_v_u32m2(vuint32m2_t vs2, size_t vl) { + return __riscv_vcpop(vs2, vl); } -// CHECK-LABEL: @test_vcpopv_v_u32m4( +// CHECK-LABEL: @test_vcpop_v_u32m4( 
// CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = call <vscale x 8 x i32> @llvm.riscv.vcpopv.nxv8i32.i64(<vscale x 8 x i32> poison, <vscale x 8 x i32> [[VS2:%.*]], i64 [[VL:%.*]]) // CHECK-NEXT: ret <vscale x 8 x i32> [[TMP0]] // -vuint32m4_t test_vcpopv_v_u32m4(vuint32m4_t vs2, size_t vl) { - return __riscv_vcpopv(vs2, vl); +vuint32m4_t test_vcpop_v_u32m4(vuint32m4_t vs2, size_t vl) { + return __riscv_vcpop(vs2, vl); } -// CHECK-LABEL: @test_vcpopv_v_u32m8( +// CHECK-LABEL: @test_vcpop_v_u32m8( // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = call <vscale x 16 x i32> @llvm.riscv.vcpopv.nxv16i32.i64(<vscale x 16 x i32> poison, <vscale x 16 x i32> [[VS2:%.*]], i64 [[VL:%.*]]) // CHECK-NEXT: ret <vscale x 16 x i32> [[TMP0]] // -vuint32m8_t test_vcpopv_v_u32m8(vuint32m8_t vs2, size_t vl) { - return __riscv_vcpopv(vs2, vl); +vuint32m8_t test_vcpop_v_u32m8(vuint32m8_t vs2, size_t vl) { + return __riscv_vcpop(vs2, vl); } -// CHECK-LABEL: @test_vcpopv_v_u64m1( +// CHECK-LABEL: @test_vcpop_v_u64m1( // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = call <vscale x 1 x i64> @llvm.riscv.vcpopv.nxv1i64.i64(<vscale x 1 x i64> poison, <vscale x 1 x i64> [[VS2:%.*]], i64 [[VL:%.*]]) // CHECK-NEXT: ret <vscale x 1 x i64> [[TMP0]] // -vuint64m1_t test_vcpopv_v_u64m1(vuint64m1_t vs2, size_t vl) { - return __riscv_vcpopv(vs2, vl); +vuint64m1_t test_vcpop_v_u64m1(vuint64m1_t vs2, size_t vl) { + return __riscv_vcpop(vs2, vl); } -// CHECK-LABEL: @test_vcpopv_v_u64m2( +// CHECK-LABEL: @test_vcpop_v_u64m2( // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = call <vscale x 2 x i64> @llvm.riscv.vcpopv.nxv2i64.i64(<vscale x 2 x i64> poison, <vscale x 2 x i64> [[VS2:%.*]], i64 [[VL:%.*]]) // CHECK-NEXT: ret <vscale x 2 x i64> [[TMP0]] // -vuint64m2_t test_vcpopv_v_u64m2(vuint64m2_t vs2, size_t vl) { - return __riscv_vcpopv(vs2, vl); +vuint64m2_t test_vcpop_v_u64m2(vuint64m2_t vs2, size_t vl) { + return __riscv_vcpop(vs2, vl); } -// CHECK-LABEL: @test_vcpopv_v_u64m4( +// CHECK-LABEL: 
@test_vcpop_v_u64m4( // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = call <vscale x 4 x i64> @llvm.riscv.vcpopv.nxv4i64.i64(<vscale x 4 x i64> poison, <vscale x 4 x i64> [[VS2:%.*]], i64 [[VL:%.*]]) // CHECK-NEXT: ret <vscale x 4 x i64> [[TMP0]] // -vuint64m4_t test_vcpopv_v_u64m4(vuint64m4_t vs2, size_t vl) { - return __riscv_vcpopv(vs2, vl); +vuint64m4_t test_vcpop_v_u64m4(vuint64m4_t vs2, size_t vl) { + return __riscv_vcpop(vs2, vl); } -// CHECK-LABEL: @test_vcpopv_v_u64m8( +// CHECK-LABEL: @test_vcpop_v_u64m8( // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = call <vscale x 8 x i64> @llvm.riscv.vcpopv.nxv8i64.i64(<vscale x 8 x i64> poison, <vscale x 8 x i64> [[VS2:%.*]], i64 [[VL:%.*]]) // CHECK-NEXT: ret <vscale x 8 x i64> [[TMP0]] // -vuint64m8_t test_vcpopv_v_u64m8(vuint64m8_t vs2, size_t vl) { - return __riscv_vcpopv(vs2, vl); +vuint64m8_t test_vcpop_v_u64m8(vuint64m8_t vs2, size_t vl) { + return __riscv_vcpop(vs2, vl); } -// CHECK-LABEL: @test_vcpopv_v_u8mf8_m( +// CHECK-LABEL: @test_vcpop_v_u8mf8_m( // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = call <vscale x 1 x i8> @llvm.riscv.vcpopv.mask.nxv1i8.i64(<vscale x 1 x i8> poison, <vscale x 1 x i8> [[VS2:%.*]], <vscale x 1 x i1> [[MASK:%.*]], i64 [[VL:%.*]], i64 3) // CHECK-NEXT: ret <vscale x 1 x i8> [[TMP0]] // -vuint8mf8_t test_vcpopv_v_u8mf8_m(vbool64_t mask, vuint8mf8_t vs2, size_t vl) { - return __riscv_vcpopv(mask, vs2, vl); +vuint8mf8_t test_vcpop_v_u8mf8_m(vbool64_t mask, vuint8mf8_t vs2, size_t vl) { + return __riscv_vcpop(mask, vs2, vl); } -// CHECK-LABEL: @test_vcpopv_v_u8mf4_m( +// CHECK-LABEL: @test_vcpop_v_u8mf4_m( // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = call <vscale x 2 x i8> @llvm.riscv.vcpopv.mask.nxv2i8.i64(<vscale x 2 x i8> poison, <vscale x 2 x i8> [[VS2:%.*]], <vscale x 2 x i1> [[MASK:%.*]], i64 [[VL:%.*]], i64 3) // CHECK-NEXT: ret <vscale x 2 x i8> [[TMP0]] // -vuint8mf4_t test_vcpopv_v_u8mf4_m(vbool32_t mask, vuint8mf4_t vs2, size_t vl) { - return 
__riscv_vcpopv(mask, vs2, vl); +vuint8mf4_t test_vcpop_v_u8mf4_m(vbool32_t mask, vuint8mf4_t vs2, size_t vl) { + return __riscv_vcpop(mask, vs2, vl); } -// CHECK-LABEL: @test_vcpopv_v_u8mf2_m( +// CHECK-LABEL: @test_vcpop_v_u8mf2_m( // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = call <vscale x 4 x i8> @llvm.riscv.vcpopv.mask.nxv4i8.i64(<vscale x 4 x i8> poison, <vscale x 4 x i8> [[VS2:%.*]], <vscale x 4 x i1> [[MASK:%.*]], i64 [[VL:%.*]], i64 3) // CHECK-NEXT: ret <vscale x 4 x i8> [[TMP0]] // -vuint8mf2_t test_vcpopv_v_u8mf2_m(vbool16_t mask, vuint8mf2_t vs2, size_t vl) { - return __riscv_vcpopv(mask, vs2, vl); +vuint8mf2_t test_vcpop_v_u8mf2_m(vbool16_t mask, vuint8mf2_t vs2, size_t vl) { + return __riscv_vcpop(mask, vs2, vl); } -// CHECK-LABEL: @test_vcpopv_v_u8m1_m( +// CHECK-LABEL: @test_vcpop_v_u8m1_m( // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = call <vscale x 8 x i8> @llvm.riscv.vcpopv.mask.nxv8i8.i64(<vscale x 8 x i8> poison, <vscale x 8 x i8> [[VS2:%.*]], <vscale x 8 x i1> [[MASK:%.*]], i64 [[VL:%.*]], i64 3) // CHECK-NEXT: ret <vscale x 8 x i8> [[TMP0]] // -vuint8m1_t test_vcpopv_v_u8m1_m(vbool8_t mask, vuint8m1_t vs2, size_t vl) { - return __riscv_vcpopv(mask, vs2, vl); +vuint8m1_t test_vcpop_v_u8m1_m(vbool8_t mask, vuint8m1_t vs2, size_t vl) { + return __riscv_vcpop(mask, vs2, vl); } -// CHECK-LABEL: @test_vcpopv_v_u8m2_m( +// CHECK-LABEL: @test_vcpop_v_u8m2_m( // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = call <vscale x 16 x i8> @llvm.riscv.vcpopv.mask.nxv16i8.i64(<vscale x 16 x i8> poison, <vscale x 16 x i8> [[VS2:%.*]], <vscale x 16 x i1> [[MASK:%.*]], i64 [[VL:%.*]], i64 3) // CHECK-NEXT: ret <vscale x 16 x i8> [[TMP0]] // -vuint8m2_t test_vcpopv_v_u8m2_m(vbool4_t mask, vuint8m2_t vs2, size_t vl) { - return __riscv_vcpopv(mask, vs2, vl); +vuint8m2_t test_vcpop_v_u8m2_m(vbool4_t mask, vuint8m2_t vs2, size_t vl) { + return __riscv_vcpop(mask, vs2, vl); } -// CHECK-LABEL: @test_vcpopv_v_u8m4_m( +// CHECK-LABEL: @test_vcpop_v_u8m4_m( 
// CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = call <vscale x 32 x i8> @llvm.riscv.vcpopv.mask.nxv32i8.i64(<vscale x 32 x i8> poison, <vscale x 32 x i8> [[VS2:%.*]], <vscale x 32 x i1> [[MASK:%.*]], i64 [[VL:%.*]], i64 3) // CHECK-NEXT: ret <vscale x 32 x i8> [[TMP0]] // -vuint8m4_t test_vcpopv_v_u8m4_m(vbool2_t mask, vuint8m4_t vs2, size_t vl) { - return __riscv_vcpopv(mask, vs2, vl); +vuint8m4_t test_vcpop_v_u8m4_m(vbool2_t mask, vuint8m4_t vs2, size_t vl) { + return __riscv_vcpop(mask, vs2, vl); } -// CHECK-LABEL: @test_vcpopv_v_u8m8_m( +// CHECK-LABEL: @test_vcpop_v_u8m8_m( // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = call <vscale x 64 x i8> @llvm.riscv.vcpopv.mask.nxv64i8.i64(<vscale x 64 x i8> poison, <vscale x 64 x i8> [[VS2:%.*]], <vscale x 64 x i1> [[MASK:%.*]], i64 [[VL:%.*]], i64 3) // CHECK-NEXT: ret <vscale x 64 x i8> [[TMP0]] // -vuint8m8_t test_vcpopv_v_u8m8_m(vbool1_t mask, vuint8m8_t vs2, size_t vl) { - return __riscv_vcpopv(mask, vs2, vl); +vuint8m8_t test_vcpop_v_u8m8_m(vbool1_t mask, vuint8m8_t vs2, size_t vl) { + return __riscv_vcpop(mask, vs2, vl); } -// CHECK-LABEL: @test_vcpopv_v_u16mf4_m( +// CHECK-LABEL: @test_vcpop_v_u16mf4_m( // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = call <vscale x 1 x i16> @llvm.riscv.vcpopv.mask.nxv1i16.i64(<vscale x 1 x i16> poison, <vscale x 1 x i16> [[VS2:%.*]], <vscale x 1 x i1> [[MASK:%.*]], i64 [[VL:%.*]], i64 3) // CHECK-NEXT: ret <vscale x 1 x i16> [[TMP0]] // -vuint16mf4_t test_vcpopv_v_u16mf4_m(vbool64_t mask, vuint16mf4_t vs2, size_t vl) { - return __riscv_vcpopv(mask, vs2, vl); +vuint16mf4_t test_vcpop_v_u16mf4_m(vbool64_t mask, vuint16mf4_t vs2, size_t vl) { + return __riscv_vcpop(mask, vs2, vl); } -// CHECK-LABEL: @test_vcpopv_v_u16mf2_m( +// CHECK-LABEL: @test_vcpop_v_u16mf2_m( // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = call <vscale x 2 x i16> @llvm.riscv.vcpopv.mask.nxv2i16.i64(<vscale x 2 x i16> poison, <vscale x 2 x i16> [[VS2:%.*]], <vscale x 2 x i1> [[MASK:%.*]], i64 
[[VL:%.*]], i64 3) // CHECK-NEXT: ret <vscale x 2 x i16> [[TMP0]] // -vuint16mf2_t test_vcpopv_v_u16mf2_m(vbool32_t mask, vuint16mf2_t vs2, size_t vl) { - return __riscv_vcpopv(mask, vs2, vl); +vuint16mf2_t test_vcpop_v_u16mf2_m(vbool32_t mask, vuint16mf2_t vs2, size_t vl) { + return __riscv_vcpop(mask, vs2, vl); } -// CHECK-LABEL: @test_vcpopv_v_u16m1_m( +// CHECK-LABEL: @test_vcpop_v_u16m1_m( // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = call <vscale x 4 x i16> @llvm.riscv.vcpopv.mask.nxv4i16.i64(<vscale x 4 x i16> poison, <vscale x 4 x i16> [[VS2:%.*]], <vscale x 4 x i1> [[MASK:%.*]], i64 [[VL:%.*]], i64 3) // CHECK-NEXT: ret <vscale x 4 x i16> [[TMP0]] // -vuint16m1_t test_vcpopv_v_u16m1_m(vbool16_t mask, vuint16m1_t vs2, size_t vl) { - return __riscv_vcpopv(mask, vs2, vl); +vuint16m1_t test_vcpop_v_u16m1_m(vbool16_t mask, vuint16m1_t vs2, size_t vl) { + return __riscv_vcpop(mask, vs2, vl); } -// CHECK-LABEL: @test_vcpopv_v_u16m2_m( +// CHECK-LABEL: @test_vcpop_v_u16m2_m( // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = call <vscale x 8 x i16> @llvm.riscv.vcpopv.mask.nxv8i16.i64(<vscale x 8 x i16> poison, <vscale x 8 x i16> [[VS2:%.*]], <vscale x 8 x i1> [[MASK:%.*]], i64 [[VL:%.*]], i64 3) // CHECK-NEXT: ret <vscale x 8 x i16> [[TMP0]] // -vuint16m2_t test_vcpopv_v_u16m2_m(vbool8_t mask, vuint16m2_t vs2, size_t vl) { - return __riscv_vcpopv(mask, vs2, vl); +vuint16m2_t test_vcpop_v_u16m2_m(vbool8_t mask, vuint16m2_t vs2, size_t vl) { + return __riscv_vcpop(mask, vs2, vl); } -// CHECK-LABEL: @test_vcpopv_v_u16m4_m( +// CHECK-LABEL: @test_vcpop_v_u16m4_m( // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = call <vscale x 16 x i16> @llvm.riscv.vcpopv.mask.nxv16i16.i64(<vscale x 16 x i16> poison, <vscale x 16 x i16> [[VS2:%.*]], <vscale x 16 x i1> [[MASK:%.*]], i64 [[VL:%.*]], i64 3) // CHECK-NEXT: ret <vscale x 16 x i16> [[TMP0]] // -vuint16m4_t test_vcpopv_v_u16m4_m(vbool4_t mask, vuint16m4_t vs2, size_t vl) { - return __riscv_vcpopv(mask, vs2, vl); 
+vuint16m4_t test_vcpop_v_u16m4_m(vbool4_t mask, vuint16m4_t vs2, size_t vl) { + return __riscv_vcpop(mask, vs2, vl); } -// CHECK-LABEL: @test_vcpopv_v_u16m8_m( +// CHECK-LABEL: @test_vcpop_v_u16m8_m( // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = call <vscale x 32 x i16> @llvm.riscv.vcpopv.mask.nxv32i16.i64(<vscale x 32 x i16> poison, <vscale x 32 x i16> [[VS2:%.*]], <vscale x 32 x i1> [[MASK:%.*]], i64 [[VL:%.*]], i64 3) // CHECK-NEXT: ret <vscale x 32 x i16> [[TMP0]] // -vuint16m8_t test_vcpopv_v_u16m8_m(vbool2_t mask, vuint16m8_t vs2, size_t vl) { - return __riscv_vcpopv(mask, vs2, vl); +vuint16m8_t test_vcpop_v_u16m8_m(vbool2_t mask, vuint16m8_t vs2, size_t vl) { + return __riscv_vcpop(mask, vs2, vl); } -// CHECK-LABEL: @test_vcpopv_v_u32mf2_m( +// CHECK-LABEL: @test_vcpop_v_u32mf2_m( // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = call <vscale x 1 x i32> @llvm.riscv.vcpopv.mask.nxv1i32.i64(<vscale x 1 x i32> poison, <vscale x 1 x i32> [[VS2:%.*]], <vscale x 1 x i1> [[MASK:%.*]], i64 [[VL:%.*]], i64 3) // CHECK-NEXT: ret <vscale x 1 x i32> [[TMP0]] // -vuint32mf2_t test_vcpopv_v_u32mf2_m(vbool64_t mask, vuint32mf2_t vs2, size_t vl) { - return __riscv_vcpopv(mask, vs2, vl); +vuint32mf2_t test_vcpop_v_u32mf2_m(vbool64_t mask, vuint32mf2_t vs2, size_t vl) { + return __riscv_vcpop(mask, vs2, vl); } -// CHECK-LABEL: @test_vcpopv_v_u32m1_m( +// CHECK-LABEL: @test_vcpop_v_u32m1_m( // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = call <vscale x 2 x i32> @llvm.riscv.vcpopv.mask.nxv2i32.i64(<vscale x 2 x i32> poison, <vscale x 2 x i32> [[VS2:%.*]], <vscale x 2 x i1> [[MASK:%.*]], i64 [[VL:%.*]], i64 3) // CHECK-NEXT: ret <vscale x 2 x i32> [[TMP0]] // -vuint32m1_t test_vcpopv_v_u32m1_m(vbool32_t mask, vuint32m1_t vs2, size_t vl) { - return __riscv_vcpopv(mask, vs2, vl); +vuint32m1_t test_vcpop_v_u32m1_m(vbool32_t mask, vuint32m1_t vs2, size_t vl) { + return __riscv_vcpop(mask, vs2, vl); } -// CHECK-LABEL: @test_vcpopv_v_u32m2_m( +// CHECK-LABEL: 
@test_vcpop_v_u32m2_m( // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = call <vscale x 4 x i32> @llvm.riscv.vcpopv.mask.nxv4i32.i64(<vscale x 4 x i32> poison, <vscale x 4 x i32> [[VS2:%.*]], <vscale x 4 x i1> [[MASK:%.*]], i64 [[VL:%.*]], i64 3) // CHECK-NEXT: ret <vscale x 4 x i32> [[TMP0]] // -vuint32m2_t test_vcpopv_v_u32m2_m(vbool16_t mask, vuint32m2_t vs2, size_t vl) { - return __riscv_vcpopv(mask, vs2, vl); +vuint32m2_t test_vcpop_v_u32m2_m(vbool16_t mask, vuint32m2_t vs2, size_t vl) { + return __riscv_vcpop(mask, vs2, vl); } -// CHECK-LABEL: @test_vcpopv_v_u32m4_m( +// CHECK-LABEL: @test_vcpop_v_u32m4_m( // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = call <vscale x 8 x i32> @llvm.riscv.vcpopv.mask.nxv8i32.i64(<vscale x 8 x i32> poison, <vscale x 8 x i32> [[VS2:%.*]], <vscale x 8 x i1> [[MASK:%.*]], i64 [[VL:%.*]], i64 3) // CHECK-NEXT: ret <vscale x 8 x i32> [[TMP0]] // -vuint32m4_t test_vcpopv_v_u32m4_m(vbool8_t mask, vuint32m4_t vs2, size_t vl) { - return __riscv_vcpopv(mask, vs2, vl); +vuint32m4_t test_vcpop_v_u32m4_m(vbool8_t mask, vuint32m4_t vs2, size_t vl) { + return __riscv_vcpop(mask, vs2, vl); } -// CHECK-LABEL: @test_vcpopv_v_u32m8_m( +// CHECK-LABEL: @test_vcpop_v_u32m8_m( // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = call <vscale x 16 x i32> @llvm.riscv.vcpopv.mask.nxv16i32.i64(<vscale x 16 x i32> poison, <vscale x 16 x i32> [[VS2:%.*]], <vscale x 16 x i1> [[MASK:%.*]], i64 [[VL:%.*]], i64 3) // CHECK-NEXT: ret <vscale x 16 x i32> [[TMP0]] // -vuint32m8_t test_vcpopv_v_u32m8_m(vbool4_t mask, vuint32m8_t vs2, size_t vl) { - return __riscv_vcpopv(mask, vs2, vl); +vuint32m8_t test_vcpop_v_u32m8_m(vbool4_t mask, vuint32m8_t vs2, size_t vl) { + return __riscv_vcpop(mask, vs2, vl); } -// CHECK-LABEL: @test_vcpopv_v_u64m1_m( +// CHECK-LABEL: @test_vcpop_v_u64m1_m( // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = call <vscale x 1 x i64> @llvm.riscv.vcpopv.mask.nxv1i64.i64(<vscale x 1 x i64> poison, <vscale x 1 x i64> [[VS2:%.*]], 
<vscale x 1 x i1> [[MASK:%.*]], i64 [[VL:%.*]], i64 3) // CHECK-NEXT: ret <vscale x 1 x i64> [[TMP0]] // -vuint64m1_t test_vcpopv_v_u64m1_m(vbool64_t mask, vuint64m1_t vs2, size_t vl) { - return __riscv_vcpopv(mask, vs2, vl); +vuint64m1_t test_vcpop_v_u64m1_m(vbool64_t mask, vuint64m1_t vs2, size_t vl) { + return __riscv_vcpop(mask, vs2, vl); } -// CHECK-LABEL: @test_vcpopv_v_u64m2_m( +// CHECK-LABEL: @test_vcpop_v_u64m2_m( // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = call <vscale x 2 x i64> @llvm.riscv.vcpopv.mask.nxv2i64.i64(<vscale x 2 x i64> poison, <vscale x 2 x i64> [[VS2:%.*]], <vscale x 2 x i1> [[MASK:%.*]], i64 [[VL:%.*]], i64 3) // CHECK-NEXT: ret <vscale x 2 x i64> [[TMP0]] // -vuint64m2_t test_vcpopv_v_u64m2_m(vbool32_t mask, vuint64m2_t vs2, size_t vl) { - return __riscv_vcpopv(mask, vs2, vl); +vuint64m2_t test_vcpop_v_u64m2_m(vbool32_t mask, vuint64m2_t vs2, size_t vl) { + return __riscv_vcpop(mask, vs2, vl); } -// CHECK-LABEL: @test_vcpopv_v_u64m4_m( +// CHECK-LABEL: @test_vcpop_v_u64m4_m( // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = call <vscale x 4 x i64> @llvm.riscv.vcpopv.mask.nxv4i64.i64(<vscale x 4 x i64> poison, <vscale x 4 x i64> [[VS2:%.*]], <vscale x 4 x i1> [[MASK:%.*]], i64 [[VL:%.*]], i64 3) // CHECK-NEXT: ret <vscale x 4 x i64> [[TMP0]] // -vuint64m4_t test_vcpopv_v_u64m4_m(vbool16_t mask, vuint64m4_t vs2, size_t vl) { - return __riscv_vcpopv(mask, vs2, vl); +vuint64m4_t test_vcpop_v_u64m4_m(vbool16_t mask, vuint64m4_t vs2, size_t vl) { + return __riscv_vcpop(mask, vs2, vl); } -// CHECK-LABEL: @test_vcpopv_v_u64m8_m( +// CHECK-LABEL: @test_vcpop_v_u64m8_m( // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = call <vscale x 8 x i64> @llvm.riscv.vcpopv.mask.nxv8i64.i64(<vscale x 8 x i64> poison, <vscale x 8 x i64> [[VS2:%.*]], <vscale x 8 x i1> [[MASK:%.*]], i64 [[VL:%.*]], i64 3) // CHECK-NEXT: ret <vscale x 8 x i64> [[TMP0]] // -vuint64m8_t test_vcpopv_v_u64m8_m(vbool8_t mask, vuint64m8_t vs2, size_t vl) { - return 
__riscv_vcpopv(mask, vs2, vl); +vuint64m8_t test_vcpop_v_u64m8_m(vbool8_t mask, vuint64m8_t vs2, size_t vl) { + return __riscv_vcpop(mask, vs2, vl); } diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/non-overloaded/vcpopv.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/non-overloaded/vcpopv.c index 8a1f2e1..3a11033 100644 --- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/non-overloaded/vcpopv.c +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/non-overloaded/vcpopv.c @@ -16,795 +16,795 @@ #include <riscv_vector.h> -// CHECK-LABEL: @test_vcpopv_v_u8mf8_tu( +// CHECK-LABEL: @test_vcpop_v_u8mf8_tu( // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = call <vscale x 1 x i8> @llvm.riscv.vcpopv.nxv1i8.i64(<vscale x 1 x i8> [[MASKEDOFF:%.*]], <vscale x 1 x i8> [[VS2:%.*]], i64 [[VL:%.*]]) // CHECK-NEXT: ret <vscale x 1 x i8> [[TMP0]] // -vuint8mf8_t test_vcpopv_v_u8mf8_tu(vuint8mf8_t maskedoff, vuint8mf8_t vs2, size_t vl) { - return __riscv_vcpopv_v_u8mf8_tu(maskedoff, vs2, vl); +vuint8mf8_t test_vcpop_v_u8mf8_tu(vuint8mf8_t maskedoff, vuint8mf8_t vs2, size_t vl) { + return __riscv_vcpop_v_u8mf8_tu(maskedoff, vs2, vl); } -// CHECK-LABEL: @test_vcpopv_v_u8mf4_tu( +// CHECK-LABEL: @test_vcpop_v_u8mf4_tu( // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = call <vscale x 2 x i8> @llvm.riscv.vcpopv.nxv2i8.i64(<vscale x 2 x i8> [[MASKEDOFF:%.*]], <vscale x 2 x i8> [[VS2:%.*]], i64 [[VL:%.*]]) // CHECK-NEXT: ret <vscale x 2 x i8> [[TMP0]] // -vuint8mf4_t test_vcpopv_v_u8mf4_tu(vuint8mf4_t maskedoff, vuint8mf4_t vs2, size_t vl) { - return __riscv_vcpopv_v_u8mf4_tu(maskedoff, vs2, vl); +vuint8mf4_t test_vcpop_v_u8mf4_tu(vuint8mf4_t maskedoff, vuint8mf4_t vs2, size_t vl) { + return __riscv_vcpop_v_u8mf4_tu(maskedoff, vs2, vl); } -// CHECK-LABEL: @test_vcpopv_v_u8mf2_tu( +// CHECK-LABEL: @test_vcpop_v_u8mf2_tu( // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = call <vscale x 4 x i8> 
@llvm.riscv.vcpopv.nxv4i8.i64(<vscale x 4 x i8> [[MASKEDOFF:%.*]], <vscale x 4 x i8> [[VS2:%.*]], i64 [[VL:%.*]]) // CHECK-NEXT: ret <vscale x 4 x i8> [[TMP0]] // -vuint8mf2_t test_vcpopv_v_u8mf2_tu(vuint8mf2_t maskedoff, vuint8mf2_t vs2, size_t vl) { - return __riscv_vcpopv_v_u8mf2_tu(maskedoff, vs2, vl); +vuint8mf2_t test_vcpop_v_u8mf2_tu(vuint8mf2_t maskedoff, vuint8mf2_t vs2, size_t vl) { + return __riscv_vcpop_v_u8mf2_tu(maskedoff, vs2, vl); } -// CHECK-LABEL: @test_vcpopv_v_u8m1_tu( +// CHECK-LABEL: @test_vcpop_v_u8m1_tu( // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = call <vscale x 8 x i8> @llvm.riscv.vcpopv.nxv8i8.i64(<vscale x 8 x i8> [[MASKEDOFF:%.*]], <vscale x 8 x i8> [[VS2:%.*]], i64 [[VL:%.*]]) // CHECK-NEXT: ret <vscale x 8 x i8> [[TMP0]] // -vuint8m1_t test_vcpopv_v_u8m1_tu(vuint8m1_t maskedoff, vuint8m1_t vs2, size_t vl) { - return __riscv_vcpopv_v_u8m1_tu(maskedoff, vs2, vl); +vuint8m1_t test_vcpop_v_u8m1_tu(vuint8m1_t maskedoff, vuint8m1_t vs2, size_t vl) { + return __riscv_vcpop_v_u8m1_tu(maskedoff, vs2, vl); } -// CHECK-LABEL: @test_vcpopv_v_u8m2_tu( +// CHECK-LABEL: @test_vcpop_v_u8m2_tu( // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = call <vscale x 16 x i8> @llvm.riscv.vcpopv.nxv16i8.i64(<vscale x 16 x i8> [[MASKEDOFF:%.*]], <vscale x 16 x i8> [[VS2:%.*]], i64 [[VL:%.*]]) // CHECK-NEXT: ret <vscale x 16 x i8> [[TMP0]] // -vuint8m2_t test_vcpopv_v_u8m2_tu(vuint8m2_t maskedoff, vuint8m2_t vs2, size_t vl) { - return __riscv_vcpopv_v_u8m2_tu(maskedoff, vs2, vl); +vuint8m2_t test_vcpop_v_u8m2_tu(vuint8m2_t maskedoff, vuint8m2_t vs2, size_t vl) { + return __riscv_vcpop_v_u8m2_tu(maskedoff, vs2, vl); } -// CHECK-LABEL: @test_vcpopv_v_u8m4_tu( +// CHECK-LABEL: @test_vcpop_v_u8m4_tu( // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = call <vscale x 32 x i8> @llvm.riscv.vcpopv.nxv32i8.i64(<vscale x 32 x i8> [[MASKEDOFF:%.*]], <vscale x 32 x i8> [[VS2:%.*]], i64 [[VL:%.*]]) // CHECK-NEXT: ret <vscale x 32 x i8> [[TMP0]] // -vuint8m4_t 
test_vcpopv_v_u8m4_tu(vuint8m4_t maskedoff, vuint8m4_t vs2, size_t vl) { - return __riscv_vcpopv_v_u8m4_tu(maskedoff, vs2, vl); +vuint8m4_t test_vcpop_v_u8m4_tu(vuint8m4_t maskedoff, vuint8m4_t vs2, size_t vl) { + return __riscv_vcpop_v_u8m4_tu(maskedoff, vs2, vl); } -// CHECK-LABEL: @test_vcpopv_v_u8m8_tu( +// CHECK-LABEL: @test_vcpop_v_u8m8_tu( // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = call <vscale x 64 x i8> @llvm.riscv.vcpopv.nxv64i8.i64(<vscale x 64 x i8> [[MASKEDOFF:%.*]], <vscale x 64 x i8> [[VS2:%.*]], i64 [[VL:%.*]]) // CHECK-NEXT: ret <vscale x 64 x i8> [[TMP0]] // -vuint8m8_t test_vcpopv_v_u8m8_tu(vuint8m8_t maskedoff, vuint8m8_t vs2, size_t vl) { - return __riscv_vcpopv_v_u8m8_tu(maskedoff, vs2, vl); +vuint8m8_t test_vcpop_v_u8m8_tu(vuint8m8_t maskedoff, vuint8m8_t vs2, size_t vl) { + return __riscv_vcpop_v_u8m8_tu(maskedoff, vs2, vl); } -// CHECK-LABEL: @test_vcpopv_v_u16mf4_tu( +// CHECK-LABEL: @test_vcpop_v_u16mf4_tu( // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = call <vscale x 1 x i16> @llvm.riscv.vcpopv.nxv1i16.i64(<vscale x 1 x i16> [[MASKEDOFF:%.*]], <vscale x 1 x i16> [[VS2:%.*]], i64 [[VL:%.*]]) // CHECK-NEXT: ret <vscale x 1 x i16> [[TMP0]] // -vuint16mf4_t test_vcpopv_v_u16mf4_tu(vuint16mf4_t maskedoff, vuint16mf4_t vs2, size_t vl) { - return __riscv_vcpopv_v_u16mf4_tu(maskedoff, vs2, vl); +vuint16mf4_t test_vcpop_v_u16mf4_tu(vuint16mf4_t maskedoff, vuint16mf4_t vs2, size_t vl) { + return __riscv_vcpop_v_u16mf4_tu(maskedoff, vs2, vl); } -// CHECK-LABEL: @test_vcpopv_v_u16mf2_tu( +// CHECK-LABEL: @test_vcpop_v_u16mf2_tu( // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = call <vscale x 2 x i16> @llvm.riscv.vcpopv.nxv2i16.i64(<vscale x 2 x i16> [[MASKEDOFF:%.*]], <vscale x 2 x i16> [[VS2:%.*]], i64 [[VL:%.*]]) // CHECK-NEXT: ret <vscale x 2 x i16> [[TMP0]] // -vuint16mf2_t test_vcpopv_v_u16mf2_tu(vuint16mf2_t maskedoff, vuint16mf2_t vs2, size_t vl) { - return __riscv_vcpopv_v_u16mf2_tu(maskedoff, vs2, vl); +vuint16mf2_t 
test_vcpop_v_u16mf2_tu(vuint16mf2_t maskedoff, vuint16mf2_t vs2, size_t vl) { + return __riscv_vcpop_v_u16mf2_tu(maskedoff, vs2, vl); } -// CHECK-LABEL: @test_vcpopv_v_u16m1_tu( +// CHECK-LABEL: @test_vcpop_v_u16m1_tu( // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = call <vscale x 4 x i16> @llvm.riscv.vcpopv.nxv4i16.i64(<vscale x 4 x i16> [[MASKEDOFF:%.*]], <vscale x 4 x i16> [[VS2:%.*]], i64 [[VL:%.*]]) // CHECK-NEXT: ret <vscale x 4 x i16> [[TMP0]] // -vuint16m1_t test_vcpopv_v_u16m1_tu(vuint16m1_t maskedoff, vuint16m1_t vs2, size_t vl) { - return __riscv_vcpopv_v_u16m1_tu(maskedoff, vs2, vl); +vuint16m1_t test_vcpop_v_u16m1_tu(vuint16m1_t maskedoff, vuint16m1_t vs2, size_t vl) { + return __riscv_vcpop_v_u16m1_tu(maskedoff, vs2, vl); } -// CHECK-LABEL: @test_vcpopv_v_u16m2_tu( +// CHECK-LABEL: @test_vcpop_v_u16m2_tu( // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = call <vscale x 8 x i16> @llvm.riscv.vcpopv.nxv8i16.i64(<vscale x 8 x i16> [[MASKEDOFF:%.*]], <vscale x 8 x i16> [[VS2:%.*]], i64 [[VL:%.*]]) // CHECK-NEXT: ret <vscale x 8 x i16> [[TMP0]] // -vuint16m2_t test_vcpopv_v_u16m2_tu(vuint16m2_t maskedoff, vuint16m2_t vs2, size_t vl) { - return __riscv_vcpopv_v_u16m2_tu(maskedoff, vs2, vl); +vuint16m2_t test_vcpop_v_u16m2_tu(vuint16m2_t maskedoff, vuint16m2_t vs2, size_t vl) { + return __riscv_vcpop_v_u16m2_tu(maskedoff, vs2, vl); } -// CHECK-LABEL: @test_vcpopv_v_u16m4_tu( +// CHECK-LABEL: @test_vcpop_v_u16m4_tu( // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = call <vscale x 16 x i16> @llvm.riscv.vcpopv.nxv16i16.i64(<vscale x 16 x i16> [[MASKEDOFF:%.*]], <vscale x 16 x i16> [[VS2:%.*]], i64 [[VL:%.*]]) // CHECK-NEXT: ret <vscale x 16 x i16> [[TMP0]] // -vuint16m4_t test_vcpopv_v_u16m4_tu(vuint16m4_t maskedoff, vuint16m4_t vs2, size_t vl) { - return __riscv_vcpopv_v_u16m4_tu(maskedoff, vs2, vl); +vuint16m4_t test_vcpop_v_u16m4_tu(vuint16m4_t maskedoff, vuint16m4_t vs2, size_t vl) { + return __riscv_vcpop_v_u16m4_tu(maskedoff, vs2, vl); } -// 
CHECK-LABEL: @test_vcpopv_v_u16m8_tu( +// CHECK-LABEL: @test_vcpop_v_u16m8_tu( // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = call <vscale x 32 x i16> @llvm.riscv.vcpopv.nxv32i16.i64(<vscale x 32 x i16> [[MASKEDOFF:%.*]], <vscale x 32 x i16> [[VS2:%.*]], i64 [[VL:%.*]]) // CHECK-NEXT: ret <vscale x 32 x i16> [[TMP0]] // -vuint16m8_t test_vcpopv_v_u16m8_tu(vuint16m8_t maskedoff, vuint16m8_t vs2, size_t vl) { - return __riscv_vcpopv_v_u16m8_tu(maskedoff, vs2, vl); +vuint16m8_t test_vcpop_v_u16m8_tu(vuint16m8_t maskedoff, vuint16m8_t vs2, size_t vl) { + return __riscv_vcpop_v_u16m8_tu(maskedoff, vs2, vl); } -// CHECK-LABEL: @test_vcpopv_v_u32mf2_tu( +// CHECK-LABEL: @test_vcpop_v_u32mf2_tu( // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = call <vscale x 1 x i32> @llvm.riscv.vcpopv.nxv1i32.i64(<vscale x 1 x i32> [[MASKEDOFF:%.*]], <vscale x 1 x i32> [[VS2:%.*]], i64 [[VL:%.*]]) // CHECK-NEXT: ret <vscale x 1 x i32> [[TMP0]] // -vuint32mf2_t test_vcpopv_v_u32mf2_tu(vuint32mf2_t maskedoff, vuint32mf2_t vs2, size_t vl) { - return __riscv_vcpopv_v_u32mf2_tu(maskedoff, vs2, vl); +vuint32mf2_t test_vcpop_v_u32mf2_tu(vuint32mf2_t maskedoff, vuint32mf2_t vs2, size_t vl) { + return __riscv_vcpop_v_u32mf2_tu(maskedoff, vs2, vl); } -// CHECK-LABEL: @test_vcpopv_v_u32m1_tu( +// CHECK-LABEL: @test_vcpop_v_u32m1_tu( // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = call <vscale x 2 x i32> @llvm.riscv.vcpopv.nxv2i32.i64(<vscale x 2 x i32> [[MASKEDOFF:%.*]], <vscale x 2 x i32> [[VS2:%.*]], i64 [[VL:%.*]]) // CHECK-NEXT: ret <vscale x 2 x i32> [[TMP0]] // -vuint32m1_t test_vcpopv_v_u32m1_tu(vuint32m1_t maskedoff, vuint32m1_t vs2, size_t vl) { - return __riscv_vcpopv_v_u32m1_tu(maskedoff, vs2, vl); +vuint32m1_t test_vcpop_v_u32m1_tu(vuint32m1_t maskedoff, vuint32m1_t vs2, size_t vl) { + return __riscv_vcpop_v_u32m1_tu(maskedoff, vs2, vl); } -// CHECK-LABEL: @test_vcpopv_v_u32m2_tu( +// CHECK-LABEL: @test_vcpop_v_u32m2_tu( // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = 
call <vscale x 4 x i32> @llvm.riscv.vcpopv.nxv4i32.i64(<vscale x 4 x i32> [[MASKEDOFF:%.*]], <vscale x 4 x i32> [[VS2:%.*]], i64 [[VL:%.*]]) // CHECK-NEXT: ret <vscale x 4 x i32> [[TMP0]] // -vuint32m2_t test_vcpopv_v_u32m2_tu(vuint32m2_t maskedoff, vuint32m2_t vs2, size_t vl) { - return __riscv_vcpopv_v_u32m2_tu(maskedoff, vs2, vl); +vuint32m2_t test_vcpop_v_u32m2_tu(vuint32m2_t maskedoff, vuint32m2_t vs2, size_t vl) { + return __riscv_vcpop_v_u32m2_tu(maskedoff, vs2, vl); } -// CHECK-LABEL: @test_vcpopv_v_u32m4_tu( +// CHECK-LABEL: @test_vcpop_v_u32m4_tu( // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = call <vscale x 8 x i32> @llvm.riscv.vcpopv.nxv8i32.i64(<vscale x 8 x i32> [[MASKEDOFF:%.*]], <vscale x 8 x i32> [[VS2:%.*]], i64 [[VL:%.*]]) // CHECK-NEXT: ret <vscale x 8 x i32> [[TMP0]] // -vuint32m4_t test_vcpopv_v_u32m4_tu(vuint32m4_t maskedoff, vuint32m4_t vs2, size_t vl) { - return __riscv_vcpopv_v_u32m4_tu(maskedoff, vs2, vl); +vuint32m4_t test_vcpop_v_u32m4_tu(vuint32m4_t maskedoff, vuint32m4_t vs2, size_t vl) { + return __riscv_vcpop_v_u32m4_tu(maskedoff, vs2, vl); } -// CHECK-LABEL: @test_vcpopv_v_u32m8_tu( +// CHECK-LABEL: @test_vcpop_v_u32m8_tu( // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = call <vscale x 16 x i32> @llvm.riscv.vcpopv.nxv16i32.i64(<vscale x 16 x i32> [[MASKEDOFF:%.*]], <vscale x 16 x i32> [[VS2:%.*]], i64 [[VL:%.*]]) // CHECK-NEXT: ret <vscale x 16 x i32> [[TMP0]] // -vuint32m8_t test_vcpopv_v_u32m8_tu(vuint32m8_t maskedoff, vuint32m8_t vs2, size_t vl) { - return __riscv_vcpopv_v_u32m8_tu(maskedoff, vs2, vl); +vuint32m8_t test_vcpop_v_u32m8_tu(vuint32m8_t maskedoff, vuint32m8_t vs2, size_t vl) { + return __riscv_vcpop_v_u32m8_tu(maskedoff, vs2, vl); } -// CHECK-LABEL: @test_vcpopv_v_u64m1_tu( +// CHECK-LABEL: @test_vcpop_v_u64m1_tu( // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = call <vscale x 1 x i64> @llvm.riscv.vcpopv.nxv1i64.i64(<vscale x 1 x i64> [[MASKEDOFF:%.*]], <vscale x 1 x i64> [[VS2:%.*]], i64 [[VL:%.*]]) // 
CHECK-NEXT: ret <vscale x 1 x i64> [[TMP0]] // -vuint64m1_t test_vcpopv_v_u64m1_tu(vuint64m1_t maskedoff, vuint64m1_t vs2, size_t vl) { - return __riscv_vcpopv_v_u64m1_tu(maskedoff, vs2, vl); +vuint64m1_t test_vcpop_v_u64m1_tu(vuint64m1_t maskedoff, vuint64m1_t vs2, size_t vl) { + return __riscv_vcpop_v_u64m1_tu(maskedoff, vs2, vl); } -// CHECK-LABEL: @test_vcpopv_v_u64m2_tu( +// CHECK-LABEL: @test_vcpop_v_u64m2_tu( // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = call <vscale x 2 x i64> @llvm.riscv.vcpopv.nxv2i64.i64(<vscale x 2 x i64> [[MASKEDOFF:%.*]], <vscale x 2 x i64> [[VS2:%.*]], i64 [[VL:%.*]]) // CHECK-NEXT: ret <vscale x 2 x i64> [[TMP0]] // -vuint64m2_t test_vcpopv_v_u64m2_tu(vuint64m2_t maskedoff, vuint64m2_t vs2, size_t vl) { - return __riscv_vcpopv_v_u64m2_tu(maskedoff, vs2, vl); +vuint64m2_t test_vcpop_v_u64m2_tu(vuint64m2_t maskedoff, vuint64m2_t vs2, size_t vl) { + return __riscv_vcpop_v_u64m2_tu(maskedoff, vs2, vl); } -// CHECK-LABEL: @test_vcpopv_v_u64m4_tu( +// CHECK-LABEL: @test_vcpop_v_u64m4_tu( // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = call <vscale x 4 x i64> @llvm.riscv.vcpopv.nxv4i64.i64(<vscale x 4 x i64> [[MASKEDOFF:%.*]], <vscale x 4 x i64> [[VS2:%.*]], i64 [[VL:%.*]]) // CHECK-NEXT: ret <vscale x 4 x i64> [[TMP0]] // -vuint64m4_t test_vcpopv_v_u64m4_tu(vuint64m4_t maskedoff, vuint64m4_t vs2, size_t vl) { - return __riscv_vcpopv_v_u64m4_tu(maskedoff, vs2, vl); +vuint64m4_t test_vcpop_v_u64m4_tu(vuint64m4_t maskedoff, vuint64m4_t vs2, size_t vl) { + return __riscv_vcpop_v_u64m4_tu(maskedoff, vs2, vl); } -// CHECK-LABEL: @test_vcpopv_v_u64m8_tu( +// CHECK-LABEL: @test_vcpop_v_u64m8_tu( // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = call <vscale x 8 x i64> @llvm.riscv.vcpopv.nxv8i64.i64(<vscale x 8 x i64> [[MASKEDOFF:%.*]], <vscale x 8 x i64> [[VS2:%.*]], i64 [[VL:%.*]]) // CHECK-NEXT: ret <vscale x 8 x i64> [[TMP0]] // -vuint64m8_t test_vcpopv_v_u64m8_tu(vuint64m8_t maskedoff, vuint64m8_t vs2, size_t vl) { - return 
__riscv_vcpopv_v_u64m8_tu(maskedoff, vs2, vl); +vuint64m8_t test_vcpop_v_u64m8_tu(vuint64m8_t maskedoff, vuint64m8_t vs2, size_t vl) { + return __riscv_vcpop_v_u64m8_tu(maskedoff, vs2, vl); } -// CHECK-LABEL: @test_vcpopv_v_u8mf8_tum( +// CHECK-LABEL: @test_vcpop_v_u8mf8_tum( // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = call <vscale x 1 x i8> @llvm.riscv.vcpopv.mask.nxv1i8.i64(<vscale x 1 x i8> [[MASKEDOFF:%.*]], <vscale x 1 x i8> [[VS2:%.*]], <vscale x 1 x i1> [[MASK:%.*]], i64 [[VL:%.*]], i64 2) // CHECK-NEXT: ret <vscale x 1 x i8> [[TMP0]] // -vuint8mf8_t test_vcpopv_v_u8mf8_tum(vbool64_t mask, vuint8mf8_t maskedoff, vuint8mf8_t vs2, size_t vl) { - return __riscv_vcpopv_v_u8mf8_tum(mask, maskedoff, vs2, vl); +vuint8mf8_t test_vcpop_v_u8mf8_tum(vbool64_t mask, vuint8mf8_t maskedoff, vuint8mf8_t vs2, size_t vl) { + return __riscv_vcpop_v_u8mf8_tum(mask, maskedoff, vs2, vl); } -// CHECK-LABEL: @test_vcpopv_v_u8mf4_tum( +// CHECK-LABEL: @test_vcpop_v_u8mf4_tum( // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = call <vscale x 2 x i8> @llvm.riscv.vcpopv.mask.nxv2i8.i64(<vscale x 2 x i8> [[MASKEDOFF:%.*]], <vscale x 2 x i8> [[VS2:%.*]], <vscale x 2 x i1> [[MASK:%.*]], i64 [[VL:%.*]], i64 2) // CHECK-NEXT: ret <vscale x 2 x i8> [[TMP0]] // -vuint8mf4_t test_vcpopv_v_u8mf4_tum(vbool32_t mask, vuint8mf4_t maskedoff, vuint8mf4_t vs2, size_t vl) { - return __riscv_vcpopv_v_u8mf4_tum(mask, maskedoff, vs2, vl); +vuint8mf4_t test_vcpop_v_u8mf4_tum(vbool32_t mask, vuint8mf4_t maskedoff, vuint8mf4_t vs2, size_t vl) { + return __riscv_vcpop_v_u8mf4_tum(mask, maskedoff, vs2, vl); } -// CHECK-LABEL: @test_vcpopv_v_u8mf2_tum( +// CHECK-LABEL: @test_vcpop_v_u8mf2_tum( // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = call <vscale x 4 x i8> @llvm.riscv.vcpopv.mask.nxv4i8.i64(<vscale x 4 x i8> [[MASKEDOFF:%.*]], <vscale x 4 x i8> [[VS2:%.*]], <vscale x 4 x i1> [[MASK:%.*]], i64 [[VL:%.*]], i64 2) // CHECK-NEXT: ret <vscale x 4 x i8> [[TMP0]] // -vuint8mf2_t 
test_vcpopv_v_u8mf2_tum(vbool16_t mask, vuint8mf2_t maskedoff, vuint8mf2_t vs2, size_t vl) { - return __riscv_vcpopv_v_u8mf2_tum(mask, maskedoff, vs2, vl); +vuint8mf2_t test_vcpop_v_u8mf2_tum(vbool16_t mask, vuint8mf2_t maskedoff, vuint8mf2_t vs2, size_t vl) { + return __riscv_vcpop_v_u8mf2_tum(mask, maskedoff, vs2, vl); } -// CHECK-LABEL: @test_vcpopv_v_u8m1_tum( +// CHECK-LABEL: @test_vcpop_v_u8m1_tum( // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = call <vscale x 8 x i8> @llvm.riscv.vcpopv.mask.nxv8i8.i64(<vscale x 8 x i8> [[MASKEDOFF:%.*]], <vscale x 8 x i8> [[VS2:%.*]], <vscale x 8 x i1> [[MASK:%.*]], i64 [[VL:%.*]], i64 2) // CHECK-NEXT: ret <vscale x 8 x i8> [[TMP0]] // -vuint8m1_t test_vcpopv_v_u8m1_tum(vbool8_t mask, vuint8m1_t maskedoff, vuint8m1_t vs2, size_t vl) { - return __riscv_vcpopv_v_u8m1_tum(mask, maskedoff, vs2, vl); +vuint8m1_t test_vcpop_v_u8m1_tum(vbool8_t mask, vuint8m1_t maskedoff, vuint8m1_t vs2, size_t vl) { + return __riscv_vcpop_v_u8m1_tum(mask, maskedoff, vs2, vl); } -// CHECK-LABEL: @test_vcpopv_v_u8m2_tum( +// CHECK-LABEL: @test_vcpop_v_u8m2_tum( // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = call <vscale x 16 x i8> @llvm.riscv.vcpopv.mask.nxv16i8.i64(<vscale x 16 x i8> [[MASKEDOFF:%.*]], <vscale x 16 x i8> [[VS2:%.*]], <vscale x 16 x i1> [[MASK:%.*]], i64 [[VL:%.*]], i64 2) // CHECK-NEXT: ret <vscale x 16 x i8> [[TMP0]] // -vuint8m2_t test_vcpopv_v_u8m2_tum(vbool4_t mask, vuint8m2_t maskedoff, vuint8m2_t vs2, size_t vl) { - return __riscv_vcpopv_v_u8m2_tum(mask, maskedoff, vs2, vl); +vuint8m2_t test_vcpop_v_u8m2_tum(vbool4_t mask, vuint8m2_t maskedoff, vuint8m2_t vs2, size_t vl) { + return __riscv_vcpop_v_u8m2_tum(mask, maskedoff, vs2, vl); } -// CHECK-LABEL: @test_vcpopv_v_u8m4_tum( +// CHECK-LABEL: @test_vcpop_v_u8m4_tum( // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = call <vscale x 32 x i8> @llvm.riscv.vcpopv.mask.nxv32i8.i64(<vscale x 32 x i8> [[MASKEDOFF:%.*]], <vscale x 32 x i8> [[VS2:%.*]], <vscale x 32 x i1> 
[[MASK:%.*]], i64 [[VL:%.*]], i64 2) // CHECK-NEXT: ret <vscale x 32 x i8> [[TMP0]] // -vuint8m4_t test_vcpopv_v_u8m4_tum(vbool2_t mask, vuint8m4_t maskedoff, vuint8m4_t vs2, size_t vl) { - return __riscv_vcpopv_v_u8m4_tum(mask, maskedoff, vs2, vl); +vuint8m4_t test_vcpop_v_u8m4_tum(vbool2_t mask, vuint8m4_t maskedoff, vuint8m4_t vs2, size_t vl) { + return __riscv_vcpop_v_u8m4_tum(mask, maskedoff, vs2, vl); } -// CHECK-LABEL: @test_vcpopv_v_u8m8_tum( +// CHECK-LABEL: @test_vcpop_v_u8m8_tum( // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = call <vscale x 64 x i8> @llvm.riscv.vcpopv.mask.nxv64i8.i64(<vscale x 64 x i8> [[MASKEDOFF:%.*]], <vscale x 64 x i8> [[VS2:%.*]], <vscale x 64 x i1> [[MASK:%.*]], i64 [[VL:%.*]], i64 2) // CHECK-NEXT: ret <vscale x 64 x i8> [[TMP0]] // -vuint8m8_t test_vcpopv_v_u8m8_tum(vbool1_t mask, vuint8m8_t maskedoff, vuint8m8_t vs2, size_t vl) { - return __riscv_vcpopv_v_u8m8_tum(mask, maskedoff, vs2, vl); +vuint8m8_t test_vcpop_v_u8m8_tum(vbool1_t mask, vuint8m8_t maskedoff, vuint8m8_t vs2, size_t vl) { + return __riscv_vcpop_v_u8m8_tum(mask, maskedoff, vs2, vl); } -// CHECK-LABEL: @test_vcpopv_v_u16mf4_tum( +// CHECK-LABEL: @test_vcpop_v_u16mf4_tum( // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = call <vscale x 1 x i16> @llvm.riscv.vcpopv.mask.nxv1i16.i64(<vscale x 1 x i16> [[MASKEDOFF:%.*]], <vscale x 1 x i16> [[VS2:%.*]], <vscale x 1 x i1> [[MASK:%.*]], i64 [[VL:%.*]], i64 2) // CHECK-NEXT: ret <vscale x 1 x i16> [[TMP0]] // -vuint16mf4_t test_vcpopv_v_u16mf4_tum(vbool64_t mask, vuint16mf4_t maskedoff, vuint16mf4_t vs2, size_t vl) { - return __riscv_vcpopv_v_u16mf4_tum(mask, maskedoff, vs2, vl); +vuint16mf4_t test_vcpop_v_u16mf4_tum(vbool64_t mask, vuint16mf4_t maskedoff, vuint16mf4_t vs2, size_t vl) { + return __riscv_vcpop_v_u16mf4_tum(mask, maskedoff, vs2, vl); } -// CHECK-LABEL: @test_vcpopv_v_u16mf2_tum( +// CHECK-LABEL: @test_vcpop_v_u16mf2_tum( // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = call <vscale x 2 x i16> 
@llvm.riscv.vcpopv.mask.nxv2i16.i64(<vscale x 2 x i16> [[MASKEDOFF:%.*]], <vscale x 2 x i16> [[VS2:%.*]], <vscale x 2 x i1> [[MASK:%.*]], i64 [[VL:%.*]], i64 2) // CHECK-NEXT: ret <vscale x 2 x i16> [[TMP0]] // -vuint16mf2_t test_vcpopv_v_u16mf2_tum(vbool32_t mask, vuint16mf2_t maskedoff, vuint16mf2_t vs2, size_t vl) { - return __riscv_vcpopv_v_u16mf2_tum(mask, maskedoff, vs2, vl); +vuint16mf2_t test_vcpop_v_u16mf2_tum(vbool32_t mask, vuint16mf2_t maskedoff, vuint16mf2_t vs2, size_t vl) { + return __riscv_vcpop_v_u16mf2_tum(mask, maskedoff, vs2, vl); } -// CHECK-LABEL: @test_vcpopv_v_u16m1_tum( +// CHECK-LABEL: @test_vcpop_v_u16m1_tum( // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = call <vscale x 4 x i16> @llvm.riscv.vcpopv.mask.nxv4i16.i64(<vscale x 4 x i16> [[MASKEDOFF:%.*]], <vscale x 4 x i16> [[VS2:%.*]], <vscale x 4 x i1> [[MASK:%.*]], i64 [[VL:%.*]], i64 2) // CHECK-NEXT: ret <vscale x 4 x i16> [[TMP0]] // -vuint16m1_t test_vcpopv_v_u16m1_tum(vbool16_t mask, vuint16m1_t maskedoff, vuint16m1_t vs2, size_t vl) { - return __riscv_vcpopv_v_u16m1_tum(mask, maskedoff, vs2, vl); +vuint16m1_t test_vcpop_v_u16m1_tum(vbool16_t mask, vuint16m1_t maskedoff, vuint16m1_t vs2, size_t vl) { + return __riscv_vcpop_v_u16m1_tum(mask, maskedoff, vs2, vl); } -// CHECK-LABEL: @test_vcpopv_v_u16m2_tum( +// CHECK-LABEL: @test_vcpop_v_u16m2_tum( // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = call <vscale x 8 x i16> @llvm.riscv.vcpopv.mask.nxv8i16.i64(<vscale x 8 x i16> [[MASKEDOFF:%.*]], <vscale x 8 x i16> [[VS2:%.*]], <vscale x 8 x i1> [[MASK:%.*]], i64 [[VL:%.*]], i64 2) // CHECK-NEXT: ret <vscale x 8 x i16> [[TMP0]] // -vuint16m2_t test_vcpopv_v_u16m2_tum(vbool8_t mask, vuint16m2_t maskedoff, vuint16m2_t vs2, size_t vl) { - return __riscv_vcpopv_v_u16m2_tum(mask, maskedoff, vs2, vl); +vuint16m2_t test_vcpop_v_u16m2_tum(vbool8_t mask, vuint16m2_t maskedoff, vuint16m2_t vs2, size_t vl) { + return __riscv_vcpop_v_u16m2_tum(mask, maskedoff, vs2, vl); } -// CHECK-LABEL: 
@test_vcpopv_v_u16m4_tum( +// CHECK-LABEL: @test_vcpop_v_u16m4_tum( // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = call <vscale x 16 x i16> @llvm.riscv.vcpopv.mask.nxv16i16.i64(<vscale x 16 x i16> [[MASKEDOFF:%.*]], <vscale x 16 x i16> [[VS2:%.*]], <vscale x 16 x i1> [[MASK:%.*]], i64 [[VL:%.*]], i64 2) // CHECK-NEXT: ret <vscale x 16 x i16> [[TMP0]] // -vuint16m4_t test_vcpopv_v_u16m4_tum(vbool4_t mask, vuint16m4_t maskedoff, vuint16m4_t vs2, size_t vl) { - return __riscv_vcpopv_v_u16m4_tum(mask, maskedoff, vs2, vl); +vuint16m4_t test_vcpop_v_u16m4_tum(vbool4_t mask, vuint16m4_t maskedoff, vuint16m4_t vs2, size_t vl) { + return __riscv_vcpop_v_u16m4_tum(mask, maskedoff, vs2, vl); } -// CHECK-LABEL: @test_vcpopv_v_u16m8_tum( +// CHECK-LABEL: @test_vcpop_v_u16m8_tum( // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = call <vscale x 32 x i16> @llvm.riscv.vcpopv.mask.nxv32i16.i64(<vscale x 32 x i16> [[MASKEDOFF:%.*]], <vscale x 32 x i16> [[VS2:%.*]], <vscale x 32 x i1> [[MASK:%.*]], i64 [[VL:%.*]], i64 2) // CHECK-NEXT: ret <vscale x 32 x i16> [[TMP0]] // -vuint16m8_t test_vcpopv_v_u16m8_tum(vbool2_t mask, vuint16m8_t maskedoff, vuint16m8_t vs2, size_t vl) { - return __riscv_vcpopv_v_u16m8_tum(mask, maskedoff, vs2, vl); +vuint16m8_t test_vcpop_v_u16m8_tum(vbool2_t mask, vuint16m8_t maskedoff, vuint16m8_t vs2, size_t vl) { + return __riscv_vcpop_v_u16m8_tum(mask, maskedoff, vs2, vl); } -// CHECK-LABEL: @test_vcpopv_v_u32mf2_tum( +// CHECK-LABEL: @test_vcpop_v_u32mf2_tum( // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = call <vscale x 1 x i32> @llvm.riscv.vcpopv.mask.nxv1i32.i64(<vscale x 1 x i32> [[MASKEDOFF:%.*]], <vscale x 1 x i32> [[VS2:%.*]], <vscale x 1 x i1> [[MASK:%.*]], i64 [[VL:%.*]], i64 2) // CHECK-NEXT: ret <vscale x 1 x i32> [[TMP0]] // -vuint32mf2_t test_vcpopv_v_u32mf2_tum(vbool64_t mask, vuint32mf2_t maskedoff, vuint32mf2_t vs2, size_t vl) { - return __riscv_vcpopv_v_u32mf2_tum(mask, maskedoff, vs2, vl); +vuint32mf2_t 
test_vcpop_v_u32mf2_tum(vbool64_t mask, vuint32mf2_t maskedoff, vuint32mf2_t vs2, size_t vl) { + return __riscv_vcpop_v_u32mf2_tum(mask, maskedoff, vs2, vl); } -// CHECK-LABEL: @test_vcpopv_v_u32m1_tum( +// CHECK-LABEL: @test_vcpop_v_u32m1_tum( // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = call <vscale x 2 x i32> @llvm.riscv.vcpopv.mask.nxv2i32.i64(<vscale x 2 x i32> [[MASKEDOFF:%.*]], <vscale x 2 x i32> [[VS2:%.*]], <vscale x 2 x i1> [[MASK:%.*]], i64 [[VL:%.*]], i64 2) // CHECK-NEXT: ret <vscale x 2 x i32> [[TMP0]] // -vuint32m1_t test_vcpopv_v_u32m1_tum(vbool32_t mask, vuint32m1_t maskedoff, vuint32m1_t vs2, size_t vl) { - return __riscv_vcpopv_v_u32m1_tum(mask, maskedoff, vs2, vl); +vuint32m1_t test_vcpop_v_u32m1_tum(vbool32_t mask, vuint32m1_t maskedoff, vuint32m1_t vs2, size_t vl) { + return __riscv_vcpop_v_u32m1_tum(mask, maskedoff, vs2, vl); } -// CHECK-LABEL: @test_vcpopv_v_u32m2_tum( +// CHECK-LABEL: @test_vcpop_v_u32m2_tum( // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = call <vscale x 4 x i32> @llvm.riscv.vcpopv.mask.nxv4i32.i64(<vscale x 4 x i32> [[MASKEDOFF:%.*]], <vscale x 4 x i32> [[VS2:%.*]], <vscale x 4 x i1> [[MASK:%.*]], i64 [[VL:%.*]], i64 2) // CHECK-NEXT: ret <vscale x 4 x i32> [[TMP0]] // -vuint32m2_t test_vcpopv_v_u32m2_tum(vbool16_t mask, vuint32m2_t maskedoff, vuint32m2_t vs2, size_t vl) { - return __riscv_vcpopv_v_u32m2_tum(mask, maskedoff, vs2, vl); +vuint32m2_t test_vcpop_v_u32m2_tum(vbool16_t mask, vuint32m2_t maskedoff, vuint32m2_t vs2, size_t vl) { + return __riscv_vcpop_v_u32m2_tum(mask, maskedoff, vs2, vl); } -// CHECK-LABEL: @test_vcpopv_v_u32m4_tum( +// CHECK-LABEL: @test_vcpop_v_u32m4_tum( // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = call <vscale x 8 x i32> @llvm.riscv.vcpopv.mask.nxv8i32.i64(<vscale x 8 x i32> [[MASKEDOFF:%.*]], <vscale x 8 x i32> [[VS2:%.*]], <vscale x 8 x i1> [[MASK:%.*]], i64 [[VL:%.*]], i64 2) // CHECK-NEXT: ret <vscale x 8 x i32> [[TMP0]] // -vuint32m4_t test_vcpopv_v_u32m4_tum(vbool8_t 
mask, vuint32m4_t maskedoff, vuint32m4_t vs2, size_t vl) { - return __riscv_vcpopv_v_u32m4_tum(mask, maskedoff, vs2, vl); +vuint32m4_t test_vcpop_v_u32m4_tum(vbool8_t mask, vuint32m4_t maskedoff, vuint32m4_t vs2, size_t vl) { + return __riscv_vcpop_v_u32m4_tum(mask, maskedoff, vs2, vl); } -// CHECK-LABEL: @test_vcpopv_v_u32m8_tum( +// CHECK-LABEL: @test_vcpop_v_u32m8_tum( // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = call <vscale x 16 x i32> @llvm.riscv.vcpopv.mask.nxv16i32.i64(<vscale x 16 x i32> [[MASKEDOFF:%.*]], <vscale x 16 x i32> [[VS2:%.*]], <vscale x 16 x i1> [[MASK:%.*]], i64 [[VL:%.*]], i64 2) // CHECK-NEXT: ret <vscale x 16 x i32> [[TMP0]] // -vuint32m8_t test_vcpopv_v_u32m8_tum(vbool4_t mask, vuint32m8_t maskedoff, vuint32m8_t vs2, size_t vl) { - return __riscv_vcpopv_v_u32m8_tum(mask, maskedoff, vs2, vl); +vuint32m8_t test_vcpop_v_u32m8_tum(vbool4_t mask, vuint32m8_t maskedoff, vuint32m8_t vs2, size_t vl) { + return __riscv_vcpop_v_u32m8_tum(mask, maskedoff, vs2, vl); } -// CHECK-LABEL: @test_vcpopv_v_u64m1_tum( +// CHECK-LABEL: @test_vcpop_v_u64m1_tum( // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = call <vscale x 1 x i64> @llvm.riscv.vcpopv.mask.nxv1i64.i64(<vscale x 1 x i64> [[MASKEDOFF:%.*]], <vscale x 1 x i64> [[VS2:%.*]], <vscale x 1 x i1> [[MASK:%.*]], i64 [[VL:%.*]], i64 2) // CHECK-NEXT: ret <vscale x 1 x i64> [[TMP0]] // -vuint64m1_t test_vcpopv_v_u64m1_tum(vbool64_t mask, vuint64m1_t maskedoff, vuint64m1_t vs2, size_t vl) { - return __riscv_vcpopv_v_u64m1_tum(mask, maskedoff, vs2, vl); +vuint64m1_t test_vcpop_v_u64m1_tum(vbool64_t mask, vuint64m1_t maskedoff, vuint64m1_t vs2, size_t vl) { + return __riscv_vcpop_v_u64m1_tum(mask, maskedoff, vs2, vl); } -// CHECK-LABEL: @test_vcpopv_v_u64m2_tum( +// CHECK-LABEL: @test_vcpop_v_u64m2_tum( // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = call <vscale x 2 x i64> @llvm.riscv.vcpopv.mask.nxv2i64.i64(<vscale x 2 x i64> [[MASKEDOFF:%.*]], <vscale x 2 x i64> [[VS2:%.*]], <vscale x 2 x i1> 
[[MASK:%.*]], i64 [[VL:%.*]], i64 2) // CHECK-NEXT: ret <vscale x 2 x i64> [[TMP0]] // -vuint64m2_t test_vcpopv_v_u64m2_tum(vbool32_t mask, vuint64m2_t maskedoff, vuint64m2_t vs2, size_t vl) { - return __riscv_vcpopv_v_u64m2_tum(mask, maskedoff, vs2, vl); +vuint64m2_t test_vcpop_v_u64m2_tum(vbool32_t mask, vuint64m2_t maskedoff, vuint64m2_t vs2, size_t vl) { + return __riscv_vcpop_v_u64m2_tum(mask, maskedoff, vs2, vl); } -// CHECK-LABEL: @test_vcpopv_v_u64m4_tum( +// CHECK-LABEL: @test_vcpop_v_u64m4_tum( // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = call <vscale x 4 x i64> @llvm.riscv.vcpopv.mask.nxv4i64.i64(<vscale x 4 x i64> [[MASKEDOFF:%.*]], <vscale x 4 x i64> [[VS2:%.*]], <vscale x 4 x i1> [[MASK:%.*]], i64 [[VL:%.*]], i64 2) // CHECK-NEXT: ret <vscale x 4 x i64> [[TMP0]] // -vuint64m4_t test_vcpopv_v_u64m4_tum(vbool16_t mask, vuint64m4_t maskedoff, vuint64m4_t vs2, size_t vl) { - return __riscv_vcpopv_v_u64m4_tum(mask, maskedoff, vs2, vl); +vuint64m4_t test_vcpop_v_u64m4_tum(vbool16_t mask, vuint64m4_t maskedoff, vuint64m4_t vs2, size_t vl) { + return __riscv_vcpop_v_u64m4_tum(mask, maskedoff, vs2, vl); } -// CHECK-LABEL: @test_vcpopv_v_u64m8_tum( +// CHECK-LABEL: @test_vcpop_v_u64m8_tum( // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = call <vscale x 8 x i64> @llvm.riscv.vcpopv.mask.nxv8i64.i64(<vscale x 8 x i64> [[MASKEDOFF:%.*]], <vscale x 8 x i64> [[VS2:%.*]], <vscale x 8 x i1> [[MASK:%.*]], i64 [[VL:%.*]], i64 2) // CHECK-NEXT: ret <vscale x 8 x i64> [[TMP0]] // -vuint64m8_t test_vcpopv_v_u64m8_tum(vbool8_t mask, vuint64m8_t maskedoff, vuint64m8_t vs2, size_t vl) { - return __riscv_vcpopv_v_u64m8_tum(mask, maskedoff, vs2, vl); +vuint64m8_t test_vcpop_v_u64m8_tum(vbool8_t mask, vuint64m8_t maskedoff, vuint64m8_t vs2, size_t vl) { + return __riscv_vcpop_v_u64m8_tum(mask, maskedoff, vs2, vl); } -// CHECK-LABEL: @test_vcpopv_v_u8mf8_tumu( +// CHECK-LABEL: @test_vcpop_v_u8mf8_tumu( // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = call <vscale x 1 
x i8> @llvm.riscv.vcpopv.mask.nxv1i8.i64(<vscale x 1 x i8> [[MASKEDOFF:%.*]], <vscale x 1 x i8> [[VS2:%.*]], <vscale x 1 x i1> [[MASK:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-NEXT: ret <vscale x 1 x i8> [[TMP0]] // -vuint8mf8_t test_vcpopv_v_u8mf8_tumu(vbool64_t mask, vuint8mf8_t maskedoff, vuint8mf8_t vs2, size_t vl) { - return __riscv_vcpopv_v_u8mf8_tumu(mask, maskedoff, vs2, vl); +vuint8mf8_t test_vcpop_v_u8mf8_tumu(vbool64_t mask, vuint8mf8_t maskedoff, vuint8mf8_t vs2, size_t vl) { + return __riscv_vcpop_v_u8mf8_tumu(mask, maskedoff, vs2, vl); } -// CHECK-LABEL: @test_vcpopv_v_u8mf4_tumu( +// CHECK-LABEL: @test_vcpop_v_u8mf4_tumu( // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = call <vscale x 2 x i8> @llvm.riscv.vcpopv.mask.nxv2i8.i64(<vscale x 2 x i8> [[MASKEDOFF:%.*]], <vscale x 2 x i8> [[VS2:%.*]], <vscale x 2 x i1> [[MASK:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-NEXT: ret <vscale x 2 x i8> [[TMP0]] // -vuint8mf4_t test_vcpopv_v_u8mf4_tumu(vbool32_t mask, vuint8mf4_t maskedoff, vuint8mf4_t vs2, size_t vl) { - return __riscv_vcpopv_v_u8mf4_tumu(mask, maskedoff, vs2, vl); +vuint8mf4_t test_vcpop_v_u8mf4_tumu(vbool32_t mask, vuint8mf4_t maskedoff, vuint8mf4_t vs2, size_t vl) { + return __riscv_vcpop_v_u8mf4_tumu(mask, maskedoff, vs2, vl); } -// CHECK-LABEL: @test_vcpopv_v_u8mf2_tumu( +// CHECK-LABEL: @test_vcpop_v_u8mf2_tumu( // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = call <vscale x 4 x i8> @llvm.riscv.vcpopv.mask.nxv4i8.i64(<vscale x 4 x i8> [[MASKEDOFF:%.*]], <vscale x 4 x i8> [[VS2:%.*]], <vscale x 4 x i1> [[MASK:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-NEXT: ret <vscale x 4 x i8> [[TMP0]] // -vuint8mf2_t test_vcpopv_v_u8mf2_tumu(vbool16_t mask, vuint8mf2_t maskedoff, vuint8mf2_t vs2, size_t vl) { - return __riscv_vcpopv_v_u8mf2_tumu(mask, maskedoff, vs2, vl); +vuint8mf2_t test_vcpop_v_u8mf2_tumu(vbool16_t mask, vuint8mf2_t maskedoff, vuint8mf2_t vs2, size_t vl) { + return __riscv_vcpop_v_u8mf2_tumu(mask, maskedoff, vs2, vl); } -// CHECK-LABEL: 
@test_vcpopv_v_u8m1_tumu( +// CHECK-LABEL: @test_vcpop_v_u8m1_tumu( // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = call <vscale x 8 x i8> @llvm.riscv.vcpopv.mask.nxv8i8.i64(<vscale x 8 x i8> [[MASKEDOFF:%.*]], <vscale x 8 x i8> [[VS2:%.*]], <vscale x 8 x i1> [[MASK:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-NEXT: ret <vscale x 8 x i8> [[TMP0]] // -vuint8m1_t test_vcpopv_v_u8m1_tumu(vbool8_t mask, vuint8m1_t maskedoff, vuint8m1_t vs2, size_t vl) { - return __riscv_vcpopv_v_u8m1_tumu(mask, maskedoff, vs2, vl); +vuint8m1_t test_vcpop_v_u8m1_tumu(vbool8_t mask, vuint8m1_t maskedoff, vuint8m1_t vs2, size_t vl) { + return __riscv_vcpop_v_u8m1_tumu(mask, maskedoff, vs2, vl); } -// CHECK-LABEL: @test_vcpopv_v_u8m2_tumu( +// CHECK-LABEL: @test_vcpop_v_u8m2_tumu( // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = call <vscale x 16 x i8> @llvm.riscv.vcpopv.mask.nxv16i8.i64(<vscale x 16 x i8> [[MASKEDOFF:%.*]], <vscale x 16 x i8> [[VS2:%.*]], <vscale x 16 x i1> [[MASK:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-NEXT: ret <vscale x 16 x i8> [[TMP0]] // -vuint8m2_t test_vcpopv_v_u8m2_tumu(vbool4_t mask, vuint8m2_t maskedoff, vuint8m2_t vs2, size_t vl) { - return __riscv_vcpopv_v_u8m2_tumu(mask, maskedoff, vs2, vl); +vuint8m2_t test_vcpop_v_u8m2_tumu(vbool4_t mask, vuint8m2_t maskedoff, vuint8m2_t vs2, size_t vl) { + return __riscv_vcpop_v_u8m2_tumu(mask, maskedoff, vs2, vl); } -// CHECK-LABEL: @test_vcpopv_v_u8m4_tumu( +// CHECK-LABEL: @test_vcpop_v_u8m4_tumu( // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = call <vscale x 32 x i8> @llvm.riscv.vcpopv.mask.nxv32i8.i64(<vscale x 32 x i8> [[MASKEDOFF:%.*]], <vscale x 32 x i8> [[VS2:%.*]], <vscale x 32 x i1> [[MASK:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-NEXT: ret <vscale x 32 x i8> [[TMP0]] // -vuint8m4_t test_vcpopv_v_u8m4_tumu(vbool2_t mask, vuint8m4_t maskedoff, vuint8m4_t vs2, size_t vl) { - return __riscv_vcpopv_v_u8m4_tumu(mask, maskedoff, vs2, vl); +vuint8m4_t test_vcpop_v_u8m4_tumu(vbool2_t mask, vuint8m4_t maskedoff, 
vuint8m4_t vs2, size_t vl) { + return __riscv_vcpop_v_u8m4_tumu(mask, maskedoff, vs2, vl); } -// CHECK-LABEL: @test_vcpopv_v_u8m8_tumu( +// CHECK-LABEL: @test_vcpop_v_u8m8_tumu( // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = call <vscale x 64 x i8> @llvm.riscv.vcpopv.mask.nxv64i8.i64(<vscale x 64 x i8> [[MASKEDOFF:%.*]], <vscale x 64 x i8> [[VS2:%.*]], <vscale x 64 x i1> [[MASK:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-NEXT: ret <vscale x 64 x i8> [[TMP0]] // -vuint8m8_t test_vcpopv_v_u8m8_tumu(vbool1_t mask, vuint8m8_t maskedoff, vuint8m8_t vs2, size_t vl) { - return __riscv_vcpopv_v_u8m8_tumu(mask, maskedoff, vs2, vl); +vuint8m8_t test_vcpop_v_u8m8_tumu(vbool1_t mask, vuint8m8_t maskedoff, vuint8m8_t vs2, size_t vl) { + return __riscv_vcpop_v_u8m8_tumu(mask, maskedoff, vs2, vl); } -// CHECK-LABEL: @test_vcpopv_v_u16mf4_tumu( +// CHECK-LABEL: @test_vcpop_v_u16mf4_tumu( // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = call <vscale x 1 x i16> @llvm.riscv.vcpopv.mask.nxv1i16.i64(<vscale x 1 x i16> [[MASKEDOFF:%.*]], <vscale x 1 x i16> [[VS2:%.*]], <vscale x 1 x i1> [[MASK:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-NEXT: ret <vscale x 1 x i16> [[TMP0]] // -vuint16mf4_t test_vcpopv_v_u16mf4_tumu(vbool64_t mask, vuint16mf4_t maskedoff, vuint16mf4_t vs2, size_t vl) { - return __riscv_vcpopv_v_u16mf4_tumu(mask, maskedoff, vs2, vl); +vuint16mf4_t test_vcpop_v_u16mf4_tumu(vbool64_t mask, vuint16mf4_t maskedoff, vuint16mf4_t vs2, size_t vl) { + return __riscv_vcpop_v_u16mf4_tumu(mask, maskedoff, vs2, vl); } -// CHECK-LABEL: @test_vcpopv_v_u16mf2_tumu( +// CHECK-LABEL: @test_vcpop_v_u16mf2_tumu( // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = call <vscale x 2 x i16> @llvm.riscv.vcpopv.mask.nxv2i16.i64(<vscale x 2 x i16> [[MASKEDOFF:%.*]], <vscale x 2 x i16> [[VS2:%.*]], <vscale x 2 x i1> [[MASK:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-NEXT: ret <vscale x 2 x i16> [[TMP0]] // -vuint16mf2_t test_vcpopv_v_u16mf2_tumu(vbool32_t mask, vuint16mf2_t maskedoff, vuint16mf2_t vs2, 
size_t vl) { - return __riscv_vcpopv_v_u16mf2_tumu(mask, maskedoff, vs2, vl); +vuint16mf2_t test_vcpop_v_u16mf2_tumu(vbool32_t mask, vuint16mf2_t maskedoff, vuint16mf2_t vs2, size_t vl) { + return __riscv_vcpop_v_u16mf2_tumu(mask, maskedoff, vs2, vl); } -// CHECK-LABEL: @test_vcpopv_v_u16m1_tumu( +// CHECK-LABEL: @test_vcpop_v_u16m1_tumu( // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = call <vscale x 4 x i16> @llvm.riscv.vcpopv.mask.nxv4i16.i64(<vscale x 4 x i16> [[MASKEDOFF:%.*]], <vscale x 4 x i16> [[VS2:%.*]], <vscale x 4 x i1> [[MASK:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-NEXT: ret <vscale x 4 x i16> [[TMP0]] // -vuint16m1_t test_vcpopv_v_u16m1_tumu(vbool16_t mask, vuint16m1_t maskedoff, vuint16m1_t vs2, size_t vl) { - return __riscv_vcpopv_v_u16m1_tumu(mask, maskedoff, vs2, vl); +vuint16m1_t test_vcpop_v_u16m1_tumu(vbool16_t mask, vuint16m1_t maskedoff, vuint16m1_t vs2, size_t vl) { + return __riscv_vcpop_v_u16m1_tumu(mask, maskedoff, vs2, vl); } -// CHECK-LABEL: @test_vcpopv_v_u16m2_tumu( +// CHECK-LABEL: @test_vcpop_v_u16m2_tumu( // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = call <vscale x 8 x i16> @llvm.riscv.vcpopv.mask.nxv8i16.i64(<vscale x 8 x i16> [[MASKEDOFF:%.*]], <vscale x 8 x i16> [[VS2:%.*]], <vscale x 8 x i1> [[MASK:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-NEXT: ret <vscale x 8 x i16> [[TMP0]] // -vuint16m2_t test_vcpopv_v_u16m2_tumu(vbool8_t mask, vuint16m2_t maskedoff, vuint16m2_t vs2, size_t vl) { - return __riscv_vcpopv_v_u16m2_tumu(mask, maskedoff, vs2, vl); +vuint16m2_t test_vcpop_v_u16m2_tumu(vbool8_t mask, vuint16m2_t maskedoff, vuint16m2_t vs2, size_t vl) { + return __riscv_vcpop_v_u16m2_tumu(mask, maskedoff, vs2, vl); } -// CHECK-LABEL: @test_vcpopv_v_u16m4_tumu( +// CHECK-LABEL: @test_vcpop_v_u16m4_tumu( // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = call <vscale x 16 x i16> @llvm.riscv.vcpopv.mask.nxv16i16.i64(<vscale x 16 x i16> [[MASKEDOFF:%.*]], <vscale x 16 x i16> [[VS2:%.*]], <vscale x 16 x i1> [[MASK:%.*]], i64 
[[VL:%.*]], i64 0) // CHECK-NEXT: ret <vscale x 16 x i16> [[TMP0]] // -vuint16m4_t test_vcpopv_v_u16m4_tumu(vbool4_t mask, vuint16m4_t maskedoff, vuint16m4_t vs2, size_t vl) { - return __riscv_vcpopv_v_u16m4_tumu(mask, maskedoff, vs2, vl); +vuint16m4_t test_vcpop_v_u16m4_tumu(vbool4_t mask, vuint16m4_t maskedoff, vuint16m4_t vs2, size_t vl) { + return __riscv_vcpop_v_u16m4_tumu(mask, maskedoff, vs2, vl); } -// CHECK-LABEL: @test_vcpopv_v_u16m8_tumu( +// CHECK-LABEL: @test_vcpop_v_u16m8_tumu( // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = call <vscale x 32 x i16> @llvm.riscv.vcpopv.mask.nxv32i16.i64(<vscale x 32 x i16> [[MASKEDOFF:%.*]], <vscale x 32 x i16> [[VS2:%.*]], <vscale x 32 x i1> [[MASK:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-NEXT: ret <vscale x 32 x i16> [[TMP0]] // -vuint16m8_t test_vcpopv_v_u16m8_tumu(vbool2_t mask, vuint16m8_t maskedoff, vuint16m8_t vs2, size_t vl) { - return __riscv_vcpopv_v_u16m8_tumu(mask, maskedoff, vs2, vl); +vuint16m8_t test_vcpop_v_u16m8_tumu(vbool2_t mask, vuint16m8_t maskedoff, vuint16m8_t vs2, size_t vl) { + return __riscv_vcpop_v_u16m8_tumu(mask, maskedoff, vs2, vl); } -// CHECK-LABEL: @test_vcpopv_v_u32mf2_tumu( +// CHECK-LABEL: @test_vcpop_v_u32mf2_tumu( // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = call <vscale x 1 x i32> @llvm.riscv.vcpopv.mask.nxv1i32.i64(<vscale x 1 x i32> [[MASKEDOFF:%.*]], <vscale x 1 x i32> [[VS2:%.*]], <vscale x 1 x i1> [[MASK:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-NEXT: ret <vscale x 1 x i32> [[TMP0]] // -vuint32mf2_t test_vcpopv_v_u32mf2_tumu(vbool64_t mask, vuint32mf2_t maskedoff, vuint32mf2_t vs2, size_t vl) { - return __riscv_vcpopv_v_u32mf2_tumu(mask, maskedoff, vs2, vl); +vuint32mf2_t test_vcpop_v_u32mf2_tumu(vbool64_t mask, vuint32mf2_t maskedoff, vuint32mf2_t vs2, size_t vl) { + return __riscv_vcpop_v_u32mf2_tumu(mask, maskedoff, vs2, vl); } -// CHECK-LABEL: @test_vcpopv_v_u32m1_tumu( +// CHECK-LABEL: @test_vcpop_v_u32m1_tumu( // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = 
call <vscale x 2 x i32> @llvm.riscv.vcpopv.mask.nxv2i32.i64(<vscale x 2 x i32> [[MASKEDOFF:%.*]], <vscale x 2 x i32> [[VS2:%.*]], <vscale x 2 x i1> [[MASK:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-NEXT: ret <vscale x 2 x i32> [[TMP0]] // -vuint32m1_t test_vcpopv_v_u32m1_tumu(vbool32_t mask, vuint32m1_t maskedoff, vuint32m1_t vs2, size_t vl) { - return __riscv_vcpopv_v_u32m1_tumu(mask, maskedoff, vs2, vl); +vuint32m1_t test_vcpop_v_u32m1_tumu(vbool32_t mask, vuint32m1_t maskedoff, vuint32m1_t vs2, size_t vl) { + return __riscv_vcpop_v_u32m1_tumu(mask, maskedoff, vs2, vl); } -// CHECK-LABEL: @test_vcpopv_v_u32m2_tumu( +// CHECK-LABEL: @test_vcpop_v_u32m2_tumu( // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = call <vscale x 4 x i32> @llvm.riscv.vcpopv.mask.nxv4i32.i64(<vscale x 4 x i32> [[MASKEDOFF:%.*]], <vscale x 4 x i32> [[VS2:%.*]], <vscale x 4 x i1> [[MASK:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-NEXT: ret <vscale x 4 x i32> [[TMP0]] // -vuint32m2_t test_vcpopv_v_u32m2_tumu(vbool16_t mask, vuint32m2_t maskedoff, vuint32m2_t vs2, size_t vl) { - return __riscv_vcpopv_v_u32m2_tumu(mask, maskedoff, vs2, vl); +vuint32m2_t test_vcpop_v_u32m2_tumu(vbool16_t mask, vuint32m2_t maskedoff, vuint32m2_t vs2, size_t vl) { + return __riscv_vcpop_v_u32m2_tumu(mask, maskedoff, vs2, vl); } -// CHECK-LABEL: @test_vcpopv_v_u32m4_tumu( +// CHECK-LABEL: @test_vcpop_v_u32m4_tumu( // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = call <vscale x 8 x i32> @llvm.riscv.vcpopv.mask.nxv8i32.i64(<vscale x 8 x i32> [[MASKEDOFF:%.*]], <vscale x 8 x i32> [[VS2:%.*]], <vscale x 8 x i1> [[MASK:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-NEXT: ret <vscale x 8 x i32> [[TMP0]] // -vuint32m4_t test_vcpopv_v_u32m4_tumu(vbool8_t mask, vuint32m4_t maskedoff, vuint32m4_t vs2, size_t vl) { - return __riscv_vcpopv_v_u32m4_tumu(mask, maskedoff, vs2, vl); +vuint32m4_t test_vcpop_v_u32m4_tumu(vbool8_t mask, vuint32m4_t maskedoff, vuint32m4_t vs2, size_t vl) { + return __riscv_vcpop_v_u32m4_tumu(mask, maskedoff, vs2, 
vl); } -// CHECK-LABEL: @test_vcpopv_v_u32m8_tumu( +// CHECK-LABEL: @test_vcpop_v_u32m8_tumu( // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = call <vscale x 16 x i32> @llvm.riscv.vcpopv.mask.nxv16i32.i64(<vscale x 16 x i32> [[MASKEDOFF:%.*]], <vscale x 16 x i32> [[VS2:%.*]], <vscale x 16 x i1> [[MASK:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-NEXT: ret <vscale x 16 x i32> [[TMP0]] // -vuint32m8_t test_vcpopv_v_u32m8_tumu(vbool4_t mask, vuint32m8_t maskedoff, vuint32m8_t vs2, size_t vl) { - return __riscv_vcpopv_v_u32m8_tumu(mask, maskedoff, vs2, vl); +vuint32m8_t test_vcpop_v_u32m8_tumu(vbool4_t mask, vuint32m8_t maskedoff, vuint32m8_t vs2, size_t vl) { + return __riscv_vcpop_v_u32m8_tumu(mask, maskedoff, vs2, vl); } -// CHECK-LABEL: @test_vcpopv_v_u64m1_tumu( +// CHECK-LABEL: @test_vcpop_v_u64m1_tumu( // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = call <vscale x 1 x i64> @llvm.riscv.vcpopv.mask.nxv1i64.i64(<vscale x 1 x i64> [[MASKEDOFF:%.*]], <vscale x 1 x i64> [[VS2:%.*]], <vscale x 1 x i1> [[MASK:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-NEXT: ret <vscale x 1 x i64> [[TMP0]] // -vuint64m1_t test_vcpopv_v_u64m1_tumu(vbool64_t mask, vuint64m1_t maskedoff, vuint64m1_t vs2, size_t vl) { - return __riscv_vcpopv_v_u64m1_tumu(mask, maskedoff, vs2, vl); +vuint64m1_t test_vcpop_v_u64m1_tumu(vbool64_t mask, vuint64m1_t maskedoff, vuint64m1_t vs2, size_t vl) { + return __riscv_vcpop_v_u64m1_tumu(mask, maskedoff, vs2, vl); } -// CHECK-LABEL: @test_vcpopv_v_u64m2_tumu( +// CHECK-LABEL: @test_vcpop_v_u64m2_tumu( // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = call <vscale x 2 x i64> @llvm.riscv.vcpopv.mask.nxv2i64.i64(<vscale x 2 x i64> [[MASKEDOFF:%.*]], <vscale x 2 x i64> [[VS2:%.*]], <vscale x 2 x i1> [[MASK:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-NEXT: ret <vscale x 2 x i64> [[TMP0]] // -vuint64m2_t test_vcpopv_v_u64m2_tumu(vbool32_t mask, vuint64m2_t maskedoff, vuint64m2_t vs2, size_t vl) { - return __riscv_vcpopv_v_u64m2_tumu(mask, maskedoff, vs2, vl); 
+vuint64m2_t test_vcpop_v_u64m2_tumu(vbool32_t mask, vuint64m2_t maskedoff, vuint64m2_t vs2, size_t vl) { + return __riscv_vcpop_v_u64m2_tumu(mask, maskedoff, vs2, vl); } -// CHECK-LABEL: @test_vcpopv_v_u64m4_tumu( +// CHECK-LABEL: @test_vcpop_v_u64m4_tumu( // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = call <vscale x 4 x i64> @llvm.riscv.vcpopv.mask.nxv4i64.i64(<vscale x 4 x i64> [[MASKEDOFF:%.*]], <vscale x 4 x i64> [[VS2:%.*]], <vscale x 4 x i1> [[MASK:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-NEXT: ret <vscale x 4 x i64> [[TMP0]] // -vuint64m4_t test_vcpopv_v_u64m4_tumu(vbool16_t mask, vuint64m4_t maskedoff, vuint64m4_t vs2, size_t vl) { - return __riscv_vcpopv_v_u64m4_tumu(mask, maskedoff, vs2, vl); +vuint64m4_t test_vcpop_v_u64m4_tumu(vbool16_t mask, vuint64m4_t maskedoff, vuint64m4_t vs2, size_t vl) { + return __riscv_vcpop_v_u64m4_tumu(mask, maskedoff, vs2, vl); } -// CHECK-LABEL: @test_vcpopv_v_u64m8_tumu( +// CHECK-LABEL: @test_vcpop_v_u64m8_tumu( // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = call <vscale x 8 x i64> @llvm.riscv.vcpopv.mask.nxv8i64.i64(<vscale x 8 x i64> [[MASKEDOFF:%.*]], <vscale x 8 x i64> [[VS2:%.*]], <vscale x 8 x i1> [[MASK:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-NEXT: ret <vscale x 8 x i64> [[TMP0]] // -vuint64m8_t test_vcpopv_v_u64m8_tumu(vbool8_t mask, vuint64m8_t maskedoff, vuint64m8_t vs2, size_t vl) { - return __riscv_vcpopv_v_u64m8_tumu(mask, maskedoff, vs2, vl); +vuint64m8_t test_vcpop_v_u64m8_tumu(vbool8_t mask, vuint64m8_t maskedoff, vuint64m8_t vs2, size_t vl) { + return __riscv_vcpop_v_u64m8_tumu(mask, maskedoff, vs2, vl); } -// CHECK-LABEL: @test_vcpopv_v_u8mf8_mu( +// CHECK-LABEL: @test_vcpop_v_u8mf8_mu( // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = call <vscale x 1 x i8> @llvm.riscv.vcpopv.mask.nxv1i8.i64(<vscale x 1 x i8> [[MASKEDOFF:%.*]], <vscale x 1 x i8> [[VS2:%.*]], <vscale x 1 x i1> [[MASK:%.*]], i64 [[VL:%.*]], i64 1) // CHECK-NEXT: ret <vscale x 1 x i8> [[TMP0]] // -vuint8mf8_t 
test_vcpopv_v_u8mf8_mu(vbool64_t mask, vuint8mf8_t maskedoff, vuint8mf8_t vs2, size_t vl) { - return __riscv_vcpopv_v_u8mf8_mu(mask, maskedoff, vs2, vl); +vuint8mf8_t test_vcpop_v_u8mf8_mu(vbool64_t mask, vuint8mf8_t maskedoff, vuint8mf8_t vs2, size_t vl) { + return __riscv_vcpop_v_u8mf8_mu(mask, maskedoff, vs2, vl); } -// CHECK-LABEL: @test_vcpopv_v_u8mf4_mu( +// CHECK-LABEL: @test_vcpop_v_u8mf4_mu( // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = call <vscale x 2 x i8> @llvm.riscv.vcpopv.mask.nxv2i8.i64(<vscale x 2 x i8> [[MASKEDOFF:%.*]], <vscale x 2 x i8> [[VS2:%.*]], <vscale x 2 x i1> [[MASK:%.*]], i64 [[VL:%.*]], i64 1) // CHECK-NEXT: ret <vscale x 2 x i8> [[TMP0]] // -vuint8mf4_t test_vcpopv_v_u8mf4_mu(vbool32_t mask, vuint8mf4_t maskedoff, vuint8mf4_t vs2, size_t vl) { - return __riscv_vcpopv_v_u8mf4_mu(mask, maskedoff, vs2, vl); +vuint8mf4_t test_vcpop_v_u8mf4_mu(vbool32_t mask, vuint8mf4_t maskedoff, vuint8mf4_t vs2, size_t vl) { + return __riscv_vcpop_v_u8mf4_mu(mask, maskedoff, vs2, vl); } -// CHECK-LABEL: @test_vcpopv_v_u8mf2_mu( +// CHECK-LABEL: @test_vcpop_v_u8mf2_mu( // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = call <vscale x 4 x i8> @llvm.riscv.vcpopv.mask.nxv4i8.i64(<vscale x 4 x i8> [[MASKEDOFF:%.*]], <vscale x 4 x i8> [[VS2:%.*]], <vscale x 4 x i1> [[MASK:%.*]], i64 [[VL:%.*]], i64 1) // CHECK-NEXT: ret <vscale x 4 x i8> [[TMP0]] // -vuint8mf2_t test_vcpopv_v_u8mf2_mu(vbool16_t mask, vuint8mf2_t maskedoff, vuint8mf2_t vs2, size_t vl) { - return __riscv_vcpopv_v_u8mf2_mu(mask, maskedoff, vs2, vl); +vuint8mf2_t test_vcpop_v_u8mf2_mu(vbool16_t mask, vuint8mf2_t maskedoff, vuint8mf2_t vs2, size_t vl) { + return __riscv_vcpop_v_u8mf2_mu(mask, maskedoff, vs2, vl); } -// CHECK-LABEL: @test_vcpopv_v_u8m1_mu( +// CHECK-LABEL: @test_vcpop_v_u8m1_mu( // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = call <vscale x 8 x i8> @llvm.riscv.vcpopv.mask.nxv8i8.i64(<vscale x 8 x i8> [[MASKEDOFF:%.*]], <vscale x 8 x i8> [[VS2:%.*]], <vscale x 8 x i1> 
[[MASK:%.*]], i64 [[VL:%.*]], i64 1) // CHECK-NEXT: ret <vscale x 8 x i8> [[TMP0]] // -vuint8m1_t test_vcpopv_v_u8m1_mu(vbool8_t mask, vuint8m1_t maskedoff, vuint8m1_t vs2, size_t vl) { - return __riscv_vcpopv_v_u8m1_mu(mask, maskedoff, vs2, vl); +vuint8m1_t test_vcpop_v_u8m1_mu(vbool8_t mask, vuint8m1_t maskedoff, vuint8m1_t vs2, size_t vl) { + return __riscv_vcpop_v_u8m1_mu(mask, maskedoff, vs2, vl); } -// CHECK-LABEL: @test_vcpopv_v_u8m2_mu( +// CHECK-LABEL: @test_vcpop_v_u8m2_mu( // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = call <vscale x 16 x i8> @llvm.riscv.vcpopv.mask.nxv16i8.i64(<vscale x 16 x i8> [[MASKEDOFF:%.*]], <vscale x 16 x i8> [[VS2:%.*]], <vscale x 16 x i1> [[MASK:%.*]], i64 [[VL:%.*]], i64 1) // CHECK-NEXT: ret <vscale x 16 x i8> [[TMP0]] // -vuint8m2_t test_vcpopv_v_u8m2_mu(vbool4_t mask, vuint8m2_t maskedoff, vuint8m2_t vs2, size_t vl) { - return __riscv_vcpopv_v_u8m2_mu(mask, maskedoff, vs2, vl); +vuint8m2_t test_vcpop_v_u8m2_mu(vbool4_t mask, vuint8m2_t maskedoff, vuint8m2_t vs2, size_t vl) { + return __riscv_vcpop_v_u8m2_mu(mask, maskedoff, vs2, vl); } -// CHECK-LABEL: @test_vcpopv_v_u8m4_mu( +// CHECK-LABEL: @test_vcpop_v_u8m4_mu( // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = call <vscale x 32 x i8> @llvm.riscv.vcpopv.mask.nxv32i8.i64(<vscale x 32 x i8> [[MASKEDOFF:%.*]], <vscale x 32 x i8> [[VS2:%.*]], <vscale x 32 x i1> [[MASK:%.*]], i64 [[VL:%.*]], i64 1) // CHECK-NEXT: ret <vscale x 32 x i8> [[TMP0]] // -vuint8m4_t test_vcpopv_v_u8m4_mu(vbool2_t mask, vuint8m4_t maskedoff, vuint8m4_t vs2, size_t vl) { - return __riscv_vcpopv_v_u8m4_mu(mask, maskedoff, vs2, vl); +vuint8m4_t test_vcpop_v_u8m4_mu(vbool2_t mask, vuint8m4_t maskedoff, vuint8m4_t vs2, size_t vl) { + return __riscv_vcpop_v_u8m4_mu(mask, maskedoff, vs2, vl); } -// CHECK-LABEL: @test_vcpopv_v_u8m8_mu( +// CHECK-LABEL: @test_vcpop_v_u8m8_mu( // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = call <vscale x 64 x i8> @llvm.riscv.vcpopv.mask.nxv64i8.i64(<vscale x 64 x 
i8> [[MASKEDOFF:%.*]], <vscale x 64 x i8> [[VS2:%.*]], <vscale x 64 x i1> [[MASK:%.*]], i64 [[VL:%.*]], i64 1) // CHECK-NEXT: ret <vscale x 64 x i8> [[TMP0]] // -vuint8m8_t test_vcpopv_v_u8m8_mu(vbool1_t mask, vuint8m8_t maskedoff, vuint8m8_t vs2, size_t vl) { - return __riscv_vcpopv_v_u8m8_mu(mask, maskedoff, vs2, vl); +vuint8m8_t test_vcpop_v_u8m8_mu(vbool1_t mask, vuint8m8_t maskedoff, vuint8m8_t vs2, size_t vl) { + return __riscv_vcpop_v_u8m8_mu(mask, maskedoff, vs2, vl); } -// CHECK-LABEL: @test_vcpopv_v_u16mf4_mu( +// CHECK-LABEL: @test_vcpop_v_u16mf4_mu( // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = call <vscale x 1 x i16> @llvm.riscv.vcpopv.mask.nxv1i16.i64(<vscale x 1 x i16> [[MASKEDOFF:%.*]], <vscale x 1 x i16> [[VS2:%.*]], <vscale x 1 x i1> [[MASK:%.*]], i64 [[VL:%.*]], i64 1) // CHECK-NEXT: ret <vscale x 1 x i16> [[TMP0]] // -vuint16mf4_t test_vcpopv_v_u16mf4_mu(vbool64_t mask, vuint16mf4_t maskedoff, vuint16mf4_t vs2, size_t vl) { - return __riscv_vcpopv_v_u16mf4_mu(mask, maskedoff, vs2, vl); +vuint16mf4_t test_vcpop_v_u16mf4_mu(vbool64_t mask, vuint16mf4_t maskedoff, vuint16mf4_t vs2, size_t vl) { + return __riscv_vcpop_v_u16mf4_mu(mask, maskedoff, vs2, vl); } -// CHECK-LABEL: @test_vcpopv_v_u16mf2_mu( +// CHECK-LABEL: @test_vcpop_v_u16mf2_mu( // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = call <vscale x 2 x i16> @llvm.riscv.vcpopv.mask.nxv2i16.i64(<vscale x 2 x i16> [[MASKEDOFF:%.*]], <vscale x 2 x i16> [[VS2:%.*]], <vscale x 2 x i1> [[MASK:%.*]], i64 [[VL:%.*]], i64 1) // CHECK-NEXT: ret <vscale x 2 x i16> [[TMP0]] // -vuint16mf2_t test_vcpopv_v_u16mf2_mu(vbool32_t mask, vuint16mf2_t maskedoff, vuint16mf2_t vs2, size_t vl) { - return __riscv_vcpopv_v_u16mf2_mu(mask, maskedoff, vs2, vl); +vuint16mf2_t test_vcpop_v_u16mf2_mu(vbool32_t mask, vuint16mf2_t maskedoff, vuint16mf2_t vs2, size_t vl) { + return __riscv_vcpop_v_u16mf2_mu(mask, maskedoff, vs2, vl); } -// CHECK-LABEL: @test_vcpopv_v_u16m1_mu( +// CHECK-LABEL: @test_vcpop_v_u16m1_mu( 
// CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = call <vscale x 4 x i16> @llvm.riscv.vcpopv.mask.nxv4i16.i64(<vscale x 4 x i16> [[MASKEDOFF:%.*]], <vscale x 4 x i16> [[VS2:%.*]], <vscale x 4 x i1> [[MASK:%.*]], i64 [[VL:%.*]], i64 1) // CHECK-NEXT: ret <vscale x 4 x i16> [[TMP0]] // -vuint16m1_t test_vcpopv_v_u16m1_mu(vbool16_t mask, vuint16m1_t maskedoff, vuint16m1_t vs2, size_t vl) { - return __riscv_vcpopv_v_u16m1_mu(mask, maskedoff, vs2, vl); +vuint16m1_t test_vcpop_v_u16m1_mu(vbool16_t mask, vuint16m1_t maskedoff, vuint16m1_t vs2, size_t vl) { + return __riscv_vcpop_v_u16m1_mu(mask, maskedoff, vs2, vl); } -// CHECK-LABEL: @test_vcpopv_v_u16m2_mu( +// CHECK-LABEL: @test_vcpop_v_u16m2_mu( // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = call <vscale x 8 x i16> @llvm.riscv.vcpopv.mask.nxv8i16.i64(<vscale x 8 x i16> [[MASKEDOFF:%.*]], <vscale x 8 x i16> [[VS2:%.*]], <vscale x 8 x i1> [[MASK:%.*]], i64 [[VL:%.*]], i64 1) // CHECK-NEXT: ret <vscale x 8 x i16> [[TMP0]] // -vuint16m2_t test_vcpopv_v_u16m2_mu(vbool8_t mask, vuint16m2_t maskedoff, vuint16m2_t vs2, size_t vl) { - return __riscv_vcpopv_v_u16m2_mu(mask, maskedoff, vs2, vl); +vuint16m2_t test_vcpop_v_u16m2_mu(vbool8_t mask, vuint16m2_t maskedoff, vuint16m2_t vs2, size_t vl) { + return __riscv_vcpop_v_u16m2_mu(mask, maskedoff, vs2, vl); } -// CHECK-LABEL: @test_vcpopv_v_u16m4_mu( +// CHECK-LABEL: @test_vcpop_v_u16m4_mu( // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = call <vscale x 16 x i16> @llvm.riscv.vcpopv.mask.nxv16i16.i64(<vscale x 16 x i16> [[MASKEDOFF:%.*]], <vscale x 16 x i16> [[VS2:%.*]], <vscale x 16 x i1> [[MASK:%.*]], i64 [[VL:%.*]], i64 1) // CHECK-NEXT: ret <vscale x 16 x i16> [[TMP0]] // -vuint16m4_t test_vcpopv_v_u16m4_mu(vbool4_t mask, vuint16m4_t maskedoff, vuint16m4_t vs2, size_t vl) { - return __riscv_vcpopv_v_u16m4_mu(mask, maskedoff, vs2, vl); +vuint16m4_t test_vcpop_v_u16m4_mu(vbool4_t mask, vuint16m4_t maskedoff, vuint16m4_t vs2, size_t vl) { + return 
__riscv_vcpop_v_u16m4_mu(mask, maskedoff, vs2, vl); } -// CHECK-LABEL: @test_vcpopv_v_u16m8_mu( +// CHECK-LABEL: @test_vcpop_v_u16m8_mu( // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = call <vscale x 32 x i16> @llvm.riscv.vcpopv.mask.nxv32i16.i64(<vscale x 32 x i16> [[MASKEDOFF:%.*]], <vscale x 32 x i16> [[VS2:%.*]], <vscale x 32 x i1> [[MASK:%.*]], i64 [[VL:%.*]], i64 1) // CHECK-NEXT: ret <vscale x 32 x i16> [[TMP0]] // -vuint16m8_t test_vcpopv_v_u16m8_mu(vbool2_t mask, vuint16m8_t maskedoff, vuint16m8_t vs2, size_t vl) { - return __riscv_vcpopv_v_u16m8_mu(mask, maskedoff, vs2, vl); +vuint16m8_t test_vcpop_v_u16m8_mu(vbool2_t mask, vuint16m8_t maskedoff, vuint16m8_t vs2, size_t vl) { + return __riscv_vcpop_v_u16m8_mu(mask, maskedoff, vs2, vl); } -// CHECK-LABEL: @test_vcpopv_v_u32mf2_mu( +// CHECK-LABEL: @test_vcpop_v_u32mf2_mu( // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = call <vscale x 1 x i32> @llvm.riscv.vcpopv.mask.nxv1i32.i64(<vscale x 1 x i32> [[MASKEDOFF:%.*]], <vscale x 1 x i32> [[VS2:%.*]], <vscale x 1 x i1> [[MASK:%.*]], i64 [[VL:%.*]], i64 1) // CHECK-NEXT: ret <vscale x 1 x i32> [[TMP0]] // -vuint32mf2_t test_vcpopv_v_u32mf2_mu(vbool64_t mask, vuint32mf2_t maskedoff, vuint32mf2_t vs2, size_t vl) { - return __riscv_vcpopv_v_u32mf2_mu(mask, maskedoff, vs2, vl); +vuint32mf2_t test_vcpop_v_u32mf2_mu(vbool64_t mask, vuint32mf2_t maskedoff, vuint32mf2_t vs2, size_t vl) { + return __riscv_vcpop_v_u32mf2_mu(mask, maskedoff, vs2, vl); } -// CHECK-LABEL: @test_vcpopv_v_u32m1_mu( +// CHECK-LABEL: @test_vcpop_v_u32m1_mu( // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = call <vscale x 2 x i32> @llvm.riscv.vcpopv.mask.nxv2i32.i64(<vscale x 2 x i32> [[MASKEDOFF:%.*]], <vscale x 2 x i32> [[VS2:%.*]], <vscale x 2 x i1> [[MASK:%.*]], i64 [[VL:%.*]], i64 1) // CHECK-NEXT: ret <vscale x 2 x i32> [[TMP0]] // -vuint32m1_t test_vcpopv_v_u32m1_mu(vbool32_t mask, vuint32m1_t maskedoff, vuint32m1_t vs2, size_t vl) { - return __riscv_vcpopv_v_u32m1_mu(mask, 
maskedoff, vs2, vl); +vuint32m1_t test_vcpop_v_u32m1_mu(vbool32_t mask, vuint32m1_t maskedoff, vuint32m1_t vs2, size_t vl) { + return __riscv_vcpop_v_u32m1_mu(mask, maskedoff, vs2, vl); } -// CHECK-LABEL: @test_vcpopv_v_u32m2_mu( +// CHECK-LABEL: @test_vcpop_v_u32m2_mu( // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = call <vscale x 4 x i32> @llvm.riscv.vcpopv.mask.nxv4i32.i64(<vscale x 4 x i32> [[MASKEDOFF:%.*]], <vscale x 4 x i32> [[VS2:%.*]], <vscale x 4 x i1> [[MASK:%.*]], i64 [[VL:%.*]], i64 1) // CHECK-NEXT: ret <vscale x 4 x i32> [[TMP0]] // -vuint32m2_t test_vcpopv_v_u32m2_mu(vbool16_t mask, vuint32m2_t maskedoff, vuint32m2_t vs2, size_t vl) { - return __riscv_vcpopv_v_u32m2_mu(mask, maskedoff, vs2, vl); +vuint32m2_t test_vcpop_v_u32m2_mu(vbool16_t mask, vuint32m2_t maskedoff, vuint32m2_t vs2, size_t vl) { + return __riscv_vcpop_v_u32m2_mu(mask, maskedoff, vs2, vl); } -// CHECK-LABEL: @test_vcpopv_v_u32m4_mu( +// CHECK-LABEL: @test_vcpop_v_u32m4_mu( // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = call <vscale x 8 x i32> @llvm.riscv.vcpopv.mask.nxv8i32.i64(<vscale x 8 x i32> [[MASKEDOFF:%.*]], <vscale x 8 x i32> [[VS2:%.*]], <vscale x 8 x i1> [[MASK:%.*]], i64 [[VL:%.*]], i64 1) // CHECK-NEXT: ret <vscale x 8 x i32> [[TMP0]] // -vuint32m4_t test_vcpopv_v_u32m4_mu(vbool8_t mask, vuint32m4_t maskedoff, vuint32m4_t vs2, size_t vl) { - return __riscv_vcpopv_v_u32m4_mu(mask, maskedoff, vs2, vl); +vuint32m4_t test_vcpop_v_u32m4_mu(vbool8_t mask, vuint32m4_t maskedoff, vuint32m4_t vs2, size_t vl) { + return __riscv_vcpop_v_u32m4_mu(mask, maskedoff, vs2, vl); } -// CHECK-LABEL: @test_vcpopv_v_u32m8_mu( +// CHECK-LABEL: @test_vcpop_v_u32m8_mu( // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = call <vscale x 16 x i32> @llvm.riscv.vcpopv.mask.nxv16i32.i64(<vscale x 16 x i32> [[MASKEDOFF:%.*]], <vscale x 16 x i32> [[VS2:%.*]], <vscale x 16 x i1> [[MASK:%.*]], i64 [[VL:%.*]], i64 1) // CHECK-NEXT: ret <vscale x 16 x i32> [[TMP0]] // -vuint32m8_t 
test_vcpopv_v_u32m8_mu(vbool4_t mask, vuint32m8_t maskedoff, vuint32m8_t vs2, size_t vl) { - return __riscv_vcpopv_v_u32m8_mu(mask, maskedoff, vs2, vl); +vuint32m8_t test_vcpop_v_u32m8_mu(vbool4_t mask, vuint32m8_t maskedoff, vuint32m8_t vs2, size_t vl) { + return __riscv_vcpop_v_u32m8_mu(mask, maskedoff, vs2, vl); } -// CHECK-LABEL: @test_vcpopv_v_u64m1_mu( +// CHECK-LABEL: @test_vcpop_v_u64m1_mu( // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = call <vscale x 1 x i64> @llvm.riscv.vcpopv.mask.nxv1i64.i64(<vscale x 1 x i64> [[MASKEDOFF:%.*]], <vscale x 1 x i64> [[VS2:%.*]], <vscale x 1 x i1> [[MASK:%.*]], i64 [[VL:%.*]], i64 1) // CHECK-NEXT: ret <vscale x 1 x i64> [[TMP0]] // -vuint64m1_t test_vcpopv_v_u64m1_mu(vbool64_t mask, vuint64m1_t maskedoff, vuint64m1_t vs2, size_t vl) { - return __riscv_vcpopv_v_u64m1_mu(mask, maskedoff, vs2, vl); +vuint64m1_t test_vcpop_v_u64m1_mu(vbool64_t mask, vuint64m1_t maskedoff, vuint64m1_t vs2, size_t vl) { + return __riscv_vcpop_v_u64m1_mu(mask, maskedoff, vs2, vl); } -// CHECK-LABEL: @test_vcpopv_v_u64m2_mu( +// CHECK-LABEL: @test_vcpop_v_u64m2_mu( // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = call <vscale x 2 x i64> @llvm.riscv.vcpopv.mask.nxv2i64.i64(<vscale x 2 x i64> [[MASKEDOFF:%.*]], <vscale x 2 x i64> [[VS2:%.*]], <vscale x 2 x i1> [[MASK:%.*]], i64 [[VL:%.*]], i64 1) // CHECK-NEXT: ret <vscale x 2 x i64> [[TMP0]] // -vuint64m2_t test_vcpopv_v_u64m2_mu(vbool32_t mask, vuint64m2_t maskedoff, vuint64m2_t vs2, size_t vl) { - return __riscv_vcpopv_v_u64m2_mu(mask, maskedoff, vs2, vl); +vuint64m2_t test_vcpop_v_u64m2_mu(vbool32_t mask, vuint64m2_t maskedoff, vuint64m2_t vs2, size_t vl) { + return __riscv_vcpop_v_u64m2_mu(mask, maskedoff, vs2, vl); } -// CHECK-LABEL: @test_vcpopv_v_u64m4_mu( +// CHECK-LABEL: @test_vcpop_v_u64m4_mu( // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = call <vscale x 4 x i64> @llvm.riscv.vcpopv.mask.nxv4i64.i64(<vscale x 4 x i64> [[MASKEDOFF:%.*]], <vscale x 4 x i64> [[VS2:%.*]], <vscale 
x 4 x i1> [[MASK:%.*]], i64 [[VL:%.*]], i64 1) // CHECK-NEXT: ret <vscale x 4 x i64> [[TMP0]] // -vuint64m4_t test_vcpopv_v_u64m4_mu(vbool16_t mask, vuint64m4_t maskedoff, vuint64m4_t vs2, size_t vl) { - return __riscv_vcpopv_v_u64m4_mu(mask, maskedoff, vs2, vl); +vuint64m4_t test_vcpop_v_u64m4_mu(vbool16_t mask, vuint64m4_t maskedoff, vuint64m4_t vs2, size_t vl) { + return __riscv_vcpop_v_u64m4_mu(mask, maskedoff, vs2, vl); } -// CHECK-LABEL: @test_vcpopv_v_u64m8_mu( +// CHECK-LABEL: @test_vcpop_v_u64m8_mu( // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = call <vscale x 8 x i64> @llvm.riscv.vcpopv.mask.nxv8i64.i64(<vscale x 8 x i64> [[MASKEDOFF:%.*]], <vscale x 8 x i64> [[VS2:%.*]], <vscale x 8 x i1> [[MASK:%.*]], i64 [[VL:%.*]], i64 1) // CHECK-NEXT: ret <vscale x 8 x i64> [[TMP0]] // -vuint64m8_t test_vcpopv_v_u64m8_mu(vbool8_t mask, vuint64m8_t maskedoff, vuint64m8_t vs2, size_t vl) { - return __riscv_vcpopv_v_u64m8_mu(mask, maskedoff, vs2, vl); +vuint64m8_t test_vcpop_v_u64m8_mu(vbool8_t mask, vuint64m8_t maskedoff, vuint64m8_t vs2, size_t vl) { + return __riscv_vcpop_v_u64m8_mu(mask, maskedoff, vs2, vl); } diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/overloaded/vcpopv.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/overloaded/vcpopv.c index 02a499d..953ccac 100644 --- a/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/overloaded/vcpopv.c +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-autogenerated/policy/overloaded/vcpopv.c @@ -16,795 +16,795 @@ #include <riscv_vector.h> -// CHECK-LABEL: @test_vcpopv_v_u8mf8_tu( +// CHECK-LABEL: @test_vcpop_v_u8mf8_tu( // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = call <vscale x 1 x i8> @llvm.riscv.vcpopv.nxv1i8.i64(<vscale x 1 x i8> [[MASKEDOFF:%.*]], <vscale x 1 x i8> [[VS2:%.*]], i64 [[VL:%.*]]) // CHECK-NEXT: ret <vscale x 1 x i8> [[TMP0]] // -vuint8mf8_t test_vcpopv_v_u8mf8_tu(vuint8mf8_t maskedoff, vuint8mf8_t vs2, size_t vl) { - return 
__riscv_vcpopv_tu(maskedoff, vs2, vl); +vuint8mf8_t test_vcpop_v_u8mf8_tu(vuint8mf8_t maskedoff, vuint8mf8_t vs2, size_t vl) { + return __riscv_vcpop_tu(maskedoff, vs2, vl); } -// CHECK-LABEL: @test_vcpopv_v_u8mf4_tu( +// CHECK-LABEL: @test_vcpop_v_u8mf4_tu( // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = call <vscale x 2 x i8> @llvm.riscv.vcpopv.nxv2i8.i64(<vscale x 2 x i8> [[MASKEDOFF:%.*]], <vscale x 2 x i8> [[VS2:%.*]], i64 [[VL:%.*]]) // CHECK-NEXT: ret <vscale x 2 x i8> [[TMP0]] // -vuint8mf4_t test_vcpopv_v_u8mf4_tu(vuint8mf4_t maskedoff, vuint8mf4_t vs2, size_t vl) { - return __riscv_vcpopv_tu(maskedoff, vs2, vl); +vuint8mf4_t test_vcpop_v_u8mf4_tu(vuint8mf4_t maskedoff, vuint8mf4_t vs2, size_t vl) { + return __riscv_vcpop_tu(maskedoff, vs2, vl); } -// CHECK-LABEL: @test_vcpopv_v_u8mf2_tu( +// CHECK-LABEL: @test_vcpop_v_u8mf2_tu( // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = call <vscale x 4 x i8> @llvm.riscv.vcpopv.nxv4i8.i64(<vscale x 4 x i8> [[MASKEDOFF:%.*]], <vscale x 4 x i8> [[VS2:%.*]], i64 [[VL:%.*]]) // CHECK-NEXT: ret <vscale x 4 x i8> [[TMP0]] // -vuint8mf2_t test_vcpopv_v_u8mf2_tu(vuint8mf2_t maskedoff, vuint8mf2_t vs2, size_t vl) { - return __riscv_vcpopv_tu(maskedoff, vs2, vl); +vuint8mf2_t test_vcpop_v_u8mf2_tu(vuint8mf2_t maskedoff, vuint8mf2_t vs2, size_t vl) { + return __riscv_vcpop_tu(maskedoff, vs2, vl); } -// CHECK-LABEL: @test_vcpopv_v_u8m1_tu( +// CHECK-LABEL: @test_vcpop_v_u8m1_tu( // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = call <vscale x 8 x i8> @llvm.riscv.vcpopv.nxv8i8.i64(<vscale x 8 x i8> [[MASKEDOFF:%.*]], <vscale x 8 x i8> [[VS2:%.*]], i64 [[VL:%.*]]) // CHECK-NEXT: ret <vscale x 8 x i8> [[TMP0]] // -vuint8m1_t test_vcpopv_v_u8m1_tu(vuint8m1_t maskedoff, vuint8m1_t vs2, size_t vl) { - return __riscv_vcpopv_tu(maskedoff, vs2, vl); +vuint8m1_t test_vcpop_v_u8m1_tu(vuint8m1_t maskedoff, vuint8m1_t vs2, size_t vl) { + return __riscv_vcpop_tu(maskedoff, vs2, vl); } -// CHECK-LABEL: @test_vcpopv_v_u8m2_tu( +// 
CHECK-LABEL: @test_vcpop_v_u8m2_tu( // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = call <vscale x 16 x i8> @llvm.riscv.vcpopv.nxv16i8.i64(<vscale x 16 x i8> [[MASKEDOFF:%.*]], <vscale x 16 x i8> [[VS2:%.*]], i64 [[VL:%.*]]) // CHECK-NEXT: ret <vscale x 16 x i8> [[TMP0]] // -vuint8m2_t test_vcpopv_v_u8m2_tu(vuint8m2_t maskedoff, vuint8m2_t vs2, size_t vl) { - return __riscv_vcpopv_tu(maskedoff, vs2, vl); +vuint8m2_t test_vcpop_v_u8m2_tu(vuint8m2_t maskedoff, vuint8m2_t vs2, size_t vl) { + return __riscv_vcpop_tu(maskedoff, vs2, vl); } -// CHECK-LABEL: @test_vcpopv_v_u8m4_tu( +// CHECK-LABEL: @test_vcpop_v_u8m4_tu( // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = call <vscale x 32 x i8> @llvm.riscv.vcpopv.nxv32i8.i64(<vscale x 32 x i8> [[MASKEDOFF:%.*]], <vscale x 32 x i8> [[VS2:%.*]], i64 [[VL:%.*]]) // CHECK-NEXT: ret <vscale x 32 x i8> [[TMP0]] // -vuint8m4_t test_vcpopv_v_u8m4_tu(vuint8m4_t maskedoff, vuint8m4_t vs2, size_t vl) { - return __riscv_vcpopv_tu(maskedoff, vs2, vl); +vuint8m4_t test_vcpop_v_u8m4_tu(vuint8m4_t maskedoff, vuint8m4_t vs2, size_t vl) { + return __riscv_vcpop_tu(maskedoff, vs2, vl); } -// CHECK-LABEL: @test_vcpopv_v_u8m8_tu( +// CHECK-LABEL: @test_vcpop_v_u8m8_tu( // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = call <vscale x 64 x i8> @llvm.riscv.vcpopv.nxv64i8.i64(<vscale x 64 x i8> [[MASKEDOFF:%.*]], <vscale x 64 x i8> [[VS2:%.*]], i64 [[VL:%.*]]) // CHECK-NEXT: ret <vscale x 64 x i8> [[TMP0]] // -vuint8m8_t test_vcpopv_v_u8m8_tu(vuint8m8_t maskedoff, vuint8m8_t vs2, size_t vl) { - return __riscv_vcpopv_tu(maskedoff, vs2, vl); +vuint8m8_t test_vcpop_v_u8m8_tu(vuint8m8_t maskedoff, vuint8m8_t vs2, size_t vl) { + return __riscv_vcpop_tu(maskedoff, vs2, vl); } -// CHECK-LABEL: @test_vcpopv_v_u16mf4_tu( +// CHECK-LABEL: @test_vcpop_v_u16mf4_tu( // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = call <vscale x 1 x i16> @llvm.riscv.vcpopv.nxv1i16.i64(<vscale x 1 x i16> [[MASKEDOFF:%.*]], <vscale x 1 x i16> [[VS2:%.*]], i64 
[[VL:%.*]]) // CHECK-NEXT: ret <vscale x 1 x i16> [[TMP0]] // -vuint16mf4_t test_vcpopv_v_u16mf4_tu(vuint16mf4_t maskedoff, vuint16mf4_t vs2, size_t vl) { - return __riscv_vcpopv_tu(maskedoff, vs2, vl); +vuint16mf4_t test_vcpop_v_u16mf4_tu(vuint16mf4_t maskedoff, vuint16mf4_t vs2, size_t vl) { + return __riscv_vcpop_tu(maskedoff, vs2, vl); } -// CHECK-LABEL: @test_vcpopv_v_u16mf2_tu( +// CHECK-LABEL: @test_vcpop_v_u16mf2_tu( // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = call <vscale x 2 x i16> @llvm.riscv.vcpopv.nxv2i16.i64(<vscale x 2 x i16> [[MASKEDOFF:%.*]], <vscale x 2 x i16> [[VS2:%.*]], i64 [[VL:%.*]]) // CHECK-NEXT: ret <vscale x 2 x i16> [[TMP0]] // -vuint16mf2_t test_vcpopv_v_u16mf2_tu(vuint16mf2_t maskedoff, vuint16mf2_t vs2, size_t vl) { - return __riscv_vcpopv_tu(maskedoff, vs2, vl); +vuint16mf2_t test_vcpop_v_u16mf2_tu(vuint16mf2_t maskedoff, vuint16mf2_t vs2, size_t vl) { + return __riscv_vcpop_tu(maskedoff, vs2, vl); } -// CHECK-LABEL: @test_vcpopv_v_u16m1_tu( +// CHECK-LABEL: @test_vcpop_v_u16m1_tu( // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = call <vscale x 4 x i16> @llvm.riscv.vcpopv.nxv4i16.i64(<vscale x 4 x i16> [[MASKEDOFF:%.*]], <vscale x 4 x i16> [[VS2:%.*]], i64 [[VL:%.*]]) // CHECK-NEXT: ret <vscale x 4 x i16> [[TMP0]] // -vuint16m1_t test_vcpopv_v_u16m1_tu(vuint16m1_t maskedoff, vuint16m1_t vs2, size_t vl) { - return __riscv_vcpopv_tu(maskedoff, vs2, vl); +vuint16m1_t test_vcpop_v_u16m1_tu(vuint16m1_t maskedoff, vuint16m1_t vs2, size_t vl) { + return __riscv_vcpop_tu(maskedoff, vs2, vl); } -// CHECK-LABEL: @test_vcpopv_v_u16m2_tu( +// CHECK-LABEL: @test_vcpop_v_u16m2_tu( // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = call <vscale x 8 x i16> @llvm.riscv.vcpopv.nxv8i16.i64(<vscale x 8 x i16> [[MASKEDOFF:%.*]], <vscale x 8 x i16> [[VS2:%.*]], i64 [[VL:%.*]]) // CHECK-NEXT: ret <vscale x 8 x i16> [[TMP0]] // -vuint16m2_t test_vcpopv_v_u16m2_tu(vuint16m2_t maskedoff, vuint16m2_t vs2, size_t vl) { - return 
__riscv_vcpopv_tu(maskedoff, vs2, vl); +vuint16m2_t test_vcpop_v_u16m2_tu(vuint16m2_t maskedoff, vuint16m2_t vs2, size_t vl) { + return __riscv_vcpop_tu(maskedoff, vs2, vl); } -// CHECK-LABEL: @test_vcpopv_v_u16m4_tu( +// CHECK-LABEL: @test_vcpop_v_u16m4_tu( // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = call <vscale x 16 x i16> @llvm.riscv.vcpopv.nxv16i16.i64(<vscale x 16 x i16> [[MASKEDOFF:%.*]], <vscale x 16 x i16> [[VS2:%.*]], i64 [[VL:%.*]]) // CHECK-NEXT: ret <vscale x 16 x i16> [[TMP0]] // -vuint16m4_t test_vcpopv_v_u16m4_tu(vuint16m4_t maskedoff, vuint16m4_t vs2, size_t vl) { - return __riscv_vcpopv_tu(maskedoff, vs2, vl); +vuint16m4_t test_vcpop_v_u16m4_tu(vuint16m4_t maskedoff, vuint16m4_t vs2, size_t vl) { + return __riscv_vcpop_tu(maskedoff, vs2, vl); } -// CHECK-LABEL: @test_vcpopv_v_u16m8_tu( +// CHECK-LABEL: @test_vcpop_v_u16m8_tu( // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = call <vscale x 32 x i16> @llvm.riscv.vcpopv.nxv32i16.i64(<vscale x 32 x i16> [[MASKEDOFF:%.*]], <vscale x 32 x i16> [[VS2:%.*]], i64 [[VL:%.*]]) // CHECK-NEXT: ret <vscale x 32 x i16> [[TMP0]] // -vuint16m8_t test_vcpopv_v_u16m8_tu(vuint16m8_t maskedoff, vuint16m8_t vs2, size_t vl) { - return __riscv_vcpopv_tu(maskedoff, vs2, vl); +vuint16m8_t test_vcpop_v_u16m8_tu(vuint16m8_t maskedoff, vuint16m8_t vs2, size_t vl) { + return __riscv_vcpop_tu(maskedoff, vs2, vl); } -// CHECK-LABEL: @test_vcpopv_v_u32mf2_tu( +// CHECK-LABEL: @test_vcpop_v_u32mf2_tu( // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = call <vscale x 1 x i32> @llvm.riscv.vcpopv.nxv1i32.i64(<vscale x 1 x i32> [[MASKEDOFF:%.*]], <vscale x 1 x i32> [[VS2:%.*]], i64 [[VL:%.*]]) // CHECK-NEXT: ret <vscale x 1 x i32> [[TMP0]] // -vuint32mf2_t test_vcpopv_v_u32mf2_tu(vuint32mf2_t maskedoff, vuint32mf2_t vs2, size_t vl) { - return __riscv_vcpopv_tu(maskedoff, vs2, vl); +vuint32mf2_t test_vcpop_v_u32mf2_tu(vuint32mf2_t maskedoff, vuint32mf2_t vs2, size_t vl) { + return __riscv_vcpop_tu(maskedoff, vs2, vl); } -// 
CHECK-LABEL: @test_vcpopv_v_u32m1_tu( +// CHECK-LABEL: @test_vcpop_v_u32m1_tu( // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = call <vscale x 2 x i32> @llvm.riscv.vcpopv.nxv2i32.i64(<vscale x 2 x i32> [[MASKEDOFF:%.*]], <vscale x 2 x i32> [[VS2:%.*]], i64 [[VL:%.*]]) // CHECK-NEXT: ret <vscale x 2 x i32> [[TMP0]] // -vuint32m1_t test_vcpopv_v_u32m1_tu(vuint32m1_t maskedoff, vuint32m1_t vs2, size_t vl) { - return __riscv_vcpopv_tu(maskedoff, vs2, vl); +vuint32m1_t test_vcpop_v_u32m1_tu(vuint32m1_t maskedoff, vuint32m1_t vs2, size_t vl) { + return __riscv_vcpop_tu(maskedoff, vs2, vl); } -// CHECK-LABEL: @test_vcpopv_v_u32m2_tu( +// CHECK-LABEL: @test_vcpop_v_u32m2_tu( // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = call <vscale x 4 x i32> @llvm.riscv.vcpopv.nxv4i32.i64(<vscale x 4 x i32> [[MASKEDOFF:%.*]], <vscale x 4 x i32> [[VS2:%.*]], i64 [[VL:%.*]]) // CHECK-NEXT: ret <vscale x 4 x i32> [[TMP0]] // -vuint32m2_t test_vcpopv_v_u32m2_tu(vuint32m2_t maskedoff, vuint32m2_t vs2, size_t vl) { - return __riscv_vcpopv_tu(maskedoff, vs2, vl); +vuint32m2_t test_vcpop_v_u32m2_tu(vuint32m2_t maskedoff, vuint32m2_t vs2, size_t vl) { + return __riscv_vcpop_tu(maskedoff, vs2, vl); } -// CHECK-LABEL: @test_vcpopv_v_u32m4_tu( +// CHECK-LABEL: @test_vcpop_v_u32m4_tu( // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = call <vscale x 8 x i32> @llvm.riscv.vcpopv.nxv8i32.i64(<vscale x 8 x i32> [[MASKEDOFF:%.*]], <vscale x 8 x i32> [[VS2:%.*]], i64 [[VL:%.*]]) // CHECK-NEXT: ret <vscale x 8 x i32> [[TMP0]] // -vuint32m4_t test_vcpopv_v_u32m4_tu(vuint32m4_t maskedoff, vuint32m4_t vs2, size_t vl) { - return __riscv_vcpopv_tu(maskedoff, vs2, vl); +vuint32m4_t test_vcpop_v_u32m4_tu(vuint32m4_t maskedoff, vuint32m4_t vs2, size_t vl) { + return __riscv_vcpop_tu(maskedoff, vs2, vl); } -// CHECK-LABEL: @test_vcpopv_v_u32m8_tu( +// CHECK-LABEL: @test_vcpop_v_u32m8_tu( // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = call <vscale x 16 x i32> @llvm.riscv.vcpopv.nxv16i32.i64(<vscale x 
16 x i32> [[MASKEDOFF:%.*]], <vscale x 16 x i32> [[VS2:%.*]], i64 [[VL:%.*]]) // CHECK-NEXT: ret <vscale x 16 x i32> [[TMP0]] // -vuint32m8_t test_vcpopv_v_u32m8_tu(vuint32m8_t maskedoff, vuint32m8_t vs2, size_t vl) { - return __riscv_vcpopv_tu(maskedoff, vs2, vl); +vuint32m8_t test_vcpop_v_u32m8_tu(vuint32m8_t maskedoff, vuint32m8_t vs2, size_t vl) { + return __riscv_vcpop_tu(maskedoff, vs2, vl); } -// CHECK-LABEL: @test_vcpopv_v_u64m1_tu( +// CHECK-LABEL: @test_vcpop_v_u64m1_tu( // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = call <vscale x 1 x i64> @llvm.riscv.vcpopv.nxv1i64.i64(<vscale x 1 x i64> [[MASKEDOFF:%.*]], <vscale x 1 x i64> [[VS2:%.*]], i64 [[VL:%.*]]) // CHECK-NEXT: ret <vscale x 1 x i64> [[TMP0]] // -vuint64m1_t test_vcpopv_v_u64m1_tu(vuint64m1_t maskedoff, vuint64m1_t vs2, size_t vl) { - return __riscv_vcpopv_tu(maskedoff, vs2, vl); +vuint64m1_t test_vcpop_v_u64m1_tu(vuint64m1_t maskedoff, vuint64m1_t vs2, size_t vl) { + return __riscv_vcpop_tu(maskedoff, vs2, vl); } -// CHECK-LABEL: @test_vcpopv_v_u64m2_tu( +// CHECK-LABEL: @test_vcpop_v_u64m2_tu( // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = call <vscale x 2 x i64> @llvm.riscv.vcpopv.nxv2i64.i64(<vscale x 2 x i64> [[MASKEDOFF:%.*]], <vscale x 2 x i64> [[VS2:%.*]], i64 [[VL:%.*]]) // CHECK-NEXT: ret <vscale x 2 x i64> [[TMP0]] // -vuint64m2_t test_vcpopv_v_u64m2_tu(vuint64m2_t maskedoff, vuint64m2_t vs2, size_t vl) { - return __riscv_vcpopv_tu(maskedoff, vs2, vl); +vuint64m2_t test_vcpop_v_u64m2_tu(vuint64m2_t maskedoff, vuint64m2_t vs2, size_t vl) { + return __riscv_vcpop_tu(maskedoff, vs2, vl); } -// CHECK-LABEL: @test_vcpopv_v_u64m4_tu( +// CHECK-LABEL: @test_vcpop_v_u64m4_tu( // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = call <vscale x 4 x i64> @llvm.riscv.vcpopv.nxv4i64.i64(<vscale x 4 x i64> [[MASKEDOFF:%.*]], <vscale x 4 x i64> [[VS2:%.*]], i64 [[VL:%.*]]) // CHECK-NEXT: ret <vscale x 4 x i64> [[TMP0]] // -vuint64m4_t test_vcpopv_v_u64m4_tu(vuint64m4_t maskedoff, vuint64m4_t 
vs2, size_t vl) { - return __riscv_vcpopv_tu(maskedoff, vs2, vl); +vuint64m4_t test_vcpop_v_u64m4_tu(vuint64m4_t maskedoff, vuint64m4_t vs2, size_t vl) { + return __riscv_vcpop_tu(maskedoff, vs2, vl); } -// CHECK-LABEL: @test_vcpopv_v_u64m8_tu( +// CHECK-LABEL: @test_vcpop_v_u64m8_tu( // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = call <vscale x 8 x i64> @llvm.riscv.vcpopv.nxv8i64.i64(<vscale x 8 x i64> [[MASKEDOFF:%.*]], <vscale x 8 x i64> [[VS2:%.*]], i64 [[VL:%.*]]) // CHECK-NEXT: ret <vscale x 8 x i64> [[TMP0]] // -vuint64m8_t test_vcpopv_v_u64m8_tu(vuint64m8_t maskedoff, vuint64m8_t vs2, size_t vl) { - return __riscv_vcpopv_tu(maskedoff, vs2, vl); +vuint64m8_t test_vcpop_v_u64m8_tu(vuint64m8_t maskedoff, vuint64m8_t vs2, size_t vl) { + return __riscv_vcpop_tu(maskedoff, vs2, vl); } -// CHECK-LABEL: @test_vcpopv_v_u8mf8_tum( +// CHECK-LABEL: @test_vcpop_v_u8mf8_tum( // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = call <vscale x 1 x i8> @llvm.riscv.vcpopv.mask.nxv1i8.i64(<vscale x 1 x i8> [[MASKEDOFF:%.*]], <vscale x 1 x i8> [[VS2:%.*]], <vscale x 1 x i1> [[MASK:%.*]], i64 [[VL:%.*]], i64 2) // CHECK-NEXT: ret <vscale x 1 x i8> [[TMP0]] // -vuint8mf8_t test_vcpopv_v_u8mf8_tum(vbool64_t mask, vuint8mf8_t maskedoff, vuint8mf8_t vs2, size_t vl) { - return __riscv_vcpopv_tum(mask, maskedoff, vs2, vl); +vuint8mf8_t test_vcpop_v_u8mf8_tum(vbool64_t mask, vuint8mf8_t maskedoff, vuint8mf8_t vs2, size_t vl) { + return __riscv_vcpop_tum(mask, maskedoff, vs2, vl); } -// CHECK-LABEL: @test_vcpopv_v_u8mf4_tum( +// CHECK-LABEL: @test_vcpop_v_u8mf4_tum( // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = call <vscale x 2 x i8> @llvm.riscv.vcpopv.mask.nxv2i8.i64(<vscale x 2 x i8> [[MASKEDOFF:%.*]], <vscale x 2 x i8> [[VS2:%.*]], <vscale x 2 x i1> [[MASK:%.*]], i64 [[VL:%.*]], i64 2) // CHECK-NEXT: ret <vscale x 2 x i8> [[TMP0]] // -vuint8mf4_t test_vcpopv_v_u8mf4_tum(vbool32_t mask, vuint8mf4_t maskedoff, vuint8mf4_t vs2, size_t vl) { - return __riscv_vcpopv_tum(mask, 
maskedoff, vs2, vl); +vuint8mf4_t test_vcpop_v_u8mf4_tum(vbool32_t mask, vuint8mf4_t maskedoff, vuint8mf4_t vs2, size_t vl) { + return __riscv_vcpop_tum(mask, maskedoff, vs2, vl); } -// CHECK-LABEL: @test_vcpopv_v_u8mf2_tum( +// CHECK-LABEL: @test_vcpop_v_u8mf2_tum( // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = call <vscale x 4 x i8> @llvm.riscv.vcpopv.mask.nxv4i8.i64(<vscale x 4 x i8> [[MASKEDOFF:%.*]], <vscale x 4 x i8> [[VS2:%.*]], <vscale x 4 x i1> [[MASK:%.*]], i64 [[VL:%.*]], i64 2) // CHECK-NEXT: ret <vscale x 4 x i8> [[TMP0]] // -vuint8mf2_t test_vcpopv_v_u8mf2_tum(vbool16_t mask, vuint8mf2_t maskedoff, vuint8mf2_t vs2, size_t vl) { - return __riscv_vcpopv_tum(mask, maskedoff, vs2, vl); +vuint8mf2_t test_vcpop_v_u8mf2_tum(vbool16_t mask, vuint8mf2_t maskedoff, vuint8mf2_t vs2, size_t vl) { + return __riscv_vcpop_tum(mask, maskedoff, vs2, vl); } -// CHECK-LABEL: @test_vcpopv_v_u8m1_tum( +// CHECK-LABEL: @test_vcpop_v_u8m1_tum( // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = call <vscale x 8 x i8> @llvm.riscv.vcpopv.mask.nxv8i8.i64(<vscale x 8 x i8> [[MASKEDOFF:%.*]], <vscale x 8 x i8> [[VS2:%.*]], <vscale x 8 x i1> [[MASK:%.*]], i64 [[VL:%.*]], i64 2) // CHECK-NEXT: ret <vscale x 8 x i8> [[TMP0]] // -vuint8m1_t test_vcpopv_v_u8m1_tum(vbool8_t mask, vuint8m1_t maskedoff, vuint8m1_t vs2, size_t vl) { - return __riscv_vcpopv_tum(mask, maskedoff, vs2, vl); +vuint8m1_t test_vcpop_v_u8m1_tum(vbool8_t mask, vuint8m1_t maskedoff, vuint8m1_t vs2, size_t vl) { + return __riscv_vcpop_tum(mask, maskedoff, vs2, vl); } -// CHECK-LABEL: @test_vcpopv_v_u8m2_tum( +// CHECK-LABEL: @test_vcpop_v_u8m2_tum( // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = call <vscale x 16 x i8> @llvm.riscv.vcpopv.mask.nxv16i8.i64(<vscale x 16 x i8> [[MASKEDOFF:%.*]], <vscale x 16 x i8> [[VS2:%.*]], <vscale x 16 x i1> [[MASK:%.*]], i64 [[VL:%.*]], i64 2) // CHECK-NEXT: ret <vscale x 16 x i8> [[TMP0]] // -vuint8m2_t test_vcpopv_v_u8m2_tum(vbool4_t mask, vuint8m2_t maskedoff, 
vuint8m2_t vs2, size_t vl) { - return __riscv_vcpopv_tum(mask, maskedoff, vs2, vl); +vuint8m2_t test_vcpop_v_u8m2_tum(vbool4_t mask, vuint8m2_t maskedoff, vuint8m2_t vs2, size_t vl) { + return __riscv_vcpop_tum(mask, maskedoff, vs2, vl); } -// CHECK-LABEL: @test_vcpopv_v_u8m4_tum( +// CHECK-LABEL: @test_vcpop_v_u8m4_tum( // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = call <vscale x 32 x i8> @llvm.riscv.vcpopv.mask.nxv32i8.i64(<vscale x 32 x i8> [[MASKEDOFF:%.*]], <vscale x 32 x i8> [[VS2:%.*]], <vscale x 32 x i1> [[MASK:%.*]], i64 [[VL:%.*]], i64 2) // CHECK-NEXT: ret <vscale x 32 x i8> [[TMP0]] // -vuint8m4_t test_vcpopv_v_u8m4_tum(vbool2_t mask, vuint8m4_t maskedoff, vuint8m4_t vs2, size_t vl) { - return __riscv_vcpopv_tum(mask, maskedoff, vs2, vl); +vuint8m4_t test_vcpop_v_u8m4_tum(vbool2_t mask, vuint8m4_t maskedoff, vuint8m4_t vs2, size_t vl) { + return __riscv_vcpop_tum(mask, maskedoff, vs2, vl); } -// CHECK-LABEL: @test_vcpopv_v_u8m8_tum( +// CHECK-LABEL: @test_vcpop_v_u8m8_tum( // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = call <vscale x 64 x i8> @llvm.riscv.vcpopv.mask.nxv64i8.i64(<vscale x 64 x i8> [[MASKEDOFF:%.*]], <vscale x 64 x i8> [[VS2:%.*]], <vscale x 64 x i1> [[MASK:%.*]], i64 [[VL:%.*]], i64 2) // CHECK-NEXT: ret <vscale x 64 x i8> [[TMP0]] // -vuint8m8_t test_vcpopv_v_u8m8_tum(vbool1_t mask, vuint8m8_t maskedoff, vuint8m8_t vs2, size_t vl) { - return __riscv_vcpopv_tum(mask, maskedoff, vs2, vl); +vuint8m8_t test_vcpop_v_u8m8_tum(vbool1_t mask, vuint8m8_t maskedoff, vuint8m8_t vs2, size_t vl) { + return __riscv_vcpop_tum(mask, maskedoff, vs2, vl); } -// CHECK-LABEL: @test_vcpopv_v_u16mf4_tum( +// CHECK-LABEL: @test_vcpop_v_u16mf4_tum( // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = call <vscale x 1 x i16> @llvm.riscv.vcpopv.mask.nxv1i16.i64(<vscale x 1 x i16> [[MASKEDOFF:%.*]], <vscale x 1 x i16> [[VS2:%.*]], <vscale x 1 x i1> [[MASK:%.*]], i64 [[VL:%.*]], i64 2) // CHECK-NEXT: ret <vscale x 1 x i16> [[TMP0]] // -vuint16mf4_t 
test_vcpopv_v_u16mf4_tum(vbool64_t mask, vuint16mf4_t maskedoff, vuint16mf4_t vs2, size_t vl) { - return __riscv_vcpopv_tum(mask, maskedoff, vs2, vl); +vuint16mf4_t test_vcpop_v_u16mf4_tum(vbool64_t mask, vuint16mf4_t maskedoff, vuint16mf4_t vs2, size_t vl) { + return __riscv_vcpop_tum(mask, maskedoff, vs2, vl); } -// CHECK-LABEL: @test_vcpopv_v_u16mf2_tum( +// CHECK-LABEL: @test_vcpop_v_u16mf2_tum( // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = call <vscale x 2 x i16> @llvm.riscv.vcpopv.mask.nxv2i16.i64(<vscale x 2 x i16> [[MASKEDOFF:%.*]], <vscale x 2 x i16> [[VS2:%.*]], <vscale x 2 x i1> [[MASK:%.*]], i64 [[VL:%.*]], i64 2) // CHECK-NEXT: ret <vscale x 2 x i16> [[TMP0]] // -vuint16mf2_t test_vcpopv_v_u16mf2_tum(vbool32_t mask, vuint16mf2_t maskedoff, vuint16mf2_t vs2, size_t vl) { - return __riscv_vcpopv_tum(mask, maskedoff, vs2, vl); +vuint16mf2_t test_vcpop_v_u16mf2_tum(vbool32_t mask, vuint16mf2_t maskedoff, vuint16mf2_t vs2, size_t vl) { + return __riscv_vcpop_tum(mask, maskedoff, vs2, vl); } -// CHECK-LABEL: @test_vcpopv_v_u16m1_tum( +// CHECK-LABEL: @test_vcpop_v_u16m1_tum( // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = call <vscale x 4 x i16> @llvm.riscv.vcpopv.mask.nxv4i16.i64(<vscale x 4 x i16> [[MASKEDOFF:%.*]], <vscale x 4 x i16> [[VS2:%.*]], <vscale x 4 x i1> [[MASK:%.*]], i64 [[VL:%.*]], i64 2) // CHECK-NEXT: ret <vscale x 4 x i16> [[TMP0]] // -vuint16m1_t test_vcpopv_v_u16m1_tum(vbool16_t mask, vuint16m1_t maskedoff, vuint16m1_t vs2, size_t vl) { - return __riscv_vcpopv_tum(mask, maskedoff, vs2, vl); +vuint16m1_t test_vcpop_v_u16m1_tum(vbool16_t mask, vuint16m1_t maskedoff, vuint16m1_t vs2, size_t vl) { + return __riscv_vcpop_tum(mask, maskedoff, vs2, vl); } -// CHECK-LABEL: @test_vcpopv_v_u16m2_tum( +// CHECK-LABEL: @test_vcpop_v_u16m2_tum( // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = call <vscale x 8 x i16> @llvm.riscv.vcpopv.mask.nxv8i16.i64(<vscale x 8 x i16> [[MASKEDOFF:%.*]], <vscale x 8 x i16> [[VS2:%.*]], <vscale x 8 x i1> 
[[MASK:%.*]], i64 [[VL:%.*]], i64 2) // CHECK-NEXT: ret <vscale x 8 x i16> [[TMP0]] // -vuint16m2_t test_vcpopv_v_u16m2_tum(vbool8_t mask, vuint16m2_t maskedoff, vuint16m2_t vs2, size_t vl) { - return __riscv_vcpopv_tum(mask, maskedoff, vs2, vl); +vuint16m2_t test_vcpop_v_u16m2_tum(vbool8_t mask, vuint16m2_t maskedoff, vuint16m2_t vs2, size_t vl) { + return __riscv_vcpop_tum(mask, maskedoff, vs2, vl); } -// CHECK-LABEL: @test_vcpopv_v_u16m4_tum( +// CHECK-LABEL: @test_vcpop_v_u16m4_tum( // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = call <vscale x 16 x i16> @llvm.riscv.vcpopv.mask.nxv16i16.i64(<vscale x 16 x i16> [[MASKEDOFF:%.*]], <vscale x 16 x i16> [[VS2:%.*]], <vscale x 16 x i1> [[MASK:%.*]], i64 [[VL:%.*]], i64 2) // CHECK-NEXT: ret <vscale x 16 x i16> [[TMP0]] // -vuint16m4_t test_vcpopv_v_u16m4_tum(vbool4_t mask, vuint16m4_t maskedoff, vuint16m4_t vs2, size_t vl) { - return __riscv_vcpopv_tum(mask, maskedoff, vs2, vl); +vuint16m4_t test_vcpop_v_u16m4_tum(vbool4_t mask, vuint16m4_t maskedoff, vuint16m4_t vs2, size_t vl) { + return __riscv_vcpop_tum(mask, maskedoff, vs2, vl); } -// CHECK-LABEL: @test_vcpopv_v_u16m8_tum( +// CHECK-LABEL: @test_vcpop_v_u16m8_tum( // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = call <vscale x 32 x i16> @llvm.riscv.vcpopv.mask.nxv32i16.i64(<vscale x 32 x i16> [[MASKEDOFF:%.*]], <vscale x 32 x i16> [[VS2:%.*]], <vscale x 32 x i1> [[MASK:%.*]], i64 [[VL:%.*]], i64 2) // CHECK-NEXT: ret <vscale x 32 x i16> [[TMP0]] // -vuint16m8_t test_vcpopv_v_u16m8_tum(vbool2_t mask, vuint16m8_t maskedoff, vuint16m8_t vs2, size_t vl) { - return __riscv_vcpopv_tum(mask, maskedoff, vs2, vl); +vuint16m8_t test_vcpop_v_u16m8_tum(vbool2_t mask, vuint16m8_t maskedoff, vuint16m8_t vs2, size_t vl) { + return __riscv_vcpop_tum(mask, maskedoff, vs2, vl); } -// CHECK-LABEL: @test_vcpopv_v_u32mf2_tum( +// CHECK-LABEL: @test_vcpop_v_u32mf2_tum( // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = call <vscale x 1 x i32> 
@llvm.riscv.vcpopv.mask.nxv1i32.i64(<vscale x 1 x i32> [[MASKEDOFF:%.*]], <vscale x 1 x i32> [[VS2:%.*]], <vscale x 1 x i1> [[MASK:%.*]], i64 [[VL:%.*]], i64 2) // CHECK-NEXT: ret <vscale x 1 x i32> [[TMP0]] // -vuint32mf2_t test_vcpopv_v_u32mf2_tum(vbool64_t mask, vuint32mf2_t maskedoff, vuint32mf2_t vs2, size_t vl) { - return __riscv_vcpopv_tum(mask, maskedoff, vs2, vl); +vuint32mf2_t test_vcpop_v_u32mf2_tum(vbool64_t mask, vuint32mf2_t maskedoff, vuint32mf2_t vs2, size_t vl) { + return __riscv_vcpop_tum(mask, maskedoff, vs2, vl); } -// CHECK-LABEL: @test_vcpopv_v_u32m1_tum( +// CHECK-LABEL: @test_vcpop_v_u32m1_tum( // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = call <vscale x 2 x i32> @llvm.riscv.vcpopv.mask.nxv2i32.i64(<vscale x 2 x i32> [[MASKEDOFF:%.*]], <vscale x 2 x i32> [[VS2:%.*]], <vscale x 2 x i1> [[MASK:%.*]], i64 [[VL:%.*]], i64 2) // CHECK-NEXT: ret <vscale x 2 x i32> [[TMP0]] // -vuint32m1_t test_vcpopv_v_u32m1_tum(vbool32_t mask, vuint32m1_t maskedoff, vuint32m1_t vs2, size_t vl) { - return __riscv_vcpopv_tum(mask, maskedoff, vs2, vl); +vuint32m1_t test_vcpop_v_u32m1_tum(vbool32_t mask, vuint32m1_t maskedoff, vuint32m1_t vs2, size_t vl) { + return __riscv_vcpop_tum(mask, maskedoff, vs2, vl); } -// CHECK-LABEL: @test_vcpopv_v_u32m2_tum( +// CHECK-LABEL: @test_vcpop_v_u32m2_tum( // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = call <vscale x 4 x i32> @llvm.riscv.vcpopv.mask.nxv4i32.i64(<vscale x 4 x i32> [[MASKEDOFF:%.*]], <vscale x 4 x i32> [[VS2:%.*]], <vscale x 4 x i1> [[MASK:%.*]], i64 [[VL:%.*]], i64 2) // CHECK-NEXT: ret <vscale x 4 x i32> [[TMP0]] // -vuint32m2_t test_vcpopv_v_u32m2_tum(vbool16_t mask, vuint32m2_t maskedoff, vuint32m2_t vs2, size_t vl) { - return __riscv_vcpopv_tum(mask, maskedoff, vs2, vl); +vuint32m2_t test_vcpop_v_u32m2_tum(vbool16_t mask, vuint32m2_t maskedoff, vuint32m2_t vs2, size_t vl) { + return __riscv_vcpop_tum(mask, maskedoff, vs2, vl); } -// CHECK-LABEL: @test_vcpopv_v_u32m4_tum( +// CHECK-LABEL: 
@test_vcpop_v_u32m4_tum( // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = call <vscale x 8 x i32> @llvm.riscv.vcpopv.mask.nxv8i32.i64(<vscale x 8 x i32> [[MASKEDOFF:%.*]], <vscale x 8 x i32> [[VS2:%.*]], <vscale x 8 x i1> [[MASK:%.*]], i64 [[VL:%.*]], i64 2) // CHECK-NEXT: ret <vscale x 8 x i32> [[TMP0]] // -vuint32m4_t test_vcpopv_v_u32m4_tum(vbool8_t mask, vuint32m4_t maskedoff, vuint32m4_t vs2, size_t vl) { - return __riscv_vcpopv_tum(mask, maskedoff, vs2, vl); +vuint32m4_t test_vcpop_v_u32m4_tum(vbool8_t mask, vuint32m4_t maskedoff, vuint32m4_t vs2, size_t vl) { + return __riscv_vcpop_tum(mask, maskedoff, vs2, vl); } -// CHECK-LABEL: @test_vcpopv_v_u32m8_tum( +// CHECK-LABEL: @test_vcpop_v_u32m8_tum( // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = call <vscale x 16 x i32> @llvm.riscv.vcpopv.mask.nxv16i32.i64(<vscale x 16 x i32> [[MASKEDOFF:%.*]], <vscale x 16 x i32> [[VS2:%.*]], <vscale x 16 x i1> [[MASK:%.*]], i64 [[VL:%.*]], i64 2) // CHECK-NEXT: ret <vscale x 16 x i32> [[TMP0]] // -vuint32m8_t test_vcpopv_v_u32m8_tum(vbool4_t mask, vuint32m8_t maskedoff, vuint32m8_t vs2, size_t vl) { - return __riscv_vcpopv_tum(mask, maskedoff, vs2, vl); +vuint32m8_t test_vcpop_v_u32m8_tum(vbool4_t mask, vuint32m8_t maskedoff, vuint32m8_t vs2, size_t vl) { + return __riscv_vcpop_tum(mask, maskedoff, vs2, vl); } -// CHECK-LABEL: @test_vcpopv_v_u64m1_tum( +// CHECK-LABEL: @test_vcpop_v_u64m1_tum( // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = call <vscale x 1 x i64> @llvm.riscv.vcpopv.mask.nxv1i64.i64(<vscale x 1 x i64> [[MASKEDOFF:%.*]], <vscale x 1 x i64> [[VS2:%.*]], <vscale x 1 x i1> [[MASK:%.*]], i64 [[VL:%.*]], i64 2) // CHECK-NEXT: ret <vscale x 1 x i64> [[TMP0]] // -vuint64m1_t test_vcpopv_v_u64m1_tum(vbool64_t mask, vuint64m1_t maskedoff, vuint64m1_t vs2, size_t vl) { - return __riscv_vcpopv_tum(mask, maskedoff, vs2, vl); +vuint64m1_t test_vcpop_v_u64m1_tum(vbool64_t mask, vuint64m1_t maskedoff, vuint64m1_t vs2, size_t vl) { + return 
__riscv_vcpop_tum(mask, maskedoff, vs2, vl); } -// CHECK-LABEL: @test_vcpopv_v_u64m2_tum( +// CHECK-LABEL: @test_vcpop_v_u64m2_tum( // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = call <vscale x 2 x i64> @llvm.riscv.vcpopv.mask.nxv2i64.i64(<vscale x 2 x i64> [[MASKEDOFF:%.*]], <vscale x 2 x i64> [[VS2:%.*]], <vscale x 2 x i1> [[MASK:%.*]], i64 [[VL:%.*]], i64 2) // CHECK-NEXT: ret <vscale x 2 x i64> [[TMP0]] // -vuint64m2_t test_vcpopv_v_u64m2_tum(vbool32_t mask, vuint64m2_t maskedoff, vuint64m2_t vs2, size_t vl) { - return __riscv_vcpopv_tum(mask, maskedoff, vs2, vl); +vuint64m2_t test_vcpop_v_u64m2_tum(vbool32_t mask, vuint64m2_t maskedoff, vuint64m2_t vs2, size_t vl) { + return __riscv_vcpop_tum(mask, maskedoff, vs2, vl); } -// CHECK-LABEL: @test_vcpopv_v_u64m4_tum( +// CHECK-LABEL: @test_vcpop_v_u64m4_tum( // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = call <vscale x 4 x i64> @llvm.riscv.vcpopv.mask.nxv4i64.i64(<vscale x 4 x i64> [[MASKEDOFF:%.*]], <vscale x 4 x i64> [[VS2:%.*]], <vscale x 4 x i1> [[MASK:%.*]], i64 [[VL:%.*]], i64 2) // CHECK-NEXT: ret <vscale x 4 x i64> [[TMP0]] // -vuint64m4_t test_vcpopv_v_u64m4_tum(vbool16_t mask, vuint64m4_t maskedoff, vuint64m4_t vs2, size_t vl) { - return __riscv_vcpopv_tum(mask, maskedoff, vs2, vl); +vuint64m4_t test_vcpop_v_u64m4_tum(vbool16_t mask, vuint64m4_t maskedoff, vuint64m4_t vs2, size_t vl) { + return __riscv_vcpop_tum(mask, maskedoff, vs2, vl); } -// CHECK-LABEL: @test_vcpopv_v_u64m8_tum( +// CHECK-LABEL: @test_vcpop_v_u64m8_tum( // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = call <vscale x 8 x i64> @llvm.riscv.vcpopv.mask.nxv8i64.i64(<vscale x 8 x i64> [[MASKEDOFF:%.*]], <vscale x 8 x i64> [[VS2:%.*]], <vscale x 8 x i1> [[MASK:%.*]], i64 [[VL:%.*]], i64 2) // CHECK-NEXT: ret <vscale x 8 x i64> [[TMP0]] // -vuint64m8_t test_vcpopv_v_u64m8_tum(vbool8_t mask, vuint64m8_t maskedoff, vuint64m8_t vs2, size_t vl) { - return __riscv_vcpopv_tum(mask, maskedoff, vs2, vl); +vuint64m8_t 
test_vcpop_v_u64m8_tum(vbool8_t mask, vuint64m8_t maskedoff, vuint64m8_t vs2, size_t vl) { + return __riscv_vcpop_tum(mask, maskedoff, vs2, vl); } -// CHECK-LABEL: @test_vcpopv_v_u8mf8_tumu( +// CHECK-LABEL: @test_vcpop_v_u8mf8_tumu( // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = call <vscale x 1 x i8> @llvm.riscv.vcpopv.mask.nxv1i8.i64(<vscale x 1 x i8> [[MASKEDOFF:%.*]], <vscale x 1 x i8> [[VS2:%.*]], <vscale x 1 x i1> [[MASK:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-NEXT: ret <vscale x 1 x i8> [[TMP0]] // -vuint8mf8_t test_vcpopv_v_u8mf8_tumu(vbool64_t mask, vuint8mf8_t maskedoff, vuint8mf8_t vs2, size_t vl) { - return __riscv_vcpopv_tumu(mask, maskedoff, vs2, vl); +vuint8mf8_t test_vcpop_v_u8mf8_tumu(vbool64_t mask, vuint8mf8_t maskedoff, vuint8mf8_t vs2, size_t vl) { + return __riscv_vcpop_tumu(mask, maskedoff, vs2, vl); } -// CHECK-LABEL: @test_vcpopv_v_u8mf4_tumu( +// CHECK-LABEL: @test_vcpop_v_u8mf4_tumu( // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = call <vscale x 2 x i8> @llvm.riscv.vcpopv.mask.nxv2i8.i64(<vscale x 2 x i8> [[MASKEDOFF:%.*]], <vscale x 2 x i8> [[VS2:%.*]], <vscale x 2 x i1> [[MASK:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-NEXT: ret <vscale x 2 x i8> [[TMP0]] // -vuint8mf4_t test_vcpopv_v_u8mf4_tumu(vbool32_t mask, vuint8mf4_t maskedoff, vuint8mf4_t vs2, size_t vl) { - return __riscv_vcpopv_tumu(mask, maskedoff, vs2, vl); +vuint8mf4_t test_vcpop_v_u8mf4_tumu(vbool32_t mask, vuint8mf4_t maskedoff, vuint8mf4_t vs2, size_t vl) { + return __riscv_vcpop_tumu(mask, maskedoff, vs2, vl); } -// CHECK-LABEL: @test_vcpopv_v_u8mf2_tumu( +// CHECK-LABEL: @test_vcpop_v_u8mf2_tumu( // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = call <vscale x 4 x i8> @llvm.riscv.vcpopv.mask.nxv4i8.i64(<vscale x 4 x i8> [[MASKEDOFF:%.*]], <vscale x 4 x i8> [[VS2:%.*]], <vscale x 4 x i1> [[MASK:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-NEXT: ret <vscale x 4 x i8> [[TMP0]] // -vuint8mf2_t test_vcpopv_v_u8mf2_tumu(vbool16_t mask, vuint8mf2_t maskedoff, vuint8mf2_t vs2, 
size_t vl) { - return __riscv_vcpopv_tumu(mask, maskedoff, vs2, vl); +vuint8mf2_t test_vcpop_v_u8mf2_tumu(vbool16_t mask, vuint8mf2_t maskedoff, vuint8mf2_t vs2, size_t vl) { + return __riscv_vcpop_tumu(mask, maskedoff, vs2, vl); } -// CHECK-LABEL: @test_vcpopv_v_u8m1_tumu( +// CHECK-LABEL: @test_vcpop_v_u8m1_tumu( // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = call <vscale x 8 x i8> @llvm.riscv.vcpopv.mask.nxv8i8.i64(<vscale x 8 x i8> [[MASKEDOFF:%.*]], <vscale x 8 x i8> [[VS2:%.*]], <vscale x 8 x i1> [[MASK:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-NEXT: ret <vscale x 8 x i8> [[TMP0]] // -vuint8m1_t test_vcpopv_v_u8m1_tumu(vbool8_t mask, vuint8m1_t maskedoff, vuint8m1_t vs2, size_t vl) { - return __riscv_vcpopv_tumu(mask, maskedoff, vs2, vl); +vuint8m1_t test_vcpop_v_u8m1_tumu(vbool8_t mask, vuint8m1_t maskedoff, vuint8m1_t vs2, size_t vl) { + return __riscv_vcpop_tumu(mask, maskedoff, vs2, vl); } -// CHECK-LABEL: @test_vcpopv_v_u8m2_tumu( +// CHECK-LABEL: @test_vcpop_v_u8m2_tumu( // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = call <vscale x 16 x i8> @llvm.riscv.vcpopv.mask.nxv16i8.i64(<vscale x 16 x i8> [[MASKEDOFF:%.*]], <vscale x 16 x i8> [[VS2:%.*]], <vscale x 16 x i1> [[MASK:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-NEXT: ret <vscale x 16 x i8> [[TMP0]] // -vuint8m2_t test_vcpopv_v_u8m2_tumu(vbool4_t mask, vuint8m2_t maskedoff, vuint8m2_t vs2, size_t vl) { - return __riscv_vcpopv_tumu(mask, maskedoff, vs2, vl); +vuint8m2_t test_vcpop_v_u8m2_tumu(vbool4_t mask, vuint8m2_t maskedoff, vuint8m2_t vs2, size_t vl) { + return __riscv_vcpop_tumu(mask, maskedoff, vs2, vl); } -// CHECK-LABEL: @test_vcpopv_v_u8m4_tumu( +// CHECK-LABEL: @test_vcpop_v_u8m4_tumu( // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = call <vscale x 32 x i8> @llvm.riscv.vcpopv.mask.nxv32i8.i64(<vscale x 32 x i8> [[MASKEDOFF:%.*]], <vscale x 32 x i8> [[VS2:%.*]], <vscale x 32 x i1> [[MASK:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-NEXT: ret <vscale x 32 x i8> [[TMP0]] // -vuint8m4_t 
test_vcpopv_v_u8m4_tumu(vbool2_t mask, vuint8m4_t maskedoff, vuint8m4_t vs2, size_t vl) { - return __riscv_vcpopv_tumu(mask, maskedoff, vs2, vl); +vuint8m4_t test_vcpop_v_u8m4_tumu(vbool2_t mask, vuint8m4_t maskedoff, vuint8m4_t vs2, size_t vl) { + return __riscv_vcpop_tumu(mask, maskedoff, vs2, vl); } -// CHECK-LABEL: @test_vcpopv_v_u8m8_tumu( +// CHECK-LABEL: @test_vcpop_v_u8m8_tumu( // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = call <vscale x 64 x i8> @llvm.riscv.vcpopv.mask.nxv64i8.i64(<vscale x 64 x i8> [[MASKEDOFF:%.*]], <vscale x 64 x i8> [[VS2:%.*]], <vscale x 64 x i1> [[MASK:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-NEXT: ret <vscale x 64 x i8> [[TMP0]] // -vuint8m8_t test_vcpopv_v_u8m8_tumu(vbool1_t mask, vuint8m8_t maskedoff, vuint8m8_t vs2, size_t vl) { - return __riscv_vcpopv_tumu(mask, maskedoff, vs2, vl); +vuint8m8_t test_vcpop_v_u8m8_tumu(vbool1_t mask, vuint8m8_t maskedoff, vuint8m8_t vs2, size_t vl) { + return __riscv_vcpop_tumu(mask, maskedoff, vs2, vl); } -// CHECK-LABEL: @test_vcpopv_v_u16mf4_tumu( +// CHECK-LABEL: @test_vcpop_v_u16mf4_tumu( // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = call <vscale x 1 x i16> @llvm.riscv.vcpopv.mask.nxv1i16.i64(<vscale x 1 x i16> [[MASKEDOFF:%.*]], <vscale x 1 x i16> [[VS2:%.*]], <vscale x 1 x i1> [[MASK:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-NEXT: ret <vscale x 1 x i16> [[TMP0]] // -vuint16mf4_t test_vcpopv_v_u16mf4_tumu(vbool64_t mask, vuint16mf4_t maskedoff, vuint16mf4_t vs2, size_t vl) { - return __riscv_vcpopv_tumu(mask, maskedoff, vs2, vl); +vuint16mf4_t test_vcpop_v_u16mf4_tumu(vbool64_t mask, vuint16mf4_t maskedoff, vuint16mf4_t vs2, size_t vl) { + return __riscv_vcpop_tumu(mask, maskedoff, vs2, vl); } -// CHECK-LABEL: @test_vcpopv_v_u16mf2_tumu( +// CHECK-LABEL: @test_vcpop_v_u16mf2_tumu( // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = call <vscale x 2 x i16> @llvm.riscv.vcpopv.mask.nxv2i16.i64(<vscale x 2 x i16> [[MASKEDOFF:%.*]], <vscale x 2 x i16> [[VS2:%.*]], <vscale x 2 x i1> 
[[MASK:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-NEXT: ret <vscale x 2 x i16> [[TMP0]] // -vuint16mf2_t test_vcpopv_v_u16mf2_tumu(vbool32_t mask, vuint16mf2_t maskedoff, vuint16mf2_t vs2, size_t vl) { - return __riscv_vcpopv_tumu(mask, maskedoff, vs2, vl); +vuint16mf2_t test_vcpop_v_u16mf2_tumu(vbool32_t mask, vuint16mf2_t maskedoff, vuint16mf2_t vs2, size_t vl) { + return __riscv_vcpop_tumu(mask, maskedoff, vs2, vl); } -// CHECK-LABEL: @test_vcpopv_v_u16m1_tumu( +// CHECK-LABEL: @test_vcpop_v_u16m1_tumu( // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = call <vscale x 4 x i16> @llvm.riscv.vcpopv.mask.nxv4i16.i64(<vscale x 4 x i16> [[MASKEDOFF:%.*]], <vscale x 4 x i16> [[VS2:%.*]], <vscale x 4 x i1> [[MASK:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-NEXT: ret <vscale x 4 x i16> [[TMP0]] // -vuint16m1_t test_vcpopv_v_u16m1_tumu(vbool16_t mask, vuint16m1_t maskedoff, vuint16m1_t vs2, size_t vl) { - return __riscv_vcpopv_tumu(mask, maskedoff, vs2, vl); +vuint16m1_t test_vcpop_v_u16m1_tumu(vbool16_t mask, vuint16m1_t maskedoff, vuint16m1_t vs2, size_t vl) { + return __riscv_vcpop_tumu(mask, maskedoff, vs2, vl); } -// CHECK-LABEL: @test_vcpopv_v_u16m2_tumu( +// CHECK-LABEL: @test_vcpop_v_u16m2_tumu( // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = call <vscale x 8 x i16> @llvm.riscv.vcpopv.mask.nxv8i16.i64(<vscale x 8 x i16> [[MASKEDOFF:%.*]], <vscale x 8 x i16> [[VS2:%.*]], <vscale x 8 x i1> [[MASK:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-NEXT: ret <vscale x 8 x i16> [[TMP0]] // -vuint16m2_t test_vcpopv_v_u16m2_tumu(vbool8_t mask, vuint16m2_t maskedoff, vuint16m2_t vs2, size_t vl) { - return __riscv_vcpopv_tumu(mask, maskedoff, vs2, vl); +vuint16m2_t test_vcpop_v_u16m2_tumu(vbool8_t mask, vuint16m2_t maskedoff, vuint16m2_t vs2, size_t vl) { + return __riscv_vcpop_tumu(mask, maskedoff, vs2, vl); } -// CHECK-LABEL: @test_vcpopv_v_u16m4_tumu( +// CHECK-LABEL: @test_vcpop_v_u16m4_tumu( // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = call <vscale x 16 x i16> 
@llvm.riscv.vcpopv.mask.nxv16i16.i64(<vscale x 16 x i16> [[MASKEDOFF:%.*]], <vscale x 16 x i16> [[VS2:%.*]], <vscale x 16 x i1> [[MASK:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-NEXT: ret <vscale x 16 x i16> [[TMP0]] // -vuint16m4_t test_vcpopv_v_u16m4_tumu(vbool4_t mask, vuint16m4_t maskedoff, vuint16m4_t vs2, size_t vl) { - return __riscv_vcpopv_tumu(mask, maskedoff, vs2, vl); +vuint16m4_t test_vcpop_v_u16m4_tumu(vbool4_t mask, vuint16m4_t maskedoff, vuint16m4_t vs2, size_t vl) { + return __riscv_vcpop_tumu(mask, maskedoff, vs2, vl); } -// CHECK-LABEL: @test_vcpopv_v_u16m8_tumu( +// CHECK-LABEL: @test_vcpop_v_u16m8_tumu( // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = call <vscale x 32 x i16> @llvm.riscv.vcpopv.mask.nxv32i16.i64(<vscale x 32 x i16> [[MASKEDOFF:%.*]], <vscale x 32 x i16> [[VS2:%.*]], <vscale x 32 x i1> [[MASK:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-NEXT: ret <vscale x 32 x i16> [[TMP0]] // -vuint16m8_t test_vcpopv_v_u16m8_tumu(vbool2_t mask, vuint16m8_t maskedoff, vuint16m8_t vs2, size_t vl) { - return __riscv_vcpopv_tumu(mask, maskedoff, vs2, vl); +vuint16m8_t test_vcpop_v_u16m8_tumu(vbool2_t mask, vuint16m8_t maskedoff, vuint16m8_t vs2, size_t vl) { + return __riscv_vcpop_tumu(mask, maskedoff, vs2, vl); } -// CHECK-LABEL: @test_vcpopv_v_u32mf2_tumu( +// CHECK-LABEL: @test_vcpop_v_u32mf2_tumu( // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = call <vscale x 1 x i32> @llvm.riscv.vcpopv.mask.nxv1i32.i64(<vscale x 1 x i32> [[MASKEDOFF:%.*]], <vscale x 1 x i32> [[VS2:%.*]], <vscale x 1 x i1> [[MASK:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-NEXT: ret <vscale x 1 x i32> [[TMP0]] // -vuint32mf2_t test_vcpopv_v_u32mf2_tumu(vbool64_t mask, vuint32mf2_t maskedoff, vuint32mf2_t vs2, size_t vl) { - return __riscv_vcpopv_tumu(mask, maskedoff, vs2, vl); +vuint32mf2_t test_vcpop_v_u32mf2_tumu(vbool64_t mask, vuint32mf2_t maskedoff, vuint32mf2_t vs2, size_t vl) { + return __riscv_vcpop_tumu(mask, maskedoff, vs2, vl); } -// CHECK-LABEL: @test_vcpopv_v_u32m1_tumu( 
+// CHECK-LABEL: @test_vcpop_v_u32m1_tumu( // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = call <vscale x 2 x i32> @llvm.riscv.vcpopv.mask.nxv2i32.i64(<vscale x 2 x i32> [[MASKEDOFF:%.*]], <vscale x 2 x i32> [[VS2:%.*]], <vscale x 2 x i1> [[MASK:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-NEXT: ret <vscale x 2 x i32> [[TMP0]] // -vuint32m1_t test_vcpopv_v_u32m1_tumu(vbool32_t mask, vuint32m1_t maskedoff, vuint32m1_t vs2, size_t vl) { - return __riscv_vcpopv_tumu(mask, maskedoff, vs2, vl); +vuint32m1_t test_vcpop_v_u32m1_tumu(vbool32_t mask, vuint32m1_t maskedoff, vuint32m1_t vs2, size_t vl) { + return __riscv_vcpop_tumu(mask, maskedoff, vs2, vl); } -// CHECK-LABEL: @test_vcpopv_v_u32m2_tumu( +// CHECK-LABEL: @test_vcpop_v_u32m2_tumu( // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = call <vscale x 4 x i32> @llvm.riscv.vcpopv.mask.nxv4i32.i64(<vscale x 4 x i32> [[MASKEDOFF:%.*]], <vscale x 4 x i32> [[VS2:%.*]], <vscale x 4 x i1> [[MASK:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-NEXT: ret <vscale x 4 x i32> [[TMP0]] // -vuint32m2_t test_vcpopv_v_u32m2_tumu(vbool16_t mask, vuint32m2_t maskedoff, vuint32m2_t vs2, size_t vl) { - return __riscv_vcpopv_tumu(mask, maskedoff, vs2, vl); +vuint32m2_t test_vcpop_v_u32m2_tumu(vbool16_t mask, vuint32m2_t maskedoff, vuint32m2_t vs2, size_t vl) { + return __riscv_vcpop_tumu(mask, maskedoff, vs2, vl); } -// CHECK-LABEL: @test_vcpopv_v_u32m4_tumu( +// CHECK-LABEL: @test_vcpop_v_u32m4_tumu( // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = call <vscale x 8 x i32> @llvm.riscv.vcpopv.mask.nxv8i32.i64(<vscale x 8 x i32> [[MASKEDOFF:%.*]], <vscale x 8 x i32> [[VS2:%.*]], <vscale x 8 x i1> [[MASK:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-NEXT: ret <vscale x 8 x i32> [[TMP0]] // -vuint32m4_t test_vcpopv_v_u32m4_tumu(vbool8_t mask, vuint32m4_t maskedoff, vuint32m4_t vs2, size_t vl) { - return __riscv_vcpopv_tumu(mask, maskedoff, vs2, vl); +vuint32m4_t test_vcpop_v_u32m4_tumu(vbool8_t mask, vuint32m4_t maskedoff, vuint32m4_t vs2, size_t vl) { + 
return __riscv_vcpop_tumu(mask, maskedoff, vs2, vl); } -// CHECK-LABEL: @test_vcpopv_v_u32m8_tumu( +// CHECK-LABEL: @test_vcpop_v_u32m8_tumu( // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = call <vscale x 16 x i32> @llvm.riscv.vcpopv.mask.nxv16i32.i64(<vscale x 16 x i32> [[MASKEDOFF:%.*]], <vscale x 16 x i32> [[VS2:%.*]], <vscale x 16 x i1> [[MASK:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-NEXT: ret <vscale x 16 x i32> [[TMP0]] // -vuint32m8_t test_vcpopv_v_u32m8_tumu(vbool4_t mask, vuint32m8_t maskedoff, vuint32m8_t vs2, size_t vl) { - return __riscv_vcpopv_tumu(mask, maskedoff, vs2, vl); +vuint32m8_t test_vcpop_v_u32m8_tumu(vbool4_t mask, vuint32m8_t maskedoff, vuint32m8_t vs2, size_t vl) { + return __riscv_vcpop_tumu(mask, maskedoff, vs2, vl); } -// CHECK-LABEL: @test_vcpopv_v_u64m1_tumu( +// CHECK-LABEL: @test_vcpop_v_u64m1_tumu( // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = call <vscale x 1 x i64> @llvm.riscv.vcpopv.mask.nxv1i64.i64(<vscale x 1 x i64> [[MASKEDOFF:%.*]], <vscale x 1 x i64> [[VS2:%.*]], <vscale x 1 x i1> [[MASK:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-NEXT: ret <vscale x 1 x i64> [[TMP0]] // -vuint64m1_t test_vcpopv_v_u64m1_tumu(vbool64_t mask, vuint64m1_t maskedoff, vuint64m1_t vs2, size_t vl) { - return __riscv_vcpopv_tumu(mask, maskedoff, vs2, vl); +vuint64m1_t test_vcpop_v_u64m1_tumu(vbool64_t mask, vuint64m1_t maskedoff, vuint64m1_t vs2, size_t vl) { + return __riscv_vcpop_tumu(mask, maskedoff, vs2, vl); } -// CHECK-LABEL: @test_vcpopv_v_u64m2_tumu( +// CHECK-LABEL: @test_vcpop_v_u64m2_tumu( // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = call <vscale x 2 x i64> @llvm.riscv.vcpopv.mask.nxv2i64.i64(<vscale x 2 x i64> [[MASKEDOFF:%.*]], <vscale x 2 x i64> [[VS2:%.*]], <vscale x 2 x i1> [[MASK:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-NEXT: ret <vscale x 2 x i64> [[TMP0]] // -vuint64m2_t test_vcpopv_v_u64m2_tumu(vbool32_t mask, vuint64m2_t maskedoff, vuint64m2_t vs2, size_t vl) { - return __riscv_vcpopv_tumu(mask, maskedoff, vs2, vl); 
+vuint64m2_t test_vcpop_v_u64m2_tumu(vbool32_t mask, vuint64m2_t maskedoff, vuint64m2_t vs2, size_t vl) { + return __riscv_vcpop_tumu(mask, maskedoff, vs2, vl); } -// CHECK-LABEL: @test_vcpopv_v_u64m4_tumu( +// CHECK-LABEL: @test_vcpop_v_u64m4_tumu( // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = call <vscale x 4 x i64> @llvm.riscv.vcpopv.mask.nxv4i64.i64(<vscale x 4 x i64> [[MASKEDOFF:%.*]], <vscale x 4 x i64> [[VS2:%.*]], <vscale x 4 x i1> [[MASK:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-NEXT: ret <vscale x 4 x i64> [[TMP0]] // -vuint64m4_t test_vcpopv_v_u64m4_tumu(vbool16_t mask, vuint64m4_t maskedoff, vuint64m4_t vs2, size_t vl) { - return __riscv_vcpopv_tumu(mask, maskedoff, vs2, vl); +vuint64m4_t test_vcpop_v_u64m4_tumu(vbool16_t mask, vuint64m4_t maskedoff, vuint64m4_t vs2, size_t vl) { + return __riscv_vcpop_tumu(mask, maskedoff, vs2, vl); } -// CHECK-LABEL: @test_vcpopv_v_u64m8_tumu( +// CHECK-LABEL: @test_vcpop_v_u64m8_tumu( // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = call <vscale x 8 x i64> @llvm.riscv.vcpopv.mask.nxv8i64.i64(<vscale x 8 x i64> [[MASKEDOFF:%.*]], <vscale x 8 x i64> [[VS2:%.*]], <vscale x 8 x i1> [[MASK:%.*]], i64 [[VL:%.*]], i64 0) // CHECK-NEXT: ret <vscale x 8 x i64> [[TMP0]] // -vuint64m8_t test_vcpopv_v_u64m8_tumu(vbool8_t mask, vuint64m8_t maskedoff, vuint64m8_t vs2, size_t vl) { - return __riscv_vcpopv_tumu(mask, maskedoff, vs2, vl); +vuint64m8_t test_vcpop_v_u64m8_tumu(vbool8_t mask, vuint64m8_t maskedoff, vuint64m8_t vs2, size_t vl) { + return __riscv_vcpop_tumu(mask, maskedoff, vs2, vl); } -// CHECK-LABEL: @test_vcpopv_v_u8mf8_mu( +// CHECK-LABEL: @test_vcpop_v_u8mf8_mu( // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = call <vscale x 1 x i8> @llvm.riscv.vcpopv.mask.nxv1i8.i64(<vscale x 1 x i8> [[MASKEDOFF:%.*]], <vscale x 1 x i8> [[VS2:%.*]], <vscale x 1 x i1> [[MASK:%.*]], i64 [[VL:%.*]], i64 1) // CHECK-NEXT: ret <vscale x 1 x i8> [[TMP0]] // -vuint8mf8_t test_vcpopv_v_u8mf8_mu(vbool64_t mask, vuint8mf8_t 
maskedoff, vuint8mf8_t vs2, size_t vl) { - return __riscv_vcpopv_mu(mask, maskedoff, vs2, vl); +vuint8mf8_t test_vcpop_v_u8mf8_mu(vbool64_t mask, vuint8mf8_t maskedoff, vuint8mf8_t vs2, size_t vl) { + return __riscv_vcpop_mu(mask, maskedoff, vs2, vl); } -// CHECK-LABEL: @test_vcpopv_v_u8mf4_mu( +// CHECK-LABEL: @test_vcpop_v_u8mf4_mu( // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = call <vscale x 2 x i8> @llvm.riscv.vcpopv.mask.nxv2i8.i64(<vscale x 2 x i8> [[MASKEDOFF:%.*]], <vscale x 2 x i8> [[VS2:%.*]], <vscale x 2 x i1> [[MASK:%.*]], i64 [[VL:%.*]], i64 1) // CHECK-NEXT: ret <vscale x 2 x i8> [[TMP0]] // -vuint8mf4_t test_vcpopv_v_u8mf4_mu(vbool32_t mask, vuint8mf4_t maskedoff, vuint8mf4_t vs2, size_t vl) { - return __riscv_vcpopv_mu(mask, maskedoff, vs2, vl); +vuint8mf4_t test_vcpop_v_u8mf4_mu(vbool32_t mask, vuint8mf4_t maskedoff, vuint8mf4_t vs2, size_t vl) { + return __riscv_vcpop_mu(mask, maskedoff, vs2, vl); } -// CHECK-LABEL: @test_vcpopv_v_u8mf2_mu( +// CHECK-LABEL: @test_vcpop_v_u8mf2_mu( // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = call <vscale x 4 x i8> @llvm.riscv.vcpopv.mask.nxv4i8.i64(<vscale x 4 x i8> [[MASKEDOFF:%.*]], <vscale x 4 x i8> [[VS2:%.*]], <vscale x 4 x i1> [[MASK:%.*]], i64 [[VL:%.*]], i64 1) // CHECK-NEXT: ret <vscale x 4 x i8> [[TMP0]] // -vuint8mf2_t test_vcpopv_v_u8mf2_mu(vbool16_t mask, vuint8mf2_t maskedoff, vuint8mf2_t vs2, size_t vl) { - return __riscv_vcpopv_mu(mask, maskedoff, vs2, vl); +vuint8mf2_t test_vcpop_v_u8mf2_mu(vbool16_t mask, vuint8mf2_t maskedoff, vuint8mf2_t vs2, size_t vl) { + return __riscv_vcpop_mu(mask, maskedoff, vs2, vl); } -// CHECK-LABEL: @test_vcpopv_v_u8m1_mu( +// CHECK-LABEL: @test_vcpop_v_u8m1_mu( // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = call <vscale x 8 x i8> @llvm.riscv.vcpopv.mask.nxv8i8.i64(<vscale x 8 x i8> [[MASKEDOFF:%.*]], <vscale x 8 x i8> [[VS2:%.*]], <vscale x 8 x i1> [[MASK:%.*]], i64 [[VL:%.*]], i64 1) // CHECK-NEXT: ret <vscale x 8 x i8> [[TMP0]] // -vuint8m1_t 
test_vcpopv_v_u8m1_mu(vbool8_t mask, vuint8m1_t maskedoff, vuint8m1_t vs2, size_t vl) { - return __riscv_vcpopv_mu(mask, maskedoff, vs2, vl); +vuint8m1_t test_vcpop_v_u8m1_mu(vbool8_t mask, vuint8m1_t maskedoff, vuint8m1_t vs2, size_t vl) { + return __riscv_vcpop_mu(mask, maskedoff, vs2, vl); } -// CHECK-LABEL: @test_vcpopv_v_u8m2_mu( +// CHECK-LABEL: @test_vcpop_v_u8m2_mu( // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = call <vscale x 16 x i8> @llvm.riscv.vcpopv.mask.nxv16i8.i64(<vscale x 16 x i8> [[MASKEDOFF:%.*]], <vscale x 16 x i8> [[VS2:%.*]], <vscale x 16 x i1> [[MASK:%.*]], i64 [[VL:%.*]], i64 1) // CHECK-NEXT: ret <vscale x 16 x i8> [[TMP0]] // -vuint8m2_t test_vcpopv_v_u8m2_mu(vbool4_t mask, vuint8m2_t maskedoff, vuint8m2_t vs2, size_t vl) { - return __riscv_vcpopv_mu(mask, maskedoff, vs2, vl); +vuint8m2_t test_vcpop_v_u8m2_mu(vbool4_t mask, vuint8m2_t maskedoff, vuint8m2_t vs2, size_t vl) { + return __riscv_vcpop_mu(mask, maskedoff, vs2, vl); } -// CHECK-LABEL: @test_vcpopv_v_u8m4_mu( +// CHECK-LABEL: @test_vcpop_v_u8m4_mu( // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = call <vscale x 32 x i8> @llvm.riscv.vcpopv.mask.nxv32i8.i64(<vscale x 32 x i8> [[MASKEDOFF:%.*]], <vscale x 32 x i8> [[VS2:%.*]], <vscale x 32 x i1> [[MASK:%.*]], i64 [[VL:%.*]], i64 1) // CHECK-NEXT: ret <vscale x 32 x i8> [[TMP0]] // -vuint8m4_t test_vcpopv_v_u8m4_mu(vbool2_t mask, vuint8m4_t maskedoff, vuint8m4_t vs2, size_t vl) { - return __riscv_vcpopv_mu(mask, maskedoff, vs2, vl); +vuint8m4_t test_vcpop_v_u8m4_mu(vbool2_t mask, vuint8m4_t maskedoff, vuint8m4_t vs2, size_t vl) { + return __riscv_vcpop_mu(mask, maskedoff, vs2, vl); } -// CHECK-LABEL: @test_vcpopv_v_u8m8_mu( +// CHECK-LABEL: @test_vcpop_v_u8m8_mu( // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = call <vscale x 64 x i8> @llvm.riscv.vcpopv.mask.nxv64i8.i64(<vscale x 64 x i8> [[MASKEDOFF:%.*]], <vscale x 64 x i8> [[VS2:%.*]], <vscale x 64 x i1> [[MASK:%.*]], i64 [[VL:%.*]], i64 1) // CHECK-NEXT: ret <vscale x 
64 x i8> [[TMP0]] // -vuint8m8_t test_vcpopv_v_u8m8_mu(vbool1_t mask, vuint8m8_t maskedoff, vuint8m8_t vs2, size_t vl) { - return __riscv_vcpopv_mu(mask, maskedoff, vs2, vl); +vuint8m8_t test_vcpop_v_u8m8_mu(vbool1_t mask, vuint8m8_t maskedoff, vuint8m8_t vs2, size_t vl) { + return __riscv_vcpop_mu(mask, maskedoff, vs2, vl); } -// CHECK-LABEL: @test_vcpopv_v_u16mf4_mu( +// CHECK-LABEL: @test_vcpop_v_u16mf4_mu( // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = call <vscale x 1 x i16> @llvm.riscv.vcpopv.mask.nxv1i16.i64(<vscale x 1 x i16> [[MASKEDOFF:%.*]], <vscale x 1 x i16> [[VS2:%.*]], <vscale x 1 x i1> [[MASK:%.*]], i64 [[VL:%.*]], i64 1) // CHECK-NEXT: ret <vscale x 1 x i16> [[TMP0]] // -vuint16mf4_t test_vcpopv_v_u16mf4_mu(vbool64_t mask, vuint16mf4_t maskedoff, vuint16mf4_t vs2, size_t vl) { - return __riscv_vcpopv_mu(mask, maskedoff, vs2, vl); +vuint16mf4_t test_vcpop_v_u16mf4_mu(vbool64_t mask, vuint16mf4_t maskedoff, vuint16mf4_t vs2, size_t vl) { + return __riscv_vcpop_mu(mask, maskedoff, vs2, vl); } -// CHECK-LABEL: @test_vcpopv_v_u16mf2_mu( +// CHECK-LABEL: @test_vcpop_v_u16mf2_mu( // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = call <vscale x 2 x i16> @llvm.riscv.vcpopv.mask.nxv2i16.i64(<vscale x 2 x i16> [[MASKEDOFF:%.*]], <vscale x 2 x i16> [[VS2:%.*]], <vscale x 2 x i1> [[MASK:%.*]], i64 [[VL:%.*]], i64 1) // CHECK-NEXT: ret <vscale x 2 x i16> [[TMP0]] // -vuint16mf2_t test_vcpopv_v_u16mf2_mu(vbool32_t mask, vuint16mf2_t maskedoff, vuint16mf2_t vs2, size_t vl) { - return __riscv_vcpopv_mu(mask, maskedoff, vs2, vl); +vuint16mf2_t test_vcpop_v_u16mf2_mu(vbool32_t mask, vuint16mf2_t maskedoff, vuint16mf2_t vs2, size_t vl) { + return __riscv_vcpop_mu(mask, maskedoff, vs2, vl); } -// CHECK-LABEL: @test_vcpopv_v_u16m1_mu( +// CHECK-LABEL: @test_vcpop_v_u16m1_mu( // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = call <vscale x 4 x i16> @llvm.riscv.vcpopv.mask.nxv4i16.i64(<vscale x 4 x i16> [[MASKEDOFF:%.*]], <vscale x 4 x i16> [[VS2:%.*]], <vscale x 
4 x i1> [[MASK:%.*]], i64 [[VL:%.*]], i64 1) // CHECK-NEXT: ret <vscale x 4 x i16> [[TMP0]] // -vuint16m1_t test_vcpopv_v_u16m1_mu(vbool16_t mask, vuint16m1_t maskedoff, vuint16m1_t vs2, size_t vl) { - return __riscv_vcpopv_mu(mask, maskedoff, vs2, vl); +vuint16m1_t test_vcpop_v_u16m1_mu(vbool16_t mask, vuint16m1_t maskedoff, vuint16m1_t vs2, size_t vl) { + return __riscv_vcpop_mu(mask, maskedoff, vs2, vl); } -// CHECK-LABEL: @test_vcpopv_v_u16m2_mu( +// CHECK-LABEL: @test_vcpop_v_u16m2_mu( // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = call <vscale x 8 x i16> @llvm.riscv.vcpopv.mask.nxv8i16.i64(<vscale x 8 x i16> [[MASKEDOFF:%.*]], <vscale x 8 x i16> [[VS2:%.*]], <vscale x 8 x i1> [[MASK:%.*]], i64 [[VL:%.*]], i64 1) // CHECK-NEXT: ret <vscale x 8 x i16> [[TMP0]] // -vuint16m2_t test_vcpopv_v_u16m2_mu(vbool8_t mask, vuint16m2_t maskedoff, vuint16m2_t vs2, size_t vl) { - return __riscv_vcpopv_mu(mask, maskedoff, vs2, vl); +vuint16m2_t test_vcpop_v_u16m2_mu(vbool8_t mask, vuint16m2_t maskedoff, vuint16m2_t vs2, size_t vl) { + return __riscv_vcpop_mu(mask, maskedoff, vs2, vl); } -// CHECK-LABEL: @test_vcpopv_v_u16m4_mu( +// CHECK-LABEL: @test_vcpop_v_u16m4_mu( // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = call <vscale x 16 x i16> @llvm.riscv.vcpopv.mask.nxv16i16.i64(<vscale x 16 x i16> [[MASKEDOFF:%.*]], <vscale x 16 x i16> [[VS2:%.*]], <vscale x 16 x i1> [[MASK:%.*]], i64 [[VL:%.*]], i64 1) // CHECK-NEXT: ret <vscale x 16 x i16> [[TMP0]] // -vuint16m4_t test_vcpopv_v_u16m4_mu(vbool4_t mask, vuint16m4_t maskedoff, vuint16m4_t vs2, size_t vl) { - return __riscv_vcpopv_mu(mask, maskedoff, vs2, vl); +vuint16m4_t test_vcpop_v_u16m4_mu(vbool4_t mask, vuint16m4_t maskedoff, vuint16m4_t vs2, size_t vl) { + return __riscv_vcpop_mu(mask, maskedoff, vs2, vl); } -// CHECK-LABEL: @test_vcpopv_v_u16m8_mu( +// CHECK-LABEL: @test_vcpop_v_u16m8_mu( // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = call <vscale x 32 x i16> @llvm.riscv.vcpopv.mask.nxv32i16.i64(<vscale x 
32 x i16> [[MASKEDOFF:%.*]], <vscale x 32 x i16> [[VS2:%.*]], <vscale x 32 x i1> [[MASK:%.*]], i64 [[VL:%.*]], i64 1) // CHECK-NEXT: ret <vscale x 32 x i16> [[TMP0]] // -vuint16m8_t test_vcpopv_v_u16m8_mu(vbool2_t mask, vuint16m8_t maskedoff, vuint16m8_t vs2, size_t vl) { - return __riscv_vcpopv_mu(mask, maskedoff, vs2, vl); +vuint16m8_t test_vcpop_v_u16m8_mu(vbool2_t mask, vuint16m8_t maskedoff, vuint16m8_t vs2, size_t vl) { + return __riscv_vcpop_mu(mask, maskedoff, vs2, vl); } -// CHECK-LABEL: @test_vcpopv_v_u32mf2_mu( +// CHECK-LABEL: @test_vcpop_v_u32mf2_mu( // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = call <vscale x 1 x i32> @llvm.riscv.vcpopv.mask.nxv1i32.i64(<vscale x 1 x i32> [[MASKEDOFF:%.*]], <vscale x 1 x i32> [[VS2:%.*]], <vscale x 1 x i1> [[MASK:%.*]], i64 [[VL:%.*]], i64 1) // CHECK-NEXT: ret <vscale x 1 x i32> [[TMP0]] // -vuint32mf2_t test_vcpopv_v_u32mf2_mu(vbool64_t mask, vuint32mf2_t maskedoff, vuint32mf2_t vs2, size_t vl) { - return __riscv_vcpopv_mu(mask, maskedoff, vs2, vl); +vuint32mf2_t test_vcpop_v_u32mf2_mu(vbool64_t mask, vuint32mf2_t maskedoff, vuint32mf2_t vs2, size_t vl) { + return __riscv_vcpop_mu(mask, maskedoff, vs2, vl); } -// CHECK-LABEL: @test_vcpopv_v_u32m1_mu( +// CHECK-LABEL: @test_vcpop_v_u32m1_mu( // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = call <vscale x 2 x i32> @llvm.riscv.vcpopv.mask.nxv2i32.i64(<vscale x 2 x i32> [[MASKEDOFF:%.*]], <vscale x 2 x i32> [[VS2:%.*]], <vscale x 2 x i1> [[MASK:%.*]], i64 [[VL:%.*]], i64 1) // CHECK-NEXT: ret <vscale x 2 x i32> [[TMP0]] // -vuint32m1_t test_vcpopv_v_u32m1_mu(vbool32_t mask, vuint32m1_t maskedoff, vuint32m1_t vs2, size_t vl) { - return __riscv_vcpopv_mu(mask, maskedoff, vs2, vl); +vuint32m1_t test_vcpop_v_u32m1_mu(vbool32_t mask, vuint32m1_t maskedoff, vuint32m1_t vs2, size_t vl) { + return __riscv_vcpop_mu(mask, maskedoff, vs2, vl); } -// CHECK-LABEL: @test_vcpopv_v_u32m2_mu( +// CHECK-LABEL: @test_vcpop_v_u32m2_mu( // CHECK-NEXT: entry: // CHECK-NEXT: 
[[TMP0:%.*]] = call <vscale x 4 x i32> @llvm.riscv.vcpopv.mask.nxv4i32.i64(<vscale x 4 x i32> [[MASKEDOFF:%.*]], <vscale x 4 x i32> [[VS2:%.*]], <vscale x 4 x i1> [[MASK:%.*]], i64 [[VL:%.*]], i64 1) // CHECK-NEXT: ret <vscale x 4 x i32> [[TMP0]] // -vuint32m2_t test_vcpopv_v_u32m2_mu(vbool16_t mask, vuint32m2_t maskedoff, vuint32m2_t vs2, size_t vl) { - return __riscv_vcpopv_mu(mask, maskedoff, vs2, vl); +vuint32m2_t test_vcpop_v_u32m2_mu(vbool16_t mask, vuint32m2_t maskedoff, vuint32m2_t vs2, size_t vl) { + return __riscv_vcpop_mu(mask, maskedoff, vs2, vl); } -// CHECK-LABEL: @test_vcpopv_v_u32m4_mu( +// CHECK-LABEL: @test_vcpop_v_u32m4_mu( // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = call <vscale x 8 x i32> @llvm.riscv.vcpopv.mask.nxv8i32.i64(<vscale x 8 x i32> [[MASKEDOFF:%.*]], <vscale x 8 x i32> [[VS2:%.*]], <vscale x 8 x i1> [[MASK:%.*]], i64 [[VL:%.*]], i64 1) // CHECK-NEXT: ret <vscale x 8 x i32> [[TMP0]] // -vuint32m4_t test_vcpopv_v_u32m4_mu(vbool8_t mask, vuint32m4_t maskedoff, vuint32m4_t vs2, size_t vl) { - return __riscv_vcpopv_mu(mask, maskedoff, vs2, vl); +vuint32m4_t test_vcpop_v_u32m4_mu(vbool8_t mask, vuint32m4_t maskedoff, vuint32m4_t vs2, size_t vl) { + return __riscv_vcpop_mu(mask, maskedoff, vs2, vl); } -// CHECK-LABEL: @test_vcpopv_v_u32m8_mu( +// CHECK-LABEL: @test_vcpop_v_u32m8_mu( // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = call <vscale x 16 x i32> @llvm.riscv.vcpopv.mask.nxv16i32.i64(<vscale x 16 x i32> [[MASKEDOFF:%.*]], <vscale x 16 x i32> [[VS2:%.*]], <vscale x 16 x i1> [[MASK:%.*]], i64 [[VL:%.*]], i64 1) // CHECK-NEXT: ret <vscale x 16 x i32> [[TMP0]] // -vuint32m8_t test_vcpopv_v_u32m8_mu(vbool4_t mask, vuint32m8_t maskedoff, vuint32m8_t vs2, size_t vl) { - return __riscv_vcpopv_mu(mask, maskedoff, vs2, vl); +vuint32m8_t test_vcpop_v_u32m8_mu(vbool4_t mask, vuint32m8_t maskedoff, vuint32m8_t vs2, size_t vl) { + return __riscv_vcpop_mu(mask, maskedoff, vs2, vl); } -// CHECK-LABEL: @test_vcpopv_v_u64m1_mu( +// 
CHECK-LABEL: @test_vcpop_v_u64m1_mu( // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = call <vscale x 1 x i64> @llvm.riscv.vcpopv.mask.nxv1i64.i64(<vscale x 1 x i64> [[MASKEDOFF:%.*]], <vscale x 1 x i64> [[VS2:%.*]], <vscale x 1 x i1> [[MASK:%.*]], i64 [[VL:%.*]], i64 1) // CHECK-NEXT: ret <vscale x 1 x i64> [[TMP0]] // -vuint64m1_t test_vcpopv_v_u64m1_mu(vbool64_t mask, vuint64m1_t maskedoff, vuint64m1_t vs2, size_t vl) { - return __riscv_vcpopv_mu(mask, maskedoff, vs2, vl); +vuint64m1_t test_vcpop_v_u64m1_mu(vbool64_t mask, vuint64m1_t maskedoff, vuint64m1_t vs2, size_t vl) { + return __riscv_vcpop_mu(mask, maskedoff, vs2, vl); } -// CHECK-LABEL: @test_vcpopv_v_u64m2_mu( +// CHECK-LABEL: @test_vcpop_v_u64m2_mu( // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = call <vscale x 2 x i64> @llvm.riscv.vcpopv.mask.nxv2i64.i64(<vscale x 2 x i64> [[MASKEDOFF:%.*]], <vscale x 2 x i64> [[VS2:%.*]], <vscale x 2 x i1> [[MASK:%.*]], i64 [[VL:%.*]], i64 1) // CHECK-NEXT: ret <vscale x 2 x i64> [[TMP0]] // -vuint64m2_t test_vcpopv_v_u64m2_mu(vbool32_t mask, vuint64m2_t maskedoff, vuint64m2_t vs2, size_t vl) { - return __riscv_vcpopv_mu(mask, maskedoff, vs2, vl); +vuint64m2_t test_vcpop_v_u64m2_mu(vbool32_t mask, vuint64m2_t maskedoff, vuint64m2_t vs2, size_t vl) { + return __riscv_vcpop_mu(mask, maskedoff, vs2, vl); } -// CHECK-LABEL: @test_vcpopv_v_u64m4_mu( +// CHECK-LABEL: @test_vcpop_v_u64m4_mu( // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = call <vscale x 4 x i64> @llvm.riscv.vcpopv.mask.nxv4i64.i64(<vscale x 4 x i64> [[MASKEDOFF:%.*]], <vscale x 4 x i64> [[VS2:%.*]], <vscale x 4 x i1> [[MASK:%.*]], i64 [[VL:%.*]], i64 1) // CHECK-NEXT: ret <vscale x 4 x i64> [[TMP0]] // -vuint64m4_t test_vcpopv_v_u64m4_mu(vbool16_t mask, vuint64m4_t maskedoff, vuint64m4_t vs2, size_t vl) { - return __riscv_vcpopv_mu(mask, maskedoff, vs2, vl); +vuint64m4_t test_vcpop_v_u64m4_mu(vbool16_t mask, vuint64m4_t maskedoff, vuint64m4_t vs2, size_t vl) { + return __riscv_vcpop_mu(mask, 
maskedoff, vs2, vl); } -// CHECK-LABEL: @test_vcpopv_v_u64m8_mu( +// CHECK-LABEL: @test_vcpop_v_u64m8_mu( // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = call <vscale x 8 x i64> @llvm.riscv.vcpopv.mask.nxv8i64.i64(<vscale x 8 x i64> [[MASKEDOFF:%.*]], <vscale x 8 x i64> [[VS2:%.*]], <vscale x 8 x i1> [[MASK:%.*]], i64 [[VL:%.*]], i64 1) // CHECK-NEXT: ret <vscale x 8 x i64> [[TMP0]] // -vuint64m8_t test_vcpopv_v_u64m8_mu(vbool8_t mask, vuint64m8_t maskedoff, vuint64m8_t vs2, size_t vl) { - return __riscv_vcpopv_mu(mask, maskedoff, vs2, vl); +vuint64m8_t test_vcpop_v_u64m8_mu(vbool8_t mask, vuint64m8_t maskedoff, vuint64m8_t vs2, size_t vl) { + return __riscv_vcpop_mu(mask, maskedoff, vs2, vl); } diff --git a/clang/test/CodeGen/voidptr-vaarg.c b/clang/test/CodeGen/voidptr-vaarg.c new file mode 100644 index 0000000..d023ddf --- /dev/null +++ b/clang/test/CodeGen/voidptr-vaarg.c @@ -0,0 +1,478 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py +// REQUIRES: webassembly-registered-target +// RUN: %clang_cc1 -triple wasm32-unknown-unknown -emit-llvm -o - %s | FileCheck %s + +// Multiple targets use emitVoidPtrVAArg to lower va_arg instructions in clang +// PPC is complicated, excluding from this case analysis +// ForceRightAdjust is false for all non-PPC targets +// AllowHigherAlign is only false for two Microsoft targets, both of which +// pass most things by reference. 
+// +// Address emitVoidPtrVAArg(CodeGenFunction &CGF, Address VAListAddr, +// QualType ValueTy, bool IsIndirect, +// TypeInfoChars ValueInfo, CharUnits SlotSizeAndAlign, +// bool AllowHigherAlign, bool ForceRightAdjust = +// false); +// +// Target IsIndirect SlotSize AllowHigher ForceRightAdjust +// ARC false four true false +// ARM varies four true false +// Mips false 4 or 8 true false +// RISCV varies register true false +// PPC elided +// LoongArch varies register true false +// NVPTX WIP +// AMDGPU WIP +// X86_32 false four true false +// X86_64 MS varies eight false false +// CSKY false four true false +// Webassembly varies four true false +// AArch64 false eight true false +// AArch64 MS false eight false false +// +// Webassembly passes indirectly iff it's an aggregate of multiple values +// Choosing this as a representative architecture to check IR generation +// partly because it has a relatively simple variadic calling convention. + +// Int, by itself and packed in structs +// CHECK-LABEL: @raw_int( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[LIST_ADDR:%.*]] = alloca ptr, align 4 +// CHECK-NEXT: store ptr [[LIST:%.*]], ptr [[LIST_ADDR]], align 4 +// CHECK-NEXT: [[ARGP_CUR:%.*]] = load ptr, ptr [[LIST_ADDR]], align 4 +// CHECK-NEXT: [[ARGP_NEXT:%.*]] = getelementptr inbounds i8, ptr [[ARGP_CUR]], i32 4 +// CHECK-NEXT: store ptr [[ARGP_NEXT]], ptr [[LIST_ADDR]], align 4 +// CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[ARGP_CUR]], align 4 +// CHECK-NEXT: ret i32 [[TMP0]] +// +int raw_int(__builtin_va_list list) { return __builtin_va_arg(list, int); } + +typedef struct { + int x; +} one_int_t; + +// CHECK-LABEL: @one_int( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[RETVAL:%.*]] = alloca [[STRUCT_ONE_INT_T:%.*]], align 4 +// CHECK-NEXT: [[LIST_ADDR:%.*]] = alloca ptr, align 4 +// CHECK-NEXT: store ptr [[LIST:%.*]], ptr [[LIST_ADDR]], align 4 +// CHECK-NEXT: [[ARGP_CUR:%.*]] = load ptr, ptr [[LIST_ADDR]], align 4 +// CHECK-NEXT: [[ARGP_NEXT:%.*]] = getelementptr 
inbounds i8, ptr [[ARGP_CUR]], i32 4 +// CHECK-NEXT: store ptr [[ARGP_NEXT]], ptr [[LIST_ADDR]], align 4 +// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 4 [[RETVAL]], ptr align 4 [[ARGP_CUR]], i32 4, i1 false) +// CHECK-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds [[STRUCT_ONE_INT_T]], ptr [[RETVAL]], i32 0, i32 0 +// CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[COERCE_DIVE]], align 4 +// CHECK-NEXT: ret i32 [[TMP0]] +// +one_int_t one_int(__builtin_va_list list) { + return __builtin_va_arg(list, one_int_t); +} + +typedef struct { + int x; + int y; +} two_int_t; + +// CHECK-LABEL: @two_int( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[LIST_ADDR:%.*]] = alloca ptr, align 4 +// CHECK-NEXT: store ptr [[LIST:%.*]], ptr [[LIST_ADDR]], align 4 +// CHECK-NEXT: [[ARGP_CUR:%.*]] = load ptr, ptr [[LIST_ADDR]], align 4 +// CHECK-NEXT: [[ARGP_NEXT:%.*]] = getelementptr inbounds i8, ptr [[ARGP_CUR]], i32 4 +// CHECK-NEXT: store ptr [[ARGP_NEXT]], ptr [[LIST_ADDR]], align 4 +// CHECK-NEXT: [[TMP0:%.*]] = load ptr, ptr [[ARGP_CUR]], align 4 +// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 4 [[AGG_RESULT:%.*]], ptr align 4 [[TMP0]], i32 8, i1 false) +// CHECK-NEXT: ret void +// +two_int_t two_int(__builtin_va_list list) { + return __builtin_va_arg(list, two_int_t); +} + +// Double, by itself and packed in structs +// CHECK-LABEL: @raw_double( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[LIST_ADDR:%.*]] = alloca ptr, align 4 +// CHECK-NEXT: store ptr [[LIST:%.*]], ptr [[LIST_ADDR]], align 4 +// CHECK-NEXT: [[ARGP_CUR:%.*]] = load ptr, ptr [[LIST_ADDR]], align 4 +// CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i8, ptr [[ARGP_CUR]], i32 7 +// CHECK-NEXT: [[ARGP_CUR_ALIGNED:%.*]] = call ptr @llvm.ptrmask.p0.i32(ptr [[TMP0]], i32 -8) +// CHECK-NEXT: [[ARGP_NEXT:%.*]] = getelementptr inbounds i8, ptr [[ARGP_CUR_ALIGNED]], i32 8 +// CHECK-NEXT: store ptr [[ARGP_NEXT]], ptr [[LIST_ADDR]], align 4 +// CHECK-NEXT: [[TMP1:%.*]] = load double, ptr [[ARGP_CUR_ALIGNED]], 
align 8 +// CHECK-NEXT: ret double [[TMP1]] +// +double raw_double(__builtin_va_list list) { + return __builtin_va_arg(list, double); +} + +typedef struct { + double x; +} one_double_t; + +// CHECK-LABEL: @one_double( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[RETVAL:%.*]] = alloca [[STRUCT_ONE_DOUBLE_T:%.*]], align 8 +// CHECK-NEXT: [[LIST_ADDR:%.*]] = alloca ptr, align 4 +// CHECK-NEXT: store ptr [[LIST:%.*]], ptr [[LIST_ADDR]], align 4 +// CHECK-NEXT: [[ARGP_CUR:%.*]] = load ptr, ptr [[LIST_ADDR]], align 4 +// CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i8, ptr [[ARGP_CUR]], i32 7 +// CHECK-NEXT: [[ARGP_CUR_ALIGNED:%.*]] = call ptr @llvm.ptrmask.p0.i32(ptr [[TMP0]], i32 -8) +// CHECK-NEXT: [[ARGP_NEXT:%.*]] = getelementptr inbounds i8, ptr [[ARGP_CUR_ALIGNED]], i32 8 +// CHECK-NEXT: store ptr [[ARGP_NEXT]], ptr [[LIST_ADDR]], align 4 +// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[RETVAL]], ptr align 8 [[ARGP_CUR_ALIGNED]], i32 8, i1 false) +// CHECK-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds [[STRUCT_ONE_DOUBLE_T]], ptr [[RETVAL]], i32 0, i32 0 +// CHECK-NEXT: [[TMP1:%.*]] = load double, ptr [[COERCE_DIVE]], align 8 +// CHECK-NEXT: ret double [[TMP1]] +// +one_double_t one_double(__builtin_va_list list) { + return __builtin_va_arg(list, one_double_t); +} + +typedef struct { + double x; + double y; +} two_double_t; + +// CHECK-LABEL: @two_double( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[LIST_ADDR:%.*]] = alloca ptr, align 4 +// CHECK-NEXT: store ptr [[LIST:%.*]], ptr [[LIST_ADDR]], align 4 +// CHECK-NEXT: [[ARGP_CUR:%.*]] = load ptr, ptr [[LIST_ADDR]], align 4 +// CHECK-NEXT: [[ARGP_NEXT:%.*]] = getelementptr inbounds i8, ptr [[ARGP_CUR]], i32 4 +// CHECK-NEXT: store ptr [[ARGP_NEXT]], ptr [[LIST_ADDR]], align 4 +// CHECK-NEXT: [[TMP0:%.*]] = load ptr, ptr [[ARGP_CUR]], align 4 +// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[AGG_RESULT:%.*]], ptr align 8 [[TMP0]], i32 16, i1 false) +// CHECK-NEXT: ret void +// 
+two_double_t two_double(__builtin_va_list list) { + return __builtin_va_arg(list, two_double_t); +} + +// Scalar smaller than the slot size (C would promote a short to int) +typedef struct { + char x; +} one_char_t; + +// CHECK-LABEL: @one_char( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[RETVAL:%.*]] = alloca [[STRUCT_ONE_CHAR_T:%.*]], align 1 +// CHECK-NEXT: [[LIST_ADDR:%.*]] = alloca ptr, align 4 +// CHECK-NEXT: store ptr [[LIST:%.*]], ptr [[LIST_ADDR]], align 4 +// CHECK-NEXT: [[ARGP_CUR:%.*]] = load ptr, ptr [[LIST_ADDR]], align 4 +// CHECK-NEXT: [[ARGP_NEXT:%.*]] = getelementptr inbounds i8, ptr [[ARGP_CUR]], i32 4 +// CHECK-NEXT: store ptr [[ARGP_NEXT]], ptr [[LIST_ADDR]], align 4 +// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 1 [[RETVAL]], ptr align 4 [[ARGP_CUR]], i32 1, i1 false) +// CHECK-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds [[STRUCT_ONE_CHAR_T]], ptr [[RETVAL]], i32 0, i32 0 +// CHECK-NEXT: [[TMP0:%.*]] = load i8, ptr [[COERCE_DIVE]], align 1 +// CHECK-NEXT: ret i8 [[TMP0]] +// +one_char_t one_char(__builtin_va_list list) { + return __builtin_va_arg(list, one_char_t); +} + +typedef struct { + short x; +} one_short_t; + +// CHECK-LABEL: @one_short( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[RETVAL:%.*]] = alloca [[STRUCT_ONE_SHORT_T:%.*]], align 2 +// CHECK-NEXT: [[LIST_ADDR:%.*]] = alloca ptr, align 4 +// CHECK-NEXT: store ptr [[LIST:%.*]], ptr [[LIST_ADDR]], align 4 +// CHECK-NEXT: [[ARGP_CUR:%.*]] = load ptr, ptr [[LIST_ADDR]], align 4 +// CHECK-NEXT: [[ARGP_NEXT:%.*]] = getelementptr inbounds i8, ptr [[ARGP_CUR]], i32 4 +// CHECK-NEXT: store ptr [[ARGP_NEXT]], ptr [[LIST_ADDR]], align 4 +// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 2 [[RETVAL]], ptr align 4 [[ARGP_CUR]], i32 2, i1 false) +// CHECK-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds [[STRUCT_ONE_SHORT_T]], ptr [[RETVAL]], i32 0, i32 0 +// CHECK-NEXT: [[TMP0:%.*]] = load i16, ptr [[COERCE_DIVE]], align 2 +// CHECK-NEXT: ret i16 [[TMP0]] +// 
+one_short_t one_short(__builtin_va_list list) { + return __builtin_va_arg(list, one_short_t); +} + +// Composite smaller than the slot size +typedef struct { + _Alignas(2) char x; + char y; +} char_pair_t; + +// CHECK-LABEL: @char_pair( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[LIST_ADDR:%.*]] = alloca ptr, align 4 +// CHECK-NEXT: store ptr [[LIST:%.*]], ptr [[LIST_ADDR]], align 4 +// CHECK-NEXT: [[ARGP_CUR:%.*]] = load ptr, ptr [[LIST_ADDR]], align 4 +// CHECK-NEXT: [[ARGP_NEXT:%.*]] = getelementptr inbounds i8, ptr [[ARGP_CUR]], i32 4 +// CHECK-NEXT: store ptr [[ARGP_NEXT]], ptr [[LIST_ADDR]], align 4 +// CHECK-NEXT: [[TMP0:%.*]] = load ptr, ptr [[ARGP_CUR]], align 4 +// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 2 [[AGG_RESULT:%.*]], ptr align 2 [[TMP0]], i32 2, i1 false) +// CHECK-NEXT: ret void +// +char_pair_t char_pair(__builtin_va_list list) { + return __builtin_va_arg(list, char_pair_t); +} + +// Empty struct +typedef struct { +} empty_t; + +// CHECK-LABEL: @empty( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[RETVAL:%.*]] = alloca [[STRUCT_EMPTY_T:%.*]], align 1 +// CHECK-NEXT: [[LIST_ADDR:%.*]] = alloca ptr, align 4 +// CHECK-NEXT: store ptr [[LIST:%.*]], ptr [[LIST_ADDR]], align 4 +// CHECK-NEXT: [[ARGP_CUR:%.*]] = load ptr, ptr [[LIST_ADDR]], align 4 +// CHECK-NEXT: [[ARGP_NEXT:%.*]] = getelementptr inbounds i8, ptr [[ARGP_CUR]], i32 0 +// CHECK-NEXT: store ptr [[ARGP_NEXT]], ptr [[LIST_ADDR]], align 4 +// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 1 [[RETVAL]], ptr align 4 [[ARGP_CUR]], i32 0, i1 false) +// CHECK-NEXT: ret void +// +empty_t empty(__builtin_va_list list) { + return __builtin_va_arg(list, empty_t); +} + +typedef struct { + empty_t x; + int y; +} empty_int_t; + +// CHECK-LABEL: @empty_int( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[RETVAL:%.*]] = alloca [[STRUCT_EMPTY_INT_T:%.*]], align 4 +// CHECK-NEXT: [[LIST_ADDR:%.*]] = alloca ptr, align 4 +// CHECK-NEXT: store ptr [[LIST:%.*]], ptr [[LIST_ADDR]], align 4 +// 
CHECK-NEXT: [[ARGP_CUR:%.*]] = load ptr, ptr [[LIST_ADDR]], align 4 +// CHECK-NEXT: [[ARGP_NEXT:%.*]] = getelementptr inbounds i8, ptr [[ARGP_CUR]], i32 4 +// CHECK-NEXT: store ptr [[ARGP_NEXT]], ptr [[LIST_ADDR]], align 4 +// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 4 [[RETVAL]], ptr align 4 [[ARGP_CUR]], i32 4, i1 false) +// CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[RETVAL]], align 4 +// CHECK-NEXT: ret i32 [[TMP0]] +// +empty_int_t empty_int(__builtin_va_list list) { + return __builtin_va_arg(list, empty_int_t); +} + +typedef struct { + int x; + empty_t y; +} int_empty_t; + +// CHECK-LABEL: @int_empty( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[RETVAL:%.*]] = alloca [[STRUCT_INT_EMPTY_T:%.*]], align 4 +// CHECK-NEXT: [[LIST_ADDR:%.*]] = alloca ptr, align 4 +// CHECK-NEXT: store ptr [[LIST:%.*]], ptr [[LIST_ADDR]], align 4 +// CHECK-NEXT: [[ARGP_CUR:%.*]] = load ptr, ptr [[LIST_ADDR]], align 4 +// CHECK-NEXT: [[ARGP_NEXT:%.*]] = getelementptr inbounds i8, ptr [[ARGP_CUR]], i32 4 +// CHECK-NEXT: store ptr [[ARGP_NEXT]], ptr [[LIST_ADDR]], align 4 +// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 4 [[RETVAL]], ptr align 4 [[ARGP_CUR]], i32 4, i1 false) +// CHECK-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds [[STRUCT_INT_EMPTY_T]], ptr [[RETVAL]], i32 0, i32 0 +// CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[COERCE_DIVE]], align 4 +// CHECK-NEXT: ret i32 [[TMP0]] +// +int_empty_t int_empty(__builtin_va_list list) { + return __builtin_va_arg(list, int_empty_t); +} + +// Need multiple va_arg instructions to check the postincrement +// Using types that are passed directly as the indirect handling +// is independent of the alignment handling in emitVoidPtrDirectVAArg. 
+ +// CHECK-LABEL: @multiple_int( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[LIST_ADDR:%.*]] = alloca ptr, align 4 +// CHECK-NEXT: [[OUT0_ADDR:%.*]] = alloca ptr, align 4 +// CHECK-NEXT: [[OUT1_ADDR:%.*]] = alloca ptr, align 4 +// CHECK-NEXT: [[OUT2_ADDR:%.*]] = alloca ptr, align 4 +// CHECK-NEXT: store ptr [[LIST:%.*]], ptr [[LIST_ADDR]], align 4 +// CHECK-NEXT: store ptr [[OUT0:%.*]], ptr [[OUT0_ADDR]], align 4 +// CHECK-NEXT: store ptr [[OUT1:%.*]], ptr [[OUT1_ADDR]], align 4 +// CHECK-NEXT: store ptr [[OUT2:%.*]], ptr [[OUT2_ADDR]], align 4 +// CHECK-NEXT: [[ARGP_CUR:%.*]] = load ptr, ptr [[LIST_ADDR]], align 4 +// CHECK-NEXT: [[ARGP_NEXT:%.*]] = getelementptr inbounds i8, ptr [[ARGP_CUR]], i32 4 +// CHECK-NEXT: store ptr [[ARGP_NEXT]], ptr [[LIST_ADDR]], align 4 +// CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[ARGP_CUR]], align 4 +// CHECK-NEXT: [[TMP1:%.*]] = load ptr, ptr [[OUT0_ADDR]], align 4 +// CHECK-NEXT: store i32 [[TMP0]], ptr [[TMP1]], align 4 +// CHECK-NEXT: [[ARGP_CUR1:%.*]] = load ptr, ptr [[LIST_ADDR]], align 4 +// CHECK-NEXT: [[ARGP_NEXT2:%.*]] = getelementptr inbounds i8, ptr [[ARGP_CUR1]], i32 4 +// CHECK-NEXT: store ptr [[ARGP_NEXT2]], ptr [[LIST_ADDR]], align 4 +// CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr [[ARGP_CUR1]], align 4 +// CHECK-NEXT: [[TMP3:%.*]] = load ptr, ptr [[OUT1_ADDR]], align 4 +// CHECK-NEXT: store i32 [[TMP2]], ptr [[TMP3]], align 4 +// CHECK-NEXT: [[ARGP_CUR3:%.*]] = load ptr, ptr [[LIST_ADDR]], align 4 +// CHECK-NEXT: [[ARGP_NEXT4:%.*]] = getelementptr inbounds i8, ptr [[ARGP_CUR3]], i32 4 +// CHECK-NEXT: store ptr [[ARGP_NEXT4]], ptr [[LIST_ADDR]], align 4 +// CHECK-NEXT: [[TMP4:%.*]] = load i32, ptr [[ARGP_CUR3]], align 4 +// CHECK-NEXT: [[TMP5:%.*]] = load ptr, ptr [[OUT2_ADDR]], align 4 +// CHECK-NEXT: store i32 [[TMP4]], ptr [[TMP5]], align 4 +// CHECK-NEXT: ret void +// +void multiple_int(__builtin_va_list list, int *out0, int *out1, int *out2) { + *out0 = __builtin_va_arg(list, int); + *out1 = __builtin_va_arg(list, 
int); + *out2 = __builtin_va_arg(list, int); +} + +// Scalars in structs are an easy way of specifying alignment from C +// CHECK-LABEL: @increasing_alignment( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[LIST_ADDR:%.*]] = alloca ptr, align 4 +// CHECK-NEXT: [[OUT0_ADDR:%.*]] = alloca ptr, align 4 +// CHECK-NEXT: [[OUT1_ADDR:%.*]] = alloca ptr, align 4 +// CHECK-NEXT: [[OUT2_ADDR:%.*]] = alloca ptr, align 4 +// CHECK-NEXT: [[OUT3_ADDR:%.*]] = alloca ptr, align 4 +// CHECK-NEXT: store ptr [[LIST:%.*]], ptr [[LIST_ADDR]], align 4 +// CHECK-NEXT: store ptr [[OUT0:%.*]], ptr [[OUT0_ADDR]], align 4 +// CHECK-NEXT: store ptr [[OUT1:%.*]], ptr [[OUT1_ADDR]], align 4 +// CHECK-NEXT: store ptr [[OUT2:%.*]], ptr [[OUT2_ADDR]], align 4 +// CHECK-NEXT: store ptr [[OUT3:%.*]], ptr [[OUT3_ADDR]], align 4 +// CHECK-NEXT: [[TMP0:%.*]] = load ptr, ptr [[OUT0_ADDR]], align 4 +// CHECK-NEXT: [[ARGP_CUR:%.*]] = load ptr, ptr [[LIST_ADDR]], align 4 +// CHECK-NEXT: [[ARGP_NEXT:%.*]] = getelementptr inbounds i8, ptr [[ARGP_CUR]], i32 4 +// CHECK-NEXT: store ptr [[ARGP_NEXT]], ptr [[LIST_ADDR]], align 4 +// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 1 [[TMP0]], ptr align 4 [[ARGP_CUR]], i32 1, i1 false) +// CHECK-NEXT: [[TMP1:%.*]] = load ptr, ptr [[OUT1_ADDR]], align 4 +// CHECK-NEXT: [[ARGP_CUR1:%.*]] = load ptr, ptr [[LIST_ADDR]], align 4 +// CHECK-NEXT: [[ARGP_NEXT2:%.*]] = getelementptr inbounds i8, ptr [[ARGP_CUR1]], i32 4 +// CHECK-NEXT: store ptr [[ARGP_NEXT2]], ptr [[LIST_ADDR]], align 4 +// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 2 [[TMP1]], ptr align 4 [[ARGP_CUR1]], i32 2, i1 false) +// CHECK-NEXT: [[ARGP_CUR3:%.*]] = load ptr, ptr [[LIST_ADDR]], align 4 +// CHECK-NEXT: [[ARGP_NEXT4:%.*]] = getelementptr inbounds i8, ptr [[ARGP_CUR3]], i32 4 +// CHECK-NEXT: store ptr [[ARGP_NEXT4]], ptr [[LIST_ADDR]], align 4 +// CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr [[ARGP_CUR3]], align 4 +// CHECK-NEXT: [[TMP3:%.*]] = load ptr, ptr [[OUT2_ADDR]], align 4 +// 
CHECK-NEXT: store i32 [[TMP2]], ptr [[TMP3]], align 4 +// CHECK-NEXT: [[ARGP_CUR5:%.*]] = load ptr, ptr [[LIST_ADDR]], align 4 +// CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, ptr [[ARGP_CUR5]], i32 7 +// CHECK-NEXT: [[ARGP_CUR5_ALIGNED:%.*]] = call ptr @llvm.ptrmask.p0.i32(ptr [[TMP4]], i32 -8) +// CHECK-NEXT: [[ARGP_NEXT6:%.*]] = getelementptr inbounds i8, ptr [[ARGP_CUR5_ALIGNED]], i32 8 +// CHECK-NEXT: store ptr [[ARGP_NEXT6]], ptr [[LIST_ADDR]], align 4 +// CHECK-NEXT: [[TMP5:%.*]] = load double, ptr [[ARGP_CUR5_ALIGNED]], align 8 +// CHECK-NEXT: [[TMP6:%.*]] = load ptr, ptr [[OUT3_ADDR]], align 4 +// CHECK-NEXT: store double [[TMP5]], ptr [[TMP6]], align 8 +// CHECK-NEXT: ret void +// +void increasing_alignment(__builtin_va_list list, one_char_t *out0, + one_short_t *out1, int *out2, double *out3) { + *out0 = __builtin_va_arg(list, one_char_t); + *out1 = __builtin_va_arg(list, one_short_t); + *out2 = __builtin_va_arg(list, int); + *out3 = __builtin_va_arg(list, double); +} + +// CHECK-LABEL: @decreasing_alignment( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[LIST_ADDR:%.*]] = alloca ptr, align 4 +// CHECK-NEXT: [[OUT0_ADDR:%.*]] = alloca ptr, align 4 +// CHECK-NEXT: [[OUT1_ADDR:%.*]] = alloca ptr, align 4 +// CHECK-NEXT: [[OUT2_ADDR:%.*]] = alloca ptr, align 4 +// CHECK-NEXT: [[OUT3_ADDR:%.*]] = alloca ptr, align 4 +// CHECK-NEXT: store ptr [[LIST:%.*]], ptr [[LIST_ADDR]], align 4 +// CHECK-NEXT: store ptr [[OUT0:%.*]], ptr [[OUT0_ADDR]], align 4 +// CHECK-NEXT: store ptr [[OUT1:%.*]], ptr [[OUT1_ADDR]], align 4 +// CHECK-NEXT: store ptr [[OUT2:%.*]], ptr [[OUT2_ADDR]], align 4 +// CHECK-NEXT: store ptr [[OUT3:%.*]], ptr [[OUT3_ADDR]], align 4 +// CHECK-NEXT: [[ARGP_CUR:%.*]] = load ptr, ptr [[LIST_ADDR]], align 4 +// CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i8, ptr [[ARGP_CUR]], i32 7 +// CHECK-NEXT: [[ARGP_CUR_ALIGNED:%.*]] = call ptr @llvm.ptrmask.p0.i32(ptr [[TMP0]], i32 -8) +// CHECK-NEXT: [[ARGP_NEXT:%.*]] = getelementptr inbounds i8, ptr 
[[ARGP_CUR_ALIGNED]], i32 8 +// CHECK-NEXT: store ptr [[ARGP_NEXT]], ptr [[LIST_ADDR]], align 4 +// CHECK-NEXT: [[TMP1:%.*]] = load double, ptr [[ARGP_CUR_ALIGNED]], align 8 +// CHECK-NEXT: [[TMP2:%.*]] = load ptr, ptr [[OUT0_ADDR]], align 4 +// CHECK-NEXT: store double [[TMP1]], ptr [[TMP2]], align 8 +// CHECK-NEXT: [[ARGP_CUR1:%.*]] = load ptr, ptr [[LIST_ADDR]], align 4 +// CHECK-NEXT: [[ARGP_NEXT2:%.*]] = getelementptr inbounds i8, ptr [[ARGP_CUR1]], i32 4 +// CHECK-NEXT: store ptr [[ARGP_NEXT2]], ptr [[LIST_ADDR]], align 4 +// CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr [[ARGP_CUR1]], align 4 +// CHECK-NEXT: [[TMP4:%.*]] = load ptr, ptr [[OUT1_ADDR]], align 4 +// CHECK-NEXT: store i32 [[TMP3]], ptr [[TMP4]], align 4 +// CHECK-NEXT: [[TMP5:%.*]] = load ptr, ptr [[OUT2_ADDR]], align 4 +// CHECK-NEXT: [[ARGP_CUR3:%.*]] = load ptr, ptr [[LIST_ADDR]], align 4 +// CHECK-NEXT: [[ARGP_NEXT4:%.*]] = getelementptr inbounds i8, ptr [[ARGP_CUR3]], i32 4 +// CHECK-NEXT: store ptr [[ARGP_NEXT4]], ptr [[LIST_ADDR]], align 4 +// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 2 [[TMP5]], ptr align 4 [[ARGP_CUR3]], i32 2, i1 false) +// CHECK-NEXT: [[TMP6:%.*]] = load ptr, ptr [[OUT3_ADDR]], align 4 +// CHECK-NEXT: [[ARGP_CUR5:%.*]] = load ptr, ptr [[LIST_ADDR]], align 4 +// CHECK-NEXT: [[ARGP_NEXT6:%.*]] = getelementptr inbounds i8, ptr [[ARGP_CUR5]], i32 4 +// CHECK-NEXT: store ptr [[ARGP_NEXT6]], ptr [[LIST_ADDR]], align 4 +// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 1 [[TMP6]], ptr align 4 [[ARGP_CUR5]], i32 1, i1 false) +// CHECK-NEXT: ret void +// +void decreasing_alignment(__builtin_va_list list, double *out0, int *out1, + one_short_t *out2, one_char_t *out3) { + *out0 = __builtin_va_arg(list, double); + *out1 = __builtin_va_arg(list, int); + *out2 = __builtin_va_arg(list, one_short_t); + *out3 = __builtin_va_arg(list, one_char_t); +} + +// Typical edge cases, none hit special handling in VAArg lowering. 
+typedef struct { + int x[16]; + double y[8]; +} large_value_t; + +// CHECK-LABEL: @large_value( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[LIST_ADDR:%.*]] = alloca ptr, align 4 +// CHECK-NEXT: [[OUT_ADDR:%.*]] = alloca ptr, align 4 +// CHECK-NEXT: store ptr [[LIST:%.*]], ptr [[LIST_ADDR]], align 4 +// CHECK-NEXT: store ptr [[OUT:%.*]], ptr [[OUT_ADDR]], align 4 +// CHECK-NEXT: [[TMP0:%.*]] = load ptr, ptr [[OUT_ADDR]], align 4 +// CHECK-NEXT: [[ARGP_CUR:%.*]] = load ptr, ptr [[LIST_ADDR]], align 4 +// CHECK-NEXT: [[ARGP_NEXT:%.*]] = getelementptr inbounds i8, ptr [[ARGP_CUR]], i32 4 +// CHECK-NEXT: store ptr [[ARGP_NEXT]], ptr [[LIST_ADDR]], align 4 +// CHECK-NEXT: [[TMP1:%.*]] = load ptr, ptr [[ARGP_CUR]], align 4 +// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[TMP0]], ptr align 8 [[TMP1]], i32 128, i1 false) +// CHECK-NEXT: ret void +// +void large_value(__builtin_va_list list, large_value_t *out) { + *out = __builtin_va_arg(list, large_value_t); +} + +typedef int v128_t __attribute__((__vector_size__(16), __aligned__(16))); +// CHECK-LABEL: @vector( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[LIST_ADDR:%.*]] = alloca ptr, align 4 +// CHECK-NEXT: [[OUT_ADDR:%.*]] = alloca ptr, align 4 +// CHECK-NEXT: store ptr [[LIST:%.*]], ptr [[LIST_ADDR]], align 4 +// CHECK-NEXT: store ptr [[OUT:%.*]], ptr [[OUT_ADDR]], align 4 +// CHECK-NEXT: [[ARGP_CUR:%.*]] = load ptr, ptr [[LIST_ADDR]], align 4 +// CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i8, ptr [[ARGP_CUR]], i32 15 +// CHECK-NEXT: [[ARGP_CUR_ALIGNED:%.*]] = call ptr @llvm.ptrmask.p0.i32(ptr [[TMP0]], i32 -16) +// CHECK-NEXT: [[ARGP_NEXT:%.*]] = getelementptr inbounds i8, ptr [[ARGP_CUR_ALIGNED]], i32 16 +// CHECK-NEXT: store ptr [[ARGP_NEXT]], ptr [[LIST_ADDR]], align 4 +// CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr [[ARGP_CUR_ALIGNED]], align 16 +// CHECK-NEXT: [[TMP2:%.*]] = load ptr, ptr [[OUT_ADDR]], align 4 +// CHECK-NEXT: store <4 x i32> [[TMP1]], ptr [[TMP2]], align 16 +// CHECK-NEXT: ret 
void +// +void vector(__builtin_va_list list, v128_t *out) { + *out = __builtin_va_arg(list, v128_t); +} + +typedef struct BF { + float not_an_i32[2]; + int A : 1; + char B; + int C : 13; +} BF; + +// CHECK-LABEL: @bitfield( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[LIST_ADDR:%.*]] = alloca ptr, align 4 +// CHECK-NEXT: [[OUT_ADDR:%.*]] = alloca ptr, align 4 +// CHECK-NEXT: store ptr [[LIST:%.*]], ptr [[LIST_ADDR]], align 4 +// CHECK-NEXT: store ptr [[OUT:%.*]], ptr [[OUT_ADDR]], align 4 +// CHECK-NEXT: [[TMP0:%.*]] = load ptr, ptr [[OUT_ADDR]], align 4 +// CHECK-NEXT: [[ARGP_CUR:%.*]] = load ptr, ptr [[LIST_ADDR]], align 4 +// CHECK-NEXT: [[ARGP_NEXT:%.*]] = getelementptr inbounds i8, ptr [[ARGP_CUR]], i32 4 +// CHECK-NEXT: store ptr [[ARGP_NEXT]], ptr [[LIST_ADDR]], align 4 +// CHECK-NEXT: [[TMP1:%.*]] = load ptr, ptr [[ARGP_CUR]], align 4 +// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 4 [[TMP0]], ptr align 4 [[TMP1]], i32 12, i1 false) +// CHECK-NEXT: ret void +// +void bitfield(__builtin_va_list list, BF *out) { + *out = __builtin_va_arg(list, BF); +} diff --git a/clang/test/CodeGenCUDA/cuda-builtin-vars.cu b/clang/test/CodeGenCUDA/cuda-builtin-vars.cu index ba5e5f1..7880a80 100644 --- a/clang/test/CodeGenCUDA/cuda-builtin-vars.cu +++ b/clang/test/CodeGenCUDA/cuda-builtin-vars.cu @@ -6,21 +6,21 @@ __attribute__((global)) void kernel(int *out) { int i = 0; - out[i++] = threadIdx.x; // CHECK: call noundef i32 @llvm.nvvm.read.ptx.sreg.tid.x() - out[i++] = threadIdx.y; // CHECK: call noundef i32 @llvm.nvvm.read.ptx.sreg.tid.y() - out[i++] = threadIdx.z; // CHECK: call noundef i32 @llvm.nvvm.read.ptx.sreg.tid.z() + out[i++] = threadIdx.x; // CHECK: call noundef{{.*}} i32 @llvm.nvvm.read.ptx.sreg.tid.x() + out[i++] = threadIdx.y; // CHECK: call noundef{{.*}} i32 @llvm.nvvm.read.ptx.sreg.tid.y() + out[i++] = threadIdx.z; // CHECK: call noundef{{.*}} i32 @llvm.nvvm.read.ptx.sreg.tid.z() - out[i++] = blockIdx.x; // CHECK: call noundef i32 
@llvm.nvvm.read.ptx.sreg.ctaid.x() - out[i++] = blockIdx.y; // CHECK: call noundef i32 @llvm.nvvm.read.ptx.sreg.ctaid.y() - out[i++] = blockIdx.z; // CHECK: call noundef i32 @llvm.nvvm.read.ptx.sreg.ctaid.z() + out[i++] = blockIdx.x; // CHECK: call noundef{{.*}} i32 @llvm.nvvm.read.ptx.sreg.ctaid.x() + out[i++] = blockIdx.y; // CHECK: call noundef{{.*}} i32 @llvm.nvvm.read.ptx.sreg.ctaid.y() + out[i++] = blockIdx.z; // CHECK: call noundef{{.*}} i32 @llvm.nvvm.read.ptx.sreg.ctaid.z() - out[i++] = blockDim.x; // CHECK: call noundef i32 @llvm.nvvm.read.ptx.sreg.ntid.x() - out[i++] = blockDim.y; // CHECK: call noundef i32 @llvm.nvvm.read.ptx.sreg.ntid.y() - out[i++] = blockDim.z; // CHECK: call noundef i32 @llvm.nvvm.read.ptx.sreg.ntid.z() + out[i++] = blockDim.x; // CHECK: call noundef{{.*}} i32 @llvm.nvvm.read.ptx.sreg.ntid.x() + out[i++] = blockDim.y; // CHECK: call noundef{{.*}} i32 @llvm.nvvm.read.ptx.sreg.ntid.y() + out[i++] = blockDim.z; // CHECK: call noundef{{.*}} i32 @llvm.nvvm.read.ptx.sreg.ntid.z() - out[i++] = gridDim.x; // CHECK: call noundef i32 @llvm.nvvm.read.ptx.sreg.nctaid.x() - out[i++] = gridDim.y; // CHECK: call noundef i32 @llvm.nvvm.read.ptx.sreg.nctaid.y() - out[i++] = gridDim.z; // CHECK: call noundef i32 @llvm.nvvm.read.ptx.sreg.nctaid.z() + out[i++] = gridDim.x; // CHECK: call noundef{{.*}} i32 @llvm.nvvm.read.ptx.sreg.nctaid.x() + out[i++] = gridDim.y; // CHECK: call noundef{{.*}} i32 @llvm.nvvm.read.ptx.sreg.nctaid.y() + out[i++] = gridDim.z; // CHECK: call noundef{{.*}} i32 @llvm.nvvm.read.ptx.sreg.nctaid.z() out[i++] = warpSize; // CHECK: store i32 32, diff --git a/clang/test/CodeGenCXX/inline-then-fold-variadics.cpp b/clang/test/CodeGenCXX/inline-then-fold-variadics.cpp new file mode 100644 index 0000000..a0673b9 --- /dev/null +++ b/clang/test/CodeGenCXX/inline-then-fold-variadics.cpp @@ -0,0 +1,181 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --function-signature +// REQUIRES: 
webassembly-registered-target + +// Simple calls to known variadic functions that are completely elided when +// optimisations are on This is a functional check that the expand-variadic pass +// is consistent with clang's va_arg handling + +// When expand-variadics is added to the default pipeline, clang -O1 will +// suffice here -Wno-varargs avoids warning second argument to 'va_start' is not +// the last named parameter + +// RUN: %clang_cc1 %s -triple wasm32-unknown-unknown -Wno-varargs -O1 -emit-llvm -o - | opt - -S --passes='module(expand-variadics,default<O1>)' --expand-variadics-override=optimize -o - | FileCheck %s + +#include <stdarg.h> +#include <stdint.h> + +template <typename X, typename Y> static X first(...) { + va_list va; + __builtin_va_start(va, 0); + X r = va_arg(va, X); + va_end(va); + return r; +} + +template <typename X, typename Y> static Y second(...) { + va_list va; + __builtin_va_start(va, 0); + va_arg(va, X); + Y r = va_arg(va, Y); + va_end(va); + return r; +} + +extern "C" { + +// CHECK-LABEL: define {{[^@]+}}@first_pair_i32 +// CHECK-SAME: (i32 noundef returned [[X:%.*]], i32 noundef [[Y:%.*]]) +// CHECK-NEXT: entry: +// CHECK-NEXT: ret i32 [[X]] +// +int first_pair_i32(int x, int y) { return first<int, int>(x, y); } + +// CHECK-LABEL: define {{[^@]+}}@second_pair_i32 +// CHECK-SAME: (i32 noundef [[X:%.*]], i32 noundef returned [[Y:%.*]]) +// CHECK-NEXT: entry: +// CHECK-NEXT: ret i32 [[Y]] +// +int second_pair_i32(int x, int y) { return second<int, int>(x, y); } + +// CHECK-LABEL: define {{[^@]+}}@first_pair_f64 +// CHECK-SAME: (double noundef returned [[X:%.*]], double noundef [[Y:%.*]]) +// CHECK-NEXT: entry: +// CHECK-NEXT: ret double [[X]] +// +double first_pair_f64(double x, double y) { + return first<double, double>(x, y); +} + +// CHECK-LABEL: define {{[^@]+}}@second_pair_f64 +// CHECK-SAME: (double noundef [[X:%.*]], double noundef returned [[Y:%.*]]) +// CHECK-NEXT: entry: +// CHECK-NEXT: ret double [[Y]] +// +double 
second_pair_f64(double x, double y) { + return second<double, double>(x, y); +} +} + +extern "C" { + +// CHECK-LABEL: define {{[^@]+}}@first_i32_f64 +// CHECK-SAME: (i32 noundef returned [[X:%.*]], double noundef [[Y:%.*]]) +// CHECK-NEXT: entry: +// CHECK-NEXT: ret i32 [[X]] +// +int first_i32_f64(int x, double y) { return first<int, double>(x, y); } + +// CHECK-LABEL: define {{[^@]+}}@second_i32_f64 +// CHECK-SAME: (i32 noundef [[X:%.*]], double noundef returned [[Y:%.*]]) +// CHECK-NEXT: entry: +// CHECK-NEXT: ret double [[Y]] +// +double second_i32_f64(int x, double y) { return second<int, double>(x, y); } + +// CHECK-LABEL: define {{[^@]+}}@first_f64_i32 +// CHECK-SAME: (double noundef returned [[X:%.*]], i32 noundef [[Y:%.*]]) +// CHECK-NEXT: entry: +// CHECK-NEXT: ret double [[X]] +// +double first_f64_i32(double x, int y) { return first<double, int>(x, y); } + +// CHECK-LABEL: define {{[^@]+}}@second_f64_i32 +// CHECK-SAME: (double noundef [[X:%.*]], i32 noundef returned [[Y:%.*]]) +// CHECK-NEXT: entry: +// CHECK-NEXT: ret i32 [[Y]] +// +int second_f64_i32(double x, int y) { return second<double, int>(x, y); } +} + +extern "C" { +typedef uint64_t ulong2 __attribute__((__vector_size__(16), __aligned__(16))); + +// CHECK-LABEL: define {{[^@]+}}@first_i32_ulong2 +// CHECK-SAME: (i32 noundef returned [[X:%.*]], ptr nocapture noundef readonly [[Y:%.*]]) +// CHECK-NEXT: entry: +// CHECK-NEXT: ret i32 [[X]] +// +int first_i32_ulong2(int x, ulong2 *y) { return first<int, ulong2>(x, *y); } + +// CHECK-LABEL: define {{[^@]+}}@second_i32_ulong2 +// CHECK-SAME: (i32 noundef [[X:%.*]], ptr nocapture noundef readonly [[Y:%.*]], ptr nocapture noundef writeonly [[R:%.*]]) local_unnamed_addr #[[ATTR1:[0-9]+]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = load <2 x i64>, ptr [[Y]], align 16, !tbaa [[TBAA2:![0-9]+]] +// CHECK-NEXT: store <2 x i64> [[TMP0]], ptr [[R]], align 16, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void +// +void second_i32_ulong2(int x, ulong2 *y, 
ulong2 *r) { + *r = second<int, ulong2>(x, *y); +} + +// CHECK-LABEL: define {{[^@]+}}@first_ulong2_i32 +// CHECK-SAME: (ptr nocapture noundef readonly [[X:%.*]], i32 noundef [[Y:%.*]], ptr nocapture noundef writeonly [[R:%.*]]) local_unnamed_addr #[[ATTR1]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = load <2 x i64>, ptr [[X]], align 16, !tbaa [[TBAA2]] +// CHECK-NEXT: store <2 x i64> [[TMP0]], ptr [[R]], align 16, !tbaa [[TBAA2]] +// CHECK-NEXT: ret void +// +void first_ulong2_i32(ulong2 *x, int y, ulong2 *r) { + *r = first<ulong2, int>(*x, y); +} + +// CHECK-LABEL: define {{[^@]+}}@second_ulong2_i32 +// CHECK-SAME: (ptr nocapture noundef readonly [[X:%.*]], i32 noundef returned [[Y:%.*]]) +// CHECK-NEXT: entry: +// CHECK-NEXT: ret i32 [[Y]] +// +int second_ulong2_i32(ulong2 *x, int y) { return second<ulong2, int>(*x, y); } +} + +// ascending alignment +typedef struct { + char c; + short s; + int i; + long l; + float f; + double d; +} asc; + +extern "C" { + +// CHECK-LABEL: define {{[^@]+}}@first_i32_asc +// CHECK-SAME: (i32 noundef returned [[X:%.*]], ptr nocapture noundef readonly [[Y:%.*]]) +// CHECK-NEXT: entry: +// CHECK-NEXT: ret i32 [[X]] +// +int first_i32_asc(int x, asc *y) { return first<int, asc>(x, *y); } + +// CHECK-LABEL: define {{[^@]+}}@second_i32_asc +// CHECK-SAME: (i32 noundef [[X:%.*]], ptr nocapture noundef readonly [[Y:%.*]], ptr nocapture noundef writeonly [[R:%.*]]) local_unnamed_addr #[[ATTR1]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: tail call void @llvm.memmove.p0.p0.i32(ptr noundef nonnull align 8 dereferenceable(24) [[R]], ptr noundef nonnull align 1 dereferenceable(24) [[Y]], i32 24, i1 false) +// CHECK-NEXT: ret void +// +void second_i32_asc(int x, asc *y, asc *r) { *r = second<int, asc>(x, *y); } + +// CHECK-LABEL: define {{[^@]+}}@first_asc_i32 +// CHECK-SAME: (ptr nocapture noundef readonly [[X:%.*]], i32 noundef [[Y:%.*]], ptr nocapture noundef writeonly [[R:%.*]]) local_unnamed_addr #[[ATTR1]] { +// CHECK-NEXT: 
entry: +// CHECK-NEXT: tail call void @llvm.memmove.p0.p0.i32(ptr noundef nonnull align 8 dereferenceable(24) [[R]], ptr noundef nonnull align 1 dereferenceable(24) [[X]], i32 24, i1 false) +// CHECK-NEXT: ret void +// +void first_asc_i32(asc *x, int y, asc *r) { *r = first<asc, int>(*x, y); } + +// CHECK-LABEL: define {{[^@]+}}@second_asc_i32 +// CHECK-SAME: (ptr nocapture noundef readonly [[X:%.*]], i32 noundef returned [[Y:%.*]]) +// CHECK-NEXT: entry: +// CHECK-NEXT: ret i32 [[Y]] +// +int second_asc_i32(asc *x, int y) { return second<asc, int>(*x, y); } +} diff --git a/clang/test/CodeGenCXX/pointers-to-data-members.cpp b/clang/test/CodeGenCXX/pointers-to-data-members.cpp index 29f1c3f..cf1d6c0 100644 --- a/clang/test/CodeGenCXX/pointers-to-data-members.cpp +++ b/clang/test/CodeGenCXX/pointers-to-data-members.cpp @@ -1,4 +1,5 @@ // RUN: %clang_cc1 %s -emit-llvm -o %t.ll -triple=x86_64-apple-darwin10 +// RUN: %clang_cc1 %s -emit-llvm -o %t.ll -triple=x86_64-apple-darwin10 -fexperimental-new-constant-interpreter // RUN: FileCheck %s < %t.ll // RUN: FileCheck -check-prefix=CHECK-GLOBAL %s < %t.ll diff --git a/clang/test/CodeGenCXX/template-param-objects-linkage.cpp b/clang/test/CodeGenCXX/template-param-objects-linkage.cpp index 63e7d8c..9c148ed 100644 --- a/clang/test/CodeGenCXX/template-param-objects-linkage.cpp +++ b/clang/test/CodeGenCXX/template-param-objects-linkage.cpp @@ -1,4 +1,5 @@ // RUN: %clang_cc1 -triple x86_64-linux-gnu -std=c++20 %s -emit-llvm -o - | FileCheck %s +// RUN: %clang_cc1 -triple x86_64-linux-gnu -std=c++20 %s -emit-llvm -o - -fexperimental-new-constant-interpreter | FileCheck %s struct S { char buf[32]; }; template<S s> constexpr const char* f() { return s.buf; } diff --git a/clang/test/CodeGenOpenCL/amdgpu-features.cl b/clang/test/CodeGenOpenCL/amdgpu-features.cl index 2fda52d..854ab39 100644 --- a/clang/test/CodeGenOpenCL/amdgpu-features.cl +++ b/clang/test/CodeGenOpenCL/amdgpu-features.cl @@ -49,6 +49,7 @@ // RUN: %clang_cc1 -triple 
amdgcn -target-cpu gfx1103 -emit-llvm -o - %s | FileCheck --check-prefix=GFX1103 %s // RUN: %clang_cc1 -triple amdgcn -target-cpu gfx1150 -emit-llvm -o - %s | FileCheck --check-prefix=GFX1150 %s // RUN: %clang_cc1 -triple amdgcn -target-cpu gfx1151 -emit-llvm -o - %s | FileCheck --check-prefix=GFX1151 %s +// RUN: %clang_cc1 -triple amdgcn -target-cpu gfx1152 -emit-llvm -o - %s | FileCheck --check-prefix=GFX1152 %s // RUN: %clang_cc1 -triple amdgcn -target-cpu gfx1200 -emit-llvm -o - %s | FileCheck --check-prefix=GFX1200 %s // RUN: %clang_cc1 -triple amdgcn -target-cpu gfx1201 -emit-llvm -o - %s | FileCheck --check-prefix=GFX1201 %s @@ -100,6 +101,7 @@ // GFX1103: "target-features"="+16-bit-insts,+atomic-fadd-rtn-insts,+ci-insts,+dl-insts,+dot10-insts,+dot5-insts,+dot7-insts,+dot8-insts,+dot9-insts,+dpp,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx8-insts,+gfx9-insts,+wavefrontsize32" // GFX1150: "target-features"="+16-bit-insts,+atomic-fadd-rtn-insts,+ci-insts,+dl-insts,+dot10-insts,+dot5-insts,+dot7-insts,+dot8-insts,+dot9-insts,+dpp,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx8-insts,+gfx9-insts,+wavefrontsize32" // GFX1151: "target-features"="+16-bit-insts,+atomic-fadd-rtn-insts,+ci-insts,+dl-insts,+dot10-insts,+dot5-insts,+dot7-insts,+dot8-insts,+dot9-insts,+dpp,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx8-insts,+gfx9-insts,+wavefrontsize32" +// GFX1152: "target-features"="+16-bit-insts,+atomic-fadd-rtn-insts,+ci-insts,+dl-insts,+dot10-insts,+dot5-insts,+dot7-insts,+dot8-insts,+dot9-insts,+dpp,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx8-insts,+gfx9-insts,+wavefrontsize32" // GFX1200: 
"target-features"="+16-bit-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-ds-pk-add-16-insts,+atomic-fadd-rtn-insts,+atomic-flat-pk-add-16-insts,+atomic-global-pk-add-bf16-inst,+ci-insts,+dl-insts,+dot10-insts,+dot11-insts,+dot7-insts,+dot8-insts,+dot9-insts,+dpp,+fp8-conversion-insts,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx12-insts,+gfx8-insts,+gfx9-insts,+wavefrontsize32" // GFX1201: "target-features"="+16-bit-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-ds-pk-add-16-insts,+atomic-fadd-rtn-insts,+atomic-flat-pk-add-16-insts,+atomic-global-pk-add-bf16-inst,+ci-insts,+dl-insts,+dot10-insts,+dot11-insts,+dot7-insts,+dot8-insts,+dot9-insts,+dpp,+fp8-conversion-insts,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx12-insts,+gfx8-insts,+gfx9-insts,+wavefrontsize32" diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx11.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx11.cl index d17ff81..6606178 100644 --- a/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx11.cl +++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx11.cl @@ -5,6 +5,7 @@ // RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx1103 -emit-llvm -o - %s | FileCheck %s // RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx1150 -emit-llvm -o - %s | FileCheck %s // RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx1151 -emit-llvm -o - %s | FileCheck %s +// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx1152 -emit-llvm -o - %s | FileCheck %s typedef unsigned int uint; typedef unsigned long ulong; diff --git a/clang/test/Driver/aarch64-oryon-1.c b/clang/test/Driver/aarch64-oryon-1.c new file mode 100644 index 0000000..952ba5d --- /dev/null +++ b/clang/test/Driver/aarch64-oryon-1.c @@ -0,0 +1,19 @@ +// RUN: %clang -target aarch64 -mcpu=oryon-1 -### -c %s 2>&1 | FileCheck -check-prefix=Phoenix %s +// RUN: %clang -target aarch64 -mlittle-endian -mcpu=oryon-1 -### -c %s 2>&1 | FileCheck -check-prefix=Phoenix %s +// RUN: %clang -target aarch64_be 
-mlittle-endian -mcpu=oryon-1 -### -c %s 2>&1 | FileCheck -check-prefix=Phoenix %s +// RUN: %clang -target aarch64 -mtune=oryon-1 -### -c %s 2>&1 | FileCheck -check-prefix=Phoenix-TUNE %s +// RUN: %clang -target aarch64 -mlittle-endian -mtune=oryon-1 -### -c %s 2>&1 | FileCheck -check-prefix=Phoenix-TUNE %s +// RUN: %clang -target aarch64_be -mlittle-endian -mtune=oryon-1 -### -c %s 2>&1 | FileCheck -check-prefix=Phoenix-TUNE %s +// Phoenix: "-cc1"{{.*}} "-triple" "aarch64{{(--)?}}"{{.*}} "-target-cpu" "oryon-1" "-target-feature" "+v8.6a" +// Phoenix-TUNE: "-cc1"{{.*}} "-triple" "aarch64{{(--)?}}"{{.*}} "-target-cpu" "generic" + +// RUN: %clang -target arm64 -mcpu=oryon-1 -### -c %s 2>&1 | FileCheck -check-prefix=ARM64-Phoenix %s +// RUN: %clang -target arm64 -mlittle-endian -mcpu=oryon-1 -### -c %s 2>&1 | FileCheck -check-prefix=ARM64-Phoenix %s +// RUN: %clang -target arm64 -mtune=oryon-1 -### -c %s 2>&1 | FileCheck -check-prefix=ARM64-Phoenix-TUNE %s +// RUN: %clang -target arm64 -mlittle-endian -mtune=oryon-1 -### -c %s 2>&1 | FileCheck -check-prefix=ARM64-Phoenix-TUNE %s +// ARM64-Phoenix: "-cc1"{{.*}} "-triple" "arm64{{.*}}" "-target-cpu" "oryon-1" "-target-feature" "+v8.6a" +// ARM64-Phoenix-TUNE: "-cc1"{{.*}} "-triple" "arm64{{.*}}" "-target-cpu" "generic" + +// RUN: %clang -target aarch64 -mcpu=oryon-1 -mtune=cortex-a53 -### -c %s 2>&1 | FileCheck -check-prefix=MCPU-MTUNE-Phoenix %s +// RUN: %clang -target aarch64 -mtune=cortex-a53 -mcpu=oryon-1 -### -c %s 2>&1 | FileCheck -check-prefix=MCPU-MTUNE-Phoenix %s +// MCPU-MTUNE-Phoenix: "-cc1"{{.*}} "-triple" "aarch64{{.*}}" "-target-cpu" "oryon-1" diff --git a/clang/test/Driver/amdgpu-macros.cl b/clang/test/Driver/amdgpu-macros.cl index a878a7d..3e4a570 100644 --- a/clang/test/Driver/amdgpu-macros.cl +++ b/clang/test/Driver/amdgpu-macros.cl @@ -127,6 +127,7 @@ // RUN: %clang -E -dM -target amdgcn -mcpu=gfx1103 %s 2>&1 | FileCheck --check-prefixes=ARCH-GCN,FAST_FMAF %s -DWAVEFRONT_SIZE=32 -DCPU=gfx1103 
-DFAMILY=GFX11 // RUN: %clang -E -dM -target amdgcn -mcpu=gfx1150 %s 2>&1 | FileCheck --check-prefixes=ARCH-GCN,FAST_FMAF %s -DWAVEFRONT_SIZE=32 -DCPU=gfx1150 -DFAMILY=GFX11 // RUN: %clang -E -dM -target amdgcn -mcpu=gfx1151 %s 2>&1 | FileCheck --check-prefixes=ARCH-GCN,FAST_FMAF %s -DWAVEFRONT_SIZE=32 -DCPU=gfx1151 -DFAMILY=GFX11 +// RUN: %clang -E -dM -target amdgcn -mcpu=gfx1152 %s 2>&1 | FileCheck --check-prefixes=ARCH-GCN,FAST_FMAF %s -DWAVEFRONT_SIZE=32 -DCPU=gfx1152 -DFAMILY=GFX11 // RUN: %clang -E -dM -target amdgcn -mcpu=gfx1200 %s 2>&1 | FileCheck --check-prefixes=ARCH-GCN,FAST_FMAF %s -DWAVEFRONT_SIZE=32 -DCPU=gfx1200 -DFAMILY=GFX12 // RUN: %clang -E -dM -target amdgcn -mcpu=gfx1201 %s 2>&1 | FileCheck --check-prefixes=ARCH-GCN,FAST_FMAF %s -DWAVEFRONT_SIZE=32 -DCPU=gfx1201 -DFAMILY=GFX12 diff --git a/clang/test/Driver/amdgpu-mcpu.cl b/clang/test/Driver/amdgpu-mcpu.cl index 5b6a220..4b0ef92 100644 --- a/clang/test/Driver/amdgpu-mcpu.cl +++ b/clang/test/Driver/amdgpu-mcpu.cl @@ -112,6 +112,7 @@ // RUN: %clang -### -target amdgcn -mcpu=gfx1103 %s 2>&1 | FileCheck --check-prefix=GFX1103 %s // RUN: %clang -### -target amdgcn -mcpu=gfx1150 %s 2>&1 | FileCheck --check-prefix=GFX1150 %s // RUN: %clang -### -target amdgcn -mcpu=gfx1151 %s 2>&1 | FileCheck --check-prefix=GFX1151 %s +// RUN: %clang -### -target amdgcn -mcpu=gfx1152 %s 2>&1 | FileCheck --check-prefix=GFX1152 %s // RUN: %clang -### -target amdgcn -mcpu=gfx1200 %s 2>&1 | FileCheck --check-prefix=GFX1200 %s // RUN: %clang -### -target amdgcn -mcpu=gfx1201 %s 2>&1 | FileCheck --check-prefix=GFX1201 %s @@ -164,6 +165,7 @@ // GFX1103: "-target-cpu" "gfx1103" // GFX1150: "-target-cpu" "gfx1150" // GFX1151: "-target-cpu" "gfx1151" +// GFX1152: "-target-cpu" "gfx1152" // GFX1200: "-target-cpu" "gfx1200" // GFX1201: "-target-cpu" "gfx1201" diff --git a/clang/test/Interpreter/pretty-print.c b/clang/test/Interpreter/pretty-print.c new file mode 100644 index 0000000..f6158ad --- /dev/null +++ 
b/clang/test/Interpreter/pretty-print.c @@ -0,0 +1,8 @@ +// REQUIRES: host-supports-jit +// UNSUPPORTED: system-aix +// RUN: cat %s | clang-repl -Xcc -xc | FileCheck %s +// RUN: cat %s | clang-repl -Xcc -std=c++11 | FileCheck %s + +const char* c_str = "Hello, world!"; c_str + +// CHECK: Not implement yet. diff --git a/clang/test/Misc/target-invalid-cpu-note.c b/clang/test/Misc/target-invalid-cpu-note.c index bad1374..cb5b675 100644 --- a/clang/test/Misc/target-invalid-cpu-note.c +++ b/clang/test/Misc/target-invalid-cpu-note.c @@ -5,11 +5,11 @@ // RUN: not %clang_cc1 -triple arm64--- -target-cpu not-a-cpu -fsyntax-only %s 2>&1 | FileCheck %s --check-prefix AARCH64 // AARCH64: error: unknown target CPU 'not-a-cpu' -// AARCH64-NEXT: note: valid target CPU values are: cortex-a34, cortex-a35, cortex-a53, cortex-a55, cortex-a510, cortex-a520, cortex-a520ae, cortex-a57, cortex-a65, cortex-a65ae, cortex-a72, cortex-a73, cortex-a75, cortex-a76, cortex-a76ae, cortex-a77, cortex-a78, cortex-a78ae, cortex-a78c, cortex-a710, cortex-a715, cortex-a720, cortex-a720ae, cortex-r82, cortex-r82ae, cortex-x1, cortex-x1c, cortex-x2, cortex-x3, cortex-x4, neoverse-e1, neoverse-n1, neoverse-n2, neoverse-n3, neoverse-512tvb, neoverse-v1, neoverse-v2, neoverse-v3, neoverse-v3ae, cyclone, apple-a7, apple-a8, apple-a9, apple-a10, apple-a11, apple-a12, apple-a13, apple-a14, apple-a15, apple-a16, apple-a17, apple-m1, apple-m2, apple-m3, apple-s4, apple-s5, exynos-m3, exynos-m4, exynos-m5, falkor, saphira, kryo, thunderx2t99, thunderx3t110, thunderx, thunderxt88, thunderxt81, thunderxt83, tsv110, a64fx, carmel, ampere1, ampere1a, ampere1b, cobalt-100, grace{{$}} +// AARCH64-NEXT: note: valid target CPU values are: cortex-a34, cortex-a35, cortex-a53, cortex-a55, cortex-a510, cortex-a520, cortex-a520ae, cortex-a57, cortex-a65, cortex-a65ae, cortex-a72, cortex-a73, cortex-a75, cortex-a76, cortex-a76ae, cortex-a77, cortex-a78, cortex-a78ae, cortex-a78c, cortex-a710, cortex-a715, cortex-a720, 
cortex-a720ae, cortex-r82, cortex-r82ae, cortex-x1, cortex-x1c, cortex-x2, cortex-x3, cortex-x4, neoverse-e1, neoverse-n1, neoverse-n2, neoverse-n3, neoverse-512tvb, neoverse-v1, neoverse-v2, neoverse-v3, neoverse-v3ae, cyclone, apple-a7, apple-a8, apple-a9, apple-a10, apple-a11, apple-a12, apple-a13, apple-a14, apple-a15, apple-a16, apple-a17, apple-m1, apple-m2, apple-m3, apple-s4, apple-s5, exynos-m3, exynos-m4, exynos-m5, falkor, saphira, kryo, thunderx2t99, thunderx3t110, thunderx, thunderxt88, thunderxt81, thunderxt83, tsv110, a64fx, carmel, ampere1, ampere1a, ampere1b, oryon-1, cobalt-100, grace{{$}} // RUN: not %clang_cc1 -triple arm64--- -tune-cpu not-a-cpu -fsyntax-only %s 2>&1 | FileCheck %s --check-prefix TUNE_AARCH64 // TUNE_AARCH64: error: unknown target CPU 'not-a-cpu' -// TUNE_AARCH64-NEXT: note: valid target CPU values are: cortex-a34, cortex-a35, cortex-a53, cortex-a55, cortex-a510, cortex-a520, cortex-a520ae, cortex-a57, cortex-a65, cortex-a65ae, cortex-a72, cortex-a73, cortex-a75, cortex-a76, cortex-a76ae, cortex-a77, cortex-a78, cortex-a78ae, cortex-a78c, cortex-a710, cortex-a715, cortex-a720, cortex-a720ae, cortex-r82, cortex-r82ae, cortex-x1, cortex-x1c, cortex-x2, cortex-x3, cortex-x4, neoverse-e1, neoverse-n1, neoverse-n2, neoverse-n3, neoverse-512tvb, neoverse-v1, neoverse-v2, neoverse-v3, neoverse-v3ae, cyclone, apple-a7, apple-a8, apple-a9, apple-a10, apple-a11, apple-a12, apple-a13, apple-a14, apple-a15, apple-a16, apple-a17, apple-m1, apple-m2, apple-m3, apple-s4, apple-s5, exynos-m3, exynos-m4, exynos-m5, falkor, saphira, kryo, thunderx2t99, thunderx3t110, thunderx, thunderxt88, thunderxt81, thunderxt83, tsv110, a64fx, carmel, ampere1, ampere1a, ampere1b, cobalt-100, grace{{$}} +// TUNE_AARCH64-NEXT: note: valid target CPU values are: cortex-a34, cortex-a35, cortex-a53, cortex-a55, cortex-a510, cortex-a520, cortex-a520ae, cortex-a57, cortex-a65, cortex-a65ae, cortex-a72, cortex-a73, cortex-a75, cortex-a76, cortex-a76ae, cortex-a77, 
cortex-a78, cortex-a78ae, cortex-a78c, cortex-a710, cortex-a715, cortex-a720, cortex-a720ae, cortex-r82, cortex-r82ae, cortex-x1, cortex-x1c, cortex-x2, cortex-x3, cortex-x4, neoverse-e1, neoverse-n1, neoverse-n2, neoverse-n3, neoverse-512tvb, neoverse-v1, neoverse-v2, neoverse-v3, neoverse-v3ae, cyclone, apple-a7, apple-a8, apple-a9, apple-a10, apple-a11, apple-a12, apple-a13, apple-a14, apple-a15, apple-a16, apple-a17, apple-m1, apple-m2, apple-m3, apple-s4, apple-s5, exynos-m3, exynos-m4, exynos-m5, falkor, saphira, kryo, thunderx2t99, thunderx3t110, thunderx, thunderxt88, thunderxt81, thunderxt83, tsv110, a64fx, carmel, ampere1, ampere1a, ampere1b, oryon-1, cobalt-100, grace{{$}} // RUN: not %clang_cc1 -triple i386--- -target-cpu not-a-cpu -fsyntax-only %s 2>&1 | FileCheck %s --check-prefix X86 // X86: error: unknown target CPU 'not-a-cpu' @@ -29,7 +29,7 @@ // RUN: not %clang_cc1 -triple nvptx--- -target-cpu not-a-cpu -fsyntax-only %s 2>&1 | FileCheck %s --check-prefix NVPTX // NVPTX: error: unknown target CPU 'not-a-cpu' -// NVPTX-NEXT: note: valid target CPU values are: sm_20, sm_21, sm_30, sm_32, sm_35, sm_37, sm_50, sm_52, sm_53, sm_60, sm_61, sm_62, sm_70, sm_72, sm_75, sm_80, sm_86, sm_87, sm_89, sm_90, sm_90a, gfx600, gfx601, gfx602, gfx700, gfx701, gfx702, gfx703, gfx704, gfx705, gfx801, gfx802, gfx803, gfx805, gfx810, gfx9-generic, gfx900, gfx902, gfx904, gfx906, gfx908, gfx909, gfx90a, gfx90c, gfx940, gfx941, gfx942, gfx10-1-generic, gfx1010, gfx1011, gfx1012, gfx1013, gfx10-3-generic, gfx1030, gfx1031, gfx1032, gfx1033, gfx1034, gfx1035, gfx1036, gfx11-generic, gfx1100, gfx1101, gfx1102, gfx1103, gfx1150, gfx1151, gfx12-generic, gfx1200, gfx1201{{$}} +// NVPTX-NEXT: note: valid target CPU values are: sm_20, sm_21, sm_30, sm_32, sm_35, sm_37, sm_50, sm_52, sm_53, sm_60, sm_61, sm_62, sm_70, sm_72, sm_75, sm_80, sm_86, sm_87, sm_89, sm_90, sm_90a, gfx600, gfx601, gfx602, gfx700, gfx701, gfx702, gfx703, gfx704, gfx705, gfx801, gfx802, gfx803, gfx805, 
gfx810, gfx9-generic, gfx900, gfx902, gfx904, gfx906, gfx908, gfx909, gfx90a, gfx90c, gfx940, gfx941, gfx942, gfx10-1-generic, gfx1010, gfx1011, gfx1012, gfx1013, gfx10-3-generic, gfx1030, gfx1031, gfx1032, gfx1033, gfx1034, gfx1035, gfx1036, gfx11-generic, gfx1100, gfx1101, gfx1102, gfx1103, gfx1150, gfx1151, gfx1152, gfx12-generic, gfx1200, gfx1201{{$}} // RUN: not %clang_cc1 -triple r600--- -target-cpu not-a-cpu -fsyntax-only %s 2>&1 | FileCheck %s --check-prefix R600 // R600: error: unknown target CPU 'not-a-cpu' @@ -37,7 +37,7 @@ // RUN: not %clang_cc1 -triple amdgcn--- -target-cpu not-a-cpu -fsyntax-only %s 2>&1 | FileCheck %s --check-prefix AMDGCN // AMDGCN: error: unknown target CPU 'not-a-cpu' -// AMDGCN-NEXT: note: valid target CPU values are: gfx600, tahiti, gfx601, pitcairn, verde, gfx602, hainan, oland, gfx700, kaveri, gfx701, hawaii, gfx702, gfx703, kabini, mullins, gfx704, bonaire, gfx705, gfx801, carrizo, gfx802, iceland, tonga, gfx803, fiji, polaris10, polaris11, gfx805, tongapro, gfx810, stoney, gfx900, gfx902, gfx904, gfx906, gfx908, gfx909, gfx90a, gfx90c, gfx940, gfx941, gfx942, gfx1010, gfx1011, gfx1012, gfx1013, gfx1030, gfx1031, gfx1032, gfx1033, gfx1034, gfx1035, gfx1036, gfx1100, gfx1101, gfx1102, gfx1103, gfx1150, gfx1151, gfx1200, gfx1201, gfx9-generic, gfx10-1-generic, gfx10-3-generic, gfx11-generic, gfx12-generic{{$}} +// AMDGCN-NEXT: note: valid target CPU values are: gfx600, tahiti, gfx601, pitcairn, verde, gfx602, hainan, oland, gfx700, kaveri, gfx701, hawaii, gfx702, gfx703, kabini, mullins, gfx704, bonaire, gfx705, gfx801, carrizo, gfx802, iceland, tonga, gfx803, fiji, polaris10, polaris11, gfx805, tongapro, gfx810, stoney, gfx900, gfx902, gfx904, gfx906, gfx908, gfx909, gfx90a, gfx90c, gfx940, gfx941, gfx942, gfx1010, gfx1011, gfx1012, gfx1013, gfx1030, gfx1031, gfx1032, gfx1033, gfx1034, gfx1035, gfx1036, gfx1100, gfx1101, gfx1102, gfx1103, gfx1150, gfx1151, gfx1152, gfx1200, gfx1201, gfx9-generic, gfx10-1-generic, 
gfx10-3-generic, gfx11-generic, gfx12-generic{{$}} // RUN: not %clang_cc1 -triple wasm64--- -target-cpu not-a-cpu -fsyntax-only %s 2>&1 | FileCheck %s --check-prefix WEBASM // WEBASM: error: unknown target CPU 'not-a-cpu' diff --git a/clang/test/SemaCXX/attr-weak.cpp b/clang/test/SemaCXX/attr-weak.cpp index f065bfd..0f9a297 100644 --- a/clang/test/SemaCXX/attr-weak.cpp +++ b/clang/test/SemaCXX/attr-weak.cpp @@ -1,4 +1,5 @@ // RUN: %clang_cc1 -triple x86_64-pc-linux-gnu -fsyntax-only -verify -std=c++11 %s +// RUN: %clang_cc1 -triple x86_64-pc-linux-gnu -fsyntax-only -verify -std=c++11 %s -fexperimental-new-constant-interpreter static int test0 __attribute__((weak)); // expected-error {{weak declaration cannot have internal linkage}} static void test1() __attribute__((weak)); // expected-error {{weak declaration cannot have internal linkage}} diff --git a/clang/test/SemaCXX/builtin-is-bitwise-cloneable-fsanitize.cpp b/clang/test/SemaCXX/builtin-is-bitwise-cloneable-fsanitize.cpp new file mode 100644 index 0000000..d47a39a --- /dev/null +++ b/clang/test/SemaCXX/builtin-is-bitwise-cloneable-fsanitize.cpp @@ -0,0 +1,34 @@ +// RUN: %clang_cc1 -triple x86_64-unknown-linux -DSANITIZER_ENABLED -fsanitize=address -fsanitize-address-field-padding=1 %s +// RUN: %clang_cc1 -triple x86_64-unknown-linux %s +
+struct S {
+ ~S() {}
+ virtual void foo() {}
+
+ int buffer[1];
+ int other_field = 0;
+};
+
+union U {
+ S s;
+};
+
+struct Derived : S {};
+
+static_assert(!__is_trivially_copyable(S));
+#ifdef SANITIZER_ENABLED
+// Don't allow memcpy when the struct has poisoned padding bits.
+// The sanitizer adds poison padding bits to struct S.
+static_assert(sizeof(S) > 16);
+static_assert(!__is_bitwise_cloneable(S));
+static_assert(sizeof(U) == sizeof(S)); // no padding bit for U. 
+static_assert(!__is_bitwise_cloneable(U)); +static_assert(!__is_bitwise_cloneable(S[2])); +static_assert(!__is_bitwise_cloneable(Derived)); +#else +static_assert(sizeof(S) == 16); +static_assert(__is_bitwise_cloneable(S)); +static_assert(__is_bitwise_cloneable(U)); +static_assert(__is_bitwise_cloneable(S[2])); +static_assert(__is_bitwise_cloneable(Derived)); +#endif diff --git a/clang/test/SemaCXX/builtin-is-bitwise-cloneable.cpp b/clang/test/SemaCXX/builtin-is-bitwise-cloneable.cpp new file mode 100644 index 0000000..1781cf4 --- /dev/null +++ b/clang/test/SemaCXX/builtin-is-bitwise-cloneable.cpp @@ -0,0 +1,8 @@ +// RUN: %clang_cc1 -fsyntax-only -verify %s +// +struct DynamicClass { virtual int Foo(); }; +static_assert(!__is_trivially_copyable(DynamicClass)); +static_assert(__is_bitwise_cloneable(DynamicClass)); + +struct InComplete; // expected-note{{forward declaration}} +static_assert(!__is_bitwise_cloneable(InComplete)); // expected-error{{incomplete type 'InComplete' used in type trait expression}} diff --git a/clang/test/SemaCXX/constexpr-default-arg.cpp b/clang/test/SemaCXX/constexpr-default-arg.cpp index 901123b..ec9b292 100644 --- a/clang/test/SemaCXX/constexpr-default-arg.cpp +++ b/clang/test/SemaCXX/constexpr-default-arg.cpp @@ -32,8 +32,8 @@ void test_default_arg2() { } // Check that multiple CXXDefaultInitExprs don't cause an assertion failure. 
-struct A { int &&r = 0; }; +struct A { int &&r = 0; }; // expected-note 2{{default member initializer}} struct B { A x, y; }; -B b = {}; // expected-no-diagnostics +B b = {}; // expected-warning 2{{lifetime extension of temporary created by aggregate initialization using a default member initializer is not yet supported}} } diff --git a/clang/test/SemaCXX/cxx11-default-member-initializers.cpp b/clang/test/SemaCXX/cxx11-default-member-initializers.cpp index 1ea8b98..dd8e9c6 100644 --- a/clang/test/SemaCXX/cxx11-default-member-initializers.cpp +++ b/clang/test/SemaCXX/cxx11-default-member-initializers.cpp @@ -27,80 +27,6 @@ class MemInit { C m = s; }; -namespace std { -typedef decltype(sizeof(int)) size_t; - -// libc++'s implementation -template <class _E> class initializer_list { - const _E *__begin_; - size_t __size_; - - initializer_list(const _E *__b, size_t __s) : __begin_(__b), __size_(__s) {} - -public: - typedef _E value_type; - typedef const _E &reference; - typedef const _E &const_reference; - typedef size_t size_type; - - typedef const _E *iterator; - typedef const _E *const_iterator; - - initializer_list() : __begin_(nullptr), __size_(0) {} - - size_t size() const { return __size_; } - const _E *begin() const { return __begin_; } - const _E *end() const { return __begin_ + __size_; } -}; -} // namespace std - -#if __cplusplus >= 201703L -namespace test_rebuild { -template <typename T, int> class C { -public: - C(std::initializer_list<T>); -}; - -template <typename T> using Ptr = __remove_pointer(T) *; -template <typename T> C(T) -> C<Ptr<T>, sizeof(T)>; - -class A { -public: - template <typename T1, typename T2> T1 *some_func(T2 &&); -}; - -struct B : A { - // Test CXXDefaultInitExpr rebuild issue in - // https://github.com/llvm/llvm-project/pull/87933 - int *ar = some_func<int>(C{some_func<int>(0)}); - B() {} -}; - -int TestBody_got; -template <int> class Vector { -public: - Vector(std::initializer_list<int>); -}; -template <typename... 
Ts> Vector(Ts...) -> Vector<sizeof...(Ts)>; -class ProgramBuilder { -public: - template <typename T, typename ARGS> int *create(ARGS); -}; - -struct TypeTest : ProgramBuilder { - int *str_f16 = create<int>(Vector{0}); - TypeTest() {} -}; -class TypeTest_Element_Test : TypeTest { - void TestBody(); -}; -void TypeTest_Element_Test::TestBody() { - int *expect = str_f16; - &TestBody_got != expect; // expected-warning {{inequality comparison result unused}} -} -} // namespace test_rebuild -#endif // __cplusplus >= 201703L - #if __cplusplus >= 202002L // This test ensures cleanup expressions are correctly produced // in the presence of default member initializers. diff --git a/clang/test/SemaCXX/eval-crashes.cpp b/clang/test/SemaCXX/eval-crashes.cpp index a06f60f..017df97 100644 --- a/clang/test/SemaCXX/eval-crashes.cpp +++ b/clang/test/SemaCXX/eval-crashes.cpp @@ -25,9 +25,11 @@ namespace pr33140_0b { } namespace pr33140_2 { - struct A { int &&r = 0; }; + // FIXME: The declaration of 'b' below should lifetime-extend two int + // temporaries. 
+ struct A { int &&r = 0; }; // expected-note 2{{initializing field 'r' with default member initializer}} struct B { A x, y; }; - B b = {}; + B b = {}; // expected-warning 2{{lifetime extension of temporary created by aggregate initialization using a default member initializer is not yet supported}} } namespace pr33140_3 { diff --git a/clang/test/SemaCXX/nullptr_in_arithmetic_ops.cpp b/clang/test/SemaCXX/nullptr_in_arithmetic_ops.cpp index 6273d9c..98bec18 100644 --- a/clang/test/SemaCXX/nullptr_in_arithmetic_ops.cpp +++ b/clang/test/SemaCXX/nullptr_in_arithmetic_ops.cpp @@ -1,4 +1,5 @@ // RUN: %clang_cc1 -fsyntax-only -Wno-tautological-pointer-compare -fblocks -std=c++11 -verify %s +// RUN: %clang_cc1 -fsyntax-only -Wno-tautological-pointer-compare -fblocks -std=c++11 -verify %s -fexperimental-new-constant-interpreter void foo() { int a; diff --git a/clang/test/SemaObjCXX/arc-type-traits.mm b/clang/test/SemaObjCXX/arc-type-traits.mm index 2d30ae4..25bc8b36 100644 --- a/clang/test/SemaObjCXX/arc-type-traits.mm +++ b/clang/test/SemaObjCXX/arc-type-traits.mm @@ -221,3 +221,12 @@ TRAIT_IS_TRUE(__is_trivially_relocatable, __unsafe_unretained id); TRAIT_IS_TRUE(__is_trivially_relocatable, HasStrong); TRAIT_IS_FALSE(__is_trivially_relocatable, HasWeak); TRAIT_IS_TRUE(__is_trivially_relocatable, HasUnsafeUnretained); + +// __is_bitwise_cloneable +TRAIT_IS_FALSE(__is_bitwise_cloneable, __strong id); +TRAIT_IS_FALSE(__is_bitwise_cloneable, __weak id); +TRAIT_IS_FALSE(__is_bitwise_cloneable, __autoreleasing id); +TRAIT_IS_TRUE(__is_trivial, __unsafe_unretained id); +TRAIT_IS_FALSE(__is_bitwise_cloneable, HasStrong); +TRAIT_IS_FALSE(__is_bitwise_cloneable, HasWeak); +TRAIT_IS_TRUE(__is_bitwise_cloneable, HasUnsafeUnretained); diff --git a/clang/test/SemaOpenACC/loop-construct-auto_seq_independent-clauses.c b/clang/test/SemaOpenACC/loop-construct-auto_seq_independent-clauses.c index 23f852e..ac61976 100644 --- 
a/clang/test/SemaOpenACC/loop-construct-auto_seq_independent-clauses.c +++ b/clang/test/SemaOpenACC/loop-construct-auto_seq_independent-clauses.c @@ -106,7 +106,6 @@ void uses() { // expected-error@+1{{OpenACC 'present' clause is not valid on 'loop' directive}} #pragma acc loop auto present(Var) for(;;); - // expected-warning@+1{{OpenACC clause 'private' not yet implemented}} #pragma acc loop auto private(Var) for(;;); // expected-error@+1{{OpenACC 'copyout' clause is not valid on 'loop' directive}} @@ -246,7 +245,6 @@ void uses() { // expected-error@+1{{OpenACC 'present' clause is not valid on 'loop' directive}} #pragma acc loop present(Var) auto for(;;); - // expected-warning@+1{{OpenACC clause 'private' not yet implemented}} #pragma acc loop private(Var) auto for(;;); // expected-error@+1{{OpenACC 'copyout' clause is not valid on 'loop' directive}} @@ -387,7 +385,6 @@ void uses() { // expected-error@+1{{OpenACC 'present' clause is not valid on 'loop' directive}} #pragma acc loop independent present(Var) for(;;); - // expected-warning@+1{{OpenACC clause 'private' not yet implemented}} #pragma acc loop independent private(Var) for(;;); // expected-error@+1{{OpenACC 'copyout' clause is not valid on 'loop' directive}} @@ -527,7 +524,6 @@ void uses() { // expected-error@+1{{OpenACC 'present' clause is not valid on 'loop' directive}} #pragma acc loop present(Var) independent for(;;); - // expected-warning@+1{{OpenACC clause 'private' not yet implemented}} #pragma acc loop private(Var) independent for(;;); // expected-error@+1{{OpenACC 'copyout' clause is not valid on 'loop' directive}} @@ -677,7 +673,6 @@ void uses() { // expected-error@+1{{OpenACC 'present' clause is not valid on 'loop' directive}} #pragma acc loop seq present(Var) for(;;); - // expected-warning@+1{{OpenACC clause 'private' not yet implemented}} #pragma acc loop seq private(Var) for(;;); // expected-error@+1{{OpenACC 'copyout' clause is not valid on 'loop' directive}} @@ -826,7 +821,6 @@ void uses() 
{ // expected-error@+1{{OpenACC 'present' clause is not valid on 'loop' directive}} #pragma acc loop present(Var) seq for(;;); - // expected-warning@+1{{OpenACC clause 'private' not yet implemented}} #pragma acc loop private(Var) seq for(;;); // expected-error@+1{{OpenACC 'copyout' clause is not valid on 'loop' directive}} diff --git a/clang/test/SemaOpenACC/loop-construct-private-clause.c b/clang/test/SemaOpenACC/loop-construct-private-clause.c new file mode 100644 index 0000000..f3ffdfb --- /dev/null +++ b/clang/test/SemaOpenACC/loop-construct-private-clause.c @@ -0,0 +1,132 @@ +// RUN: %clang_cc1 %s -fopenacc -verify + +struct Incomplete; +enum SomeE{ A }; +typedef struct IsComplete { + struct S { int A; } CompositeMember; + int ScalarMember; + float ArrayMember[5]; + enum SomeE EnumMember; + void *PointerMember; +} Complete; + +int GlobalInt; +float GlobalArray[5]; +short *GlobalPointer; +Complete GlobalComposite; + +void uses(int IntParam, short *PointerParam, float ArrayParam[5], Complete CompositeParam) { + int LocalInt; + short *LocalPointer; + float LocalArray[5]; + Complete LocalComposite; + + // Check Appertainment: +#pragma acc loop private(LocalInt) + for(;;); + + // Valid cases: +#pragma acc loop private(LocalInt, LocalPointer, LocalArray) + for(;;); +#pragma acc loop private(LocalArray) + for(;;); +#pragma acc loop private(LocalArray[:]) + for(;;); +#pragma acc loop private(LocalArray[:5]) + for(;;); +#pragma acc loop private(LocalArray[2:]) + for(;;); +#pragma acc loop private(LocalArray[2:1]) + for(;;); +#pragma acc loop private(LocalArray[2]) + for(;;); +#pragma acc loop private(LocalComposite) + for(;;); +#pragma acc loop private(LocalComposite.EnumMember) + for(;;); +#pragma acc loop private(LocalComposite.ScalarMember) + for(;;); +#pragma acc loop private(LocalComposite.ArrayMember) + for(;;); +#pragma acc loop private(LocalComposite.ArrayMember[5]) + for(;;); +#pragma acc loop private(LocalComposite.PointerMember) + for(;;); +#pragma acc loop 
private(GlobalInt, GlobalArray, GlobalPointer, GlobalComposite) + for(;;); +#pragma acc loop private(GlobalArray[2], GlobalPointer[2], GlobalComposite.CompositeMember.A) + for(;;); +#pragma acc loop private(LocalComposite, GlobalComposite) + for(;;); +#pragma acc loop private(IntParam, PointerParam, ArrayParam, CompositeParam) + for(;;); +#pragma acc loop private(PointerParam[IntParam], ArrayParam[IntParam], CompositeParam.CompositeMember.A) + for(;;); + +#pragma acc loop private(LocalArray) private(LocalArray[2]) + for(;;); + +#pragma acc loop private(LocalArray, LocalArray[2]) + for(;;); + +#pragma acc loop private(LocalComposite, LocalComposite.ScalarMember) + for(;;); + +#pragma acc loop private(LocalComposite.CompositeMember.A, LocalComposite.ScalarMember) + for(;;); + +#pragma acc loop private(LocalComposite.CompositeMember.A) private(LocalComposite.ScalarMember) + for(;;); + + Complete LocalComposite2; +#pragma acc loop private(LocalComposite2.ScalarMember, LocalComposite2.ScalarMember) + for(;;); + + // Invalid cases, arbitrary expressions. 
+ struct Incomplete *I; + // expected-error@+1{{OpenACC variable is not a valid variable name, sub-array, array element, member of a composite variable, or composite variable member}} +#pragma acc loop private(*I) + for(;;); + // expected-error@+1{{OpenACC variable is not a valid variable name, sub-array, array element, member of a composite variable, or composite variable member}} +#pragma acc loop private(GlobalInt + IntParam) + for(;;); + // expected-error@+1{{OpenACC variable is not a valid variable name, sub-array, array element, member of a composite variable, or composite variable member}} +#pragma acc loop private(+GlobalInt) + for(;;); + + // expected-error@+1{{OpenACC sub-array length is unspecified and cannot be inferred because the subscripted value is not an array}} +#pragma acc loop private(PointerParam[:]) + for(;;); +#pragma acc loop private(PointerParam[:5]) + for(;;); +#pragma acc loop private(PointerParam[:IntParam]) + for(;;); + // expected-error@+1{{OpenACC sub-array length is unspecified and cannot be inferred because the subscripted value is not an array}} +#pragma acc loop private(PointerParam[2:]) + for(;;); +#pragma acc loop private(PointerParam[2:5]) + for(;;); +#pragma acc loop private(PointerParam[2]) + for(;;); +#pragma acc loop private(ArrayParam[:]) + for(;;); +#pragma acc loop private(ArrayParam[:5]) + for(;;); +#pragma acc loop private(ArrayParam[:IntParam]) + for(;;); +#pragma acc loop private(ArrayParam[2:]) + for(;;); + // expected-error@+1{{OpenACC sub-array specified range [2:5] would be out of the range of the subscripted array size of 5}} +#pragma acc loop private(ArrayParam[2:5]) + for(;;); +#pragma acc loop private(ArrayParam[2]) + for(;;); + + // expected-error@+2{{OpenACC sub-array specified range [2:5] would be out of the range of the subscripted array size of 5}} + // expected-error@+1{{OpenACC variable is not a valid variable name, sub-array, array element, member of a composite variable, or composite variable 
member}} +#pragma acc loop private((float*)ArrayParam[2:5]) + for(;;); + // expected-error@+1{{OpenACC variable is not a valid variable name, sub-array, array element, member of a composite variable, or composite variable member}} +#pragma acc loop private((float)ArrayParam[2]) + for(;;); +} diff --git a/clang/test/SemaOpenACC/loop-construct-private-clause.cpp b/clang/test/SemaOpenACC/loop-construct-private-clause.cpp new file mode 100644 index 0000000..b5d3fc9 --- /dev/null +++ b/clang/test/SemaOpenACC/loop-construct-private-clause.cpp @@ -0,0 +1,155 @@ +// RUN: %clang_cc1 %s -fopenacc -verify + +struct Incomplete; +enum SomeE{}; +typedef struct IsComplete { + struct S { int A; } CompositeMember; + int ScalarMember; + float ArrayMember[5]; + SomeE EnumMember; + char *PointerMember; +} Complete; + +int GlobalInt; +float GlobalArray[5]; +char *GlobalPointer; +Complete GlobalComposite; + +void uses(int IntParam, char *PointerParam, float ArrayParam[5], Complete CompositeParam, int &IntParamRef) { + int LocalInt; + char *LocalPointer; + float LocalArray[5]; + Complete LocalComposite; + + // Check Appertainment: + +#pragma acc loop private(LocalInt) + for(;;); + + // Valid cases: +#pragma acc loop private(LocalInt, LocalPointer, LocalArray) + for(;;); +#pragma acc loop private(LocalArray) + for(;;); +#pragma acc loop private(LocalArray[2]) + for(;;); +#pragma acc loop private(LocalComposite) + for(;;); +#pragma acc loop private(LocalComposite.EnumMember) + for(;;); +#pragma acc loop private(LocalComposite.ScalarMember) + for(;;); +#pragma acc loop private(LocalComposite.ArrayMember) + for(;;); +#pragma acc loop private(LocalComposite.ArrayMember[5]) + for(;;); +#pragma acc loop private(LocalComposite.PointerMember) + for(;;); +#pragma acc loop private(GlobalInt, GlobalArray, GlobalPointer, GlobalComposite) + for(;;); +#pragma acc loop private(GlobalArray[2], GlobalPointer[2], GlobalComposite.CompositeMember.A) + for(;;); +#pragma acc loop private(LocalComposite, 
GlobalComposite) + for(;;); +#pragma acc loop private(IntParam, PointerParam, ArrayParam, CompositeParam) private(IntParamRef) + for(;;); +#pragma acc loop private(PointerParam[IntParam], ArrayParam[IntParam], CompositeParam.CompositeMember.A) + for(;;); + + + // Invalid cases, arbitrary expressions. + Incomplete *I; + // expected-error@+1{{OpenACC variable is not a valid variable name, sub-array, array element, member of a composite variable, or composite variable member}} +#pragma acc loop private(*I) + for(;;); + // expected-error@+1{{OpenACC variable is not a valid variable name, sub-array, array element, member of a composite variable, or composite variable member}} +#pragma acc loop private(GlobalInt + IntParam) + for(;;); + // expected-error@+1{{OpenACC variable is not a valid variable name, sub-array, array element, member of a composite variable, or composite variable member}} +#pragma acc loop private(+GlobalInt) + for(;;); +} + +template<typename T, unsigned I, typename V> +void TemplUses(T t, T (&arrayT)[I], V TemplComp) { + // expected-error@+1{{OpenACC variable is not a valid variable name, sub-array, array element, member of a composite variable, or composite variable member}} +#pragma acc loop private(+t) + for(;;); + + // expected-error@+1{{OpenACC variable is not a valid variable name, sub-array, array element, member of a composite variable, or composite variable member}} +#pragma acc loop private(+I) + for(;;); + + // NTTP's are only valid if it is a reference to something. 
+ // expected-error@+2{{OpenACC variable is not a valid variable name, sub-array, array element, member of a composite variable, or composite variable member}} + // expected-note@#TEMPL_USES_INST{{in instantiation of}} +#pragma acc loop private(I) + for(;;); + + // expected-error@+1{{OpenACC variable is not a valid variable name, sub-array, array element, member of a composite variable, or composite variable member}} +#pragma acc loop private(t, I) + for(;;); + +#pragma acc loop private(arrayT) + for(;;); + +#pragma acc loop private(TemplComp) + for(;;); + +#pragma acc loop private(TemplComp.PointerMember[5]) + for(;;); + +#pragma acc loop private(TemplComp.PointerMember[5]) private(TemplComp) + for(;;); + + int *Pointer; +#pragma acc loop private(Pointer[:I]) + for(;;); +#pragma acc loop private(Pointer[:t]) + for(;;); + // expected-error@+1{{OpenACC sub-array length is unspecified and cannot be inferred because the subscripted value is not an array}} +#pragma acc loop private(Pointer[1:]) + for(;;); +} + +template<unsigned I, auto &NTTP_REF> +void NTTP() { + // NTTP's are only valid if it is a reference to something. 
+ // expected-error@+2{{OpenACC variable is not a valid variable name, sub-array, array element, member of a composite variable, or composite variable member}} + // expected-note@#NTTP_INST{{in instantiation of}} +#pragma acc loop private(I) + for(;;); + +#pragma acc loop private(NTTP_REF) + for(;;); +} + +struct S { + int ThisMember; + int ThisMemberArray[5]; + + void foo(); +}; + +void S::foo() { +#pragma acc loop private(ThisMember, this->ThisMemberArray[1]) + for(;;); + +#pragma acc loop private(ThisMemberArray[1:2]) + for(;;); + +#pragma acc loop private(this) + for(;;); + +#pragma acc loop private(ThisMember, this->ThisMember) + for(;;); +} + +void Inst() { + static constexpr int NTTP_REFed = 1; + int i; + int Arr[5]; + Complete C; + TemplUses(i, Arr, C); // #TEMPL_USES_INST + NTTP<5, NTTP_REFed>(); // #NTTP_INST +} diff --git a/clang/test/SemaOpenCL/builtins-amdgcn-gfx940-err.cl b/clang/test/SemaOpenCL/builtins-amdgcn-gfx940-err.cl index 487cc53..2a1ba43 100644 --- a/clang/test/SemaOpenCL/builtins-amdgcn-gfx940-err.cl +++ b/clang/test/SemaOpenCL/builtins-amdgcn-gfx940-err.cl @@ -3,8 +3,10 @@ typedef unsigned int u32; -void test_global_load_lds_unsupported_size(global u32* src, local u32 *dst, u32 size) { - __builtin_amdgcn_global_load_lds(src, dst, size, /*offset=*/0, /*aux=*/0); // expected-error{{expression is not an integer constant expression}} +void test_global_load_lds_unsupported_size(global u32* src, local u32 *dst, u32 size, u32 offset, u32 aux) { + __builtin_amdgcn_global_load_lds(src, dst, size, /*offset=*/0, /*aux=*/0); // expected-error{{argument to '__builtin_amdgcn_global_load_lds' must be a constant integer}} + __builtin_amdgcn_global_load_lds(src, dst, /*size=*/4, offset, /*aux=*/0); // expected-error{{argument to '__builtin_amdgcn_global_load_lds' must be a constant integer}} + __builtin_amdgcn_global_load_lds(src, dst, /*size=*/4, /*offset=*/0, aux); // expected-error{{argument to '__builtin_amdgcn_global_load_lds' must be a constant 
integer}} __builtin_amdgcn_global_load_lds(src, dst, /*size=*/5, /*offset=*/0, /*aux=*/0); // expected-error{{invalid size value}} expected-note {{size must be 1, 2, or 4}} __builtin_amdgcn_global_load_lds(src, dst, /*size=*/0, /*offset=*/0, /*aux=*/0); // expected-error{{invalid size value}} expected-note {{size must be 1, 2, or 4}} __builtin_amdgcn_global_load_lds(src, dst, /*size=*/3, /*offset=*/0, /*aux=*/0); // expected-error{{invalid size value}} expected-note {{size must be 1, 2, or 4}} diff --git a/clang/unittests/AST/Interp/toAPValue.cpp b/clang/unittests/AST/Interp/toAPValue.cpp index e56453a..d6879d6 100644 --- a/clang/unittests/AST/Interp/toAPValue.cpp +++ b/clang/unittests/AST/Interp/toAPValue.cpp @@ -186,3 +186,49 @@ TEST(ToAPValue, FunctionPointersC) { ASSERT_EQ(I, 17); } } + +TEST(ToAPValue, MemberPointers) { + constexpr char Code[] = "struct S {\n" + " int m, n;\n" + "};\n" + "constexpr int S::*pm = &S::m;\n" + "constexpr int S::*nn = nullptr;\n"; + + auto AST = tooling::buildASTFromCodeWithArgs( + Code, {"-fexperimental-new-constant-interpreter"}); + + auto &Ctx = AST->getASTContext().getInterpContext(); + Program &Prog = Ctx.getProgram(); + + auto getDecl = [&](const char *Name) -> const ValueDecl * { + auto Nodes = + match(valueDecl(hasName(Name)).bind("var"), AST->getASTContext()); + assert(Nodes.size() == 1); + const auto *D = Nodes[0].getNodeAs<ValueDecl>("var"); + assert(D); + return D; + }; + + auto getGlobalPtr = [&](const char *Name) -> Pointer { + const VarDecl *D = cast<VarDecl>(getDecl(Name)); + return Prog.getPtrGlobal(*Prog.getGlobal(D)); + }; + + { + const Pointer &GP = getGlobalPtr("pm"); + ASSERT_TRUE(GP.isLive()); + const MemberPointer &FP = GP.deref<MemberPointer>(); + APValue A = FP.toAPValue(); + ASSERT_EQ(A.getMemberPointerDecl(), getDecl("m")); + ASSERT_EQ(A.getKind(), APValue::MemberPointer); + } + + { + const Pointer &GP = getGlobalPtr("nn"); + ASSERT_TRUE(GP.isLive()); + const MemberPointer &NP = 
GP.deref<MemberPointer>(); + ASSERT_TRUE(NP.isZero()); + APValue A = NP.toAPValue(); + ASSERT_EQ(A.getKind(), APValue::MemberPointer); + } +} diff --git a/clang/unittests/Format/FormatTest.cpp b/clang/unittests/Format/FormatTest.cpp index 004ecb6..4e42726 100644 --- a/clang/unittests/Format/FormatTest.cpp +++ b/clang/unittests/Format/FormatTest.cpp @@ -24879,7 +24879,7 @@ TEST_F(FormatTest, SkipMacroDefinitionBody) { Style); // With comments. - verifyFormat("/* */ #define A a // a a", "/* */ # define A a // a a", + verifyFormat("/* */ #define A a // a a", "/* */ # define A a // a a", Style); verifyNoChange("/* */ #define A a // a a", Style); @@ -24891,6 +24891,15 @@ TEST_F(FormatTest, SkipMacroDefinitionBody) { "int aaa; // a", Style); + verifyNoChange( + "#define MACRO_WITH_COMMENTS() \\\n" + " public: \\\n" + " /* Documentation parsed by Doxygen for the following method. */ \\\n" + " static MyType getClassTypeId(); \\\n" + " /** Normal comment for the following method. */ \\\n" + " virtual MyType getTypeId() const;", + Style); + // multiline macro definitions verifyNoChange("#define A a\\\n" " A a \\\n " diff --git a/clang/unittests/Lex/DependencyDirectivesScannerTest.cpp b/clang/unittests/Lex/DependencyDirectivesScannerTest.cpp index 044c3d6..59fef9e 100644 --- a/clang/unittests/Lex/DependencyDirectivesScannerTest.cpp +++ b/clang/unittests/Lex/DependencyDirectivesScannerTest.cpp @@ -7,7 +7,6 @@ //===----------------------------------------------------------------------===// #include "clang/Lex/DependencyDirectivesScanner.h" -#include "clang/Basic/TokenKinds.h" #include "llvm/ADT/SmallString.h" #include "gtest/gtest.h" @@ -18,11 +17,11 @@ using namespace clang::dependency_directives_scan; static bool minimizeSourceToDependencyDirectives( StringRef Input, SmallVectorImpl<char> &Out, SmallVectorImpl<dependency_directives_scan::Token> &Tokens, - SmallVectorImpl<Directive> &Directives, const LangOptions &LangOpts) { + SmallVectorImpl<Directive> &Directives) { 
Out.clear(); Tokens.clear(); Directives.clear(); - if (scanSourceForDependencyDirectives(Input, Tokens, Directives, LangOpts)) + if (scanSourceForDependencyDirectives(Input, Tokens, Directives)) return true; raw_svector_ostream OS(Out); @@ -39,9 +38,7 @@ static bool minimizeSourceToDependencyDirectives(StringRef Input, SmallVectorImpl<char> &Out) { SmallVector<dependency_directives_scan::Token, 16> Tokens; SmallVector<Directive, 32> Directives; - LangOptions LangOpts; - return minimizeSourceToDependencyDirectives(Input, Out, Tokens, Directives, - LangOpts); + return minimizeSourceToDependencyDirectives(Input, Out, Tokens, Directives); } namespace { @@ -50,17 +47,16 @@ TEST(MinimizeSourceToDependencyDirectivesTest, Empty) { SmallVector<char, 128> Out; SmallVector<dependency_directives_scan::Token, 4> Tokens; SmallVector<Directive, 4> Directives; - LangOptions LangOpts; - ASSERT_FALSE(minimizeSourceToDependencyDirectives("", Out, Tokens, Directives, - LangOpts)); + ASSERT_FALSE( + minimizeSourceToDependencyDirectives("", Out, Tokens, Directives)); EXPECT_TRUE(Out.empty()); EXPECT_TRUE(Tokens.empty()); ASSERT_EQ(1u, Directives.size()); ASSERT_EQ(pp_eof, Directives.back().Kind); ASSERT_FALSE(minimizeSourceToDependencyDirectives("abc def\nxyz", Out, Tokens, - Directives, LangOpts)); + Directives)); EXPECT_STREQ("<TokBeforeEOF>\n", Out.data()); EXPECT_TRUE(Tokens.empty()); ASSERT_EQ(2u, Directives.size()); @@ -72,7 +68,6 @@ TEST(MinimizeSourceToDependencyDirectivesTest, AllTokens) { SmallVector<char, 128> Out; SmallVector<dependency_directives_scan::Token, 4> Tokens; SmallVector<Directive, 4> Directives; - LangOptions LangOpts; ASSERT_FALSE( minimizeSourceToDependencyDirectives("#define A\n" @@ -97,7 +92,7 @@ TEST(MinimizeSourceToDependencyDirectivesTest, AllTokens) { "export module m;\n" "import m;\n" "#pragma clang system_header\n", - Out, Tokens, Directives, LangOpts)); + Out, Tokens, Directives)); EXPECT_EQ(pp_define, Directives[0].Kind); EXPECT_EQ(pp_undef, 
Directives[1].Kind); EXPECT_EQ(pp_endif, Directives[2].Kind); @@ -150,10 +145,9 @@ TEST(MinimizeSourceToDependencyDirectivesTest, Define) { SmallVector<char, 128> Out; SmallVector<dependency_directives_scan::Token, 4> Tokens; SmallVector<Directive, 4> Directives; - LangOptions LangOpts; - ASSERT_FALSE(minimizeSourceToDependencyDirectives( - "#define MACRO", Out, Tokens, Directives, LangOpts)); + ASSERT_FALSE(minimizeSourceToDependencyDirectives("#define MACRO", Out, + Tokens, Directives)); EXPECT_STREQ("#define MACRO\n", Out.data()); ASSERT_EQ(4u, Tokens.size()); ASSERT_EQ(2u, Directives.size()); @@ -844,7 +838,6 @@ TEST(MinimizeSourceToDependencyDirectivesTest, PragmaOnce) { SmallVector<char, 128> Out; SmallVector<dependency_directives_scan::Token, 4> Tokens; SmallVector<Directive, 4> Directives; - LangOptions LangOpts; StringRef Source = R"(// comment #pragma once @@ -852,8 +845,8 @@ TEST(MinimizeSourceToDependencyDirectivesTest, PragmaOnce) { #include <test.h> _Pragma("once") )"; - ASSERT_FALSE(minimizeSourceToDependencyDirectives(Source, Out, Tokens, - Directives, LangOpts)); + ASSERT_FALSE( + minimizeSourceToDependencyDirectives(Source, Out, Tokens, Directives)); EXPECT_STREQ("#pragma once\n#include <test.h>\n_Pragma(\"once\")\n", Out.data()); ASSERT_EQ(Directives.size(), 4u); @@ -933,7 +926,6 @@ TEST(MinimizeSourceToDependencyDirectivesTest, CxxModules) { SmallVector<char, 128> Out; SmallVector<dependency_directives_scan::Token, 4> Tokens; SmallVector<Directive, 4> Directives; - LangOptions LangOpts; StringRef Source = R"( module; @@ -962,8 +954,8 @@ ort \ import f(->a = 3); } )"; - ASSERT_FALSE(minimizeSourceToDependencyDirectives(Source, Out, Tokens, - Directives, LangOpts)); + ASSERT_FALSE( + minimizeSourceToDependencyDirectives(Source, Out, Tokens, Directives)); EXPECT_STREQ("#include \"textual-header.h\"\nexport module m;" "exp\\\nort import:l[[rename]];" "import<<=3;import a b d e d e f e;" @@ -1020,52 +1012,4 @@ 
TEST(MinimizeSourceToDependencyDirectivesTest, TokensBeforeEOF) { EXPECT_STREQ("#ifndef A\n#define A\n#endif\n<TokBeforeEOF>\n", Out.data()); } -TEST(MinimizeSourceToDependencyDirectivesTest, CPlusPlus14PPNumber) { - SmallVector<char, 128> Out; - SmallVector<dependency_directives_scan::Token, 4> Tokens; - SmallVector<Directive, 4> Directives; - LangOptions LangOpts; - - StringRef Source = R"( -#if 123'124 -#endif -)"; - - LangOpts.CPlusPlus14 = true; - ASSERT_FALSE(minimizeSourceToDependencyDirectives(Source, Out, Tokens, - Directives, LangOpts)); - EXPECT_STREQ("#if 123'124\n#endif\n", Out.data()); - ASSERT_EQ(Directives.size(), 3u); - EXPECT_EQ(Directives[0].Kind, dependency_directives_scan::pp_if); - EXPECT_EQ(Directives[1].Kind, dependency_directives_scan::pp_endif); - EXPECT_EQ(Directives[2].Kind, dependency_directives_scan::pp_eof); - ASSERT_EQ(Tokens.size(), 7u); - - ASSERT_TRUE(Tokens[0].is(tok::hash)); - ASSERT_TRUE(Tokens[1].is(tok::raw_identifier)); // "if" - ASSERT_TRUE(Tokens[2].is(tok::numeric_constant)); // 123'124 - ASSERT_TRUE(Tokens[3].is(tok::eod)); - ASSERT_TRUE(Tokens[4].is(tok::hash)); - ASSERT_TRUE(Tokens[5].is(tok::raw_identifier)); // #endif - ASSERT_TRUE(Tokens[6].is(tok::eod)); - - LangOpts.CPlusPlus14 = false; - ASSERT_FALSE(minimizeSourceToDependencyDirectives(Source, Out, Tokens, - Directives, LangOpts)); - EXPECT_STREQ("#if 123'124\n#endif\n", Out.data()); - ASSERT_EQ(Directives.size(), 3u); - EXPECT_EQ(Directives[0].Kind, dependency_directives_scan::pp_if); - EXPECT_EQ(Directives[1].Kind, dependency_directives_scan::pp_endif); - EXPECT_EQ(Directives[2].Kind, dependency_directives_scan::pp_eof); - ASSERT_EQ(Tokens.size(), 8u); - ASSERT_TRUE(Tokens[0].is(tok::hash)); - ASSERT_TRUE(Tokens[1].is(tok::raw_identifier)); // "if" - ASSERT_TRUE(Tokens[2].is(tok::numeric_constant)); // 123 - ASSERT_TRUE(Tokens[3].is(tok::unknown)); // '124 - ASSERT_TRUE(Tokens[4].is(tok::eod)); - ASSERT_TRUE(Tokens[5].is(tok::hash)); - 
ASSERT_TRUE(Tokens[6].is(tok::raw_identifier)); // #endif - ASSERT_TRUE(Tokens[7].is(tok::eod)); -} - } // end anonymous namespace diff --git a/clang/unittests/Lex/PPDependencyDirectivesTest.cpp b/clang/unittests/Lex/PPDependencyDirectivesTest.cpp index 410f378..6ff87f7 100644 --- a/clang/unittests/Lex/PPDependencyDirectivesTest.cpp +++ b/clang/unittests/Lex/PPDependencyDirectivesTest.cpp @@ -104,7 +104,6 @@ TEST_F(PPDependencyDirectivesTest, MacroGuard) { SmallVector<dependency_directives_scan::Directive> Directives; }; SmallVector<std::unique_ptr<DepDirectives>> DepDirectivesObjects; - LangOptions LangOpts; auto getDependencyDirectives = [&](FileEntryRef File) -> std::optional<ArrayRef<dependency_directives_scan::Directive>> { @@ -112,7 +111,7 @@ TEST_F(PPDependencyDirectivesTest, MacroGuard) { StringRef Input = (*FileMgr.getBufferForFile(File))->getBuffer(); bool Err = scanSourceForDependencyDirectives( Input, DepDirectivesObjects.back()->Tokens, - DepDirectivesObjects.back()->Directives, LangOpts); + DepDirectivesObjects.back()->Directives); EXPECT_FALSE(Err); return llvm::ArrayRef(DepDirectivesObjects.back()->Directives); }; diff --git a/clang/www/cxx_dr_status.html b/clang/www/cxx_dr_status.html index b046468..4385744 100755 --- a/clang/www/cxx_dr_status.html +++ b/clang/www/cxx_dr_status.html @@ -10698,7 +10698,7 @@ and <I>POD class</I></td> <td><a href="https://cplusplus.github.io/CWG/issues/1815.html">1815</a></td> <td>CD4</td> <td>Lifetime extension in aggregate initialization</td> - <td class="unreleased" align="center">Clang 19</td> + <td class="none" align="center">No</td> </tr> <tr id="1816"> <td><a href="https://cplusplus.github.io/CWG/issues/1816.html">1816</a></td> diff --git a/compiler-rt/lib/builtins/atomic.c b/compiler-rt/lib/builtins/atomic.c index c3a36a9..852bb20 100644 --- a/compiler-rt/lib/builtins/atomic.c +++ b/compiler-rt/lib/builtins/atomic.c @@ -51,14 +51,6 @@ #endif static const long SPINLOCK_MASK = SPINLOCK_COUNT - 1; -#ifndef 
CACHE_LINE_SIZE -#define CACHE_LINE_SIZE 64 -#endif - -#ifdef __clang__ -#pragma clang diagnostic ignored "-Wgnu-designator" -#endif - //////////////////////////////////////////////////////////////////////////////// // Platform-specific lock implementation. Falls back to spinlocks if none is // defined. Each platform should define the Lock type, and corresponding @@ -102,18 +94,21 @@ static Lock locks[SPINLOCK_COUNT]; // initialized to OS_SPINLOCK_INIT which is 0 #else _Static_assert(__atomic_always_lock_free(sizeof(uintptr_t), 0), "Implementation assumes lock-free pointer-size cmpxchg"); -#include <pthread.h> -#include <stdalign.h> -typedef struct { - alignas(CACHE_LINE_SIZE) pthread_mutex_t m; -} Lock; +typedef _Atomic(uintptr_t) Lock; /// Unlock a lock. This is a release operation. -__inline static void unlock(Lock *l) { pthread_mutex_unlock(&l->m); } -/// Locks a lock. -__inline static void lock(Lock *l) { pthread_mutex_lock(&l->m); } +__inline static void unlock(Lock *l) { + __c11_atomic_store(l, 0, __ATOMIC_RELEASE); +} +/// Locks a lock. In the current implementation, this is potentially +/// unbounded in the contended case. +__inline static void lock(Lock *l) { + uintptr_t old = 0; + while (!__c11_atomic_compare_exchange_weak(l, &old, 1, __ATOMIC_ACQUIRE, + __ATOMIC_RELAXED)) + old = 0; +} /// locks for atomic operations -static Lock locks[SPINLOCK_COUNT] = { - [0 ... SPINLOCK_COUNT - 1] = {PTHREAD_MUTEX_INITIALIZER}}; +static Lock locks[SPINLOCK_COUNT]; #endif /// Returns a lock to use for a given pointer. 
diff --git a/cross-project-tests/debuginfo-tests/llvm-prettyprinters/gdb/llvm-support.cpp b/cross-project-tests/debuginfo-tests/llvm-prettyprinters/gdb/llvm-support.cpp index c0168ab..8c5c5da 100644 --- a/cross-project-tests/debuginfo-tests/llvm-prettyprinters/gdb/llvm-support.cpp +++ b/cross-project-tests/debuginfo-tests/llvm-prettyprinters/gdb/llvm-support.cpp @@ -18,7 +18,7 @@ llvm::MutableArrayRef<int> MutableArrayRef(Array); llvm::DenseMap<int, int> DenseMap = {{4, 5}, {6, 7}}; llvm::StringMap<int> StringMap = {{"foo", 123}, {"bar", 456}}; llvm::Expected<int> ExpectedValue(8); -llvm::Expected<int> ExpectedError(llvm::createStringError({}, "")); +llvm::Expected<int> ExpectedError(llvm::createStringError("")); std::optional<int> OptionalValue(9); std::optional<int> OptionalNone(std::nullopt); llvm::SmallVector<int, 5> SmallVector = {10, 11, 12}; diff --git a/flang/cmake/modules/AddFlangOffloadRuntime.cmake b/flang/cmake/modules/AddFlangOffloadRuntime.cmake index 0af12c8..6407be5 100644 --- a/flang/cmake/modules/AddFlangOffloadRuntime.cmake +++ b/flang/cmake/modules/AddFlangOffloadRuntime.cmake @@ -101,6 +101,7 @@ macro(enable_omp_offload_compilation files) "gfx908;gfx90a;gfx90c;gfx940;gfx1010;gfx1030" "gfx1031;gfx1032;gfx1033;gfx1034;gfx1035;gfx1036" "gfx1100;gfx1101;gfx1102;gfx1103;gfx1150;gfx1151" + "gfx1152" ) set(all_nvptx_architectures "sm_35;sm_37;sm_50;sm_52;sm_53;sm_60;sm_61;sm_62" diff --git a/flang/docs/Intrinsics.md b/flang/docs/Intrinsics.md index 41129b1..8853d4d 100644 --- a/flang/docs/Intrinsics.md +++ b/flang/docs/Intrinsics.md @@ -967,4 +967,35 @@ program test_etime print *, tarray(1) print *, tarray(2) end program test_etime +``` + +### Non-Standard Intrinsics: GETCWD + +#### Description +`GETCWD(C, STATUS)` returns current working directory. + +This intrinsic is provided in both subroutine and function forms; however, only one form can be used in any given program unit. 
+ +*C* and *STATUS* are `INTENT(OUT)` and provide the following: + +| | | +|------------|---------------------------------------------------------------------------------------------------| +| `C` | Current working directory. The type shall be `CHARACTER` and of default kind. | +| `STATUS` | (Optional) Status flag. Returns 0 on success, a system-specific, nonzero error code otherwise. The type shall be `INTEGER` and of a kind greater than or equal to 4. | + +#### Usage and Info + +- **Standard:** GNU extension +- **Class:** Subroutine, function +- **Syntax:** `CALL GETCWD(C, STATUS)`, `STATUS = GETCWD(C)` + +#### Example +```Fortran +PROGRAM example_getcwd + CHARACTER(len=255) :: cwd + INTEGER :: status + CALL getcwd(cwd, status) + PRINT *, cwd + PRINT *, status +END PROGRAM ```
\ No newline at end of file diff --git a/flang/include/flang/Optimizer/Builder/IntrinsicCall.h b/flang/include/flang/Optimizer/Builder/IntrinsicCall.h index c47e41e..8ef5d59 100644 --- a/flang/include/flang/Optimizer/Builder/IntrinsicCall.h +++ b/flang/include/flang/Optimizer/Builder/IntrinsicCall.h @@ -232,6 +232,8 @@ struct IntrinsicLibrary { mlir::Value genFloor(mlir::Type, llvm::ArrayRef<mlir::Value>); mlir::Value genFraction(mlir::Type resultType, mlir::ArrayRef<mlir::Value> args); + fir::ExtendedValue genGetCwd(std::optional<mlir::Type> resultType, + llvm::ArrayRef<fir::ExtendedValue> args); void genGetCommand(mlir::ArrayRef<fir::ExtendedValue> args); mlir::Value genGetPID(mlir::Type resultType, llvm::ArrayRef<mlir::Value> args); diff --git a/flang/include/flang/Optimizer/Builder/Runtime/Command.h b/flang/include/flang/Optimizer/Builder/Runtime/Command.h index 976fb3a..0d60a36 100644 --- a/flang/include/flang/Optimizer/Builder/Runtime/Command.h +++ b/flang/include/flang/Optimizer/Builder/Runtime/Command.h @@ -53,5 +53,10 @@ mlir::Value genGetEnvVariable(fir::FirOpBuilder &, mlir::Location, mlir::Value length, mlir::Value trimName, mlir::Value errmsg); +/// Generate a call to the GetCwd runtime function which implements +/// the GETCWD intrinsic. 
+mlir::Value genGetCwd(fir::FirOpBuilder &builder, mlir::Location loc, + mlir::Value c); + } // namespace fir::runtime #endif // FORTRAN_OPTIMIZER_BUILDER_RUNTIME_COMMAND_H diff --git a/flang/include/flang/Optimizer/Dialect/FIRAttr.td b/flang/include/flang/Optimizer/Dialect/FIRAttr.td index 0c34b64..aedb676 100644 --- a/flang/include/flang/Optimizer/Dialect/FIRAttr.td +++ b/flang/include/flang/Optimizer/Dialect/FIRAttr.td @@ -67,6 +67,36 @@ def fir_BoxFieldAttr : I32EnumAttr< let cppNamespace = "fir"; } +def fir_ReduceOperationEnum : I32BitEnumAttr<"ReduceOperationEnum", + "intrinsic operations and functions supported by DO CONCURRENT REDUCE", + [ + I32BitEnumAttrCaseBit<"Add", 0, "add">, + I32BitEnumAttrCaseBit<"Multiply", 1, "multiply">, + I32BitEnumAttrCaseBit<"AND", 2, "and">, + I32BitEnumAttrCaseBit<"OR", 3, "or">, + I32BitEnumAttrCaseBit<"EQV", 4, "eqv">, + I32BitEnumAttrCaseBit<"NEQV", 5, "neqv">, + I32BitEnumAttrCaseBit<"MAX", 6, "max">, + I32BitEnumAttrCaseBit<"MIN", 7, "min">, + I32BitEnumAttrCaseBit<"IAND", 8, "iand">, + I32BitEnumAttrCaseBit<"IOR", 9, "ior">, + I32BitEnumAttrCaseBit<"EIOR", 10, "eior"> + ]> { + let separator = ", "; + let cppNamespace = "::fir"; + let printBitEnumPrimaryGroups = 1; +} + +def fir_ReduceAttr : fir_Attr<"Reduce"> { + let mnemonic = "reduce_attr"; + + let parameters = (ins + "ReduceOperationEnum":$reduce_operation + ); + + let assemblyFormat = "`<` $reduce_operation `>`"; +} + // mlir::SideEffects::Resource for modelling operations which add debugging information def DebuggingResource : Resource<"::fir::DebuggingResource">; diff --git a/flang/include/flang/Optimizer/Dialect/FIROps.td b/flang/include/flang/Optimizer/Dialect/FIROps.td index 37fbd1f..e7da3af 100644 --- a/flang/include/flang/Optimizer/Dialect/FIROps.td +++ b/flang/include/flang/Optimizer/Dialect/FIROps.td @@ -2125,8 +2125,8 @@ class region_Op<string mnemonic, list<Trait> traits = []> : let hasVerifier = 1; } -def fir_DoLoopOp : region_Op<"do_loop", - 
[DeclareOpInterfaceMethods<LoopLikeOpInterface, +def fir_DoLoopOp : region_Op<"do_loop", [AttrSizedOperandSegments, + DeclareOpInterfaceMethods<LoopLikeOpInterface, ["getYieldedValuesMutable"]>]> { let summary = "generalized loop operation"; let description = [{ @@ -2156,9 +2156,11 @@ def fir_DoLoopOp : region_Op<"do_loop", Index:$lowerBound, Index:$upperBound, Index:$step, + Variadic<AnyType>:$reduceOperands, Variadic<AnyType>:$initArgs, OptionalAttr<UnitAttr>:$unordered, - OptionalAttr<UnitAttr>:$finalValue + OptionalAttr<UnitAttr>:$finalValue, + OptionalAttr<ArrayAttr>:$reduceAttrs ); let results = (outs Variadic<AnyType>:$results); let regions = (region SizedRegion<1>:$region); @@ -2169,6 +2171,8 @@ def fir_DoLoopOp : region_Op<"do_loop", "mlir::Value":$step, CArg<"bool", "false">:$unordered, CArg<"bool", "false">:$finalCountValue, CArg<"mlir::ValueRange", "std::nullopt">:$iterArgs, + CArg<"mlir::ValueRange", "std::nullopt">:$reduceOperands, + CArg<"llvm::ArrayRef<mlir::Attribute>", "{}">:$reduceAttrs, CArg<"llvm::ArrayRef<mlir::NamedAttribute>", "{}">:$attributes)> ]; @@ -2181,11 +2185,12 @@ def fir_DoLoopOp : region_Op<"do_loop", return getBody()->getArguments().drop_front(); } mlir::Operation::operand_range getIterOperands() { - return getOperands().drop_front(getNumControlOperands()); + return getOperands() + .drop_front(getNumControlOperands() + getNumReduceOperands()); } llvm::MutableArrayRef<mlir::OpOperand> getInitsMutable() { - return - getOperation()->getOpOperands().drop_front(getNumControlOperands()); + return getOperation()->getOpOperands() + .drop_front(getNumControlOperands() + getNumReduceOperands()); } void setLowerBound(mlir::Value bound) { (*this)->setOperand(0, bound); } @@ -2200,11 +2205,25 @@ def fir_DoLoopOp : region_Op<"do_loop", unsigned getNumControlOperands() { return 3; } /// Does the operation hold operands for loop-carried values bool hasIterOperands() { - return (*this)->getNumOperands() > getNumControlOperands(); + return 
getNumIterOperands() > 0; + } + /// Does the operation hold operands for reduction variables + bool hasReduceOperands() { + return getNumReduceOperands() > 0; + } + /// Get Number of variadic operands + unsigned getNumOperands(unsigned idx) { + auto segments = (*this)->getAttrOfType<mlir::DenseI32ArrayAttr>( + getOperandSegmentSizeAttr()); + return static_cast<unsigned>(segments[idx]); + } + // Get Number of reduction operands + unsigned getNumReduceOperands() { + return getNumOperands(3); } /// Get Number of loop-carried values unsigned getNumIterOperands() { - return (*this)->getNumOperands() - getNumControlOperands(); + return getNumOperands(4); } /// Get the body of the loop diff --git a/flang/include/flang/Optimizer/Transforms/Passes.h b/flang/include/flang/Optimizer/Transforms/Passes.h index a7ba704..2d43f4d 100644 --- a/flang/include/flang/Optimizer/Transforms/Passes.h +++ b/flang/include/flang/Optimizer/Transforms/Passes.h @@ -60,9 +60,6 @@ std::unique_ptr<mlir::Pass> createAffineDemotionPass(); std::unique_ptr<mlir::Pass> createArrayValueCopyPass(fir::ArrayValueCopyOptions options = {}); std::unique_ptr<mlir::Pass> createCFGConversionPassWithNSW(); -std::unique_ptr<mlir::Pass> createExternalNameConversionPass(); -std::unique_ptr<mlir::Pass> -createExternalNameConversionPass(bool appendUnderscore); std::unique_ptr<mlir::Pass> createMemDataFlowOptPass(); std::unique_ptr<mlir::Pass> createPromoteToAffinePass(); std::unique_ptr<mlir::Pass> diff --git a/flang/include/flang/Optimizer/Transforms/Passes.td b/flang/include/flang/Optimizer/Transforms/Passes.td index 8263820..cac590a8 100644 --- a/flang/include/flang/Optimizer/Transforms/Passes.td +++ b/flang/include/flang/Optimizer/Transforms/Passes.td @@ -163,7 +163,6 @@ def ExternalNameConversion : Pass<"external-name-interop", "mlir::ModuleOp"> { let description = [{ Demangle FIR internal name and mangle them for external interoperability. 
}]; - let constructor = "::fir::createExternalNameConversionPass()"; let options = [ Option<"appendUnderscoreOpt", "append-underscore", "bool", /*default=*/"true", diff --git a/flang/include/flang/Runtime/command.h b/flang/include/flang/Runtime/command.h index c67d171..7ab3f64 100644 --- a/flang/include/flang/Runtime/command.h +++ b/flang/include/flang/Runtime/command.h @@ -55,6 +55,10 @@ std::int32_t RTNAME(GetEnvVariable)(const Descriptor &name, const Descriptor *value = nullptr, const Descriptor *length = nullptr, bool trim_name = true, const Descriptor *errmsg = nullptr, const char *sourceFile = nullptr, int line = 0); + +// Calls getcwd() +std::int32_t RTNAME(GetCwd)( + const Descriptor &cwd, const char *sourceFile, int line); } } // namespace Fortran::runtime diff --git a/flang/include/flang/Runtime/magic-numbers.h b/flang/include/flang/Runtime/magic-numbers.h index 38ccc5e..1cded1f 100644 --- a/flang/include/flang/Runtime/magic-numbers.h +++ b/flang/include/flang/Runtime/magic-numbers.h @@ -69,6 +69,11 @@ Additional status code for a bad pointer DEALLOCATE. #define FORTRAN_RUNTIME_STAT_BAD_POINTER_DEALLOCATION 110 #if 0 +Status codes for GETCWD. +#endif +#define FORTRAN_RUNTIME_STAT_MISSING_CWD 111 + +#if 0 ieee_class_type values The sequence is that of F18 Clause 17.2p3, but nothing depends on that. 
#endif diff --git a/flang/include/flang/Tools/CLOptions.inc b/flang/include/flang/Tools/CLOptions.inc index c5c35e9..d0399d6 100644 --- a/flang/include/flang/Tools/CLOptions.inc +++ b/flang/include/flang/Tools/CLOptions.inc @@ -233,9 +233,8 @@ inline void addBoxedProcedurePass(mlir::PassManager &pm) { inline void addExternalNameConversionPass( mlir::PassManager &pm, bool appendUnderscore = true) { - addPassConditionally(pm, disableExternalNameConversion, [&]() { - return fir::createExternalNameConversionPass(appendUnderscore); - }); + addPassConditionally(pm, disableExternalNameConversion, + [&]() { return fir::createExternalNameConversion({appendUnderscore}); }); } // Use inliner extension point callback to register the default inliner pass. diff --git a/flang/lib/Evaluate/intrinsics.cpp b/flang/lib/Evaluate/intrinsics.cpp index 12d1342..58c1b69 100644 --- a/flang/lib/Evaluate/intrinsics.cpp +++ b/flang/lib/Evaluate/intrinsics.cpp @@ -514,6 +514,10 @@ static const IntrinsicInterface genericIntrinsicFunction[]{ {"gamma", {{"x", SameReal}}, SameReal}, {"get_team", {{"level", DefaultInt, Rank::scalar, Optionality::optional}}, TeamType, Rank::scalar, IntrinsicClass::transformationalFunction}, + {"getcwd", + {{"c", DefaultChar, Rank::scalar, Optionality::required, + common::Intent::Out}}, + TypePattern{IntType, KindCode::greaterOrEqualToKind, 4}}, {"getpid", {}, DefaultInt}, {"huge", {{"x", SameIntOrReal, Rank::anyOrAssumedRank, Optionality::required, @@ -1406,6 +1410,12 @@ static const IntrinsicInterface intrinsicSubroutine[]{ {"errmsg", DefaultChar, Rank::scalar, Optionality::optional, common::Intent::InOut}}, {}, Rank::elemental, IntrinsicClass::impureSubroutine}, + {"getcwd", + {{"c", DefaultChar, Rank::scalar, Optionality::required, + common::Intent::Out}, + {"status", TypePattern{IntType, KindCode::greaterOrEqualToKind, 4}, + Rank::scalar, Optionality::optional, common::Intent::Out}}, + {}, Rank::elemental, IntrinsicClass::impureSubroutine}, {"move_alloc", 
{{"from", SameType, Rank::known, Optionality::required, common::Intent::InOut}, @@ -2574,7 +2584,7 @@ bool IntrinsicProcTable::Implementation::IsDualIntrinsic( const std::string &name) const { // Collection for some intrinsics with function and subroutine form, // in order to pass the semantic check. - static const std::string dualIntrinsic[]{{"etime"}}; + static const std::string dualIntrinsic[]{{"etime"}, {"getcwd"}}; return std::find_if(std::begin(dualIntrinsic), std::end(dualIntrinsic), [&name](const std::string &dualName) { diff --git a/flang/lib/Lower/OpenMP/ClauseProcessor.cpp b/flang/lib/Lower/OpenMP/ClauseProcessor.cpp index 68619f6..d289f2f 100644 --- a/flang/lib/Lower/OpenMP/ClauseProcessor.cpp +++ b/flang/lib/Lower/OpenMP/ClauseProcessor.cpp @@ -175,7 +175,7 @@ static void addUseDeviceClause( useDeviceLocs.push_back(operand.getLoc()); } for (const omp::Object &object : objects) - useDeviceSyms.push_back(object.id()); + useDeviceSyms.push_back(object.sym()); } static void convertLoopBounds(lower::AbstractConverter &converter, @@ -525,7 +525,7 @@ bool ClauseProcessor::processCopyin() const { bool hasCopyin = findRepeatableClause<omp::clause::Copyin>( [&](const omp::clause::Copyin &clause, const parser::CharBlock &) { for (const omp::Object &object : clause.v) { - semantics::Symbol *sym = object.id(); + semantics::Symbol *sym = object.sym(); assert(sym && "Expecting symbol"); if (const auto *commonDetails = sym->detailsIf<semantics::CommonBlockDetails>()) { @@ -698,7 +698,7 @@ bool ClauseProcessor::processCopyprivate( bool hasCopyPrivate = findRepeatableClause<clause::Copyprivate>( [&](const clause::Copyprivate &clause, const parser::CharBlock &) { for (const Object &object : clause.v) { - semantics::Symbol *sym = object.id(); + semantics::Symbol *sym = object.sym(); if (const auto *commonDetails = sym->detailsIf<semantics::CommonBlockDetails>()) { for (const auto &mem : commonDetails->objects()) @@ -739,7 +739,7 @@ bool 
ClauseProcessor::processDepend(mlir::omp::DependClauseOps &result) const { "array sections not supported for task depend"); } - semantics::Symbol *sym = object.id(); + semantics::Symbol *sym = object.sym(); const mlir::Value variable = converter.getSymbolAddress(*sym); result.dependVars.push_back(variable); } @@ -870,11 +870,11 @@ bool ClauseProcessor::processMap( lower::AddrAndBoundsInfo info = lower::gatherDataOperandAddrAndBounds<mlir::omp::MapBoundsOp, mlir::omp::MapBoundsType>( - converter, firOpBuilder, semaCtx, stmtCtx, *object.id(), + converter, firOpBuilder, semaCtx, stmtCtx, *object.sym(), object.ref(), clauseLocation, asFortran, bounds, treatIndexAsSection); - auto origSymbol = converter.getSymbolAddress(*object.id()); + auto origSymbol = converter.getSymbolAddress(*object.sym()); mlir::Value symAddr = info.addr; if (origSymbol && fir::isTypeWithDescriptor(origSymbol.getType())) symAddr = origSymbol; @@ -894,12 +894,12 @@ bool ClauseProcessor::processMap( mapTypeBits), mlir::omp::VariableCaptureKind::ByRef, symAddr.getType()); - if (object.id()->owner().IsDerivedType()) { + if (object.sym()->owner().IsDerivedType()) { addChildIndexAndMapToParent(object, parentMemberIndices, mapOp, semaCtx); } else { result.mapVars.push_back(mapOp); - ptrMapSyms->push_back(object.id()); + ptrMapSyms->push_back(object.sym()); if (mapSymTypes) mapSymTypes->push_back(symAddr.getType()); if (mapSymLocs) diff --git a/flang/lib/Lower/OpenMP/ClauseProcessor.h b/flang/lib/Lower/OpenMP/ClauseProcessor.h index 4d3d444..28f2669 100644 --- a/flang/lib/Lower/OpenMP/ClauseProcessor.h +++ b/flang/lib/Lower/OpenMP/ClauseProcessor.h @@ -205,11 +205,11 @@ bool ClauseProcessor::processMotionClauses(lower::StatementContext &stmtCtx, lower::AddrAndBoundsInfo info = lower::gatherDataOperandAddrAndBounds<mlir::omp::MapBoundsOp, mlir::omp::MapBoundsType>( - converter, firOpBuilder, semaCtx, stmtCtx, *object.id(), + converter, firOpBuilder, semaCtx, stmtCtx, *object.sym(), object.ref(), 
clauseLocation, asFortran, bounds, treatIndexAsSection); - auto origSymbol = converter.getSymbolAddress(*object.id()); + auto origSymbol = converter.getSymbolAddress(*object.sym()); mlir::Value symAddr = info.addr; if (origSymbol && fir::isTypeWithDescriptor(origSymbol.getType())) symAddr = origSymbol; @@ -226,12 +226,12 @@ bool ClauseProcessor::processMotionClauses(lower::StatementContext &stmtCtx, mapTypeBits), mlir::omp::VariableCaptureKind::ByRef, symAddr.getType()); - if (object.id()->owner().IsDerivedType()) { + if (object.sym()->owner().IsDerivedType()) { addChildIndexAndMapToParent(object, parentMemberIndices, mapOp, semaCtx); } else { result.mapVars.push_back(mapOp); - mapSymbols.push_back(object.id()); + mapSymbols.push_back(object.sym()); } } }); diff --git a/flang/lib/Lower/OpenMP/Clauses.h b/flang/lib/Lower/OpenMP/Clauses.h index 5391b13..98fb5dc 100644 --- a/flang/lib/Lower/OpenMP/Clauses.h +++ b/flang/lib/Lower/OpenMP/Clauses.h @@ -21,6 +21,10 @@ #include <type_traits> #include <utility> +namespace Fortran::semantics { +class Symbol; +} + namespace Fortran::lower::omp { using namespace Fortran; using SomeExpr = semantics::SomeExpr; @@ -32,29 +36,64 @@ struct TypeTy : public evaluate::SomeType { bool operator==(const TypeTy &t) const { return true; } }; -using IdTy = semantics::Symbol *; +template <typename ExprTy> +struct IdTyTemplate { + // "symbol" is always non-null for id's of actual objects. + Fortran::semantics::Symbol *symbol; + std::optional<ExprTy> designator; + + bool operator==(const IdTyTemplate &other) const { + // If symbols are different, then the objects are different. + if (symbol != other.symbol) + return false; + if (symbol == nullptr) + return true; + // Equal symbols don't necessarily indicate identical objects, + // for example, a derived object component may use a single symbol, + // which will refer to different objects for different designators, + // e.g. a%c and b%c. 
+ return designator == other.designator; + } + + operator bool() const { return symbol != nullptr; } +}; + using ExprTy = SomeExpr; template <typename T> using List = tomp::ListT<T>; } // namespace Fortran::lower::omp +// Specialization of the ObjectT template namespace tomp::type { template <> -struct ObjectT<Fortran::lower::omp::IdTy, Fortran::lower::omp::ExprTy> { - using IdTy = Fortran::lower::omp::IdTy; +struct ObjectT<Fortran::lower::omp::IdTyTemplate<Fortran::lower::omp::ExprTy>, + Fortran::lower::omp::ExprTy> { + using IdTy = Fortran::lower::omp::IdTyTemplate<Fortran::lower::omp::ExprTy>; using ExprTy = Fortran::lower::omp::ExprTy; - const IdTy &id() const { return symbol; } - const std::optional<ExprTy> &ref() const { return designator; } + IdTy id() const { return identity; } + Fortran::semantics::Symbol *sym() const { return identity.symbol; } + const std::optional<ExprTy> &ref() const { return identity.designator; } - IdTy symbol; - std::optional<ExprTy> designator; + IdTy identity; }; } // namespace tomp::type namespace Fortran::lower::omp { +using IdTy = IdTyTemplate<ExprTy>; +} + +namespace std { +template <> +struct hash<Fortran::lower::omp::IdTy> { + size_t operator()(const Fortran::lower::omp::IdTy &id) const { + return static_cast<size_t>(reinterpret_cast<uintptr_t>(id.symbol)); + } +}; +} // namespace std +namespace Fortran::lower::omp { using Object = tomp::ObjectT<IdTy, ExprTy>; using ObjectList = tomp::ObjectListT<IdTy, ExprTy>; diff --git a/flang/lib/Lower/OpenMP/DataSharingProcessor.cpp b/flang/lib/Lower/OpenMP/DataSharingProcessor.cpp index 557a968..b206040 100644 --- a/flang/lib/Lower/OpenMP/DataSharingProcessor.cpp +++ b/flang/lib/Lower/OpenMP/DataSharingProcessor.cpp @@ -139,7 +139,7 @@ void DataSharingProcessor::collectOmpObjectListSymbol( const omp::ObjectList &objects, llvm::SetVector<const semantics::Symbol *> &symbolSet) { for (const omp::Object &object : objects) - symbolSet.insert(object.id()); + symbolSet.insert(object.sym()); } 
void DataSharingProcessor::collectSymbolsForPrivatization() { diff --git a/flang/lib/Lower/OpenMP/DataSharingProcessor.h b/flang/lib/Lower/OpenMP/DataSharingProcessor.h index 80a956d..fb340e6 100644 --- a/flang/lib/Lower/OpenMP/DataSharingProcessor.h +++ b/flang/lib/Lower/OpenMP/DataSharingProcessor.h @@ -44,7 +44,9 @@ private: void Post(const T &) {} bool Pre(const parser::OpenMPConstruct &omp) { - currentConstruct = &omp; + // Skip constructs that may not have privatizations. + if (!std::holds_alternative<parser::OpenMPCriticalConstruct>(omp.u)) + currentConstruct = &omp; return true; } diff --git a/flang/lib/Lower/OpenMP/OpenMP.cpp b/flang/lib/Lower/OpenMP/OpenMP.cpp index af9e2af..f84440d 100644 --- a/flang/lib/Lower/OpenMP/OpenMP.cpp +++ b/flang/lib/Lower/OpenMP/OpenMP.cpp @@ -1434,7 +1434,7 @@ genSectionsOp(lower::AbstractConverter &converter, lower::SymMap &symTable, mlir::OpBuilder::InsertPoint insp = builder.saveInsertionPoint(); const auto &objList = std::get<ObjectList>(lastp->t); for (const Object &object : objList) { - semantics::Symbol *sym = object.id(); + semantics::Symbol *sym = object.sym(); converter.copyHostAssociateVar(*sym, &insp); } } diff --git a/flang/lib/Lower/OpenMP/ReductionProcessor.cpp b/flang/lib/Lower/OpenMP/ReductionProcessor.cpp index 1a63e31..60e933f 100644 --- a/flang/lib/Lower/OpenMP/ReductionProcessor.cpp +++ b/flang/lib/Lower/OpenMP/ReductionProcessor.cpp @@ -37,7 +37,7 @@ namespace omp { ReductionProcessor::ReductionIdentifier ReductionProcessor::getReductionType( const omp::clause::ProcedureDesignator &pd) { auto redType = llvm::StringSwitch<std::optional<ReductionIdentifier>>( - getRealName(pd.v.id()).ToString()) + getRealName(pd.v.sym()).ToString()) .Case("max", ReductionIdentifier::MAX) .Case("min", ReductionIdentifier::MIN) .Case("iand", ReductionIdentifier::IAND) @@ -72,7 +72,7 @@ ReductionProcessor::ReductionIdentifier ReductionProcessor::getReductionType( bool ReductionProcessor::supportedIntrinsicProcReduction( const 
omp::clause::ProcedureDesignator &pd) { - semantics::Symbol *sym = pd.v.id(); + semantics::Symbol *sym = pd.v.sym(); if (!sym->GetUltimate().attrs().test(semantics::Attr::INTRINSIC)) return false; auto redType = llvm::StringSwitch<bool>(getRealName(sym).ToString()) @@ -707,7 +707,7 @@ void ReductionProcessor::addDeclareReduction( // should happen byref fir::FirOpBuilder &builder = converter.getFirOpBuilder(); for (const Object &object : objectList) { - const semantics::Symbol *symbol = object.id(); + const semantics::Symbol *symbol = object.sym(); if (reductionSymbols) reductionSymbols->push_back(symbol); mlir::Value symVal = converter.getSymbolAddress(*symbol); @@ -825,7 +825,7 @@ ReductionProcessor::getRealName(const semantics::Symbol *symbol) { const semantics::SourceName ReductionProcessor::getRealName(const omp::clause::ProcedureDesignator &pd) { - return getRealName(pd.v.id()); + return getRealName(pd.v.sym()); } int ReductionProcessor::getOperationIdentity(ReductionIdentifier redId, diff --git a/flang/lib/Lower/OpenMP/Utils.cpp b/flang/lib/Lower/OpenMP/Utils.cpp index 4d665e6..da94352 100644 --- a/flang/lib/Lower/OpenMP/Utils.cpp +++ b/flang/lib/Lower/OpenMP/Utils.cpp @@ -55,7 +55,7 @@ void genObjectList(const ObjectList &objects, lower::AbstractConverter &converter, llvm::SmallVectorImpl<mlir::Value> &operands) { for (const Object &object : objects) { - const semantics::Symbol *sym = object.id(); + const semantics::Symbol *sym = object.sym(); assert(sym && "Expected Symbol"); if (mlir::Value variable = converter.getSymbolAddress(*sym)) { operands.push_back(variable); @@ -107,7 +107,7 @@ void gatherFuncAndVarSyms( const ObjectList &objects, mlir::omp::DeclareTargetCaptureClause clause, llvm::SmallVectorImpl<DeclareTargetCapturePair> &symbolAndClause) { for (const Object &object : objects) - symbolAndClause.emplace_back(clause, *object.id()); + symbolAndClause.emplace_back(clause, *object.sym()); } mlir::omp::MapInfoOp @@ -175,7 +175,7 @@ 
generateMemberPlacementIndices(const Object &object, semantics::SemanticsContext &semaCtx) { auto compObj = getComponentObject(object, semaCtx); while (compObj) { - indices.push_back(getComponentPlacementInParent(compObj->id())); + indices.push_back(getComponentPlacementInParent(compObj->sym())); compObj = getComponentObject(getBaseObject(compObj.value(), semaCtx), semaCtx); } @@ -188,7 +188,7 @@ void addChildIndexAndMapToParent( std::map<const semantics::Symbol *, llvm::SmallVector<OmpMapMemberIndicesData>> &parentMemberIndices, mlir::omp::MapInfoOp &mapOp, semantics::SemanticsContext &semaCtx) { - std::optional<evaluate::DataRef> dataRef = ExtractDataRef(object.designator); + std::optional<evaluate::DataRef> dataRef = ExtractDataRef(object.ref()); assert(dataRef.has_value() && "DataRef could not be extracted during mapping of derived type " "cannot proceed"); diff --git a/flang/lib/Optimizer/Builder/IntrinsicCall.cpp b/flang/lib/Optimizer/Builder/IntrinsicCall.cpp index 1cd3976..d3f6fa1 100644 --- a/flang/lib/Optimizer/Builder/IntrinsicCall.cpp +++ b/flang/lib/Optimizer/Builder/IntrinsicCall.cpp @@ -280,6 +280,10 @@ static constexpr IntrinsicHandler handlers[]{ {"trim_name", asAddr, handleDynamicOptional}, {"errmsg", asBox, handleDynamicOptional}}}, /*isElemental=*/false}, + {"getcwd", + &I::genGetCwd, + {{{"c", asBox}, {"status", asAddr, handleDynamicOptional}}}, + /*isElemental=*/false}, {"getpid", &I::genGetPID}, {"iachar", &I::genIchar}, {"iall", @@ -3476,6 +3480,37 @@ mlir::Value IntrinsicLibrary::genFraction(mlir::Type resultType, fir::runtime::genFraction(builder, loc, fir::getBase(args[0]))); } +// GETCWD +fir::ExtendedValue +IntrinsicLibrary::genGetCwd(std::optional<mlir::Type> resultType, + llvm::ArrayRef<fir::ExtendedValue> args) { + assert((args.size() == 1 && resultType.has_value()) || + (args.size() >= 1 && !resultType.has_value())); + + mlir::Value cwd = fir::getBase(args[0]); + mlir::Value statusValue = fir::runtime::genGetCwd(builder, loc, cwd); 
+ + if (resultType.has_value()) { + // Function form, return status. + return statusValue; + } else { + // Subroutine form, store status and return none. + const fir::ExtendedValue &status = args[1]; + if (!isStaticallyAbsent(status)) { + mlir::Value statusAddr = fir::getBase(status); + mlir::Value statusIsPresentAtRuntime = + builder.genIsNotNullAddr(loc, statusAddr); + builder.genIfThen(loc, statusIsPresentAtRuntime) + .genThen([&]() { + builder.createStoreWithConvert(loc, statusValue, statusAddr); + }) + .end(); + } + } + + return {}; +} + // GET_COMMAND void IntrinsicLibrary::genGetCommand(llvm::ArrayRef<fir::ExtendedValue> args) { assert(args.size() == 4); @@ -4965,10 +5000,6 @@ fir::ExtendedValue IntrinsicLibrary::genIsContiguous(mlir::Type resultType, llvm::ArrayRef<fir::ExtendedValue> args) { assert(args.size() == 1); - if (const auto *boxValue = args[0].getBoxOf<fir::BoxValue>()) - if (boxValue->hasAssumedRank()) - TODO(loc, "intrinsic: is_contiguous with assumed rank argument"); - return builder.createConvert( loc, resultType, fir::runtime::genIsContiguous(builder, loc, fir::getBase(args[0]))); diff --git a/flang/lib/Optimizer/Builder/MutableBox.cpp b/flang/lib/Optimizer/Builder/MutableBox.cpp index 76b920d..16e543f 100644 --- a/flang/lib/Optimizer/Builder/MutableBox.cpp +++ b/flang/lib/Optimizer/Builder/MutableBox.cpp @@ -394,6 +394,8 @@ static bool readToBoxValue(const fir::MutableBoxValue &box, // Track value as fir.box if ((box.isDerived() && mayBePolymorphic) || box.isUnlimitedPolymorphic()) return true; + if (box.hasAssumedRank()) + return true; // Intrinsic allocatables are contiguous, no need to track the value by // fir.box. 
if (box.isAllocatable() || box.rank() == 0) @@ -409,14 +411,12 @@ fir::factory::genMutableBoxRead(fir::FirOpBuilder &builder, mlir::Location loc, const fir::MutableBoxValue &box, bool mayBePolymorphic, bool preserveLowerBounds) { - if (box.hasAssumedRank()) - TODO(loc, "assumed rank allocatables or pointers"); llvm::SmallVector<mlir::Value> lbounds; llvm::SmallVector<mlir::Value> extents; llvm::SmallVector<mlir::Value> lengths; if (readToBoxValue(box, mayBePolymorphic)) { auto reader = MutablePropertyReader(builder, loc, box); - if (preserveLowerBounds) + if (preserveLowerBounds && !box.hasAssumedRank()) reader.getLowerBounds(lbounds); return fir::BoxValue{reader.getIrBox(), lbounds, box.nonDeferredLenParams()}; diff --git a/flang/lib/Optimizer/Builder/Runtime/Command.cpp b/flang/lib/Optimizer/Builder/Runtime/Command.cpp index 1d719e7..8320d89 100644 --- a/flang/lib/Optimizer/Builder/Runtime/Command.cpp +++ b/flang/lib/Optimizer/Builder/Runtime/Command.cpp @@ -88,3 +88,16 @@ mlir::Value fir::runtime::genGetEnvVariable(fir::FirOpBuilder &builder, sourceFile, sourceLine); return builder.create<fir::CallOp>(loc, runtimeFunc, args).getResult(0); } + +mlir::Value fir::runtime::genGetCwd(fir::FirOpBuilder &builder, + mlir::Location loc, mlir::Value cwd) { + mlir::func::FuncOp func = + fir::runtime::getRuntimeFunc<mkRTKey(GetCwd)>(loc, builder); + auto runtimeFuncTy = func.getFunctionType(); + mlir::Value sourceFile = fir::factory::locationToFilename(builder, loc); + mlir::Value sourceLine = + fir::factory::locationToLineNo(builder, loc, runtimeFuncTy.getInput(2)); + llvm::SmallVector<mlir::Value> args = fir::runtime::createArguments( + builder, loc, runtimeFuncTy, cwd, sourceFile, sourceLine); + return builder.create<fir::CallOp>(loc, func, args).getResult(0); +} diff --git a/flang/lib/Optimizer/Dialect/FIRAttr.cpp b/flang/lib/Optimizer/Dialect/FIRAttr.cpp index 2faba63..a0202a0 100644 --- a/flang/lib/Optimizer/Dialect/FIRAttr.cpp +++ 
b/flang/lib/Optimizer/Dialect/FIRAttr.cpp @@ -297,6 +297,6 @@ void fir::printFirAttribute(FIROpsDialect *dialect, mlir::Attribute attr, void FIROpsDialect::registerAttributes() { addAttributes<ClosedIntervalAttr, ExactTypeAttr, FortranVariableFlagsAttr, - LowerBoundAttr, PointIntervalAttr, RealAttr, SubclassAttr, - UpperBoundAttr>(); + LowerBoundAttr, PointIntervalAttr, RealAttr, ReduceAttr, + SubclassAttr, UpperBoundAttr>(); } diff --git a/flang/lib/Optimizer/Dialect/FIROps.cpp b/flang/lib/Optimizer/Dialect/FIROps.cpp index b530a9d..75ca738 100644 --- a/flang/lib/Optimizer/Dialect/FIROps.cpp +++ b/flang/lib/Optimizer/Dialect/FIROps.cpp @@ -2456,9 +2456,16 @@ void fir::DoLoopOp::build(mlir::OpBuilder &builder, mlir::OperationState &result, mlir::Value lb, mlir::Value ub, mlir::Value step, bool unordered, bool finalCountValue, mlir::ValueRange iterArgs, + mlir::ValueRange reduceOperands, + llvm::ArrayRef<mlir::Attribute> reduceAttrs, llvm::ArrayRef<mlir::NamedAttribute> attributes) { result.addOperands({lb, ub, step}); + result.addOperands(reduceOperands); result.addOperands(iterArgs); + result.addAttribute(getOperandSegmentSizeAttr(), + builder.getDenseI32ArrayAttr( + {1, 1, 1, static_cast<int32_t>(reduceOperands.size()), + static_cast<int32_t>(iterArgs.size())})); if (finalCountValue) { result.addTypes(builder.getIndexType()); result.addAttribute(getFinalValueAttrName(result.name), @@ -2477,6 +2484,9 @@ void fir::DoLoopOp::build(mlir::OpBuilder &builder, if (unordered) result.addAttribute(getUnorderedAttrName(result.name), builder.getUnitAttr()); + if (!reduceAttrs.empty()) + result.addAttribute(getReduceAttrsAttrName(result.name), + builder.getArrayAttr(reduceAttrs)); result.addAttributes(attributes); } @@ -2502,24 +2512,51 @@ mlir::ParseResult fir::DoLoopOp::parse(mlir::OpAsmParser &parser, if (mlir::succeeded(parser.parseOptionalKeyword("unordered"))) result.addAttribute("unordered", builder.getUnitAttr()); + // Parse the reduction arguments. 
+ llvm::SmallVector<mlir::OpAsmParser::UnresolvedOperand> reduceOperands; + llvm::SmallVector<mlir::Type> reduceArgTypes; + if (succeeded(parser.parseOptionalKeyword("reduce"))) { + // Parse reduction attributes and variables. + llvm::SmallVector<ReduceAttr> attributes; + if (failed(parser.parseCommaSeparatedList( + mlir::AsmParser::Delimiter::Paren, [&]() { + if (parser.parseAttribute(attributes.emplace_back()) || + parser.parseArrow() || + parser.parseOperand(reduceOperands.emplace_back()) || + parser.parseColonType(reduceArgTypes.emplace_back())) + return mlir::failure(); + return mlir::success(); + }))) + return mlir::failure(); + // Resolve input operands. + for (auto operand_type : llvm::zip(reduceOperands, reduceArgTypes)) + if (parser.resolveOperand(std::get<0>(operand_type), + std::get<1>(operand_type), result.operands)) + return mlir::failure(); + llvm::SmallVector<mlir::Attribute> arrayAttr(attributes.begin(), + attributes.end()); + result.addAttribute(getReduceAttrsAttrName(result.name), + builder.getArrayAttr(arrayAttr)); + } + // Parse the optional initial iteration arguments. llvm::SmallVector<mlir::OpAsmParser::Argument> regionArgs; - llvm::SmallVector<mlir::OpAsmParser::UnresolvedOperand> operands; + llvm::SmallVector<mlir::OpAsmParser::UnresolvedOperand> iterOperands; llvm::SmallVector<mlir::Type> argTypes; bool prependCount = false; regionArgs.push_back(inductionVariable); if (succeeded(parser.parseOptionalKeyword("iter_args"))) { // Parse assignment list and results type list. - if (parser.parseAssignmentList(regionArgs, operands) || + if (parser.parseAssignmentList(regionArgs, iterOperands) || parser.parseArrowTypeList(result.types)) return mlir::failure(); - if (result.types.size() == operands.size() + 1) + if (result.types.size() == iterOperands.size() + 1) prependCount = true; // Resolve input operands. llvm::ArrayRef<mlir::Type> resTypes = result.types; - for (auto operand_type : - llvm::zip(operands, prependCount ? 
resTypes.drop_front() : resTypes)) + for (auto operand_type : llvm::zip( + iterOperands, prependCount ? resTypes.drop_front() : resTypes)) if (parser.resolveOperand(std::get<0>(operand_type), std::get<1>(operand_type), result.operands)) return mlir::failure(); @@ -2530,6 +2567,12 @@ mlir::ParseResult fir::DoLoopOp::parse(mlir::OpAsmParser &parser, prependCount = true; } + // Set the operandSegmentSizes attribute + result.addAttribute(getOperandSegmentSizeAttr(), + builder.getDenseI32ArrayAttr( + {1, 1, 1, static_cast<int32_t>(reduceOperands.size()), + static_cast<int32_t>(iterOperands.size())})); + if (parser.parseOptionalAttrDictWithKeyword(result.attributes)) return mlir::failure(); @@ -2606,6 +2649,10 @@ mlir::LogicalResult fir::DoLoopOp::verify() { i++; } + auto reduceAttrs = getReduceAttrsAttr(); + if (getNumReduceOperands() != (reduceAttrs ? reduceAttrs.size() : 0)) + return emitOpError( + "mismatch in number of reduction variables and reduction attributes"); return mlir::success(); } @@ -2615,6 +2662,17 @@ void fir::DoLoopOp::print(mlir::OpAsmPrinter &p) { << getUpperBound() << " step " << getStep(); if (getUnordered()) p << " unordered"; + if (hasReduceOperands()) { + p << " reduce("; + auto attrs = getReduceAttrsAttr(); + auto operands = getReduceOperands(); + llvm::interleaveComma(llvm::zip(attrs, operands), p, [&](auto it) { + p << std::get<0>(it) << " -> " << std::get<1>(it) << " : " + << std::get<1>(it).getType(); + }); + p << ')'; + printBlockTerminators = true; + } if (hasIterOperands()) { p << " iter_args("; auto regionArgs = getRegionIterArgs(); @@ -2628,8 +2686,9 @@ void fir::DoLoopOp::print(mlir::OpAsmPrinter &p) { p << " -> " << getResultTypes(); printBlockTerminators = true; } - p.printOptionalAttrDictWithKeyword((*this)->getAttrs(), - {"unordered", "finalValue"}); + p.printOptionalAttrDictWithKeyword( + (*this)->getAttrs(), + {"unordered", "finalValue", "reduceAttrs", "operandSegmentSizes"}); p << ' '; p.printRegion(getRegion(), 
/*printEntryBlockArgs=*/false, printBlockTerminators); diff --git a/flang/lib/Optimizer/Transforms/ExternalNameConversion.cpp b/flang/lib/Optimizer/Transforms/ExternalNameConversion.cpp index b265c74..648628fd 100644 --- a/flang/lib/Optimizer/Transforms/ExternalNameConversion.cpp +++ b/flang/lib/Optimizer/Transforms/ExternalNameConversion.cpp @@ -45,17 +45,11 @@ namespace { class ExternalNameConversionPass : public fir::impl::ExternalNameConversionBase<ExternalNameConversionPass> { public: - ExternalNameConversionPass(bool appendUnderscoring) - : appendUnderscores(appendUnderscoring) {} - - ExternalNameConversionPass() { usePassOpt = true; } + using ExternalNameConversionBase< + ExternalNameConversionPass>::ExternalNameConversionBase; mlir::ModuleOp getModule() { return getOperation(); } void runOnOperation() override; - -private: - bool appendUnderscores; - bool usePassOpt = false; }; } // namespace @@ -63,7 +57,6 @@ void ExternalNameConversionPass::runOnOperation() { auto op = getOperation(); auto *context = &getContext(); - appendUnderscores = (usePassOpt) ? appendUnderscoreOpt : appendUnderscores; llvm::DenseMap<mlir::StringAttr, mlir::FlatSymbolRefAttr> remappings; // Update names of external Fortran functions and names of Common Block // globals. 
@@ -74,7 +67,8 @@ void ExternalNameConversionPass::runOnOperation() { mlir::SymbolTable::getSymbolAttrName()); auto deconstructedName = fir::NameUniquer::deconstruct(symName); if (fir::NameUniquer::isExternalFacingUniquedName(deconstructedName)) { - auto newName = mangleExternalName(deconstructedName, appendUnderscores); + auto newName = + mangleExternalName(deconstructedName, appendUnderscoreOpt); auto newAttr = mlir::StringAttr::get(context, newName); mlir::SymbolTable::setSymbolName(&funcOrGlobal, newAttr); auto newSymRef = mlir::FlatSymbolRefAttr::get(newAttr); @@ -101,12 +95,3 @@ void ExternalNameConversionPass::runOnOperation() { nestedOp->setAttr(update.first, update.second); }); } - -std::unique_ptr<mlir::Pass> fir::createExternalNameConversionPass() { - return std::make_unique<ExternalNameConversionPass>(); -} - -std::unique_ptr<mlir::Pass> -fir::createExternalNameConversionPass(bool appendUnderscoring) { - return std::make_unique<ExternalNameConversionPass>(appendUnderscoring); -} diff --git a/flang/runtime/command.cpp b/flang/runtime/command.cpp index b573c5d..e642248a 100644 --- a/flang/runtime/command.cpp +++ b/flang/runtime/command.cpp @@ -17,12 +17,19 @@ #ifdef _WIN32 #include "flang/Common/windows-include.h" +#include <direct.h> +#define getcwd _getcwd +#define PATH_MAX MAX_PATH // On Windows GetCurrentProcessId returns a DWORD aka uint32_t #include <processthreadsapi.h> inline pid_t getpid() { return GetCurrentProcessId(); } #else #include <unistd.h> //getpid() + +#ifndef PATH_MAX +#define PATH_MAX 4096 +#endif #endif namespace Fortran::runtime { @@ -239,4 +246,23 @@ std::int32_t RTNAME(GetEnvVariable)(const Descriptor &name, return StatOk; } +std::int32_t RTNAME(GetCwd)( + const Descriptor &cwd, const char *sourceFile, int line) { + Terminator terminator{sourceFile, line}; + + RUNTIME_CHECK(terminator, IsValidCharDescriptor(&cwd)); + + char *buf{(char *)AllocateMemoryOrCrash(terminator, PATH_MAX)}; + + if (!getcwd(buf, PATH_MAX)) { + return 
StatMissingCurrentWorkDirectory; + } + + std::int64_t strLen{StringLength(buf)}; + std::int32_t status{CopyCharsToDescriptor(cwd, buf, strLen)}; + + std::free(buf); + return status; +} + } // namespace Fortran::runtime diff --git a/flang/runtime/stat.h b/flang/runtime/stat.h index 4f46f52..71faeb0 100644 --- a/flang/runtime/stat.h +++ b/flang/runtime/stat.h @@ -41,6 +41,7 @@ enum Stat { StatLocked = FORTRAN_RUNTIME_STAT_LOCKED, StatLockedOtherImage = FORTRAN_RUNTIME_STAT_LOCKED_OTHER_IMAGE, StatMissingEnvVariable = FORTRAN_RUNTIME_STAT_MISSING_ENV_VAR, + StatMissingCurrentWorkDirectory = FORTRAN_RUNTIME_STAT_MISSING_CWD, StatStoppedImage = FORTRAN_RUNTIME_STAT_STOPPED_IMAGE, StatUnlocked = FORTRAN_RUNTIME_STAT_UNLOCKED, StatUnlockedFailedImage = FORTRAN_RUNTIME_STAT_UNLOCKED_FAILED_IMAGE, diff --git a/flang/test/Fir/loop03.fir b/flang/test/Fir/loop03.fir new file mode 100644 index 0000000..b88dcaf --- /dev/null +++ b/flang/test/Fir/loop03.fir @@ -0,0 +1,17 @@ +// Test the reduction semantics of fir.do_loop +// RUN: fir-opt %s | FileCheck %s + +func.func @reduction() { + %bound = arith.constant 10 : index + %step = arith.constant 1 : index + %sum = fir.alloca i32 +// CHECK: %[[VAL_0:.*]] = fir.alloca i32 +// CHECK: fir.do_loop %[[VAL_1:.*]] = %[[VAL_2:.*]] to %[[VAL_3:.*]] step %[[VAL_4:.*]] unordered reduce(#fir.reduce_attr<add> -> %[[VAL_0]] : !fir.ref<i32>) { + fir.do_loop %iv = %step to %bound step %step unordered reduce(#fir.reduce_attr<add> -> %sum : !fir.ref<i32>) { + %index = fir.convert %iv : (index) -> i32 + %1 = fir.load %sum : !fir.ref<i32> + %2 = arith.addi %index, %1 : i32 + fir.store %2 to %sum : !fir.ref<i32> + } + return +} diff --git a/flang/test/Lower/HLFIR/assumed-rank-inquiries.f90 b/flang/test/Lower/HLFIR/assumed-rank-inquiries.f90 new file mode 100644 index 0000000..e8610aa --- /dev/null +++ b/flang/test/Lower/HLFIR/assumed-rank-inquiries.f90 @@ -0,0 +1,383 @@ +! Test lowering of inquiry intrinsics with assumed-ranks arguments. +! 
RUN: bbc -emit-hlfir -o - %s -allow-assumed-rank | FileCheck %s + +subroutine test_allocated(x) + real, allocatable :: x(..) + call takes_logical(allocated(x)) +end subroutine + +subroutine test_associated_1(x) + real, pointer :: x(..) + call takes_logical(associated(x)) +end subroutine + +subroutine test_associated_2(x, y) + real, pointer :: x(..) + real, target :: y(:) + call takes_logical(associated(x, y)) +end subroutine + +subroutine test_associated_3(x, y) + real, pointer :: x(..) + real, pointer :: y(..) + call takes_logical(associated(x, y)) +end subroutine + +subroutine test_len_1(x) + character(*) :: x(..) + call takes_integer(len(x)) +end subroutine + +subroutine test_len_2(x) + character(*), pointer :: x(..) + call takes_integer(len(x)) +end subroutine + +subroutine test_storage_size_1(x) + class(*) :: x(..) + call takes_integer(storage_size(x)) +end subroutine + +subroutine test_storage_size_2(x) + class(*), pointer :: x(..) + call takes_integer(storage_size(x)) +end subroutine + +subroutine test_present_1(x) + class(*), optional :: x(..) + call takes_logical(present(x)) +end subroutine + +subroutine test_present_2(x) + class(*), optional, pointer :: x(..) + call takes_logical(present(x)) +end subroutine + +subroutine test_is_contiguous_1(x) + class(*) :: x(..) + call takes_logical(is_contiguous(x)) +end subroutine + +subroutine test_is_contiguous_2(x) + class(*), pointer :: x(..) + call takes_logical(is_contiguous(x)) +end subroutine + +subroutine test_same_type_as_1(x, y) + class(*) :: x(..), y(..) + call takes_logical(same_type_as(x, y)) +end subroutine + +subroutine test_same_type_as_2(x, y) + class(*), pointer :: x(..), y(..) + call takes_logical(same_type_as(x, y)) +end subroutine + +subroutine test_extends_type_of_1(x, y) + class(*) :: x(..), y(..) + call takes_logical(extends_type_of(x, y)) +end subroutine + +subroutine test_extends_type_of_2(x, y) + class(*), pointer :: x(..), y(..) 
+ call takes_logical(extends_type_of(x, y)) +end subroutine + +subroutine c_loc_1(x) + use iso_c_binding, only : c_loc + real, target :: x(..) + call takes_cloc(c_loc(x)) +end subroutine + +subroutine c_loc_2(x) + use iso_c_binding, only : c_loc + real, pointer :: x(..) + call takes_cloc(c_loc(x)) +end subroutine + +! CHECK-LABEL: func.func @_QPtest_allocated( +! CHECK-SAME: %[[VAL_0:.*]]: !fir.ref<!fir.box<!fir.heap<!fir.array<*:f32>>>> {fir.bindc_name = "x"}) { +! CHECK: %[[VAL_1:.*]] = fir.dummy_scope : !fir.dscope +! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %[[VAL_1]] {fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QFtest_allocatedEx"} : (!fir.ref<!fir.box<!fir.heap<!fir.array<*:f32>>>>, !fir.dscope) -> (!fir.ref<!fir.box<!fir.heap<!fir.array<*:f32>>>>, !fir.ref<!fir.box<!fir.heap<!fir.array<*:f32>>>>) +! CHECK: %[[VAL_3:.*]] = fir.load %[[VAL_2]]#1 : !fir.ref<!fir.box<!fir.heap<!fir.array<*:f32>>>> +! CHECK: %[[VAL_4:.*]] = fir.box_addr %[[VAL_3]] : (!fir.box<!fir.heap<!fir.array<*:f32>>>) -> !fir.heap<!fir.array<*:f32>> +! CHECK: %[[VAL_5:.*]] = fir.convert %[[VAL_4]] : (!fir.heap<!fir.array<*:f32>>) -> i64 +! CHECK: %[[VAL_6:.*]] = arith.constant 0 : i64 +! CHECK: %[[VAL_7:.*]] = arith.cmpi ne, %[[VAL_5]], %[[VAL_6]] : i64 +! CHECK: %[[VAL_8:.*]] = fir.convert %[[VAL_7]] : (i1) -> !fir.logical<4> +! CHECK: %[[VAL_9:.*]]:3 = hlfir.associate %[[VAL_8]] {adapt.valuebyref} : (!fir.logical<4>) -> (!fir.ref<!fir.logical<4>>, !fir.ref<!fir.logical<4>>, i1) +! CHECK: fir.call @_QPtakes_logical(%[[VAL_9]]#1) fastmath<contract> : (!fir.ref<!fir.logical<4>>) -> () +! CHECK: hlfir.end_associate %[[VAL_9]]#1, %[[VAL_9]]#2 : !fir.ref<!fir.logical<4>>, i1 +! CHECK: return +! CHECK: } + +! CHECK-LABEL: func.func @_QPtest_associated_1( +! CHECK-SAME: %[[VAL_0:.*]]: !fir.ref<!fir.box<!fir.ptr<!fir.array<*:f32>>>> {fir.bindc_name = "x"}) { +! CHECK: %[[VAL_1:.*]] = fir.dummy_scope : !fir.dscope +! 
CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %[[VAL_1]] {fortran_attrs = #fir.var_attrs<pointer>, uniq_name = "_QFtest_associated_1Ex"} : (!fir.ref<!fir.box<!fir.ptr<!fir.array<*:f32>>>>, !fir.dscope) -> (!fir.ref<!fir.box<!fir.ptr<!fir.array<*:f32>>>>, !fir.ref<!fir.box<!fir.ptr<!fir.array<*:f32>>>>) +! CHECK: %[[VAL_3:.*]] = fir.load %[[VAL_2]]#1 : !fir.ref<!fir.box<!fir.ptr<!fir.array<*:f32>>>> +! CHECK: %[[VAL_4:.*]] = fir.box_addr %[[VAL_3]] : (!fir.box<!fir.ptr<!fir.array<*:f32>>>) -> !fir.ptr<!fir.array<*:f32>> +! CHECK: %[[VAL_5:.*]] = fir.convert %[[VAL_4]] : (!fir.ptr<!fir.array<*:f32>>) -> i64 +! CHECK: %[[VAL_6:.*]] = arith.constant 0 : i64 +! CHECK: %[[VAL_7:.*]] = arith.cmpi ne, %[[VAL_5]], %[[VAL_6]] : i64 +! CHECK: %[[VAL_8:.*]] = fir.convert %[[VAL_7]] : (i1) -> !fir.logical<4> +! CHECK: %[[VAL_9:.*]]:3 = hlfir.associate %[[VAL_8]] {adapt.valuebyref} : (!fir.logical<4>) -> (!fir.ref<!fir.logical<4>>, !fir.ref<!fir.logical<4>>, i1) +! CHECK: fir.call @_QPtakes_logical(%[[VAL_9]]#1) fastmath<contract> : (!fir.ref<!fir.logical<4>>) -> () +! CHECK: hlfir.end_associate %[[VAL_9]]#1, %[[VAL_9]]#2 : !fir.ref<!fir.logical<4>>, i1 +! CHECK: return +! CHECK: } + +! CHECK-LABEL: func.func @_QPtest_associated_2( +! CHECK-SAME: %[[VAL_0:.*]]: !fir.ref<!fir.box<!fir.ptr<!fir.array<*:f32>>>> {fir.bindc_name = "x"}, +! CHECK-SAME: %[[VAL_1:.*]]: !fir.box<!fir.array<?xf32>> {fir.bindc_name = "y", fir.target}) { +! CHECK: %[[VAL_2:.*]] = fir.dummy_scope : !fir.dscope +! CHECK: %[[VAL_3:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %[[VAL_2]] {fortran_attrs = #fir.var_attrs<pointer>, uniq_name = "_QFtest_associated_2Ex"} : (!fir.ref<!fir.box<!fir.ptr<!fir.array<*:f32>>>>, !fir.dscope) -> (!fir.ref<!fir.box<!fir.ptr<!fir.array<*:f32>>>>, !fir.ref<!fir.box<!fir.ptr<!fir.array<*:f32>>>>) +! 
CHECK: %[[VAL_4:.*]]:2 = hlfir.declare %[[VAL_1]] dummy_scope %[[VAL_2]] {fortran_attrs = #fir.var_attrs<target>, uniq_name = "_QFtest_associated_2Ey"} : (!fir.box<!fir.array<?xf32>>, !fir.dscope) -> (!fir.box<!fir.array<?xf32>>, !fir.box<!fir.array<?xf32>>) +! CHECK: %[[VAL_5:.*]] = fir.load %[[VAL_3]]#1 : !fir.ref<!fir.box<!fir.ptr<!fir.array<*:f32>>>> +! CHECK: %[[VAL_6:.*]] = fir.convert %[[VAL_5]] : (!fir.box<!fir.ptr<!fir.array<*:f32>>>) -> !fir.box<none> +! CHECK: %[[VAL_7:.*]] = fir.convert %[[VAL_4]]#1 : (!fir.box<!fir.array<?xf32>>) -> !fir.box<none> +! CHECK: %[[VAL_8:.*]] = fir.call @_FortranAPointerIsAssociatedWith(%[[VAL_6]], %[[VAL_7]]) fastmath<contract> : (!fir.box<none>, !fir.box<none>) -> i1 +! CHECK: %[[VAL_9:.*]] = fir.convert %[[VAL_8]] : (i1) -> !fir.logical<4> +! CHECK: %[[VAL_10:.*]]:3 = hlfir.associate %[[VAL_9]] {adapt.valuebyref} : (!fir.logical<4>) -> (!fir.ref<!fir.logical<4>>, !fir.ref<!fir.logical<4>>, i1) +! CHECK: fir.call @_QPtakes_logical(%[[VAL_10]]#1) fastmath<contract> : (!fir.ref<!fir.logical<4>>) -> () +! CHECK: hlfir.end_associate %[[VAL_10]]#1, %[[VAL_10]]#2 : !fir.ref<!fir.logical<4>>, i1 +! CHECK: return +! CHECK: } + +! CHECK-LABEL: func.func @_QPtest_associated_3( +! CHECK-SAME: %[[VAL_0:.*]]: !fir.ref<!fir.box<!fir.ptr<!fir.array<*:f32>>>> {fir.bindc_name = "x"}, +! CHECK-SAME: %[[VAL_1:.*]]: !fir.ref<!fir.box<!fir.ptr<!fir.array<*:f32>>>> {fir.bindc_name = "y"}) { +! CHECK: %[[VAL_2:.*]] = fir.dummy_scope : !fir.dscope +! CHECK: %[[VAL_3:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %[[VAL_2]] {fortran_attrs = #fir.var_attrs<pointer>, uniq_name = "_QFtest_associated_3Ex"} : (!fir.ref<!fir.box<!fir.ptr<!fir.array<*:f32>>>>, !fir.dscope) -> (!fir.ref<!fir.box<!fir.ptr<!fir.array<*:f32>>>>, !fir.ref<!fir.box<!fir.ptr<!fir.array<*:f32>>>>) +! 
CHECK: %[[VAL_4:.*]]:2 = hlfir.declare %[[VAL_1]] dummy_scope %[[VAL_2]] {fortran_attrs = #fir.var_attrs<pointer>, uniq_name = "_QFtest_associated_3Ey"} : (!fir.ref<!fir.box<!fir.ptr<!fir.array<*:f32>>>>, !fir.dscope) -> (!fir.ref<!fir.box<!fir.ptr<!fir.array<*:f32>>>>, !fir.ref<!fir.box<!fir.ptr<!fir.array<*:f32>>>>) +! CHECK: %[[VAL_5:.*]] = fir.load %[[VAL_4]]#1 : !fir.ref<!fir.box<!fir.ptr<!fir.array<*:f32>>>> +! CHECK: %[[VAL_6:.*]] = fir.load %[[VAL_3]]#1 : !fir.ref<!fir.box<!fir.ptr<!fir.array<*:f32>>>> +! CHECK: %[[VAL_7:.*]] = fir.convert %[[VAL_6]] : (!fir.box<!fir.ptr<!fir.array<*:f32>>>) -> !fir.box<none> +! CHECK: %[[VAL_8:.*]] = fir.convert %[[VAL_5]] : (!fir.box<!fir.ptr<!fir.array<*:f32>>>) -> !fir.box<none> +! CHECK: %[[VAL_9:.*]] = fir.call @_FortranAPointerIsAssociatedWith(%[[VAL_7]], %[[VAL_8]]) fastmath<contract> : (!fir.box<none>, !fir.box<none>) -> i1 +! CHECK: %[[VAL_10:.*]] = fir.convert %[[VAL_9]] : (i1) -> !fir.logical<4> +! CHECK: %[[VAL_11:.*]]:3 = hlfir.associate %[[VAL_10]] {adapt.valuebyref} : (!fir.logical<4>) -> (!fir.ref<!fir.logical<4>>, !fir.ref<!fir.logical<4>>, i1) +! CHECK: fir.call @_QPtakes_logical(%[[VAL_11]]#1) fastmath<contract> : (!fir.ref<!fir.logical<4>>) -> () +! CHECK: hlfir.end_associate %[[VAL_11]]#1, %[[VAL_11]]#2 : !fir.ref<!fir.logical<4>>, i1 +! CHECK: return +! CHECK: } + +! CHECK-LABEL: func.func @_QPtest_len_1( +! CHECK-SAME: %[[VAL_0:.*]]: !fir.box<!fir.array<*:!fir.char<1,?>>> {fir.bindc_name = "x"}) { +! CHECK: %[[VAL_1:.*]] = fir.dummy_scope : !fir.dscope +! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %[[VAL_1]] {uniq_name = "_QFtest_len_1Ex"} : (!fir.box<!fir.array<*:!fir.char<1,?>>>, !fir.dscope) -> (!fir.box<!fir.array<*:!fir.char<1,?>>>, !fir.box<!fir.array<*:!fir.char<1,?>>>) +! CHECK: %[[VAL_3:.*]] = fir.box_elesize %[[VAL_2]]#1 : (!fir.box<!fir.array<*:!fir.char<1,?>>>) -> index +! CHECK: %[[VAL_4:.*]] = fir.convert %[[VAL_3]] : (index) -> i32 +! 
CHECK: %[[VAL_5:.*]]:3 = hlfir.associate %[[VAL_4]] {adapt.valuebyref} : (i32) -> (!fir.ref<i32>, !fir.ref<i32>, i1) +! CHECK: fir.call @_QPtakes_integer(%[[VAL_5]]#1) fastmath<contract> : (!fir.ref<i32>) -> () +! CHECK: hlfir.end_associate %[[VAL_5]]#1, %[[VAL_5]]#2 : !fir.ref<i32>, i1 +! CHECK: return +! CHECK: } + +! CHECK-LABEL: func.func @_QPtest_len_2( +! CHECK-SAME: %[[VAL_0:.*]]: !fir.ref<!fir.box<!fir.ptr<!fir.array<*:!fir.char<1,?>>>>> {fir.bindc_name = "x"}) { +! CHECK: %[[VAL_1:.*]] = fir.dummy_scope : !fir.dscope +! CHECK: %[[VAL_2:.*]] = fir.load %[[VAL_0]] : !fir.ref<!fir.box<!fir.ptr<!fir.array<*:!fir.char<1,?>>>>> +! CHECK: %[[VAL_3:.*]] = fir.box_elesize %[[VAL_2]] : (!fir.box<!fir.ptr<!fir.array<*:!fir.char<1,?>>>>) -> index +! CHECK: %[[VAL_4:.*]]:2 = hlfir.declare %[[VAL_0]] typeparams %[[VAL_3]] dummy_scope %[[VAL_1]] {fortran_attrs = #fir.var_attrs<pointer>, uniq_name = "_QFtest_len_2Ex"} : (!fir.ref<!fir.box<!fir.ptr<!fir.array<*:!fir.char<1,?>>>>>, index, !fir.dscope) -> (!fir.ref<!fir.box<!fir.ptr<!fir.array<*:!fir.char<1,?>>>>>, !fir.ref<!fir.box<!fir.ptr<!fir.array<*:!fir.char<1,?>>>>>) +! CHECK: %[[VAL_5:.*]] = fir.convert %[[VAL_3]] : (index) -> i32 +! CHECK: %[[VAL_6:.*]]:3 = hlfir.associate %[[VAL_5]] {adapt.valuebyref} : (i32) -> (!fir.ref<i32>, !fir.ref<i32>, i1) +! CHECK: fir.call @_QPtakes_integer(%[[VAL_6]]#1) fastmath<contract> : (!fir.ref<i32>) -> () +! CHECK: hlfir.end_associate %[[VAL_6]]#1, %[[VAL_6]]#2 : !fir.ref<i32>, i1 +! CHECK: return +! CHECK: } + +! CHECK-LABEL: func.func @_QPtest_storage_size_1( +! CHECK-SAME: %[[VAL_0:.*]]: !fir.class<!fir.array<*:none>> {fir.bindc_name = "x"}) { +! CHECK: %[[VAL_1:.*]] = fir.dummy_scope : !fir.dscope +! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %[[VAL_1]] {uniq_name = "_QFtest_storage_size_1Ex"} : (!fir.class<!fir.array<*:none>>, !fir.dscope) -> (!fir.class<!fir.array<*:none>>, !fir.class<!fir.array<*:none>>) +! 
CHECK: %[[VAL_3:.*]] = fir.box_elesize %[[VAL_2]]#1 : (!fir.class<!fir.array<*:none>>) -> i32 +! CHECK: %[[VAL_4:.*]] = arith.constant 8 : i32 +! CHECK: %[[VAL_5:.*]] = arith.muli %[[VAL_3]], %[[VAL_4]] : i32 +! CHECK: %[[VAL_6:.*]]:3 = hlfir.associate %[[VAL_5]] {adapt.valuebyref} : (i32) -> (!fir.ref<i32>, !fir.ref<i32>, i1) +! CHECK: fir.call @_QPtakes_integer(%[[VAL_6]]#1) fastmath<contract> : (!fir.ref<i32>) -> () +! CHECK: hlfir.end_associate %[[VAL_6]]#1, %[[VAL_6]]#2 : !fir.ref<i32>, i1 +! CHECK: return +! CHECK: } + +! CHECK-LABEL: func.func @_QPtest_storage_size_2( +! CHECK-SAME: %[[VAL_0:.*]]: !fir.ref<!fir.class<!fir.ptr<!fir.array<*:none>>>> {fir.bindc_name = "x"}) { +! CHECK: %[[VAL_1:.*]] = fir.dummy_scope : !fir.dscope +! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %[[VAL_1]] {fortran_attrs = #fir.var_attrs<pointer>, uniq_name = "_QFtest_storage_size_2Ex"} : (!fir.ref<!fir.class<!fir.ptr<!fir.array<*:none>>>>, !fir.dscope) -> (!fir.ref<!fir.class<!fir.ptr<!fir.array<*:none>>>>, !fir.ref<!fir.class<!fir.ptr<!fir.array<*:none>>>>) +! CHECK: %[[VAL_3:.*]] = fir.load %[[VAL_2]]#1 : !fir.ref<!fir.class<!fir.ptr<!fir.array<*:none>>>> +! CHECK: %[[VAL_4:.*]] = fir.box_addr %[[VAL_3]] : (!fir.class<!fir.ptr<!fir.array<*:none>>>) -> !fir.ptr<!fir.array<*:none>> +! CHECK: %[[VAL_5:.*]] = fir.convert %[[VAL_4]] : (!fir.ptr<!fir.array<*:none>>) -> i64 +! CHECK: %[[VAL_6:.*]] = arith.constant 0 : i64 +! CHECK: %[[VAL_7:.*]] = arith.cmpi eq, %[[VAL_5]], %[[VAL_6]] : i64 +! CHECK: fir.if %[[VAL_7]] { +! CHECK: %[[VAL_13:.*]] = fir.call @_FortranAReportFatalUserError +! CHECK: } +! CHECK: %[[VAL_14:.*]] = fir.load %[[VAL_2]]#1 : !fir.ref<!fir.class<!fir.ptr<!fir.array<*:none>>>> +! CHECK: %[[VAL_15:.*]] = fir.box_elesize %[[VAL_14]] : (!fir.class<!fir.ptr<!fir.array<*:none>>>) -> i32 +! CHECK: %[[VAL_16:.*]] = arith.constant 8 : i32 +! CHECK: %[[VAL_17:.*]] = arith.muli %[[VAL_15]], %[[VAL_16]] : i32 +! 
CHECK: %[[VAL_18:.*]]:3 = hlfir.associate %[[VAL_17]] {adapt.valuebyref} : (i32) -> (!fir.ref<i32>, !fir.ref<i32>, i1) +! CHECK: fir.call @_QPtakes_integer(%[[VAL_18]]#1) fastmath<contract> : (!fir.ref<i32>) -> () +! CHECK: hlfir.end_associate %[[VAL_18]]#1, %[[VAL_18]]#2 : !fir.ref<i32>, i1 +! CHECK: return +! CHECK: } + +! CHECK-LABEL: func.func @_QPtest_present_1( +! CHECK-SAME: %[[VAL_0:.*]]: !fir.class<!fir.array<*:none>> {fir.bindc_name = "x", fir.optional}) { +! CHECK: %[[VAL_1:.*]] = fir.dummy_scope : !fir.dscope +! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %[[VAL_1]] {fortran_attrs = #fir.var_attrs<optional>, uniq_name = "_QFtest_present_1Ex"} : (!fir.class<!fir.array<*:none>>, !fir.dscope) -> (!fir.class<!fir.array<*:none>>, !fir.class<!fir.array<*:none>>) +! CHECK: %[[VAL_3:.*]] = fir.is_present %[[VAL_2]]#1 : (!fir.class<!fir.array<*:none>>) -> i1 +! CHECK: %[[VAL_4:.*]] = fir.convert %[[VAL_3]] : (i1) -> !fir.logical<4> +! CHECK: %[[VAL_5:.*]]:3 = hlfir.associate %[[VAL_4]] {adapt.valuebyref} : (!fir.logical<4>) -> (!fir.ref<!fir.logical<4>>, !fir.ref<!fir.logical<4>>, i1) +! CHECK: fir.call @_QPtakes_logical(%[[VAL_5]]#1) fastmath<contract> : (!fir.ref<!fir.logical<4>>) -> () +! CHECK: hlfir.end_associate %[[VAL_5]]#1, %[[VAL_5]]#2 : !fir.ref<!fir.logical<4>>, i1 +! CHECK: return +! CHECK: } + +! CHECK-LABEL: func.func @_QPtest_present_2( +! CHECK-SAME: %[[VAL_0:.*]]: !fir.ref<!fir.class<!fir.ptr<!fir.array<*:none>>>> {fir.bindc_name = "x", fir.optional}) { +! CHECK: %[[VAL_1:.*]] = fir.dummy_scope : !fir.dscope +! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %[[VAL_1]] {fortran_attrs = #fir.var_attrs<optional, pointer>, uniq_name = "_QFtest_present_2Ex"} : (!fir.ref<!fir.class<!fir.ptr<!fir.array<*:none>>>>, !fir.dscope) -> (!fir.ref<!fir.class<!fir.ptr<!fir.array<*:none>>>>, !fir.ref<!fir.class<!fir.ptr<!fir.array<*:none>>>>) +! 
CHECK: %[[VAL_3:.*]] = fir.is_present %[[VAL_2]]#1 : (!fir.ref<!fir.class<!fir.ptr<!fir.array<*:none>>>>) -> i1 +! CHECK: %[[VAL_4:.*]] = fir.convert %[[VAL_3]] : (i1) -> !fir.logical<4> +! CHECK: %[[VAL_5:.*]]:3 = hlfir.associate %[[VAL_4]] {adapt.valuebyref} : (!fir.logical<4>) -> (!fir.ref<!fir.logical<4>>, !fir.ref<!fir.logical<4>>, i1) +! CHECK: fir.call @_QPtakes_logical(%[[VAL_5]]#1) fastmath<contract> : (!fir.ref<!fir.logical<4>>) -> () +! CHECK: hlfir.end_associate %[[VAL_5]]#1, %[[VAL_5]]#2 : !fir.ref<!fir.logical<4>>, i1 +! CHECK: return +! CHECK: } + +! CHECK-LABEL: func.func @_QPtest_is_contiguous_1( +! CHECK-SAME: %[[VAL_0:.*]]: !fir.class<!fir.array<*:none>> {fir.bindc_name = "x"}) { +! CHECK: %[[VAL_1:.*]] = fir.dummy_scope : !fir.dscope +! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %[[VAL_1]] {uniq_name = "_QFtest_is_contiguous_1Ex"} : (!fir.class<!fir.array<*:none>>, !fir.dscope) -> (!fir.class<!fir.array<*:none>>, !fir.class<!fir.array<*:none>>) +! CHECK: %[[VAL_3:.*]] = fir.convert %[[VAL_2]]#1 : (!fir.class<!fir.array<*:none>>) -> !fir.box<none> +! CHECK: %[[VAL_4:.*]] = fir.call @_FortranAIsContiguous(%[[VAL_3]]) fastmath<contract> : (!fir.box<none>) -> i1 +! CHECK: %[[VAL_5:.*]] = fir.convert %[[VAL_4]] : (i1) -> !fir.logical<4> +! CHECK: %[[VAL_6:.*]]:3 = hlfir.associate %[[VAL_5]] {adapt.valuebyref} : (!fir.logical<4>) -> (!fir.ref<!fir.logical<4>>, !fir.ref<!fir.logical<4>>, i1) +! CHECK: fir.call @_QPtakes_logical(%[[VAL_6]]#1) fastmath<contract> : (!fir.ref<!fir.logical<4>>) -> () +! CHECK: hlfir.end_associate %[[VAL_6]]#1, %[[VAL_6]]#2 : !fir.ref<!fir.logical<4>>, i1 +! CHECK: return +! CHECK: } + +! CHECK-LABEL: func.func @_QPtest_is_contiguous_2( +! CHECK-SAME: %[[VAL_0:.*]]: !fir.ref<!fir.class<!fir.ptr<!fir.array<*:none>>>> {fir.bindc_name = "x"}) { +! CHECK: %[[VAL_1:.*]] = fir.dummy_scope : !fir.dscope +! 
CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %[[VAL_1]] {fortran_attrs = #fir.var_attrs<pointer>, uniq_name = "_QFtest_is_contiguous_2Ex"} : (!fir.ref<!fir.class<!fir.ptr<!fir.array<*:none>>>>, !fir.dscope) -> (!fir.ref<!fir.class<!fir.ptr<!fir.array<*:none>>>>, !fir.ref<!fir.class<!fir.ptr<!fir.array<*:none>>>>) +! CHECK: %[[VAL_3:.*]] = fir.load %[[VAL_2]]#1 : !fir.ref<!fir.class<!fir.ptr<!fir.array<*:none>>>> +! CHECK: %[[VAL_4:.*]] = fir.convert %[[VAL_3]] : (!fir.class<!fir.ptr<!fir.array<*:none>>>) -> !fir.box<none> +! CHECK: %[[VAL_5:.*]] = fir.call @_FortranAIsContiguous(%[[VAL_4]]) fastmath<contract> : (!fir.box<none>) -> i1 +! CHECK: %[[VAL_6:.*]] = fir.convert %[[VAL_5]] : (i1) -> !fir.logical<4> +! CHECK: %[[VAL_7:.*]]:3 = hlfir.associate %[[VAL_6]] {adapt.valuebyref} : (!fir.logical<4>) -> (!fir.ref<!fir.logical<4>>, !fir.ref<!fir.logical<4>>, i1) +! CHECK: fir.call @_QPtakes_logical(%[[VAL_7]]#1) fastmath<contract> : (!fir.ref<!fir.logical<4>>) -> () +! CHECK: hlfir.end_associate %[[VAL_7]]#1, %[[VAL_7]]#2 : !fir.ref<!fir.logical<4>>, i1 +! CHECK: return +! CHECK: } + +! CHECK-LABEL: func.func @_QPtest_same_type_as_1( +! CHECK-SAME: %[[VAL_0:.*]]: !fir.class<!fir.array<*:none>> {fir.bindc_name = "x"}, +! CHECK-SAME: %[[VAL_1:.*]]: !fir.class<!fir.array<*:none>> {fir.bindc_name = "y"}) { +! CHECK: %[[VAL_2:.*]] = fir.dummy_scope : !fir.dscope +! CHECK: %[[VAL_3:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %[[VAL_2]] {uniq_name = "_QFtest_same_type_as_1Ex"} : (!fir.class<!fir.array<*:none>>, !fir.dscope) -> (!fir.class<!fir.array<*:none>>, !fir.class<!fir.array<*:none>>) +! CHECK: %[[VAL_4:.*]]:2 = hlfir.declare %[[VAL_1]] dummy_scope %[[VAL_2]] {uniq_name = "_QFtest_same_type_as_1Ey"} : (!fir.class<!fir.array<*:none>>, !fir.dscope) -> (!fir.class<!fir.array<*:none>>, !fir.class<!fir.array<*:none>>) +! CHECK: %[[VAL_5:.*]] = fir.convert %[[VAL_3]]#1 : (!fir.class<!fir.array<*:none>>) -> !fir.box<none> +! 
CHECK: %[[VAL_6:.*]] = fir.convert %[[VAL_4]]#1 : (!fir.class<!fir.array<*:none>>) -> !fir.box<none> +! CHECK: %[[VAL_7:.*]] = fir.call @_FortranASameTypeAs(%[[VAL_5]], %[[VAL_6]]) fastmath<contract> : (!fir.box<none>, !fir.box<none>) -> i1 +! CHECK: %[[VAL_8:.*]] = fir.convert %[[VAL_7]] : (i1) -> !fir.logical<4> +! CHECK: %[[VAL_9:.*]]:3 = hlfir.associate %[[VAL_8]] {adapt.valuebyref} : (!fir.logical<4>) -> (!fir.ref<!fir.logical<4>>, !fir.ref<!fir.logical<4>>, i1) +! CHECK: fir.call @_QPtakes_logical(%[[VAL_9]]#1) fastmath<contract> : (!fir.ref<!fir.logical<4>>) -> () +! CHECK: hlfir.end_associate %[[VAL_9]]#1, %[[VAL_9]]#2 : !fir.ref<!fir.logical<4>>, i1 +! CHECK: return +! CHECK: } + +! CHECK-LABEL: func.func @_QPtest_same_type_as_2( +! CHECK-SAME: %[[VAL_0:.*]]: !fir.ref<!fir.class<!fir.ptr<!fir.array<*:none>>>> {fir.bindc_name = "x"}, +! CHECK-SAME: %[[VAL_1:.*]]: !fir.ref<!fir.class<!fir.ptr<!fir.array<*:none>>>> {fir.bindc_name = "y"}) { +! CHECK: %[[VAL_2:.*]] = fir.dummy_scope : !fir.dscope +! CHECK: %[[VAL_3:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %[[VAL_2]] {fortran_attrs = #fir.var_attrs<pointer>, uniq_name = "_QFtest_same_type_as_2Ex"} : (!fir.ref<!fir.class<!fir.ptr<!fir.array<*:none>>>>, !fir.dscope) -> (!fir.ref<!fir.class<!fir.ptr<!fir.array<*:none>>>>, !fir.ref<!fir.class<!fir.ptr<!fir.array<*:none>>>>) +! CHECK: %[[VAL_4:.*]]:2 = hlfir.declare %[[VAL_1]] dummy_scope %[[VAL_2]] {fortran_attrs = #fir.var_attrs<pointer>, uniq_name = "_QFtest_same_type_as_2Ey"} : (!fir.ref<!fir.class<!fir.ptr<!fir.array<*:none>>>>, !fir.dscope) -> (!fir.ref<!fir.class<!fir.ptr<!fir.array<*:none>>>>, !fir.ref<!fir.class<!fir.ptr<!fir.array<*:none>>>>) +! CHECK: %[[VAL_5:.*]] = fir.load %[[VAL_3]]#1 : !fir.ref<!fir.class<!fir.ptr<!fir.array<*:none>>>> +! CHECK: %[[VAL_6:.*]] = fir.load %[[VAL_4]]#1 : !fir.ref<!fir.class<!fir.ptr<!fir.array<*:none>>>> +! 
CHECK: %[[VAL_7:.*]] = fir.convert %[[VAL_5]] : (!fir.class<!fir.ptr<!fir.array<*:none>>>) -> !fir.box<none> +! CHECK: %[[VAL_8:.*]] = fir.convert %[[VAL_6]] : (!fir.class<!fir.ptr<!fir.array<*:none>>>) -> !fir.box<none> +! CHECK: %[[VAL_9:.*]] = fir.call @_FortranASameTypeAs(%[[VAL_7]], %[[VAL_8]]) fastmath<contract> : (!fir.box<none>, !fir.box<none>) -> i1 +! CHECK: %[[VAL_10:.*]] = fir.convert %[[VAL_9]] : (i1) -> !fir.logical<4> +! CHECK: %[[VAL_11:.*]]:3 = hlfir.associate %[[VAL_10]] {adapt.valuebyref} : (!fir.logical<4>) -> (!fir.ref<!fir.logical<4>>, !fir.ref<!fir.logical<4>>, i1) +! CHECK: fir.call @_QPtakes_logical(%[[VAL_11]]#1) fastmath<contract> : (!fir.ref<!fir.logical<4>>) -> () +! CHECK: hlfir.end_associate %[[VAL_11]]#1, %[[VAL_11]]#2 : !fir.ref<!fir.logical<4>>, i1 +! CHECK: return +! CHECK: } + +! CHECK-LABEL: func.func @_QPtest_extends_type_of_1( +! CHECK-SAME: %[[VAL_0:.*]]: !fir.class<!fir.array<*:none>> {fir.bindc_name = "x"}, +! CHECK-SAME: %[[VAL_1:.*]]: !fir.class<!fir.array<*:none>> {fir.bindc_name = "y"}) { +! CHECK: %[[VAL_2:.*]] = fir.dummy_scope : !fir.dscope +! CHECK: %[[VAL_3:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %[[VAL_2]] {uniq_name = "_QFtest_extends_type_of_1Ex"} : (!fir.class<!fir.array<*:none>>, !fir.dscope) -> (!fir.class<!fir.array<*:none>>, !fir.class<!fir.array<*:none>>) +! CHECK: %[[VAL_4:.*]]:2 = hlfir.declare %[[VAL_1]] dummy_scope %[[VAL_2]] {uniq_name = "_QFtest_extends_type_of_1Ey"} : (!fir.class<!fir.array<*:none>>, !fir.dscope) -> (!fir.class<!fir.array<*:none>>, !fir.class<!fir.array<*:none>>) +! CHECK: %[[VAL_5:.*]] = fir.convert %[[VAL_3]]#1 : (!fir.class<!fir.array<*:none>>) -> !fir.box<none> +! CHECK: %[[VAL_6:.*]] = fir.convert %[[VAL_4]]#1 : (!fir.class<!fir.array<*:none>>) -> !fir.box<none> +! CHECK: %[[VAL_7:.*]] = fir.call @_FortranAExtendsTypeOf(%[[VAL_5]], %[[VAL_6]]) fastmath<contract> : (!fir.box<none>, !fir.box<none>) -> i1 +! 
CHECK: %[[VAL_8:.*]] = fir.convert %[[VAL_7]] : (i1) -> !fir.logical<4> +! CHECK: %[[VAL_9:.*]]:3 = hlfir.associate %[[VAL_8]] {adapt.valuebyref} : (!fir.logical<4>) -> (!fir.ref<!fir.logical<4>>, !fir.ref<!fir.logical<4>>, i1) +! CHECK: fir.call @_QPtakes_logical(%[[VAL_9]]#1) fastmath<contract> : (!fir.ref<!fir.logical<4>>) -> () +! CHECK: hlfir.end_associate %[[VAL_9]]#1, %[[VAL_9]]#2 : !fir.ref<!fir.logical<4>>, i1 +! CHECK: return +! CHECK: } + +! CHECK-LABEL: func.func @_QPtest_extends_type_of_2( +! CHECK-SAME: %[[VAL_0:.*]]: !fir.ref<!fir.class<!fir.ptr<!fir.array<*:none>>>> {fir.bindc_name = "x"}, +! CHECK-SAME: %[[VAL_1:.*]]: !fir.ref<!fir.class<!fir.ptr<!fir.array<*:none>>>> {fir.bindc_name = "y"}) { +! CHECK: %[[VAL_2:.*]] = fir.dummy_scope : !fir.dscope +! CHECK: %[[VAL_3:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %[[VAL_2]] {fortran_attrs = #fir.var_attrs<pointer>, uniq_name = "_QFtest_extends_type_of_2Ex"} : (!fir.ref<!fir.class<!fir.ptr<!fir.array<*:none>>>>, !fir.dscope) -> (!fir.ref<!fir.class<!fir.ptr<!fir.array<*:none>>>>, !fir.ref<!fir.class<!fir.ptr<!fir.array<*:none>>>>) +! CHECK: %[[VAL_4:.*]]:2 = hlfir.declare %[[VAL_1]] dummy_scope %[[VAL_2]] {fortran_attrs = #fir.var_attrs<pointer>, uniq_name = "_QFtest_extends_type_of_2Ey"} : (!fir.ref<!fir.class<!fir.ptr<!fir.array<*:none>>>>, !fir.dscope) -> (!fir.ref<!fir.class<!fir.ptr<!fir.array<*:none>>>>, !fir.ref<!fir.class<!fir.ptr<!fir.array<*:none>>>>) +! CHECK: %[[VAL_5:.*]] = fir.load %[[VAL_3]]#1 : !fir.ref<!fir.class<!fir.ptr<!fir.array<*:none>>>> +! CHECK: %[[VAL_6:.*]] = fir.load %[[VAL_4]]#1 : !fir.ref<!fir.class<!fir.ptr<!fir.array<*:none>>>> +! CHECK: %[[VAL_7:.*]] = fir.convert %[[VAL_5]] : (!fir.class<!fir.ptr<!fir.array<*:none>>>) -> !fir.box<none> +! CHECK: %[[VAL_8:.*]] = fir.convert %[[VAL_6]] : (!fir.class<!fir.ptr<!fir.array<*:none>>>) -> !fir.box<none> +! 
CHECK: %[[VAL_9:.*]] = fir.call @_FortranAExtendsTypeOf(%[[VAL_7]], %[[VAL_8]]) fastmath<contract> : (!fir.box<none>, !fir.box<none>) -> i1 +! CHECK: %[[VAL_10:.*]] = fir.convert %[[VAL_9]] : (i1) -> !fir.logical<4> +! CHECK: %[[VAL_11:.*]]:3 = hlfir.associate %[[VAL_10]] {adapt.valuebyref} : (!fir.logical<4>) -> (!fir.ref<!fir.logical<4>>, !fir.ref<!fir.logical<4>>, i1) +! CHECK: fir.call @_QPtakes_logical(%[[VAL_11]]#1) fastmath<contract> : (!fir.ref<!fir.logical<4>>) -> () +! CHECK: hlfir.end_associate %[[VAL_11]]#1, %[[VAL_11]]#2 : !fir.ref<!fir.logical<4>>, i1 +! CHECK: return +! CHECK: } + +! CHECK-LABEL: func.func @_QPc_loc_1( +! CHECK-SAME: %[[VAL_0:.*]]: !fir.box<!fir.array<*:f32>> {fir.bindc_name = "x", fir.target}) { +! CHECK: %[[VAL_1:.*]] = fir.dummy_scope : !fir.dscope +! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %[[VAL_1]] {fortran_attrs = #fir.var_attrs<target>, uniq_name = "_QFc_loc_1Ex"} : (!fir.box<!fir.array<*:f32>>, !fir.dscope) -> (!fir.box<!fir.array<*:f32>>, !fir.box<!fir.array<*:f32>>) +! CHECK: %[[VAL_3:.*]] = fir.alloca !fir.type<_QM__fortran_builtinsT__builtin_c_ptr{__address:i64}> +! CHECK: %[[VAL_4:.*]] = fir.field_index __address, !fir.type<_QM__fortran_builtinsT__builtin_c_ptr{__address:i64}> +! CHECK: %[[VAL_5:.*]] = fir.coordinate_of %[[VAL_3]], %[[VAL_4]] : (!fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_c_ptr{__address:i64}>>, !fir.field) -> !fir.ref<i64> +! CHECK: %[[VAL_6:.*]] = fir.box_addr %[[VAL_2]]#1 : (!fir.box<!fir.array<*:f32>>) -> !fir.ref<!fir.array<*:f32>> +! CHECK: %[[VAL_7:.*]] = fir.convert %[[VAL_6]] : (!fir.ref<!fir.array<*:f32>>) -> i64 +! CHECK: fir.store %[[VAL_7]] to %[[VAL_5]] : !fir.ref<i64> +! 
CHECK: %[[VAL_8:.*]]:2 = hlfir.declare %[[VAL_3]] {uniq_name = ".tmp.intrinsic_result"} : (!fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_c_ptr{__address:i64}>>) -> (!fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_c_ptr{__address:i64}>>, !fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_c_ptr{__address:i64}>>) +! CHECK: %[[VAL_9:.*]] = arith.constant false +! CHECK: %[[VAL_10:.*]] = hlfir.as_expr %[[VAL_8]]#0 move %[[VAL_9]] : (!fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_c_ptr{__address:i64}>>, i1) -> !hlfir.expr<!fir.type<_QM__fortran_builtinsT__builtin_c_ptr{__address:i64}>> +! CHECK: %[[VAL_11:.*]]:3 = hlfir.associate %[[VAL_10]] {adapt.valuebyref} : (!hlfir.expr<!fir.type<_QM__fortran_builtinsT__builtin_c_ptr{__address:i64}>>) -> (!fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_c_ptr{__address:i64}>>, !fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_c_ptr{__address:i64}>>, i1) +! CHECK: fir.call @_QPtakes_cloc(%[[VAL_11]]#1) fastmath<contract> : (!fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_c_ptr{__address:i64}>>) -> () +! CHECK: hlfir.end_associate %[[VAL_11]]#1, %[[VAL_11]]#2 : !fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_c_ptr{__address:i64}>>, i1 +! CHECK: hlfir.destroy %[[VAL_10]] : !hlfir.expr<!fir.type<_QM__fortran_builtinsT__builtin_c_ptr{__address:i64}>> +! CHECK: return +! CHECK: } + +! CHECK-LABEL: func.func @_QPc_loc_2( +! CHECK-SAME: %[[VAL_0:.*]]: !fir.ref<!fir.box<!fir.ptr<!fir.array<*:f32>>>> {fir.bindc_name = "x"}) { +! CHECK: %[[VAL_1:.*]] = fir.dummy_scope : !fir.dscope +! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_0]] dummy_scope %[[VAL_1]] {fortran_attrs = #fir.var_attrs<pointer>, uniq_name = "_QFc_loc_2Ex"} : (!fir.ref<!fir.box<!fir.ptr<!fir.array<*:f32>>>>, !fir.dscope) -> (!fir.ref<!fir.box<!fir.ptr<!fir.array<*:f32>>>>, !fir.ref<!fir.box<!fir.ptr<!fir.array<*:f32>>>>) +! CHECK: %[[VAL_3:.*]] = fir.load %[[VAL_2]]#1 : !fir.ref<!fir.box<!fir.ptr<!fir.array<*:f32>>>> +! 
CHECK: %[[VAL_4:.*]] = fir.alloca !fir.type<_QM__fortran_builtinsT__builtin_c_ptr{__address:i64}> +! CHECK: %[[VAL_5:.*]] = fir.field_index __address, !fir.type<_QM__fortran_builtinsT__builtin_c_ptr{__address:i64}> +! CHECK: %[[VAL_6:.*]] = fir.coordinate_of %[[VAL_4]], %[[VAL_5]] : (!fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_c_ptr{__address:i64}>>, !fir.field) -> !fir.ref<i64> +! CHECK: %[[VAL_7:.*]] = fir.box_addr %[[VAL_3]] : (!fir.box<!fir.ptr<!fir.array<*:f32>>>) -> !fir.ptr<!fir.array<*:f32>> +! CHECK: %[[VAL_8:.*]] = fir.convert %[[VAL_7]] : (!fir.ptr<!fir.array<*:f32>>) -> i64 +! CHECK: fir.store %[[VAL_8]] to %[[VAL_6]] : !fir.ref<i64> +! CHECK: %[[VAL_9:.*]]:2 = hlfir.declare %[[VAL_4]] {uniq_name = ".tmp.intrinsic_result"} : (!fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_c_ptr{__address:i64}>>) -> (!fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_c_ptr{__address:i64}>>, !fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_c_ptr{__address:i64}>>) +! CHECK: %[[VAL_10:.*]] = arith.constant false +! CHECK: %[[VAL_11:.*]] = hlfir.as_expr %[[VAL_9]]#0 move %[[VAL_10]] : (!fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_c_ptr{__address:i64}>>, i1) -> !hlfir.expr<!fir.type<_QM__fortran_builtinsT__builtin_c_ptr{__address:i64}>> +! CHECK: %[[VAL_12:.*]]:3 = hlfir.associate %[[VAL_11]] {adapt.valuebyref} : (!hlfir.expr<!fir.type<_QM__fortran_builtinsT__builtin_c_ptr{__address:i64}>>) -> (!fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_c_ptr{__address:i64}>>, !fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_c_ptr{__address:i64}>>, i1) +! CHECK: fir.call @_QPtakes_cloc(%[[VAL_12]]#1) fastmath<contract> : (!fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_c_ptr{__address:i64}>>) -> () +! CHECK: hlfir.end_associate %[[VAL_12]]#1, %[[VAL_12]]#2 : !fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_c_ptr{__address:i64}>>, i1 +! CHECK: hlfir.destroy %[[VAL_11]] : !hlfir.expr<!fir.type<_QM__fortran_builtinsT__builtin_c_ptr{__address:i64}>> +! 
CHECK: return +! CHECK: } diff --git a/flang/test/Lower/Intrinsics/getcwd-function.f90 b/flang/test/Lower/Intrinsics/getcwd-function.f90 new file mode 100644 index 0000000..50b6472 --- /dev/null +++ b/flang/test/Lower/Intrinsics/getcwd-function.f90 @@ -0,0 +1,23 @@ +! Test GETCWD with dynamically optional arguments. +! RUN: bbc -emit-fir %s -o - | FileCheck %s + +! CHECK-LABEL: func.func @_QPtest( +! CHECK-SAME: %[[cwdArg:.*]]: !fir.boxchar<1> {fir.bindc_name = "cwd"}) -> i32 { +integer function test(cwd) + CHARACTER(len=255) :: cwd + test = getcwd(cwd) + ! CHECK-NEXT: %[[c8:.*]] = arith.constant 8 : i32 + ! CHECK-NEXT: %[[c255:.*]] = arith.constant 255 : index + ! CHECK-NEXT: %[[DSCOPE:.*]] = fir.dummy_scope : !fir.dscope + ! CHECK-NEXT: %[[cwdUnbox:.*]]:2 = fir.unboxchar %[[cwdArg]] : (!fir.boxchar<1>) -> (!fir.ref<!fir.char<1,?>>, index) + ! CHECK-NEXT: %[[cwdCast:.*]] = fir.convert %[[cwdUnbox]]#0 : (!fir.ref<!fir.char<1,?>>) -> !fir.ref<!fir.char<1,255>> + ! CHECK-NEXT: %[[cwdDeclare:.*]] = fir.declare %[[cwdCast]] typeparams %[[c255]] dummy_scope %[[DSCOPE]] {uniq_name = "_QFtestEcwd"} : (!fir.ref<!fir.char<1,255>>, index, !fir.dscope) -> !fir.ref<!fir.char<1,255>> + ! CHECK-NEXT: %[[test:.*]] = fir.alloca i32 {bindc_name = "test", uniq_name = "_QFtestEtest"} + ! CHECK-NEXT: %[[testAddr:.*]] = fir.declare %[[test]] {uniq_name = "_QFtestEtest"} : (!fir.ref<i32>) -> !fir.ref<i32> + ! CHECK-NEXT: %[[cwdBox:.*]] = fir.embox %[[cwdDeclare]] : (!fir.ref<!fir.char<1,255>>) -> !fir.box<!fir.char<1,255>> + ! CHECK: %[[cwd:.*]] = fir.convert %[[cwdBox]] : (!fir.box<!fir.char<1,255>>) -> !fir.box<none> + ! CHECK: %[[statusValue:.*]] = fir.call @_FortranAGetCwd(%[[cwd]], %[[VAL_9:.*]], %[[c8]]) fastmath<contract> : (!fir.box<none>, !fir.ref<i8>, i32) -> i32 + ! CHECK-NEXT: fir.store %[[statusValue]] to %[[testAddr]] : !fir.ref<i32> + ! CHECK-NEXT: %[[returnValue:.*]] = fir.load %[[testAddr]] : !fir.ref<i32> + ! 
CHECK-NEXT: return %[[returnValue]] : i32 +end function diff --git a/flang/test/Lower/Intrinsics/getcwd-optional.f90 b/flang/test/Lower/Intrinsics/getcwd-optional.f90 new file mode 100644 index 0000000..3e2a221 --- /dev/null +++ b/flang/test/Lower/Intrinsics/getcwd-optional.f90 @@ -0,0 +1,29 @@ +! Test GETCWD with dynamically optional arguments. +! RUN: bbc -emit-fir %s -o - | FileCheck %s + + +! CHECK-LABEL: func.func @_QPtest( +! CHECK-SAME: %[[cwdArg:.*]]: !fir.boxchar<1> {fir.bindc_name = "cwd"}, +! CHECK-SAME: %[[statusArg:.*]]: !fir.ref<i32> {fir.bindc_name = "status", fir.optional}) { +subroutine test(cwd, status) + CHARACTER(len=255) :: cwd + INTEGER, OPTIONAL :: status + call getcwd(cwd, status) + ! CHECK-NEXT: %[[c0:.*]] = arith.constant 0 : i64 + ! CHECK-NEXT: %[[c11:.*]] = arith.constant 11 : i32 + ! CHECK-NEXT: %[[c255:.*]] = arith.constant 255 : index + ! CHECK-NEXT: %[[DSCOPE:.*]] = fir.dummy_scope : !fir.dscope + ! CHECK-NEXT: %[[cwdUnbox:.*]]:2 = fir.unboxchar %[[cwdArg]] : (!fir.boxchar<1>) -> (!fir.ref<!fir.char<1,?>>, index) + ! CHECK-NEXT: %[[cwdCast:.*]] = fir.convert %[[cwdUnbox]]#0 : (!fir.ref<!fir.char<1,?>>) -> !fir.ref<!fir.char<1,255>> + ! CHECK-NEXT: %[[cwdDeclare:.*]] = fir.declare %[[cwdCast]] typeparams %[[c255]] dummy_scope %[[DSCOPE]] {uniq_name = "_QFtestEcwd"} : (!fir.ref<!fir.char<1,255>>, index, !fir.dscope) -> !fir.ref<!fir.char<1,255>> + ! CHECK-NEXT: %[[statusAddr:.*]] = fir.declare %[[statusArg]] dummy_scope %[[DSCOPE]] {fortran_attrs = #fir.var_attrs<optional>, uniq_name = "_QFtestEstatus"} : (!fir.ref<i32>, !fir.dscope) -> !fir.ref<i32> + ! CHECK-NEXT: %[[cwdBox:.*]] = fir.embox %[[cwdDeclare]] : (!fir.ref<!fir.char<1,255>>) -> !fir.box<!fir.char<1,255>> + ! CHECK: %[[cwd:.*]] = fir.convert %[[cwdBox]] : (!fir.box<!fir.char<1,255>>) -> !fir.box<none> + ! CHECK: %[[statusValue:.*]] = fir.call @_FortranAGetCwd(%[[cwd]], %[[VAL_8:.*]], %[[c11]]) fastmath<contract> : (!fir.box<none>, !fir.ref<i8>, i32) -> i32 + ! 
CHECK-NEXT: %[[statusCast:.*]] = fir.convert %[[statusAddr]] : (!fir.ref<i32>) -> i64 + ! CHECK-NEXT: %[[isPresent:.*]] = arith.cmpi ne, %[[statusCast]], %[[c0]] : i64 + ! CHECK-NEXT: fir.if %[[isPresent]] { + ! CHECK-NEXT: fir.store %[[statusValue]] to %[[statusAddr]] : !fir.ref<i32> + ! CHECK-NEXT: } + ! CHECK-NEXT: return +end subroutine diff --git a/flang/test/Lower/Intrinsics/getcwd.f90 b/flang/test/Lower/Intrinsics/getcwd.f90 new file mode 100644 index 0000000..fe20785 --- /dev/null +++ b/flang/test/Lower/Intrinsics/getcwd.f90 @@ -0,0 +1,44 @@ +! RUN: bbc -emit-fir %s -o - | FileCheck %s + +! CHECK-LABEL: func.func @_QPcwd_only( +! CHECK-SAME: %[[cwdArg:.*]]: !fir.boxchar<1> {fir.bindc_name = "cwd"}) { +subroutine cwd_only(cwd) + CHARACTER(len=255) :: cwd + call getcwd(cwd) + ! CHECK-NEXT: %[[c7:.*]] = arith.constant 7 : i32 + ! CHECK-NEXT: %[[c255:.*]] = arith.constant 255 : index + ! CHECK-NEXT: %[[DSCOPE:.*]] = fir.dummy_scope : !fir.dscope + ! CHECK-NEXT: %[[cwdUnbox:.*]]:2 = fir.unboxchar %[[cwdArg]] : (!fir.boxchar<1>) -> (!fir.ref<!fir.char<1,?>>, index) + ! CHECK-NEXT: %[[cwdCast:.*]] = fir.convert %[[cwdUnbox]]#0 : (!fir.ref<!fir.char<1,?>>) -> !fir.ref<!fir.char<1,255>> + ! CHECK-NEXT: %[[cwdDeclare:.*]] = fir.declare %[[cwdCast]] typeparams %[[c255]] dummy_scope %[[DSCOPE]] {uniq_name = "_QFcwd_onlyEcwd"} : (!fir.ref<!fir.char<1,255>>, index, !fir.dscope) -> !fir.ref<!fir.char<1,255>> + ! CHECK-NEXT: %[[cwdBox:.*]] = fir.embox %[[cwdDeclare]] : (!fir.ref<!fir.char<1,255>>) -> !fir.box<!fir.char<1,255>> + ! CHECK: %[[cwd:.*]] = fir.convert %[[cwdBox]] : (!fir.box<!fir.char<1,255>>) -> !fir.box<none> + ! CHECK: %[[statusValue:.*]] = fir.call @_FortranAGetCwd(%[[cwd]], %[[VAL_7:.*]], %[[c7]]) fastmath<contract> : (!fir.box<none>, !fir.ref<i8>, i32) -> i32 + ! CHECK-NEXT: return +end subroutine cwd_only + +! CHECK-LABEL: func.func @_QPall_arguments( +! CHECK-SAME: %[[cwdArg:.*]]: !fir.boxchar<1> {fir.bindc_name = "cwd"}, +! 
CHECK-SAME: %[[statusArg:.*]]: !fir.ref<i32> {fir.bindc_name = "status"}) { +subroutine all_arguments(cwd, status) + CHARACTER(len=255) :: cwd + INTEGER :: status + call getcwd(cwd, status) + ! CHECK-NEXT: %[[c0:.*]] = arith.constant 0 : i64 + ! CHECK-NEXT: %[[c26:.*]] = arith.constant 26 : i32 + ! CHECK-NEXT: %[[c255:.*]] = arith.constant 255 : index + ! CHECK-NEXT: %[[DSCOPE:.*]] = fir.dummy_scope : !fir.dscope + ! CHECK-NEXT: %[[cwdUnbox:.*]]:2 = fir.unboxchar %[[cwdArg]] : (!fir.boxchar<1>) -> (!fir.ref<!fir.char<1,?>>, index) + ! CHECK-NEXT: %[[cwdCast:.*]] = fir.convert %[[cwdUnbox]]#0 : (!fir.ref<!fir.char<1,?>>) -> !fir.ref<!fir.char<1,255>> + ! CHECK-NEXT: %[[cwdDeclare:.*]] = fir.declare %[[cwdCast]] typeparams %[[c255]] dummy_scope %[[DSCOPE]] {uniq_name = "_QFall_argumentsEcwd"} : (!fir.ref<!fir.char<1,255>>, index, !fir.dscope) -> !fir.ref<!fir.char<1,255>> + ! CHECK-NEXT: %[[statusAddr:.*]] = fir.declare %[[statusArg]] dummy_scope %0 {uniq_name = "_QFall_argumentsEstatus"} : (!fir.ref<i32>, !fir.dscope) -> !fir.ref<i32> + ! CHECK-NEXT: %[[cwdBox:.*]] = fir.embox %[[cwdDeclare]] : (!fir.ref<!fir.char<1,255>>) -> !fir.box<!fir.char<1,255>> + ! CHECK: %[[cwd:.*]] = fir.convert %[[cwdBox]] : (!fir.box<!fir.char<1,255>>) -> !fir.box<none> + ! CHECK: %[[statusValue:.*]] = fir.call @_FortranAGetCwd(%[[cwd]], %[[VAL_8:.*]], %[[c26]]) fastmath<contract> : (!fir.box<none>, !fir.ref<i8>, i32) -> i32 + ! CHECK-NEXT: %[[statusCast:.*]] = fir.convert %[[statusAddr]] : (!fir.ref<i32>) -> i64 + ! CHECK-NEXT: %[[isPresent:.*]] = arith.cmpi ne, %[[statusCast]], %[[c0]] : i64 + ! CHECK-NEXT: fir.if %[[isPresent]] { + ! CHECK-NEXT: fir.store %[[statusValue]] to %[[statusAddr]] : !fir.ref<i32> + ! CHECK-NEXT: } + ! CHECK-NEXT: return +end subroutine all_arguments
\ No newline at end of file diff --git a/flang/test/Lower/OpenMP/critical.f90 b/flang/test/Lower/OpenMP/critical.f90 index d62c58b..c52ae688 100644 --- a/flang/test/Lower/OpenMP/critical.f90 +++ b/flang/test/Lower/OpenMP/critical.f90 @@ -51,3 +51,27 @@ subroutine predetermined_privatization() end do !$omp end parallel do end + +! https://github.com/llvm/llvm-project/issues/75767 +!CHECK-LABEL: func @_QPparallel_critical_privatization( +subroutine parallel_critical_privatization() + integer :: i + + !CHECK: %[[I:.*]] = fir.alloca i32 {bindc_name = "i", uniq_name = "_QFparallel_critical_privatizationEi"} + !CHECK: %[[I_DECL:.*]]:2 = hlfir.declare %[[I]] {uniq_name = "_QFparallel_critical_privatizationEi"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>) + !CHECK: omp.parallel { + !CHECK: %[[PRIV_I:.*]] = fir.alloca i32 {bindc_name = "i", pinned, uniq_name = "_QFparallel_critical_privatizationEi"} + !CHECK: %[[PRIV_I_DECL:.*]]:2 = hlfir.declare %[[PRIV_I]] {uniq_name = "_QFparallel_critical_privatizationEi"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>) + !CHECK: %[[TEMP:.*]] = fir.load %[[I_DECL]]#0 : !fir.ref<i32> + !CHECK: hlfir.assign %[[TEMP]] to %[[PRIV_I_DECL]]#0 temporary_lhs : i32, !fir.ref<i32> + !$omp parallel default(firstprivate) + !CHECK: omp.critical { + !$omp critical + !CHECK: %[[C200:.*]] = arith.constant 200 : i32 + !CHECK: hlfir.assign %[[C200]] to %[[PRIV_I_DECL]]#0 : i32, !fir.ref<i32> + i = 200 + !CHECK: } + !$omp end critical + !CHECK: } + !$omp end parallel +end subroutine diff --git a/flang/test/Lower/OpenMP/map-component-ref.f90 b/flang/test/Lower/OpenMP/map-component-ref.f90 index 2c58266..21b56ab 100644 --- a/flang/test/Lower/OpenMP/map-component-ref.f90 +++ b/flang/test/Lower/OpenMP/map-component-ref.f90 @@ -1,21 +1,22 @@ ! RUN: %flang_fc1 -emit-hlfir -fopenmp %s -o - | FileCheck %s ! RUN: bbc -fopenmp -emit-hlfir %s -o - | FileCheck %s -! 
CHECK: %[[V0:[0-9]+]] = fir.alloca !fir.type<_QFfooTt0{a0:i32,a1:i32}> {bindc_name = "a", uniq_name = "_QFfooEa"} -! CHECK: %[[V1:[0-9]+]]:2 = hlfir.declare %[[V0]] {uniq_name = "_QFfooEa"} : (!fir.ref<!fir.type<_QFfooTt0{a0:i32,a1:i32}>>) -> (!fir.ref<!fir.type<_QFfooTt0{a0:i32,a1:i32}>>, !fir.ref<!fir.type<_QFfooTt0{a0:i32,a1:i32}>>) -! CHECK: %[[V2:[0-9]+]] = hlfir.designate %[[V1]]#0{"a1"} : (!fir.ref<!fir.type<_QFfooTt0{a0:i32,a1:i32}>>) -> !fir.ref<i32> +! CHECK-LABEL: func.func @_QPfoo1 +! CHECK: %[[V0:[0-9]+]] = fir.alloca !fir.type<_QFfoo1Tt0{a0:i32,a1:i32}> {bindc_name = "a", uniq_name = "_QFfoo1Ea"} +! CHECK: %[[V1:[0-9]+]]:2 = hlfir.declare %[[V0]] {uniq_name = "_QFfoo1Ea"} : (!fir.ref<!fir.type<_QFfoo1Tt0{a0:i32,a1:i32}>>) -> (!fir.ref<!fir.type<_QFfoo1Tt0{a0:i32,a1:i32}>>, !fir.ref<!fir.type<_QFfoo1Tt0{a0:i32,a1:i32}>>) +! CHECK: %[[V2:[0-9]+]] = hlfir.designate %[[V1]]#0{"a1"} : (!fir.ref<!fir.type<_QFfoo1Tt0{a0:i32,a1:i32}>>) -> !fir.ref<i32> ! CHECK: %[[V3:[0-9]+]] = omp.map.info var_ptr(%[[V2]] : !fir.ref<i32>, i32) map_clauses(tofrom) capture(ByRef) -> !fir.ref<i32> {name = "a%a1"} -! CHECK: %[[V4:[0-9]+]] = omp.map.info var_ptr(%[[V1]]#1 : !fir.ref<!fir.type<_QFfooTt0{a0:i32,a1:i32}>>, !fir.type<_QFfooTt0{a0:i32,a1:i32}>) map_clauses(tofrom) capture(ByRef) members(%[[V3]] : [1] : !fir.ref<i32>) -> !fir.ref<!fir.type<_QFfooTt0{a0:i32,a1:i32}>> {name = "a", partial_map = true} -! CHECK: omp.target map_entries(%[[V3]] -> %arg0, %[[V4]] -> %arg1 : !fir.ref<i32>, !fir.ref<!fir.type<_QFfooTt0{a0:i32,a1:i32}>>) { -! CHECK: ^bb0(%arg0: !fir.ref<i32>, %arg1: !fir.ref<!fir.type<_QFfooTt0{a0:i32,a1:i32}>>): -! CHECK: %[[V5:[0-9]+]]:2 = hlfir.declare %arg1 {uniq_name = "_QFfooEa"} : (!fir.ref<!fir.type<_QFfooTt0{a0:i32,a1:i32}>>) -> (!fir.ref<!fir.type<_QFfooTt0{a0:i32,a1:i32}>>, !fir.ref<!fir.type<_QFfooTt0{a0:i32,a1:i32}>>) +! 
CHECK: %[[V4:[0-9]+]] = omp.map.info var_ptr(%[[V1]]#1 : !fir.ref<!fir.type<_QFfoo1Tt0{a0:i32,a1:i32}>>, !fir.type<_QFfoo1Tt0{a0:i32,a1:i32}>) map_clauses(tofrom) capture(ByRef) members(%[[V3]] : [1] : !fir.ref<i32>) -> !fir.ref<!fir.type<_QFfoo1Tt0{a0:i32,a1:i32}>> {name = "a", partial_map = true} +! CHECK: omp.target map_entries(%[[V3]] -> %arg0, %[[V4]] -> %arg1 : !fir.ref<i32>, !fir.ref<!fir.type<_QFfoo1Tt0{a0:i32,a1:i32}>>) { +! CHECK: ^bb0(%arg0: !fir.ref<i32>, %arg1: !fir.ref<!fir.type<_QFfoo1Tt0{a0:i32,a1:i32}>>): +! CHECK: %[[V5:[0-9]+]]:2 = hlfir.declare %arg1 {uniq_name = "_QFfoo1Ea"} : (!fir.ref<!fir.type<_QFfoo1Tt0{a0:i32,a1:i32}>>) -> (!fir.ref<!fir.type<_QFfoo1Tt0{a0:i32,a1:i32}>>, !fir.ref<!fir.type<_QFfoo1Tt0{a0:i32,a1:i32}>>) ! CHECK: %c0_i32 = arith.constant 0 : i32 -! CHECK: %[[V6:[0-9]+]] = hlfir.designate %[[V5]]#0{"a1"} : (!fir.ref<!fir.type<_QFfooTt0{a0:i32,a1:i32}>>) -> !fir.ref<i32> +! CHECK: %[[V6:[0-9]+]] = hlfir.designate %[[V5]]#0{"a1"} : (!fir.ref<!fir.type<_QFfoo1Tt0{a0:i32,a1:i32}>>) -> !fir.ref<i32> ! CHECK: hlfir.assign %c0_i32 to %[[V6]] : i32, !fir.ref<i32> ! CHECK: omp.terminator ! CHECK: } -subroutine foo() +subroutine foo1() implicit none type t0 @@ -29,3 +30,25 @@ subroutine foo() !$omp end target end + +! CHECK-LABEL: func.func @_QPfoo2 +! CHECK-DAG: omp.map.info var_ptr(%{{[0-9]+}} : {{.*}} map_clauses(to) capture(ByRef) bounds(%{{[0-9]+}}) -> {{.*}} {name = "t%b(1_8)%a(1)"} +! 
CHECK-DAG: omp.map.info var_ptr(%{{[0-9]+}} : {{.*}} map_clauses(from) capture(ByRef) bounds(%{{[0-9]+}}) -> {{.*}} {name = "u%b(1_8)%a(1)"} +subroutine foo2() + implicit none + + type t0 + integer :: a(10) + end type + + type t1 + type(t0) :: b(10) + end type + + type(t1) :: t, u + +!$omp target map(to: t%b(1)%a(1)) map(from: u%b(1)%a(1)) + t%b(1)%a(1) = u%b(1)%a(1) +!$omp end target + +end diff --git a/flang/test/Semantics/getcwd.f90 b/flang/test/Semantics/getcwd.f90 new file mode 100644 index 0000000..b6ff16e --- /dev/null +++ b/flang/test/Semantics/getcwd.f90 @@ -0,0 +1,35 @@ +! RUN: %python %S/test_errors.py %s %flang_fc1 -pedantic +! Tests for the GETCWD intrinsics + +subroutine bad_kind_error(cwd, status) + CHARACTER(len=255) :: cwd + INTEGER(2) :: status + !ERROR: Actual argument for 'status=' has bad type or kind 'INTEGER(2)' + call getcwd(cwd, status) +end subroutine bad_kind_error + +subroutine bad_args_error() + !ERROR: missing mandatory 'c=' argument + call getcwd() +end subroutine bad_args_error + +subroutine bad_apply_form(cwd) + CHARACTER(len=255) :: cwd + INTEGER :: status + !Declaration of 'getcwd' + call getcwd(cwd, status) + !ERROR: Cannot call subroutine 'getcwd' like a function + status = getcwd(cwd) +end subroutine bad_apply_form + +subroutine good_subroutine(cwd, status) + CHARACTER(len=255) :: cwd + INTEGER :: status + call getcwd(cwd, status) +end subroutine good_subroutine + +subroutine good_function(cwd, status) + CHARACTER(len=255) :: cwd + INTEGER :: status + status = getcwd(cwd) +end subroutine good_function
\ No newline at end of file diff --git a/libc/config/gpu/entrypoints.txt b/libc/config/gpu/entrypoints.txt index b678350..2217a69 100644 --- a/libc/config/gpu/entrypoints.txt +++ b/libc/config/gpu/entrypoints.txt @@ -1,3 +1,13 @@ +if(LIBC_TARGET_ARCHITECTURE_IS_AMDGPU) + set(extra_entrypoints + # stdio.h entrypoints + libc.src.stdio.sprintf + libc.src.stdio.snprintf + libc.src.stdio.vsprintf + libc.src.stdio.vsnprintf + ) +endif() + set(TARGET_LIBC_ENTRYPOINTS # assert.h entrypoints libc.src.assert.__assert_fail @@ -175,6 +185,7 @@ set(TARGET_LIBC_ENTRYPOINTS libc.src.errno.errno # stdio.h entrypoints + ${extra_entrypoints} libc.src.stdio.feof libc.src.stdio.ferror libc.src.stdio.fseek diff --git a/libc/config/linux/aarch64/entrypoints.txt b/libc/config/linux/aarch64/entrypoints.txt index c990a5b..33ecff8 100644 --- a/libc/config/linux/aarch64/entrypoints.txt +++ b/libc/config/linux/aarch64/entrypoints.txt @@ -505,6 +505,16 @@ if(LIBC_TYPES_HAS_FLOAT16) libc.src.math.fabsf16 libc.src.math.fdimf16 libc.src.math.floorf16 + libc.src.math.fmaxf16 + libc.src.math.fmaximumf16 + libc.src.math.fmaximum_magf16 + libc.src.math.fmaximum_mag_numf16 + libc.src.math.fmaximum_numf16 + libc.src.math.fminf16 + libc.src.math.fminimumf16 + libc.src.math.fminimum_magf16 + libc.src.math.fminimum_mag_numf16 + libc.src.math.fminimum_numf16 libc.src.math.fromfpf16 libc.src.math.fromfpxf16 libc.src.math.llrintf16 @@ -512,6 +522,13 @@ if(LIBC_TYPES_HAS_FLOAT16) libc.src.math.lrintf16 libc.src.math.lroundf16 libc.src.math.nearbyintf16 + libc.src.math.nextafterf16 + libc.src.math.nextdownf16 + # Temporarily disable nexttowardf16 on aarch64 because the conversion + # between _Float16 and long double will crash clang-11. 
This is fixed in + # clang-12 and after: https://godbolt.org/z/8ceT9454c + # libc.src.math.nexttowardf16 + libc.src.math.nextupf16 libc.src.math.rintf16 libc.src.math.roundf16 libc.src.math.roundevenf16 diff --git a/libc/config/linux/x86_64/entrypoints.txt b/libc/config/linux/x86_64/entrypoints.txt index 780ffb6..e3ca544 100644 --- a/libc/config/linux/x86_64/entrypoints.txt +++ b/libc/config/linux/x86_64/entrypoints.txt @@ -173,7 +173,6 @@ set(TARGET_LIBC_ENTRYPOINTS libc.src.stdlib.atoll libc.src.stdlib.bsearch libc.src.stdlib.div - libc.src.stdlib.quick_exit libc.src.stdlib.labs libc.src.stdlib.ldiv libc.src.stdlib.llabs @@ -538,6 +537,16 @@ if(LIBC_TYPES_HAS_FLOAT16) libc.src.math.fabsf16 libc.src.math.fdimf16 libc.src.math.floorf16 + libc.src.math.fmaxf16 + libc.src.math.fmaximumf16 + libc.src.math.fmaximum_magf16 + libc.src.math.fmaximum_mag_numf16 + libc.src.math.fmaximum_numf16 + libc.src.math.fminf16 + libc.src.math.fminimumf16 + libc.src.math.fminimum_magf16 + libc.src.math.fminimum_mag_numf16 + libc.src.math.fminimum_numf16 libc.src.math.fromfpf16 libc.src.math.fromfpxf16 libc.src.math.llrintf16 @@ -545,6 +554,10 @@ if(LIBC_TYPES_HAS_FLOAT16) libc.src.math.lrintf16 libc.src.math.lroundf16 libc.src.math.nearbyintf16 + libc.src.math.nextafterf16 + libc.src.math.nextdownf16 + libc.src.math.nexttowardf16 + libc.src.math.nextupf16 libc.src.math.rintf16 libc.src.math.roundf16 libc.src.math.roundevenf16 @@ -756,9 +769,11 @@ if(LLVM_LIBC_FULL_BUILD) # stdlib.h entrypoints libc.src.stdlib._Exit libc.src.stdlib.abort + libc.src.stdlib.at_quick_exit libc.src.stdlib.atexit libc.src.stdlib.exit libc.src.stdlib.getenv + libc.src.stdlib.quick_exit # signal.h entrypoints libc.src.signal.raise diff --git a/libc/docs/c23.rst b/libc/docs/c23.rst index 5bbb056..71232cc 100644 --- a/libc/docs/c23.rst +++ b/libc/docs/c23.rst @@ -59,15 +59,17 @@ Additions: * ufromfp* |check| * fromfpx* |check| * ufromfpx* |check| - * nextup* - * nextdown* + * nextup* |check| + * nextdown* 
|check| * canonicalize* |check| - * fmaximum* - * fminimum* - * fmaximum_mag* - * fminimum_mag* - * fmaximum_mag_num* - * fminimum_mag_num* + * fmaximum* |check| + * fminimum* |check| + * fmaximum_mag* |check| + * fminimum_mag* |check| + * fmaximum_num* |check| + * fminimum_num* |check| + * fmaximum_mag_num* |check| + * fminimum_mag_num* |check| * fadd* * fsub* * fmul* diff --git a/libc/docs/math/index.rst b/libc/docs/math/index.rst index fd75374..b9507f0 100644 --- a/libc/docs/math/index.rst +++ b/libc/docs/math/index.rst @@ -136,25 +136,25 @@ Basic Operations +------------------+------------------+-----------------+------------------------+----------------------+------------------------+------------------------+----------------------------+ | floor | |check| | |check| | |check| | |check| | |check| | 7.12.9.2 | F.10.6.2 | +------------------+------------------+-----------------+------------------------+----------------------+------------------------+------------------------+----------------------------+ -| fmax | |check| | |check| | |check| | | |check| | 7.12.12.2 | F.10.9.2 | +| fmax | |check| | |check| | |check| | |check| | |check| | 7.12.12.2 | F.10.9.2 | +------------------+------------------+-----------------+------------------------+----------------------+------------------------+------------------------+----------------------------+ -| fmaximum | |check| | |check| | |check| | | |check| | 7.12.12.4 | F.10.9.4 | +| fmaximum | |check| | |check| | |check| | |check| | |check| | 7.12.12.4 | F.10.9.4 | +------------------+------------------+-----------------+------------------------+----------------------+------------------------+------------------------+----------------------------+ -| fmaximum_mag | |check| | |check| | |check| | | |check| | 7.12.12.6 | F.10.9.4 | +| fmaximum_mag | |check| | |check| | |check| | |check| | |check| | 7.12.12.6 | F.10.9.4 | 
+------------------+------------------+-----------------+------------------------+----------------------+------------------------+------------------------+----------------------------+ -| fmaximum_mag_num | |check| | |check| | |check| | | |check| | 7.12.12.10 | F.10.9.5 | +| fmaximum_mag_num | |check| | |check| | |check| | |check| | |check| | 7.12.12.10 | F.10.9.5 | +------------------+------------------+-----------------+------------------------+----------------------+------------------------+------------------------+----------------------------+ -| fmaximum_num | |check| | |check| | |check| | | |check| | 7.12.12.8 | F.10.9.5 | +| fmaximum_num | |check| | |check| | |check| | |check| | |check| | 7.12.12.8 | F.10.9.5 | +------------------+------------------+-----------------+------------------------+----------------------+------------------------+------------------------+----------------------------+ -| fmin | |check| | |check| | |check| | | |check| | 7.12.12.3 | F.10.9.3 | +| fmin | |check| | |check| | |check| | |check| | |check| | 7.12.12.3 | F.10.9.3 | +------------------+------------------+-----------------+------------------------+----------------------+------------------------+------------------------+----------------------------+ -| fminimum | |check| | |check| | |check| | | |check| | 7.12.12.5 | F.10.9.4 | +| fminimum | |check| | |check| | |check| | |check| | |check| | 7.12.12.5 | F.10.9.4 | +------------------+------------------+-----------------+------------------------+----------------------+------------------------+------------------------+----------------------------+ -| fminimum_mag | |check| | |check| | |check| | | |check| | 7.12.12.7 | F.10.9.4 | +| fminimum_mag | |check| | |check| | |check| | |check| | |check| | 7.12.12.7 | F.10.9.4 | +------------------+------------------+-----------------+------------------------+----------------------+------------------------+------------------------+----------------------------+ -| fminimum_mag_num | |check| | 
|check| | |check| | | |check| | 7.12.12.11 | F.10.9.5 | +| fminimum_mag_num | |check| | |check| | |check| | |check| | |check| | 7.12.12.11 | F.10.9.5 | +------------------+------------------+-----------------+------------------------+----------------------+------------------------+------------------------+----------------------------+ -| fminimum_num | |check| | |check| | |check| | | |check| | 7.12.12.9 | F.10.9.5 | +| fminimum_num | |check| | |check| | |check| | |check| | |check| | 7.12.12.9 | F.10.9.5 | +------------------+------------------+-----------------+------------------------+----------------------+------------------------+------------------------+----------------------------+ | fmod | |check| | |check| | |check| | | |check| | 7.12.10.1 | F.10.7.1 | +------------------+------------------+-----------------+------------------------+----------------------+------------------------+------------------------+----------------------------+ @@ -190,13 +190,13 @@ Basic Operations +------------------+------------------+-----------------+------------------------+----------------------+------------------------+------------------------+----------------------------+ | nearbyint | |check| | |check| | |check| | |check| | |check| | 7.12.9.3 | F.10.6.3 | +------------------+------------------+-----------------+------------------------+----------------------+------------------------+------------------------+----------------------------+ -| nextafter | |check| | |check| | |check| | | |check| | 7.12.11.3 | F.10.8.3 | +| nextafter | |check| | |check| | |check| | |check| | |check| | 7.12.11.3 | F.10.8.3 | +------------------+------------------+-----------------+------------------------+----------------------+------------------------+------------------------+----------------------------+ -| nextdown | |check| | |check| | |check| | | |check| | 7.12.11.6 | F.10.8.6 | +| nextdown | |check| | |check| | |check| | |check| | |check| | 7.12.11.6 | F.10.8.6 | 
+------------------+------------------+-----------------+------------------------+----------------------+------------------------+------------------------+----------------------------+ -| nexttoward | |check| | |check| | |check| | | N/A | 7.12.11.4 | F.10.8.4 | +| nexttoward | |check| | |check| | |check| | |check| | N/A | 7.12.11.4 | F.10.8.4 | +------------------+------------------+-----------------+------------------------+----------------------+------------------------+------------------------+----------------------------+ -| nextup | |check| | |check| | |check| | | |check| | 7.12.11.5 | F.10.8.5 | +| nextup | |check| | |check| | |check| | |check| | |check| | 7.12.11.5 | F.10.8.5 | +------------------+------------------+-----------------+------------------------+----------------------+------------------------+------------------------+----------------------------+ | remainder | |check| | |check| | |check| | | | 7.12.10.2 | F.10.7.2 | +------------------+------------------+-----------------+------------------------+----------------------+------------------------+------------------------+----------------------------+ diff --git a/libc/hdr/types/CMakeLists.txt b/libc/hdr/types/CMakeLists.txt index 8e87642..9b3373a 100644 --- a/libc/hdr/types/CMakeLists.txt +++ b/libc/hdr/types/CMakeLists.txt @@ -117,3 +117,12 @@ add_proxy_header_library( libc.include.llvm-libc-types.pid_t libc.include.sys_types ) + +add_proxy_header_library( + atexithandler_t + HDRS + atexithandler_t.h + FULL_BUILD_DEPENDS + libc.include.llvm-libc-types.atexithandler_t + libc.include.stdlib +) diff --git a/libc/hdr/types/atexithandler_t.h b/libc/hdr/types/atexithandler_t.h new file mode 100644 index 0000000..4275e44 --- /dev/null +++ b/libc/hdr/types/atexithandler_t.h @@ -0,0 +1,22 @@ +//===-- Definition of macros from atexithandler_t.h -----------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 
+// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIBC_HDR_ATEXITHANDLER_T_H +#define LLVM_LIBC_HDR_ATEXITHANDLER_T_H + +#ifdef LIBC_FULL_BUILD + +#include "include/llvm-libc-types/__atexithandler_t.h" + +#else // overlay mode + +#error // type not available in overlay mode + +#endif // LLVM_LIBC_FULL_BUILD + +#endif // LLVM_LIBC_HDR_ATEXITHANDLER_T_H diff --git a/libc/spec/stdc.td b/libc/spec/stdc.td index 0aadeb1..9a436c8 100644 --- a/libc/spec/stdc.td +++ b/libc/spec/stdc.td @@ -416,50 +416,60 @@ def StdC : StandardSpec<"stdc"> { FunctionSpec<"fminf", RetValSpec<FloatType>, [ArgSpec<FloatType>, ArgSpec<FloatType>]>, FunctionSpec<"fminl", RetValSpec<LongDoubleType>, [ArgSpec<LongDoubleType>, ArgSpec<LongDoubleType>]>, GuardedFunctionSpec<"fminf128", RetValSpec<Float128Type>, [ArgSpec<Float128Type>, ArgSpec<Float128Type>], "LIBC_TYPES_HAS_FLOAT128">, + GuardedFunctionSpec<"fminf16", RetValSpec<Float16Type>, [ArgSpec<Float16Type>, ArgSpec<Float16Type>], "LIBC_TYPES_HAS_FLOAT16">, FunctionSpec<"fmax", RetValSpec<DoubleType>, [ArgSpec<DoubleType>, ArgSpec<DoubleType>]>, FunctionSpec<"fmaxf", RetValSpec<FloatType>, [ArgSpec<FloatType>, ArgSpec<FloatType>]>, FunctionSpec<"fmaxl", RetValSpec<LongDoubleType>, [ArgSpec<LongDoubleType>, ArgSpec<LongDoubleType>]>, GuardedFunctionSpec<"fmaxf128", RetValSpec<Float128Type>, [ArgSpec<Float128Type>, ArgSpec<Float128Type>], "LIBC_TYPES_HAS_FLOAT128">, + GuardedFunctionSpec<"fmaxf16", RetValSpec<Float16Type>, [ArgSpec<Float16Type>, ArgSpec<Float16Type>], "LIBC_TYPES_HAS_FLOAT16">, FunctionSpec<"fmaximum", RetValSpec<DoubleType>, [ArgSpec<DoubleType>, ArgSpec<DoubleType>]>, FunctionSpec<"fmaximumf", RetValSpec<FloatType>, [ArgSpec<FloatType>, ArgSpec<FloatType>]>, FunctionSpec<"fmaximuml", RetValSpec<LongDoubleType>, [ArgSpec<LongDoubleType>, 
ArgSpec<LongDoubleType>]>, + GuardedFunctionSpec<"fmaximumf16", RetValSpec<Float16Type>, [ArgSpec<Float16Type>, ArgSpec<Float16Type>], "LIBC_TYPES_HAS_FLOAT16">, GuardedFunctionSpec<"fmaximumf128", RetValSpec<Float128Type>, [ArgSpec<Float128Type>, ArgSpec<Float128Type>], "LIBC_TYPES_HAS_FLOAT128">, FunctionSpec<"fmaximum_num", RetValSpec<DoubleType>, [ArgSpec<DoubleType>, ArgSpec<DoubleType>]>, FunctionSpec<"fmaximum_numf", RetValSpec<FloatType>, [ArgSpec<FloatType>, ArgSpec<FloatType>]>, FunctionSpec<"fmaximum_numl", RetValSpec<LongDoubleType>, [ArgSpec<LongDoubleType>, ArgSpec<LongDoubleType>]>, + GuardedFunctionSpec<"fmaximum_numf16", RetValSpec<Float16Type>, [ArgSpec<Float16Type>, ArgSpec<Float16Type>], "LIBC_TYPES_HAS_FLOAT16">, GuardedFunctionSpec<"fmaximum_numf128", RetValSpec<Float128Type>, [ArgSpec<Float128Type>, ArgSpec<Float128Type>], "LIBC_TYPES_HAS_FLOAT128">, FunctionSpec<"fmaximum_mag", RetValSpec<DoubleType>, [ArgSpec<DoubleType>, ArgSpec<DoubleType>]>, FunctionSpec<"fmaximum_magf", RetValSpec<FloatType>, [ArgSpec<FloatType>, ArgSpec<FloatType>]>, FunctionSpec<"fmaximum_magl", RetValSpec<LongDoubleType>, [ArgSpec<LongDoubleType>, ArgSpec<LongDoubleType>]>, + GuardedFunctionSpec<"fmaximum_magf16", RetValSpec<Float16Type>, [ArgSpec<Float16Type>, ArgSpec<Float16Type>], "LIBC_TYPES_HAS_FLOAT16">, GuardedFunctionSpec<"fmaximum_magf128", RetValSpec<Float128Type>, [ArgSpec<Float128Type>, ArgSpec<Float128Type>], "LIBC_TYPES_HAS_FLOAT128">, FunctionSpec<"fmaximum_mag_num", RetValSpec<DoubleType>, [ArgSpec<DoubleType>, ArgSpec<DoubleType>]>, FunctionSpec<"fmaximum_mag_numf", RetValSpec<FloatType>, [ArgSpec<FloatType>, ArgSpec<FloatType>]>, FunctionSpec<"fmaximum_mag_numl", RetValSpec<LongDoubleType>, [ArgSpec<LongDoubleType>, ArgSpec<LongDoubleType>]>, + GuardedFunctionSpec<"fmaximum_mag_numf16", RetValSpec<Float16Type>, [ArgSpec<Float16Type>, ArgSpec<Float16Type>], "LIBC_TYPES_HAS_FLOAT16">, GuardedFunctionSpec<"fmaximum_mag_numf128", 
RetValSpec<Float128Type>, [ArgSpec<Float128Type>, ArgSpec<Float128Type>], "LIBC_TYPES_HAS_FLOAT128">, FunctionSpec<"fminimum", RetValSpec<DoubleType>, [ArgSpec<DoubleType>, ArgSpec<DoubleType>]>, FunctionSpec<"fminimumf", RetValSpec<FloatType>, [ArgSpec<FloatType>, ArgSpec<FloatType>]>, FunctionSpec<"fminimuml", RetValSpec<LongDoubleType>, [ArgSpec<LongDoubleType>, ArgSpec<LongDoubleType>]>, + GuardedFunctionSpec<"fminimumf16", RetValSpec<Float16Type>, [ArgSpec<Float16Type>, ArgSpec<Float16Type>], "LIBC_TYPES_HAS_FLOAT16">, GuardedFunctionSpec<"fminimumf128", RetValSpec<Float128Type>, [ArgSpec<Float128Type>, ArgSpec<Float128Type>], "LIBC_TYPES_HAS_FLOAT128">, FunctionSpec<"fminimum_num", RetValSpec<DoubleType>, [ArgSpec<DoubleType>, ArgSpec<DoubleType>]>, FunctionSpec<"fminimum_numf", RetValSpec<FloatType>, [ArgSpec<FloatType>, ArgSpec<FloatType>]>, FunctionSpec<"fmaximum_numl", RetValSpec<LongDoubleType>, [ArgSpec<LongDoubleType>, ArgSpec<LongDoubleType>]>, + GuardedFunctionSpec<"fminimum_numf16", RetValSpec<Float16Type>, [ArgSpec<Float16Type>, ArgSpec<Float16Type>], "LIBC_TYPES_HAS_FLOAT16">, GuardedFunctionSpec<"fminimum_numf128", RetValSpec<Float128Type>, [ArgSpec<Float128Type>, ArgSpec<Float128Type>], "LIBC_TYPES_HAS_FLOAT128">, FunctionSpec<"fminimum_mag", RetValSpec<DoubleType>, [ArgSpec<DoubleType>, ArgSpec<DoubleType>]>, FunctionSpec<"fminimum_magf", RetValSpec<FloatType>, [ArgSpec<FloatType>, ArgSpec<FloatType>]>, FunctionSpec<"fminimum_magl", RetValSpec<LongDoubleType>, [ArgSpec<LongDoubleType>, ArgSpec<LongDoubleType>]>, + GuardedFunctionSpec<"fminimum_magf16", RetValSpec<Float16Type>, [ArgSpec<Float16Type>, ArgSpec<Float16Type>], "LIBC_TYPES_HAS_FLOAT16">, GuardedFunctionSpec<"fminimum_magf128", RetValSpec<Float128Type>, [ArgSpec<Float128Type>, ArgSpec<Float128Type>], "LIBC_TYPES_HAS_FLOAT128">, FunctionSpec<"fminimum_mag_num", RetValSpec<DoubleType>, [ArgSpec<DoubleType>, ArgSpec<DoubleType>]>, FunctionSpec<"fminimum_mag_numf", RetValSpec<FloatType>, 
[ArgSpec<FloatType>, ArgSpec<FloatType>]>, FunctionSpec<"fminimum_mag_numl", RetValSpec<LongDoubleType>, [ArgSpec<LongDoubleType>, ArgSpec<LongDoubleType>]>, + GuardedFunctionSpec<"fminimum_mag_numf16", RetValSpec<Float16Type>, [ArgSpec<Float16Type>, ArgSpec<Float16Type>], "LIBC_TYPES_HAS_FLOAT16">, GuardedFunctionSpec<"fminimum_mag_numf128", RetValSpec<Float128Type>, [ArgSpec<Float128Type>, ArgSpec<Float128Type>], "LIBC_TYPES_HAS_FLOAT128">, FunctionSpec<"fma", RetValSpec<DoubleType>, [ArgSpec<DoubleType>, ArgSpec<DoubleType>, ArgSpec<DoubleType>]>, @@ -632,20 +642,24 @@ def StdC : StandardSpec<"stdc"> { FunctionSpec<"nextafterf", RetValSpec<FloatType>, [ArgSpec<FloatType>, ArgSpec<FloatType>]>, FunctionSpec<"nextafter", RetValSpec<DoubleType>, [ArgSpec<DoubleType>, ArgSpec<DoubleType>]>, FunctionSpec<"nextafterl", RetValSpec<LongDoubleType>, [ArgSpec<LongDoubleType>, ArgSpec<LongDoubleType>]>, + GuardedFunctionSpec<"nextafterf16", RetValSpec<Float16Type>, [ArgSpec<Float16Type>, ArgSpec<Float16Type>], "LIBC_TYPES_HAS_FLOAT16">, GuardedFunctionSpec<"nextafterf128", RetValSpec<Float128Type>, [ArgSpec<Float128Type>, ArgSpec<Float128Type>], "LIBC_TYPES_HAS_FLOAT128">, FunctionSpec<"nexttowardf", RetValSpec<FloatType>, [ArgSpec<FloatType>, ArgSpec<LongDoubleType>]>, FunctionSpec<"nexttoward", RetValSpec<DoubleType>, [ArgSpec<DoubleType>, ArgSpec<LongDoubleType>]>, FunctionSpec<"nexttowardl", RetValSpec<LongDoubleType>, [ArgSpec<LongDoubleType>, ArgSpec<LongDoubleType>]>, + GuardedFunctionSpec<"nexttowardf16", RetValSpec<Float16Type>, [ArgSpec<Float16Type>, ArgSpec<LongDoubleType>], "LIBC_TYPES_HAS_FLOAT16">, FunctionSpec<"nextdown", RetValSpec<DoubleType>, [ArgSpec<DoubleType>]>, FunctionSpec<"nextdownf", RetValSpec<FloatType>, [ArgSpec<FloatType>]>, FunctionSpec<"nextdownl", RetValSpec<LongDoubleType>, [ArgSpec<LongDoubleType>]>, + GuardedFunctionSpec<"nextdownf16", RetValSpec<Float16Type>, [ArgSpec<Float16Type>], "LIBC_TYPES_HAS_FLOAT16">,
GuardedFunctionSpec<"nextdownf128", RetValSpec<Float128Type>, [ArgSpec<Float128Type>], "LIBC_TYPES_HAS_FLOAT128">, FunctionSpec<"nextup", RetValSpec<DoubleType>, [ArgSpec<DoubleType>]>, FunctionSpec<"nextupf", RetValSpec<FloatType>, [ArgSpec<FloatType>]>, FunctionSpec<"nextupl", RetValSpec<LongDoubleType>, [ArgSpec<LongDoubleType>]>, + GuardedFunctionSpec<"nextupf16", RetValSpec<Float16Type>, [ArgSpec<Float16Type>], "LIBC_TYPES_HAS_FLOAT16">, GuardedFunctionSpec<"nextupf128", RetValSpec<Float128Type>, [ArgSpec<Float128Type>], "LIBC_TYPES_HAS_FLOAT128">, FunctionSpec<"powf", RetValSpec<FloatType>, [ArgSpec<FloatType>, ArgSpec<FloatType>]>, @@ -1093,8 +1107,9 @@ def StdC : StandardSpec<"stdc"> { FunctionSpec<"free", RetValSpec<VoidType>, [ArgSpec<VoidPtr>]>, FunctionSpec<"_Exit", RetValSpec<NoReturn>, [ArgSpec<IntType>]>, - FunctionSpec<"exit", RetValSpec<NoReturn>, [ArgSpec<IntType>]>, + FunctionSpec<"at_quick_exit", RetValSpec<IntType>, [ArgSpec<AtexitHandlerT>]>, FunctionSpec<"atexit", RetValSpec<IntType>, [ArgSpec<AtexitHandlerT>]>, + FunctionSpec<"exit", RetValSpec<NoReturn>, [ArgSpec<IntType>]>, FunctionSpec<"quick_exit", RetValSpec<NoReturn>, [ArgSpec<IntType>]>, ] >; diff --git a/libc/src/__support/CPP/CMakeLists.txt b/libc/src/__support/CPP/CMakeLists.txt index 08661ab..e6f58b7 100644 --- a/libc/src/__support/CPP/CMakeLists.txt +++ b/libc/src/__support/CPP/CMakeLists.txt @@ -111,6 +111,7 @@ add_header_library( type_traits/add_lvalue_reference.h type_traits/add_pointer.h type_traits/add_rvalue_reference.h + type_traits/aligned_storage.h type_traits/always_false.h type_traits/bool_constant.h type_traits/conditional.h diff --git a/libc/src/__support/CPP/type_traits.h b/libc/src/__support/CPP/type_traits.h index 1494aeb..d50b661 100644 --- a/libc/src/__support/CPP/type_traits.h +++ b/libc/src/__support/CPP/type_traits.h @@ -12,6 +12,7 @@ #include "src/__support/CPP/type_traits/add_lvalue_reference.h" #include "src/__support/CPP/type_traits/add_pointer.h" 
#include "src/__support/CPP/type_traits/add_rvalue_reference.h" +#include "src/__support/CPP/type_traits/aligned_storage.h" #include "src/__support/CPP/type_traits/bool_constant.h" #include "src/__support/CPP/type_traits/conditional.h" #include "src/__support/CPP/type_traits/decay.h" diff --git a/libc/src/__support/CPP/type_traits/aligned_storage.h b/libc/src/__support/CPP/type_traits/aligned_storage.h new file mode 100644 index 0000000..574b114 --- /dev/null +++ b/libc/src/__support/CPP/type_traits/aligned_storage.h @@ -0,0 +1,27 @@ +//===-- aligned_storage type_traits --------------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIBC_SRC___SUPPORT_CPP_TYPE_TRAITS_ALIGNED_STORAGE_H +#define LLVM_LIBC_SRC___SUPPORT_CPP_TYPE_TRAITS_ALIGNED_STORAGE_H + +#include <stddef.h> // size_t + +namespace LIBC_NAMESPACE::cpp { + +template <size_t Len, size_t Align> struct aligned_storage { + struct type { + alignas(Align) unsigned char data[Len]; + }; +}; + +template <size_t Len, size_t Align> +using aligned_storage_t = typename aligned_storage<Len, Align>::type; + +} // namespace LIBC_NAMESPACE::cpp + +#endif // LLVM_LIBC_SRC___SUPPORT_CPP_TYPE_TRAITS_ALIGNED_STORAGE_H diff --git a/libc/src/__support/fixedvector.h b/libc/src/__support/fixedvector.h index 6aeb4d5..ddd0993 100644 --- a/libc/src/__support/fixedvector.h +++ b/libc/src/__support/fixedvector.h @@ -24,6 +24,17 @@ template <typename T, size_t CAPACITY> class FixedVector { public: constexpr FixedVector() = default; + using iterator = typename cpp::array<T, CAPACITY>::iterator; + constexpr FixedVector(iterator begin, iterator end) { + for (; begin != end; ++begin) + push_back(*begin); + } + + constexpr FixedVector(size_t count, const 
T &value) { + for (size_t i = 0; i < count; ++i) + push_back(value); + } + bool push_back(const T &obj) { if (item_count == CAPACITY) return false; @@ -43,8 +54,14 @@ public: return true; } + T &operator[](size_t idx) { return store[idx]; } + + const T &operator[](size_t idx) const { return store[idx]; } + bool empty() const { return item_count == 0; } + size_t size() const { return item_count; } + // Empties the store for all practical purposes. void reset() { item_count = 0; } @@ -64,7 +81,6 @@ public: } LIBC_INLINE constexpr reverse_iterator rend() { return store.rend(); } - using iterator = typename cpp::array<T, CAPACITY>::iterator; LIBC_INLINE constexpr iterator begin() { return store.begin(); } LIBC_INLINE constexpr iterator end() { return iterator{&store[item_count]}; } }; diff --git a/libc/src/math/CMakeLists.txt b/libc/src/math/CMakeLists.txt index 5ae03b1..7a349dd 100644 --- a/libc/src/math/CMakeLists.txt +++ b/libc/src/math/CMakeLists.txt @@ -124,50 +124,60 @@ add_math_entrypoint_object(fmax) add_math_entrypoint_object(fmaxf) add_math_entrypoint_object(fmaxl) add_math_entrypoint_object(fmaxf128) +add_math_entrypoint_object(fmaxf16) add_math_entrypoint_object(fmin) add_math_entrypoint_object(fminf) add_math_entrypoint_object(fminl) add_math_entrypoint_object(fminf128) +add_math_entrypoint_object(fminf16) add_math_entrypoint_object(fmaximum) add_math_entrypoint_object(fmaximumf) add_math_entrypoint_object(fmaximuml) +add_math_entrypoint_object(fmaximumf16) add_math_entrypoint_object(fmaximumf128) add_math_entrypoint_object(fmaximum_num) add_math_entrypoint_object(fmaximum_numf) add_math_entrypoint_object(fmaximum_numl) +add_math_entrypoint_object(fmaximum_numf16) add_math_entrypoint_object(fmaximum_numf128) add_math_entrypoint_object(fmaximum_mag) add_math_entrypoint_object(fmaximum_magf) add_math_entrypoint_object(fmaximum_magl) +add_math_entrypoint_object(fmaximum_magf16) add_math_entrypoint_object(fmaximum_magf128) 
add_math_entrypoint_object(fmaximum_mag_num) add_math_entrypoint_object(fmaximum_mag_numf) add_math_entrypoint_object(fmaximum_mag_numl) +add_math_entrypoint_object(fmaximum_mag_numf16) add_math_entrypoint_object(fmaximum_mag_numf128) add_math_entrypoint_object(fminimum) add_math_entrypoint_object(fminimumf) add_math_entrypoint_object(fminimuml) +add_math_entrypoint_object(fminimumf16) add_math_entrypoint_object(fminimumf128) add_math_entrypoint_object(fminimum_num) add_math_entrypoint_object(fminimum_numf) add_math_entrypoint_object(fminimum_numl) +add_math_entrypoint_object(fminimum_numf16) add_math_entrypoint_object(fminimum_numf128) add_math_entrypoint_object(fminimum_mag) add_math_entrypoint_object(fminimum_magf) add_math_entrypoint_object(fminimum_magl) +add_math_entrypoint_object(fminimum_magf16) add_math_entrypoint_object(fminimum_magf128) add_math_entrypoint_object(fminimum_mag_num) add_math_entrypoint_object(fminimum_mag_numf) add_math_entrypoint_object(fminimum_mag_numl) +add_math_entrypoint_object(fminimum_mag_numf16) add_math_entrypoint_object(fminimum_mag_numf128) add_math_entrypoint_object(fmod) @@ -270,20 +280,24 @@ add_math_entrypoint_object(nearbyintf128) add_math_entrypoint_object(nextafter) add_math_entrypoint_object(nextafterf) add_math_entrypoint_object(nextafterl) +add_math_entrypoint_object(nextafterf16) add_math_entrypoint_object(nextafterf128) add_math_entrypoint_object(nexttoward) add_math_entrypoint_object(nexttowardf) add_math_entrypoint_object(nexttowardl) +add_math_entrypoint_object(nexttowardf16) add_math_entrypoint_object(nextdown) add_math_entrypoint_object(nextdownf) add_math_entrypoint_object(nextdownl) +add_math_entrypoint_object(nextdownf16) add_math_entrypoint_object(nextdownf128) add_math_entrypoint_object(nextup) add_math_entrypoint_object(nextupf) add_math_entrypoint_object(nextupl) +add_math_entrypoint_object(nextupf16) add_math_entrypoint_object(nextupf128) add_math_entrypoint_object(pow) diff --git 
a/libc/src/math/fmaxf16.h b/libc/src/math/fmaxf16.h new file mode 100644 index 0000000..bf608f8 --- /dev/null +++ b/libc/src/math/fmaxf16.h @@ -0,0 +1,20 @@ +//===-- Implementation header for fmaxf16 -----------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIBC_SRC_MATH_FMAXF16_H +#define LLVM_LIBC_SRC_MATH_FMAXF16_H + +#include "src/__support/macros/properties/types.h" + +namespace LIBC_NAMESPACE { + +float16 fmaxf16(float16 x, float16 y); + +} // namespace LIBC_NAMESPACE + +#endif // LLVM_LIBC_SRC_MATH_FMAXF16_H diff --git a/libc/src/math/fmaximum_mag_numf16.h b/libc/src/math/fmaximum_mag_numf16.h new file mode 100644 index 0000000..4c963d4 --- /dev/null +++ b/libc/src/math/fmaximum_mag_numf16.h @@ -0,0 +1,20 @@ +//===-- Implementation header for fmaximum_mag_numf16 -----------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIBC_SRC_MATH_FMAXIMUM_MAG_NUMF16_H +#define LLVM_LIBC_SRC_MATH_FMAXIMUM_MAG_NUMF16_H + +#include "src/__support/macros/properties/types.h" + +namespace LIBC_NAMESPACE { + +float16 fmaximum_mag_numf16(float16 x, float16 y); + +} // namespace LIBC_NAMESPACE + +#endif // LLVM_LIBC_SRC_MATH_FMAXIMUM_MAG_NUMF16_H diff --git a/libc/src/math/fmaximum_magf16.h b/libc/src/math/fmaximum_magf16.h new file mode 100644 index 0000000..e5f57d3 --- /dev/null +++ b/libc/src/math/fmaximum_magf16.h @@ -0,0 +1,20 @@ +//===-- Implementation header for fmaximum_magf16 ---------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIBC_SRC_MATH_FMAXIMUM_MAGF16_H +#define LLVM_LIBC_SRC_MATH_FMAXIMUM_MAGF16_H + +#include "src/__support/macros/properties/types.h" + +namespace LIBC_NAMESPACE { + +float16 fmaximum_magf16(float16 x, float16 y); + +} // namespace LIBC_NAMESPACE + +#endif // LLVM_LIBC_SRC_MATH_FMAXIMUM_MAGF16_H diff --git a/libc/src/math/fmaximum_numf16.h b/libc/src/math/fmaximum_numf16.h new file mode 100644 index 0000000..b450a45 --- /dev/null +++ b/libc/src/math/fmaximum_numf16.h @@ -0,0 +1,20 @@ +//===-- Implementation header for fmaximum_numf16 ---------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIBC_SRC_MATH_FMAXIMUM_NUMF16_H +#define LLVM_LIBC_SRC_MATH_FMAXIMUM_NUMF16_H + +#include "src/__support/macros/properties/types.h" + +namespace LIBC_NAMESPACE { + +float16 fmaximum_numf16(float16 x, float16 y); + +} // namespace LIBC_NAMESPACE + +#endif // LLVM_LIBC_SRC_MATH_FMAXIMUM_NUMF16_H diff --git a/libc/src/math/fmaximumf16.h b/libc/src/math/fmaximumf16.h new file mode 100644 index 0000000..806339f --- /dev/null +++ b/libc/src/math/fmaximumf16.h @@ -0,0 +1,20 @@ +//===-- Implementation header for fmaximumf16 -------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIBC_SRC_MATH_FMAXIMUMF16_H +#define LLVM_LIBC_SRC_MATH_FMAXIMUMF16_H + +#include "src/__support/macros/properties/types.h" + +namespace LIBC_NAMESPACE { + +float16 fmaximumf16(float16 x, float16 y); + +} // namespace LIBC_NAMESPACE + +#endif // LLVM_LIBC_SRC_MATH_FMAXIMUMF16_H diff --git a/libc/src/math/fminf16.h b/libc/src/math/fminf16.h new file mode 100644 index 0000000..22d4e6c --- /dev/null +++ b/libc/src/math/fminf16.h @@ -0,0 +1,20 @@ +//===-- Implementation header for fminf16 -----------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIBC_SRC_MATH_FMINF16_H +#define LLVM_LIBC_SRC_MATH_FMINF16_H + +#include "src/__support/macros/properties/types.h" + +namespace LIBC_NAMESPACE { + +float16 fminf16(float16 x, float16 y); + +} // namespace LIBC_NAMESPACE + +#endif // LLVM_LIBC_SRC_MATH_FMINF16_H diff --git a/libc/src/math/fminimum_mag_numf16.h b/libc/src/math/fminimum_mag_numf16.h new file mode 100644 index 0000000..0fd314b --- /dev/null +++ b/libc/src/math/fminimum_mag_numf16.h @@ -0,0 +1,20 @@ +//===-- Implementation header for fminimum_mag_numf16 -----------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIBC_SRC_MATH_FMINIMUM_MAG_NUMF16_H +#define LLVM_LIBC_SRC_MATH_FMINIMUM_MAG_NUMF16_H + +#include "src/__support/macros/properties/types.h" + +namespace LIBC_NAMESPACE { + +float16 fminimum_mag_numf16(float16 x, float16 y); + +} // namespace LIBC_NAMESPACE + +#endif // LLVM_LIBC_SRC_MATH_FMINIMUM_MAG_NUMF16_H diff --git a/libc/src/math/fminimum_magf16.h b/libc/src/math/fminimum_magf16.h new file mode 100644 index 0000000..27673555 --- /dev/null +++ b/libc/src/math/fminimum_magf16.h @@ -0,0 +1,20 @@ +//===-- Implementation header for fminimum_magf16 ---------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIBC_SRC_MATH_FMINIMUM_MAGF16_H +#define LLVM_LIBC_SRC_MATH_FMINIMUM_MAGF16_H + +#include "src/__support/macros/properties/types.h" + +namespace LIBC_NAMESPACE { + +float16 fminimum_magf16(float16 x, float16 y); + +} // namespace LIBC_NAMESPACE + +#endif // LLVM_LIBC_SRC_MATH_FMINIMUM_MAGF16_H diff --git a/libc/src/math/fminimum_numf16.h b/libc/src/math/fminimum_numf16.h new file mode 100644 index 0000000..598ff9d --- /dev/null +++ b/libc/src/math/fminimum_numf16.h @@ -0,0 +1,20 @@ +//===-- Implementation header for fminimum_numf16 ---------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIBC_SRC_MATH_FMINIMUM_NUMF16_H +#define LLVM_LIBC_SRC_MATH_FMINIMUM_NUMF16_H + +#include "src/__support/macros/properties/types.h" + +namespace LIBC_NAMESPACE { + +float16 fminimum_numf16(float16 x, float16 y); + +} // namespace LIBC_NAMESPACE + +#endif // LLVM_LIBC_SRC_MATH_FMINIMUM_NUMF16_H diff --git a/libc/src/math/fminimumf16.h b/libc/src/math/fminimumf16.h new file mode 100644 index 0000000..86dd240 --- /dev/null +++ b/libc/src/math/fminimumf16.h @@ -0,0 +1,20 @@ +//===-- Implementation header for fminimumf16 -------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIBC_SRC_MATH_FMINIMUMF16_H +#define LLVM_LIBC_SRC_MATH_FMINIMUMF16_H + +#include "src/__support/macros/properties/types.h" + +namespace LIBC_NAMESPACE { + +float16 fminimumf16(float16 x, float16 y); + +} // namespace LIBC_NAMESPACE + +#endif // LLVM_LIBC_SRC_MATH_FMINIMUMF16_H diff --git a/libc/src/math/generic/CMakeLists.txt b/libc/src/math/generic/CMakeLists.txt index 95904be..b1d786f 100644 --- a/libc/src/math/generic/CMakeLists.txt +++ b/libc/src/math/generic/CMakeLists.txt @@ -1783,6 +1783,20 @@ add_entrypoint_object( ) add_entrypoint_object( + fminf16 + SRCS + fminf16.cpp + HDRS + ../fminf16.h + DEPENDS + libc.src.__support.macros.properties.types + libc.src.__support.FPUtil.basic_operations + COMPILE_OPTIONS + -O3 +) + + +add_entrypoint_object( fmax SRCS fmax.cpp @@ -1832,6 +1846,19 @@ add_entrypoint_object( ) add_entrypoint_object( + fmaxf16 + SRCS + fmaxf16.cpp + HDRS + ../fmaxf16.h + DEPENDS + libc.src.__support.macros.properties.types + libc.src.__support.FPUtil.basic_operations + COMPILE_OPTIONS + -O3 +) + +add_entrypoint_object( fmaximum SRCS fmaximum.cpp @@ -1868,6 +1895,19 @@ add_entrypoint_object( ) add_entrypoint_object( + fmaximumf16 + SRCS + fmaximumf16.cpp + HDRS + ../fmaximumf16.h + DEPENDS + libc.src.__support.macros.properties.types + libc.src.__support.FPUtil.basic_operations + COMPILE_OPTIONS + -O3 +) + +add_entrypoint_object( fmaximumf128 SRCS fmaximumf128.cpp @@ -1917,6 +1957,19 @@ add_entrypoint_object( ) add_entrypoint_object( + fmaximum_numf16 + SRCS + fmaximum_numf16.cpp + HDRS + ../fmaximum_numf16.h + DEPENDS + libc.src.__support.macros.properties.types + libc.src.__support.FPUtil.basic_operations + COMPILE_OPTIONS + -O3 +) + +add_entrypoint_object( fmaximum_numf128 SRCS fmaximum_numf128.cpp @@ -1966,6 +2019,19 @@ add_entrypoint_object( ) add_entrypoint_object( + 
fmaximum_magf16 + SRCS + fmaximum_magf16.cpp + HDRS + ../fmaximum_magf16.h + DEPENDS + libc.src.__support.macros.properties.types + libc.src.__support.FPUtil.basic_operations + COMPILE_OPTIONS + -O3 +) + +add_entrypoint_object( fmaximum_magf128 SRCS fmaximum_magf128.cpp @@ -1978,7 +2044,6 @@ add_entrypoint_object( -O3 ) - add_entrypoint_object( fmaximum_mag_num SRCS @@ -2016,6 +2081,19 @@ add_entrypoint_object( ) add_entrypoint_object( + fmaximum_mag_numf16 + SRCS + fmaximum_mag_numf16.cpp + HDRS + ../fmaximum_mag_numf16.h + DEPENDS + libc.src.__support.macros.properties.types + libc.src.__support.FPUtil.basic_operations + COMPILE_OPTIONS + -O3 +) + +add_entrypoint_object( fmaximum_mag_numf128 SRCS fmaximum_mag_numf128.cpp @@ -2065,6 +2143,19 @@ add_entrypoint_object( ) add_entrypoint_object( + fminimumf16 + SRCS + fminimumf16.cpp + HDRS + ../fminimumf16.h + DEPENDS + libc.src.__support.macros.properties.types + libc.src.__support.FPUtil.basic_operations + COMPILE_OPTIONS + -O3 +) + +add_entrypoint_object( fminimumf128 SRCS fminimumf128.cpp @@ -2114,6 +2205,19 @@ add_entrypoint_object( ) add_entrypoint_object( + fminimum_numf16 + SRCS + fminimum_numf16.cpp + HDRS + ../fminimum_numf16.h + DEPENDS + libc.src.__support.macros.properties.types + libc.src.__support.FPUtil.basic_operations + COMPILE_OPTIONS + -O3 +) + +add_entrypoint_object( fminimum_numf128 SRCS fminimum_numf128.cpp @@ -2163,6 +2267,19 @@ add_entrypoint_object( ) add_entrypoint_object( + fminimum_magf16 + SRCS + fminimum_magf16.cpp + HDRS + ../fminimum_magf16.h + DEPENDS + libc.src.__support.macros.properties.types + libc.src.__support.FPUtil.basic_operations + COMPILE_OPTIONS + -O3 +) + +add_entrypoint_object( fminimum_magf128 SRCS fminimum_magf128.cpp @@ -2175,7 +2292,6 @@ add_entrypoint_object( -O3 ) - add_entrypoint_object( fminimum_mag_num SRCS @@ -2213,6 +2329,19 @@ add_entrypoint_object( ) add_entrypoint_object( + fminimum_mag_numf16 + SRCS + fminimum_mag_numf16.cpp + HDRS + 
../fminimum_mag_numf16.h + DEPENDS + libc.src.__support.macros.properties.types + libc.src.__support.FPUtil.basic_operations + COMPILE_OPTIONS + -O3 +) + +add_entrypoint_object( fminimum_mag_numf128 SRCS fminimum_mag_numf128.cpp @@ -2524,6 +2653,19 @@ add_entrypoint_object( ) add_entrypoint_object( + nextafterf16 + SRCS + nextafterf16.cpp + HDRS + ../nextafterf16.h + DEPENDS + libc.src.__support.macros.properties.types + libc.src.__support.FPUtil.manipulation_functions + COMPILE_OPTIONS + -O3 +) + +add_entrypoint_object( nextafterf128 SRCS nextafterf128.cpp @@ -2573,6 +2715,19 @@ add_entrypoint_object( ) add_entrypoint_object( + nexttowardf16 + SRCS + nexttowardf16.cpp + HDRS + ../nexttowardf16.h + DEPENDS + libc.src.__support.macros.properties.types + libc.src.__support.FPUtil.manipulation_functions + COMPILE_OPTIONS + -O3 +) + +add_entrypoint_object( nextdown SRCS nextdown.cpp @@ -2609,6 +2764,19 @@ add_entrypoint_object( ) add_entrypoint_object( + nextdownf16 + SRCS + nextdownf16.cpp + HDRS + ../nextdownf16.h + DEPENDS + libc.src.__support.macros.properties.types + libc.src.__support.FPUtil.manipulation_functions + COMPILE_OPTIONS + -O3 +) + +add_entrypoint_object( nextdownf128 SRCS nextdownf128.cpp @@ -2658,6 +2826,19 @@ add_entrypoint_object( ) add_entrypoint_object( + nextupf16 + SRCS + nextupf16.cpp + HDRS + ../nextupf16.h + DEPENDS + libc.src.__support.macros.properties.types + libc.src.__support.FPUtil.manipulation_functions + COMPILE_OPTIONS + -O3 +) + +add_entrypoint_object( nextupf128 SRCS nextupf128.cpp diff --git a/libc/src/math/generic/fmaxf16.cpp b/libc/src/math/generic/fmaxf16.cpp new file mode 100644 index 0000000..c317aef --- /dev/null +++ b/libc/src/math/generic/fmaxf16.cpp @@ -0,0 +1,19 @@ +//===-- Implementation of fmaxf16 function --------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "src/math/fmaxf16.h" +#include "src/__support/FPUtil/BasicOperations.h" +#include "src/__support/common.h" + +namespace LIBC_NAMESPACE { + +LLVM_LIBC_FUNCTION(float16, fmaxf16, (float16 x, float16 y)) { + return fputil::fmax(x, y); +} + +} // namespace LIBC_NAMESPACE diff --git a/libc/src/math/generic/fmaximum_mag_numf16.cpp b/libc/src/math/generic/fmaximum_mag_numf16.cpp new file mode 100644 index 0000000..5055802 --- /dev/null +++ b/libc/src/math/generic/fmaximum_mag_numf16.cpp @@ -0,0 +1,19 @@ +//===-- Implementation of fmaximum_mag_numf16 function --------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "src/math/fmaximum_mag_numf16.h" +#include "src/__support/FPUtil/BasicOperations.h" +#include "src/__support/common.h" + +namespace LIBC_NAMESPACE { + +LLVM_LIBC_FUNCTION(float16, fmaximum_mag_numf16, (float16 x, float16 y)) { + return fputil::fmaximum_mag_num(x, y); +} + +} // namespace LIBC_NAMESPACE diff --git a/libc/src/math/generic/fmaximum_magf16.cpp b/libc/src/math/generic/fmaximum_magf16.cpp new file mode 100644 index 0000000..fbd5eac --- /dev/null +++ b/libc/src/math/generic/fmaximum_magf16.cpp @@ -0,0 +1,19 @@ +//===-- Implementation of fmaximum_magf16 function ------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "src/math/fmaximum_magf16.h" +#include "src/__support/FPUtil/BasicOperations.h" +#include "src/__support/common.h" + +namespace LIBC_NAMESPACE { + +LLVM_LIBC_FUNCTION(float16, fmaximum_magf16, (float16 x, float16 y)) { + return fputil::fmaximum_mag(x, y); +} + +} // namespace LIBC_NAMESPACE diff --git a/libc/src/math/generic/fmaximum_numf16.cpp b/libc/src/math/generic/fmaximum_numf16.cpp new file mode 100644 index 0000000..187cfbe --- /dev/null +++ b/libc/src/math/generic/fmaximum_numf16.cpp @@ -0,0 +1,19 @@ +//===-- Implementation of fmaximum_numf16 function ------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "src/math/fmaximum_numf16.h" +#include "src/__support/FPUtil/BasicOperations.h" +#include "src/__support/common.h" + +namespace LIBC_NAMESPACE { + +LLVM_LIBC_FUNCTION(float16, fmaximum_numf16, (float16 x, float16 y)) { + return fputil::fmaximum_num(x, y); +} + +} // namespace LIBC_NAMESPACE diff --git a/libc/src/math/generic/fmaximumf16.cpp b/libc/src/math/generic/fmaximumf16.cpp new file mode 100644 index 0000000..9e194d2ece --- /dev/null +++ b/libc/src/math/generic/fmaximumf16.cpp @@ -0,0 +1,19 @@ +//===-- Implementation of fmaximumf16 function ----------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "src/math/fmaximumf16.h" +#include "src/__support/FPUtil/BasicOperations.h" +#include "src/__support/common.h" + +namespace LIBC_NAMESPACE { + +LLVM_LIBC_FUNCTION(float16, fmaximumf16, (float16 x, float16 y)) { + return fputil::fmaximum(x, y); +} + +} // namespace LIBC_NAMESPACE diff --git a/libc/src/math/generic/fminf16.cpp b/libc/src/math/generic/fminf16.cpp new file mode 100644 index 0000000..12547c3 --- /dev/null +++ b/libc/src/math/generic/fminf16.cpp @@ -0,0 +1,19 @@ +//===-- Implementation of fminf16 function --------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "src/math/fminf16.h" +#include "src/__support/FPUtil/BasicOperations.h" +#include "src/__support/common.h" + +namespace LIBC_NAMESPACE { + +LLVM_LIBC_FUNCTION(float16, fminf16, (float16 x, float16 y)) { + return fputil::fmin(x, y); +} + +} // namespace LIBC_NAMESPACE diff --git a/libc/src/math/generic/fminimum_mag_numf16.cpp b/libc/src/math/generic/fminimum_mag_numf16.cpp new file mode 100644 index 0000000..1a893c6 --- /dev/null +++ b/libc/src/math/generic/fminimum_mag_numf16.cpp @@ -0,0 +1,19 @@ +//===-- Implementation of fminimum_mag_numf16 function --------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "src/math/fminimum_mag_numf16.h" +#include "src/__support/FPUtil/BasicOperations.h" +#include "src/__support/common.h" + +namespace LIBC_NAMESPACE { + +LLVM_LIBC_FUNCTION(float16, fminimum_mag_numf16, (float16 x, float16 y)) { + return fputil::fminimum_mag_num(x, y); +} + +} // namespace LIBC_NAMESPACE diff --git a/libc/src/math/generic/fminimum_magf16.cpp b/libc/src/math/generic/fminimum_magf16.cpp new file mode 100644 index 0000000..45183a9 --- /dev/null +++ b/libc/src/math/generic/fminimum_magf16.cpp @@ -0,0 +1,19 @@ +//===-- Implementation of fminimum_magf16 function ------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "src/math/fminimum_magf16.h" +#include "src/__support/FPUtil/BasicOperations.h" +#include "src/__support/common.h" + +namespace LIBC_NAMESPACE { + +LLVM_LIBC_FUNCTION(float16, fminimum_magf16, (float16 x, float16 y)) { + return fputil::fminimum_mag(x, y); +} + +} // namespace LIBC_NAMESPACE diff --git a/libc/src/math/generic/fminimum_numf16.cpp b/libc/src/math/generic/fminimum_numf16.cpp new file mode 100644 index 0000000..825ad3e --- /dev/null +++ b/libc/src/math/generic/fminimum_numf16.cpp @@ -0,0 +1,19 @@ +//===-- Implementation of fminimum_numf16 function ------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "src/math/fminimum_numf16.h" +#include "src/__support/FPUtil/BasicOperations.h" +#include "src/__support/common.h" + +namespace LIBC_NAMESPACE { + +LLVM_LIBC_FUNCTION(float16, fminimum_numf16, (float16 x, float16 y)) { + return fputil::fminimum_num(x, y); +} + +} // namespace LIBC_NAMESPACE diff --git a/libc/src/math/generic/fminimumf16.cpp b/libc/src/math/generic/fminimumf16.cpp new file mode 100644 index 0000000..16f738b --- /dev/null +++ b/libc/src/math/generic/fminimumf16.cpp @@ -0,0 +1,19 @@ +//===-- Implementation of fminimumf16 function ----------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "src/math/fminimumf16.h" +#include "src/__support/FPUtil/BasicOperations.h" +#include "src/__support/common.h" + +namespace LIBC_NAMESPACE { + +LLVM_LIBC_FUNCTION(float16, fminimumf16, (float16 x, float16 y)) { + return fputil::fminimum(x, y); +} + +} // namespace LIBC_NAMESPACE diff --git a/libc/src/math/generic/nextafterf16.cpp b/libc/src/math/generic/nextafterf16.cpp new file mode 100644 index 0000000..144b3fc --- /dev/null +++ b/libc/src/math/generic/nextafterf16.cpp @@ -0,0 +1,19 @@ +//===-- Implementation of nextafterf16 function ---------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "src/math/nextafterf16.h" +#include "src/__support/FPUtil/ManipulationFunctions.h" +#include "src/__support/common.h" + +namespace LIBC_NAMESPACE { + +LLVM_LIBC_FUNCTION(float16, nextafterf16, (float16 x, float16 y)) { + return fputil::nextafter(x, y); +} + +} // namespace LIBC_NAMESPACE diff --git a/libc/src/math/generic/nextdownf16.cpp b/libc/src/math/generic/nextdownf16.cpp new file mode 100644 index 0000000..9fdaa9d --- /dev/null +++ b/libc/src/math/generic/nextdownf16.cpp @@ -0,0 +1,19 @@ +//===-- Implementation of nextdownf16 function ----------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "src/math/nextdownf16.h" +#include "src/__support/FPUtil/ManipulationFunctions.h" +#include "src/__support/common.h" + +namespace LIBC_NAMESPACE { + +LLVM_LIBC_FUNCTION(float16, nextdownf16, (float16 x)) { + return fputil::nextupdown</*IsDown=*/true>(x); +} + +} // namespace LIBC_NAMESPACE diff --git a/libc/src/math/generic/nexttowardf16.cpp b/libc/src/math/generic/nexttowardf16.cpp new file mode 100644 index 0000000..d1d78e8 --- /dev/null +++ b/libc/src/math/generic/nexttowardf16.cpp @@ -0,0 +1,21 @@ +//===-- Implementation of nexttowardf16 function --------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "src/math/nexttowardf16.h" +#include "src/__support/FPUtil/ManipulationFunctions.h" +#include "src/__support/common.h" + +namespace LIBC_NAMESPACE { + +LLVM_LIBC_FUNCTION(float16, nexttowardf16, (float16 x, long double y)) { + // We can reuse the nextafter implementation because the internal nextafter is + // templated on the types of the arguments. + return fputil::nextafter(x, y); +} + +} // namespace LIBC_NAMESPACE diff --git a/libc/src/math/generic/nextupf16.cpp b/libc/src/math/generic/nextupf16.cpp new file mode 100644 index 0000000..5d3d52c --- /dev/null +++ b/libc/src/math/generic/nextupf16.cpp @@ -0,0 +1,19 @@ +//===-- Implementation of nextupf16 function ------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "src/math/nextupf16.h" +#include "src/__support/FPUtil/ManipulationFunctions.h" +#include "src/__support/common.h" + +namespace LIBC_NAMESPACE { + +LLVM_LIBC_FUNCTION(float16, nextupf16, (float16 x)) { + return fputil::nextupdown</*IsDown=*/false>(x); +} + +} // namespace LIBC_NAMESPACE diff --git a/libc/src/math/nextafterf16.h b/libc/src/math/nextafterf16.h new file mode 100644 index 0000000..293569e --- /dev/null +++ b/libc/src/math/nextafterf16.h @@ -0,0 +1,20 @@ +//===-- Implementation header for nextafterf16 ------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIBC_SRC_MATH_NEXTAFTERF16_H +#define LLVM_LIBC_SRC_MATH_NEXTAFTERF16_H + +#include "src/__support/macros/properties/types.h" + +namespace LIBC_NAMESPACE { + +float16 nextafterf16(float16 x, float16 y); + +} // namespace LIBC_NAMESPACE + +#endif // LLVM_LIBC_SRC_MATH_NEXTAFTERF16_H diff --git a/libc/src/math/nextdownf16.h b/libc/src/math/nextdownf16.h new file mode 100644 index 0000000..1913757 --- /dev/null +++ b/libc/src/math/nextdownf16.h @@ -0,0 +1,20 @@ +//===-- Implementation header for nextdownf16 -------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIBC_SRC_MATH_NEXTDOWNF16_H +#define LLVM_LIBC_SRC_MATH_NEXTDOWNF16_H + +#include "src/__support/macros/properties/types.h" + +namespace LIBC_NAMESPACE { + +float16 nextdownf16(float16 x); + +} // namespace LIBC_NAMESPACE + +#endif // LLVM_LIBC_SRC_MATH_NEXTDOWNF16_H diff --git a/libc/src/math/nexttowardf16.h b/libc/src/math/nexttowardf16.h new file mode 100644 index 0000000..604eb32 --- /dev/null +++ b/libc/src/math/nexttowardf16.h @@ -0,0 +1,20 @@ +//===-- Implementation header for nexttowardf16 -----------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIBC_SRC_MATH_NEXTTOWARDF16_H +#define LLVM_LIBC_SRC_MATH_NEXTTOWARDF16_H + +#include "src/__support/macros/properties/types.h" + +namespace LIBC_NAMESPACE { + +float16 nexttowardf16(float16 x, long double y); + +} // namespace LIBC_NAMESPACE + +#endif // LLVM_LIBC_SRC_MATH_NEXTTOWARDF16_H diff --git a/libc/src/math/nextupf16.h b/libc/src/math/nextupf16.h new file mode 100644 index 0000000..b2973e4 --- /dev/null +++ b/libc/src/math/nextupf16.h @@ -0,0 +1,20 @@ +//===-- Implementation header for nextupf16 ---------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIBC_SRC_MATH_NEXTUPF16_H +#define LLVM_LIBC_SRC_MATH_NEXTUPF16_H + +#include "src/__support/macros/properties/types.h" + +namespace LIBC_NAMESPACE { + +float16 nextupf16(float16 x); + +} // namespace LIBC_NAMESPACE + +#endif // LLVM_LIBC_SRC_MATH_NEXTUPF16_H diff --git a/libc/src/stdlib/CMakeLists.txt b/libc/src/stdlib/CMakeLists.txt index e0bff51..219c85d 100644 --- a/libc/src/stdlib/CMakeLists.txt +++ b/libc/src/stdlib/CMakeLists.txt @@ -50,6 +50,7 @@ add_entrypoint_object( quick_exit.h DEPENDS libc.src.__support.OSUtil.osutil + .exit_handler ) add_entrypoint_object( @@ -415,14 +416,14 @@ add_entrypoint_object( libc.src.__support.OSUtil.osutil ) -add_entrypoint_object( - atexit +add_object_library( + exit_handler SRCS - atexit.cpp + exit_handler.cpp HDRS - atexit.h + exit_handler.h CXX_STANDARD - 20 # For constinit of the atexit callback list. 
+ 20 # For constinit DEPENDS libc.src.__support.CPP.mutex libc.src.__support.CPP.new @@ -433,6 +434,26 @@ add_entrypoint_object( ) add_entrypoint_object( + atexit + SRCS + atexit.cpp + HDRS + atexit.h + DEPENDS + .exit_handler +) + +add_entrypoint_object( + at_quick_exit + SRCS + at_quick_exit.cpp + HDRS + at_quick_exit.h + DEPENDS + .exit_handler +) + +add_entrypoint_object( exit SRCS exit.cpp @@ -442,6 +463,7 @@ add_entrypoint_object( ._Exit .atexit libc.src.__support.OSUtil.osutil + .exit_handler ) add_entrypoint_object( diff --git a/libc/src/stdlib/at_quick_exit.cpp b/libc/src/stdlib/at_quick_exit.cpp new file mode 100644 index 0000000..752d67e --- /dev/null +++ b/libc/src/stdlib/at_quick_exit.cpp @@ -0,0 +1,22 @@ +//===-- Implementation of at_quick_exit -----------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "src/stdlib/at_quick_exit.h" +#include "hdr/types/atexithandler_t.h" +#include "src/__support/common.h" +#include "src/stdlib/exit_handler.h" + +namespace LIBC_NAMESPACE { + +LLVM_LIBC_FUNCTION(int, at_quick_exit, (__atexithandler_t callback)) { + return add_atexit_unit( + at_quick_exit_callbacks, + {&stdc_at_exit_func, reinterpret_cast<void *>(callback)}); +} + +} // namespace LIBC_NAMESPACE diff --git a/libc/src/stdlib/at_quick_exit.h b/libc/src/stdlib/at_quick_exit.h new file mode 100644 index 0000000..c36c797 --- /dev/null +++ b/libc/src/stdlib/at_quick_exit.h @@ -0,0 +1,20 @@ +//===-- Implementation header for at_quick_exit -----------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIBC_SRC_STDLIB_AT_QUICK_EXIT_H +#define LLVM_LIBC_SRC_STDLIB_AT_QUICK_EXIT_H + +#include "hdr/types/atexithandler_t.h" + +namespace LIBC_NAMESPACE { + +int at_quick_exit(__atexithandler_t); + +} // namespace LIBC_NAMESPACE + +#endif // LLVM_LIBC_SRC_STDLIB_AT_QUICK_EXIT_H diff --git a/libc/src/stdlib/atexit.cpp b/libc/src/stdlib/atexit.cpp index 9e37c4c..ca3cbfe 100644 --- a/libc/src/stdlib/atexit.cpp +++ b/libc/src/stdlib/atexit.cpp @@ -7,95 +7,28 @@ //===----------------------------------------------------------------------===// #include "src/stdlib/atexit.h" -#include "src/__support/CPP/mutex.h" // lock_guard -#include "src/__support/blockstore.h" +#include "hdr/types/atexithandler_t.h" #include "src/__support/common.h" -#include "src/__support/fixedvector.h" -#include "src/__support/threads/mutex.h" +#include "src/stdlib/exit_handler.h" namespace LIBC_NAMESPACE { -namespace { - -Mutex handler_list_mtx(/*timed=*/false, /*recursive=*/false, /*robust=*/false, - /*pshared=*/false); - -using AtExitCallback = void(void *); -using StdCAtExitCallback = void(void); - -struct AtExitUnit { - AtExitCallback *callback = nullptr; - void *payload = nullptr; - constexpr AtExitUnit() = default; - constexpr AtExitUnit(AtExitCallback *c, void *p) : callback(c), payload(p) {} -}; - -#if defined(LIBC_TARGET_ARCH_IS_GPU) -// The GPU build cannot handle the potentially recursive definitions required by -// the BlockStore class. Additionally, the liklihood that someone exceeds this -// while executing on the GPU is extremely small. -// FIXME: It is not generally safe to use 'atexit' on the GPU because the -// mutexes simply passthrough. We will need a lock free stack. 
-using ExitCallbackList = FixedVector<AtExitUnit, 64>; -#elif defined(LIBC_COPT_PUBLIC_PACKAGING) -using ExitCallbackList = ReverseOrderBlockStore<AtExitUnit, 32>; -#else -// BlockStore uses dynamic memory allocation. To avoid dynamic memory -// allocation in tests, we use a fixed size callback list when built for -// tests. -// If we use BlockStore, then we will have to pull in malloc etc into -// the tests. While this is not bad, the problem we have currently is -// that LLVM libc' allocator is SCUDO. So, we will end up pulling SCUDO's -// deps also (some of which are not yet available in LLVM libc) into the -// integration tests. -using ExitCallbackList = FixedVector<AtExitUnit, CALLBACK_LIST_SIZE_FOR_TESTS>; -#endif // LIBC_COPT_PUBLIC_PACKAGING - -constinit ExitCallbackList exit_callbacks; - -void stdc_at_exit_func(void *payload) { - reinterpret_cast<StdCAtExitCallback *>(payload)(); -} - -void call_exit_callbacks() { - handler_list_mtx.lock(); - while (!exit_callbacks.empty()) { - AtExitUnit &unit = exit_callbacks.back(); - exit_callbacks.pop_back(); - handler_list_mtx.unlock(); - unit.callback(unit.payload); - handler_list_mtx.lock(); - } - ExitCallbackList::destroy(&exit_callbacks); -} - -int add_atexit_unit(const AtExitUnit &unit) { - cpp::lock_guard lock(handler_list_mtx); - if (exit_callbacks.push_back(unit)) - return 0; - return -1; -} - -} // namespace - extern "C" { -// TODO: Handle the last dso handle argument. int __cxa_atexit(AtExitCallback *callback, void *payload, void *) { - return add_atexit_unit({callback, payload}); + return add_atexit_unit(atexit_callbacks, {callback, payload}); } -// TODO: Handle the dso handle argument. call_exit_callbacks should only invoke -// the callbacks from this DSO. Requires adding support for __dso_handle. 
void __cxa_finalize(void *dso) { if (!dso) - call_exit_callbacks(); + call_exit_callbacks(atexit_callbacks); } } // extern "C" -LLVM_LIBC_FUNCTION(int, atexit, (StdCAtExitCallback * callback)) { +LLVM_LIBC_FUNCTION(int, atexit, (__atexithandler_t callback)) { return add_atexit_unit( + atexit_callbacks, {&stdc_at_exit_func, reinterpret_cast<void *>(callback)}); } diff --git a/libc/src/stdlib/atexit.h b/libc/src/stdlib/atexit.h index 7cf9d7c..7faaf65 100644 --- a/libc/src/stdlib/atexit.h +++ b/libc/src/stdlib/atexit.h @@ -9,13 +9,10 @@ #ifndef LLVM_LIBC_SRC_STDLIB_ATEXIT_H #define LLVM_LIBC_SRC_STDLIB_ATEXIT_H -#include <stddef.h> // For size_t - +#include "hdr/types/atexithandler_t.h" namespace LIBC_NAMESPACE { -constexpr size_t CALLBACK_LIST_SIZE_FOR_TESTS = 1024; - -int atexit(void (*function)()); +int atexit(__atexithandler_t); } // namespace LIBC_NAMESPACE diff --git a/libc/src/stdlib/exit_handler.cpp b/libc/src/stdlib/exit_handler.cpp new file mode 100644 index 0000000..ed41247 --- /dev/null +++ b/libc/src/stdlib/exit_handler.cpp @@ -0,0 +1,42 @@ +//===--- Implementation of exit_handler------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "src/stdlib/exit_handler.h" +#include "src/__support/CPP/mutex.h" // lock_guard + +namespace LIBC_NAMESPACE { + +constinit ExitCallbackList at_quick_exit_callbacks; +constinit ExitCallbackList atexit_callbacks; + +Mutex handler_list_mtx(false, false, false, false); + +void stdc_at_exit_func(void *payload) { + reinterpret_cast<StdCAtExitCallback *>(payload)(); +} + +void call_exit_callbacks(ExitCallbackList &callbacks) { + handler_list_mtx.lock(); + while (!callbacks.empty()) { + AtExitUnit &unit = callbacks.back(); + callbacks.pop_back(); + handler_list_mtx.unlock(); + unit.callback(unit.payload); + handler_list_mtx.lock(); + } + ExitCallbackList::destroy(&callbacks); +} + +int add_atexit_unit(ExitCallbackList &callbacks, const AtExitUnit &unit) { + cpp::lock_guard lock(handler_list_mtx); + if (callbacks.push_back(unit)) + return 0; + return -1; +} + +} // namespace LIBC_NAMESPACE diff --git a/libc/src/stdlib/exit_handler.h b/libc/src/stdlib/exit_handler.h new file mode 100644 index 0000000..8494c2f --- /dev/null +++ b/libc/src/stdlib/exit_handler.h @@ -0,0 +1,53 @@ +//===-- Implementation header for exit_handler ------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIBC_SRC_STDLIB_EXIT_HANDLER_H +#define LLVM_LIBC_SRC_STDLIB_EXIT_HANDLER_H + +#include "src/__support/CPP/mutex.h" // lock_guard +#include "src/__support/blockstore.h" +#include "src/__support/common.h" +#include "src/__support/fixedvector.h" +#include "src/__support/threads/mutex.h" + +namespace LIBC_NAMESPACE { + +using AtExitCallback = void(void *); +using StdCAtExitCallback = void(void); +constexpr size_t CALLBACK_LIST_SIZE_FOR_TESTS = 1024; + +struct AtExitUnit { + AtExitCallback *callback = nullptr; + void *payload = nullptr; + LIBC_INLINE constexpr AtExitUnit() = default; + LIBC_INLINE constexpr AtExitUnit(AtExitCallback *c, void *p) + : callback(c), payload(p) {} +}; + +#if defined(LIBC_TARGET_ARCH_IS_GPU) +using ExitCallbackList = FixedVector<AtExitUnit, 64>; +#elif defined(LIBC_COPT_PUBLIC_PACKAGING) +using ExitCallbackList = ReverseOrderBlockStore<AtExitUnit, 32>; +#else +using ExitCallbackList = FixedVector<AtExitUnit, CALLBACK_LIST_SIZE_FOR_TESTS>; +#endif + +extern ExitCallbackList atexit_callbacks; +extern ExitCallbackList at_quick_exit_callbacks; + +extern Mutex handler_list_mtx; + +void stdc_at_exit_func(void *payload); + +void call_exit_callbacks(ExitCallbackList &callbacks); + +int add_atexit_unit(ExitCallbackList &callbacks, const AtExitUnit &unit); + +} // namespace LIBC_NAMESPACE + +#endif // LLVM_LIBC_SRC_STDLIB_EXIT_HANDLER_H diff --git a/libc/src/stdlib/quick_exit.cpp b/libc/src/stdlib/quick_exit.cpp index cf7f07b..38f0a3d 100644 --- a/libc/src/stdlib/quick_exit.cpp +++ b/libc/src/stdlib/quick_exit.cpp @@ -9,13 +9,15 @@ #include "src/stdlib/quick_exit.h" #include "src/__support/OSUtil/exit.h" #include "src/__support/common.h" +#include "src/stdlib/exit_handler.h" // extern "C" void __cxa_finalize(void *); - namespace LIBC_NAMESPACE { +extern ExitCallbackList 
at_quick_exit_callbacks; + [[noreturn]] LLVM_LIBC_FUNCTION(void, quick_exit, (int status)) { - // __cxa_finalize(nullptr); + call_exit_callbacks(at_quick_exit_callbacks); internal::exit(status); } diff --git a/libc/src/sys/epoll/linux/CMakeLists.txt b/libc/src/sys/epoll/linux/CMakeLists.txt index 4e661b2..5ba89bd 100644 --- a/libc/src/sys/epoll/linux/CMakeLists.txt +++ b/libc/src/sys/epoll/linux/CMakeLists.txt @@ -48,6 +48,7 @@ add_entrypoint_object( libc.hdr.types.struct_timespec libc.include.sys_syscall libc.src.__support.OSUtil.osutil + libc.src.__support.macros.sanitizer libc.src.errno.errno ) @@ -65,6 +66,7 @@ add_entrypoint_object( libc.hdr.signal_macros libc.include.sys_syscall libc.src.__support.OSUtil.osutil + libc.src.__support.macros.sanitizer libc.src.errno.errno ) @@ -82,5 +84,6 @@ add_entrypoint_object( libc.hdr.signal_macros libc.include.sys_syscall libc.src.__support.OSUtil.osutil + libc.src.__support.macros.sanitizer libc.src.errno.errno ) diff --git a/libc/src/sys/epoll/linux/epoll_pwait.cpp b/libc/src/sys/epoll/linux/epoll_pwait.cpp index 8f498d1..24b66f0 100644 --- a/libc/src/sys/epoll/linux/epoll_pwait.cpp +++ b/libc/src/sys/epoll/linux/epoll_pwait.cpp @@ -13,6 +13,7 @@ #include "hdr/types/struct_epoll_event.h" #include "src/__support/OSUtil/syscall.h" // For internal syscall function. #include "src/__support/common.h" +#include "src/__support/macros/sanitizer.h" #include "src/errno/libc_errno.h" #include <sys/syscall.h> // For syscall numbers. @@ -33,6 +34,8 @@ LLVM_LIBC_FUNCTION(int, epoll_pwait, return -1; } + MSAN_UNPOISON(events, ret * sizeof(struct epoll_event)); + return ret; } diff --git a/libc/src/sys/epoll/linux/epoll_pwait2.cpp b/libc/src/sys/epoll/linux/epoll_pwait2.cpp index bd33cb6..e13423a 100644 --- a/libc/src/sys/epoll/linux/epoll_pwait2.cpp +++ b/libc/src/sys/epoll/linux/epoll_pwait2.cpp @@ -14,6 +14,7 @@ #include "hdr/types/struct_timespec.h" #include "src/__support/OSUtil/syscall.h" // For internal syscall function. 
#include "src/__support/common.h" +#include "src/__support/macros/sanitizer.h" #include "src/errno/libc_errno.h" #include <sys/syscall.h> // For syscall numbers. @@ -35,6 +36,8 @@ LLVM_LIBC_FUNCTION(int, epoll_pwait2, return -1; } + MSAN_UNPOISON(events, ret * sizeof(struct epoll_event)); + return ret; } diff --git a/libc/src/sys/epoll/linux/epoll_wait.cpp b/libc/src/sys/epoll/linux/epoll_wait.cpp index 95238d8..3ce4a92 100644 --- a/libc/src/sys/epoll/linux/epoll_wait.cpp +++ b/libc/src/sys/epoll/linux/epoll_wait.cpp @@ -13,6 +13,7 @@ #include "hdr/types/struct_epoll_event.h" #include "src/__support/OSUtil/syscall.h" // For internal syscall function. #include "src/__support/common.h" +#include "src/__support/macros/sanitizer.h" #include "src/errno/libc_errno.h" #include <sys/syscall.h> // For syscall numbers. @@ -39,6 +40,8 @@ LLVM_LIBC_FUNCTION(int, epoll_wait, return -1; } + MSAN_UNPOISON(events, ret * sizeof(struct epoll_event)); + return ret; } diff --git a/libc/test/CMakeLists.txt b/libc/test/CMakeLists.txt index 5e26a10..b5c989a 100644 --- a/libc/test/CMakeLists.txt +++ b/libc/test/CMakeLists.txt @@ -8,10 +8,14 @@ add_custom_target(libc-long-running-tests) add_subdirectory(UnitTest) -if(LIBC_TARGET_OS_IS_GPU AND - (NOT TARGET libc.utils.gpu.loader OR LIBC_GPU_TESTS_DISABLED)) - message(WARNING "Cannot build libc GPU tests, missing loader or architecture") - return() +if(LIBC_TARGET_OS_IS_GPU) + if(NOT TARGET libc.utils.gpu.loader) + message(WARNING "Cannot build libc GPU tests, missing loader.") + return() + elseif(LIBC_GPU_TESTS_DISABLED) + message(WARNING "Cannot build libc GPU tests, missing target architecture.") + return() + endif() endif() add_subdirectory(include) diff --git a/libc/test/src/__support/CMakeLists.txt b/libc/test/src/__support/CMakeLists.txt index 663aa2b..d05377e 100644 --- a/libc/test/src/__support/CMakeLists.txt +++ b/libc/test/src/__support/CMakeLists.txt @@ -86,8 +86,8 @@ add_libc_test( libc.src.__support.uint128 ) -# The GPU does 
not support varargs currently. -if(NOT LIBC_TARGET_OS_IS_GPU) +# NVPTX does not support varargs currently. +if(NOT LIBC_TARGET_ARCHITECTURE_IS_NVPTX) add_libc_test( arg_list_test SUITE @@ -132,6 +132,7 @@ add_libc_test( SRCS fixedvector_test.cpp DEPENDS + libc.src.__support.CPP.array libc.src.__support.fixedvector ) diff --git a/libc/test/src/__support/CPP/type_traits_test.cpp b/libc/test/src/__support/CPP/type_traits_test.cpp index a2051f3..1c428e9 100644 --- a/libc/test/src/__support/CPP/type_traits_test.cpp +++ b/libc/test/src/__support/CPP/type_traits_test.cpp @@ -112,6 +112,15 @@ TEST(LlvmLibcTypeTraitsTest, add_rvalue_reference_void) { const volatile void>)); } +TEST(LlvmLibcTypeTraitsTest, aligned_storage) { + struct S { + int a, b; + }; + aligned_storage_t<sizeof(S), alignof(S)> buf; + EXPECT_EQ(alignof(buf), alignof(S)); + EXPECT_EQ(sizeof(buf), sizeof(S)); +} + TEST(LlvmLibcTypeTraitsTest, bool_constant) { EXPECT_TRUE((bool_constant<true>::value)); EXPECT_FALSE((bool_constant<false>::value)); diff --git a/libc/test/src/__support/fixedvector_test.cpp b/libc/test/src/__support/fixedvector_test.cpp index e9ffdd0..212e1ae 100644 --- a/libc/test/src/__support/fixedvector_test.cpp +++ b/libc/test/src/__support/fixedvector_test.cpp @@ -6,6 +6,7 @@ // //===----------------------------------------------------------------------===// +#include "src/__support/CPP/array.h" #include "src/__support/fixedvector.h" #include "test/UnitTest/Test.h" @@ -69,3 +70,29 @@ TEST(LlvmLibcFixedVectorTest, Iteration) { for (int &x : v) ASSERT_GE(x, 0); } + +TEST(LlvmLibcFixedVectorTest, ConstructionFromIterators) { + LIBC_NAMESPACE::cpp::array<int, 4> arr{1, 2, 3, 4}; + LIBC_NAMESPACE::FixedVector<int, 5> vec(arr.begin(), arr.end()); + ASSERT_EQ(vec.size(), arr.size()); + for (size_t i = 0; i < arr.size(); ++i) + ASSERT_EQ(vec[i], arr[i]); +} + +TEST(LlvmLibcFixedVectorTest, ConstructionFromCountAndValue) { + constexpr int kVal = 10; + LIBC_NAMESPACE::FixedVector<int, 5> vec(4, 
kVal); + ASSERT_EQ(vec.size(), size_t(4)); + for (size_t i = 0; i < vec.size(); ++i) + ASSERT_EQ(vec[i], kVal); +} + +TEST(LlvmLibcFixedVectorTest, ForwardIteration) { + LIBC_NAMESPACE::cpp::array<int, 4> arr{1, 2, 3, 4}; + LIBC_NAMESPACE::FixedVector<int, 5> vec(arr.begin(), arr.end()); + ASSERT_EQ(vec.size(), arr.size()); + for (auto it = vec.begin(); it != vec.end(); ++it) { + auto idx = it - vec.begin(); + ASSERT_EQ(*it, arr[idx]); + } +} diff --git a/libc/test/src/math/smoke/CMakeLists.txt b/libc/test/src/math/smoke/CMakeLists.txt index 09e54349..110fa1d 100644 --- a/libc/test/src/math/smoke/CMakeLists.txt +++ b/libc/test/src/math/smoke/CMakeLists.txt @@ -1705,6 +1705,7 @@ add_fp_unittest( FMinTest.h DEPENDS libc.src.math.fminf + libc.src.__support.CPP.algorithm libc.src.__support.FPUtil.fp_bits ) @@ -1718,6 +1719,7 @@ add_fp_unittest( FMinTest.h DEPENDS libc.src.math.fmin + libc.src.__support.CPP.algorithm libc.src.__support.FPUtil.fp_bits ) @@ -1731,6 +1733,7 @@ add_fp_unittest( FMinTest.h DEPENDS libc.src.math.fminl + libc.src.__support.CPP.algorithm libc.src.__support.FPUtil.fp_bits ) @@ -1744,6 +1747,21 @@ add_fp_unittest( FMinTest.h DEPENDS libc.src.math.fminf128 + libc.src.__support.CPP.algorithm + libc.src.__support.FPUtil.fp_bits +) + +add_fp_unittest( + fminf16_test + SUITE + libc-math-smoke-tests + SRCS + fminf16_test.cpp + HDRS + FMinTest.h + DEPENDS + libc.src.math.fminf16 + libc.src.__support.CPP.algorithm libc.src.__support.FPUtil.fp_bits ) @@ -1757,6 +1775,7 @@ add_fp_unittest( FMaxTest.h DEPENDS libc.src.math.fmaxf + libc.src.__support.CPP.algorithm libc.src.__support.FPUtil.fp_bits ) @@ -1770,6 +1789,7 @@ add_fp_unittest( FMaxTest.h DEPENDS libc.src.math.fmax + libc.src.__support.CPP.algorithm libc.src.__support.FPUtil.fp_bits ) @@ -1783,6 +1803,7 @@ add_fp_unittest( FMaxTest.h DEPENDS libc.src.math.fmaxl + libc.src.__support.CPP.algorithm libc.src.__support.FPUtil.fp_bits ) @@ -1796,6 +1817,21 @@ add_fp_unittest( FMaxTest.h DEPENDS 
libc.src.math.fmaxf128 + libc.src.__support.CPP.algorithm + libc.src.__support.FPUtil.fp_bits +) + +add_fp_unittest( + fmaxf16_test + SUITE + libc-math-smoke-tests + SRCS + fmaxf16_test.cpp + HDRS + FMaxTest.h + DEPENDS + libc.src.math.fmaxf16 + libc.src.__support.CPP.algorithm libc.src.__support.FPUtil.fp_bits ) @@ -1809,6 +1845,21 @@ add_fp_unittest( FMaximumTest.h DEPENDS libc.src.math.fmaximuml + libc.src.__support.CPP.algorithm + libc.src.__support.FPUtil.fp_bits +) + +add_fp_unittest( + fmaximumf16_test + SUITE + libc-math-smoke-tests + SRCS + fmaximumf16_test.cpp + HDRS + FMaximumTest.h + DEPENDS + libc.src.math.fmaximumf16 + libc.src.__support.CPP.algorithm libc.src.__support.FPUtil.fp_bits ) @@ -1822,6 +1873,7 @@ add_fp_unittest( FMaximumTest.h DEPENDS libc.src.math.fmaximumf128 + libc.src.__support.CPP.algorithm libc.src.__support.FPUtil.fp_bits ) @@ -1835,6 +1887,7 @@ add_fp_unittest( FMaximumTest.h DEPENDS libc.src.math.fmaximum + libc.src.__support.CPP.algorithm libc.src.__support.FPUtil.fp_bits ) @@ -1848,6 +1901,7 @@ add_fp_unittest( FMaximumTest.h DEPENDS libc.src.math.fmaximumf + libc.src.__support.CPP.algorithm libc.src.__support.FPUtil.fp_bits ) @@ -1861,6 +1915,7 @@ add_fp_unittest( FMaximumNumTest.h DEPENDS libc.src.math.fmaximum_numf + libc.src.__support.CPP.algorithm libc.src.__support.FPUtil.fp_bits ) @@ -1874,6 +1929,7 @@ add_fp_unittest( FMaximumNumTest.h DEPENDS libc.src.math.fmaximum_num + libc.src.__support.CPP.algorithm libc.src.__support.FPUtil.fp_bits ) @@ -1887,6 +1943,21 @@ add_fp_unittest( FMaximumNumTest.h DEPENDS libc.src.math.fmaximum_numl + libc.src.__support.CPP.algorithm + libc.src.__support.FPUtil.fp_bits +) + +add_fp_unittest( + fmaximum_numf16_test + SUITE + libc-math-smoke-tests + SRCS + fmaximum_numf16_test.cpp + HDRS + FMaximumNumTest.h + DEPENDS + libc.src.math.fmaximum_numf16 + libc.src.__support.CPP.algorithm libc.src.__support.FPUtil.fp_bits ) @@ -1900,6 +1971,7 @@ add_fp_unittest( FMaximumNumTest.h DEPENDS 
libc.src.math.fmaximum_numf128 + libc.src.__support.CPP.algorithm libc.src.__support.FPUtil.fp_bits ) @@ -1913,6 +1985,8 @@ add_fp_unittest( FMaximumMagTest.h DEPENDS libc.src.math.fmaximum_magf + libc.src.__support.CPP.algorithm + libc.src.__support.FPUtil.basic_operations libc.src.__support.FPUtil.fp_bits ) @@ -1926,6 +2000,8 @@ add_fp_unittest( FMaximumMagTest.h DEPENDS libc.src.math.fmaximum_mag + libc.src.__support.CPP.algorithm + libc.src.__support.FPUtil.basic_operations libc.src.__support.FPUtil.fp_bits ) @@ -1939,6 +2015,23 @@ add_fp_unittest( FMaximumMagTest.h DEPENDS libc.src.math.fmaximum_magl + libc.src.__support.CPP.algorithm + libc.src.__support.FPUtil.basic_operations + libc.src.__support.FPUtil.fp_bits +) + +add_fp_unittest( + fmaximum_magf16_test + SUITE + libc-math-smoke-tests + SRCS + fmaximum_magf16_test.cpp + HDRS + FMaximumMagTest.h + DEPENDS + libc.src.math.fmaximum_magf16 + libc.src.__support.CPP.algorithm + libc.src.__support.FPUtil.basic_operations libc.src.__support.FPUtil.fp_bits ) @@ -1952,10 +2045,11 @@ add_fp_unittest( FMaximumMagTest.h DEPENDS libc.src.math.fmaximum_magf128 + libc.src.__support.CPP.algorithm + libc.src.__support.FPUtil.basic_operations libc.src.__support.FPUtil.fp_bits ) - add_fp_unittest( fmaximum_mag_numf_test SUITE @@ -1966,6 +2060,7 @@ add_fp_unittest( FMaximumMagNumTest.h DEPENDS libc.src.math.fmaximum_mag_numf + libc.src.__support.FPUtil.basic_operations libc.src.__support.FPUtil.fp_bits ) @@ -1979,6 +2074,7 @@ add_fp_unittest( FMaximumMagNumTest.h DEPENDS libc.src.math.fmaximum_mag_num + libc.src.__support.FPUtil.basic_operations libc.src.__support.FPUtil.fp_bits ) @@ -1992,6 +2088,21 @@ add_fp_unittest( FMaximumMagNumTest.h DEPENDS libc.src.math.fmaximum_mag_numl + libc.src.__support.FPUtil.basic_operations + libc.src.__support.FPUtil.fp_bits +) + +add_fp_unittest( + fmaximum_mag_numf16_test + SUITE + libc-math-smoke-tests + SRCS + fmaximum_mag_numf16_test.cpp + HDRS + FMaximumMagNumTest.h + DEPENDS + 
libc.src.math.fmaximum_mag_numf16 + libc.src.__support.FPUtil.basic_operations libc.src.__support.FPUtil.fp_bits ) @@ -2005,6 +2116,7 @@ add_fp_unittest( FMaximumMagNumTest.h DEPENDS libc.src.math.fmaximum_mag_numf128 + libc.src.__support.FPUtil.basic_operations libc.src.__support.FPUtil.fp_bits ) @@ -2018,6 +2130,21 @@ add_fp_unittest( FMinimumTest.h DEPENDS libc.src.math.fminimuml + libc.src.__support.CPP.algorithm + libc.src.__support.FPUtil.fp_bits +) + +add_fp_unittest( + fminimumf16_test + SUITE + libc-math-smoke-tests + SRCS + fminimumf16_test.cpp + HDRS + FMinimumTest.h + DEPENDS + libc.src.math.fminimumf16 + libc.src.__support.CPP.algorithm libc.src.__support.FPUtil.fp_bits ) @@ -2031,6 +2158,7 @@ add_fp_unittest( FMinimumTest.h DEPENDS libc.src.math.fminimumf128 + libc.src.__support.CPP.algorithm libc.src.__support.FPUtil.fp_bits ) @@ -2044,6 +2172,7 @@ add_fp_unittest( FMinimumTest.h DEPENDS libc.src.math.fminimum + libc.src.__support.CPP.algorithm libc.src.__support.FPUtil.fp_bits ) @@ -2057,6 +2186,7 @@ add_fp_unittest( FMinimumTest.h DEPENDS libc.src.math.fminimumf + libc.src.__support.CPP.algorithm libc.src.__support.FPUtil.fp_bits ) @@ -2070,6 +2200,7 @@ add_fp_unittest( FMinimumNumTest.h DEPENDS libc.src.math.fminimum_numf + libc.src.__support.CPP.algorithm libc.src.__support.FPUtil.fp_bits ) @@ -2083,6 +2214,7 @@ add_fp_unittest( FMinimumNumTest.h DEPENDS libc.src.math.fminimum_num + libc.src.__support.CPP.algorithm libc.src.__support.FPUtil.fp_bits ) @@ -2096,6 +2228,21 @@ add_fp_unittest( FMinimumNumTest.h DEPENDS libc.src.math.fminimum_numl + libc.src.__support.CPP.algorithm + libc.src.__support.FPUtil.fp_bits +) + +add_fp_unittest( + fminimum_numf16_test + SUITE + libc-math-smoke-tests + SRCS + fminimum_numf16_test.cpp + HDRS + FMinimumNumTest.h + DEPENDS + libc.src.math.fminimum_numf16 + libc.src.__support.CPP.algorithm libc.src.__support.FPUtil.fp_bits ) @@ -2109,6 +2256,7 @@ add_fp_unittest( FMinimumNumTest.h DEPENDS 
libc.src.math.fminimum_numf128 + libc.src.__support.CPP.algorithm libc.src.__support.FPUtil.fp_bits ) @@ -2122,6 +2270,7 @@ add_fp_unittest( FMinimumMagTest.h DEPENDS libc.src.math.fminimum_magf + libc.src.__support.CPP.algorithm libc.src.__support.FPUtil.fp_bits ) @@ -2135,6 +2284,7 @@ add_fp_unittest( FMinimumMagTest.h DEPENDS libc.src.math.fminimum_mag + libc.src.__support.CPP.algorithm libc.src.__support.FPUtil.fp_bits ) @@ -2148,6 +2298,21 @@ add_fp_unittest( FMinimumMagTest.h DEPENDS libc.src.math.fminimum_magl + libc.src.__support.CPP.algorithm + libc.src.__support.FPUtil.fp_bits +) + +add_fp_unittest( + fminimum_magf16_test + SUITE + libc-math-smoke-tests + SRCS + fminimum_magf16_test.cpp + HDRS + FMinimumMagTest.h + DEPENDS + libc.src.math.fminimum_magf16 + libc.src.__support.CPP.algorithm libc.src.__support.FPUtil.fp_bits ) @@ -2161,10 +2326,10 @@ add_fp_unittest( FMinimumMagTest.h DEPENDS libc.src.math.fminimum_magf128 + libc.src.__support.CPP.algorithm libc.src.__support.FPUtil.fp_bits ) - add_fp_unittest( fminimum_mag_numf_test SUITE @@ -2175,6 +2340,7 @@ add_fp_unittest( FMinimumMagNumTest.h DEPENDS libc.src.math.fminimum_mag_numf + libc.src.__support.CPP.algorithm libc.src.__support.FPUtil.fp_bits ) @@ -2188,6 +2354,7 @@ add_fp_unittest( FMinimumMagNumTest.h DEPENDS libc.src.math.fminimum_mag_num + libc.src.__support.CPP.algorithm libc.src.__support.FPUtil.fp_bits ) @@ -2201,6 +2368,21 @@ add_fp_unittest( FMinimumMagNumTest.h DEPENDS libc.src.math.fminimum_mag_numl + libc.src.__support.CPP.algorithm + libc.src.__support.FPUtil.fp_bits +) + +add_fp_unittest( + fminimum_mag_numf16_test + SUITE + libc-math-smoke-tests + SRCS + fminimum_mag_numf16_test.cpp + HDRS + FMinimumMagNumTest.h + DEPENDS + libc.src.math.fminimum_mag_numf16 + libc.src.__support.CPP.algorithm libc.src.__support.FPUtil.fp_bits ) @@ -2214,6 +2396,7 @@ add_fp_unittest( FMinimumMagNumTest.h DEPENDS libc.src.math.fminimum_mag_numf128 + libc.src.__support.CPP.algorithm 
libc.src.__support.FPUtil.fp_bits ) @@ -2515,8 +2698,10 @@ add_fp_unittest( HDRS NextAfterTest.h DEPENDS + libc.hdr.fenv_macros libc.src.math.nextafter - libc.src.__support.FPUtil.basic_operations + libc.src.__support.CPP.bit + libc.src.__support.FPUtil.fenv_impl libc.src.__support.FPUtil.fp_bits ) @@ -2529,8 +2714,10 @@ add_fp_unittest( HDRS NextAfterTest.h DEPENDS + libc.hdr.fenv_macros libc.src.math.nextafterf - libc.src.__support.FPUtil.basic_operations + libc.src.__support.CPP.bit + libc.src.__support.FPUtil.fenv_impl libc.src.__support.FPUtil.fp_bits ) @@ -2543,8 +2730,26 @@ add_fp_unittest( HDRS NextAfterTest.h DEPENDS + libc.hdr.fenv_macros libc.src.math.nextafterl - libc.src.__support.FPUtil.basic_operations + libc.src.__support.CPP.bit + libc.src.__support.FPUtil.fenv_impl + libc.src.__support.FPUtil.fp_bits +) + +add_fp_unittest( + nextafterf16_test + SUITE + libc-math-smoke-tests + SRCS + nextafterf16_test.cpp + HDRS + NextAfterTest.h + DEPENDS + libc.hdr.fenv_macros + libc.src.math.nextafterf16 + libc.src.__support.CPP.bit + libc.src.__support.FPUtil.fenv_impl libc.src.__support.FPUtil.fp_bits ) @@ -2557,8 +2762,10 @@ add_fp_unittest( HDRS NextAfterTest.h DEPENDS + libc.hdr.fenv_macros libc.src.math.nextafterf128 - libc.src.__support.FPUtil.basic_operations + libc.src.__support.CPP.bit + libc.src.__support.FPUtil.fenv_impl libc.src.__support.FPUtil.fp_bits ) @@ -2573,8 +2780,10 @@ if(NOT LIBC_TARGET_OS_IS_GPU) HDRS NextTowardTest.h DEPENDS + libc.hdr.fenv_macros libc.src.math.nexttoward - libc.src.__support.FPUtil.basic_operations + libc.src.__support.CPP.bit + libc.src.__support.FPUtil.fenv_impl libc.src.__support.FPUtil.fp_bits ) @@ -2587,8 +2796,10 @@ if(NOT LIBC_TARGET_OS_IS_GPU) HDRS NextTowardTest.h DEPENDS + libc.hdr.fenv_macros libc.src.math.nexttowardf - libc.src.__support.FPUtil.basic_operations + libc.src.__support.CPP.bit + libc.src.__support.FPUtil.fenv_impl libc.src.__support.FPUtil.fp_bits ) endif() @@ -2602,8 +2813,26 @@ 
add_fp_unittest( HDRS NextTowardTest.h DEPENDS + libc.hdr.fenv_macros libc.src.math.nexttowardl - libc.src.__support.FPUtil.basic_operations + libc.src.__support.CPP.bit + libc.src.__support.FPUtil.fenv_impl + libc.src.__support.FPUtil.fp_bits +) + +add_fp_unittest( + nexttowardf16_test + SUITE + libc-math-smoke-tests + SRCS + nexttowardf16_test.cpp + HDRS + NextTowardTest.h + DEPENDS + libc.hdr.fenv_macros + libc.src.math.nexttowardf16 + libc.src.__support.CPP.bit + libc.src.__support.FPUtil.fenv_impl libc.src.__support.FPUtil.fp_bits ) @@ -2617,7 +2846,6 @@ add_fp_unittest( NextDownTest.h DEPENDS libc.src.math.nextdown - libc.src.__support.FPUtil.manipulation_functions ) add_fp_unittest( @@ -2630,7 +2858,6 @@ add_fp_unittest( NextDownTest.h DEPENDS libc.src.math.nextdownf - libc.src.__support.FPUtil.manipulation_functions ) add_fp_unittest( @@ -2643,7 +2870,18 @@ add_fp_unittest( NextDownTest.h DEPENDS libc.src.math.nextdownl - libc.src.__support.FPUtil.manipulation_functions +) + +add_fp_unittest( + nextdownf16_test + SUITE + libc-math-smoke-tests + SRCS + nextdownf16_test.cpp + HDRS + NextDownTest.h + DEPENDS + libc.src.math.nextdownf16 ) add_fp_unittest( @@ -2656,7 +2894,6 @@ add_fp_unittest( NextDownTest.h DEPENDS libc.src.math.nextdownf128 - libc.src.__support.FPUtil.manipulation_functions ) add_fp_unittest( @@ -2669,7 +2906,6 @@ add_fp_unittest( NextUpTest.h DEPENDS libc.src.math.nextup - libc.src.__support.FPUtil.manipulation_functions ) add_fp_unittest( @@ -2682,7 +2918,6 @@ add_fp_unittest( NextUpTest.h DEPENDS libc.src.math.nextupf - libc.src.__support.FPUtil.manipulation_functions ) add_fp_unittest( @@ -2695,7 +2930,18 @@ add_fp_unittest( NextUpTest.h DEPENDS libc.src.math.nextupl - libc.src.__support.FPUtil.manipulation_functions +) + +add_fp_unittest( + nextupf16_test + SUITE + libc-math-smoke-tests + SRCS + nextupf16_test.cpp + HDRS + NextUpTest.h + DEPENDS + libc.src.math.nextupf16 ) add_fp_unittest( @@ -2708,7 +2954,6 @@ add_fp_unittest( 
NextUpTest.h DEPENDS libc.src.math.nextupf128 - libc.src.__support.FPUtil.manipulation_functions ) # TODO(lntue): The current implementation of fputil::general::fma<float> is only diff --git a/libc/test/src/math/smoke/FMaxTest.h b/libc/test/src/math/smoke/FMaxTest.h index df8e35e..f4c78b5 100644 --- a/libc/test/src/math/smoke/FMaxTest.h +++ b/libc/test/src/math/smoke/FMaxTest.h @@ -9,6 +9,7 @@ #ifndef LLVM_LIBC_TEST_SRC_MATH_SMOKE_FMAXTEST_H #define LLVM_LIBC_TEST_SRC_MATH_SMOKE_FMAXTEST_H +#include "src/__support/CPP/algorithm.h" #include "test/UnitTest/FEnvSafeTest.h" #include "test/UnitTest/FPMatcher.h" #include "test/UnitTest/Test.h" @@ -55,10 +56,11 @@ public: } void testRange(FMaxFunc func) { - constexpr StorageType COUNT = 100'001; - constexpr StorageType STEP = STORAGE_MAX / COUNT; - for (StorageType i = 0, v = 0, w = STORAGE_MAX; i <= COUNT; - ++i, v += STEP, w -= STEP) { + constexpr int COUNT = 100'001; + constexpr StorageType STEP = LIBC_NAMESPACE::cpp::max( + static_cast<StorageType>(STORAGE_MAX / COUNT), StorageType(1)); + StorageType v = 0, w = STORAGE_MAX; + for (int i = 0; i <= COUNT; ++i, v += STEP, w -= STEP) { FPBits xbits(v), ybits(w); if (xbits.is_inf_or_nan()) continue; diff --git a/libc/test/src/math/smoke/FMaximumMagNumTest.h b/libc/test/src/math/smoke/FMaximumMagNumTest.h index aafb6d2..726f870 100644 --- a/libc/test/src/math/smoke/FMaximumMagNumTest.h +++ b/libc/test/src/math/smoke/FMaximumMagNumTest.h @@ -9,6 +9,7 @@ #ifndef LLVM_LIBC_TEST_SRC_MATH_SMOKE_FMAXIMUMMAG_NUMTEST_H #define LLVM_LIBC_TEST_SRC_MATH_SMOKE_FMAXIMUMMAG_NUMTEST_H +#include "src/__support/CPP/algorithm.h" #include "src/__support/FPUtil/BasicOperations.h" #include "src/__support/FPUtil/FPBits.h" #include "test/UnitTest/FEnvSafeTest.h" @@ -68,10 +69,11 @@ public: } void testRange(FMaximumMagNumFunc func) { - constexpr StorageType COUNT = 100'001; - constexpr StorageType STEP = STORAGE_MAX / COUNT; - for (StorageType i = 0, v = 0, w = STORAGE_MAX; i <= COUNT; - ++i, v += 
STEP, w -= STEP) { + constexpr int COUNT = 100'001; + constexpr StorageType STEP = LIBC_NAMESPACE::cpp::max( + static_cast<StorageType>(STORAGE_MAX / COUNT), StorageType(1)); + StorageType v = 0, w = STORAGE_MAX; + for (int i = 0; i <= COUNT; ++i, v += STEP, w -= STEP) { FPBits xbits(v), ybits(w); if (xbits.is_inf_or_nan()) continue; @@ -82,11 +84,10 @@ public: if ((x == 0) && (y == 0)) continue; - if (LIBC_NAMESPACE::fputil::abs(x) > LIBC_NAMESPACE::fputil::abs(y)) { + if (LIBC_NAMESPACE::fputil::abs(x) > LIBC_NAMESPACE::fputil::abs(y)) EXPECT_FP_EQ(x, func(x, y)); - } else { + else EXPECT_FP_EQ(y, func(x, y)); - } } } }; diff --git a/libc/test/src/math/smoke/FMaximumMagTest.h b/libc/test/src/math/smoke/FMaximumMagTest.h index 7bb79a6..b5b2c1c 100644 --- a/libc/test/src/math/smoke/FMaximumMagTest.h +++ b/libc/test/src/math/smoke/FMaximumMagTest.h @@ -9,6 +9,7 @@ #ifndef LLVM_LIBC_TEST_SRC_MATH_SMOKE_FMAXIMUM_MAGTEST_H #define LLVM_LIBC_TEST_SRC_MATH_SMOKE_FMAXIMUM_MAGTEST_H +#include "src/__support/CPP/algorithm.h" #include "src/__support/FPUtil/BasicOperations.h" #include "test/UnitTest/FEnvSafeTest.h" #include "test/UnitTest/FPMatcher.h" @@ -56,10 +57,11 @@ public: } void testRange(FMaximumMagFunc func) { - constexpr StorageType COUNT = 100'001; - constexpr StorageType STEP = STORAGE_MAX / COUNT; - for (StorageType i = 0, v = 0, w = STORAGE_MAX; i <= COUNT; - ++i, v += STEP, w -= STEP) { + constexpr int COUNT = 100'001; + constexpr StorageType STEP = LIBC_NAMESPACE::cpp::max( + static_cast<StorageType>(STORAGE_MAX / COUNT), StorageType(1)); + StorageType v = 0, w = STORAGE_MAX; + for (int i = 0; i <= COUNT; ++i, v += STEP, w -= STEP) { FPBits xbits(v), ybits(w); if (xbits.is_inf_or_nan()) continue; @@ -70,11 +72,10 @@ public: if ((x == 0) && (y == 0)) continue; - if (LIBC_NAMESPACE::fputil::abs(x) > LIBC_NAMESPACE::fputil::abs(y)) { + if (LIBC_NAMESPACE::fputil::abs(x) > LIBC_NAMESPACE::fputil::abs(y)) EXPECT_FP_EQ(x, func(x, y)); - } else { + else 
EXPECT_FP_EQ(y, func(x, y)); - } } } }; diff --git a/libc/test/src/math/smoke/FMaximumNumTest.h b/libc/test/src/math/smoke/FMaximumNumTest.h index da0ea2c..ec79135 100644 --- a/libc/test/src/math/smoke/FMaximumNumTest.h +++ b/libc/test/src/math/smoke/FMaximumNumTest.h @@ -9,6 +9,7 @@ #ifndef LLVM_LIBC_TEST_SRC_MATH_SMOKE_FMAXIMUMNUMTEST_H #define LLVM_LIBC_TEST_SRC_MATH_SMOKE_FMAXIMUMNUMTEST_H +#include "src/__support/CPP/algorithm.h" #include "src/__support/FPUtil/FPBits.h" #include "test/UnitTest/FEnvSafeTest.h" #include "test/UnitTest/FPMatcher.h" @@ -67,10 +68,11 @@ public: } void testRange(FMaximumNumFunc func) { - constexpr StorageType COUNT = 100'001; - constexpr StorageType STEP = STORAGE_MAX / COUNT; - for (StorageType i = 0, v = 0, w = STORAGE_MAX; i <= COUNT; - ++i, v += STEP, w -= STEP) { + constexpr int COUNT = 100'001; + constexpr StorageType STEP = LIBC_NAMESPACE::cpp::max( + static_cast<StorageType>(STORAGE_MAX / COUNT), StorageType(1)); + StorageType v = 0, w = STORAGE_MAX; + for (int i = 0; i <= COUNT; ++i, v += STEP, w -= STEP) { FPBits xbits(v), ybits(w); if (xbits.is_inf_or_nan()) continue; @@ -81,11 +83,10 @@ public: if ((x == 0) && (y == 0)) continue; - if (x > y) { + if (x > y) EXPECT_FP_EQ(x, func(x, y)); - } else { + else EXPECT_FP_EQ(y, func(x, y)); - } } } }; diff --git a/libc/test/src/math/smoke/FMaximumTest.h b/libc/test/src/math/smoke/FMaximumTest.h index 1bd1516..94e4a34 100644 --- a/libc/test/src/math/smoke/FMaximumTest.h +++ b/libc/test/src/math/smoke/FMaximumTest.h @@ -9,6 +9,7 @@ #ifndef LLVM_LIBC_TEST_SRC_MATH_SMOKE_FMAXIMUMTEST_H #define LLVM_LIBC_TEST_SRC_MATH_SMOKE_FMAXIMUMTEST_H +#include "src/__support/CPP/algorithm.h" #include "test/UnitTest/FEnvSafeTest.h" #include "test/UnitTest/FPMatcher.h" #include "test/UnitTest/Test.h" @@ -55,10 +56,11 @@ public: } void testRange(FMaximumFunc func) { - constexpr StorageType COUNT = 100'001; - constexpr StorageType STEP = STORAGE_MAX / COUNT; - for (StorageType i = 0, v = 0, w = 
STORAGE_MAX; i <= COUNT; - ++i, v += STEP, w -= STEP) { + constexpr int COUNT = 100'001; + constexpr StorageType STEP = LIBC_NAMESPACE::cpp::max( + static_cast<StorageType>(STORAGE_MAX / COUNT), StorageType(1)); + StorageType v = 0, w = STORAGE_MAX; + for (int i = 0; i <= COUNT; ++i, v += STEP, w -= STEP) { FPBits xbits(v), ybits(w); if (xbits.is_inf_or_nan()) continue; @@ -69,11 +71,10 @@ public: if ((x == 0) && (y == 0)) continue; - if (x > y) { + if (x > y) EXPECT_FP_EQ(x, func(x, y)); - } else { + else EXPECT_FP_EQ(y, func(x, y)); - } } } }; diff --git a/libc/test/src/math/smoke/FMinTest.h b/libc/test/src/math/smoke/FMinTest.h index f71b558..629aaab 100644 --- a/libc/test/src/math/smoke/FMinTest.h +++ b/libc/test/src/math/smoke/FMinTest.h @@ -9,6 +9,7 @@ #ifndef LLVM_LIBC_TEST_SRC_MATH_SMOKE_FMINTEST_H #define LLVM_LIBC_TEST_SRC_MATH_SMOKE_FMINTEST_H +#include "src/__support/CPP/algorithm.h" #include "test/UnitTest/FEnvSafeTest.h" #include "test/UnitTest/FPMatcher.h" #include "test/UnitTest/Test.h" @@ -55,10 +56,11 @@ public: } void testRange(FMinFunc func) { - constexpr StorageType COUNT = 100'001; - constexpr StorageType STEP = STORAGE_MAX / COUNT; - for (StorageType i = 0, v = 0, w = STORAGE_MAX; i <= COUNT; - ++i, v += STEP, w -= STEP) { + constexpr int COUNT = 100'001; + constexpr StorageType STEP = LIBC_NAMESPACE::cpp::max( + static_cast<StorageType>(STORAGE_MAX / COUNT), StorageType(1)); + StorageType v = 0, w = STORAGE_MAX; + for (int i = 0; i <= COUNT; ++i, v += STEP, w -= STEP) { FPBits xbits(v), ybits(w); if (xbits.is_inf_or_nan()) continue; diff --git a/libc/test/src/math/smoke/FMinimumMagNumTest.h b/libc/test/src/math/smoke/FMinimumMagNumTest.h index e4b8fd9..2ceca6f 100644 --- a/libc/test/src/math/smoke/FMinimumMagNumTest.h +++ b/libc/test/src/math/smoke/FMinimumMagNumTest.h @@ -9,6 +9,7 @@ #ifndef LLVM_LIBC_TEST_SRC_MATH_SMOKE_FMINIMUMMAG_NUMTEST_H #define LLVM_LIBC_TEST_SRC_MATH_SMOKE_FMINIMUMMAG_NUMTEST_H +#include 
"src/__support/CPP/algorithm.h" #include "src/__support/FPUtil/BasicOperations.h" #include "src/__support/FPUtil/FPBits.h" #include "test/UnitTest/FEnvSafeTest.h" @@ -68,10 +69,11 @@ public: } void testRange(FMinimumMagNumFunc func) { - constexpr StorageType COUNT = 100'001; - constexpr StorageType STEP = STORAGE_MAX / COUNT; - for (StorageType i = 0, v = 0, w = STORAGE_MAX; i <= COUNT; - ++i, v += STEP, w -= STEP) { + constexpr int COUNT = 100'001; + constexpr StorageType STEP = LIBC_NAMESPACE::cpp::max( + static_cast<StorageType>(STORAGE_MAX / COUNT), StorageType(1)); + StorageType v = 0, w = STORAGE_MAX; + for (int i = 0; i <= COUNT; ++i, v += STEP, w -= STEP) { FPBits xbits(v), ybits(w); if (xbits.is_inf_or_nan()) continue; @@ -82,11 +84,10 @@ public: if ((x == 0) && (y == 0)) continue; - if (LIBC_NAMESPACE::fputil::abs(x) > LIBC_NAMESPACE::fputil::abs(y)) { + if (LIBC_NAMESPACE::fputil::abs(x) > LIBC_NAMESPACE::fputil::abs(y)) EXPECT_FP_EQ(y, func(x, y)); - } else { + else EXPECT_FP_EQ(x, func(x, y)); - } } } }; diff --git a/libc/test/src/math/smoke/FMinimumMagTest.h b/libc/test/src/math/smoke/FMinimumMagTest.h index 3e16622..9c49446 100644 --- a/libc/test/src/math/smoke/FMinimumMagTest.h +++ b/libc/test/src/math/smoke/FMinimumMagTest.h @@ -9,6 +9,7 @@ #ifndef LLVM_LIBC_TEST_SRC_MATH_SMOKE_FMINIMUM_MAGTEST_H #define LLVM_LIBC_TEST_SRC_MATH_SMOKE_FMINIMUM_MAGTEST_H +#include "src/__support/CPP/algorithm.h" #include "src/__support/FPUtil/BasicOperations.h" #include "test/UnitTest/FEnvSafeTest.h" #include "test/UnitTest/FPMatcher.h" @@ -56,10 +57,11 @@ public: } void testRange(FMinimumMagFunc func) { - constexpr StorageType COUNT = 100'001; - constexpr StorageType STEP = STORAGE_MAX / COUNT; - for (StorageType i = 0, v = 0, w = STORAGE_MAX; i <= COUNT; - ++i, v += STEP, w -= STEP) { + constexpr int COUNT = 100'001; + constexpr StorageType STEP = LIBC_NAMESPACE::cpp::max( + static_cast<StorageType>(STORAGE_MAX / COUNT), StorageType(1)); + StorageType v = 0, w = 
STORAGE_MAX; + for (int i = 0; i <= COUNT; ++i, v += STEP, w -= STEP) { FPBits xbits(v), ybits(w); if (xbits.is_inf_or_nan()) continue; @@ -70,11 +72,10 @@ public: if ((x == 0) && (y == 0)) continue; - if (LIBC_NAMESPACE::fputil::abs(x) < LIBC_NAMESPACE::fputil::abs(y)) { + if (LIBC_NAMESPACE::fputil::abs(x) < LIBC_NAMESPACE::fputil::abs(y)) EXPECT_FP_EQ(x, func(x, y)); - } else { + else EXPECT_FP_EQ(y, func(x, y)); - } } } }; diff --git a/libc/test/src/math/smoke/FMinimumNumTest.h b/libc/test/src/math/smoke/FMinimumNumTest.h index 6186ea0..8004ee9 100644 --- a/libc/test/src/math/smoke/FMinimumNumTest.h +++ b/libc/test/src/math/smoke/FMinimumNumTest.h @@ -9,6 +9,7 @@ #ifndef LLVM_LIBC_TEST_SRC_MATH_SMOKE_FMINIMUMNUMTEST_H #define LLVM_LIBC_TEST_SRC_MATH_SMOKE_FMINIMUMNUMTEST_H +#include "src/__support/CPP/algorithm.h" #include "src/__support/FPUtil/FPBits.h" #include "test/UnitTest/FEnvSafeTest.h" #include "test/UnitTest/FPMatcher.h" @@ -67,10 +68,11 @@ public: } void testRange(FMinimumNumFunc func) { - constexpr StorageType COUNT = 100'001; - constexpr StorageType STEP = STORAGE_MAX / COUNT; - for (StorageType i = 0, v = 0, w = STORAGE_MAX; i <= COUNT; - ++i, v += STEP, w -= STEP) { + constexpr int COUNT = 100'001; + constexpr StorageType STEP = LIBC_NAMESPACE::cpp::max( + static_cast<StorageType>(STORAGE_MAX / COUNT), StorageType(1)); + StorageType v = 0, w = STORAGE_MAX; + for (int i = 0; i <= COUNT; ++i, v += STEP, w -= STEP) { FPBits xbits(v), ybits(w); if (xbits.is_inf_or_nan()) continue; @@ -81,11 +83,10 @@ public: if ((x == 0) && (y == 0)) continue; - if (x > y) { + if (x > y) EXPECT_FP_EQ(y, func(x, y)); - } else { + else EXPECT_FP_EQ(x, func(x, y)); - } } } }; diff --git a/libc/test/src/math/smoke/FMinimumTest.h b/libc/test/src/math/smoke/FMinimumTest.h index a267f6c..242c857 100644 --- a/libc/test/src/math/smoke/FMinimumTest.h +++ b/libc/test/src/math/smoke/FMinimumTest.h @@ -9,6 +9,7 @@ #ifndef LLVM_LIBC_TEST_SRC_MATH_SMOKE_FMINIMUMTEST_H #define 
LLVM_LIBC_TEST_SRC_MATH_SMOKE_FMINIMUMTEST_H +#include "src/__support/CPP/algorithm.h" #include "test/UnitTest/FEnvSafeTest.h" #include "test/UnitTest/FPMatcher.h" #include "test/UnitTest/Test.h" @@ -55,10 +56,11 @@ public: } void testRange(FMinimumFunc func) { - constexpr StorageType COUNT = 100'001; - constexpr StorageType STEP = STORAGE_MAX / COUNT; - for (StorageType i = 0, v = 0, w = STORAGE_MAX; i <= COUNT; - ++i, v += STEP, w -= STEP) { + constexpr int COUNT = 100'001; + constexpr StorageType STEP = LIBC_NAMESPACE::cpp::max( + static_cast<StorageType>(STORAGE_MAX / COUNT), StorageType(1)); + StorageType v = 0, w = STORAGE_MAX; + for (int i = 0; i <= COUNT; ++i, v += STEP, w -= STEP) { FPBits xbits(v), ybits(w); if (xbits.is_inf_or_nan()) continue; @@ -69,11 +71,10 @@ public: if ((x == 0) && (y == 0)) continue; - if (x > y) { + if (x > y) EXPECT_FP_EQ(y, func(x, y)); - } else { + else EXPECT_FP_EQ(x, func(x, y)); - } } } }; diff --git a/libc/test/src/math/smoke/NextAfterTest.h b/libc/test/src/math/smoke/NextAfterTest.h index d65ccdf..6278f89 100644 --- a/libc/test/src/math/smoke/NextAfterTest.h +++ b/libc/test/src/math/smoke/NextAfterTest.h @@ -9,15 +9,15 @@ #ifndef LLVM_LIBC_TEST_SRC_MATH_NEXTAFTERTEST_H #define LLVM_LIBC_TEST_SRC_MATH_NEXTAFTERTEST_H -#include "hdr/math_macros.h" #include "src/__support/CPP/bit.h" -#include "src/__support/CPP/type_traits.h" -#include "src/__support/FPUtil/BasicOperations.h" +#include "src/__support/FPUtil/FEnvImpl.h" #include "src/__support/FPUtil/FPBits.h" #include "test/UnitTest/FEnvSafeTest.h" #include "test/UnitTest/FPMatcher.h" #include "test/UnitTest/Test.h" +#include "hdr/fenv_macros.h" + // TODO: Strengthen errno,exception checks and remove these assert macros // after new matchers/test fixtures are added #define ASSERT_FP_EQ_WITH_EXCEPTION(result, expected, expected_exception) \ @@ -181,7 +181,7 @@ public: result_bits = FPBits(result); ASSERT_EQ(result_bits.get_biased_exponent(), x_bits.get_biased_exponent()); 
ASSERT_EQ(result_bits.get_mantissa(), - x_bits.get_mantissa() + StorageType(1)); + static_cast<StorageType>(x_bits.get_mantissa() + StorageType(1))); x = -x; @@ -195,7 +195,7 @@ public: result_bits = FPBits(result); ASSERT_EQ(result_bits.get_biased_exponent(), x_bits.get_biased_exponent()); ASSERT_EQ(result_bits.get_mantissa(), - x_bits.get_mantissa() + StorageType(1)); + static_cast<StorageType>(x_bits.get_mantissa() + StorageType(1))); } }; diff --git a/libc/test/src/math/smoke/NextTowardTest.h b/libc/test/src/math/smoke/NextTowardTest.h index a24ec9f..5992273 100644 --- a/libc/test/src/math/smoke/NextTowardTest.h +++ b/libc/test/src/math/smoke/NextTowardTest.h @@ -9,16 +9,15 @@ #ifndef LLVM_LIBC_TEST_SRC_MATH_NEXTTOWARDTEST_H #define LLVM_LIBC_TEST_SRC_MATH_NEXTTOWARDTEST_H -#include "hdr/fenv_macros.h" -#include "hdr/math_macros.h" #include "src/__support/CPP/bit.h" -#include "src/__support/CPP/type_traits.h" -#include "src/__support/FPUtil/BasicOperations.h" +#include "src/__support/FPUtil/FEnvImpl.h" #include "src/__support/FPUtil/FPBits.h" #include "test/UnitTest/FEnvSafeTest.h" #include "test/UnitTest/FPMatcher.h" #include "test/UnitTest/Test.h" +#include "hdr/fenv_macros.h" + // TODO: Strengthen errno,exception checks and remove these assert macros // after new matchers/test fixtures are added #define ASSERT_FP_EQ_WITH_EXCEPTION(result, expected, expected_exception) \ @@ -194,7 +193,7 @@ public: result_bits = FPBits(result); ASSERT_EQ(result_bits.get_biased_exponent(), x_bits.get_biased_exponent()); ASSERT_EQ(result_bits.get_mantissa(), - x_bits.get_mantissa() + StorageType(1)); + static_cast<StorageType>(x_bits.get_mantissa() + StorageType(1))); x = -x; @@ -208,7 +207,7 @@ public: result_bits = FPBits(result); ASSERT_EQ(result_bits.get_biased_exponent(), x_bits.get_biased_exponent()); ASSERT_EQ(result_bits.get_mantissa(), - x_bits.get_mantissa() + StorageType(1)); + static_cast<StorageType>(x_bits.get_mantissa() + StorageType(1))); } }; diff --git 
a/libc/test/src/math/smoke/fmaxf16_test.cpp b/libc/test/src/math/smoke/fmaxf16_test.cpp new file mode 100644 index 0000000..79c03b7 --- /dev/null +++ b/libc/test/src/math/smoke/fmaxf16_test.cpp @@ -0,0 +1,13 @@ +//===-- Unittests for fmaxf16 --------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "FMaxTest.h" + +#include "src/math/fmaxf16.h" + +LIST_FMAX_TESTS(float16, LIBC_NAMESPACE::fmaxf16) diff --git a/libc/test/src/math/smoke/fmaximum_mag_numf16_test.cpp b/libc/test/src/math/smoke/fmaximum_mag_numf16_test.cpp new file mode 100644 index 0000000..b11653e --- /dev/null +++ b/libc/test/src/math/smoke/fmaximum_mag_numf16_test.cpp @@ -0,0 +1,13 @@ +//===-- Unittests for fmaximum_mag_numf16 ---------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "FMaximumMagNumTest.h" + +#include "src/math/fmaximum_mag_numf16.h" + +LIST_FMAXIMUM_MAG_NUM_TESTS(float16, LIBC_NAMESPACE::fmaximum_mag_numf16) diff --git a/libc/test/src/math/smoke/fmaximum_magf16_test.cpp b/libc/test/src/math/smoke/fmaximum_magf16_test.cpp new file mode 100644 index 0000000..6df1e4a --- /dev/null +++ b/libc/test/src/math/smoke/fmaximum_magf16_test.cpp @@ -0,0 +1,13 @@ +//===-- Unittests for fmaximum_magf16 -------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "FMaximumMagTest.h" + +#include "src/math/fmaximum_magf16.h" + +LIST_FMAXIMUM_MAG_TESTS(float16, LIBC_NAMESPACE::fmaximum_magf16) diff --git a/libc/test/src/math/smoke/fmaximum_numf16_test.cpp b/libc/test/src/math/smoke/fmaximum_numf16_test.cpp new file mode 100644 index 0000000..7cb9cb0 --- /dev/null +++ b/libc/test/src/math/smoke/fmaximum_numf16_test.cpp @@ -0,0 +1,13 @@ +//===-- Unittests for fmaximum_numf16 -------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "FMaximumNumTest.h" + +#include "src/math/fmaximum_numf16.h" + +LIST_FMAXIMUM_NUM_TESTS(float16, LIBC_NAMESPACE::fmaximum_numf16) diff --git a/libc/test/src/math/smoke/fmaximumf16_test.cpp b/libc/test/src/math/smoke/fmaximumf16_test.cpp new file mode 100644 index 0000000..4cbf846 --- /dev/null +++ b/libc/test/src/math/smoke/fmaximumf16_test.cpp @@ -0,0 +1,13 @@ +//===-- Unittests for fmaximumf16 -----------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "FMaximumTest.h" + +#include "src/math/fmaximumf16.h" + +LIST_FMAXIMUM_TESTS(float16, LIBC_NAMESPACE::fmaximumf16) diff --git a/libc/test/src/math/smoke/fminf16_test.cpp b/libc/test/src/math/smoke/fminf16_test.cpp new file mode 100644 index 0000000..4379a6e --- /dev/null +++ b/libc/test/src/math/smoke/fminf16_test.cpp @@ -0,0 +1,13 @@ +//===-- Unittests for fminf16 ---------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "FMinTest.h" + +#include "src/math/fminf16.h" + +LIST_FMIN_TESTS(float16, LIBC_NAMESPACE::fminf16) diff --git a/libc/test/src/math/smoke/fminimum_mag_numf16_test.cpp b/libc/test/src/math/smoke/fminimum_mag_numf16_test.cpp new file mode 100644 index 0000000..2c6aede --- /dev/null +++ b/libc/test/src/math/smoke/fminimum_mag_numf16_test.cpp @@ -0,0 +1,13 @@ +//===-- Unittests for fminimum_mag_numf16 ---------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "FMinimumMagNumTest.h" + +#include "src/math/fminimum_mag_numf16.h" + +LIST_FMINIMUM_MAG_NUM_TESTS(float16, LIBC_NAMESPACE::fminimum_mag_numf16) diff --git a/libc/test/src/math/smoke/fminimum_magf16_test.cpp b/libc/test/src/math/smoke/fminimum_magf16_test.cpp new file mode 100644 index 0000000..3687aec --- /dev/null +++ b/libc/test/src/math/smoke/fminimum_magf16_test.cpp @@ -0,0 +1,13 @@ +//===-- Unittests for fminimum_magf16 -------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "FMinimumMagTest.h" + +#include "src/math/fminimum_magf16.h" + +LIST_FMINIMUM_MAG_TESTS(float16, LIBC_NAMESPACE::fminimum_magf16) diff --git a/libc/test/src/math/smoke/fminimum_numf16_test.cpp b/libc/test/src/math/smoke/fminimum_numf16_test.cpp new file mode 100644 index 0000000..6775081 --- /dev/null +++ b/libc/test/src/math/smoke/fminimum_numf16_test.cpp @@ -0,0 +1,13 @@ +//===-- Unittests for fminimum_numf16 -------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "FMinimumNumTest.h" + +#include "src/math/fminimum_numf16.h" + +LIST_FMINIMUM_NUM_TESTS(float16, LIBC_NAMESPACE::fminimum_numf16) diff --git a/libc/test/src/math/smoke/fminimumf16_test.cpp b/libc/test/src/math/smoke/fminimumf16_test.cpp new file mode 100644 index 0000000..f8b0577 --- /dev/null +++ b/libc/test/src/math/smoke/fminimumf16_test.cpp @@ -0,0 +1,13 @@ +//===-- Unittests for fminimumf16 -----------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "FMinimumTest.h" + +#include "src/math/fminimumf16.h" + +LIST_FMINIMUM_TESTS(float16, LIBC_NAMESPACE::fminimumf16) diff --git a/libc/test/src/math/smoke/nextafterf16_test.cpp b/libc/test/src/math/smoke/nextafterf16_test.cpp new file mode 100644 index 0000000..860a0c7 --- /dev/null +++ b/libc/test/src/math/smoke/nextafterf16_test.cpp @@ -0,0 +1,13 @@ +//===-- Unittests for nextafterf16 ----------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "NextAfterTest.h" + +#include "src/math/nextafterf16.h" + +LIST_NEXTAFTER_TESTS(float16, LIBC_NAMESPACE::nextafterf16) diff --git a/libc/test/src/math/smoke/nextdownf16_test.cpp b/libc/test/src/math/smoke/nextdownf16_test.cpp new file mode 100644 index 0000000..353f085 --- /dev/null +++ b/libc/test/src/math/smoke/nextdownf16_test.cpp @@ -0,0 +1,13 @@ +//===-- Unittests for nextdownf16 -----------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "NextDownTest.h" + +#include "src/math/nextdownf16.h" + +LIST_NEXTDOWN_TESTS(float16, LIBC_NAMESPACE::nextdownf16) diff --git a/libc/test/src/math/smoke/nexttowardf16_test.cpp b/libc/test/src/math/smoke/nexttowardf16_test.cpp new file mode 100644 index 0000000..8490e8d --- /dev/null +++ b/libc/test/src/math/smoke/nexttowardf16_test.cpp @@ -0,0 +1,13 @@ +//===-- Unittests for nexttowardf16 ---------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "NextTowardTest.h" + +#include "src/math/nexttowardf16.h" + +LIST_NEXTTOWARD_TESTS(float16, LIBC_NAMESPACE::nexttowardf16) diff --git a/libc/test/src/math/smoke/nextupf16_test.cpp b/libc/test/src/math/smoke/nextupf16_test.cpp new file mode 100644 index 0000000..a146d27 --- /dev/null +++ b/libc/test/src/math/smoke/nextupf16_test.cpp @@ -0,0 +1,13 @@ +//===-- Unittests for nextupf16 -------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "NextUpTest.h" + +#include "src/math/nextupf16.h" + +LIST_NEXTUP_TESTS(float16, LIBC_NAMESPACE::nextupf16) diff --git a/libc/test/src/stdlib/CMakeLists.txt b/libc/test/src/stdlib/CMakeLists.txt index 6a7faed..3848877 100644 --- a/libc/test/src/stdlib/CMakeLists.txt +++ b/libc/test/src/stdlib/CMakeLists.txt @@ -354,7 +354,20 @@ if(LLVM_LIBC_FULL_BUILD) libc.src.stdlib.exit libc.src.stdlib.atexit libc.src.__support.CPP.array - libc.src.__support.CPP.utility + ) + + add_libc_test( + at_quick_exit_test + # The EXPECT_EXITS test is only availible for unit tests. 
+ UNIT_TEST_ONLY + SUITE + libc-stdlib-tests + SRCS + at_quick_exit_test.cpp + DEPENDS + libc.src.stdlib.quick_exit + libc.src.stdlib.at_quick_exit + libc.src.__support.CPP.array ) add_libc_test( diff --git a/libc/test/src/stdlib/at_quick_exit_test.cpp b/libc/test/src/stdlib/at_quick_exit_test.cpp new file mode 100644 index 0000000..e0a258d --- /dev/null +++ b/libc/test/src/stdlib/at_quick_exit_test.cpp @@ -0,0 +1,90 @@ +//===-- Unittests for at_quick_exit ---------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "src/__support/CPP/array.h" +#include "src/__support/CPP/utility.h" +#include "src/stdlib/at_quick_exit.h" +#include "src/stdlib/quick_exit.h" +#include "test/UnitTest/Test.h" + +static int a; +TEST(LlvmLibcAtQuickExit, Basic) { + // In case tests ever run multiple times. + a = 0; + + auto test = [] { + int status = LIBC_NAMESPACE::at_quick_exit(+[] { + if (a != 1) + __builtin_trap(); + }); + status |= LIBC_NAMESPACE::at_quick_exit(+[] { a++; }); + if (status) + __builtin_trap(); + + LIBC_NAMESPACE::quick_exit(0); + }; + EXPECT_EXITS(test, 0); +} + +TEST(LlvmLibcAtQuickExit, AtQuickExitCallsSysExit) { + auto test = [] { + LIBC_NAMESPACE::at_quick_exit(+[] { _Exit(1); }); + LIBC_NAMESPACE::quick_exit(0); + }; + EXPECT_EXITS(test, 1); +} + +static int size; +static LIBC_NAMESPACE::cpp::array<int, 256> arr; + +template <int... 
Ts> +void register_at_quick_exit_handlers( + LIBC_NAMESPACE::cpp::integer_sequence<int, Ts...>) { + (LIBC_NAMESPACE::at_quick_exit(+[] { arr[size++] = Ts; }), ...); +} + +template <int count> constexpr auto get_test() { + return [] { + LIBC_NAMESPACE::at_quick_exit(+[] { + if (size != count) + __builtin_trap(); + for (int i = 0; i < count; i++) + if (arr[i] != count - 1 - i) + __builtin_trap(); + }); + register_at_quick_exit_handlers( + LIBC_NAMESPACE::cpp::make_integer_sequence<int, count>{}); + LIBC_NAMESPACE::quick_exit(0); + }; +} + +TEST(LlvmLibcAtQuickExit, ReverseOrder) { + // In case tests ever run multiple times. + size = 0; + + auto test = get_test<32>(); + EXPECT_EXITS(test, 0); +} + +TEST(LlvmLibcAtQuickExit, Many) { + // In case tests ever run multiple times. + size = 0; + + auto test = get_test<256>(); + EXPECT_EXITS(test, 0); +} + +TEST(LlvmLibcAtQuickExit, HandlerCallsAtQuickExit) { + auto test = [] { + LIBC_NAMESPACE::at_quick_exit(+[] { + LIBC_NAMESPACE::at_quick_exit(+[] { LIBC_NAMESPACE::quick_exit(1); }); + }); + LIBC_NAMESPACE::quick_exit(0); + }; + EXPECT_EXITS(test, 1); +} diff --git a/libclc/CMakeLists.txt b/libclc/CMakeLists.txt index 5ce1795..9858ae9 100644 --- a/libclc/CMakeLists.txt +++ b/libclc/CMakeLists.txt @@ -198,7 +198,7 @@ set( tahiti_aliases pitcairn verde oland hainan bonaire kabini kaveri hawaii gfx1010 gfx1011 gfx1012 gfx1013 gfx1030 gfx1031 gfx1032 gfx1033 gfx1034 gfx1035 gfx1036 gfx1100 gfx1101 gfx1102 gfx1103 - gfx1150 gfx1151 + gfx1150 gfx1151 gfx1152 gfx1200 gfx1201 ) diff --git a/libcxx/include/__type_traits/datasizeof.h b/libcxx/include/__type_traits/datasizeof.h index 54fde24..35c1292 100644 --- a/libcxx/include/__type_traits/datasizeof.h +++ b/libcxx/include/__type_traits/datasizeof.h @@ -26,7 +26,7 @@ _LIBCPP_BEGIN_NAMESPACE_STD -#if __has_extension(datasizeof) +#if __has_keyword(__datasizeof) || __has_extension(datasizeof) template <class _Tp> inline const size_t __datasizeof_v = __datasizeof(_Tp); #else diff --git 
a/libcxx/test/std/experimental/simd/simd.class/simd_copy.pass.cpp b/libcxx/test/std/experimental/simd/simd.class/simd_copy.pass.cpp index 8fcc811..7d91ca0 100644 --- a/libcxx/test/std/experimental/simd/simd.class/simd_copy.pass.cpp +++ b/libcxx/test/std/experimental/simd/simd.class/simd_copy.pass.cpp @@ -8,9 +8,9 @@ // UNSUPPORTED: c++03, c++11, c++14 -// FIXME: Fatal error with following targets (remove XFAIL when fixed): +// Older versions of clang may encounter a backend error (see 0295c2ad): // Pass-by-value arguments with alignment greater than register width are not supported. -// XFAIL: target=powerpc{{.*}}-ibm-aix7.2.5.7 +// XFAIL: target=powerpc{{.*}}-ibm-{{.*}} && (clang-17 || clang-18) // <experimental/simd> // diff --git a/lld/ELF/Writer.cpp b/lld/ELF/Writer.cpp index ff6d9be..544db20 100644 --- a/lld/ELF/Writer.cpp +++ b/lld/ELF/Writer.cpp @@ -1320,8 +1320,8 @@ template <class ELFT> void Writer<ELFT>::sortOrphanSections() { i = firstSectionOrDotAssignment; while (nonScriptI != e) { - OutputSection *orphan = &cast<OutputDesc>(*nonScriptI)->osec; auto pos = findOrphanPos(i, nonScriptI); + OutputSection *orphan = &cast<OutputDesc>(*nonScriptI)->osec; // As an optimization, find all sections with the same sort rank // and insert them with one rotate. diff --git a/lld/test/ELF/linkerscript/sections-nonalloc.s b/lld/test/ELF/linkerscript/sections-nonalloc.s index b4fab8c..d66e524 100644 --- a/lld/test/ELF/linkerscript/sections-nonalloc.s +++ b/lld/test/ELF/linkerscript/sections-nonalloc.s @@ -34,6 +34,11 @@ # RUN: ld.lld -T %t/b.lds %t.o -o %tb # RUN: llvm-readelf -S -l %tb | FileCheck %s --check-prefix=CHECK1 +## --section-start causes the orphan other3 to be considered before .data3. +## The non-alloc other3 does not disable the placement of .data3. 
+# RUN: ld.lld -T %t/b.lds %t.o -o %tb --section-start=other3=0 +# RUN: llvm-readelf -S -l %tb | FileCheck %s --check-prefix=CHECK1 + # CHECK1: [Nr] Name Type Address Off Size ES Flg Lk # CHECK1-NEXT: [ 0] NULL 0000000000000000 000000 000000 00 0 # CHECK1-NEXT: [ 1] .text PROGBITS 00000000000000b0 0000b0 000001 00 AX 0 diff --git a/lldb/examples/python/crashlog.py b/lldb/examples/python/crashlog.py index 641b2e6..1c0d717 100755 --- a/lldb/examples/python/crashlog.py +++ b/lldb/examples/python/crashlog.py @@ -284,7 +284,9 @@ class CrashLog(symbolication.Symbolicator): """Class that represents a binary images in a darwin crash log""" dsymForUUIDBinary = "/usr/local/bin/dsymForUUID" - if not os.path.exists(dsymForUUIDBinary): + if "LLDB_APPLE_DSYMFORUUID_EXECUTABLE" in os.environ: + dsymForUUIDBinary = os.environ["LLDB_APPLE_DSYMFORUUID_EXECUTABLE"] + elif not os.path.exists(dsymForUUIDBinary): try: dsymForUUIDBinary = ( subprocess.check_output("which dsymForUUID", shell=True) @@ -545,9 +547,9 @@ class CrashLog(symbolication.Symbolicator): for image in self.images: image.resolve = True elif options.crashed_only: + images_to_load = [] for thread in self.threads: - if thread.did_crash(): - images_to_load = [] + if thread.did_crash() or thread.app_specific_backtrace: for ident in thread.idents: for image in self.find_images_with_identifier(ident): image.resolve = True @@ -555,11 +557,15 @@ class CrashLog(symbolication.Symbolicator): futures = [] with tempfile.TemporaryDirectory() as obj_dir: - with concurrent.futures.ThreadPoolExecutor() as executor: - def add_module(image, target, obj_dir): - return image, image.add_module(target, obj_dir) + def add_module(image, target, obj_dir): + return image, image.add_module(target, obj_dir) + max_worker = None + if options.no_parallel_image_loading: + max_worker = 1 + + with concurrent.futures.ThreadPoolExecutor(max_worker) as executor: for image in images_to_load: if image not in loaded_images: if image.uuid == uuid.UUID(int=0): 
@@ -858,7 +864,7 @@ class JSONCrashLogParser(CrashLogParser): thread = self.crashlog.Thread( len(self.crashlog.threads), True, self.crashlog.process_arch ) - thread.queue = "Application Specific Backtrace" + thread.name = "Application Specific Backtrace" if self.parse_asi_backtrace(thread, json_app_specific_bts[0]): self.crashlog.threads.append(thread) else: @@ -868,7 +874,7 @@ class JSONCrashLogParser(CrashLogParser): thread = self.crashlog.Thread( len(self.crashlog.threads), True, self.crashlog.process_arch ) - thread.queue = "Last Exception Backtrace" + thread.name = "Last Exception Backtrace" self.parse_frames(thread, json_last_exc_bts) self.crashlog.threads.append(thread) @@ -1168,11 +1174,13 @@ class TextCrashLogParser(CrashLogParser): self.thread = self.crashlog.Thread( idx, True, self.crashlog.process_arch ) + self.thread.name = "Application Specific Backtrace" elif line.startswith("Last Exception Backtrace:"): # iOS self.parse_mode = self.CrashLogParseMode.THREAD self.app_specific_backtrace = True idx = 1 self.thread = self.crashlog.Thread(idx, True, self.crashlog.process_arch) + self.thread.name = "Last Exception Backtrace" self.crashlog.info_lines.append(line.strip()) def parse_thread(self, line): @@ -1528,6 +1536,7 @@ def load_crashlog_in_scripted_process(debugger, crashlog_path, options, result): "file_path": crashlog_path, "load_all_images": options.load_all_images, "crashed_only": options.crashed_only, + "no_parallel_image_loading": options.no_parallel_image_loading, } ) ) @@ -1720,6 +1729,13 @@ def CreateSymbolicateCrashLogOptions( help="show source for all threads, not just the crashed thread", default=False, ) + arg_parser.add_argument( + "--no-parallel-image-loading", + dest="no_parallel_image_loading", + action="store_true", + help=argparse.SUPPRESS, + default=False, + ) if add_interactive_options: arg_parser.add_argument( "-i", @@ -1798,6 +1814,9 @@ def SymbolicateCrashLogs(debugger, command_args, result, is_command): ) ) + if 
"NO_PARALLEL_IMG_LOADING" in os.environ: + options.no_parallel_image_loading = True + if options.version: print(debugger.GetVersionString()) return diff --git a/lldb/examples/python/crashlog_scripted_process.py b/lldb/examples/python/crashlog_scripted_process.py index 26c5c37..be0ed49 100644 --- a/lldb/examples/python/crashlog_scripted_process.py +++ b/lldb/examples/python/crashlog_scripted_process.py @@ -53,6 +53,7 @@ class CrashLogScriptedProcess(ScriptedProcess): class CrashLogOptions: load_all_images = False crashed_only = True + no_parallel_image_loading = False def __init__(self, exe_ctx: lldb.SBExecutionContext, args: lldb.SBStructuredData): super().__init__(exe_ctx, args) @@ -84,6 +85,13 @@ class CrashLogScriptedProcess(ScriptedProcess): if crashed_only.GetType() == lldb.eStructuredDataTypeBoolean: self.options.crashed_only = crashed_only.GetBooleanValue() + no_parallel_image_loading = args.GetValueForKey("no_parallel_image_loading") + if no_parallel_image_loading and no_parallel_image_loading.IsValid(): + if no_parallel_image_loading.GetType() == lldb.eStructuredDataTypeBoolean: + self.options.no_parallel_image_loading = ( + no_parallel_image_loading.GetBooleanValue() + ) + self.pid = super().get_process_id() self.crashed_thread_idx = 0 self.exception = None @@ -165,10 +173,7 @@ class CrashLogScriptedThread(ScriptedThread): self.backing_thread = crashlog_thread self.idx = self.backing_thread.index self.tid = self.backing_thread.id - if self.backing_thread.app_specific_backtrace: - self.name = "Application Specific Backtrace" - else: - self.name = self.backing_thread.name + self.name = self.backing_thread.name self.queue = self.backing_thread.queue self.has_crashed = self.originating_process.crashed_thread_idx == self.idx self.create_stackframes() diff --git a/lldb/include/lldb/Expression/DWARFExpression.h b/lldb/include/lldb/Expression/DWARFExpression.h index 1d85308..e85ba46 100644 --- a/lldb/include/lldb/Expression/DWARFExpression.h +++ 
b/lldb/include/lldb/Expression/DWARFExpression.h @@ -132,13 +132,12 @@ public: /// \return /// True on success; false otherwise. If error_ptr is non-NULL, /// details of the failure are provided through it. - static bool Evaluate(ExecutionContext *exe_ctx, RegisterContext *reg_ctx, - lldb::ModuleSP module_sp, const DataExtractor &opcodes, - const plugin::dwarf::DWARFUnit *dwarf_cu, - const lldb::RegisterKind reg_set, - const Value *initial_value_ptr, - const Value *object_address_ptr, Value &result, - Status *error_ptr); + static llvm::Expected<Value> + Evaluate(ExecutionContext *exe_ctx, RegisterContext *reg_ctx, + lldb::ModuleSP module_sp, const DataExtractor &opcodes, + const plugin::dwarf::DWARFUnit *dwarf_cu, + const lldb::RegisterKind reg_set, const Value *initial_value_ptr, + const Value *object_address_ptr); static bool ParseDWARFLocationList(const plugin::dwarf::DWARFUnit *dwarf_cu, const DataExtractor &data, diff --git a/lldb/include/lldb/Expression/DWARFExpressionList.h b/lldb/include/lldb/Expression/DWARFExpressionList.h index c2218ad..f711a1c 100644 --- a/lldb/include/lldb/Expression/DWARFExpressionList.h +++ b/lldb/include/lldb/Expression/DWARFExpressionList.h @@ -9,6 +9,7 @@ #ifndef LLDB_EXPRESSION_DWARFEXPRESSIONLIST_H #define LLDB_EXPRESSION_DWARFEXPRESSIONLIST_H +#include "lldb/Core/Value.h" #include "lldb/Expression/DWARFExpression.h" #include "lldb/Utility/RangeMap.h" #include "lldb/lldb-private.h" @@ -113,10 +114,11 @@ public: void SetModule(const lldb::ModuleSP &module) { m_module_wp = module; } - bool Evaluate(ExecutionContext *exe_ctx, RegisterContext *reg_ctx, - lldb::addr_t func_load_addr, const Value *initial_value_ptr, - const Value *object_address_ptr, Value &result, - Status *error_ptr) const; + llvm::Expected<Value> Evaluate(ExecutionContext *exe_ctx, + RegisterContext *reg_ctx, + lldb::addr_t func_load_addr, + const Value *initial_value_ptr, + const Value *object_address_ptr) const; private: // RangeDataVector requires a comparator 
for DWARFExpression, but it doesn't diff --git a/lldb/source/Core/ValueObject.cpp b/lldb/source/Core/ValueObject.cpp index 1443d9d..c5c434a 100644 --- a/lldb/source/Core/ValueObject.cpp +++ b/lldb/source/Core/ValueObject.cpp @@ -216,7 +216,7 @@ bool ValueObject::UpdateFormatsIfNeeded() { m_last_format_mgr_revision = DataVisualization::GetCurrentRevision(); any_change = true; - SetValueFormat(DataVisualization::GetFormat(*this, eNoDynamicValues)); + SetValueFormat(DataVisualization::GetFormat(*this, GetDynamicValueType())); SetSummaryFormat( DataVisualization::GetSummaryFormat(*this, GetDynamicValueType())); SetSyntheticChildren( diff --git a/lldb/source/Core/ValueObjectVariable.cpp b/lldb/source/Core/ValueObjectVariable.cpp index 67d71c9..51eb11d 100644 --- a/lldb/source/Core/ValueObjectVariable.cpp +++ b/lldb/source/Core/ValueObjectVariable.cpp @@ -164,8 +164,11 @@ bool ValueObjectVariable::UpdateValue() { target); } Value old_value(m_value); - if (expr_list.Evaluate(&exe_ctx, nullptr, loclist_base_load_addr, nullptr, - nullptr, m_value, &m_error)) { + llvm::Expected<Value> maybe_value = expr_list.Evaluate( + &exe_ctx, nullptr, loclist_base_load_addr, nullptr, nullptr); + + if (maybe_value) { + m_value = *maybe_value; m_resolved_value = m_value; m_value.SetContext(Value::ContextType::Variable, variable); @@ -246,6 +249,7 @@ bool ValueObjectVariable::UpdateValue() { SetValueIsValid(m_error.Success()); } else { + m_error = maybe_value.takeError(); // could not find location, won't allow editing m_resolved_value.SetContext(Value::ContextType::Invalid, nullptr); } diff --git a/lldb/source/Expression/DWARFExpression.cpp b/lldb/source/Expression/DWARFExpression.cpp index 7473bb8..05767a8 100644 --- a/lldb/source/Expression/DWARFExpression.cpp +++ b/lldb/source/Expression/DWARFExpression.cpp @@ -94,51 +94,38 @@ void DWARFExpression::SetRegisterKind(RegisterKind reg_kind) { m_reg_kind = reg_kind; } - -static bool ReadRegisterValueAsScalar(RegisterContext *reg_ctx, - 
lldb::RegisterKind reg_kind, - uint32_t reg_num, Status *error_ptr, - Value &value) { - if (reg_ctx == nullptr) { - if (error_ptr) - error_ptr->SetErrorString("No register context in frame.\n"); - } else { - uint32_t native_reg = - reg_ctx->ConvertRegisterKindToRegisterNumber(reg_kind, reg_num); - if (native_reg == LLDB_INVALID_REGNUM) { - if (error_ptr) - error_ptr->SetErrorStringWithFormat("Unable to convert register " - "kind=%u reg_num=%u to a native " - "register number.\n", - reg_kind, reg_num); - } else { - const RegisterInfo *reg_info = - reg_ctx->GetRegisterInfoAtIndex(native_reg); - RegisterValue reg_value; - if (reg_ctx->ReadRegister(reg_info, reg_value)) { - if (reg_value.GetScalarValue(value.GetScalar())) { - value.SetValueType(Value::ValueType::Scalar); - value.SetContext(Value::ContextType::RegisterInfo, - const_cast<RegisterInfo *>(reg_info)); - if (error_ptr) - error_ptr->Clear(); - return true; - } else { - // If we get this error, then we need to implement a value buffer in - // the dwarf expression evaluation function... 
- if (error_ptr) - error_ptr->SetErrorStringWithFormat( - "register %s can't be converted to a scalar value", - reg_info->name); - } - } else { - if (error_ptr) - error_ptr->SetErrorStringWithFormat("register %s is not available", - reg_info->name); - } +static llvm::Error ReadRegisterValueAsScalar(RegisterContext *reg_ctx, + lldb::RegisterKind reg_kind, + uint32_t reg_num, Value &value) { + if (reg_ctx == nullptr) + return llvm::createStringError("no register context in frame"); + + const uint32_t native_reg = + reg_ctx->ConvertRegisterKindToRegisterNumber(reg_kind, reg_num); + if (native_reg == LLDB_INVALID_REGNUM) + return llvm::createStringError( + "unable to convert register kind=%u reg_num=%u to a native " + "register number", + reg_kind, reg_num); + + const RegisterInfo *reg_info = reg_ctx->GetRegisterInfoAtIndex(native_reg); + RegisterValue reg_value; + if (reg_ctx->ReadRegister(reg_info, reg_value)) { + if (reg_value.GetScalarValue(value.GetScalar())) { + value.SetValueType(Value::ValueType::Scalar); + value.SetContext(Value::ContextType::RegisterInfo, + const_cast<RegisterInfo *>(reg_info)); + return llvm::Error::success(); } + + // If we get this error, then we need to implement a value buffer in + // the dwarf expression evaluation function... + return llvm::createStringError( + "register %s can't be converted to a scalar value", reg_info->name); } - return false; + + return llvm::createStringError("register %s is not available", + reg_info->name); } /// Return the length in bytes of the set of operands for \p op. 
No guarantees @@ -541,12 +528,12 @@ bool DWARFExpression::LinkThreadLocalStorage( return true; } -static bool Evaluate_DW_OP_entry_value(std::vector<Value> &stack, - ExecutionContext *exe_ctx, - RegisterContext *reg_ctx, - const DataExtractor &opcodes, - lldb::offset_t &opcode_offset, - Status *error_ptr, Log *log) { +static llvm::Error Evaluate_DW_OP_entry_value(std::vector<Value> &stack, + ExecutionContext *exe_ctx, + RegisterContext *reg_ctx, + const DataExtractor &opcodes, + lldb::offset_t &opcode_offset, + Log *log) { // DW_OP_entry_value(sub-expr) describes the location a variable had upon // function entry: this variable location is presumed to be optimized out at // the current PC value. The caller of the function may have call site @@ -593,16 +580,13 @@ static bool Evaluate_DW_OP_entry_value(std::vector<Value> &stack, // 1. Find the function which pushed the current frame onto the stack. if ((!exe_ctx || !exe_ctx->HasTargetScope()) || !reg_ctx) { - LLDB_LOG(log, "Evaluate_DW_OP_entry_value: no exe/reg context"); - return false; + return llvm::createStringError("no exe/reg context"); } StackFrame *current_frame = exe_ctx->GetFramePtr(); Thread *thread = exe_ctx->GetThreadPtr(); - if (!current_frame || !thread) { - LLDB_LOG(log, "Evaluate_DW_OP_entry_value: no current frame/thread"); - return false; - } + if (!current_frame || !thread) + return llvm::createStringError("no current frame/thread"); Target &target = exe_ctx->GetTargetRef(); StackFrameSP parent_frame = nullptr; @@ -620,9 +604,7 @@ static bool Evaluate_DW_OP_entry_value(std::vector<Value> &stack, // parent frame. 
if (return_pc == LLDB_INVALID_ADDRESS) { return_pc = parent_frame->GetFrameCodeAddress().GetLoadAddress(&target); - LLDB_LOG(log, - "Evaluate_DW_OP_entry_value: immediate ancestor with pc = {0:x}", - return_pc); + LLDB_LOG(log, "immediate ancestor with pc = {0:x}", return_pc); } // If we've found an inlined frame, skip it (these have no call site @@ -634,25 +616,20 @@ static bool Evaluate_DW_OP_entry_value(std::vector<Value> &stack, break; } if (!parent_frame || !parent_frame->GetRegisterContext()) { - LLDB_LOG(log, "Evaluate_DW_OP_entry_value: no parent frame with reg ctx"); - return false; + return llvm::createStringError("no parent frame with reg ctx"); } Function *parent_func = parent_frame->GetSymbolContext(eSymbolContextFunction).function; - if (!parent_func) { - LLDB_LOG(log, "Evaluate_DW_OP_entry_value: no parent function"); - return false; - } + if (!parent_func) + return llvm::createStringError("no parent function"); // 2. Find the call edge in the parent function responsible for creating the // current activation. Function *current_func = current_frame->GetSymbolContext(eSymbolContextFunction).function; - if (!current_func) { - LLDB_LOG(log, "Evaluate_DW_OP_entry_value: no current function"); - return false; - } + if (!current_func) + return llvm::createStringError("no current function"); CallEdge *call_edge = nullptr; ModuleList &modlist = target.GetImages(); @@ -663,17 +640,14 @@ static bool Evaluate_DW_OP_entry_value(std::vector<Value> &stack, // produced by an ambiguous tail call. In this case, refuse to proceed. 
call_edge = parent_func->GetCallEdgeForReturnAddress(return_pc, target); if (!call_edge) { - LLDB_LOG(log, - "Evaluate_DW_OP_entry_value: no call edge for retn-pc = {0:x} " - "in parent frame {1}", - return_pc, parent_func->GetName()); - return false; + return llvm::createStringError( + llvm::formatv("no call edge for retn-pc = {0:x} in parent frame {1}", + return_pc, parent_func->GetName())); } Function *callee_func = call_edge->GetCallee(modlist, parent_exe_ctx); if (callee_func != current_func) { - LLDB_LOG(log, "Evaluate_DW_OP_entry_value: ambiguous call sequence, " - "can't find real parent frame"); - return false; + return llvm::createStringError( + "ambiguous call sequence, can't find real parent frame"); } } else { // The StackFrameList solver machinery has deduced that an unambiguous tail @@ -686,21 +660,17 @@ static bool Evaluate_DW_OP_entry_value(std::vector<Value> &stack, } } } - if (!call_edge) { - LLDB_LOG(log, "Evaluate_DW_OP_entry_value: no unambiguous edge from parent " - "to current function"); - return false; - } + if (!call_edge) + return llvm::createStringError("no unambiguous edge from parent " + "to current function"); // 3. Attempt to locate the DW_OP_entry_value expression in the set of // available call site parameters. If found, evaluate the corresponding // parameter in the context of the parent frame. 
const uint32_t subexpr_len = opcodes.GetULEB128(&opcode_offset); const void *subexpr_data = opcodes.GetData(&opcode_offset, subexpr_len); - if (!subexpr_data) { - LLDB_LOG(log, "Evaluate_DW_OP_entry_value: subexpr could not be read"); - return false; - } + if (!subexpr_data) + return llvm::createStringError("subexpr could not be read"); const CallSiteParameter *matched_param = nullptr; for (const CallSiteParameter ¶m : call_edge->GetCallSiteParameters()) { @@ -726,28 +696,26 @@ static bool Evaluate_DW_OP_entry_value(std::vector<Value> &stack, break; } } - if (!matched_param) { - LLDB_LOG(log, - "Evaluate_DW_OP_entry_value: no matching call site param found"); - return false; - } + if (!matched_param) + return llvm::createStringError("no matching call site param found"); // TODO: Add support for DW_OP_push_object_address within a DW_OP_entry_value // subexpresion whenever llvm does. - Value result; const DWARFExpressionList ¶m_expr = matched_param->LocationInCaller; - if (!param_expr.Evaluate(&parent_exe_ctx, - parent_frame->GetRegisterContext().get(), - LLDB_INVALID_ADDRESS, - /*initial_value_ptr=*/nullptr, - /*object_address_ptr=*/nullptr, result, error_ptr)) { + + llvm::Expected<Value> maybe_result = param_expr.Evaluate( + &parent_exe_ctx, parent_frame->GetRegisterContext().get(), + LLDB_INVALID_ADDRESS, + /*initial_value_ptr=*/nullptr, + /*object_address_ptr=*/nullptr); + if (!maybe_result) { LLDB_LOG(log, "Evaluate_DW_OP_entry_value: call site param evaluation failed"); - return false; + return maybe_result.takeError(); } - stack.push_back(result); - return true; + stack.push_back(*maybe_result); + return llvm::Error::success(); } namespace { @@ -801,7 +769,6 @@ void UpdateValueTypeFromLocationDescription(Log *log, const DWARFUnit *dwarf_cu, /// /// \param exe_ctx Pointer to the execution context /// \param module_sp shared_ptr contains the module if we have one -/// \param error_ptr pointer to Status object if we have one /// \param dw_op_type C-style string 
used to vary the error output /// \param file_addr the file address we are trying to resolve and turn into a /// load address @@ -812,32 +779,22 @@ void UpdateValueTypeFromLocationDescription(Log *log, const DWARFUnit *dwarf_cu, /// the load address succeed or an empty Optinal otherwise. If /// check_sectionoffset is true we consider LLDB_INVALID_ADDRESS a /// success if so_addr.IsSectionOffset() is true. -static std::optional<lldb::addr_t> +static llvm::Expected<lldb::addr_t> ResolveLoadAddress(ExecutionContext *exe_ctx, lldb::ModuleSP &module_sp, - Status *error_ptr, const char *dw_op_type, - lldb::addr_t file_addr, Address &so_addr, - bool check_sectionoffset = false) { - if (!module_sp) { - if (error_ptr) - error_ptr->SetErrorStringWithFormat( - "need module to resolve file address for %s", dw_op_type); - return {}; - } + const char *dw_op_type, lldb::addr_t file_addr, + Address &so_addr, bool check_sectionoffset = false) { + if (!module_sp) + return llvm::createStringError("need module to resolve file address for %s", + dw_op_type); - if (!module_sp->ResolveFileAddress(file_addr, so_addr)) { - if (error_ptr) - error_ptr->SetErrorString("failed to resolve file address in module"); - return {}; - } + if (!module_sp->ResolveFileAddress(file_addr, so_addr)) + return llvm::createStringError("failed to resolve file address in module"); - addr_t load_addr = so_addr.GetLoadAddress(exe_ctx->GetTargetPtr()); + const addr_t load_addr = so_addr.GetLoadAddress(exe_ctx->GetTargetPtr()); if (load_addr == LLDB_INVALID_ADDRESS && - (check_sectionoffset && !so_addr.IsSectionOffset())) { - if (error_ptr) - error_ptr->SetErrorString("failed to resolve load address"); - return {}; - } + (check_sectionoffset && !so_addr.IsSectionOffset())) + return llvm::createStringError("failed to resolve load address"); return load_addr; } @@ -862,19 +819,15 @@ static Scalar DerefSizeExtractDataHelper(uint8_t *addr_bytes, return addr_data.GetAddress(&addr_data_offset); } -bool 
DWARFExpression::Evaluate( +llvm::Expected<Value> DWARFExpression::Evaluate( ExecutionContext *exe_ctx, RegisterContext *reg_ctx, lldb::ModuleSP module_sp, const DataExtractor &opcodes, const DWARFUnit *dwarf_cu, const lldb::RegisterKind reg_kind, - const Value *initial_value_ptr, const Value *object_address_ptr, - Value &result, Status *error_ptr) { + const Value *initial_value_ptr, const Value *object_address_ptr) { - if (opcodes.GetByteSize() == 0) { - if (error_ptr) - error_ptr->SetErrorString( - "no location, value may have been optimized out"); - return false; - } + if (opcodes.GetByteSize() == 0) + return llvm::createStringError( + "no location, value may have been optimized out"); std::vector<Value> stack; Process *process = nullptr; @@ -994,11 +947,9 @@ bool DWARFExpression::Evaluate( // retrieved from the dereferenced address is the size of an address on the // target machine. case DW_OP_deref: { - if (stack.empty()) { - if (error_ptr) - error_ptr->SetErrorString("Expression stack empty for DW_OP_deref."); - return false; - } + if (stack.empty()) + return llvm::createStringError( + "expression stack empty for DW_OP_deref"); Value::ValueType value_type = stack.back().GetValueType(); switch (value_type) { case Value::ValueType::HostAddress: { @@ -1014,10 +965,10 @@ bool DWARFExpression::Evaluate( Address so_addr; auto maybe_load_addr = ResolveLoadAddress( - exe_ctx, module_sp, error_ptr, "DW_OP_deref", file_addr, so_addr); + exe_ctx, module_sp, "DW_OP_deref", file_addr, so_addr); if (!maybe_load_addr) - return false; + return maybe_load_addr.takeError(); stack.back().GetScalar() = *maybe_load_addr; // Fall through to load address promotion code below. 
@@ -1041,30 +992,22 @@ bool DWARFExpression::Evaluate( stack.back().GetScalar() = pointer_value; stack.back().ClearContext(); } else { - if (error_ptr) - error_ptr->SetErrorStringWithFormat( - "Failed to dereference pointer from 0x%" PRIx64 - " for DW_OP_deref: %s\n", - pointer_addr, error.AsCString()); - return false; + return llvm::createStringError( + "Failed to dereference pointer from 0x%" PRIx64 + " for DW_OP_deref: %s\n", + pointer_addr, error.AsCString()); } } else { - if (error_ptr) - error_ptr->SetErrorString("NULL process for DW_OP_deref.\n"); - return false; + return llvm::createStringError("NULL process for DW_OP_deref"); } } else { - if (error_ptr) - error_ptr->SetErrorString( - "NULL execution context for DW_OP_deref.\n"); - return false; + return llvm::createStringError( + "NULL execution context for DW_OP_deref"); } break; case Value::ValueType::Invalid: - if (error_ptr) - error_ptr->SetErrorString("Invalid value type for DW_OP_deref.\n"); - return false; + return llvm::createStringError("invalid value type for DW_OP_deref"); } } break; @@ -1083,18 +1026,13 @@ bool DWARFExpression::Evaluate( // expression stack. 
case DW_OP_deref_size: { if (stack.empty()) { - if (error_ptr) - error_ptr->SetErrorString( - "Expression stack empty for DW_OP_deref_size."); - return false; + return llvm::createStringError( + "expression stack empty for DW_OP_deref_size"); } uint8_t size = opcodes.GetU8(&offset); if (size > 8) { - if (error_ptr) - error_ptr->SetErrorStringWithFormat( - "Invalid address size for DW_OP_deref_size: %d\n", - size); - return false; + return llvm::createStringError( + "Invalid address size for DW_OP_deref_size: %d\n", size); } Value::ValueType value_type = stack.back().GetValueType(); switch (value_type) { @@ -1142,13 +1080,12 @@ bool DWARFExpression::Evaluate( auto file_addr = stack.back().GetScalar().ULongLong(LLDB_INVALID_ADDRESS); Address so_addr; - auto maybe_load_addr = - ResolveLoadAddress(exe_ctx, module_sp, error_ptr, - "DW_OP_deref_size", file_addr, so_addr, - /*check_sectionoffset=*/true); + auto maybe_load_addr = ResolveLoadAddress( + exe_ctx, module_sp, "DW_OP_deref_size", file_addr, so_addr, + /*check_sectionoffset=*/true); if (!maybe_load_addr) - return false; + return maybe_load_addr.takeError(); addr_t load_addr = *maybe_load_addr; @@ -1166,12 +1103,10 @@ bool DWARFExpression::Evaluate( stack.back().ClearContext(); break; } else { - if (error_ptr) - error_ptr->SetErrorStringWithFormat( - "Failed to dereference pointer for DW_OP_deref_size: " - "%s\n", - error.AsCString()); - return false; + return llvm::createStringError( + "Failed to dereference pointer for DW_OP_deref_size: " + "%s\n", + error.AsCString()); } } stack.back().GetScalar() = load_addr; @@ -1195,30 +1130,24 @@ bool DWARFExpression::Evaluate( process->GetByteOrder(), size); stack.back().ClearContext(); } else { - if (error_ptr) - error_ptr->SetErrorStringWithFormat( - "Failed to dereference pointer from 0x%" PRIx64 - " for DW_OP_deref: %s\n", - pointer_addr, error.AsCString()); - return false; + return llvm::createStringError( + "Failed to dereference pointer from 0x%" PRIx64 + " for 
DW_OP_deref: %s\n", + pointer_addr, error.AsCString()); } } else { - if (error_ptr) - error_ptr->SetErrorString("NULL process for DW_OP_deref_size.\n"); - return false; + + return llvm::createStringError("NULL process for DW_OP_deref_size"); } } else { - if (error_ptr) - error_ptr->SetErrorString( - "NULL execution context for DW_OP_deref_size.\n"); - return false; + return llvm::createStringError( + "NULL execution context for DW_OP_deref_size"); } break; case Value::ValueType::Invalid: - if (error_ptr) - error_ptr->SetErrorString("Invalid value for DW_OP_deref_size.\n"); - return false; + + return llvm::createStringError("invalid value for DW_OP_deref_size"); } } break; @@ -1239,9 +1168,7 @@ bool DWARFExpression::Evaluate( // extended to the size of an address on the target machine before being // pushed on the expression stack. case DW_OP_xderef_size: - if (error_ptr) - error_ptr->SetErrorString("Unimplemented opcode: DW_OP_xderef_size."); - return false; + return llvm::createStringError("unimplemented opcode: DW_OP_xderef_size"); // OPCODE: DW_OP_xderef // OPERANDS: none // DESCRIPTION: Provides an extended dereference mechanism. The entry at @@ -1253,9 +1180,7 @@ bool DWARFExpression::Evaluate( // retrieved from the dereferenced address is the size of an address on the // target machine. 
case DW_OP_xderef: - if (error_ptr) - error_ptr->SetErrorString("Unimplemented opcode: DW_OP_xderef."); - return false; + return llvm::createStringError("unimplemented opcode: DW_OP_xderef"); // All DW_OP_constXXX opcodes have a single operand as noted below: // @@ -1308,9 +1233,7 @@ bool DWARFExpression::Evaluate( // DESCRIPTION: duplicates the value at the top of the stack case DW_OP_dup: if (stack.empty()) { - if (error_ptr) - error_ptr->SetErrorString("Expression stack empty for DW_OP_dup."); - return false; + return llvm::createStringError("expression stack empty for DW_OP_dup"); } else stack.push_back(stack.back()); break; @@ -1320,9 +1243,7 @@ bool DWARFExpression::Evaluate( // DESCRIPTION: pops the value at the top of the stack case DW_OP_drop: if (stack.empty()) { - if (error_ptr) - error_ptr->SetErrorString("Expression stack empty for DW_OP_drop."); - return false; + return llvm::createStringError("expression stack empty for DW_OP_drop"); } else stack.pop_back(); break; @@ -1333,10 +1254,8 @@ bool DWARFExpression::Evaluate( // the top of the stack. 
case DW_OP_over: if (stack.size() < 2) { - if (error_ptr) - error_ptr->SetErrorString( - "Expression stack needs at least 2 items for DW_OP_over."); - return false; + return llvm::createStringError( + "expression stack needs at least 2 items for DW_OP_over"); } else stack.push_back(stack[stack.size() - 2]); break; @@ -1350,10 +1269,8 @@ bool DWARFExpression::Evaluate( if (pick_idx < stack.size()) stack.push_back(stack[stack.size() - 1 - pick_idx]); else { - if (error_ptr) - error_ptr->SetErrorStringWithFormat( - "Index %u out of range for DW_OP_pick.\n", pick_idx); - return false; + return llvm::createStringError( + "Index %u out of range for DW_OP_pick.\n", pick_idx); } } break; @@ -1364,10 +1281,8 @@ bool DWARFExpression::Evaluate( // becomes the top of the stack case DW_OP_swap: if (stack.size() < 2) { - if (error_ptr) - error_ptr->SetErrorString( - "Expression stack needs at least 2 items for DW_OP_swap."); - return false; + return llvm::createStringError( + "expression stack needs at least 2 items for DW_OP_swap"); } else { tmp = stack.back(); stack.back() = stack[stack.size() - 2]; @@ -1383,10 +1298,8 @@ bool DWARFExpression::Evaluate( // entry. case DW_OP_rot: if (stack.size() < 3) { - if (error_ptr) - error_ptr->SetErrorString( - "Expression stack needs at least 3 items for DW_OP_rot."); - return false; + return llvm::createStringError( + "expression stack needs at least 3 items for DW_OP_rot"); } else { size_t last_idx = stack.size() - 1; Value old_top = stack[last_idx]; @@ -1403,15 +1316,11 @@ bool DWARFExpression::Evaluate( // represented, the result is undefined. 
case DW_OP_abs: if (stack.empty()) { - if (error_ptr) - error_ptr->SetErrorString( - "Expression stack needs at least 1 item for DW_OP_abs."); - return false; + return llvm::createStringError( + "expression stack needs at least 1 item for DW_OP_abs"); } else if (!stack.back().ResolveValue(exe_ctx).AbsoluteValue()) { - if (error_ptr) - error_ptr->SetErrorString( - "Failed to take the absolute value of the first stack item."); - return false; + return llvm::createStringError( + "failed to take the absolute value of the first stack item"); } break; @@ -1421,10 +1330,8 @@ bool DWARFExpression::Evaluate( // operation on the two, and pushes the result. case DW_OP_and: if (stack.size() < 2) { - if (error_ptr) - error_ptr->SetErrorString( - "Expression stack needs at least 2 items for DW_OP_and."); - return false; + return llvm::createStringError( + "expression stack needs at least 2 items for DW_OP_and"); } else { tmp = stack.back(); stack.pop_back(); @@ -1440,30 +1347,23 @@ bool DWARFExpression::Evaluate( // the result. 
case DW_OP_div: if (stack.size() < 2) { - if (error_ptr) - error_ptr->SetErrorString( - "Expression stack needs at least 2 items for DW_OP_div."); - return false; + return llvm::createStringError( + "expression stack needs at least 2 items for DW_OP_div"); } else { tmp = stack.back(); - if (tmp.ResolveValue(exe_ctx).IsZero()) { - if (error_ptr) - error_ptr->SetErrorString("Divide by zero."); - return false; - } else { - stack.pop_back(); - Scalar divisor, dividend; - divisor = tmp.ResolveValue(exe_ctx); - dividend = stack.back().ResolveValue(exe_ctx); - divisor.MakeSigned(); - dividend.MakeSigned(); - stack.back() = dividend / divisor; - if (!stack.back().ResolveValue(exe_ctx).IsValid()) { - if (error_ptr) - error_ptr->SetErrorString("Divide failed."); - return false; - } - } + if (tmp.ResolveValue(exe_ctx).IsZero()) + return llvm::createStringError("divide by zero"); + + stack.pop_back(); + Scalar divisor, dividend; + divisor = tmp.ResolveValue(exe_ctx); + dividend = stack.back().ResolveValue(exe_ctx); + divisor.MakeSigned(); + dividend.MakeSigned(); + stack.back() = dividend / divisor; + + if (!stack.back().ResolveValue(exe_ctx).IsValid()) + return llvm::createStringError("divide failed"); } break; @@ -1473,10 +1373,8 @@ bool DWARFExpression::Evaluate( // of the stack from the former second entry, and pushes the result. case DW_OP_minus: if (stack.size() < 2) { - if (error_ptr) - error_ptr->SetErrorString( - "Expression stack needs at least 2 items for DW_OP_minus."); - return false; + return llvm::createStringError( + "expression stack needs at least 2 items for DW_OP_minus"); } else { tmp = stack.back(); stack.pop_back(); @@ -1492,10 +1390,8 @@ bool DWARFExpression::Evaluate( // stack. 
case DW_OP_mod: if (stack.size() < 2) { - if (error_ptr) - error_ptr->SetErrorString( - "Expression stack needs at least 2 items for DW_OP_mod."); - return false; + return llvm::createStringError( + "expression stack needs at least 2 items for DW_OP_mod"); } else { tmp = stack.back(); stack.pop_back(); @@ -1510,10 +1406,8 @@ bool DWARFExpression::Evaluate( // together, and pushes the result. case DW_OP_mul: if (stack.size() < 2) { - if (error_ptr) - error_ptr->SetErrorString( - "Expression stack needs at least 2 items for DW_OP_mul."); - return false; + return llvm::createStringError( + "expression stack needs at least 2 items for DW_OP_mul"); } else { tmp = stack.back(); stack.pop_back(); @@ -1527,16 +1421,11 @@ bool DWARFExpression::Evaluate( // DESCRIPTION: pops the top stack entry, and pushes its negation. case DW_OP_neg: if (stack.empty()) { - if (error_ptr) - error_ptr->SetErrorString( - "Expression stack needs at least 1 item for DW_OP_neg."); - return false; + return llvm::createStringError( + "expression stack needs at least 1 item for DW_OP_neg"); } else { - if (!stack.back().ResolveValue(exe_ctx).UnaryNegate()) { - if (error_ptr) - error_ptr->SetErrorString("Unary negate failed."); - return false; - } + if (!stack.back().ResolveValue(exe_ctx).UnaryNegate()) + return llvm::createStringError("unary negate failed"); } break; @@ -1546,15 +1435,11 @@ bool DWARFExpression::Evaluate( // complement case DW_OP_not: if (stack.empty()) { - if (error_ptr) - error_ptr->SetErrorString( - "Expression stack needs at least 1 item for DW_OP_not."); - return false; + return llvm::createStringError( + "expression stack needs at least 1 item for DW_OP_not"); } else { if (!stack.back().ResolveValue(exe_ctx).OnesComplement()) { - if (error_ptr) - error_ptr->SetErrorString("Logical NOT failed."); - return false; + return llvm::createStringError("logical NOT failed"); } } break; @@ -1565,10 +1450,8 @@ bool DWARFExpression::Evaluate( // operation on the two, and pushes the 
result. case DW_OP_or: if (stack.size() < 2) { - if (error_ptr) - error_ptr->SetErrorString( - "Expression stack needs at least 2 items for DW_OP_or."); - return false; + return llvm::createStringError( + "expression stack needs at least 2 items for DW_OP_or"); } else { tmp = stack.back(); stack.pop_back(); @@ -1583,10 +1466,8 @@ bool DWARFExpression::Evaluate( // pushes the result. case DW_OP_plus: if (stack.size() < 2) { - if (error_ptr) - error_ptr->SetErrorString( - "Expression stack needs at least 2 items for DW_OP_plus."); - return false; + return llvm::createStringError( + "expression stack needs at least 2 items for DW_OP_plus"); } else { tmp = stack.back(); stack.pop_back(); @@ -1600,19 +1481,14 @@ bool DWARFExpression::Evaluate( // constant operand and pushes the result. case DW_OP_plus_uconst: if (stack.empty()) { - if (error_ptr) - error_ptr->SetErrorString( - "Expression stack needs at least 1 item for DW_OP_plus_uconst."); - return false; + return llvm::createStringError( + "expression stack needs at least 1 item for DW_OP_plus_uconst"); } else { const uint64_t uconst_value = opcodes.GetULEB128(&offset); // Implicit conversion from a UINT to a Scalar... stack.back().GetScalar() += uconst_value; - if (!stack.back().GetScalar().IsValid()) { - if (error_ptr) - error_ptr->SetErrorString("DW_OP_plus_uconst failed."); - return false; - } + if (!stack.back().GetScalar().IsValid()) + return llvm::createStringError("DW_OP_plus_uconst failed"); } break; @@ -1623,10 +1499,8 @@ bool DWARFExpression::Evaluate( // the stack, and pushes the result. 
case DW_OP_shl: if (stack.size() < 2) { - if (error_ptr) - error_ptr->SetErrorString( - "Expression stack needs at least 2 items for DW_OP_shl."); - return false; + return llvm::createStringError( + "expression stack needs at least 2 items for DW_OP_shl"); } else { tmp = stack.back(); stack.pop_back(); @@ -1641,18 +1515,14 @@ bool DWARFExpression::Evaluate( // specified by the former top of the stack, and pushes the result. case DW_OP_shr: if (stack.size() < 2) { - if (error_ptr) - error_ptr->SetErrorString( - "Expression stack needs at least 2 items for DW_OP_shr."); - return false; + return llvm::createStringError( + "expression stack needs at least 2 items for DW_OP_shr"); } else { tmp = stack.back(); stack.pop_back(); if (!stack.back().ResolveValue(exe_ctx).ShiftRightLogical( tmp.ResolveValue(exe_ctx))) { - if (error_ptr) - error_ptr->SetErrorString("DW_OP_shr failed."); - return false; + return llvm::createStringError("DW_OP_shr failed"); } } break; @@ -1665,10 +1535,8 @@ bool DWARFExpression::Evaluate( // of the stack, and pushes the result. case DW_OP_shra: if (stack.size() < 2) { - if (error_ptr) - error_ptr->SetErrorString( - "Expression stack needs at least 2 items for DW_OP_shra."); - return false; + return llvm::createStringError( + "expression stack needs at least 2 items for DW_OP_shra"); } else { tmp = stack.back(); stack.pop_back(); @@ -1682,10 +1550,8 @@ bool DWARFExpression::Evaluate( // exclusive-or operation on the two, and pushes the result. 
case DW_OP_xor: if (stack.size() < 2) { - if (error_ptr) - error_ptr->SetErrorString( - "Expression stack needs at least 2 items for DW_OP_xor."); - return false; + return llvm::createStringError( + "expression stack needs at least 2 items for DW_OP_xor"); } else { tmp = stack.back(); stack.pop_back(); @@ -1709,11 +1575,9 @@ bool DWARFExpression::Evaluate( if (new_offset <= opcodes.GetByteSize()) offset = new_offset; else { - if (error_ptr) - error_ptr->SetErrorStringWithFormatv( - "Invalid opcode offset in DW_OP_skip: {0}+({1}) > {2}", offset, - skip_offset, opcodes.GetByteSize()); - return false; + return llvm::createStringError(llvm::formatv( + "Invalid opcode offset in DW_OP_skip: {0}+({1}) > {2}", offset, + skip_offset, opcodes.GetByteSize())); } } break; @@ -1726,10 +1590,8 @@ bool DWARFExpression::Evaluate( // the current operation, beginning after the 2-byte constant. case DW_OP_bra: if (stack.empty()) { - if (error_ptr) - error_ptr->SetErrorString( - "Expression stack needs at least 1 item for DW_OP_bra."); - return false; + return llvm::createStringError( + "expression stack needs at least 1 item for DW_OP_bra"); } else { tmp = stack.back(); stack.pop_back(); @@ -1743,11 +1605,9 @@ bool DWARFExpression::Evaluate( if (new_offset <= opcodes.GetByteSize()) offset = new_offset; else { - if (error_ptr) - error_ptr->SetErrorStringWithFormatv( - "Invalid opcode offset in DW_OP_bra: {0}+({1}) > {2}", offset, - bra_offset, opcodes.GetByteSize()); - return false; + return llvm::createStringError(llvm::formatv( + "Invalid opcode offset in DW_OP_bra: {0}+({1}) > {2}", offset, + bra_offset, opcodes.GetByteSize())); } } } @@ -1762,10 +1622,8 @@ bool DWARFExpression::Evaluate( // operation is false. 
case DW_OP_eq: if (stack.size() < 2) { - if (error_ptr) - error_ptr->SetErrorString( - "Expression stack needs at least 2 items for DW_OP_eq."); - return false; + return llvm::createStringError( + "expression stack needs at least 2 items for DW_OP_eq"); } else { tmp = stack.back(); stack.pop_back(); @@ -1783,10 +1641,8 @@ bool DWARFExpression::Evaluate( // operation is false. case DW_OP_ge: if (stack.size() < 2) { - if (error_ptr) - error_ptr->SetErrorString( - "Expression stack needs at least 2 items for DW_OP_ge."); - return false; + return llvm::createStringError( + "expression stack needs at least 2 items for DW_OP_ge"); } else { tmp = stack.back(); stack.pop_back(); @@ -1804,10 +1660,8 @@ bool DWARFExpression::Evaluate( // operation is false. case DW_OP_gt: if (stack.size() < 2) { - if (error_ptr) - error_ptr->SetErrorString( - "Expression stack needs at least 2 items for DW_OP_gt."); - return false; + return llvm::createStringError( + "expression stack needs at least 2 items for DW_OP_gt"); } else { tmp = stack.back(); stack.pop_back(); @@ -1825,10 +1679,8 @@ bool DWARFExpression::Evaluate( // operation is false. case DW_OP_le: if (stack.size() < 2) { - if (error_ptr) - error_ptr->SetErrorString( - "Expression stack needs at least 2 items for DW_OP_le."); - return false; + return llvm::createStringError( + "expression stack needs at least 2 items for DW_OP_le"); } else { tmp = stack.back(); stack.pop_back(); @@ -1846,10 +1698,8 @@ bool DWARFExpression::Evaluate( // operation is false. case DW_OP_lt: if (stack.size() < 2) { - if (error_ptr) - error_ptr->SetErrorString( - "Expression stack needs at least 2 items for DW_OP_lt."); - return false; + return llvm::createStringError( + "expression stack needs at least 2 items for DW_OP_lt"); } else { tmp = stack.back(); stack.pop_back(); @@ -1867,10 +1717,8 @@ bool DWARFExpression::Evaluate( // operation is false. 
case DW_OP_ne: if (stack.size() < 2) { - if (error_ptr) - error_ptr->SetErrorString( - "Expression stack needs at least 2 items for DW_OP_ne."); - return false; + return llvm::createStringError( + "expression stack needs at least 2 items for DW_OP_ne"); } else { tmp = stack.back(); stack.pop_back(); @@ -1957,10 +1805,10 @@ bool DWARFExpression::Evaluate( dwarf4_location_description_kind = Register; reg_num = op - DW_OP_reg0; - if (ReadRegisterValueAsScalar(reg_ctx, reg_kind, reg_num, error_ptr, tmp)) - stack.push_back(tmp); - else - return false; + if (llvm::Error err = + ReadRegisterValueAsScalar(reg_ctx, reg_kind, reg_num, tmp)) + return err; + stack.push_back(tmp); } break; // OPCODE: DW_OP_regx // OPERANDS: @@ -1969,10 +1817,11 @@ bool DWARFExpression::Evaluate( case DW_OP_regx: { dwarf4_location_description_kind = Register; reg_num = opcodes.GetULEB128(&offset); - if (ReadRegisterValueAsScalar(reg_ctx, reg_kind, reg_num, error_ptr, tmp)) - stack.push_back(tmp); - else - return false; + Status read_err; + if (llvm::Error err = + ReadRegisterValueAsScalar(reg_ctx, reg_kind, reg_num, tmp)) + return err; + stack.push_back(tmp); } break; // OPCODE: DW_OP_bregN @@ -2013,16 +1862,15 @@ bool DWARFExpression::Evaluate( case DW_OP_breg30: case DW_OP_breg31: { reg_num = op - DW_OP_breg0; - - if (ReadRegisterValueAsScalar(reg_ctx, reg_kind, reg_num, error_ptr, - tmp)) { - int64_t breg_offset = opcodes.GetSLEB128(&offset); - tmp.ResolveValue(exe_ctx) += (uint64_t)breg_offset; - tmp.ClearContext(); - stack.push_back(tmp); - stack.back().SetValueType(Value::ValueType::LoadAddress); - } else - return false; + if (llvm::Error err = + ReadRegisterValueAsScalar(reg_ctx, reg_kind, reg_num, tmp)) + return err; + + int64_t breg_offset = opcodes.GetSLEB128(&offset); + tmp.ResolveValue(exe_ctx) += (uint64_t)breg_offset; + tmp.ClearContext(); + stack.push_back(tmp); + stack.back().SetValueType(Value::ValueType::LoadAddress); } break; // OPCODE: DW_OP_bregx // OPERANDS: 2 @@ -2032,40 
+1880,36 @@ bool DWARFExpression::Evaluate( // N plus an offset. case DW_OP_bregx: { reg_num = opcodes.GetULEB128(&offset); - - if (ReadRegisterValueAsScalar(reg_ctx, reg_kind, reg_num, error_ptr, - tmp)) { - int64_t breg_offset = opcodes.GetSLEB128(&offset); - tmp.ResolveValue(exe_ctx) += (uint64_t)breg_offset; - tmp.ClearContext(); - stack.push_back(tmp); - stack.back().SetValueType(Value::ValueType::LoadAddress); - } else - return false; + if (llvm::Error err = + ReadRegisterValueAsScalar(reg_ctx, reg_kind, reg_num, tmp)) + return err; + + int64_t breg_offset = opcodes.GetSLEB128(&offset); + tmp.ResolveValue(exe_ctx) += (uint64_t)breg_offset; + tmp.ClearContext(); + stack.push_back(tmp); + stack.back().SetValueType(Value::ValueType::LoadAddress); } break; case DW_OP_fbreg: if (exe_ctx) { if (frame) { Scalar value; - if (frame->GetFrameBaseValue(value, error_ptr)) { + Status fb_err; + if (frame->GetFrameBaseValue(value, &fb_err)) { int64_t fbreg_offset = opcodes.GetSLEB128(&offset); value += fbreg_offset; stack.push_back(value); stack.back().SetValueType(Value::ValueType::LoadAddress); } else - return false; + return fb_err.ToError(); } else { - if (error_ptr) - error_ptr->SetErrorString( - "Invalid stack frame in context for DW_OP_fbreg opcode."); - return false; + return llvm::createStringError( + "invalid stack frame in context for DW_OP_fbreg opcode"); } } else { - if (error_ptr) - error_ptr->SetErrorString( - "NULL execution context for DW_OP_fbreg.\n"); - return false; + return llvm::createStringError( + "NULL execution context for DW_OP_fbreg"); } break; @@ -2127,7 +1971,7 @@ bool DWARFExpression::Evaluate( const lldb::addr_t addr = scalar.ULongLong(LLDB_INVALID_ADDRESS); switch (curr_piece_source_value_type) { case Value::ValueType::Invalid: - return false; + return llvm::createStringError("invalid value type"); case Value::ValueType::LoadAddress: case Value::ValueType::FileAddress: { if (target) { @@ -2136,35 +1980,28 @@ bool DWARFExpression::Evaluate( 
piece_byte_size, error, /*force_live_memory=*/false) != piece_byte_size) { - if (error_ptr) { - const char *addr_type = (curr_piece_source_value_type == - Value::ValueType::LoadAddress) - ? "load" - : "file"; - error_ptr->SetErrorStringWithFormat( - "failed to read memory DW_OP_piece(%" PRIu64 - ") from %s address 0x%" PRIx64, - piece_byte_size, addr_type, addr); - } - return false; + const char *addr_type = (curr_piece_source_value_type == + Value::ValueType::LoadAddress) + ? "load" + : "file"; + return llvm::createStringError( + "failed to read memory DW_OP_piece(%" PRIu64 + ") from %s address 0x%" PRIx64, + piece_byte_size, addr_type, addr); } } else { - if (error_ptr) - error_ptr->SetErrorStringWithFormat( - "failed to resize the piece memory buffer for " - "DW_OP_piece(%" PRIu64 ")", - piece_byte_size); - return false; + return llvm::createStringError( + "failed to resize the piece memory buffer for " + "DW_OP_piece(%" PRIu64 ")", + piece_byte_size); } } } break; case Value::ValueType::HostAddress: { - if (error_ptr) - error_ptr->SetErrorStringWithFormat( - "failed to read memory DW_OP_piece(%" PRIu64 - ") from host address 0x%" PRIx64, - piece_byte_size, addr); - return false; + return llvm::createStringError( + "failed to read memory DW_OP_piece(%" PRIu64 + ") from host address 0x%" PRIx64, + piece_byte_size, addr); } break; case Value::ValueType::Scalar: { @@ -2172,14 +2009,11 @@ bool DWARFExpression::Evaluate( uint32_t bit_offset = 0; if (!scalar.ExtractBitfield( bit_size, bit_offset)) { - if (error_ptr) - error_ptr->SetErrorStringWithFormat( - "unable to extract %" PRIu64 " bytes from a %" PRIu64 - " byte scalar value.", - piece_byte_size, - (uint64_t)curr_piece_source_value.GetScalar() - .GetByteSize()); - return false; + return llvm::createStringError( + "unable to extract %" PRIu64 " bytes from a %" PRIu64 + " byte scalar value.", + piece_byte_size, + (uint64_t)curr_piece_source_value.GetScalar().GetByteSize()); } // Create curr_piece with bit_size. 
By default Scalar // grows to the nearest host integer type. @@ -2198,27 +2032,20 @@ bool DWARFExpression::Evaluate( // so subsequent pieces will be able to access this piece and add // to it. if (pieces.AppendDataToHostBuffer(curr_piece) == 0) { - if (error_ptr) - error_ptr->SetErrorString("failed to append piece data"); - return false; + return llvm::createStringError("failed to append piece data"); } } else { // If this is the second or later piece there should be a value on // the stack. if (pieces.GetBuffer().GetByteSize() != op_piece_offset) { - if (error_ptr) - error_ptr->SetErrorStringWithFormat( - "DW_OP_piece for offset %" PRIu64 - " but top of stack is of size %" PRIu64, - op_piece_offset, pieces.GetBuffer().GetByteSize()); - return false; + return llvm::createStringError( + "DW_OP_piece for offset %" PRIu64 + " but top of stack is of size %" PRIu64, + op_piece_offset, pieces.GetBuffer().GetByteSize()); } - if (pieces.AppendDataToHostBuffer(curr_piece) == 0) { - if (error_ptr) - error_ptr->SetErrorString("failed to append piece data"); - return false; - } + if (pieces.AppendDataToHostBuffer(curr_piece) == 0) + return llvm::createStringError("failed to append piece data"); } } op_piece_offset += piece_byte_size; @@ -2231,10 +2058,8 @@ bool DWARFExpression::Evaluate( LocationDescriptionKind::Empty); // Reset for the next piece. 
dwarf4_location_description_kind = Memory; - if (error_ptr) - error_ptr->SetErrorString( - "Expression stack needs at least 1 item for DW_OP_bit_piece."); - return false; + return llvm::createStringError( + "expression stack needs at least 1 item for DW_OP_bit_piece"); } else { UpdateValueTypeFromLocationDescription( log, dwarf_cu, dwarf4_location_description_kind, &stack.back()); @@ -2244,30 +2069,26 @@ bool DWARFExpression::Evaluate( const uint64_t piece_bit_offset = opcodes.GetULEB128(&offset); switch (stack.back().GetValueType()) { case Value::ValueType::Invalid: - return false; + return llvm::createStringError( + "unable to extract bit value from invalid value"); case Value::ValueType::Scalar: { if (!stack.back().GetScalar().ExtractBitfield(piece_bit_size, piece_bit_offset)) { - if (error_ptr) - error_ptr->SetErrorStringWithFormat( - "unable to extract %" PRIu64 " bit value with %" PRIu64 - " bit offset from a %" PRIu64 " bit scalar value.", - piece_bit_size, piece_bit_offset, - (uint64_t)(stack.back().GetScalar().GetByteSize() * 8)); - return false; + return llvm::createStringError( + "unable to extract %" PRIu64 " bit value with %" PRIu64 + " bit offset from a %" PRIu64 " bit scalar value.", + piece_bit_size, piece_bit_offset, + (uint64_t)(stack.back().GetScalar().GetByteSize() * 8)); } } break; case Value::ValueType::FileAddress: case Value::ValueType::LoadAddress: case Value::ValueType::HostAddress: - if (error_ptr) { - error_ptr->SetErrorStringWithFormat( - "unable to extract DW_OP_bit_piece(bit_size = %" PRIu64 - ", bit_offset = %" PRIu64 ") from an address value.", - piece_bit_size, piece_bit_offset); - } - return false; + return llvm::createStringError( + "unable to extract DW_OP_bit_piece(bit_size = %" PRIu64 + ", bit_offset = %" PRIu64 ") from an address value.", + piece_bit_size, piece_bit_offset); } } break; @@ -2287,9 +2108,8 @@ bool DWARFExpression::Evaluate( if (!data) { LLDB_LOG(log, "Evaluate_DW_OP_implicit_value: could not be read data"); - 
LLDB_ERRORF(error_ptr, "Could not evaluate %s.", - DW_OP_value_to_name(op)); - return false; + return llvm::createStringError("could not evaluate %s", + DW_OP_value_to_name(op)); } Value result(data, len); @@ -2299,8 +2119,8 @@ bool DWARFExpression::Evaluate( case DW_OP_implicit_pointer: { dwarf4_location_description_kind = Implicit; - LLDB_ERRORF(error_ptr, "Could not evaluate %s.", DW_OP_value_to_name(op)); - return false; + return llvm::createStringError("Could not evaluate %s.", + DW_OP_value_to_name(op)); } // OPCODE: DW_OP_push_object_address @@ -2315,10 +2135,8 @@ bool DWARFExpression::Evaluate( if (object_address_ptr) stack.push_back(*object_address_ptr); else { - if (error_ptr) - error_ptr->SetErrorString("DW_OP_push_object_address used without " - "specifying an object address"); - return false; + return llvm::createStringError("DW_OP_push_object_address used without " + "specifying an object address"); } break; @@ -2341,9 +2159,7 @@ bool DWARFExpression::Evaluate( // the stack by the called expression may be used as return values by prior // agreement between the calling and called expressions. case DW_OP_call2: - if (error_ptr) - error_ptr->SetErrorString("Unimplemented opcode DW_OP_call2."); - return false; + return llvm::createStringError("unimplemented opcode DW_OP_call2"); // OPCODE: DW_OP_call4 // OPERANDS: 1 // uint32_t compile unit relative offset of a DIE @@ -2364,9 +2180,7 @@ bool DWARFExpression::Evaluate( // the stack by the called expression may be used as return values by prior // agreement between the calling and called expressions. case DW_OP_call4: - if (error_ptr) - error_ptr->SetErrorString("Unimplemented opcode DW_OP_call4."); - return false; + return llvm::createStringError("unimplemented opcode DW_OP_call4"); // OPCODE: DW_OP_stack_value // OPERANDS: None @@ -2375,12 +2189,9 @@ bool DWARFExpression::Evaluate( // value to be used. This is the actual object value and not the location. 
case DW_OP_stack_value: dwarf4_location_description_kind = Implicit; - if (stack.empty()) { - if (error_ptr) - error_ptr->SetErrorString( - "Expression stack needs at least 1 item for DW_OP_stack_value."); - return false; - } + if (stack.empty()) + return llvm::createStringError( + "expression stack needs at least 1 item for DW_OP_stack_value"); stack.back().SetValueType(Value::ValueType::Scalar); break; @@ -2393,10 +2204,8 @@ bool DWARFExpression::Evaluate( // different type, and push the result. case DW_OP_convert: { if (stack.size() < 1) { - if (error_ptr) - error_ptr->SetErrorString( - "Expression stack needs at least 1 item for DW_OP_convert."); - return false; + return llvm::createStringError( + "expression stack needs at least 1 item for DW_OP_convert"); } const uint64_t die_offset = opcodes.GetULEB128(&offset); uint64_t bit_size; @@ -2405,39 +2214,29 @@ bool DWARFExpression::Evaluate( // The generic type has the size of an address on the target // machine and an unspecified signedness. Scalar has no // "unspecified signedness", so we use unsigned types. - if (!module_sp) { - if (error_ptr) - error_ptr->SetErrorString("No module"); - return false; - } + if (!module_sp) + return llvm::createStringError("no module"); sign = false; bit_size = module_sp->GetArchitecture().GetAddressByteSize() * 8; - if (!bit_size) { - if (error_ptr) - error_ptr->SetErrorString("unspecified architecture"); - return false; - } + if (!bit_size) + return llvm::createStringError("unspecified architecture"); } else { // Retrieve the type DIE that the value is being converted to. This // offset is compile unit relative so we need to fix it up. const uint64_t abs_die_offset = die_offset + dwarf_cu->GetOffset(); // FIXME: the constness has annoying ripple effects. 
DWARFDIE die = const_cast<DWARFUnit *>(dwarf_cu)->GetDIE(abs_die_offset); - if (!die) { - if (error_ptr) - error_ptr->SetErrorString("Cannot resolve DW_OP_convert type DIE"); - return false; - } + if (!die) + return llvm::createStringError( + "cannot resolve DW_OP_convert type DIE"); uint64_t encoding = die.GetAttributeValueAsUnsigned(DW_AT_encoding, DW_ATE_hi_user); bit_size = die.GetAttributeValueAsUnsigned(DW_AT_byte_size, 0) * 8; if (!bit_size) bit_size = die.GetAttributeValueAsUnsigned(DW_AT_bit_size, 0); - if (!bit_size) { - if (error_ptr) - error_ptr->SetErrorString("Unsupported type size in DW_OP_convert"); - return false; - } + if (!bit_size) + return llvm::createStringError( + "unsupported type size in DW_OP_convert"); switch (encoding) { case DW_ATE_signed: case DW_ATE_signed_char: @@ -2448,9 +2247,8 @@ bool DWARFExpression::Evaluate( sign = false; break; default: - if (error_ptr) - error_ptr->SetErrorString("Unsupported encoding in DW_OP_convert"); - return false; + return llvm::createStringError( + "unsupported encoding in DW_OP_convert"); } } Scalar &top = stack.back().ResolveValue(exe_ctx); @@ -2472,15 +2270,15 @@ bool DWARFExpression::Evaluate( if (cfa != LLDB_INVALID_ADDRESS) { stack.push_back(Scalar(cfa)); stack.back().SetValueType(Value::ValueType::LoadAddress); - } else if (error_ptr) - error_ptr->SetErrorString("Stack frame does not include a canonical " - "frame address for DW_OP_call_frame_cfa " - "opcode."); + } else { + return llvm::createStringError( + "stack frame does not include a canonical " + "frame address for DW_OP_call_frame_cfa " + "opcode"); + } } else { - if (error_ptr) - error_ptr->SetErrorString("Invalid stack frame in context for " - "DW_OP_call_frame_cfa opcode."); - return false; + return llvm::createStringError("unvalid stack frame in context for " + "DW_OP_call_frame_cfa opcode"); } break; @@ -2493,29 +2291,20 @@ bool DWARFExpression::Evaluate( case DW_OP_form_tls_address: case DW_OP_GNU_push_tls_address: { if 
(stack.size() < 1) { - if (error_ptr) { - if (op == DW_OP_form_tls_address) - error_ptr->SetErrorString( - "DW_OP_form_tls_address needs an argument."); - else - error_ptr->SetErrorString( - "DW_OP_GNU_push_tls_address needs an argument."); - } - return false; + if (op == DW_OP_form_tls_address) + return llvm::createStringError( + "DW_OP_form_tls_address needs an argument"); + else + return llvm::createStringError( + "DW_OP_GNU_push_tls_address needs an argument"); } - if (!exe_ctx || !module_sp) { - if (error_ptr) - error_ptr->SetErrorString("No context to evaluate TLS within."); - return false; - } + if (!exe_ctx || !module_sp) + return llvm::createStringError("no context to evaluate TLS within"); Thread *thread = exe_ctx->GetThreadPtr(); - if (!thread) { - if (error_ptr) - error_ptr->SetErrorString("No thread to evaluate TLS within."); - return false; - } + if (!thread) + return llvm::createStringError("no thread to evaluate TLS within"); // Lookup the TLS block address for this thread and module. const addr_t tls_file_addr = @@ -2523,12 +2312,9 @@ bool DWARFExpression::Evaluate( const addr_t tls_load_addr = thread->GetThreadLocalData(module_sp, tls_file_addr); - if (tls_load_addr == LLDB_INVALID_ADDRESS) { - if (error_ptr) - error_ptr->SetErrorString( - "No TLS data currently exists for this thread."); - return false; - } + if (tls_load_addr == LLDB_INVALID_ADDRESS) + return llvm::createStringError( + "no TLS data currently exists for this thread"); stack.back().GetScalar() = tls_load_addr; stack.back().SetValueType(Value::ValueType::LoadAddress); @@ -2542,12 +2328,9 @@ bool DWARFExpression::Evaluate( // and the 0 based index is the ULEB128 encoded index. 
case DW_OP_addrx: case DW_OP_GNU_addr_index: { - if (!dwarf_cu) { - if (error_ptr) - error_ptr->SetErrorString("DW_OP_GNU_addr_index found without a " - "compile unit being specified"); - return false; - } + if (!dwarf_cu) + return llvm::createStringError("DW_OP_GNU_addr_index found without a " + "compile unit being specified"); uint64_t index = opcodes.GetULEB128(&offset); lldb::addr_t value = dwarf_cu->ReadAddressFromDebugAddrSection(index); stack.push_back(Scalar(value)); @@ -2570,10 +2353,8 @@ bool DWARFExpression::Evaluate( // encoded index. case DW_OP_GNU_const_index: { if (!dwarf_cu) { - if (error_ptr) - error_ptr->SetErrorString("DW_OP_GNU_const_index found without a " - "compile unit being specified"); - return false; + return llvm::createStringError("DW_OP_GNU_const_index found without a " + "compile unit being specified"); } uint64_t index = opcodes.GetULEB128(&offset); lldb::addr_t value = dwarf_cu->ReadAddressFromDebugAddrSection(index); @@ -2582,12 +2363,11 @@ bool DWARFExpression::Evaluate( case DW_OP_GNU_entry_value: case DW_OP_entry_value: { - if (!Evaluate_DW_OP_entry_value(stack, exe_ctx, reg_ctx, opcodes, offset, - error_ptr, log)) { - LLDB_ERRORF(error_ptr, "Could not evaluate %s.", - DW_OP_value_to_name(op)); - return false; - } + if (llvm::Error err = Evaluate_DW_OP_entry_value(stack, exe_ctx, reg_ctx, + opcodes, offset, log)) + return llvm::createStringError( + "could not evaluate DW_OP_entry_value: %s", + llvm::toString(std::move(err)).c_str()); break; } @@ -2598,23 +2378,18 @@ bool DWARFExpression::Evaluate( break; } } - if (error_ptr) - error_ptr->SetErrorStringWithFormatv( - "Unhandled opcode {0} in DWARFExpression", LocationAtom(op)); - return false; + return llvm::createStringError(llvm::formatv( + "Unhandled opcode {0} in DWARFExpression", LocationAtom(op))); } } if (stack.empty()) { // Nothing on the stack, check if we created a piece value from DW_OP_piece // or DW_OP_bit_piece opcodes - if (pieces.GetBuffer().GetByteSize()) { - 
result = pieces; - return true; - } - if (error_ptr) - error_ptr->SetErrorString("Stack empty after evaluation."); - return false; + if (pieces.GetBuffer().GetByteSize()) + return pieces; + + return llvm::createStringError("stack empty after evaluation"); } UpdateValueTypeFromLocationDescription( @@ -2631,8 +2406,7 @@ bool DWARFExpression::Evaluate( LLDB_LOGF(log, " %s", new_value.GetData()); } } - result = stack.back(); - return true; // Return true on success + return stack.back(); } bool DWARFExpression::ParseDWARFLocationList( diff --git a/lldb/source/Expression/DWARFExpressionList.cpp b/lldb/source/Expression/DWARFExpressionList.cpp index cba4e4e..7a5cf9f 100644 --- a/lldb/source/Expression/DWARFExpressionList.cpp +++ b/lldb/source/Expression/DWARFExpressionList.cpp @@ -198,12 +198,10 @@ void DWARFExpressionList::GetDescription(Stream *s, } } -bool DWARFExpressionList::Evaluate(ExecutionContext *exe_ctx, - RegisterContext *reg_ctx, - lldb::addr_t func_load_addr, - const Value *initial_value_ptr, - const Value *object_address_ptr, - Value &result, Status *error_ptr) const { +llvm::Expected<Value> DWARFExpressionList::Evaluate( + ExecutionContext *exe_ctx, RegisterContext *reg_ctx, + lldb::addr_t func_load_addr, const Value *initial_value_ptr, + const Value *object_address_ptr) const { ModuleSP module_sp = m_module_wp.lock(); DataExtractor data; RegisterKind reg_kind; @@ -217,32 +215,26 @@ bool DWARFExpressionList::Evaluate(ExecutionContext *exe_ctx, if (exe_ctx) frame = exe_ctx->GetFramePtr(); if (!frame) - return false; + return llvm::createStringError("no frame"); RegisterContextSP reg_ctx_sp = frame->GetRegisterContext(); if (!reg_ctx_sp) - return false; + return llvm::createStringError("no register context"); reg_ctx_sp->GetPCForSymbolication(pc); } if (!pc.IsValid()) { - if (error_ptr) - error_ptr->SetErrorString("Invalid PC in frame."); - return false; + return llvm::createStringError("Invalid PC in frame."); } addr_t pc_load_addr = 
pc.GetLoadAddress(exe_ctx->GetTargetPtr()); const DWARFExpression *entry = GetExpressionAtAddress(func_load_addr, pc_load_addr); - if (!entry) { - if (error_ptr) { - error_ptr->SetErrorString("variable not available"); - } - return false; - } + if (!entry) + return llvm::createStringError("variable not available"); expr = *entry; } expr.GetExpressionData(data); reg_kind = expr.GetRegisterKind(); return DWARFExpression::Evaluate(exe_ctx, reg_ctx, module_sp, data, m_dwarf_cu, reg_kind, initial_value_ptr, - object_address_ptr, result, error_ptr); + object_address_ptr); } diff --git a/lldb/source/Plugins/SymbolFile/DWARF/DWARFASTParser.h b/lldb/source/Plugins/SymbolFile/DWARF/DWARFASTParser.h index e144cf0..66db396 100644 --- a/lldb/source/Plugins/SymbolFile/DWARF/DWARFASTParser.h +++ b/lldb/source/Plugins/SymbolFile/DWARF/DWARFASTParser.h @@ -60,8 +60,6 @@ public: virtual ConstString GetDIEClassTemplateParams(const DWARFDIE &die) = 0; - virtual lldb_private::Type *FindDefinitionTypeForDIE(const DWARFDIE &die) = 0; - static std::optional<SymbolFile::ArrayInfo> ParseChildArrayInfo(const DWARFDIE &parent_die, const ExecutionContext *exe_ctx = nullptr); diff --git a/lldb/source/Plugins/SymbolFile/DWARF/DWARFASTParserClang.cpp b/lldb/source/Plugins/SymbolFile/DWARF/DWARFASTParserClang.cpp index dc4cfc9..579a538 100644 --- a/lldb/source/Plugins/SymbolFile/DWARF/DWARFASTParserClang.cpp +++ b/lldb/source/Plugins/SymbolFile/DWARF/DWARFASTParserClang.cpp @@ -154,26 +154,6 @@ static bool TagIsRecordType(dw_tag_t tag) { } } -static bool IsForwardDeclaration(const DWARFDIE &die, - const ParsedDWARFTypeAttributes &attrs, - LanguageType cu_language) { - if (attrs.is_forward_declaration) - return true; - - // Work around an issue with clang at the moment where forward - // declarations for objective C classes are emitted as: - // DW_TAG_structure_type [2] - // DW_AT_name( "ForwardObjcClass" ) - // DW_AT_byte_size( 0x00 ) - // DW_AT_decl_file( "..." 
) - // DW_AT_decl_line( 1 ) - // - // Note that there is no DW_AT_declaration and there are no children, - // and the byte size is zero. - return attrs.byte_size && *attrs.byte_size == 0 && attrs.name && - !die.HasChildren() && cu_language == eLanguageTypeObjC; -} - TypeSP DWARFASTParserClang::ParseTypeFromClangModule(const SymbolContext &sc, const DWARFDIE &die, Log *log) { @@ -269,9 +249,11 @@ static void ForcefullyCompleteType(CompilerType type) { /// This function serves a similar purpose as RequireCompleteType above, but it /// avoids completing the type if it is not immediately necessary. It only /// ensures we _can_ complete the type later. -void DWARFASTParserClang::PrepareContextToReceiveMembers( - clang::DeclContext *decl_ctx, const DWARFDIE &decl_ctx_die, - const DWARFDIE &die, const char *type_name_cstr) { +static void PrepareContextToReceiveMembers(TypeSystemClang &ast, + ClangASTImporter &ast_importer, + clang::DeclContext *decl_ctx, + DWARFDIE die, + const char *type_name_cstr) { auto *tag_decl_ctx = clang::dyn_cast<clang::TagDecl>(decl_ctx); if (!tag_decl_ctx) return; // Non-tag context are always ready. @@ -286,8 +268,7 @@ void DWARFASTParserClang::PrepareContextToReceiveMembers( // gmodules case), we can complete the type by doing a full import. // If this type was not imported from an external AST, there's nothing to do. - CompilerType type = m_ast.GetTypeForDecl(tag_decl_ctx); - ClangASTImporter &ast_importer = GetClangASTImporter(); + CompilerType type = ast.GetTypeForDecl(tag_decl_ctx); if (type && ast_importer.CanImport(type)) { auto qual_type = ClangUtil::GetQualType(type); if (ast_importer.RequireCompleteType(qual_type)) @@ -298,13 +279,6 @@ void DWARFASTParserClang::PrepareContextToReceiveMembers( type_name_cstr ? type_name_cstr : "", die.GetOffset()); } - // By searching for the definition DIE of the decl_ctx type, we will either: - // 1. 
Found the the definition DIE and start its definition with - // TypeSystemClang::StartTagDeclarationDefinition. - // 2. Unable to find it, then need to forcefully complete it. - FindDefinitionTypeForDIE(decl_ctx_die); - if (tag_decl_ctx->isCompleteDefinition() || tag_decl_ctx->isBeingDefined()) - return; // We don't have a type definition and/or the import failed. We must // forcefully complete the type to avoid crashes. ForcefullyCompleteType(type); @@ -572,6 +546,8 @@ TypeSP DWARFASTParserClang::ParseTypeFromDWARF(const SymbolContext &sc, static std::optional<uint32_t> ExtractDataMemberLocation(DWARFDIE const &die, DWARFFormValue const &form_value, ModuleSP module_sp) { + Log *log = GetLog(DWARFLog::TypeCompletion | DWARFLog::Lookups); + // With DWARF 3 and later, if the value is an integer constant, // this form value is the offset in bytes from the beginning of // the containing entity. @@ -579,21 +555,23 @@ ExtractDataMemberLocation(DWARFDIE const &die, DWARFFormValue const &form_value, return form_value.Unsigned(); Value initialValue(0); - Value memberOffset(0); const DWARFDataExtractor &debug_info_data = die.GetData(); uint32_t block_length = form_value.Unsigned(); uint32_t block_offset = form_value.BlockData() - debug_info_data.GetDataStart(); - if (!DWARFExpression::Evaluate( - nullptr, // ExecutionContext * - nullptr, // RegisterContext * - module_sp, DataExtractor(debug_info_data, block_offset, block_length), - die.GetCU(), eRegisterKindDWARF, &initialValue, nullptr, memberOffset, - nullptr)) { + + llvm::Expected<Value> memberOffset = DWARFExpression::Evaluate( + /*ExecutionContext=*/nullptr, + /*RegisterContext=*/nullptr, module_sp, + DataExtractor(debug_info_data, block_offset, block_length), die.GetCU(), + eRegisterKindDWARF, &initialValue, nullptr); + if (!memberOffset) { + LLDB_LOG_ERROR(log, memberOffset.takeError(), + "ExtractDataMemberLocation failed: {0}"); return {}; } - return memberOffset.ResolveValue(nullptr).UInt(); + return 
memberOffset->ResolveValue(nullptr).UInt(); } static TypePayloadClang GetPtrAuthMofidierPayload(const DWARFDIE &die) { @@ -646,11 +624,10 @@ DWARFASTParserClang::ParseTypeModifier(const SymbolContext &sc, if (tag == DW_TAG_typedef) { // DeclContext will be populated when the clang type is materialized in // Type::ResolveCompilerType. - DWARFDIE decl_ctx_die; - clang::DeclContext *decl_ctx = - GetClangDeclContextContainingDIE(die, &decl_ctx_die); - PrepareContextToReceiveMembers(decl_ctx, decl_ctx_die, die, - attrs.name.GetCString()); + PrepareContextToReceiveMembers( + m_ast, GetClangASTImporter(), + GetClangDeclContextContainingDIE(die, nullptr), die, + attrs.name.GetCString()); if (attrs.type.IsValid()) { // Try to parse a typedef from the (DWARF embedded in the) Clang @@ -1130,6 +1107,32 @@ DWARFASTParserClang::ParseSubroutine(const DWARFDIE &die, // struct and see if this is actually a C++ method Type *class_type = dwarf->ResolveType(decl_ctx_die); if (class_type) { + if (class_type->GetID() != decl_ctx_die.GetID() || + IsClangModuleFwdDecl(decl_ctx_die)) { + + // We uniqued the parent class of this function to another + // class so we now need to associate all dies under + // "decl_ctx_die" to DIEs in the DIE for "class_type"... + DWARFDIE class_type_die = dwarf->GetDIE(class_type->GetID()); + + if (class_type_die) { + std::vector<DWARFDIE> failures; + + CopyUniqueClassMethodTypes(decl_ctx_die, class_type_die, + class_type, failures); + + // FIXME do something with these failures that's + // smarter than just dropping them on the ground. + // Unfortunately classes don't like having stuff added + // to them after their definitions are complete... 
+ + Type *type_ptr = dwarf->GetDIEToType()[die.GetDIE()]; + if (type_ptr && type_ptr != DIE_IS_BEING_PARSED) { + return type_ptr->shared_from_this(); + } + } + } + if (attrs.specification.IsValid()) { // We have a specification which we are going to base our // function prototype off of, so we need this type to be @@ -1264,39 +1267,6 @@ DWARFASTParserClang::ParseSubroutine(const DWARFDIE &die, } } } - // By here, we should have already completed the c++ class_type - // because if either specification or abstract_origin is present, we - // call GetClangDeclContextForDIE to resolve the DW_TAG_subprogram - // refered by this one until we reached the DW_TAG_subprogram without - // specification or abstract_origin (the else branch above). Then the - // above GetFullCompilerType() will complete the class_type if it's - // not completed yet. After that, we will have the mapping from DIEs - // in class_type_die to DeclContexts in m_die_to_decl_ctx. - if (class_type->GetID() != decl_ctx_die.GetID() || - IsClangModuleFwdDecl(decl_ctx_die)) { - - // We uniqued the parent class of this function to another - // class so we now need to associate all dies under - // "decl_ctx_die" to DIEs in the DIE for "class_type"... - DWARFDIE class_type_die = dwarf->GetDIE(class_type->GetID()); - - if (class_type_die) { - std::vector<DWARFDIE> failures; - - CopyUniqueClassMethodTypes(decl_ctx_die, class_type_die, - class_type, failures); - - // FIXME do something with these failures that's - // smarter than just dropping them on the ground. - // Unfortunately classes don't like having stuff added - // to them after their definitions are complete... 
- - Type *type_ptr = dwarf->GetDIEToType()[die.GetDIE()]; - if (type_ptr && type_ptr != DIE_IS_BEING_PARSED) { - return type_ptr->shared_from_this(); - } - } - } } } } @@ -1669,93 +1639,6 @@ DWARFASTParserClang::GetCPlusPlusQualifiedName(const DWARFDIE &die) { return qualified_name; } -lldb_private::Type * -DWARFASTParserClang::FindDefinitionTypeForDIE(const DWARFDIE &die) { - SymbolFileDWARF *dwarf = die.GetDWARF(); - ParsedDWARFTypeAttributes attrs(die); - bool is_forward_declaration = IsForwardDeclaration( - die, attrs, SymbolFileDWARF::GetLanguage(*die.GetCU())); - if (!is_forward_declaration) - return dwarf->GetDIEToType()[die.GetDIE()]; - - const dw_tag_t tag = die.Tag(); - TypeSP type_sp; - Log *log = GetLog(DWARFLog::TypeCompletion | DWARFLog::Lookups); - if (log) { - dwarf->GetObjectFile()->GetModule()->LogMessage( - log, - "SymbolFileDWARF({0:p}) - {1:x16}: {2} type \"{3}\" is a " - "forward declaration DIE, trying to find definition DIE", - static_cast<void *>(this), die.GetOffset(), DW_TAG_value_to_name(tag), - attrs.name.GetCString()); - } - // We haven't parse definition die for this type, starting to search for it. - // After we found the definition die, the GetDeclarationDIEToDefinitionDIE() - // map will have the new mapping from this declaration die to definition die. 
- if (attrs.class_language == eLanguageTypeObjC || - attrs.class_language == eLanguageTypeObjC_plus_plus) { - if (!attrs.is_complete_objc_class && - die.Supports_DW_AT_APPLE_objc_complete_type()) { - // We have a valid eSymbolTypeObjCClass class symbol whose name - // matches the current objective C class that we are trying to find - // and this DIE isn't the complete definition (we checked - // is_complete_objc_class above and know it is false), so the real - // definition is in here somewhere - type_sp = - dwarf->FindCompleteObjCDefinitionTypeForDIE(die, attrs.name, true); - - if (!type_sp) { - SymbolFileDWARFDebugMap *debug_map_symfile = - dwarf->GetDebugMapSymfile(); - if (debug_map_symfile) { - // We weren't able to find a full declaration in this DWARF, - // see if we have a declaration anywhere else... - type_sp = debug_map_symfile->FindCompleteObjCDefinitionTypeForDIE( - die, attrs.name, true); - } - } - - if (type_sp && log) { - dwarf->GetObjectFile()->GetModule()->LogMessage( - log, - "SymbolFileDWARF({0:p}) - {1:x16}: {2} ({3}) type \"{4}\" is an " - "incomplete objc type, complete type is {5:x8}", - static_cast<void *>(this), die.GetOffset(), - DW_TAG_value_to_name(tag), tag, attrs.name.GetCString(), - type_sp->GetID()); - } - } - } - - type_sp = dwarf->FindDefinitionTypeForDWARFDeclContext(die); - if (!type_sp) { - SymbolFileDWARFDebugMap *debug_map_symfile = dwarf->GetDebugMapSymfile(); - if (debug_map_symfile) { - // We weren't able to find a full declaration in this DWARF, see - // if we have a declaration anywhere else... 
- type_sp = debug_map_symfile->FindDefinitionTypeForDWARFDeclContext(die); - } - if (type_sp && log) { - dwarf->GetObjectFile()->GetModule()->LogMessage( - log, - "SymbolFileDWARF({0:p}) - {1:x16}: {2} type \"{3}\" is a " - "forward declaration, complete type is {4:x8}", - static_cast<void *>(this), die.GetOffset(), DW_TAG_value_to_name(tag), - attrs.name.GetCString(), type_sp->GetID()); - } - } - - if (!type_sp && log) { - dwarf->GetObjectFile()->GetModule()->LogMessage( - log, - "SymbolFileDWARF({0:p}) - {1:x16}: {2} type \"{3}\" is a " - "forward declaration, unable to find definition DIE for it", - static_cast<void *>(this), die.GetOffset(), DW_TAG_value_to_name(tag), - attrs.name.GetCString()); - } - return type_sp.get(); -} - TypeSP DWARFASTParserClang::ParseStructureLikeDIE(const SymbolContext &sc, const DWARFDIE &die, @@ -1767,10 +1650,14 @@ DWARFASTParserClang::ParseStructureLikeDIE(const SymbolContext &sc, LanguageType cu_language = SymbolFileDWARF::GetLanguage(*die.GetCU()); Log *log = GetLog(DWARFLog::TypeCompletion | DWARFLog::Lookups); + // UniqueDWARFASTType is large, so don't create a local variables on the + // stack, put it on the heap. This function is often called recursively and + // clang isn't good at sharing the stack space for variables in different + // blocks. 
+ auto unique_ast_entry_up = std::make_unique<UniqueDWARFASTType>(); + ConstString unique_typename(attrs.name); Declaration unique_decl(attrs.decl); - uint64_t byte_size = attrs.byte_size.value_or(0); - attrs.is_forward_declaration = IsForwardDeclaration(die, attrs, cu_language); if (attrs.name) { if (Language::LanguageIsCPlusPlus(cu_language)) { @@ -1783,42 +1670,14 @@ DWARFASTParserClang::ParseStructureLikeDIE(const SymbolContext &sc, unique_decl.Clear(); } - if (UniqueDWARFASTType *unique_ast_entry_type = - dwarf->GetUniqueDWARFASTTypeMap().Find( - unique_typename, die, unique_decl, byte_size, - attrs.is_forward_declaration)) { - type_sp = unique_ast_entry_type->m_type_sp; + if (dwarf->GetUniqueDWARFASTTypeMap().Find( + unique_typename, die, unique_decl, attrs.byte_size.value_or(-1), + *unique_ast_entry_up)) { + type_sp = unique_ast_entry_up->m_type_sp; if (type_sp) { dwarf->GetDIEToType()[die.GetDIE()] = type_sp.get(); LinkDeclContextToDIE( - GetCachedClangDeclContextForDIE(unique_ast_entry_type->m_die), die); - if (!attrs.is_forward_declaration) { - // If the DIE being parsed in this function is a definition and the - // entry in the map is a declaration, then we need to update the entry - // to point to the definition DIE. - if (unique_ast_entry_type->m_is_forward_declaration) { - unique_ast_entry_type->m_die = die; - unique_ast_entry_type->m_byte_size = byte_size; - unique_ast_entry_type->m_declaration = unique_decl; - unique_ast_entry_type->m_is_forward_declaration = false; - // Need to update Type ID to refer to the definition DIE. because - // it's used in ParseSubroutine to determine if we need to copy cxx - // method types from a declaration DIE to this definition DIE. 
- type_sp->SetID(die.GetID()); - clang_type = type_sp->GetForwardCompilerType(); - if (attrs.class_language != eLanguageTypeObjC && - attrs.class_language != eLanguageTypeObjC_plus_plus) - TypeSystemClang::StartTagDeclarationDefinition(clang_type); - - CompilerType compiler_type_no_qualifiers = - ClangUtil::RemoveFastQualifiers(clang_type); - auto result = dwarf->GetForwardDeclCompilerTypeToDIE().try_emplace( - compiler_type_no_qualifiers.GetOpaqueQualType(), - *die.GetDIERef()); - if (!result.second) - result.first->second = *die.GetDIERef(); - } - } + GetCachedClangDeclContextForDIE(unique_ast_entry_up->m_die), die); return type_sp; } } @@ -1840,21 +1699,125 @@ DWARFASTParserClang::ParseStructureLikeDIE(const SymbolContext &sc, default_accessibility = eAccessPrivate; } + if (attrs.byte_size && *attrs.byte_size == 0 && attrs.name && + !die.HasChildren() && cu_language == eLanguageTypeObjC) { + // Work around an issue with clang at the moment where forward + // declarations for objective C classes are emitted as: + // DW_TAG_structure_type [2] + // DW_AT_name( "ForwardObjcClass" ) + // DW_AT_byte_size( 0x00 ) + // DW_AT_decl_file( "..." ) + // DW_AT_decl_line( 1 ) + // + // Note that there is no DW_AT_declaration and there are no children, + // and the byte size is zero. 
+ attrs.is_forward_declaration = true; + } + + if (attrs.class_language == eLanguageTypeObjC || + attrs.class_language == eLanguageTypeObjC_plus_plus) { + if (!attrs.is_complete_objc_class && + die.Supports_DW_AT_APPLE_objc_complete_type()) { + // We have a valid eSymbolTypeObjCClass class symbol whose name + // matches the current objective C class that we are trying to find + // and this DIE isn't the complete definition (we checked + // is_complete_objc_class above and know it is false), so the real + // definition is in here somewhere + type_sp = + dwarf->FindCompleteObjCDefinitionTypeForDIE(die, attrs.name, true); + + if (!type_sp) { + SymbolFileDWARFDebugMap *debug_map_symfile = + dwarf->GetDebugMapSymfile(); + if (debug_map_symfile) { + // We weren't able to find a full declaration in this DWARF, + // see if we have a declaration anywhere else... + type_sp = debug_map_symfile->FindCompleteObjCDefinitionTypeForDIE( + die, attrs.name, true); + } + } + + if (type_sp) { + if (log) { + dwarf->GetObjectFile()->GetModule()->LogMessage( + log, + "SymbolFileDWARF({0:p}) - {1:x16}: {2} ({3}) type \"{4}\" is an " + "incomplete objc type, complete type is {5:x8}", + static_cast<void *>(this), die.GetOffset(), + DW_TAG_value_to_name(tag), tag, attrs.name.GetCString(), + type_sp->GetID()); + } + + // We found a real definition for this type elsewhere so lets use + // it and cache the fact that we found a complete type for this + // die + dwarf->GetDIEToType()[die.GetDIE()] = type_sp.get(); + return type_sp; + } + } + } + if (attrs.is_forward_declaration) { + // We have a forward declaration to a type and we need to try and + // find a full declaration. We look in the current type index just in + // case we have a forward declaration followed by an actual + // declarations in the DWARF. If this fails, we need to look + // elsewhere... 
+ if (log) { + dwarf->GetObjectFile()->GetModule()->LogMessage( + log, + "SymbolFileDWARF({0:p}) - {1:x16}: {2} ({3}) type \"{4}\" is a " + "forward declaration, trying to find complete type", + static_cast<void *>(this), die.GetOffset(), DW_TAG_value_to_name(tag), + tag, attrs.name.GetCString()); + } + // See if the type comes from a Clang module and if so, track down // that type. type_sp = ParseTypeFromClangModule(sc, die, log); if (type_sp) return type_sp; - } + // type_sp = FindDefinitionTypeForDIE (dwarf_cu, die, + // type_name_const_str); + type_sp = dwarf->FindDefinitionTypeForDWARFDeclContext(die); + + if (!type_sp) { + SymbolFileDWARFDebugMap *debug_map_symfile = dwarf->GetDebugMapSymfile(); + if (debug_map_symfile) { + // We weren't able to find a full declaration in this DWARF, see + // if we have a declaration anywhere else... + type_sp = debug_map_symfile->FindDefinitionTypeForDWARFDeclContext(die); + } + } + + if (type_sp) { + if (log) { + dwarf->GetObjectFile()->GetModule()->LogMessage( + log, + "SymbolFileDWARF({0:p}) - {1:x16}: {2} ({3}) type \"{4}\" is a " + "forward declaration, complete type is {5:x8}", + static_cast<void *>(this), die.GetOffset(), + DW_TAG_value_to_name(tag), tag, attrs.name.GetCString(), + type_sp->GetID()); + } + + // We found a real definition for this type elsewhere so lets use + // it and cache the fact that we found a complete type for this die + dwarf->GetDIEToType()[die.GetDIE()] = type_sp.get(); + clang::DeclContext *defn_decl_ctx = + GetCachedClangDeclContextForDIE(dwarf->GetDIE(type_sp->GetID())); + if (defn_decl_ctx) + LinkDeclContextToDIE(defn_decl_ctx, die); + return type_sp; + } + } assert(tag_decl_kind != -1); UNUSED_IF_ASSERT_DISABLED(tag_decl_kind); - DWARFDIE decl_ctx_die; - clang::DeclContext *decl_ctx = - GetClangDeclContextContainingDIE(die, &decl_ctx_die); + bool clang_type_was_created = false; + clang::DeclContext *decl_ctx = GetClangDeclContextContainingDIE(die, nullptr); - 
PrepareContextToReceiveMembers(decl_ctx, decl_ctx_die, die, + PrepareContextToReceiveMembers(m_ast, GetClangASTImporter(), decl_ctx, die, attrs.name.GetCString()); if (attrs.accessibility == eAccessNone && decl_ctx) { @@ -1893,17 +1856,20 @@ DWARFASTParserClang::ParseStructureLikeDIE(const SymbolContext &sc, tag_decl_kind, template_param_infos); clang_type = m_ast.CreateClassTemplateSpecializationType(class_specialization_decl); + clang_type_was_created = true; m_ast.SetMetadata(class_template_decl, metadata); m_ast.SetMetadata(class_specialization_decl, metadata); } - if (!clang_type) { + if (!clang_type_was_created) { + clang_type_was_created = true; clang_type = m_ast.CreateRecordType( decl_ctx, GetOwningClangModule(die), attrs.accessibility, attrs.name.GetCString(), tag_decl_kind, attrs.class_language, &metadata, attrs.exports_symbols); } + // Store a forward declaration to this class type in case any // parameters in any class methods need it for the clang types for // function prototypes. @@ -1914,19 +1880,13 @@ DWARFASTParserClang::ParseStructureLikeDIE(const SymbolContext &sc, Type::ResolveState::Forward, TypePayloadClang(OptionalClangModuleID(), attrs.is_complete_objc_class)); - // UniqueDWARFASTType is large, so don't create a local variables on the - // stack, put it on the heap. This function is often called recursively and - // clang isn't good at sharing the stack space for variables in different - // blocks. 
- auto unique_ast_entry_up = std::make_unique<UniqueDWARFASTType>(); // Add our type to the unique type map so we don't end up creating many // copies of the same type over and over in the ASTContext for our // module unique_ast_entry_up->m_type_sp = type_sp; unique_ast_entry_up->m_die = die; unique_ast_entry_up->m_declaration = unique_decl; - unique_ast_entry_up->m_byte_size = byte_size; - unique_ast_entry_up->m_is_forward_declaration = attrs.is_forward_declaration; + unique_ast_entry_up->m_byte_size = attrs.byte_size.value_or(0); dwarf->GetUniqueDWARFASTTypeMap().Insert(unique_typename, *unique_ast_entry_up); @@ -1967,7 +1927,7 @@ DWARFASTParserClang::ParseStructureLikeDIE(const SymbolContext &sc, GetClangASTImporter().SetRecordLayout(record_decl, layout); } } - } else { + } else if (clang_type_was_created) { // Start the definition if the class is not objective C since the // underlying decls respond to isCompleteDefinition(). Objective // C decls don't respond to isCompleteDefinition() so we can't @@ -1979,21 +1939,26 @@ DWARFASTParserClang::ParseStructureLikeDIE(const SymbolContext &sc, if (attrs.class_language != eLanguageTypeObjC && attrs.class_language != eLanguageTypeObjC_plus_plus) TypeSystemClang::StartTagDeclarationDefinition(clang_type); + + // Leave this as a forward declaration until we need to know the + // details of the type. lldb_private::Type will automatically call + // the SymbolFile virtual function + // "SymbolFileDWARF::CompleteType(Type *)" When the definition + // needs to be defined. + assert(!dwarf->GetForwardDeclCompilerTypeToDIE().count( + ClangUtil::RemoveFastQualifiers(clang_type) + .GetOpaqueQualType()) && + "Type already in the forward declaration map!"); + // Can't assume m_ast.GetSymbolFile() is actually a + // SymbolFileDWARF, it can be a SymbolFileDWARFDebugMap for Apple + // binaries. 
+ dwarf->GetForwardDeclCompilerTypeToDIE().try_emplace( + ClangUtil::RemoveFastQualifiers(clang_type).GetOpaqueQualType(), + *die.GetDIERef()); + m_ast.SetHasExternalStorage(clang_type.GetOpaqueQualType(), true); } } - // If this is a declaration DIE, leave this as a forward declaration until we - // need to know the details of the type. lldb_private::Type will automatically - // call the SymbolFile virtual function "SymbolFileDWARF::CompleteType(Type - // *)" When the definition needs to be defined. - assert(!dwarf->GetForwardDeclCompilerTypeToDIE().count( - ClangUtil::RemoveFastQualifiers(clang_type).GetOpaqueQualType()) && - "Type already in the forward declaration map!"); - dwarf->GetForwardDeclCompilerTypeToDIE().try_emplace( - ClangUtil::RemoveFastQualifiers(clang_type).GetOpaqueQualType(), - *die.GetDIERef()); - m_ast.SetHasExternalStorage(clang_type.GetOpaqueQualType(), true); - // If we made a clang type, set the trivial abi if applicable: We only // do this for pass by value - which implies the Trivial ABI. There // isn't a way to assert that something that would normally be pass by @@ -2232,10 +2197,6 @@ bool DWARFASTParserClang::CompleteRecordType(const DWARFDIE &die, // For objective C we don't start the definition when the class is // created. 
TypeSystemClang::StartTagDeclarationDefinition(clang_type); - } else { - assert(clang_type.IsBeingDefined() && - "Trying to complete a definition without a prior call to " - "StartTagDeclarationDefinition."); } AccessType default_accessibility = eAccessNone; diff --git a/lldb/source/Plugins/SymbolFile/DWARF/DWARFASTParserClang.h b/lldb/source/Plugins/SymbolFile/DWARF/DWARFASTParserClang.h index 853b8cc..8d4af20 100644 --- a/lldb/source/Plugins/SymbolFile/DWARF/DWARFASTParserClang.h +++ b/lldb/source/Plugins/SymbolFile/DWARF/DWARFASTParserClang.h @@ -42,40 +42,40 @@ struct ParsedDWARFTypeAttributes; class DWARFASTParserClang : public lldb_private::plugin::dwarf::DWARFASTParser { public: - typedef lldb_private::plugin::dwarf::DWARFDIE DWARFDIE; - DWARFASTParserClang(lldb_private::TypeSystemClang &ast); ~DWARFASTParserClang() override; // DWARFASTParser interface. - lldb::TypeSP ParseTypeFromDWARF(const lldb_private::SymbolContext &sc, - const DWARFDIE &die, - bool *type_is_new_ptr) override; + lldb::TypeSP + ParseTypeFromDWARF(const lldb_private::SymbolContext &sc, + const lldb_private::plugin::dwarf::DWARFDIE &die, + bool *type_is_new_ptr) override; - lldb_private::ConstString - ConstructDemangledNameFromDWARF(const DWARFDIE &die) override; + lldb_private::ConstString ConstructDemangledNameFromDWARF( + const lldb_private::plugin::dwarf::DWARFDIE &die) override; lldb_private::Function * ParseFunctionFromDWARF(lldb_private::CompileUnit &comp_unit, - const DWARFDIE &die, + const lldb_private::plugin::dwarf::DWARFDIE &die, const lldb_private::AddressRange &func_range) override; bool - CompleteTypeFromDWARF(const DWARFDIE &die, lldb_private::Type *type, + CompleteTypeFromDWARF(const lldb_private::plugin::dwarf::DWARFDIE &die, + lldb_private::Type *type, lldb_private::CompilerType &compiler_type) override; - lldb_private::CompilerDecl - GetDeclForUIDFromDWARF(const DWARFDIE &die) override; + lldb_private::CompilerDecl GetDeclForUIDFromDWARF( + const 
lldb_private::plugin::dwarf::DWARFDIE &die) override; void EnsureAllDIEsInDeclContextHaveBeenParsed( lldb_private::CompilerDeclContext decl_context) override; - lldb_private::CompilerDeclContext - GetDeclContextForUIDFromDWARF(const DWARFDIE &die) override; + lldb_private::CompilerDeclContext GetDeclContextForUIDFromDWARF( + const lldb_private::plugin::dwarf::DWARFDIE &die) override; - lldb_private::CompilerDeclContext - GetDeclContextContainingUIDFromDWARF(const DWARFDIE &die) override; + lldb_private::CompilerDeclContext GetDeclContextContainingUIDFromDWARF( + const lldb_private::plugin::dwarf::DWARFDIE &die) override; lldb_private::ClangASTImporter &GetClangASTImporter(); @@ -105,13 +105,8 @@ public: /// \return A string, including surrounding '<>', of the template parameters. /// If the DIE's name already has '<>', returns an empty ConstString because /// it's assumed that the caller is using the DIE name anyway. - lldb_private::ConstString - GetDIEClassTemplateParams(const DWARFDIE &die) override; - - // Searching for definition DIE for the given DIE and return the type - // associated with the definition DIE, or nullptr if definition DIE is not - // found. - lldb_private::Type *FindDefinitionTypeForDIE(const DWARFDIE &die) override; + lldb_private::ConstString GetDIEClassTemplateParams( + const lldb_private::plugin::dwarf::DWARFDIE &die) override; protected: /// Protected typedefs and members. 
@@ -123,7 +118,8 @@ protected: const lldb_private::plugin::dwarf::DWARFDebugInfoEntry *, clang::DeclContext *> DIEToDeclContextMap; - typedef std::multimap<const clang::DeclContext *, const DWARFDIE> + typedef std::multimap<const clang::DeclContext *, + const lldb_private::plugin::dwarf::DWARFDIE> DeclContextToDIEMap; typedef llvm::DenseMap< const lldb_private::plugin::dwarf::DWARFDebugInfoEntry *, @@ -141,11 +137,14 @@ protected: std::unique_ptr<lldb_private::ClangASTImporter> m_clang_ast_importer_up; /// @} - clang::DeclContext *GetDeclContextForBlock(const DWARFDIE &die); + clang::DeclContext * + GetDeclContextForBlock(const lldb_private::plugin::dwarf::DWARFDIE &die); - clang::BlockDecl *ResolveBlockDIE(const DWARFDIE &die); + clang::BlockDecl * + ResolveBlockDIE(const lldb_private::plugin::dwarf::DWARFDIE &die); - clang::NamespaceDecl *ResolveNamespaceDIE(const DWARFDIE &die); + clang::NamespaceDecl * + ResolveNamespaceDIE(const lldb_private::plugin::dwarf::DWARFDIE &die); /// Returns the namespace decl that a DW_TAG_imported_declaration imports. /// @@ -156,86 +155,96 @@ protected: /// 'die' imports. If the imported entity is not a namespace /// or another import declaration, returns nullptr. If an error /// occurs, returns nullptr. 
- clang::NamespaceDecl *ResolveImportedDeclarationDIE(const DWARFDIE &die); + clang::NamespaceDecl *ResolveImportedDeclarationDIE( + const lldb_private::plugin::dwarf::DWARFDIE &die); - bool ParseTemplateDIE(const DWARFDIE &die, + bool ParseTemplateDIE(const lldb_private::plugin::dwarf::DWARFDIE &die, lldb_private::TypeSystemClang::TemplateParameterInfos &template_param_infos); bool ParseTemplateParameterInfos( - const DWARFDIE &parent_die, + const lldb_private::plugin::dwarf::DWARFDIE &parent_die, lldb_private::TypeSystemClang::TemplateParameterInfos &template_param_infos); - std::string GetCPlusPlusQualifiedName(const DWARFDIE &die); + std::string + GetCPlusPlusQualifiedName(const lldb_private::plugin::dwarf::DWARFDIE &die); bool ParseChildMembers( - const DWARFDIE &die, lldb_private::CompilerType &class_compiler_type, + const lldb_private::plugin::dwarf::DWARFDIE &die, + lldb_private::CompilerType &class_compiler_type, std::vector<std::unique_ptr<clang::CXXBaseSpecifier>> &base_classes, - std::vector<DWARFDIE> &member_function_dies, - std::vector<DWARFDIE> &contained_type_dies, + std::vector<lldb_private::plugin::dwarf::DWARFDIE> &member_function_dies, + std::vector<lldb_private::plugin::dwarf::DWARFDIE> &contained_type_dies, DelayedPropertyList &delayed_properties, const lldb::AccessType default_accessibility, lldb_private::ClangASTImporter::LayoutInfo &layout_info); size_t ParseChildParameters(clang::DeclContext *containing_decl_ctx, - const DWARFDIE &parent_die, bool skip_artificial, - bool &is_static, bool &is_variadic, + const lldb_private::plugin::dwarf::DWARFDIE &parent_die, + bool skip_artificial, bool &is_static, bool &is_variadic, bool &has_template_params, std::vector<lldb_private::CompilerType> &function_args, std::vector<clang::ParmVarDecl *> &function_param_decls, unsigned &type_quals); - size_t ParseChildEnumerators(lldb_private::CompilerType &compiler_type, - bool is_signed, uint32_t enumerator_byte_size, - const DWARFDIE &parent_die); + size_t 
ParseChildEnumerators( + lldb_private::CompilerType &compiler_type, bool is_signed, + uint32_t enumerator_byte_size, + const lldb_private::plugin::dwarf::DWARFDIE &parent_die); /// Parse a structure, class, or union type DIE. - lldb::TypeSP ParseStructureLikeDIE(const lldb_private::SymbolContext &sc, - const DWARFDIE &die, - ParsedDWARFTypeAttributes &attrs); + lldb::TypeSP + ParseStructureLikeDIE(const lldb_private::SymbolContext &sc, + const lldb_private::plugin::dwarf::DWARFDIE &die, + ParsedDWARFTypeAttributes &attrs); - clang::Decl *GetClangDeclForDIE(const DWARFDIE &die); + clang::Decl * + GetClangDeclForDIE(const lldb_private::plugin::dwarf::DWARFDIE &die); - clang::DeclContext *GetClangDeclContextForDIE(const DWARFDIE &die); + clang::DeclContext * + GetClangDeclContextForDIE(const lldb_private::plugin::dwarf::DWARFDIE &die); - clang::DeclContext *GetClangDeclContextContainingDIE(const DWARFDIE &die, - DWARFDIE *decl_ctx_die); - lldb_private::OptionalClangModuleID GetOwningClangModule(const DWARFDIE &die); + clang::DeclContext *GetClangDeclContextContainingDIE( + const lldb_private::plugin::dwarf::DWARFDIE &die, + lldb_private::plugin::dwarf::DWARFDIE *decl_ctx_die); + lldb_private::OptionalClangModuleID + GetOwningClangModule(const lldb_private::plugin::dwarf::DWARFDIE &die); - bool CopyUniqueClassMethodTypes(const DWARFDIE &src_class_die, - const DWARFDIE &dst_class_die, - lldb_private::Type *class_type, - std::vector<DWARFDIE> &failures); + bool CopyUniqueClassMethodTypes( + const lldb_private::plugin::dwarf::DWARFDIE &src_class_die, + const lldb_private::plugin::dwarf::DWARFDIE &dst_class_die, + lldb_private::Type *class_type, + std::vector<lldb_private::plugin::dwarf::DWARFDIE> &failures); - clang::DeclContext *GetCachedClangDeclContextForDIE(const DWARFDIE &die); + clang::DeclContext *GetCachedClangDeclContextForDIE( + const lldb_private::plugin::dwarf::DWARFDIE &die); - void LinkDeclContextToDIE(clang::DeclContext *decl_ctx, const DWARFDIE &die); + 
void LinkDeclContextToDIE(clang::DeclContext *decl_ctx, + const lldb_private::plugin::dwarf::DWARFDIE &die); - void LinkDeclToDIE(clang::Decl *decl, const DWARFDIE &die); + void LinkDeclToDIE(clang::Decl *decl, + const lldb_private::plugin::dwarf::DWARFDIE &die); /// If \p type_sp is valid, calculate and set its symbol context scope, and /// update the type list for its backing symbol file. /// /// Returns \p type_sp. - lldb::TypeSP - UpdateSymbolContextScopeForType(const lldb_private::SymbolContext &sc, - const DWARFDIE &die, lldb::TypeSP type_sp); + lldb::TypeSP UpdateSymbolContextScopeForType( + const lldb_private::SymbolContext &sc, + const lldb_private::plugin::dwarf::DWARFDIE &die, lldb::TypeSP type_sp); /// Follow Clang Module Skeleton CU references to find a type definition. - lldb::TypeSP ParseTypeFromClangModule(const lldb_private::SymbolContext &sc, - const DWARFDIE &die, - lldb_private::Log *log); + lldb::TypeSP + ParseTypeFromClangModule(const lldb_private::SymbolContext &sc, + const lldb_private::plugin::dwarf::DWARFDIE &die, + lldb_private::Log *log); // Return true if this type is a declaration to a type in an external // module. - lldb::ModuleSP GetModuleForType(const DWARFDIE &die); - - void PrepareContextToReceiveMembers(clang::DeclContext *decl_ctx, - const DWARFDIE &decl_ctx_die, - const DWARFDIE &die, - const char *type_name_cstr); + lldb::ModuleSP + GetModuleForType(const lldb_private::plugin::dwarf::DWARFDIE &die); static bool classof(const DWARFASTParser *Parser) { return Parser->GetKind() == Kind::DWARFASTParserClang; @@ -265,8 +274,10 @@ private: /// Parsed form of all attributes that are relevant for parsing type members. 
struct MemberAttributes { - explicit MemberAttributes(const DWARFDIE &die, const DWARFDIE &parent_die, - lldb::ModuleSP module_sp); + explicit MemberAttributes( + const lldb_private::plugin::dwarf::DWARFDIE &die, + const lldb_private::plugin::dwarf::DWARFDIE &parent_die, + lldb::ModuleSP module_sp); const char *name = nullptr; /// Indicates how many bits into the word (according to the host endianness) /// the low-order bit of the field starts. Can be negative. @@ -313,12 +324,15 @@ private: /// created property. /// \param delayed_properties The list of delayed properties that the result /// will be appended to. - void ParseObjCProperty(const DWARFDIE &die, const DWARFDIE &parent_die, - const lldb_private::CompilerType &class_clang_type, - DelayedPropertyList &delayed_properties); + void + ParseObjCProperty(const lldb_private::plugin::dwarf::DWARFDIE &die, + const lldb_private::plugin::dwarf::DWARFDIE &parent_die, + const lldb_private::CompilerType &class_clang_type, + DelayedPropertyList &delayed_properties); void - ParseSingleMember(const DWARFDIE &die, const DWARFDIE &parent_die, + ParseSingleMember(const lldb_private::plugin::dwarf::DWARFDIE &die, + const lldb_private::plugin::dwarf::DWARFDIE &parent_die, const lldb_private::CompilerType &class_clang_type, lldb::AccessType default_accessibility, lldb_private::ClangASTImporter::LayoutInfo &layout_info, @@ -336,25 +350,31 @@ private: /// \param[in] class_clang_type The parent RecordType of the static /// member this function will create. 
void CreateStaticMemberVariable( - const DWARFDIE &die, const MemberAttributes &attrs, + const lldb_private::plugin::dwarf::DWARFDIE &die, + const MemberAttributes &attrs, const lldb_private::CompilerType &class_clang_type); - bool CompleteRecordType(const DWARFDIE &die, lldb_private::Type *type, + bool CompleteRecordType(const lldb_private::plugin::dwarf::DWARFDIE &die, + lldb_private::Type *type, lldb_private::CompilerType &clang_type); - bool CompleteEnumType(const DWARFDIE &die, lldb_private::Type *type, + bool CompleteEnumType(const lldb_private::plugin::dwarf::DWARFDIE &die, + lldb_private::Type *type, lldb_private::CompilerType &clang_type); - lldb::TypeSP ParseTypeModifier(const lldb_private::SymbolContext &sc, - const DWARFDIE &die, - ParsedDWARFTypeAttributes &attrs); + lldb::TypeSP + ParseTypeModifier(const lldb_private::SymbolContext &sc, + const lldb_private::plugin::dwarf::DWARFDIE &die, + ParsedDWARFTypeAttributes &attrs); lldb::TypeSP ParseEnum(const lldb_private::SymbolContext &sc, - const DWARFDIE &die, ParsedDWARFTypeAttributes &attrs); - lldb::TypeSP ParseSubroutine(const DWARFDIE &die, + const lldb_private::plugin::dwarf::DWARFDIE &die, + ParsedDWARFTypeAttributes &attrs); + lldb::TypeSP ParseSubroutine(const lldb_private::plugin::dwarf::DWARFDIE &die, const ParsedDWARFTypeAttributes &attrs); - lldb::TypeSP ParseArrayType(const DWARFDIE &die, + lldb::TypeSP ParseArrayType(const lldb_private::plugin::dwarf::DWARFDIE &die, const ParsedDWARFTypeAttributes &attrs); - lldb::TypeSP ParsePointerToMemberType(const DWARFDIE &die, - const ParsedDWARFTypeAttributes &attrs); + lldb::TypeSP + ParsePointerToMemberType(const lldb_private::plugin::dwarf::DWARFDIE &die, + const ParsedDWARFTypeAttributes &attrs); /// Parses a DW_TAG_inheritance DIE into a base/super class. /// @@ -371,7 +391,8 @@ private: /// \param layout_info The layout information that will be updated for C++ /// base classes with the base offset. 
void ParseInheritance( - const DWARFDIE &die, const DWARFDIE &parent_die, + const lldb_private::plugin::dwarf::DWARFDIE &die, + const lldb_private::plugin::dwarf::DWARFDIE &parent_die, const lldb_private::CompilerType class_clang_type, const lldb::AccessType default_accessibility, const lldb::ModuleSP &module_sp, @@ -388,7 +409,8 @@ private: /// \param layout_info The layout information that will be updated for // base classes with the base offset void - ParseRustVariantPart(DWARFDIE &die, const DWARFDIE &parent_die, + ParseRustVariantPart(lldb_private::plugin::dwarf::DWARFDIE &die, + const lldb_private::plugin::dwarf::DWARFDIE &parent_die, lldb_private::CompilerType &class_clang_type, const lldb::AccessType default_accesibility, lldb_private::ClangASTImporter::LayoutInfo &layout_info); @@ -398,9 +420,8 @@ private: /// Some attributes are relevant for all kinds of types (declaration), while /// others are only meaningful to a specific type (is_virtual) struct ParsedDWARFTypeAttributes { - typedef lldb_private::plugin::dwarf::DWARFDIE DWARFDIE; - - explicit ParsedDWARFTypeAttributes(const DWARFDIE &die); + explicit ParsedDWARFTypeAttributes( + const lldb_private::plugin::dwarf::DWARFDIE &die); lldb::AccessType accessibility = lldb::eAccessNone; bool is_artificial = false; @@ -417,7 +438,7 @@ struct ParsedDWARFTypeAttributes { const char *mangled_name = nullptr; lldb_private::ConstString name; lldb_private::Declaration decl; - DWARFDIE object_pointer; + lldb_private::plugin::dwarf::DWARFDIE object_pointer; lldb_private::plugin::dwarf::DWARFFormValue abstract_origin; lldb_private::plugin::dwarf::DWARFFormValue containing_type; lldb_private::plugin::dwarf::DWARFFormValue signature; diff --git a/lldb/source/Plugins/SymbolFile/DWARF/DebugNamesDWARFIndex.cpp b/lldb/source/Plugins/SymbolFile/DWARF/DebugNamesDWARFIndex.cpp index 6330470..90e42be 100644 --- a/lldb/source/Plugins/SymbolFile/DWARF/DebugNamesDWARFIndex.cpp +++ 
b/lldb/source/Plugins/SymbolFile/DWARF/DebugNamesDWARFIndex.cpp @@ -85,11 +85,6 @@ bool DebugNamesDWARFIndex::ProcessEntry( DWARFDIE die = GetDIE(entry); if (!die) return true; - // Clang erroneously emits index entries for declaration DIEs in case when the - // definition is in a type unit (llvm.org/pr77696). Weed those out. - if (die.IsStructUnionOrClass() && - die.GetAttributeValueAsUnsigned(DW_AT_declaration, 0)) - return true; return callback(die); } diff --git a/lldb/source/Plugins/SymbolFile/DWARF/SymbolFileDWARF.cpp b/lldb/source/Plugins/SymbolFile/DWARF/SymbolFileDWARF.cpp index 661e4a7..a52a7d6 100644 --- a/lldb/source/Plugins/SymbolFile/DWARF/SymbolFileDWARF.cpp +++ b/lldb/source/Plugins/SymbolFile/DWARF/SymbolFileDWARF.cpp @@ -481,13 +481,6 @@ static ConstString GetDWARFMachOSegmentName() { return g_dwarf_section_name; } -llvm::DenseMap<lldb::opaque_compiler_type_t, DIERef> & -SymbolFileDWARF::GetForwardDeclCompilerTypeToDIE() { - if (SymbolFileDWARFDebugMap *debug_map_symfile = GetDebugMapSymfile()) - return debug_map_symfile->GetForwardDeclCompilerTypeToDIE(); - return m_forward_decl_compiler_type_to_die; -} - UniqueDWARFASTTypeMap &SymbolFileDWARF::GetUniqueDWARFASTTypeMap() { SymbolFileDWARFDebugMap *debug_map_symfile = GetDebugMapSymfile(); if (debug_map_symfile) @@ -1639,33 +1632,27 @@ bool SymbolFileDWARF::CompleteType(CompilerType &compiler_type) { return true; } - // Once we start resolving this type, remove it from the forward - // declaration map in case anyone's child members or other types require this - // type to get resolved. 
- DWARFDIE dwarf_die = GetDIE(die_it->second); - GetForwardDeclCompilerTypeToDIE().erase(die_it); - Type *type = nullptr; - if (DWARFASTParser *dwarf_ast = GetDWARFParser(*dwarf_die.GetCU())) - type = dwarf_ast->FindDefinitionTypeForDIE(dwarf_die); - if (!type) - return false; - - die_it = GetForwardDeclCompilerTypeToDIE().find( - compiler_type_no_qualifiers.GetOpaqueQualType()); - if (die_it != GetForwardDeclCompilerTypeToDIE().end()) { - dwarf_die = GetDIE(die_it->getSecond()); + DWARFDIE dwarf_die = GetDIE(die_it->getSecond()); + if (dwarf_die) { + // Once we start resolving this type, remove it from the forward + // declaration map in case anyone child members or other types require this + // type to get resolved. The type will get resolved when all of the calls + // to SymbolFileDWARF::ResolveClangOpaqueTypeDefinition are done. GetForwardDeclCompilerTypeToDIE().erase(die_it); - } - if (Log *log = GetLog(DWARFLog::DebugInfo | DWARFLog::TypeCompletion)) - GetObjectFile()->GetModule()->LogMessageVerboseBacktrace( - log, "{0:x8}: {1} ({2}) '{3}' resolving forward declaration...", - dwarf_die.GetID(), DW_TAG_value_to_name(dwarf_die.Tag()), - dwarf_die.Tag(), type->GetName().AsCString()); - assert(compiler_type); - if (DWARFASTParser *dwarf_ast = GetDWARFParser(*dwarf_die.GetCU())) - return dwarf_ast->CompleteTypeFromDWARF(dwarf_die, type, compiler_type); - return true; + Type *type = GetDIEToType().lookup(dwarf_die.GetDIE()); + + Log *log = GetLog(DWARFLog::DebugInfo | DWARFLog::TypeCompletion); + if (log) + GetObjectFile()->GetModule()->LogMessageVerboseBacktrace( + log, "{0:x8}: {1} ({2}) '{3}' resolving forward declaration...", + dwarf_die.GetID(), DW_TAG_value_to_name(dwarf_die.Tag()), + dwarf_die.Tag(), type->GetName().AsCString()); + assert(compiler_type); + if (DWARFASTParser *dwarf_ast = GetDWARFParser(*dwarf_die.GetCU())) + return dwarf_ast->CompleteTypeFromDWARF(dwarf_die, type, compiler_type); + } + return false; } Type 
*SymbolFileDWARF::ResolveType(const DWARFDIE &die, @@ -2105,16 +2092,14 @@ SymbolFileDWARF::GlobalVariableMap &SymbolFileDWARF::GetGlobalAranges() { if (var_sp && !var_sp->GetLocationIsConstantValueData()) { const DWARFExpressionList &location = var_sp->LocationExpressionList(); - Value location_result; - Status error; ExecutionContext exe_ctx; - if (location.Evaluate(&exe_ctx, nullptr, LLDB_INVALID_ADDRESS, - nullptr, nullptr, location_result, - &error)) { - if (location_result.GetValueType() == + llvm::Expected<Value> location_result = location.Evaluate( + &exe_ctx, nullptr, LLDB_INVALID_ADDRESS, nullptr, nullptr); + if (location_result) { + if (location_result->GetValueType() == Value::ValueType::FileAddress) { lldb::addr_t file_addr = - location_result.GetScalar().ULongLong(); + location_result->GetScalar().ULongLong(); lldb::addr_t byte_size = 1; if (var_sp->GetType()) byte_size = @@ -2122,6 +2107,10 @@ SymbolFileDWARF::GlobalVariableMap &SymbolFileDWARF::GetGlobalAranges() { m_global_aranges_up->Append(GlobalVariableMap::Entry( file_addr, byte_size, var_sp.get())); } + } else { + LLDB_LOG_ERROR(GetLog(LLDBLog::Symbols), + location_result.takeError(), + "location expression failed to execute: {0}"); } } } diff --git a/lldb/source/Plugins/SymbolFile/DWARF/SymbolFileDWARF.h b/lldb/source/Plugins/SymbolFile/DWARF/SymbolFileDWARF.h index 35893f2..7282c08 100644 --- a/lldb/source/Plugins/SymbolFile/DWARF/SymbolFileDWARF.h +++ b/lldb/source/Plugins/SymbolFile/DWARF/SymbolFileDWARF.h @@ -335,8 +335,12 @@ public: virtual DIEToTypePtr &GetDIEToType() { return m_die_to_type; } - virtual llvm::DenseMap<lldb::opaque_compiler_type_t, DIERef> & - GetForwardDeclCompilerTypeToDIE(); + typedef llvm::DenseMap<lldb::opaque_compiler_type_t, DIERef> + CompilerTypeToDIE; + + virtual CompilerTypeToDIE &GetForwardDeclCompilerTypeToDIE() { + return m_forward_decl_compiler_type_to_die; + } typedef llvm::DenseMap<const DWARFDebugInfoEntry *, lldb::VariableSP> DIEToVariableSP; @@ -529,14 
+533,9 @@ protected: NameToOffsetMap m_function_scope_qualified_name_map; std::unique_ptr<DWARFDebugRanges> m_ranges; UniqueDWARFASTTypeMap m_unique_ast_type_map; - // A map from DIE to lldb_private::Type. For record type, the key might be - // either declaration DIE or definition DIE. DIEToTypePtr m_die_to_type; DIEToVariableSP m_die_to_variable_sp; - // A map from CompilerType to the struct/class/union/enum DIE (might be a - // declaration or a definition) that is used to construct it. - llvm::DenseMap<lldb::opaque_compiler_type_t, DIERef> - m_forward_decl_compiler_type_to_die; + CompilerTypeToDIE m_forward_decl_compiler_type_to_die; llvm::DenseMap<dw_offset_t, std::unique_ptr<SupportFileList>> m_type_unit_support_files; std::vector<uint32_t> m_lldb_cu_to_dwarf_unit; diff --git a/lldb/source/Plugins/SymbolFile/DWARF/SymbolFileDWARFDebugMap.h b/lldb/source/Plugins/SymbolFile/DWARF/SymbolFileDWARFDebugMap.h index d7d5719..de22dd6 100644 --- a/lldb/source/Plugins/SymbolFile/DWARF/SymbolFileDWARFDebugMap.h +++ b/lldb/source/Plugins/SymbolFile/DWARF/SymbolFileDWARFDebugMap.h @@ -284,11 +284,6 @@ protected: lldb::TypeSP FindCompleteObjCDefinitionTypeForDIE( const DWARFDIE &die, ConstString type_name, bool must_be_implementation); - llvm::DenseMap<lldb::opaque_compiler_type_t, DIERef> & - GetForwardDeclCompilerTypeToDIE() { - return m_forward_decl_compiler_type_to_die; - } - UniqueDWARFASTTypeMap &GetUniqueDWARFASTTypeMap() { return m_unique_ast_type_map; } @@ -326,10 +321,6 @@ protected: std::vector<uint32_t> m_func_indexes; // Sorted by address std::vector<uint32_t> m_glob_indexes; std::map<std::pair<ConstString, llvm::sys::TimePoint<>>, OSOInfoSP> m_oso_map; - // A map from CompilerType to the struct/class/union/enum DIE (might be a - // declaration or a definition) that is used to construct it. 
- llvm::DenseMap<lldb::opaque_compiler_type_t, DIERef> - m_forward_decl_compiler_type_to_die; UniqueDWARFASTTypeMap m_unique_ast_type_map; LazyBool m_supports_DW_AT_APPLE_objc_complete_type; DebugMap m_debug_map; diff --git a/lldb/source/Plugins/SymbolFile/DWARF/SymbolFileDWARFDwo.cpp b/lldb/source/Plugins/SymbolFile/DWARF/SymbolFileDWARFDwo.cpp index e4db39c..71c9997 100644 --- a/lldb/source/Plugins/SymbolFile/DWARF/SymbolFileDWARFDwo.cpp +++ b/lldb/source/Plugins/SymbolFile/DWARF/SymbolFileDWARFDwo.cpp @@ -110,7 +110,7 @@ SymbolFileDWARF::DIEToVariableSP &SymbolFileDWARFDwo::GetDIEToVariable() { return GetBaseSymbolFile().GetDIEToVariable(); } -llvm::DenseMap<lldb::opaque_compiler_type_t, DIERef> & +SymbolFileDWARF::CompilerTypeToDIE & SymbolFileDWARFDwo::GetForwardDeclCompilerTypeToDIE() { return GetBaseSymbolFile().GetForwardDeclCompilerTypeToDIE(); } diff --git a/lldb/source/Plugins/SymbolFile/DWARF/SymbolFileDWARFDwo.h b/lldb/source/Plugins/SymbolFile/DWARF/SymbolFileDWARFDwo.h index 2f0ac41..1500540 100644 --- a/lldb/source/Plugins/SymbolFile/DWARF/SymbolFileDWARFDwo.h +++ b/lldb/source/Plugins/SymbolFile/DWARF/SymbolFileDWARFDwo.h @@ -72,8 +72,7 @@ protected: DIEToVariableSP &GetDIEToVariable() override; - llvm::DenseMap<lldb::opaque_compiler_type_t, DIERef> & - GetForwardDeclCompilerTypeToDIE() override; + CompilerTypeToDIE &GetForwardDeclCompilerTypeToDIE() override; UniqueDWARFASTTypeMap &GetUniqueDWARFASTTypeMap() override; diff --git a/lldb/source/Plugins/SymbolFile/DWARF/UniqueDWARFASTType.cpp b/lldb/source/Plugins/SymbolFile/DWARF/UniqueDWARFASTType.cpp index 3d201e9..223518f 100644 --- a/lldb/source/Plugins/SymbolFile/DWARF/UniqueDWARFASTType.cpp +++ b/lldb/source/Plugins/SymbolFile/DWARF/UniqueDWARFASTType.cpp @@ -13,75 +13,66 @@ using namespace lldb_private::dwarf; using namespace lldb_private::plugin::dwarf; -static bool IsStructOrClassTag(llvm::dwarf::Tag Tag) { - return Tag == llvm::dwarf::Tag::DW_TAG_class_type || - Tag == 
llvm::dwarf::Tag::DW_TAG_structure_type; -} - -UniqueDWARFASTType *UniqueDWARFASTTypeList::Find( - const DWARFDIE &die, const lldb_private::Declaration &decl, - const int32_t byte_size, bool is_forward_declaration) { - for (UniqueDWARFASTType &udt : m_collection) { +bool UniqueDWARFASTTypeList::Find(const DWARFDIE &die, + const lldb_private::Declaration &decl, + const int32_t byte_size, + UniqueDWARFASTType &entry) const { + for (const UniqueDWARFASTType &udt : m_collection) { // Make sure the tags match - if (udt.m_die.Tag() == die.Tag() || (IsStructOrClassTag(udt.m_die.Tag()) && - IsStructOrClassTag(die.Tag()))) { - // If they are not both definition DIEs or both declaration DIEs, then - // don't check for byte size and declaration location, because declaration - // DIEs usually don't have those info. - bool matching_size_declaration = - udt.m_is_forward_declaration != is_forward_declaration - ? true - : (udt.m_byte_size < 0 || byte_size < 0 || - udt.m_byte_size == byte_size) && - udt.m_declaration == decl; - if (!matching_size_declaration) - continue; - // The type has the same name, and was defined on the same file and - // line. Now verify all of the parent DIEs match. - DWARFDIE parent_arg_die = die.GetParent(); - DWARFDIE parent_pos_die = udt.m_die.GetParent(); - bool match = true; - bool done = false; - while (!done && match && parent_arg_die && parent_pos_die) { - const dw_tag_t parent_arg_tag = parent_arg_die.Tag(); - const dw_tag_t parent_pos_tag = parent_pos_die.Tag(); - if (parent_arg_tag == parent_pos_tag || - (IsStructOrClassTag(parent_arg_tag) && - IsStructOrClassTag(parent_pos_tag))) { - switch (parent_arg_tag) { - case DW_TAG_class_type: - case DW_TAG_structure_type: - case DW_TAG_union_type: - case DW_TAG_namespace: { - const char *parent_arg_die_name = parent_arg_die.GetName(); - if (parent_arg_die_name == nullptr) { - // Anonymous (i.e. 
no-name) struct - match = false; - } else { - const char *parent_pos_die_name = parent_pos_die.GetName(); - if (parent_pos_die_name == nullptr || - ((parent_arg_die_name != parent_pos_die_name) && - strcmp(parent_arg_die_name, parent_pos_die_name))) - match = false; + if (udt.m_die.Tag() == die.Tag()) { + // Validate byte sizes of both types only if both are valid. + if (udt.m_byte_size < 0 || byte_size < 0 || + udt.m_byte_size == byte_size) { + // Make sure the file and line match + if (udt.m_declaration == decl) { + // The type has the same name, and was defined on the same file and + // line. Now verify all of the parent DIEs match. + DWARFDIE parent_arg_die = die.GetParent(); + DWARFDIE parent_pos_die = udt.m_die.GetParent(); + bool match = true; + bool done = false; + while (!done && match && parent_arg_die && parent_pos_die) { + const dw_tag_t parent_arg_tag = parent_arg_die.Tag(); + const dw_tag_t parent_pos_tag = parent_pos_die.Tag(); + if (parent_arg_tag == parent_pos_tag) { + switch (parent_arg_tag) { + case DW_TAG_class_type: + case DW_TAG_structure_type: + case DW_TAG_union_type: + case DW_TAG_namespace: { + const char *parent_arg_die_name = parent_arg_die.GetName(); + if (parent_arg_die_name == + nullptr) // Anonymous (i.e. 
no-name) struct + { + match = false; + } else { + const char *parent_pos_die_name = parent_pos_die.GetName(); + if (parent_pos_die_name == nullptr || + ((parent_arg_die_name != parent_pos_die_name) && + strcmp(parent_arg_die_name, parent_pos_die_name))) + match = false; + } + } break; + + case DW_TAG_compile_unit: + case DW_TAG_partial_unit: + done = true; + break; + default: + break; + } } - } break; + parent_arg_die = parent_arg_die.GetParent(); + parent_pos_die = parent_pos_die.GetParent(); + } - case DW_TAG_compile_unit: - case DW_TAG_partial_unit: - done = true; - break; - default: - break; + if (match) { + entry = udt; + return true; } } - parent_arg_die = parent_arg_die.GetParent(); - parent_pos_die = parent_pos_die.GetParent(); - } - - if (match) { - return &udt; } } } - return nullptr; + return false; } diff --git a/lldb/source/Plugins/SymbolFile/DWARF/UniqueDWARFASTType.h b/lldb/source/Plugins/SymbolFile/DWARF/UniqueDWARFASTType.h index 29e5c02..bf3cbae 100644 --- a/lldb/source/Plugins/SymbolFile/DWARF/UniqueDWARFASTType.h +++ b/lldb/source/Plugins/SymbolFile/DWARF/UniqueDWARFASTType.h @@ -23,19 +23,31 @@ public: // Constructors and Destructors UniqueDWARFASTType() : m_type_sp(), m_die(), m_declaration() {} + UniqueDWARFASTType(lldb::TypeSP &type_sp, const DWARFDIE &die, + const Declaration &decl, int32_t byte_size) + : m_type_sp(type_sp), m_die(die), m_declaration(decl), + m_byte_size(byte_size) {} + UniqueDWARFASTType(const UniqueDWARFASTType &rhs) : m_type_sp(rhs.m_type_sp), m_die(rhs.m_die), - m_declaration(rhs.m_declaration), m_byte_size(rhs.m_byte_size), - m_is_forward_declaration(rhs.m_is_forward_declaration) {} + m_declaration(rhs.m_declaration), m_byte_size(rhs.m_byte_size) {} ~UniqueDWARFASTType() = default; + UniqueDWARFASTType &operator=(const UniqueDWARFASTType &rhs) { + if (this != &rhs) { + m_type_sp = rhs.m_type_sp; + m_die = rhs.m_die; + m_declaration = rhs.m_declaration; + m_byte_size = rhs.m_byte_size; + } + return *this; + } + 
lldb::TypeSP m_type_sp; DWARFDIE m_die; Declaration m_declaration; int32_t m_byte_size = -1; - // True if the m_die is a forward declaration DIE. - bool m_is_forward_declaration = true; }; class UniqueDWARFASTTypeList { @@ -50,9 +62,8 @@ public: m_collection.push_back(entry); } - UniqueDWARFASTType *Find(const DWARFDIE &die, const Declaration &decl, - const int32_t byte_size, - bool is_forward_declaration); + bool Find(const DWARFDIE &die, const Declaration &decl, + const int32_t byte_size, UniqueDWARFASTType &entry) const; protected: typedef std::vector<UniqueDWARFASTType> collection; @@ -69,15 +80,14 @@ public: m_collection[name.GetCString()].Append(entry); } - UniqueDWARFASTType *Find(ConstString name, const DWARFDIE &die, - const Declaration &decl, const int32_t byte_size, - bool is_forward_declaration) { + bool Find(ConstString name, const DWARFDIE &die, const Declaration &decl, + const int32_t byte_size, UniqueDWARFASTType &entry) const { const char *unique_name_cstr = name.GetCString(); - collection::iterator pos = m_collection.find(unique_name_cstr); + collection::const_iterator pos = m_collection.find(unique_name_cstr); if (pos != m_collection.end()) { - return pos->second.Find(die, decl, byte_size, is_forward_declaration); + return pos->second.Find(die, decl, byte_size, entry); } - return nullptr; + return false; } protected: diff --git a/lldb/source/Symbol/Function.cpp b/lldb/source/Symbol/Function.cpp index 194f89b..96d8322 100644 --- a/lldb/source/Symbol/Function.cpp +++ b/lldb/source/Symbol/Function.cpp @@ -220,17 +220,18 @@ Function *IndirectCallEdge::GetCallee(ModuleList &images, ExecutionContext &exe_ctx) { Log *log = GetLog(LLDBLog::Step); Status error; - Value callee_addr_val; - if (!call_target.Evaluate( - &exe_ctx, exe_ctx.GetRegisterContext(), LLDB_INVALID_ADDRESS, - /*initial_value_ptr=*/nullptr, - /*object_address_ptr=*/nullptr, callee_addr_val, &error)) { - LLDB_LOGF(log, "IndirectCallEdge: Could not evaluate expression: %s", - 
error.AsCString()); + llvm::Expected<Value> callee_addr_val = call_target.Evaluate( + &exe_ctx, exe_ctx.GetRegisterContext(), LLDB_INVALID_ADDRESS, + /*initial_value_ptr=*/nullptr, + /*object_address_ptr=*/nullptr); + if (!callee_addr_val) { + LLDB_LOG_ERROR(log, callee_addr_val.takeError(), + "IndirectCallEdge: Could not evaluate expression: {0}"); return nullptr; } - addr_t raw_addr = callee_addr_val.GetScalar().ULongLong(LLDB_INVALID_ADDRESS); + addr_t raw_addr = + callee_addr_val->GetScalar().ULongLong(LLDB_INVALID_ADDRESS); if (raw_addr == LLDB_INVALID_ADDRESS) { LLDB_LOG(log, "IndirectCallEdge: Could not extract address from scalar"); return nullptr; diff --git a/lldb/source/Target/RegisterContextUnwind.cpp b/lldb/source/Target/RegisterContextUnwind.cpp index e2d712c..95e8abd 100644 --- a/lldb/source/Target/RegisterContextUnwind.cpp +++ b/lldb/source/Target/RegisterContextUnwind.cpp @@ -1661,12 +1661,14 @@ RegisterContextUnwind::SavedLocationForRegister( unwindplan_registerkind); Value cfa_val = Scalar(m_cfa); cfa_val.SetValueType(Value::ValueType::LoadAddress); - Value result; - Status error; - if (dwarfexpr.Evaluate(&exe_ctx, this, 0, &cfa_val, nullptr, result, - &error)) { + llvm::Expected<Value> result = + dwarfexpr.Evaluate(&exe_ctx, this, 0, &cfa_val, nullptr); + if (!result) { + LLDB_LOG_ERROR(log, result.takeError(), + "DWARF expression failed to evaluate: {0}"); + } else { addr_t val; - val = result.GetScalar().ULongLong(); + val = result->GetScalar().ULongLong(); if (unwindplan_regloc.IsDWARFExpression()) { regloc.type = UnwindLLDB::RegisterLocation::eRegisterValueInferred; regloc.location.inferred_value = val; @@ -2029,11 +2031,10 @@ bool RegisterContextUnwind::ReadFrameAddress( DWARFExpressionList dwarfexpr(opcode_ctx, dwarfdata, nullptr); dwarfexpr.GetMutableExpressionAtAddress()->SetRegisterKind( row_register_kind); - Value result; - Status error; - if (dwarfexpr.Evaluate(&exe_ctx, this, 0, nullptr, nullptr, result, - &error)) { - address = 
result.GetScalar().ULongLong(); + llvm::Expected<Value> result = + dwarfexpr.Evaluate(&exe_ctx, this, 0, nullptr, nullptr); + if (result) { + address = result->GetScalar().ULongLong(); if (ABISP abi_sp = m_thread.GetProcess()->GetABI()) address = abi_sp->FixCodeAddress(address); @@ -2042,7 +2043,7 @@ bool RegisterContextUnwind::ReadFrameAddress( return true; } UnwindLogMsg("Failed to set CFA value via DWARF expression: %s", - error.AsCString()); + llvm::toString(result.takeError()).c_str()); break; } case UnwindPlan::Row::FAValue::isRaSearch: { diff --git a/lldb/source/Target/StackFrame.cpp b/lldb/source/Target/StackFrame.cpp index 246871d..3a2b4d0 100644 --- a/lldb/source/Target/StackFrame.cpp +++ b/lldb/source/Target/StackFrame.cpp @@ -1091,24 +1091,19 @@ bool StackFrame::GetFrameBaseValue(Scalar &frame_base, Status *error_ptr) { m_flags.Set(GOT_FRAME_BASE); ExecutionContext exe_ctx(shared_from_this()); - Value expr_value; addr_t loclist_base_addr = LLDB_INVALID_ADDRESS; if (!m_sc.function->GetFrameBaseExpression().IsAlwaysValidSingleExpr()) loclist_base_addr = m_sc.function->GetAddressRange().GetBaseAddress().GetLoadAddress( exe_ctx.GetTargetPtr()); - if (!m_sc.function->GetFrameBaseExpression().Evaluate( - &exe_ctx, nullptr, loclist_base_addr, nullptr, nullptr, - expr_value, &m_frame_base_error)) { - // We should really have an error if evaluate returns, but in case we - // don't, lets set the error to something at least. 
- if (m_frame_base_error.Success()) - m_frame_base_error.SetErrorString( - "Evaluation of the frame base expression failed."); - } else { - m_frame_base = expr_value.ResolveValue(&exe_ctx); - } + llvm::Expected<Value> expr_value = + m_sc.function->GetFrameBaseExpression().Evaluate( + &exe_ctx, nullptr, loclist_base_addr, nullptr, nullptr); + if (!expr_value) + m_frame_base_error = expr_value.takeError(); + else + m_frame_base = expr_value->ResolveValue(&exe_ctx); } else { m_frame_base_error.SetErrorString("No function in symbol context."); } diff --git a/lldb/source/Target/Statistics.cpp b/lldb/source/Target/Statistics.cpp index be08485..2a53000 100644 --- a/lldb/source/Target/Statistics.cpp +++ b/lldb/source/Target/Statistics.cpp @@ -355,14 +355,14 @@ llvm::json::Value DebuggerStats::ReportStatistics( } global_stats.try_emplace("targets", std::move(json_targets)); + ConstStringStats const_string_stats; + json::Object json_memory{ + {"strings", const_string_stats.ToJSON()}, + }; + global_stats.try_emplace("memory", std::move(json_memory)); if (!summary_only) { - ConstStringStats const_string_stats; - json::Object json_memory{ - {"strings", const_string_stats.ToJSON()}, - }; json::Value cmd_stats = debugger.GetCommandInterpreter().GetStatistics(); global_stats.try_emplace("modules", std::move(json_modules)); - global_stats.try_emplace("memory", std::move(json_memory)); global_stats.try_emplace("commands", std::move(cmd_stats)); } diff --git a/lldb/source/Target/ThreadPlanStepOverRange.cpp b/lldb/source/Target/ThreadPlanStepOverRange.cpp index 84f282f..3fe02e0 100644 --- a/lldb/source/Target/ThreadPlanStepOverRange.cpp +++ b/lldb/source/Target/ThreadPlanStepOverRange.cpp @@ -355,7 +355,7 @@ bool ThreadPlanStepOverRange::DoPlanExplainsStop(Event *event_ptr) { return_value = NextRangeBreakpointExplainsStop(stop_info_sp); } else { if (log) - log->PutCString("ThreadPlanStepInRange got asked if it explains the " + log->PutCString("ThreadPlanStepOverRange got asked if it 
explains the " "stop for some reason other than step."); return_value = false; } diff --git a/lldb/test/API/functionalities/param_entry_vals/basic_entry_values/main.cpp b/lldb/test/API/functionalities/param_entry_vals/basic_entry_values/main.cpp index 91769e8..7ad72b4 100644 --- a/lldb/test/API/functionalities/param_entry_vals/basic_entry_values/main.cpp +++ b/lldb/test/API/functionalities/param_entry_vals/basic_entry_values/main.cpp @@ -52,9 +52,10 @@ __attribute__((noinline)) void func4_amb(int &sink, int x) { //% expect_cmd_failure=True) //% self.filecheck("expr sink", "main.cpp","-check-prefix=FUNC4-EXPR", //% expect_cmd_failure=True) - // FUNC4-EXPR-FAIL: couldn't get the value of variable x: Could not evaluate - // DW_OP_entry_value. FUNC4-EXPR: couldn't get the value of variable sink: - // Could not evaluate DW_OP_entry_value. + // clang-format off + // FUNC4-EXPR-FAIL: couldn't get the value of variable x: could not evaluate DW_OP_entry_value: no matching call site param found + // FUNC4-EXPR: couldn't get the value of variable sink: could not evaluate DW_OP_entry_value: no matching call site param found + // clang-format on } __attribute__((noinline)) void func5_amb() {} diff --git a/lldb/test/API/lang/c/inlines/Makefile b/lldb/test/API/lang/c/inlines/Makefile new file mode 100644 index 0000000..f9555f9 --- /dev/null +++ b/lldb/test/API/lang/c/inlines/Makefile @@ -0,0 +1,3 @@ +C_SOURCES := main.c + +include Makefile.rules diff --git a/lldb/test/API/lang/c/inlines/TestRedefinitionsInInlines.py b/lldb/test/API/lang/c/inlines/TestRedefinitionsInInlines.py index 024b9da..062fd88 100644 --- a/lldb/test/API/lang/c/inlines/TestRedefinitionsInInlines.py +++ b/lldb/test/API/lang/c/inlines/TestRedefinitionsInInlines.py @@ -1,14 +1,60 @@ -from lldbsuite.test import lldbinline -from lldbsuite.test import decorators - -lldbinline.MakeInlineTest( - __file__, - globals(), - [ - decorators.expectedFailureAll( - compiler="clang", - compiler_version=["<", "3.5"], - 
bugnumber="llvm.org/pr27845", +"""Test that inlined argument variables have their correct location in debuginfo""" + +import lldb +from lldbsuite.test.decorators import * +from lldbsuite.test.lldbtest import * +from lldbsuite.test import lldbutil + + +class TestRedefinitionsInInlines(TestBase): + # https://github.com/llvm/llvm-project/issues/28219 + @skipIf(compiler="clang", compiler_version=["<", "3.5"]) + def test(self): + self.source = "main.c" + self.build() + (target, process, thread, bp1) = lldbutil.run_to_source_breakpoint( + self, "first breakpoint", lldb.SBFileSpec(self.source, False) + ) + + bp2 = target.BreakpointCreateBySourceRegex( + "second breakpoint", lldb.SBFileSpec(self.source, False) + ) + bp3 = target.BreakpointCreateBySourceRegex( + "third breakpoint", lldb.SBFileSpec(self.source, False) ) - ], -) + + # When called from main(), test2 is passed in the value of 42 in 'b' + self.expect("expression b", DATA_TYPES_DISPLAYED_CORRECTLY, substrs=["42"]) + + process.Continue() + + self.assertState(process.GetState(), lldb.eStateStopped) + thread = lldbutil.get_stopped_thread(process, lldb.eStopReasonBreakpoint) + self.assertIsNotNone(thread) + bp_id = thread.GetStopReasonDataAtIndex(0) + self.assertEqual(bp_id, bp2.GetID()) + + self.expect("expression b", DATA_TYPES_DISPLAYED_CORRECTLY, substrs=["42"]) + self.expect("expression c", DATA_TYPES_DISPLAYED_CORRECTLY, substrs=["84"]) + + process.Continue() + + # Now we're in test1(), and the first thing it does is call test2(24). "Step in" + # and check that we have the value 24 as the argument. 
+ self.assertState(process.GetState(), lldb.eStateStopped) + thread = lldbutil.get_stopped_thread(process, lldb.eStopReasonBreakpoint) + self.assertIsNotNone(thread) + bp_id = thread.GetStopReasonDataAtIndex(0) + self.assertEqual(bp_id, bp3.GetID()) + + frame = thread.GetFrameAtIndex(0) + self.assertTrue(frame.IsInlined()) + self.assertEqual(frame.GetFunctionName(), "test1") + + thread.StepInto() + + frame = thread.GetFrameAtIndex(0) + self.assertTrue(frame.IsInlined()) + self.assertEqual(frame.GetFunctionName(), "test2") + + self.expect("expression b", DATA_TYPES_DISPLAYED_CORRECTLY, substrs=["24"]) diff --git a/lldb/test/API/lang/c/inlines/main.c b/lldb/test/API/lang/c/inlines/main.c index 8fe4918..6ecc04d 100644 --- a/lldb/test/API/lang/c/inlines/main.c +++ b/lldb/test/API/lang/c/inlines/main.c @@ -3,23 +3,22 @@ inline void test1(int) __attribute__ ((always_inline)); inline void test2(int) __attribute__ ((always_inline)); +// Called once from main with b==42 then called from test1 with b==24. 
void test2(int b) { - printf("test2(%d)\n", b); //% self.expect("expression b", DATA_TYPES_DISPLAYED_CORRECTLY, substrs = ["42"]) - { - int c = b * 2; - printf("c=%d\n", c); //% self.expect("expression b", DATA_TYPES_DISPLAYED_CORRECTLY, substrs = ["42"]) - //% self.expect("expression c", DATA_TYPES_DISPLAYED_CORRECTLY, substrs = ["84"]) - } + printf("test2(%d)\n", b); // first breakpoint + { + int c = b * 2; + printf("c=%d\n", c); // second breakpoint + } } void test1(int a) { printf("test1(%d)\n", a); - test2(a+1);//% self.runCmd("step") - //% self.expect("expression b", DATA_TYPES_DISPLAYED_CORRECTLY, substrs = ["24"]) + test2(a + 1); // third breakpoint } -int main() { - test2(42); - test1(23); - return 0; +int main(int argc) { + test2(42); + test1(23); + return 0; } diff --git a/lldb/test/API/tools/lldb-dap/optimized/TestDAP_optimized.py b/lldb/test/API/tools/lldb-dap/optimized/TestDAP_optimized.py index dc7f4f9..42a95de 100644 --- a/lldb/test/API/tools/lldb-dap/optimized/TestDAP_optimized.py +++ b/lldb/test/API/tools/lldb-dap/optimized/TestDAP_optimized.py @@ -47,6 +47,6 @@ class TestDAP_optimized(lldbdap_testcase.DAPTestCaseBase): self.assertTrue(optimized_variable["value"].startswith("<error:")) error_msg = optimized_variable["$__lldb_extensions"]["error"] self.assertTrue( - ("Could not evaluate DW_OP_entry_value" in error_msg) + ("could not evaluate DW_OP_entry_value: no parent function" in error_msg) or ("variable not available" in error_msg) ) diff --git a/lldb/test/API/tools/lldb-server/TestPtyServer.py b/lldb/test/API/tools/lldb-server/TestPtyServer.py index 4bfcf70..345f68f 100644 --- a/lldb/test/API/tools/lldb-server/TestPtyServer.py +++ b/lldb/test/API/tools/lldb-server/TestPtyServer.py @@ -7,6 +7,7 @@ from lldbgdbserverutils import * import xml.etree.ElementTree as ET +@skipIfRemote @skipIf(hostoslist=["windows"]) class PtyServerTestCase(gdbremote_testcase.GdbRemoteTestCaseBase): def setUp(self): diff --git 
a/lldb/test/Shell/ScriptInterpreter/Python/Crashlog/app_specific_backtrace_crashlog.test b/lldb/test/Shell/ScriptInterpreter/Python/Crashlog/app_specific_backtrace_crashlog.test index c57cefd..9c0510c 100644 --- a/lldb/test/Shell/ScriptInterpreter/Python/Crashlog/app_specific_backtrace_crashlog.test +++ b/lldb/test/Shell/ScriptInterpreter/Python/Crashlog/app_specific_backtrace_crashlog.test @@ -1,9 +1,9 @@ -# REQUIRES: python, native && target-aarch64 && system-darwin +# REQUIRES: python, native && system-darwin # RUN: mkdir -p %t.dir # RUN: yaml2obj %S/Inputs/application_specific_info/asi.yaml > %t.dir/asi # RUN: %lldb -o 'command script import lldb.macosx.crashlog' \ -# RUN: -o 'crashlog -a -i -t %t.dir/asi %S/Inputs/application_specific_info/asi.txt' \ +# RUN: -o 'crashlog -i -t %t.dir/asi %S/Inputs/application_specific_info/asi.txt' \ # RUN: -o "thread list" -o "bt all" 2>&1 | FileCheck %s # CHECK: "crashlog" {{.*}} commands have been installed, use the "--help" options on these commands diff --git a/lldb/test/Shell/ScriptInterpreter/Python/Crashlog/interactive_crashlog_invalid_target.test b/lldb/test/Shell/ScriptInterpreter/Python/Crashlog/interactive_crashlog_invalid_target.test index abd1e7c..eb1f5f4 100644 --- a/lldb/test/Shell/ScriptInterpreter/Python/Crashlog/interactive_crashlog_invalid_target.test +++ b/lldb/test/Shell/ScriptInterpreter/Python/Crashlog/interactive_crashlog_invalid_target.test @@ -1,4 +1,4 @@ -# REQUIRES: python, native && target-aarch64 && system-darwin +# REQUIRES: python, native && system-darwin # RUN: %lldb -o 'command script import lldb.macosx.crashlog' \ # RUN: -o 'crashlog -V' \ diff --git a/lldb/test/Shell/ScriptInterpreter/Python/Crashlog/interactive_crashlog_json.test b/lldb/test/Shell/ScriptInterpreter/Python/Crashlog/interactive_crashlog_json.test index fccd71c..684be28 100644 --- a/lldb/test/Shell/ScriptInterpreter/Python/Crashlog/interactive_crashlog_json.test +++ 
b/lldb/test/Shell/ScriptInterpreter/Python/Crashlog/interactive_crashlog_json.test @@ -1,4 +1,4 @@ -# REQUIRES: python, native && target-aarch64 && system-darwin +# REQUIRES: python, native && system-darwin # RUN: mkdir -p %t.dir # RUN: yaml2obj %S/Inputs/interactive_crashlog/multithread-test.yaml > %t.dir/multithread-test diff --git a/lldb/test/Shell/ScriptInterpreter/Python/Crashlog/interactive_crashlog_legacy.test b/lldb/test/Shell/ScriptInterpreter/Python/Crashlog/interactive_crashlog_legacy.test index 6e2826e..271a4c2 100644 --- a/lldb/test/Shell/ScriptInterpreter/Python/Crashlog/interactive_crashlog_legacy.test +++ b/lldb/test/Shell/ScriptInterpreter/Python/Crashlog/interactive_crashlog_legacy.test @@ -1,4 +1,4 @@ -# REQUIRES: python, native && target-aarch64 && system-darwin +# REQUIRES: python, native && system-darwin # RUN: mkdir -p %t.dir # RUN: yaml2obj %S/Inputs/interactive_crashlog/multithread-test.yaml > %t.dir/multithread-test diff --git a/lldb/test/Shell/ScriptInterpreter/Python/Crashlog/last_exception_backtrace_crashlog.test b/lldb/test/Shell/ScriptInterpreter/Python/Crashlog/last_exception_backtrace_crashlog.test index c2f6196..a17b7ac 100644 --- a/lldb/test/Shell/ScriptInterpreter/Python/Crashlog/last_exception_backtrace_crashlog.test +++ b/lldb/test/Shell/ScriptInterpreter/Python/Crashlog/last_exception_backtrace_crashlog.test @@ -1,9 +1,9 @@ -# REQUIRES: python, native && target-aarch64 && system-darwin +# REQUIRES: python, native && system-darwin # RUN: mkdir -p %t.dir # RUN: yaml2obj %S/Inputs/application_specific_info/asi.yaml > %t.dir/asi # RUN: %lldb -o 'command script import lldb.macosx.crashlog' \ -# RUN: -o 'crashlog -a -i -t %t.dir/asi %S/Inputs/application_specific_info/leb.txt' \ +# RUN: -o 'crashlog -i -t %t.dir/asi %S/Inputs/application_specific_info/leb.txt' \ # RUN: -o "thread list" -o "bt all" 2>&1 | FileCheck %s # CHECK: "crashlog" {{.*}} commands have been installed, use the "--help" options on these commands diff --git 
a/lldb/test/Shell/ScriptInterpreter/Python/Crashlog/lit.local.cfg b/lldb/test/Shell/ScriptInterpreter/Python/Crashlog/lit.local.cfg index 4170696..b72b294 100644 --- a/lldb/test/Shell/ScriptInterpreter/Python/Crashlog/lit.local.cfg +++ b/lldb/test/Shell/ScriptInterpreter/Python/Crashlog/lit.local.cfg @@ -3,3 +3,8 @@ if 'system-darwin' not in config.available_features: if 'lldb-repro' in config.available_features: config.unsupported = True + +config.environment["LLDB_APPLE_DSYMFORUUID_EXECUTABLE"] = "" + +# Temporary parallel image loading deadlock workaround +config.environment["NO_PARALLEL_IMG_LOADING"] = "" diff --git a/lldb/test/Shell/ScriptInterpreter/Python/Crashlog/skipped_status_interactive_crashlog.test b/lldb/test/Shell/ScriptInterpreter/Python/Crashlog/skipped_status_interactive_crashlog.test index 81e0686..64cd090 100644 --- a/lldb/test/Shell/ScriptInterpreter/Python/Crashlog/skipped_status_interactive_crashlog.test +++ b/lldb/test/Shell/ScriptInterpreter/Python/Crashlog/skipped_status_interactive_crashlog.test @@ -1,4 +1,4 @@ -# REQUIRES: python, native && target-aarch64 && system-darwin +# REQUIRES: python, native && system-darwin # RUN: mkdir -p %t.dir # RUN: yaml2obj %S/Inputs/interactive_crashlog/multithread-test.yaml > %t.dir/multithread-test diff --git a/lldb/test/Shell/SymbolFile/DWARF/delayed-definition-die-searching.test b/lldb/test/Shell/SymbolFile/DWARF/delayed-definition-die-searching.test deleted file mode 100644 index d253981..0000000 --- a/lldb/test/Shell/SymbolFile/DWARF/delayed-definition-die-searching.test +++ /dev/null @@ -1,36 +0,0 @@ -# Test definition DIE searching is delayed until complete type is required. 
- -# UNSUPPORTED: system-windows - -# RUN: split-file %s %t -# RUN: %clangxx_host %t/main.cpp %t/t1_def.cpp -gdwarf -o %t.out -# RUN: %lldb -b %t.out -s %t/lldb.cmd | FileCheck %s - -# CHECK: (lldb) p v1 -# CHECK: DWARFASTParserClang::ParseTypeFromDWARF{{.*}}DW_TAG_structure_type (DW_TAG_structure_type) name = 't2<t1>' -# CHECK: DWARFASTParserClang::ParseTypeFromDWARF{{.*}}DW_TAG_structure_type (DW_TAG_structure_type) name = 't1' -# CHECK: DW_TAG_structure_type (DW_TAG_structure_type) 't2<t1>' resolving forward declaration... -# CHECK: (t2<t1>) {} -# CHECK: (lldb) p v2 -# CHECK: DWARFASTParserClang::ParseTypeFromDWARF{{.*}}DW_TAG_structure_type (DW_TAG_structure_type) name = 't1' -# CHECK: DW_TAG_structure_type (DW_TAG_structure_type) 't1' resolving forward declaration... - -#--- lldb.cmd -log enable dwarf comp -p v1 -p v2 - -#--- main.cpp -template<typename T> -struct t2 { -}; -struct t1; -t2<t1> v1; // this CU doesn't have definition DIE for t1, but only declaration DIE for it. -int main() { -} - -#--- t1_def.cpp -struct t1 { // this CU contains definition DIE for t1. - int x; -}; -t1 v2; diff --git a/lldb/test/Shell/SymbolFile/DWARF/x86/simple-template-names-context.cpp b/lldb/test/Shell/SymbolFile/DWARF/x86/simple-template-names-context.cpp new file mode 100644 index 0000000..a8a4d3b --- /dev/null +++ b/lldb/test/Shell/SymbolFile/DWARF/x86/simple-template-names-context.cpp @@ -0,0 +1,44 @@ +// Test that we can correctly resolve forward declared types when they only +// differ in the template arguments of the surrounding context. The reproducer +// is sensitive to the order of declarations, so we test in both directions. 
+ +// REQUIRES: lld + +// RUN: %clang --target=x86_64-pc-linux -c %s -o %t-a.o -g -gsimple-template-names -DFILE_A +// RUN: %clang --target=x86_64-pc-linux -c %s -o %t-b.o -g -gsimple-template-names -DFILE_B +// RUN: ld.lld %t-a.o %t-b.o -o %t +// RUN: %lldb %t -o "target variable --ptr-depth 1 --show-types both_a both_b" -o exit | FileCheck %s + +// CHECK: (lldb) target variable +// CHECK-NEXT: (ReferencesBoth<'A'>) both_a = { +// CHECK-NEXT: (Outer<'A'>::Inner *) a = 0x{{[0-9A-Fa-f]*}} {} +// CHECK-NEXT: (Outer<'A'>::Inner *) b = 0x{{[0-9A-Fa-f]*}} {} +// CHECK-NEXT: } +// CHECK-NEXT: (ReferencesBoth<'B'>) both_b = { +// CHECK-NEXT: (Outer<'A'>::Inner *) a = 0x{{[0-9A-Fa-f]*}} {} +// CHECK-NEXT: (Outer<'B'>::Inner *) b = 0x{{[0-9A-Fa-f]*}} {} +// CHECK-NEXT: } + +template<char C> +struct Outer { + struct Inner {}; +}; + +template<char C> +struct ReferencesBoth { + Outer<'A'>::Inner *a; + Outer<'B'>::Inner *b; +}; + +#ifdef FILE_A +Outer<'A'>::Inner inner_a; +extern Outer<'B'>::Inner inner_b; + +ReferencesBoth<'A'> both_a{&inner_a, &inner_b}; + +#else +extern Outer<'A'>::Inner inner_a; +Outer<'B'>::Inner inner_b; + +ReferencesBoth<'B'> both_b{&inner_a, &inner_b}; +#endif diff --git a/lldb/unittests/Expression/DWARFExpressionTest.cpp b/lldb/unittests/Expression/DWARFExpressionTest.cpp index 602bd19..f9e0605 100644 --- a/lldb/unittests/Expression/DWARFExpressionTest.cpp +++ b/lldb/unittests/Expression/DWARFExpressionTest.cpp @@ -33,23 +33,23 @@ static llvm::Expected<Scalar> Evaluate(llvm::ArrayRef<uint8_t> expr, ExecutionContext *exe_ctx = nullptr) { DataExtractor extractor(expr.data(), expr.size(), lldb::eByteOrderLittle, /*addr_size*/ 4); - Value result; - Status status; - if (!DWARFExpression::Evaluate(exe_ctx, /*reg_ctx*/ nullptr, module_sp, - extractor, unit, lldb::eRegisterKindLLDB, - /*initial_value_ptr*/ nullptr, - /*object_address_ptr*/ nullptr, result, - &status)) - return status.ToError(); - - switch (result.GetValueType()) { + + llvm::Expected<Value> 
result = + DWARFExpression::Evaluate(exe_ctx, /*reg_ctx*/ nullptr, module_sp, + extractor, unit, lldb::eRegisterKindLLDB, + /*initial_value_ptr*/ nullptr, + /*object_address_ptr*/ nullptr); + if (!result) + return result.takeError(); + + switch (result->GetValueType()) { case Value::ValueType::Scalar: - return result.GetScalar(); + return result->GetScalar(); case Value::ValueType::LoadAddress: return LLDB_INVALID_ADDRESS; case Value::ValueType::HostAddress: { // Convert small buffers to scalars to simplify the tests. - DataBufferHeap &buf = result.GetBuffer(); + DataBufferHeap &buf = result->GetBuffer(); if (buf.GetByteSize() <= 8) { uint64_t val = 0; memcpy(&val, buf.GetBytes(), buf.GetByteSize()); @@ -58,8 +58,9 @@ static llvm::Expected<Scalar> Evaluate(llvm::ArrayRef<uint8_t> expr, } [[fallthrough]]; default: - return status.ToError(); + break; } + return llvm::createStringError("unsupported value type"); } class DWARFExpressionTester : public YAMLModuleTester { @@ -454,16 +455,15 @@ TEST_F(DWARFExpressionMockProcessTest, WASM_DW_OP_addr) { uint8_t expr[] = {DW_OP_addr, 0x40, 0x0, 0x0, 0x0}; DataExtractor extractor(expr, sizeof(expr), lldb::eByteOrderLittle, /*addr_size*/ 4); - Value result; - Status status; - ASSERT_TRUE(DWARFExpression::Evaluate( + + llvm::Expected<Value> result = DWARFExpression::Evaluate( &exe_ctx, /*reg_ctx*/ nullptr, /*module_sp*/ {}, extractor, /*unit*/ nullptr, lldb::eRegisterKindLLDB, /*initial_value_ptr*/ nullptr, - /*object_address_ptr*/ nullptr, result, &status)) - << status.ToError(); + /*object_address_ptr*/ nullptr); - ASSERT_EQ(result.GetValueType(), Value::ValueType::LoadAddress); + ASSERT_THAT_EXPECTED(result, llvm::Succeeded()); + ASSERT_EQ(result->GetValueType(), Value::ValueType::LoadAddress); } TEST_F(DWARFExpressionMockProcessTest, WASM_DW_OP_addr_index) { @@ -530,14 +530,14 @@ DWARF: ExecutionContext exe_ctx(target_sp, false); - auto evaluate = [&](DWARFExpression &expr, Status &status, Value &result) { + auto evaluate = 
[&](DWARFExpression &expr) -> llvm::Expected<Value> { DataExtractor extractor; expr.GetExpressionData(extractor); - return DWARFExpression::Evaluate( - &exe_ctx, /*reg_ctx*/ nullptr, /*module_sp*/ {}, extractor, dwarf_cu, - lldb::eRegisterKindLLDB, - /*initial_value_ptr*/ nullptr, - /*object_address_ptr*/ nullptr, result, &status); + return DWARFExpression::Evaluate(&exe_ctx, /*reg_ctx*/ nullptr, + /*module_sp*/ {}, extractor, dwarf_cu, + lldb::eRegisterKindLLDB, + /*initial_value_ptr*/ nullptr, + /*object_address_ptr*/ nullptr); }; // DW_OP_addrx takes a single leb128 operand, the index in the addr table: @@ -546,16 +546,16 @@ DWARF: /*addr_size*/ 4); DWARFExpression expr(extractor); - Status status; - Value result; - ASSERT_TRUE(evaluate(expr, status, result)) << status.ToError(); - ASSERT_EQ(result.GetValueType(), Value::ValueType::LoadAddress); - ASSERT_EQ(result.GetScalar().UInt(), 0x5678u); + llvm::Expected<Value> result = evaluate(expr); + ASSERT_THAT_EXPECTED(result, llvm::Succeeded()); + ASSERT_EQ(result->GetValueType(), Value::ValueType::LoadAddress); + ASSERT_EQ(result->GetScalar().UInt(), 0x5678u); ASSERT_TRUE(expr.Update_DW_OP_addr(dwarf_cu, 0xdeadbeef)); - ASSERT_TRUE(evaluate(expr, status, result)) << status.ToError(); - ASSERT_EQ(result.GetValueType(), Value::ValueType::LoadAddress); - ASSERT_EQ(result.GetScalar().UInt(), 0xdeadbeefu); + result = evaluate(expr); + ASSERT_THAT_EXPECTED(result, llvm::Succeeded()); + ASSERT_EQ(result->GetValueType(), Value::ValueType::LoadAddress); + ASSERT_EQ(result->GetScalar().UInt(), 0xdeadbeefu); } class CustomSymbolFileDWARF : public SymbolFileDWARF { @@ -825,15 +825,13 @@ TEST_F(DWARFExpressionMockProcessTest, DW_OP_piece_file_addr) { DW_OP_addr, 0x50, 0x0, 0x0, 0x0, DW_OP_piece, 1}; DataExtractor extractor(expr, sizeof(expr), lldb::eByteOrderLittle, /*addr_size*/ 4); - Value result; - Status status; - ASSERT_TRUE(DWARFExpression::Evaluate( + llvm::Expected<Value> result = DWARFExpression::Evaluate( &exe_ctx, 
/*reg_ctx*/ nullptr, /*module_sp*/ {}, extractor, /*unit*/ nullptr, lldb::eRegisterKindLLDB, /*initial_value_ptr*/ nullptr, - /*object_address_ptr*/ nullptr, result, &status)) - << status.ToError(); + /*object_address_ptr*/ nullptr); - ASSERT_EQ(result.GetValueType(), Value::ValueType::HostAddress); - ASSERT_THAT(result.GetBuffer().GetData(), ElementsAre(0x11, 0x22)); + ASSERT_THAT_EXPECTED(result, llvm::Succeeded()); + ASSERT_EQ(result->GetValueType(), Value::ValueType::HostAddress); + ASSERT_THAT(result->GetBuffer().GetData(), ElementsAre(0x11, 0x22)); } diff --git a/llvm/docs/AMDGPUUsage.rst b/llvm/docs/AMDGPUUsage.rst index bb67510..aa50ce3 100644 --- a/llvm/docs/AMDGPUUsage.rst +++ b/llvm/docs/AMDGPUUsage.rst @@ -504,6 +504,13 @@ Every processor supports every OS ABI (see :ref:`amdgpu-os`) with the following work-item Add product IDs names. + ``gfx1152`` ``amdgcn`` APU - cumode - Architected *TBA* + - wavefrontsize64 flat + scratch .. TODO:: + - Packed + work-item Add product + IDs names. + ``gfx1200`` ``amdgcn`` dGPU - cumode - Architected *TBA* - wavefrontsize64 flat scratch .. TODO:: @@ -591,11 +598,13 @@ Generic processor code objects are versioned. See :ref:`amdgpu-generic-processor - ``gfx1102`` - Packed hazards specific to some targets - ``gfx1103`` work-item within this family. - ``gfx1150`` IDs - - ``gfx1151`` Not all VGPRs can be used on: + - ``gfx1151`` + - ``gfx1152`` Not all VGPRs can be used on: - ``gfx1100`` - ``gfx1101`` - ``gfx1151`` + - ``gfx1152`` SALU floating point instructions and single-use VGPR hint @@ -604,12 +613,14 @@ Generic processor code objects are versioned. See :ref:`amdgpu-generic-processor - ``gfx1150`` - ``gfx1151`` + - ``gfx1152`` SGPRs are not supported for src1 in dpp instructions for: - ``gfx1150`` - ``gfx1151`` + - ``gfx1152`` ``gfx12-generic`` ``amdgcn`` - ``gfx1200`` - wavefrontsize64 - Architected No restrictions. 
@@ -1979,7 +1990,7 @@ The AMDGPU backend uses the following ELF header: ``EF_AMDGPU_MACH_AMDGCN_GFX10_1_GENERIC`` 0x052 ``gfx10-1-generic`` ``EF_AMDGPU_MACH_AMDGCN_GFX10_3_GENERIC`` 0x053 ``gfx10-3-generic`` ``EF_AMDGPU_MACH_AMDGCN_GFX11_GENERIC`` 0x054 ``gfx11-generic`` - *reserved* 0x055 Reserved. + ``EF_AMDGPU_MACH_AMDGCN_GFX1152`` 0x055 ``gfx1152``. *reserved* 0x056 Reserved. *reserved* 0x057 Reserved. *reserved* 0x058 Reserved. diff --git a/llvm/docs/ReleaseNotes.rst b/llvm/docs/ReleaseNotes.rst index 393b97e..c4a8562 100644 --- a/llvm/docs/ReleaseNotes.rst +++ b/llvm/docs/ReleaseNotes.rst @@ -116,6 +116,10 @@ Changes to the Hexagon Backend Changes to the LoongArch Backend -------------------------------- +* i32 is now a native type in the datalayout string. This enables + LoopStrengthReduce for loops with i32 induction variables, among other + optimizations. + Changes to the MIPS Backend --------------------------- diff --git a/llvm/include/llvm/Analysis/CodeMetrics.h b/llvm/include/llvm/Analysis/CodeMetrics.h index a9431bc..d09018d 100644 --- a/llvm/include/llvm/Analysis/CodeMetrics.h +++ b/llvm/include/llvm/Analysis/CodeMetrics.h @@ -20,12 +20,15 @@ namespace llvm { class AssumptionCache; class BasicBlock; +class Instruction; class Loop; class Function; template <class T> class SmallPtrSetImpl; class TargetTransformInfo; class Value; +enum struct ConvergenceKind { None, Controlled, ExtendedLoop, Uncontrolled }; + /// Utility to calculate the size and a few similar metrics for a set /// of basic blocks. struct CodeMetrics { @@ -42,8 +45,8 @@ struct CodeMetrics { /// one or more 'noduplicate' instructions. bool notDuplicatable = false; - /// True if this function contains a call to a convergent function. - bool convergent = false; + /// The kind of convergence specified in this function. + ConvergenceKind Convergence = ConvergenceKind::None; /// True if this function calls alloca (in the C sense). 
bool usesDynamicAlloca = false; @@ -77,7 +80,7 @@ struct CodeMetrics { /// Add information about a block to the current state. void analyzeBasicBlock(const BasicBlock *BB, const TargetTransformInfo &TTI, const SmallPtrSetImpl<const Value *> &EphValues, - bool PrepareForLTO = false); + bool PrepareForLTO = false, const Loop *L = nullptr); /// Collect a loop's ephemeral values (those used only by an assume /// or similar intrinsics in the loop). diff --git a/llvm/include/llvm/Analysis/LoopInfo.h b/llvm/include/llvm/Analysis/LoopInfo.h index 5208463..4f06a7e 100644 --- a/llvm/include/llvm/Analysis/LoopInfo.h +++ b/llvm/include/llvm/Analysis/LoopInfo.h @@ -649,6 +649,9 @@ int getIntLoopAttribute(const Loop *TheLoop, StringRef Name, int Default = 0); std::optional<const MDOperand *> findStringMetadataForLoop(const Loop *TheLoop, StringRef Name); +/// Find the convergence heart of the loop. +CallBase *getLoopConvergenceHeart(const Loop *TheLoop); + /// Look for the loop attribute that requires progress within the loop. /// Note: Most consumers probably want "isMustProgress" which checks /// the containing function attribute too. 
diff --git a/llvm/include/llvm/Analysis/VecFuncs.def b/llvm/include/llvm/Analysis/VecFuncs.def index e12eb70..4021897 100644 --- a/llvm/include/llvm/Analysis/VecFuncs.def +++ b/llvm/include/llvm/Analysis/VecFuncs.def @@ -49,6 +49,7 @@ TLI_DEFINE_VECFUNC("llvm.sin.f32", "vsinf", FIXED(4), "_ZGV_LLVM_N4v") TLI_DEFINE_VECFUNC("cosf", "vcosf", FIXED(4), "_ZGV_LLVM_N4v") TLI_DEFINE_VECFUNC("llvm.cos.f32", "vcosf", FIXED(4), "_ZGV_LLVM_N4v") TLI_DEFINE_VECFUNC("tanf", "vtanf", FIXED(4), "_ZGV_LLVM_N4v") +TLI_DEFINE_VECFUNC("llvm.tan.f32", "vtanf", FIXED(4), "_ZGV_LLVM_N4v") TLI_DEFINE_VECFUNC("asinf", "vasinf", FIXED(4), "_ZGV_LLVM_N4v") TLI_DEFINE_VECFUNC("acosf", "vacosf", FIXED(4), "_ZGV_LLVM_N4v") TLI_DEFINE_VECFUNC("atanf", "vatanf", FIXED(4), "_ZGV_LLVM_N4v") @@ -142,6 +143,18 @@ TLI_DEFINE_VECFUNC("llvm.cos.f64", "_ZGVdN4v_cos", FIXED(4), "_ZGV_LLVM_N4v") TLI_DEFINE_VECFUNC("llvm.cos.f32", "_ZGVbN4v_cosf", FIXED(4), "_ZGV_LLVM_N4v") TLI_DEFINE_VECFUNC("llvm.cos.f32", "_ZGVdN8v_cosf", FIXED(8), "_ZGV_LLVM_N8v") +TLI_DEFINE_VECFUNC("tan", "_ZGVbN2v_tan", FIXED(2), "_ZGV_LLVM_N2v") +TLI_DEFINE_VECFUNC("tan", "_ZGVdN4v_tan", FIXED(4), "_ZGV_LLVM_N4v") + +TLI_DEFINE_VECFUNC("tanf", "_ZGVbN4v_tanf", FIXED(4), "_ZGV_LLVM_N4v") +TLI_DEFINE_VECFUNC("tanf", "_ZGVdN8v_tanf", FIXED(8), "_ZGV_LLVM_N8v") + +TLI_DEFINE_VECFUNC("llvm.tan.f64", "_ZGVbN2v_tan", FIXED(2), "_ZGV_LLVM_N2v") +TLI_DEFINE_VECFUNC("llvm.tan.f64", "_ZGVdN4v_tan", FIXED(4), "_ZGV_LLVM_N4v") + +TLI_DEFINE_VECFUNC("llvm.tan.f32", "_ZGVbN4v_tanf", FIXED(4), "_ZGV_LLVM_N4v") +TLI_DEFINE_VECFUNC("llvm.tan.f32", "_ZGVdN8v_tanf", FIXED(8), "_ZGV_LLVM_N8v") + TLI_DEFINE_VECFUNC("pow", "_ZGVbN2vv_pow", FIXED(2), "_ZGV_LLVM_N2vv") TLI_DEFINE_VECFUNC("pow", "_ZGVdN4vv_pow", FIXED(4), "_ZGV_LLVM_N4vv") @@ -303,6 +316,22 @@ TLI_DEFINE_VECFUNC("llvm.cos.f32", "__svml_cosf4", FIXED(4), "_ZGV_LLVM_N4v") TLI_DEFINE_VECFUNC("llvm.cos.f32", "__svml_cosf8", FIXED(8), "_ZGV_LLVM_N8v") TLI_DEFINE_VECFUNC("llvm.cos.f32", 
"__svml_cosf16", FIXED(16), "_ZGV_LLVM_N16v") +TLI_DEFINE_VECFUNC("tan", "__svml_tan2", FIXED(2), "_ZGV_LLVM_N2v") +TLI_DEFINE_VECFUNC("tan", "__svml_tan4", FIXED(4), "_ZGV_LLVM_N4v") +TLI_DEFINE_VECFUNC("tan", "__svml_tan8", FIXED(8), "_ZGV_LLVM_N8v") + +TLI_DEFINE_VECFUNC("tanf", "__svml_tanf4", FIXED(4), "_ZGV_LLVM_N4v") +TLI_DEFINE_VECFUNC("tanf", "__svml_tanf8", FIXED(8), "_ZGV_LLVM_N8v") +TLI_DEFINE_VECFUNC("tanf", "__svml_tanf16", FIXED(16), "_ZGV_LLVM_N16v") + +TLI_DEFINE_VECFUNC("llvm.tan.f64", "__svml_tan2", FIXED(2), "_ZGV_LLVM_N2v") +TLI_DEFINE_VECFUNC("llvm.tan.f64", "__svml_tan4", FIXED(4), "_ZGV_LLVM_N4v") +TLI_DEFINE_VECFUNC("llvm.tan.f64", "__svml_tan8", FIXED(8), "_ZGV_LLVM_N8v") + +TLI_DEFINE_VECFUNC("llvm.tan.f32", "__svml_tanf4", FIXED(4), "_ZGV_LLVM_N4v") +TLI_DEFINE_VECFUNC("llvm.tan.f32", "__svml_tanf8", FIXED(8), "_ZGV_LLVM_N8v") +TLI_DEFINE_VECFUNC("llvm.tan.f32", "__svml_tanf16", FIXED(16), "_ZGV_LLVM_N16v") + TLI_DEFINE_VECFUNC("pow", "__svml_pow2", FIXED(2), "_ZGV_LLVM_N2vv") TLI_DEFINE_VECFUNC("pow", "__svml_pow4", FIXED(4), "_ZGV_LLVM_N4vv") TLI_DEFINE_VECFUNC("pow", "__svml_pow8", FIXED(8), "_ZGV_LLVM_N8vv") @@ -1237,6 +1266,13 @@ TLI_DEFINE_VECFUNC("tanf", "amd_vrs4_tanf", FIXED(4), NOMASK, "_ZGV_LLVM_N4v") TLI_DEFINE_VECFUNC("tanf", "amd_vrs8_tanf", FIXED(8), NOMASK, "_ZGV_LLVM_N8v") TLI_DEFINE_VECFUNC("tanf", "amd_vrs16_tanf", FIXED(16), NOMASK, "_ZGV_LLVM_N16v") +TLI_DEFINE_VECFUNC("llvm.tan.f32", "amd_vrs16_tanf", FIXED(16), NOMASK, "_ZGV_LLVM_N16v") +TLI_DEFINE_VECFUNC("llvm.tan.f32", "amd_vrs8_tanf", FIXED(8), NOMASK, "_ZGV_LLVM_N8v") +TLI_DEFINE_VECFUNC("llvm.tan.f32", "amd_vrs4_tanf", FIXED(4), NOMASK, "_ZGV_LLVM_N4v") +TLI_DEFINE_VECFUNC("llvm.tan.f64", "amd_vrd8_tan", FIXED(8), NOMASK, "_ZGV_LLVM_N8v") +TLI_DEFINE_VECFUNC("llvm.tan.f64", "amd_vrd4_tan", FIXED(4), NOMASK, "_ZGV_LLVM_N4v") +TLI_DEFINE_VECFUNC("llvm.tan.f64", "amd_vrd2_tan", FIXED(2), NOMASK, "_ZGV_LLVM_N2v") + TLI_DEFINE_VECFUNC("asin", "amd_vrd8_asin", 
FIXED(8), NOMASK, "_ZGV_LLVM_N8v") TLI_DEFINE_VECFUNC("asinf", "amd_vrs4_asinf", FIXED(4), NOMASK, "_ZGV_LLVM_N4v") TLI_DEFINE_VECFUNC("asinf", "amd_vrs8_asinf", FIXED(8), NOMASK, "_ZGV_LLVM_N8v") diff --git a/llvm/include/llvm/AsmParser/LLToken.h b/llvm/include/llvm/AsmParser/LLToken.h index 69821c2..db6780b 100644 --- a/llvm/include/llvm/AsmParser/LLToken.h +++ b/llvm/include/llvm/AsmParser/LLToken.h @@ -147,6 +147,7 @@ enum Kind { kw_aarch64_vector_pcs, kw_aarch64_sve_vector_pcs, kw_aarch64_sme_preservemost_from_x0, + kw_aarch64_sme_preservemost_from_x1, kw_aarch64_sme_preservemost_from_x2, kw_msp430_intrcc, kw_avr_intrcc, diff --git a/llvm/include/llvm/BinaryFormat/ELF.h b/llvm/include/llvm/BinaryFormat/ELF.h index 7364d61..dfba180 100644 --- a/llvm/include/llvm/BinaryFormat/ELF.h +++ b/llvm/include/llvm/BinaryFormat/ELF.h @@ -795,7 +795,7 @@ enum : unsigned { EF_AMDGPU_MACH_AMDGCN_GFX10_1_GENERIC = 0x052, EF_AMDGPU_MACH_AMDGCN_GFX10_3_GENERIC = 0x053, EF_AMDGPU_MACH_AMDGCN_GFX11_GENERIC = 0x054, - EF_AMDGPU_MACH_AMDGCN_RESERVED_0X55 = 0x055, + EF_AMDGPU_MACH_AMDGCN_GFX1152 = 0x055, EF_AMDGPU_MACH_AMDGCN_RESERVED_0X56 = 0x056, EF_AMDGPU_MACH_AMDGCN_RESERVED_0X57 = 0x057, EF_AMDGPU_MACH_AMDGCN_RESERVED_0X58 = 0x058, diff --git a/llvm/include/llvm/CodeGen/BasicTTIImpl.h b/llvm/include/llvm/CodeGen/BasicTTIImpl.h index ef4e0fd..9f8d3de 100644 --- a/llvm/include/llvm/CodeGen/BasicTTIImpl.h +++ b/llvm/include/llvm/CodeGen/BasicTTIImpl.h @@ -1977,6 +1977,9 @@ public: case Intrinsic::cos: ISD = ISD::FCOS; break; + case Intrinsic::tan: + ISD = ISD::FTAN; + break; case Intrinsic::exp: ISD = ISD::FEXP; break; diff --git a/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h b/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h index 12e5b31..4365956 100644 --- a/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h +++ b/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h @@ -871,6 +871,14 @@ public: bool matchFreezeOfSingleMaybePoisonOperand(MachineInstr &MI, 
BuildFnTy &MatchInfo); + bool matchAddOfVScale(const MachineOperand &MO, BuildFnTy &MatchInfo); + + bool matchMulOfVScale(const MachineOperand &MO, BuildFnTy &MatchInfo); + + bool matchSubOfVScale(const MachineOperand &MO, BuildFnTy &MatchInfo); + + bool matchShlOfVScale(const MachineOperand &MO, BuildFnTy &MatchInfo); + private: /// Checks for legality of an indexed variant of \p LdSt. bool isIndexedLoadStoreLegal(GLoadStore &LdSt) const; diff --git a/llvm/include/llvm/CodeGen/GlobalISel/GenericMachineInstrs.h b/llvm/include/llvm/CodeGen/GlobalISel/GenericMachineInstrs.h index 2b3efc3..2273725 100644 --- a/llvm/include/llvm/CodeGen/GlobalISel/GenericMachineInstrs.h +++ b/llvm/include/llvm/CodeGen/GlobalISel/GenericMachineInstrs.h @@ -14,10 +14,12 @@ #ifndef LLVM_CODEGEN_GLOBALISEL_GENERICMACHINEINSTRS_H #define LLVM_CODEGEN_GLOBALISEL_GENERICMACHINEINSTRS_H -#include "llvm/IR/Instructions.h" +#include "llvm/ADT/APInt.h" #include "llvm/CodeGen/MachineInstr.h" #include "llvm/CodeGen/MachineMemOperand.h" #include "llvm/CodeGen/TargetOpcodes.h" +#include "llvm/IR/Constants.h" +#include "llvm/IR/Instructions.h" #include "llvm/Support/Casting.h" namespace llvm { @@ -856,6 +858,43 @@ public: }; }; +/// Represents a vscale. +class GVScale : public GenericMachineInstr { +public: + APInt getSrc() const { return getOperand(1).getCImm()->getValue(); } + + static bool classof(const MachineInstr *MI) { + return MI->getOpcode() == TargetOpcode::G_VSCALE; + }; +}; + +/// Represents an integer subtraction. +class GSub : public GIntBinOp { +public: + static bool classof(const MachineInstr *MI) { + return MI->getOpcode() == TargetOpcode::G_SUB; + }; +}; + +/// Represents an integer multiplication. +class GMul : public GIntBinOp { +public: + static bool classof(const MachineInstr *MI) { + return MI->getOpcode() == TargetOpcode::G_MUL; + }; +}; + +/// Represents a shift left. 
+class GShl : public GenericMachineInstr { +public: + Register getSrcReg() const { return getOperand(1).getReg(); } + Register getShiftReg() const { return getOperand(2).getReg(); } + + static bool classof(const MachineInstr *MI) { + return MI->getOpcode() == TargetOpcode::G_SHL; + }; +}; + } // namespace llvm #endif // LLVM_CODEGEN_GLOBALISEL_GENERICMACHINEINSTRS_H diff --git a/llvm/include/llvm/CodeGen/ISDOpcodes.h b/llvm/include/llvm/CodeGen/ISDOpcodes.h index 0f87e06..c8c86ed 100644 --- a/llvm/include/llvm/CodeGen/ISDOpcodes.h +++ b/llvm/include/llvm/CodeGen/ISDOpcodes.h @@ -415,6 +415,7 @@ enum NodeType { STRICT_FLDEXP, STRICT_FSIN, STRICT_FCOS, + STRICT_FTAN, STRICT_FEXP, STRICT_FEXP2, STRICT_FLOG, @@ -934,6 +935,7 @@ enum NodeType { FCBRT, FSIN, FCOS, + FTAN, FPOW, FPOWI, /// FLDEXP - ldexp, inspired by libm (op0 * 2**op1). diff --git a/llvm/include/llvm/CodeGen/MachineFunction.h b/llvm/include/llvm/CodeGen/MachineFunction.h index 9f8e846c..6e7292a 100644 --- a/llvm/include/llvm/CodeGen/MachineFunction.h +++ b/llvm/include/llvm/CodeGen/MachineFunction.h @@ -18,7 +18,6 @@ #define LLVM_CODEGEN_MACHINEFUNCTION_H #include "llvm/ADT/ArrayRef.h" -#include "llvm/ADT/BitVector.h" #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/GraphTraits.h" #include "llvm/ADT/SmallVector.h" @@ -34,6 +33,7 @@ #include "llvm/Support/Compiler.h" #include "llvm/Support/Recycler.h" #include "llvm/Target/TargetOptions.h" +#include <bitset> #include <cassert> #include <cstdint> #include <memory> @@ -217,22 +217,21 @@ public: } MachineFunctionProperties &reset(const MachineFunctionProperties &MFP) { - Properties.reset(MFP.Properties); + Properties &= ~MFP.Properties; return *this; } // Returns true if all properties set in V (i.e. required by a pass) are set // in this. 
bool verifyRequiredProperties(const MachineFunctionProperties &V) const { - return !V.Properties.test(Properties); + return (Properties | ~V.Properties).all(); } /// Print the MachineFunctionProperties in human-readable form. void print(raw_ostream &OS) const; private: - BitVector Properties = - BitVector(static_cast<unsigned>(Property::LastProperty)+1); + std::bitset<static_cast<unsigned>(Property::LastProperty) + 1> Properties; }; struct SEHHandler { diff --git a/llvm/include/llvm/CodeGen/SDPatternMatch.h b/llvm/include/llvm/CodeGen/SDPatternMatch.h index 7970441..071a27a 100644 --- a/llvm/include/llvm/CodeGen/SDPatternMatch.h +++ b/llvm/include/llvm/CodeGen/SDPatternMatch.h @@ -330,9 +330,7 @@ template <typename... Preds> struct And { template <typename Pred, typename... Preds> struct And<Pred, Preds...> : And<Preds...> { Pred P; - And(Pred &&p, Preds &&...preds) - : And<Preds...>(std::forward<Preds>(preds)...), P(std::forward<Pred>(p)) { - } + And(const Pred &p, const Preds &...preds) : And<Preds...>(preds...), P(p) {} template <typename MatchContext> bool match(const MatchContext &Ctx, SDValue N) { @@ -349,8 +347,7 @@ template <typename... Preds> struct Or { template <typename Pred, typename... Preds> struct Or<Pred, Preds...> : Or<Preds...> { Pred P; - Or(Pred &&p, Preds &&...preds) - : Or<Preds...>(std::forward<Preds>(preds)...), P(std::forward<Pred>(p)) {} + Or(const Pred &p, const Preds &...preds) : Or<Preds...>(preds...), P(p) {} template <typename MatchContext> bool match(const MatchContext &Ctx, SDValue N) { @@ -376,16 +373,16 @@ template <typename Pred> inline Not<Pred> m_Unless(const Pred &P) { return Not{P}; } -template <typename... Preds> And<Preds...> m_AllOf(Preds &&...preds) { - return And<Preds...>(std::forward<Preds>(preds)...); +template <typename... Preds> And<Preds...> m_AllOf(const Preds &...preds) { + return And<Preds...>(preds...); } -template <typename... 
Preds> Or<Preds...> m_AnyOf(Preds &&...preds) { - return Or<Preds...>(std::forward<Preds>(preds)...); +template <typename... Preds> Or<Preds...> m_AnyOf(const Preds &...preds) { + return Or<Preds...>(preds...); } -template <typename... Preds> auto m_NoneOf(Preds &&...preds) { - return m_Unless(m_AnyOf(std::forward<Preds>(preds)...)); +template <typename... Preds> auto m_NoneOf(const Preds &...preds) { + return m_Unless(m_AnyOf(preds...)); } // === Generic node matching === @@ -402,10 +399,8 @@ struct Operands_match<OpIdx, OpndPred, OpndPreds...> : Operands_match<OpIdx + 1, OpndPreds...> { OpndPred P; - Operands_match(OpndPred &&p, OpndPreds &&...preds) - : Operands_match<OpIdx + 1, OpndPreds...>( - std::forward<OpndPreds>(preds)...), - P(std::forward<OpndPred>(p)) {} + Operands_match(const OpndPred &p, const OpndPreds &...preds) + : Operands_match<OpIdx + 1, OpndPreds...>(preds...), P(p) {} template <typename MatchContext> bool match(const MatchContext &Ctx, SDValue N) { @@ -419,9 +414,8 @@ struct Operands_match<OpIdx, OpndPred, OpndPreds...> }; template <typename... 
OpndPreds> -auto m_Node(unsigned Opcode, OpndPreds &&...preds) { - return m_AllOf(m_Opc(Opcode), Operands_match<0, OpndPreds...>( - std::forward<OpndPreds>(preds)...)); +auto m_Node(unsigned Opcode, const OpndPreds &...preds) { + return m_AllOf(m_Opc(Opcode), Operands_match<0, OpndPreds...>(preds...)); } /// Provide number of operands that are not chain or glue, as well as the first @@ -647,10 +641,9 @@ template <typename Opnd> inline UnaryOpc_match<Opnd> m_ZExt(const Opnd &Op) { return UnaryOpc_match<Opnd>(ISD::ZERO_EXTEND, Op); } -template <typename Opnd> inline auto m_SExt(Opnd &&Op) { - return m_AnyOf( - UnaryOpc_match<Opnd>(ISD::SIGN_EXTEND, Op), - m_Node(ISD::SIGN_EXTEND_INREG, std::forward<Opnd>(Op), m_Value())); +template <typename Opnd> inline auto m_SExt(const Opnd &Op) { + return m_AnyOf(UnaryOpc_match<Opnd>(ISD::SIGN_EXTEND, Op), + m_Node(ISD::SIGN_EXTEND_INREG, Op, m_Value())); } template <typename Opnd> inline UnaryOpc_match<Opnd> m_AnyExt(const Opnd &Op) { @@ -663,30 +656,28 @@ template <typename Opnd> inline UnaryOpc_match<Opnd> m_Trunc(const Opnd &Op) { /// Match a zext or identity /// Allows to peek through optional extensions -template <typename Opnd> inline auto m_ZExtOrSelf(Opnd &&Op) { - return m_AnyOf(m_ZExt(std::forward<Opnd>(Op)), std::forward<Opnd>(Op)); +template <typename Opnd> inline auto m_ZExtOrSelf(const Opnd &Op) { + return m_AnyOf(m_ZExt(Op), Op); } /// Match a sext or identity /// Allows to peek through optional extensions -template <typename Opnd> inline auto m_SExtOrSelf(Opnd &&Op) { - return m_AnyOf(m_SExt(std::forward<Opnd>(Op)), std::forward<Opnd>(Op)); +template <typename Opnd> inline auto m_SExtOrSelf(const Opnd &Op) { + return m_AnyOf(m_SExt(Op), Op); } /// Match a aext or identity /// Allows to peek through optional extensions template <typename Opnd> -inline Or<UnaryOpc_match<Opnd>, Opnd> m_AExtOrSelf(Opnd &&Op) { - return Or<UnaryOpc_match<Opnd>, Opnd>(m_AnyExt(std::forward<Opnd>(Op)), - std::forward<Opnd>(Op)); +inline 
Or<UnaryOpc_match<Opnd>, Opnd> m_AExtOrSelf(const Opnd &Op) { + return Or<UnaryOpc_match<Opnd>, Opnd>(m_AnyExt(Op), Op); } /// Match a trunc or identity /// Allows to peek through optional truncations template <typename Opnd> -inline Or<UnaryOpc_match<Opnd>, Opnd> m_TruncOrSelf(Opnd &&Op) { - return Or<UnaryOpc_match<Opnd>, Opnd>(m_Trunc(std::forward<Opnd>(Op)), - std::forward<Opnd>(Op)); +inline Or<UnaryOpc_match<Opnd>, Opnd> m_TruncOrSelf(const Opnd &Op) { + return Or<UnaryOpc_match<Opnd>, Opnd>(m_Trunc(Op), Op); } // === Constants === diff --git a/llvm/include/llvm/CodeGen/SelectionDAG.h b/llvm/include/llvm/CodeGen/SelectionDAG.h index 48cb0cd..7b0e5e7 100644 --- a/llvm/include/llvm/CodeGen/SelectionDAG.h +++ b/llvm/include/llvm/CodeGen/SelectionDAG.h @@ -1893,7 +1893,8 @@ public: const SDNode *N2); SDValue FoldConstantArithmetic(unsigned Opcode, const SDLoc &DL, EVT VT, - ArrayRef<SDValue> Ops); + ArrayRef<SDValue> Ops, + SDNodeFlags Flags = SDNodeFlags()); /// Fold floating-point operations when all operands are constants and/or /// undefined. diff --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h index d1912b1..aa7a32e 100644 --- a/llvm/include/llvm/CodeGen/TargetLowering.h +++ b/llvm/include/llvm/CodeGen/TargetLowering.h @@ -96,7 +96,7 @@ class Value; namespace Sched { -enum Preference { +enum Preference : uint8_t { None, // No preference Source, // Follow source order. RegPressure, // Scheduling for lowest register pressure. diff --git a/llvm/include/llvm/IR/CallingConv.h b/llvm/include/llvm/IR/CallingConv.h index a05d1a4..55e3202 100644 --- a/llvm/include/llvm/IR/CallingConv.h +++ b/llvm/include/llvm/IR/CallingConv.h @@ -267,6 +267,9 @@ namespace CallingConv { /// Calling convention used for RISC-V V-extension. RISCV_VectorCall = 110, + /// Preserve X1-X15, X19-X29, SP, Z0-Z31, P0-P15. + AArch64_SME_ABI_Support_Routines_PreserveMost_From_X1 = 111, + /// The highest possible ID. Must be some 2^k - 1. 
MaxID = 1023 }; diff --git a/llvm/include/llvm/IR/InstrTypes.h b/llvm/include/llvm/IR/InstrTypes.h index 9dd1bb4..ad649b5 100644 --- a/llvm/include/llvm/IR/InstrTypes.h +++ b/llvm/include/llvm/IR/InstrTypes.h @@ -1588,6 +1588,14 @@ public: static CallBase *removeOperandBundle(CallBase *CB, uint32_t ID, BasicBlock::iterator InsertPt); + /// Return the convergence control token for this call, if it exists. + Value *getConvergenceControlToken() const { + if (auto Bundle = getOperandBundle(llvm::LLVMContext::OB_convergencectrl)) { + return Bundle->Inputs[0].get(); + } + return nullptr; + } + static bool classof(const Instruction *I) { return I->getOpcode() == Instruction::Call || I->getOpcode() == Instruction::Invoke || @@ -2120,6 +2128,15 @@ public: return Attrs.getParamStackAlignment(ArgNo); } + /// Extract the byref type for a call or parameter. + Type *getParamByRefType(unsigned ArgNo) const { + if (auto *Ty = Attrs.getParamByRefType(ArgNo)) + return Ty; + if (const Function *F = getCalledFunction()) + return F->getAttributes().getParamByRefType(ArgNo); + return nullptr; + } + /// Extract the byval type for a call or parameter. Type *getParamByValType(unsigned ArgNo) const { if (auto *Ty = Attrs.getParamByValType(ArgNo)) diff --git a/llvm/include/llvm/IR/IntrinsicInst.h b/llvm/include/llvm/IR/IntrinsicInst.h index fcd3a10..9010e1a 100644 --- a/llvm/include/llvm/IR/IntrinsicInst.h +++ b/llvm/include/llvm/IR/IntrinsicInst.h @@ -1799,17 +1799,14 @@ public: return isa<IntrinsicInst>(V) && classof(cast<IntrinsicInst>(V)); } - // Returns the convergence intrinsic referenced by |I|'s convergencectrl - // attribute if any. 
- static IntrinsicInst *getParentConvergenceToken(Instruction *I) { - auto *CI = dyn_cast<llvm::CallInst>(I); - if (!CI) - return nullptr; - - auto Bundle = CI->getOperandBundle(llvm::LLVMContext::OB_convergencectrl); - assert(Bundle->Inputs.size() == 1 && - Bundle->Inputs[0]->getType()->isTokenTy()); - return dyn_cast<llvm::IntrinsicInst>(Bundle->Inputs[0].get()); + bool isAnchor() { + return getIntrinsicID() == Intrinsic::experimental_convergence_anchor; + } + bool isEntry() { + return getIntrinsicID() == Intrinsic::experimental_convergence_entry; + } + bool isLoop() { + return getIntrinsicID() == Intrinsic::experimental_convergence_loop; } }; diff --git a/llvm/include/llvm/IR/Operator.h b/llvm/include/llvm/IR/Operator.h index fda2689..f63f54e 100644 --- a/llvm/include/llvm/IR/Operator.h +++ b/llvm/include/llvm/IR/Operator.h @@ -330,8 +330,6 @@ public: unsigned Opcode; if (auto *I = dyn_cast<Instruction>(V)) Opcode = I->getOpcode(); - else if (auto *CE = dyn_cast<ConstantExpr>(V)) - Opcode = CE->getOpcode(); else return false; diff --git a/llvm/include/llvm/IR/RuntimeLibcalls.def b/llvm/include/llvm/IR/RuntimeLibcalls.def index a5a7288..e900bcd 100644 --- a/llvm/include/llvm/IR/RuntimeLibcalls.def +++ b/llvm/include/llvm/IR/RuntimeLibcalls.def @@ -197,6 +197,11 @@ HANDLE_LIBCALL(COS_F64, "cos") HANDLE_LIBCALL(COS_F80, "cosl") HANDLE_LIBCALL(COS_F128, "cosl") HANDLE_LIBCALL(COS_PPCF128, "cosl") +HANDLE_LIBCALL(TAN_F32, "tanf") +HANDLE_LIBCALL(TAN_F64, "tan") +HANDLE_LIBCALL(TAN_F80, "tanl") +HANDLE_LIBCALL(TAN_F128,"tanl") +HANDLE_LIBCALL(TAN_PPCF128, "tanl") HANDLE_LIBCALL(SINCOS_F32, nullptr) HANDLE_LIBCALL(SINCOS_F64, nullptr) HANDLE_LIBCALL(SINCOS_F80, nullptr) diff --git a/llvm/include/llvm/InitializePasses.h b/llvm/include/llvm/InitializePasses.h index c4c1825..8803ef5 100644 --- a/llvm/include/llvm/InitializePasses.h +++ b/llvm/include/llvm/InitializePasses.h @@ -106,6 +106,7 @@ void initializeExpandLargeDivRemLegacyPassPass(PassRegistry&); void 
initializeExpandMemCmpLegacyPassPass(PassRegistry &); void initializeExpandPostRAPass(PassRegistry&); void initializeExpandReductionsPass(PassRegistry&); +void initializeExpandVariadicsPass(PassRegistry &); void initializeExpandVectorPredicationPass(PassRegistry &); void initializeExternalAAWrapperPassPass(PassRegistry&); void initializeFEntryInserterPass(PassRegistry&); diff --git a/llvm/include/llvm/MC/MCSymbolWasm.h b/llvm/include/llvm/MC/MCSymbolWasm.h index 0ce95c7..0c2b97a 100644 --- a/llvm/include/llvm/MC/MCSymbolWasm.h +++ b/llvm/include/llvm/MC/MCSymbolWasm.h @@ -114,9 +114,11 @@ public: return isTable() && hasTableType() && getTableType().ElemType == wasm::ValType::FUNCREF; } - void setFunctionTable() { + void setFunctionTable(bool is64) { setType(wasm::WASM_SYMBOL_TYPE_TABLE); - setTableType(wasm::ValType::FUNCREF); + uint8_t flags = + is64 ? wasm::WASM_LIMITS_FLAG_IS_64 : wasm::WASM_LIMITS_FLAG_NONE; + setTableType(wasm::ValType::FUNCREF, flags); } void setUsedInGOT() const { IsUsedInGOT = true; } @@ -140,10 +142,11 @@ public: return *TableType; } void setTableType(wasm::WasmTableType TT) { TableType = TT; } - void setTableType(wasm::ValType VT) { + void setTableType(wasm::ValType VT, + uint8_t flags = wasm::WASM_LIMITS_FLAG_NONE) { // Declare a table with element type VT and no limits (min size 0, no max // size). - wasm::WasmLimits Limits = {wasm::WASM_LIMITS_FLAG_NONE, 0, 0}; + wasm::WasmLimits Limits = {flags, 0, 0}; setTableType({VT, Limits}); } }; diff --git a/llvm/include/llvm/ProfileData/MemProf.h b/llvm/include/llvm/ProfileData/MemProf.h index 406144d..528abe1 100644 --- a/llvm/include/llvm/ProfileData/MemProf.h +++ b/llvm/include/llvm/ProfileData/MemProf.h @@ -199,7 +199,7 @@ struct Frame { GlobalValue::GUID Function; // The symbol name for the function. Only populated in the Frame by the reader // if requested during initialization. This field should not be serialized. 
- std::optional<std::string> SymbolName; + std::unique_ptr<std::string> SymbolName; // The source line offset of the call from the beginning of parent function. uint32_t LineOffset; // The source column number of the call to help distinguish multiple calls @@ -210,7 +210,9 @@ struct Frame { Frame(const Frame &Other) { Function = Other.Function; - SymbolName = Other.SymbolName; + SymbolName = Other.SymbolName + ? std::make_unique<std::string>(*Other.SymbolName) + : nullptr; LineOffset = Other.LineOffset; Column = Other.Column; IsInlineFrame = Other.IsInlineFrame; @@ -228,7 +230,9 @@ struct Frame { Frame &operator=(const Frame &Other) { Function = Other.Function; - SymbolName = Other.SymbolName; + SymbolName = Other.SymbolName + ? std::make_unique<std::string>(*Other.SymbolName) + : nullptr; LineOffset = Other.LineOffset; Column = Other.Column; IsInlineFrame = Other.IsInlineFrame; @@ -237,10 +241,10 @@ struct Frame { bool operator!=(const Frame &Other) const { return !operator==(Other); } - bool hasSymbolName() const { return SymbolName.has_value(); } + bool hasSymbolName() const { return !!SymbolName; } StringRef getSymbolName() const { - assert(SymbolName.has_value()); + assert(hasSymbolName()); return *SymbolName; } diff --git a/llvm/include/llvm/Support/Error.h b/llvm/include/llvm/Support/Error.h index 662c3ea4..1fa0d8c 100644 --- a/llvm/include/llvm/Support/Error.h +++ b/llvm/include/llvm/Support/Error.h @@ -1278,6 +1278,11 @@ inline Error createStringError(const Twine &S) { } template <typename... Ts> +inline Error createStringError(char const *Fmt, const Ts &...Vals) { + return createStringError(llvm::inconvertibleErrorCode(), Fmt, Vals...); +} + +template <typename... Ts> inline Error createStringError(std::errc EC, char const *Fmt, const Ts &... 
Vals) { return createStringError(std::make_error_code(EC), Fmt, Vals...); diff --git a/llvm/include/llvm/Target/GlobalISel/Combine.td b/llvm/include/llvm/Target/GlobalISel/Combine.td index 1ea2652..bd43b95 100644 --- a/llvm/include/llvm/Target/GlobalISel/Combine.td +++ b/llvm/include/llvm/Target/GlobalISel/Combine.td @@ -1605,6 +1605,37 @@ def insert_vector_elt_oob : GICombineRule< [{ return Helper.matchInsertVectorElementOOB(*${root}, ${matchinfo}); }]), (apply [{ Helper.applyBuildFn(*${root}, ${matchinfo}); }])>; +def add_of_vscale : GICombineRule< + (defs root:$root, build_fn_matchinfo:$matchinfo), + (match (G_VSCALE $left, $imm1), + (G_VSCALE $right, $imm2), + (G_ADD $root, $left, $right, (MIFlags NoSWrap)), + [{ return Helper.matchAddOfVScale(${root}, ${matchinfo}); }]), + (apply [{ Helper.applyBuildFnMO(${root}, ${matchinfo}); }])>; + +def mul_of_vscale : GICombineRule< + (defs root:$root, build_fn_matchinfo:$matchinfo), + (match (G_VSCALE $left, $scale), + (G_CONSTANT $x, $imm1), + (G_MUL $root, $left, $x, (MIFlags NoSWrap)), + [{ return Helper.matchMulOfVScale(${root}, ${matchinfo}); }]), + (apply [{ Helper.applyBuildFnMO(${root}, ${matchinfo}); }])>; + +def shl_of_vscale : GICombineRule< + (defs root:$root, build_fn_matchinfo:$matchinfo), + (match (G_VSCALE $left, $imm), + (G_CONSTANT $x, $imm1), + (G_SHL $root, $left, $x, (MIFlags NoSWrap)), + [{ return Helper.matchShlOfVScale(${root}, ${matchinfo}); }]), + (apply [{ Helper.applyBuildFnMO(${root}, ${matchinfo}); }])>; + +def sub_of_vscale : GICombineRule< + (defs root:$root, build_fn_matchinfo:$matchinfo), + (match (G_VSCALE $right, $imm), + (G_SUB $root, $x, $right, (MIFlags NoSWrap)), + [{ return Helper.matchSubOfVScale(${root}, ${matchinfo}); }]), + (apply [{ Helper.applyBuildFnMO(${root}, ${matchinfo}); }])>; + // match_extract_of_element and insert_vector_elt_oob must be the first! 
def vector_ops_combines: GICombineGroup<[ match_extract_of_element_undef_vector, @@ -1637,7 +1668,11 @@ extract_vector_element_build_vector_trunc6, extract_vector_element_build_vector_trunc7, extract_vector_element_build_vector_trunc8, extract_vector_element_shuffle_vector, -insert_vector_element_extract_vector_element +insert_vector_element_extract_vector_element, +add_of_vscale, +mul_of_vscale, +shl_of_vscale, +sub_of_vscale, ]>; diff --git a/llvm/include/llvm/Target/GlobalISel/SelectionDAGCompat.td b/llvm/include/llvm/Target/GlobalISel/SelectionDAGCompat.td index 8fa0e4b..560d3b4 100644 --- a/llvm/include/llvm/Target/GlobalISel/SelectionDAGCompat.td +++ b/llvm/include/llvm/Target/GlobalISel/SelectionDAGCompat.td @@ -148,6 +148,7 @@ def : GINodeEquiv<G_BUILD_VECTOR, build_vector>; def : GINodeEquiv<G_FCEIL, fceil>; def : GINodeEquiv<G_FCOS, fcos>; def : GINodeEquiv<G_FSIN, fsin>; +def : GINodeEquiv<G_FTAN, ftan>; def : GINodeEquiv<G_FABS, fabs>; def : GINodeEquiv<G_FSQRT, fsqrt>; def : GINodeEquiv<G_FFLOOR, ffloor>; diff --git a/llvm/include/llvm/Target/TargetSelectionDAG.td b/llvm/include/llvm/Target/TargetSelectionDAG.td index 1c95a60..15e02eb 100644 --- a/llvm/include/llvm/Target/TargetSelectionDAG.td +++ b/llvm/include/llvm/Target/TargetSelectionDAG.td @@ -509,6 +509,7 @@ def fneg : SDNode<"ISD::FNEG" , SDTFPUnaryOp>; def fsqrt : SDNode<"ISD::FSQRT" , SDTFPUnaryOp>; def fsin : SDNode<"ISD::FSIN" , SDTFPUnaryOp>; def fcos : SDNode<"ISD::FCOS" , SDTFPUnaryOp>; +def ftan : SDNode<"ISD::FTAN" , SDTFPUnaryOp>; def fexp2 : SDNode<"ISD::FEXP2" , SDTFPUnaryOp>; def fexp10 : SDNode<"ISD::FEXP10" , SDTFPUnaryOp>; def fpow : SDNode<"ISD::FPOW" , SDTFPBinOp>; @@ -562,6 +563,8 @@ def strict_fsin : SDNode<"ISD::STRICT_FSIN", SDTFPUnaryOp, [SDNPHasChain]>; def strict_fcos : SDNode<"ISD::STRICT_FCOS", SDTFPUnaryOp, [SDNPHasChain]>; +def strict_ftan : SDNode<"ISD::STRICT_FTAN", + SDTFPUnaryOp, [SDNPHasChain]>; def strict_fexp2 : SDNode<"ISD::STRICT_FEXP2", SDTFPUnaryOp, 
[SDNPHasChain]>; def strict_fpow : SDNode<"ISD::STRICT_FPOW", @@ -1517,6 +1520,9 @@ def any_fsin : PatFrags<(ops node:$src), def any_fcos : PatFrags<(ops node:$src), [(strict_fcos node:$src), (fcos node:$src)]>; +def any_ftan : PatFrags<(ops node:$src), + [(strict_ftan node:$src), + (ftan node:$src)]>; def any_fexp2 : PatFrags<(ops node:$src), [(strict_fexp2 node:$src), (fexp2 node:$src)]>; diff --git a/llvm/include/llvm/TargetParser/AArch64TargetParser.h b/llvm/include/llvm/TargetParser/AArch64TargetParser.h index 5025ab2..afe6789 100644 --- a/llvm/include/llvm/TargetParser/AArch64TargetParser.h +++ b/llvm/include/llvm/TargetParser/AArch64TargetParser.h @@ -577,6 +577,11 @@ inline constexpr CpuInfo CpuInfos[] = { AArch64::AEK_SHA2, AArch64::AEK_AES, AArch64::AEK_MTE, AArch64::AEK_SB, AArch64::AEK_SSBS, AArch64::AEK_CSSC})}, + {"oryon-1", ARMV8_6A, + (AArch64::ExtensionBitset({AArch64::AEK_AES, AArch64::AEK_CRYPTO, + AArch64::AEK_RAND, AArch64::AEK_SM4, + AArch64::AEK_SHA3, AArch64::AEK_SHA2, + AArch64::AEK_PROFILE}))}, }; // Name alias. 
diff --git a/llvm/include/llvm/TargetParser/TargetParser.h b/llvm/include/llvm/TargetParser/TargetParser.h index 8fc6fa3..e03d8f6 100644 --- a/llvm/include/llvm/TargetParser/TargetParser.h +++ b/llvm/include/llvm/TargetParser/TargetParser.h @@ -105,6 +105,7 @@ enum GPUKind : uint32_t { GK_GFX1103 = 93, GK_GFX1150 = 94, GK_GFX1151 = 95, + GK_GFX1152 = 96, GK_GFX1200 = 100, GK_GFX1201 = 101, diff --git a/llvm/include/llvm/Transforms/IPO/Attributor.h b/llvm/include/llvm/Transforms/IPO/Attributor.h index d3d3a9c..6ba04db 100644 --- a/llvm/include/llvm/Transforms/IPO/Attributor.h +++ b/llvm/include/llvm/Transforms/IPO/Attributor.h @@ -5143,9 +5143,7 @@ struct DenormalFPMathState : public AbstractState { return Mode != Other.Mode || ModeF32 != Other.ModeF32; } - bool isValid() const { - return Mode.isValid() && ModeF32.isValid(); - } + bool isValid() const { return Mode.isValid() && ModeF32.isValid(); } static DenormalMode::DenormalModeKind unionDenormalKind(DenormalMode::DenormalModeKind Callee, @@ -5185,9 +5183,7 @@ struct DenormalFPMathState : public AbstractState { // state. DenormalState getAssumed() const { return Known; } - bool isValidState() const override { - return Known.isValid(); - } + bool isValidState() const override { return Known.isValid(); } /// Return true if there are no dynamic components to the denormal mode worth /// specializing. 
@@ -5198,9 +5194,7 @@ struct DenormalFPMathState : public AbstractState { Known.ModeF32.Output != DenormalMode::Dynamic; } - bool isAtFixpoint() const override { - return IsAtFixedpoint; - } + bool isAtFixpoint() const override { return IsAtFixedpoint; } ChangeStatus indicateFixpoint() { bool Changed = !IsAtFixedpoint; diff --git a/llvm/include/llvm/Transforms/IPO/ExpandVariadics.h b/llvm/include/llvm/Transforms/IPO/ExpandVariadics.h new file mode 100644 index 0000000..4c5a1b6 --- /dev/null +++ b/llvm/include/llvm/Transforms/IPO/ExpandVariadics.h @@ -0,0 +1,40 @@ +//===- ExpandVariadics.h - expand variadic functions ------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +#ifndef LLVM_TRANSFORMS_IPO_EXPANDVARIADICS_H +#define LLVM_TRANSFORMS_IPO_EXPANDVARIADICS_H + +#include "llvm/IR/PassManager.h" + +namespace llvm { + +class Module; +class ModulePass; +class OptimizationLevel; + +enum class ExpandVariadicsMode { + Unspecified, // Use the implementation defaults + Disable, // Disable the pass entirely + Optimize, // Optimise without changing ABI + Lowering, // Change variadic calling convention +}; + +class ExpandVariadicsPass : public PassInfoMixin<ExpandVariadicsPass> { + const ExpandVariadicsMode Mode; + +public: + // Operates under passed mode unless overridden on commandline + ExpandVariadicsPass(ExpandVariadicsMode Mode); + + PreservedAnalyses run(Module &M, ModuleAnalysisManager &AM); +}; + +ModulePass *createExpandVariadicsPass(ExpandVariadicsMode); + +} // end namespace llvm + +#endif // LLVM_TRANSFORMS_IPO_EXPANDVARIADICS_H diff --git a/llvm/include/llvm/Transforms/Utils/UnrollLoop.h b/llvm/include/llvm/Transforms/Utils/UnrollLoop.h index bd804dc..797c082 100644 --- 
a/llvm/include/llvm/Transforms/Utils/UnrollLoop.h +++ b/llvm/include/llvm/Transforms/Utils/UnrollLoop.h @@ -16,6 +16,7 @@ #define LLVM_TRANSFORMS_UTILS_UNROLLLOOP_H #include "llvm/ADT/DenseMap.h" +#include "llvm/Analysis/CodeMetrics.h" #include "llvm/Analysis/TargetTransformInfo.h" #include "llvm/Support/InstructionCost.h" @@ -73,6 +74,7 @@ struct UnrollLoopOptions { bool AllowExpensiveTripCount; bool UnrollRemainder; bool ForgetAllSCEV; + const Instruction *Heart = nullptr; }; LoopUnrollResult UnrollLoop(Loop *L, UnrollLoopOptions ULO, LoopInfo *LI, @@ -128,14 +130,15 @@ class UnrollCostEstimator { public: unsigned NumInlineCandidates; - bool Convergent; + ConvergenceKind Convergence; + bool ConvergenceAllowsRuntime; UnrollCostEstimator(const Loop *L, const TargetTransformInfo &TTI, const SmallPtrSetImpl<const Value *> &EphValues, unsigned BEInsns); /// Whether it is legal to unroll this loop. - bool canUnroll() const { return LoopSize.isValid() && !NotDuplicatable; } + bool canUnroll() const; uint64_t getRolledLoopSize() const { return *LoopSize.getValue(); } diff --git a/llvm/lib/Analysis/CodeMetrics.cpp b/llvm/lib/Analysis/CodeMetrics.cpp index 2637e2f..ea67b52 100644 --- a/llvm/lib/Analysis/CodeMetrics.cpp +++ b/llvm/lib/Analysis/CodeMetrics.cpp @@ -16,6 +16,7 @@ #include "llvm/Analysis/LoopInfo.h" #include "llvm/Analysis/TargetTransformInfo.h" #include "llvm/IR/Function.h" +#include "llvm/IR/IntrinsicInst.h" #include "llvm/Support/Debug.h" #include "llvm/Support/InstructionCost.h" @@ -111,11 +112,24 @@ void CodeMetrics::collectEphemeralValues( completeEphemeralValues(Visited, Worklist, EphValues); } +static bool extendsConvergenceOutsideLoop(const Instruction &I, const Loop *L) { + if (!L) + return false; + if (!isa<ConvergenceControlInst>(I)) + return false; + for (const auto *U : I.users()) { + if (!L->contains(cast<Instruction>(U))) + return true; + } + return false; +} + /// Fill in the current structure with information gleaned from the specified /// 
block. void CodeMetrics::analyzeBasicBlock( const BasicBlock *BB, const TargetTransformInfo &TTI, - const SmallPtrSetImpl<const Value *> &EphValues, bool PrepareForLTO) { + const SmallPtrSetImpl<const Value *> &EphValues, bool PrepareForLTO, + const Loop *L) { ++NumBlocks; InstructionCost NumInstsBeforeThisBB = NumInsts; for (const Instruction &I : *BB) { @@ -163,19 +177,38 @@ void CodeMetrics::analyzeBasicBlock( if (isa<ExtractElementInst>(I) || I.getType()->isVectorTy()) ++NumVectorInsts; - if (I.getType()->isTokenTy() && I.isUsedOutsideOfBlock(BB)) + if (I.getType()->isTokenTy() && !isa<ConvergenceControlInst>(I) && + I.isUsedOutsideOfBlock(BB)) { + LLVM_DEBUG(dbgs() << I + << "\n Cannot duplicate a token value used outside " + "the current block (except convergence control).\n"); notDuplicatable = true; - - if (const CallInst *CI = dyn_cast<CallInst>(&I)) { - if (CI->cannotDuplicate()) - notDuplicatable = true; - if (CI->isConvergent()) - convergent = true; } - if (const InvokeInst *InvI = dyn_cast<InvokeInst>(&I)) - if (InvI->cannotDuplicate()) + if (const CallBase *CB = dyn_cast<CallBase>(&I)) { + if (CB->cannotDuplicate()) notDuplicatable = true; + // Compute a meet over the visited blocks for the following partial order: + // + // None -> { Controlled, ExtendedLoop, Uncontrolled} + // Controlled -> ExtendedLoop + if (Convergence <= ConvergenceKind::Controlled && CB->isConvergent()) { + if (isa<ConvergenceControlInst>(CB) || + CB->getConvergenceControlToken()) { + assert(Convergence != ConvergenceKind::Uncontrolled); + LLVM_DEBUG(dbgs() << "Found controlled convergence:\n" << I << "\n"); + if (extendsConvergenceOutsideLoop(I, L)) + Convergence = ConvergenceKind::ExtendedLoop; + else { + assert(Convergence != ConvergenceKind::ExtendedLoop); + Convergence = ConvergenceKind::Controlled; + } + } else { + assert(Convergence == ConvergenceKind::None); + Convergence = ConvergenceKind::Uncontrolled; + } + } + } NumInsts += TTI.getInstructionCost(&I, 
TargetTransformInfo::TCK_CodeSize); } diff --git a/llvm/lib/Analysis/LoopInfo.cpp b/llvm/lib/Analysis/LoopInfo.cpp index 369ab08..c34c497 100644 --- a/llvm/lib/Analysis/LoopInfo.cpp +++ b/llvm/lib/Analysis/LoopInfo.cpp @@ -1105,6 +1105,26 @@ int llvm::getIntLoopAttribute(const Loop *TheLoop, StringRef Name, return getOptionalIntLoopAttribute(TheLoop, Name).value_or(Default); } +CallBase *llvm::getLoopConvergenceHeart(const Loop *TheLoop) { + BasicBlock *H = TheLoop->getHeader(); + for (Instruction &II : *H) { + if (auto *CB = dyn_cast<CallBase>(&II)) { + if (!CB->isConvergent()) + continue; + // This is the heart if it uses a token defined outside the loop. The + // verifier has already checked that only the loop intrinsic can use such + // a token. + if (auto *Token = CB->getConvergenceControlToken()) { + auto *TokenDef = cast<Instruction>(Token); + if (!TheLoop->contains(TokenDef->getParent())) + return CB; + } + return nullptr; + } + } + return nullptr; +} + bool llvm::isFinite(const Loop *L) { return L->getHeader()->getParent()->willReturn(); } diff --git a/llvm/lib/Analysis/ValueTracking.cpp b/llvm/lib/Analysis/ValueTracking.cpp index 08138a5..782c28c 100644 --- a/llvm/lib/Analysis/ValueTracking.cpp +++ b/llvm/lib/Analysis/ValueTracking.cpp @@ -7296,10 +7296,13 @@ static bool isGuaranteedNotToBeUndefOrPoison( isa<ConstantPointerNull>(C) || isa<Function>(C)) return true; - if (C->getType()->isVectorTy() && !isa<ConstantExpr>(C)) - return (!includesUndef(Kind) ? !C->containsPoisonElement() - : !C->containsUndefOrPoisonElement()) && - !C->containsConstantExpression(); + if (C->getType()->isVectorTy() && !isa<ConstantExpr>(C)) { + if (includesUndef(Kind) && C->containsUndefElement()) + return false; + if (includesPoison(Kind) && C->containsPoisonElement()) + return false; + return !C->containsConstantExpression(); + } } // Strip cast operations from a pointer value. 
diff --git a/llvm/lib/Analysis/VectorUtils.cpp b/llvm/lib/Analysis/VectorUtils.cpp index 9170942..30728ed 100644 --- a/llvm/lib/Analysis/VectorUtils.cpp +++ b/llvm/lib/Analysis/VectorUtils.cpp @@ -68,6 +68,7 @@ bool llvm::isTriviallyVectorizable(Intrinsic::ID ID) { case Intrinsic::sqrt: // Begin floating-point. case Intrinsic::sin: case Intrinsic::cos: + case Intrinsic::tan: case Intrinsic::exp: case Intrinsic::exp2: case Intrinsic::log: diff --git a/llvm/lib/AsmParser/LLLexer.cpp b/llvm/lib/AsmParser/LLLexer.cpp index d3ab306..7d7fe19 100644 --- a/llvm/lib/AsmParser/LLLexer.cpp +++ b/llvm/lib/AsmParser/LLLexer.cpp @@ -604,6 +604,7 @@ lltok::Kind LLLexer::LexIdentifier() { KEYWORD(aarch64_vector_pcs); KEYWORD(aarch64_sve_vector_pcs); KEYWORD(aarch64_sme_preservemost_from_x0); + KEYWORD(aarch64_sme_preservemost_from_x1); KEYWORD(aarch64_sme_preservemost_from_x2); KEYWORD(msp430_intrcc); KEYWORD(avr_intrcc); diff --git a/llvm/lib/AsmParser/LLParser.cpp b/llvm/lib/AsmParser/LLParser.cpp index 07c8aa2..f0fde9a 100644 --- a/llvm/lib/AsmParser/LLParser.cpp +++ b/llvm/lib/AsmParser/LLParser.cpp @@ -2153,6 +2153,7 @@ void LLParser::parseOptionalDLLStorageClass(unsigned &Res) { /// ::= 'aarch64_vector_pcs' /// ::= 'aarch64_sve_vector_pcs' /// ::= 'aarch64_sme_preservemost_from_x0' +/// ::= 'aarch64_sme_preservemost_from_x1' /// ::= 'aarch64_sme_preservemost_from_x2' /// ::= 'msp430_intrcc' /// ::= 'avr_intrcc' @@ -2212,6 +2213,9 @@ bool LLParser::parseOptionalCallingConv(unsigned &CC) { case lltok::kw_aarch64_sme_preservemost_from_x0: CC = CallingConv::AArch64_SME_ABI_Support_Routines_PreserveMost_From_X0; break; + case lltok::kw_aarch64_sme_preservemost_from_x1: + CC = CallingConv::AArch64_SME_ABI_Support_Routines_PreserveMost_From_X1; + break; case lltok::kw_aarch64_sme_preservemost_from_x2: CC = CallingConv::AArch64_SME_ABI_Support_Routines_PreserveMost_From_X2; break; diff --git a/llvm/lib/CodeGen/GlobalISel/CombinerHelperVectorOps.cpp 
b/llvm/lib/CodeGen/GlobalISel/CombinerHelperVectorOps.cpp index b4765fb..66b1c5f 100644 --- a/llvm/lib/CodeGen/GlobalISel/CombinerHelperVectorOps.cpp +++ b/llvm/lib/CodeGen/GlobalISel/CombinerHelperVectorOps.cpp @@ -6,7 +6,8 @@ // //===----------------------------------------------------------------------===// // -// This file implements CombinerHelper for G_EXTRACT_VECTOR_ELT. +// This file implements CombinerHelper for G_EXTRACT_VECTOR_ELT, +// G_INSERT_VECTOR_ELT, and G_VSCALE // //===----------------------------------------------------------------------===// #include "llvm/CodeGen/GlobalISel/CombinerHelper.h" @@ -400,3 +401,86 @@ bool CombinerHelper::matchInsertVectorElementOOB(MachineInstr &MI, return false; } + +bool CombinerHelper::matchAddOfVScale(const MachineOperand &MO, + BuildFnTy &MatchInfo) { + GAdd *Add = cast<GAdd>(MRI.getVRegDef(MO.getReg())); + GVScale *LHSVScale = cast<GVScale>(MRI.getVRegDef(Add->getLHSReg())); + GVScale *RHSVScale = cast<GVScale>(MRI.getVRegDef(Add->getRHSReg())); + + Register Dst = Add->getReg(0); + + if (!MRI.hasOneNonDBGUse(LHSVScale->getReg(0)) || + !MRI.hasOneNonDBGUse(RHSVScale->getReg(0))) + return false; + + MatchInfo = [=](MachineIRBuilder &B) { + B.buildVScale(Dst, LHSVScale->getSrc() + RHSVScale->getSrc()); + }; + + return true; +} + +bool CombinerHelper::matchMulOfVScale(const MachineOperand &MO, + BuildFnTy &MatchInfo) { + GMul *Mul = cast<GMul>(MRI.getVRegDef(MO.getReg())); + GVScale *LHSVScale = cast<GVScale>(MRI.getVRegDef(Mul->getLHSReg())); + + std::optional<APInt> MaybeRHS = getIConstantVRegVal(Mul->getRHSReg(), MRI); + if (!MaybeRHS) + return false; + + Register Dst = MO.getReg(); + + if (!MRI.hasOneNonDBGUse(LHSVScale->getReg(0))) + return false; + + MatchInfo = [=](MachineIRBuilder &B) { + B.buildVScale(Dst, LHSVScale->getSrc() * *MaybeRHS); + }; + + return true; +} + +bool CombinerHelper::matchSubOfVScale(const MachineOperand &MO, + BuildFnTy &MatchInfo) { + GSub *Sub = 
cast<GSub>(MRI.getVRegDef(MO.getReg())); + GVScale *RHSVScale = cast<GVScale>(MRI.getVRegDef(Sub->getRHSReg())); + + Register Dst = MO.getReg(); + LLT DstTy = MRI.getType(Dst); + + if (!MRI.hasOneNonDBGUse(RHSVScale->getReg(0)) || + !isLegalOrBeforeLegalizer({TargetOpcode::G_ADD, DstTy})) + return false; + + MatchInfo = [=](MachineIRBuilder &B) { + auto VScale = B.buildVScale(DstTy, -RHSVScale->getSrc()); + B.buildAdd(Dst, Sub->getLHSReg(), VScale, Sub->getFlags()); + }; + + return true; +} + +bool CombinerHelper::matchShlOfVScale(const MachineOperand &MO, + BuildFnTy &MatchInfo) { + GShl *Shl = cast<GShl>(MRI.getVRegDef(MO.getReg())); + GVScale *LHSVScale = cast<GVScale>(MRI.getVRegDef(Shl->getSrcReg())); + + std::optional<APInt> MaybeRHS = getIConstantVRegVal(Shl->getShiftReg(), MRI); + if (!MaybeRHS) + return false; + + Register Dst = MO.getReg(); + LLT DstTy = MRI.getType(Dst); + + if (!MRI.hasOneNonDBGUse(LHSVScale->getReg(0)) || + !isLegalOrBeforeLegalizer({TargetOpcode::G_VSCALE, DstTy})) + return false; + + MatchInfo = [=](MachineIRBuilder &B) { + B.buildVScale(Dst, LHSVScale->getSrc().shl(*MaybeRHS)); + }; + + return true; +} diff --git a/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp b/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp index 6f0cae2..9830b52 100644 --- a/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp +++ b/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp @@ -449,6 +449,8 @@ static RTLIB::Libcall getRTLibDesc(unsigned Opcode, unsigned Size) { RTLIBCASE(SIN_F); case TargetOpcode::G_FCOS: RTLIBCASE(COS_F); + case TargetOpcode::G_FTAN: + RTLIBCASE(TAN_F); case TargetOpcode::G_FLOG10: RTLIBCASE(LOG10_F); case TargetOpcode::G_FLOG: @@ -1037,6 +1039,7 @@ LegalizerHelper::libcall(MachineInstr &MI, LostDebugLocObserver &LocObserver) { case TargetOpcode::G_FREM: case TargetOpcode::G_FCOS: case TargetOpcode::G_FSIN: + case TargetOpcode::G_FTAN: case TargetOpcode::G_FLOG10: case TargetOpcode::G_FLOG: case TargetOpcode::G_FLOG2: @@ -2893,6 +2896,7 @@ 
LegalizerHelper::widenScalar(MachineInstr &MI, unsigned TypeIdx, LLT WideTy) { case TargetOpcode::G_FFLOOR: case TargetOpcode::G_FCOS: case TargetOpcode::G_FSIN: + case TargetOpcode::G_FTAN: case TargetOpcode::G_FLOG10: case TargetOpcode::G_FLOG: case TargetOpcode::G_FLOG2: @@ -4659,6 +4663,7 @@ LegalizerHelper::fewerElementsVector(MachineInstr &MI, unsigned TypeIdx, case G_INTRINSIC_TRUNC: case G_FCOS: case G_FSIN: + case G_FTAN: case G_FSQRT: case G_BSWAP: case G_BITREVERSE: diff --git a/llvm/lib/CodeGen/GlobalISel/Utils.cpp b/llvm/lib/CodeGen/GlobalISel/Utils.cpp index e8438be..129e696 100644 --- a/llvm/lib/CodeGen/GlobalISel/Utils.cpp +++ b/llvm/lib/CodeGen/GlobalISel/Utils.cpp @@ -833,6 +833,7 @@ bool llvm::isKnownNeverNaN(Register Val, const MachineRegisterInfo &MRI, case TargetOpcode::G_FREM: case TargetOpcode::G_FSIN: case TargetOpcode::G_FCOS: + case TargetOpcode::G_FTAN: case TargetOpcode::G_FMA: case TargetOpcode::G_FMAD: if (SNaN) @@ -1713,6 +1714,7 @@ bool llvm::isPreISelGenericFloatingPointOpcode(unsigned Opc) { case TargetOpcode::G_FREM: case TargetOpcode::G_FRINT: case TargetOpcode::G_FSIN: + case TargetOpcode::G_FTAN: case TargetOpcode::G_FSQRT: case TargetOpcode::G_FSUB: case TargetOpcode::G_INTRINSIC_ROUND: diff --git a/llvm/lib/CodeGen/RegisterCoalescer.cpp b/llvm/lib/CodeGen/RegisterCoalescer.cpp index 3397bd0..a808a54 100644 --- a/llvm/lib/CodeGen/RegisterCoalescer.cpp +++ b/llvm/lib/CodeGen/RegisterCoalescer.cpp @@ -1339,14 +1339,13 @@ bool RegisterCoalescer::reMaterializeTrivialDef(const CoalescerPair &CP, if (SrcIdx && DstIdx) return false; - [[maybe_unused]] const unsigned DefSubIdx = DefMI->getOperand(0).getSubReg(); + const unsigned DefSubIdx = DefMI->getOperand(0).getSubReg(); const TargetRegisterClass *DefRC = TII->getRegClass(MCID, 0, TRI, *MF); if (!DefMI->isImplicitDef()) { if (DstReg.isPhysical()) { Register NewDstReg = DstReg; - unsigned NewDstIdx = TRI->composeSubRegIndices(CP.getSrcIdx(), - DefMI->getOperand(0).getSubReg()); + 
unsigned NewDstIdx = TRI->composeSubRegIndices(CP.getSrcIdx(), DefSubIdx); if (NewDstIdx) NewDstReg = TRI->getSubReg(DstReg, NewDstIdx); diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp index 9a53590..02cd125 100644 --- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -4041,17 +4041,11 @@ SDValue DAGCombiner::visitSUB(SDNode *N) { return DAG.getNode(ISD::ADD, DL, VT, N0, SExt); } - // fold Y = sra (X, size(X)-1); sub (xor (X, Y), Y) -> (abs X) - if (TLI.isOperationLegalOrCustom(ISD::ABS, VT)) { - if (N0.getOpcode() == ISD::XOR && N1.getOpcode() == ISD::SRA) { - SDValue X0 = N0.getOperand(0), X1 = N0.getOperand(1); - SDValue S0 = N1.getOperand(0); - if ((X0 == S0 && X1 == N1) || (X0 == N1 && X1 == S0)) - if (ConstantSDNode *C = isConstOrConstSplat(N1.getOperand(1))) - if (C->getAPIntValue() == (BitWidth - 1)) - return DAG.getNode(ISD::ABS, DL, VT, S0); - } - } + // fold B = sra (A, size(A)-1); sub (xor (A, B), B) -> (abs A) + if (hasOperation(ISD::ABS, VT) && + sd_match(N1, m_Sra(m_Value(A), m_SpecificInt(BitWidth - 1))) && + sd_match(N0, m_Xor(m_Specific(A), m_Specific(N1)))) + return DAG.getNode(ISD::ABS, DL, VT, A); // If the relocation model supports it, consider symbol offsets. if (GlobalAddressSDNode *GA = dyn_cast<GlobalAddressSDNode>(N0)) diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp index 8cd2bb6..27c45ca 100644 --- a/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp @@ -4514,6 +4514,11 @@ void SelectionDAGLegalize::ConvertNodeToLibcall(SDNode *Node) { RTLIB::COS_F80, RTLIB::COS_F128, RTLIB::COS_PPCF128, Results); break; + case ISD::FTAN: + case ISD::STRICT_FTAN: + ExpandFPLibCall(Node, RTLIB::TAN_F32, RTLIB::TAN_F64, RTLIB::TAN_F80, + RTLIB::TAN_F128, RTLIB::TAN_PPCF128, Results); + break; case ISD::FSINCOS: // Expand into sincos libcall. 
ExpandSinCosLibCall(Node, Results); @@ -5468,6 +5473,7 @@ void SelectionDAGLegalize::PromoteNode(SDNode *Node) { case ISD::FSQRT: case ISD::FSIN: case ISD::FCOS: + case ISD::FTAN: case ISD::FLOG: case ISD::FLOG2: case ISD::FLOG10: @@ -5492,6 +5498,7 @@ void SelectionDAGLegalize::PromoteNode(SDNode *Node) { case ISD::STRICT_FSQRT: case ISD::STRICT_FSIN: case ISD::STRICT_FCOS: + case ISD::STRICT_FTAN: case ISD::STRICT_FLOG: case ISD::STRICT_FLOG2: case ISD::STRICT_FLOG10: diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp index fb1424f..aa116c9 100644 --- a/llvm/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp @@ -131,6 +131,8 @@ void DAGTypeLegalizer::SoftenFloatResult(SDNode *N, unsigned ResNo) { case ISD::FSQRT: R = SoftenFloatRes_FSQRT(N); break; case ISD::STRICT_FSUB: case ISD::FSUB: R = SoftenFloatRes_FSUB(N); break; + case ISD::STRICT_FTAN: + case ISD::FTAN: R = SoftenFloatRes_FTAN(N); break; case ISD::STRICT_FTRUNC: case ISD::FTRUNC: R = SoftenFloatRes_FTRUNC(N); break; case ISD::LOAD: R = SoftenFloatRes_LOAD(N); break; @@ -774,6 +776,12 @@ SDValue DAGTypeLegalizer::SoftenFloatRes_FSUB(SDNode *N) { RTLIB::SUB_PPCF128)); } +SDValue DAGTypeLegalizer::SoftenFloatRes_FTAN(SDNode *N) { + return SoftenFloatRes_Unary( + N, GetFPLibCall(N->getValueType(0), RTLIB::TAN_F32, RTLIB::TAN_F64, + RTLIB::TAN_F80, RTLIB::TAN_F128, RTLIB::TAN_PPCF128)); +} + SDValue DAGTypeLegalizer::SoftenFloatRes_FTRUNC(SDNode *N) { return SoftenFloatRes_Unary(N, GetFPLibCall(N->getValueType(0), RTLIB::TRUNC_F32, @@ -1330,7 +1338,7 @@ void DAGTypeLegalizer::ExpandFloatResult(SDNode *N, unsigned ResNo) { #endif report_fatal_error("Do not know how to expand the result of this " "operator!"); - + // clang-format off case ISD::UNDEF: SplitRes_UNDEF(N, Lo, Hi); break; case ISD::SELECT: SplitRes_Select(N, Lo, Hi); break; case ISD::SELECT_CC: SplitRes_SELECT_CC(N, Lo, Hi); 
break; @@ -1399,6 +1407,8 @@ void DAGTypeLegalizer::ExpandFloatResult(SDNode *N, unsigned ResNo) { case ISD::FSQRT: ExpandFloatRes_FSQRT(N, Lo, Hi); break; case ISD::STRICT_FSUB: case ISD::FSUB: ExpandFloatRes_FSUB(N, Lo, Hi); break; + case ISD::STRICT_FTAN: + case ISD::FTAN: ExpandFloatRes_FTAN(N, Lo, Hi); break; case ISD::STRICT_FTRUNC: case ISD::FTRUNC: ExpandFloatRes_FTRUNC(N, Lo, Hi); break; case ISD::LOAD: ExpandFloatRes_LOAD(N, Lo, Hi); break; @@ -1408,6 +1418,7 @@ void DAGTypeLegalizer::ExpandFloatResult(SDNode *N, unsigned ResNo) { case ISD::UINT_TO_FP: ExpandFloatRes_XINT_TO_FP(N, Lo, Hi); break; case ISD::STRICT_FREM: case ISD::FREM: ExpandFloatRes_FREM(N, Lo, Hi); break; + // clang-format on } // If Lo/Hi is null, the sub-method took care of registering results etc. @@ -1768,6 +1779,15 @@ void DAGTypeLegalizer::ExpandFloatRes_FSUB(SDNode *N, SDValue &Lo, RTLIB::SUB_PPCF128), Lo, Hi); } +void DAGTypeLegalizer::ExpandFloatRes_FTAN(SDNode *N, SDValue &Lo, + SDValue &Hi) { + ExpandFloatRes_Unary(N, + GetFPLibCall(N->getValueType(0), RTLIB::TAN_F32, + RTLIB::TAN_F64, RTLIB::TAN_F80, + RTLIB::TAN_F128, RTLIB::TAN_PPCF128), + Lo, Hi); +} + void DAGTypeLegalizer::ExpandFloatRes_FTRUNC(SDNode *N, SDValue &Lo, SDValue &Hi) { ExpandFloatRes_Unary(N, GetFPLibCall(N->getValueType(0), @@ -2479,6 +2499,7 @@ void DAGTypeLegalizer::PromoteFloatResult(SDNode *N, unsigned ResNo) { case ISD::FSIN: case ISD::FSQRT: case ISD::FTRUNC: + case ISD::FTAN: case ISD::FCANONICALIZE: R = PromoteFloatRes_UnaryOp(N); break; // Binary FP Operations @@ -2914,6 +2935,7 @@ void DAGTypeLegalizer::SoftPromoteHalfResult(SDNode *N, unsigned ResNo) { case ISD::FSIN: case ISD::FSQRT: case ISD::FTRUNC: + case ISD::FTAN: case ISD::FCANONICALIZE: R = SoftPromoteHalfRes_UnaryOp(N); break; // Binary FP Operations diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h b/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h index bec9cb4..2350b56 100644 --- a/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h 
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h @@ -586,6 +586,7 @@ private: SDValue SoftenFloatRes_FSIN(SDNode *N); SDValue SoftenFloatRes_FSQRT(SDNode *N); SDValue SoftenFloatRes_FSUB(SDNode *N); + SDValue SoftenFloatRes_FTAN(SDNode *N); SDValue SoftenFloatRes_FTRUNC(SDNode *N); SDValue SoftenFloatRes_LOAD(SDNode *N); SDValue SoftenFloatRes_ATOMIC_LOAD(SDNode *N); @@ -635,6 +636,7 @@ private: SDValue &Lo, SDValue &Hi); void ExpandFloatRes_Binary(SDNode *N, RTLIB::Libcall LC, SDValue &Lo, SDValue &Hi); + // clang-format off void ExpandFloatRes_FABS (SDNode *N, SDValue &Lo, SDValue &Hi); void ExpandFloatRes_FMINNUM (SDNode *N, SDValue &Lo, SDValue &Hi); void ExpandFloatRes_FMAXNUM (SDNode *N, SDValue &Lo, SDValue &Hi); @@ -667,9 +669,11 @@ private: void ExpandFloatRes_FSIN (SDNode *N, SDValue &Lo, SDValue &Hi); void ExpandFloatRes_FSQRT (SDNode *N, SDValue &Lo, SDValue &Hi); void ExpandFloatRes_FSUB (SDNode *N, SDValue &Lo, SDValue &Hi); + void ExpandFloatRes_FTAN (SDNode *N, SDValue &Lo, SDValue &Hi); void ExpandFloatRes_FTRUNC (SDNode *N, SDValue &Lo, SDValue &Hi); void ExpandFloatRes_LOAD (SDNode *N, SDValue &Lo, SDValue &Hi); void ExpandFloatRes_XINT_TO_FP(SDNode *N, SDValue &Lo, SDValue &Hi); + // clang-format on // Float Operand Expansion. bool ExpandFloatOperand(SDNode *N, unsigned OpNo); diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp index 6acbc04..8cdb4ba 100644 --- a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp @@ -397,6 +397,7 @@ SDValue VectorLegalizer::LegalizeOp(SDValue Op) { case ISD::FSQRT: case ISD::FSIN: case ISD::FCOS: + case ISD::FTAN: case ISD::FLDEXP: case ISD::FPOWI: case ISD::FPOW: @@ -506,7 +507,8 @@ SDValue VectorLegalizer::LegalizeOp(SDValue Op) { break; \ } \ /* Defer non-vector results to LegalizeDAG. 
*/ \ - if (!Node->getValueType(0).isVector()) { \ + if (!Node->getValueType(0).isVector() && \ + Node->getValueType(0) != MVT::Other) { \ Action = TargetLowering::Legal; \ break; \ } \ @@ -990,11 +992,8 @@ void VectorLegalizer::Expand(SDNode *Node, SmallVectorImpl<SDValue> &Results) { break; case ISD::FMINIMUM: case ISD::FMAXIMUM: - if (SDValue Expanded = TLI.expandFMINIMUM_FMAXIMUM(Node, DAG)) { - Results.push_back(Expanded); - return; - } - break; + Results.push_back(TLI.expandFMINIMUM_FMAXIMUM(Node, DAG)); + return; case ISD::SMIN: case ISD::SMAX: case ISD::UMIN: diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp index 361416e..92ce3b1 100644 --- a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp @@ -108,6 +108,7 @@ void DAGTypeLegalizer::ScalarizeVectorResult(SDNode *N, unsigned ResNo) { case ISD::FROUNDEVEN: case ISD::FSIN: case ISD::FSQRT: + case ISD::FTAN: case ISD::FTRUNC: case ISD::SIGN_EXTEND: case ISD::SINT_TO_FP: @@ -1140,6 +1141,7 @@ void DAGTypeLegalizer::SplitVectorResult(SDNode *N, unsigned ResNo) { case ISD::VP_FROUNDEVEN: case ISD::FSIN: case ISD::FSQRT: case ISD::VP_SQRT: + case ISD::FTAN: case ISD::FTRUNC: case ISD::VP_FROUNDTOZERO: case ISD::SINT_TO_FP: @@ -4400,6 +4402,7 @@ void DAGTypeLegalizer::WidenVectorResult(SDNode *N, unsigned ResNo) { case ISD::FROUNDEVEN: case ISD::FSIN: case ISD::FSQRT: + case ISD::FTAN: case ISD::FTRUNC: if (unrollExpandedOp()) break; diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp index 4a6a431..e176cf2 100644 --- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp @@ -5375,6 +5375,7 @@ bool SelectionDAG::isKnownNeverNaN(SDValue Op, bool SNaN, unsigned Depth) const case ISD::FREM: case ISD::FSIN: case ISD::FCOS: + case ISD::FTAN: case ISD::FMA: case ISD::FMAD: { if (SNaN) 
@@ -6332,7 +6333,8 @@ bool SelectionDAG::isUndef(unsigned Opcode, ArrayRef<SDValue> Ops) { } SDValue SelectionDAG::FoldConstantArithmetic(unsigned Opcode, const SDLoc &DL, - EVT VT, ArrayRef<SDValue> Ops) { + EVT VT, ArrayRef<SDValue> Ops, + SDNodeFlags Flags) { // If the opcode is a target-specific ISD node, there's nothing we can // do here and the operand rules may not line up with the below, so // bail early. @@ -6689,7 +6691,7 @@ SDValue SelectionDAG::FoldConstantArithmetic(unsigned Opcode, const SDLoc &DL, } // Constant fold the scalar operands. - SDValue ScalarResult = getNode(Opcode, DL, SVT, ScalarOps); + SDValue ScalarResult = getNode(Opcode, DL, SVT, ScalarOps, Flags); // Legalize the (integer) scalar constant if necessary. if (LegalSVT != SVT) @@ -7260,7 +7262,7 @@ SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL, EVT VT, } // Perform trivial constant folding. - if (SDValue SV = FoldConstantArithmetic(Opcode, DL, VT, {N1, N2})) + if (SDValue SV = FoldConstantArithmetic(Opcode, DL, VT, {N1, N2}, Flags)) return SV; // Canonicalize an UNDEF to the RHS, even over a constant. diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp index ba76456..2f3626f 100644 --- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp @@ -1684,7 +1684,7 @@ bool SelectionDAGBuilder::handleDebugValue(ArrayRef<const Value *> Values, if (!FragmentExpr) continue; SDDbgValue *SDV = DAG.getVRegDbgValue( - Var, *FragmentExpr, RegAndSize.first, false, DbgLoc, SDNodeOrder); + Var, *FragmentExpr, RegAndSize.first, false, DbgLoc, Order); DAG.AddDbgValue(SDV, false); Offset += RegisterSize; } @@ -1699,11 +1699,10 @@ bool SelectionDAGBuilder::handleDebugValue(ArrayRef<const Value *> Values, } // We have created a SDDbgOperand for each Value in Values. - // Should use Order instead of SDNodeOrder? 
assert(!LocationOps.empty()); - SDDbgValue *SDV = DAG.getDbgValueList(Var, Expr, LocationOps, Dependencies, - /*IsIndirect=*/false, DbgLoc, - SDNodeOrder, IsVariadic); + SDDbgValue *SDV = + DAG.getDbgValueList(Var, Expr, LocationOps, Dependencies, + /*IsIndirect=*/false, DbgLoc, Order, IsVariadic); DAG.AddDbgValue(SDV, /*isParameter=*/false); return true; } @@ -6742,6 +6741,7 @@ void SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, case Intrinsic::fabs: case Intrinsic::sin: case Intrinsic::cos: + case Intrinsic::tan: case Intrinsic::exp10: case Intrinsic::floor: case Intrinsic::ceil: @@ -6759,6 +6759,7 @@ void SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, case Intrinsic::fabs: Opcode = ISD::FABS; break; case Intrinsic::sin: Opcode = ISD::FSIN; break; case Intrinsic::cos: Opcode = ISD::FCOS; break; + case Intrinsic::tan: Opcode = ISD::FTAN; break; case Intrinsic::exp10: Opcode = ISD::FEXP10; break; case Intrinsic::floor: Opcode = ISD::FFLOOR; break; case Intrinsic::ceil: Opcode = ISD::FCEIL; break; @@ -9160,6 +9161,12 @@ void SelectionDAGBuilder::visitCall(const CallInst &I) { if (visitUnaryFloatCall(I, ISD::FCOS)) return; break; + case LibFunc_tan: + case LibFunc_tanf: + case LibFunc_tanl: + if (visitUnaryFloatCall(I, ISD::FTAN)) + return; + break; case LibFunc_sqrt: case LibFunc_sqrtf: case LibFunc_sqrtl: diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp index 2198c23..52da24b 100644 --- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp @@ -210,6 +210,8 @@ std::string SDNode::getOperationName(const SelectionDAG *G) const { case ISD::FCOS: return "fcos"; case ISD::STRICT_FCOS: return "strict_fcos"; case ISD::FSINCOS: return "fsincos"; + case ISD::FTAN: return "ftan"; + case ISD::STRICT_FTAN: return "strict_ftan"; case ISD::FTRUNC: return "ftrunc"; case ISD::STRICT_FTRUNC: return "strict_ftrunc"; case 
ISD::FFLOOR: return "ffloor"; diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp index f856c8a..e1c1a6b 100644 --- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp @@ -8427,10 +8427,6 @@ SDValue TargetLowering::expandFMINIMUM_FMAXIMUM(SDNode *N, bool IsMax = Opc == ISD::FMAXIMUM; SDNodeFlags Flags = N->getFlags(); - if (VT.isVector() && - isOperationLegalOrCustomOrPromote(Opc, VT.getScalarType())) - return SDValue(); - // First, implement comparison not propagating NaN. If no native fmin or fmax // available, use plain select with setcc instead. SDValue MinMax; @@ -8447,6 +8443,9 @@ SDValue TargetLowering::expandFMINIMUM_FMAXIMUM(SDNode *N, } else if (isOperationLegalOrCustom(CompOpc, VT)) { MinMax = DAG.getNode(CompOpc, DL, VT, LHS, RHS, Flags); } else { + if (VT.isVector() && !isOperationLegalOrCustom(ISD::VSELECT, VT)) + return DAG.UnrollVectorOp(N); + // NaN (if exists) will be propagated later, so orderness doesn't matter. SDValue Compare = DAG.getSetCC(DL, CCVT, LHS, RHS, IsMax ? 
ISD::SETGT : ISD::SETLT); @@ -9159,6 +9158,7 @@ SDValue TargetLowering::expandABS(SDNode *N, SelectionDAG &DAG, if (!IsNegative && isOperationLegal(ISD::SUB, VT) && isOperationLegal(ISD::SMAX, VT)) { SDValue Zero = DAG.getConstant(0, dl, VT); + Op = DAG.getFreeze(Op); return DAG.getNode(ISD::SMAX, dl, VT, Op, DAG.getNode(ISD::SUB, dl, VT, Zero, Op)); } @@ -9175,8 +9175,8 @@ SDValue TargetLowering::expandABS(SDNode *N, SelectionDAG &DAG, // 0 - abs(x) -> smin(x, sub(0,x)) if (IsNegative && isOperationLegal(ISD::SUB, VT) && isOperationLegal(ISD::SMIN, VT)) { - Op = DAG.getFreeze(Op); SDValue Zero = DAG.getConstant(0, dl, VT); + Op = DAG.getFreeze(Op); return DAG.getNode(ISD::SMIN, dl, VT, Op, DAG.getNode(ISD::SUB, dl, VT, Zero, Op)); } diff --git a/llvm/lib/CodeGen/TargetLoweringBase.cpp b/llvm/lib/CodeGen/TargetLoweringBase.cpp index 3aec704..8240a1f 100644 --- a/llvm/lib/CodeGen/TargetLoweringBase.cpp +++ b/llvm/lib/CodeGen/TargetLoweringBase.cpp @@ -141,6 +141,7 @@ void TargetLoweringBase::InitLibcalls(const Triple &TT) { setLibcallName(RTLIB::EXP10_F128, "exp10f128"); setLibcallName(RTLIB::SIN_F128, "sinf128"); setLibcallName(RTLIB::COS_F128, "cosf128"); + setLibcallName(RTLIB::TAN_F128, "tanf128"); setLibcallName(RTLIB::SINCOS_F128, "sincosf128"); setLibcallName(RTLIB::POW_F128, "powf128"); setLibcallName(RTLIB::POW_FINITE_F128, "__powf128_finite"); @@ -1015,7 +1016,8 @@ void TargetLoweringBase::initActions() { setOperationAction({ISD::FCBRT, ISD::FLOG, ISD::FLOG2, ISD::FLOG10, ISD::FEXP, ISD::FEXP2, ISD::FEXP10, ISD::FFLOOR, ISD::FNEARBYINT, ISD::FCEIL, ISD::FRINT, ISD::FTRUNC, ISD::LROUND, - ISD::LLROUND, ISD::LRINT, ISD::LLRINT, ISD::FROUNDEVEN}, + ISD::LLROUND, ISD::LRINT, ISD::LLRINT, ISD::FROUNDEVEN, + ISD::FTAN}, {MVT::f32, MVT::f64, MVT::f128}, Expand); // Default ISD::TRAP to expand (which turns it into abort). 
diff --git a/llvm/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldELF.cpp b/llvm/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldELF.cpp index 0046220..f44a6a4 100644 --- a/llvm/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldELF.cpp +++ b/llvm/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldELF.cpp @@ -1183,8 +1183,7 @@ void RuntimeDyldELF::resolveAArch64Branch(unsigned SectionID, StubMap::const_iterator i = Stubs.find(Value); if (i != Stubs.end()) { resolveRelocation(Section, Offset, - (uint64_t)Section.getAddressWithOffset(i->second), - RelType, 0); + Section.getLoadAddressWithOffset(i->second), RelType, 0); LLVM_DEBUG(dbgs() << " Stub function found\n"); } else if (!resolveAArch64ShortBranch(SectionID, RelI, Value)) { // Create a new stub function. @@ -1217,8 +1216,7 @@ void RuntimeDyldELF::resolveAArch64Branch(unsigned SectionID, addRelocationForSection(REmovk_g0, Value.SectionID); } resolveRelocation(Section, Offset, - reinterpret_cast<uint64_t>(Section.getAddressWithOffset( - Section.getStubOffset())), + Section.getLoadAddressWithOffset(Section.getStubOffset()), RelType, 0); Section.advanceStubOffset(getMaxStubSize()); } @@ -1349,10 +1347,9 @@ RuntimeDyldELF::processRelocationRef( // Look for an existing stub. StubMap::const_iterator i = Stubs.find(Value); if (i != Stubs.end()) { - resolveRelocation( - Section, Offset, - reinterpret_cast<uint64_t>(Section.getAddressWithOffset(i->second)), - RelType, 0); + resolveRelocation(Section, Offset, + Section.getLoadAddressWithOffset(i->second), RelType, + 0); LLVM_DEBUG(dbgs() << " Stub function found\n"); } else { // Create a new stub function. 
@@ -1367,10 +1364,10 @@ RuntimeDyldELF::processRelocationRef( else addRelocationForSection(RE, Value.SectionID); - resolveRelocation(Section, Offset, reinterpret_cast<uint64_t>( - Section.getAddressWithOffset( - Section.getStubOffset())), - RelType, 0); + resolveRelocation( + Section, Offset, + Section.getLoadAddressWithOffset(Section.getStubOffset()), RelType, + 0); Section.advanceStubOffset(getMaxStubSize()); } } else { @@ -1609,8 +1606,7 @@ RuntimeDyldELF::processRelocationRef( if (i != Stubs.end()) { // Symbol function stub already created, just relocate to it resolveRelocation(Section, Offset, - reinterpret_cast<uint64_t>( - Section.getAddressWithOffset(i->second)), + Section.getLoadAddressWithOffset(i->second), RelType, 0); LLVM_DEBUG(dbgs() << " Stub function found\n"); } else { @@ -1652,10 +1648,10 @@ RuntimeDyldELF::processRelocationRef( addRelocationForSection(REl, Value.SectionID); } - resolveRelocation(Section, Offset, reinterpret_cast<uint64_t>( - Section.getAddressWithOffset( - Section.getStubOffset())), - RelType, 0); + resolveRelocation( + Section, Offset, + Section.getLoadAddressWithOffset(Section.getStubOffset()), + RelType, 0); Section.advanceStubOffset(getMaxStubSize()); } if (IsExtern || (AbiVariant == 2 && Value.SectionID != SectionID)) { diff --git a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp index 2c4b452..92213e1 100644 --- a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp +++ b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp @@ -3961,7 +3961,7 @@ static int32_t computeHeuristicUnrollFactor(CanonicalLoopInfo *CLI) { UnrollCostEstimator UCE(L, TTI, EphValues, UP.BEInsns); // Loop is not unrollable if the loop contains certain instructions. 
- if (!UCE.canUnroll() || UCE.Convergent) { + if (!UCE.canUnroll()) { LLVM_DEBUG(dbgs() << "Loop not considered unrollable\n"); return 1; } diff --git a/llvm/lib/IR/AsmWriter.cpp b/llvm/lib/IR/AsmWriter.cpp index 7a5f18f..0bf8be9 100644 --- a/llvm/lib/IR/AsmWriter.cpp +++ b/llvm/lib/IR/AsmWriter.cpp @@ -326,6 +326,9 @@ static void PrintCallingConv(unsigned cc, raw_ostream &Out) { case CallingConv::AArch64_SME_ABI_Support_Routines_PreserveMost_From_X0: Out << "aarch64_sme_preservemost_from_x0"; break; + case CallingConv::AArch64_SME_ABI_Support_Routines_PreserveMost_From_X1: + Out << "aarch64_sme_preservemost_from_x1"; + break; case CallingConv::AArch64_SME_ABI_Support_Routines_PreserveMost_From_X2: Out << "aarch64_sme_preservemost_from_x2"; break; diff --git a/llvm/lib/IR/AutoUpgrade.cpp b/llvm/lib/IR/AutoUpgrade.cpp index a7ed2de..2f4b835 100644 --- a/llvm/lib/IR/AutoUpgrade.cpp +++ b/llvm/lib/IR/AutoUpgrade.cpp @@ -5368,8 +5368,8 @@ std::string llvm::UpgradeDataLayoutString(StringRef DL, StringRef TT) { return DL.empty() ? std::string("G1") : (DL + "-G1").str(); } - if (T.isRISCV64()) { - // Make i32 a native type for 64-bit RISC-V. + if (T.isLoongArch64() || T.isRISCV64()) { + // Make i32 a native type for 64-bit LoongArch and RISC-V. 
auto I = DL.find("-n64-"); if (I != StringRef::npos) return (DL.take_front(I) + "-n32:64-" + DL.drop_front(I + 5)).str(); diff --git a/llvm/lib/MC/WasmObjectWriter.cpp b/llvm/lib/MC/WasmObjectWriter.cpp index 985f9351..788e92f 100644 --- a/llvm/lib/MC/WasmObjectWriter.cpp +++ b/llvm/lib/MC/WasmObjectWriter.cpp @@ -877,7 +877,7 @@ void WasmObjectWriter::writeImportSection(ArrayRef<wasm::WasmImport> Imports, break; case wasm::WASM_EXTERNAL_TABLE: W->OS << char(Import.Table.ElemType); - encodeULEB128(0, W->OS); // flags + encodeULEB128(Import.Table.Limits.Flags, W->OS); encodeULEB128(NumElements, W->OS); // initial break; case wasm::WASM_EXTERNAL_TAG: @@ -1022,7 +1022,8 @@ void WasmObjectWriter::writeElemSection( encodeULEB128(TableNumber, W->OS); // the table number // init expr for starting offset - W->OS << char(wasm::WASM_OPCODE_I32_CONST); + W->OS << char(is64Bit() ? wasm::WASM_OPCODE_I64_CONST + : wasm::WASM_OPCODE_I32_CONST); encodeSLEB128(InitialTableOffset, W->OS); W->OS << char(wasm::WASM_OPCODE_END); diff --git a/llvm/lib/Object/ELFObjectFile.cpp b/llvm/lib/Object/ELFObjectFile.cpp index 2b6bdbf..cbc55a1 100644 --- a/llvm/lib/Object/ELFObjectFile.cpp +++ b/llvm/lib/Object/ELFObjectFile.cpp @@ -586,6 +586,8 @@ StringRef ELFObjectFileBase::getAMDGPUCPUName() const { return "gfx1150"; case ELF::EF_AMDGPU_MACH_AMDGCN_GFX1151: return "gfx1151"; + case ELF::EF_AMDGPU_MACH_AMDGCN_GFX1152: + return "gfx1152"; // AMDGCN GFX12. 
case ELF::EF_AMDGPU_MACH_AMDGCN_GFX1200: diff --git a/llvm/lib/ObjectYAML/ELFYAML.cpp b/llvm/lib/ObjectYAML/ELFYAML.cpp index 8e2a948..0fee299 100644 --- a/llvm/lib/ObjectYAML/ELFYAML.cpp +++ b/llvm/lib/ObjectYAML/ELFYAML.cpp @@ -611,6 +611,7 @@ void ScalarBitSetTraits<ELFYAML::ELF_EF>::bitset(IO &IO, BCaseMask(EF_AMDGPU_MACH_AMDGCN_GFX1103, EF_AMDGPU_MACH); BCaseMask(EF_AMDGPU_MACH_AMDGCN_GFX1150, EF_AMDGPU_MACH); BCaseMask(EF_AMDGPU_MACH_AMDGCN_GFX1151, EF_AMDGPU_MACH); + BCaseMask(EF_AMDGPU_MACH_AMDGCN_GFX1152, EF_AMDGPU_MACH); BCaseMask(EF_AMDGPU_MACH_AMDGCN_GFX1200, EF_AMDGPU_MACH); BCaseMask(EF_AMDGPU_MACH_AMDGCN_GFX1201, EF_AMDGPU_MACH); BCaseMask(EF_AMDGPU_MACH_AMDGCN_GFX9_GENERIC, EF_AMDGPU_MACH); diff --git a/llvm/lib/Passes/PassBuilder.cpp b/llvm/lib/Passes/PassBuilder.cpp index 316d05b..8dd060d 100644 --- a/llvm/lib/Passes/PassBuilder.cpp +++ b/llvm/lib/Passes/PassBuilder.cpp @@ -139,6 +139,7 @@ #include "llvm/Transforms/IPO/DeadArgumentElimination.h" #include "llvm/Transforms/IPO/ElimAvailExtern.h" #include "llvm/Transforms/IPO/EmbedBitcodePass.h" +#include "llvm/Transforms/IPO/ExpandVariadics.h" #include "llvm/Transforms/IPO/ForceFunctionAttrs.h" #include "llvm/Transforms/IPO/FunctionAttrs.h" #include "llvm/Transforms/IPO/FunctionImport.h" diff --git a/llvm/lib/Passes/PassRegistry.def b/llvm/lib/Passes/PassRegistry.def index 50682ca..dad9714 100644 --- a/llvm/lib/Passes/PassRegistry.def +++ b/llvm/lib/Passes/PassRegistry.def @@ -59,6 +59,7 @@ MODULE_PASS("dot-callgraph", CallGraphDOTPrinterPass()) MODULE_PASS("dxil-upgrade", DXILUpgradePass()) MODULE_PASS("elim-avail-extern", EliminateAvailableExternallyPass()) MODULE_PASS("extract-blocks", BlockExtractorPass({}, false)) +MODULE_PASS("expand-variadics", ExpandVariadicsPass(ExpandVariadicsMode::Disable)) MODULE_PASS("forceattrs", ForceFunctionAttrsPass()) MODULE_PASS("function-import", FunctionImportPass()) MODULE_PASS("globalopt", GlobalOptPass()) diff --git a/llvm/lib/ProfileData/MemProfReader.cpp 
b/llvm/lib/ProfileData/MemProfReader.cpp index fc3be71..693897f 100644 --- a/llvm/lib/ProfileData/MemProfReader.cpp +++ b/llvm/lib/ProfileData/MemProfReader.cpp @@ -690,7 +690,7 @@ Error RawMemProfReader::readNextRecord( return F; auto Iter = this->GuidToSymbolName.find(F.Function); assert(Iter != this->GuidToSymbolName.end()); - F.SymbolName = Iter->getSecond(); + F.SymbolName = std::make_unique<std::string>(Iter->getSecond()); return F; }; return MemProfReader::readNextRecord(GuidRecord, IdToFrameCallback); diff --git a/llvm/lib/Support/VirtualFileSystem.cpp b/llvm/lib/Support/VirtualFileSystem.cpp index fcefdef9..7360901 100644 --- a/llvm/lib/Support/VirtualFileSystem.cpp +++ b/llvm/lib/Support/VirtualFileSystem.cpp @@ -867,21 +867,16 @@ bool InMemoryFileSystem::addFile(const Twine &P, time_t ModificationTime, // Any intermediate directories we create should be accessible by // the owner, even if Perms says otherwise for the final path. const auto NewDirectoryPerms = ResolvedPerms | sys::fs::owner_all; + + StringRef Name = *I; while (true) { - StringRef Name = *I; - detail::InMemoryNode *Node = Dir->getChild(Name); + Name = *I; ++I; + if (I == E) + break; + detail::InMemoryNode *Node = Dir->getChild(Name); if (!Node) { - if (I == E) { - // End of the path. - Dir->addChild( - Name, MakeNode({Dir->getUniqueID(), Path, Name, ModificationTime, - std::move(Buffer), ResolvedUser, ResolvedGroup, - ResolvedType, ResolvedPerms})); - return true; - } - - // Create a new directory. Use the path up to here. + // This isn't the last element, so we create a new directory. Status Stat( StringRef(Path.str().begin(), Name.end() - Path.str().begin()), getDirectoryID(Dir->getUniqueID(), Name), @@ -891,27 +886,33 @@ bool InMemoryFileSystem::addFile(const Twine &P, time_t ModificationTime, Name, std::make_unique<detail::InMemoryDirectory>(std::move(Stat)))); continue; } + // Creating file under another file. 
+ if (!isa<detail::InMemoryDirectory>(Node)) + return false; + Dir = cast<detail::InMemoryDirectory>(Node); + } + detail::InMemoryNode *Node = Dir->getChild(Name); + if (!Node) { + Dir->addChild(Name, + MakeNode({Dir->getUniqueID(), Path, Name, ModificationTime, + std::move(Buffer), ResolvedUser, ResolvedGroup, + ResolvedType, ResolvedPerms})); + return true; + } + if (isa<detail::InMemoryDirectory>(Node)) + return ResolvedType == sys::fs::file_type::directory_file; - if (auto *NewDir = dyn_cast<detail::InMemoryDirectory>(Node)) { - Dir = NewDir; - } else { - assert((isa<detail::InMemoryFile>(Node) || - isa<detail::InMemoryHardLink>(Node)) && - "Must be either file, hardlink or directory!"); - - // Trying to insert a directory in place of a file. - if (I != E) - return false; + assert((isa<detail::InMemoryFile>(Node) || + isa<detail::InMemoryHardLink>(Node)) && + "Must be either file, hardlink or directory!"); - // Return false only if the new file is different from the existing one. - if (auto Link = dyn_cast<detail::InMemoryHardLink>(Node)) { - return Link->getResolvedFile().getBuffer()->getBuffer() == - Buffer->getBuffer(); - } - return cast<detail::InMemoryFile>(Node)->getBuffer()->getBuffer() == - Buffer->getBuffer(); - } + // Return false only if the new file is different from the existing one. 
+ if (auto *Link = dyn_cast<detail::InMemoryHardLink>(Node)) { + return Link->getResolvedFile().getBuffer()->getBuffer() == + Buffer->getBuffer(); } + return cast<detail::InMemoryFile>(Node)->getBuffer()->getBuffer() == + Buffer->getBuffer(); } bool InMemoryFileSystem::addFile(const Twine &P, time_t ModificationTime, diff --git a/llvm/lib/Target/AArch64/AArch64.td b/llvm/lib/Target/AArch64/AArch64.td index 4b2ce0d..5708b61 100644 --- a/llvm/lib/Target/AArch64/AArch64.td +++ b/llvm/lib/Target/AArch64/AArch64.td @@ -85,6 +85,10 @@ def SMEUnsupported : AArch64Unsupported { SME2Unsupported.F); } +def MTEUnsupported : AArch64Unsupported { + let F = [HasMTE]; +} + let F = [HasPAuth, HasPAuthLR] in def PAUnsupported : AArch64Unsupported; @@ -109,6 +113,7 @@ include "AArch64SchedNeoverseN1.td" include "AArch64SchedNeoverseN2.td" include "AArch64SchedNeoverseV1.td" include "AArch64SchedNeoverseV2.td" +include "AArch64SchedOryon.td" include "AArch64Processors.td" diff --git a/llvm/lib/Target/AArch64/AArch64CallingConvention.td b/llvm/lib/Target/AArch64/AArch64CallingConvention.td index 32646c6..941990c 100644 --- a/llvm/lib/Target/AArch64/AArch64CallingConvention.td +++ b/llvm/lib/Target/AArch64/AArch64CallingConvention.td @@ -589,6 +589,14 @@ def CSR_AArch64_SME_ABI_Support_Routines_PreserveMost_From_X0 (sequence "X%u",19, 28), LR, FP)>; +// SME ABI support routines such as __arm_get_current_vg preserve most registers. +def CSR_AArch64_SME_ABI_Support_Routines_PreserveMost_From_X1 + : CalleeSavedRegs<(add (sequence "Z%u", 0, 31), + (sequence "P%u", 0, 15), + (sequence "X%u", 1, 15), + (sequence "X%u",19, 28), + LR, FP)>; + // SME ABI support routines __arm_sme_state preserves most registers. 
def CSR_AArch64_SME_ABI_Support_Routines_PreserveMost_From_X2 : CalleeSavedRegs<(add (sequence "Z%u", 0, 31), diff --git a/llvm/lib/Target/AArch64/AArch64Processors.td b/llvm/lib/Target/AArch64/AArch64Processors.td index 8d16709..a759efc 100644 --- a/llvm/lib/Target/AArch64/AArch64Processors.td +++ b/llvm/lib/Target/AArch64/AArch64Processors.td @@ -617,6 +617,27 @@ def TuneAmpere1B : SubtargetFeature<"ampere1b", "ARMProcFamily", "Ampere1B", FeatureLdpAlignedOnly, FeatureStpAlignedOnly]>; +def TuneOryon : SubtargetFeature<"oryon-1", "ARMProcFamily", + "Oryon", + "Nuvia Inc Oryon processors", [ + FeatureCrypto, + FeatureFPARMv8, + FeatureNEON, + FeatureFuseAES, + FeatureFuseAdrpAdd, + FeatureEnableSelectOptimize, + FeatureFuseCryptoEOR, + FeatureFuseAddress, + FeatureSM4, + FeatureSHA2, + FeatureSHA3, + FeatureAES, + FeatureFullFP16, + FeatureFP16FML, + FeaturePerfMon, + FeatureSPE, + FeaturePostRAScheduler, + HasV8_6aOps]>; def ProcessorFeatures { list<SubtargetFeature> A53 = [HasV8_0aOps, FeatureCRC, FeatureCrypto, @@ -806,6 +827,11 @@ def ProcessorFeatures { FeatureSHA3, FeatureAES, FeatureCSSC, FeatureWFxT, FeatureFullFP16]; + list<SubtargetFeature> Oryon = [HasV8_6aOps, FeatureNEON, FeaturePerfMon, + FeatureCrypto, FeatureRandGen, + FeaturePAuth, FeatureSM4, FeatureSHA2, + FeatureSHA3, FeatureAES]; + // ETE and TRBE are future architecture extensions. We temporarily enable them // by default for users targeting generic AArch64. 
The extensions do not // affect code generated by the compiler and can be used only by explicitly @@ -988,3 +1014,7 @@ def : ProcessorModel<"ampere1a", Ampere1Model, ProcessorFeatures.Ampere1A, def : ProcessorModel<"ampere1b", Ampere1BModel, ProcessorFeatures.Ampere1B, [TuneAmpere1B]>; + +// Qualcomm Oryon +def : ProcessorModel<"oryon-1", OryonModel, ProcessorFeatures.Oryon, + [TuneOryon]>; diff --git a/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp b/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp index e97d7e3..cc50b59 100644 --- a/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp +++ b/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp @@ -107,13 +107,22 @@ AArch64RegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const { if (MF->getFunction().getCallingConv() == CallingConv::AArch64_SME_ABI_Support_Routines_PreserveMost_From_X0) report_fatal_error( - "Calling convention AArch64_SME_ABI_Support_Routines_PreserveMost_From_X0 is " - "only supported to improve calls to SME ACLE save/restore/disable-za " + "Calling convention " + "AArch64_SME_ABI_Support_Routines_PreserveMost_From_X0 is only " + "supported to improve calls to SME ACLE save/restore/disable-za " "functions, and is not intended to be used beyond that scope."); if (MF->getFunction().getCallingConv() == + CallingConv::AArch64_SME_ABI_Support_Routines_PreserveMost_From_X1) + report_fatal_error( + "Calling convention " + "AArch64_SME_ABI_Support_Routines_PreserveMost_From_X1 is " + "only supported to improve calls to SME ACLE __arm_get_current_vg " + "function, and is not intended to be used beyond that scope."); + if (MF->getFunction().getCallingConv() == CallingConv::AArch64_SME_ABI_Support_Routines_PreserveMost_From_X2) report_fatal_error( - "Calling convention AArch64_SME_ABI_Support_Routines_PreserveMost_From_X2 is " + "Calling convention " + "AArch64_SME_ABI_Support_Routines_PreserveMost_From_X2 is " "only supported to improve calls to SME ACLE __arm_sme_state " "and is not intended to be used 
beyond that scope."); if (MF->getSubtarget<AArch64Subtarget>().getTargetLowering() @@ -153,13 +162,22 @@ AArch64RegisterInfo::getDarwinCalleeSavedRegs(const MachineFunction *MF) const { if (MF->getFunction().getCallingConv() == CallingConv::AArch64_SME_ABI_Support_Routines_PreserveMost_From_X0) report_fatal_error( - "Calling convention AArch64_SME_ABI_Support_Routines_PreserveMost_From_X0 is " + "Calling convention " + "AArch64_SME_ABI_Support_Routines_PreserveMost_From_X0 is " "only supported to improve calls to SME ACLE save/restore/disable-za " "functions, and is not intended to be used beyond that scope."); if (MF->getFunction().getCallingConv() == + CallingConv::AArch64_SME_ABI_Support_Routines_PreserveMost_From_X1) + report_fatal_error( + "Calling convention " + "AArch64_SME_ABI_Support_Routines_PreserveMost_From_X1 is " + "only supported to improve calls to SME ACLE __arm_get_current_vg " + "function, and is not intended to be used beyond that scope."); + if (MF->getFunction().getCallingConv() == CallingConv::AArch64_SME_ABI_Support_Routines_PreserveMost_From_X2) report_fatal_error( - "Calling convention AArch64_SME_ABI_Support_Routines_PreserveMost_From_X2 is " + "Calling convention " + "AArch64_SME_ABI_Support_Routines_PreserveMost_From_X2 is " "only supported to improve calls to SME ACLE __arm_sme_state " "and is not intended to be used beyond that scope."); if (MF->getFunction().getCallingConv() == CallingConv::CXX_FAST_TLS) @@ -236,6 +254,8 @@ AArch64RegisterInfo::getDarwinCallPreservedMask(const MachineFunction &MF, "Calling convention SVE_VectorCall is unsupported on Darwin."); if (CC == CallingConv::AArch64_SME_ABI_Support_Routines_PreserveMost_From_X0) return CSR_AArch64_SME_ABI_Support_Routines_PreserveMost_From_X0_RegMask; + if (CC == CallingConv::AArch64_SME_ABI_Support_Routines_PreserveMost_From_X1) + return CSR_AArch64_SME_ABI_Support_Routines_PreserveMost_From_X1_RegMask; if (CC == 
CallingConv::AArch64_SME_ABI_Support_Routines_PreserveMost_From_X2) return CSR_AArch64_SME_ABI_Support_Routines_PreserveMost_From_X2_RegMask; if (CC == CallingConv::CFGuard_Check) @@ -282,6 +302,8 @@ AArch64RegisterInfo::getCallPreservedMask(const MachineFunction &MF, : CSR_AArch64_SVE_AAPCS_RegMask; if (CC == CallingConv::AArch64_SME_ABI_Support_Routines_PreserveMost_From_X0) return CSR_AArch64_SME_ABI_Support_Routines_PreserveMost_From_X0_RegMask; + if (CC == CallingConv::AArch64_SME_ABI_Support_Routines_PreserveMost_From_X1) + return CSR_AArch64_SME_ABI_Support_Routines_PreserveMost_From_X1_RegMask; if (CC == CallingConv::AArch64_SME_ABI_Support_Routines_PreserveMost_From_X2) return CSR_AArch64_SME_ABI_Support_Routines_PreserveMost_From_X2_RegMask; if (CC == CallingConv::CFGuard_Check) @@ -643,6 +665,7 @@ bool AArch64RegisterInfo::isArgumentRegister(const MachineFunction &MF, case CallingConv::AArch64_VectorCall: case CallingConv::AArch64_SVE_VectorCall: case CallingConv::AArch64_SME_ABI_Support_Routines_PreserveMost_From_X0: + case CallingConv::AArch64_SME_ABI_Support_Routines_PreserveMost_From_X1: case CallingConv::AArch64_SME_ABI_Support_Routines_PreserveMost_From_X2: if (STI.isTargetWindows()) return HasReg(CC_AArch64_Win64PCS_ArgRegs, Reg); diff --git a/llvm/lib/Target/AArch64/AArch64SchedOryon.td b/llvm/lib/Target/AArch64/AArch64SchedOryon.td new file mode 100644 index 0000000..09d1af2 --- /dev/null +++ b/llvm/lib/Target/AArch64/AArch64SchedOryon.td @@ -0,0 +1,1659 @@ +//=- AArch64SchedOryon.td - Qualcomm Oryon CPU 001 ---*- tablegen -*-=// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file defines the scheduling model for Qualcomm Oryon +// family of processors. 
+// +//===----------------------------------------------------------------------===// + +//===----------------------------------------------------------------------===// +// Pipeline Description. + +def OryonModel : SchedMachineModel { + let IssueWidth = 14; + let MicroOpBufferSize = 376; + let LoadLatency = 4; + let MispredictPenalty = 13; // 13 cycles for mispredicted branch. + let LoopMicroOpBufferSize = 0; // Do not have a LoopMicroOpBuffer + let PostRAScheduler = 1; // Using PostRA sched. + let CompleteModel = 1; + + list<Predicate> UnsupportedFeatures = !listconcat(SVEUnsupported.F, + SMEUnsupported.F, + MTEUnsupported.F, + PAUnsupported.F, + [HasPAuth, HasCSSC]); +} + +let SchedModel = OryonModel in { + +// Issue ports. +// IXU has 6 ports p0 ~ p5 +// LSU has 4 ports p6 ~ p9(ls0 ~ ls3), p10/p11(std0, std1) has to work with ls0~ls3 +// VXU has 4 ports p12 ~ p15 + +// cross IXU/LSU/VXU resource group for FMOV P41 of VXU +// I2V +def ORYONI4FP0 : ProcResource<1>; +def ORYONI5FP1 : ProcResource<1>; +// V2I +def ORYONFP0I4 : ProcResource<1>; +def ORYONFP1I5 : ProcResource<1>; + +// store 1 for normal store instructions +def ORYONST0 : ProcResource<1>; +// store 2 for normal store instructions +def ORYONST1 : ProcResource<1>; + +// Port 0: ALU/Indirect/Direct Branch. +def ORYONP0 : ProcResource<1>; + +// Port 1: ALU/Direct Branch. +def ORYONP1 : ProcResource<1>; + +// Port 2: ALU. +def ORYONP2 : ProcResource<1>; + +// Port 3: ALU. +def ORYONP3 : ProcResource<1>; + +// Port 4: ALU. +def ORYONP4 : ProcResource<1> { + let Super = ORYONI4FP0; + let Super = ORYONFP0I4; } + +// Port 5: ALU. +def ORYONP5 : ProcResource<1> { + let Super = ORYONI5FP1; + let Super = ORYONFP1I5; } + +// Port 6: Load/Store. LS0 +def ORYONP6 : ProcResource<1> { + let Super = ORYONST0; } + +// Port 7: Load/store. LS1 +def ORYONP7 : ProcResource<1> { + let Super = ORYONST0; } + +// Port 8: Load/Store. LS2 +def ORYONP8 : ProcResource<1> { + let Super = ORYONST1; } + +// Port 9: Load/store. 
LS3 +def ORYONP9 : ProcResource<1> { + let Super = ORYONST1; } + +// Port 10: Load/Store. STD0 +def ORYONP10SD0 : ProcResource<1> { + let Super = ORYONST0; } + +// Port 11: Load/store. STD1 +def ORYONP11SD1 : ProcResource<1> { + let Super = ORYONST1; } + +// Port 12: FP/Neon/SIMD/Crypto. +def ORYONP12FP0 : ProcResource<1> { + let Super = ORYONI4FP0; + let Super = ORYONFP0I4; } + +// Port 13: FP/Neon/SIMD/Crypto. +def ORYONP13FP1 : ProcResource<1> { + let Super = ORYONI5FP1; + let Super = ORYONFP1I5; } + +// Port 14: FP/Neon/SIMD/Crypto. +def ORYONP14FP2 : ProcResource<1>; + +// Port 15: FP/Neon/SIMD/Crypto. +def ORYONP15FP3 : ProcResource<1>; + +// Define groups for the functional units on each issue port. Each group +// created will be used by a WriteRes. + +// Integer add/shift/logical/misc. instructions on port I0/I1/I2/I3/I4/I5. +def ORYONI012345 : ProcResGroup<[ORYONP0, ORYONP1, ORYONP2, + ORYONP3, ORYONP4, ORYONP5]> { + let BufferSize = 120; +} + +// Direct Conditional Branch instructions on ports I0/I1. +def ORYONI01 : ProcResGroup<[ORYONP0, ORYONP1]> { + let BufferSize = 40; +} + +// Indirect/crypto Conditional Branch instructions on ports I0. +def ORYONI0 : ProcResGroup<[ORYONP0]> { + let BufferSize = 20; +} + +// Crypto/CRC/PAU instructions on ports I2. +def ORYONI2 : ProcResGroup<[ORYONP2]> { + let BufferSize = 20; +} + +// Multiply/Multiply-ADD instructions on ports I4/I5. +def ORYONI45 : ProcResGroup<[ORYONP4, ORYONP5]> { + let BufferSize = 40; +} + +// Divide instructions on ports I5. +def ORYONI5 : ProcResGroup<[ORYONP5]> { + let BufferSize = 20; +} + +// Comparison instructions on ports I0/I1/I2/I3. +def ORYONI0123 : ProcResGroup<[ORYONP0, ORYONP1, + ORYONP2, ORYONP3]> { + let BufferSize = 80; +} + +// Load instructions on ports P6/P7/P8/P9. 
+def ORYONLD : ProcResGroup<[ORYONP6, ORYONP7, ORYONP8, ORYONP9]> { + let BufferSize = 64; +} + +// Store instructions on combo of STA/STD pipes +def ORYONST : ProcResGroup<[ORYONST0, ORYONST1]> { + let BufferSize = 64; +} + +// Arithmetic and CRYP-AED ASIMD/FP instructions on ports FP0/FP1/FP2/FP3. +def ORYONFP0123 : ProcResGroup<[ORYONP12FP0, ORYONP13FP1, + ORYONP14FP2, ORYONP15FP3]> { + let BufferSize = 192; +} + +// FP Comparison and F/I move instructions on ports FP0/FP1. +def ORYONFP01 : ProcResGroup<[ORYONP12FP0, ORYONP13FP1]> { + let BufferSize = 96; +} + +// FDIV instructions on ports FP3. +def ORYONFP3 : ProcResGroup<[ORYONP15FP3]> { + let BufferSize = 48; +} + +// CRYP-SHA instructions on ports FP1. +def ORYONFP1 : ProcResGroup<[ORYONP14FP2]> { + let BufferSize = 48; +} + +def ORYONFP2 : ProcResGroup<[ORYONP14FP2]> { + let BufferSize = 48; +} + +// Reciprocal, Square root on FP0. +def ORYONFP0 : ProcResGroup<[ORYONP12FP0]> { + let BufferSize = 48; +} + +// cross IXU/LSU/VXU resource group for FMOV P41 of VXU +// I2V +def ORYONI2V : ProcResGroup<[ORYONI4FP0, ORYONI5FP1]> { + let BufferSize = 40; +} + +// V2I +def ORYONV2I : ProcResGroup<[ORYONFP0I4, ORYONFP1I5]> { + let BufferSize = 96; +} + +// Define commonly used write types for InstRW specializations. +// All definitions follow the format: ORYONWrite_<NumCycles>Cyc_<Resources>. + +// Because of the complexity of Oryon CPU, we skip the following +// generic definitions and define each instruction specifically + +// These WriteRes entries are not used in the Oryon sched model. 
+def : WriteRes<WriteImm, []> { let Unsupported = 1; } +def : WriteRes<WriteI, []> { let Unsupported = 1; } +def : WriteRes<WriteISReg, []> { let Unsupported = 1; } +def : WriteRes<WriteIEReg, []> { let Unsupported = 1; } +def : WriteRes<WriteExtr, []> { let Unsupported = 1; } +def : WriteRes<WriteIS, []> { let Unsupported = 1; } +def : WriteRes<WriteID32, []> { let Unsupported = 1; } +def : WriteRes<WriteID64, []> { let Unsupported = 1; } +def : WriteRes<WriteIM32, []> { let Unsupported = 1; } +def : WriteRes<WriteIM64, []> { let Unsupported = 1; } +def : WriteRes<WriteBr, []> { let Unsupported = 1; } +def : WriteRes<WriteBrReg, []> { let Unsupported = 1; } +def : WriteRes<WriteLD, []> { let Unsupported = 1; } +def : WriteRes<WriteST, []> { let Unsupported = 1; } +def : WriteRes<WriteSTP, []> { let Unsupported = 1; } +def : WriteRes<WriteAdr, []> { let Unsupported = 1; } +def : WriteRes<WriteLDIdx, []> { let Unsupported = 1; } +def : WriteRes<WriteSTIdx, []> { let Unsupported = 1; } +def : WriteRes<WriteF, []> { let Unsupported = 1; } +def : WriteRes<WriteFCmp, []> { let Unsupported = 1; } +def : WriteRes<WriteFCvt, []> { let Unsupported = 1; } +def : WriteRes<WriteFCopy, []> { let Unsupported = 1; } +def : WriteRes<WriteFImm, []> { let Unsupported = 1; } +def : WriteRes<WriteFMul, []> { let Unsupported = 1; } +def : WriteRes<WriteFDiv, []> { let Unsupported = 1; } +def : WriteRes<WriteVd, []> { let Unsupported = 1; } +def : WriteRes<WriteVq, []> { let Unsupported = 1; } +def : WriteRes<WriteVLD, []> { let Unsupported = 1; } +def : WriteRes<WriteVST, []> { let Unsupported = 1; } +def : WriteRes<WriteSys, []> { let Unsupported = 1; } +def : WriteRes<WriteBarrier, []> { let Unsupported = 1; } +def : WriteRes<WriteHint, []> { let Unsupported = 1; } +def : WriteRes<WriteLDHi, []> { let Unsupported = 1; } +def : WriteRes<WriteAtomic, []> { let Unsupported = 1; } + +// These ReadAdvance entries will be defined in later implementation +def : ReadAdvance<ReadI, 0>; +def : 
ReadAdvance<ReadISReg, 0>; +def : ReadAdvance<ReadIEReg, 0>; +def : ReadAdvance<ReadIM, 0>; +def : ReadAdvance<ReadIMA, 0>; +def : ReadAdvance<ReadID, 0>; +def : ReadAdvance<ReadExtrHi, 0>; +def : ReadAdvance<ReadAdrBase, 0>; +def : ReadAdvance<ReadVLD, 0>; +def : ReadAdvance<ReadST, 0>; + + +//IXU resource definition +// 1 cycles NO pipe +def ORYONWrite_1Cyc_NONE : SchedWriteRes<[]>; + +// 1 cycles on I01. +def ORYONWrite_1Cyc_I01 : SchedWriteRes<[ORYONI01]>; + +def ORYONWrite_1Cyc_2Uops_I01 : SchedWriteRes<[ORYONI01]> { + let NumMicroOps = 2; +} + +def ORYONWrite_1Cyc_I0 : SchedWriteRes<[ORYONI0]>; + +// 7 cycles on I2. PAC*/AUT* instructions +def ORYONWrite_7Cyc_I2 : SchedWriteRes<[ORYONI2]> { + let Latency = 7; +} + +// 7 cycles on I2. PAC*/AUT* instructions +def ORYONWrite_7Cyc_3Uops_I2 : SchedWriteRes<[ORYONI2]> { + let Latency = 7; + let NumMicroOps = 3; +} + +// 9 (7+1+1) cycles on I2 and I0/I1, I0. Authentication branch instructions +// these instructions are broken down to three uops +// a. PtrAuth on pipe 2 taking 7 cycles +// b. Link Register Update on pipes 0 and 1 taking 1 cycle +// c. Indirect branch on pipe 0 taking 1 cycle + +def ORYONWrite_9Cyc_I012 : SchedWriteRes<[ORYONI2, ORYONI01]> { + let Latency = 9; + let NumMicroOps = 3; +} + +// 3 cycles on I2. 
CRC32 and CRC32C instructions +def ORYONWrite_3Cyc_I2 : SchedWriteRes<[ORYONI2]> { + let Latency = 3; +} + +// 1 cycle on I012345 +def ORYONWrite_1Cyc_I012345 : SchedWriteRes<[ORYONI012345]>; + +// 1 cycle on I0123 +def ORYONWrite_1Cyc_I0123 : SchedWriteRes<[ORYONI0123]>; + +// 1 cycle on 2 of I012345 +def ORYONWrite_1Cyc_I012345_I012345 : +SchedWriteRes<[ORYONI012345, ORYONI012345]> ; + +// 2 cycle on 2 of I0123 with ReleaseAtCycles +def ORYONWrite_2Cyc_I0123_I0123_RC : +SchedWriteRes<[ORYONI0123, ORYONI0123]> { + let Latency = 2; + let ReleaseAtCycles = [2,2]; +} + +// 2 cycle on 2 of I012345 +def ORYONWrite_2Cyc_I012345_I012345_RC : +SchedWriteRes<[ORYONI012345, ORYONI012345]> { + let Latency = 2; + let ReleaseAtCycles = [2,2]; +} + +// 3 cycle on 2 of I45 +def ORYONWrite_3Cyc_I45_I45_RC : +SchedWriteRes<[ORYONI45, ORYONI45]> { + let Latency = 3; + let ReleaseAtCycles = [2,2]; +} + +// 3 cycle on I45 +def ORYONWrite_3Cyc_I45 : SchedWriteRes<[ORYONI45]> { + let Latency = 3; +} + +// 7 cycle on I2 32-bit integer division +def ORYONWrite_7Cyc_I2_RC : SchedWriteRes<[ORYONI2]> { + let Latency = 7; + let ReleaseAtCycles = [2]; +} + +// 9 cycle on I2 64-bit integer division +def ORYONWrite_9Cyc_I2_RC : SchedWriteRes<[ORYONI2]> { + let Latency = 9; + let ReleaseAtCycles = [2]; +} + +// LSU resource definition +// need to define WriteLDAdr, WriteAdrAdr, WriteLDHi, WriteSTX +// 4 cycle on LS(P6789) +def ORYONWrite_4Cyc_LD : SchedWriteRes<[ORYONLD]> { + let Latency = 4; +} + +// 4 cycle for Post/Pre inc/dec access, also covers all pair loads Post/Pre +def ORYONWrite_4Cyc_LD_I012345 : SchedWriteRes<[ORYONLD, ORYONI012345]> { + let Latency = 4; +} + +// 5 (4+1) for VXU SIMD access/could also include FP +// resource might not be correct, as VXU resource not included +def ORYONWrite_5Cyc_LD : SchedWriteRes<[ORYONLD]> { + let Latency = 5; +} + +def ORYONWrite_5Cyc_2Uops_LD : SchedWriteRes<[ORYONLD]> { + let Latency = 5; + let NumMicroOps = 2; +} + +def ORYONWrite_5Cyc_3Uops_LD 
: SchedWriteRes<[ORYONLD]> { + let Latency = 5; + let NumMicroOps = 3; +} + +def ORYONWrite_5Cyc_4Uops_LD : SchedWriteRes<[ORYONLD]> { + let Latency = 5; + let NumMicroOps = 4; +} + +def ORYONWrite_5Cyc_5Uops_LD : SchedWriteRes<[ORYONLD]> { + let Latency = 5; + let NumMicroOps = 5; +} + +def ORYONWrite_5Cyc_6Uops_LD : SchedWriteRes<[ORYONLD]> { + let Latency = 5; + let NumMicroOps = 6; +} + +def ORYONWrite_5Cyc_8Uops_LD : SchedWriteRes<[ORYONLD]> { + let Latency = 5; + let NumMicroOps = 8; +} + +def ORYONWrite_5Cyc_10Uops_LD : SchedWriteRes<[ORYONLD]> { + let Latency = 5; + let NumMicroOps = 10; +} + +// 6 cycle for Post/Pre inc/dec access +def ORYONWrite_5Cyc_LD_I012345 : SchedWriteRes<[ORYONLD, ORYONI012345]> { + let Latency = 5; +} + +def ORYONWrite_5Cyc_2Uops_LD_I012345 : SchedWriteRes<[ORYONLD, ORYONI012345]> { + let Latency = 5; + let NumMicroOps = 2; +} + +def ORYONWrite_5Cyc_3Uops_LD_I012345 : SchedWriteRes<[ORYONLD, ORYONI012345]> { + let Latency = 5; + let NumMicroOps = 3; +} + +def ORYONWrite_5Cyc_4Uops_LD_I012345 : SchedWriteRes<[ORYONLD, ORYONI012345]> { + let Latency = 5; + let NumMicroOps = 4; +} + +def ORYONWrite_5Cyc_5Uops_LD_I012345 : SchedWriteRes<[ORYONLD, ORYONI012345]> { + let Latency = 5; + let NumMicroOps = 5; +} + +def ORYONWrite_5Cyc_6Uops_LD_I012345 : SchedWriteRes<[ORYONLD, ORYONI012345]> { + let Latency = 5; + let NumMicroOps = 6; +} + +def ORYONWrite_5Cyc_8Uops_LD_I012345 : SchedWriteRes<[ORYONLD, ORYONI012345]> { + let Latency = 5; + let NumMicroOps = 8; +} + +def ORYONWrite_5Cyc_10Uops_LD_I012345 : SchedWriteRes<[ORYONLD, ORYONI012345]> { + let Latency = 5; + let NumMicroOps = 10; +} + +// 1 cycle for all generic stores +def ORYONWrite_1Cyc_ST : SchedWriteRes<[ORYONST]>; + +def ORYONWrite_1Cyc_2Uops_ST : SchedWriteRes<[ORYONST]> { + let NumMicroOps = 2; +} + +def ORYONWrite_1Cyc_3Uops_ST : SchedWriteRes<[ORYONST]> { + let NumMicroOps = 3; +} + +def ORYONWrite_1Cyc_4Uops_ST : SchedWriteRes<[ORYONST]> { + let NumMicroOps = 4; +} + +def 
ORYONWrite_1Cyc_5Uops_ST : SchedWriteRes<[ORYONST]> { + let NumMicroOps = 5; +} + +def ORYONWrite_1Cyc_6Uops_ST : SchedWriteRes<[ORYONST]> { + let NumMicroOps = 6; +} + +def ORYONWrite_1Cyc_8Uops_ST : SchedWriteRes<[ORYONST]> { + let NumMicroOps = 8; +} + +def ORYONWrite_1Cyc_10Uops_ST : SchedWriteRes<[ORYONST]> { + let NumMicroOps = 10; +} + +// 1 cycle for neon write: float + ASIMD with Post/Pre Inc/Dec access +// also includes Pair store until further informed +def ORYONWrite_1Cyc_ST_I012345 : SchedWriteRes<[ORYONST, ORYONI012345]> { + let NumMicroOps = 3; +} + +def ORYONWrite_1Cyc_2Uops_ST_I012345 : SchedWriteRes<[ORYONST, ORYONI012345]> { + let NumMicroOps = 2; +} + +def ORYONWrite_1Cyc_3Uops_ST_I012345 : SchedWriteRes<[ORYONST, ORYONI012345]> { + let NumMicroOps = 3; +} + +def ORYONWrite_1Cyc_4Uops_ST_I012345 : SchedWriteRes<[ORYONST, ORYONI012345]> { + let NumMicroOps = 4; +} + +def ORYONWrite_1Cyc_5Uops_ST_I012345 : SchedWriteRes<[ORYONST, ORYONI012345]> { + let NumMicroOps = 5; +} + +def ORYONWrite_1Cyc_6Uops_ST_I012345 : SchedWriteRes<[ORYONST, ORYONI012345]> { + let NumMicroOps = 6; +} + +def ORYONWrite_1Cyc_8Uops_ST_I012345 : SchedWriteRes<[ORYONST, ORYONI012345]> { + let NumMicroOps = 8; +} + +def ORYONWrite_1Cyc_10Uops_ST_I012345 : SchedWriteRes<[ORYONST, ORYONI012345]> { + let NumMicroOps = 10; +} + +// VXU resource definition + +// I2V instruction has 1 uOp +// I2v with convert has 2 uOps +// all I2V, V2I's throughputs are 2 +// On VXU doc, p37 -- latencies and throughput +// P41, resource taken, P42, uOps +def ORYONWrite_I2V_4Cyc_I45 : SchedWriteRes<[ORYONI2V]> { + let Latency = 4; +} + +// inline a FCVT, so add one more uOp +def ORYONWrite_I2V_7Cyc_I45 : SchedWriteRes<[ORYONI2V]> { + let Latency = 7; + let NumMicroOps = 2; +} + +// V2I move instruction has 1/2 uOps, P42 in VXU doc +// Latency is 3, FCVT is also 3 cycle +// move + convert is 6 (3+3) cycles +// throughput is 2 +def ORYONWrite_V2I_3Cyc_FP01 : SchedWriteRes<[ORYONV2I]> { + let Latency 
= 3; +} + +// inline a FCVT, so add one more uOp +def ORYONWrite_V2I_6Cyc_FP01 : SchedWriteRes<[ORYONV2I]> { + let Latency = 6; + let NumMicroOps = 2; +} + +def ORYONWrite_V2V_2Cyc_FP0123 : SchedWriteRes<[ORYONFP0123]> { + let Latency = 2; +} + +def ORYONWrite_V2V_3Cyc_FP0123 : SchedWriteRes<[ORYONFP0123]> { + let Latency = 3; +} + +def ORYONWrite_V2V_6Cyc_FP01 : SchedWriteRes<[ORYONFP0123]> { + let Latency = 6; + let NumMicroOps = 3; +} + +def ORYONWrite_4Cyc_FP0123 : SchedWriteRes<[ORYONFP0123]> { + let Latency = 4; +} + +def ORYONWrite_3Cyc_FP0 : SchedWriteRes<[ORYONFP0]> { + let Latency = 3; +} + +def ORYONWrite_3Cyc_FP0123 : SchedWriteRes<[ORYONFP0123]> { + let Latency = 3; +} + +def ORYONWrite_3Cyc_2Uops_FP0123 : SchedWriteRes<[ORYONFP0123]> { + let Latency = 3; + let NumMicroOps = 2; +} + +def ORYONWrite_2Cyc_FP0123 : SchedWriteRes<[ORYONFP0123]> { + let Latency = 2; +} + +def ORYONWrite_2Cyc_FP01 : SchedWriteRes<[ORYONFP01]> { + let Latency = 2; +} + +// 2 cycle on FP1 +def ORYONWrite_2Cyc_FP1 : SchedWriteRes<[ORYONFP1]> { + let Latency = 2; +} + +// 3 cycle on FP1 +def ORYONWrite_3Cyc_FP1 : SchedWriteRes<[ORYONFP1]> { + let Latency = 3; +} + +// 4 cycle , 0.5 throughput on FP1 +def ORYONWrite_4Cyc_FP1_RC4 : SchedWriteRes<[ORYONFP1]> { + let Latency = 4; + let ReleaseAtCycles = [4]; +} + +// 5 cycle , 1 throughput on FP1 +def ORYONWrite_5Cyc_FP1 : SchedWriteRes<[ORYONFP1]> { + let Latency = 5; +} + +// 8 cycle , 2 throughput on FP0123 +def ORYONWrite_8Cyc_FP0123_RC : SchedWriteRes<[ORYONFP0123]> { + let Latency = 8; + let ReleaseAtCycles = [2]; +} + +def ORYONWrite_6Cyc_FP3 : SchedWriteRes<[ORYONFP3]> { + let Latency = 6; +} + +def ORYONWrite_7Cyc_FP3 : SchedWriteRes<[ORYONFP3]> { + let Latency = 7; +} + +def ORYONWrite_8Cyc_FP3 : SchedWriteRes<[ORYONFP3]> { + let Latency = 8; +} + +def ORYONWrite_9Cyc_FP3 : SchedWriteRes<[ORYONFP3]> { + let Latency = 9; +} + +def ORYONWrite_10Cyc_FP3 : SchedWriteRes<[ORYONFP3]> { + let Latency = 10; +} + +def 
ORYONWrite_8Cyc_FP3_RC : SchedWriteRes<[ORYONFP3]> { + let Latency = 8; + let ReleaseAtCycles = [2]; +} + +def ORYONWrite_10Cyc_FP3_RC : SchedWriteRes<[ORYONFP3]> { + let Latency = 10; + let ReleaseAtCycles = [2]; +} + +def ORYONWrite_13Cyc_FP3_RC : SchedWriteRes<[ORYONFP3]> { + let Latency = 13; + let ReleaseAtCycles = [2]; +} + +def ORYONWrite_4Cyc_FP0123_RC : +SchedWriteRes<[ORYONFP0123]> { + let Latency = 4; + let ReleaseAtCycles = [2]; +} + +def ORYONWrite_4Cyc_FP0123_FP0123_RC : +SchedWriteRes<[ORYONFP0123, ORYONFP0123]> { + let Latency = 4; + let NumMicroOps = 2; + let ReleaseAtCycles = [2,2]; +} + +def ORYONWrite_4Cyc_FP0123_FP0123_FP0123_RC : +SchedWriteRes<[ORYONFP0123, ORYONFP0123, ORYONFP0123]> { + let Latency = 4; + let NumMicroOps = 3; + let ReleaseAtCycles = [3,3,3]; +} + +def ORYONWrite_6Cyc_FP0123_FP0123_FP0123_FP0123_RC : +SchedWriteRes<[ORYONFP0123, ORYONFP0123, ORYONFP0123, ORYONFP0123]> { + let Latency = 6; + let NumMicroOps = 4; + let ReleaseAtCycles = [6,6,6,6]; +} + +//===----------------------------------------------------------------------===// +// Instruction Tables in IXU +//===----------------------------------------------------------------------===// + +//--- +// Arithmetic Instructions +//--- + +//1, 1, 6 +def : InstRW<[ORYONWrite_1Cyc_I012345], + (instregex "^ADD(W|X)r(i|r|x)", "^SUB(W|X)r(i|r|x)")>; + +//2,2,3 +def : InstRW<[ORYONWrite_2Cyc_I012345_I012345_RC], + (instregex "^ADD(W|X)rs", "^SUB(W|X)rs")>; + +//1,1,4 alias CMP, CMN on page 75 +def : InstRW<[ORYONWrite_1Cyc_I0123], + (instregex "^ADDS(W|X)r(i|r|x)(64)?", "^SUBS(W|X)r(i|r|x)")>; + +//2,2,2 alias CMP, CMN on page 75 +def : InstRW<[ORYONWrite_2Cyc_I0123_I0123_RC], + (instregex "^ADDS(W|X)rs", "^SUBS(W|X)rs")>; + +//1,1,4 +def : InstRW<[ORYONWrite_1Cyc_I0123], + (instregex "^ADC(W|X)r","^SBC(W|X)r", + "^ADCS(W|X)r","^SBCS(W|X)r")>; + +//1,1,2 +def : InstRW<[ORYONWrite_1Cyc_2Uops_I01], + (instrs ADR,ADRP)>; + +//1,1,4 +def : InstRW<[ORYONWrite_1Cyc_I0123], + (instregex 
"^CSEL(W|X)r", "^CSINV(W|X)r", + "^CSNEG(W|X)r", "^CSINC(W|X)r")>; + +//--- +//Compare Instruction +//--- + +// We have CCMP, CCMN as LLVM DAG node +// CMP is an alias of SUBS as above +// CMN is an alias of ADDS as above +// We also have no way to get shift compare node in LLVM +//2,2,1.5 CMP, CMN + +//1,1,4 +def : InstRW<[ORYONWrite_1Cyc_I0123], + (instregex "^CCMP(W|X)(i|r)", "^CCMN(W|X)(i|r)")>; + +//--- +// Branch +//--- + +def : InstRW<[ORYONWrite_1Cyc_NONE], (instrs B)>; +def : InstRW<[ORYONWrite_1Cyc_I01], (instrs BL)>; +def : InstRW<[ORYONWrite_1Cyc_I01], + (instrs Bcc, CBZW, CBZX, CBNZW, CBNZX, TBZW, TBZX, TBNZW, TBNZX)>; +def : InstRW<[ORYONWrite_1Cyc_I0], (instrs BR, BLR)>; +def : InstRW<[ORYONWrite_1Cyc_I0], (instrs RET)>; + +// 3 uOp, 1 cycle for branch, 7 cycle for Authentication, +// 1 cycle for updating link register +// V8.3a PAC +def : InstRW<[ORYONWrite_9Cyc_I012], + (instrs BLRAA, BLRAAZ, BLRAB, BLRABZ, + BRAA, BRAAZ, BRAB, BRABZ)>; +def : InstRW<[ORYONWrite_9Cyc_I012], (instrs RETAA, RETAB, ERETAA, ERETAB)>; + +def : InstRW<[ORYONWrite_7Cyc_3Uops_I2], (instregex "^LDRAA", "^LDRAB")>; + +// Logical Instructions +//--- + +//1,1,4 TST is an alias of ANDS +def : InstRW<[ORYONWrite_1Cyc_I0123], + (instregex "^ANDS(W|X)r(i|r|x)", "^BICS(W|X)r(i|r|x)")>; + +//2,2,2 TST shift is an alias +def : InstRW<[ORYONWrite_2Cyc_I0123_I0123_RC], + (instregex "^ANDS(W|X)rs", "^BICS(W|X)rs")>; + +//1,1,6 +def : InstRW<[ORYONWrite_1Cyc_I012345], + (instregex "^AND(W|X)r(i|r|x)", "^EOR(W|X)r(i|r|x)", + "^ORR(W|X)r(i|r|x)", "^BIC(W|X)r(i|r|x)", + "^EON(W|X)r(i|r|x)", "^ORN(W|X)r(i|r|x)")>; + +//2,2,3 +def : InstRW<[ORYONWrite_2Cyc_I012345_I012345_RC], + (instregex "^AND(W|X)rs", "^EOR(W|X)rs", "^ORR(W|X)rs", + "^BIC(W|X)rs", "^EON(W|X)rs", "^ORN(W|X)rs")>; + + +//--- +// Shift Instructions +//--- + +//1,1,6 +def : InstRW<[ORYONWrite_1Cyc_I012345], + (instregex "^ASRV(W|X)r", "^LSLV(W|X)r", + "^LSRV(W|X)r", "^RORV(W|X)r", + "RMIF")>; + +//--- +// Move-Data Bit-field 
and Sign_Extension Instructions +//--- + +//1,1,6 +def : InstRW<[ORYONWrite_1Cyc_I012345], + (instregex "^MOVK(W|X)i", "^MOVN(W|X)i", + "^MOVZ(W|X)i", "^SBFM(W|X)ri", + "^UBFM(W|X)ri", "^BFM(W|X)ri", + "^SXT(W|B|H|X)", "^UXT(H|B)")>; + +// COPY instruction is an LLVM internal DAG node, needs further study +def : InstRW<[ORYONWrite_1Cyc_I012345], (instrs COPY)>; + +//--- +// Reverse Instructions +//--- + +//1,1,6 +def : InstRW<[ORYONWrite_1Cyc_I012345], + (instregex "^RBIT(W|X)r", "^REV(16|32|64)?(W|X)r")>; + + +//--- +// Flag Manipulate Instructions +//--- + +//1,1,4 +def : InstRW<[ORYONWrite_1Cyc_I0123], + (instregex "^SETF8", "^SETF16", "^CFINV")>; + +//--- +// Miscellaneous Instructions +//--- + +//1,1,6 +def : InstRW<[ORYONWrite_1Cyc_I012345], + (instregex "^CLS(W|X)r$", "^CLZ(W|X)r$", "^EXTR(W|X)rri")>; + + +//--- +// Multiply Instructions +//--- + +//1,3,2 +def : InstRW<[ORYONWrite_3Cyc_I45], + (instregex "^MADD(W|X)rrr", "^MSUB(W|X)rrr", + "^(S|U)MADDLrrr", "^(S|U)MSUBLrrr", + "^(S|U)MULHrr")>; + +//--- +// Divide Instructions +//--- + +def : InstRW<[ORYONWrite_7Cyc_I2_RC], + (instregex "^(S|U)DIVWr")>; + +def : InstRW<[ORYONWrite_9Cyc_I2_RC], + (instregex "^(S|U)DIVXr")>; + + +//--- +// Cryptography Instructions +// +//1,3,1 on I2 +def : InstRW<[ORYONWrite_3Cyc_I2], + (instregex "^CRC32(B|H|W|X)rr", "^CRC32C(B|H|W|X)rr")>; + +//--- +// PAU instructions +//--- + +// on p47 of IXU document, we have 7 cycles for all PAU instructions +// here we just assume all signing and pauth instructions are 7 cycles +// assume all are 7 cycles here + +// signing instructions +def : InstRW<[ORYONWrite_7Cyc_I2], (instrs PACIA, PACIB, + PACDA, PACDB, + PACIZA, PACIZB, + PACDZA, PACDZB, + PACGA)>; +// authentication instructions +def : InstRW<[ORYONWrite_7Cyc_I2], (instrs AUTIA, AUTIB, + AUTDA, AUTDB, + AUTIZA, AUTIZB, + AUTDZA, AUTDZB)>; +def : InstRW<[ORYONWrite_7Cyc_I2], (instrs XPACI, XPACD)>; + 
+//===----------------------------------------------------------------------===// +// Instruction Tables in LSU +//===----------------------------------------------------------------------===// + +// 4 cycle Load-to-use from L1D$ +// Neon load with 5 cycle +// 6 cycle to STA ? +// STD cycle ? +// NEON STD + 2 + +// Load Instructions +// FP Load Instructions + +// Load pair, immed pre-index, normal +// Load pair, immed pre-index, signed words +// Load pair, immed post-index, normal +// Load pair, immed post-index, signed words +// NOTE: Handled by WriteLD, WriteLDHi, WriteAdr. + +def : InstRW<[ORYONWrite_4Cyc_LD], (instrs LDNPDi)>; +def : InstRW<[ORYONWrite_4Cyc_LD], (instrs LDNPQi)>; +def : InstRW<[ORYONWrite_4Cyc_LD], (instrs LDNPSi)>; +def : InstRW<[ORYONWrite_4Cyc_LD], (instrs LDNPWi)>; +def : InstRW<[ORYONWrite_4Cyc_LD], (instrs LDNPXi)>; + +def : InstRW<[ORYONWrite_4Cyc_LD], (instrs LDPDi)>; +def : InstRW<[ORYONWrite_4Cyc_LD], (instrs LDPQi)>; +def : InstRW<[ORYONWrite_4Cyc_LD], (instrs LDPSi)>; +def : InstRW<[ORYONWrite_4Cyc_LD], (instrs LDPSWi)>; +def : InstRW<[ORYONWrite_4Cyc_LD], (instrs LDPWi)>; +def : InstRW<[ORYONWrite_4Cyc_LD], (instrs LDPXi)>; + +def : InstRW<[ORYONWrite_4Cyc_LD], (instrs LDRBui)>; +def : InstRW<[ORYONWrite_4Cyc_LD], (instrs LDRDui)>; +def : InstRW<[ORYONWrite_4Cyc_LD], (instrs LDRHui)>; +def : InstRW<[ORYONWrite_4Cyc_LD], (instrs LDRQui)>; +def : InstRW<[ORYONWrite_4Cyc_LD], (instrs LDRSui)>; + +def : InstRW<[ORYONWrite_4Cyc_LD], (instrs LDRDl)>; +def : InstRW<[ORYONWrite_4Cyc_LD], (instrs LDRQl)>; +def : InstRW<[ORYONWrite_4Cyc_LD], (instrs LDRWl)>; +def : InstRW<[ORYONWrite_4Cyc_LD], (instrs LDRXl)>; + +def : InstRW<[ORYONWrite_4Cyc_LD], (instrs LDTRBi)>; +def : InstRW<[ORYONWrite_4Cyc_LD], (instrs LDTRHi)>; +def : InstRW<[ORYONWrite_4Cyc_LD], (instrs LDTRWi)>; +def : InstRW<[ORYONWrite_4Cyc_LD], (instrs LDTRXi)>; + +def : InstRW<[ORYONWrite_4Cyc_LD], (instrs LDTRSBWi)>; +def : InstRW<[ORYONWrite_4Cyc_LD], (instrs LDTRSBXi)>; +def 
: InstRW<[ORYONWrite_4Cyc_LD], (instrs LDTRSHWi)>; +def : InstRW<[ORYONWrite_4Cyc_LD], (instrs LDTRSHXi)>; +def : InstRW<[ORYONWrite_4Cyc_LD], (instrs LDTRSWi)>; + +def : InstRW<[ORYONWrite_4Cyc_LD_I012345], + (instrs LDPDpre)>; +def : InstRW<[ORYONWrite_4Cyc_LD_I012345], + (instrs LDPQpre)>; +def : InstRW<[ORYONWrite_4Cyc_LD_I012345], + (instrs LDPSpre)>; +def : InstRW<[ORYONWrite_4Cyc_LD_I012345], + (instrs LDPWpre)>; + +def : InstRW<[ORYONWrite_4Cyc_LD_I012345], (instrs LDRBpre)>; +def : InstRW<[ORYONWrite_4Cyc_LD_I012345], (instrs LDRDpre)>; +def : InstRW<[ORYONWrite_4Cyc_LD_I012345], (instrs LDRHpre)>; +def : InstRW<[ORYONWrite_4Cyc_LD_I012345], (instrs LDRQpre)>; +def : InstRW<[ORYONWrite_4Cyc_LD_I012345], (instrs LDRSpre)>; +def : InstRW<[ORYONWrite_4Cyc_LD_I012345], (instrs LDRWpre)>; +def : InstRW<[ORYONWrite_4Cyc_LD_I012345], (instrs LDRXpre)>; + +def : InstRW<[ORYONWrite_4Cyc_LD_I012345], (instrs LDRSBWpre)>; +def : InstRW<[ORYONWrite_4Cyc_LD_I012345], (instrs LDRSBXpre)>; +def : InstRW<[ORYONWrite_4Cyc_LD_I012345], (instrs LDRSBWpost)>; +def : InstRW<[ORYONWrite_4Cyc_LD_I012345], (instrs LDRSBXpost)>; + +def : InstRW<[ORYONWrite_4Cyc_LD_I012345], (instrs LDRSHWpre)>; +def : InstRW<[ORYONWrite_4Cyc_LD_I012345], (instrs LDRSHXpre)>; +def : InstRW<[ORYONWrite_4Cyc_LD_I012345], (instrs LDRSHWpost)>; +def : InstRW<[ORYONWrite_4Cyc_LD_I012345], (instrs LDRSHXpost)>; + +def : InstRW<[ORYONWrite_4Cyc_LD_I012345], (instrs LDRBBpre)>; +def : InstRW<[ORYONWrite_4Cyc_LD_I012345], (instrs LDRBBpost)>; + +def : InstRW<[ORYONWrite_4Cyc_LD_I012345], (instrs LDRHHpre)>; +def : InstRW<[ORYONWrite_4Cyc_LD_I012345], (instrs LDRHHpost)>; + +def : InstRW<[ORYONWrite_4Cyc_LD_I012345], + (instrs LDPDpost)>; +def : InstRW<[ORYONWrite_4Cyc_LD_I012345], + (instrs LDPQpost)>; +def : InstRW<[ORYONWrite_4Cyc_LD_I012345], + (instrs LDPSpost)>; +def : InstRW<[ORYONWrite_4Cyc_LD_I012345], + (instrs LDPWpost)>; +def : InstRW<[ORYONWrite_4Cyc_LD_I012345], + (instrs LDPXpost)>; + +def : 
InstRW<[ORYONWrite_4Cyc_LD_I012345], (instrs LDRBpost)>; +def : InstRW<[ORYONWrite_4Cyc_LD_I012345], (instrs LDRDpost)>; +def : InstRW<[ORYONWrite_4Cyc_LD_I012345], (instrs LDRHpost)>; +def : InstRW<[ORYONWrite_4Cyc_LD_I012345], (instrs LDRQpost)>; +def : InstRW<[ORYONWrite_4Cyc_LD_I012345], (instrs LDRSpost)>; +def : InstRW<[ORYONWrite_4Cyc_LD_I012345], (instrs LDRWpost)>; +def : InstRW<[ORYONWrite_4Cyc_LD_I012345], (instrs LDRXpost)>; + +def : InstRW<[ORYONWrite_4Cyc_LD], (instrs LDRBroW)>; +def : InstRW<[ORYONWrite_4Cyc_LD], (instrs LDRDroW)>; +def : InstRW<[ORYONWrite_4Cyc_LD], (instrs LDRHroW)>; +def : InstRW<[ORYONWrite_4Cyc_LD], (instrs LDRHHroW)>; +def : InstRW<[ORYONWrite_4Cyc_LD], (instrs LDRQroW)>; +def : InstRW<[ORYONWrite_4Cyc_LD], (instrs LDRSroW)>; +def : InstRW<[ORYONWrite_4Cyc_LD], (instrs LDRSHWroW)>; +def : InstRW<[ORYONWrite_4Cyc_LD], (instrs LDRSHXroW)>; +def : InstRW<[ORYONWrite_4Cyc_LD], (instrs LDRWroW)>; +def : InstRW<[ORYONWrite_4Cyc_LD], (instrs LDRXroW)>; + +def : InstRW<[ORYONWrite_4Cyc_LD], (instrs LDRBroX)>; +def : InstRW<[ORYONWrite_4Cyc_LD], (instrs LDRDroX)>; +def : InstRW<[ORYONWrite_4Cyc_LD], (instrs LDRHHroX)>; +def : InstRW<[ORYONWrite_4Cyc_LD], (instrs LDRHroX)>; +def : InstRW<[ORYONWrite_4Cyc_LD], (instrs LDRQroX)>; +def : InstRW<[ORYONWrite_4Cyc_LD], (instrs LDRSroX)>; +def : InstRW<[ORYONWrite_4Cyc_LD], (instrs LDRSHWroX)>; +def : InstRW<[ORYONWrite_4Cyc_LD], (instrs LDRSHXroX)>; +def : InstRW<[ORYONWrite_4Cyc_LD], (instrs LDRWroX)>; +def : InstRW<[ORYONWrite_4Cyc_LD], (instrs LDRXroX)>; + +def : InstRW<[ORYONWrite_4Cyc_LD], (instrs LDURBi)>; +def : InstRW<[ORYONWrite_4Cyc_LD], (instrs LDURBBi)>; +def : InstRW<[ORYONWrite_4Cyc_LD], (instrs LDURDi)>; +def : InstRW<[ORYONWrite_4Cyc_LD], (instrs LDURHi)>; +def : InstRW<[ORYONWrite_4Cyc_LD], (instrs LDURHHi)>; +def : InstRW<[ORYONWrite_4Cyc_LD], (instrs LDURQi)>; +def : InstRW<[ORYONWrite_4Cyc_LD], (instrs LDURSi)>; +def : InstRW<[ORYONWrite_4Cyc_LD], (instrs LDURXi)>; +def : 
InstRW<[ORYONWrite_4Cyc_LD], (instrs LDURSBWi)>; +def : InstRW<[ORYONWrite_4Cyc_LD], (instrs LDURSBXi)>; +def : InstRW<[ORYONWrite_4Cyc_LD], (instrs LDURSHWi)>; +def : InstRW<[ORYONWrite_4Cyc_LD], (instrs LDURSHXi)>; +def : InstRW<[ORYONWrite_4Cyc_LD], (instrs LDURSWi)>; + + + +// Store register, immed post-index +// NOTE: Handled by WriteST, ReadAdrBase + +// Store register, immed pre-index +// NOTE: Handled by WriteST + +// Store pair, immed post-index, W-form +// Store pair, immed post-indx, X-form +// Store pair, immed pre-index, W-form +// Store pair, immed pre-index, X-form +// NOTE: Handled by WriteSTP. + +def : InstRW<[ORYONWrite_1Cyc_ST], (instrs STURBi)>; +def : InstRW<[ORYONWrite_1Cyc_ST], (instrs STURBBi)>; +def : InstRW<[ORYONWrite_1Cyc_ST], (instrs STURDi)>; +def : InstRW<[ORYONWrite_1Cyc_ST], (instrs STURHi)>; +def : InstRW<[ORYONWrite_1Cyc_ST], (instrs STURHHi)>; +def : InstRW<[ORYONWrite_1Cyc_ST], (instrs STURQi)>; +def : InstRW<[ORYONWrite_1Cyc_ST], (instrs STURSi)>; +def : InstRW<[ORYONWrite_1Cyc_ST], (instrs STURWi)>; +def : InstRW<[ORYONWrite_1Cyc_ST], (instrs STURXi)>; + +def : InstRW<[ORYONWrite_1Cyc_ST], (instrs STTRBi)>; +def : InstRW<[ORYONWrite_1Cyc_ST], (instrs STTRHi)>; +def : InstRW<[ORYONWrite_1Cyc_ST], (instrs STTRWi)>; +def : InstRW<[ORYONWrite_1Cyc_ST], (instrs STTRXi)>; + +def : InstRW<[ORYONWrite_1Cyc_ST], (instrs STNPDi)>; +def : InstRW<[ORYONWrite_1Cyc_ST], (instrs STNPQi)>; +def : InstRW<[ORYONWrite_1Cyc_ST], (instrs STNPXi)>; +def : InstRW<[ORYONWrite_1Cyc_ST], (instrs STNPWi)>; + +def : InstRW<[ORYONWrite_1Cyc_ST], (instrs STPDi)>; +def : InstRW<[ORYONWrite_1Cyc_ST], (instrs STPQi)>; +def : InstRW<[ORYONWrite_1Cyc_ST], (instrs STPXi)>; +def : InstRW<[ORYONWrite_1Cyc_ST], (instrs STPWi)>; + +def : InstRW<[ORYONWrite_1Cyc_ST], (instrs STRBui)>; +def : InstRW<[ORYONWrite_1Cyc_ST], (instrs STRDui)>; +def : InstRW<[ORYONWrite_1Cyc_ST], (instrs STRHui)>; +def : InstRW<[ORYONWrite_1Cyc_ST], (instrs STRQui)>; +def : 
InstRW<[ORYONWrite_1Cyc_ST], (instrs STRXui)>; +def : InstRW<[ORYONWrite_1Cyc_ST], (instrs STRWui)>; + +def : InstRW<[ORYONWrite_1Cyc_ST_I012345], + (instrs STPDpre, STPDpost)>; +def : InstRW<[ORYONWrite_1Cyc_ST_I012345], + (instrs STPSpre, STPSpost)>; +def : InstRW<[ORYONWrite_1Cyc_ST_I012345], + (instrs STPWpre, STPWpost)>; +def : InstRW<[ORYONWrite_1Cyc_ST_I012345], + (instrs STPXpre, STPXpost)>; + +def : InstRW<[ORYONWrite_1Cyc_ST_I012345], + (instrs STRBpre, STRBpost)>; +def : InstRW<[ORYONWrite_1Cyc_ST_I012345], + (instrs STRBBpre, STRBBpost)>; +def : InstRW<[ORYONWrite_1Cyc_ST_I012345], + (instrs STRDpre, STRDpost)>; +def : InstRW<[ORYONWrite_1Cyc_ST_I012345], + (instrs STRHpre, STRHpost)>; +def : InstRW<[ORYONWrite_1Cyc_ST_I012345], + (instrs STRHHpre, STRHHpost)>; +def : InstRW<[ORYONWrite_1Cyc_ST_I012345], + (instrs STRQpre, STRQpost)>; +def : InstRW<[ORYONWrite_1Cyc_ST_I012345], + (instrs STRSpre, STRSpost)>; +def : InstRW<[ORYONWrite_1Cyc_ST_I012345], + (instrs STRWpre, STRWpost)>; +def : InstRW<[ORYONWrite_1Cyc_ST_I012345], + (instrs STRXpre, STRXpost)>; + +def : InstRW<[ORYONWrite_1Cyc_ST], + (instrs STRBroW, STRBroX)>; +def : InstRW<[ORYONWrite_1Cyc_ST], + (instrs STRDroW, STRDroX)>; +def : InstRW<[ORYONWrite_1Cyc_ST], + (instrs STRHroW, STRHroX)>; +def : InstRW<[ORYONWrite_1Cyc_ST], + (instrs STRHHroW, STRHHroX)>; +def : InstRW<[ORYONWrite_1Cyc_ST], + (instrs STRQroW, STRQroX)>; +def : InstRW<[ORYONWrite_1Cyc_ST], + (instrs STRSroW, STRSroX)>; +def : InstRW<[ORYONWrite_1Cyc_ST], + (instrs STRWroW, STRWroX)>; +def : InstRW<[ORYONWrite_1Cyc_ST], + (instrs STRXroW, STRXroX)>; + +// ASIMD Load instructions, 4 cycle access + 2 cycle NEON access +// ASIMD load, 1 element, multiple, 1 reg, D-form 1uOps +// ASIMD load, 1 element, multiple, 1 reg, Q-form 1uOps +def : InstRW<[ORYONWrite_5Cyc_LD], + (instregex "^LD1Onev(8b|4h|2s|1d|16b|8h|4s|2d)$")>; + +def : InstRW<[ORYONWrite_5Cyc_LD_I012345], + (instregex "^LD1Onev(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>; + +// 
ASIMD load, 1 element, multiple, 2 reg, D-form 3 uOps +// ASIMD load, 1 element, multiple, 2 reg, Q-form 2 uOps +def : InstRW<[ORYONWrite_5Cyc_3Uops_LD], + (instregex "^LD1Twov(8b|4h|2s|1d)$")>; + +def : InstRW<[ORYONWrite_5Cyc_2Uops_LD], + (instregex "^LD1Twov(16b|8h|4s|2d)$")>; + +def : InstRW<[ORYONWrite_5Cyc_3Uops_LD_I012345], + (instregex "^LD1Twov(8b|4h|2s|1d)_POST$")>; + +def : InstRW<[ORYONWrite_5Cyc_2Uops_LD_I012345], + (instregex "^LD1Twov(16b|8h|4s|2d)_POST$")>; + +// ASIMD load, 1 element, multiple, 3 reg, D-form 4 uOps +// ASIMD load, 1 element, multiple, 3 reg, Q-form 3 uOps +def : InstRW<[ORYONWrite_5Cyc_4Uops_LD], + (instregex "^LD1Threev(8b|4h|2s|1d)$")>; + +def : InstRW<[ORYONWrite_5Cyc_3Uops_LD], + (instregex "^LD1Threev(16b|8h|4s|2d)$")>; + +def : InstRW<[ORYONWrite_5Cyc_4Uops_LD_I012345], + (instregex "^LD1Threev(8b|4h|2s|1d)_POST$")>; + +def : InstRW<[ORYONWrite_5Cyc_3Uops_LD_I012345], + (instregex "^LD1Threev(16b|8h|4s|2d)_POST$")>; + +// ASIMD load, 1 element, multiple, 4 reg, D-form 6 uOps +// ASIMD load, 1 element, multiple, 4 reg, Q-form 4 uOps +def : InstRW<[ORYONWrite_5Cyc_6Uops_LD], + (instregex "^LD1Fourv(8b|4h|2s|1d)$")>; +def : InstRW<[ORYONWrite_5Cyc_4Uops_LD], + (instregex "^LD1Fourv(16b|8h|4s|2d)$")>; + +def : InstRW<[ORYONWrite_5Cyc_6Uops_LD_I012345], + (instregex "^LD1Fourv(8b|4h|2s|1d)_POST$")>; +def : InstRW<[ORYONWrite_5Cyc_4Uops_LD_I012345], + (instregex "^LD1Fourv(16b|8h|4s|2d)_POST$")>; + +// ASIMD load, 1 element, one lane, B/H/S 2uOps +// ASIMD load, 1 element, one lane, D 2UOps +def : InstRW<[ORYONWrite_5Cyc_2Uops_LD], (instregex "^LD1i(8|16|32|64)$")>; +def : InstRW<[ORYONWrite_5Cyc_2Uops_LD_I012345], + (instregex "^LD1i(8|16|32|64)_POST$")>; + +// ASIMD load, 1 element, all lanes, D-form, B/H/S 2uOps +// ASIMD load, 1 element, all lanes, D-form, D 2uOps +// ASIMD load, 1 element, all lanes, Q-form 2uOps +def : InstRW<[ORYONWrite_5Cyc_2Uops_LD], + (instregex "^LD1Rv(8b|4h|2s|1d|16b|8h|4s|2d)$")>; +def : 
InstRW<[ORYONWrite_5Cyc_2Uops_LD_I012345], + (instregex "^LD1Rv(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>; + +// ASIMD load, 2 element, multiple, D-form, B/H/S 3 uOps +// ASIMD load, 2 element, multiple, Q-form, D 4 uOps +def : InstRW<[ORYONWrite_5Cyc_3Uops_LD], + (instregex "^LD2Twov(8b|4h|2s)$")>; +def : InstRW<[ORYONWrite_5Cyc_4Uops_LD], + (instregex "^LD2Twov(16b|8h|4s|2d)$")>; +def : InstRW<[ORYONWrite_5Cyc_3Uops_LD_I012345], + (instregex "^LD2Twov(8b|4h|2s)_POST$")>; +def : InstRW<[ORYONWrite_5Cyc_4Uops_LD_I012345], + (instregex "^LD2Twov(16b|8h|4s|2d)_POST$")>; + +// ASIMD load, 2 element, one lane, B/H 3 uOps +// ASIMD load, 2 element, one lane, S 3 uOps +// ASIMD load, 2 element, one lane, D 3 uOps +def : InstRW<[ORYONWrite_5Cyc_3Uops_LD], (instregex "^LD2i(8|16|32|64)$")>; +def : InstRW<[ORYONWrite_5Cyc_3Uops_LD_I012345], + (instregex "^LD2i(8|16|32|64)_POST$")>; + +// ASIMD load, 2 element, all lanes, D-form, B/H/S 3 uOps +// ASIMD load, 2 element, all lanes, D-form, D 3 uOps +// ASIMD load, 2 element, all lanes, Q-form 3 uOps +def : InstRW<[ORYONWrite_5Cyc_3Uops_LD], + (instregex "^LD2Rv(8b|4h|2s|1d|16b|8h|4s|2d)$")>; +def : InstRW<[ORYONWrite_5Cyc_3Uops_LD_I012345], + (instregex "^LD2Rv(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>; + +// ASIMD load, 3 element, multiple, D-form, B/H/S 5 uOps +// ASIMD load, 3 element, multiple, Q-form, B/H/S 6 uOps +// ASIMD load, 3 element, multiple, Q-form, D 6 uOps +def : InstRW<[ORYONWrite_5Cyc_5Uops_LD], + (instregex "^LD3Threev(8b|4h|2s)$")>; +def : InstRW<[ORYONWrite_5Cyc_6Uops_LD], + (instregex "^LD3Threev(16b|8h|4s|2d)$")>; +def : InstRW<[ORYONWrite_5Cyc_5Uops_LD_I012345], + (instregex "^LD3Threev(8b|4h|2s)_POST$")>; +def : InstRW<[ORYONWrite_5Cyc_6Uops_LD_I012345], + (instregex "^LD3Threev(16b|8h|4s|2d)_POST$")>; + +// ASIMD load, 3 element, one lone, B/H 4 uOps +// ASIMD load, 3 element, one lane, S 4 uOps +// ASIMD load, 3 element, one lane, D 5 uOps +def : InstRW<[ORYONWrite_5Cyc_4Uops_LD], (instregex "^LD3i(8|16|32)$")>; 
+def : InstRW<[ORYONWrite_5Cyc_5Uops_LD], (instregex "^LD3i(64)$")>; +def : InstRW<[ORYONWrite_5Cyc_4Uops_LD_I012345], + (instregex "^LD3i(8|16|32)_POST$")>; +def : InstRW<[ORYONWrite_5Cyc_5Uops_LD_I012345], + (instregex "^LD3i(64)_POST$")>; + +// ASIMD load, 3 element, all lanes, D-form, B/H/S 4 uOps +// ASIMD load, 3 element, all lanes, D-form, D 5 uOps +// ASIMD load, 3 element, all lanes, Q-form, B/H/S 4 uOps +// ASIMD load, 3 element, all lanes, Q-form, D 5 uOps +def : InstRW<[ORYONWrite_5Cyc_4Uops_LD], + (instregex "^LD3Rv(8b|4h|2s|16b|8h|4s)$")>; +def : InstRW<[ORYONWrite_5Cyc_5Uops_LD], + (instregex "^LD3Rv(1d|2d)$")>; +def : InstRW<[ORYONWrite_5Cyc_4Uops_LD_I012345], + (instregex "^LD3Rv(8b|4h|2s|16b|8h|4s)_POST$")>; +def : InstRW<[ORYONWrite_5Cyc_5Uops_LD_I012345], + (instregex "^LD3Rv(1d|2d)_POST$")>; + +// ASIMD load, 4 element, multiple, D-form, B/H/S 6 uOps +// ASIMD load, 4 element, multiple, Q-form, B/H/S 10 uOps +// ASIMD load, 4 element, multiple, Q-form, D 8 uOps +def : InstRW<[ORYONWrite_5Cyc_6Uops_LD], + (instregex "^LD4Fourv(8b|4h|2s)$")>; +def : InstRW<[ORYONWrite_5Cyc_10Uops_LD], + (instregex "^LD4Fourv(16b|8h|4s)$")>; +def : InstRW<[ORYONWrite_5Cyc_8Uops_LD], + (instregex "^LD4Fourv(2d)$")>; +def : InstRW<[ORYONWrite_5Cyc_6Uops_LD_I012345], + (instregex "^LD4Fourv(8b|4h|2s)_POST$")>; +def : InstRW<[ORYONWrite_5Cyc_10Uops_LD_I012345], + (instregex "^LD4Fourv(16b|8h|4s)_POST$")>; +def : InstRW<[ORYONWrite_5Cyc_8Uops_LD_I012345], + (instregex "^LD4Fourv(2d)_POST$")>; + +// ASIMD load, 4 element, one lane, B/H 5 uOps +// ASIMD load, 4 element, one lane, S 5 uOps +// ASIMD load, 4 element, one lane, D 6 uOps +def : InstRW<[ORYONWrite_5Cyc_5Uops_LD], (instregex "^LD4i(8|16|32)$")>; +def : InstRW<[ORYONWrite_5Cyc_6Uops_LD], (instregex "^LD4i(64)$")>; +def : InstRW<[ORYONWrite_5Cyc_5Uops_LD_I012345], + (instregex "^LD4i(8|16|32)_POST$")>; +def : InstRW<[ORYONWrite_5Cyc_6Uops_LD_I012345], + (instregex "^LD4i(64)_POST$")>; + +// ASIMD load, 4 
element, all lanes, D-form, B/H/S 5 uOps +// ASIMD load, 4 element, all lanes, D-form, D 6 uOps +// ASIMD load, 4 element, all lanes, Q-form, B/H/S 5 uOps +// ASIMD load, 4 element, all lanes, Q-form, D 6 uOps +def : InstRW<[ORYONWrite_5Cyc_5Uops_LD], + (instregex "^LD4Rv(8b|4h|2s|16b|8h|4s)$")>; +def : InstRW<[ORYONWrite_5Cyc_6Uops_LD], + (instregex "^LD4Rv(1d|2d)$")>; +def : InstRW<[ORYONWrite_5Cyc_5Uops_LD_I012345], + (instregex "^LD4Rv(8b|4h|2s|16b|8h|4s)_POST$")>; +def : InstRW<[ORYONWrite_5Cyc_6Uops_LD_I012345], + (instregex "^LD4Rv(1d|2d)_POST$")>; + +// ASIMD Store Instructions +// ASIMD store, 1 element, multiple, 1 reg, D-form 1 uOps +// ASIMD store, 1 element, multiple, 1 reg, Q-form 1 uops +def : InstRW<[ORYONWrite_1Cyc_ST], + (instregex "^ST1Onev(8b|4h|2s|1d|16b|8h|4s|2d)$")>; +def : InstRW<[ORYONWrite_1Cyc_ST_I012345], + (instregex "^ST1Onev(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>; + +// ASIMD store, 1 element, multiple, 2 reg, D-form 2 uOps +// ASIMD store, 1 element, multiple, 2 reg, Q-form 2 uOps +def : InstRW<[ORYONWrite_1Cyc_2Uops_ST], + (instregex "^ST1Twov(8b|4h|2s|1d|16b|8h|4s|2d)$")>; +def : InstRW<[ORYONWrite_1Cyc_2Uops_ST_I012345], + (instregex "^ST1Twov(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>; + +// ASIMD store, 1 element, multiple, 3 reg, D-form 3 uOps +// ASIMD store, 1 element, multiple, 3 reg, Q-form 3 uOps +def : InstRW<[ORYONWrite_1Cyc_3Uops_ST], + (instregex "^ST1Threev(8b|4h|2s|1d|16b|8h|4s|2d)$")>; +def : InstRW<[ORYONWrite_1Cyc_3Uops_ST_I012345], + (instregex "^ST1Threev(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>; + +// ASIMD store, 1 element, multiple, 4 reg, D-form 4 uOps +// ASIMD store, 1 element, multiple, 4 reg, Q-form 4 uOps +def : InstRW<[ORYONWrite_1Cyc_4Uops_ST], + (instregex "^ST1Fourv(8b|4h|2s|1d|16b|8h|4s|2d)$")>; +def : InstRW<[ORYONWrite_1Cyc_4Uops_ST_I012345], + (instregex "^ST1Fourv(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>; + +// ASIMD store, 1 element, one lane, B/H/S 2 uOps +// ASIMD store, 1 element, one lane, D 2 uOps +def : 
InstRW<[ORYONWrite_1Cyc_2Uops_ST], + (instregex "^ST1i(8|16|32|64)$")>; +def : InstRW<[ORYONWrite_1Cyc_2Uops_ST_I012345], + (instregex "^ST1i(8|16|32|64)_POST$")>; + +// ASIMD store, 2 element, multiple, D-form, B/H/S 2 uOps +// ASIMD store, 2 element, multiple, Q-form, B/H/S 4 uOps +// ASIMD store, 2 element, multiple, Q-form, D 4 uOps +def : InstRW<[ORYONWrite_1Cyc_2Uops_ST], + (instregex "^ST2Twov(8b|4h|2s)$")>; +def : InstRW<[ORYONWrite_1Cyc_4Uops_ST], + (instregex "^ST2Twov(16b|8h|4s|2d)$")>; +def : InstRW<[ORYONWrite_1Cyc_2Uops_ST_I012345], + (instregex "^ST2Twov(8b|4h|2s)_POST$")>; +def : InstRW<[ORYONWrite_1Cyc_4Uops_ST_I012345], + (instregex "^ST2Twov(16b|8h|4s|2d)_POST$")>; + +// ASIMD store, 2 element, one lane, B/H/S 2 uOps +// ASIMD store, 2 element, one lane, D 2 uOps +def : InstRW<[ORYONWrite_1Cyc_2Uops_ST], + (instregex "^ST2i(8|16|32|64)$")>; +def : InstRW<[ORYONWrite_1Cyc_2Uops_ST_I012345], + (instregex "^ST2i(8|16|32|64)_POST$")>; + +// ASIMD store, 3 element, multiple, D-form, B/H/S 4 uOps +// ASIMD store, 3 element, multiple, Q-form, B/H/S 6 uOps +// ASIMD store, 3 element, multiple, Q-form, D 6 uOps +def : InstRW<[ORYONWrite_1Cyc_4Uops_ST], + (instregex "^ST3Threev(8b|4h|2s)$")>; +def : InstRW<[ORYONWrite_1Cyc_6Uops_ST], + (instregex "^ST3Threev(16b|8h|4s|2d)$")>; +def : InstRW<[ORYONWrite_1Cyc_4Uops_ST_I012345], + (instregex "^ST3Threev(8b|4h|2s)_POST$")>; +def : InstRW<[ORYONWrite_1Cyc_6Uops_ST_I012345], + (instregex "^ST3Threev(16b|8h|4s|2d)_POST$")>; + +// ASIMD store, 3 element, one lane, B/H 2 uOps +// ASIMD store, 3 element, one lane, S 2 uOps +// ASIMD store, 3 element, one lane, D 4 uOps +def : InstRW<[ORYONWrite_1Cyc_2Uops_ST], (instregex "^ST3i(8|16|32)$")>; +def : InstRW<[ORYONWrite_1Cyc_4Uops_ST], (instregex "^ST3i(64)$")>; +def : InstRW<[ORYONWrite_1Cyc_2Uops_ST_I012345], + (instregex "^ST3i(8|16|32)_POST$")>; +def : InstRW<[ORYONWrite_1Cyc_4Uops_ST_I012345], + (instregex "^ST3i(64)_POST$")>; + + +// ASIMD store, 4 element, 
multiple, D-form, B/H/S 5 uOps +// ASIMD store, 4 element, multiple, Q-form, B/H/S 10 uOps +// ASIMD store, 4 element, multiple, Q-form, D 8 uOps +def : InstRW<[ORYONWrite_1Cyc_5Uops_ST], + (instregex "^ST4Fourv(8b|4h|2s)$")>; +def : InstRW<[ORYONWrite_1Cyc_10Uops_ST], + (instregex "^ST4Fourv(16b|8h|4s)$")>; +def : InstRW<[ORYONWrite_1Cyc_8Uops_ST], + (instregex "^ST4Fourv(2d)$")>; +def : InstRW<[ORYONWrite_1Cyc_5Uops_ST_I012345], + (instregex "^ST4Fourv(8b|4h|2s)_POST$")>; +def : InstRW<[ORYONWrite_1Cyc_10Uops_ST_I012345], + (instregex "^ST4Fourv(16b|8h|4s)_POST$")>; +def : InstRW<[ORYONWrite_1Cyc_8Uops_ST_I012345], + (instregex "^ST4Fourv(2d)_POST$")>; + +// ASIMD store, 4 element, one lane, B/H 3 uOps +// ASIMD store, 4 element, one lane, S 3 uOps +// ASIMD store, 4 element, one lane, D 4 uOps +def : InstRW<[ORYONWrite_1Cyc_3Uops_ST], (instregex "^ST4i(8|16|32)$")>; +def : InstRW<[ORYONWrite_1Cyc_4Uops_ST], (instregex "^ST4i(64)$")>; +def : InstRW<[ORYONWrite_1Cyc_3Uops_ST_I012345], + (instregex "^ST4i(8|16|32)_POST$")>; +def : InstRW<[ORYONWrite_1Cyc_4Uops_ST_I012345], + (instregex "^ST4i(64)_POST$")>; + + +//===----------------------------------------------------------------------===// +// Instruction Tables in VXU +//===----------------------------------------------------------------------===// +// all uOps are not clearly written in the VXU document + +// I2V +def : InstRW<[ORYONWrite_I2V_4Cyc_I45], (instregex "^FMOV[HSD][WX]r", "^FMOVDXHighr")>; + +// I2V with convert +def : InstRW<[ORYONWrite_I2V_7Cyc_I45], (instregex "^[SU]CVTF[SU][XW][HSD]ri")>; + +// V2I +def : InstRW<[ORYONWrite_V2I_3Cyc_FP01], (instregex "^FMOV[WX][HSD]r", "FMOVXDHighr")>; + +// V2I with convert 2nd [SU] necessary? 
+def : InstRW<[ORYONWrite_V2I_6Cyc_FP01], (instregex "^FCVT[AMNPZ][SU][SU][XW][HSD]r")>; + +// float to float move immediate, row 7 in big chart +def : InstRW<[ORYONWrite_V2V_2Cyc_FP0123], (instregex "^FMOV[HSD]r")>; +def : InstRW<[ORYONWrite_V2V_2Cyc_FP0123], (instregex "^FMOV[HSD]i")>; + +// float to float conversion within VXU, precision conversion +def : InstRW<[ORYONWrite_V2V_6Cyc_FP01], (instregex "^FJCVTZS")>; +def : InstRW<[ORYONWrite_V2V_3Cyc_FP0123], (instregex "^FCVT[HSD][HSD]r", + "^FRINT(A|I|M|N|P|X|Z)(Sr|Dr)")>; + +// floating comparison write to NZCV +def : InstRW<[ORYONWrite_2Cyc_FP01], (instregex "^FCMP(E)?[HSD]r[ir]")>; +def : InstRW<[ORYONWrite_2Cyc_FP01], (instregex "^FCCMP(E)?[HSD]rr")>; + +// floating point conditional select +def : InstRW<[ORYONWrite_2Cyc_FP0123], (instregex "^FCSEL")>; + +// floating multiply-add +def : InstRW<[ORYONWrite_4Cyc_FP0123], (instregex "^(F|FN)MADD", "^(F|FN)MSUB")>; + +// floating unary, cycle/throughput? xls row14 +def : InstRW<[ORYONWrite_2Cyc_FP0123], (instregex "^F(ABS|NEG)[SD]r")>; + +//floating division/square root +def : InstRW<[ORYONWrite_7Cyc_FP3], (instregex "^FDIVHrr")>; +def : InstRW<[ORYONWrite_8Cyc_FP3], (instregex "^FDIVSrr")>; +def : InstRW<[ORYONWrite_10Cyc_FP3], (instregex "^FDIVDrr")>; + +def : InstRW<[ORYONWrite_8Cyc_FP3_RC], (instregex "^FSQRTHr")>; +def : InstRW<[ORYONWrite_10Cyc_FP3_RC], (instregex "^FSQRTSr")>; +def : InstRW<[ORYONWrite_13Cyc_FP3_RC], (instregex "^FSQRTDr")>; + +//========== +// SIMD move instructions +//========== + +// ASIMD DUP element +def : InstRW<[ORYONWrite_2Cyc_FP0123], (instregex "^DUPv.+lane")>; +// ASIMD DUP general thoughput undecided, 3? FP0123 +// VXU doc, p42, 2 uOps +def : InstRW<[ORYONWrite_3Cyc_2Uops_FP0123], (instregex "^DUPv.+gpr")>; + +// ASIMD insert, element to element +def : InstRW<[ORYONWrite_2Cyc_FP0123], (instregex "^INSv.+lane")>; +// ASIMD insert, gen reg 3? FP0123? 
+def : InstRW<[ORYONWrite_3Cyc_2Uops_FP0123], (instregex "^INSv.+gpr")>; + +// ASIMD move, FP immed +def : InstRW<[ORYONWrite_2Cyc_FP0123], (instregex "^FMOVv")>; + +// ASIMD transfer, element to gen reg +def : InstRW<[ORYONWrite_2Cyc_FP0123], (instregex "^[SU]MOVv")>; + +//========== +// SIMD arithmetic instructions +//========== +def : InstRW<[ORYONWrite_2Cyc_FP0123], (instregex "^ADDv", "^SUBv", + "^BIFv", "^BITv", "^BSLv", + "^ANDv", "^BICv", "^EORv", + "^ORRv", "^ORNv")>; + + +def : InstRW<[ORYONWrite_3Cyc_FP0123], (instregex "^FABDv", "^FADDv", "^FSUBv")>; + +// floating division +def : InstRW<[ORYONWrite_6Cyc_FP3], (instregex "^FDIVv.*16$")>; +def : InstRW<[ORYONWrite_7Cyc_FP3], (instregex "^FDIVv.*32$")>; +def : InstRW<[ORYONWrite_9Cyc_FP3], (instregex "^FDIVv.*64$")>; + +def : InstRW<[ORYONWrite_4Cyc_FP0123], (instregex "^FMUL(X)?v", + "^FRECPSv", "^FRSQRTSv")>; + +def : InstRW<[ORYONWrite_3Cyc_FP0123], (instregex "^MLAv","^MLSv", "^MULv", + "^PMULv", "UABAv")>; + +def : InstRW<[ORYONWrite_2Cyc_FP0123], (instregex "SABAv", "SABDv", + "^(SH|UH)(ADD|SUB)v", + "^S(MAX|MIN)v", + "^(SQ|UQ)(ADD|SUB)v", + "^(SQ|SQR|UQ|UQR)SHLv", + "^(SR|UR)HADDv", + "^(SR|UR)SHLv", + "^UABDv", + "^U(MAX|MIN)v")>; +// IMAX or UMAX in the above line +//========== +// SIMD compare instructions +//========== + +def : InstRW<[ORYONWrite_2Cyc_FP0123], (instregex "^CMEQv","^CMGEv","^CMGTv", + "^CMLEv","^CMLTv", "^CMHIv", + "^CMHSv", + "^FCMEQv", "^FCMGEv", + "^FCMGTv", "^FCMLEv", + "^FCMLTv", + "^FACGEv", "^FACGTv")>; + +//========== +// SIMD widening and narrowing arithmetic instructions +//========== +// NO need to list ADDHN2, RADDHN2, RSUBHN2 as they are not distinguished +// from ADDHN, RADDHN, RSUBHN in td file(v16i8, v8i16, v4i32). 
+def : InstRW<[ORYONWrite_2Cyc_FP0123], (instregex "^ADDHNv", + "^SUBHNv", + "^RADDHNv", + "^RSUBHNv", + "^SABD(L|L2)v", "^UABD(L|L2)v", + "^(S|U)(ADD|SUB)(L|L2|W|W2)v")>; + +def : InstRW<[ORYONWrite_3Cyc_FP0123], (instregex "^PMUL(L|L2)v","^SABA(L|L2)v", + "^(S|U|SQ)(MLA|MSL|MUL)(L|L2)v")>; + +//========== +// SIMD unary arithmetic instructions +//========== +//^MVNv is an alias of ^NOTv +def : InstRW<[ORYONWrite_2Cyc_FP0123], (instregex "^ABSv", "^CLSv","^CLZv", "^CNTv", + "^NEGv", "^NOTv", + "^RBITv", "^REV(16|32|64)v", + "^SQ(ABS|NEG)v", "^SQ(XT|XTU)(N|N2)v", + "^(SU|US)QADDv", + "^UQXT(N|N2)v", "^XTN2?v")>; + +def : InstRW<[ORYONWrite_3Cyc_FP0123], (instregex "^FCVT(L|L2|N|N2|XN|XN2)v", + "^FRINT[AIMNPXZ]v", + "^FRSQRTEv", + "^(S|U)ADALPv", + "^(S|U)ADDLPv")>; + + +def : InstRW<[ORYONWrite_3Cyc_FP0], (instregex "^URECPEv", "^URSQRTEv", + "^FRECPEv", "^FRECPXv")>; + +def : InstRW<[ORYONWrite_8Cyc_FP3_RC], (instregex "^FSQRTv.*16$")>; +def : InstRW<[ORYONWrite_10Cyc_FP3_RC], (instregex "^FSQRTv.*32$")>; +def : InstRW<[ORYONWrite_13Cyc_FP3_RC], (instregex "^FSQRTv.*64$")>; + +//========== +// SIMD binary elememt arithmetic instructions +//========== + +def : InstRW<[ORYONWrite_4Cyc_FP0123], (instregex "^FMLAv", "^FMLSv")>; + +def : InstRW<[ORYONWrite_3Cyc_FP0123], (instregex "^SQDMULHv", + "^SQRD(MLA|MLS|MUL)Hv")>; + +//========== +// SIMD permute instructions +//========== + +def : InstRW<[ORYONWrite_2Cyc_FP0123], (instregex "^EXTv", "^TRN(1|2)v", + "^UZP(1|2)v", "^ZIP(1|2)v")>; + +//========== +// SIMD immediate instructions +//========== + +def : InstRW<[ORYONWrite_2Cyc_FP0123], (instregex "^MOVIv", "^MVNIv")>; + +//========== +// SIMD shift(immediate) instructions +//========== +def : InstRW<[ORYONWrite_2Cyc_FP0123], (instregex "^RSHR(N|N2)v", "^SHLv", + "^(SHL|SHR)(N|N2)v", + "^SLIv", + "^(SQ|SQR)SHR(U)?(N|N2)v", + "^(UQ|UQR)SHR(N|N2)v", + "^SQSHLUv", + "^SRIv", + "^(S|SR|U|UR)SHRv", + "^(S|SR|U|UR)SRAv", + "^(S|U)SHL(L|L2)v")>; + +//========== +// SIMD 
floating-point and integer conversion instructions +//========== +// same as above conversion + +//========== +// SIMD reduce (acoss vector lanes) instructions +//========== + +def : InstRW<[ORYONWrite_2Cyc_FP0123], (instregex "^ADDVv", + "^(FMAX|FMIN)(V|NMV)v", + "^(S|U)ADDLVv", + "^(S|U)(MAX|MIN)Vv")>; +//========== +// SIMD pairwise arithmetic instructions +//========== + +def : InstRW<[ORYONWrite_2Cyc_FP0123], (instregex "^ADDPv", "^FADDPv", + "^(FMAX|FMIN)(NMP|P)v", + "^(S|U)(MIN|MAX)Pv")>; +//========== +// SIMD dot prodcut instructions +//========== + +def : InstRW<[ORYONWrite_3Cyc_FP0123], (instregex "^(U|S)DOTv")>; + +//========== +// SIMD table lookup instructions +//========== +// TBL 1-reg/2-reg; TBX 1-reg, 1uOp, throughput=4 latency=2 +def : InstRW<[ORYONWrite_2Cyc_FP0123], (instrs TBLv8i8One, TBLv16i8One, + TBXv8i8One, TBXv16i8One, + TBLv8i8Two, TBLv16i8Two)>; + +// TBL 3-reg/4-reg, 3uops, throughtput=4/3=1.33 latency=4 +def : InstRW<[ORYONWrite_4Cyc_FP0123_FP0123_FP0123_RC], + (instrs TBLv8i8Three, TBLv16i8Three, + TBLv8i8Four, TBLv16i8Four)>; + + +// TBX 2-reg 2 uOps, throughput=2 latency=4 +def : InstRW<[ORYONWrite_4Cyc_FP0123_FP0123_RC], (instrs TBXv8i8Two, TBXv16i8Two)>; + +// TBX 3-reg/4-reg, 4uOps, throughput=1, latency=6 +def : InstRW<[ORYONWrite_6Cyc_FP0123_FP0123_FP0123_FP0123_RC], + (instrs TBXv8i8Three, TBXv16i8Three, + TBXv8i8Four, TBXv16i8Four)>; + + +//========== +// SIMD complex number arithmetic instructions +//========== + +def : InstRW<[ORYONWrite_4Cyc_FP0123], (instregex "^FCADDv", "^FCMLAv")>; + +//========== +// SIMD cryptographic instructions +//========== +// 3,4 on IMLA, CRYP +def : InstRW<[ORYONWrite_3Cyc_FP0123], (instregex "^AES[DE]", + "^SM3(TT1|TT2)(A|B)")>; + +// 2,4 on CRYP +def : InstRW<[ORYONWrite_2Cyc_FP0123], (instregex "^AESI?MC", + "^EOR3", + "^RAX1", + "^XAR", + "^BCAX", + "^SM3SS1", + "^SM3PART(W1|W2)")>; +// 5,1 on CRYP +def : InstRW<[ORYONWrite_5Cyc_FP1], (instregex "^SM4E", + "^SM4EKEY")>; + +// 2,1 on CRYP 
+def : InstRW<[ORYONWrite_2Cyc_FP1], (instregex "^SHA1(H|SU0|SU1)", + "^SHA256SU0", + "^SHA512(SU0|SU1)")>; + +// 3,1 on CRYP +def : InstRW<[ORYONWrite_3Cyc_FP1], (instregex "^SHA256SU1", + "^SHA512(H|H2)")>; + +// 4,0.25 on CRYP +def : InstRW<[ORYONWrite_4Cyc_FP1_RC4], (instregex "^SHA1(C|P|M)", + "^SHA256(H|H2)")>; + +//========== +// SIMD v8.6 instructions +//========== +// 4,2 on IMLA +def : InstRW<[ORYONWrite_4Cyc_FP0123_RC], (instregex "^(S|U|US)MMLA$")>; + +// 4,0.5 on IMLA +def : InstRW<[ORYONWrite_8Cyc_FP0123_RC], (instregex "^BFMMLA$")>; + +// 4,0.5 on IMLA +def : InstRW<[ORYONWrite_8Cyc_FP0123_RC], (instregex "^BFMLAL(B|T)")>; + +// 3,4 +def : InstRW<[ORYONWrite_3Cyc_FP0123], (instregex "^(US|SU)DOTv")>; + +// 3,1 +def : InstRW<[ORYONWrite_4Cyc_FP0123], (instregex "^BF(16)?DOTv")>; + +// 3,4 +def : InstRW<[ORYONWrite_3Cyc_FP0123], (instregex "^BFCVT(N|N2)?$")>; + + +} // SchedModel = OryonModel diff --git a/llvm/lib/Target/AArch64/AArch64Subtarget.cpp b/llvm/lib/Target/AArch64/AArch64Subtarget.cpp index 8bc26ee..93ea729 100644 --- a/llvm/lib/Target/AArch64/AArch64Subtarget.cpp +++ b/llvm/lib/Target/AArch64/AArch64Subtarget.cpp @@ -299,6 +299,13 @@ void AArch64Subtarget::initializeProperties(bool HasMinSize) { PrefLoopAlignment = Align(64); MaxInterleaveFactor = 4; break; + case Oryon: + CacheLineSize = 64; + PrefFunctionAlignment = Align(16); + MaxInterleaveFactor = 4; + PrefetchDistance = 128; + MinPrefetchStride = 1024; + break; } if (AArch64MinimumJumpTableEntries.getNumOccurrences() > 0 || !HasMinSize) diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp index f49c73d..9f5756f 100644 --- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp +++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp @@ -58,6 +58,9 @@ static cl::opt<unsigned> InlineCallPenaltyChangeSM( static cl::opt<bool> EnableOrLikeSelectOpt("enable-aarch64-or-like-select", cl::init(true), cl::Hidden); 
+static cl::opt<bool> EnableLSRCostOpt("enable-aarch64-lsr-cost-opt", + cl::init(true), cl::Hidden); + namespace { class TailFoldingOption { // These bitfields will only ever be set to something non-zero in operator=, @@ -4216,3 +4219,19 @@ bool AArch64TTIImpl::shouldTreatInstructionLikeSelect(const Instruction *I) { return true; return BaseT::shouldTreatInstructionLikeSelect(I); } + +bool AArch64TTIImpl::isLSRCostLess(const TargetTransformInfo::LSRCost &C1, + const TargetTransformInfo::LSRCost &C2) { + // AArch64 specific here is adding the number of instructions to the + // comparison (though not as the first consideration, as some targets do) + // along with changing the priority of the base additions. + // TODO: Maybe a more nuanced tradeoff between instruction count + // and number of registers? To be investigated at a later date. + if (EnableLSRCostOpt) + return std::tie(C1.NumRegs, C1.Insns, C1.NumBaseAdds, C1.AddRecCost, + C1.NumIVMuls, C1.ScaleCost, C1.ImmCost, C1.SetupCost) < + std::tie(C2.NumRegs, C2.Insns, C2.NumBaseAdds, C2.AddRecCost, + C2.NumIVMuls, C2.ScaleCost, C2.ImmCost, C2.SetupCost); + + return TargetTransformInfoImplBase::isLSRCostLess(C1, C2); +}
\ No newline at end of file diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h index 2f44aaa..feec1a4 100644 --- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h +++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h @@ -425,6 +425,9 @@ public: } std::optional<unsigned> getMinPageSize() const { return 4096; } + + bool isLSRCostLess(const TargetTransformInfo::LSRCost &C1, + const TargetTransformInfo::LSRCost &C2); }; } // end namespace llvm diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.td b/llvm/lib/Target/AMDGPU/AMDGPU.td index 8e30278..d0d7a9d 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPU.td +++ b/llvm/lib/Target/AMDGPU/AMDGPU.td @@ -1534,6 +1534,12 @@ def FeatureISAVersion11_5_1 : FeatureSet< FeatureVGPRSingleUseHintInsts, Feature1_5xVGPRs])>; +def FeatureISAVersion11_5_2 : FeatureSet< + !listconcat(FeatureISAVersion11_Common.Features, + [FeatureSALUFloatInsts, + FeatureDPPSrc1SGPR, + FeatureVGPRSingleUseHintInsts])>; + def FeatureISAVersion12 : FeatureSet< [FeatureGFX12, FeatureLDSBankCount32, diff --git a/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp b/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp index 625ac02..2bdbf41 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp @@ -1017,7 +1017,7 @@ public: // // TODO: We could filter out subgraphs that do not access LDS globals. 
for (Function *F : KernelsThatAllocateTableLDS) - removeFnAttrFromReachable(CG, F, "amdgpu-no-lds-kernel-id"); + removeFnAttrFromReachable(CG, F, {"amdgpu-no-lds-kernel-id"}); } DenseMap<Function *, GlobalVariable *> KernelToCreatedDynamicLDS = diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp index 9c94ca1..17c9615 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp @@ -57,6 +57,7 @@ #include "llvm/Transforms/HipStdPar/HipStdPar.h" #include "llvm/Transforms/IPO.h" #include "llvm/Transforms/IPO/AlwaysInliner.h" +#include "llvm/Transforms/IPO/ExpandVariadics.h" #include "llvm/Transforms/IPO/GlobalDCE.h" #include "llvm/Transforms/IPO/Internalize.h" #include "llvm/Transforms/Scalar.h" @@ -992,6 +993,10 @@ void AMDGPUPassConfig::addIRPasses() { if (isPassEnabled(EnableImageIntrinsicOptimizer)) addPass(createAMDGPUImageIntrinsicOptimizerPass(&TM)); + // This can be disabled by passing ::Disable here or on the command line + // with --expand-variadics-override=disable. + addPass(createExpandVariadicsPass(ExpandVariadicsMode::Lowering)); + // Function calls are not supported, so make sure we inline everything. 
addPass(createAMDGPUAlwaysInlinePass()); addPass(createAlwaysInlinerLegacyPass()); diff --git a/llvm/lib/Target/AMDGPU/GCNProcessors.td b/llvm/lib/Target/AMDGPU/GCNProcessors.td index 2ada981..d218ffe 100644 --- a/llvm/lib/Target/AMDGPU/GCNProcessors.td +++ b/llvm/lib/Target/AMDGPU/GCNProcessors.td @@ -295,7 +295,11 @@ def : ProcessorModel<"gfx1151", GFX11SpeedModel, FeatureISAVersion11_5_1.Features >; -// [gfx1100, gfx1101, gfx1102, gfx1103, gfx1150, gfx1151] +def : ProcessorModel<"gfx1152", GFX11SpeedModel, + FeatureISAVersion11_5_2.Features +>; + +// [gfx1100, gfx1101, gfx1102, gfx1103, gfx1150, gfx1151, gfx1152] def : ProcessorModel<"gfx11-generic", GFX11SpeedModel, FeatureISAVersion11_Generic.Features >; diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp index d7d6e00..e805e96 100644 --- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp +++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp @@ -113,6 +113,7 @@ StringRef AMDGPUTargetStreamer::getArchNameFromElfMach(unsigned ElfMach) { case ELF::EF_AMDGPU_MACH_AMDGCN_GFX1103: AK = GK_GFX1103; break; case ELF::EF_AMDGPU_MACH_AMDGCN_GFX1150: AK = GK_GFX1150; break; case ELF::EF_AMDGPU_MACH_AMDGCN_GFX1151: AK = GK_GFX1151; break; + case ELF::EF_AMDGPU_MACH_AMDGCN_GFX1152: AK = GK_GFX1152; break; case ELF::EF_AMDGPU_MACH_AMDGCN_GFX1200: AK = GK_GFX1200; break; case ELF::EF_AMDGPU_MACH_AMDGCN_GFX1201: AK = GK_GFX1201; break; case ELF::EF_AMDGPU_MACH_AMDGCN_GFX9_GENERIC: AK = GK_GFX9_GENERIC; break; @@ -196,6 +197,7 @@ unsigned AMDGPUTargetStreamer::getElfMach(StringRef GPU) { case GK_GFX1103: return ELF::EF_AMDGPU_MACH_AMDGCN_GFX1103; case GK_GFX1150: return ELF::EF_AMDGPU_MACH_AMDGCN_GFX1150; case GK_GFX1151: return ELF::EF_AMDGPU_MACH_AMDGCN_GFX1151; + case GK_GFX1152: return ELF::EF_AMDGPU_MACH_AMDGCN_GFX1152; case GK_GFX1200: return ELF::EF_AMDGPU_MACH_AMDGCN_GFX1200; case GK_GFX1201: return 
ELF::EF_AMDGPU_MACH_AMDGCN_GFX1201; case GK_GFX9_GENERIC: return ELF::EF_AMDGPU_MACH_AMDGCN_GFX9_GENERIC; diff --git a/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp b/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp index c47eea2..8b42d4a 100644 --- a/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp +++ b/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp @@ -2052,9 +2052,6 @@ bool SILoadStoreOptimizer::promoteConstantOffsetToImm( MemInfoMap &Visited, SmallPtrSet<MachineInstr *, 4> &AnchorList) const { - if (!(MI.mayLoad() ^ MI.mayStore())) - return false; - if (!STM->hasFlatInstOffsets() || !SIInstrInfo::isFLAT(MI)) return false; @@ -2065,10 +2062,6 @@ bool SILoadStoreOptimizer::promoteConstantOffsetToImm( unsigned AS = SIInstrInfo::isFLATGlobal(MI) ? AMDGPUAS::GLOBAL_ADDRESS : AMDGPUAS::FLAT_ADDRESS; - if (MI.mayLoad() && - TII->getNamedOperand(MI, AMDGPU::OpName::vdata) != nullptr) - return false; - if (AnchorList.count(&MI)) return false; diff --git a/llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp b/llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp index f178324..5dc3457 100644 --- a/llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp +++ b/llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp @@ -103,8 +103,6 @@ private: MachineBasicBlock *emitEndCf(MachineInstr &MI); - void lowerInitExec(MachineBasicBlock *MBB, MachineInstr &MI); - void findMaskOperands(MachineInstr &MI, unsigned OpNo, SmallVectorImpl<MachineOperand> &Src) const; @@ -709,95 +707,6 @@ MachineBasicBlock *SILowerControlFlow::process(MachineInstr &MI) { return SplitBB; } -void SILowerControlFlow::lowerInitExec(MachineBasicBlock *MBB, - MachineInstr &MI) { - MachineFunction &MF = *MBB->getParent(); - const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); - bool IsWave32 = ST.isWave32(); - - if (MI.getOpcode() == AMDGPU::SI_INIT_EXEC) { - // This should be before all vector instructions. - MachineInstr *InitMI = BuildMI(*MBB, MBB->begin(), MI.getDebugLoc(), - TII->get(IsWave32 ? 
AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64), Exec) - .addImm(MI.getOperand(0).getImm()); - if (LIS) { - LIS->RemoveMachineInstrFromMaps(MI); - LIS->InsertMachineInstrInMaps(*InitMI); - } - MI.eraseFromParent(); - return; - } - - // Extract the thread count from an SGPR input and set EXEC accordingly. - // Since BFM can't shift by 64, handle that case with CMP + CMOV. - // - // S_BFE_U32 count, input, {shift, 7} - // S_BFM_B64 exec, count, 0 - // S_CMP_EQ_U32 count, 64 - // S_CMOV_B64 exec, -1 - Register InputReg = MI.getOperand(0).getReg(); - MachineInstr *FirstMI = &*MBB->begin(); - if (InputReg.isVirtual()) { - MachineInstr *DefInstr = MRI->getVRegDef(InputReg); - assert(DefInstr && DefInstr->isCopy()); - if (DefInstr->getParent() == MBB) { - if (DefInstr != FirstMI) { - // If the `InputReg` is defined in current block, we also need to - // move that instruction to the beginning of the block. - DefInstr->removeFromParent(); - MBB->insert(FirstMI, DefInstr); - if (LIS) - LIS->handleMove(*DefInstr); - } else { - // If first instruction is definition then move pointer after it. - FirstMI = &*std::next(FirstMI->getIterator()); - } - } - } - - // Insert instruction sequence at block beginning (before vector operations). - const DebugLoc DL = MI.getDebugLoc(); - const unsigned WavefrontSize = ST.getWavefrontSize(); - const unsigned Mask = (WavefrontSize << 1) - 1; - Register CountReg = MRI->createVirtualRegister(&AMDGPU::SGPR_32RegClass); - auto BfeMI = BuildMI(*MBB, FirstMI, DL, TII->get(AMDGPU::S_BFE_U32), CountReg) - .addReg(InputReg) - .addImm((MI.getOperand(1).getImm() & Mask) | 0x70000); - if (LV) - LV->recomputeForSingleDefVirtReg(InputReg); - auto BfmMI = - BuildMI(*MBB, FirstMI, DL, - TII->get(IsWave32 ? 
AMDGPU::S_BFM_B32 : AMDGPU::S_BFM_B64), Exec) - .addReg(CountReg) - .addImm(0); - auto CmpMI = BuildMI(*MBB, FirstMI, DL, TII->get(AMDGPU::S_CMP_EQ_U32)) - .addReg(CountReg, RegState::Kill) - .addImm(WavefrontSize); - if (LV) - LV->getVarInfo(CountReg).Kills.push_back(CmpMI); - auto CmovMI = - BuildMI(*MBB, FirstMI, DL, - TII->get(IsWave32 ? AMDGPU::S_CMOV_B32 : AMDGPU::S_CMOV_B64), - Exec) - .addImm(-1); - - if (!LIS) { - MI.eraseFromParent(); - return; - } - - LIS->RemoveMachineInstrFromMaps(MI); - MI.eraseFromParent(); - - LIS->InsertMachineInstrInMaps(*BfeMI); - LIS->InsertMachineInstrInMaps(*BfmMI); - LIS->InsertMachineInstrInMaps(*CmpMI); - LIS->InsertMachineInstrInMaps(*CmovMI); - - RecomputeRegs.insert(InputReg); - LIS->createAndComputeVirtRegInterval(CountReg); -} - bool SILowerControlFlow::removeMBBifRedundant(MachineBasicBlock &MBB) { for (auto &I : MBB.instrs()) { if (!I.isDebugInstr() && !I.isUnconditionalBranch()) @@ -927,18 +836,6 @@ bool SILowerControlFlow::runOnMachineFunction(MachineFunction &MF) { SplitMBB = process(MI); Changed = true; break; - - // FIXME: find a better place for this - case AMDGPU::SI_INIT_EXEC: - case AMDGPU::SI_INIT_EXEC_FROM_INPUT: - lowerInitExec(MBB, MI); - if (LIS) - LIS->removeAllRegUnitsForPhysReg(AMDGPU::EXEC); - Changed = true; - break; - - default: - break; } if (SplitMBB != MBB) { diff --git a/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp b/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp index 09dc1c7..5b4c443 100644 --- a/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp +++ b/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp @@ -177,6 +177,7 @@ private: SmallVector<MachineInstr *, 4> LowerToMovInstrs; SmallVector<MachineInstr *, 4> LowerToCopyInstrs; SmallVector<MachineInstr *, 4> KillInstrs; + SmallVector<MachineInstr *, 4> InitExecInstrs; void printInfo(); @@ -223,6 +224,8 @@ private: void lowerLiveMaskQueries(); void lowerCopyInstrs(); void lowerKillInstrs(bool IsWQM); + void lowerInitExec(MachineInstr &MI); + void 
lowerInitExecInstrs(); public: static char ID; @@ -580,6 +583,9 @@ char SIWholeQuadMode::scanInstructions(MachineFunction &MF, Opcode == AMDGPU::SI_DEMOTE_I1) { KillInstrs.push_back(&MI); BBI.NeedsLowering = true; + } else if (Opcode == AMDGPU::SI_INIT_EXEC || + Opcode == AMDGPU::SI_INIT_EXEC_FROM_INPUT) { + InitExecInstrs.push_back(&MI); } else if (WQMOutputs) { // The function is in machine SSA form, which means that physical // VGPRs correspond to shader inputs and outputs. Inputs are @@ -1556,6 +1562,97 @@ void SIWholeQuadMode::lowerKillInstrs(bool IsWQM) { } } +void SIWholeQuadMode::lowerInitExec(MachineInstr &MI) { + MachineBasicBlock *MBB = MI.getParent(); + bool IsWave32 = ST->isWave32(); + + if (MI.getOpcode() == AMDGPU::SI_INIT_EXEC) { + // This should be before all vector instructions. + MachineInstr *InitMI = + BuildMI(*MBB, MBB->begin(), MI.getDebugLoc(), + TII->get(IsWave32 ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64), + Exec) + .addImm(MI.getOperand(0).getImm()); + if (LIS) { + LIS->RemoveMachineInstrFromMaps(MI); + LIS->InsertMachineInstrInMaps(*InitMI); + } + MI.eraseFromParent(); + return; + } + + // Extract the thread count from an SGPR input and set EXEC accordingly. + // Since BFM can't shift by 64, handle that case with CMP + CMOV. + // + // S_BFE_U32 count, input, {shift, 7} + // S_BFM_B64 exec, count, 0 + // S_CMP_EQ_U32 count, 64 + // S_CMOV_B64 exec, -1 + Register InputReg = MI.getOperand(0).getReg(); + MachineInstr *FirstMI = &*MBB->begin(); + if (InputReg.isVirtual()) { + MachineInstr *DefInstr = MRI->getVRegDef(InputReg); + assert(DefInstr && DefInstr->isCopy()); + if (DefInstr->getParent() == MBB) { + if (DefInstr != FirstMI) { + // If the `InputReg` is defined in current block, we also need to + // move that instruction to the beginning of the block. + DefInstr->removeFromParent(); + MBB->insert(FirstMI, DefInstr); + if (LIS) + LIS->handleMove(*DefInstr); + } else { + // If first instruction is definition then move pointer after it. 
+ FirstMI = &*std::next(FirstMI->getIterator()); + } + } + } + + // Insert instruction sequence at block beginning (before vector operations). + const DebugLoc DL = MI.getDebugLoc(); + const unsigned WavefrontSize = ST->getWavefrontSize(); + const unsigned Mask = (WavefrontSize << 1) - 1; + Register CountReg = MRI->createVirtualRegister(&AMDGPU::SGPR_32RegClass); + auto BfeMI = BuildMI(*MBB, FirstMI, DL, TII->get(AMDGPU::S_BFE_U32), CountReg) + .addReg(InputReg) + .addImm((MI.getOperand(1).getImm() & Mask) | 0x70000); + auto BfmMI = + BuildMI(*MBB, FirstMI, DL, + TII->get(IsWave32 ? AMDGPU::S_BFM_B32 : AMDGPU::S_BFM_B64), Exec) + .addReg(CountReg) + .addImm(0); + auto CmpMI = BuildMI(*MBB, FirstMI, DL, TII->get(AMDGPU::S_CMP_EQ_U32)) + .addReg(CountReg, RegState::Kill) + .addImm(WavefrontSize); + auto CmovMI = + BuildMI(*MBB, FirstMI, DL, + TII->get(IsWave32 ? AMDGPU::S_CMOV_B32 : AMDGPU::S_CMOV_B64), + Exec) + .addImm(-1); + + if (!LIS) { + MI.eraseFromParent(); + return; + } + + LIS->RemoveMachineInstrFromMaps(MI); + MI.eraseFromParent(); + + LIS->InsertMachineInstrInMaps(*BfeMI); + LIS->InsertMachineInstrInMaps(*BfmMI); + LIS->InsertMachineInstrInMaps(*CmpMI); + LIS->InsertMachineInstrInMaps(*CmovMI); + + LIS->removeInterval(InputReg); + LIS->createAndComputeVirtRegInterval(InputReg); + LIS->createAndComputeVirtRegInterval(CountReg); +} + +void SIWholeQuadMode::lowerInitExecInstrs() { + for (MachineInstr *MI : InitExecInstrs) + lowerInitExec(*MI); +} + bool SIWholeQuadMode::runOnMachineFunction(MachineFunction &MF) { LLVM_DEBUG(dbgs() << "SI Whole Quad Mode on " << MF.getName() << " ------------- \n"); @@ -1567,6 +1664,7 @@ bool SIWholeQuadMode::runOnMachineFunction(MachineFunction &MF) { LowerToCopyInstrs.clear(); LowerToMovInstrs.clear(); KillInstrs.clear(); + InitExecInstrs.clear(); StateTransition.clear(); ST = &MF.getSubtarget<GCNSubtarget>(); @@ -1606,10 +1704,13 @@ bool SIWholeQuadMode::runOnMachineFunction(MachineFunction &MF) { // Shader is simple does 
not need any state changes or any complex lowering if (!(GlobalFlags & (StateWQM | StateStrict)) && LowerToCopyInstrs.empty() && LowerToMovInstrs.empty() && KillInstrs.empty()) { + lowerInitExecInstrs(); lowerLiveMaskQueries(); - return !LiveMaskQueries.empty(); + return !InitExecInstrs.empty() || !LiveMaskQueries.empty(); } + lowerInitExecInstrs(); + MachineBasicBlock &Entry = MF.front(); MachineBasicBlock::iterator EntryMI = Entry.getFirstNonPHI(); diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUMemoryUtils.cpp b/llvm/lib/Target/AMDGPU/Utils/AMDGPUMemoryUtils.cpp index 239e0ee..04c6e94 100644 --- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUMemoryUtils.cpp +++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUMemoryUtils.cpp @@ -235,8 +235,9 @@ LDSUsesInfoTy getTransitiveUsesOfLDS(const CallGraph &CG, Module &M) { } void removeFnAttrFromReachable(CallGraph &CG, Function *KernelRoot, - StringRef FnAttr) { - KernelRoot->removeFnAttr(FnAttr); + ArrayRef<StringRef> FnAttrs) { + for (StringRef Attr : FnAttrs) + KernelRoot->removeFnAttr(Attr); SmallVector<Function *> WorkList = {CG[KernelRoot]->getFunction()}; SmallPtrSet<Function *, 8> Visited; @@ -261,12 +262,15 @@ void removeFnAttrFromReachable(CallGraph &CG, Function *KernelRoot, Function *PotentialCallee = ExternalCallRecord.second->getFunction(); assert(PotentialCallee); - if (!isKernelLDS(PotentialCallee)) - PotentialCallee->removeFnAttr(FnAttr); + if (!isKernelLDS(PotentialCallee)) { + for (StringRef Attr : FnAttrs) + PotentialCallee->removeFnAttr(Attr); + } } } } else { - Callee->removeFnAttr(FnAttr); + for (StringRef Attr : FnAttrs) + Callee->removeFnAttr(Attr); if (Visited.insert(Callee).second) WorkList.push_back(Callee); } diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUMemoryUtils.h b/llvm/lib/Target/AMDGPU/Utils/AMDGPUMemoryUtils.h index 4d3ad32..e1cd4d0 100644 --- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUMemoryUtils.h +++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUMemoryUtils.h @@ -9,6 +9,7 @@ #ifndef 
LLVM_LIB_TARGET_AMDGPU_UTILS_AMDGPUMEMORYUTILS_H #define LLVM_LIB_TARGET_AMDGPU_UTILS_AMDGPUMEMORYUTILS_H +#include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/DenseSet.h" @@ -54,7 +55,7 @@ LDSUsesInfoTy getTransitiveUsesOfLDS(const CallGraph &CG, Module &M); /// Strip FnAttr attribute from any functions where we may have /// introduced its use. void removeFnAttrFromReachable(CallGraph &CG, Function *KernelRoot, - StringRef FnAttr); + ArrayRef<StringRef> FnAttrs); /// Given a \p Def clobbering a load from \p Ptr according to the MSSA check /// if this is actually a memory update or an artificial clobber to facilitate diff --git a/llvm/lib/Target/ARM/ARMLowOverheadLoops.cpp b/llvm/lib/Target/ARM/ARMLowOverheadLoops.cpp index a46c383..9198287 100644 --- a/llvm/lib/Target/ARM/ARMLowOverheadLoops.cpp +++ b/llvm/lib/Target/ARM/ARMLowOverheadLoops.cpp @@ -115,6 +115,12 @@ static bool shouldInspect(MachineInstr &MI) { return isDomainMVE(&MI) || isVectorPredicate(&MI) || hasVPRUse(MI); } +static bool isHorizontalReduction(const MachineInstr &MI) { + const MCInstrDesc &MCID = MI.getDesc(); + uint64_t Flags = MCID.TSFlags; + return (Flags & ARMII::HorizontalReduction) != 0; +} + namespace { using InstSet = SmallPtrSetImpl<MachineInstr *>; @@ -275,6 +281,16 @@ namespace { if (VPT->getOpcode() == ARM::MVE_VPST) return false; + // If the VPT block does not define something that is an "output", then + // the tail-predicated version will just perform a subset of the original + // vpt block, where the last lanes should not be used. 
+ if (isVPTOpcode(VPT->getOpcode()) && + all_of(Block.getInsts(), [](const MachineInstr *MI) { + return !MI->mayStore() && !MI->mayLoad() && + !isHorizontalReduction(*MI) && !isVCTP(MI); + })) + return true; + auto IsOperandPredicated = [&](MachineInstr *MI, unsigned Idx) { MachineInstr *Op = RDA.getMIOperand(MI, MI->getOperand(Idx)); return Op && PredicatedInsts.count(Op) && isPredicatedOnVCTP(Op); @@ -813,12 +829,6 @@ static bool producesDoubleWidthResult(const MachineInstr &MI) { return (Flags & ARMII::DoubleWidthResult) != 0; } -static bool isHorizontalReduction(const MachineInstr &MI) { - const MCInstrDesc &MCID = MI.getDesc(); - uint64_t Flags = MCID.TSFlags; - return (Flags & ARMII::HorizontalReduction) != 0; -} - // Can this instruction generate a non-zero result when given only zeroed // operands? This allows us to know that, given operands with false bytes // zeroed by masked loads, that the result will also contain zeros in those diff --git a/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp b/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp index 51384f2..9d7e463 100644 --- a/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp +++ b/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp @@ -171,6 +171,8 @@ LoongArchTargetLowering::LoongArchTargetLowering(const TargetMachine &TM, // Set operations for 'F' feature. 
if (Subtarget.hasBasicF()) { + setLoadExtAction(ISD::EXTLOAD, MVT::f32, MVT::f16, Expand); + setTruncStoreAction(MVT::f32, MVT::f16, Expand); setCondCodeAction(FPCCToExpand, MVT::f32, Expand); setOperationAction(ISD::SELECT_CC, MVT::f32, Expand); @@ -186,6 +188,8 @@ LoongArchTargetLowering::LoongArchTargetLowering(const TargetMachine &TM, setOperationAction(ISD::FSINCOS, MVT::f32, Expand); setOperationAction(ISD::FPOW, MVT::f32, Expand); setOperationAction(ISD::FREM, MVT::f32, Expand); + setOperationAction(ISD::FP16_TO_FP, MVT::f32, Expand); + setOperationAction(ISD::FP_TO_FP16, MVT::f32, Expand); if (Subtarget.is64Bit()) setOperationAction(ISD::FRINT, MVT::f32, Legal); @@ -202,7 +206,9 @@ LoongArchTargetLowering::LoongArchTargetLowering(const TargetMachine &TM, // Set operations for 'D' feature. if (Subtarget.hasBasicD()) { + setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f16, Expand); setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f32, Expand); + setTruncStoreAction(MVT::f64, MVT::f16, Expand); setTruncStoreAction(MVT::f64, MVT::f32, Expand); setCondCodeAction(FPCCToExpand, MVT::f64, Expand); @@ -219,6 +225,8 @@ LoongArchTargetLowering::LoongArchTargetLowering(const TargetMachine &TM, setOperationAction(ISD::FSINCOS, MVT::f64, Expand); setOperationAction(ISD::FPOW, MVT::f64, Expand); setOperationAction(ISD::FREM, MVT::f64, Expand); + setOperationAction(ISD::FP16_TO_FP, MVT::f64, Expand); + setOperationAction(ISD::FP_TO_FP16, MVT::f64, Expand); if (Subtarget.is64Bit()) setOperationAction(ISD::FRINT, MVT::f64, Legal); @@ -5004,6 +5012,10 @@ bool LoongArchTargetLowering::isSExtCheaperThanZExt(EVT SrcVT, return Subtarget.is64Bit() && SrcVT == MVT::i32 && DstVT == MVT::i64; } +bool LoongArchTargetLowering::signExtendConstant(const ConstantInt *CI) const { + return Subtarget.is64Bit() && CI->getType()->isIntegerTy(32); +} + bool LoongArchTargetLowering::hasAndNotCompare(SDValue Y) const { // TODO: Support vectors. 
if (Y.getValueType().isVector()) diff --git a/llvm/lib/Target/LoongArch/LoongArchISelLowering.h b/llvm/lib/Target/LoongArch/LoongArchISelLowering.h index f274b19..9328831 100644 --- a/llvm/lib/Target/LoongArch/LoongArchISelLowering.h +++ b/llvm/lib/Target/LoongArch/LoongArchISelLowering.h @@ -229,6 +229,7 @@ public: bool isLegalAddImmediate(int64_t Imm) const override; bool isZExtFree(SDValue Val, EVT VT2) const override; bool isSExtCheaperThanZExt(EVT SrcVT, EVT DstVT) const override; + bool signExtendConstant(const ConstantInt *CI) const override; bool hasAndNotCompare(SDValue Y) const override; diff --git a/llvm/lib/Target/LoongArch/LoongArchTargetMachine.cpp b/llvm/lib/Target/LoongArch/LoongArchTargetMachine.cpp index 83466d5..c29c1b5 100644 --- a/llvm/lib/Target/LoongArch/LoongArchTargetMachine.cpp +++ b/llvm/lib/Target/LoongArch/LoongArchTargetMachine.cpp @@ -46,7 +46,7 @@ static cl::opt<bool> static std::string computeDataLayout(const Triple &TT) { if (TT.isArch64Bit()) - return "e-m:e-p:64:64-i64:64-i128:128-n64-S128"; + return "e-m:e-p:64:64-i64:64-i128:128-n32:64-S128"; assert(TT.isArch32Bit() && "only LA32 and LA64 are currently supported"); return "e-m:e-p:32:32-i64:64-n32-S128"; } diff --git a/llvm/lib/Target/NVPTX/NVPTX.h b/llvm/lib/Target/NVPTX/NVPTX.h index 5eefab5..b0cb24c 100644 --- a/llvm/lib/Target/NVPTX/NVPTX.h +++ b/llvm/lib/Target/NVPTX/NVPTX.h @@ -40,7 +40,7 @@ FunctionPass *createNVPTXISelDag(NVPTXTargetMachine &TM, ModulePass *createNVPTXAssignValidGlobalNamesPass(); ModulePass *createGenericToNVVMLegacyPass(); ModulePass *createNVPTXCtorDtorLoweringLegacyPass(); -FunctionPass *createNVVMIntrRangePass(unsigned int SmVersion); +FunctionPass *createNVVMIntrRangePass(); FunctionPass *createNVVMReflectPass(unsigned int SmVersion); MachineFunctionPass *createNVPTXPrologEpilogPass(); MachineFunctionPass *createNVPTXReplaceImageHandlesPass(); @@ -53,12 +53,7 @@ MachineFunctionPass *createNVPTXPeephole(); MachineFunctionPass 
*createNVPTXProxyRegErasurePass(); struct NVVMIntrRangePass : PassInfoMixin<NVVMIntrRangePass> { - NVVMIntrRangePass(); - NVVMIntrRangePass(unsigned SmVersion) : SmVersion(SmVersion) {} PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM); - -private: - unsigned SmVersion; }; struct NVVMReflectPass : PassInfoMixin<NVVMReflectPass> { diff --git a/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp b/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp index f636979..82770f8 100644 --- a/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp +++ b/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp @@ -542,30 +542,24 @@ void NVPTXAsmPrinter::emitKernelFunctionDirectives(const Function &F, // If the NVVM IR has some of reqntid* specified, then output // the reqntid directive, and set the unspecified ones to 1. // If none of Reqntid* is specified, don't output reqntid directive. - unsigned Reqntidx, Reqntidy, Reqntidz; - Reqntidx = Reqntidy = Reqntidz = 1; - bool ReqSpecified = false; - ReqSpecified |= getReqNTIDx(F, Reqntidx); - ReqSpecified |= getReqNTIDy(F, Reqntidy); - ReqSpecified |= getReqNTIDz(F, Reqntidz); + std::optional<unsigned> Reqntidx = getReqNTIDx(F); + std::optional<unsigned> Reqntidy = getReqNTIDy(F); + std::optional<unsigned> Reqntidz = getReqNTIDz(F); - if (ReqSpecified) - O << ".reqntid " << Reqntidx << ", " << Reqntidy << ", " << Reqntidz - << "\n"; + if (Reqntidx || Reqntidy || Reqntidz) + O << ".reqntid " << Reqntidx.value_or(1) << ", " << Reqntidy.value_or(1) + << ", " << Reqntidz.value_or(1) << "\n"; // If the NVVM IR has some of maxntid* specified, then output // the maxntid directive, and set the unspecified ones to 1. // If none of maxntid* is specified, don't output maxntid directive. 
- unsigned Maxntidx, Maxntidy, Maxntidz; - Maxntidx = Maxntidy = Maxntidz = 1; - bool MaxSpecified = false; - MaxSpecified |= getMaxNTIDx(F, Maxntidx); - MaxSpecified |= getMaxNTIDy(F, Maxntidy); - MaxSpecified |= getMaxNTIDz(F, Maxntidz); - - if (MaxSpecified) - O << ".maxntid " << Maxntidx << ", " << Maxntidy << ", " << Maxntidz - << "\n"; + std::optional<unsigned> Maxntidx = getMaxNTIDx(F); + std::optional<unsigned> Maxntidy = getMaxNTIDy(F); + std::optional<unsigned> Maxntidz = getMaxNTIDz(F); + + if (Maxntidx || Maxntidy || Maxntidz) + O << ".maxntid " << Maxntidx.value_or(1) << ", " << Maxntidy.value_or(1) + << ", " << Maxntidz.value_or(1) << "\n"; unsigned Mincta = 0; if (getMinCTASm(F, Mincta)) diff --git a/llvm/lib/Target/NVPTX/NVPTXTargetMachine.cpp b/llvm/lib/Target/NVPTX/NVPTXTargetMachine.cpp index 4dc3cea..b60a1d7 100644 --- a/llvm/lib/Target/NVPTX/NVPTXTargetMachine.cpp +++ b/llvm/lib/Target/NVPTX/NVPTXTargetMachine.cpp @@ -233,9 +233,9 @@ void NVPTXTargetMachine::registerPassBuilderCallbacks( [this](ModulePassManager &PM, OptimizationLevel Level) { FunctionPassManager FPM; FPM.addPass(NVVMReflectPass(Subtarget.getSmVersion())); - // FIXME: NVVMIntrRangePass is causing numerical discrepancies, - // investigate and re-enable. - // FPM.addPass(NVVMIntrRangePass(Subtarget.getSmVersion())); + // Note: NVVMIntrRangePass was causing numerical discrepancies at one + // point, if issues crop up, consider disabling. 
+ FPM.addPass(NVVMIntrRangePass()); PM.addPass(createModuleToFunctionPassAdaptor(std::move(FPM))); }); } diff --git a/llvm/lib/Target/NVPTX/NVPTXUtilities.cpp b/llvm/lib/Target/NVPTX/NVPTXUtilities.cpp index 013afe9..3a536db 100644 --- a/llvm/lib/Target/NVPTX/NVPTXUtilities.cpp +++ b/llvm/lib/Target/NVPTX/NVPTXUtilities.cpp @@ -128,6 +128,14 @@ bool findOneNVVMAnnotation(const GlobalValue *gv, const std::string &prop, return true; } +static std::optional<unsigned> +findOneNVVMAnnotation(const GlobalValue &GV, const std::string &PropName) { + unsigned RetVal; + if (findOneNVVMAnnotation(&GV, PropName, RetVal)) + return RetVal; + return std::nullopt; +} + bool findAllNVVMAnnotation(const GlobalValue *gv, const std::string &prop, std::vector<unsigned> &retval) { auto &AC = getAnnotationCache(); @@ -252,32 +260,57 @@ std::string getSamplerName(const Value &val) { return std::string(val.getName()); } -bool getMaxNTIDx(const Function &F, unsigned &x) { - return findOneNVVMAnnotation(&F, "maxntidx", x); +std::optional<unsigned> getMaxNTIDx(const Function &F) { + return findOneNVVMAnnotation(F, "maxntidx"); } -bool getMaxNTIDy(const Function &F, unsigned &y) { - return findOneNVVMAnnotation(&F, "maxntidy", y); +std::optional<unsigned> getMaxNTIDy(const Function &F) { + return findOneNVVMAnnotation(F, "maxntidy"); } -bool getMaxNTIDz(const Function &F, unsigned &z) { - return findOneNVVMAnnotation(&F, "maxntidz", z); +std::optional<unsigned> getMaxNTIDz(const Function &F) { + return findOneNVVMAnnotation(F, "maxntidz"); +} + +std::optional<unsigned> getMaxNTID(const Function &F) { + // Note: The semantics here are a bit strange. The PTX ISA states the + // following (11.4.2. Performance-Tuning Directives: .maxntid): + // + // Note that this directive guarantees that the total number of threads does + // not exceed the maximum, but does not guarantee that the limit in any + // particular dimension is not exceeded. 
+ std::optional<unsigned> MaxNTIDx = getMaxNTIDx(F); + std::optional<unsigned> MaxNTIDy = getMaxNTIDy(F); + std::optional<unsigned> MaxNTIDz = getMaxNTIDz(F); + if (MaxNTIDx || MaxNTIDy || MaxNTIDz) + return MaxNTIDx.value_or(1) * MaxNTIDy.value_or(1) * MaxNTIDz.value_or(1); + return std::nullopt; } bool getMaxClusterRank(const Function &F, unsigned &x) { return findOneNVVMAnnotation(&F, "maxclusterrank", x); } -bool getReqNTIDx(const Function &F, unsigned &x) { - return findOneNVVMAnnotation(&F, "reqntidx", x); +std::optional<unsigned> getReqNTIDx(const Function &F) { + return findOneNVVMAnnotation(F, "reqntidx"); +} + +std::optional<unsigned> getReqNTIDy(const Function &F) { + return findOneNVVMAnnotation(F, "reqntidy"); } -bool getReqNTIDy(const Function &F, unsigned &y) { - return findOneNVVMAnnotation(&F, "reqntidy", y); +std::optional<unsigned> getReqNTIDz(const Function &F) { + return findOneNVVMAnnotation(F, "reqntidz"); } -bool getReqNTIDz(const Function &F, unsigned &z) { - return findOneNVVMAnnotation(&F, "reqntidz", z); +std::optional<unsigned> getReqNTID(const Function &F) { + // Note: The semantics here are a bit strange. See getMaxNTID. 
+ std::optional<unsigned> ReqNTIDx = getReqNTIDx(F); + std::optional<unsigned> ReqNTIDy = getReqNTIDy(F); + std::optional<unsigned> ReqNTIDz = getReqNTIDz(F); + if (ReqNTIDx || ReqNTIDy || ReqNTIDz) + return ReqNTIDx.value_or(1) * ReqNTIDy.value_or(1) * ReqNTIDz.value_or(1); + return std::nullopt; } bool getMinCTASm(const Function &F, unsigned &x) { diff --git a/llvm/lib/Target/NVPTX/NVPTXUtilities.h b/llvm/lib/Target/NVPTX/NVPTXUtilities.h index 2872db9..e020bc0 100644 --- a/llvm/lib/Target/NVPTX/NVPTXUtilities.h +++ b/llvm/lib/Target/NVPTX/NVPTXUtilities.h @@ -48,13 +48,15 @@ std::string getTextureName(const Value &); std::string getSurfaceName(const Value &); std::string getSamplerName(const Value &); -bool getMaxNTIDx(const Function &, unsigned &); -bool getMaxNTIDy(const Function &, unsigned &); -bool getMaxNTIDz(const Function &, unsigned &); - -bool getReqNTIDx(const Function &, unsigned &); -bool getReqNTIDy(const Function &, unsigned &); -bool getReqNTIDz(const Function &, unsigned &); +std::optional<unsigned> getMaxNTIDx(const Function &); +std::optional<unsigned> getMaxNTIDy(const Function &); +std::optional<unsigned> getMaxNTIDz(const Function &); +std::optional<unsigned> getMaxNTID(const Function &F); + +std::optional<unsigned> getReqNTIDx(const Function &); +std::optional<unsigned> getReqNTIDy(const Function &); +std::optional<unsigned> getReqNTIDz(const Function &); +std::optional<unsigned> getReqNTID(const Function &); bool getMaxClusterRank(const Function &, unsigned &); bool getMinCTASm(const Function &, unsigned &); diff --git a/llvm/lib/Target/NVPTX/NVVMIntrRange.cpp b/llvm/lib/Target/NVPTX/NVVMIntrRange.cpp index 5381646..f9d21b3 100644 --- a/llvm/lib/Target/NVPTX/NVVMIntrRange.cpp +++ b/llvm/lib/Target/NVPTX/NVVMIntrRange.cpp @@ -1,4 +1,4 @@ -//===- NVVMIntrRange.cpp - Set !range metadata for NVVM intrinsics --------===// +//===- NVVMIntrRange.cpp - Set range attributes for NVVM intrinsics -------===// // // Part of the LLVM Project, under the 
Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. @@ -6,19 +6,21 @@ // //===----------------------------------------------------------------------===// // -// This pass adds appropriate !range metadata for calls to NVVM +// This pass adds appropriate range attributes for calls to NVVM // intrinsics that return a limited range of values. // //===----------------------------------------------------------------------===// #include "NVPTX.h" -#include "llvm/IR/Constants.h" +#include "NVPTXUtilities.h" #include "llvm/IR/InstIterator.h" #include "llvm/IR/Instructions.h" +#include "llvm/IR/IntrinsicInst.h" #include "llvm/IR/Intrinsics.h" #include "llvm/IR/IntrinsicsNVPTX.h" #include "llvm/IR/PassManager.h" #include "llvm/Support/CommandLine.h" +#include <cstdint> using namespace llvm; @@ -26,31 +28,20 @@ using namespace llvm; namespace llvm { void initializeNVVMIntrRangePass(PassRegistry &); } -// Add !range metadata based on limits of given SM variant. 
-static cl::opt<unsigned> NVVMIntrRangeSM("nvvm-intr-range-sm", cl::init(20), - cl::Hidden, cl::desc("SM variant")); - namespace { class NVVMIntrRange : public FunctionPass { - private: - unsigned SmVersion; - - public: - static char ID; - NVVMIntrRange() : NVVMIntrRange(NVVMIntrRangeSM) {} - NVVMIntrRange(unsigned int SmVersion) - : FunctionPass(ID), SmVersion(SmVersion) { +public: + static char ID; + NVVMIntrRange() : FunctionPass(ID) { - initializeNVVMIntrRangePass(*PassRegistry::getPassRegistry()); - } + initializeNVVMIntrRangePass(*PassRegistry::getPassRegistry()); + } - bool runOnFunction(Function &) override; + bool runOnFunction(Function &) override; }; -} +} // namespace -FunctionPass *llvm::createNVVMIntrRangePass(unsigned int SmVersion) { - return new NVVMIntrRange(SmVersion); -} +FunctionPass *llvm::createNVVMIntrRangePass() { return new NVVMIntrRange(); } char NVVMIntrRange::ID = 0; INITIALIZE_PASS(NVVMIntrRange, "nvvm-intr-range", @@ -58,112 +49,110 @@ INITIALIZE_PASS(NVVMIntrRange, "nvvm-intr-range", // Adds the passed-in [Low,High) range information as metadata to the // passed-in call instruction. -static bool addRangeMetadata(uint64_t Low, uint64_t High, CallInst *C) { - // This call already has range metadata, nothing to do. 
- if (C->getMetadata(LLVMContext::MD_range)) +static bool addRangeAttr(uint64_t Low, uint64_t High, IntrinsicInst *II) { + if (II->getMetadata(LLVMContext::MD_range)) return false; - LLVMContext &Context = C->getParent()->getContext(); - IntegerType *Int32Ty = Type::getInt32Ty(Context); - Metadata *LowAndHigh[] = { - ConstantAsMetadata::get(ConstantInt::get(Int32Ty, Low)), - ConstantAsMetadata::get(ConstantInt::get(Int32Ty, High))}; - C->setMetadata(LLVMContext::MD_range, MDNode::get(Context, LowAndHigh)); + const uint64_t BitWidth = II->getType()->getIntegerBitWidth(); + ConstantRange Range(APInt(BitWidth, Low), APInt(BitWidth, High)); + + if (auto CurrentRange = II->getRange()) + Range = Range.intersectWith(CurrentRange.value()); + + II->addRangeRetAttr(Range); return true; } -static bool runNVVMIntrRange(Function &F, unsigned SmVersion) { +static bool runNVVMIntrRange(Function &F) { struct { unsigned x, y, z; } MaxBlockSize, MaxGridSize; - MaxBlockSize.x = 1024; - MaxBlockSize.y = 1024; - MaxBlockSize.z = 64; - MaxGridSize.x = SmVersion >= 30 ? 0x7fffffff : 0xffff; + const unsigned MetadataNTID = getReqNTID(F).value_or( + getMaxNTID(F).value_or(std::numeric_limits<unsigned>::max())); + + MaxBlockSize.x = std::min(1024u, MetadataNTID); + MaxBlockSize.y = std::min(1024u, MetadataNTID); + MaxBlockSize.z = std::min(64u, MetadataNTID); + + MaxGridSize.x = 0x7fffffff; MaxGridSize.y = 0xffff; MaxGridSize.z = 0xffff; // Go through the calls in this function. 
bool Changed = false; for (Instruction &I : instructions(F)) { - CallInst *Call = dyn_cast<CallInst>(&I); - if (!Call) + IntrinsicInst *II = dyn_cast<IntrinsicInst>(&I); + if (!II) continue; - if (Function *Callee = Call->getCalledFunction()) { - switch (Callee->getIntrinsicID()) { - // Index within block - case Intrinsic::nvvm_read_ptx_sreg_tid_x: - Changed |= addRangeMetadata(0, MaxBlockSize.x, Call); - break; - case Intrinsic::nvvm_read_ptx_sreg_tid_y: - Changed |= addRangeMetadata(0, MaxBlockSize.y, Call); - break; - case Intrinsic::nvvm_read_ptx_sreg_tid_z: - Changed |= addRangeMetadata(0, MaxBlockSize.z, Call); - break; - - // Block size - case Intrinsic::nvvm_read_ptx_sreg_ntid_x: - Changed |= addRangeMetadata(1, MaxBlockSize.x+1, Call); - break; - case Intrinsic::nvvm_read_ptx_sreg_ntid_y: - Changed |= addRangeMetadata(1, MaxBlockSize.y+1, Call); - break; - case Intrinsic::nvvm_read_ptx_sreg_ntid_z: - Changed |= addRangeMetadata(1, MaxBlockSize.z+1, Call); - break; - - // Index within grid - case Intrinsic::nvvm_read_ptx_sreg_ctaid_x: - Changed |= addRangeMetadata(0, MaxGridSize.x, Call); - break; - case Intrinsic::nvvm_read_ptx_sreg_ctaid_y: - Changed |= addRangeMetadata(0, MaxGridSize.y, Call); - break; - case Intrinsic::nvvm_read_ptx_sreg_ctaid_z: - Changed |= addRangeMetadata(0, MaxGridSize.z, Call); - break; - - // Grid size - case Intrinsic::nvvm_read_ptx_sreg_nctaid_x: - Changed |= addRangeMetadata(1, MaxGridSize.x+1, Call); - break; - case Intrinsic::nvvm_read_ptx_sreg_nctaid_y: - Changed |= addRangeMetadata(1, MaxGridSize.y+1, Call); - break; - case Intrinsic::nvvm_read_ptx_sreg_nctaid_z: - Changed |= addRangeMetadata(1, MaxGridSize.z+1, Call); - break; - - // warp size is constant 32. 
- case Intrinsic::nvvm_read_ptx_sreg_warpsize: - Changed |= addRangeMetadata(32, 32+1, Call); - break; - - // Lane ID is [0..warpsize) - case Intrinsic::nvvm_read_ptx_sreg_laneid: - Changed |= addRangeMetadata(0, 32, Call); - break; - - default: - break; - } + switch (II->getIntrinsicID()) { + // Index within block + case Intrinsic::nvvm_read_ptx_sreg_tid_x: + Changed |= addRangeAttr(0, MaxBlockSize.x, II); + break; + case Intrinsic::nvvm_read_ptx_sreg_tid_y: + Changed |= addRangeAttr(0, MaxBlockSize.y, II); + break; + case Intrinsic::nvvm_read_ptx_sreg_tid_z: + Changed |= addRangeAttr(0, MaxBlockSize.z, II); + break; + + // Block size + case Intrinsic::nvvm_read_ptx_sreg_ntid_x: + Changed |= addRangeAttr(1, MaxBlockSize.x + 1, II); + break; + case Intrinsic::nvvm_read_ptx_sreg_ntid_y: + Changed |= addRangeAttr(1, MaxBlockSize.y + 1, II); + break; + case Intrinsic::nvvm_read_ptx_sreg_ntid_z: + Changed |= addRangeAttr(1, MaxBlockSize.z + 1, II); + break; + + // Index within grid + case Intrinsic::nvvm_read_ptx_sreg_ctaid_x: + Changed |= addRangeAttr(0, MaxGridSize.x, II); + break; + case Intrinsic::nvvm_read_ptx_sreg_ctaid_y: + Changed |= addRangeAttr(0, MaxGridSize.y, II); + break; + case Intrinsic::nvvm_read_ptx_sreg_ctaid_z: + Changed |= addRangeAttr(0, MaxGridSize.z, II); + break; + + // Grid size + case Intrinsic::nvvm_read_ptx_sreg_nctaid_x: + Changed |= addRangeAttr(1, MaxGridSize.x + 1, II); + break; + case Intrinsic::nvvm_read_ptx_sreg_nctaid_y: + Changed |= addRangeAttr(1, MaxGridSize.y + 1, II); + break; + case Intrinsic::nvvm_read_ptx_sreg_nctaid_z: + Changed |= addRangeAttr(1, MaxGridSize.z + 1, II); + break; + + // warp size is constant 32. 
+ case Intrinsic::nvvm_read_ptx_sreg_warpsize: + Changed |= addRangeAttr(32, 32 + 1, II); + break; + + // Lane ID is [0..warpsize) + case Intrinsic::nvvm_read_ptx_sreg_laneid: + Changed |= addRangeAttr(0, 32, II); + break; + + default: + break; } } return Changed; } -bool NVVMIntrRange::runOnFunction(Function &F) { - return runNVVMIntrRange(F, SmVersion); -} - -NVVMIntrRangePass::NVVMIntrRangePass() : NVVMIntrRangePass(NVVMIntrRangeSM) {} +bool NVVMIntrRange::runOnFunction(Function &F) { return runNVVMIntrRange(F); } PreservedAnalyses NVVMIntrRangePass::run(Function &F, FunctionAnalysisManager &AM) { - return runNVVMIntrRange(F, SmVersion) ? PreservedAnalyses::none() - : PreservedAnalyses::all(); + return runNVVMIntrRange(F) ? PreservedAnalyses::none() + : PreservedAnalyses::all(); } diff --git a/llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp b/llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp index f4e84ad..bc0ae7a 100644 --- a/llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp +++ b/llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp @@ -1079,13 +1079,13 @@ void PPCAsmPrinter::emitInstruction(const MachineInstr *MI) { assert(IsAIX && TM.getCodeModel() == CodeModel::Small && "PseudoOp only valid for small code model AIX"); - // Transform %rN = ADDItoc/8 @op1, %r2. + // Transform %rN = ADDItoc/8 %r2, @op1. LowerPPCMachineInstrToMCInst(MI, TmpInst, *this); // Change the opcode to load address. TmpInst.setOpcode((!IsPPC64) ? (PPC::LA) : (PPC::LA8)); - const MachineOperand &MO = MI->getOperand(1); + const MachineOperand &MO = MI->getOperand(2); assert(MO.isGlobal() && "Invalid operand for ADDItoc[8]."); // Map the operand to its corresponding MCSymbol. 
@@ -1094,7 +1094,6 @@ void PPCAsmPrinter::emitInstruction(const MachineInstr *MI) { const MCExpr *Exp = MCSymbolRefExpr::create(MOSymbol, MCSymbolRefExpr::VK_None, OutContext); - TmpInst.getOperand(1) = TmpInst.getOperand(2); TmpInst.getOperand(2) = MCOperand::createExpr(Exp); EmitToStreamer(*OutStreamer, TmpInst); return; diff --git a/llvm/lib/Target/PowerPC/PPCFastISel.cpp b/llvm/lib/Target/PowerPC/PPCFastISel.cpp index 7350506..a07954bd 100644 --- a/llvm/lib/Target/PowerPC/PPCFastISel.cpp +++ b/llvm/lib/Target/PowerPC/PPCFastISel.cpp @@ -2080,13 +2080,15 @@ unsigned PPCFastISel::PPCMaterializeGV(const GlobalValue *GV, MVT VT) { cast<GlobalVariable>(GV)->hasAttribute("toc-data"); // For small code model, generate a simple TOC load. - if (CModel == CodeModel::Small) - BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, MIMD, - IsAIXTocData ? TII.get(PPC::ADDItoc8) : TII.get(PPC::LDtoc), - DestReg) - .addGlobalAddress(GV) - .addReg(PPC::X2); - else { + if (CModel == CodeModel::Small) { + auto MIB = BuildMI( + *FuncInfo.MBB, FuncInfo.InsertPt, MIMD, + IsAIXTocData ? 
TII.get(PPC::ADDItoc8) : TII.get(PPC::LDtoc), DestReg); + if (IsAIXTocData) + MIB.addReg(PPC::X2).addGlobalAddress(GV); + else + MIB.addGlobalAddress(GV).addReg(PPC::X2); + } else { // If the address is an externally defined symbol, a symbol with common // or externally available linkage, a non-local function address, or a // jump table address (not yet needed), or if we are generating code diff --git a/llvm/lib/Target/PowerPC/PPCISelDAGToDAG.cpp b/llvm/lib/Target/PowerPC/PPCISelDAGToDAG.cpp index 275b333..1a69d1e 100644 --- a/llvm/lib/Target/PowerPC/PPCISelDAGToDAG.cpp +++ b/llvm/lib/Target/PowerPC/PPCISelDAGToDAG.cpp @@ -6102,8 +6102,15 @@ void PPCDAGToDAGISel::Select(SDNode *N) { EVT OperandTy) { SDValue GA = TocEntry->getOperand(0); SDValue TocBase = TocEntry->getOperand(1); - SDNode *MN = CurDAG->getMachineNode(OpCode, dl, OperandTy, GA, TocBase); - transferMemOperands(TocEntry, MN); + SDNode *MN = nullptr; + if (OpCode == PPC::ADDItoc || OpCode == PPC::ADDItoc8) + // toc-data access doesn't involve in loading from got, no need to + // keep memory operands. 
+ MN = CurDAG->getMachineNode(OpCode, dl, OperandTy, TocBase, GA); + else { + MN = CurDAG->getMachineNode(OpCode, dl, OperandTy, GA, TocBase); + transferMemOperands(TocEntry, MN); + } ReplaceNode(TocEntry, MN); }; diff --git a/llvm/lib/Target/PowerPC/PPCInstr64Bit.td b/llvm/lib/Target/PowerPC/PPCInstr64Bit.td index 9af8ada7..eda5eb9 100644 --- a/llvm/lib/Target/PowerPC/PPCInstr64Bit.td +++ b/llvm/lib/Target/PowerPC/PPCInstr64Bit.td @@ -1485,11 +1485,9 @@ def ADDItocL8: PPCEmitTimePseudo<(outs g8rc:$rD), (ins g8rc_nox0:$reg, tocentry: } // Local Data Transform -def ADDItoc8 : PPCEmitTimePseudo<(outs g8rc:$rD), (ins tocentry:$disp, g8rc_nox0:$reg), +def ADDItoc8 : PPCEmitTimePseudo<(outs g8rc:$rD), (ins g8rc_nox0:$reg, tocentry:$disp), "#ADDItoc8", - [(set i64:$rD, - (PPCtoc_entry tglobaladdr:$disp, i64:$reg))]>, isPPC64; - + []>, isPPC64; let mayLoad = 1 in def LDtocL: PPCEmitTimePseudo<(outs g8rc:$rD), (ins tocentry:$disp, g8rc_nox0:$reg), "#LDtocL", []>, isPPC64; diff --git a/llvm/lib/Target/PowerPC/PPCInstrInfo.td b/llvm/lib/Target/PowerPC/PPCInstrInfo.td index df6b2bf..09f8299 100644 --- a/llvm/lib/Target/PowerPC/PPCInstrInfo.td +++ b/llvm/lib/Target/PowerPC/PPCInstrInfo.td @@ -3345,10 +3345,8 @@ def LWZtocL : PPCEmitTimePseudo<(outs gprc:$rD), (ins tocentry32:$disp, gprc_nor def ADDIStocHA : PPCEmitTimePseudo<(outs gprc:$rD), (ins gprc_nor0:$reg, tocentry32:$disp), "#ADDIStocHA", []>; // TOC Data Transform on AIX -def ADDItoc : PPCEmitTimePseudo<(outs gprc:$rD), (ins tocentry32:$disp, gprc:$reg), - "#ADDItoc", - [(set i32:$rD, - (PPCtoc_entry tglobaladdr:$disp, i32:$reg))]>; +def ADDItoc : PPCEmitTimePseudo<(outs gprc:$rD), (ins gprc:$reg, tocentry32:$disp), + "#ADDItoc", []>; def ADDItocL : PPCEmitTimePseudo<(outs gprc:$rD), (ins gprc_nor0:$reg, tocentry32:$disp), "#ADDItocL", []>; diff --git a/llvm/lib/Target/RISCV/RISCVInsertVSETVLI.cpp b/llvm/lib/Target/RISCV/RISCVInsertVSETVLI.cpp index a967682..82358cd 100644 --- 
a/llvm/lib/Target/RISCV/RISCVInsertVSETVLI.cpp +++ b/llvm/lib/Target/RISCV/RISCVInsertVSETVLI.cpp @@ -932,11 +932,11 @@ RISCVInsertVSETVLI::getInfoForVSETVLI(const MachineInstr &MI) const { "Can't handle X0, X0 vsetvli yet"); if (AVLReg == RISCV::X0) NewInfo.setAVLVLMAX(); - else if (VNInfo *VNI = getVNInfoFromReg(AVLReg, MI, LIS)) - NewInfo.setAVLRegDef(VNI, AVLReg); - else { - assert(MI.getOperand(1).isUndef()); + else if (MI.getOperand(1).isUndef()) NewInfo.setAVLIgnored(); + else { + VNInfo *VNI = getVNInfoFromReg(AVLReg, MI, LIS); + NewInfo.setAVLRegDef(VNI, AVLReg); } } NewInfo.setVTYPE(MI.getOperand(2).getImm()); @@ -1008,11 +1008,11 @@ RISCVInsertVSETVLI::computeInfoForInstr(const MachineInstr &MI) const { } else InstrInfo.setAVLImm(Imm); - } else if (VNInfo *VNI = getVNInfoFromReg(VLOp.getReg(), MI, LIS)) { - InstrInfo.setAVLRegDef(VNI, VLOp.getReg()); - } else { - assert(VLOp.isUndef()); + } else if (VLOp.isUndef()) { InstrInfo.setAVLIgnored(); + } else { + VNInfo *VNI = getVNInfoFromReg(VLOp.getReg(), MI, LIS); + InstrInfo.setAVLRegDef(VNI, VLOp.getReg()); } } else { assert(isScalarExtractInstr(MI)); diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoVPseudos.td b/llvm/lib/Target/RISCV/RISCVInstrInfoVPseudos.td index 6d926ce..b0949f5 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfoVPseudos.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfoVPseudos.td @@ -1033,6 +1033,22 @@ class VPseudoUnaryNoMask<DAGOperand RetClass, let HasVecPolicyOp = 1; } +class VPseudoUnaryNoMaskNoPolicy<DAGOperand RetClass, + DAGOperand OpClass, + string Constraint = "", + int TargetConstraintType = 1> : + Pseudo<(outs RetClass:$rd), + (ins OpClass:$rs2, AVL:$vl, ixlenimm:$sew), []>, + RISCVVPseudo { + let mayLoad = 0; + let mayStore = 0; + let hasSideEffects = 0; + let Constraints = Constraint; + let TargetOverlapConstraintType = TargetConstraintType; + let HasVLOp = 1; + let HasSEWOp = 1; +} + class VPseudoUnaryNoMaskRoundingMode<DAGOperand RetClass, DAGOperand OpClass, string 
Constraint = "", @@ -1422,24 +1438,6 @@ class VPseudoTernaryMaskPolicyRoundingMode<VReg RetClass, let UsesVXRM = 0; } -// Like VPseudoBinaryNoMask, but output can be V0. -class VPseudoBinaryMOutNoMask<VReg RetClass, - VReg Op1Class, - DAGOperand Op2Class, - string Constraint, - int TargetConstraintType = 1> : - Pseudo<(outs RetClass:$rd), - (ins Op1Class:$rs2, Op2Class:$rs1, AVL:$vl, ixlenimm:$sew), []>, - RISCVVPseudo { - let mayLoad = 0; - let mayStore = 0; - let hasSideEffects = 0; - let Constraints = Constraint; - let TargetOverlapConstraintType = TargetConstraintType; - let HasVLOp = 1; - let HasSEWOp = 1; -} - // Like VPseudoBinaryMask, but output can be V0. class VPseudoBinaryMOutMask<VReg RetClass, RegisterClass Op1Class, @@ -2056,9 +2054,10 @@ multiclass VPseudoVSFS_M { foreach mti = AllMasks in { defvar mx = mti.LMul.MX; let VLMul = mti.LMul.value in { - def "_M_" # mti.BX : VPseudoUnaryNoMask<VR, VR, constraint>, + def "_M_" # mti.BX : VPseudoUnaryNoMaskNoPolicy<VR, VR, constraint>, SchedUnary<"WriteVMSFSV", "ReadVMSFSV", mx, forceMergeOpRead=true>; + let ForceTailAgnostic = true in def "_M_" # mti.BX # "_MASK" : VPseudoUnaryMask<VR, VR, constraint>, SchedUnary<"WriteVMSFSV", "ReadVMSFSV", mx, forceMergeOpRead=true>; @@ -2172,8 +2171,8 @@ multiclass VPseudoBinaryM<VReg RetClass, int TargetConstraintType = 1, bit Commutable = 0> { let VLMul = MInfo.value, isCommutable = Commutable in { - def "_" # MInfo.MX : VPseudoBinaryMOutNoMask<RetClass, Op1Class, Op2Class, - Constraint, TargetConstraintType>; + def "_" # MInfo.MX : VPseudoBinaryNoMask<RetClass, Op1Class, Op2Class, + Constraint, TargetConstraintType>; let ForceTailAgnostic = true in def "_" # MInfo.MX # "_MASK" : VPseudoBinaryMOutMask<RetClass, Op1Class, Op2Class, Constraint, TargetConstraintType>, @@ -4078,9 +4077,8 @@ class VPatMaskUnaryNoMask<string intrinsic_name, (mti.Mask VR:$rs2), VLOpFrag)), (!cast<Instruction>(inst#"_M_"#mti.BX) - (mti.Mask (IMPLICIT_DEF)), (mti.Mask VR:$rs2), - GPR:$vl, 
mti.Log2SEW, TA_MA)>; + GPR:$vl, mti.Log2SEW)>; class VPatMaskUnaryMask<string intrinsic_name, string inst, @@ -4153,27 +4151,6 @@ class VPatBinaryNoMaskTU<string intrinsic_name, (op2_type op2_kind:$rs2), GPR:$vl, sew, TU_MU)>; -class VPatBinaryNoMaskRoundingMode<string intrinsic_name, - string inst, - ValueType result_type, - ValueType op1_type, - ValueType op2_type, - int sew, - VReg op1_reg_class, - DAGOperand op2_kind> : - Pat<(result_type (!cast<Intrinsic>(intrinsic_name) - (result_type (undef)), - (op1_type op1_reg_class:$rs1), - (op2_type op2_kind:$rs2), - (XLenVT timm:$round), - VLOpFrag)), - (!cast<Instruction>(inst) - (result_type (IMPLICIT_DEF)), - (op1_type op1_reg_class:$rs1), - (op2_type op2_kind:$rs2), - (XLenVT timm:$round), - GPR:$vl, sew, TA_MA)>; - class VPatBinaryNoMaskTURoundingMode<string intrinsic_name, string inst, ValueType result_type, @@ -4827,8 +4804,6 @@ multiclass VPatBinaryRoundingMode<string intrinsic, VReg result_reg_class, VReg op1_reg_class, DAGOperand op2_kind> { - def : VPatBinaryNoMaskRoundingMode<intrinsic, inst, result_type, op1_type, op2_type, - sew, op1_reg_class, op2_kind>; def : VPatBinaryNoMaskTURoundingMode<intrinsic, inst, result_type, op1_type, op2_type, sew, result_reg_class, op1_reg_class, op2_kind>; def : VPatBinaryMaskTARoundingMode<intrinsic, inst, result_type, op1_type, op2_type, @@ -6962,12 +6937,12 @@ defm : VPatBinaryV_VV_VX_VI<"int_riscv_vsra", "PseudoVSRA", AllIntegerVectors, foreach vti = AllIntegerVectors in { // Emit shift by 1 as an add since it might be faster. 
let Predicates = GetVTypePredicates<vti>.Predicates in { - def : Pat<(vti.Vector (int_riscv_vsll (vti.Vector undef), + def : Pat<(vti.Vector (int_riscv_vsll (vti.Vector vti.RegClass:$merge), (vti.Vector vti.RegClass:$rs1), (XLenVT 1), VLOpFrag)), (!cast<Instruction>("PseudoVADD_VV_"#vti.LMul.MX) - (vti.Vector (IMPLICIT_DEF)), vti.RegClass:$rs1, - vti.RegClass:$rs1, GPR:$vl, vti.Log2SEW, TA_MA)>; + vti.RegClass:$merge, vti.RegClass:$rs1, + vti.RegClass:$rs1, GPR:$vl, vti.Log2SEW, TU_MU)>; def : Pat<(vti.Vector (int_riscv_vsll_mask (vti.Vector vti.RegClass:$merge), (vti.Vector vti.RegClass:$rs1), (XLenVT 1), diff --git a/llvm/lib/Target/SPIRV/SPIRVBuiltins.cpp b/llvm/lib/Target/SPIRV/SPIRVBuiltins.cpp index 956b851..49838e6 100644 --- a/llvm/lib/Target/SPIRV/SPIRVBuiltins.cpp +++ b/llvm/lib/Target/SPIRV/SPIRVBuiltins.cpp @@ -1459,11 +1459,22 @@ static bool generateImageSizeQueryInst(const SPIRV::IncomingCall *Call, Component == 3 ? NumActualRetComponents - 1 : Component; assert(ExtractedComposite < NumActualRetComponents && "Invalid composite index!"); + Register TypeReg = GR->getSPIRVTypeID(Call->ReturnType); + SPIRVType *NewType = nullptr; + if (QueryResultType->getOpcode() == SPIRV::OpTypeVector) { + Register NewTypeReg = QueryResultType->getOperand(1).getReg(); + if (TypeReg != NewTypeReg && + (NewType = GR->getSPIRVTypeForVReg(NewTypeReg)) != nullptr) + TypeReg = NewTypeReg; + } MIRBuilder.buildInstr(SPIRV::OpCompositeExtract) .addDef(Call->ReturnRegister) - .addUse(GR->getSPIRVTypeID(Call->ReturnType)) + .addUse(TypeReg) .addUse(QueryResult) .addImm(ExtractedComposite); + if (NewType != nullptr) + insertAssignInstr(Call->ReturnRegister, nullptr, NewType, GR, MIRBuilder, + MIRBuilder.getMF().getRegInfo()); } else { // More than 1 component is expected, fill a new vector. 
auto MIB = MIRBuilder.buildInstr(SPIRV::OpVectorShuffle) @@ -2063,16 +2074,30 @@ static bool generateAsyncCopy(const SPIRV::IncomingCall *Call, auto Scope = buildConstantIntReg(SPIRV::Scope::Workgroup, MIRBuilder, GR); switch (Opcode) { - case SPIRV::OpGroupAsyncCopy: - return MIRBuilder.buildInstr(Opcode) - .addDef(Call->ReturnRegister) - .addUse(GR->getSPIRVTypeID(Call->ReturnType)) - .addUse(Scope) - .addUse(Call->Arguments[0]) - .addUse(Call->Arguments[1]) - .addUse(Call->Arguments[2]) - .addUse(buildConstantIntReg(1, MIRBuilder, GR)) - .addUse(Call->Arguments[3]); + case SPIRV::OpGroupAsyncCopy: { + SPIRVType *NewType = + Call->ReturnType->getOpcode() == SPIRV::OpTypeEvent + ? nullptr + : GR->getOrCreateSPIRVTypeByName("spirv.Event", MIRBuilder); + Register TypeReg = GR->getSPIRVTypeID(NewType ? NewType : Call->ReturnType); + unsigned NumArgs = Call->Arguments.size(); + Register EventReg = Call->Arguments[NumArgs - 1]; + bool Res = MIRBuilder.buildInstr(Opcode) + .addDef(Call->ReturnRegister) + .addUse(TypeReg) + .addUse(Scope) + .addUse(Call->Arguments[0]) + .addUse(Call->Arguments[1]) + .addUse(Call->Arguments[2]) + .addUse(Call->Arguments.size() > 4 + ? 
Call->Arguments[3] + : buildConstantIntReg(1, MIRBuilder, GR)) + .addUse(EventReg); + if (NewType != nullptr) + insertAssignInstr(Call->ReturnRegister, nullptr, NewType, GR, MIRBuilder, + MIRBuilder.getMF().getRegInfo()); + return Res; + } case SPIRV::OpGroupWaitEvents: return MIRBuilder.buildInstr(Opcode) .addUse(Scope) diff --git a/llvm/lib/Target/SPIRV/SPIRVBuiltins.td b/llvm/lib/Target/SPIRV/SPIRVBuiltins.td index 24c6c26..edc9e1a 100644 --- a/llvm/lib/Target/SPIRV/SPIRVBuiltins.td +++ b/llvm/lib/Target/SPIRV/SPIRVBuiltins.td @@ -586,6 +586,7 @@ defm : DemangledNativeBuiltin<"__spirv_SpecConstantComposite", OpenCL_std, SpecC // Async Copy and Prefetch builtin records: defm : DemangledNativeBuiltin<"async_work_group_copy", OpenCL_std, AsyncCopy, 4, 4, OpGroupAsyncCopy>; +defm : DemangledNativeBuiltin<"async_work_group_strided_copy", OpenCL_std, AsyncCopy, 5, 5, OpGroupAsyncCopy>; defm : DemangledNativeBuiltin<"__spirv_GroupAsyncCopy", OpenCL_std, AsyncCopy, 6, 6, OpGroupAsyncCopy>; defm : DemangledNativeBuiltin<"wait_group_events", OpenCL_std, AsyncCopy, 2, 2, OpGroupWaitEvents>; defm : DemangledNativeBuiltin<"__spirv_GroupWaitEvents", OpenCL_std, AsyncCopy, 3, 3, OpGroupWaitEvents>; diff --git a/llvm/lib/Target/SPIRV/SPIRVEmitIntrinsics.cpp b/llvm/lib/Target/SPIRV/SPIRVEmitIntrinsics.cpp index 5ef0be1..bbd25dc 100644 --- a/llvm/lib/Target/SPIRV/SPIRVEmitIntrinsics.cpp +++ b/llvm/lib/Target/SPIRV/SPIRVEmitIntrinsics.cpp @@ -61,9 +61,6 @@ class SPIRVEmitIntrinsics DenseMap<Instruction *, Type *> AggrConstTypes; DenseSet<Instruction *> AggrStores; - // a registry of created Intrinsic::spv_assign_ptr_type instructions - DenseMap<Value *, CallInst *> AssignPtrTypeInstr; - // deduce element type of untyped pointers Type *deduceElementType(Value *I); Type *deduceElementTypeHelper(Value *I); @@ -98,14 +95,16 @@ class SPIRVEmitIntrinsics return B.CreateIntrinsic(IntrID, {Types}, Args); } + void buildAssignType(IRBuilder<> &B, Type *ElemTy, Value *Arg); void 
buildAssignPtr(IRBuilder<> &B, Type *ElemTy, Value *Arg); + void updateAssignType(CallInst *AssignCI, Value *Arg, Value *OfType); void replaceMemInstrUses(Instruction *Old, Instruction *New, IRBuilder<> &B); void processInstrAfterVisit(Instruction *I, IRBuilder<> &B); void insertAssignPtrTypeIntrs(Instruction *I, IRBuilder<> &B); void insertAssignTypeIntrs(Instruction *I, IRBuilder<> &B); - void insertAssignTypeInstrForTargetExtTypes(TargetExtType *AssignedType, - Value *V, IRBuilder<> &B); + void insertAssignPtrTypeTargetExt(TargetExtType *AssignedType, Value *V, + IRBuilder<> &B); void replacePointerOperandWithPtrCast(Instruction *I, Value *Pointer, Type *ExpectedElementType, unsigned OperandToReplace, @@ -218,15 +217,39 @@ static inline void reportFatalOnTokenType(const Instruction *I) { false); } +void SPIRVEmitIntrinsics::buildAssignType(IRBuilder<> &B, Type *Ty, + Value *Arg) { + Value *OfType = PoisonValue::get(Ty); + CallInst *AssignCI = buildIntrWithMD(Intrinsic::spv_assign_type, + {Arg->getType()}, OfType, Arg, {}, B); + GR->addAssignPtrTypeInstr(Arg, AssignCI); +} + void SPIRVEmitIntrinsics::buildAssignPtr(IRBuilder<> &B, Type *ElemTy, Value *Arg) { - CallInst *AssignPtrTyCI = - buildIntrWithMD(Intrinsic::spv_assign_ptr_type, {Arg->getType()}, - Constant::getNullValue(ElemTy), Arg, - {B.getInt32(getPointerAddressSpace(Arg->getType()))}, B); + Value *OfType = PoisonValue::get(ElemTy); + CallInst *AssignPtrTyCI = buildIntrWithMD( + Intrinsic::spv_assign_ptr_type, {Arg->getType()}, OfType, Arg, + {B.getInt32(getPointerAddressSpace(Arg->getType()))}, B); GR->addDeducedElementType(AssignPtrTyCI, ElemTy); GR->addDeducedElementType(Arg, ElemTy); - AssignPtrTypeInstr[Arg] = AssignPtrTyCI; + GR->addAssignPtrTypeInstr(Arg, AssignPtrTyCI); +} + +void SPIRVEmitIntrinsics::updateAssignType(CallInst *AssignCI, Value *Arg, + Value *OfType) { + LLVMContext &Ctx = Arg->getContext(); + AssignCI->setArgOperand( + 1, MetadataAsValue::get( + Ctx, MDNode::get(Ctx, 
ValueAsMetadata::getConstant(OfType)))); + if (cast<IntrinsicInst>(AssignCI)->getIntrinsicID() != + Intrinsic::spv_assign_ptr_type) + return; + + // update association with the pointee type + Type *ElemTy = OfType->getType(); + GR->addDeducedElementType(AssignCI, ElemTy); + GR->addDeducedElementType(Arg, ElemTy); } // Set element pointer type to the given value of ValueTy and tries to @@ -513,19 +536,16 @@ void SPIRVEmitIntrinsics::deduceOperandElementType(Instruction *I) { if (!Ty) { GR->addDeducedElementType(Op, KnownElemTy); // check if there is existing Intrinsic::spv_assign_ptr_type instruction - auto It = AssignPtrTypeInstr.find(Op); - if (It == AssignPtrTypeInstr.end()) { + CallInst *AssignCI = GR->findAssignPtrTypeInstr(Op); + if (AssignCI == nullptr) { Instruction *User = dyn_cast<Instruction>(Op->use_begin()->get()); setInsertPointSkippingPhis(B, User ? User->getNextNode() : I); CallInst *CI = buildIntrWithMD(Intrinsic::spv_assign_ptr_type, {OpTy}, OpTyVal, Op, {B.getInt32(getPointerAddressSpace(OpTy))}, B); - AssignPtrTypeInstr[Op] = CI; + GR->addAssignPtrTypeInstr(Op, CI); } else { - It->second->setArgOperand( - 1, - MetadataAsValue::get( - Ctx, MDNode::get(Ctx, ValueAsMetadata::getConstant(OpTyVal)))); + updateAssignType(AssignCI, Op, OpTyVal); } } else { if (auto *OpI = dyn_cast<Instruction>(Op)) { @@ -559,7 +579,9 @@ void SPIRVEmitIntrinsics::replaceMemInstrUses(Instruction *Old, if (isAssignTypeInstr(U)) { B.SetInsertPoint(U); SmallVector<Value *, 2> Args = {New, U->getOperand(1)}; - B.CreateIntrinsic(Intrinsic::spv_assign_type, {New->getType()}, Args); + CallInst *AssignCI = + B.CreateIntrinsic(Intrinsic::spv_assign_type, {New->getType()}, Args); + GR->addAssignPtrTypeInstr(New, AssignCI); U->eraseFromParent(); } else if (isMemInstrToReplace(U) || isa<ReturnInst>(U) || isa<CallInst>(U)) { @@ -751,33 +773,39 @@ Instruction *SPIRVEmitIntrinsics::visitBitCastInst(BitCastInst &I) { return NewI; } -void 
SPIRVEmitIntrinsics::insertAssignTypeInstrForTargetExtTypes( +void SPIRVEmitIntrinsics::insertAssignPtrTypeTargetExt( TargetExtType *AssignedType, Value *V, IRBuilder<> &B) { - // Do not emit spv_assign_type if the V is of the AssignedType already. - if (V->getType() == AssignedType) - return; + Type *VTy = V->getType(); - // Do not emit spv_assign_type if there is one already targetting V. If the - // found spv_assign_type assigns a type different than AssignedType, report an - // error. Builtin types cannot be redeclared or casted. - for (auto User : V->users()) { - auto *II = dyn_cast<IntrinsicInst>(User); - if (!II || II->getIntrinsicID() != Intrinsic::spv_assign_type) - continue; + // A couple of sanity checks. + assert(isPointerTy(VTy) && "Expect a pointer type!"); + if (auto PType = dyn_cast<TypedPointerType>(VTy)) + if (PType->getElementType() != AssignedType) + report_fatal_error("Unexpected pointer element type!"); - MetadataAsValue *VMD = cast<MetadataAsValue>(II->getOperand(1)); - Type *BuiltinType = - dyn_cast<ConstantAsMetadata>(VMD->getMetadata())->getType(); - if (BuiltinType != AssignedType) - report_fatal_error("Type mismatch " + BuiltinType->getTargetExtName() + - "/" + AssignedType->getTargetExtName() + - " for value " + V->getName(), - false); + CallInst *AssignCI = GR->findAssignPtrTypeInstr(V); + if (!AssignCI) { + buildAssignType(B, AssignedType, V); return; } - Constant *Const = UndefValue::get(AssignedType); - buildIntrWithMD(Intrinsic::spv_assign_type, {V->getType()}, Const, V, {}, B); + Type *CurrentType = + dyn_cast<ConstantAsMetadata>( + cast<MetadataAsValue>(AssignCI->getOperand(1))->getMetadata()) + ->getType(); + if (CurrentType == AssignedType) + return; + + // Builtin types cannot be redeclared or casted. 
+ if (CurrentType->isTargetExtTy()) + report_fatal_error("Type mismatch " + CurrentType->getTargetExtName() + + "/" + AssignedType->getTargetExtName() + + " for value " + V->getName(), + false); + + // Our previous guess about the type seems to be wrong, let's update + // inferred type according to a new, more precise type information. + updateAssignType(AssignCI, V, PoisonValue::get(AssignedType)); } void SPIRVEmitIntrinsics::replacePointerOperandWithPtrCast( @@ -850,7 +878,7 @@ void SPIRVEmitIntrinsics::replacePointerOperandWithPtrCast( ExpectedElementTypeConst, Pointer, {B.getInt32(AddressSpace)}, B); GR->addDeducedElementType(CI, ExpectedElementType); GR->addDeducedElementType(Pointer, ExpectedElementType); - AssignPtrTypeInstr[Pointer] = CI; + GR->addAssignPtrTypeInstr(Pointer, CI); return; } @@ -929,8 +957,7 @@ void SPIRVEmitIntrinsics::insertPtrCastOrAssignTypeInstr(Instruction *I, for (unsigned OpIdx = 0; OpIdx < CI->arg_size(); OpIdx++) { Value *ArgOperand = CI->getArgOperand(OpIdx); - if (!isa<PointerType>(ArgOperand->getType()) && - !isa<TypedPointerType>(ArgOperand->getType())) + if (!isPointerTy(ArgOperand->getType())) continue; // Constants (nulls/undefs) are handled in insertAssignPtrTypeIntrs() @@ -952,8 +979,8 @@ void SPIRVEmitIntrinsics::insertPtrCastOrAssignTypeInstr(Instruction *I, continue; if (ExpectedType->isTargetExtTy()) - insertAssignTypeInstrForTargetExtTypes(cast<TargetExtType>(ExpectedType), - ArgOperand, B); + insertAssignPtrTypeTargetExt(cast<TargetExtType>(ExpectedType), + ArgOperand, B); else replacePointerOperandWithPtrCast(CI, ArgOperand, ExpectedType, OpIdx, B); } @@ -1145,7 +1172,7 @@ void SPIRVEmitIntrinsics::insertAssignPtrTypeIntrs(Instruction *I, CallInst *CI = buildIntrWithMD(Intrinsic::spv_assign_ptr_type, {I->getType()}, EltTyConst, I, {B.getInt32(AddressSpace)}, B); GR->addDeducedElementType(CI, ElemTy); - AssignPtrTypeInstr[I] = CI; + GR->addAssignPtrTypeInstr(I, CI); } void 
SPIRVEmitIntrinsics::insertAssignTypeIntrs(Instruction *I, @@ -1164,20 +1191,32 @@ void SPIRVEmitIntrinsics::insertAssignTypeIntrs(Instruction *I, TypeToAssign = It->second; } } - Constant *Const = UndefValue::get(TypeToAssign); - buildIntrWithMD(Intrinsic::spv_assign_type, {Ty}, Const, I, {}, B); + buildAssignType(B, TypeToAssign, I); } for (const auto &Op : I->operands()) { if (isa<ConstantPointerNull>(Op) || isa<UndefValue>(Op) || // Check GetElementPtrConstantExpr case. (isa<ConstantExpr>(Op) && isa<GEPOperator>(Op))) { setInsertPointSkippingPhis(B, I); - if (isa<UndefValue>(Op) && Op->getType()->isAggregateType()) - buildIntrWithMD(Intrinsic::spv_assign_type, {B.getInt32Ty()}, Op, - UndefValue::get(B.getInt32Ty()), {}, B); - else if (!isa<Instruction>(Op)) - buildIntrWithMD(Intrinsic::spv_assign_type, {Op->getType()}, Op, Op, {}, - B); + Type *OpTy = Op->getType(); + if (isa<UndefValue>(Op) && OpTy->isAggregateType()) { + CallInst *AssignCI = + buildIntrWithMD(Intrinsic::spv_assign_type, {B.getInt32Ty()}, Op, + UndefValue::get(B.getInt32Ty()), {}, B); + GR->addAssignPtrTypeInstr(Op, AssignCI); + } else if (!isa<Instruction>(Op)) { + Type *OpTy = Op->getType(); + if (auto PType = dyn_cast<TypedPointerType>(OpTy)) { + buildAssignPtr(B, PType->getElementType(), Op); + } else if (isPointerTy(OpTy)) { + Type *ElemTy = GR->findDeducedElementType(Op); + buildAssignPtr(B, ElemTy ? 
ElemTy : deduceElementType(Op), Op); + } else { + CallInst *AssignCI = buildIntrWithMD(Intrinsic::spv_assign_type, + {OpTy}, Op, Op, {}, B); + GR->addAssignPtrTypeInstr(Op, AssignCI); + } + } } } } @@ -1368,14 +1407,12 @@ bool SPIRVEmitIntrinsics::runOnFunction(Function &Func) { continue; insertAssignPtrTypeIntrs(I, B); + deduceOperandElementType(I); insertAssignTypeIntrs(I, B); insertPtrCastOrAssignTypeInstr(I, B); insertSpirvDecorations(I, B); } - for (auto &I : instructions(Func)) - deduceOperandElementType(&I); - for (auto *I : Worklist) { TrackConstants = true; if (!I->getType()->isVoidTy() || isa<StoreInst>(I)) diff --git a/llvm/lib/Target/SPIRV/SPIRVGlobalRegistry.h b/llvm/lib/Target/SPIRV/SPIRVGlobalRegistry.h index ef0973d..db01f68 100644 --- a/llvm/lib/Target/SPIRV/SPIRVGlobalRegistry.h +++ b/llvm/lib/Target/SPIRV/SPIRVGlobalRegistry.h @@ -73,8 +73,11 @@ class SPIRVGlobalRegistry { // untyped pointers. DenseMap<Value *, Type *> DeducedElTys; // Maps composite values to deduced types where untyped pointers are replaced - // with typed ones + // with typed ones. DenseMap<Value *, Type *> DeducedNestedTys; + // Maps values to "assign type" calls, thus being a registry of created + // Intrinsic::spv_assign_ptr_type instructions. + DenseMap<Value *, CallInst *> AssignPtrTypeInstr; // Add a new OpTypeXXX instruction without checking for duplicates. SPIRVType *createSPIRVType(const Type *Type, MachineIRBuilder &MIRBuilder, @@ -149,6 +152,17 @@ public: return It == FunResPointerTypes.end() ? nullptr : It->second; } + // A registry of "assign type" records: + // - Add a record. + void addAssignPtrTypeInstr(Value *Val, CallInst *AssignPtrTyCI) { + AssignPtrTypeInstr[Val] = AssignPtrTyCI; + } + // - Find a record. + CallInst *findAssignPtrTypeInstr(const Value *Val) { + auto It = AssignPtrTypeInstr.find(Val); + return It == AssignPtrTypeInstr.end() ? 
nullptr : It->second; + } + // Deduced element types of untyped pointers and composites: // - Add a record to the map of deduced element types. void addDeducedElementType(Value *Val, Type *Ty) { DeducedElTys[Val] = Ty; } diff --git a/llvm/lib/Target/SPIRV/SPIRVPreLegalizer.cpp b/llvm/lib/Target/SPIRV/SPIRVPreLegalizer.cpp index 3d53608..a0a253c 100644 --- a/llvm/lib/Target/SPIRV/SPIRVPreLegalizer.cpp +++ b/llvm/lib/Target/SPIRV/SPIRVPreLegalizer.cpp @@ -417,7 +417,8 @@ generateAssignInstrs(MachineFunction &MF, SPIRVGlobalRegistry *GR, MachineInstr *Def = MRI.getVRegDef(Reg); assert(Def && "Expecting an instruction that defines the register"); // G_GLOBAL_VALUE already has type info. - if (Def->getOpcode() != TargetOpcode::G_GLOBAL_VALUE) + if (Def->getOpcode() != TargetOpcode::G_GLOBAL_VALUE && + Def->getOpcode() != SPIRV::ASSIGN_TYPE) insertAssignInstr(Reg, nullptr, AssignedPtrType, GR, MIB, MF.getRegInfo()); ToErase.push_back(&MI); @@ -427,7 +428,8 @@ generateAssignInstrs(MachineFunction &MF, SPIRVGlobalRegistry *GR, MachineInstr *Def = MRI.getVRegDef(Reg); assert(Def && "Expecting an instruction that defines the register"); // G_GLOBAL_VALUE already has type info. 
- if (Def->getOpcode() != TargetOpcode::G_GLOBAL_VALUE) + if (Def->getOpcode() != TargetOpcode::G_GLOBAL_VALUE && + Def->getOpcode() != SPIRV::ASSIGN_TYPE) insertAssignInstr(Reg, Ty, nullptr, GR, MIB, MF.getRegInfo()); ToErase.push_back(&MI); } else if (MIOp == TargetOpcode::G_CONSTANT || diff --git a/llvm/lib/Target/WebAssembly/AsmParser/WebAssemblyAsmParser.cpp b/llvm/lib/Target/WebAssembly/AsmParser/WebAssemblyAsmParser.cpp index 8e20631..f5bc584 100644 --- a/llvm/lib/Target/WebAssembly/AsmParser/WebAssemblyAsmParser.cpp +++ b/llvm/lib/Target/WebAssembly/AsmParser/WebAssemblyAsmParser.cpp @@ -178,14 +178,15 @@ static wasm::WasmLimits DefaultLimits() { } static MCSymbolWasm *GetOrCreateFunctionTableSymbol(MCContext &Ctx, - const StringRef &Name) { + const StringRef &Name, + bool is64) { MCSymbolWasm *Sym = cast_or_null<MCSymbolWasm>(Ctx.lookupSymbol(Name)); if (Sym) { if (!Sym->isFunctionTable()) Ctx.reportError(SMLoc(), "symbol is not a wasm funcref table"); } else { Sym = cast<MCSymbolWasm>(Ctx.getOrCreateSymbol(Name)); - Sym->setFunctionTable(); + Sym->setFunctionTable(is64); // The default function table is synthesized by the linker. 
Sym->setUndefined(); } @@ -258,7 +259,7 @@ public: MCAsmParserExtension::Initialize(Parser); DefaultFunctionTable = GetOrCreateFunctionTableSymbol( - getContext(), "__indirect_function_table"); + getContext(), "__indirect_function_table", is64); if (!STI->checkFeatures("+reference-types")) DefaultFunctionTable->setOmitFromLinkingSection(); } @@ -508,7 +509,7 @@ public: auto &Tok = Lexer.getTok(); if (Tok.is(AsmToken::Identifier)) { auto *Sym = - GetOrCreateFunctionTableSymbol(getContext(), Tok.getString()); + GetOrCreateFunctionTableSymbol(getContext(), Tok.getString(), is64); const auto *Val = MCSymbolRefExpr::create(Sym, getContext()); *Op = std::make_unique<WebAssemblyOperand>( WebAssemblyOperand::Symbol, Tok.getLoc(), Tok.getEndLoc(), @@ -836,6 +837,9 @@ public: // symbol auto WasmSym = cast<MCSymbolWasm>(Ctx.getOrCreateSymbol(SymName)); WasmSym->setType(wasm::WASM_SYMBOL_TYPE_TABLE); + if (is64) { + Limits.Flags |= wasm::WASM_LIMITS_FLAG_IS_64; + } wasm::WasmTableType Type = {*ElemType, Limits}; WasmSym->setTableType(Type); TOut.emitTableType(WasmSym); diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyUtilities.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyUtilities.cpp index 5e727980..c5a047e 100644 --- a/llvm/lib/Target/WebAssembly/WebAssemblyUtilities.cpp +++ b/llvm/lib/Target/WebAssembly/WebAssemblyUtilities.cpp @@ -108,8 +108,9 @@ MCSymbolWasm *WebAssembly::getOrCreateFunctionTableSymbol( if (!Sym->isFunctionTable()) Ctx.reportError(SMLoc(), "symbol is not a wasm funcref table"); } else { + bool is64 = Subtarget && Subtarget->getTargetTriple().isArch64Bit(); Sym = cast<MCSymbolWasm>(Ctx.getOrCreateSymbol(Name)); - Sym->setFunctionTable(); + Sym->setFunctionTable(is64); // The default function table is synthesized by the linker. 
Sym->setUndefined(); } diff --git a/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp b/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp index 0bf3294..3933e82 100644 --- a/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp +++ b/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp @@ -5120,6 +5120,9 @@ void X86DAGToDAGISel::Select(SDNode *Node) { case Intrinsic::x86_tileloaddt164_internal: { if (!Subtarget->hasAMXTILE()) break; + auto *MFI = + CurDAG->getMachineFunction().getInfo<X86MachineFunctionInfo>(); + MFI->setAMXProgModel(AMXProgModelEnum::ManagedRA); unsigned Opc = IntNo == Intrinsic::x86_tileloadd64_internal ? X86::PTILELOADDV : X86::PTILELOADDT1V; @@ -5201,6 +5204,9 @@ void X86DAGToDAGISel::Select(SDNode *Node) { break; } case Intrinsic::x86_tilestored64_internal: { + auto *MFI = + CurDAG->getMachineFunction().getInfo<X86MachineFunctionInfo>(); + MFI->setAMXProgModel(AMXProgModelEnum::ManagedRA); unsigned Opc = X86::PTILESTOREDV; // _tile_stored_internal(row, col, buf, STRIDE, c) SDValue Base = Node->getOperand(4); @@ -5228,6 +5234,9 @@ void X86DAGToDAGISel::Select(SDNode *Node) { case Intrinsic::x86_tilestored64: { if (!Subtarget->hasAMXTILE()) break; + auto *MFI = + CurDAG->getMachineFunction().getInfo<X86MachineFunctionInfo>(); + MFI->setAMXProgModel(AMXProgModelEnum::DirectReg); unsigned Opc; switch (IntNo) { default: llvm_unreachable("Unexpected intrinsic!"); diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index 7d30de1..3fbab3a 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -615,6 +615,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::FSIN, VT, Action); setOperationAction(ISD::FCOS, VT, Action); setOperationAction(ISD::FSINCOS, VT, Action); + setOperationAction(ISD::FTAN, VT, Action); setOperationAction(ISD::FSQRT, VT, Action); setOperationAction(ISD::FPOW, VT, Action); setOperationAction(ISD::FLOG, VT, Action); @@ -833,9 +834,12 @@ 
X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, } // Always expand sin/cos functions even though x87 has an instruction. + // clang-format off setOperationAction(ISD::FSIN , MVT::f80, Expand); setOperationAction(ISD::FCOS , MVT::f80, Expand); setOperationAction(ISD::FSINCOS, MVT::f80, Expand); + setOperationAction(ISD::FTAN , MVT::f80, Expand); + // clang-format on setOperationAction(ISD::FFLOOR, MVT::f80, Expand); setOperationAction(ISD::FCEIL, MVT::f80, Expand); @@ -888,11 +892,15 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::FNEG, MVT::f128, Custom); setOperationAction(ISD::FCOPYSIGN, MVT::f128, Custom); + // clang-format off setOperationAction(ISD::FSIN, MVT::f128, LibCall); setOperationAction(ISD::STRICT_FSIN, MVT::f128, LibCall); setOperationAction(ISD::FCOS, MVT::f128, LibCall); setOperationAction(ISD::STRICT_FCOS, MVT::f128, LibCall); setOperationAction(ISD::FSINCOS, MVT::f128, LibCall); + setOperationAction(ISD::FTAN, MVT::f128, LibCall); + setOperationAction(ISD::STRICT_FTAN, MVT::f128, LibCall); + // clang-format on // No STRICT_FSINCOS setOperationAction(ISD::FSQRT, MVT::f128, LibCall); setOperationAction(ISD::STRICT_FSQRT, MVT::f128, LibCall); @@ -944,9 +952,11 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, for (auto VT : { MVT::v8f16, MVT::v16f16, MVT::v32f16, MVT::v4f32, MVT::v8f32, MVT::v16f32, MVT::v2f64, MVT::v4f64, MVT::v8f64 }) { + // clang-format off setOperationAction(ISD::FSIN, VT, Expand); setOperationAction(ISD::FSINCOS, VT, Expand); setOperationAction(ISD::FCOS, VT, Expand); + setOperationAction(ISD::FTAN, VT, Expand); setOperationAction(ISD::FREM, VT, Expand); setOperationAction(ISD::FCOPYSIGN, VT, Expand); setOperationAction(ISD::FPOW, VT, Expand); @@ -956,6 +966,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::FEXP, VT, Expand); setOperationAction(ISD::FEXP2, VT, Expand); setOperationAction(ISD::FEXP10, VT, 
Expand); + // clang-format on } // First set operation action for all vector types to either promote @@ -2473,7 +2484,8 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, // function casting to f64 and calling `fmod`. if (Subtarget.is32Bit() && (Subtarget.isTargetWindowsMSVC() || Subtarget.isTargetWindowsItanium())) - for (ISD::NodeType Op : + // clang-format off + for (ISD::NodeType Op : {ISD::FCEIL, ISD::STRICT_FCEIL, ISD::FCOS, ISD::STRICT_FCOS, ISD::FEXP, ISD::STRICT_FEXP, @@ -2482,9 +2494,11 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, ISD::FLOG, ISD::STRICT_FLOG, ISD::FLOG10, ISD::STRICT_FLOG10, ISD::FPOW, ISD::STRICT_FPOW, - ISD::FSIN, ISD::STRICT_FSIN}) + ISD::FSIN, ISD::STRICT_FSIN, + ISD::FTAN, ISD::STRICT_FTAN}) if (isOperationExpand(Op, MVT::f32)) setOperationAction(Op, MVT::f32, Promote); + // clang-format on // We have target-specific dag combine patterns for the following nodes: setTargetDAGCombine({ISD::VECTOR_SHUFFLE, @@ -26776,7 +26790,7 @@ static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget &Subtarget, case Intrinsic::swift_async_context_addr: { SDLoc dl(Op); auto &MF = DAG.getMachineFunction(); - auto X86FI = MF.getInfo<X86MachineFunctionInfo>(); + auto *X86FI = MF.getInfo<X86MachineFunctionInfo>(); if (X86::isExtendedSwiftAsyncFrameSupported(Subtarget, MF)) { MF.getFrameInfo().setFrameAddressIsTaken(true); X86FI->setHasSwiftAsyncContext(true); @@ -36781,7 +36795,7 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI, } case TargetOpcode::PREALLOCATED_SETUP: { assert(Subtarget.is32Bit() && "preallocated only used in 32-bit"); - auto MFI = MF->getInfo<X86MachineFunctionInfo>(); + auto *MFI = MF->getInfo<X86MachineFunctionInfo>(); MFI->setHasPreallocatedCall(true); int64_t PreallocatedId = MI.getOperand(0).getImm(); size_t StackAdjustment = MFI->getPreallocatedStackSize(PreallocatedId); @@ -36798,7 +36812,7 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI, 
assert(Subtarget.is32Bit() && "preallocated calls only used in 32-bit"); int64_t PreallocatedId = MI.getOperand(1).getImm(); int64_t ArgIdx = MI.getOperand(2).getImm(); - auto MFI = MF->getInfo<X86MachineFunctionInfo>(); + auto *MFI = MF->getInfo<X86MachineFunctionInfo>(); size_t ArgOffset = MFI->getPreallocatedArgOffsets(PreallocatedId)[ArgIdx]; LLVM_DEBUG(dbgs() << "PREALLOCATED_ARG arg index " << ArgIdx << ", arg offset " << ArgOffset << "\n"); @@ -36841,6 +36855,13 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI, unsigned Imm = MI.getOperand(0).getImm(); BuildMI(*BB, MI, MIMD, TII->get(X86::TILEZERO), TMMImmToTMMReg(Imm)); MI.eraseFromParent(); // The pseudo is gone now. + auto *MFI = MF->getInfo<X86MachineFunctionInfo>(); + MFI->setAMXProgModel(AMXProgModelEnum::DirectReg); + return BB; + } + case X86::PTILEZEROV: { + auto *MFI = MF->getInfo<X86MachineFunctionInfo>(); + MFI->setAMXProgModel(AMXProgModelEnum::ManagedRA); return BB; } case X86::PTILELOADD: diff --git a/llvm/lib/Target/X86/X86InstrAMX.td b/llvm/lib/Target/X86/X86InstrAMX.td index c47bee0..99deacc 100644 --- a/llvm/lib/Target/X86/X86InstrAMX.td +++ b/llvm/lib/Target/X86/X86InstrAMX.td @@ -74,7 +74,7 @@ let SchedRW = [WriteSystem] in { GR16:$src2, opaquemem:$src3, TILE:$src4), []>; let isPseudo = true, isReMaterializable = 1, isAsCheapAsAMove = 1, - canFoldAsLoad = 1 in + canFoldAsLoad = 1, usesCustomInserter = 1 in def PTILEZEROV : PseudoI<(outs TILE:$dst), (ins GR16:$src1, GR16:$src2), [(set TILE:$dst, (int_x86_tilezero_internal GR16:$src1, GR16:$src2))]>; diff --git a/llvm/lib/Target/X86/X86LowerAMXType.cpp b/llvm/lib/Target/X86/X86LowerAMXType.cpp index b690587..079ac98 100644 --- a/llvm/lib/Target/X86/X86LowerAMXType.cpp +++ b/llvm/lib/Target/X86/X86LowerAMXType.cpp @@ -92,6 +92,14 @@ static bool isAMXIntrinsic(Value *I) { return false; } +static bool containsAMXCode(Function &F) { + for (BasicBlock &BB : F) + for (Instruction &I : BB) + if (I.getType()->isX86_AMXTy()) + 
return true; + return false; +} + static AllocaInst *createAllocaInstAtEntry(IRBuilder<> &Builder, BasicBlock *BB, Type *Ty) { Function &F = *BB->getParent(); @@ -1230,6 +1238,14 @@ public: } bool runOnFunction(Function &F) override { + // Performance optimization: most code doesn't use AMX, so return early if + // there are no instructions that produce AMX values. This is sufficient, as + // AMX arguments and constants are not allowed -- so any producer of an AMX + // value must be an instruction. + // TODO: find a cheaper way for this, without looking at all instructions. + if (!containsAMXCode(F)) + return false; + bool C = false; TargetMachine *TM = &getAnalysis<TargetPassConfig>().getTM<TargetMachine>(); TargetLibraryInfo *TLI = diff --git a/llvm/lib/Target/X86/X86LowerTileCopy.cpp b/llvm/lib/Target/X86/X86LowerTileCopy.cpp index f27676a..613722b 100644 --- a/llvm/lib/Target/X86/X86LowerTileCopy.cpp +++ b/llvm/lib/Target/X86/X86LowerTileCopy.cpp @@ -19,6 +19,7 @@ #include "X86.h" #include "X86InstrBuilder.h" #include "X86InstrInfo.h" +#include "X86MachineFunctionInfo.h" #include "X86Subtarget.h" #include "llvm/CodeGen/LiveRegUnits.h" #include "llvm/CodeGen/MachineBasicBlock.h" @@ -71,6 +72,10 @@ FunctionPass *llvm::createX86LowerTileCopyPass() { } bool X86LowerTileCopy::runOnMachineFunction(MachineFunction &MF) { + X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>(); + if (FuncInfo->getAMXProgModel() != AMXProgModelEnum::ManagedRA) + return false; + const X86Subtarget &ST = MF.getSubtarget<X86Subtarget>(); const X86InstrInfo *TII = ST.getInstrInfo(); const TargetRegisterInfo *TRI = ST.getRegisterInfo(); @@ -81,26 +86,8 @@ bool X86LowerTileCopy::runOnMachineFunction(MachineFunction &MF) { bool Changed = false; for (MachineBasicBlock &MBB : MF) { - // There won't be a tile copy if neither tile register live in nor live out. 
- bool HasTileCopy = false; - for (const auto &LI : MBB.liveins()) { - if (TILERegs.test(LI.PhysReg)) { - HasTileCopy = true; - break; - } - } LiveRegUnits UsedRegs(*TRI); UsedRegs.addLiveOuts(MBB); - if (!HasTileCopy) { - for (auto RegT : TILERegs.set_bits()) { - if (UsedRegs.available(RegT)) { - HasTileCopy = true; - break; - } - } - } - if (!HasTileCopy) - continue; for (MachineInstr &MI : llvm::make_early_inc_range(reverse(MBB))) { UsedRegs.stepBackward(MI); if (!MI.isCopy()) diff --git a/llvm/lib/Target/X86/X86MachineFunctionInfo.h b/llvm/lib/Target/X86/X86MachineFunctionInfo.h index f6e8532..8aaa499 100644 --- a/llvm/lib/Target/X86/X86MachineFunctionInfo.h +++ b/llvm/lib/Target/X86/X86MachineFunctionInfo.h @@ -21,6 +21,8 @@ namespace llvm { +enum AMXProgModelEnum { None = 0, DirectReg = 1, ManagedRA = 2 }; + /// X86MachineFunctionInfo - This class is derived from MachineFunction and /// contains private X86 target-specific information for each MachineFunction. class X86MachineFunctionInfo : public MachineFunctionInfo { @@ -96,6 +98,9 @@ class X86MachineFunctionInfo : public MachineFunctionInfo { /// used to address arguments in a function using a base pointer. int SEHFramePtrSaveIndex = 0; + /// The AMX programing model used in the function. + AMXProgModelEnum AMXProgModel = AMXProgModelEnum::None; + /// True if this function has a subset of CSRs that is handled explicitly via /// copies. 
bool IsSplitCSR = false; @@ -219,6 +224,13 @@ public: int getSEHFramePtrSaveIndex() const { return SEHFramePtrSaveIndex; } void setSEHFramePtrSaveIndex(int Index) { SEHFramePtrSaveIndex = Index; } + AMXProgModelEnum getAMXProgModel() const { return AMXProgModel; } + void setAMXProgModel(AMXProgModelEnum Model) { + assert((AMXProgModel == AMXProgModelEnum::None || AMXProgModel == Model) && + "mixed model is not supported"); + AMXProgModel = Model; + } + SmallVectorImpl<ForwardedRegister> &getForwardedMustTailRegParms() { return ForwardedMustTailRegParms; } diff --git a/llvm/lib/Target/X86/X86SchedIceLake.td b/llvm/lib/Target/X86/X86SchedIceLake.td index 2d29677..186d4d8 100644 --- a/llvm/lib/Target/X86/X86SchedIceLake.td +++ b/llvm/lib/Target/X86/X86SchedIceLake.td @@ -620,11 +620,11 @@ def : WriteRes<WriteNop, []>; // Horizontal add/sub instructions. //////////////////////////////////////////////////////////////////////////////// -defm : ICXWriteResPair<WriteFHAdd, [ICXPort5,ICXPort015], 6, [2,1], 3, 6>; -defm : ICXWriteResPair<WriteFHAddY, [ICXPort5,ICXPort015], 6, [2,1], 3, 7>; +defm : ICXWriteResPair<WriteFHAdd, [ICXPort5,ICXPort01], 6, [2,1], 3, 6>; +defm : ICXWriteResPair<WriteFHAddY, [ICXPort5,ICXPort01], 6, [2,1], 3, 7>; defm : ICXWriteResPair<WritePHAdd, [ICXPort5,ICXPort05], 3, [2,1], 3, 5>; -defm : ICXWriteResPair<WritePHAddX, [ICXPort5,ICXPort015], 3, [2,1], 3, 6>; -defm : ICXWriteResPair<WritePHAddY, [ICXPort5,ICXPort015], 3, [2,1], 3, 7>; +defm : ICXWriteResPair<WritePHAddX, [ICXPort15,ICXPort015], 3, [2,1], 3, 6>; +defm : ICXWriteResPair<WritePHAddY, [ICXPort15,ICXPort015], 3, [2,1], 3, 7>; // Remaining instrs. 
@@ -886,7 +886,7 @@ def ICXWriteResGroup37 : SchedWriteRes<[ICXPort0,ICXPort5]> { } def: InstRW<[ICXWriteResGroup37], (instregex "MMX_PH(ADD|SUB)SWrr")>; -def ICXWriteResGroup38 : SchedWriteRes<[ICXPort5,ICXPort01]> { +def ICXWriteResGroup38 : SchedWriteRes<[ICXPort15,ICXPort01]> { let Latency = 3; let NumMicroOps = 3; let ReleaseAtCycles = [2,1]; @@ -1739,13 +1739,13 @@ def ICXWriteResGroup137 : SchedWriteRes<[ICXPort23,ICXPort01]> { def: InstRW<[ICXWriteResGroup137], (instregex "MMX_CVT(T?)PS2PIrm", "(V?)CVTPS2PDrm")>; -def ICXWriteResGroup143 : SchedWriteRes<[ICXPort5,ICXPort01,ICXPort23]> { +def ICXWriteResGroup143 : SchedWriteRes<[ICXPort15,ICXPort01,ICXPort23]> { let Latency = 9; let NumMicroOps = 4; let ReleaseAtCycles = [2,1,1]; } -def: InstRW<[ICXWriteResGroup143], (instregex "(V?)PHADDSWrm", - "(V?)PHSUBSWrm")>; +def: InstRW<[ICXWriteResGroup143], (instrs PHADDSWrm, VPHADDSWrm, + PHSUBSWrm, VPHSUBSWrm)>; def ICXWriteResGroup146 : SchedWriteRes<[ICXPort1,ICXPort6,ICXPort23,ICXPort0156]> { let Latency = 9; @@ -1842,7 +1842,7 @@ def: InstRW<[ICXWriteResGroup151], (instregex "VEXPANDPDZ128rm(b?)", "VPEXPANDDZ128rm(b?)", "VPEXPANDQZ128rm(b?)")>; -def ICXWriteResGroup154 : SchedWriteRes<[ICXPort5,ICXPort01,ICXPort23]> { +def ICXWriteResGroup154 : SchedWriteRes<[ICXPort15,ICXPort01,ICXPort23]> { let Latency = 10; let NumMicroOps = 4; let ReleaseAtCycles = [2,1,1]; diff --git a/llvm/lib/Target/X86/X86SchedSkylakeServer.td b/llvm/lib/Target/X86/X86SchedSkylakeServer.td index a7dff0e..4fded44 100644 --- a/llvm/lib/Target/X86/X86SchedSkylakeServer.td +++ b/llvm/lib/Target/X86/X86SchedSkylakeServer.td @@ -615,8 +615,8 @@ def : WriteRes<WriteNop, []>; // Horizontal add/sub instructions. 
//////////////////////////////////////////////////////////////////////////////// -defm : SKXWriteResPair<WriteFHAdd, [SKXPort5,SKXPort015], 6, [2,1], 3, 6>; -defm : SKXWriteResPair<WriteFHAddY, [SKXPort5,SKXPort015], 6, [2,1], 3, 7>; +defm : SKXWriteResPair<WriteFHAdd, [SKXPort5,SKXPort01], 6, [2,1], 3, 6>; +defm : SKXWriteResPair<WriteFHAddY, [SKXPort5,SKXPort01], 6, [2,1], 3, 7>; defm : SKXWriteResPair<WritePHAdd, [SKXPort5,SKXPort05], 3, [2,1], 3, 5>; defm : SKXWriteResPair<WritePHAddX, [SKXPort5,SKXPort015], 3, [2,1], 3, 6>; defm : SKXWriteResPair<WritePHAddY, [SKXPort5,SKXPort015], 3, [2,1], 3, 7>; diff --git a/llvm/lib/TargetParser/Host.cpp b/llvm/lib/TargetParser/Host.cpp index 68155ac..b3b8486 100644 --- a/llvm/lib/TargetParser/Host.cpp +++ b/llvm/lib/TargetParser/Host.cpp @@ -302,6 +302,7 @@ StringRef sys::detail::getHostCPUNameForARM(StringRef ProcCpuinfoContent) { .Case("0x805", "cortex-a76") // Kryo 4xx/5xx Silver .Case("0xc00", "falkor") .Case("0xc01", "saphira") + .Case("0x001", "oryon-1") .Default("generic"); if (Implementer == "0x53") { // Samsung Electronics Co., Ltd. 
// The Exynos chips have a convoluted ID scheme that doesn't seem to follow diff --git a/llvm/lib/TargetParser/TargetParser.cpp b/llvm/lib/TargetParser/TargetParser.cpp index 7464237..60a784e 100644 --- a/llvm/lib/TargetParser/TargetParser.cpp +++ b/llvm/lib/TargetParser/TargetParser.cpp @@ -124,6 +124,7 @@ constexpr GPUInfo AMDGCNGPUs[] = { {{"gfx1103"}, {"gfx1103"}, GK_GFX1103, FEATURE_FAST_FMA_F32|FEATURE_FAST_DENORMAL_F32|FEATURE_WAVE32|FEATURE_WGP}, {{"gfx1150"}, {"gfx1150"}, GK_GFX1150, FEATURE_FAST_FMA_F32|FEATURE_FAST_DENORMAL_F32|FEATURE_WAVE32|FEATURE_WGP}, {{"gfx1151"}, {"gfx1151"}, GK_GFX1151, FEATURE_FAST_FMA_F32|FEATURE_FAST_DENORMAL_F32|FEATURE_WAVE32|FEATURE_WGP}, + {{"gfx1152"}, {"gfx1152"}, GK_GFX1152, FEATURE_FAST_FMA_F32|FEATURE_FAST_DENORMAL_F32|FEATURE_WAVE32|FEATURE_WGP}, {{"gfx1200"}, {"gfx1200"}, GK_GFX1200, FEATURE_FAST_FMA_F32|FEATURE_FAST_DENORMAL_F32|FEATURE_WAVE32|FEATURE_WGP}, {{"gfx1201"}, {"gfx1201"}, GK_GFX1201, FEATURE_FAST_FMA_F32|FEATURE_FAST_DENORMAL_F32|FEATURE_WAVE32|FEATURE_WGP}, @@ -275,6 +276,7 @@ AMDGPU::IsaVersion AMDGPU::getIsaVersion(StringRef GPU) { case GK_GFX1103: return {11, 0, 3}; case GK_GFX1150: return {11, 5, 0}; case GK_GFX1151: return {11, 5, 1}; + case GK_GFX1152: return {11, 5, 2}; case GK_GFX1200: return {12, 0, 0}; case GK_GFX1201: return {12, 0, 1}; @@ -341,6 +343,7 @@ void AMDGPU::fillAMDGPUFeatureMap(StringRef GPU, const Triple &T, Features["image-insts"] = true; Features["fp8-conversion-insts"] = true; break; + case GK_GFX1152: case GK_GFX1151: case GK_GFX1150: case GK_GFX1103: @@ -542,6 +545,7 @@ static bool isWave32Capable(StringRef GPU, const Triple &T) { switch (parseArchAMDGCN(GPU)) { case GK_GFX1201: case GK_GFX1200: + case GK_GFX1152: case GK_GFX1151: case GK_GFX1150: case GK_GFX1103: diff --git a/llvm/lib/Transforms/IPO/AttributorAttributes.cpp b/llvm/lib/Transforms/IPO/AttributorAttributes.cpp index 9a5732dc..549d036 100644 --- a/llvm/lib/Transforms/IPO/AttributorAttributes.cpp +++ 
b/llvm/lib/Transforms/IPO/AttributorAttributes.cpp @@ -419,7 +419,8 @@ struct AAReturnedFromReturnedValues : public BaseType { /// See AbstractAttribute::updateImpl(...). ChangeStatus updateImpl(Attributor &A) override { StateType S(StateType::getBestState(this->getState())); - clampReturnedValueStates<AAType, StateType, IRAttributeKind, RecurseForSelectAndPHI>( + clampReturnedValueStates<AAType, StateType, IRAttributeKind, + RecurseForSelectAndPHI>( A, *this, S, PropagateCallBaseContext ? this->getCallBaseContext() : nullptr); // TODO: If we know we visited all returned values, thus no are assumed @@ -6973,10 +6974,9 @@ ChangeStatus AAHeapToStackFunction::updateImpl(Attributor &A) { if (AI.LibraryFunctionId != LibFunc___kmpc_alloc_shared) { Instruction *CtxI = isa<InvokeInst>(AI.CB) ? AI.CB : AI.CB->getNextNode(); if (!Explorer || !Explorer->findInContextOf(UniqueFree, CtxI)) { - LLVM_DEBUG( - dbgs() - << "[H2S] unique free call might not be executed with the allocation " - << *UniqueFree << "\n"); + LLVM_DEBUG(dbgs() << "[H2S] unique free call might not be executed " + "with the allocation " + << *UniqueFree << "\n"); return false; } } @@ -10406,11 +10406,12 @@ struct AANoFPClassFloating : public AANoFPClassImpl { struct AANoFPClassReturned final : AAReturnedFromReturnedValues<AANoFPClass, AANoFPClassImpl, - AANoFPClassImpl::StateType, false, Attribute::None, false> { + AANoFPClassImpl::StateType, false, + Attribute::None, false> { AANoFPClassReturned(const IRPosition &IRP, Attributor &A) : AAReturnedFromReturnedValues<AANoFPClass, AANoFPClassImpl, - AANoFPClassImpl::StateType, false, Attribute::None, false>( - IRP, A) {} + AANoFPClassImpl::StateType, false, + Attribute::None, false>(IRP, A) {} /// See AbstractAttribute::trackStatistics() void trackStatistics() const override { diff --git a/llvm/lib/Transforms/IPO/CMakeLists.txt b/llvm/lib/Transforms/IPO/CMakeLists.txt index 5fbdbc3..92a9697 100644 --- a/llvm/lib/Transforms/IPO/CMakeLists.txt +++ 
b/llvm/lib/Transforms/IPO/CMakeLists.txt @@ -12,6 +12,7 @@ add_llvm_component_library(LLVMipo DeadArgumentElimination.cpp ElimAvailExtern.cpp EmbedBitcodePass.cpp + ExpandVariadics.cpp ExtractGV.cpp ForceFunctionAttrs.cpp FunctionAttrs.cpp diff --git a/llvm/lib/Transforms/IPO/ExpandVariadics.cpp b/llvm/lib/Transforms/IPO/ExpandVariadics.cpp new file mode 100644 index 0000000..d340bc0 --- /dev/null +++ b/llvm/lib/Transforms/IPO/ExpandVariadics.cpp @@ -0,0 +1,1012 @@ +//===-- ExpandVariadicsPass.cpp --------------------------------*- C++ -*-=// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This is an optimization pass for variadic functions. If called from codegen, +// it can serve as the implementation of variadic functions for a given target. +// +// The strategy is to turn the ... part of a variadic function into a va_list +// and fix up the call sites. The majority of the pass is target independent. +// The exceptions are the va_list type itself and the rules for where to store +// variables in memory such that va_arg can iterate over them given a va_list. +// +// The majority of the plumbing is splitting the variadic function into a +// single basic block that packs the variadic arguments into a va_list and +// a second function that does the work of the original. That packing is +// exactly what is done by va_start. Further, the transform from ... to va_list +// replaced va_start with an operation to copy a va_list from the new argument, +// which is exactly a va_copy. This is useful for reducing target-dependence. +// +// A va_list instance is a forward iterator, where the primary operation va_arg +// is dereference-then-increment. 
This interface forces significant convergent +// evolution between target specific implementations. The variation in runtime +// data layout is limited to that representable by the iterator, parameterised +// by the type passed to the va_arg instruction. +// +// Therefore the majority of the target specific subtlety is packing arguments +// into a stack allocated buffer such that a va_list can be initialised with it +// and the va_arg expansion for the target will find the arguments at runtime. +// +// The aggregate effect is to unblock other transforms, most critically the +// general purpose inliner. Known calls to variadic functions become zero cost. +// +// Consistency with clang is primarily tested by emitting va_arg using clang +// then expanding the variadic functions using this pass, followed by trying +// to constant fold the functions to no-ops. +// +// Target specific behaviour is tested in IR - mainly checking that values are +// put into positions in call frames that make sense for that particular target. +// +// There is one "clever" invariant in use. va_start intrinsics that are not +// within a variadic function are an error in the IR verifier. When this +// transform moves blocks from a variadic function into a fixed arity one, it +// moves va_start intrinsics along with everything else. That means that the +// va_start intrinsics that need to be rewritten to use the trailing argument +// are exactly those that are in non-variadic functions so no further state +// is needed to distinguish those that need to be rewritten. 
+// +//===----------------------------------------------------------------------===// + +#include "llvm/Transforms/IPO/ExpandVariadics.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/IR/Constants.h" +#include "llvm/IR/IRBuilder.h" +#include "llvm/IR/IntrinsicInst.h" +#include "llvm/IR/Module.h" +#include "llvm/IR/PassManager.h" +#include "llvm/InitializePasses.h" +#include "llvm/Pass.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/TargetParser/Triple.h" +#include "llvm/Transforms/Utils/ModuleUtils.h" + +#define DEBUG_TYPE "expand-variadics" + +using namespace llvm; + +namespace { + +cl::opt<ExpandVariadicsMode> ExpandVariadicsModeOption( + DEBUG_TYPE "-override", cl::desc("Override the behaviour of " DEBUG_TYPE), + cl::init(ExpandVariadicsMode::Unspecified), + cl::values(clEnumValN(ExpandVariadicsMode::Unspecified, "unspecified", + "Use the implementation defaults"), + clEnumValN(ExpandVariadicsMode::Disable, "disable", + "Disable the pass entirely"), + clEnumValN(ExpandVariadicsMode::Optimize, "optimize", + "Optimise without changing ABI"), + clEnumValN(ExpandVariadicsMode::Lowering, "lowering", + "Change variadic calling convention"))); + +bool commandLineOverride() { + return ExpandVariadicsModeOption != ExpandVariadicsMode::Unspecified; +} + +// Instances of this class encapsulate the target-dependant behaviour as a +// function of triple. Implementing a new ABI is adding a case to the switch +// in create(llvm::Triple) at the end of this file. +// This class may end up instantiated in TargetMachine instances, keeping it +// here for now until enough targets are implemented for the API to evolve. +class VariadicABIInfo { +protected: + VariadicABIInfo() = default; + +public: + static std::unique_ptr<VariadicABIInfo> create(const Triple &T); + + // Allow overriding whether the pass runs on a per-target basis + virtual bool enableForTarget() = 0; + + // Whether a valist instance is passed by value or by address + // I.e. 
does it need to be alloca'ed and stored into, or can + // it be passed directly in a SSA register + virtual bool vaListPassedInSSARegister() = 0; + + // The type of a va_list iterator object + virtual Type *vaListType(LLVMContext &Ctx) = 0; + + // The type of a va_list as a function argument as lowered by C + virtual Type *vaListParameterType(Module &M) = 0; + + // Initialize an allocated va_list object to point to an already + // initialized contiguous memory region. + // Return the value to pass as the va_list argument + virtual Value *initializeVaList(Module &M, LLVMContext &Ctx, + IRBuilder<> &Builder, AllocaInst *VaList, + Value *Buffer) = 0; + + struct VAArgSlotInfo { + Align DataAlign; // With respect to the call frame + bool Indirect; // Passed via a pointer + }; + virtual VAArgSlotInfo slotInfo(const DataLayout &DL, Type *Parameter) = 0; + + // Targets implemented so far all have the same trivial lowering for these + bool vaEndIsNop() { return true; } + bool vaCopyIsMemcpy() { return true; } + + virtual ~VariadicABIInfo() = default; +}; + +// Module implements getFunction() which returns nullptr on missing declaration +// and getOrInsertFunction which creates one when absent. Intrinsics.h only +// implements getDeclaration which creates one when missing. Checking whether +// an intrinsic exists thus inserts it in the module and it then needs to be +// deleted again to clean up. +// The right name for the two functions on intrinsics would match Module::, +// but doing that in a single change would introduce nullptr dereferences +// where currently there are none. The minimal collateral damage approach +// would split the change over a release to help downstream branches. As it +// is unclear what approach will be preferred, implementing the trivial +// function here in the meantime to decouple from that discussion. 
+Function *getPreexistingDeclaration(Module *M, Intrinsic::ID Id, + ArrayRef<Type *> Tys = {}) { + auto *FT = Intrinsic::getType(M->getContext(), Id, Tys); + return M->getFunction(Tys.empty() ? Intrinsic::getName(Id) + : Intrinsic::getName(Id, Tys, M, FT)); +} + +class ExpandVariadics : public ModulePass { + + // The pass construction sets the default to optimize when called from middle + // end and lowering when called from the backend. The command line variable + // overrides that. This is useful for testing and debugging. It also allows + // building an applications with variadic functions wholly removed if one + // has sufficient control over the dependencies, e.g. a statically linked + // clang that has no variadic function calls remaining in the binary. + +public: + static char ID; + const ExpandVariadicsMode Mode; + std::unique_ptr<VariadicABIInfo> ABI; + + ExpandVariadics(ExpandVariadicsMode Mode) + : ModulePass(ID), + Mode(commandLineOverride() ? ExpandVariadicsModeOption : Mode) {} + + StringRef getPassName() const override { return "Expand variadic functions"; } + + bool rewriteABI() { return Mode == ExpandVariadicsMode::Lowering; } + + bool runOnModule(Module &M) override; + + bool runOnFunction(Module &M, IRBuilder<> &Builder, Function *F); + + Function *replaceAllUsesWithNewDeclaration(Module &M, + Function *OriginalFunction); + + Function *deriveFixedArityReplacement(Module &M, IRBuilder<> &Builder, + Function *OriginalFunction); + + Function *defineVariadicWrapper(Module &M, IRBuilder<> &Builder, + Function *VariadicWrapper, + Function *FixedArityReplacement); + + bool expandCall(Module &M, IRBuilder<> &Builder, CallBase *CB, FunctionType *, + Function *NF); + + // The intrinsic functions va_copy and va_end are removed unconditionally. + // They correspond to a memcpy and a no-op on all implemented targets. 
+ // The va_start intrinsic is removed from basic blocks that were not created + // by this pass, some may remain if needed to maintain the external ABI. + + template <Intrinsic::ID ID, typename InstructionType> + bool expandIntrinsicUsers(Module &M, IRBuilder<> &Builder, + PointerType *IntrinsicArgType) { + bool Changed = false; + const DataLayout &DL = M.getDataLayout(); + if (Function *Intrinsic = + getPreexistingDeclaration(&M, ID, {IntrinsicArgType})) { + for (User *U : make_early_inc_range(Intrinsic->users())) + if (auto *I = dyn_cast<InstructionType>(U)) + Changed |= expandVAIntrinsicCall(Builder, DL, I); + + if (Intrinsic->use_empty()) + Intrinsic->eraseFromParent(); + } + return Changed; + } + + bool expandVAIntrinsicUsersWithAddrspace(Module &M, IRBuilder<> &Builder, + unsigned Addrspace) { + auto &Ctx = M.getContext(); + PointerType *IntrinsicArgType = PointerType::get(Ctx, Addrspace); + bool Changed = false; + + // expand vastart before vacopy as vastart may introduce a vacopy + Changed |= expandIntrinsicUsers<Intrinsic::vastart, VAStartInst>( + M, Builder, IntrinsicArgType); + Changed |= expandIntrinsicUsers<Intrinsic::vaend, VAEndInst>( + M, Builder, IntrinsicArgType); + Changed |= expandIntrinsicUsers<Intrinsic::vacopy, VACopyInst>( + M, Builder, IntrinsicArgType); + return Changed; + } + + bool expandVAIntrinsicCall(IRBuilder<> &Builder, const DataLayout &DL, + VAStartInst *Inst); + + bool expandVAIntrinsicCall(IRBuilder<> &, const DataLayout &, + VAEndInst *Inst); + + bool expandVAIntrinsicCall(IRBuilder<> &Builder, const DataLayout &DL, + VACopyInst *Inst); + + FunctionType *inlinableVariadicFunctionType(Module &M, FunctionType *FTy) { + // The type of "FTy" with the ... 
removed and a va_list appended + SmallVector<Type *> ArgTypes(FTy->param_begin(), FTy->param_end()); + ArgTypes.push_back(ABI->vaListParameterType(M)); + return FunctionType::get(FTy->getReturnType(), ArgTypes, + /*IsVarArgs=*/false); + } + + static ConstantInt *sizeOfAlloca(LLVMContext &Ctx, const DataLayout &DL, + AllocaInst *Alloced) { + std::optional<TypeSize> AllocaTypeSize = Alloced->getAllocationSize(DL); + uint64_t AsInt = AllocaTypeSize ? AllocaTypeSize->getFixedValue() : 0; + return ConstantInt::get(Type::getInt64Ty(Ctx), AsInt); + } + + bool expansionApplicableToFunction(Module &M, Function *F) { + if (F->isIntrinsic() || !F->isVarArg() || + F->hasFnAttribute(Attribute::Naked)) + return false; + + if (F->getCallingConv() != CallingConv::C) + return false; + + if (rewriteABI()) + return true; + + if (!F->hasExactDefinition()) + return false; + + return true; + } + + bool expansionApplicableToFunctionCall(CallBase *CB) { + if (CallInst *CI = dyn_cast<CallInst>(CB)) { + if (CI->isMustTailCall()) { + // Cannot expand musttail calls + return false; + } + + if (CI->getCallingConv() != CallingConv::C) + return false; + + return true; + } + + if (isa<InvokeInst>(CB)) { + // Invoke not implemented in initial implementation of pass + return false; + } + + // Other unimplemented derivative of CallBase + return false; + } + + class ExpandedCallFrame { + // Helper for constructing an alloca instance containing the arguments bound + // to the variadic ... 
parameter, rearranged to allow indexing through a + // va_list iterator + enum { N = 4 }; + SmallVector<Type *, N> FieldTypes; + enum Tag { Store, Memcpy, Padding }; + SmallVector<std::tuple<Value *, uint64_t, Tag>, N> Source; + + template <Tag tag> void append(Type *FieldType, Value *V, uint64_t Bytes) { + FieldTypes.push_back(FieldType); + Source.push_back({V, Bytes, tag}); + } + + public: + void store(LLVMContext &Ctx, Type *T, Value *V) { append<Store>(T, V, 0); } + + void memcpy(LLVMContext &Ctx, Type *T, Value *V, uint64_t Bytes) { + append<Memcpy>(T, V, Bytes); + } + + void padding(LLVMContext &Ctx, uint64_t By) { + append<Padding>(ArrayType::get(Type::getInt8Ty(Ctx), By), nullptr, 0); + } + + size_t size() const { return FieldTypes.size(); } + bool empty() const { return FieldTypes.empty(); } + + StructType *asStruct(LLVMContext &Ctx, StringRef Name) { + const bool IsPacked = true; + return StructType::create(Ctx, FieldTypes, + (Twine(Name) + ".vararg").str(), IsPacked); + } + + void initializeStructAlloca(const DataLayout &DL, IRBuilder<> &Builder, + AllocaInst *Alloced) { + + StructType *VarargsTy = cast<StructType>(Alloced->getAllocatedType()); + + for (size_t I = 0; I < size(); I++) { + + auto [V, bytes, tag] = Source[I]; + + if (tag == Padding) { + assert(V == nullptr); + continue; + } + + auto Dst = Builder.CreateStructGEP(VarargsTy, Alloced, I); + + assert(V != nullptr); + + if (tag == Store) + Builder.CreateStore(V, Dst); + + if (tag == Memcpy) + Builder.CreateMemCpy(Dst, {}, V, {}, bytes); + } + } + }; +}; + +bool ExpandVariadics::runOnModule(Module &M) { + bool Changed = false; + if (Mode == ExpandVariadicsMode::Disable) + return Changed; + + Triple TT(M.getTargetTriple()); + ABI = VariadicABIInfo::create(TT); + if (!ABI) + return Changed; + + if (!ABI->enableForTarget()) + return Changed; + + auto &Ctx = M.getContext(); + const DataLayout &DL = M.getDataLayout(); + IRBuilder<> Builder(Ctx); + + // Lowering needs to run on all functions exactly 
once. + // Optimize could run on functions containing va_start exactly once. + for (Function &F : make_early_inc_range(M)) + Changed |= runOnFunction(M, Builder, &F); + + // After runOnFunction, all known calls to known variadic functions have been + // replaced. va_start intrinsics are presently (and invalidly!) only present + // in functions that used to be variadic and have now been replaced to take a + // va_list instead. If lowering as opposed to optimising, calls to unknown + // variadic functions have also been replaced. + + { + // 0 and AllocaAddrSpace are sufficient for the targets implemented so far + unsigned Addrspace = 0; + Changed |= expandVAIntrinsicUsersWithAddrspace(M, Builder, Addrspace); + + Addrspace = DL.getAllocaAddrSpace(); + if (Addrspace != 0) + Changed |= expandVAIntrinsicUsersWithAddrspace(M, Builder, Addrspace); + } + + if (Mode != ExpandVariadicsMode::Lowering) + return Changed; + + for (Function &F : make_early_inc_range(M)) { + if (F.isDeclaration()) + continue; + + // Now need to track down indirect calls. Can't find those + // by walking uses of variadic functions, need to crawl the instruction + // stream. Fortunately this is only necessary for the ABI rewrite case. 
+ for (BasicBlock &BB : F) { + for (Instruction &I : make_early_inc_range(BB)) { + if (CallBase *CB = dyn_cast<CallBase>(&I)) { + if (CB->isIndirectCall()) { + FunctionType *FTy = CB->getFunctionType(); + if (FTy->isVarArg()) + Changed |= expandCall(M, Builder, CB, FTy, 0); + } + } + } + } + } + + return Changed; +} + +bool ExpandVariadics::runOnFunction(Module &M, IRBuilder<> &Builder, + Function *OriginalFunction) { + bool Changed = false; + + if (!expansionApplicableToFunction(M, OriginalFunction)) + return Changed; + + [[maybe_unused]] const bool OriginalFunctionIsDeclaration = + OriginalFunction->isDeclaration(); + assert(rewriteABI() || !OriginalFunctionIsDeclaration); + + // Declare a new function and redirect every use to that new function + Function *VariadicWrapper = + replaceAllUsesWithNewDeclaration(M, OriginalFunction); + assert(VariadicWrapper->isDeclaration()); + assert(OriginalFunction->use_empty()); + + // Create a new function taking va_list containing the implementation of the + // original + Function *FixedArityReplacement = + deriveFixedArityReplacement(M, Builder, OriginalFunction); + assert(OriginalFunction->isDeclaration()); + assert(FixedArityReplacement->isDeclaration() == + OriginalFunctionIsDeclaration); + assert(VariadicWrapper->isDeclaration()); + + // Create a single block forwarding wrapper that turns a ... into a va_list + [[maybe_unused]] Function *VariadicWrapperDefine = + defineVariadicWrapper(M, Builder, VariadicWrapper, FixedArityReplacement); + assert(VariadicWrapperDefine == VariadicWrapper); + assert(!VariadicWrapper->isDeclaration()); + + // We now have: + // 1. the original function, now as a declaration with no uses + // 2. a variadic function that unconditionally calls a fixed arity replacement + // 3. 
a fixed arity function equivalent to the original function + + // Replace known calls to the variadic with calls to the va_list equivalent + for (User *U : make_early_inc_range(VariadicWrapper->users())) { + if (CallBase *CB = dyn_cast<CallBase>(U)) { + Value *calledOperand = CB->getCalledOperand(); + if (VariadicWrapper == calledOperand) + Changed |= + expandCall(M, Builder, CB, VariadicWrapper->getFunctionType(), + FixedArityReplacement); + } + } + + // The original function will be erased. + // One of the two new functions will become a replacement for the original. + // When preserving the ABI, the other is an internal implementation detail. + // When rewriting the ABI, RAUW then the variadic one. + Function *const ExternallyAccessible = + rewriteABI() ? FixedArityReplacement : VariadicWrapper; + Function *const InternalOnly = + rewriteABI() ? VariadicWrapper : FixedArityReplacement; + + // The external function is the replacement for the original + ExternallyAccessible->setLinkage(OriginalFunction->getLinkage()); + ExternallyAccessible->setVisibility(OriginalFunction->getVisibility()); + ExternallyAccessible->setComdat(OriginalFunction->getComdat()); + ExternallyAccessible->takeName(OriginalFunction); + + // Annotate the internal one as internal + InternalOnly->setVisibility(GlobalValue::DefaultVisibility); + InternalOnly->setLinkage(GlobalValue::InternalLinkage); + + // The original is unused and obsolete + OriginalFunction->eraseFromParent(); + + InternalOnly->removeDeadConstantUsers(); + + if (rewriteABI()) { + // All known calls to the function have been removed by expandCall + // Resolve everything else by replaceAllUsesWith + VariadicWrapper->replaceAllUsesWith(FixedArityReplacement); + VariadicWrapper->eraseFromParent(); + } + + return Changed; +} + +Function * +ExpandVariadics::replaceAllUsesWithNewDeclaration(Module &M, + Function *OriginalFunction) { + auto &Ctx = M.getContext(); + Function &F = *OriginalFunction; + FunctionType *FTy = 
F.getFunctionType(); + Function *NF = Function::Create(FTy, F.getLinkage(), F.getAddressSpace()); + + NF->setName(F.getName() + ".varargs"); + NF->IsNewDbgInfoFormat = F.IsNewDbgInfoFormat; + + F.getParent()->getFunctionList().insert(F.getIterator(), NF); + + AttrBuilder ParamAttrs(Ctx); + AttributeList Attrs = NF->getAttributes(); + Attrs = Attrs.addParamAttributes(Ctx, FTy->getNumParams(), ParamAttrs); + NF->setAttributes(Attrs); + + OriginalFunction->replaceAllUsesWith(NF); + return NF; +} + +Function * +ExpandVariadics::deriveFixedArityReplacement(Module &M, IRBuilder<> &Builder, + Function *OriginalFunction) { + Function &F = *OriginalFunction; + // The purpose here is split the variadic function F into two functions + // One is a variadic function that bundles the passed argument into a va_list + // and passes it to the second function. The second function does whatever + // the original F does, except that it takes a va_list instead of the ... + + assert(expansionApplicableToFunction(M, &F)); + + auto &Ctx = M.getContext(); + + // Returned value isDeclaration() is equal to F.isDeclaration() + // but that property is not invariant throughout this function + const bool FunctionIsDefinition = !F.isDeclaration(); + + FunctionType *FTy = F.getFunctionType(); + SmallVector<Type *> ArgTypes(FTy->param_begin(), FTy->param_end()); + ArgTypes.push_back(ABI->vaListParameterType(M)); + + FunctionType *NFTy = inlinableVariadicFunctionType(M, FTy); + Function *NF = Function::Create(NFTy, F.getLinkage(), F.getAddressSpace()); + + // Note - same attribute handling as DeadArgumentElimination + NF->copyAttributesFrom(&F); + NF->setComdat(F.getComdat()); + F.getParent()->getFunctionList().insert(F.getIterator(), NF); + NF->setName(F.getName() + ".valist"); + NF->IsNewDbgInfoFormat = F.IsNewDbgInfoFormat; + + AttrBuilder ParamAttrs(Ctx); + + AttributeList Attrs = NF->getAttributes(); + Attrs = Attrs.addParamAttributes(Ctx, NFTy->getNumParams() - 1, ParamAttrs); + 
NF->setAttributes(Attrs); + + // Splice the implementation into the new function with minimal changes + if (FunctionIsDefinition) { + NF->splice(NF->begin(), &F); + + auto NewArg = NF->arg_begin(); + for (Argument &Arg : F.args()) { + Arg.replaceAllUsesWith(NewArg); + NewArg->setName(Arg.getName()); // takeName without killing the old one + ++NewArg; + } + NewArg->setName("varargs"); + } + + SmallVector<std::pair<unsigned, MDNode *>, 1> MDs; + F.getAllMetadata(MDs); + for (auto [KindID, Node] : MDs) + NF->addMetadata(KindID, *Node); + F.clearMetadata(); + + return NF; +} + +Function * +ExpandVariadics::defineVariadicWrapper(Module &M, IRBuilder<> &Builder, + Function *VariadicWrapper, + Function *FixedArityReplacement) { + auto &Ctx = Builder.getContext(); + const DataLayout &DL = M.getDataLayout(); + assert(VariadicWrapper->isDeclaration()); + Function &F = *VariadicWrapper; + + assert(F.isDeclaration()); + Type *VaListTy = ABI->vaListType(Ctx); + + auto *BB = BasicBlock::Create(Ctx, "entry", &F); + Builder.SetInsertPoint(BB); + + AllocaInst *VaListInstance = + Builder.CreateAlloca(VaListTy, nullptr, "va_start"); + + Builder.CreateLifetimeStart(VaListInstance, + sizeOfAlloca(Ctx, DL, VaListInstance)); + + Builder.CreateIntrinsic(Intrinsic::vastart, {DL.getAllocaPtrType(Ctx)}, + {VaListInstance}); + + SmallVector<Value *> Args; + for (Argument &A : F.args()) + Args.push_back(&A); + + Type *ParameterType = ABI->vaListParameterType(M); + if (ABI->vaListPassedInSSARegister()) + Args.push_back(Builder.CreateLoad(ParameterType, VaListInstance)); + else + Args.push_back(Builder.CreateAddrSpaceCast(VaListInstance, ParameterType)); + + CallInst *Result = Builder.CreateCall(FixedArityReplacement, Args); + + Builder.CreateIntrinsic(Intrinsic::vaend, {DL.getAllocaPtrType(Ctx)}, + {VaListInstance}); + Builder.CreateLifetimeEnd(VaListInstance, + sizeOfAlloca(Ctx, DL, VaListInstance)); + + if (Result->getType()->isVoidTy()) + Builder.CreateRetVoid(); + else + 
Builder.CreateRet(Result); + + return VariadicWrapper; +} + +bool ExpandVariadics::expandCall(Module &M, IRBuilder<> &Builder, CallBase *CB, + FunctionType *VarargFunctionType, + Function *NF) { + bool Changed = false; + const DataLayout &DL = M.getDataLayout(); + + if (!expansionApplicableToFunctionCall(CB)) { + if (rewriteABI()) + report_fatal_error("Cannot lower callbase instruction"); + return Changed; + } + + // This is tricky. The call instruction's function type might not match + // the type of the caller. When optimising, can leave it unchanged. + // Webassembly detects that inconsistency and repairs it. + FunctionType *FuncType = CB->getFunctionType(); + if (FuncType != VarargFunctionType) { + if (!rewriteABI()) + return Changed; + FuncType = VarargFunctionType; + } + + auto &Ctx = CB->getContext(); + + Align MaxFieldAlign(1); + + // The strategy is to allocate a call frame containing the variadic + // arguments laid out such that a target specific va_list can be initialized + // with it, such that target specific va_arg instructions will correctly + // iterate over it. This means getting the alignment right and sometimes + // embedding a pointer to the value instead of embedding the value itself. + + Function *CBF = CB->getParent()->getParent(); + + ExpandedCallFrame Frame; + + uint64_t CurrentOffset = 0; + + for (unsigned I = FuncType->getNumParams(), E = CB->arg_size(); I < E; ++I) { + Value *ArgVal = CB->getArgOperand(I); + const bool IsByVal = CB->paramHasAttr(I, Attribute::ByVal); + const bool IsByRef = CB->paramHasAttr(I, Attribute::ByRef); + + // The type of the value being passed, decoded from byval/byref metadata if + // required + Type *const UnderlyingType = IsByVal ? CB->getParamByValType(I) + : IsByRef ? 
CB->getParamByRefType(I) + : ArgVal->getType(); + const uint64_t UnderlyingSize = + DL.getTypeAllocSize(UnderlyingType).getFixedValue(); + + // The type to be written into the call frame + Type *FrameFieldType = UnderlyingType; + + // The value to copy from when initialising the frame alloca + Value *SourceValue = ArgVal; + + VariadicABIInfo::VAArgSlotInfo SlotInfo = ABI->slotInfo(DL, UnderlyingType); + + if (SlotInfo.Indirect) { + // The va_arg lowering loads through a pointer. Set up an alloca to aim + // that pointer at. + Builder.SetInsertPointPastAllocas(CBF); + Builder.SetCurrentDebugLocation(CB->getStableDebugLoc()); + Value *CallerCopy = + Builder.CreateAlloca(UnderlyingType, nullptr, "IndirectAlloca"); + + Builder.SetInsertPoint(CB); + if (IsByVal) + Builder.CreateMemCpy(CallerCopy, {}, ArgVal, {}, UnderlyingSize); + else + Builder.CreateStore(ArgVal, CallerCopy); + + // Indirection now handled, pass the alloca ptr by value + FrameFieldType = DL.getAllocaPtrType(Ctx); + SourceValue = CallerCopy; + } + + // Alignment of the value within the frame + // This probably needs to be controllable as a function of type + Align DataAlign = SlotInfo.DataAlign; + + MaxFieldAlign = std::max(MaxFieldAlign, DataAlign); + + uint64_t DataAlignV = DataAlign.value(); + if (uint64_t Rem = CurrentOffset % DataAlignV) { + // Inject explicit padding to deal with alignment requirements + uint64_t Padding = DataAlignV - Rem; + Frame.padding(Ctx, Padding); + CurrentOffset += Padding; + } + + if (SlotInfo.Indirect) { + Frame.store(Ctx, FrameFieldType, SourceValue); + } else { + if (IsByVal) + Frame.memcpy(Ctx, FrameFieldType, SourceValue, UnderlyingSize); + else + Frame.store(Ctx, FrameFieldType, SourceValue); + } + + CurrentOffset += DL.getTypeAllocSize(FrameFieldType).getFixedValue(); + } + + if (Frame.empty()) { + // Not passing any arguments, hopefully va_arg won't try to read any + // Creating a single byte frame containing nothing to point the va_list + // instance as that is 
less special-casey in the compiler and probably + // easier to interpret in a debugger. + Frame.padding(Ctx, 1); + } + + StructType *VarargsTy = Frame.asStruct(Ctx, CBF->getName()); + + // The struct instance needs to be at least MaxFieldAlign for the alignment of + // the fields to be correct at runtime. Use the native stack alignment instead + // if that's greater as that tends to give better codegen. + // This is an awkward way to guess whether there is a known stack alignment + // without hitting an assert in DL.getStackAlignment, 1024 is an arbitrary + // number likely to be greater than the natural stack alignment. + // TODO: DL.getStackAlignment could return a MaybeAlign instead of assert + Align AllocaAlign = MaxFieldAlign; + if (DL.exceedsNaturalStackAlignment(Align(1024))) + AllocaAlign = std::max(AllocaAlign, DL.getStackAlignment()); + + // Put the alloca to hold the variadic args in the entry basic block. + Builder.SetInsertPointPastAllocas(CBF); + + // SetCurrentDebugLocation when the builder SetInsertPoint method does not + Builder.SetCurrentDebugLocation(CB->getStableDebugLoc()); + + // The awkward construction here is to set the alignment on the instance + AllocaInst *Alloced = Builder.Insert( + new AllocaInst(VarargsTy, DL.getAllocaAddrSpace(), nullptr, AllocaAlign), + "vararg_buffer"); + Changed = true; + assert(Alloced->getAllocatedType() == VarargsTy); + + // Initialize the fields in the struct + Builder.SetInsertPoint(CB); + Builder.CreateLifetimeStart(Alloced, sizeOfAlloca(Ctx, DL, Alloced)); + Frame.initializeStructAlloca(DL, Builder, Alloced); + + const unsigned NumArgs = FuncType->getNumParams(); + SmallVector<Value *> Args(CB->arg_begin(), CB->arg_begin() + NumArgs); + + // Initialize a va_list pointing to that struct and pass it as the last + // argument + AllocaInst *VaList = nullptr; + { + if (!ABI->vaListPassedInSSARegister()) { + Type *VaListTy = ABI->vaListType(Ctx); + Builder.SetInsertPointPastAllocas(CBF); + 
Builder.SetCurrentDebugLocation(CB->getStableDebugLoc()); + VaList = Builder.CreateAlloca(VaListTy, nullptr, "va_argument"); + Builder.SetInsertPoint(CB); + Builder.CreateLifetimeStart(VaList, sizeOfAlloca(Ctx, DL, VaList)); + } + Builder.SetInsertPoint(CB); + Args.push_back(ABI->initializeVaList(M, Ctx, Builder, VaList, Alloced)); + } + + // Attributes excluding any on the vararg arguments + AttributeList PAL = CB->getAttributes(); + if (!PAL.isEmpty()) { + SmallVector<AttributeSet, 8> ArgAttrs; + for (unsigned ArgNo = 0; ArgNo < NumArgs; ArgNo++) + ArgAttrs.push_back(PAL.getParamAttrs(ArgNo)); + PAL = + AttributeList::get(Ctx, PAL.getFnAttrs(), PAL.getRetAttrs(), ArgAttrs); + } + + SmallVector<OperandBundleDef, 1> OpBundles; + CB->getOperandBundlesAsDefs(OpBundles); + + CallBase *NewCB = nullptr; + + if (CallInst *CI = dyn_cast<CallInst>(CB)) { + Value *Dst = NF ? NF : CI->getCalledOperand(); + FunctionType *NFTy = inlinableVariadicFunctionType(M, VarargFunctionType); + + NewCB = CallInst::Create(NFTy, Dst, Args, OpBundles, "", CI); + + CallInst::TailCallKind TCK = CI->getTailCallKind(); + assert(TCK != CallInst::TCK_MustTail); + + // Can't tail call a function that is being passed a pointer to an alloca + if (TCK == CallInst::TCK_Tail) + TCK = CallInst::TCK_None; + CI->setTailCallKind(TCK); + + } else { + llvm_unreachable("Unreachable when !expansionApplicableToFunctionCall()"); + } + + if (VaList) + Builder.CreateLifetimeEnd(VaList, sizeOfAlloca(Ctx, DL, VaList)); + + Builder.CreateLifetimeEnd(Alloced, sizeOfAlloca(Ctx, DL, Alloced)); + + NewCB->setAttributes(PAL); + NewCB->takeName(CB); + NewCB->setCallingConv(CB->getCallingConv()); + NewCB->setDebugLoc(DebugLoc()); + + // DeadArgElim and ArgPromotion copy exactly this metadata + NewCB->copyMetadata(*CB, {LLVMContext::MD_prof, LLVMContext::MD_dbg}); + + CB->replaceAllUsesWith(NewCB); + CB->eraseFromParent(); + return Changed; +} + +bool ExpandVariadics::expandVAIntrinsicCall(IRBuilder<> &Builder, + const 
DataLayout &DL,
+                                            VAStartInst *Inst) {
+  // Only removing va_start instructions that are not in variadic functions.
+  // Those would be rejected by the IR verifier before this pass.
+  // After splicing basic blocks from a variadic function into a fixed arity
+  // one the va_start that used to refer to the ... parameter still exists.
+  // There are also variadic functions that this pass did not change and
+  // va_start instances in the created single block wrapper functions.
+  // Replace exactly the instances in non-variadic functions as those are
+  // the ones to be fixed up to use the va_list passed as the final argument.
+
+  Function *ContainingFunction = Inst->getFunction();
+  if (ContainingFunction->isVarArg()) {
+    return false;
+  }
+
+  // The last argument is a vaListParameterType, either a va_list
+  // or a pointer to one depending on the target.
+  bool PassedByValue = ABI->vaListPassedInSSARegister();
+  Argument *PassedVaList =
+      ContainingFunction->getArg(ContainingFunction->arg_size() - 1);
+
+  // va_start takes a pointer to a va_list, e.g. one on the stack
+  Value *VaStartArg = Inst->getArgList();
+
+  Builder.SetInsertPoint(Inst);
+
+  if (PassedByValue) {
+    // The general thing to do is create an alloca, store the va_list argument
+    // to it, then create a va_copy. When vaCopyIsMemcpy(), this optimises to a
+    // store to the VaStartArg. 
+ assert(ABI->vaCopyIsMemcpy()); + Builder.CreateStore(PassedVaList, VaStartArg); + } else { + + // Otherwise emit a vacopy to pick up target-specific handling if any + auto &Ctx = Builder.getContext(); + + Builder.CreateIntrinsic(Intrinsic::vacopy, {DL.getAllocaPtrType(Ctx)}, + {VaStartArg, PassedVaList}); + } + + Inst->eraseFromParent(); + return true; +} + +bool ExpandVariadics::expandVAIntrinsicCall(IRBuilder<> &, const DataLayout &, + VAEndInst *Inst) { + assert(ABI->vaEndIsNop()); + Inst->eraseFromParent(); + return true; +} + +bool ExpandVariadics::expandVAIntrinsicCall(IRBuilder<> &Builder, + const DataLayout &DL, + VACopyInst *Inst) { + assert(ABI->vaCopyIsMemcpy()); + Builder.SetInsertPoint(Inst); + + auto &Ctx = Builder.getContext(); + Type *VaListTy = ABI->vaListType(Ctx); + uint64_t Size = DL.getTypeAllocSize(VaListTy).getFixedValue(); + + Builder.CreateMemCpy(Inst->getDest(), {}, Inst->getSrc(), {}, + Builder.getInt32(Size)); + + Inst->eraseFromParent(); + return true; +} + +struct Amdgpu final : public VariadicABIInfo { + + bool enableForTarget() override { return true; } + + bool vaListPassedInSSARegister() override { return true; } + + Type *vaListType(LLVMContext &Ctx) override { + return PointerType::getUnqual(Ctx); + } + + Type *vaListParameterType(Module &M) override { + return PointerType::getUnqual(M.getContext()); + } + + Value *initializeVaList(Module &M, LLVMContext &Ctx, IRBuilder<> &Builder, + AllocaInst * /*va_list*/, Value *Buffer) override { + // Given Buffer, which is an AllocInst of vararg_buffer + // need to return something usable as parameter type + return Builder.CreateAddrSpaceCast(Buffer, vaListParameterType(M)); + } + + VAArgSlotInfo slotInfo(const DataLayout &DL, Type *Parameter) override { + return {Align(4), false}; + } +}; + +struct Wasm final : public VariadicABIInfo { + + bool enableForTarget() override { + // Currently wasm is only used for testing. 
+ return commandLineOverride(); + } + + bool vaListPassedInSSARegister() override { return true; } + + Type *vaListType(LLVMContext &Ctx) override { + return PointerType::getUnqual(Ctx); + } + + Type *vaListParameterType(Module &M) override { + return PointerType::getUnqual(M.getContext()); + } + + Value *initializeVaList(Module &M, LLVMContext &Ctx, IRBuilder<> &Builder, + AllocaInst * /*va_list*/, Value *Buffer) override { + return Buffer; + } + + VAArgSlotInfo slotInfo(const DataLayout &DL, Type *Parameter) override { + LLVMContext &Ctx = Parameter->getContext(); + const unsigned MinAlign = 4; + Align A = DL.getABITypeAlign(Parameter); + if (A < MinAlign) + A = Align(MinAlign); + + if (auto s = dyn_cast<StructType>(Parameter)) { + if (s->getNumElements() > 1) { + return {DL.getABITypeAlign(PointerType::getUnqual(Ctx)), true}; + } + } + + return {A, false}; + } +}; + +std::unique_ptr<VariadicABIInfo> VariadicABIInfo::create(const Triple &T) { + switch (T.getArch()) { + case Triple::r600: + case Triple::amdgcn: { + return std::make_unique<Amdgpu>(); + } + + case Triple::wasm32: { + return std::make_unique<Wasm>(); + } + + default: + return {}; + } +} + +} // namespace + +char ExpandVariadics::ID = 0; + +INITIALIZE_PASS(ExpandVariadics, DEBUG_TYPE, "Expand variadic functions", false, + false) + +ModulePass *llvm::createExpandVariadicsPass(ExpandVariadicsMode M) { + return new ExpandVariadics(M); +} + +PreservedAnalyses ExpandVariadicsPass::run(Module &M, ModuleAnalysisManager &) { + return ExpandVariadics(Mode).runOnModule(M) ? 
PreservedAnalyses::none() + : PreservedAnalyses::all(); +} + +ExpandVariadicsPass::ExpandVariadicsPass(ExpandVariadicsMode M) : Mode(M) {} diff --git a/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp b/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp index 03923b8..f033d2b 100644 --- a/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp +++ b/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp @@ -262,8 +262,70 @@ public: // TODO: Should this be a map (from Caller node) for more efficient lookup? std::vector<std::shared_ptr<ContextEdge>> CallerEdges; - // The set of IDs for contexts including this node. - DenseSet<uint32_t> ContextIds; + // Get the list of edges from which we can compute allocation information + // such as the context ids and allocation type of this node. + const std::vector<std::shared_ptr<ContextEdge>> * + getEdgesWithAllocInfo() const { + // If node has any callees, compute from those, otherwise compute from + // callers (i.e. if this is the leaf allocation node). + if (!CalleeEdges.empty()) + return &CalleeEdges; + if (!CallerEdges.empty()) { + // A node with caller edges but no callee edges must be the allocation + // node. + assert(IsAllocation); + return &CallerEdges; + } + return nullptr; + } + + // Compute the context ids for this node from the union of its edge context + // ids. + DenseSet<uint32_t> getContextIds() const { + DenseSet<uint32_t> ContextIds; + auto *Edges = getEdgesWithAllocInfo(); + if (!Edges) + return {}; + unsigned Count = 0; + for (auto &Edge : *Edges) + Count += Edge->getContextIds().size(); + ContextIds.reserve(Count); + for (auto &Edge : *Edges) + ContextIds.insert(Edge->getContextIds().begin(), + Edge->getContextIds().end()); + return ContextIds; + } + + // Compute the allocation type for this node from the OR of its edge + // allocation types. 
+ uint8_t computeAllocType() const { + auto *Edges = getEdgesWithAllocInfo(); + if (!Edges) + return (uint8_t)AllocationType::None; + uint8_t BothTypes = + (uint8_t)AllocationType::Cold | (uint8_t)AllocationType::NotCold; + uint8_t AllocType = (uint8_t)AllocationType::None; + for (auto &Edge : *Edges) { + AllocType |= Edge->AllocTypes; + // Bail early if alloc type reached both, no further refinement. + if (AllocType == BothTypes) + return AllocType; + } + return AllocType; + } + + // The context ids set for this node is empty if its edge context ids are + // also all empty. + bool emptyContextIds() const { + auto *Edges = getEdgesWithAllocInfo(); + if (!Edges) + return true; + for (auto &Edge : *Edges) { + if (!Edge->getContextIds().empty()) + return false; + } + return true; + } // List of clones of this ContextNode, initially empty. std::vector<ContextNode *> Clones; @@ -308,15 +370,11 @@ public: void printCall(raw_ostream &OS) const { Call.print(OS); } // True if this node was effectively removed from the graph, in which case - // its context id set, caller edges, and callee edges should all be empty. + // it should have an allocation type of None and empty context ids. bool isRemoved() const { - // Note that we can have non-empty context ids with empty caller and - // callee edges if the graph ends up with a single node. - if (ContextIds.empty()) - assert(CalleeEdges.empty() && CallerEdges.empty() && - "Context ids empty but at least one of callee and caller edges " - "were not!"); - return ContextIds.empty(); + assert((AllocTypes == (uint8_t)AllocationType::None) == + emptyContextIds()); + return AllocTypes == (uint8_t)AllocationType::None; } void dump() const; @@ -429,7 +487,8 @@ private: /// else to its callers. Also updates OrigNode's edges to remove any context /// ids moved to the newly created edge. 
void connectNewNode(ContextNode *NewNode, ContextNode *OrigNode, - bool TowardsCallee); + bool TowardsCallee, + DenseSet<uint32_t> RemainingContextIds); /// Get the stack id corresponding to the given Id or Index (for IR this will /// return itself, for a summary index this will return the id recorded in the @@ -958,7 +1017,6 @@ void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::addStackNodesForMIB( // Update alloc type and context ids for this MIB. AllocNode->AllocTypes |= (uint8_t)AllocType; - AllocNode->ContextIds.insert(LastContextId); // Now add or update nodes for each stack id in alloc's context. // Later when processing the stack ids on non-alloc callsites we will adjust @@ -983,7 +1041,6 @@ void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::addStackNodesForMIB( auto Ins = StackIdSet.insert(StackId); if (!Ins.second) StackNode->Recursive = true; - StackNode->ContextIds.insert(LastContextId); StackNode->AllocTypes |= (uint8_t)AllocType; PrevNode->addOrUpdateCallerEdge(StackNode, AllocType, LastContextId); PrevNode = StackNode; @@ -1034,7 +1091,6 @@ void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>:: // it resulted in any added ids to NextNode. if (!NewIdsToAdd.empty()) { Edge->getContextIds().insert(NewIdsToAdd.begin(), NewIdsToAdd.end()); - NextNode->ContextIds.insert(NewIdsToAdd.begin(), NewIdsToAdd.end()); UpdateCallers(NextNode, Visited, UpdateCallers); } } @@ -1043,21 +1099,16 @@ void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>:: DenseSet<const ContextEdge *> Visited; for (auto &Entry : AllocationCallToContextNodeMap) { auto *Node = Entry.second; - // Update ids on the allocation nodes before calling the recursive - // update along caller edges, since this simplifies the logic during - // that traversal. 
- DenseSet<uint32_t> NewIdsToAdd = GetNewIds(Node->ContextIds); - Node->ContextIds.insert(NewIdsToAdd.begin(), NewIdsToAdd.end()); UpdateCallers(Node, Visited, UpdateCallers); } } template <typename DerivedCCG, typename FuncTy, typename CallTy> void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::connectNewNode( - ContextNode *NewNode, ContextNode *OrigNode, bool TowardsCallee) { - // Make a copy of the context ids, since this will be adjusted below as they - // are moved. - DenseSet<uint32_t> RemainingContextIds = NewNode->ContextIds; + ContextNode *NewNode, ContextNode *OrigNode, bool TowardsCallee, + // This must be passed by value to make a copy since it will be adjusted + // as ids are moved. + DenseSet<uint32_t> RemainingContextIds) { auto &OrigEdges = TowardsCallee ? OrigNode->CalleeEdges : OrigNode->CallerEdges; // Increment iterator in loop so that we can remove edges as needed. @@ -1104,6 +1155,51 @@ void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::connectNewNode( } template <typename DerivedCCG, typename FuncTy, typename CallTy> +static void checkEdge( + const std::shared_ptr<ContextEdge<DerivedCCG, FuncTy, CallTy>> &Edge) { + // Confirm that alloc type is not None and that we have at least one context + // id. + assert(Edge->AllocTypes != (uint8_t)AllocationType::None); + assert(!Edge->ContextIds.empty()); +} + +template <typename DerivedCCG, typename FuncTy, typename CallTy> +static void checkNode(const ContextNode<DerivedCCG, FuncTy, CallTy> *Node, + bool CheckEdges = true) { + if (Node->isRemoved()) + return; +#ifndef NDEBUG + // Compute node's context ids once for use in asserts. + auto NodeContextIds = Node->getContextIds(); +#endif + // Node's context ids should be the union of both its callee and caller edge + // context ids. 
+ if (Node->CallerEdges.size()) { + DenseSet<uint32_t> CallerEdgeContextIds( + Node->CallerEdges.front()->ContextIds); + for (const auto &Edge : llvm::drop_begin(Node->CallerEdges)) { + if (CheckEdges) + checkEdge<DerivedCCG, FuncTy, CallTy>(Edge); + set_union(CallerEdgeContextIds, Edge->ContextIds); + } + // Node can have more context ids than callers if some contexts terminate at + // node and some are longer. + assert(NodeContextIds == CallerEdgeContextIds || + set_is_subset(CallerEdgeContextIds, NodeContextIds)); + } + if (Node->CalleeEdges.size()) { + DenseSet<uint32_t> CalleeEdgeContextIds( + Node->CalleeEdges.front()->ContextIds); + for (const auto &Edge : llvm::drop_begin(Node->CalleeEdges)) { + if (CheckEdges) + checkEdge<DerivedCCG, FuncTy, CallTy>(Edge); + set_union(CalleeEdgeContextIds, Edge->getContextIds()); + } + assert(NodeContextIds == CalleeEdgeContextIds); + } +} + +template <typename DerivedCCG, typename FuncTy, typename CallTy> void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>:: assignStackNodesPostOrder(ContextNode *Node, DenseSet<const ContextNode *> &Visited, @@ -1178,7 +1274,7 @@ void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>:: // duplicated context ids. We have to recompute as we might have overlap // overlap between the saved context ids for different last nodes, and // removed them already during the post order traversal. 
- set_intersect(SavedContextIds, FirstNode->ContextIds); + set_intersect(SavedContextIds, FirstNode->getContextIds()); ContextNode *PrevNode = nullptr; for (auto Id : Ids) { ContextNode *CurNode = getNodeForStackId(Id); @@ -1211,18 +1307,17 @@ void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>:: ContextNode *NewNode = NodeOwner.back().get(); NodeToCallingFunc[NewNode] = Func; NonAllocationCallToContextNodeMap[Call] = NewNode; - NewNode->ContextIds = SavedContextIds; - NewNode->AllocTypes = computeAllocType(NewNode->ContextIds); + NewNode->AllocTypes = computeAllocType(SavedContextIds); // Connect to callees of innermost stack frame in inlined call chain. // This updates context ids for FirstNode's callee's to reflect those // moved to NewNode. - connectNewNode(NewNode, FirstNode, /*TowardsCallee=*/true); + connectNewNode(NewNode, FirstNode, /*TowardsCallee=*/true, SavedContextIds); // Connect to callers of outermost stack frame in inlined call chain. // This updates context ids for FirstNode's caller's to reflect those // moved to NewNode. - connectNewNode(NewNode, LastNode, /*TowardsCallee=*/false); + connectNewNode(NewNode, LastNode, /*TowardsCallee=*/false, SavedContextIds); // Now we need to remove context ids from edges/nodes between First and // Last Node. @@ -1234,18 +1329,32 @@ void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>:: // Remove the context ids moved to NewNode from CurNode, and the // edge from the prior node. - set_subtract(CurNode->ContextIds, NewNode->ContextIds); if (PrevNode) { auto *PrevEdge = CurNode->findEdgeFromCallee(PrevNode); assert(PrevEdge); - set_subtract(PrevEdge->getContextIds(), NewNode->ContextIds); + set_subtract(PrevEdge->getContextIds(), SavedContextIds); if (PrevEdge->getContextIds().empty()) { PrevNode->eraseCallerEdge(PrevEdge); CurNode->eraseCalleeEdge(PrevEdge); } } + // Since we update the edges from leaf to tail, only look at the callee + // edges. 
This isn't an alloc node, so if there are no callee edges, the + // alloc type is None. + CurNode->AllocTypes = CurNode->CalleeEdges.empty() + ? (uint8_t)AllocationType::None + : CurNode->computeAllocType(); PrevNode = CurNode; } + if (VerifyNodes) { + checkNode<DerivedCCG, FuncTy, CallTy>(NewNode, /*CheckEdges=*/true); + for (auto Id : Ids) { + ContextNode *CurNode = getNodeForStackId(Id); + // We should only have kept stack ids that had nodes. + assert(CurNode); + checkNode<DerivedCCG, FuncTy, CallTy>(CurNode, /*CheckEdges=*/true); + } + } } } @@ -1319,7 +1428,7 @@ void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::updateStackNodes() { // Initialize the context ids with the last node's. We will subsequently // refine the context ids by computing the intersection along all edges. - DenseSet<uint32_t> LastNodeContextIds = LastNode->ContextIds; + DenseSet<uint32_t> LastNodeContextIds = LastNode->getContextIds(); assert(!LastNodeContextIds.empty()); for (unsigned I = 0; I < Calls.size(); I++) { @@ -1442,6 +1551,8 @@ void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::updateStackNodes() { DenseSet<const ContextNode *> Visited; for (auto &Entry : AllocationCallToContextNodeMap) assignStackNodesPostOrder(Entry.second, Visited, StackIdToMatchingCalls); + if (VerifyCCG) + check(); } uint64_t ModuleCallsiteContextGraph::getLastStackId(Instruction *Call) { @@ -1786,8 +1897,6 @@ bool CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::calleesMatch( // First check if we have already synthesized a node for this tail call. 
if (TailCallToContextNodeMap.count(NewCall)) { NewNode = TailCallToContextNodeMap[NewCall]; - NewNode->ContextIds.insert(Edge->ContextIds.begin(), - Edge->ContextIds.end()); NewNode->AllocTypes |= Edge->AllocTypes; } else { FuncToCallsWithMetadata[Func].push_back({NewCall}); @@ -1797,7 +1906,6 @@ bool CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::calleesMatch( NewNode = NodeOwner.back().get(); NodeToCallingFunc[NewNode] = Func; TailCallToContextNodeMap[NewCall] = NewNode; - NewNode->ContextIds = Edge->ContextIds; NewNode->AllocTypes = Edge->AllocTypes; } @@ -2091,6 +2199,8 @@ void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode::print( OS << "\n"; OS << "\tAllocTypes: " << getAllocTypeString(AllocTypes) << "\n"; OS << "\tContextIds:"; + // Make a copy of the computed context ids that we can sort for stability. + auto ContextIds = getContextIds(); std::vector<uint32_t> SortedIds(ContextIds.begin(), ContextIds.end()); std::sort(SortedIds.begin(), SortedIds.end()); for (auto Id : SortedIds) @@ -2151,53 +2261,6 @@ void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::print( } template <typename DerivedCCG, typename FuncTy, typename CallTy> -static void checkEdge( - const std::shared_ptr<ContextEdge<DerivedCCG, FuncTy, CallTy>> &Edge) { - // Confirm that alloc type is not None and that we have at least one context - // id. - assert(Edge->AllocTypes != (uint8_t)AllocationType::None); - assert(!Edge->ContextIds.empty()); -} - -template <typename DerivedCCG, typename FuncTy, typename CallTy> -static void checkNode(const ContextNode<DerivedCCG, FuncTy, CallTy> *Node, - bool CheckEdges = true) { - if (Node->isRemoved()) - return; - // Node's context ids should be the union of both its callee and caller edge - // context ids. 
- if (Node->CallerEdges.size()) { - auto EI = Node->CallerEdges.begin(); - auto &FirstEdge = *EI; - EI++; - DenseSet<uint32_t> CallerEdgeContextIds(FirstEdge->ContextIds); - for (; EI != Node->CallerEdges.end(); EI++) { - const auto &Edge = *EI; - if (CheckEdges) - checkEdge<DerivedCCG, FuncTy, CallTy>(Edge); - set_union(CallerEdgeContextIds, Edge->ContextIds); - } - // Node can have more context ids than callers if some contexts terminate at - // node and some are longer. - assert(Node->ContextIds == CallerEdgeContextIds || - set_is_subset(CallerEdgeContextIds, Node->ContextIds)); - } - if (Node->CalleeEdges.size()) { - auto EI = Node->CalleeEdges.begin(); - auto &FirstEdge = *EI; - EI++; - DenseSet<uint32_t> CalleeEdgeContextIds(FirstEdge->ContextIds); - for (; EI != Node->CalleeEdges.end(); EI++) { - const auto &Edge = *EI; - if (CheckEdges) - checkEdge<DerivedCCG, FuncTy, CallTy>(Edge); - set_union(CalleeEdgeContextIds, Edge->ContextIds); - } - assert(Node->ContextIds == CalleeEdgeContextIds); - } -} - -template <typename DerivedCCG, typename FuncTy, typename CallTy> void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::check() const { using GraphType = const CallsiteContextGraph<DerivedCCG, FuncTy, CallTy> *; for (const auto Node : nodes<GraphType>(this)) { @@ -2284,7 +2347,7 @@ struct DOTGraphTraits<const CallsiteContextGraph<DerivedCCG, FuncTy, CallTy> *> static std::string getNodeAttributes(NodeRef Node, GraphType) { std::string AttributeString = (Twine("tooltip=\"") + getNodeId(Node) + " " + - getContextIds(Node->ContextIds) + "\"") + getContextIds(Node->getContextIds()) + "\"") .str(); AttributeString += (Twine(",fillcolor=\"") + getColor(Node->AllocTypes) + "\"").str(); @@ -2443,16 +2506,6 @@ void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>:: set_subtract(Edge->ContextIds, ContextIdsToMove); Edge->AllocTypes = computeAllocType(Edge->ContextIds); } - // Now perform some updates that are common to all cases: the NewCallee gets - // the moved ids 
added, and we need to remove those ids from OldCallee and - // update its alloc type (NewCallee alloc type updates handled above). - NewCallee->ContextIds.insert(ContextIdsToMove.begin(), - ContextIdsToMove.end()); - set_subtract(OldCallee->ContextIds, ContextIdsToMove); - OldCallee->AllocTypes = computeAllocType(OldCallee->ContextIds); - // OldCallee alloc type should be None iff its context id set is now empty. - assert((OldCallee->AllocTypes == (uint8_t)AllocationType::None) == - OldCallee->ContextIds.empty()); // Now walk the old callee node's callee edges and move Edge's context ids // over to the corresponding edge into the clone (which is created here if // this is a newly created clone). @@ -2484,6 +2537,12 @@ void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>:: NewCallee->CalleeEdges.push_back(NewEdge); NewEdge->Callee->CallerEdges.push_back(NewEdge); } + // Recompute the node alloc type now that its callee edges have been + // updated (since we will compute from those edges). + OldCallee->AllocTypes = OldCallee->computeAllocType(); + // OldCallee alloc type should be None iff its context id set is now empty. + assert((OldCallee->AllocTypes == (uint8_t)AllocationType::None) == + OldCallee->emptyContextIds()); if (VerifyCCG) { checkNode<DerivedCCG, FuncTy, CallTy>(OldCallee, /*CheckEdges=*/false); checkNode<DerivedCCG, FuncTy, CallTy>(NewCallee, /*CheckEdges=*/false); @@ -2528,7 +2587,7 @@ void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::identifyClones() { DenseSet<const ContextNode *> Visited; for (auto &Entry : AllocationCallToContextNodeMap) { Visited.clear(); - identifyClones(Entry.second, Visited, Entry.second->ContextIds); + identifyClones(Entry.second, Visited, Entry.second->getContextIds()); } Visited.clear(); for (auto &Entry : AllocationCallToContextNodeMap) @@ -2714,7 +2773,7 @@ void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::identifyClones( } // We should still have some context ids on the original Node. 
- assert(!Node->ContextIds.empty()); + assert(!Node->emptyContextIds()); // Sanity check that no alloc types on node or edges are None. assert(Node->AllocTypes != (uint8_t)AllocationType::None); @@ -2918,7 +2977,7 @@ bool CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::assignFunctions() { // find additional cloning is required. std::deque<ContextNode *> ClonesWorklist; // Ignore original Node if we moved all of its contexts to clones. - if (!Node->ContextIds.empty()) + if (!Node->emptyContextIds()) ClonesWorklist.push_back(Node); ClonesWorklist.insert(ClonesWorklist.end(), Node->Clones.begin(), Node->Clones.end()); @@ -3258,7 +3317,7 @@ bool CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::assignFunctions() { // Skip if either no call to update, or if we ended up with no context ids // (we moved all edges onto other clones). - if (!Node->hasCall() || Node->ContextIds.empty()) + if (!Node->hasCall() || Node->emptyContextIds()) return; if (Node->IsAllocation) { diff --git a/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp index 89193f8..38c1c26 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp @@ -4745,6 +4745,29 @@ static Instruction *foldICmpAndXX(ICmpInst &I, const SimplifyQuery &Q, Constant::getNullValue(Op1->getType())); } + if (!ICmpInst::isSigned(Pred)) + return nullptr; + + KnownBits KnownY = IC.computeKnownBits(A, /*Depth=*/0, &I); + // (X & NegY) spred X --> (X & NegY) upred X + if (KnownY.isNegative()) + return new ICmpInst(ICmpInst::getUnsignedPredicate(Pred), Op0, Op1); + + if (Pred != ICmpInst::ICMP_SLE && Pred != ICmpInst::ICMP_SGT) + return nullptr; + + if (KnownY.isNonNegative()) + // (X & PosY) s<= X --> X s>= 0 + // (X & PosY) s> X --> X s< 0 + return new ICmpInst(ICmpInst::getSwappedPredicate(Pred), Op1, + Constant::getNullValue(Op1->getType())); + + if (isKnownNegative(Op1, 
IC.getSimplifyQuery().getWithInstruction(&I))) + // (NegX & Y) s<= NegX --> Y s< 0 + // (NegX & Y) s> NegX --> Y s>= 0 + return new ICmpInst(ICmpInst::getFlippedStrictnessPredicate(Pred), A, + Constant::getNullValue(A->getType())); + return nullptr; } @@ -4772,7 +4795,7 @@ static Instruction *foldICmpOrXX(ICmpInst &I, const SimplifyQuery &Q, if (ICmpInst::isEquality(Pred) && Op0->hasOneUse()) { // icmp (X | Y) eq/ne Y --> (X & ~Y) eq/ne 0 if Y is freely invertible if (Value *NotOp1 = - IC.getFreelyInverted(Op1, Op1->hasOneUse(), &IC.Builder)) + IC.getFreelyInverted(Op1, !Op1->hasNUsesOrMore(3), &IC.Builder)) return new ICmpInst(Pred, IC.Builder.CreateAnd(A, NotOp1), Constant::getNullValue(Op1->getType())); // icmp (X | Y) eq/ne Y --> (~X | Y) eq/ne -1 if X is freely invertible. diff --git a/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp b/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp index 313beb7..d2aaa5e 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp @@ -1294,8 +1294,7 @@ Instruction *InstCombinerImpl::foldSelectValueEquivalence(SelectInst &Sel, // X == Y ? X : Z with X == Y ? Y : Z, as that would lead to an infinite // replacement cycle. 
Value *CmpLHS = Cmp.getOperand(0), *CmpRHS = Cmp.getOperand(1); - if (TrueVal != CmpLHS && - isGuaranteedNotToBeUndefOrPoison(CmpRHS, SQ.AC, &Sel, &DT)) { + if (TrueVal != CmpLHS && isGuaranteedNotToBeUndef(CmpRHS, SQ.AC, &Sel, &DT)) { if (Value *V = simplifyWithOpReplaced(TrueVal, CmpLHS, CmpRHS, SQ, /* AllowRefinement */ true)) // Require either the replacement or the simplification result to be a @@ -1316,8 +1315,7 @@ Instruction *InstCombinerImpl::foldSelectValueEquivalence(SelectInst &Sel, if (replaceInInstruction(TrueVal, CmpLHS, CmpRHS)) return &Sel; } - if (TrueVal != CmpRHS && - isGuaranteedNotToBeUndefOrPoison(CmpLHS, SQ.AC, &Sel, &DT)) + if (TrueVal != CmpRHS && isGuaranteedNotToBeUndef(CmpLHS, SQ.AC, &Sel, &DT)) if (Value *V = simplifyWithOpReplaced(TrueVal, CmpRHS, CmpLHS, SQ, /* AllowRefinement */ true)) if (isa<Constant>(CmpLHS) || isa<Constant>(V)) diff --git a/llvm/lib/Transforms/Instrumentation/HWAddressSanitizer.cpp b/llvm/lib/Transforms/Instrumentation/HWAddressSanitizer.cpp index 2aa2175..a0e63bf1 100644 --- a/llvm/lib/Transforms/Instrumentation/HWAddressSanitizer.cpp +++ b/llvm/lib/Transforms/Instrumentation/HWAddressSanitizer.cpp @@ -337,13 +337,17 @@ private: unsigned AccessSizeIndex, Instruction *InsertBefore, DomTreeUpdater &DTU, LoopInfo *LI); - bool ignoreMemIntrinsic(MemIntrinsic *MI); + bool ignoreMemIntrinsic(OptimizationRemarkEmitter &ORE, MemIntrinsic *MI); void instrumentMemIntrinsic(MemIntrinsic *MI); bool instrumentMemAccess(InterestingMemoryOperand &O, DomTreeUpdater &DTU, LoopInfo *LI); - bool ignoreAccess(Instruction *Inst, Value *Ptr); + bool ignoreAccessWithoutRemark(Instruction *Inst, Value *Ptr); + bool ignoreAccess(OptimizationRemarkEmitter &ORE, Instruction *Inst, + Value *Ptr); + void getInterestingMemoryOperands( - Instruction *I, const TargetLibraryInfo &TLI, + OptimizationRemarkEmitter &ORE, Instruction *I, + const TargetLibraryInfo &TLI, SmallVectorImpl<InterestingMemoryOperand> &Interesting); void 
tagAlloca(IRBuilder<> &IRB, AllocaInst *AI, Value *Tag, size_t Size); @@ -765,7 +769,8 @@ Value *HWAddressSanitizer::getShadowNonTls(IRBuilder<> &IRB) { return IRB.CreateLoad(PtrTy, GlobalDynamicAddress); } -bool HWAddressSanitizer::ignoreAccess(Instruction *Inst, Value *Ptr) { +bool HWAddressSanitizer::ignoreAccessWithoutRemark(Instruction *Inst, + Value *Ptr) { // Do not instrument accesses from different address spaces; we cannot deal // with them. Type *PtrTy = cast<PointerType>(Ptr->getType()->getScalarType()); @@ -795,8 +800,23 @@ bool HWAddressSanitizer::ignoreAccess(Instruction *Inst, Value *Ptr) { return false; } +bool HWAddressSanitizer::ignoreAccess(OptimizationRemarkEmitter &ORE, + Instruction *Inst, Value *Ptr) { + bool Ignored = ignoreAccessWithoutRemark(Inst, Ptr); + if (Ignored) { + ORE.emit( + [&]() { return OptimizationRemark(DEBUG_TYPE, "ignoreAccess", Inst); }); + } else { + ORE.emit([&]() { + return OptimizationRemarkMissed(DEBUG_TYPE, "ignoreAccess", Inst); + }); + } + return Ignored; +} + void HWAddressSanitizer::getInterestingMemoryOperands( - Instruction *I, const TargetLibraryInfo &TLI, + OptimizationRemarkEmitter &ORE, Instruction *I, + const TargetLibraryInfo &TLI, SmallVectorImpl<InterestingMemoryOperand> &Interesting) { // Skip memory accesses inserted by another instrumentation. 
if (I->hasMetadata(LLVMContext::MD_nosanitize)) @@ -807,22 +827,22 @@ void HWAddressSanitizer::getInterestingMemoryOperands( return; if (LoadInst *LI = dyn_cast<LoadInst>(I)) { - if (!ClInstrumentReads || ignoreAccess(I, LI->getPointerOperand())) + if (!ClInstrumentReads || ignoreAccess(ORE, I, LI->getPointerOperand())) return; Interesting.emplace_back(I, LI->getPointerOperandIndex(), false, LI->getType(), LI->getAlign()); } else if (StoreInst *SI = dyn_cast<StoreInst>(I)) { - if (!ClInstrumentWrites || ignoreAccess(I, SI->getPointerOperand())) + if (!ClInstrumentWrites || ignoreAccess(ORE, I, SI->getPointerOperand())) return; Interesting.emplace_back(I, SI->getPointerOperandIndex(), true, SI->getValueOperand()->getType(), SI->getAlign()); } else if (AtomicRMWInst *RMW = dyn_cast<AtomicRMWInst>(I)) { - if (!ClInstrumentAtomics || ignoreAccess(I, RMW->getPointerOperand())) + if (!ClInstrumentAtomics || ignoreAccess(ORE, I, RMW->getPointerOperand())) return; Interesting.emplace_back(I, RMW->getPointerOperandIndex(), true, RMW->getValOperand()->getType(), std::nullopt); } else if (AtomicCmpXchgInst *XCHG = dyn_cast<AtomicCmpXchgInst>(I)) { - if (!ClInstrumentAtomics || ignoreAccess(I, XCHG->getPointerOperand())) + if (!ClInstrumentAtomics || ignoreAccess(ORE, I, XCHG->getPointerOperand())) return; Interesting.emplace_back(I, XCHG->getPointerOperandIndex(), true, XCHG->getCompareOperand()->getType(), @@ -830,7 +850,7 @@ void HWAddressSanitizer::getInterestingMemoryOperands( } else if (auto *CI = dyn_cast<CallInst>(I)) { for (unsigned ArgNo = 0; ArgNo < CI->arg_size(); ArgNo++) { if (!ClInstrumentByval || !CI->isByValArgument(ArgNo) || - ignoreAccess(I, CI->getArgOperand(ArgNo))) + ignoreAccess(ORE, I, CI->getArgOperand(ArgNo))) continue; Type *Ty = CI->getParamByValType(ArgNo); Interesting.emplace_back(I, ArgNo, false, Ty, Align(1)); @@ -1035,13 +1055,14 @@ void HWAddressSanitizer::instrumentMemAccessInline(Value *Ptr, bool IsWrite, ->setSuccessor(0, 
TCI.TagMismatchTerm->getParent()); } -bool HWAddressSanitizer::ignoreMemIntrinsic(MemIntrinsic *MI) { +bool HWAddressSanitizer::ignoreMemIntrinsic(OptimizationRemarkEmitter &ORE, + MemIntrinsic *MI) { if (MemTransferInst *MTI = dyn_cast<MemTransferInst>(MI)) { - return (!ClInstrumentWrites || ignoreAccess(MTI, MTI->getDest())) && - (!ClInstrumentReads || ignoreAccess(MTI, MTI->getSource())); + return (!ClInstrumentWrites || ignoreAccess(ORE, MTI, MTI->getDest())) && + (!ClInstrumentReads || ignoreAccess(ORE, MTI, MTI->getSource())); } if (isa<MemSetInst>(MI)) - return !ClInstrumentWrites || ignoreAccess(MI, MI->getDest()); + return !ClInstrumentWrites || ignoreAccess(ORE, MI, MI->getDest()); return false; } @@ -1541,6 +1562,9 @@ void HWAddressSanitizer::sanitizeFunction(Function &F, NumTotalFuncs++; + OptimizationRemarkEmitter &ORE = + FAM.getResult<OptimizationRemarkEmitterAnalysis>(F); + if (selectiveInstrumentationShouldSkip(F, FAM)) return; @@ -1562,10 +1586,10 @@ void HWAddressSanitizer::sanitizeFunction(Function &F, if (InstrumentLandingPads && isa<LandingPadInst>(Inst)) LandingPadVec.push_back(&Inst); - getInterestingMemoryOperands(&Inst, TLI, OperandsToInstrument); + getInterestingMemoryOperands(ORE, &Inst, TLI, OperandsToInstrument); if (MemIntrinsic *MI = dyn_cast<MemIntrinsic>(&Inst)) - if (!ignoreMemIntrinsic(MI)) + if (!ignoreMemIntrinsic(ORE, MI)) IntrinToInstrument.push_back(MI); } diff --git a/llvm/lib/Transforms/Scalar/DFAJumpThreading.cpp b/llvm/lib/Transforms/Scalar/DFAJumpThreading.cpp index ba2546b..4371b82 100644 --- a/llvm/lib/Transforms/Scalar/DFAJumpThreading.cpp +++ b/llvm/lib/Transforms/Scalar/DFAJumpThreading.cpp @@ -827,7 +827,8 @@ private: return false; } - if (Metrics.convergent) { + // FIXME: Allow jump threading with controlled convergence. 
+ if (Metrics.Convergence != ConvergenceKind::None) { LLVM_DEBUG(dbgs() << "DFA Jump Threading: Not jump threading, contains " << "convergent instructions.\n"); ORE->emit([&]() { diff --git a/llvm/lib/Transforms/Scalar/LoopUnrollAndJamPass.cpp b/llvm/lib/Transforms/Scalar/LoopUnrollAndJamPass.cpp index 7b4c543..f8e2f1f 100644 --- a/llvm/lib/Transforms/Scalar/LoopUnrollAndJamPass.cpp +++ b/llvm/lib/Transforms/Scalar/LoopUnrollAndJamPass.cpp @@ -327,8 +327,7 @@ tryToUnrollAndJamLoop(Loop *L, DominatorTree &DT, LoopInfo *LI, UnrollCostEstimator OuterUCE(L, TTI, EphValues, UP.BEInsns); if (!InnerUCE.canUnroll() || !OuterUCE.canUnroll()) { - LLVM_DEBUG(dbgs() << " Not unrolling loop which contains instructions" - << " which cannot be duplicated or have invalid cost.\n"); + LLVM_DEBUG(dbgs() << " Loop not considered unrollable\n"); return LoopUnrollResult::Unmodified; } @@ -341,7 +340,10 @@ tryToUnrollAndJamLoop(Loop *L, DominatorTree &DT, LoopInfo *LI, LLVM_DEBUG(dbgs() << " Not unrolling loop with inlinable calls.\n"); return LoopUnrollResult::Unmodified; } - if (InnerUCE.Convergent || OuterUCE.Convergent) { + // FIXME: The call to canUnroll() allows some controlled convergent + // operations, but we block them here for future changes. 
+ if (InnerUCE.Convergence != ConvergenceKind::None || + OuterUCE.Convergence != ConvergenceKind::None) { LLVM_DEBUG( dbgs() << " Not unrolling loop with convergent instructions.\n"); return LoopUnrollResult::Unmodified; diff --git a/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp b/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp index 10fc9e9..cbc35b6 100644 --- a/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp +++ b/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp @@ -684,11 +684,15 @@ UnrollCostEstimator::UnrollCostEstimator( const SmallPtrSetImpl<const Value *> &EphValues, unsigned BEInsns) { CodeMetrics Metrics; for (BasicBlock *BB : L->blocks()) - Metrics.analyzeBasicBlock(BB, TTI, EphValues); + Metrics.analyzeBasicBlock(BB, TTI, EphValues, /* PrepareForLTO= */ false, + L); NumInlineCandidates = Metrics.NumInlineCandidates; NotDuplicatable = Metrics.notDuplicatable; - Convergent = Metrics.convergent; + Convergence = Metrics.Convergence; LoopSize = Metrics.NumInsts; + ConvergenceAllowsRuntime = + Metrics.Convergence != ConvergenceKind::Uncontrolled && + !getLoopConvergenceHeart(L); // Don't allow an estimate of size zero. 
This would allows unrolling of loops // with huge iteration counts, which is a compile time problem even if it's @@ -701,6 +705,25 @@ UnrollCostEstimator::UnrollCostEstimator( LoopSize = BEInsns + 1; } +bool UnrollCostEstimator::canUnroll() const { + switch (Convergence) { + case ConvergenceKind::ExtendedLoop: + LLVM_DEBUG(dbgs() << " Convergence prevents unrolling.\n"); + return false; + default: + break; + } + if (!LoopSize.isValid()) { + LLVM_DEBUG(dbgs() << " Invalid loop size prevents unrolling.\n"); + return false; + } + if (NotDuplicatable) { + LLVM_DEBUG(dbgs() << " Non-duplicatable blocks prevent unrolling.\n"); + return false; + } + return true; +} + uint64_t UnrollCostEstimator::getUnrolledLoopSize( const TargetTransformInfo::UnrollingPreferences &UP, unsigned CountOverwrite) const { @@ -1206,8 +1229,7 @@ tryToUnrollLoop(Loop *L, DominatorTree &DT, LoopInfo *LI, ScalarEvolution &SE, UnrollCostEstimator UCE(L, TTI, EphValues, UP.BEInsns); if (!UCE.canUnroll()) { - LLVM_DEBUG(dbgs() << " Not unrolling loop which contains instructions" - << " which cannot be duplicated or have invalid cost.\n"); + LLVM_DEBUG(dbgs() << " Loop not considered unrollable.\n"); return LoopUnrollResult::Unmodified; } @@ -1254,15 +1276,9 @@ tryToUnrollLoop(Loop *L, DominatorTree &DT, LoopInfo *LI, ScalarEvolution &SE, // is unsafe -- it adds a control-flow dependency to the convergent // operation. Therefore restrict remainder loop (try unrolling without). // - // TODO: This is quite conservative. In practice, convergent_op() - // is likely to be called unconditionally in the loop. In this - // case, the program would be ill-formed (on most architectures) - // unless n were the same on all threads in a thread group. - // Assuming n is the same on all threads, any kind of unrolling is - // safe. But currently llvm's notion of convergence isn't powerful - // enough to express this. 
- if (UCE.Convergent) - UP.AllowRemainder = false; + // TODO: This is somewhat conservative; we could allow the remainder if the + // trip count is uniform. + UP.AllowRemainder &= UCE.ConvergenceAllowsRuntime; // Try to find the trip count upper bound if we cannot find the exact trip // count. @@ -1282,6 +1298,8 @@ tryToUnrollLoop(Loop *L, DominatorTree &DT, LoopInfo *LI, ScalarEvolution &SE, if (!UP.Count) return LoopUnrollResult::Unmodified; + UP.Runtime &= UCE.ConvergenceAllowsRuntime; + if (PP.PeelCount) { assert(UP.Count == 1 && "Cannot perform peel and unroll in the same step"); LLVM_DEBUG(dbgs() << "PEELING loop %" << L->getHeader()->getName() @@ -1324,11 +1342,16 @@ tryToUnrollLoop(Loop *L, DominatorTree &DT, LoopInfo *LI, ScalarEvolution &SE, // Unroll the loop. Loop *RemainderLoop = nullptr; + UnrollLoopOptions ULO; + ULO.Count = UP.Count; + ULO.Force = UP.Force; + ULO.AllowExpensiveTripCount = UP.AllowExpensiveTripCount; + ULO.UnrollRemainder = UP.UnrollRemainder; + ULO.Runtime = UP.Runtime; + ULO.ForgetAllSCEV = ForgetAllSCEV; + ULO.Heart = getLoopConvergenceHeart(L); LoopUnrollResult UnrollResult = UnrollLoop( - L, - {UP.Count, UP.Force, UP.Runtime, UP.AllowExpensiveTripCount, - UP.UnrollRemainder, ForgetAllSCEV}, - LI, &SE, &DT, &AC, &TTI, &ORE, PreserveLCSSA, &RemainderLoop, AA); + L, ULO, LI, &SE, &DT, &AC, &TTI, &ORE, PreserveLCSSA, &RemainderLoop, AA); if (UnrollResult == LoopUnrollResult::Unmodified) return LoopUnrollResult::Unmodified; diff --git a/llvm/lib/Transforms/Utils/InlineFunction.cpp b/llvm/lib/Transforms/Utils/InlineFunction.cpp index eb471b2..cfe6349 100644 --- a/llvm/lib/Transforms/Utils/InlineFunction.cpp +++ b/llvm/lib/Transforms/Utils/InlineFunction.cpp @@ -1221,7 +1221,6 @@ static void AddAliasScopeMetadata(CallBase &CB, ValueToValueMapTy &VMap, SmallPtrSet<const Value *, 4> ObjSet; SmallVector<Metadata *, 4> Scopes, NoAliases; - SmallSetVector<const Argument *, 4> NAPtrArgs; for (const Value *V : PtrArgs) { SmallVector<const 
Value *, 4> Objects; getUnderlyingObjects(V, Objects, /* LI = */ nullptr); diff --git a/llvm/lib/Transforms/Utils/LoopRotationUtils.cpp b/llvm/lib/Transforms/Utils/LoopRotationUtils.cpp index 08ba65d..3d950b1 100644 --- a/llvm/lib/Transforms/Utils/LoopRotationUtils.cpp +++ b/llvm/lib/Transforms/Utils/LoopRotationUtils.cpp @@ -460,7 +460,7 @@ bool LoopRotate::rotateLoop(Loop *L, bool SimplifiedLatch) { L->dump()); return Rotated; } - if (Metrics.convergent) { + if (Metrics.Convergence != ConvergenceKind::None) { LLVM_DEBUG(dbgs() << "LoopRotation: NOT rotating - contains convergent " "instructions: "; L->dump()); diff --git a/llvm/lib/Transforms/Utils/LoopUnroll.cpp b/llvm/lib/Transforms/Utils/LoopUnroll.cpp index 1216538..90d7b99 100644 --- a/llvm/lib/Transforms/Utils/LoopUnroll.cpp +++ b/llvm/lib/Transforms/Utils/LoopUnroll.cpp @@ -419,6 +419,26 @@ void llvm::simplifyLoopAfterUnroll(Loop *L, bool SimplifyIVs, LoopInfo *LI, } } +// Loops containing convergent instructions that are uncontrolled or controlled +// from outside the loop must have a count that divides their TripMultiple. +LLVM_ATTRIBUTE_USED +static bool canHaveUnrollRemainder(const Loop *L) { + if (getLoopConvergenceHeart(L)) + return false; + + // Check for uncontrolled convergent operations. + for (auto &BB : L->blocks()) { + for (auto &I : *BB) { + if (isa<ConvergenceControlInst>(I)) + return true; + if (auto *CB = dyn_cast<CallBase>(&I)) + if (CB->isConvergent()) + return CB->getConvergenceControlToken(); + } + } + return true; +} + /// Unroll the given loop by Count. The loop must be in LCSSA form. Unrolling /// can only fail when the loop's latch block is not terminated by a conditional /// branch instruction. 
However, if the trip count (and multiple) are not known, @@ -564,19 +584,8 @@ llvm::UnrollLoop(Loop *L, UnrollLoopOptions ULO, LoopInfo *LI, return LoopUnrollResult::Unmodified; } - // Loops containing convergent instructions cannot use runtime unrolling, - // as the prologue/epilogue may add additional control-dependencies to - // convergent operations. - LLVM_DEBUG( - { - bool HasConvergent = false; - for (auto &BB : L->blocks()) - for (auto &I : *BB) - if (auto *CB = dyn_cast<CallBase>(&I)) - HasConvergent |= CB->isConvergent(); - assert((!HasConvergent || !ULO.Runtime) && - "Can't runtime unroll if loop contains a convergent operation."); - }); + assert((!ULO.Runtime || canHaveUnrollRemainder(L)) && + "Can't runtime unroll if loop contains a convergent operation."); bool EpilogProfitability = UnrollRuntimeEpilog.getNumOccurrences() ? UnrollRuntimeEpilog @@ -722,7 +731,7 @@ llvm::UnrollLoop(Loop *L, UnrollLoopOptions ULO, LoopInfo *LI, if (OldLoop) LoopsToSimplify.insert(NewLoops[OldLoop]); - if (*BB == Header) + if (*BB == Header) { // Loop over all of the PHI nodes in the block, changing them to use // the incoming values from the previous block. for (PHINode *OrigPHI : OrigPHINode) { @@ -735,6 +744,16 @@ llvm::UnrollLoop(Loop *L, UnrollLoopOptions ULO, LoopInfo *LI, NewPHI->eraseFromParent(); } + // Eliminate copies of the loop heart intrinsic, if any. 
+ if (ULO.Heart) { + auto it = VMap.find(ULO.Heart); + assert(it != VMap.end()); + Instruction *heartCopy = cast<Instruction>(it->second); + heartCopy->eraseFromParent(); + VMap.erase(it); + } + } + // Update our running map of newest clones LastValueMap[*BB] = New; for (ValueToValueMapTy::iterator VI = VMap.begin(), VE = VMap.end(); diff --git a/llvm/lib/Transforms/Utils/LoopUnrollRuntime.cpp b/llvm/lib/Transforms/Utils/LoopUnrollRuntime.cpp index e1af028..dd7150b 100644 --- a/llvm/lib/Transforms/Utils/LoopUnrollRuntime.cpp +++ b/llvm/lib/Transforms/Utils/LoopUnrollRuntime.cpp @@ -1016,12 +1016,17 @@ bool llvm::UnrollRuntimeLoopRemainder( auto UnrollResult = LoopUnrollResult::Unmodified; if (remainderLoop && UnrollRemainder) { LLVM_DEBUG(dbgs() << "Unrolling remainder loop\n"); - UnrollResult = - UnrollLoop(remainderLoop, - {/*Count*/ Count - 1, /*Force*/ false, /*Runtime*/ false, - /*AllowExpensiveTripCount*/ false, - /*UnrollRemainder*/ false, ForgetAllSCEV}, - LI, SE, DT, AC, TTI, /*ORE*/ nullptr, PreserveLCSSA); + UnrollLoopOptions ULO; + ULO.Count = Count - 1; + ULO.Force = false; + ULO.Runtime = false; + ULO.AllowExpensiveTripCount = false; + ULO.UnrollRemainder = false; + ULO.ForgetAllSCEV = ForgetAllSCEV; + assert(!getLoopConvergenceHeart(L) && + "A loop with a convergence heart does not allow runtime unrolling."); + UnrollResult = UnrollLoop(remainderLoop, ULO, LI, SE, DT, AC, TTI, + /*ORE*/ nullptr, PreserveLCSSA); } if (ResultLoop && UnrollResult != LoopUnrollResult::FullyUnrolled) diff --git a/llvm/lib/Transforms/Vectorize/VPlanPatternMatch.h b/llvm/lib/Transforms/Vectorize/VPlanPatternMatch.h index 0587468..d6b4acb 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanPatternMatch.h +++ b/llvm/lib/Transforms/Vectorize/VPlanPatternMatch.h @@ -274,6 +274,13 @@ m_Mul(const Op0_t &Op0, const Op1_t &Op1) { return m_Binary<Instruction::Mul, Op0_t, Op1_t>(Op0, Op1); } +template <typename Op0_t, typename Op1_t> +inline AllBinaryRecipe_match<Op0_t, Op1_t, 
Instruction::Mul, + /* Commutative =*/true> +m_c_Mul(const Op0_t &Op0, const Op1_t &Op1) { + return m_Binary<Instruction::Mul, Op0_t, Op1_t, true>(Op0, Op1); +} + /// Match a binary OR operation. Note that while conceptually the operands can /// be matched commutatively, \p Commutative defaults to false in line with the /// IR-based pattern matching infrastructure. Use m_c_BinaryOr for a commutative diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp index ab3b5cf..8ec67eb 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp @@ -1037,8 +1037,7 @@ static void simplifyRecipe(VPRecipeBase &R, VPTypeAnalysis &TypeInfo) { return; } - if (match(&R, m_CombineOr(m_Mul(m_VPValue(A), m_SpecificInt(1)), - m_Mul(m_SpecificInt(1), m_VPValue(A))))) + if (match(&R, m_c_Mul(m_VPValue(A), m_SpecificInt(1)))) return R.getVPSingleValue()->replaceAllUsesWith(A); } diff --git a/llvm/test/Analysis/ScalarEvolution/trip-count-unknown-stride.ll b/llvm/test/Analysis/ScalarEvolution/trip-count-unknown-stride.ll index eb55e6a..ecf1332 100644 --- a/llvm/test/Analysis/ScalarEvolution/trip-count-unknown-stride.ll +++ b/llvm/test/Analysis/ScalarEvolution/trip-count-unknown-stride.ll @@ -267,7 +267,228 @@ for.end: ; preds = %for.body, %entry ret void } +define void @ne_nsw_pos_step(ptr nocapture %A, i32 %n, i32 %s) mustprogress { +; +; CHECK-LABEL: 'ne_nsw_pos_step' +; CHECK-NEXT: Determining loop execution counts for: @ne_nsw_pos_step +; CHECK-NEXT: Loop %for.body: Unpredictable backedge-taken count. +; CHECK-NEXT: Loop %for.body: Unpredictable constant max backedge-taken count. +; CHECK-NEXT: Loop %for.body: Unpredictable symbolic max backedge-taken count. 
+; +entry: + %pos_step = icmp sgt i32 %s, 0 + call void @llvm.assume(i1 %pos_step) + %cmp4 = icmp sgt i32 %n, 0 + br i1 %cmp4, label %for.body, label %for.end + +for.body: ; preds = %entry, %for.body + %i.05 = phi i32 [ %add, %for.body ], [ 0, %entry ] + %arrayidx = getelementptr inbounds i32, ptr %A, i32 %i.05 + %0 = load i32, ptr %arrayidx, align 4 + %inc = add nsw i32 %0, 1 + store i32 %inc, ptr %arrayidx, align 4 + %add = add nsw i32 %i.05, %s + %cmp = icmp ne i32 %add, %n + br i1 %cmp, label %for.body, label %for.end + +for.end: ; preds = %for.body, %entry + ret void +} + +define void @ne_nsw_neg_step(ptr nocapture %A, i32 %n, i32 %s) mustprogress { +; +; CHECK-LABEL: 'ne_nsw_neg_step' +; CHECK-NEXT: Determining loop execution counts for: @ne_nsw_neg_step +; CHECK-NEXT: Loop %for.body: Unpredictable backedge-taken count. +; CHECK-NEXT: Loop %for.body: Unpredictable constant max backedge-taken count. +; CHECK-NEXT: Loop %for.body: Unpredictable symbolic max backedge-taken count. +; +entry: + %neg_step = icmp slt i32 %s, 0 + call void @llvm.assume(i1 %neg_step) + %cmp4 = icmp sgt i32 %n, 0 + br i1 %cmp4, label %for.body, label %for.end + +for.body: ; preds = %entry, %for.body + %i.05 = phi i32 [ %add, %for.body ], [ 0, %entry ] + %arrayidx = getelementptr inbounds i32, ptr %A, i32 %i.05 + %0 = load i32, ptr %arrayidx, align 4 + %inc = add nsw i32 %0, 1 + store i32 %inc, ptr %arrayidx, align 4 + %add = add nsw i32 %i.05, %s + %cmp = icmp ne i32 %add, %n + br i1 %cmp, label %for.body, label %for.end + +for.end: ; preds = %for.body, %entry + ret void +} + +define void @ne_nsw_nonneg_step(ptr nocapture %A, i32 %n, i32 %s) mustprogress { +; +; CHECK-LABEL: 'ne_nsw_nonneg_step' +; CHECK-NEXT: Determining loop execution counts for: @ne_nsw_nonneg_step +; CHECK-NEXT: Loop %for.body: Unpredictable backedge-taken count. +; CHECK-NEXT: Loop %for.body: Unpredictable constant max backedge-taken count. 
+; CHECK-NEXT: Loop %for.body: Unpredictable symbolic max backedge-taken count. +; +entry: + %nonneg_step = icmp sge i32 %s, 0 + call void @llvm.assume(i1 %nonneg_step) + %cmp4 = icmp sgt i32 %n, 0 + br i1 %cmp4, label %for.body, label %for.end + +for.body: ; preds = %entry, %for.body + %i.05 = phi i32 [ %add, %for.body ], [ 0, %entry ] + %arrayidx = getelementptr inbounds i32, ptr %A, i32 %i.05 + %0 = load i32, ptr %arrayidx, align 4 + %inc = add nsw i32 %0, 1 + store i32 %inc, ptr %arrayidx, align 4 + %add = add nsw i32 %i.05, %s + %cmp = icmp ne i32 %add, %n + br i1 %cmp, label %for.body, label %for.end + +for.end: ; preds = %for.body, %entry + ret void +} + +define void @ne_nsw_unknown_step(ptr nocapture %A, i32 %n, i32 %s) mustprogress { +; +; CHECK-LABEL: 'ne_nsw_unknown_step' +; CHECK-NEXT: Determining loop execution counts for: @ne_nsw_unknown_step +; CHECK-NEXT: Loop %for.body: Unpredictable backedge-taken count. +; CHECK-NEXT: Loop %for.body: Unpredictable constant max backedge-taken count. +; CHECK-NEXT: Loop %for.body: Unpredictable symbolic max backedge-taken count. +; +entry: + %cmp4 = icmp sgt i32 %n, 0 + br i1 %cmp4, label %for.body, label %for.end + +for.body: ; preds = %entry, %for.body + %i.05 = phi i32 [ %add, %for.body ], [ 0, %entry ] + %arrayidx = getelementptr inbounds i32, ptr %A, i32 %i.05 + %0 = load i32, ptr %arrayidx, align 4 + %inc = add nsw i32 %0, 1 + store i32 %inc, ptr %arrayidx, align 4 + %add = add nsw i32 %i.05, %s + %cmp = icmp ne i32 %add, %n + br i1 %cmp, label %for.body, label %for.end + +for.end: ; preds = %for.body, %entry + ret void +} + +define void @ne_nuw_pos_step(ptr nocapture %A, i32 %n, i32 %s) mustprogress { +; +; CHECK-LABEL: 'ne_nuw_pos_step' +; CHECK-NEXT: Determining loop execution counts for: @ne_nuw_pos_step +; CHECK-NEXT: Loop %for.body: Unpredictable backedge-taken count. +; CHECK-NEXT: Loop %for.body: Unpredictable constant max backedge-taken count. 
+; CHECK-NEXT: Loop %for.body: Unpredictable symbolic max backedge-taken count. +; +entry: + %pos_step = icmp sgt i32 %s, 0 + call void @llvm.assume(i1 %pos_step) + %cmp4 = icmp sgt i32 %n, 0 + br i1 %cmp4, label %for.body, label %for.end + +for.body: ; preds = %entry, %for.body + %i.05 = phi i32 [ %add, %for.body ], [ 0, %entry ] + %arrayidx = getelementptr inbounds i32, ptr %A, i32 %i.05 + %0 = load i32, ptr %arrayidx, align 4 + %inc = add nuw i32 %0, 1 + store i32 %inc, ptr %arrayidx, align 4 + %add = add nuw i32 %i.05, %s + %cmp = icmp ne i32 %add, %n + br i1 %cmp, label %for.body, label %for.end + +for.end: ; preds = %for.body, %entry + ret void +} + +define void @ne_nuw_neg_step(ptr nocapture %A, i32 %n, i32 %s) mustprogress { +; +; CHECK-LABEL: 'ne_nuw_neg_step' +; CHECK-NEXT: Determining loop execution counts for: @ne_nuw_neg_step +; CHECK-NEXT: Loop %for.body: Unpredictable backedge-taken count. +; CHECK-NEXT: Loop %for.body: Unpredictable constant max backedge-taken count. +; CHECK-NEXT: Loop %for.body: Unpredictable symbolic max backedge-taken count. +; +entry: + %neg_step = icmp slt i32 %s, 0 + call void @llvm.assume(i1 %neg_step) + %cmp4 = icmp sgt i32 %n, 0 + br i1 %cmp4, label %for.body, label %for.end + +for.body: ; preds = %entry, %for.body + %i.05 = phi i32 [ %add, %for.body ], [ 0, %entry ] + %arrayidx = getelementptr inbounds i32, ptr %A, i32 %i.05 + %0 = load i32, ptr %arrayidx, align 4 + %inc = add nuw i32 %0, 1 + store i32 %inc, ptr %arrayidx, align 4 + %add = add nuw i32 %i.05, %s + %cmp = icmp ne i32 %add, %n + br i1 %cmp, label %for.body, label %for.end + +for.end: ; preds = %for.body, %entry + ret void +} + +define void @ne_nuw_nonneg_step(ptr nocapture %A, i32 %n, i32 %s) mustprogress { +; +; CHECK-LABEL: 'ne_nuw_nonneg_step' +; CHECK-NEXT: Determining loop execution counts for: @ne_nuw_nonneg_step +; CHECK-NEXT: Loop %for.body: Unpredictable backedge-taken count. 
+; CHECK-NEXT: Loop %for.body: Unpredictable constant max backedge-taken count. +; CHECK-NEXT: Loop %for.body: Unpredictable symbolic max backedge-taken count. +; +entry: + %nonneg_step = icmp sge i32 %s, 0 + call void @llvm.assume(i1 %nonneg_step) + %cmp4 = icmp sgt i32 %n, 0 + br i1 %cmp4, label %for.body, label %for.end + +for.body: ; preds = %entry, %for.body + %i.05 = phi i32 [ %add, %for.body ], [ 0, %entry ] + %arrayidx = getelementptr inbounds i32, ptr %A, i32 %i.05 + %0 = load i32, ptr %arrayidx, align 4 + %inc = add nuw i32 %0, 1 + store i32 %inc, ptr %arrayidx, align 4 + %add = add nuw i32 %i.05, %s + %cmp = icmp ne i32 %add, %n + br i1 %cmp, label %for.body, label %for.end + +for.end: ; preds = %for.body, %entry + ret void +} + +define void @ne_nuw_unknown_step(ptr nocapture %A, i32 %n, i32 %s) mustprogress { +; +; CHECK-LABEL: 'ne_nuw_unknown_step' +; CHECK-NEXT: Determining loop execution counts for: @ne_nuw_unknown_step +; CHECK-NEXT: Loop %for.body: Unpredictable backedge-taken count. +; CHECK-NEXT: Loop %for.body: Unpredictable constant max backedge-taken count. +; CHECK-NEXT: Loop %for.body: Unpredictable symbolic max backedge-taken count. 
+; +entry: + %cmp4 = icmp sgt i32 %n, 0 + br i1 %cmp4, label %for.body, label %for.end + +for.body: ; preds = %entry, %for.body + %i.05 = phi i32 [ %add, %for.body ], [ 0, %entry ] + %arrayidx = getelementptr inbounds i32, ptr %A, i32 %i.05 + %0 = load i32, ptr %arrayidx, align 4 + %inc = add nuw i32 %0, 1 + store i32 %inc, ptr %arrayidx, align 4 + %add = add nuw i32 %i.05, %s + %cmp = icmp ne i32 %add, %n + br i1 %cmp, label %for.body, label %for.end + +for.end: ; preds = %for.body, %entry + ret void +} + +declare void @llvm.assume(i1) !8 = distinct !{!8, !9} !9 = !{!"llvm.loop.mustprogress"} diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/combine-vscale.mir b/llvm/test/CodeGen/AArch64/GlobalISel/combine-vscale.mir new file mode 100644 index 0000000..9b7a449 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/GlobalISel/combine-vscale.mir @@ -0,0 +1,113 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py +# RUN: llc -o - -mtriple=aarch64 -run-pass=aarch64-prelegalizer-combiner -verify-machineinstrs %s | FileCheck %s + +... +--- +name: sum_of_vscale +body: | + bb.1: + liveins: $x0, $x1 + ; CHECK-LABEL: name: sum_of_vscale + ; CHECK: liveins: $x0, $x1 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: %sum:_(s64) = G_VSCALE i64 20 + ; CHECK-NEXT: $x0 = COPY %sum(s64) + ; CHECK-NEXT: RET_ReallyLR implicit $x0 + %rhs:_(s64) = G_VSCALE i64 11 + %lhs:_(s64) = G_VSCALE i64 9 + %sum:_(s64) = nsw G_ADD %lhs(s64), %rhs(s64) + $x0 = COPY %sum(s64) + RET_ReallyLR implicit $x0 +... 
+--- +name: sum_of_vscale_multi_use +body: | + bb.1: + liveins: $x0, $x1 + ; CHECK-LABEL: name: sum_of_vscale_multi_use + ; CHECK: liveins: $x0, $x1 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: %rhs:_(s64) = G_VSCALE i64 11 + ; CHECK-NEXT: %lhs:_(s64) = G_VSCALE i64 9 + ; CHECK-NEXT: %sum:_(s64) = nsw G_ADD %lhs, %rhs + ; CHECK-NEXT: $x0 = COPY %sum(s64) + ; CHECK-NEXT: $x1 = COPY %rhs(s64) + ; CHECK-NEXT: RET_ReallyLR implicit $x0 + %rhs:_(s64) = G_VSCALE i64 11 + %lhs:_(s64) = G_VSCALE i64 9 + %sum:_(s64) = nsw G_ADD %lhs(s64), %rhs(s64) + $x0 = COPY %sum(s64) + $x1 = COPY %rhs(s64) + RET_ReallyLR implicit $x0 +... +--- +name: mul_of_vscale +body: | + bb.1: + liveins: $x0, $x1 + ; CHECK-LABEL: name: mul_of_vscale + ; CHECK: liveins: $x0, $x1 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: %mul:_(s64) = G_VSCALE i64 99 + ; CHECK-NEXT: $x0 = COPY %mul(s64) + ; CHECK-NEXT: RET_ReallyLR implicit $x0 + %rhs:_(s64) = G_CONSTANT i64 11 + %lhs:_(s64) = G_VSCALE i64 9 + %mul:_(s64) = nsw G_MUL %lhs(s64), %rhs(s64) + $x0 = COPY %mul(s64) + RET_ReallyLR implicit $x0 +... +--- +name: sub_of_vscale +body: | + bb.1: + liveins: $x0, $x1 + ; CHECK-LABEL: name: sub_of_vscale + ; CHECK: liveins: $x0, $x1 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: %x:_(s64) = COPY $x0 + ; CHECK-NEXT: [[VSCALE:%[0-9]+]]:_(s64) = G_VSCALE i64 -9 + ; CHECK-NEXT: %sub:_(s64) = nsw G_ADD %x, [[VSCALE]] + ; CHECK-NEXT: $x0 = COPY %sub(s64) + ; CHECK-NEXT: RET_ReallyLR implicit $x0 + %x:_(s64) = COPY $x0 + %rhs:_(s64) = G_VSCALE i64 9 + %sub:_(s64) = nsw G_SUB %x(s64), %rhs(s64) + $x0 = COPY %sub(s64) + RET_ReallyLR implicit $x0 +... 
+--- +name: shl_of_vscale +body: | + bb.1: + liveins: $x0, $x1 + ; CHECK-LABEL: name: shl_of_vscale + ; CHECK: liveins: $x0, $x1 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: %shl:_(s64) = G_VSCALE i64 44 + ; CHECK-NEXT: $x0 = COPY %shl(s64) + ; CHECK-NEXT: RET_ReallyLR implicit $x0 + %rhs:_(s64) = G_CONSTANT i64 2 + %lhs:_(s64) = G_VSCALE i64 11 + %shl:_(s64) = nsw G_SHL %lhs(s64), %rhs(s64) + $x0 = COPY %shl(s64) + RET_ReallyLR implicit $x0 +... +--- +name: shl_of_vscale_wrong_flag +body: | + bb.1: + liveins: $x0, $x1 + ; CHECK-LABEL: name: shl_of_vscale_wrong_flag + ; CHECK: liveins: $x0, $x1 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: %rhs:_(s64) = G_CONSTANT i64 2 + ; CHECK-NEXT: %lhs:_(s64) = G_VSCALE i64 11 + ; CHECK-NEXT: %shl:_(s64) = nuw G_SHL %lhs, %rhs(s64) + ; CHECK-NEXT: $x0 = COPY %shl(s64) + ; CHECK-NEXT: RET_ReallyLR implicit $x0 + %rhs:_(s64) = G_CONSTANT i64 2 + %lhs:_(s64) = G_VSCALE i64 11 + %shl:_(s64) = nuw G_SHL %lhs(s64), %rhs(s64) + $x0 = COPY %shl(s64) + RET_ReallyLR implicit $x0 diff --git a/llvm/test/CodeGen/AArch64/arm64-2011-10-18-LdStOptBug.ll b/llvm/test/CodeGen/AArch64/arm64-2011-10-18-LdStOptBug.ll index 3b6c4fa..dafdcf8 100644 --- a/llvm/test/CodeGen/AArch64/arm64-2011-10-18-LdStOptBug.ll +++ b/llvm/test/CodeGen/AArch64/arm64-2011-10-18-LdStOptBug.ll @@ -12,7 +12,7 @@ entry: for.body: ; CHECK: for.body -; CHECK: ldr w{{[0-9]+}}, [x{{[0-9]+}}, x{{[0-9]+}}] +; CHECK: ldr w{{[0-9]+}}, [x{{[0-9]+}}] ; CHECK: add x[[REG:[0-9]+]], ; CHECK: x[[REG]], #1, lsl #12 %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] diff --git a/llvm/test/CodeGen/AArch64/arm64-ldp-cluster.ll b/llvm/test/CodeGen/AArch64/arm64-ldp-cluster.ll index 8c7b31f..114203e 100644 --- a/llvm/test/CodeGen/AArch64/arm64-ldp-cluster.ll +++ b/llvm/test/CodeGen/AArch64/arm64-ldp-cluster.ll @@ -176,13 +176,13 @@ exit: ; CHECK: ********** MI Scheduling ********** ; CHECK: LDURDi_LDRDui:%bb.1 vector_body ; -; CHECK: Cluster ld/st SU(2) - SU(6) -; CHECK: Cluster ld/st 
SU(3) - SU(7) +; CHECK: Cluster ld/st SU(0) - SU(4) +; CHECK: Cluster ld/st SU(1) - SU(5) ; -; CHECK: SU(2): %{{[0-9]+}}:fpr64 = LDURDi -; CHECK: SU(3): %{{[0-9]+}}:fpr64 = LDURDi -; CHECK: SU(6): %{{[0-9]+}}:fpr64 = LDRDui -; CHECK: SU(7): %{{[0-9]+}}:fpr64 = LDRDui +; CHECK: SU(0): %{{[0-9]+}}:fpr64 = LDURDi +; CHECK: SU(1): %{{[0-9]+}}:fpr64 = LDURDi +; CHECK: SU(4): %{{[0-9]+}}:fpr64 = LDRDui +; CHECK: SU(5): %{{[0-9]+}}:fpr64 = LDRDui ; define void @LDURDi_LDRDui(ptr nocapture readonly %arg) { entry: diff --git a/llvm/test/CodeGen/AArch64/complex-deinterleaving-reductions-predicated-scalable.ll b/llvm/test/CodeGen/AArch64/complex-deinterleaving-reductions-predicated-scalable.ll index ac2b21a..2ef3528 100644 --- a/llvm/test/CodeGen/AArch64/complex-deinterleaving-reductions-predicated-scalable.ll +++ b/llvm/test/CodeGen/AArch64/complex-deinterleaving-reductions-predicated-scalable.ll @@ -15,36 +15,34 @@ define %"class.std::complex" @complex_mul_v2f64(ptr %a, ptr %b) { ; CHECK-LABEL: complex_mul_v2f64: ; CHECK: // %bb.0: // %entry ; CHECK-NEXT: mov z1.d, #0 // =0x0 -; CHECK-NEXT: mov w9, #100 // =0x64 -; CHECK-NEXT: cntd x10 -; CHECK-NEXT: whilelo p1.d, xzr, x9 -; CHECK-NEXT: mov x8, xzr -; CHECK-NEXT: rdvl x11, #2 +; CHECK-NEXT: mov w8, #100 // =0x64 +; CHECK-NEXT: cntd x9 +; CHECK-NEXT: whilelo p1.d, xzr, x8 +; CHECK-NEXT: rdvl x10, #2 +; CHECK-NEXT: mov x11, x9 ; CHECK-NEXT: ptrue p0.d -; CHECK-NEXT: mov x12, x10 ; CHECK-NEXT: zip2 z0.d, z1.d, z1.d ; CHECK-NEXT: zip1 z1.d, z1.d, z1.d ; CHECK-NEXT: .LBB0_1: // %vector.body ; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: zip2 p2.d, p1.d, p1.d -; CHECK-NEXT: add x13, x0, x8 -; CHECK-NEXT: add x14, x1, x8 -; CHECK-NEXT: zip1 p1.d, p1.d, p1.d ; CHECK-NEXT: mov z6.d, z1.d ; CHECK-NEXT: mov z7.d, z0.d -; CHECK-NEXT: ld1d { z2.d }, p2/z, [x13, #1, mul vl] -; CHECK-NEXT: ld1d { z4.d }, p2/z, [x14, #1, mul vl] -; CHECK-NEXT: add x8, x8, x11 -; CHECK-NEXT: ld1d { z3.d }, p1/z, [x13] -; CHECK-NEXT: ld1d { 
z5.d }, p1/z, [x14] +; CHECK-NEXT: zip1 p1.d, p1.d, p1.d +; CHECK-NEXT: ld1d { z2.d }, p2/z, [x0, #1, mul vl] +; CHECK-NEXT: ld1d { z4.d }, p2/z, [x1, #1, mul vl] +; CHECK-NEXT: ld1d { z3.d }, p1/z, [x0] +; CHECK-NEXT: ld1d { z5.d }, p1/z, [x1] +; CHECK-NEXT: add x1, x1, x10 +; CHECK-NEXT: add x0, x0, x10 ; CHECK-NEXT: fcmla z7.d, p0/m, z4.d, z2.d, #0 ; CHECK-NEXT: fcmla z6.d, p0/m, z5.d, z3.d, #0 ; CHECK-NEXT: fcmla z7.d, p0/m, z4.d, z2.d, #90 ; CHECK-NEXT: fcmla z6.d, p0/m, z5.d, z3.d, #90 ; CHECK-NEXT: mov z0.d, p2/m, z7.d ; CHECK-NEXT: mov z1.d, p1/m, z6.d -; CHECK-NEXT: whilelo p1.d, x12, x9 -; CHECK-NEXT: add x12, x12, x10 +; CHECK-NEXT: whilelo p1.d, x11, x8 +; CHECK-NEXT: add x11, x11, x9 ; CHECK-NEXT: b.mi .LBB0_1 ; CHECK-NEXT: // %bb.2: // %exit.block ; CHECK-NEXT: uzp1 z2.d, z1.d, z0.d @@ -114,39 +112,37 @@ define %"class.std::complex" @complex_mul_predicated_v2f64(ptr %a, ptr %b, ptr % ; CHECK-LABEL: complex_mul_predicated_v2f64: ; CHECK: // %bb.0: // %entry ; CHECK-NEXT: mov z1.d, #0 // =0x0 -; CHECK-NEXT: cntd x10 -; CHECK-NEXT: mov w12, #100 // =0x64 -; CHECK-NEXT: neg x11, x10 +; CHECK-NEXT: cntd x9 +; CHECK-NEXT: mov w11, #100 // =0x64 +; CHECK-NEXT: neg x10, x9 ; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: mov x8, xzr -; CHECK-NEXT: mov x9, xzr -; CHECK-NEXT: and x11, x11, x12 -; CHECK-NEXT: rdvl x12, #2 +; CHECK-NEXT: and x10, x10, x11 +; CHECK-NEXT: rdvl x11, #2 ; CHECK-NEXT: zip2 z0.d, z1.d, z1.d ; CHECK-NEXT: zip1 z1.d, z1.d, z1.d ; CHECK-NEXT: .LBB1_1: // %vector.body ; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: ld1w { z2.d }, p0/z, [x2, x9, lsl #2] -; CHECK-NEXT: add x13, x0, x8 -; CHECK-NEXT: add x14, x1, x8 +; CHECK-NEXT: ld1w { z2.d }, p0/z, [x2, x8, lsl #2] ; CHECK-NEXT: mov z6.d, z1.d ; CHECK-NEXT: mov z7.d, z0.d -; CHECK-NEXT: add x9, x9, x10 -; CHECK-NEXT: add x8, x8, x12 -; CHECK-NEXT: cmpne p2.d, p0/z, z2.d, #0 -; CHECK-NEXT: cmp x11, x9 -; CHECK-NEXT: zip2 p1.d, p2.d, p2.d -; CHECK-NEXT: zip1 p2.d, p2.d, p2.d -; 
CHECK-NEXT: ld1d { z2.d }, p1/z, [x13, #1, mul vl] -; CHECK-NEXT: ld1d { z4.d }, p1/z, [x14, #1, mul vl] -; CHECK-NEXT: ld1d { z3.d }, p2/z, [x13] -; CHECK-NEXT: ld1d { z5.d }, p2/z, [x14] +; CHECK-NEXT: add x8, x8, x9 +; CHECK-NEXT: cmpne p1.d, p0/z, z2.d, #0 +; CHECK-NEXT: cmp x10, x8 +; CHECK-NEXT: zip2 p2.d, p1.d, p1.d +; CHECK-NEXT: zip1 p1.d, p1.d, p1.d +; CHECK-NEXT: ld1d { z2.d }, p2/z, [x0, #1, mul vl] +; CHECK-NEXT: ld1d { z4.d }, p2/z, [x1, #1, mul vl] +; CHECK-NEXT: ld1d { z3.d }, p1/z, [x0] +; CHECK-NEXT: ld1d { z5.d }, p1/z, [x1] +; CHECK-NEXT: add x1, x1, x11 +; CHECK-NEXT: add x0, x0, x11 ; CHECK-NEXT: fcmla z7.d, p0/m, z4.d, z2.d, #0 ; CHECK-NEXT: fcmla z6.d, p0/m, z5.d, z3.d, #0 ; CHECK-NEXT: fcmla z7.d, p0/m, z4.d, z2.d, #90 ; CHECK-NEXT: fcmla z6.d, p0/m, z5.d, z3.d, #90 -; CHECK-NEXT: mov z0.d, p1/m, z7.d -; CHECK-NEXT: mov z1.d, p2/m, z6.d +; CHECK-NEXT: mov z0.d, p2/m, z7.d +; CHECK-NEXT: mov z1.d, p1/m, z6.d ; CHECK-NEXT: b.ne .LBB1_1 ; CHECK-NEXT: // %bb.2: // %exit.block ; CHECK-NEXT: uzp1 z2.d, z1.d, z0.d @@ -218,38 +214,38 @@ define %"class.std::complex" @complex_mul_predicated_x2_v2f64(ptr %a, ptr %b, pt ; CHECK-LABEL: complex_mul_predicated_x2_v2f64: ; CHECK: // %bb.0: // %entry ; CHECK-NEXT: mov z1.d, #0 // =0x0 -; CHECK-NEXT: mov w10, #100 // =0x64 +; CHECK-NEXT: mov w8, #100 // =0x64 +; CHECK-NEXT: cntd x9 +; CHECK-NEXT: whilelo p1.d, xzr, x8 +; CHECK-NEXT: rdvl x10, #2 +; CHECK-NEXT: cnth x11 ; CHECK-NEXT: ptrue p0.d -; CHECK-NEXT: whilelo p1.d, xzr, x10 -; CHECK-NEXT: mov x8, xzr -; CHECK-NEXT: mov x9, xzr -; CHECK-NEXT: cntd x11 -; CHECK-NEXT: rdvl x12, #2 +; CHECK-NEXT: mov x12, x9 ; CHECK-NEXT: zip2 z0.d, z1.d, z1.d ; CHECK-NEXT: zip1 z1.d, z1.d, z1.d ; CHECK-NEXT: .LBB2_1: // %vector.body ; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: ld1w { z2.d }, p1/z, [x2, x9, lsl #2] -; CHECK-NEXT: add x13, x0, x8 -; CHECK-NEXT: add x14, x1, x8 +; CHECK-NEXT: ld1w { z2.d }, p1/z, [x2] ; CHECK-NEXT: mov z6.d, z1.d ; 
CHECK-NEXT: mov z7.d, z0.d -; CHECK-NEXT: add x9, x9, x11 -; CHECK-NEXT: add x8, x8, x12 -; CHECK-NEXT: cmpne p2.d, p1/z, z2.d, #0 -; CHECK-NEXT: zip2 p1.d, p2.d, p2.d -; CHECK-NEXT: zip1 p2.d, p2.d, p2.d -; CHECK-NEXT: ld1d { z2.d }, p1/z, [x13, #1, mul vl] -; CHECK-NEXT: ld1d { z4.d }, p1/z, [x14, #1, mul vl] -; CHECK-NEXT: ld1d { z3.d }, p2/z, [x13] -; CHECK-NEXT: ld1d { z5.d }, p2/z, [x14] +; CHECK-NEXT: add x2, x2, x11 +; CHECK-NEXT: cmpne p1.d, p1/z, z2.d, #0 +; CHECK-NEXT: zip2 p2.d, p1.d, p1.d +; CHECK-NEXT: zip1 p1.d, p1.d, p1.d +; CHECK-NEXT: ld1d { z2.d }, p2/z, [x0, #1, mul vl] +; CHECK-NEXT: ld1d { z4.d }, p2/z, [x1, #1, mul vl] +; CHECK-NEXT: ld1d { z3.d }, p1/z, [x0] +; CHECK-NEXT: ld1d { z5.d }, p1/z, [x1] +; CHECK-NEXT: add x1, x1, x10 +; CHECK-NEXT: add x0, x0, x10 ; CHECK-NEXT: fcmla z7.d, p0/m, z4.d, z2.d, #0 ; CHECK-NEXT: fcmla z6.d, p0/m, z5.d, z3.d, #0 ; CHECK-NEXT: fcmla z7.d, p0/m, z4.d, z2.d, #90 ; CHECK-NEXT: fcmla z6.d, p0/m, z5.d, z3.d, #90 -; CHECK-NEXT: mov z0.d, p1/m, z7.d -; CHECK-NEXT: whilelo p1.d, x9, x10 -; CHECK-NEXT: mov z1.d, p2/m, z6.d +; CHECK-NEXT: mov z0.d, p2/m, z7.d +; CHECK-NEXT: mov z1.d, p1/m, z6.d +; CHECK-NEXT: whilelo p1.d, x12, x8 +; CHECK-NEXT: add x12, x12, x9 ; CHECK-NEXT: b.mi .LBB2_1 ; CHECK-NEXT: // %bb.2: // %exit.block ; CHECK-NEXT: uzp1 z2.d, z1.d, z0.d diff --git a/llvm/test/CodeGen/AArch64/complex-deinterleaving-reductions-scalable.ll b/llvm/test/CodeGen/AArch64/complex-deinterleaving-reductions-scalable.ll index af07519..8e26ef6 100644 --- a/llvm/test/CodeGen/AArch64/complex-deinterleaving-reductions-scalable.ll +++ b/llvm/test/CodeGen/AArch64/complex-deinterleaving-reductions-scalable.ll @@ -15,30 +15,27 @@ define %"class.std::complex" @complex_mul_v2f64(ptr %a, ptr %b) { ; CHECK-LABEL: complex_mul_v2f64: ; CHECK: // %bb.0: // %entry ; CHECK-NEXT: mov z1.d, #0 // =0x0 -; CHECK-NEXT: cntd x9 -; CHECK-NEXT: ptrue p1.b -; CHECK-NEXT: neg x10, x9 -; CHECK-NEXT: mov w11, #100 // =0x64 +; CHECK-NEXT: cntd 
x8 +; CHECK-NEXT: mov w10, #100 // =0x64 +; CHECK-NEXT: neg x9, x8 ; CHECK-NEXT: ptrue p0.d -; CHECK-NEXT: mov x8, xzr -; CHECK-NEXT: and x10, x10, x11 -; CHECK-NEXT: rdvl x11, #2 +; CHECK-NEXT: and x9, x9, x10 +; CHECK-NEXT: rdvl x10, #2 ; CHECK-NEXT: zip2 z0.d, z1.d, z1.d ; CHECK-NEXT: zip1 z1.d, z1.d, z1.d ; CHECK-NEXT: .LBB0_1: // %vector.body ; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: add x12, x0, x8 -; CHECK-NEXT: add x13, x1, x8 -; CHECK-NEXT: ld1b { z2.b }, p1/z, [x0, x8] -; CHECK-NEXT: ld1d { z3.d }, p0/z, [x12, #1, mul vl] -; CHECK-NEXT: ld1b { z4.b }, p1/z, [x1, x8] -; CHECK-NEXT: ld1d { z5.d }, p0/z, [x13, #1, mul vl] -; CHECK-NEXT: subs x10, x10, x9 -; CHECK-NEXT: add x8, x8, x11 -; CHECK-NEXT: fcmla z1.d, p0/m, z4.d, z2.d, #0 -; CHECK-NEXT: fcmla z0.d, p0/m, z5.d, z3.d, #0 -; CHECK-NEXT: fcmla z1.d, p0/m, z4.d, z2.d, #90 -; CHECK-NEXT: fcmla z0.d, p0/m, z5.d, z3.d, #90 +; CHECK-NEXT: ld1d { z2.d }, p0/z, [x0, #1, mul vl] +; CHECK-NEXT: ld1d { z3.d }, p0/z, [x0] +; CHECK-NEXT: subs x9, x9, x8 +; CHECK-NEXT: ld1d { z4.d }, p0/z, [x1, #1, mul vl] +; CHECK-NEXT: ld1d { z5.d }, p0/z, [x1] +; CHECK-NEXT: add x1, x1, x10 +; CHECK-NEXT: add x0, x0, x10 +; CHECK-NEXT: fcmla z1.d, p0/m, z5.d, z3.d, #0 +; CHECK-NEXT: fcmla z0.d, p0/m, z4.d, z2.d, #0 +; CHECK-NEXT: fcmla z1.d, p0/m, z5.d, z3.d, #90 +; CHECK-NEXT: fcmla z0.d, p0/m, z4.d, z2.d, #90 ; CHECK-NEXT: b.ne .LBB0_1 ; CHECK-NEXT: // %bb.2: // %exit.block ; CHECK-NEXT: uzp1 z2.d, z1.d, z0.d @@ -103,34 +100,31 @@ define %"class.std::complex" @complex_mul_nonzero_init_v2f64(ptr %a, ptr %b) { ; CHECK: // %bb.0: // %entry ; CHECK-NEXT: fmov d0, #1.00000000 ; CHECK-NEXT: mov z1.d, #0 // =0x0 -; CHECK-NEXT: cntd x9 +; CHECK-NEXT: cntd x8 ; CHECK-NEXT: fmov d2, #2.00000000 ; CHECK-NEXT: ptrue p0.d, vl1 -; CHECK-NEXT: neg x10, x9 -; CHECK-NEXT: ptrue p1.b -; CHECK-NEXT: mov w11, #100 // =0x64 -; CHECK-NEXT: mov x8, xzr +; CHECK-NEXT: neg x9, x8 +; CHECK-NEXT: mov w10, #100 // =0x64 ; 
CHECK-NEXT: sel z3.d, p0, z0.d, z1.d -; CHECK-NEXT: and x10, x10, x11 -; CHECK-NEXT: rdvl x11, #2 +; CHECK-NEXT: and x9, x9, x10 +; CHECK-NEXT: rdvl x10, #2 ; CHECK-NEXT: mov z1.d, p0/m, z2.d ; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: zip2 z0.d, z1.d, z3.d ; CHECK-NEXT: zip1 z1.d, z1.d, z3.d ; CHECK-NEXT: .LBB1_1: // %vector.body ; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: add x12, x0, x8 -; CHECK-NEXT: add x13, x1, x8 -; CHECK-NEXT: ld1b { z2.b }, p1/z, [x0, x8] -; CHECK-NEXT: ld1d { z3.d }, p0/z, [x12, #1, mul vl] -; CHECK-NEXT: ld1b { z4.b }, p1/z, [x1, x8] -; CHECK-NEXT: ld1d { z5.d }, p0/z, [x13, #1, mul vl] -; CHECK-NEXT: subs x10, x10, x9 -; CHECK-NEXT: add x8, x8, x11 -; CHECK-NEXT: fcmla z1.d, p0/m, z4.d, z2.d, #0 -; CHECK-NEXT: fcmla z0.d, p0/m, z5.d, z3.d, #0 -; CHECK-NEXT: fcmla z1.d, p0/m, z4.d, z2.d, #90 -; CHECK-NEXT: fcmla z0.d, p0/m, z5.d, z3.d, #90 +; CHECK-NEXT: ld1d { z2.d }, p0/z, [x0, #1, mul vl] +; CHECK-NEXT: ld1d { z3.d }, p0/z, [x0] +; CHECK-NEXT: subs x9, x9, x8 +; CHECK-NEXT: ld1d { z4.d }, p0/z, [x1, #1, mul vl] +; CHECK-NEXT: ld1d { z5.d }, p0/z, [x1] +; CHECK-NEXT: add x1, x1, x10 +; CHECK-NEXT: add x0, x0, x10 +; CHECK-NEXT: fcmla z1.d, p0/m, z5.d, z3.d, #0 +; CHECK-NEXT: fcmla z0.d, p0/m, z4.d, z2.d, #0 +; CHECK-NEXT: fcmla z1.d, p0/m, z5.d, z3.d, #90 +; CHECK-NEXT: fcmla z0.d, p0/m, z4.d, z2.d, #90 ; CHECK-NEXT: b.ne .LBB1_1 ; CHECK-NEXT: // %bb.2: // %exit.block ; CHECK-NEXT: uzp1 z2.d, z1.d, z0.d @@ -190,45 +184,37 @@ define %"class.std::complex" @complex_mul_v2f64_unrolled(ptr %a, ptr %b) { ; CHECK-LABEL: complex_mul_v2f64_unrolled: ; CHECK: // %bb.0: // %entry ; CHECK-NEXT: mov z1.d, #0 // =0x0 -; CHECK-NEXT: cntw x9 -; CHECK-NEXT: mov w11, #1000 // =0x3e8 -; CHECK-NEXT: neg x10, x9 -; CHECK-NEXT: rdvl x12, #2 -; CHECK-NEXT: ptrue p1.b +; CHECK-NEXT: cntw x8 +; CHECK-NEXT: mov w10, #1000 // =0x3e8 +; CHECK-NEXT: neg x9, x8 ; CHECK-NEXT: ptrue p0.d -; CHECK-NEXT: mov x8, xzr -; CHECK-NEXT: and x10, x10, x11 +; 
CHECK-NEXT: and x9, x9, x10 +; CHECK-NEXT: rdvl x10, #4 ; CHECK-NEXT: zip2 z0.d, z1.d, z1.d ; CHECK-NEXT: zip1 z1.d, z1.d, z1.d -; CHECK-NEXT: add x11, x1, x12 -; CHECK-NEXT: add x12, x0, x12 -; CHECK-NEXT: rdvl x13, #4 ; CHECK-NEXT: mov z2.d, z1.d ; CHECK-NEXT: mov z3.d, z0.d ; CHECK-NEXT: .LBB2_1: // %vector.body ; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: add x14, x0, x8 -; CHECK-NEXT: add x15, x12, x8 -; CHECK-NEXT: add x16, x1, x8 -; CHECK-NEXT: add x17, x11, x8 -; CHECK-NEXT: ld1b { z4.b }, p1/z, [x0, x8] -; CHECK-NEXT: ld1d { z5.d }, p0/z, [x14, #1, mul vl] -; CHECK-NEXT: ld1b { z6.b }, p1/z, [x12, x8] -; CHECK-NEXT: ld1b { z7.b }, p1/z, [x1, x8] -; CHECK-NEXT: ld1d { z16.d }, p0/z, [x16, #1, mul vl] -; CHECK-NEXT: ld1d { z17.d }, p0/z, [x15, #1, mul vl] -; CHECK-NEXT: ld1b { z18.b }, p1/z, [x11, x8] -; CHECK-NEXT: ld1d { z19.d }, p0/z, [x17, #1, mul vl] -; CHECK-NEXT: subs x10, x10, x9 -; CHECK-NEXT: add x8, x8, x13 -; CHECK-NEXT: fcmla z1.d, p0/m, z7.d, z4.d, #0 -; CHECK-NEXT: fcmla z0.d, p0/m, z16.d, z5.d, #0 -; CHECK-NEXT: fcmla z2.d, p0/m, z18.d, z6.d, #0 -; CHECK-NEXT: fcmla z3.d, p0/m, z19.d, z17.d, #0 -; CHECK-NEXT: fcmla z1.d, p0/m, z7.d, z4.d, #90 -; CHECK-NEXT: fcmla z0.d, p0/m, z16.d, z5.d, #90 -; CHECK-NEXT: fcmla z2.d, p0/m, z18.d, z6.d, #90 -; CHECK-NEXT: fcmla z3.d, p0/m, z19.d, z17.d, #90 +; CHECK-NEXT: ld1d { z4.d }, p0/z, [x0, #1, mul vl] +; CHECK-NEXT: ld1d { z5.d }, p0/z, [x0] +; CHECK-NEXT: subs x9, x9, x8 +; CHECK-NEXT: ld1d { z6.d }, p0/z, [x0, #3, mul vl] +; CHECK-NEXT: ld1d { z7.d }, p0/z, [x1, #1, mul vl] +; CHECK-NEXT: ld1d { z16.d }, p0/z, [x1] +; CHECK-NEXT: ld1d { z17.d }, p0/z, [x0, #2, mul vl] +; CHECK-NEXT: add x0, x0, x10 +; CHECK-NEXT: ld1d { z18.d }, p0/z, [x1, #3, mul vl] +; CHECK-NEXT: ld1d { z19.d }, p0/z, [x1, #2, mul vl] +; CHECK-NEXT: add x1, x1, x10 +; CHECK-NEXT: fcmla z1.d, p0/m, z16.d, z5.d, #0 +; CHECK-NEXT: fcmla z0.d, p0/m, z7.d, z4.d, #0 +; CHECK-NEXT: fcmla z3.d, p0/m, z18.d, z6.d, #0 
+; CHECK-NEXT: fcmla z2.d, p0/m, z19.d, z17.d, #0 +; CHECK-NEXT: fcmla z1.d, p0/m, z16.d, z5.d, #90 +; CHECK-NEXT: fcmla z0.d, p0/m, z7.d, z4.d, #90 +; CHECK-NEXT: fcmla z3.d, p0/m, z18.d, z6.d, #90 +; CHECK-NEXT: fcmla z2.d, p0/m, z19.d, z17.d, #90 ; CHECK-NEXT: b.ne .LBB2_1 ; CHECK-NEXT: // %bb.2: // %exit.block ; CHECK-NEXT: uzp1 z4.d, z2.d, z3.d diff --git a/llvm/test/CodeGen/AArch64/complex-deinterleaving-reductions.ll b/llvm/test/CodeGen/AArch64/complex-deinterleaving-reductions.ll index 44d0a93..aed3072 100644 --- a/llvm/test/CodeGen/AArch64/complex-deinterleaving-reductions.ll +++ b/llvm/test/CodeGen/AArch64/complex-deinterleaving-reductions.ll @@ -148,17 +148,16 @@ define %"struct.std::complex" @complex_mul_v2f64_unrolled(ptr %a, ptr %b) { ; CHECK-NEXT: adrp x8, .LCPI2_0 ; CHECK-NEXT: movi v3.2d, #0000000000000000 ; CHECK-NEXT: ldr q2, [x8, :lo12:.LCPI2_0] -; CHECK-NEXT: mov x8, xzr +; CHECK-NEXT: add x8, x0, #32 +; CHECK-NEXT: add x9, x1, #32 +; CHECK-NEXT: mov x10, #-100 // =0xffffffffffffff9c ; CHECK-NEXT: .LBB2_1: // %vector.body ; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: add x9, x0, x8 -; CHECK-NEXT: add x10, x1, x8 -; CHECK-NEXT: add x8, x8, #64 -; CHECK-NEXT: ldp q5, q4, [x9] -; CHECK-NEXT: cmp x8, #1600 -; CHECK-NEXT: ldp q7, q6, [x10] -; CHECK-NEXT: ldp q17, q16, [x9, #32] -; CHECK-NEXT: ldp q19, q18, [x10, #32] +; CHECK-NEXT: ldp q5, q4, [x8, #-32] +; CHECK-NEXT: adds x10, x10, #4 +; CHECK-NEXT: ldp q7, q6, [x9, #-32] +; CHECK-NEXT: ldp q17, q16, [x8], #64 +; CHECK-NEXT: ldp q19, q18, [x9], #64 ; CHECK-NEXT: fcmla v2.2d, v7.2d, v5.2d, #0 ; CHECK-NEXT: fcmla v0.2d, v6.2d, v4.2d, #0 ; CHECK-NEXT: fcmla v1.2d, v19.2d, v17.2d, #0 diff --git a/llvm/test/CodeGen/AArch64/sme-support-routines-calling-convention.ll b/llvm/test/CodeGen/AArch64/sme-support-routines-calling-convention.ll index 7535638..63c6533 100644 --- a/llvm/test/CodeGen/AArch64/sme-support-routines-calling-convention.ll +++ 
b/llvm/test/CodeGen/AArch64/sme-support-routines-calling-convention.ll @@ -25,6 +25,25 @@ define void @test_sme_calling_convention_x0() nounwind { ret void } +define i64 @test_sme_calling_convention_x1() nounwind { +; CHECK-LABEL: test_sme_calling_convention_x1: +; CHECK: // %bb.0: +; CHECK-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill +; CHECK-NEXT: bl __arm_get_current_vg +; CHECK-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret +; DARWIN-LABEL: test_sme_calling_convention_x1: +; DARWIN: stp x29, x30, [sp, #-16]! +; DARWIN: bl ___arm_get_current_vg +; DARWIN: ldp x29, x30, [sp], #16 +; DARWIN: ret +; +; CHECK-CSRMASK-LABEL: name: test_sme_calling_convention_x1 +; CHECK-CSRMASK: BL @__arm_get_current_vg, csr_aarch64_sme_abi_support_routines_preservemost_from_x1 + %vg = call aarch64_sme_preservemost_from_x1 i64 @__arm_get_current_vg() + ret i64 %vg +} + define i64 @test_sme_calling_convention_x2() nounwind { ; CHECK-LABEL: test_sme_calling_convention_x2: ; CHECK: // %bb.0: @@ -46,4 +65,5 @@ define i64 @test_sme_calling_convention_x2() nounwind { } declare void @__arm_tpidr2_save() +declare i64 @__arm_get_current_vg() declare {i64, i64} @__arm_sme_state() diff --git a/llvm/test/CodeGen/AArch64/vecreduce-fmax-legalization.ll b/llvm/test/CodeGen/AArch64/vecreduce-fmax-legalization.ll index 4c02a52..c993051 100644 --- a/llvm/test/CodeGen/AArch64/vecreduce-fmax-legalization.ll +++ b/llvm/test/CodeGen/AArch64/vecreduce-fmax-legalization.ll @@ -648,7 +648,19 @@ define float @test_v3f32_ninf(<3 x float> %a) nounwind { define fp128 @test_v2f128(<2 x fp128> %a) nounwind { ; CHECK-LABEL: test_v2f128: ; CHECK: // %bb.0: -; CHECK-NEXT: b fmaxl +; CHECK-NEXT: sub sp, sp, #48 +; CHECK-NEXT: str x30, [sp, #32] // 8-byte Folded Spill +; CHECK-NEXT: stp q0, q1, [sp] // 32-byte Folded Spill +; CHECK-NEXT: bl __gttf2 +; CHECK-NEXT: ldr q0, [sp, #16] // 16-byte Folded Reload +; CHECK-NEXT: cmp w0, #0 +; CHECK-NEXT: b.le .LBB18_2 +; CHECK-NEXT: // %bb.1: +; 
CHECK-NEXT: ldr q0, [sp] // 16-byte Folded Reload +; CHECK-NEXT: .LBB18_2: +; CHECK-NEXT: ldr x30, [sp, #32] // 8-byte Folded Reload +; CHECK-NEXT: add sp, sp, #48 +; CHECK-NEXT: ret %b = call nnan fp128 @llvm.vector.reduce.fmax.v2f128(<2 x fp128> %a) ret fp128 %b } diff --git a/llvm/test/CodeGen/AArch64/vecreduce-fmin-legalization.ll b/llvm/test/CodeGen/AArch64/vecreduce-fmin-legalization.ll index 18d40cb..0116be5 100644 --- a/llvm/test/CodeGen/AArch64/vecreduce-fmin-legalization.ll +++ b/llvm/test/CodeGen/AArch64/vecreduce-fmin-legalization.ll @@ -648,7 +648,19 @@ define float @test_v3f32_ninf(<3 x float> %a) nounwind { define fp128 @test_v2f128(<2 x fp128> %a) nounwind { ; CHECK-LABEL: test_v2f128: ; CHECK: // %bb.0: -; CHECK-NEXT: b fminl +; CHECK-NEXT: sub sp, sp, #48 +; CHECK-NEXT: str x30, [sp, #32] // 8-byte Folded Spill +; CHECK-NEXT: stp q0, q1, [sp] // 32-byte Folded Spill +; CHECK-NEXT: bl __lttf2 +; CHECK-NEXT: ldr q0, [sp, #16] // 16-byte Folded Reload +; CHECK-NEXT: cmp w0, #0 +; CHECK-NEXT: b.ge .LBB18_2 +; CHECK-NEXT: // %bb.1: +; CHECK-NEXT: ldr q0, [sp] // 16-byte Folded Reload +; CHECK-NEXT: .LBB18_2: +; CHECK-NEXT: ldr x30, [sp, #32] // 8-byte Folded Reload +; CHECK-NEXT: add sp, sp, #48 +; CHECK-NEXT: ret %b = call nnan fp128 @llvm.vector.reduce.fmin.v2f128(<2 x fp128> %a) ret fp128 %b } diff --git a/llvm/test/CodeGen/AArch64/zext-to-tbl.ll b/llvm/test/CodeGen/AArch64/zext-to-tbl.ll index 599bd81..66bb131 100644 --- a/llvm/test/CodeGen/AArch64/zext-to-tbl.ll +++ b/llvm/test/CodeGen/AArch64/zext-to-tbl.ll @@ -1669,42 +1669,41 @@ define void @zext_v8i8_to_v8i64_with_add_in_sequence_in_loop(ptr %src, ptr %dst) ; CHECK-LABEL: zext_v8i8_to_v8i64_with_add_in_sequence_in_loop: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: Lloh18: -; CHECK-NEXT: adrp x9, lCPI17_0@PAGE +; CHECK-NEXT: adrp x8, lCPI17_0@PAGE ; CHECK-NEXT: Lloh19: -; CHECK-NEXT: adrp x10, lCPI17_1@PAGE -; CHECK-NEXT: mov x8, xzr +; CHECK-NEXT: adrp x9, lCPI17_1@PAGE +; CHECK-NEXT: mov w10, 
#128 ; =0x80 ; CHECK-NEXT: Lloh20: -; CHECK-NEXT: ldr q0, [x9, lCPI17_0@PAGEOFF] +; CHECK-NEXT: ldr q0, [x8, lCPI17_0@PAGEOFF] ; CHECK-NEXT: Lloh21: -; CHECK-NEXT: ldr q1, [x10, lCPI17_1@PAGEOFF] +; CHECK-NEXT: ldr q1, [x9, lCPI17_1@PAGEOFF] +; CHECK-NEXT: add x8, x1, #64 ; CHECK-NEXT: add x9, x0, #8 ; CHECK-NEXT: LBB17_1: ; %loop ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: ldp d2, d3, [x9, #-8] -; CHECK-NEXT: add x10, x1, x8 -; CHECK-NEXT: ldp q6, q5, [x10, #32] -; CHECK-NEXT: add x8, x8, #128 -; CHECK-NEXT: ldp q17, q16, [x10] -; CHECK-NEXT: cmp x8, #1024 +; CHECK-NEXT: subs x10, x10, #16 +; CHECK-NEXT: ldp q6, q5, [x8, #-32] +; CHECK-NEXT: add x9, x9, #16 +; CHECK-NEXT: ldp q17, q16, [x8, #-64] ; CHECK-NEXT: tbl.16b v4, { v2 }, v1 ; CHECK-NEXT: tbl.16b v2, { v2 }, v0 ; CHECK-NEXT: tbl.16b v7, { v3 }, v1 ; CHECK-NEXT: tbl.16b v3, { v3 }, v0 -; CHECK-NEXT: add x9, x9, #16 ; CHECK-NEXT: uaddw2.2d v5, v5, v4 ; CHECK-NEXT: uaddw.2d v4, v6, v4 ; CHECK-NEXT: uaddw2.2d v6, v16, v2 -; CHECK-NEXT: ldp q18, q16, [x10, #96] +; CHECK-NEXT: ldp q18, q16, [x8, #32] ; CHECK-NEXT: uaddw.2d v2, v17, v2 -; CHECK-NEXT: stp q4, q5, [x10, #32] +; CHECK-NEXT: stp q4, q5, [x8, #-32] ; CHECK-NEXT: uaddw2.2d v5, v16, v7 -; CHECK-NEXT: ldp q16, q4, [x10, #64] +; CHECK-NEXT: ldp q16, q4, [x8] ; CHECK-NEXT: uaddw.2d v7, v18, v7 -; CHECK-NEXT: stp q2, q6, [x10] +; CHECK-NEXT: stp q2, q6, [x8, #-64] ; CHECK-NEXT: uaddw2.2d v4, v4, v3 ; CHECK-NEXT: uaddw.2d v2, v16, v3 -; CHECK-NEXT: stp q7, q5, [x10, #96] -; CHECK-NEXT: stp q2, q4, [x10, #64] +; CHECK-NEXT: stp q7, q5, [x8, #32] +; CHECK-NEXT: stp q2, q4, [x8], #128 ; CHECK-NEXT: b.ne LBB17_1 ; CHECK-NEXT: ; %bb.2: ; %exit ; CHECK-NEXT: ret @@ -1715,67 +1714,67 @@ define void @zext_v8i8_to_v8i64_with_add_in_sequence_in_loop(ptr %src, ptr %dst) ; CHECK-BE: // %bb.0: // %entry ; CHECK-BE-NEXT: adrp x9, .LCPI17_0 ; CHECK-BE-NEXT: add x9, x9, :lo12:.LCPI17_0 -; CHECK-BE-NEXT: mov x8, xzr +; CHECK-BE-NEXT: mov w8, #128 // =0x80 
; CHECK-BE-NEXT: ld1 { v0.16b }, [x9] ; CHECK-BE-NEXT: adrp x9, .LCPI17_1 ; CHECK-BE-NEXT: add x9, x9, :lo12:.LCPI17_1 ; CHECK-BE-NEXT: ld1 { v1.16b }, [x9] -; CHECK-BE-NEXT: add x9, x0, #8 +; CHECK-BE-NEXT: add x9, x1, #64 +; CHECK-BE-NEXT: add x10, x0, #8 ; CHECK-BE-NEXT: .LBB17_1: // %loop ; CHECK-BE-NEXT: // =>This Inner Loop Header: Depth=1 -; CHECK-BE-NEXT: sub x10, x9, #8 -; CHECK-BE-NEXT: ld1 { v2.8b }, [x9] -; CHECK-BE-NEXT: add x9, x9, #16 -; CHECK-BE-NEXT: ld1 { v3.8b }, [x10] -; CHECK-BE-NEXT: add x10, x1, x8 -; CHECK-BE-NEXT: add x8, x8, #128 -; CHECK-BE-NEXT: add x11, x10, #32 -; CHECK-BE-NEXT: add x14, x10, #64 -; CHECK-BE-NEXT: add x15, x10, #96 +; CHECK-BE-NEXT: ld1 { v2.8b }, [x10] +; CHECK-BE-NEXT: sub x11, x10, #8 +; CHECK-BE-NEXT: add x15, x9, #32 +; CHECK-BE-NEXT: ld1 { v3.8b }, [x11] +; CHECK-BE-NEXT: ld1 { v16.2d }, [x15] +; CHECK-BE-NEXT: sub x11, x9, #64 +; CHECK-BE-NEXT: sub x12, x9, #32 +; CHECK-BE-NEXT: ld1 { v6.2d }, [x9] +; CHECK-BE-NEXT: ld1 { v21.2d }, [x11] ; CHECK-BE-NEXT: tbl v4.16b, { v2.16b }, v1.16b ; CHECK-BE-NEXT: tbl v2.16b, { v2.16b }, v0.16b -; CHECK-BE-NEXT: ld1 { v5.2d }, [x10] -; CHECK-BE-NEXT: tbl v6.16b, { v3.16b }, v1.16b +; CHECK-BE-NEXT: ld1 { v19.2d }, [x12] +; CHECK-BE-NEXT: tbl v5.16b, { v3.16b }, v1.16b ; CHECK-BE-NEXT: tbl v3.16b, { v3.16b }, v0.16b -; CHECK-BE-NEXT: ld1 { v16.2d }, [x15] -; CHECK-BE-NEXT: ld1 { v19.2d }, [x14] -; CHECK-BE-NEXT: ld1 { v21.2d }, [x11] -; CHECK-BE-NEXT: add x12, x10, #48 -; CHECK-BE-NEXT: add x13, x10, #16 -; CHECK-BE-NEXT: add x16, x10, #112 -; CHECK-BE-NEXT: add x17, x10, #80 +; CHECK-BE-NEXT: sub x13, x9, #16 +; CHECK-BE-NEXT: sub x14, x9, #48 +; CHECK-BE-NEXT: add x16, x9, #48 +; CHECK-BE-NEXT: add x17, x9, #16 +; CHECK-BE-NEXT: ld1 { v22.2d }, [x13] +; CHECK-BE-NEXT: subs x8, x8, #16 +; CHECK-BE-NEXT: add x10, x10, #16 ; CHECK-BE-NEXT: rev32 v7.8b, v4.8b ; CHECK-BE-NEXT: ext v4.16b, v4.16b, v4.16b, #8 ; CHECK-BE-NEXT: rev32 v17.8b, v2.8b -; CHECK-BE-NEXT: ext v18.16b, 
v6.16b, v6.16b, #8 +; CHECK-BE-NEXT: ext v18.16b, v5.16b, v5.16b, #8 ; CHECK-BE-NEXT: ext v20.16b, v3.16b, v3.16b, #8 ; CHECK-BE-NEXT: ext v2.16b, v2.16b, v2.16b, #8 -; CHECK-BE-NEXT: rev32 v6.8b, v6.8b +; CHECK-BE-NEXT: rev32 v5.8b, v5.8b ; CHECK-BE-NEXT: rev32 v3.8b, v3.8b -; CHECK-BE-NEXT: ld1 { v22.2d }, [x12] -; CHECK-BE-NEXT: cmp x8, #1024 -; CHECK-BE-NEXT: rev32 v4.8b, v4.8b ; CHECK-BE-NEXT: uaddw v7.2d, v16.2d, v7.2s -; CHECK-BE-NEXT: ld1 { v16.2d }, [x16] -; CHECK-BE-NEXT: rev32 v18.8b, v18.8b +; CHECK-BE-NEXT: rev32 v4.8b, v4.8b +; CHECK-BE-NEXT: uaddw v6.2d, v6.2d, v17.2s +; CHECK-BE-NEXT: rev32 v17.8b, v18.8b ; CHECK-BE-NEXT: rev32 v20.8b, v20.8b ; CHECK-BE-NEXT: rev32 v2.8b, v2.8b -; CHECK-BE-NEXT: uaddw v17.2d, v19.2d, v17.2s -; CHECK-BE-NEXT: ld1 { v19.2d }, [x13] -; CHECK-BE-NEXT: uaddw v6.2d, v21.2d, v6.2s -; CHECK-BE-NEXT: uaddw v3.2d, v5.2d, v3.2s -; CHECK-BE-NEXT: ld1 { v5.2d }, [x17] +; CHECK-BE-NEXT: ld1 { v16.2d }, [x16] +; CHECK-BE-NEXT: ld1 { v18.2d }, [x14] +; CHECK-BE-NEXT: uaddw v5.2d, v19.2d, v5.2s +; CHECK-BE-NEXT: uaddw v3.2d, v21.2d, v3.2s ; CHECK-BE-NEXT: st1 { v7.2d }, [x15] +; CHECK-BE-NEXT: ld1 { v7.2d }, [x17] +; CHECK-BE-NEXT: st1 { v6.2d }, [x9] +; CHECK-BE-NEXT: add x9, x9, #128 ; CHECK-BE-NEXT: uaddw v4.2d, v16.2d, v4.2s -; CHECK-BE-NEXT: st1 { v6.2d }, [x11] -; CHECK-BE-NEXT: uaddw v6.2d, v22.2d, v18.2s -; CHECK-BE-NEXT: st1 { v3.2d }, [x10] -; CHECK-BE-NEXT: uaddw v3.2d, v19.2d, v20.2s -; CHECK-BE-NEXT: uaddw v2.2d, v5.2d, v2.2s -; CHECK-BE-NEXT: st1 { v17.2d }, [x14] +; CHECK-BE-NEXT: st1 { v5.2d }, [x12] +; CHECK-BE-NEXT: uaddw v5.2d, v22.2d, v17.2s +; CHECK-BE-NEXT: st1 { v3.2d }, [x11] +; CHECK-BE-NEXT: uaddw v3.2d, v18.2d, v20.2s +; CHECK-BE-NEXT: uaddw v2.2d, v7.2d, v2.2s ; CHECK-BE-NEXT: st1 { v4.2d }, [x16] -; CHECK-BE-NEXT: st1 { v6.2d }, [x12] -; CHECK-BE-NEXT: st1 { v3.2d }, [x13] +; CHECK-BE-NEXT: st1 { v5.2d }, [x13] +; CHECK-BE-NEXT: st1 { v3.2d }, [x14] ; CHECK-BE-NEXT: st1 { v2.2d }, [x17] ; CHECK-BE-NEXT: 
b.ne .LBB17_1 ; CHECK-BE-NEXT: // %bb.2: // %exit @@ -1813,14 +1812,14 @@ exit: define void @zext_v16i8_to_v16i64_in_sequence_in_loop(ptr %src, ptr %dst) { ; CHECK-LABEL: zext_v16i8_to_v16i64_in_sequence_in_loop: ; CHECK: ; %bb.0: ; %entry -; CHECK-NEXT: mov x8, xzr +; CHECK-NEXT: mov w8, #128 ; =0x80 ; CHECK-NEXT: add x9, x1, #128 +; CHECK-NEXT: add x10, x0, #16 ; CHECK-NEXT: LBB18_1: ; %loop ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: add x10, x0, x8 -; CHECK-NEXT: add x8, x8, #16 -; CHECK-NEXT: ldp q0, q1, [x10] -; CHECK-NEXT: cmp x8, #128 +; CHECK-NEXT: ldp q0, q1, [x10, #-16] +; CHECK-NEXT: subs x8, x8, #16 +; CHECK-NEXT: add x10, x10, #16 ; CHECK-NEXT: ushll2.8h v2, v0, #0 ; CHECK-NEXT: ushll.8h v0, v0, #0 ; CHECK-NEXT: ushll2.8h v6, v1, #0 @@ -1863,18 +1862,18 @@ define void @zext_v16i8_to_v16i64_in_sequence_in_loop(ptr %src, ptr %dst) { ; ; CHECK-BE-LABEL: zext_v16i8_to_v16i64_in_sequence_in_loop: ; CHECK-BE: // %bb.0: // %entry -; CHECK-BE-NEXT: mov x8, xzr +; CHECK-BE-NEXT: mov w8, #128 // =0x80 ; CHECK-BE-NEXT: add x9, x1, #128 +; CHECK-BE-NEXT: add x10, x0, #16 ; CHECK-BE-NEXT: .LBB18_1: // %loop ; CHECK-BE-NEXT: // =>This Inner Loop Header: Depth=1 -; CHECK-BE-NEXT: add x10, x0, x8 -; CHECK-BE-NEXT: sub x11, x9, #32 -; CHECK-BE-NEXT: add x8, x8, #16 -; CHECK-BE-NEXT: ld1 { v0.16b }, [x10] -; CHECK-BE-NEXT: add x10, x10, #16 -; CHECK-BE-NEXT: cmp x8, #128 +; CHECK-BE-NEXT: sub x11, x10, #16 ; CHECK-BE-NEXT: ld1 { v5.16b }, [x10] -; CHECK-BE-NEXT: sub x10, x9, #16 +; CHECK-BE-NEXT: sub x12, x9, #32 +; CHECK-BE-NEXT: ld1 { v0.16b }, [x11] +; CHECK-BE-NEXT: sub x11, x9, #16 +; CHECK-BE-NEXT: subs x8, x8, #16 +; CHECK-BE-NEXT: add x10, x10, #16 ; CHECK-BE-NEXT: ushll2 v1.8h, v0.16b, #0 ; CHECK-BE-NEXT: ushll v0.8h, v0.8b, #0 ; CHECK-BE-NEXT: ushll2 v2.4s, v1.8h, #0 @@ -1885,54 +1884,54 @@ define void @zext_v16i8_to_v16i64_in_sequence_in_loop(ptr %src, ptr %dst) { ; CHECK-BE-NEXT: ushll v2.2d, v2.2s, #0 ; CHECK-BE-NEXT: ushll2 v6.2d, 
v1.4s, #0 ; CHECK-BE-NEXT: ushll v1.2d, v1.2s, #0 -; CHECK-BE-NEXT: st1 { v4.2d }, [x10] +; CHECK-BE-NEXT: st1 { v4.2d }, [x11] ; CHECK-BE-NEXT: ushll2 v4.2d, v3.4s, #0 ; CHECK-BE-NEXT: ushll v3.2d, v3.2s, #0 -; CHECK-BE-NEXT: st1 { v2.2d }, [x11] +; CHECK-BE-NEXT: st1 { v2.2d }, [x12] ; CHECK-BE-NEXT: ushll2 v2.8h, v5.16b, #0 -; CHECK-BE-NEXT: sub x11, x9, #80 -; CHECK-BE-NEXT: sub x10, x9, #48 -; CHECK-BE-NEXT: st1 { v4.2d }, [x11] +; CHECK-BE-NEXT: sub x12, x9, #80 +; CHECK-BE-NEXT: sub x11, x9, #48 +; CHECK-BE-NEXT: st1 { v4.2d }, [x12] ; CHECK-BE-NEXT: ushll v4.8h, v5.8b, #0 -; CHECK-BE-NEXT: sub x11, x9, #64 +; CHECK-BE-NEXT: sub x12, x9, #64 ; CHECK-BE-NEXT: ushll2 v5.4s, v2.8h, #0 -; CHECK-BE-NEXT: st1 { v1.2d }, [x11] -; CHECK-BE-NEXT: sub x11, x9, #96 +; CHECK-BE-NEXT: st1 { v1.2d }, [x12] +; CHECK-BE-NEXT: sub x12, x9, #96 ; CHECK-BE-NEXT: ushll2 v1.2d, v0.4s, #0 ; CHECK-BE-NEXT: ushll v2.4s, v2.4h, #0 ; CHECK-BE-NEXT: ushll v0.2d, v0.2s, #0 -; CHECK-BE-NEXT: st1 { v6.2d }, [x10] -; CHECK-BE-NEXT: sub x10, x9, #128 -; CHECK-BE-NEXT: st1 { v3.2d }, [x11] +; CHECK-BE-NEXT: st1 { v6.2d }, [x11] +; CHECK-BE-NEXT: sub x11, x9, #128 +; CHECK-BE-NEXT: st1 { v3.2d }, [x12] ; CHECK-BE-NEXT: ushll2 v3.4s, v4.8h, #0 ; CHECK-BE-NEXT: ushll2 v6.2d, v5.4s, #0 -; CHECK-BE-NEXT: sub x11, x9, #112 +; CHECK-BE-NEXT: sub x12, x9, #112 ; CHECK-BE-NEXT: ushll v5.2d, v5.2s, #0 -; CHECK-BE-NEXT: st1 { v0.2d }, [x10] -; CHECK-BE-NEXT: st1 { v1.2d }, [x11] +; CHECK-BE-NEXT: st1 { v0.2d }, [x11] +; CHECK-BE-NEXT: st1 { v1.2d }, [x12] ; CHECK-BE-NEXT: ushll2 v1.2d, v2.4s, #0 -; CHECK-BE-NEXT: add x10, x9, #112 +; CHECK-BE-NEXT: add x11, x9, #112 ; CHECK-BE-NEXT: ushll v4.4s, v4.4h, #0 ; CHECK-BE-NEXT: ushll2 v0.2d, v3.4s, #0 -; CHECK-BE-NEXT: st1 { v6.2d }, [x10] -; CHECK-BE-NEXT: add x10, x9, #96 +; CHECK-BE-NEXT: st1 { v6.2d }, [x11] +; CHECK-BE-NEXT: add x11, x9, #96 ; CHECK-BE-NEXT: ushll v2.2d, v2.2s, #0 ; CHECK-BE-NEXT: ushll v3.2d, v3.2s, #0 -; CHECK-BE-NEXT: st1 { v5.2d }, 
[x10] -; CHECK-BE-NEXT: add x10, x9, #80 -; CHECK-BE-NEXT: st1 { v1.2d }, [x10] -; CHECK-BE-NEXT: add x10, x9, #48 +; CHECK-BE-NEXT: st1 { v5.2d }, [x11] +; CHECK-BE-NEXT: add x11, x9, #80 +; CHECK-BE-NEXT: st1 { v1.2d }, [x11] +; CHECK-BE-NEXT: add x11, x9, #48 ; CHECK-BE-NEXT: ushll2 v1.2d, v4.4s, #0 -; CHECK-BE-NEXT: st1 { v0.2d }, [x10] +; CHECK-BE-NEXT: st1 { v0.2d }, [x11] ; CHECK-BE-NEXT: ushll v0.2d, v4.2s, #0 -; CHECK-BE-NEXT: add x10, x9, #64 -; CHECK-BE-NEXT: st1 { v2.2d }, [x10] -; CHECK-BE-NEXT: add x10, x9, #32 -; CHECK-BE-NEXT: st1 { v3.2d }, [x10] -; CHECK-BE-NEXT: add x10, x9, #16 +; CHECK-BE-NEXT: add x11, x9, #64 +; CHECK-BE-NEXT: st1 { v2.2d }, [x11] +; CHECK-BE-NEXT: add x11, x9, #32 +; CHECK-BE-NEXT: st1 { v3.2d }, [x11] +; CHECK-BE-NEXT: add x11, x9, #16 ; CHECK-BE-NEXT: st1 { v0.2d }, [x9] ; CHECK-BE-NEXT: add x9, x9, #128 -; CHECK-BE-NEXT: st1 { v1.2d }, [x10] +; CHECK-BE-NEXT: st1 { v1.2d }, [x11] ; CHECK-BE-NEXT: b.ne .LBB18_1 ; CHECK-BE-NEXT: // %bb.2: // %exit ; CHECK-BE-NEXT: ret diff --git a/llvm/test/CodeGen/AMDGPU/directive-amdgcn-target.ll b/llvm/test/CodeGen/AMDGPU/directive-amdgcn-target.ll index c347437..40d77a7 100644 --- a/llvm/test/CodeGen/AMDGPU/directive-amdgcn-target.ll +++ b/llvm/test/CodeGen/AMDGPU/directive-amdgcn-target.ll @@ -105,6 +105,7 @@ ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1103 < %s | FileCheck --check-prefixes=GFX1103 %s ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1150 < %s | FileCheck --check-prefixes=GFX1150 %s ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1151 < %s | FileCheck --check-prefixes=GFX1151 %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1152 < %s | FileCheck --check-prefixes=GFX1152 %s ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 < %s | FileCheck --check-prefixes=GFX1200 %s ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1201 < %s | FileCheck --check-prefixes=GFX1201 %s @@ -201,6 +202,7 @@ ; GFX1103: .amdgcn_target "amdgcn-amd-amdhsa--gfx1103" ; GFX1150: .amdgcn_target 
"amdgcn-amd-amdhsa--gfx1150" ; GFX1151: .amdgcn_target "amdgcn-amd-amdhsa--gfx1151" +; GFX1152: .amdgcn_target "amdgcn-amd-amdhsa--gfx1152" ; GFX1200: .amdgcn_target "amdgcn-amd-amdhsa--gfx1200" ; GFX1201: .amdgcn_target "amdgcn-amd-amdhsa--gfx1201" diff --git a/llvm/test/CodeGen/AMDGPU/elf-header-flags-mach.ll b/llvm/test/CodeGen/AMDGPU/elf-header-flags-mach.ll index edc2015..560a05a 100644 --- a/llvm/test/CodeGen/AMDGPU/elf-header-flags-mach.ll +++ b/llvm/test/CodeGen/AMDGPU/elf-header-flags-mach.ll @@ -74,6 +74,7 @@ ; RUN: llc -filetype=obj -mtriple=amdgcn -mcpu=gfx1103 < %s | llvm-readobj --file-header - | FileCheck --check-prefixes=ALL,ARCH-GCN,GFX1103 %s ; RUN: llc -filetype=obj -mtriple=amdgcn -mcpu=gfx1150 < %s | llvm-readobj --file-header - | FileCheck --check-prefixes=ALL,ARCH-GCN,GFX1150 %s ; RUN: llc -filetype=obj -mtriple=amdgcn -mcpu=gfx1151 < %s | llvm-readobj --file-header - | FileCheck --check-prefixes=ALL,ARCH-GCN,GFX1151 %s +; RUN: llc -filetype=obj -mtriple=amdgcn -mcpu=gfx1152 < %s | llvm-readobj --file-header - | FileCheck --check-prefixes=ALL,ARCH-GCN,GFX1152 %s ; RUN: llc -filetype=obj -mtriple=amdgcn -mcpu=gfx1200 < %s | llvm-readobj --file-header - | FileCheck --check-prefixes=ALL,ARCH-GCN,GFX1200 %s ; RUN: llc -filetype=obj -mtriple=amdgcn -mcpu=gfx1201 < %s | llvm-readobj --file-header - | FileCheck --check-prefixes=ALL,ARCH-GCN,GFX1201 %s @@ -153,6 +154,7 @@ ; GFX1103: EF_AMDGPU_MACH_AMDGCN_GFX1103 (0x44) ; GFX1150: EF_AMDGPU_MACH_AMDGCN_GFX1150 (0x43) ; GFX1151: EF_AMDGPU_MACH_AMDGCN_GFX1151 (0x4A) +; GFX1152: EF_AMDGPU_MACH_AMDGCN_GFX1152 (0x55) ; GFX1200: EF_AMDGPU_MACH_AMDGCN_GFX1200 (0x48) ; GFX1201: EF_AMDGPU_MACH_AMDGCN_GFX1201 (0x4E) diff --git a/llvm/test/CodeGen/AMDGPU/expand-variadic-call.ll b/llvm/test/CodeGen/AMDGPU/expand-variadic-call.ll new file mode 100644 index 0000000..ce55558 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/expand-variadic-call.ll @@ -0,0 +1,545 @@ +; NOTE: Assertions have been autogenerated by 
utils/update_test_checks.py UTC_ARGS: -p --function-signature +; RUN: opt -S --passes=expand-variadics --expand-variadics-override=lowering < %s | FileCheck %s +; REQUIRES: amdgpu-registered-target +target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-p7:160:256:256:32-p8:128:128-p9:192:256:256:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7:8:9" +target triple = "amdgcn-amd-amdhsa" + +; Check the variables are lowered to the locations this target expects + +; The types show the call frames +; CHECK: %single_i32.vararg = type <{ i32 }> +; CHECK: %single_double.vararg = type <{ double }> +; CHECK: %single_v4f32.vararg = type <{ <4 x float> }> +; CHECK: %single_v8f32.vararg = type <{ <8 x float> }> +; CHECK: %single_v16f32.vararg = type <{ <16 x float> }> +; CHECK: %single_v32f32.vararg = type <{ <32 x float> }> +; CHECK: %i32_double.vararg = type <{ i32, double }> +; CHECK: %double_i32.vararg = type <{ double, i32 }> +; CHECK: %i32_libcS.vararg = type <{ i32, %struct.libcS }> +; CHECK: %libcS_i32.vararg = type <{ %struct.libcS, i32 }> +; CHECK: %i32_v4f32.vararg = type <{ i32, <4 x float> }> +; CHECK: %v4f32_i32.vararg = type <{ <4 x float>, i32 }> +; CHECK: %i32_v8f32.vararg = type <{ i32, <8 x float> }> +; CHECK: %v8f32_i32.vararg = type <{ <8 x float>, i32 }> +; CHECK: %i32_v16f32.vararg = type <{ i32, <16 x float> }> +; CHECK: %v16f32_i32.vararg = type <{ <16 x float>, i32 }> +; CHECK: %i32_v32f32.vararg = type <{ i32, <32 x float> }> +; CHECK: %v32f32_i32.vararg = type <{ <32 x float>, i32 }> +; CHECK: %fptr_single_i32.vararg = type <{ i32 }> +; CHECK: %fptr_libcS.vararg = type <{ %struct.libcS }> + +%struct.libcS = type { i8, i16, i32, i64, float, double } + +@vararg_ptr = hidden addrspace(1) global ptr @vararg, align 8 + +define hidden void @copy(ptr noundef %va) { +; CHECK-LABEL: define {{[^@]+}}@copy(ptr noundef %va) { +; CHECK-NEXT: entry: +; CHECK-NEXT: 
%va.addr = alloca ptr, align 8, addrspace(5) +; CHECK-NEXT: %cp = alloca ptr, align 8, addrspace(5) +; CHECK-NEXT: %va.addr.ascast = addrspacecast ptr addrspace(5) %va.addr to ptr +; CHECK-NEXT: %cp.ascast = addrspacecast ptr addrspace(5) %cp to ptr +; CHECK-NEXT: store ptr %va, ptr addrspace(5) %va.addr, align 8 +; CHECK-NEXT: call void @llvm.lifetime.start.p5(i64 8, ptr addrspace(5) %cp) +; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr %cp.ascast, ptr %va.addr.ascast, i32 8, i1 false) +; CHECK-NEXT: %0 = load ptr, ptr addrspace(5) %cp, align 8 +; CHECK-NEXT: call void @valist(ptr noundef %0) +; CHECK-NEXT: call void @llvm.lifetime.end.p5(i64 8, ptr addrspace(5) %cp) +; CHECK-NEXT: ret void +; +entry: + %va.addr = alloca ptr, align 8, addrspace(5) + %cp = alloca ptr, align 8, addrspace(5) + %va.addr.ascast = addrspacecast ptr addrspace(5) %va.addr to ptr + %cp.ascast = addrspacecast ptr addrspace(5) %cp to ptr + store ptr %va, ptr addrspace(5) %va.addr, align 8 + call void @llvm.lifetime.start.p5(i64 8, ptr addrspace(5) %cp) + call void @llvm.va_copy.p0(ptr %cp.ascast, ptr nonnull %va.addr.ascast) + %0 = load ptr, ptr addrspace(5) %cp, align 8 + call void @valist(ptr noundef %0) + call void @llvm.lifetime.end.p5(i64 8, ptr addrspace(5) %cp) + ret void +} + +declare void @llvm.lifetime.start.p5(i64 immarg, ptr addrspace(5) nocapture) + +declare void @llvm.va_copy.p0(ptr, ptr) + +declare hidden void @valist(ptr noundef) + +declare void @llvm.lifetime.end.p5(i64 immarg, ptr addrspace(5) nocapture) + +define hidden void @start_once(...) 
{ +; CHECK-LABEL: define {{[^@]+}}@start_once(ptr %varargs) { +; CHECK-NEXT: entry: +; CHECK-NEXT: %s = alloca ptr, align 8, addrspace(5) +; CHECK-NEXT: %s.ascast = addrspacecast ptr addrspace(5) %s to ptr +; CHECK-NEXT: call void @llvm.lifetime.start.p5(i64 8, ptr addrspace(5) %s) +; CHECK-NEXT: store ptr %varargs, ptr %s.ascast, align 8 +; CHECK-NEXT: %0 = load ptr, ptr addrspace(5) %s, align 8 +; CHECK-NEXT: call void @valist(ptr noundef %0) +; CHECK-NEXT: call void @llvm.lifetime.end.p5(i64 8, ptr addrspace(5) %s) +; CHECK-NEXT: ret void +; +entry: + %s = alloca ptr, align 8, addrspace(5) + %s.ascast = addrspacecast ptr addrspace(5) %s to ptr + call void @llvm.lifetime.start.p5(i64 8, ptr addrspace(5) %s) + call void @llvm.va_start.p0(ptr %s.ascast) + %0 = load ptr, ptr addrspace(5) %s, align 8 + call void @valist(ptr noundef %0) + call void @llvm.va_end.p0(ptr %s.ascast) + call void @llvm.lifetime.end.p5(i64 8, ptr addrspace(5) %s) + ret void +} + +declare void @llvm.va_start.p0(ptr) + +declare void @llvm.va_end.p0(ptr) + +define hidden void @start_twice(...) 
{ +; CHECK-LABEL: define {{[^@]+}}@start_twice(ptr %varargs) { +; CHECK-NEXT: entry: +; CHECK-NEXT: %s0 = alloca ptr, align 8, addrspace(5) +; CHECK-NEXT: %s1 = alloca ptr, align 8, addrspace(5) +; CHECK-NEXT: %s0.ascast = addrspacecast ptr addrspace(5) %s0 to ptr +; CHECK-NEXT: %s1.ascast = addrspacecast ptr addrspace(5) %s1 to ptr +; CHECK-NEXT: call void @llvm.lifetime.start.p5(i64 8, ptr addrspace(5) %s0) +; CHECK-NEXT: call void @llvm.lifetime.start.p5(i64 8, ptr addrspace(5) %s1) +; CHECK-NEXT: store ptr %varargs, ptr %s0.ascast, align 8 +; CHECK-NEXT: %0 = load ptr, ptr addrspace(5) %s0, align 8 +; CHECK-NEXT: call void @valist(ptr noundef %0) +; CHECK-NEXT: store ptr %varargs, ptr %s1.ascast, align 8 +; CHECK-NEXT: %1 = load ptr, ptr addrspace(5) %s1, align 8 +; CHECK-NEXT: call void @valist(ptr noundef %1) +; CHECK-NEXT: call void @llvm.lifetime.end.p5(i64 8, ptr addrspace(5) %s1) +; CHECK-NEXT: call void @llvm.lifetime.end.p5(i64 8, ptr addrspace(5) %s0) +; CHECK-NEXT: ret void +; +entry: + %s0 = alloca ptr, align 8, addrspace(5) + %s1 = alloca ptr, align 8, addrspace(5) + %s0.ascast = addrspacecast ptr addrspace(5) %s0 to ptr + %s1.ascast = addrspacecast ptr addrspace(5) %s1 to ptr + call void @llvm.lifetime.start.p5(i64 8, ptr addrspace(5) %s0) + call void @llvm.lifetime.start.p5(i64 8, ptr addrspace(5) %s1) + call void @llvm.va_start.p0(ptr %s0.ascast) + %0 = load ptr, ptr addrspace(5) %s0, align 8 + call void @valist(ptr noundef %0) + call void @llvm.va_end.p0(ptr %s0.ascast) + call void @llvm.va_start.p0(ptr %s1.ascast) + %1 = load ptr, ptr addrspace(5) %s1, align 8 + call void @valist(ptr noundef %1) + call void @llvm.va_end.p0(ptr %s1.ascast) + call void @llvm.lifetime.end.p5(i64 8, ptr addrspace(5) %s1) + call void @llvm.lifetime.end.p5(i64 8, ptr addrspace(5) %s0) + ret void +} + +define hidden void @single_i32(i32 noundef %x) { +; CHECK-LABEL: define {{[^@]+}}@single_i32(i32 noundef %x) { +; CHECK-NEXT: entry: +; CHECK-NEXT: %vararg_buffer = 
alloca %single_i32.vararg, align 4, addrspace(5) +; CHECK-NEXT: call void @llvm.lifetime.start.p5(i64 4, ptr addrspace(5) %vararg_buffer) +; CHECK-NEXT: %0 = getelementptr inbounds %single_i32.vararg, ptr addrspace(5) %vararg_buffer, i32 0, i32 0 +; CHECK-NEXT: store i32 %x, ptr addrspace(5) %0, align 4 +; CHECK-NEXT: %1 = addrspacecast ptr addrspace(5) %vararg_buffer to ptr +; CHECK-NEXT: call void @vararg(ptr %1) +; CHECK-NEXT: call void @llvm.lifetime.end.p5(i64 4, ptr addrspace(5) %vararg_buffer) +; CHECK-NEXT: ret void +; +entry: + tail call void (...) @vararg(i32 noundef %x) + ret void +} + +declare hidden void @vararg(...) + +define hidden void @single_double(double noundef %x) { +; CHECK-LABEL: define {{[^@]+}}@single_double(double noundef %x) { +; CHECK-NEXT: entry: +; CHECK-NEXT: %vararg_buffer = alloca %single_double.vararg, align 4, addrspace(5) +; CHECK-NEXT: call void @llvm.lifetime.start.p5(i64 8, ptr addrspace(5) %vararg_buffer) +; CHECK-NEXT: %0 = getelementptr inbounds %single_double.vararg, ptr addrspace(5) %vararg_buffer, i32 0, i32 0 +; CHECK-NEXT: store double %x, ptr addrspace(5) %0, align 8 +; CHECK-NEXT: %1 = addrspacecast ptr addrspace(5) %vararg_buffer to ptr +; CHECK-NEXT: call void @vararg(ptr %1) +; CHECK-NEXT: call void @llvm.lifetime.end.p5(i64 8, ptr addrspace(5) %vararg_buffer) +; CHECK-NEXT: ret void +; +entry: + tail call void (...) 
@vararg(double noundef %x) + ret void +} + +define hidden void @single_v4f32(<4 x float> noundef %x) { +; CHECK-LABEL: define {{[^@]+}}@single_v4f32(<4 x float> noundef %x) { +; CHECK-NEXT: entry: +; CHECK-NEXT: %vararg_buffer = alloca %single_v4f32.vararg, align 4, addrspace(5) +; CHECK-NEXT: call void @llvm.lifetime.start.p5(i64 16, ptr addrspace(5) %vararg_buffer) +; CHECK-NEXT: %0 = getelementptr inbounds %single_v4f32.vararg, ptr addrspace(5) %vararg_buffer, i32 0, i32 0 +; CHECK-NEXT: store <4 x float> %x, ptr addrspace(5) %0, align 16 +; CHECK-NEXT: %1 = addrspacecast ptr addrspace(5) %vararg_buffer to ptr +; CHECK-NEXT: call void @vararg(ptr %1) +; CHECK-NEXT: call void @llvm.lifetime.end.p5(i64 16, ptr addrspace(5) %vararg_buffer) +; CHECK-NEXT: ret void +; +entry: + tail call void (...) @vararg(<4 x float> noundef %x) + ret void +} + +define hidden void @single_v8f32(<8 x float> noundef %x) { +; CHECK-LABEL: define {{[^@]+}}@single_v8f32(<8 x float> noundef %x) { +; CHECK-NEXT: entry: +; CHECK-NEXT: %vararg_buffer = alloca %single_v8f32.vararg, align 4, addrspace(5) +; CHECK-NEXT: call void @llvm.lifetime.start.p5(i64 32, ptr addrspace(5) %vararg_buffer) +; CHECK-NEXT: %0 = getelementptr inbounds %single_v8f32.vararg, ptr addrspace(5) %vararg_buffer, i32 0, i32 0 +; CHECK-NEXT: store <8 x float> %x, ptr addrspace(5) %0, align 32 +; CHECK-NEXT: %1 = addrspacecast ptr addrspace(5) %vararg_buffer to ptr +; CHECK-NEXT: call void @vararg(ptr %1) +; CHECK-NEXT: call void @llvm.lifetime.end.p5(i64 32, ptr addrspace(5) %vararg_buffer) +; CHECK-NEXT: ret void +; +entry: + tail call void (...) 
@vararg(<8 x float> noundef %x) + ret void +} + +define hidden void @single_v16f32(<16 x float> noundef %x) { +; CHECK-LABEL: define {{[^@]+}}@single_v16f32(<16 x float> noundef %x) { +; CHECK-NEXT: entry: +; CHECK-NEXT: %vararg_buffer = alloca %single_v16f32.vararg, align 4, addrspace(5) +; CHECK-NEXT: call void @llvm.lifetime.start.p5(i64 64, ptr addrspace(5) %vararg_buffer) +; CHECK-NEXT: %0 = getelementptr inbounds %single_v16f32.vararg, ptr addrspace(5) %vararg_buffer, i32 0, i32 0 +; CHECK-NEXT: store <16 x float> %x, ptr addrspace(5) %0, align 64 +; CHECK-NEXT: %1 = addrspacecast ptr addrspace(5) %vararg_buffer to ptr +; CHECK-NEXT: call void @vararg(ptr %1) +; CHECK-NEXT: call void @llvm.lifetime.end.p5(i64 64, ptr addrspace(5) %vararg_buffer) +; CHECK-NEXT: ret void +; +entry: + tail call void (...) @vararg(<16 x float> noundef %x) + ret void +} + +define hidden void @single_v32f32(<32 x float> noundef %x) { +; CHECK-LABEL: define {{[^@]+}}@single_v32f32(<32 x float> noundef %x) { +; CHECK-NEXT: entry: +; CHECK-NEXT: %vararg_buffer = alloca %single_v32f32.vararg, align 4, addrspace(5) +; CHECK-NEXT: call void @llvm.lifetime.start.p5(i64 128, ptr addrspace(5) %vararg_buffer) +; CHECK-NEXT: %0 = getelementptr inbounds %single_v32f32.vararg, ptr addrspace(5) %vararg_buffer, i32 0, i32 0 +; CHECK-NEXT: store <32 x float> %x, ptr addrspace(5) %0, align 128 +; CHECK-NEXT: %1 = addrspacecast ptr addrspace(5) %vararg_buffer to ptr +; CHECK-NEXT: call void @vararg(ptr %1) +; CHECK-NEXT: call void @llvm.lifetime.end.p5(i64 128, ptr addrspace(5) %vararg_buffer) +; CHECK-NEXT: ret void +; +entry: + tail call void (...) 
@vararg(<32 x float> noundef %x) + ret void +} + +define hidden void @i32_double(i32 noundef %x, double noundef %y) { +; CHECK-LABEL: define {{[^@]+}}@i32_double(i32 noundef %x, double noundef %y) { +; CHECK-NEXT: entry: +; CHECK-NEXT: %vararg_buffer = alloca %i32_double.vararg, align 4, addrspace(5) +; CHECK-NEXT: call void @llvm.lifetime.start.p5(i64 12, ptr addrspace(5) %vararg_buffer) +; CHECK-NEXT: %0 = getelementptr inbounds %i32_double.vararg, ptr addrspace(5) %vararg_buffer, i32 0, i32 0 +; CHECK-NEXT: store i32 %x, ptr addrspace(5) %0, align 4 +; CHECK-NEXT: %1 = getelementptr inbounds %i32_double.vararg, ptr addrspace(5) %vararg_buffer, i32 0, i32 1 +; CHECK-NEXT: store double %y, ptr addrspace(5) %1, align 8 +; CHECK-NEXT: %2 = addrspacecast ptr addrspace(5) %vararg_buffer to ptr +; CHECK-NEXT: call void @vararg(ptr %2) +; CHECK-NEXT: call void @llvm.lifetime.end.p5(i64 12, ptr addrspace(5) %vararg_buffer) +; CHECK-NEXT: ret void +; +entry: + tail call void (...) @vararg(i32 noundef %x, double noundef %y) + ret void +} + +define hidden void @double_i32(double noundef %x, i32 noundef %y) { +; CHECK-LABEL: define {{[^@]+}}@double_i32(double noundef %x, i32 noundef %y) { +; CHECK-NEXT: entry: +; CHECK-NEXT: %vararg_buffer = alloca %double_i32.vararg, align 4, addrspace(5) +; CHECK-NEXT: call void @llvm.lifetime.start.p5(i64 12, ptr addrspace(5) %vararg_buffer) +; CHECK-NEXT: %0 = getelementptr inbounds %double_i32.vararg, ptr addrspace(5) %vararg_buffer, i32 0, i32 0 +; CHECK-NEXT: store double %x, ptr addrspace(5) %0, align 8 +; CHECK-NEXT: %1 = getelementptr inbounds %double_i32.vararg, ptr addrspace(5) %vararg_buffer, i32 0, i32 1 +; CHECK-NEXT: store i32 %y, ptr addrspace(5) %1, align 4 +; CHECK-NEXT: %2 = addrspacecast ptr addrspace(5) %vararg_buffer to ptr +; CHECK-NEXT: call void @vararg(ptr %2) +; CHECK-NEXT: call void @llvm.lifetime.end.p5(i64 12, ptr addrspace(5) %vararg_buffer) +; CHECK-NEXT: ret void +; +entry: + tail call void (...) 
@vararg(double noundef %x, i32 noundef %y) + ret void +} + +define hidden void @i32_libcS(i32 noundef %x, i8 %y.coerce0, i16 %y.coerce1, i32 %y.coerce2, i64 %y.coerce3, float %y.coerce4, double %y.coerce5) { +; CHECK-LABEL: define {{[^@]+}}@i32_libcS(i32 noundef %x, i8 %y.coerce0, i16 %y.coerce1, i32 %y.coerce2, i64 %y.coerce3, float %y.coerce4, double %y.coerce5) { +; CHECK-NEXT: entry: +; CHECK-NEXT: %vararg_buffer = alloca %i32_libcS.vararg, align 4, addrspace(5) +; CHECK-NEXT: %.fca.0.insert = insertvalue %struct.libcS poison, i8 %y.coerce0, 0 +; CHECK-NEXT: %.fca.1.insert = insertvalue %struct.libcS %.fca.0.insert, i16 %y.coerce1, 1 +; CHECK-NEXT: %.fca.2.insert = insertvalue %struct.libcS %.fca.1.insert, i32 %y.coerce2, 2 +; CHECK-NEXT: %.fca.3.insert = insertvalue %struct.libcS %.fca.2.insert, i64 %y.coerce3, 3 +; CHECK-NEXT: %.fca.4.insert = insertvalue %struct.libcS %.fca.3.insert, float %y.coerce4, 4 +; CHECK-NEXT: %.fca.5.insert = insertvalue %struct.libcS %.fca.4.insert, double %y.coerce5, 5 +; CHECK-NEXT: call void @llvm.lifetime.start.p5(i64 36, ptr addrspace(5) %vararg_buffer) +; CHECK-NEXT: %0 = getelementptr inbounds %i32_libcS.vararg, ptr addrspace(5) %vararg_buffer, i32 0, i32 0 +; CHECK-NEXT: store i32 %x, ptr addrspace(5) %0, align 4 +; CHECK-NEXT: %1 = getelementptr inbounds %i32_libcS.vararg, ptr addrspace(5) %vararg_buffer, i32 0, i32 1 +; CHECK-NEXT: store %struct.libcS %.fca.5.insert, ptr addrspace(5) %1, align 8 +; CHECK-NEXT: %2 = addrspacecast ptr addrspace(5) %vararg_buffer to ptr +; CHECK-NEXT: call void @vararg(ptr %2) +; CHECK-NEXT: call void @llvm.lifetime.end.p5(i64 36, ptr addrspace(5) %vararg_buffer) +; CHECK-NEXT: ret void +; +entry: + %.fca.0.insert = insertvalue %struct.libcS poison, i8 %y.coerce0, 0 + %.fca.1.insert = insertvalue %struct.libcS %.fca.0.insert, i16 %y.coerce1, 1 + %.fca.2.insert = insertvalue %struct.libcS %.fca.1.insert, i32 %y.coerce2, 2 + %.fca.3.insert = insertvalue %struct.libcS %.fca.2.insert, i64 
%y.coerce3, 3 + %.fca.4.insert = insertvalue %struct.libcS %.fca.3.insert, float %y.coerce4, 4 + %.fca.5.insert = insertvalue %struct.libcS %.fca.4.insert, double %y.coerce5, 5 + tail call void (...) @vararg(i32 noundef %x, %struct.libcS %.fca.5.insert) + ret void +} + +define hidden void @libcS_i32(i8 %x.coerce0, i16 %x.coerce1, i32 %x.coerce2, i64 %x.coerce3, float %x.coerce4, double %x.coerce5, i32 noundef %y) { +; CHECK-LABEL: define {{[^@]+}}@libcS_i32(i8 %x.coerce0, i16 %x.coerce1, i32 %x.coerce2, i64 %x.coerce3, float %x.coerce4, double %x.coerce5, i32 noundef %y) { +; CHECK-NEXT: entry: +; CHECK-NEXT: %vararg_buffer = alloca %libcS_i32.vararg, align 4, addrspace(5) +; CHECK-NEXT: %.fca.0.insert = insertvalue %struct.libcS poison, i8 %x.coerce0, 0 +; CHECK-NEXT: %.fca.1.insert = insertvalue %struct.libcS %.fca.0.insert, i16 %x.coerce1, 1 +; CHECK-NEXT: %.fca.2.insert = insertvalue %struct.libcS %.fca.1.insert, i32 %x.coerce2, 2 +; CHECK-NEXT: %.fca.3.insert = insertvalue %struct.libcS %.fca.2.insert, i64 %x.coerce3, 3 +; CHECK-NEXT: %.fca.4.insert = insertvalue %struct.libcS %.fca.3.insert, float %x.coerce4, 4 +; CHECK-NEXT: %.fca.5.insert = insertvalue %struct.libcS %.fca.4.insert, double %x.coerce5, 5 +; CHECK-NEXT: call void @llvm.lifetime.start.p5(i64 36, ptr addrspace(5) %vararg_buffer) +; CHECK-NEXT: %0 = getelementptr inbounds %libcS_i32.vararg, ptr addrspace(5) %vararg_buffer, i32 0, i32 0 +; CHECK-NEXT: store %struct.libcS %.fca.5.insert, ptr addrspace(5) %0, align 8 +; CHECK-NEXT: %1 = getelementptr inbounds %libcS_i32.vararg, ptr addrspace(5) %vararg_buffer, i32 0, i32 1 +; CHECK-NEXT: store i32 %y, ptr addrspace(5) %1, align 4 +; CHECK-NEXT: %2 = addrspacecast ptr addrspace(5) %vararg_buffer to ptr +; CHECK-NEXT: call void @vararg(ptr %2) +; CHECK-NEXT: call void @llvm.lifetime.end.p5(i64 36, ptr addrspace(5) %vararg_buffer) +; CHECK-NEXT: ret void +; +entry: + %.fca.0.insert = insertvalue %struct.libcS poison, i8 %x.coerce0, 0 + %.fca.1.insert = 
insertvalue %struct.libcS %.fca.0.insert, i16 %x.coerce1, 1 + %.fca.2.insert = insertvalue %struct.libcS %.fca.1.insert, i32 %x.coerce2, 2 + %.fca.3.insert = insertvalue %struct.libcS %.fca.2.insert, i64 %x.coerce3, 3 + %.fca.4.insert = insertvalue %struct.libcS %.fca.3.insert, float %x.coerce4, 4 + %.fca.5.insert = insertvalue %struct.libcS %.fca.4.insert, double %x.coerce5, 5 + tail call void (...) @vararg(%struct.libcS %.fca.5.insert, i32 noundef %y) + ret void +} + +define hidden void @i32_v4f32(i32 noundef %x, <4 x float> noundef %y) { +; CHECK-LABEL: define {{[^@]+}}@i32_v4f32(i32 noundef %x, <4 x float> noundef %y) { +; CHECK-NEXT: entry: +; CHECK-NEXT: %vararg_buffer = alloca %i32_v4f32.vararg, align 4, addrspace(5) +; CHECK-NEXT: call void @llvm.lifetime.start.p5(i64 20, ptr addrspace(5) %vararg_buffer) +; CHECK-NEXT: %0 = getelementptr inbounds %i32_v4f32.vararg, ptr addrspace(5) %vararg_buffer, i32 0, i32 0 +; CHECK-NEXT: store i32 %x, ptr addrspace(5) %0, align 4 +; CHECK-NEXT: %1 = getelementptr inbounds %i32_v4f32.vararg, ptr addrspace(5) %vararg_buffer, i32 0, i32 1 +; CHECK-NEXT: store <4 x float> %y, ptr addrspace(5) %1, align 16 +; CHECK-NEXT: %2 = addrspacecast ptr addrspace(5) %vararg_buffer to ptr +; CHECK-NEXT: call void @vararg(ptr %2) +; CHECK-NEXT: call void @llvm.lifetime.end.p5(i64 20, ptr addrspace(5) %vararg_buffer) +; CHECK-NEXT: ret void +; +entry: + tail call void (...) 
@vararg(i32 noundef %x, <4 x float> noundef %y) + ret void +} + +define hidden void @v4f32_i32(<4 x float> noundef %x, i32 noundef %y) { +; CHECK-LABEL: define {{[^@]+}}@v4f32_i32(<4 x float> noundef %x, i32 noundef %y) { +; CHECK-NEXT: entry: +; CHECK-NEXT: %vararg_buffer = alloca %v4f32_i32.vararg, align 4, addrspace(5) +; CHECK-NEXT: call void @llvm.lifetime.start.p5(i64 20, ptr addrspace(5) %vararg_buffer) +; CHECK-NEXT: %0 = getelementptr inbounds %v4f32_i32.vararg, ptr addrspace(5) %vararg_buffer, i32 0, i32 0 +; CHECK-NEXT: store <4 x float> %x, ptr addrspace(5) %0, align 16 +; CHECK-NEXT: %1 = getelementptr inbounds %v4f32_i32.vararg, ptr addrspace(5) %vararg_buffer, i32 0, i32 1 +; CHECK-NEXT: store i32 %y, ptr addrspace(5) %1, align 4 +; CHECK-NEXT: %2 = addrspacecast ptr addrspace(5) %vararg_buffer to ptr +; CHECK-NEXT: call void @vararg(ptr %2) +; CHECK-NEXT: call void @llvm.lifetime.end.p5(i64 20, ptr addrspace(5) %vararg_buffer) +; CHECK-NEXT: ret void +; +entry: + tail call void (...) 
@vararg(<4 x float> noundef %x, i32 noundef %y) + ret void +} + +define hidden void @i32_v8f32(i32 noundef %x, <8 x float> noundef %y) { +; CHECK-LABEL: define {{[^@]+}}@i32_v8f32(i32 noundef %x, <8 x float> noundef %y) { +; CHECK-NEXT: entry: +; CHECK-NEXT: %vararg_buffer = alloca %i32_v8f32.vararg, align 4, addrspace(5) +; CHECK-NEXT: call void @llvm.lifetime.start.p5(i64 36, ptr addrspace(5) %vararg_buffer) +; CHECK-NEXT: %0 = getelementptr inbounds %i32_v8f32.vararg, ptr addrspace(5) %vararg_buffer, i32 0, i32 0 +; CHECK-NEXT: store i32 %x, ptr addrspace(5) %0, align 4 +; CHECK-NEXT: %1 = getelementptr inbounds %i32_v8f32.vararg, ptr addrspace(5) %vararg_buffer, i32 0, i32 1 +; CHECK-NEXT: store <8 x float> %y, ptr addrspace(5) %1, align 32 +; CHECK-NEXT: %2 = addrspacecast ptr addrspace(5) %vararg_buffer to ptr +; CHECK-NEXT: call void @vararg(ptr %2) +; CHECK-NEXT: call void @llvm.lifetime.end.p5(i64 36, ptr addrspace(5) %vararg_buffer) +; CHECK-NEXT: ret void +; +entry: + tail call void (...) 
@vararg(i32 noundef %x, <8 x float> noundef %y) + ret void +} + +define hidden void @v8f32_i32(<8 x float> noundef %x, i32 noundef %y) { +; CHECK-LABEL: define {{[^@]+}}@v8f32_i32(<8 x float> noundef %x, i32 noundef %y) { +; CHECK-NEXT: entry: +; CHECK-NEXT: %vararg_buffer = alloca %v8f32_i32.vararg, align 4, addrspace(5) +; CHECK-NEXT: call void @llvm.lifetime.start.p5(i64 36, ptr addrspace(5) %vararg_buffer) +; CHECK-NEXT: %0 = getelementptr inbounds %v8f32_i32.vararg, ptr addrspace(5) %vararg_buffer, i32 0, i32 0 +; CHECK-NEXT: store <8 x float> %x, ptr addrspace(5) %0, align 32 +; CHECK-NEXT: %1 = getelementptr inbounds %v8f32_i32.vararg, ptr addrspace(5) %vararg_buffer, i32 0, i32 1 +; CHECK-NEXT: store i32 %y, ptr addrspace(5) %1, align 4 +; CHECK-NEXT: %2 = addrspacecast ptr addrspace(5) %vararg_buffer to ptr +; CHECK-NEXT: call void @vararg(ptr %2) +; CHECK-NEXT: call void @llvm.lifetime.end.p5(i64 36, ptr addrspace(5) %vararg_buffer) +; CHECK-NEXT: ret void +; +entry: + tail call void (...) 
@vararg(<8 x float> noundef %x, i32 noundef %y) + ret void +} + +define hidden void @i32_v16f32(i32 noundef %x, <16 x float> noundef %y) { +; CHECK-LABEL: define {{[^@]+}}@i32_v16f32(i32 noundef %x, <16 x float> noundef %y) { +; CHECK-NEXT: entry: +; CHECK-NEXT: %vararg_buffer = alloca %i32_v16f32.vararg, align 4, addrspace(5) +; CHECK-NEXT: call void @llvm.lifetime.start.p5(i64 68, ptr addrspace(5) %vararg_buffer) +; CHECK-NEXT: %0 = getelementptr inbounds %i32_v16f32.vararg, ptr addrspace(5) %vararg_buffer, i32 0, i32 0 +; CHECK-NEXT: store i32 %x, ptr addrspace(5) %0, align 4 +; CHECK-NEXT: %1 = getelementptr inbounds %i32_v16f32.vararg, ptr addrspace(5) %vararg_buffer, i32 0, i32 1 +; CHECK-NEXT: store <16 x float> %y, ptr addrspace(5) %1, align 64 +; CHECK-NEXT: %2 = addrspacecast ptr addrspace(5) %vararg_buffer to ptr +; CHECK-NEXT: call void @vararg(ptr %2) +; CHECK-NEXT: call void @llvm.lifetime.end.p5(i64 68, ptr addrspace(5) %vararg_buffer) +; CHECK-NEXT: ret void +; +entry: + tail call void (...) 
@vararg(i32 noundef %x, <16 x float> noundef %y) + ret void +} + +define hidden void @v16f32_i32(<16 x float> noundef %x, i32 noundef %y) { +; CHECK-LABEL: define {{[^@]+}}@v16f32_i32(<16 x float> noundef %x, i32 noundef %y) { +; CHECK-NEXT: entry: +; CHECK-NEXT: %vararg_buffer = alloca %v16f32_i32.vararg, align 4, addrspace(5) +; CHECK-NEXT: call void @llvm.lifetime.start.p5(i64 68, ptr addrspace(5) %vararg_buffer) +; CHECK-NEXT: %0 = getelementptr inbounds %v16f32_i32.vararg, ptr addrspace(5) %vararg_buffer, i32 0, i32 0 +; CHECK-NEXT: store <16 x float> %x, ptr addrspace(5) %0, align 64 +; CHECK-NEXT: %1 = getelementptr inbounds %v16f32_i32.vararg, ptr addrspace(5) %vararg_buffer, i32 0, i32 1 +; CHECK-NEXT: store i32 %y, ptr addrspace(5) %1, align 4 +; CHECK-NEXT: %2 = addrspacecast ptr addrspace(5) %vararg_buffer to ptr +; CHECK-NEXT: call void @vararg(ptr %2) +; CHECK-NEXT: call void @llvm.lifetime.end.p5(i64 68, ptr addrspace(5) %vararg_buffer) +; CHECK-NEXT: ret void +; +entry: + tail call void (...) 
@vararg(<16 x float> noundef %x, i32 noundef %y) + ret void +} + +define hidden void @i32_v32f32(i32 noundef %x, <32 x float> noundef %y) { +; CHECK-LABEL: define {{[^@]+}}@i32_v32f32(i32 noundef %x, <32 x float> noundef %y) { +; CHECK-NEXT: entry: +; CHECK-NEXT: %vararg_buffer = alloca %i32_v32f32.vararg, align 4, addrspace(5) +; CHECK-NEXT: call void @llvm.lifetime.start.p5(i64 132, ptr addrspace(5) %vararg_buffer) +; CHECK-NEXT: %0 = getelementptr inbounds %i32_v32f32.vararg, ptr addrspace(5) %vararg_buffer, i32 0, i32 0 +; CHECK-NEXT: store i32 %x, ptr addrspace(5) %0, align 4 +; CHECK-NEXT: %1 = getelementptr inbounds %i32_v32f32.vararg, ptr addrspace(5) %vararg_buffer, i32 0, i32 1 +; CHECK-NEXT: store <32 x float> %y, ptr addrspace(5) %1, align 128 +; CHECK-NEXT: %2 = addrspacecast ptr addrspace(5) %vararg_buffer to ptr +; CHECK-NEXT: call void @vararg(ptr %2) +; CHECK-NEXT: call void @llvm.lifetime.end.p5(i64 132, ptr addrspace(5) %vararg_buffer) +; CHECK-NEXT: ret void +; +entry: + tail call void (...) 
@vararg(i32 noundef %x, <32 x float> noundef %y) + ret void +} + +define hidden void @v32f32_i32(<32 x float> noundef %x, i32 noundef %y) { +; CHECK-LABEL: define {{[^@]+}}@v32f32_i32(<32 x float> noundef %x, i32 noundef %y) { +; CHECK-NEXT: entry: +; CHECK-NEXT: %vararg_buffer = alloca %v32f32_i32.vararg, align 4, addrspace(5) +; CHECK-NEXT: call void @llvm.lifetime.start.p5(i64 132, ptr addrspace(5) %vararg_buffer) +; CHECK-NEXT: %0 = getelementptr inbounds %v32f32_i32.vararg, ptr addrspace(5) %vararg_buffer, i32 0, i32 0 +; CHECK-NEXT: store <32 x float> %x, ptr addrspace(5) %0, align 128 +; CHECK-NEXT: %1 = getelementptr inbounds %v32f32_i32.vararg, ptr addrspace(5) %vararg_buffer, i32 0, i32 1 +; CHECK-NEXT: store i32 %y, ptr addrspace(5) %1, align 4 +; CHECK-NEXT: %2 = addrspacecast ptr addrspace(5) %vararg_buffer to ptr +; CHECK-NEXT: call void @vararg(ptr %2) +; CHECK-NEXT: call void @llvm.lifetime.end.p5(i64 132, ptr addrspace(5) %vararg_buffer) +; CHECK-NEXT: ret void +; +entry: + tail call void (...) 
@vararg(<32 x float> noundef %x, i32 noundef %y) + ret void +} + +define hidden void @fptr_single_i32(i32 noundef %x) { +; CHECK-LABEL: define {{[^@]+}}@fptr_single_i32(i32 noundef %x) { +; CHECK-NEXT: entry: +; CHECK-NEXT: %vararg_buffer = alloca %fptr_single_i32.vararg, align 4, addrspace(5) +; CHECK-NEXT: %0 = load volatile ptr, ptr addrspacecast (ptr addrspace(1) @vararg_ptr to ptr), align 8 +; CHECK-NEXT: call void @llvm.lifetime.start.p5(i64 4, ptr addrspace(5) %vararg_buffer) +; CHECK-NEXT: %1 = getelementptr inbounds %fptr_single_i32.vararg, ptr addrspace(5) %vararg_buffer, i32 0, i32 0 +; CHECK-NEXT: store i32 %x, ptr addrspace(5) %1, align 4 +; CHECK-NEXT: %2 = addrspacecast ptr addrspace(5) %vararg_buffer to ptr +; CHECK-NEXT: call void %0(ptr %2) +; CHECK-NEXT: call void @llvm.lifetime.end.p5(i64 4, ptr addrspace(5) %vararg_buffer) +; CHECK-NEXT: ret void +; +entry: + %0 = load volatile ptr, ptr addrspacecast (ptr addrspace(1) @vararg_ptr to ptr), align 8 + tail call void (...) 
%0(i32 noundef %x) + ret void +} + +define hidden void @fptr_libcS(i8 %x.coerce0, i16 %x.coerce1, i32 %x.coerce2, i64 %x.coerce3, float %x.coerce4, double %x.coerce5) { +; CHECK-LABEL: define {{[^@]+}}@fptr_libcS(i8 %x.coerce0, i16 %x.coerce1, i32 %x.coerce2, i64 %x.coerce3, float %x.coerce4, double %x.coerce5) { +; CHECK-NEXT: entry: +; CHECK-NEXT: %vararg_buffer = alloca %fptr_libcS.vararg, align 4, addrspace(5) +; CHECK-NEXT: %0 = load volatile ptr, ptr addrspacecast (ptr addrspace(1) @vararg_ptr to ptr), align 8 +; CHECK-NEXT: %.fca.0.insert = insertvalue %struct.libcS poison, i8 %x.coerce0, 0 +; CHECK-NEXT: %.fca.1.insert = insertvalue %struct.libcS %.fca.0.insert, i16 %x.coerce1, 1 +; CHECK-NEXT: %.fca.2.insert = insertvalue %struct.libcS %.fca.1.insert, i32 %x.coerce2, 2 +; CHECK-NEXT: %.fca.3.insert = insertvalue %struct.libcS %.fca.2.insert, i64 %x.coerce3, 3 +; CHECK-NEXT: %.fca.4.insert = insertvalue %struct.libcS %.fca.3.insert, float %x.coerce4, 4 +; CHECK-NEXT: %.fca.5.insert = insertvalue %struct.libcS %.fca.4.insert, double %x.coerce5, 5 +; CHECK-NEXT: call void @llvm.lifetime.start.p5(i64 32, ptr addrspace(5) %vararg_buffer) +; CHECK-NEXT: %1 = getelementptr inbounds %fptr_libcS.vararg, ptr addrspace(5) %vararg_buffer, i32 0, i32 0 +; CHECK-NEXT: store %struct.libcS %.fca.5.insert, ptr addrspace(5) %1, align 8 +; CHECK-NEXT: %2 = addrspacecast ptr addrspace(5) %vararg_buffer to ptr +; CHECK-NEXT: call void %0(ptr %2) +; CHECK-NEXT: call void @llvm.lifetime.end.p5(i64 32, ptr addrspace(5) %vararg_buffer) +; CHECK-NEXT: ret void +; +entry: + %0 = load volatile ptr, ptr addrspacecast (ptr addrspace(1) @vararg_ptr to ptr), align 8 + %.fca.0.insert = insertvalue %struct.libcS poison, i8 %x.coerce0, 0 + %.fca.1.insert = insertvalue %struct.libcS %.fca.0.insert, i16 %x.coerce1, 1 + %.fca.2.insert = insertvalue %struct.libcS %.fca.1.insert, i32 %x.coerce2, 2 + %.fca.3.insert = insertvalue %struct.libcS %.fca.2.insert, i64 %x.coerce3, 3 + %.fca.4.insert = 
insertvalue %struct.libcS %.fca.3.insert, float %x.coerce4, 4 + %.fca.5.insert = insertvalue %struct.libcS %.fca.4.insert, double %x.coerce5, 5 + tail call void (...) %0(%struct.libcS %.fca.5.insert) + ret void +} + + diff --git a/llvm/test/CodeGen/AMDGPU/fmaximum3.ll b/llvm/test/CodeGen/AMDGPU/fmaximum3.ll index 3ec36f0..9ce1ba3 100644 --- a/llvm/test/CodeGen/AMDGPU/fmaximum3.ll +++ b/llvm/test/CodeGen/AMDGPU/fmaximum3.ll @@ -497,47 +497,19 @@ define <2 x float> @v_fmaximum3_v2f32(<2 x float> %a, <2 x float> %b, <2 x float ; GFX9-LABEL: v_fmaximum3_v2f32: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_cmp_gt_f32_e32 vcc, v1, v3 -; GFX9-NEXT: v_cndmask_b32_e32 v6, v3, v1, vcc +; GFX9-NEXT: v_max_f32_e32 v6, v1, v3 ; GFX9-NEXT: v_mov_b32_e32 v7, 0x7fc00000 ; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v1, v3 -; GFX9-NEXT: v_cndmask_b32_e32 v6, v7, v6, vcc -; GFX9-NEXT: v_cmp_class_f32_e64 vcc, v1, 64 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v6, v1, vcc -; GFX9-NEXT: v_cmp_class_f32_e64 vcc, v3, 64 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc -; GFX9-NEXT: v_cmp_eq_f32_e32 vcc, 0, v6 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v6, v1, vcc -; GFX9-NEXT: v_cmp_gt_f32_e32 vcc, v0, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v3, v2, v0, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v1, v7, v6, vcc +; GFX9-NEXT: v_max_f32_e32 v3, v0, v2 ; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc -; GFX9-NEXT: v_cmp_class_f32_e64 vcc, v0, 64 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v0, vcc -; GFX9-NEXT: v_cmp_class_f32_e64 vcc, v2, 64 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc -; GFX9-NEXT: v_cmp_eq_f32_e32 vcc, 0, v3 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v0, vcc -; GFX9-NEXT: v_cmp_gt_f32_e32 vcc, v4, v0 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v0, v4, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v0, v7, v3, vcc +; GFX9-NEXT: v_max_f32_e32 v2, v4, v0 ; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v4, v0 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v7, v2, vcc -; GFX9-NEXT: 
v_cmp_class_f32_e64 vcc, v4, 64 -; GFX9-NEXT: v_cndmask_b32_e32 v3, v2, v4, vcc -; GFX9-NEXT: v_cmp_class_f32_e64 vcc, v0, 64 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v0, vcc -; GFX9-NEXT: v_cmp_eq_f32_e32 vcc, 0, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc -; GFX9-NEXT: v_cmp_gt_f32_e32 vcc, v5, v1 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v1, v5, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc +; GFX9-NEXT: v_max_f32_e32 v2, v5, v1 ; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v5, v1 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v7, v2, vcc -; GFX9-NEXT: v_cmp_class_f32_e64 vcc, v5, 64 -; GFX9-NEXT: v_cndmask_b32_e32 v3, v2, v5, vcc -; GFX9-NEXT: v_cmp_class_f32_e64 vcc, v1, 64 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc -; GFX9-NEXT: v_cmp_eq_f32_e32 vcc, 0, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v1, v7, v2, vcc ; GFX9-NEXT: s_setpc_b64 s[30:31] %max0 = call <2 x float> @llvm.maximum.v2f32(<2 x float> %a, <2 x float> %b) %max1 = call <2 x float> @llvm.maximum.v2f32(<2 x float> %c, <2 x float> %max0) @@ -559,47 +531,19 @@ define <2 x float> @v_fmaximum3_v2f32_commute(<2 x float> %a, <2 x float> %b, <2 ; GFX9-LABEL: v_fmaximum3_v2f32_commute: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_cmp_gt_f32_e32 vcc, v1, v3 -; GFX9-NEXT: v_cndmask_b32_e32 v6, v3, v1, vcc +; GFX9-NEXT: v_max_f32_e32 v6, v1, v3 ; GFX9-NEXT: v_mov_b32_e32 v7, 0x7fc00000 ; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v1, v3 -; GFX9-NEXT: v_cndmask_b32_e32 v6, v7, v6, vcc -; GFX9-NEXT: v_cmp_class_f32_e64 vcc, v1, 64 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v6, v1, vcc -; GFX9-NEXT: v_cmp_class_f32_e64 vcc, v3, 64 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc -; GFX9-NEXT: v_cmp_eq_f32_e32 vcc, 0, v6 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v6, v1, vcc -; GFX9-NEXT: v_cmp_gt_f32_e32 vcc, v0, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v3, v2, v0, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v1, v7, v6, vcc +; GFX9-NEXT: v_max_f32_e32 v3, v0, v2 ; GFX9-NEXT: 
v_cmp_o_f32_e32 vcc, v0, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc -; GFX9-NEXT: v_cmp_class_f32_e64 vcc, v0, 64 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v0, vcc -; GFX9-NEXT: v_cmp_class_f32_e64 vcc, v2, 64 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc -; GFX9-NEXT: v_cmp_eq_f32_e32 vcc, 0, v3 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v0, vcc -; GFX9-NEXT: v_cmp_gt_f32_e32 vcc, v0, v4 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v4, v0, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v0, v7, v3, vcc +; GFX9-NEXT: v_max_f32_e32 v2, v0, v4 ; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v4 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v7, v2, vcc -; GFX9-NEXT: v_cmp_class_f32_e64 vcc, v0, 64 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc -; GFX9-NEXT: v_cmp_class_f32_e64 vcc, v4, 64 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc -; GFX9-NEXT: v_cmp_eq_f32_e32 vcc, 0, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc -; GFX9-NEXT: v_cmp_gt_f32_e32 vcc, v1, v5 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v5, v1, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc +; GFX9-NEXT: v_max_f32_e32 v2, v1, v5 ; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v1, v5 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v7, v2, vcc -; GFX9-NEXT: v_cmp_class_f32_e64 vcc, v1, 64 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc -; GFX9-NEXT: v_cmp_class_f32_e64 vcc, v5, 64 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc -; GFX9-NEXT: v_cmp_eq_f32_e32 vcc, 0, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v1, v7, v2, vcc ; GFX9-NEXT: s_setpc_b64 s[30:31] %max0 = call <2 x float> @llvm.maximum.v2f32(<2 x float> %a, <2 x float> %b) %max1 = call <2 x float> @llvm.maximum.v2f32(<2 x float> %max0, <2 x float> %c) @@ -621,47 +565,19 @@ define <2 x float> @v_fmaximum3_v2f32__fabs_all(<2 x float> %a, <2 x float> %b, ; GFX9-LABEL: v_fmaximum3_v2f32__fabs_all: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_cmp_gt_f32_e64 vcc, |v1|, |v3| -; GFX9-NEXT: v_cndmask_b32_e32 v6, v3, v1, vcc +; 
GFX9-NEXT: v_max_f32_e64 v6, |v1|, |v3| ; GFX9-NEXT: v_mov_b32_e32 v7, 0x7fc00000 -; GFX9-NEXT: v_cmp_o_f32_e64 s[4:5], |v1|, |v3| -; GFX9-NEXT: v_cndmask_b32_e64 v6, v7, |v6|, s[4:5] -; GFX9-NEXT: v_cmp_class_f32_e64 s[4:5], |v1|, 64 -; GFX9-NEXT: v_cndmask_b32_e64 v1, v6, |v1|, s[4:5] -; GFX9-NEXT: v_cmp_class_f32_e64 s[4:5], |v3|, 64 -; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, |v3|, s[4:5] -; GFX9-NEXT: v_cmp_eq_f32_e32 vcc, 0, v6 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v6, v1, vcc -; GFX9-NEXT: v_cmp_gt_f32_e64 vcc, |v0|, |v2| -; GFX9-NEXT: v_cndmask_b32_e32 v3, v2, v0, vcc -; GFX9-NEXT: v_cmp_o_f32_e64 s[4:5], |v0|, |v2| -; GFX9-NEXT: v_cndmask_b32_e64 v3, v7, |v3|, s[4:5] -; GFX9-NEXT: v_cmp_class_f32_e64 s[4:5], |v0|, 64 -; GFX9-NEXT: v_cndmask_b32_e64 v0, v3, |v0|, s[4:5] -; GFX9-NEXT: v_cmp_class_f32_e64 s[4:5], |v2|, 64 -; GFX9-NEXT: v_cndmask_b32_e64 v0, v0, |v2|, s[4:5] -; GFX9-NEXT: v_cmp_eq_f32_e32 vcc, 0, v3 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v0, vcc -; GFX9-NEXT: v_cmp_gt_f32_e64 s[4:5], v0, |v4| -; GFX9-NEXT: v_cndmask_b32_e64 v2, |v4|, v0, s[4:5] +; GFX9-NEXT: v_cmp_o_f32_e64 vcc, |v1|, |v3| +; GFX9-NEXT: v_cndmask_b32_e32 v1, v7, v6, vcc +; GFX9-NEXT: v_max_f32_e64 v3, |v0|, |v2| +; GFX9-NEXT: v_cmp_o_f32_e64 vcc, |v0|, |v2| +; GFX9-NEXT: v_cndmask_b32_e32 v0, v7, v3, vcc +; GFX9-NEXT: v_max_f32_e64 v2, v0, |v4| ; GFX9-NEXT: v_cmp_o_f32_e64 vcc, v0, |v4| -; GFX9-NEXT: v_cndmask_b32_e32 v2, v7, v2, vcc -; GFX9-NEXT: v_cmp_class_f32_e64 vcc, v0, 64 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc -; GFX9-NEXT: v_cmp_class_f32_e64 s[4:5], |v4|, 64 -; GFX9-NEXT: v_cndmask_b32_e64 v0, v0, |v4|, s[4:5] -; GFX9-NEXT: v_cmp_eq_f32_e32 vcc, 0, v2 -; GFX9-NEXT: v_cmp_gt_f32_e64 s[4:5], v1, |v5| -; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc -; GFX9-NEXT: v_cndmask_b32_e64 v2, |v5|, v1, s[4:5] +; GFX9-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc +; GFX9-NEXT: v_max_f32_e64 v2, v1, |v5| ; GFX9-NEXT: v_cmp_o_f32_e64 vcc, v1, |v5| -; GFX9-NEXT: v_cndmask_b32_e32 v2, 
v7, v2, vcc -; GFX9-NEXT: v_cmp_class_f32_e64 vcc, v1, 64 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc -; GFX9-NEXT: v_cmp_class_f32_e64 s[4:5], |v5|, 64 -; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, |v5|, s[4:5] -; GFX9-NEXT: v_cmp_eq_f32_e32 vcc, 0, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v1, v7, v2, vcc ; GFX9-NEXT: s_setpc_b64 s[30:31] %a.fabs = call <2 x float> @llvm.fabs.v2f32(<2 x float> %a) %b.fabs = call <2 x float> @llvm.fabs.v2f32(<2 x float> %b) @@ -686,47 +602,19 @@ define <2 x float> @v_fmaximum3_v2f32__fneg_all(<2 x float> %a, <2 x float> %b, ; GFX9-LABEL: v_fmaximum3_v2f32__fneg_all: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_cmp_gt_f32_e64 vcc, -v1, -v3 -; GFX9-NEXT: v_cndmask_b32_e32 v6, v3, v1, vcc +; GFX9-NEXT: v_max_f32_e64 v6, -v1, -v3 ; GFX9-NEXT: v_mov_b32_e32 v7, 0x7fc00000 -; GFX9-NEXT: v_cmp_o_f32_e64 s[4:5], -v1, -v3 -; GFX9-NEXT: v_cndmask_b32_e64 v6, v7, -v6, s[4:5] -; GFX9-NEXT: v_cmp_class_f32_e64 s[4:5], -v1, 64 -; GFX9-NEXT: v_cndmask_b32_e64 v1, v6, -v1, s[4:5] -; GFX9-NEXT: v_cmp_class_f32_e64 s[4:5], -v3, 64 -; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, -v3, s[4:5] -; GFX9-NEXT: v_cmp_eq_f32_e32 vcc, 0, v6 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v6, v1, vcc -; GFX9-NEXT: v_cmp_gt_f32_e64 vcc, -v0, -v2 -; GFX9-NEXT: v_cndmask_b32_e32 v3, v2, v0, vcc -; GFX9-NEXT: v_cmp_o_f32_e64 s[4:5], -v0, -v2 -; GFX9-NEXT: v_cndmask_b32_e64 v3, v7, -v3, s[4:5] -; GFX9-NEXT: v_cmp_class_f32_e64 s[4:5], -v0, 64 -; GFX9-NEXT: v_cndmask_b32_e64 v0, v3, -v0, s[4:5] -; GFX9-NEXT: v_cmp_class_f32_e64 s[4:5], -v2, 64 -; GFX9-NEXT: v_cndmask_b32_e64 v0, v0, -v2, s[4:5] -; GFX9-NEXT: v_cmp_eq_f32_e32 vcc, 0, v3 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v0, vcc -; GFX9-NEXT: v_cmp_gt_f32_e64 s[4:5], v0, -v4 -; GFX9-NEXT: v_cndmask_b32_e64 v2, -v4, v0, s[4:5] +; GFX9-NEXT: v_cmp_o_f32_e64 vcc, -v1, -v3 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v7, v6, vcc +; GFX9-NEXT: v_max_f32_e64 v3, -v0, 
-v2 +; GFX9-NEXT: v_cmp_o_f32_e64 vcc, -v0, -v2 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v7, v3, vcc +; GFX9-NEXT: v_max_f32_e64 v2, v0, -v4 ; GFX9-NEXT: v_cmp_o_f32_e64 vcc, v0, -v4 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v7, v2, vcc -; GFX9-NEXT: v_cmp_class_f32_e64 vcc, v0, 64 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc -; GFX9-NEXT: v_cmp_class_f32_e64 s[4:5], -v4, 64 -; GFX9-NEXT: v_cndmask_b32_e64 v0, v0, -v4, s[4:5] -; GFX9-NEXT: v_cmp_eq_f32_e32 vcc, 0, v2 -; GFX9-NEXT: v_cmp_gt_f32_e64 s[4:5], v1, -v5 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc -; GFX9-NEXT: v_cndmask_b32_e64 v2, -v5, v1, s[4:5] +; GFX9-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc +; GFX9-NEXT: v_max_f32_e64 v2, v1, -v5 ; GFX9-NEXT: v_cmp_o_f32_e64 vcc, v1, -v5 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v7, v2, vcc -; GFX9-NEXT: v_cmp_class_f32_e64 vcc, v1, 64 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc -; GFX9-NEXT: v_cmp_class_f32_e64 s[4:5], -v5, 64 -; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, -v5, s[4:5] -; GFX9-NEXT: v_cmp_eq_f32_e32 vcc, 0, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v1, v7, v2, vcc ; GFX9-NEXT: s_setpc_b64 s[30:31] %a.fneg = fneg <2 x float> %a %b.fneg = fneg <2 x float> %b @@ -751,35 +639,19 @@ define <2 x float> @v_fmaximum3_v2f32__inlineimm1(<2 x float> %a, <2 x float> %c ; GFX9-LABEL: v_fmaximum3_v2f32__inlineimm1: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_cmp_lt_f32_e32 vcc, 2.0, v1 -; GFX9-NEXT: v_cndmask_b32_e32 v4, 2.0, v1, vcc +; GFX9-NEXT: v_max_f32_e32 v4, 2.0, v1 ; GFX9-NEXT: v_mov_b32_e32 v5, 0x7fc00000 ; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v1, v1 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v4, vcc -; GFX9-NEXT: v_cmp_lt_f32_e32 vcc, 2.0, v0 -; GFX9-NEXT: v_cndmask_b32_e32 v4, 2.0, v0, vcc +; GFX9-NEXT: v_max_f32_e32 v4, 2.0, v0 ; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v0 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v5, v4, vcc -; GFX9-NEXT: v_cmp_gt_f32_e32 vcc, v0, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v4, 
v2, v0, vcc +; GFX9-NEXT: v_max_f32_e32 v4, v0, v2 ; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v4, v5, v4, vcc -; GFX9-NEXT: v_cmp_class_f32_e64 vcc, v0, 64 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc -; GFX9-NEXT: v_cmp_class_f32_e64 vcc, v2, 64 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc -; GFX9-NEXT: v_cmp_eq_f32_e32 vcc, 0, v4 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc -; GFX9-NEXT: v_cmp_gt_f32_e32 vcc, v1, v3 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v1, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v0, v5, v4, vcc +; GFX9-NEXT: v_max_f32_e32 v2, v1, v3 ; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v1, v3 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v5, v2, vcc -; GFX9-NEXT: v_cmp_class_f32_e64 vcc, v1, 64 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc -; GFX9-NEXT: v_cmp_class_f32_e64 vcc, v3, 64 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc -; GFX9-NEXT: v_cmp_eq_f32_e32 vcc, 0, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v2, vcc ; GFX9-NEXT: s_setpc_b64 s[30:31] %max0 = call <2 x float> @llvm.maximum.v2f32(<2 x float> %a, <2 x float> <float 2.0, float 2.0>) %max1 = call <2 x float> @llvm.maximum.v2f32(<2 x float> %max0, <2 x float> %c) @@ -801,33 +673,17 @@ define <2 x float> @v_fmaximum3_v2f32__inlineimm2(<2 x float> %a, <2 x float> %b ; GFX9-LABEL: v_fmaximum3_v2f32__inlineimm2: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_cmp_gt_f32_e32 vcc, v1, v3 -; GFX9-NEXT: v_cndmask_b32_e32 v4, v3, v1, vcc +; GFX9-NEXT: v_max_f32_e32 v4, v1, v3 ; GFX9-NEXT: v_mov_b32_e32 v5, 0x7fc00000 ; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v1, v3 -; GFX9-NEXT: v_cndmask_b32_e32 v4, v5, v4, vcc -; GFX9-NEXT: v_cmp_class_f32_e64 vcc, v1, 64 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc -; GFX9-NEXT: v_cmp_class_f32_e64 vcc, v3, 64 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc -; GFX9-NEXT: v_cmp_eq_f32_e32 vcc, 0, v4 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc -; GFX9-NEXT: 
v_cmp_gt_f32_e32 vcc, v0, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v3, v2, v0, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v4, vcc +; GFX9-NEXT: v_max_f32_e32 v3, v0, v2 ; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc -; GFX9-NEXT: v_cmp_class_f32_e64 vcc, v0, 64 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v0, vcc -; GFX9-NEXT: v_cmp_class_f32_e64 vcc, v2, 64 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc -; GFX9-NEXT: v_cmp_eq_f32_e32 vcc, 0, v3 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v0, vcc -; GFX9-NEXT: v_cmp_lt_f32_e32 vcc, 4.0, v0 -; GFX9-NEXT: v_cndmask_b32_e32 v2, 4.0, v0, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v0, v5, v3, vcc +; GFX9-NEXT: v_max_f32_e32 v2, 4.0, v0 ; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v0 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v5, v2, vcc -; GFX9-NEXT: v_cmp_lt_f32_e32 vcc, 4.0, v1 -; GFX9-NEXT: v_cndmask_b32_e32 v2, 4.0, v1, vcc +; GFX9-NEXT: v_max_f32_e32 v2, 4.0, v1 ; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v1, v1 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v2, vcc ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -852,67 +708,25 @@ define <3 x float> @v_fmaximum3_v3f32(<3 x float> %a, <3 x float> %b, <3 x float ; GFX9-LABEL: v_fmaximum3_v3f32: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_cmp_gt_f32_e32 vcc, v2, v5 -; GFX9-NEXT: v_cndmask_b32_e32 v9, v5, v2, vcc +; GFX9-NEXT: v_max_f32_e32 v9, v2, v5 ; GFX9-NEXT: v_mov_b32_e32 v10, 0x7fc00000 ; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v2, v5 -; GFX9-NEXT: v_cndmask_b32_e32 v9, v10, v9, vcc -; GFX9-NEXT: v_cmp_class_f32_e64 vcc, v2, 64 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v9, v2, vcc -; GFX9-NEXT: v_cmp_class_f32_e64 vcc, v5, 64 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v5, vcc -; GFX9-NEXT: v_cmp_eq_f32_e32 vcc, 0, v9 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v9, v2, vcc -; GFX9-NEXT: v_cmp_gt_f32_e32 vcc, v1, v4 -; GFX9-NEXT: v_cndmask_b32_e32 v5, v4, v1, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v2, v10, v9, vcc +; GFX9-NEXT: v_max_f32_e32 v5, v1, v4 ; GFX9-NEXT: 
v_cmp_o_f32_e32 vcc, v1, v4 -; GFX9-NEXT: v_cndmask_b32_e32 v5, v10, v5, vcc -; GFX9-NEXT: v_cmp_class_f32_e64 vcc, v1, 64 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc -; GFX9-NEXT: v_cmp_class_f32_e64 vcc, v4, 64 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc -; GFX9-NEXT: v_cmp_eq_f32_e32 vcc, 0, v5 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc -; GFX9-NEXT: v_cmp_gt_f32_e32 vcc, v0, v3 -; GFX9-NEXT: v_cndmask_b32_e32 v4, v3, v0, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v1, v10, v5, vcc +; GFX9-NEXT: v_max_f32_e32 v4, v0, v3 ; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v3 -; GFX9-NEXT: v_cndmask_b32_e32 v4, v10, v4, vcc -; GFX9-NEXT: v_cmp_class_f32_e64 vcc, v0, 64 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc -; GFX9-NEXT: v_cmp_class_f32_e64 vcc, v3, 64 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc -; GFX9-NEXT: v_cmp_eq_f32_e32 vcc, 0, v4 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc -; GFX9-NEXT: v_cmp_gt_f32_e32 vcc, v6, v0 -; GFX9-NEXT: v_cndmask_b32_e32 v3, v0, v6, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v0, v10, v4, vcc +; GFX9-NEXT: v_max_f32_e32 v3, v6, v0 ; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v6, v0 -; GFX9-NEXT: v_cndmask_b32_e32 v3, v10, v3, vcc -; GFX9-NEXT: v_cmp_class_f32_e64 vcc, v6, 64 -; GFX9-NEXT: v_cndmask_b32_e32 v4, v3, v6, vcc -; GFX9-NEXT: v_cmp_class_f32_e64 vcc, v0, 64 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc -; GFX9-NEXT: v_cmp_eq_f32_e32 vcc, 0, v3 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v0, vcc -; GFX9-NEXT: v_cmp_gt_f32_e32 vcc, v7, v1 -; GFX9-NEXT: v_cndmask_b32_e32 v3, v1, v7, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v0, v10, v3, vcc +; GFX9-NEXT: v_max_f32_e32 v3, v7, v1 ; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v7, v1 -; GFX9-NEXT: v_cndmask_b32_e32 v3, v10, v3, vcc -; GFX9-NEXT: v_cmp_class_f32_e64 vcc, v7, 64 -; GFX9-NEXT: v_cndmask_b32_e32 v4, v3, v7, vcc -; GFX9-NEXT: v_cmp_class_f32_e64 vcc, v1, 64 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc -; GFX9-NEXT: v_cmp_eq_f32_e32 vcc, 0, v3 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc -; 
GFX9-NEXT: v_cmp_gt_f32_e32 vcc, v8, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v3, v2, v8, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v1, v10, v3, vcc +; GFX9-NEXT: v_max_f32_e32 v3, v8, v2 ; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v8, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v3, v10, v3, vcc -; GFX9-NEXT: v_cmp_class_f32_e64 vcc, v8, 64 -; GFX9-NEXT: v_cndmask_b32_e32 v4, v3, v8, vcc -; GFX9-NEXT: v_cmp_class_f32_e64 vcc, v2, 64 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc -; GFX9-NEXT: v_cmp_eq_f32_e32 vcc, 0, v3 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v2, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v2, v10, v3, vcc ; GFX9-NEXT: s_setpc_b64 s[30:31] %max0 = call <3 x float> @llvm.maximum.v3f32(<3 x float> %a, <3 x float> %b) %max1 = call <3 x float> @llvm.maximum.v3f32(<3 x float> %c, <3 x float> %max0) @@ -935,67 +749,25 @@ define <3 x float> @v_fmaximum3_v3f32_commute(<3 x float> %a, <3 x float> %b, <3 ; GFX9-LABEL: v_fmaximum3_v3f32_commute: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_cmp_gt_f32_e32 vcc, v2, v5 -; GFX9-NEXT: v_cndmask_b32_e32 v9, v5, v2, vcc +; GFX9-NEXT: v_max_f32_e32 v9, v2, v5 ; GFX9-NEXT: v_mov_b32_e32 v10, 0x7fc00000 ; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v2, v5 -; GFX9-NEXT: v_cndmask_b32_e32 v9, v10, v9, vcc -; GFX9-NEXT: v_cmp_class_f32_e64 vcc, v2, 64 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v9, v2, vcc -; GFX9-NEXT: v_cmp_class_f32_e64 vcc, v5, 64 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v5, vcc -; GFX9-NEXT: v_cmp_eq_f32_e32 vcc, 0, v9 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v9, v2, vcc -; GFX9-NEXT: v_cmp_gt_f32_e32 vcc, v1, v4 -; GFX9-NEXT: v_cndmask_b32_e32 v5, v4, v1, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v2, v10, v9, vcc +; GFX9-NEXT: v_max_f32_e32 v5, v1, v4 ; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v1, v4 -; GFX9-NEXT: v_cndmask_b32_e32 v5, v10, v5, vcc -; GFX9-NEXT: v_cmp_class_f32_e64 vcc, v1, 64 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc -; GFX9-NEXT: v_cmp_class_f32_e64 vcc, v4, 64 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc -; 
GFX9-NEXT: v_cmp_eq_f32_e32 vcc, 0, v5 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc -; GFX9-NEXT: v_cmp_gt_f32_e32 vcc, v0, v3 -; GFX9-NEXT: v_cndmask_b32_e32 v4, v3, v0, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v1, v10, v5, vcc +; GFX9-NEXT: v_max_f32_e32 v4, v0, v3 ; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v3 -; GFX9-NEXT: v_cndmask_b32_e32 v4, v10, v4, vcc -; GFX9-NEXT: v_cmp_class_f32_e64 vcc, v0, 64 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc -; GFX9-NEXT: v_cmp_class_f32_e64 vcc, v3, 64 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc -; GFX9-NEXT: v_cmp_eq_f32_e32 vcc, 0, v4 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc -; GFX9-NEXT: v_cmp_gt_f32_e32 vcc, v0, v6 -; GFX9-NEXT: v_cndmask_b32_e32 v3, v6, v0, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v0, v10, v4, vcc +; GFX9-NEXT: v_max_f32_e32 v3, v0, v6 ; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v6 -; GFX9-NEXT: v_cndmask_b32_e32 v3, v10, v3, vcc -; GFX9-NEXT: v_cmp_class_f32_e64 vcc, v0, 64 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v0, vcc -; GFX9-NEXT: v_cmp_class_f32_e64 vcc, v6, 64 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v6, vcc -; GFX9-NEXT: v_cmp_eq_f32_e32 vcc, 0, v3 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v0, vcc -; GFX9-NEXT: v_cmp_gt_f32_e32 vcc, v1, v7 -; GFX9-NEXT: v_cndmask_b32_e32 v3, v7, v1, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v0, v10, v3, vcc +; GFX9-NEXT: v_max_f32_e32 v3, v1, v7 ; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v1, v7 -; GFX9-NEXT: v_cndmask_b32_e32 v3, v10, v3, vcc -; GFX9-NEXT: v_cmp_class_f32_e64 vcc, v1, 64 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc -; GFX9-NEXT: v_cmp_class_f32_e64 vcc, v7, 64 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v7, vcc -; GFX9-NEXT: v_cmp_eq_f32_e32 vcc, 0, v3 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc -; GFX9-NEXT: v_cmp_gt_f32_e32 vcc, v2, v8 -; GFX9-NEXT: v_cndmask_b32_e32 v3, v8, v2, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v1, v10, v3, vcc +; GFX9-NEXT: v_max_f32_e32 v3, v2, v8 ; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v2, v8 -; GFX9-NEXT: v_cndmask_b32_e32 v3, v10, v3, vcc -; 
GFX9-NEXT: v_cmp_class_f32_e64 vcc, v2, 64 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v2, vcc -; GFX9-NEXT: v_cmp_class_f32_e64 vcc, v8, 64 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v8, vcc -; GFX9-NEXT: v_cmp_eq_f32_e32 vcc, 0, v3 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v2, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v2, v10, v3, vcc ; GFX9-NEXT: s_setpc_b64 s[30:31] %max0 = call <3 x float> @llvm.maximum.v3f32(<3 x float> %a, <3 x float> %b) %max1 = call <3 x float> @llvm.maximum.v3f32(<3 x float> %max0, <3 x float> %c) @@ -1018,67 +790,25 @@ define <3 x float> @v_fmaximum3_v3f32__fabs_all(<3 x float> %a, <3 x float> %b, ; GFX9-LABEL: v_fmaximum3_v3f32__fabs_all: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_cmp_gt_f32_e64 vcc, |v2|, |v5| -; GFX9-NEXT: v_cndmask_b32_e32 v9, v5, v2, vcc +; GFX9-NEXT: v_max_f32_e64 v9, |v2|, |v5| ; GFX9-NEXT: v_mov_b32_e32 v10, 0x7fc00000 -; GFX9-NEXT: v_cmp_o_f32_e64 s[4:5], |v2|, |v5| -; GFX9-NEXT: v_cndmask_b32_e64 v9, v10, |v9|, s[4:5] -; GFX9-NEXT: v_cmp_class_f32_e64 s[4:5], |v2|, 64 -; GFX9-NEXT: v_cndmask_b32_e64 v2, v9, |v2|, s[4:5] -; GFX9-NEXT: v_cmp_class_f32_e64 s[4:5], |v5|, 64 -; GFX9-NEXT: v_cndmask_b32_e64 v2, v2, |v5|, s[4:5] -; GFX9-NEXT: v_cmp_eq_f32_e32 vcc, 0, v9 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v9, v2, vcc -; GFX9-NEXT: v_cmp_gt_f32_e64 vcc, |v1|, |v4| -; GFX9-NEXT: v_cndmask_b32_e32 v5, v4, v1, vcc -; GFX9-NEXT: v_cmp_o_f32_e64 s[4:5], |v1|, |v4| -; GFX9-NEXT: v_cndmask_b32_e64 v5, v10, |v5|, s[4:5] -; GFX9-NEXT: v_cmp_class_f32_e64 s[4:5], |v1|, 64 -; GFX9-NEXT: v_cndmask_b32_e64 v1, v5, |v1|, s[4:5] -; GFX9-NEXT: v_cmp_class_f32_e64 s[4:5], |v4|, 64 -; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, |v4|, s[4:5] -; GFX9-NEXT: v_cmp_eq_f32_e32 vcc, 0, v5 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc -; GFX9-NEXT: v_cmp_gt_f32_e64 vcc, |v0|, |v3| -; GFX9-NEXT: v_cndmask_b32_e32 v4, v3, v0, vcc -; GFX9-NEXT: v_cmp_o_f32_e64 s[4:5], |v0|, |v3| -; GFX9-NEXT: v_cndmask_b32_e64 v4, v10, |v4|, 
s[4:5] -; GFX9-NEXT: v_cmp_class_f32_e64 s[4:5], |v0|, 64 -; GFX9-NEXT: v_cndmask_b32_e64 v0, v4, |v0|, s[4:5] -; GFX9-NEXT: v_cmp_class_f32_e64 s[4:5], |v3|, 64 -; GFX9-NEXT: v_cndmask_b32_e64 v0, v0, |v3|, s[4:5] -; GFX9-NEXT: v_cmp_eq_f32_e32 vcc, 0, v4 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc -; GFX9-NEXT: v_cmp_gt_f32_e64 s[4:5], v0, |v6| -; GFX9-NEXT: v_cndmask_b32_e64 v3, |v6|, v0, s[4:5] +; GFX9-NEXT: v_cmp_o_f32_e64 vcc, |v2|, |v5| +; GFX9-NEXT: v_cndmask_b32_e32 v2, v10, v9, vcc +; GFX9-NEXT: v_max_f32_e64 v5, |v1|, |v4| +; GFX9-NEXT: v_cmp_o_f32_e64 vcc, |v1|, |v4| +; GFX9-NEXT: v_cndmask_b32_e32 v1, v10, v5, vcc +; GFX9-NEXT: v_max_f32_e64 v4, |v0|, |v3| +; GFX9-NEXT: v_cmp_o_f32_e64 vcc, |v0|, |v3| +; GFX9-NEXT: v_cndmask_b32_e32 v0, v10, v4, vcc +; GFX9-NEXT: v_max_f32_e64 v3, v0, |v6| ; GFX9-NEXT: v_cmp_o_f32_e64 vcc, v0, |v6| -; GFX9-NEXT: v_cndmask_b32_e32 v3, v10, v3, vcc -; GFX9-NEXT: v_cmp_class_f32_e64 vcc, v0, 64 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v0, vcc -; GFX9-NEXT: v_cmp_class_f32_e64 s[4:5], |v6|, 64 -; GFX9-NEXT: v_cndmask_b32_e64 v0, v0, |v6|, s[4:5] -; GFX9-NEXT: v_cmp_eq_f32_e32 vcc, 0, v3 -; GFX9-NEXT: v_cmp_gt_f32_e64 s[4:5], v1, |v7| -; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v0, vcc -; GFX9-NEXT: v_cndmask_b32_e64 v3, |v7|, v1, s[4:5] +; GFX9-NEXT: v_cndmask_b32_e32 v0, v10, v3, vcc +; GFX9-NEXT: v_max_f32_e64 v3, v1, |v7| ; GFX9-NEXT: v_cmp_o_f32_e64 vcc, v1, |v7| -; GFX9-NEXT: v_cndmask_b32_e32 v3, v10, v3, vcc -; GFX9-NEXT: v_cmp_class_f32_e64 vcc, v1, 64 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc -; GFX9-NEXT: v_cmp_class_f32_e64 s[4:5], |v7|, 64 -; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, |v7|, s[4:5] -; GFX9-NEXT: v_cmp_eq_f32_e32 vcc, 0, v3 -; GFX9-NEXT: v_cmp_gt_f32_e64 s[4:5], v2, |v8| -; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc -; GFX9-NEXT: v_cndmask_b32_e64 v3, |v8|, v2, s[4:5] +; GFX9-NEXT: v_cndmask_b32_e32 v1, v10, v3, vcc +; GFX9-NEXT: v_max_f32_e64 v3, v2, |v8| ; GFX9-NEXT: v_cmp_o_f32_e64 vcc, v2, 
|v8| -; GFX9-NEXT: v_cndmask_b32_e32 v3, v10, v3, vcc -; GFX9-NEXT: v_cmp_class_f32_e64 vcc, v2, 64 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v2, vcc -; GFX9-NEXT: v_cmp_class_f32_e64 s[4:5], |v8|, 64 -; GFX9-NEXT: v_cndmask_b32_e64 v2, v2, |v8|, s[4:5] -; GFX9-NEXT: v_cmp_eq_f32_e32 vcc, 0, v3 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v2, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v2, v10, v3, vcc ; GFX9-NEXT: s_setpc_b64 s[30:31] %a.fabs = call <3 x float> @llvm.fabs.v3f32(<3 x float> %a) %b.fabs = call <3 x float> @llvm.fabs.v3f32(<3 x float> %b) @@ -1104,67 +834,25 @@ define <3 x float> @v_fmaximum3_v3f32__fneg_all(<3 x float> %a, <3 x float> %b, ; GFX9-LABEL: v_fmaximum3_v3f32__fneg_all: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_cmp_gt_f32_e64 vcc, -v2, -v5 -; GFX9-NEXT: v_cndmask_b32_e32 v9, v5, v2, vcc +; GFX9-NEXT: v_max_f32_e64 v9, -v2, -v5 ; GFX9-NEXT: v_mov_b32_e32 v10, 0x7fc00000 -; GFX9-NEXT: v_cmp_o_f32_e64 s[4:5], -v2, -v5 -; GFX9-NEXT: v_cndmask_b32_e64 v9, v10, -v9, s[4:5] -; GFX9-NEXT: v_cmp_class_f32_e64 s[4:5], -v2, 64 -; GFX9-NEXT: v_cndmask_b32_e64 v2, v9, -v2, s[4:5] -; GFX9-NEXT: v_cmp_class_f32_e64 s[4:5], -v5, 64 -; GFX9-NEXT: v_cndmask_b32_e64 v2, v2, -v5, s[4:5] -; GFX9-NEXT: v_cmp_eq_f32_e32 vcc, 0, v9 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v9, v2, vcc -; GFX9-NEXT: v_cmp_gt_f32_e64 vcc, -v1, -v4 -; GFX9-NEXT: v_cndmask_b32_e32 v5, v4, v1, vcc -; GFX9-NEXT: v_cmp_o_f32_e64 s[4:5], -v1, -v4 -; GFX9-NEXT: v_cndmask_b32_e64 v5, v10, -v5, s[4:5] -; GFX9-NEXT: v_cmp_class_f32_e64 s[4:5], -v1, 64 -; GFX9-NEXT: v_cndmask_b32_e64 v1, v5, -v1, s[4:5] -; GFX9-NEXT: v_cmp_class_f32_e64 s[4:5], -v4, 64 -; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, -v4, s[4:5] -; GFX9-NEXT: v_cmp_eq_f32_e32 vcc, 0, v5 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc -; GFX9-NEXT: v_cmp_gt_f32_e64 vcc, -v0, -v3 -; GFX9-NEXT: v_cndmask_b32_e32 v4, v3, v0, vcc -; GFX9-NEXT: v_cmp_o_f32_e64 s[4:5], -v0, -v3 -; GFX9-NEXT: v_cndmask_b32_e64 v4, v10, 
-v4, s[4:5] -; GFX9-NEXT: v_cmp_class_f32_e64 s[4:5], -v0, 64 -; GFX9-NEXT: v_cndmask_b32_e64 v0, v4, -v0, s[4:5] -; GFX9-NEXT: v_cmp_class_f32_e64 s[4:5], -v3, 64 -; GFX9-NEXT: v_cndmask_b32_e64 v0, v0, -v3, s[4:5] -; GFX9-NEXT: v_cmp_eq_f32_e32 vcc, 0, v4 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc -; GFX9-NEXT: v_cmp_gt_f32_e64 s[4:5], v0, -v6 -; GFX9-NEXT: v_cndmask_b32_e64 v3, -v6, v0, s[4:5] +; GFX9-NEXT: v_cmp_o_f32_e64 vcc, -v2, -v5 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v10, v9, vcc +; GFX9-NEXT: v_max_f32_e64 v5, -v1, -v4 +; GFX9-NEXT: v_cmp_o_f32_e64 vcc, -v1, -v4 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v10, v5, vcc +; GFX9-NEXT: v_max_f32_e64 v4, -v0, -v3 +; GFX9-NEXT: v_cmp_o_f32_e64 vcc, -v0, -v3 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v10, v4, vcc +; GFX9-NEXT: v_max_f32_e64 v3, v0, -v6 ; GFX9-NEXT: v_cmp_o_f32_e64 vcc, v0, -v6 -; GFX9-NEXT: v_cndmask_b32_e32 v3, v10, v3, vcc -; GFX9-NEXT: v_cmp_class_f32_e64 vcc, v0, 64 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v0, vcc -; GFX9-NEXT: v_cmp_class_f32_e64 s[4:5], -v6, 64 -; GFX9-NEXT: v_cndmask_b32_e64 v0, v0, -v6, s[4:5] -; GFX9-NEXT: v_cmp_eq_f32_e32 vcc, 0, v3 -; GFX9-NEXT: v_cmp_gt_f32_e64 s[4:5], v1, -v7 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v0, vcc -; GFX9-NEXT: v_cndmask_b32_e64 v3, -v7, v1, s[4:5] +; GFX9-NEXT: v_cndmask_b32_e32 v0, v10, v3, vcc +; GFX9-NEXT: v_max_f32_e64 v3, v1, -v7 ; GFX9-NEXT: v_cmp_o_f32_e64 vcc, v1, -v7 -; GFX9-NEXT: v_cndmask_b32_e32 v3, v10, v3, vcc -; GFX9-NEXT: v_cmp_class_f32_e64 vcc, v1, 64 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc -; GFX9-NEXT: v_cmp_class_f32_e64 s[4:5], -v7, 64 -; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, -v7, s[4:5] -; GFX9-NEXT: v_cmp_eq_f32_e32 vcc, 0, v3 -; GFX9-NEXT: v_cmp_gt_f32_e64 s[4:5], v2, -v8 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc -; GFX9-NEXT: v_cndmask_b32_e64 v3, -v8, v2, s[4:5] +; GFX9-NEXT: v_cndmask_b32_e32 v1, v10, v3, vcc +; GFX9-NEXT: v_max_f32_e64 v3, v2, -v8 ; GFX9-NEXT: v_cmp_o_f32_e64 vcc, v2, -v8 -; GFX9-NEXT: 
v_cndmask_b32_e32 v3, v10, v3, vcc -; GFX9-NEXT: v_cmp_class_f32_e64 vcc, v2, 64 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v2, vcc -; GFX9-NEXT: v_cmp_class_f32_e64 s[4:5], -v8, 64 -; GFX9-NEXT: v_cndmask_b32_e64 v2, v2, -v8, s[4:5] -; GFX9-NEXT: v_cmp_eq_f32_e32 vcc, 0, v3 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v2, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v2, v10, v3, vcc ; GFX9-NEXT: s_setpc_b64 s[30:31] %a.fneg = fneg <3 x float> %a %b.fneg = fneg <3 x float> %b @@ -1190,49 +878,25 @@ define <3 x float> @v_fmaximum3_v3f32__inlineimm1(<3 x float> %a, <3 x float> %c ; GFX9-LABEL: v_fmaximum3_v3f32__inlineimm1: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_cmp_lt_f32_e32 vcc, 2.0, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v6, 2.0, v2, vcc +; GFX9-NEXT: v_max_f32_e32 v6, 2.0, v2 ; GFX9-NEXT: v_mov_b32_e32 v7, 0x7fc00000 ; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v2, v2 ; GFX9-NEXT: v_cndmask_b32_e32 v2, v7, v6, vcc -; GFX9-NEXT: v_cmp_lt_f32_e32 vcc, 2.0, v1 -; GFX9-NEXT: v_cndmask_b32_e32 v6, 2.0, v1, vcc +; GFX9-NEXT: v_max_f32_e32 v6, 2.0, v1 ; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v1, v1 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v7, v6, vcc -; GFX9-NEXT: v_cmp_lt_f32_e32 vcc, 2.0, v0 -; GFX9-NEXT: v_cndmask_b32_e32 v6, 2.0, v0, vcc +; GFX9-NEXT: v_max_f32_e32 v6, 2.0, v0 ; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v0 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v7, v6, vcc -; GFX9-NEXT: v_cmp_gt_f32_e32 vcc, v0, v3 -; GFX9-NEXT: v_cndmask_b32_e32 v6, v3, v0, vcc +; GFX9-NEXT: v_max_f32_e32 v6, v0, v3 ; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v3 -; GFX9-NEXT: v_cndmask_b32_e32 v6, v7, v6, vcc -; GFX9-NEXT: v_cmp_class_f32_e64 vcc, v0, 64 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v6, v0, vcc -; GFX9-NEXT: v_cmp_class_f32_e64 vcc, v3, 64 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc -; GFX9-NEXT: v_cmp_eq_f32_e32 vcc, 0, v6 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v6, v0, vcc -; GFX9-NEXT: v_cmp_gt_f32_e32 vcc, v1, v4 -; GFX9-NEXT: v_cndmask_b32_e32 v3, v4, v1, vcc +; GFX9-NEXT: 
v_cndmask_b32_e32 v0, v7, v6, vcc +; GFX9-NEXT: v_max_f32_e32 v3, v1, v4 ; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v1, v4 -; GFX9-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc -; GFX9-NEXT: v_cmp_class_f32_e64 vcc, v1, 64 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc -; GFX9-NEXT: v_cmp_class_f32_e64 vcc, v4, 64 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc -; GFX9-NEXT: v_cmp_eq_f32_e32 vcc, 0, v3 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc -; GFX9-NEXT: v_cmp_gt_f32_e32 vcc, v2, v5 -; GFX9-NEXT: v_cndmask_b32_e32 v3, v5, v2, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v1, v7, v3, vcc +; GFX9-NEXT: v_max_f32_e32 v3, v2, v5 ; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v2, v5 -; GFX9-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc -; GFX9-NEXT: v_cmp_class_f32_e64 vcc, v2, 64 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v2, vcc -; GFX9-NEXT: v_cmp_class_f32_e64 vcc, v5, 64 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v5, vcc -; GFX9-NEXT: v_cmp_eq_f32_e32 vcc, 0, v3 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v2, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v2, v7, v3, vcc ; GFX9-NEXT: s_setpc_b64 s[30:31] %max0 = call <3 x float> @llvm.maximum.v3f32(<3 x float> %a, <3 x float> <float 2.0, float 2.0, float 2.0>) %max1 = call <3 x float> @llvm.maximum.v3f32(<3 x float> %max0, <3 x float> %c) @@ -1255,47 +919,23 @@ define <3 x float> @v_fmaximum3_v3f32__inlineimm2(<3 x float> %a, <3 x float> %b ; GFX9-LABEL: v_fmaximum3_v3f32__inlineimm2: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_cmp_gt_f32_e32 vcc, v2, v5 -; GFX9-NEXT: v_cndmask_b32_e32 v6, v5, v2, vcc +; GFX9-NEXT: v_max_f32_e32 v6, v2, v5 ; GFX9-NEXT: v_mov_b32_e32 v7, 0x7fc00000 ; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v2, v5 -; GFX9-NEXT: v_cndmask_b32_e32 v6, v7, v6, vcc -; GFX9-NEXT: v_cmp_class_f32_e64 vcc, v2, 64 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc -; GFX9-NEXT: v_cmp_class_f32_e64 vcc, v5, 64 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v5, vcc -; GFX9-NEXT: v_cmp_eq_f32_e32 vcc, 0, v6 -; GFX9-NEXT: v_cndmask_b32_e32 
v2, v6, v2, vcc -; GFX9-NEXT: v_cmp_gt_f32_e32 vcc, v1, v4 -; GFX9-NEXT: v_cndmask_b32_e32 v5, v4, v1, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v2, v7, v6, vcc +; GFX9-NEXT: v_max_f32_e32 v5, v1, v4 ; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v1, v4 -; GFX9-NEXT: v_cndmask_b32_e32 v5, v7, v5, vcc -; GFX9-NEXT: v_cmp_class_f32_e64 vcc, v1, 64 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc -; GFX9-NEXT: v_cmp_class_f32_e64 vcc, v4, 64 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc -; GFX9-NEXT: v_cmp_eq_f32_e32 vcc, 0, v5 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc -; GFX9-NEXT: v_cmp_gt_f32_e32 vcc, v0, v3 -; GFX9-NEXT: v_cndmask_b32_e32 v4, v3, v0, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v1, v7, v5, vcc +; GFX9-NEXT: v_max_f32_e32 v4, v0, v3 ; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v3 -; GFX9-NEXT: v_cndmask_b32_e32 v4, v7, v4, vcc -; GFX9-NEXT: v_cmp_class_f32_e64 vcc, v0, 64 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc -; GFX9-NEXT: v_cmp_class_f32_e64 vcc, v3, 64 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc -; GFX9-NEXT: v_cmp_eq_f32_e32 vcc, 0, v4 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc -; GFX9-NEXT: v_cmp_lt_f32_e32 vcc, 4.0, v0 -; GFX9-NEXT: v_cndmask_b32_e32 v3, 4.0, v0, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v0, v7, v4, vcc +; GFX9-NEXT: v_max_f32_e32 v3, 4.0, v0 ; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v0 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v7, v3, vcc -; GFX9-NEXT: v_cmp_lt_f32_e32 vcc, 4.0, v1 -; GFX9-NEXT: v_cndmask_b32_e32 v3, 4.0, v1, vcc +; GFX9-NEXT: v_max_f32_e32 v3, 4.0, v1 ; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v1, v1 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v7, v3, vcc -; GFX9-NEXT: v_cmp_lt_f32_e32 vcc, 4.0, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v3, 4.0, v2, vcc +; GFX9-NEXT: v_max_f32_e32 v3, 4.0, v2 ; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v2, v2 ; GFX9-NEXT: v_cndmask_b32_e32 v2, v7, v3, vcc ; GFX9-NEXT: s_setpc_b64 s[30:31] diff --git a/llvm/test/CodeGen/AMDGPU/fminimum3.ll b/llvm/test/CodeGen/AMDGPU/fminimum3.ll index 0e0b73b..21074d5 100644 --- 
a/llvm/test/CodeGen/AMDGPU/fminimum3.ll +++ b/llvm/test/CodeGen/AMDGPU/fminimum3.ll @@ -497,47 +497,19 @@ define <2 x float> @v_fminimum3_v2f32(<2 x float> %a, <2 x float> %b, <2 x float ; GFX9-LABEL: v_fminimum3_v2f32: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_cmp_lt_f32_e32 vcc, v1, v3 -; GFX9-NEXT: v_cndmask_b32_e32 v6, v3, v1, vcc +; GFX9-NEXT: v_min_f32_e32 v6, v1, v3 ; GFX9-NEXT: v_mov_b32_e32 v7, 0x7fc00000 ; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v1, v3 -; GFX9-NEXT: v_cndmask_b32_e32 v6, v7, v6, vcc -; GFX9-NEXT: v_cmp_class_f32_e64 vcc, v1, 32 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v6, v1, vcc -; GFX9-NEXT: v_cmp_class_f32_e64 vcc, v3, 32 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc -; GFX9-NEXT: v_cmp_eq_f32_e32 vcc, 0, v6 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v6, v1, vcc -; GFX9-NEXT: v_cmp_lt_f32_e32 vcc, v0, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v3, v2, v0, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v1, v7, v6, vcc +; GFX9-NEXT: v_min_f32_e32 v3, v0, v2 ; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc -; GFX9-NEXT: v_cmp_class_f32_e64 vcc, v0, 32 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v0, vcc -; GFX9-NEXT: v_cmp_class_f32_e64 vcc, v2, 32 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc -; GFX9-NEXT: v_cmp_eq_f32_e32 vcc, 0, v3 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v0, vcc -; GFX9-NEXT: v_cmp_lt_f32_e32 vcc, v4, v0 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v0, v4, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v0, v7, v3, vcc +; GFX9-NEXT: v_min_f32_e32 v2, v4, v0 ; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v4, v0 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v7, v2, vcc -; GFX9-NEXT: v_cmp_class_f32_e64 vcc, v4, 32 -; GFX9-NEXT: v_cndmask_b32_e32 v3, v2, v4, vcc -; GFX9-NEXT: v_cmp_class_f32_e64 vcc, v0, 32 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v0, vcc -; GFX9-NEXT: v_cmp_eq_f32_e32 vcc, 0, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc -; GFX9-NEXT: v_cmp_lt_f32_e32 vcc, v5, v1 -; GFX9-NEXT: v_cndmask_b32_e32 v2, 
v1, v5, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc +; GFX9-NEXT: v_min_f32_e32 v2, v5, v1 ; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v5, v1 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v7, v2, vcc -; GFX9-NEXT: v_cmp_class_f32_e64 vcc, v5, 32 -; GFX9-NEXT: v_cndmask_b32_e32 v3, v2, v5, vcc -; GFX9-NEXT: v_cmp_class_f32_e64 vcc, v1, 32 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc -; GFX9-NEXT: v_cmp_eq_f32_e32 vcc, 0, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v1, v7, v2, vcc ; GFX9-NEXT: s_setpc_b64 s[30:31] %max0 = call <2 x float> @llvm.minimum.v2f32(<2 x float> %a, <2 x float> %b) %max1 = call <2 x float> @llvm.minimum.v2f32(<2 x float> %c, <2 x float> %max0) @@ -559,47 +531,19 @@ define <2 x float> @v_fminimum3_v2f32_commute(<2 x float> %a, <2 x float> %b, <2 ; GFX9-LABEL: v_fminimum3_v2f32_commute: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_cmp_lt_f32_e32 vcc, v1, v3 -; GFX9-NEXT: v_cndmask_b32_e32 v6, v3, v1, vcc +; GFX9-NEXT: v_min_f32_e32 v6, v1, v3 ; GFX9-NEXT: v_mov_b32_e32 v7, 0x7fc00000 ; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v1, v3 -; GFX9-NEXT: v_cndmask_b32_e32 v6, v7, v6, vcc -; GFX9-NEXT: v_cmp_class_f32_e64 vcc, v1, 32 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v6, v1, vcc -; GFX9-NEXT: v_cmp_class_f32_e64 vcc, v3, 32 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc -; GFX9-NEXT: v_cmp_eq_f32_e32 vcc, 0, v6 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v6, v1, vcc -; GFX9-NEXT: v_cmp_lt_f32_e32 vcc, v0, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v3, v2, v0, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v1, v7, v6, vcc +; GFX9-NEXT: v_min_f32_e32 v3, v0, v2 ; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc -; GFX9-NEXT: v_cmp_class_f32_e64 vcc, v0, 32 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v0, vcc -; GFX9-NEXT: v_cmp_class_f32_e64 vcc, v2, 32 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc -; GFX9-NEXT: v_cmp_eq_f32_e32 vcc, 0, v3 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v0, 
vcc -; GFX9-NEXT: v_cmp_lt_f32_e32 vcc, v0, v4 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v4, v0, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v0, v7, v3, vcc +; GFX9-NEXT: v_min_f32_e32 v2, v0, v4 ; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v4 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v7, v2, vcc -; GFX9-NEXT: v_cmp_class_f32_e64 vcc, v0, 32 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc -; GFX9-NEXT: v_cmp_class_f32_e64 vcc, v4, 32 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc -; GFX9-NEXT: v_cmp_eq_f32_e32 vcc, 0, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc -; GFX9-NEXT: v_cmp_lt_f32_e32 vcc, v1, v5 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v5, v1, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc +; GFX9-NEXT: v_min_f32_e32 v2, v1, v5 ; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v1, v5 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v7, v2, vcc -; GFX9-NEXT: v_cmp_class_f32_e64 vcc, v1, 32 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc -; GFX9-NEXT: v_cmp_class_f32_e64 vcc, v5, 32 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc -; GFX9-NEXT: v_cmp_eq_f32_e32 vcc, 0, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v1, v7, v2, vcc ; GFX9-NEXT: s_setpc_b64 s[30:31] %max0 = call <2 x float> @llvm.minimum.v2f32(<2 x float> %a, <2 x float> %b) %max1 = call <2 x float> @llvm.minimum.v2f32(<2 x float> %max0, <2 x float> %c) @@ -621,47 +565,19 @@ define <2 x float> @v_fminimum3_v2f32__fabs_all(<2 x float> %a, <2 x float> %b, ; GFX9-LABEL: v_fminimum3_v2f32__fabs_all: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_cmp_lt_f32_e64 vcc, |v1|, |v3| -; GFX9-NEXT: v_cndmask_b32_e32 v6, v3, v1, vcc +; GFX9-NEXT: v_min_f32_e64 v6, |v1|, |v3| ; GFX9-NEXT: v_mov_b32_e32 v7, 0x7fc00000 -; GFX9-NEXT: v_cmp_o_f32_e64 s[4:5], |v1|, |v3| -; GFX9-NEXT: v_cndmask_b32_e64 v6, v7, |v6|, s[4:5] -; GFX9-NEXT: v_cmp_class_f32_e64 s[4:5], |v1|, 32 -; GFX9-NEXT: v_cndmask_b32_e64 v1, v6, |v1|, s[4:5] -; GFX9-NEXT: v_cmp_class_f32_e64 s[4:5], |v3|, 32 -; GFX9-NEXT: 
v_cndmask_b32_e64 v1, v1, |v3|, s[4:5] -; GFX9-NEXT: v_cmp_eq_f32_e32 vcc, 0, v6 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v6, v1, vcc -; GFX9-NEXT: v_cmp_lt_f32_e64 vcc, |v0|, |v2| -; GFX9-NEXT: v_cndmask_b32_e32 v3, v2, v0, vcc -; GFX9-NEXT: v_cmp_o_f32_e64 s[4:5], |v0|, |v2| -; GFX9-NEXT: v_cndmask_b32_e64 v3, v7, |v3|, s[4:5] -; GFX9-NEXT: v_cmp_class_f32_e64 s[4:5], |v0|, 32 -; GFX9-NEXT: v_cndmask_b32_e64 v0, v3, |v0|, s[4:5] -; GFX9-NEXT: v_cmp_class_f32_e64 s[4:5], |v2|, 32 -; GFX9-NEXT: v_cndmask_b32_e64 v0, v0, |v2|, s[4:5] -; GFX9-NEXT: v_cmp_eq_f32_e32 vcc, 0, v3 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v0, vcc -; GFX9-NEXT: v_cmp_lt_f32_e64 s[4:5], v0, |v4| -; GFX9-NEXT: v_cndmask_b32_e64 v2, |v4|, v0, s[4:5] +; GFX9-NEXT: v_cmp_o_f32_e64 vcc, |v1|, |v3| +; GFX9-NEXT: v_cndmask_b32_e32 v1, v7, v6, vcc +; GFX9-NEXT: v_min_f32_e64 v3, |v0|, |v2| +; GFX9-NEXT: v_cmp_o_f32_e64 vcc, |v0|, |v2| +; GFX9-NEXT: v_cndmask_b32_e32 v0, v7, v3, vcc +; GFX9-NEXT: v_min_f32_e64 v2, v0, |v4| ; GFX9-NEXT: v_cmp_o_f32_e64 vcc, v0, |v4| -; GFX9-NEXT: v_cndmask_b32_e32 v2, v7, v2, vcc -; GFX9-NEXT: v_cmp_class_f32_e64 vcc, v0, 32 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc -; GFX9-NEXT: v_cmp_class_f32_e64 s[4:5], |v4|, 32 -; GFX9-NEXT: v_cndmask_b32_e64 v0, v0, |v4|, s[4:5] -; GFX9-NEXT: v_cmp_eq_f32_e32 vcc, 0, v2 -; GFX9-NEXT: v_cmp_lt_f32_e64 s[4:5], v1, |v5| -; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc -; GFX9-NEXT: v_cndmask_b32_e64 v2, |v5|, v1, s[4:5] +; GFX9-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc +; GFX9-NEXT: v_min_f32_e64 v2, v1, |v5| ; GFX9-NEXT: v_cmp_o_f32_e64 vcc, v1, |v5| -; GFX9-NEXT: v_cndmask_b32_e32 v2, v7, v2, vcc -; GFX9-NEXT: v_cmp_class_f32_e64 vcc, v1, 32 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc -; GFX9-NEXT: v_cmp_class_f32_e64 s[4:5], |v5|, 32 -; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, |v5|, s[4:5] -; GFX9-NEXT: v_cmp_eq_f32_e32 vcc, 0, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v1, v7, v2, vcc ; 
GFX9-NEXT: s_setpc_b64 s[30:31] %a.fabs = call <2 x float> @llvm.fabs.v2f32(<2 x float> %a) %b.fabs = call <2 x float> @llvm.fabs.v2f32(<2 x float> %b) @@ -686,47 +602,19 @@ define <2 x float> @v_fminimum3_v2f32__fneg_all(<2 x float> %a, <2 x float> %b, ; GFX9-LABEL: v_fminimum3_v2f32__fneg_all: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_cmp_lt_f32_e64 vcc, -v1, -v3 -; GFX9-NEXT: v_cndmask_b32_e32 v6, v3, v1, vcc +; GFX9-NEXT: v_min_f32_e64 v6, -v1, -v3 ; GFX9-NEXT: v_mov_b32_e32 v7, 0x7fc00000 -; GFX9-NEXT: v_cmp_o_f32_e64 s[4:5], -v1, -v3 -; GFX9-NEXT: v_cndmask_b32_e64 v6, v7, -v6, s[4:5] -; GFX9-NEXT: v_cmp_class_f32_e64 s[4:5], -v1, 32 -; GFX9-NEXT: v_cndmask_b32_e64 v1, v6, -v1, s[4:5] -; GFX9-NEXT: v_cmp_class_f32_e64 s[4:5], -v3, 32 -; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, -v3, s[4:5] -; GFX9-NEXT: v_cmp_eq_f32_e32 vcc, 0, v6 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v6, v1, vcc -; GFX9-NEXT: v_cmp_lt_f32_e64 vcc, -v0, -v2 -; GFX9-NEXT: v_cndmask_b32_e32 v3, v2, v0, vcc -; GFX9-NEXT: v_cmp_o_f32_e64 s[4:5], -v0, -v2 -; GFX9-NEXT: v_cndmask_b32_e64 v3, v7, -v3, s[4:5] -; GFX9-NEXT: v_cmp_class_f32_e64 s[4:5], -v0, 32 -; GFX9-NEXT: v_cndmask_b32_e64 v0, v3, -v0, s[4:5] -; GFX9-NEXT: v_cmp_class_f32_e64 s[4:5], -v2, 32 -; GFX9-NEXT: v_cndmask_b32_e64 v0, v0, -v2, s[4:5] -; GFX9-NEXT: v_cmp_eq_f32_e32 vcc, 0, v3 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v0, vcc -; GFX9-NEXT: v_cmp_lt_f32_e64 s[4:5], v0, -v4 -; GFX9-NEXT: v_cndmask_b32_e64 v2, -v4, v0, s[4:5] +; GFX9-NEXT: v_cmp_o_f32_e64 vcc, -v1, -v3 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v7, v6, vcc +; GFX9-NEXT: v_min_f32_e64 v3, -v0, -v2 +; GFX9-NEXT: v_cmp_o_f32_e64 vcc, -v0, -v2 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v7, v3, vcc +; GFX9-NEXT: v_min_f32_e64 v2, v0, -v4 ; GFX9-NEXT: v_cmp_o_f32_e64 vcc, v0, -v4 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v7, v2, vcc -; GFX9-NEXT: v_cmp_class_f32_e64 vcc, v0, 32 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc -; GFX9-NEXT: 
v_cmp_class_f32_e64 s[4:5], -v4, 32 -; GFX9-NEXT: v_cndmask_b32_e64 v0, v0, -v4, s[4:5] -; GFX9-NEXT: v_cmp_eq_f32_e32 vcc, 0, v2 -; GFX9-NEXT: v_cmp_lt_f32_e64 s[4:5], v1, -v5 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc -; GFX9-NEXT: v_cndmask_b32_e64 v2, -v5, v1, s[4:5] +; GFX9-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc +; GFX9-NEXT: v_min_f32_e64 v2, v1, -v5 ; GFX9-NEXT: v_cmp_o_f32_e64 vcc, v1, -v5 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v7, v2, vcc -; GFX9-NEXT: v_cmp_class_f32_e64 vcc, v1, 32 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc -; GFX9-NEXT: v_cmp_class_f32_e64 s[4:5], -v5, 32 -; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, -v5, s[4:5] -; GFX9-NEXT: v_cmp_eq_f32_e32 vcc, 0, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v1, v7, v2, vcc ; GFX9-NEXT: s_setpc_b64 s[30:31] %a.fneg = fneg <2 x float> %a %b.fneg = fneg <2 x float> %b @@ -751,35 +639,19 @@ define <2 x float> @v_fminimum3_v2f32__inlineimm1(<2 x float> %a, <2 x float> %c ; GFX9-LABEL: v_fminimum3_v2f32__inlineimm1: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_cmp_gt_f32_e32 vcc, 2.0, v1 -; GFX9-NEXT: v_cndmask_b32_e32 v4, 2.0, v1, vcc +; GFX9-NEXT: v_min_f32_e32 v4, 2.0, v1 ; GFX9-NEXT: v_mov_b32_e32 v5, 0x7fc00000 ; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v1, v1 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v4, vcc -; GFX9-NEXT: v_cmp_gt_f32_e32 vcc, 2.0, v0 -; GFX9-NEXT: v_cndmask_b32_e32 v4, 2.0, v0, vcc +; GFX9-NEXT: v_min_f32_e32 v4, 2.0, v0 ; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v0 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v5, v4, vcc -; GFX9-NEXT: v_cmp_lt_f32_e32 vcc, v0, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v4, v2, v0, vcc +; GFX9-NEXT: v_min_f32_e32 v4, v0, v2 ; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v4, v5, v4, vcc -; GFX9-NEXT: v_cmp_class_f32_e64 vcc, v0, 32 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc -; GFX9-NEXT: v_cmp_class_f32_e64 vcc, v2, 32 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc -; 
GFX9-NEXT: v_cmp_eq_f32_e32 vcc, 0, v4 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc -; GFX9-NEXT: v_cmp_lt_f32_e32 vcc, v1, v3 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v1, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v0, v5, v4, vcc +; GFX9-NEXT: v_min_f32_e32 v2, v1, v3 ; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v1, v3 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v5, v2, vcc -; GFX9-NEXT: v_cmp_class_f32_e64 vcc, v1, 32 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc -; GFX9-NEXT: v_cmp_class_f32_e64 vcc, v3, 32 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc -; GFX9-NEXT: v_cmp_eq_f32_e32 vcc, 0, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v2, vcc ; GFX9-NEXT: s_setpc_b64 s[30:31] %max0 = call <2 x float> @llvm.minimum.v2f32(<2 x float> %a, <2 x float> <float 2.0, float 2.0>) %max1 = call <2 x float> @llvm.minimum.v2f32(<2 x float> %max0, <2 x float> %c) @@ -801,33 +673,17 @@ define <2 x float> @v_fminimum3_v2f32__inlineimm2(<2 x float> %a, <2 x float> %b ; GFX9-LABEL: v_fminimum3_v2f32__inlineimm2: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_cmp_lt_f32_e32 vcc, v1, v3 -; GFX9-NEXT: v_cndmask_b32_e32 v4, v3, v1, vcc +; GFX9-NEXT: v_min_f32_e32 v4, v1, v3 ; GFX9-NEXT: v_mov_b32_e32 v5, 0x7fc00000 ; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v1, v3 -; GFX9-NEXT: v_cndmask_b32_e32 v4, v5, v4, vcc -; GFX9-NEXT: v_cmp_class_f32_e64 vcc, v1, 32 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc -; GFX9-NEXT: v_cmp_class_f32_e64 vcc, v3, 32 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc -; GFX9-NEXT: v_cmp_eq_f32_e32 vcc, 0, v4 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc -; GFX9-NEXT: v_cmp_lt_f32_e32 vcc, v0, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v3, v2, v0, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v4, vcc +; GFX9-NEXT: v_min_f32_e32 v3, v0, v2 ; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc -; GFX9-NEXT: v_cmp_class_f32_e64 vcc, v0, 32 -; GFX9-NEXT: v_cndmask_b32_e32 v0, 
v3, v0, vcc -; GFX9-NEXT: v_cmp_class_f32_e64 vcc, v2, 32 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc -; GFX9-NEXT: v_cmp_eq_f32_e32 vcc, 0, v3 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v0, vcc -; GFX9-NEXT: v_cmp_gt_f32_e32 vcc, 4.0, v0 -; GFX9-NEXT: v_cndmask_b32_e32 v2, 4.0, v0, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v0, v5, v3, vcc +; GFX9-NEXT: v_min_f32_e32 v2, 4.0, v0 ; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v0 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v5, v2, vcc -; GFX9-NEXT: v_cmp_gt_f32_e32 vcc, 4.0, v1 -; GFX9-NEXT: v_cndmask_b32_e32 v2, 4.0, v1, vcc +; GFX9-NEXT: v_min_f32_e32 v2, 4.0, v1 ; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v1, v1 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v2, vcc ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -852,67 +708,25 @@ define <3 x float> @v_fminimum3_v3f32(<3 x float> %a, <3 x float> %b, <3 x float ; GFX9-LABEL: v_fminimum3_v3f32: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_cmp_lt_f32_e32 vcc, v2, v5 -; GFX9-NEXT: v_cndmask_b32_e32 v9, v5, v2, vcc +; GFX9-NEXT: v_min_f32_e32 v9, v2, v5 ; GFX9-NEXT: v_mov_b32_e32 v10, 0x7fc00000 ; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v2, v5 -; GFX9-NEXT: v_cndmask_b32_e32 v9, v10, v9, vcc -; GFX9-NEXT: v_cmp_class_f32_e64 vcc, v2, 32 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v9, v2, vcc -; GFX9-NEXT: v_cmp_class_f32_e64 vcc, v5, 32 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v5, vcc -; GFX9-NEXT: v_cmp_eq_f32_e32 vcc, 0, v9 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v9, v2, vcc -; GFX9-NEXT: v_cmp_lt_f32_e32 vcc, v1, v4 -; GFX9-NEXT: v_cndmask_b32_e32 v5, v4, v1, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v2, v10, v9, vcc +; GFX9-NEXT: v_min_f32_e32 v5, v1, v4 ; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v1, v4 -; GFX9-NEXT: v_cndmask_b32_e32 v5, v10, v5, vcc -; GFX9-NEXT: v_cmp_class_f32_e64 vcc, v1, 32 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc -; GFX9-NEXT: v_cmp_class_f32_e64 vcc, v4, 32 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc -; GFX9-NEXT: v_cmp_eq_f32_e32 vcc, 0, v5 -; GFX9-NEXT: v_cndmask_b32_e32 
v1, v5, v1, vcc -; GFX9-NEXT: v_cmp_lt_f32_e32 vcc, v0, v3 -; GFX9-NEXT: v_cndmask_b32_e32 v4, v3, v0, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v1, v10, v5, vcc +; GFX9-NEXT: v_min_f32_e32 v4, v0, v3 ; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v3 -; GFX9-NEXT: v_cndmask_b32_e32 v4, v10, v4, vcc -; GFX9-NEXT: v_cmp_class_f32_e64 vcc, v0, 32 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc -; GFX9-NEXT: v_cmp_class_f32_e64 vcc, v3, 32 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc -; GFX9-NEXT: v_cmp_eq_f32_e32 vcc, 0, v4 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc -; GFX9-NEXT: v_cmp_lt_f32_e32 vcc, v6, v0 -; GFX9-NEXT: v_cndmask_b32_e32 v3, v0, v6, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v0, v10, v4, vcc +; GFX9-NEXT: v_min_f32_e32 v3, v6, v0 ; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v6, v0 -; GFX9-NEXT: v_cndmask_b32_e32 v3, v10, v3, vcc -; GFX9-NEXT: v_cmp_class_f32_e64 vcc, v6, 32 -; GFX9-NEXT: v_cndmask_b32_e32 v4, v3, v6, vcc -; GFX9-NEXT: v_cmp_class_f32_e64 vcc, v0, 32 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc -; GFX9-NEXT: v_cmp_eq_f32_e32 vcc, 0, v3 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v0, vcc -; GFX9-NEXT: v_cmp_lt_f32_e32 vcc, v7, v1 -; GFX9-NEXT: v_cndmask_b32_e32 v3, v1, v7, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v0, v10, v3, vcc +; GFX9-NEXT: v_min_f32_e32 v3, v7, v1 ; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v7, v1 -; GFX9-NEXT: v_cndmask_b32_e32 v3, v10, v3, vcc -; GFX9-NEXT: v_cmp_class_f32_e64 vcc, v7, 32 -; GFX9-NEXT: v_cndmask_b32_e32 v4, v3, v7, vcc -; GFX9-NEXT: v_cmp_class_f32_e64 vcc, v1, 32 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc -; GFX9-NEXT: v_cmp_eq_f32_e32 vcc, 0, v3 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc -; GFX9-NEXT: v_cmp_lt_f32_e32 vcc, v8, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v3, v2, v8, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v1, v10, v3, vcc +; GFX9-NEXT: v_min_f32_e32 v3, v8, v2 ; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v8, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v3, v10, v3, vcc -; GFX9-NEXT: v_cmp_class_f32_e64 vcc, v8, 32 -; GFX9-NEXT: 
v_cndmask_b32_e32 v4, v3, v8, vcc -; GFX9-NEXT: v_cmp_class_f32_e64 vcc, v2, 32 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc -; GFX9-NEXT: v_cmp_eq_f32_e32 vcc, 0, v3 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v2, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v2, v10, v3, vcc ; GFX9-NEXT: s_setpc_b64 s[30:31] %max0 = call <3 x float> @llvm.minimum.v3f32(<3 x float> %a, <3 x float> %b) %max1 = call <3 x float> @llvm.minimum.v3f32(<3 x float> %c, <3 x float> %max0) @@ -935,67 +749,25 @@ define <3 x float> @v_fminimum3_v3f32_commute(<3 x float> %a, <3 x float> %b, <3 ; GFX9-LABEL: v_fminimum3_v3f32_commute: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_cmp_lt_f32_e32 vcc, v2, v5 -; GFX9-NEXT: v_cndmask_b32_e32 v9, v5, v2, vcc +; GFX9-NEXT: v_min_f32_e32 v9, v2, v5 ; GFX9-NEXT: v_mov_b32_e32 v10, 0x7fc00000 ; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v2, v5 -; GFX9-NEXT: v_cndmask_b32_e32 v9, v10, v9, vcc -; GFX9-NEXT: v_cmp_class_f32_e64 vcc, v2, 32 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v9, v2, vcc -; GFX9-NEXT: v_cmp_class_f32_e64 vcc, v5, 32 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v5, vcc -; GFX9-NEXT: v_cmp_eq_f32_e32 vcc, 0, v9 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v9, v2, vcc -; GFX9-NEXT: v_cmp_lt_f32_e32 vcc, v1, v4 -; GFX9-NEXT: v_cndmask_b32_e32 v5, v4, v1, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v2, v10, v9, vcc +; GFX9-NEXT: v_min_f32_e32 v5, v1, v4 ; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v1, v4 -; GFX9-NEXT: v_cndmask_b32_e32 v5, v10, v5, vcc -; GFX9-NEXT: v_cmp_class_f32_e64 vcc, v1, 32 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc -; GFX9-NEXT: v_cmp_class_f32_e64 vcc, v4, 32 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc -; GFX9-NEXT: v_cmp_eq_f32_e32 vcc, 0, v5 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc -; GFX9-NEXT: v_cmp_lt_f32_e32 vcc, v0, v3 -; GFX9-NEXT: v_cndmask_b32_e32 v4, v3, v0, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v1, v10, v5, vcc +; GFX9-NEXT: v_min_f32_e32 v4, v0, v3 ; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v3 -; GFX9-NEXT: 
v_cndmask_b32_e32 v4, v10, v4, vcc -; GFX9-NEXT: v_cmp_class_f32_e64 vcc, v0, 32 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc -; GFX9-NEXT: v_cmp_class_f32_e64 vcc, v3, 32 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc -; GFX9-NEXT: v_cmp_eq_f32_e32 vcc, 0, v4 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc -; GFX9-NEXT: v_cmp_lt_f32_e32 vcc, v0, v6 -; GFX9-NEXT: v_cndmask_b32_e32 v3, v6, v0, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v0, v10, v4, vcc +; GFX9-NEXT: v_min_f32_e32 v3, v0, v6 ; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v6 -; GFX9-NEXT: v_cndmask_b32_e32 v3, v10, v3, vcc -; GFX9-NEXT: v_cmp_class_f32_e64 vcc, v0, 32 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v0, vcc -; GFX9-NEXT: v_cmp_class_f32_e64 vcc, v6, 32 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v6, vcc -; GFX9-NEXT: v_cmp_eq_f32_e32 vcc, 0, v3 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v0, vcc -; GFX9-NEXT: v_cmp_lt_f32_e32 vcc, v1, v7 -; GFX9-NEXT: v_cndmask_b32_e32 v3, v7, v1, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v0, v10, v3, vcc +; GFX9-NEXT: v_min_f32_e32 v3, v1, v7 ; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v1, v7 -; GFX9-NEXT: v_cndmask_b32_e32 v3, v10, v3, vcc -; GFX9-NEXT: v_cmp_class_f32_e64 vcc, v1, 32 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc -; GFX9-NEXT: v_cmp_class_f32_e64 vcc, v7, 32 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v7, vcc -; GFX9-NEXT: v_cmp_eq_f32_e32 vcc, 0, v3 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc -; GFX9-NEXT: v_cmp_lt_f32_e32 vcc, v2, v8 -; GFX9-NEXT: v_cndmask_b32_e32 v3, v8, v2, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v1, v10, v3, vcc +; GFX9-NEXT: v_min_f32_e32 v3, v2, v8 ; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v2, v8 -; GFX9-NEXT: v_cndmask_b32_e32 v3, v10, v3, vcc -; GFX9-NEXT: v_cmp_class_f32_e64 vcc, v2, 32 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v2, vcc -; GFX9-NEXT: v_cmp_class_f32_e64 vcc, v8, 32 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v8, vcc -; GFX9-NEXT: v_cmp_eq_f32_e32 vcc, 0, v3 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v2, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v2, v10, v3, 
vcc ; GFX9-NEXT: s_setpc_b64 s[30:31] %max0 = call <3 x float> @llvm.minimum.v3f32(<3 x float> %a, <3 x float> %b) %max1 = call <3 x float> @llvm.minimum.v3f32(<3 x float> %max0, <3 x float> %c) @@ -1018,67 +790,25 @@ define <3 x float> @v_fminimum3_v3f32__fabs_all(<3 x float> %a, <3 x float> %b, ; GFX9-LABEL: v_fminimum3_v3f32__fabs_all: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_cmp_lt_f32_e64 vcc, |v2|, |v5| -; GFX9-NEXT: v_cndmask_b32_e32 v9, v5, v2, vcc +; GFX9-NEXT: v_min_f32_e64 v9, |v2|, |v5| ; GFX9-NEXT: v_mov_b32_e32 v10, 0x7fc00000 -; GFX9-NEXT: v_cmp_o_f32_e64 s[4:5], |v2|, |v5| -; GFX9-NEXT: v_cndmask_b32_e64 v9, v10, |v9|, s[4:5] -; GFX9-NEXT: v_cmp_class_f32_e64 s[4:5], |v2|, 32 -; GFX9-NEXT: v_cndmask_b32_e64 v2, v9, |v2|, s[4:5] -; GFX9-NEXT: v_cmp_class_f32_e64 s[4:5], |v5|, 32 -; GFX9-NEXT: v_cndmask_b32_e64 v2, v2, |v5|, s[4:5] -; GFX9-NEXT: v_cmp_eq_f32_e32 vcc, 0, v9 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v9, v2, vcc -; GFX9-NEXT: v_cmp_lt_f32_e64 vcc, |v1|, |v4| -; GFX9-NEXT: v_cndmask_b32_e32 v5, v4, v1, vcc -; GFX9-NEXT: v_cmp_o_f32_e64 s[4:5], |v1|, |v4| -; GFX9-NEXT: v_cndmask_b32_e64 v5, v10, |v5|, s[4:5] -; GFX9-NEXT: v_cmp_class_f32_e64 s[4:5], |v1|, 32 -; GFX9-NEXT: v_cndmask_b32_e64 v1, v5, |v1|, s[4:5] -; GFX9-NEXT: v_cmp_class_f32_e64 s[4:5], |v4|, 32 -; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, |v4|, s[4:5] -; GFX9-NEXT: v_cmp_eq_f32_e32 vcc, 0, v5 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc -; GFX9-NEXT: v_cmp_lt_f32_e64 vcc, |v0|, |v3| -; GFX9-NEXT: v_cndmask_b32_e32 v4, v3, v0, vcc -; GFX9-NEXT: v_cmp_o_f32_e64 s[4:5], |v0|, |v3| -; GFX9-NEXT: v_cndmask_b32_e64 v4, v10, |v4|, s[4:5] -; GFX9-NEXT: v_cmp_class_f32_e64 s[4:5], |v0|, 32 -; GFX9-NEXT: v_cndmask_b32_e64 v0, v4, |v0|, s[4:5] -; GFX9-NEXT: v_cmp_class_f32_e64 s[4:5], |v3|, 32 -; GFX9-NEXT: v_cndmask_b32_e64 v0, v0, |v3|, s[4:5] -; GFX9-NEXT: v_cmp_eq_f32_e32 vcc, 0, v4 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc -; GFX9-NEXT: 
v_cmp_lt_f32_e64 s[4:5], v0, |v6| -; GFX9-NEXT: v_cndmask_b32_e64 v3, |v6|, v0, s[4:5] +; GFX9-NEXT: v_cmp_o_f32_e64 vcc, |v2|, |v5| +; GFX9-NEXT: v_cndmask_b32_e32 v2, v10, v9, vcc +; GFX9-NEXT: v_min_f32_e64 v5, |v1|, |v4| +; GFX9-NEXT: v_cmp_o_f32_e64 vcc, |v1|, |v4| +; GFX9-NEXT: v_cndmask_b32_e32 v1, v10, v5, vcc +; GFX9-NEXT: v_min_f32_e64 v4, |v0|, |v3| +; GFX9-NEXT: v_cmp_o_f32_e64 vcc, |v0|, |v3| +; GFX9-NEXT: v_cndmask_b32_e32 v0, v10, v4, vcc +; GFX9-NEXT: v_min_f32_e64 v3, v0, |v6| ; GFX9-NEXT: v_cmp_o_f32_e64 vcc, v0, |v6| -; GFX9-NEXT: v_cndmask_b32_e32 v3, v10, v3, vcc -; GFX9-NEXT: v_cmp_class_f32_e64 vcc, v0, 32 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v0, vcc -; GFX9-NEXT: v_cmp_class_f32_e64 s[4:5], |v6|, 32 -; GFX9-NEXT: v_cndmask_b32_e64 v0, v0, |v6|, s[4:5] -; GFX9-NEXT: v_cmp_eq_f32_e32 vcc, 0, v3 -; GFX9-NEXT: v_cmp_lt_f32_e64 s[4:5], v1, |v7| -; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v0, vcc -; GFX9-NEXT: v_cndmask_b32_e64 v3, |v7|, v1, s[4:5] +; GFX9-NEXT: v_cndmask_b32_e32 v0, v10, v3, vcc +; GFX9-NEXT: v_min_f32_e64 v3, v1, |v7| ; GFX9-NEXT: v_cmp_o_f32_e64 vcc, v1, |v7| -; GFX9-NEXT: v_cndmask_b32_e32 v3, v10, v3, vcc -; GFX9-NEXT: v_cmp_class_f32_e64 vcc, v1, 32 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc -; GFX9-NEXT: v_cmp_class_f32_e64 s[4:5], |v7|, 32 -; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, |v7|, s[4:5] -; GFX9-NEXT: v_cmp_eq_f32_e32 vcc, 0, v3 -; GFX9-NEXT: v_cmp_lt_f32_e64 s[4:5], v2, |v8| -; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc -; GFX9-NEXT: v_cndmask_b32_e64 v3, |v8|, v2, s[4:5] +; GFX9-NEXT: v_cndmask_b32_e32 v1, v10, v3, vcc +; GFX9-NEXT: v_min_f32_e64 v3, v2, |v8| ; GFX9-NEXT: v_cmp_o_f32_e64 vcc, v2, |v8| -; GFX9-NEXT: v_cndmask_b32_e32 v3, v10, v3, vcc -; GFX9-NEXT: v_cmp_class_f32_e64 vcc, v2, 32 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v2, vcc -; GFX9-NEXT: v_cmp_class_f32_e64 s[4:5], |v8|, 32 -; GFX9-NEXT: v_cndmask_b32_e64 v2, v2, |v8|, s[4:5] -; GFX9-NEXT: v_cmp_eq_f32_e32 vcc, 0, v3 -; GFX9-NEXT: 
v_cndmask_b32_e32 v2, v3, v2, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v2, v10, v3, vcc ; GFX9-NEXT: s_setpc_b64 s[30:31] %a.fabs = call <3 x float> @llvm.fabs.v3f32(<3 x float> %a) %b.fabs = call <3 x float> @llvm.fabs.v3f32(<3 x float> %b) @@ -1104,67 +834,25 @@ define <3 x float> @v_fminimum3_v3f32__fneg_all(<3 x float> %a, <3 x float> %b, ; GFX9-LABEL: v_fminimum3_v3f32__fneg_all: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_cmp_lt_f32_e64 vcc, -v2, -v5 -; GFX9-NEXT: v_cndmask_b32_e32 v9, v5, v2, vcc +; GFX9-NEXT: v_min_f32_e64 v9, -v2, -v5 ; GFX9-NEXT: v_mov_b32_e32 v10, 0x7fc00000 -; GFX9-NEXT: v_cmp_o_f32_e64 s[4:5], -v2, -v5 -; GFX9-NEXT: v_cndmask_b32_e64 v9, v10, -v9, s[4:5] -; GFX9-NEXT: v_cmp_class_f32_e64 s[4:5], -v2, 32 -; GFX9-NEXT: v_cndmask_b32_e64 v2, v9, -v2, s[4:5] -; GFX9-NEXT: v_cmp_class_f32_e64 s[4:5], -v5, 32 -; GFX9-NEXT: v_cndmask_b32_e64 v2, v2, -v5, s[4:5] -; GFX9-NEXT: v_cmp_eq_f32_e32 vcc, 0, v9 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v9, v2, vcc -; GFX9-NEXT: v_cmp_lt_f32_e64 vcc, -v1, -v4 -; GFX9-NEXT: v_cndmask_b32_e32 v5, v4, v1, vcc -; GFX9-NEXT: v_cmp_o_f32_e64 s[4:5], -v1, -v4 -; GFX9-NEXT: v_cndmask_b32_e64 v5, v10, -v5, s[4:5] -; GFX9-NEXT: v_cmp_class_f32_e64 s[4:5], -v1, 32 -; GFX9-NEXT: v_cndmask_b32_e64 v1, v5, -v1, s[4:5] -; GFX9-NEXT: v_cmp_class_f32_e64 s[4:5], -v4, 32 -; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, -v4, s[4:5] -; GFX9-NEXT: v_cmp_eq_f32_e32 vcc, 0, v5 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc -; GFX9-NEXT: v_cmp_lt_f32_e64 vcc, -v0, -v3 -; GFX9-NEXT: v_cndmask_b32_e32 v4, v3, v0, vcc -; GFX9-NEXT: v_cmp_o_f32_e64 s[4:5], -v0, -v3 -; GFX9-NEXT: v_cndmask_b32_e64 v4, v10, -v4, s[4:5] -; GFX9-NEXT: v_cmp_class_f32_e64 s[4:5], -v0, 32 -; GFX9-NEXT: v_cndmask_b32_e64 v0, v4, -v0, s[4:5] -; GFX9-NEXT: v_cmp_class_f32_e64 s[4:5], -v3, 32 -; GFX9-NEXT: v_cndmask_b32_e64 v0, v0, -v3, s[4:5] -; GFX9-NEXT: v_cmp_eq_f32_e32 vcc, 0, v4 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc 
-; GFX9-NEXT: v_cmp_lt_f32_e64 s[4:5], v0, -v6 -; GFX9-NEXT: v_cndmask_b32_e64 v3, -v6, v0, s[4:5] +; GFX9-NEXT: v_cmp_o_f32_e64 vcc, -v2, -v5 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v10, v9, vcc +; GFX9-NEXT: v_min_f32_e64 v5, -v1, -v4 +; GFX9-NEXT: v_cmp_o_f32_e64 vcc, -v1, -v4 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v10, v5, vcc +; GFX9-NEXT: v_min_f32_e64 v4, -v0, -v3 +; GFX9-NEXT: v_cmp_o_f32_e64 vcc, -v0, -v3 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v10, v4, vcc +; GFX9-NEXT: v_min_f32_e64 v3, v0, -v6 ; GFX9-NEXT: v_cmp_o_f32_e64 vcc, v0, -v6 -; GFX9-NEXT: v_cndmask_b32_e32 v3, v10, v3, vcc -; GFX9-NEXT: v_cmp_class_f32_e64 vcc, v0, 32 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v0, vcc -; GFX9-NEXT: v_cmp_class_f32_e64 s[4:5], -v6, 32 -; GFX9-NEXT: v_cndmask_b32_e64 v0, v0, -v6, s[4:5] -; GFX9-NEXT: v_cmp_eq_f32_e32 vcc, 0, v3 -; GFX9-NEXT: v_cmp_lt_f32_e64 s[4:5], v1, -v7 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v0, vcc -; GFX9-NEXT: v_cndmask_b32_e64 v3, -v7, v1, s[4:5] +; GFX9-NEXT: v_cndmask_b32_e32 v0, v10, v3, vcc +; GFX9-NEXT: v_min_f32_e64 v3, v1, -v7 ; GFX9-NEXT: v_cmp_o_f32_e64 vcc, v1, -v7 -; GFX9-NEXT: v_cndmask_b32_e32 v3, v10, v3, vcc -; GFX9-NEXT: v_cmp_class_f32_e64 vcc, v1, 32 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc -; GFX9-NEXT: v_cmp_class_f32_e64 s[4:5], -v7, 32 -; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, -v7, s[4:5] -; GFX9-NEXT: v_cmp_eq_f32_e32 vcc, 0, v3 -; GFX9-NEXT: v_cmp_lt_f32_e64 s[4:5], v2, -v8 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc -; GFX9-NEXT: v_cndmask_b32_e64 v3, -v8, v2, s[4:5] +; GFX9-NEXT: v_cndmask_b32_e32 v1, v10, v3, vcc +; GFX9-NEXT: v_min_f32_e64 v3, v2, -v8 ; GFX9-NEXT: v_cmp_o_f32_e64 vcc, v2, -v8 -; GFX9-NEXT: v_cndmask_b32_e32 v3, v10, v3, vcc -; GFX9-NEXT: v_cmp_class_f32_e64 vcc, v2, 32 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v2, vcc -; GFX9-NEXT: v_cmp_class_f32_e64 s[4:5], -v8, 32 -; GFX9-NEXT: v_cndmask_b32_e64 v2, v2, -v8, s[4:5] -; GFX9-NEXT: v_cmp_eq_f32_e32 vcc, 0, v3 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, 
v2, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v2, v10, v3, vcc ; GFX9-NEXT: s_setpc_b64 s[30:31] %a.fneg = fneg <3 x float> %a %b.fneg = fneg <3 x float> %b @@ -1190,49 +878,25 @@ define <3 x float> @v_fminimum3_v3f32__inlineimm1(<3 x float> %a, <3 x float> %c ; GFX9-LABEL: v_fminimum3_v3f32__inlineimm1: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_cmp_gt_f32_e32 vcc, 2.0, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v6, 2.0, v2, vcc +; GFX9-NEXT: v_min_f32_e32 v6, 2.0, v2 ; GFX9-NEXT: v_mov_b32_e32 v7, 0x7fc00000 ; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v2, v2 ; GFX9-NEXT: v_cndmask_b32_e32 v2, v7, v6, vcc -; GFX9-NEXT: v_cmp_gt_f32_e32 vcc, 2.0, v1 -; GFX9-NEXT: v_cndmask_b32_e32 v6, 2.0, v1, vcc +; GFX9-NEXT: v_min_f32_e32 v6, 2.0, v1 ; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v1, v1 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v7, v6, vcc -; GFX9-NEXT: v_cmp_gt_f32_e32 vcc, 2.0, v0 -; GFX9-NEXT: v_cndmask_b32_e32 v6, 2.0, v0, vcc +; GFX9-NEXT: v_min_f32_e32 v6, 2.0, v0 ; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v0 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v7, v6, vcc -; GFX9-NEXT: v_cmp_lt_f32_e32 vcc, v0, v3 -; GFX9-NEXT: v_cndmask_b32_e32 v6, v3, v0, vcc +; GFX9-NEXT: v_min_f32_e32 v6, v0, v3 ; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v3 -; GFX9-NEXT: v_cndmask_b32_e32 v6, v7, v6, vcc -; GFX9-NEXT: v_cmp_class_f32_e64 vcc, v0, 32 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v6, v0, vcc -; GFX9-NEXT: v_cmp_class_f32_e64 vcc, v3, 32 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc -; GFX9-NEXT: v_cmp_eq_f32_e32 vcc, 0, v6 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v6, v0, vcc -; GFX9-NEXT: v_cmp_lt_f32_e32 vcc, v1, v4 -; GFX9-NEXT: v_cndmask_b32_e32 v3, v4, v1, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v0, v7, v6, vcc +; GFX9-NEXT: v_min_f32_e32 v3, v1, v4 ; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v1, v4 -; GFX9-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc -; GFX9-NEXT: v_cmp_class_f32_e64 vcc, v1, 32 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc -; GFX9-NEXT: v_cmp_class_f32_e64 vcc, v4, 32 -; GFX9-NEXT: 
v_cndmask_b32_e32 v1, v1, v4, vcc -; GFX9-NEXT: v_cmp_eq_f32_e32 vcc, 0, v3 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc -; GFX9-NEXT: v_cmp_lt_f32_e32 vcc, v2, v5 -; GFX9-NEXT: v_cndmask_b32_e32 v3, v5, v2, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v1, v7, v3, vcc +; GFX9-NEXT: v_min_f32_e32 v3, v2, v5 ; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v2, v5 -; GFX9-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc -; GFX9-NEXT: v_cmp_class_f32_e64 vcc, v2, 32 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v2, vcc -; GFX9-NEXT: v_cmp_class_f32_e64 vcc, v5, 32 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v5, vcc -; GFX9-NEXT: v_cmp_eq_f32_e32 vcc, 0, v3 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v2, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v2, v7, v3, vcc ; GFX9-NEXT: s_setpc_b64 s[30:31] %max0 = call <3 x float> @llvm.minimum.v3f32(<3 x float> %a, <3 x float> <float 2.0, float 2.0, float 2.0>) %max1 = call <3 x float> @llvm.minimum.v3f32(<3 x float> %max0, <3 x float> %c) @@ -1255,47 +919,23 @@ define <3 x float> @v_fminimum3_v3f32__inlineimm2(<3 x float> %a, <3 x float> %b ; GFX9-LABEL: v_fminimum3_v3f32__inlineimm2: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_cmp_lt_f32_e32 vcc, v2, v5 -; GFX9-NEXT: v_cndmask_b32_e32 v6, v5, v2, vcc +; GFX9-NEXT: v_min_f32_e32 v6, v2, v5 ; GFX9-NEXT: v_mov_b32_e32 v7, 0x7fc00000 ; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v2, v5 -; GFX9-NEXT: v_cndmask_b32_e32 v6, v7, v6, vcc -; GFX9-NEXT: v_cmp_class_f32_e64 vcc, v2, 32 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc -; GFX9-NEXT: v_cmp_class_f32_e64 vcc, v5, 32 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v5, vcc -; GFX9-NEXT: v_cmp_eq_f32_e32 vcc, 0, v6 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc -; GFX9-NEXT: v_cmp_lt_f32_e32 vcc, v1, v4 -; GFX9-NEXT: v_cndmask_b32_e32 v5, v4, v1, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v2, v7, v6, vcc +; GFX9-NEXT: v_min_f32_e32 v5, v1, v4 ; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v1, v4 -; GFX9-NEXT: v_cndmask_b32_e32 v5, v7, v5, vcc -; GFX9-NEXT: v_cmp_class_f32_e64 
vcc, v1, 32 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc -; GFX9-NEXT: v_cmp_class_f32_e64 vcc, v4, 32 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc -; GFX9-NEXT: v_cmp_eq_f32_e32 vcc, 0, v5 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc -; GFX9-NEXT: v_cmp_lt_f32_e32 vcc, v0, v3 -; GFX9-NEXT: v_cndmask_b32_e32 v4, v3, v0, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v1, v7, v5, vcc +; GFX9-NEXT: v_min_f32_e32 v4, v0, v3 ; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v3 -; GFX9-NEXT: v_cndmask_b32_e32 v4, v7, v4, vcc -; GFX9-NEXT: v_cmp_class_f32_e64 vcc, v0, 32 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc -; GFX9-NEXT: v_cmp_class_f32_e64 vcc, v3, 32 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc -; GFX9-NEXT: v_cmp_eq_f32_e32 vcc, 0, v4 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc -; GFX9-NEXT: v_cmp_gt_f32_e32 vcc, 4.0, v0 -; GFX9-NEXT: v_cndmask_b32_e32 v3, 4.0, v0, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v0, v7, v4, vcc +; GFX9-NEXT: v_min_f32_e32 v3, 4.0, v0 ; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v0 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v7, v3, vcc -; GFX9-NEXT: v_cmp_gt_f32_e32 vcc, 4.0, v1 -; GFX9-NEXT: v_cndmask_b32_e32 v3, 4.0, v1, vcc +; GFX9-NEXT: v_min_f32_e32 v3, 4.0, v1 ; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v1, v1 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v7, v3, vcc -; GFX9-NEXT: v_cmp_gt_f32_e32 vcc, 4.0, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v3, 4.0, v2, vcc +; GFX9-NEXT: v_min_f32_e32 v3, 4.0, v2 ; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v2, v2 ; GFX9-NEXT: v_cndmask_b32_e32 v2, v7, v3, vcc ; GFX9-NEXT: s_setpc_b64 s[30:31] diff --git a/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll b/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll index 0db88d1..08cf83f 100644 --- a/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll +++ b/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll @@ -31,6 +31,7 @@ ; GCN-O0-NEXT: AMDGPU Remove Incompatible Functions ; GCN-O0-NEXT: AMDGPU Printf lowering ; GCN-O0-NEXT: Lower ctors and dtors for AMDGPU +; GCN-O0-NEXT: Expand variadic functions ; GCN-O0-NEXT: AMDGPU Inline All Functions ; 
GCN-O0-NEXT: Inliner for always_inline functions ; GCN-O0-NEXT: FunctionPass Manager @@ -178,6 +179,7 @@ ; GCN-O1-NEXT: AMDGPU Remove Incompatible Functions ; GCN-O1-NEXT: AMDGPU Printf lowering ; GCN-O1-NEXT: Lower ctors and dtors for AMDGPU +; GCN-O1-NEXT: Expand variadic functions ; GCN-O1-NEXT: AMDGPU Inline All Functions ; GCN-O1-NEXT: Inliner for always_inline functions ; GCN-O1-NEXT: FunctionPass Manager @@ -454,6 +456,7 @@ ; GCN-O1-OPTS-NEXT: AMDGPU Remove Incompatible Functions ; GCN-O1-OPTS-NEXT: AMDGPU Printf lowering ; GCN-O1-OPTS-NEXT: Lower ctors and dtors for AMDGPU +; GCN-O1-OPTS-NEXT: Expand variadic functions ; GCN-O1-OPTS-NEXT: AMDGPU Inline All Functions ; GCN-O1-OPTS-NEXT: Inliner for always_inline functions ; GCN-O1-OPTS-NEXT: FunctionPass Manager @@ -760,6 +763,7 @@ ; GCN-O2-NEXT: Lower ctors and dtors for AMDGPU ; GCN-O2-NEXT: FunctionPass Manager ; GCN-O2-NEXT: AMDGPU Image Intrinsic Optimizer +; GCN-O2-NEXT: Expand variadic functions ; GCN-O2-NEXT: AMDGPU Inline All Functions ; GCN-O2-NEXT: Inliner for always_inline functions ; GCN-O2-NEXT: FunctionPass Manager @@ -1070,6 +1074,7 @@ ; GCN-O3-NEXT: Lower ctors and dtors for AMDGPU ; GCN-O3-NEXT: FunctionPass Manager ; GCN-O3-NEXT: AMDGPU Image Intrinsic Optimizer +; GCN-O3-NEXT: Expand variadic functions ; GCN-O3-NEXT: AMDGPU Inline All Functions ; GCN-O3-NEXT: Inliner for always_inline functions ; GCN-O3-NEXT: FunctionPass Manager diff --git a/llvm/test/CodeGen/AMDGPU/llvm.maximum.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.maximum.f16.ll index 7d7a462..fa7ee9e 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.maximum.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.maximum.f16.ll @@ -554,28 +554,14 @@ define <2 x half> @v_maximum_v2f16(<2 x half> %src0, <2 x half> %src1) { ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v1 ; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v0 -; GFX8-NEXT: v_cmp_gt_f16_e32 vcc, v3, v2 -; GFX8-NEXT: v_cndmask_b32_e32 v4, v2, v3, vcc +; 
GFX8-NEXT: v_max_f16_e32 v4, v3, v2 ; GFX8-NEXT: v_mov_b32_e32 v5, 0x7e00 ; GFX8-NEXT: v_cmp_o_f16_e32 vcc, v3, v2 -; GFX8-NEXT: v_cndmask_b32_e32 v4, v5, v4, vcc -; GFX8-NEXT: v_cmp_class_f16_e64 vcc, v3, 64 -; GFX8-NEXT: v_cndmask_b32_e32 v3, v4, v3, vcc -; GFX8-NEXT: v_cmp_class_f16_e64 vcc, v2, 64 -; GFX8-NEXT: v_cndmask_b32_e32 v2, v3, v2, vcc -; GFX8-NEXT: v_cmp_eq_f16_e32 vcc, 0, v4 -; GFX8-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc -; GFX8-NEXT: v_cmp_gt_f16_e32 vcc, v0, v1 -; GFX8-NEXT: v_cndmask_b32_e32 v3, v1, v0, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v2, v5, v4, vcc +; GFX8-NEXT: v_max_f16_e32 v3, v0, v1 ; GFX8-NEXT: v_cmp_o_f16_e32 vcc, v0, v1 -; GFX8-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc -; GFX8-NEXT: v_cmp_class_f16_e64 vcc, v0, 64 -; GFX8-NEXT: v_cndmask_b32_e32 v0, v3, v0, vcc -; GFX8-NEXT: v_cmp_class_f16_e64 vcc, v1, 64 -; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc -; GFX8-NEXT: v_cmp_eq_f16_e32 vcc, 0, v3 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX8-NEXT: v_cndmask_b32_e32 v0, v3, v0, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v0, v5, v3, vcc ; GFX8-NEXT: v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX8-NEXT: s_setpc_b64 s[30:31] ; @@ -654,46 +640,24 @@ define <2 x half> @v_maximum_v2f16__nnan(<2 x half> %src0, <2 x half> %src1) { ; GFX7-LABEL: v_maximum_v2f16__nnan: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 ; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 ; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 ; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 ; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX7-NEXT: v_mov_b32_e32 v5, 0x7fc00000 -; GFX7-NEXT: v_max_f32_e32 v4, v0, v2 -; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v0, v2 -; GFX7-NEXT: v_cndmask_b32_e32 v0, v5, v4, vcc -; GFX7-NEXT: 
v_max_f32_e32 v2, v1, v3 -; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v1, v3 -; GFX7-NEXT: v_cndmask_b32_e32 v1, v5, v2, vcc +; GFX7-NEXT: v_max_f32_e32 v0, v0, v2 +; GFX7-NEXT: v_max_f32_e32 v1, v1, v3 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_maximum_v2f16__nnan: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v1 -; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v0 -; GFX8-NEXT: v_cmp_gt_f16_e32 vcc, v3, v2 -; GFX8-NEXT: v_cndmask_b32_e32 v4, v2, v3, vcc -; GFX8-NEXT: v_cmp_class_f16_e64 vcc, v3, 64 -; GFX8-NEXT: v_cndmask_b32_e32 v3, v4, v3, vcc -; GFX8-NEXT: v_cmp_class_f16_e64 vcc, v2, 64 -; GFX8-NEXT: v_cndmask_b32_e32 v2, v3, v2, vcc -; GFX8-NEXT: v_cmp_eq_f16_e32 vcc, 0, v4 -; GFX8-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc -; GFX8-NEXT: v_cmp_gt_f16_e32 vcc, v0, v1 -; GFX8-NEXT: v_cndmask_b32_e32 v3, v1, v0, vcc -; GFX8-NEXT: v_cmp_class_f16_e64 vcc, v0, 64 -; GFX8-NEXT: v_cndmask_b32_e32 v0, v3, v0, vcc -; GFX8-NEXT: v_cmp_class_f16_e64 vcc, v1, 64 -; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc -; GFX8-NEXT: v_cmp_eq_f16_e32 vcc, 0, v3 -; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX8-NEXT: v_cndmask_b32_e32 v0, v3, v0, vcc -; GFX8-NEXT: v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_max_f16_sdwa v2, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_max_f16_e32 v0, v0, v1 +; GFX8-NEXT: v_or_b32_e32 v0, v0, v2 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_maximum_v2f16__nnan: @@ -759,13 +723,11 @@ define <2 x half> @v_maximum_v2f16__nsz(<2 x half> %src0, <2 x half> %src1) { ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v1 ; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v0 -; GFX8-NEXT: v_cmp_gt_f16_e32 vcc, v3, v2 -; GFX8-NEXT: v_cndmask_b32_e32 v4, v2, v3, vcc +; GFX8-NEXT: v_max_f16_e32 v4, v3, v2 ; GFX8-NEXT: v_mov_b32_e32 v5, 0x7e00 ; GFX8-NEXT: 
v_cmp_o_f16_e32 vcc, v3, v2 ; GFX8-NEXT: v_cndmask_b32_e32 v2, v5, v4, vcc -; GFX8-NEXT: v_cmp_gt_f16_e32 vcc, v0, v1 -; GFX8-NEXT: v_cndmask_b32_e32 v3, v1, v0, vcc +; GFX8-NEXT: v_max_f16_e32 v3, v0, v1 ; GFX8-NEXT: v_cmp_o_f16_e32 vcc, v0, v1 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v5, v3, vcc @@ -847,34 +809,24 @@ define <2 x half> @v_maximum_v2f16__nnan_nsz(<2 x half> %src0, <2 x half> %src1) ; GFX7-LABEL: v_maximum_v2f16__nnan_nsz: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 ; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 ; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 ; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 ; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 ; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 ; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX7-NEXT: v_mov_b32_e32 v5, 0x7fc00000 -; GFX7-NEXT: v_max_f32_e32 v4, v0, v2 -; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v0, v2 -; GFX7-NEXT: v_cndmask_b32_e32 v0, v5, v4, vcc -; GFX7-NEXT: v_max_f32_e32 v2, v1, v3 -; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v1, v3 -; GFX7-NEXT: v_cndmask_b32_e32 v1, v5, v2, vcc +; GFX7-NEXT: v_max_f32_e32 v0, v0, v2 +; GFX7-NEXT: v_max_f32_e32 v1, v1, v3 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_maximum_v2f16__nnan_nsz: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v1 -; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v0 -; GFX8-NEXT: v_cmp_gt_f16_e32 vcc, v3, v2 -; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc -; GFX8-NEXT: v_cmp_gt_f16_e32 vcc, v0, v1 -; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX8-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc -; GFX8-NEXT: v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_max_f16_sdwa v2, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: 
v_max_f16_e32 v0, v0, v1 +; GFX8-NEXT: v_or_b32_e32 v0, v0, v2 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_maximum_v2f16__nnan_nsz: @@ -948,31 +900,15 @@ define void @s_maximum_v2f16(<2 x half> inreg %src0, <2 x half> inreg %src1) { ; GFX8-NEXT: s_lshr_b32 s6, s5, 16 ; GFX8-NEXT: s_lshr_b32 s7, s4, 16 ; GFX8-NEXT: v_mov_b32_e32 v0, s6 -; GFX8-NEXT: v_mov_b32_e32 v1, s7 -; GFX8-NEXT: v_cmp_gt_f16_e32 vcc, s7, v0 -; GFX8-NEXT: v_cndmask_b32_e32 v2, v0, v1, vcc -; GFX8-NEXT: v_mov_b32_e32 v3, 0x7e00 +; GFX8-NEXT: v_max_f16_e32 v1, s7, v0 +; GFX8-NEXT: v_mov_b32_e32 v2, 0x7e00 ; GFX8-NEXT: v_cmp_o_f16_e32 vcc, s7, v0 -; GFX8-NEXT: v_cndmask_b32_e32 v2, v3, v2, vcc -; GFX8-NEXT: v_cmp_class_f16_e64 vcc, s7, 64 -; GFX8-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc -; GFX8-NEXT: v_cmp_class_f16_e64 vcc, s6, 64 -; GFX8-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc -; GFX8-NEXT: v_cmp_eq_f16_e32 vcc, 0, v2 +; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc ; GFX8-NEXT: v_mov_b32_e32 v1, s5 -; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc -; GFX8-NEXT: v_mov_b32_e32 v2, s4 -; GFX8-NEXT: v_cmp_gt_f16_e32 vcc, s4, v1 -; GFX8-NEXT: v_cndmask_b32_e32 v4, v1, v2, vcc +; GFX8-NEXT: v_max_f16_e32 v3, s4, v1 ; GFX8-NEXT: v_cmp_o_f16_e32 vcc, s4, v1 -; GFX8-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc -; GFX8-NEXT: v_cmp_class_f16_e64 vcc, s4, 64 -; GFX8-NEXT: v_cndmask_b32_e32 v2, v3, v2, vcc -; GFX8-NEXT: v_cmp_class_f16_e64 vcc, s5, 64 -; GFX8-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc -; GFX8-NEXT: v_cmp_eq_f16_e32 vcc, 0, v3 ; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX8-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc ; GFX8-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX8-NEXT: ;;#ASMSTART ; GFX8-NEXT: ; use v0 @@ -1216,28 +1152,21 @@ define <3 x half> @v_maximum_v3f16__nnan(<3 x half> %src0, <3 x half> %src1) { ; GFX7-LABEL: v_maximum_v3f16__nnan: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 +; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 ; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 ; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 ; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5 +; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 ; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 ; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 ; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5 ; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GFX7-NEXT: v_max_f32_e32 v6, v0, v3 -; GFX7-NEXT: v_mov_b32_e32 v7, 0x7fc00000 -; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v0, v3 -; GFX7-NEXT: v_cndmask_b32_e32 v0, v7, v6, vcc -; GFX7-NEXT: v_max_f32_e32 v3, v1, v4 -; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v1, v4 -; GFX7-NEXT: v_cndmask_b32_e32 v1, v7, v3, vcc -; GFX7-NEXT: v_max_f32_e32 v3, v2, v5 -; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v2, v5 -; GFX7-NEXT: v_cndmask_b32_e32 v2, v7, v3, vcc +; GFX7-NEXT: v_max_f32_e32 v0, v0, v3 +; GFX7-NEXT: v_max_f32_e32 v1, v1, v4 +; GFX7-NEXT: v_max_f32_e32 v2, v2, v5 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_maximum_v3f16__nnan: @@ -1427,28 +1356,21 @@ define <3 x half> @v_maximum_v3f16__nnan_nsz(<3 x half> %src0, <3 x half> %src1) ; GFX7-LABEL: v_maximum_v3f16__nnan_nsz: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 +; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 ; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 ; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 ; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 -; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5 +; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 ; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 ; GFX7-NEXT: 
v_cvt_f32_f16_e32 v0, v0 -; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 ; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5 ; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GFX7-NEXT: v_max_f32_e32 v6, v0, v3 -; GFX7-NEXT: v_mov_b32_e32 v7, 0x7fc00000 -; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v0, v3 -; GFX7-NEXT: v_cndmask_b32_e32 v0, v7, v6, vcc -; GFX7-NEXT: v_max_f32_e32 v3, v1, v4 -; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v1, v4 -; GFX7-NEXT: v_cndmask_b32_e32 v1, v7, v3, vcc -; GFX7-NEXT: v_max_f32_e32 v3, v2, v5 -; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v2, v5 -; GFX7-NEXT: v_cndmask_b32_e32 v2, v7, v3, vcc +; GFX7-NEXT: v_max_f32_e32 v0, v0, v3 +; GFX7-NEXT: v_max_f32_e32 v1, v1, v4 +; GFX7-NEXT: v_max_f32_e32 v2, v2, v5 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_maximum_v3f16__nnan_nsz: @@ -1671,35 +1593,26 @@ define <4 x half> @v_maximum_v4f16__nnan(<4 x half> %src0, <4 x half> %src1) { ; GFX7-LABEL: v_maximum_v4f16__nnan: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v7 +; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6 +; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 ; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 ; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 ; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v7 +; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v6 +; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5 ; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 ; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5 ; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v6 ; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v7 ; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GFX7-NEXT: v_max_f32_e32 v8, v0, v4 -; GFX7-NEXT: 
v_mov_b32_e32 v9, 0x7fc00000 -; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v0, v4 -; GFX7-NEXT: v_cndmask_b32_e32 v0, v9, v8, vcc -; GFX7-NEXT: v_max_f32_e32 v4, v1, v5 -; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v1, v5 -; GFX7-NEXT: v_cndmask_b32_e32 v1, v9, v4, vcc -; GFX7-NEXT: v_max_f32_e32 v4, v2, v6 -; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v2, v6 -; GFX7-NEXT: v_cndmask_b32_e32 v2, v9, v4, vcc -; GFX7-NEXT: v_max_f32_e32 v4, v3, v7 -; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v3, v7 -; GFX7-NEXT: v_cndmask_b32_e32 v3, v9, v4, vcc +; GFX7-NEXT: v_max_f32_e32 v0, v0, v4 +; GFX7-NEXT: v_max_f32_e32 v1, v1, v5 +; GFX7-NEXT: v_max_f32_e32 v2, v2, v6 +; GFX7-NEXT: v_max_f32_e32 v3, v3, v7 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_maximum_v4f16__nnan: @@ -1924,35 +1837,26 @@ define <4 x half> @v_maximum_v4f16__nnan_nsz(<4 x half> %src0, <4 x half> %src1) ; GFX7-LABEL: v_maximum_v4f16__nnan_nsz: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v7 +; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6 +; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 +; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 ; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4 ; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5 ; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6 -; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2 -; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v7 -; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3 +; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v7 +; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v6 +; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5 ; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4 ; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5 ; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v6 ; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v7 ; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GFX7-NEXT: v_max_f32_e32 v8, v0, v4 -; GFX7-NEXT: v_mov_b32_e32 v9, 0x7fc00000 -; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v0, 
v4 -; GFX7-NEXT: v_cndmask_b32_e32 v0, v9, v8, vcc -; GFX7-NEXT: v_max_f32_e32 v4, v1, v5 -; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v1, v5 -; GFX7-NEXT: v_cndmask_b32_e32 v1, v9, v4, vcc -; GFX7-NEXT: v_max_f32_e32 v4, v2, v6 -; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v2, v6 -; GFX7-NEXT: v_cndmask_b32_e32 v2, v9, v4, vcc -; GFX7-NEXT: v_max_f32_e32 v4, v3, v7 -; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v3, v7 -; GFX7-NEXT: v_cndmask_b32_e32 v3, v9, v4, vcc +; GFX7-NEXT: v_max_f32_e32 v0, v0, v4 +; GFX7-NEXT: v_max_f32_e32 v1, v1, v5 +; GFX7-NEXT: v_max_f32_e32 v2, v2, v6 +; GFX7-NEXT: v_max_f32_e32 v3, v3, v7 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_maximum_v4f16__nnan_nsz: diff --git a/llvm/test/CodeGen/AMDGPU/llvm.maximum.f32.ll b/llvm/test/CodeGen/AMDGPU/llvm.maximum.f32.ll index 7c5bc7d..f4aa40d 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.maximum.f32.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.maximum.f32.ll @@ -495,167 +495,73 @@ define <2 x float> @v_maximum_v2f32(<2 x float> %src0, <2 x float> %src1) { ; GFX7-LABEL: v_maximum_v2f32: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_max_legacy_f32_e32 v4, v0, v2 +; GFX7-NEXT: v_max_f32_e32 v4, v0, v2 ; GFX7-NEXT: v_mov_b32_e32 v5, 0x7fc00000 ; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v0, v2 -; GFX7-NEXT: v_cndmask_b32_e32 v4, v5, v4, vcc -; GFX7-NEXT: v_cmp_class_f32_e64 vcc, v0, 64 -; GFX7-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc -; GFX7-NEXT: v_cmp_class_f32_e64 vcc, v2, 64 -; GFX7-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc -; GFX7-NEXT: v_cmp_eq_f32_e32 vcc, 0, v4 -; GFX7-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc -; GFX7-NEXT: v_max_legacy_f32_e32 v2, v1, v3 +; GFX7-NEXT: v_cndmask_b32_e32 v0, v5, v4, vcc +; GFX7-NEXT: v_max_f32_e32 v2, v1, v3 ; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v1, v3 -; GFX7-NEXT: v_cndmask_b32_e32 v2, v5, v2, vcc -; GFX7-NEXT: v_cmp_class_f32_e64 vcc, v1, 64 -; GFX7-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc -; GFX7-NEXT: v_cmp_class_f32_e64 vcc, v3, 64 -; GFX7-NEXT: v_cndmask_b32_e32 v1, 
v1, v3, vcc -; GFX7-NEXT: v_cmp_eq_f32_e32 vcc, 0, v2 -; GFX7-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v1, v5, v2, vcc ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_maximum_v2f32: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_cmp_gt_f32_e32 vcc, v0, v2 -; GFX8-NEXT: v_cndmask_b32_e32 v4, v2, v0, vcc +; GFX8-NEXT: v_max_f32_e32 v4, v0, v2 ; GFX8-NEXT: v_mov_b32_e32 v5, 0x7fc00000 ; GFX8-NEXT: v_cmp_o_f32_e32 vcc, v0, v2 -; GFX8-NEXT: v_cndmask_b32_e32 v4, v5, v4, vcc -; GFX8-NEXT: v_cmp_class_f32_e64 vcc, v0, 64 -; GFX8-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc -; GFX8-NEXT: v_cmp_class_f32_e64 vcc, v2, 64 -; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc -; GFX8-NEXT: v_cmp_eq_f32_e32 vcc, 0, v4 -; GFX8-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc -; GFX8-NEXT: v_cmp_gt_f32_e32 vcc, v1, v3 -; GFX8-NEXT: v_cndmask_b32_e32 v2, v3, v1, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v0, v5, v4, vcc +; GFX8-NEXT: v_max_f32_e32 v2, v1, v3 ; GFX8-NEXT: v_cmp_o_f32_e32 vcc, v1, v3 -; GFX8-NEXT: v_cndmask_b32_e32 v2, v5, v2, vcc -; GFX8-NEXT: v_cmp_class_f32_e64 vcc, v1, 64 -; GFX8-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc -; GFX8-NEXT: v_cmp_class_f32_e64 vcc, v3, 64 -; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc -; GFX8-NEXT: v_cmp_eq_f32_e32 vcc, 0, v2 -; GFX8-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v1, v5, v2, vcc ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_maximum_v2f32: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_cmp_gt_f32_e32 vcc, v0, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v4, v2, v0, vcc +; GFX9-NEXT: v_max_f32_e32 v4, v0, v2 ; GFX9-NEXT: v_mov_b32_e32 v5, 0x7fc00000 ; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v4, v5, v4, vcc -; GFX9-NEXT: v_cmp_class_f32_e64 vcc, v0, 64 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc -; GFX9-NEXT: v_cmp_class_f32_e64 vcc, v2, 64 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, 
v2, vcc -; GFX9-NEXT: v_cmp_eq_f32_e32 vcc, 0, v4 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc -; GFX9-NEXT: v_cmp_gt_f32_e32 vcc, v1, v3 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v1, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v0, v5, v4, vcc +; GFX9-NEXT: v_max_f32_e32 v2, v1, v3 ; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v1, v3 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v5, v2, vcc -; GFX9-NEXT: v_cmp_class_f32_e64 vcc, v1, 64 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc -; GFX9-NEXT: v_cmp_class_f32_e64 vcc, v3, 64 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc -; GFX9-NEXT: v_cmp_eq_f32_e32 vcc, 0, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v2, vcc ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: v_maximum_v2f32: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_cmp_gt_f32_e32 vcc, v0, v2 +; GFX940-NEXT: v_max_f32_e32 v4, v0, v2 ; GFX940-NEXT: v_mov_b32_e32 v5, 0x7fc00000 -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e32 v4, v2, v0, vcc ; GFX940-NEXT: v_cmp_o_f32_e32 vcc, v0, v2 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v4, v5, v4, vcc -; GFX940-NEXT: v_cmp_class_f32_e64 vcc, v0, 64 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc -; GFX940-NEXT: v_cmp_class_f32_e64 vcc, v2, 64 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc -; GFX940-NEXT: v_cmp_eq_f32_e32 vcc, 0, v4 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc -; GFX940-NEXT: v_cmp_gt_f32_e32 vcc, v1, v3 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v2, v3, v1, vcc +; GFX940-NEXT: v_max_f32_e32 v2, v1, v3 +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_cndmask_b32_e32 v0, v5, v4, vcc ; GFX940-NEXT: v_cmp_o_f32_e32 vcc, v1, v3 ; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v2, v5, v2, vcc -; GFX940-NEXT: v_cmp_class_f32_e64 vcc, v1, 64 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc -; 
GFX940-NEXT: v_cmp_class_f32_e64 vcc, v3, 64 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc -; GFX940-NEXT: v_cmp_eq_f32_e32 vcc, 0, v2 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc +; GFX940-NEXT: v_cndmask_b32_e32 v1, v5, v2, vcc ; GFX940-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_maximum_v2f32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_cmp_gt_f32_e32 vcc_lo, v0, v2 -; GFX10-NEXT: v_cndmask_b32_e32 v4, v2, v0, vcc_lo -; GFX10-NEXT: v_cmp_gt_f32_e32 vcc_lo, v1, v3 -; GFX10-NEXT: v_cndmask_b32_e32 v5, v3, v1, vcc_lo +; GFX10-NEXT: v_max_f32_e32 v4, v0, v2 ; GFX10-NEXT: v_cmp_o_f32_e32 vcc_lo, v0, v2 -; GFX10-NEXT: v_cndmask_b32_e32 v4, 0x7fc00000, v4, vcc_lo +; GFX10-NEXT: v_max_f32_e32 v5, v1, v3 +; GFX10-NEXT: v_cndmask_b32_e32 v0, 0x7fc00000, v4, vcc_lo ; GFX10-NEXT: v_cmp_o_f32_e32 vcc_lo, v1, v3 -; GFX10-NEXT: v_cndmask_b32_e32 v5, 0x7fc00000, v5, vcc_lo -; GFX10-NEXT: v_cmp_class_f32_e64 vcc_lo, v0, 64 -; GFX10-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc_lo -; GFX10-NEXT: v_cmp_class_f32_e64 vcc_lo, v1, 64 -; GFX10-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc_lo -; GFX10-NEXT: v_cmp_class_f32_e64 vcc_lo, v2, 64 -; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc_lo -; GFX10-NEXT: v_cmp_class_f32_e64 vcc_lo, v3, 64 -; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc_lo -; GFX10-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v4 -; GFX10-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc_lo -; GFX10-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v5 -; GFX10-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v1, 0x7fc00000, v5, vcc_lo ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: v_maximum_v2f32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_cmp_gt_f32_e32 vcc_lo, v0, v2 -; GFX11-NEXT: v_cndmask_b32_e32 v4, v2, v0, vcc_lo -; GFX11-NEXT: v_cmp_gt_f32_e32 vcc_lo, v1, v3 -; GFX11-NEXT: v_cndmask_b32_e32 v5, v3, v1, vcc_lo +; GFX11-NEXT: 
v_dual_max_f32 v4, v0, v2 :: v_dual_max_f32 v5, v1, v3 ; GFX11-NEXT: v_cmp_o_f32_e32 vcc_lo, v0, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_cndmask_b32_e32 v4, 0x7fc00000, v4, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_cndmask_b32_e32 v0, 0x7fc00000, v4, vcc_lo ; GFX11-NEXT: v_cmp_o_f32_e32 vcc_lo, v1, v3 -; GFX11-NEXT: v_cndmask_b32_e32 v5, 0x7fc00000, v5, vcc_lo -; GFX11-NEXT: v_cmp_class_f32_e64 vcc_lo, v0, 64 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc_lo -; GFX11-NEXT: v_cmp_class_f32_e64 vcc_lo, v1, 64 -; GFX11-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc_lo -; GFX11-NEXT: v_cmp_class_f32_e64 vcc_lo, v2, 64 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc_lo -; GFX11-NEXT: v_cmp_class_f32_e64 vcc_lo, v3, 64 -; GFX11-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc_lo -; GFX11-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc_lo -; GFX11-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v5 -; GFX11-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e32 v1, 0x7fc00000, v5, vcc_lo ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-LABEL: v_maximum_v2f32: @@ -676,136 +582,42 @@ define <2 x float> @v_maximum_v2f32__nnan(<2 x float> %src0, <2 x float> %src1) ; GFX7-LABEL: v_maximum_v2f32__nnan: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_max_legacy_f32_e32 v4, v0, v2 -; GFX7-NEXT: v_cmp_class_f32_e64 vcc, v0, 64 -; GFX7-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc -; GFX7-NEXT: v_cmp_class_f32_e64 vcc, v2, 64 -; GFX7-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc -; GFX7-NEXT: v_cmp_eq_f32_e32 vcc, 0, v4 -; GFX7-NEXT: 
v_cndmask_b32_e32 v0, v4, v0, vcc -; GFX7-NEXT: v_max_legacy_f32_e32 v2, v1, v3 -; GFX7-NEXT: v_cmp_class_f32_e64 vcc, v1, 64 -; GFX7-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc -; GFX7-NEXT: v_cmp_class_f32_e64 vcc, v3, 64 -; GFX7-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc -; GFX7-NEXT: v_cmp_eq_f32_e32 vcc, 0, v2 -; GFX7-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc +; GFX7-NEXT: v_max_f32_e32 v0, v0, v2 +; GFX7-NEXT: v_max_f32_e32 v1, v1, v3 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_maximum_v2f32__nnan: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_cmp_gt_f32_e32 vcc, v0, v2 -; GFX8-NEXT: v_cndmask_b32_e32 v4, v2, v0, vcc -; GFX8-NEXT: v_cmp_class_f32_e64 vcc, v0, 64 -; GFX8-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc -; GFX8-NEXT: v_cmp_class_f32_e64 vcc, v2, 64 -; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc -; GFX8-NEXT: v_cmp_eq_f32_e32 vcc, 0, v4 -; GFX8-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc -; GFX8-NEXT: v_cmp_gt_f32_e32 vcc, v1, v3 -; GFX8-NEXT: v_cndmask_b32_e32 v2, v3, v1, vcc -; GFX8-NEXT: v_cmp_class_f32_e64 vcc, v1, 64 -; GFX8-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc -; GFX8-NEXT: v_cmp_class_f32_e64 vcc, v3, 64 -; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc -; GFX8-NEXT: v_cmp_eq_f32_e32 vcc, 0, v2 -; GFX8-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc +; GFX8-NEXT: v_max_f32_e32 v0, v0, v2 +; GFX8-NEXT: v_max_f32_e32 v1, v1, v3 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_maximum_v2f32__nnan: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_cmp_gt_f32_e32 vcc, v0, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v4, v2, v0, vcc -; GFX9-NEXT: v_cmp_class_f32_e64 vcc, v0, 64 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc -; GFX9-NEXT: v_cmp_class_f32_e64 vcc, v2, 64 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc -; GFX9-NEXT: v_cmp_eq_f32_e32 vcc, 0, v4 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc -; GFX9-NEXT: v_cmp_gt_f32_e32 vcc, v1, v3 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v1, vcc 
-; GFX9-NEXT: v_cmp_class_f32_e64 vcc, v1, 64 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc -; GFX9-NEXT: v_cmp_class_f32_e64 vcc, v3, 64 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc -; GFX9-NEXT: v_cmp_eq_f32_e32 vcc, 0, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc +; GFX9-NEXT: v_max_f32_e32 v0, v0, v2 +; GFX9-NEXT: v_max_f32_e32 v1, v1, v3 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: v_maximum_v2f32__nnan: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_cmp_gt_f32_e32 vcc, v0, v2 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v4, v2, v0, vcc -; GFX940-NEXT: v_cmp_class_f32_e64 vcc, v0, 64 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc -; GFX940-NEXT: v_cmp_class_f32_e64 vcc, v2, 64 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc -; GFX940-NEXT: v_cmp_eq_f32_e32 vcc, 0, v4 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc -; GFX940-NEXT: v_cmp_gt_f32_e32 vcc, v1, v3 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v2, v3, v1, vcc -; GFX940-NEXT: v_cmp_class_f32_e64 vcc, v1, 64 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc -; GFX940-NEXT: v_cmp_class_f32_e64 vcc, v3, 64 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc -; GFX940-NEXT: v_cmp_eq_f32_e32 vcc, 0, v2 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc +; GFX940-NEXT: v_max_f32_e32 v0, v0, v2 +; GFX940-NEXT: v_max_f32_e32 v1, v1, v3 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_maximum_v2f32__nnan: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_cmp_gt_f32_e32 vcc_lo, v0, v2 -; GFX10-NEXT: v_cndmask_b32_e32 v4, v2, v0, vcc_lo -; GFX10-NEXT: v_cmp_gt_f32_e32 vcc_lo, v1, v3 -; GFX10-NEXT: v_cndmask_b32_e32 v5, v3, v1, vcc_lo -; GFX10-NEXT: v_cmp_class_f32_e64 vcc_lo, v0, 64 -; GFX10-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc_lo -; 
GFX10-NEXT: v_cmp_class_f32_e64 vcc_lo, v1, 64 -; GFX10-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc_lo -; GFX10-NEXT: v_cmp_class_f32_e64 vcc_lo, v2, 64 -; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc_lo -; GFX10-NEXT: v_cmp_class_f32_e64 vcc_lo, v3, 64 -; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc_lo -; GFX10-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v4 -; GFX10-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc_lo -; GFX10-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v5 -; GFX10-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc_lo +; GFX10-NEXT: v_max_f32_e32 v0, v0, v2 +; GFX10-NEXT: v_max_f32_e32 v1, v1, v3 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: v_maximum_v2f32__nnan: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_cmp_gt_f32_e32 vcc_lo, v0, v2 -; GFX11-NEXT: v_cndmask_b32_e32 v4, v2, v0, vcc_lo -; GFX11-NEXT: v_cmp_gt_f32_e32 vcc_lo, v1, v3 -; GFX11-NEXT: v_cndmask_b32_e32 v5, v3, v1, vcc_lo -; GFX11-NEXT: v_cmp_class_f32_e64 vcc_lo, v0, 64 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc_lo -; GFX11-NEXT: v_cmp_class_f32_e64 vcc_lo, v1, 64 -; GFX11-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc_lo -; GFX11-NEXT: v_cmp_class_f32_e64 vcc_lo, v2, 64 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc_lo -; GFX11-NEXT: v_cmp_class_f32_e64 vcc_lo, v3, 64 -; GFX11-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc_lo -; GFX11-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc_lo -; GFX11-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v5 -; GFX11-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc_lo +; GFX11-NEXT: v_dual_max_f32 v0, v0, v2 :: v_dual_max_f32 v1, v1, v3 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-LABEL: v_maximum_v2f32__nnan: @@ -826,11 +638,11 @@ define <2 x float> 
@v_maximum_v2f32__nsz(<2 x float> %src0, <2 x float> %src1) { ; GFX7-LABEL: v_maximum_v2f32__nsz: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_max_legacy_f32_e32 v4, v0, v2 +; GFX7-NEXT: v_max_f32_e32 v4, v0, v2 ; GFX7-NEXT: v_mov_b32_e32 v5, 0x7fc00000 ; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v0, v2 ; GFX7-NEXT: v_cndmask_b32_e32 v0, v5, v4, vcc -; GFX7-NEXT: v_max_legacy_f32_e32 v2, v1, v3 +; GFX7-NEXT: v_max_f32_e32 v2, v1, v3 ; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v1, v3 ; GFX7-NEXT: v_cndmask_b32_e32 v1, v5, v2, vcc ; GFX7-NEXT: s_setpc_b64 s[30:31] @@ -838,13 +650,11 @@ define <2 x float> @v_maximum_v2f32__nsz(<2 x float> %src0, <2 x float> %src1) { ; GFX8-LABEL: v_maximum_v2f32__nsz: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_cmp_gt_f32_e32 vcc, v0, v2 -; GFX8-NEXT: v_cndmask_b32_e32 v4, v2, v0, vcc +; GFX8-NEXT: v_max_f32_e32 v4, v0, v2 ; GFX8-NEXT: v_mov_b32_e32 v5, 0x7fc00000 ; GFX8-NEXT: v_cmp_o_f32_e32 vcc, v0, v2 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v5, v4, vcc -; GFX8-NEXT: v_cmp_gt_f32_e32 vcc, v1, v3 -; GFX8-NEXT: v_cndmask_b32_e32 v2, v3, v1, vcc +; GFX8-NEXT: v_max_f32_e32 v2, v1, v3 ; GFX8-NEXT: v_cmp_o_f32_e32 vcc, v1, v3 ; GFX8-NEXT: v_cndmask_b32_e32 v1, v5, v2, vcc ; GFX8-NEXT: s_setpc_b64 s[30:31] @@ -852,13 +662,11 @@ define <2 x float> @v_maximum_v2f32__nsz(<2 x float> %src0, <2 x float> %src1) { ; GFX9-LABEL: v_maximum_v2f32__nsz: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_cmp_gt_f32_e32 vcc, v0, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v4, v2, v0, vcc +; GFX9-NEXT: v_max_f32_e32 v4, v0, v2 ; GFX9-NEXT: v_mov_b32_e32 v5, 0x7fc00000 ; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v2 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v5, v4, vcc -; GFX9-NEXT: v_cmp_gt_f32_e32 vcc, v1, v3 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v1, vcc +; GFX9-NEXT: v_max_f32_e32 v2, v1, v3 ; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v1, v3 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v2, vcc ; 
GFX9-NEXT: s_setpc_b64 s[30:31] @@ -866,16 +674,12 @@ define <2 x float> @v_maximum_v2f32__nsz(<2 x float> %src0, <2 x float> %src1) { ; GFX940-LABEL: v_maximum_v2f32__nsz: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_cmp_gt_f32_e32 vcc, v0, v2 +; GFX940-NEXT: v_max_f32_e32 v4, v0, v2 ; GFX940-NEXT: v_mov_b32_e32 v5, 0x7fc00000 -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e32 v4, v2, v0, vcc ; GFX940-NEXT: v_cmp_o_f32_e32 vcc, v0, v2 -; GFX940-NEXT: s_nop 1 +; GFX940-NEXT: v_max_f32_e32 v2, v1, v3 +; GFX940-NEXT: s_nop 0 ; GFX940-NEXT: v_cndmask_b32_e32 v0, v5, v4, vcc -; GFX940-NEXT: v_cmp_gt_f32_e32 vcc, v1, v3 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v2, v3, v1, vcc ; GFX940-NEXT: v_cmp_o_f32_e32 vcc, v1, v3 ; GFX940-NEXT: s_nop 1 ; GFX940-NEXT: v_cndmask_b32_e32 v1, v5, v2, vcc @@ -884,11 +688,9 @@ define <2 x float> @v_maximum_v2f32__nsz(<2 x float> %src0, <2 x float> %src1) { ; GFX10-LABEL: v_maximum_v2f32__nsz: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_cmp_gt_f32_e32 vcc_lo, v0, v2 -; GFX10-NEXT: v_cndmask_b32_e32 v4, v2, v0, vcc_lo -; GFX10-NEXT: v_cmp_gt_f32_e32 vcc_lo, v1, v3 -; GFX10-NEXT: v_cndmask_b32_e32 v5, v3, v1, vcc_lo +; GFX10-NEXT: v_max_f32_e32 v4, v0, v2 ; GFX10-NEXT: v_cmp_o_f32_e32 vcc_lo, v0, v2 +; GFX10-NEXT: v_max_f32_e32 v5, v1, v3 ; GFX10-NEXT: v_cndmask_b32_e32 v0, 0x7fc00000, v4, vcc_lo ; GFX10-NEXT: v_cmp_o_f32_e32 vcc_lo, v1, v3 ; GFX10-NEXT: v_cndmask_b32_e32 v1, 0x7fc00000, v5, vcc_lo @@ -897,12 +699,9 @@ define <2 x float> @v_maximum_v2f32__nsz(<2 x float> %src0, <2 x float> %src1) { ; GFX11-LABEL: v_maximum_v2f32__nsz: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_cmp_gt_f32_e32 vcc_lo, v0, v2 -; GFX11-NEXT: v_cndmask_b32_e32 v4, v2, v0, vcc_lo -; GFX11-NEXT: v_cmp_gt_f32_e32 vcc_lo, v1, v3 -; GFX11-NEXT: v_cndmask_b32_e32 v5, v3, v1, vcc_lo +; GFX11-NEXT: 
v_dual_max_f32 v4, v0, v2 :: v_dual_max_f32 v5, v1, v3 ; GFX11-NEXT: v_cmp_o_f32_e32 vcc_lo, v0, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_4) ; GFX11-NEXT: v_cndmask_b32_e32 v0, 0x7fc00000, v4, vcc_lo ; GFX11-NEXT: v_cmp_o_f32_e32 vcc_lo, v1, v3 ; GFX11-NEXT: v_cndmask_b32_e32 v1, 0x7fc00000, v5, vcc_lo @@ -926,55 +725,42 @@ define <2 x float> @v_maximum_v2f32__nnan_nsz(<2 x float> %src0, <2 x float> %sr ; GFX7-LABEL: v_maximum_v2f32__nnan_nsz: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_max_legacy_f32_e32 v0, v0, v2 -; GFX7-NEXT: v_max_legacy_f32_e32 v1, v1, v3 +; GFX7-NEXT: v_max_f32_e32 v0, v0, v2 +; GFX7-NEXT: v_max_f32_e32 v1, v1, v3 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_maximum_v2f32__nnan_nsz: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_cmp_gt_f32_e32 vcc, v0, v2 -; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc -; GFX8-NEXT: v_cmp_gt_f32_e32 vcc, v1, v3 -; GFX8-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc +; GFX8-NEXT: v_max_f32_e32 v0, v0, v2 +; GFX8-NEXT: v_max_f32_e32 v1, v1, v3 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_maximum_v2f32__nnan_nsz: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_cmp_gt_f32_e32 vcc, v0, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc -; GFX9-NEXT: v_cmp_gt_f32_e32 vcc, v1, v3 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc +; GFX9-NEXT: v_max_f32_e32 v0, v0, v2 +; GFX9-NEXT: v_max_f32_e32 v1, v1, v3 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: v_maximum_v2f32__nnan_nsz: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_cmp_gt_f32_e32 vcc, v0, v2 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc -; GFX940-NEXT: v_cmp_gt_f32_e32 vcc, v1, v3 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: 
v_cndmask_b32_e32 v1, v3, v1, vcc +; GFX940-NEXT: v_max_f32_e32 v0, v0, v2 +; GFX940-NEXT: v_max_f32_e32 v1, v1, v3 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_maximum_v2f32__nnan_nsz: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_cmp_gt_f32_e32 vcc_lo, v0, v2 -; GFX10-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc_lo -; GFX10-NEXT: v_cmp_gt_f32_e32 vcc_lo, v1, v3 -; GFX10-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc_lo +; GFX10-NEXT: v_max_f32_e32 v0, v0, v2 +; GFX10-NEXT: v_max_f32_e32 v1, v1, v3 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: v_maximum_v2f32__nnan_nsz: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_cmp_gt_f32_e32 vcc_lo, v0, v2 -; GFX11-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc_lo -; GFX11-NEXT: v_cmp_gt_f32_e32 vcc_lo, v1, v3 -; GFX11-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc_lo +; GFX11-NEXT: v_dual_max_f32 v0, v0, v2 :: v_dual_max_f32 v1, v1, v3 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-LABEL: v_maximum_v2f32__nnan_nsz: @@ -996,28 +782,14 @@ define void @s_maximum_v2f32(<2 x float> inreg %src0, <2 x float> inreg %src1) { ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s7 -; GFX7-NEXT: v_max_legacy_f32_e32 v1, s5, v0 +; GFX7-NEXT: v_max_f32_e32 v1, s5, v0 ; GFX7-NEXT: v_mov_b32_e32 v2, 0x7fc00000 ; GFX7-NEXT: v_cmp_o_f32_e32 vcc, s5, v0 -; GFX7-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc -; GFX7-NEXT: v_mov_b32_e32 v3, s5 -; GFX7-NEXT: v_cmp_class_f32_e64 vcc, s5, 64 -; GFX7-NEXT: v_cndmask_b32_e32 v3, v1, v3, vcc -; GFX7-NEXT: v_cmp_class_f32_e64 vcc, s7, 64 -; GFX7-NEXT: v_cndmask_b32_e32 v0, v3, v0, vcc -; GFX7-NEXT: v_cmp_eq_f32_e32 vcc, 0, v1 -; GFX7-NEXT: v_cndmask_b32_e32 v1, v1, v0, vcc ; GFX7-NEXT: v_mov_b32_e32 v0, s6 -; GFX7-NEXT: v_max_legacy_f32_e32 v3, s4, v0 +; GFX7-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc +; GFX7-NEXT: v_max_f32_e32 v3, s4, v0 ; GFX7-NEXT: v_cmp_o_f32_e32 vcc, s4, v0 -; 
GFX7-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc -; GFX7-NEXT: v_mov_b32_e32 v3, s4 -; GFX7-NEXT: v_cmp_class_f32_e64 vcc, s4, 64 -; GFX7-NEXT: v_cndmask_b32_e32 v3, v2, v3, vcc -; GFX7-NEXT: v_cmp_class_f32_e64 vcc, s6, 64 -; GFX7-NEXT: v_cndmask_b32_e32 v0, v3, v0, vcc -; GFX7-NEXT: v_cmp_eq_f32_e32 vcc, 0, v2 -; GFX7-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc ; GFX7-NEXT: ;;#ASMSTART ; GFX7-NEXT: ; use v[0:1] ; GFX7-NEXT: ;;#ASMEND @@ -1027,30 +799,14 @@ define void @s_maximum_v2f32(<2 x float> inreg %src0, <2 x float> inreg %src1) { ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s7 -; GFX8-NEXT: v_mov_b32_e32 v1, s5 -; GFX8-NEXT: v_cmp_gt_f32_e32 vcc, s5, v0 -; GFX8-NEXT: v_cndmask_b32_e32 v2, v0, v1, vcc -; GFX8-NEXT: v_mov_b32_e32 v3, 0x7fc00000 +; GFX8-NEXT: v_max_f32_e32 v1, s5, v0 +; GFX8-NEXT: v_mov_b32_e32 v2, 0x7fc00000 ; GFX8-NEXT: v_cmp_o_f32_e32 vcc, s5, v0 -; GFX8-NEXT: v_cndmask_b32_e32 v2, v3, v2, vcc -; GFX8-NEXT: v_cmp_class_f32_e64 vcc, s5, 64 -; GFX8-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc -; GFX8-NEXT: v_cmp_class_f32_e64 vcc, s7, 64 -; GFX8-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc -; GFX8-NEXT: v_cmp_eq_f32_e32 vcc, 0, v2 -; GFX8-NEXT: v_cndmask_b32_e32 v1, v2, v0, vcc ; GFX8-NEXT: v_mov_b32_e32 v0, s6 -; GFX8-NEXT: v_mov_b32_e32 v2, s4 -; GFX8-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 -; GFX8-NEXT: v_cndmask_b32_e32 v4, v0, v2, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc +; GFX8-NEXT: v_max_f32_e32 v3, s4, v0 ; GFX8-NEXT: v_cmp_o_f32_e32 vcc, s4, v0 -; GFX8-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc -; GFX8-NEXT: v_cmp_class_f32_e64 vcc, s4, 64 -; GFX8-NEXT: v_cndmask_b32_e32 v2, v3, v2, vcc -; GFX8-NEXT: v_cmp_class_f32_e64 vcc, s6, 64 -; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc -; GFX8-NEXT: v_cmp_eq_f32_e32 vcc, 0, v3 -; GFX8-NEXT: v_cndmask_b32_e32 v0, v3, v0, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc ; GFX8-NEXT: ;;#ASMSTART ; 
GFX8-NEXT: ; use v[0:1] ; GFX8-NEXT: ;;#ASMEND @@ -1060,30 +816,14 @@ define void @s_maximum_v2f32(<2 x float> inreg %src0, <2 x float> inreg %src1) { ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s7 -; GFX9-NEXT: v_mov_b32_e32 v1, s5 -; GFX9-NEXT: v_cmp_gt_f32_e32 vcc, s5, v0 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v0, v1, vcc -; GFX9-NEXT: v_mov_b32_e32 v3, 0x7fc00000 +; GFX9-NEXT: v_max_f32_e32 v1, s5, v0 +; GFX9-NEXT: v_mov_b32_e32 v2, 0x7fc00000 ; GFX9-NEXT: v_cmp_o_f32_e32 vcc, s5, v0 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v2, vcc -; GFX9-NEXT: v_cmp_class_f32_e64 vcc, s5, 64 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc -; GFX9-NEXT: v_cmp_class_f32_e64 vcc, s7, 64 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc -; GFX9-NEXT: v_cmp_eq_f32_e32 vcc, 0, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v0, vcc ; GFX9-NEXT: v_mov_b32_e32 v0, s6 -; GFX9-NEXT: v_mov_b32_e32 v2, s4 -; GFX9-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0 -; GFX9-NEXT: v_cndmask_b32_e32 v4, v0, v2, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc +; GFX9-NEXT: v_max_f32_e32 v3, s4, v0 ; GFX9-NEXT: v_cmp_o_f32_e32 vcc, s4, v0 -; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc -; GFX9-NEXT: v_cmp_class_f32_e64 vcc, s4, 64 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v2, vcc -; GFX9-NEXT: v_cmp_class_f32_e64 vcc, s6, 64 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc -; GFX9-NEXT: v_cmp_eq_f32_e32 vcc, 0, v3 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v0, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc ; GFX9-NEXT: ;;#ASMSTART ; GFX9-NEXT: ; use v[0:1] ; GFX9-NEXT: ;;#ASMEND @@ -1093,40 +833,15 @@ define void @s_maximum_v2f32(<2 x float> inreg %src0, <2 x float> inreg %src1) { ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v0, s3 -; GFX940-NEXT: v_mov_b32_e32 v1, s1 -; GFX940-NEXT: v_cmp_gt_f32_e32 vcc, s1, v0 -; GFX940-NEXT: v_mov_b32_e32 v3, 0x7fc00000 -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: 
v_cndmask_b32_e32 v2, v0, v1, vcc +; GFX940-NEXT: v_max_f32_e32 v1, s1, v0 +; GFX940-NEXT: v_mov_b32_e32 v2, 0x7fc00000 ; GFX940-NEXT: v_cmp_o_f32_e32 vcc, s1, v0 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v2, v3, v2, vcc -; GFX940-NEXT: v_cmp_class_f32_e64 vcc, s1, 64 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc -; GFX940-NEXT: v_cmp_class_f32_e64 vcc, s3, 64 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc -; GFX940-NEXT: v_cmp_eq_f32_e32 vcc, 0, v2 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v1, v2, v0, vcc ; GFX940-NEXT: v_mov_b32_e32 v0, s2 -; GFX940-NEXT: v_mov_b32_e32 v2, s0 -; GFX940-NEXT: v_cmp_gt_f32_e32 vcc, s0, v0 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v4, v0, v2, vcc +; GFX940-NEXT: v_max_f32_e32 v3, s0, v0 +; GFX940-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc ; GFX940-NEXT: v_cmp_o_f32_e32 vcc, s0, v0 ; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc -; GFX940-NEXT: v_cmp_class_f32_e64 vcc, s0, 64 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v2, v3, v2, vcc -; GFX940-NEXT: v_cmp_class_f32_e64 vcc, s2, 64 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc -; GFX940-NEXT: v_cmp_eq_f32_e32 vcc, 0, v3 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v0, v3, v0, vcc +; GFX940-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc ; GFX940-NEXT: ;;#ASMSTART ; GFX940-NEXT: ; use v[0:1] ; GFX940-NEXT: ;;#ASMEND @@ -1135,28 +850,12 @@ define void @s_maximum_v2f32(<2 x float> inreg %src0, <2 x float> inreg %src1) { ; GFX10-LABEL: s_maximum_v2f32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v0, s5 -; GFX10-NEXT: v_cmp_gt_f32_e64 vcc_lo, s5, s7 -; GFX10-NEXT: v_mov_b32_e32 v1, s4 -; GFX10-NEXT: v_cmp_class_f32_e64 s8, s5, 64 -; GFX10-NEXT: v_cndmask_b32_e32 v0, s7, v0, vcc_lo -; GFX10-NEXT: v_cmp_gt_f32_e64 vcc_lo, s4, s6 -; GFX10-NEXT: v_cndmask_b32_e32 v1, 
s6, v1, vcc_lo +; GFX10-NEXT: v_max_f32_e64 v0, s5, s7 ; GFX10-NEXT: v_cmp_o_f32_e64 vcc_lo, s5, s7 -; GFX10-NEXT: v_cndmask_b32_e32 v0, 0x7fc00000, v0, vcc_lo +; GFX10-NEXT: v_max_f32_e64 v2, s4, s6 +; GFX10-NEXT: v_cndmask_b32_e32 v1, 0x7fc00000, v0, vcc_lo ; GFX10-NEXT: v_cmp_o_f32_e64 vcc_lo, s4, s6 -; GFX10-NEXT: v_cndmask_b32_e32 v2, 0x7fc00000, v1, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v1, v0, s5, s8 -; GFX10-NEXT: v_cmp_class_f32_e64 s5, s4, 64 -; GFX10-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v0 -; GFX10-NEXT: v_cndmask_b32_e64 v3, v2, s4, s5 -; GFX10-NEXT: v_cmp_class_f32_e64 s4, s7, 64 -; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, s7, s4 -; GFX10-NEXT: v_cmp_class_f32_e64 s4, s6, 64 -; GFX10-NEXT: v_cndmask_b32_e32 v1, v0, v1, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v3, v3, s6, s4 -; GFX10-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v2 -; GFX10-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v0, 0x7fc00000, v2, vcc_lo ; GFX10-NEXT: ;;#ASMSTART ; GFX10-NEXT: ; use v[0:1] ; GFX10-NEXT: ;;#ASMEND @@ -1165,32 +864,13 @@ define void @s_maximum_v2f32(<2 x float> inreg %src0, <2 x float> inreg %src1) { ; GFX11-LABEL: s_maximum_v2f32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v0, s1 :: v_dual_mov_b32 v1, s0 -; GFX11-NEXT: v_cmp_gt_f32_e64 vcc_lo, s1, s3 -; GFX11-NEXT: v_cmp_class_f32_e64 s4, s1, 64 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_cndmask_b32_e32 v0, s3, v0, vcc_lo -; GFX11-NEXT: v_cmp_gt_f32_e64 vcc_lo, s0, s2 -; GFX11-NEXT: v_cndmask_b32_e32 v1, s2, v1, vcc_lo +; GFX11-NEXT: v_max_f32_e64 v0, s1, s3 ; GFX11-NEXT: v_cmp_o_f32_e64 vcc_lo, s1, s3 -; GFX11-NEXT: v_cndmask_b32_e32 v0, 0x7fc00000, v0, vcc_lo +; GFX11-NEXT: v_max_f32_e64 v2, s0, s2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_cndmask_b32_e32 v1, 0x7fc00000, v0, vcc_lo ; GFX11-NEXT: v_cmp_o_f32_e64 
vcc_lo, s0, s2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_cndmask_b32_e32 v2, 0x7fc00000, v1, vcc_lo -; GFX11-NEXT: v_cndmask_b32_e64 v1, v0, s1, s4 -; GFX11-NEXT: v_cmp_class_f32_e64 s1, s0, 64 -; GFX11-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_cndmask_b32_e64 v3, v2, s0, s1 -; GFX11-NEXT: v_cmp_class_f32_e64 s0, s3, 64 -; GFX11-NEXT: v_cndmask_b32_e64 v1, v1, s3, s0 -; GFX11-NEXT: v_cmp_class_f32_e64 s0, s2, 64 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_cndmask_b32_e64 v3, v3, s2, s0 -; GFX11-NEXT: v_cndmask_b32_e32 v1, v0, v1, vcc_lo -; GFX11-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) -; GFX11-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e32 v0, 0x7fc00000, v2, vcc_lo ; GFX11-NEXT: ;;#ASMSTART ; GFX11-NEXT: ; use v[0:1] ; GFX11-NEXT: ;;#ASMEND @@ -1218,227 +898,92 @@ define <3 x float> @v_maximum_v3f32(<3 x float> %src0, <3 x float> %src1) { ; GFX7-LABEL: v_maximum_v3f32: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_max_legacy_f32_e32 v6, v0, v3 +; GFX7-NEXT: v_max_f32_e32 v6, v0, v3 ; GFX7-NEXT: v_mov_b32_e32 v7, 0x7fc00000 ; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v0, v3 -; GFX7-NEXT: v_cndmask_b32_e32 v6, v7, v6, vcc -; GFX7-NEXT: v_cmp_class_f32_e64 vcc, v0, 64 -; GFX7-NEXT: v_cndmask_b32_e32 v0, v6, v0, vcc -; GFX7-NEXT: v_cmp_class_f32_e64 vcc, v3, 64 -; GFX7-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc -; GFX7-NEXT: v_cmp_eq_f32_e32 vcc, 0, v6 -; GFX7-NEXT: v_cndmask_b32_e32 v0, v6, v0, vcc -; GFX7-NEXT: v_max_legacy_f32_e32 v3, v1, v4 +; GFX7-NEXT: v_cndmask_b32_e32 v0, v7, v6, vcc +; GFX7-NEXT: v_max_f32_e32 v3, v1, v4 ; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v1, v4 -; GFX7-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc -; GFX7-NEXT: v_cmp_class_f32_e64 
vcc, v1, 64 -; GFX7-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc -; GFX7-NEXT: v_cmp_class_f32_e64 vcc, v4, 64 -; GFX7-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc -; GFX7-NEXT: v_cmp_eq_f32_e32 vcc, 0, v3 -; GFX7-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc -; GFX7-NEXT: v_max_legacy_f32_e32 v3, v2, v5 +; GFX7-NEXT: v_cndmask_b32_e32 v1, v7, v3, vcc +; GFX7-NEXT: v_max_f32_e32 v3, v2, v5 ; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v2, v5 -; GFX7-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc -; GFX7-NEXT: v_cmp_class_f32_e64 vcc, v2, 64 -; GFX7-NEXT: v_cndmask_b32_e32 v2, v3, v2, vcc -; GFX7-NEXT: v_cmp_class_f32_e64 vcc, v5, 64 -; GFX7-NEXT: v_cndmask_b32_e32 v2, v2, v5, vcc -; GFX7-NEXT: v_cmp_eq_f32_e32 vcc, 0, v3 -; GFX7-NEXT: v_cndmask_b32_e32 v2, v3, v2, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v2, v7, v3, vcc ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_maximum_v3f32: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_cmp_gt_f32_e32 vcc, v0, v3 -; GFX8-NEXT: v_cndmask_b32_e32 v6, v3, v0, vcc +; GFX8-NEXT: v_max_f32_e32 v6, v0, v3 ; GFX8-NEXT: v_mov_b32_e32 v7, 0x7fc00000 ; GFX8-NEXT: v_cmp_o_f32_e32 vcc, v0, v3 -; GFX8-NEXT: v_cndmask_b32_e32 v6, v7, v6, vcc -; GFX8-NEXT: v_cmp_class_f32_e64 vcc, v0, 64 -; GFX8-NEXT: v_cndmask_b32_e32 v0, v6, v0, vcc -; GFX8-NEXT: v_cmp_class_f32_e64 vcc, v3, 64 -; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc -; GFX8-NEXT: v_cmp_eq_f32_e32 vcc, 0, v6 -; GFX8-NEXT: v_cndmask_b32_e32 v0, v6, v0, vcc -; GFX8-NEXT: v_cmp_gt_f32_e32 vcc, v1, v4 -; GFX8-NEXT: v_cndmask_b32_e32 v3, v4, v1, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v0, v7, v6, vcc +; GFX8-NEXT: v_max_f32_e32 v3, v1, v4 ; GFX8-NEXT: v_cmp_o_f32_e32 vcc, v1, v4 -; GFX8-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc -; GFX8-NEXT: v_cmp_class_f32_e64 vcc, v1, 64 -; GFX8-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc -; GFX8-NEXT: v_cmp_class_f32_e64 vcc, v4, 64 -; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc -; GFX8-NEXT: v_cmp_eq_f32_e32 vcc, 0, v3 -; GFX8-NEXT: v_cndmask_b32_e32 
v1, v3, v1, vcc -; GFX8-NEXT: v_cmp_gt_f32_e32 vcc, v2, v5 -; GFX8-NEXT: v_cndmask_b32_e32 v3, v5, v2, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v1, v7, v3, vcc +; GFX8-NEXT: v_max_f32_e32 v3, v2, v5 ; GFX8-NEXT: v_cmp_o_f32_e32 vcc, v2, v5 -; GFX8-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc -; GFX8-NEXT: v_cmp_class_f32_e64 vcc, v2, 64 -; GFX8-NEXT: v_cndmask_b32_e32 v2, v3, v2, vcc -; GFX8-NEXT: v_cmp_class_f32_e64 vcc, v5, 64 -; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v5, vcc -; GFX8-NEXT: v_cmp_eq_f32_e32 vcc, 0, v3 -; GFX8-NEXT: v_cndmask_b32_e32 v2, v3, v2, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v2, v7, v3, vcc ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_maximum_v3f32: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_cmp_gt_f32_e32 vcc, v0, v3 -; GFX9-NEXT: v_cndmask_b32_e32 v6, v3, v0, vcc +; GFX9-NEXT: v_max_f32_e32 v6, v0, v3 ; GFX9-NEXT: v_mov_b32_e32 v7, 0x7fc00000 ; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v3 -; GFX9-NEXT: v_cndmask_b32_e32 v6, v7, v6, vcc -; GFX9-NEXT: v_cmp_class_f32_e64 vcc, v0, 64 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v6, v0, vcc -; GFX9-NEXT: v_cmp_class_f32_e64 vcc, v3, 64 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc -; GFX9-NEXT: v_cmp_eq_f32_e32 vcc, 0, v6 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v6, v0, vcc -; GFX9-NEXT: v_cmp_gt_f32_e32 vcc, v1, v4 -; GFX9-NEXT: v_cndmask_b32_e32 v3, v4, v1, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v0, v7, v6, vcc +; GFX9-NEXT: v_max_f32_e32 v3, v1, v4 ; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v1, v4 -; GFX9-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc -; GFX9-NEXT: v_cmp_class_f32_e64 vcc, v1, 64 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc -; GFX9-NEXT: v_cmp_class_f32_e64 vcc, v4, 64 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc -; GFX9-NEXT: v_cmp_eq_f32_e32 vcc, 0, v3 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc -; GFX9-NEXT: v_cmp_gt_f32_e32 vcc, v2, v5 -; GFX9-NEXT: v_cndmask_b32_e32 v3, v5, v2, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v1, v7, v3, vcc +; GFX9-NEXT: v_max_f32_e32 
v3, v2, v5 ; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v2, v5 -; GFX9-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc -; GFX9-NEXT: v_cmp_class_f32_e64 vcc, v2, 64 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v2, vcc -; GFX9-NEXT: v_cmp_class_f32_e64 vcc, v5, 64 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v5, vcc -; GFX9-NEXT: v_cmp_eq_f32_e32 vcc, 0, v3 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v2, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v2, v7, v3, vcc ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: v_maximum_v3f32: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_cmp_gt_f32_e32 vcc, v0, v3 +; GFX940-NEXT: v_max_f32_e32 v6, v0, v3 ; GFX940-NEXT: v_mov_b32_e32 v7, 0x7fc00000 -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e32 v6, v3, v0, vcc ; GFX940-NEXT: v_cmp_o_f32_e32 vcc, v0, v3 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v6, v7, v6, vcc -; GFX940-NEXT: v_cmp_class_f32_e64 vcc, v0, 64 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v0, v6, v0, vcc -; GFX940-NEXT: v_cmp_class_f32_e64 vcc, v3, 64 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc -; GFX940-NEXT: v_cmp_eq_f32_e32 vcc, 0, v6 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v0, v6, v0, vcc -; GFX940-NEXT: v_cmp_gt_f32_e32 vcc, v1, v4 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v3, v4, v1, vcc +; GFX940-NEXT: v_max_f32_e32 v3, v1, v4 +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_cndmask_b32_e32 v0, v7, v6, vcc ; GFX940-NEXT: v_cmp_o_f32_e32 vcc, v1, v4 ; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc -; GFX940-NEXT: v_cmp_class_f32_e64 vcc, v1, 64 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc -; GFX940-NEXT: v_cmp_class_f32_e64 vcc, v4, 64 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc -; GFX940-NEXT: v_cmp_eq_f32_e32 vcc, 0, v3 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc -; GFX940-NEXT: v_cmp_gt_f32_e32 
vcc, v2, v5 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v3, v5, v2, vcc +; GFX940-NEXT: v_cndmask_b32_e32 v1, v7, v3, vcc +; GFX940-NEXT: v_max_f32_e32 v3, v2, v5 ; GFX940-NEXT: v_cmp_o_f32_e32 vcc, v2, v5 ; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc -; GFX940-NEXT: v_cmp_class_f32_e64 vcc, v2, 64 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v2, v3, v2, vcc -; GFX940-NEXT: v_cmp_class_f32_e64 vcc, v5, 64 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v2, v2, v5, vcc -; GFX940-NEXT: v_cmp_eq_f32_e32 vcc, 0, v3 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v2, v3, v2, vcc +; GFX940-NEXT: v_cndmask_b32_e32 v2, v7, v3, vcc ; GFX940-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_maximum_v3f32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_cmp_gt_f32_e32 vcc_lo, v0, v3 -; GFX10-NEXT: v_cndmask_b32_e32 v6, v3, v0, vcc_lo -; GFX10-NEXT: v_cmp_gt_f32_e32 vcc_lo, v1, v4 -; GFX10-NEXT: v_cndmask_b32_e32 v7, v4, v1, vcc_lo -; GFX10-NEXT: v_cmp_gt_f32_e32 vcc_lo, v2, v5 -; GFX10-NEXT: v_cndmask_b32_e32 v8, v5, v2, vcc_lo +; GFX10-NEXT: v_max_f32_e32 v6, v0, v3 ; GFX10-NEXT: v_cmp_o_f32_e32 vcc_lo, v0, v3 -; GFX10-NEXT: v_cndmask_b32_e32 v6, 0x7fc00000, v6, vcc_lo +; GFX10-NEXT: v_max_f32_e32 v7, v1, v4 +; GFX10-NEXT: v_max_f32_e32 v8, v2, v5 +; GFX10-NEXT: v_cndmask_b32_e32 v0, 0x7fc00000, v6, vcc_lo ; GFX10-NEXT: v_cmp_o_f32_e32 vcc_lo, v1, v4 -; GFX10-NEXT: v_cndmask_b32_e32 v7, 0x7fc00000, v7, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v1, 0x7fc00000, v7, vcc_lo ; GFX10-NEXT: v_cmp_o_f32_e32 vcc_lo, v2, v5 -; GFX10-NEXT: v_cndmask_b32_e32 v8, 0x7fc00000, v8, vcc_lo -; GFX10-NEXT: v_cmp_class_f32_e64 vcc_lo, v0, 64 -; GFX10-NEXT: v_cndmask_b32_e32 v0, v6, v0, vcc_lo -; GFX10-NEXT: v_cmp_class_f32_e64 vcc_lo, v1, 64 -; GFX10-NEXT: v_cndmask_b32_e32 v1, v7, v1, vcc_lo -; GFX10-NEXT: v_cmp_class_f32_e64 vcc_lo, v2, 64 -; GFX10-NEXT: v_cndmask_b32_e32 v2, v8, v2, 
vcc_lo -; GFX10-NEXT: v_cmp_class_f32_e64 vcc_lo, v3, 64 -; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc_lo -; GFX10-NEXT: v_cmp_class_f32_e64 vcc_lo, v4, 64 -; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc_lo -; GFX10-NEXT: v_cmp_class_f32_e64 vcc_lo, v5, 64 -; GFX10-NEXT: v_cndmask_b32_e32 v2, v2, v5, vcc_lo -; GFX10-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v6 -; GFX10-NEXT: v_cndmask_b32_e32 v0, v6, v0, vcc_lo -; GFX10-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v7 -; GFX10-NEXT: v_cndmask_b32_e32 v1, v7, v1, vcc_lo -; GFX10-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v8 -; GFX10-NEXT: v_cndmask_b32_e32 v2, v8, v2, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v2, 0x7fc00000, v8, vcc_lo ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: v_maximum_v3f32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_cmp_gt_f32_e32 vcc_lo, v0, v3 -; GFX11-NEXT: v_cndmask_b32_e32 v6, v3, v0, vcc_lo -; GFX11-NEXT: v_cmp_gt_f32_e32 vcc_lo, v1, v4 -; GFX11-NEXT: v_cndmask_b32_e32 v7, v4, v1, vcc_lo -; GFX11-NEXT: v_cmp_gt_f32_e32 vcc_lo, v2, v5 -; GFX11-NEXT: v_cndmask_b32_e32 v8, v5, v2, vcc_lo +; GFX11-NEXT: v_dual_max_f32 v6, v0, v3 :: v_dual_max_f32 v7, v1, v4 ; GFX11-NEXT: v_cmp_o_f32_e32 vcc_lo, v0, v3 -; GFX11-NEXT: v_cndmask_b32_e32 v6, 0x7fc00000, v6, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_cndmask_b32_e32 v0, 0x7fc00000, v6, vcc_lo ; GFX11-NEXT: v_cmp_o_f32_e32 vcc_lo, v1, v4 -; GFX11-NEXT: v_cndmask_b32_e32 v7, 0x7fc00000, v7, vcc_lo +; GFX11-NEXT: v_dual_max_f32 v8, v2, v5 :: v_dual_cndmask_b32 v1, 0x7fc00000, v7 ; GFX11-NEXT: v_cmp_o_f32_e32 vcc_lo, v2, v5 -; GFX11-NEXT: v_cndmask_b32_e32 v8, 0x7fc00000, v8, vcc_lo -; GFX11-NEXT: v_cmp_class_f32_e64 vcc_lo, v0, 64 -; GFX11-NEXT: v_cndmask_b32_e32 v0, v6, v0, vcc_lo -; GFX11-NEXT: v_cmp_class_f32_e64 vcc_lo, v1, 64 -; GFX11-NEXT: v_cndmask_b32_e32 v1, v7, v1, vcc_lo -; GFX11-NEXT: v_cmp_class_f32_e64 vcc_lo, v2, 64 -; GFX11-NEXT: 
v_cndmask_b32_e32 v2, v8, v2, vcc_lo -; GFX11-NEXT: v_cmp_class_f32_e64 vcc_lo, v3, 64 -; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc_lo -; GFX11-NEXT: v_cmp_class_f32_e64 vcc_lo, v4, 64 -; GFX11-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc_lo -; GFX11-NEXT: v_cmp_class_f32_e64 vcc_lo, v5, 64 -; GFX11-NEXT: v_cndmask_b32_e32 v2, v2, v5, vcc_lo -; GFX11-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v6 -; GFX11-NEXT: v_cndmask_b32_e32 v0, v6, v0, vcc_lo -; GFX11-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v7 -; GFX11-NEXT: v_cndmask_b32_e32 v1, v7, v1, vcc_lo -; GFX11-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v8 -; GFX11-NEXT: v_cndmask_b32_e32 v2, v8, v2, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: v_cndmask_b32_e32 v2, 0x7fc00000, v8, vcc_lo ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-LABEL: v_maximum_v3f32: @@ -1460,184 +1005,48 @@ define <3 x float> @v_maximum_v3f32__nnan(<3 x float> %src0, <3 x float> %src1) ; GFX7-LABEL: v_maximum_v3f32__nnan: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_max_legacy_f32_e32 v6, v0, v3 -; GFX7-NEXT: v_cmp_class_f32_e64 vcc, v0, 64 -; GFX7-NEXT: v_cndmask_b32_e32 v0, v6, v0, vcc -; GFX7-NEXT: v_cmp_class_f32_e64 vcc, v3, 64 -; GFX7-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc -; GFX7-NEXT: v_cmp_eq_f32_e32 vcc, 0, v6 -; GFX7-NEXT: v_cndmask_b32_e32 v0, v6, v0, vcc -; GFX7-NEXT: v_max_legacy_f32_e32 v3, v1, v4 -; GFX7-NEXT: v_cmp_class_f32_e64 vcc, v1, 64 -; GFX7-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc -; GFX7-NEXT: v_cmp_class_f32_e64 vcc, v4, 64 -; GFX7-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc -; GFX7-NEXT: v_cmp_eq_f32_e32 vcc, 0, v3 -; GFX7-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc -; GFX7-NEXT: v_max_legacy_f32_e32 v3, v2, v5 -; GFX7-NEXT: v_cmp_class_f32_e64 vcc, v2, 64 -; GFX7-NEXT: v_cndmask_b32_e32 v2, v3, v2, vcc -; GFX7-NEXT: v_cmp_class_f32_e64 vcc, v5, 64 -; GFX7-NEXT: v_cndmask_b32_e32 v2, v2, v5, vcc -; GFX7-NEXT: v_cmp_eq_f32_e32 vcc, 0, v3 -; GFX7-NEXT: v_cndmask_b32_e32 v2, v3, v2, vcc +; 
GFX7-NEXT: v_max_f32_e32 v0, v0, v3 +; GFX7-NEXT: v_max_f32_e32 v1, v1, v4 +; GFX7-NEXT: v_max_f32_e32 v2, v2, v5 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_maximum_v3f32__nnan: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_cmp_gt_f32_e32 vcc, v0, v3 -; GFX8-NEXT: v_cndmask_b32_e32 v6, v3, v0, vcc -; GFX8-NEXT: v_cmp_class_f32_e64 vcc, v0, 64 -; GFX8-NEXT: v_cndmask_b32_e32 v0, v6, v0, vcc -; GFX8-NEXT: v_cmp_class_f32_e64 vcc, v3, 64 -; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc -; GFX8-NEXT: v_cmp_eq_f32_e32 vcc, 0, v6 -; GFX8-NEXT: v_cndmask_b32_e32 v0, v6, v0, vcc -; GFX8-NEXT: v_cmp_gt_f32_e32 vcc, v1, v4 -; GFX8-NEXT: v_cndmask_b32_e32 v3, v4, v1, vcc -; GFX8-NEXT: v_cmp_class_f32_e64 vcc, v1, 64 -; GFX8-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc -; GFX8-NEXT: v_cmp_class_f32_e64 vcc, v4, 64 -; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc -; GFX8-NEXT: v_cmp_eq_f32_e32 vcc, 0, v3 -; GFX8-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc -; GFX8-NEXT: v_cmp_gt_f32_e32 vcc, v2, v5 -; GFX8-NEXT: v_cndmask_b32_e32 v3, v5, v2, vcc -; GFX8-NEXT: v_cmp_class_f32_e64 vcc, v2, 64 -; GFX8-NEXT: v_cndmask_b32_e32 v2, v3, v2, vcc -; GFX8-NEXT: v_cmp_class_f32_e64 vcc, v5, 64 -; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v5, vcc -; GFX8-NEXT: v_cmp_eq_f32_e32 vcc, 0, v3 -; GFX8-NEXT: v_cndmask_b32_e32 v2, v3, v2, vcc +; GFX8-NEXT: v_max_f32_e32 v0, v0, v3 +; GFX8-NEXT: v_max_f32_e32 v1, v1, v4 +; GFX8-NEXT: v_max_f32_e32 v2, v2, v5 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_maximum_v3f32__nnan: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_cmp_gt_f32_e32 vcc, v0, v3 -; GFX9-NEXT: v_cndmask_b32_e32 v6, v3, v0, vcc -; GFX9-NEXT: v_cmp_class_f32_e64 vcc, v0, 64 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v6, v0, vcc -; GFX9-NEXT: v_cmp_class_f32_e64 vcc, v3, 64 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc -; GFX9-NEXT: v_cmp_eq_f32_e32 vcc, 0, v6 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v6, v0, vcc -; 
GFX9-NEXT: v_cmp_gt_f32_e32 vcc, v1, v4 -; GFX9-NEXT: v_cndmask_b32_e32 v3, v4, v1, vcc -; GFX9-NEXT: v_cmp_class_f32_e64 vcc, v1, 64 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc -; GFX9-NEXT: v_cmp_class_f32_e64 vcc, v4, 64 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc -; GFX9-NEXT: v_cmp_eq_f32_e32 vcc, 0, v3 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc -; GFX9-NEXT: v_cmp_gt_f32_e32 vcc, v2, v5 -; GFX9-NEXT: v_cndmask_b32_e32 v3, v5, v2, vcc -; GFX9-NEXT: v_cmp_class_f32_e64 vcc, v2, 64 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v2, vcc -; GFX9-NEXT: v_cmp_class_f32_e64 vcc, v5, 64 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v5, vcc -; GFX9-NEXT: v_cmp_eq_f32_e32 vcc, 0, v3 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v2, vcc +; GFX9-NEXT: v_max_f32_e32 v0, v0, v3 +; GFX9-NEXT: v_max_f32_e32 v1, v1, v4 +; GFX9-NEXT: v_max_f32_e32 v2, v2, v5 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: v_maximum_v3f32__nnan: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_cmp_gt_f32_e32 vcc, v0, v3 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v6, v3, v0, vcc -; GFX940-NEXT: v_cmp_class_f32_e64 vcc, v0, 64 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v0, v6, v0, vcc -; GFX940-NEXT: v_cmp_class_f32_e64 vcc, v3, 64 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc -; GFX940-NEXT: v_cmp_eq_f32_e32 vcc, 0, v6 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v0, v6, v0, vcc -; GFX940-NEXT: v_cmp_gt_f32_e32 vcc, v1, v4 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v3, v4, v1, vcc -; GFX940-NEXT: v_cmp_class_f32_e64 vcc, v1, 64 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc -; GFX940-NEXT: v_cmp_class_f32_e64 vcc, v4, 64 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc -; GFX940-NEXT: v_cmp_eq_f32_e32 vcc, 0, v3 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc -; GFX940-NEXT: v_cmp_gt_f32_e32 
vcc, v2, v5 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v3, v5, v2, vcc -; GFX940-NEXT: v_cmp_class_f32_e64 vcc, v2, 64 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v2, v3, v2, vcc -; GFX940-NEXT: v_cmp_class_f32_e64 vcc, v5, 64 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v2, v2, v5, vcc -; GFX940-NEXT: v_cmp_eq_f32_e32 vcc, 0, v3 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v2, v3, v2, vcc +; GFX940-NEXT: v_max_f32_e32 v0, v0, v3 +; GFX940-NEXT: v_max_f32_e32 v1, v1, v4 +; GFX940-NEXT: v_max_f32_e32 v2, v2, v5 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_maximum_v3f32__nnan: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_cmp_gt_f32_e32 vcc_lo, v0, v3 -; GFX10-NEXT: v_cndmask_b32_e32 v6, v3, v0, vcc_lo -; GFX10-NEXT: v_cmp_gt_f32_e32 vcc_lo, v1, v4 -; GFX10-NEXT: v_cndmask_b32_e32 v7, v4, v1, vcc_lo -; GFX10-NEXT: v_cmp_gt_f32_e32 vcc_lo, v2, v5 -; GFX10-NEXT: v_cndmask_b32_e32 v8, v5, v2, vcc_lo -; GFX10-NEXT: v_cmp_class_f32_e64 vcc_lo, v0, 64 -; GFX10-NEXT: v_cndmask_b32_e32 v0, v6, v0, vcc_lo -; GFX10-NEXT: v_cmp_class_f32_e64 vcc_lo, v1, 64 -; GFX10-NEXT: v_cndmask_b32_e32 v1, v7, v1, vcc_lo -; GFX10-NEXT: v_cmp_class_f32_e64 vcc_lo, v2, 64 -; GFX10-NEXT: v_cndmask_b32_e32 v2, v8, v2, vcc_lo -; GFX10-NEXT: v_cmp_class_f32_e64 vcc_lo, v3, 64 -; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc_lo -; GFX10-NEXT: v_cmp_class_f32_e64 vcc_lo, v4, 64 -; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc_lo -; GFX10-NEXT: v_cmp_class_f32_e64 vcc_lo, v5, 64 -; GFX10-NEXT: v_cndmask_b32_e32 v2, v2, v5, vcc_lo -; GFX10-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v6 -; GFX10-NEXT: v_cndmask_b32_e32 v0, v6, v0, vcc_lo -; GFX10-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v7 -; GFX10-NEXT: v_cndmask_b32_e32 v1, v7, v1, vcc_lo -; GFX10-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v8 -; GFX10-NEXT: v_cndmask_b32_e32 v2, v8, v2, vcc_lo +; GFX10-NEXT: v_max_f32_e32 v0, v0, v3 +; GFX10-NEXT: v_max_f32_e32 v1, v1, 
v4 +; GFX10-NEXT: v_max_f32_e32 v2, v2, v5 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: v_maximum_v3f32__nnan: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_cmp_gt_f32_e32 vcc_lo, v0, v3 -; GFX11-NEXT: v_cndmask_b32_e32 v6, v3, v0, vcc_lo -; GFX11-NEXT: v_cmp_gt_f32_e32 vcc_lo, v1, v4 -; GFX11-NEXT: v_cndmask_b32_e32 v7, v4, v1, vcc_lo -; GFX11-NEXT: v_cmp_gt_f32_e32 vcc_lo, v2, v5 -; GFX11-NEXT: v_cndmask_b32_e32 v8, v5, v2, vcc_lo -; GFX11-NEXT: v_cmp_class_f32_e64 vcc_lo, v0, 64 -; GFX11-NEXT: v_cndmask_b32_e32 v0, v6, v0, vcc_lo -; GFX11-NEXT: v_cmp_class_f32_e64 vcc_lo, v1, 64 -; GFX11-NEXT: v_cndmask_b32_e32 v1, v7, v1, vcc_lo -; GFX11-NEXT: v_cmp_class_f32_e64 vcc_lo, v2, 64 -; GFX11-NEXT: v_cndmask_b32_e32 v2, v8, v2, vcc_lo -; GFX11-NEXT: v_cmp_class_f32_e64 vcc_lo, v3, 64 -; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc_lo -; GFX11-NEXT: v_cmp_class_f32_e64 vcc_lo, v4, 64 -; GFX11-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc_lo -; GFX11-NEXT: v_cmp_class_f32_e64 vcc_lo, v5, 64 -; GFX11-NEXT: v_cndmask_b32_e32 v2, v2, v5, vcc_lo -; GFX11-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v6 -; GFX11-NEXT: v_cndmask_b32_e32 v0, v6, v0, vcc_lo -; GFX11-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v7 -; GFX11-NEXT: v_cndmask_b32_e32 v1, v7, v1, vcc_lo -; GFX11-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v8 -; GFX11-NEXT: v_cndmask_b32_e32 v2, v8, v2, vcc_lo +; GFX11-NEXT: v_dual_max_f32 v0, v0, v3 :: v_dual_max_f32 v1, v1, v4 +; GFX11-NEXT: v_max_f32_e32 v2, v2, v5 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-LABEL: v_maximum_v3f32__nnan: @@ -1659,14 +1068,14 @@ define <3 x float> @v_maximum_v3f32__nsz(<3 x float> %src0, <3 x float> %src1) { ; GFX7-LABEL: v_maximum_v3f32__nsz: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_max_legacy_f32_e32 v6, v0, v3 +; GFX7-NEXT: v_max_f32_e32 v6, v0, v3 ; GFX7-NEXT: v_mov_b32_e32 v7, 0x7fc00000 ; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v0, v3 ; GFX7-NEXT: v_cndmask_b32_e32 v0, v7, v6, 
vcc -; GFX7-NEXT: v_max_legacy_f32_e32 v3, v1, v4 +; GFX7-NEXT: v_max_f32_e32 v3, v1, v4 ; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v1, v4 ; GFX7-NEXT: v_cndmask_b32_e32 v1, v7, v3, vcc -; GFX7-NEXT: v_max_legacy_f32_e32 v3, v2, v5 +; GFX7-NEXT: v_max_f32_e32 v3, v2, v5 ; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v2, v5 ; GFX7-NEXT: v_cndmask_b32_e32 v2, v7, v3, vcc ; GFX7-NEXT: s_setpc_b64 s[30:31] @@ -1674,17 +1083,14 @@ define <3 x float> @v_maximum_v3f32__nsz(<3 x float> %src0, <3 x float> %src1) { ; GFX8-LABEL: v_maximum_v3f32__nsz: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_cmp_gt_f32_e32 vcc, v0, v3 -; GFX8-NEXT: v_cndmask_b32_e32 v6, v3, v0, vcc +; GFX8-NEXT: v_max_f32_e32 v6, v0, v3 ; GFX8-NEXT: v_mov_b32_e32 v7, 0x7fc00000 ; GFX8-NEXT: v_cmp_o_f32_e32 vcc, v0, v3 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v7, v6, vcc -; GFX8-NEXT: v_cmp_gt_f32_e32 vcc, v1, v4 -; GFX8-NEXT: v_cndmask_b32_e32 v3, v4, v1, vcc +; GFX8-NEXT: v_max_f32_e32 v3, v1, v4 ; GFX8-NEXT: v_cmp_o_f32_e32 vcc, v1, v4 ; GFX8-NEXT: v_cndmask_b32_e32 v1, v7, v3, vcc -; GFX8-NEXT: v_cmp_gt_f32_e32 vcc, v2, v5 -; GFX8-NEXT: v_cndmask_b32_e32 v3, v5, v2, vcc +; GFX8-NEXT: v_max_f32_e32 v3, v2, v5 ; GFX8-NEXT: v_cmp_o_f32_e32 vcc, v2, v5 ; GFX8-NEXT: v_cndmask_b32_e32 v2, v7, v3, vcc ; GFX8-NEXT: s_setpc_b64 s[30:31] @@ -1692,17 +1098,14 @@ define <3 x float> @v_maximum_v3f32__nsz(<3 x float> %src0, <3 x float> %src1) { ; GFX9-LABEL: v_maximum_v3f32__nsz: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_cmp_gt_f32_e32 vcc, v0, v3 -; GFX9-NEXT: v_cndmask_b32_e32 v6, v3, v0, vcc +; GFX9-NEXT: v_max_f32_e32 v6, v0, v3 ; GFX9-NEXT: v_mov_b32_e32 v7, 0x7fc00000 ; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v3 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v7, v6, vcc -; GFX9-NEXT: v_cmp_gt_f32_e32 vcc, v1, v4 -; GFX9-NEXT: v_cndmask_b32_e32 v3, v4, v1, vcc +; GFX9-NEXT: v_max_f32_e32 v3, v1, v4 ; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v1, v4 ; GFX9-NEXT: v_cndmask_b32_e32 
v1, v7, v3, vcc -; GFX9-NEXT: v_cmp_gt_f32_e32 vcc, v2, v5 -; GFX9-NEXT: v_cndmask_b32_e32 v3, v5, v2, vcc +; GFX9-NEXT: v_max_f32_e32 v3, v2, v5 ; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v2, v5 ; GFX9-NEXT: v_cndmask_b32_e32 v2, v7, v3, vcc ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -1710,22 +1113,16 @@ define <3 x float> @v_maximum_v3f32__nsz(<3 x float> %src0, <3 x float> %src1) { ; GFX940-LABEL: v_maximum_v3f32__nsz: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_cmp_gt_f32_e32 vcc, v0, v3 +; GFX940-NEXT: v_max_f32_e32 v6, v0, v3 ; GFX940-NEXT: v_mov_b32_e32 v7, 0x7fc00000 -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e32 v6, v3, v0, vcc ; GFX940-NEXT: v_cmp_o_f32_e32 vcc, v0, v3 -; GFX940-NEXT: s_nop 1 +; GFX940-NEXT: v_max_f32_e32 v3, v1, v4 +; GFX940-NEXT: s_nop 0 ; GFX940-NEXT: v_cndmask_b32_e32 v0, v7, v6, vcc -; GFX940-NEXT: v_cmp_gt_f32_e32 vcc, v1, v4 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v3, v4, v1, vcc ; GFX940-NEXT: v_cmp_o_f32_e32 vcc, v1, v4 ; GFX940-NEXT: s_nop 1 ; GFX940-NEXT: v_cndmask_b32_e32 v1, v7, v3, vcc -; GFX940-NEXT: v_cmp_gt_f32_e32 vcc, v2, v5 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v3, v5, v2, vcc +; GFX940-NEXT: v_max_f32_e32 v3, v2, v5 ; GFX940-NEXT: v_cmp_o_f32_e32 vcc, v2, v5 ; GFX940-NEXT: s_nop 1 ; GFX940-NEXT: v_cndmask_b32_e32 v2, v7, v3, vcc @@ -1734,13 +1131,10 @@ define <3 x float> @v_maximum_v3f32__nsz(<3 x float> %src0, <3 x float> %src1) { ; GFX10-LABEL: v_maximum_v3f32__nsz: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_cmp_gt_f32_e32 vcc_lo, v0, v3 -; GFX10-NEXT: v_cndmask_b32_e32 v6, v3, v0, vcc_lo -; GFX10-NEXT: v_cmp_gt_f32_e32 vcc_lo, v1, v4 -; GFX10-NEXT: v_cndmask_b32_e32 v7, v4, v1, vcc_lo -; GFX10-NEXT: v_cmp_gt_f32_e32 vcc_lo, v2, v5 -; GFX10-NEXT: v_cndmask_b32_e32 v8, v5, v2, vcc_lo +; GFX10-NEXT: v_max_f32_e32 v6, v0, v3 ; GFX10-NEXT: v_cmp_o_f32_e32 vcc_lo, v0, v3 +; GFX10-NEXT: 
v_max_f32_e32 v7, v1, v4 +; GFX10-NEXT: v_max_f32_e32 v8, v2, v5 ; GFX10-NEXT: v_cndmask_b32_e32 v0, 0x7fc00000, v6, vcc_lo ; GFX10-NEXT: v_cmp_o_f32_e32 vcc_lo, v1, v4 ; GFX10-NEXT: v_cndmask_b32_e32 v1, 0x7fc00000, v7, vcc_lo @@ -1751,17 +1145,14 @@ define <3 x float> @v_maximum_v3f32__nsz(<3 x float> %src0, <3 x float> %src1) { ; GFX11-LABEL: v_maximum_v3f32__nsz: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_cmp_gt_f32_e32 vcc_lo, v0, v3 -; GFX11-NEXT: v_cndmask_b32_e32 v6, v3, v0, vcc_lo -; GFX11-NEXT: v_cmp_gt_f32_e32 vcc_lo, v1, v4 -; GFX11-NEXT: v_cndmask_b32_e32 v7, v4, v1, vcc_lo -; GFX11-NEXT: v_cmp_gt_f32_e32 vcc_lo, v2, v5 -; GFX11-NEXT: v_cndmask_b32_e32 v8, v5, v2, vcc_lo +; GFX11-NEXT: v_dual_max_f32 v6, v0, v3 :: v_dual_max_f32 v7, v1, v4 ; GFX11-NEXT: v_cmp_o_f32_e32 vcc_lo, v0, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_4) ; GFX11-NEXT: v_cndmask_b32_e32 v0, 0x7fc00000, v6, vcc_lo ; GFX11-NEXT: v_cmp_o_f32_e32 vcc_lo, v1, v4 -; GFX11-NEXT: v_cndmask_b32_e32 v1, 0x7fc00000, v7, vcc_lo +; GFX11-NEXT: v_dual_max_f32 v8, v2, v5 :: v_dual_cndmask_b32 v1, 0x7fc00000, v7 ; GFX11-NEXT: v_cmp_o_f32_e32 vcc_lo, v2, v5 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11-NEXT: v_cndmask_b32_e32 v2, 0x7fc00000, v8, vcc_lo ; GFX11-NEXT: s_setpc_b64 s[30:31] ; @@ -1784,67 +1175,48 @@ define <3 x float> @v_maximum_v3f32__nnan_nsz(<3 x float> %src0, <3 x float> %sr ; GFX7-LABEL: v_maximum_v3f32__nnan_nsz: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_max_legacy_f32_e32 v0, v0, v3 -; GFX7-NEXT: v_max_legacy_f32_e32 v1, v1, v4 -; GFX7-NEXT: v_max_legacy_f32_e32 v2, v2, v5 +; GFX7-NEXT: v_max_f32_e32 v0, v0, v3 +; GFX7-NEXT: v_max_f32_e32 v1, v1, v4 +; GFX7-NEXT: v_max_f32_e32 v2, v2, v5 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_maximum_v3f32__nnan_nsz: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; 
GFX8-NEXT: v_cmp_gt_f32_e32 vcc, v0, v3 -; GFX8-NEXT: v_cndmask_b32_e32 v0, v3, v0, vcc -; GFX8-NEXT: v_cmp_gt_f32_e32 vcc, v1, v4 -; GFX8-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc -; GFX8-NEXT: v_cmp_gt_f32_e32 vcc, v2, v5 -; GFX8-NEXT: v_cndmask_b32_e32 v2, v5, v2, vcc +; GFX8-NEXT: v_max_f32_e32 v0, v0, v3 +; GFX8-NEXT: v_max_f32_e32 v1, v1, v4 +; GFX8-NEXT: v_max_f32_e32 v2, v2, v5 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_maximum_v3f32__nnan_nsz: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_cmp_gt_f32_e32 vcc, v0, v3 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v0, vcc -; GFX9-NEXT: v_cmp_gt_f32_e32 vcc, v1, v4 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc -; GFX9-NEXT: v_cmp_gt_f32_e32 vcc, v2, v5 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v5, v2, vcc +; GFX9-NEXT: v_max_f32_e32 v0, v0, v3 +; GFX9-NEXT: v_max_f32_e32 v1, v1, v4 +; GFX9-NEXT: v_max_f32_e32 v2, v2, v5 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: v_maximum_v3f32__nnan_nsz: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_cmp_gt_f32_e32 vcc, v0, v3 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v0, v3, v0, vcc -; GFX940-NEXT: v_cmp_gt_f32_e32 vcc, v1, v4 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc -; GFX940-NEXT: v_cmp_gt_f32_e32 vcc, v2, v5 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v2, v5, v2, vcc +; GFX940-NEXT: v_max_f32_e32 v0, v0, v3 +; GFX940-NEXT: v_max_f32_e32 v1, v1, v4 +; GFX940-NEXT: v_max_f32_e32 v2, v2, v5 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_maximum_v3f32__nnan_nsz: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_cmp_gt_f32_e32 vcc_lo, v0, v3 -; GFX10-NEXT: v_cndmask_b32_e32 v0, v3, v0, vcc_lo -; GFX10-NEXT: v_cmp_gt_f32_e32 vcc_lo, v1, v4 -; GFX10-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc_lo -; GFX10-NEXT: v_cmp_gt_f32_e32 vcc_lo, v2, v5 -; GFX10-NEXT: v_cndmask_b32_e32 v2, v5, v2, 
vcc_lo +; GFX10-NEXT: v_max_f32_e32 v0, v0, v3 +; GFX10-NEXT: v_max_f32_e32 v1, v1, v4 +; GFX10-NEXT: v_max_f32_e32 v2, v2, v5 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: v_maximum_v3f32__nnan_nsz: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_cmp_gt_f32_e32 vcc_lo, v0, v3 -; GFX11-NEXT: v_cndmask_b32_e32 v0, v3, v0, vcc_lo -; GFX11-NEXT: v_cmp_gt_f32_e32 vcc_lo, v1, v4 -; GFX11-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc_lo -; GFX11-NEXT: v_cmp_gt_f32_e32 vcc_lo, v2, v5 -; GFX11-NEXT: v_cndmask_b32_e32 v2, v5, v2, vcc_lo +; GFX11-NEXT: v_dual_max_f32 v0, v0, v3 :: v_dual_max_f32 v1, v1, v4 +; GFX11-NEXT: v_max_f32_e32 v2, v2, v5 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-LABEL: v_maximum_v3f32__nnan_nsz: @@ -1866,292 +1238,111 @@ define <4 x float> @v_maximum_v4f32(<4 x float> %src0, <4 x float> %src1) { ; GFX7-LABEL: v_maximum_v4f32: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_max_legacy_f32_e32 v8, v0, v4 +; GFX7-NEXT: v_max_f32_e32 v8, v0, v4 ; GFX7-NEXT: v_mov_b32_e32 v9, 0x7fc00000 ; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v0, v4 -; GFX7-NEXT: v_cndmask_b32_e32 v8, v9, v8, vcc -; GFX7-NEXT: v_cmp_class_f32_e64 vcc, v0, 64 -; GFX7-NEXT: v_cndmask_b32_e32 v0, v8, v0, vcc -; GFX7-NEXT: v_cmp_class_f32_e64 vcc, v4, 64 -; GFX7-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc -; GFX7-NEXT: v_cmp_eq_f32_e32 vcc, 0, v8 -; GFX7-NEXT: v_cndmask_b32_e32 v0, v8, v0, vcc -; GFX7-NEXT: v_max_legacy_f32_e32 v4, v1, v5 +; GFX7-NEXT: v_cndmask_b32_e32 v0, v9, v8, vcc +; GFX7-NEXT: v_max_f32_e32 v4, v1, v5 ; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v1, v5 -; GFX7-NEXT: v_cndmask_b32_e32 v4, v9, v4, vcc -; GFX7-NEXT: v_cmp_class_f32_e64 vcc, v1, 64 -; GFX7-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc -; GFX7-NEXT: v_cmp_class_f32_e64 vcc, v5, 64 -; GFX7-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc -; GFX7-NEXT: v_cmp_eq_f32_e32 vcc, 0, v4 -; GFX7-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc -; GFX7-NEXT: v_max_legacy_f32_e32 v4, 
v2, v6 +; GFX7-NEXT: v_cndmask_b32_e32 v1, v9, v4, vcc +; GFX7-NEXT: v_max_f32_e32 v4, v2, v6 ; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v2, v6 -; GFX7-NEXT: v_cndmask_b32_e32 v4, v9, v4, vcc -; GFX7-NEXT: v_cmp_class_f32_e64 vcc, v2, 64 -; GFX7-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc -; GFX7-NEXT: v_cmp_class_f32_e64 vcc, v6, 64 -; GFX7-NEXT: v_cndmask_b32_e32 v2, v2, v6, vcc -; GFX7-NEXT: v_cmp_eq_f32_e32 vcc, 0, v4 -; GFX7-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc -; GFX7-NEXT: v_max_legacy_f32_e32 v4, v3, v7 +; GFX7-NEXT: v_cndmask_b32_e32 v2, v9, v4, vcc +; GFX7-NEXT: v_max_f32_e32 v4, v3, v7 ; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v3, v7 -; GFX7-NEXT: v_cndmask_b32_e32 v4, v9, v4, vcc -; GFX7-NEXT: v_cmp_class_f32_e64 vcc, v3, 64 -; GFX7-NEXT: v_cndmask_b32_e32 v3, v4, v3, vcc -; GFX7-NEXT: v_cmp_class_f32_e64 vcc, v7, 64 -; GFX7-NEXT: v_cndmask_b32_e32 v3, v3, v7, vcc -; GFX7-NEXT: v_cmp_eq_f32_e32 vcc, 0, v4 -; GFX7-NEXT: v_cndmask_b32_e32 v3, v4, v3, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v3, v9, v4, vcc ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_maximum_v4f32: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_cmp_gt_f32_e32 vcc, v0, v4 -; GFX8-NEXT: v_cndmask_b32_e32 v8, v4, v0, vcc +; GFX8-NEXT: v_max_f32_e32 v8, v0, v4 ; GFX8-NEXT: v_mov_b32_e32 v9, 0x7fc00000 ; GFX8-NEXT: v_cmp_o_f32_e32 vcc, v0, v4 -; GFX8-NEXT: v_cndmask_b32_e32 v8, v9, v8, vcc -; GFX8-NEXT: v_cmp_class_f32_e64 vcc, v0, 64 -; GFX8-NEXT: v_cndmask_b32_e32 v0, v8, v0, vcc -; GFX8-NEXT: v_cmp_class_f32_e64 vcc, v4, 64 -; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc -; GFX8-NEXT: v_cmp_eq_f32_e32 vcc, 0, v8 -; GFX8-NEXT: v_cndmask_b32_e32 v0, v8, v0, vcc -; GFX8-NEXT: v_cmp_gt_f32_e32 vcc, v1, v5 -; GFX8-NEXT: v_cndmask_b32_e32 v4, v5, v1, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v0, v9, v8, vcc +; GFX8-NEXT: v_max_f32_e32 v4, v1, v5 ; GFX8-NEXT: v_cmp_o_f32_e32 vcc, v1, v5 -; GFX8-NEXT: v_cndmask_b32_e32 v4, v9, v4, vcc -; GFX8-NEXT: v_cmp_class_f32_e64 vcc, v1, 64 
-; GFX8-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc -; GFX8-NEXT: v_cmp_class_f32_e64 vcc, v5, 64 -; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc -; GFX8-NEXT: v_cmp_eq_f32_e32 vcc, 0, v4 -; GFX8-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc -; GFX8-NEXT: v_cmp_gt_f32_e32 vcc, v2, v6 -; GFX8-NEXT: v_cndmask_b32_e32 v4, v6, v2, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v1, v9, v4, vcc +; GFX8-NEXT: v_max_f32_e32 v4, v2, v6 ; GFX8-NEXT: v_cmp_o_f32_e32 vcc, v2, v6 -; GFX8-NEXT: v_cndmask_b32_e32 v4, v9, v4, vcc -; GFX8-NEXT: v_cmp_class_f32_e64 vcc, v2, 64 -; GFX8-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc -; GFX8-NEXT: v_cmp_class_f32_e64 vcc, v6, 64 -; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v6, vcc -; GFX8-NEXT: v_cmp_eq_f32_e32 vcc, 0, v4 -; GFX8-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc -; GFX8-NEXT: v_cmp_gt_f32_e32 vcc, v3, v7 -; GFX8-NEXT: v_cndmask_b32_e32 v4, v7, v3, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v2, v9, v4, vcc +; GFX8-NEXT: v_max_f32_e32 v4, v3, v7 ; GFX8-NEXT: v_cmp_o_f32_e32 vcc, v3, v7 -; GFX8-NEXT: v_cndmask_b32_e32 v4, v9, v4, vcc -; GFX8-NEXT: v_cmp_class_f32_e64 vcc, v3, 64 -; GFX8-NEXT: v_cndmask_b32_e32 v3, v4, v3, vcc -; GFX8-NEXT: v_cmp_class_f32_e64 vcc, v7, 64 -; GFX8-NEXT: v_cndmask_b32_e32 v3, v3, v7, vcc -; GFX8-NEXT: v_cmp_eq_f32_e32 vcc, 0, v4 -; GFX8-NEXT: v_cndmask_b32_e32 v3, v4, v3, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v3, v9, v4, vcc ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_maximum_v4f32: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_cmp_gt_f32_e32 vcc, v0, v4 -; GFX9-NEXT: v_cndmask_b32_e32 v8, v4, v0, vcc +; GFX9-NEXT: v_max_f32_e32 v8, v0, v4 ; GFX9-NEXT: v_mov_b32_e32 v9, 0x7fc00000 ; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v4 -; GFX9-NEXT: v_cndmask_b32_e32 v8, v9, v8, vcc -; GFX9-NEXT: v_cmp_class_f32_e64 vcc, v0, 64 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v8, v0, vcc -; GFX9-NEXT: v_cmp_class_f32_e64 vcc, v4, 64 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc -; GFX9-NEXT: v_cmp_eq_f32_e32 vcc, 0, v8 
-; GFX9-NEXT: v_cndmask_b32_e32 v0, v8, v0, vcc -; GFX9-NEXT: v_cmp_gt_f32_e32 vcc, v1, v5 -; GFX9-NEXT: v_cndmask_b32_e32 v4, v5, v1, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v0, v9, v8, vcc +; GFX9-NEXT: v_max_f32_e32 v4, v1, v5 ; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v1, v5 -; GFX9-NEXT: v_cndmask_b32_e32 v4, v9, v4, vcc -; GFX9-NEXT: v_cmp_class_f32_e64 vcc, v1, 64 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc -; GFX9-NEXT: v_cmp_class_f32_e64 vcc, v5, 64 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc -; GFX9-NEXT: v_cmp_eq_f32_e32 vcc, 0, v4 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc -; GFX9-NEXT: v_cmp_gt_f32_e32 vcc, v2, v6 -; GFX9-NEXT: v_cndmask_b32_e32 v4, v6, v2, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v1, v9, v4, vcc +; GFX9-NEXT: v_max_f32_e32 v4, v2, v6 ; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v2, v6 -; GFX9-NEXT: v_cndmask_b32_e32 v4, v9, v4, vcc -; GFX9-NEXT: v_cmp_class_f32_e64 vcc, v2, 64 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc -; GFX9-NEXT: v_cmp_class_f32_e64 vcc, v6, 64 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v6, vcc -; GFX9-NEXT: v_cmp_eq_f32_e32 vcc, 0, v4 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc -; GFX9-NEXT: v_cmp_gt_f32_e32 vcc, v3, v7 -; GFX9-NEXT: v_cndmask_b32_e32 v4, v7, v3, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v2, v9, v4, vcc +; GFX9-NEXT: v_max_f32_e32 v4, v3, v7 ; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v3, v7 -; GFX9-NEXT: v_cndmask_b32_e32 v4, v9, v4, vcc -; GFX9-NEXT: v_cmp_class_f32_e64 vcc, v3, 64 -; GFX9-NEXT: v_cndmask_b32_e32 v3, v4, v3, vcc -; GFX9-NEXT: v_cmp_class_f32_e64 vcc, v7, 64 -; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v7, vcc -; GFX9-NEXT: v_cmp_eq_f32_e32 vcc, 0, v4 -; GFX9-NEXT: v_cndmask_b32_e32 v3, v4, v3, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v3, v9, v4, vcc ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: v_maximum_v4f32: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_cmp_gt_f32_e32 vcc, v0, v4 +; GFX940-NEXT: v_max_f32_e32 v8, v0, v4 ; GFX940-NEXT: v_mov_b32_e32 v9, 
0x7fc00000 -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e32 v8, v4, v0, vcc ; GFX940-NEXT: v_cmp_o_f32_e32 vcc, v0, v4 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v8, v9, v8, vcc -; GFX940-NEXT: v_cmp_class_f32_e64 vcc, v0, 64 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v0, v8, v0, vcc -; GFX940-NEXT: v_cmp_class_f32_e64 vcc, v4, 64 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc -; GFX940-NEXT: v_cmp_eq_f32_e32 vcc, 0, v8 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v0, v8, v0, vcc -; GFX940-NEXT: v_cmp_gt_f32_e32 vcc, v1, v5 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v4, v5, v1, vcc +; GFX940-NEXT: v_max_f32_e32 v4, v1, v5 +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_cndmask_b32_e32 v0, v9, v8, vcc ; GFX940-NEXT: v_cmp_o_f32_e32 vcc, v1, v5 ; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v4, v9, v4, vcc -; GFX940-NEXT: v_cmp_class_f32_e64 vcc, v1, 64 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc -; GFX940-NEXT: v_cmp_class_f32_e64 vcc, v5, 64 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc -; GFX940-NEXT: v_cmp_eq_f32_e32 vcc, 0, v4 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc -; GFX940-NEXT: v_cmp_gt_f32_e32 vcc, v2, v6 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v4, v6, v2, vcc +; GFX940-NEXT: v_cndmask_b32_e32 v1, v9, v4, vcc +; GFX940-NEXT: v_max_f32_e32 v4, v2, v6 ; GFX940-NEXT: v_cmp_o_f32_e32 vcc, v2, v6 ; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v4, v9, v4, vcc -; GFX940-NEXT: v_cmp_class_f32_e64 vcc, v2, 64 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc -; GFX940-NEXT: v_cmp_class_f32_e64 vcc, v6, 64 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v2, v2, v6, vcc -; GFX940-NEXT: v_cmp_eq_f32_e32 vcc, 0, v4 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc -; GFX940-NEXT: v_cmp_gt_f32_e32 
vcc, v3, v7 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v4, v7, v3, vcc +; GFX940-NEXT: v_cndmask_b32_e32 v2, v9, v4, vcc +; GFX940-NEXT: v_max_f32_e32 v4, v3, v7 ; GFX940-NEXT: v_cmp_o_f32_e32 vcc, v3, v7 ; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v4, v9, v4, vcc -; GFX940-NEXT: v_cmp_class_f32_e64 vcc, v3, 64 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v3, v4, v3, vcc -; GFX940-NEXT: v_cmp_class_f32_e64 vcc, v7, 64 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v3, v3, v7, vcc -; GFX940-NEXT: v_cmp_eq_f32_e32 vcc, 0, v4 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v3, v4, v3, vcc +; GFX940-NEXT: v_cndmask_b32_e32 v3, v9, v4, vcc ; GFX940-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_maximum_v4f32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_cmp_gt_f32_e32 vcc_lo, v0, v4 -; GFX10-NEXT: v_cndmask_b32_e32 v8, v4, v0, vcc_lo +; GFX10-NEXT: v_max_f32_e32 v8, v0, v4 ; GFX10-NEXT: v_cmp_o_f32_e32 vcc_lo, v0, v4 -; GFX10-NEXT: v_cndmask_b32_e32 v8, 0x7fc00000, v8, vcc_lo -; GFX10-NEXT: v_cmp_class_f32_e64 vcc_lo, v0, 64 -; GFX10-NEXT: v_cndmask_b32_e32 v0, v8, v0, vcc_lo -; GFX10-NEXT: v_cmp_gt_f32_e32 vcc_lo, v1, v5 -; GFX10-NEXT: v_cndmask_b32_e32 v9, v5, v1, vcc_lo -; GFX10-NEXT: v_cmp_gt_f32_e32 vcc_lo, v2, v6 -; GFX10-NEXT: v_cndmask_b32_e32 v10, v6, v2, vcc_lo -; GFX10-NEXT: v_cmp_class_f32_e64 vcc_lo, v4, 64 -; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc_lo -; GFX10-NEXT: v_cmp_gt_f32_e32 vcc_lo, v3, v7 -; GFX10-NEXT: v_cndmask_b32_e32 v4, v7, v3, vcc_lo +; GFX10-NEXT: v_max_f32_e32 v9, v1, v5 +; GFX10-NEXT: v_max_f32_e32 v4, v2, v6 +; GFX10-NEXT: v_cndmask_b32_e32 v0, 0x7fc00000, v8, vcc_lo ; GFX10-NEXT: v_cmp_o_f32_e32 vcc_lo, v1, v5 -; GFX10-NEXT: v_cndmask_b32_e32 v9, 0x7fc00000, v9, vcc_lo -; GFX10-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v8 -; GFX10-NEXT: v_cndmask_b32_e32 v0, v8, v0, vcc_lo +; GFX10-NEXT: v_max_f32_e32 v8, v3, v7 +; GFX10-NEXT: 
v_cndmask_b32_e32 v1, 0x7fc00000, v9, vcc_lo ; GFX10-NEXT: v_cmp_o_f32_e32 vcc_lo, v2, v6 -; GFX10-NEXT: v_cndmask_b32_e32 v8, 0x7fc00000, v10, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v2, 0x7fc00000, v4, vcc_lo ; GFX10-NEXT: v_cmp_o_f32_e32 vcc_lo, v3, v7 -; GFX10-NEXT: v_cndmask_b32_e32 v4, 0x7fc00000, v4, vcc_lo -; GFX10-NEXT: v_cmp_class_f32_e64 vcc_lo, v1, 64 -; GFX10-NEXT: v_cndmask_b32_e32 v1, v9, v1, vcc_lo -; GFX10-NEXT: v_cmp_class_f32_e64 vcc_lo, v2, 64 -; GFX10-NEXT: v_cndmask_b32_e32 v2, v8, v2, vcc_lo -; GFX10-NEXT: v_cmp_class_f32_e64 vcc_lo, v3, 64 -; GFX10-NEXT: v_cndmask_b32_e32 v3, v4, v3, vcc_lo -; GFX10-NEXT: v_cmp_class_f32_e64 vcc_lo, v5, 64 -; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc_lo -; GFX10-NEXT: v_cmp_class_f32_e64 vcc_lo, v6, 64 -; GFX10-NEXT: v_cndmask_b32_e32 v2, v2, v6, vcc_lo -; GFX10-NEXT: v_cmp_class_f32_e64 vcc_lo, v7, 64 -; GFX10-NEXT: v_cndmask_b32_e32 v3, v3, v7, vcc_lo -; GFX10-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v9 -; GFX10-NEXT: v_cndmask_b32_e32 v1, v9, v1, vcc_lo -; GFX10-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v8 -; GFX10-NEXT: v_cndmask_b32_e32 v2, v8, v2, vcc_lo -; GFX10-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v4 -; GFX10-NEXT: v_cndmask_b32_e32 v3, v4, v3, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v3, 0x7fc00000, v8, vcc_lo ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: v_maximum_v4f32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_cmp_gt_f32_e32 vcc_lo, v0, v4 -; GFX11-NEXT: v_cndmask_b32_e32 v8, v4, v0, vcc_lo +; GFX11-NEXT: v_dual_max_f32 v8, v0, v4 :: v_dual_max_f32 v9, v1, v5 ; GFX11-NEXT: v_cmp_o_f32_e32 vcc_lo, v0, v4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cndmask_b32_e32 v8, 0x7fc00000, v8, vcc_lo -; GFX11-NEXT: v_cmp_class_f32_e64 vcc_lo, v0, 64 -; GFX11-NEXT: v_cndmask_b32_e32 v0, v8, v0, vcc_lo -; GFX11-NEXT: v_cmp_gt_f32_e32 vcc_lo, v1, v5 -; GFX11-NEXT: v_cndmask_b32_e32 v9, v5, v1, vcc_lo -; 
GFX11-NEXT: v_cmp_gt_f32_e32 vcc_lo, v2, v6 -; GFX11-NEXT: v_cndmask_b32_e32 v10, v6, v2, vcc_lo -; GFX11-NEXT: v_cmp_class_f32_e64 vcc_lo, v4, 64 -; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc_lo -; GFX11-NEXT: v_cmp_gt_f32_e32 vcc_lo, v3, v7 -; GFX11-NEXT: v_cndmask_b32_e32 v4, v7, v3, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_cndmask_b32_e32 v0, 0x7fc00000, v8, vcc_lo ; GFX11-NEXT: v_cmp_o_f32_e32 vcc_lo, v1, v5 -; GFX11-NEXT: v_cndmask_b32_e32 v9, 0x7fc00000, v9, vcc_lo -; GFX11-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v8 -; GFX11-NEXT: v_cndmask_b32_e32 v0, v8, v0, vcc_lo +; GFX11-NEXT: v_max_f32_e32 v4, v2, v6 +; GFX11-NEXT: v_dual_max_f32 v8, v3, v7 :: v_dual_cndmask_b32 v1, 0x7fc00000, v9 ; GFX11-NEXT: v_cmp_o_f32_e32 vcc_lo, v2, v6 -; GFX11-NEXT: v_cndmask_b32_e32 v8, 0x7fc00000, v10, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e32 v2, 0x7fc00000, v4, vcc_lo ; GFX11-NEXT: v_cmp_o_f32_e32 vcc_lo, v3, v7 -; GFX11-NEXT: v_cndmask_b32_e32 v4, 0x7fc00000, v4, vcc_lo -; GFX11-NEXT: v_cmp_class_f32_e64 vcc_lo, v1, 64 -; GFX11-NEXT: v_cndmask_b32_e32 v1, v9, v1, vcc_lo -; GFX11-NEXT: v_cmp_class_f32_e64 vcc_lo, v2, 64 -; GFX11-NEXT: v_cndmask_b32_e32 v2, v8, v2, vcc_lo -; GFX11-NEXT: v_cmp_class_f32_e64 vcc_lo, v3, 64 -; GFX11-NEXT: v_cndmask_b32_e32 v3, v4, v3, vcc_lo -; GFX11-NEXT: v_cmp_class_f32_e64 vcc_lo, v5, 64 -; GFX11-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc_lo -; GFX11-NEXT: v_cmp_class_f32_e64 vcc_lo, v6, 64 -; GFX11-NEXT: v_cndmask_b32_e32 v2, v2, v6, vcc_lo -; GFX11-NEXT: v_cmp_class_f32_e64 vcc_lo, v7, 64 -; GFX11-NEXT: v_cndmask_b32_e32 v3, v3, v7, vcc_lo -; GFX11-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v9 -; GFX11-NEXT: v_cndmask_b32_e32 v1, v9, v1, vcc_lo -; GFX11-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v8 -; GFX11-NEXT: v_cndmask_b32_e32 v2, v8, v2, vcc_lo -; GFX11-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v4 -; GFX11-NEXT: v_cndmask_b32_e32 v3, v4, v3, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) +; 
GFX11-NEXT: v_cndmask_b32_e32 v3, 0x7fc00000, v8, vcc_lo ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-LABEL: v_maximum_v4f32: @@ -2174,236 +1365,53 @@ define <4 x float> @v_maximum_v4f32__nnan(<4 x float> %src0, <4 x float> %src1) ; GFX7-LABEL: v_maximum_v4f32__nnan: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_max_legacy_f32_e32 v8, v0, v4 -; GFX7-NEXT: v_cmp_class_f32_e64 vcc, v0, 64 -; GFX7-NEXT: v_cndmask_b32_e32 v0, v8, v0, vcc -; GFX7-NEXT: v_cmp_class_f32_e64 vcc, v4, 64 -; GFX7-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc -; GFX7-NEXT: v_cmp_eq_f32_e32 vcc, 0, v8 -; GFX7-NEXT: v_cndmask_b32_e32 v0, v8, v0, vcc -; GFX7-NEXT: v_max_legacy_f32_e32 v4, v1, v5 -; GFX7-NEXT: v_cmp_class_f32_e64 vcc, v1, 64 -; GFX7-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc -; GFX7-NEXT: v_cmp_class_f32_e64 vcc, v5, 64 -; GFX7-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc -; GFX7-NEXT: v_cmp_eq_f32_e32 vcc, 0, v4 -; GFX7-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc -; GFX7-NEXT: v_max_legacy_f32_e32 v4, v2, v6 -; GFX7-NEXT: v_cmp_class_f32_e64 vcc, v2, 64 -; GFX7-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc -; GFX7-NEXT: v_cmp_class_f32_e64 vcc, v6, 64 -; GFX7-NEXT: v_cndmask_b32_e32 v2, v2, v6, vcc -; GFX7-NEXT: v_cmp_eq_f32_e32 vcc, 0, v4 -; GFX7-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc -; GFX7-NEXT: v_max_legacy_f32_e32 v4, v3, v7 -; GFX7-NEXT: v_cmp_class_f32_e64 vcc, v3, 64 -; GFX7-NEXT: v_cndmask_b32_e32 v3, v4, v3, vcc -; GFX7-NEXT: v_cmp_class_f32_e64 vcc, v7, 64 -; GFX7-NEXT: v_cndmask_b32_e32 v3, v3, v7, vcc -; GFX7-NEXT: v_cmp_eq_f32_e32 vcc, 0, v4 -; GFX7-NEXT: v_cndmask_b32_e32 v3, v4, v3, vcc +; GFX7-NEXT: v_max_f32_e32 v0, v0, v4 +; GFX7-NEXT: v_max_f32_e32 v1, v1, v5 +; GFX7-NEXT: v_max_f32_e32 v2, v2, v6 +; GFX7-NEXT: v_max_f32_e32 v3, v3, v7 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_maximum_v4f32__nnan: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_cmp_gt_f32_e32 vcc, v0, v4 -; GFX8-NEXT: 
v_cndmask_b32_e32 v8, v4, v0, vcc -; GFX8-NEXT: v_cmp_class_f32_e64 vcc, v0, 64 -; GFX8-NEXT: v_cndmask_b32_e32 v0, v8, v0, vcc -; GFX8-NEXT: v_cmp_class_f32_e64 vcc, v4, 64 -; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc -; GFX8-NEXT: v_cmp_eq_f32_e32 vcc, 0, v8 -; GFX8-NEXT: v_cndmask_b32_e32 v0, v8, v0, vcc -; GFX8-NEXT: v_cmp_gt_f32_e32 vcc, v1, v5 -; GFX8-NEXT: v_cndmask_b32_e32 v4, v5, v1, vcc -; GFX8-NEXT: v_cmp_class_f32_e64 vcc, v1, 64 -; GFX8-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc -; GFX8-NEXT: v_cmp_class_f32_e64 vcc, v5, 64 -; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc -; GFX8-NEXT: v_cmp_eq_f32_e32 vcc, 0, v4 -; GFX8-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc -; GFX8-NEXT: v_cmp_gt_f32_e32 vcc, v2, v6 -; GFX8-NEXT: v_cndmask_b32_e32 v4, v6, v2, vcc -; GFX8-NEXT: v_cmp_class_f32_e64 vcc, v2, 64 -; GFX8-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc -; GFX8-NEXT: v_cmp_class_f32_e64 vcc, v6, 64 -; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v6, vcc -; GFX8-NEXT: v_cmp_eq_f32_e32 vcc, 0, v4 -; GFX8-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc -; GFX8-NEXT: v_cmp_gt_f32_e32 vcc, v3, v7 -; GFX8-NEXT: v_cndmask_b32_e32 v4, v7, v3, vcc -; GFX8-NEXT: v_cmp_class_f32_e64 vcc, v3, 64 -; GFX8-NEXT: v_cndmask_b32_e32 v3, v4, v3, vcc -; GFX8-NEXT: v_cmp_class_f32_e64 vcc, v7, 64 -; GFX8-NEXT: v_cndmask_b32_e32 v3, v3, v7, vcc -; GFX8-NEXT: v_cmp_eq_f32_e32 vcc, 0, v4 -; GFX8-NEXT: v_cndmask_b32_e32 v3, v4, v3, vcc +; GFX8-NEXT: v_max_f32_e32 v0, v0, v4 +; GFX8-NEXT: v_max_f32_e32 v1, v1, v5 +; GFX8-NEXT: v_max_f32_e32 v2, v2, v6 +; GFX8-NEXT: v_max_f32_e32 v3, v3, v7 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_maximum_v4f32__nnan: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_cmp_gt_f32_e32 vcc, v0, v4 -; GFX9-NEXT: v_cndmask_b32_e32 v8, v4, v0, vcc -; GFX9-NEXT: v_cmp_class_f32_e64 vcc, v0, 64 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v8, v0, vcc -; GFX9-NEXT: v_cmp_class_f32_e64 vcc, v4, 64 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc -; 
GFX9-NEXT: v_cmp_eq_f32_e32 vcc, 0, v8 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v8, v0, vcc -; GFX9-NEXT: v_cmp_gt_f32_e32 vcc, v1, v5 -; GFX9-NEXT: v_cndmask_b32_e32 v4, v5, v1, vcc -; GFX9-NEXT: v_cmp_class_f32_e64 vcc, v1, 64 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc -; GFX9-NEXT: v_cmp_class_f32_e64 vcc, v5, 64 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc -; GFX9-NEXT: v_cmp_eq_f32_e32 vcc, 0, v4 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc -; GFX9-NEXT: v_cmp_gt_f32_e32 vcc, v2, v6 -; GFX9-NEXT: v_cndmask_b32_e32 v4, v6, v2, vcc -; GFX9-NEXT: v_cmp_class_f32_e64 vcc, v2, 64 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc -; GFX9-NEXT: v_cmp_class_f32_e64 vcc, v6, 64 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v6, vcc -; GFX9-NEXT: v_cmp_eq_f32_e32 vcc, 0, v4 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc -; GFX9-NEXT: v_cmp_gt_f32_e32 vcc, v3, v7 -; GFX9-NEXT: v_cndmask_b32_e32 v4, v7, v3, vcc -; GFX9-NEXT: v_cmp_class_f32_e64 vcc, v3, 64 -; GFX9-NEXT: v_cndmask_b32_e32 v3, v4, v3, vcc -; GFX9-NEXT: v_cmp_class_f32_e64 vcc, v7, 64 -; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v7, vcc -; GFX9-NEXT: v_cmp_eq_f32_e32 vcc, 0, v4 -; GFX9-NEXT: v_cndmask_b32_e32 v3, v4, v3, vcc +; GFX9-NEXT: v_max_f32_e32 v0, v0, v4 +; GFX9-NEXT: v_max_f32_e32 v1, v1, v5 +; GFX9-NEXT: v_max_f32_e32 v2, v2, v6 +; GFX9-NEXT: v_max_f32_e32 v3, v3, v7 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: v_maximum_v4f32__nnan: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_cmp_gt_f32_e32 vcc, v0, v4 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v8, v4, v0, vcc -; GFX940-NEXT: v_cmp_class_f32_e64 vcc, v0, 64 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v0, v8, v0, vcc -; GFX940-NEXT: v_cmp_class_f32_e64 vcc, v4, 64 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc -; GFX940-NEXT: v_cmp_eq_f32_e32 vcc, 0, v8 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v0, v8, v0, vcc -; GFX940-NEXT: 
v_cmp_gt_f32_e32 vcc, v1, v5 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v4, v5, v1, vcc -; GFX940-NEXT: v_cmp_class_f32_e64 vcc, v1, 64 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc -; GFX940-NEXT: v_cmp_class_f32_e64 vcc, v5, 64 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc -; GFX940-NEXT: v_cmp_eq_f32_e32 vcc, 0, v4 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc -; GFX940-NEXT: v_cmp_gt_f32_e32 vcc, v2, v6 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v4, v6, v2, vcc -; GFX940-NEXT: v_cmp_class_f32_e64 vcc, v2, 64 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc -; GFX940-NEXT: v_cmp_class_f32_e64 vcc, v6, 64 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v2, v2, v6, vcc -; GFX940-NEXT: v_cmp_eq_f32_e32 vcc, 0, v4 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc -; GFX940-NEXT: v_cmp_gt_f32_e32 vcc, v3, v7 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v4, v7, v3, vcc -; GFX940-NEXT: v_cmp_class_f32_e64 vcc, v3, 64 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v3, v4, v3, vcc -; GFX940-NEXT: v_cmp_class_f32_e64 vcc, v7, 64 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v3, v3, v7, vcc -; GFX940-NEXT: v_cmp_eq_f32_e32 vcc, 0, v4 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v3, v4, v3, vcc +; GFX940-NEXT: v_max_f32_e32 v0, v0, v4 +; GFX940-NEXT: v_max_f32_e32 v1, v1, v5 +; GFX940-NEXT: v_max_f32_e32 v2, v2, v6 +; GFX940-NEXT: v_max_f32_e32 v3, v3, v7 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_maximum_v4f32__nnan: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_cmp_gt_f32_e32 vcc_lo, v0, v4 -; GFX10-NEXT: v_cndmask_b32_e32 v8, v4, v0, vcc_lo -; GFX10-NEXT: v_cmp_class_f32_e64 vcc_lo, v0, 64 -; GFX10-NEXT: v_cndmask_b32_e32 v0, v8, v0, vcc_lo -; GFX10-NEXT: v_cmp_class_f32_e64 vcc_lo, v4, 64 -; 
GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc_lo -; GFX10-NEXT: v_cmp_gt_f32_e32 vcc_lo, v1, v5 -; GFX10-NEXT: v_cndmask_b32_e32 v4, v5, v1, vcc_lo -; GFX10-NEXT: v_cmp_gt_f32_e32 vcc_lo, v2, v6 -; GFX10-NEXT: v_cndmask_b32_e32 v9, v6, v2, vcc_lo -; GFX10-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v8 -; GFX10-NEXT: v_cndmask_b32_e32 v0, v8, v0, vcc_lo -; GFX10-NEXT: v_cmp_gt_f32_e32 vcc_lo, v3, v7 -; GFX10-NEXT: v_cndmask_b32_e32 v8, v7, v3, vcc_lo -; GFX10-NEXT: v_cmp_class_f32_e64 vcc_lo, v1, 64 -; GFX10-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc_lo -; GFX10-NEXT: v_cmp_class_f32_e64 vcc_lo, v2, 64 -; GFX10-NEXT: v_cndmask_b32_e32 v2, v9, v2, vcc_lo -; GFX10-NEXT: v_cmp_class_f32_e64 vcc_lo, v3, 64 -; GFX10-NEXT: v_cndmask_b32_e32 v3, v8, v3, vcc_lo -; GFX10-NEXT: v_cmp_class_f32_e64 vcc_lo, v5, 64 -; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc_lo -; GFX10-NEXT: v_cmp_class_f32_e64 vcc_lo, v6, 64 -; GFX10-NEXT: v_cndmask_b32_e32 v2, v2, v6, vcc_lo -; GFX10-NEXT: v_cmp_class_f32_e64 vcc_lo, v7, 64 -; GFX10-NEXT: v_cndmask_b32_e32 v3, v3, v7, vcc_lo -; GFX10-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v4 -; GFX10-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc_lo -; GFX10-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v9 -; GFX10-NEXT: v_cndmask_b32_e32 v2, v9, v2, vcc_lo -; GFX10-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v8 -; GFX10-NEXT: v_cndmask_b32_e32 v3, v8, v3, vcc_lo +; GFX10-NEXT: v_max_f32_e32 v0, v0, v4 +; GFX10-NEXT: v_max_f32_e32 v1, v1, v5 +; GFX10-NEXT: v_max_f32_e32 v2, v2, v6 +; GFX10-NEXT: v_max_f32_e32 v3, v3, v7 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: v_maximum_v4f32__nnan: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_cmp_gt_f32_e32 vcc_lo, v0, v4 -; GFX11-NEXT: v_cndmask_b32_e32 v8, v4, v0, vcc_lo -; GFX11-NEXT: v_cmp_class_f32_e64 vcc_lo, v0, 64 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cndmask_b32_e32 v0, v8, v0, vcc_lo -; GFX11-NEXT: v_cmp_class_f32_e64 vcc_lo, v4, 64 -; 
GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc_lo -; GFX11-NEXT: v_cmp_gt_f32_e32 vcc_lo, v1, v5 -; GFX11-NEXT: v_cndmask_b32_e32 v4, v5, v1, vcc_lo -; GFX11-NEXT: v_cmp_gt_f32_e32 vcc_lo, v2, v6 -; GFX11-NEXT: v_cndmask_b32_e32 v9, v6, v2, vcc_lo -; GFX11-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v8 -; GFX11-NEXT: v_cndmask_b32_e32 v0, v8, v0, vcc_lo -; GFX11-NEXT: v_cmp_gt_f32_e32 vcc_lo, v3, v7 -; GFX11-NEXT: v_cndmask_b32_e32 v8, v7, v3, vcc_lo -; GFX11-NEXT: v_cmp_class_f32_e64 vcc_lo, v1, 64 -; GFX11-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc_lo -; GFX11-NEXT: v_cmp_class_f32_e64 vcc_lo, v2, 64 -; GFX11-NEXT: v_cndmask_b32_e32 v2, v9, v2, vcc_lo -; GFX11-NEXT: v_cmp_class_f32_e64 vcc_lo, v3, 64 -; GFX11-NEXT: v_cndmask_b32_e32 v3, v8, v3, vcc_lo -; GFX11-NEXT: v_cmp_class_f32_e64 vcc_lo, v5, 64 -; GFX11-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc_lo -; GFX11-NEXT: v_cmp_class_f32_e64 vcc_lo, v6, 64 -; GFX11-NEXT: v_cndmask_b32_e32 v2, v2, v6, vcc_lo -; GFX11-NEXT: v_cmp_class_f32_e64 vcc_lo, v7, 64 -; GFX11-NEXT: v_cndmask_b32_e32 v3, v3, v7, vcc_lo -; GFX11-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v4 -; GFX11-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc_lo -; GFX11-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v9 -; GFX11-NEXT: v_cndmask_b32_e32 v2, v9, v2, vcc_lo -; GFX11-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v8 -; GFX11-NEXT: v_cndmask_b32_e32 v3, v8, v3, vcc_lo +; GFX11-NEXT: v_dual_max_f32 v0, v0, v4 :: v_dual_max_f32 v1, v1, v5 +; GFX11-NEXT: v_dual_max_f32 v2, v2, v6 :: v_dual_max_f32 v3, v3, v7 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-LABEL: v_maximum_v4f32__nnan: @@ -2426,17 +1434,17 @@ define <4 x float> @v_maximum_v4f32__nsz(<4 x float> %src0, <4 x float> %src1) { ; GFX7-LABEL: v_maximum_v4f32__nsz: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_max_legacy_f32_e32 v8, v0, v4 +; GFX7-NEXT: v_max_f32_e32 v8, v0, v4 ; GFX7-NEXT: v_mov_b32_e32 v9, 0x7fc00000 ; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v0, v4 ; GFX7-NEXT: v_cndmask_b32_e32 v0, v9, v8, vcc -; 
GFX7-NEXT: v_max_legacy_f32_e32 v4, v1, v5 +; GFX7-NEXT: v_max_f32_e32 v4, v1, v5 ; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v1, v5 ; GFX7-NEXT: v_cndmask_b32_e32 v1, v9, v4, vcc -; GFX7-NEXT: v_max_legacy_f32_e32 v4, v2, v6 +; GFX7-NEXT: v_max_f32_e32 v4, v2, v6 ; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v2, v6 ; GFX7-NEXT: v_cndmask_b32_e32 v2, v9, v4, vcc -; GFX7-NEXT: v_max_legacy_f32_e32 v4, v3, v7 +; GFX7-NEXT: v_max_f32_e32 v4, v3, v7 ; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v3, v7 ; GFX7-NEXT: v_cndmask_b32_e32 v3, v9, v4, vcc ; GFX7-NEXT: s_setpc_b64 s[30:31] @@ -2444,21 +1452,17 @@ define <4 x float> @v_maximum_v4f32__nsz(<4 x float> %src0, <4 x float> %src1) { ; GFX8-LABEL: v_maximum_v4f32__nsz: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_cmp_gt_f32_e32 vcc, v0, v4 -; GFX8-NEXT: v_cndmask_b32_e32 v8, v4, v0, vcc +; GFX8-NEXT: v_max_f32_e32 v8, v0, v4 ; GFX8-NEXT: v_mov_b32_e32 v9, 0x7fc00000 ; GFX8-NEXT: v_cmp_o_f32_e32 vcc, v0, v4 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v9, v8, vcc -; GFX8-NEXT: v_cmp_gt_f32_e32 vcc, v1, v5 -; GFX8-NEXT: v_cndmask_b32_e32 v4, v5, v1, vcc +; GFX8-NEXT: v_max_f32_e32 v4, v1, v5 ; GFX8-NEXT: v_cmp_o_f32_e32 vcc, v1, v5 ; GFX8-NEXT: v_cndmask_b32_e32 v1, v9, v4, vcc -; GFX8-NEXT: v_cmp_gt_f32_e32 vcc, v2, v6 -; GFX8-NEXT: v_cndmask_b32_e32 v4, v6, v2, vcc +; GFX8-NEXT: v_max_f32_e32 v4, v2, v6 ; GFX8-NEXT: v_cmp_o_f32_e32 vcc, v2, v6 ; GFX8-NEXT: v_cndmask_b32_e32 v2, v9, v4, vcc -; GFX8-NEXT: v_cmp_gt_f32_e32 vcc, v3, v7 -; GFX8-NEXT: v_cndmask_b32_e32 v4, v7, v3, vcc +; GFX8-NEXT: v_max_f32_e32 v4, v3, v7 ; GFX8-NEXT: v_cmp_o_f32_e32 vcc, v3, v7 ; GFX8-NEXT: v_cndmask_b32_e32 v3, v9, v4, vcc ; GFX8-NEXT: s_setpc_b64 s[30:31] @@ -2466,21 +1470,17 @@ define <4 x float> @v_maximum_v4f32__nsz(<4 x float> %src0, <4 x float> %src1) { ; GFX9-LABEL: v_maximum_v4f32__nsz: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_cmp_gt_f32_e32 vcc, v0, v4 -; GFX9-NEXT: v_cndmask_b32_e32 v8, 
v4, v0, vcc +; GFX9-NEXT: v_max_f32_e32 v8, v0, v4 ; GFX9-NEXT: v_mov_b32_e32 v9, 0x7fc00000 ; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v4 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v9, v8, vcc -; GFX9-NEXT: v_cmp_gt_f32_e32 vcc, v1, v5 -; GFX9-NEXT: v_cndmask_b32_e32 v4, v5, v1, vcc +; GFX9-NEXT: v_max_f32_e32 v4, v1, v5 ; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v1, v5 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v9, v4, vcc -; GFX9-NEXT: v_cmp_gt_f32_e32 vcc, v2, v6 -; GFX9-NEXT: v_cndmask_b32_e32 v4, v6, v2, vcc +; GFX9-NEXT: v_max_f32_e32 v4, v2, v6 ; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v2, v6 ; GFX9-NEXT: v_cndmask_b32_e32 v2, v9, v4, vcc -; GFX9-NEXT: v_cmp_gt_f32_e32 vcc, v3, v7 -; GFX9-NEXT: v_cndmask_b32_e32 v4, v7, v3, vcc +; GFX9-NEXT: v_max_f32_e32 v4, v3, v7 ; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v3, v7 ; GFX9-NEXT: v_cndmask_b32_e32 v3, v9, v4, vcc ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -2488,28 +1488,20 @@ define <4 x float> @v_maximum_v4f32__nsz(<4 x float> %src0, <4 x float> %src1) { ; GFX940-LABEL: v_maximum_v4f32__nsz: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_cmp_gt_f32_e32 vcc, v0, v4 +; GFX940-NEXT: v_max_f32_e32 v8, v0, v4 ; GFX940-NEXT: v_mov_b32_e32 v9, 0x7fc00000 -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e32 v8, v4, v0, vcc ; GFX940-NEXT: v_cmp_o_f32_e32 vcc, v0, v4 -; GFX940-NEXT: s_nop 1 +; GFX940-NEXT: v_max_f32_e32 v4, v1, v5 +; GFX940-NEXT: s_nop 0 ; GFX940-NEXT: v_cndmask_b32_e32 v0, v9, v8, vcc -; GFX940-NEXT: v_cmp_gt_f32_e32 vcc, v1, v5 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v4, v5, v1, vcc ; GFX940-NEXT: v_cmp_o_f32_e32 vcc, v1, v5 ; GFX940-NEXT: s_nop 1 ; GFX940-NEXT: v_cndmask_b32_e32 v1, v9, v4, vcc -; GFX940-NEXT: v_cmp_gt_f32_e32 vcc, v2, v6 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v4, v6, v2, vcc +; GFX940-NEXT: v_max_f32_e32 v4, v2, v6 ; GFX940-NEXT: v_cmp_o_f32_e32 vcc, v2, v6 ; GFX940-NEXT: s_nop 1 ; GFX940-NEXT: v_cndmask_b32_e32 v2, v9, v4, vcc -; GFX940-NEXT: 
v_cmp_gt_f32_e32 vcc, v3, v7 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v4, v7, v3, vcc +; GFX940-NEXT: v_max_f32_e32 v4, v3, v7 ; GFX940-NEXT: v_cmp_o_f32_e32 vcc, v3, v7 ; GFX940-NEXT: s_nop 1 ; GFX940-NEXT: v_cndmask_b32_e32 v3, v9, v4, vcc @@ -2518,44 +1510,35 @@ define <4 x float> @v_maximum_v4f32__nsz(<4 x float> %src0, <4 x float> %src1) { ; GFX10-LABEL: v_maximum_v4f32__nsz: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_cmp_gt_f32_e32 vcc_lo, v0, v4 -; GFX10-NEXT: v_cndmask_b32_e32 v8, v4, v0, vcc_lo +; GFX10-NEXT: v_max_f32_e32 v8, v0, v4 ; GFX10-NEXT: v_cmp_o_f32_e32 vcc_lo, v0, v4 +; GFX10-NEXT: v_max_f32_e32 v9, v1, v5 +; GFX10-NEXT: v_max_f32_e32 v4, v2, v6 ; GFX10-NEXT: v_cndmask_b32_e32 v0, 0x7fc00000, v8, vcc_lo -; GFX10-NEXT: v_cmp_gt_f32_e32 vcc_lo, v1, v5 -; GFX10-NEXT: v_cndmask_b32_e32 v4, v5, v1, vcc_lo -; GFX10-NEXT: v_cmp_gt_f32_e32 vcc_lo, v2, v6 -; GFX10-NEXT: v_cndmask_b32_e32 v8, v6, v2, vcc_lo -; GFX10-NEXT: v_cmp_gt_f32_e32 vcc_lo, v3, v7 -; GFX10-NEXT: v_cndmask_b32_e32 v9, v7, v3, vcc_lo ; GFX10-NEXT: v_cmp_o_f32_e32 vcc_lo, v1, v5 -; GFX10-NEXT: v_cndmask_b32_e32 v1, 0x7fc00000, v4, vcc_lo +; GFX10-NEXT: v_max_f32_e32 v8, v3, v7 +; GFX10-NEXT: v_cndmask_b32_e32 v1, 0x7fc00000, v9, vcc_lo ; GFX10-NEXT: v_cmp_o_f32_e32 vcc_lo, v2, v6 -; GFX10-NEXT: v_cndmask_b32_e32 v2, 0x7fc00000, v8, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v2, 0x7fc00000, v4, vcc_lo ; GFX10-NEXT: v_cmp_o_f32_e32 vcc_lo, v3, v7 -; GFX10-NEXT: v_cndmask_b32_e32 v3, 0x7fc00000, v9, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v3, 0x7fc00000, v8, vcc_lo ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: v_maximum_v4f32__nsz: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_cmp_gt_f32_e32 vcc_lo, v0, v4 -; GFX11-NEXT: v_cndmask_b32_e32 v8, v4, v0, vcc_lo +; GFX11-NEXT: v_dual_max_f32 v8, v0, v4 :: v_dual_max_f32 v9, v1, v5 ; GFX11-NEXT: v_cmp_o_f32_e32 vcc_lo, v0, v4 -; 
GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_3) ; GFX11-NEXT: v_cndmask_b32_e32 v0, 0x7fc00000, v8, vcc_lo -; GFX11-NEXT: v_cmp_gt_f32_e32 vcc_lo, v1, v5 -; GFX11-NEXT: v_cndmask_b32_e32 v4, v5, v1, vcc_lo -; GFX11-NEXT: v_cmp_gt_f32_e32 vcc_lo, v2, v6 -; GFX11-NEXT: v_cndmask_b32_e32 v8, v6, v2, vcc_lo -; GFX11-NEXT: v_cmp_gt_f32_e32 vcc_lo, v3, v7 -; GFX11-NEXT: v_cndmask_b32_e32 v9, v7, v3, vcc_lo ; GFX11-NEXT: v_cmp_o_f32_e32 vcc_lo, v1, v5 -; GFX11-NEXT: v_cndmask_b32_e32 v1, 0x7fc00000, v4, vcc_lo +; GFX11-NEXT: v_max_f32_e32 v4, v2, v6 +; GFX11-NEXT: v_dual_max_f32 v8, v3, v7 :: v_dual_cndmask_b32 v1, 0x7fc00000, v9 ; GFX11-NEXT: v_cmp_o_f32_e32 vcc_lo, v2, v6 -; GFX11-NEXT: v_cndmask_b32_e32 v2, 0x7fc00000, v8, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e32 v2, 0x7fc00000, v4, vcc_lo ; GFX11-NEXT: v_cmp_o_f32_e32 vcc_lo, v3, v7 -; GFX11-NEXT: v_cndmask_b32_e32 v3, 0x7fc00000, v9, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-NEXT: v_cndmask_b32_e32 v3, 0x7fc00000, v8, vcc_lo ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-LABEL: v_maximum_v4f32__nsz: @@ -2578,79 +1561,53 @@ define <4 x float> @v_maximum_v4f32__nnan_nsz(<4 x float> %src0, <4 x float> %sr ; GFX7-LABEL: v_maximum_v4f32__nnan_nsz: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_max_legacy_f32_e32 v0, v0, v4 -; GFX7-NEXT: v_max_legacy_f32_e32 v1, v1, v5 -; GFX7-NEXT: v_max_legacy_f32_e32 v2, v2, v6 -; GFX7-NEXT: v_max_legacy_f32_e32 v3, v3, v7 +; GFX7-NEXT: v_max_f32_e32 v0, v0, v4 +; GFX7-NEXT: v_max_f32_e32 v1, v1, v5 +; GFX7-NEXT: v_max_f32_e32 v2, v2, v6 +; GFX7-NEXT: v_max_f32_e32 v3, v3, v7 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_maximum_v4f32__nnan_nsz: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_cmp_gt_f32_e32 vcc, v0, v4 -; GFX8-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc -; GFX8-NEXT: v_cmp_gt_f32_e32 vcc, v1, 
v5 -; GFX8-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc -; GFX8-NEXT: v_cmp_gt_f32_e32 vcc, v2, v6 -; GFX8-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc -; GFX8-NEXT: v_cmp_gt_f32_e32 vcc, v3, v7 -; GFX8-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc +; GFX8-NEXT: v_max_f32_e32 v0, v0, v4 +; GFX8-NEXT: v_max_f32_e32 v1, v1, v5 +; GFX8-NEXT: v_max_f32_e32 v2, v2, v6 +; GFX8-NEXT: v_max_f32_e32 v3, v3, v7 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_maximum_v4f32__nnan_nsz: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_cmp_gt_f32_e32 vcc, v0, v4 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc -; GFX9-NEXT: v_cmp_gt_f32_e32 vcc, v1, v5 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc -; GFX9-NEXT: v_cmp_gt_f32_e32 vcc, v2, v6 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc -; GFX9-NEXT: v_cmp_gt_f32_e32 vcc, v3, v7 -; GFX9-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc +; GFX9-NEXT: v_max_f32_e32 v0, v0, v4 +; GFX9-NEXT: v_max_f32_e32 v1, v1, v5 +; GFX9-NEXT: v_max_f32_e32 v2, v2, v6 +; GFX9-NEXT: v_max_f32_e32 v3, v3, v7 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: v_maximum_v4f32__nnan_nsz: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_cmp_gt_f32_e32 vcc, v0, v4 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc -; GFX940-NEXT: v_cmp_gt_f32_e32 vcc, v1, v5 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc -; GFX940-NEXT: v_cmp_gt_f32_e32 vcc, v2, v6 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc -; GFX940-NEXT: v_cmp_gt_f32_e32 vcc, v3, v7 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc +; GFX940-NEXT: v_max_f32_e32 v0, v0, v4 +; GFX940-NEXT: v_max_f32_e32 v1, v1, v5 +; GFX940-NEXT: v_max_f32_e32 v2, v2, v6 +; GFX940-NEXT: v_max_f32_e32 v3, v3, v7 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_maximum_v4f32__nnan_nsz: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) -; GFX10-NEXT: v_cmp_gt_f32_e32 vcc_lo, v0, v4 -; GFX10-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc_lo -; GFX10-NEXT: v_cmp_gt_f32_e32 vcc_lo, v1, v5 -; GFX10-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc_lo -; GFX10-NEXT: v_cmp_gt_f32_e32 vcc_lo, v2, v6 -; GFX10-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc_lo -; GFX10-NEXT: v_cmp_gt_f32_e32 vcc_lo, v3, v7 -; GFX10-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc_lo +; GFX10-NEXT: v_max_f32_e32 v0, v0, v4 +; GFX10-NEXT: v_max_f32_e32 v1, v1, v5 +; GFX10-NEXT: v_max_f32_e32 v2, v2, v6 +; GFX10-NEXT: v_max_f32_e32 v3, v3, v7 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: v_maximum_v4f32__nnan_nsz: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_cmp_gt_f32_e32 vcc_lo, v0, v4 -; GFX11-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc_lo -; GFX11-NEXT: v_cmp_gt_f32_e32 vcc_lo, v1, v5 -; GFX11-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc_lo -; GFX11-NEXT: v_cmp_gt_f32_e32 vcc_lo, v2, v6 -; GFX11-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc_lo -; GFX11-NEXT: v_cmp_gt_f32_e32 vcc_lo, v3, v7 -; GFX11-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc_lo +; GFX11-NEXT: v_dual_max_f32 v0, v0, v4 :: v_dual_max_f32 v1, v1, v5 +; GFX11-NEXT: v_dual_max_f32 v2, v2, v6 :: v_dual_max_f32 v3, v3, v7 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-LABEL: v_maximum_v4f32__nnan_nsz: @@ -2673,551 +1630,185 @@ define <8 x float> @v_maximum_v8f32(<8 x float> %src0, <8 x float> %src1) { ; GFX7-LABEL: v_maximum_v8f32: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_max_legacy_f32_e32 v16, v0, v8 +; GFX7-NEXT: v_max_f32_e32 v16, v0, v8 ; GFX7-NEXT: v_mov_b32_e32 v17, 0x7fc00000 ; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v0, v8 -; GFX7-NEXT: v_cndmask_b32_e32 v16, v17, v16, vcc -; GFX7-NEXT: v_cmp_class_f32_e64 vcc, v0, 64 -; GFX7-NEXT: v_cndmask_b32_e32 v0, v16, v0, vcc -; GFX7-NEXT: v_cmp_class_f32_e64 vcc, v8, 64 -; GFX7-NEXT: v_cndmask_b32_e32 v0, v0, v8, vcc -; GFX7-NEXT: v_cmp_eq_f32_e32 vcc, 0, v16 -; 
GFX7-NEXT: v_cndmask_b32_e32 v0, v16, v0, vcc -; GFX7-NEXT: v_max_legacy_f32_e32 v8, v1, v9 +; GFX7-NEXT: v_cndmask_b32_e32 v0, v17, v16, vcc +; GFX7-NEXT: v_max_f32_e32 v8, v1, v9 ; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v1, v9 -; GFX7-NEXT: v_cndmask_b32_e32 v8, v17, v8, vcc -; GFX7-NEXT: v_cmp_class_f32_e64 vcc, v1, 64 -; GFX7-NEXT: v_cndmask_b32_e32 v1, v8, v1, vcc -; GFX7-NEXT: v_cmp_class_f32_e64 vcc, v9, 64 -; GFX7-NEXT: v_cndmask_b32_e32 v1, v1, v9, vcc -; GFX7-NEXT: v_cmp_eq_f32_e32 vcc, 0, v8 -; GFX7-NEXT: v_cndmask_b32_e32 v1, v8, v1, vcc -; GFX7-NEXT: v_max_legacy_f32_e32 v8, v2, v10 +; GFX7-NEXT: v_cndmask_b32_e32 v1, v17, v8, vcc +; GFX7-NEXT: v_max_f32_e32 v8, v2, v10 ; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v2, v10 -; GFX7-NEXT: v_cndmask_b32_e32 v8, v17, v8, vcc -; GFX7-NEXT: v_cmp_class_f32_e64 vcc, v2, 64 -; GFX7-NEXT: v_cndmask_b32_e32 v2, v8, v2, vcc -; GFX7-NEXT: v_cmp_class_f32_e64 vcc, v10, 64 -; GFX7-NEXT: v_cndmask_b32_e32 v2, v2, v10, vcc -; GFX7-NEXT: v_cmp_eq_f32_e32 vcc, 0, v8 -; GFX7-NEXT: v_cndmask_b32_e32 v2, v8, v2, vcc -; GFX7-NEXT: v_max_legacy_f32_e32 v8, v3, v11 +; GFX7-NEXT: v_cndmask_b32_e32 v2, v17, v8, vcc +; GFX7-NEXT: v_max_f32_e32 v8, v3, v11 ; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v3, v11 -; GFX7-NEXT: v_cndmask_b32_e32 v8, v17, v8, vcc -; GFX7-NEXT: v_cmp_class_f32_e64 vcc, v3, 64 -; GFX7-NEXT: v_cndmask_b32_e32 v3, v8, v3, vcc -; GFX7-NEXT: v_cmp_class_f32_e64 vcc, v11, 64 -; GFX7-NEXT: v_cndmask_b32_e32 v3, v3, v11, vcc -; GFX7-NEXT: v_cmp_eq_f32_e32 vcc, 0, v8 -; GFX7-NEXT: v_cndmask_b32_e32 v3, v8, v3, vcc -; GFX7-NEXT: v_max_legacy_f32_e32 v8, v4, v12 +; GFX7-NEXT: v_cndmask_b32_e32 v3, v17, v8, vcc +; GFX7-NEXT: v_max_f32_e32 v8, v4, v12 ; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v4, v12 -; GFX7-NEXT: v_cndmask_b32_e32 v8, v17, v8, vcc -; GFX7-NEXT: v_cmp_class_f32_e64 vcc, v4, 64 -; GFX7-NEXT: v_cndmask_b32_e32 v4, v8, v4, vcc -; GFX7-NEXT: v_cmp_class_f32_e64 vcc, v12, 64 -; GFX7-NEXT: v_cndmask_b32_e32 v4, v4, v12, vcc -; GFX7-NEXT: 
v_cmp_eq_f32_e32 vcc, 0, v8 -; GFX7-NEXT: v_cndmask_b32_e32 v4, v8, v4, vcc -; GFX7-NEXT: v_max_legacy_f32_e32 v8, v5, v13 +; GFX7-NEXT: v_cndmask_b32_e32 v4, v17, v8, vcc +; GFX7-NEXT: v_max_f32_e32 v8, v5, v13 ; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v5, v13 -; GFX7-NEXT: v_cndmask_b32_e32 v8, v17, v8, vcc -; GFX7-NEXT: v_cmp_class_f32_e64 vcc, v5, 64 -; GFX7-NEXT: v_cndmask_b32_e32 v5, v8, v5, vcc -; GFX7-NEXT: v_cmp_class_f32_e64 vcc, v13, 64 -; GFX7-NEXT: v_cndmask_b32_e32 v5, v5, v13, vcc -; GFX7-NEXT: v_cmp_eq_f32_e32 vcc, 0, v8 -; GFX7-NEXT: v_cndmask_b32_e32 v5, v8, v5, vcc -; GFX7-NEXT: v_max_legacy_f32_e32 v8, v6, v14 +; GFX7-NEXT: v_cndmask_b32_e32 v5, v17, v8, vcc +; GFX7-NEXT: v_max_f32_e32 v8, v6, v14 ; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v6, v14 -; GFX7-NEXT: v_cndmask_b32_e32 v8, v17, v8, vcc -; GFX7-NEXT: v_cmp_class_f32_e64 vcc, v6, 64 -; GFX7-NEXT: v_cndmask_b32_e32 v6, v8, v6, vcc -; GFX7-NEXT: v_cmp_class_f32_e64 vcc, v14, 64 -; GFX7-NEXT: v_cndmask_b32_e32 v6, v6, v14, vcc -; GFX7-NEXT: v_cmp_eq_f32_e32 vcc, 0, v8 -; GFX7-NEXT: v_cndmask_b32_e32 v6, v8, v6, vcc -; GFX7-NEXT: v_max_legacy_f32_e32 v8, v7, v15 +; GFX7-NEXT: v_cndmask_b32_e32 v6, v17, v8, vcc +; GFX7-NEXT: v_max_f32_e32 v8, v7, v15 ; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v7, v15 -; GFX7-NEXT: v_cndmask_b32_e32 v8, v17, v8, vcc -; GFX7-NEXT: v_cmp_class_f32_e64 vcc, v7, 64 -; GFX7-NEXT: v_cndmask_b32_e32 v7, v8, v7, vcc -; GFX7-NEXT: v_cmp_class_f32_e64 vcc, v15, 64 -; GFX7-NEXT: v_cndmask_b32_e32 v7, v7, v15, vcc -; GFX7-NEXT: v_cmp_eq_f32_e32 vcc, 0, v8 -; GFX7-NEXT: v_cndmask_b32_e32 v7, v8, v7, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v7, v17, v8, vcc ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_maximum_v8f32: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_cmp_gt_f32_e32 vcc, v0, v8 -; GFX8-NEXT: v_cndmask_b32_e32 v16, v8, v0, vcc +; GFX8-NEXT: v_max_f32_e32 v16, v0, v8 ; GFX8-NEXT: v_mov_b32_e32 v17, 0x7fc00000 ; GFX8-NEXT: v_cmp_o_f32_e32 vcc, v0, v8 
-; GFX8-NEXT: v_cndmask_b32_e32 v16, v17, v16, vcc -; GFX8-NEXT: v_cmp_class_f32_e64 vcc, v0, 64 -; GFX8-NEXT: v_cndmask_b32_e32 v0, v16, v0, vcc -; GFX8-NEXT: v_cmp_class_f32_e64 vcc, v8, 64 -; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v8, vcc -; GFX8-NEXT: v_cmp_eq_f32_e32 vcc, 0, v16 -; GFX8-NEXT: v_cndmask_b32_e32 v0, v16, v0, vcc -; GFX8-NEXT: v_cmp_gt_f32_e32 vcc, v1, v9 -; GFX8-NEXT: v_cndmask_b32_e32 v8, v9, v1, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v0, v17, v16, vcc +; GFX8-NEXT: v_max_f32_e32 v8, v1, v9 ; GFX8-NEXT: v_cmp_o_f32_e32 vcc, v1, v9 -; GFX8-NEXT: v_cndmask_b32_e32 v8, v17, v8, vcc -; GFX8-NEXT: v_cmp_class_f32_e64 vcc, v1, 64 -; GFX8-NEXT: v_cndmask_b32_e32 v1, v8, v1, vcc -; GFX8-NEXT: v_cmp_class_f32_e64 vcc, v9, 64 -; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v9, vcc -; GFX8-NEXT: v_cmp_eq_f32_e32 vcc, 0, v8 -; GFX8-NEXT: v_cndmask_b32_e32 v1, v8, v1, vcc -; GFX8-NEXT: v_cmp_gt_f32_e32 vcc, v2, v10 -; GFX8-NEXT: v_cndmask_b32_e32 v8, v10, v2, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v1, v17, v8, vcc +; GFX8-NEXT: v_max_f32_e32 v8, v2, v10 ; GFX8-NEXT: v_cmp_o_f32_e32 vcc, v2, v10 -; GFX8-NEXT: v_cndmask_b32_e32 v8, v17, v8, vcc -; GFX8-NEXT: v_cmp_class_f32_e64 vcc, v2, 64 -; GFX8-NEXT: v_cndmask_b32_e32 v2, v8, v2, vcc -; GFX8-NEXT: v_cmp_class_f32_e64 vcc, v10, 64 -; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v10, vcc -; GFX8-NEXT: v_cmp_eq_f32_e32 vcc, 0, v8 -; GFX8-NEXT: v_cndmask_b32_e32 v2, v8, v2, vcc -; GFX8-NEXT: v_cmp_gt_f32_e32 vcc, v3, v11 -; GFX8-NEXT: v_cndmask_b32_e32 v8, v11, v3, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v2, v17, v8, vcc +; GFX8-NEXT: v_max_f32_e32 v8, v3, v11 ; GFX8-NEXT: v_cmp_o_f32_e32 vcc, v3, v11 -; GFX8-NEXT: v_cndmask_b32_e32 v8, v17, v8, vcc -; GFX8-NEXT: v_cmp_class_f32_e64 vcc, v3, 64 -; GFX8-NEXT: v_cndmask_b32_e32 v3, v8, v3, vcc -; GFX8-NEXT: v_cmp_class_f32_e64 vcc, v11, 64 -; GFX8-NEXT: v_cndmask_b32_e32 v3, v3, v11, vcc -; GFX8-NEXT: v_cmp_eq_f32_e32 vcc, 0, v8 -; GFX8-NEXT: v_cndmask_b32_e32 v3, v8, v3, vcc -; GFX8-NEXT: 
v_cmp_gt_f32_e32 vcc, v4, v12 -; GFX8-NEXT: v_cndmask_b32_e32 v8, v12, v4, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v3, v17, v8, vcc +; GFX8-NEXT: v_max_f32_e32 v8, v4, v12 ; GFX8-NEXT: v_cmp_o_f32_e32 vcc, v4, v12 -; GFX8-NEXT: v_cndmask_b32_e32 v8, v17, v8, vcc -; GFX8-NEXT: v_cmp_class_f32_e64 vcc, v4, 64 -; GFX8-NEXT: v_cndmask_b32_e32 v4, v8, v4, vcc -; GFX8-NEXT: v_cmp_class_f32_e64 vcc, v12, 64 -; GFX8-NEXT: v_cndmask_b32_e32 v4, v4, v12, vcc -; GFX8-NEXT: v_cmp_eq_f32_e32 vcc, 0, v8 -; GFX8-NEXT: v_cndmask_b32_e32 v4, v8, v4, vcc -; GFX8-NEXT: v_cmp_gt_f32_e32 vcc, v5, v13 -; GFX8-NEXT: v_cndmask_b32_e32 v8, v13, v5, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v4, v17, v8, vcc +; GFX8-NEXT: v_max_f32_e32 v8, v5, v13 ; GFX8-NEXT: v_cmp_o_f32_e32 vcc, v5, v13 -; GFX8-NEXT: v_cndmask_b32_e32 v8, v17, v8, vcc -; GFX8-NEXT: v_cmp_class_f32_e64 vcc, v5, 64 -; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v5, vcc -; GFX8-NEXT: v_cmp_class_f32_e64 vcc, v13, 64 -; GFX8-NEXT: v_cndmask_b32_e32 v5, v5, v13, vcc -; GFX8-NEXT: v_cmp_eq_f32_e32 vcc, 0, v8 -; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v5, vcc -; GFX8-NEXT: v_cmp_gt_f32_e32 vcc, v6, v14 -; GFX8-NEXT: v_cndmask_b32_e32 v8, v14, v6, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v5, v17, v8, vcc +; GFX8-NEXT: v_max_f32_e32 v8, v6, v14 ; GFX8-NEXT: v_cmp_o_f32_e32 vcc, v6, v14 -; GFX8-NEXT: v_cndmask_b32_e32 v8, v17, v8, vcc -; GFX8-NEXT: v_cmp_class_f32_e64 vcc, v6, 64 -; GFX8-NEXT: v_cndmask_b32_e32 v6, v8, v6, vcc -; GFX8-NEXT: v_cmp_class_f32_e64 vcc, v14, 64 -; GFX8-NEXT: v_cndmask_b32_e32 v6, v6, v14, vcc -; GFX8-NEXT: v_cmp_eq_f32_e32 vcc, 0, v8 -; GFX8-NEXT: v_cndmask_b32_e32 v6, v8, v6, vcc -; GFX8-NEXT: v_cmp_gt_f32_e32 vcc, v7, v15 -; GFX8-NEXT: v_cndmask_b32_e32 v8, v15, v7, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v6, v17, v8, vcc +; GFX8-NEXT: v_max_f32_e32 v8, v7, v15 ; GFX8-NEXT: v_cmp_o_f32_e32 vcc, v7, v15 -; GFX8-NEXT: v_cndmask_b32_e32 v8, v17, v8, vcc -; GFX8-NEXT: v_cmp_class_f32_e64 vcc, v7, 64 -; GFX8-NEXT: v_cndmask_b32_e32 v7, 
v8, v7, vcc -; GFX8-NEXT: v_cmp_class_f32_e64 vcc, v15, 64 -; GFX8-NEXT: v_cndmask_b32_e32 v7, v7, v15, vcc -; GFX8-NEXT: v_cmp_eq_f32_e32 vcc, 0, v8 -; GFX8-NEXT: v_cndmask_b32_e32 v7, v8, v7, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v7, v17, v8, vcc ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_maximum_v8f32: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_cmp_gt_f32_e32 vcc, v0, v8 -; GFX9-NEXT: v_cndmask_b32_e32 v16, v8, v0, vcc +; GFX9-NEXT: v_max_f32_e32 v16, v0, v8 ; GFX9-NEXT: v_mov_b32_e32 v17, 0x7fc00000 ; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v8 -; GFX9-NEXT: v_cndmask_b32_e32 v16, v17, v16, vcc -; GFX9-NEXT: v_cmp_class_f32_e64 vcc, v0, 64 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v16, v0, vcc -; GFX9-NEXT: v_cmp_class_f32_e64 vcc, v8, 64 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v8, vcc -; GFX9-NEXT: v_cmp_eq_f32_e32 vcc, 0, v16 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v16, v0, vcc -; GFX9-NEXT: v_cmp_gt_f32_e32 vcc, v1, v9 -; GFX9-NEXT: v_cndmask_b32_e32 v8, v9, v1, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v0, v17, v16, vcc +; GFX9-NEXT: v_max_f32_e32 v8, v1, v9 ; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v1, v9 -; GFX9-NEXT: v_cndmask_b32_e32 v8, v17, v8, vcc -; GFX9-NEXT: v_cmp_class_f32_e64 vcc, v1, 64 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v8, v1, vcc -; GFX9-NEXT: v_cmp_class_f32_e64 vcc, v9, 64 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v9, vcc -; GFX9-NEXT: v_cmp_eq_f32_e32 vcc, 0, v8 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v8, v1, vcc -; GFX9-NEXT: v_cmp_gt_f32_e32 vcc, v2, v10 -; GFX9-NEXT: v_cndmask_b32_e32 v8, v10, v2, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v1, v17, v8, vcc +; GFX9-NEXT: v_max_f32_e32 v8, v2, v10 ; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v2, v10 -; GFX9-NEXT: v_cndmask_b32_e32 v8, v17, v8, vcc -; GFX9-NEXT: v_cmp_class_f32_e64 vcc, v2, 64 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v8, v2, vcc -; GFX9-NEXT: v_cmp_class_f32_e64 vcc, v10, 64 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v10, vcc -; GFX9-NEXT: v_cmp_eq_f32_e32 vcc, 0, v8 -; GFX9-NEXT: 
v_cndmask_b32_e32 v2, v8, v2, vcc -; GFX9-NEXT: v_cmp_gt_f32_e32 vcc, v3, v11 -; GFX9-NEXT: v_cndmask_b32_e32 v8, v11, v3, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v2, v17, v8, vcc +; GFX9-NEXT: v_max_f32_e32 v8, v3, v11 ; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v3, v11 -; GFX9-NEXT: v_cndmask_b32_e32 v8, v17, v8, vcc -; GFX9-NEXT: v_cmp_class_f32_e64 vcc, v3, 64 -; GFX9-NEXT: v_cndmask_b32_e32 v3, v8, v3, vcc -; GFX9-NEXT: v_cmp_class_f32_e64 vcc, v11, 64 -; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v11, vcc -; GFX9-NEXT: v_cmp_eq_f32_e32 vcc, 0, v8 -; GFX9-NEXT: v_cndmask_b32_e32 v3, v8, v3, vcc -; GFX9-NEXT: v_cmp_gt_f32_e32 vcc, v4, v12 -; GFX9-NEXT: v_cndmask_b32_e32 v8, v12, v4, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v3, v17, v8, vcc +; GFX9-NEXT: v_max_f32_e32 v8, v4, v12 ; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v4, v12 -; GFX9-NEXT: v_cndmask_b32_e32 v8, v17, v8, vcc -; GFX9-NEXT: v_cmp_class_f32_e64 vcc, v4, 64 -; GFX9-NEXT: v_cndmask_b32_e32 v4, v8, v4, vcc -; GFX9-NEXT: v_cmp_class_f32_e64 vcc, v12, 64 -; GFX9-NEXT: v_cndmask_b32_e32 v4, v4, v12, vcc -; GFX9-NEXT: v_cmp_eq_f32_e32 vcc, 0, v8 -; GFX9-NEXT: v_cndmask_b32_e32 v4, v8, v4, vcc -; GFX9-NEXT: v_cmp_gt_f32_e32 vcc, v5, v13 -; GFX9-NEXT: v_cndmask_b32_e32 v8, v13, v5, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v4, v17, v8, vcc +; GFX9-NEXT: v_max_f32_e32 v8, v5, v13 ; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v5, v13 -; GFX9-NEXT: v_cndmask_b32_e32 v8, v17, v8, vcc -; GFX9-NEXT: v_cmp_class_f32_e64 vcc, v5, 64 -; GFX9-NEXT: v_cndmask_b32_e32 v5, v8, v5, vcc -; GFX9-NEXT: v_cmp_class_f32_e64 vcc, v13, 64 -; GFX9-NEXT: v_cndmask_b32_e32 v5, v5, v13, vcc -; GFX9-NEXT: v_cmp_eq_f32_e32 vcc, 0, v8 -; GFX9-NEXT: v_cndmask_b32_e32 v5, v8, v5, vcc -; GFX9-NEXT: v_cmp_gt_f32_e32 vcc, v6, v14 -; GFX9-NEXT: v_cndmask_b32_e32 v8, v14, v6, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v5, v17, v8, vcc +; GFX9-NEXT: v_max_f32_e32 v8, v6, v14 ; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v6, v14 -; GFX9-NEXT: v_cndmask_b32_e32 v8, v17, v8, vcc -; GFX9-NEXT: v_cmp_class_f32_e64 
vcc, v6, 64 -; GFX9-NEXT: v_cndmask_b32_e32 v6, v8, v6, vcc -; GFX9-NEXT: v_cmp_class_f32_e64 vcc, v14, 64 -; GFX9-NEXT: v_cndmask_b32_e32 v6, v6, v14, vcc -; GFX9-NEXT: v_cmp_eq_f32_e32 vcc, 0, v8 -; GFX9-NEXT: v_cndmask_b32_e32 v6, v8, v6, vcc -; GFX9-NEXT: v_cmp_gt_f32_e32 vcc, v7, v15 -; GFX9-NEXT: v_cndmask_b32_e32 v8, v15, v7, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v6, v17, v8, vcc +; GFX9-NEXT: v_max_f32_e32 v8, v7, v15 ; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v7, v15 -; GFX9-NEXT: v_cndmask_b32_e32 v8, v17, v8, vcc -; GFX9-NEXT: v_cmp_class_f32_e64 vcc, v7, 64 -; GFX9-NEXT: v_cndmask_b32_e32 v7, v8, v7, vcc -; GFX9-NEXT: v_cmp_class_f32_e64 vcc, v15, 64 -; GFX9-NEXT: v_cndmask_b32_e32 v7, v7, v15, vcc -; GFX9-NEXT: v_cmp_eq_f32_e32 vcc, 0, v8 -; GFX9-NEXT: v_cndmask_b32_e32 v7, v8, v7, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v7, v17, v8, vcc ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: v_maximum_v8f32: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_cmp_gt_f32_e32 vcc, v0, v8 +; GFX940-NEXT: v_max_f32_e32 v16, v0, v8 ; GFX940-NEXT: v_mov_b32_e32 v17, 0x7fc00000 -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e32 v16, v8, v0, vcc ; GFX940-NEXT: v_cmp_o_f32_e32 vcc, v0, v8 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v16, v17, v16, vcc -; GFX940-NEXT: v_cmp_class_f32_e64 vcc, v0, 64 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v0, v16, v0, vcc -; GFX940-NEXT: v_cmp_class_f32_e64 vcc, v8, 64 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v0, v0, v8, vcc -; GFX940-NEXT: v_cmp_eq_f32_e32 vcc, 0, v16 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v0, v16, v0, vcc -; GFX940-NEXT: v_cmp_gt_f32_e32 vcc, v1, v9 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v8, v9, v1, vcc +; GFX940-NEXT: v_max_f32_e32 v8, v1, v9 +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_cndmask_b32_e32 v0, v17, v16, vcc ; GFX940-NEXT: v_cmp_o_f32_e32 vcc, v1, v9 ; GFX940-NEXT: s_nop 1 -; 
GFX940-NEXT: v_cndmask_b32_e32 v8, v17, v8, vcc -; GFX940-NEXT: v_cmp_class_f32_e64 vcc, v1, 64 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v1, v8, v1, vcc -; GFX940-NEXT: v_cmp_class_f32_e64 vcc, v9, 64 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v1, v1, v9, vcc -; GFX940-NEXT: v_cmp_eq_f32_e32 vcc, 0, v8 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v1, v8, v1, vcc -; GFX940-NEXT: v_cmp_gt_f32_e32 vcc, v2, v10 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v8, v10, v2, vcc +; GFX940-NEXT: v_cndmask_b32_e32 v1, v17, v8, vcc +; GFX940-NEXT: v_max_f32_e32 v8, v2, v10 ; GFX940-NEXT: v_cmp_o_f32_e32 vcc, v2, v10 ; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v8, v17, v8, vcc -; GFX940-NEXT: v_cmp_class_f32_e64 vcc, v2, 64 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v2, v8, v2, vcc -; GFX940-NEXT: v_cmp_class_f32_e64 vcc, v10, 64 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v2, v2, v10, vcc -; GFX940-NEXT: v_cmp_eq_f32_e32 vcc, 0, v8 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v2, v8, v2, vcc -; GFX940-NEXT: v_cmp_gt_f32_e32 vcc, v3, v11 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v8, v11, v3, vcc +; GFX940-NEXT: v_cndmask_b32_e32 v2, v17, v8, vcc +; GFX940-NEXT: v_max_f32_e32 v8, v3, v11 ; GFX940-NEXT: v_cmp_o_f32_e32 vcc, v3, v11 ; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v8, v17, v8, vcc -; GFX940-NEXT: v_cmp_class_f32_e64 vcc, v3, 64 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v3, v8, v3, vcc -; GFX940-NEXT: v_cmp_class_f32_e64 vcc, v11, 64 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v3, v3, v11, vcc -; GFX940-NEXT: v_cmp_eq_f32_e32 vcc, 0, v8 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v3, v8, v3, vcc -; GFX940-NEXT: v_cmp_gt_f32_e32 vcc, v4, v12 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v8, v12, v4, vcc +; GFX940-NEXT: v_cndmask_b32_e32 v3, v17, v8, vcc +; GFX940-NEXT: 
v_max_f32_e32 v8, v4, v12 ; GFX940-NEXT: v_cmp_o_f32_e32 vcc, v4, v12 ; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v8, v17, v8, vcc -; GFX940-NEXT: v_cmp_class_f32_e64 vcc, v4, 64 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v4, v8, v4, vcc -; GFX940-NEXT: v_cmp_class_f32_e64 vcc, v12, 64 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v4, v4, v12, vcc -; GFX940-NEXT: v_cmp_eq_f32_e32 vcc, 0, v8 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v4, v8, v4, vcc -; GFX940-NEXT: v_cmp_gt_f32_e32 vcc, v5, v13 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v8, v13, v5, vcc +; GFX940-NEXT: v_cndmask_b32_e32 v4, v17, v8, vcc +; GFX940-NEXT: v_max_f32_e32 v8, v5, v13 ; GFX940-NEXT: v_cmp_o_f32_e32 vcc, v5, v13 ; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v8, v17, v8, vcc -; GFX940-NEXT: v_cmp_class_f32_e64 vcc, v5, 64 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v5, v8, v5, vcc -; GFX940-NEXT: v_cmp_class_f32_e64 vcc, v13, 64 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v5, v5, v13, vcc -; GFX940-NEXT: v_cmp_eq_f32_e32 vcc, 0, v8 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v5, v8, v5, vcc -; GFX940-NEXT: v_cmp_gt_f32_e32 vcc, v6, v14 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v8, v14, v6, vcc +; GFX940-NEXT: v_cndmask_b32_e32 v5, v17, v8, vcc +; GFX940-NEXT: v_max_f32_e32 v8, v6, v14 ; GFX940-NEXT: v_cmp_o_f32_e32 vcc, v6, v14 ; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v8, v17, v8, vcc -; GFX940-NEXT: v_cmp_class_f32_e64 vcc, v6, 64 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v6, v8, v6, vcc -; GFX940-NEXT: v_cmp_class_f32_e64 vcc, v14, 64 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v6, v6, v14, vcc -; GFX940-NEXT: v_cmp_eq_f32_e32 vcc, 0, v8 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v6, v8, v6, vcc -; GFX940-NEXT: v_cmp_gt_f32_e32 vcc, v7, v15 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: 
v_cndmask_b32_e32 v8, v15, v7, vcc +; GFX940-NEXT: v_cndmask_b32_e32 v6, v17, v8, vcc +; GFX940-NEXT: v_max_f32_e32 v8, v7, v15 ; GFX940-NEXT: v_cmp_o_f32_e32 vcc, v7, v15 ; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v8, v17, v8, vcc -; GFX940-NEXT: v_cmp_class_f32_e64 vcc, v7, 64 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v7, v8, v7, vcc -; GFX940-NEXT: v_cmp_class_f32_e64 vcc, v15, 64 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v7, v7, v15, vcc -; GFX940-NEXT: v_cmp_eq_f32_e32 vcc, 0, v8 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v7, v8, v7, vcc +; GFX940-NEXT: v_cndmask_b32_e32 v7, v17, v8, vcc ; GFX940-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_maximum_v8f32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_cmp_gt_f32_e32 vcc_lo, v0, v8 -; GFX10-NEXT: v_cndmask_b32_e32 v16, v8, v0, vcc_lo -; GFX10-NEXT: v_cmp_gt_f32_e32 vcc_lo, v1, v9 -; GFX10-NEXT: v_cndmask_b32_e32 v17, v9, v1, vcc_lo +; GFX10-NEXT: v_max_f32_e32 v16, v0, v8 ; GFX10-NEXT: v_cmp_o_f32_e32 vcc_lo, v0, v8 -; GFX10-NEXT: v_cndmask_b32_e32 v16, 0x7fc00000, v16, vcc_lo +; GFX10-NEXT: v_max_f32_e32 v17, v1, v9 +; GFX10-NEXT: v_max_f32_e32 v8, v2, v10 +; GFX10-NEXT: v_cndmask_b32_e32 v0, 0x7fc00000, v16, vcc_lo ; GFX10-NEXT: v_cmp_o_f32_e32 vcc_lo, v1, v9 -; GFX10-NEXT: v_cndmask_b32_e32 v17, 0x7fc00000, v17, vcc_lo -; GFX10-NEXT: v_cmp_class_f32_e64 vcc_lo, v0, 64 -; GFX10-NEXT: v_cndmask_b32_e32 v0, v16, v0, vcc_lo -; GFX10-NEXT: v_cmp_class_f32_e64 vcc_lo, v1, 64 -; GFX10-NEXT: v_cndmask_b32_e32 v1, v17, v1, vcc_lo -; GFX10-NEXT: v_cmp_class_f32_e64 vcc_lo, v8, 64 -; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v8, vcc_lo -; GFX10-NEXT: v_cmp_class_f32_e64 vcc_lo, v9, 64 -; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v9, vcc_lo -; GFX10-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v16 -; GFX10-NEXT: v_cndmask_b32_e32 v0, v16, v0, vcc_lo -; GFX10-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v17 -; GFX10-NEXT: v_cndmask_b32_e32 v1, v17, 
v1, vcc_lo -; GFX10-NEXT: v_cmp_gt_f32_e32 vcc_lo, v2, v10 -; GFX10-NEXT: v_cndmask_b32_e32 v8, v10, v2, vcc_lo -; GFX10-NEXT: v_cmp_gt_f32_e32 vcc_lo, v3, v11 -; GFX10-NEXT: v_cndmask_b32_e32 v9, v11, v3, vcc_lo -; GFX10-NEXT: v_cmp_gt_f32_e32 vcc_lo, v4, v12 -; GFX10-NEXT: v_cndmask_b32_e32 v16, v12, v4, vcc_lo +; GFX10-NEXT: v_max_f32_e32 v9, v3, v11 +; GFX10-NEXT: v_cndmask_b32_e32 v1, 0x7fc00000, v17, vcc_lo ; GFX10-NEXT: v_cmp_o_f32_e32 vcc_lo, v2, v10 -; GFX10-NEXT: v_cndmask_b32_e32 v8, 0x7fc00000, v8, vcc_lo +; GFX10-NEXT: v_max_f32_e32 v10, v7, v15 +; GFX10-NEXT: v_cndmask_b32_e32 v2, 0x7fc00000, v8, vcc_lo ; GFX10-NEXT: v_cmp_o_f32_e32 vcc_lo, v3, v11 -; GFX10-NEXT: v_cndmask_b32_e32 v9, 0x7fc00000, v9, vcc_lo +; GFX10-NEXT: v_max_f32_e32 v8, v4, v12 +; GFX10-NEXT: v_cndmask_b32_e32 v3, 0x7fc00000, v9, vcc_lo ; GFX10-NEXT: v_cmp_o_f32_e32 vcc_lo, v4, v12 -; GFX10-NEXT: v_cndmask_b32_e32 v16, 0x7fc00000, v16, vcc_lo -; GFX10-NEXT: v_cmp_class_f32_e64 vcc_lo, v2, 64 -; GFX10-NEXT: v_cndmask_b32_e32 v2, v8, v2, vcc_lo -; GFX10-NEXT: v_cmp_class_f32_e64 vcc_lo, v3, 64 -; GFX10-NEXT: v_cndmask_b32_e32 v3, v9, v3, vcc_lo -; GFX10-NEXT: v_cmp_class_f32_e64 vcc_lo, v4, 64 -; GFX10-NEXT: v_cndmask_b32_e32 v4, v16, v4, vcc_lo -; GFX10-NEXT: v_cmp_class_f32_e64 vcc_lo, v10, 64 -; GFX10-NEXT: v_cndmask_b32_e32 v2, v2, v10, vcc_lo -; GFX10-NEXT: v_cmp_class_f32_e64 vcc_lo, v11, 64 -; GFX10-NEXT: v_cndmask_b32_e32 v3, v3, v11, vcc_lo -; GFX10-NEXT: v_cmp_class_f32_e64 vcc_lo, v12, 64 -; GFX10-NEXT: v_cndmask_b32_e32 v4, v4, v12, vcc_lo -; GFX10-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v8 -; GFX10-NEXT: v_cndmask_b32_e32 v2, v8, v2, vcc_lo -; GFX10-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v9 -; GFX10-NEXT: v_cndmask_b32_e32 v3, v9, v3, vcc_lo -; GFX10-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v16 -; GFX10-NEXT: v_cndmask_b32_e32 v4, v16, v4, vcc_lo -; GFX10-NEXT: v_cmp_gt_f32_e32 vcc_lo, v5, v13 -; GFX10-NEXT: v_cndmask_b32_e32 v8, v13, v5, vcc_lo -; GFX10-NEXT: v_cmp_gt_f32_e32 vcc_lo, v6, 
v14 -; GFX10-NEXT: v_cndmask_b32_e32 v9, v14, v6, vcc_lo -; GFX10-NEXT: v_cmp_gt_f32_e32 vcc_lo, v7, v15 -; GFX10-NEXT: v_cndmask_b32_e32 v10, v15, v7, vcc_lo +; GFX10-NEXT: v_max_f32_e32 v9, v5, v13 +; GFX10-NEXT: v_cndmask_b32_e32 v4, 0x7fc00000, v8, vcc_lo ; GFX10-NEXT: v_cmp_o_f32_e32 vcc_lo, v5, v13 -; GFX10-NEXT: v_cndmask_b32_e32 v8, 0x7fc00000, v8, vcc_lo +; GFX10-NEXT: v_max_f32_e32 v8, v6, v14 +; GFX10-NEXT: v_cndmask_b32_e32 v5, 0x7fc00000, v9, vcc_lo ; GFX10-NEXT: v_cmp_o_f32_e32 vcc_lo, v6, v14 -; GFX10-NEXT: v_cndmask_b32_e32 v9, 0x7fc00000, v9, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v6, 0x7fc00000, v8, vcc_lo ; GFX10-NEXT: v_cmp_o_f32_e32 vcc_lo, v7, v15 -; GFX10-NEXT: v_cndmask_b32_e32 v10, 0x7fc00000, v10, vcc_lo -; GFX10-NEXT: v_cmp_class_f32_e64 vcc_lo, v5, 64 -; GFX10-NEXT: v_cndmask_b32_e32 v5, v8, v5, vcc_lo -; GFX10-NEXT: v_cmp_class_f32_e64 vcc_lo, v6, 64 -; GFX10-NEXT: v_cndmask_b32_e32 v6, v9, v6, vcc_lo -; GFX10-NEXT: v_cmp_class_f32_e64 vcc_lo, v7, 64 -; GFX10-NEXT: v_cndmask_b32_e32 v7, v10, v7, vcc_lo -; GFX10-NEXT: v_cmp_class_f32_e64 vcc_lo, v13, 64 -; GFX10-NEXT: v_cndmask_b32_e32 v5, v5, v13, vcc_lo -; GFX10-NEXT: v_cmp_class_f32_e64 vcc_lo, v14, 64 -; GFX10-NEXT: v_cndmask_b32_e32 v6, v6, v14, vcc_lo -; GFX10-NEXT: v_cmp_class_f32_e64 vcc_lo, v15, 64 -; GFX10-NEXT: v_cndmask_b32_e32 v7, v7, v15, vcc_lo -; GFX10-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v8 -; GFX10-NEXT: v_cndmask_b32_e32 v5, v8, v5, vcc_lo -; GFX10-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v9 -; GFX10-NEXT: v_cndmask_b32_e32 v6, v9, v6, vcc_lo -; GFX10-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v10 -; GFX10-NEXT: v_cndmask_b32_e32 v7, v10, v7, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v7, 0x7fc00000, v10, vcc_lo ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: v_maximum_v8f32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_cmp_gt_f32_e32 vcc_lo, v0, v8 -; GFX11-NEXT: v_cndmask_b32_e32 v16, v8, v0, vcc_lo -; GFX11-NEXT: v_cmp_gt_f32_e32 vcc_lo, 
v1, v9 -; GFX11-NEXT: v_cndmask_b32_e32 v17, v9, v1, vcc_lo +; GFX11-NEXT: v_dual_max_f32 v16, v0, v8 :: v_dual_max_f32 v17, v1, v9 ; GFX11-NEXT: v_cmp_o_f32_e32 vcc_lo, v0, v8 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_cndmask_b32_e32 v16, 0x7fc00000, v16, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: v_cndmask_b32_e32 v0, 0x7fc00000, v16, vcc_lo ; GFX11-NEXT: v_cmp_o_f32_e32 vcc_lo, v1, v9 -; GFX11-NEXT: v_cndmask_b32_e32 v17, 0x7fc00000, v17, vcc_lo -; GFX11-NEXT: v_cmp_class_f32_e64 vcc_lo, v0, 64 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_cndmask_b32_e32 v0, v16, v0, vcc_lo -; GFX11-NEXT: v_cmp_class_f32_e64 vcc_lo, v1, 64 -; GFX11-NEXT: v_cndmask_b32_e32 v1, v17, v1, vcc_lo -; GFX11-NEXT: v_cmp_class_f32_e64 vcc_lo, v8, 64 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v8, vcc_lo -; GFX11-NEXT: v_cmp_class_f32_e64 vcc_lo, v9, 64 -; GFX11-NEXT: v_cndmask_b32_e32 v1, v1, v9, vcc_lo -; GFX11-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v16 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_cndmask_b32_e32 v0, v16, v0, vcc_lo -; GFX11-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v17 -; GFX11-NEXT: v_cndmask_b32_e32 v1, v17, v1, vcc_lo -; GFX11-NEXT: v_cmp_gt_f32_e32 vcc_lo, v2, v10 -; GFX11-NEXT: v_cndmask_b32_e32 v8, v10, v2, vcc_lo -; GFX11-NEXT: v_cmp_gt_f32_e32 vcc_lo, v3, v11 -; GFX11-NEXT: v_cndmask_b32_e32 v9, v11, v3, vcc_lo -; GFX11-NEXT: v_cmp_gt_f32_e32 vcc_lo, v4, v12 -; GFX11-NEXT: v_cndmask_b32_e32 v16, v12, v4, vcc_lo +; GFX11-NEXT: v_dual_max_f32 v9, v3, v11 :: v_dual_max_f32 v8, v2, v10 +; GFX11-NEXT: v_cndmask_b32_e32 v1, 0x7fc00000, v17, vcc_lo ; GFX11-NEXT: v_cmp_o_f32_e32 vcc_lo, v2, v10 -; GFX11-NEXT: v_cndmask_b32_e32 v8, 0x7fc00000, v8, vcc_lo +; GFX11-NEXT: v_max_f32_e32 v10, v7, 
v15 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_cndmask_b32_e32 v2, 0x7fc00000, v8, vcc_lo ; GFX11-NEXT: v_cmp_o_f32_e32 vcc_lo, v3, v11 -; GFX11-NEXT: v_cndmask_b32_e32 v9, 0x7fc00000, v9, vcc_lo +; GFX11-NEXT: v_dual_max_f32 v8, v4, v12 :: v_dual_cndmask_b32 v3, 0x7fc00000, v9 ; GFX11-NEXT: v_cmp_o_f32_e32 vcc_lo, v4, v12 -; GFX11-NEXT: v_cndmask_b32_e32 v16, 0x7fc00000, v16, vcc_lo -; GFX11-NEXT: v_cmp_class_f32_e64 vcc_lo, v2, 64 -; GFX11-NEXT: v_cndmask_b32_e32 v2, v8, v2, vcc_lo -; GFX11-NEXT: v_cmp_class_f32_e64 vcc_lo, v3, 64 -; GFX11-NEXT: v_cndmask_b32_e32 v3, v9, v3, vcc_lo -; GFX11-NEXT: v_cmp_class_f32_e64 vcc_lo, v4, 64 -; GFX11-NEXT: v_cndmask_b32_e32 v4, v16, v4, vcc_lo -; GFX11-NEXT: v_cmp_class_f32_e64 vcc_lo, v10, 64 -; GFX11-NEXT: v_cndmask_b32_e32 v2, v2, v10, vcc_lo -; GFX11-NEXT: v_cmp_class_f32_e64 vcc_lo, v11, 64 -; GFX11-NEXT: v_cndmask_b32_e32 v3, v3, v11, vcc_lo -; GFX11-NEXT: v_cmp_class_f32_e64 vcc_lo, v12, 64 -; GFX11-NEXT: v_cndmask_b32_e32 v4, v4, v12, vcc_lo -; GFX11-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v8 -; GFX11-NEXT: v_cndmask_b32_e32 v2, v8, v2, vcc_lo -; GFX11-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v9 -; GFX11-NEXT: v_cndmask_b32_e32 v3, v9, v3, vcc_lo -; GFX11-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v16 -; GFX11-NEXT: v_cndmask_b32_e32 v4, v16, v4, vcc_lo -; GFX11-NEXT: v_cmp_gt_f32_e32 vcc_lo, v5, v13 -; GFX11-NEXT: v_cndmask_b32_e32 v8, v13, v5, vcc_lo -; GFX11-NEXT: v_cmp_gt_f32_e32 vcc_lo, v6, v14 -; GFX11-NEXT: v_cndmask_b32_e32 v9, v14, v6, vcc_lo -; GFX11-NEXT: v_cmp_gt_f32_e32 vcc_lo, v7, v15 -; GFX11-NEXT: v_cndmask_b32_e32 v10, v15, v7, vcc_lo +; GFX11-NEXT: v_dual_max_f32 v9, v5, v13 :: v_dual_cndmask_b32 v4, 0x7fc00000, v8 ; GFX11-NEXT: v_cmp_o_f32_e32 vcc_lo, v5, v13 -; GFX11-NEXT: v_cndmask_b32_e32 v8, 0x7fc00000, v8, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_dual_max_f32 v8, v6, v14 :: 
v_dual_cndmask_b32 v5, 0x7fc00000, v9 ; GFX11-NEXT: v_cmp_o_f32_e32 vcc_lo, v6, v14 -; GFX11-NEXT: v_cndmask_b32_e32 v9, 0x7fc00000, v9, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e32 v6, 0x7fc00000, v8, vcc_lo ; GFX11-NEXT: v_cmp_o_f32_e32 vcc_lo, v7, v15 -; GFX11-NEXT: v_cndmask_b32_e32 v10, 0x7fc00000, v10, vcc_lo -; GFX11-NEXT: v_cmp_class_f32_e64 vcc_lo, v5, 64 -; GFX11-NEXT: v_cndmask_b32_e32 v5, v8, v5, vcc_lo -; GFX11-NEXT: v_cmp_class_f32_e64 vcc_lo, v6, 64 -; GFX11-NEXT: v_cndmask_b32_e32 v6, v9, v6, vcc_lo -; GFX11-NEXT: v_cmp_class_f32_e64 vcc_lo, v7, 64 -; GFX11-NEXT: v_cndmask_b32_e32 v7, v10, v7, vcc_lo -; GFX11-NEXT: v_cmp_class_f32_e64 vcc_lo, v13, 64 -; GFX11-NEXT: v_cndmask_b32_e32 v5, v5, v13, vcc_lo -; GFX11-NEXT: v_cmp_class_f32_e64 vcc_lo, v14, 64 -; GFX11-NEXT: v_cndmask_b32_e32 v6, v6, v14, vcc_lo -; GFX11-NEXT: v_cmp_class_f32_e64 vcc_lo, v15, 64 -; GFX11-NEXT: v_cndmask_b32_e32 v7, v7, v15, vcc_lo -; GFX11-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v8 -; GFX11-NEXT: v_cndmask_b32_e32 v5, v8, v5, vcc_lo -; GFX11-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v9 -; GFX11-NEXT: v_cndmask_b32_e32 v6, v9, v6, vcc_lo -; GFX11-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v10 -; GFX11-NEXT: v_cndmask_b32_e32 v7, v10, v7, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e32 v7, 0x7fc00000, v10, vcc_lo ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-LABEL: v_maximum_v8f32: @@ -3244,1071 +1835,371 @@ define <16 x float> @v_maximum_v16f32(<16 x float> %src0, <16 x float> %src1) { ; GFX7-LABEL: v_maximum_v16f32: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_max_legacy_f32_e32 v32, v0, v16 -; GFX7-NEXT: v_mov_b32_e32 v31, 0x7fc00000 -; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v0, v16 -; GFX7-NEXT: v_cndmask_b32_e32 v32, v31, v32, vcc -; GFX7-NEXT: v_cmp_class_f32_e64 vcc, v0, 64 -; GFX7-NEXT: v_cndmask_b32_e32 v0, v32, v0, vcc -; GFX7-NEXT: v_cmp_class_f32_e64 vcc, v16, 64 -; GFX7-NEXT: v_cndmask_b32_e32 v0, v0, v16, vcc -; GFX7-NEXT: v_cmp_eq_f32_e32 vcc, 0, v32 -; GFX7-NEXT: 
v_cndmask_b32_e32 v0, v32, v0, vcc -; GFX7-NEXT: v_max_legacy_f32_e32 v16, v1, v17 +; GFX7-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GFX7-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; GFX7-NEXT: s_mov_b64 exec, s[4:5] +; GFX7-NEXT: v_cmp_o_f32_e64 s[16:17], v0, v16 +; GFX7-NEXT: v_max_f32_e32 v0, v0, v16 +; GFX7-NEXT: buffer_load_dword v16, off, s[0:3], s32 +; GFX7-NEXT: v_writelane_b32 v31, s30, 0 ; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v1, v17 -; GFX7-NEXT: v_cndmask_b32_e32 v16, v31, v16, vcc -; GFX7-NEXT: v_cmp_class_f32_e64 vcc, v1, 64 -; GFX7-NEXT: v_cndmask_b32_e32 v1, v16, v1, vcc -; GFX7-NEXT: v_cmp_class_f32_e64 vcc, v17, 64 -; GFX7-NEXT: v_cndmask_b32_e32 v1, v1, v17, vcc -; GFX7-NEXT: buffer_load_dword v17, off, s[0:3], s32 -; GFX7-NEXT: v_cmp_eq_f32_e32 vcc, 0, v16 -; GFX7-NEXT: v_cndmask_b32_e32 v1, v16, v1, vcc -; GFX7-NEXT: v_max_legacy_f32_e32 v16, v2, v18 -; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v2, v18 -; GFX7-NEXT: v_cndmask_b32_e32 v16, v31, v16, vcc -; GFX7-NEXT: v_cmp_class_f32_e64 vcc, v2, 64 -; GFX7-NEXT: v_cndmask_b32_e32 v2, v16, v2, vcc -; GFX7-NEXT: v_cmp_class_f32_e64 vcc, v18, 64 -; GFX7-NEXT: v_cndmask_b32_e32 v2, v2, v18, vcc -; GFX7-NEXT: v_cmp_eq_f32_e32 vcc, 0, v16 -; GFX7-NEXT: v_cndmask_b32_e32 v2, v16, v2, vcc -; GFX7-NEXT: v_max_legacy_f32_e32 v16, v3, v19 -; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v3, v19 -; GFX7-NEXT: v_cndmask_b32_e32 v16, v31, v16, vcc -; GFX7-NEXT: v_cmp_class_f32_e64 vcc, v3, 64 -; GFX7-NEXT: v_cndmask_b32_e32 v3, v16, v3, vcc -; GFX7-NEXT: v_cmp_class_f32_e64 vcc, v19, 64 -; GFX7-NEXT: v_cndmask_b32_e32 v3, v3, v19, vcc -; GFX7-NEXT: v_cmp_eq_f32_e32 vcc, 0, v16 -; GFX7-NEXT: v_cndmask_b32_e32 v3, v16, v3, vcc -; GFX7-NEXT: v_max_legacy_f32_e32 v16, v4, v20 -; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v4, v20 -; GFX7-NEXT: v_cndmask_b32_e32 v16, v31, v16, vcc -; GFX7-NEXT: v_cmp_class_f32_e64 vcc, v4, 64 -; GFX7-NEXT: v_cndmask_b32_e32 v4, v16, v4, vcc -; GFX7-NEXT: v_cmp_class_f32_e64 vcc, v20, 64 
-; GFX7-NEXT: v_cndmask_b32_e32 v4, v4, v20, vcc -; GFX7-NEXT: v_cmp_eq_f32_e32 vcc, 0, v16 -; GFX7-NEXT: v_cndmask_b32_e32 v4, v16, v4, vcc -; GFX7-NEXT: v_max_legacy_f32_e32 v16, v5, v21 -; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v5, v21 -; GFX7-NEXT: v_cndmask_b32_e32 v16, v31, v16, vcc -; GFX7-NEXT: v_cmp_class_f32_e64 vcc, v5, 64 -; GFX7-NEXT: v_cndmask_b32_e32 v5, v16, v5, vcc -; GFX7-NEXT: v_cmp_class_f32_e64 vcc, v21, 64 -; GFX7-NEXT: v_cndmask_b32_e32 v5, v5, v21, vcc -; GFX7-NEXT: v_cmp_eq_f32_e32 vcc, 0, v16 -; GFX7-NEXT: v_cndmask_b32_e32 v5, v16, v5, vcc -; GFX7-NEXT: v_max_legacy_f32_e32 v16, v6, v22 -; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v6, v22 -; GFX7-NEXT: v_cndmask_b32_e32 v16, v31, v16, vcc -; GFX7-NEXT: v_cmp_class_f32_e64 vcc, v6, 64 -; GFX7-NEXT: v_cndmask_b32_e32 v6, v16, v6, vcc -; GFX7-NEXT: v_cmp_class_f32_e64 vcc, v22, 64 -; GFX7-NEXT: v_cndmask_b32_e32 v6, v6, v22, vcc -; GFX7-NEXT: v_cmp_eq_f32_e32 vcc, 0, v16 -; GFX7-NEXT: v_cndmask_b32_e32 v6, v16, v6, vcc -; GFX7-NEXT: v_max_legacy_f32_e32 v16, v7, v23 -; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v7, v23 -; GFX7-NEXT: v_cndmask_b32_e32 v16, v31, v16, vcc -; GFX7-NEXT: v_cmp_class_f32_e64 vcc, v7, 64 -; GFX7-NEXT: v_cndmask_b32_e32 v7, v16, v7, vcc -; GFX7-NEXT: v_cmp_class_f32_e64 vcc, v23, 64 -; GFX7-NEXT: v_cndmask_b32_e32 v7, v7, v23, vcc -; GFX7-NEXT: v_cmp_eq_f32_e32 vcc, 0, v16 -; GFX7-NEXT: v_cndmask_b32_e32 v7, v16, v7, vcc -; GFX7-NEXT: v_max_legacy_f32_e32 v16, v8, v24 -; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v8, v24 -; GFX7-NEXT: v_cndmask_b32_e32 v16, v31, v16, vcc -; GFX7-NEXT: v_cmp_class_f32_e64 vcc, v8, 64 -; GFX7-NEXT: v_cndmask_b32_e32 v8, v16, v8, vcc -; GFX7-NEXT: v_cmp_class_f32_e64 vcc, v24, 64 -; GFX7-NEXT: v_cndmask_b32_e32 v8, v8, v24, vcc -; GFX7-NEXT: v_cmp_eq_f32_e32 vcc, 0, v16 -; GFX7-NEXT: v_cndmask_b32_e32 v8, v16, v8, vcc -; GFX7-NEXT: v_max_legacy_f32_e32 v16, v9, v25 -; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v9, v25 -; GFX7-NEXT: v_cndmask_b32_e32 v16, v31, v16, vcc -; GFX7-NEXT: 
v_cmp_class_f32_e64 vcc, v9, 64 -; GFX7-NEXT: v_cndmask_b32_e32 v9, v16, v9, vcc -; GFX7-NEXT: v_cmp_class_f32_e64 vcc, v25, 64 -; GFX7-NEXT: v_cndmask_b32_e32 v9, v9, v25, vcc -; GFX7-NEXT: v_cmp_eq_f32_e32 vcc, 0, v16 -; GFX7-NEXT: v_cndmask_b32_e32 v9, v16, v9, vcc -; GFX7-NEXT: v_max_legacy_f32_e32 v16, v10, v26 -; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v10, v26 -; GFX7-NEXT: v_cndmask_b32_e32 v16, v31, v16, vcc -; GFX7-NEXT: v_cmp_class_f32_e64 vcc, v10, 64 -; GFX7-NEXT: v_cndmask_b32_e32 v10, v16, v10, vcc -; GFX7-NEXT: v_cmp_class_f32_e64 vcc, v26, 64 -; GFX7-NEXT: v_cndmask_b32_e32 v10, v10, v26, vcc -; GFX7-NEXT: v_cmp_eq_f32_e32 vcc, 0, v16 -; GFX7-NEXT: v_cndmask_b32_e32 v10, v16, v10, vcc -; GFX7-NEXT: v_max_legacy_f32_e32 v16, v11, v27 -; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v11, v27 -; GFX7-NEXT: v_cndmask_b32_e32 v16, v31, v16, vcc -; GFX7-NEXT: v_cmp_class_f32_e64 vcc, v11, 64 -; GFX7-NEXT: v_cndmask_b32_e32 v11, v16, v11, vcc -; GFX7-NEXT: v_cmp_class_f32_e64 vcc, v27, 64 -; GFX7-NEXT: v_cndmask_b32_e32 v11, v11, v27, vcc -; GFX7-NEXT: v_cmp_eq_f32_e32 vcc, 0, v16 -; GFX7-NEXT: v_cndmask_b32_e32 v11, v16, v11, vcc -; GFX7-NEXT: v_max_legacy_f32_e32 v16, v12, v28 -; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v12, v28 -; GFX7-NEXT: v_cndmask_b32_e32 v16, v31, v16, vcc -; GFX7-NEXT: v_cmp_class_f32_e64 vcc, v12, 64 -; GFX7-NEXT: v_cndmask_b32_e32 v12, v16, v12, vcc -; GFX7-NEXT: v_cmp_class_f32_e64 vcc, v28, 64 -; GFX7-NEXT: v_cndmask_b32_e32 v12, v12, v28, vcc -; GFX7-NEXT: v_cmp_eq_f32_e32 vcc, 0, v16 -; GFX7-NEXT: v_cndmask_b32_e32 v12, v16, v12, vcc -; GFX7-NEXT: v_max_legacy_f32_e32 v16, v13, v29 -; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v13, v29 -; GFX7-NEXT: v_cndmask_b32_e32 v16, v31, v16, vcc -; GFX7-NEXT: v_cmp_class_f32_e64 vcc, v13, 64 -; GFX7-NEXT: v_cndmask_b32_e32 v13, v16, v13, vcc -; GFX7-NEXT: v_cmp_class_f32_e64 vcc, v29, 64 -; GFX7-NEXT: v_cndmask_b32_e32 v13, v13, v29, vcc -; GFX7-NEXT: v_cmp_eq_f32_e32 vcc, 0, v16 -; GFX7-NEXT: v_cndmask_b32_e32 v13, v16, 
v13, vcc -; GFX7-NEXT: v_max_legacy_f32_e32 v16, v14, v30 -; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v14, v30 -; GFX7-NEXT: v_cndmask_b32_e32 v16, v31, v16, vcc -; GFX7-NEXT: v_cmp_class_f32_e64 vcc, v14, 64 -; GFX7-NEXT: v_cndmask_b32_e32 v14, v16, v14, vcc -; GFX7-NEXT: v_cmp_class_f32_e64 vcc, v30, 64 -; GFX7-NEXT: v_cndmask_b32_e32 v14, v14, v30, vcc -; GFX7-NEXT: v_cmp_eq_f32_e32 vcc, 0, v16 -; GFX7-NEXT: v_cndmask_b32_e32 v14, v16, v14, vcc +; GFX7-NEXT: v_max_f32_e32 v1, v1, v17 +; GFX7-NEXT: v_cmp_o_f32_e64 s[4:5], v2, v18 +; GFX7-NEXT: v_max_f32_e32 v2, v2, v18 +; GFX7-NEXT: v_mov_b32_e32 v17, 0x7fc00000 +; GFX7-NEXT: v_max_f32_e32 v18, v13, v29 +; GFX7-NEXT: v_cmp_o_f32_e64 s[28:29], v13, v29 +; GFX7-NEXT: v_writelane_b32 v31, s31, 1 +; GFX7-NEXT: v_cmp_o_f32_e64 s[6:7], v3, v19 +; GFX7-NEXT: v_max_f32_e32 v3, v3, v19 +; GFX7-NEXT: v_cmp_o_f32_e64 s[8:9], v4, v20 +; GFX7-NEXT: v_max_f32_e32 v4, v4, v20 +; GFX7-NEXT: v_cmp_o_f32_e64 s[10:11], v5, v21 +; GFX7-NEXT: v_max_f32_e32 v5, v5, v21 +; GFX7-NEXT: v_cmp_o_f32_e64 s[12:13], v6, v22 +; GFX7-NEXT: v_max_f32_e32 v6, v6, v22 +; GFX7-NEXT: v_cmp_o_f32_e64 s[14:15], v7, v23 +; GFX7-NEXT: v_max_f32_e32 v7, v7, v23 +; GFX7-NEXT: v_cmp_o_f32_e64 s[18:19], v8, v24 +; GFX7-NEXT: v_max_f32_e32 v8, v8, v24 +; GFX7-NEXT: v_cmp_o_f32_e64 s[20:21], v9, v25 +; GFX7-NEXT: v_max_f32_e32 v9, v9, v25 +; GFX7-NEXT: v_cmp_o_f32_e64 s[22:23], v10, v26 +; GFX7-NEXT: v_max_f32_e32 v10, v10, v26 +; GFX7-NEXT: v_cmp_o_f32_e64 s[24:25], v11, v27 +; GFX7-NEXT: v_max_f32_e32 v11, v11, v27 +; GFX7-NEXT: v_cmp_o_f32_e64 s[26:27], v12, v28 +; GFX7-NEXT: v_max_f32_e32 v12, v12, v28 +; GFX7-NEXT: v_max_f32_e32 v19, v14, v30 +; GFX7-NEXT: v_cmp_o_f32_e64 s[30:31], v14, v30 +; GFX7-NEXT: v_cndmask_b32_e32 v1, v17, v1, vcc +; GFX7-NEXT: v_cndmask_b32_e64 v13, v17, v18, s[28:29] +; GFX7-NEXT: v_cndmask_b32_e64 v0, v17, v0, s[16:17] +; GFX7-NEXT: v_cndmask_b32_e64 v2, v17, v2, s[4:5] +; GFX7-NEXT: v_cndmask_b32_e64 v3, v17, v3, s[6:7] +; GFX7-NEXT: 
v_cndmask_b32_e64 v4, v17, v4, s[8:9] +; GFX7-NEXT: v_cndmask_b32_e64 v5, v17, v5, s[10:11] +; GFX7-NEXT: v_cndmask_b32_e64 v6, v17, v6, s[12:13] +; GFX7-NEXT: v_cndmask_b32_e64 v7, v17, v7, s[14:15] +; GFX7-NEXT: v_cndmask_b32_e64 v8, v17, v8, s[18:19] +; GFX7-NEXT: v_cndmask_b32_e64 v9, v17, v9, s[20:21] +; GFX7-NEXT: v_cndmask_b32_e64 v10, v17, v10, s[22:23] +; GFX7-NEXT: v_cndmask_b32_e64 v11, v17, v11, s[24:25] +; GFX7-NEXT: v_cndmask_b32_e64 v12, v17, v12, s[26:27] +; GFX7-NEXT: v_cndmask_b32_e64 v14, v17, v19, s[30:31] +; GFX7-NEXT: v_readlane_b32 s31, v31, 1 +; GFX7-NEXT: v_readlane_b32 s30, v31, 0 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_max_f32_e32 v18, v15, v16 +; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v15, v16 +; GFX7-NEXT: v_cndmask_b32_e32 v15, v17, v18, vcc +; GFX7-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; GFX7-NEXT: s_mov_b64 exec, s[4:5] ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_max_legacy_f32_e32 v16, v15, v17 -; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v15, v17 -; GFX7-NEXT: v_cndmask_b32_e32 v16, v31, v16, vcc -; GFX7-NEXT: v_cmp_class_f32_e64 vcc, v15, 64 -; GFX7-NEXT: v_cndmask_b32_e32 v15, v16, v15, vcc -; GFX7-NEXT: v_cmp_class_f32_e64 vcc, v17, 64 -; GFX7-NEXT: v_cndmask_b32_e32 v15, v15, v17, vcc -; GFX7-NEXT: v_cmp_eq_f32_e32 vcc, 0, v16 -; GFX7-NEXT: v_cndmask_b32_e32 v15, v16, v15, vcc ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_maximum_v16f32: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_cmp_gt_f32_e32 vcc, v0, v16 -; GFX8-NEXT: v_cndmask_b32_e32 v32, v16, v0, vcc -; GFX8-NEXT: v_mov_b32_e32 v31, 0x7fc00000 -; GFX8-NEXT: v_cmp_o_f32_e32 vcc, v0, v16 -; GFX8-NEXT: v_cndmask_b32_e32 v32, v31, v32, vcc -; GFX8-NEXT: v_cmp_class_f32_e64 vcc, v0, 64 -; GFX8-NEXT: v_cndmask_b32_e32 v0, v32, v0, vcc -; GFX8-NEXT: v_cmp_class_f32_e64 vcc, v16, 64 -; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v16, vcc -; GFX8-NEXT: 
v_cmp_eq_f32_e32 vcc, 0, v32 -; GFX8-NEXT: v_cndmask_b32_e32 v0, v32, v0, vcc -; GFX8-NEXT: v_cmp_gt_f32_e32 vcc, v1, v17 -; GFX8-NEXT: v_cndmask_b32_e32 v16, v17, v1, vcc +; GFX8-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GFX8-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; GFX8-NEXT: s_mov_b64 exec, s[4:5] +; GFX8-NEXT: v_cmp_o_f32_e64 s[16:17], v0, v16 +; GFX8-NEXT: v_max_f32_e32 v0, v0, v16 +; GFX8-NEXT: buffer_load_dword v16, off, s[0:3], s32 +; GFX8-NEXT: v_writelane_b32 v31, s30, 0 ; GFX8-NEXT: v_cmp_o_f32_e32 vcc, v1, v17 -; GFX8-NEXT: v_cndmask_b32_e32 v16, v31, v16, vcc -; GFX8-NEXT: v_cmp_class_f32_e64 vcc, v1, 64 -; GFX8-NEXT: v_cndmask_b32_e32 v1, v16, v1, vcc -; GFX8-NEXT: v_cmp_class_f32_e64 vcc, v17, 64 -; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v17, vcc -; GFX8-NEXT: buffer_load_dword v17, off, s[0:3], s32 -; GFX8-NEXT: v_cmp_eq_f32_e32 vcc, 0, v16 -; GFX8-NEXT: v_cndmask_b32_e32 v1, v16, v1, vcc -; GFX8-NEXT: v_cmp_gt_f32_e32 vcc, v2, v18 -; GFX8-NEXT: v_cndmask_b32_e32 v16, v18, v2, vcc -; GFX8-NEXT: v_cmp_o_f32_e32 vcc, v2, v18 -; GFX8-NEXT: v_cndmask_b32_e32 v16, v31, v16, vcc -; GFX8-NEXT: v_cmp_class_f32_e64 vcc, v2, 64 -; GFX8-NEXT: v_cndmask_b32_e32 v2, v16, v2, vcc -; GFX8-NEXT: v_cmp_class_f32_e64 vcc, v18, 64 -; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v18, vcc -; GFX8-NEXT: v_cmp_eq_f32_e32 vcc, 0, v16 -; GFX8-NEXT: v_cndmask_b32_e32 v2, v16, v2, vcc -; GFX8-NEXT: v_cmp_gt_f32_e32 vcc, v3, v19 -; GFX8-NEXT: v_cndmask_b32_e32 v16, v19, v3, vcc -; GFX8-NEXT: v_cmp_o_f32_e32 vcc, v3, v19 -; GFX8-NEXT: v_cndmask_b32_e32 v16, v31, v16, vcc -; GFX8-NEXT: v_cmp_class_f32_e64 vcc, v3, 64 -; GFX8-NEXT: v_cndmask_b32_e32 v3, v16, v3, vcc -; GFX8-NEXT: v_cmp_class_f32_e64 vcc, v19, 64 -; GFX8-NEXT: v_cndmask_b32_e32 v3, v3, v19, vcc -; GFX8-NEXT: v_cmp_eq_f32_e32 vcc, 0, v16 -; GFX8-NEXT: v_cndmask_b32_e32 v3, v16, v3, vcc -; GFX8-NEXT: v_cmp_gt_f32_e32 vcc, v4, v20 -; GFX8-NEXT: v_cndmask_b32_e32 v16, v20, v4, vcc -; 
GFX8-NEXT: v_cmp_o_f32_e32 vcc, v4, v20 -; GFX8-NEXT: v_cndmask_b32_e32 v16, v31, v16, vcc -; GFX8-NEXT: v_cmp_class_f32_e64 vcc, v4, 64 -; GFX8-NEXT: v_cndmask_b32_e32 v4, v16, v4, vcc -; GFX8-NEXT: v_cmp_class_f32_e64 vcc, v20, 64 -; GFX8-NEXT: v_cndmask_b32_e32 v4, v4, v20, vcc -; GFX8-NEXT: v_cmp_eq_f32_e32 vcc, 0, v16 -; GFX8-NEXT: v_cndmask_b32_e32 v4, v16, v4, vcc -; GFX8-NEXT: v_cmp_gt_f32_e32 vcc, v5, v21 -; GFX8-NEXT: v_cndmask_b32_e32 v16, v21, v5, vcc -; GFX8-NEXT: v_cmp_o_f32_e32 vcc, v5, v21 -; GFX8-NEXT: v_cndmask_b32_e32 v16, v31, v16, vcc -; GFX8-NEXT: v_cmp_class_f32_e64 vcc, v5, 64 -; GFX8-NEXT: v_cndmask_b32_e32 v5, v16, v5, vcc -; GFX8-NEXT: v_cmp_class_f32_e64 vcc, v21, 64 -; GFX8-NEXT: v_cndmask_b32_e32 v5, v5, v21, vcc -; GFX8-NEXT: v_cmp_eq_f32_e32 vcc, 0, v16 -; GFX8-NEXT: v_cndmask_b32_e32 v5, v16, v5, vcc -; GFX8-NEXT: v_cmp_gt_f32_e32 vcc, v6, v22 -; GFX8-NEXT: v_cndmask_b32_e32 v16, v22, v6, vcc -; GFX8-NEXT: v_cmp_o_f32_e32 vcc, v6, v22 -; GFX8-NEXT: v_cndmask_b32_e32 v16, v31, v16, vcc -; GFX8-NEXT: v_cmp_class_f32_e64 vcc, v6, 64 -; GFX8-NEXT: v_cndmask_b32_e32 v6, v16, v6, vcc -; GFX8-NEXT: v_cmp_class_f32_e64 vcc, v22, 64 -; GFX8-NEXT: v_cndmask_b32_e32 v6, v6, v22, vcc -; GFX8-NEXT: v_cmp_eq_f32_e32 vcc, 0, v16 -; GFX8-NEXT: v_cndmask_b32_e32 v6, v16, v6, vcc -; GFX8-NEXT: v_cmp_gt_f32_e32 vcc, v7, v23 -; GFX8-NEXT: v_cndmask_b32_e32 v16, v23, v7, vcc -; GFX8-NEXT: v_cmp_o_f32_e32 vcc, v7, v23 -; GFX8-NEXT: v_cndmask_b32_e32 v16, v31, v16, vcc -; GFX8-NEXT: v_cmp_class_f32_e64 vcc, v7, 64 -; GFX8-NEXT: v_cndmask_b32_e32 v7, v16, v7, vcc -; GFX8-NEXT: v_cmp_class_f32_e64 vcc, v23, 64 -; GFX8-NEXT: v_cndmask_b32_e32 v7, v7, v23, vcc -; GFX8-NEXT: v_cmp_eq_f32_e32 vcc, 0, v16 -; GFX8-NEXT: v_cndmask_b32_e32 v7, v16, v7, vcc -; GFX8-NEXT: v_cmp_gt_f32_e32 vcc, v8, v24 -; GFX8-NEXT: v_cndmask_b32_e32 v16, v24, v8, vcc -; GFX8-NEXT: v_cmp_o_f32_e32 vcc, v8, v24 -; GFX8-NEXT: v_cndmask_b32_e32 v16, v31, v16, vcc -; GFX8-NEXT: 
v_cmp_class_f32_e64 vcc, v8, 64 -; GFX8-NEXT: v_cndmask_b32_e32 v8, v16, v8, vcc -; GFX8-NEXT: v_cmp_class_f32_e64 vcc, v24, 64 -; GFX8-NEXT: v_cndmask_b32_e32 v8, v8, v24, vcc -; GFX8-NEXT: v_cmp_eq_f32_e32 vcc, 0, v16 -; GFX8-NEXT: v_cndmask_b32_e32 v8, v16, v8, vcc -; GFX8-NEXT: v_cmp_gt_f32_e32 vcc, v9, v25 -; GFX8-NEXT: v_cndmask_b32_e32 v16, v25, v9, vcc -; GFX8-NEXT: v_cmp_o_f32_e32 vcc, v9, v25 -; GFX8-NEXT: v_cndmask_b32_e32 v16, v31, v16, vcc -; GFX8-NEXT: v_cmp_class_f32_e64 vcc, v9, 64 -; GFX8-NEXT: v_cndmask_b32_e32 v9, v16, v9, vcc -; GFX8-NEXT: v_cmp_class_f32_e64 vcc, v25, 64 -; GFX8-NEXT: v_cndmask_b32_e32 v9, v9, v25, vcc -; GFX8-NEXT: v_cmp_eq_f32_e32 vcc, 0, v16 -; GFX8-NEXT: v_cndmask_b32_e32 v9, v16, v9, vcc -; GFX8-NEXT: v_cmp_gt_f32_e32 vcc, v10, v26 -; GFX8-NEXT: v_cndmask_b32_e32 v16, v26, v10, vcc -; GFX8-NEXT: v_cmp_o_f32_e32 vcc, v10, v26 -; GFX8-NEXT: v_cndmask_b32_e32 v16, v31, v16, vcc -; GFX8-NEXT: v_cmp_class_f32_e64 vcc, v10, 64 -; GFX8-NEXT: v_cndmask_b32_e32 v10, v16, v10, vcc -; GFX8-NEXT: v_cmp_class_f32_e64 vcc, v26, 64 -; GFX8-NEXT: v_cndmask_b32_e32 v10, v10, v26, vcc -; GFX8-NEXT: v_cmp_eq_f32_e32 vcc, 0, v16 -; GFX8-NEXT: v_cndmask_b32_e32 v10, v16, v10, vcc -; GFX8-NEXT: v_cmp_gt_f32_e32 vcc, v11, v27 -; GFX8-NEXT: v_cndmask_b32_e32 v16, v27, v11, vcc -; GFX8-NEXT: v_cmp_o_f32_e32 vcc, v11, v27 -; GFX8-NEXT: v_cndmask_b32_e32 v16, v31, v16, vcc -; GFX8-NEXT: v_cmp_class_f32_e64 vcc, v11, 64 -; GFX8-NEXT: v_cndmask_b32_e32 v11, v16, v11, vcc -; GFX8-NEXT: v_cmp_class_f32_e64 vcc, v27, 64 -; GFX8-NEXT: v_cndmask_b32_e32 v11, v11, v27, vcc -; GFX8-NEXT: v_cmp_eq_f32_e32 vcc, 0, v16 -; GFX8-NEXT: v_cndmask_b32_e32 v11, v16, v11, vcc -; GFX8-NEXT: v_cmp_gt_f32_e32 vcc, v12, v28 -; GFX8-NEXT: v_cndmask_b32_e32 v16, v28, v12, vcc -; GFX8-NEXT: v_cmp_o_f32_e32 vcc, v12, v28 -; GFX8-NEXT: v_cndmask_b32_e32 v16, v31, v16, vcc -; GFX8-NEXT: v_cmp_class_f32_e64 vcc, v12, 64 -; GFX8-NEXT: v_cndmask_b32_e32 v12, v16, v12, vcc -; 
GFX8-NEXT: v_cmp_class_f32_e64 vcc, v28, 64 -; GFX8-NEXT: v_cndmask_b32_e32 v12, v12, v28, vcc -; GFX8-NEXT: v_cmp_eq_f32_e32 vcc, 0, v16 -; GFX8-NEXT: v_cndmask_b32_e32 v12, v16, v12, vcc -; GFX8-NEXT: v_cmp_gt_f32_e32 vcc, v13, v29 -; GFX8-NEXT: v_cndmask_b32_e32 v16, v29, v13, vcc -; GFX8-NEXT: v_cmp_o_f32_e32 vcc, v13, v29 -; GFX8-NEXT: v_cndmask_b32_e32 v16, v31, v16, vcc -; GFX8-NEXT: v_cmp_class_f32_e64 vcc, v13, 64 -; GFX8-NEXT: v_cndmask_b32_e32 v13, v16, v13, vcc -; GFX8-NEXT: v_cmp_class_f32_e64 vcc, v29, 64 -; GFX8-NEXT: v_cndmask_b32_e32 v13, v13, v29, vcc -; GFX8-NEXT: v_cmp_eq_f32_e32 vcc, 0, v16 -; GFX8-NEXT: v_cndmask_b32_e32 v13, v16, v13, vcc -; GFX8-NEXT: v_cmp_gt_f32_e32 vcc, v14, v30 -; GFX8-NEXT: v_cndmask_b32_e32 v16, v30, v14, vcc -; GFX8-NEXT: v_cmp_o_f32_e32 vcc, v14, v30 -; GFX8-NEXT: v_cndmask_b32_e32 v16, v31, v16, vcc -; GFX8-NEXT: v_cmp_class_f32_e64 vcc, v14, 64 -; GFX8-NEXT: v_cndmask_b32_e32 v14, v16, v14, vcc -; GFX8-NEXT: v_cmp_class_f32_e64 vcc, v30, 64 -; GFX8-NEXT: v_cndmask_b32_e32 v14, v14, v30, vcc -; GFX8-NEXT: v_cmp_eq_f32_e32 vcc, 0, v16 -; GFX8-NEXT: v_cndmask_b32_e32 v14, v16, v14, vcc +; GFX8-NEXT: v_max_f32_e32 v1, v1, v17 +; GFX8-NEXT: v_cmp_o_f32_e64 s[4:5], v2, v18 +; GFX8-NEXT: v_max_f32_e32 v2, v2, v18 +; GFX8-NEXT: v_mov_b32_e32 v17, 0x7fc00000 +; GFX8-NEXT: v_max_f32_e32 v18, v13, v29 +; GFX8-NEXT: v_cmp_o_f32_e64 s[28:29], v13, v29 +; GFX8-NEXT: v_writelane_b32 v31, s31, 1 +; GFX8-NEXT: v_cmp_o_f32_e64 s[6:7], v3, v19 +; GFX8-NEXT: v_max_f32_e32 v3, v3, v19 +; GFX8-NEXT: v_cmp_o_f32_e64 s[8:9], v4, v20 +; GFX8-NEXT: v_max_f32_e32 v4, v4, v20 +; GFX8-NEXT: v_cmp_o_f32_e64 s[10:11], v5, v21 +; GFX8-NEXT: v_max_f32_e32 v5, v5, v21 +; GFX8-NEXT: v_cmp_o_f32_e64 s[12:13], v6, v22 +; GFX8-NEXT: v_max_f32_e32 v6, v6, v22 +; GFX8-NEXT: v_cmp_o_f32_e64 s[14:15], v7, v23 +; GFX8-NEXT: v_max_f32_e32 v7, v7, v23 +; GFX8-NEXT: v_cmp_o_f32_e64 s[18:19], v8, v24 +; GFX8-NEXT: v_max_f32_e32 v8, v8, v24 +; GFX8-NEXT: 
v_cmp_o_f32_e64 s[20:21], v9, v25 +; GFX8-NEXT: v_max_f32_e32 v9, v9, v25 +; GFX8-NEXT: v_cmp_o_f32_e64 s[22:23], v10, v26 +; GFX8-NEXT: v_max_f32_e32 v10, v10, v26 +; GFX8-NEXT: v_cmp_o_f32_e64 s[24:25], v11, v27 +; GFX8-NEXT: v_max_f32_e32 v11, v11, v27 +; GFX8-NEXT: v_cmp_o_f32_e64 s[26:27], v12, v28 +; GFX8-NEXT: v_max_f32_e32 v12, v12, v28 +; GFX8-NEXT: v_max_f32_e32 v19, v14, v30 +; GFX8-NEXT: v_cmp_o_f32_e64 s[30:31], v14, v30 +; GFX8-NEXT: v_cndmask_b32_e32 v1, v17, v1, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v13, v17, v18, s[28:29] +; GFX8-NEXT: v_cndmask_b32_e64 v0, v17, v0, s[16:17] +; GFX8-NEXT: v_cndmask_b32_e64 v2, v17, v2, s[4:5] +; GFX8-NEXT: v_cndmask_b32_e64 v3, v17, v3, s[6:7] +; GFX8-NEXT: v_cndmask_b32_e64 v4, v17, v4, s[8:9] +; GFX8-NEXT: v_cndmask_b32_e64 v5, v17, v5, s[10:11] +; GFX8-NEXT: v_cndmask_b32_e64 v6, v17, v6, s[12:13] +; GFX8-NEXT: v_cndmask_b32_e64 v7, v17, v7, s[14:15] +; GFX8-NEXT: v_cndmask_b32_e64 v8, v17, v8, s[18:19] +; GFX8-NEXT: v_cndmask_b32_e64 v9, v17, v9, s[20:21] +; GFX8-NEXT: v_cndmask_b32_e64 v10, v17, v10, s[22:23] +; GFX8-NEXT: v_cndmask_b32_e64 v11, v17, v11, s[24:25] +; GFX8-NEXT: v_cndmask_b32_e64 v12, v17, v12, s[26:27] +; GFX8-NEXT: v_cndmask_b32_e64 v14, v17, v19, s[30:31] +; GFX8-NEXT: v_readlane_b32 s31, v31, 1 +; GFX8-NEXT: v_readlane_b32 s30, v31, 0 +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_max_f32_e32 v18, v15, v16 +; GFX8-NEXT: v_cmp_o_f32_e32 vcc, v15, v16 +; GFX8-NEXT: v_cndmask_b32_e32 v15, v17, v18, vcc +; GFX8-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GFX8-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; GFX8-NEXT: s_mov_b64 exec, s[4:5] ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_cmp_gt_f32_e32 vcc, v15, v17 -; GFX8-NEXT: v_cndmask_b32_e32 v16, v17, v15, vcc -; GFX8-NEXT: v_cmp_o_f32_e32 vcc, v15, v17 -; GFX8-NEXT: v_cndmask_b32_e32 v16, v31, v16, vcc -; GFX8-NEXT: v_cmp_class_f32_e64 vcc, v15, 64 -; GFX8-NEXT: v_cndmask_b32_e32 v15, v16, v15, vcc -; 
GFX8-NEXT: v_cmp_class_f32_e64 vcc, v17, 64 -; GFX8-NEXT: v_cndmask_b32_e32 v15, v15, v17, vcc -; GFX8-NEXT: v_cmp_eq_f32_e32 vcc, 0, v16 -; GFX8-NEXT: v_cndmask_b32_e32 v15, v16, v15, vcc ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_maximum_v16f32: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_cmp_gt_f32_e32 vcc, v0, v16 -; GFX9-NEXT: v_cndmask_b32_e32 v32, v16, v0, vcc -; GFX9-NEXT: v_mov_b32_e32 v31, 0x7fc00000 -; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v16 -; GFX9-NEXT: v_cndmask_b32_e32 v32, v31, v32, vcc -; GFX9-NEXT: v_cmp_class_f32_e64 vcc, v0, 64 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v32, v0, vcc -; GFX9-NEXT: v_cmp_class_f32_e64 vcc, v16, 64 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v16, vcc -; GFX9-NEXT: v_cmp_eq_f32_e32 vcc, 0, v32 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v32, v0, vcc -; GFX9-NEXT: v_cmp_gt_f32_e32 vcc, v1, v17 -; GFX9-NEXT: v_cndmask_b32_e32 v16, v17, v1, vcc +; GFX9-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GFX9-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; GFX9-NEXT: s_mov_b64 exec, s[4:5] +; GFX9-NEXT: v_cmp_o_f32_e64 s[16:17], v0, v16 +; GFX9-NEXT: v_max_f32_e32 v0, v0, v16 +; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 +; GFX9-NEXT: v_writelane_b32 v31, s30, 0 ; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v1, v17 -; GFX9-NEXT: v_cndmask_b32_e32 v16, v31, v16, vcc -; GFX9-NEXT: v_cmp_class_f32_e64 vcc, v1, 64 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v16, v1, vcc -; GFX9-NEXT: v_cmp_class_f32_e64 vcc, v17, 64 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v17, vcc -; GFX9-NEXT: buffer_load_dword v17, off, s[0:3], s32 -; GFX9-NEXT: v_cmp_eq_f32_e32 vcc, 0, v16 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v16, v1, vcc -; GFX9-NEXT: v_cmp_gt_f32_e32 vcc, v2, v18 -; GFX9-NEXT: v_cndmask_b32_e32 v16, v18, v2, vcc -; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v2, v18 -; GFX9-NEXT: v_cndmask_b32_e32 v16, v31, v16, vcc -; GFX9-NEXT: v_cmp_class_f32_e64 vcc, v2, 64 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v16, v2, vcc 
-; GFX9-NEXT: v_cmp_class_f32_e64 vcc, v18, 64 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v18, vcc -; GFX9-NEXT: v_cmp_eq_f32_e32 vcc, 0, v16 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v16, v2, vcc -; GFX9-NEXT: v_cmp_gt_f32_e32 vcc, v3, v19 -; GFX9-NEXT: v_cndmask_b32_e32 v16, v19, v3, vcc -; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v3, v19 -; GFX9-NEXT: v_cndmask_b32_e32 v16, v31, v16, vcc -; GFX9-NEXT: v_cmp_class_f32_e64 vcc, v3, 64 -; GFX9-NEXT: v_cndmask_b32_e32 v3, v16, v3, vcc -; GFX9-NEXT: v_cmp_class_f32_e64 vcc, v19, 64 -; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v19, vcc -; GFX9-NEXT: v_cmp_eq_f32_e32 vcc, 0, v16 -; GFX9-NEXT: v_cndmask_b32_e32 v3, v16, v3, vcc -; GFX9-NEXT: v_cmp_gt_f32_e32 vcc, v4, v20 -; GFX9-NEXT: v_cndmask_b32_e32 v16, v20, v4, vcc -; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v4, v20 -; GFX9-NEXT: v_cndmask_b32_e32 v16, v31, v16, vcc -; GFX9-NEXT: v_cmp_class_f32_e64 vcc, v4, 64 -; GFX9-NEXT: v_cndmask_b32_e32 v4, v16, v4, vcc -; GFX9-NEXT: v_cmp_class_f32_e64 vcc, v20, 64 -; GFX9-NEXT: v_cndmask_b32_e32 v4, v4, v20, vcc -; GFX9-NEXT: v_cmp_eq_f32_e32 vcc, 0, v16 -; GFX9-NEXT: v_cndmask_b32_e32 v4, v16, v4, vcc -; GFX9-NEXT: v_cmp_gt_f32_e32 vcc, v5, v21 -; GFX9-NEXT: v_cndmask_b32_e32 v16, v21, v5, vcc -; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v5, v21 -; GFX9-NEXT: v_cndmask_b32_e32 v16, v31, v16, vcc -; GFX9-NEXT: v_cmp_class_f32_e64 vcc, v5, 64 -; GFX9-NEXT: v_cndmask_b32_e32 v5, v16, v5, vcc -; GFX9-NEXT: v_cmp_class_f32_e64 vcc, v21, 64 -; GFX9-NEXT: v_cndmask_b32_e32 v5, v5, v21, vcc -; GFX9-NEXT: v_cmp_eq_f32_e32 vcc, 0, v16 -; GFX9-NEXT: v_cndmask_b32_e32 v5, v16, v5, vcc -; GFX9-NEXT: v_cmp_gt_f32_e32 vcc, v6, v22 -; GFX9-NEXT: v_cndmask_b32_e32 v16, v22, v6, vcc -; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v6, v22 -; GFX9-NEXT: v_cndmask_b32_e32 v16, v31, v16, vcc -; GFX9-NEXT: v_cmp_class_f32_e64 vcc, v6, 64 -; GFX9-NEXT: v_cndmask_b32_e32 v6, v16, v6, vcc -; GFX9-NEXT: v_cmp_class_f32_e64 vcc, v22, 64 -; GFX9-NEXT: v_cndmask_b32_e32 v6, v6, v22, vcc -; GFX9-NEXT: 
v_cmp_eq_f32_e32 vcc, 0, v16 -; GFX9-NEXT: v_cndmask_b32_e32 v6, v16, v6, vcc -; GFX9-NEXT: v_cmp_gt_f32_e32 vcc, v7, v23 -; GFX9-NEXT: v_cndmask_b32_e32 v16, v23, v7, vcc -; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v7, v23 -; GFX9-NEXT: v_cndmask_b32_e32 v16, v31, v16, vcc -; GFX9-NEXT: v_cmp_class_f32_e64 vcc, v7, 64 -; GFX9-NEXT: v_cndmask_b32_e32 v7, v16, v7, vcc -; GFX9-NEXT: v_cmp_class_f32_e64 vcc, v23, 64 -; GFX9-NEXT: v_cndmask_b32_e32 v7, v7, v23, vcc -; GFX9-NEXT: v_cmp_eq_f32_e32 vcc, 0, v16 -; GFX9-NEXT: v_cndmask_b32_e32 v7, v16, v7, vcc -; GFX9-NEXT: v_cmp_gt_f32_e32 vcc, v8, v24 -; GFX9-NEXT: v_cndmask_b32_e32 v16, v24, v8, vcc -; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v8, v24 -; GFX9-NEXT: v_cndmask_b32_e32 v16, v31, v16, vcc -; GFX9-NEXT: v_cmp_class_f32_e64 vcc, v8, 64 -; GFX9-NEXT: v_cndmask_b32_e32 v8, v16, v8, vcc -; GFX9-NEXT: v_cmp_class_f32_e64 vcc, v24, 64 -; GFX9-NEXT: v_cndmask_b32_e32 v8, v8, v24, vcc -; GFX9-NEXT: v_cmp_eq_f32_e32 vcc, 0, v16 -; GFX9-NEXT: v_cndmask_b32_e32 v8, v16, v8, vcc -; GFX9-NEXT: v_cmp_gt_f32_e32 vcc, v9, v25 -; GFX9-NEXT: v_cndmask_b32_e32 v16, v25, v9, vcc -; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v9, v25 -; GFX9-NEXT: v_cndmask_b32_e32 v16, v31, v16, vcc -; GFX9-NEXT: v_cmp_class_f32_e64 vcc, v9, 64 -; GFX9-NEXT: v_cndmask_b32_e32 v9, v16, v9, vcc -; GFX9-NEXT: v_cmp_class_f32_e64 vcc, v25, 64 -; GFX9-NEXT: v_cndmask_b32_e32 v9, v9, v25, vcc -; GFX9-NEXT: v_cmp_eq_f32_e32 vcc, 0, v16 -; GFX9-NEXT: v_cndmask_b32_e32 v9, v16, v9, vcc -; GFX9-NEXT: v_cmp_gt_f32_e32 vcc, v10, v26 -; GFX9-NEXT: v_cndmask_b32_e32 v16, v26, v10, vcc -; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v10, v26 -; GFX9-NEXT: v_cndmask_b32_e32 v16, v31, v16, vcc -; GFX9-NEXT: v_cmp_class_f32_e64 vcc, v10, 64 -; GFX9-NEXT: v_cndmask_b32_e32 v10, v16, v10, vcc -; GFX9-NEXT: v_cmp_class_f32_e64 vcc, v26, 64 -; GFX9-NEXT: v_cndmask_b32_e32 v10, v10, v26, vcc -; GFX9-NEXT: v_cmp_eq_f32_e32 vcc, 0, v16 -; GFX9-NEXT: v_cndmask_b32_e32 v10, v16, v10, vcc -; GFX9-NEXT: 
v_cmp_gt_f32_e32 vcc, v11, v27 -; GFX9-NEXT: v_cndmask_b32_e32 v16, v27, v11, vcc -; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v11, v27 -; GFX9-NEXT: v_cndmask_b32_e32 v16, v31, v16, vcc -; GFX9-NEXT: v_cmp_class_f32_e64 vcc, v11, 64 -; GFX9-NEXT: v_cndmask_b32_e32 v11, v16, v11, vcc -; GFX9-NEXT: v_cmp_class_f32_e64 vcc, v27, 64 -; GFX9-NEXT: v_cndmask_b32_e32 v11, v11, v27, vcc -; GFX9-NEXT: v_cmp_eq_f32_e32 vcc, 0, v16 -; GFX9-NEXT: v_cndmask_b32_e32 v11, v16, v11, vcc -; GFX9-NEXT: v_cmp_gt_f32_e32 vcc, v12, v28 -; GFX9-NEXT: v_cndmask_b32_e32 v16, v28, v12, vcc -; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v12, v28 -; GFX9-NEXT: v_cndmask_b32_e32 v16, v31, v16, vcc -; GFX9-NEXT: v_cmp_class_f32_e64 vcc, v12, 64 -; GFX9-NEXT: v_cndmask_b32_e32 v12, v16, v12, vcc -; GFX9-NEXT: v_cmp_class_f32_e64 vcc, v28, 64 -; GFX9-NEXT: v_cndmask_b32_e32 v12, v12, v28, vcc -; GFX9-NEXT: v_cmp_eq_f32_e32 vcc, 0, v16 -; GFX9-NEXT: v_cndmask_b32_e32 v12, v16, v12, vcc -; GFX9-NEXT: v_cmp_gt_f32_e32 vcc, v13, v29 -; GFX9-NEXT: v_cndmask_b32_e32 v16, v29, v13, vcc -; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v13, v29 -; GFX9-NEXT: v_cndmask_b32_e32 v16, v31, v16, vcc -; GFX9-NEXT: v_cmp_class_f32_e64 vcc, v13, 64 -; GFX9-NEXT: v_cndmask_b32_e32 v13, v16, v13, vcc -; GFX9-NEXT: v_cmp_class_f32_e64 vcc, v29, 64 -; GFX9-NEXT: v_cndmask_b32_e32 v13, v13, v29, vcc -; GFX9-NEXT: v_cmp_eq_f32_e32 vcc, 0, v16 -; GFX9-NEXT: v_cndmask_b32_e32 v13, v16, v13, vcc -; GFX9-NEXT: v_cmp_gt_f32_e32 vcc, v14, v30 -; GFX9-NEXT: v_cndmask_b32_e32 v16, v30, v14, vcc -; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v14, v30 -; GFX9-NEXT: v_cndmask_b32_e32 v16, v31, v16, vcc -; GFX9-NEXT: v_cmp_class_f32_e64 vcc, v14, 64 -; GFX9-NEXT: v_cndmask_b32_e32 v14, v16, v14, vcc -; GFX9-NEXT: v_cmp_class_f32_e64 vcc, v30, 64 -; GFX9-NEXT: v_cndmask_b32_e32 v14, v14, v30, vcc -; GFX9-NEXT: v_cmp_eq_f32_e32 vcc, 0, v16 -; GFX9-NEXT: v_cndmask_b32_e32 v14, v16, v14, vcc +; GFX9-NEXT: v_max_f32_e32 v1, v1, v17 +; GFX9-NEXT: v_cmp_o_f32_e64 s[4:5], v2, v18 +; 
GFX9-NEXT: v_max_f32_e32 v2, v2, v18 +; GFX9-NEXT: v_mov_b32_e32 v17, 0x7fc00000 +; GFX9-NEXT: v_max_f32_e32 v18, v13, v29 +; GFX9-NEXT: v_cmp_o_f32_e64 s[28:29], v13, v29 +; GFX9-NEXT: v_writelane_b32 v31, s31, 1 +; GFX9-NEXT: v_cmp_o_f32_e64 s[6:7], v3, v19 +; GFX9-NEXT: v_max_f32_e32 v3, v3, v19 +; GFX9-NEXT: v_cmp_o_f32_e64 s[8:9], v4, v20 +; GFX9-NEXT: v_max_f32_e32 v4, v4, v20 +; GFX9-NEXT: v_cmp_o_f32_e64 s[10:11], v5, v21 +; GFX9-NEXT: v_max_f32_e32 v5, v5, v21 +; GFX9-NEXT: v_cmp_o_f32_e64 s[12:13], v6, v22 +; GFX9-NEXT: v_max_f32_e32 v6, v6, v22 +; GFX9-NEXT: v_cmp_o_f32_e64 s[14:15], v7, v23 +; GFX9-NEXT: v_max_f32_e32 v7, v7, v23 +; GFX9-NEXT: v_cmp_o_f32_e64 s[18:19], v8, v24 +; GFX9-NEXT: v_max_f32_e32 v8, v8, v24 +; GFX9-NEXT: v_cmp_o_f32_e64 s[20:21], v9, v25 +; GFX9-NEXT: v_max_f32_e32 v9, v9, v25 +; GFX9-NEXT: v_cmp_o_f32_e64 s[22:23], v10, v26 +; GFX9-NEXT: v_max_f32_e32 v10, v10, v26 +; GFX9-NEXT: v_cmp_o_f32_e64 s[24:25], v11, v27 +; GFX9-NEXT: v_max_f32_e32 v11, v11, v27 +; GFX9-NEXT: v_cmp_o_f32_e64 s[26:27], v12, v28 +; GFX9-NEXT: v_max_f32_e32 v12, v12, v28 +; GFX9-NEXT: v_max_f32_e32 v19, v14, v30 +; GFX9-NEXT: v_cmp_o_f32_e64 s[30:31], v14, v30 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v17, v1, vcc +; GFX9-NEXT: v_cndmask_b32_e64 v13, v17, v18, s[28:29] +; GFX9-NEXT: v_cndmask_b32_e64 v0, v17, v0, s[16:17] +; GFX9-NEXT: v_cndmask_b32_e64 v2, v17, v2, s[4:5] +; GFX9-NEXT: v_cndmask_b32_e64 v3, v17, v3, s[6:7] +; GFX9-NEXT: v_cndmask_b32_e64 v4, v17, v4, s[8:9] +; GFX9-NEXT: v_cndmask_b32_e64 v5, v17, v5, s[10:11] +; GFX9-NEXT: v_cndmask_b32_e64 v6, v17, v6, s[12:13] +; GFX9-NEXT: v_cndmask_b32_e64 v7, v17, v7, s[14:15] +; GFX9-NEXT: v_cndmask_b32_e64 v8, v17, v8, s[18:19] +; GFX9-NEXT: v_cndmask_b32_e64 v9, v17, v9, s[20:21] +; GFX9-NEXT: v_cndmask_b32_e64 v10, v17, v10, s[22:23] +; GFX9-NEXT: v_cndmask_b32_e64 v11, v17, v11, s[24:25] +; GFX9-NEXT: v_cndmask_b32_e64 v12, v17, v12, s[26:27] +; GFX9-NEXT: v_cndmask_b32_e64 v14, v17, v19, s[30:31] 
+; GFX9-NEXT: v_readlane_b32 s31, v31, 1 +; GFX9-NEXT: v_readlane_b32 s30, v31, 0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_max_f32_e32 v18, v15, v16 +; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v15, v16 +; GFX9-NEXT: v_cndmask_b32_e32 v15, v17, v18, vcc +; GFX9-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; GFX9-NEXT: s_mov_b64 exec, s[4:5] ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_cmp_gt_f32_e32 vcc, v15, v17 -; GFX9-NEXT: v_cndmask_b32_e32 v16, v17, v15, vcc -; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v15, v17 -; GFX9-NEXT: v_cndmask_b32_e32 v16, v31, v16, vcc -; GFX9-NEXT: v_cmp_class_f32_e64 vcc, v15, 64 -; GFX9-NEXT: v_cndmask_b32_e32 v15, v16, v15, vcc -; GFX9-NEXT: v_cmp_class_f32_e64 vcc, v17, 64 -; GFX9-NEXT: v_cndmask_b32_e32 v15, v15, v17, vcc -; GFX9-NEXT: v_cmp_eq_f32_e32 vcc, 0, v16 -; GFX9-NEXT: v_cndmask_b32_e32 v15, v16, v15, vcc ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: v_maximum_v16f32: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: scratch_load_dword v31, off, s32 -; GFX940-NEXT: v_cmp_gt_f32_e32 vcc, v0, v16 ; GFX940-NEXT: v_mov_b32_e32 v32, 0x7fc00000 -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e32 v33, v16, v0, vcc +; GFX940-NEXT: v_max_f32_e32 v33, v0, v16 ; GFX940-NEXT: v_cmp_o_f32_e32 vcc, v0, v16 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v33, v32, v33, vcc -; GFX940-NEXT: v_cmp_class_f32_e64 vcc, v0, 64 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v0, v33, v0, vcc -; GFX940-NEXT: v_cmp_class_f32_e64 vcc, v16, 64 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v0, v0, v16, vcc -; GFX940-NEXT: v_cmp_eq_f32_e32 vcc, 0, v33 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v0, v33, v0, vcc -; GFX940-NEXT: v_cmp_gt_f32_e32 vcc, v1, v17 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v16, v17, v1, vcc +; GFX940-NEXT: v_max_f32_e32 v34, v1, v17 +; 
GFX940-NEXT: v_max_f32_e32 v35, v2, v18 +; GFX940-NEXT: v_cndmask_b32_e32 v0, v32, v33, vcc ; GFX940-NEXT: v_cmp_o_f32_e32 vcc, v1, v17 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v16, v32, v16, vcc -; GFX940-NEXT: v_cmp_class_f32_e64 vcc, v1, 64 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v1, v16, v1, vcc -; GFX940-NEXT: v_cmp_class_f32_e64 vcc, v17, 64 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v1, v1, v17, vcc -; GFX940-NEXT: v_cmp_eq_f32_e32 vcc, 0, v16 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v1, v16, v1, vcc -; GFX940-NEXT: v_cmp_gt_f32_e32 vcc, v2, v18 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v16, v18, v2, vcc +; GFX940-NEXT: v_max_f32_e32 v36, v3, v19 +; GFX940-NEXT: v_max_f32_e32 v37, v4, v20 +; GFX940-NEXT: v_cndmask_b32_e32 v1, v32, v34, vcc ; GFX940-NEXT: v_cmp_o_f32_e32 vcc, v2, v18 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v16, v32, v16, vcc -; GFX940-NEXT: v_cmp_class_f32_e64 vcc, v2, 64 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v2, v16, v2, vcc -; GFX940-NEXT: v_cmp_class_f32_e64 vcc, v18, 64 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v2, v2, v18, vcc -; GFX940-NEXT: v_cmp_eq_f32_e32 vcc, 0, v16 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v2, v16, v2, vcc -; GFX940-NEXT: v_cmp_gt_f32_e32 vcc, v3, v19 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v16, v19, v3, vcc +; GFX940-NEXT: v_max_f32_e32 v38, v5, v21 +; GFX940-NEXT: v_max_f32_e32 v39, v6, v22 +; GFX940-NEXT: v_cndmask_b32_e32 v2, v32, v35, vcc ; GFX940-NEXT: v_cmp_o_f32_e32 vcc, v3, v19 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v16, v32, v16, vcc -; GFX940-NEXT: v_cmp_class_f32_e64 vcc, v3, 64 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v3, v16, v3, vcc -; GFX940-NEXT: v_cmp_class_f32_e64 vcc, v19, 64 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v3, v3, v19, vcc -; GFX940-NEXT: v_cmp_eq_f32_e32 vcc, 0, 
v16 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v3, v16, v3, vcc -; GFX940-NEXT: v_cmp_gt_f32_e32 vcc, v4, v20 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v16, v20, v4, vcc +; GFX940-NEXT: v_max_f32_e32 v48, v7, v23 +; GFX940-NEXT: v_max_f32_e32 v49, v8, v24 +; GFX940-NEXT: v_cndmask_b32_e32 v3, v32, v36, vcc ; GFX940-NEXT: v_cmp_o_f32_e32 vcc, v4, v20 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v16, v32, v16, vcc -; GFX940-NEXT: v_cmp_class_f32_e64 vcc, v4, 64 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v4, v16, v4, vcc -; GFX940-NEXT: v_cmp_class_f32_e64 vcc, v20, 64 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v4, v4, v20, vcc -; GFX940-NEXT: v_cmp_eq_f32_e32 vcc, 0, v16 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v4, v16, v4, vcc -; GFX940-NEXT: v_cmp_gt_f32_e32 vcc, v5, v21 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v16, v21, v5, vcc +; GFX940-NEXT: v_max_f32_e32 v50, v9, v25 +; GFX940-NEXT: v_max_f32_e32 v51, v10, v26 +; GFX940-NEXT: v_cndmask_b32_e32 v4, v32, v37, vcc ; GFX940-NEXT: v_cmp_o_f32_e32 vcc, v5, v21 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v16, v32, v16, vcc -; GFX940-NEXT: v_cmp_class_f32_e64 vcc, v5, 64 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v5, v16, v5, vcc -; GFX940-NEXT: v_cmp_class_f32_e64 vcc, v21, 64 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v5, v5, v21, vcc -; GFX940-NEXT: v_cmp_eq_f32_e32 vcc, 0, v16 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v5, v16, v5, vcc -; GFX940-NEXT: v_cmp_gt_f32_e32 vcc, v6, v22 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v16, v22, v6, vcc +; GFX940-NEXT: v_max_f32_e32 v52, v11, v27 +; GFX940-NEXT: v_max_f32_e32 v53, v12, v28 +; GFX940-NEXT: v_cndmask_b32_e32 v5, v32, v38, vcc ; GFX940-NEXT: v_cmp_o_f32_e32 vcc, v6, v22 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v16, v32, v16, vcc -; GFX940-NEXT: v_cmp_class_f32_e64 vcc, 
v6, 64 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v6, v16, v6, vcc -; GFX940-NEXT: v_cmp_class_f32_e64 vcc, v22, 64 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v6, v6, v22, vcc -; GFX940-NEXT: v_cmp_eq_f32_e32 vcc, 0, v16 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v6, v16, v6, vcc -; GFX940-NEXT: v_cmp_gt_f32_e32 vcc, v7, v23 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v16, v23, v7, vcc +; GFX940-NEXT: v_max_f32_e32 v54, v13, v29 +; GFX940-NEXT: v_max_f32_e32 v55, v14, v30 +; GFX940-NEXT: v_cndmask_b32_e32 v6, v32, v39, vcc ; GFX940-NEXT: v_cmp_o_f32_e32 vcc, v7, v23 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v16, v32, v16, vcc -; GFX940-NEXT: v_cmp_class_f32_e64 vcc, v7, 64 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v7, v16, v7, vcc -; GFX940-NEXT: v_cmp_class_f32_e64 vcc, v23, 64 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v7, v7, v23, vcc -; GFX940-NEXT: v_cmp_eq_f32_e32 vcc, 0, v16 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v7, v16, v7, vcc -; GFX940-NEXT: v_cmp_gt_f32_e32 vcc, v8, v24 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v16, v24, v8, vcc +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: v_max_f32_e32 v16, v15, v31 +; GFX940-NEXT: v_cndmask_b32_e32 v7, v32, v48, vcc ; GFX940-NEXT: v_cmp_o_f32_e32 vcc, v8, v24 ; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v16, v32, v16, vcc -; GFX940-NEXT: v_cmp_class_f32_e64 vcc, v8, 64 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v8, v16, v8, vcc -; GFX940-NEXT: v_cmp_class_f32_e64 vcc, v24, 64 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v8, v8, v24, vcc -; GFX940-NEXT: v_cmp_eq_f32_e32 vcc, 0, v16 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v8, v16, v8, vcc -; GFX940-NEXT: v_cmp_gt_f32_e32 vcc, v9, v25 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v16, v25, v9, vcc +; GFX940-NEXT: v_cndmask_b32_e32 v8, v32, v49, vcc ; 
GFX940-NEXT: v_cmp_o_f32_e32 vcc, v9, v25 ; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v16, v32, v16, vcc -; GFX940-NEXT: v_cmp_class_f32_e64 vcc, v9, 64 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v9, v16, v9, vcc -; GFX940-NEXT: v_cmp_class_f32_e64 vcc, v25, 64 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v9, v9, v25, vcc -; GFX940-NEXT: v_cmp_eq_f32_e32 vcc, 0, v16 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v9, v16, v9, vcc -; GFX940-NEXT: v_cmp_gt_f32_e32 vcc, v10, v26 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v16, v26, v10, vcc +; GFX940-NEXT: v_cndmask_b32_e32 v9, v32, v50, vcc ; GFX940-NEXT: v_cmp_o_f32_e32 vcc, v10, v26 ; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v16, v32, v16, vcc -; GFX940-NEXT: v_cmp_class_f32_e64 vcc, v10, 64 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v10, v16, v10, vcc -; GFX940-NEXT: v_cmp_class_f32_e64 vcc, v26, 64 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v10, v10, v26, vcc -; GFX940-NEXT: v_cmp_eq_f32_e32 vcc, 0, v16 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v10, v16, v10, vcc -; GFX940-NEXT: v_cmp_gt_f32_e32 vcc, v11, v27 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v16, v27, v11, vcc +; GFX940-NEXT: v_cndmask_b32_e32 v10, v32, v51, vcc ; GFX940-NEXT: v_cmp_o_f32_e32 vcc, v11, v27 ; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v16, v32, v16, vcc -; GFX940-NEXT: v_cmp_class_f32_e64 vcc, v11, 64 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v11, v16, v11, vcc -; GFX940-NEXT: v_cmp_class_f32_e64 vcc, v27, 64 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v11, v11, v27, vcc -; GFX940-NEXT: v_cmp_eq_f32_e32 vcc, 0, v16 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v11, v16, v11, vcc -; GFX940-NEXT: v_cmp_gt_f32_e32 vcc, v12, v28 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v16, v28, v12, vcc +; GFX940-NEXT: v_cndmask_b32_e32 v11, v32, 
v52, vcc ; GFX940-NEXT: v_cmp_o_f32_e32 vcc, v12, v28 ; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v16, v32, v16, vcc -; GFX940-NEXT: v_cmp_class_f32_e64 vcc, v12, 64 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v12, v16, v12, vcc -; GFX940-NEXT: v_cmp_class_f32_e64 vcc, v28, 64 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v12, v12, v28, vcc -; GFX940-NEXT: v_cmp_eq_f32_e32 vcc, 0, v16 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v12, v16, v12, vcc -; GFX940-NEXT: v_cmp_gt_f32_e32 vcc, v13, v29 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v16, v29, v13, vcc +; GFX940-NEXT: v_cndmask_b32_e32 v12, v32, v53, vcc ; GFX940-NEXT: v_cmp_o_f32_e32 vcc, v13, v29 ; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v16, v32, v16, vcc -; GFX940-NEXT: v_cmp_class_f32_e64 vcc, v13, 64 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v13, v16, v13, vcc -; GFX940-NEXT: v_cmp_class_f32_e64 vcc, v29, 64 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v13, v13, v29, vcc -; GFX940-NEXT: v_cmp_eq_f32_e32 vcc, 0, v16 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v13, v16, v13, vcc -; GFX940-NEXT: v_cmp_gt_f32_e32 vcc, v14, v30 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v16, v30, v14, vcc +; GFX940-NEXT: v_cndmask_b32_e32 v13, v32, v54, vcc ; GFX940-NEXT: v_cmp_o_f32_e32 vcc, v14, v30 ; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v16, v32, v16, vcc -; GFX940-NEXT: v_cmp_class_f32_e64 vcc, v14, 64 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v14, v16, v14, vcc -; GFX940-NEXT: v_cmp_class_f32_e64 vcc, v30, 64 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v14, v14, v30, vcc -; GFX940-NEXT: v_cmp_eq_f32_e32 vcc, 0, v16 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v14, v16, v14, vcc -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_cmp_gt_f32_e32 vcc, v15, v31 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v16, v31, 
v15, vcc +; GFX940-NEXT: v_cndmask_b32_e32 v14, v32, v55, vcc ; GFX940-NEXT: v_cmp_o_f32_e32 vcc, v15, v31 ; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v16, v32, v16, vcc -; GFX940-NEXT: v_cmp_class_f32_e64 vcc, v15, 64 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v15, v16, v15, vcc -; GFX940-NEXT: v_cmp_class_f32_e64 vcc, v31, 64 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v15, v15, v31, vcc -; GFX940-NEXT: v_cmp_eq_f32_e32 vcc, 0, v16 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v15, v16, v15, vcc +; GFX940-NEXT: v_cndmask_b32_e32 v15, v32, v16, vcc ; GFX940-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_maximum_v16f32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_cmp_gt_f32_e32 vcc_lo, v0, v16 ; GFX10-NEXT: buffer_load_dword v31, off, s[0:3], s32 -; GFX10-NEXT: v_cndmask_b32_e32 v32, v16, v0, vcc_lo -; GFX10-NEXT: v_cmp_gt_f32_e32 vcc_lo, v1, v17 -; GFX10-NEXT: v_cndmask_b32_e32 v33, v17, v1, vcc_lo -; GFX10-NEXT: v_cmp_gt_f32_e32 vcc_lo, v2, v18 -; GFX10-NEXT: v_cndmask_b32_e32 v34, v18, v2, vcc_lo -; GFX10-NEXT: v_cmp_gt_f32_e32 vcc_lo, v3, v19 -; GFX10-NEXT: v_cndmask_b32_e32 v35, v19, v3, vcc_lo -; GFX10-NEXT: v_cmp_gt_f32_e32 vcc_lo, v4, v20 -; GFX10-NEXT: v_cndmask_b32_e32 v36, v20, v4, vcc_lo -; GFX10-NEXT: v_cmp_gt_f32_e32 vcc_lo, v5, v21 -; GFX10-NEXT: v_cndmask_b32_e32 v37, v21, v5, vcc_lo -; GFX10-NEXT: v_cmp_gt_f32_e32 vcc_lo, v6, v22 -; GFX10-NEXT: v_cndmask_b32_e32 v38, v22, v6, vcc_lo -; GFX10-NEXT: v_cmp_gt_f32_e32 vcc_lo, v7, v23 -; GFX10-NEXT: v_cndmask_b32_e32 v39, v23, v7, vcc_lo -; GFX10-NEXT: v_cmp_gt_f32_e32 vcc_lo, v8, v24 -; GFX10-NEXT: v_cndmask_b32_e32 v48, v24, v8, vcc_lo -; GFX10-NEXT: v_cmp_gt_f32_e32 vcc_lo, v9, v25 -; GFX10-NEXT: v_cndmask_b32_e32 v49, v25, v9, vcc_lo -; GFX10-NEXT: v_cmp_gt_f32_e32 vcc_lo, v10, v26 -; GFX10-NEXT: v_cndmask_b32_e32 v50, v26, v10, vcc_lo -; GFX10-NEXT: v_cmp_gt_f32_e32 vcc_lo, v11, v27 -; GFX10-NEXT: 
v_cndmask_b32_e32 v51, v27, v11, vcc_lo -; GFX10-NEXT: v_cmp_gt_f32_e32 vcc_lo, v12, v28 -; GFX10-NEXT: v_cndmask_b32_e32 v52, v28, v12, vcc_lo -; GFX10-NEXT: v_cmp_gt_f32_e32 vcc_lo, v13, v29 -; GFX10-NEXT: v_cndmask_b32_e32 v53, v29, v13, vcc_lo -; GFX10-NEXT: v_cmp_gt_f32_e32 vcc_lo, v14, v30 -; GFX10-NEXT: v_cndmask_b32_e32 v54, v30, v14, vcc_lo +; GFX10-NEXT: v_max_f32_e32 v32, v0, v16 ; GFX10-NEXT: v_cmp_o_f32_e32 vcc_lo, v0, v16 -; GFX10-NEXT: v_cndmask_b32_e32 v32, 0x7fc00000, v32, vcc_lo +; GFX10-NEXT: v_max_f32_e32 v33, v1, v17 +; GFX10-NEXT: v_max_f32_e32 v34, v2, v18 +; GFX10-NEXT: v_max_f32_e32 v35, v3, v19 +; GFX10-NEXT: v_max_f32_e32 v36, v4, v20 +; GFX10-NEXT: v_cndmask_b32_e32 v0, 0x7fc00000, v32, vcc_lo ; GFX10-NEXT: v_cmp_o_f32_e32 vcc_lo, v1, v17 -; GFX10-NEXT: v_cndmask_b32_e32 v33, 0x7fc00000, v33, vcc_lo +; GFX10-NEXT: v_max_f32_e32 v37, v5, v21 +; GFX10-NEXT: v_max_f32_e32 v38, v6, v22 +; GFX10-NEXT: v_max_f32_e32 v39, v7, v23 +; GFX10-NEXT: v_max_f32_e32 v48, v8, v24 +; GFX10-NEXT: v_cndmask_b32_e32 v1, 0x7fc00000, v33, vcc_lo ; GFX10-NEXT: v_cmp_o_f32_e32 vcc_lo, v2, v18 -; GFX10-NEXT: v_cndmask_b32_e32 v34, 0x7fc00000, v34, vcc_lo +; GFX10-NEXT: v_max_f32_e32 v49, v9, v25 +; GFX10-NEXT: v_max_f32_e32 v50, v10, v26 +; GFX10-NEXT: v_max_f32_e32 v51, v11, v27 +; GFX10-NEXT: v_max_f32_e32 v52, v12, v28 +; GFX10-NEXT: v_cndmask_b32_e32 v2, 0x7fc00000, v34, vcc_lo ; GFX10-NEXT: v_cmp_o_f32_e32 vcc_lo, v3, v19 -; GFX10-NEXT: v_cndmask_b32_e32 v35, 0x7fc00000, v35, vcc_lo +; GFX10-NEXT: v_max_f32_e32 v53, v13, v29 +; GFX10-NEXT: v_max_f32_e32 v54, v14, v30 +; GFX10-NEXT: v_cndmask_b32_e32 v3, 0x7fc00000, v35, vcc_lo ; GFX10-NEXT: v_cmp_o_f32_e32 vcc_lo, v4, v20 -; GFX10-NEXT: v_cndmask_b32_e32 v36, 0x7fc00000, v36, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v4, 0x7fc00000, v36, vcc_lo ; GFX10-NEXT: v_cmp_o_f32_e32 vcc_lo, v5, v21 -; GFX10-NEXT: v_cndmask_b32_e32 v37, 0x7fc00000, v37, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v5, 0x7fc00000, v37, 
vcc_lo ; GFX10-NEXT: v_cmp_o_f32_e32 vcc_lo, v6, v22 -; GFX10-NEXT: v_cndmask_b32_e32 v38, 0x7fc00000, v38, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v6, 0x7fc00000, v38, vcc_lo ; GFX10-NEXT: v_cmp_o_f32_e32 vcc_lo, v7, v23 -; GFX10-NEXT: v_cndmask_b32_e32 v39, 0x7fc00000, v39, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v7, 0x7fc00000, v39, vcc_lo ; GFX10-NEXT: v_cmp_o_f32_e32 vcc_lo, v8, v24 -; GFX10-NEXT: v_cndmask_b32_e32 v48, 0x7fc00000, v48, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v8, 0x7fc00000, v48, vcc_lo ; GFX10-NEXT: v_cmp_o_f32_e32 vcc_lo, v9, v25 -; GFX10-NEXT: v_cndmask_b32_e32 v49, 0x7fc00000, v49, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v9, 0x7fc00000, v49, vcc_lo ; GFX10-NEXT: v_cmp_o_f32_e32 vcc_lo, v10, v26 -; GFX10-NEXT: v_cndmask_b32_e32 v50, 0x7fc00000, v50, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v10, 0x7fc00000, v50, vcc_lo ; GFX10-NEXT: v_cmp_o_f32_e32 vcc_lo, v11, v27 -; GFX10-NEXT: v_cndmask_b32_e32 v51, 0x7fc00000, v51, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v11, 0x7fc00000, v51, vcc_lo ; GFX10-NEXT: v_cmp_o_f32_e32 vcc_lo, v12, v28 -; GFX10-NEXT: v_cndmask_b32_e32 v52, 0x7fc00000, v52, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v12, 0x7fc00000, v52, vcc_lo ; GFX10-NEXT: v_cmp_o_f32_e32 vcc_lo, v13, v29 -; GFX10-NEXT: v_cndmask_b32_e32 v53, 0x7fc00000, v53, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v13, 0x7fc00000, v53, vcc_lo ; GFX10-NEXT: v_cmp_o_f32_e32 vcc_lo, v14, v30 -; GFX10-NEXT: v_cndmask_b32_e32 v54, 0x7fc00000, v54, vcc_lo -; GFX10-NEXT: v_cmp_class_f32_e64 vcc_lo, v0, 64 -; GFX10-NEXT: v_cndmask_b32_e32 v0, v32, v0, vcc_lo -; GFX10-NEXT: v_cmp_class_f32_e64 vcc_lo, v1, 64 -; GFX10-NEXT: v_cndmask_b32_e32 v1, v33, v1, vcc_lo -; GFX10-NEXT: v_cmp_class_f32_e64 vcc_lo, v2, 64 -; GFX10-NEXT: v_cndmask_b32_e32 v2, v34, v2, vcc_lo -; GFX10-NEXT: v_cmp_class_f32_e64 vcc_lo, v3, 64 -; GFX10-NEXT: v_cndmask_b32_e32 v3, v35, v3, vcc_lo -; GFX10-NEXT: v_cmp_class_f32_e64 vcc_lo, v4, 64 -; GFX10-NEXT: v_cndmask_b32_e32 v4, v36, v4, vcc_lo -; 
GFX10-NEXT: v_cmp_class_f32_e64 vcc_lo, v5, 64 -; GFX10-NEXT: v_cndmask_b32_e32 v5, v37, v5, vcc_lo -; GFX10-NEXT: v_cmp_class_f32_e64 vcc_lo, v6, 64 -; GFX10-NEXT: v_cndmask_b32_e32 v6, v38, v6, vcc_lo -; GFX10-NEXT: v_cmp_class_f32_e64 vcc_lo, v7, 64 -; GFX10-NEXT: v_cndmask_b32_e32 v7, v39, v7, vcc_lo -; GFX10-NEXT: v_cmp_class_f32_e64 vcc_lo, v8, 64 -; GFX10-NEXT: v_cndmask_b32_e32 v8, v48, v8, vcc_lo -; GFX10-NEXT: v_cmp_class_f32_e64 vcc_lo, v9, 64 -; GFX10-NEXT: v_cndmask_b32_e32 v9, v49, v9, vcc_lo -; GFX10-NEXT: v_cmp_class_f32_e64 vcc_lo, v10, 64 -; GFX10-NEXT: v_cndmask_b32_e32 v10, v50, v10, vcc_lo -; GFX10-NEXT: v_cmp_class_f32_e64 vcc_lo, v11, 64 -; GFX10-NEXT: v_cndmask_b32_e32 v11, v51, v11, vcc_lo -; GFX10-NEXT: v_cmp_class_f32_e64 vcc_lo, v12, 64 -; GFX10-NEXT: v_cndmask_b32_e32 v12, v52, v12, vcc_lo -; GFX10-NEXT: v_cmp_class_f32_e64 vcc_lo, v13, 64 -; GFX10-NEXT: v_cndmask_b32_e32 v13, v53, v13, vcc_lo -; GFX10-NEXT: v_cmp_class_f32_e64 vcc_lo, v14, 64 -; GFX10-NEXT: v_cndmask_b32_e32 v14, v54, v14, vcc_lo -; GFX10-NEXT: v_cmp_class_f32_e64 vcc_lo, v16, 64 -; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v16, vcc_lo -; GFX10-NEXT: v_cmp_class_f32_e64 vcc_lo, v17, 64 -; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v17, vcc_lo -; GFX10-NEXT: v_cmp_class_f32_e64 vcc_lo, v18, 64 -; GFX10-NEXT: v_cndmask_b32_e32 v2, v2, v18, vcc_lo -; GFX10-NEXT: v_cmp_class_f32_e64 vcc_lo, v19, 64 -; GFX10-NEXT: v_cndmask_b32_e32 v3, v3, v19, vcc_lo -; GFX10-NEXT: v_cmp_class_f32_e64 vcc_lo, v20, 64 -; GFX10-NEXT: v_cndmask_b32_e32 v4, v4, v20, vcc_lo -; GFX10-NEXT: v_cmp_class_f32_e64 vcc_lo, v21, 64 -; GFX10-NEXT: v_cndmask_b32_e32 v5, v5, v21, vcc_lo -; GFX10-NEXT: v_cmp_class_f32_e64 vcc_lo, v22, 64 -; GFX10-NEXT: v_cndmask_b32_e32 v6, v6, v22, vcc_lo -; GFX10-NEXT: v_cmp_class_f32_e64 vcc_lo, v23, 64 -; GFX10-NEXT: v_cndmask_b32_e32 v7, v7, v23, vcc_lo -; GFX10-NEXT: v_cmp_class_f32_e64 vcc_lo, v24, 64 -; GFX10-NEXT: v_cndmask_b32_e32 v8, v8, v24, vcc_lo -; GFX10-NEXT: 
v_cmp_class_f32_e64 vcc_lo, v25, 64 -; GFX10-NEXT: v_cndmask_b32_e32 v9, v9, v25, vcc_lo -; GFX10-NEXT: v_cmp_class_f32_e64 vcc_lo, v26, 64 -; GFX10-NEXT: v_cndmask_b32_e32 v10, v10, v26, vcc_lo -; GFX10-NEXT: v_cmp_class_f32_e64 vcc_lo, v27, 64 -; GFX10-NEXT: v_cndmask_b32_e32 v11, v11, v27, vcc_lo -; GFX10-NEXT: v_cmp_class_f32_e64 vcc_lo, v28, 64 -; GFX10-NEXT: v_cndmask_b32_e32 v12, v12, v28, vcc_lo -; GFX10-NEXT: v_cmp_class_f32_e64 vcc_lo, v29, 64 -; GFX10-NEXT: v_cndmask_b32_e32 v13, v13, v29, vcc_lo -; GFX10-NEXT: v_cmp_class_f32_e64 vcc_lo, v30, 64 -; GFX10-NEXT: v_cndmask_b32_e32 v14, v14, v30, vcc_lo -; GFX10-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v32 -; GFX10-NEXT: v_cndmask_b32_e32 v0, v32, v0, vcc_lo -; GFX10-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v33 -; GFX10-NEXT: v_cndmask_b32_e32 v1, v33, v1, vcc_lo -; GFX10-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v34 -; GFX10-NEXT: v_cndmask_b32_e32 v2, v34, v2, vcc_lo -; GFX10-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v35 -; GFX10-NEXT: v_cndmask_b32_e32 v3, v35, v3, vcc_lo -; GFX10-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v36 -; GFX10-NEXT: v_cndmask_b32_e32 v4, v36, v4, vcc_lo -; GFX10-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v37 -; GFX10-NEXT: v_cndmask_b32_e32 v5, v37, v5, vcc_lo -; GFX10-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v38 -; GFX10-NEXT: v_cndmask_b32_e32 v6, v38, v6, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v14, 0x7fc00000, v54, vcc_lo ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_cmp_gt_f32_e32 vcc_lo, v15, v31 -; GFX10-NEXT: v_cndmask_b32_e32 v16, v31, v15, vcc_lo -; GFX10-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v39 -; GFX10-NEXT: v_cndmask_b32_e32 v7, v39, v7, vcc_lo -; GFX10-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v48 -; GFX10-NEXT: v_cndmask_b32_e32 v8, v48, v8, vcc_lo +; GFX10-NEXT: v_max_f32_e32 v16, v15, v31 ; GFX10-NEXT: v_cmp_o_f32_e32 vcc_lo, v15, v31 -; GFX10-NEXT: v_cndmask_b32_e32 v16, 0x7fc00000, v16, vcc_lo -; GFX10-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v49 -; GFX10-NEXT: v_cndmask_b32_e32 v9, v49, v9, vcc_lo -; GFX10-NEXT: 
v_cmp_eq_f32_e32 vcc_lo, 0, v50 -; GFX10-NEXT: v_cndmask_b32_e32 v10, v50, v10, vcc_lo -; GFX10-NEXT: v_cmp_class_f32_e64 vcc_lo, v15, 64 -; GFX10-NEXT: v_cndmask_b32_e32 v15, v16, v15, vcc_lo -; GFX10-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v51 -; GFX10-NEXT: v_cndmask_b32_e32 v11, v51, v11, vcc_lo -; GFX10-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v52 -; GFX10-NEXT: v_cndmask_b32_e32 v12, v52, v12, vcc_lo -; GFX10-NEXT: v_cmp_class_f32_e64 vcc_lo, v31, 64 -; GFX10-NEXT: v_cndmask_b32_e32 v15, v15, v31, vcc_lo -; GFX10-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v53 -; GFX10-NEXT: v_cndmask_b32_e32 v13, v53, v13, vcc_lo -; GFX10-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v54 -; GFX10-NEXT: v_cndmask_b32_e32 v14, v54, v14, vcc_lo -; GFX10-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v16 -; GFX10-NEXT: v_cndmask_b32_e32 v15, v16, v15, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v15, 0x7fc00000, v16, vcc_lo ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: v_maximum_v16f32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_cmp_gt_f32_e32 vcc_lo, v0, v16 ; GFX11-NEXT: scratch_load_b32 v31, off, s32 -; GFX11-NEXT: v_cndmask_b32_e32 v32, v16, v0, vcc_lo -; GFX11-NEXT: v_cmp_gt_f32_e32 vcc_lo, v1, v17 -; GFX11-NEXT: v_cndmask_b32_e32 v33, v17, v1, vcc_lo -; GFX11-NEXT: v_cmp_gt_f32_e32 vcc_lo, v2, v18 -; GFX11-NEXT: v_cndmask_b32_e32 v34, v18, v2, vcc_lo -; GFX11-NEXT: v_cmp_gt_f32_e32 vcc_lo, v3, v19 -; GFX11-NEXT: v_cndmask_b32_e32 v35, v19, v3, vcc_lo -; GFX11-NEXT: v_cmp_gt_f32_e32 vcc_lo, v4, v20 -; GFX11-NEXT: v_cndmask_b32_e32 v36, v20, v4, vcc_lo -; GFX11-NEXT: v_cmp_gt_f32_e32 vcc_lo, v5, v21 -; GFX11-NEXT: v_cndmask_b32_e32 v37, v21, v5, vcc_lo -; GFX11-NEXT: v_cmp_gt_f32_e32 vcc_lo, v6, v22 -; GFX11-NEXT: v_cndmask_b32_e32 v38, v22, v6, vcc_lo -; GFX11-NEXT: v_cmp_gt_f32_e32 vcc_lo, v7, v23 -; GFX11-NEXT: v_cndmask_b32_e32 v39, v23, v7, vcc_lo -; GFX11-NEXT: v_cmp_gt_f32_e32 vcc_lo, v8, v24 -; GFX11-NEXT: v_cndmask_b32_e32 v48, v24, v8, vcc_lo -; GFX11-NEXT: 
v_cmp_gt_f32_e32 vcc_lo, v9, v25 -; GFX11-NEXT: v_cndmask_b32_e32 v49, v25, v9, vcc_lo -; GFX11-NEXT: v_cmp_gt_f32_e32 vcc_lo, v10, v26 -; GFX11-NEXT: v_cndmask_b32_e32 v50, v26, v10, vcc_lo -; GFX11-NEXT: v_cmp_gt_f32_e32 vcc_lo, v11, v27 -; GFX11-NEXT: v_cndmask_b32_e32 v51, v27, v11, vcc_lo -; GFX11-NEXT: v_cmp_gt_f32_e32 vcc_lo, v12, v28 -; GFX11-NEXT: v_cndmask_b32_e32 v52, v28, v12, vcc_lo -; GFX11-NEXT: v_cmp_gt_f32_e32 vcc_lo, v13, v29 -; GFX11-NEXT: v_cndmask_b32_e32 v53, v29, v13, vcc_lo -; GFX11-NEXT: v_cmp_gt_f32_e32 vcc_lo, v14, v30 -; GFX11-NEXT: v_cndmask_b32_e32 v54, v30, v14, vcc_lo +; GFX11-NEXT: v_dual_max_f32 v32, v0, v16 :: v_dual_max_f32 v33, v1, v17 ; GFX11-NEXT: v_cmp_o_f32_e32 vcc_lo, v0, v16 -; GFX11-NEXT: v_cndmask_b32_e32 v32, 0x7fc00000, v32, vcc_lo +; GFX11-NEXT: v_dual_max_f32 v34, v2, v18 :: v_dual_max_f32 v35, v3, v19 +; GFX11-NEXT: v_dual_max_f32 v36, v4, v20 :: v_dual_max_f32 v37, v5, v21 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-NEXT: v_cndmask_b32_e32 v0, 0x7fc00000, v32, vcc_lo ; GFX11-NEXT: v_cmp_o_f32_e32 vcc_lo, v1, v17 -; GFX11-NEXT: v_cndmask_b32_e32 v33, 0x7fc00000, v33, vcc_lo +; GFX11-NEXT: v_max_f32_e32 v54, v14, v30 +; GFX11-NEXT: v_dual_max_f32 v38, v6, v22 :: v_dual_max_f32 v39, v7, v23 +; GFX11-NEXT: v_dual_max_f32 v48, v8, v24 :: v_dual_max_f32 v49, v9, v25 +; GFX11-NEXT: v_cndmask_b32_e32 v1, 0x7fc00000, v33, vcc_lo ; GFX11-NEXT: v_cmp_o_f32_e32 vcc_lo, v2, v18 -; GFX11-NEXT: v_cndmask_b32_e32 v34, 0x7fc00000, v34, vcc_lo +; GFX11-NEXT: v_dual_max_f32 v50, v10, v26 :: v_dual_max_f32 v51, v11, v27 +; GFX11-NEXT: v_dual_max_f32 v52, v12, v28 :: v_dual_max_f32 v53, v13, v29 +; GFX11-NEXT: v_cndmask_b32_e32 v2, 0x7fc00000, v34, vcc_lo ; GFX11-NEXT: v_cmp_o_f32_e32 vcc_lo, v3, v19 -; GFX11-NEXT: v_cndmask_b32_e32 v35, 0x7fc00000, v35, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e32 v3, 0x7fc00000, v35, vcc_lo ; GFX11-NEXT: v_cmp_o_f32_e32 vcc_lo, v4, v20 -; GFX11-NEXT: v_cndmask_b32_e32 v36, 0x7fc00000, v36, 
vcc_lo +; GFX11-NEXT: v_cndmask_b32_e32 v4, 0x7fc00000, v36, vcc_lo ; GFX11-NEXT: v_cmp_o_f32_e32 vcc_lo, v5, v21 -; GFX11-NEXT: v_cndmask_b32_e32 v37, 0x7fc00000, v37, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e32 v5, 0x7fc00000, v37, vcc_lo ; GFX11-NEXT: v_cmp_o_f32_e32 vcc_lo, v6, v22 -; GFX11-NEXT: v_cndmask_b32_e32 v38, 0x7fc00000, v38, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e32 v6, 0x7fc00000, v38, vcc_lo ; GFX11-NEXT: v_cmp_o_f32_e32 vcc_lo, v7, v23 -; GFX11-NEXT: v_cndmask_b32_e32 v39, 0x7fc00000, v39, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e32 v7, 0x7fc00000, v39, vcc_lo ; GFX11-NEXT: v_cmp_o_f32_e32 vcc_lo, v8, v24 -; GFX11-NEXT: v_cndmask_b32_e32 v48, 0x7fc00000, v48, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e32 v8, 0x7fc00000, v48, vcc_lo ; GFX11-NEXT: v_cmp_o_f32_e32 vcc_lo, v9, v25 -; GFX11-NEXT: v_cndmask_b32_e32 v49, 0x7fc00000, v49, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e32 v9, 0x7fc00000, v49, vcc_lo ; GFX11-NEXT: v_cmp_o_f32_e32 vcc_lo, v10, v26 -; GFX11-NEXT: v_cndmask_b32_e32 v50, 0x7fc00000, v50, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e32 v10, 0x7fc00000, v50, vcc_lo ; GFX11-NEXT: v_cmp_o_f32_e32 vcc_lo, v11, v27 -; GFX11-NEXT: v_cndmask_b32_e32 v51, 0x7fc00000, v51, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e32 v11, 0x7fc00000, v51, vcc_lo ; GFX11-NEXT: v_cmp_o_f32_e32 vcc_lo, v12, v28 -; GFX11-NEXT: v_cndmask_b32_e32 v52, 0x7fc00000, v52, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e32 v12, 0x7fc00000, v52, vcc_lo ; GFX11-NEXT: v_cmp_o_f32_e32 vcc_lo, v13, v29 -; GFX11-NEXT: v_cndmask_b32_e32 v53, 0x7fc00000, v53, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e32 v13, 0x7fc00000, v53, vcc_lo ; GFX11-NEXT: v_cmp_o_f32_e32 vcc_lo, v14, v30 -; GFX11-NEXT: v_cndmask_b32_e32 v54, 0x7fc00000, v54, vcc_lo -; GFX11-NEXT: v_cmp_class_f32_e64 vcc_lo, v0, 64 -; GFX11-NEXT: v_cndmask_b32_e32 v0, v32, v0, vcc_lo -; GFX11-NEXT: v_cmp_class_f32_e64 vcc_lo, v1, 64 -; GFX11-NEXT: v_cndmask_b32_e32 v1, v33, v1, vcc_lo -; GFX11-NEXT: v_cmp_class_f32_e64 vcc_lo, v2, 64 -; GFX11-NEXT: v_cndmask_b32_e32 v2, 
v34, v2, vcc_lo -; GFX11-NEXT: v_cmp_class_f32_e64 vcc_lo, v3, 64 -; GFX11-NEXT: v_cndmask_b32_e32 v3, v35, v3, vcc_lo -; GFX11-NEXT: v_cmp_class_f32_e64 vcc_lo, v4, 64 -; GFX11-NEXT: v_cndmask_b32_e32 v4, v36, v4, vcc_lo -; GFX11-NEXT: v_cmp_class_f32_e64 vcc_lo, v5, 64 -; GFX11-NEXT: v_cndmask_b32_e32 v5, v37, v5, vcc_lo -; GFX11-NEXT: v_cmp_class_f32_e64 vcc_lo, v6, 64 -; GFX11-NEXT: v_cndmask_b32_e32 v6, v38, v6, vcc_lo -; GFX11-NEXT: v_cmp_class_f32_e64 vcc_lo, v7, 64 -; GFX11-NEXT: v_cndmask_b32_e32 v7, v39, v7, vcc_lo -; GFX11-NEXT: v_cmp_class_f32_e64 vcc_lo, v8, 64 -; GFX11-NEXT: v_cndmask_b32_e32 v8, v48, v8, vcc_lo -; GFX11-NEXT: v_cmp_class_f32_e64 vcc_lo, v9, 64 -; GFX11-NEXT: v_cndmask_b32_e32 v9, v49, v9, vcc_lo -; GFX11-NEXT: v_cmp_class_f32_e64 vcc_lo, v10, 64 -; GFX11-NEXT: v_cndmask_b32_e32 v10, v50, v10, vcc_lo -; GFX11-NEXT: v_cmp_class_f32_e64 vcc_lo, v11, 64 -; GFX11-NEXT: v_cndmask_b32_e32 v11, v51, v11, vcc_lo -; GFX11-NEXT: v_cmp_class_f32_e64 vcc_lo, v12, 64 -; GFX11-NEXT: v_cndmask_b32_e32 v12, v52, v12, vcc_lo -; GFX11-NEXT: v_cmp_class_f32_e64 vcc_lo, v13, 64 -; GFX11-NEXT: v_cndmask_b32_e32 v13, v53, v13, vcc_lo -; GFX11-NEXT: v_cmp_class_f32_e64 vcc_lo, v14, 64 -; GFX11-NEXT: v_cndmask_b32_e32 v14, v54, v14, vcc_lo -; GFX11-NEXT: v_cmp_class_f32_e64 vcc_lo, v16, 64 -; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v16, vcc_lo -; GFX11-NEXT: v_cmp_class_f32_e64 vcc_lo, v17, 64 -; GFX11-NEXT: v_cndmask_b32_e32 v1, v1, v17, vcc_lo -; GFX11-NEXT: v_cmp_class_f32_e64 vcc_lo, v18, 64 -; GFX11-NEXT: v_cndmask_b32_e32 v2, v2, v18, vcc_lo -; GFX11-NEXT: v_cmp_class_f32_e64 vcc_lo, v19, 64 -; GFX11-NEXT: v_cndmask_b32_e32 v3, v3, v19, vcc_lo -; GFX11-NEXT: v_cmp_class_f32_e64 vcc_lo, v20, 64 -; GFX11-NEXT: v_cndmask_b32_e32 v4, v4, v20, vcc_lo -; GFX11-NEXT: v_cmp_class_f32_e64 vcc_lo, v21, 64 -; GFX11-NEXT: v_cndmask_b32_e32 v5, v5, v21, vcc_lo -; GFX11-NEXT: v_cmp_class_f32_e64 vcc_lo, v22, 64 -; GFX11-NEXT: v_cndmask_b32_e32 v6, v6, v22, vcc_lo -; 
GFX11-NEXT: v_cmp_class_f32_e64 vcc_lo, v23, 64 -; GFX11-NEXT: v_cndmask_b32_e32 v7, v7, v23, vcc_lo -; GFX11-NEXT: v_cmp_class_f32_e64 vcc_lo, v24, 64 -; GFX11-NEXT: v_cndmask_b32_e32 v8, v8, v24, vcc_lo -; GFX11-NEXT: v_cmp_class_f32_e64 vcc_lo, v25, 64 -; GFX11-NEXT: v_cndmask_b32_e32 v9, v9, v25, vcc_lo -; GFX11-NEXT: v_cmp_class_f32_e64 vcc_lo, v26, 64 -; GFX11-NEXT: v_cndmask_b32_e32 v10, v10, v26, vcc_lo -; GFX11-NEXT: v_cmp_class_f32_e64 vcc_lo, v27, 64 -; GFX11-NEXT: v_cndmask_b32_e32 v11, v11, v27, vcc_lo -; GFX11-NEXT: v_cmp_class_f32_e64 vcc_lo, v28, 64 -; GFX11-NEXT: v_cndmask_b32_e32 v12, v12, v28, vcc_lo -; GFX11-NEXT: v_cmp_class_f32_e64 vcc_lo, v29, 64 -; GFX11-NEXT: v_cndmask_b32_e32 v13, v13, v29, vcc_lo -; GFX11-NEXT: v_cmp_class_f32_e64 vcc_lo, v30, 64 -; GFX11-NEXT: v_cndmask_b32_e32 v14, v14, v30, vcc_lo -; GFX11-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v32 -; GFX11-NEXT: v_cndmask_b32_e32 v0, v32, v0, vcc_lo -; GFX11-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v33 -; GFX11-NEXT: v_cndmask_b32_e32 v1, v33, v1, vcc_lo -; GFX11-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v34 -; GFX11-NEXT: v_cndmask_b32_e32 v2, v34, v2, vcc_lo -; GFX11-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v35 -; GFX11-NEXT: v_cndmask_b32_e32 v3, v35, v3, vcc_lo -; GFX11-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v36 -; GFX11-NEXT: v_cndmask_b32_e32 v4, v36, v4, vcc_lo -; GFX11-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v37 -; GFX11-NEXT: v_cndmask_b32_e32 v5, v37, v5, vcc_lo -; GFX11-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v38 -; GFX11-NEXT: v_cndmask_b32_e32 v6, v38, v6, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e32 v14, 0x7fc00000, v54, vcc_lo ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_cmp_gt_f32_e32 vcc_lo, v15, v31 -; GFX11-NEXT: v_cndmask_b32_e32 v16, v31, v15, vcc_lo -; GFX11-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v39 -; GFX11-NEXT: v_cndmask_b32_e32 v7, v39, v7, vcc_lo -; GFX11-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v48 -; GFX11-NEXT: v_cndmask_b32_e32 v8, v48, v8, vcc_lo +; GFX11-NEXT: v_max_f32_e32 v16, v15, v31 ; GFX11-NEXT: 
v_cmp_o_f32_e32 vcc_lo, v15, v31 -; GFX11-NEXT: v_cndmask_b32_e32 v16, 0x7fc00000, v16, vcc_lo -; GFX11-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v49 -; GFX11-NEXT: v_cndmask_b32_e32 v9, v49, v9, vcc_lo -; GFX11-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v50 -; GFX11-NEXT: v_cndmask_b32_e32 v10, v50, v10, vcc_lo -; GFX11-NEXT: v_cmp_class_f32_e64 vcc_lo, v15, 64 -; GFX11-NEXT: v_cndmask_b32_e32 v15, v16, v15, vcc_lo -; GFX11-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v51 -; GFX11-NEXT: v_cndmask_b32_e32 v11, v51, v11, vcc_lo -; GFX11-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v52 -; GFX11-NEXT: v_cndmask_b32_e32 v12, v52, v12, vcc_lo -; GFX11-NEXT: v_cmp_class_f32_e64 vcc_lo, v31, 64 -; GFX11-NEXT: v_cndmask_b32_e32 v15, v15, v31, vcc_lo -; GFX11-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v53 -; GFX11-NEXT: v_cndmask_b32_e32 v13, v53, v13, vcc_lo -; GFX11-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v54 -; GFX11-NEXT: v_cndmask_b32_e32 v14, v54, v14, vcc_lo -; GFX11-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v16 -; GFX11-NEXT: v_cndmask_b32_e32 v15, v16, v15, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: v_cndmask_b32_e32 v15, 0x7fc00000, v16, vcc_lo ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-LABEL: v_maximum_v16f32: diff --git a/llvm/test/CodeGen/AMDGPU/llvm.maximum.f64.ll b/llvm/test/CodeGen/AMDGPU/llvm.maximum.f64.ll index d60a28e..78fb231 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.maximum.f64.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.maximum.f64.ll @@ -530,221 +530,86 @@ define <2 x double> @v_maximum_v2f64(<2 x double> %src0, <2 x double> %src1) { ; GFX7-LABEL: v_maximum_v2f64: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_cmp_gt_f64_e32 vcc, v[0:1], v[4:5] -; GFX7-NEXT: v_cmp_o_f64_e64 s[4:5], v[0:1], v[4:5] -; GFX7-NEXT: v_mov_b32_e32 v10, 0x7ff80000 -; GFX7-NEXT: v_cmp_class_f64_e64 s[6:7], v[0:1], 64 -; GFX7-NEXT: v_cmp_class_f64_e64 s[8:9], v[4:5], 64 -; GFX7-NEXT: v_cndmask_b32_e32 v8, v5, v1, vcc -; GFX7-NEXT: v_cndmask_b32_e64 v9, v10, v8, s[4:5] -; 
GFX7-NEXT: v_cndmask_b32_e32 v8, v4, v0, vcc -; GFX7-NEXT: v_cmp_gt_f64_e32 vcc, v[2:3], v[6:7] -; GFX7-NEXT: v_cndmask_b32_e64 v8, 0, v8, s[4:5] -; GFX7-NEXT: v_cmp_o_f64_e64 s[4:5], v[2:3], v[6:7] -; GFX7-NEXT: v_cndmask_b32_e64 v0, v8, v0, s[6:7] -; GFX7-NEXT: v_cndmask_b32_e64 v0, v0, v4, s[8:9] -; GFX7-NEXT: v_cmp_eq_f64_e64 s[10:11], 0, v[8:9] -; GFX7-NEXT: v_cndmask_b32_e64 v1, v9, v1, s[6:7] -; GFX7-NEXT: v_cndmask_b32_e64 v1, v1, v5, s[8:9] -; GFX7-NEXT: v_cndmask_b32_e32 v11, v7, v3, vcc -; GFX7-NEXT: v_cndmask_b32_e32 v4, v6, v2, vcc -; GFX7-NEXT: v_cmp_class_f64_e64 vcc, v[2:3], 64 -; GFX7-NEXT: v_cndmask_b32_e64 v11, v10, v11, s[4:5] -; GFX7-NEXT: v_cndmask_b32_e64 v10, 0, v4, s[4:5] -; GFX7-NEXT: v_cmp_class_f64_e64 s[4:5], v[6:7], 64 -; GFX7-NEXT: v_cmp_eq_f64_e64 s[12:13], 0, v[10:11] -; GFX7-NEXT: v_cndmask_b32_e64 v0, v8, v0, s[10:11] -; GFX7-NEXT: v_cndmask_b32_e64 v1, v9, v1, s[10:11] -; GFX7-NEXT: v_cndmask_b32_e32 v2, v10, v2, vcc -; GFX7-NEXT: v_cndmask_b32_e32 v3, v11, v3, vcc -; GFX7-NEXT: v_cndmask_b32_e64 v2, v2, v6, s[4:5] -; GFX7-NEXT: v_cndmask_b32_e64 v3, v3, v7, s[4:5] -; GFX7-NEXT: v_cndmask_b32_e64 v2, v10, v2, s[12:13] -; GFX7-NEXT: v_cndmask_b32_e64 v3, v11, v3, s[12:13] +; GFX7-NEXT: v_max_f64 v[8:9], v[0:1], v[4:5] +; GFX7-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[4:5] +; GFX7-NEXT: v_max_f64 v[4:5], v[2:3], v[6:7] +; GFX7-NEXT: v_cmp_u_f64_e64 s[4:5], v[2:3], v[6:7] +; GFX7-NEXT: v_mov_b32_e32 v3, 0x7ff80000 +; GFX7-NEXT: v_cndmask_b32_e64 v0, v8, 0, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v1, v9, v3, vcc +; GFX7-NEXT: v_cndmask_b32_e64 v2, v4, 0, s[4:5] +; GFX7-NEXT: v_cndmask_b32_e64 v3, v5, v3, s[4:5] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_maximum_v2f64: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_cmp_gt_f64_e32 vcc, v[0:1], v[4:5] -; GFX8-NEXT: v_cmp_o_f64_e64 s[4:5], v[0:1], v[4:5] -; GFX8-NEXT: v_mov_b32_e32 v10, 0x7ff80000 -; GFX8-NEXT: v_cmp_class_f64_e64 s[6:7], v[0:1], 64 
-; GFX8-NEXT: v_cmp_class_f64_e64 s[8:9], v[4:5], 64 -; GFX8-NEXT: v_cndmask_b32_e32 v8, v5, v1, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v9, v10, v8, s[4:5] -; GFX8-NEXT: v_cndmask_b32_e32 v8, v4, v0, vcc -; GFX8-NEXT: v_cmp_gt_f64_e32 vcc, v[2:3], v[6:7] -; GFX8-NEXT: v_cndmask_b32_e64 v8, 0, v8, s[4:5] -; GFX8-NEXT: v_cmp_o_f64_e64 s[4:5], v[2:3], v[6:7] -; GFX8-NEXT: v_cndmask_b32_e64 v0, v8, v0, s[6:7] -; GFX8-NEXT: v_cndmask_b32_e64 v0, v0, v4, s[8:9] -; GFX8-NEXT: v_cmp_eq_f64_e64 s[10:11], 0, v[8:9] -; GFX8-NEXT: v_cndmask_b32_e64 v1, v9, v1, s[6:7] -; GFX8-NEXT: v_cndmask_b32_e64 v1, v1, v5, s[8:9] -; GFX8-NEXT: v_cndmask_b32_e32 v11, v7, v3, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v4, v6, v2, vcc -; GFX8-NEXT: v_cmp_class_f64_e64 vcc, v[2:3], 64 -; GFX8-NEXT: v_cndmask_b32_e64 v11, v10, v11, s[4:5] -; GFX8-NEXT: v_cndmask_b32_e64 v10, 0, v4, s[4:5] -; GFX8-NEXT: v_cmp_class_f64_e64 s[4:5], v[6:7], 64 -; GFX8-NEXT: v_cmp_eq_f64_e64 s[12:13], 0, v[10:11] -; GFX8-NEXT: v_cndmask_b32_e64 v0, v8, v0, s[10:11] -; GFX8-NEXT: v_cndmask_b32_e64 v1, v9, v1, s[10:11] -; GFX8-NEXT: v_cndmask_b32_e32 v2, v10, v2, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v3, v11, v3, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v2, v2, v6, s[4:5] -; GFX8-NEXT: v_cndmask_b32_e64 v3, v3, v7, s[4:5] -; GFX8-NEXT: v_cndmask_b32_e64 v2, v10, v2, s[12:13] -; GFX8-NEXT: v_cndmask_b32_e64 v3, v11, v3, s[12:13] +; GFX8-NEXT: v_max_f64 v[8:9], v[0:1], v[4:5] +; GFX8-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[4:5] +; GFX8-NEXT: v_max_f64 v[4:5], v[2:3], v[6:7] +; GFX8-NEXT: v_cmp_u_f64_e64 s[4:5], v[2:3], v[6:7] +; GFX8-NEXT: v_mov_b32_e32 v3, 0x7ff80000 +; GFX8-NEXT: v_cndmask_b32_e64 v0, v8, 0, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v1, v9, v3, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v2, v4, 0, s[4:5] +; GFX8-NEXT: v_cndmask_b32_e64 v3, v5, v3, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_maximum_v2f64: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_cmp_gt_f64_e32 vcc, v[0:1], 
v[4:5] -; GFX9-NEXT: v_cmp_o_f64_e64 s[4:5], v[0:1], v[4:5] -; GFX9-NEXT: v_mov_b32_e32 v10, 0x7ff80000 -; GFX9-NEXT: v_cmp_class_f64_e64 s[6:7], v[0:1], 64 -; GFX9-NEXT: v_cmp_class_f64_e64 s[8:9], v[4:5], 64 -; GFX9-NEXT: v_cndmask_b32_e32 v8, v5, v1, vcc -; GFX9-NEXT: v_cndmask_b32_e64 v9, v10, v8, s[4:5] -; GFX9-NEXT: v_cndmask_b32_e32 v8, v4, v0, vcc -; GFX9-NEXT: v_cmp_gt_f64_e32 vcc, v[2:3], v[6:7] -; GFX9-NEXT: v_cndmask_b32_e64 v8, 0, v8, s[4:5] -; GFX9-NEXT: v_cmp_o_f64_e64 s[4:5], v[2:3], v[6:7] -; GFX9-NEXT: v_cndmask_b32_e64 v0, v8, v0, s[6:7] -; GFX9-NEXT: v_cndmask_b32_e64 v0, v0, v4, s[8:9] -; GFX9-NEXT: v_cmp_eq_f64_e64 s[10:11], 0, v[8:9] -; GFX9-NEXT: v_cndmask_b32_e64 v1, v9, v1, s[6:7] -; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, v5, s[8:9] -; GFX9-NEXT: v_cndmask_b32_e32 v11, v7, v3, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v4, v6, v2, vcc -; GFX9-NEXT: v_cmp_class_f64_e64 vcc, v[2:3], 64 -; GFX9-NEXT: v_cndmask_b32_e64 v11, v10, v11, s[4:5] -; GFX9-NEXT: v_cndmask_b32_e64 v10, 0, v4, s[4:5] -; GFX9-NEXT: v_cmp_class_f64_e64 s[4:5], v[6:7], 64 -; GFX9-NEXT: v_cmp_eq_f64_e64 s[12:13], 0, v[10:11] -; GFX9-NEXT: v_cndmask_b32_e64 v0, v8, v0, s[10:11] -; GFX9-NEXT: v_cndmask_b32_e64 v1, v9, v1, s[10:11] -; GFX9-NEXT: v_cndmask_b32_e32 v2, v10, v2, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v3, v11, v3, vcc -; GFX9-NEXT: v_cndmask_b32_e64 v2, v2, v6, s[4:5] -; GFX9-NEXT: v_cndmask_b32_e64 v3, v3, v7, s[4:5] -; GFX9-NEXT: v_cndmask_b32_e64 v2, v10, v2, s[12:13] -; GFX9-NEXT: v_cndmask_b32_e64 v3, v11, v3, s[12:13] +; GFX9-NEXT: v_max_f64 v[8:9], v[0:1], v[4:5] +; GFX9-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[4:5] +; GFX9-NEXT: v_max_f64 v[4:5], v[2:3], v[6:7] +; GFX9-NEXT: v_cmp_u_f64_e64 s[4:5], v[2:3], v[6:7] +; GFX9-NEXT: v_mov_b32_e32 v3, 0x7ff80000 +; GFX9-NEXT: v_cndmask_b32_e64 v0, v8, 0, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v1, v9, v3, vcc +; GFX9-NEXT: v_cndmask_b32_e64 v2, v4, 0, s[4:5] +; GFX9-NEXT: v_cndmask_b32_e64 v3, v5, v3, s[4:5] ; GFX9-NEXT: s_setpc_b64 
s[30:31] ; ; GFX940-LABEL: v_maximum_v2f64: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_cmp_gt_f64_e32 vcc, v[0:1], v[4:5] -; GFX940-NEXT: v_mov_b32_e32 v10, 0x7ff80000 -; GFX940-NEXT: v_cmp_o_f64_e64 s[0:1], v[0:1], v[4:5] -; GFX940-NEXT: v_cndmask_b32_e32 v8, v5, v1, vcc +; GFX940-NEXT: v_max_f64 v[8:9], v[0:1], v[4:5] +; GFX940-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[4:5] +; GFX940-NEXT: v_max_f64 v[4:5], v[2:3], v[6:7] ; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e64 v9, v10, v8, s[0:1] -; GFX940-NEXT: v_cndmask_b32_e32 v8, v4, v0, vcc -; GFX940-NEXT: v_cndmask_b32_e64 v8, 0, v8, s[0:1] -; GFX940-NEXT: v_cmp_class_f64_e64 vcc, v[0:1], 64 -; GFX940-NEXT: v_cmp_class_f64_e64 s[0:1], v[4:5], 64 -; GFX940-NEXT: v_cmp_eq_f64_e64 s[2:3], 0, v[8:9] -; GFX940-NEXT: v_cndmask_b32_e32 v0, v8, v0, vcc -; GFX940-NEXT: v_cndmask_b32_e32 v1, v9, v1, vcc -; GFX940-NEXT: v_cmp_gt_f64_e32 vcc, v[2:3], v[6:7] -; GFX940-NEXT: v_cndmask_b32_e64 v0, v0, v4, s[0:1] -; GFX940-NEXT: v_cndmask_b32_e64 v1, v1, v5, s[0:1] -; GFX940-NEXT: v_cndmask_b32_e32 v4, v7, v3, vcc -; GFX940-NEXT: v_cmp_o_f64_e64 s[0:1], v[2:3], v[6:7] -; GFX940-NEXT: v_cndmask_b32_e64 v0, v8, v0, s[2:3] -; GFX940-NEXT: v_cndmask_b32_e64 v1, v9, v1, s[2:3] -; GFX940-NEXT: v_cndmask_b32_e64 v5, v10, v4, s[0:1] -; GFX940-NEXT: v_cndmask_b32_e32 v4, v6, v2, vcc -; GFX940-NEXT: v_cndmask_b32_e64 v4, 0, v4, s[0:1] -; GFX940-NEXT: v_cmp_class_f64_e64 vcc, v[2:3], 64 -; GFX940-NEXT: v_cmp_class_f64_e64 s[0:1], v[6:7], 64 -; GFX940-NEXT: v_cmp_eq_f64_e64 s[2:3], 0, v[4:5] -; GFX940-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc -; GFX940-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc -; GFX940-NEXT: v_cndmask_b32_e64 v2, v2, v6, s[0:1] -; GFX940-NEXT: v_cndmask_b32_e64 v3, v3, v7, s[0:1] -; GFX940-NEXT: v_cndmask_b32_e64 v2, v4, v2, s[2:3] -; GFX940-NEXT: v_cndmask_b32_e64 v3, v5, v3, s[2:3] +; GFX940-NEXT: v_cndmask_b32_e64 v0, v8, 0, vcc +; GFX940-NEXT: v_mov_b32_e32 v8, 0x7ff80000 
+; GFX940-NEXT: v_cndmask_b32_e32 v1, v9, v8, vcc +; GFX940-NEXT: v_cmp_u_f64_e32 vcc, v[2:3], v[6:7] +; GFX940-NEXT: s_nop 1 +; GFX940-NEXT: v_cndmask_b32_e64 v2, v4, 0, vcc +; GFX940-NEXT: v_cndmask_b32_e32 v3, v5, v8, vcc ; GFX940-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_maximum_v2f64: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_cmp_gt_f64_e32 vcc_lo, v[0:1], v[4:5] -; GFX10-NEXT: v_cmp_gt_f64_e64 s4, v[2:3], v[6:7] -; GFX10-NEXT: v_cmp_o_f64_e64 s5, v[0:1], v[4:5] -; GFX10-NEXT: v_cmp_o_f64_e64 s6, v[2:3], v[6:7] -; GFX10-NEXT: v_cndmask_b32_e32 v8, v5, v1, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v10, v7, v3, s4 -; GFX10-NEXT: v_cndmask_b32_e32 v12, v4, v0, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v13, v6, v2, s4 -; GFX10-NEXT: v_cmp_class_f64_e64 vcc_lo, v[0:1], 64 -; GFX10-NEXT: v_cmp_class_f64_e64 s4, v[2:3], 64 -; GFX10-NEXT: v_cndmask_b32_e64 v9, 0x7ff80000, v8, s5 -; GFX10-NEXT: v_cndmask_b32_e64 v11, 0x7ff80000, v10, s6 -; GFX10-NEXT: v_cndmask_b32_e64 v8, 0, v12, s5 -; GFX10-NEXT: v_cndmask_b32_e64 v10, 0, v13, s6 -; GFX10-NEXT: v_cmp_class_f64_e64 s5, v[4:5], 64 -; GFX10-NEXT: v_cmp_class_f64_e64 s6, v[6:7], 64 -; GFX10-NEXT: v_cmp_eq_f64_e64 s7, 0, v[8:9] -; GFX10-NEXT: v_cmp_eq_f64_e64 s8, 0, v[10:11] -; GFX10-NEXT: v_cndmask_b32_e32 v0, v8, v0, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v2, v10, v2, s4 -; GFX10-NEXT: v_cndmask_b32_e32 v1, v9, v1, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v3, v11, v3, s4 -; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, v4, s5 -; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, v6, s6 -; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, v5, s5 -; GFX10-NEXT: v_cndmask_b32_e64 v3, v3, v7, s6 -; GFX10-NEXT: v_cndmask_b32_e64 v0, v8, v0, s7 -; GFX10-NEXT: v_cndmask_b32_e64 v2, v10, v2, s8 -; GFX10-NEXT: v_cndmask_b32_e64 v1, v9, v1, s7 -; GFX10-NEXT: v_cndmask_b32_e64 v3, v11, v3, s8 +; GFX10-NEXT: v_max_f64 v[8:9], v[0:1], v[4:5] +; GFX10-NEXT: v_cmp_u_f64_e32 vcc_lo, v[0:1], v[4:5] +; GFX10-NEXT: v_max_f64 
v[4:5], v[2:3], v[6:7] +; GFX10-NEXT: v_cmp_u_f64_e64 s4, v[2:3], v[6:7] +; GFX10-NEXT: v_cndmask_b32_e64 v0, v8, 0, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v1, v9, 0x7ff80000, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v2, v4, 0, s4 +; GFX10-NEXT: v_cndmask_b32_e64 v3, v5, 0x7ff80000, s4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: v_maximum_v2f64: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_cmp_gt_f64_e32 vcc_lo, v[0:1], v[4:5] -; GFX11-NEXT: v_cmp_o_f64_e64 s1, v[0:1], v[4:5] -; GFX11-NEXT: v_cmp_gt_f64_e64 s0, v[2:3], v[6:7] -; GFX11-NEXT: v_cmp_o_f64_e64 s2, v[2:3], v[6:7] -; GFX11-NEXT: v_cndmask_b32_e32 v8, v5, v1, vcc_lo -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_cndmask_b32_e64 v10, v7, v3, s0 -; GFX11-NEXT: v_cndmask_b32_e64 v13, v6, v2, s0 -; GFX11-NEXT: v_cmp_class_f64_e64 s0, v[2:3], 64 -; GFX11-NEXT: v_cndmask_b32_e64 v9, 0x7ff80000, v8, s1 -; GFX11-NEXT: v_cndmask_b32_e32 v12, v4, v0, vcc_lo -; GFX11-NEXT: v_cmp_class_f64_e64 vcc_lo, v[0:1], 64 -; GFX11-NEXT: v_cndmask_b32_e64 v11, 0x7ff80000, v10, s2 -; GFX11-NEXT: v_cndmask_b32_e64 v10, 0, v13, s2 -; GFX11-NEXT: v_cmp_class_f64_e64 s2, v[6:7], 64 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-NEXT: v_cmp_eq_f64_e64 s4, 0, v[10:11] -; GFX11-NEXT: v_cndmask_b32_e64 v2, v10, v2, s0 -; GFX11-NEXT: v_cndmask_b32_e64 v3, v11, v3, s0 -; GFX11-NEXT: v_cndmask_b32_e32 v1, v9, v1, vcc_lo -; GFX11-NEXT: v_cndmask_b32_e64 v8, 0, v12, s1 -; GFX11-NEXT: v_cmp_class_f64_e64 s1, v[4:5], 64 -; GFX11-NEXT: v_cndmask_b32_e64 v2, v2, v6, s2 -; GFX11-NEXT: v_cndmask_b32_e64 v3, v3, v7, s2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_cmp_eq_f64_e64 s3, 0, v[8:9] -; GFX11-NEXT: v_cndmask_b32_e32 v0, v8, v0, vcc_lo -; GFX11-NEXT: v_cndmask_b32_e64 v2, v10, v2, s4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | 
instid1(VALU_DEP_3) -; GFX11-NEXT: v_cndmask_b32_e64 v3, v11, v3, s4 -; GFX11-NEXT: v_cndmask_b32_e64 v0, v0, v4, s1 -; GFX11-NEXT: v_cndmask_b32_e64 v1, v1, v5, s1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cndmask_b32_e64 v0, v8, v0, s3 -; GFX11-NEXT: v_cndmask_b32_e64 v1, v9, v1, s3 +; GFX11-NEXT: v_max_f64 v[8:9], v[0:1], v[4:5] +; GFX11-NEXT: v_cmp_u_f64_e32 vcc_lo, v[0:1], v[4:5] +; GFX11-NEXT: v_max_f64 v[4:5], v[2:3], v[6:7] +; GFX11-NEXT: v_cmp_u_f64_e64 s0, v[2:3], v[6:7] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_cndmask_b32_e64 v0, v8, 0, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e64 v1, v9, 0x7ff80000, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e64 v2, v4, 0, s0 +; GFX11-NEXT: v_cndmask_b32_e64 v3, v5, 0x7ff80000, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-LABEL: v_maximum_v2f64: @@ -765,182 +630,43 @@ define <2 x double> @v_maximum_v2f64__nnan(<2 x double> %src0, <2 x double> %src ; GFX7-LABEL: v_maximum_v2f64__nnan: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_cmp_gt_f64_e32 vcc, v[0:1], v[4:5] -; GFX7-NEXT: v_cmp_gt_f64_e64 s[4:5], v[2:3], v[6:7] -; GFX7-NEXT: v_cmp_class_f64_e64 s[6:7], v[4:5], 64 -; GFX7-NEXT: v_cmp_class_f64_e64 s[10:11], v[6:7], 64 -; GFX7-NEXT: v_cndmask_b32_e32 v9, v5, v1, vcc -; GFX7-NEXT: v_cndmask_b32_e32 v8, v4, v0, vcc -; GFX7-NEXT: v_cmp_class_f64_e64 vcc, v[0:1], 64 -; GFX7-NEXT: v_cndmask_b32_e64 v11, v7, v3, s[4:5] -; GFX7-NEXT: v_cndmask_b32_e64 v10, v6, v2, s[4:5] -; GFX7-NEXT: v_cmp_class_f64_e64 s[4:5], v[2:3], 64 -; GFX7-NEXT: v_cmp_eq_f64_e64 s[8:9], 0, v[8:9] -; GFX7-NEXT: v_cmp_eq_f64_e64 s[12:13], 0, v[10:11] -; GFX7-NEXT: v_cndmask_b32_e32 v0, v8, v0, vcc -; GFX7-NEXT: v_cndmask_b32_e32 v1, v9, v1, vcc -; GFX7-NEXT: v_cndmask_b32_e64 v0, v0, v4, s[6:7] -; GFX7-NEXT: v_cndmask_b32_e64 v2, v10, v2, s[4:5] -; GFX7-NEXT: v_cndmask_b32_e64 v3, v11, v3, s[4:5] 
-; GFX7-NEXT: v_cndmask_b32_e64 v2, v2, v6, s[10:11] -; GFX7-NEXT: v_cndmask_b32_e64 v1, v1, v5, s[6:7] -; GFX7-NEXT: v_cndmask_b32_e64 v3, v3, v7, s[10:11] -; GFX7-NEXT: v_cndmask_b32_e64 v0, v8, v0, s[8:9] -; GFX7-NEXT: v_cndmask_b32_e64 v1, v9, v1, s[8:9] -; GFX7-NEXT: v_cndmask_b32_e64 v2, v10, v2, s[12:13] -; GFX7-NEXT: v_cndmask_b32_e64 v3, v11, v3, s[12:13] +; GFX7-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5] +; GFX7-NEXT: v_max_f64 v[2:3], v[2:3], v[6:7] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_maximum_v2f64__nnan: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_cmp_gt_f64_e32 vcc, v[0:1], v[4:5] -; GFX8-NEXT: v_cmp_gt_f64_e64 s[4:5], v[2:3], v[6:7] -; GFX8-NEXT: v_cmp_class_f64_e64 s[6:7], v[4:5], 64 -; GFX8-NEXT: v_cmp_class_f64_e64 s[10:11], v[6:7], 64 -; GFX8-NEXT: v_cndmask_b32_e32 v9, v5, v1, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v8, v4, v0, vcc -; GFX8-NEXT: v_cmp_class_f64_e64 vcc, v[0:1], 64 -; GFX8-NEXT: v_cndmask_b32_e64 v11, v7, v3, s[4:5] -; GFX8-NEXT: v_cndmask_b32_e64 v10, v6, v2, s[4:5] -; GFX8-NEXT: v_cmp_class_f64_e64 s[4:5], v[2:3], 64 -; GFX8-NEXT: v_cmp_eq_f64_e64 s[8:9], 0, v[8:9] -; GFX8-NEXT: v_cmp_eq_f64_e64 s[12:13], 0, v[10:11] -; GFX8-NEXT: v_cndmask_b32_e32 v0, v8, v0, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v1, v9, v1, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v0, v0, v4, s[6:7] -; GFX8-NEXT: v_cndmask_b32_e64 v2, v10, v2, s[4:5] -; GFX8-NEXT: v_cndmask_b32_e64 v3, v11, v3, s[4:5] -; GFX8-NEXT: v_cndmask_b32_e64 v2, v2, v6, s[10:11] -; GFX8-NEXT: v_cndmask_b32_e64 v1, v1, v5, s[6:7] -; GFX8-NEXT: v_cndmask_b32_e64 v3, v3, v7, s[10:11] -; GFX8-NEXT: v_cndmask_b32_e64 v0, v8, v0, s[8:9] -; GFX8-NEXT: v_cndmask_b32_e64 v1, v9, v1, s[8:9] -; GFX8-NEXT: v_cndmask_b32_e64 v2, v10, v2, s[12:13] -; GFX8-NEXT: v_cndmask_b32_e64 v3, v11, v3, s[12:13] +; GFX8-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5] +; GFX8-NEXT: v_max_f64 v[2:3], v[2:3], v[6:7] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: 
v_maximum_v2f64__nnan: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_cmp_gt_f64_e32 vcc, v[0:1], v[4:5] -; GFX9-NEXT: v_cmp_gt_f64_e64 s[4:5], v[2:3], v[6:7] -; GFX9-NEXT: v_cmp_class_f64_e64 s[6:7], v[4:5], 64 -; GFX9-NEXT: v_cmp_class_f64_e64 s[10:11], v[6:7], 64 -; GFX9-NEXT: v_cndmask_b32_e32 v9, v5, v1, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v8, v4, v0, vcc -; GFX9-NEXT: v_cmp_class_f64_e64 vcc, v[0:1], 64 -; GFX9-NEXT: v_cndmask_b32_e64 v11, v7, v3, s[4:5] -; GFX9-NEXT: v_cndmask_b32_e64 v10, v6, v2, s[4:5] -; GFX9-NEXT: v_cmp_class_f64_e64 s[4:5], v[2:3], 64 -; GFX9-NEXT: v_cmp_eq_f64_e64 s[8:9], 0, v[8:9] -; GFX9-NEXT: v_cmp_eq_f64_e64 s[12:13], 0, v[10:11] -; GFX9-NEXT: v_cndmask_b32_e32 v0, v8, v0, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v1, v9, v1, vcc -; GFX9-NEXT: v_cndmask_b32_e64 v0, v0, v4, s[6:7] -; GFX9-NEXT: v_cndmask_b32_e64 v2, v10, v2, s[4:5] -; GFX9-NEXT: v_cndmask_b32_e64 v3, v11, v3, s[4:5] -; GFX9-NEXT: v_cndmask_b32_e64 v2, v2, v6, s[10:11] -; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, v5, s[6:7] -; GFX9-NEXT: v_cndmask_b32_e64 v3, v3, v7, s[10:11] -; GFX9-NEXT: v_cndmask_b32_e64 v0, v8, v0, s[8:9] -; GFX9-NEXT: v_cndmask_b32_e64 v1, v9, v1, s[8:9] -; GFX9-NEXT: v_cndmask_b32_e64 v2, v10, v2, s[12:13] -; GFX9-NEXT: v_cndmask_b32_e64 v3, v11, v3, s[12:13] +; GFX9-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5] +; GFX9-NEXT: v_max_f64 v[2:3], v[2:3], v[6:7] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: v_maximum_v2f64__nnan: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_cmp_gt_f64_e32 vcc, v[0:1], v[4:5] -; GFX940-NEXT: v_cmp_class_f64_e64 s[0:1], v[4:5], 64 -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e32 v9, v5, v1, vcc -; GFX940-NEXT: v_cndmask_b32_e32 v8, v4, v0, vcc -; GFX940-NEXT: v_cmp_class_f64_e64 vcc, v[0:1], 64 -; GFX940-NEXT: v_cmp_eq_f64_e64 s[2:3], 0, v[8:9] -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e32 v0, v8, v0, vcc -; GFX940-NEXT: 
v_cndmask_b32_e32 v1, v9, v1, vcc -; GFX940-NEXT: v_cmp_gt_f64_e32 vcc, v[2:3], v[6:7] -; GFX940-NEXT: v_cndmask_b32_e64 v0, v0, v4, s[0:1] -; GFX940-NEXT: v_cndmask_b32_e64 v1, v1, v5, s[0:1] -; GFX940-NEXT: v_cndmask_b32_e32 v5, v7, v3, vcc -; GFX940-NEXT: v_cndmask_b32_e32 v4, v6, v2, vcc -; GFX940-NEXT: v_cmp_class_f64_e64 vcc, v[2:3], 64 -; GFX940-NEXT: v_cmp_class_f64_e64 s[0:1], v[6:7], 64 -; GFX940-NEXT: v_cndmask_b32_e64 v0, v8, v0, s[2:3] -; GFX940-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc -; GFX940-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc -; GFX940-NEXT: v_cndmask_b32_e64 v1, v9, v1, s[2:3] -; GFX940-NEXT: v_cndmask_b32_e64 v2, v2, v6, s[0:1] -; GFX940-NEXT: v_cmp_eq_f64_e64 s[2:3], 0, v[4:5] -; GFX940-NEXT: v_cndmask_b32_e64 v3, v3, v7, s[0:1] -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e64 v2, v4, v2, s[2:3] -; GFX940-NEXT: v_cndmask_b32_e64 v3, v5, v3, s[2:3] +; GFX940-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5] +; GFX940-NEXT: v_max_f64 v[2:3], v[2:3], v[6:7] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_maximum_v2f64__nnan: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_cmp_gt_f64_e32 vcc_lo, v[0:1], v[4:5] -; GFX10-NEXT: v_cmp_gt_f64_e64 s4, v[2:3], v[6:7] -; GFX10-NEXT: v_cmp_class_f64_e64 s5, v[4:5], 64 -; GFX10-NEXT: v_cmp_class_f64_e64 s6, v[6:7], 64 -; GFX10-NEXT: v_cndmask_b32_e32 v9, v5, v1, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v11, v7, v3, s4 -; GFX10-NEXT: v_cndmask_b32_e32 v8, v4, v0, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v10, v6, v2, s4 -; GFX10-NEXT: v_cmp_class_f64_e64 vcc_lo, v[0:1], 64 -; GFX10-NEXT: v_cmp_class_f64_e64 s4, v[2:3], 64 -; GFX10-NEXT: v_cmp_eq_f64_e64 s7, 0, v[8:9] -; GFX10-NEXT: v_cmp_eq_f64_e64 s8, 0, v[10:11] -; GFX10-NEXT: v_cndmask_b32_e32 v0, v8, v0, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v2, v10, v2, s4 -; GFX10-NEXT: v_cndmask_b32_e32 v1, v9, v1, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v3, v11, v3, s4 -; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, v4, s5 -; 
GFX10-NEXT: v_cndmask_b32_e64 v2, v2, v6, s6 -; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, v5, s5 -; GFX10-NEXT: v_cndmask_b32_e64 v3, v3, v7, s6 -; GFX10-NEXT: v_cndmask_b32_e64 v0, v8, v0, s7 -; GFX10-NEXT: v_cndmask_b32_e64 v2, v10, v2, s8 -; GFX10-NEXT: v_cndmask_b32_e64 v1, v9, v1, s7 -; GFX10-NEXT: v_cndmask_b32_e64 v3, v11, v3, s8 +; GFX10-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5] +; GFX10-NEXT: v_max_f64 v[2:3], v[2:3], v[6:7] ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: v_maximum_v2f64__nnan: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_cmp_gt_f64_e32 vcc_lo, v[0:1], v[4:5] -; GFX11-NEXT: v_cmp_gt_f64_e64 s0, v[2:3], v[6:7] -; GFX11-NEXT: v_cmp_class_f64_e64 s1, v[4:5], 64 -; GFX11-NEXT: v_cmp_class_f64_e64 s2, v[6:7], 64 -; GFX11-NEXT: v_dual_cndmask_b32 v9, v5, v1 :: v_dual_cndmask_b32 v8, v4, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_cndmask_b32_e64 v11, v7, v3, s0 -; GFX11-NEXT: v_cndmask_b32_e64 v10, v6, v2, s0 -; GFX11-NEXT: v_cmp_class_f64_e64 vcc_lo, v[0:1], 64 -; GFX11-NEXT: v_cmp_class_f64_e64 s0, v[2:3], 64 -; GFX11-NEXT: v_cmp_eq_f64_e64 s3, 0, v[8:9] -; GFX11-NEXT: v_cmp_eq_f64_e64 s4, 0, v[10:11] -; GFX11-NEXT: v_dual_cndmask_b32 v0, v8, v0 :: v_dual_cndmask_b32 v1, v9, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_cndmask_b32_e64 v2, v10, v2, s0 -; GFX11-NEXT: v_cndmask_b32_e64 v3, v11, v3, s0 -; GFX11-NEXT: v_cndmask_b32_e64 v0, v0, v4, s1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_cndmask_b32_e64 v1, v1, v5, s1 -; GFX11-NEXT: v_cndmask_b32_e64 v2, v2, v6, s2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_cndmask_b32_e64 v3, v3, v7, s2 -; GFX11-NEXT: v_cndmask_b32_e64 v0, v8, v0, s3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | 
instid1(VALU_DEP_4) -; GFX11-NEXT: v_cndmask_b32_e64 v1, v9, v1, s3 -; GFX11-NEXT: v_cndmask_b32_e64 v2, v10, v2, s4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-NEXT: v_cndmask_b32_e64 v3, v11, v3, s4 +; GFX11-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5] +; GFX11-NEXT: v_max_f64 v[2:3], v[2:3], v[6:7] ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-LABEL: v_maximum_v2f64__nnan: @@ -961,111 +687,86 @@ define <2 x double> @v_maximum_v2f64__nsz(<2 x double> %src0, <2 x double> %src1 ; GFX7-LABEL: v_maximum_v2f64__nsz: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_cmp_gt_f64_e32 vcc, v[0:1], v[4:5] -; GFX7-NEXT: v_cmp_gt_f64_e64 s[6:7], v[2:3], v[6:7] -; GFX7-NEXT: v_cmp_o_f64_e64 s[4:5], v[0:1], v[4:5] -; GFX7-NEXT: v_cmp_o_f64_e64 s[8:9], v[2:3], v[6:7] -; GFX7-NEXT: v_cndmask_b32_e32 v8, v4, v0, vcc -; GFX7-NEXT: v_cndmask_b32_e64 v4, v6, v2, s[6:7] -; GFX7-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc -; GFX7-NEXT: v_mov_b32_e32 v5, 0x7ff80000 -; GFX7-NEXT: v_cndmask_b32_e64 v3, v7, v3, s[6:7] -; GFX7-NEXT: v_cndmask_b32_e64 v0, 0, v8, s[4:5] -; GFX7-NEXT: v_cndmask_b32_e64 v1, v5, v1, s[4:5] -; GFX7-NEXT: v_cndmask_b32_e64 v2, 0, v4, s[8:9] -; GFX7-NEXT: v_cndmask_b32_e64 v3, v5, v3, s[8:9] +; GFX7-NEXT: v_max_f64 v[8:9], v[0:1], v[4:5] +; GFX7-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[4:5] +; GFX7-NEXT: v_max_f64 v[4:5], v[2:3], v[6:7] +; GFX7-NEXT: v_cmp_u_f64_e64 s[4:5], v[2:3], v[6:7] +; GFX7-NEXT: v_mov_b32_e32 v3, 0x7ff80000 +; GFX7-NEXT: v_cndmask_b32_e64 v0, v8, 0, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v1, v9, v3, vcc +; GFX7-NEXT: v_cndmask_b32_e64 v2, v4, 0, s[4:5] +; GFX7-NEXT: v_cndmask_b32_e64 v3, v5, v3, s[4:5] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_maximum_v2f64__nsz: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_cmp_gt_f64_e32 vcc, v[0:1], v[4:5] -; GFX8-NEXT: v_cmp_gt_f64_e64 s[6:7], v[2:3], v[6:7] -; GFX8-NEXT: v_cmp_o_f64_e64 s[4:5], v[0:1], v[4:5] -; GFX8-NEXT: 
v_cmp_o_f64_e64 s[8:9], v[2:3], v[6:7] -; GFX8-NEXT: v_cndmask_b32_e32 v8, v4, v0, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v4, v6, v2, s[6:7] -; GFX8-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc -; GFX8-NEXT: v_mov_b32_e32 v5, 0x7ff80000 -; GFX8-NEXT: v_cndmask_b32_e64 v3, v7, v3, s[6:7] -; GFX8-NEXT: v_cndmask_b32_e64 v0, 0, v8, s[4:5] -; GFX8-NEXT: v_cndmask_b32_e64 v1, v5, v1, s[4:5] -; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, v4, s[8:9] -; GFX8-NEXT: v_cndmask_b32_e64 v3, v5, v3, s[8:9] +; GFX8-NEXT: v_max_f64 v[8:9], v[0:1], v[4:5] +; GFX8-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[4:5] +; GFX8-NEXT: v_max_f64 v[4:5], v[2:3], v[6:7] +; GFX8-NEXT: v_cmp_u_f64_e64 s[4:5], v[2:3], v[6:7] +; GFX8-NEXT: v_mov_b32_e32 v3, 0x7ff80000 +; GFX8-NEXT: v_cndmask_b32_e64 v0, v8, 0, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v1, v9, v3, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v2, v4, 0, s[4:5] +; GFX8-NEXT: v_cndmask_b32_e64 v3, v5, v3, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_maximum_v2f64__nsz: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_cmp_gt_f64_e32 vcc, v[0:1], v[4:5] -; GFX9-NEXT: v_cmp_gt_f64_e64 s[6:7], v[2:3], v[6:7] -; GFX9-NEXT: v_cmp_o_f64_e64 s[4:5], v[0:1], v[4:5] -; GFX9-NEXT: v_cmp_o_f64_e64 s[8:9], v[2:3], v[6:7] -; GFX9-NEXT: v_cndmask_b32_e32 v8, v4, v0, vcc -; GFX9-NEXT: v_cndmask_b32_e64 v4, v6, v2, s[6:7] -; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc -; GFX9-NEXT: v_mov_b32_e32 v5, 0x7ff80000 -; GFX9-NEXT: v_cndmask_b32_e64 v3, v7, v3, s[6:7] -; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, v8, s[4:5] -; GFX9-NEXT: v_cndmask_b32_e64 v1, v5, v1, s[4:5] -; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, v4, s[8:9] -; GFX9-NEXT: v_cndmask_b32_e64 v3, v5, v3, s[8:9] +; GFX9-NEXT: v_max_f64 v[8:9], v[0:1], v[4:5] +; GFX9-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[4:5] +; GFX9-NEXT: v_max_f64 v[4:5], v[2:3], v[6:7] +; GFX9-NEXT: v_cmp_u_f64_e64 s[4:5], v[2:3], v[6:7] +; GFX9-NEXT: v_mov_b32_e32 v3, 0x7ff80000 +; GFX9-NEXT: v_cndmask_b32_e64 v0, v8, 0, 
vcc +; GFX9-NEXT: v_cndmask_b32_e32 v1, v9, v3, vcc +; GFX9-NEXT: v_cndmask_b32_e64 v2, v4, 0, s[4:5] +; GFX9-NEXT: v_cndmask_b32_e64 v3, v5, v3, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: v_maximum_v2f64__nsz: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_cmp_gt_f64_e32 vcc, v[0:1], v[4:5] -; GFX940-NEXT: v_cmp_o_f64_e64 s[0:1], v[0:1], v[4:5] -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e32 v8, v4, v0, vcc -; GFX940-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc -; GFX940-NEXT: v_mov_b32_e32 v4, 0x7ff80000 -; GFX940-NEXT: v_cmp_gt_f64_e32 vcc, v[2:3], v[6:7] -; GFX940-NEXT: v_cndmask_b32_e64 v0, 0, v8, s[0:1] -; GFX940-NEXT: v_cndmask_b32_e64 v1, v4, v1, s[0:1] -; GFX940-NEXT: v_cndmask_b32_e32 v5, v6, v2, vcc -; GFX940-NEXT: v_cmp_o_f64_e64 s[0:1], v[2:3], v[6:7] -; GFX940-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc +; GFX940-NEXT: v_max_f64 v[8:9], v[0:1], v[4:5] +; GFX940-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[4:5] +; GFX940-NEXT: v_max_f64 v[4:5], v[2:3], v[6:7] ; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e64 v2, 0, v5, s[0:1] -; GFX940-NEXT: v_cndmask_b32_e64 v3, v4, v3, s[0:1] +; GFX940-NEXT: v_cndmask_b32_e64 v0, v8, 0, vcc +; GFX940-NEXT: v_mov_b32_e32 v8, 0x7ff80000 +; GFX940-NEXT: v_cndmask_b32_e32 v1, v9, v8, vcc +; GFX940-NEXT: v_cmp_u_f64_e32 vcc, v[2:3], v[6:7] +; GFX940-NEXT: s_nop 1 +; GFX940-NEXT: v_cndmask_b32_e64 v2, v4, 0, vcc +; GFX940-NEXT: v_cndmask_b32_e32 v3, v5, v8, vcc ; GFX940-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_maximum_v2f64__nsz: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_cmp_gt_f64_e32 vcc_lo, v[0:1], v[4:5] -; GFX10-NEXT: v_cmp_gt_f64_e64 s4, v[2:3], v[6:7] -; GFX10-NEXT: v_cmp_o_f64_e64 s5, v[0:1], v[4:5] -; GFX10-NEXT: v_cmp_o_f64_e64 s6, v[2:3], v[6:7] -; GFX10-NEXT: v_cndmask_b32_e32 v8, v4, v0, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v9, v6, v2, s4 -; GFX10-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc_lo -; 
GFX10-NEXT: v_cndmask_b32_e64 v3, v7, v3, s4 -; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, v8, s5 -; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, v9, s6 -; GFX10-NEXT: v_cndmask_b32_e64 v1, 0x7ff80000, v1, s5 -; GFX10-NEXT: v_cndmask_b32_e64 v3, 0x7ff80000, v3, s6 +; GFX10-NEXT: v_max_f64 v[8:9], v[0:1], v[4:5] +; GFX10-NEXT: v_cmp_u_f64_e32 vcc_lo, v[0:1], v[4:5] +; GFX10-NEXT: v_max_f64 v[4:5], v[2:3], v[6:7] +; GFX10-NEXT: v_cmp_u_f64_e64 s4, v[2:3], v[6:7] +; GFX10-NEXT: v_cndmask_b32_e64 v0, v8, 0, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v1, v9, 0x7ff80000, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v2, v4, 0, s4 +; GFX10-NEXT: v_cndmask_b32_e64 v3, v5, 0x7ff80000, s4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: v_maximum_v2f64__nsz: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_cmp_gt_f64_e32 vcc_lo, v[0:1], v[4:5] -; GFX11-NEXT: v_cmp_gt_f64_e64 s0, v[2:3], v[6:7] -; GFX11-NEXT: v_cmp_o_f64_e64 s1, v[0:1], v[4:5] -; GFX11-NEXT: v_cmp_o_f64_e64 s2, v[2:3], v[6:7] -; GFX11-NEXT: v_dual_cndmask_b32 v8, v4, v0 :: v_dual_cndmask_b32 v1, v5, v1 +; GFX11-NEXT: v_max_f64 v[8:9], v[0:1], v[4:5] +; GFX11-NEXT: v_cmp_u_f64_e32 vcc_lo, v[0:1], v[4:5] +; GFX11-NEXT: v_max_f64 v[4:5], v[2:3], v[6:7] +; GFX11-NEXT: v_cmp_u_f64_e64 s0, v[2:3], v[6:7] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_cndmask_b32_e64 v9, v6, v2, s0 -; GFX11-NEXT: v_cndmask_b32_e64 v3, v7, v3, s0 -; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, v8, s1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_cndmask_b32_e64 v1, 0x7ff80000, v1, s1 -; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, v9, s2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-NEXT: v_cndmask_b32_e64 v3, 0x7ff80000, v3, s2 +; GFX11-NEXT: v_cndmask_b32_e64 v0, v8, 0, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e64 v1, v9, 0x7ff80000, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e64 v2, v4, 0, s0 +; GFX11-NEXT: 
v_cndmask_b32_e64 v3, v5, 0x7ff80000, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-LABEL: v_maximum_v2f64__nsz: @@ -1086,69 +787,43 @@ define <2 x double> @v_maximum_v2f64__nnan_nsz(<2 x double> %src0, <2 x double> ; GFX7-LABEL: v_maximum_v2f64__nnan_nsz: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_cmp_gt_f64_e32 vcc, v[0:1], v[4:5] -; GFX7-NEXT: v_cmp_gt_f64_e64 s[4:5], v[2:3], v[6:7] -; GFX7-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc -; GFX7-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc -; GFX7-NEXT: v_cndmask_b32_e64 v2, v6, v2, s[4:5] -; GFX7-NEXT: v_cndmask_b32_e64 v3, v7, v3, s[4:5] +; GFX7-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5] +; GFX7-NEXT: v_max_f64 v[2:3], v[2:3], v[6:7] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_maximum_v2f64__nnan_nsz: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_cmp_gt_f64_e32 vcc, v[0:1], v[4:5] -; GFX8-NEXT: v_cmp_gt_f64_e64 s[4:5], v[2:3], v[6:7] -; GFX8-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v2, v6, v2, s[4:5] -; GFX8-NEXT: v_cndmask_b32_e64 v3, v7, v3, s[4:5] +; GFX8-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5] +; GFX8-NEXT: v_max_f64 v[2:3], v[2:3], v[6:7] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_maximum_v2f64__nnan_nsz: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_cmp_gt_f64_e32 vcc, v[0:1], v[4:5] -; GFX9-NEXT: v_cmp_gt_f64_e64 s[4:5], v[2:3], v[6:7] -; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc -; GFX9-NEXT: v_cndmask_b32_e64 v2, v6, v2, s[4:5] -; GFX9-NEXT: v_cndmask_b32_e64 v3, v7, v3, s[4:5] +; GFX9-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5] +; GFX9-NEXT: v_max_f64 v[2:3], v[2:3], v[6:7] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: v_maximum_v2f64__nnan_nsz: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_cmp_gt_f64_e32 
vcc, v[0:1], v[4:5] -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc -; GFX940-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc -; GFX940-NEXT: v_cmp_gt_f64_e32 vcc, v[2:3], v[6:7] -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc -; GFX940-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc +; GFX940-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5] +; GFX940-NEXT: v_max_f64 v[2:3], v[2:3], v[6:7] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_maximum_v2f64__nnan_nsz: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_cmp_gt_f64_e32 vcc_lo, v[0:1], v[4:5] -; GFX10-NEXT: v_cmp_gt_f64_e64 s4, v[2:3], v[6:7] -; GFX10-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v2, v6, v2, s4 -; GFX10-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v3, v7, v3, s4 +; GFX10-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5] +; GFX10-NEXT: v_max_f64 v[2:3], v[2:3], v[6:7] ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: v_maximum_v2f64__nnan_nsz: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_cmp_gt_f64_e32 vcc_lo, v[0:1], v[4:5] -; GFX11-NEXT: v_cmp_gt_f64_e64 s0, v[2:3], v[6:7] -; GFX11-NEXT: v_dual_cndmask_b32 v0, v4, v0 :: v_dual_cndmask_b32 v1, v5, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-NEXT: v_cndmask_b32_e64 v2, v6, v2, s0 -; GFX11-NEXT: v_cndmask_b32_e64 v3, v7, v3, s0 +; GFX11-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5] +; GFX11-NEXT: v_max_f64 v[2:3], v[2:3], v[6:7] ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-LABEL: v_maximum_v2f64__nnan_nsz: @@ -1170,61 +845,20 @@ define void @s_maximum_v2f64(<2 x double> inreg %src0, <2 x double> inreg %src1) ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s10 +; GFX7-NEXT: v_mov_b32_e32 v4, s8 ; GFX7-NEXT: v_mov_b32_e32 v1, s11 -; GFX7-NEXT: v_cmp_gt_f64_e32 vcc, s[6:7], v[0:1] -; GFX7-NEXT: v_cmp_o_f64_e64 s[12:13], 
s[6:7], v[0:1] -; GFX7-NEXT: v_cmp_class_f64_e64 s[18:19], s[10:11], 64 -; GFX7-NEXT: v_mov_b32_e32 v0, s8 -; GFX7-NEXT: v_mov_b32_e32 v1, s9 -; GFX7-NEXT: s_and_b64 s[14:15], vcc, exec -; GFX7-NEXT: s_cselect_b32 s16, s7, s11 -; GFX7-NEXT: s_and_b64 s[14:15], s[12:13], exec -; GFX7-NEXT: s_cselect_b32 s15, s16, 0x7ff80000 -; GFX7-NEXT: s_and_b64 s[16:17], vcc, exec -; GFX7-NEXT: s_cselect_b32 s14, s6, s10 -; GFX7-NEXT: v_cmp_class_f64_e64 s[16:17], s[6:7], 64 -; GFX7-NEXT: s_and_b64 s[12:13], s[12:13], exec -; GFX7-NEXT: s_cselect_b32 s14, s14, 0 -; GFX7-NEXT: v_cmp_eq_f64_e64 s[20:21], s[14:15], 0 -; GFX7-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1] -; GFX7-NEXT: s_and_b64 s[12:13], s[16:17], exec -; GFX7-NEXT: s_cselect_b32 s7, s7, s15 -; GFX7-NEXT: s_and_b64 s[12:13], s[18:19], exec -; GFX7-NEXT: s_cselect_b32 s7, s11, s7 -; GFX7-NEXT: s_and_b64 s[12:13], s[20:21], exec -; GFX7-NEXT: s_cselect_b32 s7, s7, s15 -; GFX7-NEXT: s_and_b64 s[12:13], s[16:17], exec -; GFX7-NEXT: v_cmp_o_f64_e64 s[12:13], s[4:5], v[0:1] -; GFX7-NEXT: s_cselect_b32 s6, s6, s14 -; GFX7-NEXT: s_and_b64 s[16:17], s[18:19], exec -; GFX7-NEXT: s_cselect_b32 s6, s10, s6 -; GFX7-NEXT: s_and_b64 s[10:11], s[20:21], exec -; GFX7-NEXT: s_cselect_b32 s6, s6, s14 -; GFX7-NEXT: s_and_b64 s[10:11], vcc, exec -; GFX7-NEXT: s_cselect_b32 s14, s5, s9 -; GFX7-NEXT: s_and_b64 s[10:11], s[12:13], exec -; GFX7-NEXT: s_cselect_b32 s11, s14, 0x7ff80000 -; GFX7-NEXT: s_and_b64 s[14:15], vcc, exec -; GFX7-NEXT: s_cselect_b32 s10, s4, s8 -; GFX7-NEXT: v_cmp_class_f64_e64 s[14:15], s[4:5], 64 -; GFX7-NEXT: s_and_b64 s[12:13], s[12:13], exec -; GFX7-NEXT: v_cmp_class_f64_e64 s[12:13], s[8:9], 64 -; GFX7-NEXT: s_cselect_b32 s10, s10, 0 -; GFX7-NEXT: v_cmp_eq_f64_e64 s[16:17], s[10:11], 0 -; GFX7-NEXT: s_and_b64 s[18:19], s[14:15], exec -; GFX7-NEXT: s_cselect_b32 s5, s5, s11 -; GFX7-NEXT: s_and_b64 s[18:19], s[12:13], exec -; GFX7-NEXT: s_cselect_b32 s5, s9, s5 -; GFX7-NEXT: s_and_b64 s[18:19], s[16:17], exec -; 
GFX7-NEXT: s_cselect_b32 s5, s5, s11 -; GFX7-NEXT: s_and_b64 s[14:15], s[14:15], exec -; GFX7-NEXT: s_cselect_b32 s4, s4, s10 -; GFX7-NEXT: s_and_b64 s[12:13], s[12:13], exec -; GFX7-NEXT: s_cselect_b32 s4, s8, s4 -; GFX7-NEXT: s_and_b64 s[8:9], s[16:17], exec -; GFX7-NEXT: s_cselect_b32 s4, s4, s10 +; GFX7-NEXT: v_mov_b32_e32 v5, s9 +; GFX7-NEXT: v_max_f64 v[2:3], s[6:7], v[0:1] +; GFX7-NEXT: v_cmp_u_f64_e32 vcc, s[6:7], v[0:1] +; GFX7-NEXT: v_max_f64 v[0:1], s[4:5], v[4:5] +; GFX7-NEXT: v_cmp_u_f64_e64 s[4:5], s[4:5], v[4:5] +; GFX7-NEXT: v_mov_b32_e32 v6, 0x7ff80000 +; GFX7-NEXT: v_cndmask_b32_e32 v3, v3, v6, vcc +; GFX7-NEXT: v_cndmask_b32_e64 v2, v2, 0, vcc +; GFX7-NEXT: v_cndmask_b32_e64 v1, v1, v6, s[4:5] +; GFX7-NEXT: v_cndmask_b32_e64 v0, v0, 0, s[4:5] ; GFX7-NEXT: ;;#ASMSTART -; GFX7-NEXT: ; use s[4:7] +; GFX7-NEXT: ; use v[0:3] ; GFX7-NEXT: ;;#ASMEND ; GFX7-NEXT: s_setpc_b64 s[30:31] ; @@ -1232,61 +866,20 @@ define void @s_maximum_v2f64(<2 x double> inreg %src0, <2 x double> inreg %src1) ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s10 +; GFX8-NEXT: v_mov_b32_e32 v4, s8 ; GFX8-NEXT: v_mov_b32_e32 v1, s11 -; GFX8-NEXT: v_cmp_gt_f64_e32 vcc, s[6:7], v[0:1] -; GFX8-NEXT: v_cmp_o_f64_e64 s[12:13], s[6:7], v[0:1] -; GFX8-NEXT: v_cmp_class_f64_e64 s[18:19], s[10:11], 64 -; GFX8-NEXT: v_mov_b32_e32 v0, s8 -; GFX8-NEXT: v_mov_b32_e32 v1, s9 -; GFX8-NEXT: s_and_b64 s[14:15], vcc, exec -; GFX8-NEXT: s_cselect_b32 s16, s7, s11 -; GFX8-NEXT: s_and_b64 s[14:15], s[12:13], exec -; GFX8-NEXT: s_cselect_b32 s15, s16, 0x7ff80000 -; GFX8-NEXT: s_and_b64 s[16:17], vcc, exec -; GFX8-NEXT: s_cselect_b32 s14, s6, s10 -; GFX8-NEXT: v_cmp_class_f64_e64 s[16:17], s[6:7], 64 -; GFX8-NEXT: s_and_b64 s[12:13], s[12:13], exec -; GFX8-NEXT: s_cselect_b32 s14, s14, 0 -; GFX8-NEXT: v_cmp_eq_f64_e64 s[20:21], s[14:15], 0 -; GFX8-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1] -; GFX8-NEXT: s_and_b64 s[12:13], s[16:17], exec -; 
GFX8-NEXT: s_cselect_b32 s7, s7, s15 -; GFX8-NEXT: s_and_b64 s[12:13], s[18:19], exec -; GFX8-NEXT: s_cselect_b32 s7, s11, s7 -; GFX8-NEXT: s_and_b64 s[12:13], s[20:21], exec -; GFX8-NEXT: s_cselect_b32 s7, s7, s15 -; GFX8-NEXT: s_and_b64 s[12:13], s[16:17], exec -; GFX8-NEXT: v_cmp_o_f64_e64 s[12:13], s[4:5], v[0:1] -; GFX8-NEXT: s_cselect_b32 s6, s6, s14 -; GFX8-NEXT: s_and_b64 s[16:17], s[18:19], exec -; GFX8-NEXT: s_cselect_b32 s6, s10, s6 -; GFX8-NEXT: s_and_b64 s[10:11], s[20:21], exec -; GFX8-NEXT: s_cselect_b32 s6, s6, s14 -; GFX8-NEXT: s_and_b64 s[10:11], vcc, exec -; GFX8-NEXT: s_cselect_b32 s14, s5, s9 -; GFX8-NEXT: s_and_b64 s[10:11], s[12:13], exec -; GFX8-NEXT: s_cselect_b32 s11, s14, 0x7ff80000 -; GFX8-NEXT: s_and_b64 s[14:15], vcc, exec -; GFX8-NEXT: s_cselect_b32 s10, s4, s8 -; GFX8-NEXT: v_cmp_class_f64_e64 s[14:15], s[4:5], 64 -; GFX8-NEXT: s_and_b64 s[12:13], s[12:13], exec -; GFX8-NEXT: v_cmp_class_f64_e64 s[12:13], s[8:9], 64 -; GFX8-NEXT: s_cselect_b32 s10, s10, 0 -; GFX8-NEXT: v_cmp_eq_f64_e64 s[16:17], s[10:11], 0 -; GFX8-NEXT: s_and_b64 s[18:19], s[14:15], exec -; GFX8-NEXT: s_cselect_b32 s5, s5, s11 -; GFX8-NEXT: s_and_b64 s[18:19], s[12:13], exec -; GFX8-NEXT: s_cselect_b32 s5, s9, s5 -; GFX8-NEXT: s_and_b64 s[18:19], s[16:17], exec -; GFX8-NEXT: s_cselect_b32 s5, s5, s11 -; GFX8-NEXT: s_and_b64 s[14:15], s[14:15], exec -; GFX8-NEXT: s_cselect_b32 s4, s4, s10 -; GFX8-NEXT: s_and_b64 s[12:13], s[12:13], exec -; GFX8-NEXT: s_cselect_b32 s4, s8, s4 -; GFX8-NEXT: s_and_b64 s[8:9], s[16:17], exec -; GFX8-NEXT: s_cselect_b32 s4, s4, s10 +; GFX8-NEXT: v_mov_b32_e32 v5, s9 +; GFX8-NEXT: v_max_f64 v[2:3], s[6:7], v[0:1] +; GFX8-NEXT: v_cmp_u_f64_e32 vcc, s[6:7], v[0:1] +; GFX8-NEXT: v_max_f64 v[0:1], s[4:5], v[4:5] +; GFX8-NEXT: v_cmp_u_f64_e64 s[4:5], s[4:5], v[4:5] +; GFX8-NEXT: v_mov_b32_e32 v6, 0x7ff80000 +; GFX8-NEXT: v_cndmask_b32_e32 v3, v3, v6, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v2, v2, 0, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v1, v1, v6, 
s[4:5] +; GFX8-NEXT: v_cndmask_b32_e64 v0, v0, 0, s[4:5] ; GFX8-NEXT: ;;#ASMSTART -; GFX8-NEXT: ; use s[4:7] +; GFX8-NEXT: ; use v[0:3] ; GFX8-NEXT: ;;#ASMEND ; GFX8-NEXT: s_setpc_b64 s[30:31] ; @@ -1294,61 +887,20 @@ define void @s_maximum_v2f64(<2 x double> inreg %src0, <2 x double> inreg %src1) ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s10 +; GFX9-NEXT: v_mov_b32_e32 v4, s8 ; GFX9-NEXT: v_mov_b32_e32 v1, s11 -; GFX9-NEXT: v_cmp_gt_f64_e32 vcc, s[6:7], v[0:1] -; GFX9-NEXT: v_cmp_o_f64_e64 s[12:13], s[6:7], v[0:1] -; GFX9-NEXT: v_cmp_class_f64_e64 s[18:19], s[10:11], 64 -; GFX9-NEXT: v_mov_b32_e32 v0, s8 -; GFX9-NEXT: v_mov_b32_e32 v1, s9 -; GFX9-NEXT: s_and_b64 s[14:15], vcc, exec -; GFX9-NEXT: s_cselect_b32 s16, s7, s11 -; GFX9-NEXT: s_and_b64 s[14:15], s[12:13], exec -; GFX9-NEXT: s_cselect_b32 s15, s16, 0x7ff80000 -; GFX9-NEXT: s_and_b64 s[16:17], vcc, exec -; GFX9-NEXT: s_cselect_b32 s14, s6, s10 -; GFX9-NEXT: v_cmp_class_f64_e64 s[16:17], s[6:7], 64 -; GFX9-NEXT: s_and_b64 s[12:13], s[12:13], exec -; GFX9-NEXT: s_cselect_b32 s14, s14, 0 -; GFX9-NEXT: v_cmp_eq_f64_e64 s[20:21], s[14:15], 0 -; GFX9-NEXT: v_cmp_gt_f64_e32 vcc, s[4:5], v[0:1] -; GFX9-NEXT: s_and_b64 s[12:13], s[16:17], exec -; GFX9-NEXT: s_cselect_b32 s7, s7, s15 -; GFX9-NEXT: s_and_b64 s[12:13], s[18:19], exec -; GFX9-NEXT: s_cselect_b32 s7, s11, s7 -; GFX9-NEXT: s_and_b64 s[12:13], s[20:21], exec -; GFX9-NEXT: s_cselect_b32 s7, s7, s15 -; GFX9-NEXT: s_and_b64 s[12:13], s[16:17], exec -; GFX9-NEXT: v_cmp_o_f64_e64 s[12:13], s[4:5], v[0:1] -; GFX9-NEXT: s_cselect_b32 s6, s6, s14 -; GFX9-NEXT: s_and_b64 s[16:17], s[18:19], exec -; GFX9-NEXT: s_cselect_b32 s6, s10, s6 -; GFX9-NEXT: s_and_b64 s[10:11], s[20:21], exec -; GFX9-NEXT: s_cselect_b32 s6, s6, s14 -; GFX9-NEXT: s_and_b64 s[10:11], vcc, exec -; GFX9-NEXT: s_cselect_b32 s14, s5, s9 -; GFX9-NEXT: s_and_b64 s[10:11], s[12:13], exec -; GFX9-NEXT: s_cselect_b32 s11, s14, 0x7ff80000 -; 
GFX9-NEXT: s_and_b64 s[14:15], vcc, exec -; GFX9-NEXT: s_cselect_b32 s10, s4, s8 -; GFX9-NEXT: v_cmp_class_f64_e64 s[14:15], s[4:5], 64 -; GFX9-NEXT: s_and_b64 s[12:13], s[12:13], exec -; GFX9-NEXT: v_cmp_class_f64_e64 s[12:13], s[8:9], 64 -; GFX9-NEXT: s_cselect_b32 s10, s10, 0 -; GFX9-NEXT: v_cmp_eq_f64_e64 s[16:17], s[10:11], 0 -; GFX9-NEXT: s_and_b64 s[18:19], s[14:15], exec -; GFX9-NEXT: s_cselect_b32 s5, s5, s11 -; GFX9-NEXT: s_and_b64 s[18:19], s[12:13], exec -; GFX9-NEXT: s_cselect_b32 s5, s9, s5 -; GFX9-NEXT: s_and_b64 s[18:19], s[16:17], exec -; GFX9-NEXT: s_cselect_b32 s5, s5, s11 -; GFX9-NEXT: s_and_b64 s[14:15], s[14:15], exec -; GFX9-NEXT: s_cselect_b32 s4, s4, s10 -; GFX9-NEXT: s_and_b64 s[12:13], s[12:13], exec -; GFX9-NEXT: s_cselect_b32 s4, s8, s4 -; GFX9-NEXT: s_and_b64 s[8:9], s[16:17], exec -; GFX9-NEXT: s_cselect_b32 s4, s4, s10 +; GFX9-NEXT: v_mov_b32_e32 v5, s9 +; GFX9-NEXT: v_max_f64 v[2:3], s[6:7], v[0:1] +; GFX9-NEXT: v_cmp_u_f64_e32 vcc, s[6:7], v[0:1] +; GFX9-NEXT: v_max_f64 v[0:1], s[4:5], v[4:5] +; GFX9-NEXT: v_cmp_u_f64_e64 s[4:5], s[4:5], v[4:5] +; GFX9-NEXT: v_mov_b32_e32 v6, 0x7ff80000 +; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v6, vcc +; GFX9-NEXT: v_cndmask_b32_e64 v2, v2, 0, vcc +; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, v6, s[4:5] +; GFX9-NEXT: v_cndmask_b32_e64 v0, v0, 0, s[4:5] ; GFX9-NEXT: ;;#ASMSTART -; GFX9-NEXT: ; use s[4:7] +; GFX9-NEXT: ; use v[0:3] ; GFX9-NEXT: ;;#ASMEND ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -1356,179 +908,52 @@ define void @s_maximum_v2f64(<2 x double> inreg %src0, <2 x double> inreg %src1) ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[6:7] -; GFX940-NEXT: v_cmp_gt_f64_e32 vcc, s[2:3], v[0:1] -; GFX940-NEXT: s_and_b64 s[8:9], vcc, exec -; GFX940-NEXT: v_cmp_o_f64_e64 s[8:9], s[2:3], v[0:1] -; GFX940-NEXT: s_cselect_b32 s12, s3, s7 -; GFX940-NEXT: s_and_b64 s[10:11], s[8:9], exec -; GFX940-NEXT: s_cselect_b32 s11, s12, 0x7ff80000 -; 
GFX940-NEXT: s_and_b64 s[12:13], vcc, exec -; GFX940-NEXT: s_cselect_b32 s10, s2, s6 -; GFX940-NEXT: s_and_b64 s[8:9], s[8:9], exec -; GFX940-NEXT: v_cmp_class_f64_e64 s[12:13], s[2:3], 64 -; GFX940-NEXT: s_cselect_b32 s10, s10, 0 -; GFX940-NEXT: s_and_b64 s[14:15], s[12:13], exec -; GFX940-NEXT: v_cmp_class_f64_e64 s[14:15], s[6:7], 64 -; GFX940-NEXT: v_cmp_eq_f64_e64 s[8:9], s[10:11], 0 -; GFX940-NEXT: s_cselect_b32 s3, s3, s11 -; GFX940-NEXT: s_and_b64 s[16:17], s[14:15], exec -; GFX940-NEXT: s_cselect_b32 s3, s7, s3 -; GFX940-NEXT: s_and_b64 s[16:17], s[8:9], exec -; GFX940-NEXT: s_cselect_b32 s7, s3, s11 -; GFX940-NEXT: s_and_b64 s[12:13], s[12:13], exec -; GFX940-NEXT: s_cselect_b32 s11, s2, s10 -; GFX940-NEXT: s_and_b64 s[2:3], s[14:15], exec +; GFX940-NEXT: v_max_f64 v[2:3], s[2:3], v[0:1] +; GFX940-NEXT: v_mov_b32_e32 v6, 0x7ff80000 +; GFX940-NEXT: v_cmp_u_f64_e32 vcc, s[2:3], v[0:1] ; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[4:5] -; GFX940-NEXT: s_cselect_b32 s6, s6, s11 -; GFX940-NEXT: s_and_b64 s[2:3], s[8:9], exec -; GFX940-NEXT: v_cmp_gt_f64_e32 vcc, s[0:1], v[0:1] -; GFX940-NEXT: s_cselect_b32 s6, s6, s10 -; GFX940-NEXT: s_and_b64 s[2:3], vcc, exec -; GFX940-NEXT: v_cmp_o_f64_e64 s[2:3], s[0:1], v[0:1] -; GFX940-NEXT: s_cselect_b32 s10, s1, s5 -; GFX940-NEXT: s_and_b64 s[8:9], s[2:3], exec -; GFX940-NEXT: s_cselect_b32 s9, s10, 0x7ff80000 -; GFX940-NEXT: s_and_b64 s[10:11], vcc, exec -; GFX940-NEXT: s_cselect_b32 s8, s0, s4 -; GFX940-NEXT: s_and_b64 s[2:3], s[2:3], exec -; GFX940-NEXT: v_cmp_class_f64_e64 s[10:11], s[0:1], 64 -; GFX940-NEXT: s_cselect_b32 s8, s8, 0 -; GFX940-NEXT: s_and_b64 s[12:13], s[10:11], exec -; GFX940-NEXT: v_cmp_class_f64_e64 s[12:13], s[4:5], 64 -; GFX940-NEXT: v_cmp_eq_f64_e64 s[2:3], s[8:9], 0 -; GFX940-NEXT: s_cselect_b32 s1, s1, s9 -; GFX940-NEXT: s_and_b64 s[14:15], s[12:13], exec -; GFX940-NEXT: s_cselect_b32 s1, s5, s1 -; GFX940-NEXT: s_and_b64 s[14:15], s[2:3], exec -; GFX940-NEXT: s_cselect_b32 s5, s1, s9 -; 
GFX940-NEXT: s_and_b64 s[10:11], s[10:11], exec -; GFX940-NEXT: s_cselect_b32 s9, s0, s8 -; GFX940-NEXT: s_and_b64 s[0:1], s[12:13], exec -; GFX940-NEXT: s_cselect_b32 s4, s4, s9 -; GFX940-NEXT: s_and_b64 s[0:1], s[2:3], exec -; GFX940-NEXT: s_cselect_b32 s4, s4, s8 +; GFX940-NEXT: v_max_f64 v[4:5], s[0:1], v[0:1] +; GFX940-NEXT: v_cndmask_b32_e32 v3, v3, v6, vcc +; GFX940-NEXT: v_cndmask_b32_e64 v2, v2, 0, vcc +; GFX940-NEXT: v_cmp_u_f64_e32 vcc, s[0:1], v[0:1] +; GFX940-NEXT: s_nop 1 +; GFX940-NEXT: v_cndmask_b32_e32 v1, v5, v6, vcc +; GFX940-NEXT: v_cndmask_b32_e64 v0, v4, 0, vcc ; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; use s[4:7] +; GFX940-NEXT: ; use v[0:3] ; GFX940-NEXT: ;;#ASMEND ; GFX940-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: s_maximum_v2f64: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_cmp_gt_f64_e64 s12, s[6:7], s[10:11] -; GFX10-NEXT: v_cmp_o_f64_e64 s14, s[6:7], s[10:11] -; GFX10-NEXT: v_cmp_class_f64_e64 s15, s[6:7], 64 -; GFX10-NEXT: v_cmp_class_f64_e64 s16, s[10:11], 64 -; GFX10-NEXT: v_cmp_o_f64_e64 s18, s[4:5], s[8:9] -; GFX10-NEXT: v_cmp_class_f64_e64 s19, s[4:5], 64 -; GFX10-NEXT: v_cmp_class_f64_e64 s20, s[8:9], 64 -; GFX10-NEXT: s_and_b32 s13, s12, exec_lo -; GFX10-NEXT: s_cselect_b32 s13, s7, s11 -; GFX10-NEXT: s_and_b32 s17, s14, exec_lo -; GFX10-NEXT: s_cselect_b32 s13, s13, 0x7ff80000 -; GFX10-NEXT: s_and_b32 s12, s12, exec_lo -; GFX10-NEXT: s_cselect_b32 s12, s6, s10 -; GFX10-NEXT: s_and_b32 s14, s14, exec_lo -; GFX10-NEXT: s_cselect_b32 s12, s12, 0 -; GFX10-NEXT: v_cmp_gt_f64_e64 s17, s[4:5], s[8:9] -; GFX10-NEXT: v_cmp_eq_f64_e64 s14, s[12:13], 0 -; GFX10-NEXT: s_and_b32 s21, s15, exec_lo -; GFX10-NEXT: s_cselect_b32 s7, s7, s13 -; GFX10-NEXT: s_and_b32 s21, s16, exec_lo -; GFX10-NEXT: s_cselect_b32 s7, s11, s7 -; GFX10-NEXT: s_and_b32 s11, s14, exec_lo -; GFX10-NEXT: s_cselect_b32 s7, s7, s13 -; GFX10-NEXT: s_and_b32 s11, s15, exec_lo -; GFX10-NEXT: s_cselect_b32 s6, s6, s12 -; 
GFX10-NEXT: s_and_b32 s11, s16, exec_lo -; GFX10-NEXT: s_cselect_b32 s6, s10, s6 -; GFX10-NEXT: s_and_b32 s10, s14, exec_lo -; GFX10-NEXT: s_cselect_b32 s6, s6, s12 -; GFX10-NEXT: s_and_b32 s10, s17, exec_lo -; GFX10-NEXT: s_cselect_b32 s10, s5, s9 -; GFX10-NEXT: s_and_b32 s11, s18, exec_lo -; GFX10-NEXT: s_cselect_b32 s11, s10, 0x7ff80000 -; GFX10-NEXT: s_and_b32 s10, s17, exec_lo -; GFX10-NEXT: s_cselect_b32 s10, s4, s8 -; GFX10-NEXT: s_and_b32 s12, s18, exec_lo -; GFX10-NEXT: s_cselect_b32 s10, s10, 0 -; GFX10-NEXT: s_and_b32 s13, s19, exec_lo -; GFX10-NEXT: v_cmp_eq_f64_e64 s12, s[10:11], 0 -; GFX10-NEXT: s_cselect_b32 s5, s5, s11 -; GFX10-NEXT: s_and_b32 s13, s20, exec_lo -; GFX10-NEXT: s_cselect_b32 s5, s9, s5 -; GFX10-NEXT: s_and_b32 s9, s12, exec_lo -; GFX10-NEXT: s_cselect_b32 s5, s5, s11 -; GFX10-NEXT: s_and_b32 s9, s19, exec_lo -; GFX10-NEXT: s_cselect_b32 s4, s4, s10 -; GFX10-NEXT: s_and_b32 s9, s20, exec_lo -; GFX10-NEXT: s_cselect_b32 s4, s8, s4 -; GFX10-NEXT: s_and_b32 s8, s12, exec_lo -; GFX10-NEXT: s_cselect_b32 s4, s4, s10 +; GFX10-NEXT: v_max_f64 v[0:1], s[6:7], s[10:11] +; GFX10-NEXT: v_cmp_u_f64_e64 s6, s[6:7], s[10:11] +; GFX10-NEXT: v_max_f64 v[4:5], s[4:5], s[8:9] +; GFX10-NEXT: v_cmp_u_f64_e64 s4, s[4:5], s[8:9] +; GFX10-NEXT: v_cndmask_b32_e64 v3, v1, 0x7ff80000, s6 +; GFX10-NEXT: v_cndmask_b32_e64 v2, v0, 0, s6 +; GFX10-NEXT: v_cndmask_b32_e64 v1, v5, 0x7ff80000, s4 +; GFX10-NEXT: v_cndmask_b32_e64 v0, v4, 0, s4 ; GFX10-NEXT: ;;#ASMSTART -; GFX10-NEXT: ; use s[4:7] +; GFX10-NEXT: ; use v[0:3] ; GFX10-NEXT: ;;#ASMEND ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: s_maximum_v2f64: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_cmp_gt_f64_e64 s8, s[2:3], s[6:7] -; GFX11-NEXT: v_cmp_o_f64_e64 s10, s[2:3], s[6:7] -; GFX11-NEXT: v_cmp_class_f64_e64 s11, s[2:3], 64 -; GFX11-NEXT: v_cmp_class_f64_e64 s12, s[6:7], 64 -; GFX11-NEXT: v_cmp_o_f64_e64 s14, s[0:1], s[4:5] -; GFX11-NEXT: v_cmp_class_f64_e64 
s15, s[0:1], 64 -; GFX11-NEXT: v_cmp_class_f64_e64 s16, s[4:5], 64 -; GFX11-NEXT: s_and_b32 s9, s8, exec_lo -; GFX11-NEXT: s_cselect_b32 s9, s3, s7 -; GFX11-NEXT: s_and_b32 s13, s10, exec_lo -; GFX11-NEXT: s_cselect_b32 s9, s9, 0x7ff80000 -; GFX11-NEXT: s_and_b32 s8, s8, exec_lo -; GFX11-NEXT: s_cselect_b32 s8, s2, s6 -; GFX11-NEXT: s_and_b32 s10, s10, exec_lo -; GFX11-NEXT: s_cselect_b32 s8, s8, 0 -; GFX11-NEXT: v_cmp_gt_f64_e64 s13, s[0:1], s[4:5] -; GFX11-NEXT: v_cmp_eq_f64_e64 s10, s[8:9], 0 -; GFX11-NEXT: s_and_b32 s17, s11, exec_lo -; GFX11-NEXT: s_cselect_b32 s3, s3, s9 -; GFX11-NEXT: s_and_b32 s17, s12, exec_lo -; GFX11-NEXT: s_cselect_b32 s3, s7, s3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: s_and_b32 s7, s10, exec_lo -; GFX11-NEXT: s_cselect_b32 s3, s3, s9 -; GFX11-NEXT: s_and_b32 s7, s11, exec_lo -; GFX11-NEXT: s_cselect_b32 s2, s2, s8 -; GFX11-NEXT: s_and_b32 s7, s12, exec_lo -; GFX11-NEXT: s_cselect_b32 s2, s6, s2 -; GFX11-NEXT: s_and_b32 s6, s10, exec_lo -; GFX11-NEXT: s_cselect_b32 s2, s2, s8 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-NEXT: s_and_b32 s6, s13, exec_lo -; GFX11-NEXT: s_cselect_b32 s6, s1, s5 -; GFX11-NEXT: s_and_b32 s7, s14, exec_lo -; GFX11-NEXT: s_cselect_b32 s7, s6, 0x7ff80000 -; GFX11-NEXT: s_and_b32 s6, s13, exec_lo -; GFX11-NEXT: s_cselect_b32 s6, s0, s4 -; GFX11-NEXT: s_and_b32 s8, s14, exec_lo -; GFX11-NEXT: s_cselect_b32 s6, s6, 0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4) -; GFX11-NEXT: s_and_b32 s9, s15, exec_lo -; GFX11-NEXT: v_cmp_eq_f64_e64 s8, s[6:7], 0 -; GFX11-NEXT: s_cselect_b32 s1, s1, s7 -; GFX11-NEXT: s_and_b32 s9, s16, exec_lo -; GFX11-NEXT: s_cselect_b32 s1, s5, s1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: s_and_b32 s5, s8, exec_lo -; GFX11-NEXT: s_cselect_b32 s1, s1, s7 -; GFX11-NEXT: s_and_b32 s5, s15, exec_lo -; GFX11-NEXT: s_cselect_b32 s0, s0, s6 -; GFX11-NEXT: s_and_b32 s5, s16, exec_lo -; GFX11-NEXT: 
s_cselect_b32 s0, s4, s0 -; GFX11-NEXT: s_and_b32 s4, s8, exec_lo -; GFX11-NEXT: s_cselect_b32 s0, s0, s6 +; GFX11-NEXT: v_max_f64 v[0:1], s[2:3], s[6:7] +; GFX11-NEXT: v_cmp_u_f64_e64 s2, s[2:3], s[6:7] +; GFX11-NEXT: v_max_f64 v[4:5], s[0:1], s[4:5] +; GFX11-NEXT: v_cmp_u_f64_e64 s0, s[0:1], s[4:5] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_cndmask_b32_e64 v3, v1, 0x7ff80000, s2 +; GFX11-NEXT: v_cndmask_b32_e64 v2, v0, 0, s2 +; GFX11-NEXT: v_cndmask_b32_e64 v1, v5, 0x7ff80000, s0 +; GFX11-NEXT: v_cndmask_b32_e64 v0, v4, 0, s0 ; GFX11-NEXT: ;;#ASMSTART -; GFX11-NEXT: ; use s[0:3] +; GFX11-NEXT: ; use v[0:3] ; GFX11-NEXT: ;;#ASMEND ; GFX11-NEXT: s_setpc_b64 s[30:31] ; @@ -1554,306 +979,110 @@ define <3 x double> @v_maximum_v3f64(<3 x double> %src0, <3 x double> %src1) { ; GFX7-LABEL: v_maximum_v3f64: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_cmp_gt_f64_e32 vcc, v[0:1], v[6:7] -; GFX7-NEXT: v_cmp_o_f64_e64 s[4:5], v[0:1], v[6:7] -; GFX7-NEXT: v_mov_b32_e32 v14, 0x7ff80000 -; GFX7-NEXT: v_cmp_gt_f64_e64 s[8:9], v[2:3], v[8:9] -; GFX7-NEXT: v_cmp_o_f64_e64 s[10:11], v[2:3], v[8:9] -; GFX7-NEXT: v_cndmask_b32_e32 v12, v7, v1, vcc -; GFX7-NEXT: v_cndmask_b32_e64 v13, v14, v12, s[4:5] -; GFX7-NEXT: v_cndmask_b32_e32 v12, v6, v0, vcc -; GFX7-NEXT: v_cmp_class_f64_e64 vcc, v[0:1], 64 -; GFX7-NEXT: v_cndmask_b32_e64 v12, 0, v12, s[4:5] -; GFX7-NEXT: v_cmp_class_f64_e64 s[4:5], v[6:7], 64 -; GFX7-NEXT: v_cmp_eq_f64_e64 s[6:7], 0, v[12:13] -; GFX7-NEXT: v_cndmask_b32_e32 v0, v12, v0, vcc -; GFX7-NEXT: v_cndmask_b32_e32 v1, v13, v1, vcc -; GFX7-NEXT: v_cndmask_b32_e64 v0, v0, v6, s[4:5] -; GFX7-NEXT: v_cndmask_b32_e64 v1, v1, v7, s[4:5] -; GFX7-NEXT: v_cndmask_b32_e64 v6, v9, v3, s[8:9] -; GFX7-NEXT: v_cndmask_b32_e64 v0, v12, v0, s[6:7] -; GFX7-NEXT: v_cndmask_b32_e64 v1, v13, v1, s[6:7] -; GFX7-NEXT: v_cmp_class_f64_e64 s[6:7], v[2:3], 64 -; GFX7-NEXT: v_cndmask_b32_e64 
v7, v14, v6, s[10:11] -; GFX7-NEXT: v_cndmask_b32_e64 v6, v8, v2, s[8:9] -; GFX7-NEXT: v_cmp_gt_f64_e32 vcc, v[4:5], v[10:11] -; GFX7-NEXT: v_cmp_class_f64_e64 s[8:9], v[8:9], 64 -; GFX7-NEXT: v_cmp_o_f64_e64 s[4:5], v[4:5], v[10:11] -; GFX7-NEXT: v_cndmask_b32_e64 v6, 0, v6, s[10:11] -; GFX7-NEXT: v_cmp_eq_f64_e64 s[10:11], 0, v[6:7] -; GFX7-NEXT: v_cndmask_b32_e64 v2, v6, v2, s[6:7] -; GFX7-NEXT: v_cndmask_b32_e64 v3, v7, v3, s[6:7] -; GFX7-NEXT: v_cndmask_b32_e32 v12, v11, v5, vcc -; GFX7-NEXT: v_cndmask_b32_e64 v2, v2, v8, s[8:9] -; GFX7-NEXT: v_cndmask_b32_e32 v8, v10, v4, vcc -; GFX7-NEXT: v_cmp_class_f64_e64 vcc, v[4:5], 64 -; GFX7-NEXT: v_cndmask_b32_e64 v13, v14, v12, s[4:5] -; GFX7-NEXT: v_cndmask_b32_e64 v12, 0, v8, s[4:5] -; GFX7-NEXT: v_cmp_class_f64_e64 s[4:5], v[10:11], 64 -; GFX7-NEXT: v_cmp_eq_f64_e64 s[12:13], 0, v[12:13] -; GFX7-NEXT: v_cndmask_b32_e64 v3, v3, v9, s[8:9] -; GFX7-NEXT: v_cndmask_b32_e64 v2, v6, v2, s[10:11] -; GFX7-NEXT: v_cndmask_b32_e64 v3, v7, v3, s[10:11] -; GFX7-NEXT: v_cndmask_b32_e32 v4, v12, v4, vcc -; GFX7-NEXT: v_cndmask_b32_e32 v5, v13, v5, vcc -; GFX7-NEXT: v_cndmask_b32_e64 v4, v4, v10, s[4:5] -; GFX7-NEXT: v_cndmask_b32_e64 v5, v5, v11, s[4:5] -; GFX7-NEXT: v_cndmask_b32_e64 v4, v12, v4, s[12:13] -; GFX7-NEXT: v_cndmask_b32_e64 v5, v13, v5, s[12:13] +; GFX7-NEXT: v_max_f64 v[12:13], v[0:1], v[6:7] +; GFX7-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[6:7] +; GFX7-NEXT: v_max_f64 v[6:7], v[2:3], v[8:9] +; GFX7-NEXT: v_cmp_u_f64_e64 s[4:5], v[2:3], v[8:9] +; GFX7-NEXT: v_max_f64 v[8:9], v[4:5], v[10:11] +; GFX7-NEXT: v_cmp_u_f64_e64 s[6:7], v[4:5], v[10:11] +; GFX7-NEXT: v_mov_b32_e32 v5, 0x7ff80000 +; GFX7-NEXT: v_cndmask_b32_e64 v0, v12, 0, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v1, v13, v5, vcc +; GFX7-NEXT: v_cndmask_b32_e64 v2, v6, 0, s[4:5] +; GFX7-NEXT: v_cndmask_b32_e64 v3, v7, v5, s[4:5] +; GFX7-NEXT: v_cndmask_b32_e64 v4, v8, 0, s[6:7] +; GFX7-NEXT: v_cndmask_b32_e64 v5, v9, v5, s[6:7] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; 
; GFX8-LABEL: v_maximum_v3f64: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_cmp_gt_f64_e32 vcc, v[0:1], v[6:7] -; GFX8-NEXT: v_cmp_o_f64_e64 s[4:5], v[0:1], v[6:7] -; GFX8-NEXT: v_mov_b32_e32 v14, 0x7ff80000 -; GFX8-NEXT: v_cmp_gt_f64_e64 s[8:9], v[2:3], v[8:9] -; GFX8-NEXT: v_cmp_o_f64_e64 s[10:11], v[2:3], v[8:9] -; GFX8-NEXT: v_cndmask_b32_e32 v12, v7, v1, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v13, v14, v12, s[4:5] -; GFX8-NEXT: v_cndmask_b32_e32 v12, v6, v0, vcc -; GFX8-NEXT: v_cmp_class_f64_e64 vcc, v[0:1], 64 -; GFX8-NEXT: v_cndmask_b32_e64 v12, 0, v12, s[4:5] -; GFX8-NEXT: v_cmp_class_f64_e64 s[4:5], v[6:7], 64 -; GFX8-NEXT: v_cmp_eq_f64_e64 s[6:7], 0, v[12:13] -; GFX8-NEXT: v_cndmask_b32_e32 v0, v12, v0, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v1, v13, v1, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v0, v0, v6, s[4:5] -; GFX8-NEXT: v_cndmask_b32_e64 v1, v1, v7, s[4:5] -; GFX8-NEXT: v_cndmask_b32_e64 v6, v9, v3, s[8:9] -; GFX8-NEXT: v_cndmask_b32_e64 v0, v12, v0, s[6:7] -; GFX8-NEXT: v_cndmask_b32_e64 v1, v13, v1, s[6:7] -; GFX8-NEXT: v_cmp_class_f64_e64 s[6:7], v[2:3], 64 -; GFX8-NEXT: v_cndmask_b32_e64 v7, v14, v6, s[10:11] -; GFX8-NEXT: v_cndmask_b32_e64 v6, v8, v2, s[8:9] -; GFX8-NEXT: v_cmp_gt_f64_e32 vcc, v[4:5], v[10:11] -; GFX8-NEXT: v_cmp_class_f64_e64 s[8:9], v[8:9], 64 -; GFX8-NEXT: v_cmp_o_f64_e64 s[4:5], v[4:5], v[10:11] -; GFX8-NEXT: v_cndmask_b32_e64 v6, 0, v6, s[10:11] -; GFX8-NEXT: v_cmp_eq_f64_e64 s[10:11], 0, v[6:7] -; GFX8-NEXT: v_cndmask_b32_e64 v2, v6, v2, s[6:7] -; GFX8-NEXT: v_cndmask_b32_e64 v3, v7, v3, s[6:7] -; GFX8-NEXT: v_cndmask_b32_e32 v12, v11, v5, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v2, v2, v8, s[8:9] -; GFX8-NEXT: v_cndmask_b32_e32 v8, v10, v4, vcc -; GFX8-NEXT: v_cmp_class_f64_e64 vcc, v[4:5], 64 -; GFX8-NEXT: v_cndmask_b32_e64 v13, v14, v12, s[4:5] -; GFX8-NEXT: v_cndmask_b32_e64 v12, 0, v8, s[4:5] -; GFX8-NEXT: v_cmp_class_f64_e64 s[4:5], v[10:11], 64 -; GFX8-NEXT: v_cmp_eq_f64_e64 s[12:13], 0, 
v[12:13] -; GFX8-NEXT: v_cndmask_b32_e64 v3, v3, v9, s[8:9] -; GFX8-NEXT: v_cndmask_b32_e64 v2, v6, v2, s[10:11] -; GFX8-NEXT: v_cndmask_b32_e64 v3, v7, v3, s[10:11] -; GFX8-NEXT: v_cndmask_b32_e32 v4, v12, v4, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v5, v13, v5, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v4, v4, v10, s[4:5] -; GFX8-NEXT: v_cndmask_b32_e64 v5, v5, v11, s[4:5] -; GFX8-NEXT: v_cndmask_b32_e64 v4, v12, v4, s[12:13] -; GFX8-NEXT: v_cndmask_b32_e64 v5, v13, v5, s[12:13] +; GFX8-NEXT: v_max_f64 v[12:13], v[0:1], v[6:7] +; GFX8-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[6:7] +; GFX8-NEXT: v_max_f64 v[6:7], v[2:3], v[8:9] +; GFX8-NEXT: v_cmp_u_f64_e64 s[4:5], v[2:3], v[8:9] +; GFX8-NEXT: v_max_f64 v[8:9], v[4:5], v[10:11] +; GFX8-NEXT: v_cmp_u_f64_e64 s[6:7], v[4:5], v[10:11] +; GFX8-NEXT: v_mov_b32_e32 v5, 0x7ff80000 +; GFX8-NEXT: v_cndmask_b32_e64 v0, v12, 0, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v1, v13, v5, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v2, v6, 0, s[4:5] +; GFX8-NEXT: v_cndmask_b32_e64 v3, v7, v5, s[4:5] +; GFX8-NEXT: v_cndmask_b32_e64 v4, v8, 0, s[6:7] +; GFX8-NEXT: v_cndmask_b32_e64 v5, v9, v5, s[6:7] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_maximum_v3f64: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_cmp_gt_f64_e32 vcc, v[0:1], v[6:7] -; GFX9-NEXT: v_cmp_o_f64_e64 s[4:5], v[0:1], v[6:7] -; GFX9-NEXT: v_mov_b32_e32 v14, 0x7ff80000 -; GFX9-NEXT: v_cmp_gt_f64_e64 s[8:9], v[2:3], v[8:9] -; GFX9-NEXT: v_cmp_o_f64_e64 s[10:11], v[2:3], v[8:9] -; GFX9-NEXT: v_cndmask_b32_e32 v12, v7, v1, vcc -; GFX9-NEXT: v_cndmask_b32_e64 v13, v14, v12, s[4:5] -; GFX9-NEXT: v_cndmask_b32_e32 v12, v6, v0, vcc -; GFX9-NEXT: v_cmp_class_f64_e64 vcc, v[0:1], 64 -; GFX9-NEXT: v_cndmask_b32_e64 v12, 0, v12, s[4:5] -; GFX9-NEXT: v_cmp_class_f64_e64 s[4:5], v[6:7], 64 -; GFX9-NEXT: v_cmp_eq_f64_e64 s[6:7], 0, v[12:13] -; GFX9-NEXT: v_cndmask_b32_e32 v0, v12, v0, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v1, v13, v1, vcc -; GFX9-NEXT: 
v_cndmask_b32_e64 v0, v0, v6, s[4:5] -; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, v7, s[4:5] -; GFX9-NEXT: v_cndmask_b32_e64 v6, v9, v3, s[8:9] -; GFX9-NEXT: v_cndmask_b32_e64 v0, v12, v0, s[6:7] -; GFX9-NEXT: v_cndmask_b32_e64 v1, v13, v1, s[6:7] -; GFX9-NEXT: v_cmp_class_f64_e64 s[6:7], v[2:3], 64 -; GFX9-NEXT: v_cndmask_b32_e64 v7, v14, v6, s[10:11] -; GFX9-NEXT: v_cndmask_b32_e64 v6, v8, v2, s[8:9] -; GFX9-NEXT: v_cmp_gt_f64_e32 vcc, v[4:5], v[10:11] -; GFX9-NEXT: v_cmp_class_f64_e64 s[8:9], v[8:9], 64 -; GFX9-NEXT: v_cmp_o_f64_e64 s[4:5], v[4:5], v[10:11] -; GFX9-NEXT: v_cndmask_b32_e64 v6, 0, v6, s[10:11] -; GFX9-NEXT: v_cmp_eq_f64_e64 s[10:11], 0, v[6:7] -; GFX9-NEXT: v_cndmask_b32_e64 v2, v6, v2, s[6:7] -; GFX9-NEXT: v_cndmask_b32_e64 v3, v7, v3, s[6:7] -; GFX9-NEXT: v_cndmask_b32_e32 v12, v11, v5, vcc -; GFX9-NEXT: v_cndmask_b32_e64 v2, v2, v8, s[8:9] -; GFX9-NEXT: v_cndmask_b32_e32 v8, v10, v4, vcc -; GFX9-NEXT: v_cmp_class_f64_e64 vcc, v[4:5], 64 -; GFX9-NEXT: v_cndmask_b32_e64 v13, v14, v12, s[4:5] -; GFX9-NEXT: v_cndmask_b32_e64 v12, 0, v8, s[4:5] -; GFX9-NEXT: v_cmp_class_f64_e64 s[4:5], v[10:11], 64 -; GFX9-NEXT: v_cmp_eq_f64_e64 s[12:13], 0, v[12:13] -; GFX9-NEXT: v_cndmask_b32_e64 v3, v3, v9, s[8:9] -; GFX9-NEXT: v_cndmask_b32_e64 v2, v6, v2, s[10:11] -; GFX9-NEXT: v_cndmask_b32_e64 v3, v7, v3, s[10:11] -; GFX9-NEXT: v_cndmask_b32_e32 v4, v12, v4, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v5, v13, v5, vcc -; GFX9-NEXT: v_cndmask_b32_e64 v4, v4, v10, s[4:5] -; GFX9-NEXT: v_cndmask_b32_e64 v5, v5, v11, s[4:5] -; GFX9-NEXT: v_cndmask_b32_e64 v4, v12, v4, s[12:13] -; GFX9-NEXT: v_cndmask_b32_e64 v5, v13, v5, s[12:13] +; GFX9-NEXT: v_max_f64 v[12:13], v[0:1], v[6:7] +; GFX9-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[6:7] +; GFX9-NEXT: v_max_f64 v[6:7], v[2:3], v[8:9] +; GFX9-NEXT: v_cmp_u_f64_e64 s[4:5], v[2:3], v[8:9] +; GFX9-NEXT: v_max_f64 v[8:9], v[4:5], v[10:11] +; GFX9-NEXT: v_cmp_u_f64_e64 s[6:7], v[4:5], v[10:11] +; GFX9-NEXT: v_mov_b32_e32 v5, 0x7ff80000 +; 
GFX9-NEXT: v_cndmask_b32_e64 v0, v12, 0, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v1, v13, v5, vcc +; GFX9-NEXT: v_cndmask_b32_e64 v2, v6, 0, s[4:5] +; GFX9-NEXT: v_cndmask_b32_e64 v3, v7, v5, s[4:5] +; GFX9-NEXT: v_cndmask_b32_e64 v4, v8, 0, s[6:7] +; GFX9-NEXT: v_cndmask_b32_e64 v5, v9, v5, s[6:7] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: v_maximum_v3f64: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_cmp_gt_f64_e32 vcc, v[0:1], v[6:7] -; GFX940-NEXT: v_mov_b32_e32 v14, 0x7ff80000 -; GFX940-NEXT: v_cmp_o_f64_e64 s[0:1], v[0:1], v[6:7] -; GFX940-NEXT: v_cndmask_b32_e32 v12, v7, v1, vcc -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e64 v13, v14, v12, s[0:1] -; GFX940-NEXT: v_cndmask_b32_e32 v12, v6, v0, vcc -; GFX940-NEXT: v_cndmask_b32_e64 v12, 0, v12, s[0:1] -; GFX940-NEXT: v_cmp_class_f64_e64 vcc, v[0:1], 64 -; GFX940-NEXT: v_cmp_class_f64_e64 s[0:1], v[6:7], 64 -; GFX940-NEXT: v_cmp_eq_f64_e64 s[2:3], 0, v[12:13] -; GFX940-NEXT: v_cndmask_b32_e32 v0, v12, v0, vcc -; GFX940-NEXT: v_cndmask_b32_e32 v1, v13, v1, vcc -; GFX940-NEXT: v_cmp_gt_f64_e32 vcc, v[2:3], v[8:9] -; GFX940-NEXT: v_cndmask_b32_e64 v0, v0, v6, s[0:1] -; GFX940-NEXT: v_cndmask_b32_e64 v1, v1, v7, s[0:1] -; GFX940-NEXT: v_cndmask_b32_e32 v6, v9, v3, vcc -; GFX940-NEXT: v_cmp_o_f64_e64 s[0:1], v[2:3], v[8:9] -; GFX940-NEXT: v_cndmask_b32_e64 v0, v12, v0, s[2:3] -; GFX940-NEXT: v_cndmask_b32_e64 v1, v13, v1, s[2:3] -; GFX940-NEXT: v_cndmask_b32_e64 v7, v14, v6, s[0:1] -; GFX940-NEXT: v_cndmask_b32_e32 v6, v8, v2, vcc -; GFX940-NEXT: v_cndmask_b32_e64 v6, 0, v6, s[0:1] -; GFX940-NEXT: v_cmp_class_f64_e64 vcc, v[2:3], 64 -; GFX940-NEXT: v_cmp_class_f64_e64 s[0:1], v[8:9], 64 -; GFX940-NEXT: v_cmp_eq_f64_e64 s[2:3], 0, v[6:7] -; GFX940-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc -; GFX940-NEXT: v_cndmask_b32_e64 v2, v2, v8, s[0:1] -; GFX940-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc -; GFX940-NEXT: v_cmp_gt_f64_e32 vcc, v[4:5], v[10:11] -; 
GFX940-NEXT: v_cndmask_b32_e64 v2, v6, v2, s[2:3] -; GFX940-NEXT: v_cndmask_b32_e64 v3, v3, v9, s[0:1] -; GFX940-NEXT: v_cndmask_b32_e32 v6, v11, v5, vcc -; GFX940-NEXT: v_cmp_o_f64_e64 s[0:1], v[4:5], v[10:11] -; GFX940-NEXT: v_cndmask_b32_e64 v3, v7, v3, s[2:3] +; GFX940-NEXT: v_max_f64 v[12:13], v[0:1], v[6:7] +; GFX940-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[6:7] +; GFX940-NEXT: v_max_f64 v[6:7], v[2:3], v[8:9] ; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e64 v7, v14, v6, s[0:1] -; GFX940-NEXT: v_cndmask_b32_e32 v6, v10, v4, vcc -; GFX940-NEXT: v_cndmask_b32_e64 v6, 0, v6, s[0:1] -; GFX940-NEXT: v_cmp_class_f64_e64 vcc, v[4:5], 64 -; GFX940-NEXT: v_cmp_class_f64_e64 s[0:1], v[10:11], 64 -; GFX940-NEXT: v_cmp_eq_f64_e64 s[2:3], 0, v[6:7] -; GFX940-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc -; GFX940-NEXT: v_cndmask_b32_e32 v5, v7, v5, vcc -; GFX940-NEXT: v_cndmask_b32_e64 v4, v4, v10, s[0:1] -; GFX940-NEXT: v_cndmask_b32_e64 v5, v5, v11, s[0:1] -; GFX940-NEXT: v_cndmask_b32_e64 v4, v6, v4, s[2:3] -; GFX940-NEXT: v_cndmask_b32_e64 v5, v7, v5, s[2:3] +; GFX940-NEXT: v_cndmask_b32_e64 v0, v12, 0, vcc +; GFX940-NEXT: v_mov_b32_e32 v12, 0x7ff80000 +; GFX940-NEXT: v_cndmask_b32_e32 v1, v13, v12, vcc +; GFX940-NEXT: v_cmp_u_f64_e32 vcc, v[2:3], v[8:9] +; GFX940-NEXT: s_nop 1 +; GFX940-NEXT: v_cndmask_b32_e64 v2, v6, 0, vcc +; GFX940-NEXT: v_cndmask_b32_e32 v3, v7, v12, vcc +; GFX940-NEXT: v_max_f64 v[6:7], v[4:5], v[10:11] +; GFX940-NEXT: v_cmp_u_f64_e32 vcc, v[4:5], v[10:11] +; GFX940-NEXT: s_nop 1 +; GFX940-NEXT: v_cndmask_b32_e64 v4, v6, 0, vcc +; GFX940-NEXT: v_cndmask_b32_e32 v5, v7, v12, vcc ; GFX940-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_maximum_v3f64: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_cmp_gt_f64_e32 vcc_lo, v[0:1], v[6:7] -; GFX10-NEXT: v_cmp_gt_f64_e64 s4, v[2:3], v[8:9] -; GFX10-NEXT: v_cmp_gt_f64_e64 s5, v[4:5], v[10:11] -; GFX10-NEXT: v_cmp_o_f64_e64 s6, v[0:1], v[6:7] -; GFX10-NEXT: 
v_cmp_o_f64_e64 s7, v[2:3], v[8:9] -; GFX10-NEXT: v_cmp_o_f64_e64 s8, v[4:5], v[10:11] -; GFX10-NEXT: v_cndmask_b32_e32 v12, v7, v1, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v14, v9, v3, s4 -; GFX10-NEXT: v_cndmask_b32_e64 v16, v11, v5, s5 -; GFX10-NEXT: v_cndmask_b32_e32 v17, v6, v0, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v18, v8, v2, s4 -; GFX10-NEXT: v_cndmask_b32_e64 v19, v10, v4, s5 -; GFX10-NEXT: v_cmp_class_f64_e64 vcc_lo, v[0:1], 64 -; GFX10-NEXT: v_cmp_class_f64_e64 s4, v[2:3], 64 -; GFX10-NEXT: v_cmp_class_f64_e64 s5, v[4:5], 64 -; GFX10-NEXT: v_cndmask_b32_e64 v13, 0x7ff80000, v12, s6 -; GFX10-NEXT: v_cndmask_b32_e64 v15, 0x7ff80000, v14, s7 -; GFX10-NEXT: v_cndmask_b32_e64 v12, 0, v17, s6 -; GFX10-NEXT: v_cndmask_b32_e64 v17, 0x7ff80000, v16, s8 -; GFX10-NEXT: v_cndmask_b32_e64 v14, 0, v18, s7 -; GFX10-NEXT: v_cndmask_b32_e64 v16, 0, v19, s8 -; GFX10-NEXT: v_cmp_class_f64_e64 s6, v[8:9], 64 -; GFX10-NEXT: v_cmp_class_f64_e64 s7, v[6:7], 64 -; GFX10-NEXT: v_cmp_class_f64_e64 s8, v[10:11], 64 -; GFX10-NEXT: v_cmp_eq_f64_e64 s9, 0, v[12:13] -; GFX10-NEXT: v_cmp_eq_f64_e64 s10, 0, v[14:15] -; GFX10-NEXT: v_cmp_eq_f64_e64 s11, 0, v[16:17] -; GFX10-NEXT: v_cndmask_b32_e32 v0, v12, v0, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v2, v14, v2, s4 -; GFX10-NEXT: v_cndmask_b32_e64 v4, v16, v4, s5 -; GFX10-NEXT: v_cndmask_b32_e32 v1, v13, v1, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v3, v15, v3, s4 -; GFX10-NEXT: v_cndmask_b32_e64 v5, v17, v5, s5 -; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, v8, s6 -; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, v6, s7 -; GFX10-NEXT: v_cndmask_b32_e64 v4, v4, v10, s8 -; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, v7, s7 -; GFX10-NEXT: v_cndmask_b32_e64 v3, v3, v9, s6 -; GFX10-NEXT: v_cndmask_b32_e64 v5, v5, v11, s8 -; GFX10-NEXT: v_cndmask_b32_e64 v0, v12, v0, s9 -; GFX10-NEXT: v_cndmask_b32_e64 v2, v14, v2, s10 -; GFX10-NEXT: v_cndmask_b32_e64 v4, v16, v4, s11 -; GFX10-NEXT: v_cndmask_b32_e64 v1, v13, v1, s9 -; GFX10-NEXT: v_cndmask_b32_e64 v3, v15, v3, 
s10 -; GFX10-NEXT: v_cndmask_b32_e64 v5, v17, v5, s11 +; GFX10-NEXT: v_max_f64 v[12:13], v[0:1], v[6:7] +; GFX10-NEXT: v_cmp_u_f64_e32 vcc_lo, v[0:1], v[6:7] +; GFX10-NEXT: v_max_f64 v[6:7], v[2:3], v[8:9] +; GFX10-NEXT: v_cmp_u_f64_e64 s4, v[2:3], v[8:9] +; GFX10-NEXT: v_max_f64 v[8:9], v[4:5], v[10:11] +; GFX10-NEXT: v_cmp_u_f64_e64 s5, v[4:5], v[10:11] +; GFX10-NEXT: v_cndmask_b32_e64 v0, v12, 0, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v1, v13, 0x7ff80000, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v2, v6, 0, s4 +; GFX10-NEXT: v_cndmask_b32_e64 v3, v7, 0x7ff80000, s4 +; GFX10-NEXT: v_cndmask_b32_e64 v4, v8, 0, s5 +; GFX10-NEXT: v_cndmask_b32_e64 v5, v9, 0x7ff80000, s5 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: v_maximum_v3f64: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_cmp_gt_f64_e32 vcc_lo, v[0:1], v[6:7] -; GFX11-NEXT: v_cmp_gt_f64_e64 s0, v[2:3], v[8:9] -; GFX11-NEXT: v_cmp_gt_f64_e64 s1, v[4:5], v[10:11] -; GFX11-NEXT: v_cmp_o_f64_e64 s2, v[0:1], v[6:7] -; GFX11-NEXT: v_cmp_o_f64_e64 s3, v[2:3], v[8:9] -; GFX11-NEXT: v_cmp_o_f64_e64 s4, v[4:5], v[10:11] -; GFX11-NEXT: v_dual_cndmask_b32 v12, v7, v1 :: v_dual_cndmask_b32 v17, v6, v0 -; GFX11-NEXT: v_cndmask_b32_e64 v14, v9, v3, s0 -; GFX11-NEXT: v_cndmask_b32_e64 v16, v11, v5, s1 -; GFX11-NEXT: v_cndmask_b32_e64 v18, v8, v2, s0 -; GFX11-NEXT: v_cndmask_b32_e64 v19, v10, v4, s1 -; GFX11-NEXT: v_cmp_class_f64_e64 vcc_lo, v[0:1], 64 -; GFX11-NEXT: v_cmp_class_f64_e64 s0, v[2:3], 64 -; GFX11-NEXT: v_cmp_class_f64_e64 s1, v[4:5], 64 -; GFX11-NEXT: v_cndmask_b32_e64 v13, 0x7ff80000, v12, s2 -; GFX11-NEXT: v_cndmask_b32_e64 v15, 0x7ff80000, v14, s3 -; GFX11-NEXT: v_cndmask_b32_e64 v12, 0, v17, s2 -; GFX11-NEXT: v_cndmask_b32_e64 v17, 0x7ff80000, v16, s4 -; GFX11-NEXT: v_cndmask_b32_e64 v14, 0, v18, s3 -; GFX11-NEXT: v_cndmask_b32_e64 v16, 0, v19, s4 -; GFX11-NEXT: v_cmp_class_f64_e64 s2, v[8:9], 64 -; GFX11-NEXT: v_cmp_class_f64_e64 s3, v[6:7], 64 -; GFX11-NEXT: 
v_cmp_class_f64_e64 s4, v[10:11], 64 -; GFX11-NEXT: v_cmp_eq_f64_e64 s5, 0, v[12:13] -; GFX11-NEXT: v_cmp_eq_f64_e64 s6, 0, v[14:15] -; GFX11-NEXT: v_cmp_eq_f64_e64 s7, 0, v[16:17] -; GFX11-NEXT: v_dual_cndmask_b32 v0, v12, v0 :: v_dual_cndmask_b32 v1, v13, v1 -; GFX11-NEXT: v_cndmask_b32_e64 v2, v14, v2, s0 -; GFX11-NEXT: v_cndmask_b32_e64 v4, v16, v4, s1 -; GFX11-NEXT: v_cndmask_b32_e64 v3, v15, v3, s0 -; GFX11-NEXT: v_cndmask_b32_e64 v5, v17, v5, s1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-NEXT: v_cndmask_b32_e64 v2, v2, v8, s2 -; GFX11-NEXT: v_cndmask_b32_e64 v0, v0, v6, s3 -; GFX11-NEXT: v_cndmask_b32_e64 v4, v4, v10, s4 -; GFX11-NEXT: v_cndmask_b32_e64 v1, v1, v7, s3 -; GFX11-NEXT: v_cndmask_b32_e64 v3, v3, v9, s2 -; GFX11-NEXT: v_cndmask_b32_e64 v5, v5, v11, s4 -; GFX11-NEXT: v_cndmask_b32_e64 v0, v12, v0, s5 -; GFX11-NEXT: v_cndmask_b32_e64 v2, v14, v2, s6 -; GFX11-NEXT: v_cndmask_b32_e64 v4, v16, v4, s7 -; GFX11-NEXT: v_cndmask_b32_e64 v1, v13, v1, s5 -; GFX11-NEXT: v_cndmask_b32_e64 v3, v15, v3, s6 -; GFX11-NEXT: v_cndmask_b32_e64 v5, v17, v5, s7 +; GFX11-NEXT: v_max_f64 v[12:13], v[0:1], v[6:7] +; GFX11-NEXT: v_cmp_u_f64_e32 vcc_lo, v[0:1], v[6:7] +; GFX11-NEXT: v_max_f64 v[6:7], v[2:3], v[8:9] +; GFX11-NEXT: v_cmp_u_f64_e64 s0, v[2:3], v[8:9] +; GFX11-NEXT: v_max_f64 v[8:9], v[4:5], v[10:11] +; GFX11-NEXT: v_cmp_u_f64_e64 s1, v[4:5], v[10:11] +; GFX11-NEXT: v_cndmask_b32_e64 v0, v12, 0, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e64 v1, v13, 0x7ff80000, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e64 v2, v6, 0, s0 +; GFX11-NEXT: v_cndmask_b32_e64 v3, v7, 0x7ff80000, s0 +; GFX11-NEXT: v_cndmask_b32_e64 v4, v8, 0, s1 +; GFX11-NEXT: v_cndmask_b32_e64 v5, v9, 0x7ff80000, s1 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-LABEL: v_maximum_v3f64: @@ -1875,247 +1104,49 @@ define <3 x double> @v_maximum_v3f64__nnan(<3 x double> %src0, <3 x double> %src ; GFX7-LABEL: v_maximum_v3f64__nnan: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; 
GFX7-NEXT: v_cmp_gt_f64_e32 vcc, v[0:1], v[6:7] -; GFX7-NEXT: v_cmp_class_f64_e64 s[4:5], v[6:7], 64 -; GFX7-NEXT: v_cmp_gt_f64_e64 s[8:9], v[2:3], v[8:9] -; GFX7-NEXT: v_cmp_class_f64_e64 s[10:11], v[10:11], 64 -; GFX7-NEXT: v_cndmask_b32_e32 v13, v7, v1, vcc -; GFX7-NEXT: v_cndmask_b32_e32 v12, v6, v0, vcc -; GFX7-NEXT: v_cmp_class_f64_e64 vcc, v[0:1], 64 -; GFX7-NEXT: v_cmp_eq_f64_e64 s[6:7], 0, v[12:13] -; GFX7-NEXT: v_cndmask_b32_e32 v0, v12, v0, vcc -; GFX7-NEXT: v_cndmask_b32_e32 v1, v13, v1, vcc -; GFX7-NEXT: v_cmp_gt_f64_e32 vcc, v[4:5], v[10:11] -; GFX7-NEXT: v_cndmask_b32_e64 v0, v0, v6, s[4:5] -; GFX7-NEXT: v_cndmask_b32_e64 v1, v1, v7, s[4:5] -; GFX7-NEXT: v_cndmask_b32_e64 v0, v12, v0, s[6:7] -; GFX7-NEXT: v_cndmask_b32_e64 v1, v13, v1, s[6:7] -; GFX7-NEXT: v_cmp_class_f64_e64 s[4:5], v[2:3], 64 -; GFX7-NEXT: v_cndmask_b32_e64 v7, v9, v3, s[8:9] -; GFX7-NEXT: v_cndmask_b32_e64 v6, v8, v2, s[8:9] -; GFX7-NEXT: v_cndmask_b32_e32 v13, v11, v5, vcc -; GFX7-NEXT: v_cndmask_b32_e32 v12, v10, v4, vcc -; GFX7-NEXT: v_cmp_class_f64_e64 vcc, v[4:5], 64 -; GFX7-NEXT: v_cmp_class_f64_e64 s[6:7], v[8:9], 64 -; GFX7-NEXT: v_cmp_eq_f64_e64 s[8:9], 0, v[6:7] -; GFX7-NEXT: v_cmp_eq_f64_e64 s[12:13], 0, v[12:13] -; GFX7-NEXT: v_cndmask_b32_e64 v2, v6, v2, s[4:5] -; GFX7-NEXT: v_cndmask_b32_e64 v3, v7, v3, s[4:5] -; GFX7-NEXT: v_cndmask_b32_e32 v4, v12, v4, vcc -; GFX7-NEXT: v_cndmask_b32_e32 v5, v13, v5, vcc -; GFX7-NEXT: v_cndmask_b32_e64 v2, v2, v8, s[6:7] -; GFX7-NEXT: v_cndmask_b32_e64 v4, v4, v10, s[10:11] -; GFX7-NEXT: v_cndmask_b32_e64 v3, v3, v9, s[6:7] -; GFX7-NEXT: v_cndmask_b32_e64 v5, v5, v11, s[10:11] -; GFX7-NEXT: v_cndmask_b32_e64 v2, v6, v2, s[8:9] -; GFX7-NEXT: v_cndmask_b32_e64 v3, v7, v3, s[8:9] -; GFX7-NEXT: v_cndmask_b32_e64 v4, v12, v4, s[12:13] -; GFX7-NEXT: v_cndmask_b32_e64 v5, v13, v5, s[12:13] +; GFX7-NEXT: v_max_f64 v[0:1], v[0:1], v[6:7] +; GFX7-NEXT: v_max_f64 v[2:3], v[2:3], v[8:9] +; GFX7-NEXT: v_max_f64 v[4:5], v[4:5], v[10:11] ; 
GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_maximum_v3f64__nnan: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_cmp_gt_f64_e32 vcc, v[0:1], v[6:7] -; GFX8-NEXT: v_cmp_class_f64_e64 s[4:5], v[6:7], 64 -; GFX8-NEXT: v_cmp_gt_f64_e64 s[8:9], v[2:3], v[8:9] -; GFX8-NEXT: v_cmp_class_f64_e64 s[10:11], v[10:11], 64 -; GFX8-NEXT: v_cndmask_b32_e32 v13, v7, v1, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v12, v6, v0, vcc -; GFX8-NEXT: v_cmp_class_f64_e64 vcc, v[0:1], 64 -; GFX8-NEXT: v_cmp_eq_f64_e64 s[6:7], 0, v[12:13] -; GFX8-NEXT: v_cndmask_b32_e32 v0, v12, v0, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v1, v13, v1, vcc -; GFX8-NEXT: v_cmp_gt_f64_e32 vcc, v[4:5], v[10:11] -; GFX8-NEXT: v_cndmask_b32_e64 v0, v0, v6, s[4:5] -; GFX8-NEXT: v_cndmask_b32_e64 v1, v1, v7, s[4:5] -; GFX8-NEXT: v_cndmask_b32_e64 v0, v12, v0, s[6:7] -; GFX8-NEXT: v_cndmask_b32_e64 v1, v13, v1, s[6:7] -; GFX8-NEXT: v_cmp_class_f64_e64 s[4:5], v[2:3], 64 -; GFX8-NEXT: v_cndmask_b32_e64 v7, v9, v3, s[8:9] -; GFX8-NEXT: v_cndmask_b32_e64 v6, v8, v2, s[8:9] -; GFX8-NEXT: v_cndmask_b32_e32 v13, v11, v5, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v12, v10, v4, vcc -; GFX8-NEXT: v_cmp_class_f64_e64 vcc, v[4:5], 64 -; GFX8-NEXT: v_cmp_class_f64_e64 s[6:7], v[8:9], 64 -; GFX8-NEXT: v_cmp_eq_f64_e64 s[8:9], 0, v[6:7] -; GFX8-NEXT: v_cmp_eq_f64_e64 s[12:13], 0, v[12:13] -; GFX8-NEXT: v_cndmask_b32_e64 v2, v6, v2, s[4:5] -; GFX8-NEXT: v_cndmask_b32_e64 v3, v7, v3, s[4:5] -; GFX8-NEXT: v_cndmask_b32_e32 v4, v12, v4, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v5, v13, v5, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v2, v2, v8, s[6:7] -; GFX8-NEXT: v_cndmask_b32_e64 v4, v4, v10, s[10:11] -; GFX8-NEXT: v_cndmask_b32_e64 v3, v3, v9, s[6:7] -; GFX8-NEXT: v_cndmask_b32_e64 v5, v5, v11, s[10:11] -; GFX8-NEXT: v_cndmask_b32_e64 v2, v6, v2, s[8:9] -; GFX8-NEXT: v_cndmask_b32_e64 v3, v7, v3, s[8:9] -; GFX8-NEXT: v_cndmask_b32_e64 v4, v12, v4, s[12:13] -; GFX8-NEXT: v_cndmask_b32_e64 v5, v13, v5, s[12:13] +; 
GFX8-NEXT: v_max_f64 v[0:1], v[0:1], v[6:7] +; GFX8-NEXT: v_max_f64 v[2:3], v[2:3], v[8:9] +; GFX8-NEXT: v_max_f64 v[4:5], v[4:5], v[10:11] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_maximum_v3f64__nnan: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_cmp_gt_f64_e32 vcc, v[0:1], v[6:7] -; GFX9-NEXT: v_cmp_class_f64_e64 s[4:5], v[6:7], 64 -; GFX9-NEXT: v_cmp_gt_f64_e64 s[8:9], v[2:3], v[8:9] -; GFX9-NEXT: v_cmp_class_f64_e64 s[10:11], v[10:11], 64 -; GFX9-NEXT: v_cndmask_b32_e32 v13, v7, v1, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v12, v6, v0, vcc -; GFX9-NEXT: v_cmp_class_f64_e64 vcc, v[0:1], 64 -; GFX9-NEXT: v_cmp_eq_f64_e64 s[6:7], 0, v[12:13] -; GFX9-NEXT: v_cndmask_b32_e32 v0, v12, v0, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v1, v13, v1, vcc -; GFX9-NEXT: v_cmp_gt_f64_e32 vcc, v[4:5], v[10:11] -; GFX9-NEXT: v_cndmask_b32_e64 v0, v0, v6, s[4:5] -; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, v7, s[4:5] -; GFX9-NEXT: v_cndmask_b32_e64 v0, v12, v0, s[6:7] -; GFX9-NEXT: v_cndmask_b32_e64 v1, v13, v1, s[6:7] -; GFX9-NEXT: v_cmp_class_f64_e64 s[4:5], v[2:3], 64 -; GFX9-NEXT: v_cndmask_b32_e64 v7, v9, v3, s[8:9] -; GFX9-NEXT: v_cndmask_b32_e64 v6, v8, v2, s[8:9] -; GFX9-NEXT: v_cndmask_b32_e32 v13, v11, v5, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v12, v10, v4, vcc -; GFX9-NEXT: v_cmp_class_f64_e64 vcc, v[4:5], 64 -; GFX9-NEXT: v_cmp_class_f64_e64 s[6:7], v[8:9], 64 -; GFX9-NEXT: v_cmp_eq_f64_e64 s[8:9], 0, v[6:7] -; GFX9-NEXT: v_cmp_eq_f64_e64 s[12:13], 0, v[12:13] -; GFX9-NEXT: v_cndmask_b32_e64 v2, v6, v2, s[4:5] -; GFX9-NEXT: v_cndmask_b32_e64 v3, v7, v3, s[4:5] -; GFX9-NEXT: v_cndmask_b32_e32 v4, v12, v4, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v5, v13, v5, vcc -; GFX9-NEXT: v_cndmask_b32_e64 v2, v2, v8, s[6:7] -; GFX9-NEXT: v_cndmask_b32_e64 v4, v4, v10, s[10:11] -; GFX9-NEXT: v_cndmask_b32_e64 v3, v3, v9, s[6:7] -; GFX9-NEXT: v_cndmask_b32_e64 v5, v5, v11, s[10:11] -; GFX9-NEXT: v_cndmask_b32_e64 v2, v6, v2, s[8:9] -; GFX9-NEXT: 
v_cndmask_b32_e64 v3, v7, v3, s[8:9] -; GFX9-NEXT: v_cndmask_b32_e64 v4, v12, v4, s[12:13] -; GFX9-NEXT: v_cndmask_b32_e64 v5, v13, v5, s[12:13] +; GFX9-NEXT: v_max_f64 v[0:1], v[0:1], v[6:7] +; GFX9-NEXT: v_max_f64 v[2:3], v[2:3], v[8:9] +; GFX9-NEXT: v_max_f64 v[4:5], v[4:5], v[10:11] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: v_maximum_v3f64__nnan: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_cmp_gt_f64_e32 vcc, v[0:1], v[6:7] -; GFX940-NEXT: v_cmp_class_f64_e64 s[0:1], v[6:7], 64 -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e32 v13, v7, v1, vcc -; GFX940-NEXT: v_cndmask_b32_e32 v12, v6, v0, vcc -; GFX940-NEXT: v_cmp_class_f64_e64 vcc, v[0:1], 64 -; GFX940-NEXT: v_cmp_eq_f64_e64 s[2:3], 0, v[12:13] -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e32 v0, v12, v0, vcc -; GFX940-NEXT: v_cndmask_b32_e32 v1, v13, v1, vcc -; GFX940-NEXT: v_cmp_gt_f64_e32 vcc, v[2:3], v[8:9] -; GFX940-NEXT: v_cndmask_b32_e64 v0, v0, v6, s[0:1] -; GFX940-NEXT: v_cndmask_b32_e64 v1, v1, v7, s[0:1] -; GFX940-NEXT: v_cndmask_b32_e32 v7, v9, v3, vcc -; GFX940-NEXT: v_cndmask_b32_e32 v6, v8, v2, vcc -; GFX940-NEXT: v_cmp_class_f64_e64 vcc, v[2:3], 64 -; GFX940-NEXT: v_cmp_class_f64_e64 s[0:1], v[8:9], 64 -; GFX940-NEXT: v_cndmask_b32_e64 v0, v12, v0, s[2:3] -; GFX940-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc -; GFX940-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc -; GFX940-NEXT: v_cndmask_b32_e64 v1, v13, v1, s[2:3] -; GFX940-NEXT: v_cndmask_b32_e64 v2, v2, v8, s[0:1] -; GFX940-NEXT: v_cmp_eq_f64_e64 s[2:3], 0, v[6:7] -; GFX940-NEXT: v_cndmask_b32_e64 v3, v3, v9, s[0:1] -; GFX940-NEXT: v_cmp_gt_f64_e32 vcc, v[4:5], v[10:11] -; GFX940-NEXT: v_cndmask_b32_e64 v2, v6, v2, s[2:3] -; GFX940-NEXT: v_cndmask_b32_e64 v3, v7, v3, s[2:3] -; GFX940-NEXT: v_cndmask_b32_e32 v7, v11, v5, vcc -; GFX940-NEXT: v_cndmask_b32_e32 v6, v10, v4, vcc -; GFX940-NEXT: v_cmp_class_f64_e64 vcc, v[4:5], 64 -; GFX940-NEXT: v_cmp_class_f64_e64 s[0:1], 
v[10:11], 64 -; GFX940-NEXT: v_cmp_eq_f64_e64 s[2:3], 0, v[6:7] -; GFX940-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc -; GFX940-NEXT: v_cndmask_b32_e32 v5, v7, v5, vcc -; GFX940-NEXT: v_cndmask_b32_e64 v4, v4, v10, s[0:1] -; GFX940-NEXT: v_cndmask_b32_e64 v5, v5, v11, s[0:1] -; GFX940-NEXT: v_cndmask_b32_e64 v4, v6, v4, s[2:3] -; GFX940-NEXT: v_cndmask_b32_e64 v5, v7, v5, s[2:3] +; GFX940-NEXT: v_max_f64 v[0:1], v[0:1], v[6:7] +; GFX940-NEXT: v_max_f64 v[2:3], v[2:3], v[8:9] +; GFX940-NEXT: v_max_f64 v[4:5], v[4:5], v[10:11] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_maximum_v3f64__nnan: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_cmp_gt_f64_e32 vcc_lo, v[0:1], v[6:7] -; GFX10-NEXT: v_cmp_gt_f64_e64 s4, v[2:3], v[8:9] -; GFX10-NEXT: v_cmp_gt_f64_e64 s5, v[4:5], v[10:11] -; GFX10-NEXT: v_cmp_class_f64_e64 s6, v[8:9], 64 -; GFX10-NEXT: v_cmp_class_f64_e64 s7, v[6:7], 64 -; GFX10-NEXT: v_cmp_class_f64_e64 s8, v[10:11], 64 -; GFX10-NEXT: v_cndmask_b32_e32 v13, v7, v1, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v15, v9, v3, s4 -; GFX10-NEXT: v_cndmask_b32_e32 v12, v6, v0, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v17, v11, v5, s5 -; GFX10-NEXT: v_cndmask_b32_e64 v14, v8, v2, s4 -; GFX10-NEXT: v_cndmask_b32_e64 v16, v10, v4, s5 -; GFX10-NEXT: v_cmp_class_f64_e64 vcc_lo, v[0:1], 64 -; GFX10-NEXT: v_cmp_class_f64_e64 s4, v[2:3], 64 -; GFX10-NEXT: v_cmp_class_f64_e64 s5, v[4:5], 64 -; GFX10-NEXT: v_cmp_eq_f64_e64 s9, 0, v[12:13] -; GFX10-NEXT: v_cmp_eq_f64_e64 s10, 0, v[14:15] -; GFX10-NEXT: v_cmp_eq_f64_e64 s11, 0, v[16:17] -; GFX10-NEXT: v_cndmask_b32_e32 v0, v12, v0, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v2, v14, v2, s4 -; GFX10-NEXT: v_cndmask_b32_e64 v4, v16, v4, s5 -; GFX10-NEXT: v_cndmask_b32_e32 v1, v13, v1, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v3, v15, v3, s4 -; GFX10-NEXT: v_cndmask_b32_e64 v5, v17, v5, s5 -; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, v6, s7 -; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, v8, s6 -; 
GFX10-NEXT: v_cndmask_b32_e64 v4, v4, v10, s8 -; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, v7, s7 -; GFX10-NEXT: v_cndmask_b32_e64 v3, v3, v9, s6 -; GFX10-NEXT: v_cndmask_b32_e64 v5, v5, v11, s8 -; GFX10-NEXT: v_cndmask_b32_e64 v0, v12, v0, s9 -; GFX10-NEXT: v_cndmask_b32_e64 v2, v14, v2, s10 -; GFX10-NEXT: v_cndmask_b32_e64 v4, v16, v4, s11 -; GFX10-NEXT: v_cndmask_b32_e64 v1, v13, v1, s9 -; GFX10-NEXT: v_cndmask_b32_e64 v3, v15, v3, s10 -; GFX10-NEXT: v_cndmask_b32_e64 v5, v17, v5, s11 +; GFX10-NEXT: v_max_f64 v[0:1], v[0:1], v[6:7] +; GFX10-NEXT: v_max_f64 v[2:3], v[2:3], v[8:9] +; GFX10-NEXT: v_max_f64 v[4:5], v[4:5], v[10:11] ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: v_maximum_v3f64__nnan: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_cmp_gt_f64_e32 vcc_lo, v[0:1], v[6:7] -; GFX11-NEXT: v_cmp_gt_f64_e64 s0, v[2:3], v[8:9] -; GFX11-NEXT: v_cmp_gt_f64_e64 s1, v[4:5], v[10:11] -; GFX11-NEXT: v_cmp_class_f64_e64 s2, v[8:9], 64 -; GFX11-NEXT: v_cmp_class_f64_e64 s3, v[6:7], 64 -; GFX11-NEXT: v_cmp_class_f64_e64 s4, v[10:11], 64 -; GFX11-NEXT: v_dual_cndmask_b32 v13, v7, v1 :: v_dual_cndmask_b32 v12, v6, v0 -; GFX11-NEXT: v_cndmask_b32_e64 v15, v9, v3, s0 -; GFX11-NEXT: v_cndmask_b32_e64 v17, v11, v5, s1 -; GFX11-NEXT: v_cndmask_b32_e64 v14, v8, v2, s0 -; GFX11-NEXT: v_cndmask_b32_e64 v16, v10, v4, s1 -; GFX11-NEXT: v_cmp_class_f64_e64 vcc_lo, v[0:1], 64 -; GFX11-NEXT: v_cmp_class_f64_e64 s0, v[2:3], 64 -; GFX11-NEXT: v_cmp_class_f64_e64 s1, v[4:5], 64 -; GFX11-NEXT: v_cmp_eq_f64_e64 s5, 0, v[12:13] -; GFX11-NEXT: v_cmp_eq_f64_e64 s6, 0, v[14:15] -; GFX11-NEXT: v_cmp_eq_f64_e64 s7, 0, v[16:17] -; GFX11-NEXT: v_dual_cndmask_b32 v0, v12, v0 :: v_dual_cndmask_b32 v1, v13, v1 -; GFX11-NEXT: v_cndmask_b32_e64 v2, v14, v2, s0 -; GFX11-NEXT: v_cndmask_b32_e64 v4, v16, v4, s1 -; GFX11-NEXT: v_cndmask_b32_e64 v3, v15, v3, s0 -; GFX11-NEXT: v_cndmask_b32_e64 v5, v17, v5, s1 -; GFX11-NEXT: v_cndmask_b32_e64 v0, v0, v6, s3 -; 
GFX11-NEXT: v_cndmask_b32_e64 v2, v2, v8, s2 -; GFX11-NEXT: v_cndmask_b32_e64 v4, v4, v10, s4 -; GFX11-NEXT: v_cndmask_b32_e64 v1, v1, v7, s3 -; GFX11-NEXT: v_cndmask_b32_e64 v3, v3, v9, s2 -; GFX11-NEXT: v_cndmask_b32_e64 v5, v5, v11, s4 -; GFX11-NEXT: v_cndmask_b32_e64 v0, v12, v0, s5 -; GFX11-NEXT: v_cndmask_b32_e64 v2, v14, v2, s6 -; GFX11-NEXT: v_cndmask_b32_e64 v4, v16, v4, s7 -; GFX11-NEXT: v_cndmask_b32_e64 v1, v13, v1, s5 -; GFX11-NEXT: v_cndmask_b32_e64 v3, v15, v3, s6 -; GFX11-NEXT: v_cndmask_b32_e64 v5, v17, v5, s7 +; GFX11-NEXT: v_max_f64 v[0:1], v[0:1], v[6:7] +; GFX11-NEXT: v_max_f64 v[2:3], v[2:3], v[8:9] +; GFX11-NEXT: v_max_f64 v[4:5], v[4:5], v[10:11] ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-LABEL: v_maximum_v3f64__nnan: @@ -2137,144 +1168,110 @@ define <3 x double> @v_maximum_v3f64__nsz(<3 x double> %src0, <3 x double> %src1 ; GFX7-LABEL: v_maximum_v3f64__nsz: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_cmp_gt_f64_e32 vcc, v[0:1], v[6:7] -; GFX7-NEXT: v_cmp_o_f64_e64 s[4:5], v[0:1], v[6:7] -; GFX7-NEXT: v_cmp_gt_f64_e64 s[8:9], v[4:5], v[10:11] -; GFX7-NEXT: v_cmp_o_f64_e64 s[6:7], v[2:3], v[8:9] -; GFX7-NEXT: v_cmp_o_f64_e64 s[10:11], v[4:5], v[10:11] -; GFX7-NEXT: v_cndmask_b32_e32 v12, v6, v0, vcc -; GFX7-NEXT: v_cndmask_b32_e32 v1, v7, v1, vcc -; GFX7-NEXT: v_cmp_gt_f64_e32 vcc, v[2:3], v[8:9] -; GFX7-NEXT: v_cndmask_b32_e64 v7, v10, v4, s[8:9] -; GFX7-NEXT: v_cndmask_b32_e64 v5, v11, v5, s[8:9] -; GFX7-NEXT: v_cndmask_b32_e64 v0, 0, v12, s[4:5] -; GFX7-NEXT: v_cndmask_b32_e64 v4, 0, v7, s[10:11] -; GFX7-NEXT: v_cndmask_b32_e32 v6, v8, v2, vcc -; GFX7-NEXT: v_mov_b32_e32 v8, 0x7ff80000 -; GFX7-NEXT: v_cndmask_b32_e32 v3, v9, v3, vcc -; GFX7-NEXT: v_cndmask_b32_e64 v1, v8, v1, s[4:5] -; GFX7-NEXT: v_cndmask_b32_e64 v2, 0, v6, s[6:7] -; GFX7-NEXT: v_cndmask_b32_e64 v3, v8, v3, s[6:7] -; GFX7-NEXT: v_cndmask_b32_e64 v5, v8, v5, s[10:11] +; GFX7-NEXT: v_max_f64 v[12:13], v[0:1], v[6:7] +; GFX7-NEXT: 
v_cmp_u_f64_e32 vcc, v[0:1], v[6:7] +; GFX7-NEXT: v_max_f64 v[6:7], v[2:3], v[8:9] +; GFX7-NEXT: v_cmp_u_f64_e64 s[4:5], v[2:3], v[8:9] +; GFX7-NEXT: v_max_f64 v[8:9], v[4:5], v[10:11] +; GFX7-NEXT: v_cmp_u_f64_e64 s[6:7], v[4:5], v[10:11] +; GFX7-NEXT: v_mov_b32_e32 v5, 0x7ff80000 +; GFX7-NEXT: v_cndmask_b32_e64 v0, v12, 0, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v1, v13, v5, vcc +; GFX7-NEXT: v_cndmask_b32_e64 v2, v6, 0, s[4:5] +; GFX7-NEXT: v_cndmask_b32_e64 v3, v7, v5, s[4:5] +; GFX7-NEXT: v_cndmask_b32_e64 v4, v8, 0, s[6:7] +; GFX7-NEXT: v_cndmask_b32_e64 v5, v9, v5, s[6:7] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_maximum_v3f64__nsz: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_cmp_gt_f64_e32 vcc, v[0:1], v[6:7] -; GFX8-NEXT: v_cmp_o_f64_e64 s[4:5], v[0:1], v[6:7] -; GFX8-NEXT: v_cmp_gt_f64_e64 s[8:9], v[4:5], v[10:11] -; GFX8-NEXT: v_cmp_o_f64_e64 s[6:7], v[2:3], v[8:9] -; GFX8-NEXT: v_cmp_o_f64_e64 s[10:11], v[4:5], v[10:11] -; GFX8-NEXT: v_cndmask_b32_e32 v12, v6, v0, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v1, v7, v1, vcc -; GFX8-NEXT: v_cmp_gt_f64_e32 vcc, v[2:3], v[8:9] -; GFX8-NEXT: v_cndmask_b32_e64 v7, v10, v4, s[8:9] -; GFX8-NEXT: v_cndmask_b32_e64 v5, v11, v5, s[8:9] -; GFX8-NEXT: v_cndmask_b32_e64 v0, 0, v12, s[4:5] -; GFX8-NEXT: v_cndmask_b32_e64 v4, 0, v7, s[10:11] -; GFX8-NEXT: v_cndmask_b32_e32 v6, v8, v2, vcc -; GFX8-NEXT: v_mov_b32_e32 v8, 0x7ff80000 -; GFX8-NEXT: v_cndmask_b32_e32 v3, v9, v3, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v1, v8, v1, s[4:5] -; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, v6, s[6:7] -; GFX8-NEXT: v_cndmask_b32_e64 v3, v8, v3, s[6:7] -; GFX8-NEXT: v_cndmask_b32_e64 v5, v8, v5, s[10:11] +; GFX8-NEXT: v_max_f64 v[12:13], v[0:1], v[6:7] +; GFX8-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[6:7] +; GFX8-NEXT: v_max_f64 v[6:7], v[2:3], v[8:9] +; GFX8-NEXT: v_cmp_u_f64_e64 s[4:5], v[2:3], v[8:9] +; GFX8-NEXT: v_max_f64 v[8:9], v[4:5], v[10:11] +; GFX8-NEXT: v_cmp_u_f64_e64 s[6:7], v[4:5], v[10:11] +; 
GFX8-NEXT: v_mov_b32_e32 v5, 0x7ff80000 +; GFX8-NEXT: v_cndmask_b32_e64 v0, v12, 0, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v1, v13, v5, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v2, v6, 0, s[4:5] +; GFX8-NEXT: v_cndmask_b32_e64 v3, v7, v5, s[4:5] +; GFX8-NEXT: v_cndmask_b32_e64 v4, v8, 0, s[6:7] +; GFX8-NEXT: v_cndmask_b32_e64 v5, v9, v5, s[6:7] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_maximum_v3f64__nsz: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_cmp_gt_f64_e32 vcc, v[0:1], v[6:7] -; GFX9-NEXT: v_cmp_o_f64_e64 s[4:5], v[0:1], v[6:7] -; GFX9-NEXT: v_cmp_gt_f64_e64 s[8:9], v[4:5], v[10:11] -; GFX9-NEXT: v_cmp_o_f64_e64 s[6:7], v[2:3], v[8:9] -; GFX9-NEXT: v_cmp_o_f64_e64 s[10:11], v[4:5], v[10:11] -; GFX9-NEXT: v_cndmask_b32_e32 v12, v6, v0, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v1, v7, v1, vcc -; GFX9-NEXT: v_cmp_gt_f64_e32 vcc, v[2:3], v[8:9] -; GFX9-NEXT: v_cndmask_b32_e64 v7, v10, v4, s[8:9] -; GFX9-NEXT: v_cndmask_b32_e64 v5, v11, v5, s[8:9] -; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, v12, s[4:5] -; GFX9-NEXT: v_cndmask_b32_e64 v4, 0, v7, s[10:11] -; GFX9-NEXT: v_cndmask_b32_e32 v6, v8, v2, vcc -; GFX9-NEXT: v_mov_b32_e32 v8, 0x7ff80000 -; GFX9-NEXT: v_cndmask_b32_e32 v3, v9, v3, vcc -; GFX9-NEXT: v_cndmask_b32_e64 v1, v8, v1, s[4:5] -; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, v6, s[6:7] -; GFX9-NEXT: v_cndmask_b32_e64 v3, v8, v3, s[6:7] -; GFX9-NEXT: v_cndmask_b32_e64 v5, v8, v5, s[10:11] +; GFX9-NEXT: v_max_f64 v[12:13], v[0:1], v[6:7] +; GFX9-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[6:7] +; GFX9-NEXT: v_max_f64 v[6:7], v[2:3], v[8:9] +; GFX9-NEXT: v_cmp_u_f64_e64 s[4:5], v[2:3], v[8:9] +; GFX9-NEXT: v_max_f64 v[8:9], v[4:5], v[10:11] +; GFX9-NEXT: v_cmp_u_f64_e64 s[6:7], v[4:5], v[10:11] +; GFX9-NEXT: v_mov_b32_e32 v5, 0x7ff80000 +; GFX9-NEXT: v_cndmask_b32_e64 v0, v12, 0, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v1, v13, v5, vcc +; GFX9-NEXT: v_cndmask_b32_e64 v2, v6, 0, s[4:5] +; GFX9-NEXT: v_cndmask_b32_e64 v3, v7, v5, s[4:5] +; 
GFX9-NEXT: v_cndmask_b32_e64 v4, v8, 0, s[6:7] +; GFX9-NEXT: v_cndmask_b32_e64 v5, v9, v5, s[6:7] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: v_maximum_v3f64__nsz: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_cmp_gt_f64_e32 vcc, v[0:1], v[6:7] -; GFX940-NEXT: v_cmp_o_f64_e64 s[0:1], v[0:1], v[6:7] -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e32 v12, v6, v0, vcc -; GFX940-NEXT: v_cndmask_b32_e32 v1, v7, v1, vcc -; GFX940-NEXT: v_mov_b32_e32 v6, 0x7ff80000 -; GFX940-NEXT: v_cmp_gt_f64_e32 vcc, v[2:3], v[8:9] -; GFX940-NEXT: v_cndmask_b32_e64 v0, 0, v12, s[0:1] -; GFX940-NEXT: v_cndmask_b32_e64 v1, v6, v1, s[0:1] -; GFX940-NEXT: v_cndmask_b32_e32 v7, v8, v2, vcc -; GFX940-NEXT: v_cmp_o_f64_e64 s[0:1], v[2:3], v[8:9] -; GFX940-NEXT: v_cndmask_b32_e32 v3, v9, v3, vcc -; GFX940-NEXT: v_cmp_gt_f64_e32 vcc, v[4:5], v[10:11] -; GFX940-NEXT: v_cndmask_b32_e64 v2, 0, v7, s[0:1] -; GFX940-NEXT: v_cndmask_b32_e64 v3, v6, v3, s[0:1] -; GFX940-NEXT: v_cndmask_b32_e32 v7, v10, v4, vcc -; GFX940-NEXT: v_cmp_o_f64_e64 s[0:1], v[4:5], v[10:11] -; GFX940-NEXT: v_cndmask_b32_e32 v5, v11, v5, vcc +; GFX940-NEXT: v_max_f64 v[12:13], v[0:1], v[6:7] +; GFX940-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[6:7] +; GFX940-NEXT: v_max_f64 v[6:7], v[2:3], v[8:9] ; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e64 v4, 0, v7, s[0:1] -; GFX940-NEXT: v_cndmask_b32_e64 v5, v6, v5, s[0:1] +; GFX940-NEXT: v_cndmask_b32_e64 v0, v12, 0, vcc +; GFX940-NEXT: v_mov_b32_e32 v12, 0x7ff80000 +; GFX940-NEXT: v_cndmask_b32_e32 v1, v13, v12, vcc +; GFX940-NEXT: v_cmp_u_f64_e32 vcc, v[2:3], v[8:9] +; GFX940-NEXT: s_nop 1 +; GFX940-NEXT: v_cndmask_b32_e64 v2, v6, 0, vcc +; GFX940-NEXT: v_cndmask_b32_e32 v3, v7, v12, vcc +; GFX940-NEXT: v_max_f64 v[6:7], v[4:5], v[10:11] +; GFX940-NEXT: v_cmp_u_f64_e32 vcc, v[4:5], v[10:11] +; GFX940-NEXT: s_nop 1 +; GFX940-NEXT: v_cndmask_b32_e64 v4, v6, 0, vcc +; GFX940-NEXT: v_cndmask_b32_e32 v5, v7, v12, vcc ; 
GFX940-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_maximum_v3f64__nsz: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_cmp_gt_f64_e32 vcc_lo, v[0:1], v[6:7] -; GFX10-NEXT: v_cmp_gt_f64_e64 s4, v[2:3], v[8:9] -; GFX10-NEXT: v_cmp_gt_f64_e64 s5, v[4:5], v[10:11] -; GFX10-NEXT: v_cmp_o_f64_e64 s6, v[0:1], v[6:7] -; GFX10-NEXT: v_cmp_o_f64_e64 s7, v[2:3], v[8:9] -; GFX10-NEXT: v_cmp_o_f64_e64 s8, v[4:5], v[10:11] -; GFX10-NEXT: v_cndmask_b32_e32 v12, v6, v0, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v6, v8, v2, s4 -; GFX10-NEXT: v_cndmask_b32_e64 v8, v10, v4, s5 -; GFX10-NEXT: v_cndmask_b32_e32 v1, v7, v1, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v3, v9, v3, s4 -; GFX10-NEXT: v_cndmask_b32_e64 v5, v11, v5, s5 -; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, v12, s6 -; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, v6, s7 -; GFX10-NEXT: v_cndmask_b32_e64 v4, 0, v8, s8 -; GFX10-NEXT: v_cndmask_b32_e64 v1, 0x7ff80000, v1, s6 -; GFX10-NEXT: v_cndmask_b32_e64 v3, 0x7ff80000, v3, s7 -; GFX10-NEXT: v_cndmask_b32_e64 v5, 0x7ff80000, v5, s8 +; GFX10-NEXT: v_max_f64 v[12:13], v[0:1], v[6:7] +; GFX10-NEXT: v_cmp_u_f64_e32 vcc_lo, v[0:1], v[6:7] +; GFX10-NEXT: v_max_f64 v[6:7], v[2:3], v[8:9] +; GFX10-NEXT: v_cmp_u_f64_e64 s4, v[2:3], v[8:9] +; GFX10-NEXT: v_max_f64 v[8:9], v[4:5], v[10:11] +; GFX10-NEXT: v_cmp_u_f64_e64 s5, v[4:5], v[10:11] +; GFX10-NEXT: v_cndmask_b32_e64 v0, v12, 0, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v1, v13, 0x7ff80000, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v2, v6, 0, s4 +; GFX10-NEXT: v_cndmask_b32_e64 v3, v7, 0x7ff80000, s4 +; GFX10-NEXT: v_cndmask_b32_e64 v4, v8, 0, s5 +; GFX10-NEXT: v_cndmask_b32_e64 v5, v9, 0x7ff80000, s5 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: v_maximum_v3f64__nsz: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_cmp_gt_f64_e32 vcc_lo, v[0:1], v[6:7] -; GFX11-NEXT: v_cmp_gt_f64_e64 s0, v[2:3], v[8:9] -; GFX11-NEXT: v_cmp_gt_f64_e64 s1, v[4:5], v[10:11] -; 
GFX11-NEXT: v_cmp_o_f64_e64 s2, v[0:1], v[6:7] -; GFX11-NEXT: v_cmp_o_f64_e64 s3, v[2:3], v[8:9] -; GFX11-NEXT: v_cmp_o_f64_e64 s4, v[4:5], v[10:11] -; GFX11-NEXT: v_dual_cndmask_b32 v12, v6, v0 :: v_dual_cndmask_b32 v1, v7, v1 -; GFX11-NEXT: v_cndmask_b32_e64 v6, v8, v2, s0 -; GFX11-NEXT: v_cndmask_b32_e64 v8, v10, v4, s1 -; GFX11-NEXT: v_cndmask_b32_e64 v3, v9, v3, s0 -; GFX11-NEXT: v_cndmask_b32_e64 v5, v11, v5, s1 -; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, v12, s2 -; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, v6, s3 -; GFX11-NEXT: v_cndmask_b32_e64 v4, 0, v8, s4 -; GFX11-NEXT: v_cndmask_b32_e64 v1, 0x7ff80000, v1, s2 -; GFX11-NEXT: v_cndmask_b32_e64 v3, 0x7ff80000, v3, s3 -; GFX11-NEXT: v_cndmask_b32_e64 v5, 0x7ff80000, v5, s4 +; GFX11-NEXT: v_max_f64 v[12:13], v[0:1], v[6:7] +; GFX11-NEXT: v_cmp_u_f64_e32 vcc_lo, v[0:1], v[6:7] +; GFX11-NEXT: v_max_f64 v[6:7], v[2:3], v[8:9] +; GFX11-NEXT: v_cmp_u_f64_e64 s0, v[2:3], v[8:9] +; GFX11-NEXT: v_max_f64 v[8:9], v[4:5], v[10:11] +; GFX11-NEXT: v_cmp_u_f64_e64 s1, v[4:5], v[10:11] +; GFX11-NEXT: v_cndmask_b32_e64 v0, v12, 0, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e64 v1, v13, 0x7ff80000, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e64 v2, v6, 0, s0 +; GFX11-NEXT: v_cndmask_b32_e64 v3, v7, 0x7ff80000, s0 +; GFX11-NEXT: v_cndmask_b32_e64 v4, v8, 0, s1 +; GFX11-NEXT: v_cndmask_b32_e64 v5, v9, 0x7ff80000, s1 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-LABEL: v_maximum_v3f64__nsz: @@ -2296,88 +1293,49 @@ define <3 x double> @v_maximum_v3f64__nnan_nsz(<3 x double> %src0, <3 x double> ; GFX7-LABEL: v_maximum_v3f64__nnan_nsz: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_cmp_gt_f64_e32 vcc, v[0:1], v[6:7] -; GFX7-NEXT: v_cmp_gt_f64_e64 s[4:5], v[2:3], v[8:9] -; GFX7-NEXT: v_cmp_gt_f64_e64 s[6:7], v[4:5], v[10:11] -; GFX7-NEXT: v_cndmask_b32_e32 v0, v6, v0, vcc -; GFX7-NEXT: v_cndmask_b32_e32 v1, v7, v1, vcc -; GFX7-NEXT: v_cndmask_b32_e64 v2, v8, v2, s[4:5] -; GFX7-NEXT: v_cndmask_b32_e64 v3, v9, v3, 
s[4:5] -; GFX7-NEXT: v_cndmask_b32_e64 v4, v10, v4, s[6:7] -; GFX7-NEXT: v_cndmask_b32_e64 v5, v11, v5, s[6:7] +; GFX7-NEXT: v_max_f64 v[0:1], v[0:1], v[6:7] +; GFX7-NEXT: v_max_f64 v[2:3], v[2:3], v[8:9] +; GFX7-NEXT: v_max_f64 v[4:5], v[4:5], v[10:11] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_maximum_v3f64__nnan_nsz: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_cmp_gt_f64_e32 vcc, v[0:1], v[6:7] -; GFX8-NEXT: v_cmp_gt_f64_e64 s[4:5], v[2:3], v[8:9] -; GFX8-NEXT: v_cmp_gt_f64_e64 s[6:7], v[4:5], v[10:11] -; GFX8-NEXT: v_cndmask_b32_e32 v0, v6, v0, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v1, v7, v1, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v2, v8, v2, s[4:5] -; GFX8-NEXT: v_cndmask_b32_e64 v3, v9, v3, s[4:5] -; GFX8-NEXT: v_cndmask_b32_e64 v4, v10, v4, s[6:7] -; GFX8-NEXT: v_cndmask_b32_e64 v5, v11, v5, s[6:7] +; GFX8-NEXT: v_max_f64 v[0:1], v[0:1], v[6:7] +; GFX8-NEXT: v_max_f64 v[2:3], v[2:3], v[8:9] +; GFX8-NEXT: v_max_f64 v[4:5], v[4:5], v[10:11] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_maximum_v3f64__nnan_nsz: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_cmp_gt_f64_e32 vcc, v[0:1], v[6:7] -; GFX9-NEXT: v_cmp_gt_f64_e64 s[4:5], v[2:3], v[8:9] -; GFX9-NEXT: v_cmp_gt_f64_e64 s[6:7], v[4:5], v[10:11] -; GFX9-NEXT: v_cndmask_b32_e32 v0, v6, v0, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v1, v7, v1, vcc -; GFX9-NEXT: v_cndmask_b32_e64 v2, v8, v2, s[4:5] -; GFX9-NEXT: v_cndmask_b32_e64 v3, v9, v3, s[4:5] -; GFX9-NEXT: v_cndmask_b32_e64 v4, v10, v4, s[6:7] -; GFX9-NEXT: v_cndmask_b32_e64 v5, v11, v5, s[6:7] +; GFX9-NEXT: v_max_f64 v[0:1], v[0:1], v[6:7] +; GFX9-NEXT: v_max_f64 v[2:3], v[2:3], v[8:9] +; GFX9-NEXT: v_max_f64 v[4:5], v[4:5], v[10:11] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: v_maximum_v3f64__nnan_nsz: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_cmp_gt_f64_e32 vcc, v[0:1], v[6:7] -; GFX940-NEXT: s_nop 1 -; 
GFX940-NEXT: v_cndmask_b32_e32 v0, v6, v0, vcc -; GFX940-NEXT: v_cndmask_b32_e32 v1, v7, v1, vcc -; GFX940-NEXT: v_cmp_gt_f64_e32 vcc, v[2:3], v[8:9] -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v2, v8, v2, vcc -; GFX940-NEXT: v_cndmask_b32_e32 v3, v9, v3, vcc -; GFX940-NEXT: v_cmp_gt_f64_e32 vcc, v[4:5], v[10:11] -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v4, v10, v4, vcc -; GFX940-NEXT: v_cndmask_b32_e32 v5, v11, v5, vcc +; GFX940-NEXT: v_max_f64 v[0:1], v[0:1], v[6:7] +; GFX940-NEXT: v_max_f64 v[2:3], v[2:3], v[8:9] +; GFX940-NEXT: v_max_f64 v[4:5], v[4:5], v[10:11] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_maximum_v3f64__nnan_nsz: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_cmp_gt_f64_e32 vcc_lo, v[0:1], v[6:7] -; GFX10-NEXT: v_cmp_gt_f64_e64 s4, v[2:3], v[8:9] -; GFX10-NEXT: v_cmp_gt_f64_e64 s5, v[4:5], v[10:11] -; GFX10-NEXT: v_cndmask_b32_e32 v0, v6, v0, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v2, v8, v2, s4 -; GFX10-NEXT: v_cndmask_b32_e64 v4, v10, v4, s5 -; GFX10-NEXT: v_cndmask_b32_e32 v1, v7, v1, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v3, v9, v3, s4 -; GFX10-NEXT: v_cndmask_b32_e64 v5, v11, v5, s5 +; GFX10-NEXT: v_max_f64 v[0:1], v[0:1], v[6:7] +; GFX10-NEXT: v_max_f64 v[2:3], v[2:3], v[8:9] +; GFX10-NEXT: v_max_f64 v[4:5], v[4:5], v[10:11] ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: v_maximum_v3f64__nnan_nsz: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_cmp_gt_f64_e32 vcc_lo, v[0:1], v[6:7] -; GFX11-NEXT: v_cmp_gt_f64_e64 s0, v[2:3], v[8:9] -; GFX11-NEXT: v_cmp_gt_f64_e64 s1, v[4:5], v[10:11] -; GFX11-NEXT: v_dual_cndmask_b32 v0, v6, v0 :: v_dual_cndmask_b32 v1, v7, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_cndmask_b32_e64 v2, v8, v2, s0 -; GFX11-NEXT: v_cndmask_b32_e64 v4, v10, v4, s1 -; GFX11-NEXT: v_cndmask_b32_e64 v3, v9, v3, s0 -; GFX11-NEXT: 
v_cndmask_b32_e64 v5, v11, v5, s1 +; GFX11-NEXT: v_max_f64 v[0:1], v[0:1], v[6:7] +; GFX11-NEXT: v_max_f64 v[2:3], v[2:3], v[8:9] +; GFX11-NEXT: v_max_f64 v[4:5], v[4:5], v[10:11] ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-LABEL: v_maximum_v3f64__nnan_nsz: @@ -2399,404 +1357,135 @@ define <4 x double> @v_maximum_v4f64(<4 x double> %src0, <4 x double> %src1) { ; GFX7-LABEL: v_maximum_v4f64: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_cmp_gt_f64_e32 vcc, v[0:1], v[8:9] -; GFX7-NEXT: v_cmp_o_f64_e64 s[4:5], v[0:1], v[8:9] -; GFX7-NEXT: v_mov_b32_e32 v18, 0x7ff80000 -; GFX7-NEXT: v_cmp_gt_f64_e64 s[6:7], v[2:3], v[10:11] -; GFX7-NEXT: v_cmp_o_f64_e64 s[8:9], v[2:3], v[10:11] -; GFX7-NEXT: v_cmp_o_f64_e64 s[12:13], v[4:5], v[12:13] -; GFX7-NEXT: v_cndmask_b32_e32 v16, v9, v1, vcc -; GFX7-NEXT: v_cndmask_b32_e64 v17, v18, v16, s[4:5] -; GFX7-NEXT: v_cndmask_b32_e32 v16, v8, v0, vcc -; GFX7-NEXT: v_cmp_class_f64_e64 vcc, v[0:1], 64 -; GFX7-NEXT: v_cndmask_b32_e64 v16, 0, v16, s[4:5] -; GFX7-NEXT: v_cmp_class_f64_e64 s[4:5], v[8:9], 64 -; GFX7-NEXT: v_cndmask_b32_e64 v19, v11, v3, s[6:7] -; GFX7-NEXT: v_cmp_eq_f64_e64 s[10:11], 0, v[16:17] -; GFX7-NEXT: v_cndmask_b32_e32 v0, v16, v0, vcc -; GFX7-NEXT: v_cndmask_b32_e32 v1, v17, v1, vcc -; GFX7-NEXT: v_cmp_class_f64_e64 vcc, v[2:3], 64 -; GFX7-NEXT: v_cndmask_b32_e64 v0, v0, v8, s[4:5] -; GFX7-NEXT: v_cndmask_b32_e64 v1, v1, v9, s[4:5] -; GFX7-NEXT: v_cmp_class_f64_e64 s[4:5], v[10:11], 64 -; GFX7-NEXT: v_cndmask_b32_e64 v8, v10, v2, s[6:7] -; GFX7-NEXT: v_cndmask_b32_e64 v9, v18, v19, s[8:9] -; GFX7-NEXT: v_cndmask_b32_e64 v8, 0, v8, s[8:9] -; GFX7-NEXT: v_cmp_gt_f64_e64 s[8:9], v[4:5], v[12:13] -; GFX7-NEXT: v_cndmask_b32_e32 v2, v8, v2, vcc -; GFX7-NEXT: v_cmp_eq_f64_e64 s[6:7], 0, v[8:9] -; GFX7-NEXT: v_cndmask_b32_e32 v3, v9, v3, vcc -; GFX7-NEXT: v_cmp_gt_f64_e32 vcc, v[6:7], v[14:15] -; GFX7-NEXT: v_cndmask_b32_e64 v2, v2, v10, s[4:5] -; GFX7-NEXT: v_cndmask_b32_e64 v3, v3, 
v11, s[4:5] -; GFX7-NEXT: v_cmp_o_f64_e64 s[4:5], v[6:7], v[14:15] -; GFX7-NEXT: v_cndmask_b32_e64 v10, v13, v5, s[8:9] -; GFX7-NEXT: v_cndmask_b32_e64 v0, v16, v0, s[10:11] -; GFX7-NEXT: v_cndmask_b32_e64 v3, v9, v3, s[6:7] -; GFX7-NEXT: v_cndmask_b32_e64 v9, v18, v10, s[12:13] -; GFX7-NEXT: v_cndmask_b32_e32 v10, v15, v7, vcc -; GFX7-NEXT: v_cndmask_b32_e64 v2, v8, v2, s[6:7] -; GFX7-NEXT: v_cndmask_b32_e64 v8, v12, v4, s[8:9] -; GFX7-NEXT: v_cmp_class_f64_e64 s[6:7], v[4:5], 64 -; GFX7-NEXT: v_cndmask_b32_e64 v11, v18, v10, s[4:5] -; GFX7-NEXT: v_cndmask_b32_e32 v10, v14, v6, vcc -; GFX7-NEXT: v_cmp_class_f64_e64 vcc, v[6:7], 64 -; GFX7-NEXT: v_cndmask_b32_e64 v8, 0, v8, s[12:13] -; GFX7-NEXT: v_cmp_class_f64_e64 s[8:9], v[12:13], 64 -; GFX7-NEXT: v_cndmask_b32_e64 v10, 0, v10, s[4:5] -; GFX7-NEXT: v_cmp_class_f64_e64 s[4:5], v[14:15], 64 -; GFX7-NEXT: v_cndmask_b32_e64 v1, v17, v1, s[10:11] -; GFX7-NEXT: v_cmp_eq_f64_e64 s[10:11], 0, v[8:9] -; GFX7-NEXT: v_cmp_eq_f64_e64 s[12:13], 0, v[10:11] -; GFX7-NEXT: v_cndmask_b32_e64 v4, v8, v4, s[6:7] -; GFX7-NEXT: v_cndmask_b32_e32 v6, v10, v6, vcc -; GFX7-NEXT: v_cndmask_b32_e64 v5, v9, v5, s[6:7] -; GFX7-NEXT: v_cndmask_b32_e32 v7, v11, v7, vcc -; GFX7-NEXT: v_cndmask_b32_e64 v4, v4, v12, s[8:9] -; GFX7-NEXT: v_cndmask_b32_e64 v6, v6, v14, s[4:5] -; GFX7-NEXT: v_cndmask_b32_e64 v5, v5, v13, s[8:9] -; GFX7-NEXT: v_cndmask_b32_e64 v7, v7, v15, s[4:5] -; GFX7-NEXT: v_cndmask_b32_e64 v4, v8, v4, s[10:11] -; GFX7-NEXT: v_cndmask_b32_e64 v5, v9, v5, s[10:11] -; GFX7-NEXT: v_cndmask_b32_e64 v6, v10, v6, s[12:13] -; GFX7-NEXT: v_cndmask_b32_e64 v7, v11, v7, s[12:13] +; GFX7-NEXT: v_max_f64 v[16:17], v[0:1], v[8:9] +; GFX7-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[8:9] +; GFX7-NEXT: v_max_f64 v[8:9], v[2:3], v[10:11] +; GFX7-NEXT: v_cmp_u_f64_e64 s[4:5], v[2:3], v[10:11] +; GFX7-NEXT: v_max_f64 v[10:11], v[4:5], v[12:13] +; GFX7-NEXT: v_cmp_u_f64_e64 s[6:7], v[4:5], v[12:13] +; GFX7-NEXT: v_max_f64 v[12:13], v[6:7], v[14:15] +; 
GFX7-NEXT: v_cmp_u_f64_e64 s[8:9], v[6:7], v[14:15] +; GFX7-NEXT: v_mov_b32_e32 v7, 0x7ff80000 +; GFX7-NEXT: v_cndmask_b32_e64 v0, v16, 0, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v1, v17, v7, vcc +; GFX7-NEXT: v_cndmask_b32_e64 v2, v8, 0, s[4:5] +; GFX7-NEXT: v_cndmask_b32_e64 v3, v9, v7, s[4:5] +; GFX7-NEXT: v_cndmask_b32_e64 v4, v10, 0, s[6:7] +; GFX7-NEXT: v_cndmask_b32_e64 v5, v11, v7, s[6:7] +; GFX7-NEXT: v_cndmask_b32_e64 v6, v12, 0, s[8:9] +; GFX7-NEXT: v_cndmask_b32_e64 v7, v13, v7, s[8:9] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_maximum_v4f64: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_cmp_gt_f64_e32 vcc, v[0:1], v[8:9] -; GFX8-NEXT: v_cmp_o_f64_e64 s[4:5], v[0:1], v[8:9] -; GFX8-NEXT: v_mov_b32_e32 v18, 0x7ff80000 -; GFX8-NEXT: v_cmp_gt_f64_e64 s[6:7], v[2:3], v[10:11] -; GFX8-NEXT: v_cmp_o_f64_e64 s[8:9], v[2:3], v[10:11] -; GFX8-NEXT: v_cmp_o_f64_e64 s[12:13], v[4:5], v[12:13] -; GFX8-NEXT: v_cndmask_b32_e32 v16, v9, v1, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v17, v18, v16, s[4:5] -; GFX8-NEXT: v_cndmask_b32_e32 v16, v8, v0, vcc -; GFX8-NEXT: v_cmp_class_f64_e64 vcc, v[0:1], 64 -; GFX8-NEXT: v_cndmask_b32_e64 v16, 0, v16, s[4:5] -; GFX8-NEXT: v_cmp_class_f64_e64 s[4:5], v[8:9], 64 -; GFX8-NEXT: v_cndmask_b32_e64 v19, v11, v3, s[6:7] -; GFX8-NEXT: v_cmp_eq_f64_e64 s[10:11], 0, v[16:17] -; GFX8-NEXT: v_cndmask_b32_e32 v0, v16, v0, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v1, v17, v1, vcc -; GFX8-NEXT: v_cmp_class_f64_e64 vcc, v[2:3], 64 -; GFX8-NEXT: v_cndmask_b32_e64 v0, v0, v8, s[4:5] -; GFX8-NEXT: v_cndmask_b32_e64 v1, v1, v9, s[4:5] -; GFX8-NEXT: v_cmp_class_f64_e64 s[4:5], v[10:11], 64 -; GFX8-NEXT: v_cndmask_b32_e64 v8, v10, v2, s[6:7] -; GFX8-NEXT: v_cndmask_b32_e64 v9, v18, v19, s[8:9] -; GFX8-NEXT: v_cndmask_b32_e64 v8, 0, v8, s[8:9] -; GFX8-NEXT: v_cmp_gt_f64_e64 s[8:9], v[4:5], v[12:13] -; GFX8-NEXT: v_cndmask_b32_e32 v2, v8, v2, vcc -; GFX8-NEXT: v_cmp_eq_f64_e64 s[6:7], 0, v[8:9] -; GFX8-NEXT: 
v_cndmask_b32_e32 v3, v9, v3, vcc -; GFX8-NEXT: v_cmp_gt_f64_e32 vcc, v[6:7], v[14:15] -; GFX8-NEXT: v_cndmask_b32_e64 v2, v2, v10, s[4:5] -; GFX8-NEXT: v_cndmask_b32_e64 v3, v3, v11, s[4:5] -; GFX8-NEXT: v_cmp_o_f64_e64 s[4:5], v[6:7], v[14:15] -; GFX8-NEXT: v_cndmask_b32_e64 v10, v13, v5, s[8:9] -; GFX8-NEXT: v_cndmask_b32_e64 v0, v16, v0, s[10:11] -; GFX8-NEXT: v_cndmask_b32_e64 v3, v9, v3, s[6:7] -; GFX8-NEXT: v_cndmask_b32_e64 v9, v18, v10, s[12:13] -; GFX8-NEXT: v_cndmask_b32_e32 v10, v15, v7, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v2, v8, v2, s[6:7] -; GFX8-NEXT: v_cndmask_b32_e64 v8, v12, v4, s[8:9] -; GFX8-NEXT: v_cmp_class_f64_e64 s[6:7], v[4:5], 64 -; GFX8-NEXT: v_cndmask_b32_e64 v11, v18, v10, s[4:5] -; GFX8-NEXT: v_cndmask_b32_e32 v10, v14, v6, vcc -; GFX8-NEXT: v_cmp_class_f64_e64 vcc, v[6:7], 64 -; GFX8-NEXT: v_cndmask_b32_e64 v8, 0, v8, s[12:13] -; GFX8-NEXT: v_cmp_class_f64_e64 s[8:9], v[12:13], 64 -; GFX8-NEXT: v_cndmask_b32_e64 v10, 0, v10, s[4:5] -; GFX8-NEXT: v_cmp_class_f64_e64 s[4:5], v[14:15], 64 -; GFX8-NEXT: v_cndmask_b32_e64 v1, v17, v1, s[10:11] -; GFX8-NEXT: v_cmp_eq_f64_e64 s[10:11], 0, v[8:9] -; GFX8-NEXT: v_cmp_eq_f64_e64 s[12:13], 0, v[10:11] -; GFX8-NEXT: v_cndmask_b32_e64 v4, v8, v4, s[6:7] -; GFX8-NEXT: v_cndmask_b32_e32 v6, v10, v6, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v5, v9, v5, s[6:7] -; GFX8-NEXT: v_cndmask_b32_e32 v7, v11, v7, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v4, v4, v12, s[8:9] -; GFX8-NEXT: v_cndmask_b32_e64 v6, v6, v14, s[4:5] -; GFX8-NEXT: v_cndmask_b32_e64 v5, v5, v13, s[8:9] -; GFX8-NEXT: v_cndmask_b32_e64 v7, v7, v15, s[4:5] -; GFX8-NEXT: v_cndmask_b32_e64 v4, v8, v4, s[10:11] -; GFX8-NEXT: v_cndmask_b32_e64 v5, v9, v5, s[10:11] -; GFX8-NEXT: v_cndmask_b32_e64 v6, v10, v6, s[12:13] -; GFX8-NEXT: v_cndmask_b32_e64 v7, v11, v7, s[12:13] +; GFX8-NEXT: v_max_f64 v[16:17], v[0:1], v[8:9] +; GFX8-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[8:9] +; GFX8-NEXT: v_max_f64 v[8:9], v[2:3], v[10:11] +; GFX8-NEXT: v_cmp_u_f64_e64 s[4:5], 
v[2:3], v[10:11] +; GFX8-NEXT: v_max_f64 v[10:11], v[4:5], v[12:13] +; GFX8-NEXT: v_cmp_u_f64_e64 s[6:7], v[4:5], v[12:13] +; GFX8-NEXT: v_max_f64 v[12:13], v[6:7], v[14:15] +; GFX8-NEXT: v_cmp_u_f64_e64 s[8:9], v[6:7], v[14:15] +; GFX8-NEXT: v_mov_b32_e32 v7, 0x7ff80000 +; GFX8-NEXT: v_cndmask_b32_e64 v0, v16, 0, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v1, v17, v7, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v2, v8, 0, s[4:5] +; GFX8-NEXT: v_cndmask_b32_e64 v3, v9, v7, s[4:5] +; GFX8-NEXT: v_cndmask_b32_e64 v4, v10, 0, s[6:7] +; GFX8-NEXT: v_cndmask_b32_e64 v5, v11, v7, s[6:7] +; GFX8-NEXT: v_cndmask_b32_e64 v6, v12, 0, s[8:9] +; GFX8-NEXT: v_cndmask_b32_e64 v7, v13, v7, s[8:9] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_maximum_v4f64: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_cmp_gt_f64_e32 vcc, v[0:1], v[8:9] -; GFX9-NEXT: v_cmp_o_f64_e64 s[4:5], v[0:1], v[8:9] -; GFX9-NEXT: v_mov_b32_e32 v18, 0x7ff80000 -; GFX9-NEXT: v_cmp_gt_f64_e64 s[6:7], v[2:3], v[10:11] -; GFX9-NEXT: v_cmp_o_f64_e64 s[8:9], v[2:3], v[10:11] -; GFX9-NEXT: v_cmp_o_f64_e64 s[12:13], v[4:5], v[12:13] -; GFX9-NEXT: v_cndmask_b32_e32 v16, v9, v1, vcc -; GFX9-NEXT: v_cndmask_b32_e64 v17, v18, v16, s[4:5] -; GFX9-NEXT: v_cndmask_b32_e32 v16, v8, v0, vcc -; GFX9-NEXT: v_cmp_class_f64_e64 vcc, v[0:1], 64 -; GFX9-NEXT: v_cndmask_b32_e64 v16, 0, v16, s[4:5] -; GFX9-NEXT: v_cmp_class_f64_e64 s[4:5], v[8:9], 64 -; GFX9-NEXT: v_cndmask_b32_e64 v19, v11, v3, s[6:7] -; GFX9-NEXT: v_cmp_eq_f64_e64 s[10:11], 0, v[16:17] -; GFX9-NEXT: v_cndmask_b32_e32 v0, v16, v0, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v1, v17, v1, vcc -; GFX9-NEXT: v_cmp_class_f64_e64 vcc, v[2:3], 64 -; GFX9-NEXT: v_cndmask_b32_e64 v0, v0, v8, s[4:5] -; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, v9, s[4:5] -; GFX9-NEXT: v_cmp_class_f64_e64 s[4:5], v[10:11], 64 -; GFX9-NEXT: v_cndmask_b32_e64 v8, v10, v2, s[6:7] -; GFX9-NEXT: v_cndmask_b32_e64 v9, v18, v19, s[8:9] -; GFX9-NEXT: v_cndmask_b32_e64 v8, 0, v8, 
s[8:9] -; GFX9-NEXT: v_cmp_gt_f64_e64 s[8:9], v[4:5], v[12:13] -; GFX9-NEXT: v_cndmask_b32_e32 v2, v8, v2, vcc -; GFX9-NEXT: v_cmp_eq_f64_e64 s[6:7], 0, v[8:9] -; GFX9-NEXT: v_cndmask_b32_e32 v3, v9, v3, vcc -; GFX9-NEXT: v_cmp_gt_f64_e32 vcc, v[6:7], v[14:15] -; GFX9-NEXT: v_cndmask_b32_e64 v2, v2, v10, s[4:5] -; GFX9-NEXT: v_cndmask_b32_e64 v3, v3, v11, s[4:5] -; GFX9-NEXT: v_cmp_o_f64_e64 s[4:5], v[6:7], v[14:15] -; GFX9-NEXT: v_cndmask_b32_e64 v10, v13, v5, s[8:9] -; GFX9-NEXT: v_cndmask_b32_e64 v0, v16, v0, s[10:11] -; GFX9-NEXT: v_cndmask_b32_e64 v3, v9, v3, s[6:7] -; GFX9-NEXT: v_cndmask_b32_e64 v9, v18, v10, s[12:13] -; GFX9-NEXT: v_cndmask_b32_e32 v10, v15, v7, vcc -; GFX9-NEXT: v_cndmask_b32_e64 v2, v8, v2, s[6:7] -; GFX9-NEXT: v_cndmask_b32_e64 v8, v12, v4, s[8:9] -; GFX9-NEXT: v_cmp_class_f64_e64 s[6:7], v[4:5], 64 -; GFX9-NEXT: v_cndmask_b32_e64 v11, v18, v10, s[4:5] -; GFX9-NEXT: v_cndmask_b32_e32 v10, v14, v6, vcc -; GFX9-NEXT: v_cmp_class_f64_e64 vcc, v[6:7], 64 -; GFX9-NEXT: v_cndmask_b32_e64 v8, 0, v8, s[12:13] -; GFX9-NEXT: v_cmp_class_f64_e64 s[8:9], v[12:13], 64 -; GFX9-NEXT: v_cndmask_b32_e64 v10, 0, v10, s[4:5] -; GFX9-NEXT: v_cmp_class_f64_e64 s[4:5], v[14:15], 64 -; GFX9-NEXT: v_cndmask_b32_e64 v1, v17, v1, s[10:11] -; GFX9-NEXT: v_cmp_eq_f64_e64 s[10:11], 0, v[8:9] -; GFX9-NEXT: v_cmp_eq_f64_e64 s[12:13], 0, v[10:11] -; GFX9-NEXT: v_cndmask_b32_e64 v4, v8, v4, s[6:7] -; GFX9-NEXT: v_cndmask_b32_e32 v6, v10, v6, vcc -; GFX9-NEXT: v_cndmask_b32_e64 v5, v9, v5, s[6:7] -; GFX9-NEXT: v_cndmask_b32_e32 v7, v11, v7, vcc -; GFX9-NEXT: v_cndmask_b32_e64 v4, v4, v12, s[8:9] -; GFX9-NEXT: v_cndmask_b32_e64 v6, v6, v14, s[4:5] -; GFX9-NEXT: v_cndmask_b32_e64 v5, v5, v13, s[8:9] -; GFX9-NEXT: v_cndmask_b32_e64 v7, v7, v15, s[4:5] -; GFX9-NEXT: v_cndmask_b32_e64 v4, v8, v4, s[10:11] -; GFX9-NEXT: v_cndmask_b32_e64 v5, v9, v5, s[10:11] -; GFX9-NEXT: v_cndmask_b32_e64 v6, v10, v6, s[12:13] -; GFX9-NEXT: v_cndmask_b32_e64 v7, v11, v7, s[12:13] +; 
GFX9-NEXT: v_max_f64 v[16:17], v[0:1], v[8:9] +; GFX9-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[8:9] +; GFX9-NEXT: v_max_f64 v[8:9], v[2:3], v[10:11] +; GFX9-NEXT: v_cmp_u_f64_e64 s[4:5], v[2:3], v[10:11] +; GFX9-NEXT: v_max_f64 v[10:11], v[4:5], v[12:13] +; GFX9-NEXT: v_cmp_u_f64_e64 s[6:7], v[4:5], v[12:13] +; GFX9-NEXT: v_max_f64 v[12:13], v[6:7], v[14:15] +; GFX9-NEXT: v_cmp_u_f64_e64 s[8:9], v[6:7], v[14:15] +; GFX9-NEXT: v_mov_b32_e32 v7, 0x7ff80000 +; GFX9-NEXT: v_cndmask_b32_e64 v0, v16, 0, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v1, v17, v7, vcc +; GFX9-NEXT: v_cndmask_b32_e64 v2, v8, 0, s[4:5] +; GFX9-NEXT: v_cndmask_b32_e64 v3, v9, v7, s[4:5] +; GFX9-NEXT: v_cndmask_b32_e64 v4, v10, 0, s[6:7] +; GFX9-NEXT: v_cndmask_b32_e64 v5, v11, v7, s[6:7] +; GFX9-NEXT: v_cndmask_b32_e64 v6, v12, 0, s[8:9] +; GFX9-NEXT: v_cndmask_b32_e64 v7, v13, v7, s[8:9] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: v_maximum_v4f64: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_cmp_gt_f64_e32 vcc, v[0:1], v[8:9] -; GFX940-NEXT: v_mov_b32_e32 v18, 0x7ff80000 -; GFX940-NEXT: v_cmp_o_f64_e64 s[0:1], v[0:1], v[8:9] -; GFX940-NEXT: v_cndmask_b32_e32 v16, v9, v1, vcc -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e64 v17, v18, v16, s[0:1] -; GFX940-NEXT: v_cndmask_b32_e32 v16, v8, v0, vcc -; GFX940-NEXT: v_cndmask_b32_e64 v16, 0, v16, s[0:1] -; GFX940-NEXT: v_cmp_class_f64_e64 vcc, v[0:1], 64 -; GFX940-NEXT: v_cmp_class_f64_e64 s[0:1], v[8:9], 64 -; GFX940-NEXT: v_cmp_eq_f64_e64 s[2:3], 0, v[16:17] -; GFX940-NEXT: v_cndmask_b32_e32 v0, v16, v0, vcc -; GFX940-NEXT: v_cndmask_b32_e32 v1, v17, v1, vcc -; GFX940-NEXT: v_cmp_gt_f64_e32 vcc, v[2:3], v[10:11] -; GFX940-NEXT: v_cndmask_b32_e64 v0, v0, v8, s[0:1] -; GFX940-NEXT: v_cndmask_b32_e64 v1, v1, v9, s[0:1] -; GFX940-NEXT: v_cndmask_b32_e32 v8, v11, v3, vcc -; GFX940-NEXT: v_cmp_o_f64_e64 s[0:1], v[2:3], v[10:11] -; GFX940-NEXT: v_cndmask_b32_e64 v0, v16, v0, s[2:3] -; GFX940-NEXT: 
v_cndmask_b32_e64 v1, v17, v1, s[2:3] -; GFX940-NEXT: v_cndmask_b32_e64 v9, v18, v8, s[0:1] -; GFX940-NEXT: v_cndmask_b32_e32 v8, v10, v2, vcc -; GFX940-NEXT: v_cndmask_b32_e64 v8, 0, v8, s[0:1] -; GFX940-NEXT: v_cmp_class_f64_e64 vcc, v[2:3], 64 -; GFX940-NEXT: v_cmp_class_f64_e64 s[0:1], v[10:11], 64 -; GFX940-NEXT: v_cmp_eq_f64_e64 s[2:3], 0, v[8:9] -; GFX940-NEXT: v_cndmask_b32_e32 v2, v8, v2, vcc -; GFX940-NEXT: v_cndmask_b32_e64 v2, v2, v10, s[0:1] -; GFX940-NEXT: v_cndmask_b32_e32 v3, v9, v3, vcc -; GFX940-NEXT: v_cmp_gt_f64_e32 vcc, v[4:5], v[12:13] -; GFX940-NEXT: v_cndmask_b32_e64 v2, v8, v2, s[2:3] -; GFX940-NEXT: v_cndmask_b32_e64 v3, v3, v11, s[0:1] -; GFX940-NEXT: v_cndmask_b32_e32 v8, v13, v5, vcc -; GFX940-NEXT: v_cmp_o_f64_e64 s[0:1], v[4:5], v[12:13] -; GFX940-NEXT: v_cndmask_b32_e64 v3, v9, v3, s[2:3] +; GFX940-NEXT: v_max_f64 v[16:17], v[0:1], v[8:9] +; GFX940-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[8:9] +; GFX940-NEXT: v_max_f64 v[8:9], v[2:3], v[10:11] ; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e64 v9, v18, v8, s[0:1] -; GFX940-NEXT: v_cndmask_b32_e32 v8, v12, v4, vcc -; GFX940-NEXT: v_cndmask_b32_e64 v8, 0, v8, s[0:1] -; GFX940-NEXT: v_cmp_class_f64_e64 vcc, v[4:5], 64 -; GFX940-NEXT: v_cmp_class_f64_e64 s[0:1], v[12:13], 64 -; GFX940-NEXT: v_cmp_eq_f64_e64 s[2:3], 0, v[8:9] -; GFX940-NEXT: v_cndmask_b32_e32 v4, v8, v4, vcc -; GFX940-NEXT: v_cndmask_b32_e64 v4, v4, v12, s[0:1] -; GFX940-NEXT: v_cndmask_b32_e32 v5, v9, v5, vcc -; GFX940-NEXT: v_cmp_gt_f64_e32 vcc, v[6:7], v[14:15] -; GFX940-NEXT: v_cndmask_b32_e64 v4, v8, v4, s[2:3] -; GFX940-NEXT: v_cndmask_b32_e64 v5, v5, v13, s[0:1] -; GFX940-NEXT: v_cndmask_b32_e32 v8, v15, v7, vcc -; GFX940-NEXT: v_cmp_o_f64_e64 s[0:1], v[6:7], v[14:15] -; GFX940-NEXT: v_cndmask_b32_e64 v5, v9, v5, s[2:3] -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e64 v9, v18, v8, s[0:1] -; GFX940-NEXT: v_cndmask_b32_e32 v8, v14, v6, vcc -; GFX940-NEXT: v_cndmask_b32_e64 v8, 0, v8, s[0:1] -; GFX940-NEXT: 
v_cmp_class_f64_e64 vcc, v[6:7], 64 -; GFX940-NEXT: v_cmp_class_f64_e64 s[0:1], v[14:15], 64 -; GFX940-NEXT: v_cmp_eq_f64_e64 s[2:3], 0, v[8:9] -; GFX940-NEXT: v_cndmask_b32_e32 v6, v8, v6, vcc -; GFX940-NEXT: v_cndmask_b32_e32 v7, v9, v7, vcc -; GFX940-NEXT: v_cndmask_b32_e64 v6, v6, v14, s[0:1] -; GFX940-NEXT: v_cndmask_b32_e64 v7, v7, v15, s[0:1] -; GFX940-NEXT: v_cndmask_b32_e64 v6, v8, v6, s[2:3] -; GFX940-NEXT: v_cndmask_b32_e64 v7, v9, v7, s[2:3] +; GFX940-NEXT: v_cndmask_b32_e64 v0, v16, 0, vcc +; GFX940-NEXT: v_mov_b32_e32 v16, 0x7ff80000 +; GFX940-NEXT: v_cndmask_b32_e32 v1, v17, v16, vcc +; GFX940-NEXT: v_cmp_u_f64_e32 vcc, v[2:3], v[10:11] +; GFX940-NEXT: s_nop 1 +; GFX940-NEXT: v_cndmask_b32_e64 v2, v8, 0, vcc +; GFX940-NEXT: v_cndmask_b32_e32 v3, v9, v16, vcc +; GFX940-NEXT: v_max_f64 v[8:9], v[4:5], v[12:13] +; GFX940-NEXT: v_cmp_u_f64_e32 vcc, v[4:5], v[12:13] +; GFX940-NEXT: s_nop 1 +; GFX940-NEXT: v_cndmask_b32_e64 v4, v8, 0, vcc +; GFX940-NEXT: v_cndmask_b32_e32 v5, v9, v16, vcc +; GFX940-NEXT: v_max_f64 v[8:9], v[6:7], v[14:15] +; GFX940-NEXT: v_cmp_u_f64_e32 vcc, v[6:7], v[14:15] +; GFX940-NEXT: s_nop 1 +; GFX940-NEXT: v_cndmask_b32_e64 v6, v8, 0, vcc +; GFX940-NEXT: v_cndmask_b32_e32 v7, v9, v16, vcc ; GFX940-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_maximum_v4f64: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_cmp_gt_f64_e32 vcc_lo, v[0:1], v[8:9] -; GFX10-NEXT: v_cmp_gt_f64_e64 s4, v[2:3], v[10:11] -; GFX10-NEXT: v_cmp_o_f64_e64 s5, v[0:1], v[8:9] -; GFX10-NEXT: v_cmp_gt_f64_e64 s6, v[4:5], v[12:13] -; GFX10-NEXT: v_cmp_o_f64_e64 s7, v[2:3], v[10:11] -; GFX10-NEXT: v_cmp_gt_f64_e64 s8, v[6:7], v[14:15] -; GFX10-NEXT: v_cmp_o_f64_e64 s9, v[4:5], v[12:13] -; GFX10-NEXT: v_cmp_o_f64_e64 s10, v[6:7], v[14:15] -; GFX10-NEXT: v_cmp_class_f64_e64 s11, v[14:15], 64 -; GFX10-NEXT: v_cndmask_b32_e32 v16, v9, v1, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v18, v11, v3, s4 -; GFX10-NEXT: v_cndmask_b32_e32 v19, 
v8, v0, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v21, v10, v2, s4 -; GFX10-NEXT: v_cndmask_b32_e64 v20, v13, v5, s6 -; GFX10-NEXT: v_cndmask_b32_e64 v22, v15, v7, s8 -; GFX10-NEXT: v_cndmask_b32_e64 v17, 0x7ff80000, v16, s5 -; GFX10-NEXT: v_cndmask_b32_e64 v16, 0, v19, s5 -; GFX10-NEXT: v_cndmask_b32_e64 v19, 0x7ff80000, v18, s7 -; GFX10-NEXT: v_cndmask_b32_e64 v23, v12, v4, s6 -; GFX10-NEXT: v_cndmask_b32_e64 v18, 0, v21, s7 -; GFX10-NEXT: v_cmp_class_f64_e64 vcc_lo, v[0:1], 64 -; GFX10-NEXT: v_cndmask_b32_e64 v24, v14, v6, s8 -; GFX10-NEXT: v_cmp_class_f64_e64 s4, v[2:3], 64 -; GFX10-NEXT: v_cmp_class_f64_e64 s7, v[4:5], 64 -; GFX10-NEXT: v_cmp_class_f64_e64 s8, v[6:7], 64 -; GFX10-NEXT: v_cmp_class_f64_e64 s5, v[8:9], 64 -; GFX10-NEXT: v_cndmask_b32_e64 v21, 0x7ff80000, v20, s9 -; GFX10-NEXT: v_cndmask_b32_e64 v20, 0, v23, s9 -; GFX10-NEXT: v_cndmask_b32_e64 v23, 0x7ff80000, v22, s10 -; GFX10-NEXT: v_cndmask_b32_e64 v22, 0, v24, s10 -; GFX10-NEXT: v_cmp_class_f64_e64 s9, v[10:11], 64 -; GFX10-NEXT: v_cmp_class_f64_e64 s10, v[12:13], 64 -; GFX10-NEXT: v_cmp_eq_f64_e64 s6, 0, v[16:17] -; GFX10-NEXT: v_cmp_eq_f64_e64 s12, 0, v[18:19] -; GFX10-NEXT: v_cmp_eq_f64_e64 s13, 0, v[20:21] -; GFX10-NEXT: v_cmp_eq_f64_e64 s14, 0, v[22:23] -; GFX10-NEXT: v_cndmask_b32_e32 v0, v16, v0, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e32 v1, v17, v1, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v2, v18, v2, s4 -; GFX10-NEXT: v_cndmask_b32_e64 v4, v20, v4, s7 -; GFX10-NEXT: v_cndmask_b32_e64 v6, v22, v6, s8 -; GFX10-NEXT: v_cndmask_b32_e64 v3, v19, v3, s4 -; GFX10-NEXT: v_cndmask_b32_e64 v5, v21, v5, s7 -; GFX10-NEXT: v_cndmask_b32_e64 v7, v23, v7, s8 -; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, v8, s5 -; GFX10-NEXT: v_cndmask_b32_e64 v6, v6, v14, s11 -; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, v10, s9 -; GFX10-NEXT: v_cndmask_b32_e64 v4, v4, v12, s10 -; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, v9, s5 -; GFX10-NEXT: v_cndmask_b32_e64 v3, v3, v11, s9 -; GFX10-NEXT: v_cndmask_b32_e64 v5, v5, v13, s10 -; 
GFX10-NEXT: v_cndmask_b32_e64 v7, v7, v15, s11 -; GFX10-NEXT: v_cndmask_b32_e64 v0, v16, v0, s6 -; GFX10-NEXT: v_cndmask_b32_e64 v1, v17, v1, s6 -; GFX10-NEXT: v_cndmask_b32_e64 v2, v18, v2, s12 -; GFX10-NEXT: v_cndmask_b32_e64 v3, v19, v3, s12 -; GFX10-NEXT: v_cndmask_b32_e64 v4, v20, v4, s13 -; GFX10-NEXT: v_cndmask_b32_e64 v5, v21, v5, s13 -; GFX10-NEXT: v_cndmask_b32_e64 v6, v22, v6, s14 -; GFX10-NEXT: v_cndmask_b32_e64 v7, v23, v7, s14 +; GFX10-NEXT: v_max_f64 v[16:17], v[0:1], v[8:9] +; GFX10-NEXT: v_cmp_u_f64_e32 vcc_lo, v[0:1], v[8:9] +; GFX10-NEXT: v_max_f64 v[8:9], v[2:3], v[10:11] +; GFX10-NEXT: v_cmp_u_f64_e64 s4, v[2:3], v[10:11] +; GFX10-NEXT: v_max_f64 v[10:11], v[4:5], v[12:13] +; GFX10-NEXT: v_cmp_u_f64_e64 s5, v[4:5], v[12:13] +; GFX10-NEXT: v_max_f64 v[12:13], v[6:7], v[14:15] +; GFX10-NEXT: v_cmp_u_f64_e64 s6, v[6:7], v[14:15] +; GFX10-NEXT: v_cndmask_b32_e64 v0, v16, 0, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v1, v17, 0x7ff80000, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v2, v8, 0, s4 +; GFX10-NEXT: v_cndmask_b32_e64 v3, v9, 0x7ff80000, s4 +; GFX10-NEXT: v_cndmask_b32_e64 v4, v10, 0, s5 +; GFX10-NEXT: v_cndmask_b32_e64 v5, v11, 0x7ff80000, s5 +; GFX10-NEXT: v_cndmask_b32_e64 v6, v12, 0, s6 +; GFX10-NEXT: v_cndmask_b32_e64 v7, v13, 0x7ff80000, s6 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: v_maximum_v4f64: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_cmp_gt_f64_e32 vcc_lo, v[0:1], v[8:9] -; GFX11-NEXT: v_cmp_o_f64_e64 s0, v[0:1], v[8:9] -; GFX11-NEXT: v_cmp_gt_f64_e64 s1, v[2:3], v[10:11] -; GFX11-NEXT: v_cmp_gt_f64_e64 s2, v[4:5], v[12:13] -; GFX11-NEXT: v_cmp_gt_f64_e64 s3, v[6:7], v[14:15] -; GFX11-NEXT: v_cmp_o_f64_e64 s4, v[2:3], v[10:11] -; GFX11-NEXT: v_cmp_o_f64_e64 s5, v[4:5], v[12:13] -; GFX11-NEXT: v_cmp_o_f64_e64 s6, v[6:7], v[14:15] -; GFX11-NEXT: v_cndmask_b32_e32 v16, v9, v1, vcc_lo -; GFX11-NEXT: v_cndmask_b32_e64 v18, v11, v3, s1 -; GFX11-NEXT: v_cndmask_b32_e64 v20, v13, v5, s2 
-; GFX11-NEXT: v_cndmask_b32_e64 v22, v15, v7, s3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-NEXT: v_cndmask_b32_e64 v17, 0x7ff80000, v16, s0 -; GFX11-NEXT: v_cndmask_b32_e32 v16, v8, v0, vcc_lo -; GFX11-NEXT: v_cmp_class_f64_e64 vcc_lo, v[0:1], 64 -; GFX11-NEXT: v_cndmask_b32_e64 v19, 0x7ff80000, v18, s4 -; GFX11-NEXT: v_cndmask_b32_e64 v21, 0x7ff80000, v20, s5 -; GFX11-NEXT: v_cndmask_b32_e64 v18, v10, v2, s1 -; GFX11-NEXT: v_cndmask_b32_e64 v20, v12, v4, s2 -; GFX11-NEXT: v_cndmask_b32_e64 v24, v14, v6, s3 -; GFX11-NEXT: v_cmp_class_f64_e64 s1, v[4:5], 64 -; GFX11-NEXT: v_cmp_class_f64_e64 s3, v[6:7], 64 -; GFX11-NEXT: v_cndmask_b32_e64 v23, 0x7ff80000, v22, s6 -; GFX11-NEXT: v_cndmask_b32_e64 v18, 0, v18, s4 -; GFX11-NEXT: v_cndmask_b32_e64 v20, 0, v20, s5 -; GFX11-NEXT: v_cndmask_b32_e64 v22, 0, v24, s6 -; GFX11-NEXT: v_cmp_class_f64_e64 s2, v[12:13], 64 -; GFX11-NEXT: v_cmp_class_f64_e64 s4, v[14:15], 64 -; GFX11-NEXT: v_cmp_class_f64_e64 s5, v[8:9], 64 -; GFX11-NEXT: v_cmp_class_f64_e64 s6, v[10:11], 64 -; GFX11-NEXT: v_cmp_eq_f64_e64 s8, 0, v[18:19] -; GFX11-NEXT: v_cmp_eq_f64_e64 s9, 0, v[20:21] -; GFX11-NEXT: v_cmp_eq_f64_e64 s10, 0, v[22:23] -; GFX11-NEXT: v_cndmask_b32_e32 v1, v17, v1, vcc_lo -; GFX11-NEXT: v_cndmask_b32_e64 v16, 0, v16, s0 -; GFX11-NEXT: v_cmp_class_f64_e64 s0, v[2:3], 64 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-NEXT: v_cmp_eq_f64_e64 s7, 0, v[16:17] -; GFX11-NEXT: v_cndmask_b32_e32 v0, v16, v0, vcc_lo -; GFX11-NEXT: v_cndmask_b32_e64 v4, v20, v4, s1 -; GFX11-NEXT: v_cndmask_b32_e64 v6, v22, v6, s3 -; GFX11-NEXT: v_cndmask_b32_e64 v5, v21, v5, s1 -; GFX11-NEXT: v_cndmask_b32_e64 v7, v23, v7, s3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-NEXT: v_cndmask_b32_e64 v4, v4, v12, s2 -; GFX11-NEXT: v_cndmask_b32_e64 v0, v0, v8, s5 -; GFX11-NEXT: v_cndmask_b32_e64 v6, v6, v14, s4 -; GFX11-NEXT: v_cndmask_b32_e64 v1, v1, v9, s5 -; GFX11-NEXT: v_cndmask_b32_e64 v5, v5, v13, s2 -; GFX11-NEXT: 
v_cndmask_b32_e64 v7, v7, v15, s4 -; GFX11-NEXT: v_cndmask_b32_e64 v4, v20, v4, s9 -; GFX11-NEXT: v_cndmask_b32_e64 v6, v22, v6, s10 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_cndmask_b32_e64 v5, v21, v5, s9 -; GFX11-NEXT: v_cndmask_b32_e64 v7, v23, v7, s10 -; GFX11-NEXT: v_cndmask_b32_e64 v2, v18, v2, s0 -; GFX11-NEXT: v_cndmask_b32_e64 v3, v19, v3, s0 -; GFX11-NEXT: v_cndmask_b32_e64 v0, v16, v0, s7 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_cndmask_b32_e64 v2, v2, v10, s6 -; GFX11-NEXT: v_cndmask_b32_e64 v3, v3, v11, s6 -; GFX11-NEXT: v_cndmask_b32_e64 v1, v17, v1, s7 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_cndmask_b32_e64 v2, v18, v2, s8 -; GFX11-NEXT: v_cndmask_b32_e64 v3, v19, v3, s8 +; GFX11-NEXT: v_max_f64 v[16:17], v[0:1], v[8:9] +; GFX11-NEXT: v_cmp_u_f64_e32 vcc_lo, v[0:1], v[8:9] +; GFX11-NEXT: v_max_f64 v[8:9], v[2:3], v[10:11] +; GFX11-NEXT: v_cmp_u_f64_e64 s0, v[2:3], v[10:11] +; GFX11-NEXT: v_max_f64 v[10:11], v[4:5], v[12:13] +; GFX11-NEXT: v_cmp_u_f64_e64 s1, v[4:5], v[12:13] +; GFX11-NEXT: v_max_f64 v[12:13], v[6:7], v[14:15] +; GFX11-NEXT: v_cmp_u_f64_e64 s2, v[6:7], v[14:15] +; GFX11-NEXT: v_cndmask_b32_e64 v0, v16, 0, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e64 v1, v17, 0x7ff80000, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e64 v2, v8, 0, s0 +; GFX11-NEXT: v_cndmask_b32_e64 v3, v9, 0x7ff80000, s0 +; GFX11-NEXT: v_cndmask_b32_e64 v4, v10, 0, s1 +; GFX11-NEXT: v_cndmask_b32_e64 v5, v11, 0x7ff80000, s1 +; GFX11-NEXT: v_cndmask_b32_e64 v6, v12, 0, s2 +; GFX11-NEXT: v_cndmask_b32_e64 v7, v13, 0x7ff80000, s2 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-LABEL: v_maximum_v4f64: @@ -2819,320 +1508,55 @@ define <4 x double> @v_maximum_v4f64__nnan(<4 x double> %src0, <4 x double> %src ; GFX7-LABEL: v_maximum_v4f64__nnan: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) -; GFX7-NEXT: v_cmp_gt_f64_e32 vcc, v[0:1], v[8:9] -; GFX7-NEXT: v_cmp_class_f64_e64 s[4:5], v[0:1], 64 -; GFX7-NEXT: v_cmp_class_f64_e64 s[6:7], v[8:9], 64 -; GFX7-NEXT: v_cmp_class_f64_e64 s[10:11], v[10:11], 64 -; GFX7-NEXT: v_cndmask_b32_e32 v17, v9, v1, vcc -; GFX7-NEXT: v_cndmask_b32_e32 v16, v8, v0, vcc -; GFX7-NEXT: v_cmp_gt_f64_e32 vcc, v[2:3], v[10:11] -; GFX7-NEXT: v_cndmask_b32_e64 v0, v16, v0, s[4:5] -; GFX7-NEXT: v_cndmask_b32_e64 v1, v17, v1, s[4:5] -; GFX7-NEXT: v_cmp_gt_f64_e64 s[4:5], v[4:5], v[12:13] -; GFX7-NEXT: v_cmp_eq_f64_e64 s[8:9], 0, v[16:17] -; GFX7-NEXT: v_cndmask_b32_e64 v0, v0, v8, s[6:7] -; GFX7-NEXT: v_cndmask_b32_e64 v1, v1, v9, s[6:7] -; GFX7-NEXT: v_cmp_class_f64_e64 s[6:7], v[12:13], 64 -; GFX7-NEXT: v_cndmask_b32_e32 v19, v11, v3, vcc -; GFX7-NEXT: v_cndmask_b32_e32 v18, v10, v2, vcc -; GFX7-NEXT: v_cmp_class_f64_e64 vcc, v[2:3], 64 -; GFX7-NEXT: v_cmp_eq_f64_e64 s[12:13], 0, v[18:19] -; GFX7-NEXT: v_cndmask_b32_e64 v9, v13, v5, s[4:5] -; GFX7-NEXT: v_cndmask_b32_e64 v8, v12, v4, s[4:5] -; GFX7-NEXT: v_cmp_class_f64_e64 s[4:5], v[4:5], 64 -; GFX7-NEXT: v_cndmask_b32_e64 v0, v16, v0, s[8:9] -; GFX7-NEXT: v_cndmask_b32_e64 v1, v17, v1, s[8:9] -; GFX7-NEXT: v_cmp_eq_f64_e64 s[8:9], 0, v[8:9] -; GFX7-NEXT: v_cndmask_b32_e32 v2, v18, v2, vcc -; GFX7-NEXT: v_cndmask_b32_e32 v3, v19, v3, vcc -; GFX7-NEXT: v_cmp_gt_f64_e32 vcc, v[6:7], v[14:15] -; GFX7-NEXT: v_cndmask_b32_e64 v2, v2, v10, s[10:11] -; GFX7-NEXT: v_cndmask_b32_e64 v3, v3, v11, s[10:11] -; GFX7-NEXT: v_cmp_class_f64_e64 s[10:11], v[14:15], 64 -; GFX7-NEXT: v_cndmask_b32_e64 v2, v18, v2, s[12:13] -; GFX7-NEXT: v_cndmask_b32_e64 v3, v19, v3, s[12:13] -; GFX7-NEXT: v_cndmask_b32_e64 v4, v8, v4, s[4:5] -; GFX7-NEXT: v_cndmask_b32_e64 v5, v9, v5, s[4:5] -; GFX7-NEXT: v_cndmask_b32_e32 v11, v15, v7, vcc -; GFX7-NEXT: v_cndmask_b32_e32 v10, v14, v6, vcc -; GFX7-NEXT: v_cmp_class_f64_e64 vcc, v[6:7], 64 -; GFX7-NEXT: v_cmp_eq_f64_e64 s[12:13], 0, v[10:11] -; GFX7-NEXT: 
v_cndmask_b32_e64 v4, v4, v12, s[6:7] -; GFX7-NEXT: v_cndmask_b32_e64 v5, v5, v13, s[6:7] -; GFX7-NEXT: v_cndmask_b32_e64 v4, v8, v4, s[8:9] -; GFX7-NEXT: v_cndmask_b32_e64 v5, v9, v5, s[8:9] -; GFX7-NEXT: v_cndmask_b32_e32 v6, v10, v6, vcc -; GFX7-NEXT: v_cndmask_b32_e32 v7, v11, v7, vcc -; GFX7-NEXT: v_cndmask_b32_e64 v6, v6, v14, s[10:11] -; GFX7-NEXT: v_cndmask_b32_e64 v7, v7, v15, s[10:11] -; GFX7-NEXT: v_cndmask_b32_e64 v6, v10, v6, s[12:13] -; GFX7-NEXT: v_cndmask_b32_e64 v7, v11, v7, s[12:13] +; GFX7-NEXT: v_max_f64 v[0:1], v[0:1], v[8:9] +; GFX7-NEXT: v_max_f64 v[2:3], v[2:3], v[10:11] +; GFX7-NEXT: v_max_f64 v[4:5], v[4:5], v[12:13] +; GFX7-NEXT: v_max_f64 v[6:7], v[6:7], v[14:15] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_maximum_v4f64__nnan: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_cmp_gt_f64_e32 vcc, v[0:1], v[8:9] -; GFX8-NEXT: v_cmp_class_f64_e64 s[4:5], v[0:1], 64 -; GFX8-NEXT: v_cmp_class_f64_e64 s[6:7], v[8:9], 64 -; GFX8-NEXT: v_cmp_class_f64_e64 s[10:11], v[10:11], 64 -; GFX8-NEXT: v_cndmask_b32_e32 v17, v9, v1, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v16, v8, v0, vcc -; GFX8-NEXT: v_cmp_gt_f64_e32 vcc, v[2:3], v[10:11] -; GFX8-NEXT: v_cndmask_b32_e64 v0, v16, v0, s[4:5] -; GFX8-NEXT: v_cndmask_b32_e64 v1, v17, v1, s[4:5] -; GFX8-NEXT: v_cmp_gt_f64_e64 s[4:5], v[4:5], v[12:13] -; GFX8-NEXT: v_cmp_eq_f64_e64 s[8:9], 0, v[16:17] -; GFX8-NEXT: v_cndmask_b32_e64 v0, v0, v8, s[6:7] -; GFX8-NEXT: v_cndmask_b32_e64 v1, v1, v9, s[6:7] -; GFX8-NEXT: v_cmp_class_f64_e64 s[6:7], v[12:13], 64 -; GFX8-NEXT: v_cndmask_b32_e32 v19, v11, v3, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v18, v10, v2, vcc -; GFX8-NEXT: v_cmp_class_f64_e64 vcc, v[2:3], 64 -; GFX8-NEXT: v_cmp_eq_f64_e64 s[12:13], 0, v[18:19] -; GFX8-NEXT: v_cndmask_b32_e64 v9, v13, v5, s[4:5] -; GFX8-NEXT: v_cndmask_b32_e64 v8, v12, v4, s[4:5] -; GFX8-NEXT: v_cmp_class_f64_e64 s[4:5], v[4:5], 64 -; GFX8-NEXT: v_cndmask_b32_e64 v0, v16, v0, s[8:9] -; 
GFX8-NEXT: v_cndmask_b32_e64 v1, v17, v1, s[8:9] -; GFX8-NEXT: v_cmp_eq_f64_e64 s[8:9], 0, v[8:9] -; GFX8-NEXT: v_cndmask_b32_e32 v2, v18, v2, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v3, v19, v3, vcc -; GFX8-NEXT: v_cmp_gt_f64_e32 vcc, v[6:7], v[14:15] -; GFX8-NEXT: v_cndmask_b32_e64 v2, v2, v10, s[10:11] -; GFX8-NEXT: v_cndmask_b32_e64 v3, v3, v11, s[10:11] -; GFX8-NEXT: v_cmp_class_f64_e64 s[10:11], v[14:15], 64 -; GFX8-NEXT: v_cndmask_b32_e64 v2, v18, v2, s[12:13] -; GFX8-NEXT: v_cndmask_b32_e64 v3, v19, v3, s[12:13] -; GFX8-NEXT: v_cndmask_b32_e64 v4, v8, v4, s[4:5] -; GFX8-NEXT: v_cndmask_b32_e64 v5, v9, v5, s[4:5] -; GFX8-NEXT: v_cndmask_b32_e32 v11, v15, v7, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v10, v14, v6, vcc -; GFX8-NEXT: v_cmp_class_f64_e64 vcc, v[6:7], 64 -; GFX8-NEXT: v_cmp_eq_f64_e64 s[12:13], 0, v[10:11] -; GFX8-NEXT: v_cndmask_b32_e64 v4, v4, v12, s[6:7] -; GFX8-NEXT: v_cndmask_b32_e64 v5, v5, v13, s[6:7] -; GFX8-NEXT: v_cndmask_b32_e64 v4, v8, v4, s[8:9] -; GFX8-NEXT: v_cndmask_b32_e64 v5, v9, v5, s[8:9] -; GFX8-NEXT: v_cndmask_b32_e32 v6, v10, v6, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v7, v11, v7, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v6, v6, v14, s[10:11] -; GFX8-NEXT: v_cndmask_b32_e64 v7, v7, v15, s[10:11] -; GFX8-NEXT: v_cndmask_b32_e64 v6, v10, v6, s[12:13] -; GFX8-NEXT: v_cndmask_b32_e64 v7, v11, v7, s[12:13] +; GFX8-NEXT: v_max_f64 v[0:1], v[0:1], v[8:9] +; GFX8-NEXT: v_max_f64 v[2:3], v[2:3], v[10:11] +; GFX8-NEXT: v_max_f64 v[4:5], v[4:5], v[12:13] +; GFX8-NEXT: v_max_f64 v[6:7], v[6:7], v[14:15] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_maximum_v4f64__nnan: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_cmp_gt_f64_e32 vcc, v[0:1], v[8:9] -; GFX9-NEXT: v_cmp_class_f64_e64 s[4:5], v[0:1], 64 -; GFX9-NEXT: v_cmp_class_f64_e64 s[6:7], v[8:9], 64 -; GFX9-NEXT: v_cmp_class_f64_e64 s[10:11], v[10:11], 64 -; GFX9-NEXT: v_cndmask_b32_e32 v17, v9, v1, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v16, v8, v0, vcc -; 
GFX9-NEXT: v_cmp_gt_f64_e32 vcc, v[2:3], v[10:11] -; GFX9-NEXT: v_cndmask_b32_e64 v0, v16, v0, s[4:5] -; GFX9-NEXT: v_cndmask_b32_e64 v1, v17, v1, s[4:5] -; GFX9-NEXT: v_cmp_gt_f64_e64 s[4:5], v[4:5], v[12:13] -; GFX9-NEXT: v_cmp_eq_f64_e64 s[8:9], 0, v[16:17] -; GFX9-NEXT: v_cndmask_b32_e64 v0, v0, v8, s[6:7] -; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, v9, s[6:7] -; GFX9-NEXT: v_cmp_class_f64_e64 s[6:7], v[12:13], 64 -; GFX9-NEXT: v_cndmask_b32_e32 v19, v11, v3, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v18, v10, v2, vcc -; GFX9-NEXT: v_cmp_class_f64_e64 vcc, v[2:3], 64 -; GFX9-NEXT: v_cmp_eq_f64_e64 s[12:13], 0, v[18:19] -; GFX9-NEXT: v_cndmask_b32_e64 v9, v13, v5, s[4:5] -; GFX9-NEXT: v_cndmask_b32_e64 v8, v12, v4, s[4:5] -; GFX9-NEXT: v_cmp_class_f64_e64 s[4:5], v[4:5], 64 -; GFX9-NEXT: v_cndmask_b32_e64 v0, v16, v0, s[8:9] -; GFX9-NEXT: v_cndmask_b32_e64 v1, v17, v1, s[8:9] -; GFX9-NEXT: v_cmp_eq_f64_e64 s[8:9], 0, v[8:9] -; GFX9-NEXT: v_cndmask_b32_e32 v2, v18, v2, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v3, v19, v3, vcc -; GFX9-NEXT: v_cmp_gt_f64_e32 vcc, v[6:7], v[14:15] -; GFX9-NEXT: v_cndmask_b32_e64 v2, v2, v10, s[10:11] -; GFX9-NEXT: v_cndmask_b32_e64 v3, v3, v11, s[10:11] -; GFX9-NEXT: v_cmp_class_f64_e64 s[10:11], v[14:15], 64 -; GFX9-NEXT: v_cndmask_b32_e64 v2, v18, v2, s[12:13] -; GFX9-NEXT: v_cndmask_b32_e64 v3, v19, v3, s[12:13] -; GFX9-NEXT: v_cndmask_b32_e64 v4, v8, v4, s[4:5] -; GFX9-NEXT: v_cndmask_b32_e64 v5, v9, v5, s[4:5] -; GFX9-NEXT: v_cndmask_b32_e32 v11, v15, v7, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v10, v14, v6, vcc -; GFX9-NEXT: v_cmp_class_f64_e64 vcc, v[6:7], 64 -; GFX9-NEXT: v_cmp_eq_f64_e64 s[12:13], 0, v[10:11] -; GFX9-NEXT: v_cndmask_b32_e64 v4, v4, v12, s[6:7] -; GFX9-NEXT: v_cndmask_b32_e64 v5, v5, v13, s[6:7] -; GFX9-NEXT: v_cndmask_b32_e64 v4, v8, v4, s[8:9] -; GFX9-NEXT: v_cndmask_b32_e64 v5, v9, v5, s[8:9] -; GFX9-NEXT: v_cndmask_b32_e32 v6, v10, v6, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v7, v11, v7, vcc -; GFX9-NEXT: v_cndmask_b32_e64 v6, 
v6, v14, s[10:11] -; GFX9-NEXT: v_cndmask_b32_e64 v7, v7, v15, s[10:11] -; GFX9-NEXT: v_cndmask_b32_e64 v6, v10, v6, s[12:13] -; GFX9-NEXT: v_cndmask_b32_e64 v7, v11, v7, s[12:13] +; GFX9-NEXT: v_max_f64 v[0:1], v[0:1], v[8:9] +; GFX9-NEXT: v_max_f64 v[2:3], v[2:3], v[10:11] +; GFX9-NEXT: v_max_f64 v[4:5], v[4:5], v[12:13] +; GFX9-NEXT: v_max_f64 v[6:7], v[6:7], v[14:15] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: v_maximum_v4f64__nnan: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_cmp_gt_f64_e32 vcc, v[0:1], v[8:9] -; GFX940-NEXT: v_cmp_class_f64_e64 s[0:1], v[8:9], 64 -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e32 v17, v9, v1, vcc -; GFX940-NEXT: v_cndmask_b32_e32 v16, v8, v0, vcc -; GFX940-NEXT: v_cmp_class_f64_e64 vcc, v[0:1], 64 -; GFX940-NEXT: v_cmp_eq_f64_e64 s[2:3], 0, v[16:17] -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e32 v0, v16, v0, vcc -; GFX940-NEXT: v_cndmask_b32_e32 v1, v17, v1, vcc -; GFX940-NEXT: v_cmp_gt_f64_e32 vcc, v[2:3], v[10:11] -; GFX940-NEXT: v_cndmask_b32_e64 v0, v0, v8, s[0:1] -; GFX940-NEXT: v_cndmask_b32_e64 v1, v1, v9, s[0:1] -; GFX940-NEXT: v_cndmask_b32_e32 v9, v11, v3, vcc -; GFX940-NEXT: v_cndmask_b32_e32 v8, v10, v2, vcc -; GFX940-NEXT: v_cmp_class_f64_e64 vcc, v[2:3], 64 -; GFX940-NEXT: v_cmp_class_f64_e64 s[0:1], v[10:11], 64 -; GFX940-NEXT: v_cndmask_b32_e64 v0, v16, v0, s[2:3] -; GFX940-NEXT: v_cndmask_b32_e32 v2, v8, v2, vcc -; GFX940-NEXT: v_cndmask_b32_e32 v3, v9, v3, vcc -; GFX940-NEXT: v_cndmask_b32_e64 v1, v17, v1, s[2:3] -; GFX940-NEXT: v_cndmask_b32_e64 v2, v2, v10, s[0:1] -; GFX940-NEXT: v_cmp_eq_f64_e64 s[2:3], 0, v[8:9] -; GFX940-NEXT: v_cndmask_b32_e64 v3, v3, v11, s[0:1] -; GFX940-NEXT: v_cmp_gt_f64_e32 vcc, v[4:5], v[12:13] -; GFX940-NEXT: v_cndmask_b32_e64 v2, v8, v2, s[2:3] -; GFX940-NEXT: v_cndmask_b32_e64 v3, v9, v3, s[2:3] -; GFX940-NEXT: v_cndmask_b32_e32 v9, v13, v5, vcc -; GFX940-NEXT: v_cndmask_b32_e32 v8, v12, v4, vcc -; 
GFX940-NEXT: v_cmp_class_f64_e64 vcc, v[4:5], 64 -; GFX940-NEXT: v_cmp_class_f64_e64 s[0:1], v[12:13], 64 -; GFX940-NEXT: v_cmp_eq_f64_e64 s[2:3], 0, v[8:9] -; GFX940-NEXT: v_cndmask_b32_e32 v4, v8, v4, vcc -; GFX940-NEXT: v_cndmask_b32_e32 v5, v9, v5, vcc -; GFX940-NEXT: v_cndmask_b32_e64 v4, v4, v12, s[0:1] -; GFX940-NEXT: v_cndmask_b32_e64 v5, v5, v13, s[0:1] -; GFX940-NEXT: v_cmp_gt_f64_e32 vcc, v[6:7], v[14:15] -; GFX940-NEXT: v_cndmask_b32_e64 v4, v8, v4, s[2:3] -; GFX940-NEXT: v_cndmask_b32_e64 v5, v9, v5, s[2:3] -; GFX940-NEXT: v_cndmask_b32_e32 v9, v15, v7, vcc -; GFX940-NEXT: v_cndmask_b32_e32 v8, v14, v6, vcc -; GFX940-NEXT: v_cmp_class_f64_e64 vcc, v[6:7], 64 -; GFX940-NEXT: v_cmp_class_f64_e64 s[0:1], v[14:15], 64 -; GFX940-NEXT: v_cmp_eq_f64_e64 s[2:3], 0, v[8:9] -; GFX940-NEXT: v_cndmask_b32_e32 v6, v8, v6, vcc -; GFX940-NEXT: v_cndmask_b32_e32 v7, v9, v7, vcc -; GFX940-NEXT: v_cndmask_b32_e64 v6, v6, v14, s[0:1] -; GFX940-NEXT: v_cndmask_b32_e64 v7, v7, v15, s[0:1] -; GFX940-NEXT: v_cndmask_b32_e64 v6, v8, v6, s[2:3] -; GFX940-NEXT: v_cndmask_b32_e64 v7, v9, v7, s[2:3] +; GFX940-NEXT: v_max_f64 v[0:1], v[0:1], v[8:9] +; GFX940-NEXT: v_max_f64 v[2:3], v[2:3], v[10:11] +; GFX940-NEXT: v_max_f64 v[4:5], v[4:5], v[12:13] +; GFX940-NEXT: v_max_f64 v[6:7], v[6:7], v[14:15] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_maximum_v4f64__nnan: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_cmp_gt_f64_e32 vcc_lo, v[0:1], v[8:9] -; GFX10-NEXT: v_cmp_gt_f64_e64 s4, v[2:3], v[10:11] -; GFX10-NEXT: v_cmp_gt_f64_e64 s5, v[4:5], v[12:13] -; GFX10-NEXT: v_cmp_gt_f64_e64 s6, v[6:7], v[14:15] -; GFX10-NEXT: v_cmp_class_f64_e64 s7, v[10:11], 64 -; GFX10-NEXT: v_cmp_class_f64_e64 s8, v[8:9], 64 -; GFX10-NEXT: v_cmp_class_f64_e64 s10, v[12:13], 64 -; GFX10-NEXT: v_cmp_class_f64_e64 s11, v[14:15], 64 -; GFX10-NEXT: v_cndmask_b32_e32 v17, v9, v1, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v19, v11, v3, s4 -; GFX10-NEXT: 
v_cndmask_b32_e64 v21, v13, v5, s5 -; GFX10-NEXT: v_cndmask_b32_e64 v23, v15, v7, s6 -; GFX10-NEXT: v_cndmask_b32_e32 v16, v8, v0, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v18, v10, v2, s4 -; GFX10-NEXT: v_cndmask_b32_e64 v20, v12, v4, s5 -; GFX10-NEXT: v_cmp_class_f64_e64 vcc_lo, v[4:5], 64 -; GFX10-NEXT: v_cndmask_b32_e64 v22, v14, v6, s6 -; GFX10-NEXT: v_cmp_class_f64_e64 s4, v[6:7], 64 -; GFX10-NEXT: v_cmp_class_f64_e64 s5, v[0:1], 64 -; GFX10-NEXT: v_cmp_class_f64_e64 s6, v[2:3], 64 -; GFX10-NEXT: v_cmp_eq_f64_e64 s9, 0, v[16:17] -; GFX10-NEXT: v_cmp_eq_f64_e64 s12, 0, v[18:19] -; GFX10-NEXT: v_cmp_eq_f64_e64 s13, 0, v[20:21] -; GFX10-NEXT: v_cmp_eq_f64_e64 s14, 0, v[22:23] -; GFX10-NEXT: v_cndmask_b32_e32 v4, v20, v4, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e32 v5, v21, v5, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v6, v22, v6, s4 -; GFX10-NEXT: v_cndmask_b32_e64 v0, v16, v0, s5 -; GFX10-NEXT: v_cndmask_b32_e64 v2, v18, v2, s6 -; GFX10-NEXT: v_cndmask_b32_e64 v1, v17, v1, s5 -; GFX10-NEXT: v_cndmask_b32_e64 v3, v19, v3, s6 -; GFX10-NEXT: v_cndmask_b32_e64 v7, v23, v7, s4 -; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, v8, s8 -; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, v10, s7 -; GFX10-NEXT: v_cndmask_b32_e64 v4, v4, v12, s10 -; GFX10-NEXT: v_cndmask_b32_e64 v6, v6, v14, s11 -; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, v9, s8 -; GFX10-NEXT: v_cndmask_b32_e64 v3, v3, v11, s7 -; GFX10-NEXT: v_cndmask_b32_e64 v5, v5, v13, s10 -; GFX10-NEXT: v_cndmask_b32_e64 v7, v7, v15, s11 -; GFX10-NEXT: v_cndmask_b32_e64 v0, v16, v0, s9 -; GFX10-NEXT: v_cndmask_b32_e64 v2, v18, v2, s12 -; GFX10-NEXT: v_cndmask_b32_e64 v1, v17, v1, s9 -; GFX10-NEXT: v_cndmask_b32_e64 v3, v19, v3, s12 -; GFX10-NEXT: v_cndmask_b32_e64 v4, v20, v4, s13 -; GFX10-NEXT: v_cndmask_b32_e64 v5, v21, v5, s13 -; GFX10-NEXT: v_cndmask_b32_e64 v6, v22, v6, s14 -; GFX10-NEXT: v_cndmask_b32_e64 v7, v23, v7, s14 +; GFX10-NEXT: v_max_f64 v[0:1], v[0:1], v[8:9] +; GFX10-NEXT: v_max_f64 v[2:3], v[2:3], v[10:11] +; GFX10-NEXT: 
v_max_f64 v[4:5], v[4:5], v[12:13] +; GFX10-NEXT: v_max_f64 v[6:7], v[6:7], v[14:15] ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: v_maximum_v4f64__nnan: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_cmp_gt_f64_e32 vcc_lo, v[0:1], v[8:9] -; GFX11-NEXT: v_cmp_gt_f64_e64 s0, v[2:3], v[10:11] -; GFX11-NEXT: v_cmp_gt_f64_e64 s1, v[4:5], v[12:13] -; GFX11-NEXT: v_cmp_gt_f64_e64 s2, v[6:7], v[14:15] -; GFX11-NEXT: v_cmp_class_f64_e64 s3, v[6:7], 64 -; GFX11-NEXT: v_cmp_class_f64_e64 s4, v[14:15], 64 -; GFX11-NEXT: v_cmp_class_f64_e64 s5, v[8:9], 64 -; GFX11-NEXT: v_cmp_class_f64_e64 s6, v[10:11], 64 -; GFX11-NEXT: v_dual_cndmask_b32 v17, v9, v1 :: v_dual_cndmask_b32 v16, v8, v0 -; GFX11-NEXT: v_cndmask_b32_e64 v19, v11, v3, s0 -; GFX11-NEXT: v_cndmask_b32_e64 v21, v13, v5, s1 -; GFX11-NEXT: v_cndmask_b32_e64 v18, v10, v2, s0 -; GFX11-NEXT: v_cndmask_b32_e64 v20, v12, v4, s1 -; GFX11-NEXT: v_cmp_class_f64_e64 vcc_lo, v[0:1], 64 -; GFX11-NEXT: v_cmp_class_f64_e64 s0, v[2:3], 64 -; GFX11-NEXT: v_cmp_class_f64_e64 s1, v[4:5], 64 -; GFX11-NEXT: v_cndmask_b32_e64 v23, v15, v7, s2 -; GFX11-NEXT: v_cndmask_b32_e64 v22, v14, v6, s2 -; GFX11-NEXT: v_cmp_class_f64_e64 s2, v[12:13], 64 -; GFX11-NEXT: v_cmp_eq_f64_e64 s7, 0, v[16:17] -; GFX11-NEXT: v_cmp_eq_f64_e64 s8, 0, v[18:19] -; GFX11-NEXT: v_cmp_eq_f64_e64 s9, 0, v[20:21] -; GFX11-NEXT: v_cmp_eq_f64_e64 s10, 0, v[22:23] -; GFX11-NEXT: v_cndmask_b32_e64 v6, v22, v6, s3 -; GFX11-NEXT: v_cndmask_b32_e64 v7, v23, v7, s3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cndmask_b32_e64 v6, v6, v14, s4 -; GFX11-NEXT: v_cndmask_b32_e64 v7, v7, v15, s4 -; GFX11-NEXT: v_dual_cndmask_b32 v0, v16, v0 :: v_dual_cndmask_b32 v1, v17, v1 -; GFX11-NEXT: v_cndmask_b32_e64 v2, v18, v2, s0 -; GFX11-NEXT: v_cndmask_b32_e64 v4, v20, v4, s1 -; GFX11-NEXT: v_cndmask_b32_e64 v3, v19, v3, s0 -; GFX11-NEXT: v_cndmask_b32_e64 v5, v21, v5, s1 -; 
GFX11-NEXT: v_cndmask_b32_e64 v0, v0, v8, s5 -; GFX11-NEXT: v_cndmask_b32_e64 v2, v2, v10, s6 -; GFX11-NEXT: v_cndmask_b32_e64 v4, v4, v12, s2 -; GFX11-NEXT: v_cndmask_b32_e64 v1, v1, v9, s5 -; GFX11-NEXT: v_cndmask_b32_e64 v3, v3, v11, s6 -; GFX11-NEXT: v_cndmask_b32_e64 v5, v5, v13, s2 -; GFX11-NEXT: v_cndmask_b32_e64 v0, v16, v0, s7 -; GFX11-NEXT: v_cndmask_b32_e64 v2, v18, v2, s8 -; GFX11-NEXT: v_cndmask_b32_e64 v4, v20, v4, s9 -; GFX11-NEXT: v_cndmask_b32_e64 v6, v22, v6, s10 -; GFX11-NEXT: v_cndmask_b32_e64 v1, v17, v1, s7 -; GFX11-NEXT: v_cndmask_b32_e64 v3, v19, v3, s8 -; GFX11-NEXT: v_cndmask_b32_e64 v5, v21, v5, s9 -; GFX11-NEXT: v_cndmask_b32_e64 v7, v23, v7, s10 +; GFX11-NEXT: v_max_f64 v[0:1], v[0:1], v[8:9] +; GFX11-NEXT: v_max_f64 v[2:3], v[2:3], v[10:11] +; GFX11-NEXT: v_max_f64 v[4:5], v[4:5], v[12:13] +; GFX11-NEXT: v_max_f64 v[6:7], v[6:7], v[14:15] ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-LABEL: v_maximum_v4f64__nnan: @@ -3155,180 +1579,135 @@ define <4 x double> @v_maximum_v4f64__nsz(<4 x double> %src0, <4 x double> %src1 ; GFX7-LABEL: v_maximum_v4f64__nsz: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_cmp_gt_f64_e32 vcc, v[0:1], v[8:9] -; GFX7-NEXT: v_cmp_o_f64_e64 s[4:5], v[0:1], v[8:9] -; GFX7-NEXT: v_cmp_gt_f64_e64 s[6:7], v[4:5], v[12:13] -; GFX7-NEXT: v_cmp_gt_f64_e64 s[10:11], v[6:7], v[14:15] -; GFX7-NEXT: v_cmp_o_f64_e64 s[8:9], v[4:5], v[12:13] -; GFX7-NEXT: v_cmp_o_f64_e64 s[12:13], v[6:7], v[14:15] -; GFX7-NEXT: v_cndmask_b32_e32 v16, v8, v0, vcc -; GFX7-NEXT: v_cndmask_b32_e32 v1, v9, v1, vcc -; GFX7-NEXT: v_mov_b32_e32 v8, 0x7ff80000 -; GFX7-NEXT: v_cmp_gt_f64_e32 vcc, v[2:3], v[10:11] -; GFX7-NEXT: v_cndmask_b32_e64 v0, 0, v16, s[4:5] -; GFX7-NEXT: v_cndmask_b32_e64 v1, v8, v1, s[4:5] -; GFX7-NEXT: v_cmp_o_f64_e64 s[4:5], v[2:3], v[10:11] -; GFX7-NEXT: v_cndmask_b32_e64 v5, v13, v5, s[6:7] -; GFX7-NEXT: v_cndmask_b32_e64 v7, v15, v7, s[10:11] -; GFX7-NEXT: v_cndmask_b32_e64 v5, v8, v5, 
s[8:9] -; GFX7-NEXT: v_cndmask_b32_e64 v7, v8, v7, s[12:13] -; GFX7-NEXT: v_cndmask_b32_e32 v9, v10, v2, vcc -; GFX7-NEXT: v_cndmask_b32_e64 v10, v14, v6, s[10:11] -; GFX7-NEXT: v_cndmask_b32_e32 v3, v11, v3, vcc -; GFX7-NEXT: v_cndmask_b32_e64 v2, 0, v9, s[4:5] -; GFX7-NEXT: v_cndmask_b32_e64 v9, v12, v4, s[6:7] -; GFX7-NEXT: v_cndmask_b32_e64 v3, v8, v3, s[4:5] -; GFX7-NEXT: v_cndmask_b32_e64 v4, 0, v9, s[8:9] -; GFX7-NEXT: v_cndmask_b32_e64 v6, 0, v10, s[12:13] +; GFX7-NEXT: v_max_f64 v[16:17], v[0:1], v[8:9] +; GFX7-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[8:9] +; GFX7-NEXT: v_max_f64 v[8:9], v[2:3], v[10:11] +; GFX7-NEXT: v_cmp_u_f64_e64 s[4:5], v[2:3], v[10:11] +; GFX7-NEXT: v_max_f64 v[10:11], v[4:5], v[12:13] +; GFX7-NEXT: v_cmp_u_f64_e64 s[6:7], v[4:5], v[12:13] +; GFX7-NEXT: v_max_f64 v[12:13], v[6:7], v[14:15] +; GFX7-NEXT: v_cmp_u_f64_e64 s[8:9], v[6:7], v[14:15] +; GFX7-NEXT: v_mov_b32_e32 v7, 0x7ff80000 +; GFX7-NEXT: v_cndmask_b32_e64 v0, v16, 0, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v1, v17, v7, vcc +; GFX7-NEXT: v_cndmask_b32_e64 v2, v8, 0, s[4:5] +; GFX7-NEXT: v_cndmask_b32_e64 v3, v9, v7, s[4:5] +; GFX7-NEXT: v_cndmask_b32_e64 v4, v10, 0, s[6:7] +; GFX7-NEXT: v_cndmask_b32_e64 v5, v11, v7, s[6:7] +; GFX7-NEXT: v_cndmask_b32_e64 v6, v12, 0, s[8:9] +; GFX7-NEXT: v_cndmask_b32_e64 v7, v13, v7, s[8:9] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_maximum_v4f64__nsz: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_cmp_gt_f64_e32 vcc, v[0:1], v[8:9] -; GFX8-NEXT: v_cmp_o_f64_e64 s[4:5], v[0:1], v[8:9] -; GFX8-NEXT: v_cmp_gt_f64_e64 s[6:7], v[4:5], v[12:13] -; GFX8-NEXT: v_cmp_gt_f64_e64 s[10:11], v[6:7], v[14:15] -; GFX8-NEXT: v_cmp_o_f64_e64 s[8:9], v[4:5], v[12:13] -; GFX8-NEXT: v_cmp_o_f64_e64 s[12:13], v[6:7], v[14:15] -; GFX8-NEXT: v_cndmask_b32_e32 v16, v8, v0, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v1, v9, v1, vcc -; GFX8-NEXT: v_mov_b32_e32 v8, 0x7ff80000 -; GFX8-NEXT: v_cmp_gt_f64_e32 vcc, v[2:3], v[10:11] -; 
GFX8-NEXT: v_cndmask_b32_e64 v0, 0, v16, s[4:5] -; GFX8-NEXT: v_cndmask_b32_e64 v1, v8, v1, s[4:5] -; GFX8-NEXT: v_cmp_o_f64_e64 s[4:5], v[2:3], v[10:11] -; GFX8-NEXT: v_cndmask_b32_e64 v5, v13, v5, s[6:7] -; GFX8-NEXT: v_cndmask_b32_e64 v7, v15, v7, s[10:11] -; GFX8-NEXT: v_cndmask_b32_e64 v5, v8, v5, s[8:9] -; GFX8-NEXT: v_cndmask_b32_e64 v7, v8, v7, s[12:13] -; GFX8-NEXT: v_cndmask_b32_e32 v9, v10, v2, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v10, v14, v6, s[10:11] -; GFX8-NEXT: v_cndmask_b32_e32 v3, v11, v3, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, v9, s[4:5] -; GFX8-NEXT: v_cndmask_b32_e64 v9, v12, v4, s[6:7] -; GFX8-NEXT: v_cndmask_b32_e64 v3, v8, v3, s[4:5] -; GFX8-NEXT: v_cndmask_b32_e64 v4, 0, v9, s[8:9] -; GFX8-NEXT: v_cndmask_b32_e64 v6, 0, v10, s[12:13] +; GFX8-NEXT: v_max_f64 v[16:17], v[0:1], v[8:9] +; GFX8-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[8:9] +; GFX8-NEXT: v_max_f64 v[8:9], v[2:3], v[10:11] +; GFX8-NEXT: v_cmp_u_f64_e64 s[4:5], v[2:3], v[10:11] +; GFX8-NEXT: v_max_f64 v[10:11], v[4:5], v[12:13] +; GFX8-NEXT: v_cmp_u_f64_e64 s[6:7], v[4:5], v[12:13] +; GFX8-NEXT: v_max_f64 v[12:13], v[6:7], v[14:15] +; GFX8-NEXT: v_cmp_u_f64_e64 s[8:9], v[6:7], v[14:15] +; GFX8-NEXT: v_mov_b32_e32 v7, 0x7ff80000 +; GFX8-NEXT: v_cndmask_b32_e64 v0, v16, 0, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v1, v17, v7, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v2, v8, 0, s[4:5] +; GFX8-NEXT: v_cndmask_b32_e64 v3, v9, v7, s[4:5] +; GFX8-NEXT: v_cndmask_b32_e64 v4, v10, 0, s[6:7] +; GFX8-NEXT: v_cndmask_b32_e64 v5, v11, v7, s[6:7] +; GFX8-NEXT: v_cndmask_b32_e64 v6, v12, 0, s[8:9] +; GFX8-NEXT: v_cndmask_b32_e64 v7, v13, v7, s[8:9] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_maximum_v4f64__nsz: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_cmp_gt_f64_e32 vcc, v[0:1], v[8:9] -; GFX9-NEXT: v_cmp_o_f64_e64 s[4:5], v[0:1], v[8:9] -; GFX9-NEXT: v_cmp_gt_f64_e64 s[6:7], v[4:5], v[12:13] -; GFX9-NEXT: v_cmp_gt_f64_e64 s[10:11], v[6:7], v[14:15] -; 
GFX9-NEXT: v_cmp_o_f64_e64 s[8:9], v[4:5], v[12:13] -; GFX9-NEXT: v_cmp_o_f64_e64 s[12:13], v[6:7], v[14:15] -; GFX9-NEXT: v_cndmask_b32_e32 v16, v8, v0, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v1, v9, v1, vcc -; GFX9-NEXT: v_mov_b32_e32 v8, 0x7ff80000 -; GFX9-NEXT: v_cmp_gt_f64_e32 vcc, v[2:3], v[10:11] -; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, v16, s[4:5] -; GFX9-NEXT: v_cndmask_b32_e64 v1, v8, v1, s[4:5] -; GFX9-NEXT: v_cmp_o_f64_e64 s[4:5], v[2:3], v[10:11] -; GFX9-NEXT: v_cndmask_b32_e64 v5, v13, v5, s[6:7] -; GFX9-NEXT: v_cndmask_b32_e64 v7, v15, v7, s[10:11] -; GFX9-NEXT: v_cndmask_b32_e64 v5, v8, v5, s[8:9] -; GFX9-NEXT: v_cndmask_b32_e64 v7, v8, v7, s[12:13] -; GFX9-NEXT: v_cndmask_b32_e32 v9, v10, v2, vcc -; GFX9-NEXT: v_cndmask_b32_e64 v10, v14, v6, s[10:11] -; GFX9-NEXT: v_cndmask_b32_e32 v3, v11, v3, vcc -; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, v9, s[4:5] -; GFX9-NEXT: v_cndmask_b32_e64 v9, v12, v4, s[6:7] -; GFX9-NEXT: v_cndmask_b32_e64 v3, v8, v3, s[4:5] -; GFX9-NEXT: v_cndmask_b32_e64 v4, 0, v9, s[8:9] -; GFX9-NEXT: v_cndmask_b32_e64 v6, 0, v10, s[12:13] +; GFX9-NEXT: v_max_f64 v[16:17], v[0:1], v[8:9] +; GFX9-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[8:9] +; GFX9-NEXT: v_max_f64 v[8:9], v[2:3], v[10:11] +; GFX9-NEXT: v_cmp_u_f64_e64 s[4:5], v[2:3], v[10:11] +; GFX9-NEXT: v_max_f64 v[10:11], v[4:5], v[12:13] +; GFX9-NEXT: v_cmp_u_f64_e64 s[6:7], v[4:5], v[12:13] +; GFX9-NEXT: v_max_f64 v[12:13], v[6:7], v[14:15] +; GFX9-NEXT: v_cmp_u_f64_e64 s[8:9], v[6:7], v[14:15] +; GFX9-NEXT: v_mov_b32_e32 v7, 0x7ff80000 +; GFX9-NEXT: v_cndmask_b32_e64 v0, v16, 0, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v1, v17, v7, vcc +; GFX9-NEXT: v_cndmask_b32_e64 v2, v8, 0, s[4:5] +; GFX9-NEXT: v_cndmask_b32_e64 v3, v9, v7, s[4:5] +; GFX9-NEXT: v_cndmask_b32_e64 v4, v10, 0, s[6:7] +; GFX9-NEXT: v_cndmask_b32_e64 v5, v11, v7, s[6:7] +; GFX9-NEXT: v_cndmask_b32_e64 v6, v12, 0, s[8:9] +; GFX9-NEXT: v_cndmask_b32_e64 v7, v13, v7, s[8:9] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: 
v_maximum_v4f64__nsz: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_cmp_gt_f64_e32 vcc, v[0:1], v[8:9] -; GFX940-NEXT: v_cmp_o_f64_e64 s[0:1], v[0:1], v[8:9] -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e32 v16, v8, v0, vcc -; GFX940-NEXT: v_cndmask_b32_e32 v1, v9, v1, vcc -; GFX940-NEXT: v_mov_b32_e32 v8, 0x7ff80000 -; GFX940-NEXT: v_cmp_gt_f64_e32 vcc, v[2:3], v[10:11] -; GFX940-NEXT: v_cndmask_b32_e64 v0, 0, v16, s[0:1] -; GFX940-NEXT: v_cndmask_b32_e64 v1, v8, v1, s[0:1] -; GFX940-NEXT: v_cndmask_b32_e32 v9, v10, v2, vcc -; GFX940-NEXT: v_cmp_o_f64_e64 s[0:1], v[2:3], v[10:11] -; GFX940-NEXT: v_cndmask_b32_e32 v3, v11, v3, vcc -; GFX940-NEXT: v_cmp_gt_f64_e32 vcc, v[4:5], v[12:13] -; GFX940-NEXT: v_cndmask_b32_e64 v2, 0, v9, s[0:1] -; GFX940-NEXT: v_cndmask_b32_e64 v3, v8, v3, s[0:1] -; GFX940-NEXT: v_cndmask_b32_e32 v9, v12, v4, vcc -; GFX940-NEXT: v_cmp_o_f64_e64 s[0:1], v[4:5], v[12:13] -; GFX940-NEXT: v_cndmask_b32_e32 v5, v13, v5, vcc -; GFX940-NEXT: v_cmp_gt_f64_e32 vcc, v[6:7], v[14:15] -; GFX940-NEXT: v_cndmask_b32_e64 v4, 0, v9, s[0:1] -; GFX940-NEXT: v_cndmask_b32_e64 v5, v8, v5, s[0:1] -; GFX940-NEXT: v_cndmask_b32_e32 v9, v14, v6, vcc -; GFX940-NEXT: v_cmp_o_f64_e64 s[0:1], v[6:7], v[14:15] -; GFX940-NEXT: v_cndmask_b32_e32 v7, v15, v7, vcc +; GFX940-NEXT: v_max_f64 v[16:17], v[0:1], v[8:9] +; GFX940-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[8:9] +; GFX940-NEXT: v_max_f64 v[8:9], v[2:3], v[10:11] ; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e64 v6, 0, v9, s[0:1] -; GFX940-NEXT: v_cndmask_b32_e64 v7, v8, v7, s[0:1] +; GFX940-NEXT: v_cndmask_b32_e64 v0, v16, 0, vcc +; GFX940-NEXT: v_mov_b32_e32 v16, 0x7ff80000 +; GFX940-NEXT: v_cndmask_b32_e32 v1, v17, v16, vcc +; GFX940-NEXT: v_cmp_u_f64_e32 vcc, v[2:3], v[10:11] +; GFX940-NEXT: s_nop 1 +; GFX940-NEXT: v_cndmask_b32_e64 v2, v8, 0, vcc +; GFX940-NEXT: v_cndmask_b32_e32 v3, v9, v16, vcc +; GFX940-NEXT: v_max_f64 v[8:9], v[4:5], v[12:13] +; 
GFX940-NEXT: v_cmp_u_f64_e32 vcc, v[4:5], v[12:13] +; GFX940-NEXT: s_nop 1 +; GFX940-NEXT: v_cndmask_b32_e64 v4, v8, 0, vcc +; GFX940-NEXT: v_cndmask_b32_e32 v5, v9, v16, vcc +; GFX940-NEXT: v_max_f64 v[8:9], v[6:7], v[14:15] +; GFX940-NEXT: v_cmp_u_f64_e32 vcc, v[6:7], v[14:15] +; GFX940-NEXT: s_nop 1 +; GFX940-NEXT: v_cndmask_b32_e64 v6, v8, 0, vcc +; GFX940-NEXT: v_cndmask_b32_e32 v7, v9, v16, vcc ; GFX940-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_maximum_v4f64__nsz: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_cmp_gt_f64_e32 vcc_lo, v[0:1], v[8:9] -; GFX10-NEXT: v_cmp_gt_f64_e64 s4, v[2:3], v[10:11] -; GFX10-NEXT: v_cmp_gt_f64_e64 s5, v[4:5], v[12:13] -; GFX10-NEXT: v_cmp_gt_f64_e64 s7, v[6:7], v[14:15] -; GFX10-NEXT: v_cmp_o_f64_e64 s6, v[0:1], v[8:9] -; GFX10-NEXT: v_cmp_o_f64_e64 s8, v[2:3], v[10:11] -; GFX10-NEXT: v_cmp_o_f64_e64 s9, v[4:5], v[12:13] -; GFX10-NEXT: v_cmp_o_f64_e64 s10, v[6:7], v[14:15] -; GFX10-NEXT: v_cndmask_b32_e32 v16, v8, v0, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v8, v10, v2, s4 -; GFX10-NEXT: v_cndmask_b32_e64 v10, v12, v4, s5 -; GFX10-NEXT: v_cndmask_b32_e64 v12, v14, v6, s7 -; GFX10-NEXT: v_cndmask_b32_e32 v1, v9, v1, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v3, v11, v3, s4 -; GFX10-NEXT: v_cndmask_b32_e64 v5, v13, v5, s5 -; GFX10-NEXT: v_cndmask_b32_e64 v7, v15, v7, s7 -; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, v16, s6 -; GFX10-NEXT: v_cndmask_b32_e64 v1, 0x7ff80000, v1, s6 -; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, v8, s8 -; GFX10-NEXT: v_cndmask_b32_e64 v3, 0x7ff80000, v3, s8 -; GFX10-NEXT: v_cndmask_b32_e64 v4, 0, v10, s9 -; GFX10-NEXT: v_cndmask_b32_e64 v5, 0x7ff80000, v5, s9 -; GFX10-NEXT: v_cndmask_b32_e64 v6, 0, v12, s10 -; GFX10-NEXT: v_cndmask_b32_e64 v7, 0x7ff80000, v7, s10 +; GFX10-NEXT: v_max_f64 v[16:17], v[0:1], v[8:9] +; GFX10-NEXT: v_cmp_u_f64_e32 vcc_lo, v[0:1], v[8:9] +; GFX10-NEXT: v_max_f64 v[8:9], v[2:3], v[10:11] +; GFX10-NEXT: v_cmp_u_f64_e64 s4, v[2:3], v[10:11] +; 
GFX10-NEXT: v_max_f64 v[10:11], v[4:5], v[12:13] +; GFX10-NEXT: v_cmp_u_f64_e64 s5, v[4:5], v[12:13] +; GFX10-NEXT: v_max_f64 v[12:13], v[6:7], v[14:15] +; GFX10-NEXT: v_cmp_u_f64_e64 s6, v[6:7], v[14:15] +; GFX10-NEXT: v_cndmask_b32_e64 v0, v16, 0, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v1, v17, 0x7ff80000, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v2, v8, 0, s4 +; GFX10-NEXT: v_cndmask_b32_e64 v3, v9, 0x7ff80000, s4 +; GFX10-NEXT: v_cndmask_b32_e64 v4, v10, 0, s5 +; GFX10-NEXT: v_cndmask_b32_e64 v5, v11, 0x7ff80000, s5 +; GFX10-NEXT: v_cndmask_b32_e64 v6, v12, 0, s6 +; GFX10-NEXT: v_cndmask_b32_e64 v7, v13, 0x7ff80000, s6 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: v_maximum_v4f64__nsz: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_cmp_gt_f64_e32 vcc_lo, v[0:1], v[8:9] -; GFX11-NEXT: v_cmp_gt_f64_e64 s1, v[2:3], v[10:11] -; GFX11-NEXT: v_cmp_gt_f64_e64 s2, v[4:5], v[12:13] -; GFX11-NEXT: v_cmp_gt_f64_e64 s3, v[6:7], v[14:15] -; GFX11-NEXT: v_cmp_o_f64_e64 s0, v[0:1], v[8:9] -; GFX11-NEXT: v_cmp_o_f64_e64 s4, v[2:3], v[10:11] -; GFX11-NEXT: v_cmp_o_f64_e64 s5, v[4:5], v[12:13] -; GFX11-NEXT: v_cmp_o_f64_e64 s6, v[6:7], v[14:15] -; GFX11-NEXT: v_dual_cndmask_b32 v16, v8, v0 :: v_dual_cndmask_b32 v1, v9, v1 -; GFX11-NEXT: v_cndmask_b32_e64 v8, v10, v2, s1 -; GFX11-NEXT: v_cndmask_b32_e64 v10, v12, v4, s2 -; GFX11-NEXT: v_cndmask_b32_e64 v12, v14, v6, s3 -; GFX11-NEXT: v_cndmask_b32_e64 v3, v11, v3, s1 -; GFX11-NEXT: v_cndmask_b32_e64 v5, v13, v5, s2 -; GFX11-NEXT: v_cndmask_b32_e64 v7, v15, v7, s3 -; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, v16, s0 -; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, v8, s4 -; GFX11-NEXT: v_cndmask_b32_e64 v4, 0, v10, s5 -; GFX11-NEXT: v_cndmask_b32_e64 v6, 0, v12, s6 -; GFX11-NEXT: v_cndmask_b32_e64 v1, 0x7ff80000, v1, s0 -; GFX11-NEXT: v_cndmask_b32_e64 v3, 0x7ff80000, v3, s4 -; GFX11-NEXT: v_cndmask_b32_e64 v5, 0x7ff80000, v5, s5 -; GFX11-NEXT: v_cndmask_b32_e64 v7, 0x7ff80000, v7, s6 +; 
GFX11-NEXT: v_max_f64 v[16:17], v[0:1], v[8:9] +; GFX11-NEXT: v_cmp_u_f64_e32 vcc_lo, v[0:1], v[8:9] +; GFX11-NEXT: v_max_f64 v[8:9], v[2:3], v[10:11] +; GFX11-NEXT: v_cmp_u_f64_e64 s0, v[2:3], v[10:11] +; GFX11-NEXT: v_max_f64 v[10:11], v[4:5], v[12:13] +; GFX11-NEXT: v_cmp_u_f64_e64 s1, v[4:5], v[12:13] +; GFX11-NEXT: v_max_f64 v[12:13], v[6:7], v[14:15] +; GFX11-NEXT: v_cmp_u_f64_e64 s2, v[6:7], v[14:15] +; GFX11-NEXT: v_cndmask_b32_e64 v0, v16, 0, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e64 v1, v17, 0x7ff80000, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e64 v2, v8, 0, s0 +; GFX11-NEXT: v_cndmask_b32_e64 v3, v9, 0x7ff80000, s0 +; GFX11-NEXT: v_cndmask_b32_e64 v4, v10, 0, s1 +; GFX11-NEXT: v_cndmask_b32_e64 v5, v11, 0x7ff80000, s1 +; GFX11-NEXT: v_cndmask_b32_e64 v6, v12, 0, s2 +; GFX11-NEXT: v_cndmask_b32_e64 v7, v13, 0x7ff80000, s2 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-LABEL: v_maximum_v4f64__nsz: @@ -3351,108 +1730,55 @@ define <4 x double> @v_maximum_v4f64__nnan_nsz(<4 x double> %src0, <4 x double> ; GFX7-LABEL: v_maximum_v4f64__nnan_nsz: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_cmp_gt_f64_e32 vcc, v[0:1], v[8:9] -; GFX7-NEXT: v_cmp_gt_f64_e64 s[4:5], v[2:3], v[10:11] -; GFX7-NEXT: v_cmp_gt_f64_e64 s[6:7], v[4:5], v[12:13] -; GFX7-NEXT: v_cmp_gt_f64_e64 s[8:9], v[6:7], v[14:15] -; GFX7-NEXT: v_cndmask_b32_e32 v0, v8, v0, vcc -; GFX7-NEXT: v_cndmask_b32_e32 v1, v9, v1, vcc -; GFX7-NEXT: v_cndmask_b32_e64 v2, v10, v2, s[4:5] -; GFX7-NEXT: v_cndmask_b32_e64 v3, v11, v3, s[4:5] -; GFX7-NEXT: v_cndmask_b32_e64 v4, v12, v4, s[6:7] -; GFX7-NEXT: v_cndmask_b32_e64 v5, v13, v5, s[6:7] -; GFX7-NEXT: v_cndmask_b32_e64 v6, v14, v6, s[8:9] -; GFX7-NEXT: v_cndmask_b32_e64 v7, v15, v7, s[8:9] +; GFX7-NEXT: v_max_f64 v[0:1], v[0:1], v[8:9] +; GFX7-NEXT: v_max_f64 v[2:3], v[2:3], v[10:11] +; GFX7-NEXT: v_max_f64 v[4:5], v[4:5], v[12:13] +; GFX7-NEXT: v_max_f64 v[6:7], v[6:7], v[14:15] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: 
v_maximum_v4f64__nnan_nsz: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_cmp_gt_f64_e32 vcc, v[0:1], v[8:9] -; GFX8-NEXT: v_cmp_gt_f64_e64 s[4:5], v[2:3], v[10:11] -; GFX8-NEXT: v_cmp_gt_f64_e64 s[6:7], v[4:5], v[12:13] -; GFX8-NEXT: v_cmp_gt_f64_e64 s[8:9], v[6:7], v[14:15] -; GFX8-NEXT: v_cndmask_b32_e32 v0, v8, v0, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v1, v9, v1, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v2, v10, v2, s[4:5] -; GFX8-NEXT: v_cndmask_b32_e64 v3, v11, v3, s[4:5] -; GFX8-NEXT: v_cndmask_b32_e64 v4, v12, v4, s[6:7] -; GFX8-NEXT: v_cndmask_b32_e64 v5, v13, v5, s[6:7] -; GFX8-NEXT: v_cndmask_b32_e64 v6, v14, v6, s[8:9] -; GFX8-NEXT: v_cndmask_b32_e64 v7, v15, v7, s[8:9] +; GFX8-NEXT: v_max_f64 v[0:1], v[0:1], v[8:9] +; GFX8-NEXT: v_max_f64 v[2:3], v[2:3], v[10:11] +; GFX8-NEXT: v_max_f64 v[4:5], v[4:5], v[12:13] +; GFX8-NEXT: v_max_f64 v[6:7], v[6:7], v[14:15] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_maximum_v4f64__nnan_nsz: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_cmp_gt_f64_e32 vcc, v[0:1], v[8:9] -; GFX9-NEXT: v_cmp_gt_f64_e64 s[4:5], v[2:3], v[10:11] -; GFX9-NEXT: v_cmp_gt_f64_e64 s[6:7], v[4:5], v[12:13] -; GFX9-NEXT: v_cmp_gt_f64_e64 s[8:9], v[6:7], v[14:15] -; GFX9-NEXT: v_cndmask_b32_e32 v0, v8, v0, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v1, v9, v1, vcc -; GFX9-NEXT: v_cndmask_b32_e64 v2, v10, v2, s[4:5] -; GFX9-NEXT: v_cndmask_b32_e64 v3, v11, v3, s[4:5] -; GFX9-NEXT: v_cndmask_b32_e64 v4, v12, v4, s[6:7] -; GFX9-NEXT: v_cndmask_b32_e64 v5, v13, v5, s[6:7] -; GFX9-NEXT: v_cndmask_b32_e64 v6, v14, v6, s[8:9] -; GFX9-NEXT: v_cndmask_b32_e64 v7, v15, v7, s[8:9] +; GFX9-NEXT: v_max_f64 v[0:1], v[0:1], v[8:9] +; GFX9-NEXT: v_max_f64 v[2:3], v[2:3], v[10:11] +; GFX9-NEXT: v_max_f64 v[4:5], v[4:5], v[12:13] +; GFX9-NEXT: v_max_f64 v[6:7], v[6:7], v[14:15] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: v_maximum_v4f64__nnan_nsz: ; GFX940: ; %bb.0: ; GFX940-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_cmp_gt_f64_e32 vcc, v[0:1], v[8:9] -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v0, v8, v0, vcc -; GFX940-NEXT: v_cndmask_b32_e32 v1, v9, v1, vcc -; GFX940-NEXT: v_cmp_gt_f64_e32 vcc, v[2:3], v[10:11] -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v2, v10, v2, vcc -; GFX940-NEXT: v_cndmask_b32_e32 v3, v11, v3, vcc -; GFX940-NEXT: v_cmp_gt_f64_e32 vcc, v[4:5], v[12:13] -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v4, v12, v4, vcc -; GFX940-NEXT: v_cndmask_b32_e32 v5, v13, v5, vcc -; GFX940-NEXT: v_cmp_gt_f64_e32 vcc, v[6:7], v[14:15] -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v6, v14, v6, vcc -; GFX940-NEXT: v_cndmask_b32_e32 v7, v15, v7, vcc +; GFX940-NEXT: v_max_f64 v[0:1], v[0:1], v[8:9] +; GFX940-NEXT: v_max_f64 v[2:3], v[2:3], v[10:11] +; GFX940-NEXT: v_max_f64 v[4:5], v[4:5], v[12:13] +; GFX940-NEXT: v_max_f64 v[6:7], v[6:7], v[14:15] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_maximum_v4f64__nnan_nsz: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_cmp_gt_f64_e32 vcc_lo, v[0:1], v[8:9] -; GFX10-NEXT: v_cmp_gt_f64_e64 s4, v[2:3], v[10:11] -; GFX10-NEXT: v_cmp_gt_f64_e64 s5, v[4:5], v[12:13] -; GFX10-NEXT: v_cmp_gt_f64_e64 s6, v[6:7], v[14:15] -; GFX10-NEXT: v_cndmask_b32_e32 v0, v8, v0, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v2, v10, v2, s4 -; GFX10-NEXT: v_cndmask_b32_e64 v4, v12, v4, s5 -; GFX10-NEXT: v_cndmask_b32_e64 v6, v14, v6, s6 -; GFX10-NEXT: v_cndmask_b32_e32 v1, v9, v1, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v3, v11, v3, s4 -; GFX10-NEXT: v_cndmask_b32_e64 v5, v13, v5, s5 -; GFX10-NEXT: v_cndmask_b32_e64 v7, v15, v7, s6 +; GFX10-NEXT: v_max_f64 v[0:1], v[0:1], v[8:9] +; GFX10-NEXT: v_max_f64 v[2:3], v[2:3], v[10:11] +; GFX10-NEXT: v_max_f64 v[4:5], v[4:5], v[12:13] +; GFX10-NEXT: v_max_f64 v[6:7], v[6:7], v[14:15] ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: 
v_maximum_v4f64__nnan_nsz: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_cmp_gt_f64_e32 vcc_lo, v[0:1], v[8:9] -; GFX11-NEXT: v_cmp_gt_f64_e64 s0, v[2:3], v[10:11] -; GFX11-NEXT: v_cmp_gt_f64_e64 s1, v[4:5], v[12:13] -; GFX11-NEXT: v_cmp_gt_f64_e64 s2, v[6:7], v[14:15] -; GFX11-NEXT: v_dual_cndmask_b32 v0, v8, v0 :: v_dual_cndmask_b32 v1, v9, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_cndmask_b32_e64 v2, v10, v2, s0 -; GFX11-NEXT: v_cndmask_b32_e64 v4, v12, v4, s1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-NEXT: v_cndmask_b32_e64 v6, v14, v6, s2 -; GFX11-NEXT: v_cndmask_b32_e64 v3, v11, v3, s0 -; GFX11-NEXT: v_cndmask_b32_e64 v5, v13, v5, s1 -; GFX11-NEXT: v_cndmask_b32_e64 v7, v15, v7, s2 +; GFX11-NEXT: v_max_f64 v[0:1], v[0:1], v[8:9] +; GFX11-NEXT: v_max_f64 v[2:3], v[2:3], v[10:11] +; GFX11-NEXT: v_max_f64 v[4:5], v[4:5], v[12:13] +; GFX11-NEXT: v_max_f64 v[6:7], v[6:7], v[14:15] ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-LABEL: v_maximum_v4f64__nnan_nsz: @@ -3475,782 +1801,244 @@ define <8 x double> @v_maximum_v8f64(<8 x double> %src0, <8 x double> %src1) { ; GFX7-LABEL: v_maximum_v8f64: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_cmp_gt_f64_e32 vcc, v[0:1], v[16:17] -; GFX7-NEXT: v_cmp_o_f64_e64 s[4:5], v[0:1], v[16:17] -; GFX7-NEXT: v_mov_b32_e32 v32, 0x7ff80000 -; GFX7-NEXT: v_cmp_o_f64_e64 s[6:7], v[4:5], v[20:21] -; GFX7-NEXT: v_cmp_o_f64_e64 s[8:9], v[8:9], v[24:25] -; GFX7-NEXT: v_cmp_gt_f64_e64 s[10:11], v[12:13], v[28:29] -; GFX7-NEXT: v_cmp_o_f64_e64 s[12:13], v[12:13], v[28:29] -; GFX7-NEXT: v_cndmask_b32_e32 v31, v17, v1, vcc -; GFX7-NEXT: v_cndmask_b32_e64 v34, v32, v31, s[4:5] -; GFX7-NEXT: v_cndmask_b32_e32 v31, v16, v0, vcc -; GFX7-NEXT: v_cmp_class_f64_e64 vcc, v[0:1], 64 -; GFX7-NEXT: v_cndmask_b32_e64 v33, 0, v31, s[4:5] -; GFX7-NEXT: v_cmp_o_f64_e64 s[4:5], v[2:3], v[18:19] ; 
GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 -; GFX7-NEXT: v_cndmask_b32_e32 v0, v33, v0, vcc -; GFX7-NEXT: v_cndmask_b32_e32 v1, v34, v1, vcc -; GFX7-NEXT: v_cmp_class_f64_e64 vcc, v[16:17], 64 -; GFX7-NEXT: v_cndmask_b32_e32 v0, v0, v16, vcc -; GFX7-NEXT: v_cndmask_b32_e32 v1, v1, v17, vcc -; GFX7-NEXT: v_cmp_eq_f64_e32 vcc, 0, v[33:34] -; GFX7-NEXT: v_cndmask_b32_e32 v0, v33, v0, vcc -; GFX7-NEXT: v_cndmask_b32_e32 v1, v34, v1, vcc -; GFX7-NEXT: v_cmp_gt_f64_e32 vcc, v[2:3], v[18:19] -; GFX7-NEXT: v_cndmask_b32_e32 v16, v19, v3, vcc -; GFX7-NEXT: v_cndmask_b32_e64 v17, v32, v16, s[4:5] -; GFX7-NEXT: v_cndmask_b32_e32 v16, v18, v2, vcc -; GFX7-NEXT: v_cmp_class_f64_e64 vcc, v[2:3], 64 -; GFX7-NEXT: v_cndmask_b32_e64 v16, 0, v16, s[4:5] -; GFX7-NEXT: v_cmp_gt_f64_e64 s[4:5], v[4:5], v[20:21] -; GFX7-NEXT: v_cndmask_b32_e32 v2, v16, v2, vcc -; GFX7-NEXT: v_cndmask_b32_e32 v3, v17, v3, vcc -; GFX7-NEXT: v_cmp_class_f64_e64 vcc, v[18:19], 64 -; GFX7-NEXT: v_cndmask_b32_e32 v2, v2, v18, vcc -; GFX7-NEXT: v_cndmask_b32_e32 v3, v3, v19, vcc -; GFX7-NEXT: v_cmp_eq_f64_e32 vcc, 0, v[16:17] -; GFX7-NEXT: v_cndmask_b32_e64 v18, v21, v5, s[4:5] -; GFX7-NEXT: v_cndmask_b32_e32 v2, v16, v2, vcc -; GFX7-NEXT: v_cndmask_b32_e32 v3, v17, v3, vcc -; GFX7-NEXT: v_cmp_class_f64_e64 vcc, v[4:5], 64 -; GFX7-NEXT: v_cndmask_b32_e64 v16, v20, v4, s[4:5] -; GFX7-NEXT: v_cmp_class_f64_e64 s[4:5], v[20:21], 64 -; GFX7-NEXT: v_cndmask_b32_e64 v17, v32, v18, s[6:7] -; GFX7-NEXT: v_cndmask_b32_e64 v16, 0, v16, s[6:7] -; GFX7-NEXT: v_cmp_o_f64_e64 s[6:7], v[6:7], v[22:23] -; GFX7-NEXT: v_cndmask_b32_e32 v4, v16, v4, vcc -; GFX7-NEXT: v_cndmask_b32_e32 v5, v17, v5, vcc -; GFX7-NEXT: v_cndmask_b32_e64 v4, v4, v20, s[4:5] -; GFX7-NEXT: v_cndmask_b32_e64 v5, v5, v21, s[4:5] -; GFX7-NEXT: v_cmp_eq_f64_e64 s[4:5], 0, v[16:17] -; GFX7-NEXT: v_cmp_gt_f64_e32 vcc, v[6:7], v[22:23] -; GFX7-NEXT: v_cndmask_b32_e64 v4, v16, v4, s[4:5] -; GFX7-NEXT: v_cndmask_b32_e64 v5, v17, v5, s[4:5] -; GFX7-NEXT: 
v_cmp_gt_f64_e64 s[4:5], v[8:9], v[24:25] -; GFX7-NEXT: v_cndmask_b32_e32 v18, v23, v7, vcc -; GFX7-NEXT: v_cndmask_b32_e32 v16, v22, v6, vcc -; GFX7-NEXT: v_cmp_class_f64_e64 vcc, v[6:7], 64 -; GFX7-NEXT: v_cndmask_b32_e64 v19, v32, v18, s[6:7] -; GFX7-NEXT: v_cndmask_b32_e64 v18, 0, v16, s[6:7] -; GFX7-NEXT: v_cmp_class_f64_e64 s[6:7], v[22:23], 64 -; GFX7-NEXT: v_cndmask_b32_e64 v16, v25, v9, s[4:5] -; GFX7-NEXT: v_cndmask_b32_e64 v17, v32, v16, s[8:9] -; GFX7-NEXT: v_cndmask_b32_e64 v16, v24, v8, s[4:5] -; GFX7-NEXT: v_cndmask_b32_e32 v6, v18, v6, vcc -; GFX7-NEXT: v_cndmask_b32_e32 v7, v19, v7, vcc -; GFX7-NEXT: v_cmp_eq_f64_e32 vcc, 0, v[18:19] -; GFX7-NEXT: v_cmp_class_f64_e64 s[4:5], v[8:9], 64 -; GFX7-NEXT: v_cndmask_b32_e64 v6, v6, v22, s[6:7] -; GFX7-NEXT: v_cndmask_b32_e64 v7, v7, v23, s[6:7] -; GFX7-NEXT: v_cndmask_b32_e64 v16, 0, v16, s[8:9] -; GFX7-NEXT: v_cmp_class_f64_e64 s[6:7], v[24:25], 64 -; GFX7-NEXT: v_cmp_o_f64_e64 s[8:9], v[10:11], v[26:27] -; GFX7-NEXT: v_cndmask_b32_e32 v6, v18, v6, vcc -; GFX7-NEXT: v_cndmask_b32_e64 v8, v16, v8, s[4:5] -; GFX7-NEXT: v_cndmask_b32_e32 v7, v19, v7, vcc -; GFX7-NEXT: v_cmp_gt_f64_e32 vcc, v[10:11], v[26:27] -; GFX7-NEXT: v_cndmask_b32_e64 v9, v17, v9, s[4:5] -; GFX7-NEXT: v_cmp_eq_f64_e64 s[4:5], 0, v[16:17] -; GFX7-NEXT: v_cndmask_b32_e64 v8, v8, v24, s[6:7] -; GFX7-NEXT: v_cndmask_b32_e64 v9, v9, v25, s[6:7] -; GFX7-NEXT: v_cmp_class_f64_e64 s[6:7], v[26:27], 64 -; GFX7-NEXT: v_cndmask_b32_e32 v18, v27, v11, vcc -; GFX7-NEXT: v_cndmask_b32_e64 v19, v32, v18, s[8:9] -; GFX7-NEXT: v_cndmask_b32_e64 v8, v16, v8, s[4:5] -; GFX7-NEXT: v_cndmask_b32_e32 v16, v26, v10, vcc -; GFX7-NEXT: v_cmp_class_f64_e64 vcc, v[10:11], 64 -; GFX7-NEXT: v_cndmask_b32_e64 v18, 0, v16, s[8:9] -; GFX7-NEXT: v_cmp_eq_f64_e64 s[8:9], 0, v[18:19] -; GFX7-NEXT: v_cndmask_b32_e64 v9, v17, v9, s[4:5] +; GFX7-NEXT: v_max_f64 v[32:33], v[2:3], v[18:19] +; GFX7-NEXT: v_cmp_u_f64_e32 vcc, v[2:3], v[18:19] +; GFX7-NEXT: v_max_f64 v[18:19], 
v[4:5], v[20:21] +; GFX7-NEXT: v_cmp_u_f64_e64 s[4:5], v[4:5], v[20:21] +; GFX7-NEXT: v_max_f64 v[2:3], v[0:1], v[16:17] +; GFX7-NEXT: v_cmp_u_f64_e64 s[8:9], v[0:1], v[16:17] +; GFX7-NEXT: v_mov_b32_e32 v34, 0x7ff80000 +; GFX7-NEXT: v_max_f64 v[20:21], v[6:7], v[22:23] +; GFX7-NEXT: v_cmp_u_f64_e64 s[6:7], v[6:7], v[22:23] +; GFX7-NEXT: v_max_f64 v[16:17], v[8:9], v[24:25] +; GFX7-NEXT: v_cmp_u_f64_e64 s[10:11], v[8:9], v[24:25] +; GFX7-NEXT: v_max_f64 v[22:23], v[10:11], v[26:27] +; GFX7-NEXT: v_cmp_u_f64_e64 s[12:13], v[10:11], v[26:27] +; GFX7-NEXT: v_max_f64 v[24:25], v[12:13], v[28:29] +; GFX7-NEXT: v_cmp_u_f64_e64 s[14:15], v[12:13], v[28:29] +; GFX7-NEXT: v_cndmask_b32_e64 v0, v2, 0, s[8:9] +; GFX7-NEXT: v_cndmask_b32_e64 v1, v3, v34, s[8:9] +; GFX7-NEXT: v_cndmask_b32_e64 v2, v32, 0, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v3, v33, v34, vcc +; GFX7-NEXT: v_cndmask_b32_e64 v4, v18, 0, s[4:5] +; GFX7-NEXT: v_cndmask_b32_e64 v5, v19, v34, s[4:5] +; GFX7-NEXT: v_cndmask_b32_e64 v6, v20, 0, s[6:7] +; GFX7-NEXT: v_cndmask_b32_e64 v7, v21, v34, s[6:7] +; GFX7-NEXT: v_cndmask_b32_e64 v8, v16, 0, s[10:11] +; GFX7-NEXT: v_cndmask_b32_e64 v9, v17, v34, s[10:11] +; GFX7-NEXT: v_cndmask_b32_e64 v10, v22, 0, s[12:13] +; GFX7-NEXT: v_cndmask_b32_e64 v11, v23, v34, s[12:13] +; GFX7-NEXT: v_cndmask_b32_e64 v12, v24, 0, s[14:15] +; GFX7-NEXT: v_cndmask_b32_e64 v13, v25, v34, s[14:15] ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_cmp_o_f64_e64 s[4:5], v[14:15], v[30:31] -; GFX7-NEXT: v_cndmask_b32_e64 v16, v29, v13, s[10:11] -; GFX7-NEXT: v_cndmask_b32_e64 v17, v32, v16, s[12:13] -; GFX7-NEXT: v_cndmask_b32_e64 v16, v28, v12, s[10:11] -; GFX7-NEXT: v_cndmask_b32_e32 v10, v18, v10, vcc -; GFX7-NEXT: v_cndmask_b32_e32 v11, v19, v11, vcc -; GFX7-NEXT: v_cmp_gt_f64_e32 vcc, v[14:15], v[30:31] -; GFX7-NEXT: v_cndmask_b32_e64 v10, v10, v26, s[6:7] -; GFX7-NEXT: v_cndmask_b32_e64 v11, v11, v27, s[6:7] -; GFX7-NEXT: v_cndmask_b32_e64 v10, v18, v10, s[8:9] -; GFX7-NEXT: 
v_cndmask_b32_e64 v11, v19, v11, s[8:9] -; GFX7-NEXT: v_cmp_class_f64_e64 s[6:7], v[12:13], 64 -; GFX7-NEXT: v_cndmask_b32_e64 v16, 0, v16, s[12:13] -; GFX7-NEXT: v_cmp_class_f64_e64 s[8:9], v[28:29], 64 -; GFX7-NEXT: v_cndmask_b32_e32 v18, v31, v15, vcc -; GFX7-NEXT: v_cndmask_b32_e64 v19, v32, v18, s[4:5] -; GFX7-NEXT: v_cndmask_b32_e32 v18, v30, v14, vcc -; GFX7-NEXT: v_cmp_class_f64_e64 vcc, v[14:15], 64 -; GFX7-NEXT: v_cndmask_b32_e64 v18, 0, v18, s[4:5] -; GFX7-NEXT: v_cmp_class_f64_e64 s[4:5], v[30:31], 64 -; GFX7-NEXT: v_cmp_eq_f64_e64 s[10:11], 0, v[16:17] -; GFX7-NEXT: v_cmp_eq_f64_e64 s[12:13], 0, v[18:19] -; GFX7-NEXT: v_cndmask_b32_e64 v12, v16, v12, s[6:7] -; GFX7-NEXT: v_cndmask_b32_e64 v13, v17, v13, s[6:7] -; GFX7-NEXT: v_cndmask_b32_e64 v12, v12, v28, s[8:9] -; GFX7-NEXT: v_cndmask_b32_e32 v14, v18, v14, vcc -; GFX7-NEXT: v_cndmask_b32_e32 v15, v19, v15, vcc -; GFX7-NEXT: v_cndmask_b32_e64 v14, v14, v30, s[4:5] -; GFX7-NEXT: v_cndmask_b32_e64 v13, v13, v29, s[8:9] -; GFX7-NEXT: v_cndmask_b32_e64 v15, v15, v31, s[4:5] -; GFX7-NEXT: v_cndmask_b32_e64 v12, v16, v12, s[10:11] -; GFX7-NEXT: v_cndmask_b32_e64 v13, v17, v13, s[10:11] -; GFX7-NEXT: v_cndmask_b32_e64 v14, v18, v14, s[12:13] -; GFX7-NEXT: v_cndmask_b32_e64 v15, v19, v15, s[12:13] +; GFX7-NEXT: v_max_f64 v[18:19], v[14:15], v[30:31] +; GFX7-NEXT: v_cmp_u_f64_e32 vcc, v[14:15], v[30:31] +; GFX7-NEXT: v_cndmask_b32_e64 v14, v18, 0, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v15, v19, v34, vcc ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_maximum_v8f64: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_cmp_gt_f64_e32 vcc, v[0:1], v[16:17] -; GFX8-NEXT: v_cmp_o_f64_e64 s[4:5], v[0:1], v[16:17] -; GFX8-NEXT: v_mov_b32_e32 v32, 0x7ff80000 -; GFX8-NEXT: v_cmp_o_f64_e64 s[6:7], v[4:5], v[20:21] -; GFX8-NEXT: v_cmp_o_f64_e64 s[8:9], v[8:9], v[24:25] -; GFX8-NEXT: v_cmp_gt_f64_e64 s[10:11], v[12:13], v[28:29] -; GFX8-NEXT: v_cmp_o_f64_e64 s[12:13], v[12:13], v[28:29] 
-; GFX8-NEXT: v_cndmask_b32_e32 v31, v17, v1, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v34, v32, v31, s[4:5] -; GFX8-NEXT: v_cndmask_b32_e32 v31, v16, v0, vcc -; GFX8-NEXT: v_cmp_class_f64_e64 vcc, v[0:1], 64 -; GFX8-NEXT: v_cndmask_b32_e64 v33, 0, v31, s[4:5] -; GFX8-NEXT: v_cmp_o_f64_e64 s[4:5], v[2:3], v[18:19] ; GFX8-NEXT: buffer_load_dword v31, off, s[0:3], s32 -; GFX8-NEXT: v_cndmask_b32_e32 v0, v33, v0, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v1, v34, v1, vcc -; GFX8-NEXT: v_cmp_class_f64_e64 vcc, v[16:17], 64 -; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v16, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v17, vcc -; GFX8-NEXT: v_cmp_eq_f64_e32 vcc, 0, v[33:34] -; GFX8-NEXT: v_cndmask_b32_e32 v0, v33, v0, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v1, v34, v1, vcc -; GFX8-NEXT: v_cmp_gt_f64_e32 vcc, v[2:3], v[18:19] -; GFX8-NEXT: v_cndmask_b32_e32 v16, v19, v3, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v17, v32, v16, s[4:5] -; GFX8-NEXT: v_cndmask_b32_e32 v16, v18, v2, vcc -; GFX8-NEXT: v_cmp_class_f64_e64 vcc, v[2:3], 64 -; GFX8-NEXT: v_cndmask_b32_e64 v16, 0, v16, s[4:5] -; GFX8-NEXT: v_cmp_gt_f64_e64 s[4:5], v[4:5], v[20:21] -; GFX8-NEXT: v_cndmask_b32_e32 v2, v16, v2, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v3, v17, v3, vcc -; GFX8-NEXT: v_cmp_class_f64_e64 vcc, v[18:19], 64 -; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v18, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v3, v3, v19, vcc -; GFX8-NEXT: v_cmp_eq_f64_e32 vcc, 0, v[16:17] -; GFX8-NEXT: v_cndmask_b32_e64 v18, v21, v5, s[4:5] -; GFX8-NEXT: v_cndmask_b32_e32 v2, v16, v2, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v3, v17, v3, vcc -; GFX8-NEXT: v_cmp_class_f64_e64 vcc, v[4:5], 64 -; GFX8-NEXT: v_cndmask_b32_e64 v16, v20, v4, s[4:5] -; GFX8-NEXT: v_cmp_class_f64_e64 s[4:5], v[20:21], 64 -; GFX8-NEXT: v_cndmask_b32_e64 v17, v32, v18, s[6:7] -; GFX8-NEXT: v_cndmask_b32_e64 v16, 0, v16, s[6:7] -; GFX8-NEXT: v_cmp_o_f64_e64 s[6:7], v[6:7], v[22:23] -; GFX8-NEXT: v_cndmask_b32_e32 v4, v16, v4, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v5, v17, v5, vcc -; 
GFX8-NEXT: v_cndmask_b32_e64 v4, v4, v20, s[4:5] -; GFX8-NEXT: v_cndmask_b32_e64 v5, v5, v21, s[4:5] -; GFX8-NEXT: v_cmp_eq_f64_e64 s[4:5], 0, v[16:17] -; GFX8-NEXT: v_cmp_gt_f64_e32 vcc, v[6:7], v[22:23] -; GFX8-NEXT: v_cndmask_b32_e64 v4, v16, v4, s[4:5] -; GFX8-NEXT: v_cndmask_b32_e64 v5, v17, v5, s[4:5] -; GFX8-NEXT: v_cmp_gt_f64_e64 s[4:5], v[8:9], v[24:25] -; GFX8-NEXT: v_cndmask_b32_e32 v18, v23, v7, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v16, v22, v6, vcc -; GFX8-NEXT: v_cmp_class_f64_e64 vcc, v[6:7], 64 -; GFX8-NEXT: v_cndmask_b32_e64 v19, v32, v18, s[6:7] -; GFX8-NEXT: v_cndmask_b32_e64 v18, 0, v16, s[6:7] -; GFX8-NEXT: v_cmp_class_f64_e64 s[6:7], v[22:23], 64 -; GFX8-NEXT: v_cndmask_b32_e64 v16, v25, v9, s[4:5] -; GFX8-NEXT: v_cndmask_b32_e64 v17, v32, v16, s[8:9] -; GFX8-NEXT: v_cndmask_b32_e64 v16, v24, v8, s[4:5] -; GFX8-NEXT: v_cndmask_b32_e32 v6, v18, v6, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v7, v19, v7, vcc -; GFX8-NEXT: v_cmp_eq_f64_e32 vcc, 0, v[18:19] -; GFX8-NEXT: v_cmp_class_f64_e64 s[4:5], v[8:9], 64 -; GFX8-NEXT: v_cndmask_b32_e64 v6, v6, v22, s[6:7] -; GFX8-NEXT: v_cndmask_b32_e64 v7, v7, v23, s[6:7] -; GFX8-NEXT: v_cndmask_b32_e64 v16, 0, v16, s[8:9] -; GFX8-NEXT: v_cmp_class_f64_e64 s[6:7], v[24:25], 64 -; GFX8-NEXT: v_cmp_o_f64_e64 s[8:9], v[10:11], v[26:27] -; GFX8-NEXT: v_cndmask_b32_e32 v6, v18, v6, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v8, v16, v8, s[4:5] -; GFX8-NEXT: v_cndmask_b32_e32 v7, v19, v7, vcc -; GFX8-NEXT: v_cmp_gt_f64_e32 vcc, v[10:11], v[26:27] -; GFX8-NEXT: v_cndmask_b32_e64 v9, v17, v9, s[4:5] -; GFX8-NEXT: v_cmp_eq_f64_e64 s[4:5], 0, v[16:17] -; GFX8-NEXT: v_cndmask_b32_e64 v8, v8, v24, s[6:7] -; GFX8-NEXT: v_cndmask_b32_e64 v9, v9, v25, s[6:7] -; GFX8-NEXT: v_cmp_class_f64_e64 s[6:7], v[26:27], 64 -; GFX8-NEXT: v_cndmask_b32_e32 v18, v27, v11, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v19, v32, v18, s[8:9] -; GFX8-NEXT: v_cndmask_b32_e64 v8, v16, v8, s[4:5] -; GFX8-NEXT: v_cndmask_b32_e32 v16, v26, v10, vcc -; GFX8-NEXT: 
v_cmp_class_f64_e64 vcc, v[10:11], 64 -; GFX8-NEXT: v_cndmask_b32_e64 v18, 0, v16, s[8:9] -; GFX8-NEXT: v_cmp_eq_f64_e64 s[8:9], 0, v[18:19] -; GFX8-NEXT: v_cndmask_b32_e64 v9, v17, v9, s[4:5] +; GFX8-NEXT: v_max_f64 v[32:33], v[2:3], v[18:19] +; GFX8-NEXT: v_cmp_u_f64_e32 vcc, v[2:3], v[18:19] +; GFX8-NEXT: v_max_f64 v[18:19], v[4:5], v[20:21] +; GFX8-NEXT: v_cmp_u_f64_e64 s[4:5], v[4:5], v[20:21] +; GFX8-NEXT: v_max_f64 v[2:3], v[0:1], v[16:17] +; GFX8-NEXT: v_cmp_u_f64_e64 s[8:9], v[0:1], v[16:17] +; GFX8-NEXT: v_mov_b32_e32 v34, 0x7ff80000 +; GFX8-NEXT: v_max_f64 v[20:21], v[6:7], v[22:23] +; GFX8-NEXT: v_cmp_u_f64_e64 s[6:7], v[6:7], v[22:23] +; GFX8-NEXT: v_max_f64 v[16:17], v[8:9], v[24:25] +; GFX8-NEXT: v_cmp_u_f64_e64 s[10:11], v[8:9], v[24:25] +; GFX8-NEXT: v_max_f64 v[22:23], v[10:11], v[26:27] +; GFX8-NEXT: v_cmp_u_f64_e64 s[12:13], v[10:11], v[26:27] +; GFX8-NEXT: v_max_f64 v[24:25], v[12:13], v[28:29] +; GFX8-NEXT: v_cmp_u_f64_e64 s[14:15], v[12:13], v[28:29] +; GFX8-NEXT: v_cndmask_b32_e64 v0, v2, 0, s[8:9] +; GFX8-NEXT: v_cndmask_b32_e64 v1, v3, v34, s[8:9] +; GFX8-NEXT: v_cndmask_b32_e64 v2, v32, 0, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v3, v33, v34, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v4, v18, 0, s[4:5] +; GFX8-NEXT: v_cndmask_b32_e64 v5, v19, v34, s[4:5] +; GFX8-NEXT: v_cndmask_b32_e64 v6, v20, 0, s[6:7] +; GFX8-NEXT: v_cndmask_b32_e64 v7, v21, v34, s[6:7] +; GFX8-NEXT: v_cndmask_b32_e64 v8, v16, 0, s[10:11] +; GFX8-NEXT: v_cndmask_b32_e64 v9, v17, v34, s[10:11] +; GFX8-NEXT: v_cndmask_b32_e64 v10, v22, 0, s[12:13] +; GFX8-NEXT: v_cndmask_b32_e64 v11, v23, v34, s[12:13] +; GFX8-NEXT: v_cndmask_b32_e64 v12, v24, 0, s[14:15] +; GFX8-NEXT: v_cndmask_b32_e64 v13, v25, v34, s[14:15] ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_cmp_o_f64_e64 s[4:5], v[14:15], v[30:31] -; GFX8-NEXT: v_cndmask_b32_e64 v16, v29, v13, s[10:11] -; GFX8-NEXT: v_cndmask_b32_e64 v17, v32, v16, s[12:13] -; GFX8-NEXT: v_cndmask_b32_e64 v16, v28, v12, s[10:11] -; GFX8-NEXT: 
v_cndmask_b32_e32 v10, v18, v10, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v11, v19, v11, vcc -; GFX8-NEXT: v_cmp_gt_f64_e32 vcc, v[14:15], v[30:31] -; GFX8-NEXT: v_cndmask_b32_e64 v10, v10, v26, s[6:7] -; GFX8-NEXT: v_cndmask_b32_e64 v11, v11, v27, s[6:7] -; GFX8-NEXT: v_cndmask_b32_e64 v10, v18, v10, s[8:9] -; GFX8-NEXT: v_cndmask_b32_e64 v11, v19, v11, s[8:9] -; GFX8-NEXT: v_cmp_class_f64_e64 s[6:7], v[12:13], 64 -; GFX8-NEXT: v_cndmask_b32_e64 v16, 0, v16, s[12:13] -; GFX8-NEXT: v_cmp_class_f64_e64 s[8:9], v[28:29], 64 -; GFX8-NEXT: v_cndmask_b32_e32 v18, v31, v15, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v19, v32, v18, s[4:5] -; GFX8-NEXT: v_cndmask_b32_e32 v18, v30, v14, vcc -; GFX8-NEXT: v_cmp_class_f64_e64 vcc, v[14:15], 64 -; GFX8-NEXT: v_cndmask_b32_e64 v18, 0, v18, s[4:5] -; GFX8-NEXT: v_cmp_class_f64_e64 s[4:5], v[30:31], 64 -; GFX8-NEXT: v_cmp_eq_f64_e64 s[10:11], 0, v[16:17] -; GFX8-NEXT: v_cmp_eq_f64_e64 s[12:13], 0, v[18:19] -; GFX8-NEXT: v_cndmask_b32_e64 v12, v16, v12, s[6:7] -; GFX8-NEXT: v_cndmask_b32_e64 v13, v17, v13, s[6:7] -; GFX8-NEXT: v_cndmask_b32_e64 v12, v12, v28, s[8:9] -; GFX8-NEXT: v_cndmask_b32_e32 v14, v18, v14, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v15, v19, v15, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v14, v14, v30, s[4:5] -; GFX8-NEXT: v_cndmask_b32_e64 v13, v13, v29, s[8:9] -; GFX8-NEXT: v_cndmask_b32_e64 v15, v15, v31, s[4:5] -; GFX8-NEXT: v_cndmask_b32_e64 v12, v16, v12, s[10:11] -; GFX8-NEXT: v_cndmask_b32_e64 v13, v17, v13, s[10:11] -; GFX8-NEXT: v_cndmask_b32_e64 v14, v18, v14, s[12:13] -; GFX8-NEXT: v_cndmask_b32_e64 v15, v19, v15, s[12:13] +; GFX8-NEXT: v_max_f64 v[18:19], v[14:15], v[30:31] +; GFX8-NEXT: v_cmp_u_f64_e32 vcc, v[14:15], v[30:31] +; GFX8-NEXT: v_cndmask_b32_e64 v14, v18, 0, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v15, v19, v34, vcc ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_maximum_v8f64: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_cmp_gt_f64_e32 vcc, v[0:1], v[16:17] -; 
GFX9-NEXT: v_cmp_o_f64_e64 s[4:5], v[0:1], v[16:17] -; GFX9-NEXT: v_mov_b32_e32 v32, 0x7ff80000 -; GFX9-NEXT: v_cmp_o_f64_e64 s[6:7], v[4:5], v[20:21] -; GFX9-NEXT: v_cmp_o_f64_e64 s[8:9], v[8:9], v[24:25] -; GFX9-NEXT: v_cmp_gt_f64_e64 s[10:11], v[12:13], v[28:29] -; GFX9-NEXT: v_cmp_o_f64_e64 s[12:13], v[12:13], v[28:29] -; GFX9-NEXT: v_cndmask_b32_e32 v31, v17, v1, vcc -; GFX9-NEXT: v_cndmask_b32_e64 v34, v32, v31, s[4:5] -; GFX9-NEXT: v_cndmask_b32_e32 v31, v16, v0, vcc -; GFX9-NEXT: v_cmp_class_f64_e64 vcc, v[0:1], 64 -; GFX9-NEXT: v_cndmask_b32_e64 v33, 0, v31, s[4:5] -; GFX9-NEXT: v_cmp_o_f64_e64 s[4:5], v[2:3], v[18:19] ; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v33, v0, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v1, v34, v1, vcc -; GFX9-NEXT: v_cmp_class_f64_e64 vcc, v[16:17], 64 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v16, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v17, vcc -; GFX9-NEXT: v_cmp_eq_f64_e32 vcc, 0, v[33:34] -; GFX9-NEXT: v_cndmask_b32_e32 v0, v33, v0, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v1, v34, v1, vcc -; GFX9-NEXT: v_cmp_gt_f64_e32 vcc, v[2:3], v[18:19] -; GFX9-NEXT: v_cndmask_b32_e32 v16, v19, v3, vcc -; GFX9-NEXT: v_cndmask_b32_e64 v17, v32, v16, s[4:5] -; GFX9-NEXT: v_cndmask_b32_e32 v16, v18, v2, vcc -; GFX9-NEXT: v_cmp_class_f64_e64 vcc, v[2:3], 64 -; GFX9-NEXT: v_cndmask_b32_e64 v16, 0, v16, s[4:5] -; GFX9-NEXT: v_cmp_gt_f64_e64 s[4:5], v[4:5], v[20:21] -; GFX9-NEXT: v_cndmask_b32_e32 v2, v16, v2, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v3, v17, v3, vcc -; GFX9-NEXT: v_cmp_class_f64_e64 vcc, v[18:19], 64 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v18, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v19, vcc -; GFX9-NEXT: v_cmp_eq_f64_e32 vcc, 0, v[16:17] -; GFX9-NEXT: v_cndmask_b32_e64 v18, v21, v5, s[4:5] -; GFX9-NEXT: v_cndmask_b32_e32 v2, v16, v2, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v3, v17, v3, vcc -; GFX9-NEXT: v_cmp_class_f64_e64 vcc, v[4:5], 64 -; GFX9-NEXT: v_cndmask_b32_e64 v16, v20, v4, s[4:5] 
-; GFX9-NEXT: v_cmp_class_f64_e64 s[4:5], v[20:21], 64 -; GFX9-NEXT: v_cndmask_b32_e64 v17, v32, v18, s[6:7] -; GFX9-NEXT: v_cndmask_b32_e64 v16, 0, v16, s[6:7] -; GFX9-NEXT: v_cmp_o_f64_e64 s[6:7], v[6:7], v[22:23] -; GFX9-NEXT: v_cndmask_b32_e32 v4, v16, v4, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v5, v17, v5, vcc -; GFX9-NEXT: v_cndmask_b32_e64 v4, v4, v20, s[4:5] -; GFX9-NEXT: v_cndmask_b32_e64 v5, v5, v21, s[4:5] -; GFX9-NEXT: v_cmp_eq_f64_e64 s[4:5], 0, v[16:17] -; GFX9-NEXT: v_cmp_gt_f64_e32 vcc, v[6:7], v[22:23] -; GFX9-NEXT: v_cndmask_b32_e64 v4, v16, v4, s[4:5] -; GFX9-NEXT: v_cndmask_b32_e64 v5, v17, v5, s[4:5] -; GFX9-NEXT: v_cmp_gt_f64_e64 s[4:5], v[8:9], v[24:25] -; GFX9-NEXT: v_cndmask_b32_e32 v18, v23, v7, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v16, v22, v6, vcc -; GFX9-NEXT: v_cmp_class_f64_e64 vcc, v[6:7], 64 -; GFX9-NEXT: v_cndmask_b32_e64 v19, v32, v18, s[6:7] -; GFX9-NEXT: v_cndmask_b32_e64 v18, 0, v16, s[6:7] -; GFX9-NEXT: v_cmp_class_f64_e64 s[6:7], v[22:23], 64 -; GFX9-NEXT: v_cndmask_b32_e64 v16, v25, v9, s[4:5] -; GFX9-NEXT: v_cndmask_b32_e64 v17, v32, v16, s[8:9] -; GFX9-NEXT: v_cndmask_b32_e64 v16, v24, v8, s[4:5] -; GFX9-NEXT: v_cndmask_b32_e32 v6, v18, v6, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v7, v19, v7, vcc -; GFX9-NEXT: v_cmp_eq_f64_e32 vcc, 0, v[18:19] -; GFX9-NEXT: v_cmp_class_f64_e64 s[4:5], v[8:9], 64 -; GFX9-NEXT: v_cndmask_b32_e64 v6, v6, v22, s[6:7] -; GFX9-NEXT: v_cndmask_b32_e64 v7, v7, v23, s[6:7] -; GFX9-NEXT: v_cndmask_b32_e64 v16, 0, v16, s[8:9] -; GFX9-NEXT: v_cmp_class_f64_e64 s[6:7], v[24:25], 64 -; GFX9-NEXT: v_cmp_o_f64_e64 s[8:9], v[10:11], v[26:27] -; GFX9-NEXT: v_cndmask_b32_e32 v6, v18, v6, vcc -; GFX9-NEXT: v_cndmask_b32_e64 v8, v16, v8, s[4:5] -; GFX9-NEXT: v_cndmask_b32_e32 v7, v19, v7, vcc -; GFX9-NEXT: v_cmp_gt_f64_e32 vcc, v[10:11], v[26:27] -; GFX9-NEXT: v_cndmask_b32_e64 v9, v17, v9, s[4:5] -; GFX9-NEXT: v_cmp_eq_f64_e64 s[4:5], 0, v[16:17] -; GFX9-NEXT: v_cndmask_b32_e64 v8, v8, v24, s[6:7] -; GFX9-NEXT: 
v_cndmask_b32_e64 v9, v9, v25, s[6:7] -; GFX9-NEXT: v_cmp_class_f64_e64 s[6:7], v[26:27], 64 -; GFX9-NEXT: v_cndmask_b32_e32 v18, v27, v11, vcc -; GFX9-NEXT: v_cndmask_b32_e64 v19, v32, v18, s[8:9] -; GFX9-NEXT: v_cndmask_b32_e64 v8, v16, v8, s[4:5] -; GFX9-NEXT: v_cndmask_b32_e32 v16, v26, v10, vcc -; GFX9-NEXT: v_cmp_class_f64_e64 vcc, v[10:11], 64 -; GFX9-NEXT: v_cndmask_b32_e64 v18, 0, v16, s[8:9] -; GFX9-NEXT: v_cmp_eq_f64_e64 s[8:9], 0, v[18:19] -; GFX9-NEXT: v_cndmask_b32_e64 v9, v17, v9, s[4:5] +; GFX9-NEXT: v_max_f64 v[32:33], v[2:3], v[18:19] +; GFX9-NEXT: v_cmp_u_f64_e32 vcc, v[2:3], v[18:19] +; GFX9-NEXT: v_max_f64 v[18:19], v[4:5], v[20:21] +; GFX9-NEXT: v_cmp_u_f64_e64 s[4:5], v[4:5], v[20:21] +; GFX9-NEXT: v_max_f64 v[2:3], v[0:1], v[16:17] +; GFX9-NEXT: v_cmp_u_f64_e64 s[8:9], v[0:1], v[16:17] +; GFX9-NEXT: v_mov_b32_e32 v34, 0x7ff80000 +; GFX9-NEXT: v_max_f64 v[20:21], v[6:7], v[22:23] +; GFX9-NEXT: v_cmp_u_f64_e64 s[6:7], v[6:7], v[22:23] +; GFX9-NEXT: v_max_f64 v[16:17], v[8:9], v[24:25] +; GFX9-NEXT: v_cmp_u_f64_e64 s[10:11], v[8:9], v[24:25] +; GFX9-NEXT: v_max_f64 v[22:23], v[10:11], v[26:27] +; GFX9-NEXT: v_cmp_u_f64_e64 s[12:13], v[10:11], v[26:27] +; GFX9-NEXT: v_max_f64 v[24:25], v[12:13], v[28:29] +; GFX9-NEXT: v_cmp_u_f64_e64 s[14:15], v[12:13], v[28:29] +; GFX9-NEXT: v_cndmask_b32_e64 v0, v2, 0, s[8:9] +; GFX9-NEXT: v_cndmask_b32_e64 v1, v3, v34, s[8:9] +; GFX9-NEXT: v_cndmask_b32_e64 v2, v32, 0, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v3, v33, v34, vcc +; GFX9-NEXT: v_cndmask_b32_e64 v4, v18, 0, s[4:5] +; GFX9-NEXT: v_cndmask_b32_e64 v5, v19, v34, s[4:5] +; GFX9-NEXT: v_cndmask_b32_e64 v6, v20, 0, s[6:7] +; GFX9-NEXT: v_cndmask_b32_e64 v7, v21, v34, s[6:7] +; GFX9-NEXT: v_cndmask_b32_e64 v8, v16, 0, s[10:11] +; GFX9-NEXT: v_cndmask_b32_e64 v9, v17, v34, s[10:11] +; GFX9-NEXT: v_cndmask_b32_e64 v10, v22, 0, s[12:13] +; GFX9-NEXT: v_cndmask_b32_e64 v11, v23, v34, s[12:13] +; GFX9-NEXT: v_cndmask_b32_e64 v12, v24, 0, s[14:15] +; GFX9-NEXT: 
v_cndmask_b32_e64 v13, v25, v34, s[14:15] ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_cmp_o_f64_e64 s[4:5], v[14:15], v[30:31] -; GFX9-NEXT: v_cndmask_b32_e64 v16, v29, v13, s[10:11] -; GFX9-NEXT: v_cndmask_b32_e64 v17, v32, v16, s[12:13] -; GFX9-NEXT: v_cndmask_b32_e64 v16, v28, v12, s[10:11] -; GFX9-NEXT: v_cndmask_b32_e32 v10, v18, v10, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v11, v19, v11, vcc -; GFX9-NEXT: v_cmp_gt_f64_e32 vcc, v[14:15], v[30:31] -; GFX9-NEXT: v_cndmask_b32_e64 v10, v10, v26, s[6:7] -; GFX9-NEXT: v_cndmask_b32_e64 v11, v11, v27, s[6:7] -; GFX9-NEXT: v_cndmask_b32_e64 v10, v18, v10, s[8:9] -; GFX9-NEXT: v_cndmask_b32_e64 v11, v19, v11, s[8:9] -; GFX9-NEXT: v_cmp_class_f64_e64 s[6:7], v[12:13], 64 -; GFX9-NEXT: v_cndmask_b32_e64 v16, 0, v16, s[12:13] -; GFX9-NEXT: v_cmp_class_f64_e64 s[8:9], v[28:29], 64 -; GFX9-NEXT: v_cndmask_b32_e32 v18, v31, v15, vcc -; GFX9-NEXT: v_cndmask_b32_e64 v19, v32, v18, s[4:5] -; GFX9-NEXT: v_cndmask_b32_e32 v18, v30, v14, vcc -; GFX9-NEXT: v_cmp_class_f64_e64 vcc, v[14:15], 64 -; GFX9-NEXT: v_cndmask_b32_e64 v18, 0, v18, s[4:5] -; GFX9-NEXT: v_cmp_class_f64_e64 s[4:5], v[30:31], 64 -; GFX9-NEXT: v_cmp_eq_f64_e64 s[10:11], 0, v[16:17] -; GFX9-NEXT: v_cmp_eq_f64_e64 s[12:13], 0, v[18:19] -; GFX9-NEXT: v_cndmask_b32_e64 v12, v16, v12, s[6:7] -; GFX9-NEXT: v_cndmask_b32_e64 v13, v17, v13, s[6:7] -; GFX9-NEXT: v_cndmask_b32_e64 v12, v12, v28, s[8:9] -; GFX9-NEXT: v_cndmask_b32_e32 v14, v18, v14, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v15, v19, v15, vcc -; GFX9-NEXT: v_cndmask_b32_e64 v14, v14, v30, s[4:5] -; GFX9-NEXT: v_cndmask_b32_e64 v13, v13, v29, s[8:9] -; GFX9-NEXT: v_cndmask_b32_e64 v15, v15, v31, s[4:5] -; GFX9-NEXT: v_cndmask_b32_e64 v12, v16, v12, s[10:11] -; GFX9-NEXT: v_cndmask_b32_e64 v13, v17, v13, s[10:11] -; GFX9-NEXT: v_cndmask_b32_e64 v14, v18, v14, s[12:13] -; GFX9-NEXT: v_cndmask_b32_e64 v15, v19, v15, s[12:13] +; GFX9-NEXT: v_max_f64 v[18:19], v[14:15], v[30:31] +; GFX9-NEXT: v_cmp_u_f64_e32 vcc, 
v[14:15], v[30:31] +; GFX9-NEXT: v_cndmask_b32_e64 v14, v18, 0, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v15, v19, v34, vcc ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: v_maximum_v8f64: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: scratch_load_dword v31, off, s32 -; GFX940-NEXT: v_cmp_gt_f64_e32 vcc, v[0:1], v[16:17] -; GFX940-NEXT: v_mov_b32_e32 v32, 0x7ff80000 -; GFX940-NEXT: v_cmp_o_f64_e64 s[0:1], v[0:1], v[16:17] -; GFX940-NEXT: v_cndmask_b32_e32 v33, v17, v1, vcc -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e64 v35, v32, v33, s[0:1] -; GFX940-NEXT: v_cndmask_b32_e32 v33, v16, v0, vcc -; GFX940-NEXT: v_cndmask_b32_e64 v34, 0, v33, s[0:1] -; GFX940-NEXT: v_cmp_class_f64_e64 vcc, v[0:1], 64 -; GFX940-NEXT: v_cmp_class_f64_e64 s[0:1], v[16:17], 64 -; GFX940-NEXT: v_cmp_eq_f64_e64 s[2:3], 0, v[34:35] -; GFX940-NEXT: v_cndmask_b32_e32 v0, v34, v0, vcc -; GFX940-NEXT: v_cndmask_b32_e32 v1, v35, v1, vcc -; GFX940-NEXT: v_cmp_gt_f64_e32 vcc, v[2:3], v[18:19] -; GFX940-NEXT: v_cndmask_b32_e64 v0, v0, v16, s[0:1] -; GFX940-NEXT: v_cndmask_b32_e64 v1, v1, v17, s[0:1] -; GFX940-NEXT: v_cndmask_b32_e32 v16, v19, v3, vcc -; GFX940-NEXT: v_cmp_o_f64_e64 s[0:1], v[2:3], v[18:19] -; GFX940-NEXT: v_cndmask_b32_e64 v0, v34, v0, s[2:3] -; GFX940-NEXT: v_cndmask_b32_e64 v1, v35, v1, s[2:3] -; GFX940-NEXT: v_cndmask_b32_e64 v17, v32, v16, s[0:1] -; GFX940-NEXT: v_cndmask_b32_e32 v16, v18, v2, vcc -; GFX940-NEXT: v_cndmask_b32_e64 v16, 0, v16, s[0:1] -; GFX940-NEXT: v_cmp_class_f64_e64 vcc, v[2:3], 64 -; GFX940-NEXT: v_cmp_class_f64_e64 s[0:1], v[18:19], 64 -; GFX940-NEXT: v_cmp_eq_f64_e64 s[2:3], 0, v[16:17] -; GFX940-NEXT: v_cndmask_b32_e32 v2, v16, v2, vcc -; GFX940-NEXT: v_cndmask_b32_e64 v2, v2, v18, s[0:1] -; GFX940-NEXT: v_cndmask_b32_e32 v3, v17, v3, vcc -; GFX940-NEXT: v_cmp_gt_f64_e32 vcc, v[4:5], v[20:21] -; GFX940-NEXT: v_cndmask_b32_e64 v2, v16, v2, s[2:3] -; GFX940-NEXT: v_cndmask_b32_e64 v3, v3, v19, s[0:1] -; 
GFX940-NEXT: v_cndmask_b32_e32 v16, v21, v5, vcc -; GFX940-NEXT: v_cmp_o_f64_e64 s[0:1], v[4:5], v[20:21] -; GFX940-NEXT: v_cndmask_b32_e64 v3, v17, v3, s[2:3] -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e64 v17, v32, v16, s[0:1] -; GFX940-NEXT: v_cndmask_b32_e32 v16, v20, v4, vcc -; GFX940-NEXT: v_cndmask_b32_e64 v16, 0, v16, s[0:1] -; GFX940-NEXT: v_cmp_class_f64_e64 vcc, v[4:5], 64 -; GFX940-NEXT: v_cmp_class_f64_e64 s[0:1], v[20:21], 64 -; GFX940-NEXT: v_cmp_eq_f64_e64 s[2:3], 0, v[16:17] -; GFX940-NEXT: v_cndmask_b32_e32 v4, v16, v4, vcc -; GFX940-NEXT: v_cndmask_b32_e64 v4, v4, v20, s[0:1] -; GFX940-NEXT: v_cndmask_b32_e32 v5, v17, v5, vcc -; GFX940-NEXT: v_cmp_gt_f64_e32 vcc, v[6:7], v[22:23] -; GFX940-NEXT: v_cndmask_b32_e64 v4, v16, v4, s[2:3] -; GFX940-NEXT: v_cndmask_b32_e64 v5, v5, v21, s[0:1] -; GFX940-NEXT: v_cndmask_b32_e32 v16, v23, v7, vcc -; GFX940-NEXT: v_cmp_o_f64_e64 s[0:1], v[6:7], v[22:23] -; GFX940-NEXT: v_cndmask_b32_e64 v5, v17, v5, s[2:3] -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e64 v17, v32, v16, s[0:1] -; GFX940-NEXT: v_cndmask_b32_e32 v16, v22, v6, vcc -; GFX940-NEXT: v_cndmask_b32_e64 v16, 0, v16, s[0:1] -; GFX940-NEXT: v_cmp_class_f64_e64 vcc, v[6:7], 64 -; GFX940-NEXT: v_cmp_class_f64_e64 s[0:1], v[22:23], 64 -; GFX940-NEXT: v_cmp_eq_f64_e64 s[2:3], 0, v[16:17] -; GFX940-NEXT: v_cndmask_b32_e32 v6, v16, v6, vcc -; GFX940-NEXT: v_cndmask_b32_e64 v6, v6, v22, s[0:1] -; GFX940-NEXT: v_cndmask_b32_e32 v7, v17, v7, vcc -; GFX940-NEXT: v_cmp_gt_f64_e32 vcc, v[8:9], v[24:25] -; GFX940-NEXT: v_cndmask_b32_e64 v6, v16, v6, s[2:3] -; GFX940-NEXT: v_cndmask_b32_e64 v7, v7, v23, s[0:1] -; GFX940-NEXT: v_cndmask_b32_e32 v16, v25, v9, vcc -; GFX940-NEXT: v_cmp_o_f64_e64 s[0:1], v[8:9], v[24:25] -; GFX940-NEXT: v_cndmask_b32_e64 v7, v17, v7, s[2:3] -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e64 v17, v32, v16, s[0:1] -; GFX940-NEXT: v_cndmask_b32_e32 v16, v24, v8, vcc -; GFX940-NEXT: v_cndmask_b32_e64 v16, 0, v16, 
s[0:1] -; GFX940-NEXT: v_cmp_class_f64_e64 vcc, v[8:9], 64 -; GFX940-NEXT: v_cmp_class_f64_e64 s[0:1], v[24:25], 64 -; GFX940-NEXT: v_cmp_eq_f64_e64 s[2:3], 0, v[16:17] -; GFX940-NEXT: v_cndmask_b32_e32 v8, v16, v8, vcc -; GFX940-NEXT: v_cndmask_b32_e64 v8, v8, v24, s[0:1] -; GFX940-NEXT: v_cndmask_b32_e32 v9, v17, v9, vcc -; GFX940-NEXT: v_cmp_gt_f64_e32 vcc, v[10:11], v[26:27] -; GFX940-NEXT: v_cndmask_b32_e64 v8, v16, v8, s[2:3] -; GFX940-NEXT: v_cndmask_b32_e64 v9, v9, v25, s[0:1] -; GFX940-NEXT: v_cndmask_b32_e32 v16, v27, v11, vcc -; GFX940-NEXT: v_cmp_o_f64_e64 s[0:1], v[10:11], v[26:27] -; GFX940-NEXT: v_cndmask_b32_e64 v9, v17, v9, s[2:3] -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e64 v17, v32, v16, s[0:1] -; GFX940-NEXT: v_cndmask_b32_e32 v16, v26, v10, vcc -; GFX940-NEXT: v_cndmask_b32_e64 v16, 0, v16, s[0:1] -; GFX940-NEXT: v_cmp_class_f64_e64 vcc, v[10:11], 64 -; GFX940-NEXT: v_cmp_class_f64_e64 s[0:1], v[26:27], 64 -; GFX940-NEXT: v_cmp_eq_f64_e64 s[2:3], 0, v[16:17] -; GFX940-NEXT: v_cndmask_b32_e32 v10, v16, v10, vcc -; GFX940-NEXT: v_cndmask_b32_e64 v10, v10, v26, s[0:1] -; GFX940-NEXT: v_cndmask_b32_e32 v11, v17, v11, vcc -; GFX940-NEXT: v_cmp_gt_f64_e32 vcc, v[12:13], v[28:29] -; GFX940-NEXT: v_cndmask_b32_e64 v10, v16, v10, s[2:3] -; GFX940-NEXT: v_cndmask_b32_e64 v11, v11, v27, s[0:1] -; GFX940-NEXT: v_cndmask_b32_e32 v16, v29, v13, vcc -; GFX940-NEXT: v_cmp_o_f64_e64 s[0:1], v[12:13], v[28:29] -; GFX940-NEXT: v_cndmask_b32_e64 v11, v17, v11, s[2:3] -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e64 v17, v32, v16, s[0:1] -; GFX940-NEXT: v_cndmask_b32_e32 v16, v28, v12, vcc -; GFX940-NEXT: v_cndmask_b32_e64 v16, 0, v16, s[0:1] -; GFX940-NEXT: v_cmp_class_f64_e64 vcc, v[12:13], 64 -; GFX940-NEXT: v_cmp_class_f64_e64 s[0:1], v[28:29], 64 -; GFX940-NEXT: v_cmp_eq_f64_e64 s[2:3], 0, v[16:17] -; GFX940-NEXT: v_cndmask_b32_e32 v12, v16, v12, vcc -; GFX940-NEXT: v_cndmask_b32_e64 v12, v12, v28, s[0:1] -; GFX940-NEXT: v_cndmask_b32_e32 
v13, v17, v13, vcc +; GFX940-NEXT: v_mov_b32_e32 v54, 0x7ff80000 +; GFX940-NEXT: v_max_f64 v[32:33], v[0:1], v[16:17] +; GFX940-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[16:17] +; GFX940-NEXT: v_max_f64 v[34:35], v[2:3], v[18:19] +; GFX940-NEXT: v_max_f64 v[36:37], v[4:5], v[20:21] +; GFX940-NEXT: v_cndmask_b32_e64 v0, v32, 0, vcc +; GFX940-NEXT: v_cndmask_b32_e32 v1, v33, v54, vcc +; GFX940-NEXT: v_cmp_u_f64_e32 vcc, v[2:3], v[18:19] +; GFX940-NEXT: v_max_f64 v[38:39], v[6:7], v[22:23] +; GFX940-NEXT: v_max_f64 v[48:49], v[8:9], v[24:25] +; GFX940-NEXT: v_cndmask_b32_e64 v2, v34, 0, vcc +; GFX940-NEXT: v_cndmask_b32_e32 v3, v35, v54, vcc +; GFX940-NEXT: v_cmp_u_f64_e32 vcc, v[4:5], v[20:21] +; GFX940-NEXT: v_max_f64 v[50:51], v[10:11], v[26:27] +; GFX940-NEXT: v_max_f64 v[52:53], v[12:13], v[28:29] +; GFX940-NEXT: v_cndmask_b32_e64 v4, v36, 0, vcc +; GFX940-NEXT: v_cndmask_b32_e32 v5, v37, v54, vcc +; GFX940-NEXT: v_cmp_u_f64_e32 vcc, v[6:7], v[22:23] ; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_cmp_gt_f64_e32 vcc, v[14:15], v[30:31] -; GFX940-NEXT: v_cndmask_b32_e64 v12, v16, v12, s[2:3] -; GFX940-NEXT: v_cndmask_b32_e64 v13, v13, v29, s[0:1] -; GFX940-NEXT: v_cndmask_b32_e32 v16, v31, v15, vcc -; GFX940-NEXT: v_cmp_o_f64_e64 s[0:1], v[14:15], v[30:31] -; GFX940-NEXT: v_cndmask_b32_e64 v13, v17, v13, s[2:3] -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e64 v17, v32, v16, s[0:1] -; GFX940-NEXT: v_cndmask_b32_e32 v16, v30, v14, vcc -; GFX940-NEXT: v_cndmask_b32_e64 v16, 0, v16, s[0:1] -; GFX940-NEXT: v_cmp_class_f64_e64 vcc, v[14:15], 64 -; GFX940-NEXT: v_cmp_class_f64_e64 s[0:1], v[30:31], 64 -; GFX940-NEXT: v_cmp_eq_f64_e64 s[2:3], 0, v[16:17] -; GFX940-NEXT: v_cndmask_b32_e32 v14, v16, v14, vcc -; GFX940-NEXT: v_cndmask_b32_e32 v15, v17, v15, vcc -; GFX940-NEXT: v_cndmask_b32_e64 v14, v14, v30, s[0:1] -; GFX940-NEXT: v_cndmask_b32_e64 v15, v15, v31, s[0:1] -; GFX940-NEXT: v_cndmask_b32_e64 v14, v16, v14, s[2:3] -; GFX940-NEXT: v_cndmask_b32_e64 v15, 
v17, v15, s[2:3] +; GFX940-NEXT: v_max_f64 v[16:17], v[14:15], v[30:31] +; GFX940-NEXT: v_cndmask_b32_e64 v6, v38, 0, vcc +; GFX940-NEXT: v_cndmask_b32_e32 v7, v39, v54, vcc +; GFX940-NEXT: v_cmp_u_f64_e32 vcc, v[8:9], v[24:25] +; GFX940-NEXT: s_nop 1 +; GFX940-NEXT: v_cndmask_b32_e64 v8, v48, 0, vcc +; GFX940-NEXT: v_cndmask_b32_e32 v9, v49, v54, vcc +; GFX940-NEXT: v_cmp_u_f64_e32 vcc, v[10:11], v[26:27] +; GFX940-NEXT: s_nop 1 +; GFX940-NEXT: v_cndmask_b32_e64 v10, v50, 0, vcc +; GFX940-NEXT: v_cndmask_b32_e32 v11, v51, v54, vcc +; GFX940-NEXT: v_cmp_u_f64_e32 vcc, v[12:13], v[28:29] +; GFX940-NEXT: s_nop 1 +; GFX940-NEXT: v_cndmask_b32_e64 v12, v52, 0, vcc +; GFX940-NEXT: v_cndmask_b32_e32 v13, v53, v54, vcc +; GFX940-NEXT: v_cmp_u_f64_e32 vcc, v[14:15], v[30:31] +; GFX940-NEXT: s_nop 1 +; GFX940-NEXT: v_cndmask_b32_e64 v14, v16, 0, vcc +; GFX940-NEXT: v_cndmask_b32_e32 v15, v17, v54, vcc ; GFX940-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_maximum_v8f64: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: buffer_load_dword v31, off, s[0:3], s32 -; GFX10-NEXT: v_cmp_gt_f64_e32 vcc_lo, v[0:1], v[16:17] -; GFX10-NEXT: v_cmp_o_f64_e64 s4, v[0:1], v[16:17] -; GFX10-NEXT: v_cmp_gt_f64_e64 s9, v[6:7], v[22:23] -; GFX10-NEXT: v_cmp_gt_f64_e64 s10, v[8:9], v[24:25] -; GFX10-NEXT: v_cmp_gt_f64_e64 s11, v[10:11], v[26:27] -; GFX10-NEXT: v_cmp_gt_f64_e64 s12, v[12:13], v[28:29] -; GFX10-NEXT: v_cmp_o_f64_e64 s13, v[6:7], v[22:23] -; GFX10-NEXT: v_cmp_o_f64_e64 s14, v[8:9], v[24:25] -; GFX10-NEXT: v_cmp_o_f64_e64 s15, v[10:11], v[26:27] -; GFX10-NEXT: v_cmp_o_f64_e64 s16, v[12:13], v[28:29] -; GFX10-NEXT: v_cmp_gt_f64_e64 s5, v[2:3], v[18:19] -; GFX10-NEXT: v_cmp_o_f64_e64 s6, v[2:3], v[18:19] -; GFX10-NEXT: v_cmp_gt_f64_e64 s7, v[4:5], v[20:21] -; GFX10-NEXT: v_cmp_o_f64_e64 s8, v[4:5], v[20:21] -; GFX10-NEXT: v_cmp_class_f64_e64 s17, v[26:27], 64 -; GFX10-NEXT: v_cmp_class_f64_e64 s18, v[28:29], 64 -; GFX10-NEXT: v_cndmask_b32_e32 v32, 
v17, v1, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v38, v23, v7, s9 -; GFX10-NEXT: v_cndmask_b32_e64 v48, v25, v9, s10 -; GFX10-NEXT: v_cndmask_b32_e64 v50, v27, v11, s11 -; GFX10-NEXT: v_cndmask_b32_e64 v33, 0x7ff80000, v32, s4 -; GFX10-NEXT: v_cndmask_b32_e32 v32, v16, v0, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v52, v29, v13, s12 -; GFX10-NEXT: v_cmp_class_f64_e64 vcc_lo, v[0:1], 64 -; GFX10-NEXT: v_cndmask_b32_e64 v39, 0x7ff80000, v38, s13 -; GFX10-NEXT: v_cndmask_b32_e64 v49, 0x7ff80000, v48, s14 -; GFX10-NEXT: v_cndmask_b32_e64 v32, 0, v32, s4 -; GFX10-NEXT: v_cmp_class_f64_e64 s4, v[2:3], 64 -; GFX10-NEXT: v_cndmask_b32_e64 v51, 0x7ff80000, v50, s15 -; GFX10-NEXT: v_cndmask_b32_e64 v53, 0x7ff80000, v52, s16 -; GFX10-NEXT: v_cndmask_b32_e64 v38, v22, v6, s9 -; GFX10-NEXT: v_cndmask_b32_e64 v48, v24, v8, s10 -; GFX10-NEXT: v_cndmask_b32_e64 v50, v26, v10, s11 -; GFX10-NEXT: v_cndmask_b32_e64 v52, v28, v12, s12 -; GFX10-NEXT: v_cmp_class_f64_e64 s11, v[16:17], 64 -; GFX10-NEXT: v_cmp_class_f64_e64 s12, v[18:19], 64 -; GFX10-NEXT: v_cndmask_b32_e64 v34, v19, v3, s5 -; GFX10-NEXT: v_cndmask_b32_e64 v38, 0, v38, s13 -; GFX10-NEXT: v_cndmask_b32_e64 v36, v21, v5, s7 -; GFX10-NEXT: v_cmp_class_f64_e64 s9, v[12:13], 64 -; GFX10-NEXT: v_cndmask_b32_e64 v48, 0, v48, s14 -; GFX10-NEXT: v_cndmask_b32_e64 v35, 0x7ff80000, v34, s6 -; GFX10-NEXT: v_cndmask_b32_e64 v34, v18, v2, s5 -; GFX10-NEXT: v_cndmask_b32_e64 v37, 0x7ff80000, v36, s8 -; GFX10-NEXT: v_cndmask_b32_e64 v36, v20, v4, s7 -; GFX10-NEXT: v_cndmask_b32_e32 v0, v32, v0, vcc_lo -; GFX10-NEXT: v_cmp_class_f64_e64 s5, v[4:5], 64 -; GFX10-NEXT: v_cndmask_b32_e64 v34, 0, v34, s6 -; GFX10-NEXT: v_cmp_class_f64_e64 s6, v[6:7], 64 -; GFX10-NEXT: v_cndmask_b32_e64 v36, 0, v36, s8 -; GFX10-NEXT: v_cmp_class_f64_e64 s7, v[8:9], 64 -; GFX10-NEXT: v_cmp_class_f64_e64 s8, v[10:11], 64 -; GFX10-NEXT: v_cndmask_b32_e64 v2, v34, v2, s4 -; GFX10-NEXT: v_cndmask_b32_e64 v50, 0, v50, s15 -; GFX10-NEXT: v_cndmask_b32_e64 v52, 0, v52, 
s16 -; GFX10-NEXT: v_cmp_class_f64_e64 s14, v[20:21], 64 -; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, v16, s11 -; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, v18, s12 -; GFX10-NEXT: v_cmp_class_f64_e64 s15, v[22:23], 64 -; GFX10-NEXT: v_cmp_class_f64_e64 s16, v[24:25], 64 -; GFX10-NEXT: v_cmp_eq_f64_e64 s19, 0, v[32:33] -; GFX10-NEXT: v_cmp_eq_f64_e64 s20, 0, v[34:35] -; GFX10-NEXT: v_cmp_eq_f64_e64 s21, 0, v[36:37] -; GFX10-NEXT: v_cmp_eq_f64_e64 s22, 0, v[48:49] -; GFX10-NEXT: v_cmp_eq_f64_e64 s23, 0, v[50:51] -; GFX10-NEXT: v_cmp_eq_f64_e64 s24, 0, v[52:53] -; GFX10-NEXT: v_cndmask_b32_e32 v1, v33, v1, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v12, v52, v12, s9 -; GFX10-NEXT: v_cndmask_b32_e64 v4, v36, v4, s5 -; GFX10-NEXT: v_cndmask_b32_e64 v3, v35, v3, s4 -; GFX10-NEXT: v_cndmask_b32_e64 v6, v38, v6, s6 -; GFX10-NEXT: v_cndmask_b32_e64 v5, v37, v5, s5 -; GFX10-NEXT: v_cndmask_b32_e64 v8, v48, v8, s7 -; GFX10-NEXT: v_cndmask_b32_e64 v10, v50, v10, s8 -; GFX10-NEXT: v_cndmask_b32_e64 v7, v39, v7, s6 -; GFX10-NEXT: v_cndmask_b32_e64 v9, v49, v9, s7 -; GFX10-NEXT: v_cndmask_b32_e64 v11, v51, v11, s8 -; GFX10-NEXT: v_cndmask_b32_e64 v13, v53, v13, s9 -; GFX10-NEXT: v_cndmask_b32_e64 v4, v4, v20, s14 -; GFX10-NEXT: v_cndmask_b32_e64 v10, v10, v26, s17 -; GFX10-NEXT: v_cndmask_b32_e64 v6, v6, v22, s15 -; GFX10-NEXT: v_cndmask_b32_e64 v8, v8, v24, s16 -; GFX10-NEXT: v_cndmask_b32_e64 v12, v12, v28, s18 -; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, v17, s11 -; GFX10-NEXT: v_cndmask_b32_e64 v3, v3, v19, s12 -; GFX10-NEXT: v_cndmask_b32_e64 v5, v5, v21, s14 -; GFX10-NEXT: v_cndmask_b32_e64 v7, v7, v23, s15 -; GFX10-NEXT: v_cndmask_b32_e64 v9, v9, v25, s16 -; GFX10-NEXT: v_cndmask_b32_e64 v11, v11, v27, s17 -; GFX10-NEXT: v_cndmask_b32_e64 v13, v13, v29, s18 -; GFX10-NEXT: v_cndmask_b32_e64 v0, v32, v0, s19 -; GFX10-NEXT: v_cndmask_b32_e64 v2, v34, v2, s20 -; GFX10-NEXT: v_cndmask_b32_e64 v4, v36, v4, s21 -; GFX10-NEXT: v_cndmask_b32_e64 v8, v48, v8, s22 -; GFX10-NEXT: v_cndmask_b32_e64 
v10, v50, v10, s23 -; GFX10-NEXT: v_cndmask_b32_e64 v12, v52, v12, s24 -; GFX10-NEXT: v_cndmask_b32_e64 v1, v33, v1, s19 -; GFX10-NEXT: v_cndmask_b32_e64 v3, v35, v3, s20 -; GFX10-NEXT: v_cndmask_b32_e64 v5, v37, v5, s21 -; GFX10-NEXT: v_cndmask_b32_e64 v9, v49, v9, s22 -; GFX10-NEXT: v_cndmask_b32_e64 v11, v51, v11, s23 -; GFX10-NEXT: v_cndmask_b32_e64 v13, v53, v13, s24 +; GFX10-NEXT: v_max_f64 v[32:33], v[0:1], v[16:17] +; GFX10-NEXT: v_cmp_u_f64_e32 vcc_lo, v[0:1], v[16:17] +; GFX10-NEXT: v_max_f64 v[16:17], v[2:3], v[18:19] +; GFX10-NEXT: v_cmp_u_f64_e64 s4, v[2:3], v[18:19] +; GFX10-NEXT: v_max_f64 v[18:19], v[4:5], v[20:21] +; GFX10-NEXT: v_cmp_u_f64_e64 s5, v[4:5], v[20:21] +; GFX10-NEXT: v_max_f64 v[20:21], v[6:7], v[22:23] +; GFX10-NEXT: v_cmp_u_f64_e64 s6, v[6:7], v[22:23] +; GFX10-NEXT: v_max_f64 v[22:23], v[8:9], v[24:25] +; GFX10-NEXT: v_cmp_u_f64_e64 s7, v[8:9], v[24:25] +; GFX10-NEXT: v_max_f64 v[24:25], v[10:11], v[26:27] +; GFX10-NEXT: v_cmp_u_f64_e64 s8, v[10:11], v[26:27] +; GFX10-NEXT: v_max_f64 v[26:27], v[12:13], v[28:29] +; GFX10-NEXT: v_cmp_u_f64_e64 s9, v[12:13], v[28:29] +; GFX10-NEXT: v_cndmask_b32_e64 v0, v32, 0, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v1, v33, 0x7ff80000, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v2, v16, 0, s4 +; GFX10-NEXT: v_cndmask_b32_e64 v3, v17, 0x7ff80000, s4 +; GFX10-NEXT: v_cndmask_b32_e64 v4, v18, 0, s5 +; GFX10-NEXT: v_cndmask_b32_e64 v5, v19, 0x7ff80000, s5 +; GFX10-NEXT: v_cndmask_b32_e64 v6, v20, 0, s6 +; GFX10-NEXT: v_cndmask_b32_e64 v7, v21, 0x7ff80000, s6 +; GFX10-NEXT: v_cndmask_b32_e64 v8, v22, 0, s7 +; GFX10-NEXT: v_cndmask_b32_e64 v9, v23, 0x7ff80000, s7 +; GFX10-NEXT: v_cndmask_b32_e64 v10, v24, 0, s8 +; GFX10-NEXT: v_cndmask_b32_e64 v11, v25, 0x7ff80000, s8 +; GFX10-NEXT: v_cndmask_b32_e64 v12, v26, 0, s9 +; GFX10-NEXT: v_cndmask_b32_e64 v13, v27, 0x7ff80000, s9 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_cmp_gt_f64_e64 s10, v[14:15], v[30:31] -; GFX10-NEXT: v_cmp_o_f64_e64 s13, v[14:15], 
v[30:31] -; GFX10-NEXT: v_cmp_class_f64_e64 s25, v[30:31], 64 -; GFX10-NEXT: v_cndmask_b32_e64 v16, v31, v15, s10 -; GFX10-NEXT: v_cndmask_b32_e64 v18, v30, v14, s10 -; GFX10-NEXT: v_cmp_eq_f64_e64 s10, 0, v[38:39] -; GFX10-NEXT: v_cndmask_b32_e64 v55, 0x7ff80000, v16, s13 -; GFX10-NEXT: v_cndmask_b32_e64 v54, 0, v18, s13 -; GFX10-NEXT: v_cmp_class_f64_e64 s13, v[14:15], 64 -; GFX10-NEXT: v_cmp_eq_f64_e32 vcc_lo, 0, v[54:55] -; GFX10-NEXT: v_cndmask_b32_e64 v6, v38, v6, s10 -; GFX10-NEXT: v_cndmask_b32_e64 v7, v39, v7, s10 -; GFX10-NEXT: v_cndmask_b32_e64 v14, v54, v14, s13 -; GFX10-NEXT: v_cndmask_b32_e64 v15, v55, v15, s13 -; GFX10-NEXT: v_cndmask_b32_e64 v14, v14, v30, s25 -; GFX10-NEXT: v_cndmask_b32_e64 v15, v15, v31, s25 -; GFX10-NEXT: v_cndmask_b32_e32 v14, v54, v14, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e32 v15, v55, v15, vcc_lo +; GFX10-NEXT: v_max_f64 v[28:29], v[14:15], v[30:31] +; GFX10-NEXT: v_cmp_u_f64_e64 s10, v[14:15], v[30:31] +; GFX10-NEXT: v_cndmask_b32_e64 v14, v28, 0, s10 +; GFX10-NEXT: v_cndmask_b32_e64 v15, v29, 0x7ff80000, s10 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: v_maximum_v8f64: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: scratch_load_b32 v31, off, s32 -; GFX11-NEXT: v_cmp_gt_f64_e64 s4, v[6:7], v[22:23] -; GFX11-NEXT: v_cmp_o_f64_e64 s9, v[6:7], v[22:23] -; GFX11-NEXT: v_cmp_gt_f64_e64 s1, v[2:3], v[18:19] -; GFX11-NEXT: v_cmp_gt_f64_e64 s6, v[10:11], v[26:27] -; GFX11-NEXT: v_cmp_gt_f64_e32 vcc_lo, v[0:1], v[16:17] -; GFX11-NEXT: v_cmp_o_f64_e64 s2, v[2:3], v[18:19] -; GFX11-NEXT: v_cmp_o_f64_e64 s11, v[10:11], v[26:27] -; GFX11-NEXT: v_cmp_o_f64_e64 s0, v[0:1], v[16:17] -; GFX11-NEXT: v_cmp_gt_f64_e64 s3, v[4:5], v[20:21] -; GFX11-NEXT: v_cmp_gt_f64_e64 s5, v[8:9], v[24:25] -; GFX11-NEXT: v_cmp_gt_f64_e64 s7, v[12:13], v[28:29] -; GFX11-NEXT: v_cmp_o_f64_e64 s8, v[4:5], v[20:21] -; GFX11-NEXT: v_cmp_o_f64_e64 s10, v[8:9], v[24:25] -; GFX11-NEXT: v_cmp_o_f64_e64 s12, v[12:13], 
v[28:29] -; GFX11-NEXT: v_cmp_class_f64_e64 s13, v[18:19], 64 -; GFX11-NEXT: v_cmp_class_f64_e64 s15, v[20:21], 64 -; GFX11-NEXT: v_cndmask_b32_e64 v38, v23, v7, s4 -; GFX11-NEXT: v_cndmask_b32_e64 v34, v19, v3, s1 -; GFX11-NEXT: v_cndmask_b32_e64 v50, v27, v11, s6 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) -; GFX11-NEXT: v_cndmask_b32_e64 v39, 0x7ff80000, v38, s9 -; GFX11-NEXT: v_cndmask_b32_e64 v38, v22, v6, s4 -; GFX11-NEXT: v_cmp_class_f64_e64 s4, v[6:7], 64 -; GFX11-NEXT: v_cndmask_b32_e64 v35, 0x7ff80000, v34, s2 -; GFX11-NEXT: v_cndmask_b32_e64 v51, 0x7ff80000, v50, s11 -; GFX11-NEXT: v_cndmask_b32_e64 v34, v18, v2, s1 -; GFX11-NEXT: v_cndmask_b32_e64 v50, v26, v10, s6 -; GFX11-NEXT: v_cmp_class_f64_e64 s1, v[0:1], 64 -; GFX11-NEXT: v_cndmask_b32_e64 v36, v21, v5, s3 -; GFX11-NEXT: v_cndmask_b32_e64 v48, v25, v9, s5 -; GFX11-NEXT: v_cndmask_b32_e64 v52, v29, v13, s7 -; GFX11-NEXT: v_cndmask_b32_e64 v50, 0, v50, s11 -; GFX11-NEXT: v_cmp_class_f64_e64 s11, v[16:17], 64 -; GFX11-NEXT: v_cndmask_b32_e64 v37, 0x7ff80000, v36, s8 -; GFX11-NEXT: v_cndmask_b32_e64 v49, 0x7ff80000, v48, s10 -; GFX11-NEXT: v_cndmask_b32_e64 v53, 0x7ff80000, v52, s12 -; GFX11-NEXT: v_cndmask_b32_e64 v36, v20, v4, s3 -; GFX11-NEXT: v_cndmask_b32_e64 v48, v24, v8, s5 -; GFX11-NEXT: v_cndmask_b32_e64 v52, v28, v12, s7 -; GFX11-NEXT: v_cndmask_b32_e64 v34, 0, v34, s2 -; GFX11-NEXT: v_cndmask_b32_e64 v38, 0, v38, s9 -; GFX11-NEXT: v_cmp_class_f64_e64 s2, v[2:3], 64 -; GFX11-NEXT: v_cmp_class_f64_e64 s3, v[4:5], 64 -; GFX11-NEXT: v_cmp_class_f64_e64 s5, v[8:9], 64 -; GFX11-NEXT: v_cmp_class_f64_e64 s7, v[10:11], 64 -; GFX11-NEXT: v_cmp_class_f64_e64 s9, v[12:13], 64 -; GFX11-NEXT: v_cndmask_b32_e64 v36, 0, v36, s8 -; GFX11-NEXT: v_cndmask_b32_e64 v48, 0, v48, s10 -; GFX11-NEXT: v_cndmask_b32_e64 v52, 0, v52, s12 -; GFX11-NEXT: v_cmp_class_f64_e64 s6, v[24:25], 64 -; GFX11-NEXT: v_cmp_class_f64_e64 s8, v[26:27], 64 -; GFX11-NEXT: v_cmp_class_f64_e64 s10, v[28:29], 64 -; GFX11-NEXT: 
v_cmp_eq_f64_e64 s14, 0, v[34:35] -; GFX11-NEXT: v_cmp_eq_f64_e64 s16, 0, v[36:37] -; GFX11-NEXT: v_cmp_eq_f64_e64 s17, 0, v[38:39] -; GFX11-NEXT: v_cmp_eq_f64_e64 s18, 0, v[48:49] -; GFX11-NEXT: v_cmp_eq_f64_e64 s20, 0, v[50:51] -; GFX11-NEXT: v_cmp_eq_f64_e64 s21, 0, v[52:53] -; GFX11-NEXT: v_cndmask_b32_e64 v7, v39, v7, s4 -; GFX11-NEXT: v_cndmask_b32_e32 v32, v17, v1, vcc_lo -; GFX11-NEXT: v_cndmask_b32_e64 v6, v38, v6, s4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cndmask_b32_e64 v33, 0x7ff80000, v32, s0 -; GFX11-NEXT: v_cndmask_b32_e32 v32, v16, v0, vcc_lo -; GFX11-NEXT: v_cndmask_b32_e64 v1, v33, v1, s1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cndmask_b32_e64 v32, 0, v32, s0 -; GFX11-NEXT: v_cndmask_b32_e64 v1, v1, v17, s11 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-NEXT: v_cndmask_b32_e64 v0, v32, v0, s1 -; GFX11-NEXT: v_cmp_eq_f64_e64 s12, 0, v[32:33] -; GFX11-NEXT: v_cndmask_b32_e64 v2, v34, v2, s2 -; GFX11-NEXT: v_cndmask_b32_e64 v4, v36, v4, s3 -; GFX11-NEXT: v_cndmask_b32_e64 v8, v48, v8, s5 -; GFX11-NEXT: v_cndmask_b32_e64 v0, v0, v16, s11 -; GFX11-NEXT: v_cndmask_b32_e64 v10, v50, v10, s7 -; GFX11-NEXT: v_cndmask_b32_e64 v12, v52, v12, s9 -; GFX11-NEXT: v_cndmask_b32_e64 v3, v35, v3, s2 -; GFX11-NEXT: v_cndmask_b32_e64 v5, v37, v5, s3 -; GFX11-NEXT: v_cndmask_b32_e64 v9, v49, v9, s5 -; GFX11-NEXT: v_cndmask_b32_e64 v11, v51, v11, s7 -; GFX11-NEXT: v_cndmask_b32_e64 v13, v53, v13, s9 -; GFX11-NEXT: v_cndmask_b32_e64 v2, v2, v18, s13 -; GFX11-NEXT: v_cndmask_b32_e64 v4, v4, v20, s15 -; GFX11-NEXT: v_cndmask_b32_e64 v8, v8, v24, s6 -; GFX11-NEXT: v_cndmask_b32_e64 v10, v10, v26, s8 -; GFX11-NEXT: v_cndmask_b32_e64 v12, v12, v28, s10 -; GFX11-NEXT: v_cndmask_b32_e64 v3, v3, v19, s13 -; GFX11-NEXT: v_cndmask_b32_e64 v5, v5, v21, s15 -; GFX11-NEXT: v_cndmask_b32_e64 v9, v9, v25, s6 -; GFX11-NEXT: v_cndmask_b32_e64 v11, 
v11, v27, s8 -; GFX11-NEXT: v_cndmask_b32_e64 v13, v13, v29, s10 -; GFX11-NEXT: v_cndmask_b32_e64 v2, v34, v2, s14 -; GFX11-NEXT: v_cndmask_b32_e64 v4, v36, v4, s16 -; GFX11-NEXT: v_cndmask_b32_e64 v8, v48, v8, s18 -; GFX11-NEXT: v_cndmask_b32_e64 v10, v50, v10, s20 -; GFX11-NEXT: v_cndmask_b32_e64 v12, v52, v12, s21 -; GFX11-NEXT: v_cndmask_b32_e64 v3, v35, v3, s14 -; GFX11-NEXT: v_cndmask_b32_e64 v5, v37, v5, s16 -; GFX11-NEXT: v_cndmask_b32_e64 v9, v49, v9, s18 -; GFX11-NEXT: v_cndmask_b32_e64 v11, v51, v11, s20 -; GFX11-NEXT: v_cndmask_b32_e64 v13, v53, v13, s21 -; GFX11-NEXT: v_cndmask_b32_e64 v0, v32, v0, s12 -; GFX11-NEXT: v_cndmask_b32_e64 v1, v33, v1, s12 +; GFX11-NEXT: v_max_f64 v[32:33], v[0:1], v[16:17] +; GFX11-NEXT: v_cmp_u_f64_e32 vcc_lo, v[0:1], v[16:17] +; GFX11-NEXT: v_max_f64 v[16:17], v[2:3], v[18:19] +; GFX11-NEXT: v_cmp_u_f64_e64 s0, v[2:3], v[18:19] +; GFX11-NEXT: v_max_f64 v[18:19], v[4:5], v[20:21] +; GFX11-NEXT: v_cmp_u_f64_e64 s1, v[4:5], v[20:21] +; GFX11-NEXT: v_max_f64 v[20:21], v[6:7], v[22:23] +; GFX11-NEXT: v_cmp_u_f64_e64 s2, v[6:7], v[22:23] +; GFX11-NEXT: v_max_f64 v[22:23], v[8:9], v[24:25] +; GFX11-NEXT: v_cmp_u_f64_e64 s3, v[8:9], v[24:25] +; GFX11-NEXT: v_max_f64 v[24:25], v[10:11], v[26:27] +; GFX11-NEXT: v_cmp_u_f64_e64 s4, v[10:11], v[26:27] +; GFX11-NEXT: v_max_f64 v[26:27], v[12:13], v[28:29] +; GFX11-NEXT: v_cmp_u_f64_e64 s5, v[12:13], v[28:29] +; GFX11-NEXT: v_cndmask_b32_e64 v0, v32, 0, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e64 v1, v33, 0x7ff80000, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e64 v2, v16, 0, s0 +; GFX11-NEXT: v_cndmask_b32_e64 v3, v17, 0x7ff80000, s0 +; GFX11-NEXT: v_cndmask_b32_e64 v4, v18, 0, s1 +; GFX11-NEXT: v_cndmask_b32_e64 v5, v19, 0x7ff80000, s1 +; GFX11-NEXT: v_cndmask_b32_e64 v6, v20, 0, s2 +; GFX11-NEXT: v_cndmask_b32_e64 v7, v21, 0x7ff80000, s2 +; GFX11-NEXT: v_cndmask_b32_e64 v8, v22, 0, s3 +; GFX11-NEXT: v_cndmask_b32_e64 v9, v23, 0x7ff80000, s3 +; GFX11-NEXT: v_cndmask_b32_e64 v10, v24, 0, s4 +; 
GFX11-NEXT: v_cndmask_b32_e64 v11, v25, 0x7ff80000, s4 +; GFX11-NEXT: v_cndmask_b32_e64 v12, v26, 0, s5 +; GFX11-NEXT: v_cndmask_b32_e64 v13, v27, 0x7ff80000, s5 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_cmp_gt_f64_e32 vcc_lo, v[14:15], v[30:31] -; GFX11-NEXT: v_cmp_o_f64_e64 s0, v[14:15], v[30:31] -; GFX11-NEXT: v_cmp_class_f64_e64 s19, v[30:31], 64 -; GFX11-NEXT: v_cndmask_b32_e32 v54, v31, v15, vcc_lo -; GFX11-NEXT: v_cndmask_b32_e32 v16, v30, v14, vcc_lo -; GFX11-NEXT: v_cmp_class_f64_e64 vcc_lo, v[22:23], 64 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_cndmask_b32_e64 v55, 0x7ff80000, v54, s0 -; GFX11-NEXT: v_cndmask_b32_e64 v54, 0, v16, s0 -; GFX11-NEXT: v_cmp_class_f64_e64 s0, v[14:15], 64 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_cmp_eq_f64_e64 s22, 0, v[54:55] -; GFX11-NEXT: v_dual_cndmask_b32 v7, v7, v23 :: v_dual_cndmask_b32 v6, v6, v22 -; GFX11-NEXT: v_cndmask_b32_e64 v14, v54, v14, s0 -; GFX11-NEXT: v_cndmask_b32_e64 v15, v55, v15, s0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_cndmask_b32_e64 v6, v38, v6, s17 -; GFX11-NEXT: v_cndmask_b32_e64 v7, v39, v7, s17 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_cndmask_b32_e64 v14, v14, v30, s19 -; GFX11-NEXT: v_cndmask_b32_e64 v15, v15, v31, s19 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cndmask_b32_e64 v14, v54, v14, s22 -; GFX11-NEXT: v_cndmask_b32_e64 v15, v55, v15, s22 +; GFX11-NEXT: v_max_f64 v[28:29], v[14:15], v[30:31] +; GFX11-NEXT: v_cmp_u_f64_e64 s6, v[14:15], v[30:31] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_cndmask_b32_e64 v14, v28, 0, s6 +; GFX11-NEXT: v_cndmask_b32_e64 v15, v29, 0x7ff80000, s6 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; 
GFX12-LABEL: v_maximum_v8f64: @@ -4279,1799 +2067,798 @@ define <16 x double> @v_maximum_v16f64(<16 x double> %src0, <16 x double> %src1) ; GFX7-LABEL: v_maximum_v16f64: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GFX7-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; GFX7-NEXT: s_mov_b64 exec, s[4:5] ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:8 ; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:4 -; GFX7-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:16 -; GFX7-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:12 -; GFX7-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:24 -; GFX7-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:20 -; GFX7-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:32 -; GFX7-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:28 -; GFX7-NEXT: v_mov_b32_e32 v39, 0x7ff80000 -; GFX7-NEXT: s_waitcnt vmcnt(6) -; GFX7-NEXT: v_cmp_gt_f64_e32 vcc, v[0:1], v[31:32] -; GFX7-NEXT: v_cmp_o_f64_e64 s[4:5], v[0:1], v[31:32] -; GFX7-NEXT: s_waitcnt vmcnt(4) -; GFX7-NEXT: v_cmp_o_f64_e64 s[6:7], v[2:3], v[33:34] -; GFX7-NEXT: v_cndmask_b32_e32 v48, v32, v1, vcc -; GFX7-NEXT: v_cndmask_b32_e64 v49, v39, v48, s[4:5] -; GFX7-NEXT: v_cndmask_b32_e32 v48, v31, v0, vcc -; GFX7-NEXT: v_cmp_class_f64_e64 vcc, v[0:1], 64 -; GFX7-NEXT: v_cndmask_b32_e64 v48, 0, v48, s[4:5] -; GFX7-NEXT: v_cmp_class_f64_e64 s[4:5], v[31:32], 64 -; GFX7-NEXT: v_cndmask_b32_e32 v0, v48, v0, vcc -; GFX7-NEXT: v_cndmask_b32_e32 v1, v49, v1, vcc -; GFX7-NEXT: v_cmp_gt_f64_e32 vcc, v[2:3], v[33:34] -; GFX7-NEXT: v_cndmask_b32_e64 v0, v0, v31, s[4:5] -; GFX7-NEXT: v_cndmask_b32_e64 v1, v1, v32, s[4:5] -; GFX7-NEXT: v_cmp_class_f64_e64 s[4:5], v[33:34], 64 -; GFX7-NEXT: v_cndmask_b32_e32 v50, v34, v3, vcc -; GFX7-NEXT: v_cndmask_b32_e32 v31, v33, v2, vcc -; GFX7-NEXT: v_cmp_class_f64_e64 vcc, v[2:3], 64 -; GFX7-NEXT: v_cndmask_b32_e64 v32, 
v39, v50, s[6:7] -; GFX7-NEXT: v_cndmask_b32_e64 v31, 0, v31, s[6:7] -; GFX7-NEXT: s_waitcnt vmcnt(2) -; GFX7-NEXT: v_cmp_o_f64_e64 s[6:7], v[4:5], v[35:36] -; GFX7-NEXT: v_cndmask_b32_e32 v2, v31, v2, vcc -; GFX7-NEXT: v_cndmask_b32_e32 v3, v32, v3, vcc -; GFX7-NEXT: v_cmp_gt_f64_e32 vcc, v[4:5], v[35:36] -; GFX7-NEXT: v_cndmask_b32_e64 v2, v2, v33, s[4:5] -; GFX7-NEXT: v_cndmask_b32_e64 v3, v3, v34, s[4:5] -; GFX7-NEXT: v_cmp_class_f64_e64 s[4:5], v[35:36], 64 -; GFX7-NEXT: v_cndmask_b32_e32 v50, v36, v5, vcc -; GFX7-NEXT: v_cndmask_b32_e32 v33, v35, v4, vcc -; GFX7-NEXT: v_cmp_class_f64_e64 vcc, v[4:5], 64 -; GFX7-NEXT: v_cndmask_b32_e64 v34, v39, v50, s[6:7] -; GFX7-NEXT: v_cndmask_b32_e64 v33, 0, v33, s[6:7] +; GFX7-NEXT: v_writelane_b32 v34, s30, 0 +; GFX7-NEXT: v_writelane_b32 v34, s31, 1 +; GFX7-NEXT: v_writelane_b32 v34, s34, 2 +; GFX7-NEXT: v_writelane_b32 v34, s35, 3 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_cmp_o_f64_e64 s[6:7], v[6:7], v[37:38] -; GFX7-NEXT: v_cndmask_b32_e32 v4, v33, v4, vcc -; GFX7-NEXT: v_cndmask_b32_e32 v5, v34, v5, vcc -; GFX7-NEXT: v_cmp_gt_f64_e32 vcc, v[6:7], v[37:38] -; GFX7-NEXT: v_cndmask_b32_e64 v4, v4, v35, s[4:5] -; GFX7-NEXT: v_cndmask_b32_e64 v5, v5, v36, s[4:5] -; GFX7-NEXT: v_cmp_class_f64_e64 s[4:5], v[37:38], 64 -; GFX7-NEXT: v_cndmask_b32_e32 v50, v38, v7, vcc -; GFX7-NEXT: v_cndmask_b32_e32 v35, v37, v6, vcc -; GFX7-NEXT: v_cmp_class_f64_e64 vcc, v[6:7], 64 -; GFX7-NEXT: v_cndmask_b32_e64 v36, v39, v50, s[6:7] -; GFX7-NEXT: v_cndmask_b32_e64 v35, 0, v35, s[6:7] -; GFX7-NEXT: v_cndmask_b32_e32 v6, v35, v6, vcc -; GFX7-NEXT: v_cndmask_b32_e32 v7, v36, v7, vcc -; GFX7-NEXT: v_cndmask_b32_e64 v6, v6, v37, s[4:5] -; GFX7-NEXT: v_cndmask_b32_e64 v7, v7, v38, s[4:5] -; GFX7-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:36 -; GFX7-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:40 -; GFX7-NEXT: v_cmp_eq_f64_e32 vcc, 0, v[48:49] -; GFX7-NEXT: v_cndmask_b32_e32 v0, v48, v0, vcc -; GFX7-NEXT: 
v_cndmask_b32_e32 v1, v49, v1, vcc -; GFX7-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:48 -; GFX7-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:44 -; GFX7-NEXT: v_cmp_eq_f64_e32 vcc, 0, v[31:32] -; GFX7-NEXT: v_cndmask_b32_e32 v2, v31, v2, vcc -; GFX7-NEXT: v_cndmask_b32_e32 v3, v32, v3, vcc +; GFX7-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[31:32] +; GFX7-NEXT: v_max_f64 v[0:1], v[0:1], v[31:32] +; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:16 +; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:12 +; GFX7-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_cmp_u_f64_e64 s[4:5], v[2:3], v[31:32] +; GFX7-NEXT: v_max_f64 v[2:3], v[2:3], v[31:32] +; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:24 +; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:20 +; GFX7-NEXT: v_cndmask_b32_e64 v2, v2, 0, s[4:5] +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_cmp_u_f64_e64 s[6:7], v[4:5], v[31:32] +; GFX7-NEXT: v_max_f64 v[4:5], v[4:5], v[31:32] +; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:32 +; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:28 +; GFX7-NEXT: v_cndmask_b32_e64 v4, v4, 0, s[6:7] +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_cmp_u_f64_e64 s[8:9], v[6:7], v[31:32] +; GFX7-NEXT: v_max_f64 v[6:7], v[6:7], v[31:32] +; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:36 +; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:40 +; GFX7-NEXT: v_cndmask_b32_e64 v6, v6, 0, s[8:9] +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_cmp_u_f64_e64 s[10:11], v[8:9], v[31:32] +; GFX7-NEXT: v_max_f64 v[8:9], v[8:9], v[31:32] +; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:48 +; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:44 +; GFX7-NEXT: v_cndmask_b32_e64 v8, v8, 0, s[10:11] +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_cmp_u_f64_e64 s[12:13], v[10:11], v[31:32] +; GFX7-NEXT: v_max_f64 v[10:11], v[10:11], v[31:32] ; 
GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:56 ; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:52 -; GFX7-NEXT: v_cmp_eq_f64_e32 vcc, 0, v[33:34] -; GFX7-NEXT: v_cndmask_b32_e32 v4, v33, v4, vcc -; GFX7-NEXT: v_cndmask_b32_e32 v5, v34, v5, vcc -; GFX7-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:64 -; GFX7-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:60 -; GFX7-NEXT: v_cmp_eq_f64_e32 vcc, 0, v[35:36] -; GFX7-NEXT: v_cndmask_b32_e32 v6, v35, v6, vcc -; GFX7-NEXT: v_cndmask_b32_e32 v7, v36, v7, vcc -; GFX7-NEXT: v_cmp_class_f64_e64 vcc, v[8:9], 64 -; GFX7-NEXT: s_waitcnt vmcnt(6) -; GFX7-NEXT: v_cmp_gt_f64_e64 s[4:5], v[8:9], v[37:38] -; GFX7-NEXT: v_cmp_o_f64_e64 s[6:7], v[8:9], v[37:38] -; GFX7-NEXT: v_cndmask_b32_e64 v50, v38, v9, s[4:5] -; GFX7-NEXT: v_cndmask_b32_e64 v35, v37, v8, s[4:5] -; GFX7-NEXT: v_cndmask_b32_e64 v36, v39, v50, s[6:7] -; GFX7-NEXT: v_cndmask_b32_e64 v35, 0, v35, s[6:7] -; GFX7-NEXT: v_cmp_class_f64_e64 s[4:5], v[37:38], 64 -; GFX7-NEXT: v_cndmask_b32_e32 v8, v35, v8, vcc -; GFX7-NEXT: v_cmp_eq_f64_e64 s[6:7], 0, v[35:36] -; GFX7-NEXT: v_cndmask_b32_e32 v9, v36, v9, vcc -; GFX7-NEXT: s_waitcnt vmcnt(4) -; GFX7-NEXT: v_cmp_gt_f64_e32 vcc, v[10:11], v[48:49] -; GFX7-NEXT: v_cmp_o_f64_e64 s[8:9], v[10:11], v[48:49] -; GFX7-NEXT: v_cndmask_b32_e64 v8, v8, v37, s[4:5] -; GFX7-NEXT: v_cndmask_b32_e64 v9, v9, v38, s[4:5] -; GFX7-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:68 -; GFX7-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:72 -; GFX7-NEXT: v_cndmask_b32_e64 v8, v35, v8, s[6:7] -; GFX7-NEXT: v_cndmask_b32_e32 v50, v49, v11, vcc -; GFX7-NEXT: v_cndmask_b32_e32 v35, v48, v10, vcc -; GFX7-NEXT: v_cmp_class_f64_e64 vcc, v[10:11], 64 -; GFX7-NEXT: v_cmp_class_f64_e64 s[4:5], v[48:49], 64 -; GFX7-NEXT: v_cndmask_b32_e64 v51, v39, v50, s[8:9] -; GFX7-NEXT: v_cndmask_b32_e64 v50, 0, v35, s[8:9] -; GFX7-NEXT: v_cndmask_b32_e64 v9, v36, v9, s[6:7] -; GFX7-NEXT: buffer_load_dword v36, off, s[0:3], 
s32 offset:80 -; GFX7-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:76 -; GFX7-NEXT: s_waitcnt vmcnt(6) -; GFX7-NEXT: v_cmp_o_f64_e64 s[6:7], v[12:13], v[31:32] -; GFX7-NEXT: v_cndmask_b32_e32 v10, v50, v10, vcc -; GFX7-NEXT: v_cndmask_b32_e32 v11, v51, v11, vcc -; GFX7-NEXT: v_cndmask_b32_e64 v10, v10, v48, s[4:5] -; GFX7-NEXT: v_cndmask_b32_e64 v11, v11, v49, s[4:5] -; GFX7-NEXT: v_cmp_eq_f64_e32 vcc, 0, v[50:51] -; GFX7-NEXT: v_cmp_gt_f64_e64 s[4:5], v[12:13], v[31:32] -; GFX7-NEXT: v_cndmask_b32_e32 v10, v50, v10, vcc -; GFX7-NEXT: v_cndmask_b32_e64 v48, v32, v13, s[4:5] -; GFX7-NEXT: v_cndmask_b32_e32 v11, v51, v11, vcc -; GFX7-NEXT: v_cmp_class_f64_e64 vcc, v[12:13], 64 -; GFX7-NEXT: v_cndmask_b32_e64 v49, v39, v48, s[6:7] -; GFX7-NEXT: v_cndmask_b32_e64 v48, v31, v12, s[4:5] -; GFX7-NEXT: v_cmp_class_f64_e64 s[4:5], v[31:32], 64 -; GFX7-NEXT: v_cndmask_b32_e64 v48, 0, v48, s[6:7] -; GFX7-NEXT: s_waitcnt vmcnt(4) -; GFX7-NEXT: v_cmp_o_f64_e64 s[6:7], v[14:15], v[33:34] -; GFX7-NEXT: v_cndmask_b32_e32 v12, v48, v12, vcc -; GFX7-NEXT: v_cndmask_b32_e32 v13, v49, v13, vcc -; GFX7-NEXT: v_cmp_eq_f64_e32 vcc, 0, v[48:49] -; GFX7-NEXT: v_cndmask_b32_e64 v12, v12, v31, s[4:5] -; GFX7-NEXT: v_cndmask_b32_e64 v13, v13, v32, s[4:5] +; GFX7-NEXT: v_cndmask_b32_e64 v10, v10, 0, s[12:13] +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_cmp_u_f64_e64 s[14:15], v[12:13], v[31:32] +; GFX7-NEXT: v_max_f64 v[12:13], v[12:13], v[31:32] +; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:64 +; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:60 +; GFX7-NEXT: v_cndmask_b32_e64 v12, v12, 0, s[14:15] +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_cmp_u_f64_e64 s[16:17], v[14:15], v[31:32] +; GFX7-NEXT: v_max_f64 v[14:15], v[14:15], v[31:32] +; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:68 +; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:72 +; GFX7-NEXT: v_cndmask_b32_e64 v14, v14, 0, s[16:17] +; GFX7-NEXT: s_waitcnt vmcnt(0) +; 
GFX7-NEXT: v_cmp_u_f64_e64 s[18:19], v[16:17], v[31:32] +; GFX7-NEXT: v_max_f64 v[16:17], v[16:17], v[31:32] +; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:80 +; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:76 +; GFX7-NEXT: v_cndmask_b32_e64 v16, v16, 0, s[18:19] +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_cmp_u_f64_e64 s[20:21], v[18:19], v[31:32] +; GFX7-NEXT: v_max_f64 v[18:19], v[18:19], v[31:32] ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:88 ; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:84 -; GFX7-NEXT: v_cmp_gt_f64_e64 s[4:5], v[14:15], v[33:34] -; GFX7-NEXT: v_cndmask_b32_e32 v12, v48, v12, vcc -; GFX7-NEXT: v_cndmask_b32_e32 v13, v49, v13, vcc -; GFX7-NEXT: v_cmp_class_f64_e64 vcc, v[14:15], 64 -; GFX7-NEXT: v_cndmask_b32_e64 v50, v34, v15, s[4:5] -; GFX7-NEXT: v_cndmask_b32_e64 v48, v33, v14, s[4:5] -; GFX7-NEXT: v_cmp_class_f64_e64 s[4:5], v[33:34], 64 -; GFX7-NEXT: v_cndmask_b32_e64 v49, v39, v50, s[6:7] -; GFX7-NEXT: v_cndmask_b32_e64 v48, 0, v48, s[6:7] -; GFX7-NEXT: v_cndmask_b32_e32 v14, v48, v14, vcc -; GFX7-NEXT: v_cndmask_b32_e32 v15, v49, v15, vcc -; GFX7-NEXT: v_cmp_eq_f64_e32 vcc, 0, v[48:49] -; GFX7-NEXT: v_cndmask_b32_e64 v14, v14, v33, s[4:5] -; GFX7-NEXT: v_cndmask_b32_e64 v15, v15, v34, s[4:5] -; GFX7-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:96 -; GFX7-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:92 -; GFX7-NEXT: v_cndmask_b32_e32 v14, v48, v14, vcc -; GFX7-NEXT: v_cndmask_b32_e32 v15, v49, v15, vcc -; GFX7-NEXT: v_cmp_class_f64_e64 vcc, v[16:17], 64 -; GFX7-NEXT: s_waitcnt vmcnt(6) -; GFX7-NEXT: v_cmp_gt_f64_e64 s[4:5], v[16:17], v[37:38] -; GFX7-NEXT: v_cmp_o_f64_e64 s[6:7], v[16:17], v[37:38] -; GFX7-NEXT: v_cndmask_b32_e64 v50, v38, v17, s[4:5] -; GFX7-NEXT: v_cndmask_b32_e64 v48, v37, v16, s[4:5] -; GFX7-NEXT: v_cndmask_b32_e64 v49, v39, v50, s[6:7] -; GFX7-NEXT: v_cndmask_b32_e64 v48, 0, v48, s[6:7] -; GFX7-NEXT: v_cndmask_b32_e32 v16, v48, v16, vcc -; 
GFX7-NEXT: v_cmp_class_f64_e64 s[4:5], v[37:38], 64 -; GFX7-NEXT: v_cndmask_b32_e32 v17, v49, v17, vcc -; GFX7-NEXT: s_waitcnt vmcnt(4) -; GFX7-NEXT: v_cmp_gt_f64_e32 vcc, v[18:19], v[35:36] -; GFX7-NEXT: v_cmp_o_f64_e64 s[6:7], v[18:19], v[35:36] -; GFX7-NEXT: v_cndmask_b32_e64 v16, v16, v37, s[4:5] -; GFX7-NEXT: v_cndmask_b32_e64 v17, v17, v38, s[4:5] -; GFX7-NEXT: v_cndmask_b32_e32 v50, v36, v19, vcc -; GFX7-NEXT: v_cndmask_b32_e32 v37, v35, v18, vcc -; GFX7-NEXT: v_cmp_eq_f64_e32 vcc, 0, v[48:49] -; GFX7-NEXT: v_cmp_class_f64_e64 s[4:5], v[18:19], 64 -; GFX7-NEXT: v_cndmask_b32_e64 v51, v39, v50, s[6:7] -; GFX7-NEXT: v_cndmask_b32_e64 v50, 0, v37, s[6:7] -; GFX7-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:100 -; GFX7-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:104 -; GFX7-NEXT: v_cndmask_b32_e32 v16, v48, v16, vcc -; GFX7-NEXT: v_cndmask_b32_e32 v17, v49, v17, vcc -; GFX7-NEXT: v_cmp_class_f64_e64 vcc, v[35:36], 64 -; GFX7-NEXT: v_cndmask_b32_e64 v18, v50, v18, s[4:5] -; GFX7-NEXT: v_cndmask_b32_e64 v19, v51, v19, s[4:5] -; GFX7-NEXT: s_waitcnt vmcnt(4) -; GFX7-NEXT: v_cmp_gt_f64_e64 s[4:5], v[20:21], v[31:32] -; GFX7-NEXT: v_cmp_o_f64_e64 s[6:7], v[20:21], v[31:32] -; GFX7-NEXT: v_cndmask_b32_e32 v18, v18, v35, vcc -; GFX7-NEXT: v_cndmask_b32_e32 v19, v19, v36, vcc -; GFX7-NEXT: v_cmp_eq_f64_e32 vcc, 0, v[50:51] -; GFX7-NEXT: v_cndmask_b32_e64 v48, v32, v21, s[4:5] -; GFX7-NEXT: v_cndmask_b32_e64 v35, v31, v20, s[4:5] -; GFX7-NEXT: v_cmp_class_f64_e64 s[4:5], v[20:21], 64 -; GFX7-NEXT: v_cndmask_b32_e64 v36, v39, v48, s[6:7] -; GFX7-NEXT: v_cndmask_b32_e64 v35, 0, v35, s[6:7] -; GFX7-NEXT: v_cndmask_b32_e32 v18, v50, v18, vcc -; GFX7-NEXT: v_cndmask_b32_e32 v19, v51, v19, vcc -; GFX7-NEXT: v_cmp_class_f64_e64 vcc, v[31:32], 64 -; GFX7-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:112 -; GFX7-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:108 -; GFX7-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:120 -; GFX7-NEXT: 
buffer_load_dword v50, off, s[0:3], s32 offset:116 -; GFX7-NEXT: v_cndmask_b32_e64 v20, v35, v20, s[4:5] -; GFX7-NEXT: v_cndmask_b32_e64 v21, v36, v21, s[4:5] -; GFX7-NEXT: v_cmp_eq_f64_e64 s[4:5], 0, v[35:36] -; GFX7-NEXT: s_waitcnt vmcnt(6) -; GFX7-NEXT: v_cmp_gt_f64_e64 s[6:7], v[22:23], v[33:34] -; GFX7-NEXT: v_cmp_o_f64_e64 s[8:9], v[22:23], v[33:34] -; GFX7-NEXT: v_cndmask_b32_e32 v20, v20, v31, vcc -; GFX7-NEXT: v_cndmask_b32_e32 v21, v21, v32, vcc -; GFX7-NEXT: v_cmp_class_f64_e64 vcc, v[22:23], 64 -; GFX7-NEXT: v_cndmask_b32_e64 v20, v35, v20, s[4:5] -; GFX7-NEXT: v_cndmask_b32_e64 v21, v36, v21, s[4:5] -; GFX7-NEXT: v_cmp_class_f64_e64 s[4:5], v[33:34], 64 -; GFX7-NEXT: v_cndmask_b32_e64 v31, v34, v23, s[6:7] -; GFX7-NEXT: v_cndmask_b32_e64 v36, v39, v31, s[8:9] -; GFX7-NEXT: v_cndmask_b32_e64 v31, v33, v22, s[6:7] -; GFX7-NEXT: v_cndmask_b32_e64 v35, 0, v31, s[8:9] -; GFX7-NEXT: v_cndmask_b32_e32 v22, v35, v22, vcc -; GFX7-NEXT: v_cndmask_b32_e32 v23, v36, v23, vcc -; GFX7-NEXT: v_cmp_eq_f64_e32 vcc, 0, v[35:36] -; GFX7-NEXT: v_cndmask_b32_e64 v22, v22, v33, s[4:5] +; GFX7-NEXT: v_cndmask_b32_e64 v18, v18, 0, s[20:21] +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_cmp_u_f64_e64 s[22:23], v[20:21], v[31:32] +; GFX7-NEXT: v_max_f64 v[20:21], v[20:21], v[31:32] +; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:96 +; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:92 +; GFX7-NEXT: v_cndmask_b32_e64 v20, v20, 0, s[22:23] +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_cmp_u_f64_e64 s[24:25], v[22:23], v[31:32] +; GFX7-NEXT: v_max_f64 v[22:23], v[22:23], v[31:32] +; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:100 +; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:104 +; GFX7-NEXT: v_cndmask_b32_e64 v22, v22, 0, s[24:25] +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_cmp_u_f64_e64 s[26:27], v[24:25], v[31:32] +; GFX7-NEXT: v_max_f64 v[24:25], v[24:25], v[31:32] +; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], 
s32 offset:112 +; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:108 +; GFX7-NEXT: v_cndmask_b32_e64 v24, v24, 0, s[26:27] +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_cmp_u_f64_e64 s[28:29], v[26:27], v[31:32] +; GFX7-NEXT: v_max_f64 v[26:27], v[26:27], v[31:32] +; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:120 +; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:116 +; GFX7-NEXT: v_cndmask_b32_e64 v26, v26, 0, s[28:29] +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_cmp_u_f64_e64 s[30:31], v[28:29], v[31:32] +; GFX7-NEXT: v_max_f64 v[28:29], v[28:29], v[31:32] ; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 ; GFX7-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:128 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:124 -; GFX7-NEXT: v_cndmask_b32_e64 v23, v23, v34, s[4:5] -; GFX7-NEXT: v_cndmask_b32_e32 v22, v35, v22, vcc -; GFX7-NEXT: v_cndmask_b32_e32 v23, v36, v23, vcc -; GFX7-NEXT: v_cmp_class_f64_e64 vcc, v[24:25], 64 -; GFX7-NEXT: s_waitcnt vmcnt(7) -; GFX7-NEXT: v_cmp_gt_f64_e64 s[4:5], v[24:25], v[37:38] -; GFX7-NEXT: v_cmp_o_f64_e64 s[6:7], v[24:25], v[37:38] -; GFX7-NEXT: v_cndmask_b32_e64 v34, v38, v25, s[4:5] -; GFX7-NEXT: v_cndmask_b32_e64 v35, v39, v34, s[6:7] -; GFX7-NEXT: v_cndmask_b32_e64 v34, v37, v24, s[4:5] -; GFX7-NEXT: v_cndmask_b32_e64 v34, 0, v34, s[6:7] -; GFX7-NEXT: v_cndmask_b32_e32 v24, v34, v24, vcc -; GFX7-NEXT: v_cndmask_b32_e32 v25, v35, v25, vcc -; GFX7-NEXT: v_cmp_class_f64_e64 vcc, v[37:38], 64 -; GFX7-NEXT: v_cmp_eq_f64_e64 s[4:5], 0, v[34:35] -; GFX7-NEXT: v_cndmask_b32_e32 v24, v24, v37, vcc -; GFX7-NEXT: v_cndmask_b32_e32 v25, v25, v38, vcc -; GFX7-NEXT: v_cmp_class_f64_e64 vcc, v[26:27], 64 -; GFX7-NEXT: v_cndmask_b32_e64 v24, v34, v24, s[4:5] -; GFX7-NEXT: v_cndmask_b32_e64 v25, v35, v25, s[4:5] -; GFX7-NEXT: s_waitcnt vmcnt(5) -; GFX7-NEXT: v_cmp_gt_f64_e64 s[6:7], v[26:27], v[48:49] -; GFX7-NEXT: v_cmp_o_f64_e64 s[8:9], v[26:27], v[48:49] -; GFX7-NEXT: 
v_cmp_class_f64_e64 s[4:5], v[48:49], 64 -; GFX7-NEXT: s_waitcnt vmcnt(3) -; GFX7-NEXT: v_cmp_o_f64_e64 s[10:11], v[28:29], v[50:51] -; GFX7-NEXT: v_cndmask_b32_e64 v36, v49, v27, s[6:7] -; GFX7-NEXT: v_cndmask_b32_e64 v34, v48, v26, s[6:7] -; GFX7-NEXT: v_cndmask_b32_e64 v35, v39, v36, s[8:9] -; GFX7-NEXT: v_cndmask_b32_e64 v34, 0, v34, s[8:9] -; GFX7-NEXT: v_cmp_gt_f64_e64 s[8:9], v[28:29], v[50:51] -; GFX7-NEXT: v_cndmask_b32_e32 v26, v34, v26, vcc -; GFX7-NEXT: v_cmp_eq_f64_e64 s[6:7], 0, v[34:35] -; GFX7-NEXT: v_cndmask_b32_e32 v27, v35, v27, vcc -; GFX7-NEXT: v_cndmask_b32_e64 v26, v26, v48, s[4:5] -; GFX7-NEXT: v_cndmask_b32_e64 v27, v27, v49, s[4:5] -; GFX7-NEXT: v_cndmask_b32_e64 v36, v51, v29, s[8:9] -; GFX7-NEXT: v_cndmask_b32_e64 v27, v35, v27, s[6:7] +; GFX7-NEXT: v_cndmask_b32_e64 v28, v28, 0, s[30:31] +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_cmp_u_f64_e64 s[34:35], v[30:31], v[32:33] +; GFX7-NEXT: v_max_f64 v[30:31], v[30:31], v[32:33] +; GFX7-NEXT: v_mov_b32_e32 v32, 0x7ff80000 +; GFX7-NEXT: v_cndmask_b32_e32 v1, v1, v32, vcc +; GFX7-NEXT: v_cndmask_b32_e64 v3, v3, v32, s[4:5] +; GFX7-NEXT: v_cndmask_b32_e64 v5, v5, v32, s[6:7] +; GFX7-NEXT: v_cndmask_b32_e64 v7, v7, v32, s[8:9] +; GFX7-NEXT: v_cndmask_b32_e64 v9, v9, v32, s[10:11] +; GFX7-NEXT: v_cndmask_b32_e64 v11, v11, v32, s[12:13] +; GFX7-NEXT: v_cndmask_b32_e64 v13, v13, v32, s[14:15] +; GFX7-NEXT: v_cndmask_b32_e64 v15, v15, v32, s[16:17] +; GFX7-NEXT: v_cndmask_b32_e64 v17, v17, v32, s[18:19] +; GFX7-NEXT: v_cndmask_b32_e64 v19, v19, v32, s[20:21] +; GFX7-NEXT: v_cndmask_b32_e64 v21, v21, v32, s[22:23] +; GFX7-NEXT: v_cndmask_b32_e64 v23, v23, v32, s[24:25] +; GFX7-NEXT: v_cndmask_b32_e64 v25, v25, v32, s[26:27] +; GFX7-NEXT: v_cndmask_b32_e64 v27, v27, v32, s[28:29] +; GFX7-NEXT: v_cndmask_b32_e64 v29, v29, v32, s[30:31] +; GFX7-NEXT: v_cndmask_b32_e64 v31, v31, v32, s[34:35] +; GFX7-NEXT: v_cndmask_b32_e64 v30, v30, 0, s[34:35] +; GFX7-NEXT: v_readlane_b32 s35, v34, 3 +; 
GFX7-NEXT: v_readlane_b32 s34, v34, 2 +; GFX7-NEXT: v_readlane_b32 s31, v34, 1 +; GFX7-NEXT: v_readlane_b32 s30, v34, 0 +; GFX7-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GFX7-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; GFX7-NEXT: s_mov_b64 exec, s[4:5] ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_cmp_gt_f64_e32 vcc, v[30:31], v[32:33] -; GFX7-NEXT: v_cmp_o_f64_e64 s[4:5], v[30:31], v[32:33] -; GFX7-NEXT: v_cndmask_b32_e64 v35, v39, v36, s[10:11] -; GFX7-NEXT: v_cndmask_b32_e64 v26, v34, v26, s[6:7] -; GFX7-NEXT: v_cndmask_b32_e64 v34, v50, v28, s[8:9] -; GFX7-NEXT: v_cmp_class_f64_e64 s[6:7], v[28:29], 64 -; GFX7-NEXT: v_cndmask_b32_e64 v34, 0, v34, s[10:11] -; GFX7-NEXT: v_cmp_class_f64_e64 s[8:9], v[50:51], 64 -; GFX7-NEXT: v_cndmask_b32_e32 v36, v33, v31, vcc -; GFX7-NEXT: v_cndmask_b32_e64 v37, v39, v36, s[4:5] -; GFX7-NEXT: v_cndmask_b32_e32 v36, v32, v30, vcc -; GFX7-NEXT: v_cmp_class_f64_e64 vcc, v[30:31], 64 -; GFX7-NEXT: v_cndmask_b32_e64 v36, 0, v36, s[4:5] -; GFX7-NEXT: v_cmp_class_f64_e64 s[4:5], v[32:33], 64 -; GFX7-NEXT: v_cmp_eq_f64_e64 s[10:11], 0, v[34:35] -; GFX7-NEXT: v_cmp_eq_f64_e64 s[12:13], 0, v[36:37] -; GFX7-NEXT: v_cndmask_b32_e64 v28, v34, v28, s[6:7] -; GFX7-NEXT: v_cndmask_b32_e64 v29, v35, v29, s[6:7] -; GFX7-NEXT: v_cndmask_b32_e64 v28, v28, v50, s[8:9] -; GFX7-NEXT: v_cndmask_b32_e32 v30, v36, v30, vcc -; GFX7-NEXT: v_cndmask_b32_e32 v31, v37, v31, vcc -; GFX7-NEXT: v_cndmask_b32_e64 v30, v30, v32, s[4:5] -; GFX7-NEXT: v_cndmask_b32_e64 v29, v29, v51, s[8:9] -; GFX7-NEXT: v_cndmask_b32_e64 v31, v31, v33, s[4:5] -; GFX7-NEXT: v_cndmask_b32_e64 v28, v34, v28, s[10:11] -; GFX7-NEXT: v_cndmask_b32_e64 v29, v35, v29, s[10:11] -; GFX7-NEXT: v_cndmask_b32_e64 v30, v36, v30, s[12:13] -; GFX7-NEXT: v_cndmask_b32_e64 v31, v37, v31, s[12:13] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_maximum_v16f64: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: 
s_xor_saveexec_b64 s[4:5], -1 +; GFX8-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; GFX8-NEXT: s_mov_b64 exec, s[4:5] ; GFX8-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:8 ; GFX8-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:4 -; GFX8-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:16 -; GFX8-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:12 -; GFX8-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:24 -; GFX8-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:20 -; GFX8-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:32 -; GFX8-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:28 -; GFX8-NEXT: v_mov_b32_e32 v39, 0x7ff80000 -; GFX8-NEXT: s_waitcnt vmcnt(6) -; GFX8-NEXT: v_cmp_gt_f64_e32 vcc, v[0:1], v[31:32] -; GFX8-NEXT: v_cmp_o_f64_e64 s[4:5], v[0:1], v[31:32] -; GFX8-NEXT: s_waitcnt vmcnt(4) -; GFX8-NEXT: v_cmp_o_f64_e64 s[6:7], v[2:3], v[33:34] -; GFX8-NEXT: v_cndmask_b32_e32 v48, v32, v1, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v49, v39, v48, s[4:5] -; GFX8-NEXT: v_cndmask_b32_e32 v48, v31, v0, vcc -; GFX8-NEXT: v_cmp_class_f64_e64 vcc, v[0:1], 64 -; GFX8-NEXT: v_cndmask_b32_e64 v48, 0, v48, s[4:5] -; GFX8-NEXT: v_cmp_class_f64_e64 s[4:5], v[31:32], 64 -; GFX8-NEXT: v_cndmask_b32_e32 v0, v48, v0, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v1, v49, v1, vcc -; GFX8-NEXT: v_cmp_gt_f64_e32 vcc, v[2:3], v[33:34] -; GFX8-NEXT: v_cndmask_b32_e64 v0, v0, v31, s[4:5] -; GFX8-NEXT: v_cndmask_b32_e64 v1, v1, v32, s[4:5] -; GFX8-NEXT: v_cmp_class_f64_e64 s[4:5], v[33:34], 64 -; GFX8-NEXT: v_cndmask_b32_e32 v50, v34, v3, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v31, v33, v2, vcc -; GFX8-NEXT: v_cmp_class_f64_e64 vcc, v[2:3], 64 -; GFX8-NEXT: v_cndmask_b32_e64 v32, v39, v50, s[6:7] -; GFX8-NEXT: v_cndmask_b32_e64 v31, 0, v31, s[6:7] -; GFX8-NEXT: s_waitcnt vmcnt(2) -; GFX8-NEXT: v_cmp_o_f64_e64 s[6:7], v[4:5], v[35:36] -; GFX8-NEXT: v_cndmask_b32_e32 v2, v31, v2, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v3, v32, v3, vcc 
-; GFX8-NEXT: v_cmp_gt_f64_e32 vcc, v[4:5], v[35:36] -; GFX8-NEXT: v_cndmask_b32_e64 v2, v2, v33, s[4:5] -; GFX8-NEXT: v_cndmask_b32_e64 v3, v3, v34, s[4:5] -; GFX8-NEXT: v_cmp_class_f64_e64 s[4:5], v[35:36], 64 -; GFX8-NEXT: v_cndmask_b32_e32 v50, v36, v5, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v33, v35, v4, vcc -; GFX8-NEXT: v_cmp_class_f64_e64 vcc, v[4:5], 64 -; GFX8-NEXT: v_cndmask_b32_e64 v34, v39, v50, s[6:7] -; GFX8-NEXT: v_cndmask_b32_e64 v33, 0, v33, s[6:7] +; GFX8-NEXT: v_writelane_b32 v34, s30, 0 +; GFX8-NEXT: v_writelane_b32 v34, s31, 1 +; GFX8-NEXT: v_writelane_b32 v34, s34, 2 +; GFX8-NEXT: v_writelane_b32 v34, s35, 3 +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[31:32] +; GFX8-NEXT: v_max_f64 v[0:1], v[0:1], v[31:32] +; GFX8-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:16 +; GFX8-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:12 +; GFX8-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_cmp_u_f64_e64 s[4:5], v[2:3], v[31:32] +; GFX8-NEXT: v_max_f64 v[2:3], v[2:3], v[31:32] +; GFX8-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:24 +; GFX8-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:20 +; GFX8-NEXT: v_cndmask_b32_e64 v2, v2, 0, s[4:5] +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_cmp_u_f64_e64 s[6:7], v[4:5], v[31:32] +; GFX8-NEXT: v_max_f64 v[4:5], v[4:5], v[31:32] +; GFX8-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:32 +; GFX8-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:28 +; GFX8-NEXT: v_cndmask_b32_e64 v4, v4, 0, s[6:7] ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_cmp_o_f64_e64 s[6:7], v[6:7], v[37:38] -; GFX8-NEXT: v_cndmask_b32_e32 v4, v33, v4, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v5, v34, v5, vcc -; GFX8-NEXT: v_cmp_gt_f64_e32 vcc, v[6:7], v[37:38] -; GFX8-NEXT: v_cndmask_b32_e64 v4, v4, v35, s[4:5] -; GFX8-NEXT: v_cndmask_b32_e64 v5, v5, v36, s[4:5] -; GFX8-NEXT: v_cmp_class_f64_e64 s[4:5], v[37:38], 64 -; GFX8-NEXT: 
v_cndmask_b32_e32 v50, v38, v7, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v35, v37, v6, vcc -; GFX8-NEXT: v_cmp_class_f64_e64 vcc, v[6:7], 64 -; GFX8-NEXT: v_cndmask_b32_e64 v36, v39, v50, s[6:7] -; GFX8-NEXT: v_cndmask_b32_e64 v35, 0, v35, s[6:7] -; GFX8-NEXT: v_cndmask_b32_e32 v6, v35, v6, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v7, v36, v7, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v6, v6, v37, s[4:5] -; GFX8-NEXT: v_cndmask_b32_e64 v7, v7, v38, s[4:5] -; GFX8-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:36 -; GFX8-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:40 -; GFX8-NEXT: v_cmp_eq_f64_e32 vcc, 0, v[48:49] -; GFX8-NEXT: v_cndmask_b32_e32 v0, v48, v0, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v1, v49, v1, vcc -; GFX8-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:48 -; GFX8-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:44 -; GFX8-NEXT: v_cmp_eq_f64_e32 vcc, 0, v[31:32] -; GFX8-NEXT: v_cndmask_b32_e32 v2, v31, v2, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v3, v32, v3, vcc +; GFX8-NEXT: v_cmp_u_f64_e64 s[8:9], v[6:7], v[31:32] +; GFX8-NEXT: v_max_f64 v[6:7], v[6:7], v[31:32] +; GFX8-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:36 +; GFX8-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:40 +; GFX8-NEXT: v_cndmask_b32_e64 v6, v6, 0, s[8:9] +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_cmp_u_f64_e64 s[10:11], v[8:9], v[31:32] +; GFX8-NEXT: v_max_f64 v[8:9], v[8:9], v[31:32] +; GFX8-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:48 +; GFX8-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:44 +; GFX8-NEXT: v_cndmask_b32_e64 v8, v8, 0, s[10:11] +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_cmp_u_f64_e64 s[12:13], v[10:11], v[31:32] +; GFX8-NEXT: v_max_f64 v[10:11], v[10:11], v[31:32] ; GFX8-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:56 ; GFX8-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:52 -; GFX8-NEXT: v_cmp_eq_f64_e32 vcc, 0, v[33:34] -; GFX8-NEXT: v_cndmask_b32_e32 v4, v33, v4, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v5, 
v34, v5, vcc -; GFX8-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:64 -; GFX8-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:60 -; GFX8-NEXT: v_cmp_eq_f64_e32 vcc, 0, v[35:36] -; GFX8-NEXT: v_cndmask_b32_e32 v6, v35, v6, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v7, v36, v7, vcc -; GFX8-NEXT: v_cmp_class_f64_e64 vcc, v[8:9], 64 -; GFX8-NEXT: s_waitcnt vmcnt(6) -; GFX8-NEXT: v_cmp_gt_f64_e64 s[4:5], v[8:9], v[37:38] -; GFX8-NEXT: v_cmp_o_f64_e64 s[6:7], v[8:9], v[37:38] -; GFX8-NEXT: v_cndmask_b32_e64 v50, v38, v9, s[4:5] -; GFX8-NEXT: v_cndmask_b32_e64 v35, v37, v8, s[4:5] -; GFX8-NEXT: v_cndmask_b32_e64 v36, v39, v50, s[6:7] -; GFX8-NEXT: v_cndmask_b32_e64 v35, 0, v35, s[6:7] -; GFX8-NEXT: v_cmp_class_f64_e64 s[4:5], v[37:38], 64 -; GFX8-NEXT: v_cndmask_b32_e32 v8, v35, v8, vcc -; GFX8-NEXT: v_cmp_eq_f64_e64 s[6:7], 0, v[35:36] -; GFX8-NEXT: v_cndmask_b32_e32 v9, v36, v9, vcc -; GFX8-NEXT: s_waitcnt vmcnt(4) -; GFX8-NEXT: v_cmp_gt_f64_e32 vcc, v[10:11], v[48:49] -; GFX8-NEXT: v_cmp_o_f64_e64 s[8:9], v[10:11], v[48:49] -; GFX8-NEXT: v_cndmask_b32_e64 v8, v8, v37, s[4:5] -; GFX8-NEXT: v_cndmask_b32_e64 v9, v9, v38, s[4:5] -; GFX8-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:68 -; GFX8-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:72 -; GFX8-NEXT: v_cndmask_b32_e64 v8, v35, v8, s[6:7] -; GFX8-NEXT: v_cndmask_b32_e32 v50, v49, v11, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v35, v48, v10, vcc -; GFX8-NEXT: v_cmp_class_f64_e64 vcc, v[10:11], 64 -; GFX8-NEXT: v_cmp_class_f64_e64 s[4:5], v[48:49], 64 -; GFX8-NEXT: v_cndmask_b32_e64 v51, v39, v50, s[8:9] -; GFX8-NEXT: v_cndmask_b32_e64 v50, 0, v35, s[8:9] -; GFX8-NEXT: v_cndmask_b32_e64 v9, v36, v9, s[6:7] -; GFX8-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:80 -; GFX8-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:76 -; GFX8-NEXT: s_waitcnt vmcnt(6) -; GFX8-NEXT: v_cmp_o_f64_e64 s[6:7], v[12:13], v[31:32] -; GFX8-NEXT: v_cndmask_b32_e32 v10, v50, v10, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v11, 
v51, v11, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v10, v10, v48, s[4:5] -; GFX8-NEXT: v_cndmask_b32_e64 v11, v11, v49, s[4:5] -; GFX8-NEXT: v_cmp_eq_f64_e32 vcc, 0, v[50:51] -; GFX8-NEXT: v_cmp_gt_f64_e64 s[4:5], v[12:13], v[31:32] -; GFX8-NEXT: v_cndmask_b32_e32 v10, v50, v10, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v48, v32, v13, s[4:5] -; GFX8-NEXT: v_cndmask_b32_e32 v11, v51, v11, vcc -; GFX8-NEXT: v_cmp_class_f64_e64 vcc, v[12:13], 64 -; GFX8-NEXT: v_cndmask_b32_e64 v49, v39, v48, s[6:7] -; GFX8-NEXT: v_cndmask_b32_e64 v48, v31, v12, s[4:5] -; GFX8-NEXT: v_cmp_class_f64_e64 s[4:5], v[31:32], 64 -; GFX8-NEXT: v_cndmask_b32_e64 v48, 0, v48, s[6:7] -; GFX8-NEXT: s_waitcnt vmcnt(4) -; GFX8-NEXT: v_cmp_o_f64_e64 s[6:7], v[14:15], v[33:34] -; GFX8-NEXT: v_cndmask_b32_e32 v12, v48, v12, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v13, v49, v13, vcc -; GFX8-NEXT: v_cmp_eq_f64_e32 vcc, 0, v[48:49] -; GFX8-NEXT: v_cndmask_b32_e64 v12, v12, v31, s[4:5] -; GFX8-NEXT: v_cndmask_b32_e64 v13, v13, v32, s[4:5] +; GFX8-NEXT: v_cndmask_b32_e64 v10, v10, 0, s[12:13] +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_cmp_u_f64_e64 s[14:15], v[12:13], v[31:32] +; GFX8-NEXT: v_max_f64 v[12:13], v[12:13], v[31:32] +; GFX8-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:64 +; GFX8-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:60 +; GFX8-NEXT: v_cndmask_b32_e64 v12, v12, 0, s[14:15] +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_cmp_u_f64_e64 s[16:17], v[14:15], v[31:32] +; GFX8-NEXT: v_max_f64 v[14:15], v[14:15], v[31:32] +; GFX8-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:68 +; GFX8-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:72 +; GFX8-NEXT: v_cndmask_b32_e64 v14, v14, 0, s[16:17] +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_cmp_u_f64_e64 s[18:19], v[16:17], v[31:32] +; GFX8-NEXT: v_max_f64 v[16:17], v[16:17], v[31:32] +; GFX8-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:80 +; GFX8-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:76 +; GFX8-NEXT: 
v_cndmask_b32_e64 v16, v16, 0, s[18:19] +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_cmp_u_f64_e64 s[20:21], v[18:19], v[31:32] +; GFX8-NEXT: v_max_f64 v[18:19], v[18:19], v[31:32] ; GFX8-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:88 ; GFX8-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:84 -; GFX8-NEXT: v_cmp_gt_f64_e64 s[4:5], v[14:15], v[33:34] -; GFX8-NEXT: v_cndmask_b32_e32 v12, v48, v12, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v13, v49, v13, vcc -; GFX8-NEXT: v_cmp_class_f64_e64 vcc, v[14:15], 64 -; GFX8-NEXT: v_cndmask_b32_e64 v50, v34, v15, s[4:5] -; GFX8-NEXT: v_cndmask_b32_e64 v48, v33, v14, s[4:5] -; GFX8-NEXT: v_cmp_class_f64_e64 s[4:5], v[33:34], 64 -; GFX8-NEXT: v_cndmask_b32_e64 v49, v39, v50, s[6:7] -; GFX8-NEXT: v_cndmask_b32_e64 v48, 0, v48, s[6:7] -; GFX8-NEXT: v_cndmask_b32_e32 v14, v48, v14, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v15, v49, v15, vcc -; GFX8-NEXT: v_cmp_eq_f64_e32 vcc, 0, v[48:49] -; GFX8-NEXT: v_cndmask_b32_e64 v14, v14, v33, s[4:5] -; GFX8-NEXT: v_cndmask_b32_e64 v15, v15, v34, s[4:5] -; GFX8-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:96 -; GFX8-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:92 -; GFX8-NEXT: v_cndmask_b32_e32 v14, v48, v14, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v15, v49, v15, vcc -; GFX8-NEXT: v_cmp_class_f64_e64 vcc, v[16:17], 64 -; GFX8-NEXT: s_waitcnt vmcnt(6) -; GFX8-NEXT: v_cmp_gt_f64_e64 s[4:5], v[16:17], v[37:38] -; GFX8-NEXT: v_cmp_o_f64_e64 s[6:7], v[16:17], v[37:38] -; GFX8-NEXT: v_cndmask_b32_e64 v50, v38, v17, s[4:5] -; GFX8-NEXT: v_cndmask_b32_e64 v48, v37, v16, s[4:5] -; GFX8-NEXT: v_cndmask_b32_e64 v49, v39, v50, s[6:7] -; GFX8-NEXT: v_cndmask_b32_e64 v48, 0, v48, s[6:7] -; GFX8-NEXT: v_cndmask_b32_e32 v16, v48, v16, vcc -; GFX8-NEXT: v_cmp_class_f64_e64 s[4:5], v[37:38], 64 -; GFX8-NEXT: v_cndmask_b32_e32 v17, v49, v17, vcc -; GFX8-NEXT: s_waitcnt vmcnt(4) -; GFX8-NEXT: v_cmp_gt_f64_e32 vcc, v[18:19], v[35:36] -; GFX8-NEXT: v_cmp_o_f64_e64 s[6:7], v[18:19], v[35:36] -; 
GFX8-NEXT: v_cndmask_b32_e64 v16, v16, v37, s[4:5] -; GFX8-NEXT: v_cndmask_b32_e64 v17, v17, v38, s[4:5] -; GFX8-NEXT: v_cndmask_b32_e32 v50, v36, v19, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v37, v35, v18, vcc -; GFX8-NEXT: v_cmp_eq_f64_e32 vcc, 0, v[48:49] -; GFX8-NEXT: v_cmp_class_f64_e64 s[4:5], v[18:19], 64 -; GFX8-NEXT: v_cndmask_b32_e64 v51, v39, v50, s[6:7] -; GFX8-NEXT: v_cndmask_b32_e64 v50, 0, v37, s[6:7] -; GFX8-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:100 -; GFX8-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:104 -; GFX8-NEXT: v_cndmask_b32_e32 v16, v48, v16, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v17, v49, v17, vcc -; GFX8-NEXT: v_cmp_class_f64_e64 vcc, v[35:36], 64 -; GFX8-NEXT: v_cndmask_b32_e64 v18, v50, v18, s[4:5] -; GFX8-NEXT: v_cndmask_b32_e64 v19, v51, v19, s[4:5] -; GFX8-NEXT: s_waitcnt vmcnt(4) -; GFX8-NEXT: v_cmp_gt_f64_e64 s[4:5], v[20:21], v[31:32] -; GFX8-NEXT: v_cmp_o_f64_e64 s[6:7], v[20:21], v[31:32] -; GFX8-NEXT: v_cndmask_b32_e32 v18, v18, v35, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v19, v19, v36, vcc -; GFX8-NEXT: v_cmp_eq_f64_e32 vcc, 0, v[50:51] -; GFX8-NEXT: v_cndmask_b32_e64 v48, v32, v21, s[4:5] -; GFX8-NEXT: v_cndmask_b32_e64 v35, v31, v20, s[4:5] -; GFX8-NEXT: v_cmp_class_f64_e64 s[4:5], v[20:21], 64 -; GFX8-NEXT: v_cndmask_b32_e64 v36, v39, v48, s[6:7] -; GFX8-NEXT: v_cndmask_b32_e64 v35, 0, v35, s[6:7] -; GFX8-NEXT: v_cndmask_b32_e32 v18, v50, v18, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v19, v51, v19, vcc -; GFX8-NEXT: v_cmp_class_f64_e64 vcc, v[31:32], 64 -; GFX8-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:112 -; GFX8-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:108 -; GFX8-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:120 -; GFX8-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:116 -; GFX8-NEXT: v_cndmask_b32_e64 v20, v35, v20, s[4:5] -; GFX8-NEXT: v_cndmask_b32_e64 v21, v36, v21, s[4:5] -; GFX8-NEXT: v_cmp_eq_f64_e64 s[4:5], 0, v[35:36] -; GFX8-NEXT: s_waitcnt vmcnt(6) -; GFX8-NEXT: 
v_cmp_gt_f64_e64 s[6:7], v[22:23], v[33:34] -; GFX8-NEXT: v_cmp_o_f64_e64 s[8:9], v[22:23], v[33:34] -; GFX8-NEXT: v_cndmask_b32_e32 v20, v20, v31, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v21, v21, v32, vcc -; GFX8-NEXT: v_cmp_class_f64_e64 vcc, v[22:23], 64 -; GFX8-NEXT: v_cndmask_b32_e64 v20, v35, v20, s[4:5] -; GFX8-NEXT: v_cndmask_b32_e64 v21, v36, v21, s[4:5] -; GFX8-NEXT: v_cmp_class_f64_e64 s[4:5], v[33:34], 64 -; GFX8-NEXT: v_cndmask_b32_e64 v31, v34, v23, s[6:7] -; GFX8-NEXT: v_cndmask_b32_e64 v36, v39, v31, s[8:9] -; GFX8-NEXT: v_cndmask_b32_e64 v31, v33, v22, s[6:7] -; GFX8-NEXT: v_cndmask_b32_e64 v35, 0, v31, s[8:9] -; GFX8-NEXT: v_cndmask_b32_e32 v22, v35, v22, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v23, v36, v23, vcc -; GFX8-NEXT: v_cmp_eq_f64_e32 vcc, 0, v[35:36] -; GFX8-NEXT: v_cndmask_b32_e64 v22, v22, v33, s[4:5] +; GFX8-NEXT: v_cndmask_b32_e64 v18, v18, 0, s[20:21] +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_cmp_u_f64_e64 s[22:23], v[20:21], v[31:32] +; GFX8-NEXT: v_max_f64 v[20:21], v[20:21], v[31:32] +; GFX8-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:96 +; GFX8-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:92 +; GFX8-NEXT: v_cndmask_b32_e64 v20, v20, 0, s[22:23] +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_cmp_u_f64_e64 s[24:25], v[22:23], v[31:32] +; GFX8-NEXT: v_max_f64 v[22:23], v[22:23], v[31:32] +; GFX8-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:100 +; GFX8-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:104 +; GFX8-NEXT: v_cndmask_b32_e64 v22, v22, 0, s[24:25] +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_cmp_u_f64_e64 s[26:27], v[24:25], v[31:32] +; GFX8-NEXT: v_max_f64 v[24:25], v[24:25], v[31:32] +; GFX8-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:112 +; GFX8-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:108 +; GFX8-NEXT: v_cndmask_b32_e64 v24, v24, 0, s[26:27] +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_cmp_u_f64_e64 s[28:29], v[26:27], v[31:32] +; GFX8-NEXT: v_max_f64 
v[26:27], v[26:27], v[31:32] +; GFX8-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:120 +; GFX8-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:116 +; GFX8-NEXT: v_cndmask_b32_e64 v26, v26, 0, s[28:29] +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_cmp_u_f64_e64 s[30:31], v[28:29], v[31:32] +; GFX8-NEXT: v_max_f64 v[28:29], v[28:29], v[31:32] ; GFX8-NEXT: buffer_load_dword v31, off, s[0:3], s32 ; GFX8-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:128 ; GFX8-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:124 -; GFX8-NEXT: v_cndmask_b32_e64 v23, v23, v34, s[4:5] -; GFX8-NEXT: v_cndmask_b32_e32 v22, v35, v22, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v23, v36, v23, vcc -; GFX8-NEXT: v_cmp_class_f64_e64 vcc, v[24:25], 64 -; GFX8-NEXT: s_waitcnt vmcnt(7) -; GFX8-NEXT: v_cmp_gt_f64_e64 s[4:5], v[24:25], v[37:38] -; GFX8-NEXT: v_cmp_o_f64_e64 s[6:7], v[24:25], v[37:38] -; GFX8-NEXT: v_cndmask_b32_e64 v34, v38, v25, s[4:5] -; GFX8-NEXT: v_cndmask_b32_e64 v35, v39, v34, s[6:7] -; GFX8-NEXT: v_cndmask_b32_e64 v34, v37, v24, s[4:5] -; GFX8-NEXT: v_cndmask_b32_e64 v34, 0, v34, s[6:7] -; GFX8-NEXT: v_cndmask_b32_e32 v24, v34, v24, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v25, v35, v25, vcc -; GFX8-NEXT: v_cmp_class_f64_e64 vcc, v[37:38], 64 -; GFX8-NEXT: v_cmp_eq_f64_e64 s[4:5], 0, v[34:35] -; GFX8-NEXT: v_cndmask_b32_e32 v24, v24, v37, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v25, v25, v38, vcc -; GFX8-NEXT: v_cmp_class_f64_e64 vcc, v[26:27], 64 -; GFX8-NEXT: v_cndmask_b32_e64 v24, v34, v24, s[4:5] -; GFX8-NEXT: v_cndmask_b32_e64 v25, v35, v25, s[4:5] -; GFX8-NEXT: s_waitcnt vmcnt(5) -; GFX8-NEXT: v_cmp_gt_f64_e64 s[6:7], v[26:27], v[48:49] -; GFX8-NEXT: v_cmp_o_f64_e64 s[8:9], v[26:27], v[48:49] -; GFX8-NEXT: v_cmp_class_f64_e64 s[4:5], v[48:49], 64 -; GFX8-NEXT: s_waitcnt vmcnt(3) -; GFX8-NEXT: v_cmp_o_f64_e64 s[10:11], v[28:29], v[50:51] -; GFX8-NEXT: v_cndmask_b32_e64 v36, v49, v27, s[6:7] -; GFX8-NEXT: v_cndmask_b32_e64 v34, v48, v26, s[6:7] -; GFX8-NEXT: 
v_cndmask_b32_e64 v35, v39, v36, s[8:9] -; GFX8-NEXT: v_cndmask_b32_e64 v34, 0, v34, s[8:9] -; GFX8-NEXT: v_cmp_gt_f64_e64 s[8:9], v[28:29], v[50:51] -; GFX8-NEXT: v_cndmask_b32_e32 v26, v34, v26, vcc -; GFX8-NEXT: v_cmp_eq_f64_e64 s[6:7], 0, v[34:35] -; GFX8-NEXT: v_cndmask_b32_e32 v27, v35, v27, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v26, v26, v48, s[4:5] -; GFX8-NEXT: v_cndmask_b32_e64 v27, v27, v49, s[4:5] -; GFX8-NEXT: v_cndmask_b32_e64 v36, v51, v29, s[8:9] -; GFX8-NEXT: v_cndmask_b32_e64 v27, v35, v27, s[6:7] +; GFX8-NEXT: v_cndmask_b32_e64 v28, v28, 0, s[30:31] +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_cmp_u_f64_e64 s[34:35], v[30:31], v[32:33] +; GFX8-NEXT: v_max_f64 v[30:31], v[30:31], v[32:33] +; GFX8-NEXT: v_mov_b32_e32 v32, 0x7ff80000 +; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v32, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v3, v3, v32, s[4:5] +; GFX8-NEXT: v_cndmask_b32_e64 v5, v5, v32, s[6:7] +; GFX8-NEXT: v_cndmask_b32_e64 v7, v7, v32, s[8:9] +; GFX8-NEXT: v_cndmask_b32_e64 v9, v9, v32, s[10:11] +; GFX8-NEXT: v_cndmask_b32_e64 v11, v11, v32, s[12:13] +; GFX8-NEXT: v_cndmask_b32_e64 v13, v13, v32, s[14:15] +; GFX8-NEXT: v_cndmask_b32_e64 v15, v15, v32, s[16:17] +; GFX8-NEXT: v_cndmask_b32_e64 v17, v17, v32, s[18:19] +; GFX8-NEXT: v_cndmask_b32_e64 v19, v19, v32, s[20:21] +; GFX8-NEXT: v_cndmask_b32_e64 v21, v21, v32, s[22:23] +; GFX8-NEXT: v_cndmask_b32_e64 v23, v23, v32, s[24:25] +; GFX8-NEXT: v_cndmask_b32_e64 v25, v25, v32, s[26:27] +; GFX8-NEXT: v_cndmask_b32_e64 v27, v27, v32, s[28:29] +; GFX8-NEXT: v_cndmask_b32_e64 v29, v29, v32, s[30:31] +; GFX8-NEXT: v_cndmask_b32_e64 v31, v31, v32, s[34:35] +; GFX8-NEXT: v_cndmask_b32_e64 v30, v30, 0, s[34:35] +; GFX8-NEXT: v_readlane_b32 s35, v34, 3 +; GFX8-NEXT: v_readlane_b32 s34, v34, 2 +; GFX8-NEXT: v_readlane_b32 s31, v34, 1 +; GFX8-NEXT: v_readlane_b32 s30, v34, 0 +; GFX8-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GFX8-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; 
GFX8-NEXT: s_mov_b64 exec, s[4:5] ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_cmp_gt_f64_e32 vcc, v[30:31], v[32:33] -; GFX8-NEXT: v_cmp_o_f64_e64 s[4:5], v[30:31], v[32:33] -; GFX8-NEXT: v_cndmask_b32_e64 v35, v39, v36, s[10:11] -; GFX8-NEXT: v_cndmask_b32_e64 v26, v34, v26, s[6:7] -; GFX8-NEXT: v_cndmask_b32_e64 v34, v50, v28, s[8:9] -; GFX8-NEXT: v_cmp_class_f64_e64 s[6:7], v[28:29], 64 -; GFX8-NEXT: v_cndmask_b32_e64 v34, 0, v34, s[10:11] -; GFX8-NEXT: v_cmp_class_f64_e64 s[8:9], v[50:51], 64 -; GFX8-NEXT: v_cndmask_b32_e32 v36, v33, v31, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v37, v39, v36, s[4:5] -; GFX8-NEXT: v_cndmask_b32_e32 v36, v32, v30, vcc -; GFX8-NEXT: v_cmp_class_f64_e64 vcc, v[30:31], 64 -; GFX8-NEXT: v_cndmask_b32_e64 v36, 0, v36, s[4:5] -; GFX8-NEXT: v_cmp_class_f64_e64 s[4:5], v[32:33], 64 -; GFX8-NEXT: v_cmp_eq_f64_e64 s[10:11], 0, v[34:35] -; GFX8-NEXT: v_cmp_eq_f64_e64 s[12:13], 0, v[36:37] -; GFX8-NEXT: v_cndmask_b32_e64 v28, v34, v28, s[6:7] -; GFX8-NEXT: v_cndmask_b32_e64 v29, v35, v29, s[6:7] -; GFX8-NEXT: v_cndmask_b32_e64 v28, v28, v50, s[8:9] -; GFX8-NEXT: v_cndmask_b32_e32 v30, v36, v30, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v31, v37, v31, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v30, v30, v32, s[4:5] -; GFX8-NEXT: v_cndmask_b32_e64 v29, v29, v51, s[8:9] -; GFX8-NEXT: v_cndmask_b32_e64 v31, v31, v33, s[4:5] -; GFX8-NEXT: v_cndmask_b32_e64 v28, v34, v28, s[10:11] -; GFX8-NEXT: v_cndmask_b32_e64 v29, v35, v29, s[10:11] -; GFX8-NEXT: v_cndmask_b32_e64 v30, v36, v30, s[12:13] -; GFX8-NEXT: v_cndmask_b32_e64 v31, v37, v31, s[12:13] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_maximum_v16f64: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; GFX9-NEXT: s_mov_b64 exec, s[4:5] ; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:8 ; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 
offset:4 -; GFX9-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:16 -; GFX9-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:12 -; GFX9-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:24 -; GFX9-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:20 -; GFX9-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:32 -; GFX9-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:28 -; GFX9-NEXT: v_mov_b32_e32 v39, 0x7ff80000 -; GFX9-NEXT: s_waitcnt vmcnt(6) -; GFX9-NEXT: v_cmp_gt_f64_e32 vcc, v[0:1], v[31:32] -; GFX9-NEXT: v_cmp_o_f64_e64 s[4:5], v[0:1], v[31:32] -; GFX9-NEXT: s_waitcnt vmcnt(4) -; GFX9-NEXT: v_cmp_o_f64_e64 s[6:7], v[2:3], v[33:34] -; GFX9-NEXT: v_cndmask_b32_e32 v48, v32, v1, vcc -; GFX9-NEXT: v_cndmask_b32_e64 v49, v39, v48, s[4:5] -; GFX9-NEXT: v_cndmask_b32_e32 v48, v31, v0, vcc -; GFX9-NEXT: v_cmp_class_f64_e64 vcc, v[0:1], 64 -; GFX9-NEXT: v_cndmask_b32_e64 v48, 0, v48, s[4:5] -; GFX9-NEXT: v_cmp_class_f64_e64 s[4:5], v[31:32], 64 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v48, v0, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v1, v49, v1, vcc -; GFX9-NEXT: v_cmp_gt_f64_e32 vcc, v[2:3], v[33:34] -; GFX9-NEXT: v_cndmask_b32_e64 v0, v0, v31, s[4:5] -; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, v32, s[4:5] -; GFX9-NEXT: v_cmp_class_f64_e64 s[4:5], v[33:34], 64 -; GFX9-NEXT: v_cndmask_b32_e32 v50, v34, v3, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v31, v33, v2, vcc -; GFX9-NEXT: v_cmp_class_f64_e64 vcc, v[2:3], 64 -; GFX9-NEXT: v_cndmask_b32_e64 v32, v39, v50, s[6:7] -; GFX9-NEXT: v_cndmask_b32_e64 v31, 0, v31, s[6:7] -; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_cmp_o_f64_e64 s[6:7], v[4:5], v[35:36] -; GFX9-NEXT: v_cndmask_b32_e32 v2, v31, v2, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v3, v32, v3, vcc -; GFX9-NEXT: v_cmp_gt_f64_e32 vcc, v[4:5], v[35:36] -; GFX9-NEXT: v_cndmask_b32_e64 v2, v2, v33, s[4:5] -; GFX9-NEXT: v_cndmask_b32_e64 v3, v3, v34, s[4:5] -; GFX9-NEXT: v_cmp_class_f64_e64 s[4:5], v[35:36], 64 -; GFX9-NEXT: v_cndmask_b32_e32 v50, v36, v5, vcc -; 
GFX9-NEXT: v_cndmask_b32_e32 v33, v35, v4, vcc -; GFX9-NEXT: v_cmp_class_f64_e64 vcc, v[4:5], 64 -; GFX9-NEXT: v_cndmask_b32_e64 v34, v39, v50, s[6:7] -; GFX9-NEXT: v_cndmask_b32_e64 v33, 0, v33, s[6:7] +; GFX9-NEXT: v_writelane_b32 v34, s30, 0 +; GFX9-NEXT: v_writelane_b32 v34, s31, 1 +; GFX9-NEXT: v_writelane_b32 v34, s34, 2 +; GFX9-NEXT: v_writelane_b32 v34, s35, 3 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[31:32] +; GFX9-NEXT: v_max_f64 v[0:1], v[0:1], v[31:32] +; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:16 +; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:12 +; GFX9-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_cmp_o_f64_e64 s[6:7], v[6:7], v[37:38] -; GFX9-NEXT: v_cndmask_b32_e32 v4, v33, v4, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v5, v34, v5, vcc -; GFX9-NEXT: v_cmp_gt_f64_e32 vcc, v[6:7], v[37:38] -; GFX9-NEXT: v_cndmask_b32_e64 v4, v4, v35, s[4:5] -; GFX9-NEXT: v_cndmask_b32_e64 v5, v5, v36, s[4:5] -; GFX9-NEXT: v_cmp_class_f64_e64 s[4:5], v[37:38], 64 -; GFX9-NEXT: v_cndmask_b32_e32 v50, v38, v7, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v35, v37, v6, vcc -; GFX9-NEXT: v_cmp_class_f64_e64 vcc, v[6:7], 64 -; GFX9-NEXT: v_cndmask_b32_e64 v36, v39, v50, s[6:7] -; GFX9-NEXT: v_cndmask_b32_e64 v35, 0, v35, s[6:7] -; GFX9-NEXT: v_cndmask_b32_e32 v6, v35, v6, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v7, v36, v7, vcc -; GFX9-NEXT: v_cndmask_b32_e64 v6, v6, v37, s[4:5] -; GFX9-NEXT: v_cndmask_b32_e64 v7, v7, v38, s[4:5] -; GFX9-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:36 -; GFX9-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:40 -; GFX9-NEXT: v_cmp_eq_f64_e32 vcc, 0, v[48:49] -; GFX9-NEXT: v_cndmask_b32_e32 v0, v48, v0, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v1, v49, v1, vcc -; GFX9-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:48 -; GFX9-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:44 -; GFX9-NEXT: v_cmp_eq_f64_e32 vcc, 0, v[31:32] -; GFX9-NEXT: 
v_cndmask_b32_e32 v2, v31, v2, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v3, v32, v3, vcc +; GFX9-NEXT: v_cmp_u_f64_e64 s[4:5], v[2:3], v[31:32] +; GFX9-NEXT: v_max_f64 v[2:3], v[2:3], v[31:32] +; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:24 +; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:20 +; GFX9-NEXT: v_cndmask_b32_e64 v2, v2, 0, s[4:5] +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_cmp_u_f64_e64 s[6:7], v[4:5], v[31:32] +; GFX9-NEXT: v_max_f64 v[4:5], v[4:5], v[31:32] +; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:32 +; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:28 +; GFX9-NEXT: v_cndmask_b32_e64 v4, v4, 0, s[6:7] +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_cmp_u_f64_e64 s[8:9], v[6:7], v[31:32] +; GFX9-NEXT: v_max_f64 v[6:7], v[6:7], v[31:32] +; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:36 +; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:40 +; GFX9-NEXT: v_cndmask_b32_e64 v6, v6, 0, s[8:9] +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_cmp_u_f64_e64 s[10:11], v[8:9], v[31:32] +; GFX9-NEXT: v_max_f64 v[8:9], v[8:9], v[31:32] +; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:48 +; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:44 +; GFX9-NEXT: v_cndmask_b32_e64 v8, v8, 0, s[10:11] +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_cmp_u_f64_e64 s[12:13], v[10:11], v[31:32] +; GFX9-NEXT: v_max_f64 v[10:11], v[10:11], v[31:32] ; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:56 ; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:52 -; GFX9-NEXT: v_cmp_eq_f64_e32 vcc, 0, v[33:34] -; GFX9-NEXT: v_cndmask_b32_e32 v4, v33, v4, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v5, v34, v5, vcc -; GFX9-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:64 -; GFX9-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:60 -; GFX9-NEXT: v_cmp_eq_f64_e32 vcc, 0, v[35:36] -; GFX9-NEXT: v_cndmask_b32_e32 v6, v35, v6, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v7, v36, 
v7, vcc -; GFX9-NEXT: v_cmp_class_f64_e64 vcc, v[8:9], 64 -; GFX9-NEXT: s_waitcnt vmcnt(6) -; GFX9-NEXT: v_cmp_gt_f64_e64 s[4:5], v[8:9], v[37:38] -; GFX9-NEXT: v_cmp_o_f64_e64 s[6:7], v[8:9], v[37:38] -; GFX9-NEXT: v_cndmask_b32_e64 v50, v38, v9, s[4:5] -; GFX9-NEXT: v_cndmask_b32_e64 v35, v37, v8, s[4:5] -; GFX9-NEXT: v_cndmask_b32_e64 v36, v39, v50, s[6:7] -; GFX9-NEXT: v_cndmask_b32_e64 v35, 0, v35, s[6:7] -; GFX9-NEXT: v_cmp_class_f64_e64 s[4:5], v[37:38], 64 -; GFX9-NEXT: v_cndmask_b32_e32 v8, v35, v8, vcc -; GFX9-NEXT: v_cmp_eq_f64_e64 s[6:7], 0, v[35:36] -; GFX9-NEXT: v_cndmask_b32_e32 v9, v36, v9, vcc -; GFX9-NEXT: s_waitcnt vmcnt(4) -; GFX9-NEXT: v_cmp_gt_f64_e32 vcc, v[10:11], v[48:49] -; GFX9-NEXT: v_cmp_o_f64_e64 s[8:9], v[10:11], v[48:49] -; GFX9-NEXT: v_cndmask_b32_e64 v8, v8, v37, s[4:5] -; GFX9-NEXT: v_cndmask_b32_e64 v9, v9, v38, s[4:5] -; GFX9-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:68 -; GFX9-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:72 -; GFX9-NEXT: v_cndmask_b32_e64 v8, v35, v8, s[6:7] -; GFX9-NEXT: v_cndmask_b32_e32 v50, v49, v11, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v35, v48, v10, vcc -; GFX9-NEXT: v_cmp_class_f64_e64 vcc, v[10:11], 64 -; GFX9-NEXT: v_cmp_class_f64_e64 s[4:5], v[48:49], 64 -; GFX9-NEXT: v_cndmask_b32_e64 v51, v39, v50, s[8:9] -; GFX9-NEXT: v_cndmask_b32_e64 v50, 0, v35, s[8:9] -; GFX9-NEXT: v_cndmask_b32_e64 v9, v36, v9, s[6:7] -; GFX9-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:80 -; GFX9-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:76 -; GFX9-NEXT: s_waitcnt vmcnt(6) -; GFX9-NEXT: v_cmp_o_f64_e64 s[6:7], v[12:13], v[31:32] -; GFX9-NEXT: v_cndmask_b32_e32 v10, v50, v10, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v11, v51, v11, vcc -; GFX9-NEXT: v_cndmask_b32_e64 v10, v10, v48, s[4:5] -; GFX9-NEXT: v_cndmask_b32_e64 v11, v11, v49, s[4:5] -; GFX9-NEXT: v_cmp_eq_f64_e32 vcc, 0, v[50:51] -; GFX9-NEXT: v_cmp_gt_f64_e64 s[4:5], v[12:13], v[31:32] -; GFX9-NEXT: v_cndmask_b32_e32 v10, v50, v10, vcc 
-; GFX9-NEXT: v_cndmask_b32_e64 v48, v32, v13, s[4:5] -; GFX9-NEXT: v_cndmask_b32_e32 v11, v51, v11, vcc -; GFX9-NEXT: v_cmp_class_f64_e64 vcc, v[12:13], 64 -; GFX9-NEXT: v_cndmask_b32_e64 v49, v39, v48, s[6:7] -; GFX9-NEXT: v_cndmask_b32_e64 v48, v31, v12, s[4:5] -; GFX9-NEXT: v_cmp_class_f64_e64 s[4:5], v[31:32], 64 -; GFX9-NEXT: v_cndmask_b32_e64 v48, 0, v48, s[6:7] -; GFX9-NEXT: s_waitcnt vmcnt(4) -; GFX9-NEXT: v_cmp_o_f64_e64 s[6:7], v[14:15], v[33:34] -; GFX9-NEXT: v_cndmask_b32_e32 v12, v48, v12, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v13, v49, v13, vcc -; GFX9-NEXT: v_cmp_eq_f64_e32 vcc, 0, v[48:49] -; GFX9-NEXT: v_cndmask_b32_e64 v12, v12, v31, s[4:5] -; GFX9-NEXT: v_cndmask_b32_e64 v13, v13, v32, s[4:5] +; GFX9-NEXT: v_cndmask_b32_e64 v10, v10, 0, s[12:13] +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_cmp_u_f64_e64 s[14:15], v[12:13], v[31:32] +; GFX9-NEXT: v_max_f64 v[12:13], v[12:13], v[31:32] +; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:64 +; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:60 +; GFX9-NEXT: v_cndmask_b32_e64 v12, v12, 0, s[14:15] +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_cmp_u_f64_e64 s[16:17], v[14:15], v[31:32] +; GFX9-NEXT: v_max_f64 v[14:15], v[14:15], v[31:32] +; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:68 +; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:72 +; GFX9-NEXT: v_cndmask_b32_e64 v14, v14, 0, s[16:17] +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_cmp_u_f64_e64 s[18:19], v[16:17], v[31:32] +; GFX9-NEXT: v_max_f64 v[16:17], v[16:17], v[31:32] +; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:80 +; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:76 +; GFX9-NEXT: v_cndmask_b32_e64 v16, v16, 0, s[18:19] +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_cmp_u_f64_e64 s[20:21], v[18:19], v[31:32] +; GFX9-NEXT: v_max_f64 v[18:19], v[18:19], v[31:32] ; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:88 ; GFX9-NEXT: buffer_load_dword v31, 
off, s[0:3], s32 offset:84 -; GFX9-NEXT: v_cmp_gt_f64_e64 s[4:5], v[14:15], v[33:34] -; GFX9-NEXT: v_cndmask_b32_e32 v12, v48, v12, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v13, v49, v13, vcc -; GFX9-NEXT: v_cmp_class_f64_e64 vcc, v[14:15], 64 -; GFX9-NEXT: v_cndmask_b32_e64 v50, v34, v15, s[4:5] -; GFX9-NEXT: v_cndmask_b32_e64 v48, v33, v14, s[4:5] -; GFX9-NEXT: v_cmp_class_f64_e64 s[4:5], v[33:34], 64 -; GFX9-NEXT: v_cndmask_b32_e64 v49, v39, v50, s[6:7] -; GFX9-NEXT: v_cndmask_b32_e64 v48, 0, v48, s[6:7] -; GFX9-NEXT: v_cndmask_b32_e32 v14, v48, v14, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v15, v49, v15, vcc -; GFX9-NEXT: v_cmp_eq_f64_e32 vcc, 0, v[48:49] -; GFX9-NEXT: v_cndmask_b32_e64 v14, v14, v33, s[4:5] -; GFX9-NEXT: v_cndmask_b32_e64 v15, v15, v34, s[4:5] -; GFX9-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:96 -; GFX9-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:92 -; GFX9-NEXT: v_cndmask_b32_e32 v14, v48, v14, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v15, v49, v15, vcc -; GFX9-NEXT: v_cmp_class_f64_e64 vcc, v[16:17], 64 -; GFX9-NEXT: s_waitcnt vmcnt(6) -; GFX9-NEXT: v_cmp_gt_f64_e64 s[4:5], v[16:17], v[37:38] -; GFX9-NEXT: v_cmp_o_f64_e64 s[6:7], v[16:17], v[37:38] -; GFX9-NEXT: v_cndmask_b32_e64 v50, v38, v17, s[4:5] -; GFX9-NEXT: v_cndmask_b32_e64 v48, v37, v16, s[4:5] -; GFX9-NEXT: v_cndmask_b32_e64 v49, v39, v50, s[6:7] -; GFX9-NEXT: v_cndmask_b32_e64 v48, 0, v48, s[6:7] -; GFX9-NEXT: v_cndmask_b32_e32 v16, v48, v16, vcc -; GFX9-NEXT: v_cmp_class_f64_e64 s[4:5], v[37:38], 64 -; GFX9-NEXT: v_cndmask_b32_e32 v17, v49, v17, vcc -; GFX9-NEXT: s_waitcnt vmcnt(4) -; GFX9-NEXT: v_cmp_gt_f64_e32 vcc, v[18:19], v[35:36] -; GFX9-NEXT: v_cmp_o_f64_e64 s[6:7], v[18:19], v[35:36] -; GFX9-NEXT: v_cndmask_b32_e64 v16, v16, v37, s[4:5] -; GFX9-NEXT: v_cndmask_b32_e64 v17, v17, v38, s[4:5] -; GFX9-NEXT: v_cndmask_b32_e32 v50, v36, v19, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v37, v35, v18, vcc -; GFX9-NEXT: v_cmp_eq_f64_e32 vcc, 0, v[48:49] -; GFX9-NEXT: 
v_cmp_class_f64_e64 s[4:5], v[18:19], 64 -; GFX9-NEXT: v_cndmask_b32_e64 v51, v39, v50, s[6:7] -; GFX9-NEXT: v_cndmask_b32_e64 v50, 0, v37, s[6:7] -; GFX9-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:100 -; GFX9-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:104 -; GFX9-NEXT: v_cndmask_b32_e32 v16, v48, v16, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v17, v49, v17, vcc -; GFX9-NEXT: v_cmp_class_f64_e64 vcc, v[35:36], 64 -; GFX9-NEXT: v_cndmask_b32_e64 v18, v50, v18, s[4:5] -; GFX9-NEXT: v_cndmask_b32_e64 v19, v51, v19, s[4:5] -; GFX9-NEXT: s_waitcnt vmcnt(4) -; GFX9-NEXT: v_cmp_gt_f64_e64 s[4:5], v[20:21], v[31:32] -; GFX9-NEXT: v_cmp_o_f64_e64 s[6:7], v[20:21], v[31:32] -; GFX9-NEXT: v_cndmask_b32_e32 v18, v18, v35, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v19, v19, v36, vcc -; GFX9-NEXT: v_cmp_eq_f64_e32 vcc, 0, v[50:51] -; GFX9-NEXT: v_cndmask_b32_e64 v48, v32, v21, s[4:5] -; GFX9-NEXT: v_cndmask_b32_e64 v35, v31, v20, s[4:5] -; GFX9-NEXT: v_cmp_class_f64_e64 s[4:5], v[20:21], 64 -; GFX9-NEXT: v_cndmask_b32_e64 v36, v39, v48, s[6:7] -; GFX9-NEXT: v_cndmask_b32_e64 v35, 0, v35, s[6:7] -; GFX9-NEXT: v_cndmask_b32_e32 v18, v50, v18, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v19, v51, v19, vcc -; GFX9-NEXT: v_cmp_class_f64_e64 vcc, v[31:32], 64 -; GFX9-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:112 -; GFX9-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:108 -; GFX9-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:120 -; GFX9-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:116 -; GFX9-NEXT: v_cndmask_b32_e64 v20, v35, v20, s[4:5] -; GFX9-NEXT: v_cndmask_b32_e64 v21, v36, v21, s[4:5] -; GFX9-NEXT: v_cmp_eq_f64_e64 s[4:5], 0, v[35:36] -; GFX9-NEXT: s_waitcnt vmcnt(6) -; GFX9-NEXT: v_cmp_gt_f64_e64 s[6:7], v[22:23], v[33:34] -; GFX9-NEXT: v_cmp_o_f64_e64 s[8:9], v[22:23], v[33:34] -; GFX9-NEXT: v_cndmask_b32_e32 v20, v20, v31, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v21, v21, v32, vcc -; GFX9-NEXT: v_cmp_class_f64_e64 vcc, v[22:23], 64 -; GFX9-NEXT: 
v_cndmask_b32_e64 v20, v35, v20, s[4:5] -; GFX9-NEXT: v_cndmask_b32_e64 v21, v36, v21, s[4:5] -; GFX9-NEXT: v_cmp_class_f64_e64 s[4:5], v[33:34], 64 -; GFX9-NEXT: v_cndmask_b32_e64 v31, v34, v23, s[6:7] -; GFX9-NEXT: v_cndmask_b32_e64 v36, v39, v31, s[8:9] -; GFX9-NEXT: v_cndmask_b32_e64 v31, v33, v22, s[6:7] -; GFX9-NEXT: v_cndmask_b32_e64 v35, 0, v31, s[8:9] -; GFX9-NEXT: v_cndmask_b32_e32 v22, v35, v22, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v23, v36, v23, vcc -; GFX9-NEXT: v_cmp_eq_f64_e32 vcc, 0, v[35:36] -; GFX9-NEXT: v_cndmask_b32_e64 v22, v22, v33, s[4:5] +; GFX9-NEXT: v_cndmask_b32_e64 v18, v18, 0, s[20:21] +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_cmp_u_f64_e64 s[22:23], v[20:21], v[31:32] +; GFX9-NEXT: v_max_f64 v[20:21], v[20:21], v[31:32] +; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:96 +; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:92 +; GFX9-NEXT: v_cndmask_b32_e64 v20, v20, 0, s[22:23] +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_cmp_u_f64_e64 s[24:25], v[22:23], v[31:32] +; GFX9-NEXT: v_max_f64 v[22:23], v[22:23], v[31:32] +; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:100 +; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:104 +; GFX9-NEXT: v_cndmask_b32_e64 v22, v22, 0, s[24:25] +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_cmp_u_f64_e64 s[26:27], v[24:25], v[31:32] +; GFX9-NEXT: v_max_f64 v[24:25], v[24:25], v[31:32] +; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:112 +; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:108 +; GFX9-NEXT: v_cndmask_b32_e64 v24, v24, 0, s[26:27] +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_cmp_u_f64_e64 s[28:29], v[26:27], v[31:32] +; GFX9-NEXT: v_max_f64 v[26:27], v[26:27], v[31:32] +; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:120 +; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:116 +; GFX9-NEXT: v_cndmask_b32_e64 v26, v26, 0, s[28:29] +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_cmp_u_f64_e64 
s[30:31], v[28:29], v[31:32] +; GFX9-NEXT: v_max_f64 v[28:29], v[28:29], v[31:32] ; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 ; GFX9-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:128 ; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:124 -; GFX9-NEXT: v_cndmask_b32_e64 v23, v23, v34, s[4:5] -; GFX9-NEXT: v_cndmask_b32_e32 v22, v35, v22, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v23, v36, v23, vcc -; GFX9-NEXT: v_cmp_class_f64_e64 vcc, v[24:25], 64 -; GFX9-NEXT: s_waitcnt vmcnt(7) -; GFX9-NEXT: v_cmp_gt_f64_e64 s[4:5], v[24:25], v[37:38] -; GFX9-NEXT: v_cmp_o_f64_e64 s[6:7], v[24:25], v[37:38] -; GFX9-NEXT: v_cndmask_b32_e64 v34, v38, v25, s[4:5] -; GFX9-NEXT: v_cndmask_b32_e64 v35, v39, v34, s[6:7] -; GFX9-NEXT: v_cndmask_b32_e64 v34, v37, v24, s[4:5] -; GFX9-NEXT: v_cndmask_b32_e64 v34, 0, v34, s[6:7] -; GFX9-NEXT: v_cndmask_b32_e32 v24, v34, v24, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v25, v35, v25, vcc -; GFX9-NEXT: v_cmp_class_f64_e64 vcc, v[37:38], 64 -; GFX9-NEXT: v_cmp_eq_f64_e64 s[4:5], 0, v[34:35] -; GFX9-NEXT: v_cndmask_b32_e32 v24, v24, v37, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v25, v25, v38, vcc -; GFX9-NEXT: v_cmp_class_f64_e64 vcc, v[26:27], 64 -; GFX9-NEXT: v_cndmask_b32_e64 v24, v34, v24, s[4:5] -; GFX9-NEXT: v_cndmask_b32_e64 v25, v35, v25, s[4:5] -; GFX9-NEXT: s_waitcnt vmcnt(5) -; GFX9-NEXT: v_cmp_gt_f64_e64 s[6:7], v[26:27], v[48:49] -; GFX9-NEXT: v_cmp_o_f64_e64 s[8:9], v[26:27], v[48:49] -; GFX9-NEXT: v_cmp_class_f64_e64 s[4:5], v[48:49], 64 -; GFX9-NEXT: s_waitcnt vmcnt(3) -; GFX9-NEXT: v_cmp_o_f64_e64 s[10:11], v[28:29], v[50:51] -; GFX9-NEXT: v_cndmask_b32_e64 v36, v49, v27, s[6:7] -; GFX9-NEXT: v_cndmask_b32_e64 v34, v48, v26, s[6:7] -; GFX9-NEXT: v_cndmask_b32_e64 v35, v39, v36, s[8:9] -; GFX9-NEXT: v_cndmask_b32_e64 v34, 0, v34, s[8:9] -; GFX9-NEXT: v_cmp_gt_f64_e64 s[8:9], v[28:29], v[50:51] -; GFX9-NEXT: v_cndmask_b32_e32 v26, v34, v26, vcc -; GFX9-NEXT: v_cmp_eq_f64_e64 s[6:7], 0, v[34:35] -; GFX9-NEXT: 
v_cndmask_b32_e32 v27, v35, v27, vcc -; GFX9-NEXT: v_cndmask_b32_e64 v26, v26, v48, s[4:5] -; GFX9-NEXT: v_cndmask_b32_e64 v27, v27, v49, s[4:5] -; GFX9-NEXT: v_cndmask_b32_e64 v36, v51, v29, s[8:9] -; GFX9-NEXT: v_cndmask_b32_e64 v27, v35, v27, s[6:7] +; GFX9-NEXT: v_cndmask_b32_e64 v28, v28, 0, s[30:31] +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_cmp_u_f64_e64 s[34:35], v[30:31], v[32:33] +; GFX9-NEXT: v_max_f64 v[30:31], v[30:31], v[32:33] +; GFX9-NEXT: v_mov_b32_e32 v32, 0x7ff80000 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v32, vcc +; GFX9-NEXT: v_cndmask_b32_e64 v3, v3, v32, s[4:5] +; GFX9-NEXT: v_cndmask_b32_e64 v5, v5, v32, s[6:7] +; GFX9-NEXT: v_cndmask_b32_e64 v7, v7, v32, s[8:9] +; GFX9-NEXT: v_cndmask_b32_e64 v9, v9, v32, s[10:11] +; GFX9-NEXT: v_cndmask_b32_e64 v11, v11, v32, s[12:13] +; GFX9-NEXT: v_cndmask_b32_e64 v13, v13, v32, s[14:15] +; GFX9-NEXT: v_cndmask_b32_e64 v15, v15, v32, s[16:17] +; GFX9-NEXT: v_cndmask_b32_e64 v17, v17, v32, s[18:19] +; GFX9-NEXT: v_cndmask_b32_e64 v19, v19, v32, s[20:21] +; GFX9-NEXT: v_cndmask_b32_e64 v21, v21, v32, s[22:23] +; GFX9-NEXT: v_cndmask_b32_e64 v23, v23, v32, s[24:25] +; GFX9-NEXT: v_cndmask_b32_e64 v25, v25, v32, s[26:27] +; GFX9-NEXT: v_cndmask_b32_e64 v27, v27, v32, s[28:29] +; GFX9-NEXT: v_cndmask_b32_e64 v29, v29, v32, s[30:31] +; GFX9-NEXT: v_cndmask_b32_e64 v31, v31, v32, s[34:35] +; GFX9-NEXT: v_cndmask_b32_e64 v30, v30, 0, s[34:35] +; GFX9-NEXT: v_readlane_b32 s35, v34, 3 +; GFX9-NEXT: v_readlane_b32 s34, v34, 2 +; GFX9-NEXT: v_readlane_b32 s31, v34, 1 +; GFX9-NEXT: v_readlane_b32 s30, v34, 0 +; GFX9-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GFX9-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; GFX9-NEXT: s_mov_b64 exec, s[4:5] ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_cmp_gt_f64_e32 vcc, v[30:31], v[32:33] -; GFX9-NEXT: v_cmp_o_f64_e64 s[4:5], v[30:31], v[32:33] -; GFX9-NEXT: v_cndmask_b32_e64 v35, v39, v36, s[10:11] -; GFX9-NEXT: v_cndmask_b32_e64 v26, v34, 
v26, s[6:7] -; GFX9-NEXT: v_cndmask_b32_e64 v34, v50, v28, s[8:9] -; GFX9-NEXT: v_cmp_class_f64_e64 s[6:7], v[28:29], 64 -; GFX9-NEXT: v_cndmask_b32_e64 v34, 0, v34, s[10:11] -; GFX9-NEXT: v_cmp_class_f64_e64 s[8:9], v[50:51], 64 -; GFX9-NEXT: v_cndmask_b32_e32 v36, v33, v31, vcc -; GFX9-NEXT: v_cndmask_b32_e64 v37, v39, v36, s[4:5] -; GFX9-NEXT: v_cndmask_b32_e32 v36, v32, v30, vcc -; GFX9-NEXT: v_cmp_class_f64_e64 vcc, v[30:31], 64 -; GFX9-NEXT: v_cndmask_b32_e64 v36, 0, v36, s[4:5] -; GFX9-NEXT: v_cmp_class_f64_e64 s[4:5], v[32:33], 64 -; GFX9-NEXT: v_cmp_eq_f64_e64 s[10:11], 0, v[34:35] -; GFX9-NEXT: v_cmp_eq_f64_e64 s[12:13], 0, v[36:37] -; GFX9-NEXT: v_cndmask_b32_e64 v28, v34, v28, s[6:7] -; GFX9-NEXT: v_cndmask_b32_e64 v29, v35, v29, s[6:7] -; GFX9-NEXT: v_cndmask_b32_e64 v28, v28, v50, s[8:9] -; GFX9-NEXT: v_cndmask_b32_e32 v30, v36, v30, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v31, v37, v31, vcc -; GFX9-NEXT: v_cndmask_b32_e64 v30, v30, v32, s[4:5] -; GFX9-NEXT: v_cndmask_b32_e64 v29, v29, v51, s[8:9] -; GFX9-NEXT: v_cndmask_b32_e64 v31, v31, v33, s[4:5] -; GFX9-NEXT: v_cndmask_b32_e64 v28, v34, v28, s[10:11] -; GFX9-NEXT: v_cndmask_b32_e64 v29, v35, v29, s[10:11] -; GFX9-NEXT: v_cndmask_b32_e64 v30, v36, v30, s[12:13] -; GFX9-NEXT: v_cndmask_b32_e64 v31, v37, v31, s[12:13] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: v_maximum_v16f64: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_accvgpr_write_b32 a0, v40 ; Reload Reuse -; GFX940-NEXT: v_accvgpr_write_b32 a1, v41 ; Reload Reuse -; GFX940-NEXT: v_accvgpr_write_b32 a2, v42 ; Reload Reuse -; GFX940-NEXT: v_accvgpr_write_b32 a3, v43 ; Reload Reuse -; GFX940-NEXT: v_accvgpr_write_b32 a4, v44 ; Reload Reuse -; GFX940-NEXT: v_accvgpr_write_b32 a5, v45 ; Reload Reuse -; GFX940-NEXT: v_accvgpr_write_b32 a6, v46 ; Reload Reuse -; GFX940-NEXT: v_accvgpr_write_b32 a7, v47 ; Reload Reuse -; GFX940-NEXT: scratch_load_dword v41, off, s32 offset:8 -; GFX940-NEXT: 
scratch_load_dword v40, off, s32 offset:4 -; GFX940-NEXT: scratch_load_dword v51, off, s32 offset:16 -; GFX940-NEXT: scratch_load_dword v50, off, s32 offset:12 -; GFX940-NEXT: scratch_load_dword v45, off, s32 offset:24 -; GFX940-NEXT: scratch_load_dword v44, off, s32 offset:20 -; GFX940-NEXT: scratch_load_dword v47, off, s32 offset:32 -; GFX940-NEXT: scratch_load_dword v46, off, s32 offset:28 +; GFX940-NEXT: v_accvgpr_write_b32 a1, v40 ; Reload Reuse +; GFX940-NEXT: v_accvgpr_write_b32 a2, v41 ; Reload Reuse +; GFX940-NEXT: v_accvgpr_write_b32 a3, v42 ; Reload Reuse +; GFX940-NEXT: v_accvgpr_write_b32 a4, v43 ; Reload Reuse +; GFX940-NEXT: v_accvgpr_write_b32 a5, v44 ; Reload Reuse +; GFX940-NEXT: v_accvgpr_write_b32 a6, v45 ; Reload Reuse +; GFX940-NEXT: v_accvgpr_write_b32 a7, v46 ; Reload Reuse +; GFX940-NEXT: v_accvgpr_write_b32 a8, v47 ; Reload Reuse +; GFX940-NEXT: v_accvgpr_write_b32 a9, v56 ; Reload Reuse +; GFX940-NEXT: v_accvgpr_write_b32 a10, v57 ; Reload Reuse +; GFX940-NEXT: scratch_load_dword v37, off, s32 offset:16 +; GFX940-NEXT: scratch_load_dword v36, off, s32 offset:12 +; GFX940-NEXT: scratch_load_dword v39, off, s32 offset:24 +; GFX940-NEXT: scratch_load_dword v38, off, s32 offset:20 +; GFX940-NEXT: scratch_load_dword v49, off, s32 offset:32 +; GFX940-NEXT: scratch_load_dword v48, off, s32 offset:28 +; GFX940-NEXT: scratch_load_dword v57, off, s32 offset:8 +; GFX940-NEXT: scratch_load_dword v56, off, s32 offset:4 +; GFX940-NEXT: scratch_load_dword v47, off, s32 offset:40 +; GFX940-NEXT: scratch_load_dword v46, off, s32 offset:36 +; GFX940-NEXT: scratch_load_dword v45, off, s32 offset:48 +; GFX940-NEXT: scratch_load_dword v44, off, s32 offset:44 +; GFX940-NEXT: scratch_load_dword v43, off, s32 offset:56 +; GFX940-NEXT: scratch_load_dword v42, off, s32 offset:52 +; GFX940-NEXT: scratch_load_dword v41, off, s32 offset:64 +; GFX940-NEXT: scratch_load_dword v40, off, s32 offset:60 +; GFX940-NEXT: scratch_load_dword v55, off, s32 offset:72 +; 
GFX940-NEXT: scratch_load_dword v54, off, s32 offset:68 +; GFX940-NEXT: scratch_load_dword v53, off, s32 offset:80 +; GFX940-NEXT: scratch_load_dword v52, off, s32 offset:76 +; GFX940-NEXT: scratch_load_dword v51, off, s32 offset:88 +; GFX940-NEXT: scratch_load_dword v50, off, s32 offset:84 +; GFX940-NEXT: scratch_load_dword v35, off, s32 offset:96 +; GFX940-NEXT: scratch_load_dword v34, off, s32 offset:92 ; GFX940-NEXT: scratch_load_dword v31, off, s32 -; GFX940-NEXT: scratch_load_dword v33, off, s32 offset:128 -; GFX940-NEXT: scratch_load_dword v32, off, s32 offset:124 -; GFX940-NEXT: scratch_load_dword v35, off, s32 offset:120 -; GFX940-NEXT: scratch_load_dword v34, off, s32 offset:116 -; GFX940-NEXT: scratch_load_dword v43, off, s32 offset:40 -; GFX940-NEXT: scratch_load_dword v42, off, s32 offset:36 +; GFX940-NEXT: scratch_load_dword v33, off, s32 offset:104 +; GFX940-NEXT: scratch_load_dword v32, off, s32 offset:100 +; GFX940-NEXT: v_accvgpr_write_b32 a11, v58 ; Reload Reuse +; GFX940-NEXT: v_accvgpr_write_b32 a12, v59 ; Reload Reuse +; GFX940-NEXT: v_accvgpr_write_b32 a13, v60 ; Reload Reuse +; GFX940-NEXT: v_accvgpr_write_b32 a14, v61 ; Reload Reuse +; GFX940-NEXT: v_accvgpr_write_b32 a15, v62 ; Reload Reuse +; GFX940-NEXT: v_accvgpr_write_b32 a16, v63 ; Reload Reuse +; GFX940-NEXT: s_waitcnt vmcnt(25) +; GFX940-NEXT: v_max_f64 v[58:59], v[2:3], v[36:37] +; GFX940-NEXT: v_cmp_u_f64_e32 vcc, v[2:3], v[36:37] ; GFX940-NEXT: scratch_load_dword v37, off, s32 offset:112 ; GFX940-NEXT: scratch_load_dword v36, off, s32 offset:108 -; GFX940-NEXT: scratch_load_dword v39, off, s32 offset:104 -; GFX940-NEXT: scratch_load_dword v38, off, s32 offset:100 -; GFX940-NEXT: scratch_load_dword v49, off, s32 offset:96 -; GFX940-NEXT: scratch_load_dword v48, off, s32 offset:92 -; GFX940-NEXT: scratch_load_dword v53, off, s32 offset:56 -; GFX940-NEXT: scratch_load_dword v52, off, s32 offset:52 -; GFX940-NEXT: scratch_load_dword v55, off, s32 offset:48 -; GFX940-NEXT: 
scratch_load_dword v54, off, s32 offset:44 -; GFX940-NEXT: v_accvgpr_write_b32 a8, v56 ; Reload Reuse -; GFX940-NEXT: v_accvgpr_write_b32 a9, v57 ; Reload Reuse -; GFX940-NEXT: v_mov_b32_e32 v56, 0x7ff80000 -; GFX940-NEXT: v_accvgpr_write_b32 a11, v59 ; Reload Reuse -; GFX940-NEXT: v_accvgpr_write_b32 a10, v58 ; Reload Reuse -; GFX940-NEXT: v_accvgpr_write_b32 a12, v60 ; Reload Reuse -; GFX940-NEXT: v_accvgpr_write_b32 a13, v61 ; Reload Reuse +; GFX940-NEXT: s_waitcnt vmcnt(25) +; GFX940-NEXT: v_max_f64 v[60:61], v[4:5], v[38:39] +; GFX940-NEXT: v_cmp_u_f64_e64 s[0:1], v[4:5], v[38:39] +; GFX940-NEXT: scratch_load_dword v39, off, s32 offset:120 +; GFX940-NEXT: scratch_load_dword v38, off, s32 offset:116 +; GFX940-NEXT: s_waitcnt vmcnt(25) +; GFX940-NEXT: v_max_f64 v[62:63], v[6:7], v[48:49] +; GFX940-NEXT: v_cmp_u_f64_e64 s[2:3], v[6:7], v[48:49] +; GFX940-NEXT: scratch_load_dword v49, off, s32 offset:128 +; GFX940-NEXT: scratch_load_dword v48, off, s32 offset:124 +; GFX940-NEXT: s_waitcnt vmcnt(25) +; GFX940-NEXT: v_max_f64 v[2:3], v[0:1], v[56:57] +; GFX940-NEXT: v_cmp_u_f64_e64 s[4:5], v[0:1], v[56:57] +; GFX940-NEXT: v_mov_b32_e32 v0, 0x7ff80000 ; GFX940-NEXT: s_waitcnt vmcnt(23) -; GFX940-NEXT: v_cmp_gt_f64_e32 vcc, v[0:1], v[40:41] -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v57, v41, v1, vcc -; GFX940-NEXT: v_cmp_o_f64_e64 s[0:1], v[0:1], v[40:41] -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e64 v59, v56, v57, s[0:1] -; GFX940-NEXT: v_cndmask_b32_e32 v57, v40, v0, vcc -; GFX940-NEXT: v_cndmask_b32_e64 v58, 0, v57, s[0:1] -; GFX940-NEXT: v_cmp_class_f64_e64 vcc, v[0:1], 64 -; GFX940-NEXT: v_cmp_class_f64_e64 s[0:1], v[40:41], 64 -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e32 v0, v58, v0, vcc -; GFX940-NEXT: v_cndmask_b32_e32 v1, v59, v1, vcc +; GFX940-NEXT: v_max_f64 v[56:57], v[8:9], v[46:47] +; GFX940-NEXT: v_cndmask_b32_e64 v1, v2, 0, s[4:5] +; GFX940-NEXT: v_accvgpr_write_b32 a0, v1 +; GFX940-NEXT: v_cndmask_b32_e64 v1, 
v3, v0, s[4:5] +; GFX940-NEXT: v_cndmask_b32_e64 v2, v58, 0, vcc +; GFX940-NEXT: v_cndmask_b32_e32 v3, v59, v0, vcc +; GFX940-NEXT: v_cmp_u_f64_e32 vcc, v[8:9], v[46:47] ; GFX940-NEXT: s_waitcnt vmcnt(21) -; GFX940-NEXT: v_cmp_gt_f64_e32 vcc, v[2:3], v[50:51] -; GFX940-NEXT: v_cndmask_b32_e64 v0, v0, v40, s[0:1] -; GFX940-NEXT: v_cndmask_b32_e64 v1, v1, v41, s[0:1] -; GFX940-NEXT: v_cndmask_b32_e32 v57, v51, v3, vcc -; GFX940-NEXT: v_cmp_o_f64_e64 s[0:1], v[2:3], v[50:51] -; GFX940-NEXT: v_cndmask_b32_e32 v40, v50, v2, vcc -; GFX940-NEXT: v_cmp_class_f64_e64 vcc, v[2:3], 64 -; GFX940-NEXT: v_cndmask_b32_e64 v61, v56, v57, s[0:1] -; GFX940-NEXT: v_cndmask_b32_e64 v60, 0, v40, s[0:1] -; GFX940-NEXT: v_cndmask_b32_e32 v2, v60, v2, vcc -; GFX940-NEXT: v_cndmask_b32_e32 v3, v61, v3, vcc -; GFX940-NEXT: v_cmp_class_f64_e64 vcc, v[50:51], 64 -; GFX940-NEXT: scratch_load_dword v41, off, s32 offset:64 -; GFX940-NEXT: scratch_load_dword v40, off, s32 offset:60 -; GFX940-NEXT: v_cndmask_b32_e32 v2, v2, v50, vcc -; GFX940-NEXT: v_cndmask_b32_e32 v3, v3, v51, vcc -; GFX940-NEXT: v_cmp_eq_f64_e32 vcc, 0, v[58:59] -; GFX940-NEXT: s_waitcnt vmcnt(21) -; GFX940-NEXT: v_cmp_o_f64_e64 s[0:1], v[4:5], v[44:45] -; GFX940-NEXT: scratch_load_dword v51, off, s32 offset:88 -; GFX940-NEXT: v_cndmask_b32_e32 v0, v58, v0, vcc -; GFX940-NEXT: v_cndmask_b32_e32 v1, v59, v1, vcc -; GFX940-NEXT: v_cmp_eq_f64_e32 vcc, 0, v[60:61] +; GFX940-NEXT: v_max_f64 v[46:47], v[10:11], v[44:45] +; GFX940-NEXT: v_cndmask_b32_e64 v4, v60, 0, s[0:1] +; GFX940-NEXT: v_cndmask_b32_e64 v8, v56, 0, vcc +; GFX940-NEXT: v_cndmask_b32_e32 v9, v57, v0, vcc +; GFX940-NEXT: v_cmp_u_f64_e32 vcc, v[10:11], v[44:45] +; GFX940-NEXT: s_waitcnt vmcnt(19) +; GFX940-NEXT: v_max_f64 v[44:45], v[12:13], v[42:43] +; GFX940-NEXT: v_cndmask_b32_e64 v5, v61, v0, s[0:1] +; GFX940-NEXT: v_cndmask_b32_e64 v10, v46, 0, vcc +; GFX940-NEXT: v_cndmask_b32_e32 v11, v47, v0, vcc +; GFX940-NEXT: v_cmp_u_f64_e32 vcc, v[12:13], v[42:43] +; 
GFX940-NEXT: s_waitcnt vmcnt(17) +; GFX940-NEXT: v_max_f64 v[42:43], v[14:15], v[40:41] +; GFX940-NEXT: v_cndmask_b32_e64 v6, v62, 0, s[2:3] +; GFX940-NEXT: v_cndmask_b32_e64 v12, v44, 0, vcc +; GFX940-NEXT: v_cndmask_b32_e32 v13, v45, v0, vcc +; GFX940-NEXT: v_cmp_u_f64_e32 vcc, v[14:15], v[40:41] +; GFX940-NEXT: s_waitcnt vmcnt(15) +; GFX940-NEXT: v_max_f64 v[40:41], v[16:17], v[54:55] +; GFX940-NEXT: v_cndmask_b32_e64 v7, v63, v0, s[2:3] +; GFX940-NEXT: v_cndmask_b32_e64 v14, v42, 0, vcc +; GFX940-NEXT: v_cndmask_b32_e32 v15, v43, v0, vcc +; GFX940-NEXT: v_cmp_u_f64_e32 vcc, v[16:17], v[54:55] +; GFX940-NEXT: s_waitcnt vmcnt(13) +; GFX940-NEXT: v_max_f64 v[54:55], v[18:19], v[52:53] +; GFX940-NEXT: v_accvgpr_read_b32 v63, a16 ; Reload Reuse +; GFX940-NEXT: v_cndmask_b32_e64 v16, v40, 0, vcc +; GFX940-NEXT: v_cndmask_b32_e32 v17, v41, v0, vcc +; GFX940-NEXT: v_cmp_u_f64_e32 vcc, v[18:19], v[52:53] +; GFX940-NEXT: s_waitcnt vmcnt(11) +; GFX940-NEXT: v_max_f64 v[52:53], v[20:21], v[50:51] +; GFX940-NEXT: v_accvgpr_read_b32 v62, a15 ; Reload Reuse +; GFX940-NEXT: v_cndmask_b32_e64 v18, v54, 0, vcc +; GFX940-NEXT: v_cndmask_b32_e32 v19, v55, v0, vcc +; GFX940-NEXT: v_cmp_u_f64_e32 vcc, v[20:21], v[50:51] +; GFX940-NEXT: s_waitcnt vmcnt(9) +; GFX940-NEXT: v_max_f64 v[50:51], v[22:23], v[34:35] +; GFX940-NEXT: v_accvgpr_read_b32 v61, a14 ; Reload Reuse +; GFX940-NEXT: v_cndmask_b32_e64 v20, v52, 0, vcc +; GFX940-NEXT: v_cndmask_b32_e32 v21, v53, v0, vcc +; GFX940-NEXT: v_cmp_u_f64_e32 vcc, v[22:23], v[34:35] +; GFX940-NEXT: s_waitcnt vmcnt(6) +; GFX940-NEXT: v_max_f64 v[34:35], v[24:25], v[32:33] +; GFX940-NEXT: v_accvgpr_read_b32 v60, a13 ; Reload Reuse +; GFX940-NEXT: v_cndmask_b32_e64 v22, v50, 0, vcc +; GFX940-NEXT: v_cndmask_b32_e32 v23, v51, v0, vcc +; GFX940-NEXT: v_cmp_u_f64_e32 vcc, v[24:25], v[32:33] +; GFX940-NEXT: v_accvgpr_read_b32 v59, a12 ; Reload Reuse +; GFX940-NEXT: v_accvgpr_read_b32 v58, a11 ; Reload Reuse +; GFX940-NEXT: v_cndmask_b32_e64 v24, v34, 
0, vcc +; GFX940-NEXT: v_cndmask_b32_e32 v25, v35, v0, vcc +; GFX940-NEXT: v_accvgpr_read_b32 v57, a10 ; Reload Reuse +; GFX940-NEXT: v_accvgpr_read_b32 v56, a9 ; Reload Reuse +; GFX940-NEXT: v_accvgpr_read_b32 v47, a8 ; Reload Reuse +; GFX940-NEXT: v_accvgpr_read_b32 v46, a7 ; Reload Reuse +; GFX940-NEXT: v_accvgpr_read_b32 v45, a6 ; Reload Reuse +; GFX940-NEXT: v_accvgpr_read_b32 v44, a5 ; Reload Reuse +; GFX940-NEXT: v_accvgpr_read_b32 v43, a4 ; Reload Reuse +; GFX940-NEXT: v_accvgpr_read_b32 v42, a3 ; Reload Reuse +; GFX940-NEXT: v_accvgpr_read_b32 v41, a2 ; Reload Reuse +; GFX940-NEXT: v_accvgpr_read_b32 v40, a1 ; Reload Reuse +; GFX940-NEXT: s_waitcnt vmcnt(4) +; GFX940-NEXT: v_max_f64 v[32:33], v[26:27], v[36:37] +; GFX940-NEXT: v_cmp_u_f64_e32 vcc, v[26:27], v[36:37] ; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v2, v60, v2, vcc -; GFX940-NEXT: v_cndmask_b32_e32 v3, v61, v3, vcc -; GFX940-NEXT: v_cmp_gt_f64_e32 vcc, v[4:5], v[44:45] -; GFX940-NEXT: v_accvgpr_read_b32 v61, a13 ; Reload Reuse -; GFX940-NEXT: v_accvgpr_read_b32 v60, a12 ; Reload Reuse -; GFX940-NEXT: v_cndmask_b32_e32 v50, v45, v5, vcc -; GFX940-NEXT: v_cndmask_b32_e64 v59, v56, v50, s[0:1] -; GFX940-NEXT: v_cndmask_b32_e32 v50, v44, v4, vcc -; GFX940-NEXT: v_cndmask_b32_e64 v58, 0, v50, s[0:1] -; GFX940-NEXT: v_cmp_class_f64_e64 vcc, v[4:5], 64 -; GFX940-NEXT: v_cmp_class_f64_e64 s[0:1], v[44:45], 64 -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e32 v4, v58, v4, vcc -; GFX940-NEXT: v_cndmask_b32_e32 v5, v59, v5, vcc -; GFX940-NEXT: v_cndmask_b32_e64 v4, v4, v44, s[0:1] -; GFX940-NEXT: v_cndmask_b32_e64 v5, v5, v45, s[0:1] -; GFX940-NEXT: scratch_load_dword v45, off, s32 offset:72 -; GFX940-NEXT: scratch_load_dword v44, off, s32 offset:68 -; GFX940-NEXT: v_cmp_eq_f64_e32 vcc, 0, v[58:59] -; GFX940-NEXT: s_waitcnt vmcnt(22) -; GFX940-NEXT: v_cmp_o_f64_e64 s[0:1], v[6:7], v[46:47] -; GFX940-NEXT: v_cndmask_b32_e32 v4, v58, v4, vcc -; GFX940-NEXT: v_cndmask_b32_e32 v5, v59, v5, 
vcc -; GFX940-NEXT: v_cmp_gt_f64_e32 vcc, v[6:7], v[46:47] +; GFX940-NEXT: v_cndmask_b32_e64 v26, v32, 0, vcc +; GFX940-NEXT: v_cndmask_b32_e32 v27, v33, v0, vcc +; GFX940-NEXT: s_waitcnt vmcnt(2) +; GFX940-NEXT: v_max_f64 v[32:33], v[28:29], v[38:39] +; GFX940-NEXT: v_cmp_u_f64_e32 vcc, v[28:29], v[38:39] ; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v50, v47, v7, vcc -; GFX940-NEXT: v_cndmask_b32_e64 v59, v56, v50, s[0:1] -; GFX940-NEXT: v_cndmask_b32_e32 v50, v46, v6, vcc -; GFX940-NEXT: v_cndmask_b32_e64 v58, 0, v50, s[0:1] -; GFX940-NEXT: v_cmp_class_f64_e64 vcc, v[6:7], 64 -; GFX940-NEXT: v_cmp_class_f64_e64 s[0:1], v[46:47], 64 -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e32 v6, v58, v6, vcc -; GFX940-NEXT: v_cndmask_b32_e32 v7, v59, v7, vcc -; GFX940-NEXT: v_cndmask_b32_e64 v6, v6, v46, s[0:1] -; GFX940-NEXT: v_cndmask_b32_e64 v7, v7, v47, s[0:1] -; GFX940-NEXT: scratch_load_dword v47, off, s32 offset:80 -; GFX940-NEXT: scratch_load_dword v46, off, s32 offset:76 -; GFX940-NEXT: scratch_load_dword v50, off, s32 offset:84 -; GFX940-NEXT: v_cmp_eq_f64_e32 vcc, 0, v[58:59] -; GFX940-NEXT: s_waitcnt vmcnt(18) -; GFX940-NEXT: v_cmp_o_f64_e64 s[0:1], v[8:9], v[42:43] -; GFX940-NEXT: v_cndmask_b32_e32 v6, v58, v6, vcc -; GFX940-NEXT: v_cndmask_b32_e32 v7, v59, v7, vcc -; GFX940-NEXT: v_cmp_gt_f64_e32 vcc, v[8:9], v[42:43] -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v57, v43, v9, vcc -; GFX940-NEXT: v_cndmask_b32_e64 v59, v56, v57, s[0:1] -; GFX940-NEXT: v_cndmask_b32_e32 v57, v42, v8, vcc -; GFX940-NEXT: v_cndmask_b32_e64 v58, 0, v57, s[0:1] -; GFX940-NEXT: v_cmp_class_f64_e64 vcc, v[8:9], 64 -; GFX940-NEXT: v_cmp_class_f64_e64 s[0:1], v[42:43], 64 -; GFX940-NEXT: v_cmp_eq_f64_e64 s[2:3], 0, v[58:59] -; GFX940-NEXT: v_cndmask_b32_e32 v8, v58, v8, vcc -; GFX940-NEXT: v_cndmask_b32_e32 v9, v59, v9, vcc -; GFX940-NEXT: s_waitcnt vmcnt(8) -; GFX940-NEXT: v_cmp_gt_f64_e32 vcc, v[10:11], v[54:55] -; GFX940-NEXT: v_cndmask_b32_e64 v8, 
v8, v42, s[0:1] -; GFX940-NEXT: v_cndmask_b32_e64 v9, v9, v43, s[0:1] -; GFX940-NEXT: v_cndmask_b32_e32 v42, v55, v11, vcc -; GFX940-NEXT: v_cmp_o_f64_e64 s[0:1], v[10:11], v[54:55] -; GFX940-NEXT: v_cndmask_b32_e64 v8, v58, v8, s[2:3] -; GFX940-NEXT: v_cndmask_b32_e64 v9, v59, v9, s[2:3] -; GFX940-NEXT: v_cndmask_b32_e64 v43, v56, v42, s[0:1] -; GFX940-NEXT: v_cndmask_b32_e32 v42, v54, v10, vcc -; GFX940-NEXT: v_cndmask_b32_e64 v42, 0, v42, s[0:1] -; GFX940-NEXT: v_cmp_class_f64_e64 vcc, v[10:11], 64 -; GFX940-NEXT: v_cmp_class_f64_e64 s[0:1], v[54:55], 64 -; GFX940-NEXT: v_cmp_eq_f64_e64 s[2:3], 0, v[42:43] -; GFX940-NEXT: v_cndmask_b32_e32 v10, v42, v10, vcc -; GFX940-NEXT: v_cndmask_b32_e32 v11, v43, v11, vcc -; GFX940-NEXT: v_cmp_gt_f64_e32 vcc, v[12:13], v[52:53] -; GFX940-NEXT: v_cndmask_b32_e64 v10, v10, v54, s[0:1] -; GFX940-NEXT: v_cndmask_b32_e64 v11, v11, v55, s[0:1] -; GFX940-NEXT: v_cndmask_b32_e32 v54, v53, v13, vcc -; GFX940-NEXT: v_cmp_o_f64_e64 s[0:1], v[12:13], v[52:53] -; GFX940-NEXT: v_cndmask_b32_e64 v10, v42, v10, s[2:3] -; GFX940-NEXT: v_cndmask_b32_e64 v11, v43, v11, s[2:3] -; GFX940-NEXT: v_cndmask_b32_e64 v55, v56, v54, s[0:1] -; GFX940-NEXT: v_cndmask_b32_e32 v54, v52, v12, vcc -; GFX940-NEXT: v_cndmask_b32_e64 v54, 0, v54, s[0:1] -; GFX940-NEXT: v_cmp_class_f64_e64 vcc, v[12:13], 64 -; GFX940-NEXT: v_cmp_class_f64_e64 s[0:1], v[52:53], 64 -; GFX940-NEXT: v_cmp_eq_f64_e64 s[2:3], 0, v[54:55] -; GFX940-NEXT: v_cndmask_b32_e32 v12, v54, v12, vcc -; GFX940-NEXT: v_cndmask_b32_e32 v13, v55, v13, vcc -; GFX940-NEXT: s_waitcnt vmcnt(6) -; GFX940-NEXT: v_cmp_gt_f64_e32 vcc, v[14:15], v[40:41] -; GFX940-NEXT: v_cndmask_b32_e64 v12, v12, v52, s[0:1] -; GFX940-NEXT: v_cndmask_b32_e64 v13, v13, v53, s[0:1] -; GFX940-NEXT: v_cndmask_b32_e32 v52, v41, v15, vcc -; GFX940-NEXT: v_cmp_o_f64_e64 s[0:1], v[14:15], v[40:41] -; GFX940-NEXT: v_cndmask_b32_e64 v12, v54, v12, s[2:3] -; GFX940-NEXT: v_cndmask_b32_e64 v13, v55, v13, s[2:3] -; GFX940-NEXT: 
v_cndmask_b32_e64 v53, v56, v52, s[0:1] -; GFX940-NEXT: v_cndmask_b32_e32 v52, v40, v14, vcc -; GFX940-NEXT: v_cndmask_b32_e64 v52, 0, v52, s[0:1] -; GFX940-NEXT: v_cmp_class_f64_e64 vcc, v[14:15], 64 -; GFX940-NEXT: v_cmp_class_f64_e64 s[0:1], v[40:41], 64 -; GFX940-NEXT: v_cmp_eq_f64_e64 s[2:3], 0, v[52:53] -; GFX940-NEXT: v_cndmask_b32_e32 v14, v52, v14, vcc -; GFX940-NEXT: v_cndmask_b32_e64 v14, v14, v40, s[0:1] -; GFX940-NEXT: v_cndmask_b32_e32 v15, v53, v15, vcc -; GFX940-NEXT: s_waitcnt vmcnt(3) -; GFX940-NEXT: v_cmp_gt_f64_e32 vcc, v[16:17], v[44:45] -; GFX940-NEXT: v_cndmask_b32_e64 v14, v52, v14, s[2:3] -; GFX940-NEXT: v_cndmask_b32_e64 v15, v15, v41, s[0:1] -; GFX940-NEXT: v_cndmask_b32_e32 v52, v45, v17, vcc -; GFX940-NEXT: v_cmp_o_f64_e64 s[0:1], v[16:17], v[44:45] -; GFX940-NEXT: v_cndmask_b32_e64 v15, v53, v15, s[2:3] -; GFX940-NEXT: v_accvgpr_read_b32 v59, a11 ; Reload Reuse -; GFX940-NEXT: v_cndmask_b32_e64 v53, v56, v52, s[0:1] -; GFX940-NEXT: v_cndmask_b32_e32 v52, v44, v16, vcc -; GFX940-NEXT: v_cndmask_b32_e64 v52, 0, v52, s[0:1] -; GFX940-NEXT: v_cmp_class_f64_e64 vcc, v[16:17], 64 -; GFX940-NEXT: v_cmp_class_f64_e64 s[0:1], v[44:45], 64 -; GFX940-NEXT: v_cmp_eq_f64_e64 s[2:3], 0, v[52:53] -; GFX940-NEXT: v_cndmask_b32_e32 v16, v52, v16, vcc -; GFX940-NEXT: v_cndmask_b32_e64 v16, v16, v44, s[0:1] -; GFX940-NEXT: v_cndmask_b32_e32 v17, v53, v17, vcc -; GFX940-NEXT: s_waitcnt vmcnt(1) -; GFX940-NEXT: v_cmp_gt_f64_e32 vcc, v[18:19], v[46:47] -; GFX940-NEXT: v_cndmask_b32_e64 v16, v52, v16, s[2:3] -; GFX940-NEXT: v_cndmask_b32_e64 v17, v17, v45, s[0:1] -; GFX940-NEXT: v_cndmask_b32_e32 v52, v47, v19, vcc -; GFX940-NEXT: v_cmp_o_f64_e64 s[0:1], v[18:19], v[46:47] -; GFX940-NEXT: v_cndmask_b32_e64 v17, v53, v17, s[2:3] -; GFX940-NEXT: v_accvgpr_read_b32 v58, a10 ; Reload Reuse -; GFX940-NEXT: v_cndmask_b32_e64 v53, v56, v52, s[0:1] -; GFX940-NEXT: v_cndmask_b32_e32 v52, v46, v18, vcc -; GFX940-NEXT: v_cndmask_b32_e64 v52, 0, v52, s[0:1] -; 
GFX940-NEXT: v_cmp_class_f64_e64 vcc, v[18:19], 64 -; GFX940-NEXT: v_cmp_class_f64_e64 s[0:1], v[46:47], 64 -; GFX940-NEXT: v_cmp_eq_f64_e64 s[2:3], 0, v[52:53] -; GFX940-NEXT: v_cndmask_b32_e32 v18, v52, v18, vcc -; GFX940-NEXT: v_cndmask_b32_e64 v18, v18, v46, s[0:1] -; GFX940-NEXT: v_cndmask_b32_e32 v19, v53, v19, vcc +; GFX940-NEXT: v_cndmask_b32_e64 v28, v32, 0, vcc +; GFX940-NEXT: v_cndmask_b32_e32 v29, v33, v0, vcc ; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_cmp_gt_f64_e32 vcc, v[20:21], v[50:51] -; GFX940-NEXT: v_cndmask_b32_e64 v18, v52, v18, s[2:3] -; GFX940-NEXT: v_cndmask_b32_e64 v19, v19, v47, s[0:1] -; GFX940-NEXT: v_cndmask_b32_e32 v52, v51, v21, vcc -; GFX940-NEXT: v_cmp_o_f64_e64 s[0:1], v[20:21], v[50:51] -; GFX940-NEXT: v_cndmask_b32_e64 v19, v53, v19, s[2:3] -; GFX940-NEXT: v_accvgpr_read_b32 v57, a9 ; Reload Reuse -; GFX940-NEXT: v_cndmask_b32_e64 v53, v56, v52, s[0:1] -; GFX940-NEXT: v_cndmask_b32_e32 v52, v50, v20, vcc -; GFX940-NEXT: v_cndmask_b32_e64 v52, 0, v52, s[0:1] -; GFX940-NEXT: v_cmp_class_f64_e64 vcc, v[20:21], 64 -; GFX940-NEXT: v_cmp_class_f64_e64 s[0:1], v[50:51], 64 -; GFX940-NEXT: v_cmp_eq_f64_e64 s[2:3], 0, v[52:53] -; GFX940-NEXT: v_cndmask_b32_e32 v20, v52, v20, vcc -; GFX940-NEXT: v_cndmask_b32_e32 v21, v53, v21, vcc -; GFX940-NEXT: v_cmp_gt_f64_e32 vcc, v[22:23], v[48:49] -; GFX940-NEXT: v_cndmask_b32_e64 v20, v20, v50, s[0:1] -; GFX940-NEXT: v_cndmask_b32_e64 v21, v21, v51, s[0:1] -; GFX940-NEXT: v_cndmask_b32_e32 v50, v49, v23, vcc -; GFX940-NEXT: v_cmp_o_f64_e64 s[0:1], v[22:23], v[48:49] -; GFX940-NEXT: v_cndmask_b32_e64 v20, v52, v20, s[2:3] -; GFX940-NEXT: v_cndmask_b32_e64 v21, v53, v21, s[2:3] -; GFX940-NEXT: v_cndmask_b32_e64 v51, v56, v50, s[0:1] -; GFX940-NEXT: v_cndmask_b32_e32 v50, v48, v22, vcc -; GFX940-NEXT: v_cndmask_b32_e64 v50, 0, v50, s[0:1] -; GFX940-NEXT: v_cmp_class_f64_e64 vcc, v[22:23], 64 -; GFX940-NEXT: v_cmp_class_f64_e64 s[0:1], v[48:49], 64 -; GFX940-NEXT: v_cmp_eq_f64_e64 s[2:3], 0, 
v[50:51] -; GFX940-NEXT: v_cndmask_b32_e32 v22, v50, v22, vcc -; GFX940-NEXT: v_cndmask_b32_e32 v23, v51, v23, vcc -; GFX940-NEXT: v_cmp_gt_f64_e32 vcc, v[24:25], v[38:39] -; GFX940-NEXT: v_cndmask_b32_e64 v22, v22, v48, s[0:1] -; GFX940-NEXT: v_cndmask_b32_e64 v23, v23, v49, s[0:1] -; GFX940-NEXT: v_cndmask_b32_e32 v48, v39, v25, vcc -; GFX940-NEXT: v_cmp_o_f64_e64 s[0:1], v[24:25], v[38:39] -; GFX940-NEXT: v_cndmask_b32_e64 v22, v50, v22, s[2:3] -; GFX940-NEXT: v_cndmask_b32_e64 v23, v51, v23, s[2:3] -; GFX940-NEXT: v_cndmask_b32_e64 v49, v56, v48, s[0:1] -; GFX940-NEXT: v_cndmask_b32_e32 v48, v38, v24, vcc -; GFX940-NEXT: v_cndmask_b32_e64 v48, 0, v48, s[0:1] -; GFX940-NEXT: v_cmp_class_f64_e64 vcc, v[24:25], 64 -; GFX940-NEXT: v_cmp_class_f64_e64 s[0:1], v[38:39], 64 -; GFX940-NEXT: v_cmp_eq_f64_e64 s[2:3], 0, v[48:49] -; GFX940-NEXT: v_cndmask_b32_e32 v24, v48, v24, vcc -; GFX940-NEXT: v_cndmask_b32_e32 v25, v49, v25, vcc -; GFX940-NEXT: v_cmp_gt_f64_e32 vcc, v[26:27], v[36:37] -; GFX940-NEXT: v_cndmask_b32_e64 v24, v24, v38, s[0:1] -; GFX940-NEXT: v_cndmask_b32_e64 v25, v25, v39, s[0:1] -; GFX940-NEXT: v_cndmask_b32_e32 v38, v37, v27, vcc -; GFX940-NEXT: v_cmp_o_f64_e64 s[0:1], v[26:27], v[36:37] -; GFX940-NEXT: v_cndmask_b32_e64 v24, v48, v24, s[2:3] -; GFX940-NEXT: v_cndmask_b32_e64 v25, v49, v25, s[2:3] -; GFX940-NEXT: v_cndmask_b32_e64 v39, v56, v38, s[0:1] -; GFX940-NEXT: v_cndmask_b32_e32 v38, v36, v26, vcc -; GFX940-NEXT: v_cndmask_b32_e64 v38, 0, v38, s[0:1] -; GFX940-NEXT: v_cmp_class_f64_e64 vcc, v[26:27], 64 -; GFX940-NEXT: v_cmp_class_f64_e64 s[0:1], v[36:37], 64 -; GFX940-NEXT: v_cmp_eq_f64_e64 s[2:3], 0, v[38:39] -; GFX940-NEXT: v_cndmask_b32_e32 v26, v38, v26, vcc -; GFX940-NEXT: v_cndmask_b32_e32 v27, v39, v27, vcc -; GFX940-NEXT: v_cmp_gt_f64_e32 vcc, v[28:29], v[34:35] -; GFX940-NEXT: v_cndmask_b32_e64 v26, v26, v36, s[0:1] -; GFX940-NEXT: v_cndmask_b32_e64 v27, v27, v37, s[0:1] -; GFX940-NEXT: v_cndmask_b32_e32 v36, v35, v29, vcc -; 
GFX940-NEXT: v_cmp_o_f64_e64 s[0:1], v[28:29], v[34:35] -; GFX940-NEXT: v_cndmask_b32_e64 v26, v38, v26, s[2:3] -; GFX940-NEXT: v_cndmask_b32_e64 v27, v39, v27, s[2:3] -; GFX940-NEXT: v_cndmask_b32_e64 v37, v56, v36, s[0:1] -; GFX940-NEXT: v_cndmask_b32_e32 v36, v34, v28, vcc -; GFX940-NEXT: v_cndmask_b32_e64 v36, 0, v36, s[0:1] -; GFX940-NEXT: v_cmp_class_f64_e64 vcc, v[28:29], 64 -; GFX940-NEXT: v_cmp_class_f64_e64 s[0:1], v[34:35], 64 -; GFX940-NEXT: v_cmp_eq_f64_e64 s[2:3], 0, v[36:37] -; GFX940-NEXT: v_cndmask_b32_e32 v28, v36, v28, vcc -; GFX940-NEXT: v_cndmask_b32_e32 v29, v37, v29, vcc -; GFX940-NEXT: v_cmp_gt_f64_e32 vcc, v[30:31], v[32:33] -; GFX940-NEXT: v_cndmask_b32_e64 v28, v28, v34, s[0:1] -; GFX940-NEXT: v_cndmask_b32_e64 v29, v29, v35, s[0:1] -; GFX940-NEXT: v_cndmask_b32_e32 v34, v33, v31, vcc -; GFX940-NEXT: v_cmp_o_f64_e64 s[0:1], v[30:31], v[32:33] -; GFX940-NEXT: v_cndmask_b32_e64 v28, v36, v28, s[2:3] -; GFX940-NEXT: v_cndmask_b32_e64 v29, v37, v29, s[2:3] -; GFX940-NEXT: v_cndmask_b32_e64 v35, v56, v34, s[0:1] -; GFX940-NEXT: v_cndmask_b32_e32 v34, v32, v30, vcc -; GFX940-NEXT: v_cndmask_b32_e64 v34, 0, v34, s[0:1] -; GFX940-NEXT: v_cmp_class_f64_e64 vcc, v[30:31], 64 -; GFX940-NEXT: v_cmp_class_f64_e64 s[0:1], v[32:33], 64 -; GFX940-NEXT: v_cmp_eq_f64_e64 s[2:3], 0, v[34:35] -; GFX940-NEXT: v_cndmask_b32_e32 v30, v34, v30, vcc -; GFX940-NEXT: v_cndmask_b32_e32 v31, v35, v31, vcc -; GFX940-NEXT: v_cndmask_b32_e64 v30, v30, v32, s[0:1] -; GFX940-NEXT: v_cndmask_b32_e64 v31, v31, v33, s[0:1] -; GFX940-NEXT: v_cndmask_b32_e64 v30, v34, v30, s[2:3] -; GFX940-NEXT: v_cndmask_b32_e64 v31, v35, v31, s[2:3] -; GFX940-NEXT: v_accvgpr_read_b32 v56, a8 ; Reload Reuse -; GFX940-NEXT: v_accvgpr_read_b32 v47, a7 ; Reload Reuse -; GFX940-NEXT: v_accvgpr_read_b32 v46, a6 ; Reload Reuse -; GFX940-NEXT: v_accvgpr_read_b32 v45, a5 ; Reload Reuse -; GFX940-NEXT: v_accvgpr_read_b32 v44, a4 ; Reload Reuse -; GFX940-NEXT: v_accvgpr_read_b32 v43, a3 ; Reload Reuse 
-; GFX940-NEXT: v_accvgpr_read_b32 v42, a2 ; Reload Reuse -; GFX940-NEXT: v_accvgpr_read_b32 v41, a1 ; Reload Reuse -; GFX940-NEXT: v_accvgpr_read_b32 v40, a0 ; Reload Reuse +; GFX940-NEXT: v_max_f64 v[32:33], v[30:31], v[48:49] +; GFX940-NEXT: v_cmp_u_f64_e32 vcc, v[30:31], v[48:49] +; GFX940-NEXT: s_nop 1 +; GFX940-NEXT: v_cndmask_b32_e64 v30, v32, 0, vcc +; GFX940-NEXT: v_cndmask_b32_e32 v31, v33, v0, vcc +; GFX940-NEXT: v_accvgpr_read_b32 v0, a0 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_maximum_v16f64: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: s_clause 0x20 +; GFX10-NEXT: s_clause 0x19 +; GFX10-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:16 +; GFX10-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:12 +; GFX10-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:24 +; GFX10-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:20 +; GFX10-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:32 +; GFX10-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:28 +; GFX10-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:36 +; GFX10-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:68 +; GFX10-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:64 +; GFX10-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:60 +; GFX10-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:56 +; GFX10-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:52 +; GFX10-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:48 +; GFX10-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:44 +; GFX10-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:40 ; GFX10-NEXT: buffer_load_dword v65, off, s[0:3], s32 offset:8 ; GFX10-NEXT: buffer_load_dword v64, off, s[0:3], s32 offset:4 -; GFX10-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:16 -; GFX10-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:12 -; GFX10-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:24 -; GFX10-NEXT: buffer_load_dword v52, off, 
s[0:3], s32 offset:20 -; GFX10-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:32 -; GFX10-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:28 -; GFX10-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:36 -; GFX10-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:40 -; GFX10-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:56 -; GFX10-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:52 -; GFX10-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:48 -; GFX10-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:44 -; GFX10-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:64 -; GFX10-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:60 -; GFX10-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:68 -; GFX10-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:72 -; GFX10-NEXT: buffer_load_dword v83, off, s[0:3], s32 offset:80 -; GFX10-NEXT: buffer_load_dword v82, off, s[0:3], s32 offset:76 -; GFX10-NEXT: buffer_load_dword v67, off, s[0:3], s32 offset:88 -; GFX10-NEXT: buffer_load_dword v66, off, s[0:3], s32 offset:84 +; GFX10-NEXT: buffer_load_dword v66, off, s[0:3], s32 offset:100 ; GFX10-NEXT: buffer_load_dword v69, off, s[0:3], s32 offset:96 ; GFX10-NEXT: buffer_load_dword v68, off, s[0:3], s32 offset:92 -; GFX10-NEXT: buffer_load_dword v70, off, s[0:3], s32 offset:100 -; GFX10-NEXT: buffer_load_dword v71, off, s[0:3], s32 offset:104 -; GFX10-NEXT: buffer_load_dword v81, off, s[0:3], s32 offset:112 -; GFX10-NEXT: buffer_load_dword v80, off, s[0:3], s32 offset:108 -; GFX10-NEXT: buffer_load_dword v85, off, s[0:3], s32 offset:120 -; GFX10-NEXT: buffer_load_dword v84, off, s[0:3], s32 offset:116 +; GFX10-NEXT: buffer_load_dword v71, off, s[0:3], s32 offset:88 +; GFX10-NEXT: buffer_load_dword v70, off, s[0:3], s32 offset:84 +; GFX10-NEXT: buffer_load_dword v81, off, s[0:3], s32 offset:80 +; GFX10-NEXT: buffer_load_dword v80, off, s[0:3], s32 offset:76 +; GFX10-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:72 +; GFX10-NEXT: 
buffer_load_dword v67, off, s[0:3], s32 offset:104 +; GFX10-NEXT: s_waitcnt vmcnt(24) +; GFX10-NEXT: v_max_f64 v[82:83], v[2:3], v[31:32] +; GFX10-NEXT: v_cmp_u_f64_e32 vcc_lo, v[2:3], v[31:32] +; GFX10-NEXT: s_waitcnt vmcnt(22) +; GFX10-NEXT: v_max_f64 v[84:85], v[4:5], v[33:34] +; GFX10-NEXT: v_cmp_u_f64_e64 s4, v[4:5], v[33:34] +; GFX10-NEXT: s_clause 0x3 +; GFX10-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:120 +; GFX10-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:116 +; GFX10-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:112 +; GFX10-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:108 +; GFX10-NEXT: s_waitcnt vmcnt(24) +; GFX10-NEXT: v_max_f64 v[32:33], v[6:7], v[35:36] +; GFX10-NEXT: v_cmp_u_f64_e64 s5, v[6:7], v[35:36] +; GFX10-NEXT: s_clause 0x2 ; GFX10-NEXT: buffer_load_dword v31, off, s[0:3], s32 -; GFX10-NEXT: buffer_load_dword v87, off, s[0:3], s32 offset:128 -; GFX10-NEXT: buffer_load_dword v86, off, s[0:3], s32 offset:124 -; GFX10-NEXT: v_cmp_class_f64_e64 s10, v[0:1], 64 -; GFX10-NEXT: v_cmp_class_f64_e64 s12, v[2:3], 64 -; GFX10-NEXT: v_cmp_class_f64_e64 s17, v[10:11], 64 -; GFX10-NEXT: s_waitcnt vmcnt(31) -; GFX10-NEXT: v_cmp_gt_f64_e32 vcc_lo, v[0:1], v[64:65] -; GFX10-NEXT: v_cmp_o_f64_e64 s4, v[0:1], v[64:65] -; GFX10-NEXT: s_waitcnt vmcnt(29) -; GFX10-NEXT: v_cmp_gt_f64_e64 s5, v[2:3], v[54:55] -; GFX10-NEXT: v_cmp_o_f64_e64 s6, v[2:3], v[54:55] -; GFX10-NEXT: s_waitcnt vmcnt(27) -; GFX10-NEXT: v_cmp_gt_f64_e64 s7, v[4:5], v[52:53] -; GFX10-NEXT: v_cmp_o_f64_e64 s8, v[4:5], v[52:53] -; GFX10-NEXT: s_waitcnt vmcnt(25) -; GFX10-NEXT: v_cmp_gt_f64_e64 s9, v[6:7], v[50:51] -; GFX10-NEXT: v_cmp_o_f64_e64 s11, v[6:7], v[50:51] +; GFX10-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:128 +; GFX10-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:124 ; GFX10-NEXT: s_waitcnt vmcnt(23) -; GFX10-NEXT: v_cmp_gt_f64_e64 s13, v[8:9], v[48:49] -; GFX10-NEXT: v_cmp_class_f64_e64 s14, v[64:65], 64 +; GFX10-NEXT: v_cmp_u_f64_e64 
s10, v[14:15], v[50:51] ; GFX10-NEXT: s_waitcnt vmcnt(21) -; GFX10-NEXT: v_cmp_gt_f64_e64 s15, v[12:13], v[36:37] -; GFX10-NEXT: s_waitcnt vmcnt(17) -; GFX10-NEXT: v_cmp_o_f64_e64 s16, v[14:15], v[34:35] -; GFX10-NEXT: v_cndmask_b32_e32 v96, v64, v0, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v97, v54, v2, s5 -; GFX10-NEXT: v_cndmask_b32_e64 v99, v55, v3, s5 -; GFX10-NEXT: v_cndmask_b32_e64 v100, v52, v4, s7 -; GFX10-NEXT: v_cndmask_b32_e64 v96, 0, v96, s4 -; GFX10-NEXT: v_cndmask_b32_e64 v101, v50, v6, s9 -; GFX10-NEXT: v_cndmask_b32_e64 v98, 0, v97, s6 -; GFX10-NEXT: v_cndmask_b32_e32 v97, v65, v1, vcc_lo -; GFX10-NEXT: v_cmp_class_f64_e64 vcc_lo, v[54:55], 64 -; GFX10-NEXT: v_cndmask_b32_e64 v0, v96, v0, s10 -; GFX10-NEXT: v_cndmask_b32_e64 v99, 0x7ff80000, v99, s6 -; GFX10-NEXT: v_cndmask_b32_e64 v2, v98, v2, s12 -; GFX10-NEXT: v_cndmask_b32_e64 v97, 0x7ff80000, v97, s4 -; GFX10-NEXT: v_cndmask_b32_e64 v100, 0, v100, s8 -; GFX10-NEXT: v_cndmask_b32_e64 v102, 0, v101, s11 -; GFX10-NEXT: v_cndmask_b32_e64 v3, v99, v3, s12 -; GFX10-NEXT: v_cmp_class_f64_e64 s12, v[6:7], 64 -; GFX10-NEXT: v_cndmask_b32_e64 v1, v97, v1, s10 -; GFX10-NEXT: v_cmp_class_f64_e64 s10, v[4:5], 64 -; GFX10-NEXT: v_cmp_gt_f64_e64 s4, v[10:11], v[38:39] -; GFX10-NEXT: v_cndmask_b32_e64 v112, v48, v8, s13 -; GFX10-NEXT: v_cmp_o_f64_e64 s5, v[12:13], v[36:37] -; GFX10-NEXT: v_cmp_gt_f64_e64 s6, v[14:15], v[34:35] -; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, v64, s14 -; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, v65, s14 -; GFX10-NEXT: v_cmp_class_f64_e64 s14, v[52:53], 64 -; GFX10-NEXT: v_cndmask_b32_e64 v113, v36, v12, s15 +; GFX10-NEXT: v_cmp_u_f64_e64 s9, v[12:13], v[52:53] +; GFX10-NEXT: s_waitcnt vmcnt(19) +; GFX10-NEXT: v_cmp_u_f64_e64 s7, v[10:11], v[54:55] +; GFX10-NEXT: s_waitcnt vmcnt(18) +; GFX10-NEXT: v_max_f64 v[34:35], v[8:9], v[37:38] +; GFX10-NEXT: v_cmp_u_f64_e64 s6, v[8:9], v[37:38] +; GFX10-NEXT: s_waitcnt vmcnt(16) +; GFX10-NEXT: v_max_f64 v[8:9], v[0:1], v[64:65] +; GFX10-NEXT: 
v_max_f64 v[36:37], v[10:11], v[54:55] +; GFX10-NEXT: v_cmp_u_f64_e64 s8, v[0:1], v[64:65] +; GFX10-NEXT: v_max_f64 v[38:39], v[12:13], v[52:53] +; GFX10-NEXT: v_max_f64 v[52:53], v[14:15], v[50:51] +; GFX10-NEXT: s_waitcnt vmcnt(11) +; GFX10-NEXT: v_max_f64 v[54:55], v[20:21], v[70:71] +; GFX10-NEXT: v_cmp_u_f64_e64 s13, v[20:21], v[70:71] +; GFX10-NEXT: s_waitcnt vmcnt(9) +; GFX10-NEXT: v_cmp_u_f64_e64 s12, v[18:19], v[80:81] +; GFX10-NEXT: s_waitcnt vmcnt(8) +; GFX10-NEXT: v_max_f64 v[50:51], v[16:17], v[48:49] +; GFX10-NEXT: v_cmp_u_f64_e64 s11, v[16:17], v[48:49] +; GFX10-NEXT: v_max_f64 v[48:49], v[18:19], v[80:81] +; GFX10-NEXT: v_max_f64 v[64:65], v[22:23], v[68:69] +; GFX10-NEXT: v_cmp_u_f64_e64 s14, v[22:23], v[68:69] +; GFX10-NEXT: s_waitcnt vmcnt(7) +; GFX10-NEXT: v_max_f64 v[68:69], v[24:25], v[66:67] +; GFX10-NEXT: v_cmp_u_f64_e64 s15, v[24:25], v[66:67] +; GFX10-NEXT: v_cndmask_b32_e64 v10, v36, 0, s7 +; GFX10-NEXT: v_cndmask_b32_e64 v0, v8, 0, s8 +; GFX10-NEXT: v_cndmask_b32_e64 v1, v9, 0x7ff80000, s8 +; GFX10-NEXT: v_cndmask_b32_e64 v8, v34, 0, s6 +; GFX10-NEXT: v_cndmask_b32_e64 v9, v35, 0x7ff80000, s6 +; GFX10-NEXT: v_cndmask_b32_e64 v11, v37, 0x7ff80000, s7 +; GFX10-NEXT: v_cndmask_b32_e64 v12, v38, 0, s9 +; GFX10-NEXT: v_cndmask_b32_e64 v13, v39, 0x7ff80000, s9 +; GFX10-NEXT: v_cndmask_b32_e64 v14, v52, 0, s10 +; GFX10-NEXT: v_cndmask_b32_e64 v15, v53, 0x7ff80000, s10 +; GFX10-NEXT: v_cndmask_b32_e64 v16, v50, 0, s11 +; GFX10-NEXT: v_cndmask_b32_e64 v17, v51, 0x7ff80000, s11 +; GFX10-NEXT: v_cndmask_b32_e64 v18, v48, 0, s12 +; GFX10-NEXT: v_cndmask_b32_e64 v19, v49, 0x7ff80000, s12 +; GFX10-NEXT: v_cndmask_b32_e64 v20, v54, 0, s13 +; GFX10-NEXT: v_cndmask_b32_e64 v21, v55, 0x7ff80000, s13 +; GFX10-NEXT: v_cndmask_b32_e64 v22, v64, 0, s14 +; GFX10-NEXT: v_cndmask_b32_e64 v23, v65, 0x7ff80000, s14 +; GFX10-NEXT: v_cndmask_b32_e64 v24, v68, 0, s15 +; GFX10-NEXT: v_cndmask_b32_e64 v25, v69, 0x7ff80000, s15 +; GFX10-NEXT: s_waitcnt vmcnt(5) +; 
GFX10-NEXT: v_max_f64 v[70:71], v[28:29], v[2:3] +; GFX10-NEXT: v_cmp_u_f64_e64 s17, v[28:29], v[2:3] +; GFX10-NEXT: s_waitcnt vmcnt(3) +; GFX10-NEXT: v_max_f64 v[66:67], v[26:27], v[4:5] +; GFX10-NEXT: v_cmp_u_f64_e64 s16, v[26:27], v[4:5] +; GFX10-NEXT: v_cndmask_b32_e64 v2, v82, 0, vcc_lo ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_cmp_o_f64_e64 s18, v[30:31], v[86:87] -; GFX10-NEXT: v_cndmask_b32_e32 v2, v2, v54, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v54, v53, v5, s7 -; GFX10-NEXT: v_cndmask_b32_e32 v3, v3, v55, vcc_lo -; GFX10-NEXT: v_cmp_class_f64_e64 vcc_lo, v[50:51], 64 -; GFX10-NEXT: v_cndmask_b32_e64 v55, v51, v7, s9 -; GFX10-NEXT: v_cmp_o_f64_e64 s9, v[8:9], v[48:49] -; GFX10-NEXT: v_cndmask_b32_e64 v101, 0x7ff80000, v54, s8 -; GFX10-NEXT: v_cmp_gt_f64_e64 s7, v[16:17], v[32:33] -; GFX10-NEXT: v_cndmask_b32_e64 v6, v102, v6, s12 -; GFX10-NEXT: v_cndmask_b32_e64 v103, 0x7ff80000, v55, s11 -; GFX10-NEXT: v_cndmask_b32_e64 v4, v100, v4, s10 -; GFX10-NEXT: v_cndmask_b32_e64 v5, v101, v5, s10 -; GFX10-NEXT: v_cmp_class_f64_e64 s10, v[8:9], 64 -; GFX10-NEXT: v_cmp_o_f64_e64 s11, v[10:11], v[38:39] -; GFX10-NEXT: v_cndmask_b32_e64 v7, v103, v7, s12 -; GFX10-NEXT: v_cmp_class_f64_e64 s12, v[48:49], 64 -; GFX10-NEXT: v_cndmask_b32_e64 v114, v38, v10, s4 -; GFX10-NEXT: v_cndmask_b32_e64 v115, v34, v14, s6 -; GFX10-NEXT: v_cmp_o_f64_e64 s8, v[16:17], v[32:33] -; GFX10-NEXT: v_cndmask_b32_e64 v4, v4, v52, s14 -; GFX10-NEXT: v_cndmask_b32_e64 v5, v5, v53, s14 -; GFX10-NEXT: v_cmp_gt_f64_e64 s14, v[18:19], v[82:83] -; GFX10-NEXT: v_cndmask_b32_e64 v52, 0, v115, s16 -; GFX10-NEXT: v_cndmask_b32_e32 v6, v6, v50, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v50, v49, v9, s13 -; GFX10-NEXT: v_cndmask_b32_e32 v7, v7, v51, vcc_lo -; GFX10-NEXT: v_cmp_class_f64_e64 vcc_lo, v[38:39], 64 -; GFX10-NEXT: v_cndmask_b32_e64 v54, 0, v112, s9 -; GFX10-NEXT: v_cndmask_b32_e64 v51, v39, v11, s4 -; GFX10-NEXT: v_cndmask_b32_e64 v55, 0x7ff80000, v50, s9 -; GFX10-NEXT: 
v_cndmask_b32_e64 v50, 0, v113, s5 -; GFX10-NEXT: v_cmp_o_f64_e64 s4, v[18:19], v[82:83] -; GFX10-NEXT: v_cndmask_b32_e64 v8, v54, v8, s10 -; GFX10-NEXT: v_cndmask_b32_e64 v64, 0, v114, s11 -; GFX10-NEXT: v_cndmask_b32_e64 v9, v55, v9, s10 -; GFX10-NEXT: v_cmp_class_f64_e64 s10, v[12:13], 64 -; GFX10-NEXT: v_cndmask_b32_e64 v65, 0x7ff80000, v51, s11 -; GFX10-NEXT: v_cndmask_b32_e64 v8, v8, v48, s12 -; GFX10-NEXT: v_cndmask_b32_e64 v10, v64, v10, s17 -; GFX10-NEXT: v_cndmask_b32_e64 v9, v9, v49, s12 -; GFX10-NEXT: v_cmp_class_f64_e64 s12, v[14:15], 64 -; GFX10-NEXT: v_cndmask_b32_e64 v11, v65, v11, s17 -; GFX10-NEXT: v_cndmask_b32_e64 v48, v37, v13, s15 -; GFX10-NEXT: v_cmp_class_f64_e64 s17, v[34:35], 64 -; GFX10-NEXT: v_cmp_gt_f64_e64 s9, v[20:21], v[66:67] -; GFX10-NEXT: v_cmp_o_f64_e64 s11, v[20:21], v[66:67] -; GFX10-NEXT: v_cndmask_b32_e64 v116, v32, v16, s7 -; GFX10-NEXT: v_cndmask_b32_e64 v51, 0x7ff80000, v48, s5 -; GFX10-NEXT: v_cndmask_b32_e32 v10, v10, v38, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e32 v11, v11, v39, vcc_lo -; GFX10-NEXT: v_cmp_gt_f64_e32 vcc_lo, v[22:23], v[68:69] -; GFX10-NEXT: v_cndmask_b32_e64 v38, v35, v15, s6 -; GFX10-NEXT: v_cndmask_b32_e64 v49, v82, v18, s14 -; GFX10-NEXT: v_cndmask_b32_e64 v48, 0, v116, s8 -; GFX10-NEXT: v_cmp_class_f64_e64 s13, v[36:37], 64 -; GFX10-NEXT: v_cmp_o_f64_e64 s5, v[22:23], v[68:69] -; GFX10-NEXT: v_cndmask_b32_e64 v53, 0x7ff80000, v38, s16 -; GFX10-NEXT: v_cndmask_b32_e64 v12, v50, v12, s10 -; GFX10-NEXT: v_cndmask_b32_e64 v13, v51, v13, s10 -; GFX10-NEXT: v_cmp_class_f64_e64 s10, v[16:17], 64 -; GFX10-NEXT: v_cndmask_b32_e64 v38, 0, v49, s4 -; GFX10-NEXT: v_cndmask_b32_e64 v112, v83, v19, s14 -; GFX10-NEXT: v_cndmask_b32_e64 v14, v52, v14, s12 -; GFX10-NEXT: v_cndmask_b32_e64 v15, v53, v15, s12 -; GFX10-NEXT: v_cmp_class_f64_e64 s12, v[32:33], 64 -; GFX10-NEXT: v_cmp_class_f64_e64 s14, v[18:19], 64 -; GFX10-NEXT: v_cndmask_b32_e64 v114, v67, v21, s9 -; GFX10-NEXT: v_cndmask_b32_e64 v14, v14, v34, s17 -; 
GFX10-NEXT: v_cndmask_b32_e64 v15, v15, v35, s17 -; GFX10-NEXT: v_cndmask_b32_e64 v34, v33, v17, s7 -; GFX10-NEXT: v_cndmask_b32_e64 v39, 0x7ff80000, v112, s4 -; GFX10-NEXT: v_cmp_gt_f64_e64 s4, v[24:25], v[70:71] -; GFX10-NEXT: v_cndmask_b32_e32 v113, v69, v23, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e32 v35, v68, v22, vcc_lo -; GFX10-NEXT: v_cmp_class_f64_e64 vcc_lo, v[20:21], 64 -; GFX10-NEXT: v_cndmask_b32_e64 v49, 0x7ff80000, v34, s8 -; GFX10-NEXT: v_cndmask_b32_e64 v34, 0x7ff80000, v114, s11 -; GFX10-NEXT: v_cndmask_b32_e64 v12, v12, v36, s13 -; GFX10-NEXT: v_cndmask_b32_e64 v13, v13, v37, s13 -; GFX10-NEXT: v_cmp_class_f64_e64 s13, v[82:83], 64 -; GFX10-NEXT: v_cmp_o_f64_e64 s6, v[24:25], v[70:71] -; GFX10-NEXT: v_cndmask_b32_e64 v16, v48, v16, s10 -; GFX10-NEXT: v_cndmask_b32_e64 v17, v49, v17, s10 -; GFX10-NEXT: v_cndmask_b32_e64 v36, 0x7ff80000, v113, s5 -; GFX10-NEXT: v_cndmask_b32_e64 v35, 0, v35, s5 -; GFX10-NEXT: v_cmp_gt_f64_e64 s7, v[26:27], v[80:81] -; GFX10-NEXT: v_cndmask_b32_e64 v16, v16, v32, s12 -; GFX10-NEXT: v_cndmask_b32_e64 v32, v66, v20, s9 -; GFX10-NEXT: v_cndmask_b32_e64 v17, v17, v33, s12 -; GFX10-NEXT: v_cndmask_b32_e64 v18, v38, v18, s14 -; GFX10-NEXT: v_cndmask_b32_e64 v19, v39, v19, s14 -; GFX10-NEXT: v_cmp_o_f64_e64 s15, v[26:27], v[80:81] -; GFX10-NEXT: v_cndmask_b32_e64 v33, 0, v32, s11 -; GFX10-NEXT: v_cndmask_b32_e64 v112, v71, v25, s4 -; GFX10-NEXT: v_cmp_gt_f64_e64 s16, v[28:29], v[84:85] -; GFX10-NEXT: v_cmp_o_f64_e64 s8, v[28:29], v[84:85] -; GFX10-NEXT: v_cndmask_b32_e32 v21, v34, v21, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e32 v20, v33, v20, vcc_lo -; GFX10-NEXT: v_cmp_class_f64_e64 vcc_lo, v[22:23], 64 -; GFX10-NEXT: v_cmp_gt_f64_e64 s17, v[30:31], v[86:87] -; GFX10-NEXT: v_cmp_class_f64_e64 s5, v[70:71], 64 -; GFX10-NEXT: v_cndmask_b32_e64 v18, v18, v82, s13 -; GFX10-NEXT: v_cndmask_b32_e64 v82, v70, v24, s4 -; GFX10-NEXT: v_cndmask_b32_e64 v19, v19, v83, s13 -; GFX10-NEXT: v_cndmask_b32_e64 v83, 0x7ff80000, v112, s6 -; 
GFX10-NEXT: v_cmp_class_f64_e64 s4, v[68:69], 64 -; GFX10-NEXT: v_cmp_eq_f64_e64 s9, 0, v[96:97] -; GFX10-NEXT: v_cndmask_b32_e64 v82, 0, v82, s6 -; GFX10-NEXT: v_cndmask_b32_e64 v37, v81, v27, s7 -; GFX10-NEXT: v_cndmask_b32_e64 v32, v80, v26, s7 -; GFX10-NEXT: v_cmp_class_f64_e64 s6, v[80:81], 64 -; GFX10-NEXT: v_cmp_class_f64_e64 s7, v[84:85], 64 -; GFX10-NEXT: v_cmp_eq_f64_e64 s10, 0, v[98:99] -; GFX10-NEXT: v_cndmask_b32_e64 v113, 0x7ff80000, v37, s15 -; GFX10-NEXT: v_cndmask_b32_e64 v112, 0, v32, s15 -; GFX10-NEXT: v_cmp_eq_f64_e64 s11, 0, v[100:101] -; GFX10-NEXT: v_cndmask_b32_e64 v115, v85, v29, s16 -; GFX10-NEXT: v_cndmask_b32_e64 v114, v84, v28, s16 -; GFX10-NEXT: v_cmp_eq_f64_e64 s12, 0, v[102:103] -; GFX10-NEXT: v_cmp_eq_f64_e64 s13, 0, v[54:55] -; GFX10-NEXT: v_cndmask_b32_e32 v22, v35, v22, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e32 v23, v36, v23, vcc_lo -; GFX10-NEXT: v_cmp_class_f64_e64 vcc_lo, v[24:25], 64 -; GFX10-NEXT: v_cndmask_b32_e64 v115, 0x7ff80000, v115, s8 -; GFX10-NEXT: v_cndmask_b32_e64 v114, 0, v114, s8 -; GFX10-NEXT: v_cndmask_b32_e64 v116, v87, v31, s17 -; GFX10-NEXT: v_cndmask_b32_e64 v32, v86, v30, s17 -; GFX10-NEXT: v_cmp_class_f64_e64 s8, v[86:87], 64 -; GFX10-NEXT: v_cmp_eq_f64_e64 s14, 0, v[64:65] -; GFX10-NEXT: v_cmp_eq_f64_e64 s15, 0, v[50:51] -; GFX10-NEXT: v_cndmask_b32_e64 v117, 0x7ff80000, v116, s18 -; GFX10-NEXT: v_cndmask_b32_e64 v116, 0, v32, s18 -; GFX10-NEXT: v_cmp_eq_f64_e64 s16, 0, v[52:53] -; GFX10-NEXT: v_cmp_eq_f64_e64 s17, 0, v[48:49] -; GFX10-NEXT: v_cmp_eq_f64_e64 s18, 0, v[38:39] -; GFX10-NEXT: v_cmp_eq_f64_e64 s19, 0, v[33:34] -; GFX10-NEXT: v_cmp_eq_f64_e64 s20, 0, v[35:36] -; GFX10-NEXT: v_cmp_eq_f64_e64 s21, 0, v[82:83] -; GFX10-NEXT: v_cmp_eq_f64_e64 s22, 0, v[112:113] -; GFX10-NEXT: v_cmp_eq_f64_e64 s23, 0, v[114:115] -; GFX10-NEXT: v_cmp_eq_f64_e64 s24, 0, v[116:117] -; GFX10-NEXT: v_cndmask_b32_e64 v22, v22, v68, s4 -; GFX10-NEXT: v_cndmask_b32_e64 v23, v23, v69, s4 -; GFX10-NEXT: v_cndmask_b32_e64 v0, 
v96, v0, s9 -; GFX10-NEXT: v_cndmask_b32_e32 v24, v82, v24, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e32 v25, v83, v25, vcc_lo -; GFX10-NEXT: v_cmp_class_f64_e64 vcc_lo, v[26:27], 64 -; GFX10-NEXT: v_cndmask_b32_e64 v2, v98, v2, s10 -; GFX10-NEXT: v_cndmask_b32_e64 v4, v100, v4, s11 -; GFX10-NEXT: v_cndmask_b32_e64 v24, v24, v70, s5 -; GFX10-NEXT: v_cndmask_b32_e64 v25, v25, v71, s5 -; GFX10-NEXT: v_cndmask_b32_e64 v6, v102, v6, s12 -; GFX10-NEXT: v_cndmask_b32_e64 v8, v54, v8, s13 -; GFX10-NEXT: v_cndmask_b32_e64 v10, v64, v10, s14 -; GFX10-NEXT: v_cndmask_b32_e64 v12, v50, v12, s15 -; GFX10-NEXT: v_cndmask_b32_e64 v14, v52, v14, s16 -; GFX10-NEXT: v_cndmask_b32_e64 v16, v48, v16, s17 -; GFX10-NEXT: v_cndmask_b32_e64 v18, v38, v18, s18 -; GFX10-NEXT: v_cndmask_b32_e64 v22, v35, v22, s20 -; GFX10-NEXT: v_cndmask_b32_e64 v24, v82, v24, s21 -; GFX10-NEXT: v_cndmask_b32_e64 v1, v97, v1, s9 -; GFX10-NEXT: v_cndmask_b32_e64 v3, v99, v3, s10 -; GFX10-NEXT: v_cndmask_b32_e64 v5, v101, v5, s11 -; GFX10-NEXT: v_cndmask_b32_e64 v7, v103, v7, s12 -; GFX10-NEXT: v_cndmask_b32_e64 v9, v55, v9, s13 -; GFX10-NEXT: v_cndmask_b32_e64 v11, v65, v11, s14 -; GFX10-NEXT: v_cndmask_b32_e64 v13, v51, v13, s15 -; GFX10-NEXT: v_cndmask_b32_e64 v15, v53, v15, s16 -; GFX10-NEXT: v_cndmask_b32_e32 v26, v112, v26, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e32 v27, v113, v27, vcc_lo -; GFX10-NEXT: v_cmp_class_f64_e64 vcc_lo, v[28:29], 64 -; GFX10-NEXT: v_cndmask_b32_e64 v17, v49, v17, s17 -; GFX10-NEXT: v_cndmask_b32_e64 v19, v39, v19, s18 -; GFX10-NEXT: v_cndmask_b32_e64 v26, v26, v80, s6 -; GFX10-NEXT: v_cndmask_b32_e64 v27, v27, v81, s6 -; GFX10-NEXT: v_cndmask_b32_e64 v23, v36, v23, s20 -; GFX10-NEXT: v_cndmask_b32_e64 v25, v83, v25, s21 -; GFX10-NEXT: v_cndmask_b32_e64 v26, v112, v26, s22 -; GFX10-NEXT: v_cndmask_b32_e64 v27, v113, v27, s22 -; GFX10-NEXT: v_cndmask_b32_e32 v28, v114, v28, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e32 v29, v115, v29, vcc_lo -; GFX10-NEXT: v_cmp_class_f64_e64 vcc_lo, 
v[30:31], 64 -; GFX10-NEXT: v_cndmask_b32_e64 v28, v28, v84, s7 -; GFX10-NEXT: v_cndmask_b32_e64 v29, v29, v85, s7 -; GFX10-NEXT: v_cndmask_b32_e64 v28, v114, v28, s23 -; GFX10-NEXT: v_cndmask_b32_e64 v29, v115, v29, s23 -; GFX10-NEXT: v_cndmask_b32_e32 v30, v116, v30, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e32 v31, v117, v31, vcc_lo -; GFX10-NEXT: v_cmp_class_f64_e64 vcc_lo, v[66:67], 64 -; GFX10-NEXT: v_cndmask_b32_e64 v30, v30, v86, s8 -; GFX10-NEXT: v_cndmask_b32_e64 v31, v31, v87, s8 -; GFX10-NEXT: v_cndmask_b32_e64 v30, v116, v30, s24 -; GFX10-NEXT: v_cndmask_b32_e64 v31, v117, v31, s24 -; GFX10-NEXT: v_cndmask_b32_e32 v20, v20, v66, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e32 v21, v21, v67, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v20, v33, v20, s19 -; GFX10-NEXT: v_cndmask_b32_e64 v21, v34, v21, s19 +; GFX10-NEXT: v_max_f64 v[80:81], v[30:31], v[6:7] +; GFX10-NEXT: v_cmp_u_f64_e64 s18, v[30:31], v[6:7] +; GFX10-NEXT: v_cndmask_b32_e64 v3, v83, 0x7ff80000, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v4, v84, 0, s4 +; GFX10-NEXT: v_cndmask_b32_e64 v5, v85, 0x7ff80000, s4 +; GFX10-NEXT: v_cndmask_b32_e64 v6, v32, 0, s5 +; GFX10-NEXT: v_cndmask_b32_e64 v7, v33, 0x7ff80000, s5 +; GFX10-NEXT: v_cndmask_b32_e64 v28, v70, 0, s17 +; GFX10-NEXT: v_cndmask_b32_e64 v29, v71, 0x7ff80000, s17 +; GFX10-NEXT: v_cndmask_b32_e64 v26, v66, 0, s16 +; GFX10-NEXT: v_cndmask_b32_e64 v27, v67, 0x7ff80000, s16 +; GFX10-NEXT: v_cndmask_b32_e64 v30, v80, 0, s18 +; GFX10-NEXT: v_cndmask_b32_e64 v31, v81, 0x7ff80000, s18 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: v_maximum_v16f64: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_clause 0x1f -; GFX11-NEXT: scratch_load_b32 v87, off, s32 offset:8 -; GFX11-NEXT: scratch_load_b32 v86, off, s32 offset:4 -; GFX11-NEXT: scratch_load_b32 v85, off, s32 offset:16 -; GFX11-NEXT: scratch_load_b32 v84, off, s32 offset:12 -; GFX11-NEXT: scratch_load_b32 v33, off, s32 offset:24 -; GFX11-NEXT: scratch_load_b32 v32, 
off, s32 offset:20 -; GFX11-NEXT: scratch_load_b32 v35, off, s32 offset:32 -; GFX11-NEXT: scratch_load_b32 v34, off, s32 offset:28 -; GFX11-NEXT: scratch_load_b32 v37, off, s32 offset:40 -; GFX11-NEXT: scratch_load_b32 v36, off, s32 offset:36 -; GFX11-NEXT: scratch_load_b32 v39, off, s32 offset:48 -; GFX11-NEXT: scratch_load_b32 v38, off, s32 offset:44 -; GFX11-NEXT: scratch_load_b32 v49, off, s32 offset:56 -; GFX11-NEXT: scratch_load_b32 v48, off, s32 offset:52 -; GFX11-NEXT: scratch_load_b32 v51, off, s32 offset:64 -; GFX11-NEXT: scratch_load_b32 v50, off, s32 offset:60 -; GFX11-NEXT: scratch_load_b32 v53, off, s32 offset:72 -; GFX11-NEXT: scratch_load_b32 v52, off, s32 offset:68 -; GFX11-NEXT: scratch_load_b32 v55, off, s32 offset:80 -; GFX11-NEXT: scratch_load_b32 v54, off, s32 offset:76 -; GFX11-NEXT: scratch_load_b32 v65, off, s32 offset:88 -; GFX11-NEXT: scratch_load_b32 v64, off, s32 offset:84 -; GFX11-NEXT: scratch_load_b32 v67, off, s32 offset:96 -; GFX11-NEXT: scratch_load_b32 v66, off, s32 offset:92 -; GFX11-NEXT: scratch_load_b32 v69, off, s32 offset:104 -; GFX11-NEXT: scratch_load_b32 v68, off, s32 offset:100 -; GFX11-NEXT: scratch_load_b32 v71, off, s32 offset:112 -; GFX11-NEXT: scratch_load_b32 v70, off, s32 offset:108 -; GFX11-NEXT: scratch_load_b32 v81, off, s32 offset:120 -; GFX11-NEXT: scratch_load_b32 v80, off, s32 offset:116 ; GFX11-NEXT: scratch_load_b32 v31, off, s32 -; GFX11-NEXT: scratch_load_b32 v83, off, s32 offset:128 -; GFX11-NEXT: scratch_load_b32 v82, off, s32 offset:124 -; GFX11-NEXT: s_waitcnt vmcnt(31) -; GFX11-NEXT: v_cmp_gt_f64_e64 s9, v[0:1], v[86:87] -; GFX11-NEXT: v_cmp_o_f64_e64 s11, v[0:1], v[86:87] -; GFX11-NEXT: s_waitcnt vmcnt(29) -; GFX11-NEXT: v_cmp_gt_f64_e64 s10, v[2:3], v[84:85] -; GFX11-NEXT: v_cmp_class_f64_e64 s14, v[86:87], 64 -; GFX11-NEXT: s_waitcnt vmcnt(27) -; GFX11-NEXT: v_cmp_gt_f64_e64 s0, v[4:5], v[32:33] -; GFX11-NEXT: v_cmp_o_f64_e32 vcc_lo, v[4:5], v[32:33] -; GFX11-NEXT: s_waitcnt vmcnt(25) -; 
GFX11-NEXT: v_cmp_gt_f64_e64 s2, v[6:7], v[34:35] -; GFX11-NEXT: v_cmp_o_f64_e64 s12, v[2:3], v[84:85] -; GFX11-NEXT: v_cmp_o_f64_e64 s1, v[6:7], v[34:35] -; GFX11-NEXT: s_waitcnt vmcnt(23) -; GFX11-NEXT: v_cmp_gt_f64_e64 s4, v[8:9], v[36:37] -; GFX11-NEXT: v_cmp_o_f64_e64 s3, v[8:9], v[36:37] -; GFX11-NEXT: v_cmp_class_f64_e64 s16, v[84:85], 64 -; GFX11-NEXT: s_waitcnt vmcnt(21) -; GFX11-NEXT: v_cmp_gt_f64_e64 s6, v[10:11], v[38:39] -; GFX11-NEXT: v_cmp_o_f64_e64 s5, v[10:11], v[38:39] -; GFX11-NEXT: s_waitcnt vmcnt(19) -; GFX11-NEXT: v_cmp_gt_f64_e64 s8, v[12:13], v[48:49] -; GFX11-NEXT: v_cmp_o_f64_e64 s7, v[12:13], v[48:49] -; GFX11-NEXT: s_waitcnt vmcnt(17) -; GFX11-NEXT: v_cmp_gt_f64_e64 s13, v[14:15], v[50:51] -; GFX11-NEXT: s_waitcnt vmcnt(15) -; GFX11-NEXT: v_cmp_o_f64_e64 s15, v[16:17], v[52:53] -; GFX11-NEXT: s_waitcnt vmcnt(13) -; GFX11-NEXT: v_cmp_gt_f64_e64 s17, v[18:19], v[54:55] -; GFX11-NEXT: v_cmp_o_f64_e64 s18, v[18:19], v[54:55] -; GFX11-NEXT: s_waitcnt vmcnt(11) -; GFX11-NEXT: v_cmp_gt_f64_e64 s19, v[20:21], v[64:65] -; GFX11-NEXT: v_cmp_o_f64_e64 s20, v[20:21], v[64:65] -; GFX11-NEXT: s_waitcnt vmcnt(9) -; GFX11-NEXT: v_cmp_gt_f64_e64 s21, v[22:23], v[66:67] -; GFX11-NEXT: v_cmp_o_f64_e64 s22, v[22:23], v[66:67] -; GFX11-NEXT: s_waitcnt vmcnt(7) -; GFX11-NEXT: v_cmp_gt_f64_e64 s23, v[24:25], v[68:69] -; GFX11-NEXT: v_cmp_o_f64_e64 s24, v[24:25], v[68:69] -; GFX11-NEXT: s_waitcnt vmcnt(5) -; GFX11-NEXT: v_cmp_gt_f64_e64 s25, v[26:27], v[70:71] -; GFX11-NEXT: v_cmp_o_f64_e64 s26, v[26:27], v[70:71] -; GFX11-NEXT: s_waitcnt vmcnt(3) -; GFX11-NEXT: v_cmp_gt_f64_e64 s27, v[28:29], v[80:81] -; GFX11-NEXT: v_cmp_o_f64_e64 s28, v[28:29], v[80:81] +; GFX11-NEXT: scratch_load_b32 v33, off, s32 offset:8 +; GFX11-NEXT: scratch_load_b32 v32, off, s32 offset:4 +; GFX11-NEXT: scratch_load_b32 v35, off, s32 offset:16 +; GFX11-NEXT: scratch_load_b32 v34, off, s32 offset:12 +; GFX11-NEXT: scratch_load_b32 v37, off, s32 offset:24 +; GFX11-NEXT: scratch_load_b32 
v36, off, s32 offset:20 +; GFX11-NEXT: scratch_load_b32 v39, off, s32 offset:32 +; GFX11-NEXT: scratch_load_b32 v38, off, s32 offset:28 +; GFX11-NEXT: scratch_load_b32 v49, off, s32 offset:40 +; GFX11-NEXT: scratch_load_b32 v48, off, s32 offset:36 +; GFX11-NEXT: scratch_load_b32 v51, off, s32 offset:48 +; GFX11-NEXT: scratch_load_b32 v50, off, s32 offset:44 +; GFX11-NEXT: scratch_load_b32 v53, off, s32 offset:56 +; GFX11-NEXT: scratch_load_b32 v52, off, s32 offset:52 +; GFX11-NEXT: scratch_load_b32 v55, off, s32 offset:64 +; GFX11-NEXT: scratch_load_b32 v54, off, s32 offset:60 +; GFX11-NEXT: scratch_load_b32 v65, off, s32 offset:72 +; GFX11-NEXT: scratch_load_b32 v64, off, s32 offset:68 +; GFX11-NEXT: scratch_load_b32 v67, off, s32 offset:80 +; GFX11-NEXT: scratch_load_b32 v66, off, s32 offset:76 +; GFX11-NEXT: scratch_load_b32 v69, off, s32 offset:88 +; GFX11-NEXT: scratch_load_b32 v68, off, s32 offset:84 +; GFX11-NEXT: scratch_load_b32 v71, off, s32 offset:96 +; GFX11-NEXT: scratch_load_b32 v70, off, s32 offset:92 +; GFX11-NEXT: scratch_load_b32 v81, off, s32 offset:104 +; GFX11-NEXT: scratch_load_b32 v80, off, s32 offset:100 +; GFX11-NEXT: scratch_load_b32 v83, off, s32 offset:112 +; GFX11-NEXT: scratch_load_b32 v82, off, s32 offset:108 +; GFX11-NEXT: scratch_load_b32 v85, off, s32 offset:120 +; GFX11-NEXT: scratch_load_b32 v84, off, s32 offset:116 +; GFX11-NEXT: scratch_load_b32 v87, off, s32 offset:128 +; GFX11-NEXT: scratch_load_b32 v86, off, s32 offset:124 +; GFX11-NEXT: s_waitcnt vmcnt(30) +; GFX11-NEXT: v_max_f64 v[96:97], v[0:1], v[32:33] +; GFX11-NEXT: v_cmp_u_f64_e32 vcc_lo, v[0:1], v[32:33] +; GFX11-NEXT: s_waitcnt vmcnt(28) +; GFX11-NEXT: v_max_f64 v[32:33], v[2:3], v[34:35] +; GFX11-NEXT: v_cmp_u_f64_e64 s0, v[2:3], v[34:35] +; GFX11-NEXT: s_waitcnt vmcnt(26) +; GFX11-NEXT: v_max_f64 v[34:35], v[4:5], v[36:37] +; GFX11-NEXT: v_cmp_u_f64_e64 s1, v[4:5], v[36:37] +; GFX11-NEXT: s_waitcnt vmcnt(24) +; GFX11-NEXT: v_max_f64 v[36:37], v[6:7], v[38:39] +; 
GFX11-NEXT: v_cmp_u_f64_e64 s2, v[6:7], v[38:39] +; GFX11-NEXT: s_waitcnt vmcnt(22) +; GFX11-NEXT: v_max_f64 v[38:39], v[8:9], v[48:49] +; GFX11-NEXT: v_cmp_u_f64_e64 s3, v[8:9], v[48:49] +; GFX11-NEXT: s_waitcnt vmcnt(20) +; GFX11-NEXT: v_max_f64 v[48:49], v[10:11], v[50:51] +; GFX11-NEXT: v_cmp_u_f64_e64 s4, v[10:11], v[50:51] +; GFX11-NEXT: s_waitcnt vmcnt(18) +; GFX11-NEXT: v_max_f64 v[50:51], v[12:13], v[52:53] +; GFX11-NEXT: v_cmp_u_f64_e64 s5, v[12:13], v[52:53] +; GFX11-NEXT: s_waitcnt vmcnt(16) +; GFX11-NEXT: v_max_f64 v[52:53], v[14:15], v[54:55] +; GFX11-NEXT: v_cmp_u_f64_e64 s6, v[14:15], v[54:55] +; GFX11-NEXT: s_waitcnt vmcnt(14) +; GFX11-NEXT: v_max_f64 v[54:55], v[16:17], v[64:65] +; GFX11-NEXT: v_cmp_u_f64_e64 s7, v[16:17], v[64:65] +; GFX11-NEXT: s_waitcnt vmcnt(12) +; GFX11-NEXT: v_max_f64 v[64:65], v[18:19], v[66:67] +; GFX11-NEXT: v_cmp_u_f64_e64 s8, v[18:19], v[66:67] +; GFX11-NEXT: s_waitcnt vmcnt(10) +; GFX11-NEXT: v_max_f64 v[66:67], v[20:21], v[68:69] +; GFX11-NEXT: v_cmp_u_f64_e64 s9, v[20:21], v[68:69] +; GFX11-NEXT: s_waitcnt vmcnt(8) +; GFX11-NEXT: v_max_f64 v[68:69], v[22:23], v[70:71] +; GFX11-NEXT: v_cmp_u_f64_e64 s10, v[22:23], v[70:71] +; GFX11-NEXT: s_waitcnt vmcnt(6) +; GFX11-NEXT: v_max_f64 v[70:71], v[24:25], v[80:81] +; GFX11-NEXT: v_cmp_u_f64_e64 s11, v[24:25], v[80:81] +; GFX11-NEXT: s_waitcnt vmcnt(4) +; GFX11-NEXT: v_max_f64 v[80:81], v[26:27], v[82:83] +; GFX11-NEXT: v_cmp_u_f64_e64 s12, v[26:27], v[82:83] +; GFX11-NEXT: s_waitcnt vmcnt(2) +; GFX11-NEXT: v_max_f64 v[82:83], v[28:29], v[84:85] +; GFX11-NEXT: v_cmp_u_f64_e64 s13, v[28:29], v[84:85] ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_cmp_gt_f64_e64 s29, v[30:31], v[82:83] -; GFX11-NEXT: v_cmp_o_f64_e64 vcc_hi, v[30:31], v[82:83] -; GFX11-NEXT: v_cndmask_b32_e64 v96, v87, v1, s9 -; GFX11-NEXT: v_cndmask_b32_e64 v101, v86, v0, s9 -; GFX11-NEXT: v_cndmask_b32_e64 v98, v85, v3, s10 -; GFX11-NEXT: v_cndmask_b32_e64 v103, v84, v2, s10 -; GFX11-NEXT: 
v_cmp_class_f64_e64 s10, v[0:1], 64 -; GFX11-NEXT: v_cndmask_b32_e64 v97, 0x7ff80000, v96, s11 -; GFX11-NEXT: v_cndmask_b32_e64 v96, 0, v101, s11 -; GFX11-NEXT: v_cndmask_b32_e64 v100, v33, v5, s0 -; GFX11-NEXT: v_cndmask_b32_e64 v102, v35, v7, s2 -; GFX11-NEXT: v_cndmask_b32_e64 v99, 0x7ff80000, v98, s12 -; GFX11-NEXT: v_cndmask_b32_e64 v98, 0, v103, s12 -; GFX11-NEXT: v_cmp_class_f64_e64 s11, v[2:3], 64 -; GFX11-NEXT: v_cndmask_b32_e32 v101, 0x7ff80000, v100, vcc_lo -; GFX11-NEXT: v_cndmask_b32_e64 v103, 0x7ff80000, v102, s1 -; GFX11-NEXT: v_cndmask_b32_e64 v112, v37, v9, s4 -; GFX11-NEXT: v_cndmask_b32_e64 v114, v39, v11, s6 -; GFX11-NEXT: v_cndmask_b32_e64 v116, v49, v13, s8 -; GFX11-NEXT: v_cmp_o_f64_e64 s9, v[14:15], v[50:51] -; GFX11-NEXT: v_cndmask_b32_e64 v118, v51, v15, s13 -; GFX11-NEXT: v_cndmask_b32_e64 v113, 0x7ff80000, v112, s3 -; GFX11-NEXT: v_cndmask_b32_e64 v115, 0x7ff80000, v114, s5 -; GFX11-NEXT: v_cndmask_b32_e64 v117, 0x7ff80000, v116, s7 -; GFX11-NEXT: v_cmp_gt_f64_e64 s12, v[16:17], v[52:53] -; GFX11-NEXT: v_cndmask_b32_e64 v130, v55, v19, s17 -; GFX11-NEXT: v_cndmask_b32_e64 v132, v65, v21, s19 -; GFX11-NEXT: v_cndmask_b32_e64 v134, v67, v23, s21 -; GFX11-NEXT: v_cndmask_b32_e64 v144, v69, v25, s23 -; GFX11-NEXT: v_cndmask_b32_e64 v145, v71, v27, s25 -; GFX11-NEXT: v_cndmask_b32_e64 v131, 0x7ff80000, v130, s18 -; GFX11-NEXT: v_cndmask_b32_e64 v133, 0x7ff80000, v132, s20 -; GFX11-NEXT: v_cndmask_b32_e64 v135, 0x7ff80000, v134, s22 -; GFX11-NEXT: v_cndmask_b32_e64 v146, v81, v29, s27 -; GFX11-NEXT: v_cndmask_b32_e64 v148, v80, v28, s27 -; GFX11-NEXT: v_cndmask_b32_e64 v147, v83, v31, s29 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_cndmask_b32_e64 v147, 0x7ff80000, v147, vcc_hi -; GFX11-NEXT: v_cndmask_b32_e64 v0, v96, v0, s10 -; GFX11-NEXT: v_cndmask_b32_e64 v1, v97, v1, s10 -; GFX11-NEXT: v_cmp_class_f64_e64 s10, v[36:37], 64 -; GFX11-NEXT: v_cndmask_b32_e64 v0, v0, v86, s14 -; 
GFX11-NEXT: v_cndmask_b32_e64 v86, v32, v4, s0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-NEXT: v_cndmask_b32_e64 v1, v1, v87, s14 -; GFX11-NEXT: v_cndmask_b32_e64 v87, v34, v6, s2 -; GFX11-NEXT: v_cndmask_b32_e64 v2, v98, v2, s11 -; GFX11-NEXT: v_cndmask_b32_e64 v3, v99, v3, s11 -; GFX11-NEXT: v_cndmask_b32_e32 v100, 0, v86, vcc_lo -; GFX11-NEXT: v_cmp_class_f64_e64 vcc_lo, v[4:5], 64 -; GFX11-NEXT: v_cndmask_b32_e64 v102, 0, v87, s1 -; GFX11-NEXT: v_cndmask_b32_e64 v2, v2, v84, s16 -; GFX11-NEXT: v_cndmask_b32_e64 v84, v36, v8, s4 -; GFX11-NEXT: v_cndmask_b32_e64 v86, v38, v10, s6 -; GFX11-NEXT: v_cndmask_b32_e64 v87, v48, v12, s8 -; GFX11-NEXT: v_cndmask_b32_e64 v119, 0x7ff80000, v118, s9 -; GFX11-NEXT: v_cndmask_b32_e64 v128, v53, v17, s12 -; GFX11-NEXT: v_cndmask_b32_e64 v112, 0, v84, s3 -; GFX11-NEXT: v_cndmask_b32_e64 v114, 0, v86, s5 -; GFX11-NEXT: v_cndmask_b32_e64 v116, 0, v87, s7 -; GFX11-NEXT: v_cndmask_b32_e64 v84, v50, v14, s13 -; GFX11-NEXT: v_cndmask_b32_e64 v129, 0x7ff80000, v128, s15 -; GFX11-NEXT: v_cndmask_b32_e64 v86, v52, v16, s12 -; GFX11-NEXT: v_cndmask_b32_e64 v87, v54, v18, s17 -; GFX11-NEXT: v_cndmask_b32_e64 v3, v3, v85, s16 -; GFX11-NEXT: v_cndmask_b32_e64 v118, 0, v84, s9 -; GFX11-NEXT: v_cndmask_b32_e64 v84, v64, v20, s19 -; GFX11-NEXT: v_cndmask_b32_e64 v128, 0, v86, s15 -; GFX11-NEXT: v_cndmask_b32_e64 v130, 0, v87, s18 -; GFX11-NEXT: v_cndmask_b32_e64 v86, v66, v22, s21 -; GFX11-NEXT: v_cndmask_b32_e64 v85, 0x7ff80000, v144, s24 -; GFX11-NEXT: v_cndmask_b32_e64 v132, 0, v84, s20 -; GFX11-NEXT: v_cndmask_b32_e64 v87, v68, v24, s23 -; GFX11-NEXT: v_cndmask_b32_e64 v144, v70, v26, s25 -; GFX11-NEXT: v_cndmask_b32_e64 v134, 0, v86, s22 -; GFX11-NEXT: v_cmp_class_f64_e64 s0, v[68:69], 64 -; GFX11-NEXT: v_cmp_class_f64_e64 s1, v[70:71], 64 -; GFX11-NEXT: v_cndmask_b32_e64 v84, 0, v87, s24 -; GFX11-NEXT: v_cndmask_b32_e64 v87, 0x7ff80000, v145, s26 -; GFX11-NEXT: v_cndmask_b32_e64 v86, 0, v144, s26 -; GFX11-NEXT: 
v_cndmask_b32_e64 v145, 0x7ff80000, v146, s28 -; GFX11-NEXT: v_cndmask_b32_e64 v144, 0, v148, s28 -; GFX11-NEXT: v_cndmask_b32_e64 v146, v82, v30, s29 -; GFX11-NEXT: v_cmp_class_f64_e64 s2, v[80:81], 64 -; GFX11-NEXT: v_cmp_class_f64_e64 s3, v[82:83], 64 -; GFX11-NEXT: v_cmp_class_f64_e64 s6, v[32:33], 64 -; GFX11-NEXT: v_cmp_class_f64_e64 s8, v[34:35], 64 -; GFX11-NEXT: v_dual_cndmask_b32 v5, v101, v5 :: v_dual_cndmask_b32 v4, v100, v4 -; GFX11-NEXT: v_cmp_class_f64_e64 vcc_lo, v[6:7], 64 -; GFX11-NEXT: v_cndmask_b32_e64 v146, 0, v146, vcc_hi -; GFX11-NEXT: v_cmp_class_f64_e64 s12, v[38:39], 64 -; GFX11-NEXT: v_cmp_class_f64_e64 s14, v[48:49], 64 -; GFX11-NEXT: v_cmp_class_f64_e64 s16, v[50:51], 64 -; GFX11-NEXT: v_cmp_class_f64_e64 s18, v[52:53], 64 -; GFX11-NEXT: v_cmp_class_f64_e64 s20, v[54:55], 64 -; GFX11-NEXT: v_cmp_class_f64_e64 s21, v[64:65], 64 -; GFX11-NEXT: v_cmp_eq_f64_e64 s4, 0, v[96:97] -; GFX11-NEXT: v_cmp_eq_f64_e64 s5, 0, v[98:99] -; GFX11-NEXT: v_cmp_eq_f64_e64 s7, 0, v[100:101] -; GFX11-NEXT: v_cmp_eq_f64_e64 s9, 0, v[102:103] -; GFX11-NEXT: v_cmp_eq_f64_e64 s11, 0, v[112:113] -; GFX11-NEXT: v_cmp_eq_f64_e64 s13, 0, v[114:115] -; GFX11-NEXT: v_cmp_eq_f64_e64 s15, 0, v[116:117] -; GFX11-NEXT: v_cmp_eq_f64_e64 s17, 0, v[118:119] -; GFX11-NEXT: v_cmp_eq_f64_e64 s19, 0, v[128:129] -; GFX11-NEXT: v_cmp_eq_f64_e64 s22, 0, v[130:131] -; GFX11-NEXT: v_cmp_eq_f64_e64 s23, 0, v[132:133] -; GFX11-NEXT: v_cmp_eq_f64_e64 s24, 0, v[134:135] -; GFX11-NEXT: v_cmp_eq_f64_e64 s25, 0, v[84:85] -; GFX11-NEXT: v_cmp_eq_f64_e64 s26, 0, v[86:87] -; GFX11-NEXT: v_cmp_eq_f64_e64 s27, 0, v[144:145] -; GFX11-NEXT: v_cmp_eq_f64_e64 s28, 0, v[146:147] -; GFX11-NEXT: v_cndmask_b32_e64 v5, v5, v33, s6 -; GFX11-NEXT: v_dual_cndmask_b32 v7, v103, v7 :: v_dual_cndmask_b32 v6, v102, v6 -; GFX11-NEXT: v_cmp_class_f64_e64 vcc_lo, v[8:9], 64 -; GFX11-NEXT: v_cndmask_b32_e64 v4, v4, v32, s6 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4) -; 
GFX11-NEXT: v_cndmask_b32_e64 v7, v7, v35, s8 -; GFX11-NEXT: v_cndmask_b32_e64 v0, v96, v0, s4 -; GFX11-NEXT: v_cndmask_b32_e64 v2, v98, v2, s5 -; GFX11-NEXT: v_cndmask_b32_e64 v4, v100, v4, s7 -; GFX11-NEXT: v_cndmask_b32_e64 v1, v97, v1, s4 -; GFX11-NEXT: v_cndmask_b32_e64 v3, v99, v3, s5 -; GFX11-NEXT: v_cndmask_b32_e64 v5, v101, v5, s7 -; GFX11-NEXT: v_cndmask_b32_e64 v7, v103, v7, s9 -; GFX11-NEXT: v_dual_cndmask_b32 v9, v113, v9 :: v_dual_cndmask_b32 v8, v112, v8 -; GFX11-NEXT: v_cmp_class_f64_e64 vcc_lo, v[10:11], 64 -; GFX11-NEXT: v_cndmask_b32_e64 v6, v6, v34, s8 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cndmask_b32_e64 v9, v9, v37, s10 -; GFX11-NEXT: v_cndmask_b32_e64 v6, v102, v6, s9 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_cndmask_b32_e64 v9, v113, v9, s11 -; GFX11-NEXT: v_dual_cndmask_b32 v11, v115, v11 :: v_dual_cndmask_b32 v10, v114, v10 -; GFX11-NEXT: v_cmp_class_f64_e64 vcc_lo, v[12:13], 64 -; GFX11-NEXT: v_cndmask_b32_e64 v8, v8, v36, s10 -; GFX11-NEXT: v_cndmask_b32_e64 v11, v11, v39, s12 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cndmask_b32_e64 v8, v112, v8, s11 -; GFX11-NEXT: v_cndmask_b32_e64 v11, v115, v11, s13 -; GFX11-NEXT: v_dual_cndmask_b32 v13, v117, v13 :: v_dual_cndmask_b32 v12, v116, v12 -; GFX11-NEXT: v_cmp_class_f64_e64 vcc_lo, v[14:15], 64 -; GFX11-NEXT: v_cndmask_b32_e64 v10, v10, v38, s12 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cndmask_b32_e64 v13, v13, v49, s14 -; GFX11-NEXT: v_cndmask_b32_e64 v10, v114, v10, s13 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_cndmask_b32_e64 v13, v117, v13, s15 -; GFX11-NEXT: v_dual_cndmask_b32 v15, v119, v15 :: v_dual_cndmask_b32 v14, v118, v14 -; GFX11-NEXT: v_cmp_class_f64_e64 vcc_lo, 
v[16:17], 64 -; GFX11-NEXT: v_cndmask_b32_e64 v12, v12, v48, s14 -; GFX11-NEXT: v_cndmask_b32_e64 v15, v15, v51, s16 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cndmask_b32_e64 v12, v116, v12, s15 -; GFX11-NEXT: v_cndmask_b32_e64 v15, v119, v15, s17 -; GFX11-NEXT: v_dual_cndmask_b32 v17, v129, v17 :: v_dual_cndmask_b32 v16, v128, v16 -; GFX11-NEXT: v_cmp_class_f64_e64 vcc_lo, v[18:19], 64 -; GFX11-NEXT: v_cndmask_b32_e64 v14, v14, v50, s16 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cndmask_b32_e64 v17, v17, v53, s18 -; GFX11-NEXT: v_cndmask_b32_e64 v14, v118, v14, s17 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_cndmask_b32_e64 v17, v129, v17, s19 -; GFX11-NEXT: v_dual_cndmask_b32 v19, v131, v19 :: v_dual_cndmask_b32 v18, v130, v18 -; GFX11-NEXT: v_cmp_class_f64_e64 vcc_lo, v[20:21], 64 -; GFX11-NEXT: v_cndmask_b32_e64 v16, v16, v52, s18 -; GFX11-NEXT: v_cndmask_b32_e64 v19, v19, v55, s20 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cndmask_b32_e64 v16, v128, v16, s19 -; GFX11-NEXT: v_cndmask_b32_e64 v19, v131, v19, s22 -; GFX11-NEXT: v_dual_cndmask_b32 v21, v133, v21 :: v_dual_cndmask_b32 v20, v132, v20 -; GFX11-NEXT: v_cmp_class_f64_e64 vcc_lo, v[22:23], 64 -; GFX11-NEXT: v_cndmask_b32_e64 v18, v18, v54, s20 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cndmask_b32_e64 v21, v21, v65, s21 -; GFX11-NEXT: v_cndmask_b32_e64 v18, v130, v18, s22 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_cndmask_b32_e64 v21, v133, v21, s23 -; GFX11-NEXT: v_dual_cndmask_b32 v23, v135, v23 :: v_dual_cndmask_b32 v22, v134, v22 -; GFX11-NEXT: v_cmp_class_f64_e64 vcc_lo, v[24:25], 64 -; GFX11-NEXT: v_cndmask_b32_e64 v20, v20, v64, s21 -; 
GFX11-NEXT: v_cndmask_b32_e64 v20, v132, v20, s23 -; GFX11-NEXT: v_dual_cndmask_b32 v25, v85, v25 :: v_dual_cndmask_b32 v24, v84, v24 -; GFX11-NEXT: v_cmp_class_f64_e64 vcc_lo, v[26:27], 64 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_cndmask_b32_e64 v25, v25, v69, s0 -; GFX11-NEXT: v_cndmask_b32_e64 v25, v85, v25, s25 -; GFX11-NEXT: v_dual_cndmask_b32 v27, v87, v27 :: v_dual_cndmask_b32 v26, v86, v26 -; GFX11-NEXT: v_cmp_class_f64_e64 vcc_lo, v[28:29], 64 -; GFX11-NEXT: v_cndmask_b32_e64 v24, v24, v68, s0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cndmask_b32_e64 v27, v27, v71, s1 -; GFX11-NEXT: v_cndmask_b32_e64 v24, v84, v24, s25 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_cndmask_b32_e64 v27, v87, v27, s26 -; GFX11-NEXT: v_dual_cndmask_b32 v29, v145, v29 :: v_dual_cndmask_b32 v28, v144, v28 -; GFX11-NEXT: v_cmp_class_f64_e64 vcc_lo, v[30:31], 64 -; GFX11-NEXT: v_cndmask_b32_e64 v26, v26, v70, s1 -; GFX11-NEXT: v_cndmask_b32_e64 v29, v29, v81, s2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cndmask_b32_e64 v26, v86, v26, s26 -; GFX11-NEXT: v_cndmask_b32_e64 v29, v145, v29, s27 -; GFX11-NEXT: v_dual_cndmask_b32 v31, v147, v31 :: v_dual_cndmask_b32 v30, v146, v30 -; GFX11-NEXT: v_cmp_class_f64_e64 vcc_lo, v[66:67], 64 -; GFX11-NEXT: v_cndmask_b32_e64 v28, v28, v80, s2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cndmask_b32_e64 v31, v31, v83, s3 -; GFX11-NEXT: v_cndmask_b32_e64 v28, v144, v28, s27 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cndmask_b32_e64 v31, v147, v31, s28 -; GFX11-NEXT: v_dual_cndmask_b32 v23, v23, v67 :: v_dual_cndmask_b32 v22, v22, v66 -; GFX11-NEXT: v_cndmask_b32_e64 v30, v30, v82, s3 -; 
GFX11-NEXT: v_cndmask_b32_e64 v23, v135, v23, s24 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_cndmask_b32_e64 v22, v134, v22, s24 -; GFX11-NEXT: v_cndmask_b32_e64 v30, v146, v30, s28 +; GFX11-NEXT: v_max_f64 v[84:85], v[30:31], v[86:87] +; GFX11-NEXT: v_cmp_u_f64_e64 s14, v[30:31], v[86:87] +; GFX11-NEXT: v_cndmask_b32_e64 v0, v96, 0, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e64 v1, v97, 0x7ff80000, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e64 v2, v32, 0, s0 +; GFX11-NEXT: v_cndmask_b32_e64 v3, v33, 0x7ff80000, s0 +; GFX11-NEXT: v_cndmask_b32_e64 v4, v34, 0, s1 +; GFX11-NEXT: v_cndmask_b32_e64 v5, v35, 0x7ff80000, s1 +; GFX11-NEXT: v_cndmask_b32_e64 v6, v36, 0, s2 +; GFX11-NEXT: v_cndmask_b32_e64 v7, v37, 0x7ff80000, s2 +; GFX11-NEXT: v_cndmask_b32_e64 v8, v38, 0, s3 +; GFX11-NEXT: v_cndmask_b32_e64 v9, v39, 0x7ff80000, s3 +; GFX11-NEXT: v_cndmask_b32_e64 v10, v48, 0, s4 +; GFX11-NEXT: v_cndmask_b32_e64 v11, v49, 0x7ff80000, s4 +; GFX11-NEXT: v_cndmask_b32_e64 v12, v50, 0, s5 +; GFX11-NEXT: v_cndmask_b32_e64 v13, v51, 0x7ff80000, s5 +; GFX11-NEXT: v_cndmask_b32_e64 v14, v52, 0, s6 +; GFX11-NEXT: v_cndmask_b32_e64 v15, v53, 0x7ff80000, s6 +; GFX11-NEXT: v_cndmask_b32_e64 v16, v54, 0, s7 +; GFX11-NEXT: v_cndmask_b32_e64 v17, v55, 0x7ff80000, s7 +; GFX11-NEXT: v_cndmask_b32_e64 v18, v64, 0, s8 +; GFX11-NEXT: v_cndmask_b32_e64 v19, v65, 0x7ff80000, s8 +; GFX11-NEXT: v_cndmask_b32_e64 v20, v66, 0, s9 +; GFX11-NEXT: v_cndmask_b32_e64 v21, v67, 0x7ff80000, s9 +; GFX11-NEXT: v_cndmask_b32_e64 v22, v68, 0, s10 +; GFX11-NEXT: v_cndmask_b32_e64 v23, v69, 0x7ff80000, s10 +; GFX11-NEXT: v_cndmask_b32_e64 v24, v70, 0, s11 +; GFX11-NEXT: v_cndmask_b32_e64 v25, v71, 0x7ff80000, s11 +; GFX11-NEXT: v_cndmask_b32_e64 v26, v80, 0, s12 +; GFX11-NEXT: v_cndmask_b32_e64 v27, v81, 0x7ff80000, s12 +; GFX11-NEXT: v_cndmask_b32_e64 v28, v82, 0, s13 +; GFX11-NEXT: v_cndmask_b32_e64 v29, v83, 0x7ff80000, s13 +; GFX11-NEXT: v_cndmask_b32_e64 v30, 
v84, 0, s14 +; GFX11-NEXT: v_cndmask_b32_e64 v31, v85, 0x7ff80000, s14 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-LABEL: v_maximum_v16f64: diff --git a/llvm/test/CodeGen/AMDGPU/llvm.minimum.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.minimum.f16.ll index 95d351e..e00ebff 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.minimum.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.minimum.f16.ll @@ -443,28 +443,14 @@ define <2 x half> @v_minimum_v2f16(<2 x half> %src0, <2 x half> %src1) { ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v1 ; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v0 -; GFX8-NEXT: v_cmp_lt_f16_e32 vcc, v3, v2 -; GFX8-NEXT: v_cndmask_b32_e32 v4, v2, v3, vcc +; GFX8-NEXT: v_min_f16_e32 v4, v3, v2 ; GFX8-NEXT: v_mov_b32_e32 v5, 0x7e00 ; GFX8-NEXT: v_cmp_o_f16_e32 vcc, v3, v2 -; GFX8-NEXT: v_cndmask_b32_e32 v4, v5, v4, vcc -; GFX8-NEXT: v_cmp_class_f16_e64 vcc, v3, 32 -; GFX8-NEXT: v_cndmask_b32_e32 v3, v4, v3, vcc -; GFX8-NEXT: v_cmp_class_f16_e64 vcc, v2, 32 -; GFX8-NEXT: v_cndmask_b32_e32 v2, v3, v2, vcc -; GFX8-NEXT: v_cmp_eq_f16_e32 vcc, 0, v4 -; GFX8-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc -; GFX8-NEXT: v_cmp_lt_f16_e32 vcc, v0, v1 -; GFX8-NEXT: v_cndmask_b32_e32 v3, v1, v0, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v2, v5, v4, vcc +; GFX8-NEXT: v_min_f16_e32 v3, v0, v1 ; GFX8-NEXT: v_cmp_o_f16_e32 vcc, v0, v1 -; GFX8-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc -; GFX8-NEXT: v_cmp_class_f16_e64 vcc, v0, 32 -; GFX8-NEXT: v_cndmask_b32_e32 v0, v3, v0, vcc -; GFX8-NEXT: v_cmp_class_f16_e64 vcc, v1, 32 -; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc -; GFX8-NEXT: v_cmp_eq_f16_e32 vcc, 0, v3 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX8-NEXT: v_cndmask_b32_e32 v0, v3, v0, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v0, v5, v3, vcc ; GFX8-NEXT: v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX8-NEXT: s_setpc_b64 s[30:31] ; @@ -543,26 +529,9 @@ define <2 x half> @v_minimum_v2f16__nnan(<2 x half> 
%src0, <2 x half> %src1) { ; GFX8-LABEL: v_minimum_v2f16__nnan: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v1 -; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v0 -; GFX8-NEXT: v_cmp_lt_f16_e32 vcc, v3, v2 -; GFX8-NEXT: v_cndmask_b32_e32 v4, v2, v3, vcc -; GFX8-NEXT: v_cmp_class_f16_e64 vcc, v3, 32 -; GFX8-NEXT: v_cndmask_b32_e32 v3, v4, v3, vcc -; GFX8-NEXT: v_cmp_class_f16_e64 vcc, v2, 32 -; GFX8-NEXT: v_cndmask_b32_e32 v2, v3, v2, vcc -; GFX8-NEXT: v_cmp_eq_f16_e32 vcc, 0, v4 -; GFX8-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc -; GFX8-NEXT: v_cmp_lt_f16_e32 vcc, v0, v1 -; GFX8-NEXT: v_cndmask_b32_e32 v3, v1, v0, vcc -; GFX8-NEXT: v_cmp_class_f16_e64 vcc, v0, 32 -; GFX8-NEXT: v_cndmask_b32_e32 v0, v3, v0, vcc -; GFX8-NEXT: v_cmp_class_f16_e64 vcc, v1, 32 -; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc -; GFX8-NEXT: v_cmp_eq_f16_e32 vcc, 0, v3 -; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX8-NEXT: v_cndmask_b32_e32 v0, v3, v0, vcc -; GFX8-NEXT: v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_min_f16_sdwa v2, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_min_f16_e32 v0, v0, v1 +; GFX8-NEXT: v_or_b32_e32 v0, v0, v2 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_minimum_v2f16__nnan: @@ -608,13 +577,11 @@ define <2 x half> @v_minimum_v2f16__nsz(<2 x half> %src0, <2 x half> %src1) { ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v1 ; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v0 -; GFX8-NEXT: v_cmp_lt_f16_e32 vcc, v3, v2 -; GFX8-NEXT: v_cndmask_b32_e32 v4, v2, v3, vcc +; GFX8-NEXT: v_min_f16_e32 v4, v3, v2 ; GFX8-NEXT: v_mov_b32_e32 v5, 0x7e00 ; GFX8-NEXT: v_cmp_o_f16_e32 vcc, v3, v2 ; GFX8-NEXT: v_cndmask_b32_e32 v2, v5, v4, vcc -; GFX8-NEXT: v_cmp_lt_f16_e32 vcc, v0, v1 -; GFX8-NEXT: v_cndmask_b32_e32 v3, v1, v0, vcc +; GFX8-NEXT: v_min_f16_e32 v3, v0, v1 ; 
GFX8-NEXT: v_cmp_o_f16_e32 vcc, v0, v1 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v5, v3, vcc @@ -696,14 +663,9 @@ define <2 x half> @v_minimum_v2f16__nnan_nsz(<2 x half> %src0, <2 x half> %src1) ; GFX8-LABEL: v_minimum_v2f16__nnan_nsz: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v1 -; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v0 -; GFX8-NEXT: v_cmp_lt_f16_e32 vcc, v3, v2 -; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc -; GFX8-NEXT: v_cmp_lt_f16_e32 vcc, v0, v1 -; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX8-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc -; GFX8-NEXT: v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_min_f16_sdwa v2, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-NEXT: v_min_f16_e32 v0, v0, v1 +; GFX8-NEXT: v_or_b32_e32 v0, v0, v2 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_minimum_v2f16__nnan_nsz: @@ -750,31 +712,15 @@ define void @s_minimum_v2f16(<2 x half> inreg %src0, <2 x half> inreg %src1) { ; GFX8-NEXT: s_lshr_b32 s6, s5, 16 ; GFX8-NEXT: s_lshr_b32 s7, s4, 16 ; GFX8-NEXT: v_mov_b32_e32 v0, s6 -; GFX8-NEXT: v_mov_b32_e32 v1, s7 -; GFX8-NEXT: v_cmp_lt_f16_e32 vcc, s7, v0 -; GFX8-NEXT: v_cndmask_b32_e32 v2, v0, v1, vcc -; GFX8-NEXT: v_mov_b32_e32 v3, 0x7e00 +; GFX8-NEXT: v_min_f16_e32 v1, s7, v0 +; GFX8-NEXT: v_mov_b32_e32 v2, 0x7e00 ; GFX8-NEXT: v_cmp_o_f16_e32 vcc, s7, v0 -; GFX8-NEXT: v_cndmask_b32_e32 v2, v3, v2, vcc -; GFX8-NEXT: v_cmp_class_f16_e64 vcc, s7, 32 -; GFX8-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc -; GFX8-NEXT: v_cmp_class_f16_e64 vcc, s6, 32 -; GFX8-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc -; GFX8-NEXT: v_cmp_eq_f16_e32 vcc, 0, v2 +; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc ; GFX8-NEXT: v_mov_b32_e32 v1, s5 -; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc -; GFX8-NEXT: v_mov_b32_e32 v2, s4 -; GFX8-NEXT: v_cmp_lt_f16_e32 vcc, s4, v1 -; 
GFX8-NEXT: v_cndmask_b32_e32 v4, v1, v2, vcc +; GFX8-NEXT: v_min_f16_e32 v3, s4, v1 ; GFX8-NEXT: v_cmp_o_f16_e32 vcc, s4, v1 -; GFX8-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc -; GFX8-NEXT: v_cmp_class_f16_e64 vcc, s4, 32 -; GFX8-NEXT: v_cndmask_b32_e32 v2, v3, v2, vcc -; GFX8-NEXT: v_cmp_class_f16_e64 vcc, s5, 32 -; GFX8-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc -; GFX8-NEXT: v_cmp_eq_f16_e32 vcc, 0, v3 ; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX8-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc ; GFX8-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX8-NEXT: ;;#ASMSTART ; GFX8-NEXT: ; use v0 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.minimum.f32.ll b/llvm/test/CodeGen/AMDGPU/llvm.minimum.f32.ll index 1da2647..e0566820 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.minimum.f32.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.minimum.f32.ll @@ -495,167 +495,73 @@ define <2 x float> @v_minimum_v2f32(<2 x float> %src0, <2 x float> %src1) { ; GFX7-LABEL: v_minimum_v2f32: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_min_legacy_f32_e32 v4, v0, v2 +; GFX7-NEXT: v_min_f32_e32 v4, v0, v2 ; GFX7-NEXT: v_mov_b32_e32 v5, 0x7fc00000 ; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v0, v2 -; GFX7-NEXT: v_cndmask_b32_e32 v4, v5, v4, vcc -; GFX7-NEXT: v_cmp_class_f32_e64 vcc, v0, 32 -; GFX7-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc -; GFX7-NEXT: v_cmp_class_f32_e64 vcc, v2, 32 -; GFX7-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc -; GFX7-NEXT: v_cmp_eq_f32_e32 vcc, 0, v4 -; GFX7-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc -; GFX7-NEXT: v_min_legacy_f32_e32 v2, v1, v3 +; GFX7-NEXT: v_cndmask_b32_e32 v0, v5, v4, vcc +; GFX7-NEXT: v_min_f32_e32 v2, v1, v3 ; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v1, v3 -; GFX7-NEXT: v_cndmask_b32_e32 v2, v5, v2, vcc -; GFX7-NEXT: v_cmp_class_f32_e64 vcc, v1, 32 -; GFX7-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc -; GFX7-NEXT: v_cmp_class_f32_e64 vcc, v3, 32 -; 
GFX7-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc -; GFX7-NEXT: v_cmp_eq_f32_e32 vcc, 0, v2 -; GFX7-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v1, v5, v2, vcc ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_minimum_v2f32: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_cmp_lt_f32_e32 vcc, v0, v2 -; GFX8-NEXT: v_cndmask_b32_e32 v4, v2, v0, vcc +; GFX8-NEXT: v_min_f32_e32 v4, v0, v2 ; GFX8-NEXT: v_mov_b32_e32 v5, 0x7fc00000 ; GFX8-NEXT: v_cmp_o_f32_e32 vcc, v0, v2 -; GFX8-NEXT: v_cndmask_b32_e32 v4, v5, v4, vcc -; GFX8-NEXT: v_cmp_class_f32_e64 vcc, v0, 32 -; GFX8-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc -; GFX8-NEXT: v_cmp_class_f32_e64 vcc, v2, 32 -; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc -; GFX8-NEXT: v_cmp_eq_f32_e32 vcc, 0, v4 -; GFX8-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc -; GFX8-NEXT: v_cmp_lt_f32_e32 vcc, v1, v3 -; GFX8-NEXT: v_cndmask_b32_e32 v2, v3, v1, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v0, v5, v4, vcc +; GFX8-NEXT: v_min_f32_e32 v2, v1, v3 ; GFX8-NEXT: v_cmp_o_f32_e32 vcc, v1, v3 -; GFX8-NEXT: v_cndmask_b32_e32 v2, v5, v2, vcc -; GFX8-NEXT: v_cmp_class_f32_e64 vcc, v1, 32 -; GFX8-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc -; GFX8-NEXT: v_cmp_class_f32_e64 vcc, v3, 32 -; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc -; GFX8-NEXT: v_cmp_eq_f32_e32 vcc, 0, v2 -; GFX8-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v1, v5, v2, vcc ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_minimum_v2f32: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_cmp_lt_f32_e32 vcc, v0, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v4, v2, v0, vcc +; GFX9-NEXT: v_min_f32_e32 v4, v0, v2 ; GFX9-NEXT: v_mov_b32_e32 v5, 0x7fc00000 ; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v4, v5, v4, vcc -; GFX9-NEXT: v_cmp_class_f32_e64 vcc, v0, 32 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc -; GFX9-NEXT: v_cmp_class_f32_e64 vcc, v2, 32 -; 
GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc -; GFX9-NEXT: v_cmp_eq_f32_e32 vcc, 0, v4 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc -; GFX9-NEXT: v_cmp_lt_f32_e32 vcc, v1, v3 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v1, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v0, v5, v4, vcc +; GFX9-NEXT: v_min_f32_e32 v2, v1, v3 ; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v1, v3 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v5, v2, vcc -; GFX9-NEXT: v_cmp_class_f32_e64 vcc, v1, 32 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc -; GFX9-NEXT: v_cmp_class_f32_e64 vcc, v3, 32 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc -; GFX9-NEXT: v_cmp_eq_f32_e32 vcc, 0, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v2, vcc ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: v_minimum_v2f32: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_cmp_lt_f32_e32 vcc, v0, v2 +; GFX940-NEXT: v_min_f32_e32 v4, v0, v2 ; GFX940-NEXT: v_mov_b32_e32 v5, 0x7fc00000 -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e32 v4, v2, v0, vcc ; GFX940-NEXT: v_cmp_o_f32_e32 vcc, v0, v2 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v4, v5, v4, vcc -; GFX940-NEXT: v_cmp_class_f32_e64 vcc, v0, 32 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc -; GFX940-NEXT: v_cmp_class_f32_e64 vcc, v2, 32 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc -; GFX940-NEXT: v_cmp_eq_f32_e32 vcc, 0, v4 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc -; GFX940-NEXT: v_cmp_lt_f32_e32 vcc, v1, v3 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v2, v3, v1, vcc +; GFX940-NEXT: v_min_f32_e32 v2, v1, v3 +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_cndmask_b32_e32 v0, v5, v4, vcc ; GFX940-NEXT: v_cmp_o_f32_e32 vcc, v1, v3 ; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v2, v5, v2, vcc -; GFX940-NEXT: v_cmp_class_f32_e64 vcc, v1, 32 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: 
v_cndmask_b32_e32 v1, v2, v1, vcc -; GFX940-NEXT: v_cmp_class_f32_e64 vcc, v3, 32 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc -; GFX940-NEXT: v_cmp_eq_f32_e32 vcc, 0, v2 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc +; GFX940-NEXT: v_cndmask_b32_e32 v1, v5, v2, vcc ; GFX940-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_minimum_v2f32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_cmp_lt_f32_e32 vcc_lo, v0, v2 -; GFX10-NEXT: v_cndmask_b32_e32 v4, v2, v0, vcc_lo -; GFX10-NEXT: v_cmp_lt_f32_e32 vcc_lo, v1, v3 -; GFX10-NEXT: v_cndmask_b32_e32 v5, v3, v1, vcc_lo +; GFX10-NEXT: v_min_f32_e32 v4, v0, v2 ; GFX10-NEXT: v_cmp_o_f32_e32 vcc_lo, v0, v2 -; GFX10-NEXT: v_cndmask_b32_e32 v4, 0x7fc00000, v4, vcc_lo +; GFX10-NEXT: v_min_f32_e32 v5, v1, v3 +; GFX10-NEXT: v_cndmask_b32_e32 v0, 0x7fc00000, v4, vcc_lo ; GFX10-NEXT: v_cmp_o_f32_e32 vcc_lo, v1, v3 -; GFX10-NEXT: v_cndmask_b32_e32 v5, 0x7fc00000, v5, vcc_lo -; GFX10-NEXT: v_cmp_class_f32_e64 vcc_lo, v0, 32 -; GFX10-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc_lo -; GFX10-NEXT: v_cmp_class_f32_e64 vcc_lo, v1, 32 -; GFX10-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc_lo -; GFX10-NEXT: v_cmp_class_f32_e64 vcc_lo, v2, 32 -; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc_lo -; GFX10-NEXT: v_cmp_class_f32_e64 vcc_lo, v3, 32 -; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc_lo -; GFX10-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v4 -; GFX10-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc_lo -; GFX10-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v5 -; GFX10-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v1, 0x7fc00000, v5, vcc_lo ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: v_minimum_v2f32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_cmp_lt_f32_e32 vcc_lo, v0, v2 -; GFX11-NEXT: v_cndmask_b32_e32 v4, v2, v0, vcc_lo -; GFX11-NEXT: v_cmp_lt_f32_e32 vcc_lo, v1, v3 -; GFX11-NEXT: v_cndmask_b32_e32 
v5, v3, v1, vcc_lo +; GFX11-NEXT: v_dual_min_f32 v4, v0, v2 :: v_dual_min_f32 v5, v1, v3 ; GFX11-NEXT: v_cmp_o_f32_e32 vcc_lo, v0, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_cndmask_b32_e32 v4, 0x7fc00000, v4, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_cndmask_b32_e32 v0, 0x7fc00000, v4, vcc_lo ; GFX11-NEXT: v_cmp_o_f32_e32 vcc_lo, v1, v3 -; GFX11-NEXT: v_cndmask_b32_e32 v5, 0x7fc00000, v5, vcc_lo -; GFX11-NEXT: v_cmp_class_f32_e64 vcc_lo, v0, 32 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc_lo -; GFX11-NEXT: v_cmp_class_f32_e64 vcc_lo, v1, 32 -; GFX11-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc_lo -; GFX11-NEXT: v_cmp_class_f32_e64 vcc_lo, v2, 32 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc_lo -; GFX11-NEXT: v_cmp_class_f32_e64 vcc_lo, v3, 32 -; GFX11-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc_lo -; GFX11-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc_lo -; GFX11-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v5 -; GFX11-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e32 v1, 0x7fc00000, v5, vcc_lo ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-LABEL: v_minimum_v2f32: @@ -676,136 +582,42 @@ define <2 x float> @v_minimum_v2f32__nnan(<2 x float> %src0, <2 x float> %src1) ; GFX7-LABEL: v_minimum_v2f32__nnan: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_min_legacy_f32_e32 v4, v0, v2 -; GFX7-NEXT: v_cmp_class_f32_e64 vcc, v0, 32 -; GFX7-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc -; GFX7-NEXT: v_cmp_class_f32_e64 vcc, v2, 32 -; GFX7-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc -; GFX7-NEXT: 
v_cmp_eq_f32_e32 vcc, 0, v4 -; GFX7-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc -; GFX7-NEXT: v_min_legacy_f32_e32 v2, v1, v3 -; GFX7-NEXT: v_cmp_class_f32_e64 vcc, v1, 32 -; GFX7-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc -; GFX7-NEXT: v_cmp_class_f32_e64 vcc, v3, 32 -; GFX7-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc -; GFX7-NEXT: v_cmp_eq_f32_e32 vcc, 0, v2 -; GFX7-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc +; GFX7-NEXT: v_min_f32_e32 v0, v0, v2 +; GFX7-NEXT: v_min_f32_e32 v1, v1, v3 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_minimum_v2f32__nnan: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_cmp_lt_f32_e32 vcc, v0, v2 -; GFX8-NEXT: v_cndmask_b32_e32 v4, v2, v0, vcc -; GFX8-NEXT: v_cmp_class_f32_e64 vcc, v0, 32 -; GFX8-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc -; GFX8-NEXT: v_cmp_class_f32_e64 vcc, v2, 32 -; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc -; GFX8-NEXT: v_cmp_eq_f32_e32 vcc, 0, v4 -; GFX8-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc -; GFX8-NEXT: v_cmp_lt_f32_e32 vcc, v1, v3 -; GFX8-NEXT: v_cndmask_b32_e32 v2, v3, v1, vcc -; GFX8-NEXT: v_cmp_class_f32_e64 vcc, v1, 32 -; GFX8-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc -; GFX8-NEXT: v_cmp_class_f32_e64 vcc, v3, 32 -; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc -; GFX8-NEXT: v_cmp_eq_f32_e32 vcc, 0, v2 -; GFX8-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc +; GFX8-NEXT: v_min_f32_e32 v0, v0, v2 +; GFX8-NEXT: v_min_f32_e32 v1, v1, v3 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_minimum_v2f32__nnan: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_cmp_lt_f32_e32 vcc, v0, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v4, v2, v0, vcc -; GFX9-NEXT: v_cmp_class_f32_e64 vcc, v0, 32 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc -; GFX9-NEXT: v_cmp_class_f32_e64 vcc, v2, 32 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc -; GFX9-NEXT: v_cmp_eq_f32_e32 vcc, 0, v4 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc -; GFX9-NEXT: v_cmp_lt_f32_e32 vcc, v1, v3 -; 
GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v1, vcc -; GFX9-NEXT: v_cmp_class_f32_e64 vcc, v1, 32 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc -; GFX9-NEXT: v_cmp_class_f32_e64 vcc, v3, 32 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc -; GFX9-NEXT: v_cmp_eq_f32_e32 vcc, 0, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc +; GFX9-NEXT: v_min_f32_e32 v0, v0, v2 +; GFX9-NEXT: v_min_f32_e32 v1, v1, v3 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: v_minimum_v2f32__nnan: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_cmp_lt_f32_e32 vcc, v0, v2 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v4, v2, v0, vcc -; GFX940-NEXT: v_cmp_class_f32_e64 vcc, v0, 32 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc -; GFX940-NEXT: v_cmp_class_f32_e64 vcc, v2, 32 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc -; GFX940-NEXT: v_cmp_eq_f32_e32 vcc, 0, v4 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc -; GFX940-NEXT: v_cmp_lt_f32_e32 vcc, v1, v3 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v2, v3, v1, vcc -; GFX940-NEXT: v_cmp_class_f32_e64 vcc, v1, 32 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc -; GFX940-NEXT: v_cmp_class_f32_e64 vcc, v3, 32 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc -; GFX940-NEXT: v_cmp_eq_f32_e32 vcc, 0, v2 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc +; GFX940-NEXT: v_min_f32_e32 v0, v0, v2 +; GFX940-NEXT: v_min_f32_e32 v1, v1, v3 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_minimum_v2f32__nnan: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_cmp_lt_f32_e32 vcc_lo, v0, v2 -; GFX10-NEXT: v_cndmask_b32_e32 v4, v2, v0, vcc_lo -; GFX10-NEXT: v_cmp_lt_f32_e32 vcc_lo, v1, v3 -; GFX10-NEXT: v_cndmask_b32_e32 v5, v3, v1, vcc_lo -; GFX10-NEXT: v_cmp_class_f32_e64 vcc_lo, v0, 32 -; 
GFX10-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc_lo -; GFX10-NEXT: v_cmp_class_f32_e64 vcc_lo, v1, 32 -; GFX10-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc_lo -; GFX10-NEXT: v_cmp_class_f32_e64 vcc_lo, v2, 32 -; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc_lo -; GFX10-NEXT: v_cmp_class_f32_e64 vcc_lo, v3, 32 -; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc_lo -; GFX10-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v4 -; GFX10-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc_lo -; GFX10-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v5 -; GFX10-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc_lo +; GFX10-NEXT: v_min_f32_e32 v0, v0, v2 +; GFX10-NEXT: v_min_f32_e32 v1, v1, v3 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: v_minimum_v2f32__nnan: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_cmp_lt_f32_e32 vcc_lo, v0, v2 -; GFX11-NEXT: v_cndmask_b32_e32 v4, v2, v0, vcc_lo -; GFX11-NEXT: v_cmp_lt_f32_e32 vcc_lo, v1, v3 -; GFX11-NEXT: v_cndmask_b32_e32 v5, v3, v1, vcc_lo -; GFX11-NEXT: v_cmp_class_f32_e64 vcc_lo, v0, 32 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc_lo -; GFX11-NEXT: v_cmp_class_f32_e64 vcc_lo, v1, 32 -; GFX11-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc_lo -; GFX11-NEXT: v_cmp_class_f32_e64 vcc_lo, v2, 32 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc_lo -; GFX11-NEXT: v_cmp_class_f32_e64 vcc_lo, v3, 32 -; GFX11-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc_lo -; GFX11-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc_lo -; GFX11-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v5 -; GFX11-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc_lo +; GFX11-NEXT: v_dual_min_f32 v0, v0, v2 :: v_dual_min_f32 v1, v1, v3 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-LABEL: v_minimum_v2f32__nnan: @@ -826,11 
+638,11 @@ define <2 x float> @v_minimum_v2f32__nsz(<2 x float> %src0, <2 x float> %src1) { ; GFX7-LABEL: v_minimum_v2f32__nsz: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_min_legacy_f32_e32 v4, v0, v2 +; GFX7-NEXT: v_min_f32_e32 v4, v0, v2 ; GFX7-NEXT: v_mov_b32_e32 v5, 0x7fc00000 ; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v0, v2 ; GFX7-NEXT: v_cndmask_b32_e32 v0, v5, v4, vcc -; GFX7-NEXT: v_min_legacy_f32_e32 v2, v1, v3 +; GFX7-NEXT: v_min_f32_e32 v2, v1, v3 ; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v1, v3 ; GFX7-NEXT: v_cndmask_b32_e32 v1, v5, v2, vcc ; GFX7-NEXT: s_setpc_b64 s[30:31] @@ -838,13 +650,11 @@ define <2 x float> @v_minimum_v2f32__nsz(<2 x float> %src0, <2 x float> %src1) { ; GFX8-LABEL: v_minimum_v2f32__nsz: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_cmp_lt_f32_e32 vcc, v0, v2 -; GFX8-NEXT: v_cndmask_b32_e32 v4, v2, v0, vcc +; GFX8-NEXT: v_min_f32_e32 v4, v0, v2 ; GFX8-NEXT: v_mov_b32_e32 v5, 0x7fc00000 ; GFX8-NEXT: v_cmp_o_f32_e32 vcc, v0, v2 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v5, v4, vcc -; GFX8-NEXT: v_cmp_lt_f32_e32 vcc, v1, v3 -; GFX8-NEXT: v_cndmask_b32_e32 v2, v3, v1, vcc +; GFX8-NEXT: v_min_f32_e32 v2, v1, v3 ; GFX8-NEXT: v_cmp_o_f32_e32 vcc, v1, v3 ; GFX8-NEXT: v_cndmask_b32_e32 v1, v5, v2, vcc ; GFX8-NEXT: s_setpc_b64 s[30:31] @@ -852,13 +662,11 @@ define <2 x float> @v_minimum_v2f32__nsz(<2 x float> %src0, <2 x float> %src1) { ; GFX9-LABEL: v_minimum_v2f32__nsz: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_cmp_lt_f32_e32 vcc, v0, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v4, v2, v0, vcc +; GFX9-NEXT: v_min_f32_e32 v4, v0, v2 ; GFX9-NEXT: v_mov_b32_e32 v5, 0x7fc00000 ; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v2 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v5, v4, vcc -; GFX9-NEXT: v_cmp_lt_f32_e32 vcc, v1, v3 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v1, vcc +; GFX9-NEXT: v_min_f32_e32 v2, v1, v3 ; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v1, v3 ; GFX9-NEXT: 
v_cndmask_b32_e32 v1, v5, v2, vcc ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -866,16 +674,12 @@ define <2 x float> @v_minimum_v2f32__nsz(<2 x float> %src0, <2 x float> %src1) { ; GFX940-LABEL: v_minimum_v2f32__nsz: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_cmp_lt_f32_e32 vcc, v0, v2 +; GFX940-NEXT: v_min_f32_e32 v4, v0, v2 ; GFX940-NEXT: v_mov_b32_e32 v5, 0x7fc00000 -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e32 v4, v2, v0, vcc ; GFX940-NEXT: v_cmp_o_f32_e32 vcc, v0, v2 -; GFX940-NEXT: s_nop 1 +; GFX940-NEXT: v_min_f32_e32 v2, v1, v3 +; GFX940-NEXT: s_nop 0 ; GFX940-NEXT: v_cndmask_b32_e32 v0, v5, v4, vcc -; GFX940-NEXT: v_cmp_lt_f32_e32 vcc, v1, v3 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v2, v3, v1, vcc ; GFX940-NEXT: v_cmp_o_f32_e32 vcc, v1, v3 ; GFX940-NEXT: s_nop 1 ; GFX940-NEXT: v_cndmask_b32_e32 v1, v5, v2, vcc @@ -884,11 +688,9 @@ define <2 x float> @v_minimum_v2f32__nsz(<2 x float> %src0, <2 x float> %src1) { ; GFX10-LABEL: v_minimum_v2f32__nsz: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_cmp_lt_f32_e32 vcc_lo, v0, v2 -; GFX10-NEXT: v_cndmask_b32_e32 v4, v2, v0, vcc_lo -; GFX10-NEXT: v_cmp_lt_f32_e32 vcc_lo, v1, v3 -; GFX10-NEXT: v_cndmask_b32_e32 v5, v3, v1, vcc_lo +; GFX10-NEXT: v_min_f32_e32 v4, v0, v2 ; GFX10-NEXT: v_cmp_o_f32_e32 vcc_lo, v0, v2 +; GFX10-NEXT: v_min_f32_e32 v5, v1, v3 ; GFX10-NEXT: v_cndmask_b32_e32 v0, 0x7fc00000, v4, vcc_lo ; GFX10-NEXT: v_cmp_o_f32_e32 vcc_lo, v1, v3 ; GFX10-NEXT: v_cndmask_b32_e32 v1, 0x7fc00000, v5, vcc_lo @@ -897,12 +699,9 @@ define <2 x float> @v_minimum_v2f32__nsz(<2 x float> %src0, <2 x float> %src1) { ; GFX11-LABEL: v_minimum_v2f32__nsz: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_cmp_lt_f32_e32 vcc_lo, v0, v2 -; GFX11-NEXT: v_cndmask_b32_e32 v4, v2, v0, vcc_lo -; GFX11-NEXT: v_cmp_lt_f32_e32 vcc_lo, v1, v3 -; GFX11-NEXT: v_cndmask_b32_e32 v5, v3, 
v1, vcc_lo +; GFX11-NEXT: v_dual_min_f32 v4, v0, v2 :: v_dual_min_f32 v5, v1, v3 ; GFX11-NEXT: v_cmp_o_f32_e32 vcc_lo, v0, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_4) ; GFX11-NEXT: v_cndmask_b32_e32 v0, 0x7fc00000, v4, vcc_lo ; GFX11-NEXT: v_cmp_o_f32_e32 vcc_lo, v1, v3 ; GFX11-NEXT: v_cndmask_b32_e32 v1, 0x7fc00000, v5, vcc_lo @@ -926,55 +725,42 @@ define <2 x float> @v_minimum_v2f32__nnan_nsz(<2 x float> %src0, <2 x float> %sr ; GFX7-LABEL: v_minimum_v2f32__nnan_nsz: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_min_legacy_f32_e32 v0, v0, v2 -; GFX7-NEXT: v_min_legacy_f32_e32 v1, v1, v3 +; GFX7-NEXT: v_min_f32_e32 v0, v0, v2 +; GFX7-NEXT: v_min_f32_e32 v1, v1, v3 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_minimum_v2f32__nnan_nsz: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_cmp_lt_f32_e32 vcc, v0, v2 -; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc -; GFX8-NEXT: v_cmp_lt_f32_e32 vcc, v1, v3 -; GFX8-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc +; GFX8-NEXT: v_min_f32_e32 v0, v0, v2 +; GFX8-NEXT: v_min_f32_e32 v1, v1, v3 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_minimum_v2f32__nnan_nsz: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_cmp_lt_f32_e32 vcc, v0, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc -; GFX9-NEXT: v_cmp_lt_f32_e32 vcc, v1, v3 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc +; GFX9-NEXT: v_min_f32_e32 v0, v0, v2 +; GFX9-NEXT: v_min_f32_e32 v1, v1, v3 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: v_minimum_v2f32__nnan_nsz: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_cmp_lt_f32_e32 vcc, v0, v2 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc -; GFX940-NEXT: v_cmp_lt_f32_e32 vcc, v1, v3 -; GFX940-NEXT: 
s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc +; GFX940-NEXT: v_min_f32_e32 v0, v0, v2 +; GFX940-NEXT: v_min_f32_e32 v1, v1, v3 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_minimum_v2f32__nnan_nsz: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_cmp_lt_f32_e32 vcc_lo, v0, v2 -; GFX10-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc_lo -; GFX10-NEXT: v_cmp_lt_f32_e32 vcc_lo, v1, v3 -; GFX10-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc_lo +; GFX10-NEXT: v_min_f32_e32 v0, v0, v2 +; GFX10-NEXT: v_min_f32_e32 v1, v1, v3 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: v_minimum_v2f32__nnan_nsz: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_cmp_lt_f32_e32 vcc_lo, v0, v2 -; GFX11-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc_lo -; GFX11-NEXT: v_cmp_lt_f32_e32 vcc_lo, v1, v3 -; GFX11-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc_lo +; GFX11-NEXT: v_dual_min_f32 v0, v0, v2 :: v_dual_min_f32 v1, v1, v3 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-LABEL: v_minimum_v2f32__nnan_nsz: @@ -996,28 +782,14 @@ define void @s_minimum_v2f32(<2 x float> inreg %src0, <2 x float> inreg %src1) { ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s7 -; GFX7-NEXT: v_min_legacy_f32_e32 v1, s5, v0 +; GFX7-NEXT: v_min_f32_e32 v1, s5, v0 ; GFX7-NEXT: v_mov_b32_e32 v2, 0x7fc00000 ; GFX7-NEXT: v_cmp_o_f32_e32 vcc, s5, v0 -; GFX7-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc -; GFX7-NEXT: v_mov_b32_e32 v3, s5 -; GFX7-NEXT: v_cmp_class_f32_e64 vcc, s5, 32 -; GFX7-NEXT: v_cndmask_b32_e32 v3, v1, v3, vcc -; GFX7-NEXT: v_cmp_class_f32_e64 vcc, s7, 32 -; GFX7-NEXT: v_cndmask_b32_e32 v0, v3, v0, vcc -; GFX7-NEXT: v_cmp_eq_f32_e32 vcc, 0, v1 -; GFX7-NEXT: v_cndmask_b32_e32 v1, v1, v0, vcc ; GFX7-NEXT: v_mov_b32_e32 v0, s6 -; GFX7-NEXT: v_min_legacy_f32_e32 v3, s4, v0 +; GFX7-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc +; GFX7-NEXT: v_min_f32_e32 v3, s4, v0 ; GFX7-NEXT: v_cmp_o_f32_e32 
vcc, s4, v0 -; GFX7-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc -; GFX7-NEXT: v_mov_b32_e32 v3, s4 -; GFX7-NEXT: v_cmp_class_f32_e64 vcc, s4, 32 -; GFX7-NEXT: v_cndmask_b32_e32 v3, v2, v3, vcc -; GFX7-NEXT: v_cmp_class_f32_e64 vcc, s6, 32 -; GFX7-NEXT: v_cndmask_b32_e32 v0, v3, v0, vcc -; GFX7-NEXT: v_cmp_eq_f32_e32 vcc, 0, v2 -; GFX7-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc ; GFX7-NEXT: ;;#ASMSTART ; GFX7-NEXT: ; use v[0:1] ; GFX7-NEXT: ;;#ASMEND @@ -1027,30 +799,14 @@ define void @s_minimum_v2f32(<2 x float> inreg %src0, <2 x float> inreg %src1) { ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s7 -; GFX8-NEXT: v_mov_b32_e32 v1, s5 -; GFX8-NEXT: v_cmp_lt_f32_e32 vcc, s5, v0 -; GFX8-NEXT: v_cndmask_b32_e32 v2, v0, v1, vcc -; GFX8-NEXT: v_mov_b32_e32 v3, 0x7fc00000 +; GFX8-NEXT: v_min_f32_e32 v1, s5, v0 +; GFX8-NEXT: v_mov_b32_e32 v2, 0x7fc00000 ; GFX8-NEXT: v_cmp_o_f32_e32 vcc, s5, v0 -; GFX8-NEXT: v_cndmask_b32_e32 v2, v3, v2, vcc -; GFX8-NEXT: v_cmp_class_f32_e64 vcc, s5, 32 -; GFX8-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc -; GFX8-NEXT: v_cmp_class_f32_e64 vcc, s7, 32 -; GFX8-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc -; GFX8-NEXT: v_cmp_eq_f32_e32 vcc, 0, v2 -; GFX8-NEXT: v_cndmask_b32_e32 v1, v2, v0, vcc ; GFX8-NEXT: v_mov_b32_e32 v0, s6 -; GFX8-NEXT: v_mov_b32_e32 v2, s4 -; GFX8-NEXT: v_cmp_lt_f32_e32 vcc, s4, v0 -; GFX8-NEXT: v_cndmask_b32_e32 v4, v0, v2, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc +; GFX8-NEXT: v_min_f32_e32 v3, s4, v0 ; GFX8-NEXT: v_cmp_o_f32_e32 vcc, s4, v0 -; GFX8-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc -; GFX8-NEXT: v_cmp_class_f32_e64 vcc, s4, 32 -; GFX8-NEXT: v_cndmask_b32_e32 v2, v3, v2, vcc -; GFX8-NEXT: v_cmp_class_f32_e64 vcc, s6, 32 -; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc -; GFX8-NEXT: v_cmp_eq_f32_e32 vcc, 0, v3 -; GFX8-NEXT: v_cndmask_b32_e32 v0, v3, v0, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc ; GFX8-NEXT: 
;;#ASMSTART ; GFX8-NEXT: ; use v[0:1] ; GFX8-NEXT: ;;#ASMEND @@ -1060,30 +816,14 @@ define void @s_minimum_v2f32(<2 x float> inreg %src0, <2 x float> inreg %src1) { ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s7 -; GFX9-NEXT: v_mov_b32_e32 v1, s5 -; GFX9-NEXT: v_cmp_lt_f32_e32 vcc, s5, v0 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v0, v1, vcc -; GFX9-NEXT: v_mov_b32_e32 v3, 0x7fc00000 +; GFX9-NEXT: v_min_f32_e32 v1, s5, v0 +; GFX9-NEXT: v_mov_b32_e32 v2, 0x7fc00000 ; GFX9-NEXT: v_cmp_o_f32_e32 vcc, s5, v0 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v2, vcc -; GFX9-NEXT: v_cmp_class_f32_e64 vcc, s5, 32 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc -; GFX9-NEXT: v_cmp_class_f32_e64 vcc, s7, 32 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc -; GFX9-NEXT: v_cmp_eq_f32_e32 vcc, 0, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v0, vcc ; GFX9-NEXT: v_mov_b32_e32 v0, s6 -; GFX9-NEXT: v_mov_b32_e32 v2, s4 -; GFX9-NEXT: v_cmp_lt_f32_e32 vcc, s4, v0 -; GFX9-NEXT: v_cndmask_b32_e32 v4, v0, v2, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc +; GFX9-NEXT: v_min_f32_e32 v3, s4, v0 ; GFX9-NEXT: v_cmp_o_f32_e32 vcc, s4, v0 -; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc -; GFX9-NEXT: v_cmp_class_f32_e64 vcc, s4, 32 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v2, vcc -; GFX9-NEXT: v_cmp_class_f32_e64 vcc, s6, 32 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc -; GFX9-NEXT: v_cmp_eq_f32_e32 vcc, 0, v3 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v0, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc ; GFX9-NEXT: ;;#ASMSTART ; GFX9-NEXT: ; use v[0:1] ; GFX9-NEXT: ;;#ASMEND @@ -1093,40 +833,15 @@ define void @s_minimum_v2f32(<2 x float> inreg %src0, <2 x float> inreg %src1) { ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: v_mov_b32_e32 v0, s3 -; GFX940-NEXT: v_mov_b32_e32 v1, s1 -; GFX940-NEXT: v_cmp_lt_f32_e32 vcc, s1, v0 -; GFX940-NEXT: v_mov_b32_e32 v3, 0x7fc00000 -; GFX940-NEXT: s_nop 0 -; 
GFX940-NEXT: v_cndmask_b32_e32 v2, v0, v1, vcc +; GFX940-NEXT: v_min_f32_e32 v1, s1, v0 +; GFX940-NEXT: v_mov_b32_e32 v2, 0x7fc00000 ; GFX940-NEXT: v_cmp_o_f32_e32 vcc, s1, v0 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v2, v3, v2, vcc -; GFX940-NEXT: v_cmp_class_f32_e64 vcc, s1, 32 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc -; GFX940-NEXT: v_cmp_class_f32_e64 vcc, s3, 32 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc -; GFX940-NEXT: v_cmp_eq_f32_e32 vcc, 0, v2 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v1, v2, v0, vcc ; GFX940-NEXT: v_mov_b32_e32 v0, s2 -; GFX940-NEXT: v_mov_b32_e32 v2, s0 -; GFX940-NEXT: v_cmp_lt_f32_e32 vcc, s0, v0 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v4, v0, v2, vcc +; GFX940-NEXT: v_min_f32_e32 v3, s0, v0 +; GFX940-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc ; GFX940-NEXT: v_cmp_o_f32_e32 vcc, s0, v0 ; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc -; GFX940-NEXT: v_cmp_class_f32_e64 vcc, s0, 32 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v2, v3, v2, vcc -; GFX940-NEXT: v_cmp_class_f32_e64 vcc, s2, 32 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc -; GFX940-NEXT: v_cmp_eq_f32_e32 vcc, 0, v3 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v0, v3, v0, vcc +; GFX940-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc ; GFX940-NEXT: ;;#ASMSTART ; GFX940-NEXT: ; use v[0:1] ; GFX940-NEXT: ;;#ASMEND @@ -1135,28 +850,12 @@ define void @s_minimum_v2f32(<2 x float> inreg %src0, <2 x float> inreg %src1) { ; GFX10-LABEL: s_minimum_v2f32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v0, s5 -; GFX10-NEXT: v_cmp_lt_f32_e64 vcc_lo, s5, s7 -; GFX10-NEXT: v_mov_b32_e32 v1, s4 -; GFX10-NEXT: v_cmp_class_f32_e64 s8, s5, 32 -; GFX10-NEXT: v_cndmask_b32_e32 v0, s7, v0, vcc_lo -; GFX10-NEXT: v_cmp_lt_f32_e64 vcc_lo, s4, s6 -; GFX10-NEXT: 
v_cndmask_b32_e32 v1, s6, v1, vcc_lo +; GFX10-NEXT: v_min_f32_e64 v0, s5, s7 ; GFX10-NEXT: v_cmp_o_f32_e64 vcc_lo, s5, s7 -; GFX10-NEXT: v_cndmask_b32_e32 v0, 0x7fc00000, v0, vcc_lo +; GFX10-NEXT: v_min_f32_e64 v2, s4, s6 +; GFX10-NEXT: v_cndmask_b32_e32 v1, 0x7fc00000, v0, vcc_lo ; GFX10-NEXT: v_cmp_o_f32_e64 vcc_lo, s4, s6 -; GFX10-NEXT: v_cndmask_b32_e32 v2, 0x7fc00000, v1, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v1, v0, s5, s8 -; GFX10-NEXT: v_cmp_class_f32_e64 s5, s4, 32 -; GFX10-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v0 -; GFX10-NEXT: v_cndmask_b32_e64 v3, v2, s4, s5 -; GFX10-NEXT: v_cmp_class_f32_e64 s4, s7, 32 -; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, s7, s4 -; GFX10-NEXT: v_cmp_class_f32_e64 s4, s6, 32 -; GFX10-NEXT: v_cndmask_b32_e32 v1, v0, v1, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v3, v3, s6, s4 -; GFX10-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v2 -; GFX10-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v0, 0x7fc00000, v2, vcc_lo ; GFX10-NEXT: ;;#ASMSTART ; GFX10-NEXT: ; use v[0:1] ; GFX10-NEXT: ;;#ASMEND @@ -1165,32 +864,13 @@ define void @s_minimum_v2f32(<2 x float> inreg %src0, <2 x float> inreg %src1) { ; GFX11-LABEL: s_minimum_v2f32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_dual_mov_b32 v0, s1 :: v_dual_mov_b32 v1, s0 -; GFX11-NEXT: v_cmp_lt_f32_e64 vcc_lo, s1, s3 -; GFX11-NEXT: v_cmp_class_f32_e64 s4, s1, 32 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_cndmask_b32_e32 v0, s3, v0, vcc_lo -; GFX11-NEXT: v_cmp_lt_f32_e64 vcc_lo, s0, s2 -; GFX11-NEXT: v_cndmask_b32_e32 v1, s2, v1, vcc_lo +; GFX11-NEXT: v_min_f32_e64 v0, s1, s3 ; GFX11-NEXT: v_cmp_o_f32_e64 vcc_lo, s1, s3 -; GFX11-NEXT: v_cndmask_b32_e32 v0, 0x7fc00000, v0, vcc_lo +; GFX11-NEXT: v_min_f32_e64 v2, s0, s2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_cndmask_b32_e32 v1, 0x7fc00000, v0, vcc_lo ; GFX11-NEXT: 
v_cmp_o_f32_e64 vcc_lo, s0, s2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_cndmask_b32_e32 v2, 0x7fc00000, v1, vcc_lo -; GFX11-NEXT: v_cndmask_b32_e64 v1, v0, s1, s4 -; GFX11-NEXT: v_cmp_class_f32_e64 s1, s0, 32 -; GFX11-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_cndmask_b32_e64 v3, v2, s0, s1 -; GFX11-NEXT: v_cmp_class_f32_e64 s0, s3, 32 -; GFX11-NEXT: v_cndmask_b32_e64 v1, v1, s3, s0 -; GFX11-NEXT: v_cmp_class_f32_e64 s0, s2, 32 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_cndmask_b32_e64 v3, v3, s2, s0 -; GFX11-NEXT: v_cndmask_b32_e32 v1, v0, v1, vcc_lo -; GFX11-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) -; GFX11-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e32 v0, 0x7fc00000, v2, vcc_lo ; GFX11-NEXT: ;;#ASMSTART ; GFX11-NEXT: ; use v[0:1] ; GFX11-NEXT: ;;#ASMEND @@ -1218,227 +898,92 @@ define <3 x float> @v_minimum_v3f32(<3 x float> %src0, <3 x float> %src1) { ; GFX7-LABEL: v_minimum_v3f32: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_min_legacy_f32_e32 v6, v0, v3 +; GFX7-NEXT: v_min_f32_e32 v6, v0, v3 ; GFX7-NEXT: v_mov_b32_e32 v7, 0x7fc00000 ; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v0, v3 -; GFX7-NEXT: v_cndmask_b32_e32 v6, v7, v6, vcc -; GFX7-NEXT: v_cmp_class_f32_e64 vcc, v0, 32 -; GFX7-NEXT: v_cndmask_b32_e32 v0, v6, v0, vcc -; GFX7-NEXT: v_cmp_class_f32_e64 vcc, v3, 32 -; GFX7-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc -; GFX7-NEXT: v_cmp_eq_f32_e32 vcc, 0, v6 -; GFX7-NEXT: v_cndmask_b32_e32 v0, v6, v0, vcc -; GFX7-NEXT: v_min_legacy_f32_e32 v3, v1, v4 +; GFX7-NEXT: v_cndmask_b32_e32 v0, v7, v6, vcc +; GFX7-NEXT: v_min_f32_e32 v3, v1, v4 ; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v1, v4 -; GFX7-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc -; GFX7-NEXT: 
v_cmp_class_f32_e64 vcc, v1, 32 -; GFX7-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc -; GFX7-NEXT: v_cmp_class_f32_e64 vcc, v4, 32 -; GFX7-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc -; GFX7-NEXT: v_cmp_eq_f32_e32 vcc, 0, v3 -; GFX7-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc -; GFX7-NEXT: v_min_legacy_f32_e32 v3, v2, v5 +; GFX7-NEXT: v_cndmask_b32_e32 v1, v7, v3, vcc +; GFX7-NEXT: v_min_f32_e32 v3, v2, v5 ; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v2, v5 -; GFX7-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc -; GFX7-NEXT: v_cmp_class_f32_e64 vcc, v2, 32 -; GFX7-NEXT: v_cndmask_b32_e32 v2, v3, v2, vcc -; GFX7-NEXT: v_cmp_class_f32_e64 vcc, v5, 32 -; GFX7-NEXT: v_cndmask_b32_e32 v2, v2, v5, vcc -; GFX7-NEXT: v_cmp_eq_f32_e32 vcc, 0, v3 -; GFX7-NEXT: v_cndmask_b32_e32 v2, v3, v2, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v2, v7, v3, vcc ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_minimum_v3f32: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_cmp_lt_f32_e32 vcc, v0, v3 -; GFX8-NEXT: v_cndmask_b32_e32 v6, v3, v0, vcc +; GFX8-NEXT: v_min_f32_e32 v6, v0, v3 ; GFX8-NEXT: v_mov_b32_e32 v7, 0x7fc00000 ; GFX8-NEXT: v_cmp_o_f32_e32 vcc, v0, v3 -; GFX8-NEXT: v_cndmask_b32_e32 v6, v7, v6, vcc -; GFX8-NEXT: v_cmp_class_f32_e64 vcc, v0, 32 -; GFX8-NEXT: v_cndmask_b32_e32 v0, v6, v0, vcc -; GFX8-NEXT: v_cmp_class_f32_e64 vcc, v3, 32 -; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc -; GFX8-NEXT: v_cmp_eq_f32_e32 vcc, 0, v6 -; GFX8-NEXT: v_cndmask_b32_e32 v0, v6, v0, vcc -; GFX8-NEXT: v_cmp_lt_f32_e32 vcc, v1, v4 -; GFX8-NEXT: v_cndmask_b32_e32 v3, v4, v1, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v0, v7, v6, vcc +; GFX8-NEXT: v_min_f32_e32 v3, v1, v4 ; GFX8-NEXT: v_cmp_o_f32_e32 vcc, v1, v4 -; GFX8-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc -; GFX8-NEXT: v_cmp_class_f32_e64 vcc, v1, 32 -; GFX8-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc -; GFX8-NEXT: v_cmp_class_f32_e64 vcc, v4, 32 -; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc -; GFX8-NEXT: v_cmp_eq_f32_e32 vcc, 0, v3 -; GFX8-NEXT: 
v_cndmask_b32_e32 v1, v3, v1, vcc -; GFX8-NEXT: v_cmp_lt_f32_e32 vcc, v2, v5 -; GFX8-NEXT: v_cndmask_b32_e32 v3, v5, v2, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v1, v7, v3, vcc +; GFX8-NEXT: v_min_f32_e32 v3, v2, v5 ; GFX8-NEXT: v_cmp_o_f32_e32 vcc, v2, v5 -; GFX8-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc -; GFX8-NEXT: v_cmp_class_f32_e64 vcc, v2, 32 -; GFX8-NEXT: v_cndmask_b32_e32 v2, v3, v2, vcc -; GFX8-NEXT: v_cmp_class_f32_e64 vcc, v5, 32 -; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v5, vcc -; GFX8-NEXT: v_cmp_eq_f32_e32 vcc, 0, v3 -; GFX8-NEXT: v_cndmask_b32_e32 v2, v3, v2, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v2, v7, v3, vcc ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_minimum_v3f32: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_cmp_lt_f32_e32 vcc, v0, v3 -; GFX9-NEXT: v_cndmask_b32_e32 v6, v3, v0, vcc +; GFX9-NEXT: v_min_f32_e32 v6, v0, v3 ; GFX9-NEXT: v_mov_b32_e32 v7, 0x7fc00000 ; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v3 -; GFX9-NEXT: v_cndmask_b32_e32 v6, v7, v6, vcc -; GFX9-NEXT: v_cmp_class_f32_e64 vcc, v0, 32 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v6, v0, vcc -; GFX9-NEXT: v_cmp_class_f32_e64 vcc, v3, 32 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc -; GFX9-NEXT: v_cmp_eq_f32_e32 vcc, 0, v6 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v6, v0, vcc -; GFX9-NEXT: v_cmp_lt_f32_e32 vcc, v1, v4 -; GFX9-NEXT: v_cndmask_b32_e32 v3, v4, v1, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v0, v7, v6, vcc +; GFX9-NEXT: v_min_f32_e32 v3, v1, v4 ; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v1, v4 -; GFX9-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc -; GFX9-NEXT: v_cmp_class_f32_e64 vcc, v1, 32 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc -; GFX9-NEXT: v_cmp_class_f32_e64 vcc, v4, 32 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc -; GFX9-NEXT: v_cmp_eq_f32_e32 vcc, 0, v3 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc -; GFX9-NEXT: v_cmp_lt_f32_e32 vcc, v2, v5 -; GFX9-NEXT: v_cndmask_b32_e32 v3, v5, v2, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v1, v7, v3, vcc +; 
GFX9-NEXT: v_min_f32_e32 v3, v2, v5 ; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v2, v5 -; GFX9-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc -; GFX9-NEXT: v_cmp_class_f32_e64 vcc, v2, 32 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v2, vcc -; GFX9-NEXT: v_cmp_class_f32_e64 vcc, v5, 32 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v5, vcc -; GFX9-NEXT: v_cmp_eq_f32_e32 vcc, 0, v3 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v2, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v2, v7, v3, vcc ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: v_minimum_v3f32: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_cmp_lt_f32_e32 vcc, v0, v3 +; GFX940-NEXT: v_min_f32_e32 v6, v0, v3 ; GFX940-NEXT: v_mov_b32_e32 v7, 0x7fc00000 -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e32 v6, v3, v0, vcc ; GFX940-NEXT: v_cmp_o_f32_e32 vcc, v0, v3 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v6, v7, v6, vcc -; GFX940-NEXT: v_cmp_class_f32_e64 vcc, v0, 32 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v0, v6, v0, vcc -; GFX940-NEXT: v_cmp_class_f32_e64 vcc, v3, 32 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc -; GFX940-NEXT: v_cmp_eq_f32_e32 vcc, 0, v6 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v0, v6, v0, vcc -; GFX940-NEXT: v_cmp_lt_f32_e32 vcc, v1, v4 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v3, v4, v1, vcc +; GFX940-NEXT: v_min_f32_e32 v3, v1, v4 +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_cndmask_b32_e32 v0, v7, v6, vcc ; GFX940-NEXT: v_cmp_o_f32_e32 vcc, v1, v4 ; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc -; GFX940-NEXT: v_cmp_class_f32_e64 vcc, v1, 32 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc -; GFX940-NEXT: v_cmp_class_f32_e64 vcc, v4, 32 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc -; GFX940-NEXT: v_cmp_eq_f32_e32 vcc, 0, v3 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc -; 
GFX940-NEXT: v_cmp_lt_f32_e32 vcc, v2, v5 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v3, v5, v2, vcc +; GFX940-NEXT: v_cndmask_b32_e32 v1, v7, v3, vcc +; GFX940-NEXT: v_min_f32_e32 v3, v2, v5 ; GFX940-NEXT: v_cmp_o_f32_e32 vcc, v2, v5 ; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc -; GFX940-NEXT: v_cmp_class_f32_e64 vcc, v2, 32 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v2, v3, v2, vcc -; GFX940-NEXT: v_cmp_class_f32_e64 vcc, v5, 32 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v2, v2, v5, vcc -; GFX940-NEXT: v_cmp_eq_f32_e32 vcc, 0, v3 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v2, v3, v2, vcc +; GFX940-NEXT: v_cndmask_b32_e32 v2, v7, v3, vcc ; GFX940-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_minimum_v3f32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_cmp_lt_f32_e32 vcc_lo, v0, v3 -; GFX10-NEXT: v_cndmask_b32_e32 v6, v3, v0, vcc_lo -; GFX10-NEXT: v_cmp_lt_f32_e32 vcc_lo, v1, v4 -; GFX10-NEXT: v_cndmask_b32_e32 v7, v4, v1, vcc_lo -; GFX10-NEXT: v_cmp_lt_f32_e32 vcc_lo, v2, v5 -; GFX10-NEXT: v_cndmask_b32_e32 v8, v5, v2, vcc_lo +; GFX10-NEXT: v_min_f32_e32 v6, v0, v3 ; GFX10-NEXT: v_cmp_o_f32_e32 vcc_lo, v0, v3 -; GFX10-NEXT: v_cndmask_b32_e32 v6, 0x7fc00000, v6, vcc_lo +; GFX10-NEXT: v_min_f32_e32 v7, v1, v4 +; GFX10-NEXT: v_min_f32_e32 v8, v2, v5 +; GFX10-NEXT: v_cndmask_b32_e32 v0, 0x7fc00000, v6, vcc_lo ; GFX10-NEXT: v_cmp_o_f32_e32 vcc_lo, v1, v4 -; GFX10-NEXT: v_cndmask_b32_e32 v7, 0x7fc00000, v7, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v1, 0x7fc00000, v7, vcc_lo ; GFX10-NEXT: v_cmp_o_f32_e32 vcc_lo, v2, v5 -; GFX10-NEXT: v_cndmask_b32_e32 v8, 0x7fc00000, v8, vcc_lo -; GFX10-NEXT: v_cmp_class_f32_e64 vcc_lo, v0, 32 -; GFX10-NEXT: v_cndmask_b32_e32 v0, v6, v0, vcc_lo -; GFX10-NEXT: v_cmp_class_f32_e64 vcc_lo, v1, 32 -; GFX10-NEXT: v_cndmask_b32_e32 v1, v7, v1, vcc_lo -; GFX10-NEXT: v_cmp_class_f32_e64 vcc_lo, v2, 32 -; GFX10-NEXT: 
v_cndmask_b32_e32 v2, v8, v2, vcc_lo -; GFX10-NEXT: v_cmp_class_f32_e64 vcc_lo, v3, 32 -; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc_lo -; GFX10-NEXT: v_cmp_class_f32_e64 vcc_lo, v4, 32 -; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc_lo -; GFX10-NEXT: v_cmp_class_f32_e64 vcc_lo, v5, 32 -; GFX10-NEXT: v_cndmask_b32_e32 v2, v2, v5, vcc_lo -; GFX10-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v6 -; GFX10-NEXT: v_cndmask_b32_e32 v0, v6, v0, vcc_lo -; GFX10-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v7 -; GFX10-NEXT: v_cndmask_b32_e32 v1, v7, v1, vcc_lo -; GFX10-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v8 -; GFX10-NEXT: v_cndmask_b32_e32 v2, v8, v2, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v2, 0x7fc00000, v8, vcc_lo ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: v_minimum_v3f32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_cmp_lt_f32_e32 vcc_lo, v0, v3 -; GFX11-NEXT: v_cndmask_b32_e32 v6, v3, v0, vcc_lo -; GFX11-NEXT: v_cmp_lt_f32_e32 vcc_lo, v1, v4 -; GFX11-NEXT: v_cndmask_b32_e32 v7, v4, v1, vcc_lo -; GFX11-NEXT: v_cmp_lt_f32_e32 vcc_lo, v2, v5 -; GFX11-NEXT: v_cndmask_b32_e32 v8, v5, v2, vcc_lo +; GFX11-NEXT: v_dual_min_f32 v6, v0, v3 :: v_dual_min_f32 v7, v1, v4 ; GFX11-NEXT: v_cmp_o_f32_e32 vcc_lo, v0, v3 -; GFX11-NEXT: v_cndmask_b32_e32 v6, 0x7fc00000, v6, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_cndmask_b32_e32 v0, 0x7fc00000, v6, vcc_lo ; GFX11-NEXT: v_cmp_o_f32_e32 vcc_lo, v1, v4 -; GFX11-NEXT: v_cndmask_b32_e32 v7, 0x7fc00000, v7, vcc_lo +; GFX11-NEXT: v_dual_min_f32 v8, v2, v5 :: v_dual_cndmask_b32 v1, 0x7fc00000, v7 ; GFX11-NEXT: v_cmp_o_f32_e32 vcc_lo, v2, v5 -; GFX11-NEXT: v_cndmask_b32_e32 v8, 0x7fc00000, v8, vcc_lo -; GFX11-NEXT: v_cmp_class_f32_e64 vcc_lo, v0, 32 -; GFX11-NEXT: v_cndmask_b32_e32 v0, v6, v0, vcc_lo -; GFX11-NEXT: v_cmp_class_f32_e64 vcc_lo, v1, 32 -; GFX11-NEXT: v_cndmask_b32_e32 v1, v7, v1, vcc_lo -; GFX11-NEXT: v_cmp_class_f32_e64 vcc_lo, 
v2, 32 -; GFX11-NEXT: v_cndmask_b32_e32 v2, v8, v2, vcc_lo -; GFX11-NEXT: v_cmp_class_f32_e64 vcc_lo, v3, 32 -; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc_lo -; GFX11-NEXT: v_cmp_class_f32_e64 vcc_lo, v4, 32 -; GFX11-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc_lo -; GFX11-NEXT: v_cmp_class_f32_e64 vcc_lo, v5, 32 -; GFX11-NEXT: v_cndmask_b32_e32 v2, v2, v5, vcc_lo -; GFX11-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v6 -; GFX11-NEXT: v_cndmask_b32_e32 v0, v6, v0, vcc_lo -; GFX11-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v7 -; GFX11-NEXT: v_cndmask_b32_e32 v1, v7, v1, vcc_lo -; GFX11-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v8 -; GFX11-NEXT: v_cndmask_b32_e32 v2, v8, v2, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: v_cndmask_b32_e32 v2, 0x7fc00000, v8, vcc_lo ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-LABEL: v_minimum_v3f32: @@ -1460,184 +1005,48 @@ define <3 x float> @v_minimum_v3f32__nnan(<3 x float> %src0, <3 x float> %src1) ; GFX7-LABEL: v_minimum_v3f32__nnan: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_min_legacy_f32_e32 v6, v0, v3 -; GFX7-NEXT: v_cmp_class_f32_e64 vcc, v0, 32 -; GFX7-NEXT: v_cndmask_b32_e32 v0, v6, v0, vcc -; GFX7-NEXT: v_cmp_class_f32_e64 vcc, v3, 32 -; GFX7-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc -; GFX7-NEXT: v_cmp_eq_f32_e32 vcc, 0, v6 -; GFX7-NEXT: v_cndmask_b32_e32 v0, v6, v0, vcc -; GFX7-NEXT: v_min_legacy_f32_e32 v3, v1, v4 -; GFX7-NEXT: v_cmp_class_f32_e64 vcc, v1, 32 -; GFX7-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc -; GFX7-NEXT: v_cmp_class_f32_e64 vcc, v4, 32 -; GFX7-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc -; GFX7-NEXT: v_cmp_eq_f32_e32 vcc, 0, v3 -; GFX7-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc -; GFX7-NEXT: v_min_legacy_f32_e32 v3, v2, v5 -; GFX7-NEXT: v_cmp_class_f32_e64 vcc, v2, 32 -; GFX7-NEXT: v_cndmask_b32_e32 v2, v3, v2, vcc -; GFX7-NEXT: v_cmp_class_f32_e64 vcc, v5, 32 -; GFX7-NEXT: v_cndmask_b32_e32 v2, v2, v5, vcc -; GFX7-NEXT: v_cmp_eq_f32_e32 vcc, 0, v3 -; GFX7-NEXT: 
v_cndmask_b32_e32 v2, v3, v2, vcc +; GFX7-NEXT: v_min_f32_e32 v0, v0, v3 +; GFX7-NEXT: v_min_f32_e32 v1, v1, v4 +; GFX7-NEXT: v_min_f32_e32 v2, v2, v5 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_minimum_v3f32__nnan: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_cmp_lt_f32_e32 vcc, v0, v3 -; GFX8-NEXT: v_cndmask_b32_e32 v6, v3, v0, vcc -; GFX8-NEXT: v_cmp_class_f32_e64 vcc, v0, 32 -; GFX8-NEXT: v_cndmask_b32_e32 v0, v6, v0, vcc -; GFX8-NEXT: v_cmp_class_f32_e64 vcc, v3, 32 -; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc -; GFX8-NEXT: v_cmp_eq_f32_e32 vcc, 0, v6 -; GFX8-NEXT: v_cndmask_b32_e32 v0, v6, v0, vcc -; GFX8-NEXT: v_cmp_lt_f32_e32 vcc, v1, v4 -; GFX8-NEXT: v_cndmask_b32_e32 v3, v4, v1, vcc -; GFX8-NEXT: v_cmp_class_f32_e64 vcc, v1, 32 -; GFX8-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc -; GFX8-NEXT: v_cmp_class_f32_e64 vcc, v4, 32 -; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc -; GFX8-NEXT: v_cmp_eq_f32_e32 vcc, 0, v3 -; GFX8-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc -; GFX8-NEXT: v_cmp_lt_f32_e32 vcc, v2, v5 -; GFX8-NEXT: v_cndmask_b32_e32 v3, v5, v2, vcc -; GFX8-NEXT: v_cmp_class_f32_e64 vcc, v2, 32 -; GFX8-NEXT: v_cndmask_b32_e32 v2, v3, v2, vcc -; GFX8-NEXT: v_cmp_class_f32_e64 vcc, v5, 32 -; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v5, vcc -; GFX8-NEXT: v_cmp_eq_f32_e32 vcc, 0, v3 -; GFX8-NEXT: v_cndmask_b32_e32 v2, v3, v2, vcc +; GFX8-NEXT: v_min_f32_e32 v0, v0, v3 +; GFX8-NEXT: v_min_f32_e32 v1, v1, v4 +; GFX8-NEXT: v_min_f32_e32 v2, v2, v5 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_minimum_v3f32__nnan: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_cmp_lt_f32_e32 vcc, v0, v3 -; GFX9-NEXT: v_cndmask_b32_e32 v6, v3, v0, vcc -; GFX9-NEXT: v_cmp_class_f32_e64 vcc, v0, 32 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v6, v0, vcc -; GFX9-NEXT: v_cmp_class_f32_e64 vcc, v3, 32 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc -; GFX9-NEXT: v_cmp_eq_f32_e32 vcc, 0, v6 -; GFX9-NEXT: 
v_cndmask_b32_e32 v0, v6, v0, vcc -; GFX9-NEXT: v_cmp_lt_f32_e32 vcc, v1, v4 -; GFX9-NEXT: v_cndmask_b32_e32 v3, v4, v1, vcc -; GFX9-NEXT: v_cmp_class_f32_e64 vcc, v1, 32 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc -; GFX9-NEXT: v_cmp_class_f32_e64 vcc, v4, 32 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc -; GFX9-NEXT: v_cmp_eq_f32_e32 vcc, 0, v3 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc -; GFX9-NEXT: v_cmp_lt_f32_e32 vcc, v2, v5 -; GFX9-NEXT: v_cndmask_b32_e32 v3, v5, v2, vcc -; GFX9-NEXT: v_cmp_class_f32_e64 vcc, v2, 32 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v2, vcc -; GFX9-NEXT: v_cmp_class_f32_e64 vcc, v5, 32 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v5, vcc -; GFX9-NEXT: v_cmp_eq_f32_e32 vcc, 0, v3 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v2, vcc +; GFX9-NEXT: v_min_f32_e32 v0, v0, v3 +; GFX9-NEXT: v_min_f32_e32 v1, v1, v4 +; GFX9-NEXT: v_min_f32_e32 v2, v2, v5 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: v_minimum_v3f32__nnan: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_cmp_lt_f32_e32 vcc, v0, v3 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v6, v3, v0, vcc -; GFX940-NEXT: v_cmp_class_f32_e64 vcc, v0, 32 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v0, v6, v0, vcc -; GFX940-NEXT: v_cmp_class_f32_e64 vcc, v3, 32 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc -; GFX940-NEXT: v_cmp_eq_f32_e32 vcc, 0, v6 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v0, v6, v0, vcc -; GFX940-NEXT: v_cmp_lt_f32_e32 vcc, v1, v4 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v3, v4, v1, vcc -; GFX940-NEXT: v_cmp_class_f32_e64 vcc, v1, 32 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc -; GFX940-NEXT: v_cmp_class_f32_e64 vcc, v4, 32 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc -; GFX940-NEXT: v_cmp_eq_f32_e32 vcc, 0, v3 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc 
-; GFX940-NEXT: v_cmp_lt_f32_e32 vcc, v2, v5 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v3, v5, v2, vcc -; GFX940-NEXT: v_cmp_class_f32_e64 vcc, v2, 32 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v2, v3, v2, vcc -; GFX940-NEXT: v_cmp_class_f32_e64 vcc, v5, 32 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v2, v2, v5, vcc -; GFX940-NEXT: v_cmp_eq_f32_e32 vcc, 0, v3 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v2, v3, v2, vcc +; GFX940-NEXT: v_min_f32_e32 v0, v0, v3 +; GFX940-NEXT: v_min_f32_e32 v1, v1, v4 +; GFX940-NEXT: v_min_f32_e32 v2, v2, v5 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_minimum_v3f32__nnan: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_cmp_lt_f32_e32 vcc_lo, v0, v3 -; GFX10-NEXT: v_cndmask_b32_e32 v6, v3, v0, vcc_lo -; GFX10-NEXT: v_cmp_lt_f32_e32 vcc_lo, v1, v4 -; GFX10-NEXT: v_cndmask_b32_e32 v7, v4, v1, vcc_lo -; GFX10-NEXT: v_cmp_lt_f32_e32 vcc_lo, v2, v5 -; GFX10-NEXT: v_cndmask_b32_e32 v8, v5, v2, vcc_lo -; GFX10-NEXT: v_cmp_class_f32_e64 vcc_lo, v0, 32 -; GFX10-NEXT: v_cndmask_b32_e32 v0, v6, v0, vcc_lo -; GFX10-NEXT: v_cmp_class_f32_e64 vcc_lo, v1, 32 -; GFX10-NEXT: v_cndmask_b32_e32 v1, v7, v1, vcc_lo -; GFX10-NEXT: v_cmp_class_f32_e64 vcc_lo, v2, 32 -; GFX10-NEXT: v_cndmask_b32_e32 v2, v8, v2, vcc_lo -; GFX10-NEXT: v_cmp_class_f32_e64 vcc_lo, v3, 32 -; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc_lo -; GFX10-NEXT: v_cmp_class_f32_e64 vcc_lo, v4, 32 -; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc_lo -; GFX10-NEXT: v_cmp_class_f32_e64 vcc_lo, v5, 32 -; GFX10-NEXT: v_cndmask_b32_e32 v2, v2, v5, vcc_lo -; GFX10-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v6 -; GFX10-NEXT: v_cndmask_b32_e32 v0, v6, v0, vcc_lo -; GFX10-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v7 -; GFX10-NEXT: v_cndmask_b32_e32 v1, v7, v1, vcc_lo -; GFX10-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v8 -; GFX10-NEXT: v_cndmask_b32_e32 v2, v8, v2, vcc_lo +; GFX10-NEXT: v_min_f32_e32 v0, v0, v3 +; 
GFX10-NEXT: v_min_f32_e32 v1, v1, v4 +; GFX10-NEXT: v_min_f32_e32 v2, v2, v5 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: v_minimum_v3f32__nnan: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_cmp_lt_f32_e32 vcc_lo, v0, v3 -; GFX11-NEXT: v_cndmask_b32_e32 v6, v3, v0, vcc_lo -; GFX11-NEXT: v_cmp_lt_f32_e32 vcc_lo, v1, v4 -; GFX11-NEXT: v_cndmask_b32_e32 v7, v4, v1, vcc_lo -; GFX11-NEXT: v_cmp_lt_f32_e32 vcc_lo, v2, v5 -; GFX11-NEXT: v_cndmask_b32_e32 v8, v5, v2, vcc_lo -; GFX11-NEXT: v_cmp_class_f32_e64 vcc_lo, v0, 32 -; GFX11-NEXT: v_cndmask_b32_e32 v0, v6, v0, vcc_lo -; GFX11-NEXT: v_cmp_class_f32_e64 vcc_lo, v1, 32 -; GFX11-NEXT: v_cndmask_b32_e32 v1, v7, v1, vcc_lo -; GFX11-NEXT: v_cmp_class_f32_e64 vcc_lo, v2, 32 -; GFX11-NEXT: v_cndmask_b32_e32 v2, v8, v2, vcc_lo -; GFX11-NEXT: v_cmp_class_f32_e64 vcc_lo, v3, 32 -; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc_lo -; GFX11-NEXT: v_cmp_class_f32_e64 vcc_lo, v4, 32 -; GFX11-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc_lo -; GFX11-NEXT: v_cmp_class_f32_e64 vcc_lo, v5, 32 -; GFX11-NEXT: v_cndmask_b32_e32 v2, v2, v5, vcc_lo -; GFX11-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v6 -; GFX11-NEXT: v_cndmask_b32_e32 v0, v6, v0, vcc_lo -; GFX11-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v7 -; GFX11-NEXT: v_cndmask_b32_e32 v1, v7, v1, vcc_lo -; GFX11-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v8 -; GFX11-NEXT: v_cndmask_b32_e32 v2, v8, v2, vcc_lo +; GFX11-NEXT: v_dual_min_f32 v0, v0, v3 :: v_dual_min_f32 v1, v1, v4 +; GFX11-NEXT: v_min_f32_e32 v2, v2, v5 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-LABEL: v_minimum_v3f32__nnan: @@ -1659,14 +1068,14 @@ define <3 x float> @v_minimum_v3f32__nsz(<3 x float> %src0, <3 x float> %src1) { ; GFX7-LABEL: v_minimum_v3f32__nsz: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_min_legacy_f32_e32 v6, v0, v3 +; GFX7-NEXT: v_min_f32_e32 v6, v0, v3 ; GFX7-NEXT: v_mov_b32_e32 v7, 0x7fc00000 ; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v0, v3 ; 
GFX7-NEXT: v_cndmask_b32_e32 v0, v7, v6, vcc -; GFX7-NEXT: v_min_legacy_f32_e32 v3, v1, v4 +; GFX7-NEXT: v_min_f32_e32 v3, v1, v4 ; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v1, v4 ; GFX7-NEXT: v_cndmask_b32_e32 v1, v7, v3, vcc -; GFX7-NEXT: v_min_legacy_f32_e32 v3, v2, v5 +; GFX7-NEXT: v_min_f32_e32 v3, v2, v5 ; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v2, v5 ; GFX7-NEXT: v_cndmask_b32_e32 v2, v7, v3, vcc ; GFX7-NEXT: s_setpc_b64 s[30:31] @@ -1674,17 +1083,14 @@ define <3 x float> @v_minimum_v3f32__nsz(<3 x float> %src0, <3 x float> %src1) { ; GFX8-LABEL: v_minimum_v3f32__nsz: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_cmp_lt_f32_e32 vcc, v0, v3 -; GFX8-NEXT: v_cndmask_b32_e32 v6, v3, v0, vcc +; GFX8-NEXT: v_min_f32_e32 v6, v0, v3 ; GFX8-NEXT: v_mov_b32_e32 v7, 0x7fc00000 ; GFX8-NEXT: v_cmp_o_f32_e32 vcc, v0, v3 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v7, v6, vcc -; GFX8-NEXT: v_cmp_lt_f32_e32 vcc, v1, v4 -; GFX8-NEXT: v_cndmask_b32_e32 v3, v4, v1, vcc +; GFX8-NEXT: v_min_f32_e32 v3, v1, v4 ; GFX8-NEXT: v_cmp_o_f32_e32 vcc, v1, v4 ; GFX8-NEXT: v_cndmask_b32_e32 v1, v7, v3, vcc -; GFX8-NEXT: v_cmp_lt_f32_e32 vcc, v2, v5 -; GFX8-NEXT: v_cndmask_b32_e32 v3, v5, v2, vcc +; GFX8-NEXT: v_min_f32_e32 v3, v2, v5 ; GFX8-NEXT: v_cmp_o_f32_e32 vcc, v2, v5 ; GFX8-NEXT: v_cndmask_b32_e32 v2, v7, v3, vcc ; GFX8-NEXT: s_setpc_b64 s[30:31] @@ -1692,17 +1098,14 @@ define <3 x float> @v_minimum_v3f32__nsz(<3 x float> %src0, <3 x float> %src1) { ; GFX9-LABEL: v_minimum_v3f32__nsz: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_cmp_lt_f32_e32 vcc, v0, v3 -; GFX9-NEXT: v_cndmask_b32_e32 v6, v3, v0, vcc +; GFX9-NEXT: v_min_f32_e32 v6, v0, v3 ; GFX9-NEXT: v_mov_b32_e32 v7, 0x7fc00000 ; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v3 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v7, v6, vcc -; GFX9-NEXT: v_cmp_lt_f32_e32 vcc, v1, v4 -; GFX9-NEXT: v_cndmask_b32_e32 v3, v4, v1, vcc +; GFX9-NEXT: v_min_f32_e32 v3, v1, v4 ; GFX9-NEXT: v_cmp_o_f32_e32 
vcc, v1, v4 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v7, v3, vcc -; GFX9-NEXT: v_cmp_lt_f32_e32 vcc, v2, v5 -; GFX9-NEXT: v_cndmask_b32_e32 v3, v5, v2, vcc +; GFX9-NEXT: v_min_f32_e32 v3, v2, v5 ; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v2, v5 ; GFX9-NEXT: v_cndmask_b32_e32 v2, v7, v3, vcc ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -1710,22 +1113,16 @@ define <3 x float> @v_minimum_v3f32__nsz(<3 x float> %src0, <3 x float> %src1) { ; GFX940-LABEL: v_minimum_v3f32__nsz: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_cmp_lt_f32_e32 vcc, v0, v3 +; GFX940-NEXT: v_min_f32_e32 v6, v0, v3 ; GFX940-NEXT: v_mov_b32_e32 v7, 0x7fc00000 -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e32 v6, v3, v0, vcc ; GFX940-NEXT: v_cmp_o_f32_e32 vcc, v0, v3 -; GFX940-NEXT: s_nop 1 +; GFX940-NEXT: v_min_f32_e32 v3, v1, v4 +; GFX940-NEXT: s_nop 0 ; GFX940-NEXT: v_cndmask_b32_e32 v0, v7, v6, vcc -; GFX940-NEXT: v_cmp_lt_f32_e32 vcc, v1, v4 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v3, v4, v1, vcc ; GFX940-NEXT: v_cmp_o_f32_e32 vcc, v1, v4 ; GFX940-NEXT: s_nop 1 ; GFX940-NEXT: v_cndmask_b32_e32 v1, v7, v3, vcc -; GFX940-NEXT: v_cmp_lt_f32_e32 vcc, v2, v5 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v3, v5, v2, vcc +; GFX940-NEXT: v_min_f32_e32 v3, v2, v5 ; GFX940-NEXT: v_cmp_o_f32_e32 vcc, v2, v5 ; GFX940-NEXT: s_nop 1 ; GFX940-NEXT: v_cndmask_b32_e32 v2, v7, v3, vcc @@ -1734,13 +1131,10 @@ define <3 x float> @v_minimum_v3f32__nsz(<3 x float> %src0, <3 x float> %src1) { ; GFX10-LABEL: v_minimum_v3f32__nsz: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_cmp_lt_f32_e32 vcc_lo, v0, v3 -; GFX10-NEXT: v_cndmask_b32_e32 v6, v3, v0, vcc_lo -; GFX10-NEXT: v_cmp_lt_f32_e32 vcc_lo, v1, v4 -; GFX10-NEXT: v_cndmask_b32_e32 v7, v4, v1, vcc_lo -; GFX10-NEXT: v_cmp_lt_f32_e32 vcc_lo, v2, v5 -; GFX10-NEXT: v_cndmask_b32_e32 v8, v5, v2, vcc_lo +; GFX10-NEXT: v_min_f32_e32 v6, v0, v3 ; GFX10-NEXT: 
v_cmp_o_f32_e32 vcc_lo, v0, v3 +; GFX10-NEXT: v_min_f32_e32 v7, v1, v4 +; GFX10-NEXT: v_min_f32_e32 v8, v2, v5 ; GFX10-NEXT: v_cndmask_b32_e32 v0, 0x7fc00000, v6, vcc_lo ; GFX10-NEXT: v_cmp_o_f32_e32 vcc_lo, v1, v4 ; GFX10-NEXT: v_cndmask_b32_e32 v1, 0x7fc00000, v7, vcc_lo @@ -1751,17 +1145,14 @@ define <3 x float> @v_minimum_v3f32__nsz(<3 x float> %src0, <3 x float> %src1) { ; GFX11-LABEL: v_minimum_v3f32__nsz: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_cmp_lt_f32_e32 vcc_lo, v0, v3 -; GFX11-NEXT: v_cndmask_b32_e32 v6, v3, v0, vcc_lo -; GFX11-NEXT: v_cmp_lt_f32_e32 vcc_lo, v1, v4 -; GFX11-NEXT: v_cndmask_b32_e32 v7, v4, v1, vcc_lo -; GFX11-NEXT: v_cmp_lt_f32_e32 vcc_lo, v2, v5 -; GFX11-NEXT: v_cndmask_b32_e32 v8, v5, v2, vcc_lo +; GFX11-NEXT: v_dual_min_f32 v6, v0, v3 :: v_dual_min_f32 v7, v1, v4 ; GFX11-NEXT: v_cmp_o_f32_e32 vcc_lo, v0, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_4) ; GFX11-NEXT: v_cndmask_b32_e32 v0, 0x7fc00000, v6, vcc_lo ; GFX11-NEXT: v_cmp_o_f32_e32 vcc_lo, v1, v4 -; GFX11-NEXT: v_cndmask_b32_e32 v1, 0x7fc00000, v7, vcc_lo +; GFX11-NEXT: v_dual_min_f32 v8, v2, v5 :: v_dual_cndmask_b32 v1, 0x7fc00000, v7 ; GFX11-NEXT: v_cmp_o_f32_e32 vcc_lo, v2, v5 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11-NEXT: v_cndmask_b32_e32 v2, 0x7fc00000, v8, vcc_lo ; GFX11-NEXT: s_setpc_b64 s[30:31] ; @@ -1784,67 +1175,48 @@ define <3 x float> @v_minimum_v3f32__nnan_nsz(<3 x float> %src0, <3 x float> %sr ; GFX7-LABEL: v_minimum_v3f32__nnan_nsz: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_min_legacy_f32_e32 v0, v0, v3 -; GFX7-NEXT: v_min_legacy_f32_e32 v1, v1, v4 -; GFX7-NEXT: v_min_legacy_f32_e32 v2, v2, v5 +; GFX7-NEXT: v_min_f32_e32 v0, v0, v3 +; GFX7-NEXT: v_min_f32_e32 v1, v1, v4 +; GFX7-NEXT: v_min_f32_e32 v2, v2, v5 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_minimum_v3f32__nnan_nsz: ; GFX8: ; %bb.0: ; GFX8-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_cmp_lt_f32_e32 vcc, v0, v3 -; GFX8-NEXT: v_cndmask_b32_e32 v0, v3, v0, vcc -; GFX8-NEXT: v_cmp_lt_f32_e32 vcc, v1, v4 -; GFX8-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc -; GFX8-NEXT: v_cmp_lt_f32_e32 vcc, v2, v5 -; GFX8-NEXT: v_cndmask_b32_e32 v2, v5, v2, vcc +; GFX8-NEXT: v_min_f32_e32 v0, v0, v3 +; GFX8-NEXT: v_min_f32_e32 v1, v1, v4 +; GFX8-NEXT: v_min_f32_e32 v2, v2, v5 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_minimum_v3f32__nnan_nsz: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_cmp_lt_f32_e32 vcc, v0, v3 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v0, vcc -; GFX9-NEXT: v_cmp_lt_f32_e32 vcc, v1, v4 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc -; GFX9-NEXT: v_cmp_lt_f32_e32 vcc, v2, v5 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v5, v2, vcc +; GFX9-NEXT: v_min_f32_e32 v0, v0, v3 +; GFX9-NEXT: v_min_f32_e32 v1, v1, v4 +; GFX9-NEXT: v_min_f32_e32 v2, v2, v5 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: v_minimum_v3f32__nnan_nsz: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_cmp_lt_f32_e32 vcc, v0, v3 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v0, v3, v0, vcc -; GFX940-NEXT: v_cmp_lt_f32_e32 vcc, v1, v4 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc -; GFX940-NEXT: v_cmp_lt_f32_e32 vcc, v2, v5 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v2, v5, v2, vcc +; GFX940-NEXT: v_min_f32_e32 v0, v0, v3 +; GFX940-NEXT: v_min_f32_e32 v1, v1, v4 +; GFX940-NEXT: v_min_f32_e32 v2, v2, v5 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_minimum_v3f32__nnan_nsz: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_cmp_lt_f32_e32 vcc_lo, v0, v3 -; GFX10-NEXT: v_cndmask_b32_e32 v0, v3, v0, vcc_lo -; GFX10-NEXT: v_cmp_lt_f32_e32 vcc_lo, v1, v4 -; GFX10-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc_lo -; GFX10-NEXT: v_cmp_lt_f32_e32 vcc_lo, v2, v5 -; 
GFX10-NEXT: v_cndmask_b32_e32 v2, v5, v2, vcc_lo +; GFX10-NEXT: v_min_f32_e32 v0, v0, v3 +; GFX10-NEXT: v_min_f32_e32 v1, v1, v4 +; GFX10-NEXT: v_min_f32_e32 v2, v2, v5 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: v_minimum_v3f32__nnan_nsz: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_cmp_lt_f32_e32 vcc_lo, v0, v3 -; GFX11-NEXT: v_cndmask_b32_e32 v0, v3, v0, vcc_lo -; GFX11-NEXT: v_cmp_lt_f32_e32 vcc_lo, v1, v4 -; GFX11-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc_lo -; GFX11-NEXT: v_cmp_lt_f32_e32 vcc_lo, v2, v5 -; GFX11-NEXT: v_cndmask_b32_e32 v2, v5, v2, vcc_lo +; GFX11-NEXT: v_dual_min_f32 v0, v0, v3 :: v_dual_min_f32 v1, v1, v4 +; GFX11-NEXT: v_min_f32_e32 v2, v2, v5 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-LABEL: v_minimum_v3f32__nnan_nsz: @@ -1866,292 +1238,111 @@ define <4 x float> @v_minimum_v4f32(<4 x float> %src0, <4 x float> %src1) { ; GFX7-LABEL: v_minimum_v4f32: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_min_legacy_f32_e32 v8, v0, v4 +; GFX7-NEXT: v_min_f32_e32 v8, v0, v4 ; GFX7-NEXT: v_mov_b32_e32 v9, 0x7fc00000 ; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v0, v4 -; GFX7-NEXT: v_cndmask_b32_e32 v8, v9, v8, vcc -; GFX7-NEXT: v_cmp_class_f32_e64 vcc, v0, 32 -; GFX7-NEXT: v_cndmask_b32_e32 v0, v8, v0, vcc -; GFX7-NEXT: v_cmp_class_f32_e64 vcc, v4, 32 -; GFX7-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc -; GFX7-NEXT: v_cmp_eq_f32_e32 vcc, 0, v8 -; GFX7-NEXT: v_cndmask_b32_e32 v0, v8, v0, vcc -; GFX7-NEXT: v_min_legacy_f32_e32 v4, v1, v5 +; GFX7-NEXT: v_cndmask_b32_e32 v0, v9, v8, vcc +; GFX7-NEXT: v_min_f32_e32 v4, v1, v5 ; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v1, v5 -; GFX7-NEXT: v_cndmask_b32_e32 v4, v9, v4, vcc -; GFX7-NEXT: v_cmp_class_f32_e64 vcc, v1, 32 -; GFX7-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc -; GFX7-NEXT: v_cmp_class_f32_e64 vcc, v5, 32 -; GFX7-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc -; GFX7-NEXT: v_cmp_eq_f32_e32 vcc, 0, v4 -; GFX7-NEXT: v_cndmask_b32_e32 v1, v4, v1, 
vcc -; GFX7-NEXT: v_min_legacy_f32_e32 v4, v2, v6 +; GFX7-NEXT: v_cndmask_b32_e32 v1, v9, v4, vcc +; GFX7-NEXT: v_min_f32_e32 v4, v2, v6 ; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v2, v6 -; GFX7-NEXT: v_cndmask_b32_e32 v4, v9, v4, vcc -; GFX7-NEXT: v_cmp_class_f32_e64 vcc, v2, 32 -; GFX7-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc -; GFX7-NEXT: v_cmp_class_f32_e64 vcc, v6, 32 -; GFX7-NEXT: v_cndmask_b32_e32 v2, v2, v6, vcc -; GFX7-NEXT: v_cmp_eq_f32_e32 vcc, 0, v4 -; GFX7-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc -; GFX7-NEXT: v_min_legacy_f32_e32 v4, v3, v7 +; GFX7-NEXT: v_cndmask_b32_e32 v2, v9, v4, vcc +; GFX7-NEXT: v_min_f32_e32 v4, v3, v7 ; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v3, v7 -; GFX7-NEXT: v_cndmask_b32_e32 v4, v9, v4, vcc -; GFX7-NEXT: v_cmp_class_f32_e64 vcc, v3, 32 -; GFX7-NEXT: v_cndmask_b32_e32 v3, v4, v3, vcc -; GFX7-NEXT: v_cmp_class_f32_e64 vcc, v7, 32 -; GFX7-NEXT: v_cndmask_b32_e32 v3, v3, v7, vcc -; GFX7-NEXT: v_cmp_eq_f32_e32 vcc, 0, v4 -; GFX7-NEXT: v_cndmask_b32_e32 v3, v4, v3, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v3, v9, v4, vcc ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_minimum_v4f32: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_cmp_lt_f32_e32 vcc, v0, v4 -; GFX8-NEXT: v_cndmask_b32_e32 v8, v4, v0, vcc +; GFX8-NEXT: v_min_f32_e32 v8, v0, v4 ; GFX8-NEXT: v_mov_b32_e32 v9, 0x7fc00000 ; GFX8-NEXT: v_cmp_o_f32_e32 vcc, v0, v4 -; GFX8-NEXT: v_cndmask_b32_e32 v8, v9, v8, vcc -; GFX8-NEXT: v_cmp_class_f32_e64 vcc, v0, 32 -; GFX8-NEXT: v_cndmask_b32_e32 v0, v8, v0, vcc -; GFX8-NEXT: v_cmp_class_f32_e64 vcc, v4, 32 -; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc -; GFX8-NEXT: v_cmp_eq_f32_e32 vcc, 0, v8 -; GFX8-NEXT: v_cndmask_b32_e32 v0, v8, v0, vcc -; GFX8-NEXT: v_cmp_lt_f32_e32 vcc, v1, v5 -; GFX8-NEXT: v_cndmask_b32_e32 v4, v5, v1, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v0, v9, v8, vcc +; GFX8-NEXT: v_min_f32_e32 v4, v1, v5 ; GFX8-NEXT: v_cmp_o_f32_e32 vcc, v1, v5 -; GFX8-NEXT: v_cndmask_b32_e32 v4, v9, v4, vcc -; 
GFX8-NEXT: v_cmp_class_f32_e64 vcc, v1, 32 -; GFX8-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc -; GFX8-NEXT: v_cmp_class_f32_e64 vcc, v5, 32 -; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc -; GFX8-NEXT: v_cmp_eq_f32_e32 vcc, 0, v4 -; GFX8-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc -; GFX8-NEXT: v_cmp_lt_f32_e32 vcc, v2, v6 -; GFX8-NEXT: v_cndmask_b32_e32 v4, v6, v2, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v1, v9, v4, vcc +; GFX8-NEXT: v_min_f32_e32 v4, v2, v6 ; GFX8-NEXT: v_cmp_o_f32_e32 vcc, v2, v6 -; GFX8-NEXT: v_cndmask_b32_e32 v4, v9, v4, vcc -; GFX8-NEXT: v_cmp_class_f32_e64 vcc, v2, 32 -; GFX8-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc -; GFX8-NEXT: v_cmp_class_f32_e64 vcc, v6, 32 -; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v6, vcc -; GFX8-NEXT: v_cmp_eq_f32_e32 vcc, 0, v4 -; GFX8-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc -; GFX8-NEXT: v_cmp_lt_f32_e32 vcc, v3, v7 -; GFX8-NEXT: v_cndmask_b32_e32 v4, v7, v3, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v2, v9, v4, vcc +; GFX8-NEXT: v_min_f32_e32 v4, v3, v7 ; GFX8-NEXT: v_cmp_o_f32_e32 vcc, v3, v7 -; GFX8-NEXT: v_cndmask_b32_e32 v4, v9, v4, vcc -; GFX8-NEXT: v_cmp_class_f32_e64 vcc, v3, 32 -; GFX8-NEXT: v_cndmask_b32_e32 v3, v4, v3, vcc -; GFX8-NEXT: v_cmp_class_f32_e64 vcc, v7, 32 -; GFX8-NEXT: v_cndmask_b32_e32 v3, v3, v7, vcc -; GFX8-NEXT: v_cmp_eq_f32_e32 vcc, 0, v4 -; GFX8-NEXT: v_cndmask_b32_e32 v3, v4, v3, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v3, v9, v4, vcc ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_minimum_v4f32: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_cmp_lt_f32_e32 vcc, v0, v4 -; GFX9-NEXT: v_cndmask_b32_e32 v8, v4, v0, vcc +; GFX9-NEXT: v_min_f32_e32 v8, v0, v4 ; GFX9-NEXT: v_mov_b32_e32 v9, 0x7fc00000 ; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v4 -; GFX9-NEXT: v_cndmask_b32_e32 v8, v9, v8, vcc -; GFX9-NEXT: v_cmp_class_f32_e64 vcc, v0, 32 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v8, v0, vcc -; GFX9-NEXT: v_cmp_class_f32_e64 vcc, v4, 32 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc 
-; GFX9-NEXT: v_cmp_eq_f32_e32 vcc, 0, v8 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v8, v0, vcc -; GFX9-NEXT: v_cmp_lt_f32_e32 vcc, v1, v5 -; GFX9-NEXT: v_cndmask_b32_e32 v4, v5, v1, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v0, v9, v8, vcc +; GFX9-NEXT: v_min_f32_e32 v4, v1, v5 ; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v1, v5 -; GFX9-NEXT: v_cndmask_b32_e32 v4, v9, v4, vcc -; GFX9-NEXT: v_cmp_class_f32_e64 vcc, v1, 32 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc -; GFX9-NEXT: v_cmp_class_f32_e64 vcc, v5, 32 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc -; GFX9-NEXT: v_cmp_eq_f32_e32 vcc, 0, v4 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc -; GFX9-NEXT: v_cmp_lt_f32_e32 vcc, v2, v6 -; GFX9-NEXT: v_cndmask_b32_e32 v4, v6, v2, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v1, v9, v4, vcc +; GFX9-NEXT: v_min_f32_e32 v4, v2, v6 ; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v2, v6 -; GFX9-NEXT: v_cndmask_b32_e32 v4, v9, v4, vcc -; GFX9-NEXT: v_cmp_class_f32_e64 vcc, v2, 32 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc -; GFX9-NEXT: v_cmp_class_f32_e64 vcc, v6, 32 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v6, vcc -; GFX9-NEXT: v_cmp_eq_f32_e32 vcc, 0, v4 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc -; GFX9-NEXT: v_cmp_lt_f32_e32 vcc, v3, v7 -; GFX9-NEXT: v_cndmask_b32_e32 v4, v7, v3, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v2, v9, v4, vcc +; GFX9-NEXT: v_min_f32_e32 v4, v3, v7 ; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v3, v7 -; GFX9-NEXT: v_cndmask_b32_e32 v4, v9, v4, vcc -; GFX9-NEXT: v_cmp_class_f32_e64 vcc, v3, 32 -; GFX9-NEXT: v_cndmask_b32_e32 v3, v4, v3, vcc -; GFX9-NEXT: v_cmp_class_f32_e64 vcc, v7, 32 -; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v7, vcc -; GFX9-NEXT: v_cmp_eq_f32_e32 vcc, 0, v4 -; GFX9-NEXT: v_cndmask_b32_e32 v3, v4, v3, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v3, v9, v4, vcc ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: v_minimum_v4f32: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_cmp_lt_f32_e32 vcc, v0, v4 +; GFX940-NEXT: v_min_f32_e32 v8, v0, 
v4 ; GFX940-NEXT: v_mov_b32_e32 v9, 0x7fc00000 -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e32 v8, v4, v0, vcc ; GFX940-NEXT: v_cmp_o_f32_e32 vcc, v0, v4 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v8, v9, v8, vcc -; GFX940-NEXT: v_cmp_class_f32_e64 vcc, v0, 32 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v0, v8, v0, vcc -; GFX940-NEXT: v_cmp_class_f32_e64 vcc, v4, 32 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc -; GFX940-NEXT: v_cmp_eq_f32_e32 vcc, 0, v8 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v0, v8, v0, vcc -; GFX940-NEXT: v_cmp_lt_f32_e32 vcc, v1, v5 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v4, v5, v1, vcc +; GFX940-NEXT: v_min_f32_e32 v4, v1, v5 +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_cndmask_b32_e32 v0, v9, v8, vcc ; GFX940-NEXT: v_cmp_o_f32_e32 vcc, v1, v5 ; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v4, v9, v4, vcc -; GFX940-NEXT: v_cmp_class_f32_e64 vcc, v1, 32 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc -; GFX940-NEXT: v_cmp_class_f32_e64 vcc, v5, 32 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc -; GFX940-NEXT: v_cmp_eq_f32_e32 vcc, 0, v4 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc -; GFX940-NEXT: v_cmp_lt_f32_e32 vcc, v2, v6 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v4, v6, v2, vcc +; GFX940-NEXT: v_cndmask_b32_e32 v1, v9, v4, vcc +; GFX940-NEXT: v_min_f32_e32 v4, v2, v6 ; GFX940-NEXT: v_cmp_o_f32_e32 vcc, v2, v6 ; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v4, v9, v4, vcc -; GFX940-NEXT: v_cmp_class_f32_e64 vcc, v2, 32 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc -; GFX940-NEXT: v_cmp_class_f32_e64 vcc, v6, 32 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v2, v2, v6, vcc -; GFX940-NEXT: v_cmp_eq_f32_e32 vcc, 0, v4 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc 
-; GFX940-NEXT: v_cmp_lt_f32_e32 vcc, v3, v7 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v4, v7, v3, vcc +; GFX940-NEXT: v_cndmask_b32_e32 v2, v9, v4, vcc +; GFX940-NEXT: v_min_f32_e32 v4, v3, v7 ; GFX940-NEXT: v_cmp_o_f32_e32 vcc, v3, v7 ; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v4, v9, v4, vcc -; GFX940-NEXT: v_cmp_class_f32_e64 vcc, v3, 32 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v3, v4, v3, vcc -; GFX940-NEXT: v_cmp_class_f32_e64 vcc, v7, 32 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v3, v3, v7, vcc -; GFX940-NEXT: v_cmp_eq_f32_e32 vcc, 0, v4 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v3, v4, v3, vcc +; GFX940-NEXT: v_cndmask_b32_e32 v3, v9, v4, vcc ; GFX940-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_minimum_v4f32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_cmp_lt_f32_e32 vcc_lo, v0, v4 -; GFX10-NEXT: v_cndmask_b32_e32 v8, v4, v0, vcc_lo +; GFX10-NEXT: v_min_f32_e32 v8, v0, v4 ; GFX10-NEXT: v_cmp_o_f32_e32 vcc_lo, v0, v4 -; GFX10-NEXT: v_cndmask_b32_e32 v8, 0x7fc00000, v8, vcc_lo -; GFX10-NEXT: v_cmp_class_f32_e64 vcc_lo, v0, 32 -; GFX10-NEXT: v_cndmask_b32_e32 v0, v8, v0, vcc_lo -; GFX10-NEXT: v_cmp_lt_f32_e32 vcc_lo, v1, v5 -; GFX10-NEXT: v_cndmask_b32_e32 v9, v5, v1, vcc_lo -; GFX10-NEXT: v_cmp_lt_f32_e32 vcc_lo, v2, v6 -; GFX10-NEXT: v_cndmask_b32_e32 v10, v6, v2, vcc_lo -; GFX10-NEXT: v_cmp_class_f32_e64 vcc_lo, v4, 32 -; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc_lo -; GFX10-NEXT: v_cmp_lt_f32_e32 vcc_lo, v3, v7 -; GFX10-NEXT: v_cndmask_b32_e32 v4, v7, v3, vcc_lo +; GFX10-NEXT: v_min_f32_e32 v9, v1, v5 +; GFX10-NEXT: v_min_f32_e32 v4, v2, v6 +; GFX10-NEXT: v_cndmask_b32_e32 v0, 0x7fc00000, v8, vcc_lo ; GFX10-NEXT: v_cmp_o_f32_e32 vcc_lo, v1, v5 -; GFX10-NEXT: v_cndmask_b32_e32 v9, 0x7fc00000, v9, vcc_lo -; GFX10-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v8 -; GFX10-NEXT: v_cndmask_b32_e32 v0, v8, v0, vcc_lo +; GFX10-NEXT: v_min_f32_e32 
v8, v3, v7 +; GFX10-NEXT: v_cndmask_b32_e32 v1, 0x7fc00000, v9, vcc_lo ; GFX10-NEXT: v_cmp_o_f32_e32 vcc_lo, v2, v6 -; GFX10-NEXT: v_cndmask_b32_e32 v8, 0x7fc00000, v10, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v2, 0x7fc00000, v4, vcc_lo ; GFX10-NEXT: v_cmp_o_f32_e32 vcc_lo, v3, v7 -; GFX10-NEXT: v_cndmask_b32_e32 v4, 0x7fc00000, v4, vcc_lo -; GFX10-NEXT: v_cmp_class_f32_e64 vcc_lo, v1, 32 -; GFX10-NEXT: v_cndmask_b32_e32 v1, v9, v1, vcc_lo -; GFX10-NEXT: v_cmp_class_f32_e64 vcc_lo, v2, 32 -; GFX10-NEXT: v_cndmask_b32_e32 v2, v8, v2, vcc_lo -; GFX10-NEXT: v_cmp_class_f32_e64 vcc_lo, v3, 32 -; GFX10-NEXT: v_cndmask_b32_e32 v3, v4, v3, vcc_lo -; GFX10-NEXT: v_cmp_class_f32_e64 vcc_lo, v5, 32 -; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc_lo -; GFX10-NEXT: v_cmp_class_f32_e64 vcc_lo, v6, 32 -; GFX10-NEXT: v_cndmask_b32_e32 v2, v2, v6, vcc_lo -; GFX10-NEXT: v_cmp_class_f32_e64 vcc_lo, v7, 32 -; GFX10-NEXT: v_cndmask_b32_e32 v3, v3, v7, vcc_lo -; GFX10-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v9 -; GFX10-NEXT: v_cndmask_b32_e32 v1, v9, v1, vcc_lo -; GFX10-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v8 -; GFX10-NEXT: v_cndmask_b32_e32 v2, v8, v2, vcc_lo -; GFX10-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v4 -; GFX10-NEXT: v_cndmask_b32_e32 v3, v4, v3, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v3, 0x7fc00000, v8, vcc_lo ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: v_minimum_v4f32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_cmp_lt_f32_e32 vcc_lo, v0, v4 -; GFX11-NEXT: v_cndmask_b32_e32 v8, v4, v0, vcc_lo +; GFX11-NEXT: v_dual_min_f32 v8, v0, v4 :: v_dual_min_f32 v9, v1, v5 ; GFX11-NEXT: v_cmp_o_f32_e32 vcc_lo, v0, v4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cndmask_b32_e32 v8, 0x7fc00000, v8, vcc_lo -; GFX11-NEXT: v_cmp_class_f32_e64 vcc_lo, v0, 32 -; GFX11-NEXT: v_cndmask_b32_e32 v0, v8, v0, vcc_lo -; GFX11-NEXT: v_cmp_lt_f32_e32 vcc_lo, v1, v5 -; GFX11-NEXT: v_cndmask_b32_e32 v9, 
v5, v1, vcc_lo -; GFX11-NEXT: v_cmp_lt_f32_e32 vcc_lo, v2, v6 -; GFX11-NEXT: v_cndmask_b32_e32 v10, v6, v2, vcc_lo -; GFX11-NEXT: v_cmp_class_f32_e64 vcc_lo, v4, 32 -; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc_lo -; GFX11-NEXT: v_cmp_lt_f32_e32 vcc_lo, v3, v7 -; GFX11-NEXT: v_cndmask_b32_e32 v4, v7, v3, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_cndmask_b32_e32 v0, 0x7fc00000, v8, vcc_lo ; GFX11-NEXT: v_cmp_o_f32_e32 vcc_lo, v1, v5 -; GFX11-NEXT: v_cndmask_b32_e32 v9, 0x7fc00000, v9, vcc_lo -; GFX11-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v8 -; GFX11-NEXT: v_cndmask_b32_e32 v0, v8, v0, vcc_lo +; GFX11-NEXT: v_min_f32_e32 v4, v2, v6 +; GFX11-NEXT: v_dual_min_f32 v8, v3, v7 :: v_dual_cndmask_b32 v1, 0x7fc00000, v9 ; GFX11-NEXT: v_cmp_o_f32_e32 vcc_lo, v2, v6 -; GFX11-NEXT: v_cndmask_b32_e32 v8, 0x7fc00000, v10, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e32 v2, 0x7fc00000, v4, vcc_lo ; GFX11-NEXT: v_cmp_o_f32_e32 vcc_lo, v3, v7 -; GFX11-NEXT: v_cndmask_b32_e32 v4, 0x7fc00000, v4, vcc_lo -; GFX11-NEXT: v_cmp_class_f32_e64 vcc_lo, v1, 32 -; GFX11-NEXT: v_cndmask_b32_e32 v1, v9, v1, vcc_lo -; GFX11-NEXT: v_cmp_class_f32_e64 vcc_lo, v2, 32 -; GFX11-NEXT: v_cndmask_b32_e32 v2, v8, v2, vcc_lo -; GFX11-NEXT: v_cmp_class_f32_e64 vcc_lo, v3, 32 -; GFX11-NEXT: v_cndmask_b32_e32 v3, v4, v3, vcc_lo -; GFX11-NEXT: v_cmp_class_f32_e64 vcc_lo, v5, 32 -; GFX11-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc_lo -; GFX11-NEXT: v_cmp_class_f32_e64 vcc_lo, v6, 32 -; GFX11-NEXT: v_cndmask_b32_e32 v2, v2, v6, vcc_lo -; GFX11-NEXT: v_cmp_class_f32_e64 vcc_lo, v7, 32 -; GFX11-NEXT: v_cndmask_b32_e32 v3, v3, v7, vcc_lo -; GFX11-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v9 -; GFX11-NEXT: v_cndmask_b32_e32 v1, v9, v1, vcc_lo -; GFX11-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v8 -; GFX11-NEXT: v_cndmask_b32_e32 v2, v8, v2, vcc_lo -; GFX11-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v4 -; GFX11-NEXT: v_cndmask_b32_e32 v3, v4, v3, vcc_lo +; GFX11-NEXT: s_delay_alu 
instid0(VALU_DEP_4) +; GFX11-NEXT: v_cndmask_b32_e32 v3, 0x7fc00000, v8, vcc_lo ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-LABEL: v_minimum_v4f32: @@ -2174,236 +1365,53 @@ define <4 x float> @v_minimum_v4f32__nnan(<4 x float> %src0, <4 x float> %src1) ; GFX7-LABEL: v_minimum_v4f32__nnan: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_min_legacy_f32_e32 v8, v0, v4 -; GFX7-NEXT: v_cmp_class_f32_e64 vcc, v0, 32 -; GFX7-NEXT: v_cndmask_b32_e32 v0, v8, v0, vcc -; GFX7-NEXT: v_cmp_class_f32_e64 vcc, v4, 32 -; GFX7-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc -; GFX7-NEXT: v_cmp_eq_f32_e32 vcc, 0, v8 -; GFX7-NEXT: v_cndmask_b32_e32 v0, v8, v0, vcc -; GFX7-NEXT: v_min_legacy_f32_e32 v4, v1, v5 -; GFX7-NEXT: v_cmp_class_f32_e64 vcc, v1, 32 -; GFX7-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc -; GFX7-NEXT: v_cmp_class_f32_e64 vcc, v5, 32 -; GFX7-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc -; GFX7-NEXT: v_cmp_eq_f32_e32 vcc, 0, v4 -; GFX7-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc -; GFX7-NEXT: v_min_legacy_f32_e32 v4, v2, v6 -; GFX7-NEXT: v_cmp_class_f32_e64 vcc, v2, 32 -; GFX7-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc -; GFX7-NEXT: v_cmp_class_f32_e64 vcc, v6, 32 -; GFX7-NEXT: v_cndmask_b32_e32 v2, v2, v6, vcc -; GFX7-NEXT: v_cmp_eq_f32_e32 vcc, 0, v4 -; GFX7-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc -; GFX7-NEXT: v_min_legacy_f32_e32 v4, v3, v7 -; GFX7-NEXT: v_cmp_class_f32_e64 vcc, v3, 32 -; GFX7-NEXT: v_cndmask_b32_e32 v3, v4, v3, vcc -; GFX7-NEXT: v_cmp_class_f32_e64 vcc, v7, 32 -; GFX7-NEXT: v_cndmask_b32_e32 v3, v3, v7, vcc -; GFX7-NEXT: v_cmp_eq_f32_e32 vcc, 0, v4 -; GFX7-NEXT: v_cndmask_b32_e32 v3, v4, v3, vcc +; GFX7-NEXT: v_min_f32_e32 v0, v0, v4 +; GFX7-NEXT: v_min_f32_e32 v1, v1, v5 +; GFX7-NEXT: v_min_f32_e32 v2, v2, v6 +; GFX7-NEXT: v_min_f32_e32 v3, v3, v7 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_minimum_v4f32__nnan: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_cmp_lt_f32_e32 vcc, v0, v4 
-; GFX8-NEXT: v_cndmask_b32_e32 v8, v4, v0, vcc -; GFX8-NEXT: v_cmp_class_f32_e64 vcc, v0, 32 -; GFX8-NEXT: v_cndmask_b32_e32 v0, v8, v0, vcc -; GFX8-NEXT: v_cmp_class_f32_e64 vcc, v4, 32 -; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc -; GFX8-NEXT: v_cmp_eq_f32_e32 vcc, 0, v8 -; GFX8-NEXT: v_cndmask_b32_e32 v0, v8, v0, vcc -; GFX8-NEXT: v_cmp_lt_f32_e32 vcc, v1, v5 -; GFX8-NEXT: v_cndmask_b32_e32 v4, v5, v1, vcc -; GFX8-NEXT: v_cmp_class_f32_e64 vcc, v1, 32 -; GFX8-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc -; GFX8-NEXT: v_cmp_class_f32_e64 vcc, v5, 32 -; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc -; GFX8-NEXT: v_cmp_eq_f32_e32 vcc, 0, v4 -; GFX8-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc -; GFX8-NEXT: v_cmp_lt_f32_e32 vcc, v2, v6 -; GFX8-NEXT: v_cndmask_b32_e32 v4, v6, v2, vcc -; GFX8-NEXT: v_cmp_class_f32_e64 vcc, v2, 32 -; GFX8-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc -; GFX8-NEXT: v_cmp_class_f32_e64 vcc, v6, 32 -; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v6, vcc -; GFX8-NEXT: v_cmp_eq_f32_e32 vcc, 0, v4 -; GFX8-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc -; GFX8-NEXT: v_cmp_lt_f32_e32 vcc, v3, v7 -; GFX8-NEXT: v_cndmask_b32_e32 v4, v7, v3, vcc -; GFX8-NEXT: v_cmp_class_f32_e64 vcc, v3, 32 -; GFX8-NEXT: v_cndmask_b32_e32 v3, v4, v3, vcc -; GFX8-NEXT: v_cmp_class_f32_e64 vcc, v7, 32 -; GFX8-NEXT: v_cndmask_b32_e32 v3, v3, v7, vcc -; GFX8-NEXT: v_cmp_eq_f32_e32 vcc, 0, v4 -; GFX8-NEXT: v_cndmask_b32_e32 v3, v4, v3, vcc +; GFX8-NEXT: v_min_f32_e32 v0, v0, v4 +; GFX8-NEXT: v_min_f32_e32 v1, v1, v5 +; GFX8-NEXT: v_min_f32_e32 v2, v2, v6 +; GFX8-NEXT: v_min_f32_e32 v3, v3, v7 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_minimum_v4f32__nnan: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_cmp_lt_f32_e32 vcc, v0, v4 -; GFX9-NEXT: v_cndmask_b32_e32 v8, v4, v0, vcc -; GFX9-NEXT: v_cmp_class_f32_e64 vcc, v0, 32 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v8, v0, vcc -; GFX9-NEXT: v_cmp_class_f32_e64 vcc, v4, 32 -; GFX9-NEXT: v_cndmask_b32_e32 v0, 
v0, v4, vcc -; GFX9-NEXT: v_cmp_eq_f32_e32 vcc, 0, v8 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v8, v0, vcc -; GFX9-NEXT: v_cmp_lt_f32_e32 vcc, v1, v5 -; GFX9-NEXT: v_cndmask_b32_e32 v4, v5, v1, vcc -; GFX9-NEXT: v_cmp_class_f32_e64 vcc, v1, 32 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc -; GFX9-NEXT: v_cmp_class_f32_e64 vcc, v5, 32 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc -; GFX9-NEXT: v_cmp_eq_f32_e32 vcc, 0, v4 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc -; GFX9-NEXT: v_cmp_lt_f32_e32 vcc, v2, v6 -; GFX9-NEXT: v_cndmask_b32_e32 v4, v6, v2, vcc -; GFX9-NEXT: v_cmp_class_f32_e64 vcc, v2, 32 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc -; GFX9-NEXT: v_cmp_class_f32_e64 vcc, v6, 32 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v6, vcc -; GFX9-NEXT: v_cmp_eq_f32_e32 vcc, 0, v4 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc -; GFX9-NEXT: v_cmp_lt_f32_e32 vcc, v3, v7 -; GFX9-NEXT: v_cndmask_b32_e32 v4, v7, v3, vcc -; GFX9-NEXT: v_cmp_class_f32_e64 vcc, v3, 32 -; GFX9-NEXT: v_cndmask_b32_e32 v3, v4, v3, vcc -; GFX9-NEXT: v_cmp_class_f32_e64 vcc, v7, 32 -; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v7, vcc -; GFX9-NEXT: v_cmp_eq_f32_e32 vcc, 0, v4 -; GFX9-NEXT: v_cndmask_b32_e32 v3, v4, v3, vcc +; GFX9-NEXT: v_min_f32_e32 v0, v0, v4 +; GFX9-NEXT: v_min_f32_e32 v1, v1, v5 +; GFX9-NEXT: v_min_f32_e32 v2, v2, v6 +; GFX9-NEXT: v_min_f32_e32 v3, v3, v7 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: v_minimum_v4f32__nnan: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_cmp_lt_f32_e32 vcc, v0, v4 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v8, v4, v0, vcc -; GFX940-NEXT: v_cmp_class_f32_e64 vcc, v0, 32 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v0, v8, v0, vcc -; GFX940-NEXT: v_cmp_class_f32_e64 vcc, v4, 32 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc -; GFX940-NEXT: v_cmp_eq_f32_e32 vcc, 0, v8 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v0, v8, v0, vcc -; 
GFX940-NEXT: v_cmp_lt_f32_e32 vcc, v1, v5 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v4, v5, v1, vcc -; GFX940-NEXT: v_cmp_class_f32_e64 vcc, v1, 32 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc -; GFX940-NEXT: v_cmp_class_f32_e64 vcc, v5, 32 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc -; GFX940-NEXT: v_cmp_eq_f32_e32 vcc, 0, v4 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc -; GFX940-NEXT: v_cmp_lt_f32_e32 vcc, v2, v6 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v4, v6, v2, vcc -; GFX940-NEXT: v_cmp_class_f32_e64 vcc, v2, 32 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc -; GFX940-NEXT: v_cmp_class_f32_e64 vcc, v6, 32 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v2, v2, v6, vcc -; GFX940-NEXT: v_cmp_eq_f32_e32 vcc, 0, v4 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc -; GFX940-NEXT: v_cmp_lt_f32_e32 vcc, v3, v7 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v4, v7, v3, vcc -; GFX940-NEXT: v_cmp_class_f32_e64 vcc, v3, 32 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v3, v4, v3, vcc -; GFX940-NEXT: v_cmp_class_f32_e64 vcc, v7, 32 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v3, v3, v7, vcc -; GFX940-NEXT: v_cmp_eq_f32_e32 vcc, 0, v4 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v3, v4, v3, vcc +; GFX940-NEXT: v_min_f32_e32 v0, v0, v4 +; GFX940-NEXT: v_min_f32_e32 v1, v1, v5 +; GFX940-NEXT: v_min_f32_e32 v2, v2, v6 +; GFX940-NEXT: v_min_f32_e32 v3, v3, v7 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_minimum_v4f32__nnan: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_cmp_lt_f32_e32 vcc_lo, v0, v4 -; GFX10-NEXT: v_cndmask_b32_e32 v8, v4, v0, vcc_lo -; GFX10-NEXT: v_cmp_class_f32_e64 vcc_lo, v0, 32 -; GFX10-NEXT: v_cndmask_b32_e32 v0, v8, v0, vcc_lo -; GFX10-NEXT: v_cmp_class_f32_e64 vcc_lo, v4, 
32 -; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc_lo -; GFX10-NEXT: v_cmp_lt_f32_e32 vcc_lo, v1, v5 -; GFX10-NEXT: v_cndmask_b32_e32 v4, v5, v1, vcc_lo -; GFX10-NEXT: v_cmp_lt_f32_e32 vcc_lo, v2, v6 -; GFX10-NEXT: v_cndmask_b32_e32 v9, v6, v2, vcc_lo -; GFX10-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v8 -; GFX10-NEXT: v_cndmask_b32_e32 v0, v8, v0, vcc_lo -; GFX10-NEXT: v_cmp_lt_f32_e32 vcc_lo, v3, v7 -; GFX10-NEXT: v_cndmask_b32_e32 v8, v7, v3, vcc_lo -; GFX10-NEXT: v_cmp_class_f32_e64 vcc_lo, v1, 32 -; GFX10-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc_lo -; GFX10-NEXT: v_cmp_class_f32_e64 vcc_lo, v2, 32 -; GFX10-NEXT: v_cndmask_b32_e32 v2, v9, v2, vcc_lo -; GFX10-NEXT: v_cmp_class_f32_e64 vcc_lo, v3, 32 -; GFX10-NEXT: v_cndmask_b32_e32 v3, v8, v3, vcc_lo -; GFX10-NEXT: v_cmp_class_f32_e64 vcc_lo, v5, 32 -; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc_lo -; GFX10-NEXT: v_cmp_class_f32_e64 vcc_lo, v6, 32 -; GFX10-NEXT: v_cndmask_b32_e32 v2, v2, v6, vcc_lo -; GFX10-NEXT: v_cmp_class_f32_e64 vcc_lo, v7, 32 -; GFX10-NEXT: v_cndmask_b32_e32 v3, v3, v7, vcc_lo -; GFX10-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v4 -; GFX10-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc_lo -; GFX10-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v9 -; GFX10-NEXT: v_cndmask_b32_e32 v2, v9, v2, vcc_lo -; GFX10-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v8 -; GFX10-NEXT: v_cndmask_b32_e32 v3, v8, v3, vcc_lo +; GFX10-NEXT: v_min_f32_e32 v0, v0, v4 +; GFX10-NEXT: v_min_f32_e32 v1, v1, v5 +; GFX10-NEXT: v_min_f32_e32 v2, v2, v6 +; GFX10-NEXT: v_min_f32_e32 v3, v3, v7 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: v_minimum_v4f32__nnan: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_cmp_lt_f32_e32 vcc_lo, v0, v4 -; GFX11-NEXT: v_cndmask_b32_e32 v8, v4, v0, vcc_lo -; GFX11-NEXT: v_cmp_class_f32_e64 vcc_lo, v0, 32 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cndmask_b32_e32 v0, v8, v0, vcc_lo -; GFX11-NEXT: v_cmp_class_f32_e64 vcc_lo, v4, 
32 -; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc_lo -; GFX11-NEXT: v_cmp_lt_f32_e32 vcc_lo, v1, v5 -; GFX11-NEXT: v_cndmask_b32_e32 v4, v5, v1, vcc_lo -; GFX11-NEXT: v_cmp_lt_f32_e32 vcc_lo, v2, v6 -; GFX11-NEXT: v_cndmask_b32_e32 v9, v6, v2, vcc_lo -; GFX11-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v8 -; GFX11-NEXT: v_cndmask_b32_e32 v0, v8, v0, vcc_lo -; GFX11-NEXT: v_cmp_lt_f32_e32 vcc_lo, v3, v7 -; GFX11-NEXT: v_cndmask_b32_e32 v8, v7, v3, vcc_lo -; GFX11-NEXT: v_cmp_class_f32_e64 vcc_lo, v1, 32 -; GFX11-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc_lo -; GFX11-NEXT: v_cmp_class_f32_e64 vcc_lo, v2, 32 -; GFX11-NEXT: v_cndmask_b32_e32 v2, v9, v2, vcc_lo -; GFX11-NEXT: v_cmp_class_f32_e64 vcc_lo, v3, 32 -; GFX11-NEXT: v_cndmask_b32_e32 v3, v8, v3, vcc_lo -; GFX11-NEXT: v_cmp_class_f32_e64 vcc_lo, v5, 32 -; GFX11-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc_lo -; GFX11-NEXT: v_cmp_class_f32_e64 vcc_lo, v6, 32 -; GFX11-NEXT: v_cndmask_b32_e32 v2, v2, v6, vcc_lo -; GFX11-NEXT: v_cmp_class_f32_e64 vcc_lo, v7, 32 -; GFX11-NEXT: v_cndmask_b32_e32 v3, v3, v7, vcc_lo -; GFX11-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v4 -; GFX11-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc_lo -; GFX11-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v9 -; GFX11-NEXT: v_cndmask_b32_e32 v2, v9, v2, vcc_lo -; GFX11-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v8 -; GFX11-NEXT: v_cndmask_b32_e32 v3, v8, v3, vcc_lo +; GFX11-NEXT: v_dual_min_f32 v0, v0, v4 :: v_dual_min_f32 v1, v1, v5 +; GFX11-NEXT: v_dual_min_f32 v2, v2, v6 :: v_dual_min_f32 v3, v3, v7 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-LABEL: v_minimum_v4f32__nnan: @@ -2426,17 +1434,17 @@ define <4 x float> @v_minimum_v4f32__nsz(<4 x float> %src0, <4 x float> %src1) { ; GFX7-LABEL: v_minimum_v4f32__nsz: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_min_legacy_f32_e32 v8, v0, v4 +; GFX7-NEXT: v_min_f32_e32 v8, v0, v4 ; GFX7-NEXT: v_mov_b32_e32 v9, 0x7fc00000 ; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v0, v4 ; GFX7-NEXT: v_cndmask_b32_e32 v0, v9, v8, vcc -; 
GFX7-NEXT: v_min_legacy_f32_e32 v4, v1, v5 +; GFX7-NEXT: v_min_f32_e32 v4, v1, v5 ; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v1, v5 ; GFX7-NEXT: v_cndmask_b32_e32 v1, v9, v4, vcc -; GFX7-NEXT: v_min_legacy_f32_e32 v4, v2, v6 +; GFX7-NEXT: v_min_f32_e32 v4, v2, v6 ; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v2, v6 ; GFX7-NEXT: v_cndmask_b32_e32 v2, v9, v4, vcc -; GFX7-NEXT: v_min_legacy_f32_e32 v4, v3, v7 +; GFX7-NEXT: v_min_f32_e32 v4, v3, v7 ; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v3, v7 ; GFX7-NEXT: v_cndmask_b32_e32 v3, v9, v4, vcc ; GFX7-NEXT: s_setpc_b64 s[30:31] @@ -2444,21 +1452,17 @@ define <4 x float> @v_minimum_v4f32__nsz(<4 x float> %src0, <4 x float> %src1) { ; GFX8-LABEL: v_minimum_v4f32__nsz: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_cmp_lt_f32_e32 vcc, v0, v4 -; GFX8-NEXT: v_cndmask_b32_e32 v8, v4, v0, vcc +; GFX8-NEXT: v_min_f32_e32 v8, v0, v4 ; GFX8-NEXT: v_mov_b32_e32 v9, 0x7fc00000 ; GFX8-NEXT: v_cmp_o_f32_e32 vcc, v0, v4 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v9, v8, vcc -; GFX8-NEXT: v_cmp_lt_f32_e32 vcc, v1, v5 -; GFX8-NEXT: v_cndmask_b32_e32 v4, v5, v1, vcc +; GFX8-NEXT: v_min_f32_e32 v4, v1, v5 ; GFX8-NEXT: v_cmp_o_f32_e32 vcc, v1, v5 ; GFX8-NEXT: v_cndmask_b32_e32 v1, v9, v4, vcc -; GFX8-NEXT: v_cmp_lt_f32_e32 vcc, v2, v6 -; GFX8-NEXT: v_cndmask_b32_e32 v4, v6, v2, vcc +; GFX8-NEXT: v_min_f32_e32 v4, v2, v6 ; GFX8-NEXT: v_cmp_o_f32_e32 vcc, v2, v6 ; GFX8-NEXT: v_cndmask_b32_e32 v2, v9, v4, vcc -; GFX8-NEXT: v_cmp_lt_f32_e32 vcc, v3, v7 -; GFX8-NEXT: v_cndmask_b32_e32 v4, v7, v3, vcc +; GFX8-NEXT: v_min_f32_e32 v4, v3, v7 ; GFX8-NEXT: v_cmp_o_f32_e32 vcc, v3, v7 ; GFX8-NEXT: v_cndmask_b32_e32 v3, v9, v4, vcc ; GFX8-NEXT: s_setpc_b64 s[30:31] @@ -2466,21 +1470,17 @@ define <4 x float> @v_minimum_v4f32__nsz(<4 x float> %src0, <4 x float> %src1) { ; GFX9-LABEL: v_minimum_v4f32__nsz: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_cmp_lt_f32_e32 vcc, v0, v4 -; GFX9-NEXT: v_cndmask_b32_e32 v8, 
v4, v0, vcc +; GFX9-NEXT: v_min_f32_e32 v8, v0, v4 ; GFX9-NEXT: v_mov_b32_e32 v9, 0x7fc00000 ; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v4 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v9, v8, vcc -; GFX9-NEXT: v_cmp_lt_f32_e32 vcc, v1, v5 -; GFX9-NEXT: v_cndmask_b32_e32 v4, v5, v1, vcc +; GFX9-NEXT: v_min_f32_e32 v4, v1, v5 ; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v1, v5 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v9, v4, vcc -; GFX9-NEXT: v_cmp_lt_f32_e32 vcc, v2, v6 -; GFX9-NEXT: v_cndmask_b32_e32 v4, v6, v2, vcc +; GFX9-NEXT: v_min_f32_e32 v4, v2, v6 ; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v2, v6 ; GFX9-NEXT: v_cndmask_b32_e32 v2, v9, v4, vcc -; GFX9-NEXT: v_cmp_lt_f32_e32 vcc, v3, v7 -; GFX9-NEXT: v_cndmask_b32_e32 v4, v7, v3, vcc +; GFX9-NEXT: v_min_f32_e32 v4, v3, v7 ; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v3, v7 ; GFX9-NEXT: v_cndmask_b32_e32 v3, v9, v4, vcc ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -2488,28 +1488,20 @@ define <4 x float> @v_minimum_v4f32__nsz(<4 x float> %src0, <4 x float> %src1) { ; GFX940-LABEL: v_minimum_v4f32__nsz: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_cmp_lt_f32_e32 vcc, v0, v4 +; GFX940-NEXT: v_min_f32_e32 v8, v0, v4 ; GFX940-NEXT: v_mov_b32_e32 v9, 0x7fc00000 -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e32 v8, v4, v0, vcc ; GFX940-NEXT: v_cmp_o_f32_e32 vcc, v0, v4 -; GFX940-NEXT: s_nop 1 +; GFX940-NEXT: v_min_f32_e32 v4, v1, v5 +; GFX940-NEXT: s_nop 0 ; GFX940-NEXT: v_cndmask_b32_e32 v0, v9, v8, vcc -; GFX940-NEXT: v_cmp_lt_f32_e32 vcc, v1, v5 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v4, v5, v1, vcc ; GFX940-NEXT: v_cmp_o_f32_e32 vcc, v1, v5 ; GFX940-NEXT: s_nop 1 ; GFX940-NEXT: v_cndmask_b32_e32 v1, v9, v4, vcc -; GFX940-NEXT: v_cmp_lt_f32_e32 vcc, v2, v6 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v4, v6, v2, vcc +; GFX940-NEXT: v_min_f32_e32 v4, v2, v6 ; GFX940-NEXT: v_cmp_o_f32_e32 vcc, v2, v6 ; GFX940-NEXT: s_nop 1 ; GFX940-NEXT: v_cndmask_b32_e32 v2, v9, v4, vcc -; GFX940-NEXT: 
v_cmp_lt_f32_e32 vcc, v3, v7 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v4, v7, v3, vcc +; GFX940-NEXT: v_min_f32_e32 v4, v3, v7 ; GFX940-NEXT: v_cmp_o_f32_e32 vcc, v3, v7 ; GFX940-NEXT: s_nop 1 ; GFX940-NEXT: v_cndmask_b32_e32 v3, v9, v4, vcc @@ -2518,44 +1510,35 @@ define <4 x float> @v_minimum_v4f32__nsz(<4 x float> %src0, <4 x float> %src1) { ; GFX10-LABEL: v_minimum_v4f32__nsz: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_cmp_lt_f32_e32 vcc_lo, v0, v4 -; GFX10-NEXT: v_cndmask_b32_e32 v8, v4, v0, vcc_lo +; GFX10-NEXT: v_min_f32_e32 v8, v0, v4 ; GFX10-NEXT: v_cmp_o_f32_e32 vcc_lo, v0, v4 +; GFX10-NEXT: v_min_f32_e32 v9, v1, v5 +; GFX10-NEXT: v_min_f32_e32 v4, v2, v6 ; GFX10-NEXT: v_cndmask_b32_e32 v0, 0x7fc00000, v8, vcc_lo -; GFX10-NEXT: v_cmp_lt_f32_e32 vcc_lo, v1, v5 -; GFX10-NEXT: v_cndmask_b32_e32 v4, v5, v1, vcc_lo -; GFX10-NEXT: v_cmp_lt_f32_e32 vcc_lo, v2, v6 -; GFX10-NEXT: v_cndmask_b32_e32 v8, v6, v2, vcc_lo -; GFX10-NEXT: v_cmp_lt_f32_e32 vcc_lo, v3, v7 -; GFX10-NEXT: v_cndmask_b32_e32 v9, v7, v3, vcc_lo ; GFX10-NEXT: v_cmp_o_f32_e32 vcc_lo, v1, v5 -; GFX10-NEXT: v_cndmask_b32_e32 v1, 0x7fc00000, v4, vcc_lo +; GFX10-NEXT: v_min_f32_e32 v8, v3, v7 +; GFX10-NEXT: v_cndmask_b32_e32 v1, 0x7fc00000, v9, vcc_lo ; GFX10-NEXT: v_cmp_o_f32_e32 vcc_lo, v2, v6 -; GFX10-NEXT: v_cndmask_b32_e32 v2, 0x7fc00000, v8, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v2, 0x7fc00000, v4, vcc_lo ; GFX10-NEXT: v_cmp_o_f32_e32 vcc_lo, v3, v7 -; GFX10-NEXT: v_cndmask_b32_e32 v3, 0x7fc00000, v9, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v3, 0x7fc00000, v8, vcc_lo ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: v_minimum_v4f32__nsz: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_cmp_lt_f32_e32 vcc_lo, v0, v4 -; GFX11-NEXT: v_cndmask_b32_e32 v8, v4, v0, vcc_lo +; GFX11-NEXT: v_dual_min_f32 v8, v0, v4 :: v_dual_min_f32 v9, v1, v5 ; GFX11-NEXT: v_cmp_o_f32_e32 vcc_lo, v0, v4 -; 
GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_3) ; GFX11-NEXT: v_cndmask_b32_e32 v0, 0x7fc00000, v8, vcc_lo -; GFX11-NEXT: v_cmp_lt_f32_e32 vcc_lo, v1, v5 -; GFX11-NEXT: v_cndmask_b32_e32 v4, v5, v1, vcc_lo -; GFX11-NEXT: v_cmp_lt_f32_e32 vcc_lo, v2, v6 -; GFX11-NEXT: v_cndmask_b32_e32 v8, v6, v2, vcc_lo -; GFX11-NEXT: v_cmp_lt_f32_e32 vcc_lo, v3, v7 -; GFX11-NEXT: v_cndmask_b32_e32 v9, v7, v3, vcc_lo ; GFX11-NEXT: v_cmp_o_f32_e32 vcc_lo, v1, v5 -; GFX11-NEXT: v_cndmask_b32_e32 v1, 0x7fc00000, v4, vcc_lo +; GFX11-NEXT: v_min_f32_e32 v4, v2, v6 +; GFX11-NEXT: v_dual_min_f32 v8, v3, v7 :: v_dual_cndmask_b32 v1, 0x7fc00000, v9 ; GFX11-NEXT: v_cmp_o_f32_e32 vcc_lo, v2, v6 -; GFX11-NEXT: v_cndmask_b32_e32 v2, 0x7fc00000, v8, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e32 v2, 0x7fc00000, v4, vcc_lo ; GFX11-NEXT: v_cmp_o_f32_e32 vcc_lo, v3, v7 -; GFX11-NEXT: v_cndmask_b32_e32 v3, 0x7fc00000, v9, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-NEXT: v_cndmask_b32_e32 v3, 0x7fc00000, v8, vcc_lo ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-LABEL: v_minimum_v4f32__nsz: @@ -2578,79 +1561,53 @@ define <4 x float> @v_minimum_v4f32__nnan_nsz(<4 x float> %src0, <4 x float> %sr ; GFX7-LABEL: v_minimum_v4f32__nnan_nsz: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_min_legacy_f32_e32 v0, v0, v4 -; GFX7-NEXT: v_min_legacy_f32_e32 v1, v1, v5 -; GFX7-NEXT: v_min_legacy_f32_e32 v2, v2, v6 -; GFX7-NEXT: v_min_legacy_f32_e32 v3, v3, v7 +; GFX7-NEXT: v_min_f32_e32 v0, v0, v4 +; GFX7-NEXT: v_min_f32_e32 v1, v1, v5 +; GFX7-NEXT: v_min_f32_e32 v2, v2, v6 +; GFX7-NEXT: v_min_f32_e32 v3, v3, v7 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_minimum_v4f32__nnan_nsz: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_cmp_lt_f32_e32 vcc, v0, v4 -; GFX8-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc -; GFX8-NEXT: v_cmp_lt_f32_e32 vcc, v1, 
v5 -; GFX8-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc -; GFX8-NEXT: v_cmp_lt_f32_e32 vcc, v2, v6 -; GFX8-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc -; GFX8-NEXT: v_cmp_lt_f32_e32 vcc, v3, v7 -; GFX8-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc +; GFX8-NEXT: v_min_f32_e32 v0, v0, v4 +; GFX8-NEXT: v_min_f32_e32 v1, v1, v5 +; GFX8-NEXT: v_min_f32_e32 v2, v2, v6 +; GFX8-NEXT: v_min_f32_e32 v3, v3, v7 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_minimum_v4f32__nnan_nsz: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_cmp_lt_f32_e32 vcc, v0, v4 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc -; GFX9-NEXT: v_cmp_lt_f32_e32 vcc, v1, v5 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc -; GFX9-NEXT: v_cmp_lt_f32_e32 vcc, v2, v6 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc -; GFX9-NEXT: v_cmp_lt_f32_e32 vcc, v3, v7 -; GFX9-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc +; GFX9-NEXT: v_min_f32_e32 v0, v0, v4 +; GFX9-NEXT: v_min_f32_e32 v1, v1, v5 +; GFX9-NEXT: v_min_f32_e32 v2, v2, v6 +; GFX9-NEXT: v_min_f32_e32 v3, v3, v7 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: v_minimum_v4f32__nnan_nsz: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_cmp_lt_f32_e32 vcc, v0, v4 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc -; GFX940-NEXT: v_cmp_lt_f32_e32 vcc, v1, v5 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc -; GFX940-NEXT: v_cmp_lt_f32_e32 vcc, v2, v6 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc -; GFX940-NEXT: v_cmp_lt_f32_e32 vcc, v3, v7 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc +; GFX940-NEXT: v_min_f32_e32 v0, v0, v4 +; GFX940-NEXT: v_min_f32_e32 v1, v1, v5 +; GFX940-NEXT: v_min_f32_e32 v2, v2, v6 +; GFX940-NEXT: v_min_f32_e32 v3, v3, v7 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_minimum_v4f32__nnan_nsz: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) -; GFX10-NEXT: v_cmp_lt_f32_e32 vcc_lo, v0, v4 -; GFX10-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc_lo -; GFX10-NEXT: v_cmp_lt_f32_e32 vcc_lo, v1, v5 -; GFX10-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc_lo -; GFX10-NEXT: v_cmp_lt_f32_e32 vcc_lo, v2, v6 -; GFX10-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc_lo -; GFX10-NEXT: v_cmp_lt_f32_e32 vcc_lo, v3, v7 -; GFX10-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc_lo +; GFX10-NEXT: v_min_f32_e32 v0, v0, v4 +; GFX10-NEXT: v_min_f32_e32 v1, v1, v5 +; GFX10-NEXT: v_min_f32_e32 v2, v2, v6 +; GFX10-NEXT: v_min_f32_e32 v3, v3, v7 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: v_minimum_v4f32__nnan_nsz: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_cmp_lt_f32_e32 vcc_lo, v0, v4 -; GFX11-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc_lo -; GFX11-NEXT: v_cmp_lt_f32_e32 vcc_lo, v1, v5 -; GFX11-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc_lo -; GFX11-NEXT: v_cmp_lt_f32_e32 vcc_lo, v2, v6 -; GFX11-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc_lo -; GFX11-NEXT: v_cmp_lt_f32_e32 vcc_lo, v3, v7 -; GFX11-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc_lo +; GFX11-NEXT: v_dual_min_f32 v0, v0, v4 :: v_dual_min_f32 v1, v1, v5 +; GFX11-NEXT: v_dual_min_f32 v2, v2, v6 :: v_dual_min_f32 v3, v3, v7 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-LABEL: v_minimum_v4f32__nnan_nsz: @@ -2673,551 +1630,185 @@ define <8 x float> @v_minimum_v8f32(<8 x float> %src0, <8 x float> %src1) { ; GFX7-LABEL: v_minimum_v8f32: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_min_legacy_f32_e32 v16, v0, v8 +; GFX7-NEXT: v_min_f32_e32 v16, v0, v8 ; GFX7-NEXT: v_mov_b32_e32 v17, 0x7fc00000 ; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v0, v8 -; GFX7-NEXT: v_cndmask_b32_e32 v16, v17, v16, vcc -; GFX7-NEXT: v_cmp_class_f32_e64 vcc, v0, 32 -; GFX7-NEXT: v_cndmask_b32_e32 v0, v16, v0, vcc -; GFX7-NEXT: v_cmp_class_f32_e64 vcc, v8, 32 -; GFX7-NEXT: v_cndmask_b32_e32 v0, v0, v8, vcc -; GFX7-NEXT: v_cmp_eq_f32_e32 vcc, 0, v16 -; 
GFX7-NEXT: v_cndmask_b32_e32 v0, v16, v0, vcc -; GFX7-NEXT: v_min_legacy_f32_e32 v8, v1, v9 +; GFX7-NEXT: v_cndmask_b32_e32 v0, v17, v16, vcc +; GFX7-NEXT: v_min_f32_e32 v8, v1, v9 ; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v1, v9 -; GFX7-NEXT: v_cndmask_b32_e32 v8, v17, v8, vcc -; GFX7-NEXT: v_cmp_class_f32_e64 vcc, v1, 32 -; GFX7-NEXT: v_cndmask_b32_e32 v1, v8, v1, vcc -; GFX7-NEXT: v_cmp_class_f32_e64 vcc, v9, 32 -; GFX7-NEXT: v_cndmask_b32_e32 v1, v1, v9, vcc -; GFX7-NEXT: v_cmp_eq_f32_e32 vcc, 0, v8 -; GFX7-NEXT: v_cndmask_b32_e32 v1, v8, v1, vcc -; GFX7-NEXT: v_min_legacy_f32_e32 v8, v2, v10 +; GFX7-NEXT: v_cndmask_b32_e32 v1, v17, v8, vcc +; GFX7-NEXT: v_min_f32_e32 v8, v2, v10 ; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v2, v10 -; GFX7-NEXT: v_cndmask_b32_e32 v8, v17, v8, vcc -; GFX7-NEXT: v_cmp_class_f32_e64 vcc, v2, 32 -; GFX7-NEXT: v_cndmask_b32_e32 v2, v8, v2, vcc -; GFX7-NEXT: v_cmp_class_f32_e64 vcc, v10, 32 -; GFX7-NEXT: v_cndmask_b32_e32 v2, v2, v10, vcc -; GFX7-NEXT: v_cmp_eq_f32_e32 vcc, 0, v8 -; GFX7-NEXT: v_cndmask_b32_e32 v2, v8, v2, vcc -; GFX7-NEXT: v_min_legacy_f32_e32 v8, v3, v11 +; GFX7-NEXT: v_cndmask_b32_e32 v2, v17, v8, vcc +; GFX7-NEXT: v_min_f32_e32 v8, v3, v11 ; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v3, v11 -; GFX7-NEXT: v_cndmask_b32_e32 v8, v17, v8, vcc -; GFX7-NEXT: v_cmp_class_f32_e64 vcc, v3, 32 -; GFX7-NEXT: v_cndmask_b32_e32 v3, v8, v3, vcc -; GFX7-NEXT: v_cmp_class_f32_e64 vcc, v11, 32 -; GFX7-NEXT: v_cndmask_b32_e32 v3, v3, v11, vcc -; GFX7-NEXT: v_cmp_eq_f32_e32 vcc, 0, v8 -; GFX7-NEXT: v_cndmask_b32_e32 v3, v8, v3, vcc -; GFX7-NEXT: v_min_legacy_f32_e32 v8, v4, v12 +; GFX7-NEXT: v_cndmask_b32_e32 v3, v17, v8, vcc +; GFX7-NEXT: v_min_f32_e32 v8, v4, v12 ; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v4, v12 -; GFX7-NEXT: v_cndmask_b32_e32 v8, v17, v8, vcc -; GFX7-NEXT: v_cmp_class_f32_e64 vcc, v4, 32 -; GFX7-NEXT: v_cndmask_b32_e32 v4, v8, v4, vcc -; GFX7-NEXT: v_cmp_class_f32_e64 vcc, v12, 32 -; GFX7-NEXT: v_cndmask_b32_e32 v4, v4, v12, vcc -; GFX7-NEXT: 
v_cmp_eq_f32_e32 vcc, 0, v8 -; GFX7-NEXT: v_cndmask_b32_e32 v4, v8, v4, vcc -; GFX7-NEXT: v_min_legacy_f32_e32 v8, v5, v13 +; GFX7-NEXT: v_cndmask_b32_e32 v4, v17, v8, vcc +; GFX7-NEXT: v_min_f32_e32 v8, v5, v13 ; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v5, v13 -; GFX7-NEXT: v_cndmask_b32_e32 v8, v17, v8, vcc -; GFX7-NEXT: v_cmp_class_f32_e64 vcc, v5, 32 -; GFX7-NEXT: v_cndmask_b32_e32 v5, v8, v5, vcc -; GFX7-NEXT: v_cmp_class_f32_e64 vcc, v13, 32 -; GFX7-NEXT: v_cndmask_b32_e32 v5, v5, v13, vcc -; GFX7-NEXT: v_cmp_eq_f32_e32 vcc, 0, v8 -; GFX7-NEXT: v_cndmask_b32_e32 v5, v8, v5, vcc -; GFX7-NEXT: v_min_legacy_f32_e32 v8, v6, v14 +; GFX7-NEXT: v_cndmask_b32_e32 v5, v17, v8, vcc +; GFX7-NEXT: v_min_f32_e32 v8, v6, v14 ; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v6, v14 -; GFX7-NEXT: v_cndmask_b32_e32 v8, v17, v8, vcc -; GFX7-NEXT: v_cmp_class_f32_e64 vcc, v6, 32 -; GFX7-NEXT: v_cndmask_b32_e32 v6, v8, v6, vcc -; GFX7-NEXT: v_cmp_class_f32_e64 vcc, v14, 32 -; GFX7-NEXT: v_cndmask_b32_e32 v6, v6, v14, vcc -; GFX7-NEXT: v_cmp_eq_f32_e32 vcc, 0, v8 -; GFX7-NEXT: v_cndmask_b32_e32 v6, v8, v6, vcc -; GFX7-NEXT: v_min_legacy_f32_e32 v8, v7, v15 +; GFX7-NEXT: v_cndmask_b32_e32 v6, v17, v8, vcc +; GFX7-NEXT: v_min_f32_e32 v8, v7, v15 ; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v7, v15 -; GFX7-NEXT: v_cndmask_b32_e32 v8, v17, v8, vcc -; GFX7-NEXT: v_cmp_class_f32_e64 vcc, v7, 32 -; GFX7-NEXT: v_cndmask_b32_e32 v7, v8, v7, vcc -; GFX7-NEXT: v_cmp_class_f32_e64 vcc, v15, 32 -; GFX7-NEXT: v_cndmask_b32_e32 v7, v7, v15, vcc -; GFX7-NEXT: v_cmp_eq_f32_e32 vcc, 0, v8 -; GFX7-NEXT: v_cndmask_b32_e32 v7, v8, v7, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v7, v17, v8, vcc ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_minimum_v8f32: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_cmp_lt_f32_e32 vcc, v0, v8 -; GFX8-NEXT: v_cndmask_b32_e32 v16, v8, v0, vcc +; GFX8-NEXT: v_min_f32_e32 v16, v0, v8 ; GFX8-NEXT: v_mov_b32_e32 v17, 0x7fc00000 ; GFX8-NEXT: v_cmp_o_f32_e32 vcc, v0, v8 
-; GFX8-NEXT: v_cndmask_b32_e32 v16, v17, v16, vcc -; GFX8-NEXT: v_cmp_class_f32_e64 vcc, v0, 32 -; GFX8-NEXT: v_cndmask_b32_e32 v0, v16, v0, vcc -; GFX8-NEXT: v_cmp_class_f32_e64 vcc, v8, 32 -; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v8, vcc -; GFX8-NEXT: v_cmp_eq_f32_e32 vcc, 0, v16 -; GFX8-NEXT: v_cndmask_b32_e32 v0, v16, v0, vcc -; GFX8-NEXT: v_cmp_lt_f32_e32 vcc, v1, v9 -; GFX8-NEXT: v_cndmask_b32_e32 v8, v9, v1, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v0, v17, v16, vcc +; GFX8-NEXT: v_min_f32_e32 v8, v1, v9 ; GFX8-NEXT: v_cmp_o_f32_e32 vcc, v1, v9 -; GFX8-NEXT: v_cndmask_b32_e32 v8, v17, v8, vcc -; GFX8-NEXT: v_cmp_class_f32_e64 vcc, v1, 32 -; GFX8-NEXT: v_cndmask_b32_e32 v1, v8, v1, vcc -; GFX8-NEXT: v_cmp_class_f32_e64 vcc, v9, 32 -; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v9, vcc -; GFX8-NEXT: v_cmp_eq_f32_e32 vcc, 0, v8 -; GFX8-NEXT: v_cndmask_b32_e32 v1, v8, v1, vcc -; GFX8-NEXT: v_cmp_lt_f32_e32 vcc, v2, v10 -; GFX8-NEXT: v_cndmask_b32_e32 v8, v10, v2, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v1, v17, v8, vcc +; GFX8-NEXT: v_min_f32_e32 v8, v2, v10 ; GFX8-NEXT: v_cmp_o_f32_e32 vcc, v2, v10 -; GFX8-NEXT: v_cndmask_b32_e32 v8, v17, v8, vcc -; GFX8-NEXT: v_cmp_class_f32_e64 vcc, v2, 32 -; GFX8-NEXT: v_cndmask_b32_e32 v2, v8, v2, vcc -; GFX8-NEXT: v_cmp_class_f32_e64 vcc, v10, 32 -; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v10, vcc -; GFX8-NEXT: v_cmp_eq_f32_e32 vcc, 0, v8 -; GFX8-NEXT: v_cndmask_b32_e32 v2, v8, v2, vcc -; GFX8-NEXT: v_cmp_lt_f32_e32 vcc, v3, v11 -; GFX8-NEXT: v_cndmask_b32_e32 v8, v11, v3, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v2, v17, v8, vcc +; GFX8-NEXT: v_min_f32_e32 v8, v3, v11 ; GFX8-NEXT: v_cmp_o_f32_e32 vcc, v3, v11 -; GFX8-NEXT: v_cndmask_b32_e32 v8, v17, v8, vcc -; GFX8-NEXT: v_cmp_class_f32_e64 vcc, v3, 32 -; GFX8-NEXT: v_cndmask_b32_e32 v3, v8, v3, vcc -; GFX8-NEXT: v_cmp_class_f32_e64 vcc, v11, 32 -; GFX8-NEXT: v_cndmask_b32_e32 v3, v3, v11, vcc -; GFX8-NEXT: v_cmp_eq_f32_e32 vcc, 0, v8 -; GFX8-NEXT: v_cndmask_b32_e32 v3, v8, v3, vcc -; GFX8-NEXT: 
v_cmp_lt_f32_e32 vcc, v4, v12 -; GFX8-NEXT: v_cndmask_b32_e32 v8, v12, v4, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v3, v17, v8, vcc +; GFX8-NEXT: v_min_f32_e32 v8, v4, v12 ; GFX8-NEXT: v_cmp_o_f32_e32 vcc, v4, v12 -; GFX8-NEXT: v_cndmask_b32_e32 v8, v17, v8, vcc -; GFX8-NEXT: v_cmp_class_f32_e64 vcc, v4, 32 -; GFX8-NEXT: v_cndmask_b32_e32 v4, v8, v4, vcc -; GFX8-NEXT: v_cmp_class_f32_e64 vcc, v12, 32 -; GFX8-NEXT: v_cndmask_b32_e32 v4, v4, v12, vcc -; GFX8-NEXT: v_cmp_eq_f32_e32 vcc, 0, v8 -; GFX8-NEXT: v_cndmask_b32_e32 v4, v8, v4, vcc -; GFX8-NEXT: v_cmp_lt_f32_e32 vcc, v5, v13 -; GFX8-NEXT: v_cndmask_b32_e32 v8, v13, v5, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v4, v17, v8, vcc +; GFX8-NEXT: v_min_f32_e32 v8, v5, v13 ; GFX8-NEXT: v_cmp_o_f32_e32 vcc, v5, v13 -; GFX8-NEXT: v_cndmask_b32_e32 v8, v17, v8, vcc -; GFX8-NEXT: v_cmp_class_f32_e64 vcc, v5, 32 -; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v5, vcc -; GFX8-NEXT: v_cmp_class_f32_e64 vcc, v13, 32 -; GFX8-NEXT: v_cndmask_b32_e32 v5, v5, v13, vcc -; GFX8-NEXT: v_cmp_eq_f32_e32 vcc, 0, v8 -; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v5, vcc -; GFX8-NEXT: v_cmp_lt_f32_e32 vcc, v6, v14 -; GFX8-NEXT: v_cndmask_b32_e32 v8, v14, v6, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v5, v17, v8, vcc +; GFX8-NEXT: v_min_f32_e32 v8, v6, v14 ; GFX8-NEXT: v_cmp_o_f32_e32 vcc, v6, v14 -; GFX8-NEXT: v_cndmask_b32_e32 v8, v17, v8, vcc -; GFX8-NEXT: v_cmp_class_f32_e64 vcc, v6, 32 -; GFX8-NEXT: v_cndmask_b32_e32 v6, v8, v6, vcc -; GFX8-NEXT: v_cmp_class_f32_e64 vcc, v14, 32 -; GFX8-NEXT: v_cndmask_b32_e32 v6, v6, v14, vcc -; GFX8-NEXT: v_cmp_eq_f32_e32 vcc, 0, v8 -; GFX8-NEXT: v_cndmask_b32_e32 v6, v8, v6, vcc -; GFX8-NEXT: v_cmp_lt_f32_e32 vcc, v7, v15 -; GFX8-NEXT: v_cndmask_b32_e32 v8, v15, v7, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v6, v17, v8, vcc +; GFX8-NEXT: v_min_f32_e32 v8, v7, v15 ; GFX8-NEXT: v_cmp_o_f32_e32 vcc, v7, v15 -; GFX8-NEXT: v_cndmask_b32_e32 v8, v17, v8, vcc -; GFX8-NEXT: v_cmp_class_f32_e64 vcc, v7, 32 -; GFX8-NEXT: v_cndmask_b32_e32 v7, 
v8, v7, vcc -; GFX8-NEXT: v_cmp_class_f32_e64 vcc, v15, 32 -; GFX8-NEXT: v_cndmask_b32_e32 v7, v7, v15, vcc -; GFX8-NEXT: v_cmp_eq_f32_e32 vcc, 0, v8 -; GFX8-NEXT: v_cndmask_b32_e32 v7, v8, v7, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v7, v17, v8, vcc ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_minimum_v8f32: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_cmp_lt_f32_e32 vcc, v0, v8 -; GFX9-NEXT: v_cndmask_b32_e32 v16, v8, v0, vcc +; GFX9-NEXT: v_min_f32_e32 v16, v0, v8 ; GFX9-NEXT: v_mov_b32_e32 v17, 0x7fc00000 ; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v8 -; GFX9-NEXT: v_cndmask_b32_e32 v16, v17, v16, vcc -; GFX9-NEXT: v_cmp_class_f32_e64 vcc, v0, 32 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v16, v0, vcc -; GFX9-NEXT: v_cmp_class_f32_e64 vcc, v8, 32 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v8, vcc -; GFX9-NEXT: v_cmp_eq_f32_e32 vcc, 0, v16 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v16, v0, vcc -; GFX9-NEXT: v_cmp_lt_f32_e32 vcc, v1, v9 -; GFX9-NEXT: v_cndmask_b32_e32 v8, v9, v1, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v0, v17, v16, vcc +; GFX9-NEXT: v_min_f32_e32 v8, v1, v9 ; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v1, v9 -; GFX9-NEXT: v_cndmask_b32_e32 v8, v17, v8, vcc -; GFX9-NEXT: v_cmp_class_f32_e64 vcc, v1, 32 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v8, v1, vcc -; GFX9-NEXT: v_cmp_class_f32_e64 vcc, v9, 32 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v9, vcc -; GFX9-NEXT: v_cmp_eq_f32_e32 vcc, 0, v8 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v8, v1, vcc -; GFX9-NEXT: v_cmp_lt_f32_e32 vcc, v2, v10 -; GFX9-NEXT: v_cndmask_b32_e32 v8, v10, v2, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v1, v17, v8, vcc +; GFX9-NEXT: v_min_f32_e32 v8, v2, v10 ; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v2, v10 -; GFX9-NEXT: v_cndmask_b32_e32 v8, v17, v8, vcc -; GFX9-NEXT: v_cmp_class_f32_e64 vcc, v2, 32 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v8, v2, vcc -; GFX9-NEXT: v_cmp_class_f32_e64 vcc, v10, 32 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v10, vcc -; GFX9-NEXT: v_cmp_eq_f32_e32 vcc, 0, v8 -; GFX9-NEXT: 
v_cndmask_b32_e32 v2, v8, v2, vcc -; GFX9-NEXT: v_cmp_lt_f32_e32 vcc, v3, v11 -; GFX9-NEXT: v_cndmask_b32_e32 v8, v11, v3, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v2, v17, v8, vcc +; GFX9-NEXT: v_min_f32_e32 v8, v3, v11 ; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v3, v11 -; GFX9-NEXT: v_cndmask_b32_e32 v8, v17, v8, vcc -; GFX9-NEXT: v_cmp_class_f32_e64 vcc, v3, 32 -; GFX9-NEXT: v_cndmask_b32_e32 v3, v8, v3, vcc -; GFX9-NEXT: v_cmp_class_f32_e64 vcc, v11, 32 -; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v11, vcc -; GFX9-NEXT: v_cmp_eq_f32_e32 vcc, 0, v8 -; GFX9-NEXT: v_cndmask_b32_e32 v3, v8, v3, vcc -; GFX9-NEXT: v_cmp_lt_f32_e32 vcc, v4, v12 -; GFX9-NEXT: v_cndmask_b32_e32 v8, v12, v4, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v3, v17, v8, vcc +; GFX9-NEXT: v_min_f32_e32 v8, v4, v12 ; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v4, v12 -; GFX9-NEXT: v_cndmask_b32_e32 v8, v17, v8, vcc -; GFX9-NEXT: v_cmp_class_f32_e64 vcc, v4, 32 -; GFX9-NEXT: v_cndmask_b32_e32 v4, v8, v4, vcc -; GFX9-NEXT: v_cmp_class_f32_e64 vcc, v12, 32 -; GFX9-NEXT: v_cndmask_b32_e32 v4, v4, v12, vcc -; GFX9-NEXT: v_cmp_eq_f32_e32 vcc, 0, v8 -; GFX9-NEXT: v_cndmask_b32_e32 v4, v8, v4, vcc -; GFX9-NEXT: v_cmp_lt_f32_e32 vcc, v5, v13 -; GFX9-NEXT: v_cndmask_b32_e32 v8, v13, v5, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v4, v17, v8, vcc +; GFX9-NEXT: v_min_f32_e32 v8, v5, v13 ; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v5, v13 -; GFX9-NEXT: v_cndmask_b32_e32 v8, v17, v8, vcc -; GFX9-NEXT: v_cmp_class_f32_e64 vcc, v5, 32 -; GFX9-NEXT: v_cndmask_b32_e32 v5, v8, v5, vcc -; GFX9-NEXT: v_cmp_class_f32_e64 vcc, v13, 32 -; GFX9-NEXT: v_cndmask_b32_e32 v5, v5, v13, vcc -; GFX9-NEXT: v_cmp_eq_f32_e32 vcc, 0, v8 -; GFX9-NEXT: v_cndmask_b32_e32 v5, v8, v5, vcc -; GFX9-NEXT: v_cmp_lt_f32_e32 vcc, v6, v14 -; GFX9-NEXT: v_cndmask_b32_e32 v8, v14, v6, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v5, v17, v8, vcc +; GFX9-NEXT: v_min_f32_e32 v8, v6, v14 ; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v6, v14 -; GFX9-NEXT: v_cndmask_b32_e32 v8, v17, v8, vcc -; GFX9-NEXT: v_cmp_class_f32_e64 
vcc, v6, 32 -; GFX9-NEXT: v_cndmask_b32_e32 v6, v8, v6, vcc -; GFX9-NEXT: v_cmp_class_f32_e64 vcc, v14, 32 -; GFX9-NEXT: v_cndmask_b32_e32 v6, v6, v14, vcc -; GFX9-NEXT: v_cmp_eq_f32_e32 vcc, 0, v8 -; GFX9-NEXT: v_cndmask_b32_e32 v6, v8, v6, vcc -; GFX9-NEXT: v_cmp_lt_f32_e32 vcc, v7, v15 -; GFX9-NEXT: v_cndmask_b32_e32 v8, v15, v7, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v6, v17, v8, vcc +; GFX9-NEXT: v_min_f32_e32 v8, v7, v15 ; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v7, v15 -; GFX9-NEXT: v_cndmask_b32_e32 v8, v17, v8, vcc -; GFX9-NEXT: v_cmp_class_f32_e64 vcc, v7, 32 -; GFX9-NEXT: v_cndmask_b32_e32 v7, v8, v7, vcc -; GFX9-NEXT: v_cmp_class_f32_e64 vcc, v15, 32 -; GFX9-NEXT: v_cndmask_b32_e32 v7, v7, v15, vcc -; GFX9-NEXT: v_cmp_eq_f32_e32 vcc, 0, v8 -; GFX9-NEXT: v_cndmask_b32_e32 v7, v8, v7, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v7, v17, v8, vcc ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: v_minimum_v8f32: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_cmp_lt_f32_e32 vcc, v0, v8 +; GFX940-NEXT: v_min_f32_e32 v16, v0, v8 ; GFX940-NEXT: v_mov_b32_e32 v17, 0x7fc00000 -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e32 v16, v8, v0, vcc ; GFX940-NEXT: v_cmp_o_f32_e32 vcc, v0, v8 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v16, v17, v16, vcc -; GFX940-NEXT: v_cmp_class_f32_e64 vcc, v0, 32 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v0, v16, v0, vcc -; GFX940-NEXT: v_cmp_class_f32_e64 vcc, v8, 32 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v0, v0, v8, vcc -; GFX940-NEXT: v_cmp_eq_f32_e32 vcc, 0, v16 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v0, v16, v0, vcc -; GFX940-NEXT: v_cmp_lt_f32_e32 vcc, v1, v9 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v8, v9, v1, vcc +; GFX940-NEXT: v_min_f32_e32 v8, v1, v9 +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_cndmask_b32_e32 v0, v17, v16, vcc ; GFX940-NEXT: v_cmp_o_f32_e32 vcc, v1, v9 ; GFX940-NEXT: s_nop 1 -; 
GFX940-NEXT: v_cndmask_b32_e32 v8, v17, v8, vcc -; GFX940-NEXT: v_cmp_class_f32_e64 vcc, v1, 32 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v1, v8, v1, vcc -; GFX940-NEXT: v_cmp_class_f32_e64 vcc, v9, 32 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v1, v1, v9, vcc -; GFX940-NEXT: v_cmp_eq_f32_e32 vcc, 0, v8 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v1, v8, v1, vcc -; GFX940-NEXT: v_cmp_lt_f32_e32 vcc, v2, v10 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v8, v10, v2, vcc +; GFX940-NEXT: v_cndmask_b32_e32 v1, v17, v8, vcc +; GFX940-NEXT: v_min_f32_e32 v8, v2, v10 ; GFX940-NEXT: v_cmp_o_f32_e32 vcc, v2, v10 ; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v8, v17, v8, vcc -; GFX940-NEXT: v_cmp_class_f32_e64 vcc, v2, 32 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v2, v8, v2, vcc -; GFX940-NEXT: v_cmp_class_f32_e64 vcc, v10, 32 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v2, v2, v10, vcc -; GFX940-NEXT: v_cmp_eq_f32_e32 vcc, 0, v8 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v2, v8, v2, vcc -; GFX940-NEXT: v_cmp_lt_f32_e32 vcc, v3, v11 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v8, v11, v3, vcc +; GFX940-NEXT: v_cndmask_b32_e32 v2, v17, v8, vcc +; GFX940-NEXT: v_min_f32_e32 v8, v3, v11 ; GFX940-NEXT: v_cmp_o_f32_e32 vcc, v3, v11 ; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v8, v17, v8, vcc -; GFX940-NEXT: v_cmp_class_f32_e64 vcc, v3, 32 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v3, v8, v3, vcc -; GFX940-NEXT: v_cmp_class_f32_e64 vcc, v11, 32 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v3, v3, v11, vcc -; GFX940-NEXT: v_cmp_eq_f32_e32 vcc, 0, v8 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v3, v8, v3, vcc -; GFX940-NEXT: v_cmp_lt_f32_e32 vcc, v4, v12 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v8, v12, v4, vcc +; GFX940-NEXT: v_cndmask_b32_e32 v3, v17, v8, vcc +; GFX940-NEXT: 
v_min_f32_e32 v8, v4, v12 ; GFX940-NEXT: v_cmp_o_f32_e32 vcc, v4, v12 ; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v8, v17, v8, vcc -; GFX940-NEXT: v_cmp_class_f32_e64 vcc, v4, 32 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v4, v8, v4, vcc -; GFX940-NEXT: v_cmp_class_f32_e64 vcc, v12, 32 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v4, v4, v12, vcc -; GFX940-NEXT: v_cmp_eq_f32_e32 vcc, 0, v8 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v4, v8, v4, vcc -; GFX940-NEXT: v_cmp_lt_f32_e32 vcc, v5, v13 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v8, v13, v5, vcc +; GFX940-NEXT: v_cndmask_b32_e32 v4, v17, v8, vcc +; GFX940-NEXT: v_min_f32_e32 v8, v5, v13 ; GFX940-NEXT: v_cmp_o_f32_e32 vcc, v5, v13 ; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v8, v17, v8, vcc -; GFX940-NEXT: v_cmp_class_f32_e64 vcc, v5, 32 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v5, v8, v5, vcc -; GFX940-NEXT: v_cmp_class_f32_e64 vcc, v13, 32 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v5, v5, v13, vcc -; GFX940-NEXT: v_cmp_eq_f32_e32 vcc, 0, v8 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v5, v8, v5, vcc -; GFX940-NEXT: v_cmp_lt_f32_e32 vcc, v6, v14 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v8, v14, v6, vcc +; GFX940-NEXT: v_cndmask_b32_e32 v5, v17, v8, vcc +; GFX940-NEXT: v_min_f32_e32 v8, v6, v14 ; GFX940-NEXT: v_cmp_o_f32_e32 vcc, v6, v14 ; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v8, v17, v8, vcc -; GFX940-NEXT: v_cmp_class_f32_e64 vcc, v6, 32 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v6, v8, v6, vcc -; GFX940-NEXT: v_cmp_class_f32_e64 vcc, v14, 32 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v6, v6, v14, vcc -; GFX940-NEXT: v_cmp_eq_f32_e32 vcc, 0, v8 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v6, v8, v6, vcc -; GFX940-NEXT: v_cmp_lt_f32_e32 vcc, v7, v15 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: 
v_cndmask_b32_e32 v8, v15, v7, vcc +; GFX940-NEXT: v_cndmask_b32_e32 v6, v17, v8, vcc +; GFX940-NEXT: v_min_f32_e32 v8, v7, v15 ; GFX940-NEXT: v_cmp_o_f32_e32 vcc, v7, v15 ; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v8, v17, v8, vcc -; GFX940-NEXT: v_cmp_class_f32_e64 vcc, v7, 32 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v7, v8, v7, vcc -; GFX940-NEXT: v_cmp_class_f32_e64 vcc, v15, 32 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v7, v7, v15, vcc -; GFX940-NEXT: v_cmp_eq_f32_e32 vcc, 0, v8 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v7, v8, v7, vcc +; GFX940-NEXT: v_cndmask_b32_e32 v7, v17, v8, vcc ; GFX940-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_minimum_v8f32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_cmp_lt_f32_e32 vcc_lo, v0, v8 -; GFX10-NEXT: v_cndmask_b32_e32 v16, v8, v0, vcc_lo -; GFX10-NEXT: v_cmp_lt_f32_e32 vcc_lo, v1, v9 -; GFX10-NEXT: v_cndmask_b32_e32 v17, v9, v1, vcc_lo +; GFX10-NEXT: v_min_f32_e32 v16, v0, v8 ; GFX10-NEXT: v_cmp_o_f32_e32 vcc_lo, v0, v8 -; GFX10-NEXT: v_cndmask_b32_e32 v16, 0x7fc00000, v16, vcc_lo +; GFX10-NEXT: v_min_f32_e32 v17, v1, v9 +; GFX10-NEXT: v_min_f32_e32 v8, v2, v10 +; GFX10-NEXT: v_cndmask_b32_e32 v0, 0x7fc00000, v16, vcc_lo ; GFX10-NEXT: v_cmp_o_f32_e32 vcc_lo, v1, v9 -; GFX10-NEXT: v_cndmask_b32_e32 v17, 0x7fc00000, v17, vcc_lo -; GFX10-NEXT: v_cmp_class_f32_e64 vcc_lo, v0, 32 -; GFX10-NEXT: v_cndmask_b32_e32 v0, v16, v0, vcc_lo -; GFX10-NEXT: v_cmp_class_f32_e64 vcc_lo, v1, 32 -; GFX10-NEXT: v_cndmask_b32_e32 v1, v17, v1, vcc_lo -; GFX10-NEXT: v_cmp_class_f32_e64 vcc_lo, v8, 32 -; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v8, vcc_lo -; GFX10-NEXT: v_cmp_class_f32_e64 vcc_lo, v9, 32 -; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v9, vcc_lo -; GFX10-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v16 -; GFX10-NEXT: v_cndmask_b32_e32 v0, v16, v0, vcc_lo -; GFX10-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v17 -; GFX10-NEXT: v_cndmask_b32_e32 v1, v17, 
v1, vcc_lo -; GFX10-NEXT: v_cmp_lt_f32_e32 vcc_lo, v2, v10 -; GFX10-NEXT: v_cndmask_b32_e32 v8, v10, v2, vcc_lo -; GFX10-NEXT: v_cmp_lt_f32_e32 vcc_lo, v3, v11 -; GFX10-NEXT: v_cndmask_b32_e32 v9, v11, v3, vcc_lo -; GFX10-NEXT: v_cmp_lt_f32_e32 vcc_lo, v4, v12 -; GFX10-NEXT: v_cndmask_b32_e32 v16, v12, v4, vcc_lo +; GFX10-NEXT: v_min_f32_e32 v9, v3, v11 +; GFX10-NEXT: v_cndmask_b32_e32 v1, 0x7fc00000, v17, vcc_lo ; GFX10-NEXT: v_cmp_o_f32_e32 vcc_lo, v2, v10 -; GFX10-NEXT: v_cndmask_b32_e32 v8, 0x7fc00000, v8, vcc_lo +; GFX10-NEXT: v_min_f32_e32 v10, v7, v15 +; GFX10-NEXT: v_cndmask_b32_e32 v2, 0x7fc00000, v8, vcc_lo ; GFX10-NEXT: v_cmp_o_f32_e32 vcc_lo, v3, v11 -; GFX10-NEXT: v_cndmask_b32_e32 v9, 0x7fc00000, v9, vcc_lo +; GFX10-NEXT: v_min_f32_e32 v8, v4, v12 +; GFX10-NEXT: v_cndmask_b32_e32 v3, 0x7fc00000, v9, vcc_lo ; GFX10-NEXT: v_cmp_o_f32_e32 vcc_lo, v4, v12 -; GFX10-NEXT: v_cndmask_b32_e32 v16, 0x7fc00000, v16, vcc_lo -; GFX10-NEXT: v_cmp_class_f32_e64 vcc_lo, v2, 32 -; GFX10-NEXT: v_cndmask_b32_e32 v2, v8, v2, vcc_lo -; GFX10-NEXT: v_cmp_class_f32_e64 vcc_lo, v3, 32 -; GFX10-NEXT: v_cndmask_b32_e32 v3, v9, v3, vcc_lo -; GFX10-NEXT: v_cmp_class_f32_e64 vcc_lo, v4, 32 -; GFX10-NEXT: v_cndmask_b32_e32 v4, v16, v4, vcc_lo -; GFX10-NEXT: v_cmp_class_f32_e64 vcc_lo, v10, 32 -; GFX10-NEXT: v_cndmask_b32_e32 v2, v2, v10, vcc_lo -; GFX10-NEXT: v_cmp_class_f32_e64 vcc_lo, v11, 32 -; GFX10-NEXT: v_cndmask_b32_e32 v3, v3, v11, vcc_lo -; GFX10-NEXT: v_cmp_class_f32_e64 vcc_lo, v12, 32 -; GFX10-NEXT: v_cndmask_b32_e32 v4, v4, v12, vcc_lo -; GFX10-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v8 -; GFX10-NEXT: v_cndmask_b32_e32 v2, v8, v2, vcc_lo -; GFX10-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v9 -; GFX10-NEXT: v_cndmask_b32_e32 v3, v9, v3, vcc_lo -; GFX10-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v16 -; GFX10-NEXT: v_cndmask_b32_e32 v4, v16, v4, vcc_lo -; GFX10-NEXT: v_cmp_lt_f32_e32 vcc_lo, v5, v13 -; GFX10-NEXT: v_cndmask_b32_e32 v8, v13, v5, vcc_lo -; GFX10-NEXT: v_cmp_lt_f32_e32 vcc_lo, v6, 
v14 -; GFX10-NEXT: v_cndmask_b32_e32 v9, v14, v6, vcc_lo -; GFX10-NEXT: v_cmp_lt_f32_e32 vcc_lo, v7, v15 -; GFX10-NEXT: v_cndmask_b32_e32 v10, v15, v7, vcc_lo +; GFX10-NEXT: v_min_f32_e32 v9, v5, v13 +; GFX10-NEXT: v_cndmask_b32_e32 v4, 0x7fc00000, v8, vcc_lo ; GFX10-NEXT: v_cmp_o_f32_e32 vcc_lo, v5, v13 -; GFX10-NEXT: v_cndmask_b32_e32 v8, 0x7fc00000, v8, vcc_lo +; GFX10-NEXT: v_min_f32_e32 v8, v6, v14 +; GFX10-NEXT: v_cndmask_b32_e32 v5, 0x7fc00000, v9, vcc_lo ; GFX10-NEXT: v_cmp_o_f32_e32 vcc_lo, v6, v14 -; GFX10-NEXT: v_cndmask_b32_e32 v9, 0x7fc00000, v9, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v6, 0x7fc00000, v8, vcc_lo ; GFX10-NEXT: v_cmp_o_f32_e32 vcc_lo, v7, v15 -; GFX10-NEXT: v_cndmask_b32_e32 v10, 0x7fc00000, v10, vcc_lo -; GFX10-NEXT: v_cmp_class_f32_e64 vcc_lo, v5, 32 -; GFX10-NEXT: v_cndmask_b32_e32 v5, v8, v5, vcc_lo -; GFX10-NEXT: v_cmp_class_f32_e64 vcc_lo, v6, 32 -; GFX10-NEXT: v_cndmask_b32_e32 v6, v9, v6, vcc_lo -; GFX10-NEXT: v_cmp_class_f32_e64 vcc_lo, v7, 32 -; GFX10-NEXT: v_cndmask_b32_e32 v7, v10, v7, vcc_lo -; GFX10-NEXT: v_cmp_class_f32_e64 vcc_lo, v13, 32 -; GFX10-NEXT: v_cndmask_b32_e32 v5, v5, v13, vcc_lo -; GFX10-NEXT: v_cmp_class_f32_e64 vcc_lo, v14, 32 -; GFX10-NEXT: v_cndmask_b32_e32 v6, v6, v14, vcc_lo -; GFX10-NEXT: v_cmp_class_f32_e64 vcc_lo, v15, 32 -; GFX10-NEXT: v_cndmask_b32_e32 v7, v7, v15, vcc_lo -; GFX10-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v8 -; GFX10-NEXT: v_cndmask_b32_e32 v5, v8, v5, vcc_lo -; GFX10-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v9 -; GFX10-NEXT: v_cndmask_b32_e32 v6, v9, v6, vcc_lo -; GFX10-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v10 -; GFX10-NEXT: v_cndmask_b32_e32 v7, v10, v7, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v7, 0x7fc00000, v10, vcc_lo ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: v_minimum_v8f32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_cmp_lt_f32_e32 vcc_lo, v0, v8 -; GFX11-NEXT: v_cndmask_b32_e32 v16, v8, v0, vcc_lo -; GFX11-NEXT: v_cmp_lt_f32_e32 vcc_lo, 
v1, v9 -; GFX11-NEXT: v_cndmask_b32_e32 v17, v9, v1, vcc_lo +; GFX11-NEXT: v_dual_min_f32 v16, v0, v8 :: v_dual_min_f32 v17, v1, v9 ; GFX11-NEXT: v_cmp_o_f32_e32 vcc_lo, v0, v8 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_cndmask_b32_e32 v16, 0x7fc00000, v16, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: v_cndmask_b32_e32 v0, 0x7fc00000, v16, vcc_lo ; GFX11-NEXT: v_cmp_o_f32_e32 vcc_lo, v1, v9 -; GFX11-NEXT: v_cndmask_b32_e32 v17, 0x7fc00000, v17, vcc_lo -; GFX11-NEXT: v_cmp_class_f32_e64 vcc_lo, v0, 32 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_cndmask_b32_e32 v0, v16, v0, vcc_lo -; GFX11-NEXT: v_cmp_class_f32_e64 vcc_lo, v1, 32 -; GFX11-NEXT: v_cndmask_b32_e32 v1, v17, v1, vcc_lo -; GFX11-NEXT: v_cmp_class_f32_e64 vcc_lo, v8, 32 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v8, vcc_lo -; GFX11-NEXT: v_cmp_class_f32_e64 vcc_lo, v9, 32 -; GFX11-NEXT: v_cndmask_b32_e32 v1, v1, v9, vcc_lo -; GFX11-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v16 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_cndmask_b32_e32 v0, v16, v0, vcc_lo -; GFX11-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v17 -; GFX11-NEXT: v_cndmask_b32_e32 v1, v17, v1, vcc_lo -; GFX11-NEXT: v_cmp_lt_f32_e32 vcc_lo, v2, v10 -; GFX11-NEXT: v_cndmask_b32_e32 v8, v10, v2, vcc_lo -; GFX11-NEXT: v_cmp_lt_f32_e32 vcc_lo, v3, v11 -; GFX11-NEXT: v_cndmask_b32_e32 v9, v11, v3, vcc_lo -; GFX11-NEXT: v_cmp_lt_f32_e32 vcc_lo, v4, v12 -; GFX11-NEXT: v_cndmask_b32_e32 v16, v12, v4, vcc_lo +; GFX11-NEXT: v_dual_min_f32 v9, v3, v11 :: v_dual_min_f32 v8, v2, v10 +; GFX11-NEXT: v_cndmask_b32_e32 v1, 0x7fc00000, v17, vcc_lo ; GFX11-NEXT: v_cmp_o_f32_e32 vcc_lo, v2, v10 -; GFX11-NEXT: v_cndmask_b32_e32 v8, 0x7fc00000, v8, vcc_lo +; GFX11-NEXT: v_min_f32_e32 v10, v7, 
v15 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_cndmask_b32_e32 v2, 0x7fc00000, v8, vcc_lo ; GFX11-NEXT: v_cmp_o_f32_e32 vcc_lo, v3, v11 -; GFX11-NEXT: v_cndmask_b32_e32 v9, 0x7fc00000, v9, vcc_lo +; GFX11-NEXT: v_dual_min_f32 v8, v4, v12 :: v_dual_cndmask_b32 v3, 0x7fc00000, v9 ; GFX11-NEXT: v_cmp_o_f32_e32 vcc_lo, v4, v12 -; GFX11-NEXT: v_cndmask_b32_e32 v16, 0x7fc00000, v16, vcc_lo -; GFX11-NEXT: v_cmp_class_f32_e64 vcc_lo, v2, 32 -; GFX11-NEXT: v_cndmask_b32_e32 v2, v8, v2, vcc_lo -; GFX11-NEXT: v_cmp_class_f32_e64 vcc_lo, v3, 32 -; GFX11-NEXT: v_cndmask_b32_e32 v3, v9, v3, vcc_lo -; GFX11-NEXT: v_cmp_class_f32_e64 vcc_lo, v4, 32 -; GFX11-NEXT: v_cndmask_b32_e32 v4, v16, v4, vcc_lo -; GFX11-NEXT: v_cmp_class_f32_e64 vcc_lo, v10, 32 -; GFX11-NEXT: v_cndmask_b32_e32 v2, v2, v10, vcc_lo -; GFX11-NEXT: v_cmp_class_f32_e64 vcc_lo, v11, 32 -; GFX11-NEXT: v_cndmask_b32_e32 v3, v3, v11, vcc_lo -; GFX11-NEXT: v_cmp_class_f32_e64 vcc_lo, v12, 32 -; GFX11-NEXT: v_cndmask_b32_e32 v4, v4, v12, vcc_lo -; GFX11-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v8 -; GFX11-NEXT: v_cndmask_b32_e32 v2, v8, v2, vcc_lo -; GFX11-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v9 -; GFX11-NEXT: v_cndmask_b32_e32 v3, v9, v3, vcc_lo -; GFX11-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v16 -; GFX11-NEXT: v_cndmask_b32_e32 v4, v16, v4, vcc_lo -; GFX11-NEXT: v_cmp_lt_f32_e32 vcc_lo, v5, v13 -; GFX11-NEXT: v_cndmask_b32_e32 v8, v13, v5, vcc_lo -; GFX11-NEXT: v_cmp_lt_f32_e32 vcc_lo, v6, v14 -; GFX11-NEXT: v_cndmask_b32_e32 v9, v14, v6, vcc_lo -; GFX11-NEXT: v_cmp_lt_f32_e32 vcc_lo, v7, v15 -; GFX11-NEXT: v_cndmask_b32_e32 v10, v15, v7, vcc_lo +; GFX11-NEXT: v_dual_min_f32 v9, v5, v13 :: v_dual_cndmask_b32 v4, 0x7fc00000, v8 ; GFX11-NEXT: v_cmp_o_f32_e32 vcc_lo, v5, v13 -; GFX11-NEXT: v_cndmask_b32_e32 v8, 0x7fc00000, v8, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_dual_min_f32 v8, v6, v14 :: 
v_dual_cndmask_b32 v5, 0x7fc00000, v9 ; GFX11-NEXT: v_cmp_o_f32_e32 vcc_lo, v6, v14 -; GFX11-NEXT: v_cndmask_b32_e32 v9, 0x7fc00000, v9, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e32 v6, 0x7fc00000, v8, vcc_lo ; GFX11-NEXT: v_cmp_o_f32_e32 vcc_lo, v7, v15 -; GFX11-NEXT: v_cndmask_b32_e32 v10, 0x7fc00000, v10, vcc_lo -; GFX11-NEXT: v_cmp_class_f32_e64 vcc_lo, v5, 32 -; GFX11-NEXT: v_cndmask_b32_e32 v5, v8, v5, vcc_lo -; GFX11-NEXT: v_cmp_class_f32_e64 vcc_lo, v6, 32 -; GFX11-NEXT: v_cndmask_b32_e32 v6, v9, v6, vcc_lo -; GFX11-NEXT: v_cmp_class_f32_e64 vcc_lo, v7, 32 -; GFX11-NEXT: v_cndmask_b32_e32 v7, v10, v7, vcc_lo -; GFX11-NEXT: v_cmp_class_f32_e64 vcc_lo, v13, 32 -; GFX11-NEXT: v_cndmask_b32_e32 v5, v5, v13, vcc_lo -; GFX11-NEXT: v_cmp_class_f32_e64 vcc_lo, v14, 32 -; GFX11-NEXT: v_cndmask_b32_e32 v6, v6, v14, vcc_lo -; GFX11-NEXT: v_cmp_class_f32_e64 vcc_lo, v15, 32 -; GFX11-NEXT: v_cndmask_b32_e32 v7, v7, v15, vcc_lo -; GFX11-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v8 -; GFX11-NEXT: v_cndmask_b32_e32 v5, v8, v5, vcc_lo -; GFX11-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v9 -; GFX11-NEXT: v_cndmask_b32_e32 v6, v9, v6, vcc_lo -; GFX11-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v10 -; GFX11-NEXT: v_cndmask_b32_e32 v7, v10, v7, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e32 v7, 0x7fc00000, v10, vcc_lo ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-LABEL: v_minimum_v8f32: @@ -3244,1071 +1835,371 @@ define <16 x float> @v_minimum_v16f32(<16 x float> %src0, <16 x float> %src1) { ; GFX7-LABEL: v_minimum_v16f32: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_min_legacy_f32_e32 v32, v0, v16 -; GFX7-NEXT: v_mov_b32_e32 v31, 0x7fc00000 -; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v0, v16 -; GFX7-NEXT: v_cndmask_b32_e32 v32, v31, v32, vcc -; GFX7-NEXT: v_cmp_class_f32_e64 vcc, v0, 32 -; GFX7-NEXT: v_cndmask_b32_e32 v0, v32, v0, vcc -; GFX7-NEXT: v_cmp_class_f32_e64 vcc, v16, 32 -; GFX7-NEXT: v_cndmask_b32_e32 v0, v0, v16, vcc -; GFX7-NEXT: v_cmp_eq_f32_e32 vcc, 0, v32 -; GFX7-NEXT: 
v_cndmask_b32_e32 v0, v32, v0, vcc -; GFX7-NEXT: v_min_legacy_f32_e32 v16, v1, v17 +; GFX7-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GFX7-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; GFX7-NEXT: s_mov_b64 exec, s[4:5] +; GFX7-NEXT: v_cmp_o_f32_e64 s[16:17], v0, v16 +; GFX7-NEXT: v_min_f32_e32 v0, v0, v16 +; GFX7-NEXT: buffer_load_dword v16, off, s[0:3], s32 +; GFX7-NEXT: v_writelane_b32 v31, s30, 0 ; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v1, v17 -; GFX7-NEXT: v_cndmask_b32_e32 v16, v31, v16, vcc -; GFX7-NEXT: v_cmp_class_f32_e64 vcc, v1, 32 -; GFX7-NEXT: v_cndmask_b32_e32 v1, v16, v1, vcc -; GFX7-NEXT: v_cmp_class_f32_e64 vcc, v17, 32 -; GFX7-NEXT: v_cndmask_b32_e32 v1, v1, v17, vcc -; GFX7-NEXT: buffer_load_dword v17, off, s[0:3], s32 -; GFX7-NEXT: v_cmp_eq_f32_e32 vcc, 0, v16 -; GFX7-NEXT: v_cndmask_b32_e32 v1, v16, v1, vcc -; GFX7-NEXT: v_min_legacy_f32_e32 v16, v2, v18 -; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v2, v18 -; GFX7-NEXT: v_cndmask_b32_e32 v16, v31, v16, vcc -; GFX7-NEXT: v_cmp_class_f32_e64 vcc, v2, 32 -; GFX7-NEXT: v_cndmask_b32_e32 v2, v16, v2, vcc -; GFX7-NEXT: v_cmp_class_f32_e64 vcc, v18, 32 -; GFX7-NEXT: v_cndmask_b32_e32 v2, v2, v18, vcc -; GFX7-NEXT: v_cmp_eq_f32_e32 vcc, 0, v16 -; GFX7-NEXT: v_cndmask_b32_e32 v2, v16, v2, vcc -; GFX7-NEXT: v_min_legacy_f32_e32 v16, v3, v19 -; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v3, v19 -; GFX7-NEXT: v_cndmask_b32_e32 v16, v31, v16, vcc -; GFX7-NEXT: v_cmp_class_f32_e64 vcc, v3, 32 -; GFX7-NEXT: v_cndmask_b32_e32 v3, v16, v3, vcc -; GFX7-NEXT: v_cmp_class_f32_e64 vcc, v19, 32 -; GFX7-NEXT: v_cndmask_b32_e32 v3, v3, v19, vcc -; GFX7-NEXT: v_cmp_eq_f32_e32 vcc, 0, v16 -; GFX7-NEXT: v_cndmask_b32_e32 v3, v16, v3, vcc -; GFX7-NEXT: v_min_legacy_f32_e32 v16, v4, v20 -; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v4, v20 -; GFX7-NEXT: v_cndmask_b32_e32 v16, v31, v16, vcc -; GFX7-NEXT: v_cmp_class_f32_e64 vcc, v4, 32 -; GFX7-NEXT: v_cndmask_b32_e32 v4, v16, v4, vcc -; GFX7-NEXT: v_cmp_class_f32_e64 vcc, v20, 32 
-; GFX7-NEXT: v_cndmask_b32_e32 v4, v4, v20, vcc -; GFX7-NEXT: v_cmp_eq_f32_e32 vcc, 0, v16 -; GFX7-NEXT: v_cndmask_b32_e32 v4, v16, v4, vcc -; GFX7-NEXT: v_min_legacy_f32_e32 v16, v5, v21 -; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v5, v21 -; GFX7-NEXT: v_cndmask_b32_e32 v16, v31, v16, vcc -; GFX7-NEXT: v_cmp_class_f32_e64 vcc, v5, 32 -; GFX7-NEXT: v_cndmask_b32_e32 v5, v16, v5, vcc -; GFX7-NEXT: v_cmp_class_f32_e64 vcc, v21, 32 -; GFX7-NEXT: v_cndmask_b32_e32 v5, v5, v21, vcc -; GFX7-NEXT: v_cmp_eq_f32_e32 vcc, 0, v16 -; GFX7-NEXT: v_cndmask_b32_e32 v5, v16, v5, vcc -; GFX7-NEXT: v_min_legacy_f32_e32 v16, v6, v22 -; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v6, v22 -; GFX7-NEXT: v_cndmask_b32_e32 v16, v31, v16, vcc -; GFX7-NEXT: v_cmp_class_f32_e64 vcc, v6, 32 -; GFX7-NEXT: v_cndmask_b32_e32 v6, v16, v6, vcc -; GFX7-NEXT: v_cmp_class_f32_e64 vcc, v22, 32 -; GFX7-NEXT: v_cndmask_b32_e32 v6, v6, v22, vcc -; GFX7-NEXT: v_cmp_eq_f32_e32 vcc, 0, v16 -; GFX7-NEXT: v_cndmask_b32_e32 v6, v16, v6, vcc -; GFX7-NEXT: v_min_legacy_f32_e32 v16, v7, v23 -; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v7, v23 -; GFX7-NEXT: v_cndmask_b32_e32 v16, v31, v16, vcc -; GFX7-NEXT: v_cmp_class_f32_e64 vcc, v7, 32 -; GFX7-NEXT: v_cndmask_b32_e32 v7, v16, v7, vcc -; GFX7-NEXT: v_cmp_class_f32_e64 vcc, v23, 32 -; GFX7-NEXT: v_cndmask_b32_e32 v7, v7, v23, vcc -; GFX7-NEXT: v_cmp_eq_f32_e32 vcc, 0, v16 -; GFX7-NEXT: v_cndmask_b32_e32 v7, v16, v7, vcc -; GFX7-NEXT: v_min_legacy_f32_e32 v16, v8, v24 -; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v8, v24 -; GFX7-NEXT: v_cndmask_b32_e32 v16, v31, v16, vcc -; GFX7-NEXT: v_cmp_class_f32_e64 vcc, v8, 32 -; GFX7-NEXT: v_cndmask_b32_e32 v8, v16, v8, vcc -; GFX7-NEXT: v_cmp_class_f32_e64 vcc, v24, 32 -; GFX7-NEXT: v_cndmask_b32_e32 v8, v8, v24, vcc -; GFX7-NEXT: v_cmp_eq_f32_e32 vcc, 0, v16 -; GFX7-NEXT: v_cndmask_b32_e32 v8, v16, v8, vcc -; GFX7-NEXT: v_min_legacy_f32_e32 v16, v9, v25 -; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v9, v25 -; GFX7-NEXT: v_cndmask_b32_e32 v16, v31, v16, vcc -; GFX7-NEXT: 
v_cmp_class_f32_e64 vcc, v9, 32 -; GFX7-NEXT: v_cndmask_b32_e32 v9, v16, v9, vcc -; GFX7-NEXT: v_cmp_class_f32_e64 vcc, v25, 32 -; GFX7-NEXT: v_cndmask_b32_e32 v9, v9, v25, vcc -; GFX7-NEXT: v_cmp_eq_f32_e32 vcc, 0, v16 -; GFX7-NEXT: v_cndmask_b32_e32 v9, v16, v9, vcc -; GFX7-NEXT: v_min_legacy_f32_e32 v16, v10, v26 -; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v10, v26 -; GFX7-NEXT: v_cndmask_b32_e32 v16, v31, v16, vcc -; GFX7-NEXT: v_cmp_class_f32_e64 vcc, v10, 32 -; GFX7-NEXT: v_cndmask_b32_e32 v10, v16, v10, vcc -; GFX7-NEXT: v_cmp_class_f32_e64 vcc, v26, 32 -; GFX7-NEXT: v_cndmask_b32_e32 v10, v10, v26, vcc -; GFX7-NEXT: v_cmp_eq_f32_e32 vcc, 0, v16 -; GFX7-NEXT: v_cndmask_b32_e32 v10, v16, v10, vcc -; GFX7-NEXT: v_min_legacy_f32_e32 v16, v11, v27 -; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v11, v27 -; GFX7-NEXT: v_cndmask_b32_e32 v16, v31, v16, vcc -; GFX7-NEXT: v_cmp_class_f32_e64 vcc, v11, 32 -; GFX7-NEXT: v_cndmask_b32_e32 v11, v16, v11, vcc -; GFX7-NEXT: v_cmp_class_f32_e64 vcc, v27, 32 -; GFX7-NEXT: v_cndmask_b32_e32 v11, v11, v27, vcc -; GFX7-NEXT: v_cmp_eq_f32_e32 vcc, 0, v16 -; GFX7-NEXT: v_cndmask_b32_e32 v11, v16, v11, vcc -; GFX7-NEXT: v_min_legacy_f32_e32 v16, v12, v28 -; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v12, v28 -; GFX7-NEXT: v_cndmask_b32_e32 v16, v31, v16, vcc -; GFX7-NEXT: v_cmp_class_f32_e64 vcc, v12, 32 -; GFX7-NEXT: v_cndmask_b32_e32 v12, v16, v12, vcc -; GFX7-NEXT: v_cmp_class_f32_e64 vcc, v28, 32 -; GFX7-NEXT: v_cndmask_b32_e32 v12, v12, v28, vcc -; GFX7-NEXT: v_cmp_eq_f32_e32 vcc, 0, v16 -; GFX7-NEXT: v_cndmask_b32_e32 v12, v16, v12, vcc -; GFX7-NEXT: v_min_legacy_f32_e32 v16, v13, v29 -; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v13, v29 -; GFX7-NEXT: v_cndmask_b32_e32 v16, v31, v16, vcc -; GFX7-NEXT: v_cmp_class_f32_e64 vcc, v13, 32 -; GFX7-NEXT: v_cndmask_b32_e32 v13, v16, v13, vcc -; GFX7-NEXT: v_cmp_class_f32_e64 vcc, v29, 32 -; GFX7-NEXT: v_cndmask_b32_e32 v13, v13, v29, vcc -; GFX7-NEXT: v_cmp_eq_f32_e32 vcc, 0, v16 -; GFX7-NEXT: v_cndmask_b32_e32 v13, v16, 
v13, vcc -; GFX7-NEXT: v_min_legacy_f32_e32 v16, v14, v30 -; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v14, v30 -; GFX7-NEXT: v_cndmask_b32_e32 v16, v31, v16, vcc -; GFX7-NEXT: v_cmp_class_f32_e64 vcc, v14, 32 -; GFX7-NEXT: v_cndmask_b32_e32 v14, v16, v14, vcc -; GFX7-NEXT: v_cmp_class_f32_e64 vcc, v30, 32 -; GFX7-NEXT: v_cndmask_b32_e32 v14, v14, v30, vcc -; GFX7-NEXT: v_cmp_eq_f32_e32 vcc, 0, v16 -; GFX7-NEXT: v_cndmask_b32_e32 v14, v16, v14, vcc +; GFX7-NEXT: v_min_f32_e32 v1, v1, v17 +; GFX7-NEXT: v_cmp_o_f32_e64 s[4:5], v2, v18 +; GFX7-NEXT: v_min_f32_e32 v2, v2, v18 +; GFX7-NEXT: v_mov_b32_e32 v17, 0x7fc00000 +; GFX7-NEXT: v_min_f32_e32 v18, v13, v29 +; GFX7-NEXT: v_cmp_o_f32_e64 s[28:29], v13, v29 +; GFX7-NEXT: v_writelane_b32 v31, s31, 1 +; GFX7-NEXT: v_cmp_o_f32_e64 s[6:7], v3, v19 +; GFX7-NEXT: v_min_f32_e32 v3, v3, v19 +; GFX7-NEXT: v_cmp_o_f32_e64 s[8:9], v4, v20 +; GFX7-NEXT: v_min_f32_e32 v4, v4, v20 +; GFX7-NEXT: v_cmp_o_f32_e64 s[10:11], v5, v21 +; GFX7-NEXT: v_min_f32_e32 v5, v5, v21 +; GFX7-NEXT: v_cmp_o_f32_e64 s[12:13], v6, v22 +; GFX7-NEXT: v_min_f32_e32 v6, v6, v22 +; GFX7-NEXT: v_cmp_o_f32_e64 s[14:15], v7, v23 +; GFX7-NEXT: v_min_f32_e32 v7, v7, v23 +; GFX7-NEXT: v_cmp_o_f32_e64 s[18:19], v8, v24 +; GFX7-NEXT: v_min_f32_e32 v8, v8, v24 +; GFX7-NEXT: v_cmp_o_f32_e64 s[20:21], v9, v25 +; GFX7-NEXT: v_min_f32_e32 v9, v9, v25 +; GFX7-NEXT: v_cmp_o_f32_e64 s[22:23], v10, v26 +; GFX7-NEXT: v_min_f32_e32 v10, v10, v26 +; GFX7-NEXT: v_cmp_o_f32_e64 s[24:25], v11, v27 +; GFX7-NEXT: v_min_f32_e32 v11, v11, v27 +; GFX7-NEXT: v_cmp_o_f32_e64 s[26:27], v12, v28 +; GFX7-NEXT: v_min_f32_e32 v12, v12, v28 +; GFX7-NEXT: v_min_f32_e32 v19, v14, v30 +; GFX7-NEXT: v_cmp_o_f32_e64 s[30:31], v14, v30 +; GFX7-NEXT: v_cndmask_b32_e32 v1, v17, v1, vcc +; GFX7-NEXT: v_cndmask_b32_e64 v13, v17, v18, s[28:29] +; GFX7-NEXT: v_cndmask_b32_e64 v0, v17, v0, s[16:17] +; GFX7-NEXT: v_cndmask_b32_e64 v2, v17, v2, s[4:5] +; GFX7-NEXT: v_cndmask_b32_e64 v3, v17, v3, s[6:7] +; GFX7-NEXT: 
v_cndmask_b32_e64 v4, v17, v4, s[8:9] +; GFX7-NEXT: v_cndmask_b32_e64 v5, v17, v5, s[10:11] +; GFX7-NEXT: v_cndmask_b32_e64 v6, v17, v6, s[12:13] +; GFX7-NEXT: v_cndmask_b32_e64 v7, v17, v7, s[14:15] +; GFX7-NEXT: v_cndmask_b32_e64 v8, v17, v8, s[18:19] +; GFX7-NEXT: v_cndmask_b32_e64 v9, v17, v9, s[20:21] +; GFX7-NEXT: v_cndmask_b32_e64 v10, v17, v10, s[22:23] +; GFX7-NEXT: v_cndmask_b32_e64 v11, v17, v11, s[24:25] +; GFX7-NEXT: v_cndmask_b32_e64 v12, v17, v12, s[26:27] +; GFX7-NEXT: v_cndmask_b32_e64 v14, v17, v19, s[30:31] +; GFX7-NEXT: v_readlane_b32 s31, v31, 1 +; GFX7-NEXT: v_readlane_b32 s30, v31, 0 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_min_f32_e32 v18, v15, v16 +; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v15, v16 +; GFX7-NEXT: v_cndmask_b32_e32 v15, v17, v18, vcc +; GFX7-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; GFX7-NEXT: s_mov_b64 exec, s[4:5] ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_min_legacy_f32_e32 v16, v15, v17 -; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v15, v17 -; GFX7-NEXT: v_cndmask_b32_e32 v16, v31, v16, vcc -; GFX7-NEXT: v_cmp_class_f32_e64 vcc, v15, 32 -; GFX7-NEXT: v_cndmask_b32_e32 v15, v16, v15, vcc -; GFX7-NEXT: v_cmp_class_f32_e64 vcc, v17, 32 -; GFX7-NEXT: v_cndmask_b32_e32 v15, v15, v17, vcc -; GFX7-NEXT: v_cmp_eq_f32_e32 vcc, 0, v16 -; GFX7-NEXT: v_cndmask_b32_e32 v15, v16, v15, vcc ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_minimum_v16f32: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_cmp_lt_f32_e32 vcc, v0, v16 -; GFX8-NEXT: v_cndmask_b32_e32 v32, v16, v0, vcc -; GFX8-NEXT: v_mov_b32_e32 v31, 0x7fc00000 -; GFX8-NEXT: v_cmp_o_f32_e32 vcc, v0, v16 -; GFX8-NEXT: v_cndmask_b32_e32 v32, v31, v32, vcc -; GFX8-NEXT: v_cmp_class_f32_e64 vcc, v0, 32 -; GFX8-NEXT: v_cndmask_b32_e32 v0, v32, v0, vcc -; GFX8-NEXT: v_cmp_class_f32_e64 vcc, v16, 32 -; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v16, vcc -; GFX8-NEXT: 
v_cmp_eq_f32_e32 vcc, 0, v32 -; GFX8-NEXT: v_cndmask_b32_e32 v0, v32, v0, vcc -; GFX8-NEXT: v_cmp_lt_f32_e32 vcc, v1, v17 -; GFX8-NEXT: v_cndmask_b32_e32 v16, v17, v1, vcc +; GFX8-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GFX8-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; GFX8-NEXT: s_mov_b64 exec, s[4:5] +; GFX8-NEXT: v_cmp_o_f32_e64 s[16:17], v0, v16 +; GFX8-NEXT: v_min_f32_e32 v0, v0, v16 +; GFX8-NEXT: buffer_load_dword v16, off, s[0:3], s32 +; GFX8-NEXT: v_writelane_b32 v31, s30, 0 ; GFX8-NEXT: v_cmp_o_f32_e32 vcc, v1, v17 -; GFX8-NEXT: v_cndmask_b32_e32 v16, v31, v16, vcc -; GFX8-NEXT: v_cmp_class_f32_e64 vcc, v1, 32 -; GFX8-NEXT: v_cndmask_b32_e32 v1, v16, v1, vcc -; GFX8-NEXT: v_cmp_class_f32_e64 vcc, v17, 32 -; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v17, vcc -; GFX8-NEXT: buffer_load_dword v17, off, s[0:3], s32 -; GFX8-NEXT: v_cmp_eq_f32_e32 vcc, 0, v16 -; GFX8-NEXT: v_cndmask_b32_e32 v1, v16, v1, vcc -; GFX8-NEXT: v_cmp_lt_f32_e32 vcc, v2, v18 -; GFX8-NEXT: v_cndmask_b32_e32 v16, v18, v2, vcc -; GFX8-NEXT: v_cmp_o_f32_e32 vcc, v2, v18 -; GFX8-NEXT: v_cndmask_b32_e32 v16, v31, v16, vcc -; GFX8-NEXT: v_cmp_class_f32_e64 vcc, v2, 32 -; GFX8-NEXT: v_cndmask_b32_e32 v2, v16, v2, vcc -; GFX8-NEXT: v_cmp_class_f32_e64 vcc, v18, 32 -; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v18, vcc -; GFX8-NEXT: v_cmp_eq_f32_e32 vcc, 0, v16 -; GFX8-NEXT: v_cndmask_b32_e32 v2, v16, v2, vcc -; GFX8-NEXT: v_cmp_lt_f32_e32 vcc, v3, v19 -; GFX8-NEXT: v_cndmask_b32_e32 v16, v19, v3, vcc -; GFX8-NEXT: v_cmp_o_f32_e32 vcc, v3, v19 -; GFX8-NEXT: v_cndmask_b32_e32 v16, v31, v16, vcc -; GFX8-NEXT: v_cmp_class_f32_e64 vcc, v3, 32 -; GFX8-NEXT: v_cndmask_b32_e32 v3, v16, v3, vcc -; GFX8-NEXT: v_cmp_class_f32_e64 vcc, v19, 32 -; GFX8-NEXT: v_cndmask_b32_e32 v3, v3, v19, vcc -; GFX8-NEXT: v_cmp_eq_f32_e32 vcc, 0, v16 -; GFX8-NEXT: v_cndmask_b32_e32 v3, v16, v3, vcc -; GFX8-NEXT: v_cmp_lt_f32_e32 vcc, v4, v20 -; GFX8-NEXT: v_cndmask_b32_e32 v16, v20, v4, vcc -; 
GFX8-NEXT: v_cmp_o_f32_e32 vcc, v4, v20 -; GFX8-NEXT: v_cndmask_b32_e32 v16, v31, v16, vcc -; GFX8-NEXT: v_cmp_class_f32_e64 vcc, v4, 32 -; GFX8-NEXT: v_cndmask_b32_e32 v4, v16, v4, vcc -; GFX8-NEXT: v_cmp_class_f32_e64 vcc, v20, 32 -; GFX8-NEXT: v_cndmask_b32_e32 v4, v4, v20, vcc -; GFX8-NEXT: v_cmp_eq_f32_e32 vcc, 0, v16 -; GFX8-NEXT: v_cndmask_b32_e32 v4, v16, v4, vcc -; GFX8-NEXT: v_cmp_lt_f32_e32 vcc, v5, v21 -; GFX8-NEXT: v_cndmask_b32_e32 v16, v21, v5, vcc -; GFX8-NEXT: v_cmp_o_f32_e32 vcc, v5, v21 -; GFX8-NEXT: v_cndmask_b32_e32 v16, v31, v16, vcc -; GFX8-NEXT: v_cmp_class_f32_e64 vcc, v5, 32 -; GFX8-NEXT: v_cndmask_b32_e32 v5, v16, v5, vcc -; GFX8-NEXT: v_cmp_class_f32_e64 vcc, v21, 32 -; GFX8-NEXT: v_cndmask_b32_e32 v5, v5, v21, vcc -; GFX8-NEXT: v_cmp_eq_f32_e32 vcc, 0, v16 -; GFX8-NEXT: v_cndmask_b32_e32 v5, v16, v5, vcc -; GFX8-NEXT: v_cmp_lt_f32_e32 vcc, v6, v22 -; GFX8-NEXT: v_cndmask_b32_e32 v16, v22, v6, vcc -; GFX8-NEXT: v_cmp_o_f32_e32 vcc, v6, v22 -; GFX8-NEXT: v_cndmask_b32_e32 v16, v31, v16, vcc -; GFX8-NEXT: v_cmp_class_f32_e64 vcc, v6, 32 -; GFX8-NEXT: v_cndmask_b32_e32 v6, v16, v6, vcc -; GFX8-NEXT: v_cmp_class_f32_e64 vcc, v22, 32 -; GFX8-NEXT: v_cndmask_b32_e32 v6, v6, v22, vcc -; GFX8-NEXT: v_cmp_eq_f32_e32 vcc, 0, v16 -; GFX8-NEXT: v_cndmask_b32_e32 v6, v16, v6, vcc -; GFX8-NEXT: v_cmp_lt_f32_e32 vcc, v7, v23 -; GFX8-NEXT: v_cndmask_b32_e32 v16, v23, v7, vcc -; GFX8-NEXT: v_cmp_o_f32_e32 vcc, v7, v23 -; GFX8-NEXT: v_cndmask_b32_e32 v16, v31, v16, vcc -; GFX8-NEXT: v_cmp_class_f32_e64 vcc, v7, 32 -; GFX8-NEXT: v_cndmask_b32_e32 v7, v16, v7, vcc -; GFX8-NEXT: v_cmp_class_f32_e64 vcc, v23, 32 -; GFX8-NEXT: v_cndmask_b32_e32 v7, v7, v23, vcc -; GFX8-NEXT: v_cmp_eq_f32_e32 vcc, 0, v16 -; GFX8-NEXT: v_cndmask_b32_e32 v7, v16, v7, vcc -; GFX8-NEXT: v_cmp_lt_f32_e32 vcc, v8, v24 -; GFX8-NEXT: v_cndmask_b32_e32 v16, v24, v8, vcc -; GFX8-NEXT: v_cmp_o_f32_e32 vcc, v8, v24 -; GFX8-NEXT: v_cndmask_b32_e32 v16, v31, v16, vcc -; GFX8-NEXT: 
v_cmp_class_f32_e64 vcc, v8, 32 -; GFX8-NEXT: v_cndmask_b32_e32 v8, v16, v8, vcc -; GFX8-NEXT: v_cmp_class_f32_e64 vcc, v24, 32 -; GFX8-NEXT: v_cndmask_b32_e32 v8, v8, v24, vcc -; GFX8-NEXT: v_cmp_eq_f32_e32 vcc, 0, v16 -; GFX8-NEXT: v_cndmask_b32_e32 v8, v16, v8, vcc -; GFX8-NEXT: v_cmp_lt_f32_e32 vcc, v9, v25 -; GFX8-NEXT: v_cndmask_b32_e32 v16, v25, v9, vcc -; GFX8-NEXT: v_cmp_o_f32_e32 vcc, v9, v25 -; GFX8-NEXT: v_cndmask_b32_e32 v16, v31, v16, vcc -; GFX8-NEXT: v_cmp_class_f32_e64 vcc, v9, 32 -; GFX8-NEXT: v_cndmask_b32_e32 v9, v16, v9, vcc -; GFX8-NEXT: v_cmp_class_f32_e64 vcc, v25, 32 -; GFX8-NEXT: v_cndmask_b32_e32 v9, v9, v25, vcc -; GFX8-NEXT: v_cmp_eq_f32_e32 vcc, 0, v16 -; GFX8-NEXT: v_cndmask_b32_e32 v9, v16, v9, vcc -; GFX8-NEXT: v_cmp_lt_f32_e32 vcc, v10, v26 -; GFX8-NEXT: v_cndmask_b32_e32 v16, v26, v10, vcc -; GFX8-NEXT: v_cmp_o_f32_e32 vcc, v10, v26 -; GFX8-NEXT: v_cndmask_b32_e32 v16, v31, v16, vcc -; GFX8-NEXT: v_cmp_class_f32_e64 vcc, v10, 32 -; GFX8-NEXT: v_cndmask_b32_e32 v10, v16, v10, vcc -; GFX8-NEXT: v_cmp_class_f32_e64 vcc, v26, 32 -; GFX8-NEXT: v_cndmask_b32_e32 v10, v10, v26, vcc -; GFX8-NEXT: v_cmp_eq_f32_e32 vcc, 0, v16 -; GFX8-NEXT: v_cndmask_b32_e32 v10, v16, v10, vcc -; GFX8-NEXT: v_cmp_lt_f32_e32 vcc, v11, v27 -; GFX8-NEXT: v_cndmask_b32_e32 v16, v27, v11, vcc -; GFX8-NEXT: v_cmp_o_f32_e32 vcc, v11, v27 -; GFX8-NEXT: v_cndmask_b32_e32 v16, v31, v16, vcc -; GFX8-NEXT: v_cmp_class_f32_e64 vcc, v11, 32 -; GFX8-NEXT: v_cndmask_b32_e32 v11, v16, v11, vcc -; GFX8-NEXT: v_cmp_class_f32_e64 vcc, v27, 32 -; GFX8-NEXT: v_cndmask_b32_e32 v11, v11, v27, vcc -; GFX8-NEXT: v_cmp_eq_f32_e32 vcc, 0, v16 -; GFX8-NEXT: v_cndmask_b32_e32 v11, v16, v11, vcc -; GFX8-NEXT: v_cmp_lt_f32_e32 vcc, v12, v28 -; GFX8-NEXT: v_cndmask_b32_e32 v16, v28, v12, vcc -; GFX8-NEXT: v_cmp_o_f32_e32 vcc, v12, v28 -; GFX8-NEXT: v_cndmask_b32_e32 v16, v31, v16, vcc -; GFX8-NEXT: v_cmp_class_f32_e64 vcc, v12, 32 -; GFX8-NEXT: v_cndmask_b32_e32 v12, v16, v12, vcc -; 
GFX8-NEXT: v_cmp_class_f32_e64 vcc, v28, 32 -; GFX8-NEXT: v_cndmask_b32_e32 v12, v12, v28, vcc -; GFX8-NEXT: v_cmp_eq_f32_e32 vcc, 0, v16 -; GFX8-NEXT: v_cndmask_b32_e32 v12, v16, v12, vcc -; GFX8-NEXT: v_cmp_lt_f32_e32 vcc, v13, v29 -; GFX8-NEXT: v_cndmask_b32_e32 v16, v29, v13, vcc -; GFX8-NEXT: v_cmp_o_f32_e32 vcc, v13, v29 -; GFX8-NEXT: v_cndmask_b32_e32 v16, v31, v16, vcc -; GFX8-NEXT: v_cmp_class_f32_e64 vcc, v13, 32 -; GFX8-NEXT: v_cndmask_b32_e32 v13, v16, v13, vcc -; GFX8-NEXT: v_cmp_class_f32_e64 vcc, v29, 32 -; GFX8-NEXT: v_cndmask_b32_e32 v13, v13, v29, vcc -; GFX8-NEXT: v_cmp_eq_f32_e32 vcc, 0, v16 -; GFX8-NEXT: v_cndmask_b32_e32 v13, v16, v13, vcc -; GFX8-NEXT: v_cmp_lt_f32_e32 vcc, v14, v30 -; GFX8-NEXT: v_cndmask_b32_e32 v16, v30, v14, vcc -; GFX8-NEXT: v_cmp_o_f32_e32 vcc, v14, v30 -; GFX8-NEXT: v_cndmask_b32_e32 v16, v31, v16, vcc -; GFX8-NEXT: v_cmp_class_f32_e64 vcc, v14, 32 -; GFX8-NEXT: v_cndmask_b32_e32 v14, v16, v14, vcc -; GFX8-NEXT: v_cmp_class_f32_e64 vcc, v30, 32 -; GFX8-NEXT: v_cndmask_b32_e32 v14, v14, v30, vcc -; GFX8-NEXT: v_cmp_eq_f32_e32 vcc, 0, v16 -; GFX8-NEXT: v_cndmask_b32_e32 v14, v16, v14, vcc +; GFX8-NEXT: v_min_f32_e32 v1, v1, v17 +; GFX8-NEXT: v_cmp_o_f32_e64 s[4:5], v2, v18 +; GFX8-NEXT: v_min_f32_e32 v2, v2, v18 +; GFX8-NEXT: v_mov_b32_e32 v17, 0x7fc00000 +; GFX8-NEXT: v_min_f32_e32 v18, v13, v29 +; GFX8-NEXT: v_cmp_o_f32_e64 s[28:29], v13, v29 +; GFX8-NEXT: v_writelane_b32 v31, s31, 1 +; GFX8-NEXT: v_cmp_o_f32_e64 s[6:7], v3, v19 +; GFX8-NEXT: v_min_f32_e32 v3, v3, v19 +; GFX8-NEXT: v_cmp_o_f32_e64 s[8:9], v4, v20 +; GFX8-NEXT: v_min_f32_e32 v4, v4, v20 +; GFX8-NEXT: v_cmp_o_f32_e64 s[10:11], v5, v21 +; GFX8-NEXT: v_min_f32_e32 v5, v5, v21 +; GFX8-NEXT: v_cmp_o_f32_e64 s[12:13], v6, v22 +; GFX8-NEXT: v_min_f32_e32 v6, v6, v22 +; GFX8-NEXT: v_cmp_o_f32_e64 s[14:15], v7, v23 +; GFX8-NEXT: v_min_f32_e32 v7, v7, v23 +; GFX8-NEXT: v_cmp_o_f32_e64 s[18:19], v8, v24 +; GFX8-NEXT: v_min_f32_e32 v8, v8, v24 +; GFX8-NEXT: 
v_cmp_o_f32_e64 s[20:21], v9, v25 +; GFX8-NEXT: v_min_f32_e32 v9, v9, v25 +; GFX8-NEXT: v_cmp_o_f32_e64 s[22:23], v10, v26 +; GFX8-NEXT: v_min_f32_e32 v10, v10, v26 +; GFX8-NEXT: v_cmp_o_f32_e64 s[24:25], v11, v27 +; GFX8-NEXT: v_min_f32_e32 v11, v11, v27 +; GFX8-NEXT: v_cmp_o_f32_e64 s[26:27], v12, v28 +; GFX8-NEXT: v_min_f32_e32 v12, v12, v28 +; GFX8-NEXT: v_min_f32_e32 v19, v14, v30 +; GFX8-NEXT: v_cmp_o_f32_e64 s[30:31], v14, v30 +; GFX8-NEXT: v_cndmask_b32_e32 v1, v17, v1, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v13, v17, v18, s[28:29] +; GFX8-NEXT: v_cndmask_b32_e64 v0, v17, v0, s[16:17] +; GFX8-NEXT: v_cndmask_b32_e64 v2, v17, v2, s[4:5] +; GFX8-NEXT: v_cndmask_b32_e64 v3, v17, v3, s[6:7] +; GFX8-NEXT: v_cndmask_b32_e64 v4, v17, v4, s[8:9] +; GFX8-NEXT: v_cndmask_b32_e64 v5, v17, v5, s[10:11] +; GFX8-NEXT: v_cndmask_b32_e64 v6, v17, v6, s[12:13] +; GFX8-NEXT: v_cndmask_b32_e64 v7, v17, v7, s[14:15] +; GFX8-NEXT: v_cndmask_b32_e64 v8, v17, v8, s[18:19] +; GFX8-NEXT: v_cndmask_b32_e64 v9, v17, v9, s[20:21] +; GFX8-NEXT: v_cndmask_b32_e64 v10, v17, v10, s[22:23] +; GFX8-NEXT: v_cndmask_b32_e64 v11, v17, v11, s[24:25] +; GFX8-NEXT: v_cndmask_b32_e64 v12, v17, v12, s[26:27] +; GFX8-NEXT: v_cndmask_b32_e64 v14, v17, v19, s[30:31] +; GFX8-NEXT: v_readlane_b32 s31, v31, 1 +; GFX8-NEXT: v_readlane_b32 s30, v31, 0 +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_min_f32_e32 v18, v15, v16 +; GFX8-NEXT: v_cmp_o_f32_e32 vcc, v15, v16 +; GFX8-NEXT: v_cndmask_b32_e32 v15, v17, v18, vcc +; GFX8-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GFX8-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; GFX8-NEXT: s_mov_b64 exec, s[4:5] ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_cmp_lt_f32_e32 vcc, v15, v17 -; GFX8-NEXT: v_cndmask_b32_e32 v16, v17, v15, vcc -; GFX8-NEXT: v_cmp_o_f32_e32 vcc, v15, v17 -; GFX8-NEXT: v_cndmask_b32_e32 v16, v31, v16, vcc -; GFX8-NEXT: v_cmp_class_f32_e64 vcc, v15, 32 -; GFX8-NEXT: v_cndmask_b32_e32 v15, v16, v15, vcc -; 
GFX8-NEXT: v_cmp_class_f32_e64 vcc, v17, 32 -; GFX8-NEXT: v_cndmask_b32_e32 v15, v15, v17, vcc -; GFX8-NEXT: v_cmp_eq_f32_e32 vcc, 0, v16 -; GFX8-NEXT: v_cndmask_b32_e32 v15, v16, v15, vcc ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_minimum_v16f32: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_cmp_lt_f32_e32 vcc, v0, v16 -; GFX9-NEXT: v_cndmask_b32_e32 v32, v16, v0, vcc -; GFX9-NEXT: v_mov_b32_e32 v31, 0x7fc00000 -; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v0, v16 -; GFX9-NEXT: v_cndmask_b32_e32 v32, v31, v32, vcc -; GFX9-NEXT: v_cmp_class_f32_e64 vcc, v0, 32 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v32, v0, vcc -; GFX9-NEXT: v_cmp_class_f32_e64 vcc, v16, 32 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v16, vcc -; GFX9-NEXT: v_cmp_eq_f32_e32 vcc, 0, v32 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v32, v0, vcc -; GFX9-NEXT: v_cmp_lt_f32_e32 vcc, v1, v17 -; GFX9-NEXT: v_cndmask_b32_e32 v16, v17, v1, vcc +; GFX9-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GFX9-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; GFX9-NEXT: s_mov_b64 exec, s[4:5] +; GFX9-NEXT: v_cmp_o_f32_e64 s[16:17], v0, v16 +; GFX9-NEXT: v_min_f32_e32 v0, v0, v16 +; GFX9-NEXT: buffer_load_dword v16, off, s[0:3], s32 +; GFX9-NEXT: v_writelane_b32 v31, s30, 0 ; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v1, v17 -; GFX9-NEXT: v_cndmask_b32_e32 v16, v31, v16, vcc -; GFX9-NEXT: v_cmp_class_f32_e64 vcc, v1, 32 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v16, v1, vcc -; GFX9-NEXT: v_cmp_class_f32_e64 vcc, v17, 32 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v17, vcc -; GFX9-NEXT: buffer_load_dword v17, off, s[0:3], s32 -; GFX9-NEXT: v_cmp_eq_f32_e32 vcc, 0, v16 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v16, v1, vcc -; GFX9-NEXT: v_cmp_lt_f32_e32 vcc, v2, v18 -; GFX9-NEXT: v_cndmask_b32_e32 v16, v18, v2, vcc -; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v2, v18 -; GFX9-NEXT: v_cndmask_b32_e32 v16, v31, v16, vcc -; GFX9-NEXT: v_cmp_class_f32_e64 vcc, v2, 32 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v16, v2, vcc 
-; GFX9-NEXT: v_cmp_class_f32_e64 vcc, v18, 32 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v18, vcc -; GFX9-NEXT: v_cmp_eq_f32_e32 vcc, 0, v16 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v16, v2, vcc -; GFX9-NEXT: v_cmp_lt_f32_e32 vcc, v3, v19 -; GFX9-NEXT: v_cndmask_b32_e32 v16, v19, v3, vcc -; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v3, v19 -; GFX9-NEXT: v_cndmask_b32_e32 v16, v31, v16, vcc -; GFX9-NEXT: v_cmp_class_f32_e64 vcc, v3, 32 -; GFX9-NEXT: v_cndmask_b32_e32 v3, v16, v3, vcc -; GFX9-NEXT: v_cmp_class_f32_e64 vcc, v19, 32 -; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v19, vcc -; GFX9-NEXT: v_cmp_eq_f32_e32 vcc, 0, v16 -; GFX9-NEXT: v_cndmask_b32_e32 v3, v16, v3, vcc -; GFX9-NEXT: v_cmp_lt_f32_e32 vcc, v4, v20 -; GFX9-NEXT: v_cndmask_b32_e32 v16, v20, v4, vcc -; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v4, v20 -; GFX9-NEXT: v_cndmask_b32_e32 v16, v31, v16, vcc -; GFX9-NEXT: v_cmp_class_f32_e64 vcc, v4, 32 -; GFX9-NEXT: v_cndmask_b32_e32 v4, v16, v4, vcc -; GFX9-NEXT: v_cmp_class_f32_e64 vcc, v20, 32 -; GFX9-NEXT: v_cndmask_b32_e32 v4, v4, v20, vcc -; GFX9-NEXT: v_cmp_eq_f32_e32 vcc, 0, v16 -; GFX9-NEXT: v_cndmask_b32_e32 v4, v16, v4, vcc -; GFX9-NEXT: v_cmp_lt_f32_e32 vcc, v5, v21 -; GFX9-NEXT: v_cndmask_b32_e32 v16, v21, v5, vcc -; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v5, v21 -; GFX9-NEXT: v_cndmask_b32_e32 v16, v31, v16, vcc -; GFX9-NEXT: v_cmp_class_f32_e64 vcc, v5, 32 -; GFX9-NEXT: v_cndmask_b32_e32 v5, v16, v5, vcc -; GFX9-NEXT: v_cmp_class_f32_e64 vcc, v21, 32 -; GFX9-NEXT: v_cndmask_b32_e32 v5, v5, v21, vcc -; GFX9-NEXT: v_cmp_eq_f32_e32 vcc, 0, v16 -; GFX9-NEXT: v_cndmask_b32_e32 v5, v16, v5, vcc -; GFX9-NEXT: v_cmp_lt_f32_e32 vcc, v6, v22 -; GFX9-NEXT: v_cndmask_b32_e32 v16, v22, v6, vcc -; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v6, v22 -; GFX9-NEXT: v_cndmask_b32_e32 v16, v31, v16, vcc -; GFX9-NEXT: v_cmp_class_f32_e64 vcc, v6, 32 -; GFX9-NEXT: v_cndmask_b32_e32 v6, v16, v6, vcc -; GFX9-NEXT: v_cmp_class_f32_e64 vcc, v22, 32 -; GFX9-NEXT: v_cndmask_b32_e32 v6, v6, v22, vcc -; GFX9-NEXT: 
v_cmp_eq_f32_e32 vcc, 0, v16 -; GFX9-NEXT: v_cndmask_b32_e32 v6, v16, v6, vcc -; GFX9-NEXT: v_cmp_lt_f32_e32 vcc, v7, v23 -; GFX9-NEXT: v_cndmask_b32_e32 v16, v23, v7, vcc -; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v7, v23 -; GFX9-NEXT: v_cndmask_b32_e32 v16, v31, v16, vcc -; GFX9-NEXT: v_cmp_class_f32_e64 vcc, v7, 32 -; GFX9-NEXT: v_cndmask_b32_e32 v7, v16, v7, vcc -; GFX9-NEXT: v_cmp_class_f32_e64 vcc, v23, 32 -; GFX9-NEXT: v_cndmask_b32_e32 v7, v7, v23, vcc -; GFX9-NEXT: v_cmp_eq_f32_e32 vcc, 0, v16 -; GFX9-NEXT: v_cndmask_b32_e32 v7, v16, v7, vcc -; GFX9-NEXT: v_cmp_lt_f32_e32 vcc, v8, v24 -; GFX9-NEXT: v_cndmask_b32_e32 v16, v24, v8, vcc -; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v8, v24 -; GFX9-NEXT: v_cndmask_b32_e32 v16, v31, v16, vcc -; GFX9-NEXT: v_cmp_class_f32_e64 vcc, v8, 32 -; GFX9-NEXT: v_cndmask_b32_e32 v8, v16, v8, vcc -; GFX9-NEXT: v_cmp_class_f32_e64 vcc, v24, 32 -; GFX9-NEXT: v_cndmask_b32_e32 v8, v8, v24, vcc -; GFX9-NEXT: v_cmp_eq_f32_e32 vcc, 0, v16 -; GFX9-NEXT: v_cndmask_b32_e32 v8, v16, v8, vcc -; GFX9-NEXT: v_cmp_lt_f32_e32 vcc, v9, v25 -; GFX9-NEXT: v_cndmask_b32_e32 v16, v25, v9, vcc -; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v9, v25 -; GFX9-NEXT: v_cndmask_b32_e32 v16, v31, v16, vcc -; GFX9-NEXT: v_cmp_class_f32_e64 vcc, v9, 32 -; GFX9-NEXT: v_cndmask_b32_e32 v9, v16, v9, vcc -; GFX9-NEXT: v_cmp_class_f32_e64 vcc, v25, 32 -; GFX9-NEXT: v_cndmask_b32_e32 v9, v9, v25, vcc -; GFX9-NEXT: v_cmp_eq_f32_e32 vcc, 0, v16 -; GFX9-NEXT: v_cndmask_b32_e32 v9, v16, v9, vcc -; GFX9-NEXT: v_cmp_lt_f32_e32 vcc, v10, v26 -; GFX9-NEXT: v_cndmask_b32_e32 v16, v26, v10, vcc -; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v10, v26 -; GFX9-NEXT: v_cndmask_b32_e32 v16, v31, v16, vcc -; GFX9-NEXT: v_cmp_class_f32_e64 vcc, v10, 32 -; GFX9-NEXT: v_cndmask_b32_e32 v10, v16, v10, vcc -; GFX9-NEXT: v_cmp_class_f32_e64 vcc, v26, 32 -; GFX9-NEXT: v_cndmask_b32_e32 v10, v10, v26, vcc -; GFX9-NEXT: v_cmp_eq_f32_e32 vcc, 0, v16 -; GFX9-NEXT: v_cndmask_b32_e32 v10, v16, v10, vcc -; GFX9-NEXT: 
v_cmp_lt_f32_e32 vcc, v11, v27 -; GFX9-NEXT: v_cndmask_b32_e32 v16, v27, v11, vcc -; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v11, v27 -; GFX9-NEXT: v_cndmask_b32_e32 v16, v31, v16, vcc -; GFX9-NEXT: v_cmp_class_f32_e64 vcc, v11, 32 -; GFX9-NEXT: v_cndmask_b32_e32 v11, v16, v11, vcc -; GFX9-NEXT: v_cmp_class_f32_e64 vcc, v27, 32 -; GFX9-NEXT: v_cndmask_b32_e32 v11, v11, v27, vcc -; GFX9-NEXT: v_cmp_eq_f32_e32 vcc, 0, v16 -; GFX9-NEXT: v_cndmask_b32_e32 v11, v16, v11, vcc -; GFX9-NEXT: v_cmp_lt_f32_e32 vcc, v12, v28 -; GFX9-NEXT: v_cndmask_b32_e32 v16, v28, v12, vcc -; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v12, v28 -; GFX9-NEXT: v_cndmask_b32_e32 v16, v31, v16, vcc -; GFX9-NEXT: v_cmp_class_f32_e64 vcc, v12, 32 -; GFX9-NEXT: v_cndmask_b32_e32 v12, v16, v12, vcc -; GFX9-NEXT: v_cmp_class_f32_e64 vcc, v28, 32 -; GFX9-NEXT: v_cndmask_b32_e32 v12, v12, v28, vcc -; GFX9-NEXT: v_cmp_eq_f32_e32 vcc, 0, v16 -; GFX9-NEXT: v_cndmask_b32_e32 v12, v16, v12, vcc -; GFX9-NEXT: v_cmp_lt_f32_e32 vcc, v13, v29 -; GFX9-NEXT: v_cndmask_b32_e32 v16, v29, v13, vcc -; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v13, v29 -; GFX9-NEXT: v_cndmask_b32_e32 v16, v31, v16, vcc -; GFX9-NEXT: v_cmp_class_f32_e64 vcc, v13, 32 -; GFX9-NEXT: v_cndmask_b32_e32 v13, v16, v13, vcc -; GFX9-NEXT: v_cmp_class_f32_e64 vcc, v29, 32 -; GFX9-NEXT: v_cndmask_b32_e32 v13, v13, v29, vcc -; GFX9-NEXT: v_cmp_eq_f32_e32 vcc, 0, v16 -; GFX9-NEXT: v_cndmask_b32_e32 v13, v16, v13, vcc -; GFX9-NEXT: v_cmp_lt_f32_e32 vcc, v14, v30 -; GFX9-NEXT: v_cndmask_b32_e32 v16, v30, v14, vcc -; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v14, v30 -; GFX9-NEXT: v_cndmask_b32_e32 v16, v31, v16, vcc -; GFX9-NEXT: v_cmp_class_f32_e64 vcc, v14, 32 -; GFX9-NEXT: v_cndmask_b32_e32 v14, v16, v14, vcc -; GFX9-NEXT: v_cmp_class_f32_e64 vcc, v30, 32 -; GFX9-NEXT: v_cndmask_b32_e32 v14, v14, v30, vcc -; GFX9-NEXT: v_cmp_eq_f32_e32 vcc, 0, v16 -; GFX9-NEXT: v_cndmask_b32_e32 v14, v16, v14, vcc +; GFX9-NEXT: v_min_f32_e32 v1, v1, v17 +; GFX9-NEXT: v_cmp_o_f32_e64 s[4:5], v2, v18 +; 
GFX9-NEXT: v_min_f32_e32 v2, v2, v18 +; GFX9-NEXT: v_mov_b32_e32 v17, 0x7fc00000 +; GFX9-NEXT: v_min_f32_e32 v18, v13, v29 +; GFX9-NEXT: v_cmp_o_f32_e64 s[28:29], v13, v29 +; GFX9-NEXT: v_writelane_b32 v31, s31, 1 +; GFX9-NEXT: v_cmp_o_f32_e64 s[6:7], v3, v19 +; GFX9-NEXT: v_min_f32_e32 v3, v3, v19 +; GFX9-NEXT: v_cmp_o_f32_e64 s[8:9], v4, v20 +; GFX9-NEXT: v_min_f32_e32 v4, v4, v20 +; GFX9-NEXT: v_cmp_o_f32_e64 s[10:11], v5, v21 +; GFX9-NEXT: v_min_f32_e32 v5, v5, v21 +; GFX9-NEXT: v_cmp_o_f32_e64 s[12:13], v6, v22 +; GFX9-NEXT: v_min_f32_e32 v6, v6, v22 +; GFX9-NEXT: v_cmp_o_f32_e64 s[14:15], v7, v23 +; GFX9-NEXT: v_min_f32_e32 v7, v7, v23 +; GFX9-NEXT: v_cmp_o_f32_e64 s[18:19], v8, v24 +; GFX9-NEXT: v_min_f32_e32 v8, v8, v24 +; GFX9-NEXT: v_cmp_o_f32_e64 s[20:21], v9, v25 +; GFX9-NEXT: v_min_f32_e32 v9, v9, v25 +; GFX9-NEXT: v_cmp_o_f32_e64 s[22:23], v10, v26 +; GFX9-NEXT: v_min_f32_e32 v10, v10, v26 +; GFX9-NEXT: v_cmp_o_f32_e64 s[24:25], v11, v27 +; GFX9-NEXT: v_min_f32_e32 v11, v11, v27 +; GFX9-NEXT: v_cmp_o_f32_e64 s[26:27], v12, v28 +; GFX9-NEXT: v_min_f32_e32 v12, v12, v28 +; GFX9-NEXT: v_min_f32_e32 v19, v14, v30 +; GFX9-NEXT: v_cmp_o_f32_e64 s[30:31], v14, v30 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v17, v1, vcc +; GFX9-NEXT: v_cndmask_b32_e64 v13, v17, v18, s[28:29] +; GFX9-NEXT: v_cndmask_b32_e64 v0, v17, v0, s[16:17] +; GFX9-NEXT: v_cndmask_b32_e64 v2, v17, v2, s[4:5] +; GFX9-NEXT: v_cndmask_b32_e64 v3, v17, v3, s[6:7] +; GFX9-NEXT: v_cndmask_b32_e64 v4, v17, v4, s[8:9] +; GFX9-NEXT: v_cndmask_b32_e64 v5, v17, v5, s[10:11] +; GFX9-NEXT: v_cndmask_b32_e64 v6, v17, v6, s[12:13] +; GFX9-NEXT: v_cndmask_b32_e64 v7, v17, v7, s[14:15] +; GFX9-NEXT: v_cndmask_b32_e64 v8, v17, v8, s[18:19] +; GFX9-NEXT: v_cndmask_b32_e64 v9, v17, v9, s[20:21] +; GFX9-NEXT: v_cndmask_b32_e64 v10, v17, v10, s[22:23] +; GFX9-NEXT: v_cndmask_b32_e64 v11, v17, v11, s[24:25] +; GFX9-NEXT: v_cndmask_b32_e64 v12, v17, v12, s[26:27] +; GFX9-NEXT: v_cndmask_b32_e64 v14, v17, v19, s[30:31] 
+; GFX9-NEXT: v_readlane_b32 s31, v31, 1 +; GFX9-NEXT: v_readlane_b32 s30, v31, 0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_min_f32_e32 v18, v15, v16 +; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v15, v16 +; GFX9-NEXT: v_cndmask_b32_e32 v15, v17, v18, vcc +; GFX9-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; GFX9-NEXT: s_mov_b64 exec, s[4:5] ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_cmp_lt_f32_e32 vcc, v15, v17 -; GFX9-NEXT: v_cndmask_b32_e32 v16, v17, v15, vcc -; GFX9-NEXT: v_cmp_o_f32_e32 vcc, v15, v17 -; GFX9-NEXT: v_cndmask_b32_e32 v16, v31, v16, vcc -; GFX9-NEXT: v_cmp_class_f32_e64 vcc, v15, 32 -; GFX9-NEXT: v_cndmask_b32_e32 v15, v16, v15, vcc -; GFX9-NEXT: v_cmp_class_f32_e64 vcc, v17, 32 -; GFX9-NEXT: v_cndmask_b32_e32 v15, v15, v17, vcc -; GFX9-NEXT: v_cmp_eq_f32_e32 vcc, 0, v16 -; GFX9-NEXT: v_cndmask_b32_e32 v15, v16, v15, vcc ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: v_minimum_v16f32: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: scratch_load_dword v31, off, s32 -; GFX940-NEXT: v_cmp_lt_f32_e32 vcc, v0, v16 ; GFX940-NEXT: v_mov_b32_e32 v32, 0x7fc00000 -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e32 v33, v16, v0, vcc +; GFX940-NEXT: v_min_f32_e32 v33, v0, v16 ; GFX940-NEXT: v_cmp_o_f32_e32 vcc, v0, v16 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v33, v32, v33, vcc -; GFX940-NEXT: v_cmp_class_f32_e64 vcc, v0, 32 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v0, v33, v0, vcc -; GFX940-NEXT: v_cmp_class_f32_e64 vcc, v16, 32 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v0, v0, v16, vcc -; GFX940-NEXT: v_cmp_eq_f32_e32 vcc, 0, v33 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v0, v33, v0, vcc -; GFX940-NEXT: v_cmp_lt_f32_e32 vcc, v1, v17 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v16, v17, v1, vcc +; GFX940-NEXT: v_min_f32_e32 v34, v1, v17 +; 
GFX940-NEXT: v_min_f32_e32 v35, v2, v18 +; GFX940-NEXT: v_cndmask_b32_e32 v0, v32, v33, vcc ; GFX940-NEXT: v_cmp_o_f32_e32 vcc, v1, v17 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v16, v32, v16, vcc -; GFX940-NEXT: v_cmp_class_f32_e64 vcc, v1, 32 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v1, v16, v1, vcc -; GFX940-NEXT: v_cmp_class_f32_e64 vcc, v17, 32 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v1, v1, v17, vcc -; GFX940-NEXT: v_cmp_eq_f32_e32 vcc, 0, v16 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v1, v16, v1, vcc -; GFX940-NEXT: v_cmp_lt_f32_e32 vcc, v2, v18 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v16, v18, v2, vcc +; GFX940-NEXT: v_min_f32_e32 v36, v3, v19 +; GFX940-NEXT: v_min_f32_e32 v37, v4, v20 +; GFX940-NEXT: v_cndmask_b32_e32 v1, v32, v34, vcc ; GFX940-NEXT: v_cmp_o_f32_e32 vcc, v2, v18 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v16, v32, v16, vcc -; GFX940-NEXT: v_cmp_class_f32_e64 vcc, v2, 32 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v2, v16, v2, vcc -; GFX940-NEXT: v_cmp_class_f32_e64 vcc, v18, 32 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v2, v2, v18, vcc -; GFX940-NEXT: v_cmp_eq_f32_e32 vcc, 0, v16 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v2, v16, v2, vcc -; GFX940-NEXT: v_cmp_lt_f32_e32 vcc, v3, v19 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v16, v19, v3, vcc +; GFX940-NEXT: v_min_f32_e32 v38, v5, v21 +; GFX940-NEXT: v_min_f32_e32 v39, v6, v22 +; GFX940-NEXT: v_cndmask_b32_e32 v2, v32, v35, vcc ; GFX940-NEXT: v_cmp_o_f32_e32 vcc, v3, v19 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v16, v32, v16, vcc -; GFX940-NEXT: v_cmp_class_f32_e64 vcc, v3, 32 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v3, v16, v3, vcc -; GFX940-NEXT: v_cmp_class_f32_e64 vcc, v19, 32 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v3, v3, v19, vcc -; GFX940-NEXT: v_cmp_eq_f32_e32 vcc, 0, 
v16 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v3, v16, v3, vcc -; GFX940-NEXT: v_cmp_lt_f32_e32 vcc, v4, v20 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v16, v20, v4, vcc +; GFX940-NEXT: v_min_f32_e32 v48, v7, v23 +; GFX940-NEXT: v_min_f32_e32 v49, v8, v24 +; GFX940-NEXT: v_cndmask_b32_e32 v3, v32, v36, vcc ; GFX940-NEXT: v_cmp_o_f32_e32 vcc, v4, v20 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v16, v32, v16, vcc -; GFX940-NEXT: v_cmp_class_f32_e64 vcc, v4, 32 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v4, v16, v4, vcc -; GFX940-NEXT: v_cmp_class_f32_e64 vcc, v20, 32 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v4, v4, v20, vcc -; GFX940-NEXT: v_cmp_eq_f32_e32 vcc, 0, v16 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v4, v16, v4, vcc -; GFX940-NEXT: v_cmp_lt_f32_e32 vcc, v5, v21 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v16, v21, v5, vcc +; GFX940-NEXT: v_min_f32_e32 v50, v9, v25 +; GFX940-NEXT: v_min_f32_e32 v51, v10, v26 +; GFX940-NEXT: v_cndmask_b32_e32 v4, v32, v37, vcc ; GFX940-NEXT: v_cmp_o_f32_e32 vcc, v5, v21 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v16, v32, v16, vcc -; GFX940-NEXT: v_cmp_class_f32_e64 vcc, v5, 32 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v5, v16, v5, vcc -; GFX940-NEXT: v_cmp_class_f32_e64 vcc, v21, 32 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v5, v5, v21, vcc -; GFX940-NEXT: v_cmp_eq_f32_e32 vcc, 0, v16 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v5, v16, v5, vcc -; GFX940-NEXT: v_cmp_lt_f32_e32 vcc, v6, v22 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v16, v22, v6, vcc +; GFX940-NEXT: v_min_f32_e32 v52, v11, v27 +; GFX940-NEXT: v_min_f32_e32 v53, v12, v28 +; GFX940-NEXT: v_cndmask_b32_e32 v5, v32, v38, vcc ; GFX940-NEXT: v_cmp_o_f32_e32 vcc, v6, v22 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v16, v32, v16, vcc -; GFX940-NEXT: v_cmp_class_f32_e64 vcc, 
v6, 32 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v6, v16, v6, vcc -; GFX940-NEXT: v_cmp_class_f32_e64 vcc, v22, 32 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v6, v6, v22, vcc -; GFX940-NEXT: v_cmp_eq_f32_e32 vcc, 0, v16 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v6, v16, v6, vcc -; GFX940-NEXT: v_cmp_lt_f32_e32 vcc, v7, v23 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v16, v23, v7, vcc +; GFX940-NEXT: v_min_f32_e32 v54, v13, v29 +; GFX940-NEXT: v_min_f32_e32 v55, v14, v30 +; GFX940-NEXT: v_cndmask_b32_e32 v6, v32, v39, vcc ; GFX940-NEXT: v_cmp_o_f32_e32 vcc, v7, v23 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v16, v32, v16, vcc -; GFX940-NEXT: v_cmp_class_f32_e64 vcc, v7, 32 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v7, v16, v7, vcc -; GFX940-NEXT: v_cmp_class_f32_e64 vcc, v23, 32 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v7, v7, v23, vcc -; GFX940-NEXT: v_cmp_eq_f32_e32 vcc, 0, v16 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v7, v16, v7, vcc -; GFX940-NEXT: v_cmp_lt_f32_e32 vcc, v8, v24 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v16, v24, v8, vcc +; GFX940-NEXT: s_waitcnt vmcnt(0) +; GFX940-NEXT: v_min_f32_e32 v16, v15, v31 +; GFX940-NEXT: v_cndmask_b32_e32 v7, v32, v48, vcc ; GFX940-NEXT: v_cmp_o_f32_e32 vcc, v8, v24 ; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v16, v32, v16, vcc -; GFX940-NEXT: v_cmp_class_f32_e64 vcc, v8, 32 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v8, v16, v8, vcc -; GFX940-NEXT: v_cmp_class_f32_e64 vcc, v24, 32 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v8, v8, v24, vcc -; GFX940-NEXT: v_cmp_eq_f32_e32 vcc, 0, v16 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v8, v16, v8, vcc -; GFX940-NEXT: v_cmp_lt_f32_e32 vcc, v9, v25 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v16, v25, v9, vcc +; GFX940-NEXT: v_cndmask_b32_e32 v8, v32, v49, vcc ; 
GFX940-NEXT: v_cmp_o_f32_e32 vcc, v9, v25 ; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v16, v32, v16, vcc -; GFX940-NEXT: v_cmp_class_f32_e64 vcc, v9, 32 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v9, v16, v9, vcc -; GFX940-NEXT: v_cmp_class_f32_e64 vcc, v25, 32 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v9, v9, v25, vcc -; GFX940-NEXT: v_cmp_eq_f32_e32 vcc, 0, v16 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v9, v16, v9, vcc -; GFX940-NEXT: v_cmp_lt_f32_e32 vcc, v10, v26 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v16, v26, v10, vcc +; GFX940-NEXT: v_cndmask_b32_e32 v9, v32, v50, vcc ; GFX940-NEXT: v_cmp_o_f32_e32 vcc, v10, v26 ; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v16, v32, v16, vcc -; GFX940-NEXT: v_cmp_class_f32_e64 vcc, v10, 32 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v10, v16, v10, vcc -; GFX940-NEXT: v_cmp_class_f32_e64 vcc, v26, 32 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v10, v10, v26, vcc -; GFX940-NEXT: v_cmp_eq_f32_e32 vcc, 0, v16 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v10, v16, v10, vcc -; GFX940-NEXT: v_cmp_lt_f32_e32 vcc, v11, v27 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v16, v27, v11, vcc +; GFX940-NEXT: v_cndmask_b32_e32 v10, v32, v51, vcc ; GFX940-NEXT: v_cmp_o_f32_e32 vcc, v11, v27 ; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v16, v32, v16, vcc -; GFX940-NEXT: v_cmp_class_f32_e64 vcc, v11, 32 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v11, v16, v11, vcc -; GFX940-NEXT: v_cmp_class_f32_e64 vcc, v27, 32 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v11, v11, v27, vcc -; GFX940-NEXT: v_cmp_eq_f32_e32 vcc, 0, v16 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v11, v16, v11, vcc -; GFX940-NEXT: v_cmp_lt_f32_e32 vcc, v12, v28 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v16, v28, v12, vcc +; GFX940-NEXT: v_cndmask_b32_e32 v11, v32, 
v52, vcc ; GFX940-NEXT: v_cmp_o_f32_e32 vcc, v12, v28 ; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v16, v32, v16, vcc -; GFX940-NEXT: v_cmp_class_f32_e64 vcc, v12, 32 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v12, v16, v12, vcc -; GFX940-NEXT: v_cmp_class_f32_e64 vcc, v28, 32 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v12, v12, v28, vcc -; GFX940-NEXT: v_cmp_eq_f32_e32 vcc, 0, v16 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v12, v16, v12, vcc -; GFX940-NEXT: v_cmp_lt_f32_e32 vcc, v13, v29 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v16, v29, v13, vcc +; GFX940-NEXT: v_cndmask_b32_e32 v12, v32, v53, vcc ; GFX940-NEXT: v_cmp_o_f32_e32 vcc, v13, v29 ; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v16, v32, v16, vcc -; GFX940-NEXT: v_cmp_class_f32_e64 vcc, v13, 32 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v13, v16, v13, vcc -; GFX940-NEXT: v_cmp_class_f32_e64 vcc, v29, 32 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v13, v13, v29, vcc -; GFX940-NEXT: v_cmp_eq_f32_e32 vcc, 0, v16 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v13, v16, v13, vcc -; GFX940-NEXT: v_cmp_lt_f32_e32 vcc, v14, v30 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v16, v30, v14, vcc +; GFX940-NEXT: v_cndmask_b32_e32 v13, v32, v54, vcc ; GFX940-NEXT: v_cmp_o_f32_e32 vcc, v14, v30 ; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v16, v32, v16, vcc -; GFX940-NEXT: v_cmp_class_f32_e64 vcc, v14, 32 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v14, v16, v14, vcc -; GFX940-NEXT: v_cmp_class_f32_e64 vcc, v30, 32 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v14, v14, v30, vcc -; GFX940-NEXT: v_cmp_eq_f32_e32 vcc, 0, v16 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v14, v16, v14, vcc -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_cmp_lt_f32_e32 vcc, v15, v31 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v16, v31, 
v15, vcc +; GFX940-NEXT: v_cndmask_b32_e32 v14, v32, v55, vcc ; GFX940-NEXT: v_cmp_o_f32_e32 vcc, v15, v31 ; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v16, v32, v16, vcc -; GFX940-NEXT: v_cmp_class_f32_e64 vcc, v15, 32 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v15, v16, v15, vcc -; GFX940-NEXT: v_cmp_class_f32_e64 vcc, v31, 32 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v15, v15, v31, vcc -; GFX940-NEXT: v_cmp_eq_f32_e32 vcc, 0, v16 -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v15, v16, v15, vcc +; GFX940-NEXT: v_cndmask_b32_e32 v15, v32, v16, vcc ; GFX940-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_minimum_v16f32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_cmp_lt_f32_e32 vcc_lo, v0, v16 ; GFX10-NEXT: buffer_load_dword v31, off, s[0:3], s32 -; GFX10-NEXT: v_cndmask_b32_e32 v32, v16, v0, vcc_lo -; GFX10-NEXT: v_cmp_lt_f32_e32 vcc_lo, v1, v17 -; GFX10-NEXT: v_cndmask_b32_e32 v33, v17, v1, vcc_lo -; GFX10-NEXT: v_cmp_lt_f32_e32 vcc_lo, v2, v18 -; GFX10-NEXT: v_cndmask_b32_e32 v34, v18, v2, vcc_lo -; GFX10-NEXT: v_cmp_lt_f32_e32 vcc_lo, v3, v19 -; GFX10-NEXT: v_cndmask_b32_e32 v35, v19, v3, vcc_lo -; GFX10-NEXT: v_cmp_lt_f32_e32 vcc_lo, v4, v20 -; GFX10-NEXT: v_cndmask_b32_e32 v36, v20, v4, vcc_lo -; GFX10-NEXT: v_cmp_lt_f32_e32 vcc_lo, v5, v21 -; GFX10-NEXT: v_cndmask_b32_e32 v37, v21, v5, vcc_lo -; GFX10-NEXT: v_cmp_lt_f32_e32 vcc_lo, v6, v22 -; GFX10-NEXT: v_cndmask_b32_e32 v38, v22, v6, vcc_lo -; GFX10-NEXT: v_cmp_lt_f32_e32 vcc_lo, v7, v23 -; GFX10-NEXT: v_cndmask_b32_e32 v39, v23, v7, vcc_lo -; GFX10-NEXT: v_cmp_lt_f32_e32 vcc_lo, v8, v24 -; GFX10-NEXT: v_cndmask_b32_e32 v48, v24, v8, vcc_lo -; GFX10-NEXT: v_cmp_lt_f32_e32 vcc_lo, v9, v25 -; GFX10-NEXT: v_cndmask_b32_e32 v49, v25, v9, vcc_lo -; GFX10-NEXT: v_cmp_lt_f32_e32 vcc_lo, v10, v26 -; GFX10-NEXT: v_cndmask_b32_e32 v50, v26, v10, vcc_lo -; GFX10-NEXT: v_cmp_lt_f32_e32 vcc_lo, v11, v27 -; GFX10-NEXT: 
v_cndmask_b32_e32 v51, v27, v11, vcc_lo -; GFX10-NEXT: v_cmp_lt_f32_e32 vcc_lo, v12, v28 -; GFX10-NEXT: v_cndmask_b32_e32 v52, v28, v12, vcc_lo -; GFX10-NEXT: v_cmp_lt_f32_e32 vcc_lo, v13, v29 -; GFX10-NEXT: v_cndmask_b32_e32 v53, v29, v13, vcc_lo -; GFX10-NEXT: v_cmp_lt_f32_e32 vcc_lo, v14, v30 -; GFX10-NEXT: v_cndmask_b32_e32 v54, v30, v14, vcc_lo +; GFX10-NEXT: v_min_f32_e32 v32, v0, v16 ; GFX10-NEXT: v_cmp_o_f32_e32 vcc_lo, v0, v16 -; GFX10-NEXT: v_cndmask_b32_e32 v32, 0x7fc00000, v32, vcc_lo +; GFX10-NEXT: v_min_f32_e32 v33, v1, v17 +; GFX10-NEXT: v_min_f32_e32 v34, v2, v18 +; GFX10-NEXT: v_min_f32_e32 v35, v3, v19 +; GFX10-NEXT: v_min_f32_e32 v36, v4, v20 +; GFX10-NEXT: v_cndmask_b32_e32 v0, 0x7fc00000, v32, vcc_lo ; GFX10-NEXT: v_cmp_o_f32_e32 vcc_lo, v1, v17 -; GFX10-NEXT: v_cndmask_b32_e32 v33, 0x7fc00000, v33, vcc_lo +; GFX10-NEXT: v_min_f32_e32 v37, v5, v21 +; GFX10-NEXT: v_min_f32_e32 v38, v6, v22 +; GFX10-NEXT: v_min_f32_e32 v39, v7, v23 +; GFX10-NEXT: v_min_f32_e32 v48, v8, v24 +; GFX10-NEXT: v_cndmask_b32_e32 v1, 0x7fc00000, v33, vcc_lo ; GFX10-NEXT: v_cmp_o_f32_e32 vcc_lo, v2, v18 -; GFX10-NEXT: v_cndmask_b32_e32 v34, 0x7fc00000, v34, vcc_lo +; GFX10-NEXT: v_min_f32_e32 v49, v9, v25 +; GFX10-NEXT: v_min_f32_e32 v50, v10, v26 +; GFX10-NEXT: v_min_f32_e32 v51, v11, v27 +; GFX10-NEXT: v_min_f32_e32 v52, v12, v28 +; GFX10-NEXT: v_cndmask_b32_e32 v2, 0x7fc00000, v34, vcc_lo ; GFX10-NEXT: v_cmp_o_f32_e32 vcc_lo, v3, v19 -; GFX10-NEXT: v_cndmask_b32_e32 v35, 0x7fc00000, v35, vcc_lo +; GFX10-NEXT: v_min_f32_e32 v53, v13, v29 +; GFX10-NEXT: v_min_f32_e32 v54, v14, v30 +; GFX10-NEXT: v_cndmask_b32_e32 v3, 0x7fc00000, v35, vcc_lo ; GFX10-NEXT: v_cmp_o_f32_e32 vcc_lo, v4, v20 -; GFX10-NEXT: v_cndmask_b32_e32 v36, 0x7fc00000, v36, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v4, 0x7fc00000, v36, vcc_lo ; GFX10-NEXT: v_cmp_o_f32_e32 vcc_lo, v5, v21 -; GFX10-NEXT: v_cndmask_b32_e32 v37, 0x7fc00000, v37, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v5, 0x7fc00000, v37, 
vcc_lo ; GFX10-NEXT: v_cmp_o_f32_e32 vcc_lo, v6, v22 -; GFX10-NEXT: v_cndmask_b32_e32 v38, 0x7fc00000, v38, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v6, 0x7fc00000, v38, vcc_lo ; GFX10-NEXT: v_cmp_o_f32_e32 vcc_lo, v7, v23 -; GFX10-NEXT: v_cndmask_b32_e32 v39, 0x7fc00000, v39, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v7, 0x7fc00000, v39, vcc_lo ; GFX10-NEXT: v_cmp_o_f32_e32 vcc_lo, v8, v24 -; GFX10-NEXT: v_cndmask_b32_e32 v48, 0x7fc00000, v48, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v8, 0x7fc00000, v48, vcc_lo ; GFX10-NEXT: v_cmp_o_f32_e32 vcc_lo, v9, v25 -; GFX10-NEXT: v_cndmask_b32_e32 v49, 0x7fc00000, v49, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v9, 0x7fc00000, v49, vcc_lo ; GFX10-NEXT: v_cmp_o_f32_e32 vcc_lo, v10, v26 -; GFX10-NEXT: v_cndmask_b32_e32 v50, 0x7fc00000, v50, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v10, 0x7fc00000, v50, vcc_lo ; GFX10-NEXT: v_cmp_o_f32_e32 vcc_lo, v11, v27 -; GFX10-NEXT: v_cndmask_b32_e32 v51, 0x7fc00000, v51, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v11, 0x7fc00000, v51, vcc_lo ; GFX10-NEXT: v_cmp_o_f32_e32 vcc_lo, v12, v28 -; GFX10-NEXT: v_cndmask_b32_e32 v52, 0x7fc00000, v52, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v12, 0x7fc00000, v52, vcc_lo ; GFX10-NEXT: v_cmp_o_f32_e32 vcc_lo, v13, v29 -; GFX10-NEXT: v_cndmask_b32_e32 v53, 0x7fc00000, v53, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v13, 0x7fc00000, v53, vcc_lo ; GFX10-NEXT: v_cmp_o_f32_e32 vcc_lo, v14, v30 -; GFX10-NEXT: v_cndmask_b32_e32 v54, 0x7fc00000, v54, vcc_lo -; GFX10-NEXT: v_cmp_class_f32_e64 vcc_lo, v0, 32 -; GFX10-NEXT: v_cndmask_b32_e32 v0, v32, v0, vcc_lo -; GFX10-NEXT: v_cmp_class_f32_e64 vcc_lo, v1, 32 -; GFX10-NEXT: v_cndmask_b32_e32 v1, v33, v1, vcc_lo -; GFX10-NEXT: v_cmp_class_f32_e64 vcc_lo, v2, 32 -; GFX10-NEXT: v_cndmask_b32_e32 v2, v34, v2, vcc_lo -; GFX10-NEXT: v_cmp_class_f32_e64 vcc_lo, v3, 32 -; GFX10-NEXT: v_cndmask_b32_e32 v3, v35, v3, vcc_lo -; GFX10-NEXT: v_cmp_class_f32_e64 vcc_lo, v4, 32 -; GFX10-NEXT: v_cndmask_b32_e32 v4, v36, v4, vcc_lo -; 
GFX10-NEXT: v_cmp_class_f32_e64 vcc_lo, v5, 32 -; GFX10-NEXT: v_cndmask_b32_e32 v5, v37, v5, vcc_lo -; GFX10-NEXT: v_cmp_class_f32_e64 vcc_lo, v6, 32 -; GFX10-NEXT: v_cndmask_b32_e32 v6, v38, v6, vcc_lo -; GFX10-NEXT: v_cmp_class_f32_e64 vcc_lo, v7, 32 -; GFX10-NEXT: v_cndmask_b32_e32 v7, v39, v7, vcc_lo -; GFX10-NEXT: v_cmp_class_f32_e64 vcc_lo, v8, 32 -; GFX10-NEXT: v_cndmask_b32_e32 v8, v48, v8, vcc_lo -; GFX10-NEXT: v_cmp_class_f32_e64 vcc_lo, v9, 32 -; GFX10-NEXT: v_cndmask_b32_e32 v9, v49, v9, vcc_lo -; GFX10-NEXT: v_cmp_class_f32_e64 vcc_lo, v10, 32 -; GFX10-NEXT: v_cndmask_b32_e32 v10, v50, v10, vcc_lo -; GFX10-NEXT: v_cmp_class_f32_e64 vcc_lo, v11, 32 -; GFX10-NEXT: v_cndmask_b32_e32 v11, v51, v11, vcc_lo -; GFX10-NEXT: v_cmp_class_f32_e64 vcc_lo, v12, 32 -; GFX10-NEXT: v_cndmask_b32_e32 v12, v52, v12, vcc_lo -; GFX10-NEXT: v_cmp_class_f32_e64 vcc_lo, v13, 32 -; GFX10-NEXT: v_cndmask_b32_e32 v13, v53, v13, vcc_lo -; GFX10-NEXT: v_cmp_class_f32_e64 vcc_lo, v14, 32 -; GFX10-NEXT: v_cndmask_b32_e32 v14, v54, v14, vcc_lo -; GFX10-NEXT: v_cmp_class_f32_e64 vcc_lo, v16, 32 -; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v16, vcc_lo -; GFX10-NEXT: v_cmp_class_f32_e64 vcc_lo, v17, 32 -; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v17, vcc_lo -; GFX10-NEXT: v_cmp_class_f32_e64 vcc_lo, v18, 32 -; GFX10-NEXT: v_cndmask_b32_e32 v2, v2, v18, vcc_lo -; GFX10-NEXT: v_cmp_class_f32_e64 vcc_lo, v19, 32 -; GFX10-NEXT: v_cndmask_b32_e32 v3, v3, v19, vcc_lo -; GFX10-NEXT: v_cmp_class_f32_e64 vcc_lo, v20, 32 -; GFX10-NEXT: v_cndmask_b32_e32 v4, v4, v20, vcc_lo -; GFX10-NEXT: v_cmp_class_f32_e64 vcc_lo, v21, 32 -; GFX10-NEXT: v_cndmask_b32_e32 v5, v5, v21, vcc_lo -; GFX10-NEXT: v_cmp_class_f32_e64 vcc_lo, v22, 32 -; GFX10-NEXT: v_cndmask_b32_e32 v6, v6, v22, vcc_lo -; GFX10-NEXT: v_cmp_class_f32_e64 vcc_lo, v23, 32 -; GFX10-NEXT: v_cndmask_b32_e32 v7, v7, v23, vcc_lo -; GFX10-NEXT: v_cmp_class_f32_e64 vcc_lo, v24, 32 -; GFX10-NEXT: v_cndmask_b32_e32 v8, v8, v24, vcc_lo -; GFX10-NEXT: 
v_cmp_class_f32_e64 vcc_lo, v25, 32 -; GFX10-NEXT: v_cndmask_b32_e32 v9, v9, v25, vcc_lo -; GFX10-NEXT: v_cmp_class_f32_e64 vcc_lo, v26, 32 -; GFX10-NEXT: v_cndmask_b32_e32 v10, v10, v26, vcc_lo -; GFX10-NEXT: v_cmp_class_f32_e64 vcc_lo, v27, 32 -; GFX10-NEXT: v_cndmask_b32_e32 v11, v11, v27, vcc_lo -; GFX10-NEXT: v_cmp_class_f32_e64 vcc_lo, v28, 32 -; GFX10-NEXT: v_cndmask_b32_e32 v12, v12, v28, vcc_lo -; GFX10-NEXT: v_cmp_class_f32_e64 vcc_lo, v29, 32 -; GFX10-NEXT: v_cndmask_b32_e32 v13, v13, v29, vcc_lo -; GFX10-NEXT: v_cmp_class_f32_e64 vcc_lo, v30, 32 -; GFX10-NEXT: v_cndmask_b32_e32 v14, v14, v30, vcc_lo -; GFX10-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v32 -; GFX10-NEXT: v_cndmask_b32_e32 v0, v32, v0, vcc_lo -; GFX10-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v33 -; GFX10-NEXT: v_cndmask_b32_e32 v1, v33, v1, vcc_lo -; GFX10-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v34 -; GFX10-NEXT: v_cndmask_b32_e32 v2, v34, v2, vcc_lo -; GFX10-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v35 -; GFX10-NEXT: v_cndmask_b32_e32 v3, v35, v3, vcc_lo -; GFX10-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v36 -; GFX10-NEXT: v_cndmask_b32_e32 v4, v36, v4, vcc_lo -; GFX10-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v37 -; GFX10-NEXT: v_cndmask_b32_e32 v5, v37, v5, vcc_lo -; GFX10-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v38 -; GFX10-NEXT: v_cndmask_b32_e32 v6, v38, v6, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v14, 0x7fc00000, v54, vcc_lo ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_cmp_lt_f32_e32 vcc_lo, v15, v31 -; GFX10-NEXT: v_cndmask_b32_e32 v16, v31, v15, vcc_lo -; GFX10-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v39 -; GFX10-NEXT: v_cndmask_b32_e32 v7, v39, v7, vcc_lo -; GFX10-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v48 -; GFX10-NEXT: v_cndmask_b32_e32 v8, v48, v8, vcc_lo +; GFX10-NEXT: v_min_f32_e32 v16, v15, v31 ; GFX10-NEXT: v_cmp_o_f32_e32 vcc_lo, v15, v31 -; GFX10-NEXT: v_cndmask_b32_e32 v16, 0x7fc00000, v16, vcc_lo -; GFX10-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v49 -; GFX10-NEXT: v_cndmask_b32_e32 v9, v49, v9, vcc_lo -; GFX10-NEXT: 
v_cmp_eq_f32_e32 vcc_lo, 0, v50 -; GFX10-NEXT: v_cndmask_b32_e32 v10, v50, v10, vcc_lo -; GFX10-NEXT: v_cmp_class_f32_e64 vcc_lo, v15, 32 -; GFX10-NEXT: v_cndmask_b32_e32 v15, v16, v15, vcc_lo -; GFX10-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v51 -; GFX10-NEXT: v_cndmask_b32_e32 v11, v51, v11, vcc_lo -; GFX10-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v52 -; GFX10-NEXT: v_cndmask_b32_e32 v12, v52, v12, vcc_lo -; GFX10-NEXT: v_cmp_class_f32_e64 vcc_lo, v31, 32 -; GFX10-NEXT: v_cndmask_b32_e32 v15, v15, v31, vcc_lo -; GFX10-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v53 -; GFX10-NEXT: v_cndmask_b32_e32 v13, v53, v13, vcc_lo -; GFX10-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v54 -; GFX10-NEXT: v_cndmask_b32_e32 v14, v54, v14, vcc_lo -; GFX10-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v16 -; GFX10-NEXT: v_cndmask_b32_e32 v15, v16, v15, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v15, 0x7fc00000, v16, vcc_lo ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: v_minimum_v16f32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_cmp_lt_f32_e32 vcc_lo, v0, v16 ; GFX11-NEXT: scratch_load_b32 v31, off, s32 -; GFX11-NEXT: v_cndmask_b32_e32 v32, v16, v0, vcc_lo -; GFX11-NEXT: v_cmp_lt_f32_e32 vcc_lo, v1, v17 -; GFX11-NEXT: v_cndmask_b32_e32 v33, v17, v1, vcc_lo -; GFX11-NEXT: v_cmp_lt_f32_e32 vcc_lo, v2, v18 -; GFX11-NEXT: v_cndmask_b32_e32 v34, v18, v2, vcc_lo -; GFX11-NEXT: v_cmp_lt_f32_e32 vcc_lo, v3, v19 -; GFX11-NEXT: v_cndmask_b32_e32 v35, v19, v3, vcc_lo -; GFX11-NEXT: v_cmp_lt_f32_e32 vcc_lo, v4, v20 -; GFX11-NEXT: v_cndmask_b32_e32 v36, v20, v4, vcc_lo -; GFX11-NEXT: v_cmp_lt_f32_e32 vcc_lo, v5, v21 -; GFX11-NEXT: v_cndmask_b32_e32 v37, v21, v5, vcc_lo -; GFX11-NEXT: v_cmp_lt_f32_e32 vcc_lo, v6, v22 -; GFX11-NEXT: v_cndmask_b32_e32 v38, v22, v6, vcc_lo -; GFX11-NEXT: v_cmp_lt_f32_e32 vcc_lo, v7, v23 -; GFX11-NEXT: v_cndmask_b32_e32 v39, v23, v7, vcc_lo -; GFX11-NEXT: v_cmp_lt_f32_e32 vcc_lo, v8, v24 -; GFX11-NEXT: v_cndmask_b32_e32 v48, v24, v8, vcc_lo -; GFX11-NEXT: 
v_cmp_lt_f32_e32 vcc_lo, v9, v25 -; GFX11-NEXT: v_cndmask_b32_e32 v49, v25, v9, vcc_lo -; GFX11-NEXT: v_cmp_lt_f32_e32 vcc_lo, v10, v26 -; GFX11-NEXT: v_cndmask_b32_e32 v50, v26, v10, vcc_lo -; GFX11-NEXT: v_cmp_lt_f32_e32 vcc_lo, v11, v27 -; GFX11-NEXT: v_cndmask_b32_e32 v51, v27, v11, vcc_lo -; GFX11-NEXT: v_cmp_lt_f32_e32 vcc_lo, v12, v28 -; GFX11-NEXT: v_cndmask_b32_e32 v52, v28, v12, vcc_lo -; GFX11-NEXT: v_cmp_lt_f32_e32 vcc_lo, v13, v29 -; GFX11-NEXT: v_cndmask_b32_e32 v53, v29, v13, vcc_lo -; GFX11-NEXT: v_cmp_lt_f32_e32 vcc_lo, v14, v30 -; GFX11-NEXT: v_cndmask_b32_e32 v54, v30, v14, vcc_lo +; GFX11-NEXT: v_dual_min_f32 v32, v0, v16 :: v_dual_min_f32 v33, v1, v17 ; GFX11-NEXT: v_cmp_o_f32_e32 vcc_lo, v0, v16 -; GFX11-NEXT: v_cndmask_b32_e32 v32, 0x7fc00000, v32, vcc_lo +; GFX11-NEXT: v_dual_min_f32 v34, v2, v18 :: v_dual_min_f32 v35, v3, v19 +; GFX11-NEXT: v_dual_min_f32 v36, v4, v20 :: v_dual_min_f32 v37, v5, v21 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-NEXT: v_cndmask_b32_e32 v0, 0x7fc00000, v32, vcc_lo ; GFX11-NEXT: v_cmp_o_f32_e32 vcc_lo, v1, v17 -; GFX11-NEXT: v_cndmask_b32_e32 v33, 0x7fc00000, v33, vcc_lo +; GFX11-NEXT: v_min_f32_e32 v54, v14, v30 +; GFX11-NEXT: v_dual_min_f32 v38, v6, v22 :: v_dual_min_f32 v39, v7, v23 +; GFX11-NEXT: v_dual_min_f32 v48, v8, v24 :: v_dual_min_f32 v49, v9, v25 +; GFX11-NEXT: v_cndmask_b32_e32 v1, 0x7fc00000, v33, vcc_lo ; GFX11-NEXT: v_cmp_o_f32_e32 vcc_lo, v2, v18 -; GFX11-NEXT: v_cndmask_b32_e32 v34, 0x7fc00000, v34, vcc_lo +; GFX11-NEXT: v_dual_min_f32 v50, v10, v26 :: v_dual_min_f32 v51, v11, v27 +; GFX11-NEXT: v_dual_min_f32 v52, v12, v28 :: v_dual_min_f32 v53, v13, v29 +; GFX11-NEXT: v_cndmask_b32_e32 v2, 0x7fc00000, v34, vcc_lo ; GFX11-NEXT: v_cmp_o_f32_e32 vcc_lo, v3, v19 -; GFX11-NEXT: v_cndmask_b32_e32 v35, 0x7fc00000, v35, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e32 v3, 0x7fc00000, v35, vcc_lo ; GFX11-NEXT: v_cmp_o_f32_e32 vcc_lo, v4, v20 -; GFX11-NEXT: v_cndmask_b32_e32 v36, 0x7fc00000, v36, 
vcc_lo +; GFX11-NEXT: v_cndmask_b32_e32 v4, 0x7fc00000, v36, vcc_lo ; GFX11-NEXT: v_cmp_o_f32_e32 vcc_lo, v5, v21 -; GFX11-NEXT: v_cndmask_b32_e32 v37, 0x7fc00000, v37, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e32 v5, 0x7fc00000, v37, vcc_lo ; GFX11-NEXT: v_cmp_o_f32_e32 vcc_lo, v6, v22 -; GFX11-NEXT: v_cndmask_b32_e32 v38, 0x7fc00000, v38, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e32 v6, 0x7fc00000, v38, vcc_lo ; GFX11-NEXT: v_cmp_o_f32_e32 vcc_lo, v7, v23 -; GFX11-NEXT: v_cndmask_b32_e32 v39, 0x7fc00000, v39, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e32 v7, 0x7fc00000, v39, vcc_lo ; GFX11-NEXT: v_cmp_o_f32_e32 vcc_lo, v8, v24 -; GFX11-NEXT: v_cndmask_b32_e32 v48, 0x7fc00000, v48, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e32 v8, 0x7fc00000, v48, vcc_lo ; GFX11-NEXT: v_cmp_o_f32_e32 vcc_lo, v9, v25 -; GFX11-NEXT: v_cndmask_b32_e32 v49, 0x7fc00000, v49, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e32 v9, 0x7fc00000, v49, vcc_lo ; GFX11-NEXT: v_cmp_o_f32_e32 vcc_lo, v10, v26 -; GFX11-NEXT: v_cndmask_b32_e32 v50, 0x7fc00000, v50, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e32 v10, 0x7fc00000, v50, vcc_lo ; GFX11-NEXT: v_cmp_o_f32_e32 vcc_lo, v11, v27 -; GFX11-NEXT: v_cndmask_b32_e32 v51, 0x7fc00000, v51, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e32 v11, 0x7fc00000, v51, vcc_lo ; GFX11-NEXT: v_cmp_o_f32_e32 vcc_lo, v12, v28 -; GFX11-NEXT: v_cndmask_b32_e32 v52, 0x7fc00000, v52, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e32 v12, 0x7fc00000, v52, vcc_lo ; GFX11-NEXT: v_cmp_o_f32_e32 vcc_lo, v13, v29 -; GFX11-NEXT: v_cndmask_b32_e32 v53, 0x7fc00000, v53, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e32 v13, 0x7fc00000, v53, vcc_lo ; GFX11-NEXT: v_cmp_o_f32_e32 vcc_lo, v14, v30 -; GFX11-NEXT: v_cndmask_b32_e32 v54, 0x7fc00000, v54, vcc_lo -; GFX11-NEXT: v_cmp_class_f32_e64 vcc_lo, v0, 32 -; GFX11-NEXT: v_cndmask_b32_e32 v0, v32, v0, vcc_lo -; GFX11-NEXT: v_cmp_class_f32_e64 vcc_lo, v1, 32 -; GFX11-NEXT: v_cndmask_b32_e32 v1, v33, v1, vcc_lo -; GFX11-NEXT: v_cmp_class_f32_e64 vcc_lo, v2, 32 -; GFX11-NEXT: v_cndmask_b32_e32 v2, 
v34, v2, vcc_lo -; GFX11-NEXT: v_cmp_class_f32_e64 vcc_lo, v3, 32 -; GFX11-NEXT: v_cndmask_b32_e32 v3, v35, v3, vcc_lo -; GFX11-NEXT: v_cmp_class_f32_e64 vcc_lo, v4, 32 -; GFX11-NEXT: v_cndmask_b32_e32 v4, v36, v4, vcc_lo -; GFX11-NEXT: v_cmp_class_f32_e64 vcc_lo, v5, 32 -; GFX11-NEXT: v_cndmask_b32_e32 v5, v37, v5, vcc_lo -; GFX11-NEXT: v_cmp_class_f32_e64 vcc_lo, v6, 32 -; GFX11-NEXT: v_cndmask_b32_e32 v6, v38, v6, vcc_lo -; GFX11-NEXT: v_cmp_class_f32_e64 vcc_lo, v7, 32 -; GFX11-NEXT: v_cndmask_b32_e32 v7, v39, v7, vcc_lo -; GFX11-NEXT: v_cmp_class_f32_e64 vcc_lo, v8, 32 -; GFX11-NEXT: v_cndmask_b32_e32 v8, v48, v8, vcc_lo -; GFX11-NEXT: v_cmp_class_f32_e64 vcc_lo, v9, 32 -; GFX11-NEXT: v_cndmask_b32_e32 v9, v49, v9, vcc_lo -; GFX11-NEXT: v_cmp_class_f32_e64 vcc_lo, v10, 32 -; GFX11-NEXT: v_cndmask_b32_e32 v10, v50, v10, vcc_lo -; GFX11-NEXT: v_cmp_class_f32_e64 vcc_lo, v11, 32 -; GFX11-NEXT: v_cndmask_b32_e32 v11, v51, v11, vcc_lo -; GFX11-NEXT: v_cmp_class_f32_e64 vcc_lo, v12, 32 -; GFX11-NEXT: v_cndmask_b32_e32 v12, v52, v12, vcc_lo -; GFX11-NEXT: v_cmp_class_f32_e64 vcc_lo, v13, 32 -; GFX11-NEXT: v_cndmask_b32_e32 v13, v53, v13, vcc_lo -; GFX11-NEXT: v_cmp_class_f32_e64 vcc_lo, v14, 32 -; GFX11-NEXT: v_cndmask_b32_e32 v14, v54, v14, vcc_lo -; GFX11-NEXT: v_cmp_class_f32_e64 vcc_lo, v16, 32 -; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v16, vcc_lo -; GFX11-NEXT: v_cmp_class_f32_e64 vcc_lo, v17, 32 -; GFX11-NEXT: v_cndmask_b32_e32 v1, v1, v17, vcc_lo -; GFX11-NEXT: v_cmp_class_f32_e64 vcc_lo, v18, 32 -; GFX11-NEXT: v_cndmask_b32_e32 v2, v2, v18, vcc_lo -; GFX11-NEXT: v_cmp_class_f32_e64 vcc_lo, v19, 32 -; GFX11-NEXT: v_cndmask_b32_e32 v3, v3, v19, vcc_lo -; GFX11-NEXT: v_cmp_class_f32_e64 vcc_lo, v20, 32 -; GFX11-NEXT: v_cndmask_b32_e32 v4, v4, v20, vcc_lo -; GFX11-NEXT: v_cmp_class_f32_e64 vcc_lo, v21, 32 -; GFX11-NEXT: v_cndmask_b32_e32 v5, v5, v21, vcc_lo -; GFX11-NEXT: v_cmp_class_f32_e64 vcc_lo, v22, 32 -; GFX11-NEXT: v_cndmask_b32_e32 v6, v6, v22, vcc_lo -; 
GFX11-NEXT: v_cmp_class_f32_e64 vcc_lo, v23, 32 -; GFX11-NEXT: v_cndmask_b32_e32 v7, v7, v23, vcc_lo -; GFX11-NEXT: v_cmp_class_f32_e64 vcc_lo, v24, 32 -; GFX11-NEXT: v_cndmask_b32_e32 v8, v8, v24, vcc_lo -; GFX11-NEXT: v_cmp_class_f32_e64 vcc_lo, v25, 32 -; GFX11-NEXT: v_cndmask_b32_e32 v9, v9, v25, vcc_lo -; GFX11-NEXT: v_cmp_class_f32_e64 vcc_lo, v26, 32 -; GFX11-NEXT: v_cndmask_b32_e32 v10, v10, v26, vcc_lo -; GFX11-NEXT: v_cmp_class_f32_e64 vcc_lo, v27, 32 -; GFX11-NEXT: v_cndmask_b32_e32 v11, v11, v27, vcc_lo -; GFX11-NEXT: v_cmp_class_f32_e64 vcc_lo, v28, 32 -; GFX11-NEXT: v_cndmask_b32_e32 v12, v12, v28, vcc_lo -; GFX11-NEXT: v_cmp_class_f32_e64 vcc_lo, v29, 32 -; GFX11-NEXT: v_cndmask_b32_e32 v13, v13, v29, vcc_lo -; GFX11-NEXT: v_cmp_class_f32_e64 vcc_lo, v30, 32 -; GFX11-NEXT: v_cndmask_b32_e32 v14, v14, v30, vcc_lo -; GFX11-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v32 -; GFX11-NEXT: v_cndmask_b32_e32 v0, v32, v0, vcc_lo -; GFX11-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v33 -; GFX11-NEXT: v_cndmask_b32_e32 v1, v33, v1, vcc_lo -; GFX11-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v34 -; GFX11-NEXT: v_cndmask_b32_e32 v2, v34, v2, vcc_lo -; GFX11-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v35 -; GFX11-NEXT: v_cndmask_b32_e32 v3, v35, v3, vcc_lo -; GFX11-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v36 -; GFX11-NEXT: v_cndmask_b32_e32 v4, v36, v4, vcc_lo -; GFX11-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v37 -; GFX11-NEXT: v_cndmask_b32_e32 v5, v37, v5, vcc_lo -; GFX11-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v38 -; GFX11-NEXT: v_cndmask_b32_e32 v6, v38, v6, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e32 v14, 0x7fc00000, v54, vcc_lo ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_cmp_lt_f32_e32 vcc_lo, v15, v31 -; GFX11-NEXT: v_cndmask_b32_e32 v16, v31, v15, vcc_lo -; GFX11-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v39 -; GFX11-NEXT: v_cndmask_b32_e32 v7, v39, v7, vcc_lo -; GFX11-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v48 -; GFX11-NEXT: v_cndmask_b32_e32 v8, v48, v8, vcc_lo +; GFX11-NEXT: v_min_f32_e32 v16, v15, v31 ; GFX11-NEXT: 
v_cmp_o_f32_e32 vcc_lo, v15, v31 -; GFX11-NEXT: v_cndmask_b32_e32 v16, 0x7fc00000, v16, vcc_lo -; GFX11-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v49 -; GFX11-NEXT: v_cndmask_b32_e32 v9, v49, v9, vcc_lo -; GFX11-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v50 -; GFX11-NEXT: v_cndmask_b32_e32 v10, v50, v10, vcc_lo -; GFX11-NEXT: v_cmp_class_f32_e64 vcc_lo, v15, 32 -; GFX11-NEXT: v_cndmask_b32_e32 v15, v16, v15, vcc_lo -; GFX11-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v51 -; GFX11-NEXT: v_cndmask_b32_e32 v11, v51, v11, vcc_lo -; GFX11-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v52 -; GFX11-NEXT: v_cndmask_b32_e32 v12, v52, v12, vcc_lo -; GFX11-NEXT: v_cmp_class_f32_e64 vcc_lo, v31, 32 -; GFX11-NEXT: v_cndmask_b32_e32 v15, v15, v31, vcc_lo -; GFX11-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v53 -; GFX11-NEXT: v_cndmask_b32_e32 v13, v53, v13, vcc_lo -; GFX11-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v54 -; GFX11-NEXT: v_cndmask_b32_e32 v14, v54, v14, vcc_lo -; GFX11-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v16 -; GFX11-NEXT: v_cndmask_b32_e32 v15, v16, v15, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: v_cndmask_b32_e32 v15, 0x7fc00000, v16, vcc_lo ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-LABEL: v_minimum_v16f32: diff --git a/llvm/test/CodeGen/AMDGPU/llvm.minimum.f64.ll b/llvm/test/CodeGen/AMDGPU/llvm.minimum.f64.ll index 7013c60..37fe2e9 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.minimum.f64.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.minimum.f64.ll @@ -530,221 +530,86 @@ define <2 x double> @v_minimum_v2f64(<2 x double> %src0, <2 x double> %src1) { ; GFX7-LABEL: v_minimum_v2f64: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_cmp_lt_f64_e32 vcc, v[0:1], v[4:5] -; GFX7-NEXT: v_cmp_o_f64_e64 s[4:5], v[0:1], v[4:5] -; GFX7-NEXT: v_mov_b32_e32 v10, 0x7ff80000 -; GFX7-NEXT: v_cmp_class_f64_e64 s[6:7], v[0:1], 32 -; GFX7-NEXT: v_cmp_class_f64_e64 s[8:9], v[4:5], 32 -; GFX7-NEXT: v_cndmask_b32_e32 v8, v5, v1, vcc -; GFX7-NEXT: v_cndmask_b32_e64 v9, v10, v8, s[4:5] -; 
GFX7-NEXT: v_cndmask_b32_e32 v8, v4, v0, vcc -; GFX7-NEXT: v_cmp_lt_f64_e32 vcc, v[2:3], v[6:7] -; GFX7-NEXT: v_cndmask_b32_e64 v8, 0, v8, s[4:5] -; GFX7-NEXT: v_cmp_o_f64_e64 s[4:5], v[2:3], v[6:7] -; GFX7-NEXT: v_cndmask_b32_e64 v0, v8, v0, s[6:7] -; GFX7-NEXT: v_cndmask_b32_e64 v0, v0, v4, s[8:9] -; GFX7-NEXT: v_cmp_eq_f64_e64 s[10:11], 0, v[8:9] -; GFX7-NEXT: v_cndmask_b32_e64 v1, v9, v1, s[6:7] -; GFX7-NEXT: v_cndmask_b32_e64 v1, v1, v5, s[8:9] -; GFX7-NEXT: v_cndmask_b32_e32 v11, v7, v3, vcc -; GFX7-NEXT: v_cndmask_b32_e32 v4, v6, v2, vcc -; GFX7-NEXT: v_cmp_class_f64_e64 vcc, v[2:3], 32 -; GFX7-NEXT: v_cndmask_b32_e64 v11, v10, v11, s[4:5] -; GFX7-NEXT: v_cndmask_b32_e64 v10, 0, v4, s[4:5] -; GFX7-NEXT: v_cmp_class_f64_e64 s[4:5], v[6:7], 32 -; GFX7-NEXT: v_cmp_eq_f64_e64 s[12:13], 0, v[10:11] -; GFX7-NEXT: v_cndmask_b32_e64 v0, v8, v0, s[10:11] -; GFX7-NEXT: v_cndmask_b32_e64 v1, v9, v1, s[10:11] -; GFX7-NEXT: v_cndmask_b32_e32 v2, v10, v2, vcc -; GFX7-NEXT: v_cndmask_b32_e32 v3, v11, v3, vcc -; GFX7-NEXT: v_cndmask_b32_e64 v2, v2, v6, s[4:5] -; GFX7-NEXT: v_cndmask_b32_e64 v3, v3, v7, s[4:5] -; GFX7-NEXT: v_cndmask_b32_e64 v2, v10, v2, s[12:13] -; GFX7-NEXT: v_cndmask_b32_e64 v3, v11, v3, s[12:13] +; GFX7-NEXT: v_min_f64 v[8:9], v[0:1], v[4:5] +; GFX7-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[4:5] +; GFX7-NEXT: v_min_f64 v[4:5], v[2:3], v[6:7] +; GFX7-NEXT: v_cmp_u_f64_e64 s[4:5], v[2:3], v[6:7] +; GFX7-NEXT: v_mov_b32_e32 v3, 0x7ff80000 +; GFX7-NEXT: v_cndmask_b32_e64 v0, v8, 0, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v1, v9, v3, vcc +; GFX7-NEXT: v_cndmask_b32_e64 v2, v4, 0, s[4:5] +; GFX7-NEXT: v_cndmask_b32_e64 v3, v5, v3, s[4:5] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_minimum_v2f64: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_cmp_lt_f64_e32 vcc, v[0:1], v[4:5] -; GFX8-NEXT: v_cmp_o_f64_e64 s[4:5], v[0:1], v[4:5] -; GFX8-NEXT: v_mov_b32_e32 v10, 0x7ff80000 -; GFX8-NEXT: v_cmp_class_f64_e64 s[6:7], v[0:1], 32 
-; GFX8-NEXT: v_cmp_class_f64_e64 s[8:9], v[4:5], 32 -; GFX8-NEXT: v_cndmask_b32_e32 v8, v5, v1, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v9, v10, v8, s[4:5] -; GFX8-NEXT: v_cndmask_b32_e32 v8, v4, v0, vcc -; GFX8-NEXT: v_cmp_lt_f64_e32 vcc, v[2:3], v[6:7] -; GFX8-NEXT: v_cndmask_b32_e64 v8, 0, v8, s[4:5] -; GFX8-NEXT: v_cmp_o_f64_e64 s[4:5], v[2:3], v[6:7] -; GFX8-NEXT: v_cndmask_b32_e64 v0, v8, v0, s[6:7] -; GFX8-NEXT: v_cndmask_b32_e64 v0, v0, v4, s[8:9] -; GFX8-NEXT: v_cmp_eq_f64_e64 s[10:11], 0, v[8:9] -; GFX8-NEXT: v_cndmask_b32_e64 v1, v9, v1, s[6:7] -; GFX8-NEXT: v_cndmask_b32_e64 v1, v1, v5, s[8:9] -; GFX8-NEXT: v_cndmask_b32_e32 v11, v7, v3, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v4, v6, v2, vcc -; GFX8-NEXT: v_cmp_class_f64_e64 vcc, v[2:3], 32 -; GFX8-NEXT: v_cndmask_b32_e64 v11, v10, v11, s[4:5] -; GFX8-NEXT: v_cndmask_b32_e64 v10, 0, v4, s[4:5] -; GFX8-NEXT: v_cmp_class_f64_e64 s[4:5], v[6:7], 32 -; GFX8-NEXT: v_cmp_eq_f64_e64 s[12:13], 0, v[10:11] -; GFX8-NEXT: v_cndmask_b32_e64 v0, v8, v0, s[10:11] -; GFX8-NEXT: v_cndmask_b32_e64 v1, v9, v1, s[10:11] -; GFX8-NEXT: v_cndmask_b32_e32 v2, v10, v2, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v3, v11, v3, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v2, v2, v6, s[4:5] -; GFX8-NEXT: v_cndmask_b32_e64 v3, v3, v7, s[4:5] -; GFX8-NEXT: v_cndmask_b32_e64 v2, v10, v2, s[12:13] -; GFX8-NEXT: v_cndmask_b32_e64 v3, v11, v3, s[12:13] +; GFX8-NEXT: v_min_f64 v[8:9], v[0:1], v[4:5] +; GFX8-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[4:5] +; GFX8-NEXT: v_min_f64 v[4:5], v[2:3], v[6:7] +; GFX8-NEXT: v_cmp_u_f64_e64 s[4:5], v[2:3], v[6:7] +; GFX8-NEXT: v_mov_b32_e32 v3, 0x7ff80000 +; GFX8-NEXT: v_cndmask_b32_e64 v0, v8, 0, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v1, v9, v3, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v2, v4, 0, s[4:5] +; GFX8-NEXT: v_cndmask_b32_e64 v3, v5, v3, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_minimum_v2f64: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_cmp_lt_f64_e32 vcc, v[0:1], 
v[4:5] -; GFX9-NEXT: v_cmp_o_f64_e64 s[4:5], v[0:1], v[4:5] -; GFX9-NEXT: v_mov_b32_e32 v10, 0x7ff80000 -; GFX9-NEXT: v_cmp_class_f64_e64 s[6:7], v[0:1], 32 -; GFX9-NEXT: v_cmp_class_f64_e64 s[8:9], v[4:5], 32 -; GFX9-NEXT: v_cndmask_b32_e32 v8, v5, v1, vcc -; GFX9-NEXT: v_cndmask_b32_e64 v9, v10, v8, s[4:5] -; GFX9-NEXT: v_cndmask_b32_e32 v8, v4, v0, vcc -; GFX9-NEXT: v_cmp_lt_f64_e32 vcc, v[2:3], v[6:7] -; GFX9-NEXT: v_cndmask_b32_e64 v8, 0, v8, s[4:5] -; GFX9-NEXT: v_cmp_o_f64_e64 s[4:5], v[2:3], v[6:7] -; GFX9-NEXT: v_cndmask_b32_e64 v0, v8, v0, s[6:7] -; GFX9-NEXT: v_cndmask_b32_e64 v0, v0, v4, s[8:9] -; GFX9-NEXT: v_cmp_eq_f64_e64 s[10:11], 0, v[8:9] -; GFX9-NEXT: v_cndmask_b32_e64 v1, v9, v1, s[6:7] -; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, v5, s[8:9] -; GFX9-NEXT: v_cndmask_b32_e32 v11, v7, v3, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v4, v6, v2, vcc -; GFX9-NEXT: v_cmp_class_f64_e64 vcc, v[2:3], 32 -; GFX9-NEXT: v_cndmask_b32_e64 v11, v10, v11, s[4:5] -; GFX9-NEXT: v_cndmask_b32_e64 v10, 0, v4, s[4:5] -; GFX9-NEXT: v_cmp_class_f64_e64 s[4:5], v[6:7], 32 -; GFX9-NEXT: v_cmp_eq_f64_e64 s[12:13], 0, v[10:11] -; GFX9-NEXT: v_cndmask_b32_e64 v0, v8, v0, s[10:11] -; GFX9-NEXT: v_cndmask_b32_e64 v1, v9, v1, s[10:11] -; GFX9-NEXT: v_cndmask_b32_e32 v2, v10, v2, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v3, v11, v3, vcc -; GFX9-NEXT: v_cndmask_b32_e64 v2, v2, v6, s[4:5] -; GFX9-NEXT: v_cndmask_b32_e64 v3, v3, v7, s[4:5] -; GFX9-NEXT: v_cndmask_b32_e64 v2, v10, v2, s[12:13] -; GFX9-NEXT: v_cndmask_b32_e64 v3, v11, v3, s[12:13] +; GFX9-NEXT: v_min_f64 v[8:9], v[0:1], v[4:5] +; GFX9-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[4:5] +; GFX9-NEXT: v_min_f64 v[4:5], v[2:3], v[6:7] +; GFX9-NEXT: v_cmp_u_f64_e64 s[4:5], v[2:3], v[6:7] +; GFX9-NEXT: v_mov_b32_e32 v3, 0x7ff80000 +; GFX9-NEXT: v_cndmask_b32_e64 v0, v8, 0, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v1, v9, v3, vcc +; GFX9-NEXT: v_cndmask_b32_e64 v2, v4, 0, s[4:5] +; GFX9-NEXT: v_cndmask_b32_e64 v3, v5, v3, s[4:5] ; GFX9-NEXT: s_setpc_b64 
s[30:31] ; ; GFX940-LABEL: v_minimum_v2f64: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_cmp_lt_f64_e32 vcc, v[0:1], v[4:5] -; GFX940-NEXT: v_mov_b32_e32 v10, 0x7ff80000 -; GFX940-NEXT: v_cmp_o_f64_e64 s[0:1], v[0:1], v[4:5] -; GFX940-NEXT: v_cndmask_b32_e32 v8, v5, v1, vcc +; GFX940-NEXT: v_min_f64 v[8:9], v[0:1], v[4:5] +; GFX940-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[4:5] +; GFX940-NEXT: v_min_f64 v[4:5], v[2:3], v[6:7] ; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e64 v9, v10, v8, s[0:1] -; GFX940-NEXT: v_cndmask_b32_e32 v8, v4, v0, vcc -; GFX940-NEXT: v_cndmask_b32_e64 v8, 0, v8, s[0:1] -; GFX940-NEXT: v_cmp_class_f64_e64 vcc, v[0:1], 32 -; GFX940-NEXT: v_cmp_class_f64_e64 s[0:1], v[4:5], 32 -; GFX940-NEXT: v_cmp_eq_f64_e64 s[2:3], 0, v[8:9] -; GFX940-NEXT: v_cndmask_b32_e32 v0, v8, v0, vcc -; GFX940-NEXT: v_cndmask_b32_e32 v1, v9, v1, vcc -; GFX940-NEXT: v_cmp_lt_f64_e32 vcc, v[2:3], v[6:7] -; GFX940-NEXT: v_cndmask_b32_e64 v0, v0, v4, s[0:1] -; GFX940-NEXT: v_cndmask_b32_e64 v1, v1, v5, s[0:1] -; GFX940-NEXT: v_cndmask_b32_e32 v4, v7, v3, vcc -; GFX940-NEXT: v_cmp_o_f64_e64 s[0:1], v[2:3], v[6:7] -; GFX940-NEXT: v_cndmask_b32_e64 v0, v8, v0, s[2:3] -; GFX940-NEXT: v_cndmask_b32_e64 v1, v9, v1, s[2:3] -; GFX940-NEXT: v_cndmask_b32_e64 v5, v10, v4, s[0:1] -; GFX940-NEXT: v_cndmask_b32_e32 v4, v6, v2, vcc -; GFX940-NEXT: v_cndmask_b32_e64 v4, 0, v4, s[0:1] -; GFX940-NEXT: v_cmp_class_f64_e64 vcc, v[2:3], 32 -; GFX940-NEXT: v_cmp_class_f64_e64 s[0:1], v[6:7], 32 -; GFX940-NEXT: v_cmp_eq_f64_e64 s[2:3], 0, v[4:5] -; GFX940-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc -; GFX940-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc -; GFX940-NEXT: v_cndmask_b32_e64 v2, v2, v6, s[0:1] -; GFX940-NEXT: v_cndmask_b32_e64 v3, v3, v7, s[0:1] -; GFX940-NEXT: v_cndmask_b32_e64 v2, v4, v2, s[2:3] -; GFX940-NEXT: v_cndmask_b32_e64 v3, v5, v3, s[2:3] +; GFX940-NEXT: v_cndmask_b32_e64 v0, v8, 0, vcc +; GFX940-NEXT: v_mov_b32_e32 v8, 0x7ff80000 
+; GFX940-NEXT: v_cndmask_b32_e32 v1, v9, v8, vcc +; GFX940-NEXT: v_cmp_u_f64_e32 vcc, v[2:3], v[6:7] +; GFX940-NEXT: s_nop 1 +; GFX940-NEXT: v_cndmask_b32_e64 v2, v4, 0, vcc +; GFX940-NEXT: v_cndmask_b32_e32 v3, v5, v8, vcc ; GFX940-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_minimum_v2f64: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_cmp_lt_f64_e32 vcc_lo, v[0:1], v[4:5] -; GFX10-NEXT: v_cmp_lt_f64_e64 s4, v[2:3], v[6:7] -; GFX10-NEXT: v_cmp_o_f64_e64 s5, v[0:1], v[4:5] -; GFX10-NEXT: v_cmp_o_f64_e64 s6, v[2:3], v[6:7] -; GFX10-NEXT: v_cndmask_b32_e32 v8, v5, v1, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v10, v7, v3, s4 -; GFX10-NEXT: v_cndmask_b32_e32 v12, v4, v0, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v13, v6, v2, s4 -; GFX10-NEXT: v_cmp_class_f64_e64 vcc_lo, v[0:1], 32 -; GFX10-NEXT: v_cmp_class_f64_e64 s4, v[2:3], 32 -; GFX10-NEXT: v_cndmask_b32_e64 v9, 0x7ff80000, v8, s5 -; GFX10-NEXT: v_cndmask_b32_e64 v11, 0x7ff80000, v10, s6 -; GFX10-NEXT: v_cndmask_b32_e64 v8, 0, v12, s5 -; GFX10-NEXT: v_cndmask_b32_e64 v10, 0, v13, s6 -; GFX10-NEXT: v_cmp_class_f64_e64 s5, v[4:5], 32 -; GFX10-NEXT: v_cmp_class_f64_e64 s6, v[6:7], 32 -; GFX10-NEXT: v_cmp_eq_f64_e64 s7, 0, v[8:9] -; GFX10-NEXT: v_cmp_eq_f64_e64 s8, 0, v[10:11] -; GFX10-NEXT: v_cndmask_b32_e32 v0, v8, v0, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v2, v10, v2, s4 -; GFX10-NEXT: v_cndmask_b32_e32 v1, v9, v1, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v3, v11, v3, s4 -; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, v4, s5 -; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, v6, s6 -; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, v5, s5 -; GFX10-NEXT: v_cndmask_b32_e64 v3, v3, v7, s6 -; GFX10-NEXT: v_cndmask_b32_e64 v0, v8, v0, s7 -; GFX10-NEXT: v_cndmask_b32_e64 v2, v10, v2, s8 -; GFX10-NEXT: v_cndmask_b32_e64 v1, v9, v1, s7 -; GFX10-NEXT: v_cndmask_b32_e64 v3, v11, v3, s8 +; GFX10-NEXT: v_min_f64 v[8:9], v[0:1], v[4:5] +; GFX10-NEXT: v_cmp_u_f64_e32 vcc_lo, v[0:1], v[4:5] +; GFX10-NEXT: v_min_f64 
v[4:5], v[2:3], v[6:7] +; GFX10-NEXT: v_cmp_u_f64_e64 s4, v[2:3], v[6:7] +; GFX10-NEXT: v_cndmask_b32_e64 v0, v8, 0, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v1, v9, 0x7ff80000, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v2, v4, 0, s4 +; GFX10-NEXT: v_cndmask_b32_e64 v3, v5, 0x7ff80000, s4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: v_minimum_v2f64: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_cmp_lt_f64_e32 vcc_lo, v[0:1], v[4:5] -; GFX11-NEXT: v_cmp_o_f64_e64 s1, v[0:1], v[4:5] -; GFX11-NEXT: v_cmp_lt_f64_e64 s0, v[2:3], v[6:7] -; GFX11-NEXT: v_cmp_o_f64_e64 s2, v[2:3], v[6:7] -; GFX11-NEXT: v_cndmask_b32_e32 v8, v5, v1, vcc_lo -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_cndmask_b32_e64 v10, v7, v3, s0 -; GFX11-NEXT: v_cndmask_b32_e64 v13, v6, v2, s0 -; GFX11-NEXT: v_cmp_class_f64_e64 s0, v[2:3], 32 -; GFX11-NEXT: v_cndmask_b32_e64 v9, 0x7ff80000, v8, s1 -; GFX11-NEXT: v_cndmask_b32_e32 v12, v4, v0, vcc_lo -; GFX11-NEXT: v_cmp_class_f64_e64 vcc_lo, v[0:1], 32 -; GFX11-NEXT: v_cndmask_b32_e64 v11, 0x7ff80000, v10, s2 -; GFX11-NEXT: v_cndmask_b32_e64 v10, 0, v13, s2 -; GFX11-NEXT: v_cmp_class_f64_e64 s2, v[6:7], 32 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-NEXT: v_cmp_eq_f64_e64 s4, 0, v[10:11] -; GFX11-NEXT: v_cndmask_b32_e64 v2, v10, v2, s0 -; GFX11-NEXT: v_cndmask_b32_e64 v3, v11, v3, s0 -; GFX11-NEXT: v_cndmask_b32_e32 v1, v9, v1, vcc_lo -; GFX11-NEXT: v_cndmask_b32_e64 v8, 0, v12, s1 -; GFX11-NEXT: v_cmp_class_f64_e64 s1, v[4:5], 32 -; GFX11-NEXT: v_cndmask_b32_e64 v2, v2, v6, s2 -; GFX11-NEXT: v_cndmask_b32_e64 v3, v3, v7, s2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_cmp_eq_f64_e64 s3, 0, v[8:9] -; GFX11-NEXT: v_cndmask_b32_e32 v0, v8, v0, vcc_lo -; GFX11-NEXT: v_cndmask_b32_e64 v2, v10, v2, s4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | 
instid1(VALU_DEP_3) -; GFX11-NEXT: v_cndmask_b32_e64 v3, v11, v3, s4 -; GFX11-NEXT: v_cndmask_b32_e64 v0, v0, v4, s1 -; GFX11-NEXT: v_cndmask_b32_e64 v1, v1, v5, s1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cndmask_b32_e64 v0, v8, v0, s3 -; GFX11-NEXT: v_cndmask_b32_e64 v1, v9, v1, s3 +; GFX11-NEXT: v_min_f64 v[8:9], v[0:1], v[4:5] +; GFX11-NEXT: v_cmp_u_f64_e32 vcc_lo, v[0:1], v[4:5] +; GFX11-NEXT: v_min_f64 v[4:5], v[2:3], v[6:7] +; GFX11-NEXT: v_cmp_u_f64_e64 s0, v[2:3], v[6:7] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_cndmask_b32_e64 v0, v8, 0, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e64 v1, v9, 0x7ff80000, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e64 v2, v4, 0, s0 +; GFX11-NEXT: v_cndmask_b32_e64 v3, v5, 0x7ff80000, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-LABEL: v_minimum_v2f64: @@ -765,182 +630,43 @@ define <2 x double> @v_minimum_v2f64__nnan(<2 x double> %src0, <2 x double> %src ; GFX7-LABEL: v_minimum_v2f64__nnan: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_cmp_lt_f64_e32 vcc, v[0:1], v[4:5] -; GFX7-NEXT: v_cmp_lt_f64_e64 s[4:5], v[2:3], v[6:7] -; GFX7-NEXT: v_cmp_class_f64_e64 s[6:7], v[4:5], 32 -; GFX7-NEXT: v_cmp_class_f64_e64 s[10:11], v[6:7], 32 -; GFX7-NEXT: v_cndmask_b32_e32 v9, v5, v1, vcc -; GFX7-NEXT: v_cndmask_b32_e32 v8, v4, v0, vcc -; GFX7-NEXT: v_cmp_class_f64_e64 vcc, v[0:1], 32 -; GFX7-NEXT: v_cndmask_b32_e64 v11, v7, v3, s[4:5] -; GFX7-NEXT: v_cndmask_b32_e64 v10, v6, v2, s[4:5] -; GFX7-NEXT: v_cmp_class_f64_e64 s[4:5], v[2:3], 32 -; GFX7-NEXT: v_cmp_eq_f64_e64 s[8:9], 0, v[8:9] -; GFX7-NEXT: v_cmp_eq_f64_e64 s[12:13], 0, v[10:11] -; GFX7-NEXT: v_cndmask_b32_e32 v0, v8, v0, vcc -; GFX7-NEXT: v_cndmask_b32_e32 v1, v9, v1, vcc -; GFX7-NEXT: v_cndmask_b32_e64 v0, v0, v4, s[6:7] -; GFX7-NEXT: v_cndmask_b32_e64 v2, v10, v2, s[4:5] -; GFX7-NEXT: v_cndmask_b32_e64 v3, v11, v3, s[4:5] 
-; GFX7-NEXT: v_cndmask_b32_e64 v2, v2, v6, s[10:11] -; GFX7-NEXT: v_cndmask_b32_e64 v1, v1, v5, s[6:7] -; GFX7-NEXT: v_cndmask_b32_e64 v3, v3, v7, s[10:11] -; GFX7-NEXT: v_cndmask_b32_e64 v0, v8, v0, s[8:9] -; GFX7-NEXT: v_cndmask_b32_e64 v1, v9, v1, s[8:9] -; GFX7-NEXT: v_cndmask_b32_e64 v2, v10, v2, s[12:13] -; GFX7-NEXT: v_cndmask_b32_e64 v3, v11, v3, s[12:13] +; GFX7-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5] +; GFX7-NEXT: v_min_f64 v[2:3], v[2:3], v[6:7] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_minimum_v2f64__nnan: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_cmp_lt_f64_e32 vcc, v[0:1], v[4:5] -; GFX8-NEXT: v_cmp_lt_f64_e64 s[4:5], v[2:3], v[6:7] -; GFX8-NEXT: v_cmp_class_f64_e64 s[6:7], v[4:5], 32 -; GFX8-NEXT: v_cmp_class_f64_e64 s[10:11], v[6:7], 32 -; GFX8-NEXT: v_cndmask_b32_e32 v9, v5, v1, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v8, v4, v0, vcc -; GFX8-NEXT: v_cmp_class_f64_e64 vcc, v[0:1], 32 -; GFX8-NEXT: v_cndmask_b32_e64 v11, v7, v3, s[4:5] -; GFX8-NEXT: v_cndmask_b32_e64 v10, v6, v2, s[4:5] -; GFX8-NEXT: v_cmp_class_f64_e64 s[4:5], v[2:3], 32 -; GFX8-NEXT: v_cmp_eq_f64_e64 s[8:9], 0, v[8:9] -; GFX8-NEXT: v_cmp_eq_f64_e64 s[12:13], 0, v[10:11] -; GFX8-NEXT: v_cndmask_b32_e32 v0, v8, v0, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v1, v9, v1, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v0, v0, v4, s[6:7] -; GFX8-NEXT: v_cndmask_b32_e64 v2, v10, v2, s[4:5] -; GFX8-NEXT: v_cndmask_b32_e64 v3, v11, v3, s[4:5] -; GFX8-NEXT: v_cndmask_b32_e64 v2, v2, v6, s[10:11] -; GFX8-NEXT: v_cndmask_b32_e64 v1, v1, v5, s[6:7] -; GFX8-NEXT: v_cndmask_b32_e64 v3, v3, v7, s[10:11] -; GFX8-NEXT: v_cndmask_b32_e64 v0, v8, v0, s[8:9] -; GFX8-NEXT: v_cndmask_b32_e64 v1, v9, v1, s[8:9] -; GFX8-NEXT: v_cndmask_b32_e64 v2, v10, v2, s[12:13] -; GFX8-NEXT: v_cndmask_b32_e64 v3, v11, v3, s[12:13] +; GFX8-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5] +; GFX8-NEXT: v_min_f64 v[2:3], v[2:3], v[6:7] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: 
v_minimum_v2f64__nnan: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_cmp_lt_f64_e32 vcc, v[0:1], v[4:5] -; GFX9-NEXT: v_cmp_lt_f64_e64 s[4:5], v[2:3], v[6:7] -; GFX9-NEXT: v_cmp_class_f64_e64 s[6:7], v[4:5], 32 -; GFX9-NEXT: v_cmp_class_f64_e64 s[10:11], v[6:7], 32 -; GFX9-NEXT: v_cndmask_b32_e32 v9, v5, v1, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v8, v4, v0, vcc -; GFX9-NEXT: v_cmp_class_f64_e64 vcc, v[0:1], 32 -; GFX9-NEXT: v_cndmask_b32_e64 v11, v7, v3, s[4:5] -; GFX9-NEXT: v_cndmask_b32_e64 v10, v6, v2, s[4:5] -; GFX9-NEXT: v_cmp_class_f64_e64 s[4:5], v[2:3], 32 -; GFX9-NEXT: v_cmp_eq_f64_e64 s[8:9], 0, v[8:9] -; GFX9-NEXT: v_cmp_eq_f64_e64 s[12:13], 0, v[10:11] -; GFX9-NEXT: v_cndmask_b32_e32 v0, v8, v0, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v1, v9, v1, vcc -; GFX9-NEXT: v_cndmask_b32_e64 v0, v0, v4, s[6:7] -; GFX9-NEXT: v_cndmask_b32_e64 v2, v10, v2, s[4:5] -; GFX9-NEXT: v_cndmask_b32_e64 v3, v11, v3, s[4:5] -; GFX9-NEXT: v_cndmask_b32_e64 v2, v2, v6, s[10:11] -; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, v5, s[6:7] -; GFX9-NEXT: v_cndmask_b32_e64 v3, v3, v7, s[10:11] -; GFX9-NEXT: v_cndmask_b32_e64 v0, v8, v0, s[8:9] -; GFX9-NEXT: v_cndmask_b32_e64 v1, v9, v1, s[8:9] -; GFX9-NEXT: v_cndmask_b32_e64 v2, v10, v2, s[12:13] -; GFX9-NEXT: v_cndmask_b32_e64 v3, v11, v3, s[12:13] +; GFX9-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5] +; GFX9-NEXT: v_min_f64 v[2:3], v[2:3], v[6:7] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: v_minimum_v2f64__nnan: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_cmp_lt_f64_e32 vcc, v[0:1], v[4:5] -; GFX940-NEXT: v_cmp_class_f64_e64 s[0:1], v[4:5], 32 -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e32 v9, v5, v1, vcc -; GFX940-NEXT: v_cndmask_b32_e32 v8, v4, v0, vcc -; GFX940-NEXT: v_cmp_class_f64_e64 vcc, v[0:1], 32 -; GFX940-NEXT: v_cmp_eq_f64_e64 s[2:3], 0, v[8:9] -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e32 v0, v8, v0, vcc -; GFX940-NEXT: 
v_cndmask_b32_e32 v1, v9, v1, vcc -; GFX940-NEXT: v_cmp_lt_f64_e32 vcc, v[2:3], v[6:7] -; GFX940-NEXT: v_cndmask_b32_e64 v0, v0, v4, s[0:1] -; GFX940-NEXT: v_cndmask_b32_e64 v1, v1, v5, s[0:1] -; GFX940-NEXT: v_cndmask_b32_e32 v5, v7, v3, vcc -; GFX940-NEXT: v_cndmask_b32_e32 v4, v6, v2, vcc -; GFX940-NEXT: v_cmp_class_f64_e64 vcc, v[2:3], 32 -; GFX940-NEXT: v_cmp_class_f64_e64 s[0:1], v[6:7], 32 -; GFX940-NEXT: v_cndmask_b32_e64 v0, v8, v0, s[2:3] -; GFX940-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc -; GFX940-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc -; GFX940-NEXT: v_cndmask_b32_e64 v1, v9, v1, s[2:3] -; GFX940-NEXT: v_cndmask_b32_e64 v2, v2, v6, s[0:1] -; GFX940-NEXT: v_cmp_eq_f64_e64 s[2:3], 0, v[4:5] -; GFX940-NEXT: v_cndmask_b32_e64 v3, v3, v7, s[0:1] -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e64 v2, v4, v2, s[2:3] -; GFX940-NEXT: v_cndmask_b32_e64 v3, v5, v3, s[2:3] +; GFX940-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5] +; GFX940-NEXT: v_min_f64 v[2:3], v[2:3], v[6:7] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_minimum_v2f64__nnan: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_cmp_lt_f64_e32 vcc_lo, v[0:1], v[4:5] -; GFX10-NEXT: v_cmp_lt_f64_e64 s4, v[2:3], v[6:7] -; GFX10-NEXT: v_cmp_class_f64_e64 s5, v[4:5], 32 -; GFX10-NEXT: v_cmp_class_f64_e64 s6, v[6:7], 32 -; GFX10-NEXT: v_cndmask_b32_e32 v9, v5, v1, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v11, v7, v3, s4 -; GFX10-NEXT: v_cndmask_b32_e32 v8, v4, v0, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v10, v6, v2, s4 -; GFX10-NEXT: v_cmp_class_f64_e64 vcc_lo, v[0:1], 32 -; GFX10-NEXT: v_cmp_class_f64_e64 s4, v[2:3], 32 -; GFX10-NEXT: v_cmp_eq_f64_e64 s7, 0, v[8:9] -; GFX10-NEXT: v_cmp_eq_f64_e64 s8, 0, v[10:11] -; GFX10-NEXT: v_cndmask_b32_e32 v0, v8, v0, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v2, v10, v2, s4 -; GFX10-NEXT: v_cndmask_b32_e32 v1, v9, v1, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v3, v11, v3, s4 -; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, v4, s5 -; 
GFX10-NEXT: v_cndmask_b32_e64 v2, v2, v6, s6 -; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, v5, s5 -; GFX10-NEXT: v_cndmask_b32_e64 v3, v3, v7, s6 -; GFX10-NEXT: v_cndmask_b32_e64 v0, v8, v0, s7 -; GFX10-NEXT: v_cndmask_b32_e64 v2, v10, v2, s8 -; GFX10-NEXT: v_cndmask_b32_e64 v1, v9, v1, s7 -; GFX10-NEXT: v_cndmask_b32_e64 v3, v11, v3, s8 +; GFX10-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5] +; GFX10-NEXT: v_min_f64 v[2:3], v[2:3], v[6:7] ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: v_minimum_v2f64__nnan: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_cmp_lt_f64_e32 vcc_lo, v[0:1], v[4:5] -; GFX11-NEXT: v_cmp_lt_f64_e64 s0, v[2:3], v[6:7] -; GFX11-NEXT: v_cmp_class_f64_e64 s1, v[4:5], 32 -; GFX11-NEXT: v_cmp_class_f64_e64 s2, v[6:7], 32 -; GFX11-NEXT: v_dual_cndmask_b32 v9, v5, v1 :: v_dual_cndmask_b32 v8, v4, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_cndmask_b32_e64 v11, v7, v3, s0 -; GFX11-NEXT: v_cndmask_b32_e64 v10, v6, v2, s0 -; GFX11-NEXT: v_cmp_class_f64_e64 vcc_lo, v[0:1], 32 -; GFX11-NEXT: v_cmp_class_f64_e64 s0, v[2:3], 32 -; GFX11-NEXT: v_cmp_eq_f64_e64 s3, 0, v[8:9] -; GFX11-NEXT: v_cmp_eq_f64_e64 s4, 0, v[10:11] -; GFX11-NEXT: v_dual_cndmask_b32 v0, v8, v0 :: v_dual_cndmask_b32 v1, v9, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_cndmask_b32_e64 v2, v10, v2, s0 -; GFX11-NEXT: v_cndmask_b32_e64 v3, v11, v3, s0 -; GFX11-NEXT: v_cndmask_b32_e64 v0, v0, v4, s1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_cndmask_b32_e64 v1, v1, v5, s1 -; GFX11-NEXT: v_cndmask_b32_e64 v2, v2, v6, s2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_cndmask_b32_e64 v3, v3, v7, s2 -; GFX11-NEXT: v_cndmask_b32_e64 v0, v8, v0, s3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | 
instid1(VALU_DEP_4) -; GFX11-NEXT: v_cndmask_b32_e64 v1, v9, v1, s3 -; GFX11-NEXT: v_cndmask_b32_e64 v2, v10, v2, s4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-NEXT: v_cndmask_b32_e64 v3, v11, v3, s4 +; GFX11-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5] +; GFX11-NEXT: v_min_f64 v[2:3], v[2:3], v[6:7] ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-LABEL: v_minimum_v2f64__nnan: @@ -961,111 +687,86 @@ define <2 x double> @v_minimum_v2f64__nsz(<2 x double> %src0, <2 x double> %src1 ; GFX7-LABEL: v_minimum_v2f64__nsz: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_cmp_lt_f64_e32 vcc, v[0:1], v[4:5] -; GFX7-NEXT: v_cmp_lt_f64_e64 s[6:7], v[2:3], v[6:7] -; GFX7-NEXT: v_cmp_o_f64_e64 s[4:5], v[0:1], v[4:5] -; GFX7-NEXT: v_cmp_o_f64_e64 s[8:9], v[2:3], v[6:7] -; GFX7-NEXT: v_cndmask_b32_e32 v8, v4, v0, vcc -; GFX7-NEXT: v_cndmask_b32_e64 v4, v6, v2, s[6:7] -; GFX7-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc -; GFX7-NEXT: v_mov_b32_e32 v5, 0x7ff80000 -; GFX7-NEXT: v_cndmask_b32_e64 v3, v7, v3, s[6:7] -; GFX7-NEXT: v_cndmask_b32_e64 v0, 0, v8, s[4:5] -; GFX7-NEXT: v_cndmask_b32_e64 v1, v5, v1, s[4:5] -; GFX7-NEXT: v_cndmask_b32_e64 v2, 0, v4, s[8:9] -; GFX7-NEXT: v_cndmask_b32_e64 v3, v5, v3, s[8:9] +; GFX7-NEXT: v_min_f64 v[8:9], v[0:1], v[4:5] +; GFX7-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[4:5] +; GFX7-NEXT: v_min_f64 v[4:5], v[2:3], v[6:7] +; GFX7-NEXT: v_cmp_u_f64_e64 s[4:5], v[2:3], v[6:7] +; GFX7-NEXT: v_mov_b32_e32 v3, 0x7ff80000 +; GFX7-NEXT: v_cndmask_b32_e64 v0, v8, 0, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v1, v9, v3, vcc +; GFX7-NEXT: v_cndmask_b32_e64 v2, v4, 0, s[4:5] +; GFX7-NEXT: v_cndmask_b32_e64 v3, v5, v3, s[4:5] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_minimum_v2f64__nsz: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_cmp_lt_f64_e32 vcc, v[0:1], v[4:5] -; GFX8-NEXT: v_cmp_lt_f64_e64 s[6:7], v[2:3], v[6:7] -; GFX8-NEXT: v_cmp_o_f64_e64 s[4:5], v[0:1], v[4:5] -; GFX8-NEXT: 
v_cmp_o_f64_e64 s[8:9], v[2:3], v[6:7] -; GFX8-NEXT: v_cndmask_b32_e32 v8, v4, v0, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v4, v6, v2, s[6:7] -; GFX8-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc -; GFX8-NEXT: v_mov_b32_e32 v5, 0x7ff80000 -; GFX8-NEXT: v_cndmask_b32_e64 v3, v7, v3, s[6:7] -; GFX8-NEXT: v_cndmask_b32_e64 v0, 0, v8, s[4:5] -; GFX8-NEXT: v_cndmask_b32_e64 v1, v5, v1, s[4:5] -; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, v4, s[8:9] -; GFX8-NEXT: v_cndmask_b32_e64 v3, v5, v3, s[8:9] +; GFX8-NEXT: v_min_f64 v[8:9], v[0:1], v[4:5] +; GFX8-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[4:5] +; GFX8-NEXT: v_min_f64 v[4:5], v[2:3], v[6:7] +; GFX8-NEXT: v_cmp_u_f64_e64 s[4:5], v[2:3], v[6:7] +; GFX8-NEXT: v_mov_b32_e32 v3, 0x7ff80000 +; GFX8-NEXT: v_cndmask_b32_e64 v0, v8, 0, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v1, v9, v3, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v2, v4, 0, s[4:5] +; GFX8-NEXT: v_cndmask_b32_e64 v3, v5, v3, s[4:5] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_minimum_v2f64__nsz: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_cmp_lt_f64_e32 vcc, v[0:1], v[4:5] -; GFX9-NEXT: v_cmp_lt_f64_e64 s[6:7], v[2:3], v[6:7] -; GFX9-NEXT: v_cmp_o_f64_e64 s[4:5], v[0:1], v[4:5] -; GFX9-NEXT: v_cmp_o_f64_e64 s[8:9], v[2:3], v[6:7] -; GFX9-NEXT: v_cndmask_b32_e32 v8, v4, v0, vcc -; GFX9-NEXT: v_cndmask_b32_e64 v4, v6, v2, s[6:7] -; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc -; GFX9-NEXT: v_mov_b32_e32 v5, 0x7ff80000 -; GFX9-NEXT: v_cndmask_b32_e64 v3, v7, v3, s[6:7] -; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, v8, s[4:5] -; GFX9-NEXT: v_cndmask_b32_e64 v1, v5, v1, s[4:5] -; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, v4, s[8:9] -; GFX9-NEXT: v_cndmask_b32_e64 v3, v5, v3, s[8:9] +; GFX9-NEXT: v_min_f64 v[8:9], v[0:1], v[4:5] +; GFX9-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[4:5] +; GFX9-NEXT: v_min_f64 v[4:5], v[2:3], v[6:7] +; GFX9-NEXT: v_cmp_u_f64_e64 s[4:5], v[2:3], v[6:7] +; GFX9-NEXT: v_mov_b32_e32 v3, 0x7ff80000 +; GFX9-NEXT: v_cndmask_b32_e64 v0, v8, 0, 
vcc +; GFX9-NEXT: v_cndmask_b32_e32 v1, v9, v3, vcc +; GFX9-NEXT: v_cndmask_b32_e64 v2, v4, 0, s[4:5] +; GFX9-NEXT: v_cndmask_b32_e64 v3, v5, v3, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: v_minimum_v2f64__nsz: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_cmp_lt_f64_e32 vcc, v[0:1], v[4:5] -; GFX940-NEXT: v_cmp_o_f64_e64 s[0:1], v[0:1], v[4:5] -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e32 v8, v4, v0, vcc -; GFX940-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc -; GFX940-NEXT: v_mov_b32_e32 v4, 0x7ff80000 -; GFX940-NEXT: v_cmp_lt_f64_e32 vcc, v[2:3], v[6:7] -; GFX940-NEXT: v_cndmask_b32_e64 v0, 0, v8, s[0:1] -; GFX940-NEXT: v_cndmask_b32_e64 v1, v4, v1, s[0:1] -; GFX940-NEXT: v_cndmask_b32_e32 v5, v6, v2, vcc -; GFX940-NEXT: v_cmp_o_f64_e64 s[0:1], v[2:3], v[6:7] -; GFX940-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc +; GFX940-NEXT: v_min_f64 v[8:9], v[0:1], v[4:5] +; GFX940-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[4:5] +; GFX940-NEXT: v_min_f64 v[4:5], v[2:3], v[6:7] ; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e64 v2, 0, v5, s[0:1] -; GFX940-NEXT: v_cndmask_b32_e64 v3, v4, v3, s[0:1] +; GFX940-NEXT: v_cndmask_b32_e64 v0, v8, 0, vcc +; GFX940-NEXT: v_mov_b32_e32 v8, 0x7ff80000 +; GFX940-NEXT: v_cndmask_b32_e32 v1, v9, v8, vcc +; GFX940-NEXT: v_cmp_u_f64_e32 vcc, v[2:3], v[6:7] +; GFX940-NEXT: s_nop 1 +; GFX940-NEXT: v_cndmask_b32_e64 v2, v4, 0, vcc +; GFX940-NEXT: v_cndmask_b32_e32 v3, v5, v8, vcc ; GFX940-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_minimum_v2f64__nsz: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_cmp_lt_f64_e32 vcc_lo, v[0:1], v[4:5] -; GFX10-NEXT: v_cmp_lt_f64_e64 s4, v[2:3], v[6:7] -; GFX10-NEXT: v_cmp_o_f64_e64 s5, v[0:1], v[4:5] -; GFX10-NEXT: v_cmp_o_f64_e64 s6, v[2:3], v[6:7] -; GFX10-NEXT: v_cndmask_b32_e32 v8, v4, v0, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v9, v6, v2, s4 -; GFX10-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc_lo -; 
GFX10-NEXT: v_cndmask_b32_e64 v3, v7, v3, s4 -; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, v8, s5 -; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, v9, s6 -; GFX10-NEXT: v_cndmask_b32_e64 v1, 0x7ff80000, v1, s5 -; GFX10-NEXT: v_cndmask_b32_e64 v3, 0x7ff80000, v3, s6 +; GFX10-NEXT: v_min_f64 v[8:9], v[0:1], v[4:5] +; GFX10-NEXT: v_cmp_u_f64_e32 vcc_lo, v[0:1], v[4:5] +; GFX10-NEXT: v_min_f64 v[4:5], v[2:3], v[6:7] +; GFX10-NEXT: v_cmp_u_f64_e64 s4, v[2:3], v[6:7] +; GFX10-NEXT: v_cndmask_b32_e64 v0, v8, 0, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v1, v9, 0x7ff80000, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v2, v4, 0, s4 +; GFX10-NEXT: v_cndmask_b32_e64 v3, v5, 0x7ff80000, s4 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: v_minimum_v2f64__nsz: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_cmp_lt_f64_e32 vcc_lo, v[0:1], v[4:5] -; GFX11-NEXT: v_cmp_lt_f64_e64 s0, v[2:3], v[6:7] -; GFX11-NEXT: v_cmp_o_f64_e64 s1, v[0:1], v[4:5] -; GFX11-NEXT: v_cmp_o_f64_e64 s2, v[2:3], v[6:7] -; GFX11-NEXT: v_dual_cndmask_b32 v8, v4, v0 :: v_dual_cndmask_b32 v1, v5, v1 +; GFX11-NEXT: v_min_f64 v[8:9], v[0:1], v[4:5] +; GFX11-NEXT: v_cmp_u_f64_e32 vcc_lo, v[0:1], v[4:5] +; GFX11-NEXT: v_min_f64 v[4:5], v[2:3], v[6:7] +; GFX11-NEXT: v_cmp_u_f64_e64 s0, v[2:3], v[6:7] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_cndmask_b32_e64 v9, v6, v2, s0 -; GFX11-NEXT: v_cndmask_b32_e64 v3, v7, v3, s0 -; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, v8, s1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_cndmask_b32_e64 v1, 0x7ff80000, v1, s1 -; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, v9, s2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-NEXT: v_cndmask_b32_e64 v3, 0x7ff80000, v3, s2 +; GFX11-NEXT: v_cndmask_b32_e64 v0, v8, 0, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e64 v1, v9, 0x7ff80000, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e64 v2, v4, 0, s0 +; GFX11-NEXT: 
v_cndmask_b32_e64 v3, v5, 0x7ff80000, s0 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-LABEL: v_minimum_v2f64__nsz: @@ -1086,69 +787,43 @@ define <2 x double> @v_minimum_v2f64__nnan_nsz(<2 x double> %src0, <2 x double> ; GFX7-LABEL: v_minimum_v2f64__nnan_nsz: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_cmp_lt_f64_e32 vcc, v[0:1], v[4:5] -; GFX7-NEXT: v_cmp_lt_f64_e64 s[4:5], v[2:3], v[6:7] -; GFX7-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc -; GFX7-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc -; GFX7-NEXT: v_cndmask_b32_e64 v2, v6, v2, s[4:5] -; GFX7-NEXT: v_cndmask_b32_e64 v3, v7, v3, s[4:5] +; GFX7-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5] +; GFX7-NEXT: v_min_f64 v[2:3], v[2:3], v[6:7] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_minimum_v2f64__nnan_nsz: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_cmp_lt_f64_e32 vcc, v[0:1], v[4:5] -; GFX8-NEXT: v_cmp_lt_f64_e64 s[4:5], v[2:3], v[6:7] -; GFX8-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v2, v6, v2, s[4:5] -; GFX8-NEXT: v_cndmask_b32_e64 v3, v7, v3, s[4:5] +; GFX8-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5] +; GFX8-NEXT: v_min_f64 v[2:3], v[2:3], v[6:7] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_minimum_v2f64__nnan_nsz: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_cmp_lt_f64_e32 vcc, v[0:1], v[4:5] -; GFX9-NEXT: v_cmp_lt_f64_e64 s[4:5], v[2:3], v[6:7] -; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc -; GFX9-NEXT: v_cndmask_b32_e64 v2, v6, v2, s[4:5] -; GFX9-NEXT: v_cndmask_b32_e64 v3, v7, v3, s[4:5] +; GFX9-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5] +; GFX9-NEXT: v_min_f64 v[2:3], v[2:3], v[6:7] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: v_minimum_v2f64__nnan_nsz: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_cmp_lt_f64_e32 
vcc, v[0:1], v[4:5] -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc -; GFX940-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc -; GFX940-NEXT: v_cmp_lt_f64_e32 vcc, v[2:3], v[6:7] -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc -; GFX940-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc +; GFX940-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5] +; GFX940-NEXT: v_min_f64 v[2:3], v[2:3], v[6:7] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_minimum_v2f64__nnan_nsz: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_cmp_lt_f64_e32 vcc_lo, v[0:1], v[4:5] -; GFX10-NEXT: v_cmp_lt_f64_e64 s4, v[2:3], v[6:7] -; GFX10-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v2, v6, v2, s4 -; GFX10-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v3, v7, v3, s4 +; GFX10-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5] +; GFX10-NEXT: v_min_f64 v[2:3], v[2:3], v[6:7] ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: v_minimum_v2f64__nnan_nsz: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_cmp_lt_f64_e32 vcc_lo, v[0:1], v[4:5] -; GFX11-NEXT: v_cmp_lt_f64_e64 s0, v[2:3], v[6:7] -; GFX11-NEXT: v_dual_cndmask_b32 v0, v4, v0 :: v_dual_cndmask_b32 v1, v5, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-NEXT: v_cndmask_b32_e64 v2, v6, v2, s0 -; GFX11-NEXT: v_cndmask_b32_e64 v3, v7, v3, s0 +; GFX11-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5] +; GFX11-NEXT: v_min_f64 v[2:3], v[2:3], v[6:7] ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-LABEL: v_minimum_v2f64__nnan_nsz: @@ -1170,61 +845,20 @@ define void @s_minimum_v2f64(<2 x double> inreg %src0, <2 x double> inreg %src1) ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s10 +; GFX7-NEXT: v_mov_b32_e32 v4, s8 ; GFX7-NEXT: v_mov_b32_e32 v1, s11 -; GFX7-NEXT: v_cmp_lt_f64_e32 vcc, s[6:7], v[0:1] -; GFX7-NEXT: v_cmp_o_f64_e64 s[12:13], 
s[6:7], v[0:1] -; GFX7-NEXT: v_cmp_class_f64_e64 s[18:19], s[10:11], 32 -; GFX7-NEXT: v_mov_b32_e32 v0, s8 -; GFX7-NEXT: v_mov_b32_e32 v1, s9 -; GFX7-NEXT: s_and_b64 s[14:15], vcc, exec -; GFX7-NEXT: s_cselect_b32 s16, s7, s11 -; GFX7-NEXT: s_and_b64 s[14:15], s[12:13], exec -; GFX7-NEXT: s_cselect_b32 s15, s16, 0x7ff80000 -; GFX7-NEXT: s_and_b64 s[16:17], vcc, exec -; GFX7-NEXT: s_cselect_b32 s14, s6, s10 -; GFX7-NEXT: v_cmp_class_f64_e64 s[16:17], s[6:7], 32 -; GFX7-NEXT: s_and_b64 s[12:13], s[12:13], exec -; GFX7-NEXT: s_cselect_b32 s14, s14, 0 -; GFX7-NEXT: v_cmp_eq_f64_e64 s[20:21], s[14:15], 0 -; GFX7-NEXT: v_cmp_lt_f64_e32 vcc, s[4:5], v[0:1] -; GFX7-NEXT: s_and_b64 s[12:13], s[16:17], exec -; GFX7-NEXT: s_cselect_b32 s7, s7, s15 -; GFX7-NEXT: s_and_b64 s[12:13], s[18:19], exec -; GFX7-NEXT: s_cselect_b32 s7, s11, s7 -; GFX7-NEXT: s_and_b64 s[12:13], s[20:21], exec -; GFX7-NEXT: s_cselect_b32 s7, s7, s15 -; GFX7-NEXT: s_and_b64 s[12:13], s[16:17], exec -; GFX7-NEXT: v_cmp_o_f64_e64 s[12:13], s[4:5], v[0:1] -; GFX7-NEXT: s_cselect_b32 s6, s6, s14 -; GFX7-NEXT: s_and_b64 s[16:17], s[18:19], exec -; GFX7-NEXT: s_cselect_b32 s6, s10, s6 -; GFX7-NEXT: s_and_b64 s[10:11], s[20:21], exec -; GFX7-NEXT: s_cselect_b32 s6, s6, s14 -; GFX7-NEXT: s_and_b64 s[10:11], vcc, exec -; GFX7-NEXT: s_cselect_b32 s14, s5, s9 -; GFX7-NEXT: s_and_b64 s[10:11], s[12:13], exec -; GFX7-NEXT: s_cselect_b32 s11, s14, 0x7ff80000 -; GFX7-NEXT: s_and_b64 s[14:15], vcc, exec -; GFX7-NEXT: s_cselect_b32 s10, s4, s8 -; GFX7-NEXT: v_cmp_class_f64_e64 s[14:15], s[4:5], 32 -; GFX7-NEXT: s_and_b64 s[12:13], s[12:13], exec -; GFX7-NEXT: v_cmp_class_f64_e64 s[12:13], s[8:9], 32 -; GFX7-NEXT: s_cselect_b32 s10, s10, 0 -; GFX7-NEXT: v_cmp_eq_f64_e64 s[16:17], s[10:11], 0 -; GFX7-NEXT: s_and_b64 s[18:19], s[14:15], exec -; GFX7-NEXT: s_cselect_b32 s5, s5, s11 -; GFX7-NEXT: s_and_b64 s[18:19], s[12:13], exec -; GFX7-NEXT: s_cselect_b32 s5, s9, s5 -; GFX7-NEXT: s_and_b64 s[18:19], s[16:17], exec -; 
GFX7-NEXT: s_cselect_b32 s5, s5, s11 -; GFX7-NEXT: s_and_b64 s[14:15], s[14:15], exec -; GFX7-NEXT: s_cselect_b32 s4, s4, s10 -; GFX7-NEXT: s_and_b64 s[12:13], s[12:13], exec -; GFX7-NEXT: s_cselect_b32 s4, s8, s4 -; GFX7-NEXT: s_and_b64 s[8:9], s[16:17], exec -; GFX7-NEXT: s_cselect_b32 s4, s4, s10 +; GFX7-NEXT: v_mov_b32_e32 v5, s9 +; GFX7-NEXT: v_min_f64 v[2:3], s[6:7], v[0:1] +; GFX7-NEXT: v_cmp_u_f64_e32 vcc, s[6:7], v[0:1] +; GFX7-NEXT: v_min_f64 v[0:1], s[4:5], v[4:5] +; GFX7-NEXT: v_cmp_u_f64_e64 s[4:5], s[4:5], v[4:5] +; GFX7-NEXT: v_mov_b32_e32 v6, 0x7ff80000 +; GFX7-NEXT: v_cndmask_b32_e32 v3, v3, v6, vcc +; GFX7-NEXT: v_cndmask_b32_e64 v2, v2, 0, vcc +; GFX7-NEXT: v_cndmask_b32_e64 v1, v1, v6, s[4:5] +; GFX7-NEXT: v_cndmask_b32_e64 v0, v0, 0, s[4:5] ; GFX7-NEXT: ;;#ASMSTART -; GFX7-NEXT: ; use s[4:7] +; GFX7-NEXT: ; use v[0:3] ; GFX7-NEXT: ;;#ASMEND ; GFX7-NEXT: s_setpc_b64 s[30:31] ; @@ -1232,61 +866,20 @@ define void @s_minimum_v2f64(<2 x double> inreg %src0, <2 x double> inreg %src1) ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s10 +; GFX8-NEXT: v_mov_b32_e32 v4, s8 ; GFX8-NEXT: v_mov_b32_e32 v1, s11 -; GFX8-NEXT: v_cmp_lt_f64_e32 vcc, s[6:7], v[0:1] -; GFX8-NEXT: v_cmp_o_f64_e64 s[12:13], s[6:7], v[0:1] -; GFX8-NEXT: v_cmp_class_f64_e64 s[18:19], s[10:11], 32 -; GFX8-NEXT: v_mov_b32_e32 v0, s8 -; GFX8-NEXT: v_mov_b32_e32 v1, s9 -; GFX8-NEXT: s_and_b64 s[14:15], vcc, exec -; GFX8-NEXT: s_cselect_b32 s16, s7, s11 -; GFX8-NEXT: s_and_b64 s[14:15], s[12:13], exec -; GFX8-NEXT: s_cselect_b32 s15, s16, 0x7ff80000 -; GFX8-NEXT: s_and_b64 s[16:17], vcc, exec -; GFX8-NEXT: s_cselect_b32 s14, s6, s10 -; GFX8-NEXT: v_cmp_class_f64_e64 s[16:17], s[6:7], 32 -; GFX8-NEXT: s_and_b64 s[12:13], s[12:13], exec -; GFX8-NEXT: s_cselect_b32 s14, s14, 0 -; GFX8-NEXT: v_cmp_eq_f64_e64 s[20:21], s[14:15], 0 -; GFX8-NEXT: v_cmp_lt_f64_e32 vcc, s[4:5], v[0:1] -; GFX8-NEXT: s_and_b64 s[12:13], s[16:17], exec -; 
GFX8-NEXT: s_cselect_b32 s7, s7, s15 -; GFX8-NEXT: s_and_b64 s[12:13], s[18:19], exec -; GFX8-NEXT: s_cselect_b32 s7, s11, s7 -; GFX8-NEXT: s_and_b64 s[12:13], s[20:21], exec -; GFX8-NEXT: s_cselect_b32 s7, s7, s15 -; GFX8-NEXT: s_and_b64 s[12:13], s[16:17], exec -; GFX8-NEXT: v_cmp_o_f64_e64 s[12:13], s[4:5], v[0:1] -; GFX8-NEXT: s_cselect_b32 s6, s6, s14 -; GFX8-NEXT: s_and_b64 s[16:17], s[18:19], exec -; GFX8-NEXT: s_cselect_b32 s6, s10, s6 -; GFX8-NEXT: s_and_b64 s[10:11], s[20:21], exec -; GFX8-NEXT: s_cselect_b32 s6, s6, s14 -; GFX8-NEXT: s_and_b64 s[10:11], vcc, exec -; GFX8-NEXT: s_cselect_b32 s14, s5, s9 -; GFX8-NEXT: s_and_b64 s[10:11], s[12:13], exec -; GFX8-NEXT: s_cselect_b32 s11, s14, 0x7ff80000 -; GFX8-NEXT: s_and_b64 s[14:15], vcc, exec -; GFX8-NEXT: s_cselect_b32 s10, s4, s8 -; GFX8-NEXT: v_cmp_class_f64_e64 s[14:15], s[4:5], 32 -; GFX8-NEXT: s_and_b64 s[12:13], s[12:13], exec -; GFX8-NEXT: v_cmp_class_f64_e64 s[12:13], s[8:9], 32 -; GFX8-NEXT: s_cselect_b32 s10, s10, 0 -; GFX8-NEXT: v_cmp_eq_f64_e64 s[16:17], s[10:11], 0 -; GFX8-NEXT: s_and_b64 s[18:19], s[14:15], exec -; GFX8-NEXT: s_cselect_b32 s5, s5, s11 -; GFX8-NEXT: s_and_b64 s[18:19], s[12:13], exec -; GFX8-NEXT: s_cselect_b32 s5, s9, s5 -; GFX8-NEXT: s_and_b64 s[18:19], s[16:17], exec -; GFX8-NEXT: s_cselect_b32 s5, s5, s11 -; GFX8-NEXT: s_and_b64 s[14:15], s[14:15], exec -; GFX8-NEXT: s_cselect_b32 s4, s4, s10 -; GFX8-NEXT: s_and_b64 s[12:13], s[12:13], exec -; GFX8-NEXT: s_cselect_b32 s4, s8, s4 -; GFX8-NEXT: s_and_b64 s[8:9], s[16:17], exec -; GFX8-NEXT: s_cselect_b32 s4, s4, s10 +; GFX8-NEXT: v_mov_b32_e32 v5, s9 +; GFX8-NEXT: v_min_f64 v[2:3], s[6:7], v[0:1] +; GFX8-NEXT: v_cmp_u_f64_e32 vcc, s[6:7], v[0:1] +; GFX8-NEXT: v_min_f64 v[0:1], s[4:5], v[4:5] +; GFX8-NEXT: v_cmp_u_f64_e64 s[4:5], s[4:5], v[4:5] +; GFX8-NEXT: v_mov_b32_e32 v6, 0x7ff80000 +; GFX8-NEXT: v_cndmask_b32_e32 v3, v3, v6, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v2, v2, 0, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v1, v1, v6, 
s[4:5] +; GFX8-NEXT: v_cndmask_b32_e64 v0, v0, 0, s[4:5] ; GFX8-NEXT: ;;#ASMSTART -; GFX8-NEXT: ; use s[4:7] +; GFX8-NEXT: ; use v[0:3] ; GFX8-NEXT: ;;#ASMEND ; GFX8-NEXT: s_setpc_b64 s[30:31] ; @@ -1294,61 +887,20 @@ define void @s_minimum_v2f64(<2 x double> inreg %src0, <2 x double> inreg %src1) ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s10 +; GFX9-NEXT: v_mov_b32_e32 v4, s8 ; GFX9-NEXT: v_mov_b32_e32 v1, s11 -; GFX9-NEXT: v_cmp_lt_f64_e32 vcc, s[6:7], v[0:1] -; GFX9-NEXT: v_cmp_o_f64_e64 s[12:13], s[6:7], v[0:1] -; GFX9-NEXT: v_cmp_class_f64_e64 s[18:19], s[10:11], 32 -; GFX9-NEXT: v_mov_b32_e32 v0, s8 -; GFX9-NEXT: v_mov_b32_e32 v1, s9 -; GFX9-NEXT: s_and_b64 s[14:15], vcc, exec -; GFX9-NEXT: s_cselect_b32 s16, s7, s11 -; GFX9-NEXT: s_and_b64 s[14:15], s[12:13], exec -; GFX9-NEXT: s_cselect_b32 s15, s16, 0x7ff80000 -; GFX9-NEXT: s_and_b64 s[16:17], vcc, exec -; GFX9-NEXT: s_cselect_b32 s14, s6, s10 -; GFX9-NEXT: v_cmp_class_f64_e64 s[16:17], s[6:7], 32 -; GFX9-NEXT: s_and_b64 s[12:13], s[12:13], exec -; GFX9-NEXT: s_cselect_b32 s14, s14, 0 -; GFX9-NEXT: v_cmp_eq_f64_e64 s[20:21], s[14:15], 0 -; GFX9-NEXT: v_cmp_lt_f64_e32 vcc, s[4:5], v[0:1] -; GFX9-NEXT: s_and_b64 s[12:13], s[16:17], exec -; GFX9-NEXT: s_cselect_b32 s7, s7, s15 -; GFX9-NEXT: s_and_b64 s[12:13], s[18:19], exec -; GFX9-NEXT: s_cselect_b32 s7, s11, s7 -; GFX9-NEXT: s_and_b64 s[12:13], s[20:21], exec -; GFX9-NEXT: s_cselect_b32 s7, s7, s15 -; GFX9-NEXT: s_and_b64 s[12:13], s[16:17], exec -; GFX9-NEXT: v_cmp_o_f64_e64 s[12:13], s[4:5], v[0:1] -; GFX9-NEXT: s_cselect_b32 s6, s6, s14 -; GFX9-NEXT: s_and_b64 s[16:17], s[18:19], exec -; GFX9-NEXT: s_cselect_b32 s6, s10, s6 -; GFX9-NEXT: s_and_b64 s[10:11], s[20:21], exec -; GFX9-NEXT: s_cselect_b32 s6, s6, s14 -; GFX9-NEXT: s_and_b64 s[10:11], vcc, exec -; GFX9-NEXT: s_cselect_b32 s14, s5, s9 -; GFX9-NEXT: s_and_b64 s[10:11], s[12:13], exec -; GFX9-NEXT: s_cselect_b32 s11, s14, 0x7ff80000 -; 
GFX9-NEXT: s_and_b64 s[14:15], vcc, exec -; GFX9-NEXT: s_cselect_b32 s10, s4, s8 -; GFX9-NEXT: v_cmp_class_f64_e64 s[14:15], s[4:5], 32 -; GFX9-NEXT: s_and_b64 s[12:13], s[12:13], exec -; GFX9-NEXT: v_cmp_class_f64_e64 s[12:13], s[8:9], 32 -; GFX9-NEXT: s_cselect_b32 s10, s10, 0 -; GFX9-NEXT: v_cmp_eq_f64_e64 s[16:17], s[10:11], 0 -; GFX9-NEXT: s_and_b64 s[18:19], s[14:15], exec -; GFX9-NEXT: s_cselect_b32 s5, s5, s11 -; GFX9-NEXT: s_and_b64 s[18:19], s[12:13], exec -; GFX9-NEXT: s_cselect_b32 s5, s9, s5 -; GFX9-NEXT: s_and_b64 s[18:19], s[16:17], exec -; GFX9-NEXT: s_cselect_b32 s5, s5, s11 -; GFX9-NEXT: s_and_b64 s[14:15], s[14:15], exec -; GFX9-NEXT: s_cselect_b32 s4, s4, s10 -; GFX9-NEXT: s_and_b64 s[12:13], s[12:13], exec -; GFX9-NEXT: s_cselect_b32 s4, s8, s4 -; GFX9-NEXT: s_and_b64 s[8:9], s[16:17], exec -; GFX9-NEXT: s_cselect_b32 s4, s4, s10 +; GFX9-NEXT: v_mov_b32_e32 v5, s9 +; GFX9-NEXT: v_min_f64 v[2:3], s[6:7], v[0:1] +; GFX9-NEXT: v_cmp_u_f64_e32 vcc, s[6:7], v[0:1] +; GFX9-NEXT: v_min_f64 v[0:1], s[4:5], v[4:5] +; GFX9-NEXT: v_cmp_u_f64_e64 s[4:5], s[4:5], v[4:5] +; GFX9-NEXT: v_mov_b32_e32 v6, 0x7ff80000 +; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v6, vcc +; GFX9-NEXT: v_cndmask_b32_e64 v2, v2, 0, vcc +; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, v6, s[4:5] +; GFX9-NEXT: v_cndmask_b32_e64 v0, v0, 0, s[4:5] ; GFX9-NEXT: ;;#ASMSTART -; GFX9-NEXT: ; use s[4:7] +; GFX9-NEXT: ; use v[0:3] ; GFX9-NEXT: ;;#ASMEND ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -1356,179 +908,52 @@ define void @s_minimum_v2f64(<2 x double> inreg %src0, <2 x double> inreg %src1) ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[6:7] -; GFX940-NEXT: v_cmp_lt_f64_e32 vcc, s[2:3], v[0:1] -; GFX940-NEXT: s_and_b64 s[8:9], vcc, exec -; GFX940-NEXT: v_cmp_o_f64_e64 s[8:9], s[2:3], v[0:1] -; GFX940-NEXT: s_cselect_b32 s12, s3, s7 -; GFX940-NEXT: s_and_b64 s[10:11], s[8:9], exec -; GFX940-NEXT: s_cselect_b32 s11, s12, 0x7ff80000 -; 
GFX940-NEXT: s_and_b64 s[12:13], vcc, exec -; GFX940-NEXT: s_cselect_b32 s10, s2, s6 -; GFX940-NEXT: s_and_b64 s[8:9], s[8:9], exec -; GFX940-NEXT: v_cmp_class_f64_e64 s[12:13], s[2:3], 32 -; GFX940-NEXT: s_cselect_b32 s10, s10, 0 -; GFX940-NEXT: s_and_b64 s[14:15], s[12:13], exec -; GFX940-NEXT: v_cmp_class_f64_e64 s[14:15], s[6:7], 32 -; GFX940-NEXT: v_cmp_eq_f64_e64 s[8:9], s[10:11], 0 -; GFX940-NEXT: s_cselect_b32 s3, s3, s11 -; GFX940-NEXT: s_and_b64 s[16:17], s[14:15], exec -; GFX940-NEXT: s_cselect_b32 s3, s7, s3 -; GFX940-NEXT: s_and_b64 s[16:17], s[8:9], exec -; GFX940-NEXT: s_cselect_b32 s7, s3, s11 -; GFX940-NEXT: s_and_b64 s[12:13], s[12:13], exec -; GFX940-NEXT: s_cselect_b32 s11, s2, s10 -; GFX940-NEXT: s_and_b64 s[2:3], s[14:15], exec +; GFX940-NEXT: v_min_f64 v[2:3], s[2:3], v[0:1] +; GFX940-NEXT: v_mov_b32_e32 v6, 0x7ff80000 +; GFX940-NEXT: v_cmp_u_f64_e32 vcc, s[2:3], v[0:1] ; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[4:5] -; GFX940-NEXT: s_cselect_b32 s6, s6, s11 -; GFX940-NEXT: s_and_b64 s[2:3], s[8:9], exec -; GFX940-NEXT: v_cmp_lt_f64_e32 vcc, s[0:1], v[0:1] -; GFX940-NEXT: s_cselect_b32 s6, s6, s10 -; GFX940-NEXT: s_and_b64 s[2:3], vcc, exec -; GFX940-NEXT: v_cmp_o_f64_e64 s[2:3], s[0:1], v[0:1] -; GFX940-NEXT: s_cselect_b32 s10, s1, s5 -; GFX940-NEXT: s_and_b64 s[8:9], s[2:3], exec -; GFX940-NEXT: s_cselect_b32 s9, s10, 0x7ff80000 -; GFX940-NEXT: s_and_b64 s[10:11], vcc, exec -; GFX940-NEXT: s_cselect_b32 s8, s0, s4 -; GFX940-NEXT: s_and_b64 s[2:3], s[2:3], exec -; GFX940-NEXT: v_cmp_class_f64_e64 s[10:11], s[0:1], 32 -; GFX940-NEXT: s_cselect_b32 s8, s8, 0 -; GFX940-NEXT: s_and_b64 s[12:13], s[10:11], exec -; GFX940-NEXT: v_cmp_class_f64_e64 s[12:13], s[4:5], 32 -; GFX940-NEXT: v_cmp_eq_f64_e64 s[2:3], s[8:9], 0 -; GFX940-NEXT: s_cselect_b32 s1, s1, s9 -; GFX940-NEXT: s_and_b64 s[14:15], s[12:13], exec -; GFX940-NEXT: s_cselect_b32 s1, s5, s1 -; GFX940-NEXT: s_and_b64 s[14:15], s[2:3], exec -; GFX940-NEXT: s_cselect_b32 s5, s1, s9 -; 
GFX940-NEXT: s_and_b64 s[10:11], s[10:11], exec -; GFX940-NEXT: s_cselect_b32 s9, s0, s8 -; GFX940-NEXT: s_and_b64 s[0:1], s[12:13], exec -; GFX940-NEXT: s_cselect_b32 s4, s4, s9 -; GFX940-NEXT: s_and_b64 s[0:1], s[2:3], exec -; GFX940-NEXT: s_cselect_b32 s4, s4, s8 +; GFX940-NEXT: v_min_f64 v[4:5], s[0:1], v[0:1] +; GFX940-NEXT: v_cndmask_b32_e32 v3, v3, v6, vcc +; GFX940-NEXT: v_cndmask_b32_e64 v2, v2, 0, vcc +; GFX940-NEXT: v_cmp_u_f64_e32 vcc, s[0:1], v[0:1] +; GFX940-NEXT: s_nop 1 +; GFX940-NEXT: v_cndmask_b32_e32 v1, v5, v6, vcc +; GFX940-NEXT: v_cndmask_b32_e64 v0, v4, 0, vcc ; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; use s[4:7] +; GFX940-NEXT: ; use v[0:3] ; GFX940-NEXT: ;;#ASMEND ; GFX940-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: s_minimum_v2f64: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_cmp_lt_f64_e64 s12, s[6:7], s[10:11] -; GFX10-NEXT: v_cmp_o_f64_e64 s14, s[6:7], s[10:11] -; GFX10-NEXT: v_cmp_class_f64_e64 s15, s[6:7], 32 -; GFX10-NEXT: v_cmp_class_f64_e64 s16, s[10:11], 32 -; GFX10-NEXT: v_cmp_o_f64_e64 s18, s[4:5], s[8:9] -; GFX10-NEXT: v_cmp_class_f64_e64 s19, s[4:5], 32 -; GFX10-NEXT: v_cmp_class_f64_e64 s20, s[8:9], 32 -; GFX10-NEXT: s_and_b32 s13, s12, exec_lo -; GFX10-NEXT: s_cselect_b32 s13, s7, s11 -; GFX10-NEXT: s_and_b32 s17, s14, exec_lo -; GFX10-NEXT: s_cselect_b32 s13, s13, 0x7ff80000 -; GFX10-NEXT: s_and_b32 s12, s12, exec_lo -; GFX10-NEXT: s_cselect_b32 s12, s6, s10 -; GFX10-NEXT: s_and_b32 s14, s14, exec_lo -; GFX10-NEXT: s_cselect_b32 s12, s12, 0 -; GFX10-NEXT: v_cmp_lt_f64_e64 s17, s[4:5], s[8:9] -; GFX10-NEXT: v_cmp_eq_f64_e64 s14, s[12:13], 0 -; GFX10-NEXT: s_and_b32 s21, s15, exec_lo -; GFX10-NEXT: s_cselect_b32 s7, s7, s13 -; GFX10-NEXT: s_and_b32 s21, s16, exec_lo -; GFX10-NEXT: s_cselect_b32 s7, s11, s7 -; GFX10-NEXT: s_and_b32 s11, s14, exec_lo -; GFX10-NEXT: s_cselect_b32 s7, s7, s13 -; GFX10-NEXT: s_and_b32 s11, s15, exec_lo -; GFX10-NEXT: s_cselect_b32 s6, s6, s12 -; 
GFX10-NEXT: s_and_b32 s11, s16, exec_lo -; GFX10-NEXT: s_cselect_b32 s6, s10, s6 -; GFX10-NEXT: s_and_b32 s10, s14, exec_lo -; GFX10-NEXT: s_cselect_b32 s6, s6, s12 -; GFX10-NEXT: s_and_b32 s10, s17, exec_lo -; GFX10-NEXT: s_cselect_b32 s10, s5, s9 -; GFX10-NEXT: s_and_b32 s11, s18, exec_lo -; GFX10-NEXT: s_cselect_b32 s11, s10, 0x7ff80000 -; GFX10-NEXT: s_and_b32 s10, s17, exec_lo -; GFX10-NEXT: s_cselect_b32 s10, s4, s8 -; GFX10-NEXT: s_and_b32 s12, s18, exec_lo -; GFX10-NEXT: s_cselect_b32 s10, s10, 0 -; GFX10-NEXT: s_and_b32 s13, s19, exec_lo -; GFX10-NEXT: v_cmp_eq_f64_e64 s12, s[10:11], 0 -; GFX10-NEXT: s_cselect_b32 s5, s5, s11 -; GFX10-NEXT: s_and_b32 s13, s20, exec_lo -; GFX10-NEXT: s_cselect_b32 s5, s9, s5 -; GFX10-NEXT: s_and_b32 s9, s12, exec_lo -; GFX10-NEXT: s_cselect_b32 s5, s5, s11 -; GFX10-NEXT: s_and_b32 s9, s19, exec_lo -; GFX10-NEXT: s_cselect_b32 s4, s4, s10 -; GFX10-NEXT: s_and_b32 s9, s20, exec_lo -; GFX10-NEXT: s_cselect_b32 s4, s8, s4 -; GFX10-NEXT: s_and_b32 s8, s12, exec_lo -; GFX10-NEXT: s_cselect_b32 s4, s4, s10 +; GFX10-NEXT: v_min_f64 v[0:1], s[6:7], s[10:11] +; GFX10-NEXT: v_cmp_u_f64_e64 s6, s[6:7], s[10:11] +; GFX10-NEXT: v_min_f64 v[4:5], s[4:5], s[8:9] +; GFX10-NEXT: v_cmp_u_f64_e64 s4, s[4:5], s[8:9] +; GFX10-NEXT: v_cndmask_b32_e64 v3, v1, 0x7ff80000, s6 +; GFX10-NEXT: v_cndmask_b32_e64 v2, v0, 0, s6 +; GFX10-NEXT: v_cndmask_b32_e64 v1, v5, 0x7ff80000, s4 +; GFX10-NEXT: v_cndmask_b32_e64 v0, v4, 0, s4 ; GFX10-NEXT: ;;#ASMSTART -; GFX10-NEXT: ; use s[4:7] +; GFX10-NEXT: ; use v[0:3] ; GFX10-NEXT: ;;#ASMEND ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: s_minimum_v2f64: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_cmp_lt_f64_e64 s8, s[2:3], s[6:7] -; GFX11-NEXT: v_cmp_o_f64_e64 s10, s[2:3], s[6:7] -; GFX11-NEXT: v_cmp_class_f64_e64 s11, s[2:3], 32 -; GFX11-NEXT: v_cmp_class_f64_e64 s12, s[6:7], 32 -; GFX11-NEXT: v_cmp_o_f64_e64 s14, s[0:1], s[4:5] -; GFX11-NEXT: v_cmp_class_f64_e64 
s15, s[0:1], 32 -; GFX11-NEXT: v_cmp_class_f64_e64 s16, s[4:5], 32 -; GFX11-NEXT: s_and_b32 s9, s8, exec_lo -; GFX11-NEXT: s_cselect_b32 s9, s3, s7 -; GFX11-NEXT: s_and_b32 s13, s10, exec_lo -; GFX11-NEXT: s_cselect_b32 s9, s9, 0x7ff80000 -; GFX11-NEXT: s_and_b32 s8, s8, exec_lo -; GFX11-NEXT: s_cselect_b32 s8, s2, s6 -; GFX11-NEXT: s_and_b32 s10, s10, exec_lo -; GFX11-NEXT: s_cselect_b32 s8, s8, 0 -; GFX11-NEXT: v_cmp_lt_f64_e64 s13, s[0:1], s[4:5] -; GFX11-NEXT: v_cmp_eq_f64_e64 s10, s[8:9], 0 -; GFX11-NEXT: s_and_b32 s17, s11, exec_lo -; GFX11-NEXT: s_cselect_b32 s3, s3, s9 -; GFX11-NEXT: s_and_b32 s17, s12, exec_lo -; GFX11-NEXT: s_cselect_b32 s3, s7, s3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: s_and_b32 s7, s10, exec_lo -; GFX11-NEXT: s_cselect_b32 s3, s3, s9 -; GFX11-NEXT: s_and_b32 s7, s11, exec_lo -; GFX11-NEXT: s_cselect_b32 s2, s2, s8 -; GFX11-NEXT: s_and_b32 s7, s12, exec_lo -; GFX11-NEXT: s_cselect_b32 s2, s6, s2 -; GFX11-NEXT: s_and_b32 s6, s10, exec_lo -; GFX11-NEXT: s_cselect_b32 s2, s2, s8 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-NEXT: s_and_b32 s6, s13, exec_lo -; GFX11-NEXT: s_cselect_b32 s6, s1, s5 -; GFX11-NEXT: s_and_b32 s7, s14, exec_lo -; GFX11-NEXT: s_cselect_b32 s7, s6, 0x7ff80000 -; GFX11-NEXT: s_and_b32 s6, s13, exec_lo -; GFX11-NEXT: s_cselect_b32 s6, s0, s4 -; GFX11-NEXT: s_and_b32 s8, s14, exec_lo -; GFX11-NEXT: s_cselect_b32 s6, s6, 0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4) -; GFX11-NEXT: s_and_b32 s9, s15, exec_lo -; GFX11-NEXT: v_cmp_eq_f64_e64 s8, s[6:7], 0 -; GFX11-NEXT: s_cselect_b32 s1, s1, s7 -; GFX11-NEXT: s_and_b32 s9, s16, exec_lo -; GFX11-NEXT: s_cselect_b32 s1, s5, s1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: s_and_b32 s5, s8, exec_lo -; GFX11-NEXT: s_cselect_b32 s1, s1, s7 -; GFX11-NEXT: s_and_b32 s5, s15, exec_lo -; GFX11-NEXT: s_cselect_b32 s0, s0, s6 -; GFX11-NEXT: s_and_b32 s5, s16, exec_lo -; GFX11-NEXT: 
s_cselect_b32 s0, s4, s0 -; GFX11-NEXT: s_and_b32 s4, s8, exec_lo -; GFX11-NEXT: s_cselect_b32 s0, s0, s6 +; GFX11-NEXT: v_min_f64 v[0:1], s[2:3], s[6:7] +; GFX11-NEXT: v_cmp_u_f64_e64 s2, s[2:3], s[6:7] +; GFX11-NEXT: v_min_f64 v[4:5], s[0:1], s[4:5] +; GFX11-NEXT: v_cmp_u_f64_e64 s0, s[0:1], s[4:5] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_cndmask_b32_e64 v3, v1, 0x7ff80000, s2 +; GFX11-NEXT: v_cndmask_b32_e64 v2, v0, 0, s2 +; GFX11-NEXT: v_cndmask_b32_e64 v1, v5, 0x7ff80000, s0 +; GFX11-NEXT: v_cndmask_b32_e64 v0, v4, 0, s0 ; GFX11-NEXT: ;;#ASMSTART -; GFX11-NEXT: ; use s[0:3] +; GFX11-NEXT: ; use v[0:3] ; GFX11-NEXT: ;;#ASMEND ; GFX11-NEXT: s_setpc_b64 s[30:31] ; @@ -1554,306 +979,110 @@ define <3 x double> @v_minimum_v3f64(<3 x double> %src0, <3 x double> %src1) { ; GFX7-LABEL: v_minimum_v3f64: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_cmp_lt_f64_e32 vcc, v[0:1], v[6:7] -; GFX7-NEXT: v_cmp_o_f64_e64 s[4:5], v[0:1], v[6:7] -; GFX7-NEXT: v_mov_b32_e32 v14, 0x7ff80000 -; GFX7-NEXT: v_cmp_lt_f64_e64 s[8:9], v[2:3], v[8:9] -; GFX7-NEXT: v_cmp_o_f64_e64 s[10:11], v[2:3], v[8:9] -; GFX7-NEXT: v_cndmask_b32_e32 v12, v7, v1, vcc -; GFX7-NEXT: v_cndmask_b32_e64 v13, v14, v12, s[4:5] -; GFX7-NEXT: v_cndmask_b32_e32 v12, v6, v0, vcc -; GFX7-NEXT: v_cmp_class_f64_e64 vcc, v[0:1], 32 -; GFX7-NEXT: v_cndmask_b32_e64 v12, 0, v12, s[4:5] -; GFX7-NEXT: v_cmp_class_f64_e64 s[4:5], v[6:7], 32 -; GFX7-NEXT: v_cmp_eq_f64_e64 s[6:7], 0, v[12:13] -; GFX7-NEXT: v_cndmask_b32_e32 v0, v12, v0, vcc -; GFX7-NEXT: v_cndmask_b32_e32 v1, v13, v1, vcc -; GFX7-NEXT: v_cndmask_b32_e64 v0, v0, v6, s[4:5] -; GFX7-NEXT: v_cndmask_b32_e64 v1, v1, v7, s[4:5] -; GFX7-NEXT: v_cndmask_b32_e64 v6, v9, v3, s[8:9] -; GFX7-NEXT: v_cndmask_b32_e64 v0, v12, v0, s[6:7] -; GFX7-NEXT: v_cndmask_b32_e64 v1, v13, v1, s[6:7] -; GFX7-NEXT: v_cmp_class_f64_e64 s[6:7], v[2:3], 32 -; GFX7-NEXT: v_cndmask_b32_e64 
v7, v14, v6, s[10:11] -; GFX7-NEXT: v_cndmask_b32_e64 v6, v8, v2, s[8:9] -; GFX7-NEXT: v_cmp_lt_f64_e32 vcc, v[4:5], v[10:11] -; GFX7-NEXT: v_cmp_class_f64_e64 s[8:9], v[8:9], 32 -; GFX7-NEXT: v_cmp_o_f64_e64 s[4:5], v[4:5], v[10:11] -; GFX7-NEXT: v_cndmask_b32_e64 v6, 0, v6, s[10:11] -; GFX7-NEXT: v_cmp_eq_f64_e64 s[10:11], 0, v[6:7] -; GFX7-NEXT: v_cndmask_b32_e64 v2, v6, v2, s[6:7] -; GFX7-NEXT: v_cndmask_b32_e64 v3, v7, v3, s[6:7] -; GFX7-NEXT: v_cndmask_b32_e32 v12, v11, v5, vcc -; GFX7-NEXT: v_cndmask_b32_e64 v2, v2, v8, s[8:9] -; GFX7-NEXT: v_cndmask_b32_e32 v8, v10, v4, vcc -; GFX7-NEXT: v_cmp_class_f64_e64 vcc, v[4:5], 32 -; GFX7-NEXT: v_cndmask_b32_e64 v13, v14, v12, s[4:5] -; GFX7-NEXT: v_cndmask_b32_e64 v12, 0, v8, s[4:5] -; GFX7-NEXT: v_cmp_class_f64_e64 s[4:5], v[10:11], 32 -; GFX7-NEXT: v_cmp_eq_f64_e64 s[12:13], 0, v[12:13] -; GFX7-NEXT: v_cndmask_b32_e64 v3, v3, v9, s[8:9] -; GFX7-NEXT: v_cndmask_b32_e64 v2, v6, v2, s[10:11] -; GFX7-NEXT: v_cndmask_b32_e64 v3, v7, v3, s[10:11] -; GFX7-NEXT: v_cndmask_b32_e32 v4, v12, v4, vcc -; GFX7-NEXT: v_cndmask_b32_e32 v5, v13, v5, vcc -; GFX7-NEXT: v_cndmask_b32_e64 v4, v4, v10, s[4:5] -; GFX7-NEXT: v_cndmask_b32_e64 v5, v5, v11, s[4:5] -; GFX7-NEXT: v_cndmask_b32_e64 v4, v12, v4, s[12:13] -; GFX7-NEXT: v_cndmask_b32_e64 v5, v13, v5, s[12:13] +; GFX7-NEXT: v_min_f64 v[12:13], v[0:1], v[6:7] +; GFX7-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[6:7] +; GFX7-NEXT: v_min_f64 v[6:7], v[2:3], v[8:9] +; GFX7-NEXT: v_cmp_u_f64_e64 s[4:5], v[2:3], v[8:9] +; GFX7-NEXT: v_min_f64 v[8:9], v[4:5], v[10:11] +; GFX7-NEXT: v_cmp_u_f64_e64 s[6:7], v[4:5], v[10:11] +; GFX7-NEXT: v_mov_b32_e32 v5, 0x7ff80000 +; GFX7-NEXT: v_cndmask_b32_e64 v0, v12, 0, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v1, v13, v5, vcc +; GFX7-NEXT: v_cndmask_b32_e64 v2, v6, 0, s[4:5] +; GFX7-NEXT: v_cndmask_b32_e64 v3, v7, v5, s[4:5] +; GFX7-NEXT: v_cndmask_b32_e64 v4, v8, 0, s[6:7] +; GFX7-NEXT: v_cndmask_b32_e64 v5, v9, v5, s[6:7] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; 
; GFX8-LABEL: v_minimum_v3f64: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_cmp_lt_f64_e32 vcc, v[0:1], v[6:7] -; GFX8-NEXT: v_cmp_o_f64_e64 s[4:5], v[0:1], v[6:7] -; GFX8-NEXT: v_mov_b32_e32 v14, 0x7ff80000 -; GFX8-NEXT: v_cmp_lt_f64_e64 s[8:9], v[2:3], v[8:9] -; GFX8-NEXT: v_cmp_o_f64_e64 s[10:11], v[2:3], v[8:9] -; GFX8-NEXT: v_cndmask_b32_e32 v12, v7, v1, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v13, v14, v12, s[4:5] -; GFX8-NEXT: v_cndmask_b32_e32 v12, v6, v0, vcc -; GFX8-NEXT: v_cmp_class_f64_e64 vcc, v[0:1], 32 -; GFX8-NEXT: v_cndmask_b32_e64 v12, 0, v12, s[4:5] -; GFX8-NEXT: v_cmp_class_f64_e64 s[4:5], v[6:7], 32 -; GFX8-NEXT: v_cmp_eq_f64_e64 s[6:7], 0, v[12:13] -; GFX8-NEXT: v_cndmask_b32_e32 v0, v12, v0, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v1, v13, v1, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v0, v0, v6, s[4:5] -; GFX8-NEXT: v_cndmask_b32_e64 v1, v1, v7, s[4:5] -; GFX8-NEXT: v_cndmask_b32_e64 v6, v9, v3, s[8:9] -; GFX8-NEXT: v_cndmask_b32_e64 v0, v12, v0, s[6:7] -; GFX8-NEXT: v_cndmask_b32_e64 v1, v13, v1, s[6:7] -; GFX8-NEXT: v_cmp_class_f64_e64 s[6:7], v[2:3], 32 -; GFX8-NEXT: v_cndmask_b32_e64 v7, v14, v6, s[10:11] -; GFX8-NEXT: v_cndmask_b32_e64 v6, v8, v2, s[8:9] -; GFX8-NEXT: v_cmp_lt_f64_e32 vcc, v[4:5], v[10:11] -; GFX8-NEXT: v_cmp_class_f64_e64 s[8:9], v[8:9], 32 -; GFX8-NEXT: v_cmp_o_f64_e64 s[4:5], v[4:5], v[10:11] -; GFX8-NEXT: v_cndmask_b32_e64 v6, 0, v6, s[10:11] -; GFX8-NEXT: v_cmp_eq_f64_e64 s[10:11], 0, v[6:7] -; GFX8-NEXT: v_cndmask_b32_e64 v2, v6, v2, s[6:7] -; GFX8-NEXT: v_cndmask_b32_e64 v3, v7, v3, s[6:7] -; GFX8-NEXT: v_cndmask_b32_e32 v12, v11, v5, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v2, v2, v8, s[8:9] -; GFX8-NEXT: v_cndmask_b32_e32 v8, v10, v4, vcc -; GFX8-NEXT: v_cmp_class_f64_e64 vcc, v[4:5], 32 -; GFX8-NEXT: v_cndmask_b32_e64 v13, v14, v12, s[4:5] -; GFX8-NEXT: v_cndmask_b32_e64 v12, 0, v8, s[4:5] -; GFX8-NEXT: v_cmp_class_f64_e64 s[4:5], v[10:11], 32 -; GFX8-NEXT: v_cmp_eq_f64_e64 s[12:13], 0, 
v[12:13] -; GFX8-NEXT: v_cndmask_b32_e64 v3, v3, v9, s[8:9] -; GFX8-NEXT: v_cndmask_b32_e64 v2, v6, v2, s[10:11] -; GFX8-NEXT: v_cndmask_b32_e64 v3, v7, v3, s[10:11] -; GFX8-NEXT: v_cndmask_b32_e32 v4, v12, v4, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v5, v13, v5, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v4, v4, v10, s[4:5] -; GFX8-NEXT: v_cndmask_b32_e64 v5, v5, v11, s[4:5] -; GFX8-NEXT: v_cndmask_b32_e64 v4, v12, v4, s[12:13] -; GFX8-NEXT: v_cndmask_b32_e64 v5, v13, v5, s[12:13] +; GFX8-NEXT: v_min_f64 v[12:13], v[0:1], v[6:7] +; GFX8-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[6:7] +; GFX8-NEXT: v_min_f64 v[6:7], v[2:3], v[8:9] +; GFX8-NEXT: v_cmp_u_f64_e64 s[4:5], v[2:3], v[8:9] +; GFX8-NEXT: v_min_f64 v[8:9], v[4:5], v[10:11] +; GFX8-NEXT: v_cmp_u_f64_e64 s[6:7], v[4:5], v[10:11] +; GFX8-NEXT: v_mov_b32_e32 v5, 0x7ff80000 +; GFX8-NEXT: v_cndmask_b32_e64 v0, v12, 0, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v1, v13, v5, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v2, v6, 0, s[4:5] +; GFX8-NEXT: v_cndmask_b32_e64 v3, v7, v5, s[4:5] +; GFX8-NEXT: v_cndmask_b32_e64 v4, v8, 0, s[6:7] +; GFX8-NEXT: v_cndmask_b32_e64 v5, v9, v5, s[6:7] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_minimum_v3f64: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_cmp_lt_f64_e32 vcc, v[0:1], v[6:7] -; GFX9-NEXT: v_cmp_o_f64_e64 s[4:5], v[0:1], v[6:7] -; GFX9-NEXT: v_mov_b32_e32 v14, 0x7ff80000 -; GFX9-NEXT: v_cmp_lt_f64_e64 s[8:9], v[2:3], v[8:9] -; GFX9-NEXT: v_cmp_o_f64_e64 s[10:11], v[2:3], v[8:9] -; GFX9-NEXT: v_cndmask_b32_e32 v12, v7, v1, vcc -; GFX9-NEXT: v_cndmask_b32_e64 v13, v14, v12, s[4:5] -; GFX9-NEXT: v_cndmask_b32_e32 v12, v6, v0, vcc -; GFX9-NEXT: v_cmp_class_f64_e64 vcc, v[0:1], 32 -; GFX9-NEXT: v_cndmask_b32_e64 v12, 0, v12, s[4:5] -; GFX9-NEXT: v_cmp_class_f64_e64 s[4:5], v[6:7], 32 -; GFX9-NEXT: v_cmp_eq_f64_e64 s[6:7], 0, v[12:13] -; GFX9-NEXT: v_cndmask_b32_e32 v0, v12, v0, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v1, v13, v1, vcc -; GFX9-NEXT: 
v_cndmask_b32_e64 v0, v0, v6, s[4:5] -; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, v7, s[4:5] -; GFX9-NEXT: v_cndmask_b32_e64 v6, v9, v3, s[8:9] -; GFX9-NEXT: v_cndmask_b32_e64 v0, v12, v0, s[6:7] -; GFX9-NEXT: v_cndmask_b32_e64 v1, v13, v1, s[6:7] -; GFX9-NEXT: v_cmp_class_f64_e64 s[6:7], v[2:3], 32 -; GFX9-NEXT: v_cndmask_b32_e64 v7, v14, v6, s[10:11] -; GFX9-NEXT: v_cndmask_b32_e64 v6, v8, v2, s[8:9] -; GFX9-NEXT: v_cmp_lt_f64_e32 vcc, v[4:5], v[10:11] -; GFX9-NEXT: v_cmp_class_f64_e64 s[8:9], v[8:9], 32 -; GFX9-NEXT: v_cmp_o_f64_e64 s[4:5], v[4:5], v[10:11] -; GFX9-NEXT: v_cndmask_b32_e64 v6, 0, v6, s[10:11] -; GFX9-NEXT: v_cmp_eq_f64_e64 s[10:11], 0, v[6:7] -; GFX9-NEXT: v_cndmask_b32_e64 v2, v6, v2, s[6:7] -; GFX9-NEXT: v_cndmask_b32_e64 v3, v7, v3, s[6:7] -; GFX9-NEXT: v_cndmask_b32_e32 v12, v11, v5, vcc -; GFX9-NEXT: v_cndmask_b32_e64 v2, v2, v8, s[8:9] -; GFX9-NEXT: v_cndmask_b32_e32 v8, v10, v4, vcc -; GFX9-NEXT: v_cmp_class_f64_e64 vcc, v[4:5], 32 -; GFX9-NEXT: v_cndmask_b32_e64 v13, v14, v12, s[4:5] -; GFX9-NEXT: v_cndmask_b32_e64 v12, 0, v8, s[4:5] -; GFX9-NEXT: v_cmp_class_f64_e64 s[4:5], v[10:11], 32 -; GFX9-NEXT: v_cmp_eq_f64_e64 s[12:13], 0, v[12:13] -; GFX9-NEXT: v_cndmask_b32_e64 v3, v3, v9, s[8:9] -; GFX9-NEXT: v_cndmask_b32_e64 v2, v6, v2, s[10:11] -; GFX9-NEXT: v_cndmask_b32_e64 v3, v7, v3, s[10:11] -; GFX9-NEXT: v_cndmask_b32_e32 v4, v12, v4, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v5, v13, v5, vcc -; GFX9-NEXT: v_cndmask_b32_e64 v4, v4, v10, s[4:5] -; GFX9-NEXT: v_cndmask_b32_e64 v5, v5, v11, s[4:5] -; GFX9-NEXT: v_cndmask_b32_e64 v4, v12, v4, s[12:13] -; GFX9-NEXT: v_cndmask_b32_e64 v5, v13, v5, s[12:13] +; GFX9-NEXT: v_min_f64 v[12:13], v[0:1], v[6:7] +; GFX9-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[6:7] +; GFX9-NEXT: v_min_f64 v[6:7], v[2:3], v[8:9] +; GFX9-NEXT: v_cmp_u_f64_e64 s[4:5], v[2:3], v[8:9] +; GFX9-NEXT: v_min_f64 v[8:9], v[4:5], v[10:11] +; GFX9-NEXT: v_cmp_u_f64_e64 s[6:7], v[4:5], v[10:11] +; GFX9-NEXT: v_mov_b32_e32 v5, 0x7ff80000 +; 
GFX9-NEXT: v_cndmask_b32_e64 v0, v12, 0, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v1, v13, v5, vcc +; GFX9-NEXT: v_cndmask_b32_e64 v2, v6, 0, s[4:5] +; GFX9-NEXT: v_cndmask_b32_e64 v3, v7, v5, s[4:5] +; GFX9-NEXT: v_cndmask_b32_e64 v4, v8, 0, s[6:7] +; GFX9-NEXT: v_cndmask_b32_e64 v5, v9, v5, s[6:7] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: v_minimum_v3f64: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_cmp_lt_f64_e32 vcc, v[0:1], v[6:7] -; GFX940-NEXT: v_mov_b32_e32 v14, 0x7ff80000 -; GFX940-NEXT: v_cmp_o_f64_e64 s[0:1], v[0:1], v[6:7] -; GFX940-NEXT: v_cndmask_b32_e32 v12, v7, v1, vcc -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e64 v13, v14, v12, s[0:1] -; GFX940-NEXT: v_cndmask_b32_e32 v12, v6, v0, vcc -; GFX940-NEXT: v_cndmask_b32_e64 v12, 0, v12, s[0:1] -; GFX940-NEXT: v_cmp_class_f64_e64 vcc, v[0:1], 32 -; GFX940-NEXT: v_cmp_class_f64_e64 s[0:1], v[6:7], 32 -; GFX940-NEXT: v_cmp_eq_f64_e64 s[2:3], 0, v[12:13] -; GFX940-NEXT: v_cndmask_b32_e32 v0, v12, v0, vcc -; GFX940-NEXT: v_cndmask_b32_e32 v1, v13, v1, vcc -; GFX940-NEXT: v_cmp_lt_f64_e32 vcc, v[2:3], v[8:9] -; GFX940-NEXT: v_cndmask_b32_e64 v0, v0, v6, s[0:1] -; GFX940-NEXT: v_cndmask_b32_e64 v1, v1, v7, s[0:1] -; GFX940-NEXT: v_cndmask_b32_e32 v6, v9, v3, vcc -; GFX940-NEXT: v_cmp_o_f64_e64 s[0:1], v[2:3], v[8:9] -; GFX940-NEXT: v_cndmask_b32_e64 v0, v12, v0, s[2:3] -; GFX940-NEXT: v_cndmask_b32_e64 v1, v13, v1, s[2:3] -; GFX940-NEXT: v_cndmask_b32_e64 v7, v14, v6, s[0:1] -; GFX940-NEXT: v_cndmask_b32_e32 v6, v8, v2, vcc -; GFX940-NEXT: v_cndmask_b32_e64 v6, 0, v6, s[0:1] -; GFX940-NEXT: v_cmp_class_f64_e64 vcc, v[2:3], 32 -; GFX940-NEXT: v_cmp_class_f64_e64 s[0:1], v[8:9], 32 -; GFX940-NEXT: v_cmp_eq_f64_e64 s[2:3], 0, v[6:7] -; GFX940-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc -; GFX940-NEXT: v_cndmask_b32_e64 v2, v2, v8, s[0:1] -; GFX940-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc -; GFX940-NEXT: v_cmp_lt_f64_e32 vcc, v[4:5], v[10:11] -; 
GFX940-NEXT: v_cndmask_b32_e64 v2, v6, v2, s[2:3] -; GFX940-NEXT: v_cndmask_b32_e64 v3, v3, v9, s[0:1] -; GFX940-NEXT: v_cndmask_b32_e32 v6, v11, v5, vcc -; GFX940-NEXT: v_cmp_o_f64_e64 s[0:1], v[4:5], v[10:11] -; GFX940-NEXT: v_cndmask_b32_e64 v3, v7, v3, s[2:3] +; GFX940-NEXT: v_min_f64 v[12:13], v[0:1], v[6:7] +; GFX940-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[6:7] +; GFX940-NEXT: v_min_f64 v[6:7], v[2:3], v[8:9] ; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e64 v7, v14, v6, s[0:1] -; GFX940-NEXT: v_cndmask_b32_e32 v6, v10, v4, vcc -; GFX940-NEXT: v_cndmask_b32_e64 v6, 0, v6, s[0:1] -; GFX940-NEXT: v_cmp_class_f64_e64 vcc, v[4:5], 32 -; GFX940-NEXT: v_cmp_class_f64_e64 s[0:1], v[10:11], 32 -; GFX940-NEXT: v_cmp_eq_f64_e64 s[2:3], 0, v[6:7] -; GFX940-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc -; GFX940-NEXT: v_cndmask_b32_e32 v5, v7, v5, vcc -; GFX940-NEXT: v_cndmask_b32_e64 v4, v4, v10, s[0:1] -; GFX940-NEXT: v_cndmask_b32_e64 v5, v5, v11, s[0:1] -; GFX940-NEXT: v_cndmask_b32_e64 v4, v6, v4, s[2:3] -; GFX940-NEXT: v_cndmask_b32_e64 v5, v7, v5, s[2:3] +; GFX940-NEXT: v_cndmask_b32_e64 v0, v12, 0, vcc +; GFX940-NEXT: v_mov_b32_e32 v12, 0x7ff80000 +; GFX940-NEXT: v_cndmask_b32_e32 v1, v13, v12, vcc +; GFX940-NEXT: v_cmp_u_f64_e32 vcc, v[2:3], v[8:9] +; GFX940-NEXT: s_nop 1 +; GFX940-NEXT: v_cndmask_b32_e64 v2, v6, 0, vcc +; GFX940-NEXT: v_cndmask_b32_e32 v3, v7, v12, vcc +; GFX940-NEXT: v_min_f64 v[6:7], v[4:5], v[10:11] +; GFX940-NEXT: v_cmp_u_f64_e32 vcc, v[4:5], v[10:11] +; GFX940-NEXT: s_nop 1 +; GFX940-NEXT: v_cndmask_b32_e64 v4, v6, 0, vcc +; GFX940-NEXT: v_cndmask_b32_e32 v5, v7, v12, vcc ; GFX940-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_minimum_v3f64: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_cmp_lt_f64_e32 vcc_lo, v[0:1], v[6:7] -; GFX10-NEXT: v_cmp_lt_f64_e64 s4, v[2:3], v[8:9] -; GFX10-NEXT: v_cmp_lt_f64_e64 s5, v[4:5], v[10:11] -; GFX10-NEXT: v_cmp_o_f64_e64 s6, v[0:1], v[6:7] -; GFX10-NEXT: 
v_cmp_o_f64_e64 s7, v[2:3], v[8:9] -; GFX10-NEXT: v_cmp_o_f64_e64 s8, v[4:5], v[10:11] -; GFX10-NEXT: v_cndmask_b32_e32 v12, v7, v1, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v14, v9, v3, s4 -; GFX10-NEXT: v_cndmask_b32_e64 v16, v11, v5, s5 -; GFX10-NEXT: v_cndmask_b32_e32 v17, v6, v0, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v18, v8, v2, s4 -; GFX10-NEXT: v_cndmask_b32_e64 v19, v10, v4, s5 -; GFX10-NEXT: v_cmp_class_f64_e64 vcc_lo, v[0:1], 32 -; GFX10-NEXT: v_cmp_class_f64_e64 s4, v[2:3], 32 -; GFX10-NEXT: v_cmp_class_f64_e64 s5, v[4:5], 32 -; GFX10-NEXT: v_cndmask_b32_e64 v13, 0x7ff80000, v12, s6 -; GFX10-NEXT: v_cndmask_b32_e64 v15, 0x7ff80000, v14, s7 -; GFX10-NEXT: v_cndmask_b32_e64 v12, 0, v17, s6 -; GFX10-NEXT: v_cndmask_b32_e64 v17, 0x7ff80000, v16, s8 -; GFX10-NEXT: v_cndmask_b32_e64 v14, 0, v18, s7 -; GFX10-NEXT: v_cndmask_b32_e64 v16, 0, v19, s8 -; GFX10-NEXT: v_cmp_class_f64_e64 s6, v[8:9], 32 -; GFX10-NEXT: v_cmp_class_f64_e64 s7, v[6:7], 32 -; GFX10-NEXT: v_cmp_class_f64_e64 s8, v[10:11], 32 -; GFX10-NEXT: v_cmp_eq_f64_e64 s9, 0, v[12:13] -; GFX10-NEXT: v_cmp_eq_f64_e64 s10, 0, v[14:15] -; GFX10-NEXT: v_cmp_eq_f64_e64 s11, 0, v[16:17] -; GFX10-NEXT: v_cndmask_b32_e32 v0, v12, v0, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v2, v14, v2, s4 -; GFX10-NEXT: v_cndmask_b32_e64 v4, v16, v4, s5 -; GFX10-NEXT: v_cndmask_b32_e32 v1, v13, v1, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v3, v15, v3, s4 -; GFX10-NEXT: v_cndmask_b32_e64 v5, v17, v5, s5 -; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, v8, s6 -; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, v6, s7 -; GFX10-NEXT: v_cndmask_b32_e64 v4, v4, v10, s8 -; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, v7, s7 -; GFX10-NEXT: v_cndmask_b32_e64 v3, v3, v9, s6 -; GFX10-NEXT: v_cndmask_b32_e64 v5, v5, v11, s8 -; GFX10-NEXT: v_cndmask_b32_e64 v0, v12, v0, s9 -; GFX10-NEXT: v_cndmask_b32_e64 v2, v14, v2, s10 -; GFX10-NEXT: v_cndmask_b32_e64 v4, v16, v4, s11 -; GFX10-NEXT: v_cndmask_b32_e64 v1, v13, v1, s9 -; GFX10-NEXT: v_cndmask_b32_e64 v3, v15, v3, 
s10 -; GFX10-NEXT: v_cndmask_b32_e64 v5, v17, v5, s11 +; GFX10-NEXT: v_min_f64 v[12:13], v[0:1], v[6:7] +; GFX10-NEXT: v_cmp_u_f64_e32 vcc_lo, v[0:1], v[6:7] +; GFX10-NEXT: v_min_f64 v[6:7], v[2:3], v[8:9] +; GFX10-NEXT: v_cmp_u_f64_e64 s4, v[2:3], v[8:9] +; GFX10-NEXT: v_min_f64 v[8:9], v[4:5], v[10:11] +; GFX10-NEXT: v_cmp_u_f64_e64 s5, v[4:5], v[10:11] +; GFX10-NEXT: v_cndmask_b32_e64 v0, v12, 0, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v1, v13, 0x7ff80000, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v2, v6, 0, s4 +; GFX10-NEXT: v_cndmask_b32_e64 v3, v7, 0x7ff80000, s4 +; GFX10-NEXT: v_cndmask_b32_e64 v4, v8, 0, s5 +; GFX10-NEXT: v_cndmask_b32_e64 v5, v9, 0x7ff80000, s5 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: v_minimum_v3f64: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_cmp_lt_f64_e32 vcc_lo, v[0:1], v[6:7] -; GFX11-NEXT: v_cmp_lt_f64_e64 s0, v[2:3], v[8:9] -; GFX11-NEXT: v_cmp_lt_f64_e64 s1, v[4:5], v[10:11] -; GFX11-NEXT: v_cmp_o_f64_e64 s2, v[0:1], v[6:7] -; GFX11-NEXT: v_cmp_o_f64_e64 s3, v[2:3], v[8:9] -; GFX11-NEXT: v_cmp_o_f64_e64 s4, v[4:5], v[10:11] -; GFX11-NEXT: v_dual_cndmask_b32 v12, v7, v1 :: v_dual_cndmask_b32 v17, v6, v0 -; GFX11-NEXT: v_cndmask_b32_e64 v14, v9, v3, s0 -; GFX11-NEXT: v_cndmask_b32_e64 v16, v11, v5, s1 -; GFX11-NEXT: v_cndmask_b32_e64 v18, v8, v2, s0 -; GFX11-NEXT: v_cndmask_b32_e64 v19, v10, v4, s1 -; GFX11-NEXT: v_cmp_class_f64_e64 vcc_lo, v[0:1], 32 -; GFX11-NEXT: v_cmp_class_f64_e64 s0, v[2:3], 32 -; GFX11-NEXT: v_cmp_class_f64_e64 s1, v[4:5], 32 -; GFX11-NEXT: v_cndmask_b32_e64 v13, 0x7ff80000, v12, s2 -; GFX11-NEXT: v_cndmask_b32_e64 v15, 0x7ff80000, v14, s3 -; GFX11-NEXT: v_cndmask_b32_e64 v12, 0, v17, s2 -; GFX11-NEXT: v_cndmask_b32_e64 v17, 0x7ff80000, v16, s4 -; GFX11-NEXT: v_cndmask_b32_e64 v14, 0, v18, s3 -; GFX11-NEXT: v_cndmask_b32_e64 v16, 0, v19, s4 -; GFX11-NEXT: v_cmp_class_f64_e64 s2, v[8:9], 32 -; GFX11-NEXT: v_cmp_class_f64_e64 s3, v[6:7], 32 -; GFX11-NEXT: 
v_cmp_class_f64_e64 s4, v[10:11], 32 -; GFX11-NEXT: v_cmp_eq_f64_e64 s5, 0, v[12:13] -; GFX11-NEXT: v_cmp_eq_f64_e64 s6, 0, v[14:15] -; GFX11-NEXT: v_cmp_eq_f64_e64 s7, 0, v[16:17] -; GFX11-NEXT: v_dual_cndmask_b32 v0, v12, v0 :: v_dual_cndmask_b32 v1, v13, v1 -; GFX11-NEXT: v_cndmask_b32_e64 v2, v14, v2, s0 -; GFX11-NEXT: v_cndmask_b32_e64 v4, v16, v4, s1 -; GFX11-NEXT: v_cndmask_b32_e64 v3, v15, v3, s0 -; GFX11-NEXT: v_cndmask_b32_e64 v5, v17, v5, s1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-NEXT: v_cndmask_b32_e64 v2, v2, v8, s2 -; GFX11-NEXT: v_cndmask_b32_e64 v0, v0, v6, s3 -; GFX11-NEXT: v_cndmask_b32_e64 v4, v4, v10, s4 -; GFX11-NEXT: v_cndmask_b32_e64 v1, v1, v7, s3 -; GFX11-NEXT: v_cndmask_b32_e64 v3, v3, v9, s2 -; GFX11-NEXT: v_cndmask_b32_e64 v5, v5, v11, s4 -; GFX11-NEXT: v_cndmask_b32_e64 v0, v12, v0, s5 -; GFX11-NEXT: v_cndmask_b32_e64 v2, v14, v2, s6 -; GFX11-NEXT: v_cndmask_b32_e64 v4, v16, v4, s7 -; GFX11-NEXT: v_cndmask_b32_e64 v1, v13, v1, s5 -; GFX11-NEXT: v_cndmask_b32_e64 v3, v15, v3, s6 -; GFX11-NEXT: v_cndmask_b32_e64 v5, v17, v5, s7 +; GFX11-NEXT: v_min_f64 v[12:13], v[0:1], v[6:7] +; GFX11-NEXT: v_cmp_u_f64_e32 vcc_lo, v[0:1], v[6:7] +; GFX11-NEXT: v_min_f64 v[6:7], v[2:3], v[8:9] +; GFX11-NEXT: v_cmp_u_f64_e64 s0, v[2:3], v[8:9] +; GFX11-NEXT: v_min_f64 v[8:9], v[4:5], v[10:11] +; GFX11-NEXT: v_cmp_u_f64_e64 s1, v[4:5], v[10:11] +; GFX11-NEXT: v_cndmask_b32_e64 v0, v12, 0, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e64 v1, v13, 0x7ff80000, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e64 v2, v6, 0, s0 +; GFX11-NEXT: v_cndmask_b32_e64 v3, v7, 0x7ff80000, s0 +; GFX11-NEXT: v_cndmask_b32_e64 v4, v8, 0, s1 +; GFX11-NEXT: v_cndmask_b32_e64 v5, v9, 0x7ff80000, s1 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-LABEL: v_minimum_v3f64: @@ -1875,247 +1104,49 @@ define <3 x double> @v_minimum_v3f64__nnan(<3 x double> %src0, <3 x double> %src ; GFX7-LABEL: v_minimum_v3f64__nnan: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; 
GFX7-NEXT: v_cmp_lt_f64_e32 vcc, v[0:1], v[6:7] -; GFX7-NEXT: v_cmp_class_f64_e64 s[4:5], v[6:7], 32 -; GFX7-NEXT: v_cmp_lt_f64_e64 s[8:9], v[2:3], v[8:9] -; GFX7-NEXT: v_cmp_class_f64_e64 s[10:11], v[10:11], 32 -; GFX7-NEXT: v_cndmask_b32_e32 v13, v7, v1, vcc -; GFX7-NEXT: v_cndmask_b32_e32 v12, v6, v0, vcc -; GFX7-NEXT: v_cmp_class_f64_e64 vcc, v[0:1], 32 -; GFX7-NEXT: v_cmp_eq_f64_e64 s[6:7], 0, v[12:13] -; GFX7-NEXT: v_cndmask_b32_e32 v0, v12, v0, vcc -; GFX7-NEXT: v_cndmask_b32_e32 v1, v13, v1, vcc -; GFX7-NEXT: v_cmp_lt_f64_e32 vcc, v[4:5], v[10:11] -; GFX7-NEXT: v_cndmask_b32_e64 v0, v0, v6, s[4:5] -; GFX7-NEXT: v_cndmask_b32_e64 v1, v1, v7, s[4:5] -; GFX7-NEXT: v_cndmask_b32_e64 v0, v12, v0, s[6:7] -; GFX7-NEXT: v_cndmask_b32_e64 v1, v13, v1, s[6:7] -; GFX7-NEXT: v_cmp_class_f64_e64 s[4:5], v[2:3], 32 -; GFX7-NEXT: v_cndmask_b32_e64 v7, v9, v3, s[8:9] -; GFX7-NEXT: v_cndmask_b32_e64 v6, v8, v2, s[8:9] -; GFX7-NEXT: v_cndmask_b32_e32 v13, v11, v5, vcc -; GFX7-NEXT: v_cndmask_b32_e32 v12, v10, v4, vcc -; GFX7-NEXT: v_cmp_class_f64_e64 vcc, v[4:5], 32 -; GFX7-NEXT: v_cmp_class_f64_e64 s[6:7], v[8:9], 32 -; GFX7-NEXT: v_cmp_eq_f64_e64 s[8:9], 0, v[6:7] -; GFX7-NEXT: v_cmp_eq_f64_e64 s[12:13], 0, v[12:13] -; GFX7-NEXT: v_cndmask_b32_e64 v2, v6, v2, s[4:5] -; GFX7-NEXT: v_cndmask_b32_e64 v3, v7, v3, s[4:5] -; GFX7-NEXT: v_cndmask_b32_e32 v4, v12, v4, vcc -; GFX7-NEXT: v_cndmask_b32_e32 v5, v13, v5, vcc -; GFX7-NEXT: v_cndmask_b32_e64 v2, v2, v8, s[6:7] -; GFX7-NEXT: v_cndmask_b32_e64 v4, v4, v10, s[10:11] -; GFX7-NEXT: v_cndmask_b32_e64 v3, v3, v9, s[6:7] -; GFX7-NEXT: v_cndmask_b32_e64 v5, v5, v11, s[10:11] -; GFX7-NEXT: v_cndmask_b32_e64 v2, v6, v2, s[8:9] -; GFX7-NEXT: v_cndmask_b32_e64 v3, v7, v3, s[8:9] -; GFX7-NEXT: v_cndmask_b32_e64 v4, v12, v4, s[12:13] -; GFX7-NEXT: v_cndmask_b32_e64 v5, v13, v5, s[12:13] +; GFX7-NEXT: v_min_f64 v[0:1], v[0:1], v[6:7] +; GFX7-NEXT: v_min_f64 v[2:3], v[2:3], v[8:9] +; GFX7-NEXT: v_min_f64 v[4:5], v[4:5], v[10:11] ; 
GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_minimum_v3f64__nnan: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_cmp_lt_f64_e32 vcc, v[0:1], v[6:7] -; GFX8-NEXT: v_cmp_class_f64_e64 s[4:5], v[6:7], 32 -; GFX8-NEXT: v_cmp_lt_f64_e64 s[8:9], v[2:3], v[8:9] -; GFX8-NEXT: v_cmp_class_f64_e64 s[10:11], v[10:11], 32 -; GFX8-NEXT: v_cndmask_b32_e32 v13, v7, v1, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v12, v6, v0, vcc -; GFX8-NEXT: v_cmp_class_f64_e64 vcc, v[0:1], 32 -; GFX8-NEXT: v_cmp_eq_f64_e64 s[6:7], 0, v[12:13] -; GFX8-NEXT: v_cndmask_b32_e32 v0, v12, v0, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v1, v13, v1, vcc -; GFX8-NEXT: v_cmp_lt_f64_e32 vcc, v[4:5], v[10:11] -; GFX8-NEXT: v_cndmask_b32_e64 v0, v0, v6, s[4:5] -; GFX8-NEXT: v_cndmask_b32_e64 v1, v1, v7, s[4:5] -; GFX8-NEXT: v_cndmask_b32_e64 v0, v12, v0, s[6:7] -; GFX8-NEXT: v_cndmask_b32_e64 v1, v13, v1, s[6:7] -; GFX8-NEXT: v_cmp_class_f64_e64 s[4:5], v[2:3], 32 -; GFX8-NEXT: v_cndmask_b32_e64 v7, v9, v3, s[8:9] -; GFX8-NEXT: v_cndmask_b32_e64 v6, v8, v2, s[8:9] -; GFX8-NEXT: v_cndmask_b32_e32 v13, v11, v5, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v12, v10, v4, vcc -; GFX8-NEXT: v_cmp_class_f64_e64 vcc, v[4:5], 32 -; GFX8-NEXT: v_cmp_class_f64_e64 s[6:7], v[8:9], 32 -; GFX8-NEXT: v_cmp_eq_f64_e64 s[8:9], 0, v[6:7] -; GFX8-NEXT: v_cmp_eq_f64_e64 s[12:13], 0, v[12:13] -; GFX8-NEXT: v_cndmask_b32_e64 v2, v6, v2, s[4:5] -; GFX8-NEXT: v_cndmask_b32_e64 v3, v7, v3, s[4:5] -; GFX8-NEXT: v_cndmask_b32_e32 v4, v12, v4, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v5, v13, v5, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v2, v2, v8, s[6:7] -; GFX8-NEXT: v_cndmask_b32_e64 v4, v4, v10, s[10:11] -; GFX8-NEXT: v_cndmask_b32_e64 v3, v3, v9, s[6:7] -; GFX8-NEXT: v_cndmask_b32_e64 v5, v5, v11, s[10:11] -; GFX8-NEXT: v_cndmask_b32_e64 v2, v6, v2, s[8:9] -; GFX8-NEXT: v_cndmask_b32_e64 v3, v7, v3, s[8:9] -; GFX8-NEXT: v_cndmask_b32_e64 v4, v12, v4, s[12:13] -; GFX8-NEXT: v_cndmask_b32_e64 v5, v13, v5, s[12:13] +; 
GFX8-NEXT: v_min_f64 v[0:1], v[0:1], v[6:7] +; GFX8-NEXT: v_min_f64 v[2:3], v[2:3], v[8:9] +; GFX8-NEXT: v_min_f64 v[4:5], v[4:5], v[10:11] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_minimum_v3f64__nnan: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_cmp_lt_f64_e32 vcc, v[0:1], v[6:7] -; GFX9-NEXT: v_cmp_class_f64_e64 s[4:5], v[6:7], 32 -; GFX9-NEXT: v_cmp_lt_f64_e64 s[8:9], v[2:3], v[8:9] -; GFX9-NEXT: v_cmp_class_f64_e64 s[10:11], v[10:11], 32 -; GFX9-NEXT: v_cndmask_b32_e32 v13, v7, v1, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v12, v6, v0, vcc -; GFX9-NEXT: v_cmp_class_f64_e64 vcc, v[0:1], 32 -; GFX9-NEXT: v_cmp_eq_f64_e64 s[6:7], 0, v[12:13] -; GFX9-NEXT: v_cndmask_b32_e32 v0, v12, v0, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v1, v13, v1, vcc -; GFX9-NEXT: v_cmp_lt_f64_e32 vcc, v[4:5], v[10:11] -; GFX9-NEXT: v_cndmask_b32_e64 v0, v0, v6, s[4:5] -; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, v7, s[4:5] -; GFX9-NEXT: v_cndmask_b32_e64 v0, v12, v0, s[6:7] -; GFX9-NEXT: v_cndmask_b32_e64 v1, v13, v1, s[6:7] -; GFX9-NEXT: v_cmp_class_f64_e64 s[4:5], v[2:3], 32 -; GFX9-NEXT: v_cndmask_b32_e64 v7, v9, v3, s[8:9] -; GFX9-NEXT: v_cndmask_b32_e64 v6, v8, v2, s[8:9] -; GFX9-NEXT: v_cndmask_b32_e32 v13, v11, v5, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v12, v10, v4, vcc -; GFX9-NEXT: v_cmp_class_f64_e64 vcc, v[4:5], 32 -; GFX9-NEXT: v_cmp_class_f64_e64 s[6:7], v[8:9], 32 -; GFX9-NEXT: v_cmp_eq_f64_e64 s[8:9], 0, v[6:7] -; GFX9-NEXT: v_cmp_eq_f64_e64 s[12:13], 0, v[12:13] -; GFX9-NEXT: v_cndmask_b32_e64 v2, v6, v2, s[4:5] -; GFX9-NEXT: v_cndmask_b32_e64 v3, v7, v3, s[4:5] -; GFX9-NEXT: v_cndmask_b32_e32 v4, v12, v4, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v5, v13, v5, vcc -; GFX9-NEXT: v_cndmask_b32_e64 v2, v2, v8, s[6:7] -; GFX9-NEXT: v_cndmask_b32_e64 v4, v4, v10, s[10:11] -; GFX9-NEXT: v_cndmask_b32_e64 v3, v3, v9, s[6:7] -; GFX9-NEXT: v_cndmask_b32_e64 v5, v5, v11, s[10:11] -; GFX9-NEXT: v_cndmask_b32_e64 v2, v6, v2, s[8:9] -; GFX9-NEXT: 
v_cndmask_b32_e64 v3, v7, v3, s[8:9] -; GFX9-NEXT: v_cndmask_b32_e64 v4, v12, v4, s[12:13] -; GFX9-NEXT: v_cndmask_b32_e64 v5, v13, v5, s[12:13] +; GFX9-NEXT: v_min_f64 v[0:1], v[0:1], v[6:7] +; GFX9-NEXT: v_min_f64 v[2:3], v[2:3], v[8:9] +; GFX9-NEXT: v_min_f64 v[4:5], v[4:5], v[10:11] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: v_minimum_v3f64__nnan: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_cmp_lt_f64_e32 vcc, v[0:1], v[6:7] -; GFX940-NEXT: v_cmp_class_f64_e64 s[0:1], v[6:7], 32 -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e32 v13, v7, v1, vcc -; GFX940-NEXT: v_cndmask_b32_e32 v12, v6, v0, vcc -; GFX940-NEXT: v_cmp_class_f64_e64 vcc, v[0:1], 32 -; GFX940-NEXT: v_cmp_eq_f64_e64 s[2:3], 0, v[12:13] -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e32 v0, v12, v0, vcc -; GFX940-NEXT: v_cndmask_b32_e32 v1, v13, v1, vcc -; GFX940-NEXT: v_cmp_lt_f64_e32 vcc, v[2:3], v[8:9] -; GFX940-NEXT: v_cndmask_b32_e64 v0, v0, v6, s[0:1] -; GFX940-NEXT: v_cndmask_b32_e64 v1, v1, v7, s[0:1] -; GFX940-NEXT: v_cndmask_b32_e32 v7, v9, v3, vcc -; GFX940-NEXT: v_cndmask_b32_e32 v6, v8, v2, vcc -; GFX940-NEXT: v_cmp_class_f64_e64 vcc, v[2:3], 32 -; GFX940-NEXT: v_cmp_class_f64_e64 s[0:1], v[8:9], 32 -; GFX940-NEXT: v_cndmask_b32_e64 v0, v12, v0, s[2:3] -; GFX940-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc -; GFX940-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc -; GFX940-NEXT: v_cndmask_b32_e64 v1, v13, v1, s[2:3] -; GFX940-NEXT: v_cndmask_b32_e64 v2, v2, v8, s[0:1] -; GFX940-NEXT: v_cmp_eq_f64_e64 s[2:3], 0, v[6:7] -; GFX940-NEXT: v_cndmask_b32_e64 v3, v3, v9, s[0:1] -; GFX940-NEXT: v_cmp_lt_f64_e32 vcc, v[4:5], v[10:11] -; GFX940-NEXT: v_cndmask_b32_e64 v2, v6, v2, s[2:3] -; GFX940-NEXT: v_cndmask_b32_e64 v3, v7, v3, s[2:3] -; GFX940-NEXT: v_cndmask_b32_e32 v7, v11, v5, vcc -; GFX940-NEXT: v_cndmask_b32_e32 v6, v10, v4, vcc -; GFX940-NEXT: v_cmp_class_f64_e64 vcc, v[4:5], 32 -; GFX940-NEXT: v_cmp_class_f64_e64 s[0:1], 
v[10:11], 32 -; GFX940-NEXT: v_cmp_eq_f64_e64 s[2:3], 0, v[6:7] -; GFX940-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc -; GFX940-NEXT: v_cndmask_b32_e32 v5, v7, v5, vcc -; GFX940-NEXT: v_cndmask_b32_e64 v4, v4, v10, s[0:1] -; GFX940-NEXT: v_cndmask_b32_e64 v5, v5, v11, s[0:1] -; GFX940-NEXT: v_cndmask_b32_e64 v4, v6, v4, s[2:3] -; GFX940-NEXT: v_cndmask_b32_e64 v5, v7, v5, s[2:3] +; GFX940-NEXT: v_min_f64 v[0:1], v[0:1], v[6:7] +; GFX940-NEXT: v_min_f64 v[2:3], v[2:3], v[8:9] +; GFX940-NEXT: v_min_f64 v[4:5], v[4:5], v[10:11] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_minimum_v3f64__nnan: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_cmp_lt_f64_e32 vcc_lo, v[0:1], v[6:7] -; GFX10-NEXT: v_cmp_lt_f64_e64 s4, v[2:3], v[8:9] -; GFX10-NEXT: v_cmp_lt_f64_e64 s5, v[4:5], v[10:11] -; GFX10-NEXT: v_cmp_class_f64_e64 s6, v[8:9], 32 -; GFX10-NEXT: v_cmp_class_f64_e64 s7, v[6:7], 32 -; GFX10-NEXT: v_cmp_class_f64_e64 s8, v[10:11], 32 -; GFX10-NEXT: v_cndmask_b32_e32 v13, v7, v1, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v15, v9, v3, s4 -; GFX10-NEXT: v_cndmask_b32_e32 v12, v6, v0, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v17, v11, v5, s5 -; GFX10-NEXT: v_cndmask_b32_e64 v14, v8, v2, s4 -; GFX10-NEXT: v_cndmask_b32_e64 v16, v10, v4, s5 -; GFX10-NEXT: v_cmp_class_f64_e64 vcc_lo, v[0:1], 32 -; GFX10-NEXT: v_cmp_class_f64_e64 s4, v[2:3], 32 -; GFX10-NEXT: v_cmp_class_f64_e64 s5, v[4:5], 32 -; GFX10-NEXT: v_cmp_eq_f64_e64 s9, 0, v[12:13] -; GFX10-NEXT: v_cmp_eq_f64_e64 s10, 0, v[14:15] -; GFX10-NEXT: v_cmp_eq_f64_e64 s11, 0, v[16:17] -; GFX10-NEXT: v_cndmask_b32_e32 v0, v12, v0, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v2, v14, v2, s4 -; GFX10-NEXT: v_cndmask_b32_e64 v4, v16, v4, s5 -; GFX10-NEXT: v_cndmask_b32_e32 v1, v13, v1, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v3, v15, v3, s4 -; GFX10-NEXT: v_cndmask_b32_e64 v5, v17, v5, s5 -; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, v6, s7 -; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, v8, s6 -; 
GFX10-NEXT: v_cndmask_b32_e64 v4, v4, v10, s8 -; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, v7, s7 -; GFX10-NEXT: v_cndmask_b32_e64 v3, v3, v9, s6 -; GFX10-NEXT: v_cndmask_b32_e64 v5, v5, v11, s8 -; GFX10-NEXT: v_cndmask_b32_e64 v0, v12, v0, s9 -; GFX10-NEXT: v_cndmask_b32_e64 v2, v14, v2, s10 -; GFX10-NEXT: v_cndmask_b32_e64 v4, v16, v4, s11 -; GFX10-NEXT: v_cndmask_b32_e64 v1, v13, v1, s9 -; GFX10-NEXT: v_cndmask_b32_e64 v3, v15, v3, s10 -; GFX10-NEXT: v_cndmask_b32_e64 v5, v17, v5, s11 +; GFX10-NEXT: v_min_f64 v[0:1], v[0:1], v[6:7] +; GFX10-NEXT: v_min_f64 v[2:3], v[2:3], v[8:9] +; GFX10-NEXT: v_min_f64 v[4:5], v[4:5], v[10:11] ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: v_minimum_v3f64__nnan: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_cmp_lt_f64_e32 vcc_lo, v[0:1], v[6:7] -; GFX11-NEXT: v_cmp_lt_f64_e64 s0, v[2:3], v[8:9] -; GFX11-NEXT: v_cmp_lt_f64_e64 s1, v[4:5], v[10:11] -; GFX11-NEXT: v_cmp_class_f64_e64 s2, v[8:9], 32 -; GFX11-NEXT: v_cmp_class_f64_e64 s3, v[6:7], 32 -; GFX11-NEXT: v_cmp_class_f64_e64 s4, v[10:11], 32 -; GFX11-NEXT: v_dual_cndmask_b32 v13, v7, v1 :: v_dual_cndmask_b32 v12, v6, v0 -; GFX11-NEXT: v_cndmask_b32_e64 v15, v9, v3, s0 -; GFX11-NEXT: v_cndmask_b32_e64 v17, v11, v5, s1 -; GFX11-NEXT: v_cndmask_b32_e64 v14, v8, v2, s0 -; GFX11-NEXT: v_cndmask_b32_e64 v16, v10, v4, s1 -; GFX11-NEXT: v_cmp_class_f64_e64 vcc_lo, v[0:1], 32 -; GFX11-NEXT: v_cmp_class_f64_e64 s0, v[2:3], 32 -; GFX11-NEXT: v_cmp_class_f64_e64 s1, v[4:5], 32 -; GFX11-NEXT: v_cmp_eq_f64_e64 s5, 0, v[12:13] -; GFX11-NEXT: v_cmp_eq_f64_e64 s6, 0, v[14:15] -; GFX11-NEXT: v_cmp_eq_f64_e64 s7, 0, v[16:17] -; GFX11-NEXT: v_dual_cndmask_b32 v0, v12, v0 :: v_dual_cndmask_b32 v1, v13, v1 -; GFX11-NEXT: v_cndmask_b32_e64 v2, v14, v2, s0 -; GFX11-NEXT: v_cndmask_b32_e64 v4, v16, v4, s1 -; GFX11-NEXT: v_cndmask_b32_e64 v3, v15, v3, s0 -; GFX11-NEXT: v_cndmask_b32_e64 v5, v17, v5, s1 -; GFX11-NEXT: v_cndmask_b32_e64 v0, v0, v6, s3 -; 
GFX11-NEXT: v_cndmask_b32_e64 v2, v2, v8, s2 -; GFX11-NEXT: v_cndmask_b32_e64 v4, v4, v10, s4 -; GFX11-NEXT: v_cndmask_b32_e64 v1, v1, v7, s3 -; GFX11-NEXT: v_cndmask_b32_e64 v3, v3, v9, s2 -; GFX11-NEXT: v_cndmask_b32_e64 v5, v5, v11, s4 -; GFX11-NEXT: v_cndmask_b32_e64 v0, v12, v0, s5 -; GFX11-NEXT: v_cndmask_b32_e64 v2, v14, v2, s6 -; GFX11-NEXT: v_cndmask_b32_e64 v4, v16, v4, s7 -; GFX11-NEXT: v_cndmask_b32_e64 v1, v13, v1, s5 -; GFX11-NEXT: v_cndmask_b32_e64 v3, v15, v3, s6 -; GFX11-NEXT: v_cndmask_b32_e64 v5, v17, v5, s7 +; GFX11-NEXT: v_min_f64 v[0:1], v[0:1], v[6:7] +; GFX11-NEXT: v_min_f64 v[2:3], v[2:3], v[8:9] +; GFX11-NEXT: v_min_f64 v[4:5], v[4:5], v[10:11] ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-LABEL: v_minimum_v3f64__nnan: @@ -2137,144 +1168,110 @@ define <3 x double> @v_minimum_v3f64__nsz(<3 x double> %src0, <3 x double> %src1 ; GFX7-LABEL: v_minimum_v3f64__nsz: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_cmp_lt_f64_e32 vcc, v[0:1], v[6:7] -; GFX7-NEXT: v_cmp_o_f64_e64 s[4:5], v[0:1], v[6:7] -; GFX7-NEXT: v_cmp_lt_f64_e64 s[8:9], v[4:5], v[10:11] -; GFX7-NEXT: v_cmp_o_f64_e64 s[6:7], v[2:3], v[8:9] -; GFX7-NEXT: v_cmp_o_f64_e64 s[10:11], v[4:5], v[10:11] -; GFX7-NEXT: v_cndmask_b32_e32 v12, v6, v0, vcc -; GFX7-NEXT: v_cndmask_b32_e32 v1, v7, v1, vcc -; GFX7-NEXT: v_cmp_lt_f64_e32 vcc, v[2:3], v[8:9] -; GFX7-NEXT: v_cndmask_b32_e64 v7, v10, v4, s[8:9] -; GFX7-NEXT: v_cndmask_b32_e64 v5, v11, v5, s[8:9] -; GFX7-NEXT: v_cndmask_b32_e64 v0, 0, v12, s[4:5] -; GFX7-NEXT: v_cndmask_b32_e64 v4, 0, v7, s[10:11] -; GFX7-NEXT: v_cndmask_b32_e32 v6, v8, v2, vcc -; GFX7-NEXT: v_mov_b32_e32 v8, 0x7ff80000 -; GFX7-NEXT: v_cndmask_b32_e32 v3, v9, v3, vcc -; GFX7-NEXT: v_cndmask_b32_e64 v1, v8, v1, s[4:5] -; GFX7-NEXT: v_cndmask_b32_e64 v2, 0, v6, s[6:7] -; GFX7-NEXT: v_cndmask_b32_e64 v3, v8, v3, s[6:7] -; GFX7-NEXT: v_cndmask_b32_e64 v5, v8, v5, s[10:11] +; GFX7-NEXT: v_min_f64 v[12:13], v[0:1], v[6:7] +; GFX7-NEXT: 
v_cmp_u_f64_e32 vcc, v[0:1], v[6:7] +; GFX7-NEXT: v_min_f64 v[6:7], v[2:3], v[8:9] +; GFX7-NEXT: v_cmp_u_f64_e64 s[4:5], v[2:3], v[8:9] +; GFX7-NEXT: v_min_f64 v[8:9], v[4:5], v[10:11] +; GFX7-NEXT: v_cmp_u_f64_e64 s[6:7], v[4:5], v[10:11] +; GFX7-NEXT: v_mov_b32_e32 v5, 0x7ff80000 +; GFX7-NEXT: v_cndmask_b32_e64 v0, v12, 0, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v1, v13, v5, vcc +; GFX7-NEXT: v_cndmask_b32_e64 v2, v6, 0, s[4:5] +; GFX7-NEXT: v_cndmask_b32_e64 v3, v7, v5, s[4:5] +; GFX7-NEXT: v_cndmask_b32_e64 v4, v8, 0, s[6:7] +; GFX7-NEXT: v_cndmask_b32_e64 v5, v9, v5, s[6:7] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_minimum_v3f64__nsz: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_cmp_lt_f64_e32 vcc, v[0:1], v[6:7] -; GFX8-NEXT: v_cmp_o_f64_e64 s[4:5], v[0:1], v[6:7] -; GFX8-NEXT: v_cmp_lt_f64_e64 s[8:9], v[4:5], v[10:11] -; GFX8-NEXT: v_cmp_o_f64_e64 s[6:7], v[2:3], v[8:9] -; GFX8-NEXT: v_cmp_o_f64_e64 s[10:11], v[4:5], v[10:11] -; GFX8-NEXT: v_cndmask_b32_e32 v12, v6, v0, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v1, v7, v1, vcc -; GFX8-NEXT: v_cmp_lt_f64_e32 vcc, v[2:3], v[8:9] -; GFX8-NEXT: v_cndmask_b32_e64 v7, v10, v4, s[8:9] -; GFX8-NEXT: v_cndmask_b32_e64 v5, v11, v5, s[8:9] -; GFX8-NEXT: v_cndmask_b32_e64 v0, 0, v12, s[4:5] -; GFX8-NEXT: v_cndmask_b32_e64 v4, 0, v7, s[10:11] -; GFX8-NEXT: v_cndmask_b32_e32 v6, v8, v2, vcc -; GFX8-NEXT: v_mov_b32_e32 v8, 0x7ff80000 -; GFX8-NEXT: v_cndmask_b32_e32 v3, v9, v3, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v1, v8, v1, s[4:5] -; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, v6, s[6:7] -; GFX8-NEXT: v_cndmask_b32_e64 v3, v8, v3, s[6:7] -; GFX8-NEXT: v_cndmask_b32_e64 v5, v8, v5, s[10:11] +; GFX8-NEXT: v_min_f64 v[12:13], v[0:1], v[6:7] +; GFX8-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[6:7] +; GFX8-NEXT: v_min_f64 v[6:7], v[2:3], v[8:9] +; GFX8-NEXT: v_cmp_u_f64_e64 s[4:5], v[2:3], v[8:9] +; GFX8-NEXT: v_min_f64 v[8:9], v[4:5], v[10:11] +; GFX8-NEXT: v_cmp_u_f64_e64 s[6:7], v[4:5], v[10:11] +; 
GFX8-NEXT: v_mov_b32_e32 v5, 0x7ff80000 +; GFX8-NEXT: v_cndmask_b32_e64 v0, v12, 0, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v1, v13, v5, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v2, v6, 0, s[4:5] +; GFX8-NEXT: v_cndmask_b32_e64 v3, v7, v5, s[4:5] +; GFX8-NEXT: v_cndmask_b32_e64 v4, v8, 0, s[6:7] +; GFX8-NEXT: v_cndmask_b32_e64 v5, v9, v5, s[6:7] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_minimum_v3f64__nsz: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_cmp_lt_f64_e32 vcc, v[0:1], v[6:7] -; GFX9-NEXT: v_cmp_o_f64_e64 s[4:5], v[0:1], v[6:7] -; GFX9-NEXT: v_cmp_lt_f64_e64 s[8:9], v[4:5], v[10:11] -; GFX9-NEXT: v_cmp_o_f64_e64 s[6:7], v[2:3], v[8:9] -; GFX9-NEXT: v_cmp_o_f64_e64 s[10:11], v[4:5], v[10:11] -; GFX9-NEXT: v_cndmask_b32_e32 v12, v6, v0, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v1, v7, v1, vcc -; GFX9-NEXT: v_cmp_lt_f64_e32 vcc, v[2:3], v[8:9] -; GFX9-NEXT: v_cndmask_b32_e64 v7, v10, v4, s[8:9] -; GFX9-NEXT: v_cndmask_b32_e64 v5, v11, v5, s[8:9] -; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, v12, s[4:5] -; GFX9-NEXT: v_cndmask_b32_e64 v4, 0, v7, s[10:11] -; GFX9-NEXT: v_cndmask_b32_e32 v6, v8, v2, vcc -; GFX9-NEXT: v_mov_b32_e32 v8, 0x7ff80000 -; GFX9-NEXT: v_cndmask_b32_e32 v3, v9, v3, vcc -; GFX9-NEXT: v_cndmask_b32_e64 v1, v8, v1, s[4:5] -; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, v6, s[6:7] -; GFX9-NEXT: v_cndmask_b32_e64 v3, v8, v3, s[6:7] -; GFX9-NEXT: v_cndmask_b32_e64 v5, v8, v5, s[10:11] +; GFX9-NEXT: v_min_f64 v[12:13], v[0:1], v[6:7] +; GFX9-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[6:7] +; GFX9-NEXT: v_min_f64 v[6:7], v[2:3], v[8:9] +; GFX9-NEXT: v_cmp_u_f64_e64 s[4:5], v[2:3], v[8:9] +; GFX9-NEXT: v_min_f64 v[8:9], v[4:5], v[10:11] +; GFX9-NEXT: v_cmp_u_f64_e64 s[6:7], v[4:5], v[10:11] +; GFX9-NEXT: v_mov_b32_e32 v5, 0x7ff80000 +; GFX9-NEXT: v_cndmask_b32_e64 v0, v12, 0, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v1, v13, v5, vcc +; GFX9-NEXT: v_cndmask_b32_e64 v2, v6, 0, s[4:5] +; GFX9-NEXT: v_cndmask_b32_e64 v3, v7, v5, s[4:5] +; 
GFX9-NEXT: v_cndmask_b32_e64 v4, v8, 0, s[6:7] +; GFX9-NEXT: v_cndmask_b32_e64 v5, v9, v5, s[6:7] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: v_minimum_v3f64__nsz: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_cmp_lt_f64_e32 vcc, v[0:1], v[6:7] -; GFX940-NEXT: v_cmp_o_f64_e64 s[0:1], v[0:1], v[6:7] -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e32 v12, v6, v0, vcc -; GFX940-NEXT: v_cndmask_b32_e32 v1, v7, v1, vcc -; GFX940-NEXT: v_mov_b32_e32 v6, 0x7ff80000 -; GFX940-NEXT: v_cmp_lt_f64_e32 vcc, v[2:3], v[8:9] -; GFX940-NEXT: v_cndmask_b32_e64 v0, 0, v12, s[0:1] -; GFX940-NEXT: v_cndmask_b32_e64 v1, v6, v1, s[0:1] -; GFX940-NEXT: v_cndmask_b32_e32 v7, v8, v2, vcc -; GFX940-NEXT: v_cmp_o_f64_e64 s[0:1], v[2:3], v[8:9] -; GFX940-NEXT: v_cndmask_b32_e32 v3, v9, v3, vcc -; GFX940-NEXT: v_cmp_lt_f64_e32 vcc, v[4:5], v[10:11] -; GFX940-NEXT: v_cndmask_b32_e64 v2, 0, v7, s[0:1] -; GFX940-NEXT: v_cndmask_b32_e64 v3, v6, v3, s[0:1] -; GFX940-NEXT: v_cndmask_b32_e32 v7, v10, v4, vcc -; GFX940-NEXT: v_cmp_o_f64_e64 s[0:1], v[4:5], v[10:11] -; GFX940-NEXT: v_cndmask_b32_e32 v5, v11, v5, vcc +; GFX940-NEXT: v_min_f64 v[12:13], v[0:1], v[6:7] +; GFX940-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[6:7] +; GFX940-NEXT: v_min_f64 v[6:7], v[2:3], v[8:9] ; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e64 v4, 0, v7, s[0:1] -; GFX940-NEXT: v_cndmask_b32_e64 v5, v6, v5, s[0:1] +; GFX940-NEXT: v_cndmask_b32_e64 v0, v12, 0, vcc +; GFX940-NEXT: v_mov_b32_e32 v12, 0x7ff80000 +; GFX940-NEXT: v_cndmask_b32_e32 v1, v13, v12, vcc +; GFX940-NEXT: v_cmp_u_f64_e32 vcc, v[2:3], v[8:9] +; GFX940-NEXT: s_nop 1 +; GFX940-NEXT: v_cndmask_b32_e64 v2, v6, 0, vcc +; GFX940-NEXT: v_cndmask_b32_e32 v3, v7, v12, vcc +; GFX940-NEXT: v_min_f64 v[6:7], v[4:5], v[10:11] +; GFX940-NEXT: v_cmp_u_f64_e32 vcc, v[4:5], v[10:11] +; GFX940-NEXT: s_nop 1 +; GFX940-NEXT: v_cndmask_b32_e64 v4, v6, 0, vcc +; GFX940-NEXT: v_cndmask_b32_e32 v5, v7, v12, vcc ; 
GFX940-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_minimum_v3f64__nsz: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_cmp_lt_f64_e32 vcc_lo, v[0:1], v[6:7] -; GFX10-NEXT: v_cmp_lt_f64_e64 s4, v[2:3], v[8:9] -; GFX10-NEXT: v_cmp_lt_f64_e64 s5, v[4:5], v[10:11] -; GFX10-NEXT: v_cmp_o_f64_e64 s6, v[0:1], v[6:7] -; GFX10-NEXT: v_cmp_o_f64_e64 s7, v[2:3], v[8:9] -; GFX10-NEXT: v_cmp_o_f64_e64 s8, v[4:5], v[10:11] -; GFX10-NEXT: v_cndmask_b32_e32 v12, v6, v0, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v6, v8, v2, s4 -; GFX10-NEXT: v_cndmask_b32_e64 v8, v10, v4, s5 -; GFX10-NEXT: v_cndmask_b32_e32 v1, v7, v1, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v3, v9, v3, s4 -; GFX10-NEXT: v_cndmask_b32_e64 v5, v11, v5, s5 -; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, v12, s6 -; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, v6, s7 -; GFX10-NEXT: v_cndmask_b32_e64 v4, 0, v8, s8 -; GFX10-NEXT: v_cndmask_b32_e64 v1, 0x7ff80000, v1, s6 -; GFX10-NEXT: v_cndmask_b32_e64 v3, 0x7ff80000, v3, s7 -; GFX10-NEXT: v_cndmask_b32_e64 v5, 0x7ff80000, v5, s8 +; GFX10-NEXT: v_min_f64 v[12:13], v[0:1], v[6:7] +; GFX10-NEXT: v_cmp_u_f64_e32 vcc_lo, v[0:1], v[6:7] +; GFX10-NEXT: v_min_f64 v[6:7], v[2:3], v[8:9] +; GFX10-NEXT: v_cmp_u_f64_e64 s4, v[2:3], v[8:9] +; GFX10-NEXT: v_min_f64 v[8:9], v[4:5], v[10:11] +; GFX10-NEXT: v_cmp_u_f64_e64 s5, v[4:5], v[10:11] +; GFX10-NEXT: v_cndmask_b32_e64 v0, v12, 0, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v1, v13, 0x7ff80000, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v2, v6, 0, s4 +; GFX10-NEXT: v_cndmask_b32_e64 v3, v7, 0x7ff80000, s4 +; GFX10-NEXT: v_cndmask_b32_e64 v4, v8, 0, s5 +; GFX10-NEXT: v_cndmask_b32_e64 v5, v9, 0x7ff80000, s5 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: v_minimum_v3f64__nsz: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_cmp_lt_f64_e32 vcc_lo, v[0:1], v[6:7] -; GFX11-NEXT: v_cmp_lt_f64_e64 s0, v[2:3], v[8:9] -; GFX11-NEXT: v_cmp_lt_f64_e64 s1, v[4:5], v[10:11] -; 
GFX11-NEXT: v_cmp_o_f64_e64 s2, v[0:1], v[6:7] -; GFX11-NEXT: v_cmp_o_f64_e64 s3, v[2:3], v[8:9] -; GFX11-NEXT: v_cmp_o_f64_e64 s4, v[4:5], v[10:11] -; GFX11-NEXT: v_dual_cndmask_b32 v12, v6, v0 :: v_dual_cndmask_b32 v1, v7, v1 -; GFX11-NEXT: v_cndmask_b32_e64 v6, v8, v2, s0 -; GFX11-NEXT: v_cndmask_b32_e64 v8, v10, v4, s1 -; GFX11-NEXT: v_cndmask_b32_e64 v3, v9, v3, s0 -; GFX11-NEXT: v_cndmask_b32_e64 v5, v11, v5, s1 -; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, v12, s2 -; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, v6, s3 -; GFX11-NEXT: v_cndmask_b32_e64 v4, 0, v8, s4 -; GFX11-NEXT: v_cndmask_b32_e64 v1, 0x7ff80000, v1, s2 -; GFX11-NEXT: v_cndmask_b32_e64 v3, 0x7ff80000, v3, s3 -; GFX11-NEXT: v_cndmask_b32_e64 v5, 0x7ff80000, v5, s4 +; GFX11-NEXT: v_min_f64 v[12:13], v[0:1], v[6:7] +; GFX11-NEXT: v_cmp_u_f64_e32 vcc_lo, v[0:1], v[6:7] +; GFX11-NEXT: v_min_f64 v[6:7], v[2:3], v[8:9] +; GFX11-NEXT: v_cmp_u_f64_e64 s0, v[2:3], v[8:9] +; GFX11-NEXT: v_min_f64 v[8:9], v[4:5], v[10:11] +; GFX11-NEXT: v_cmp_u_f64_e64 s1, v[4:5], v[10:11] +; GFX11-NEXT: v_cndmask_b32_e64 v0, v12, 0, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e64 v1, v13, 0x7ff80000, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e64 v2, v6, 0, s0 +; GFX11-NEXT: v_cndmask_b32_e64 v3, v7, 0x7ff80000, s0 +; GFX11-NEXT: v_cndmask_b32_e64 v4, v8, 0, s1 +; GFX11-NEXT: v_cndmask_b32_e64 v5, v9, 0x7ff80000, s1 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-LABEL: v_minimum_v3f64__nsz: @@ -2296,88 +1293,49 @@ define <3 x double> @v_minimum_v3f64__nnan_nsz(<3 x double> %src0, <3 x double> ; GFX7-LABEL: v_minimum_v3f64__nnan_nsz: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_cmp_lt_f64_e32 vcc, v[0:1], v[6:7] -; GFX7-NEXT: v_cmp_lt_f64_e64 s[4:5], v[2:3], v[8:9] -; GFX7-NEXT: v_cmp_lt_f64_e64 s[6:7], v[4:5], v[10:11] -; GFX7-NEXT: v_cndmask_b32_e32 v0, v6, v0, vcc -; GFX7-NEXT: v_cndmask_b32_e32 v1, v7, v1, vcc -; GFX7-NEXT: v_cndmask_b32_e64 v2, v8, v2, s[4:5] -; GFX7-NEXT: v_cndmask_b32_e64 v3, v9, v3, 
s[4:5] -; GFX7-NEXT: v_cndmask_b32_e64 v4, v10, v4, s[6:7] -; GFX7-NEXT: v_cndmask_b32_e64 v5, v11, v5, s[6:7] +; GFX7-NEXT: v_min_f64 v[0:1], v[0:1], v[6:7] +; GFX7-NEXT: v_min_f64 v[2:3], v[2:3], v[8:9] +; GFX7-NEXT: v_min_f64 v[4:5], v[4:5], v[10:11] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_minimum_v3f64__nnan_nsz: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_cmp_lt_f64_e32 vcc, v[0:1], v[6:7] -; GFX8-NEXT: v_cmp_lt_f64_e64 s[4:5], v[2:3], v[8:9] -; GFX8-NEXT: v_cmp_lt_f64_e64 s[6:7], v[4:5], v[10:11] -; GFX8-NEXT: v_cndmask_b32_e32 v0, v6, v0, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v1, v7, v1, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v2, v8, v2, s[4:5] -; GFX8-NEXT: v_cndmask_b32_e64 v3, v9, v3, s[4:5] -; GFX8-NEXT: v_cndmask_b32_e64 v4, v10, v4, s[6:7] -; GFX8-NEXT: v_cndmask_b32_e64 v5, v11, v5, s[6:7] +; GFX8-NEXT: v_min_f64 v[0:1], v[0:1], v[6:7] +; GFX8-NEXT: v_min_f64 v[2:3], v[2:3], v[8:9] +; GFX8-NEXT: v_min_f64 v[4:5], v[4:5], v[10:11] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_minimum_v3f64__nnan_nsz: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_cmp_lt_f64_e32 vcc, v[0:1], v[6:7] -; GFX9-NEXT: v_cmp_lt_f64_e64 s[4:5], v[2:3], v[8:9] -; GFX9-NEXT: v_cmp_lt_f64_e64 s[6:7], v[4:5], v[10:11] -; GFX9-NEXT: v_cndmask_b32_e32 v0, v6, v0, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v1, v7, v1, vcc -; GFX9-NEXT: v_cndmask_b32_e64 v2, v8, v2, s[4:5] -; GFX9-NEXT: v_cndmask_b32_e64 v3, v9, v3, s[4:5] -; GFX9-NEXT: v_cndmask_b32_e64 v4, v10, v4, s[6:7] -; GFX9-NEXT: v_cndmask_b32_e64 v5, v11, v5, s[6:7] +; GFX9-NEXT: v_min_f64 v[0:1], v[0:1], v[6:7] +; GFX9-NEXT: v_min_f64 v[2:3], v[2:3], v[8:9] +; GFX9-NEXT: v_min_f64 v[4:5], v[4:5], v[10:11] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: v_minimum_v3f64__nnan_nsz: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_cmp_lt_f64_e32 vcc, v[0:1], v[6:7] -; GFX940-NEXT: s_nop 1 -; 
GFX940-NEXT: v_cndmask_b32_e32 v0, v6, v0, vcc -; GFX940-NEXT: v_cndmask_b32_e32 v1, v7, v1, vcc -; GFX940-NEXT: v_cmp_lt_f64_e32 vcc, v[2:3], v[8:9] -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v2, v8, v2, vcc -; GFX940-NEXT: v_cndmask_b32_e32 v3, v9, v3, vcc -; GFX940-NEXT: v_cmp_lt_f64_e32 vcc, v[4:5], v[10:11] -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v4, v10, v4, vcc -; GFX940-NEXT: v_cndmask_b32_e32 v5, v11, v5, vcc +; GFX940-NEXT: v_min_f64 v[0:1], v[0:1], v[6:7] +; GFX940-NEXT: v_min_f64 v[2:3], v[2:3], v[8:9] +; GFX940-NEXT: v_min_f64 v[4:5], v[4:5], v[10:11] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_minimum_v3f64__nnan_nsz: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_cmp_lt_f64_e32 vcc_lo, v[0:1], v[6:7] -; GFX10-NEXT: v_cmp_lt_f64_e64 s4, v[2:3], v[8:9] -; GFX10-NEXT: v_cmp_lt_f64_e64 s5, v[4:5], v[10:11] -; GFX10-NEXT: v_cndmask_b32_e32 v0, v6, v0, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v2, v8, v2, s4 -; GFX10-NEXT: v_cndmask_b32_e64 v4, v10, v4, s5 -; GFX10-NEXT: v_cndmask_b32_e32 v1, v7, v1, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v3, v9, v3, s4 -; GFX10-NEXT: v_cndmask_b32_e64 v5, v11, v5, s5 +; GFX10-NEXT: v_min_f64 v[0:1], v[0:1], v[6:7] +; GFX10-NEXT: v_min_f64 v[2:3], v[2:3], v[8:9] +; GFX10-NEXT: v_min_f64 v[4:5], v[4:5], v[10:11] ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: v_minimum_v3f64__nnan_nsz: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_cmp_lt_f64_e32 vcc_lo, v[0:1], v[6:7] -; GFX11-NEXT: v_cmp_lt_f64_e64 s0, v[2:3], v[8:9] -; GFX11-NEXT: v_cmp_lt_f64_e64 s1, v[4:5], v[10:11] -; GFX11-NEXT: v_dual_cndmask_b32 v0, v6, v0 :: v_dual_cndmask_b32 v1, v7, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_cndmask_b32_e64 v2, v8, v2, s0 -; GFX11-NEXT: v_cndmask_b32_e64 v4, v10, v4, s1 -; GFX11-NEXT: v_cndmask_b32_e64 v3, v9, v3, s0 -; GFX11-NEXT: 
v_cndmask_b32_e64 v5, v11, v5, s1 +; GFX11-NEXT: v_min_f64 v[0:1], v[0:1], v[6:7] +; GFX11-NEXT: v_min_f64 v[2:3], v[2:3], v[8:9] +; GFX11-NEXT: v_min_f64 v[4:5], v[4:5], v[10:11] ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-LABEL: v_minimum_v3f64__nnan_nsz: @@ -2399,404 +1357,135 @@ define <4 x double> @v_minimum_v4f64(<4 x double> %src0, <4 x double> %src1) { ; GFX7-LABEL: v_minimum_v4f64: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_cmp_lt_f64_e32 vcc, v[0:1], v[8:9] -; GFX7-NEXT: v_cmp_o_f64_e64 s[4:5], v[0:1], v[8:9] -; GFX7-NEXT: v_mov_b32_e32 v18, 0x7ff80000 -; GFX7-NEXT: v_cmp_lt_f64_e64 s[6:7], v[2:3], v[10:11] -; GFX7-NEXT: v_cmp_o_f64_e64 s[8:9], v[2:3], v[10:11] -; GFX7-NEXT: v_cmp_o_f64_e64 s[12:13], v[4:5], v[12:13] -; GFX7-NEXT: v_cndmask_b32_e32 v16, v9, v1, vcc -; GFX7-NEXT: v_cndmask_b32_e64 v17, v18, v16, s[4:5] -; GFX7-NEXT: v_cndmask_b32_e32 v16, v8, v0, vcc -; GFX7-NEXT: v_cmp_class_f64_e64 vcc, v[0:1], 32 -; GFX7-NEXT: v_cndmask_b32_e64 v16, 0, v16, s[4:5] -; GFX7-NEXT: v_cmp_class_f64_e64 s[4:5], v[8:9], 32 -; GFX7-NEXT: v_cndmask_b32_e64 v19, v11, v3, s[6:7] -; GFX7-NEXT: v_cmp_eq_f64_e64 s[10:11], 0, v[16:17] -; GFX7-NEXT: v_cndmask_b32_e32 v0, v16, v0, vcc -; GFX7-NEXT: v_cndmask_b32_e32 v1, v17, v1, vcc -; GFX7-NEXT: v_cmp_class_f64_e64 vcc, v[2:3], 32 -; GFX7-NEXT: v_cndmask_b32_e64 v0, v0, v8, s[4:5] -; GFX7-NEXT: v_cndmask_b32_e64 v1, v1, v9, s[4:5] -; GFX7-NEXT: v_cmp_class_f64_e64 s[4:5], v[10:11], 32 -; GFX7-NEXT: v_cndmask_b32_e64 v8, v10, v2, s[6:7] -; GFX7-NEXT: v_cndmask_b32_e64 v9, v18, v19, s[8:9] -; GFX7-NEXT: v_cndmask_b32_e64 v8, 0, v8, s[8:9] -; GFX7-NEXT: v_cmp_lt_f64_e64 s[8:9], v[4:5], v[12:13] -; GFX7-NEXT: v_cndmask_b32_e32 v2, v8, v2, vcc -; GFX7-NEXT: v_cmp_eq_f64_e64 s[6:7], 0, v[8:9] -; GFX7-NEXT: v_cndmask_b32_e32 v3, v9, v3, vcc -; GFX7-NEXT: v_cmp_lt_f64_e32 vcc, v[6:7], v[14:15] -; GFX7-NEXT: v_cndmask_b32_e64 v2, v2, v10, s[4:5] -; GFX7-NEXT: v_cndmask_b32_e64 v3, v3, 
v11, s[4:5] -; GFX7-NEXT: v_cmp_o_f64_e64 s[4:5], v[6:7], v[14:15] -; GFX7-NEXT: v_cndmask_b32_e64 v10, v13, v5, s[8:9] -; GFX7-NEXT: v_cndmask_b32_e64 v0, v16, v0, s[10:11] -; GFX7-NEXT: v_cndmask_b32_e64 v3, v9, v3, s[6:7] -; GFX7-NEXT: v_cndmask_b32_e64 v9, v18, v10, s[12:13] -; GFX7-NEXT: v_cndmask_b32_e32 v10, v15, v7, vcc -; GFX7-NEXT: v_cndmask_b32_e64 v2, v8, v2, s[6:7] -; GFX7-NEXT: v_cndmask_b32_e64 v8, v12, v4, s[8:9] -; GFX7-NEXT: v_cmp_class_f64_e64 s[6:7], v[4:5], 32 -; GFX7-NEXT: v_cndmask_b32_e64 v11, v18, v10, s[4:5] -; GFX7-NEXT: v_cndmask_b32_e32 v10, v14, v6, vcc -; GFX7-NEXT: v_cmp_class_f64_e64 vcc, v[6:7], 32 -; GFX7-NEXT: v_cndmask_b32_e64 v8, 0, v8, s[12:13] -; GFX7-NEXT: v_cmp_class_f64_e64 s[8:9], v[12:13], 32 -; GFX7-NEXT: v_cndmask_b32_e64 v10, 0, v10, s[4:5] -; GFX7-NEXT: v_cmp_class_f64_e64 s[4:5], v[14:15], 32 -; GFX7-NEXT: v_cndmask_b32_e64 v1, v17, v1, s[10:11] -; GFX7-NEXT: v_cmp_eq_f64_e64 s[10:11], 0, v[8:9] -; GFX7-NEXT: v_cmp_eq_f64_e64 s[12:13], 0, v[10:11] -; GFX7-NEXT: v_cndmask_b32_e64 v4, v8, v4, s[6:7] -; GFX7-NEXT: v_cndmask_b32_e32 v6, v10, v6, vcc -; GFX7-NEXT: v_cndmask_b32_e64 v5, v9, v5, s[6:7] -; GFX7-NEXT: v_cndmask_b32_e32 v7, v11, v7, vcc -; GFX7-NEXT: v_cndmask_b32_e64 v4, v4, v12, s[8:9] -; GFX7-NEXT: v_cndmask_b32_e64 v6, v6, v14, s[4:5] -; GFX7-NEXT: v_cndmask_b32_e64 v5, v5, v13, s[8:9] -; GFX7-NEXT: v_cndmask_b32_e64 v7, v7, v15, s[4:5] -; GFX7-NEXT: v_cndmask_b32_e64 v4, v8, v4, s[10:11] -; GFX7-NEXT: v_cndmask_b32_e64 v5, v9, v5, s[10:11] -; GFX7-NEXT: v_cndmask_b32_e64 v6, v10, v6, s[12:13] -; GFX7-NEXT: v_cndmask_b32_e64 v7, v11, v7, s[12:13] +; GFX7-NEXT: v_min_f64 v[16:17], v[0:1], v[8:9] +; GFX7-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[8:9] +; GFX7-NEXT: v_min_f64 v[8:9], v[2:3], v[10:11] +; GFX7-NEXT: v_cmp_u_f64_e64 s[4:5], v[2:3], v[10:11] +; GFX7-NEXT: v_min_f64 v[10:11], v[4:5], v[12:13] +; GFX7-NEXT: v_cmp_u_f64_e64 s[6:7], v[4:5], v[12:13] +; GFX7-NEXT: v_min_f64 v[12:13], v[6:7], v[14:15] +; 
GFX7-NEXT: v_cmp_u_f64_e64 s[8:9], v[6:7], v[14:15] +; GFX7-NEXT: v_mov_b32_e32 v7, 0x7ff80000 +; GFX7-NEXT: v_cndmask_b32_e64 v0, v16, 0, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v1, v17, v7, vcc +; GFX7-NEXT: v_cndmask_b32_e64 v2, v8, 0, s[4:5] +; GFX7-NEXT: v_cndmask_b32_e64 v3, v9, v7, s[4:5] +; GFX7-NEXT: v_cndmask_b32_e64 v4, v10, 0, s[6:7] +; GFX7-NEXT: v_cndmask_b32_e64 v5, v11, v7, s[6:7] +; GFX7-NEXT: v_cndmask_b32_e64 v6, v12, 0, s[8:9] +; GFX7-NEXT: v_cndmask_b32_e64 v7, v13, v7, s[8:9] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_minimum_v4f64: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_cmp_lt_f64_e32 vcc, v[0:1], v[8:9] -; GFX8-NEXT: v_cmp_o_f64_e64 s[4:5], v[0:1], v[8:9] -; GFX8-NEXT: v_mov_b32_e32 v18, 0x7ff80000 -; GFX8-NEXT: v_cmp_lt_f64_e64 s[6:7], v[2:3], v[10:11] -; GFX8-NEXT: v_cmp_o_f64_e64 s[8:9], v[2:3], v[10:11] -; GFX8-NEXT: v_cmp_o_f64_e64 s[12:13], v[4:5], v[12:13] -; GFX8-NEXT: v_cndmask_b32_e32 v16, v9, v1, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v17, v18, v16, s[4:5] -; GFX8-NEXT: v_cndmask_b32_e32 v16, v8, v0, vcc -; GFX8-NEXT: v_cmp_class_f64_e64 vcc, v[0:1], 32 -; GFX8-NEXT: v_cndmask_b32_e64 v16, 0, v16, s[4:5] -; GFX8-NEXT: v_cmp_class_f64_e64 s[4:5], v[8:9], 32 -; GFX8-NEXT: v_cndmask_b32_e64 v19, v11, v3, s[6:7] -; GFX8-NEXT: v_cmp_eq_f64_e64 s[10:11], 0, v[16:17] -; GFX8-NEXT: v_cndmask_b32_e32 v0, v16, v0, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v1, v17, v1, vcc -; GFX8-NEXT: v_cmp_class_f64_e64 vcc, v[2:3], 32 -; GFX8-NEXT: v_cndmask_b32_e64 v0, v0, v8, s[4:5] -; GFX8-NEXT: v_cndmask_b32_e64 v1, v1, v9, s[4:5] -; GFX8-NEXT: v_cmp_class_f64_e64 s[4:5], v[10:11], 32 -; GFX8-NEXT: v_cndmask_b32_e64 v8, v10, v2, s[6:7] -; GFX8-NEXT: v_cndmask_b32_e64 v9, v18, v19, s[8:9] -; GFX8-NEXT: v_cndmask_b32_e64 v8, 0, v8, s[8:9] -; GFX8-NEXT: v_cmp_lt_f64_e64 s[8:9], v[4:5], v[12:13] -; GFX8-NEXT: v_cndmask_b32_e32 v2, v8, v2, vcc -; GFX8-NEXT: v_cmp_eq_f64_e64 s[6:7], 0, v[8:9] -; GFX8-NEXT: 
v_cndmask_b32_e32 v3, v9, v3, vcc -; GFX8-NEXT: v_cmp_lt_f64_e32 vcc, v[6:7], v[14:15] -; GFX8-NEXT: v_cndmask_b32_e64 v2, v2, v10, s[4:5] -; GFX8-NEXT: v_cndmask_b32_e64 v3, v3, v11, s[4:5] -; GFX8-NEXT: v_cmp_o_f64_e64 s[4:5], v[6:7], v[14:15] -; GFX8-NEXT: v_cndmask_b32_e64 v10, v13, v5, s[8:9] -; GFX8-NEXT: v_cndmask_b32_e64 v0, v16, v0, s[10:11] -; GFX8-NEXT: v_cndmask_b32_e64 v3, v9, v3, s[6:7] -; GFX8-NEXT: v_cndmask_b32_e64 v9, v18, v10, s[12:13] -; GFX8-NEXT: v_cndmask_b32_e32 v10, v15, v7, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v2, v8, v2, s[6:7] -; GFX8-NEXT: v_cndmask_b32_e64 v8, v12, v4, s[8:9] -; GFX8-NEXT: v_cmp_class_f64_e64 s[6:7], v[4:5], 32 -; GFX8-NEXT: v_cndmask_b32_e64 v11, v18, v10, s[4:5] -; GFX8-NEXT: v_cndmask_b32_e32 v10, v14, v6, vcc -; GFX8-NEXT: v_cmp_class_f64_e64 vcc, v[6:7], 32 -; GFX8-NEXT: v_cndmask_b32_e64 v8, 0, v8, s[12:13] -; GFX8-NEXT: v_cmp_class_f64_e64 s[8:9], v[12:13], 32 -; GFX8-NEXT: v_cndmask_b32_e64 v10, 0, v10, s[4:5] -; GFX8-NEXT: v_cmp_class_f64_e64 s[4:5], v[14:15], 32 -; GFX8-NEXT: v_cndmask_b32_e64 v1, v17, v1, s[10:11] -; GFX8-NEXT: v_cmp_eq_f64_e64 s[10:11], 0, v[8:9] -; GFX8-NEXT: v_cmp_eq_f64_e64 s[12:13], 0, v[10:11] -; GFX8-NEXT: v_cndmask_b32_e64 v4, v8, v4, s[6:7] -; GFX8-NEXT: v_cndmask_b32_e32 v6, v10, v6, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v5, v9, v5, s[6:7] -; GFX8-NEXT: v_cndmask_b32_e32 v7, v11, v7, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v4, v4, v12, s[8:9] -; GFX8-NEXT: v_cndmask_b32_e64 v6, v6, v14, s[4:5] -; GFX8-NEXT: v_cndmask_b32_e64 v5, v5, v13, s[8:9] -; GFX8-NEXT: v_cndmask_b32_e64 v7, v7, v15, s[4:5] -; GFX8-NEXT: v_cndmask_b32_e64 v4, v8, v4, s[10:11] -; GFX8-NEXT: v_cndmask_b32_e64 v5, v9, v5, s[10:11] -; GFX8-NEXT: v_cndmask_b32_e64 v6, v10, v6, s[12:13] -; GFX8-NEXT: v_cndmask_b32_e64 v7, v11, v7, s[12:13] +; GFX8-NEXT: v_min_f64 v[16:17], v[0:1], v[8:9] +; GFX8-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[8:9] +; GFX8-NEXT: v_min_f64 v[8:9], v[2:3], v[10:11] +; GFX8-NEXT: v_cmp_u_f64_e64 s[4:5], 
v[2:3], v[10:11] +; GFX8-NEXT: v_min_f64 v[10:11], v[4:5], v[12:13] +; GFX8-NEXT: v_cmp_u_f64_e64 s[6:7], v[4:5], v[12:13] +; GFX8-NEXT: v_min_f64 v[12:13], v[6:7], v[14:15] +; GFX8-NEXT: v_cmp_u_f64_e64 s[8:9], v[6:7], v[14:15] +; GFX8-NEXT: v_mov_b32_e32 v7, 0x7ff80000 +; GFX8-NEXT: v_cndmask_b32_e64 v0, v16, 0, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v1, v17, v7, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v2, v8, 0, s[4:5] +; GFX8-NEXT: v_cndmask_b32_e64 v3, v9, v7, s[4:5] +; GFX8-NEXT: v_cndmask_b32_e64 v4, v10, 0, s[6:7] +; GFX8-NEXT: v_cndmask_b32_e64 v5, v11, v7, s[6:7] +; GFX8-NEXT: v_cndmask_b32_e64 v6, v12, 0, s[8:9] +; GFX8-NEXT: v_cndmask_b32_e64 v7, v13, v7, s[8:9] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_minimum_v4f64: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_cmp_lt_f64_e32 vcc, v[0:1], v[8:9] -; GFX9-NEXT: v_cmp_o_f64_e64 s[4:5], v[0:1], v[8:9] -; GFX9-NEXT: v_mov_b32_e32 v18, 0x7ff80000 -; GFX9-NEXT: v_cmp_lt_f64_e64 s[6:7], v[2:3], v[10:11] -; GFX9-NEXT: v_cmp_o_f64_e64 s[8:9], v[2:3], v[10:11] -; GFX9-NEXT: v_cmp_o_f64_e64 s[12:13], v[4:5], v[12:13] -; GFX9-NEXT: v_cndmask_b32_e32 v16, v9, v1, vcc -; GFX9-NEXT: v_cndmask_b32_e64 v17, v18, v16, s[4:5] -; GFX9-NEXT: v_cndmask_b32_e32 v16, v8, v0, vcc -; GFX9-NEXT: v_cmp_class_f64_e64 vcc, v[0:1], 32 -; GFX9-NEXT: v_cndmask_b32_e64 v16, 0, v16, s[4:5] -; GFX9-NEXT: v_cmp_class_f64_e64 s[4:5], v[8:9], 32 -; GFX9-NEXT: v_cndmask_b32_e64 v19, v11, v3, s[6:7] -; GFX9-NEXT: v_cmp_eq_f64_e64 s[10:11], 0, v[16:17] -; GFX9-NEXT: v_cndmask_b32_e32 v0, v16, v0, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v1, v17, v1, vcc -; GFX9-NEXT: v_cmp_class_f64_e64 vcc, v[2:3], 32 -; GFX9-NEXT: v_cndmask_b32_e64 v0, v0, v8, s[4:5] -; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, v9, s[4:5] -; GFX9-NEXT: v_cmp_class_f64_e64 s[4:5], v[10:11], 32 -; GFX9-NEXT: v_cndmask_b32_e64 v8, v10, v2, s[6:7] -; GFX9-NEXT: v_cndmask_b32_e64 v9, v18, v19, s[8:9] -; GFX9-NEXT: v_cndmask_b32_e64 v8, 0, v8, 
s[8:9] -; GFX9-NEXT: v_cmp_lt_f64_e64 s[8:9], v[4:5], v[12:13] -; GFX9-NEXT: v_cndmask_b32_e32 v2, v8, v2, vcc -; GFX9-NEXT: v_cmp_eq_f64_e64 s[6:7], 0, v[8:9] -; GFX9-NEXT: v_cndmask_b32_e32 v3, v9, v3, vcc -; GFX9-NEXT: v_cmp_lt_f64_e32 vcc, v[6:7], v[14:15] -; GFX9-NEXT: v_cndmask_b32_e64 v2, v2, v10, s[4:5] -; GFX9-NEXT: v_cndmask_b32_e64 v3, v3, v11, s[4:5] -; GFX9-NEXT: v_cmp_o_f64_e64 s[4:5], v[6:7], v[14:15] -; GFX9-NEXT: v_cndmask_b32_e64 v10, v13, v5, s[8:9] -; GFX9-NEXT: v_cndmask_b32_e64 v0, v16, v0, s[10:11] -; GFX9-NEXT: v_cndmask_b32_e64 v3, v9, v3, s[6:7] -; GFX9-NEXT: v_cndmask_b32_e64 v9, v18, v10, s[12:13] -; GFX9-NEXT: v_cndmask_b32_e32 v10, v15, v7, vcc -; GFX9-NEXT: v_cndmask_b32_e64 v2, v8, v2, s[6:7] -; GFX9-NEXT: v_cndmask_b32_e64 v8, v12, v4, s[8:9] -; GFX9-NEXT: v_cmp_class_f64_e64 s[6:7], v[4:5], 32 -; GFX9-NEXT: v_cndmask_b32_e64 v11, v18, v10, s[4:5] -; GFX9-NEXT: v_cndmask_b32_e32 v10, v14, v6, vcc -; GFX9-NEXT: v_cmp_class_f64_e64 vcc, v[6:7], 32 -; GFX9-NEXT: v_cndmask_b32_e64 v8, 0, v8, s[12:13] -; GFX9-NEXT: v_cmp_class_f64_e64 s[8:9], v[12:13], 32 -; GFX9-NEXT: v_cndmask_b32_e64 v10, 0, v10, s[4:5] -; GFX9-NEXT: v_cmp_class_f64_e64 s[4:5], v[14:15], 32 -; GFX9-NEXT: v_cndmask_b32_e64 v1, v17, v1, s[10:11] -; GFX9-NEXT: v_cmp_eq_f64_e64 s[10:11], 0, v[8:9] -; GFX9-NEXT: v_cmp_eq_f64_e64 s[12:13], 0, v[10:11] -; GFX9-NEXT: v_cndmask_b32_e64 v4, v8, v4, s[6:7] -; GFX9-NEXT: v_cndmask_b32_e32 v6, v10, v6, vcc -; GFX9-NEXT: v_cndmask_b32_e64 v5, v9, v5, s[6:7] -; GFX9-NEXT: v_cndmask_b32_e32 v7, v11, v7, vcc -; GFX9-NEXT: v_cndmask_b32_e64 v4, v4, v12, s[8:9] -; GFX9-NEXT: v_cndmask_b32_e64 v6, v6, v14, s[4:5] -; GFX9-NEXT: v_cndmask_b32_e64 v5, v5, v13, s[8:9] -; GFX9-NEXT: v_cndmask_b32_e64 v7, v7, v15, s[4:5] -; GFX9-NEXT: v_cndmask_b32_e64 v4, v8, v4, s[10:11] -; GFX9-NEXT: v_cndmask_b32_e64 v5, v9, v5, s[10:11] -; GFX9-NEXT: v_cndmask_b32_e64 v6, v10, v6, s[12:13] -; GFX9-NEXT: v_cndmask_b32_e64 v7, v11, v7, s[12:13] +; 
GFX9-NEXT: v_min_f64 v[16:17], v[0:1], v[8:9] +; GFX9-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[8:9] +; GFX9-NEXT: v_min_f64 v[8:9], v[2:3], v[10:11] +; GFX9-NEXT: v_cmp_u_f64_e64 s[4:5], v[2:3], v[10:11] +; GFX9-NEXT: v_min_f64 v[10:11], v[4:5], v[12:13] +; GFX9-NEXT: v_cmp_u_f64_e64 s[6:7], v[4:5], v[12:13] +; GFX9-NEXT: v_min_f64 v[12:13], v[6:7], v[14:15] +; GFX9-NEXT: v_cmp_u_f64_e64 s[8:9], v[6:7], v[14:15] +; GFX9-NEXT: v_mov_b32_e32 v7, 0x7ff80000 +; GFX9-NEXT: v_cndmask_b32_e64 v0, v16, 0, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v1, v17, v7, vcc +; GFX9-NEXT: v_cndmask_b32_e64 v2, v8, 0, s[4:5] +; GFX9-NEXT: v_cndmask_b32_e64 v3, v9, v7, s[4:5] +; GFX9-NEXT: v_cndmask_b32_e64 v4, v10, 0, s[6:7] +; GFX9-NEXT: v_cndmask_b32_e64 v5, v11, v7, s[6:7] +; GFX9-NEXT: v_cndmask_b32_e64 v6, v12, 0, s[8:9] +; GFX9-NEXT: v_cndmask_b32_e64 v7, v13, v7, s[8:9] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: v_minimum_v4f64: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_cmp_lt_f64_e32 vcc, v[0:1], v[8:9] -; GFX940-NEXT: v_mov_b32_e32 v18, 0x7ff80000 -; GFX940-NEXT: v_cmp_o_f64_e64 s[0:1], v[0:1], v[8:9] -; GFX940-NEXT: v_cndmask_b32_e32 v16, v9, v1, vcc -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e64 v17, v18, v16, s[0:1] -; GFX940-NEXT: v_cndmask_b32_e32 v16, v8, v0, vcc -; GFX940-NEXT: v_cndmask_b32_e64 v16, 0, v16, s[0:1] -; GFX940-NEXT: v_cmp_class_f64_e64 vcc, v[0:1], 32 -; GFX940-NEXT: v_cmp_class_f64_e64 s[0:1], v[8:9], 32 -; GFX940-NEXT: v_cmp_eq_f64_e64 s[2:3], 0, v[16:17] -; GFX940-NEXT: v_cndmask_b32_e32 v0, v16, v0, vcc -; GFX940-NEXT: v_cndmask_b32_e32 v1, v17, v1, vcc -; GFX940-NEXT: v_cmp_lt_f64_e32 vcc, v[2:3], v[10:11] -; GFX940-NEXT: v_cndmask_b32_e64 v0, v0, v8, s[0:1] -; GFX940-NEXT: v_cndmask_b32_e64 v1, v1, v9, s[0:1] -; GFX940-NEXT: v_cndmask_b32_e32 v8, v11, v3, vcc -; GFX940-NEXT: v_cmp_o_f64_e64 s[0:1], v[2:3], v[10:11] -; GFX940-NEXT: v_cndmask_b32_e64 v0, v16, v0, s[2:3] -; GFX940-NEXT: 
v_cndmask_b32_e64 v1, v17, v1, s[2:3] -; GFX940-NEXT: v_cndmask_b32_e64 v9, v18, v8, s[0:1] -; GFX940-NEXT: v_cndmask_b32_e32 v8, v10, v2, vcc -; GFX940-NEXT: v_cndmask_b32_e64 v8, 0, v8, s[0:1] -; GFX940-NEXT: v_cmp_class_f64_e64 vcc, v[2:3], 32 -; GFX940-NEXT: v_cmp_class_f64_e64 s[0:1], v[10:11], 32 -; GFX940-NEXT: v_cmp_eq_f64_e64 s[2:3], 0, v[8:9] -; GFX940-NEXT: v_cndmask_b32_e32 v2, v8, v2, vcc -; GFX940-NEXT: v_cndmask_b32_e64 v2, v2, v10, s[0:1] -; GFX940-NEXT: v_cndmask_b32_e32 v3, v9, v3, vcc -; GFX940-NEXT: v_cmp_lt_f64_e32 vcc, v[4:5], v[12:13] -; GFX940-NEXT: v_cndmask_b32_e64 v2, v8, v2, s[2:3] -; GFX940-NEXT: v_cndmask_b32_e64 v3, v3, v11, s[0:1] -; GFX940-NEXT: v_cndmask_b32_e32 v8, v13, v5, vcc -; GFX940-NEXT: v_cmp_o_f64_e64 s[0:1], v[4:5], v[12:13] -; GFX940-NEXT: v_cndmask_b32_e64 v3, v9, v3, s[2:3] +; GFX940-NEXT: v_min_f64 v[16:17], v[0:1], v[8:9] +; GFX940-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[8:9] +; GFX940-NEXT: v_min_f64 v[8:9], v[2:3], v[10:11] ; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e64 v9, v18, v8, s[0:1] -; GFX940-NEXT: v_cndmask_b32_e32 v8, v12, v4, vcc -; GFX940-NEXT: v_cndmask_b32_e64 v8, 0, v8, s[0:1] -; GFX940-NEXT: v_cmp_class_f64_e64 vcc, v[4:5], 32 -; GFX940-NEXT: v_cmp_class_f64_e64 s[0:1], v[12:13], 32 -; GFX940-NEXT: v_cmp_eq_f64_e64 s[2:3], 0, v[8:9] -; GFX940-NEXT: v_cndmask_b32_e32 v4, v8, v4, vcc -; GFX940-NEXT: v_cndmask_b32_e64 v4, v4, v12, s[0:1] -; GFX940-NEXT: v_cndmask_b32_e32 v5, v9, v5, vcc -; GFX940-NEXT: v_cmp_lt_f64_e32 vcc, v[6:7], v[14:15] -; GFX940-NEXT: v_cndmask_b32_e64 v4, v8, v4, s[2:3] -; GFX940-NEXT: v_cndmask_b32_e64 v5, v5, v13, s[0:1] -; GFX940-NEXT: v_cndmask_b32_e32 v8, v15, v7, vcc -; GFX940-NEXT: v_cmp_o_f64_e64 s[0:1], v[6:7], v[14:15] -; GFX940-NEXT: v_cndmask_b32_e64 v5, v9, v5, s[2:3] -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e64 v9, v18, v8, s[0:1] -; GFX940-NEXT: v_cndmask_b32_e32 v8, v14, v6, vcc -; GFX940-NEXT: v_cndmask_b32_e64 v8, 0, v8, s[0:1] -; GFX940-NEXT: 
v_cmp_class_f64_e64 vcc, v[6:7], 32 -; GFX940-NEXT: v_cmp_class_f64_e64 s[0:1], v[14:15], 32 -; GFX940-NEXT: v_cmp_eq_f64_e64 s[2:3], 0, v[8:9] -; GFX940-NEXT: v_cndmask_b32_e32 v6, v8, v6, vcc -; GFX940-NEXT: v_cndmask_b32_e32 v7, v9, v7, vcc -; GFX940-NEXT: v_cndmask_b32_e64 v6, v6, v14, s[0:1] -; GFX940-NEXT: v_cndmask_b32_e64 v7, v7, v15, s[0:1] -; GFX940-NEXT: v_cndmask_b32_e64 v6, v8, v6, s[2:3] -; GFX940-NEXT: v_cndmask_b32_e64 v7, v9, v7, s[2:3] +; GFX940-NEXT: v_cndmask_b32_e64 v0, v16, 0, vcc +; GFX940-NEXT: v_mov_b32_e32 v16, 0x7ff80000 +; GFX940-NEXT: v_cndmask_b32_e32 v1, v17, v16, vcc +; GFX940-NEXT: v_cmp_u_f64_e32 vcc, v[2:3], v[10:11] +; GFX940-NEXT: s_nop 1 +; GFX940-NEXT: v_cndmask_b32_e64 v2, v8, 0, vcc +; GFX940-NEXT: v_cndmask_b32_e32 v3, v9, v16, vcc +; GFX940-NEXT: v_min_f64 v[8:9], v[4:5], v[12:13] +; GFX940-NEXT: v_cmp_u_f64_e32 vcc, v[4:5], v[12:13] +; GFX940-NEXT: s_nop 1 +; GFX940-NEXT: v_cndmask_b32_e64 v4, v8, 0, vcc +; GFX940-NEXT: v_cndmask_b32_e32 v5, v9, v16, vcc +; GFX940-NEXT: v_min_f64 v[8:9], v[6:7], v[14:15] +; GFX940-NEXT: v_cmp_u_f64_e32 vcc, v[6:7], v[14:15] +; GFX940-NEXT: s_nop 1 +; GFX940-NEXT: v_cndmask_b32_e64 v6, v8, 0, vcc +; GFX940-NEXT: v_cndmask_b32_e32 v7, v9, v16, vcc ; GFX940-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_minimum_v4f64: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_cmp_lt_f64_e32 vcc_lo, v[0:1], v[8:9] -; GFX10-NEXT: v_cmp_lt_f64_e64 s4, v[2:3], v[10:11] -; GFX10-NEXT: v_cmp_o_f64_e64 s5, v[0:1], v[8:9] -; GFX10-NEXT: v_cmp_lt_f64_e64 s6, v[4:5], v[12:13] -; GFX10-NEXT: v_cmp_o_f64_e64 s7, v[2:3], v[10:11] -; GFX10-NEXT: v_cmp_lt_f64_e64 s8, v[6:7], v[14:15] -; GFX10-NEXT: v_cmp_o_f64_e64 s9, v[4:5], v[12:13] -; GFX10-NEXT: v_cmp_o_f64_e64 s10, v[6:7], v[14:15] -; GFX10-NEXT: v_cmp_class_f64_e64 s11, v[14:15], 32 -; GFX10-NEXT: v_cndmask_b32_e32 v16, v9, v1, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v18, v11, v3, s4 -; GFX10-NEXT: v_cndmask_b32_e32 v19, 
v8, v0, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v21, v10, v2, s4 -; GFX10-NEXT: v_cndmask_b32_e64 v20, v13, v5, s6 -; GFX10-NEXT: v_cndmask_b32_e64 v22, v15, v7, s8 -; GFX10-NEXT: v_cndmask_b32_e64 v17, 0x7ff80000, v16, s5 -; GFX10-NEXT: v_cndmask_b32_e64 v16, 0, v19, s5 -; GFX10-NEXT: v_cndmask_b32_e64 v19, 0x7ff80000, v18, s7 -; GFX10-NEXT: v_cndmask_b32_e64 v23, v12, v4, s6 -; GFX10-NEXT: v_cndmask_b32_e64 v18, 0, v21, s7 -; GFX10-NEXT: v_cmp_class_f64_e64 vcc_lo, v[0:1], 32 -; GFX10-NEXT: v_cndmask_b32_e64 v24, v14, v6, s8 -; GFX10-NEXT: v_cmp_class_f64_e64 s4, v[2:3], 32 -; GFX10-NEXT: v_cmp_class_f64_e64 s7, v[4:5], 32 -; GFX10-NEXT: v_cmp_class_f64_e64 s8, v[6:7], 32 -; GFX10-NEXT: v_cmp_class_f64_e64 s5, v[8:9], 32 -; GFX10-NEXT: v_cndmask_b32_e64 v21, 0x7ff80000, v20, s9 -; GFX10-NEXT: v_cndmask_b32_e64 v20, 0, v23, s9 -; GFX10-NEXT: v_cndmask_b32_e64 v23, 0x7ff80000, v22, s10 -; GFX10-NEXT: v_cndmask_b32_e64 v22, 0, v24, s10 -; GFX10-NEXT: v_cmp_class_f64_e64 s9, v[10:11], 32 -; GFX10-NEXT: v_cmp_class_f64_e64 s10, v[12:13], 32 -; GFX10-NEXT: v_cmp_eq_f64_e64 s6, 0, v[16:17] -; GFX10-NEXT: v_cmp_eq_f64_e64 s12, 0, v[18:19] -; GFX10-NEXT: v_cmp_eq_f64_e64 s13, 0, v[20:21] -; GFX10-NEXT: v_cmp_eq_f64_e64 s14, 0, v[22:23] -; GFX10-NEXT: v_cndmask_b32_e32 v0, v16, v0, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e32 v1, v17, v1, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v2, v18, v2, s4 -; GFX10-NEXT: v_cndmask_b32_e64 v4, v20, v4, s7 -; GFX10-NEXT: v_cndmask_b32_e64 v6, v22, v6, s8 -; GFX10-NEXT: v_cndmask_b32_e64 v3, v19, v3, s4 -; GFX10-NEXT: v_cndmask_b32_e64 v5, v21, v5, s7 -; GFX10-NEXT: v_cndmask_b32_e64 v7, v23, v7, s8 -; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, v8, s5 -; GFX10-NEXT: v_cndmask_b32_e64 v6, v6, v14, s11 -; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, v10, s9 -; GFX10-NEXT: v_cndmask_b32_e64 v4, v4, v12, s10 -; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, v9, s5 -; GFX10-NEXT: v_cndmask_b32_e64 v3, v3, v11, s9 -; GFX10-NEXT: v_cndmask_b32_e64 v5, v5, v13, s10 -; 
GFX10-NEXT: v_cndmask_b32_e64 v7, v7, v15, s11 -; GFX10-NEXT: v_cndmask_b32_e64 v0, v16, v0, s6 -; GFX10-NEXT: v_cndmask_b32_e64 v1, v17, v1, s6 -; GFX10-NEXT: v_cndmask_b32_e64 v2, v18, v2, s12 -; GFX10-NEXT: v_cndmask_b32_e64 v3, v19, v3, s12 -; GFX10-NEXT: v_cndmask_b32_e64 v4, v20, v4, s13 -; GFX10-NEXT: v_cndmask_b32_e64 v5, v21, v5, s13 -; GFX10-NEXT: v_cndmask_b32_e64 v6, v22, v6, s14 -; GFX10-NEXT: v_cndmask_b32_e64 v7, v23, v7, s14 +; GFX10-NEXT: v_min_f64 v[16:17], v[0:1], v[8:9] +; GFX10-NEXT: v_cmp_u_f64_e32 vcc_lo, v[0:1], v[8:9] +; GFX10-NEXT: v_min_f64 v[8:9], v[2:3], v[10:11] +; GFX10-NEXT: v_cmp_u_f64_e64 s4, v[2:3], v[10:11] +; GFX10-NEXT: v_min_f64 v[10:11], v[4:5], v[12:13] +; GFX10-NEXT: v_cmp_u_f64_e64 s5, v[4:5], v[12:13] +; GFX10-NEXT: v_min_f64 v[12:13], v[6:7], v[14:15] +; GFX10-NEXT: v_cmp_u_f64_e64 s6, v[6:7], v[14:15] +; GFX10-NEXT: v_cndmask_b32_e64 v0, v16, 0, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v1, v17, 0x7ff80000, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v2, v8, 0, s4 +; GFX10-NEXT: v_cndmask_b32_e64 v3, v9, 0x7ff80000, s4 +; GFX10-NEXT: v_cndmask_b32_e64 v4, v10, 0, s5 +; GFX10-NEXT: v_cndmask_b32_e64 v5, v11, 0x7ff80000, s5 +; GFX10-NEXT: v_cndmask_b32_e64 v6, v12, 0, s6 +; GFX10-NEXT: v_cndmask_b32_e64 v7, v13, 0x7ff80000, s6 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: v_minimum_v4f64: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_cmp_lt_f64_e32 vcc_lo, v[0:1], v[8:9] -; GFX11-NEXT: v_cmp_o_f64_e64 s0, v[0:1], v[8:9] -; GFX11-NEXT: v_cmp_lt_f64_e64 s1, v[2:3], v[10:11] -; GFX11-NEXT: v_cmp_lt_f64_e64 s2, v[4:5], v[12:13] -; GFX11-NEXT: v_cmp_lt_f64_e64 s3, v[6:7], v[14:15] -; GFX11-NEXT: v_cmp_o_f64_e64 s4, v[2:3], v[10:11] -; GFX11-NEXT: v_cmp_o_f64_e64 s5, v[4:5], v[12:13] -; GFX11-NEXT: v_cmp_o_f64_e64 s6, v[6:7], v[14:15] -; GFX11-NEXT: v_cndmask_b32_e32 v16, v9, v1, vcc_lo -; GFX11-NEXT: v_cndmask_b32_e64 v18, v11, v3, s1 -; GFX11-NEXT: v_cndmask_b32_e64 v20, v13, v5, s2 
-; GFX11-NEXT: v_cndmask_b32_e64 v22, v15, v7, s3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-NEXT: v_cndmask_b32_e64 v17, 0x7ff80000, v16, s0 -; GFX11-NEXT: v_cndmask_b32_e32 v16, v8, v0, vcc_lo -; GFX11-NEXT: v_cmp_class_f64_e64 vcc_lo, v[0:1], 32 -; GFX11-NEXT: v_cndmask_b32_e64 v19, 0x7ff80000, v18, s4 -; GFX11-NEXT: v_cndmask_b32_e64 v21, 0x7ff80000, v20, s5 -; GFX11-NEXT: v_cndmask_b32_e64 v18, v10, v2, s1 -; GFX11-NEXT: v_cndmask_b32_e64 v20, v12, v4, s2 -; GFX11-NEXT: v_cndmask_b32_e64 v24, v14, v6, s3 -; GFX11-NEXT: v_cmp_class_f64_e64 s1, v[4:5], 32 -; GFX11-NEXT: v_cmp_class_f64_e64 s3, v[6:7], 32 -; GFX11-NEXT: v_cndmask_b32_e64 v23, 0x7ff80000, v22, s6 -; GFX11-NEXT: v_cndmask_b32_e64 v18, 0, v18, s4 -; GFX11-NEXT: v_cndmask_b32_e64 v20, 0, v20, s5 -; GFX11-NEXT: v_cndmask_b32_e64 v22, 0, v24, s6 -; GFX11-NEXT: v_cmp_class_f64_e64 s2, v[12:13], 32 -; GFX11-NEXT: v_cmp_class_f64_e64 s4, v[14:15], 32 -; GFX11-NEXT: v_cmp_class_f64_e64 s5, v[8:9], 32 -; GFX11-NEXT: v_cmp_class_f64_e64 s6, v[10:11], 32 -; GFX11-NEXT: v_cmp_eq_f64_e64 s8, 0, v[18:19] -; GFX11-NEXT: v_cmp_eq_f64_e64 s9, 0, v[20:21] -; GFX11-NEXT: v_cmp_eq_f64_e64 s10, 0, v[22:23] -; GFX11-NEXT: v_cndmask_b32_e32 v1, v17, v1, vcc_lo -; GFX11-NEXT: v_cndmask_b32_e64 v16, 0, v16, s0 -; GFX11-NEXT: v_cmp_class_f64_e64 s0, v[2:3], 32 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-NEXT: v_cmp_eq_f64_e64 s7, 0, v[16:17] -; GFX11-NEXT: v_cndmask_b32_e32 v0, v16, v0, vcc_lo -; GFX11-NEXT: v_cndmask_b32_e64 v4, v20, v4, s1 -; GFX11-NEXT: v_cndmask_b32_e64 v6, v22, v6, s3 -; GFX11-NEXT: v_cndmask_b32_e64 v5, v21, v5, s1 -; GFX11-NEXT: v_cndmask_b32_e64 v7, v23, v7, s3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-NEXT: v_cndmask_b32_e64 v4, v4, v12, s2 -; GFX11-NEXT: v_cndmask_b32_e64 v0, v0, v8, s5 -; GFX11-NEXT: v_cndmask_b32_e64 v6, v6, v14, s4 -; GFX11-NEXT: v_cndmask_b32_e64 v1, v1, v9, s5 -; GFX11-NEXT: v_cndmask_b32_e64 v5, v5, v13, s2 -; GFX11-NEXT: 
v_cndmask_b32_e64 v7, v7, v15, s4 -; GFX11-NEXT: v_cndmask_b32_e64 v4, v20, v4, s9 -; GFX11-NEXT: v_cndmask_b32_e64 v6, v22, v6, s10 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_cndmask_b32_e64 v5, v21, v5, s9 -; GFX11-NEXT: v_cndmask_b32_e64 v7, v23, v7, s10 -; GFX11-NEXT: v_cndmask_b32_e64 v2, v18, v2, s0 -; GFX11-NEXT: v_cndmask_b32_e64 v3, v19, v3, s0 -; GFX11-NEXT: v_cndmask_b32_e64 v0, v16, v0, s7 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_cndmask_b32_e64 v2, v2, v10, s6 -; GFX11-NEXT: v_cndmask_b32_e64 v3, v3, v11, s6 -; GFX11-NEXT: v_cndmask_b32_e64 v1, v17, v1, s7 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_cndmask_b32_e64 v2, v18, v2, s8 -; GFX11-NEXT: v_cndmask_b32_e64 v3, v19, v3, s8 +; GFX11-NEXT: v_min_f64 v[16:17], v[0:1], v[8:9] +; GFX11-NEXT: v_cmp_u_f64_e32 vcc_lo, v[0:1], v[8:9] +; GFX11-NEXT: v_min_f64 v[8:9], v[2:3], v[10:11] +; GFX11-NEXT: v_cmp_u_f64_e64 s0, v[2:3], v[10:11] +; GFX11-NEXT: v_min_f64 v[10:11], v[4:5], v[12:13] +; GFX11-NEXT: v_cmp_u_f64_e64 s1, v[4:5], v[12:13] +; GFX11-NEXT: v_min_f64 v[12:13], v[6:7], v[14:15] +; GFX11-NEXT: v_cmp_u_f64_e64 s2, v[6:7], v[14:15] +; GFX11-NEXT: v_cndmask_b32_e64 v0, v16, 0, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e64 v1, v17, 0x7ff80000, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e64 v2, v8, 0, s0 +; GFX11-NEXT: v_cndmask_b32_e64 v3, v9, 0x7ff80000, s0 +; GFX11-NEXT: v_cndmask_b32_e64 v4, v10, 0, s1 +; GFX11-NEXT: v_cndmask_b32_e64 v5, v11, 0x7ff80000, s1 +; GFX11-NEXT: v_cndmask_b32_e64 v6, v12, 0, s2 +; GFX11-NEXT: v_cndmask_b32_e64 v7, v13, 0x7ff80000, s2 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-LABEL: v_minimum_v4f64: @@ -2819,320 +1508,55 @@ define <4 x double> @v_minimum_v4f64__nnan(<4 x double> %src0, <4 x double> %src ; GFX7-LABEL: v_minimum_v4f64__nnan: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) -; GFX7-NEXT: v_cmp_lt_f64_e32 vcc, v[0:1], v[8:9] -; GFX7-NEXT: v_cmp_class_f64_e64 s[4:5], v[0:1], 32 -; GFX7-NEXT: v_cmp_class_f64_e64 s[6:7], v[8:9], 32 -; GFX7-NEXT: v_cmp_class_f64_e64 s[10:11], v[10:11], 32 -; GFX7-NEXT: v_cndmask_b32_e32 v17, v9, v1, vcc -; GFX7-NEXT: v_cndmask_b32_e32 v16, v8, v0, vcc -; GFX7-NEXT: v_cmp_lt_f64_e32 vcc, v[2:3], v[10:11] -; GFX7-NEXT: v_cndmask_b32_e64 v0, v16, v0, s[4:5] -; GFX7-NEXT: v_cndmask_b32_e64 v1, v17, v1, s[4:5] -; GFX7-NEXT: v_cmp_lt_f64_e64 s[4:5], v[4:5], v[12:13] -; GFX7-NEXT: v_cmp_eq_f64_e64 s[8:9], 0, v[16:17] -; GFX7-NEXT: v_cndmask_b32_e64 v0, v0, v8, s[6:7] -; GFX7-NEXT: v_cndmask_b32_e64 v1, v1, v9, s[6:7] -; GFX7-NEXT: v_cmp_class_f64_e64 s[6:7], v[12:13], 32 -; GFX7-NEXT: v_cndmask_b32_e32 v19, v11, v3, vcc -; GFX7-NEXT: v_cndmask_b32_e32 v18, v10, v2, vcc -; GFX7-NEXT: v_cmp_class_f64_e64 vcc, v[2:3], 32 -; GFX7-NEXT: v_cmp_eq_f64_e64 s[12:13], 0, v[18:19] -; GFX7-NEXT: v_cndmask_b32_e64 v9, v13, v5, s[4:5] -; GFX7-NEXT: v_cndmask_b32_e64 v8, v12, v4, s[4:5] -; GFX7-NEXT: v_cmp_class_f64_e64 s[4:5], v[4:5], 32 -; GFX7-NEXT: v_cndmask_b32_e64 v0, v16, v0, s[8:9] -; GFX7-NEXT: v_cndmask_b32_e64 v1, v17, v1, s[8:9] -; GFX7-NEXT: v_cmp_eq_f64_e64 s[8:9], 0, v[8:9] -; GFX7-NEXT: v_cndmask_b32_e32 v2, v18, v2, vcc -; GFX7-NEXT: v_cndmask_b32_e32 v3, v19, v3, vcc -; GFX7-NEXT: v_cmp_lt_f64_e32 vcc, v[6:7], v[14:15] -; GFX7-NEXT: v_cndmask_b32_e64 v2, v2, v10, s[10:11] -; GFX7-NEXT: v_cndmask_b32_e64 v3, v3, v11, s[10:11] -; GFX7-NEXT: v_cmp_class_f64_e64 s[10:11], v[14:15], 32 -; GFX7-NEXT: v_cndmask_b32_e64 v2, v18, v2, s[12:13] -; GFX7-NEXT: v_cndmask_b32_e64 v3, v19, v3, s[12:13] -; GFX7-NEXT: v_cndmask_b32_e64 v4, v8, v4, s[4:5] -; GFX7-NEXT: v_cndmask_b32_e64 v5, v9, v5, s[4:5] -; GFX7-NEXT: v_cndmask_b32_e32 v11, v15, v7, vcc -; GFX7-NEXT: v_cndmask_b32_e32 v10, v14, v6, vcc -; GFX7-NEXT: v_cmp_class_f64_e64 vcc, v[6:7], 32 -; GFX7-NEXT: v_cmp_eq_f64_e64 s[12:13], 0, v[10:11] -; GFX7-NEXT: 
v_cndmask_b32_e64 v4, v4, v12, s[6:7] -; GFX7-NEXT: v_cndmask_b32_e64 v5, v5, v13, s[6:7] -; GFX7-NEXT: v_cndmask_b32_e64 v4, v8, v4, s[8:9] -; GFX7-NEXT: v_cndmask_b32_e64 v5, v9, v5, s[8:9] -; GFX7-NEXT: v_cndmask_b32_e32 v6, v10, v6, vcc -; GFX7-NEXT: v_cndmask_b32_e32 v7, v11, v7, vcc -; GFX7-NEXT: v_cndmask_b32_e64 v6, v6, v14, s[10:11] -; GFX7-NEXT: v_cndmask_b32_e64 v7, v7, v15, s[10:11] -; GFX7-NEXT: v_cndmask_b32_e64 v6, v10, v6, s[12:13] -; GFX7-NEXT: v_cndmask_b32_e64 v7, v11, v7, s[12:13] +; GFX7-NEXT: v_min_f64 v[0:1], v[0:1], v[8:9] +; GFX7-NEXT: v_min_f64 v[2:3], v[2:3], v[10:11] +; GFX7-NEXT: v_min_f64 v[4:5], v[4:5], v[12:13] +; GFX7-NEXT: v_min_f64 v[6:7], v[6:7], v[14:15] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_minimum_v4f64__nnan: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_cmp_lt_f64_e32 vcc, v[0:1], v[8:9] -; GFX8-NEXT: v_cmp_class_f64_e64 s[4:5], v[0:1], 32 -; GFX8-NEXT: v_cmp_class_f64_e64 s[6:7], v[8:9], 32 -; GFX8-NEXT: v_cmp_class_f64_e64 s[10:11], v[10:11], 32 -; GFX8-NEXT: v_cndmask_b32_e32 v17, v9, v1, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v16, v8, v0, vcc -; GFX8-NEXT: v_cmp_lt_f64_e32 vcc, v[2:3], v[10:11] -; GFX8-NEXT: v_cndmask_b32_e64 v0, v16, v0, s[4:5] -; GFX8-NEXT: v_cndmask_b32_e64 v1, v17, v1, s[4:5] -; GFX8-NEXT: v_cmp_lt_f64_e64 s[4:5], v[4:5], v[12:13] -; GFX8-NEXT: v_cmp_eq_f64_e64 s[8:9], 0, v[16:17] -; GFX8-NEXT: v_cndmask_b32_e64 v0, v0, v8, s[6:7] -; GFX8-NEXT: v_cndmask_b32_e64 v1, v1, v9, s[6:7] -; GFX8-NEXT: v_cmp_class_f64_e64 s[6:7], v[12:13], 32 -; GFX8-NEXT: v_cndmask_b32_e32 v19, v11, v3, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v18, v10, v2, vcc -; GFX8-NEXT: v_cmp_class_f64_e64 vcc, v[2:3], 32 -; GFX8-NEXT: v_cmp_eq_f64_e64 s[12:13], 0, v[18:19] -; GFX8-NEXT: v_cndmask_b32_e64 v9, v13, v5, s[4:5] -; GFX8-NEXT: v_cndmask_b32_e64 v8, v12, v4, s[4:5] -; GFX8-NEXT: v_cmp_class_f64_e64 s[4:5], v[4:5], 32 -; GFX8-NEXT: v_cndmask_b32_e64 v0, v16, v0, s[8:9] -; 
GFX8-NEXT: v_cndmask_b32_e64 v1, v17, v1, s[8:9] -; GFX8-NEXT: v_cmp_eq_f64_e64 s[8:9], 0, v[8:9] -; GFX8-NEXT: v_cndmask_b32_e32 v2, v18, v2, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v3, v19, v3, vcc -; GFX8-NEXT: v_cmp_lt_f64_e32 vcc, v[6:7], v[14:15] -; GFX8-NEXT: v_cndmask_b32_e64 v2, v2, v10, s[10:11] -; GFX8-NEXT: v_cndmask_b32_e64 v3, v3, v11, s[10:11] -; GFX8-NEXT: v_cmp_class_f64_e64 s[10:11], v[14:15], 32 -; GFX8-NEXT: v_cndmask_b32_e64 v2, v18, v2, s[12:13] -; GFX8-NEXT: v_cndmask_b32_e64 v3, v19, v3, s[12:13] -; GFX8-NEXT: v_cndmask_b32_e64 v4, v8, v4, s[4:5] -; GFX8-NEXT: v_cndmask_b32_e64 v5, v9, v5, s[4:5] -; GFX8-NEXT: v_cndmask_b32_e32 v11, v15, v7, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v10, v14, v6, vcc -; GFX8-NEXT: v_cmp_class_f64_e64 vcc, v[6:7], 32 -; GFX8-NEXT: v_cmp_eq_f64_e64 s[12:13], 0, v[10:11] -; GFX8-NEXT: v_cndmask_b32_e64 v4, v4, v12, s[6:7] -; GFX8-NEXT: v_cndmask_b32_e64 v5, v5, v13, s[6:7] -; GFX8-NEXT: v_cndmask_b32_e64 v4, v8, v4, s[8:9] -; GFX8-NEXT: v_cndmask_b32_e64 v5, v9, v5, s[8:9] -; GFX8-NEXT: v_cndmask_b32_e32 v6, v10, v6, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v7, v11, v7, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v6, v6, v14, s[10:11] -; GFX8-NEXT: v_cndmask_b32_e64 v7, v7, v15, s[10:11] -; GFX8-NEXT: v_cndmask_b32_e64 v6, v10, v6, s[12:13] -; GFX8-NEXT: v_cndmask_b32_e64 v7, v11, v7, s[12:13] +; GFX8-NEXT: v_min_f64 v[0:1], v[0:1], v[8:9] +; GFX8-NEXT: v_min_f64 v[2:3], v[2:3], v[10:11] +; GFX8-NEXT: v_min_f64 v[4:5], v[4:5], v[12:13] +; GFX8-NEXT: v_min_f64 v[6:7], v[6:7], v[14:15] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_minimum_v4f64__nnan: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_cmp_lt_f64_e32 vcc, v[0:1], v[8:9] -; GFX9-NEXT: v_cmp_class_f64_e64 s[4:5], v[0:1], 32 -; GFX9-NEXT: v_cmp_class_f64_e64 s[6:7], v[8:9], 32 -; GFX9-NEXT: v_cmp_class_f64_e64 s[10:11], v[10:11], 32 -; GFX9-NEXT: v_cndmask_b32_e32 v17, v9, v1, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v16, v8, v0, vcc -; 
GFX9-NEXT: v_cmp_lt_f64_e32 vcc, v[2:3], v[10:11] -; GFX9-NEXT: v_cndmask_b32_e64 v0, v16, v0, s[4:5] -; GFX9-NEXT: v_cndmask_b32_e64 v1, v17, v1, s[4:5] -; GFX9-NEXT: v_cmp_lt_f64_e64 s[4:5], v[4:5], v[12:13] -; GFX9-NEXT: v_cmp_eq_f64_e64 s[8:9], 0, v[16:17] -; GFX9-NEXT: v_cndmask_b32_e64 v0, v0, v8, s[6:7] -; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, v9, s[6:7] -; GFX9-NEXT: v_cmp_class_f64_e64 s[6:7], v[12:13], 32 -; GFX9-NEXT: v_cndmask_b32_e32 v19, v11, v3, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v18, v10, v2, vcc -; GFX9-NEXT: v_cmp_class_f64_e64 vcc, v[2:3], 32 -; GFX9-NEXT: v_cmp_eq_f64_e64 s[12:13], 0, v[18:19] -; GFX9-NEXT: v_cndmask_b32_e64 v9, v13, v5, s[4:5] -; GFX9-NEXT: v_cndmask_b32_e64 v8, v12, v4, s[4:5] -; GFX9-NEXT: v_cmp_class_f64_e64 s[4:5], v[4:5], 32 -; GFX9-NEXT: v_cndmask_b32_e64 v0, v16, v0, s[8:9] -; GFX9-NEXT: v_cndmask_b32_e64 v1, v17, v1, s[8:9] -; GFX9-NEXT: v_cmp_eq_f64_e64 s[8:9], 0, v[8:9] -; GFX9-NEXT: v_cndmask_b32_e32 v2, v18, v2, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v3, v19, v3, vcc -; GFX9-NEXT: v_cmp_lt_f64_e32 vcc, v[6:7], v[14:15] -; GFX9-NEXT: v_cndmask_b32_e64 v2, v2, v10, s[10:11] -; GFX9-NEXT: v_cndmask_b32_e64 v3, v3, v11, s[10:11] -; GFX9-NEXT: v_cmp_class_f64_e64 s[10:11], v[14:15], 32 -; GFX9-NEXT: v_cndmask_b32_e64 v2, v18, v2, s[12:13] -; GFX9-NEXT: v_cndmask_b32_e64 v3, v19, v3, s[12:13] -; GFX9-NEXT: v_cndmask_b32_e64 v4, v8, v4, s[4:5] -; GFX9-NEXT: v_cndmask_b32_e64 v5, v9, v5, s[4:5] -; GFX9-NEXT: v_cndmask_b32_e32 v11, v15, v7, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v10, v14, v6, vcc -; GFX9-NEXT: v_cmp_class_f64_e64 vcc, v[6:7], 32 -; GFX9-NEXT: v_cmp_eq_f64_e64 s[12:13], 0, v[10:11] -; GFX9-NEXT: v_cndmask_b32_e64 v4, v4, v12, s[6:7] -; GFX9-NEXT: v_cndmask_b32_e64 v5, v5, v13, s[6:7] -; GFX9-NEXT: v_cndmask_b32_e64 v4, v8, v4, s[8:9] -; GFX9-NEXT: v_cndmask_b32_e64 v5, v9, v5, s[8:9] -; GFX9-NEXT: v_cndmask_b32_e32 v6, v10, v6, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v7, v11, v7, vcc -; GFX9-NEXT: v_cndmask_b32_e64 v6, 
v6, v14, s[10:11] -; GFX9-NEXT: v_cndmask_b32_e64 v7, v7, v15, s[10:11] -; GFX9-NEXT: v_cndmask_b32_e64 v6, v10, v6, s[12:13] -; GFX9-NEXT: v_cndmask_b32_e64 v7, v11, v7, s[12:13] +; GFX9-NEXT: v_min_f64 v[0:1], v[0:1], v[8:9] +; GFX9-NEXT: v_min_f64 v[2:3], v[2:3], v[10:11] +; GFX9-NEXT: v_min_f64 v[4:5], v[4:5], v[12:13] +; GFX9-NEXT: v_min_f64 v[6:7], v[6:7], v[14:15] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: v_minimum_v4f64__nnan: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_cmp_lt_f64_e32 vcc, v[0:1], v[8:9] -; GFX940-NEXT: v_cmp_class_f64_e64 s[0:1], v[8:9], 32 -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e32 v17, v9, v1, vcc -; GFX940-NEXT: v_cndmask_b32_e32 v16, v8, v0, vcc -; GFX940-NEXT: v_cmp_class_f64_e64 vcc, v[0:1], 32 -; GFX940-NEXT: v_cmp_eq_f64_e64 s[2:3], 0, v[16:17] -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e32 v0, v16, v0, vcc -; GFX940-NEXT: v_cndmask_b32_e32 v1, v17, v1, vcc -; GFX940-NEXT: v_cmp_lt_f64_e32 vcc, v[2:3], v[10:11] -; GFX940-NEXT: v_cndmask_b32_e64 v0, v0, v8, s[0:1] -; GFX940-NEXT: v_cndmask_b32_e64 v1, v1, v9, s[0:1] -; GFX940-NEXT: v_cndmask_b32_e32 v9, v11, v3, vcc -; GFX940-NEXT: v_cndmask_b32_e32 v8, v10, v2, vcc -; GFX940-NEXT: v_cmp_class_f64_e64 vcc, v[2:3], 32 -; GFX940-NEXT: v_cmp_class_f64_e64 s[0:1], v[10:11], 32 -; GFX940-NEXT: v_cndmask_b32_e64 v0, v16, v0, s[2:3] -; GFX940-NEXT: v_cndmask_b32_e32 v2, v8, v2, vcc -; GFX940-NEXT: v_cndmask_b32_e32 v3, v9, v3, vcc -; GFX940-NEXT: v_cndmask_b32_e64 v1, v17, v1, s[2:3] -; GFX940-NEXT: v_cndmask_b32_e64 v2, v2, v10, s[0:1] -; GFX940-NEXT: v_cmp_eq_f64_e64 s[2:3], 0, v[8:9] -; GFX940-NEXT: v_cndmask_b32_e64 v3, v3, v11, s[0:1] -; GFX940-NEXT: v_cmp_lt_f64_e32 vcc, v[4:5], v[12:13] -; GFX940-NEXT: v_cndmask_b32_e64 v2, v8, v2, s[2:3] -; GFX940-NEXT: v_cndmask_b32_e64 v3, v9, v3, s[2:3] -; GFX940-NEXT: v_cndmask_b32_e32 v9, v13, v5, vcc -; GFX940-NEXT: v_cndmask_b32_e32 v8, v12, v4, vcc -; 
GFX940-NEXT: v_cmp_class_f64_e64 vcc, v[4:5], 32 -; GFX940-NEXT: v_cmp_class_f64_e64 s[0:1], v[12:13], 32 -; GFX940-NEXT: v_cmp_eq_f64_e64 s[2:3], 0, v[8:9] -; GFX940-NEXT: v_cndmask_b32_e32 v4, v8, v4, vcc -; GFX940-NEXT: v_cndmask_b32_e32 v5, v9, v5, vcc -; GFX940-NEXT: v_cndmask_b32_e64 v4, v4, v12, s[0:1] -; GFX940-NEXT: v_cndmask_b32_e64 v5, v5, v13, s[0:1] -; GFX940-NEXT: v_cmp_lt_f64_e32 vcc, v[6:7], v[14:15] -; GFX940-NEXT: v_cndmask_b32_e64 v4, v8, v4, s[2:3] -; GFX940-NEXT: v_cndmask_b32_e64 v5, v9, v5, s[2:3] -; GFX940-NEXT: v_cndmask_b32_e32 v9, v15, v7, vcc -; GFX940-NEXT: v_cndmask_b32_e32 v8, v14, v6, vcc -; GFX940-NEXT: v_cmp_class_f64_e64 vcc, v[6:7], 32 -; GFX940-NEXT: v_cmp_class_f64_e64 s[0:1], v[14:15], 32 -; GFX940-NEXT: v_cmp_eq_f64_e64 s[2:3], 0, v[8:9] -; GFX940-NEXT: v_cndmask_b32_e32 v6, v8, v6, vcc -; GFX940-NEXT: v_cndmask_b32_e32 v7, v9, v7, vcc -; GFX940-NEXT: v_cndmask_b32_e64 v6, v6, v14, s[0:1] -; GFX940-NEXT: v_cndmask_b32_e64 v7, v7, v15, s[0:1] -; GFX940-NEXT: v_cndmask_b32_e64 v6, v8, v6, s[2:3] -; GFX940-NEXT: v_cndmask_b32_e64 v7, v9, v7, s[2:3] +; GFX940-NEXT: v_min_f64 v[0:1], v[0:1], v[8:9] +; GFX940-NEXT: v_min_f64 v[2:3], v[2:3], v[10:11] +; GFX940-NEXT: v_min_f64 v[4:5], v[4:5], v[12:13] +; GFX940-NEXT: v_min_f64 v[6:7], v[6:7], v[14:15] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_minimum_v4f64__nnan: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_cmp_lt_f64_e32 vcc_lo, v[0:1], v[8:9] -; GFX10-NEXT: v_cmp_lt_f64_e64 s4, v[2:3], v[10:11] -; GFX10-NEXT: v_cmp_lt_f64_e64 s5, v[4:5], v[12:13] -; GFX10-NEXT: v_cmp_lt_f64_e64 s6, v[6:7], v[14:15] -; GFX10-NEXT: v_cmp_class_f64_e64 s7, v[10:11], 32 -; GFX10-NEXT: v_cmp_class_f64_e64 s8, v[8:9], 32 -; GFX10-NEXT: v_cmp_class_f64_e64 s10, v[12:13], 32 -; GFX10-NEXT: v_cmp_class_f64_e64 s11, v[14:15], 32 -; GFX10-NEXT: v_cndmask_b32_e32 v17, v9, v1, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v19, v11, v3, s4 -; GFX10-NEXT: 
v_cndmask_b32_e64 v21, v13, v5, s5 -; GFX10-NEXT: v_cndmask_b32_e64 v23, v15, v7, s6 -; GFX10-NEXT: v_cndmask_b32_e32 v16, v8, v0, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v18, v10, v2, s4 -; GFX10-NEXT: v_cndmask_b32_e64 v20, v12, v4, s5 -; GFX10-NEXT: v_cmp_class_f64_e64 vcc_lo, v[4:5], 32 -; GFX10-NEXT: v_cndmask_b32_e64 v22, v14, v6, s6 -; GFX10-NEXT: v_cmp_class_f64_e64 s4, v[6:7], 32 -; GFX10-NEXT: v_cmp_class_f64_e64 s5, v[0:1], 32 -; GFX10-NEXT: v_cmp_class_f64_e64 s6, v[2:3], 32 -; GFX10-NEXT: v_cmp_eq_f64_e64 s9, 0, v[16:17] -; GFX10-NEXT: v_cmp_eq_f64_e64 s12, 0, v[18:19] -; GFX10-NEXT: v_cmp_eq_f64_e64 s13, 0, v[20:21] -; GFX10-NEXT: v_cmp_eq_f64_e64 s14, 0, v[22:23] -; GFX10-NEXT: v_cndmask_b32_e32 v4, v20, v4, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e32 v5, v21, v5, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v6, v22, v6, s4 -; GFX10-NEXT: v_cndmask_b32_e64 v0, v16, v0, s5 -; GFX10-NEXT: v_cndmask_b32_e64 v2, v18, v2, s6 -; GFX10-NEXT: v_cndmask_b32_e64 v1, v17, v1, s5 -; GFX10-NEXT: v_cndmask_b32_e64 v3, v19, v3, s6 -; GFX10-NEXT: v_cndmask_b32_e64 v7, v23, v7, s4 -; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, v8, s8 -; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, v10, s7 -; GFX10-NEXT: v_cndmask_b32_e64 v4, v4, v12, s10 -; GFX10-NEXT: v_cndmask_b32_e64 v6, v6, v14, s11 -; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, v9, s8 -; GFX10-NEXT: v_cndmask_b32_e64 v3, v3, v11, s7 -; GFX10-NEXT: v_cndmask_b32_e64 v5, v5, v13, s10 -; GFX10-NEXT: v_cndmask_b32_e64 v7, v7, v15, s11 -; GFX10-NEXT: v_cndmask_b32_e64 v0, v16, v0, s9 -; GFX10-NEXT: v_cndmask_b32_e64 v2, v18, v2, s12 -; GFX10-NEXT: v_cndmask_b32_e64 v1, v17, v1, s9 -; GFX10-NEXT: v_cndmask_b32_e64 v3, v19, v3, s12 -; GFX10-NEXT: v_cndmask_b32_e64 v4, v20, v4, s13 -; GFX10-NEXT: v_cndmask_b32_e64 v5, v21, v5, s13 -; GFX10-NEXT: v_cndmask_b32_e64 v6, v22, v6, s14 -; GFX10-NEXT: v_cndmask_b32_e64 v7, v23, v7, s14 +; GFX10-NEXT: v_min_f64 v[0:1], v[0:1], v[8:9] +; GFX10-NEXT: v_min_f64 v[2:3], v[2:3], v[10:11] +; GFX10-NEXT: 
v_min_f64 v[4:5], v[4:5], v[12:13] +; GFX10-NEXT: v_min_f64 v[6:7], v[6:7], v[14:15] ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: v_minimum_v4f64__nnan: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_cmp_lt_f64_e32 vcc_lo, v[0:1], v[8:9] -; GFX11-NEXT: v_cmp_lt_f64_e64 s0, v[2:3], v[10:11] -; GFX11-NEXT: v_cmp_lt_f64_e64 s1, v[4:5], v[12:13] -; GFX11-NEXT: v_cmp_lt_f64_e64 s2, v[6:7], v[14:15] -; GFX11-NEXT: v_cmp_class_f64_e64 s3, v[6:7], 32 -; GFX11-NEXT: v_cmp_class_f64_e64 s4, v[14:15], 32 -; GFX11-NEXT: v_cmp_class_f64_e64 s5, v[8:9], 32 -; GFX11-NEXT: v_cmp_class_f64_e64 s6, v[10:11], 32 -; GFX11-NEXT: v_dual_cndmask_b32 v17, v9, v1 :: v_dual_cndmask_b32 v16, v8, v0 -; GFX11-NEXT: v_cndmask_b32_e64 v19, v11, v3, s0 -; GFX11-NEXT: v_cndmask_b32_e64 v21, v13, v5, s1 -; GFX11-NEXT: v_cndmask_b32_e64 v18, v10, v2, s0 -; GFX11-NEXT: v_cndmask_b32_e64 v20, v12, v4, s1 -; GFX11-NEXT: v_cmp_class_f64_e64 vcc_lo, v[0:1], 32 -; GFX11-NEXT: v_cmp_class_f64_e64 s0, v[2:3], 32 -; GFX11-NEXT: v_cmp_class_f64_e64 s1, v[4:5], 32 -; GFX11-NEXT: v_cndmask_b32_e64 v23, v15, v7, s2 -; GFX11-NEXT: v_cndmask_b32_e64 v22, v14, v6, s2 -; GFX11-NEXT: v_cmp_class_f64_e64 s2, v[12:13], 32 -; GFX11-NEXT: v_cmp_eq_f64_e64 s7, 0, v[16:17] -; GFX11-NEXT: v_cmp_eq_f64_e64 s8, 0, v[18:19] -; GFX11-NEXT: v_cmp_eq_f64_e64 s9, 0, v[20:21] -; GFX11-NEXT: v_cmp_eq_f64_e64 s10, 0, v[22:23] -; GFX11-NEXT: v_cndmask_b32_e64 v6, v22, v6, s3 -; GFX11-NEXT: v_cndmask_b32_e64 v7, v23, v7, s3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cndmask_b32_e64 v6, v6, v14, s4 -; GFX11-NEXT: v_cndmask_b32_e64 v7, v7, v15, s4 -; GFX11-NEXT: v_dual_cndmask_b32 v0, v16, v0 :: v_dual_cndmask_b32 v1, v17, v1 -; GFX11-NEXT: v_cndmask_b32_e64 v2, v18, v2, s0 -; GFX11-NEXT: v_cndmask_b32_e64 v4, v20, v4, s1 -; GFX11-NEXT: v_cndmask_b32_e64 v3, v19, v3, s0 -; GFX11-NEXT: v_cndmask_b32_e64 v5, v21, v5, s1 -; 
GFX11-NEXT: v_cndmask_b32_e64 v0, v0, v8, s5 -; GFX11-NEXT: v_cndmask_b32_e64 v2, v2, v10, s6 -; GFX11-NEXT: v_cndmask_b32_e64 v4, v4, v12, s2 -; GFX11-NEXT: v_cndmask_b32_e64 v1, v1, v9, s5 -; GFX11-NEXT: v_cndmask_b32_e64 v3, v3, v11, s6 -; GFX11-NEXT: v_cndmask_b32_e64 v5, v5, v13, s2 -; GFX11-NEXT: v_cndmask_b32_e64 v0, v16, v0, s7 -; GFX11-NEXT: v_cndmask_b32_e64 v2, v18, v2, s8 -; GFX11-NEXT: v_cndmask_b32_e64 v4, v20, v4, s9 -; GFX11-NEXT: v_cndmask_b32_e64 v6, v22, v6, s10 -; GFX11-NEXT: v_cndmask_b32_e64 v1, v17, v1, s7 -; GFX11-NEXT: v_cndmask_b32_e64 v3, v19, v3, s8 -; GFX11-NEXT: v_cndmask_b32_e64 v5, v21, v5, s9 -; GFX11-NEXT: v_cndmask_b32_e64 v7, v23, v7, s10 +; GFX11-NEXT: v_min_f64 v[0:1], v[0:1], v[8:9] +; GFX11-NEXT: v_min_f64 v[2:3], v[2:3], v[10:11] +; GFX11-NEXT: v_min_f64 v[4:5], v[4:5], v[12:13] +; GFX11-NEXT: v_min_f64 v[6:7], v[6:7], v[14:15] ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-LABEL: v_minimum_v4f64__nnan: @@ -3155,180 +1579,135 @@ define <4 x double> @v_minimum_v4f64__nsz(<4 x double> %src0, <4 x double> %src1 ; GFX7-LABEL: v_minimum_v4f64__nsz: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_cmp_lt_f64_e32 vcc, v[0:1], v[8:9] -; GFX7-NEXT: v_cmp_o_f64_e64 s[4:5], v[0:1], v[8:9] -; GFX7-NEXT: v_cmp_lt_f64_e64 s[6:7], v[4:5], v[12:13] -; GFX7-NEXT: v_cmp_lt_f64_e64 s[10:11], v[6:7], v[14:15] -; GFX7-NEXT: v_cmp_o_f64_e64 s[8:9], v[4:5], v[12:13] -; GFX7-NEXT: v_cmp_o_f64_e64 s[12:13], v[6:7], v[14:15] -; GFX7-NEXT: v_cndmask_b32_e32 v16, v8, v0, vcc -; GFX7-NEXT: v_cndmask_b32_e32 v1, v9, v1, vcc -; GFX7-NEXT: v_mov_b32_e32 v8, 0x7ff80000 -; GFX7-NEXT: v_cmp_lt_f64_e32 vcc, v[2:3], v[10:11] -; GFX7-NEXT: v_cndmask_b32_e64 v0, 0, v16, s[4:5] -; GFX7-NEXT: v_cndmask_b32_e64 v1, v8, v1, s[4:5] -; GFX7-NEXT: v_cmp_o_f64_e64 s[4:5], v[2:3], v[10:11] -; GFX7-NEXT: v_cndmask_b32_e64 v5, v13, v5, s[6:7] -; GFX7-NEXT: v_cndmask_b32_e64 v7, v15, v7, s[10:11] -; GFX7-NEXT: v_cndmask_b32_e64 v5, v8, v5, 
s[8:9] -; GFX7-NEXT: v_cndmask_b32_e64 v7, v8, v7, s[12:13] -; GFX7-NEXT: v_cndmask_b32_e32 v9, v10, v2, vcc -; GFX7-NEXT: v_cndmask_b32_e64 v10, v14, v6, s[10:11] -; GFX7-NEXT: v_cndmask_b32_e32 v3, v11, v3, vcc -; GFX7-NEXT: v_cndmask_b32_e64 v2, 0, v9, s[4:5] -; GFX7-NEXT: v_cndmask_b32_e64 v9, v12, v4, s[6:7] -; GFX7-NEXT: v_cndmask_b32_e64 v3, v8, v3, s[4:5] -; GFX7-NEXT: v_cndmask_b32_e64 v4, 0, v9, s[8:9] -; GFX7-NEXT: v_cndmask_b32_e64 v6, 0, v10, s[12:13] +; GFX7-NEXT: v_min_f64 v[16:17], v[0:1], v[8:9] +; GFX7-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[8:9] +; GFX7-NEXT: v_min_f64 v[8:9], v[2:3], v[10:11] +; GFX7-NEXT: v_cmp_u_f64_e64 s[4:5], v[2:3], v[10:11] +; GFX7-NEXT: v_min_f64 v[10:11], v[4:5], v[12:13] +; GFX7-NEXT: v_cmp_u_f64_e64 s[6:7], v[4:5], v[12:13] +; GFX7-NEXT: v_min_f64 v[12:13], v[6:7], v[14:15] +; GFX7-NEXT: v_cmp_u_f64_e64 s[8:9], v[6:7], v[14:15] +; GFX7-NEXT: v_mov_b32_e32 v7, 0x7ff80000 +; GFX7-NEXT: v_cndmask_b32_e64 v0, v16, 0, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v1, v17, v7, vcc +; GFX7-NEXT: v_cndmask_b32_e64 v2, v8, 0, s[4:5] +; GFX7-NEXT: v_cndmask_b32_e64 v3, v9, v7, s[4:5] +; GFX7-NEXT: v_cndmask_b32_e64 v4, v10, 0, s[6:7] +; GFX7-NEXT: v_cndmask_b32_e64 v5, v11, v7, s[6:7] +; GFX7-NEXT: v_cndmask_b32_e64 v6, v12, 0, s[8:9] +; GFX7-NEXT: v_cndmask_b32_e64 v7, v13, v7, s[8:9] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_minimum_v4f64__nsz: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_cmp_lt_f64_e32 vcc, v[0:1], v[8:9] -; GFX8-NEXT: v_cmp_o_f64_e64 s[4:5], v[0:1], v[8:9] -; GFX8-NEXT: v_cmp_lt_f64_e64 s[6:7], v[4:5], v[12:13] -; GFX8-NEXT: v_cmp_lt_f64_e64 s[10:11], v[6:7], v[14:15] -; GFX8-NEXT: v_cmp_o_f64_e64 s[8:9], v[4:5], v[12:13] -; GFX8-NEXT: v_cmp_o_f64_e64 s[12:13], v[6:7], v[14:15] -; GFX8-NEXT: v_cndmask_b32_e32 v16, v8, v0, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v1, v9, v1, vcc -; GFX8-NEXT: v_mov_b32_e32 v8, 0x7ff80000 -; GFX8-NEXT: v_cmp_lt_f64_e32 vcc, v[2:3], v[10:11] -; 
GFX8-NEXT: v_cndmask_b32_e64 v0, 0, v16, s[4:5] -; GFX8-NEXT: v_cndmask_b32_e64 v1, v8, v1, s[4:5] -; GFX8-NEXT: v_cmp_o_f64_e64 s[4:5], v[2:3], v[10:11] -; GFX8-NEXT: v_cndmask_b32_e64 v5, v13, v5, s[6:7] -; GFX8-NEXT: v_cndmask_b32_e64 v7, v15, v7, s[10:11] -; GFX8-NEXT: v_cndmask_b32_e64 v5, v8, v5, s[8:9] -; GFX8-NEXT: v_cndmask_b32_e64 v7, v8, v7, s[12:13] -; GFX8-NEXT: v_cndmask_b32_e32 v9, v10, v2, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v10, v14, v6, s[10:11] -; GFX8-NEXT: v_cndmask_b32_e32 v3, v11, v3, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, v9, s[4:5] -; GFX8-NEXT: v_cndmask_b32_e64 v9, v12, v4, s[6:7] -; GFX8-NEXT: v_cndmask_b32_e64 v3, v8, v3, s[4:5] -; GFX8-NEXT: v_cndmask_b32_e64 v4, 0, v9, s[8:9] -; GFX8-NEXT: v_cndmask_b32_e64 v6, 0, v10, s[12:13] +; GFX8-NEXT: v_min_f64 v[16:17], v[0:1], v[8:9] +; GFX8-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[8:9] +; GFX8-NEXT: v_min_f64 v[8:9], v[2:3], v[10:11] +; GFX8-NEXT: v_cmp_u_f64_e64 s[4:5], v[2:3], v[10:11] +; GFX8-NEXT: v_min_f64 v[10:11], v[4:5], v[12:13] +; GFX8-NEXT: v_cmp_u_f64_e64 s[6:7], v[4:5], v[12:13] +; GFX8-NEXT: v_min_f64 v[12:13], v[6:7], v[14:15] +; GFX8-NEXT: v_cmp_u_f64_e64 s[8:9], v[6:7], v[14:15] +; GFX8-NEXT: v_mov_b32_e32 v7, 0x7ff80000 +; GFX8-NEXT: v_cndmask_b32_e64 v0, v16, 0, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v1, v17, v7, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v2, v8, 0, s[4:5] +; GFX8-NEXT: v_cndmask_b32_e64 v3, v9, v7, s[4:5] +; GFX8-NEXT: v_cndmask_b32_e64 v4, v10, 0, s[6:7] +; GFX8-NEXT: v_cndmask_b32_e64 v5, v11, v7, s[6:7] +; GFX8-NEXT: v_cndmask_b32_e64 v6, v12, 0, s[8:9] +; GFX8-NEXT: v_cndmask_b32_e64 v7, v13, v7, s[8:9] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_minimum_v4f64__nsz: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_cmp_lt_f64_e32 vcc, v[0:1], v[8:9] -; GFX9-NEXT: v_cmp_o_f64_e64 s[4:5], v[0:1], v[8:9] -; GFX9-NEXT: v_cmp_lt_f64_e64 s[6:7], v[4:5], v[12:13] -; GFX9-NEXT: v_cmp_lt_f64_e64 s[10:11], v[6:7], v[14:15] -; 
GFX9-NEXT: v_cmp_o_f64_e64 s[8:9], v[4:5], v[12:13] -; GFX9-NEXT: v_cmp_o_f64_e64 s[12:13], v[6:7], v[14:15] -; GFX9-NEXT: v_cndmask_b32_e32 v16, v8, v0, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v1, v9, v1, vcc -; GFX9-NEXT: v_mov_b32_e32 v8, 0x7ff80000 -; GFX9-NEXT: v_cmp_lt_f64_e32 vcc, v[2:3], v[10:11] -; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, v16, s[4:5] -; GFX9-NEXT: v_cndmask_b32_e64 v1, v8, v1, s[4:5] -; GFX9-NEXT: v_cmp_o_f64_e64 s[4:5], v[2:3], v[10:11] -; GFX9-NEXT: v_cndmask_b32_e64 v5, v13, v5, s[6:7] -; GFX9-NEXT: v_cndmask_b32_e64 v7, v15, v7, s[10:11] -; GFX9-NEXT: v_cndmask_b32_e64 v5, v8, v5, s[8:9] -; GFX9-NEXT: v_cndmask_b32_e64 v7, v8, v7, s[12:13] -; GFX9-NEXT: v_cndmask_b32_e32 v9, v10, v2, vcc -; GFX9-NEXT: v_cndmask_b32_e64 v10, v14, v6, s[10:11] -; GFX9-NEXT: v_cndmask_b32_e32 v3, v11, v3, vcc -; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, v9, s[4:5] -; GFX9-NEXT: v_cndmask_b32_e64 v9, v12, v4, s[6:7] -; GFX9-NEXT: v_cndmask_b32_e64 v3, v8, v3, s[4:5] -; GFX9-NEXT: v_cndmask_b32_e64 v4, 0, v9, s[8:9] -; GFX9-NEXT: v_cndmask_b32_e64 v6, 0, v10, s[12:13] +; GFX9-NEXT: v_min_f64 v[16:17], v[0:1], v[8:9] +; GFX9-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[8:9] +; GFX9-NEXT: v_min_f64 v[8:9], v[2:3], v[10:11] +; GFX9-NEXT: v_cmp_u_f64_e64 s[4:5], v[2:3], v[10:11] +; GFX9-NEXT: v_min_f64 v[10:11], v[4:5], v[12:13] +; GFX9-NEXT: v_cmp_u_f64_e64 s[6:7], v[4:5], v[12:13] +; GFX9-NEXT: v_min_f64 v[12:13], v[6:7], v[14:15] +; GFX9-NEXT: v_cmp_u_f64_e64 s[8:9], v[6:7], v[14:15] +; GFX9-NEXT: v_mov_b32_e32 v7, 0x7ff80000 +; GFX9-NEXT: v_cndmask_b32_e64 v0, v16, 0, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v1, v17, v7, vcc +; GFX9-NEXT: v_cndmask_b32_e64 v2, v8, 0, s[4:5] +; GFX9-NEXT: v_cndmask_b32_e64 v3, v9, v7, s[4:5] +; GFX9-NEXT: v_cndmask_b32_e64 v4, v10, 0, s[6:7] +; GFX9-NEXT: v_cndmask_b32_e64 v5, v11, v7, s[6:7] +; GFX9-NEXT: v_cndmask_b32_e64 v6, v12, 0, s[8:9] +; GFX9-NEXT: v_cndmask_b32_e64 v7, v13, v7, s[8:9] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: 
v_minimum_v4f64__nsz: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_cmp_lt_f64_e32 vcc, v[0:1], v[8:9] -; GFX940-NEXT: v_cmp_o_f64_e64 s[0:1], v[0:1], v[8:9] -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e32 v16, v8, v0, vcc -; GFX940-NEXT: v_cndmask_b32_e32 v1, v9, v1, vcc -; GFX940-NEXT: v_mov_b32_e32 v8, 0x7ff80000 -; GFX940-NEXT: v_cmp_lt_f64_e32 vcc, v[2:3], v[10:11] -; GFX940-NEXT: v_cndmask_b32_e64 v0, 0, v16, s[0:1] -; GFX940-NEXT: v_cndmask_b32_e64 v1, v8, v1, s[0:1] -; GFX940-NEXT: v_cndmask_b32_e32 v9, v10, v2, vcc -; GFX940-NEXT: v_cmp_o_f64_e64 s[0:1], v[2:3], v[10:11] -; GFX940-NEXT: v_cndmask_b32_e32 v3, v11, v3, vcc -; GFX940-NEXT: v_cmp_lt_f64_e32 vcc, v[4:5], v[12:13] -; GFX940-NEXT: v_cndmask_b32_e64 v2, 0, v9, s[0:1] -; GFX940-NEXT: v_cndmask_b32_e64 v3, v8, v3, s[0:1] -; GFX940-NEXT: v_cndmask_b32_e32 v9, v12, v4, vcc -; GFX940-NEXT: v_cmp_o_f64_e64 s[0:1], v[4:5], v[12:13] -; GFX940-NEXT: v_cndmask_b32_e32 v5, v13, v5, vcc -; GFX940-NEXT: v_cmp_lt_f64_e32 vcc, v[6:7], v[14:15] -; GFX940-NEXT: v_cndmask_b32_e64 v4, 0, v9, s[0:1] -; GFX940-NEXT: v_cndmask_b32_e64 v5, v8, v5, s[0:1] -; GFX940-NEXT: v_cndmask_b32_e32 v9, v14, v6, vcc -; GFX940-NEXT: v_cmp_o_f64_e64 s[0:1], v[6:7], v[14:15] -; GFX940-NEXT: v_cndmask_b32_e32 v7, v15, v7, vcc +; GFX940-NEXT: v_min_f64 v[16:17], v[0:1], v[8:9] +; GFX940-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[8:9] +; GFX940-NEXT: v_min_f64 v[8:9], v[2:3], v[10:11] ; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e64 v6, 0, v9, s[0:1] -; GFX940-NEXT: v_cndmask_b32_e64 v7, v8, v7, s[0:1] +; GFX940-NEXT: v_cndmask_b32_e64 v0, v16, 0, vcc +; GFX940-NEXT: v_mov_b32_e32 v16, 0x7ff80000 +; GFX940-NEXT: v_cndmask_b32_e32 v1, v17, v16, vcc +; GFX940-NEXT: v_cmp_u_f64_e32 vcc, v[2:3], v[10:11] +; GFX940-NEXT: s_nop 1 +; GFX940-NEXT: v_cndmask_b32_e64 v2, v8, 0, vcc +; GFX940-NEXT: v_cndmask_b32_e32 v3, v9, v16, vcc +; GFX940-NEXT: v_min_f64 v[8:9], v[4:5], v[12:13] +; 
GFX940-NEXT: v_cmp_u_f64_e32 vcc, v[4:5], v[12:13] +; GFX940-NEXT: s_nop 1 +; GFX940-NEXT: v_cndmask_b32_e64 v4, v8, 0, vcc +; GFX940-NEXT: v_cndmask_b32_e32 v5, v9, v16, vcc +; GFX940-NEXT: v_min_f64 v[8:9], v[6:7], v[14:15] +; GFX940-NEXT: v_cmp_u_f64_e32 vcc, v[6:7], v[14:15] +; GFX940-NEXT: s_nop 1 +; GFX940-NEXT: v_cndmask_b32_e64 v6, v8, 0, vcc +; GFX940-NEXT: v_cndmask_b32_e32 v7, v9, v16, vcc ; GFX940-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_minimum_v4f64__nsz: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_cmp_lt_f64_e32 vcc_lo, v[0:1], v[8:9] -; GFX10-NEXT: v_cmp_lt_f64_e64 s4, v[2:3], v[10:11] -; GFX10-NEXT: v_cmp_lt_f64_e64 s5, v[4:5], v[12:13] -; GFX10-NEXT: v_cmp_lt_f64_e64 s7, v[6:7], v[14:15] -; GFX10-NEXT: v_cmp_o_f64_e64 s6, v[0:1], v[8:9] -; GFX10-NEXT: v_cmp_o_f64_e64 s8, v[2:3], v[10:11] -; GFX10-NEXT: v_cmp_o_f64_e64 s9, v[4:5], v[12:13] -; GFX10-NEXT: v_cmp_o_f64_e64 s10, v[6:7], v[14:15] -; GFX10-NEXT: v_cndmask_b32_e32 v16, v8, v0, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v8, v10, v2, s4 -; GFX10-NEXT: v_cndmask_b32_e64 v10, v12, v4, s5 -; GFX10-NEXT: v_cndmask_b32_e64 v12, v14, v6, s7 -; GFX10-NEXT: v_cndmask_b32_e32 v1, v9, v1, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v3, v11, v3, s4 -; GFX10-NEXT: v_cndmask_b32_e64 v5, v13, v5, s5 -; GFX10-NEXT: v_cndmask_b32_e64 v7, v15, v7, s7 -; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, v16, s6 -; GFX10-NEXT: v_cndmask_b32_e64 v1, 0x7ff80000, v1, s6 -; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, v8, s8 -; GFX10-NEXT: v_cndmask_b32_e64 v3, 0x7ff80000, v3, s8 -; GFX10-NEXT: v_cndmask_b32_e64 v4, 0, v10, s9 -; GFX10-NEXT: v_cndmask_b32_e64 v5, 0x7ff80000, v5, s9 -; GFX10-NEXT: v_cndmask_b32_e64 v6, 0, v12, s10 -; GFX10-NEXT: v_cndmask_b32_e64 v7, 0x7ff80000, v7, s10 +; GFX10-NEXT: v_min_f64 v[16:17], v[0:1], v[8:9] +; GFX10-NEXT: v_cmp_u_f64_e32 vcc_lo, v[0:1], v[8:9] +; GFX10-NEXT: v_min_f64 v[8:9], v[2:3], v[10:11] +; GFX10-NEXT: v_cmp_u_f64_e64 s4, v[2:3], v[10:11] +; 
GFX10-NEXT: v_min_f64 v[10:11], v[4:5], v[12:13] +; GFX10-NEXT: v_cmp_u_f64_e64 s5, v[4:5], v[12:13] +; GFX10-NEXT: v_min_f64 v[12:13], v[6:7], v[14:15] +; GFX10-NEXT: v_cmp_u_f64_e64 s6, v[6:7], v[14:15] +; GFX10-NEXT: v_cndmask_b32_e64 v0, v16, 0, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v1, v17, 0x7ff80000, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v2, v8, 0, s4 +; GFX10-NEXT: v_cndmask_b32_e64 v3, v9, 0x7ff80000, s4 +; GFX10-NEXT: v_cndmask_b32_e64 v4, v10, 0, s5 +; GFX10-NEXT: v_cndmask_b32_e64 v5, v11, 0x7ff80000, s5 +; GFX10-NEXT: v_cndmask_b32_e64 v6, v12, 0, s6 +; GFX10-NEXT: v_cndmask_b32_e64 v7, v13, 0x7ff80000, s6 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: v_minimum_v4f64__nsz: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_cmp_lt_f64_e32 vcc_lo, v[0:1], v[8:9] -; GFX11-NEXT: v_cmp_lt_f64_e64 s1, v[2:3], v[10:11] -; GFX11-NEXT: v_cmp_lt_f64_e64 s2, v[4:5], v[12:13] -; GFX11-NEXT: v_cmp_lt_f64_e64 s3, v[6:7], v[14:15] -; GFX11-NEXT: v_cmp_o_f64_e64 s0, v[0:1], v[8:9] -; GFX11-NEXT: v_cmp_o_f64_e64 s4, v[2:3], v[10:11] -; GFX11-NEXT: v_cmp_o_f64_e64 s5, v[4:5], v[12:13] -; GFX11-NEXT: v_cmp_o_f64_e64 s6, v[6:7], v[14:15] -; GFX11-NEXT: v_dual_cndmask_b32 v16, v8, v0 :: v_dual_cndmask_b32 v1, v9, v1 -; GFX11-NEXT: v_cndmask_b32_e64 v8, v10, v2, s1 -; GFX11-NEXT: v_cndmask_b32_e64 v10, v12, v4, s2 -; GFX11-NEXT: v_cndmask_b32_e64 v12, v14, v6, s3 -; GFX11-NEXT: v_cndmask_b32_e64 v3, v11, v3, s1 -; GFX11-NEXT: v_cndmask_b32_e64 v5, v13, v5, s2 -; GFX11-NEXT: v_cndmask_b32_e64 v7, v15, v7, s3 -; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, v16, s0 -; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, v8, s4 -; GFX11-NEXT: v_cndmask_b32_e64 v4, 0, v10, s5 -; GFX11-NEXT: v_cndmask_b32_e64 v6, 0, v12, s6 -; GFX11-NEXT: v_cndmask_b32_e64 v1, 0x7ff80000, v1, s0 -; GFX11-NEXT: v_cndmask_b32_e64 v3, 0x7ff80000, v3, s4 -; GFX11-NEXT: v_cndmask_b32_e64 v5, 0x7ff80000, v5, s5 -; GFX11-NEXT: v_cndmask_b32_e64 v7, 0x7ff80000, v7, s6 +; 
GFX11-NEXT: v_min_f64 v[16:17], v[0:1], v[8:9] +; GFX11-NEXT: v_cmp_u_f64_e32 vcc_lo, v[0:1], v[8:9] +; GFX11-NEXT: v_min_f64 v[8:9], v[2:3], v[10:11] +; GFX11-NEXT: v_cmp_u_f64_e64 s0, v[2:3], v[10:11] +; GFX11-NEXT: v_min_f64 v[10:11], v[4:5], v[12:13] +; GFX11-NEXT: v_cmp_u_f64_e64 s1, v[4:5], v[12:13] +; GFX11-NEXT: v_min_f64 v[12:13], v[6:7], v[14:15] +; GFX11-NEXT: v_cmp_u_f64_e64 s2, v[6:7], v[14:15] +; GFX11-NEXT: v_cndmask_b32_e64 v0, v16, 0, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e64 v1, v17, 0x7ff80000, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e64 v2, v8, 0, s0 +; GFX11-NEXT: v_cndmask_b32_e64 v3, v9, 0x7ff80000, s0 +; GFX11-NEXT: v_cndmask_b32_e64 v4, v10, 0, s1 +; GFX11-NEXT: v_cndmask_b32_e64 v5, v11, 0x7ff80000, s1 +; GFX11-NEXT: v_cndmask_b32_e64 v6, v12, 0, s2 +; GFX11-NEXT: v_cndmask_b32_e64 v7, v13, 0x7ff80000, s2 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-LABEL: v_minimum_v4f64__nsz: @@ -3351,108 +1730,55 @@ define <4 x double> @v_minimum_v4f64__nnan_nsz(<4 x double> %src0, <4 x double> ; GFX7-LABEL: v_minimum_v4f64__nnan_nsz: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_cmp_lt_f64_e32 vcc, v[0:1], v[8:9] -; GFX7-NEXT: v_cmp_lt_f64_e64 s[4:5], v[2:3], v[10:11] -; GFX7-NEXT: v_cmp_lt_f64_e64 s[6:7], v[4:5], v[12:13] -; GFX7-NEXT: v_cmp_lt_f64_e64 s[8:9], v[6:7], v[14:15] -; GFX7-NEXT: v_cndmask_b32_e32 v0, v8, v0, vcc -; GFX7-NEXT: v_cndmask_b32_e32 v1, v9, v1, vcc -; GFX7-NEXT: v_cndmask_b32_e64 v2, v10, v2, s[4:5] -; GFX7-NEXT: v_cndmask_b32_e64 v3, v11, v3, s[4:5] -; GFX7-NEXT: v_cndmask_b32_e64 v4, v12, v4, s[6:7] -; GFX7-NEXT: v_cndmask_b32_e64 v5, v13, v5, s[6:7] -; GFX7-NEXT: v_cndmask_b32_e64 v6, v14, v6, s[8:9] -; GFX7-NEXT: v_cndmask_b32_e64 v7, v15, v7, s[8:9] +; GFX7-NEXT: v_min_f64 v[0:1], v[0:1], v[8:9] +; GFX7-NEXT: v_min_f64 v[2:3], v[2:3], v[10:11] +; GFX7-NEXT: v_min_f64 v[4:5], v[4:5], v[12:13] +; GFX7-NEXT: v_min_f64 v[6:7], v[6:7], v[14:15] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: 
v_minimum_v4f64__nnan_nsz: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_cmp_lt_f64_e32 vcc, v[0:1], v[8:9] -; GFX8-NEXT: v_cmp_lt_f64_e64 s[4:5], v[2:3], v[10:11] -; GFX8-NEXT: v_cmp_lt_f64_e64 s[6:7], v[4:5], v[12:13] -; GFX8-NEXT: v_cmp_lt_f64_e64 s[8:9], v[6:7], v[14:15] -; GFX8-NEXT: v_cndmask_b32_e32 v0, v8, v0, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v1, v9, v1, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v2, v10, v2, s[4:5] -; GFX8-NEXT: v_cndmask_b32_e64 v3, v11, v3, s[4:5] -; GFX8-NEXT: v_cndmask_b32_e64 v4, v12, v4, s[6:7] -; GFX8-NEXT: v_cndmask_b32_e64 v5, v13, v5, s[6:7] -; GFX8-NEXT: v_cndmask_b32_e64 v6, v14, v6, s[8:9] -; GFX8-NEXT: v_cndmask_b32_e64 v7, v15, v7, s[8:9] +; GFX8-NEXT: v_min_f64 v[0:1], v[0:1], v[8:9] +; GFX8-NEXT: v_min_f64 v[2:3], v[2:3], v[10:11] +; GFX8-NEXT: v_min_f64 v[4:5], v[4:5], v[12:13] +; GFX8-NEXT: v_min_f64 v[6:7], v[6:7], v[14:15] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_minimum_v4f64__nnan_nsz: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_cmp_lt_f64_e32 vcc, v[0:1], v[8:9] -; GFX9-NEXT: v_cmp_lt_f64_e64 s[4:5], v[2:3], v[10:11] -; GFX9-NEXT: v_cmp_lt_f64_e64 s[6:7], v[4:5], v[12:13] -; GFX9-NEXT: v_cmp_lt_f64_e64 s[8:9], v[6:7], v[14:15] -; GFX9-NEXT: v_cndmask_b32_e32 v0, v8, v0, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v1, v9, v1, vcc -; GFX9-NEXT: v_cndmask_b32_e64 v2, v10, v2, s[4:5] -; GFX9-NEXT: v_cndmask_b32_e64 v3, v11, v3, s[4:5] -; GFX9-NEXT: v_cndmask_b32_e64 v4, v12, v4, s[6:7] -; GFX9-NEXT: v_cndmask_b32_e64 v5, v13, v5, s[6:7] -; GFX9-NEXT: v_cndmask_b32_e64 v6, v14, v6, s[8:9] -; GFX9-NEXT: v_cndmask_b32_e64 v7, v15, v7, s[8:9] +; GFX9-NEXT: v_min_f64 v[0:1], v[0:1], v[8:9] +; GFX9-NEXT: v_min_f64 v[2:3], v[2:3], v[10:11] +; GFX9-NEXT: v_min_f64 v[4:5], v[4:5], v[12:13] +; GFX9-NEXT: v_min_f64 v[6:7], v[6:7], v[14:15] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: v_minimum_v4f64__nnan_nsz: ; GFX940: ; %bb.0: ; GFX940-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_cmp_lt_f64_e32 vcc, v[0:1], v[8:9] -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v0, v8, v0, vcc -; GFX940-NEXT: v_cndmask_b32_e32 v1, v9, v1, vcc -; GFX940-NEXT: v_cmp_lt_f64_e32 vcc, v[2:3], v[10:11] -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v2, v10, v2, vcc -; GFX940-NEXT: v_cndmask_b32_e32 v3, v11, v3, vcc -; GFX940-NEXT: v_cmp_lt_f64_e32 vcc, v[4:5], v[12:13] -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v4, v12, v4, vcc -; GFX940-NEXT: v_cndmask_b32_e32 v5, v13, v5, vcc -; GFX940-NEXT: v_cmp_lt_f64_e32 vcc, v[6:7], v[14:15] -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v6, v14, v6, vcc -; GFX940-NEXT: v_cndmask_b32_e32 v7, v15, v7, vcc +; GFX940-NEXT: v_min_f64 v[0:1], v[0:1], v[8:9] +; GFX940-NEXT: v_min_f64 v[2:3], v[2:3], v[10:11] +; GFX940-NEXT: v_min_f64 v[4:5], v[4:5], v[12:13] +; GFX940-NEXT: v_min_f64 v[6:7], v[6:7], v[14:15] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_minimum_v4f64__nnan_nsz: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_cmp_lt_f64_e32 vcc_lo, v[0:1], v[8:9] -; GFX10-NEXT: v_cmp_lt_f64_e64 s4, v[2:3], v[10:11] -; GFX10-NEXT: v_cmp_lt_f64_e64 s5, v[4:5], v[12:13] -; GFX10-NEXT: v_cmp_lt_f64_e64 s6, v[6:7], v[14:15] -; GFX10-NEXT: v_cndmask_b32_e32 v0, v8, v0, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v2, v10, v2, s4 -; GFX10-NEXT: v_cndmask_b32_e64 v4, v12, v4, s5 -; GFX10-NEXT: v_cndmask_b32_e64 v6, v14, v6, s6 -; GFX10-NEXT: v_cndmask_b32_e32 v1, v9, v1, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v3, v11, v3, s4 -; GFX10-NEXT: v_cndmask_b32_e64 v5, v13, v5, s5 -; GFX10-NEXT: v_cndmask_b32_e64 v7, v15, v7, s6 +; GFX10-NEXT: v_min_f64 v[0:1], v[0:1], v[8:9] +; GFX10-NEXT: v_min_f64 v[2:3], v[2:3], v[10:11] +; GFX10-NEXT: v_min_f64 v[4:5], v[4:5], v[12:13] +; GFX10-NEXT: v_min_f64 v[6:7], v[6:7], v[14:15] ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: 
v_minimum_v4f64__nnan_nsz: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_cmp_lt_f64_e32 vcc_lo, v[0:1], v[8:9] -; GFX11-NEXT: v_cmp_lt_f64_e64 s0, v[2:3], v[10:11] -; GFX11-NEXT: v_cmp_lt_f64_e64 s1, v[4:5], v[12:13] -; GFX11-NEXT: v_cmp_lt_f64_e64 s2, v[6:7], v[14:15] -; GFX11-NEXT: v_dual_cndmask_b32 v0, v8, v0 :: v_dual_cndmask_b32 v1, v9, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_cndmask_b32_e64 v2, v10, v2, s0 -; GFX11-NEXT: v_cndmask_b32_e64 v4, v12, v4, s1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-NEXT: v_cndmask_b32_e64 v6, v14, v6, s2 -; GFX11-NEXT: v_cndmask_b32_e64 v3, v11, v3, s0 -; GFX11-NEXT: v_cndmask_b32_e64 v5, v13, v5, s1 -; GFX11-NEXT: v_cndmask_b32_e64 v7, v15, v7, s2 +; GFX11-NEXT: v_min_f64 v[0:1], v[0:1], v[8:9] +; GFX11-NEXT: v_min_f64 v[2:3], v[2:3], v[10:11] +; GFX11-NEXT: v_min_f64 v[4:5], v[4:5], v[12:13] +; GFX11-NEXT: v_min_f64 v[6:7], v[6:7], v[14:15] ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-LABEL: v_minimum_v4f64__nnan_nsz: @@ -3475,782 +1801,244 @@ define <8 x double> @v_minimum_v8f64(<8 x double> %src0, <8 x double> %src1) { ; GFX7-LABEL: v_minimum_v8f64: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_cmp_lt_f64_e32 vcc, v[0:1], v[16:17] -; GFX7-NEXT: v_cmp_o_f64_e64 s[4:5], v[0:1], v[16:17] -; GFX7-NEXT: v_mov_b32_e32 v32, 0x7ff80000 -; GFX7-NEXT: v_cmp_o_f64_e64 s[6:7], v[4:5], v[20:21] -; GFX7-NEXT: v_cmp_o_f64_e64 s[8:9], v[8:9], v[24:25] -; GFX7-NEXT: v_cmp_lt_f64_e64 s[10:11], v[12:13], v[28:29] -; GFX7-NEXT: v_cmp_o_f64_e64 s[12:13], v[12:13], v[28:29] -; GFX7-NEXT: v_cndmask_b32_e32 v31, v17, v1, vcc -; GFX7-NEXT: v_cndmask_b32_e64 v34, v32, v31, s[4:5] -; GFX7-NEXT: v_cndmask_b32_e32 v31, v16, v0, vcc -; GFX7-NEXT: v_cmp_class_f64_e64 vcc, v[0:1], 32 -; GFX7-NEXT: v_cndmask_b32_e64 v33, 0, v31, s[4:5] -; GFX7-NEXT: v_cmp_o_f64_e64 s[4:5], v[2:3], v[18:19] ; 
GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 -; GFX7-NEXT: v_cndmask_b32_e32 v0, v33, v0, vcc -; GFX7-NEXT: v_cndmask_b32_e32 v1, v34, v1, vcc -; GFX7-NEXT: v_cmp_class_f64_e64 vcc, v[16:17], 32 -; GFX7-NEXT: v_cndmask_b32_e32 v0, v0, v16, vcc -; GFX7-NEXT: v_cndmask_b32_e32 v1, v1, v17, vcc -; GFX7-NEXT: v_cmp_eq_f64_e32 vcc, 0, v[33:34] -; GFX7-NEXT: v_cndmask_b32_e32 v0, v33, v0, vcc -; GFX7-NEXT: v_cndmask_b32_e32 v1, v34, v1, vcc -; GFX7-NEXT: v_cmp_lt_f64_e32 vcc, v[2:3], v[18:19] -; GFX7-NEXT: v_cndmask_b32_e32 v16, v19, v3, vcc -; GFX7-NEXT: v_cndmask_b32_e64 v17, v32, v16, s[4:5] -; GFX7-NEXT: v_cndmask_b32_e32 v16, v18, v2, vcc -; GFX7-NEXT: v_cmp_class_f64_e64 vcc, v[2:3], 32 -; GFX7-NEXT: v_cndmask_b32_e64 v16, 0, v16, s[4:5] -; GFX7-NEXT: v_cmp_lt_f64_e64 s[4:5], v[4:5], v[20:21] -; GFX7-NEXT: v_cndmask_b32_e32 v2, v16, v2, vcc -; GFX7-NEXT: v_cndmask_b32_e32 v3, v17, v3, vcc -; GFX7-NEXT: v_cmp_class_f64_e64 vcc, v[18:19], 32 -; GFX7-NEXT: v_cndmask_b32_e32 v2, v2, v18, vcc -; GFX7-NEXT: v_cndmask_b32_e32 v3, v3, v19, vcc -; GFX7-NEXT: v_cmp_eq_f64_e32 vcc, 0, v[16:17] -; GFX7-NEXT: v_cndmask_b32_e64 v18, v21, v5, s[4:5] -; GFX7-NEXT: v_cndmask_b32_e32 v2, v16, v2, vcc -; GFX7-NEXT: v_cndmask_b32_e32 v3, v17, v3, vcc -; GFX7-NEXT: v_cmp_class_f64_e64 vcc, v[4:5], 32 -; GFX7-NEXT: v_cndmask_b32_e64 v16, v20, v4, s[4:5] -; GFX7-NEXT: v_cmp_class_f64_e64 s[4:5], v[20:21], 32 -; GFX7-NEXT: v_cndmask_b32_e64 v17, v32, v18, s[6:7] -; GFX7-NEXT: v_cndmask_b32_e64 v16, 0, v16, s[6:7] -; GFX7-NEXT: v_cmp_o_f64_e64 s[6:7], v[6:7], v[22:23] -; GFX7-NEXT: v_cndmask_b32_e32 v4, v16, v4, vcc -; GFX7-NEXT: v_cndmask_b32_e32 v5, v17, v5, vcc -; GFX7-NEXT: v_cndmask_b32_e64 v4, v4, v20, s[4:5] -; GFX7-NEXT: v_cndmask_b32_e64 v5, v5, v21, s[4:5] -; GFX7-NEXT: v_cmp_eq_f64_e64 s[4:5], 0, v[16:17] -; GFX7-NEXT: v_cmp_lt_f64_e32 vcc, v[6:7], v[22:23] -; GFX7-NEXT: v_cndmask_b32_e64 v4, v16, v4, s[4:5] -; GFX7-NEXT: v_cndmask_b32_e64 v5, v17, v5, s[4:5] -; GFX7-NEXT: 
v_cmp_lt_f64_e64 s[4:5], v[8:9], v[24:25] -; GFX7-NEXT: v_cndmask_b32_e32 v18, v23, v7, vcc -; GFX7-NEXT: v_cndmask_b32_e32 v16, v22, v6, vcc -; GFX7-NEXT: v_cmp_class_f64_e64 vcc, v[6:7], 32 -; GFX7-NEXT: v_cndmask_b32_e64 v19, v32, v18, s[6:7] -; GFX7-NEXT: v_cndmask_b32_e64 v18, 0, v16, s[6:7] -; GFX7-NEXT: v_cmp_class_f64_e64 s[6:7], v[22:23], 32 -; GFX7-NEXT: v_cndmask_b32_e64 v16, v25, v9, s[4:5] -; GFX7-NEXT: v_cndmask_b32_e64 v17, v32, v16, s[8:9] -; GFX7-NEXT: v_cndmask_b32_e64 v16, v24, v8, s[4:5] -; GFX7-NEXT: v_cndmask_b32_e32 v6, v18, v6, vcc -; GFX7-NEXT: v_cndmask_b32_e32 v7, v19, v7, vcc -; GFX7-NEXT: v_cmp_eq_f64_e32 vcc, 0, v[18:19] -; GFX7-NEXT: v_cmp_class_f64_e64 s[4:5], v[8:9], 32 -; GFX7-NEXT: v_cndmask_b32_e64 v6, v6, v22, s[6:7] -; GFX7-NEXT: v_cndmask_b32_e64 v7, v7, v23, s[6:7] -; GFX7-NEXT: v_cndmask_b32_e64 v16, 0, v16, s[8:9] -; GFX7-NEXT: v_cmp_class_f64_e64 s[6:7], v[24:25], 32 -; GFX7-NEXT: v_cmp_o_f64_e64 s[8:9], v[10:11], v[26:27] -; GFX7-NEXT: v_cndmask_b32_e32 v6, v18, v6, vcc -; GFX7-NEXT: v_cndmask_b32_e64 v8, v16, v8, s[4:5] -; GFX7-NEXT: v_cndmask_b32_e32 v7, v19, v7, vcc -; GFX7-NEXT: v_cmp_lt_f64_e32 vcc, v[10:11], v[26:27] -; GFX7-NEXT: v_cndmask_b32_e64 v9, v17, v9, s[4:5] -; GFX7-NEXT: v_cmp_eq_f64_e64 s[4:5], 0, v[16:17] -; GFX7-NEXT: v_cndmask_b32_e64 v8, v8, v24, s[6:7] -; GFX7-NEXT: v_cndmask_b32_e64 v9, v9, v25, s[6:7] -; GFX7-NEXT: v_cmp_class_f64_e64 s[6:7], v[26:27], 32 -; GFX7-NEXT: v_cndmask_b32_e32 v18, v27, v11, vcc -; GFX7-NEXT: v_cndmask_b32_e64 v19, v32, v18, s[8:9] -; GFX7-NEXT: v_cndmask_b32_e64 v8, v16, v8, s[4:5] -; GFX7-NEXT: v_cndmask_b32_e32 v16, v26, v10, vcc -; GFX7-NEXT: v_cmp_class_f64_e64 vcc, v[10:11], 32 -; GFX7-NEXT: v_cndmask_b32_e64 v18, 0, v16, s[8:9] -; GFX7-NEXT: v_cmp_eq_f64_e64 s[8:9], 0, v[18:19] -; GFX7-NEXT: v_cndmask_b32_e64 v9, v17, v9, s[4:5] +; GFX7-NEXT: v_min_f64 v[32:33], v[2:3], v[18:19] +; GFX7-NEXT: v_cmp_u_f64_e32 vcc, v[2:3], v[18:19] +; GFX7-NEXT: v_min_f64 v[18:19], 
v[4:5], v[20:21] +; GFX7-NEXT: v_cmp_u_f64_e64 s[4:5], v[4:5], v[20:21] +; GFX7-NEXT: v_min_f64 v[2:3], v[0:1], v[16:17] +; GFX7-NEXT: v_cmp_u_f64_e64 s[8:9], v[0:1], v[16:17] +; GFX7-NEXT: v_mov_b32_e32 v34, 0x7ff80000 +; GFX7-NEXT: v_min_f64 v[20:21], v[6:7], v[22:23] +; GFX7-NEXT: v_cmp_u_f64_e64 s[6:7], v[6:7], v[22:23] +; GFX7-NEXT: v_min_f64 v[16:17], v[8:9], v[24:25] +; GFX7-NEXT: v_cmp_u_f64_e64 s[10:11], v[8:9], v[24:25] +; GFX7-NEXT: v_min_f64 v[22:23], v[10:11], v[26:27] +; GFX7-NEXT: v_cmp_u_f64_e64 s[12:13], v[10:11], v[26:27] +; GFX7-NEXT: v_min_f64 v[24:25], v[12:13], v[28:29] +; GFX7-NEXT: v_cmp_u_f64_e64 s[14:15], v[12:13], v[28:29] +; GFX7-NEXT: v_cndmask_b32_e64 v0, v2, 0, s[8:9] +; GFX7-NEXT: v_cndmask_b32_e64 v1, v3, v34, s[8:9] +; GFX7-NEXT: v_cndmask_b32_e64 v2, v32, 0, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v3, v33, v34, vcc +; GFX7-NEXT: v_cndmask_b32_e64 v4, v18, 0, s[4:5] +; GFX7-NEXT: v_cndmask_b32_e64 v5, v19, v34, s[4:5] +; GFX7-NEXT: v_cndmask_b32_e64 v6, v20, 0, s[6:7] +; GFX7-NEXT: v_cndmask_b32_e64 v7, v21, v34, s[6:7] +; GFX7-NEXT: v_cndmask_b32_e64 v8, v16, 0, s[10:11] +; GFX7-NEXT: v_cndmask_b32_e64 v9, v17, v34, s[10:11] +; GFX7-NEXT: v_cndmask_b32_e64 v10, v22, 0, s[12:13] +; GFX7-NEXT: v_cndmask_b32_e64 v11, v23, v34, s[12:13] +; GFX7-NEXT: v_cndmask_b32_e64 v12, v24, 0, s[14:15] +; GFX7-NEXT: v_cndmask_b32_e64 v13, v25, v34, s[14:15] ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_cmp_o_f64_e64 s[4:5], v[14:15], v[30:31] -; GFX7-NEXT: v_cndmask_b32_e64 v16, v29, v13, s[10:11] -; GFX7-NEXT: v_cndmask_b32_e64 v17, v32, v16, s[12:13] -; GFX7-NEXT: v_cndmask_b32_e64 v16, v28, v12, s[10:11] -; GFX7-NEXT: v_cndmask_b32_e32 v10, v18, v10, vcc -; GFX7-NEXT: v_cndmask_b32_e32 v11, v19, v11, vcc -; GFX7-NEXT: v_cmp_lt_f64_e32 vcc, v[14:15], v[30:31] -; GFX7-NEXT: v_cndmask_b32_e64 v10, v10, v26, s[6:7] -; GFX7-NEXT: v_cndmask_b32_e64 v11, v11, v27, s[6:7] -; GFX7-NEXT: v_cndmask_b32_e64 v10, v18, v10, s[8:9] -; GFX7-NEXT: 
v_cndmask_b32_e64 v11, v19, v11, s[8:9] -; GFX7-NEXT: v_cmp_class_f64_e64 s[6:7], v[12:13], 32 -; GFX7-NEXT: v_cndmask_b32_e64 v16, 0, v16, s[12:13] -; GFX7-NEXT: v_cmp_class_f64_e64 s[8:9], v[28:29], 32 -; GFX7-NEXT: v_cndmask_b32_e32 v18, v31, v15, vcc -; GFX7-NEXT: v_cndmask_b32_e64 v19, v32, v18, s[4:5] -; GFX7-NEXT: v_cndmask_b32_e32 v18, v30, v14, vcc -; GFX7-NEXT: v_cmp_class_f64_e64 vcc, v[14:15], 32 -; GFX7-NEXT: v_cndmask_b32_e64 v18, 0, v18, s[4:5] -; GFX7-NEXT: v_cmp_class_f64_e64 s[4:5], v[30:31], 32 -; GFX7-NEXT: v_cmp_eq_f64_e64 s[10:11], 0, v[16:17] -; GFX7-NEXT: v_cmp_eq_f64_e64 s[12:13], 0, v[18:19] -; GFX7-NEXT: v_cndmask_b32_e64 v12, v16, v12, s[6:7] -; GFX7-NEXT: v_cndmask_b32_e64 v13, v17, v13, s[6:7] -; GFX7-NEXT: v_cndmask_b32_e64 v12, v12, v28, s[8:9] -; GFX7-NEXT: v_cndmask_b32_e32 v14, v18, v14, vcc -; GFX7-NEXT: v_cndmask_b32_e32 v15, v19, v15, vcc -; GFX7-NEXT: v_cndmask_b32_e64 v14, v14, v30, s[4:5] -; GFX7-NEXT: v_cndmask_b32_e64 v13, v13, v29, s[8:9] -; GFX7-NEXT: v_cndmask_b32_e64 v15, v15, v31, s[4:5] -; GFX7-NEXT: v_cndmask_b32_e64 v12, v16, v12, s[10:11] -; GFX7-NEXT: v_cndmask_b32_e64 v13, v17, v13, s[10:11] -; GFX7-NEXT: v_cndmask_b32_e64 v14, v18, v14, s[12:13] -; GFX7-NEXT: v_cndmask_b32_e64 v15, v19, v15, s[12:13] +; GFX7-NEXT: v_min_f64 v[18:19], v[14:15], v[30:31] +; GFX7-NEXT: v_cmp_u_f64_e32 vcc, v[14:15], v[30:31] +; GFX7-NEXT: v_cndmask_b32_e64 v14, v18, 0, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v15, v19, v34, vcc ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_minimum_v8f64: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_cmp_lt_f64_e32 vcc, v[0:1], v[16:17] -; GFX8-NEXT: v_cmp_o_f64_e64 s[4:5], v[0:1], v[16:17] -; GFX8-NEXT: v_mov_b32_e32 v32, 0x7ff80000 -; GFX8-NEXT: v_cmp_o_f64_e64 s[6:7], v[4:5], v[20:21] -; GFX8-NEXT: v_cmp_o_f64_e64 s[8:9], v[8:9], v[24:25] -; GFX8-NEXT: v_cmp_lt_f64_e64 s[10:11], v[12:13], v[28:29] -; GFX8-NEXT: v_cmp_o_f64_e64 s[12:13], v[12:13], v[28:29] 
-; GFX8-NEXT: v_cndmask_b32_e32 v31, v17, v1, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v34, v32, v31, s[4:5] -; GFX8-NEXT: v_cndmask_b32_e32 v31, v16, v0, vcc -; GFX8-NEXT: v_cmp_class_f64_e64 vcc, v[0:1], 32 -; GFX8-NEXT: v_cndmask_b32_e64 v33, 0, v31, s[4:5] -; GFX8-NEXT: v_cmp_o_f64_e64 s[4:5], v[2:3], v[18:19] ; GFX8-NEXT: buffer_load_dword v31, off, s[0:3], s32 -; GFX8-NEXT: v_cndmask_b32_e32 v0, v33, v0, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v1, v34, v1, vcc -; GFX8-NEXT: v_cmp_class_f64_e64 vcc, v[16:17], 32 -; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v16, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v17, vcc -; GFX8-NEXT: v_cmp_eq_f64_e32 vcc, 0, v[33:34] -; GFX8-NEXT: v_cndmask_b32_e32 v0, v33, v0, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v1, v34, v1, vcc -; GFX8-NEXT: v_cmp_lt_f64_e32 vcc, v[2:3], v[18:19] -; GFX8-NEXT: v_cndmask_b32_e32 v16, v19, v3, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v17, v32, v16, s[4:5] -; GFX8-NEXT: v_cndmask_b32_e32 v16, v18, v2, vcc -; GFX8-NEXT: v_cmp_class_f64_e64 vcc, v[2:3], 32 -; GFX8-NEXT: v_cndmask_b32_e64 v16, 0, v16, s[4:5] -; GFX8-NEXT: v_cmp_lt_f64_e64 s[4:5], v[4:5], v[20:21] -; GFX8-NEXT: v_cndmask_b32_e32 v2, v16, v2, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v3, v17, v3, vcc -; GFX8-NEXT: v_cmp_class_f64_e64 vcc, v[18:19], 32 -; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v18, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v3, v3, v19, vcc -; GFX8-NEXT: v_cmp_eq_f64_e32 vcc, 0, v[16:17] -; GFX8-NEXT: v_cndmask_b32_e64 v18, v21, v5, s[4:5] -; GFX8-NEXT: v_cndmask_b32_e32 v2, v16, v2, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v3, v17, v3, vcc -; GFX8-NEXT: v_cmp_class_f64_e64 vcc, v[4:5], 32 -; GFX8-NEXT: v_cndmask_b32_e64 v16, v20, v4, s[4:5] -; GFX8-NEXT: v_cmp_class_f64_e64 s[4:5], v[20:21], 32 -; GFX8-NEXT: v_cndmask_b32_e64 v17, v32, v18, s[6:7] -; GFX8-NEXT: v_cndmask_b32_e64 v16, 0, v16, s[6:7] -; GFX8-NEXT: v_cmp_o_f64_e64 s[6:7], v[6:7], v[22:23] -; GFX8-NEXT: v_cndmask_b32_e32 v4, v16, v4, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v5, v17, v5, vcc -; 
GFX8-NEXT: v_cndmask_b32_e64 v4, v4, v20, s[4:5] -; GFX8-NEXT: v_cndmask_b32_e64 v5, v5, v21, s[4:5] -; GFX8-NEXT: v_cmp_eq_f64_e64 s[4:5], 0, v[16:17] -; GFX8-NEXT: v_cmp_lt_f64_e32 vcc, v[6:7], v[22:23] -; GFX8-NEXT: v_cndmask_b32_e64 v4, v16, v4, s[4:5] -; GFX8-NEXT: v_cndmask_b32_e64 v5, v17, v5, s[4:5] -; GFX8-NEXT: v_cmp_lt_f64_e64 s[4:5], v[8:9], v[24:25] -; GFX8-NEXT: v_cndmask_b32_e32 v18, v23, v7, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v16, v22, v6, vcc -; GFX8-NEXT: v_cmp_class_f64_e64 vcc, v[6:7], 32 -; GFX8-NEXT: v_cndmask_b32_e64 v19, v32, v18, s[6:7] -; GFX8-NEXT: v_cndmask_b32_e64 v18, 0, v16, s[6:7] -; GFX8-NEXT: v_cmp_class_f64_e64 s[6:7], v[22:23], 32 -; GFX8-NEXT: v_cndmask_b32_e64 v16, v25, v9, s[4:5] -; GFX8-NEXT: v_cndmask_b32_e64 v17, v32, v16, s[8:9] -; GFX8-NEXT: v_cndmask_b32_e64 v16, v24, v8, s[4:5] -; GFX8-NEXT: v_cndmask_b32_e32 v6, v18, v6, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v7, v19, v7, vcc -; GFX8-NEXT: v_cmp_eq_f64_e32 vcc, 0, v[18:19] -; GFX8-NEXT: v_cmp_class_f64_e64 s[4:5], v[8:9], 32 -; GFX8-NEXT: v_cndmask_b32_e64 v6, v6, v22, s[6:7] -; GFX8-NEXT: v_cndmask_b32_e64 v7, v7, v23, s[6:7] -; GFX8-NEXT: v_cndmask_b32_e64 v16, 0, v16, s[8:9] -; GFX8-NEXT: v_cmp_class_f64_e64 s[6:7], v[24:25], 32 -; GFX8-NEXT: v_cmp_o_f64_e64 s[8:9], v[10:11], v[26:27] -; GFX8-NEXT: v_cndmask_b32_e32 v6, v18, v6, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v8, v16, v8, s[4:5] -; GFX8-NEXT: v_cndmask_b32_e32 v7, v19, v7, vcc -; GFX8-NEXT: v_cmp_lt_f64_e32 vcc, v[10:11], v[26:27] -; GFX8-NEXT: v_cndmask_b32_e64 v9, v17, v9, s[4:5] -; GFX8-NEXT: v_cmp_eq_f64_e64 s[4:5], 0, v[16:17] -; GFX8-NEXT: v_cndmask_b32_e64 v8, v8, v24, s[6:7] -; GFX8-NEXT: v_cndmask_b32_e64 v9, v9, v25, s[6:7] -; GFX8-NEXT: v_cmp_class_f64_e64 s[6:7], v[26:27], 32 -; GFX8-NEXT: v_cndmask_b32_e32 v18, v27, v11, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v19, v32, v18, s[8:9] -; GFX8-NEXT: v_cndmask_b32_e64 v8, v16, v8, s[4:5] -; GFX8-NEXT: v_cndmask_b32_e32 v16, v26, v10, vcc -; GFX8-NEXT: 
v_cmp_class_f64_e64 vcc, v[10:11], 32 -; GFX8-NEXT: v_cndmask_b32_e64 v18, 0, v16, s[8:9] -; GFX8-NEXT: v_cmp_eq_f64_e64 s[8:9], 0, v[18:19] -; GFX8-NEXT: v_cndmask_b32_e64 v9, v17, v9, s[4:5] +; GFX8-NEXT: v_min_f64 v[32:33], v[2:3], v[18:19] +; GFX8-NEXT: v_cmp_u_f64_e32 vcc, v[2:3], v[18:19] +; GFX8-NEXT: v_min_f64 v[18:19], v[4:5], v[20:21] +; GFX8-NEXT: v_cmp_u_f64_e64 s[4:5], v[4:5], v[20:21] +; GFX8-NEXT: v_min_f64 v[2:3], v[0:1], v[16:17] +; GFX8-NEXT: v_cmp_u_f64_e64 s[8:9], v[0:1], v[16:17] +; GFX8-NEXT: v_mov_b32_e32 v34, 0x7ff80000 +; GFX8-NEXT: v_min_f64 v[20:21], v[6:7], v[22:23] +; GFX8-NEXT: v_cmp_u_f64_e64 s[6:7], v[6:7], v[22:23] +; GFX8-NEXT: v_min_f64 v[16:17], v[8:9], v[24:25] +; GFX8-NEXT: v_cmp_u_f64_e64 s[10:11], v[8:9], v[24:25] +; GFX8-NEXT: v_min_f64 v[22:23], v[10:11], v[26:27] +; GFX8-NEXT: v_cmp_u_f64_e64 s[12:13], v[10:11], v[26:27] +; GFX8-NEXT: v_min_f64 v[24:25], v[12:13], v[28:29] +; GFX8-NEXT: v_cmp_u_f64_e64 s[14:15], v[12:13], v[28:29] +; GFX8-NEXT: v_cndmask_b32_e64 v0, v2, 0, s[8:9] +; GFX8-NEXT: v_cndmask_b32_e64 v1, v3, v34, s[8:9] +; GFX8-NEXT: v_cndmask_b32_e64 v2, v32, 0, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v3, v33, v34, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v4, v18, 0, s[4:5] +; GFX8-NEXT: v_cndmask_b32_e64 v5, v19, v34, s[4:5] +; GFX8-NEXT: v_cndmask_b32_e64 v6, v20, 0, s[6:7] +; GFX8-NEXT: v_cndmask_b32_e64 v7, v21, v34, s[6:7] +; GFX8-NEXT: v_cndmask_b32_e64 v8, v16, 0, s[10:11] +; GFX8-NEXT: v_cndmask_b32_e64 v9, v17, v34, s[10:11] +; GFX8-NEXT: v_cndmask_b32_e64 v10, v22, 0, s[12:13] +; GFX8-NEXT: v_cndmask_b32_e64 v11, v23, v34, s[12:13] +; GFX8-NEXT: v_cndmask_b32_e64 v12, v24, 0, s[14:15] +; GFX8-NEXT: v_cndmask_b32_e64 v13, v25, v34, s[14:15] ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_cmp_o_f64_e64 s[4:5], v[14:15], v[30:31] -; GFX8-NEXT: v_cndmask_b32_e64 v16, v29, v13, s[10:11] -; GFX8-NEXT: v_cndmask_b32_e64 v17, v32, v16, s[12:13] -; GFX8-NEXT: v_cndmask_b32_e64 v16, v28, v12, s[10:11] -; GFX8-NEXT: 
v_cndmask_b32_e32 v10, v18, v10, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v11, v19, v11, vcc -; GFX8-NEXT: v_cmp_lt_f64_e32 vcc, v[14:15], v[30:31] -; GFX8-NEXT: v_cndmask_b32_e64 v10, v10, v26, s[6:7] -; GFX8-NEXT: v_cndmask_b32_e64 v11, v11, v27, s[6:7] -; GFX8-NEXT: v_cndmask_b32_e64 v10, v18, v10, s[8:9] -; GFX8-NEXT: v_cndmask_b32_e64 v11, v19, v11, s[8:9] -; GFX8-NEXT: v_cmp_class_f64_e64 s[6:7], v[12:13], 32 -; GFX8-NEXT: v_cndmask_b32_e64 v16, 0, v16, s[12:13] -; GFX8-NEXT: v_cmp_class_f64_e64 s[8:9], v[28:29], 32 -; GFX8-NEXT: v_cndmask_b32_e32 v18, v31, v15, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v19, v32, v18, s[4:5] -; GFX8-NEXT: v_cndmask_b32_e32 v18, v30, v14, vcc -; GFX8-NEXT: v_cmp_class_f64_e64 vcc, v[14:15], 32 -; GFX8-NEXT: v_cndmask_b32_e64 v18, 0, v18, s[4:5] -; GFX8-NEXT: v_cmp_class_f64_e64 s[4:5], v[30:31], 32 -; GFX8-NEXT: v_cmp_eq_f64_e64 s[10:11], 0, v[16:17] -; GFX8-NEXT: v_cmp_eq_f64_e64 s[12:13], 0, v[18:19] -; GFX8-NEXT: v_cndmask_b32_e64 v12, v16, v12, s[6:7] -; GFX8-NEXT: v_cndmask_b32_e64 v13, v17, v13, s[6:7] -; GFX8-NEXT: v_cndmask_b32_e64 v12, v12, v28, s[8:9] -; GFX8-NEXT: v_cndmask_b32_e32 v14, v18, v14, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v15, v19, v15, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v14, v14, v30, s[4:5] -; GFX8-NEXT: v_cndmask_b32_e64 v13, v13, v29, s[8:9] -; GFX8-NEXT: v_cndmask_b32_e64 v15, v15, v31, s[4:5] -; GFX8-NEXT: v_cndmask_b32_e64 v12, v16, v12, s[10:11] -; GFX8-NEXT: v_cndmask_b32_e64 v13, v17, v13, s[10:11] -; GFX8-NEXT: v_cndmask_b32_e64 v14, v18, v14, s[12:13] -; GFX8-NEXT: v_cndmask_b32_e64 v15, v19, v15, s[12:13] +; GFX8-NEXT: v_min_f64 v[18:19], v[14:15], v[30:31] +; GFX8-NEXT: v_cmp_u_f64_e32 vcc, v[14:15], v[30:31] +; GFX8-NEXT: v_cndmask_b32_e64 v14, v18, 0, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v15, v19, v34, vcc ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_minimum_v8f64: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_cmp_lt_f64_e32 vcc, v[0:1], v[16:17] -; 
GFX9-NEXT: v_cmp_o_f64_e64 s[4:5], v[0:1], v[16:17] -; GFX9-NEXT: v_mov_b32_e32 v32, 0x7ff80000 -; GFX9-NEXT: v_cmp_o_f64_e64 s[6:7], v[4:5], v[20:21] -; GFX9-NEXT: v_cmp_o_f64_e64 s[8:9], v[8:9], v[24:25] -; GFX9-NEXT: v_cmp_lt_f64_e64 s[10:11], v[12:13], v[28:29] -; GFX9-NEXT: v_cmp_o_f64_e64 s[12:13], v[12:13], v[28:29] -; GFX9-NEXT: v_cndmask_b32_e32 v31, v17, v1, vcc -; GFX9-NEXT: v_cndmask_b32_e64 v34, v32, v31, s[4:5] -; GFX9-NEXT: v_cndmask_b32_e32 v31, v16, v0, vcc -; GFX9-NEXT: v_cmp_class_f64_e64 vcc, v[0:1], 32 -; GFX9-NEXT: v_cndmask_b32_e64 v33, 0, v31, s[4:5] -; GFX9-NEXT: v_cmp_o_f64_e64 s[4:5], v[2:3], v[18:19] ; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v33, v0, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v1, v34, v1, vcc -; GFX9-NEXT: v_cmp_class_f64_e64 vcc, v[16:17], 32 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v16, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v17, vcc -; GFX9-NEXT: v_cmp_eq_f64_e32 vcc, 0, v[33:34] -; GFX9-NEXT: v_cndmask_b32_e32 v0, v33, v0, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v1, v34, v1, vcc -; GFX9-NEXT: v_cmp_lt_f64_e32 vcc, v[2:3], v[18:19] -; GFX9-NEXT: v_cndmask_b32_e32 v16, v19, v3, vcc -; GFX9-NEXT: v_cndmask_b32_e64 v17, v32, v16, s[4:5] -; GFX9-NEXT: v_cndmask_b32_e32 v16, v18, v2, vcc -; GFX9-NEXT: v_cmp_class_f64_e64 vcc, v[2:3], 32 -; GFX9-NEXT: v_cndmask_b32_e64 v16, 0, v16, s[4:5] -; GFX9-NEXT: v_cmp_lt_f64_e64 s[4:5], v[4:5], v[20:21] -; GFX9-NEXT: v_cndmask_b32_e32 v2, v16, v2, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v3, v17, v3, vcc -; GFX9-NEXT: v_cmp_class_f64_e64 vcc, v[18:19], 32 -; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v18, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v19, vcc -; GFX9-NEXT: v_cmp_eq_f64_e32 vcc, 0, v[16:17] -; GFX9-NEXT: v_cndmask_b32_e64 v18, v21, v5, s[4:5] -; GFX9-NEXT: v_cndmask_b32_e32 v2, v16, v2, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v3, v17, v3, vcc -; GFX9-NEXT: v_cmp_class_f64_e64 vcc, v[4:5], 32 -; GFX9-NEXT: v_cndmask_b32_e64 v16, v20, v4, s[4:5] 
-; GFX9-NEXT: v_cmp_class_f64_e64 s[4:5], v[20:21], 32 -; GFX9-NEXT: v_cndmask_b32_e64 v17, v32, v18, s[6:7] -; GFX9-NEXT: v_cndmask_b32_e64 v16, 0, v16, s[6:7] -; GFX9-NEXT: v_cmp_o_f64_e64 s[6:7], v[6:7], v[22:23] -; GFX9-NEXT: v_cndmask_b32_e32 v4, v16, v4, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v5, v17, v5, vcc -; GFX9-NEXT: v_cndmask_b32_e64 v4, v4, v20, s[4:5] -; GFX9-NEXT: v_cndmask_b32_e64 v5, v5, v21, s[4:5] -; GFX9-NEXT: v_cmp_eq_f64_e64 s[4:5], 0, v[16:17] -; GFX9-NEXT: v_cmp_lt_f64_e32 vcc, v[6:7], v[22:23] -; GFX9-NEXT: v_cndmask_b32_e64 v4, v16, v4, s[4:5] -; GFX9-NEXT: v_cndmask_b32_e64 v5, v17, v5, s[4:5] -; GFX9-NEXT: v_cmp_lt_f64_e64 s[4:5], v[8:9], v[24:25] -; GFX9-NEXT: v_cndmask_b32_e32 v18, v23, v7, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v16, v22, v6, vcc -; GFX9-NEXT: v_cmp_class_f64_e64 vcc, v[6:7], 32 -; GFX9-NEXT: v_cndmask_b32_e64 v19, v32, v18, s[6:7] -; GFX9-NEXT: v_cndmask_b32_e64 v18, 0, v16, s[6:7] -; GFX9-NEXT: v_cmp_class_f64_e64 s[6:7], v[22:23], 32 -; GFX9-NEXT: v_cndmask_b32_e64 v16, v25, v9, s[4:5] -; GFX9-NEXT: v_cndmask_b32_e64 v17, v32, v16, s[8:9] -; GFX9-NEXT: v_cndmask_b32_e64 v16, v24, v8, s[4:5] -; GFX9-NEXT: v_cndmask_b32_e32 v6, v18, v6, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v7, v19, v7, vcc -; GFX9-NEXT: v_cmp_eq_f64_e32 vcc, 0, v[18:19] -; GFX9-NEXT: v_cmp_class_f64_e64 s[4:5], v[8:9], 32 -; GFX9-NEXT: v_cndmask_b32_e64 v6, v6, v22, s[6:7] -; GFX9-NEXT: v_cndmask_b32_e64 v7, v7, v23, s[6:7] -; GFX9-NEXT: v_cndmask_b32_e64 v16, 0, v16, s[8:9] -; GFX9-NEXT: v_cmp_class_f64_e64 s[6:7], v[24:25], 32 -; GFX9-NEXT: v_cmp_o_f64_e64 s[8:9], v[10:11], v[26:27] -; GFX9-NEXT: v_cndmask_b32_e32 v6, v18, v6, vcc -; GFX9-NEXT: v_cndmask_b32_e64 v8, v16, v8, s[4:5] -; GFX9-NEXT: v_cndmask_b32_e32 v7, v19, v7, vcc -; GFX9-NEXT: v_cmp_lt_f64_e32 vcc, v[10:11], v[26:27] -; GFX9-NEXT: v_cndmask_b32_e64 v9, v17, v9, s[4:5] -; GFX9-NEXT: v_cmp_eq_f64_e64 s[4:5], 0, v[16:17] -; GFX9-NEXT: v_cndmask_b32_e64 v8, v8, v24, s[6:7] -; GFX9-NEXT: 
v_cndmask_b32_e64 v9, v9, v25, s[6:7] -; GFX9-NEXT: v_cmp_class_f64_e64 s[6:7], v[26:27], 32 -; GFX9-NEXT: v_cndmask_b32_e32 v18, v27, v11, vcc -; GFX9-NEXT: v_cndmask_b32_e64 v19, v32, v18, s[8:9] -; GFX9-NEXT: v_cndmask_b32_e64 v8, v16, v8, s[4:5] -; GFX9-NEXT: v_cndmask_b32_e32 v16, v26, v10, vcc -; GFX9-NEXT: v_cmp_class_f64_e64 vcc, v[10:11], 32 -; GFX9-NEXT: v_cndmask_b32_e64 v18, 0, v16, s[8:9] -; GFX9-NEXT: v_cmp_eq_f64_e64 s[8:9], 0, v[18:19] -; GFX9-NEXT: v_cndmask_b32_e64 v9, v17, v9, s[4:5] +; GFX9-NEXT: v_min_f64 v[32:33], v[2:3], v[18:19] +; GFX9-NEXT: v_cmp_u_f64_e32 vcc, v[2:3], v[18:19] +; GFX9-NEXT: v_min_f64 v[18:19], v[4:5], v[20:21] +; GFX9-NEXT: v_cmp_u_f64_e64 s[4:5], v[4:5], v[20:21] +; GFX9-NEXT: v_min_f64 v[2:3], v[0:1], v[16:17] +; GFX9-NEXT: v_cmp_u_f64_e64 s[8:9], v[0:1], v[16:17] +; GFX9-NEXT: v_mov_b32_e32 v34, 0x7ff80000 +; GFX9-NEXT: v_min_f64 v[20:21], v[6:7], v[22:23] +; GFX9-NEXT: v_cmp_u_f64_e64 s[6:7], v[6:7], v[22:23] +; GFX9-NEXT: v_min_f64 v[16:17], v[8:9], v[24:25] +; GFX9-NEXT: v_cmp_u_f64_e64 s[10:11], v[8:9], v[24:25] +; GFX9-NEXT: v_min_f64 v[22:23], v[10:11], v[26:27] +; GFX9-NEXT: v_cmp_u_f64_e64 s[12:13], v[10:11], v[26:27] +; GFX9-NEXT: v_min_f64 v[24:25], v[12:13], v[28:29] +; GFX9-NEXT: v_cmp_u_f64_e64 s[14:15], v[12:13], v[28:29] +; GFX9-NEXT: v_cndmask_b32_e64 v0, v2, 0, s[8:9] +; GFX9-NEXT: v_cndmask_b32_e64 v1, v3, v34, s[8:9] +; GFX9-NEXT: v_cndmask_b32_e64 v2, v32, 0, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v3, v33, v34, vcc +; GFX9-NEXT: v_cndmask_b32_e64 v4, v18, 0, s[4:5] +; GFX9-NEXT: v_cndmask_b32_e64 v5, v19, v34, s[4:5] +; GFX9-NEXT: v_cndmask_b32_e64 v6, v20, 0, s[6:7] +; GFX9-NEXT: v_cndmask_b32_e64 v7, v21, v34, s[6:7] +; GFX9-NEXT: v_cndmask_b32_e64 v8, v16, 0, s[10:11] +; GFX9-NEXT: v_cndmask_b32_e64 v9, v17, v34, s[10:11] +; GFX9-NEXT: v_cndmask_b32_e64 v10, v22, 0, s[12:13] +; GFX9-NEXT: v_cndmask_b32_e64 v11, v23, v34, s[12:13] +; GFX9-NEXT: v_cndmask_b32_e64 v12, v24, 0, s[14:15] +; GFX9-NEXT: 
v_cndmask_b32_e64 v13, v25, v34, s[14:15] ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_cmp_o_f64_e64 s[4:5], v[14:15], v[30:31] -; GFX9-NEXT: v_cndmask_b32_e64 v16, v29, v13, s[10:11] -; GFX9-NEXT: v_cndmask_b32_e64 v17, v32, v16, s[12:13] -; GFX9-NEXT: v_cndmask_b32_e64 v16, v28, v12, s[10:11] -; GFX9-NEXT: v_cndmask_b32_e32 v10, v18, v10, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v11, v19, v11, vcc -; GFX9-NEXT: v_cmp_lt_f64_e32 vcc, v[14:15], v[30:31] -; GFX9-NEXT: v_cndmask_b32_e64 v10, v10, v26, s[6:7] -; GFX9-NEXT: v_cndmask_b32_e64 v11, v11, v27, s[6:7] -; GFX9-NEXT: v_cndmask_b32_e64 v10, v18, v10, s[8:9] -; GFX9-NEXT: v_cndmask_b32_e64 v11, v19, v11, s[8:9] -; GFX9-NEXT: v_cmp_class_f64_e64 s[6:7], v[12:13], 32 -; GFX9-NEXT: v_cndmask_b32_e64 v16, 0, v16, s[12:13] -; GFX9-NEXT: v_cmp_class_f64_e64 s[8:9], v[28:29], 32 -; GFX9-NEXT: v_cndmask_b32_e32 v18, v31, v15, vcc -; GFX9-NEXT: v_cndmask_b32_e64 v19, v32, v18, s[4:5] -; GFX9-NEXT: v_cndmask_b32_e32 v18, v30, v14, vcc -; GFX9-NEXT: v_cmp_class_f64_e64 vcc, v[14:15], 32 -; GFX9-NEXT: v_cndmask_b32_e64 v18, 0, v18, s[4:5] -; GFX9-NEXT: v_cmp_class_f64_e64 s[4:5], v[30:31], 32 -; GFX9-NEXT: v_cmp_eq_f64_e64 s[10:11], 0, v[16:17] -; GFX9-NEXT: v_cmp_eq_f64_e64 s[12:13], 0, v[18:19] -; GFX9-NEXT: v_cndmask_b32_e64 v12, v16, v12, s[6:7] -; GFX9-NEXT: v_cndmask_b32_e64 v13, v17, v13, s[6:7] -; GFX9-NEXT: v_cndmask_b32_e64 v12, v12, v28, s[8:9] -; GFX9-NEXT: v_cndmask_b32_e32 v14, v18, v14, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v15, v19, v15, vcc -; GFX9-NEXT: v_cndmask_b32_e64 v14, v14, v30, s[4:5] -; GFX9-NEXT: v_cndmask_b32_e64 v13, v13, v29, s[8:9] -; GFX9-NEXT: v_cndmask_b32_e64 v15, v15, v31, s[4:5] -; GFX9-NEXT: v_cndmask_b32_e64 v12, v16, v12, s[10:11] -; GFX9-NEXT: v_cndmask_b32_e64 v13, v17, v13, s[10:11] -; GFX9-NEXT: v_cndmask_b32_e64 v14, v18, v14, s[12:13] -; GFX9-NEXT: v_cndmask_b32_e64 v15, v19, v15, s[12:13] +; GFX9-NEXT: v_min_f64 v[18:19], v[14:15], v[30:31] +; GFX9-NEXT: v_cmp_u_f64_e32 vcc, 
v[14:15], v[30:31] +; GFX9-NEXT: v_cndmask_b32_e64 v14, v18, 0, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v15, v19, v34, vcc ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: v_minimum_v8f64: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: scratch_load_dword v31, off, s32 -; GFX940-NEXT: v_cmp_lt_f64_e32 vcc, v[0:1], v[16:17] -; GFX940-NEXT: v_mov_b32_e32 v32, 0x7ff80000 -; GFX940-NEXT: v_cmp_o_f64_e64 s[0:1], v[0:1], v[16:17] -; GFX940-NEXT: v_cndmask_b32_e32 v33, v17, v1, vcc -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e64 v35, v32, v33, s[0:1] -; GFX940-NEXT: v_cndmask_b32_e32 v33, v16, v0, vcc -; GFX940-NEXT: v_cndmask_b32_e64 v34, 0, v33, s[0:1] -; GFX940-NEXT: v_cmp_class_f64_e64 vcc, v[0:1], 32 -; GFX940-NEXT: v_cmp_class_f64_e64 s[0:1], v[16:17], 32 -; GFX940-NEXT: v_cmp_eq_f64_e64 s[2:3], 0, v[34:35] -; GFX940-NEXT: v_cndmask_b32_e32 v0, v34, v0, vcc -; GFX940-NEXT: v_cndmask_b32_e32 v1, v35, v1, vcc -; GFX940-NEXT: v_cmp_lt_f64_e32 vcc, v[2:3], v[18:19] -; GFX940-NEXT: v_cndmask_b32_e64 v0, v0, v16, s[0:1] -; GFX940-NEXT: v_cndmask_b32_e64 v1, v1, v17, s[0:1] -; GFX940-NEXT: v_cndmask_b32_e32 v16, v19, v3, vcc -; GFX940-NEXT: v_cmp_o_f64_e64 s[0:1], v[2:3], v[18:19] -; GFX940-NEXT: v_cndmask_b32_e64 v0, v34, v0, s[2:3] -; GFX940-NEXT: v_cndmask_b32_e64 v1, v35, v1, s[2:3] -; GFX940-NEXT: v_cndmask_b32_e64 v17, v32, v16, s[0:1] -; GFX940-NEXT: v_cndmask_b32_e32 v16, v18, v2, vcc -; GFX940-NEXT: v_cndmask_b32_e64 v16, 0, v16, s[0:1] -; GFX940-NEXT: v_cmp_class_f64_e64 vcc, v[2:3], 32 -; GFX940-NEXT: v_cmp_class_f64_e64 s[0:1], v[18:19], 32 -; GFX940-NEXT: v_cmp_eq_f64_e64 s[2:3], 0, v[16:17] -; GFX940-NEXT: v_cndmask_b32_e32 v2, v16, v2, vcc -; GFX940-NEXT: v_cndmask_b32_e64 v2, v2, v18, s[0:1] -; GFX940-NEXT: v_cndmask_b32_e32 v3, v17, v3, vcc -; GFX940-NEXT: v_cmp_lt_f64_e32 vcc, v[4:5], v[20:21] -; GFX940-NEXT: v_cndmask_b32_e64 v2, v16, v2, s[2:3] -; GFX940-NEXT: v_cndmask_b32_e64 v3, v3, v19, s[0:1] -; 
GFX940-NEXT: v_cndmask_b32_e32 v16, v21, v5, vcc -; GFX940-NEXT: v_cmp_o_f64_e64 s[0:1], v[4:5], v[20:21] -; GFX940-NEXT: v_cndmask_b32_e64 v3, v17, v3, s[2:3] -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e64 v17, v32, v16, s[0:1] -; GFX940-NEXT: v_cndmask_b32_e32 v16, v20, v4, vcc -; GFX940-NEXT: v_cndmask_b32_e64 v16, 0, v16, s[0:1] -; GFX940-NEXT: v_cmp_class_f64_e64 vcc, v[4:5], 32 -; GFX940-NEXT: v_cmp_class_f64_e64 s[0:1], v[20:21], 32 -; GFX940-NEXT: v_cmp_eq_f64_e64 s[2:3], 0, v[16:17] -; GFX940-NEXT: v_cndmask_b32_e32 v4, v16, v4, vcc -; GFX940-NEXT: v_cndmask_b32_e64 v4, v4, v20, s[0:1] -; GFX940-NEXT: v_cndmask_b32_e32 v5, v17, v5, vcc -; GFX940-NEXT: v_cmp_lt_f64_e32 vcc, v[6:7], v[22:23] -; GFX940-NEXT: v_cndmask_b32_e64 v4, v16, v4, s[2:3] -; GFX940-NEXT: v_cndmask_b32_e64 v5, v5, v21, s[0:1] -; GFX940-NEXT: v_cndmask_b32_e32 v16, v23, v7, vcc -; GFX940-NEXT: v_cmp_o_f64_e64 s[0:1], v[6:7], v[22:23] -; GFX940-NEXT: v_cndmask_b32_e64 v5, v17, v5, s[2:3] -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e64 v17, v32, v16, s[0:1] -; GFX940-NEXT: v_cndmask_b32_e32 v16, v22, v6, vcc -; GFX940-NEXT: v_cndmask_b32_e64 v16, 0, v16, s[0:1] -; GFX940-NEXT: v_cmp_class_f64_e64 vcc, v[6:7], 32 -; GFX940-NEXT: v_cmp_class_f64_e64 s[0:1], v[22:23], 32 -; GFX940-NEXT: v_cmp_eq_f64_e64 s[2:3], 0, v[16:17] -; GFX940-NEXT: v_cndmask_b32_e32 v6, v16, v6, vcc -; GFX940-NEXT: v_cndmask_b32_e64 v6, v6, v22, s[0:1] -; GFX940-NEXT: v_cndmask_b32_e32 v7, v17, v7, vcc -; GFX940-NEXT: v_cmp_lt_f64_e32 vcc, v[8:9], v[24:25] -; GFX940-NEXT: v_cndmask_b32_e64 v6, v16, v6, s[2:3] -; GFX940-NEXT: v_cndmask_b32_e64 v7, v7, v23, s[0:1] -; GFX940-NEXT: v_cndmask_b32_e32 v16, v25, v9, vcc -; GFX940-NEXT: v_cmp_o_f64_e64 s[0:1], v[8:9], v[24:25] -; GFX940-NEXT: v_cndmask_b32_e64 v7, v17, v7, s[2:3] -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e64 v17, v32, v16, s[0:1] -; GFX940-NEXT: v_cndmask_b32_e32 v16, v24, v8, vcc -; GFX940-NEXT: v_cndmask_b32_e64 v16, 0, v16, 
s[0:1] -; GFX940-NEXT: v_cmp_class_f64_e64 vcc, v[8:9], 32 -; GFX940-NEXT: v_cmp_class_f64_e64 s[0:1], v[24:25], 32 -; GFX940-NEXT: v_cmp_eq_f64_e64 s[2:3], 0, v[16:17] -; GFX940-NEXT: v_cndmask_b32_e32 v8, v16, v8, vcc -; GFX940-NEXT: v_cndmask_b32_e64 v8, v8, v24, s[0:1] -; GFX940-NEXT: v_cndmask_b32_e32 v9, v17, v9, vcc -; GFX940-NEXT: v_cmp_lt_f64_e32 vcc, v[10:11], v[26:27] -; GFX940-NEXT: v_cndmask_b32_e64 v8, v16, v8, s[2:3] -; GFX940-NEXT: v_cndmask_b32_e64 v9, v9, v25, s[0:1] -; GFX940-NEXT: v_cndmask_b32_e32 v16, v27, v11, vcc -; GFX940-NEXT: v_cmp_o_f64_e64 s[0:1], v[10:11], v[26:27] -; GFX940-NEXT: v_cndmask_b32_e64 v9, v17, v9, s[2:3] -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e64 v17, v32, v16, s[0:1] -; GFX940-NEXT: v_cndmask_b32_e32 v16, v26, v10, vcc -; GFX940-NEXT: v_cndmask_b32_e64 v16, 0, v16, s[0:1] -; GFX940-NEXT: v_cmp_class_f64_e64 vcc, v[10:11], 32 -; GFX940-NEXT: v_cmp_class_f64_e64 s[0:1], v[26:27], 32 -; GFX940-NEXT: v_cmp_eq_f64_e64 s[2:3], 0, v[16:17] -; GFX940-NEXT: v_cndmask_b32_e32 v10, v16, v10, vcc -; GFX940-NEXT: v_cndmask_b32_e64 v10, v10, v26, s[0:1] -; GFX940-NEXT: v_cndmask_b32_e32 v11, v17, v11, vcc -; GFX940-NEXT: v_cmp_lt_f64_e32 vcc, v[12:13], v[28:29] -; GFX940-NEXT: v_cndmask_b32_e64 v10, v16, v10, s[2:3] -; GFX940-NEXT: v_cndmask_b32_e64 v11, v11, v27, s[0:1] -; GFX940-NEXT: v_cndmask_b32_e32 v16, v29, v13, vcc -; GFX940-NEXT: v_cmp_o_f64_e64 s[0:1], v[12:13], v[28:29] -; GFX940-NEXT: v_cndmask_b32_e64 v11, v17, v11, s[2:3] -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e64 v17, v32, v16, s[0:1] -; GFX940-NEXT: v_cndmask_b32_e32 v16, v28, v12, vcc -; GFX940-NEXT: v_cndmask_b32_e64 v16, 0, v16, s[0:1] -; GFX940-NEXT: v_cmp_class_f64_e64 vcc, v[12:13], 32 -; GFX940-NEXT: v_cmp_class_f64_e64 s[0:1], v[28:29], 32 -; GFX940-NEXT: v_cmp_eq_f64_e64 s[2:3], 0, v[16:17] -; GFX940-NEXT: v_cndmask_b32_e32 v12, v16, v12, vcc -; GFX940-NEXT: v_cndmask_b32_e64 v12, v12, v28, s[0:1] -; GFX940-NEXT: v_cndmask_b32_e32 
v13, v17, v13, vcc +; GFX940-NEXT: v_mov_b32_e32 v54, 0x7ff80000 +; GFX940-NEXT: v_min_f64 v[32:33], v[0:1], v[16:17] +; GFX940-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[16:17] +; GFX940-NEXT: v_min_f64 v[34:35], v[2:3], v[18:19] +; GFX940-NEXT: v_min_f64 v[36:37], v[4:5], v[20:21] +; GFX940-NEXT: v_cndmask_b32_e64 v0, v32, 0, vcc +; GFX940-NEXT: v_cndmask_b32_e32 v1, v33, v54, vcc +; GFX940-NEXT: v_cmp_u_f64_e32 vcc, v[2:3], v[18:19] +; GFX940-NEXT: v_min_f64 v[38:39], v[6:7], v[22:23] +; GFX940-NEXT: v_min_f64 v[48:49], v[8:9], v[24:25] +; GFX940-NEXT: v_cndmask_b32_e64 v2, v34, 0, vcc +; GFX940-NEXT: v_cndmask_b32_e32 v3, v35, v54, vcc +; GFX940-NEXT: v_cmp_u_f64_e32 vcc, v[4:5], v[20:21] +; GFX940-NEXT: v_min_f64 v[50:51], v[10:11], v[26:27] +; GFX940-NEXT: v_min_f64 v[52:53], v[12:13], v[28:29] +; GFX940-NEXT: v_cndmask_b32_e64 v4, v36, 0, vcc +; GFX940-NEXT: v_cndmask_b32_e32 v5, v37, v54, vcc +; GFX940-NEXT: v_cmp_u_f64_e32 vcc, v[6:7], v[22:23] ; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_cmp_lt_f64_e32 vcc, v[14:15], v[30:31] -; GFX940-NEXT: v_cndmask_b32_e64 v12, v16, v12, s[2:3] -; GFX940-NEXT: v_cndmask_b32_e64 v13, v13, v29, s[0:1] -; GFX940-NEXT: v_cndmask_b32_e32 v16, v31, v15, vcc -; GFX940-NEXT: v_cmp_o_f64_e64 s[0:1], v[14:15], v[30:31] -; GFX940-NEXT: v_cndmask_b32_e64 v13, v17, v13, s[2:3] -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e64 v17, v32, v16, s[0:1] -; GFX940-NEXT: v_cndmask_b32_e32 v16, v30, v14, vcc -; GFX940-NEXT: v_cndmask_b32_e64 v16, 0, v16, s[0:1] -; GFX940-NEXT: v_cmp_class_f64_e64 vcc, v[14:15], 32 -; GFX940-NEXT: v_cmp_class_f64_e64 s[0:1], v[30:31], 32 -; GFX940-NEXT: v_cmp_eq_f64_e64 s[2:3], 0, v[16:17] -; GFX940-NEXT: v_cndmask_b32_e32 v14, v16, v14, vcc -; GFX940-NEXT: v_cndmask_b32_e32 v15, v17, v15, vcc -; GFX940-NEXT: v_cndmask_b32_e64 v14, v14, v30, s[0:1] -; GFX940-NEXT: v_cndmask_b32_e64 v15, v15, v31, s[0:1] -; GFX940-NEXT: v_cndmask_b32_e64 v14, v16, v14, s[2:3] -; GFX940-NEXT: v_cndmask_b32_e64 v15, 
v17, v15, s[2:3] +; GFX940-NEXT: v_min_f64 v[16:17], v[14:15], v[30:31] +; GFX940-NEXT: v_cndmask_b32_e64 v6, v38, 0, vcc +; GFX940-NEXT: v_cndmask_b32_e32 v7, v39, v54, vcc +; GFX940-NEXT: v_cmp_u_f64_e32 vcc, v[8:9], v[24:25] +; GFX940-NEXT: s_nop 1 +; GFX940-NEXT: v_cndmask_b32_e64 v8, v48, 0, vcc +; GFX940-NEXT: v_cndmask_b32_e32 v9, v49, v54, vcc +; GFX940-NEXT: v_cmp_u_f64_e32 vcc, v[10:11], v[26:27] +; GFX940-NEXT: s_nop 1 +; GFX940-NEXT: v_cndmask_b32_e64 v10, v50, 0, vcc +; GFX940-NEXT: v_cndmask_b32_e32 v11, v51, v54, vcc +; GFX940-NEXT: v_cmp_u_f64_e32 vcc, v[12:13], v[28:29] +; GFX940-NEXT: s_nop 1 +; GFX940-NEXT: v_cndmask_b32_e64 v12, v52, 0, vcc +; GFX940-NEXT: v_cndmask_b32_e32 v13, v53, v54, vcc +; GFX940-NEXT: v_cmp_u_f64_e32 vcc, v[14:15], v[30:31] +; GFX940-NEXT: s_nop 1 +; GFX940-NEXT: v_cndmask_b32_e64 v14, v16, 0, vcc +; GFX940-NEXT: v_cndmask_b32_e32 v15, v17, v54, vcc ; GFX940-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_minimum_v8f64: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: buffer_load_dword v31, off, s[0:3], s32 -; GFX10-NEXT: v_cmp_lt_f64_e32 vcc_lo, v[0:1], v[16:17] -; GFX10-NEXT: v_cmp_o_f64_e64 s4, v[0:1], v[16:17] -; GFX10-NEXT: v_cmp_lt_f64_e64 s9, v[6:7], v[22:23] -; GFX10-NEXT: v_cmp_lt_f64_e64 s10, v[8:9], v[24:25] -; GFX10-NEXT: v_cmp_lt_f64_e64 s11, v[10:11], v[26:27] -; GFX10-NEXT: v_cmp_lt_f64_e64 s12, v[12:13], v[28:29] -; GFX10-NEXT: v_cmp_o_f64_e64 s13, v[6:7], v[22:23] -; GFX10-NEXT: v_cmp_o_f64_e64 s14, v[8:9], v[24:25] -; GFX10-NEXT: v_cmp_o_f64_e64 s15, v[10:11], v[26:27] -; GFX10-NEXT: v_cmp_o_f64_e64 s16, v[12:13], v[28:29] -; GFX10-NEXT: v_cmp_lt_f64_e64 s5, v[2:3], v[18:19] -; GFX10-NEXT: v_cmp_o_f64_e64 s6, v[2:3], v[18:19] -; GFX10-NEXT: v_cmp_lt_f64_e64 s7, v[4:5], v[20:21] -; GFX10-NEXT: v_cmp_o_f64_e64 s8, v[4:5], v[20:21] -; GFX10-NEXT: v_cmp_class_f64_e64 s17, v[26:27], 32 -; GFX10-NEXT: v_cmp_class_f64_e64 s18, v[28:29], 32 -; GFX10-NEXT: v_cndmask_b32_e32 v32, 
v17, v1, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v38, v23, v7, s9 -; GFX10-NEXT: v_cndmask_b32_e64 v48, v25, v9, s10 -; GFX10-NEXT: v_cndmask_b32_e64 v50, v27, v11, s11 -; GFX10-NEXT: v_cndmask_b32_e64 v33, 0x7ff80000, v32, s4 -; GFX10-NEXT: v_cndmask_b32_e32 v32, v16, v0, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v52, v29, v13, s12 -; GFX10-NEXT: v_cmp_class_f64_e64 vcc_lo, v[0:1], 32 -; GFX10-NEXT: v_cndmask_b32_e64 v39, 0x7ff80000, v38, s13 -; GFX10-NEXT: v_cndmask_b32_e64 v49, 0x7ff80000, v48, s14 -; GFX10-NEXT: v_cndmask_b32_e64 v32, 0, v32, s4 -; GFX10-NEXT: v_cmp_class_f64_e64 s4, v[2:3], 32 -; GFX10-NEXT: v_cndmask_b32_e64 v51, 0x7ff80000, v50, s15 -; GFX10-NEXT: v_cndmask_b32_e64 v53, 0x7ff80000, v52, s16 -; GFX10-NEXT: v_cndmask_b32_e64 v38, v22, v6, s9 -; GFX10-NEXT: v_cndmask_b32_e64 v48, v24, v8, s10 -; GFX10-NEXT: v_cndmask_b32_e64 v50, v26, v10, s11 -; GFX10-NEXT: v_cndmask_b32_e64 v52, v28, v12, s12 -; GFX10-NEXT: v_cmp_class_f64_e64 s11, v[16:17], 32 -; GFX10-NEXT: v_cmp_class_f64_e64 s12, v[18:19], 32 -; GFX10-NEXT: v_cndmask_b32_e64 v34, v19, v3, s5 -; GFX10-NEXT: v_cndmask_b32_e64 v38, 0, v38, s13 -; GFX10-NEXT: v_cndmask_b32_e64 v36, v21, v5, s7 -; GFX10-NEXT: v_cmp_class_f64_e64 s9, v[12:13], 32 -; GFX10-NEXT: v_cndmask_b32_e64 v48, 0, v48, s14 -; GFX10-NEXT: v_cndmask_b32_e64 v35, 0x7ff80000, v34, s6 -; GFX10-NEXT: v_cndmask_b32_e64 v34, v18, v2, s5 -; GFX10-NEXT: v_cndmask_b32_e64 v37, 0x7ff80000, v36, s8 -; GFX10-NEXT: v_cndmask_b32_e64 v36, v20, v4, s7 -; GFX10-NEXT: v_cndmask_b32_e32 v0, v32, v0, vcc_lo -; GFX10-NEXT: v_cmp_class_f64_e64 s5, v[4:5], 32 -; GFX10-NEXT: v_cndmask_b32_e64 v34, 0, v34, s6 -; GFX10-NEXT: v_cmp_class_f64_e64 s6, v[6:7], 32 -; GFX10-NEXT: v_cndmask_b32_e64 v36, 0, v36, s8 -; GFX10-NEXT: v_cmp_class_f64_e64 s7, v[8:9], 32 -; GFX10-NEXT: v_cmp_class_f64_e64 s8, v[10:11], 32 -; GFX10-NEXT: v_cndmask_b32_e64 v2, v34, v2, s4 -; GFX10-NEXT: v_cndmask_b32_e64 v50, 0, v50, s15 -; GFX10-NEXT: v_cndmask_b32_e64 v52, 0, v52, 
s16 -; GFX10-NEXT: v_cmp_class_f64_e64 s14, v[20:21], 32 -; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, v16, s11 -; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, v18, s12 -; GFX10-NEXT: v_cmp_class_f64_e64 s15, v[22:23], 32 -; GFX10-NEXT: v_cmp_class_f64_e64 s16, v[24:25], 32 -; GFX10-NEXT: v_cmp_eq_f64_e64 s19, 0, v[32:33] -; GFX10-NEXT: v_cmp_eq_f64_e64 s20, 0, v[34:35] -; GFX10-NEXT: v_cmp_eq_f64_e64 s21, 0, v[36:37] -; GFX10-NEXT: v_cmp_eq_f64_e64 s22, 0, v[48:49] -; GFX10-NEXT: v_cmp_eq_f64_e64 s23, 0, v[50:51] -; GFX10-NEXT: v_cmp_eq_f64_e64 s24, 0, v[52:53] -; GFX10-NEXT: v_cndmask_b32_e32 v1, v33, v1, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v12, v52, v12, s9 -; GFX10-NEXT: v_cndmask_b32_e64 v4, v36, v4, s5 -; GFX10-NEXT: v_cndmask_b32_e64 v3, v35, v3, s4 -; GFX10-NEXT: v_cndmask_b32_e64 v6, v38, v6, s6 -; GFX10-NEXT: v_cndmask_b32_e64 v5, v37, v5, s5 -; GFX10-NEXT: v_cndmask_b32_e64 v8, v48, v8, s7 -; GFX10-NEXT: v_cndmask_b32_e64 v10, v50, v10, s8 -; GFX10-NEXT: v_cndmask_b32_e64 v7, v39, v7, s6 -; GFX10-NEXT: v_cndmask_b32_e64 v9, v49, v9, s7 -; GFX10-NEXT: v_cndmask_b32_e64 v11, v51, v11, s8 -; GFX10-NEXT: v_cndmask_b32_e64 v13, v53, v13, s9 -; GFX10-NEXT: v_cndmask_b32_e64 v4, v4, v20, s14 -; GFX10-NEXT: v_cndmask_b32_e64 v10, v10, v26, s17 -; GFX10-NEXT: v_cndmask_b32_e64 v6, v6, v22, s15 -; GFX10-NEXT: v_cndmask_b32_e64 v8, v8, v24, s16 -; GFX10-NEXT: v_cndmask_b32_e64 v12, v12, v28, s18 -; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, v17, s11 -; GFX10-NEXT: v_cndmask_b32_e64 v3, v3, v19, s12 -; GFX10-NEXT: v_cndmask_b32_e64 v5, v5, v21, s14 -; GFX10-NEXT: v_cndmask_b32_e64 v7, v7, v23, s15 -; GFX10-NEXT: v_cndmask_b32_e64 v9, v9, v25, s16 -; GFX10-NEXT: v_cndmask_b32_e64 v11, v11, v27, s17 -; GFX10-NEXT: v_cndmask_b32_e64 v13, v13, v29, s18 -; GFX10-NEXT: v_cndmask_b32_e64 v0, v32, v0, s19 -; GFX10-NEXT: v_cndmask_b32_e64 v2, v34, v2, s20 -; GFX10-NEXT: v_cndmask_b32_e64 v4, v36, v4, s21 -; GFX10-NEXT: v_cndmask_b32_e64 v8, v48, v8, s22 -; GFX10-NEXT: v_cndmask_b32_e64 
v10, v50, v10, s23 -; GFX10-NEXT: v_cndmask_b32_e64 v12, v52, v12, s24 -; GFX10-NEXT: v_cndmask_b32_e64 v1, v33, v1, s19 -; GFX10-NEXT: v_cndmask_b32_e64 v3, v35, v3, s20 -; GFX10-NEXT: v_cndmask_b32_e64 v5, v37, v5, s21 -; GFX10-NEXT: v_cndmask_b32_e64 v9, v49, v9, s22 -; GFX10-NEXT: v_cndmask_b32_e64 v11, v51, v11, s23 -; GFX10-NEXT: v_cndmask_b32_e64 v13, v53, v13, s24 +; GFX10-NEXT: v_min_f64 v[32:33], v[0:1], v[16:17] +; GFX10-NEXT: v_cmp_u_f64_e32 vcc_lo, v[0:1], v[16:17] +; GFX10-NEXT: v_min_f64 v[16:17], v[2:3], v[18:19] +; GFX10-NEXT: v_cmp_u_f64_e64 s4, v[2:3], v[18:19] +; GFX10-NEXT: v_min_f64 v[18:19], v[4:5], v[20:21] +; GFX10-NEXT: v_cmp_u_f64_e64 s5, v[4:5], v[20:21] +; GFX10-NEXT: v_min_f64 v[20:21], v[6:7], v[22:23] +; GFX10-NEXT: v_cmp_u_f64_e64 s6, v[6:7], v[22:23] +; GFX10-NEXT: v_min_f64 v[22:23], v[8:9], v[24:25] +; GFX10-NEXT: v_cmp_u_f64_e64 s7, v[8:9], v[24:25] +; GFX10-NEXT: v_min_f64 v[24:25], v[10:11], v[26:27] +; GFX10-NEXT: v_cmp_u_f64_e64 s8, v[10:11], v[26:27] +; GFX10-NEXT: v_min_f64 v[26:27], v[12:13], v[28:29] +; GFX10-NEXT: v_cmp_u_f64_e64 s9, v[12:13], v[28:29] +; GFX10-NEXT: v_cndmask_b32_e64 v0, v32, 0, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v1, v33, 0x7ff80000, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v2, v16, 0, s4 +; GFX10-NEXT: v_cndmask_b32_e64 v3, v17, 0x7ff80000, s4 +; GFX10-NEXT: v_cndmask_b32_e64 v4, v18, 0, s5 +; GFX10-NEXT: v_cndmask_b32_e64 v5, v19, 0x7ff80000, s5 +; GFX10-NEXT: v_cndmask_b32_e64 v6, v20, 0, s6 +; GFX10-NEXT: v_cndmask_b32_e64 v7, v21, 0x7ff80000, s6 +; GFX10-NEXT: v_cndmask_b32_e64 v8, v22, 0, s7 +; GFX10-NEXT: v_cndmask_b32_e64 v9, v23, 0x7ff80000, s7 +; GFX10-NEXT: v_cndmask_b32_e64 v10, v24, 0, s8 +; GFX10-NEXT: v_cndmask_b32_e64 v11, v25, 0x7ff80000, s8 +; GFX10-NEXT: v_cndmask_b32_e64 v12, v26, 0, s9 +; GFX10-NEXT: v_cndmask_b32_e64 v13, v27, 0x7ff80000, s9 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_cmp_lt_f64_e64 s10, v[14:15], v[30:31] -; GFX10-NEXT: v_cmp_o_f64_e64 s13, v[14:15], 
v[30:31] -; GFX10-NEXT: v_cmp_class_f64_e64 s25, v[30:31], 32 -; GFX10-NEXT: v_cndmask_b32_e64 v16, v31, v15, s10 -; GFX10-NEXT: v_cndmask_b32_e64 v18, v30, v14, s10 -; GFX10-NEXT: v_cmp_eq_f64_e64 s10, 0, v[38:39] -; GFX10-NEXT: v_cndmask_b32_e64 v55, 0x7ff80000, v16, s13 -; GFX10-NEXT: v_cndmask_b32_e64 v54, 0, v18, s13 -; GFX10-NEXT: v_cmp_class_f64_e64 s13, v[14:15], 32 -; GFX10-NEXT: v_cmp_eq_f64_e32 vcc_lo, 0, v[54:55] -; GFX10-NEXT: v_cndmask_b32_e64 v6, v38, v6, s10 -; GFX10-NEXT: v_cndmask_b32_e64 v7, v39, v7, s10 -; GFX10-NEXT: v_cndmask_b32_e64 v14, v54, v14, s13 -; GFX10-NEXT: v_cndmask_b32_e64 v15, v55, v15, s13 -; GFX10-NEXT: v_cndmask_b32_e64 v14, v14, v30, s25 -; GFX10-NEXT: v_cndmask_b32_e64 v15, v15, v31, s25 -; GFX10-NEXT: v_cndmask_b32_e32 v14, v54, v14, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e32 v15, v55, v15, vcc_lo +; GFX10-NEXT: v_min_f64 v[28:29], v[14:15], v[30:31] +; GFX10-NEXT: v_cmp_u_f64_e64 s10, v[14:15], v[30:31] +; GFX10-NEXT: v_cndmask_b32_e64 v14, v28, 0, s10 +; GFX10-NEXT: v_cndmask_b32_e64 v15, v29, 0x7ff80000, s10 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: v_minimum_v8f64: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: scratch_load_b32 v31, off, s32 -; GFX11-NEXT: v_cmp_lt_f64_e64 s4, v[6:7], v[22:23] -; GFX11-NEXT: v_cmp_o_f64_e64 s9, v[6:7], v[22:23] -; GFX11-NEXT: v_cmp_lt_f64_e64 s1, v[2:3], v[18:19] -; GFX11-NEXT: v_cmp_lt_f64_e64 s6, v[10:11], v[26:27] -; GFX11-NEXT: v_cmp_lt_f64_e32 vcc_lo, v[0:1], v[16:17] -; GFX11-NEXT: v_cmp_o_f64_e64 s2, v[2:3], v[18:19] -; GFX11-NEXT: v_cmp_o_f64_e64 s11, v[10:11], v[26:27] -; GFX11-NEXT: v_cmp_o_f64_e64 s0, v[0:1], v[16:17] -; GFX11-NEXT: v_cmp_lt_f64_e64 s3, v[4:5], v[20:21] -; GFX11-NEXT: v_cmp_lt_f64_e64 s5, v[8:9], v[24:25] -; GFX11-NEXT: v_cmp_lt_f64_e64 s7, v[12:13], v[28:29] -; GFX11-NEXT: v_cmp_o_f64_e64 s8, v[4:5], v[20:21] -; GFX11-NEXT: v_cmp_o_f64_e64 s10, v[8:9], v[24:25] -; GFX11-NEXT: v_cmp_o_f64_e64 s12, v[12:13], 
v[28:29] -; GFX11-NEXT: v_cmp_class_f64_e64 s13, v[18:19], 32 -; GFX11-NEXT: v_cmp_class_f64_e64 s15, v[20:21], 32 -; GFX11-NEXT: v_cndmask_b32_e64 v38, v23, v7, s4 -; GFX11-NEXT: v_cndmask_b32_e64 v34, v19, v3, s1 -; GFX11-NEXT: v_cndmask_b32_e64 v50, v27, v11, s6 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) -; GFX11-NEXT: v_cndmask_b32_e64 v39, 0x7ff80000, v38, s9 -; GFX11-NEXT: v_cndmask_b32_e64 v38, v22, v6, s4 -; GFX11-NEXT: v_cmp_class_f64_e64 s4, v[6:7], 32 -; GFX11-NEXT: v_cndmask_b32_e64 v35, 0x7ff80000, v34, s2 -; GFX11-NEXT: v_cndmask_b32_e64 v51, 0x7ff80000, v50, s11 -; GFX11-NEXT: v_cndmask_b32_e64 v34, v18, v2, s1 -; GFX11-NEXT: v_cndmask_b32_e64 v50, v26, v10, s6 -; GFX11-NEXT: v_cmp_class_f64_e64 s1, v[0:1], 32 -; GFX11-NEXT: v_cndmask_b32_e64 v36, v21, v5, s3 -; GFX11-NEXT: v_cndmask_b32_e64 v48, v25, v9, s5 -; GFX11-NEXT: v_cndmask_b32_e64 v52, v29, v13, s7 -; GFX11-NEXT: v_cndmask_b32_e64 v50, 0, v50, s11 -; GFX11-NEXT: v_cmp_class_f64_e64 s11, v[16:17], 32 -; GFX11-NEXT: v_cndmask_b32_e64 v37, 0x7ff80000, v36, s8 -; GFX11-NEXT: v_cndmask_b32_e64 v49, 0x7ff80000, v48, s10 -; GFX11-NEXT: v_cndmask_b32_e64 v53, 0x7ff80000, v52, s12 -; GFX11-NEXT: v_cndmask_b32_e64 v36, v20, v4, s3 -; GFX11-NEXT: v_cndmask_b32_e64 v48, v24, v8, s5 -; GFX11-NEXT: v_cndmask_b32_e64 v52, v28, v12, s7 -; GFX11-NEXT: v_cndmask_b32_e64 v34, 0, v34, s2 -; GFX11-NEXT: v_cndmask_b32_e64 v38, 0, v38, s9 -; GFX11-NEXT: v_cmp_class_f64_e64 s2, v[2:3], 32 -; GFX11-NEXT: v_cmp_class_f64_e64 s3, v[4:5], 32 -; GFX11-NEXT: v_cmp_class_f64_e64 s5, v[8:9], 32 -; GFX11-NEXT: v_cmp_class_f64_e64 s7, v[10:11], 32 -; GFX11-NEXT: v_cmp_class_f64_e64 s9, v[12:13], 32 -; GFX11-NEXT: v_cndmask_b32_e64 v36, 0, v36, s8 -; GFX11-NEXT: v_cndmask_b32_e64 v48, 0, v48, s10 -; GFX11-NEXT: v_cndmask_b32_e64 v52, 0, v52, s12 -; GFX11-NEXT: v_cmp_class_f64_e64 s6, v[24:25], 32 -; GFX11-NEXT: v_cmp_class_f64_e64 s8, v[26:27], 32 -; GFX11-NEXT: v_cmp_class_f64_e64 s10, v[28:29], 32 -; GFX11-NEXT: 
v_cmp_eq_f64_e64 s14, 0, v[34:35] -; GFX11-NEXT: v_cmp_eq_f64_e64 s16, 0, v[36:37] -; GFX11-NEXT: v_cmp_eq_f64_e64 s17, 0, v[38:39] -; GFX11-NEXT: v_cmp_eq_f64_e64 s18, 0, v[48:49] -; GFX11-NEXT: v_cmp_eq_f64_e64 s20, 0, v[50:51] -; GFX11-NEXT: v_cmp_eq_f64_e64 s21, 0, v[52:53] -; GFX11-NEXT: v_cndmask_b32_e64 v7, v39, v7, s4 -; GFX11-NEXT: v_cndmask_b32_e32 v32, v17, v1, vcc_lo -; GFX11-NEXT: v_cndmask_b32_e64 v6, v38, v6, s4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cndmask_b32_e64 v33, 0x7ff80000, v32, s0 -; GFX11-NEXT: v_cndmask_b32_e32 v32, v16, v0, vcc_lo -; GFX11-NEXT: v_cndmask_b32_e64 v1, v33, v1, s1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cndmask_b32_e64 v32, 0, v32, s0 -; GFX11-NEXT: v_cndmask_b32_e64 v1, v1, v17, s11 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-NEXT: v_cndmask_b32_e64 v0, v32, v0, s1 -; GFX11-NEXT: v_cmp_eq_f64_e64 s12, 0, v[32:33] -; GFX11-NEXT: v_cndmask_b32_e64 v2, v34, v2, s2 -; GFX11-NEXT: v_cndmask_b32_e64 v4, v36, v4, s3 -; GFX11-NEXT: v_cndmask_b32_e64 v8, v48, v8, s5 -; GFX11-NEXT: v_cndmask_b32_e64 v0, v0, v16, s11 -; GFX11-NEXT: v_cndmask_b32_e64 v10, v50, v10, s7 -; GFX11-NEXT: v_cndmask_b32_e64 v12, v52, v12, s9 -; GFX11-NEXT: v_cndmask_b32_e64 v3, v35, v3, s2 -; GFX11-NEXT: v_cndmask_b32_e64 v5, v37, v5, s3 -; GFX11-NEXT: v_cndmask_b32_e64 v9, v49, v9, s5 -; GFX11-NEXT: v_cndmask_b32_e64 v11, v51, v11, s7 -; GFX11-NEXT: v_cndmask_b32_e64 v13, v53, v13, s9 -; GFX11-NEXT: v_cndmask_b32_e64 v2, v2, v18, s13 -; GFX11-NEXT: v_cndmask_b32_e64 v4, v4, v20, s15 -; GFX11-NEXT: v_cndmask_b32_e64 v8, v8, v24, s6 -; GFX11-NEXT: v_cndmask_b32_e64 v10, v10, v26, s8 -; GFX11-NEXT: v_cndmask_b32_e64 v12, v12, v28, s10 -; GFX11-NEXT: v_cndmask_b32_e64 v3, v3, v19, s13 -; GFX11-NEXT: v_cndmask_b32_e64 v5, v5, v21, s15 -; GFX11-NEXT: v_cndmask_b32_e64 v9, v9, v25, s6 -; GFX11-NEXT: v_cndmask_b32_e64 v11, 
v11, v27, s8 -; GFX11-NEXT: v_cndmask_b32_e64 v13, v13, v29, s10 -; GFX11-NEXT: v_cndmask_b32_e64 v2, v34, v2, s14 -; GFX11-NEXT: v_cndmask_b32_e64 v4, v36, v4, s16 -; GFX11-NEXT: v_cndmask_b32_e64 v8, v48, v8, s18 -; GFX11-NEXT: v_cndmask_b32_e64 v10, v50, v10, s20 -; GFX11-NEXT: v_cndmask_b32_e64 v12, v52, v12, s21 -; GFX11-NEXT: v_cndmask_b32_e64 v3, v35, v3, s14 -; GFX11-NEXT: v_cndmask_b32_e64 v5, v37, v5, s16 -; GFX11-NEXT: v_cndmask_b32_e64 v9, v49, v9, s18 -; GFX11-NEXT: v_cndmask_b32_e64 v11, v51, v11, s20 -; GFX11-NEXT: v_cndmask_b32_e64 v13, v53, v13, s21 -; GFX11-NEXT: v_cndmask_b32_e64 v0, v32, v0, s12 -; GFX11-NEXT: v_cndmask_b32_e64 v1, v33, v1, s12 +; GFX11-NEXT: v_min_f64 v[32:33], v[0:1], v[16:17] +; GFX11-NEXT: v_cmp_u_f64_e32 vcc_lo, v[0:1], v[16:17] +; GFX11-NEXT: v_min_f64 v[16:17], v[2:3], v[18:19] +; GFX11-NEXT: v_cmp_u_f64_e64 s0, v[2:3], v[18:19] +; GFX11-NEXT: v_min_f64 v[18:19], v[4:5], v[20:21] +; GFX11-NEXT: v_cmp_u_f64_e64 s1, v[4:5], v[20:21] +; GFX11-NEXT: v_min_f64 v[20:21], v[6:7], v[22:23] +; GFX11-NEXT: v_cmp_u_f64_e64 s2, v[6:7], v[22:23] +; GFX11-NEXT: v_min_f64 v[22:23], v[8:9], v[24:25] +; GFX11-NEXT: v_cmp_u_f64_e64 s3, v[8:9], v[24:25] +; GFX11-NEXT: v_min_f64 v[24:25], v[10:11], v[26:27] +; GFX11-NEXT: v_cmp_u_f64_e64 s4, v[10:11], v[26:27] +; GFX11-NEXT: v_min_f64 v[26:27], v[12:13], v[28:29] +; GFX11-NEXT: v_cmp_u_f64_e64 s5, v[12:13], v[28:29] +; GFX11-NEXT: v_cndmask_b32_e64 v0, v32, 0, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e64 v1, v33, 0x7ff80000, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e64 v2, v16, 0, s0 +; GFX11-NEXT: v_cndmask_b32_e64 v3, v17, 0x7ff80000, s0 +; GFX11-NEXT: v_cndmask_b32_e64 v4, v18, 0, s1 +; GFX11-NEXT: v_cndmask_b32_e64 v5, v19, 0x7ff80000, s1 +; GFX11-NEXT: v_cndmask_b32_e64 v6, v20, 0, s2 +; GFX11-NEXT: v_cndmask_b32_e64 v7, v21, 0x7ff80000, s2 +; GFX11-NEXT: v_cndmask_b32_e64 v8, v22, 0, s3 +; GFX11-NEXT: v_cndmask_b32_e64 v9, v23, 0x7ff80000, s3 +; GFX11-NEXT: v_cndmask_b32_e64 v10, v24, 0, s4 +; 
GFX11-NEXT: v_cndmask_b32_e64 v11, v25, 0x7ff80000, s4 +; GFX11-NEXT: v_cndmask_b32_e64 v12, v26, 0, s5 +; GFX11-NEXT: v_cndmask_b32_e64 v13, v27, 0x7ff80000, s5 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_cmp_lt_f64_e32 vcc_lo, v[14:15], v[30:31] -; GFX11-NEXT: v_cmp_o_f64_e64 s0, v[14:15], v[30:31] -; GFX11-NEXT: v_cmp_class_f64_e64 s19, v[30:31], 32 -; GFX11-NEXT: v_cndmask_b32_e32 v54, v31, v15, vcc_lo -; GFX11-NEXT: v_cndmask_b32_e32 v16, v30, v14, vcc_lo -; GFX11-NEXT: v_cmp_class_f64_e64 vcc_lo, v[22:23], 32 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_cndmask_b32_e64 v55, 0x7ff80000, v54, s0 -; GFX11-NEXT: v_cndmask_b32_e64 v54, 0, v16, s0 -; GFX11-NEXT: v_cmp_class_f64_e64 s0, v[14:15], 32 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_cmp_eq_f64_e64 s22, 0, v[54:55] -; GFX11-NEXT: v_dual_cndmask_b32 v7, v7, v23 :: v_dual_cndmask_b32 v6, v6, v22 -; GFX11-NEXT: v_cndmask_b32_e64 v14, v54, v14, s0 -; GFX11-NEXT: v_cndmask_b32_e64 v15, v55, v15, s0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_cndmask_b32_e64 v6, v38, v6, s17 -; GFX11-NEXT: v_cndmask_b32_e64 v7, v39, v7, s17 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_cndmask_b32_e64 v14, v14, v30, s19 -; GFX11-NEXT: v_cndmask_b32_e64 v15, v15, v31, s19 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cndmask_b32_e64 v14, v54, v14, s22 -; GFX11-NEXT: v_cndmask_b32_e64 v15, v55, v15, s22 +; GFX11-NEXT: v_min_f64 v[28:29], v[14:15], v[30:31] +; GFX11-NEXT: v_cmp_u_f64_e64 s6, v[14:15], v[30:31] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_cndmask_b32_e64 v14, v28, 0, s6 +; GFX11-NEXT: v_cndmask_b32_e64 v15, v29, 0x7ff80000, s6 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; 
GFX12-LABEL: v_minimum_v8f64: @@ -4279,1799 +2067,798 @@ define <16 x double> @v_minimum_v16f64(<16 x double> %src0, <16 x double> %src1) ; GFX7-LABEL: v_minimum_v16f64: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GFX7-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; GFX7-NEXT: s_mov_b64 exec, s[4:5] ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:8 ; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:4 -; GFX7-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:16 -; GFX7-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:12 -; GFX7-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:24 -; GFX7-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:20 -; GFX7-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:32 -; GFX7-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:28 -; GFX7-NEXT: v_mov_b32_e32 v39, 0x7ff80000 -; GFX7-NEXT: s_waitcnt vmcnt(6) -; GFX7-NEXT: v_cmp_lt_f64_e32 vcc, v[0:1], v[31:32] -; GFX7-NEXT: v_cmp_o_f64_e64 s[4:5], v[0:1], v[31:32] -; GFX7-NEXT: s_waitcnt vmcnt(4) -; GFX7-NEXT: v_cmp_o_f64_e64 s[6:7], v[2:3], v[33:34] -; GFX7-NEXT: v_cndmask_b32_e32 v48, v32, v1, vcc -; GFX7-NEXT: v_cndmask_b32_e64 v49, v39, v48, s[4:5] -; GFX7-NEXT: v_cndmask_b32_e32 v48, v31, v0, vcc -; GFX7-NEXT: v_cmp_class_f64_e64 vcc, v[0:1], 32 -; GFX7-NEXT: v_cndmask_b32_e64 v48, 0, v48, s[4:5] -; GFX7-NEXT: v_cmp_class_f64_e64 s[4:5], v[31:32], 32 -; GFX7-NEXT: v_cndmask_b32_e32 v0, v48, v0, vcc -; GFX7-NEXT: v_cndmask_b32_e32 v1, v49, v1, vcc -; GFX7-NEXT: v_cmp_lt_f64_e32 vcc, v[2:3], v[33:34] -; GFX7-NEXT: v_cndmask_b32_e64 v0, v0, v31, s[4:5] -; GFX7-NEXT: v_cndmask_b32_e64 v1, v1, v32, s[4:5] -; GFX7-NEXT: v_cmp_class_f64_e64 s[4:5], v[33:34], 32 -; GFX7-NEXT: v_cndmask_b32_e32 v50, v34, v3, vcc -; GFX7-NEXT: v_cndmask_b32_e32 v31, v33, v2, vcc -; GFX7-NEXT: v_cmp_class_f64_e64 vcc, v[2:3], 32 -; GFX7-NEXT: v_cndmask_b32_e64 v32, 
v39, v50, s[6:7] -; GFX7-NEXT: v_cndmask_b32_e64 v31, 0, v31, s[6:7] -; GFX7-NEXT: s_waitcnt vmcnt(2) -; GFX7-NEXT: v_cmp_o_f64_e64 s[6:7], v[4:5], v[35:36] -; GFX7-NEXT: v_cndmask_b32_e32 v2, v31, v2, vcc -; GFX7-NEXT: v_cndmask_b32_e32 v3, v32, v3, vcc -; GFX7-NEXT: v_cmp_lt_f64_e32 vcc, v[4:5], v[35:36] -; GFX7-NEXT: v_cndmask_b32_e64 v2, v2, v33, s[4:5] -; GFX7-NEXT: v_cndmask_b32_e64 v3, v3, v34, s[4:5] -; GFX7-NEXT: v_cmp_class_f64_e64 s[4:5], v[35:36], 32 -; GFX7-NEXT: v_cndmask_b32_e32 v50, v36, v5, vcc -; GFX7-NEXT: v_cndmask_b32_e32 v33, v35, v4, vcc -; GFX7-NEXT: v_cmp_class_f64_e64 vcc, v[4:5], 32 -; GFX7-NEXT: v_cndmask_b32_e64 v34, v39, v50, s[6:7] -; GFX7-NEXT: v_cndmask_b32_e64 v33, 0, v33, s[6:7] +; GFX7-NEXT: v_writelane_b32 v34, s30, 0 +; GFX7-NEXT: v_writelane_b32 v34, s31, 1 +; GFX7-NEXT: v_writelane_b32 v34, s34, 2 +; GFX7-NEXT: v_writelane_b32 v34, s35, 3 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_cmp_o_f64_e64 s[6:7], v[6:7], v[37:38] -; GFX7-NEXT: v_cndmask_b32_e32 v4, v33, v4, vcc -; GFX7-NEXT: v_cndmask_b32_e32 v5, v34, v5, vcc -; GFX7-NEXT: v_cmp_lt_f64_e32 vcc, v[6:7], v[37:38] -; GFX7-NEXT: v_cndmask_b32_e64 v4, v4, v35, s[4:5] -; GFX7-NEXT: v_cndmask_b32_e64 v5, v5, v36, s[4:5] -; GFX7-NEXT: v_cmp_class_f64_e64 s[4:5], v[37:38], 32 -; GFX7-NEXT: v_cndmask_b32_e32 v50, v38, v7, vcc -; GFX7-NEXT: v_cndmask_b32_e32 v35, v37, v6, vcc -; GFX7-NEXT: v_cmp_class_f64_e64 vcc, v[6:7], 32 -; GFX7-NEXT: v_cndmask_b32_e64 v36, v39, v50, s[6:7] -; GFX7-NEXT: v_cndmask_b32_e64 v35, 0, v35, s[6:7] -; GFX7-NEXT: v_cndmask_b32_e32 v6, v35, v6, vcc -; GFX7-NEXT: v_cndmask_b32_e32 v7, v36, v7, vcc -; GFX7-NEXT: v_cndmask_b32_e64 v6, v6, v37, s[4:5] -; GFX7-NEXT: v_cndmask_b32_e64 v7, v7, v38, s[4:5] -; GFX7-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:36 -; GFX7-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:40 -; GFX7-NEXT: v_cmp_eq_f64_e32 vcc, 0, v[48:49] -; GFX7-NEXT: v_cndmask_b32_e32 v0, v48, v0, vcc -; GFX7-NEXT: 
v_cndmask_b32_e32 v1, v49, v1, vcc -; GFX7-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:48 -; GFX7-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:44 -; GFX7-NEXT: v_cmp_eq_f64_e32 vcc, 0, v[31:32] -; GFX7-NEXT: v_cndmask_b32_e32 v2, v31, v2, vcc -; GFX7-NEXT: v_cndmask_b32_e32 v3, v32, v3, vcc +; GFX7-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[31:32] +; GFX7-NEXT: v_min_f64 v[0:1], v[0:1], v[31:32] +; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:16 +; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:12 +; GFX7-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_cmp_u_f64_e64 s[4:5], v[2:3], v[31:32] +; GFX7-NEXT: v_min_f64 v[2:3], v[2:3], v[31:32] +; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:24 +; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:20 +; GFX7-NEXT: v_cndmask_b32_e64 v2, v2, 0, s[4:5] +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_cmp_u_f64_e64 s[6:7], v[4:5], v[31:32] +; GFX7-NEXT: v_min_f64 v[4:5], v[4:5], v[31:32] +; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:32 +; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:28 +; GFX7-NEXT: v_cndmask_b32_e64 v4, v4, 0, s[6:7] +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_cmp_u_f64_e64 s[8:9], v[6:7], v[31:32] +; GFX7-NEXT: v_min_f64 v[6:7], v[6:7], v[31:32] +; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:36 +; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:40 +; GFX7-NEXT: v_cndmask_b32_e64 v6, v6, 0, s[8:9] +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_cmp_u_f64_e64 s[10:11], v[8:9], v[31:32] +; GFX7-NEXT: v_min_f64 v[8:9], v[8:9], v[31:32] +; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:48 +; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:44 +; GFX7-NEXT: v_cndmask_b32_e64 v8, v8, 0, s[10:11] +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_cmp_u_f64_e64 s[12:13], v[10:11], v[31:32] +; GFX7-NEXT: v_min_f64 v[10:11], v[10:11], v[31:32] ; 
GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:56 ; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:52 -; GFX7-NEXT: v_cmp_eq_f64_e32 vcc, 0, v[33:34] -; GFX7-NEXT: v_cndmask_b32_e32 v4, v33, v4, vcc -; GFX7-NEXT: v_cndmask_b32_e32 v5, v34, v5, vcc -; GFX7-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:64 -; GFX7-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:60 -; GFX7-NEXT: v_cmp_eq_f64_e32 vcc, 0, v[35:36] -; GFX7-NEXT: v_cndmask_b32_e32 v6, v35, v6, vcc -; GFX7-NEXT: v_cndmask_b32_e32 v7, v36, v7, vcc -; GFX7-NEXT: v_cmp_class_f64_e64 vcc, v[8:9], 32 -; GFX7-NEXT: s_waitcnt vmcnt(6) -; GFX7-NEXT: v_cmp_lt_f64_e64 s[4:5], v[8:9], v[37:38] -; GFX7-NEXT: v_cmp_o_f64_e64 s[6:7], v[8:9], v[37:38] -; GFX7-NEXT: v_cndmask_b32_e64 v50, v38, v9, s[4:5] -; GFX7-NEXT: v_cndmask_b32_e64 v35, v37, v8, s[4:5] -; GFX7-NEXT: v_cndmask_b32_e64 v36, v39, v50, s[6:7] -; GFX7-NEXT: v_cndmask_b32_e64 v35, 0, v35, s[6:7] -; GFX7-NEXT: v_cmp_class_f64_e64 s[4:5], v[37:38], 32 -; GFX7-NEXT: v_cndmask_b32_e32 v8, v35, v8, vcc -; GFX7-NEXT: v_cmp_eq_f64_e64 s[6:7], 0, v[35:36] -; GFX7-NEXT: v_cndmask_b32_e32 v9, v36, v9, vcc -; GFX7-NEXT: s_waitcnt vmcnt(4) -; GFX7-NEXT: v_cmp_lt_f64_e32 vcc, v[10:11], v[48:49] -; GFX7-NEXT: v_cmp_o_f64_e64 s[8:9], v[10:11], v[48:49] -; GFX7-NEXT: v_cndmask_b32_e64 v8, v8, v37, s[4:5] -; GFX7-NEXT: v_cndmask_b32_e64 v9, v9, v38, s[4:5] -; GFX7-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:68 -; GFX7-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:72 -; GFX7-NEXT: v_cndmask_b32_e64 v8, v35, v8, s[6:7] -; GFX7-NEXT: v_cndmask_b32_e32 v50, v49, v11, vcc -; GFX7-NEXT: v_cndmask_b32_e32 v35, v48, v10, vcc -; GFX7-NEXT: v_cmp_class_f64_e64 vcc, v[10:11], 32 -; GFX7-NEXT: v_cmp_class_f64_e64 s[4:5], v[48:49], 32 -; GFX7-NEXT: v_cndmask_b32_e64 v51, v39, v50, s[8:9] -; GFX7-NEXT: v_cndmask_b32_e64 v50, 0, v35, s[8:9] -; GFX7-NEXT: v_cndmask_b32_e64 v9, v36, v9, s[6:7] -; GFX7-NEXT: buffer_load_dword v36, off, s[0:3], 
s32 offset:80 -; GFX7-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:76 -; GFX7-NEXT: s_waitcnt vmcnt(6) -; GFX7-NEXT: v_cmp_o_f64_e64 s[6:7], v[12:13], v[31:32] -; GFX7-NEXT: v_cndmask_b32_e32 v10, v50, v10, vcc -; GFX7-NEXT: v_cndmask_b32_e32 v11, v51, v11, vcc -; GFX7-NEXT: v_cndmask_b32_e64 v10, v10, v48, s[4:5] -; GFX7-NEXT: v_cndmask_b32_e64 v11, v11, v49, s[4:5] -; GFX7-NEXT: v_cmp_eq_f64_e32 vcc, 0, v[50:51] -; GFX7-NEXT: v_cmp_lt_f64_e64 s[4:5], v[12:13], v[31:32] -; GFX7-NEXT: v_cndmask_b32_e32 v10, v50, v10, vcc -; GFX7-NEXT: v_cndmask_b32_e64 v48, v32, v13, s[4:5] -; GFX7-NEXT: v_cndmask_b32_e32 v11, v51, v11, vcc -; GFX7-NEXT: v_cmp_class_f64_e64 vcc, v[12:13], 32 -; GFX7-NEXT: v_cndmask_b32_e64 v49, v39, v48, s[6:7] -; GFX7-NEXT: v_cndmask_b32_e64 v48, v31, v12, s[4:5] -; GFX7-NEXT: v_cmp_class_f64_e64 s[4:5], v[31:32], 32 -; GFX7-NEXT: v_cndmask_b32_e64 v48, 0, v48, s[6:7] -; GFX7-NEXT: s_waitcnt vmcnt(4) -; GFX7-NEXT: v_cmp_o_f64_e64 s[6:7], v[14:15], v[33:34] -; GFX7-NEXT: v_cndmask_b32_e32 v12, v48, v12, vcc -; GFX7-NEXT: v_cndmask_b32_e32 v13, v49, v13, vcc -; GFX7-NEXT: v_cmp_eq_f64_e32 vcc, 0, v[48:49] -; GFX7-NEXT: v_cndmask_b32_e64 v12, v12, v31, s[4:5] -; GFX7-NEXT: v_cndmask_b32_e64 v13, v13, v32, s[4:5] +; GFX7-NEXT: v_cndmask_b32_e64 v10, v10, 0, s[12:13] +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_cmp_u_f64_e64 s[14:15], v[12:13], v[31:32] +; GFX7-NEXT: v_min_f64 v[12:13], v[12:13], v[31:32] +; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:64 +; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:60 +; GFX7-NEXT: v_cndmask_b32_e64 v12, v12, 0, s[14:15] +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_cmp_u_f64_e64 s[16:17], v[14:15], v[31:32] +; GFX7-NEXT: v_min_f64 v[14:15], v[14:15], v[31:32] +; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:68 +; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:72 +; GFX7-NEXT: v_cndmask_b32_e64 v14, v14, 0, s[16:17] +; GFX7-NEXT: s_waitcnt vmcnt(0) +; 
GFX7-NEXT: v_cmp_u_f64_e64 s[18:19], v[16:17], v[31:32] +; GFX7-NEXT: v_min_f64 v[16:17], v[16:17], v[31:32] +; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:80 +; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:76 +; GFX7-NEXT: v_cndmask_b32_e64 v16, v16, 0, s[18:19] +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_cmp_u_f64_e64 s[20:21], v[18:19], v[31:32] +; GFX7-NEXT: v_min_f64 v[18:19], v[18:19], v[31:32] ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:88 ; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:84 -; GFX7-NEXT: v_cmp_lt_f64_e64 s[4:5], v[14:15], v[33:34] -; GFX7-NEXT: v_cndmask_b32_e32 v12, v48, v12, vcc -; GFX7-NEXT: v_cndmask_b32_e32 v13, v49, v13, vcc -; GFX7-NEXT: v_cmp_class_f64_e64 vcc, v[14:15], 32 -; GFX7-NEXT: v_cndmask_b32_e64 v50, v34, v15, s[4:5] -; GFX7-NEXT: v_cndmask_b32_e64 v48, v33, v14, s[4:5] -; GFX7-NEXT: v_cmp_class_f64_e64 s[4:5], v[33:34], 32 -; GFX7-NEXT: v_cndmask_b32_e64 v49, v39, v50, s[6:7] -; GFX7-NEXT: v_cndmask_b32_e64 v48, 0, v48, s[6:7] -; GFX7-NEXT: v_cndmask_b32_e32 v14, v48, v14, vcc -; GFX7-NEXT: v_cndmask_b32_e32 v15, v49, v15, vcc -; GFX7-NEXT: v_cmp_eq_f64_e32 vcc, 0, v[48:49] -; GFX7-NEXT: v_cndmask_b32_e64 v14, v14, v33, s[4:5] -; GFX7-NEXT: v_cndmask_b32_e64 v15, v15, v34, s[4:5] -; GFX7-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:96 -; GFX7-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:92 -; GFX7-NEXT: v_cndmask_b32_e32 v14, v48, v14, vcc -; GFX7-NEXT: v_cndmask_b32_e32 v15, v49, v15, vcc -; GFX7-NEXT: v_cmp_class_f64_e64 vcc, v[16:17], 32 -; GFX7-NEXT: s_waitcnt vmcnt(6) -; GFX7-NEXT: v_cmp_lt_f64_e64 s[4:5], v[16:17], v[37:38] -; GFX7-NEXT: v_cmp_o_f64_e64 s[6:7], v[16:17], v[37:38] -; GFX7-NEXT: v_cndmask_b32_e64 v50, v38, v17, s[4:5] -; GFX7-NEXT: v_cndmask_b32_e64 v48, v37, v16, s[4:5] -; GFX7-NEXT: v_cndmask_b32_e64 v49, v39, v50, s[6:7] -; GFX7-NEXT: v_cndmask_b32_e64 v48, 0, v48, s[6:7] -; GFX7-NEXT: v_cndmask_b32_e32 v16, v48, v16, vcc -; 
GFX7-NEXT: v_cmp_class_f64_e64 s[4:5], v[37:38], 32 -; GFX7-NEXT: v_cndmask_b32_e32 v17, v49, v17, vcc -; GFX7-NEXT: s_waitcnt vmcnt(4) -; GFX7-NEXT: v_cmp_lt_f64_e32 vcc, v[18:19], v[35:36] -; GFX7-NEXT: v_cmp_o_f64_e64 s[6:7], v[18:19], v[35:36] -; GFX7-NEXT: v_cndmask_b32_e64 v16, v16, v37, s[4:5] -; GFX7-NEXT: v_cndmask_b32_e64 v17, v17, v38, s[4:5] -; GFX7-NEXT: v_cndmask_b32_e32 v50, v36, v19, vcc -; GFX7-NEXT: v_cndmask_b32_e32 v37, v35, v18, vcc -; GFX7-NEXT: v_cmp_eq_f64_e32 vcc, 0, v[48:49] -; GFX7-NEXT: v_cmp_class_f64_e64 s[4:5], v[18:19], 32 -; GFX7-NEXT: v_cndmask_b32_e64 v51, v39, v50, s[6:7] -; GFX7-NEXT: v_cndmask_b32_e64 v50, 0, v37, s[6:7] -; GFX7-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:100 -; GFX7-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:104 -; GFX7-NEXT: v_cndmask_b32_e32 v16, v48, v16, vcc -; GFX7-NEXT: v_cndmask_b32_e32 v17, v49, v17, vcc -; GFX7-NEXT: v_cmp_class_f64_e64 vcc, v[35:36], 32 -; GFX7-NEXT: v_cndmask_b32_e64 v18, v50, v18, s[4:5] -; GFX7-NEXT: v_cndmask_b32_e64 v19, v51, v19, s[4:5] -; GFX7-NEXT: s_waitcnt vmcnt(4) -; GFX7-NEXT: v_cmp_lt_f64_e64 s[4:5], v[20:21], v[31:32] -; GFX7-NEXT: v_cmp_o_f64_e64 s[6:7], v[20:21], v[31:32] -; GFX7-NEXT: v_cndmask_b32_e32 v18, v18, v35, vcc -; GFX7-NEXT: v_cndmask_b32_e32 v19, v19, v36, vcc -; GFX7-NEXT: v_cmp_eq_f64_e32 vcc, 0, v[50:51] -; GFX7-NEXT: v_cndmask_b32_e64 v48, v32, v21, s[4:5] -; GFX7-NEXT: v_cndmask_b32_e64 v35, v31, v20, s[4:5] -; GFX7-NEXT: v_cmp_class_f64_e64 s[4:5], v[20:21], 32 -; GFX7-NEXT: v_cndmask_b32_e64 v36, v39, v48, s[6:7] -; GFX7-NEXT: v_cndmask_b32_e64 v35, 0, v35, s[6:7] -; GFX7-NEXT: v_cndmask_b32_e32 v18, v50, v18, vcc -; GFX7-NEXT: v_cndmask_b32_e32 v19, v51, v19, vcc -; GFX7-NEXT: v_cmp_class_f64_e64 vcc, v[31:32], 32 -; GFX7-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:112 -; GFX7-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:108 -; GFX7-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:120 -; GFX7-NEXT: 
buffer_load_dword v50, off, s[0:3], s32 offset:116 -; GFX7-NEXT: v_cndmask_b32_e64 v20, v35, v20, s[4:5] -; GFX7-NEXT: v_cndmask_b32_e64 v21, v36, v21, s[4:5] -; GFX7-NEXT: v_cmp_eq_f64_e64 s[4:5], 0, v[35:36] -; GFX7-NEXT: s_waitcnt vmcnt(6) -; GFX7-NEXT: v_cmp_lt_f64_e64 s[6:7], v[22:23], v[33:34] -; GFX7-NEXT: v_cmp_o_f64_e64 s[8:9], v[22:23], v[33:34] -; GFX7-NEXT: v_cndmask_b32_e32 v20, v20, v31, vcc -; GFX7-NEXT: v_cndmask_b32_e32 v21, v21, v32, vcc -; GFX7-NEXT: v_cmp_class_f64_e64 vcc, v[22:23], 32 -; GFX7-NEXT: v_cndmask_b32_e64 v20, v35, v20, s[4:5] -; GFX7-NEXT: v_cndmask_b32_e64 v21, v36, v21, s[4:5] -; GFX7-NEXT: v_cmp_class_f64_e64 s[4:5], v[33:34], 32 -; GFX7-NEXT: v_cndmask_b32_e64 v31, v34, v23, s[6:7] -; GFX7-NEXT: v_cndmask_b32_e64 v36, v39, v31, s[8:9] -; GFX7-NEXT: v_cndmask_b32_e64 v31, v33, v22, s[6:7] -; GFX7-NEXT: v_cndmask_b32_e64 v35, 0, v31, s[8:9] -; GFX7-NEXT: v_cndmask_b32_e32 v22, v35, v22, vcc -; GFX7-NEXT: v_cndmask_b32_e32 v23, v36, v23, vcc -; GFX7-NEXT: v_cmp_eq_f64_e32 vcc, 0, v[35:36] -; GFX7-NEXT: v_cndmask_b32_e64 v22, v22, v33, s[4:5] +; GFX7-NEXT: v_cndmask_b32_e64 v18, v18, 0, s[20:21] +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_cmp_u_f64_e64 s[22:23], v[20:21], v[31:32] +; GFX7-NEXT: v_min_f64 v[20:21], v[20:21], v[31:32] +; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:96 +; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:92 +; GFX7-NEXT: v_cndmask_b32_e64 v20, v20, 0, s[22:23] +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_cmp_u_f64_e64 s[24:25], v[22:23], v[31:32] +; GFX7-NEXT: v_min_f64 v[22:23], v[22:23], v[31:32] +; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:100 +; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:104 +; GFX7-NEXT: v_cndmask_b32_e64 v22, v22, 0, s[24:25] +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_cmp_u_f64_e64 s[26:27], v[24:25], v[31:32] +; GFX7-NEXT: v_min_f64 v[24:25], v[24:25], v[31:32] +; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], 
s32 offset:112 +; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:108 +; GFX7-NEXT: v_cndmask_b32_e64 v24, v24, 0, s[26:27] +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_cmp_u_f64_e64 s[28:29], v[26:27], v[31:32] +; GFX7-NEXT: v_min_f64 v[26:27], v[26:27], v[31:32] +; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:120 +; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:116 +; GFX7-NEXT: v_cndmask_b32_e64 v26, v26, 0, s[28:29] +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_cmp_u_f64_e64 s[30:31], v[28:29], v[31:32] +; GFX7-NEXT: v_min_f64 v[28:29], v[28:29], v[31:32] ; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 ; GFX7-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:128 ; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:124 -; GFX7-NEXT: v_cndmask_b32_e64 v23, v23, v34, s[4:5] -; GFX7-NEXT: v_cndmask_b32_e32 v22, v35, v22, vcc -; GFX7-NEXT: v_cndmask_b32_e32 v23, v36, v23, vcc -; GFX7-NEXT: v_cmp_class_f64_e64 vcc, v[24:25], 32 -; GFX7-NEXT: s_waitcnt vmcnt(7) -; GFX7-NEXT: v_cmp_lt_f64_e64 s[4:5], v[24:25], v[37:38] -; GFX7-NEXT: v_cmp_o_f64_e64 s[6:7], v[24:25], v[37:38] -; GFX7-NEXT: v_cndmask_b32_e64 v34, v38, v25, s[4:5] -; GFX7-NEXT: v_cndmask_b32_e64 v35, v39, v34, s[6:7] -; GFX7-NEXT: v_cndmask_b32_e64 v34, v37, v24, s[4:5] -; GFX7-NEXT: v_cndmask_b32_e64 v34, 0, v34, s[6:7] -; GFX7-NEXT: v_cndmask_b32_e32 v24, v34, v24, vcc -; GFX7-NEXT: v_cndmask_b32_e32 v25, v35, v25, vcc -; GFX7-NEXT: v_cmp_class_f64_e64 vcc, v[37:38], 32 -; GFX7-NEXT: v_cmp_eq_f64_e64 s[4:5], 0, v[34:35] -; GFX7-NEXT: v_cndmask_b32_e32 v24, v24, v37, vcc -; GFX7-NEXT: v_cndmask_b32_e32 v25, v25, v38, vcc -; GFX7-NEXT: v_cmp_class_f64_e64 vcc, v[26:27], 32 -; GFX7-NEXT: v_cndmask_b32_e64 v24, v34, v24, s[4:5] -; GFX7-NEXT: v_cndmask_b32_e64 v25, v35, v25, s[4:5] -; GFX7-NEXT: s_waitcnt vmcnt(5) -; GFX7-NEXT: v_cmp_lt_f64_e64 s[6:7], v[26:27], v[48:49] -; GFX7-NEXT: v_cmp_o_f64_e64 s[8:9], v[26:27], v[48:49] -; GFX7-NEXT: 
v_cmp_class_f64_e64 s[4:5], v[48:49], 32 -; GFX7-NEXT: s_waitcnt vmcnt(3) -; GFX7-NEXT: v_cmp_o_f64_e64 s[10:11], v[28:29], v[50:51] -; GFX7-NEXT: v_cndmask_b32_e64 v36, v49, v27, s[6:7] -; GFX7-NEXT: v_cndmask_b32_e64 v34, v48, v26, s[6:7] -; GFX7-NEXT: v_cndmask_b32_e64 v35, v39, v36, s[8:9] -; GFX7-NEXT: v_cndmask_b32_e64 v34, 0, v34, s[8:9] -; GFX7-NEXT: v_cmp_lt_f64_e64 s[8:9], v[28:29], v[50:51] -; GFX7-NEXT: v_cndmask_b32_e32 v26, v34, v26, vcc -; GFX7-NEXT: v_cmp_eq_f64_e64 s[6:7], 0, v[34:35] -; GFX7-NEXT: v_cndmask_b32_e32 v27, v35, v27, vcc -; GFX7-NEXT: v_cndmask_b32_e64 v26, v26, v48, s[4:5] -; GFX7-NEXT: v_cndmask_b32_e64 v27, v27, v49, s[4:5] -; GFX7-NEXT: v_cndmask_b32_e64 v36, v51, v29, s[8:9] -; GFX7-NEXT: v_cndmask_b32_e64 v27, v35, v27, s[6:7] +; GFX7-NEXT: v_cndmask_b32_e64 v28, v28, 0, s[30:31] +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_cmp_u_f64_e64 s[34:35], v[30:31], v[32:33] +; GFX7-NEXT: v_min_f64 v[30:31], v[30:31], v[32:33] +; GFX7-NEXT: v_mov_b32_e32 v32, 0x7ff80000 +; GFX7-NEXT: v_cndmask_b32_e32 v1, v1, v32, vcc +; GFX7-NEXT: v_cndmask_b32_e64 v3, v3, v32, s[4:5] +; GFX7-NEXT: v_cndmask_b32_e64 v5, v5, v32, s[6:7] +; GFX7-NEXT: v_cndmask_b32_e64 v7, v7, v32, s[8:9] +; GFX7-NEXT: v_cndmask_b32_e64 v9, v9, v32, s[10:11] +; GFX7-NEXT: v_cndmask_b32_e64 v11, v11, v32, s[12:13] +; GFX7-NEXT: v_cndmask_b32_e64 v13, v13, v32, s[14:15] +; GFX7-NEXT: v_cndmask_b32_e64 v15, v15, v32, s[16:17] +; GFX7-NEXT: v_cndmask_b32_e64 v17, v17, v32, s[18:19] +; GFX7-NEXT: v_cndmask_b32_e64 v19, v19, v32, s[20:21] +; GFX7-NEXT: v_cndmask_b32_e64 v21, v21, v32, s[22:23] +; GFX7-NEXT: v_cndmask_b32_e64 v23, v23, v32, s[24:25] +; GFX7-NEXT: v_cndmask_b32_e64 v25, v25, v32, s[26:27] +; GFX7-NEXT: v_cndmask_b32_e64 v27, v27, v32, s[28:29] +; GFX7-NEXT: v_cndmask_b32_e64 v29, v29, v32, s[30:31] +; GFX7-NEXT: v_cndmask_b32_e64 v31, v31, v32, s[34:35] +; GFX7-NEXT: v_cndmask_b32_e64 v30, v30, 0, s[34:35] +; GFX7-NEXT: v_readlane_b32 s35, v34, 3 +; 
GFX7-NEXT: v_readlane_b32 s34, v34, 2 +; GFX7-NEXT: v_readlane_b32 s31, v34, 1 +; GFX7-NEXT: v_readlane_b32 s30, v34, 0 +; GFX7-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GFX7-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; GFX7-NEXT: s_mov_b64 exec, s[4:5] ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_cmp_lt_f64_e32 vcc, v[30:31], v[32:33] -; GFX7-NEXT: v_cmp_o_f64_e64 s[4:5], v[30:31], v[32:33] -; GFX7-NEXT: v_cndmask_b32_e64 v35, v39, v36, s[10:11] -; GFX7-NEXT: v_cndmask_b32_e64 v26, v34, v26, s[6:7] -; GFX7-NEXT: v_cndmask_b32_e64 v34, v50, v28, s[8:9] -; GFX7-NEXT: v_cmp_class_f64_e64 s[6:7], v[28:29], 32 -; GFX7-NEXT: v_cndmask_b32_e64 v34, 0, v34, s[10:11] -; GFX7-NEXT: v_cmp_class_f64_e64 s[8:9], v[50:51], 32 -; GFX7-NEXT: v_cndmask_b32_e32 v36, v33, v31, vcc -; GFX7-NEXT: v_cndmask_b32_e64 v37, v39, v36, s[4:5] -; GFX7-NEXT: v_cndmask_b32_e32 v36, v32, v30, vcc -; GFX7-NEXT: v_cmp_class_f64_e64 vcc, v[30:31], 32 -; GFX7-NEXT: v_cndmask_b32_e64 v36, 0, v36, s[4:5] -; GFX7-NEXT: v_cmp_class_f64_e64 s[4:5], v[32:33], 32 -; GFX7-NEXT: v_cmp_eq_f64_e64 s[10:11], 0, v[34:35] -; GFX7-NEXT: v_cmp_eq_f64_e64 s[12:13], 0, v[36:37] -; GFX7-NEXT: v_cndmask_b32_e64 v28, v34, v28, s[6:7] -; GFX7-NEXT: v_cndmask_b32_e64 v29, v35, v29, s[6:7] -; GFX7-NEXT: v_cndmask_b32_e64 v28, v28, v50, s[8:9] -; GFX7-NEXT: v_cndmask_b32_e32 v30, v36, v30, vcc -; GFX7-NEXT: v_cndmask_b32_e32 v31, v37, v31, vcc -; GFX7-NEXT: v_cndmask_b32_e64 v30, v30, v32, s[4:5] -; GFX7-NEXT: v_cndmask_b32_e64 v29, v29, v51, s[8:9] -; GFX7-NEXT: v_cndmask_b32_e64 v31, v31, v33, s[4:5] -; GFX7-NEXT: v_cndmask_b32_e64 v28, v34, v28, s[10:11] -; GFX7-NEXT: v_cndmask_b32_e64 v29, v35, v29, s[10:11] -; GFX7-NEXT: v_cndmask_b32_e64 v30, v36, v30, s[12:13] -; GFX7-NEXT: v_cndmask_b32_e64 v31, v37, v31, s[12:13] ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_minimum_v16f64: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: 
s_xor_saveexec_b64 s[4:5], -1 +; GFX8-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; GFX8-NEXT: s_mov_b64 exec, s[4:5] ; GFX8-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:8 ; GFX8-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:4 -; GFX8-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:16 -; GFX8-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:12 -; GFX8-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:24 -; GFX8-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:20 -; GFX8-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:32 -; GFX8-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:28 -; GFX8-NEXT: v_mov_b32_e32 v39, 0x7ff80000 -; GFX8-NEXT: s_waitcnt vmcnt(6) -; GFX8-NEXT: v_cmp_lt_f64_e32 vcc, v[0:1], v[31:32] -; GFX8-NEXT: v_cmp_o_f64_e64 s[4:5], v[0:1], v[31:32] -; GFX8-NEXT: s_waitcnt vmcnt(4) -; GFX8-NEXT: v_cmp_o_f64_e64 s[6:7], v[2:3], v[33:34] -; GFX8-NEXT: v_cndmask_b32_e32 v48, v32, v1, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v49, v39, v48, s[4:5] -; GFX8-NEXT: v_cndmask_b32_e32 v48, v31, v0, vcc -; GFX8-NEXT: v_cmp_class_f64_e64 vcc, v[0:1], 32 -; GFX8-NEXT: v_cndmask_b32_e64 v48, 0, v48, s[4:5] -; GFX8-NEXT: v_cmp_class_f64_e64 s[4:5], v[31:32], 32 -; GFX8-NEXT: v_cndmask_b32_e32 v0, v48, v0, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v1, v49, v1, vcc -; GFX8-NEXT: v_cmp_lt_f64_e32 vcc, v[2:3], v[33:34] -; GFX8-NEXT: v_cndmask_b32_e64 v0, v0, v31, s[4:5] -; GFX8-NEXT: v_cndmask_b32_e64 v1, v1, v32, s[4:5] -; GFX8-NEXT: v_cmp_class_f64_e64 s[4:5], v[33:34], 32 -; GFX8-NEXT: v_cndmask_b32_e32 v50, v34, v3, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v31, v33, v2, vcc -; GFX8-NEXT: v_cmp_class_f64_e64 vcc, v[2:3], 32 -; GFX8-NEXT: v_cndmask_b32_e64 v32, v39, v50, s[6:7] -; GFX8-NEXT: v_cndmask_b32_e64 v31, 0, v31, s[6:7] -; GFX8-NEXT: s_waitcnt vmcnt(2) -; GFX8-NEXT: v_cmp_o_f64_e64 s[6:7], v[4:5], v[35:36] -; GFX8-NEXT: v_cndmask_b32_e32 v2, v31, v2, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v3, v32, v3, vcc 
-; GFX8-NEXT: v_cmp_lt_f64_e32 vcc, v[4:5], v[35:36] -; GFX8-NEXT: v_cndmask_b32_e64 v2, v2, v33, s[4:5] -; GFX8-NEXT: v_cndmask_b32_e64 v3, v3, v34, s[4:5] -; GFX8-NEXT: v_cmp_class_f64_e64 s[4:5], v[35:36], 32 -; GFX8-NEXT: v_cndmask_b32_e32 v50, v36, v5, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v33, v35, v4, vcc -; GFX8-NEXT: v_cmp_class_f64_e64 vcc, v[4:5], 32 -; GFX8-NEXT: v_cndmask_b32_e64 v34, v39, v50, s[6:7] -; GFX8-NEXT: v_cndmask_b32_e64 v33, 0, v33, s[6:7] +; GFX8-NEXT: v_writelane_b32 v34, s30, 0 +; GFX8-NEXT: v_writelane_b32 v34, s31, 1 +; GFX8-NEXT: v_writelane_b32 v34, s34, 2 +; GFX8-NEXT: v_writelane_b32 v34, s35, 3 +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[31:32] +; GFX8-NEXT: v_min_f64 v[0:1], v[0:1], v[31:32] +; GFX8-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:16 +; GFX8-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:12 +; GFX8-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_cmp_u_f64_e64 s[4:5], v[2:3], v[31:32] +; GFX8-NEXT: v_min_f64 v[2:3], v[2:3], v[31:32] +; GFX8-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:24 +; GFX8-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:20 +; GFX8-NEXT: v_cndmask_b32_e64 v2, v2, 0, s[4:5] +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_cmp_u_f64_e64 s[6:7], v[4:5], v[31:32] +; GFX8-NEXT: v_min_f64 v[4:5], v[4:5], v[31:32] +; GFX8-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:32 +; GFX8-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:28 +; GFX8-NEXT: v_cndmask_b32_e64 v4, v4, 0, s[6:7] ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_cmp_o_f64_e64 s[6:7], v[6:7], v[37:38] -; GFX8-NEXT: v_cndmask_b32_e32 v4, v33, v4, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v5, v34, v5, vcc -; GFX8-NEXT: v_cmp_lt_f64_e32 vcc, v[6:7], v[37:38] -; GFX8-NEXT: v_cndmask_b32_e64 v4, v4, v35, s[4:5] -; GFX8-NEXT: v_cndmask_b32_e64 v5, v5, v36, s[4:5] -; GFX8-NEXT: v_cmp_class_f64_e64 s[4:5], v[37:38], 32 -; GFX8-NEXT: 
v_cndmask_b32_e32 v50, v38, v7, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v35, v37, v6, vcc -; GFX8-NEXT: v_cmp_class_f64_e64 vcc, v[6:7], 32 -; GFX8-NEXT: v_cndmask_b32_e64 v36, v39, v50, s[6:7] -; GFX8-NEXT: v_cndmask_b32_e64 v35, 0, v35, s[6:7] -; GFX8-NEXT: v_cndmask_b32_e32 v6, v35, v6, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v7, v36, v7, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v6, v6, v37, s[4:5] -; GFX8-NEXT: v_cndmask_b32_e64 v7, v7, v38, s[4:5] -; GFX8-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:36 -; GFX8-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:40 -; GFX8-NEXT: v_cmp_eq_f64_e32 vcc, 0, v[48:49] -; GFX8-NEXT: v_cndmask_b32_e32 v0, v48, v0, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v1, v49, v1, vcc -; GFX8-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:48 -; GFX8-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:44 -; GFX8-NEXT: v_cmp_eq_f64_e32 vcc, 0, v[31:32] -; GFX8-NEXT: v_cndmask_b32_e32 v2, v31, v2, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v3, v32, v3, vcc +; GFX8-NEXT: v_cmp_u_f64_e64 s[8:9], v[6:7], v[31:32] +; GFX8-NEXT: v_min_f64 v[6:7], v[6:7], v[31:32] +; GFX8-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:36 +; GFX8-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:40 +; GFX8-NEXT: v_cndmask_b32_e64 v6, v6, 0, s[8:9] +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_cmp_u_f64_e64 s[10:11], v[8:9], v[31:32] +; GFX8-NEXT: v_min_f64 v[8:9], v[8:9], v[31:32] +; GFX8-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:48 +; GFX8-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:44 +; GFX8-NEXT: v_cndmask_b32_e64 v8, v8, 0, s[10:11] +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_cmp_u_f64_e64 s[12:13], v[10:11], v[31:32] +; GFX8-NEXT: v_min_f64 v[10:11], v[10:11], v[31:32] ; GFX8-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:56 ; GFX8-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:52 -; GFX8-NEXT: v_cmp_eq_f64_e32 vcc, 0, v[33:34] -; GFX8-NEXT: v_cndmask_b32_e32 v4, v33, v4, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v5, 
v34, v5, vcc -; GFX8-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:64 -; GFX8-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:60 -; GFX8-NEXT: v_cmp_eq_f64_e32 vcc, 0, v[35:36] -; GFX8-NEXT: v_cndmask_b32_e32 v6, v35, v6, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v7, v36, v7, vcc -; GFX8-NEXT: v_cmp_class_f64_e64 vcc, v[8:9], 32 -; GFX8-NEXT: s_waitcnt vmcnt(6) -; GFX8-NEXT: v_cmp_lt_f64_e64 s[4:5], v[8:9], v[37:38] -; GFX8-NEXT: v_cmp_o_f64_e64 s[6:7], v[8:9], v[37:38] -; GFX8-NEXT: v_cndmask_b32_e64 v50, v38, v9, s[4:5] -; GFX8-NEXT: v_cndmask_b32_e64 v35, v37, v8, s[4:5] -; GFX8-NEXT: v_cndmask_b32_e64 v36, v39, v50, s[6:7] -; GFX8-NEXT: v_cndmask_b32_e64 v35, 0, v35, s[6:7] -; GFX8-NEXT: v_cmp_class_f64_e64 s[4:5], v[37:38], 32 -; GFX8-NEXT: v_cndmask_b32_e32 v8, v35, v8, vcc -; GFX8-NEXT: v_cmp_eq_f64_e64 s[6:7], 0, v[35:36] -; GFX8-NEXT: v_cndmask_b32_e32 v9, v36, v9, vcc -; GFX8-NEXT: s_waitcnt vmcnt(4) -; GFX8-NEXT: v_cmp_lt_f64_e32 vcc, v[10:11], v[48:49] -; GFX8-NEXT: v_cmp_o_f64_e64 s[8:9], v[10:11], v[48:49] -; GFX8-NEXT: v_cndmask_b32_e64 v8, v8, v37, s[4:5] -; GFX8-NEXT: v_cndmask_b32_e64 v9, v9, v38, s[4:5] -; GFX8-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:68 -; GFX8-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:72 -; GFX8-NEXT: v_cndmask_b32_e64 v8, v35, v8, s[6:7] -; GFX8-NEXT: v_cndmask_b32_e32 v50, v49, v11, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v35, v48, v10, vcc -; GFX8-NEXT: v_cmp_class_f64_e64 vcc, v[10:11], 32 -; GFX8-NEXT: v_cmp_class_f64_e64 s[4:5], v[48:49], 32 -; GFX8-NEXT: v_cndmask_b32_e64 v51, v39, v50, s[8:9] -; GFX8-NEXT: v_cndmask_b32_e64 v50, 0, v35, s[8:9] -; GFX8-NEXT: v_cndmask_b32_e64 v9, v36, v9, s[6:7] -; GFX8-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:80 -; GFX8-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:76 -; GFX8-NEXT: s_waitcnt vmcnt(6) -; GFX8-NEXT: v_cmp_o_f64_e64 s[6:7], v[12:13], v[31:32] -; GFX8-NEXT: v_cndmask_b32_e32 v10, v50, v10, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v11, 
v51, v11, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v10, v10, v48, s[4:5] -; GFX8-NEXT: v_cndmask_b32_e64 v11, v11, v49, s[4:5] -; GFX8-NEXT: v_cmp_eq_f64_e32 vcc, 0, v[50:51] -; GFX8-NEXT: v_cmp_lt_f64_e64 s[4:5], v[12:13], v[31:32] -; GFX8-NEXT: v_cndmask_b32_e32 v10, v50, v10, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v48, v32, v13, s[4:5] -; GFX8-NEXT: v_cndmask_b32_e32 v11, v51, v11, vcc -; GFX8-NEXT: v_cmp_class_f64_e64 vcc, v[12:13], 32 -; GFX8-NEXT: v_cndmask_b32_e64 v49, v39, v48, s[6:7] -; GFX8-NEXT: v_cndmask_b32_e64 v48, v31, v12, s[4:5] -; GFX8-NEXT: v_cmp_class_f64_e64 s[4:5], v[31:32], 32 -; GFX8-NEXT: v_cndmask_b32_e64 v48, 0, v48, s[6:7] -; GFX8-NEXT: s_waitcnt vmcnt(4) -; GFX8-NEXT: v_cmp_o_f64_e64 s[6:7], v[14:15], v[33:34] -; GFX8-NEXT: v_cndmask_b32_e32 v12, v48, v12, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v13, v49, v13, vcc -; GFX8-NEXT: v_cmp_eq_f64_e32 vcc, 0, v[48:49] -; GFX8-NEXT: v_cndmask_b32_e64 v12, v12, v31, s[4:5] -; GFX8-NEXT: v_cndmask_b32_e64 v13, v13, v32, s[4:5] +; GFX8-NEXT: v_cndmask_b32_e64 v10, v10, 0, s[12:13] +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_cmp_u_f64_e64 s[14:15], v[12:13], v[31:32] +; GFX8-NEXT: v_min_f64 v[12:13], v[12:13], v[31:32] +; GFX8-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:64 +; GFX8-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:60 +; GFX8-NEXT: v_cndmask_b32_e64 v12, v12, 0, s[14:15] +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_cmp_u_f64_e64 s[16:17], v[14:15], v[31:32] +; GFX8-NEXT: v_min_f64 v[14:15], v[14:15], v[31:32] +; GFX8-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:68 +; GFX8-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:72 +; GFX8-NEXT: v_cndmask_b32_e64 v14, v14, 0, s[16:17] +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_cmp_u_f64_e64 s[18:19], v[16:17], v[31:32] +; GFX8-NEXT: v_min_f64 v[16:17], v[16:17], v[31:32] +; GFX8-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:80 +; GFX8-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:76 +; GFX8-NEXT: 
v_cndmask_b32_e64 v16, v16, 0, s[18:19] +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_cmp_u_f64_e64 s[20:21], v[18:19], v[31:32] +; GFX8-NEXT: v_min_f64 v[18:19], v[18:19], v[31:32] ; GFX8-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:88 ; GFX8-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:84 -; GFX8-NEXT: v_cmp_lt_f64_e64 s[4:5], v[14:15], v[33:34] -; GFX8-NEXT: v_cndmask_b32_e32 v12, v48, v12, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v13, v49, v13, vcc -; GFX8-NEXT: v_cmp_class_f64_e64 vcc, v[14:15], 32 -; GFX8-NEXT: v_cndmask_b32_e64 v50, v34, v15, s[4:5] -; GFX8-NEXT: v_cndmask_b32_e64 v48, v33, v14, s[4:5] -; GFX8-NEXT: v_cmp_class_f64_e64 s[4:5], v[33:34], 32 -; GFX8-NEXT: v_cndmask_b32_e64 v49, v39, v50, s[6:7] -; GFX8-NEXT: v_cndmask_b32_e64 v48, 0, v48, s[6:7] -; GFX8-NEXT: v_cndmask_b32_e32 v14, v48, v14, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v15, v49, v15, vcc -; GFX8-NEXT: v_cmp_eq_f64_e32 vcc, 0, v[48:49] -; GFX8-NEXT: v_cndmask_b32_e64 v14, v14, v33, s[4:5] -; GFX8-NEXT: v_cndmask_b32_e64 v15, v15, v34, s[4:5] -; GFX8-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:96 -; GFX8-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:92 -; GFX8-NEXT: v_cndmask_b32_e32 v14, v48, v14, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v15, v49, v15, vcc -; GFX8-NEXT: v_cmp_class_f64_e64 vcc, v[16:17], 32 -; GFX8-NEXT: s_waitcnt vmcnt(6) -; GFX8-NEXT: v_cmp_lt_f64_e64 s[4:5], v[16:17], v[37:38] -; GFX8-NEXT: v_cmp_o_f64_e64 s[6:7], v[16:17], v[37:38] -; GFX8-NEXT: v_cndmask_b32_e64 v50, v38, v17, s[4:5] -; GFX8-NEXT: v_cndmask_b32_e64 v48, v37, v16, s[4:5] -; GFX8-NEXT: v_cndmask_b32_e64 v49, v39, v50, s[6:7] -; GFX8-NEXT: v_cndmask_b32_e64 v48, 0, v48, s[6:7] -; GFX8-NEXT: v_cndmask_b32_e32 v16, v48, v16, vcc -; GFX8-NEXT: v_cmp_class_f64_e64 s[4:5], v[37:38], 32 -; GFX8-NEXT: v_cndmask_b32_e32 v17, v49, v17, vcc -; GFX8-NEXT: s_waitcnt vmcnt(4) -; GFX8-NEXT: v_cmp_lt_f64_e32 vcc, v[18:19], v[35:36] -; GFX8-NEXT: v_cmp_o_f64_e64 s[6:7], v[18:19], v[35:36] -; 
GFX8-NEXT: v_cndmask_b32_e64 v16, v16, v37, s[4:5] -; GFX8-NEXT: v_cndmask_b32_e64 v17, v17, v38, s[4:5] -; GFX8-NEXT: v_cndmask_b32_e32 v50, v36, v19, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v37, v35, v18, vcc -; GFX8-NEXT: v_cmp_eq_f64_e32 vcc, 0, v[48:49] -; GFX8-NEXT: v_cmp_class_f64_e64 s[4:5], v[18:19], 32 -; GFX8-NEXT: v_cndmask_b32_e64 v51, v39, v50, s[6:7] -; GFX8-NEXT: v_cndmask_b32_e64 v50, 0, v37, s[6:7] -; GFX8-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:100 -; GFX8-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:104 -; GFX8-NEXT: v_cndmask_b32_e32 v16, v48, v16, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v17, v49, v17, vcc -; GFX8-NEXT: v_cmp_class_f64_e64 vcc, v[35:36], 32 -; GFX8-NEXT: v_cndmask_b32_e64 v18, v50, v18, s[4:5] -; GFX8-NEXT: v_cndmask_b32_e64 v19, v51, v19, s[4:5] -; GFX8-NEXT: s_waitcnt vmcnt(4) -; GFX8-NEXT: v_cmp_lt_f64_e64 s[4:5], v[20:21], v[31:32] -; GFX8-NEXT: v_cmp_o_f64_e64 s[6:7], v[20:21], v[31:32] -; GFX8-NEXT: v_cndmask_b32_e32 v18, v18, v35, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v19, v19, v36, vcc -; GFX8-NEXT: v_cmp_eq_f64_e32 vcc, 0, v[50:51] -; GFX8-NEXT: v_cndmask_b32_e64 v48, v32, v21, s[4:5] -; GFX8-NEXT: v_cndmask_b32_e64 v35, v31, v20, s[4:5] -; GFX8-NEXT: v_cmp_class_f64_e64 s[4:5], v[20:21], 32 -; GFX8-NEXT: v_cndmask_b32_e64 v36, v39, v48, s[6:7] -; GFX8-NEXT: v_cndmask_b32_e64 v35, 0, v35, s[6:7] -; GFX8-NEXT: v_cndmask_b32_e32 v18, v50, v18, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v19, v51, v19, vcc -; GFX8-NEXT: v_cmp_class_f64_e64 vcc, v[31:32], 32 -; GFX8-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:112 -; GFX8-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:108 -; GFX8-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:120 -; GFX8-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:116 -; GFX8-NEXT: v_cndmask_b32_e64 v20, v35, v20, s[4:5] -; GFX8-NEXT: v_cndmask_b32_e64 v21, v36, v21, s[4:5] -; GFX8-NEXT: v_cmp_eq_f64_e64 s[4:5], 0, v[35:36] -; GFX8-NEXT: s_waitcnt vmcnt(6) -; GFX8-NEXT: 
v_cmp_lt_f64_e64 s[6:7], v[22:23], v[33:34] -; GFX8-NEXT: v_cmp_o_f64_e64 s[8:9], v[22:23], v[33:34] -; GFX8-NEXT: v_cndmask_b32_e32 v20, v20, v31, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v21, v21, v32, vcc -; GFX8-NEXT: v_cmp_class_f64_e64 vcc, v[22:23], 32 -; GFX8-NEXT: v_cndmask_b32_e64 v20, v35, v20, s[4:5] -; GFX8-NEXT: v_cndmask_b32_e64 v21, v36, v21, s[4:5] -; GFX8-NEXT: v_cmp_class_f64_e64 s[4:5], v[33:34], 32 -; GFX8-NEXT: v_cndmask_b32_e64 v31, v34, v23, s[6:7] -; GFX8-NEXT: v_cndmask_b32_e64 v36, v39, v31, s[8:9] -; GFX8-NEXT: v_cndmask_b32_e64 v31, v33, v22, s[6:7] -; GFX8-NEXT: v_cndmask_b32_e64 v35, 0, v31, s[8:9] -; GFX8-NEXT: v_cndmask_b32_e32 v22, v35, v22, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v23, v36, v23, vcc -; GFX8-NEXT: v_cmp_eq_f64_e32 vcc, 0, v[35:36] -; GFX8-NEXT: v_cndmask_b32_e64 v22, v22, v33, s[4:5] +; GFX8-NEXT: v_cndmask_b32_e64 v18, v18, 0, s[20:21] +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_cmp_u_f64_e64 s[22:23], v[20:21], v[31:32] +; GFX8-NEXT: v_min_f64 v[20:21], v[20:21], v[31:32] +; GFX8-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:96 +; GFX8-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:92 +; GFX8-NEXT: v_cndmask_b32_e64 v20, v20, 0, s[22:23] +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_cmp_u_f64_e64 s[24:25], v[22:23], v[31:32] +; GFX8-NEXT: v_min_f64 v[22:23], v[22:23], v[31:32] +; GFX8-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:100 +; GFX8-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:104 +; GFX8-NEXT: v_cndmask_b32_e64 v22, v22, 0, s[24:25] +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_cmp_u_f64_e64 s[26:27], v[24:25], v[31:32] +; GFX8-NEXT: v_min_f64 v[24:25], v[24:25], v[31:32] +; GFX8-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:112 +; GFX8-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:108 +; GFX8-NEXT: v_cndmask_b32_e64 v24, v24, 0, s[26:27] +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_cmp_u_f64_e64 s[28:29], v[26:27], v[31:32] +; GFX8-NEXT: v_min_f64 
v[26:27], v[26:27], v[31:32] +; GFX8-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:120 +; GFX8-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:116 +; GFX8-NEXT: v_cndmask_b32_e64 v26, v26, 0, s[28:29] +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_cmp_u_f64_e64 s[30:31], v[28:29], v[31:32] +; GFX8-NEXT: v_min_f64 v[28:29], v[28:29], v[31:32] ; GFX8-NEXT: buffer_load_dword v31, off, s[0:3], s32 ; GFX8-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:128 ; GFX8-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:124 -; GFX8-NEXT: v_cndmask_b32_e64 v23, v23, v34, s[4:5] -; GFX8-NEXT: v_cndmask_b32_e32 v22, v35, v22, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v23, v36, v23, vcc -; GFX8-NEXT: v_cmp_class_f64_e64 vcc, v[24:25], 32 -; GFX8-NEXT: s_waitcnt vmcnt(7) -; GFX8-NEXT: v_cmp_lt_f64_e64 s[4:5], v[24:25], v[37:38] -; GFX8-NEXT: v_cmp_o_f64_e64 s[6:7], v[24:25], v[37:38] -; GFX8-NEXT: v_cndmask_b32_e64 v34, v38, v25, s[4:5] -; GFX8-NEXT: v_cndmask_b32_e64 v35, v39, v34, s[6:7] -; GFX8-NEXT: v_cndmask_b32_e64 v34, v37, v24, s[4:5] -; GFX8-NEXT: v_cndmask_b32_e64 v34, 0, v34, s[6:7] -; GFX8-NEXT: v_cndmask_b32_e32 v24, v34, v24, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v25, v35, v25, vcc -; GFX8-NEXT: v_cmp_class_f64_e64 vcc, v[37:38], 32 -; GFX8-NEXT: v_cmp_eq_f64_e64 s[4:5], 0, v[34:35] -; GFX8-NEXT: v_cndmask_b32_e32 v24, v24, v37, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v25, v25, v38, vcc -; GFX8-NEXT: v_cmp_class_f64_e64 vcc, v[26:27], 32 -; GFX8-NEXT: v_cndmask_b32_e64 v24, v34, v24, s[4:5] -; GFX8-NEXT: v_cndmask_b32_e64 v25, v35, v25, s[4:5] -; GFX8-NEXT: s_waitcnt vmcnt(5) -; GFX8-NEXT: v_cmp_lt_f64_e64 s[6:7], v[26:27], v[48:49] -; GFX8-NEXT: v_cmp_o_f64_e64 s[8:9], v[26:27], v[48:49] -; GFX8-NEXT: v_cmp_class_f64_e64 s[4:5], v[48:49], 32 -; GFX8-NEXT: s_waitcnt vmcnt(3) -; GFX8-NEXT: v_cmp_o_f64_e64 s[10:11], v[28:29], v[50:51] -; GFX8-NEXT: v_cndmask_b32_e64 v36, v49, v27, s[6:7] -; GFX8-NEXT: v_cndmask_b32_e64 v34, v48, v26, s[6:7] -; GFX8-NEXT: 
v_cndmask_b32_e64 v35, v39, v36, s[8:9] -; GFX8-NEXT: v_cndmask_b32_e64 v34, 0, v34, s[8:9] -; GFX8-NEXT: v_cmp_lt_f64_e64 s[8:9], v[28:29], v[50:51] -; GFX8-NEXT: v_cndmask_b32_e32 v26, v34, v26, vcc -; GFX8-NEXT: v_cmp_eq_f64_e64 s[6:7], 0, v[34:35] -; GFX8-NEXT: v_cndmask_b32_e32 v27, v35, v27, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v26, v26, v48, s[4:5] -; GFX8-NEXT: v_cndmask_b32_e64 v27, v27, v49, s[4:5] -; GFX8-NEXT: v_cndmask_b32_e64 v36, v51, v29, s[8:9] -; GFX8-NEXT: v_cndmask_b32_e64 v27, v35, v27, s[6:7] +; GFX8-NEXT: v_cndmask_b32_e64 v28, v28, 0, s[30:31] +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_cmp_u_f64_e64 s[34:35], v[30:31], v[32:33] +; GFX8-NEXT: v_min_f64 v[30:31], v[30:31], v[32:33] +; GFX8-NEXT: v_mov_b32_e32 v32, 0x7ff80000 +; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v32, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v3, v3, v32, s[4:5] +; GFX8-NEXT: v_cndmask_b32_e64 v5, v5, v32, s[6:7] +; GFX8-NEXT: v_cndmask_b32_e64 v7, v7, v32, s[8:9] +; GFX8-NEXT: v_cndmask_b32_e64 v9, v9, v32, s[10:11] +; GFX8-NEXT: v_cndmask_b32_e64 v11, v11, v32, s[12:13] +; GFX8-NEXT: v_cndmask_b32_e64 v13, v13, v32, s[14:15] +; GFX8-NEXT: v_cndmask_b32_e64 v15, v15, v32, s[16:17] +; GFX8-NEXT: v_cndmask_b32_e64 v17, v17, v32, s[18:19] +; GFX8-NEXT: v_cndmask_b32_e64 v19, v19, v32, s[20:21] +; GFX8-NEXT: v_cndmask_b32_e64 v21, v21, v32, s[22:23] +; GFX8-NEXT: v_cndmask_b32_e64 v23, v23, v32, s[24:25] +; GFX8-NEXT: v_cndmask_b32_e64 v25, v25, v32, s[26:27] +; GFX8-NEXT: v_cndmask_b32_e64 v27, v27, v32, s[28:29] +; GFX8-NEXT: v_cndmask_b32_e64 v29, v29, v32, s[30:31] +; GFX8-NEXT: v_cndmask_b32_e64 v31, v31, v32, s[34:35] +; GFX8-NEXT: v_cndmask_b32_e64 v30, v30, 0, s[34:35] +; GFX8-NEXT: v_readlane_b32 s35, v34, 3 +; GFX8-NEXT: v_readlane_b32 s34, v34, 2 +; GFX8-NEXT: v_readlane_b32 s31, v34, 1 +; GFX8-NEXT: v_readlane_b32 s30, v34, 0 +; GFX8-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GFX8-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; 
GFX8-NEXT: s_mov_b64 exec, s[4:5] ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_cmp_lt_f64_e32 vcc, v[30:31], v[32:33] -; GFX8-NEXT: v_cmp_o_f64_e64 s[4:5], v[30:31], v[32:33] -; GFX8-NEXT: v_cndmask_b32_e64 v35, v39, v36, s[10:11] -; GFX8-NEXT: v_cndmask_b32_e64 v26, v34, v26, s[6:7] -; GFX8-NEXT: v_cndmask_b32_e64 v34, v50, v28, s[8:9] -; GFX8-NEXT: v_cmp_class_f64_e64 s[6:7], v[28:29], 32 -; GFX8-NEXT: v_cndmask_b32_e64 v34, 0, v34, s[10:11] -; GFX8-NEXT: v_cmp_class_f64_e64 s[8:9], v[50:51], 32 -; GFX8-NEXT: v_cndmask_b32_e32 v36, v33, v31, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v37, v39, v36, s[4:5] -; GFX8-NEXT: v_cndmask_b32_e32 v36, v32, v30, vcc -; GFX8-NEXT: v_cmp_class_f64_e64 vcc, v[30:31], 32 -; GFX8-NEXT: v_cndmask_b32_e64 v36, 0, v36, s[4:5] -; GFX8-NEXT: v_cmp_class_f64_e64 s[4:5], v[32:33], 32 -; GFX8-NEXT: v_cmp_eq_f64_e64 s[10:11], 0, v[34:35] -; GFX8-NEXT: v_cmp_eq_f64_e64 s[12:13], 0, v[36:37] -; GFX8-NEXT: v_cndmask_b32_e64 v28, v34, v28, s[6:7] -; GFX8-NEXT: v_cndmask_b32_e64 v29, v35, v29, s[6:7] -; GFX8-NEXT: v_cndmask_b32_e64 v28, v28, v50, s[8:9] -; GFX8-NEXT: v_cndmask_b32_e32 v30, v36, v30, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v31, v37, v31, vcc -; GFX8-NEXT: v_cndmask_b32_e64 v30, v30, v32, s[4:5] -; GFX8-NEXT: v_cndmask_b32_e64 v29, v29, v51, s[8:9] -; GFX8-NEXT: v_cndmask_b32_e64 v31, v31, v33, s[4:5] -; GFX8-NEXT: v_cndmask_b32_e64 v28, v34, v28, s[10:11] -; GFX8-NEXT: v_cndmask_b32_e64 v29, v35, v29, s[10:11] -; GFX8-NEXT: v_cndmask_b32_e64 v30, v36, v30, s[12:13] -; GFX8-NEXT: v_cndmask_b32_e64 v31, v37, v31, s[12:13] ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_minimum_v16f64: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; GFX9-NEXT: s_mov_b64 exec, s[4:5] ; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:8 ; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 
offset:4 -; GFX9-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:16 -; GFX9-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:12 -; GFX9-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:24 -; GFX9-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:20 -; GFX9-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:32 -; GFX9-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:28 -; GFX9-NEXT: v_mov_b32_e32 v39, 0x7ff80000 -; GFX9-NEXT: s_waitcnt vmcnt(6) -; GFX9-NEXT: v_cmp_lt_f64_e32 vcc, v[0:1], v[31:32] -; GFX9-NEXT: v_cmp_o_f64_e64 s[4:5], v[0:1], v[31:32] -; GFX9-NEXT: s_waitcnt vmcnt(4) -; GFX9-NEXT: v_cmp_o_f64_e64 s[6:7], v[2:3], v[33:34] -; GFX9-NEXT: v_cndmask_b32_e32 v48, v32, v1, vcc -; GFX9-NEXT: v_cndmask_b32_e64 v49, v39, v48, s[4:5] -; GFX9-NEXT: v_cndmask_b32_e32 v48, v31, v0, vcc -; GFX9-NEXT: v_cmp_class_f64_e64 vcc, v[0:1], 32 -; GFX9-NEXT: v_cndmask_b32_e64 v48, 0, v48, s[4:5] -; GFX9-NEXT: v_cmp_class_f64_e64 s[4:5], v[31:32], 32 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v48, v0, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v1, v49, v1, vcc -; GFX9-NEXT: v_cmp_lt_f64_e32 vcc, v[2:3], v[33:34] -; GFX9-NEXT: v_cndmask_b32_e64 v0, v0, v31, s[4:5] -; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, v32, s[4:5] -; GFX9-NEXT: v_cmp_class_f64_e64 s[4:5], v[33:34], 32 -; GFX9-NEXT: v_cndmask_b32_e32 v50, v34, v3, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v31, v33, v2, vcc -; GFX9-NEXT: v_cmp_class_f64_e64 vcc, v[2:3], 32 -; GFX9-NEXT: v_cndmask_b32_e64 v32, v39, v50, s[6:7] -; GFX9-NEXT: v_cndmask_b32_e64 v31, 0, v31, s[6:7] -; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_cmp_o_f64_e64 s[6:7], v[4:5], v[35:36] -; GFX9-NEXT: v_cndmask_b32_e32 v2, v31, v2, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v3, v32, v3, vcc -; GFX9-NEXT: v_cmp_lt_f64_e32 vcc, v[4:5], v[35:36] -; GFX9-NEXT: v_cndmask_b32_e64 v2, v2, v33, s[4:5] -; GFX9-NEXT: v_cndmask_b32_e64 v3, v3, v34, s[4:5] -; GFX9-NEXT: v_cmp_class_f64_e64 s[4:5], v[35:36], 32 -; GFX9-NEXT: v_cndmask_b32_e32 v50, v36, v5, vcc -; 
GFX9-NEXT: v_cndmask_b32_e32 v33, v35, v4, vcc -; GFX9-NEXT: v_cmp_class_f64_e64 vcc, v[4:5], 32 -; GFX9-NEXT: v_cndmask_b32_e64 v34, v39, v50, s[6:7] -; GFX9-NEXT: v_cndmask_b32_e64 v33, 0, v33, s[6:7] +; GFX9-NEXT: v_writelane_b32 v34, s30, 0 +; GFX9-NEXT: v_writelane_b32 v34, s31, 1 +; GFX9-NEXT: v_writelane_b32 v34, s34, 2 +; GFX9-NEXT: v_writelane_b32 v34, s35, 3 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[31:32] +; GFX9-NEXT: v_min_f64 v[0:1], v[0:1], v[31:32] +; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:16 +; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:12 +; GFX9-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_cmp_o_f64_e64 s[6:7], v[6:7], v[37:38] -; GFX9-NEXT: v_cndmask_b32_e32 v4, v33, v4, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v5, v34, v5, vcc -; GFX9-NEXT: v_cmp_lt_f64_e32 vcc, v[6:7], v[37:38] -; GFX9-NEXT: v_cndmask_b32_e64 v4, v4, v35, s[4:5] -; GFX9-NEXT: v_cndmask_b32_e64 v5, v5, v36, s[4:5] -; GFX9-NEXT: v_cmp_class_f64_e64 s[4:5], v[37:38], 32 -; GFX9-NEXT: v_cndmask_b32_e32 v50, v38, v7, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v35, v37, v6, vcc -; GFX9-NEXT: v_cmp_class_f64_e64 vcc, v[6:7], 32 -; GFX9-NEXT: v_cndmask_b32_e64 v36, v39, v50, s[6:7] -; GFX9-NEXT: v_cndmask_b32_e64 v35, 0, v35, s[6:7] -; GFX9-NEXT: v_cndmask_b32_e32 v6, v35, v6, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v7, v36, v7, vcc -; GFX9-NEXT: v_cndmask_b32_e64 v6, v6, v37, s[4:5] -; GFX9-NEXT: v_cndmask_b32_e64 v7, v7, v38, s[4:5] -; GFX9-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:36 -; GFX9-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:40 -; GFX9-NEXT: v_cmp_eq_f64_e32 vcc, 0, v[48:49] -; GFX9-NEXT: v_cndmask_b32_e32 v0, v48, v0, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v1, v49, v1, vcc -; GFX9-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:48 -; GFX9-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:44 -; GFX9-NEXT: v_cmp_eq_f64_e32 vcc, 0, v[31:32] -; GFX9-NEXT: 
v_cndmask_b32_e32 v2, v31, v2, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v3, v32, v3, vcc +; GFX9-NEXT: v_cmp_u_f64_e64 s[4:5], v[2:3], v[31:32] +; GFX9-NEXT: v_min_f64 v[2:3], v[2:3], v[31:32] +; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:24 +; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:20 +; GFX9-NEXT: v_cndmask_b32_e64 v2, v2, 0, s[4:5] +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_cmp_u_f64_e64 s[6:7], v[4:5], v[31:32] +; GFX9-NEXT: v_min_f64 v[4:5], v[4:5], v[31:32] +; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:32 +; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:28 +; GFX9-NEXT: v_cndmask_b32_e64 v4, v4, 0, s[6:7] +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_cmp_u_f64_e64 s[8:9], v[6:7], v[31:32] +; GFX9-NEXT: v_min_f64 v[6:7], v[6:7], v[31:32] +; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:36 +; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:40 +; GFX9-NEXT: v_cndmask_b32_e64 v6, v6, 0, s[8:9] +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_cmp_u_f64_e64 s[10:11], v[8:9], v[31:32] +; GFX9-NEXT: v_min_f64 v[8:9], v[8:9], v[31:32] +; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:48 +; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:44 +; GFX9-NEXT: v_cndmask_b32_e64 v8, v8, 0, s[10:11] +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_cmp_u_f64_e64 s[12:13], v[10:11], v[31:32] +; GFX9-NEXT: v_min_f64 v[10:11], v[10:11], v[31:32] ; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:56 ; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:52 -; GFX9-NEXT: v_cmp_eq_f64_e32 vcc, 0, v[33:34] -; GFX9-NEXT: v_cndmask_b32_e32 v4, v33, v4, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v5, v34, v5, vcc -; GFX9-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:64 -; GFX9-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:60 -; GFX9-NEXT: v_cmp_eq_f64_e32 vcc, 0, v[35:36] -; GFX9-NEXT: v_cndmask_b32_e32 v6, v35, v6, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v7, v36, 
v7, vcc -; GFX9-NEXT: v_cmp_class_f64_e64 vcc, v[8:9], 32 -; GFX9-NEXT: s_waitcnt vmcnt(6) -; GFX9-NEXT: v_cmp_lt_f64_e64 s[4:5], v[8:9], v[37:38] -; GFX9-NEXT: v_cmp_o_f64_e64 s[6:7], v[8:9], v[37:38] -; GFX9-NEXT: v_cndmask_b32_e64 v50, v38, v9, s[4:5] -; GFX9-NEXT: v_cndmask_b32_e64 v35, v37, v8, s[4:5] -; GFX9-NEXT: v_cndmask_b32_e64 v36, v39, v50, s[6:7] -; GFX9-NEXT: v_cndmask_b32_e64 v35, 0, v35, s[6:7] -; GFX9-NEXT: v_cmp_class_f64_e64 s[4:5], v[37:38], 32 -; GFX9-NEXT: v_cndmask_b32_e32 v8, v35, v8, vcc -; GFX9-NEXT: v_cmp_eq_f64_e64 s[6:7], 0, v[35:36] -; GFX9-NEXT: v_cndmask_b32_e32 v9, v36, v9, vcc -; GFX9-NEXT: s_waitcnt vmcnt(4) -; GFX9-NEXT: v_cmp_lt_f64_e32 vcc, v[10:11], v[48:49] -; GFX9-NEXT: v_cmp_o_f64_e64 s[8:9], v[10:11], v[48:49] -; GFX9-NEXT: v_cndmask_b32_e64 v8, v8, v37, s[4:5] -; GFX9-NEXT: v_cndmask_b32_e64 v9, v9, v38, s[4:5] -; GFX9-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:68 -; GFX9-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:72 -; GFX9-NEXT: v_cndmask_b32_e64 v8, v35, v8, s[6:7] -; GFX9-NEXT: v_cndmask_b32_e32 v50, v49, v11, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v35, v48, v10, vcc -; GFX9-NEXT: v_cmp_class_f64_e64 vcc, v[10:11], 32 -; GFX9-NEXT: v_cmp_class_f64_e64 s[4:5], v[48:49], 32 -; GFX9-NEXT: v_cndmask_b32_e64 v51, v39, v50, s[8:9] -; GFX9-NEXT: v_cndmask_b32_e64 v50, 0, v35, s[8:9] -; GFX9-NEXT: v_cndmask_b32_e64 v9, v36, v9, s[6:7] -; GFX9-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:80 -; GFX9-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:76 -; GFX9-NEXT: s_waitcnt vmcnt(6) -; GFX9-NEXT: v_cmp_o_f64_e64 s[6:7], v[12:13], v[31:32] -; GFX9-NEXT: v_cndmask_b32_e32 v10, v50, v10, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v11, v51, v11, vcc -; GFX9-NEXT: v_cndmask_b32_e64 v10, v10, v48, s[4:5] -; GFX9-NEXT: v_cndmask_b32_e64 v11, v11, v49, s[4:5] -; GFX9-NEXT: v_cmp_eq_f64_e32 vcc, 0, v[50:51] -; GFX9-NEXT: v_cmp_lt_f64_e64 s[4:5], v[12:13], v[31:32] -; GFX9-NEXT: v_cndmask_b32_e32 v10, v50, v10, vcc 
-; GFX9-NEXT: v_cndmask_b32_e64 v48, v32, v13, s[4:5] -; GFX9-NEXT: v_cndmask_b32_e32 v11, v51, v11, vcc -; GFX9-NEXT: v_cmp_class_f64_e64 vcc, v[12:13], 32 -; GFX9-NEXT: v_cndmask_b32_e64 v49, v39, v48, s[6:7] -; GFX9-NEXT: v_cndmask_b32_e64 v48, v31, v12, s[4:5] -; GFX9-NEXT: v_cmp_class_f64_e64 s[4:5], v[31:32], 32 -; GFX9-NEXT: v_cndmask_b32_e64 v48, 0, v48, s[6:7] -; GFX9-NEXT: s_waitcnt vmcnt(4) -; GFX9-NEXT: v_cmp_o_f64_e64 s[6:7], v[14:15], v[33:34] -; GFX9-NEXT: v_cndmask_b32_e32 v12, v48, v12, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v13, v49, v13, vcc -; GFX9-NEXT: v_cmp_eq_f64_e32 vcc, 0, v[48:49] -; GFX9-NEXT: v_cndmask_b32_e64 v12, v12, v31, s[4:5] -; GFX9-NEXT: v_cndmask_b32_e64 v13, v13, v32, s[4:5] +; GFX9-NEXT: v_cndmask_b32_e64 v10, v10, 0, s[12:13] +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_cmp_u_f64_e64 s[14:15], v[12:13], v[31:32] +; GFX9-NEXT: v_min_f64 v[12:13], v[12:13], v[31:32] +; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:64 +; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:60 +; GFX9-NEXT: v_cndmask_b32_e64 v12, v12, 0, s[14:15] +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_cmp_u_f64_e64 s[16:17], v[14:15], v[31:32] +; GFX9-NEXT: v_min_f64 v[14:15], v[14:15], v[31:32] +; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:68 +; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:72 +; GFX9-NEXT: v_cndmask_b32_e64 v14, v14, 0, s[16:17] +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_cmp_u_f64_e64 s[18:19], v[16:17], v[31:32] +; GFX9-NEXT: v_min_f64 v[16:17], v[16:17], v[31:32] +; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:80 +; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:76 +; GFX9-NEXT: v_cndmask_b32_e64 v16, v16, 0, s[18:19] +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_cmp_u_f64_e64 s[20:21], v[18:19], v[31:32] +; GFX9-NEXT: v_min_f64 v[18:19], v[18:19], v[31:32] ; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:88 ; GFX9-NEXT: buffer_load_dword v31, 
off, s[0:3], s32 offset:84 -; GFX9-NEXT: v_cmp_lt_f64_e64 s[4:5], v[14:15], v[33:34] -; GFX9-NEXT: v_cndmask_b32_e32 v12, v48, v12, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v13, v49, v13, vcc -; GFX9-NEXT: v_cmp_class_f64_e64 vcc, v[14:15], 32 -; GFX9-NEXT: v_cndmask_b32_e64 v50, v34, v15, s[4:5] -; GFX9-NEXT: v_cndmask_b32_e64 v48, v33, v14, s[4:5] -; GFX9-NEXT: v_cmp_class_f64_e64 s[4:5], v[33:34], 32 -; GFX9-NEXT: v_cndmask_b32_e64 v49, v39, v50, s[6:7] -; GFX9-NEXT: v_cndmask_b32_e64 v48, 0, v48, s[6:7] -; GFX9-NEXT: v_cndmask_b32_e32 v14, v48, v14, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v15, v49, v15, vcc -; GFX9-NEXT: v_cmp_eq_f64_e32 vcc, 0, v[48:49] -; GFX9-NEXT: v_cndmask_b32_e64 v14, v14, v33, s[4:5] -; GFX9-NEXT: v_cndmask_b32_e64 v15, v15, v34, s[4:5] -; GFX9-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:96 -; GFX9-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:92 -; GFX9-NEXT: v_cndmask_b32_e32 v14, v48, v14, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v15, v49, v15, vcc -; GFX9-NEXT: v_cmp_class_f64_e64 vcc, v[16:17], 32 -; GFX9-NEXT: s_waitcnt vmcnt(6) -; GFX9-NEXT: v_cmp_lt_f64_e64 s[4:5], v[16:17], v[37:38] -; GFX9-NEXT: v_cmp_o_f64_e64 s[6:7], v[16:17], v[37:38] -; GFX9-NEXT: v_cndmask_b32_e64 v50, v38, v17, s[4:5] -; GFX9-NEXT: v_cndmask_b32_e64 v48, v37, v16, s[4:5] -; GFX9-NEXT: v_cndmask_b32_e64 v49, v39, v50, s[6:7] -; GFX9-NEXT: v_cndmask_b32_e64 v48, 0, v48, s[6:7] -; GFX9-NEXT: v_cndmask_b32_e32 v16, v48, v16, vcc -; GFX9-NEXT: v_cmp_class_f64_e64 s[4:5], v[37:38], 32 -; GFX9-NEXT: v_cndmask_b32_e32 v17, v49, v17, vcc -; GFX9-NEXT: s_waitcnt vmcnt(4) -; GFX9-NEXT: v_cmp_lt_f64_e32 vcc, v[18:19], v[35:36] -; GFX9-NEXT: v_cmp_o_f64_e64 s[6:7], v[18:19], v[35:36] -; GFX9-NEXT: v_cndmask_b32_e64 v16, v16, v37, s[4:5] -; GFX9-NEXT: v_cndmask_b32_e64 v17, v17, v38, s[4:5] -; GFX9-NEXT: v_cndmask_b32_e32 v50, v36, v19, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v37, v35, v18, vcc -; GFX9-NEXT: v_cmp_eq_f64_e32 vcc, 0, v[48:49] -; GFX9-NEXT: 
v_cmp_class_f64_e64 s[4:5], v[18:19], 32 -; GFX9-NEXT: v_cndmask_b32_e64 v51, v39, v50, s[6:7] -; GFX9-NEXT: v_cndmask_b32_e64 v50, 0, v37, s[6:7] -; GFX9-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:100 -; GFX9-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:104 -; GFX9-NEXT: v_cndmask_b32_e32 v16, v48, v16, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v17, v49, v17, vcc -; GFX9-NEXT: v_cmp_class_f64_e64 vcc, v[35:36], 32 -; GFX9-NEXT: v_cndmask_b32_e64 v18, v50, v18, s[4:5] -; GFX9-NEXT: v_cndmask_b32_e64 v19, v51, v19, s[4:5] -; GFX9-NEXT: s_waitcnt vmcnt(4) -; GFX9-NEXT: v_cmp_lt_f64_e64 s[4:5], v[20:21], v[31:32] -; GFX9-NEXT: v_cmp_o_f64_e64 s[6:7], v[20:21], v[31:32] -; GFX9-NEXT: v_cndmask_b32_e32 v18, v18, v35, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v19, v19, v36, vcc -; GFX9-NEXT: v_cmp_eq_f64_e32 vcc, 0, v[50:51] -; GFX9-NEXT: v_cndmask_b32_e64 v48, v32, v21, s[4:5] -; GFX9-NEXT: v_cndmask_b32_e64 v35, v31, v20, s[4:5] -; GFX9-NEXT: v_cmp_class_f64_e64 s[4:5], v[20:21], 32 -; GFX9-NEXT: v_cndmask_b32_e64 v36, v39, v48, s[6:7] -; GFX9-NEXT: v_cndmask_b32_e64 v35, 0, v35, s[6:7] -; GFX9-NEXT: v_cndmask_b32_e32 v18, v50, v18, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v19, v51, v19, vcc -; GFX9-NEXT: v_cmp_class_f64_e64 vcc, v[31:32], 32 -; GFX9-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:112 -; GFX9-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:108 -; GFX9-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:120 -; GFX9-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:116 -; GFX9-NEXT: v_cndmask_b32_e64 v20, v35, v20, s[4:5] -; GFX9-NEXT: v_cndmask_b32_e64 v21, v36, v21, s[4:5] -; GFX9-NEXT: v_cmp_eq_f64_e64 s[4:5], 0, v[35:36] -; GFX9-NEXT: s_waitcnt vmcnt(6) -; GFX9-NEXT: v_cmp_lt_f64_e64 s[6:7], v[22:23], v[33:34] -; GFX9-NEXT: v_cmp_o_f64_e64 s[8:9], v[22:23], v[33:34] -; GFX9-NEXT: v_cndmask_b32_e32 v20, v20, v31, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v21, v21, v32, vcc -; GFX9-NEXT: v_cmp_class_f64_e64 vcc, v[22:23], 32 -; GFX9-NEXT: 
v_cndmask_b32_e64 v20, v35, v20, s[4:5] -; GFX9-NEXT: v_cndmask_b32_e64 v21, v36, v21, s[4:5] -; GFX9-NEXT: v_cmp_class_f64_e64 s[4:5], v[33:34], 32 -; GFX9-NEXT: v_cndmask_b32_e64 v31, v34, v23, s[6:7] -; GFX9-NEXT: v_cndmask_b32_e64 v36, v39, v31, s[8:9] -; GFX9-NEXT: v_cndmask_b32_e64 v31, v33, v22, s[6:7] -; GFX9-NEXT: v_cndmask_b32_e64 v35, 0, v31, s[8:9] -; GFX9-NEXT: v_cndmask_b32_e32 v22, v35, v22, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v23, v36, v23, vcc -; GFX9-NEXT: v_cmp_eq_f64_e32 vcc, 0, v[35:36] -; GFX9-NEXT: v_cndmask_b32_e64 v22, v22, v33, s[4:5] +; GFX9-NEXT: v_cndmask_b32_e64 v18, v18, 0, s[20:21] +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_cmp_u_f64_e64 s[22:23], v[20:21], v[31:32] +; GFX9-NEXT: v_min_f64 v[20:21], v[20:21], v[31:32] +; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:96 +; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:92 +; GFX9-NEXT: v_cndmask_b32_e64 v20, v20, 0, s[22:23] +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_cmp_u_f64_e64 s[24:25], v[22:23], v[31:32] +; GFX9-NEXT: v_min_f64 v[22:23], v[22:23], v[31:32] +; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:100 +; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:104 +; GFX9-NEXT: v_cndmask_b32_e64 v22, v22, 0, s[24:25] +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_cmp_u_f64_e64 s[26:27], v[24:25], v[31:32] +; GFX9-NEXT: v_min_f64 v[24:25], v[24:25], v[31:32] +; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:112 +; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:108 +; GFX9-NEXT: v_cndmask_b32_e64 v24, v24, 0, s[26:27] +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_cmp_u_f64_e64 s[28:29], v[26:27], v[31:32] +; GFX9-NEXT: v_min_f64 v[26:27], v[26:27], v[31:32] +; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:120 +; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:116 +; GFX9-NEXT: v_cndmask_b32_e64 v26, v26, 0, s[28:29] +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_cmp_u_f64_e64 
s[30:31], v[28:29], v[31:32] +; GFX9-NEXT: v_min_f64 v[28:29], v[28:29], v[31:32] ; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 ; GFX9-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:128 ; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:124 -; GFX9-NEXT: v_cndmask_b32_e64 v23, v23, v34, s[4:5] -; GFX9-NEXT: v_cndmask_b32_e32 v22, v35, v22, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v23, v36, v23, vcc -; GFX9-NEXT: v_cmp_class_f64_e64 vcc, v[24:25], 32 -; GFX9-NEXT: s_waitcnt vmcnt(7) -; GFX9-NEXT: v_cmp_lt_f64_e64 s[4:5], v[24:25], v[37:38] -; GFX9-NEXT: v_cmp_o_f64_e64 s[6:7], v[24:25], v[37:38] -; GFX9-NEXT: v_cndmask_b32_e64 v34, v38, v25, s[4:5] -; GFX9-NEXT: v_cndmask_b32_e64 v35, v39, v34, s[6:7] -; GFX9-NEXT: v_cndmask_b32_e64 v34, v37, v24, s[4:5] -; GFX9-NEXT: v_cndmask_b32_e64 v34, 0, v34, s[6:7] -; GFX9-NEXT: v_cndmask_b32_e32 v24, v34, v24, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v25, v35, v25, vcc -; GFX9-NEXT: v_cmp_class_f64_e64 vcc, v[37:38], 32 -; GFX9-NEXT: v_cmp_eq_f64_e64 s[4:5], 0, v[34:35] -; GFX9-NEXT: v_cndmask_b32_e32 v24, v24, v37, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v25, v25, v38, vcc -; GFX9-NEXT: v_cmp_class_f64_e64 vcc, v[26:27], 32 -; GFX9-NEXT: v_cndmask_b32_e64 v24, v34, v24, s[4:5] -; GFX9-NEXT: v_cndmask_b32_e64 v25, v35, v25, s[4:5] -; GFX9-NEXT: s_waitcnt vmcnt(5) -; GFX9-NEXT: v_cmp_lt_f64_e64 s[6:7], v[26:27], v[48:49] -; GFX9-NEXT: v_cmp_o_f64_e64 s[8:9], v[26:27], v[48:49] -; GFX9-NEXT: v_cmp_class_f64_e64 s[4:5], v[48:49], 32 -; GFX9-NEXT: s_waitcnt vmcnt(3) -; GFX9-NEXT: v_cmp_o_f64_e64 s[10:11], v[28:29], v[50:51] -; GFX9-NEXT: v_cndmask_b32_e64 v36, v49, v27, s[6:7] -; GFX9-NEXT: v_cndmask_b32_e64 v34, v48, v26, s[6:7] -; GFX9-NEXT: v_cndmask_b32_e64 v35, v39, v36, s[8:9] -; GFX9-NEXT: v_cndmask_b32_e64 v34, 0, v34, s[8:9] -; GFX9-NEXT: v_cmp_lt_f64_e64 s[8:9], v[28:29], v[50:51] -; GFX9-NEXT: v_cndmask_b32_e32 v26, v34, v26, vcc -; GFX9-NEXT: v_cmp_eq_f64_e64 s[6:7], 0, v[34:35] -; GFX9-NEXT: 
v_cndmask_b32_e32 v27, v35, v27, vcc -; GFX9-NEXT: v_cndmask_b32_e64 v26, v26, v48, s[4:5] -; GFX9-NEXT: v_cndmask_b32_e64 v27, v27, v49, s[4:5] -; GFX9-NEXT: v_cndmask_b32_e64 v36, v51, v29, s[8:9] -; GFX9-NEXT: v_cndmask_b32_e64 v27, v35, v27, s[6:7] +; GFX9-NEXT: v_cndmask_b32_e64 v28, v28, 0, s[30:31] +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_cmp_u_f64_e64 s[34:35], v[30:31], v[32:33] +; GFX9-NEXT: v_min_f64 v[30:31], v[30:31], v[32:33] +; GFX9-NEXT: v_mov_b32_e32 v32, 0x7ff80000 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v32, vcc +; GFX9-NEXT: v_cndmask_b32_e64 v3, v3, v32, s[4:5] +; GFX9-NEXT: v_cndmask_b32_e64 v5, v5, v32, s[6:7] +; GFX9-NEXT: v_cndmask_b32_e64 v7, v7, v32, s[8:9] +; GFX9-NEXT: v_cndmask_b32_e64 v9, v9, v32, s[10:11] +; GFX9-NEXT: v_cndmask_b32_e64 v11, v11, v32, s[12:13] +; GFX9-NEXT: v_cndmask_b32_e64 v13, v13, v32, s[14:15] +; GFX9-NEXT: v_cndmask_b32_e64 v15, v15, v32, s[16:17] +; GFX9-NEXT: v_cndmask_b32_e64 v17, v17, v32, s[18:19] +; GFX9-NEXT: v_cndmask_b32_e64 v19, v19, v32, s[20:21] +; GFX9-NEXT: v_cndmask_b32_e64 v21, v21, v32, s[22:23] +; GFX9-NEXT: v_cndmask_b32_e64 v23, v23, v32, s[24:25] +; GFX9-NEXT: v_cndmask_b32_e64 v25, v25, v32, s[26:27] +; GFX9-NEXT: v_cndmask_b32_e64 v27, v27, v32, s[28:29] +; GFX9-NEXT: v_cndmask_b32_e64 v29, v29, v32, s[30:31] +; GFX9-NEXT: v_cndmask_b32_e64 v31, v31, v32, s[34:35] +; GFX9-NEXT: v_cndmask_b32_e64 v30, v30, 0, s[34:35] +; GFX9-NEXT: v_readlane_b32 s35, v34, 3 +; GFX9-NEXT: v_readlane_b32 s34, v34, 2 +; GFX9-NEXT: v_readlane_b32 s31, v34, 1 +; GFX9-NEXT: v_readlane_b32 s30, v34, 0 +; GFX9-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GFX9-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; GFX9-NEXT: s_mov_b64 exec, s[4:5] ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_cmp_lt_f64_e32 vcc, v[30:31], v[32:33] -; GFX9-NEXT: v_cmp_o_f64_e64 s[4:5], v[30:31], v[32:33] -; GFX9-NEXT: v_cndmask_b32_e64 v35, v39, v36, s[10:11] -; GFX9-NEXT: v_cndmask_b32_e64 v26, v34, 
v26, s[6:7] -; GFX9-NEXT: v_cndmask_b32_e64 v34, v50, v28, s[8:9] -; GFX9-NEXT: v_cmp_class_f64_e64 s[6:7], v[28:29], 32 -; GFX9-NEXT: v_cndmask_b32_e64 v34, 0, v34, s[10:11] -; GFX9-NEXT: v_cmp_class_f64_e64 s[8:9], v[50:51], 32 -; GFX9-NEXT: v_cndmask_b32_e32 v36, v33, v31, vcc -; GFX9-NEXT: v_cndmask_b32_e64 v37, v39, v36, s[4:5] -; GFX9-NEXT: v_cndmask_b32_e32 v36, v32, v30, vcc -; GFX9-NEXT: v_cmp_class_f64_e64 vcc, v[30:31], 32 -; GFX9-NEXT: v_cndmask_b32_e64 v36, 0, v36, s[4:5] -; GFX9-NEXT: v_cmp_class_f64_e64 s[4:5], v[32:33], 32 -; GFX9-NEXT: v_cmp_eq_f64_e64 s[10:11], 0, v[34:35] -; GFX9-NEXT: v_cmp_eq_f64_e64 s[12:13], 0, v[36:37] -; GFX9-NEXT: v_cndmask_b32_e64 v28, v34, v28, s[6:7] -; GFX9-NEXT: v_cndmask_b32_e64 v29, v35, v29, s[6:7] -; GFX9-NEXT: v_cndmask_b32_e64 v28, v28, v50, s[8:9] -; GFX9-NEXT: v_cndmask_b32_e32 v30, v36, v30, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v31, v37, v31, vcc -; GFX9-NEXT: v_cndmask_b32_e64 v30, v30, v32, s[4:5] -; GFX9-NEXT: v_cndmask_b32_e64 v29, v29, v51, s[8:9] -; GFX9-NEXT: v_cndmask_b32_e64 v31, v31, v33, s[4:5] -; GFX9-NEXT: v_cndmask_b32_e64 v28, v34, v28, s[10:11] -; GFX9-NEXT: v_cndmask_b32_e64 v29, v35, v29, s[10:11] -; GFX9-NEXT: v_cndmask_b32_e64 v30, v36, v30, s[12:13] -; GFX9-NEXT: v_cndmask_b32_e64 v31, v37, v31, s[12:13] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: v_minimum_v16f64: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_accvgpr_write_b32 a0, v40 ; Reload Reuse -; GFX940-NEXT: v_accvgpr_write_b32 a1, v41 ; Reload Reuse -; GFX940-NEXT: v_accvgpr_write_b32 a2, v42 ; Reload Reuse -; GFX940-NEXT: v_accvgpr_write_b32 a3, v43 ; Reload Reuse -; GFX940-NEXT: v_accvgpr_write_b32 a4, v44 ; Reload Reuse -; GFX940-NEXT: v_accvgpr_write_b32 a5, v45 ; Reload Reuse -; GFX940-NEXT: v_accvgpr_write_b32 a6, v46 ; Reload Reuse -; GFX940-NEXT: v_accvgpr_write_b32 a7, v47 ; Reload Reuse -; GFX940-NEXT: scratch_load_dword v41, off, s32 offset:8 -; GFX940-NEXT: 
scratch_load_dword v40, off, s32 offset:4 -; GFX940-NEXT: scratch_load_dword v51, off, s32 offset:16 -; GFX940-NEXT: scratch_load_dword v50, off, s32 offset:12 -; GFX940-NEXT: scratch_load_dword v45, off, s32 offset:24 -; GFX940-NEXT: scratch_load_dword v44, off, s32 offset:20 -; GFX940-NEXT: scratch_load_dword v47, off, s32 offset:32 -; GFX940-NEXT: scratch_load_dword v46, off, s32 offset:28 +; GFX940-NEXT: v_accvgpr_write_b32 a1, v40 ; Reload Reuse +; GFX940-NEXT: v_accvgpr_write_b32 a2, v41 ; Reload Reuse +; GFX940-NEXT: v_accvgpr_write_b32 a3, v42 ; Reload Reuse +; GFX940-NEXT: v_accvgpr_write_b32 a4, v43 ; Reload Reuse +; GFX940-NEXT: v_accvgpr_write_b32 a5, v44 ; Reload Reuse +; GFX940-NEXT: v_accvgpr_write_b32 a6, v45 ; Reload Reuse +; GFX940-NEXT: v_accvgpr_write_b32 a7, v46 ; Reload Reuse +; GFX940-NEXT: v_accvgpr_write_b32 a8, v47 ; Reload Reuse +; GFX940-NEXT: v_accvgpr_write_b32 a9, v56 ; Reload Reuse +; GFX940-NEXT: v_accvgpr_write_b32 a10, v57 ; Reload Reuse +; GFX940-NEXT: scratch_load_dword v37, off, s32 offset:16 +; GFX940-NEXT: scratch_load_dword v36, off, s32 offset:12 +; GFX940-NEXT: scratch_load_dword v39, off, s32 offset:24 +; GFX940-NEXT: scratch_load_dword v38, off, s32 offset:20 +; GFX940-NEXT: scratch_load_dword v49, off, s32 offset:32 +; GFX940-NEXT: scratch_load_dword v48, off, s32 offset:28 +; GFX940-NEXT: scratch_load_dword v57, off, s32 offset:8 +; GFX940-NEXT: scratch_load_dword v56, off, s32 offset:4 +; GFX940-NEXT: scratch_load_dword v47, off, s32 offset:40 +; GFX940-NEXT: scratch_load_dword v46, off, s32 offset:36 +; GFX940-NEXT: scratch_load_dword v45, off, s32 offset:48 +; GFX940-NEXT: scratch_load_dword v44, off, s32 offset:44 +; GFX940-NEXT: scratch_load_dword v43, off, s32 offset:56 +; GFX940-NEXT: scratch_load_dword v42, off, s32 offset:52 +; GFX940-NEXT: scratch_load_dword v41, off, s32 offset:64 +; GFX940-NEXT: scratch_load_dword v40, off, s32 offset:60 +; GFX940-NEXT: scratch_load_dword v55, off, s32 offset:72 +; 
GFX940-NEXT: scratch_load_dword v54, off, s32 offset:68 +; GFX940-NEXT: scratch_load_dword v53, off, s32 offset:80 +; GFX940-NEXT: scratch_load_dword v52, off, s32 offset:76 +; GFX940-NEXT: scratch_load_dword v51, off, s32 offset:88 +; GFX940-NEXT: scratch_load_dword v50, off, s32 offset:84 +; GFX940-NEXT: scratch_load_dword v35, off, s32 offset:96 +; GFX940-NEXT: scratch_load_dword v34, off, s32 offset:92 ; GFX940-NEXT: scratch_load_dword v31, off, s32 -; GFX940-NEXT: scratch_load_dword v33, off, s32 offset:128 -; GFX940-NEXT: scratch_load_dword v32, off, s32 offset:124 -; GFX940-NEXT: scratch_load_dword v35, off, s32 offset:120 -; GFX940-NEXT: scratch_load_dword v34, off, s32 offset:116 -; GFX940-NEXT: scratch_load_dword v43, off, s32 offset:40 -; GFX940-NEXT: scratch_load_dword v42, off, s32 offset:36 +; GFX940-NEXT: scratch_load_dword v33, off, s32 offset:104 +; GFX940-NEXT: scratch_load_dword v32, off, s32 offset:100 +; GFX940-NEXT: v_accvgpr_write_b32 a11, v58 ; Reload Reuse +; GFX940-NEXT: v_accvgpr_write_b32 a12, v59 ; Reload Reuse +; GFX940-NEXT: v_accvgpr_write_b32 a13, v60 ; Reload Reuse +; GFX940-NEXT: v_accvgpr_write_b32 a14, v61 ; Reload Reuse +; GFX940-NEXT: v_accvgpr_write_b32 a15, v62 ; Reload Reuse +; GFX940-NEXT: v_accvgpr_write_b32 a16, v63 ; Reload Reuse +; GFX940-NEXT: s_waitcnt vmcnt(25) +; GFX940-NEXT: v_min_f64 v[58:59], v[2:3], v[36:37] +; GFX940-NEXT: v_cmp_u_f64_e32 vcc, v[2:3], v[36:37] ; GFX940-NEXT: scratch_load_dword v37, off, s32 offset:112 ; GFX940-NEXT: scratch_load_dword v36, off, s32 offset:108 -; GFX940-NEXT: scratch_load_dword v39, off, s32 offset:104 -; GFX940-NEXT: scratch_load_dword v38, off, s32 offset:100 -; GFX940-NEXT: scratch_load_dword v49, off, s32 offset:96 -; GFX940-NEXT: scratch_load_dword v48, off, s32 offset:92 -; GFX940-NEXT: scratch_load_dword v53, off, s32 offset:56 -; GFX940-NEXT: scratch_load_dword v52, off, s32 offset:52 -; GFX940-NEXT: scratch_load_dword v55, off, s32 offset:48 -; GFX940-NEXT: 
scratch_load_dword v54, off, s32 offset:44 -; GFX940-NEXT: v_accvgpr_write_b32 a8, v56 ; Reload Reuse -; GFX940-NEXT: v_accvgpr_write_b32 a9, v57 ; Reload Reuse -; GFX940-NEXT: v_mov_b32_e32 v56, 0x7ff80000 -; GFX940-NEXT: v_accvgpr_write_b32 a11, v59 ; Reload Reuse -; GFX940-NEXT: v_accvgpr_write_b32 a10, v58 ; Reload Reuse -; GFX940-NEXT: v_accvgpr_write_b32 a12, v60 ; Reload Reuse -; GFX940-NEXT: v_accvgpr_write_b32 a13, v61 ; Reload Reuse +; GFX940-NEXT: s_waitcnt vmcnt(25) +; GFX940-NEXT: v_min_f64 v[60:61], v[4:5], v[38:39] +; GFX940-NEXT: v_cmp_u_f64_e64 s[0:1], v[4:5], v[38:39] +; GFX940-NEXT: scratch_load_dword v39, off, s32 offset:120 +; GFX940-NEXT: scratch_load_dword v38, off, s32 offset:116 +; GFX940-NEXT: s_waitcnt vmcnt(25) +; GFX940-NEXT: v_min_f64 v[62:63], v[6:7], v[48:49] +; GFX940-NEXT: v_cmp_u_f64_e64 s[2:3], v[6:7], v[48:49] +; GFX940-NEXT: scratch_load_dword v49, off, s32 offset:128 +; GFX940-NEXT: scratch_load_dword v48, off, s32 offset:124 +; GFX940-NEXT: s_waitcnt vmcnt(25) +; GFX940-NEXT: v_min_f64 v[2:3], v[0:1], v[56:57] +; GFX940-NEXT: v_cmp_u_f64_e64 s[4:5], v[0:1], v[56:57] +; GFX940-NEXT: v_mov_b32_e32 v0, 0x7ff80000 ; GFX940-NEXT: s_waitcnt vmcnt(23) -; GFX940-NEXT: v_cmp_lt_f64_e32 vcc, v[0:1], v[40:41] -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v57, v41, v1, vcc -; GFX940-NEXT: v_cmp_o_f64_e64 s[0:1], v[0:1], v[40:41] -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e64 v59, v56, v57, s[0:1] -; GFX940-NEXT: v_cndmask_b32_e32 v57, v40, v0, vcc -; GFX940-NEXT: v_cndmask_b32_e64 v58, 0, v57, s[0:1] -; GFX940-NEXT: v_cmp_class_f64_e64 vcc, v[0:1], 32 -; GFX940-NEXT: v_cmp_class_f64_e64 s[0:1], v[40:41], 32 -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e32 v0, v58, v0, vcc -; GFX940-NEXT: v_cndmask_b32_e32 v1, v59, v1, vcc +; GFX940-NEXT: v_min_f64 v[56:57], v[8:9], v[46:47] +; GFX940-NEXT: v_cndmask_b32_e64 v1, v2, 0, s[4:5] +; GFX940-NEXT: v_accvgpr_write_b32 a0, v1 +; GFX940-NEXT: v_cndmask_b32_e64 v1, 
v3, v0, s[4:5] +; GFX940-NEXT: v_cndmask_b32_e64 v2, v58, 0, vcc +; GFX940-NEXT: v_cndmask_b32_e32 v3, v59, v0, vcc +; GFX940-NEXT: v_cmp_u_f64_e32 vcc, v[8:9], v[46:47] ; GFX940-NEXT: s_waitcnt vmcnt(21) -; GFX940-NEXT: v_cmp_lt_f64_e32 vcc, v[2:3], v[50:51] -; GFX940-NEXT: v_cndmask_b32_e64 v0, v0, v40, s[0:1] -; GFX940-NEXT: v_cndmask_b32_e64 v1, v1, v41, s[0:1] -; GFX940-NEXT: v_cndmask_b32_e32 v57, v51, v3, vcc -; GFX940-NEXT: v_cmp_o_f64_e64 s[0:1], v[2:3], v[50:51] -; GFX940-NEXT: v_cndmask_b32_e32 v40, v50, v2, vcc -; GFX940-NEXT: v_cmp_class_f64_e64 vcc, v[2:3], 32 -; GFX940-NEXT: v_cndmask_b32_e64 v61, v56, v57, s[0:1] -; GFX940-NEXT: v_cndmask_b32_e64 v60, 0, v40, s[0:1] -; GFX940-NEXT: v_cndmask_b32_e32 v2, v60, v2, vcc -; GFX940-NEXT: v_cndmask_b32_e32 v3, v61, v3, vcc -; GFX940-NEXT: v_cmp_class_f64_e64 vcc, v[50:51], 32 -; GFX940-NEXT: scratch_load_dword v41, off, s32 offset:64 -; GFX940-NEXT: scratch_load_dword v40, off, s32 offset:60 -; GFX940-NEXT: v_cndmask_b32_e32 v2, v2, v50, vcc -; GFX940-NEXT: v_cndmask_b32_e32 v3, v3, v51, vcc -; GFX940-NEXT: v_cmp_eq_f64_e32 vcc, 0, v[58:59] -; GFX940-NEXT: s_waitcnt vmcnt(21) -; GFX940-NEXT: v_cmp_o_f64_e64 s[0:1], v[4:5], v[44:45] -; GFX940-NEXT: scratch_load_dword v51, off, s32 offset:88 -; GFX940-NEXT: v_cndmask_b32_e32 v0, v58, v0, vcc -; GFX940-NEXT: v_cndmask_b32_e32 v1, v59, v1, vcc -; GFX940-NEXT: v_cmp_eq_f64_e32 vcc, 0, v[60:61] +; GFX940-NEXT: v_min_f64 v[46:47], v[10:11], v[44:45] +; GFX940-NEXT: v_cndmask_b32_e64 v4, v60, 0, s[0:1] +; GFX940-NEXT: v_cndmask_b32_e64 v8, v56, 0, vcc +; GFX940-NEXT: v_cndmask_b32_e32 v9, v57, v0, vcc +; GFX940-NEXT: v_cmp_u_f64_e32 vcc, v[10:11], v[44:45] +; GFX940-NEXT: s_waitcnt vmcnt(19) +; GFX940-NEXT: v_min_f64 v[44:45], v[12:13], v[42:43] +; GFX940-NEXT: v_cndmask_b32_e64 v5, v61, v0, s[0:1] +; GFX940-NEXT: v_cndmask_b32_e64 v10, v46, 0, vcc +; GFX940-NEXT: v_cndmask_b32_e32 v11, v47, v0, vcc +; GFX940-NEXT: v_cmp_u_f64_e32 vcc, v[12:13], v[42:43] +; 
GFX940-NEXT: s_waitcnt vmcnt(17) +; GFX940-NEXT: v_min_f64 v[42:43], v[14:15], v[40:41] +; GFX940-NEXT: v_cndmask_b32_e64 v6, v62, 0, s[2:3] +; GFX940-NEXT: v_cndmask_b32_e64 v12, v44, 0, vcc +; GFX940-NEXT: v_cndmask_b32_e32 v13, v45, v0, vcc +; GFX940-NEXT: v_cmp_u_f64_e32 vcc, v[14:15], v[40:41] +; GFX940-NEXT: s_waitcnt vmcnt(15) +; GFX940-NEXT: v_min_f64 v[40:41], v[16:17], v[54:55] +; GFX940-NEXT: v_cndmask_b32_e64 v7, v63, v0, s[2:3] +; GFX940-NEXT: v_cndmask_b32_e64 v14, v42, 0, vcc +; GFX940-NEXT: v_cndmask_b32_e32 v15, v43, v0, vcc +; GFX940-NEXT: v_cmp_u_f64_e32 vcc, v[16:17], v[54:55] +; GFX940-NEXT: s_waitcnt vmcnt(13) +; GFX940-NEXT: v_min_f64 v[54:55], v[18:19], v[52:53] +; GFX940-NEXT: v_accvgpr_read_b32 v63, a16 ; Reload Reuse +; GFX940-NEXT: v_cndmask_b32_e64 v16, v40, 0, vcc +; GFX940-NEXT: v_cndmask_b32_e32 v17, v41, v0, vcc +; GFX940-NEXT: v_cmp_u_f64_e32 vcc, v[18:19], v[52:53] +; GFX940-NEXT: s_waitcnt vmcnt(11) +; GFX940-NEXT: v_min_f64 v[52:53], v[20:21], v[50:51] +; GFX940-NEXT: v_accvgpr_read_b32 v62, a15 ; Reload Reuse +; GFX940-NEXT: v_cndmask_b32_e64 v18, v54, 0, vcc +; GFX940-NEXT: v_cndmask_b32_e32 v19, v55, v0, vcc +; GFX940-NEXT: v_cmp_u_f64_e32 vcc, v[20:21], v[50:51] +; GFX940-NEXT: s_waitcnt vmcnt(9) +; GFX940-NEXT: v_min_f64 v[50:51], v[22:23], v[34:35] +; GFX940-NEXT: v_accvgpr_read_b32 v61, a14 ; Reload Reuse +; GFX940-NEXT: v_cndmask_b32_e64 v20, v52, 0, vcc +; GFX940-NEXT: v_cndmask_b32_e32 v21, v53, v0, vcc +; GFX940-NEXT: v_cmp_u_f64_e32 vcc, v[22:23], v[34:35] +; GFX940-NEXT: s_waitcnt vmcnt(6) +; GFX940-NEXT: v_min_f64 v[34:35], v[24:25], v[32:33] +; GFX940-NEXT: v_accvgpr_read_b32 v60, a13 ; Reload Reuse +; GFX940-NEXT: v_cndmask_b32_e64 v22, v50, 0, vcc +; GFX940-NEXT: v_cndmask_b32_e32 v23, v51, v0, vcc +; GFX940-NEXT: v_cmp_u_f64_e32 vcc, v[24:25], v[32:33] +; GFX940-NEXT: v_accvgpr_read_b32 v59, a12 ; Reload Reuse +; GFX940-NEXT: v_accvgpr_read_b32 v58, a11 ; Reload Reuse +; GFX940-NEXT: v_cndmask_b32_e64 v24, v34, 
0, vcc +; GFX940-NEXT: v_cndmask_b32_e32 v25, v35, v0, vcc +; GFX940-NEXT: v_accvgpr_read_b32 v57, a10 ; Reload Reuse +; GFX940-NEXT: v_accvgpr_read_b32 v56, a9 ; Reload Reuse +; GFX940-NEXT: v_accvgpr_read_b32 v47, a8 ; Reload Reuse +; GFX940-NEXT: v_accvgpr_read_b32 v46, a7 ; Reload Reuse +; GFX940-NEXT: v_accvgpr_read_b32 v45, a6 ; Reload Reuse +; GFX940-NEXT: v_accvgpr_read_b32 v44, a5 ; Reload Reuse +; GFX940-NEXT: v_accvgpr_read_b32 v43, a4 ; Reload Reuse +; GFX940-NEXT: v_accvgpr_read_b32 v42, a3 ; Reload Reuse +; GFX940-NEXT: v_accvgpr_read_b32 v41, a2 ; Reload Reuse +; GFX940-NEXT: v_accvgpr_read_b32 v40, a1 ; Reload Reuse +; GFX940-NEXT: s_waitcnt vmcnt(4) +; GFX940-NEXT: v_min_f64 v[32:33], v[26:27], v[36:37] +; GFX940-NEXT: v_cmp_u_f64_e32 vcc, v[26:27], v[36:37] ; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v2, v60, v2, vcc -; GFX940-NEXT: v_cndmask_b32_e32 v3, v61, v3, vcc -; GFX940-NEXT: v_cmp_lt_f64_e32 vcc, v[4:5], v[44:45] -; GFX940-NEXT: v_accvgpr_read_b32 v61, a13 ; Reload Reuse -; GFX940-NEXT: v_accvgpr_read_b32 v60, a12 ; Reload Reuse -; GFX940-NEXT: v_cndmask_b32_e32 v50, v45, v5, vcc -; GFX940-NEXT: v_cndmask_b32_e64 v59, v56, v50, s[0:1] -; GFX940-NEXT: v_cndmask_b32_e32 v50, v44, v4, vcc -; GFX940-NEXT: v_cndmask_b32_e64 v58, 0, v50, s[0:1] -; GFX940-NEXT: v_cmp_class_f64_e64 vcc, v[4:5], 32 -; GFX940-NEXT: v_cmp_class_f64_e64 s[0:1], v[44:45], 32 -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e32 v4, v58, v4, vcc -; GFX940-NEXT: v_cndmask_b32_e32 v5, v59, v5, vcc -; GFX940-NEXT: v_cndmask_b32_e64 v4, v4, v44, s[0:1] -; GFX940-NEXT: v_cndmask_b32_e64 v5, v5, v45, s[0:1] -; GFX940-NEXT: scratch_load_dword v45, off, s32 offset:72 -; GFX940-NEXT: scratch_load_dword v44, off, s32 offset:68 -; GFX940-NEXT: v_cmp_eq_f64_e32 vcc, 0, v[58:59] -; GFX940-NEXT: s_waitcnt vmcnt(22) -; GFX940-NEXT: v_cmp_o_f64_e64 s[0:1], v[6:7], v[46:47] -; GFX940-NEXT: v_cndmask_b32_e32 v4, v58, v4, vcc -; GFX940-NEXT: v_cndmask_b32_e32 v5, v59, v5, 
vcc -; GFX940-NEXT: v_cmp_lt_f64_e32 vcc, v[6:7], v[46:47] +; GFX940-NEXT: v_cndmask_b32_e64 v26, v32, 0, vcc +; GFX940-NEXT: v_cndmask_b32_e32 v27, v33, v0, vcc +; GFX940-NEXT: s_waitcnt vmcnt(2) +; GFX940-NEXT: v_min_f64 v[32:33], v[28:29], v[38:39] +; GFX940-NEXT: v_cmp_u_f64_e32 vcc, v[28:29], v[38:39] ; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v50, v47, v7, vcc -; GFX940-NEXT: v_cndmask_b32_e64 v59, v56, v50, s[0:1] -; GFX940-NEXT: v_cndmask_b32_e32 v50, v46, v6, vcc -; GFX940-NEXT: v_cndmask_b32_e64 v58, 0, v50, s[0:1] -; GFX940-NEXT: v_cmp_class_f64_e64 vcc, v[6:7], 32 -; GFX940-NEXT: v_cmp_class_f64_e64 s[0:1], v[46:47], 32 -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e32 v6, v58, v6, vcc -; GFX940-NEXT: v_cndmask_b32_e32 v7, v59, v7, vcc -; GFX940-NEXT: v_cndmask_b32_e64 v6, v6, v46, s[0:1] -; GFX940-NEXT: v_cndmask_b32_e64 v7, v7, v47, s[0:1] -; GFX940-NEXT: scratch_load_dword v47, off, s32 offset:80 -; GFX940-NEXT: scratch_load_dword v46, off, s32 offset:76 -; GFX940-NEXT: scratch_load_dword v50, off, s32 offset:84 -; GFX940-NEXT: v_cmp_eq_f64_e32 vcc, 0, v[58:59] -; GFX940-NEXT: s_waitcnt vmcnt(18) -; GFX940-NEXT: v_cmp_o_f64_e64 s[0:1], v[8:9], v[42:43] -; GFX940-NEXT: v_cndmask_b32_e32 v6, v58, v6, vcc -; GFX940-NEXT: v_cndmask_b32_e32 v7, v59, v7, vcc -; GFX940-NEXT: v_cmp_lt_f64_e32 vcc, v[8:9], v[42:43] -; GFX940-NEXT: s_nop 1 -; GFX940-NEXT: v_cndmask_b32_e32 v57, v43, v9, vcc -; GFX940-NEXT: v_cndmask_b32_e64 v59, v56, v57, s[0:1] -; GFX940-NEXT: v_cndmask_b32_e32 v57, v42, v8, vcc -; GFX940-NEXT: v_cndmask_b32_e64 v58, 0, v57, s[0:1] -; GFX940-NEXT: v_cmp_class_f64_e64 vcc, v[8:9], 32 -; GFX940-NEXT: v_cmp_class_f64_e64 s[0:1], v[42:43], 32 -; GFX940-NEXT: v_cmp_eq_f64_e64 s[2:3], 0, v[58:59] -; GFX940-NEXT: v_cndmask_b32_e32 v8, v58, v8, vcc -; GFX940-NEXT: v_cndmask_b32_e32 v9, v59, v9, vcc -; GFX940-NEXT: s_waitcnt vmcnt(8) -; GFX940-NEXT: v_cmp_lt_f64_e32 vcc, v[10:11], v[54:55] -; GFX940-NEXT: v_cndmask_b32_e64 v8, 
v8, v42, s[0:1] -; GFX940-NEXT: v_cndmask_b32_e64 v9, v9, v43, s[0:1] -; GFX940-NEXT: v_cndmask_b32_e32 v42, v55, v11, vcc -; GFX940-NEXT: v_cmp_o_f64_e64 s[0:1], v[10:11], v[54:55] -; GFX940-NEXT: v_cndmask_b32_e64 v8, v58, v8, s[2:3] -; GFX940-NEXT: v_cndmask_b32_e64 v9, v59, v9, s[2:3] -; GFX940-NEXT: v_cndmask_b32_e64 v43, v56, v42, s[0:1] -; GFX940-NEXT: v_cndmask_b32_e32 v42, v54, v10, vcc -; GFX940-NEXT: v_cndmask_b32_e64 v42, 0, v42, s[0:1] -; GFX940-NEXT: v_cmp_class_f64_e64 vcc, v[10:11], 32 -; GFX940-NEXT: v_cmp_class_f64_e64 s[0:1], v[54:55], 32 -; GFX940-NEXT: v_cmp_eq_f64_e64 s[2:3], 0, v[42:43] -; GFX940-NEXT: v_cndmask_b32_e32 v10, v42, v10, vcc -; GFX940-NEXT: v_cndmask_b32_e32 v11, v43, v11, vcc -; GFX940-NEXT: v_cmp_lt_f64_e32 vcc, v[12:13], v[52:53] -; GFX940-NEXT: v_cndmask_b32_e64 v10, v10, v54, s[0:1] -; GFX940-NEXT: v_cndmask_b32_e64 v11, v11, v55, s[0:1] -; GFX940-NEXT: v_cndmask_b32_e32 v54, v53, v13, vcc -; GFX940-NEXT: v_cmp_o_f64_e64 s[0:1], v[12:13], v[52:53] -; GFX940-NEXT: v_cndmask_b32_e64 v10, v42, v10, s[2:3] -; GFX940-NEXT: v_cndmask_b32_e64 v11, v43, v11, s[2:3] -; GFX940-NEXT: v_cndmask_b32_e64 v55, v56, v54, s[0:1] -; GFX940-NEXT: v_cndmask_b32_e32 v54, v52, v12, vcc -; GFX940-NEXT: v_cndmask_b32_e64 v54, 0, v54, s[0:1] -; GFX940-NEXT: v_cmp_class_f64_e64 vcc, v[12:13], 32 -; GFX940-NEXT: v_cmp_class_f64_e64 s[0:1], v[52:53], 32 -; GFX940-NEXT: v_cmp_eq_f64_e64 s[2:3], 0, v[54:55] -; GFX940-NEXT: v_cndmask_b32_e32 v12, v54, v12, vcc -; GFX940-NEXT: v_cndmask_b32_e32 v13, v55, v13, vcc -; GFX940-NEXT: s_waitcnt vmcnt(6) -; GFX940-NEXT: v_cmp_lt_f64_e32 vcc, v[14:15], v[40:41] -; GFX940-NEXT: v_cndmask_b32_e64 v12, v12, v52, s[0:1] -; GFX940-NEXT: v_cndmask_b32_e64 v13, v13, v53, s[0:1] -; GFX940-NEXT: v_cndmask_b32_e32 v52, v41, v15, vcc -; GFX940-NEXT: v_cmp_o_f64_e64 s[0:1], v[14:15], v[40:41] -; GFX940-NEXT: v_cndmask_b32_e64 v12, v54, v12, s[2:3] -; GFX940-NEXT: v_cndmask_b32_e64 v13, v55, v13, s[2:3] -; GFX940-NEXT: 
v_cndmask_b32_e64 v53, v56, v52, s[0:1] -; GFX940-NEXT: v_cndmask_b32_e32 v52, v40, v14, vcc -; GFX940-NEXT: v_cndmask_b32_e64 v52, 0, v52, s[0:1] -; GFX940-NEXT: v_cmp_class_f64_e64 vcc, v[14:15], 32 -; GFX940-NEXT: v_cmp_class_f64_e64 s[0:1], v[40:41], 32 -; GFX940-NEXT: v_cmp_eq_f64_e64 s[2:3], 0, v[52:53] -; GFX940-NEXT: v_cndmask_b32_e32 v14, v52, v14, vcc -; GFX940-NEXT: v_cndmask_b32_e64 v14, v14, v40, s[0:1] -; GFX940-NEXT: v_cndmask_b32_e32 v15, v53, v15, vcc -; GFX940-NEXT: s_waitcnt vmcnt(3) -; GFX940-NEXT: v_cmp_lt_f64_e32 vcc, v[16:17], v[44:45] -; GFX940-NEXT: v_cndmask_b32_e64 v14, v52, v14, s[2:3] -; GFX940-NEXT: v_cndmask_b32_e64 v15, v15, v41, s[0:1] -; GFX940-NEXT: v_cndmask_b32_e32 v52, v45, v17, vcc -; GFX940-NEXT: v_cmp_o_f64_e64 s[0:1], v[16:17], v[44:45] -; GFX940-NEXT: v_cndmask_b32_e64 v15, v53, v15, s[2:3] -; GFX940-NEXT: v_accvgpr_read_b32 v59, a11 ; Reload Reuse -; GFX940-NEXT: v_cndmask_b32_e64 v53, v56, v52, s[0:1] -; GFX940-NEXT: v_cndmask_b32_e32 v52, v44, v16, vcc -; GFX940-NEXT: v_cndmask_b32_e64 v52, 0, v52, s[0:1] -; GFX940-NEXT: v_cmp_class_f64_e64 vcc, v[16:17], 32 -; GFX940-NEXT: v_cmp_class_f64_e64 s[0:1], v[44:45], 32 -; GFX940-NEXT: v_cmp_eq_f64_e64 s[2:3], 0, v[52:53] -; GFX940-NEXT: v_cndmask_b32_e32 v16, v52, v16, vcc -; GFX940-NEXT: v_cndmask_b32_e64 v16, v16, v44, s[0:1] -; GFX940-NEXT: v_cndmask_b32_e32 v17, v53, v17, vcc -; GFX940-NEXT: s_waitcnt vmcnt(1) -; GFX940-NEXT: v_cmp_lt_f64_e32 vcc, v[18:19], v[46:47] -; GFX940-NEXT: v_cndmask_b32_e64 v16, v52, v16, s[2:3] -; GFX940-NEXT: v_cndmask_b32_e64 v17, v17, v45, s[0:1] -; GFX940-NEXT: v_cndmask_b32_e32 v52, v47, v19, vcc -; GFX940-NEXT: v_cmp_o_f64_e64 s[0:1], v[18:19], v[46:47] -; GFX940-NEXT: v_cndmask_b32_e64 v17, v53, v17, s[2:3] -; GFX940-NEXT: v_accvgpr_read_b32 v58, a10 ; Reload Reuse -; GFX940-NEXT: v_cndmask_b32_e64 v53, v56, v52, s[0:1] -; GFX940-NEXT: v_cndmask_b32_e32 v52, v46, v18, vcc -; GFX940-NEXT: v_cndmask_b32_e64 v52, 0, v52, s[0:1] -; 
GFX940-NEXT: v_cmp_class_f64_e64 vcc, v[18:19], 32 -; GFX940-NEXT: v_cmp_class_f64_e64 s[0:1], v[46:47], 32 -; GFX940-NEXT: v_cmp_eq_f64_e64 s[2:3], 0, v[52:53] -; GFX940-NEXT: v_cndmask_b32_e32 v18, v52, v18, vcc -; GFX940-NEXT: v_cndmask_b32_e64 v18, v18, v46, s[0:1] -; GFX940-NEXT: v_cndmask_b32_e32 v19, v53, v19, vcc +; GFX940-NEXT: v_cndmask_b32_e64 v28, v32, 0, vcc +; GFX940-NEXT: v_cndmask_b32_e32 v29, v33, v0, vcc ; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_cmp_lt_f64_e32 vcc, v[20:21], v[50:51] -; GFX940-NEXT: v_cndmask_b32_e64 v18, v52, v18, s[2:3] -; GFX940-NEXT: v_cndmask_b32_e64 v19, v19, v47, s[0:1] -; GFX940-NEXT: v_cndmask_b32_e32 v52, v51, v21, vcc -; GFX940-NEXT: v_cmp_o_f64_e64 s[0:1], v[20:21], v[50:51] -; GFX940-NEXT: v_cndmask_b32_e64 v19, v53, v19, s[2:3] -; GFX940-NEXT: v_accvgpr_read_b32 v57, a9 ; Reload Reuse -; GFX940-NEXT: v_cndmask_b32_e64 v53, v56, v52, s[0:1] -; GFX940-NEXT: v_cndmask_b32_e32 v52, v50, v20, vcc -; GFX940-NEXT: v_cndmask_b32_e64 v52, 0, v52, s[0:1] -; GFX940-NEXT: v_cmp_class_f64_e64 vcc, v[20:21], 32 -; GFX940-NEXT: v_cmp_class_f64_e64 s[0:1], v[50:51], 32 -; GFX940-NEXT: v_cmp_eq_f64_e64 s[2:3], 0, v[52:53] -; GFX940-NEXT: v_cndmask_b32_e32 v20, v52, v20, vcc -; GFX940-NEXT: v_cndmask_b32_e32 v21, v53, v21, vcc -; GFX940-NEXT: v_cmp_lt_f64_e32 vcc, v[22:23], v[48:49] -; GFX940-NEXT: v_cndmask_b32_e64 v20, v20, v50, s[0:1] -; GFX940-NEXT: v_cndmask_b32_e64 v21, v21, v51, s[0:1] -; GFX940-NEXT: v_cndmask_b32_e32 v50, v49, v23, vcc -; GFX940-NEXT: v_cmp_o_f64_e64 s[0:1], v[22:23], v[48:49] -; GFX940-NEXT: v_cndmask_b32_e64 v20, v52, v20, s[2:3] -; GFX940-NEXT: v_cndmask_b32_e64 v21, v53, v21, s[2:3] -; GFX940-NEXT: v_cndmask_b32_e64 v51, v56, v50, s[0:1] -; GFX940-NEXT: v_cndmask_b32_e32 v50, v48, v22, vcc -; GFX940-NEXT: v_cndmask_b32_e64 v50, 0, v50, s[0:1] -; GFX940-NEXT: v_cmp_class_f64_e64 vcc, v[22:23], 32 -; GFX940-NEXT: v_cmp_class_f64_e64 s[0:1], v[48:49], 32 -; GFX940-NEXT: v_cmp_eq_f64_e64 s[2:3], 0, 
v[50:51] -; GFX940-NEXT: v_cndmask_b32_e32 v22, v50, v22, vcc -; GFX940-NEXT: v_cndmask_b32_e32 v23, v51, v23, vcc -; GFX940-NEXT: v_cmp_lt_f64_e32 vcc, v[24:25], v[38:39] -; GFX940-NEXT: v_cndmask_b32_e64 v22, v22, v48, s[0:1] -; GFX940-NEXT: v_cndmask_b32_e64 v23, v23, v49, s[0:1] -; GFX940-NEXT: v_cndmask_b32_e32 v48, v39, v25, vcc -; GFX940-NEXT: v_cmp_o_f64_e64 s[0:1], v[24:25], v[38:39] -; GFX940-NEXT: v_cndmask_b32_e64 v22, v50, v22, s[2:3] -; GFX940-NEXT: v_cndmask_b32_e64 v23, v51, v23, s[2:3] -; GFX940-NEXT: v_cndmask_b32_e64 v49, v56, v48, s[0:1] -; GFX940-NEXT: v_cndmask_b32_e32 v48, v38, v24, vcc -; GFX940-NEXT: v_cndmask_b32_e64 v48, 0, v48, s[0:1] -; GFX940-NEXT: v_cmp_class_f64_e64 vcc, v[24:25], 32 -; GFX940-NEXT: v_cmp_class_f64_e64 s[0:1], v[38:39], 32 -; GFX940-NEXT: v_cmp_eq_f64_e64 s[2:3], 0, v[48:49] -; GFX940-NEXT: v_cndmask_b32_e32 v24, v48, v24, vcc -; GFX940-NEXT: v_cndmask_b32_e32 v25, v49, v25, vcc -; GFX940-NEXT: v_cmp_lt_f64_e32 vcc, v[26:27], v[36:37] -; GFX940-NEXT: v_cndmask_b32_e64 v24, v24, v38, s[0:1] -; GFX940-NEXT: v_cndmask_b32_e64 v25, v25, v39, s[0:1] -; GFX940-NEXT: v_cndmask_b32_e32 v38, v37, v27, vcc -; GFX940-NEXT: v_cmp_o_f64_e64 s[0:1], v[26:27], v[36:37] -; GFX940-NEXT: v_cndmask_b32_e64 v24, v48, v24, s[2:3] -; GFX940-NEXT: v_cndmask_b32_e64 v25, v49, v25, s[2:3] -; GFX940-NEXT: v_cndmask_b32_e64 v39, v56, v38, s[0:1] -; GFX940-NEXT: v_cndmask_b32_e32 v38, v36, v26, vcc -; GFX940-NEXT: v_cndmask_b32_e64 v38, 0, v38, s[0:1] -; GFX940-NEXT: v_cmp_class_f64_e64 vcc, v[26:27], 32 -; GFX940-NEXT: v_cmp_class_f64_e64 s[0:1], v[36:37], 32 -; GFX940-NEXT: v_cmp_eq_f64_e64 s[2:3], 0, v[38:39] -; GFX940-NEXT: v_cndmask_b32_e32 v26, v38, v26, vcc -; GFX940-NEXT: v_cndmask_b32_e32 v27, v39, v27, vcc -; GFX940-NEXT: v_cmp_lt_f64_e32 vcc, v[28:29], v[34:35] -; GFX940-NEXT: v_cndmask_b32_e64 v26, v26, v36, s[0:1] -; GFX940-NEXT: v_cndmask_b32_e64 v27, v27, v37, s[0:1] -; GFX940-NEXT: v_cndmask_b32_e32 v36, v35, v29, vcc -; 
GFX940-NEXT: v_cmp_o_f64_e64 s[0:1], v[28:29], v[34:35] -; GFX940-NEXT: v_cndmask_b32_e64 v26, v38, v26, s[2:3] -; GFX940-NEXT: v_cndmask_b32_e64 v27, v39, v27, s[2:3] -; GFX940-NEXT: v_cndmask_b32_e64 v37, v56, v36, s[0:1] -; GFX940-NEXT: v_cndmask_b32_e32 v36, v34, v28, vcc -; GFX940-NEXT: v_cndmask_b32_e64 v36, 0, v36, s[0:1] -; GFX940-NEXT: v_cmp_class_f64_e64 vcc, v[28:29], 32 -; GFX940-NEXT: v_cmp_class_f64_e64 s[0:1], v[34:35], 32 -; GFX940-NEXT: v_cmp_eq_f64_e64 s[2:3], 0, v[36:37] -; GFX940-NEXT: v_cndmask_b32_e32 v28, v36, v28, vcc -; GFX940-NEXT: v_cndmask_b32_e32 v29, v37, v29, vcc -; GFX940-NEXT: v_cmp_lt_f64_e32 vcc, v[30:31], v[32:33] -; GFX940-NEXT: v_cndmask_b32_e64 v28, v28, v34, s[0:1] -; GFX940-NEXT: v_cndmask_b32_e64 v29, v29, v35, s[0:1] -; GFX940-NEXT: v_cndmask_b32_e32 v34, v33, v31, vcc -; GFX940-NEXT: v_cmp_o_f64_e64 s[0:1], v[30:31], v[32:33] -; GFX940-NEXT: v_cndmask_b32_e64 v28, v36, v28, s[2:3] -; GFX940-NEXT: v_cndmask_b32_e64 v29, v37, v29, s[2:3] -; GFX940-NEXT: v_cndmask_b32_e64 v35, v56, v34, s[0:1] -; GFX940-NEXT: v_cndmask_b32_e32 v34, v32, v30, vcc -; GFX940-NEXT: v_cndmask_b32_e64 v34, 0, v34, s[0:1] -; GFX940-NEXT: v_cmp_class_f64_e64 vcc, v[30:31], 32 -; GFX940-NEXT: v_cmp_class_f64_e64 s[0:1], v[32:33], 32 -; GFX940-NEXT: v_cmp_eq_f64_e64 s[2:3], 0, v[34:35] -; GFX940-NEXT: v_cndmask_b32_e32 v30, v34, v30, vcc -; GFX940-NEXT: v_cndmask_b32_e32 v31, v35, v31, vcc -; GFX940-NEXT: v_cndmask_b32_e64 v30, v30, v32, s[0:1] -; GFX940-NEXT: v_cndmask_b32_e64 v31, v31, v33, s[0:1] -; GFX940-NEXT: v_cndmask_b32_e64 v30, v34, v30, s[2:3] -; GFX940-NEXT: v_cndmask_b32_e64 v31, v35, v31, s[2:3] -; GFX940-NEXT: v_accvgpr_read_b32 v56, a8 ; Reload Reuse -; GFX940-NEXT: v_accvgpr_read_b32 v47, a7 ; Reload Reuse -; GFX940-NEXT: v_accvgpr_read_b32 v46, a6 ; Reload Reuse -; GFX940-NEXT: v_accvgpr_read_b32 v45, a5 ; Reload Reuse -; GFX940-NEXT: v_accvgpr_read_b32 v44, a4 ; Reload Reuse -; GFX940-NEXT: v_accvgpr_read_b32 v43, a3 ; Reload Reuse 
-; GFX940-NEXT: v_accvgpr_read_b32 v42, a2 ; Reload Reuse -; GFX940-NEXT: v_accvgpr_read_b32 v41, a1 ; Reload Reuse -; GFX940-NEXT: v_accvgpr_read_b32 v40, a0 ; Reload Reuse +; GFX940-NEXT: v_min_f64 v[32:33], v[30:31], v[48:49] +; GFX940-NEXT: v_cmp_u_f64_e32 vcc, v[30:31], v[48:49] +; GFX940-NEXT: s_nop 1 +; GFX940-NEXT: v_cndmask_b32_e64 v30, v32, 0, vcc +; GFX940-NEXT: v_cndmask_b32_e32 v31, v33, v0, vcc +; GFX940-NEXT: v_accvgpr_read_b32 v0, a0 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_minimum_v16f64: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: s_clause 0x20 +; GFX10-NEXT: s_clause 0x19 +; GFX10-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:16 +; GFX10-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:12 +; GFX10-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:24 +; GFX10-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:20 +; GFX10-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:32 +; GFX10-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:28 +; GFX10-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:36 +; GFX10-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:68 +; GFX10-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:64 +; GFX10-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:60 +; GFX10-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:56 +; GFX10-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:52 +; GFX10-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:48 +; GFX10-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:44 +; GFX10-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:40 ; GFX10-NEXT: buffer_load_dword v65, off, s[0:3], s32 offset:8 ; GFX10-NEXT: buffer_load_dword v64, off, s[0:3], s32 offset:4 -; GFX10-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:16 -; GFX10-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:12 -; GFX10-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:24 -; GFX10-NEXT: buffer_load_dword v52, off, 
s[0:3], s32 offset:20 -; GFX10-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:32 -; GFX10-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:28 -; GFX10-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:36 -; GFX10-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:40 -; GFX10-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:56 -; GFX10-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:52 -; GFX10-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:48 -; GFX10-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:44 -; GFX10-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:64 -; GFX10-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:60 -; GFX10-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:68 -; GFX10-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:72 -; GFX10-NEXT: buffer_load_dword v83, off, s[0:3], s32 offset:80 -; GFX10-NEXT: buffer_load_dword v82, off, s[0:3], s32 offset:76 -; GFX10-NEXT: buffer_load_dword v67, off, s[0:3], s32 offset:88 -; GFX10-NEXT: buffer_load_dword v66, off, s[0:3], s32 offset:84 +; GFX10-NEXT: buffer_load_dword v66, off, s[0:3], s32 offset:100 ; GFX10-NEXT: buffer_load_dword v69, off, s[0:3], s32 offset:96 ; GFX10-NEXT: buffer_load_dword v68, off, s[0:3], s32 offset:92 -; GFX10-NEXT: buffer_load_dword v70, off, s[0:3], s32 offset:100 -; GFX10-NEXT: buffer_load_dword v71, off, s[0:3], s32 offset:104 -; GFX10-NEXT: buffer_load_dword v81, off, s[0:3], s32 offset:112 -; GFX10-NEXT: buffer_load_dword v80, off, s[0:3], s32 offset:108 -; GFX10-NEXT: buffer_load_dword v85, off, s[0:3], s32 offset:120 -; GFX10-NEXT: buffer_load_dword v84, off, s[0:3], s32 offset:116 +; GFX10-NEXT: buffer_load_dword v71, off, s[0:3], s32 offset:88 +; GFX10-NEXT: buffer_load_dword v70, off, s[0:3], s32 offset:84 +; GFX10-NEXT: buffer_load_dword v81, off, s[0:3], s32 offset:80 +; GFX10-NEXT: buffer_load_dword v80, off, s[0:3], s32 offset:76 +; GFX10-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:72 +; GFX10-NEXT: 
buffer_load_dword v67, off, s[0:3], s32 offset:104 +; GFX10-NEXT: s_waitcnt vmcnt(24) +; GFX10-NEXT: v_min_f64 v[82:83], v[2:3], v[31:32] +; GFX10-NEXT: v_cmp_u_f64_e32 vcc_lo, v[2:3], v[31:32] +; GFX10-NEXT: s_waitcnt vmcnt(22) +; GFX10-NEXT: v_min_f64 v[84:85], v[4:5], v[33:34] +; GFX10-NEXT: v_cmp_u_f64_e64 s4, v[4:5], v[33:34] +; GFX10-NEXT: s_clause 0x3 +; GFX10-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:120 +; GFX10-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:116 +; GFX10-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:112 +; GFX10-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:108 +; GFX10-NEXT: s_waitcnt vmcnt(24) +; GFX10-NEXT: v_min_f64 v[32:33], v[6:7], v[35:36] +; GFX10-NEXT: v_cmp_u_f64_e64 s5, v[6:7], v[35:36] +; GFX10-NEXT: s_clause 0x2 ; GFX10-NEXT: buffer_load_dword v31, off, s[0:3], s32 -; GFX10-NEXT: buffer_load_dword v87, off, s[0:3], s32 offset:128 -; GFX10-NEXT: buffer_load_dword v86, off, s[0:3], s32 offset:124 -; GFX10-NEXT: v_cmp_class_f64_e64 s10, v[0:1], 32 -; GFX10-NEXT: v_cmp_class_f64_e64 s12, v[2:3], 32 -; GFX10-NEXT: v_cmp_class_f64_e64 s17, v[10:11], 32 -; GFX10-NEXT: s_waitcnt vmcnt(31) -; GFX10-NEXT: v_cmp_lt_f64_e32 vcc_lo, v[0:1], v[64:65] -; GFX10-NEXT: v_cmp_o_f64_e64 s4, v[0:1], v[64:65] -; GFX10-NEXT: s_waitcnt vmcnt(29) -; GFX10-NEXT: v_cmp_lt_f64_e64 s5, v[2:3], v[54:55] -; GFX10-NEXT: v_cmp_o_f64_e64 s6, v[2:3], v[54:55] -; GFX10-NEXT: s_waitcnt vmcnt(27) -; GFX10-NEXT: v_cmp_lt_f64_e64 s7, v[4:5], v[52:53] -; GFX10-NEXT: v_cmp_o_f64_e64 s8, v[4:5], v[52:53] -; GFX10-NEXT: s_waitcnt vmcnt(25) -; GFX10-NEXT: v_cmp_lt_f64_e64 s9, v[6:7], v[50:51] -; GFX10-NEXT: v_cmp_o_f64_e64 s11, v[6:7], v[50:51] +; GFX10-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:128 +; GFX10-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:124 ; GFX10-NEXT: s_waitcnt vmcnt(23) -; GFX10-NEXT: v_cmp_lt_f64_e64 s13, v[8:9], v[48:49] -; GFX10-NEXT: v_cmp_class_f64_e64 s14, v[64:65], 32 +; GFX10-NEXT: v_cmp_u_f64_e64 
s10, v[14:15], v[50:51] ; GFX10-NEXT: s_waitcnt vmcnt(21) -; GFX10-NEXT: v_cmp_lt_f64_e64 s15, v[12:13], v[36:37] -; GFX10-NEXT: s_waitcnt vmcnt(17) -; GFX10-NEXT: v_cmp_o_f64_e64 s16, v[14:15], v[34:35] -; GFX10-NEXT: v_cndmask_b32_e32 v96, v64, v0, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v97, v54, v2, s5 -; GFX10-NEXT: v_cndmask_b32_e64 v99, v55, v3, s5 -; GFX10-NEXT: v_cndmask_b32_e64 v100, v52, v4, s7 -; GFX10-NEXT: v_cndmask_b32_e64 v96, 0, v96, s4 -; GFX10-NEXT: v_cndmask_b32_e64 v101, v50, v6, s9 -; GFX10-NEXT: v_cndmask_b32_e64 v98, 0, v97, s6 -; GFX10-NEXT: v_cndmask_b32_e32 v97, v65, v1, vcc_lo -; GFX10-NEXT: v_cmp_class_f64_e64 vcc_lo, v[54:55], 32 -; GFX10-NEXT: v_cndmask_b32_e64 v0, v96, v0, s10 -; GFX10-NEXT: v_cndmask_b32_e64 v99, 0x7ff80000, v99, s6 -; GFX10-NEXT: v_cndmask_b32_e64 v2, v98, v2, s12 -; GFX10-NEXT: v_cndmask_b32_e64 v97, 0x7ff80000, v97, s4 -; GFX10-NEXT: v_cndmask_b32_e64 v100, 0, v100, s8 -; GFX10-NEXT: v_cndmask_b32_e64 v102, 0, v101, s11 -; GFX10-NEXT: v_cndmask_b32_e64 v3, v99, v3, s12 -; GFX10-NEXT: v_cmp_class_f64_e64 s12, v[6:7], 32 -; GFX10-NEXT: v_cndmask_b32_e64 v1, v97, v1, s10 -; GFX10-NEXT: v_cmp_class_f64_e64 s10, v[4:5], 32 -; GFX10-NEXT: v_cmp_lt_f64_e64 s4, v[10:11], v[38:39] -; GFX10-NEXT: v_cndmask_b32_e64 v112, v48, v8, s13 -; GFX10-NEXT: v_cmp_o_f64_e64 s5, v[12:13], v[36:37] -; GFX10-NEXT: v_cmp_lt_f64_e64 s6, v[14:15], v[34:35] -; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, v64, s14 -; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, v65, s14 -; GFX10-NEXT: v_cmp_class_f64_e64 s14, v[52:53], 32 -; GFX10-NEXT: v_cndmask_b32_e64 v113, v36, v12, s15 +; GFX10-NEXT: v_cmp_u_f64_e64 s9, v[12:13], v[52:53] +; GFX10-NEXT: s_waitcnt vmcnt(19) +; GFX10-NEXT: v_cmp_u_f64_e64 s7, v[10:11], v[54:55] +; GFX10-NEXT: s_waitcnt vmcnt(18) +; GFX10-NEXT: v_min_f64 v[34:35], v[8:9], v[37:38] +; GFX10-NEXT: v_cmp_u_f64_e64 s6, v[8:9], v[37:38] +; GFX10-NEXT: s_waitcnt vmcnt(16) +; GFX10-NEXT: v_min_f64 v[8:9], v[0:1], v[64:65] +; GFX10-NEXT: 
v_min_f64 v[36:37], v[10:11], v[54:55] +; GFX10-NEXT: v_cmp_u_f64_e64 s8, v[0:1], v[64:65] +; GFX10-NEXT: v_min_f64 v[38:39], v[12:13], v[52:53] +; GFX10-NEXT: v_min_f64 v[52:53], v[14:15], v[50:51] +; GFX10-NEXT: s_waitcnt vmcnt(11) +; GFX10-NEXT: v_min_f64 v[54:55], v[20:21], v[70:71] +; GFX10-NEXT: v_cmp_u_f64_e64 s13, v[20:21], v[70:71] +; GFX10-NEXT: s_waitcnt vmcnt(9) +; GFX10-NEXT: v_cmp_u_f64_e64 s12, v[18:19], v[80:81] +; GFX10-NEXT: s_waitcnt vmcnt(8) +; GFX10-NEXT: v_min_f64 v[50:51], v[16:17], v[48:49] +; GFX10-NEXT: v_cmp_u_f64_e64 s11, v[16:17], v[48:49] +; GFX10-NEXT: v_min_f64 v[48:49], v[18:19], v[80:81] +; GFX10-NEXT: v_min_f64 v[64:65], v[22:23], v[68:69] +; GFX10-NEXT: v_cmp_u_f64_e64 s14, v[22:23], v[68:69] +; GFX10-NEXT: s_waitcnt vmcnt(7) +; GFX10-NEXT: v_min_f64 v[68:69], v[24:25], v[66:67] +; GFX10-NEXT: v_cmp_u_f64_e64 s15, v[24:25], v[66:67] +; GFX10-NEXT: v_cndmask_b32_e64 v10, v36, 0, s7 +; GFX10-NEXT: v_cndmask_b32_e64 v0, v8, 0, s8 +; GFX10-NEXT: v_cndmask_b32_e64 v1, v9, 0x7ff80000, s8 +; GFX10-NEXT: v_cndmask_b32_e64 v8, v34, 0, s6 +; GFX10-NEXT: v_cndmask_b32_e64 v9, v35, 0x7ff80000, s6 +; GFX10-NEXT: v_cndmask_b32_e64 v11, v37, 0x7ff80000, s7 +; GFX10-NEXT: v_cndmask_b32_e64 v12, v38, 0, s9 +; GFX10-NEXT: v_cndmask_b32_e64 v13, v39, 0x7ff80000, s9 +; GFX10-NEXT: v_cndmask_b32_e64 v14, v52, 0, s10 +; GFX10-NEXT: v_cndmask_b32_e64 v15, v53, 0x7ff80000, s10 +; GFX10-NEXT: v_cndmask_b32_e64 v16, v50, 0, s11 +; GFX10-NEXT: v_cndmask_b32_e64 v17, v51, 0x7ff80000, s11 +; GFX10-NEXT: v_cndmask_b32_e64 v18, v48, 0, s12 +; GFX10-NEXT: v_cndmask_b32_e64 v19, v49, 0x7ff80000, s12 +; GFX10-NEXT: v_cndmask_b32_e64 v20, v54, 0, s13 +; GFX10-NEXT: v_cndmask_b32_e64 v21, v55, 0x7ff80000, s13 +; GFX10-NEXT: v_cndmask_b32_e64 v22, v64, 0, s14 +; GFX10-NEXT: v_cndmask_b32_e64 v23, v65, 0x7ff80000, s14 +; GFX10-NEXT: v_cndmask_b32_e64 v24, v68, 0, s15 +; GFX10-NEXT: v_cndmask_b32_e64 v25, v69, 0x7ff80000, s15 +; GFX10-NEXT: s_waitcnt vmcnt(5) +; 
GFX10-NEXT: v_min_f64 v[70:71], v[28:29], v[2:3] +; GFX10-NEXT: v_cmp_u_f64_e64 s17, v[28:29], v[2:3] +; GFX10-NEXT: s_waitcnt vmcnt(3) +; GFX10-NEXT: v_min_f64 v[66:67], v[26:27], v[4:5] +; GFX10-NEXT: v_cmp_u_f64_e64 s16, v[26:27], v[4:5] +; GFX10-NEXT: v_cndmask_b32_e64 v2, v82, 0, vcc_lo ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_cmp_o_f64_e64 s18, v[30:31], v[86:87] -; GFX10-NEXT: v_cndmask_b32_e32 v2, v2, v54, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v54, v53, v5, s7 -; GFX10-NEXT: v_cndmask_b32_e32 v3, v3, v55, vcc_lo -; GFX10-NEXT: v_cmp_class_f64_e64 vcc_lo, v[50:51], 32 -; GFX10-NEXT: v_cndmask_b32_e64 v55, v51, v7, s9 -; GFX10-NEXT: v_cmp_o_f64_e64 s9, v[8:9], v[48:49] -; GFX10-NEXT: v_cndmask_b32_e64 v101, 0x7ff80000, v54, s8 -; GFX10-NEXT: v_cmp_lt_f64_e64 s7, v[16:17], v[32:33] -; GFX10-NEXT: v_cndmask_b32_e64 v6, v102, v6, s12 -; GFX10-NEXT: v_cndmask_b32_e64 v103, 0x7ff80000, v55, s11 -; GFX10-NEXT: v_cndmask_b32_e64 v4, v100, v4, s10 -; GFX10-NEXT: v_cndmask_b32_e64 v5, v101, v5, s10 -; GFX10-NEXT: v_cmp_class_f64_e64 s10, v[8:9], 32 -; GFX10-NEXT: v_cmp_o_f64_e64 s11, v[10:11], v[38:39] -; GFX10-NEXT: v_cndmask_b32_e64 v7, v103, v7, s12 -; GFX10-NEXT: v_cmp_class_f64_e64 s12, v[48:49], 32 -; GFX10-NEXT: v_cndmask_b32_e64 v114, v38, v10, s4 -; GFX10-NEXT: v_cndmask_b32_e64 v115, v34, v14, s6 -; GFX10-NEXT: v_cmp_o_f64_e64 s8, v[16:17], v[32:33] -; GFX10-NEXT: v_cndmask_b32_e64 v4, v4, v52, s14 -; GFX10-NEXT: v_cndmask_b32_e64 v5, v5, v53, s14 -; GFX10-NEXT: v_cmp_lt_f64_e64 s14, v[18:19], v[82:83] -; GFX10-NEXT: v_cndmask_b32_e64 v52, 0, v115, s16 -; GFX10-NEXT: v_cndmask_b32_e32 v6, v6, v50, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v50, v49, v9, s13 -; GFX10-NEXT: v_cndmask_b32_e32 v7, v7, v51, vcc_lo -; GFX10-NEXT: v_cmp_class_f64_e64 vcc_lo, v[38:39], 32 -; GFX10-NEXT: v_cndmask_b32_e64 v54, 0, v112, s9 -; GFX10-NEXT: v_cndmask_b32_e64 v51, v39, v11, s4 -; GFX10-NEXT: v_cndmask_b32_e64 v55, 0x7ff80000, v50, s9 -; GFX10-NEXT: 
v_cndmask_b32_e64 v50, 0, v113, s5 -; GFX10-NEXT: v_cmp_o_f64_e64 s4, v[18:19], v[82:83] -; GFX10-NEXT: v_cndmask_b32_e64 v8, v54, v8, s10 -; GFX10-NEXT: v_cndmask_b32_e64 v64, 0, v114, s11 -; GFX10-NEXT: v_cndmask_b32_e64 v9, v55, v9, s10 -; GFX10-NEXT: v_cmp_class_f64_e64 s10, v[12:13], 32 -; GFX10-NEXT: v_cndmask_b32_e64 v65, 0x7ff80000, v51, s11 -; GFX10-NEXT: v_cndmask_b32_e64 v8, v8, v48, s12 -; GFX10-NEXT: v_cndmask_b32_e64 v10, v64, v10, s17 -; GFX10-NEXT: v_cndmask_b32_e64 v9, v9, v49, s12 -; GFX10-NEXT: v_cmp_class_f64_e64 s12, v[14:15], 32 -; GFX10-NEXT: v_cndmask_b32_e64 v11, v65, v11, s17 -; GFX10-NEXT: v_cndmask_b32_e64 v48, v37, v13, s15 -; GFX10-NEXT: v_cmp_class_f64_e64 s17, v[34:35], 32 -; GFX10-NEXT: v_cmp_lt_f64_e64 s9, v[20:21], v[66:67] -; GFX10-NEXT: v_cmp_o_f64_e64 s11, v[20:21], v[66:67] -; GFX10-NEXT: v_cndmask_b32_e64 v116, v32, v16, s7 -; GFX10-NEXT: v_cndmask_b32_e64 v51, 0x7ff80000, v48, s5 -; GFX10-NEXT: v_cndmask_b32_e32 v10, v10, v38, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e32 v11, v11, v39, vcc_lo -; GFX10-NEXT: v_cmp_lt_f64_e32 vcc_lo, v[22:23], v[68:69] -; GFX10-NEXT: v_cndmask_b32_e64 v38, v35, v15, s6 -; GFX10-NEXT: v_cndmask_b32_e64 v49, v82, v18, s14 -; GFX10-NEXT: v_cndmask_b32_e64 v48, 0, v116, s8 -; GFX10-NEXT: v_cmp_class_f64_e64 s13, v[36:37], 32 -; GFX10-NEXT: v_cmp_o_f64_e64 s5, v[22:23], v[68:69] -; GFX10-NEXT: v_cndmask_b32_e64 v53, 0x7ff80000, v38, s16 -; GFX10-NEXT: v_cndmask_b32_e64 v12, v50, v12, s10 -; GFX10-NEXT: v_cndmask_b32_e64 v13, v51, v13, s10 -; GFX10-NEXT: v_cmp_class_f64_e64 s10, v[16:17], 32 -; GFX10-NEXT: v_cndmask_b32_e64 v38, 0, v49, s4 -; GFX10-NEXT: v_cndmask_b32_e64 v112, v83, v19, s14 -; GFX10-NEXT: v_cndmask_b32_e64 v14, v52, v14, s12 -; GFX10-NEXT: v_cndmask_b32_e64 v15, v53, v15, s12 -; GFX10-NEXT: v_cmp_class_f64_e64 s12, v[32:33], 32 -; GFX10-NEXT: v_cmp_class_f64_e64 s14, v[18:19], 32 -; GFX10-NEXT: v_cndmask_b32_e64 v114, v67, v21, s9 -; GFX10-NEXT: v_cndmask_b32_e64 v14, v14, v34, s17 -; 
GFX10-NEXT: v_cndmask_b32_e64 v15, v15, v35, s17 -; GFX10-NEXT: v_cndmask_b32_e64 v34, v33, v17, s7 -; GFX10-NEXT: v_cndmask_b32_e64 v39, 0x7ff80000, v112, s4 -; GFX10-NEXT: v_cmp_lt_f64_e64 s4, v[24:25], v[70:71] -; GFX10-NEXT: v_cndmask_b32_e32 v113, v69, v23, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e32 v35, v68, v22, vcc_lo -; GFX10-NEXT: v_cmp_class_f64_e64 vcc_lo, v[20:21], 32 -; GFX10-NEXT: v_cndmask_b32_e64 v49, 0x7ff80000, v34, s8 -; GFX10-NEXT: v_cndmask_b32_e64 v34, 0x7ff80000, v114, s11 -; GFX10-NEXT: v_cndmask_b32_e64 v12, v12, v36, s13 -; GFX10-NEXT: v_cndmask_b32_e64 v13, v13, v37, s13 -; GFX10-NEXT: v_cmp_class_f64_e64 s13, v[82:83], 32 -; GFX10-NEXT: v_cmp_o_f64_e64 s6, v[24:25], v[70:71] -; GFX10-NEXT: v_cndmask_b32_e64 v16, v48, v16, s10 -; GFX10-NEXT: v_cndmask_b32_e64 v17, v49, v17, s10 -; GFX10-NEXT: v_cndmask_b32_e64 v36, 0x7ff80000, v113, s5 -; GFX10-NEXT: v_cndmask_b32_e64 v35, 0, v35, s5 -; GFX10-NEXT: v_cmp_lt_f64_e64 s7, v[26:27], v[80:81] -; GFX10-NEXT: v_cndmask_b32_e64 v16, v16, v32, s12 -; GFX10-NEXT: v_cndmask_b32_e64 v32, v66, v20, s9 -; GFX10-NEXT: v_cndmask_b32_e64 v17, v17, v33, s12 -; GFX10-NEXT: v_cndmask_b32_e64 v18, v38, v18, s14 -; GFX10-NEXT: v_cndmask_b32_e64 v19, v39, v19, s14 -; GFX10-NEXT: v_cmp_o_f64_e64 s15, v[26:27], v[80:81] -; GFX10-NEXT: v_cndmask_b32_e64 v33, 0, v32, s11 -; GFX10-NEXT: v_cndmask_b32_e64 v112, v71, v25, s4 -; GFX10-NEXT: v_cmp_lt_f64_e64 s16, v[28:29], v[84:85] -; GFX10-NEXT: v_cmp_o_f64_e64 s8, v[28:29], v[84:85] -; GFX10-NEXT: v_cndmask_b32_e32 v21, v34, v21, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e32 v20, v33, v20, vcc_lo -; GFX10-NEXT: v_cmp_class_f64_e64 vcc_lo, v[22:23], 32 -; GFX10-NEXT: v_cmp_lt_f64_e64 s17, v[30:31], v[86:87] -; GFX10-NEXT: v_cmp_class_f64_e64 s5, v[70:71], 32 -; GFX10-NEXT: v_cndmask_b32_e64 v18, v18, v82, s13 -; GFX10-NEXT: v_cndmask_b32_e64 v82, v70, v24, s4 -; GFX10-NEXT: v_cndmask_b32_e64 v19, v19, v83, s13 -; GFX10-NEXT: v_cndmask_b32_e64 v83, 0x7ff80000, v112, s6 -; 
GFX10-NEXT: v_cmp_class_f64_e64 s4, v[68:69], 32 -; GFX10-NEXT: v_cmp_eq_f64_e64 s9, 0, v[96:97] -; GFX10-NEXT: v_cndmask_b32_e64 v82, 0, v82, s6 -; GFX10-NEXT: v_cndmask_b32_e64 v37, v81, v27, s7 -; GFX10-NEXT: v_cndmask_b32_e64 v32, v80, v26, s7 -; GFX10-NEXT: v_cmp_class_f64_e64 s6, v[80:81], 32 -; GFX10-NEXT: v_cmp_class_f64_e64 s7, v[84:85], 32 -; GFX10-NEXT: v_cmp_eq_f64_e64 s10, 0, v[98:99] -; GFX10-NEXT: v_cndmask_b32_e64 v113, 0x7ff80000, v37, s15 -; GFX10-NEXT: v_cndmask_b32_e64 v112, 0, v32, s15 -; GFX10-NEXT: v_cmp_eq_f64_e64 s11, 0, v[100:101] -; GFX10-NEXT: v_cndmask_b32_e64 v115, v85, v29, s16 -; GFX10-NEXT: v_cndmask_b32_e64 v114, v84, v28, s16 -; GFX10-NEXT: v_cmp_eq_f64_e64 s12, 0, v[102:103] -; GFX10-NEXT: v_cmp_eq_f64_e64 s13, 0, v[54:55] -; GFX10-NEXT: v_cndmask_b32_e32 v22, v35, v22, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e32 v23, v36, v23, vcc_lo -; GFX10-NEXT: v_cmp_class_f64_e64 vcc_lo, v[24:25], 32 -; GFX10-NEXT: v_cndmask_b32_e64 v115, 0x7ff80000, v115, s8 -; GFX10-NEXT: v_cndmask_b32_e64 v114, 0, v114, s8 -; GFX10-NEXT: v_cndmask_b32_e64 v116, v87, v31, s17 -; GFX10-NEXT: v_cndmask_b32_e64 v32, v86, v30, s17 -; GFX10-NEXT: v_cmp_class_f64_e64 s8, v[86:87], 32 -; GFX10-NEXT: v_cmp_eq_f64_e64 s14, 0, v[64:65] -; GFX10-NEXT: v_cmp_eq_f64_e64 s15, 0, v[50:51] -; GFX10-NEXT: v_cndmask_b32_e64 v117, 0x7ff80000, v116, s18 -; GFX10-NEXT: v_cndmask_b32_e64 v116, 0, v32, s18 -; GFX10-NEXT: v_cmp_eq_f64_e64 s16, 0, v[52:53] -; GFX10-NEXT: v_cmp_eq_f64_e64 s17, 0, v[48:49] -; GFX10-NEXT: v_cmp_eq_f64_e64 s18, 0, v[38:39] -; GFX10-NEXT: v_cmp_eq_f64_e64 s19, 0, v[33:34] -; GFX10-NEXT: v_cmp_eq_f64_e64 s20, 0, v[35:36] -; GFX10-NEXT: v_cmp_eq_f64_e64 s21, 0, v[82:83] -; GFX10-NEXT: v_cmp_eq_f64_e64 s22, 0, v[112:113] -; GFX10-NEXT: v_cmp_eq_f64_e64 s23, 0, v[114:115] -; GFX10-NEXT: v_cmp_eq_f64_e64 s24, 0, v[116:117] -; GFX10-NEXT: v_cndmask_b32_e64 v22, v22, v68, s4 -; GFX10-NEXT: v_cndmask_b32_e64 v23, v23, v69, s4 -; GFX10-NEXT: v_cndmask_b32_e64 v0, 
v96, v0, s9 -; GFX10-NEXT: v_cndmask_b32_e32 v24, v82, v24, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e32 v25, v83, v25, vcc_lo -; GFX10-NEXT: v_cmp_class_f64_e64 vcc_lo, v[26:27], 32 -; GFX10-NEXT: v_cndmask_b32_e64 v2, v98, v2, s10 -; GFX10-NEXT: v_cndmask_b32_e64 v4, v100, v4, s11 -; GFX10-NEXT: v_cndmask_b32_e64 v24, v24, v70, s5 -; GFX10-NEXT: v_cndmask_b32_e64 v25, v25, v71, s5 -; GFX10-NEXT: v_cndmask_b32_e64 v6, v102, v6, s12 -; GFX10-NEXT: v_cndmask_b32_e64 v8, v54, v8, s13 -; GFX10-NEXT: v_cndmask_b32_e64 v10, v64, v10, s14 -; GFX10-NEXT: v_cndmask_b32_e64 v12, v50, v12, s15 -; GFX10-NEXT: v_cndmask_b32_e64 v14, v52, v14, s16 -; GFX10-NEXT: v_cndmask_b32_e64 v16, v48, v16, s17 -; GFX10-NEXT: v_cndmask_b32_e64 v18, v38, v18, s18 -; GFX10-NEXT: v_cndmask_b32_e64 v22, v35, v22, s20 -; GFX10-NEXT: v_cndmask_b32_e64 v24, v82, v24, s21 -; GFX10-NEXT: v_cndmask_b32_e64 v1, v97, v1, s9 -; GFX10-NEXT: v_cndmask_b32_e64 v3, v99, v3, s10 -; GFX10-NEXT: v_cndmask_b32_e64 v5, v101, v5, s11 -; GFX10-NEXT: v_cndmask_b32_e64 v7, v103, v7, s12 -; GFX10-NEXT: v_cndmask_b32_e64 v9, v55, v9, s13 -; GFX10-NEXT: v_cndmask_b32_e64 v11, v65, v11, s14 -; GFX10-NEXT: v_cndmask_b32_e64 v13, v51, v13, s15 -; GFX10-NEXT: v_cndmask_b32_e64 v15, v53, v15, s16 -; GFX10-NEXT: v_cndmask_b32_e32 v26, v112, v26, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e32 v27, v113, v27, vcc_lo -; GFX10-NEXT: v_cmp_class_f64_e64 vcc_lo, v[28:29], 32 -; GFX10-NEXT: v_cndmask_b32_e64 v17, v49, v17, s17 -; GFX10-NEXT: v_cndmask_b32_e64 v19, v39, v19, s18 -; GFX10-NEXT: v_cndmask_b32_e64 v26, v26, v80, s6 -; GFX10-NEXT: v_cndmask_b32_e64 v27, v27, v81, s6 -; GFX10-NEXT: v_cndmask_b32_e64 v23, v36, v23, s20 -; GFX10-NEXT: v_cndmask_b32_e64 v25, v83, v25, s21 -; GFX10-NEXT: v_cndmask_b32_e64 v26, v112, v26, s22 -; GFX10-NEXT: v_cndmask_b32_e64 v27, v113, v27, s22 -; GFX10-NEXT: v_cndmask_b32_e32 v28, v114, v28, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e32 v29, v115, v29, vcc_lo -; GFX10-NEXT: v_cmp_class_f64_e64 vcc_lo, 
v[30:31], 32 -; GFX10-NEXT: v_cndmask_b32_e64 v28, v28, v84, s7 -; GFX10-NEXT: v_cndmask_b32_e64 v29, v29, v85, s7 -; GFX10-NEXT: v_cndmask_b32_e64 v28, v114, v28, s23 -; GFX10-NEXT: v_cndmask_b32_e64 v29, v115, v29, s23 -; GFX10-NEXT: v_cndmask_b32_e32 v30, v116, v30, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e32 v31, v117, v31, vcc_lo -; GFX10-NEXT: v_cmp_class_f64_e64 vcc_lo, v[66:67], 32 -; GFX10-NEXT: v_cndmask_b32_e64 v30, v30, v86, s8 -; GFX10-NEXT: v_cndmask_b32_e64 v31, v31, v87, s8 -; GFX10-NEXT: v_cndmask_b32_e64 v30, v116, v30, s24 -; GFX10-NEXT: v_cndmask_b32_e64 v31, v117, v31, s24 -; GFX10-NEXT: v_cndmask_b32_e32 v20, v20, v66, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e32 v21, v21, v67, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v20, v33, v20, s19 -; GFX10-NEXT: v_cndmask_b32_e64 v21, v34, v21, s19 +; GFX10-NEXT: v_min_f64 v[80:81], v[30:31], v[6:7] +; GFX10-NEXT: v_cmp_u_f64_e64 s18, v[30:31], v[6:7] +; GFX10-NEXT: v_cndmask_b32_e64 v3, v83, 0x7ff80000, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v4, v84, 0, s4 +; GFX10-NEXT: v_cndmask_b32_e64 v5, v85, 0x7ff80000, s4 +; GFX10-NEXT: v_cndmask_b32_e64 v6, v32, 0, s5 +; GFX10-NEXT: v_cndmask_b32_e64 v7, v33, 0x7ff80000, s5 +; GFX10-NEXT: v_cndmask_b32_e64 v28, v70, 0, s17 +; GFX10-NEXT: v_cndmask_b32_e64 v29, v71, 0x7ff80000, s17 +; GFX10-NEXT: v_cndmask_b32_e64 v26, v66, 0, s16 +; GFX10-NEXT: v_cndmask_b32_e64 v27, v67, 0x7ff80000, s16 +; GFX10-NEXT: v_cndmask_b32_e64 v30, v80, 0, s18 +; GFX10-NEXT: v_cndmask_b32_e64 v31, v81, 0x7ff80000, s18 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: v_minimum_v16f64: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_clause 0x1f -; GFX11-NEXT: scratch_load_b32 v87, off, s32 offset:8 -; GFX11-NEXT: scratch_load_b32 v86, off, s32 offset:4 -; GFX11-NEXT: scratch_load_b32 v85, off, s32 offset:16 -; GFX11-NEXT: scratch_load_b32 v84, off, s32 offset:12 -; GFX11-NEXT: scratch_load_b32 v33, off, s32 offset:24 -; GFX11-NEXT: scratch_load_b32 v32, 
off, s32 offset:20 -; GFX11-NEXT: scratch_load_b32 v35, off, s32 offset:32 -; GFX11-NEXT: scratch_load_b32 v34, off, s32 offset:28 -; GFX11-NEXT: scratch_load_b32 v37, off, s32 offset:40 -; GFX11-NEXT: scratch_load_b32 v36, off, s32 offset:36 -; GFX11-NEXT: scratch_load_b32 v39, off, s32 offset:48 -; GFX11-NEXT: scratch_load_b32 v38, off, s32 offset:44 -; GFX11-NEXT: scratch_load_b32 v49, off, s32 offset:56 -; GFX11-NEXT: scratch_load_b32 v48, off, s32 offset:52 -; GFX11-NEXT: scratch_load_b32 v51, off, s32 offset:64 -; GFX11-NEXT: scratch_load_b32 v50, off, s32 offset:60 -; GFX11-NEXT: scratch_load_b32 v53, off, s32 offset:72 -; GFX11-NEXT: scratch_load_b32 v52, off, s32 offset:68 -; GFX11-NEXT: scratch_load_b32 v55, off, s32 offset:80 -; GFX11-NEXT: scratch_load_b32 v54, off, s32 offset:76 -; GFX11-NEXT: scratch_load_b32 v65, off, s32 offset:88 -; GFX11-NEXT: scratch_load_b32 v64, off, s32 offset:84 -; GFX11-NEXT: scratch_load_b32 v67, off, s32 offset:96 -; GFX11-NEXT: scratch_load_b32 v66, off, s32 offset:92 -; GFX11-NEXT: scratch_load_b32 v69, off, s32 offset:104 -; GFX11-NEXT: scratch_load_b32 v68, off, s32 offset:100 -; GFX11-NEXT: scratch_load_b32 v71, off, s32 offset:112 -; GFX11-NEXT: scratch_load_b32 v70, off, s32 offset:108 -; GFX11-NEXT: scratch_load_b32 v81, off, s32 offset:120 -; GFX11-NEXT: scratch_load_b32 v80, off, s32 offset:116 ; GFX11-NEXT: scratch_load_b32 v31, off, s32 -; GFX11-NEXT: scratch_load_b32 v83, off, s32 offset:128 -; GFX11-NEXT: scratch_load_b32 v82, off, s32 offset:124 -; GFX11-NEXT: s_waitcnt vmcnt(31) -; GFX11-NEXT: v_cmp_lt_f64_e64 s9, v[0:1], v[86:87] -; GFX11-NEXT: v_cmp_o_f64_e64 s11, v[0:1], v[86:87] -; GFX11-NEXT: s_waitcnt vmcnt(29) -; GFX11-NEXT: v_cmp_lt_f64_e64 s10, v[2:3], v[84:85] -; GFX11-NEXT: v_cmp_class_f64_e64 s14, v[86:87], 32 -; GFX11-NEXT: s_waitcnt vmcnt(27) -; GFX11-NEXT: v_cmp_lt_f64_e64 s0, v[4:5], v[32:33] -; GFX11-NEXT: v_cmp_o_f64_e32 vcc_lo, v[4:5], v[32:33] -; GFX11-NEXT: s_waitcnt vmcnt(25) -; 
GFX11-NEXT: v_cmp_lt_f64_e64 s2, v[6:7], v[34:35] -; GFX11-NEXT: v_cmp_o_f64_e64 s12, v[2:3], v[84:85] -; GFX11-NEXT: v_cmp_o_f64_e64 s1, v[6:7], v[34:35] -; GFX11-NEXT: s_waitcnt vmcnt(23) -; GFX11-NEXT: v_cmp_lt_f64_e64 s4, v[8:9], v[36:37] -; GFX11-NEXT: v_cmp_o_f64_e64 s3, v[8:9], v[36:37] -; GFX11-NEXT: v_cmp_class_f64_e64 s16, v[84:85], 32 -; GFX11-NEXT: s_waitcnt vmcnt(21) -; GFX11-NEXT: v_cmp_lt_f64_e64 s6, v[10:11], v[38:39] -; GFX11-NEXT: v_cmp_o_f64_e64 s5, v[10:11], v[38:39] -; GFX11-NEXT: s_waitcnt vmcnt(19) -; GFX11-NEXT: v_cmp_lt_f64_e64 s8, v[12:13], v[48:49] -; GFX11-NEXT: v_cmp_o_f64_e64 s7, v[12:13], v[48:49] -; GFX11-NEXT: s_waitcnt vmcnt(17) -; GFX11-NEXT: v_cmp_lt_f64_e64 s13, v[14:15], v[50:51] -; GFX11-NEXT: s_waitcnt vmcnt(15) -; GFX11-NEXT: v_cmp_o_f64_e64 s15, v[16:17], v[52:53] -; GFX11-NEXT: s_waitcnt vmcnt(13) -; GFX11-NEXT: v_cmp_lt_f64_e64 s17, v[18:19], v[54:55] -; GFX11-NEXT: v_cmp_o_f64_e64 s18, v[18:19], v[54:55] -; GFX11-NEXT: s_waitcnt vmcnt(11) -; GFX11-NEXT: v_cmp_lt_f64_e64 s19, v[20:21], v[64:65] -; GFX11-NEXT: v_cmp_o_f64_e64 s20, v[20:21], v[64:65] -; GFX11-NEXT: s_waitcnt vmcnt(9) -; GFX11-NEXT: v_cmp_lt_f64_e64 s21, v[22:23], v[66:67] -; GFX11-NEXT: v_cmp_o_f64_e64 s22, v[22:23], v[66:67] -; GFX11-NEXT: s_waitcnt vmcnt(7) -; GFX11-NEXT: v_cmp_lt_f64_e64 s23, v[24:25], v[68:69] -; GFX11-NEXT: v_cmp_o_f64_e64 s24, v[24:25], v[68:69] -; GFX11-NEXT: s_waitcnt vmcnt(5) -; GFX11-NEXT: v_cmp_lt_f64_e64 s25, v[26:27], v[70:71] -; GFX11-NEXT: v_cmp_o_f64_e64 s26, v[26:27], v[70:71] -; GFX11-NEXT: s_waitcnt vmcnt(3) -; GFX11-NEXT: v_cmp_lt_f64_e64 s27, v[28:29], v[80:81] -; GFX11-NEXT: v_cmp_o_f64_e64 s28, v[28:29], v[80:81] +; GFX11-NEXT: scratch_load_b32 v33, off, s32 offset:8 +; GFX11-NEXT: scratch_load_b32 v32, off, s32 offset:4 +; GFX11-NEXT: scratch_load_b32 v35, off, s32 offset:16 +; GFX11-NEXT: scratch_load_b32 v34, off, s32 offset:12 +; GFX11-NEXT: scratch_load_b32 v37, off, s32 offset:24 +; GFX11-NEXT: scratch_load_b32 
v36, off, s32 offset:20 +; GFX11-NEXT: scratch_load_b32 v39, off, s32 offset:32 +; GFX11-NEXT: scratch_load_b32 v38, off, s32 offset:28 +; GFX11-NEXT: scratch_load_b32 v49, off, s32 offset:40 +; GFX11-NEXT: scratch_load_b32 v48, off, s32 offset:36 +; GFX11-NEXT: scratch_load_b32 v51, off, s32 offset:48 +; GFX11-NEXT: scratch_load_b32 v50, off, s32 offset:44 +; GFX11-NEXT: scratch_load_b32 v53, off, s32 offset:56 +; GFX11-NEXT: scratch_load_b32 v52, off, s32 offset:52 +; GFX11-NEXT: scratch_load_b32 v55, off, s32 offset:64 +; GFX11-NEXT: scratch_load_b32 v54, off, s32 offset:60 +; GFX11-NEXT: scratch_load_b32 v65, off, s32 offset:72 +; GFX11-NEXT: scratch_load_b32 v64, off, s32 offset:68 +; GFX11-NEXT: scratch_load_b32 v67, off, s32 offset:80 +; GFX11-NEXT: scratch_load_b32 v66, off, s32 offset:76 +; GFX11-NEXT: scratch_load_b32 v69, off, s32 offset:88 +; GFX11-NEXT: scratch_load_b32 v68, off, s32 offset:84 +; GFX11-NEXT: scratch_load_b32 v71, off, s32 offset:96 +; GFX11-NEXT: scratch_load_b32 v70, off, s32 offset:92 +; GFX11-NEXT: scratch_load_b32 v81, off, s32 offset:104 +; GFX11-NEXT: scratch_load_b32 v80, off, s32 offset:100 +; GFX11-NEXT: scratch_load_b32 v83, off, s32 offset:112 +; GFX11-NEXT: scratch_load_b32 v82, off, s32 offset:108 +; GFX11-NEXT: scratch_load_b32 v85, off, s32 offset:120 +; GFX11-NEXT: scratch_load_b32 v84, off, s32 offset:116 +; GFX11-NEXT: scratch_load_b32 v87, off, s32 offset:128 +; GFX11-NEXT: scratch_load_b32 v86, off, s32 offset:124 +; GFX11-NEXT: s_waitcnt vmcnt(30) +; GFX11-NEXT: v_min_f64 v[96:97], v[0:1], v[32:33] +; GFX11-NEXT: v_cmp_u_f64_e32 vcc_lo, v[0:1], v[32:33] +; GFX11-NEXT: s_waitcnt vmcnt(28) +; GFX11-NEXT: v_min_f64 v[32:33], v[2:3], v[34:35] +; GFX11-NEXT: v_cmp_u_f64_e64 s0, v[2:3], v[34:35] +; GFX11-NEXT: s_waitcnt vmcnt(26) +; GFX11-NEXT: v_min_f64 v[34:35], v[4:5], v[36:37] +; GFX11-NEXT: v_cmp_u_f64_e64 s1, v[4:5], v[36:37] +; GFX11-NEXT: s_waitcnt vmcnt(24) +; GFX11-NEXT: v_min_f64 v[36:37], v[6:7], v[38:39] +; 
GFX11-NEXT: v_cmp_u_f64_e64 s2, v[6:7], v[38:39] +; GFX11-NEXT: s_waitcnt vmcnt(22) +; GFX11-NEXT: v_min_f64 v[38:39], v[8:9], v[48:49] +; GFX11-NEXT: v_cmp_u_f64_e64 s3, v[8:9], v[48:49] +; GFX11-NEXT: s_waitcnt vmcnt(20) +; GFX11-NEXT: v_min_f64 v[48:49], v[10:11], v[50:51] +; GFX11-NEXT: v_cmp_u_f64_e64 s4, v[10:11], v[50:51] +; GFX11-NEXT: s_waitcnt vmcnt(18) +; GFX11-NEXT: v_min_f64 v[50:51], v[12:13], v[52:53] +; GFX11-NEXT: v_cmp_u_f64_e64 s5, v[12:13], v[52:53] +; GFX11-NEXT: s_waitcnt vmcnt(16) +; GFX11-NEXT: v_min_f64 v[52:53], v[14:15], v[54:55] +; GFX11-NEXT: v_cmp_u_f64_e64 s6, v[14:15], v[54:55] +; GFX11-NEXT: s_waitcnt vmcnt(14) +; GFX11-NEXT: v_min_f64 v[54:55], v[16:17], v[64:65] +; GFX11-NEXT: v_cmp_u_f64_e64 s7, v[16:17], v[64:65] +; GFX11-NEXT: s_waitcnt vmcnt(12) +; GFX11-NEXT: v_min_f64 v[64:65], v[18:19], v[66:67] +; GFX11-NEXT: v_cmp_u_f64_e64 s8, v[18:19], v[66:67] +; GFX11-NEXT: s_waitcnt vmcnt(10) +; GFX11-NEXT: v_min_f64 v[66:67], v[20:21], v[68:69] +; GFX11-NEXT: v_cmp_u_f64_e64 s9, v[20:21], v[68:69] +; GFX11-NEXT: s_waitcnt vmcnt(8) +; GFX11-NEXT: v_min_f64 v[68:69], v[22:23], v[70:71] +; GFX11-NEXT: v_cmp_u_f64_e64 s10, v[22:23], v[70:71] +; GFX11-NEXT: s_waitcnt vmcnt(6) +; GFX11-NEXT: v_min_f64 v[70:71], v[24:25], v[80:81] +; GFX11-NEXT: v_cmp_u_f64_e64 s11, v[24:25], v[80:81] +; GFX11-NEXT: s_waitcnt vmcnt(4) +; GFX11-NEXT: v_min_f64 v[80:81], v[26:27], v[82:83] +; GFX11-NEXT: v_cmp_u_f64_e64 s12, v[26:27], v[82:83] +; GFX11-NEXT: s_waitcnt vmcnt(2) +; GFX11-NEXT: v_min_f64 v[82:83], v[28:29], v[84:85] +; GFX11-NEXT: v_cmp_u_f64_e64 s13, v[28:29], v[84:85] ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_cmp_lt_f64_e64 s29, v[30:31], v[82:83] -; GFX11-NEXT: v_cmp_o_f64_e64 vcc_hi, v[30:31], v[82:83] -; GFX11-NEXT: v_cndmask_b32_e64 v96, v87, v1, s9 -; GFX11-NEXT: v_cndmask_b32_e64 v101, v86, v0, s9 -; GFX11-NEXT: v_cndmask_b32_e64 v98, v85, v3, s10 -; GFX11-NEXT: v_cndmask_b32_e64 v103, v84, v2, s10 -; GFX11-NEXT: 
v_cmp_class_f64_e64 s10, v[0:1], 32 -; GFX11-NEXT: v_cndmask_b32_e64 v97, 0x7ff80000, v96, s11 -; GFX11-NEXT: v_cndmask_b32_e64 v96, 0, v101, s11 -; GFX11-NEXT: v_cndmask_b32_e64 v100, v33, v5, s0 -; GFX11-NEXT: v_cndmask_b32_e64 v102, v35, v7, s2 -; GFX11-NEXT: v_cndmask_b32_e64 v99, 0x7ff80000, v98, s12 -; GFX11-NEXT: v_cndmask_b32_e64 v98, 0, v103, s12 -; GFX11-NEXT: v_cmp_class_f64_e64 s11, v[2:3], 32 -; GFX11-NEXT: v_cndmask_b32_e32 v101, 0x7ff80000, v100, vcc_lo -; GFX11-NEXT: v_cndmask_b32_e64 v103, 0x7ff80000, v102, s1 -; GFX11-NEXT: v_cndmask_b32_e64 v112, v37, v9, s4 -; GFX11-NEXT: v_cndmask_b32_e64 v114, v39, v11, s6 -; GFX11-NEXT: v_cndmask_b32_e64 v116, v49, v13, s8 -; GFX11-NEXT: v_cmp_o_f64_e64 s9, v[14:15], v[50:51] -; GFX11-NEXT: v_cndmask_b32_e64 v118, v51, v15, s13 -; GFX11-NEXT: v_cndmask_b32_e64 v113, 0x7ff80000, v112, s3 -; GFX11-NEXT: v_cndmask_b32_e64 v115, 0x7ff80000, v114, s5 -; GFX11-NEXT: v_cndmask_b32_e64 v117, 0x7ff80000, v116, s7 -; GFX11-NEXT: v_cmp_lt_f64_e64 s12, v[16:17], v[52:53] -; GFX11-NEXT: v_cndmask_b32_e64 v130, v55, v19, s17 -; GFX11-NEXT: v_cndmask_b32_e64 v132, v65, v21, s19 -; GFX11-NEXT: v_cndmask_b32_e64 v134, v67, v23, s21 -; GFX11-NEXT: v_cndmask_b32_e64 v144, v69, v25, s23 -; GFX11-NEXT: v_cndmask_b32_e64 v145, v71, v27, s25 -; GFX11-NEXT: v_cndmask_b32_e64 v131, 0x7ff80000, v130, s18 -; GFX11-NEXT: v_cndmask_b32_e64 v133, 0x7ff80000, v132, s20 -; GFX11-NEXT: v_cndmask_b32_e64 v135, 0x7ff80000, v134, s22 -; GFX11-NEXT: v_cndmask_b32_e64 v146, v81, v29, s27 -; GFX11-NEXT: v_cndmask_b32_e64 v148, v80, v28, s27 -; GFX11-NEXT: v_cndmask_b32_e64 v147, v83, v31, s29 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_cndmask_b32_e64 v147, 0x7ff80000, v147, vcc_hi -; GFX11-NEXT: v_cndmask_b32_e64 v0, v96, v0, s10 -; GFX11-NEXT: v_cndmask_b32_e64 v1, v97, v1, s10 -; GFX11-NEXT: v_cmp_class_f64_e64 s10, v[36:37], 32 -; GFX11-NEXT: v_cndmask_b32_e64 v0, v0, v86, s14 -; 
GFX11-NEXT: v_cndmask_b32_e64 v86, v32, v4, s0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-NEXT: v_cndmask_b32_e64 v1, v1, v87, s14 -; GFX11-NEXT: v_cndmask_b32_e64 v87, v34, v6, s2 -; GFX11-NEXT: v_cndmask_b32_e64 v2, v98, v2, s11 -; GFX11-NEXT: v_cndmask_b32_e64 v3, v99, v3, s11 -; GFX11-NEXT: v_cndmask_b32_e32 v100, 0, v86, vcc_lo -; GFX11-NEXT: v_cmp_class_f64_e64 vcc_lo, v[4:5], 32 -; GFX11-NEXT: v_cndmask_b32_e64 v102, 0, v87, s1 -; GFX11-NEXT: v_cndmask_b32_e64 v2, v2, v84, s16 -; GFX11-NEXT: v_cndmask_b32_e64 v84, v36, v8, s4 -; GFX11-NEXT: v_cndmask_b32_e64 v86, v38, v10, s6 -; GFX11-NEXT: v_cndmask_b32_e64 v87, v48, v12, s8 -; GFX11-NEXT: v_cndmask_b32_e64 v119, 0x7ff80000, v118, s9 -; GFX11-NEXT: v_cndmask_b32_e64 v128, v53, v17, s12 -; GFX11-NEXT: v_cndmask_b32_e64 v112, 0, v84, s3 -; GFX11-NEXT: v_cndmask_b32_e64 v114, 0, v86, s5 -; GFX11-NEXT: v_cndmask_b32_e64 v116, 0, v87, s7 -; GFX11-NEXT: v_cndmask_b32_e64 v84, v50, v14, s13 -; GFX11-NEXT: v_cndmask_b32_e64 v129, 0x7ff80000, v128, s15 -; GFX11-NEXT: v_cndmask_b32_e64 v86, v52, v16, s12 -; GFX11-NEXT: v_cndmask_b32_e64 v87, v54, v18, s17 -; GFX11-NEXT: v_cndmask_b32_e64 v3, v3, v85, s16 -; GFX11-NEXT: v_cndmask_b32_e64 v118, 0, v84, s9 -; GFX11-NEXT: v_cndmask_b32_e64 v84, v64, v20, s19 -; GFX11-NEXT: v_cndmask_b32_e64 v128, 0, v86, s15 -; GFX11-NEXT: v_cndmask_b32_e64 v130, 0, v87, s18 -; GFX11-NEXT: v_cndmask_b32_e64 v86, v66, v22, s21 -; GFX11-NEXT: v_cndmask_b32_e64 v85, 0x7ff80000, v144, s24 -; GFX11-NEXT: v_cndmask_b32_e64 v132, 0, v84, s20 -; GFX11-NEXT: v_cndmask_b32_e64 v87, v68, v24, s23 -; GFX11-NEXT: v_cndmask_b32_e64 v144, v70, v26, s25 -; GFX11-NEXT: v_cndmask_b32_e64 v134, 0, v86, s22 -; GFX11-NEXT: v_cmp_class_f64_e64 s0, v[68:69], 32 -; GFX11-NEXT: v_cmp_class_f64_e64 s1, v[70:71], 32 -; GFX11-NEXT: v_cndmask_b32_e64 v84, 0, v87, s24 -; GFX11-NEXT: v_cndmask_b32_e64 v87, 0x7ff80000, v145, s26 -; GFX11-NEXT: v_cndmask_b32_e64 v86, 0, v144, s26 -; GFX11-NEXT: 
v_cndmask_b32_e64 v145, 0x7ff80000, v146, s28 -; GFX11-NEXT: v_cndmask_b32_e64 v144, 0, v148, s28 -; GFX11-NEXT: v_cndmask_b32_e64 v146, v82, v30, s29 -; GFX11-NEXT: v_cmp_class_f64_e64 s2, v[80:81], 32 -; GFX11-NEXT: v_cmp_class_f64_e64 s3, v[82:83], 32 -; GFX11-NEXT: v_cmp_class_f64_e64 s6, v[32:33], 32 -; GFX11-NEXT: v_cmp_class_f64_e64 s8, v[34:35], 32 -; GFX11-NEXT: v_dual_cndmask_b32 v5, v101, v5 :: v_dual_cndmask_b32 v4, v100, v4 -; GFX11-NEXT: v_cmp_class_f64_e64 vcc_lo, v[6:7], 32 -; GFX11-NEXT: v_cndmask_b32_e64 v146, 0, v146, vcc_hi -; GFX11-NEXT: v_cmp_class_f64_e64 s12, v[38:39], 32 -; GFX11-NEXT: v_cmp_class_f64_e64 s14, v[48:49], 32 -; GFX11-NEXT: v_cmp_class_f64_e64 s16, v[50:51], 32 -; GFX11-NEXT: v_cmp_class_f64_e64 s18, v[52:53], 32 -; GFX11-NEXT: v_cmp_class_f64_e64 s20, v[54:55], 32 -; GFX11-NEXT: v_cmp_class_f64_e64 s21, v[64:65], 32 -; GFX11-NEXT: v_cmp_eq_f64_e64 s4, 0, v[96:97] -; GFX11-NEXT: v_cmp_eq_f64_e64 s5, 0, v[98:99] -; GFX11-NEXT: v_cmp_eq_f64_e64 s7, 0, v[100:101] -; GFX11-NEXT: v_cmp_eq_f64_e64 s9, 0, v[102:103] -; GFX11-NEXT: v_cmp_eq_f64_e64 s11, 0, v[112:113] -; GFX11-NEXT: v_cmp_eq_f64_e64 s13, 0, v[114:115] -; GFX11-NEXT: v_cmp_eq_f64_e64 s15, 0, v[116:117] -; GFX11-NEXT: v_cmp_eq_f64_e64 s17, 0, v[118:119] -; GFX11-NEXT: v_cmp_eq_f64_e64 s19, 0, v[128:129] -; GFX11-NEXT: v_cmp_eq_f64_e64 s22, 0, v[130:131] -; GFX11-NEXT: v_cmp_eq_f64_e64 s23, 0, v[132:133] -; GFX11-NEXT: v_cmp_eq_f64_e64 s24, 0, v[134:135] -; GFX11-NEXT: v_cmp_eq_f64_e64 s25, 0, v[84:85] -; GFX11-NEXT: v_cmp_eq_f64_e64 s26, 0, v[86:87] -; GFX11-NEXT: v_cmp_eq_f64_e64 s27, 0, v[144:145] -; GFX11-NEXT: v_cmp_eq_f64_e64 s28, 0, v[146:147] -; GFX11-NEXT: v_cndmask_b32_e64 v5, v5, v33, s6 -; GFX11-NEXT: v_dual_cndmask_b32 v7, v103, v7 :: v_dual_cndmask_b32 v6, v102, v6 -; GFX11-NEXT: v_cmp_class_f64_e64 vcc_lo, v[8:9], 32 -; GFX11-NEXT: v_cndmask_b32_e64 v4, v4, v32, s6 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4) -; 
GFX11-NEXT: v_cndmask_b32_e64 v7, v7, v35, s8 -; GFX11-NEXT: v_cndmask_b32_e64 v0, v96, v0, s4 -; GFX11-NEXT: v_cndmask_b32_e64 v2, v98, v2, s5 -; GFX11-NEXT: v_cndmask_b32_e64 v4, v100, v4, s7 -; GFX11-NEXT: v_cndmask_b32_e64 v1, v97, v1, s4 -; GFX11-NEXT: v_cndmask_b32_e64 v3, v99, v3, s5 -; GFX11-NEXT: v_cndmask_b32_e64 v5, v101, v5, s7 -; GFX11-NEXT: v_cndmask_b32_e64 v7, v103, v7, s9 -; GFX11-NEXT: v_dual_cndmask_b32 v9, v113, v9 :: v_dual_cndmask_b32 v8, v112, v8 -; GFX11-NEXT: v_cmp_class_f64_e64 vcc_lo, v[10:11], 32 -; GFX11-NEXT: v_cndmask_b32_e64 v6, v6, v34, s8 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cndmask_b32_e64 v9, v9, v37, s10 -; GFX11-NEXT: v_cndmask_b32_e64 v6, v102, v6, s9 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_cndmask_b32_e64 v9, v113, v9, s11 -; GFX11-NEXT: v_dual_cndmask_b32 v11, v115, v11 :: v_dual_cndmask_b32 v10, v114, v10 -; GFX11-NEXT: v_cmp_class_f64_e64 vcc_lo, v[12:13], 32 -; GFX11-NEXT: v_cndmask_b32_e64 v8, v8, v36, s10 -; GFX11-NEXT: v_cndmask_b32_e64 v11, v11, v39, s12 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cndmask_b32_e64 v8, v112, v8, s11 -; GFX11-NEXT: v_cndmask_b32_e64 v11, v115, v11, s13 -; GFX11-NEXT: v_dual_cndmask_b32 v13, v117, v13 :: v_dual_cndmask_b32 v12, v116, v12 -; GFX11-NEXT: v_cmp_class_f64_e64 vcc_lo, v[14:15], 32 -; GFX11-NEXT: v_cndmask_b32_e64 v10, v10, v38, s12 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cndmask_b32_e64 v13, v13, v49, s14 -; GFX11-NEXT: v_cndmask_b32_e64 v10, v114, v10, s13 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_cndmask_b32_e64 v13, v117, v13, s15 -; GFX11-NEXT: v_dual_cndmask_b32 v15, v119, v15 :: v_dual_cndmask_b32 v14, v118, v14 -; GFX11-NEXT: v_cmp_class_f64_e64 vcc_lo, 
v[16:17], 32 -; GFX11-NEXT: v_cndmask_b32_e64 v12, v12, v48, s14 -; GFX11-NEXT: v_cndmask_b32_e64 v15, v15, v51, s16 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cndmask_b32_e64 v12, v116, v12, s15 -; GFX11-NEXT: v_cndmask_b32_e64 v15, v119, v15, s17 -; GFX11-NEXT: v_dual_cndmask_b32 v17, v129, v17 :: v_dual_cndmask_b32 v16, v128, v16 -; GFX11-NEXT: v_cmp_class_f64_e64 vcc_lo, v[18:19], 32 -; GFX11-NEXT: v_cndmask_b32_e64 v14, v14, v50, s16 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cndmask_b32_e64 v17, v17, v53, s18 -; GFX11-NEXT: v_cndmask_b32_e64 v14, v118, v14, s17 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_cndmask_b32_e64 v17, v129, v17, s19 -; GFX11-NEXT: v_dual_cndmask_b32 v19, v131, v19 :: v_dual_cndmask_b32 v18, v130, v18 -; GFX11-NEXT: v_cmp_class_f64_e64 vcc_lo, v[20:21], 32 -; GFX11-NEXT: v_cndmask_b32_e64 v16, v16, v52, s18 -; GFX11-NEXT: v_cndmask_b32_e64 v19, v19, v55, s20 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cndmask_b32_e64 v16, v128, v16, s19 -; GFX11-NEXT: v_cndmask_b32_e64 v19, v131, v19, s22 -; GFX11-NEXT: v_dual_cndmask_b32 v21, v133, v21 :: v_dual_cndmask_b32 v20, v132, v20 -; GFX11-NEXT: v_cmp_class_f64_e64 vcc_lo, v[22:23], 32 -; GFX11-NEXT: v_cndmask_b32_e64 v18, v18, v54, s20 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cndmask_b32_e64 v21, v21, v65, s21 -; GFX11-NEXT: v_cndmask_b32_e64 v18, v130, v18, s22 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_cndmask_b32_e64 v21, v133, v21, s23 -; GFX11-NEXT: v_dual_cndmask_b32 v23, v135, v23 :: v_dual_cndmask_b32 v22, v134, v22 -; GFX11-NEXT: v_cmp_class_f64_e64 vcc_lo, v[24:25], 32 -; GFX11-NEXT: v_cndmask_b32_e64 v20, v20, v64, s21 -; 
GFX11-NEXT: v_cndmask_b32_e64 v20, v132, v20, s23 -; GFX11-NEXT: v_dual_cndmask_b32 v25, v85, v25 :: v_dual_cndmask_b32 v24, v84, v24 -; GFX11-NEXT: v_cmp_class_f64_e64 vcc_lo, v[26:27], 32 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_cndmask_b32_e64 v25, v25, v69, s0 -; GFX11-NEXT: v_cndmask_b32_e64 v25, v85, v25, s25 -; GFX11-NEXT: v_dual_cndmask_b32 v27, v87, v27 :: v_dual_cndmask_b32 v26, v86, v26 -; GFX11-NEXT: v_cmp_class_f64_e64 vcc_lo, v[28:29], 32 -; GFX11-NEXT: v_cndmask_b32_e64 v24, v24, v68, s0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cndmask_b32_e64 v27, v27, v71, s1 -; GFX11-NEXT: v_cndmask_b32_e64 v24, v84, v24, s25 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_cndmask_b32_e64 v27, v87, v27, s26 -; GFX11-NEXT: v_dual_cndmask_b32 v29, v145, v29 :: v_dual_cndmask_b32 v28, v144, v28 -; GFX11-NEXT: v_cmp_class_f64_e64 vcc_lo, v[30:31], 32 -; GFX11-NEXT: v_cndmask_b32_e64 v26, v26, v70, s1 -; GFX11-NEXT: v_cndmask_b32_e64 v29, v29, v81, s2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cndmask_b32_e64 v26, v86, v26, s26 -; GFX11-NEXT: v_cndmask_b32_e64 v29, v145, v29, s27 -; GFX11-NEXT: v_dual_cndmask_b32 v31, v147, v31 :: v_dual_cndmask_b32 v30, v146, v30 -; GFX11-NEXT: v_cmp_class_f64_e64 vcc_lo, v[66:67], 32 -; GFX11-NEXT: v_cndmask_b32_e64 v28, v28, v80, s2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cndmask_b32_e64 v31, v31, v83, s3 -; GFX11-NEXT: v_cndmask_b32_e64 v28, v144, v28, s27 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cndmask_b32_e64 v31, v147, v31, s28 -; GFX11-NEXT: v_dual_cndmask_b32 v23, v23, v67 :: v_dual_cndmask_b32 v22, v22, v66 -; GFX11-NEXT: v_cndmask_b32_e64 v30, v30, v82, s3 -; 
GFX11-NEXT: v_cndmask_b32_e64 v23, v135, v23, s24 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_cndmask_b32_e64 v22, v134, v22, s24 -; GFX11-NEXT: v_cndmask_b32_e64 v30, v146, v30, s28 +; GFX11-NEXT: v_min_f64 v[84:85], v[30:31], v[86:87] +; GFX11-NEXT: v_cmp_u_f64_e64 s14, v[30:31], v[86:87] +; GFX11-NEXT: v_cndmask_b32_e64 v0, v96, 0, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e64 v1, v97, 0x7ff80000, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e64 v2, v32, 0, s0 +; GFX11-NEXT: v_cndmask_b32_e64 v3, v33, 0x7ff80000, s0 +; GFX11-NEXT: v_cndmask_b32_e64 v4, v34, 0, s1 +; GFX11-NEXT: v_cndmask_b32_e64 v5, v35, 0x7ff80000, s1 +; GFX11-NEXT: v_cndmask_b32_e64 v6, v36, 0, s2 +; GFX11-NEXT: v_cndmask_b32_e64 v7, v37, 0x7ff80000, s2 +; GFX11-NEXT: v_cndmask_b32_e64 v8, v38, 0, s3 +; GFX11-NEXT: v_cndmask_b32_e64 v9, v39, 0x7ff80000, s3 +; GFX11-NEXT: v_cndmask_b32_e64 v10, v48, 0, s4 +; GFX11-NEXT: v_cndmask_b32_e64 v11, v49, 0x7ff80000, s4 +; GFX11-NEXT: v_cndmask_b32_e64 v12, v50, 0, s5 +; GFX11-NEXT: v_cndmask_b32_e64 v13, v51, 0x7ff80000, s5 +; GFX11-NEXT: v_cndmask_b32_e64 v14, v52, 0, s6 +; GFX11-NEXT: v_cndmask_b32_e64 v15, v53, 0x7ff80000, s6 +; GFX11-NEXT: v_cndmask_b32_e64 v16, v54, 0, s7 +; GFX11-NEXT: v_cndmask_b32_e64 v17, v55, 0x7ff80000, s7 +; GFX11-NEXT: v_cndmask_b32_e64 v18, v64, 0, s8 +; GFX11-NEXT: v_cndmask_b32_e64 v19, v65, 0x7ff80000, s8 +; GFX11-NEXT: v_cndmask_b32_e64 v20, v66, 0, s9 +; GFX11-NEXT: v_cndmask_b32_e64 v21, v67, 0x7ff80000, s9 +; GFX11-NEXT: v_cndmask_b32_e64 v22, v68, 0, s10 +; GFX11-NEXT: v_cndmask_b32_e64 v23, v69, 0x7ff80000, s10 +; GFX11-NEXT: v_cndmask_b32_e64 v24, v70, 0, s11 +; GFX11-NEXT: v_cndmask_b32_e64 v25, v71, 0x7ff80000, s11 +; GFX11-NEXT: v_cndmask_b32_e64 v26, v80, 0, s12 +; GFX11-NEXT: v_cndmask_b32_e64 v27, v81, 0x7ff80000, s12 +; GFX11-NEXT: v_cndmask_b32_e64 v28, v82, 0, s13 +; GFX11-NEXT: v_cndmask_b32_e64 v29, v83, 0x7ff80000, s13 +; GFX11-NEXT: v_cndmask_b32_e64 v30, 
v84, 0, s14 +; GFX11-NEXT: v_cndmask_b32_e64 v31, v85, 0x7ff80000, s14 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-LABEL: v_minimum_v16f64: diff --git a/llvm/test/CodeGen/AMDGPU/occupancy-levels.ll b/llvm/test/CodeGen/AMDGPU/occupancy-levels.ll index d87eb97..e0ccda1 100644 --- a/llvm/test/CodeGen/AMDGPU/occupancy-levels.ll +++ b/llvm/test/CodeGen/AMDGPU/occupancy-levels.ll @@ -13,6 +13,8 @@ ; RUN: llc -mtriple=amdgcn -mcpu=gfx1150 -mattr=+wavefrontsize64 < %s | FileCheck --check-prefixes=GCN,GFX10,GFX10W64,GFX1030,GFX1030W64 %s ; RUN: llc -mtriple=amdgcn -mcpu=gfx1151 < %s | FileCheck --check-prefixes=GCN,GFX1100,GFX1100W32 %s ; RUN: llc -mtriple=amdgcn -mcpu=gfx1151 -mattr=+wavefrontsize64 < %s | FileCheck --check-prefixes=GCN,GFX1100,GFX1100W64 %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1152 < %s | FileCheck --check-prefixes=GCN,GFX1030,GFX1030W32 %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1152 -mattr=+wavefrontsize64 < %s | FileCheck --check-prefixes=GCN,GFX1030,GFX1030W64 %s ; GCN-LABEL: {{^}}max_occupancy: ; GFX9: ; Occupancy: 10 diff --git a/llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm.mir b/llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm.mir index 4ecce28..6dda1fe 100644 --- a/llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm.mir +++ b/llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm.mir @@ -282,3 +282,168 @@ body: | %13:vreg_64 = REG_SEQUENCE %9, %subreg.sub0, %11, %subreg.sub1 FLAT_STORE_DWORD %13, %0.sub1, 0, 0, implicit $exec, implicit $flat_scr ... 
+ +--- +# GCN-LABEL: name: diffoporder_add_global_atomic_cmpswap +# GFX9: GLOBAL_ATOMIC_CMPSWAP %{{[0-9]+}}, %0, 1000, 0, +# GFX9: GLOBAL_ATOMIC_CMPSWAP %{{[0-9]+}}, %0, 0, 0, + +# GFX8: GLOBAL_ATOMIC_CMPSWAP %{{[0-9]+}}, %0, 0, 0, +# GFX8: GLOBAL_ATOMIC_CMPSWAP %{{[0-9]+}}, %0, 0, 0, + +name: diffoporder_add_global_atomic_cmpswap +body: | + bb.0.entry: + + %0:vreg_64 = COPY $vgpr0_vgpr1 + + %1:sgpr_32 = S_MOV_B32 4000 + %2:vgpr_32, %3:sreg_64_xexec = V_ADD_CO_U32_e64 %0.sub0, %1, 0, implicit $exec + %4:vgpr_32, dead %5:sreg_64_xexec = V_ADDC_U32_e64 %0.sub1, 0, %3, 0, implicit $exec + %6:vreg_64 = REG_SEQUENCE %2, %subreg.sub0, %4, %subreg.sub1 + GLOBAL_ATOMIC_CMPSWAP %6:vreg_64, %0:vreg_64, 0, 0, implicit $exec + + %8:sgpr_32 = S_MOV_B32 3000 + %9:vgpr_32, %10:sreg_64_xexec = V_ADD_CO_U32_e64 %0.sub0, %8, 0, implicit $exec + %11:vgpr_32, dead %12:sreg_64_xexec = V_ADDC_U32_e64 %0.sub1, 0, %10, 0, implicit $exec + %13:vreg_64 = REG_SEQUENCE %9, %subreg.sub0, %11, %subreg.sub1 + GLOBAL_ATOMIC_CMPSWAP %13:vreg_64, %0:vreg_64, 0, 0, implicit $exec +... 
+ +--- +# GCN-LABEL: name: diffoporder_add_flat_atomic_cmpswap +# GFX9: FLAT_ATOMIC_CMPSWAP %{{[0-9]+}}, %0, 1000, 0, +# GFX9: FLAT_ATOMIC_CMPSWAP %{{[0-9]+}}, %0, 0, 0, + +# GFX8: FLAT_ATOMIC_CMPSWAP %{{[0-9]+}}, %0, 0, 0, +# GFX8: FLAT_ATOMIC_CMPSWAP %{{[0-9]+}}, %0, 0, 0, + +name: diffoporder_add_flat_atomic_cmpswap +body: | + bb.0.entry: + + %0:vreg_64 = COPY $vgpr0_vgpr1 + + %1:sgpr_32 = S_MOV_B32 4000 + %2:vgpr_32, %3:sreg_64_xexec = V_ADD_CO_U32_e64 %0.sub0, %1, 0, implicit $exec + %4:vgpr_32, dead %5:sreg_64_xexec = V_ADDC_U32_e64 %0.sub1, 0, %3, 0, implicit $exec + %6:vreg_64 = REG_SEQUENCE %2, %subreg.sub0, %4, %subreg.sub1 + FLAT_ATOMIC_CMPSWAP %6:vreg_64, %0:vreg_64, 0, 0, implicit $exec, implicit $flat_scr + + + %8:sgpr_32 = S_MOV_B32 3000 + %9:vgpr_32, %10:sreg_64_xexec = V_ADD_CO_U32_e64 %0.sub0, %8, 0, implicit $exec + %11:vgpr_32, dead %12:sreg_64_xexec = V_ADDC_U32_e64 %0.sub1, 0, %10, 0, implicit $exec + %13:vreg_64 = REG_SEQUENCE %9, %subreg.sub0, %11, %subreg.sub1 + FLAT_ATOMIC_CMPSWAP %13:vreg_64, %0:vreg_64, 0, 0, implicit $exec, implicit $flat_scr +... 
+ +--- +# GCN-LABEL: name: diffoporder_add_global_atomic_add +# GFX9: GLOBAL_ATOMIC_ADD %{{[0-9]+}}, %0.sub0, 1000, 0, +# GFX9: GLOBAL_ATOMIC_ADD %{{[0-9]+}}, %0.sub0, 0, 0, + +# GFX8: GLOBAL_ATOMIC_ADD %{{[0-9]+}}, %0.sub0, 0, 0, +# GFX8: GLOBAL_ATOMIC_ADD %{{[0-9]+}}, %0.sub0, 0, 0, + +name: diffoporder_add_global_atomic_add +body: | + bb.0.entry: + + %0:vreg_64 = COPY $vgpr0_vgpr1 + + %1:sgpr_32 = S_MOV_B32 4000 + %2:vgpr_32, %3:sreg_64_xexec = V_ADD_CO_U32_e64 %0.sub0, %1, 0, implicit $exec + %4:vgpr_32, dead %5:sreg_64_xexec = V_ADDC_U32_e64 %0.sub1, 0, %3, 0, implicit $exec + %6:vreg_64 = REG_SEQUENCE %2, %subreg.sub0, %4, %subreg.sub1 + GLOBAL_ATOMIC_ADD %6:vreg_64, %0.sub0, 0, 0, implicit $exec + + %8:sgpr_32 = S_MOV_B32 3000 + %9:vgpr_32, %10:sreg_64_xexec = V_ADD_CO_U32_e64 %0.sub0, %8, 0, implicit $exec + %11:vgpr_32, dead %12:sreg_64_xexec = V_ADDC_U32_e64 %0.sub1, 0, %10, 0, implicit $exec + %13:vreg_64 = REG_SEQUENCE %9, %subreg.sub0, %11, %subreg.sub1 + GLOBAL_ATOMIC_ADD %13:vreg_64, %0.sub0, 0, 0, implicit $exec +... 
+ +--- +# GCN-LABEL: name: diffoporder_add_flat_atomic_add +# GFX9: FLAT_ATOMIC_ADD %{{[0-9]+}}, %0.sub0, 1000, 0, +# GFX9: FLAT_ATOMIC_ADD %{{[0-9]+}}, %0.sub0, 0, 0, + +# GFX8: FLAT_ATOMIC_ADD %{{[0-9]+}}, %0.sub0, 0, 0, +# GFX8: FLAT_ATOMIC_ADD %{{[0-9]+}}, %0.sub0, 0, 0, + +name: diffoporder_add_flat_atomic_add +body: | + bb.0.entry: + + %0:vreg_64 = COPY $vgpr0_vgpr1 + + %1:sgpr_32 = S_MOV_B32 4000 + %2:vgpr_32, %3:sreg_64_xexec = V_ADD_CO_U32_e64 %0.sub0, %1, 0, implicit $exec + %4:vgpr_32, dead %5:sreg_64_xexec = V_ADDC_U32_e64 %0.sub1, 0, %3, 0, implicit $exec + %6:vreg_64 = REG_SEQUENCE %2, %subreg.sub0, %4, %subreg.sub1 + FLAT_ATOMIC_ADD %6:vreg_64, %0.sub0, 0, 0, implicit $exec, implicit $flat_scr + + + %8:sgpr_32 = S_MOV_B32 3000 + %9:vgpr_32, %10:sreg_64_xexec = V_ADD_CO_U32_e64 %0.sub0, %8, 0, implicit $exec + %11:vgpr_32, dead %12:sreg_64_xexec = V_ADDC_U32_e64 %0.sub1, 0, %10, 0, implicit $exec + %13:vreg_64 = REG_SEQUENCE %9, %subreg.sub0, %11, %subreg.sub1 + FLAT_ATOMIC_ADD %13:vreg_64, %0.sub0, 0, 0, implicit $exec, implicit $flat_scr +... 
+ +--- +# GCN-LABEL: name: diffoporder_add_global_atomic_add_rtn +# GFX9: GLOBAL_ATOMIC_ADD_RTN %{{[0-9]+}}, %0.sub0, 1000, 0, +# GFX9: GLOBAL_ATOMIC_ADD_RTN %{{[0-9]+}}, %0.sub0, 0, 0, + +# GFX8: GLOBAL_ATOMIC_ADD_RTN %{{[0-9]+}}, %0.sub0, 0, 0, +# GFX8: GLOBAL_ATOMIC_ADD_RTN %{{[0-9]+}}, %0.sub0, 0, 0, + +name: diffoporder_add_global_atomic_add_rtn +body: | + bb.0.entry: + + %0:vreg_64 = COPY $vgpr0_vgpr1 + + %1:sgpr_32 = S_MOV_B32 4000 + %2:vgpr_32, %3:sreg_64_xexec = V_ADD_CO_U32_e64 %0.sub0, %1, 0, implicit $exec + %4:vgpr_32, dead %5:sreg_64_xexec = V_ADDC_U32_e64 %0.sub1, 0, %3, 0, implicit $exec + %6:vreg_64 = REG_SEQUENCE %2, %subreg.sub0, %4, %subreg.sub1 + %14:vgpr_32 = GLOBAL_ATOMIC_ADD_RTN %6:vreg_64, %0.sub0, 0, 0, implicit $exec + + %8:sgpr_32 = S_MOV_B32 3000 + %9:vgpr_32, %10:sreg_64_xexec = V_ADD_CO_U32_e64 %0.sub0, %8, 0, implicit $exec + %11:vgpr_32, dead %12:sreg_64_xexec = V_ADDC_U32_e64 %0.sub1, 0, %10, 0, implicit $exec + %13:vreg_64 = REG_SEQUENCE %9, %subreg.sub0, %11, %subreg.sub1 + %15:vgpr_32 = GLOBAL_ATOMIC_ADD_RTN %13:vreg_64, %0.sub0, 0, 0, implicit $exec +... 
+ +--- +# GCN-LABEL: name: diffoporder_add_flat_atomic_add_rtn +# GFX9: FLAT_ATOMIC_ADD_RTN %{{[0-9]+}}, %0.sub0, 1000, 0, +# GFX9: FLAT_ATOMIC_ADD_RTN %{{[0-9]+}}, %0.sub0, 0, 0, + +# GFX8: FLAT_ATOMIC_ADD_RTN %{{[0-9]+}}, %0.sub0, 0, 0, +# GFX8: FLAT_ATOMIC_ADD_RTN %{{[0-9]+}}, %0.sub0, 0, 0, + +name: diffoporder_add_flat_atomic_add_rtn +body: | + bb.0.entry: + + %0:vreg_64 = COPY $vgpr0_vgpr1 + + %1:sgpr_32 = S_MOV_B32 4000 + %2:vgpr_32, %3:sreg_64_xexec = V_ADD_CO_U32_e64 %0.sub0, %1, 0, implicit $exec + %4:vgpr_32, dead %5:sreg_64_xexec = V_ADDC_U32_e64 %0.sub1, 0, %3, 0, implicit $exec + %6:vreg_64 = REG_SEQUENCE %2, %subreg.sub0, %4, %subreg.sub1 + %14:vgpr_32 = FLAT_ATOMIC_ADD_RTN %6:vreg_64, %0.sub0, 0, 0, implicit $exec, implicit $flat_scr + + + %8:sgpr_32 = S_MOV_B32 3000 + %9:vgpr_32, %10:sreg_64_xexec = V_ADD_CO_U32_e64 %0.sub0, %8, 0, implicit $exec + %11:vgpr_32, dead %12:sreg_64_xexec = V_ADDC_U32_e64 %0.sub1, 0, %10, 0, implicit $exec + %13:vreg_64 = REG_SEQUENCE %9, %subreg.sub0, %11, %subreg.sub1 + %15:vgpr_32 = FLAT_ATOMIC_ADD_RTN %13:vreg_64, %0.sub0, 0, 0, implicit $exec, implicit $flat_scr +... diff --git a/llvm/test/CodeGen/AMDGPU/unsupported-calls.ll b/llvm/test/CodeGen/AMDGPU/unsupported-calls.ll index fc00937..721114e 100644 --- a/llvm/test/CodeGen/AMDGPU/unsupported-calls.ll +++ b/llvm/test/CodeGen/AMDGPU/unsupported-calls.ll @@ -43,25 +43,6 @@ define i32 @test_tail_call(ptr addrspace(1) %out, ptr addrspace(1) %in) { ret i32 %c } -declare void @external.varargs(i32, double, i64, ...) - -; GCN: error: <unknown>:0:0: in function test_call_varargs void (): unsupported call to variadic function external.varargs -; R600: in function test_call_varargs{{.*}}: unsupported call to function external.varargs -define void @test_call_varargs() { - call void (i32, double, i64, ...) @external.varargs(i32 42, double 1.0, i64 12, i8 3, i16 1, i32 4, float 1.0, double 2.0) - ret void -} - -declare i32 @extern_variadic(...) 
- -; GCN: in function test_tail_call_bitcast_extern_variadic{{.*}}: unsupported required tail call to function extern_variadic -; R600: in function test_tail_call_bitcast_extern_variadic{{.*}}: unsupported call to function extern_variadic -define i32 @test_tail_call_bitcast_extern_variadic(<4 x float> %arg0, <4 x float> %arg1, i32 %arg2) { - %add = fadd <4 x float> %arg0, %arg1 - %call = tail call i32 @extern_variadic(<4 x float> %add) - ret i32 %call -} - ; R600: in function test_c_call{{.*}}: unsupported call to function defined_function define amdgpu_ps i32 @test_c_call_from_shader() { %call = call i32 @defined_function(i32 0) diff --git a/llvm/test/CodeGen/ARM/neon_vabd.ll b/llvm/test/CodeGen/ARM/neon_vabd.ll new file mode 100644 index 0000000..14ad1a1 --- /dev/null +++ b/llvm/test/CodeGen/ARM/neon_vabd.ll @@ -0,0 +1,890 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=arm-eabi -mattr=+neon %s -o - | FileCheck %s + +; +; SABD +; + +define <8 x i8> @sabd_8b(<8 x i8> %a, <8 x i8> %b) { +; CHECK-LABEL: sabd_8b: +; CHECK: @ %bb.0: +; CHECK-NEXT: vmov d16, r2, r3 +; CHECK-NEXT: vmov d17, r0, r1 +; CHECK-NEXT: vsubl.s8 q8, d17, d16 +; CHECK-NEXT: vabs.s16 q8, q8 +; CHECK-NEXT: vmovn.i16 d16, q8 +; CHECK-NEXT: vmov r0, r1, d16 +; CHECK-NEXT: mov pc, lr + %a.sext = sext <8 x i8> %a to <8 x i16> + %b.sext = sext <8 x i8> %b to <8 x i16> + %sub = sub <8 x i16> %a.sext, %b.sext + %abs = call <8 x i16> @llvm.abs.v8i16(<8 x i16> %sub, i1 true) + %trunc = trunc <8 x i16> %abs to <8 x i8> + ret <8 x i8> %trunc +} + +define <16 x i8> @sabd_16b(<16 x i8> %a, <16 x i8> %b) { +; CHECK-LABEL: sabd_16b: +; CHECK: @ %bb.0: +; CHECK-NEXT: mov r12, sp +; CHECK-NEXT: vld1.64 {d16, d17}, [r12] +; CHECK-NEXT: vmov d18, r2, r3 +; CHECK-NEXT: vmov d19, r0, r1 +; CHECK-NEXT: vsubl.s8 q10, d18, d17 +; CHECK-NEXT: vsubl.s8 q8, d19, d16 +; CHECK-NEXT: vabs.s16 q9, q10 +; CHECK-NEXT: vabs.s16 q8, q8 +; CHECK-NEXT: vmovn.i16 d19, q9 +; 
CHECK-NEXT: vmovn.i16 d18, q8 +; CHECK-NEXT: vmov r2, r3, d19 +; CHECK-NEXT: vmov r0, r1, d18 +; CHECK-NEXT: mov pc, lr + %a.sext = sext <16 x i8> %a to <16 x i16> + %b.sext = sext <16 x i8> %b to <16 x i16> + %sub = sub <16 x i16> %a.sext, %b.sext + %abs = call <16 x i16> @llvm.abs.v16i16(<16 x i16> %sub, i1 true) + %trunc = trunc <16 x i16> %abs to <16 x i8> + ret <16 x i8> %trunc +} + +define <4 x i16> @sabd_4h(<4 x i16> %a, <4 x i16> %b) { +; CHECK-LABEL: sabd_4h: +; CHECK: @ %bb.0: +; CHECK-NEXT: vmov d16, r2, r3 +; CHECK-NEXT: vmov d17, r0, r1 +; CHECK-NEXT: vsubl.s16 q8, d17, d16 +; CHECK-NEXT: vabs.s32 q8, q8 +; CHECK-NEXT: vmovn.i32 d16, q8 +; CHECK-NEXT: vmov r0, r1, d16 +; CHECK-NEXT: mov pc, lr + %a.sext = sext <4 x i16> %a to <4 x i32> + %b.sext = sext <4 x i16> %b to <4 x i32> + %sub = sub <4 x i32> %a.sext, %b.sext + %abs = call <4 x i32> @llvm.abs.v4i32(<4 x i32> %sub, i1 true) + %trunc = trunc <4 x i32> %abs to <4 x i16> + ret <4 x i16> %trunc +} + +define <4 x i16> @sabd_4h_promoted_ops(<4 x i8> %a, <4 x i8> %b) { +; CHECK-LABEL: sabd_4h_promoted_ops: +; CHECK: @ %bb.0: +; CHECK-NEXT: vmov d16, r2, r3 +; CHECK-NEXT: vmov d17, r0, r1 +; CHECK-NEXT: vshl.i16 d16, d16, #8 +; CHECK-NEXT: vshl.i16 d17, d17, #8 +; CHECK-NEXT: vshr.s16 d16, d16, #8 +; CHECK-NEXT: vshr.s16 d17, d17, #8 +; CHECK-NEXT: vsub.i16 d16, d17, d16 +; CHECK-NEXT: vabs.s16 d16, d16 +; CHECK-NEXT: vmov r0, r1, d16 +; CHECK-NEXT: mov pc, lr + %a.sext = sext <4 x i8> %a to <4 x i16> + %b.sext = sext <4 x i8> %b to <4 x i16> + %sub = sub <4 x i16> %a.sext, %b.sext + %abs = call <4 x i16> @llvm.abs.v4i16(<4 x i16> %sub, i1 true) + ret <4 x i16> %abs +} + +define <8 x i16> @sabd_8h(<8 x i16> %a, <8 x i16> %b) { +; CHECK-LABEL: sabd_8h: +; CHECK: @ %bb.0: +; CHECK-NEXT: mov r12, sp +; CHECK-NEXT: vld1.64 {d16, d17}, [r12] +; CHECK-NEXT: vmov d18, r2, r3 +; CHECK-NEXT: vmov d19, r0, r1 +; CHECK-NEXT: vsubl.s16 q10, d18, d17 +; CHECK-NEXT: vsubl.s16 q8, d19, d16 +; CHECK-NEXT: vabs.s32 q9, 
q10 +; CHECK-NEXT: vabs.s32 q8, q8 +; CHECK-NEXT: vmovn.i32 d19, q9 +; CHECK-NEXT: vmovn.i32 d18, q8 +; CHECK-NEXT: vmov r2, r3, d19 +; CHECK-NEXT: vmov r0, r1, d18 +; CHECK-NEXT: mov pc, lr + %a.sext = sext <8 x i16> %a to <8 x i32> + %b.sext = sext <8 x i16> %b to <8 x i32> + %sub = sub <8 x i32> %a.sext, %b.sext + %abs = call <8 x i32> @llvm.abs.v8i32(<8 x i32> %sub, i1 true) + %trunc = trunc <8 x i32> %abs to <8 x i16> + ret <8 x i16> %trunc +} + +define <8 x i16> @sabd_8h_promoted_ops(<8 x i8> %a, <8 x i8> %b) { +; CHECK-LABEL: sabd_8h_promoted_ops: +; CHECK: @ %bb.0: +; CHECK-NEXT: vmov d16, r2, r3 +; CHECK-NEXT: vmov d17, r0, r1 +; CHECK-NEXT: vsubl.s8 q8, d17, d16 +; CHECK-NEXT: vabs.s16 q8, q8 +; CHECK-NEXT: vmov r0, r1, d16 +; CHECK-NEXT: vmov r2, r3, d17 +; CHECK-NEXT: mov pc, lr + %a.sext = sext <8 x i8> %a to <8 x i16> + %b.sext = sext <8 x i8> %b to <8 x i16> + %sub = sub <8 x i16> %a.sext, %b.sext + %abs = call <8 x i16> @llvm.abs.v8i16(<8 x i16> %sub, i1 true) + ret <8 x i16> %abs +} + +define <2 x i32> @sabd_2s(<2 x i32> %a, <2 x i32> %b) { +; CHECK-LABEL: sabd_2s: +; CHECK: @ %bb.0: +; CHECK-NEXT: vmov d16, r2, r3 +; CHECK-NEXT: vmov d17, r0, r1 +; CHECK-NEXT: vsubl.s32 q8, d17, d16 +; CHECK-NEXT: vshr.s64 q9, q8, #63 +; CHECK-NEXT: veor q8, q8, q9 +; CHECK-NEXT: vsub.i64 q8, q8, q9 +; CHECK-NEXT: vmovn.i64 d16, q8 +; CHECK-NEXT: vmov r0, r1, d16 +; CHECK-NEXT: mov pc, lr + %a.sext = sext <2 x i32> %a to <2 x i64> + %b.sext = sext <2 x i32> %b to <2 x i64> + %sub = sub <2 x i64> %a.sext, %b.sext + %abs = call <2 x i64> @llvm.abs.v2i64(<2 x i64> %sub, i1 true) + %trunc = trunc <2 x i64> %abs to <2 x i32> + ret <2 x i32> %trunc +} + +define <2 x i32> @sabd_2s_promoted_ops(<2 x i16> %a, <2 x i16> %b) { +; CHECK-LABEL: sabd_2s_promoted_ops: +; CHECK: @ %bb.0: +; CHECK-NEXT: vmov d16, r2, r3 +; CHECK-NEXT: vmov d17, r0, r1 +; CHECK-NEXT: vshl.i32 d16, d16, #16 +; CHECK-NEXT: vshl.i32 d17, d17, #16 +; CHECK-NEXT: vshr.s32 d16, d16, #16 +; CHECK-NEXT: 
vshr.s32 d17, d17, #16 +; CHECK-NEXT: vsub.i32 d16, d17, d16 +; CHECK-NEXT: vabs.s32 d16, d16 +; CHECK-NEXT: vmov r0, r1, d16 +; CHECK-NEXT: mov pc, lr + %a.sext = sext <2 x i16> %a to <2 x i32> + %b.sext = sext <2 x i16> %b to <2 x i32> + %sub = sub <2 x i32> %a.sext, %b.sext + %abs = call <2 x i32> @llvm.abs.v2i32(<2 x i32> %sub, i1 true) + ret <2 x i32> %abs +} + +define <4 x i32> @sabd_4s(<4 x i32> %a, <4 x i32> %b) { +; CHECK-LABEL: sabd_4s: +; CHECK: @ %bb.0: +; CHECK-NEXT: mov r12, sp +; CHECK-NEXT: vld1.64 {d16, d17}, [r12] +; CHECK-NEXT: vmov d18, r2, r3 +; CHECK-NEXT: vmov d19, r0, r1 +; CHECK-NEXT: vsubl.s32 q10, d18, d17 +; CHECK-NEXT: vsubl.s32 q8, d19, d16 +; CHECK-NEXT: vshr.s64 q9, q10, #63 +; CHECK-NEXT: vshr.s64 q11, q8, #63 +; CHECK-NEXT: veor q10, q10, q9 +; CHECK-NEXT: veor q8, q8, q11 +; CHECK-NEXT: vsub.i64 q9, q10, q9 +; CHECK-NEXT: vsub.i64 q8, q8, q11 +; CHECK-NEXT: vmovn.i64 d19, q9 +; CHECK-NEXT: vmovn.i64 d18, q8 +; CHECK-NEXT: vmov r2, r3, d19 +; CHECK-NEXT: vmov r0, r1, d18 +; CHECK-NEXT: mov pc, lr + %a.sext = sext <4 x i32> %a to <4 x i64> + %b.sext = sext <4 x i32> %b to <4 x i64> + %sub = sub <4 x i64> %a.sext, %b.sext + %abs = call <4 x i64> @llvm.abs.v4i64(<4 x i64> %sub, i1 true) + %trunc = trunc <4 x i64> %abs to <4 x i32> + ret <4 x i32> %trunc +} + +define <4 x i32> @sabd_4s_promoted_ops(<4 x i16> %a, <4 x i16> %b) { +; CHECK-LABEL: sabd_4s_promoted_ops: +; CHECK: @ %bb.0: +; CHECK-NEXT: vmov d16, r2, r3 +; CHECK-NEXT: vmov d17, r0, r1 +; CHECK-NEXT: vsubl.s16 q8, d17, d16 +; CHECK-NEXT: vabs.s32 q8, q8 +; CHECK-NEXT: vmov r0, r1, d16 +; CHECK-NEXT: vmov r2, r3, d17 +; CHECK-NEXT: mov pc, lr + %a.sext = sext <4 x i16> %a to <4 x i32> + %b.sext = sext <4 x i16> %b to <4 x i32> + %sub = sub <4 x i32> %a.sext, %b.sext + %abs = call <4 x i32> @llvm.abs.v4i32(<4 x i32> %sub, i1 true) + ret <4 x i32> %abs +} + +define <2 x i64> @sabd_2d(<2 x i64> %a, <2 x i64> %b) { +; CHECK-LABEL: sabd_2d: +; CHECK: @ %bb.0: +; CHECK-NEXT: .save 
{r4, r5, r6, r7, r11, lr} +; CHECK-NEXT: push {r4, r5, r6, r7, r11, lr} +; CHECK-NEXT: add r12, sp, #24 +; CHECK-NEXT: asr r6, r3, #31 +; CHECK-NEXT: vld1.64 {d16, d17}, [r12] +; CHECK-NEXT: vmov r12, lr, d17 +; CHECK-NEXT: vmov r7, r5, d16 +; CHECK-NEXT: subs r2, r2, r12 +; CHECK-NEXT: sbcs r3, r3, lr +; CHECK-NEXT: sbcs r4, r6, lr, asr #31 +; CHECK-NEXT: sbc r6, r6, lr, asr #31 +; CHECK-NEXT: eor r2, r2, r6, asr #31 +; CHECK-NEXT: eor r3, r3, r6, asr #31 +; CHECK-NEXT: subs r2, r2, r6, asr #31 +; CHECK-NEXT: sbc r3, r3, r6, asr #31 +; CHECK-NEXT: subs r0, r0, r7 +; CHECK-NEXT: asr r6, r1, #31 +; CHECK-NEXT: sbcs r1, r1, r5 +; CHECK-NEXT: sbcs r7, r6, r5, asr #31 +; CHECK-NEXT: vmov.32 d17[0], r2 +; CHECK-NEXT: sbc r7, r6, r5, asr #31 +; CHECK-NEXT: eor r0, r0, r7, asr #31 +; CHECK-NEXT: subs r0, r0, r7, asr #31 +; CHECK-NEXT: vmov.32 d16[0], r0 +; CHECK-NEXT: eor r0, r1, r7, asr #31 +; CHECK-NEXT: sbc r0, r0, r7, asr #31 +; CHECK-NEXT: vmov.32 d17[1], r3 +; CHECK-NEXT: vmov.32 d16[1], r0 +; CHECK-NEXT: vmov r2, r3, d17 +; CHECK-NEXT: vmov r0, r1, d16 +; CHECK-NEXT: pop {r4, r5, r6, r7, r11, lr} +; CHECK-NEXT: mov pc, lr + %a.sext = sext <2 x i64> %a to <2 x i128> + %b.sext = sext <2 x i64> %b to <2 x i128> + %sub = sub <2 x i128> %a.sext, %b.sext + %abs = call <2 x i128> @llvm.abs.v2i128(<2 x i128> %sub, i1 true) + %trunc = trunc <2 x i128> %abs to <2 x i64> + ret <2 x i64> %trunc +} + +define <2 x i64> @sabd_2d_promoted_ops(<2 x i32> %a, <2 x i32> %b) { +; CHECK-LABEL: sabd_2d_promoted_ops: +; CHECK: @ %bb.0: +; CHECK-NEXT: vmov d16, r2, r3 +; CHECK-NEXT: vmov d17, r0, r1 +; CHECK-NEXT: vsubl.s32 q8, d17, d16 +; CHECK-NEXT: vshr.s64 q9, q8, #63 +; CHECK-NEXT: veor q8, q8, q9 +; CHECK-NEXT: vsub.i64 q8, q8, q9 +; CHECK-NEXT: vmov r0, r1, d16 +; CHECK-NEXT: vmov r2, r3, d17 +; CHECK-NEXT: mov pc, lr + %a.sext = sext <2 x i32> %a to <2 x i64> + %b.sext = sext <2 x i32> %b to <2 x i64> + %sub = sub <2 x i64> %a.sext, %b.sext + %abs = call <2 x i64> 
@llvm.abs.v2i64(<2 x i64> %sub, i1 true) + ret <2 x i64> %abs +} + +; +; UABD +; + +define <8 x i8> @uabd_8b(<8 x i8> %a, <8 x i8> %b) { +; CHECK-LABEL: uabd_8b: +; CHECK: @ %bb.0: +; CHECK-NEXT: vmov d16, r2, r3 +; CHECK-NEXT: vmov d17, r0, r1 +; CHECK-NEXT: vabdl.u8 q8, d17, d16 +; CHECK-NEXT: vmovn.i16 d16, q8 +; CHECK-NEXT: vmov r0, r1, d16 +; CHECK-NEXT: mov pc, lr + %a.zext = zext <8 x i8> %a to <8 x i16> + %b.zext = zext <8 x i8> %b to <8 x i16> + %sub = sub <8 x i16> %a.zext, %b.zext + %abs = call <8 x i16> @llvm.abs.v8i16(<8 x i16> %sub, i1 true) + %trunc = trunc <8 x i16> %abs to <8 x i8> + ret <8 x i8> %trunc +} + +define <16 x i8> @uabd_16b(<16 x i8> %a, <16 x i8> %b) { +; CHECK-LABEL: uabd_16b: +; CHECK: @ %bb.0: +; CHECK-NEXT: mov r12, sp +; CHECK-NEXT: vld1.64 {d16, d17}, [r12] +; CHECK-NEXT: vmov d18, r2, r3 +; CHECK-NEXT: vmov d19, r0, r1 +; CHECK-NEXT: vabdl.u8 q10, d18, d17 +; CHECK-NEXT: vabdl.u8 q8, d19, d16 +; CHECK-NEXT: vmovn.i16 d19, q10 +; CHECK-NEXT: vmovn.i16 d18, q8 +; CHECK-NEXT: vmov r2, r3, d19 +; CHECK-NEXT: vmov r0, r1, d18 +; CHECK-NEXT: mov pc, lr + %a.zext = zext <16 x i8> %a to <16 x i16> + %b.zext = zext <16 x i8> %b to <16 x i16> + %sub = sub <16 x i16> %a.zext, %b.zext + %abs = call <16 x i16> @llvm.abs.v16i16(<16 x i16> %sub, i1 true) + %trunc = trunc <16 x i16> %abs to <16 x i8> + ret <16 x i8> %trunc +} + +define <4 x i16> @uabd_4h(<4 x i16> %a, <4 x i16> %b) { +; CHECK-LABEL: uabd_4h: +; CHECK: @ %bb.0: +; CHECK-NEXT: vmov d16, r2, r3 +; CHECK-NEXT: vmov d17, r0, r1 +; CHECK-NEXT: vabdl.u16 q8, d17, d16 +; CHECK-NEXT: vmovn.i32 d16, q8 +; CHECK-NEXT: vmov r0, r1, d16 +; CHECK-NEXT: mov pc, lr + %a.zext = zext <4 x i16> %a to <4 x i32> + %b.zext = zext <4 x i16> %b to <4 x i32> + %sub = sub <4 x i32> %a.zext, %b.zext + %abs = call <4 x i32> @llvm.abs.v4i32(<4 x i32> %sub, i1 true) + %trunc = trunc <4 x i32> %abs to <4 x i16> + ret <4 x i16> %trunc +} + +define <4 x i16> @uabd_4h_promoted_ops(<4 x i8> %a, <4 x i8> %b) { +; 
CHECK-LABEL: uabd_4h_promoted_ops: +; CHECK: @ %bb.0: +; CHECK-NEXT: vmov d16, r2, r3 +; CHECK-NEXT: vmov d17, r0, r1 +; CHECK-NEXT: vbic.i16 d16, #0xff00 +; CHECK-NEXT: vbic.i16 d17, #0xff00 +; CHECK-NEXT: vsub.i16 d16, d17, d16 +; CHECK-NEXT: vabs.s16 d16, d16 +; CHECK-NEXT: vmov r0, r1, d16 +; CHECK-NEXT: mov pc, lr + %a.zext = zext <4 x i8> %a to <4 x i16> + %b.zext = zext <4 x i8> %b to <4 x i16> + %sub = sub <4 x i16> %a.zext, %b.zext + %abs = call <4 x i16> @llvm.abs.v4i16(<4 x i16> %sub, i1 true) + ret <4 x i16> %abs +} + +define <8 x i16> @uabd_8h(<8 x i16> %a, <8 x i16> %b) { +; CHECK-LABEL: uabd_8h: +; CHECK: @ %bb.0: +; CHECK-NEXT: mov r12, sp +; CHECK-NEXT: vld1.64 {d16, d17}, [r12] +; CHECK-NEXT: vmov d18, r2, r3 +; CHECK-NEXT: vmov d19, r0, r1 +; CHECK-NEXT: vabdl.u16 q10, d18, d17 +; CHECK-NEXT: vabdl.u16 q8, d19, d16 +; CHECK-NEXT: vmovn.i32 d19, q10 +; CHECK-NEXT: vmovn.i32 d18, q8 +; CHECK-NEXT: vmov r2, r3, d19 +; CHECK-NEXT: vmov r0, r1, d18 +; CHECK-NEXT: mov pc, lr + %a.zext = zext <8 x i16> %a to <8 x i32> + %b.zext = zext <8 x i16> %b to <8 x i32> + %sub = sub <8 x i32> %a.zext, %b.zext + %abs = call <8 x i32> @llvm.abs.v8i32(<8 x i32> %sub, i1 true) + %trunc = trunc <8 x i32> %abs to <8 x i16> + ret <8 x i16> %trunc +} + +define <8 x i16> @uabd_8h_promoted_ops(<8 x i8> %a, <8 x i8> %b) { +; CHECK-LABEL: uabd_8h_promoted_ops: +; CHECK: @ %bb.0: +; CHECK-NEXT: vmov d16, r2, r3 +; CHECK-NEXT: vmov d17, r0, r1 +; CHECK-NEXT: vabdl.u8 q8, d17, d16 +; CHECK-NEXT: vmov r0, r1, d16 +; CHECK-NEXT: vmov r2, r3, d17 +; CHECK-NEXT: mov pc, lr + %a.zext = zext <8 x i8> %a to <8 x i16> + %b.zext = zext <8 x i8> %b to <8 x i16> + %sub = sub <8 x i16> %a.zext, %b.zext + %abs = call <8 x i16> @llvm.abs.v8i16(<8 x i16> %sub, i1 true) + ret <8 x i16> %abs +} + +define <2 x i32> @uabd_2s(<2 x i32> %a, <2 x i32> %b) { +; CHECK-LABEL: uabd_2s: +; CHECK: @ %bb.0: +; CHECK-NEXT: vmov d16, r2, r3 +; CHECK-NEXT: vmov d17, r0, r1 +; CHECK-NEXT: vsubl.u32 q8, d17, 
d16 +; CHECK-NEXT: vshr.s64 q9, q8, #63 +; CHECK-NEXT: veor q8, q8, q9 +; CHECK-NEXT: vsub.i64 q8, q8, q9 +; CHECK-NEXT: vmovn.i64 d16, q8 +; CHECK-NEXT: vmov r0, r1, d16 +; CHECK-NEXT: mov pc, lr + %a.zext = zext <2 x i32> %a to <2 x i64> + %b.zext = zext <2 x i32> %b to <2 x i64> + %sub = sub <2 x i64> %a.zext, %b.zext + %abs = call <2 x i64> @llvm.abs.v2i64(<2 x i64> %sub, i1 true) + %trunc = trunc <2 x i64> %abs to <2 x i32> + ret <2 x i32> %trunc +} + +define <2 x i32> @uabd_2s_promoted_ops(<2 x i16> %a, <2 x i16> %b) { +; CHECK-LABEL: uabd_2s_promoted_ops: +; CHECK: @ %bb.0: +; CHECK-NEXT: vmov.i32 d16, #0xffff +; CHECK-NEXT: vmov d17, r2, r3 +; CHECK-NEXT: vmov d18, r0, r1 +; CHECK-NEXT: vand d17, d17, d16 +; CHECK-NEXT: vand d16, d18, d16 +; CHECK-NEXT: vsub.i32 d16, d16, d17 +; CHECK-NEXT: vabs.s32 d16, d16 +; CHECK-NEXT: vmov r0, r1, d16 +; CHECK-NEXT: mov pc, lr + %a.zext = zext <2 x i16> %a to <2 x i32> + %b.zext = zext <2 x i16> %b to <2 x i32> + %sub = sub <2 x i32> %a.zext, %b.zext + %abs = call <2 x i32> @llvm.abs.v2i32(<2 x i32> %sub, i1 true) + ret <2 x i32> %abs +} + +define <4 x i32> @uabd_4s(<4 x i32> %a, <4 x i32> %b) { +; CHECK-LABEL: uabd_4s: +; CHECK: @ %bb.0: +; CHECK-NEXT: mov r12, sp +; CHECK-NEXT: vld1.64 {d16, d17}, [r12] +; CHECK-NEXT: vmov d18, r2, r3 +; CHECK-NEXT: vmov d19, r0, r1 +; CHECK-NEXT: vsubl.u32 q10, d18, d17 +; CHECK-NEXT: vsubl.u32 q8, d19, d16 +; CHECK-NEXT: vshr.s64 q9, q10, #63 +; CHECK-NEXT: vshr.s64 q11, q8, #63 +; CHECK-NEXT: veor q10, q10, q9 +; CHECK-NEXT: veor q8, q8, q11 +; CHECK-NEXT: vsub.i64 q9, q10, q9 +; CHECK-NEXT: vsub.i64 q8, q8, q11 +; CHECK-NEXT: vmovn.i64 d19, q9 +; CHECK-NEXT: vmovn.i64 d18, q8 +; CHECK-NEXT: vmov r2, r3, d19 +; CHECK-NEXT: vmov r0, r1, d18 +; CHECK-NEXT: mov pc, lr + %a.zext = zext <4 x i32> %a to <4 x i64> + %b.zext = zext <4 x i32> %b to <4 x i64> + %sub = sub <4 x i64> %a.zext, %b.zext + %abs = call <4 x i64> @llvm.abs.v4i64(<4 x i64> %sub, i1 true) + %trunc = trunc <4 x i64> 
%abs to <4 x i32> + ret <4 x i32> %trunc +} + +define <4 x i32> @uabd_4s_promoted_ops(<4 x i16> %a, <4 x i16> %b) { +; CHECK-LABEL: uabd_4s_promoted_ops: +; CHECK: @ %bb.0: +; CHECK-NEXT: vmov d16, r2, r3 +; CHECK-NEXT: vmov d17, r0, r1 +; CHECK-NEXT: vabdl.u16 q8, d17, d16 +; CHECK-NEXT: vmov r0, r1, d16 +; CHECK-NEXT: vmov r2, r3, d17 +; CHECK-NEXT: mov pc, lr + %a.zext = zext <4 x i16> %a to <4 x i32> + %b.zext = zext <4 x i16> %b to <4 x i32> + %sub = sub <4 x i32> %a.zext, %b.zext + %abs = call <4 x i32> @llvm.abs.v4i32(<4 x i32> %sub, i1 true) + ret <4 x i32> %abs +} + +define <2 x i64> @uabd_2d(<2 x i64> %a, <2 x i64> %b) { +; CHECK-LABEL: uabd_2d: +; CHECK: @ %bb.0: +; CHECK-NEXT: .save {r4, r5, r6, r7, r11, lr} +; CHECK-NEXT: push {r4, r5, r6, r7, r11, lr} +; CHECK-NEXT: add r12, sp, #24 +; CHECK-NEXT: mov r6, #0 +; CHECK-NEXT: vld1.64 {d16, d17}, [r12] +; CHECK-NEXT: vmov r12, lr, d17 +; CHECK-NEXT: vmov r4, r7, d16 +; CHECK-NEXT: subs r2, r2, r12 +; CHECK-NEXT: sbcs r3, r3, lr +; CHECK-NEXT: sbcs r5, r6, #0 +; CHECK-NEXT: sbc r5, r6, #0 +; CHECK-NEXT: eor r2, r2, r5, asr #31 +; CHECK-NEXT: eor r3, r3, r5, asr #31 +; CHECK-NEXT: subs r2, r2, r5, asr #31 +; CHECK-NEXT: sbc r3, r3, r5, asr #31 +; CHECK-NEXT: subs r0, r0, r4 +; CHECK-NEXT: sbcs r1, r1, r7 +; CHECK-NEXT: vmov.32 d17[0], r2 +; CHECK-NEXT: sbcs r7, r6, #0 +; CHECK-NEXT: sbc r7, r6, #0 +; CHECK-NEXT: eor r0, r0, r7, asr #31 +; CHECK-NEXT: subs r0, r0, r7, asr #31 +; CHECK-NEXT: vmov.32 d16[0], r0 +; CHECK-NEXT: eor r0, r1, r7, asr #31 +; CHECK-NEXT: sbc r0, r0, r7, asr #31 +; CHECK-NEXT: vmov.32 d17[1], r3 +; CHECK-NEXT: vmov.32 d16[1], r0 +; CHECK-NEXT: vmov r2, r3, d17 +; CHECK-NEXT: vmov r0, r1, d16 +; CHECK-NEXT: pop {r4, r5, r6, r7, r11, lr} +; CHECK-NEXT: mov pc, lr + %a.zext = zext <2 x i64> %a to <2 x i128> + %b.zext = zext <2 x i64> %b to <2 x i128> + %sub = sub <2 x i128> %a.zext, %b.zext + %abs = call <2 x i128> @llvm.abs.v2i128(<2 x i128> %sub, i1 true) + %trunc = trunc <2 x i128> 
%abs to <2 x i64> + ret <2 x i64> %trunc +} + +define <2 x i64> @uabd_2d_promoted_ops(<2 x i32> %a, <2 x i32> %b) { +; CHECK-LABEL: uabd_2d_promoted_ops: +; CHECK: @ %bb.0: +; CHECK-NEXT: vmov d16, r2, r3 +; CHECK-NEXT: vmov d17, r0, r1 +; CHECK-NEXT: vsubl.u32 q8, d17, d16 +; CHECK-NEXT: vshr.s64 q9, q8, #63 +; CHECK-NEXT: veor q8, q8, q9 +; CHECK-NEXT: vsub.i64 q8, q8, q9 +; CHECK-NEXT: vmov r0, r1, d16 +; CHECK-NEXT: vmov r2, r3, d17 +; CHECK-NEXT: mov pc, lr + %a.zext = zext <2 x i32> %a to <2 x i64> + %b.zext = zext <2 x i32> %b to <2 x i64> + %sub = sub <2 x i64> %a.zext, %b.zext + %abs = call <2 x i64> @llvm.abs.v2i64(<2 x i64> %sub, i1 true) + ret <2 x i64> %abs +} + +define <16 x i8> @uabd_v16i8_nuw(<16 x i8> %a, <16 x i8> %b) { +; CHECK-LABEL: uabd_v16i8_nuw: +; CHECK: @ %bb.0: +; CHECK-NEXT: vmov d17, r2, r3 +; CHECK-NEXT: mov r12, sp +; CHECK-NEXT: vld1.64 {d18, d19}, [r12] +; CHECK-NEXT: vmov d16, r0, r1 +; CHECK-NEXT: vsub.i8 q8, q8, q9 +; CHECK-NEXT: vabs.s8 q8, q8 +; CHECK-NEXT: vmov r0, r1, d16 +; CHECK-NEXT: vmov r2, r3, d17 +; CHECK-NEXT: mov pc, lr + %sub = sub nuw <16 x i8> %a, %b + %abs = call <16 x i8> @llvm.abs.v16i8(<16 x i8> %sub, i1 true) + ret <16 x i8> %abs +} + +define <8 x i16> @uabd_v8i16_nuw(<8 x i16> %a, <8 x i16> %b) { +; CHECK-LABEL: uabd_v8i16_nuw: +; CHECK: @ %bb.0: +; CHECK-NEXT: vmov d17, r2, r3 +; CHECK-NEXT: mov r12, sp +; CHECK-NEXT: vld1.64 {d18, d19}, [r12] +; CHECK-NEXT: vmov d16, r0, r1 +; CHECK-NEXT: vsub.i16 q8, q8, q9 +; CHECK-NEXT: vabs.s16 q8, q8 +; CHECK-NEXT: vmov r0, r1, d16 +; CHECK-NEXT: vmov r2, r3, d17 +; CHECK-NEXT: mov pc, lr + %sub = sub nuw <8 x i16> %a, %b + %abs = call <8 x i16> @llvm.abs.v8i16(<8 x i16> %sub, i1 true) + ret <8 x i16> %abs +} + +define <4 x i32> @uabd_v4i32_nuw(<4 x i32> %a, <4 x i32> %b) { +; CHECK-LABEL: uabd_v4i32_nuw: +; CHECK: @ %bb.0: +; CHECK-NEXT: vmov d17, r2, r3 +; CHECK-NEXT: mov r12, sp +; CHECK-NEXT: vld1.64 {d18, d19}, [r12] +; CHECK-NEXT: vmov d16, r0, r1 +; CHECK-NEXT: 
vsub.i32 q8, q8, q9 +; CHECK-NEXT: vabs.s32 q8, q8 +; CHECK-NEXT: vmov r0, r1, d16 +; CHECK-NEXT: vmov r2, r3, d17 +; CHECK-NEXT: mov pc, lr + %sub = sub nuw <4 x i32> %a, %b + %abs = call <4 x i32> @llvm.abs.v4i32(<4 x i32> %sub, i1 true) + ret <4 x i32> %abs +} + +define <2 x i64> @uabd_v2i64_nuw(<2 x i64> %a, <2 x i64> %b) { +; CHECK-LABEL: uabd_v2i64_nuw: +; CHECK: @ %bb.0: +; CHECK-NEXT: vmov d17, r2, r3 +; CHECK-NEXT: mov r12, sp +; CHECK-NEXT: vld1.64 {d18, d19}, [r12] +; CHECK-NEXT: vmov d16, r0, r1 +; CHECK-NEXT: vsub.i64 q8, q8, q9 +; CHECK-NEXT: vshr.s64 q9, q8, #63 +; CHECK-NEXT: veor q8, q8, q9 +; CHECK-NEXT: vsub.i64 q8, q8, q9 +; CHECK-NEXT: vmov r0, r1, d16 +; CHECK-NEXT: vmov r2, r3, d17 +; CHECK-NEXT: mov pc, lr + %sub = sub nuw <2 x i64> %a, %b + %abs = call <2 x i64> @llvm.abs.v2i64(<2 x i64> %sub, i1 true) + ret <2 x i64> %abs +} + +define <16 x i8> @sabd_v16i8_nsw(<16 x i8> %a, <16 x i8> %b) { +; CHECK-LABEL: sabd_v16i8_nsw: +; CHECK: @ %bb.0: +; CHECK-NEXT: vmov d17, r2, r3 +; CHECK-NEXT: mov r12, sp +; CHECK-NEXT: vld1.64 {d18, d19}, [r12] +; CHECK-NEXT: vmov d16, r0, r1 +; CHECK-NEXT: vsub.i8 q8, q8, q9 +; CHECK-NEXT: vabs.s8 q8, q8 +; CHECK-NEXT: vmov r0, r1, d16 +; CHECK-NEXT: vmov r2, r3, d17 +; CHECK-NEXT: mov pc, lr + %sub = sub nsw <16 x i8> %a, %b + %abs = call <16 x i8> @llvm.abs.v16i8(<16 x i8> %sub, i1 true) + ret <16 x i8> %abs +} + +define <8 x i16> @sabd_v8i16_nsw(<8 x i16> %a, <8 x i16> %b) { +; CHECK-LABEL: sabd_v8i16_nsw: +; CHECK: @ %bb.0: +; CHECK-NEXT: vmov d17, r2, r3 +; CHECK-NEXT: mov r12, sp +; CHECK-NEXT: vld1.64 {d18, d19}, [r12] +; CHECK-NEXT: vmov d16, r0, r1 +; CHECK-NEXT: vsub.i16 q8, q8, q9 +; CHECK-NEXT: vabs.s16 q8, q8 +; CHECK-NEXT: vmov r0, r1, d16 +; CHECK-NEXT: vmov r2, r3, d17 +; CHECK-NEXT: mov pc, lr + %sub = sub nsw <8 x i16> %a, %b + %abs = call <8 x i16> @llvm.abs.v8i16(<8 x i16> %sub, i1 true) + ret <8 x i16> %abs +} + +define <4 x i32> @sabd_v4i32_nsw(<4 x i32> %a, <4 x i32> %b) { +; CHECK-LABEL: 
sabd_v4i32_nsw: +; CHECK: @ %bb.0: +; CHECK-NEXT: vmov d17, r2, r3 +; CHECK-NEXT: mov r12, sp +; CHECK-NEXT: vld1.64 {d18, d19}, [r12] +; CHECK-NEXT: vmov d16, r0, r1 +; CHECK-NEXT: vsub.i32 q8, q8, q9 +; CHECK-NEXT: vabs.s32 q8, q8 +; CHECK-NEXT: vmov r0, r1, d16 +; CHECK-NEXT: vmov r2, r3, d17 +; CHECK-NEXT: mov pc, lr + %sub = sub nsw <4 x i32> %a, %b + %abs = call <4 x i32> @llvm.abs.v4i32(<4 x i32> %sub, i1 true) + ret <4 x i32> %abs +} + +define <2 x i64> @sabd_v2i64_nsw(<2 x i64> %a, <2 x i64> %b) { +; CHECK-LABEL: sabd_v2i64_nsw: +; CHECK: @ %bb.0: +; CHECK-NEXT: vmov d17, r2, r3 +; CHECK-NEXT: mov r12, sp +; CHECK-NEXT: vld1.64 {d18, d19}, [r12] +; CHECK-NEXT: vmov d16, r0, r1 +; CHECK-NEXT: vsub.i64 q8, q8, q9 +; CHECK-NEXT: vshr.s64 q9, q8, #63 +; CHECK-NEXT: veor q8, q8, q9 +; CHECK-NEXT: vsub.i64 q8, q8, q9 +; CHECK-NEXT: vmov r0, r1, d16 +; CHECK-NEXT: vmov r2, r3, d17 +; CHECK-NEXT: mov pc, lr + %sub = sub nsw <2 x i64> %a, %b + %abs = call <2 x i64> @llvm.abs.v2i64(<2 x i64> %sub, i1 true) + ret <2 x i64> %abs +} + +define <16 x i8> @smaxmin_v16i8(<16 x i8> %0, <16 x i8> %1) { +; CHECK-LABEL: smaxmin_v16i8: +; CHECK: @ %bb.0: +; CHECK-NEXT: vmov d17, r2, r3 +; CHECK-NEXT: mov r12, sp +; CHECK-NEXT: vld1.64 {d18, d19}, [r12] +; CHECK-NEXT: vmov d16, r0, r1 +; CHECK-NEXT: vmin.s8 q10, q8, q9 +; CHECK-NEXT: vmax.s8 q8, q8, q9 +; CHECK-NEXT: vsub.i8 q8, q8, q10 +; CHECK-NEXT: vmov r0, r1, d16 +; CHECK-NEXT: vmov r2, r3, d17 +; CHECK-NEXT: mov pc, lr + %a = tail call <16 x i8> @llvm.smax.v16i8(<16 x i8> %0, <16 x i8> %1) + %b = tail call <16 x i8> @llvm.smin.v16i8(<16 x i8> %0, <16 x i8> %1) + %sub = sub <16 x i8> %a, %b + ret <16 x i8> %sub +} + +define <8 x i16> @smaxmin_v8i16(<8 x i16> %0, <8 x i16> %1) { +; CHECK-LABEL: smaxmin_v8i16: +; CHECK: @ %bb.0: +; CHECK-NEXT: vmov d17, r2, r3 +; CHECK-NEXT: mov r12, sp +; CHECK-NEXT: vld1.64 {d18, d19}, [r12] +; CHECK-NEXT: vmov d16, r0, r1 +; CHECK-NEXT: vmin.s16 q10, q8, q9 +; CHECK-NEXT: vmax.s16 q8, q8, 
q9 +; CHECK-NEXT: vsub.i16 q8, q8, q10 +; CHECK-NEXT: vmov r0, r1, d16 +; CHECK-NEXT: vmov r2, r3, d17 +; CHECK-NEXT: mov pc, lr + %a = tail call <8 x i16> @llvm.smax.v8i16(<8 x i16> %0, <8 x i16> %1) + %b = tail call <8 x i16> @llvm.smin.v8i16(<8 x i16> %0, <8 x i16> %1) + %sub = sub <8 x i16> %a, %b + ret <8 x i16> %sub +} + +define <4 x i32> @smaxmin_v4i32(<4 x i32> %0, <4 x i32> %1) { +; CHECK-LABEL: smaxmin_v4i32: +; CHECK: @ %bb.0: +; CHECK-NEXT: vmov d17, r2, r3 +; CHECK-NEXT: mov r12, sp +; CHECK-NEXT: vld1.64 {d18, d19}, [r12] +; CHECK-NEXT: vmov d16, r0, r1 +; CHECK-NEXT: vmin.s32 q10, q8, q9 +; CHECK-NEXT: vmax.s32 q8, q8, q9 +; CHECK-NEXT: vsub.i32 q8, q8, q10 +; CHECK-NEXT: vmov r0, r1, d16 +; CHECK-NEXT: vmov r2, r3, d17 +; CHECK-NEXT: mov pc, lr + %a = tail call <4 x i32> @llvm.smax.v4i32(<4 x i32> %0, <4 x i32> %1) + %b = tail call <4 x i32> @llvm.smin.v4i32(<4 x i32> %0, <4 x i32> %1) + %sub = sub <4 x i32> %a, %b + ret <4 x i32> %sub +} + +define <2 x i64> @smaxmin_v2i64(<2 x i64> %0, <2 x i64> %1) { +; CHECK-LABEL: smaxmin_v2i64: +; CHECK: @ %bb.0: +; CHECK-NEXT: .save {r4, r5, r6, r7, r8, lr} +; CHECK-NEXT: push {r4, r5, r6, r7, r8, lr} +; CHECK-NEXT: add r6, sp, #24 +; CHECK-NEXT: mov r8, #0 +; CHECK-NEXT: vld1.64 {d18, d19}, [r6] +; CHECK-NEXT: vmov r7, r12, d19 +; CHECK-NEXT: vmov r4, lr, d18 +; CHECK-NEXT: subs r5, r2, r7 +; CHECK-NEXT: sbcs r5, r3, r12 +; CHECK-NEXT: mov r6, r7 +; CHECK-NEXT: mov r5, #0 +; CHECK-NEXT: movlt r5, #1 +; CHECK-NEXT: cmp r5, #0 +; CHECK-NEXT: movne r6, r2 +; CHECK-NEXT: mov r5, r12 +; CHECK-NEXT: vmov.32 d17[0], r6 +; CHECK-NEXT: movne r5, r3 +; CHECK-NEXT: mov r6, r4 +; CHECK-NEXT: vmov.32 d17[1], r5 +; CHECK-NEXT: subs r5, r4, r0 +; CHECK-NEXT: sbcs r5, lr, r1 +; CHECK-NEXT: mov r5, #0 +; CHECK-NEXT: movlt r5, #1 +; CHECK-NEXT: cmp r5, #0 +; CHECK-NEXT: movne r6, r0 +; CHECK-NEXT: vmov.32 d18[0], r6 +; CHECK-NEXT: subs r6, r7, r2 +; CHECK-NEXT: sbcs r6, r12, r3 +; CHECK-NEXT: mov r6, #0 +; CHECK-NEXT: movlt 
r6, #1 +; CHECK-NEXT: cmp r6, #0 +; CHECK-NEXT: movne r7, r2 +; CHECK-NEXT: subs r2, r0, r4 +; CHECK-NEXT: sbcs r2, r1, lr +; CHECK-NEXT: vmov.32 d19[0], r7 +; CHECK-NEXT: movlt r8, #1 +; CHECK-NEXT: cmp r8, #0 +; CHECK-NEXT: movne r4, r0 +; CHECK-NEXT: mov r0, lr +; CHECK-NEXT: vmov.32 d16[0], r4 +; CHECK-NEXT: movne r0, r1 +; CHECK-NEXT: cmp r6, #0 +; CHECK-NEXT: movne r12, r3 +; CHECK-NEXT: cmp r5, #0 +; CHECK-NEXT: vmov.32 d16[1], r0 +; CHECK-NEXT: movne lr, r1 +; CHECK-NEXT: vmov.32 d19[1], r12 +; CHECK-NEXT: vmov.32 d18[1], lr +; CHECK-NEXT: vsub.i64 q8, q9, q8 +; CHECK-NEXT: vmov r0, r1, d16 +; CHECK-NEXT: vmov r2, r3, d17 +; CHECK-NEXT: pop {r4, r5, r6, r7, r8, lr} +; CHECK-NEXT: mov pc, lr + %a = tail call <2 x i64> @llvm.smax.v2i64(<2 x i64> %0, <2 x i64> %1) + %b = tail call <2 x i64> @llvm.smin.v2i64(<2 x i64> %0, <2 x i64> %1) + %sub = sub <2 x i64> %a, %b + ret <2 x i64> %sub +} + +define <16 x i8> @umaxmin_v16i8(<16 x i8> %0, <16 x i8> %1) { +; CHECK-LABEL: umaxmin_v16i8: +; CHECK: @ %bb.0: +; CHECK-NEXT: vmov d17, r2, r3 +; CHECK-NEXT: mov r12, sp +; CHECK-NEXT: vld1.64 {d18, d19}, [r12] +; CHECK-NEXT: vmov d16, r0, r1 +; CHECK-NEXT: vmin.u8 q10, q8, q9 +; CHECK-NEXT: vmax.u8 q8, q8, q9 +; CHECK-NEXT: vsub.i8 q8, q8, q10 +; CHECK-NEXT: vmov r0, r1, d16 +; CHECK-NEXT: vmov r2, r3, d17 +; CHECK-NEXT: mov pc, lr + %a = tail call <16 x i8> @llvm.umax.v16i8(<16 x i8> %0, <16 x i8> %1) + %b = tail call <16 x i8> @llvm.umin.v16i8(<16 x i8> %0, <16 x i8> %1) + %sub = sub <16 x i8> %a, %b + ret <16 x i8> %sub +} + +define <8 x i16> @umaxmin_v8i16(<8 x i16> %0, <8 x i16> %1) { +; CHECK-LABEL: umaxmin_v8i16: +; CHECK: @ %bb.0: +; CHECK-NEXT: vmov d17, r2, r3 +; CHECK-NEXT: mov r12, sp +; CHECK-NEXT: vld1.64 {d18, d19}, [r12] +; CHECK-NEXT: vmov d16, r0, r1 +; CHECK-NEXT: vmin.u16 q10, q8, q9 +; CHECK-NEXT: vmax.u16 q8, q8, q9 +; CHECK-NEXT: vsub.i16 q8, q8, q10 +; CHECK-NEXT: vmov r0, r1, d16 +; CHECK-NEXT: vmov r2, r3, d17 +; CHECK-NEXT: mov pc, lr + %a = 
tail call <8 x i16> @llvm.umax.v8i16(<8 x i16> %0, <8 x i16> %1) + %b = tail call <8 x i16> @llvm.umin.v8i16(<8 x i16> %0, <8 x i16> %1) + %sub = sub <8 x i16> %a, %b + ret <8 x i16> %sub +} + +define <4 x i32> @umaxmin_v4i32(<4 x i32> %0, <4 x i32> %1) { +; CHECK-LABEL: umaxmin_v4i32: +; CHECK: @ %bb.0: +; CHECK-NEXT: vmov d17, r2, r3 +; CHECK-NEXT: mov r12, sp +; CHECK-NEXT: vld1.64 {d18, d19}, [r12] +; CHECK-NEXT: vmov d16, r0, r1 +; CHECK-NEXT: vmin.u32 q10, q8, q9 +; CHECK-NEXT: vmax.u32 q8, q8, q9 +; CHECK-NEXT: vsub.i32 q8, q8, q10 +; CHECK-NEXT: vmov r0, r1, d16 +; CHECK-NEXT: vmov r2, r3, d17 +; CHECK-NEXT: mov pc, lr + %a = tail call <4 x i32> @llvm.umax.v4i32(<4 x i32> %0, <4 x i32> %1) + %b = tail call <4 x i32> @llvm.umin.v4i32(<4 x i32> %0, <4 x i32> %1) + %sub = sub <4 x i32> %a, %b + ret <4 x i32> %sub +} + +define <2 x i64> @umaxmin_v2i64(<2 x i64> %0, <2 x i64> %1) { +; CHECK-LABEL: umaxmin_v2i64: +; CHECK: @ %bb.0: +; CHECK-NEXT: vmov d17, r2, r3 +; CHECK-NEXT: mov r12, sp +; CHECK-NEXT: vld1.64 {d18, d19}, [r12] +; CHECK-NEXT: vmov d16, r0, r1 +; CHECK-NEXT: vqsub.u64 q10, q8, q9 +; CHECK-NEXT: vqsub.u64 q9, q9, q8 +; CHECK-NEXT: vsub.i64 q10, q10, q8 +; CHECK-NEXT: vadd.i64 q8, q8, q9 +; CHECK-NEXT: vadd.i64 q8, q8, q10 +; CHECK-NEXT: vmov r0, r1, d16 +; CHECK-NEXT: vmov r2, r3, d17 +; CHECK-NEXT: mov pc, lr + %a = tail call <2 x i64> @llvm.umax.v2i64(<2 x i64> %0, <2 x i64> %1) + %b = tail call <2 x i64> @llvm.umin.v2i64(<2 x i64> %0, <2 x i64> %1) + %sub = sub <2 x i64> %a, %b + ret <2 x i64> %sub +} + +define <16 x i8> @umaxmin_v16i8_com1(<16 x i8> %0, <16 x i8> %1) { +; CHECK-LABEL: umaxmin_v16i8_com1: +; CHECK: @ %bb.0: +; CHECK-NEXT: vmov d17, r2, r3 +; CHECK-NEXT: vmov d16, r0, r1 +; CHECK-NEXT: mov r0, sp +; CHECK-NEXT: vld1.64 {d18, d19}, [r0] +; CHECK-NEXT: vmin.u8 q10, q9, q8 +; CHECK-NEXT: vmax.u8 q8, q8, q9 +; CHECK-NEXT: vsub.i8 q8, q8, q10 +; CHECK-NEXT: vmov r0, r1, d16 +; CHECK-NEXT: vmov r2, r3, d17 +; CHECK-NEXT: mov pc, lr + 
%a = tail call <16 x i8> @llvm.umax.v16i8(<16 x i8> %0, <16 x i8> %1) + %b = tail call <16 x i8> @llvm.umin.v16i8(<16 x i8> %1, <16 x i8> %0) + %sub = sub <16 x i8> %a, %b + ret <16 x i8> %sub +} diff --git a/llvm/test/CodeGen/ARM/vaba.ll b/llvm/test/CodeGen/ARM/vaba.ll index e4a61ea..14419a3 100644 --- a/llvm/test/CodeGen/ARM/vaba.ll +++ b/llvm/test/CodeGen/ARM/vaba.ll @@ -1,8 +1,15 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 ; RUN: llc -mtriple=arm-eabi -mattr=+neon %s -o - | FileCheck %s define <8 x i8> @vabas8(ptr %A, ptr %B, ptr %C) nounwind { -;CHECK-LABEL: vabas8: -;CHECK: vaba.s8 +; CHECK-LABEL: vabas8: +; CHECK: @ %bb.0: +; CHECK-NEXT: vldr d16, [r2] +; CHECK-NEXT: vldr d17, [r1] +; CHECK-NEXT: vldr d18, [r0] +; CHECK-NEXT: vaba.s8 d18, d17, d16 +; CHECK-NEXT: vmov r0, r1, d18 +; CHECK-NEXT: mov pc, lr %tmp1 = load <8 x i8>, ptr %A %tmp2 = load <8 x i8>, ptr %B %tmp3 = load <8 x i8>, ptr %C @@ -12,8 +19,14 @@ define <8 x i8> @vabas8(ptr %A, ptr %B, ptr %C) nounwind { } define <4 x i16> @vabas16(ptr %A, ptr %B, ptr %C) nounwind { -;CHECK-LABEL: vabas16: -;CHECK: vaba.s16 +; CHECK-LABEL: vabas16: +; CHECK: @ %bb.0: +; CHECK-NEXT: vldr d16, [r2] +; CHECK-NEXT: vldr d17, [r1] +; CHECK-NEXT: vldr d18, [r0] +; CHECK-NEXT: vaba.s16 d18, d17, d16 +; CHECK-NEXT: vmov r0, r1, d18 +; CHECK-NEXT: mov pc, lr %tmp1 = load <4 x i16>, ptr %A %tmp2 = load <4 x i16>, ptr %B %tmp3 = load <4 x i16>, ptr %C @@ -23,8 +36,14 @@ define <4 x i16> @vabas16(ptr %A, ptr %B, ptr %C) nounwind { } define <2 x i32> @vabas32(ptr %A, ptr %B, ptr %C) nounwind { -;CHECK-LABEL: vabas32: -;CHECK: vaba.s32 +; CHECK-LABEL: vabas32: +; CHECK: @ %bb.0: +; CHECK-NEXT: vldr d16, [r2] +; CHECK-NEXT: vldr d17, [r1] +; CHECK-NEXT: vldr d18, [r0] +; CHECK-NEXT: vaba.s32 d18, d17, d16 +; CHECK-NEXT: vmov r0, r1, d18 +; CHECK-NEXT: mov pc, lr %tmp1 = load <2 x i32>, ptr %A %tmp2 = load <2 x i32>, ptr %B %tmp3 = load <2 x i32>, ptr %C @@ -34,8 
+53,14 @@ define <2 x i32> @vabas32(ptr %A, ptr %B, ptr %C) nounwind { } define <8 x i8> @vabau8(ptr %A, ptr %B, ptr %C) nounwind { -;CHECK-LABEL: vabau8: -;CHECK: vaba.u8 +; CHECK-LABEL: vabau8: +; CHECK: @ %bb.0: +; CHECK-NEXT: vldr d16, [r2] +; CHECK-NEXT: vldr d17, [r1] +; CHECK-NEXT: vldr d18, [r0] +; CHECK-NEXT: vaba.u8 d18, d17, d16 +; CHECK-NEXT: vmov r0, r1, d18 +; CHECK-NEXT: mov pc, lr %tmp1 = load <8 x i8>, ptr %A %tmp2 = load <8 x i8>, ptr %B %tmp3 = load <8 x i8>, ptr %C @@ -45,8 +70,14 @@ define <8 x i8> @vabau8(ptr %A, ptr %B, ptr %C) nounwind { } define <4 x i16> @vabau16(ptr %A, ptr %B, ptr %C) nounwind { -;CHECK-LABEL: vabau16: -;CHECK: vaba.u16 +; CHECK-LABEL: vabau16: +; CHECK: @ %bb.0: +; CHECK-NEXT: vldr d16, [r2] +; CHECK-NEXT: vldr d17, [r1] +; CHECK-NEXT: vldr d18, [r0] +; CHECK-NEXT: vaba.u16 d18, d17, d16 +; CHECK-NEXT: vmov r0, r1, d18 +; CHECK-NEXT: mov pc, lr %tmp1 = load <4 x i16>, ptr %A %tmp2 = load <4 x i16>, ptr %B %tmp3 = load <4 x i16>, ptr %C @@ -56,8 +87,14 @@ define <4 x i16> @vabau16(ptr %A, ptr %B, ptr %C) nounwind { } define <2 x i32> @vabau32(ptr %A, ptr %B, ptr %C) nounwind { -;CHECK-LABEL: vabau32: -;CHECK: vaba.u32 +; CHECK-LABEL: vabau32: +; CHECK: @ %bb.0: +; CHECK-NEXT: vldr d16, [r2] +; CHECK-NEXT: vldr d17, [r1] +; CHECK-NEXT: vldr d18, [r0] +; CHECK-NEXT: vaba.u32 d18, d17, d16 +; CHECK-NEXT: vmov r0, r1, d18 +; CHECK-NEXT: mov pc, lr %tmp1 = load <2 x i32>, ptr %A %tmp2 = load <2 x i32>, ptr %B %tmp3 = load <2 x i32>, ptr %C @@ -67,8 +104,15 @@ define <2 x i32> @vabau32(ptr %A, ptr %B, ptr %C) nounwind { } define <16 x i8> @vabaQs8(ptr %A, ptr %B, ptr %C) nounwind { -;CHECK-LABEL: vabaQs8: -;CHECK: vaba.s8 +; CHECK-LABEL: vabaQs8: +; CHECK: @ %bb.0: +; CHECK-NEXT: vld1.64 {d16, d17}, [r2] +; CHECK-NEXT: vld1.64 {d18, d19}, [r1] +; CHECK-NEXT: vld1.64 {d20, d21}, [r0] +; CHECK-NEXT: vaba.s8 q10, q9, q8 +; CHECK-NEXT: vmov r0, r1, d20 +; CHECK-NEXT: vmov r2, r3, d21 +; CHECK-NEXT: mov pc, lr %tmp1 = load <16 x 
i8>, ptr %A %tmp2 = load <16 x i8>, ptr %B %tmp3 = load <16 x i8>, ptr %C @@ -78,8 +122,15 @@ define <16 x i8> @vabaQs8(ptr %A, ptr %B, ptr %C) nounwind { } define <8 x i16> @vabaQs16(ptr %A, ptr %B, ptr %C) nounwind { -;CHECK-LABEL: vabaQs16: -;CHECK: vaba.s16 +; CHECK-LABEL: vabaQs16: +; CHECK: @ %bb.0: +; CHECK-NEXT: vld1.64 {d16, d17}, [r2] +; CHECK-NEXT: vld1.64 {d18, d19}, [r1] +; CHECK-NEXT: vld1.64 {d20, d21}, [r0] +; CHECK-NEXT: vaba.s16 q10, q9, q8 +; CHECK-NEXT: vmov r0, r1, d20 +; CHECK-NEXT: vmov r2, r3, d21 +; CHECK-NEXT: mov pc, lr %tmp1 = load <8 x i16>, ptr %A %tmp2 = load <8 x i16>, ptr %B %tmp3 = load <8 x i16>, ptr %C @@ -89,8 +140,15 @@ define <8 x i16> @vabaQs16(ptr %A, ptr %B, ptr %C) nounwind { } define <4 x i32> @vabaQs32(ptr %A, ptr %B, ptr %C) nounwind { -;CHECK-LABEL: vabaQs32: -;CHECK: vaba.s32 +; CHECK-LABEL: vabaQs32: +; CHECK: @ %bb.0: +; CHECK-NEXT: vld1.64 {d16, d17}, [r2] +; CHECK-NEXT: vld1.64 {d18, d19}, [r1] +; CHECK-NEXT: vld1.64 {d20, d21}, [r0] +; CHECK-NEXT: vaba.s32 q10, q9, q8 +; CHECK-NEXT: vmov r0, r1, d20 +; CHECK-NEXT: vmov r2, r3, d21 +; CHECK-NEXT: mov pc, lr %tmp1 = load <4 x i32>, ptr %A %tmp2 = load <4 x i32>, ptr %B %tmp3 = load <4 x i32>, ptr %C @@ -100,8 +158,15 @@ define <4 x i32> @vabaQs32(ptr %A, ptr %B, ptr %C) nounwind { } define <16 x i8> @vabaQu8(ptr %A, ptr %B, ptr %C) nounwind { -;CHECK-LABEL: vabaQu8: -;CHECK: vaba.u8 +; CHECK-LABEL: vabaQu8: +; CHECK: @ %bb.0: +; CHECK-NEXT: vld1.64 {d16, d17}, [r2] +; CHECK-NEXT: vld1.64 {d18, d19}, [r1] +; CHECK-NEXT: vld1.64 {d20, d21}, [r0] +; CHECK-NEXT: vaba.u8 q10, q9, q8 +; CHECK-NEXT: vmov r0, r1, d20 +; CHECK-NEXT: vmov r2, r3, d21 +; CHECK-NEXT: mov pc, lr %tmp1 = load <16 x i8>, ptr %A %tmp2 = load <16 x i8>, ptr %B %tmp3 = load <16 x i8>, ptr %C @@ -111,8 +176,15 @@ define <16 x i8> @vabaQu8(ptr %A, ptr %B, ptr %C) nounwind { } define <8 x i16> @vabaQu16(ptr %A, ptr %B, ptr %C) nounwind { -;CHECK-LABEL: vabaQu16: -;CHECK: vaba.u16 +; CHECK-LABEL: 
vabaQu16: +; CHECK: @ %bb.0: +; CHECK-NEXT: vld1.64 {d16, d17}, [r2] +; CHECK-NEXT: vld1.64 {d18, d19}, [r1] +; CHECK-NEXT: vld1.64 {d20, d21}, [r0] +; CHECK-NEXT: vaba.u16 q10, q9, q8 +; CHECK-NEXT: vmov r0, r1, d20 +; CHECK-NEXT: vmov r2, r3, d21 +; CHECK-NEXT: mov pc, lr %tmp1 = load <8 x i16>, ptr %A %tmp2 = load <8 x i16>, ptr %B %tmp3 = load <8 x i16>, ptr %C @@ -122,8 +194,15 @@ define <8 x i16> @vabaQu16(ptr %A, ptr %B, ptr %C) nounwind { } define <4 x i32> @vabaQu32(ptr %A, ptr %B, ptr %C) nounwind { -;CHECK-LABEL: vabaQu32: -;CHECK: vaba.u32 +; CHECK-LABEL: vabaQu32: +; CHECK: @ %bb.0: +; CHECK-NEXT: vld1.64 {d16, d17}, [r2] +; CHECK-NEXT: vld1.64 {d18, d19}, [r1] +; CHECK-NEXT: vld1.64 {d20, d21}, [r0] +; CHECK-NEXT: vaba.u32 q10, q9, q8 +; CHECK-NEXT: vmov r0, r1, d20 +; CHECK-NEXT: vmov r2, r3, d21 +; CHECK-NEXT: mov pc, lr %tmp1 = load <4 x i32>, ptr %A %tmp2 = load <4 x i32>, ptr %B %tmp3 = load <4 x i32>, ptr %C @@ -149,8 +228,15 @@ declare <8 x i16> @llvm.arm.neon.vabdu.v8i16(<8 x i16>, <8 x i16>) nounwind read declare <4 x i32> @llvm.arm.neon.vabdu.v4i32(<4 x i32>, <4 x i32>) nounwind readnone define <8 x i16> @vabals8(ptr %A, ptr %B, ptr %C) nounwind { -;CHECK-LABEL: vabals8: -;CHECK: vabal.s8 +; CHECK-LABEL: vabals8: +; CHECK: @ %bb.0: +; CHECK-NEXT: vldr d16, [r2] +; CHECK-NEXT: vldr d17, [r1] +; CHECK-NEXT: vld1.64 {d18, d19}, [r0] +; CHECK-NEXT: vabal.s8 q9, d17, d16 +; CHECK-NEXT: vmov r0, r1, d18 +; CHECK-NEXT: vmov r2, r3, d19 +; CHECK-NEXT: mov pc, lr %tmp1 = load <8 x i16>, ptr %A %tmp2 = load <8 x i8>, ptr %B %tmp3 = load <8 x i8>, ptr %C @@ -161,8 +247,15 @@ define <8 x i16> @vabals8(ptr %A, ptr %B, ptr %C) nounwind { } define <4 x i32> @vabals16(ptr %A, ptr %B, ptr %C) nounwind { -;CHECK-LABEL: vabals16: -;CHECK: vabal.s16 +; CHECK-LABEL: vabals16: +; CHECK: @ %bb.0: +; CHECK-NEXT: vldr d16, [r2] +; CHECK-NEXT: vldr d17, [r1] +; CHECK-NEXT: vld1.64 {d18, d19}, [r0] +; CHECK-NEXT: vabal.s16 q9, d17, d16 +; CHECK-NEXT: vmov r0, r1, d18 
+; CHECK-NEXT: vmov r2, r3, d19 +; CHECK-NEXT: mov pc, lr %tmp1 = load <4 x i32>, ptr %A %tmp2 = load <4 x i16>, ptr %B %tmp3 = load <4 x i16>, ptr %C @@ -173,8 +266,15 @@ define <4 x i32> @vabals16(ptr %A, ptr %B, ptr %C) nounwind { } define <2 x i64> @vabals32(ptr %A, ptr %B, ptr %C) nounwind { -;CHECK-LABEL: vabals32: -;CHECK: vabal.s32 +; CHECK-LABEL: vabals32: +; CHECK: @ %bb.0: +; CHECK-NEXT: vldr d16, [r2] +; CHECK-NEXT: vldr d17, [r1] +; CHECK-NEXT: vld1.64 {d18, d19}, [r0] +; CHECK-NEXT: vabal.s32 q9, d17, d16 +; CHECK-NEXT: vmov r0, r1, d18 +; CHECK-NEXT: vmov r2, r3, d19 +; CHECK-NEXT: mov pc, lr %tmp1 = load <2 x i64>, ptr %A %tmp2 = load <2 x i32>, ptr %B %tmp3 = load <2 x i32>, ptr %C @@ -185,8 +285,15 @@ define <2 x i64> @vabals32(ptr %A, ptr %B, ptr %C) nounwind { } define <8 x i16> @vabalu8(ptr %A, ptr %B, ptr %C) nounwind { -;CHECK-LABEL: vabalu8: -;CHECK: vabal.u8 +; CHECK-LABEL: vabalu8: +; CHECK: @ %bb.0: +; CHECK-NEXT: vldr d16, [r2] +; CHECK-NEXT: vldr d17, [r1] +; CHECK-NEXT: vld1.64 {d18, d19}, [r0] +; CHECK-NEXT: vabal.u8 q9, d17, d16 +; CHECK-NEXT: vmov r0, r1, d18 +; CHECK-NEXT: vmov r2, r3, d19 +; CHECK-NEXT: mov pc, lr %tmp1 = load <8 x i16>, ptr %A %tmp2 = load <8 x i8>, ptr %B %tmp3 = load <8 x i8>, ptr %C @@ -197,8 +304,15 @@ define <8 x i16> @vabalu8(ptr %A, ptr %B, ptr %C) nounwind { } define <4 x i32> @vabalu16(ptr %A, ptr %B, ptr %C) nounwind { -;CHECK-LABEL: vabalu16: -;CHECK: vabal.u16 +; CHECK-LABEL: vabalu16: +; CHECK: @ %bb.0: +; CHECK-NEXT: vldr d16, [r2] +; CHECK-NEXT: vldr d17, [r1] +; CHECK-NEXT: vld1.64 {d18, d19}, [r0] +; CHECK-NEXT: vabal.u16 q9, d17, d16 +; CHECK-NEXT: vmov r0, r1, d18 +; CHECK-NEXT: vmov r2, r3, d19 +; CHECK-NEXT: mov pc, lr %tmp1 = load <4 x i32>, ptr %A %tmp2 = load <4 x i16>, ptr %B %tmp3 = load <4 x i16>, ptr %C @@ -209,8 +323,15 @@ define <4 x i32> @vabalu16(ptr %A, ptr %B, ptr %C) nounwind { } define <2 x i64> @vabalu32(ptr %A, ptr %B, ptr %C) nounwind { -;CHECK-LABEL: vabalu32: -;CHECK: 
vabal.u32 +; CHECK-LABEL: vabalu32: +; CHECK: @ %bb.0: +; CHECK-NEXT: vldr d16, [r2] +; CHECK-NEXT: vldr d17, [r1] +; CHECK-NEXT: vld1.64 {d18, d19}, [r0] +; CHECK-NEXT: vabal.u32 q9, d17, d16 +; CHECK-NEXT: vmov r0, r1, d18 +; CHECK-NEXT: vmov r2, r3, d19 +; CHECK-NEXT: mov pc, lr %tmp1 = load <2 x i64>, ptr %A %tmp2 = load <2 x i32>, ptr %B %tmp3 = load <2 x i32>, ptr %C diff --git a/llvm/test/CodeGen/ARM/vabd.ll b/llvm/test/CodeGen/ARM/vabd.ll index eb5eed8..4184e92 100644 --- a/llvm/test/CodeGen/ARM/vabd.ll +++ b/llvm/test/CodeGen/ARM/vabd.ll @@ -1,8 +1,14 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 ; RUN: llc -mtriple=arm-eabi -mattr=+neon %s -o - | FileCheck %s define <8 x i8> @vabds8(ptr %A, ptr %B) nounwind { -;CHECK-LABEL: vabds8: -;CHECK: vabd.s8 +; CHECK-LABEL: vabds8: +; CHECK: @ %bb.0: +; CHECK-NEXT: vldr d16, [r1] +; CHECK-NEXT: vldr d17, [r0] +; CHECK-NEXT: vabd.s8 d16, d17, d16 +; CHECK-NEXT: vmov r0, r1, d16 +; CHECK-NEXT: mov pc, lr %tmp1 = load <8 x i8>, ptr %A %tmp2 = load <8 x i8>, ptr %B %tmp3 = call <8 x i8> @llvm.arm.neon.vabds.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2) @@ -10,8 +16,13 @@ define <8 x i8> @vabds8(ptr %A, ptr %B) nounwind { } define <4 x i16> @vabds16(ptr %A, ptr %B) nounwind { -;CHECK-LABEL: vabds16: -;CHECK: vabd.s16 +; CHECK-LABEL: vabds16: +; CHECK: @ %bb.0: +; CHECK-NEXT: vldr d16, [r1] +; CHECK-NEXT: vldr d17, [r0] +; CHECK-NEXT: vabd.s16 d16, d17, d16 +; CHECK-NEXT: vmov r0, r1, d16 +; CHECK-NEXT: mov pc, lr %tmp1 = load <4 x i16>, ptr %A %tmp2 = load <4 x i16>, ptr %B %tmp3 = call <4 x i16> @llvm.arm.neon.vabds.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2) @@ -19,8 +30,13 @@ define <4 x i16> @vabds16(ptr %A, ptr %B) nounwind { } define <2 x i32> @vabds32(ptr %A, ptr %B) nounwind { -;CHECK-LABEL: vabds32: -;CHECK: vabd.s32 +; CHECK-LABEL: vabds32: +; CHECK: @ %bb.0: +; CHECK-NEXT: vldr d16, [r1] +; CHECK-NEXT: vldr d17, [r0] +; CHECK-NEXT: vabd.s32 d16, d17, d16 +; 
CHECK-NEXT: vmov r0, r1, d16 +; CHECK-NEXT: mov pc, lr %tmp1 = load <2 x i32>, ptr %A %tmp2 = load <2 x i32>, ptr %B %tmp3 = call <2 x i32> @llvm.arm.neon.vabds.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2) @@ -28,8 +44,13 @@ define <2 x i32> @vabds32(ptr %A, ptr %B) nounwind { } define <8 x i8> @vabdu8(ptr %A, ptr %B) nounwind { -;CHECK-LABEL: vabdu8: -;CHECK: vabd.u8 +; CHECK-LABEL: vabdu8: +; CHECK: @ %bb.0: +; CHECK-NEXT: vldr d16, [r1] +; CHECK-NEXT: vldr d17, [r0] +; CHECK-NEXT: vabd.u8 d16, d17, d16 +; CHECK-NEXT: vmov r0, r1, d16 +; CHECK-NEXT: mov pc, lr %tmp1 = load <8 x i8>, ptr %A %tmp2 = load <8 x i8>, ptr %B %tmp3 = call <8 x i8> @llvm.arm.neon.vabdu.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2) @@ -37,8 +58,13 @@ define <8 x i8> @vabdu8(ptr %A, ptr %B) nounwind { } define <4 x i16> @vabdu16(ptr %A, ptr %B) nounwind { -;CHECK-LABEL: vabdu16: -;CHECK: vabd.u16 +; CHECK-LABEL: vabdu16: +; CHECK: @ %bb.0: +; CHECK-NEXT: vldr d16, [r1] +; CHECK-NEXT: vldr d17, [r0] +; CHECK-NEXT: vabd.u16 d16, d17, d16 +; CHECK-NEXT: vmov r0, r1, d16 +; CHECK-NEXT: mov pc, lr %tmp1 = load <4 x i16>, ptr %A %tmp2 = load <4 x i16>, ptr %B %tmp3 = call <4 x i16> @llvm.arm.neon.vabdu.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2) @@ -46,8 +72,13 @@ define <4 x i16> @vabdu16(ptr %A, ptr %B) nounwind { } define <2 x i32> @vabdu32(ptr %A, ptr %B) nounwind { -;CHECK-LABEL: vabdu32: -;CHECK: vabd.u32 +; CHECK-LABEL: vabdu32: +; CHECK: @ %bb.0: +; CHECK-NEXT: vldr d16, [r1] +; CHECK-NEXT: vldr d17, [r0] +; CHECK-NEXT: vabd.u32 d16, d17, d16 +; CHECK-NEXT: vmov r0, r1, d16 +; CHECK-NEXT: mov pc, lr %tmp1 = load <2 x i32>, ptr %A %tmp2 = load <2 x i32>, ptr %B %tmp3 = call <2 x i32> @llvm.arm.neon.vabdu.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2) @@ -55,8 +86,13 @@ define <2 x i32> @vabdu32(ptr %A, ptr %B) nounwind { } define <2 x float> @vabdf32(ptr %A, ptr %B) nounwind { -;CHECK-LABEL: vabdf32: -;CHECK: vabd.f32 +; CHECK-LABEL: vabdf32: +; CHECK: @ %bb.0: +; CHECK-NEXT: vldr d16, [r1] +; CHECK-NEXT: vldr 
d17, [r0] +; CHECK-NEXT: vabd.f32 d16, d17, d16 +; CHECK-NEXT: vmov r0, r1, d16 +; CHECK-NEXT: mov pc, lr %tmp1 = load <2 x float>, ptr %A %tmp2 = load <2 x float>, ptr %B %tmp3 = call <2 x float> @llvm.arm.neon.vabds.v2f32(<2 x float> %tmp1, <2 x float> %tmp2) @@ -64,8 +100,14 @@ define <2 x float> @vabdf32(ptr %A, ptr %B) nounwind { } define <16 x i8> @vabdQs8(ptr %A, ptr %B) nounwind { -;CHECK-LABEL: vabdQs8: -;CHECK: vabd.s8 +; CHECK-LABEL: vabdQs8: +; CHECK: @ %bb.0: +; CHECK-NEXT: vld1.64 {d16, d17}, [r1] +; CHECK-NEXT: vld1.64 {d18, d19}, [r0] +; CHECK-NEXT: vabd.s8 q8, q9, q8 +; CHECK-NEXT: vmov r0, r1, d16 +; CHECK-NEXT: vmov r2, r3, d17 +; CHECK-NEXT: mov pc, lr %tmp1 = load <16 x i8>, ptr %A %tmp2 = load <16 x i8>, ptr %B %tmp3 = call <16 x i8> @llvm.arm.neon.vabds.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2) @@ -73,8 +115,14 @@ define <16 x i8> @vabdQs8(ptr %A, ptr %B) nounwind { } define <8 x i16> @vabdQs16(ptr %A, ptr %B) nounwind { -;CHECK-LABEL: vabdQs16: -;CHECK: vabd.s16 +; CHECK-LABEL: vabdQs16: +; CHECK: @ %bb.0: +; CHECK-NEXT: vld1.64 {d16, d17}, [r1] +; CHECK-NEXT: vld1.64 {d18, d19}, [r0] +; CHECK-NEXT: vabd.s16 q8, q9, q8 +; CHECK-NEXT: vmov r0, r1, d16 +; CHECK-NEXT: vmov r2, r3, d17 +; CHECK-NEXT: mov pc, lr %tmp1 = load <8 x i16>, ptr %A %tmp2 = load <8 x i16>, ptr %B %tmp3 = call <8 x i16> @llvm.arm.neon.vabds.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2) @@ -82,8 +130,14 @@ define <8 x i16> @vabdQs16(ptr %A, ptr %B) nounwind { } define <4 x i32> @vabdQs32(ptr %A, ptr %B) nounwind { -;CHECK-LABEL: vabdQs32: -;CHECK: vabd.s32 +; CHECK-LABEL: vabdQs32: +; CHECK: @ %bb.0: +; CHECK-NEXT: vld1.64 {d16, d17}, [r1] +; CHECK-NEXT: vld1.64 {d18, d19}, [r0] +; CHECK-NEXT: vabd.s32 q8, q9, q8 +; CHECK-NEXT: vmov r0, r1, d16 +; CHECK-NEXT: vmov r2, r3, d17 +; CHECK-NEXT: mov pc, lr %tmp1 = load <4 x i32>, ptr %A %tmp2 = load <4 x i32>, ptr %B %tmp3 = call <4 x i32> @llvm.arm.neon.vabds.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2) @@ -91,8 +145,14 @@ define <4 x 
i32> @vabdQs32(ptr %A, ptr %B) nounwind { } define <16 x i8> @vabdQu8(ptr %A, ptr %B) nounwind { -;CHECK-LABEL: vabdQu8: -;CHECK: vabd.u8 +; CHECK-LABEL: vabdQu8: +; CHECK: @ %bb.0: +; CHECK-NEXT: vld1.64 {d16, d17}, [r1] +; CHECK-NEXT: vld1.64 {d18, d19}, [r0] +; CHECK-NEXT: vabd.u8 q8, q9, q8 +; CHECK-NEXT: vmov r0, r1, d16 +; CHECK-NEXT: vmov r2, r3, d17 +; CHECK-NEXT: mov pc, lr %tmp1 = load <16 x i8>, ptr %A %tmp2 = load <16 x i8>, ptr %B %tmp3 = call <16 x i8> @llvm.arm.neon.vabdu.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2) @@ -100,8 +160,14 @@ define <16 x i8> @vabdQu8(ptr %A, ptr %B) nounwind { } define <8 x i16> @vabdQu16(ptr %A, ptr %B) nounwind { -;CHECK-LABEL: vabdQu16: -;CHECK: vabd.u16 +; CHECK-LABEL: vabdQu16: +; CHECK: @ %bb.0: +; CHECK-NEXT: vld1.64 {d16, d17}, [r1] +; CHECK-NEXT: vld1.64 {d18, d19}, [r0] +; CHECK-NEXT: vabd.u16 q8, q9, q8 +; CHECK-NEXT: vmov r0, r1, d16 +; CHECK-NEXT: vmov r2, r3, d17 +; CHECK-NEXT: mov pc, lr %tmp1 = load <8 x i16>, ptr %A %tmp2 = load <8 x i16>, ptr %B %tmp3 = call <8 x i16> @llvm.arm.neon.vabdu.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2) @@ -109,8 +175,14 @@ define <8 x i16> @vabdQu16(ptr %A, ptr %B) nounwind { } define <4 x i32> @vabdQu32(ptr %A, ptr %B) nounwind { -;CHECK-LABEL: vabdQu32: -;CHECK: vabd.u32 +; CHECK-LABEL: vabdQu32: +; CHECK: @ %bb.0: +; CHECK-NEXT: vld1.64 {d16, d17}, [r1] +; CHECK-NEXT: vld1.64 {d18, d19}, [r0] +; CHECK-NEXT: vabd.u32 q8, q9, q8 +; CHECK-NEXT: vmov r0, r1, d16 +; CHECK-NEXT: vmov r2, r3, d17 +; CHECK-NEXT: mov pc, lr %tmp1 = load <4 x i32>, ptr %A %tmp2 = load <4 x i32>, ptr %B %tmp3 = call <4 x i32> @llvm.arm.neon.vabdu.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2) @@ -118,8 +190,14 @@ define <4 x i32> @vabdQu32(ptr %A, ptr %B) nounwind { } define <4 x float> @vabdQf32(ptr %A, ptr %B) nounwind { -;CHECK-LABEL: vabdQf32: -;CHECK: vabd.f32 +; CHECK-LABEL: vabdQf32: +; CHECK: @ %bb.0: +; CHECK-NEXT: vld1.64 {d16, d17}, [r1] +; CHECK-NEXT: vld1.64 {d18, d19}, [r0] +; CHECK-NEXT: vabd.f32 q8, 
q9, q8 +; CHECK-NEXT: vmov r0, r1, d16 +; CHECK-NEXT: vmov r2, r3, d17 +; CHECK-NEXT: mov pc, lr %tmp1 = load <4 x float>, ptr %A %tmp2 = load <4 x float>, ptr %B %tmp3 = call <4 x float> @llvm.arm.neon.vabds.v4f32(<4 x float> %tmp1, <4 x float> %tmp2) @@ -147,8 +225,14 @@ declare <4 x i32> @llvm.arm.neon.vabdu.v4i32(<4 x i32>, <4 x i32>) nounwind read declare <4 x float> @llvm.arm.neon.vabds.v4f32(<4 x float>, <4 x float>) nounwind readnone define <8 x i16> @vabdls8(ptr %A, ptr %B) nounwind { -;CHECK-LABEL: vabdls8: -;CHECK: vabdl.s8 +; CHECK-LABEL: vabdls8: +; CHECK: @ %bb.0: +; CHECK-NEXT: vldr d16, [r1] +; CHECK-NEXT: vldr d17, [r0] +; CHECK-NEXT: vabdl.s8 q8, d17, d16 +; CHECK-NEXT: vmov r0, r1, d16 +; CHECK-NEXT: vmov r2, r3, d17 +; CHECK-NEXT: mov pc, lr %tmp1 = load <8 x i8>, ptr %A %tmp2 = load <8 x i8>, ptr %B %tmp3 = call <8 x i8> @llvm.arm.neon.vabds.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2) @@ -157,8 +241,14 @@ define <8 x i16> @vabdls8(ptr %A, ptr %B) nounwind { } define <4 x i32> @vabdls16(ptr %A, ptr %B) nounwind { -;CHECK-LABEL: vabdls16: -;CHECK: vabdl.s16 +; CHECK-LABEL: vabdls16: +; CHECK: @ %bb.0: +; CHECK-NEXT: vldr d16, [r1] +; CHECK-NEXT: vldr d17, [r0] +; CHECK-NEXT: vabdl.s16 q8, d17, d16 +; CHECK-NEXT: vmov r0, r1, d16 +; CHECK-NEXT: vmov r2, r3, d17 +; CHECK-NEXT: mov pc, lr %tmp1 = load <4 x i16>, ptr %A %tmp2 = load <4 x i16>, ptr %B %tmp3 = call <4 x i16> @llvm.arm.neon.vabds.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2) @@ -167,8 +257,14 @@ define <4 x i32> @vabdls16(ptr %A, ptr %B) nounwind { } define <2 x i64> @vabdls32(ptr %A, ptr %B) nounwind { -;CHECK-LABEL: vabdls32: -;CHECK: vabdl.s32 +; CHECK-LABEL: vabdls32: +; CHECK: @ %bb.0: +; CHECK-NEXT: vldr d16, [r1] +; CHECK-NEXT: vldr d17, [r0] +; CHECK-NEXT: vabdl.s32 q8, d17, d16 +; CHECK-NEXT: vmov r0, r1, d16 +; CHECK-NEXT: vmov r2, r3, d17 +; CHECK-NEXT: mov pc, lr %tmp1 = load <2 x i32>, ptr %A %tmp2 = load <2 x i32>, ptr %B %tmp3 = call <2 x i32> @llvm.arm.neon.vabds.v2i32(<2 x i32> 
%tmp1, <2 x i32> %tmp2) @@ -177,8 +273,14 @@ define <2 x i64> @vabdls32(ptr %A, ptr %B) nounwind { } define <8 x i16> @vabdlu8(ptr %A, ptr %B) nounwind { -;CHECK-LABEL: vabdlu8: -;CHECK: vabdl.u8 +; CHECK-LABEL: vabdlu8: +; CHECK: @ %bb.0: +; CHECK-NEXT: vldr d16, [r1] +; CHECK-NEXT: vldr d17, [r0] +; CHECK-NEXT: vabdl.u8 q8, d17, d16 +; CHECK-NEXT: vmov r0, r1, d16 +; CHECK-NEXT: vmov r2, r3, d17 +; CHECK-NEXT: mov pc, lr %tmp1 = load <8 x i8>, ptr %A %tmp2 = load <8 x i8>, ptr %B %tmp3 = call <8 x i8> @llvm.arm.neon.vabdu.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2) @@ -187,8 +289,14 @@ define <8 x i16> @vabdlu8(ptr %A, ptr %B) nounwind { } define <4 x i32> @vabdlu16(ptr %A, ptr %B) nounwind { -;CHECK-LABEL: vabdlu16: -;CHECK: vabdl.u16 +; CHECK-LABEL: vabdlu16: +; CHECK: @ %bb.0: +; CHECK-NEXT: vldr d16, [r1] +; CHECK-NEXT: vldr d17, [r0] +; CHECK-NEXT: vabdl.u16 q8, d17, d16 +; CHECK-NEXT: vmov r0, r1, d16 +; CHECK-NEXT: vmov r2, r3, d17 +; CHECK-NEXT: mov pc, lr %tmp1 = load <4 x i16>, ptr %A %tmp2 = load <4 x i16>, ptr %B %tmp3 = call <4 x i16> @llvm.arm.neon.vabdu.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2) @@ -197,8 +305,14 @@ define <4 x i32> @vabdlu16(ptr %A, ptr %B) nounwind { } define <2 x i64> @vabdlu32(ptr %A, ptr %B) nounwind { -;CHECK-LABEL: vabdlu32: -;CHECK: vabdl.u32 +; CHECK-LABEL: vabdlu32: +; CHECK: @ %bb.0: +; CHECK-NEXT: vldr d16, [r1] +; CHECK-NEXT: vldr d17, [r0] +; CHECK-NEXT: vabdl.u32 q8, d17, d16 +; CHECK-NEXT: vmov r0, r1, d16 +; CHECK-NEXT: vmov r2, r3, d17 +; CHECK-NEXT: mov pc, lr %tmp1 = load <2 x i32>, ptr %A %tmp2 = load <2 x i32>, ptr %B %tmp3 = call <2 x i32> @llvm.arm.neon.vabdu.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2) diff --git a/llvm/test/CodeGen/LoongArch/fp16-promote.ll b/llvm/test/CodeGen/LoongArch/fp16-promote.ll new file mode 100644 index 0000000..75f920b --- /dev/null +++ b/llvm/test/CodeGen/LoongArch/fp16-promote.ll @@ -0,0 +1,326 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: 
--version 5 +; RUN: llc --mtriple=loongarch32 --mattr=+d < %s | FileCheck %s --check-prefix=LA32 +; RUN: llc --mtriple=loongarch64 --mattr=+d < %s | FileCheck %s --check-prefix=LA64 + +define void @test_load_store(ptr %p, ptr %q) nounwind { +; LA32-LABEL: test_load_store: +; LA32: # %bb.0: +; LA32-NEXT: ld.h $a0, $a0, 0 +; LA32-NEXT: st.h $a0, $a1, 0 +; LA32-NEXT: ret +; +; LA64-LABEL: test_load_store: +; LA64: # %bb.0: +; LA64-NEXT: ld.h $a0, $a0, 0 +; LA64-NEXT: st.h $a0, $a1, 0 +; LA64-NEXT: ret + %a = load half, ptr %p + store half %a, ptr %q + ret void +} + +define float @test_fpextend_float(ptr %p) nounwind { +; LA32-LABEL: test_fpextend_float: +; LA32: # %bb.0: +; LA32-NEXT: ld.hu $a0, $a0, 0 +; LA32-NEXT: b %plt(__gnu_h2f_ieee) +; +; LA64-LABEL: test_fpextend_float: +; LA64: # %bb.0: +; LA64-NEXT: ld.hu $a0, $a0, 0 +; LA64-NEXT: b %plt(__gnu_h2f_ieee) + %a = load half, ptr %p + %r = fpext half %a to float + ret float %r +} + +define double @test_fpextend_double(ptr %p) nounwind { +; LA32-LABEL: test_fpextend_double: +; LA32: # %bb.0: +; LA32-NEXT: addi.w $sp, $sp, -16 +; LA32-NEXT: st.w $ra, $sp, 12 # 4-byte Folded Spill +; LA32-NEXT: ld.hu $a0, $a0, 0 +; LA32-NEXT: bl %plt(__gnu_h2f_ieee) +; LA32-NEXT: fcvt.d.s $fa0, $fa0 +; LA32-NEXT: ld.w $ra, $sp, 12 # 4-byte Folded Reload +; LA32-NEXT: addi.w $sp, $sp, 16 +; LA32-NEXT: ret +; +; LA64-LABEL: test_fpextend_double: +; LA64: # %bb.0: +; LA64-NEXT: addi.d $sp, $sp, -16 +; LA64-NEXT: st.d $ra, $sp, 8 # 8-byte Folded Spill +; LA64-NEXT: ld.hu $a0, $a0, 0 +; LA64-NEXT: bl %plt(__gnu_h2f_ieee) +; LA64-NEXT: fcvt.d.s $fa0, $fa0 +; LA64-NEXT: ld.d $ra, $sp, 8 # 8-byte Folded Reload +; LA64-NEXT: addi.d $sp, $sp, 16 +; LA64-NEXT: ret + %a = load half, ptr %p + %r = fpext half %a to double + ret double %r +} + +define void @test_fptrunc_float(float %f, ptr %p) nounwind { +; LA32-LABEL: test_fptrunc_float: +; LA32: # %bb.0: +; LA32-NEXT: addi.w $sp, $sp, -16 +; LA32-NEXT: st.w $ra, $sp, 12 # 4-byte Folded Spill +; 
LA32-NEXT: st.w $fp, $sp, 8 # 4-byte Folded Spill +; LA32-NEXT: move $fp, $a0 +; LA32-NEXT: bl %plt(__gnu_f2h_ieee) +; LA32-NEXT: st.h $a0, $fp, 0 +; LA32-NEXT: ld.w $fp, $sp, 8 # 4-byte Folded Reload +; LA32-NEXT: ld.w $ra, $sp, 12 # 4-byte Folded Reload +; LA32-NEXT: addi.w $sp, $sp, 16 +; LA32-NEXT: ret +; +; LA64-LABEL: test_fptrunc_float: +; LA64: # %bb.0: +; LA64-NEXT: addi.d $sp, $sp, -16 +; LA64-NEXT: st.d $ra, $sp, 8 # 8-byte Folded Spill +; LA64-NEXT: st.d $fp, $sp, 0 # 8-byte Folded Spill +; LA64-NEXT: move $fp, $a0 +; LA64-NEXT: bl %plt(__gnu_f2h_ieee) +; LA64-NEXT: st.h $a0, $fp, 0 +; LA64-NEXT: ld.d $fp, $sp, 0 # 8-byte Folded Reload +; LA64-NEXT: ld.d $ra, $sp, 8 # 8-byte Folded Reload +; LA64-NEXT: addi.d $sp, $sp, 16 +; LA64-NEXT: ret + %a = fptrunc float %f to half + store half %a, ptr %p + ret void +} + +define void @test_fptrunc_double(double %d, ptr %p) nounwind { +; LA32-LABEL: test_fptrunc_double: +; LA32: # %bb.0: +; LA32-NEXT: addi.w $sp, $sp, -16 +; LA32-NEXT: st.w $ra, $sp, 12 # 4-byte Folded Spill +; LA32-NEXT: st.w $fp, $sp, 8 # 4-byte Folded Spill +; LA32-NEXT: move $fp, $a0 +; LA32-NEXT: bl %plt(__truncdfhf2) +; LA32-NEXT: st.h $a0, $fp, 0 +; LA32-NEXT: ld.w $fp, $sp, 8 # 4-byte Folded Reload +; LA32-NEXT: ld.w $ra, $sp, 12 # 4-byte Folded Reload +; LA32-NEXT: addi.w $sp, $sp, 16 +; LA32-NEXT: ret +; +; LA64-LABEL: test_fptrunc_double: +; LA64: # %bb.0: +; LA64-NEXT: addi.d $sp, $sp, -16 +; LA64-NEXT: st.d $ra, $sp, 8 # 8-byte Folded Spill +; LA64-NEXT: st.d $fp, $sp, 0 # 8-byte Folded Spill +; LA64-NEXT: move $fp, $a0 +; LA64-NEXT: bl %plt(__truncdfhf2) +; LA64-NEXT: st.h $a0, $fp, 0 +; LA64-NEXT: ld.d $fp, $sp, 0 # 8-byte Folded Reload +; LA64-NEXT: ld.d $ra, $sp, 8 # 8-byte Folded Reload +; LA64-NEXT: addi.d $sp, $sp, 16 +; LA64-NEXT: ret + %a = fptrunc double %d to half + store half %a, ptr %p + ret void +} + +define half @test_fadd_reg(half %a, half %b) nounwind { +; LA32-LABEL: test_fadd_reg: +; LA32: # %bb.0: +; LA32-NEXT: 
addi.w $sp, $sp, -32 +; LA32-NEXT: st.w $ra, $sp, 28 # 4-byte Folded Spill +; LA32-NEXT: fst.d $fs0, $sp, 16 # 8-byte Folded Spill +; LA32-NEXT: fst.d $fs1, $sp, 8 # 8-byte Folded Spill +; LA32-NEXT: fmov.s $fs0, $fa0 +; LA32-NEXT: fmov.s $fa0, $fa1 +; LA32-NEXT: bl %plt(__gnu_f2h_ieee) +; LA32-NEXT: bl %plt(__gnu_h2f_ieee) +; LA32-NEXT: fmov.s $fs1, $fa0 +; LA32-NEXT: fmov.s $fa0, $fs0 +; LA32-NEXT: bl %plt(__gnu_f2h_ieee) +; LA32-NEXT: bl %plt(__gnu_h2f_ieee) +; LA32-NEXT: fadd.s $fa0, $fa0, $fs1 +; LA32-NEXT: fld.d $fs1, $sp, 8 # 8-byte Folded Reload +; LA32-NEXT: fld.d $fs0, $sp, 16 # 8-byte Folded Reload +; LA32-NEXT: ld.w $ra, $sp, 28 # 4-byte Folded Reload +; LA32-NEXT: addi.w $sp, $sp, 32 +; LA32-NEXT: ret +; +; LA64-LABEL: test_fadd_reg: +; LA64: # %bb.0: +; LA64-NEXT: addi.d $sp, $sp, -32 +; LA64-NEXT: st.d $ra, $sp, 24 # 8-byte Folded Spill +; LA64-NEXT: fst.d $fs0, $sp, 16 # 8-byte Folded Spill +; LA64-NEXT: fst.d $fs1, $sp, 8 # 8-byte Folded Spill +; LA64-NEXT: fmov.s $fs0, $fa0 +; LA64-NEXT: fmov.s $fa0, $fa1 +; LA64-NEXT: bl %plt(__gnu_f2h_ieee) +; LA64-NEXT: bl %plt(__gnu_h2f_ieee) +; LA64-NEXT: fmov.s $fs1, $fa0 +; LA64-NEXT: fmov.s $fa0, $fs0 +; LA64-NEXT: bl %plt(__gnu_f2h_ieee) +; LA64-NEXT: bl %plt(__gnu_h2f_ieee) +; LA64-NEXT: fadd.s $fa0, $fa0, $fs1 +; LA64-NEXT: fld.d $fs1, $sp, 8 # 8-byte Folded Reload +; LA64-NEXT: fld.d $fs0, $sp, 16 # 8-byte Folded Reload +; LA64-NEXT: ld.d $ra, $sp, 24 # 8-byte Folded Reload +; LA64-NEXT: addi.d $sp, $sp, 32 +; LA64-NEXT: ret + %r = fadd half %a, %b + ret half %r +} + +define void @test_fadd_mem(ptr %p, ptr %q) nounwind { +; LA32-LABEL: test_fadd_mem: +; LA32: # %bb.0: +; LA32-NEXT: addi.w $sp, $sp, -32 +; LA32-NEXT: st.w $ra, $sp, 28 # 4-byte Folded Spill +; LA32-NEXT: st.w $fp, $sp, 24 # 4-byte Folded Spill +; LA32-NEXT: st.w $s0, $sp, 20 # 4-byte Folded Spill +; LA32-NEXT: fst.d $fs0, $sp, 8 # 8-byte Folded Spill +; LA32-NEXT: move $fp, $a1 +; LA32-NEXT: move $s0, $a0 +; LA32-NEXT: ld.hu $a0, $a0, 0 
+; LA32-NEXT: bl %plt(__gnu_h2f_ieee) +; LA32-NEXT: fmov.s $fs0, $fa0 +; LA32-NEXT: ld.hu $a0, $fp, 0 +; LA32-NEXT: bl %plt(__gnu_h2f_ieee) +; LA32-NEXT: fadd.s $fa0, $fs0, $fa0 +; LA32-NEXT: bl %plt(__gnu_f2h_ieee) +; LA32-NEXT: st.h $a0, $s0, 0 +; LA32-NEXT: fld.d $fs0, $sp, 8 # 8-byte Folded Reload +; LA32-NEXT: ld.w $s0, $sp, 20 # 4-byte Folded Reload +; LA32-NEXT: ld.w $fp, $sp, 24 # 4-byte Folded Reload +; LA32-NEXT: ld.w $ra, $sp, 28 # 4-byte Folded Reload +; LA32-NEXT: addi.w $sp, $sp, 32 +; LA32-NEXT: ret +; +; LA64-LABEL: test_fadd_mem: +; LA64: # %bb.0: +; LA64-NEXT: addi.d $sp, $sp, -32 +; LA64-NEXT: st.d $ra, $sp, 24 # 8-byte Folded Spill +; LA64-NEXT: st.d $fp, $sp, 16 # 8-byte Folded Spill +; LA64-NEXT: st.d $s0, $sp, 8 # 8-byte Folded Spill +; LA64-NEXT: fst.d $fs0, $sp, 0 # 8-byte Folded Spill +; LA64-NEXT: move $fp, $a1 +; LA64-NEXT: move $s0, $a0 +; LA64-NEXT: ld.hu $a0, $a0, 0 +; LA64-NEXT: bl %plt(__gnu_h2f_ieee) +; LA64-NEXT: fmov.s $fs0, $fa0 +; LA64-NEXT: ld.hu $a0, $fp, 0 +; LA64-NEXT: bl %plt(__gnu_h2f_ieee) +; LA64-NEXT: fadd.s $fa0, $fs0, $fa0 +; LA64-NEXT: bl %plt(__gnu_f2h_ieee) +; LA64-NEXT: st.h $a0, $s0, 0 +; LA64-NEXT: fld.d $fs0, $sp, 0 # 8-byte Folded Reload +; LA64-NEXT: ld.d $s0, $sp, 8 # 8-byte Folded Reload +; LA64-NEXT: ld.d $fp, $sp, 16 # 8-byte Folded Reload +; LA64-NEXT: ld.d $ra, $sp, 24 # 8-byte Folded Reload +; LA64-NEXT: addi.d $sp, $sp, 32 +; LA64-NEXT: ret + %a = load half, ptr %p + %b = load half, ptr %q + %r = fadd half %a, %b + store half %r, ptr %p + ret void +} + +define half @test_fmul_reg(half %a, half %b) nounwind { +; LA32-LABEL: test_fmul_reg: +; LA32: # %bb.0: +; LA32-NEXT: addi.w $sp, $sp, -32 +; LA32-NEXT: st.w $ra, $sp, 28 # 4-byte Folded Spill +; LA32-NEXT: fst.d $fs0, $sp, 16 # 8-byte Folded Spill +; LA32-NEXT: fst.d $fs1, $sp, 8 # 8-byte Folded Spill +; LA32-NEXT: fmov.s $fs0, $fa0 +; LA32-NEXT: fmov.s $fa0, $fa1 +; LA32-NEXT: bl %plt(__gnu_f2h_ieee) +; LA32-NEXT: bl %plt(__gnu_h2f_ieee) +; 
LA32-NEXT: fmov.s $fs1, $fa0 +; LA32-NEXT: fmov.s $fa0, $fs0 +; LA32-NEXT: bl %plt(__gnu_f2h_ieee) +; LA32-NEXT: bl %plt(__gnu_h2f_ieee) +; LA32-NEXT: fmul.s $fa0, $fa0, $fs1 +; LA32-NEXT: fld.d $fs1, $sp, 8 # 8-byte Folded Reload +; LA32-NEXT: fld.d $fs0, $sp, 16 # 8-byte Folded Reload +; LA32-NEXT: ld.w $ra, $sp, 28 # 4-byte Folded Reload +; LA32-NEXT: addi.w $sp, $sp, 32 +; LA32-NEXT: ret +; +; LA64-LABEL: test_fmul_reg: +; LA64: # %bb.0: +; LA64-NEXT: addi.d $sp, $sp, -32 +; LA64-NEXT: st.d $ra, $sp, 24 # 8-byte Folded Spill +; LA64-NEXT: fst.d $fs0, $sp, 16 # 8-byte Folded Spill +; LA64-NEXT: fst.d $fs1, $sp, 8 # 8-byte Folded Spill +; LA64-NEXT: fmov.s $fs0, $fa0 +; LA64-NEXT: fmov.s $fa0, $fa1 +; LA64-NEXT: bl %plt(__gnu_f2h_ieee) +; LA64-NEXT: bl %plt(__gnu_h2f_ieee) +; LA64-NEXT: fmov.s $fs1, $fa0 +; LA64-NEXT: fmov.s $fa0, $fs0 +; LA64-NEXT: bl %plt(__gnu_f2h_ieee) +; LA64-NEXT: bl %plt(__gnu_h2f_ieee) +; LA64-NEXT: fmul.s $fa0, $fa0, $fs1 +; LA64-NEXT: fld.d $fs1, $sp, 8 # 8-byte Folded Reload +; LA64-NEXT: fld.d $fs0, $sp, 16 # 8-byte Folded Reload +; LA64-NEXT: ld.d $ra, $sp, 24 # 8-byte Folded Reload +; LA64-NEXT: addi.d $sp, $sp, 32 +; LA64-NEXT: ret + %r = fmul half %a, %b + ret half %r +} + +define void @test_fmul_mem(ptr %p, ptr %q) nounwind { +; LA32-LABEL: test_fmul_mem: +; LA32: # %bb.0: +; LA32-NEXT: addi.w $sp, $sp, -32 +; LA32-NEXT: st.w $ra, $sp, 28 # 4-byte Folded Spill +; LA32-NEXT: st.w $fp, $sp, 24 # 4-byte Folded Spill +; LA32-NEXT: st.w $s0, $sp, 20 # 4-byte Folded Spill +; LA32-NEXT: fst.d $fs0, $sp, 8 # 8-byte Folded Spill +; LA32-NEXT: move $fp, $a1 +; LA32-NEXT: move $s0, $a0 +; LA32-NEXT: ld.hu $a0, $a0, 0 +; LA32-NEXT: bl %plt(__gnu_h2f_ieee) +; LA32-NEXT: fmov.s $fs0, $fa0 +; LA32-NEXT: ld.hu $a0, $fp, 0 +; LA32-NEXT: bl %plt(__gnu_h2f_ieee) +; LA32-NEXT: fmul.s $fa0, $fs0, $fa0 +; LA32-NEXT: bl %plt(__gnu_f2h_ieee) +; LA32-NEXT: st.h $a0, $s0, 0 +; LA32-NEXT: fld.d $fs0, $sp, 8 # 8-byte Folded Reload +; LA32-NEXT: ld.w $s0, 
$sp, 20 # 4-byte Folded Reload +; LA32-NEXT: ld.w $fp, $sp, 24 # 4-byte Folded Reload +; LA32-NEXT: ld.w $ra, $sp, 28 # 4-byte Folded Reload +; LA32-NEXT: addi.w $sp, $sp, 32 +; LA32-NEXT: ret +; +; LA64-LABEL: test_fmul_mem: +; LA64: # %bb.0: +; LA64-NEXT: addi.d $sp, $sp, -32 +; LA64-NEXT: st.d $ra, $sp, 24 # 8-byte Folded Spill +; LA64-NEXT: st.d $fp, $sp, 16 # 8-byte Folded Spill +; LA64-NEXT: st.d $s0, $sp, 8 # 8-byte Folded Spill +; LA64-NEXT: fst.d $fs0, $sp, 0 # 8-byte Folded Spill +; LA64-NEXT: move $fp, $a1 +; LA64-NEXT: move $s0, $a0 +; LA64-NEXT: ld.hu $a0, $a0, 0 +; LA64-NEXT: bl %plt(__gnu_h2f_ieee) +; LA64-NEXT: fmov.s $fs0, $fa0 +; LA64-NEXT: ld.hu $a0, $fp, 0 +; LA64-NEXT: bl %plt(__gnu_h2f_ieee) +; LA64-NEXT: fmul.s $fa0, $fs0, $fa0 +; LA64-NEXT: bl %plt(__gnu_f2h_ieee) +; LA64-NEXT: st.h $a0, $s0, 0 +; LA64-NEXT: fld.d $fs0, $sp, 0 # 8-byte Folded Reload +; LA64-NEXT: ld.d $s0, $sp, 8 # 8-byte Folded Reload +; LA64-NEXT: ld.d $fp, $sp, 16 # 8-byte Folded Reload +; LA64-NEXT: ld.d $ra, $sp, 24 # 8-byte Folded Reload +; LA64-NEXT: addi.d $sp, $sp, 32 +; LA64-NEXT: ret + %a = load half, ptr %p + %b = load half, ptr %q + %r = fmul half %a, %b + store half %r, ptr %p + ret void +} diff --git a/llvm/test/CodeGen/LoongArch/sextw-removal.ll b/llvm/test/CodeGen/LoongArch/sextw-removal.ll index 502b6cf..ba889fc8 100644 --- a/llvm/test/CodeGen/LoongArch/sextw-removal.ll +++ b/llvm/test/CodeGen/LoongArch/sextw-removal.ll @@ -762,7 +762,6 @@ define signext i32 @test14(i32 signext %0, i32 signext %1) { ; CHECK-NEXT: # %bb.1: # %.preheader ; CHECK-NEXT: ori $a3, $zero, 1 ; CHECK-NEXT: addi.w $a2, $zero, -1 -; CHECK-NEXT: lu32i.d $a2, 0 ; CHECK-NEXT: ori $a4, $zero, 1000 ; CHECK-NEXT: .p2align 4, , 16 ; CHECK-NEXT: .LBB13_2: # =>This Inner Loop Header: Depth=1 @@ -772,10 +771,9 @@ define signext i32 @test14(i32 signext %0, i32 signext %1) { ; CHECK-NEXT: addi.w $a3, $a3, 1 ; CHECK-NEXT: blt $a3, $a1, .LBB13_2 ; CHECK-NEXT: .LBB13_4: -; CHECK-NEXT: addi.w $a0, 
$a0, 0 ; CHECK-NEXT: ret ; CHECK-NEXT: .LBB13_5: -; CHECK-NEXT: addi.w $a0, $a2, 0 +; CHECK-NEXT: move $a0, $a2 ; CHECK-NEXT: ret ; ; NORMV-LABEL: test14: @@ -785,7 +783,6 @@ define signext i32 @test14(i32 signext %0, i32 signext %1) { ; NORMV-NEXT: # %bb.1: # %.preheader ; NORMV-NEXT: ori $a3, $zero, 1 ; NORMV-NEXT: addi.w $a2, $zero, -1 -; NORMV-NEXT: lu32i.d $a2, 0 ; NORMV-NEXT: ori $a4, $zero, 1000 ; NORMV-NEXT: .p2align 4, , 16 ; NORMV-NEXT: .LBB13_2: # =>This Inner Loop Header: Depth=1 @@ -795,13 +792,12 @@ define signext i32 @test14(i32 signext %0, i32 signext %1) { ; NORMV-NEXT: add.d $a0, $a3, $a0 ; NORMV-NEXT: addi.d $a3, $a3, 1 ; NORMV-NEXT: addi.w $a3, $a3, 0 -; NORMV-NEXT: addi.d $a0, $a0, 0 +; NORMV-NEXT: addi.w $a0, $a0, 0 ; NORMV-NEXT: blt $a3, $a1, .LBB13_2 ; NORMV-NEXT: .LBB13_4: -; NORMV-NEXT: addi.w $a0, $a0, 0 ; NORMV-NEXT: ret ; NORMV-NEXT: .LBB13_5: -; NORMV-NEXT: addi.w $a0, $a2, 0 +; NORMV-NEXT: move $a0, $a2 ; NORMV-NEXT: ret %3 = icmp sgt i32 %1, 1 br i1 %3, label %4, label %12 @@ -830,8 +826,7 @@ define signext i32 @test14b(i32 %0, i32 signext %1) { ; CHECK-NEXT: blt $a1, $a2, .LBB14_4 ; CHECK-NEXT: # %bb.1: # %.preheader ; CHECK-NEXT: ori $a3, $zero, 1 -; CHECK-NEXT: addi.w $a2, $zero, -1 -; CHECK-NEXT: lu32i.d $a2, 0 +; CHECK-NEXT: addi.d $a2, $zero, -1 ; CHECK-NEXT: ori $a4, $zero, 1000 ; CHECK-NEXT: .p2align 4, , 16 ; CHECK-NEXT: .LBB14_2: # =>This Inner Loop Header: Depth=1 @@ -854,8 +849,7 @@ define signext i32 @test14b(i32 %0, i32 signext %1) { ; NORMV-NEXT: blt $a1, $a2, .LBB14_4 ; NORMV-NEXT: # %bb.1: # %.preheader ; NORMV-NEXT: ori $a3, $zero, 1 -; NORMV-NEXT: addi.w $a2, $zero, -1 -; NORMV-NEXT: lu32i.d $a2, 0 +; NORMV-NEXT: addi.d $a2, $zero, -1 ; NORMV-NEXT: ori $a4, $zero, 1000 ; NORMV-NEXT: .p2align 4, , 16 ; NORMV-NEXT: .LBB14_2: # =>This Inner Loop Header: Depth=1 @@ -900,8 +894,7 @@ define signext i32 @test14c(i32 zeroext %0, i32 signext %1) { ; CHECK-NEXT: blt $a1, $a2, .LBB15_4 ; CHECK-NEXT: # %bb.1: # %.preheader ; 
CHECK-NEXT: ori $a3, $zero, 1 -; CHECK-NEXT: addi.w $a2, $zero, -1 -; CHECK-NEXT: lu32i.d $a2, 0 +; CHECK-NEXT: addi.d $a2, $zero, -1 ; CHECK-NEXT: ori $a4, $zero, 1000 ; CHECK-NEXT: .p2align 4, , 16 ; CHECK-NEXT: .LBB15_2: # =>This Inner Loop Header: Depth=1 @@ -924,8 +917,7 @@ define signext i32 @test14c(i32 zeroext %0, i32 signext %1) { ; NORMV-NEXT: blt $a1, $a2, .LBB15_4 ; NORMV-NEXT: # %bb.1: # %.preheader ; NORMV-NEXT: ori $a3, $zero, 1 -; NORMV-NEXT: addi.w $a2, $zero, -1 -; NORMV-NEXT: lu32i.d $a2, 0 +; NORMV-NEXT: addi.d $a2, $zero, -1 ; NORMV-NEXT: ori $a4, $zero, 1000 ; NORMV-NEXT: .p2align 4, , 16 ; NORMV-NEXT: .LBB15_2: # =>This Inner Loop Header: Depth=1 @@ -971,7 +963,6 @@ define signext i32 @test14d(i31 zeroext %0, i32 signext %1) { ; CHECK-NEXT: # %bb.1: # %.preheader ; CHECK-NEXT: ori $a3, $zero, 1 ; CHECK-NEXT: addi.w $a2, $zero, -1 -; CHECK-NEXT: lu32i.d $a2, 0 ; CHECK-NEXT: ori $a4, $zero, 1000 ; CHECK-NEXT: .p2align 4, , 16 ; CHECK-NEXT: .LBB16_2: # =>This Inner Loop Header: Depth=1 @@ -981,10 +972,9 @@ define signext i32 @test14d(i31 zeroext %0, i32 signext %1) { ; CHECK-NEXT: addi.w $a3, $a3, 1 ; CHECK-NEXT: blt $a3, $a1, .LBB16_2 ; CHECK-NEXT: .LBB16_4: -; CHECK-NEXT: addi.w $a0, $a0, 0 ; CHECK-NEXT: ret ; CHECK-NEXT: .LBB16_5: -; CHECK-NEXT: addi.w $a0, $a2, 0 +; CHECK-NEXT: move $a0, $a2 ; CHECK-NEXT: ret ; ; NORMV-LABEL: test14d: @@ -994,7 +984,6 @@ define signext i32 @test14d(i31 zeroext %0, i32 signext %1) { ; NORMV-NEXT: # %bb.1: # %.preheader ; NORMV-NEXT: ori $a3, $zero, 1 ; NORMV-NEXT: addi.w $a2, $zero, -1 -; NORMV-NEXT: lu32i.d $a2, 0 ; NORMV-NEXT: ori $a4, $zero, 1000 ; NORMV-NEXT: .p2align 4, , 16 ; NORMV-NEXT: .LBB16_2: # =>This Inner Loop Header: Depth=1 @@ -1004,13 +993,12 @@ define signext i32 @test14d(i31 zeroext %0, i32 signext %1) { ; NORMV-NEXT: add.d $a0, $a3, $a0 ; NORMV-NEXT: addi.d $a3, $a3, 1 ; NORMV-NEXT: addi.w $a3, $a3, 0 -; NORMV-NEXT: addi.d $a0, $a0, 0 +; NORMV-NEXT: addi.w $a0, $a0, 0 ; NORMV-NEXT: blt $a3, 
$a1, .LBB16_2 ; NORMV-NEXT: .LBB16_4: -; NORMV-NEXT: addi.w $a0, $a0, 0 ; NORMV-NEXT: ret ; NORMV-NEXT: .LBB16_5: -; NORMV-NEXT: addi.w $a0, $a2, 0 +; NORMV-NEXT: move $a0, $a2 ; NORMV-NEXT: ret %zext = zext i31 %0 to i32 %3 = icmp sgt i32 %1, 1 diff --git a/llvm/test/CodeGen/NVPTX/intr-range.ll b/llvm/test/CodeGen/NVPTX/intr-range.ll new file mode 100644 index 0000000..2f3e08a --- /dev/null +++ b/llvm/test/CodeGen/NVPTX/intr-range.ll @@ -0,0 +1,88 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-attributes --version 5 +; RUN: opt < %s -S -mtriple=nvptx-nvidia-cuda -mcpu=sm_20 -passes=nvvm-intr-range | FileCheck %s + +define i32 @test_maxntid() { +; CHECK-LABEL: define i32 @test_maxntid( +; CHECK-SAME: ) #[[ATTR0:[0-9]+]] { +; CHECK-NEXT: [[TMP1:%.*]] = call range(i32 0, 96) i32 @llvm.nvvm.read.ptx.sreg.tid.x() +; CHECK-NEXT: [[TMP3:%.*]] = call range(i32 0, 96) i32 @llvm.nvvm.read.ptx.sreg.tid.y() +; CHECK-NEXT: [[TMP2:%.*]] = call range(i32 0, 64) i32 @llvm.nvvm.read.ptx.sreg.tid.z() +; CHECK-NEXT: [[TMP11:%.*]] = call range(i32 1, 97) i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +; CHECK-NEXT: [[TMP4:%.*]] = call range(i32 1, 97) i32 @llvm.nvvm.read.ptx.sreg.ntid.y() +; CHECK-NEXT: [[TMP6:%.*]] = call range(i32 1, 65) i32 @llvm.nvvm.read.ptx.sreg.ntid.z() +; CHECK-NEXT: [[TMP7:%.*]] = add i32 [[TMP1]], [[TMP3]] +; CHECK-NEXT: [[TMP8:%.*]] = add i32 [[TMP7]], [[TMP2]] +; CHECK-NEXT: [[TMP9:%.*]] = add i32 [[TMP8]], [[TMP11]] +; CHECK-NEXT: [[TMP10:%.*]] = add i32 [[TMP9]], [[TMP4]] +; CHECK-NEXT: [[TMP5:%.*]] = add i32 [[TMP10]], [[TMP6]] +; CHECK-NEXT: ret i32 [[TMP5]] +; + %1 = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() + %2 = call i32 @llvm.nvvm.read.ptx.sreg.tid.y() + %3 = call i32 @llvm.nvvm.read.ptx.sreg.tid.z() + %4 = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() + %5 = call i32 @llvm.nvvm.read.ptx.sreg.ntid.y() + %6 = call i32 @llvm.nvvm.read.ptx.sreg.ntid.z() + %7 = add i32 %1, %2 + %8 = add i32 %7, %3 + %9 = add i32 
%8, %4 + %10 = add i32 %9, %5 + %11 = add i32 %10, %6 + ret i32 %11 +} + +define i32 @test_reqntid() { +; CHECK-LABEL: define i32 @test_reqntid( +; CHECK-SAME: ) #[[ATTR0]] { +; CHECK-NEXT: [[TMP1:%.*]] = call range(i32 0, 20) i32 @llvm.nvvm.read.ptx.sreg.tid.x() +; CHECK-NEXT: [[TMP5:%.*]] = call range(i32 0, 20) i32 @llvm.nvvm.read.ptx.sreg.tid.y() +; CHECK-NEXT: [[TMP2:%.*]] = call range(i32 0, 20) i32 @llvm.nvvm.read.ptx.sreg.tid.z() +; CHECK-NEXT: [[TMP4:%.*]] = call range(i32 1, 21) i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +; CHECK-NEXT: [[TMP3:%.*]] = call range(i32 1, 21) i32 @llvm.nvvm.read.ptx.sreg.ntid.y() +; CHECK-NEXT: [[TMP6:%.*]] = call range(i32 1, 21) i32 @llvm.nvvm.read.ptx.sreg.ntid.z() +; CHECK-NEXT: [[TMP7:%.*]] = add i32 [[TMP1]], [[TMP5]] +; CHECK-NEXT: [[TMP8:%.*]] = add i32 [[TMP7]], [[TMP2]] +; CHECK-NEXT: [[TMP9:%.*]] = add i32 [[TMP8]], [[TMP4]] +; CHECK-NEXT: [[TMP10:%.*]] = add i32 [[TMP9]], [[TMP3]] +; CHECK-NEXT: [[TMP11:%.*]] = add i32 [[TMP10]], [[TMP6]] +; CHECK-NEXT: ret i32 [[TMP3]] +; + %1 = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() + %2 = call i32 @llvm.nvvm.read.ptx.sreg.tid.y() + %3 = call i32 @llvm.nvvm.read.ptx.sreg.tid.z() + %4 = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() + %5 = call i32 @llvm.nvvm.read.ptx.sreg.ntid.y() + %6 = call i32 @llvm.nvvm.read.ptx.sreg.ntid.z() + %7 = add i32 %1, %2 + %8 = add i32 %7, %3 + %9 = add i32 %8, %4 + %10 = add i32 %9, %5 + %11 = add i32 %10, %6 + ret i32 %5 +} + +;; A case like this could occur if a function with the sreg intrinsic was +;; inlined into a kernel where the tid metadata is present, ensure the range is +;; updated. 
+define i32 @test_inlined() { +; CHECK-LABEL: define i32 @test_inlined( +; CHECK-SAME: ) #[[ATTR0]] { +; CHECK-NEXT: [[TMP1:%.*]] = call range(i32 0, 4) i32 @llvm.nvvm.read.ptx.sreg.tid.x() +; CHECK-NEXT: ret i32 [[TMP1]] +; + %1 = call range(i32 0, 1024) i32 @llvm.nvvm.read.ptx.sreg.tid.x() + ret i32 %1 +} + +declare i32 @llvm.nvvm.read.ptx.sreg.tid.x() +declare i32 @llvm.nvvm.read.ptx.sreg.tid.y() +declare i32 @llvm.nvvm.read.ptx.sreg.tid.z() + +declare i32 @llvm.nvvm.read.ptx.sreg.ntid.x() +declare i32 @llvm.nvvm.read.ptx.sreg.ntid.y() +declare i32 @llvm.nvvm.read.ptx.sreg.ntid.z() + +!nvvm.annotations = !{!0, !1, !2} +!0 = !{ptr @test_maxntid, !"kernel", i32 1, !"maxntidx", i32 32, !"maxntidz", i32 3} +!1 = !{ptr @test_reqntid, !"kernel", i32 1, !"reqntidx", i32 20} +!2 = !{ptr @test_inlined, !"kernel", i32 1, !"maxntidx", i32 4} diff --git a/llvm/test/CodeGen/NVPTX/intrinsic-old.ll b/llvm/test/CodeGen/NVPTX/intrinsic-old.ll index 3930e6d7..85f7817 100644 --- a/llvm/test/CodeGen/NVPTX/intrinsic-old.ll +++ b/llvm/test/CodeGen/NVPTX/intrinsic-old.ll @@ -1,21 +1,13 @@ ; RUN: llc < %s -march=nvptx -mcpu=sm_20 | FileCheck -allow-deprecated-dag-overlap %s ; RUN: llc < %s -march=nvptx64 -mcpu=sm_20 | FileCheck -allow-deprecated-dag-overlap %s ; RUN: opt < %s -S -mtriple=nvptx-nvidia-cuda -passes=nvvm-intr-range \ -; RUN: | FileCheck -allow-deprecated-dag-overlap --check-prefix=RANGE --check-prefix=RANGE_20 %s -; RUN: opt < %s -S -mtriple=nvptx-nvidia-cuda -passes=nvvm-intr-range \ -; RUN: | FileCheck -allow-deprecated-dag-overlap --check-prefix=RANGE --check-prefix=RANGE_20 %s -; RUN: opt < %s -S -mtriple=nvptx-nvidia-cuda \ -; RUN: -passes=nvvm-intr-range -nvvm-intr-range-sm=30 \ -; RUN: | FileCheck -allow-deprecated-dag-overlap --check-prefix=RANGE --check-prefix=RANGE_30 %s -; RUN: opt < %s -S -mtriple=nvptx-nvidia-cuda \ -; RUN: -passes=nvvm-intr-range -nvvm-intr-range-sm=30 \ -; RUN: | FileCheck -allow-deprecated-dag-overlap --check-prefix=RANGE 
--check-prefix=RANGE_30 %s +; RUN: | FileCheck -allow-deprecated-dag-overlap --check-prefix=RANGE %s ; RUN: %if ptxas && !ptxas-12.0 %{ llc < %s -march=nvptx -mcpu=sm_20 | %ptxas-verify %} ; RUN: %if ptxas %{ llc < %s -march=nvptx64 -mcpu=sm_20 | %ptxas-verify %} define ptx_device i32 @test_tid_x() { ; CHECK: mov.u32 %r{{[0-9]+}}, %tid.x; -; RANGE: call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !range ![[BLK_IDX_XY:[0-9]+]] +; RANGE: call range(i32 0, 1024) i32 @llvm.nvvm.read.ptx.sreg.tid.x() ; CHECK: ret; %x = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() ret i32 %x @@ -23,7 +15,7 @@ define ptx_device i32 @test_tid_x() { define ptx_device i32 @test_tid_y() { ; CHECK: mov.u32 %r{{[0-9]+}}, %tid.y; -; RANGE: call i32 @llvm.nvvm.read.ptx.sreg.tid.y(), !range ![[BLK_IDX_XY]] +; RANGE: call range(i32 0, 1024) i32 @llvm.nvvm.read.ptx.sreg.tid.y() ; CHECK: ret; %x = call i32 @llvm.nvvm.read.ptx.sreg.tid.y() ret i32 %x @@ -31,7 +23,7 @@ define ptx_device i32 @test_tid_y() { define ptx_device i32 @test_tid_z() { ; CHECK: mov.u32 %r{{[0-9]+}}, %tid.z; -; RANGE: call i32 @llvm.nvvm.read.ptx.sreg.tid.z(), !range ![[BLK_IDX_Z:[0-9]+]] +; RANGE: call range(i32 0, 64) i32 @llvm.nvvm.read.ptx.sreg.tid.z() ; CHECK: ret; %x = call i32 @llvm.nvvm.read.ptx.sreg.tid.z() ret i32 %x @@ -46,7 +38,7 @@ define ptx_device i32 @test_tid_w() { define ptx_device i32 @test_ntid_x() { ; CHECK: mov.u32 %r{{[0-9]+}}, %ntid.x; -; RANGE: call i32 @llvm.nvvm.read.ptx.sreg.ntid.x(), !range ![[BLK_SIZE_XY:[0-9]+]] +; RANGE: call range(i32 1, 1025) i32 @llvm.nvvm.read.ptx.sreg.ntid.x() ; CHECK: ret; %x = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x() ret i32 %x @@ -54,7 +46,7 @@ define ptx_device i32 @test_ntid_x() { define ptx_device i32 @test_ntid_y() { ; CHECK: mov.u32 %r{{[0-9]+}}, %ntid.y; -; RANGE: call i32 @llvm.nvvm.read.ptx.sreg.ntid.y(), !range ![[BLK_SIZE_XY]] +; RANGE: call range(i32 1, 1025) i32 @llvm.nvvm.read.ptx.sreg.ntid.y() ; CHECK: ret; %x = call i32 @llvm.nvvm.read.ptx.sreg.ntid.y() ret i32 %x 
@@ -62,7 +54,7 @@ define ptx_device i32 @test_ntid_y() { define ptx_device i32 @test_ntid_z() { ; CHECK: mov.u32 %r{{[0-9]+}}, %ntid.z; -; RANGE: call i32 @llvm.nvvm.read.ptx.sreg.ntid.z(), !range ![[BLK_SIZE_Z:[0-9]+]] +; RANGE: call range(i32 1, 65) i32 @llvm.nvvm.read.ptx.sreg.ntid.z() ; CHECK: ret; %x = call i32 @llvm.nvvm.read.ptx.sreg.ntid.z() ret i32 %x @@ -77,7 +69,7 @@ define ptx_device i32 @test_ntid_w() { define ptx_device i32 @test_laneid() { ; CHECK: mov.u32 %r{{[0-9]+}}, %laneid; -; RANGE: call i32 @llvm.nvvm.read.ptx.sreg.laneid(), !range ![[LANEID:[0-9]+]] +; RANGE: call range(i32 0, 32) i32 @llvm.nvvm.read.ptx.sreg.laneid() ; CHECK: ret; %x = call i32 @llvm.nvvm.read.ptx.sreg.laneid() ret i32 %x @@ -85,7 +77,7 @@ define ptx_device i32 @test_laneid() { define ptx_device i32 @test_warpsize() { ; CHECK: mov.u32 %r{{[0-9]+}}, WARP_SZ; -; RANGE: call i32 @llvm.nvvm.read.ptx.sreg.warpsize(), !range ![[WARPSIZE:[0-9]+]] +; RANGE: call range(i32 32, 33) i32 @llvm.nvvm.read.ptx.sreg.warpsize() ; CHECK: ret; %x = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() ret i32 %x @@ -107,7 +99,7 @@ define ptx_device i32 @test_nwarpid() { define ptx_device i32 @test_ctaid_y() { ; CHECK: mov.u32 %r{{[0-9]+}}, %ctaid.y; -; RANGE: call i32 @llvm.nvvm.read.ptx.sreg.ctaid.y(), !range ![[GRID_IDX_YZ:[0-9]+]] +; RANGE: call range(i32 0, 65535) i32 @llvm.nvvm.read.ptx.sreg.ctaid.y() ; CHECK: ret; %x = call i32 @llvm.nvvm.read.ptx.sreg.ctaid.y() ret i32 %x @@ -115,7 +107,7 @@ define ptx_device i32 @test_ctaid_y() { define ptx_device i32 @test_ctaid_z() { ; CHECK: mov.u32 %r{{[0-9]+}}, %ctaid.z; -; RANGE: call i32 @llvm.nvvm.read.ptx.sreg.ctaid.z(), !range ![[GRID_IDX_YZ]] +; RANGE: call range(i32 0, 65535) i32 @llvm.nvvm.read.ptx.sreg.ctaid.z() ; CHECK: ret; %x = call i32 @llvm.nvvm.read.ptx.sreg.ctaid.z() ret i32 %x @@ -123,8 +115,7 @@ define ptx_device i32 @test_ctaid_z() { define ptx_device i32 @test_ctaid_x() { ; CHECK: mov.u32 %r{{[0-9]+}}, %ctaid.x; -; RANGE_30: call i32 
@llvm.nvvm.read.ptx.sreg.ctaid.x(), !range ![[GRID_IDX_X:[0-9]+]] -; RANGE_20: call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x(), !range ![[GRID_IDX_YZ]] +; RANGE: call range(i32 0, 2147483647) i32 @llvm.nvvm.read.ptx.sreg.ctaid.x() ; CHECK: ret; %x = call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x() ret i32 %x @@ -139,7 +130,7 @@ define ptx_device i32 @test_ctaid_w() { define ptx_device i32 @test_nctaid_y() { ; CHECK: mov.u32 %r{{[0-9]+}}, %nctaid.y; -; RANGE: call i32 @llvm.nvvm.read.ptx.sreg.nctaid.y(), !range ![[GRID_SIZE_YZ:[0-9]+]] +; RANGE: call range(i32 1, 65536) i32 @llvm.nvvm.read.ptx.sreg.nctaid.y() ; CHECK: ret; %x = call i32 @llvm.nvvm.read.ptx.sreg.nctaid.y() ret i32 %x @@ -147,7 +138,7 @@ define ptx_device i32 @test_nctaid_y() { define ptx_device i32 @test_nctaid_z() { ; CHECK: mov.u32 %r{{[0-9]+}}, %nctaid.z; -; RANGE: call i32 @llvm.nvvm.read.ptx.sreg.nctaid.z(), !range ![[GRID_SIZE_YZ]] +; RANGE: call range(i32 1, 65536) i32 @llvm.nvvm.read.ptx.sreg.nctaid.z() ; CHECK: ret; %x = call i32 @llvm.nvvm.read.ptx.sreg.nctaid.z() ret i32 %x @@ -155,8 +146,7 @@ define ptx_device i32 @test_nctaid_z() { define ptx_device i32 @test_nctaid_x() { ; CHECK: mov.u32 %r{{[0-9]+}}, %nctaid.x; -; RANGE_30: call i32 @llvm.nvvm.read.ptx.sreg.nctaid.x(), !range ![[GRID_SIZE_X:[0-9]+]] -; RANGE_20: call i32 @llvm.nvvm.read.ptx.sreg.nctaid.x(), !range ![[GRID_SIZE_YZ]] +; RANGE: call range(i32 1, -2147483648) i32 @llvm.nvvm.read.ptx.sreg.nctaid.x() ; CHECK: ret; %x = call i32 @llvm.nvvm.read.ptx.sreg.nctaid.x() ret i32 %x @@ -327,14 +317,3 @@ declare void @llvm.nvvm.bar.sync(i32 %i) !0 = !{i32 0, i32 19} ; RANGE-DAG: ![[ALREADY]] = !{i32 0, i32 19} -; RANGE-DAG: ![[BLK_IDX_XY]] = !{i32 0, i32 1024} -; RANGE-DAG: ![[BLK_IDX_XY]] = !{i32 0, i32 1024} -; RANGE-DAG: ![[BLK_IDX_Z]] = !{i32 0, i32 64} -; RANGE-DAG: ![[BLK_SIZE_XY]] = !{i32 1, i32 1025} -; RANGE-DAG: ![[BLK_SIZE_Z]] = !{i32 1, i32 65} -; RANGE-DAG: ![[LANEID]] = !{i32 0, i32 32} -; RANGE-DAG: ![[WARPSIZE]] = !{i32 32, i32 
33} -; RANGE_30-DAG: ![[GRID_IDX_X]] = !{i32 0, i32 2147483647} -; RANGE-DAG: ![[GRID_IDX_YZ]] = !{i32 0, i32 65535} -; RANGE_30-DAG: ![[GRID_SIZE_X]] = !{i32 1, i32 -2147483648} -; RANGE-DAG: ![[GRID_SIZE_YZ]] = !{i32 1, i32 65536} diff --git a/llvm/test/CodeGen/PowerPC/toc-data-common.ll b/llvm/test/CodeGen/PowerPC/toc-data-common.ll index 7747f2e..3b7ca44 100644 --- a/llvm/test/CodeGen/PowerPC/toc-data-common.ll +++ b/llvm/test/CodeGen/PowerPC/toc-data-common.ll @@ -1,6 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple powerpc-ibm-aix-xcoff -verify-machineinstrs < %s | FileCheck %s -DINSTR=lwz --check-prefix=CHECK -; RUN: llc -mtriple powerpc64-ibm-aix-xcoff -verify-machineinstrs < %s | FileCheck %s -DINSTR=ld --check-prefix=CHECK +; RUN: llc -mtriple powerpc-ibm-aix-xcoff -verify-machineinstrs < %s | FileCheck %s --check-prefix=CHECK +; RUN: llc -mtriple powerpc64-ibm-aix-xcoff -verify-machineinstrs < %s | FileCheck %s --check-prefix=CHECK-64 ; RUN: llc -filetype=obj -mtriple powerpc-ibm-aix-xcoff -verify-machineinstrs < %s -o %t32.o ; RUN: llvm-objdump -t --symbol-description %t32.o | FileCheck %s --check-prefix=OBJ32 @@ -15,16 +15,28 @@ define void @set(i32 noundef %_a) { ; CHECK-LABEL: set: -; CHECK: # %bb.0: # %entry -; CHECK-NEXT: la 4, a2[TD](2) -; CHECK-NEXT: la 5, a1[TD](2) -; CHECK-NEXT: stw 3, 0(4) -; CHECK-NEXT: [[INSTR]] 4, L..C0(2) # @a4 -; CHECK-NEXT: stw 3, 0(5) -; CHECK-NEXT: [[INSTR]] 5, L..C1(2) # @a3 -; CHECK-NEXT: stw 3, 0(4) -; CHECK-NEXT: stw 3, 0(5) -; CHECK-NEXT: blr +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: la 4, a2[TD](2) +; CHECK-NEXT: lwz 5, L..C0(2) # @a4 +; CHECK-NEXT: stw 3, 0(4) +; CHECK-NEXT: la 4, a1[TD](2) +; CHECK-NEXT: stw 3, 0(4) +; CHECK-NEXT: lwz 4, L..C1(2) # @a3 +; CHECK-NEXT: stw 3, 0(5) +; CHECK-NEXT: stw 3, 0(4) +; CHECK-NEXT: blr +; +; CHECK-64-LABEL: set: +; CHECK-64: # %bb.0: # %entry +; CHECK-64-NEXT: la 4, a2[TD](2) +; CHECK-64-NEXT: ld 5, L..C0(2) # @a4 
+; CHECK-64-NEXT: stw 3, 0(4) +; CHECK-64-NEXT: la 4, a1[TD](2) +; CHECK-64-NEXT: stw 3, 0(4) +; CHECK-64-NEXT: ld 4, L..C1(2) # @a3 +; CHECK-64-NEXT: stw 3, 0(5) +; CHECK-64-NEXT: stw 3, 0(4) +; CHECK-64-NEXT: blr entry: store i32 %_a, ptr @a2, align 4 store i32 %_a, ptr @a1, align 4 @@ -35,10 +47,16 @@ ret void define i32 @get1() { ; CHECK-LABEL: get1: -; CHECK: # %bb.0: # %entry -; CHECK-NEXT: la 3, a2[TD](2) -; CHECK-NEXT: lwz 3, 0(3) -; CHECK-NEXT: blr +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: la 3, a2[TD](2) +; CHECK-NEXT: lwz 3, 0(3) +; CHECK-NEXT: blr +; +; CHECK-64-LABEL: get1: +; CHECK-64: # %bb.0: # %entry +; CHECK-64-NEXT: la 3, a2[TD](2) +; CHECK-64-NEXT: lwz 3, 0(3) +; CHECK-64-NEXT: blr entry: %0 = load i32, ptr @a2, align 4 ret i32 %0 @@ -46,10 +64,16 @@ ret i32 %0 define i32 @get2() { ; CHECK-LABEL: get2: -; CHECK: # %bb.0: # %entry -; CHECK-NEXT: la 3, a1[TD](2) -; CHECK-NEXT: lwz 3, 0(3) -; CHECK-NEXT: blr +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: la 3, a1[TD](2) +; CHECK-NEXT: lwz 3, 0(3) +; CHECK-NEXT: blr +; +; CHECK-64-LABEL: get2: +; CHECK-64: # %bb.0: # %entry +; CHECK-64-NEXT: la 3, a1[TD](2) +; CHECK-64-NEXT: lwz 3, 0(3) +; CHECK-64-NEXT: blr entry: %0 = load i32, ptr @a1, align 4 ret i32 %0 @@ -57,10 +81,16 @@ ret i32 %0 define i32 @get3() { ; CHECK-LABEL: get3: -; CHECK: # %bb.0: # %entry -; CHECK-NEXT: [[INSTR]] 3, L..C0(2) # @a4 -; CHECK-NEXT: lwz 3, 0(3) -; CHECK-NEXT: blr +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: lwz 3, L..C0(2) # @a4 +; CHECK-NEXT: lwz 3, 0(3) +; CHECK-NEXT: blr +; +; CHECK-64-LABEL: get3: +; CHECK-64: # %bb.0: # %entry +; CHECK-64-NEXT: ld 3, L..C0(2) # @a4 +; CHECK-64-NEXT: lwz 3, 0(3) +; CHECK-64-NEXT: blr entry: %0 = load i32, ptr @a4, align 4 ret i32 %0 @@ -68,10 +98,16 @@ ret i32 %0 define i32 @get4() { ; CHECK-LABEL: get4: -; CHECK: # %bb.0: # %entry -; CHECK-NEXT: [[INSTR]] 3, L..C1(2) # @a3 -; CHECK-NEXT: lwz 3, 0(3) -; CHECK-NEXT: blr +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: lwz 3, L..C1(2) # @a3 +; 
CHECK-NEXT: lwz 3, 0(3) +; CHECK-NEXT: blr +; +; CHECK-64-LABEL: get4: +; CHECK-64: # %bb.0: # %entry +; CHECK-64-NEXT: ld 3, L..C1(2) # @a3 +; CHECK-64-NEXT: lwz 3, 0(3) +; CHECK-64-NEXT: blr entry: %0 = load i32, ptr @a3, align 4 ret i32 %0 @@ -79,36 +115,56 @@ ret i32 %0 define nonnull ptr @escape1() { ; CHECK-LABEL: escape1: -; CHECK: # %bb.0: # %entry -; CHECK-NEXT: la 3, a2[TD](2) -; CHECK-NEXT: blr +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: la 3, a2[TD](2) +; CHECK-NEXT: blr +; +; CHECK-64-LABEL: escape1: +; CHECK-64: # %bb.0: # %entry +; CHECK-64-NEXT: la 3, a2[TD](2) +; CHECK-64-NEXT: blr entry: ret ptr @a2 } define nonnull ptr @escape2() { ; CHECK-LABEL: escape2: -; CHECK: # %bb.0: # %entry -; CHECK-NEXT: la 3, a1[TD](2) -; CHECK-NEXT: blr +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: la 3, a1[TD](2) +; CHECK-NEXT: blr +; +; CHECK-64-LABEL: escape2: +; CHECK-64: # %bb.0: # %entry +; CHECK-64-NEXT: la 3, a1[TD](2) +; CHECK-64-NEXT: blr entry: ret ptr @a1 } define nonnull ptr @escape3() { ; CHECK-LABEL: escape3: -; CHECK: # %bb.0: # %entry -; CHECK-NEXT: [[INSTR]] 3, L..C0(2) # @a4 -; CHECK-NEXT: blr +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: lwz 3, L..C0(2) # @a4 +; CHECK-NEXT: blr +; +; CHECK-64-LABEL: escape3: +; CHECK-64: # %bb.0: # %entry +; CHECK-64-NEXT: ld 3, L..C0(2) # @a4 +; CHECK-64-NEXT: blr entry: ret ptr @a4 } define nonnull ptr @escape4() { ; CHECK-LABEL: escape4: -; CHECK: # %bb.0: # %entry -; CHECK-NEXT: [[INSTR]] 3, L..C1(2) # @a3 -; CHECK-NEXT: blr +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: lwz 3, L..C1(2) # @a3 +; CHECK-NEXT: blr +; +; CHECK-64-LABEL: escape4: +; CHECK-64: # %bb.0: # %entry +; CHECK-64-NEXT: ld 3, L..C1(2) # @a3 +; CHECK-64-NEXT: blr entry: ret ptr @a3 } diff --git a/llvm/test/CodeGen/PowerPC/toc-data.ll b/llvm/test/CodeGen/PowerPC/toc-data.ll index 1228665..ee1dde1 100644 --- a/llvm/test/CodeGen/PowerPC/toc-data.ll +++ b/llvm/test/CodeGen/PowerPC/toc-data.ll @@ -36,7 +36,7 @@ define dso_local void @write_int(i32 signext 
%in) { ret void } ; CHECK32: name: write_int -; CHECK32: %[[SCRATCH:[0-9]+]]:gprc_and_gprc_nor0 = ADDItoc @i, $r2 +; CHECK32: %[[SCRATCH:[0-9]+]]:gprc_and_gprc_nor0 = ADDItoc $r2, @i ; CHECK32-NEXT: STW %{{[0-9]+}}, 0, killed %[[SCRATCH]] :: (store (s32) into @i) ; TEST32: .write_int: @@ -44,12 +44,12 @@ define dso_local void @write_int(i32 signext %in) { ; TEST32-NEXT: stw 3, 0(4) ; CHECK64: name: write_int -; CHECK64: %[[SCRATCH:[0-9]+]]:g8rc_and_g8rc_nox0 = ADDItoc8 @i, $x2 +; CHECK64: %[[SCRATCH:[0-9]+]]:g8rc_and_g8rc_nox0 = ADDItoc8 $x2, @i ; CHECK64-NEXT: STW8 %{{[0-9]+}}, 0, killed %[[SCRATCH]] :: (store (s32) into @i) ; CHECK64-NOOPT: name: write_int ; CHECK64-NOOPT: %[[SUBREG:[0-9]+]]:gprc = COPY %{{[0-9]}}.sub_32 -; CHECK64-NOOPT: %[[ADDR:[0-9]+]]:g8rc_and_g8rc_nox0 = ADDItoc8 @i, $x2 +; CHECK64-NOOPT: %[[ADDR:[0-9]+]]:g8rc_and_g8rc_nox0 = ADDItoc8 $x2, @i ; CHECK64-NOOPT: STW %[[SUBREG]], 0, %[[ADDR]] ; TEST64: .write_int: @@ -128,7 +128,7 @@ define dso_local float @read_float() { ret float %0 } ; CHECK32: name: read_float -; CHECK32: %[[SCRATCH:[0-9]+]]:gprc_and_gprc_nor0 = ADDItoc @f, $r2 +; CHECK32: %[[SCRATCH:[0-9]+]]:gprc_and_gprc_nor0 = ADDItoc $r2, @f ; CHECK32: %{{[0-9]+}}:f4rc = LFS 0, killed %[[SCRATCH]] :: (dereferenceable load (s32) from @f) ; TEST32: .read_float: @@ -136,11 +136,11 @@ define dso_local float @read_float() { ; TEST32-NEXT: lfs 1, 0(3) ; CHECK64: name: read_float -; CHECK64: %[[SCRATCH:[0-9]+]]:g8rc_and_g8rc_nox0 = ADDItoc8 @f, $x2 +; CHECK64: %[[SCRATCH:[0-9]+]]:g8rc_and_g8rc_nox0 = ADDItoc8 $x2, @f ; CHECK64: %{{[0-9]+}}:f4rc = LFS 0, killed %[[SCRATCH]] :: (dereferenceable load (s32) from @f) ; CHECK64-NOOPT: name: read_float -; CHECK64-NOOPT: %[[SCRATCH:[0-9]+]]:g8rc_and_g8rc_nox0 = ADDItoc8 @f, $x2 +; CHECK64-NOOPT: %[[SCRATCH:[0-9]+]]:g8rc_and_g8rc_nox0 = ADDItoc8 $x2, @f ; CHECK64-NOOPT: %{{[0-9]+}}:f4rc = LFS 0, %[[SCRATCH]] ; TEST64: .read_float: @@ -217,18 +217,18 @@ define dso_local nonnull ptr @addr() { ret ptr @i } 
; CHECK32: name: addr -; CHECK32: %[[SCRATCH:[0-9]+]]:gprc = ADDItoc @i, $r2 +; CHECK32: %[[SCRATCH:[0-9]+]]:gprc = ADDItoc $r2, @i ; CHECK32-NEXT: $r3 = COPY %[[SCRATCH]] ; TEST32: .addr ; TEST32: la 3, i[TD](2) ; CHECK64: name: addr -; CHECK64: %[[SCRATCH:[0-9]+]]:g8rc = ADDItoc8 @i, $x2 +; CHECK64: %[[SCRATCH:[0-9]+]]:g8rc = ADDItoc8 $x2, @i ; CHECK64-NEXT: $x3 = COPY %[[SCRATCH]] ; CHECK64-NOOPT: name: addr -; CHECK64-NOOPT: %[[SCRATCH:[0-9]+]]:g8rc_and_g8rc_nox0 = ADDItoc8 @i, $x2 +; CHECK64-NOOPT: %[[SCRATCH:[0-9]+]]:g8rc_and_g8rc_nox0 = ADDItoc8 $x2, @i ; CHECK64-NOOPT: $x3 = COPY %[[SCRATCH]] ; TEST64: .addr diff --git a/llvm/test/CodeGen/PowerPC/tocdata-firm-alignment.ll b/llvm/test/CodeGen/PowerPC/tocdata-firm-alignment.ll new file mode 100644 index 0000000..c982713 --- /dev/null +++ b/llvm/test/CodeGen/PowerPC/tocdata-firm-alignment.ll @@ -0,0 +1,24 @@ +; RUN: opt -S -passes='default<O3>' < %s | FileCheck %s + +target datalayout = "E-m:a-p:32:32-Fi32-i64:64-n32" +target triple = "powerpc-ibm-aix7.2.0.0" + +%struct.widget = type { i8, i8, i8 } + +; CHECK: @global = {{.*}}constant %struct.widget { i8 4, i8 0, i8 0 }, align 8 #0 +@global = constant %struct.widget { i8 4, i8 0, i8 0 }, align 4 #0 + +define void @baz() #1 { +bb: + call void @snork(ptr @global) + ret void +} + +define void @snork(ptr byval(%struct.widget) %arg) #1 { +bb: + %load = load volatile ptr, ptr null, align 4 + ret void +} + +attributes #0 = { "toc-data" } +attributes #1 = { "target-cpu"="pwr7" "target-features"="+altivec,+bpermd,+extdiv,+isa-v206-instructions,+vsx,-aix-shared-lib-tls-model-opt,-aix-small-local-dynamic-tls,-aix-small-local-exec-tls,-crbits,-crypto,-direct-move,-htm,-isa-v207-instructions,-isa-v30-instructions,-power8-vector,-power9-vector,-privileged,-quadword-atomics,-rop-protect,-spe" } diff --git a/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/merge-unmerge-rv32.mir b/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/merge-unmerge-rv32.mir index 2e4a39c..46a7df4 100644 
--- a/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/merge-unmerge-rv32.mir +++ b/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/merge-unmerge-rv32.mir @@ -68,12 +68,14 @@ body: | ; RV32: liveins: $x10 ; RV32-NEXT: {{ $}} ; RV32-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $x10 - ; RV32-NEXT: $x10 = COPY [[COPY]](s32) + ; RV32-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535 + ; RV32-NEXT: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY]], [[C]] + ; RV32-NEXT: $x10 = COPY [[AND]](s32) ; RV32-NEXT: PseudoRET implicit $x10 %0:_(s32) = COPY $x10 - %1:_(s64) = G_ZEXT %0(s32) - %2:_(s32), %3:_(s32) = G_UNMERGE_VALUES %1(s64) - $x10 = COPY %2(s32) + %2:_(s16), %3:_(s16) = G_UNMERGE_VALUES %0(s32) + %4:_(s32) = G_ZEXT %2(s16) + $x10 = COPY %4(s32) PseudoRET implicit $x10 ... --- diff --git a/llvm/test/CodeGen/RISCV/rvv/rv32-spill-vector-csr.ll b/llvm/test/CodeGen/RISCV/rvv/rv32-spill-vector-csr.ll index 8210ea2..ac74a82 100644 --- a/llvm/test/CodeGen/RISCV/rvv/rv32-spill-vector-csr.ll +++ b/llvm/test/CodeGen/RISCV/rvv/rv32-spill-vector-csr.ll @@ -22,7 +22,7 @@ define <vscale x 1 x double> @foo(<vscale x 1 x double> %a, <vscale x 1 x double ; SPILL-O0-NEXT: addi a1, a1, 16 ; SPILL-O0-NEXT: vs1r.v v9, (a1) # Unknown-size Folded Spill ; SPILL-O0-NEXT: # implicit-def: $v8 -; SPILL-O0-NEXT: vsetvli zero, a0, e64, m1, ta, ma +; SPILL-O0-NEXT: vsetvli zero, a0, e64, m1, tu, ma ; SPILL-O0-NEXT: vfadd.vv v8, v9, v10 ; SPILL-O0-NEXT: addi a0, sp, 16 ; SPILL-O0-NEXT: vs1r.v v8, (a0) # Unknown-size Folded Spill @@ -38,7 +38,7 @@ define <vscale x 1 x double> @foo(<vscale x 1 x double> %a, <vscale x 1 x double ; SPILL-O0-NEXT: # kill: def $x11 killed $x10 ; SPILL-O0-NEXT: lw a0, 8(sp) # 4-byte Folded Reload ; SPILL-O0-NEXT: # implicit-def: $v8 -; SPILL-O0-NEXT: vsetvli zero, a0, e64, m1, ta, ma +; SPILL-O0-NEXT: vsetvli zero, a0, e64, m1, tu, ma ; SPILL-O0-NEXT: vfadd.vv v8, v9, v10 ; SPILL-O0-NEXT: csrr a0, vlenb ; SPILL-O0-NEXT: slli a0, a0, 1 diff --git 
a/llvm/test/CodeGen/RISCV/rvv/rv64-spill-vector-csr.ll b/llvm/test/CodeGen/RISCV/rvv/rv64-spill-vector-csr.ll index 3523629..9054048 100644 --- a/llvm/test/CodeGen/RISCV/rvv/rv64-spill-vector-csr.ll +++ b/llvm/test/CodeGen/RISCV/rvv/rv64-spill-vector-csr.ll @@ -25,7 +25,7 @@ define <vscale x 1 x double> @foo(<vscale x 1 x double> %a, <vscale x 1 x double ; SPILL-O0-NEXT: addi a1, a1, 32 ; SPILL-O0-NEXT: vs1r.v v9, (a1) # Unknown-size Folded Spill ; SPILL-O0-NEXT: # implicit-def: $v8 -; SPILL-O0-NEXT: vsetvli zero, a0, e64, m1, ta, ma +; SPILL-O0-NEXT: vsetvli zero, a0, e64, m1, tu, ma ; SPILL-O0-NEXT: vfadd.vv v8, v9, v10 ; SPILL-O0-NEXT: addi a0, sp, 32 ; SPILL-O0-NEXT: vs1r.v v8, (a0) # Unknown-size Folded Spill @@ -41,7 +41,7 @@ define <vscale x 1 x double> @foo(<vscale x 1 x double> %a, <vscale x 1 x double ; SPILL-O0-NEXT: # kill: def $x11 killed $x10 ; SPILL-O0-NEXT: ld a0, 16(sp) # 8-byte Folded Reload ; SPILL-O0-NEXT: # implicit-def: $v8 -; SPILL-O0-NEXT: vsetvli zero, a0, e64, m1, ta, ma +; SPILL-O0-NEXT: vsetvli zero, a0, e64, m1, tu, ma ; SPILL-O0-NEXT: vfadd.vv v8, v9, v10 ; SPILL-O0-NEXT: csrr a0, vlenb ; SPILL-O0-NEXT: slli a0, a0, 1 diff --git a/llvm/test/CodeGen/RISCV/rvv/vmsbf.ll b/llvm/test/CodeGen/RISCV/rvv/vmsbf.ll index 14a1f084..d1f344d 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vmsbf.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vmsbf.ll @@ -33,7 +33,7 @@ define <vscale x 1 x i1> @intrinsic_vmsbf_mask_m_nxv1i1_nxv1i1(<vscale x 1 x i1> ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vmv1r.v v10, v0 ; CHECK-NEXT: vmv1r.v v0, v9 -; CHECK-NEXT: vsetvli zero, a0, e8, mf8, tu, mu +; CHECK-NEXT: vsetvli zero, a0, e8, mf8, ta, mu ; CHECK-NEXT: vmsbf.m v10, v8, v0.t ; CHECK-NEXT: vmv1r.v v0, v10 ; CHECK-NEXT: ret @@ -75,7 +75,7 @@ define <vscale x 2 x i1> @intrinsic_vmsbf_mask_m_nxv2i1_nxv2i1(<vscale x 2 x i1> ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vmv1r.v v10, v0 ; CHECK-NEXT: vmv1r.v v0, v9 -; CHECK-NEXT: vsetvli zero, a0, e8, mf4, tu, mu +; CHECK-NEXT: vsetvli 
zero, a0, e8, mf4, ta, mu ; CHECK-NEXT: vmsbf.m v10, v8, v0.t ; CHECK-NEXT: vmv1r.v v0, v10 ; CHECK-NEXT: ret @@ -117,7 +117,7 @@ define <vscale x 4 x i1> @intrinsic_vmsbf_mask_m_nxv4i1_nxv4i1(<vscale x 4 x i1> ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vmv1r.v v10, v0 ; CHECK-NEXT: vmv1r.v v0, v9 -; CHECK-NEXT: vsetvli zero, a0, e8, mf2, tu, mu +; CHECK-NEXT: vsetvli zero, a0, e8, mf2, ta, mu ; CHECK-NEXT: vmsbf.m v10, v8, v0.t ; CHECK-NEXT: vmv1r.v v0, v10 ; CHECK-NEXT: ret @@ -159,9 +159,9 @@ define <vscale x 8 x i1> @intrinsic_vmsbf_mask_m_nxv8i1_nxv8i1(<vscale x 8 x i1> ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vmv1r.v v10, v0 ; CHECK-NEXT: vmv1r.v v0, v9 -; CHECK-NEXT: vsetvli zero, a0, e8, m1, tu, mu +; CHECK-NEXT: vsetvli zero, a0, e8, m1, ta, mu ; CHECK-NEXT: vmsbf.m v10, v8, v0.t -; CHECK-NEXT: vmv1r.v v0, v10 +; CHECK-NEXT: vmv.v.v v0, v10 ; CHECK-NEXT: ret entry: %a = call <vscale x 8 x i1> @llvm.riscv.vmsbf.mask.nxv8i1( @@ -201,7 +201,7 @@ define <vscale x 16 x i1> @intrinsic_vmsbf_mask_m_nxv16i1_nxv16i1(<vscale x 16 x ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vmv1r.v v10, v0 ; CHECK-NEXT: vmv1r.v v0, v9 -; CHECK-NEXT: vsetvli zero, a0, e8, m2, tu, mu +; CHECK-NEXT: vsetvli zero, a0, e8, m2, ta, mu ; CHECK-NEXT: vmsbf.m v10, v8, v0.t ; CHECK-NEXT: vmv1r.v v0, v10 ; CHECK-NEXT: ret @@ -243,7 +243,7 @@ define <vscale x 32 x i1> @intrinsic_vmsbf_mask_m_nxv32i1_nxv32i1(<vscale x 32 x ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vmv1r.v v10, v0 ; CHECK-NEXT: vmv1r.v v0, v9 -; CHECK-NEXT: vsetvli zero, a0, e8, m4, tu, mu +; CHECK-NEXT: vsetvli zero, a0, e8, m4, ta, mu ; CHECK-NEXT: vmsbf.m v10, v8, v0.t ; CHECK-NEXT: vmv1r.v v0, v10 ; CHECK-NEXT: ret @@ -285,7 +285,7 @@ define <vscale x 64 x i1> @intrinsic_vmsbf_mask_m_nxv64i1_nxv64i1(<vscale x 64 x ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vmv1r.v v10, v0 ; CHECK-NEXT: vmv1r.v v0, v9 -; CHECK-NEXT: vsetvli zero, a0, e8, m8, tu, mu +; CHECK-NEXT: vsetvli zero, a0, e8, m8, ta, mu ; CHECK-NEXT: vmsbf.m v10, v8, v0.t ; 
CHECK-NEXT: vmv1r.v v0, v10 ; CHECK-NEXT: ret diff --git a/llvm/test/CodeGen/RISCV/rvv/vmsif.ll b/llvm/test/CodeGen/RISCV/rvv/vmsif.ll index 05d402a..1dc52eb 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vmsif.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vmsif.ll @@ -33,7 +33,7 @@ define <vscale x 1 x i1> @intrinsic_vmsif_mask_m_nxv1i1_nxv1i1(<vscale x 1 x i1> ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vmv1r.v v10, v0 ; CHECK-NEXT: vmv1r.v v0, v9 -; CHECK-NEXT: vsetvli zero, a0, e8, mf8, tu, mu +; CHECK-NEXT: vsetvli zero, a0, e8, mf8, ta, mu ; CHECK-NEXT: vmsif.m v10, v8, v0.t ; CHECK-NEXT: vmv1r.v v0, v10 ; CHECK-NEXT: ret @@ -75,7 +75,7 @@ define <vscale x 2 x i1> @intrinsic_vmsif_mask_m_nxv2i1_nxv2i1(<vscale x 2 x i1> ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vmv1r.v v10, v0 ; CHECK-NEXT: vmv1r.v v0, v9 -; CHECK-NEXT: vsetvli zero, a0, e8, mf4, tu, mu +; CHECK-NEXT: vsetvli zero, a0, e8, mf4, ta, mu ; CHECK-NEXT: vmsif.m v10, v8, v0.t ; CHECK-NEXT: vmv1r.v v0, v10 ; CHECK-NEXT: ret @@ -117,7 +117,7 @@ define <vscale x 4 x i1> @intrinsic_vmsif_mask_m_nxv4i1_nxv4i1(<vscale x 4 x i1> ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vmv1r.v v10, v0 ; CHECK-NEXT: vmv1r.v v0, v9 -; CHECK-NEXT: vsetvli zero, a0, e8, mf2, tu, mu +; CHECK-NEXT: vsetvli zero, a0, e8, mf2, ta, mu ; CHECK-NEXT: vmsif.m v10, v8, v0.t ; CHECK-NEXT: vmv1r.v v0, v10 ; CHECK-NEXT: ret @@ -159,9 +159,9 @@ define <vscale x 8 x i1> @intrinsic_vmsif_mask_m_nxv8i1_nxv8i1(<vscale x 8 x i1> ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vmv1r.v v10, v0 ; CHECK-NEXT: vmv1r.v v0, v9 -; CHECK-NEXT: vsetvli zero, a0, e8, m1, tu, mu +; CHECK-NEXT: vsetvli zero, a0, e8, m1, ta, mu ; CHECK-NEXT: vmsif.m v10, v8, v0.t -; CHECK-NEXT: vmv1r.v v0, v10 +; CHECK-NEXT: vmv.v.v v0, v10 ; CHECK-NEXT: ret entry: %a = call <vscale x 8 x i1> @llvm.riscv.vmsif.mask.nxv8i1( @@ -201,7 +201,7 @@ define <vscale x 16 x i1> @intrinsic_vmsif_mask_m_nxv16i1_nxv16i1(<vscale x 16 x ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vmv1r.v v10, v0 ; CHECK-NEXT: vmv1r.v 
v0, v9 -; CHECK-NEXT: vsetvli zero, a0, e8, m2, tu, mu +; CHECK-NEXT: vsetvli zero, a0, e8, m2, ta, mu ; CHECK-NEXT: vmsif.m v10, v8, v0.t ; CHECK-NEXT: vmv1r.v v0, v10 ; CHECK-NEXT: ret @@ -243,7 +243,7 @@ define <vscale x 32 x i1> @intrinsic_vmsif_mask_m_nxv32i1_nxv32i1(<vscale x 32 x ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vmv1r.v v10, v0 ; CHECK-NEXT: vmv1r.v v0, v9 -; CHECK-NEXT: vsetvli zero, a0, e8, m4, tu, mu +; CHECK-NEXT: vsetvli zero, a0, e8, m4, ta, mu ; CHECK-NEXT: vmsif.m v10, v8, v0.t ; CHECK-NEXT: vmv1r.v v0, v10 ; CHECK-NEXT: ret @@ -285,7 +285,7 @@ define <vscale x 64 x i1> @intrinsic_vmsif_mask_m_nxv64i1_nxv64i1(<vscale x 64 x ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vmv1r.v v10, v0 ; CHECK-NEXT: vmv1r.v v0, v9 -; CHECK-NEXT: vsetvli zero, a0, e8, m8, tu, mu +; CHECK-NEXT: vsetvli zero, a0, e8, m8, ta, mu ; CHECK-NEXT: vmsif.m v10, v8, v0.t ; CHECK-NEXT: vmv1r.v v0, v10 ; CHECK-NEXT: ret diff --git a/llvm/test/CodeGen/RISCV/rvv/vmsof.ll b/llvm/test/CodeGen/RISCV/rvv/vmsof.ll index 0c60681..b0a28e6 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vmsof.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vmsof.ll @@ -33,7 +33,7 @@ define <vscale x 1 x i1> @intrinsic_vmsof_mask_m_nxv1i1_nxv1i1(<vscale x 1 x i1> ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vmv1r.v v10, v0 ; CHECK-NEXT: vmv1r.v v0, v9 -; CHECK-NEXT: vsetvli zero, a0, e8, mf8, tu, mu +; CHECK-NEXT: vsetvli zero, a0, e8, mf8, ta, mu ; CHECK-NEXT: vmsof.m v10, v8, v0.t ; CHECK-NEXT: vmv1r.v v0, v10 ; CHECK-NEXT: ret @@ -75,7 +75,7 @@ define <vscale x 2 x i1> @intrinsic_vmsof_mask_m_nxv2i1_nxv2i1(<vscale x 2 x i1> ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vmv1r.v v10, v0 ; CHECK-NEXT: vmv1r.v v0, v9 -; CHECK-NEXT: vsetvli zero, a0, e8, mf4, tu, mu +; CHECK-NEXT: vsetvli zero, a0, e8, mf4, ta, mu ; CHECK-NEXT: vmsof.m v10, v8, v0.t ; CHECK-NEXT: vmv1r.v v0, v10 ; CHECK-NEXT: ret @@ -117,7 +117,7 @@ define <vscale x 4 x i1> @intrinsic_vmsof_mask_m_nxv4i1_nxv4i1(<vscale x 4 x i1> ; CHECK: # %bb.0: # %entry ; 
CHECK-NEXT: vmv1r.v v10, v0 ; CHECK-NEXT: vmv1r.v v0, v9 -; CHECK-NEXT: vsetvli zero, a0, e8, mf2, tu, mu +; CHECK-NEXT: vsetvli zero, a0, e8, mf2, ta, mu ; CHECK-NEXT: vmsof.m v10, v8, v0.t ; CHECK-NEXT: vmv1r.v v0, v10 ; CHECK-NEXT: ret @@ -159,9 +159,9 @@ define <vscale x 8 x i1> @intrinsic_vmsof_mask_m_nxv8i1_nxv8i1(<vscale x 8 x i1> ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vmv1r.v v10, v0 ; CHECK-NEXT: vmv1r.v v0, v9 -; CHECK-NEXT: vsetvli zero, a0, e8, m1, tu, mu +; CHECK-NEXT: vsetvli zero, a0, e8, m1, ta, mu ; CHECK-NEXT: vmsof.m v10, v8, v0.t -; CHECK-NEXT: vmv1r.v v0, v10 +; CHECK-NEXT: vmv.v.v v0, v10 ; CHECK-NEXT: ret entry: %a = call <vscale x 8 x i1> @llvm.riscv.vmsof.mask.nxv8i1( @@ -201,7 +201,7 @@ define <vscale x 16 x i1> @intrinsic_vmsof_mask_m_nxv16i1_nxv16i1(<vscale x 16 x ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vmv1r.v v10, v0 ; CHECK-NEXT: vmv1r.v v0, v9 -; CHECK-NEXT: vsetvli zero, a0, e8, m2, tu, mu +; CHECK-NEXT: vsetvli zero, a0, e8, m2, ta, mu ; CHECK-NEXT: vmsof.m v10, v8, v0.t ; CHECK-NEXT: vmv1r.v v0, v10 ; CHECK-NEXT: ret @@ -243,7 +243,7 @@ define <vscale x 32 x i1> @intrinsic_vmsof_mask_m_nxv32i1_nxv32i1(<vscale x 32 x ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vmv1r.v v10, v0 ; CHECK-NEXT: vmv1r.v v0, v9 -; CHECK-NEXT: vsetvli zero, a0, e8, m4, tu, mu +; CHECK-NEXT: vsetvli zero, a0, e8, m4, ta, mu ; CHECK-NEXT: vmsof.m v10, v8, v0.t ; CHECK-NEXT: vmv1r.v v0, v10 ; CHECK-NEXT: ret @@ -285,7 +285,7 @@ define <vscale x 64 x i1> @intrinsic_vmsof_mask_m_nxv64i1_nxv64i1(<vscale x 64 x ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vmv1r.v v10, v0 ; CHECK-NEXT: vmv1r.v v0, v9 -; CHECK-NEXT: vsetvli zero, a0, e8, m8, tu, mu +; CHECK-NEXT: vsetvli zero, a0, e8, m8, ta, mu ; CHECK-NEXT: vmsof.m v10, v8, v0.t ; CHECK-NEXT: vmv1r.v v0, v10 ; CHECK-NEXT: ret diff --git a/llvm/test/CodeGen/RISCV/rvv/vsll.ll b/llvm/test/CodeGen/RISCV/rvv/vsll.ll index 1fdafd7..a089b10 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vsll.ll +++ 
b/llvm/test/CodeGen/RISCV/rvv/vsll.ll @@ -2108,6 +2108,22 @@ entry: ret <vscale x 1 x i8> %a } +define <vscale x 1 x i8> @intrinsic_vsll_1_tu_nxv1i8_nxv1i8_i8(<vscale x 1 x i8> %0, <vscale x 1 x i8> %1, iXLen %2) nounwind { +; CHECK-LABEL: intrinsic_vsll_1_tu_nxv1i8_nxv1i8_i8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli zero, a0, e8, mf8, tu, ma +; CHECK-NEXT: vadd.vv v8, v9, v9 +; CHECK-NEXT: ret +entry: + %a = call <vscale x 1 x i8> @llvm.riscv.vsll.nxv1i8( + <vscale x 1 x i8> %0, + <vscale x 1 x i8> %1, + iXLen 1, + iXLen %2) + + ret <vscale x 1 x i8> %a +} + define <vscale x 1 x i8> @intrinsic_vsll_mask_vi_nxv1i8_nxv1i8_i8(<vscale x 1 x i8> %0, <vscale x 1 x i8> %1, <vscale x 1 x i1> %2, iXLen %3) nounwind { ; CHECK-LABEL: intrinsic_vsll_mask_vi_nxv1i8_nxv1i8_i8: ; CHECK: # %bb.0: # %entry diff --git a/llvm/test/CodeGen/SPIRV/event-wait-ptr-type.ll b/llvm/test/CodeGen/SPIRV/event-wait-ptr-type.ll index d6fb70b..ec9afc7 100644 --- a/llvm/test/CodeGen/SPIRV/event-wait-ptr-type.ll +++ b/llvm/test/CodeGen/SPIRV/event-wait-ptr-type.ll @@ -4,16 +4,16 @@ ; RUN: llc -O0 -mtriple=spirv32-unknown-unknown %s -o - | FileCheck %s ; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv32-unknown-unknown %s -o - -filetype=obj | spirv-val %} -; CHECK: %[[#EventTy:]] = OpTypeEvent -; CHECK: %[[#StructEventTy:]] = OpTypeStruct %[[#EventTy]] -; CHECK: %[[#GenPtrStructEventTy:]] = OpTypePointer Generic %[[#StructEventTy]] -; CHECK: %[[#FunPtrStructEventTy:]] = OpTypePointer Function %[[#StructEventTy]] -; CHECK: %[[#GenPtrEventTy:]] = OpTypePointer Generic %[[#EventTy:]] +; CHECK-DAG: %[[#EventTy:]] = OpTypeEvent +; CHECK-DAG: %[[#StructEventTy:]] = OpTypeStruct %[[#EventTy]] +; CHECK-DAG: %[[#FunPtrStructEventTy:]] = OpTypePointer Function %[[#StructEventTy]] +; CHECK-DAG: %[[#GenPtrEventTy:]] = OpTypePointer Generic %[[#EventTy]] +; CHECK-DAG: %[[#FunPtrEventTy:]] = OpTypePointer Function %[[#EventTy]] ; CHECK: OpFunction ; CHECK: %[[#Var:]] = OpVariable 
%[[#FunPtrStructEventTy]] Function -; CHECK-NEXT: %[[#AddrspacecastVar:]] = OpPtrCastToGeneric %[[#GenPtrStructEventTy]] %[[#Var]] -; CHECK-NEXT: %[[#BitcastVar:]] = OpBitcast %[[#GenPtrEventTy]] %[[#AddrspacecastVar]] -; CHECK-NEXT: OpGroupWaitEvents %[[#]] %[[#]] %[[#BitcastVar]] +; CHECK-NEXT: %[[#FunEvent:]] = OpBitcast %[[#FunPtrEventTy]] %[[#Var]] +; CHECK-NEXT: %[[#GenEvent:]] = OpPtrCastToGeneric %[[#GenPtrEventTy]] %[[#FunEvent]] +; CHECK-NEXT: OpGroupWaitEvents %[[#]] %[[#]] %[[#GenEvent]] %"class.sycl::_V1::device_event" = type { target("spirv.Event") } diff --git a/llvm/test/CodeGen/SPIRV/passes/SPIRVEmitIntrinsics-no-duplicate-spv_assign_type.ll b/llvm/test/CodeGen/SPIRV/passes/SPIRVEmitIntrinsics-no-duplicate-spv_assign_type.ll index 7056b9c..9db4f26 100644 --- a/llvm/test/CodeGen/SPIRV/passes/SPIRVEmitIntrinsics-no-duplicate-spv_assign_type.ll +++ b/llvm/test/CodeGen/SPIRV/passes/SPIRVEmitIntrinsics-no-duplicate-spv_assign_type.ll @@ -3,9 +3,9 @@ ; CHECK: *** IR Dump After SPIRV emit intrinsics (emit-intrinsics) *** define spir_kernel void @test(ptr addrspace(1) %srcimg) { -; CHECK: call void @llvm.spv.assign.type.p1(ptr addrspace(1) %srcimg, metadata target("spirv.Image", void, 1, 0, 0, 0, 0, 0, 0) undef) +; CHECK: call void @llvm.spv.assign.type.p1(ptr addrspace(1) %srcimg, metadata target("spirv.Image", void, 1, 0, 0, 0, 0, 0, 0) poison) %call1 = call spir_func <2 x i32> @_Z13get_image_dim14ocl_image2d_ro(ptr addrspace(1) %srcimg) -; CHECK-NOT: call void @llvm.spv.assign.type.p1(ptr addrspace(1) %srcimg, metadata target("spirv.Image", void, 1, 0, 0, 0, 0, 0, 0) undef) +; CHECK-NOT: call void @llvm.spv.assign.type.p1(ptr addrspace(1) %srcimg, metadata target("spirv.Image", void, 1, 0, 0, 0, 0, 0, 0) poison) %call2 = call spir_func <2 x i32> @_Z13get_image_dim14ocl_image2d_ro(ptr addrspace(1) %srcimg) ret void ; CHECK: } diff --git a/llvm/test/CodeGen/SPIRV/transcoding/OpGroupAsyncCopy-strided.ll 
b/llvm/test/CodeGen/SPIRV/transcoding/OpGroupAsyncCopy-strided.ll new file mode 100644 index 0000000..96d6016 --- /dev/null +++ b/llvm/test/CodeGen/SPIRV/transcoding/OpGroupAsyncCopy-strided.ll @@ -0,0 +1,36 @@ +; RUN: llc -O0 -mtriple=spirv64-unknown-unknown %s -o - | FileCheck %s --check-prefix=CHECK-SPIRV +; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv64-unknown-unknown %s -o - -filetype=obj | spirv-val %} + +; RUN: llc -O0 -mtriple=spirv32-unknown-unknown %s -o - | FileCheck %s --check-prefix=CHECK-SPIRV +; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv32-unknown-unknown %s -o - -filetype=obj | spirv-val %} + +; CHECK-SPIRV-DAG: %[[#LongTy:]] = OpTypeInt 64 0 +; CHECK-SPIRV-DAG: %[[#IntTy:]] = OpTypeInt 32 0 +; CHECK-SPIRV-DAG: %[[#EventTy:]] = OpTypeEvent +; CHECK-SPIRV-DAG: %[[#Scope:]] = OpConstant %[[#IntTy]] 2 +; CHECK-SPIRV-DAG: %[[#Num:]] = OpConstant %[[#LongTy]] 123 +; CHECK-SPIRV-DAG: %[[#Null:]] = OpConstantNull +; CHECK-SPIRV-DAG: %[[#Stride:]] = OpConstant %[[#LongTy]] 1 +; CHECK-SPIRV-DAG: %[[#GenPtrEventTy:]] = OpTypePointer Generic %[[#EventTy]] +; CHECK-SPIRV-DAG: %[[#FunPtrEventTy:]] = OpTypePointer Function %[[#EventTy]] +; CHECK-SPIRV: OpFunction +; CHECK-SPIRV: %[[#Var:]] = OpVariable %[[#]] Function +; CHECK-SPIRV: %[[#ResEvent:]] = OpGroupAsyncCopy %[[#EventTy]] %[[#Scope]] %[[#Null]] %[[#Null]] %[[#Num]] %[[#Stride]] %[[#Null]] +; CHECK-SPIRV: %[[#VarPtrEvent:]] = OpBitcast %[[#FunPtrEventTy]] %[[#Var]] +; CHECK-SPIRV: OpStore %[[#VarPtrEvent]] %[[#ResEvent]] +; CHECK-SPIRV: %[[#VarPtrEvent2:]] = OpBitcast %[[#FunPtrEventTy]] %[[#Var]] +; CHECK-SPIRV: %[[#PtrEventGen:]] = OpPtrCastToGeneric %[[#]] %[[#VarPtrEvent2]] +; CHECK-SPIRV: OpGroupWaitEvents %[[#Scope]] %[[#Num]] %[[#PtrEventGen]] +; CHECK-SPIRV: OpFunctionEnd + +define spir_kernel void @foo() { + %event = alloca ptr, align 8 + %call = call spir_func ptr @_Z29async_work_group_strided_copyPU3AS3hPU3AS1Khmm9ocl_event(ptr null, ptr null, i64 123, i64 1, ptr null) + store ptr 
%call, ptr %event, align 8 + %event.ascast = addrspacecast ptr %event to ptr addrspace(4) + call spir_func void @_Z17wait_group_eventsiPU3AS49ocl_event(i64 123, ptr addrspace(4) %event.ascast) + ret void +} + +declare spir_func ptr @_Z29async_work_group_strided_copyPU3AS3hPU3AS1Khmm9ocl_event(ptr, ptr, i64, i64, ptr) +declare spir_func void @_Z17wait_group_eventsiPU3AS49ocl_event(i64, ptr addrspace(4)) diff --git a/llvm/test/CodeGen/SPIRV/transcoding/check_ro_qualifier.ll b/llvm/test/CodeGen/SPIRV/transcoding/check_ro_qualifier.ll index 824ca1b2..6f61aba 100644 --- a/llvm/test/CodeGen/SPIRV/transcoding/check_ro_qualifier.ll +++ b/llvm/test/CodeGen/SPIRV/transcoding/check_ro_qualifier.ll @@ -1,5 +1,5 @@ ; RUN: llc -O0 -mtriple=spirv64-unknown-unknown %s -o - | FileCheck %s --check-prefix=CHECK-SPIRV -; TODO: %if spirv-tools %{ llc -O0 -mtriple=spirv64-unknown-unknown %s -o - -filetype=obj | spirv-val %} +; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv64-unknown-unknown %s -o - -filetype=obj | spirv-val %} ; CHECK-SPIRV: %[[#IMAGE_TYPE:]] = OpTypeImage ; CHECK-SPIRV: %[[#IMAGE_ARG:]] = OpFunctionParameter %[[#IMAGE_TYPE]] diff --git a/llvm/test/CodeGen/Thumb2/mve-tailpred-vptblock.ll b/llvm/test/CodeGen/Thumb2/mve-tailpred-vptblock.ll index f9b3757..6392452 100644 --- a/llvm/test/CodeGen/Thumb2/mve-tailpred-vptblock.ll +++ b/llvm/test/CodeGen/Thumb2/mve-tailpred-vptblock.ll @@ -20,50 +20,35 @@ define void @convert_vptblock(ptr %pchTarget, i16 signext %iTargetStride, ptr %p ; CHECK-NEXT: mov.w r8, #0 ; CHECK-NEXT: ldrd r4, r5, [sp, #88] ; CHECK-NEXT: mov r7, r0 -; CHECK-NEXT: cmp.w r10, #8 -; CHECK-NEXT: mov.w r0, #1 -; CHECK-NEXT: mov r3, r10 ; CHECK-NEXT: mov.w r11, #0 -; CHECK-NEXT: it ge -; CHECK-NEXT: movge r3, #8 ; CHECK-NEXT: vidup.u16 q0, r8, #4 -; CHECK-NEXT: sub.w r3, r10, r3 ; CHECK-NEXT: vmov.i32 q1, #0x0 -; CHECK-NEXT: adds r3, #7 ; CHECK-NEXT: vmov.i16 q2, #0x100 ; CHECK-NEXT: vmov.i16 q3, #0xff -; CHECK-NEXT: add.w r9, r0, r3, lsr #3 ; CHECK-NEXT: 
.LBB0_2: @ %for.body ; CHECK-NEXT: @ =>This Loop Header: Depth=1 ; CHECK-NEXT: @ Child Loop BB0_3 Depth 2 -; CHECK-NEXT: mov r3, r10 ; CHECK-NEXT: vmov q4, q0 ; CHECK-NEXT: mov r6, r8 ; CHECK-NEXT: mov r0, r7 -; CHECK-NEXT: dls lr, r9 +; CHECK-NEXT: dlstp.16 lr, r10 ; CHECK-NEXT: .LBB0_3: @ %do.body ; CHECK-NEXT: @ Parent Loop BB0_2 Depth=1 ; CHECK-NEXT: @ => This Inner Loop Header: Depth=2 -; CHECK-NEXT: vctp.16 r3 -; CHECK-NEXT: vpst -; CHECK-NEXT: vldrbt.u16 q5, [r2, q4] +; CHECK-NEXT: vldrb.u16 q5, [r2, q4] ; CHECK-NEXT: vmul.i16 q4, q5, r5 ; CHECK-NEXT: vshr.u16 q4, q4, #8 ; CHECK-NEXT: vsub.i16 q5, q2, q4 ; CHECK-NEXT: vpt.i16 eq, q4, q3 ; CHECK-NEXT: vmovt q5, q1 -; CHECK-NEXT: vctp.16 r3 -; CHECK-NEXT: vpst -; CHECK-NEXT: vldrbt.u16 q6, [r0] +; CHECK-NEXT: vldrb.u16 q6, [r0] ; CHECK-NEXT: vsub.i16 q4, q2, q5 -; CHECK-NEXT: subs r3, #8 ; CHECK-NEXT: vmul.i16 q5, q5, q6 ; CHECK-NEXT: vmla.i16 q5, q4, r4 ; CHECK-NEXT: vshr.u16 q4, q5, #8 -; CHECK-NEXT: vpst -; CHECK-NEXT: vstrbt.16 q4, [r0], #8 +; CHECK-NEXT: vstrb.16 q4, [r0], #8 ; CHECK-NEXT: vidup.u16 q4, r6, #4 -; CHECK-NEXT: le lr, .LBB0_3 +; CHECK-NEXT: letp lr, .LBB0_3 ; CHECK-NEXT: @ %bb.4: @ %do.end ; CHECK-NEXT: @ in Loop: Header=BB0_2 Depth=1 ; CHECK-NEXT: add.w r0, r11, #1 diff --git a/llvm/test/CodeGen/WebAssembly/expand-variadic-call.ll b/llvm/test/CodeGen/WebAssembly/expand-variadic-call.ll new file mode 100644 index 0000000..80f3db0 --- /dev/null +++ b/llvm/test/CodeGen/WebAssembly/expand-variadic-call.ll @@ -0,0 +1,484 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: -p --function-signature +; RUN: opt -S --passes=expand-variadics --expand-variadics-override=lowering < %s | FileCheck %s +; REQUIRES: webassembly-registered-target +target datalayout = "e-m:e-p:32:32-p10:8:8-p20:8:8-i64:64-n32:64-S128-ni:1:10:20" +target triple = "wasm32-unknown-unknown" + +; Check the variables are lowered to the locations this target expects + +; The types show the call 
frames +; CHECK: %single_i32.vararg = type <{ i32 }> +; CHECK: %single_double.vararg = type <{ double }> +; CHECK: %single_v4f32.vararg = type <{ <4 x float> }> +; CHECK: %single_v8f32.vararg = type <{ <8 x float> }> +; CHECK: %single_v16f32.vararg = type <{ <16 x float> }> +; CHECK: %single_v32f32.vararg = type <{ <32 x float> }> +; CHECK: %i32_double.vararg = type <{ i32, [4 x i8], double }> +; CHECK: %double_i32.vararg = type <{ double, i32 }> +; CHECK: %i32_libcS.vararg = type <{ i32, ptr }> +; CHECK: %libcS_i32.vararg = type <{ ptr, i32 }> +; CHECK: %i32_v4f32.vararg = type <{ i32, [12 x i8], <4 x float> }> +; CHECK: %v4f32_i32.vararg = type <{ <4 x float>, i32 }> +; CHECK: %i32_v8f32.vararg = type <{ i32, [28 x i8], <8 x float> }> +; CHECK: %v8f32_i32.vararg = type <{ <8 x float>, i32 }> +; CHECK: %i32_v16f32.vararg = type <{ i32, [60 x i8], <16 x float> }> +; CHECK: %v16f32_i32.vararg = type <{ <16 x float>, i32 }> +; CHECK: %i32_v32f32.vararg = type <{ i32, [124 x i8], <32 x float> }> +; CHECK: %v32f32_i32.vararg = type <{ <32 x float>, i32 }> +; CHECK: %fptr_single_i32.vararg = type <{ i32 }> +; CHECK: %fptr_libcS.vararg = type <{ ptr }> + +%struct.libcS = type { i8, i16, i32, i32, float, double } + +@vararg_ptr = hidden global ptr @vararg, align 4 + +define hidden void @copy(ptr noundef %va) { +; CHECK-LABEL: define {{[^@]+}}@copy(ptr noundef %va) { +; CHECK-NEXT: entry: +; CHECK-NEXT: %va.addr = alloca ptr, align 4 +; CHECK-NEXT: %cp = alloca ptr, align 4 +; CHECK-NEXT: store ptr %va, ptr %va.addr, align 4 +; CHECK-NEXT: call void @llvm.lifetime.start.p0(i64 4, ptr nonnull %cp) +; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr %cp, ptr %va.addr, i32 4, i1 false) +; CHECK-NEXT: %0 = load ptr, ptr %cp, align 4 +; CHECK-NEXT: call void @valist(ptr noundef %0) +; CHECK-NEXT: call void @llvm.lifetime.end.p0(i64 4, ptr nonnull %cp) +; CHECK-NEXT: ret void +; +entry: + %va.addr = alloca ptr, align 4 + %cp = alloca ptr, align 4 + store ptr %va, ptr %va.addr, 
align 4 + call void @llvm.lifetime.start.p0(i64 4, ptr nonnull %cp) + call void @llvm.va_copy.p0(ptr nonnull %cp, ptr nonnull %va.addr) + %0 = load ptr, ptr %cp, align 4 + call void @valist(ptr noundef %0) + call void @llvm.lifetime.end.p0(i64 4, ptr nonnull %cp) + ret void +} + +declare void @llvm.lifetime.start.p0(i64 immarg, ptr nocapture) + +declare void @llvm.va_copy.p0(ptr, ptr) + +declare void @valist(ptr noundef) + +declare void @llvm.lifetime.end.p0(i64 immarg, ptr nocapture) + +define hidden void @start_once(...) { +; CHECK-LABEL: define {{[^@]+}}@start_once(ptr %varargs) { +; CHECK-NEXT: entry: +; CHECK-NEXT: %s = alloca ptr, align 4 +; CHECK-NEXT: call void @llvm.lifetime.start.p0(i64 4, ptr nonnull %s) +; CHECK-NEXT: store ptr %varargs, ptr %s, align 4 +; CHECK-NEXT: %0 = load ptr, ptr %s, align 4 +; CHECK-NEXT: call void @valist(ptr noundef %0) +; CHECK-NEXT: call void @llvm.lifetime.end.p0(i64 4, ptr nonnull %s) +; CHECK-NEXT: ret void +; +entry: + %s = alloca ptr, align 4 + call void @llvm.lifetime.start.p0(i64 4, ptr nonnull %s) + call void @llvm.va_start.p0(ptr nonnull %s) + %0 = load ptr, ptr %s, align 4 + call void @valist(ptr noundef %0) + call void @llvm.va_end.p0(ptr %s) + call void @llvm.lifetime.end.p0(i64 4, ptr nonnull %s) + ret void +} + +declare void @llvm.va_start.p0(ptr) + +declare void @llvm.va_end.p0(ptr) + +define hidden void @start_twice(...) 
{ +; CHECK-LABEL: define {{[^@]+}}@start_twice(ptr %varargs) { +; CHECK-NEXT: entry: +; CHECK-NEXT: %s0 = alloca ptr, align 4 +; CHECK-NEXT: %s1 = alloca ptr, align 4 +; CHECK-NEXT: call void @llvm.lifetime.start.p0(i64 4, ptr nonnull %s0) +; CHECK-NEXT: call void @llvm.lifetime.start.p0(i64 4, ptr nonnull %s1) +; CHECK-NEXT: store ptr %varargs, ptr %s0, align 4 +; CHECK-NEXT: %0 = load ptr, ptr %s0, align 4 +; CHECK-NEXT: call void @valist(ptr noundef %0) +; CHECK-NEXT: store ptr %varargs, ptr %s1, align 4 +; CHECK-NEXT: %1 = load ptr, ptr %s1, align 4 +; CHECK-NEXT: call void @valist(ptr noundef %1) +; CHECK-NEXT: call void @llvm.lifetime.end.p0(i64 4, ptr nonnull %s1) +; CHECK-NEXT: call void @llvm.lifetime.end.p0(i64 4, ptr nonnull %s0) +; CHECK-NEXT: ret void +; +entry: + %s0 = alloca ptr, align 4 + %s1 = alloca ptr, align 4 + call void @llvm.lifetime.start.p0(i64 4, ptr nonnull %s0) + call void @llvm.lifetime.start.p0(i64 4, ptr nonnull %s1) + call void @llvm.va_start.p0(ptr nonnull %s0) + %0 = load ptr, ptr %s0, align 4 + call void @valist(ptr noundef %0) + call void @llvm.va_end.p0(ptr %s0) + call void @llvm.va_start.p0(ptr nonnull %s1) + %1 = load ptr, ptr %s1, align 4 + call void @valist(ptr noundef %1) + call void @llvm.va_end.p0(ptr %s1) + call void @llvm.lifetime.end.p0(i64 4, ptr nonnull %s1) + call void @llvm.lifetime.end.p0(i64 4, ptr nonnull %s0) + ret void +} + +define hidden void @single_i32(i32 noundef %x) { +; CHECK-LABEL: define {{[^@]+}}@single_i32(i32 noundef %x) { +; CHECK-NEXT: entry: +; CHECK-NEXT: %vararg_buffer = alloca %single_i32.vararg, align 16 +; CHECK-NEXT: call void @llvm.lifetime.start.p0(i64 4, ptr %vararg_buffer) +; CHECK-NEXT: %0 = getelementptr inbounds %single_i32.vararg, ptr %vararg_buffer, i32 0, i32 0 +; CHECK-NEXT: store i32 %x, ptr %0, align 4 +; CHECK-NEXT: call void @vararg(ptr %vararg_buffer) +; CHECK-NEXT: call void @llvm.lifetime.end.p0(i64 4, ptr %vararg_buffer) +; CHECK-NEXT: ret void +; +entry: + tail call void 
(...) @vararg(i32 noundef %x) + ret void +} + +declare void @vararg(...) + +define hidden void @single_double(double noundef %x) { +; CHECK-LABEL: define {{[^@]+}}@single_double(double noundef %x) { +; CHECK-NEXT: entry: +; CHECK-NEXT: %vararg_buffer = alloca %single_double.vararg, align 16 +; CHECK-NEXT: call void @llvm.lifetime.start.p0(i64 8, ptr %vararg_buffer) +; CHECK-NEXT: %0 = getelementptr inbounds %single_double.vararg, ptr %vararg_buffer, i32 0, i32 0 +; CHECK-NEXT: store double %x, ptr %0, align 8 +; CHECK-NEXT: call void @vararg(ptr %vararg_buffer) +; CHECK-NEXT: call void @llvm.lifetime.end.p0(i64 8, ptr %vararg_buffer) +; CHECK-NEXT: ret void +; +entry: + tail call void (...) @vararg(double noundef %x) + ret void +} + +define hidden void @single_v4f32(<4 x float> noundef %x) { +; CHECK-LABEL: define {{[^@]+}}@single_v4f32(<4 x float> noundef %x) { +; CHECK-NEXT: entry: +; CHECK-NEXT: %vararg_buffer = alloca %single_v4f32.vararg, align 16 +; CHECK-NEXT: call void @llvm.lifetime.start.p0(i64 16, ptr %vararg_buffer) +; CHECK-NEXT: %0 = getelementptr inbounds %single_v4f32.vararg, ptr %vararg_buffer, i32 0, i32 0 +; CHECK-NEXT: store <4 x float> %x, ptr %0, align 16 +; CHECK-NEXT: call void @vararg(ptr %vararg_buffer) +; CHECK-NEXT: call void @llvm.lifetime.end.p0(i64 16, ptr %vararg_buffer) +; CHECK-NEXT: ret void +; +entry: + tail call void (...) 
@vararg(<4 x float> noundef %x) + ret void +} + +define hidden void @single_v8f32(<8 x float> noundef %x) { +; CHECK-LABEL: define {{[^@]+}}@single_v8f32(<8 x float> noundef %x) { +; CHECK-NEXT: entry: +; CHECK-NEXT: %vararg_buffer = alloca %single_v8f32.vararg, align 32 +; CHECK-NEXT: call void @llvm.lifetime.start.p0(i64 32, ptr %vararg_buffer) +; CHECK-NEXT: %0 = getelementptr inbounds %single_v8f32.vararg, ptr %vararg_buffer, i32 0, i32 0 +; CHECK-NEXT: store <8 x float> %x, ptr %0, align 32 +; CHECK-NEXT: call void @vararg(ptr %vararg_buffer) +; CHECK-NEXT: call void @llvm.lifetime.end.p0(i64 32, ptr %vararg_buffer) +; CHECK-NEXT: ret void +; +entry: + tail call void (...) @vararg(<8 x float> noundef %x) + ret void +} + +define hidden void @single_v16f32(<16 x float> noundef %x) { +; CHECK-LABEL: define {{[^@]+}}@single_v16f32(<16 x float> noundef %x) { +; CHECK-NEXT: entry: +; CHECK-NEXT: %vararg_buffer = alloca %single_v16f32.vararg, align 64 +; CHECK-NEXT: call void @llvm.lifetime.start.p0(i64 64, ptr %vararg_buffer) +; CHECK-NEXT: %0 = getelementptr inbounds %single_v16f32.vararg, ptr %vararg_buffer, i32 0, i32 0 +; CHECK-NEXT: store <16 x float> %x, ptr %0, align 64 +; CHECK-NEXT: call void @vararg(ptr %vararg_buffer) +; CHECK-NEXT: call void @llvm.lifetime.end.p0(i64 64, ptr %vararg_buffer) +; CHECK-NEXT: ret void +; +entry: + tail call void (...) 
@vararg(<16 x float> noundef %x) + ret void +} + +define hidden void @single_v32f32(<32 x float> noundef %x) { +; CHECK-LABEL: define {{[^@]+}}@single_v32f32(<32 x float> noundef %x) { +; CHECK-NEXT: entry: +; CHECK-NEXT: %vararg_buffer = alloca %single_v32f32.vararg, align 128 +; CHECK-NEXT: call void @llvm.lifetime.start.p0(i64 128, ptr %vararg_buffer) +; CHECK-NEXT: %0 = getelementptr inbounds %single_v32f32.vararg, ptr %vararg_buffer, i32 0, i32 0 +; CHECK-NEXT: store <32 x float> %x, ptr %0, align 128 +; CHECK-NEXT: call void @vararg(ptr %vararg_buffer) +; CHECK-NEXT: call void @llvm.lifetime.end.p0(i64 128, ptr %vararg_buffer) +; CHECK-NEXT: ret void +; +entry: + tail call void (...) @vararg(<32 x float> noundef %x) + ret void +} + +define hidden void @i32_double(i32 noundef %x, double noundef %y) { +; CHECK-LABEL: define {{[^@]+}}@i32_double(i32 noundef %x, double noundef %y) { +; CHECK-NEXT: entry: +; CHECK-NEXT: %vararg_buffer = alloca %i32_double.vararg, align 16 +; CHECK-NEXT: call void @llvm.lifetime.start.p0(i64 16, ptr %vararg_buffer) +; CHECK-NEXT: %0 = getelementptr inbounds %i32_double.vararg, ptr %vararg_buffer, i32 0, i32 0 +; CHECK-NEXT: store i32 %x, ptr %0, align 4 +; CHECK-NEXT: %1 = getelementptr inbounds %i32_double.vararg, ptr %vararg_buffer, i32 0, i32 2 +; CHECK-NEXT: store double %y, ptr %1, align 8 +; CHECK-NEXT: call void @vararg(ptr %vararg_buffer) +; CHECK-NEXT: call void @llvm.lifetime.end.p0(i64 16, ptr %vararg_buffer) +; CHECK-NEXT: ret void +; +entry: + tail call void (...) 
@vararg(i32 noundef %x, double noundef %y) + ret void +} + +define hidden void @double_i32(double noundef %x, i32 noundef %y) { +; CHECK-LABEL: define {{[^@]+}}@double_i32(double noundef %x, i32 noundef %y) { +; CHECK-NEXT: entry: +; CHECK-NEXT: %vararg_buffer = alloca %double_i32.vararg, align 16 +; CHECK-NEXT: call void @llvm.lifetime.start.p0(i64 12, ptr %vararg_buffer) +; CHECK-NEXT: %0 = getelementptr inbounds %double_i32.vararg, ptr %vararg_buffer, i32 0, i32 0 +; CHECK-NEXT: store double %x, ptr %0, align 8 +; CHECK-NEXT: %1 = getelementptr inbounds %double_i32.vararg, ptr %vararg_buffer, i32 0, i32 1 +; CHECK-NEXT: store i32 %y, ptr %1, align 4 +; CHECK-NEXT: call void @vararg(ptr %vararg_buffer) +; CHECK-NEXT: call void @llvm.lifetime.end.p0(i64 12, ptr %vararg_buffer) +; CHECK-NEXT: ret void +; +entry: + tail call void (...) @vararg(double noundef %x, i32 noundef %y) + ret void +} + +define hidden void @i32_libcS(i32 noundef %x, ptr noundef byval(%struct.libcS) align 8 %y) { +; CHECK-LABEL: define {{[^@]+}}@i32_libcS(i32 noundef %x, ptr noundef byval(%struct.libcS) align 8 %y) { +; CHECK-NEXT: entry: +; CHECK-NEXT: %IndirectAlloca = alloca %struct.libcS, align 8 +; CHECK-NEXT: %vararg_buffer = alloca %i32_libcS.vararg, align 16 +; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr %IndirectAlloca, ptr %y, i64 24, i1 false) +; CHECK-NEXT: call void @llvm.lifetime.start.p0(i64 8, ptr %vararg_buffer) +; CHECK-NEXT: %0 = getelementptr inbounds %i32_libcS.vararg, ptr %vararg_buffer, i32 0, i32 0 +; CHECK-NEXT: store i32 %x, ptr %0, align 4 +; CHECK-NEXT: %1 = getelementptr inbounds %i32_libcS.vararg, ptr %vararg_buffer, i32 0, i32 1 +; CHECK-NEXT: store ptr %IndirectAlloca, ptr %1, align 4 +; CHECK-NEXT: call void @vararg(ptr %vararg_buffer) +; CHECK-NEXT: call void @llvm.lifetime.end.p0(i64 8, ptr %vararg_buffer) +; CHECK-NEXT: ret void +; +entry: + tail call void (...) 
@vararg(i32 noundef %x, ptr noundef nonnull byval(%struct.libcS) align 8 %y) + ret void +} + +define hidden void @libcS_i32(ptr noundef byval(%struct.libcS) align 8 %x, i32 noundef %y) { +; CHECK-LABEL: define {{[^@]+}}@libcS_i32(ptr noundef byval(%struct.libcS) align 8 %x, i32 noundef %y) { +; CHECK-NEXT: entry: +; CHECK-NEXT: %IndirectAlloca = alloca %struct.libcS, align 8 +; CHECK-NEXT: %vararg_buffer = alloca %libcS_i32.vararg, align 16 +; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr %IndirectAlloca, ptr %x, i64 24, i1 false) +; CHECK-NEXT: call void @llvm.lifetime.start.p0(i64 8, ptr %vararg_buffer) +; CHECK-NEXT: %0 = getelementptr inbounds %libcS_i32.vararg, ptr %vararg_buffer, i32 0, i32 0 +; CHECK-NEXT: store ptr %IndirectAlloca, ptr %0, align 4 +; CHECK-NEXT: %1 = getelementptr inbounds %libcS_i32.vararg, ptr %vararg_buffer, i32 0, i32 1 +; CHECK-NEXT: store i32 %y, ptr %1, align 4 +; CHECK-NEXT: call void @vararg(ptr %vararg_buffer) +; CHECK-NEXT: call void @llvm.lifetime.end.p0(i64 8, ptr %vararg_buffer) +; CHECK-NEXT: ret void +; +entry: + tail call void (...) @vararg(ptr noundef nonnull byval(%struct.libcS) align 8 %x, i32 noundef %y) + ret void +} + +define hidden void @i32_v4f32(i32 noundef %x, <4 x float> noundef %y) { +; CHECK-LABEL: define {{[^@]+}}@i32_v4f32(i32 noundef %x, <4 x float> noundef %y) { +; CHECK-NEXT: entry: +; CHECK-NEXT: %vararg_buffer = alloca %i32_v4f32.vararg, align 16 +; CHECK-NEXT: call void @llvm.lifetime.start.p0(i64 32, ptr %vararg_buffer) +; CHECK-NEXT: %0 = getelementptr inbounds %i32_v4f32.vararg, ptr %vararg_buffer, i32 0, i32 0 +; CHECK-NEXT: store i32 %x, ptr %0, align 4 +; CHECK-NEXT: %1 = getelementptr inbounds %i32_v4f32.vararg, ptr %vararg_buffer, i32 0, i32 2 +; CHECK-NEXT: store <4 x float> %y, ptr %1, align 16 +; CHECK-NEXT: call void @vararg(ptr %vararg_buffer) +; CHECK-NEXT: call void @llvm.lifetime.end.p0(i64 32, ptr %vararg_buffer) +; CHECK-NEXT: ret void +; +entry: + tail call void (...) 
@vararg(i32 noundef %x, <4 x float> noundef %y) + ret void +} + +define hidden void @v4f32_i32(<4 x float> noundef %x, i32 noundef %y) { +; CHECK-LABEL: define {{[^@]+}}@v4f32_i32(<4 x float> noundef %x, i32 noundef %y) { +; CHECK-NEXT: entry: +; CHECK-NEXT: %vararg_buffer = alloca %v4f32_i32.vararg, align 16 +; CHECK-NEXT: call void @llvm.lifetime.start.p0(i64 20, ptr %vararg_buffer) +; CHECK-NEXT: %0 = getelementptr inbounds %v4f32_i32.vararg, ptr %vararg_buffer, i32 0, i32 0 +; CHECK-NEXT: store <4 x float> %x, ptr %0, align 16 +; CHECK-NEXT: %1 = getelementptr inbounds %v4f32_i32.vararg, ptr %vararg_buffer, i32 0, i32 1 +; CHECK-NEXT: store i32 %y, ptr %1, align 4 +; CHECK-NEXT: call void @vararg(ptr %vararg_buffer) +; CHECK-NEXT: call void @llvm.lifetime.end.p0(i64 20, ptr %vararg_buffer) +; CHECK-NEXT: ret void +; +entry: + tail call void (...) @vararg(<4 x float> noundef %x, i32 noundef %y) + ret void +} + +define hidden void @i32_v8f32(i32 noundef %x, <8 x float> noundef %y) { +; CHECK-LABEL: define {{[^@]+}}@i32_v8f32(i32 noundef %x, <8 x float> noundef %y) { +; CHECK-NEXT: entry: +; CHECK-NEXT: %vararg_buffer = alloca %i32_v8f32.vararg, align 32 +; CHECK-NEXT: call void @llvm.lifetime.start.p0(i64 64, ptr %vararg_buffer) +; CHECK-NEXT: %0 = getelementptr inbounds %i32_v8f32.vararg, ptr %vararg_buffer, i32 0, i32 0 +; CHECK-NEXT: store i32 %x, ptr %0, align 4 +; CHECK-NEXT: %1 = getelementptr inbounds %i32_v8f32.vararg, ptr %vararg_buffer, i32 0, i32 2 +; CHECK-NEXT: store <8 x float> %y, ptr %1, align 32 +; CHECK-NEXT: call void @vararg(ptr %vararg_buffer) +; CHECK-NEXT: call void @llvm.lifetime.end.p0(i64 64, ptr %vararg_buffer) +; CHECK-NEXT: ret void +; +entry: + tail call void (...) 
@vararg(i32 noundef %x, <8 x float> noundef %y) + ret void +} + +define hidden void @v8f32_i32(<8 x float> noundef %x, i32 noundef %y) { +; CHECK-LABEL: define {{[^@]+}}@v8f32_i32(<8 x float> noundef %x, i32 noundef %y) { +; CHECK-NEXT: entry: +; CHECK-NEXT: %vararg_buffer = alloca %v8f32_i32.vararg, align 32 +; CHECK-NEXT: call void @llvm.lifetime.start.p0(i64 36, ptr %vararg_buffer) +; CHECK-NEXT: %0 = getelementptr inbounds %v8f32_i32.vararg, ptr %vararg_buffer, i32 0, i32 0 +; CHECK-NEXT: store <8 x float> %x, ptr %0, align 32 +; CHECK-NEXT: %1 = getelementptr inbounds %v8f32_i32.vararg, ptr %vararg_buffer, i32 0, i32 1 +; CHECK-NEXT: store i32 %y, ptr %1, align 4 +; CHECK-NEXT: call void @vararg(ptr %vararg_buffer) +; CHECK-NEXT: call void @llvm.lifetime.end.p0(i64 36, ptr %vararg_buffer) +; CHECK-NEXT: ret void +; +entry: + tail call void (...) @vararg(<8 x float> noundef %x, i32 noundef %y) + ret void +} + +define hidden void @i32_v16f32(i32 noundef %x, <16 x float> noundef %y) { +; CHECK-LABEL: define {{[^@]+}}@i32_v16f32(i32 noundef %x, <16 x float> noundef %y) { +; CHECK-NEXT: entry: +; CHECK-NEXT: %vararg_buffer = alloca %i32_v16f32.vararg, align 64 +; CHECK-NEXT: call void @llvm.lifetime.start.p0(i64 128, ptr %vararg_buffer) +; CHECK-NEXT: %0 = getelementptr inbounds %i32_v16f32.vararg, ptr %vararg_buffer, i32 0, i32 0 +; CHECK-NEXT: store i32 %x, ptr %0, align 4 +; CHECK-NEXT: %1 = getelementptr inbounds %i32_v16f32.vararg, ptr %vararg_buffer, i32 0, i32 2 +; CHECK-NEXT: store <16 x float> %y, ptr %1, align 64 +; CHECK-NEXT: call void @vararg(ptr %vararg_buffer) +; CHECK-NEXT: call void @llvm.lifetime.end.p0(i64 128, ptr %vararg_buffer) +; CHECK-NEXT: ret void +; +entry: + tail call void (...) 
@vararg(i32 noundef %x, <16 x float> noundef %y) + ret void +} + +define hidden void @v16f32_i32(<16 x float> noundef %x, i32 noundef %y) { +; CHECK-LABEL: define {{[^@]+}}@v16f32_i32(<16 x float> noundef %x, i32 noundef %y) { +; CHECK-NEXT: entry: +; CHECK-NEXT: %vararg_buffer = alloca %v16f32_i32.vararg, align 64 +; CHECK-NEXT: call void @llvm.lifetime.start.p0(i64 68, ptr %vararg_buffer) +; CHECK-NEXT: %0 = getelementptr inbounds %v16f32_i32.vararg, ptr %vararg_buffer, i32 0, i32 0 +; CHECK-NEXT: store <16 x float> %x, ptr %0, align 64 +; CHECK-NEXT: %1 = getelementptr inbounds %v16f32_i32.vararg, ptr %vararg_buffer, i32 0, i32 1 +; CHECK-NEXT: store i32 %y, ptr %1, align 4 +; CHECK-NEXT: call void @vararg(ptr %vararg_buffer) +; CHECK-NEXT: call void @llvm.lifetime.end.p0(i64 68, ptr %vararg_buffer) +; CHECK-NEXT: ret void +; +entry: + tail call void (...) @vararg(<16 x float> noundef %x, i32 noundef %y) + ret void +} + +define hidden void @i32_v32f32(i32 noundef %x, <32 x float> noundef %y) { +; CHECK-LABEL: define {{[^@]+}}@i32_v32f32(i32 noundef %x, <32 x float> noundef %y) { +; CHECK-NEXT: entry: +; CHECK-NEXT: %vararg_buffer = alloca %i32_v32f32.vararg, align 128 +; CHECK-NEXT: call void @llvm.lifetime.start.p0(i64 256, ptr %vararg_buffer) +; CHECK-NEXT: %0 = getelementptr inbounds %i32_v32f32.vararg, ptr %vararg_buffer, i32 0, i32 0 +; CHECK-NEXT: store i32 %x, ptr %0, align 4 +; CHECK-NEXT: %1 = getelementptr inbounds %i32_v32f32.vararg, ptr %vararg_buffer, i32 0, i32 2 +; CHECK-NEXT: store <32 x float> %y, ptr %1, align 128 +; CHECK-NEXT: call void @vararg(ptr %vararg_buffer) +; CHECK-NEXT: call void @llvm.lifetime.end.p0(i64 256, ptr %vararg_buffer) +; CHECK-NEXT: ret void +; +entry: + tail call void (...) 
@vararg(i32 noundef %x, <32 x float> noundef %y) + ret void +} + +define hidden void @v32f32_i32(<32 x float> noundef %x, i32 noundef %y) { +; CHECK-LABEL: define {{[^@]+}}@v32f32_i32(<32 x float> noundef %x, i32 noundef %y) { +; CHECK-NEXT: entry: +; CHECK-NEXT: %vararg_buffer = alloca %v32f32_i32.vararg, align 128 +; CHECK-NEXT: call void @llvm.lifetime.start.p0(i64 132, ptr %vararg_buffer) +; CHECK-NEXT: %0 = getelementptr inbounds %v32f32_i32.vararg, ptr %vararg_buffer, i32 0, i32 0 +; CHECK-NEXT: store <32 x float> %x, ptr %0, align 128 +; CHECK-NEXT: %1 = getelementptr inbounds %v32f32_i32.vararg, ptr %vararg_buffer, i32 0, i32 1 +; CHECK-NEXT: store i32 %y, ptr %1, align 4 +; CHECK-NEXT: call void @vararg(ptr %vararg_buffer) +; CHECK-NEXT: call void @llvm.lifetime.end.p0(i64 132, ptr %vararg_buffer) +; CHECK-NEXT: ret void +; +entry: + tail call void (...) @vararg(<32 x float> noundef %x, i32 noundef %y) + ret void +} + +define hidden void @fptr_single_i32(i32 noundef %x) { +; CHECK-LABEL: define {{[^@]+}}@fptr_single_i32(i32 noundef %x) { +; CHECK-NEXT: entry: +; CHECK-NEXT: %vararg_buffer = alloca %fptr_single_i32.vararg, align 16 +; CHECK-NEXT: %0 = load volatile ptr, ptr @vararg_ptr, align 4 +; CHECK-NEXT: call void @llvm.lifetime.start.p0(i64 4, ptr %vararg_buffer) +; CHECK-NEXT: %1 = getelementptr inbounds %fptr_single_i32.vararg, ptr %vararg_buffer, i32 0, i32 0 +; CHECK-NEXT: store i32 %x, ptr %1, align 4 +; CHECK-NEXT: call void %0(ptr %vararg_buffer) +; CHECK-NEXT: call void @llvm.lifetime.end.p0(i64 4, ptr %vararg_buffer) +; CHECK-NEXT: ret void +; +entry: + %0 = load volatile ptr, ptr @vararg_ptr, align 4 + tail call void (...) 
%0(i32 noundef %x) + ret void +} + +define hidden void @fptr_libcS(ptr noundef byval(%struct.libcS) align 8 %x) { +; CHECK-LABEL: define {{[^@]+}}@fptr_libcS(ptr noundef byval(%struct.libcS) align 8 %x) { +; CHECK-NEXT: entry: +; CHECK-NEXT: %IndirectAlloca = alloca %struct.libcS, align 8 +; CHECK-NEXT: %vararg_buffer = alloca %fptr_libcS.vararg, align 16 +; CHECK-NEXT: %0 = load volatile ptr, ptr @vararg_ptr, align 4 +; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr %IndirectAlloca, ptr %x, i64 24, i1 false) +; CHECK-NEXT: call void @llvm.lifetime.start.p0(i64 4, ptr %vararg_buffer) +; CHECK-NEXT: %1 = getelementptr inbounds %fptr_libcS.vararg, ptr %vararg_buffer, i32 0, i32 0 +; CHECK-NEXT: store ptr %IndirectAlloca, ptr %1, align 4 +; CHECK-NEXT: call void %0(ptr %vararg_buffer) +; CHECK-NEXT: call void @llvm.lifetime.end.p0(i64 4, ptr %vararg_buffer) +; CHECK-NEXT: ret void +; +entry: + %0 = load volatile ptr, ptr @vararg_ptr, align 4 + tail call void (...) %0(ptr noundef nonnull byval(%struct.libcS) align 8 %x) + ret void +} + diff --git a/llvm/test/CodeGen/WebAssembly/simd-arith.ll b/llvm/test/CodeGen/WebAssembly/simd-arith.ll index 761a754..67388b6 100644 --- a/llvm/test/CodeGen/WebAssembly/simd-arith.ll +++ b/llvm/test/CodeGen/WebAssembly/simd-arith.ll @@ -11788,27 +11788,35 @@ define <4 x float> @minnum_intrinsic_v4f32(<4 x float> %x, <4 x float> %y) { ; NO-SIMD128-LABEL: minnum_intrinsic_v4f32: ; NO-SIMD128: .functype minnum_intrinsic_v4f32 (i32, f32, f32, f32, f32, f32, f32, f32, f32) -> () ; NO-SIMD128-NEXT: # %bb.0: -; NO-SIMD128-NEXT: call $push0=, fminf, $4, $8 -; NO-SIMD128-NEXT: f32.store 12($0), $pop0 -; NO-SIMD128-NEXT: call $push1=, fminf, $3, $7 -; NO-SIMD128-NEXT: f32.store 8($0), $pop1 -; NO-SIMD128-NEXT: call $push2=, fminf, $2, $6 -; NO-SIMD128-NEXT: f32.store 4($0), $pop2 -; NO-SIMD128-NEXT: call $push3=, fminf, $1, $5 -; NO-SIMD128-NEXT: f32.store 0($0), $pop3 +; NO-SIMD128-NEXT: f32.lt $push0=, $4, $8 +; NO-SIMD128-NEXT: f32.select 
$push1=, $4, $8, $pop0 +; NO-SIMD128-NEXT: f32.store 12($0), $pop1 +; NO-SIMD128-NEXT: f32.lt $push2=, $3, $7 +; NO-SIMD128-NEXT: f32.select $push3=, $3, $7, $pop2 +; NO-SIMD128-NEXT: f32.store 8($0), $pop3 +; NO-SIMD128-NEXT: f32.lt $push4=, $2, $6 +; NO-SIMD128-NEXT: f32.select $push5=, $2, $6, $pop4 +; NO-SIMD128-NEXT: f32.store 4($0), $pop5 +; NO-SIMD128-NEXT: f32.lt $push6=, $1, $5 +; NO-SIMD128-NEXT: f32.select $push7=, $1, $5, $pop6 +; NO-SIMD128-NEXT: f32.store 0($0), $pop7 ; NO-SIMD128-NEXT: return ; ; NO-SIMD128-FAST-LABEL: minnum_intrinsic_v4f32: ; NO-SIMD128-FAST: .functype minnum_intrinsic_v4f32 (i32, f32, f32, f32, f32, f32, f32, f32, f32) -> () ; NO-SIMD128-FAST-NEXT: # %bb.0: -; NO-SIMD128-FAST-NEXT: call $push0=, fminf, $1, $5 -; NO-SIMD128-FAST-NEXT: f32.store 0($0), $pop0 -; NO-SIMD128-FAST-NEXT: call $push1=, fminf, $2, $6 -; NO-SIMD128-FAST-NEXT: f32.store 4($0), $pop1 -; NO-SIMD128-FAST-NEXT: call $push2=, fminf, $3, $7 -; NO-SIMD128-FAST-NEXT: f32.store 8($0), $pop2 -; NO-SIMD128-FAST-NEXT: call $push3=, fminf, $4, $8 -; NO-SIMD128-FAST-NEXT: f32.store 12($0), $pop3 +; NO-SIMD128-FAST-NEXT: f32.lt $push0=, $1, $5 +; NO-SIMD128-FAST-NEXT: f32.select $push1=, $1, $5, $pop0 +; NO-SIMD128-FAST-NEXT: f32.store 0($0), $pop1 +; NO-SIMD128-FAST-NEXT: f32.lt $push2=, $2, $6 +; NO-SIMD128-FAST-NEXT: f32.select $push3=, $2, $6, $pop2 +; NO-SIMD128-FAST-NEXT: f32.store 4($0), $pop3 +; NO-SIMD128-FAST-NEXT: f32.lt $push4=, $3, $7 +; NO-SIMD128-FAST-NEXT: f32.select $push5=, $3, $7, $pop4 +; NO-SIMD128-FAST-NEXT: f32.store 8($0), $pop5 +; NO-SIMD128-FAST-NEXT: f32.lt $push6=, $4, $8 +; NO-SIMD128-FAST-NEXT: f32.select $push7=, $4, $8, $pop6 +; NO-SIMD128-FAST-NEXT: f32.store 12($0), $pop7 ; NO-SIMD128-FAST-NEXT: return %a = call nnan <4 x float> @llvm.minnum.v4f32(<4 x float> %x, <4 x float> %y) ret <4 x float> %a @@ -11830,26 +11838,26 @@ define <4 x float> @minnum_nsz_intrinsic_v4f32(<4 x float> %x, <4 x float> %y) { ; NO-SIMD128-LABEL: 
minnum_nsz_intrinsic_v4f32: ; NO-SIMD128: .functype minnum_nsz_intrinsic_v4f32 (i32, f32, f32, f32, f32, f32, f32, f32, f32) -> () ; NO-SIMD128-NEXT: # %bb.0: -; NO-SIMD128-NEXT: call $push0=, fminf, $4, $8 +; NO-SIMD128-NEXT: f32.min $push0=, $4, $8 ; NO-SIMD128-NEXT: f32.store 12($0), $pop0 -; NO-SIMD128-NEXT: call $push1=, fminf, $3, $7 +; NO-SIMD128-NEXT: f32.min $push1=, $3, $7 ; NO-SIMD128-NEXT: f32.store 8($0), $pop1 -; NO-SIMD128-NEXT: call $push2=, fminf, $2, $6 +; NO-SIMD128-NEXT: f32.min $push2=, $2, $6 ; NO-SIMD128-NEXT: f32.store 4($0), $pop2 -; NO-SIMD128-NEXT: call $push3=, fminf, $1, $5 +; NO-SIMD128-NEXT: f32.min $push3=, $1, $5 ; NO-SIMD128-NEXT: f32.store 0($0), $pop3 ; NO-SIMD128-NEXT: return ; ; NO-SIMD128-FAST-LABEL: minnum_nsz_intrinsic_v4f32: ; NO-SIMD128-FAST: .functype minnum_nsz_intrinsic_v4f32 (i32, f32, f32, f32, f32, f32, f32, f32, f32) -> () ; NO-SIMD128-FAST-NEXT: # %bb.0: -; NO-SIMD128-FAST-NEXT: call $push0=, fminf, $1, $5 +; NO-SIMD128-FAST-NEXT: f32.min $push0=, $1, $5 ; NO-SIMD128-FAST-NEXT: f32.store 0($0), $pop0 -; NO-SIMD128-FAST-NEXT: call $push1=, fminf, $2, $6 +; NO-SIMD128-FAST-NEXT: f32.min $push1=, $2, $6 ; NO-SIMD128-FAST-NEXT: f32.store 4($0), $pop1 -; NO-SIMD128-FAST-NEXT: call $push2=, fminf, $3, $7 +; NO-SIMD128-FAST-NEXT: f32.min $push2=, $3, $7 ; NO-SIMD128-FAST-NEXT: f32.store 8($0), $pop2 -; NO-SIMD128-FAST-NEXT: call $push3=, fminf, $4, $8 +; NO-SIMD128-FAST-NEXT: f32.min $push3=, $4, $8 ; NO-SIMD128-FAST-NEXT: f32.store 12($0), $pop3 ; NO-SIMD128-FAST-NEXT: return %a = call nnan nsz <4 x float> @llvm.minnum.v4f32(<4 x float> %x, <4 x float> %y) @@ -11875,16 +11883,16 @@ define <4 x float> @fminnumv432_non_zero_intrinsic(<4 x float> %x) { ; NO-SIMD128: .functype fminnumv432_non_zero_intrinsic (i32, f32, f32, f32, f32) -> () ; NO-SIMD128-NEXT: # %bb.0: ; NO-SIMD128-NEXT: f32.const $push0=, -0x1p0 -; NO-SIMD128-NEXT: call $push1=, fminf, $4, $pop0 +; NO-SIMD128-NEXT: f32.min $push1=, $4, $pop0 ; NO-SIMD128-NEXT: 
f32.store 12($0), $pop1 ; NO-SIMD128-NEXT: f32.const $push7=, -0x1p0 -; NO-SIMD128-NEXT: call $push2=, fminf, $3, $pop7 +; NO-SIMD128-NEXT: f32.min $push2=, $3, $pop7 ; NO-SIMD128-NEXT: f32.store 8($0), $pop2 ; NO-SIMD128-NEXT: f32.const $push6=, -0x1p0 -; NO-SIMD128-NEXT: call $push3=, fminf, $2, $pop6 +; NO-SIMD128-NEXT: f32.min $push3=, $2, $pop6 ; NO-SIMD128-NEXT: f32.store 4($0), $pop3 ; NO-SIMD128-NEXT: f32.const $push5=, -0x1p0 -; NO-SIMD128-NEXT: call $push4=, fminf, $1, $pop5 +; NO-SIMD128-NEXT: f32.min $push4=, $1, $pop5 ; NO-SIMD128-NEXT: f32.store 0($0), $pop4 ; NO-SIMD128-NEXT: return ; @@ -11892,16 +11900,16 @@ define <4 x float> @fminnumv432_non_zero_intrinsic(<4 x float> %x) { ; NO-SIMD128-FAST: .functype fminnumv432_non_zero_intrinsic (i32, f32, f32, f32, f32) -> () ; NO-SIMD128-FAST-NEXT: # %bb.0: ; NO-SIMD128-FAST-NEXT: f32.const $push0=, -0x1p0 -; NO-SIMD128-FAST-NEXT: call $push1=, fminf, $1, $pop0 +; NO-SIMD128-FAST-NEXT: f32.min $push1=, $1, $pop0 ; NO-SIMD128-FAST-NEXT: f32.store 0($0), $pop1 ; NO-SIMD128-FAST-NEXT: f32.const $push7=, -0x1p0 -; NO-SIMD128-FAST-NEXT: call $push2=, fminf, $2, $pop7 +; NO-SIMD128-FAST-NEXT: f32.min $push2=, $2, $pop7 ; NO-SIMD128-FAST-NEXT: f32.store 4($0), $pop2 ; NO-SIMD128-FAST-NEXT: f32.const $push6=, -0x1p0 -; NO-SIMD128-FAST-NEXT: call $push3=, fminf, $3, $pop6 +; NO-SIMD128-FAST-NEXT: f32.min $push3=, $3, $pop6 ; NO-SIMD128-FAST-NEXT: f32.store 8($0), $pop3 ; NO-SIMD128-FAST-NEXT: f32.const $push5=, -0x1p0 -; NO-SIMD128-FAST-NEXT: call $push4=, fminf, $4, $pop5 +; NO-SIMD128-FAST-NEXT: f32.min $push4=, $4, $pop5 ; NO-SIMD128-FAST-NEXT: f32.store 12($0), $pop4 ; NO-SIMD128-FAST-NEXT: return %a = call nnan <4 x float> @llvm.minnum.v4f32(<4 x float> %x, <4 x float><float -1.0, float -1.0, float -1.0, float -1.0>) @@ -11979,34 +11987,38 @@ define <4 x float> @fminnumv432_one_zero_intrinsic(<4 x float> %x) { ; NO-SIMD128: .functype fminnumv432_one_zero_intrinsic (i32, f32, f32, f32, f32) -> () ; 
NO-SIMD128-NEXT: # %bb.0: ; NO-SIMD128-NEXT: f32.const $push0=, -0x1p0 -; NO-SIMD128-NEXT: call $push1=, fminf, $4, $pop0 +; NO-SIMD128-NEXT: f32.min $push1=, $4, $pop0 ; NO-SIMD128-NEXT: f32.store 12($0), $pop1 -; NO-SIMD128-NEXT: f32.const $push7=, -0x1p0 -; NO-SIMD128-NEXT: call $push2=, fminf, $3, $pop7 +; NO-SIMD128-NEXT: f32.const $push9=, -0x1p0 +; NO-SIMD128-NEXT: f32.min $push2=, $3, $pop9 ; NO-SIMD128-NEXT: f32.store 8($0), $pop2 ; NO-SIMD128-NEXT: f32.const $push3=, 0x0p0 -; NO-SIMD128-NEXT: call $push4=, fminf, $2, $pop3 -; NO-SIMD128-NEXT: f32.store 4($0), $pop4 -; NO-SIMD128-NEXT: f32.const $push6=, -0x1p0 -; NO-SIMD128-NEXT: call $push5=, fminf, $1, $pop6 -; NO-SIMD128-NEXT: f32.store 0($0), $pop5 +; NO-SIMD128-NEXT: f32.const $push8=, 0x0p0 +; NO-SIMD128-NEXT: f32.lt $push4=, $2, $pop8 +; NO-SIMD128-NEXT: f32.select $push5=, $2, $pop3, $pop4 +; NO-SIMD128-NEXT: f32.store 4($0), $pop5 +; NO-SIMD128-NEXT: f32.const $push7=, -0x1p0 +; NO-SIMD128-NEXT: f32.min $push6=, $1, $pop7 +; NO-SIMD128-NEXT: f32.store 0($0), $pop6 ; NO-SIMD128-NEXT: return ; ; NO-SIMD128-FAST-LABEL: fminnumv432_one_zero_intrinsic: ; NO-SIMD128-FAST: .functype fminnumv432_one_zero_intrinsic (i32, f32, f32, f32, f32) -> () ; NO-SIMD128-FAST-NEXT: # %bb.0: ; NO-SIMD128-FAST-NEXT: f32.const $push0=, -0x1p0 -; NO-SIMD128-FAST-NEXT: call $push1=, fminf, $1, $pop0 +; NO-SIMD128-FAST-NEXT: f32.min $push1=, $1, $pop0 ; NO-SIMD128-FAST-NEXT: f32.store 0($0), $pop1 -; NO-SIMD128-FAST-NEXT: f32.const $push2=, 0x0p0 -; NO-SIMD128-FAST-NEXT: call $push3=, fminf, $2, $pop2 -; NO-SIMD128-FAST-NEXT: f32.store 4($0), $pop3 +; NO-SIMD128-FAST-NEXT: f32.const $push9=, -0x1p0 +; NO-SIMD128-FAST-NEXT: f32.min $push2=, $3, $pop9 +; NO-SIMD128-FAST-NEXT: f32.store 8($0), $pop2 +; NO-SIMD128-FAST-NEXT: f32.const $push3=, 0x0p0 +; NO-SIMD128-FAST-NEXT: f32.const $push8=, 0x0p0 +; NO-SIMD128-FAST-NEXT: f32.lt $push4=, $2, $pop8 +; NO-SIMD128-FAST-NEXT: f32.select $push5=, $2, $pop3, $pop4 +; 
NO-SIMD128-FAST-NEXT: f32.store 4($0), $pop5 ; NO-SIMD128-FAST-NEXT: f32.const $push7=, -0x1p0 -; NO-SIMD128-FAST-NEXT: call $push4=, fminf, $3, $pop7 -; NO-SIMD128-FAST-NEXT: f32.store 8($0), $pop4 -; NO-SIMD128-FAST-NEXT: f32.const $push6=, -0x1p0 -; NO-SIMD128-FAST-NEXT: call $push5=, fminf, $4, $pop6 -; NO-SIMD128-FAST-NEXT: f32.store 12($0), $pop5 +; NO-SIMD128-FAST-NEXT: f32.min $push6=, $4, $pop7 +; NO-SIMD128-FAST-NEXT: f32.store 12($0), $pop6 ; NO-SIMD128-FAST-NEXT: return %a = call nnan <4 x float> @llvm.minnum.v4f32(<4 x float> %x, <4 x float><float -1.0, float 0.0, float -1.0, float -1.0>) ret <4 x float> %a @@ -12126,27 +12138,35 @@ define <4 x float> @maxnum_intrinsic_v4f32(<4 x float> %x, <4 x float> %y) { ; NO-SIMD128-LABEL: maxnum_intrinsic_v4f32: ; NO-SIMD128: .functype maxnum_intrinsic_v4f32 (i32, f32, f32, f32, f32, f32, f32, f32, f32) -> () ; NO-SIMD128-NEXT: # %bb.0: -; NO-SIMD128-NEXT: call $push0=, fmaxf, $4, $8 -; NO-SIMD128-NEXT: f32.store 12($0), $pop0 -; NO-SIMD128-NEXT: call $push1=, fmaxf, $3, $7 -; NO-SIMD128-NEXT: f32.store 8($0), $pop1 -; NO-SIMD128-NEXT: call $push2=, fmaxf, $2, $6 -; NO-SIMD128-NEXT: f32.store 4($0), $pop2 -; NO-SIMD128-NEXT: call $push3=, fmaxf, $1, $5 -; NO-SIMD128-NEXT: f32.store 0($0), $pop3 +; NO-SIMD128-NEXT: f32.gt $push0=, $4, $8 +; NO-SIMD128-NEXT: f32.select $push1=, $4, $8, $pop0 +; NO-SIMD128-NEXT: f32.store 12($0), $pop1 +; NO-SIMD128-NEXT: f32.gt $push2=, $3, $7 +; NO-SIMD128-NEXT: f32.select $push3=, $3, $7, $pop2 +; NO-SIMD128-NEXT: f32.store 8($0), $pop3 +; NO-SIMD128-NEXT: f32.gt $push4=, $2, $6 +; NO-SIMD128-NEXT: f32.select $push5=, $2, $6, $pop4 +; NO-SIMD128-NEXT: f32.store 4($0), $pop5 +; NO-SIMD128-NEXT: f32.gt $push6=, $1, $5 +; NO-SIMD128-NEXT: f32.select $push7=, $1, $5, $pop6 +; NO-SIMD128-NEXT: f32.store 0($0), $pop7 ; NO-SIMD128-NEXT: return ; ; NO-SIMD128-FAST-LABEL: maxnum_intrinsic_v4f32: ; NO-SIMD128-FAST: .functype maxnum_intrinsic_v4f32 (i32, f32, f32, f32, f32, f32, f32, f32, 
f32) -> () ; NO-SIMD128-FAST-NEXT: # %bb.0: -; NO-SIMD128-FAST-NEXT: call $push0=, fmaxf, $1, $5 -; NO-SIMD128-FAST-NEXT: f32.store 0($0), $pop0 -; NO-SIMD128-FAST-NEXT: call $push1=, fmaxf, $2, $6 -; NO-SIMD128-FAST-NEXT: f32.store 4($0), $pop1 -; NO-SIMD128-FAST-NEXT: call $push2=, fmaxf, $3, $7 -; NO-SIMD128-FAST-NEXT: f32.store 8($0), $pop2 -; NO-SIMD128-FAST-NEXT: call $push3=, fmaxf, $4, $8 -; NO-SIMD128-FAST-NEXT: f32.store 12($0), $pop3 +; NO-SIMD128-FAST-NEXT: f32.gt $push0=, $1, $5 +; NO-SIMD128-FAST-NEXT: f32.select $push1=, $1, $5, $pop0 +; NO-SIMD128-FAST-NEXT: f32.store 0($0), $pop1 +; NO-SIMD128-FAST-NEXT: f32.gt $push2=, $2, $6 +; NO-SIMD128-FAST-NEXT: f32.select $push3=, $2, $6, $pop2 +; NO-SIMD128-FAST-NEXT: f32.store 4($0), $pop3 +; NO-SIMD128-FAST-NEXT: f32.gt $push4=, $3, $7 +; NO-SIMD128-FAST-NEXT: f32.select $push5=, $3, $7, $pop4 +; NO-SIMD128-FAST-NEXT: f32.store 8($0), $pop5 +; NO-SIMD128-FAST-NEXT: f32.gt $push6=, $4, $8 +; NO-SIMD128-FAST-NEXT: f32.select $push7=, $4, $8, $pop6 +; NO-SIMD128-FAST-NEXT: f32.store 12($0), $pop7 ; NO-SIMD128-FAST-NEXT: return %a = call nnan <4 x float> @llvm.maxnum.v4f32(<4 x float> %x, <4 x float> %y) ret <4 x float> %a @@ -12168,26 +12188,26 @@ define <4 x float> @maxnum_nsz_intrinsic_v4f32(<4 x float> %x, <4 x float> %y) { ; NO-SIMD128-LABEL: maxnum_nsz_intrinsic_v4f32: ; NO-SIMD128: .functype maxnum_nsz_intrinsic_v4f32 (i32, f32, f32, f32, f32, f32, f32, f32, f32) -> () ; NO-SIMD128-NEXT: # %bb.0: -; NO-SIMD128-NEXT: call $push0=, fmaxf, $4, $8 +; NO-SIMD128-NEXT: f32.max $push0=, $4, $8 ; NO-SIMD128-NEXT: f32.store 12($0), $pop0 -; NO-SIMD128-NEXT: call $push1=, fmaxf, $3, $7 +; NO-SIMD128-NEXT: f32.max $push1=, $3, $7 ; NO-SIMD128-NEXT: f32.store 8($0), $pop1 -; NO-SIMD128-NEXT: call $push2=, fmaxf, $2, $6 +; NO-SIMD128-NEXT: f32.max $push2=, $2, $6 ; NO-SIMD128-NEXT: f32.store 4($0), $pop2 -; NO-SIMD128-NEXT: call $push3=, fmaxf, $1, $5 +; NO-SIMD128-NEXT: f32.max $push3=, $1, $5 ; NO-SIMD128-NEXT: 
f32.store 0($0), $pop3 ; NO-SIMD128-NEXT: return ; ; NO-SIMD128-FAST-LABEL: maxnum_nsz_intrinsic_v4f32: ; NO-SIMD128-FAST: .functype maxnum_nsz_intrinsic_v4f32 (i32, f32, f32, f32, f32, f32, f32, f32, f32) -> () ; NO-SIMD128-FAST-NEXT: # %bb.0: -; NO-SIMD128-FAST-NEXT: call $push0=, fmaxf, $1, $5 +; NO-SIMD128-FAST-NEXT: f32.max $push0=, $1, $5 ; NO-SIMD128-FAST-NEXT: f32.store 0($0), $pop0 -; NO-SIMD128-FAST-NEXT: call $push1=, fmaxf, $2, $6 +; NO-SIMD128-FAST-NEXT: f32.max $push1=, $2, $6 ; NO-SIMD128-FAST-NEXT: f32.store 4($0), $pop1 -; NO-SIMD128-FAST-NEXT: call $push2=, fmaxf, $3, $7 +; NO-SIMD128-FAST-NEXT: f32.max $push2=, $3, $7 ; NO-SIMD128-FAST-NEXT: f32.store 8($0), $pop2 -; NO-SIMD128-FAST-NEXT: call $push3=, fmaxf, $4, $8 +; NO-SIMD128-FAST-NEXT: f32.max $push3=, $4, $8 ; NO-SIMD128-FAST-NEXT: f32.store 12($0), $pop3 ; NO-SIMD128-FAST-NEXT: return %a = call nnan nsz <4 x float> @llvm.maxnum.v4f32(<4 x float> %x, <4 x float> %y) @@ -12265,34 +12285,38 @@ define <4 x float> @maxnum_one_zero_intrinsic_v4f32(<4 x float> %x, <4 x float> ; NO-SIMD128: .functype maxnum_one_zero_intrinsic_v4f32 (i32, f32, f32, f32, f32, f32, f32, f32, f32) -> () ; NO-SIMD128-NEXT: # %bb.0: ; NO-SIMD128-NEXT: f32.const $push0=, -0x1p0 -; NO-SIMD128-NEXT: call $push1=, fmaxf, $4, $pop0 +; NO-SIMD128-NEXT: f32.max $push1=, $4, $pop0 ; NO-SIMD128-NEXT: f32.store 12($0), $pop1 -; NO-SIMD128-NEXT: f32.const $push7=, -0x1p0 -; NO-SIMD128-NEXT: call $push2=, fmaxf, $3, $pop7 +; NO-SIMD128-NEXT: f32.const $push9=, -0x1p0 +; NO-SIMD128-NEXT: f32.max $push2=, $3, $pop9 ; NO-SIMD128-NEXT: f32.store 8($0), $pop2 ; NO-SIMD128-NEXT: f32.const $push3=, 0x0p0 -; NO-SIMD128-NEXT: call $push4=, fmaxf, $2, $pop3 -; NO-SIMD128-NEXT: f32.store 4($0), $pop4 -; NO-SIMD128-NEXT: f32.const $push6=, -0x1p0 -; NO-SIMD128-NEXT: call $push5=, fmaxf, $1, $pop6 -; NO-SIMD128-NEXT: f32.store 0($0), $pop5 +; NO-SIMD128-NEXT: f32.const $push8=, 0x0p0 +; NO-SIMD128-NEXT: f32.gt $push4=, $2, $pop8 +; 
NO-SIMD128-NEXT: f32.select $push5=, $2, $pop3, $pop4 +; NO-SIMD128-NEXT: f32.store 4($0), $pop5 +; NO-SIMD128-NEXT: f32.const $push7=, -0x1p0 +; NO-SIMD128-NEXT: f32.max $push6=, $1, $pop7 +; NO-SIMD128-NEXT: f32.store 0($0), $pop6 ; NO-SIMD128-NEXT: return ; ; NO-SIMD128-FAST-LABEL: maxnum_one_zero_intrinsic_v4f32: ; NO-SIMD128-FAST: .functype maxnum_one_zero_intrinsic_v4f32 (i32, f32, f32, f32, f32, f32, f32, f32, f32) -> () ; NO-SIMD128-FAST-NEXT: # %bb.0: ; NO-SIMD128-FAST-NEXT: f32.const $push0=, -0x1p0 -; NO-SIMD128-FAST-NEXT: call $push1=, fmaxf, $1, $pop0 +; NO-SIMD128-FAST-NEXT: f32.max $push1=, $1, $pop0 ; NO-SIMD128-FAST-NEXT: f32.store 0($0), $pop1 -; NO-SIMD128-FAST-NEXT: f32.const $push2=, 0x0p0 -; NO-SIMD128-FAST-NEXT: call $push3=, fmaxf, $2, $pop2 -; NO-SIMD128-FAST-NEXT: f32.store 4($0), $pop3 +; NO-SIMD128-FAST-NEXT: f32.const $push9=, -0x1p0 +; NO-SIMD128-FAST-NEXT: f32.max $push2=, $3, $pop9 +; NO-SIMD128-FAST-NEXT: f32.store 8($0), $pop2 +; NO-SIMD128-FAST-NEXT: f32.const $push3=, 0x0p0 +; NO-SIMD128-FAST-NEXT: f32.const $push8=, 0x0p0 +; NO-SIMD128-FAST-NEXT: f32.gt $push4=, $2, $pop8 +; NO-SIMD128-FAST-NEXT: f32.select $push5=, $2, $pop3, $pop4 +; NO-SIMD128-FAST-NEXT: f32.store 4($0), $pop5 ; NO-SIMD128-FAST-NEXT: f32.const $push7=, -0x1p0 -; NO-SIMD128-FAST-NEXT: call $push4=, fmaxf, $3, $pop7 -; NO-SIMD128-FAST-NEXT: f32.store 8($0), $pop4 -; NO-SIMD128-FAST-NEXT: f32.const $push6=, -0x1p0 -; NO-SIMD128-FAST-NEXT: call $push5=, fmaxf, $4, $pop6 -; NO-SIMD128-FAST-NEXT: f32.store 12($0), $pop5 +; NO-SIMD128-FAST-NEXT: f32.max $push6=, $4, $pop7 +; NO-SIMD128-FAST-NEXT: f32.store 12($0), $pop6 ; NO-SIMD128-FAST-NEXT: return %a = call nnan <4 x float> @llvm.maxnum.v4f32(<4 x float> %x, <4 x float><float -1.0, float 0.0, float -1.0, float -1.0>) ret <4 x float> %a @@ -12317,16 +12341,16 @@ define <4 x float> @maxnum_non_zero_intrinsic_v4f32(<4 x float> %x, <4 x float> ; NO-SIMD128: .functype maxnum_non_zero_intrinsic_v4f32 (i32, f32, f32, 
f32, f32, f32, f32, f32, f32) -> () ; NO-SIMD128-NEXT: # %bb.0: ; NO-SIMD128-NEXT: f32.const $push0=, -0x1p0 -; NO-SIMD128-NEXT: call $push1=, fmaxf, $4, $pop0 +; NO-SIMD128-NEXT: f32.max $push1=, $4, $pop0 ; NO-SIMD128-NEXT: f32.store 12($0), $pop1 ; NO-SIMD128-NEXT: f32.const $push7=, -0x1p0 -; NO-SIMD128-NEXT: call $push2=, fmaxf, $3, $pop7 +; NO-SIMD128-NEXT: f32.max $push2=, $3, $pop7 ; NO-SIMD128-NEXT: f32.store 8($0), $pop2 ; NO-SIMD128-NEXT: f32.const $push3=, 0x1p0 -; NO-SIMD128-NEXT: call $push4=, fmaxf, $2, $pop3 +; NO-SIMD128-NEXT: f32.max $push4=, $2, $pop3 ; NO-SIMD128-NEXT: f32.store 4($0), $pop4 ; NO-SIMD128-NEXT: f32.const $push6=, -0x1p0 -; NO-SIMD128-NEXT: call $push5=, fmaxf, $1, $pop6 +; NO-SIMD128-NEXT: f32.max $push5=, $1, $pop6 ; NO-SIMD128-NEXT: f32.store 0($0), $pop5 ; NO-SIMD128-NEXT: return ; @@ -12334,16 +12358,16 @@ define <4 x float> @maxnum_non_zero_intrinsic_v4f32(<4 x float> %x, <4 x float> ; NO-SIMD128-FAST: .functype maxnum_non_zero_intrinsic_v4f32 (i32, f32, f32, f32, f32, f32, f32, f32, f32) -> () ; NO-SIMD128-FAST-NEXT: # %bb.0: ; NO-SIMD128-FAST-NEXT: f32.const $push0=, -0x1p0 -; NO-SIMD128-FAST-NEXT: call $push1=, fmaxf, $1, $pop0 +; NO-SIMD128-FAST-NEXT: f32.max $push1=, $1, $pop0 ; NO-SIMD128-FAST-NEXT: f32.store 0($0), $pop1 ; NO-SIMD128-FAST-NEXT: f32.const $push2=, 0x1p0 -; NO-SIMD128-FAST-NEXT: call $push3=, fmaxf, $2, $pop2 +; NO-SIMD128-FAST-NEXT: f32.max $push3=, $2, $pop2 ; NO-SIMD128-FAST-NEXT: f32.store 4($0), $pop3 ; NO-SIMD128-FAST-NEXT: f32.const $push7=, -0x1p0 -; NO-SIMD128-FAST-NEXT: call $push4=, fmaxf, $3, $pop7 +; NO-SIMD128-FAST-NEXT: f32.max $push4=, $3, $pop7 ; NO-SIMD128-FAST-NEXT: f32.store 8($0), $pop4 ; NO-SIMD128-FAST-NEXT: f32.const $push6=, -0x1p0 -; NO-SIMD128-FAST-NEXT: call $push5=, fmaxf, $4, $pop6 +; NO-SIMD128-FAST-NEXT: f32.max $push5=, $4, $pop6 ; NO-SIMD128-FAST-NEXT: f32.store 12($0), $pop5 ; NO-SIMD128-FAST-NEXT: return %a = call nnan <4 x float> @llvm.maxnum.v4f32(<4 x float> %x, <4 
x float><float -1.0, float 1.0, float -1.0, float -1.0>) diff --git a/llvm/test/CodeGen/WebAssembly/vararg-frame.ll b/llvm/test/CodeGen/WebAssembly/vararg-frame.ll new file mode 100644 index 0000000..5c76040 --- /dev/null +++ b/llvm/test/CodeGen/WebAssembly/vararg-frame.ll @@ -0,0 +1,526 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -verify-machineinstrs | FileCheck %s +; REQUIRES: webassembly-registered-target +target datalayout = "e-m:e-p:32:32-p10:8:8-p20:8:8-i64:64-n32:64-S128-ni:1:10:20" +target triple = "wasm32-unknown-unknown" + +; Function Attrs: nounwind +define void @pass_s0() { +; CHECK-LABEL: pass_s0: +; CHECK: .functype pass_s0 () -> () +; CHECK-NEXT: # %bb.0: # %entry +; CHECK-NEXT: i32.const 0 +; CHECK-NEXT: call sink +; CHECK-NEXT: # fallthrough-return +entry: + tail call void (...) @sink() + ret void +} + +declare void @sink(...) + +; Function Attrs: nounwind +define void @pass_s1(i8 %x.coerce) { +; CHECK-LABEL: pass_s1: +; CHECK: .functype pass_s1 (i32) -> () +; CHECK-NEXT: .local i32 +; CHECK-NEXT: # %bb.0: # %entry +; CHECK-NEXT: global.get __stack_pointer +; CHECK-NEXT: i32.const 16 +; CHECK-NEXT: i32.sub +; CHECK-NEXT: local.tee 1 +; CHECK-NEXT: global.set __stack_pointer +; CHECK-NEXT: local.get 1 +; CHECK-NEXT: local.get 0 +; CHECK-NEXT: i32.store 0 +; CHECK-NEXT: local.get 1 +; CHECK-NEXT: call sink +; CHECK-NEXT: local.get 1 +; CHECK-NEXT: i32.const 16 +; CHECK-NEXT: i32.add +; CHECK-NEXT: global.set __stack_pointer +; CHECK-NEXT: # fallthrough-return +entry: + tail call void (...) 
@sink(i8 %x.coerce) + ret void +} + +; Function Attrs: nounwind +define void @pass_s2(i16 %x.coerce) { +; CHECK-LABEL: pass_s2: +; CHECK: .functype pass_s2 (i32) -> () +; CHECK-NEXT: .local i32 +; CHECK-NEXT: # %bb.0: # %entry +; CHECK-NEXT: global.get __stack_pointer +; CHECK-NEXT: i32.const 16 +; CHECK-NEXT: i32.sub +; CHECK-NEXT: local.tee 1 +; CHECK-NEXT: global.set __stack_pointer +; CHECK-NEXT: local.get 1 +; CHECK-NEXT: local.get 0 +; CHECK-NEXT: i32.store 0 +; CHECK-NEXT: local.get 1 +; CHECK-NEXT: call sink +; CHECK-NEXT: local.get 1 +; CHECK-NEXT: i32.const 16 +; CHECK-NEXT: i32.add +; CHECK-NEXT: global.set __stack_pointer +; CHECK-NEXT: # fallthrough-return +entry: + tail call void (...) @sink(i16 %x.coerce) + ret void +} + +; Function Attrs: nounwind +define void @pass_s3(i32 %x.coerce) { +; CHECK-LABEL: pass_s3: +; CHECK: .functype pass_s3 (i32) -> () +; CHECK-NEXT: .local i32 +; CHECK-NEXT: # %bb.0: # %entry +; CHECK-NEXT: global.get __stack_pointer +; CHECK-NEXT: i32.const 16 +; CHECK-NEXT: i32.sub +; CHECK-NEXT: local.tee 1 +; CHECK-NEXT: global.set __stack_pointer +; CHECK-NEXT: local.get 1 +; CHECK-NEXT: local.get 0 +; CHECK-NEXT: i32.store 0 +; CHECK-NEXT: local.get 1 +; CHECK-NEXT: call sink +; CHECK-NEXT: local.get 1 +; CHECK-NEXT: i32.const 16 +; CHECK-NEXT: i32.add +; CHECK-NEXT: global.set __stack_pointer +; CHECK-NEXT: # fallthrough-return +entry: + tail call void (...) 
@sink(i32 %x.coerce) + ret void +} + +; Function Attrs: nounwind +define void @pass_s4(i64 %x.coerce) { +; CHECK-LABEL: pass_s4: +; CHECK: .functype pass_s4 (i64) -> () +; CHECK-NEXT: .local i32 +; CHECK-NEXT: # %bb.0: # %entry +; CHECK-NEXT: global.get __stack_pointer +; CHECK-NEXT: i32.const 16 +; CHECK-NEXT: i32.sub +; CHECK-NEXT: local.tee 1 +; CHECK-NEXT: global.set __stack_pointer +; CHECK-NEXT: local.get 1 +; CHECK-NEXT: local.get 0 +; CHECK-NEXT: i64.store 0 +; CHECK-NEXT: local.get 1 +; CHECK-NEXT: call sink +; CHECK-NEXT: local.get 1 +; CHECK-NEXT: i32.const 16 +; CHECK-NEXT: i32.add +; CHECK-NEXT: global.set __stack_pointer +; CHECK-NEXT: # fallthrough-return +entry: + tail call void (...) @sink(i64 %x.coerce) + ret void +} + +; Function Attrs: nounwind +define void @pass_s5(<4 x i32> noundef %x) { +; CHECK-LABEL: pass_s5: +; CHECK: .functype pass_s5 (i32, i32, i32, i32) -> () +; CHECK-NEXT: .local i32 +; CHECK-NEXT: # %bb.0: # %entry +; CHECK-NEXT: global.get __stack_pointer +; CHECK-NEXT: i32.const 16 +; CHECK-NEXT: i32.sub +; CHECK-NEXT: local.tee 4 +; CHECK-NEXT: global.set __stack_pointer +; CHECK-NEXT: local.get 4 +; CHECK-NEXT: local.get 3 +; CHECK-NEXT: i32.store 12 +; CHECK-NEXT: local.get 4 +; CHECK-NEXT: local.get 2 +; CHECK-NEXT: i32.store 8 +; CHECK-NEXT: local.get 4 +; CHECK-NEXT: local.get 1 +; CHECK-NEXT: i32.store 4 +; CHECK-NEXT: local.get 4 +; CHECK-NEXT: local.get 0 +; CHECK-NEXT: i32.store 0 +; CHECK-NEXT: local.get 4 +; CHECK-NEXT: call sink +; CHECK-NEXT: local.get 4 +; CHECK-NEXT: i32.const 16 +; CHECK-NEXT: i32.add +; CHECK-NEXT: global.set __stack_pointer +; CHECK-NEXT: # fallthrough-return +entry: + tail call void (...) 
@sink(<4 x i32> noundef %x) + ret void +} + +; Function Attrs: nounwind +define void @pass_int_s0(i32 noundef %i) { +; CHECK-LABEL: pass_int_s0: +; CHECK: .functype pass_int_s0 (i32) -> () +; CHECK-NEXT: .local i32 +; CHECK-NEXT: # %bb.0: # %entry +; CHECK-NEXT: global.get __stack_pointer +; CHECK-NEXT: i32.const 16 +; CHECK-NEXT: i32.sub +; CHECK-NEXT: local.tee 1 +; CHECK-NEXT: global.set __stack_pointer +; CHECK-NEXT: local.get 1 +; CHECK-NEXT: local.get 0 +; CHECK-NEXT: i32.store 0 +; CHECK-NEXT: local.get 1 +; CHECK-NEXT: call sink +; CHECK-NEXT: local.get 1 +; CHECK-NEXT: i32.const 16 +; CHECK-NEXT: i32.add +; CHECK-NEXT: global.set __stack_pointer +; CHECK-NEXT: # fallthrough-return +entry: + tail call void (...) @sink(i32 noundef %i) + ret void +} + +; Function Attrs: nounwind +define void @pass_int_s1(i32 noundef %i, i8 %x.coerce) { +; CHECK-LABEL: pass_int_s1: +; CHECK: .functype pass_int_s1 (i32, i32) -> () +; CHECK-NEXT: .local i32 +; CHECK-NEXT: # %bb.0: # %entry +; CHECK-NEXT: global.get __stack_pointer +; CHECK-NEXT: i32.const 16 +; CHECK-NEXT: i32.sub +; CHECK-NEXT: local.tee 2 +; CHECK-NEXT: global.set __stack_pointer +; CHECK-NEXT: local.get 2 +; CHECK-NEXT: local.get 1 +; CHECK-NEXT: i32.store 4 +; CHECK-NEXT: local.get 2 +; CHECK-NEXT: local.get 0 +; CHECK-NEXT: i32.store 0 +; CHECK-NEXT: local.get 2 +; CHECK-NEXT: call sink +; CHECK-NEXT: local.get 2 +; CHECK-NEXT: i32.const 16 +; CHECK-NEXT: i32.add +; CHECK-NEXT: global.set __stack_pointer +; CHECK-NEXT: # fallthrough-return +entry: + tail call void (...) 
@sink(i32 noundef %i, i8 %x.coerce) + ret void +} + +; Function Attrs: nounwind +define void @pass_int_s2(i32 noundef %i, i16 %x.coerce) { +; CHECK-LABEL: pass_int_s2: +; CHECK: .functype pass_int_s2 (i32, i32) -> () +; CHECK-NEXT: .local i32 +; CHECK-NEXT: # %bb.0: # %entry +; CHECK-NEXT: global.get __stack_pointer +; CHECK-NEXT: i32.const 16 +; CHECK-NEXT: i32.sub +; CHECK-NEXT: local.tee 2 +; CHECK-NEXT: global.set __stack_pointer +; CHECK-NEXT: local.get 2 +; CHECK-NEXT: local.get 1 +; CHECK-NEXT: i32.store 4 +; CHECK-NEXT: local.get 2 +; CHECK-NEXT: local.get 0 +; CHECK-NEXT: i32.store 0 +; CHECK-NEXT: local.get 2 +; CHECK-NEXT: call sink +; CHECK-NEXT: local.get 2 +; CHECK-NEXT: i32.const 16 +; CHECK-NEXT: i32.add +; CHECK-NEXT: global.set __stack_pointer +; CHECK-NEXT: # fallthrough-return +entry: + tail call void (...) @sink(i32 noundef %i, i16 %x.coerce) + ret void +} + +; Function Attrs: nounwind +define void @pass_int_s3(i32 noundef %i, i32 %x.coerce) { +; CHECK-LABEL: pass_int_s3: +; CHECK: .functype pass_int_s3 (i32, i32) -> () +; CHECK-NEXT: .local i32 +; CHECK-NEXT: # %bb.0: # %entry +; CHECK-NEXT: global.get __stack_pointer +; CHECK-NEXT: i32.const 16 +; CHECK-NEXT: i32.sub +; CHECK-NEXT: local.tee 2 +; CHECK-NEXT: global.set __stack_pointer +; CHECK-NEXT: local.get 2 +; CHECK-NEXT: local.get 1 +; CHECK-NEXT: i32.store 4 +; CHECK-NEXT: local.get 2 +; CHECK-NEXT: local.get 0 +; CHECK-NEXT: i32.store 0 +; CHECK-NEXT: local.get 2 +; CHECK-NEXT: call sink +; CHECK-NEXT: local.get 2 +; CHECK-NEXT: i32.const 16 +; CHECK-NEXT: i32.add +; CHECK-NEXT: global.set __stack_pointer +; CHECK-NEXT: # fallthrough-return +entry: + tail call void (...) 
@sink(i32 noundef %i, i32 %x.coerce) + ret void +} + +; Function Attrs: nounwind +define void @pass_int_s4(i32 noundef %i, i64 %x.coerce) { +; CHECK-LABEL: pass_int_s4: +; CHECK: .functype pass_int_s4 (i32, i64) -> () +; CHECK-NEXT: .local i32 +; CHECK-NEXT: # %bb.0: # %entry +; CHECK-NEXT: global.get __stack_pointer +; CHECK-NEXT: i32.const 16 +; CHECK-NEXT: i32.sub +; CHECK-NEXT: local.tee 2 +; CHECK-NEXT: global.set __stack_pointer +; CHECK-NEXT: local.get 2 +; CHECK-NEXT: local.get 1 +; CHECK-NEXT: i64.store 8 +; CHECK-NEXT: local.get 2 +; CHECK-NEXT: local.get 0 +; CHECK-NEXT: i32.store 0 +; CHECK-NEXT: local.get 2 +; CHECK-NEXT: call sink +; CHECK-NEXT: local.get 2 +; CHECK-NEXT: i32.const 16 +; CHECK-NEXT: i32.add +; CHECK-NEXT: global.set __stack_pointer +; CHECK-NEXT: # fallthrough-return +entry: + tail call void (...) @sink(i32 noundef %i, i64 %x.coerce) + ret void +} + +; Function Attrs: nounwind +define void @pass_int_s5(i32 noundef %i, <4 x i32> noundef %x) { +; CHECK-LABEL: pass_int_s5: +; CHECK: .functype pass_int_s5 (i32, i32, i32, i32, i32) -> () +; CHECK-NEXT: .local i32 +; CHECK-NEXT: # %bb.0: # %entry +; CHECK-NEXT: global.get __stack_pointer +; CHECK-NEXT: i32.const 32 +; CHECK-NEXT: i32.sub +; CHECK-NEXT: local.tee 5 +; CHECK-NEXT: global.set __stack_pointer +; CHECK-NEXT: local.get 5 +; CHECK-NEXT: i32.const 28 +; CHECK-NEXT: i32.add +; CHECK-NEXT: local.get 4 +; CHECK-NEXT: i32.store 0 +; CHECK-NEXT: local.get 5 +; CHECK-NEXT: i32.const 24 +; CHECK-NEXT: i32.add +; CHECK-NEXT: local.get 3 +; CHECK-NEXT: i32.store 0 +; CHECK-NEXT: local.get 5 +; CHECK-NEXT: i32.const 20 +; CHECK-NEXT: i32.add +; CHECK-NEXT: local.get 2 +; CHECK-NEXT: i32.store 0 +; CHECK-NEXT: local.get 5 +; CHECK-NEXT: i32.const 16 +; CHECK-NEXT: i32.add +; CHECK-NEXT: local.get 1 +; CHECK-NEXT: i32.store 0 +; CHECK-NEXT: local.get 5 +; CHECK-NEXT: local.get 0 +; CHECK-NEXT: i32.store 0 +; CHECK-NEXT: local.get 5 +; CHECK-NEXT: call sink +; CHECK-NEXT: local.get 5 +; 
CHECK-NEXT: i32.const 32 +; CHECK-NEXT: i32.add +; CHECK-NEXT: global.set __stack_pointer +; CHECK-NEXT: # fallthrough-return +entry: + tail call void (...) @sink(i32 noundef %i, <4 x i32> noundef %x) + ret void +} + +; Function Attrs: nounwind +define void @pass_asc(i8 %x1.coerce, i16 %x2.coerce, i32 %x3.coerce, i64 %x4.coerce, <4 x i32> noundef %x5) { +; CHECK-LABEL: pass_asc: +; CHECK: .functype pass_asc (i32, i32, i32, i64, i32, i32, i32, i32) -> () +; CHECK-NEXT: .local i32 +; CHECK-NEXT: # %bb.0: # %entry +; CHECK-NEXT: global.get __stack_pointer +; CHECK-NEXT: i32.const 48 +; CHECK-NEXT: i32.sub +; CHECK-NEXT: local.tee 8 +; CHECK-NEXT: global.set __stack_pointer +; CHECK-NEXT: local.get 8 +; CHECK-NEXT: i32.const 44 +; CHECK-NEXT: i32.add +; CHECK-NEXT: local.get 7 +; CHECK-NEXT: i32.store 0 +; CHECK-NEXT: local.get 8 +; CHECK-NEXT: i32.const 40 +; CHECK-NEXT: i32.add +; CHECK-NEXT: local.get 6 +; CHECK-NEXT: i32.store 0 +; CHECK-NEXT: local.get 8 +; CHECK-NEXT: i32.const 36 +; CHECK-NEXT: i32.add +; CHECK-NEXT: local.get 5 +; CHECK-NEXT: i32.store 0 +; CHECK-NEXT: local.get 8 +; CHECK-NEXT: i32.const 32 +; CHECK-NEXT: i32.add +; CHECK-NEXT: local.get 4 +; CHECK-NEXT: i32.store 0 +; CHECK-NEXT: local.get 8 +; CHECK-NEXT: i32.const 16 +; CHECK-NEXT: i32.add +; CHECK-NEXT: local.get 3 +; CHECK-NEXT: i64.store 0 +; CHECK-NEXT: local.get 8 +; CHECK-NEXT: local.get 2 +; CHECK-NEXT: i32.store 8 +; CHECK-NEXT: local.get 8 +; CHECK-NEXT: local.get 1 +; CHECK-NEXT: i32.store 4 +; CHECK-NEXT: local.get 8 +; CHECK-NEXT: local.get 0 +; CHECK-NEXT: i32.store 0 +; CHECK-NEXT: local.get 8 +; CHECK-NEXT: call sink +; CHECK-NEXT: local.get 8 +; CHECK-NEXT: i32.const 48 +; CHECK-NEXT: i32.add +; CHECK-NEXT: global.set __stack_pointer +; CHECK-NEXT: # fallthrough-return +entry: + tail call void (...) 
@sink(i8 %x1.coerce, i16 %x2.coerce, i32 %x3.coerce, i64 %x4.coerce, <4 x i32> noundef %x5) + ret void +} + +; Function Attrs: nounwind +define void @pass_dsc(<4 x i32> noundef %x0, i64 %x1.coerce, i32 %x2.coerce, i16 %x3.coerce, i8 %x4.coerce) { +; CHECK-LABEL: pass_dsc: +; CHECK: .functype pass_dsc (i32, i32, i32, i32, i64, i32, i32, i32) -> () +; CHECK-NEXT: .local i32 +; CHECK-NEXT: # %bb.0: # %entry +; CHECK-NEXT: global.get __stack_pointer +; CHECK-NEXT: i32.const 48 +; CHECK-NEXT: i32.sub +; CHECK-NEXT: local.tee 8 +; CHECK-NEXT: global.set __stack_pointer +; CHECK-NEXT: local.get 8 +; CHECK-NEXT: i32.const 32 +; CHECK-NEXT: i32.add +; CHECK-NEXT: local.get 7 +; CHECK-NEXT: i32.store 0 +; CHECK-NEXT: local.get 8 +; CHECK-NEXT: i32.const 28 +; CHECK-NEXT: i32.add +; CHECK-NEXT: local.get 6 +; CHECK-NEXT: i32.store 0 +; CHECK-NEXT: local.get 8 +; CHECK-NEXT: i32.const 24 +; CHECK-NEXT: i32.add +; CHECK-NEXT: local.get 5 +; CHECK-NEXT: i32.store 0 +; CHECK-NEXT: local.get 8 +; CHECK-NEXT: i32.const 16 +; CHECK-NEXT: i32.add +; CHECK-NEXT: local.get 4 +; CHECK-NEXT: i64.store 0 +; CHECK-NEXT: local.get 8 +; CHECK-NEXT: local.get 3 +; CHECK-NEXT: i32.store 12 +; CHECK-NEXT: local.get 8 +; CHECK-NEXT: local.get 2 +; CHECK-NEXT: i32.store 8 +; CHECK-NEXT: local.get 8 +; CHECK-NEXT: local.get 1 +; CHECK-NEXT: i32.store 4 +; CHECK-NEXT: local.get 8 +; CHECK-NEXT: local.get 0 +; CHECK-NEXT: i32.store 0 +; CHECK-NEXT: local.get 8 +; CHECK-NEXT: call sink +; CHECK-NEXT: local.get 8 +; CHECK-NEXT: i32.const 48 +; CHECK-NEXT: i32.add +; CHECK-NEXT: global.set __stack_pointer +; CHECK-NEXT: # fallthrough-return +entry: + tail call void (...) 
@sink(<4 x i32> noundef %x0, i64 %x1.coerce, i32 %x2.coerce, i16 %x3.coerce, i8 %x4.coerce) + ret void +} + +; Function Attrs: nounwind +define void @pass_multiple(i32 noundef %i, i8 %x1.coerce, i16 %x2.coerce, i32 %x3.coerce, i64 %x4.coerce, <4 x i32> noundef %x5) { +; CHECK-LABEL: pass_multiple: +; CHECK: .functype pass_multiple (i32, i32, i32, i32, i64, i32, i32, i32, i32) -> () +; CHECK-NEXT: .local i32 +; CHECK-NEXT: # %bb.0: # %entry +; CHECK-NEXT: global.get __stack_pointer +; CHECK-NEXT: i32.const 48 +; CHECK-NEXT: i32.sub +; CHECK-NEXT: local.tee 9 +; CHECK-NEXT: global.set __stack_pointer +; CHECK-NEXT: local.get 9 +; CHECK-NEXT: local.get 4 +; CHECK-NEXT: i64.store 40 +; CHECK-NEXT: local.get 9 +; CHECK-NEXT: local.get 2 +; CHECK-NEXT: i32.store 36 +; CHECK-NEXT: local.get 9 +; CHECK-NEXT: local.get 0 +; CHECK-NEXT: i32.store 32 +; CHECK-NEXT: local.get 9 +; CHECK-NEXT: i32.const 32 +; CHECK-NEXT: i32.add +; CHECK-NEXT: call sink +; CHECK-NEXT: local.get 9 +; CHECK-NEXT: i32.const 28 +; CHECK-NEXT: i32.add +; CHECK-NEXT: local.get 8 +; CHECK-NEXT: i32.store 0 +; CHECK-NEXT: local.get 9 +; CHECK-NEXT: i32.const 24 +; CHECK-NEXT: i32.add +; CHECK-NEXT: local.get 7 +; CHECK-NEXT: i32.store 0 +; CHECK-NEXT: local.get 9 +; CHECK-NEXT: i32.const 20 +; CHECK-NEXT: i32.add +; CHECK-NEXT: local.get 6 +; CHECK-NEXT: i32.store 0 +; CHECK-NEXT: local.get 9 +; CHECK-NEXT: i32.const 16 +; CHECK-NEXT: i32.add +; CHECK-NEXT: local.get 5 +; CHECK-NEXT: i32.store 0 +; CHECK-NEXT: local.get 9 +; CHECK-NEXT: local.get 3 +; CHECK-NEXT: i32.store 8 +; CHECK-NEXT: local.get 9 +; CHECK-NEXT: local.get 1 +; CHECK-NEXT: i32.store 4 +; CHECK-NEXT: local.get 9 +; CHECK-NEXT: local.get 0 +; CHECK-NEXT: i32.store 0 +; CHECK-NEXT: local.get 9 +; CHECK-NEXT: call sink +; CHECK-NEXT: local.get 9 +; CHECK-NEXT: i32.const 48 +; CHECK-NEXT: i32.add +; CHECK-NEXT: global.set __stack_pointer +; CHECK-NEXT: # fallthrough-return +entry: + tail call void (...) 
@sink(i32 noundef %i, i16 %x2.coerce, i64 %x4.coerce) + tail call void (...) @sink(i32 noundef %i, i8 %x1.coerce, i32 %x3.coerce, <4 x i32> noundef %x5) + ret void +} + diff --git a/llvm/test/CodeGen/X86/avx512fp16-fminimum-fmaximum.ll b/llvm/test/CodeGen/X86/avx512fp16-fminimum-fmaximum.ll index 1782e52..55b86ca 100644 --- a/llvm/test/CodeGen/X86/avx512fp16-fminimum-fmaximum.ll +++ b/llvm/test/CodeGen/X86/avx512fp16-fminimum-fmaximum.ll @@ -28,35 +28,17 @@ define half @test_fminimum(half %x, half %y) { define <8 x half> @test_fminimum_scalarize(<8 x half> %x, <8 x half> %y) "no-nans-fp-math"="true" "no-signed-zeros-fp-math"="true" { ; CHECK-LABEL: test_fminimum_scalarize: ; CHECK: # %bb.0: -; CHECK-NEXT: vpsrldq {{.*#+}} xmm2 = xmm1[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; CHECK-NEXT: vpsrldq {{.*#+}} xmm3 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; CHECK-NEXT: vminsh %xmm2, %xmm3, %xmm2 -; CHECK-NEXT: vshufps {{.*#+}} xmm3 = xmm1[3,3,3,3] -; CHECK-NEXT: vshufps {{.*#+}} xmm4 = xmm0[3,3,3,3] -; CHECK-NEXT: vminsh %xmm3, %xmm4, %xmm3 -; CHECK-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] -; CHECK-NEXT: vpsrldq {{.*#+}} xmm3 = xmm1[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; CHECK-NEXT: vpsrldq {{.*#+}} xmm4 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; CHECK-NEXT: vminsh %xmm3, %xmm4, %xmm3 -; CHECK-NEXT: vshufpd {{.*#+}} xmm4 = xmm1[1,0] -; CHECK-NEXT: vshufpd {{.*#+}} xmm5 = xmm0[1,0] -; CHECK-NEXT: vminsh %xmm4, %xmm5, %xmm4 -; CHECK-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] -; CHECK-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] -; CHECK-NEXT: vpsrlq $48, %xmm1, %xmm3 -; CHECK-NEXT: vpsrlq $48, %xmm0, %xmm4 -; CHECK-NEXT: vminsh %xmm3, %xmm4, %xmm3 -; CHECK-NEXT: vmovshdup {{.*#+}} xmm4 = 
xmm1[1,1,3,3] -; CHECK-NEXT: vmovshdup {{.*#+}} xmm5 = xmm0[1,1,3,3] -; CHECK-NEXT: vminsh %xmm4, %xmm5, %xmm4 -; CHECK-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] -; CHECK-NEXT: vminsh %xmm1, %xmm0, %xmm4 -; CHECK-NEXT: vpsrld $16, %xmm1, %xmm1 -; CHECK-NEXT: vpsrld $16, %xmm0, %xmm0 -; CHECK-NEXT: vminsh %xmm1, %xmm0, %xmm0 -; CHECK-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3] -; CHECK-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] -; CHECK-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] +; CHECK-NEXT: vcmpltph %xmm1, %xmm0, %k1 +; CHECK-NEXT: vpblendmw %xmm0, %xmm1, %xmm2 {%k1} +; CHECK-NEXT: vpbroadcastw {{.*#+}} xmm3 = [32768,32768,32768,32768,32768,32768,32768,32768] +; CHECK-NEXT: vpcmpeqw %xmm3, %xmm0, %k1 +; CHECK-NEXT: vpblendmw %xmm0, %xmm2, %xmm0 {%k1} +; CHECK-NEXT: vpcmpeqw %xmm3, %xmm1, %k1 +; CHECK-NEXT: vmovdqu16 %xmm1, %xmm0 {%k1} +; CHECK-NEXT: vxorps %xmm1, %xmm1, %xmm1 +; CHECK-NEXT: vcmpeqph %xmm1, %xmm2, %k1 +; CHECK-NEXT: vmovdqu16 %xmm0, %xmm2 {%k1} +; CHECK-NEXT: vmovdqa %xmm2, %xmm0 ; CHECK-NEXT: retq %r = call <8 x half> @llvm.minimum.v8f16(<8 x half> %x, <8 x half> %y) ret <8 x half> %r @@ -134,35 +116,16 @@ define half @test_fmaximum(half %x, half %y) { define <8 x half> @test_fmaximum_scalarize(<8 x half> %x, <8 x half> %y) "no-nans-fp-math"="true" "no-signed-zeros-fp-math"="true" { ; CHECK-LABEL: test_fmaximum_scalarize: ; CHECK: # %bb.0: -; CHECK-NEXT: vpsrldq {{.*#+}} xmm2 = xmm1[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; CHECK-NEXT: vpsrldq {{.*#+}} xmm3 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; CHECK-NEXT: vmaxsh %xmm2, %xmm3, %xmm2 -; CHECK-NEXT: vshufps {{.*#+}} xmm3 = xmm1[3,3,3,3] -; CHECK-NEXT: vshufps {{.*#+}} xmm4 = xmm0[3,3,3,3] -; CHECK-NEXT: vmaxsh %xmm3, %xmm4, %xmm3 -; CHECK-NEXT: vpunpcklwd {{.*#+}} xmm2 = 
xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] -; CHECK-NEXT: vpsrldq {{.*#+}} xmm3 = xmm1[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; CHECK-NEXT: vpsrldq {{.*#+}} xmm4 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; CHECK-NEXT: vmaxsh %xmm3, %xmm4, %xmm3 -; CHECK-NEXT: vshufpd {{.*#+}} xmm4 = xmm1[1,0] -; CHECK-NEXT: vshufpd {{.*#+}} xmm5 = xmm0[1,0] -; CHECK-NEXT: vmaxsh %xmm4, %xmm5, %xmm4 -; CHECK-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] -; CHECK-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] -; CHECK-NEXT: vpsrlq $48, %xmm1, %xmm3 -; CHECK-NEXT: vpsrlq $48, %xmm0, %xmm4 -; CHECK-NEXT: vmaxsh %xmm3, %xmm4, %xmm3 -; CHECK-NEXT: vmovshdup {{.*#+}} xmm4 = xmm1[1,1,3,3] -; CHECK-NEXT: vmovshdup {{.*#+}} xmm5 = xmm0[1,1,3,3] -; CHECK-NEXT: vmaxsh %xmm4, %xmm5, %xmm4 -; CHECK-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] -; CHECK-NEXT: vmaxsh %xmm1, %xmm0, %xmm4 -; CHECK-NEXT: vpsrld $16, %xmm1, %xmm1 -; CHECK-NEXT: vpsrld $16, %xmm0, %xmm0 -; CHECK-NEXT: vmaxsh %xmm1, %xmm0, %xmm0 -; CHECK-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3] -; CHECK-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] -; CHECK-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] +; CHECK-NEXT: vcmpltph %xmm0, %xmm1, %k1 +; CHECK-NEXT: vpblendmw %xmm0, %xmm1, %xmm2 {%k1} +; CHECK-NEXT: vptestnmw %xmm0, %xmm0, %k1 +; CHECK-NEXT: vpblendmw %xmm0, %xmm2, %xmm0 {%k1} +; CHECK-NEXT: vptestnmw %xmm1, %xmm1, %k1 +; CHECK-NEXT: vmovdqu16 %xmm1, %xmm0 {%k1} +; CHECK-NEXT: vxorps %xmm1, %xmm1, %xmm1 +; CHECK-NEXT: vcmpeqph %xmm1, %xmm2, %k1 +; CHECK-NEXT: vmovdqu16 %xmm0, %xmm2 {%k1} +; CHECK-NEXT: vmovdqa %xmm2, %xmm0 ; CHECK-NEXT: retq %r = call <8 x half> @llvm.maximum.v8f16(<8 x half> %x, <8 x half> %y) ret <8 x half> %r diff --git 
a/llvm/test/CodeGen/X86/llvm.tan.ll b/llvm/test/CodeGen/X86/llvm.tan.ll new file mode 100644 index 0000000..24b3003 --- /dev/null +++ b/llvm/test/CodeGen/X86/llvm.tan.ll @@ -0,0 +1,70 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2 +; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu | FileCheck %s + +define half @use_tanf16(half %a) nounwind { +; CHECK-LABEL: use_tanf16: +; CHECK: # %bb.0: +; CHECK-NEXT: pushq %rax +; CHECK-NEXT: callq __extendhfsf2@PLT +; CHECK-NEXT: callq tanf@PLT +; CHECK-NEXT: callq __truncsfhf2@PLT +; CHECK-NEXT: popq %rax +; CHECK-NEXT: retq + %x = call half @llvm.tan.f16(half %a) + ret half %x +} + +define float @use_tanf32(float %a) nounwind { +; CHECK-LABEL: use_tanf32: +; CHECK: # %bb.0: +; CHECK-NEXT: jmp tanf@PLT # TAILCALL + %x = call float @llvm.tan.f32(float %a) + ret float %x +} + +define double @use_tanf64(double %a) nounwind { +; CHECK-LABEL: use_tanf64: +; CHECK: # %bb.0: +; CHECK-NEXT: jmp tan@PLT # TAILCALL + %x = call double @llvm.tan.f64(double %a) + ret double %x +} + +define x86_fp80 @use_tanf80(x86_fp80 %a) nounwind { +; CHECK-LABEL: use_tanf80: +; CHECK: # %bb.0: +; CHECK-NEXT: subq $24, %rsp +; CHECK-NEXT: fldt 32(%rsp) +; CHECK-NEXT: fstpt (%rsp) +; CHECK-NEXT: callq tanl@PLT +; CHECK-NEXT: addq $24, %rsp +; CHECK-NEXT: retq + %x = call x86_fp80 @llvm.tan.f80(x86_fp80 %a) + ret x86_fp80 %x +} + +define fp128 @use_tanfp128(fp128 %a) nounwind { +; CHECK-LABEL: use_tanfp128: +; CHECK: # %bb.0: +; CHECK-NEXT: jmp tanf128@PLT # TAILCALL + %x = call fp128 @llvm.tan.f128(fp128 %a) + ret fp128 %x +} + +define ppc_fp128 @use_tanppc_fp128(ppc_fp128 %a) nounwind { +; CHECK-LABEL: use_tanppc_fp128: +; CHECK: # %bb.0: +; CHECK-NEXT: pushq %rax +; CHECK-NEXT: callq tanl@PLT +; CHECK-NEXT: popq %rax +; CHECK-NEXT: retq + %x = call ppc_fp128 @llvm.tan.ppcf128(ppc_fp128 %a) + ret ppc_fp128 %x +} + +declare half @llvm.tan.f16(half) +declare float @llvm.tan.f32(float) +declare double 
@llvm.tan.f64(double) +declare x86_fp80 @llvm.tan.f80(x86_fp80) +declare fp128 @llvm.tan.f128(fp128) +declare ppc_fp128 @llvm.tan.ppcf128(ppc_fp128) diff --git a/llvm/test/CodeGen/X86/vec-libcalls.ll b/llvm/test/CodeGen/X86/vec-libcalls.ll index 3a13154..6857101 100644 --- a/llvm/test/CodeGen/X86/vec-libcalls.ll +++ b/llvm/test/CodeGen/X86/vec-libcalls.ll @@ -17,6 +17,14 @@ declare <5 x float> @llvm.sin.v5f32(<5 x float>) declare <6 x float> @llvm.sin.v6f32(<6 x float>) declare <3 x double> @llvm.sin.v3f64(<3 x double>) +declare <1 x float> @llvm.tan.v1f32(<1 x float>) +declare <2 x float> @llvm.tan.v2f32(<2 x float>) +declare <3 x float> @llvm.tan.v3f32(<3 x float>) +declare <4 x float> @llvm.tan.v4f32(<4 x float>) +declare <5 x float> @llvm.tan.v5f32(<5 x float>) +declare <6 x float> @llvm.tan.v6f32(<6 x float>) +declare <3 x double> @llvm.tan.v3f64(<3 x double>) + ; Verify that all of the potential libcall candidates are handled. ; Some of these have custom lowering, so those cases won't have ; libcalls. 
@@ -230,6 +238,200 @@ define <3 x double> @sin_v3f64(<3 x double> %x) nounwind { ret <3 x double> %r } +define <1 x float> @tan_v1f32(<1 x float> %x) nounwind { +; CHECK-LABEL: tan_v1f32: +; CHECK: # %bb.0: +; CHECK-NEXT: pushq %rax +; CHECK-NEXT: callq tanf@PLT +; CHECK-NEXT: popq %rax +; CHECK-NEXT: retq + %r = call <1 x float> @llvm.tan.v1f32(<1 x float> %x) + ret <1 x float> %r +} + +define <2 x float> @tan_v2f32(<2 x float> %x) nounwind { +; CHECK-LABEL: tan_v2f32: +; CHECK: # %bb.0: +; CHECK-NEXT: subq $40, %rsp +; CHECK-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill +; CHECK-NEXT: callq tanf@PLT +; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovshdup (%rsp), %xmm0 # 16-byte Folded Reload +; CHECK-NEXT: # xmm0 = mem[1,1,3,3] +; CHECK-NEXT: callq tanf@PLT +; CHECK-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3] +; CHECK-NEXT: addq $40, %rsp +; CHECK-NEXT: retq + %r = call <2 x float> @llvm.tan.v2f32(<2 x float> %x) + ret <2 x float> %r +} + +define <3 x float> @tan_v3f32(<3 x float> %x) nounwind { +; CHECK-LABEL: tan_v3f32: +; CHECK: # %bb.0: +; CHECK-NEXT: subq $40, %rsp +; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: callq tanf@PLT +; CHECK-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill +; CHECK-NEXT: vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; CHECK-NEXT: # xmm0 = mem[1,1,3,3] +; CHECK-NEXT: callq tanf@PLT +; CHECK-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload +; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3] +; CHECK-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill +; CHECK-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; CHECK-NEXT: # xmm0 = mem[1,0] +; CHECK-NEXT: callq tanf@PLT +; CHECK-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload +; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1],xmm0[0],xmm1[3] +; CHECK-NEXT: addq $40, %rsp +; 
CHECK-NEXT: retq + %r = call <3 x float> @llvm.tan.v3f32(<3 x float> %x) + ret <3 x float> %r +} + +define <4 x float> @tan_v4f32(<4 x float> %x) nounwind { +; CHECK-LABEL: tan_v4f32: +; CHECK: # %bb.0: +; CHECK-NEXT: subq $40, %rsp +; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: callq tanf@PLT +; CHECK-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill +; CHECK-NEXT: vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; CHECK-NEXT: # xmm0 = mem[1,1,3,3] +; CHECK-NEXT: callq tanf@PLT +; CHECK-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload +; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3] +; CHECK-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill +; CHECK-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; CHECK-NEXT: # xmm0 = mem[1,0] +; CHECK-NEXT: callq tanf@PLT +; CHECK-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload +; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1],xmm0[0],xmm1[3] +; CHECK-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill +; CHECK-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; CHECK-NEXT: # xmm0 = mem[3,3,3,3] +; CHECK-NEXT: callq tanf@PLT +; CHECK-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload +; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0] +; CHECK-NEXT: addq $40, %rsp +; CHECK-NEXT: retq + %r = call <4 x float> @llvm.tan.v4f32(<4 x float> %x) + ret <4 x float> %r +} + +define <5 x float> @tan_v5f32(<5 x float> %x) nounwind { +; CHECK-LABEL: tan_v5f32: +; CHECK: # %bb.0: +; CHECK-NEXT: subq $72, %rsp +; CHECK-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: callq tanf@PLT +; CHECK-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill +; CHECK-NEXT: vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; CHECK-NEXT: # xmm0 = mem[1,1,3,3] +; CHECK-NEXT: callq tanf@PLT +; CHECK-NEXT: vmovaps (%rsp), %xmm1 # 16-byte 
Reload +; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3] +; CHECK-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill +; CHECK-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; CHECK-NEXT: # xmm0 = mem[1,0] +; CHECK-NEXT: callq tanf@PLT +; CHECK-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload +; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1],xmm0[0],xmm1[3] +; CHECK-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill +; CHECK-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; CHECK-NEXT: # xmm0 = mem[3,3,3,3] +; CHECK-NEXT: callq tanf@PLT +; CHECK-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload +; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0] +; CHECK-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill +; CHECK-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm0 +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: callq tanf@PLT +; CHECK-NEXT: vmovups (%rsp), %ymm1 # 32-byte Reload +; CHECK-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; CHECK-NEXT: addq $72, %rsp +; CHECK-NEXT: retq + %r = call <5 x float> @llvm.tan.v5f32(<5 x float> %x) + ret <5 x float> %r +} + +define <6 x float> @tan_v6f32(<6 x float> %x) nounwind { +; CHECK-LABEL: tan_v6f32: +; CHECK: # %bb.0: +; CHECK-NEXT: subq $72, %rsp +; CHECK-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm0 +; CHECK-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: callq tanf@PLT +; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovshdup (%rsp), %xmm0 # 16-byte Folded Reload +; CHECK-NEXT: # xmm0 = mem[1,1,3,3] +; CHECK-NEXT: callq tanf@PLT +; CHECK-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3] +; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), 
%ymm0 # 32-byte Reload +; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: callq tanf@PLT +; CHECK-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill +; CHECK-NEXT: vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; CHECK-NEXT: # xmm0 = mem[1,1,3,3] +; CHECK-NEXT: callq tanf@PLT +; CHECK-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload +; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3] +; CHECK-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill +; CHECK-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; CHECK-NEXT: # xmm0 = mem[1,0] +; CHECK-NEXT: callq tanf@PLT +; CHECK-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload +; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1],xmm0[0],xmm1[3] +; CHECK-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill +; CHECK-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; CHECK-NEXT: # xmm0 = mem[3,3,3,3] +; CHECK-NEXT: callq tanf@PLT +; CHECK-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload +; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0] +; CHECK-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload +; CHECK-NEXT: addq $72, %rsp +; CHECK-NEXT: retq + %r = call <6 x float> @llvm.tan.v6f32(<6 x float> %x) + ret <6 x float> %r +} + +define <3 x double> @tan_v3f64(<3 x double> %x) nounwind { +; CHECK-LABEL: tan_v3f64: +; CHECK: # %bb.0: +; CHECK-NEXT: subq $72, %rsp +; CHECK-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: callq tan@PLT +; CHECK-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill +; CHECK-NEXT: vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; CHECK-NEXT: # xmm0 = mem[1,0] +; CHECK-NEXT: callq tan@PLT +; CHECK-NEXT: vmovapd (%rsp), %xmm1 # 16-byte Reload +; CHECK-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm1[0],xmm0[0] +; CHECK-NEXT: vmovupd %ymm0, (%rsp) 
# 32-byte Spill +; CHECK-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload +; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm0 +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: callq tan@PLT +; CHECK-NEXT: vmovups (%rsp), %ymm1 # 32-byte Reload +; CHECK-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; CHECK-NEXT: addq $72, %rsp +; CHECK-NEXT: retq + %r = call <3 x double> @llvm.tan.v3f64(<3 x double> %x) + ret <3 x double> %r +} + define <2 x float> @fabs_v2f32(<2 x float> %x) nounwind { ; CHECK-LABEL: fabs_v2f32: ; CHECK: # %bb.0: diff --git a/llvm/test/DebugInfo/X86/sdag-order.ll b/llvm/test/DebugInfo/X86/sdag-order.ll new file mode 100644 index 0000000..f959a80 --- /dev/null +++ b/llvm/test/DebugInfo/X86/sdag-order.ll @@ -0,0 +1,46 @@ +; RUN: llc %s --stop-after=finalize-isel -o - | FileCheck %s + +;; Check the DBG_VALUE which is salvaged from the dbg.value using an otherwised +;; unused value is emitted at the correct position in the function. +;; Prior (-) to patching (+), these DBG_VALUEs would sink to the bottom of the +;; function: +;; │ bb.1.if.then: +;; │- $rax = COPY %1 +;; │ DBG_VALUE 0, $noreg, !9, !DIExpression(DW_OP_plus_uconst, 4, DW_OP_stack_value) +;; │+ $rax = COPY %1 +;; │ RET 0, $rax + +; CHECK: bb.1.if.then: +; CHECK-NEXT: DBG_VALUE 0, $noreg, ![[#]], !DIExpression(DW_OP_plus_uconst, 4, DW_OP_stack_value) + +target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +define void @badger(ptr sret(i64) %sret) !dbg !5 { +entry: + %f.i = getelementptr i8, ptr null, i64 4 + br label %if.then + +if.then: ; preds = %entry + tail call void @llvm.dbg.value(metadata ptr %f.i, metadata !9, metadata !DIExpression()), !dbg !11 + ret void +} + +declare void @llvm.dbg.value(metadata, metadata, metadata) + +!llvm.dbg.cu = !{!0} +!llvm.debugify = !{!2, !3} +!llvm.module.flags = !{!4} + +!0 = distinct !DICompileUnit(language: DW_LANG_C, file: !1, producer: "debugify", isOptimized: 
true, runtimeVersion: 0, emissionKind: FullDebug) +!1 = !DIFile(filename: "test.ll", directory: "/") +!2 = !{i32 3} +!3 = !{i32 1} +!4 = !{i32 2, !"Debug Info Version", i32 3} +!5 = distinct !DISubprogram(name: "_ZNK1d1gEv", linkageName: "_ZNK1d1gEv", scope: null, file: !1, line: 1, type: !6, scopeLine: 1, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0, retainedNodes: !8) +!6 = !DISubroutineType(types: !7) +!7 = !{} +!8 = !{!9} +!9 = !DILocalVariable(name: "1", scope: !5, file: !1, line: 1, type: !10) +!10 = !DIBasicType(name: "ty64", size: 64, encoding: DW_ATE_unsigned) +!11 = !DILocation(line: 5, column: 1, scope: !5) diff --git a/llvm/test/DebugInfo/symbolize-gnu-debuglink-no-realpath.test b/llvm/test/DebugInfo/symbolize-gnu-debuglink-no-realpath.test index 5141ff6..9e46570 100644 --- a/llvm/test/DebugInfo/symbolize-gnu-debuglink-no-realpath.test +++ b/llvm/test/DebugInfo/symbolize-gnu-debuglink-no-realpath.test @@ -1,4 +1,3 @@ -# REQUIRES: shell # Ensure that no realpath assumptions are made about .gnu_debuglink paths. 
# Copy inputs to some other location with arbitrary names, with the original diff --git a/llvm/test/Instrumentation/HWAddressSanitizer/stack-safety-analysis.ll b/llvm/test/Instrumentation/HWAddressSanitizer/stack-safety-analysis.ll index dad5f8e..8610645 100644 --- a/llvm/test/Instrumentation/HWAddressSanitizer/stack-safety-analysis.ll +++ b/llvm/test/Instrumentation/HWAddressSanitizer/stack-safety-analysis.ll @@ -1,4 +1,5 @@ -; RUN: opt -mtriple=aarch64-unknown-linux-gnu -passes=hwasan -hwasan-instrument-with-calls -hwasan-use-stack-safety=1 -hwasan-generate-tags-with-calls -S < %s | FileCheck %s --check-prefixes=SAFETY,CHECK +; RUN: opt -pass-remarks-output=%t.pass-remarks -mtriple=aarch64-unknown-linux-gnu -passes=hwasan -hwasan-instrument-with-calls -hwasan-use-stack-safety=1 -hwasan-generate-tags-with-calls -S < %s | FileCheck %s --check-prefixes=SAFETY,CHECK +; RUN: cat %t.pass-remarks | FileCheck %s --check-prefixes=SAFETY-REMARKS ; RUN: opt -mtriple=aarch64-unknown-linux-gnu -passes=hwasan -hwasan-instrument-with-calls -hwasan-use-stack-safety=0 -hwasan-generate-tags-with-calls -S < %s | FileCheck %s --check-prefixes=NOSAFETY,CHECK ; RUN: opt -mtriple=aarch64-unknown-linux-gnu -passes=hwasan -hwasan-instrument-with-calls -hwasan-generate-tags-with-calls -S < %s | FileCheck %s --check-prefixes=SAFETY,CHECK ; RUN: opt -mtriple=aarch64-unknown-linux-gnu -passes=hwasan -hwasan-instrument-stack=0 -hwasan-instrument-with-calls -hwasan-generate-tags-with-calls -S < %s | FileCheck %s --check-prefixes=NOSTACK,CHECK @@ -20,6 +21,7 @@ entry: ; SAFETY-NOT: call {{.*}}__hwasan_store ; NOSTACK-NOT: call {{.*}}__hwasan_generate_tag ; NOSTACK-NOT: call {{.*}}__hwasan_store + ; SAFETY-REMARKS: --- !Passed{{[[:space:]]}}Pass: hwasan{{[[:space:]]}}Name: ignoreAccess{{[[:space:]]}}Function: test_simple %buf.sroa.0 = alloca i8, align 4 call void @llvm.lifetime.start.p0(i64 1, ptr nonnull %buf.sroa.0) store volatile i8 0, ptr %buf.sroa.0, align 4, !tbaa !8 @@ -37,6 +39,7 @@ 
entry: ; SAFETY-NOT: call {{.*}}__hwasan_store ; NOSTACK-NOT: call {{.*}}__hwasan_generate_tag ; NOSTACK-NOT: call {{.*}}__hwasan_store + ; SAFETY-REMARKS: --- !Passed{{[[:space:]]}}Pass: hwasan{{[[:space:]]}}Name: ignoreAccess{{[[:space:]]}}Function: test_cmpxchg %buf.sroa.0 = alloca i8, align 4 call void @llvm.lifetime.start.p0(i64 1, ptr nonnull %buf.sroa.0) %0 = cmpxchg ptr %buf.sroa.0, i8 1, i8 2 monotonic monotonic, align 4 @@ -54,6 +57,7 @@ entry: ; SAFETY-NOT: call {{.*}}__hwasan_store ; NOSTACK-NOT: call {{.*}}__hwasan_generate_tag ; NOSTACK-NOT: call {{.*}}__hwasan_store + ; SAFETY-REMARKS: --- !Passed{{[[:space:]]}}Pass: hwasan{{[[:space:]]}}Name: ignoreAccess{{[[:space:]]}}Function: test_atomicrwm %buf.sroa.0 = alloca i8, align 4 call void @llvm.lifetime.start.p0(i64 1, ptr nonnull %buf.sroa.0) %0 = atomicrmw add ptr %buf.sroa.0, i8 1 monotonic, align 4 @@ -71,6 +75,7 @@ entry: ; SAFETY-NOT: call {{.*}}__hwasan_store ; NOSTACK-NOT: call {{.*}}__hwasan_generate_tag ; NOSTACK-NOT: call {{.*}}__hwasan_store + ; SAFETY-REMARKS: --- !Passed{{[[:space:]]}}Pass: hwasan{{[[:space:]]}}Name: ignoreAccess{{[[:space:]]}}Function: test_use %buf.sroa.0 = alloca i8, align 4 call void @use(ptr nonnull %buf.sroa.0) call void @llvm.lifetime.start.p0(i64 1, ptr nonnull %buf.sroa.0) @@ -89,6 +94,7 @@ entry: ; SAFETY-NOT: call {{.*}}__hwasan_store ; NOSTACK-NOT: call {{.*}}__hwasan_generate_tag ; NOSTACK-NOT: call {{.*}}__hwasan_store + ; SAFETY-REMARKS: --- !Passed{{[[:space:]]}}Pass: hwasan{{[[:space:]]}}Name: ignoreAccess{{[[:space:]]}}Function: test_in_range %buf.sroa.0 = alloca [10 x i8], align 4 call void @llvm.lifetime.start.p0(i64 10, ptr nonnull %buf.sroa.0) store volatile i8 0, ptr %buf.sroa.0, align 4, !tbaa !8 @@ -106,6 +112,7 @@ entry: ; SAFETY-NOT: call {{.*}}__hwasan_store ; NOSTACK-NOT: call {{.*}}__hwasan_generate_tag ; NOSTACK-NOT: call {{.*}}__hwasan_store + ; SAFETY-REMARKS: --- !Passed{{[[:space:]]}}Pass: hwasan{{[[:space:]]}}Name: 
ignoreAccess{{[[:space:]]}}Function: test_in_range2 %buf.sroa.0 = alloca [10 x i8], align 4 %ptr = getelementptr [10 x i8], ptr %buf.sroa.0, i32 0, i32 9 call void @llvm.lifetime.start.p0(i64 10, ptr nonnull %buf.sroa.0) @@ -123,6 +130,7 @@ entry: ; SAFETY-NOT: call {{.*}}__hwasan_memset ; NOSTACK-NOT: call {{.*}}__hwasan_generate_tag ; NOSTACK-NOT: call {{.*}}__hwasan_memset + ; SAFETY-REMARKS: --- !Passed{{[[:space:]]}}Pass: hwasan{{[[:space:]]}}Name: ignoreAccess{{[[:space:]]}}Function: test_in_range3 %buf.sroa.0 = alloca [10 x i8], align 4 %ptr = getelementptr [10 x i8], ptr %buf.sroa.0, i32 0, i32 9 call void @llvm.memset.p0.i32(ptr %ptr, i8 0, i32 1, i1 true) @@ -138,6 +146,7 @@ entry: ; SAFETY-NOT: call {{.*}}__hwasan_memmove ; NOSTACK-NOT: call {{.*}}__hwasan_generate_tag ; NOSTACK-NOT: call {{.*}}__hwasan_memmove + ; SAFETY-REMARKS: --- !Passed{{[[:space:]]}}Pass: hwasan{{[[:space:]]}}Name: ignoreAccess{{[[:space:]]}}Function: test_in_range4 %buf.sroa.0 = alloca [10 x i8], align 4 %ptr = getelementptr [10 x i8], ptr %buf.sroa.0, i32 0, i32 9 call void @llvm.memmove.p0.p0.i32(ptr %ptr, ptr %ptr, i32 1, i1 true) @@ -153,6 +162,7 @@ entry: ; SAFETY-NOT: call {{.*}}__hwasan_memmove ; NOSTACK-NOT: call {{.*}}__hwasan_generate_tag ; NOSTACK-NOT: call {{.*}}__hwasan_memmove + ; SAFETY-REMARKS: --- !Passed{{[[:space:]]}}Pass: hwasan{{[[:space:]]}}Name: ignoreAccess{{[[:space:]]}}Function: test_in_range5 %buf.sroa.0 = alloca [10 x i8], align 4 %ptr = getelementptr [10 x i8], ptr %buf.sroa.0, i32 0, i32 9 %buf.sroa.1 = alloca [10 x i8], align 4 @@ -171,6 +181,7 @@ entry: ; SAFETY: call {{.*}}__hwasan_store ; NOSTACK-NOT: call {{.*}}__hwasan_generate_tag ; NOSTACK-NOT: call {{.*}}__hwasan_store + ; SAFETY-REMARKS: --- !Missed{{[[:space:]]}}Pass: hwasan{{[[:space:]]}}Name: ignoreAccess{{[[:space:]]}}Function: test_out_of_range %buf.sroa.0 = alloca [10 x i8], align 4 %ptr = getelementptr [10 x i8], ptr %buf.sroa.0, i32 0, i32 10 call void @llvm.lifetime.start.p0(i64 
10, ptr nonnull %buf.sroa.0) @@ -188,6 +199,7 @@ entry: ; SAFETY: call {{.*}}__hwasan_store ; NOSTACK-NOT: call {{.*}}__hwasan_generate_tag ; NOSTACK-NOT: call {{.*}}__hwasan_store + ; SAFETY-REMARKS: --- !Missed{{[[:space:]]}}Pass: hwasan{{[[:space:]]}}Name: ignoreAccess{{[[:space:]]}}Function: test_out_of_range2 %buf.sroa.0 = alloca [10 x i8], align 4 %ptr = getelementptr [10 x i8], ptr %buf.sroa.0, i32 0, i32 10 call void @llvm.lifetime.start.p0(i64 10, ptr nonnull %buf.sroa.0) @@ -205,6 +217,7 @@ entry: ; SAFETY: call {{.*}}__hwasan_memset ; NOSTACK-NOT: call {{.*}}__hwasan_generate_tag ; NOSTACK-NOT: call {{.*}}__hwasan_memset + ; SAFETY-REMARKS: --- !Missed{{[[:space:]]}}Pass: hwasan{{[[:space:]]}}Name: ignoreAccess{{[[:space:]]}}Function: test_out_of_range3 %buf.sroa.0 = alloca [10 x i8], align 4 %ptr = getelementptr [10 x i8], ptr %buf.sroa.0, i32 0, i32 9 call void @llvm.memset.p0.i32(ptr %ptr, i8 0, i32 2, i1 true) @@ -220,6 +233,7 @@ entry: ; SAFETY: call {{.*}}__hwasan_memmove ; NOSTACK-NOT: call {{.*}}__hwasan_generate_tag ; NOSTACK-NOT: call {{.*}}__hwasan_memmove + ; SAFETY-REMARKS: --- !Missed{{[[:space:]]}}Pass: hwasan{{[[:space:]]}}Name: ignoreAccess{{[[:space:]]}}Function: test_out_of_range4 %buf.sroa.0 = alloca [10 x i8], align 4 %ptr = getelementptr [10 x i8], ptr %buf.sroa.0, i32 0, i32 9 call void @llvm.memmove.p0.p0.i32(ptr %ptr, ptr %ptr, i32 2, i1 true) @@ -235,6 +249,7 @@ entry: ; SAFETY: call {{.*}}__hwasan_memmove ; NOSTACK-NOT: call {{.*}}__hwasan_generate_tag ; NOSTACK-NOT: call {{.*}}__hwasan_memmove + ; SAFETY-REMARKS: --- !Missed{{[[:space:]]}}Pass: hwasan{{[[:space:]]}}Name: ignoreAccess{{[[:space:]]}}Function: test_out_of_range5 %buf.sroa.0 = alloca [10 x i8], align 4 %ptr = getelementptr [10 x i8], ptr %buf.sroa.0, i32 0, i32 9 %buf.sroa.1 = alloca [10 x i8], align 4 @@ -256,6 +271,7 @@ entry: ; SAFETY: call {{.*}}__hwasan_store ; NOSTACK-NOT: call {{.*}}__hwasan_generate_tag ; NOSTACK-NOT: call {{.*}}__hwasan_store + ; 
SAFETY-REMARKS: --- !Missed{{[[:space:]]}}Pass: hwasan{{[[:space:]]}}Name: ignoreAccess{{[[:space:]]}}Function: test_out_of_range6 %buf.sroa.0 = alloca [10 x i8], align 4 %ptr = getelementptr [10 x i8], ptr %buf.sroa.0, i32 0, i32 10 call void @llvm.lifetime.start.p0(i64 10, ptr nonnull %buf.sroa.0) @@ -275,6 +291,7 @@ entry: ; SAFETY: call {{.*}}__hwasan_store ; NOSTACK-NOT: call {{.*}}__hwasan_generate_tag ; NOSTACK-NOT: call {{.*}}__hwasan_store + ; SAFETY-REMARKS: --- !Missed{{[[:space:]]}}Pass: hwasan{{[[:space:]]}}Name: ignoreAccess{{[[:space:]]}}Function: test_potentially_out_of_range %buf.sroa.0 = alloca [10 x i8], align 4 %off = call i32 @getoffset() %ptr = getelementptr [10 x i8], ptr %buf.sroa.0, i32 0, i32 %off @@ -293,6 +310,7 @@ entry: ; SAFETY: call {{.*}}__hwasan_memmove ; NOSTACK-NOT: call {{.*}}__hwasan_generate_tag ; NOSTACK: call {{.*}}__hwasan_memmove + ; SAFETY-REMARKS: --- !Missed{{[[:space:]]}}Pass: hwasan{{[[:space:]]}}Name: ignoreAccess{{[[:space:]]}}Function: test_potentially_out_of_range2 %buf.sroa.0 = alloca [10 x i8], align 4 %ptr = getelementptr [10 x i8], ptr %buf.sroa.0, i32 0, i32 9 call void @llvm.memmove.p0.p0.i32(ptr %ptr, ptr %a, i32 1, i1 true) @@ -309,6 +327,7 @@ entry: ; SAFETY: call {{.*}}__hwasan_store ; NOSTACK-NOT: call {{.*}}__hwasan_generate_tag ; NOSTACK: call {{.*}}__hwasan_store + ; SAFETY-REMARKS: --- !Missed{{[[:space:]]}}Pass: hwasan{{[[:space:]]}}Name: ignoreAccess{{[[:space:]]}}Function: test_unclear %buf.sroa.0 = alloca i8, align 4 %ptr = call ptr @getptr(ptr %buf.sroa.0) call void @llvm.lifetime.start.p0(i64 10, ptr nonnull %ptr) @@ -326,6 +345,7 @@ entry: ; SAFETY: call {{.*}}__hwasan_store ; NOSTACK-NOT: call {{.*}}__hwasan_generate_tag ; NOSTACK: call {{.*}}__hwasan_store + ; SAFETY-REMARKS: --- !Missed{{[[:space:]]}}Pass: hwasan{{[[:space:]]}}Name: ignoreAccess{{[[:space:]]}}Function: test_select %x = call ptr @getptr(ptr %a) %buf.sroa.0 = alloca i8, align 4 call void @llvm.lifetime.start.p0(i64 1, ptr 
nonnull %buf.sroa.0) @@ -346,6 +366,7 @@ entry: ; SAFETY-NOT: call {{.*}}__hwasan_store ; NOSTACK-NOT: call {{.*}}__hwasan_generate_tag ; NOSTACK-NOT: call {{.*}}__hwasan_store + ; SAFETY-REMARKS: --- !Passed{{[[:space:]]}}Pass: hwasan{{[[:space:]]}}Name: ignoreAccess{{[[:space:]]}}Function: test_retptr %buf.sroa.0 = alloca i8, align 4 call void @llvm.lifetime.start.p0(i64 1, ptr nonnull %buf.sroa.0) %ptr = call ptr @retptr(ptr %buf.sroa.0) diff --git a/llvm/test/MC/AMDGPU/gfx1150_asm_features.s b/llvm/test/MC/AMDGPU/gfx1150_asm_features.s index 58b7847..d3f82a2 100644 --- a/llvm/test/MC/AMDGPU/gfx1150_asm_features.s +++ b/llvm/test/MC/AMDGPU/gfx1150_asm_features.s @@ -1,5 +1,6 @@ // RUN: llvm-mc -triple=amdgcn -show-encoding -mcpu=gfx1150 %s | FileCheck --check-prefix=GFX1150 %s // RUN: llvm-mc -triple=amdgcn -show-encoding -mcpu=gfx1151 %s | FileCheck --check-prefix=GFX1150 %s +// RUN: llvm-mc -triple=amdgcn -show-encoding -mcpu=gfx1152 %s | FileCheck --check-prefix=GFX1150 %s // // Subtargets allow src1 of VOP3 DPP instructions to be SGPR or inlinable diff --git a/llvm/test/MC/RISCV/relocations.s b/llvm/test/MC/RISCV/relocations.s index 3cad3d4..f5f6417 100644 --- a/llvm/test/MC/RISCV/relocations.s +++ b/llvm/test/MC/RISCV/relocations.s @@ -185,7 +185,7 @@ auipc a0, %tlsdesc_hi(a_symbol) lw a1, %tlsdesc_load_lo(.L5)(a0) # RELOC: R_RISCV_TLSDESC_LOAD_LO12 -# INSTR: a1, %tlsdesc_load_lo(.L5)(a0) +# INSTR: lw a1, %tlsdesc_load_lo(.L5)(a0) # FIXUP: fixup A - offset: 0, value: %tlsdesc_load_lo(.L5), kind: fixup_riscv_tlsdesc_load_lo12 addi a0, a0, %tlsdesc_add_lo(.L5) diff --git a/llvm/test/MC/WebAssembly/reloc-pic64.s b/llvm/test/MC/WebAssembly/reloc-pic64.s index 0f2ebba..4c5ec4f 100644 --- a/llvm/test/MC/WebAssembly/reloc-pic64.s +++ b/llvm/test/MC/WebAssembly/reloc-pic64.s @@ -93,6 +93,7 @@ hidden_func: # CHECK-NEXT: Index: 0 # CHECK-NEXT: ElemType: FUNCREF # CHECK-NEXT: Limits: +# CHECK-NEXT: Flags: [ IS_64 ] # CHECK-NEXT: Minimum: 0x1 # CHECK-NEXT: - Module: 
GOT.mem # CHECK-NEXT: Field: default_data @@ -109,7 +110,7 @@ hidden_func: # CHECK-NEXT: - Type: ELEM # CHECK-NEXT: Segments: # CHECK-NEXT: Offset: -# CHECK-NEXT: Opcode: I32_CONST +# CHECK-NEXT: Opcode: I64_CONST # CHECK-NEXT: Value: 1 # CHECK-NEXT: Functions: [ 5 ] # CHECK-NEXT: - Type: DATACOUNT diff --git a/llvm/test/Object/AMDGPU/elf-header-flags-mach.yaml b/llvm/test/Object/AMDGPU/elf-header-flags-mach.yaml index 7f36795..7512edd 100644 --- a/llvm/test/Object/AMDGPU/elf-header-flags-mach.yaml +++ b/llvm/test/Object/AMDGPU/elf-header-flags-mach.yaml @@ -230,6 +230,10 @@ # RUN: llvm-readobj -S --file-headers %t.o.AMDGCN_GFX1151 | FileCheck --check-prefixes=ELF-AMDGCN-ALL,ELF-AMDGCN-GFX1151 %s # RUN: obj2yaml %t.o.AMDGCN_GFX1151 | FileCheck --check-prefixes=YAML-AMDGCN-ALL,YAML-AMDGCN-GFX1151 %s +# RUN: sed -e 's/<BITS>/64/' -e 's/<MACH>/AMDGCN_GFX1152/' %s | yaml2obj -o %t.o.AMDGCN_GFX1152 +# RUN: llvm-readobj -S --file-headers %t.o.AMDGCN_GFX1152 | FileCheck --check-prefixes=ELF-AMDGCN-ALL,ELF-AMDGCN-GFX1152 %s +# RUN: obj2yaml %t.o.AMDGCN_GFX1152 | FileCheck --check-prefixes=YAML-AMDGCN-ALL,YAML-AMDGCN-GFX1152 %s + # RUN: sed -e 's/<BITS>/64/' -e 's/<MACH>/AMDGCN_GFX1200/' %s | yaml2obj -o %t.o.AMDGCN_GFX1200 # RUN: llvm-readobj -S --file-headers %t.o.AMDGCN_GFX1200 | FileCheck --check-prefixes=ELF-AMDGCN-ALL,ELF-AMDGCN-GFX1200 %s # RUN: obj2yaml %t.o.AMDGCN_GFX1200 | FileCheck --check-prefixes=YAML-AMDGCN-ALL,YAML-AMDGCN-GFX1200 %s @@ -450,6 +454,9 @@ # ELF-AMDGCN-GFX1151: EF_AMDGPU_MACH_AMDGCN_GFX1151 (0x4A) # YAML-AMDGCN-GFX1151: Flags: [ EF_AMDGPU_MACH_AMDGCN_GFX1151 ] +# ELF-AMDGCN-GFX1152: EF_AMDGPU_MACH_AMDGCN_GFX1152 (0x55) +# YAML-AMDGCN-GFX1152: Flags: [ EF_AMDGPU_MACH_AMDGCN_GFX1152 ] + # ELF-AMDGCN-GFX1200: EF_AMDGPU_MACH_AMDGCN_GFX1200 (0x48) # YAML-AMDGCN-GFX1200: Flags: [ EF_AMDGPU_MACH_AMDGCN_GFX1200 ] diff --git a/llvm/test/Other/can-execute.txt b/llvm/test/Other/can-execute.txt index 46791cb..37626e7 100644 --- 
a/llvm/test/Other/can-execute.txt +++ b/llvm/test/Other/can-execute.txt @@ -1,5 +1,4 @@ REQUIRES: can-execute -REQUIRES: shell This tests that we abstract two peculiarities of unix in can_execute: diff --git a/llvm/test/Other/lit-unicode.txt b/llvm/test/Other/lit-unicode.txt index 2f400014..b375fc50 100644 --- a/llvm/test/Other/lit-unicode.txt +++ b/llvm/test/Other/lit-unicode.txt @@ -1,5 +1,4 @@ FIXME: See if we can fix this in lit by using Unicode strings. -REQUIRES: shell RUN: echo "ようこそ" | FileCheck %s CHECK: {{^}}ようこそ{{$}} diff --git a/llvm/test/Transforms/ConstraintElimination/induction-condition-in-loop-exit.ll b/llvm/test/Transforms/ConstraintElimination/induction-condition-in-loop-exit.ll new file mode 100644 index 0000000..2f0b51c --- /dev/null +++ b/llvm/test/Transforms/ConstraintElimination/induction-condition-in-loop-exit.ll @@ -0,0 +1,443 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 +; RUN: opt -p constraint-elimination -S %s | FileCheck %s + +declare void @llvm.assume(i1) + +define i1 @multi_exiting_loop_eq_same_unique_exit_const_compare_known(ptr %s) { +; CHECK-LABEL: define i1 @multi_exiting_loop_eq_same_unique_exit_const_compare_known( +; CHECK-SAME: ptr [[S:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: br label %[[LOOP_HEADER:.*]] +; CHECK: [[LOOP_HEADER]]: +; CHECK-NEXT: [[IV:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LOOP_LATCH:.*]] ] +; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i32 [[IV]], 1234 +; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT:.*]], label %[[LOOP_LATCH]] +; CHECK: [[LOOP_LATCH]]: +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[S]], i32 [[IV]] +; CHECK-NEXT: [[TMP0:%.*]] = load i8, ptr [[ARRAYIDX]], align 1 +; CHECK-NEXT: [[LATCH_C:%.*]] = icmp ult i8 [[TMP0]], 10 +; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i32 [[IV]], 1 +; CHECK-NEXT: br i1 [[LATCH_C]], label %[[LOOP_HEADER]], label %[[EXIT]] +; CHECK: [[EXIT]]: +; CHECK-NEXT: 
[[T:%.*]] = icmp ult i32 [[IV]], 1235 +; CHECK-NEXT: ret i1 [[T]] +; +entry: + br label %loop.header + +loop.header: + %iv = phi i32 [ 0, %entry ], [ %iv.next, %loop.latch ] + %exitcond.not = icmp eq i32 %iv, 1234 + br i1 %exitcond.not, label %exit, label %loop.latch + +loop.latch: + %arrayidx = getelementptr inbounds i8, ptr %s, i32 %iv + %0 = load i8, ptr %arrayidx, align 1 + %latch.c = icmp ult i8 %0, 10 + %iv.next = add nuw nsw i32 %iv, 1 + br i1 %latch.c, label %loop.header, label %exit + +exit: + %t = icmp ult i32 %iv, 1235 + ret i1 %t +} + +define i1 @multi_exiting_loop_eq_same_unique_exit_const_compare_not_known_due_to_start_value(ptr %s) { +; CHECK-LABEL: define i1 @multi_exiting_loop_eq_same_unique_exit_const_compare_not_known_due_to_start_value( +; CHECK-SAME: ptr [[S:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: br label %[[LOOP_HEADER:.*]] +; CHECK: [[LOOP_HEADER]]: +; CHECK-NEXT: [[IV:%.*]] = phi i32 [ 1235, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LOOP_LATCH:.*]] ] +; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i32 [[IV]], 1234 +; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT:.*]], label %[[LOOP_LATCH]] +; CHECK: [[LOOP_LATCH]]: +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[S]], i32 [[IV]] +; CHECK-NEXT: [[TMP0:%.*]] = load i8, ptr [[ARRAYIDX]], align 1 +; CHECK-NEXT: [[LATCH_C:%.*]] = icmp ult i8 [[TMP0]], 10 +; CHECK-NEXT: [[IV_NEXT]] = add i32 [[IV]], 1 +; CHECK-NEXT: br i1 [[LATCH_C]], label %[[LOOP_HEADER]], label %[[EXIT]] +; CHECK: [[EXIT]]: +; CHECK-NEXT: [[T:%.*]] = icmp ult i32 [[IV]], 1235 +; CHECK-NEXT: ret i1 [[T]] +; +entry: + br label %loop.header + +loop.header: + %iv = phi i32 [ 1235, %entry ], [ %iv.next, %loop.latch ] + %exitcond.not = icmp eq i32 %iv, 1234 + br i1 %exitcond.not, label %exit, label %loop.latch + +loop.latch: + %arrayidx = getelementptr inbounds i8, ptr %s, i32 %iv + %0 = load i8, ptr %arrayidx, align 1 + %latch.c = icmp ult i8 %0, 10 + %iv.next = add i32 %iv, 1 + br i1 %latch.c, label 
%loop.header, label %exit + +exit: + %t = icmp ult i32 %iv, 1235 + ret i1 %t +} + +define i1 @multi_exiting_loop_eq_same_unique_exit_const_compare_known_due_to_precond_on_start_value(ptr %s, i32 %start) { +; CHECK-LABEL: define i1 @multi_exiting_loop_eq_same_unique_exit_const_compare_known_due_to_precond_on_start_value( +; CHECK-SAME: ptr [[S:%.*]], i32 [[START:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: [[PRE_C:%.*]] = icmp ule i32 [[START]], 1234 +; CHECK-NEXT: call void @llvm.assume(i1 [[PRE_C]]) +; CHECK-NEXT: br label %[[LOOP_HEADER:.*]] +; CHECK: [[LOOP_HEADER]]: +; CHECK-NEXT: [[IV:%.*]] = phi i32 [ [[START]], %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LOOP_LATCH:.*]] ] +; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i32 [[IV]], 1234 +; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT:.*]], label %[[LOOP_LATCH]] +; CHECK: [[LOOP_LATCH]]: +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[S]], i32 [[IV]] +; CHECK-NEXT: [[TMP0:%.*]] = load i8, ptr [[ARRAYIDX]], align 1 +; CHECK-NEXT: [[LATCH_C:%.*]] = icmp ult i8 [[TMP0]], 10 +; CHECK-NEXT: [[IV_NEXT]] = add i32 [[IV]], 1 +; CHECK-NEXT: br i1 [[LATCH_C]], label %[[LOOP_HEADER]], label %[[EXIT]] +; CHECK: [[EXIT]]: +; CHECK-NEXT: [[T:%.*]] = icmp ult i32 [[IV]], 1235 +; CHECK-NEXT: ret i1 [[T]] +; +entry: + %pre.c = icmp ule i32 %start, 1234 + call void @llvm.assume(i1 %pre.c) + br label %loop.header + +loop.header: + %iv = phi i32 [ %start, %entry ], [ %iv.next, %loop.latch ] + %exitcond.not = icmp eq i32 %iv, 1234 + br i1 %exitcond.not, label %exit, label %loop.latch + +loop.latch: + %arrayidx = getelementptr inbounds i8, ptr %s, i32 %iv + %0 = load i8, ptr %arrayidx, align 1 + %latch.c = icmp ult i8 %0, 10 + %iv.next = add i32 %iv, 1 + br i1 %latch.c, label %loop.header, label %exit + +exit: + %t = icmp ult i32 %iv, 1235 + ret i1 %t +} + +define i1 @multi_exiting_loop_eq_same_unique_exit_const_compare_not_known_due_to_precond_on_start_value(ptr %s, i32 %start) { +; CHECK-LABEL: define i1 
@multi_exiting_loop_eq_same_unique_exit_const_compare_not_known_due_to_precond_on_start_value( +; CHECK-SAME: ptr [[S:%.*]], i32 [[START:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: [[PRE_C:%.*]] = icmp ule i32 [[START]], 1236 +; CHECK-NEXT: call void @llvm.assume(i1 [[PRE_C]]) +; CHECK-NEXT: br label %[[LOOP_HEADER:.*]] +; CHECK: [[LOOP_HEADER]]: +; CHECK-NEXT: [[IV:%.*]] = phi i32 [ [[START]], %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LOOP_LATCH:.*]] ] +; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i32 [[IV]], 1234 +; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT:.*]], label %[[LOOP_LATCH]] +; CHECK: [[LOOP_LATCH]]: +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[S]], i32 [[IV]] +; CHECK-NEXT: [[TMP0:%.*]] = load i8, ptr [[ARRAYIDX]], align 1 +; CHECK-NEXT: [[LATCH_C:%.*]] = icmp ult i8 [[TMP0]], 10 +; CHECK-NEXT: [[IV_NEXT]] = add i32 [[IV]], 1 +; CHECK-NEXT: br i1 [[LATCH_C]], label %[[LOOP_HEADER]], label %[[EXIT]] +; CHECK: [[EXIT]]: +; CHECK-NEXT: [[T:%.*]] = icmp ult i32 [[IV]], 1236 +; CHECK-NEXT: ret i1 [[T]] +; +entry: + %pre.c = icmp ule i32 %start, 1236 + call void @llvm.assume(i1 %pre.c) + br label %loop.header + +loop.header: + %iv = phi i32 [ %start, %entry ], [ %iv.next, %loop.latch ] + %exitcond.not = icmp eq i32 %iv, 1234 + br i1 %exitcond.not, label %exit, label %loop.latch + +loop.latch: + %arrayidx = getelementptr inbounds i8, ptr %s, i32 %iv + %0 = load i8, ptr %arrayidx, align 1 + %latch.c = icmp ult i8 %0, 10 + %iv.next = add i32 %iv, 1 + br i1 %latch.c, label %loop.header, label %exit + +exit: + %t = icmp ult i32 %iv, 1236 + ret i1 %t +} + +define i1 @multi_exiting_loop_eq_same_unique_exit_const_compare_not_known_due_to_missing_precond(ptr %s, i32 %start) { +; CHECK-LABEL: define i1 @multi_exiting_loop_eq_same_unique_exit_const_compare_not_known_due_to_missing_precond( +; CHECK-SAME: ptr [[S:%.*]], i32 [[START:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: br label %[[LOOP_HEADER:.*]] +; CHECK: 
[[LOOP_HEADER]]: +; CHECK-NEXT: [[IV:%.*]] = phi i32 [ [[START]], %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LOOP_LATCH:.*]] ] +; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i32 [[IV]], 1234 +; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT:.*]], label %[[LOOP_LATCH]] +; CHECK: [[LOOP_LATCH]]: +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[S]], i32 [[IV]] +; CHECK-NEXT: [[TMP0:%.*]] = load i8, ptr [[ARRAYIDX]], align 1 +; CHECK-NEXT: [[LATCH_C:%.*]] = icmp ult i8 [[TMP0]], 10 +; CHECK-NEXT: [[IV_NEXT]] = add i32 [[IV]], 1 +; CHECK-NEXT: br i1 [[LATCH_C]], label %[[LOOP_HEADER]], label %[[EXIT]] +; CHECK: [[EXIT]]: +; CHECK-NEXT: [[T:%.*]] = icmp ult i32 [[IV]], 1236 +; CHECK-NEXT: ret i1 [[T]] +; +entry: + br label %loop.header + +loop.header: + %iv = phi i32 [ %start, %entry ], [ %iv.next, %loop.latch ] + %exitcond.not = icmp eq i32 %iv, 1234 + br i1 %exitcond.not, label %exit, label %loop.latch + +loop.latch: + %arrayidx = getelementptr inbounds i8, ptr %s, i32 %iv + %0 = load i8, ptr %arrayidx, align 1 + %latch.c = icmp ult i8 %0, 10 + %iv.next = add i32 %iv, 1 + br i1 %latch.c, label %loop.header, label %exit + +exit: + %t = icmp ult i32 %iv, 1236 + ret i1 %t +} + +define i1 @multi_exiting_loop_eq_same_exit_with_out_loop_preds_const_compare_not_known(ptr %s, i1 %pre.c, i32 %x) { +; CHECK-LABEL: define i1 @multi_exiting_loop_eq_same_exit_with_out_loop_preds_const_compare_not_known( +; CHECK-SAME: ptr [[S:%.*]], i1 [[PRE_C:%.*]], i32 [[X:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: br i1 [[PRE_C]], label %[[LOOP_HEADER:.*]], label %[[EXIT:.*]] +; CHECK: [[LOOP_HEADER]]: +; CHECK-NEXT: [[IV:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LOOP_LATCH:.*]] ] +; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i32 [[IV]], 1234 +; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT]], label %[[LOOP_LATCH]] +; CHECK: [[LOOP_LATCH]]: +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[S]], i32 [[IV]] +; CHECK-NEXT: [[TMP0:%.*]] = 
load i8, ptr [[ARRAYIDX]], align 1 +; CHECK-NEXT: [[LATCH_C:%.*]] = icmp ult i8 [[TMP0]], 10 +; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i32 [[IV]], 1 +; CHECK-NEXT: br i1 [[LATCH_C]], label %[[LOOP_HEADER]], label %[[EXIT]] +; CHECK: [[EXIT]]: +; CHECK-NEXT: [[P:%.*]] = phi i32 [ [[X]], %[[ENTRY]] ], [ [[IV]], %[[LOOP_HEADER]] ], [ [[IV]], %[[LOOP_LATCH]] ] +; CHECK-NEXT: [[U:%.*]] = icmp ult i32 [[P]], 1235 +; CHECK-NEXT: ret i1 [[U]] +; +entry: + br i1 %pre.c, label %loop.header, label %exit + +loop.header: + %iv = phi i32 [ 0, %entry ], [ %iv.next, %loop.latch ] + %exitcond.not = icmp eq i32 %iv, 1234 + br i1 %exitcond.not, label %exit, label %loop.latch + +loop.latch: + %arrayidx = getelementptr inbounds i8, ptr %s, i32 %iv + %0 = load i8, ptr %arrayidx, align 1 + %latch.c = icmp ult i8 %0, 10 + %iv.next = add nuw nsw i32 %iv, 1 + br i1 %latch.c, label %loop.header, label %exit + +exit: + %p = phi i32 [ %x, %entry ], [ %iv, %loop.header ], [ %iv, %loop.latch ] + %u = icmp ult i32 %p, 1235 + ret i1 %u +} + +define i1 @multi_exiting_loop_eq_same_unique_exit_successors_swapped(ptr %s) { +; CHECK-LABEL: define i1 @multi_exiting_loop_eq_same_unique_exit_successors_swapped( +; CHECK-SAME: ptr [[S:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: br label %[[LOOP_HEADER:.*]] +; CHECK: [[LOOP_HEADER]]: +; CHECK-NEXT: [[IV:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LOOP_LATCH:.*]] ] +; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i32 [[IV]], 1234 +; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label %[[LOOP_LATCH]], label %[[EXIT:.*]] +; CHECK: [[LOOP_LATCH]]: +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[S]], i32 [[IV]] +; CHECK-NEXT: [[TMP0:%.*]] = load i8, ptr [[ARRAYIDX]], align 1 +; CHECK-NEXT: [[LATCH_C:%.*]] = icmp ult i8 [[TMP0]], 10 +; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i32 [[IV]], 1 +; CHECK-NEXT: br i1 [[LATCH_C]], label %[[LOOP_HEADER]], label %[[EXIT]] +; CHECK: [[EXIT]]: +; CHECK-NEXT: [[U:%.*]] = icmp ult i32 [[IV]], 1235 +; 
CHECK-NEXT: ret i1 [[U]] +; +entry: + br label %loop.header + +loop.header: + %iv = phi i32 [ 0, %entry ], [ %iv.next, %loop.latch ] + %exitcond.not = icmp eq i32 %iv, 1234 + br i1 %exitcond.not, label %loop.latch, label %exit + +loop.latch: + %arrayidx = getelementptr inbounds i8, ptr %s, i32 %iv + %0 = load i8, ptr %arrayidx, align 1 + %latch.c = icmp ult i8 %0, 10 + %iv.next = add nuw nsw i32 %iv, 1 + br i1 %latch.c, label %loop.header, label %exit + +exit: + %u = icmp ult i32 %iv, 1235 + ret i1 %u +} + +define i1 @multi_exiting_loop_eq_same_unique_exit_const_compare_not_known(ptr %s) { +; CHECK-LABEL: define i1 @multi_exiting_loop_eq_same_unique_exit_const_compare_not_known( +; CHECK-SAME: ptr [[S:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: br label %[[LOOP_HEADER:.*]] +; CHECK: [[LOOP_HEADER]]: +; CHECK-NEXT: [[IV:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LOOP_LATCH:.*]] ] +; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i32 [[IV]], 1234 +; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT:.*]], label %[[LOOP_LATCH]] +; CHECK: [[LOOP_LATCH]]: +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[S]], i32 [[IV]] +; CHECK-NEXT: [[TMP0:%.*]] = load i8, ptr [[ARRAYIDX]], align 1 +; CHECK-NEXT: [[LATCH_C:%.*]] = icmp ult i8 [[TMP0]], 10 +; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i32 [[IV]], 1 +; CHECK-NEXT: br i1 [[LATCH_C]], label %[[LOOP_HEADER]], label %[[EXIT]] +; CHECK: [[EXIT]]: +; CHECK-NEXT: [[U:%.*]] = icmp ult i32 [[IV]], 1234 +; CHECK-NEXT: ret i1 [[U]] +; +entry: + br label %loop.header + +loop.header: + %iv = phi i32 [ 0, %entry ], [ %iv.next, %loop.latch ] + %exitcond.not = icmp eq i32 %iv, 1234 + br i1 %exitcond.not, label %exit, label %loop.latch + +loop.latch: + %arrayidx = getelementptr inbounds i8, ptr %s, i32 %iv + %0 = load i8, ptr %arrayidx, align 1 + %latch.c = icmp ult i8 %0, 10 + %iv.next = add nuw nsw i32 %iv, 1 + br i1 %latch.c, label %loop.header, label %exit + +exit: + %u = icmp ult i32 %iv, 1234 + ret i1 
%u +} + +define i1 @multi_exiting_loop_eq_same_unique_exit_var_compare_known(ptr %s, i32 %N) { +; CHECK-LABEL: define i1 @multi_exiting_loop_eq_same_unique_exit_var_compare_known( +; CHECK-SAME: ptr [[S:%.*]], i32 [[N:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: br label %[[LOOP_HEADER:.*]] +; CHECK: [[LOOP_HEADER]]: +; CHECK-NEXT: [[IV:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LOOP_LATCH:.*]] ] +; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i32 [[IV]], [[N]] +; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT:.*]], label %[[LOOP_LATCH]] +; CHECK: [[LOOP_LATCH]]: +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[S]], i32 [[IV]] +; CHECK-NEXT: [[TMP0:%.*]] = load i8, ptr [[ARRAYIDX]], align 1 +; CHECK-NEXT: [[LATCH_C:%.*]] = icmp ult i8 [[TMP0]], 10 +; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i32 [[IV]], 1 +; CHECK-NEXT: br i1 [[LATCH_C]], label %[[LOOP_HEADER]], label %[[EXIT]] +; CHECK: [[EXIT]]: +; CHECK-NEXT: [[T:%.*]] = icmp ule i32 [[IV]], [[N]] +; CHECK-NEXT: ret i1 [[T]] +; +entry: + br label %loop.header + +loop.header: + %iv = phi i32 [ 0, %entry ], [ %iv.next, %loop.latch ] + %exitcond.not = icmp eq i32 %iv, %N + br i1 %exitcond.not, label %exit, label %loop.latch + +loop.latch: + %arrayidx = getelementptr inbounds i8, ptr %s, i32 %iv + %0 = load i8, ptr %arrayidx, align 1 + %latch.c = icmp ult i8 %0, 10 + %iv.next = add nuw nsw i32 %iv, 1 + br i1 %latch.c, label %loop.header, label %exit + +exit: + %t = icmp ule i32 %iv, %N + ret i1 %t +} + +define i1 @multi_exiting_loop_ne_same_unique_exit_const_compare_known(ptr %s) { +; CHECK-LABEL: define i1 @multi_exiting_loop_ne_same_unique_exit_const_compare_known( +; CHECK-SAME: ptr [[S:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: br label %[[LOOP_HEADER:.*]] +; CHECK: [[LOOP_HEADER]]: +; CHECK-NEXT: [[IV:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LOOP_LATCH:.*]] ] +; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp ne i32 [[IV]], 1234 +; CHECK-NEXT: br i1 
[[EXITCOND_NOT]], label %[[LOOP_LATCH]], label %[[EXIT:.*]] +; CHECK: [[LOOP_LATCH]]: +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[S]], i32 [[IV]] +; CHECK-NEXT: [[TMP0:%.*]] = load i8, ptr [[ARRAYIDX]], align 1 +; CHECK-NEXT: [[LATCH_C:%.*]] = icmp ult i8 [[TMP0]], 10 +; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i32 [[IV]], 1 +; CHECK-NEXT: br i1 [[LATCH_C]], label %[[LOOP_HEADER]], label %[[EXIT]] +; CHECK: [[EXIT]]: +; CHECK-NEXT: [[T:%.*]] = icmp ult i32 [[IV]], 1235 +; CHECK-NEXT: ret i1 [[T]] +; +entry: + br label %loop.header + +loop.header: + %iv = phi i32 [ 0, %entry ], [ %iv.next, %loop.latch ] + %exitcond.not = icmp ne i32 %iv, 1234 + br i1 %exitcond.not, label %loop.latch, label %exit + +loop.latch: + %arrayidx = getelementptr inbounds i8, ptr %s, i32 %iv + %0 = load i8, ptr %arrayidx, align 1 + %latch.c = icmp ult i8 %0, 10 + %iv.next = add nuw nsw i32 %iv, 1 + br i1 %latch.c, label %loop.header, label %exit + +exit: + %t = icmp ult i32 %iv, 1235 + ret i1 %t +} + +define i1 @multi_exiting_loop_ne_same_unique_exit_successors_swapped(ptr %s) { +; CHECK-LABEL: define i1 @multi_exiting_loop_ne_same_unique_exit_successors_swapped( +; CHECK-SAME: ptr [[S:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: br label %[[LOOP_HEADER:.*]] +; CHECK: [[LOOP_HEADER]]: +; CHECK-NEXT: [[IV:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LOOP_LATCH:.*]] ] +; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp ne i32 [[IV]], 1234 +; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label %[[EXIT:.*]], label %[[LOOP_LATCH]] +; CHECK: [[LOOP_LATCH]]: +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[S]], i32 [[IV]] +; CHECK-NEXT: [[TMP0:%.*]] = load i8, ptr [[ARRAYIDX]], align 1 +; CHECK-NEXT: [[LATCH_C:%.*]] = icmp ult i8 [[TMP0]], 10 +; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i32 [[IV]], 1 +; CHECK-NEXT: br i1 [[LATCH_C]], label %[[LOOP_HEADER]], label %[[EXIT]] +; CHECK: [[EXIT]]: +; CHECK-NEXT: [[U:%.*]] = icmp ult i32 [[IV]], 1235 +; CHECK-NEXT: ret 
i1 [[U]] +; +entry: + br label %loop.header + +loop.header: + %iv = phi i32 [ 0, %entry ], [ %iv.next, %loop.latch ] + %exitcond.not = icmp ne i32 %iv, 1234 + br i1 %exitcond.not, label %exit, label %loop.latch + +loop.latch: + %arrayidx = getelementptr inbounds i8, ptr %s, i32 %iv + %0 = load i8, ptr %arrayidx, align 1 + %latch.c = icmp ult i8 %0, 10 + %iv.next = add nuw nsw i32 %iv, 1 + br i1 %latch.c, label %loop.header, label %exit + +exit: + %u = icmp ult i32 %iv, 1235 + ret i1 %u +} diff --git a/llvm/test/Transforms/ExpandVariadics/expand-va-intrinsic-split-linkage.ll b/llvm/test/Transforms/ExpandVariadics/expand-va-intrinsic-split-linkage.ll new file mode 100644 index 0000000..f7e21cd --- /dev/null +++ b/llvm/test/Transforms/ExpandVariadics/expand-va-intrinsic-split-linkage.ll @@ -0,0 +1,232 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: -p --function-signature +; RUN: opt -mtriple=wasm32-unknown-unknown -S --passes=expand-variadics --expand-variadics-override=optimize < %s | FileCheck %s --check-prefixes=OPT +; RUN: opt -mtriple=wasm32-unknown-unknown -S --passes=expand-variadics --expand-variadics-override=lowering < %s | FileCheck %s --check-prefixes=ABI +; REQUIRES: webassembly-registered-target + +; Split variadic functions into two functions: +; - one equivalent to the original, same symbol etc +; - one implementing the contents of the original but taking a valist +; IR here is applicable to any target that uses a ptr for valist +; +; Defines a function with each linkage (in the order of the llvm documentation). +; If split applies it does the same transform to each. +; Whether split applies depends on whether the ABI is being changed or not - e.g. a weak +; function is not normally useful to split as the contents cannot be called from elsewhere. +; If the ABI is being rewritten then the function is still converted. Call sites tested elsewhere. 
+ +; Update test checks doesn't emit checks for declares + +declare void @sink_valist(ptr) +declare void @llvm.va_start(ptr) +declare void @llvm.va_end(ptr) + +declare void @decl_simple(...) +define void @defn_simple(...) { +; OPT-LABEL: define {{[^@]+}}@defn_simple(...) { +; OPT-NEXT: entry: +; OPT-NEXT: %va_start = alloca ptr, align 4 +; OPT-NEXT: call void @llvm.lifetime.start.p0(i64 4, ptr %va_start) +; OPT-NEXT: call void @llvm.va_start.p0(ptr %va_start) +; OPT-NEXT: %0 = load ptr, ptr %va_start, align 4 +; OPT-NEXT: call void @defn_simple.valist(ptr %0) +; OPT-NEXT: call void @llvm.lifetime.end.p0(i64 4, ptr %va_start) +; OPT-NEXT: ret void +; +; ABI-LABEL: define {{[^@]+}}@defn_simple(ptr %varargs) { +; ABI-NEXT: %va = alloca ptr, align 4 +; ABI-NEXT: store ptr %varargs, ptr %va, align 4 +; ABI-NEXT: call void @sink_valist(ptr %va) +; ABI-NEXT: ret void +; + %va = alloca ptr, align 4 + call void @llvm.va_start(ptr %va) + call void @sink_valist(ptr %va) + call void @llvm.va_end(ptr %va) + ret void +} + +; no declare for private +define private void @defn_private_simple(...) { +; OPT-LABEL: define {{[^@]+}}@defn_private_simple(...) { +; OPT-NEXT: entry: +; OPT-NEXT: %va_start = alloca ptr, align 4 +; OPT-NEXT: call void @llvm.lifetime.start.p0(i64 4, ptr %va_start) +; OPT-NEXT: call void @llvm.va_start.p0(ptr %va_start) +; OPT-NEXT: %0 = load ptr, ptr %va_start, align 4 +; OPT-NEXT: call void @defn_private_simple.valist(ptr %0) +; OPT-NEXT: call void @llvm.lifetime.end.p0(i64 4, ptr %va_start) +; OPT-NEXT: ret void +; +; ABI-LABEL: define {{[^@]+}}@defn_private_simple(ptr %varargs) { +; ABI-NEXT: %va = alloca ptr, align 4 +; ABI-NEXT: store ptr %varargs, ptr %va, align 4 +; ABI-NEXT: call void @sink_valist(ptr %va) +; ABI-NEXT: ret void +; + %va = alloca ptr, align 4 + call void @llvm.va_start(ptr %va) + call void @sink_valist(ptr %va) + call void @llvm.va_end(ptr %va) + ret void +} + +; no declare for internal +define internal void @defn_internal_simple(...) 
{ +; OPT-LABEL: define {{[^@]+}}@defn_internal_simple(...) { +; OPT-NEXT: entry: +; OPT-NEXT: %va_start = alloca ptr, align 4 +; OPT-NEXT: call void @llvm.lifetime.start.p0(i64 4, ptr %va_start) +; OPT-NEXT: call void @llvm.va_start.p0(ptr %va_start) +; OPT-NEXT: %0 = load ptr, ptr %va_start, align 4 +; OPT-NEXT: call void @defn_internal_simple.valist(ptr %0) +; OPT-NEXT: call void @llvm.lifetime.end.p0(i64 4, ptr %va_start) +; OPT-NEXT: ret void +; +; ABI-LABEL: define {{[^@]+}}@defn_internal_simple(ptr %varargs) { +; ABI-NEXT: %va = alloca ptr, align 4 +; ABI-NEXT: store ptr %varargs, ptr %va, align 4 +; ABI-NEXT: call void @sink_valist(ptr %va) +; ABI-NEXT: ret void +; + %va = alloca ptr, align 4 + call void @llvm.va_start(ptr %va) + call void @sink_valist(ptr %va) + call void @llvm.va_end(ptr %va) + ret void +} + +; no declare for available_externally +define available_externally void @available_externally_simple(...) { +; OPT-LABEL: define {{[^@]+}}@available_externally_simple(...) { +; OPT-NEXT: %va = alloca ptr, align 4 +; OPT-NEXT: call void @llvm.va_start.p0(ptr %va) +; OPT-NEXT: call void @sink_valist(ptr %va) +; OPT-NEXT: ret void +; +; ABI-LABEL: define {{[^@]+}}@available_externally_simple(ptr %varargs) { +; ABI-NEXT: %va = alloca ptr, align 4 +; ABI-NEXT: store ptr %varargs, ptr %va, align 4 +; ABI-NEXT: call void @sink_valist(ptr %va) +; ABI-NEXT: ret void +; + %va = alloca ptr, align 4 + call void @llvm.va_start(ptr %va) + call void @sink_valist(ptr %va) + call void @llvm.va_end(ptr %va) + ret void +} + +; no declare for linkonce +define linkonce void @defn_linkonce_simple(...) { +; OPT-LABEL: define {{[^@]+}}@defn_linkonce_simple(...) 
{ +; OPT-NEXT: %va = alloca ptr, align 4 +; OPT-NEXT: call void @llvm.va_start.p0(ptr %va) +; OPT-NEXT: call void @sink_valist(ptr %va) +; OPT-NEXT: ret void +; +; ABI-LABEL: define {{[^@]+}}@defn_linkonce_simple(ptr %varargs) { +; ABI-NEXT: %va = alloca ptr, align 4 +; ABI-NEXT: store ptr %varargs, ptr %va, align 4 +; ABI-NEXT: call void @sink_valist(ptr %va) +; ABI-NEXT: ret void +; + %va = alloca ptr, align 4 + call void @llvm.va_start(ptr %va) + call void @sink_valist(ptr %va) + call void @llvm.va_end(ptr %va) + ret void +} + +; no declare for weak +define weak void @defn_weak_simple(...) { +; OPT-LABEL: define {{[^@]+}}@defn_weak_simple(...) { +; OPT-NEXT: %va = alloca ptr, align 4 +; OPT-NEXT: call void @llvm.va_start.p0(ptr %va) +; OPT-NEXT: call void @sink_valist(ptr %va) +; OPT-NEXT: ret void +; +; ABI-LABEL: define {{[^@]+}}@defn_weak_simple(ptr %varargs) { +; ABI-NEXT: %va = alloca ptr, align 4 +; ABI-NEXT: store ptr %varargs, ptr %va, align 4 +; ABI-NEXT: call void @sink_valist(ptr %va) +; ABI-NEXT: ret void +; + %va = alloca ptr, align 4 + call void @llvm.va_start(ptr %va) + call void @sink_valist(ptr %va) + call void @llvm.va_end(ptr %va) + ret void +} + +; common is not applicable to functions +; appending is not applicable to functions + +declare extern_weak void @decl_extern_weak_simple(...) +; no define for extern_weak + +; no declare for linkonce_odr +define linkonce_odr void @defn_linkonce_odr_simple(...) { +; OPT-LABEL: define {{[^@]+}}@defn_linkonce_odr_simple(...) 
{ +; OPT-NEXT: %va = alloca ptr, align 4 +; OPT-NEXT: call void @llvm.va_start.p0(ptr %va) +; OPT-NEXT: call void @sink_valist(ptr %va) +; OPT-NEXT: ret void +; +; ABI-LABEL: define {{[^@]+}}@defn_linkonce_odr_simple(ptr %varargs) { +; ABI-NEXT: %va = alloca ptr, align 4 +; ABI-NEXT: store ptr %varargs, ptr %va, align 4 +; ABI-NEXT: call void @sink_valist(ptr %va) +; ABI-NEXT: ret void +; + %va = alloca ptr, align 4 + call void @llvm.va_start(ptr %va) + call void @sink_valist(ptr %va) + call void @llvm.va_end(ptr %va) + ret void +} + +; no declare for weak_odr +define weak_odr void @defn_weak_odr_simple(...) { +; OPT-LABEL: define {{[^@]+}}@defn_weak_odr_simple(...) { +; OPT-NEXT: %va = alloca ptr, align 4 +; OPT-NEXT: call void @llvm.va_start.p0(ptr %va) +; OPT-NEXT: call void @sink_valist(ptr %va) +; OPT-NEXT: ret void +; +; ABI-LABEL: define {{[^@]+}}@defn_weak_odr_simple(ptr %varargs) { +; ABI-NEXT: %va = alloca ptr, align 4 +; ABI-NEXT: store ptr %varargs, ptr %va, align 4 +; ABI-NEXT: call void @sink_valist(ptr %va) +; ABI-NEXT: ret void +; + %va = alloca ptr, align 4 + call void @llvm.va_start(ptr %va) + call void @sink_valist(ptr %va) + call void @llvm.va_end(ptr %va) + ret void +} + +declare external void @decl_external_simple(...) +define external void @defn_external_simple(...) { +; OPT-LABEL: define {{[^@]+}}@defn_external_simple(...) 
{ +; OPT-NEXT: entry: +; OPT-NEXT: %va_start = alloca ptr, align 4 +; OPT-NEXT: call void @llvm.lifetime.start.p0(i64 4, ptr %va_start) +; OPT-NEXT: call void @llvm.va_start.p0(ptr %va_start) +; OPT-NEXT: %0 = load ptr, ptr %va_start, align 4 +; OPT-NEXT: call void @defn_external_simple.valist(ptr %0) +; OPT-NEXT: call void @llvm.lifetime.end.p0(i64 4, ptr %va_start) +; OPT-NEXT: ret void +; +; ABI-LABEL: define {{[^@]+}}@defn_external_simple(ptr %varargs) { +; ABI-NEXT: %va = alloca ptr, align 4 +; ABI-NEXT: store ptr %varargs, ptr %va, align 4 +; ABI-NEXT: call void @sink_valist(ptr %va) +; ABI-NEXT: ret void +; + %va = alloca ptr, align 4 + call void @llvm.va_start(ptr %va) + call void @sink_valist(ptr %va) + call void @llvm.va_end(ptr %va) + ret void +} diff --git a/llvm/test/Transforms/ExpandVariadics/expand-va-intrinsic-split-simple.ll b/llvm/test/Transforms/ExpandVariadics/expand-va-intrinsic-split-simple.ll new file mode 100644 index 0000000..9a86540 --- /dev/null +++ b/llvm/test/Transforms/ExpandVariadics/expand-va-intrinsic-split-simple.ll @@ -0,0 +1,214 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: -p --function-signature +; RUN: opt -mtriple=wasm32-unknown-unknown -S --passes=expand-variadics --expand-variadics-override=optimize < %s | FileCheck %s --check-prefixes=OPT +; RUN: opt -mtriple=wasm32-unknown-unknown -S --passes=expand-variadics --expand-variadics-override=lowering < %s | FileCheck %s --check-prefixes=ABI +; REQUIRES: webassembly-registered-target + +; Examples are variadic functions that return the first or the second of an int and a double +; Split the functions into an internal equivalent that takes a va_list and a ABI preserving wrapper + +define i32 @variadic_int_double_get_firstz(...) { +; OPT-LABEL: define {{[^@]+}}@variadic_int_double_get_firstz(...) 
{ +; OPT-NEXT: entry: +; OPT-NEXT: %va_start = alloca ptr, align 4 +; OPT-NEXT: call void @llvm.lifetime.start.p0(i64 4, ptr %va_start) +; OPT-NEXT: call void @llvm.va_start.p0(ptr %va_start) +; OPT-NEXT: %0 = load ptr, ptr %va_start, align 4 +; OPT-NEXT: %1 = call i32 @variadic_int_double_get_firstz.valist(ptr %0) +; OPT-NEXT: call void @llvm.lifetime.end.p0(i64 4, ptr %va_start) +; OPT-NEXT: ret i32 %1 +; +; ABI-LABEL: define {{[^@]+}}@variadic_int_double_get_firstz(ptr %varargs) { +; ABI-NEXT: entry: +; ABI-NEXT: %va = alloca ptr, align 4 +; ABI-NEXT: store ptr %varargs, ptr %va, align 4 +; ABI-NEXT: %argp.cur = load ptr, ptr %va, align 4 +; ABI-NEXT: %argp.next = getelementptr inbounds i8, ptr %argp.cur, i32 4 +; ABI-NEXT: store ptr %argp.next, ptr %va, align 4 +; ABI-NEXT: %0 = load i32, ptr %argp.cur, align 4 +; ABI-NEXT: ret i32 %0 +; +entry: + %va = alloca ptr, align 4 + call void @llvm.va_start.p0(ptr nonnull %va) + %argp.cur = load ptr, ptr %va, align 4 + %argp.next = getelementptr inbounds i8, ptr %argp.cur, i32 4 + store ptr %argp.next, ptr %va, align 4 + %0 = load i32, ptr %argp.cur, align 4 + call void @llvm.va_end.p0(ptr %va) + ret i32 %0 +} + +; CHECK-LABEL: define i32 @variadic_int_double_get_firstz(...) 
{ +; CHECK-NEXT: entry: +; CHECK-NEXT: %va_list = alloca ptr, align 4 +; CHECK-NEXT: call void @llvm.va_start.p0(ptr %va_list) +; CHECK-NEXT: %0 = tail call i32 @variadic_int_double_get_firstz.valist(ptr %va_list) +; CHECK-NEXT: ret i32 %0 +; CHECK-NEXT: } + +; CHECK-LABEL: define internal i32 @variadic_int_double_get_firstz.valist(ptr noalias %varargs) { +; CHECK-NEXT: entry: +; CHECK-NEXT: %va = alloca ptr, align 4 +; CHECK-NEXT: store ptr %varargs, ptr %va, align 4 +; CHECK-NEXT: %argp.cur = load ptr, ptr %va, align 4 +; CHECK-NEXT: %argp.next = getelementptr inbounds i8, ptr %argp.cur, i32 4 +; CHECK-NEXT: store ptr %argp.next, ptr %va, align 4 +; CHECK-NEXT: %0 = load i32, ptr %argp.cur, align 4 +; CHECK-NEXT: ret i32 %0 +; CHECK-NEXT: } + +define double @variadic_int_double_get_secondz(...) { +; OPT-LABEL: define {{[^@]+}}@variadic_int_double_get_secondz(...) { +; OPT-NEXT: entry: +; OPT-NEXT: %va_start = alloca ptr, align 4 +; OPT-NEXT: call void @llvm.lifetime.start.p0(i64 4, ptr %va_start) +; OPT-NEXT: call void @llvm.va_start.p0(ptr %va_start) +; OPT-NEXT: %0 = load ptr, ptr %va_start, align 4 +; OPT-NEXT: %1 = call double @variadic_int_double_get_secondz.valist(ptr %0) +; OPT-NEXT: call void @llvm.lifetime.end.p0(i64 4, ptr %va_start) +; OPT-NEXT: ret double %1 +; +; ABI-LABEL: define {{[^@]+}}@variadic_int_double_get_secondz(ptr %varargs) { +; ABI-NEXT: entry: +; ABI-NEXT: %va = alloca ptr, align 4 +; ABI-NEXT: store ptr %varargs, ptr %va, align 4 +; ABI-NEXT: %argp.cur = load ptr, ptr %va, align 4 +; ABI-NEXT: %argp.next = getelementptr inbounds i8, ptr %argp.cur, i32 4 +; ABI-NEXT: %argp.next2 = getelementptr inbounds i8, ptr %argp.cur, i32 12 +; ABI-NEXT: store ptr %argp.next2, ptr %va, align 4 +; ABI-NEXT: %0 = load double, ptr %argp.next, align 4 +; ABI-NEXT: ret double %0 +; +entry: + %va = alloca ptr, align 4 + call void @llvm.va_start.p0(ptr nonnull %va) + %argp.cur = load ptr, ptr %va, align 4 + %argp.next = getelementptr inbounds i8, ptr 
%argp.cur, i32 4 + %argp.next2 = getelementptr inbounds i8, ptr %argp.cur, i32 12 + store ptr %argp.next2, ptr %va, align 4 + %0 = load double, ptr %argp.next, align 4 + call void @llvm.va_end.p0(ptr %va) + ret double %0 +} + +; CHECK-LABEL: define double @variadic_int_double_get_secondz(...) { +; CHECK-NEXT: entry: +; CHECK-NEXT: %va_list = alloca ptr, align 4 +; CHECK-NEXT: call void @llvm.va_start.p0(ptr %va_list) +; CHECK-NEXT: %0 = tail call double @variadic_int_double_get_secondz.valist(ptr %va_list) +; CHECK-NEXT: ret double %0 +; CHECK-NEXT: } + +; CHECK-LABEL: define internal double @variadic_int_double_get_secondz.valist(ptr noalias %varargs) { +; CHECK-NEXT: entry: +; CHECK-NEXT: %va = alloca ptr, align 4 +; CHECK-NEXT: store ptr %varargs, ptr %va, align 4 +; CHECK-NEXT: %argp.cur = load ptr, ptr %va, align 4 +; CHECK-NEXT: %argp.next = getelementptr inbounds i8, ptr %argp.cur, i32 4 +; CHECK-NEXT: %argp.next2 = getelementptr inbounds i8, ptr %argp.cur, i32 12 +; CHECK-NEXT: store ptr %argp.next2, ptr %va, align 4 +; CHECK-NEXT: %0 = load double, ptr %argp.next, align 4 +; CHECK-NEXT: ret double %0 +; CHECK-NEXT: } + + +; CHECK-LABEL: @variadic_can_get_firstIidEEbT_T0_(i32 %x, double %y) { +; CHECK-NEXT: entry: +; CHECK-NEXT: %vararg_buffer = alloca %variadic_can_get_firstIidEEbT_T0_.vararg, align 16 +; CHECK-NEXT: call void @llvm.lifetime.start.p0(i64 12, ptr %vararg_buffer) +; CHECK-NEXT: %0 = getelementptr inbounds %variadic_can_get_firstIidEEbT_T0_.vararg, ptr %vararg_buffer, i32 0, i32 0 +; CHECK-NEXT: store i32 %x, ptr %0, align 4 +; CHECK-NEXT: %1 = getelementptr inbounds %variadic_can_get_firstIidEEbT_T0_.vararg, ptr %vararg_buffer, i32 0, i32 1 +; CHECK-NEXT: store double %y, ptr %1, align 4 +; CHECK-NEXT: %call = call i32 @variadic_int_double_get_firstz.valist(ptr %vararg_buffer) +; CHECK-NEXT: call void @llvm.lifetime.end.p0(i64 12, ptr %vararg_buffer) +; CHECK-NEXT: %cmp.i = icmp eq i32 %call, %x +; CHECK-NEXT: ret i1 %cmp.i +; CHECK-NEXT: } 
+ +define zeroext i1 @variadic_can_get_firstIidEEbT_T0_(i32 %x, double %y) { +; OPT-LABEL: define {{[^@]+}}@variadic_can_get_firstIidEEbT_T0_(i32 %x, double %y) { +; OPT-NEXT: entry: +; OPT-NEXT: %vararg_buffer = alloca %variadic_can_get_firstIidEEbT_T0_.vararg, align 16 +; OPT-NEXT: call void @llvm.lifetime.start.p0(i64 16, ptr %vararg_buffer) +; OPT-NEXT: %0 = getelementptr inbounds %variadic_can_get_firstIidEEbT_T0_.vararg, ptr %vararg_buffer, i32 0, i32 0 +; OPT-NEXT: store i32 %x, ptr %0, align 4 +; OPT-NEXT: %1 = getelementptr inbounds %variadic_can_get_firstIidEEbT_T0_.vararg, ptr %vararg_buffer, i32 0, i32 2 +; OPT-NEXT: store double %y, ptr %1, align 8 +; OPT-NEXT: %call = call i32 @variadic_int_double_get_firstz.valist(ptr %vararg_buffer) +; OPT-NEXT: call void @llvm.lifetime.end.p0(i64 16, ptr %vararg_buffer) +; OPT-NEXT: %cmp.i = icmp eq i32 %call, %x +; OPT-NEXT: ret i1 %cmp.i +; +; ABI-LABEL: define {{[^@]+}}@variadic_can_get_firstIidEEbT_T0_(i32 %x, double %y) { +; ABI-NEXT: entry: +; ABI-NEXT: %vararg_buffer = alloca %variadic_can_get_firstIidEEbT_T0_.vararg, align 16 +; ABI-NEXT: call void @llvm.lifetime.start.p0(i64 16, ptr %vararg_buffer) +; ABI-NEXT: %0 = getelementptr inbounds %variadic_can_get_firstIidEEbT_T0_.vararg, ptr %vararg_buffer, i32 0, i32 0 +; ABI-NEXT: store i32 %x, ptr %0, align 4 +; ABI-NEXT: %1 = getelementptr inbounds %variadic_can_get_firstIidEEbT_T0_.vararg, ptr %vararg_buffer, i32 0, i32 2 +; ABI-NEXT: store double %y, ptr %1, align 8 +; ABI-NEXT: %call = call i32 @variadic_int_double_get_firstz(ptr %vararg_buffer) +; ABI-NEXT: call void @llvm.lifetime.end.p0(i64 16, ptr %vararg_buffer) +; ABI-NEXT: %cmp.i = icmp eq i32 %call, %x +; ABI-NEXT: ret i1 %cmp.i +; +entry: + %call = call i32 (...) 
@variadic_int_double_get_firstz(i32 %x, double %y) + %cmp.i = icmp eq i32 %call, %x + ret i1 %cmp.i +} + +; CHECK-LABEL: @variadic_can_get_secondIidEEbT_T0_(i32 %x, double %y) { +; CHECK-NEXT: entry: +; CHECK-NEXT: %vararg_buffer = alloca %variadic_can_get_secondIidEEbT_T0_.vararg, align 16 +; CHECK-NEXT: call void @llvm.lifetime.start.p0(i64 12, ptr %vararg_buffer) +; CHECK-NEXT: %0 = getelementptr inbounds %variadic_can_get_secondIidEEbT_T0_.vararg, ptr %vararg_buffer, i32 0, i32 0 +; CHECK-NEXT: store i32 %x, ptr %0, align 4 +; CHECK-NEXT: %1 = getelementptr inbounds %variadic_can_get_secondIidEEbT_T0_.vararg, ptr %vararg_buffer, i32 0, i32 1 +; CHECK-NEXT: store double %y, ptr %1, align 4 +; CHECK-NEXT: %call = call double @variadic_int_double_get_secondz.valist(ptr %vararg_buffer) +; CHECK-NEXT: call void @llvm.lifetime.end.p0(i64 12, ptr %vararg_buffer) +; CHECK-NEXT: %cmp.i = fcmp oeq double %call, %y +; CHECK-NEXT: ret i1 %cmp.i +; CHECK-NEXT: } + +define zeroext i1 @variadic_can_get_secondIidEEbT_T0_(i32 %x, double %y) { +; OPT-LABEL: define {{[^@]+}}@variadic_can_get_secondIidEEbT_T0_(i32 %x, double %y) { +; OPT-NEXT: entry: +; OPT-NEXT: %vararg_buffer = alloca %variadic_can_get_secondIidEEbT_T0_.vararg, align 16 +; OPT-NEXT: call void @llvm.lifetime.start.p0(i64 16, ptr %vararg_buffer) +; OPT-NEXT: %0 = getelementptr inbounds %variadic_can_get_secondIidEEbT_T0_.vararg, ptr %vararg_buffer, i32 0, i32 0 +; OPT-NEXT: store i32 %x, ptr %0, align 4 +; OPT-NEXT: %1 = getelementptr inbounds %variadic_can_get_secondIidEEbT_T0_.vararg, ptr %vararg_buffer, i32 0, i32 2 +; OPT-NEXT: store double %y, ptr %1, align 8 +; OPT-NEXT: %call = call double @variadic_int_double_get_secondz.valist(ptr %vararg_buffer) +; OPT-NEXT: call void @llvm.lifetime.end.p0(i64 16, ptr %vararg_buffer) +; OPT-NEXT: %cmp.i = fcmp oeq double %call, %y +; OPT-NEXT: ret i1 %cmp.i +; +; ABI-LABEL: define {{[^@]+}}@variadic_can_get_secondIidEEbT_T0_(i32 %x, double %y) { +; ABI-NEXT: entry: +; 
ABI-NEXT: %vararg_buffer = alloca %variadic_can_get_secondIidEEbT_T0_.vararg, align 16 +; ABI-NEXT: call void @llvm.lifetime.start.p0(i64 16, ptr %vararg_buffer) +; ABI-NEXT: %0 = getelementptr inbounds %variadic_can_get_secondIidEEbT_T0_.vararg, ptr %vararg_buffer, i32 0, i32 0 +; ABI-NEXT: store i32 %x, ptr %0, align 4 +; ABI-NEXT: %1 = getelementptr inbounds %variadic_can_get_secondIidEEbT_T0_.vararg, ptr %vararg_buffer, i32 0, i32 2 +; ABI-NEXT: store double %y, ptr %1, align 8 +; ABI-NEXT: %call = call double @variadic_int_double_get_secondz(ptr %vararg_buffer) +; ABI-NEXT: call void @llvm.lifetime.end.p0(i64 16, ptr %vararg_buffer) +; ABI-NEXT: %cmp.i = fcmp oeq double %call, %y +; ABI-NEXT: ret i1 %cmp.i +; +entry: + %call = call double (...) @variadic_int_double_get_secondz(i32 %x, double %y) + %cmp.i = fcmp oeq double %call, %y + ret i1 %cmp.i +} + +; Declaration unchanged +; CHECK: declare void @variadic_without_callers(...) +declare void @variadic_without_callers(...) + +declare void @llvm.va_start.p0(ptr) +declare void @llvm.va_end.p0(ptr) diff --git a/llvm/test/Transforms/ExpandVariadics/indirect-calls.ll b/llvm/test/Transforms/ExpandVariadics/indirect-calls.ll new file mode 100644 index 0000000..de04c72 --- /dev/null +++ b/llvm/test/Transforms/ExpandVariadics/indirect-calls.ll @@ -0,0 +1,59 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt -mtriple=wasm32-unknown-unknown -S --passes=expand-variadics --expand-variadics-override=optimize < %s | FileCheck %s -check-prefixes=OPT +; RUN: opt -mtriple=wasm32-unknown-unknown -S --passes=expand-variadics --expand-variadics-override=lowering < %s | FileCheck %s -check-prefixes=ABI +; REQUIRES: webassembly-registered-target + +declare void @vararg(...) 
+@vararg_ptr = hidden global ptr @vararg, align 4 + +%struct.libcS = type { i8, i16, i32, i32, float, double } + +define hidden void @fptr_single_i32(i32 noundef %x) { +; OPT-LABEL: @fptr_single_i32( +; OPT-NEXT: entry: +; OPT-NEXT: [[TMP0:%.*]] = load volatile ptr, ptr @vararg_ptr, align 4 +; OPT-NEXT: tail call void (...) [[TMP0]](i32 noundef [[X:%.*]]) +; OPT-NEXT: ret void +; +; ABI-LABEL: @fptr_single_i32( +; ABI-NEXT: entry: +; ABI-NEXT: [[VARARG_BUFFER:%.*]] = alloca [[FPTR_SINGLE_I32_VARARG:%.*]], align 16 +; ABI-NEXT: [[TMP0:%.*]] = load volatile ptr, ptr @vararg_ptr, align 4 +; ABI-NEXT: call void @llvm.lifetime.start.p0(i64 4, ptr [[VARARG_BUFFER]]) +; ABI-NEXT: [[TMP1:%.*]] = getelementptr inbounds [[FPTR_SINGLE_I32_VARARG]], ptr [[VARARG_BUFFER]], i32 0, i32 0 +; ABI-NEXT: store i32 [[X:%.*]], ptr [[TMP1]], align 4 +; ABI-NEXT: call void [[TMP0]](ptr [[VARARG_BUFFER]]) +; ABI-NEXT: call void @llvm.lifetime.end.p0(i64 4, ptr [[VARARG_BUFFER]]) +; ABI-NEXT: ret void +; +entry: + %0 = load volatile ptr, ptr @vararg_ptr, align 4 + tail call void (...) %0(i32 noundef %x) + ret void +} + +define hidden void @fptr_libcS(ptr noundef byval(%struct.libcS) align 8 %x) { +; OPT-LABEL: @fptr_libcS( +; OPT-NEXT: entry: +; OPT-NEXT: [[TMP0:%.*]] = load volatile ptr, ptr @vararg_ptr, align 4 +; OPT-NEXT: tail call void (...) 
[[TMP0]](ptr noundef nonnull byval([[STRUCT_LIBCS:%.*]]) align 8 [[X:%.*]]) +; OPT-NEXT: ret void +; +; ABI-LABEL: @fptr_libcS( +; ABI-NEXT: entry: +; ABI-NEXT: [[INDIRECTALLOCA:%.*]] = alloca [[STRUCT_LIBCS:%.*]], align 8 +; ABI-NEXT: [[VARARG_BUFFER:%.*]] = alloca [[FPTR_LIBCS_VARARG:%.*]], align 16 +; ABI-NEXT: [[TMP0:%.*]] = load volatile ptr, ptr @vararg_ptr, align 4 +; ABI-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr [[INDIRECTALLOCA]], ptr [[X:%.*]], i64 24, i1 false) +; ABI-NEXT: call void @llvm.lifetime.start.p0(i64 4, ptr [[VARARG_BUFFER]]) +; ABI-NEXT: [[TMP1:%.*]] = getelementptr inbounds [[FPTR_LIBCS_VARARG]], ptr [[VARARG_BUFFER]], i32 0, i32 0 +; ABI-NEXT: store ptr [[INDIRECTALLOCA]], ptr [[TMP1]], align 4 +; ABI-NEXT: call void [[TMP0]](ptr [[VARARG_BUFFER]]) +; ABI-NEXT: call void @llvm.lifetime.end.p0(i64 4, ptr [[VARARG_BUFFER]]) +; ABI-NEXT: ret void +; +entry: + %0 = load volatile ptr, ptr @vararg_ptr, align 4 + tail call void (...) %0(ptr noundef nonnull byval(%struct.libcS) align 8 %x) + ret void +} diff --git a/llvm/test/Transforms/ExpandVariadics/intrinsics.ll b/llvm/test/Transforms/ExpandVariadics/intrinsics.ll new file mode 100644 index 0000000..1782c92 --- /dev/null +++ b/llvm/test/Transforms/ExpandVariadics/intrinsics.ll @@ -0,0 +1,120 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt -mtriple=wasm32-unknown-unknown -S --passes=expand-variadics --expand-variadics-override=optimize < %s | FileCheck %s -check-prefixes=CHECK,OPT +; RUN: opt -mtriple=wasm32-unknown-unknown -S --passes=expand-variadics --expand-variadics-override=lowering < %s | FileCheck %s -check-prefixes=CHECK,ABI +; REQUIRES: webassembly-registered-target + +declare void @llvm.lifetime.start.p0(i64 immarg, ptr nocapture) + +declare void @llvm.va_copy.p0(ptr, ptr) + +declare void @valist(ptr noundef) + +declare void @llvm.lifetime.end.p0(i64 immarg, ptr nocapture) + +declare void @llvm.va_start.p0(ptr) + +declare void 
@llvm.va_end.p0(ptr) + + +define void @start_once(...) { +; OPT-LABEL: @start_once( +; OPT-NEXT: entry: +; OPT-NEXT: [[VA_START:%.*]] = alloca ptr, align 4 +; OPT-NEXT: call void @llvm.lifetime.start.p0(i64 4, ptr [[VA_START]]) +; OPT-NEXT: call void @llvm.va_start.p0(ptr [[VA_START]]) +; OPT-NEXT: [[TMP0:%.*]] = load ptr, ptr [[VA_START]], align 4 +; OPT-NEXT: call void @start_once.valist(ptr [[TMP0]]) +; OPT-NEXT: call void @llvm.lifetime.end.p0(i64 4, ptr [[VA_START]]) +; OPT-NEXT: ret void +; +; ABI-LABEL: @start_once( +; ABI-NEXT: entry: +; ABI-NEXT: [[S:%.*]] = alloca ptr, align 4 +; ABI-NEXT: call void @llvm.lifetime.start.p0(i64 4, ptr nonnull [[S]]) +; ABI-NEXT: store ptr [[VARARGS:%.*]], ptr [[S]], align 4 +; ABI-NEXT: [[TMP0:%.*]] = load ptr, ptr [[S]], align 4 +; ABI-NEXT: call void @valist(ptr noundef [[TMP0]]) +; ABI-NEXT: call void @llvm.lifetime.end.p0(i64 4, ptr nonnull [[S]]) +; ABI-NEXT: ret void +; +entry: + %s = alloca ptr, align 4 + call void @llvm.lifetime.start.p0(i64 4, ptr nonnull %s) + call void @llvm.va_start.p0(ptr nonnull %s) + %0 = load ptr, ptr %s, align 4 + call void @valist(ptr noundef %0) + call void @llvm.va_end.p0(ptr %s) + call void @llvm.lifetime.end.p0(i64 4, ptr nonnull %s) + ret void +} + + +define void @start_twice(...) 
{ +; OPT-LABEL: @start_twice( +; OPT-NEXT: entry: +; OPT-NEXT: [[VA_START:%.*]] = alloca ptr, align 4 +; OPT-NEXT: call void @llvm.lifetime.start.p0(i64 4, ptr [[VA_START]]) +; OPT-NEXT: call void @llvm.va_start.p0(ptr [[VA_START]]) +; OPT-NEXT: [[TMP0:%.*]] = load ptr, ptr [[VA_START]], align 4 +; OPT-NEXT: call void @start_twice.valist(ptr [[TMP0]]) +; OPT-NEXT: call void @llvm.lifetime.end.p0(i64 4, ptr [[VA_START]]) +; OPT-NEXT: ret void +; +; ABI-LABEL: @start_twice( +; ABI-NEXT: entry: +; ABI-NEXT: [[S0:%.*]] = alloca ptr, align 4 +; ABI-NEXT: [[S1:%.*]] = alloca ptr, align 4 +; ABI-NEXT: call void @llvm.lifetime.start.p0(i64 4, ptr nonnull [[S0]]) +; ABI-NEXT: call void @llvm.lifetime.start.p0(i64 4, ptr nonnull [[S1]]) +; ABI-NEXT: store ptr [[VARARGS:%.*]], ptr [[S0]], align 4 +; ABI-NEXT: [[TMP0:%.*]] = load ptr, ptr [[S0]], align 4 +; ABI-NEXT: call void @valist(ptr noundef [[TMP0]]) +; ABI-NEXT: store ptr [[VARARGS]], ptr [[S1]], align 4 +; ABI-NEXT: [[TMP1:%.*]] = load ptr, ptr [[S1]], align 4 +; ABI-NEXT: call void @valist(ptr noundef [[TMP1]]) +; ABI-NEXT: call void @llvm.lifetime.end.p0(i64 4, ptr nonnull [[S1]]) +; ABI-NEXT: call void @llvm.lifetime.end.p0(i64 4, ptr nonnull [[S0]]) +; ABI-NEXT: ret void +; +entry: + %s0 = alloca ptr, align 4 + %s1 = alloca ptr, align 4 + call void @llvm.lifetime.start.p0(i64 4, ptr nonnull %s0) + call void @llvm.lifetime.start.p0(i64 4, ptr nonnull %s1) + call void @llvm.va_start.p0(ptr nonnull %s0) + %0 = load ptr, ptr %s0, align 4 + call void @valist(ptr noundef %0) + call void @llvm.va_end.p0(ptr %s0) + call void @llvm.va_start.p0(ptr nonnull %s1) + %1 = load ptr, ptr %s1, align 4 + call void @valist(ptr noundef %1) + call void @llvm.va_end.p0(ptr %s1) + call void @llvm.lifetime.end.p0(i64 4, ptr nonnull %s1) + call void @llvm.lifetime.end.p0(i64 4, ptr nonnull %s0) + ret void +} + +define void @copy(ptr noundef %va) { +; CHECK-LABEL: @copy( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[VA_ADDR:%.*]] = alloca ptr, 
align 4 +; CHECK-NEXT: [[CP:%.*]] = alloca ptr, align 4 +; CHECK-NEXT: store ptr [[VA:%.*]], ptr [[VA_ADDR]], align 4 +; CHECK-NEXT: call void @llvm.lifetime.start.p0(i64 4, ptr nonnull [[CP]]) +; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr [[CP]], ptr [[VA_ADDR]], i32 4, i1 false) +; CHECK-NEXT: [[TMP0:%.*]] = load ptr, ptr [[CP]], align 4 +; CHECK-NEXT: call void @valist(ptr noundef [[TMP0]]) +; CHECK-NEXT: call void @llvm.lifetime.end.p0(i64 4, ptr nonnull [[CP]]) +; CHECK-NEXT: ret void +; +entry: + %va.addr = alloca ptr, align 4 + %cp = alloca ptr, align 4 + store ptr %va, ptr %va.addr, align 4 + call void @llvm.lifetime.start.p0(i64 4, ptr nonnull %cp) + call void @llvm.va_copy.p0(ptr nonnull %cp, ptr nonnull %va.addr) + %0 = load ptr, ptr %cp, align 4 + call void @valist(ptr noundef %0) + call void @llvm.lifetime.end.p0(i64 4, ptr nonnull %cp) + ret void +} diff --git a/llvm/test/Transforms/ExpandVariadics/invoke.ll b/llvm/test/Transforms/ExpandVariadics/invoke.ll new file mode 100644 index 0000000..ced2edf --- /dev/null +++ b/llvm/test/Transforms/ExpandVariadics/invoke.ll @@ -0,0 +1,89 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt -mtriple=wasm32-unknown-unknown -S --passes=expand-variadics --expand-variadics-override=optimize < %s | FileCheck %s -check-prefixes=CHECK +; RUN: not --crash opt -mtriple=wasm32-unknown-unknown -S --passes=expand-variadics --expand-variadics-override=lowering < %s 2>&1 | FileCheck %s -check-prefixes=ERROR +; REQUIRES: webassembly-registered-target +target datalayout = "e-m:e-p:32:32-p10:8:8-p20:8:8-i64:64-n32:64-S128-ni:1:10:20" + +; ERROR: LLVM ERROR: Cannot lower callbase instruction + +@_ZTIi = external constant ptr + +; Function Attrs: mustprogress +define hidden void @test0(i32 noundef %x) #0 personality ptr @__gxx_wasm_personality_v0 { +; CHECK-LABEL: @test0( +; CHECK-NEXT: entry: +; CHECK-NEXT: invoke void (...) 
@may_throw(i32 noundef [[X:%.*]]) +; CHECK-NEXT: to label [[TRY_CONT:%.*]] unwind label [[CATCH_DISPATCH:%.*]] +; CHECK: catch.dispatch: +; CHECK-NEXT: [[TMP0:%.*]] = catchswitch within none [label %catch.start] unwind to caller +; CHECK: catch.start: +; CHECK-NEXT: [[TMP1:%.*]] = catchpad within [[TMP0]] [ptr @_ZTIi] +; CHECK-NEXT: [[TMP2:%.*]] = tail call ptr @llvm.wasm.get.exception(token [[TMP1]]) +; CHECK-NEXT: [[TMP3:%.*]] = tail call i32 @llvm.wasm.get.ehselector(token [[TMP1]]) +; CHECK-NEXT: [[TMP4:%.*]] = tail call i32 @llvm.eh.typeid.for.p0(ptr nonnull @_ZTIi) +; CHECK-NEXT: [[MATCHES:%.*]] = icmp eq i32 [[TMP3]], [[TMP4]] +; CHECK-NEXT: br i1 [[MATCHES]], label [[CATCH:%.*]], label [[RETHROW:%.*]] +; CHECK: catch: +; CHECK-NEXT: [[TMP5:%.*]] = call ptr @__cxa_begin_catch(ptr [[TMP2]]) [ "funclet"(token [[TMP1]]) ] +; CHECK-NEXT: call void (...) @dont_throw(i32 noundef [[X]]) [ "funclet"(token [[TMP1]]) ] +; CHECK-NEXT: call void @__cxa_end_catch() [ "funclet"(token [[TMP1]]) ] +; CHECK-NEXT: catchret from [[TMP1]] to label [[TRY_CONT]] +; CHECK: rethrow: +; CHECK-NEXT: call void @llvm.wasm.rethrow() [ "funclet"(token [[TMP1]]) ] +; CHECK-NEXT: unreachable +; CHECK: try.cont: +; CHECK-NEXT: ret void +; +entry: + invoke void (...) @may_throw(i32 noundef %x) + to label %try.cont unwind label %catch.dispatch + +catch.dispatch: ; preds = %entry + %0 = catchswitch within none [label %catch.start] unwind to caller + +catch.start: ; preds = %catch.dispatch + %1 = catchpad within %0 [ptr @_ZTIi] + %2 = tail call ptr @llvm.wasm.get.exception(token %1) + %3 = tail call i32 @llvm.wasm.get.ehselector(token %1) + %4 = tail call i32 @llvm.eh.typeid.for.p0(ptr nonnull @_ZTIi) + %matches = icmp eq i32 %3, %4 + br i1 %matches, label %catch, label %rethrow + +catch: ; preds = %catch.start + %5 = call ptr @__cxa_begin_catch(ptr %2) #6 [ "funclet"(token %1) ] + call void (...) 
@dont_throw(i32 noundef %x) #6 [ "funclet"(token %1) ] + call void @__cxa_end_catch() #6 [ "funclet"(token %1) ] + catchret from %1 to label %try.cont + +rethrow: ; preds = %catch.start + call void @llvm.wasm.rethrow() #5 [ "funclet"(token %1) ] + unreachable + +try.cont: ; preds = %entry, %catch + ret void +} + +declare void @may_throw(...) + +declare i32 @__gxx_wasm_personality_v0(...) + +; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn +declare ptr @llvm.wasm.get.exception(token) + +; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn +declare i32 @llvm.wasm.get.ehselector(token) + +; Function Attrs: nofree nosync nounwind memory(none) +declare i32 @llvm.eh.typeid.for.p0(ptr) + +declare ptr @__cxa_begin_catch(ptr) + +; Function Attrs: nounwind +declare void @dont_throw(...) + +declare void @__cxa_end_catch() + +; Function Attrs: noreturn +declare void @llvm.wasm.rethrow() + + diff --git a/llvm/test/Transforms/ExpandVariadics/pass-byval-byref.ll b/llvm/test/Transforms/ExpandVariadics/pass-byval-byref.ll new file mode 100644 index 0000000..85fefda --- /dev/null +++ b/llvm/test/Transforms/ExpandVariadics/pass-byval-byref.ll @@ -0,0 +1,153 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt -mtriple=wasm32-unknown-unknown -S --passes=expand-variadics --expand-variadics-override=optimize < %s | FileCheck %s -check-prefixes=OPT +; RUN: opt -mtriple=wasm32-unknown-unknown -S --passes=expand-variadics --expand-variadics-override=lowering < %s | FileCheck %s -check-prefixes=ABI +; REQUIRES: webassembly-registered-target + +; CHECK: @sink +declare void @sink(...) + + +define void @pass_byval(ptr byval(i32) %b) { +; OPT-LABEL: @pass_byval( +; OPT-NEXT: entry: +; OPT-NEXT: tail call void (...) 
@sink(ptr byval(i32) [[B:%.*]]) +; OPT-NEXT: ret void +; +; ABI-LABEL: @pass_byval( +; ABI-NEXT: entry: +; ABI-NEXT: [[VARARG_BUFFER:%.*]] = alloca [[PASS_BYVAL_VARARG:%.*]], align 16 +; ABI-NEXT: call void @llvm.lifetime.start.p0(i64 4, ptr [[VARARG_BUFFER]]) +; ABI-NEXT: [[TMP0:%.*]] = getelementptr inbounds [[PASS_BYVAL_VARARG]], ptr [[VARARG_BUFFER]], i32 0, i32 0 +; ABI-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr [[TMP0]], ptr [[B:%.*]], i64 4, i1 false) +; ABI-NEXT: call void @sink(ptr [[VARARG_BUFFER]]) +; ABI-NEXT: call void @llvm.lifetime.end.p0(i64 4, ptr [[VARARG_BUFFER]]) +; ABI-NEXT: ret void +; +entry: + tail call void (...) @sink(ptr byval(i32) %b) + ret void +} + +%struct.libcS = type { i8, i16, i32, i32, float, double } + +define void @i32_libcS_byval(i32 %x, ptr noundef byval(%struct.libcS) align 8 %y) { +; OPT-LABEL: @i32_libcS_byval( +; OPT-NEXT: entry: +; OPT-NEXT: tail call void (...) @sink(i32 [[X:%.*]], ptr byval([[STRUCT_LIBCS:%.*]]) align 8 [[Y:%.*]]) +; OPT-NEXT: ret void +; +; ABI-LABEL: @i32_libcS_byval( +; ABI-NEXT: entry: +; ABI-NEXT: [[INDIRECTALLOCA:%.*]] = alloca [[STRUCT_LIBCS:%.*]], align 8 +; ABI-NEXT: [[VARARG_BUFFER:%.*]] = alloca [[I32_LIBCS_BYVAL_VARARG:%.*]], align 16 +; ABI-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr [[INDIRECTALLOCA]], ptr [[Y:%.*]], i64 24, i1 false) +; ABI-NEXT: call void @llvm.lifetime.start.p0(i64 8, ptr [[VARARG_BUFFER]]) +; ABI-NEXT: [[TMP0:%.*]] = getelementptr inbounds [[I32_LIBCS_BYVAL_VARARG]], ptr [[VARARG_BUFFER]], i32 0, i32 0 +; ABI-NEXT: store i32 [[X:%.*]], ptr [[TMP0]], align 4 +; ABI-NEXT: [[TMP1:%.*]] = getelementptr inbounds [[I32_LIBCS_BYVAL_VARARG]], ptr [[VARARG_BUFFER]], i32 0, i32 1 +; ABI-NEXT: store ptr [[INDIRECTALLOCA]], ptr [[TMP1]], align 4 +; ABI-NEXT: call void @sink(ptr [[VARARG_BUFFER]]) +; ABI-NEXT: call void @llvm.lifetime.end.p0(i64 8, ptr [[VARARG_BUFFER]]) +; ABI-NEXT: ret void +; +entry: + tail call void (...) 
@sink(i32 %x, ptr byval(%struct.libcS) align 8 %y) + ret void +} + +define void @libcS_i32_byval(ptr byval(%struct.libcS) align 8 %x, i32 %y) { +; OPT-LABEL: @libcS_i32_byval( +; OPT-NEXT: entry: +; OPT-NEXT: tail call void (...) @sink(ptr byval([[STRUCT_LIBCS:%.*]]) align 8 [[X:%.*]], i32 [[Y:%.*]]) +; OPT-NEXT: ret void +; +; ABI-LABEL: @libcS_i32_byval( +; ABI-NEXT: entry: +; ABI-NEXT: [[INDIRECTALLOCA:%.*]] = alloca [[STRUCT_LIBCS:%.*]], align 8 +; ABI-NEXT: [[VARARG_BUFFER:%.*]] = alloca [[LIBCS_I32_BYVAL_VARARG:%.*]], align 16 +; ABI-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr [[INDIRECTALLOCA]], ptr [[X:%.*]], i64 24, i1 false) +; ABI-NEXT: call void @llvm.lifetime.start.p0(i64 8, ptr [[VARARG_BUFFER]]) +; ABI-NEXT: [[TMP0:%.*]] = getelementptr inbounds [[LIBCS_I32_BYVAL_VARARG]], ptr [[VARARG_BUFFER]], i32 0, i32 0 +; ABI-NEXT: store ptr [[INDIRECTALLOCA]], ptr [[TMP0]], align 4 +; ABI-NEXT: [[TMP1:%.*]] = getelementptr inbounds [[LIBCS_I32_BYVAL_VARARG]], ptr [[VARARG_BUFFER]], i32 0, i32 1 +; ABI-NEXT: store i32 [[Y:%.*]], ptr [[TMP1]], align 4 +; ABI-NEXT: call void @sink(ptr [[VARARG_BUFFER]]) +; ABI-NEXT: call void @llvm.lifetime.end.p0(i64 8, ptr [[VARARG_BUFFER]]) +; ABI-NEXT: ret void +; +entry: + tail call void (...) @sink(ptr byval(%struct.libcS) align 8 %x, i32 %y) + ret void +} + + +define void @pass_byref(ptr byref(i32) %b) { +; OPT-LABEL: @pass_byref( +; OPT-NEXT: entry: +; OPT-NEXT: tail call void (...) 
@sink(ptr byref(i32) [[B:%.*]]) +; OPT-NEXT: ret void +; +; ABI-LABEL: @pass_byref( +; ABI-NEXT: entry: +; ABI-NEXT: [[VARARG_BUFFER:%.*]] = alloca [[PASS_BYREF_VARARG:%.*]], align 16 +; ABI-NEXT: call void @llvm.lifetime.start.p0(i64 4, ptr [[VARARG_BUFFER]]) +; ABI-NEXT: [[TMP0:%.*]] = getelementptr inbounds [[PASS_BYREF_VARARG]], ptr [[VARARG_BUFFER]], i32 0, i32 0 +; ABI-NEXT: store ptr [[B:%.*]], ptr [[TMP0]], align 4 +; ABI-NEXT: call void @sink(ptr [[VARARG_BUFFER]]) +; ABI-NEXT: call void @llvm.lifetime.end.p0(i64 4, ptr [[VARARG_BUFFER]]) +; ABI-NEXT: ret void +; +entry: + tail call void (...) @sink(ptr byref(i32) %b) + ret void +} + +define void @i32_libcS_byref(i32 %x, ptr noundef byref(%struct.libcS) align 8 %y) { +; OPT-LABEL: @i32_libcS_byref( +; OPT-NEXT: entry: +; OPT-NEXT: tail call void (...) @sink(i32 [[X:%.*]], ptr byref([[STRUCT_LIBCS:%.*]]) align 8 [[Y:%.*]]) +; OPT-NEXT: ret void +; +; ABI-LABEL: @i32_libcS_byref( +; ABI-NEXT: entry: +; ABI-NEXT: [[INDIRECTALLOCA:%.*]] = alloca [[STRUCT_LIBCS:%.*]], align 8 +; ABI-NEXT: [[VARARG_BUFFER:%.*]] = alloca [[I32_LIBCS_BYREF_VARARG:%.*]], align 16 +; ABI-NEXT: store ptr [[Y:%.*]], ptr [[INDIRECTALLOCA]], align 4 +; ABI-NEXT: call void @llvm.lifetime.start.p0(i64 8, ptr [[VARARG_BUFFER]]) +; ABI-NEXT: [[TMP0:%.*]] = getelementptr inbounds [[I32_LIBCS_BYREF_VARARG]], ptr [[VARARG_BUFFER]], i32 0, i32 0 +; ABI-NEXT: store i32 [[X:%.*]], ptr [[TMP0]], align 4 +; ABI-NEXT: [[TMP1:%.*]] = getelementptr inbounds [[I32_LIBCS_BYREF_VARARG]], ptr [[VARARG_BUFFER]], i32 0, i32 1 +; ABI-NEXT: store ptr [[INDIRECTALLOCA]], ptr [[TMP1]], align 4 +; ABI-NEXT: call void @sink(ptr [[VARARG_BUFFER]]) +; ABI-NEXT: call void @llvm.lifetime.end.p0(i64 8, ptr [[VARARG_BUFFER]]) +; ABI-NEXT: ret void +; +entry: + tail call void (...) 
@sink(i32 %x, ptr byref(%struct.libcS) align 8 %y) + ret void +} + +define void @libcS_i32_byref(ptr byref(%struct.libcS) align 8 %x, i32 %y) { +; OPT-LABEL: @libcS_i32_byref( +; OPT-NEXT: entry: +; OPT-NEXT: tail call void (...) @sink(ptr byref([[STRUCT_LIBCS:%.*]]) align 8 [[X:%.*]], i32 [[Y:%.*]]) +; OPT-NEXT: ret void +; +; ABI-LABEL: @libcS_i32_byref( +; ABI-NEXT: entry: +; ABI-NEXT: [[INDIRECTALLOCA:%.*]] = alloca [[STRUCT_LIBCS:%.*]], align 8 +; ABI-NEXT: [[VARARG_BUFFER:%.*]] = alloca [[LIBCS_I32_BYREF_VARARG:%.*]], align 16 +; ABI-NEXT: store ptr [[X:%.*]], ptr [[INDIRECTALLOCA]], align 4 +; ABI-NEXT: call void @llvm.lifetime.start.p0(i64 8, ptr [[VARARG_BUFFER]]) +; ABI-NEXT: [[TMP0:%.*]] = getelementptr inbounds [[LIBCS_I32_BYREF_VARARG]], ptr [[VARARG_BUFFER]], i32 0, i32 0 +; ABI-NEXT: store ptr [[INDIRECTALLOCA]], ptr [[TMP0]], align 4 +; ABI-NEXT: [[TMP1:%.*]] = getelementptr inbounds [[LIBCS_I32_BYREF_VARARG]], ptr [[VARARG_BUFFER]], i32 0, i32 1 +; ABI-NEXT: store i32 [[Y:%.*]], ptr [[TMP1]], align 4 +; ABI-NEXT: call void @sink(ptr [[VARARG_BUFFER]]) +; ABI-NEXT: call void @llvm.lifetime.end.p0(i64 8, ptr [[VARARG_BUFFER]]) +; ABI-NEXT: ret void +; +entry: + tail call void (...) @sink(ptr byref(%struct.libcS) align 8 %x, i32 %y) + ret void +} diff --git a/llvm/test/Transforms/ExpandVariadics/pass-indirect.ll b/llvm/test/Transforms/ExpandVariadics/pass-indirect.ll new file mode 100644 index 0000000..8dcbb86 --- /dev/null +++ b/llvm/test/Transforms/ExpandVariadics/pass-indirect.ll @@ -0,0 +1,59 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt -mtriple=wasm32-unknown-unknown -S --passes=expand-variadics --expand-variadics-override=optimize < %s | FileCheck %s -check-prefixes=OPT +; RUN: opt -mtriple=wasm32-unknown-unknown -S --passes=expand-variadics --expand-variadics-override=lowering < %s | FileCheck %s -check-prefixes=ABI +; REQUIRES: webassembly-registered-target + +; CHECK: @sink +declare void @sink(...) 
+ +%struct.libcS = type { i8, i16, i32, i32, float, double } + +define void @i32_libcS(i32 %x, %struct.libcS %y) { +; OPT-LABEL: @i32_libcS( +; OPT-NEXT: entry: +; OPT-NEXT: tail call void (...) @sink(i32 [[X:%.*]], [[STRUCT_LIBCS:%.*]] [[Y:%.*]]) +; OPT-NEXT: ret void +; +; ABI-LABEL: @i32_libcS( +; ABI-NEXT: entry: +; ABI-NEXT: [[INDIRECTALLOCA:%.*]] = alloca [[STRUCT_LIBCS:%.*]], align 8 +; ABI-NEXT: [[VARARG_BUFFER:%.*]] = alloca [[I32_LIBCS_VARARG:%.*]], align 16 +; ABI-NEXT: store [[STRUCT_LIBCS]] [[Y:%.*]], ptr [[INDIRECTALLOCA]], align 8 +; ABI-NEXT: call void @llvm.lifetime.start.p0(i64 8, ptr [[VARARG_BUFFER]]) +; ABI-NEXT: [[TMP0:%.*]] = getelementptr inbounds [[I32_LIBCS_VARARG]], ptr [[VARARG_BUFFER]], i32 0, i32 0 +; ABI-NEXT: store i32 [[X:%.*]], ptr [[TMP0]], align 4 +; ABI-NEXT: [[TMP1:%.*]] = getelementptr inbounds [[I32_LIBCS_VARARG]], ptr [[VARARG_BUFFER]], i32 0, i32 1 +; ABI-NEXT: store ptr [[INDIRECTALLOCA]], ptr [[TMP1]], align 4 +; ABI-NEXT: call void @sink(ptr [[VARARG_BUFFER]]) +; ABI-NEXT: call void @llvm.lifetime.end.p0(i64 8, ptr [[VARARG_BUFFER]]) +; ABI-NEXT: ret void +; +entry: + tail call void (...) @sink(i32 %x, %struct.libcS %y) + ret void +} + +define void @libcS_i32(%struct.libcS %x, i32 %y) { +; OPT-LABEL: @libcS_i32( +; OPT-NEXT: entry: +; OPT-NEXT: tail call void (...) 
@sink([[STRUCT_LIBCS:%.*]] [[X:%.*]], i32 [[Y:%.*]]) +; OPT-NEXT: ret void +; +; ABI-LABEL: @libcS_i32( +; ABI-NEXT: entry: +; ABI-NEXT: [[INDIRECTALLOCA:%.*]] = alloca [[STRUCT_LIBCS:%.*]], align 8 +; ABI-NEXT: [[VARARG_BUFFER:%.*]] = alloca [[LIBCS_I32_VARARG:%.*]], align 16 +; ABI-NEXT: store [[STRUCT_LIBCS]] [[X:%.*]], ptr [[INDIRECTALLOCA]], align 8 +; ABI-NEXT: call void @llvm.lifetime.start.p0(i64 8, ptr [[VARARG_BUFFER]]) +; ABI-NEXT: [[TMP0:%.*]] = getelementptr inbounds [[LIBCS_I32_VARARG]], ptr [[VARARG_BUFFER]], i32 0, i32 0 +; ABI-NEXT: store ptr [[INDIRECTALLOCA]], ptr [[TMP0]], align 4 +; ABI-NEXT: [[TMP1:%.*]] = getelementptr inbounds [[LIBCS_I32_VARARG]], ptr [[VARARG_BUFFER]], i32 0, i32 1 +; ABI-NEXT: store i32 [[Y:%.*]], ptr [[TMP1]], align 4 +; ABI-NEXT: call void @sink(ptr [[VARARG_BUFFER]]) +; ABI-NEXT: call void @llvm.lifetime.end.p0(i64 8, ptr [[VARARG_BUFFER]]) +; ABI-NEXT: ret void +; +entry: + tail call void (...) @sink(%struct.libcS %x, i32 %y) + ret void +} diff --git a/llvm/test/Transforms/ExpandVariadics/pass-integers.ll b/llvm/test/Transforms/ExpandVariadics/pass-integers.ll new file mode 100644 index 0000000..a1cb681 --- /dev/null +++ b/llvm/test/Transforms/ExpandVariadics/pass-integers.ll @@ -0,0 +1,345 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt -mtriple=wasm32-unknown-unknown -S --passes=expand-variadics --expand-variadics-override=optimize < %s | FileCheck %s -check-prefixes=OPT +; RUN: opt -mtriple=wasm32-unknown-unknown -S --passes=expand-variadics --expand-variadics-override=lowering < %s | FileCheck %s -check-prefixes=ABI +; REQUIRES: webassembly-registered-target + +; Wasm passes struct {char} as an i8 so can check the varargs passing works on integers smaller than the slot size + +declare void @sink(...) + + +define void @pass_nothing() { +; OPT-LABEL: @pass_nothing( +; OPT-NEXT: entry: +; OPT-NEXT: tail call void (...) 
@sink() +; OPT-NEXT: ret void +; +; ABI-LABEL: @pass_nothing( +; ABI-NEXT: entry: +; ABI-NEXT: [[VARARG_BUFFER:%.*]] = alloca [[PASS_NOTHING_VARARG:%.*]], align 16 +; ABI-NEXT: call void @llvm.lifetime.start.p0(i64 1, ptr [[VARARG_BUFFER]]) +; ABI-NEXT: call void @sink(ptr [[VARARG_BUFFER]]) +; ABI-NEXT: call void @llvm.lifetime.end.p0(i64 1, ptr [[VARARG_BUFFER]]) +; ABI-NEXT: ret void +; +entry: + tail call void (...) @sink() + ret void +} + +define void @pass_s1(i8 %x) { +; OPT-LABEL: @pass_s1( +; OPT-NEXT: entry: +; OPT-NEXT: tail call void (...) @sink(i8 [[X:%.*]]) +; OPT-NEXT: ret void +; +; ABI-LABEL: @pass_s1( +; ABI-NEXT: entry: +; ABI-NEXT: [[VARARG_BUFFER:%.*]] = alloca [[PASS_S1_VARARG:%.*]], align 16 +; ABI-NEXT: call void @llvm.lifetime.start.p0(i64 1, ptr [[VARARG_BUFFER]]) +; ABI-NEXT: [[TMP0:%.*]] = getelementptr inbounds [[PASS_S1_VARARG]], ptr [[VARARG_BUFFER]], i32 0, i32 0 +; ABI-NEXT: store i8 [[X:%.*]], ptr [[TMP0]], align 1 +; ABI-NEXT: call void @sink(ptr [[VARARG_BUFFER]]) +; ABI-NEXT: call void @llvm.lifetime.end.p0(i64 1, ptr [[VARARG_BUFFER]]) +; ABI-NEXT: ret void +; +entry: + tail call void (...) @sink(i8 %x) + ret void +} + +define void @pass_s2(i16 %x) { +; OPT-LABEL: @pass_s2( +; OPT-NEXT: entry: +; OPT-NEXT: tail call void (...) @sink(i16 [[X:%.*]]) +; OPT-NEXT: ret void +; +; ABI-LABEL: @pass_s2( +; ABI-NEXT: entry: +; ABI-NEXT: [[VARARG_BUFFER:%.*]] = alloca [[PASS_S2_VARARG:%.*]], align 16 +; ABI-NEXT: call void @llvm.lifetime.start.p0(i64 2, ptr [[VARARG_BUFFER]]) +; ABI-NEXT: [[TMP0:%.*]] = getelementptr inbounds [[PASS_S2_VARARG]], ptr [[VARARG_BUFFER]], i32 0, i32 0 +; ABI-NEXT: store i16 [[X:%.*]], ptr [[TMP0]], align 2 +; ABI-NEXT: call void @sink(ptr [[VARARG_BUFFER]]) +; ABI-NEXT: call void @llvm.lifetime.end.p0(i64 2, ptr [[VARARG_BUFFER]]) +; ABI-NEXT: ret void +; +entry: + tail call void (...) 
@sink(i16 %x) + ret void +} + +define void @pass_s3(i32 %x) { +; OPT-LABEL: @pass_s3( +; OPT-NEXT: entry: +; OPT-NEXT: tail call void (...) @sink(i32 [[X:%.*]]) +; OPT-NEXT: ret void +; +; ABI-LABEL: @pass_s3( +; ABI-NEXT: entry: +; ABI-NEXT: [[VARARG_BUFFER:%.*]] = alloca [[PASS_S3_VARARG:%.*]], align 16 +; ABI-NEXT: call void @llvm.lifetime.start.p0(i64 4, ptr [[VARARG_BUFFER]]) +; ABI-NEXT: [[TMP0:%.*]] = getelementptr inbounds [[PASS_S3_VARARG]], ptr [[VARARG_BUFFER]], i32 0, i32 0 +; ABI-NEXT: store i32 [[X:%.*]], ptr [[TMP0]], align 4 +; ABI-NEXT: call void @sink(ptr [[VARARG_BUFFER]]) +; ABI-NEXT: call void @llvm.lifetime.end.p0(i64 4, ptr [[VARARG_BUFFER]]) +; ABI-NEXT: ret void +; +entry: + tail call void (...) @sink(i32 %x) + ret void +} + +define void @pass_s4(i64 %x) { +; OPT-LABEL: @pass_s4( +; OPT-NEXT: entry: +; OPT-NEXT: tail call void (...) @sink(i64 [[X:%.*]]) +; OPT-NEXT: ret void +; +; ABI-LABEL: @pass_s4( +; ABI-NEXT: entry: +; ABI-NEXT: [[VARARG_BUFFER:%.*]] = alloca [[PASS_S4_VARARG:%.*]], align 16 +; ABI-NEXT: call void @llvm.lifetime.start.p0(i64 8, ptr [[VARARG_BUFFER]]) +; ABI-NEXT: [[TMP0:%.*]] = getelementptr inbounds [[PASS_S4_VARARG]], ptr [[VARARG_BUFFER]], i32 0, i32 0 +; ABI-NEXT: store i64 [[X:%.*]], ptr [[TMP0]], align 8 +; ABI-NEXT: call void @sink(ptr [[VARARG_BUFFER]]) +; ABI-NEXT: call void @llvm.lifetime.end.p0(i64 8, ptr [[VARARG_BUFFER]]) +; ABI-NEXT: ret void +; +entry: + tail call void (...) @sink(i64 %x) + ret void +} + +define void @pass_s5(<4 x i32> %x) { +; OPT-LABEL: @pass_s5( +; OPT-NEXT: entry: +; OPT-NEXT: tail call void (...) 
@sink(<4 x i32> [[X:%.*]]) +; OPT-NEXT: ret void +; +; ABI-LABEL: @pass_s5( +; ABI-NEXT: entry: +; ABI-NEXT: [[VARARG_BUFFER:%.*]] = alloca [[PASS_S5_VARARG:%.*]], align 16 +; ABI-NEXT: call void @llvm.lifetime.start.p0(i64 16, ptr [[VARARG_BUFFER]]) +; ABI-NEXT: [[TMP0:%.*]] = getelementptr inbounds [[PASS_S5_VARARG]], ptr [[VARARG_BUFFER]], i32 0, i32 0 +; ABI-NEXT: store <4 x i32> [[X:%.*]], ptr [[TMP0]], align 16 +; ABI-NEXT: call void @sink(ptr [[VARARG_BUFFER]]) +; ABI-NEXT: call void @llvm.lifetime.end.p0(i64 16, ptr [[VARARG_BUFFER]]) +; ABI-NEXT: ret void +; +entry: + tail call void (...) @sink(<4 x i32> %x) + ret void +} + +define void @pass_int_s1(i32 %i, i8 %x) { +; OPT-LABEL: @pass_int_s1( +; OPT-NEXT: entry: +; OPT-NEXT: tail call void (...) @sink(i32 [[I:%.*]], i8 [[X:%.*]]) +; OPT-NEXT: ret void +; +; ABI-LABEL: @pass_int_s1( +; ABI-NEXT: entry: +; ABI-NEXT: [[VARARG_BUFFER:%.*]] = alloca [[PASS_INT_S1_VARARG:%.*]], align 16 +; ABI-NEXT: call void @llvm.lifetime.start.p0(i64 5, ptr [[VARARG_BUFFER]]) +; ABI-NEXT: [[TMP0:%.*]] = getelementptr inbounds [[PASS_INT_S1_VARARG]], ptr [[VARARG_BUFFER]], i32 0, i32 0 +; ABI-NEXT: store i32 [[I:%.*]], ptr [[TMP0]], align 4 +; ABI-NEXT: [[TMP1:%.*]] = getelementptr inbounds [[PASS_INT_S1_VARARG]], ptr [[VARARG_BUFFER]], i32 0, i32 1 +; ABI-NEXT: store i8 [[X:%.*]], ptr [[TMP1]], align 1 +; ABI-NEXT: call void @sink(ptr [[VARARG_BUFFER]]) +; ABI-NEXT: call void @llvm.lifetime.end.p0(i64 5, ptr [[VARARG_BUFFER]]) +; ABI-NEXT: ret void +; +entry: + tail call void (...) @sink(i32 %i, i8 %x) + ret void +} + +define void @pass_int_s2(i32 %i, i16 %x) { +; OPT-LABEL: @pass_int_s2( +; OPT-NEXT: entry: +; OPT-NEXT: tail call void (...) 
@sink(i32 [[I:%.*]], i16 [[X:%.*]]) +; OPT-NEXT: ret void +; +; ABI-LABEL: @pass_int_s2( +; ABI-NEXT: entry: +; ABI-NEXT: [[VARARG_BUFFER:%.*]] = alloca [[PASS_INT_S2_VARARG:%.*]], align 16 +; ABI-NEXT: call void @llvm.lifetime.start.p0(i64 6, ptr [[VARARG_BUFFER]]) +; ABI-NEXT: [[TMP0:%.*]] = getelementptr inbounds [[PASS_INT_S2_VARARG]], ptr [[VARARG_BUFFER]], i32 0, i32 0 +; ABI-NEXT: store i32 [[I:%.*]], ptr [[TMP0]], align 4 +; ABI-NEXT: [[TMP1:%.*]] = getelementptr inbounds [[PASS_INT_S2_VARARG]], ptr [[VARARG_BUFFER]], i32 0, i32 1 +; ABI-NEXT: store i16 [[X:%.*]], ptr [[TMP1]], align 2 +; ABI-NEXT: call void @sink(ptr [[VARARG_BUFFER]]) +; ABI-NEXT: call void @llvm.lifetime.end.p0(i64 6, ptr [[VARARG_BUFFER]]) +; ABI-NEXT: ret void +; +entry: + tail call void (...) @sink(i32 %i, i16 %x) + ret void +} + +define void @pass_int_s3(i32 %i, i32 %x) { +; OPT-LABEL: @pass_int_s3( +; OPT-NEXT: entry: +; OPT-NEXT: tail call void (...) @sink(i32 [[I:%.*]], i32 [[X:%.*]]) +; OPT-NEXT: ret void +; +; ABI-LABEL: @pass_int_s3( +; ABI-NEXT: entry: +; ABI-NEXT: [[VARARG_BUFFER:%.*]] = alloca [[PASS_INT_S3_VARARG:%.*]], align 16 +; ABI-NEXT: call void @llvm.lifetime.start.p0(i64 8, ptr [[VARARG_BUFFER]]) +; ABI-NEXT: [[TMP0:%.*]] = getelementptr inbounds [[PASS_INT_S3_VARARG]], ptr [[VARARG_BUFFER]], i32 0, i32 0 +; ABI-NEXT: store i32 [[I:%.*]], ptr [[TMP0]], align 4 +; ABI-NEXT: [[TMP1:%.*]] = getelementptr inbounds [[PASS_INT_S3_VARARG]], ptr [[VARARG_BUFFER]], i32 0, i32 1 +; ABI-NEXT: store i32 [[X:%.*]], ptr [[TMP1]], align 4 +; ABI-NEXT: call void @sink(ptr [[VARARG_BUFFER]]) +; ABI-NEXT: call void @llvm.lifetime.end.p0(i64 8, ptr [[VARARG_BUFFER]]) +; ABI-NEXT: ret void +; +entry: + tail call void (...) @sink(i32 %i, i32 %x) + ret void +} + +define void @pass_int_s4(i32 %i, i64 %x) { +; OPT-LABEL: @pass_int_s4( +; OPT-NEXT: entry: +; OPT-NEXT: tail call void (...) 
@sink(i32 [[I:%.*]], i64 [[X:%.*]]) +; OPT-NEXT: ret void +; +; ABI-LABEL: @pass_int_s4( +; ABI-NEXT: entry: +; ABI-NEXT: [[VARARG_BUFFER:%.*]] = alloca [[PASS_INT_S4_VARARG:%.*]], align 16 +; ABI-NEXT: call void @llvm.lifetime.start.p0(i64 16, ptr [[VARARG_BUFFER]]) +; ABI-NEXT: [[TMP0:%.*]] = getelementptr inbounds [[PASS_INT_S4_VARARG]], ptr [[VARARG_BUFFER]], i32 0, i32 0 +; ABI-NEXT: store i32 [[I:%.*]], ptr [[TMP0]], align 4 +; ABI-NEXT: [[TMP1:%.*]] = getelementptr inbounds [[PASS_INT_S4_VARARG]], ptr [[VARARG_BUFFER]], i32 0, i32 2 +; ABI-NEXT: store i64 [[X:%.*]], ptr [[TMP1]], align 8 +; ABI-NEXT: call void @sink(ptr [[VARARG_BUFFER]]) +; ABI-NEXT: call void @llvm.lifetime.end.p0(i64 16, ptr [[VARARG_BUFFER]]) +; ABI-NEXT: ret void +; +entry: + tail call void (...) @sink(i32 %i, i64 %x) + ret void +} + +define void @pass_int_s5(i32 %i, <4 x i32> %x) { +; OPT-LABEL: @pass_int_s5( +; OPT-NEXT: entry: +; OPT-NEXT: tail call void (...) @sink(i32 [[I:%.*]], <4 x i32> [[X:%.*]]) +; OPT-NEXT: ret void +; +; ABI-LABEL: @pass_int_s5( +; ABI-NEXT: entry: +; ABI-NEXT: [[VARARG_BUFFER:%.*]] = alloca [[PASS_INT_S5_VARARG:%.*]], align 16 +; ABI-NEXT: call void @llvm.lifetime.start.p0(i64 32, ptr [[VARARG_BUFFER]]) +; ABI-NEXT: [[TMP0:%.*]] = getelementptr inbounds [[PASS_INT_S5_VARARG]], ptr [[VARARG_BUFFER]], i32 0, i32 0 +; ABI-NEXT: store i32 [[I:%.*]], ptr [[TMP0]], align 4 +; ABI-NEXT: [[TMP1:%.*]] = getelementptr inbounds [[PASS_INT_S5_VARARG]], ptr [[VARARG_BUFFER]], i32 0, i32 2 +; ABI-NEXT: store <4 x i32> [[X:%.*]], ptr [[TMP1]], align 16 +; ABI-NEXT: call void @sink(ptr [[VARARG_BUFFER]]) +; ABI-NEXT: call void @llvm.lifetime.end.p0(i64 32, ptr [[VARARG_BUFFER]]) +; ABI-NEXT: ret void +; +entry: + tail call void (...) @sink(i32 %i, <4 x i32> %x) + ret void +} + +define void @pass_asc(i8 %x1, i16 %x2, i32 %x3, i64 %x4, <4 x i32> %x5) { +; OPT-LABEL: @pass_asc( +; OPT-NEXT: entry: +; OPT-NEXT: tail call void (...) 
@sink(i8 [[X1:%.*]], i16 [[X2:%.*]], i32 [[X3:%.*]], i64 [[X4:%.*]], <4 x i32> [[X5:%.*]]) +; OPT-NEXT: ret void +; +; ABI-LABEL: @pass_asc( +; ABI-NEXT: entry: +; ABI-NEXT: [[VARARG_BUFFER:%.*]] = alloca [[PASS_ASC_VARARG:%.*]], align 16 +; ABI-NEXT: call void @llvm.lifetime.start.p0(i64 48, ptr [[VARARG_BUFFER]]) +; ABI-NEXT: [[TMP0:%.*]] = getelementptr inbounds [[PASS_ASC_VARARG]], ptr [[VARARG_BUFFER]], i32 0, i32 0 +; ABI-NEXT: store i8 [[X1:%.*]], ptr [[TMP0]], align 1 +; ABI-NEXT: [[TMP1:%.*]] = getelementptr inbounds [[PASS_ASC_VARARG]], ptr [[VARARG_BUFFER]], i32 0, i32 2 +; ABI-NEXT: store i16 [[X2:%.*]], ptr [[TMP1]], align 2 +; ABI-NEXT: [[TMP2:%.*]] = getelementptr inbounds [[PASS_ASC_VARARG]], ptr [[VARARG_BUFFER]], i32 0, i32 4 +; ABI-NEXT: store i32 [[X3:%.*]], ptr [[TMP2]], align 4 +; ABI-NEXT: [[TMP3:%.*]] = getelementptr inbounds [[PASS_ASC_VARARG]], ptr [[VARARG_BUFFER]], i32 0, i32 6 +; ABI-NEXT: store i64 [[X4:%.*]], ptr [[TMP3]], align 8 +; ABI-NEXT: [[TMP4:%.*]] = getelementptr inbounds [[PASS_ASC_VARARG]], ptr [[VARARG_BUFFER]], i32 0, i32 8 +; ABI-NEXT: store <4 x i32> [[X5:%.*]], ptr [[TMP4]], align 16 +; ABI-NEXT: call void @sink(ptr [[VARARG_BUFFER]]) +; ABI-NEXT: call void @llvm.lifetime.end.p0(i64 48, ptr [[VARARG_BUFFER]]) +; ABI-NEXT: ret void +; +entry: + tail call void (...) @sink(i8 %x1, i16 %x2, i32 %x3, i64 %x4, <4 x i32> %x5) + ret void +} + +define void @pass_dsc(<4 x i32> %x0, i64 %x1, i32 %x2, i16 %x3, i8 %x4) { +; OPT-LABEL: @pass_dsc( +; OPT-NEXT: entry: +; OPT-NEXT: tail call void (...) 
@sink(<4 x i32> [[X0:%.*]], i64 [[X1:%.*]], i32 [[X2:%.*]], i16 [[X3:%.*]], i8 [[X4:%.*]]) +; OPT-NEXT: ret void +; +; ABI-LABEL: @pass_dsc( +; ABI-NEXT: entry: +; ABI-NEXT: [[VARARG_BUFFER:%.*]] = alloca [[PASS_DSC_VARARG:%.*]], align 16 +; ABI-NEXT: call void @llvm.lifetime.start.p0(i64 33, ptr [[VARARG_BUFFER]]) +; ABI-NEXT: [[TMP0:%.*]] = getelementptr inbounds [[PASS_DSC_VARARG]], ptr [[VARARG_BUFFER]], i32 0, i32 0 +; ABI-NEXT: store <4 x i32> [[X0:%.*]], ptr [[TMP0]], align 16 +; ABI-NEXT: [[TMP1:%.*]] = getelementptr inbounds [[PASS_DSC_VARARG]], ptr [[VARARG_BUFFER]], i32 0, i32 1 +; ABI-NEXT: store i64 [[X1:%.*]], ptr [[TMP1]], align 8 +; ABI-NEXT: [[TMP2:%.*]] = getelementptr inbounds [[PASS_DSC_VARARG]], ptr [[VARARG_BUFFER]], i32 0, i32 2 +; ABI-NEXT: store i32 [[X2:%.*]], ptr [[TMP2]], align 4 +; ABI-NEXT: [[TMP3:%.*]] = getelementptr inbounds [[PASS_DSC_VARARG]], ptr [[VARARG_BUFFER]], i32 0, i32 3 +; ABI-NEXT: store i16 [[X3:%.*]], ptr [[TMP3]], align 2 +; ABI-NEXT: [[TMP4:%.*]] = getelementptr inbounds [[PASS_DSC_VARARG]], ptr [[VARARG_BUFFER]], i32 0, i32 5 +; ABI-NEXT: store i8 [[X4:%.*]], ptr [[TMP4]], align 1 +; ABI-NEXT: call void @sink(ptr [[VARARG_BUFFER]]) +; ABI-NEXT: call void @llvm.lifetime.end.p0(i64 33, ptr [[VARARG_BUFFER]]) +; ABI-NEXT: ret void +; +entry: + tail call void (...) @sink(<4 x i32> %x0, i64 %x1, i32 %x2, i16 %x3, i8 %x4) + ret void +} + +define void @pass_multiple(i32 %i, i8 %x1, i16 %x2, i32 %x3, i64 %x4, <4 x i32> %x5) { +; OPT-LABEL: @pass_multiple( +; OPT-NEXT: entry: +; OPT-NEXT: tail call void (...) @sink(i32 [[I:%.*]], i16 [[X2:%.*]], i64 [[X4:%.*]]) +; OPT-NEXT: tail call void (...) 
@sink(i32 [[I]], i8 [[X1:%.*]], i32 [[X3:%.*]], <4 x i32> [[X5:%.*]]) +; OPT-NEXT: ret void +; +; ABI-LABEL: @pass_multiple( +; ABI-NEXT: entry: +; ABI-NEXT: [[VARARG_BUFFER:%.*]] = alloca [[PASS_MULTIPLE_VARARG:%.*]], align 16 +; ABI-NEXT: [[VARARG_BUFFER1:%.*]] = alloca [[PASS_MULTIPLE_VARARG_0:%.*]], align 16 +; ABI-NEXT: call void @llvm.lifetime.start.p0(i64 16, ptr [[VARARG_BUFFER]]) +; ABI-NEXT: [[TMP0:%.*]] = getelementptr inbounds [[PASS_MULTIPLE_VARARG]], ptr [[VARARG_BUFFER]], i32 0, i32 0 +; ABI-NEXT: store i32 [[I:%.*]], ptr [[TMP0]], align 4 +; ABI-NEXT: [[TMP1:%.*]] = getelementptr inbounds [[PASS_MULTIPLE_VARARG]], ptr [[VARARG_BUFFER]], i32 0, i32 1 +; ABI-NEXT: store i16 [[X2:%.*]], ptr [[TMP1]], align 2 +; ABI-NEXT: [[TMP2:%.*]] = getelementptr inbounds [[PASS_MULTIPLE_VARARG]], ptr [[VARARG_BUFFER]], i32 0, i32 3 +; ABI-NEXT: store i64 [[X4:%.*]], ptr [[TMP2]], align 8 +; ABI-NEXT: call void @sink(ptr [[VARARG_BUFFER]]) +; ABI-NEXT: call void @llvm.lifetime.end.p0(i64 16, ptr [[VARARG_BUFFER]]) +; ABI-NEXT: call void @llvm.lifetime.start.p0(i64 32, ptr [[VARARG_BUFFER1]]) +; ABI-NEXT: [[TMP3:%.*]] = getelementptr inbounds [[PASS_MULTIPLE_VARARG_0]], ptr [[VARARG_BUFFER1]], i32 0, i32 0 +; ABI-NEXT: store i32 [[I]], ptr [[TMP3]], align 4 +; ABI-NEXT: [[TMP4:%.*]] = getelementptr inbounds [[PASS_MULTIPLE_VARARG_0]], ptr [[VARARG_BUFFER1]], i32 0, i32 1 +; ABI-NEXT: store i8 [[X1:%.*]], ptr [[TMP4]], align 1 +; ABI-NEXT: [[TMP5:%.*]] = getelementptr inbounds [[PASS_MULTIPLE_VARARG_0]], ptr [[VARARG_BUFFER1]], i32 0, i32 3 +; ABI-NEXT: store i32 [[X3:%.*]], ptr [[TMP5]], align 4 +; ABI-NEXT: [[TMP6:%.*]] = getelementptr inbounds [[PASS_MULTIPLE_VARARG_0]], ptr [[VARARG_BUFFER1]], i32 0, i32 5 +; ABI-NEXT: store <4 x i32> [[X5:%.*]], ptr [[TMP6]], align 16 +; ABI-NEXT: call void @sink(ptr [[VARARG_BUFFER1]]) +; ABI-NEXT: call void @llvm.lifetime.end.p0(i64 32, ptr [[VARARG_BUFFER1]]) +; ABI-NEXT: ret void +; +entry: + tail call void (...) 
@sink(i32 %i, i16 %x2, i64 %x4) + tail call void (...) @sink(i32 %i, i8 %x1, i32 %x3, <4 x i32> %x5) + ret void +} diff --git a/llvm/test/Transforms/InstCombine/canonicalize-constant-low-bit-mask-and-icmp-sge-to-icmp-sle.ll b/llvm/test/Transforms/InstCombine/canonicalize-constant-low-bit-mask-and-icmp-sge-to-icmp-sle.ll index ae503bf..e103fe9 100644 --- a/llvm/test/Transforms/InstCombine/canonicalize-constant-low-bit-mask-and-icmp-sge-to-icmp-sle.ll +++ b/llvm/test/Transforms/InstCombine/canonicalize-constant-low-bit-mask-and-icmp-sge-to-icmp-sle.ll @@ -98,8 +98,7 @@ declare i8 @gen8() define i1 @c0() { ; CHECK-LABEL: @c0( ; CHECK-NEXT: [[X:%.*]] = call i8 @gen8() -; CHECK-NEXT: [[TMP0:%.*]] = and i8 [[X]], 3 -; CHECK-NEXT: [[RET:%.*]] = icmp sge i8 [[X]], [[TMP0]] +; CHECK-NEXT: [[RET:%.*]] = icmp sgt i8 [[X]], -1 ; CHECK-NEXT: ret i1 [[RET]] ; %x = call i8 @gen8() diff --git a/llvm/test/Transforms/InstCombine/canonicalize-constant-low-bit-mask-and-icmp-sgt-to-icmp-sgt.ll b/llvm/test/Transforms/InstCombine/canonicalize-constant-low-bit-mask-and-icmp-sgt-to-icmp-sgt.ll index d1dd411..bbd733e 100644 --- a/llvm/test/Transforms/InstCombine/canonicalize-constant-low-bit-mask-and-icmp-sgt-to-icmp-sgt.ll +++ b/llvm/test/Transforms/InstCombine/canonicalize-constant-low-bit-mask-and-icmp-sgt-to-icmp-sgt.ll @@ -125,8 +125,7 @@ define i1 @oneuse0() { define i1 @c0(i8 %x) { ; CHECK-LABEL: @c0( -; CHECK-NEXT: [[TMP0:%.*]] = and i8 [[X:%.*]], 3 -; CHECK-NEXT: [[RET:%.*]] = icmp sgt i8 [[TMP0]], [[X]] +; CHECK-NEXT: [[RET:%.*]] = icmp slt i8 [[X:%.*]], 0 ; CHECK-NEXT: ret i1 [[RET]] ; %tmp0 = and i8 %x, 3 diff --git a/llvm/test/Transforms/InstCombine/canonicalize-constant-low-bit-mask-and-icmp-sle-to-icmp-sle.ll b/llvm/test/Transforms/InstCombine/canonicalize-constant-low-bit-mask-and-icmp-sle-to-icmp-sle.ll index 4bed21a..b167c8a 100644 --- a/llvm/test/Transforms/InstCombine/canonicalize-constant-low-bit-mask-and-icmp-sle-to-icmp-sle.ll +++ 
b/llvm/test/Transforms/InstCombine/canonicalize-constant-low-bit-mask-and-icmp-sle-to-icmp-sle.ll @@ -113,8 +113,7 @@ define i1 @oneuse0() { define i1 @c0(i8 %x) { ; CHECK-LABEL: @c0( -; CHECK-NEXT: [[TMP0:%.*]] = and i8 [[X:%.*]], 3 -; CHECK-NEXT: [[RET:%.*]] = icmp sle i8 [[TMP0]], [[X]] +; CHECK-NEXT: [[RET:%.*]] = icmp sgt i8 [[X:%.*]], -1 ; CHECK-NEXT: ret i1 [[RET]] ; %tmp0 = and i8 %x, 3 diff --git a/llvm/test/Transforms/InstCombine/canonicalize-constant-low-bit-mask-and-icmp-slt-to-icmp-sgt.ll b/llvm/test/Transforms/InstCombine/canonicalize-constant-low-bit-mask-and-icmp-slt-to-icmp-sgt.ll index 8415204..8281502 100644 --- a/llvm/test/Transforms/InstCombine/canonicalize-constant-low-bit-mask-and-icmp-slt-to-icmp-sgt.ll +++ b/llvm/test/Transforms/InstCombine/canonicalize-constant-low-bit-mask-and-icmp-slt-to-icmp-sgt.ll @@ -108,8 +108,7 @@ declare i8 @gen8() define i1 @c0() { ; CHECK-LABEL: @c0( ; CHECK-NEXT: [[X:%.*]] = call i8 @gen8() -; CHECK-NEXT: [[TMP0:%.*]] = and i8 [[X]], 3 -; CHECK-NEXT: [[RET:%.*]] = icmp slt i8 [[X]], [[TMP0]] +; CHECK-NEXT: [[RET:%.*]] = icmp slt i8 [[X]], 0 ; CHECK-NEXT: ret i1 [[RET]] ; %x = call i8 @gen8() diff --git a/llvm/test/Transforms/InstCombine/icmp-and-lowbit-mask.ll b/llvm/test/Transforms/InstCombine/icmp-and-lowbit-mask.ll index 8bb7fd0..0aace5f 100644 --- a/llvm/test/Transforms/InstCombine/icmp-and-lowbit-mask.ll +++ b/llvm/test/Transforms/InstCombine/icmp-and-lowbit-mask.ll @@ -7,8 +7,8 @@ define i1 @src_is_mask_zext(i16 %x_in, i8 %y) { ; CHECK-LABEL: @src_is_mask_zext( ; CHECK-NEXT: [[M_IN:%.*]] = lshr i8 -1, [[Y:%.*]] ; CHECK-NEXT: [[MASK:%.*]] = zext i8 [[M_IN]] to i16 -; CHECK-NEXT: [[X:%.*]] = xor i16 [[X_IN:%.*]], 123 -; CHECK-NEXT: [[R:%.*]] = icmp ule i16 [[X]], [[MASK]] +; CHECK-NEXT: [[TMP1:%.*]] = xor i16 [[X_IN:%.*]], 123 +; CHECK-NEXT: [[R:%.*]] = icmp ule i16 [[TMP1]], [[MASK]] ; CHECK-NEXT: ret i1 [[R]] ; %x = xor i16 %x_in, 123 @@ -83,8 +83,8 @@ define i1 @src_is_mask_and(i8 %x_in, i8 %y, i8 %z) { ; 
CHECK-NEXT: [[MY:%.*]] = lshr i8 7, [[Y:%.*]] ; CHECK-NEXT: [[MZ:%.*]] = lshr i8 -1, [[Z:%.*]] ; CHECK-NEXT: [[MASK:%.*]] = and i8 [[MY]], [[MZ]] -; CHECK-NEXT: [[X:%.*]] = xor i8 [[X_IN:%.*]], 123 -; CHECK-NEXT: [[R:%.*]] = icmp ule i8 [[X]], [[MASK]] +; CHECK-NEXT: [[TMP1:%.*]] = xor i8 [[X_IN:%.*]], 123 +; CHECK-NEXT: [[R:%.*]] = icmp ule i8 [[TMP1]], [[MASK]] ; CHECK-NEXT: ret i1 [[R]] ; %x = xor i8 %x_in, 123 @@ -121,8 +121,8 @@ define i1 @src_is_mask_or(i8 %x_in, i8 %y) { ; CHECK-LABEL: @src_is_mask_or( ; CHECK-NEXT: [[MY:%.*]] = lshr i8 -1, [[Y:%.*]] ; CHECK-NEXT: [[MASK:%.*]] = and i8 [[MY]], 7 -; CHECK-NEXT: [[X:%.*]] = xor i8 [[X_IN:%.*]], 123 -; CHECK-NEXT: [[R:%.*]] = icmp ule i8 [[X]], [[MASK]] +; CHECK-NEXT: [[TMP1:%.*]] = xor i8 [[X_IN:%.*]], 123 +; CHECK-NEXT: [[R:%.*]] = icmp ule i8 [[TMP1]], [[MASK]] ; CHECK-NEXT: ret i1 [[R]] ; %x = xor i8 %x_in, 123 @@ -138,8 +138,8 @@ define i1 @src_is_mask_xor(i8 %x_in, i8 %y) { ; CHECK-LABEL: @src_is_mask_xor( ; CHECK-NEXT: [[Y_M1:%.*]] = add i8 [[Y:%.*]], -1 ; CHECK-NEXT: [[MASK:%.*]] = xor i8 [[Y_M1]], [[Y]] -; CHECK-NEXT: [[X:%.*]] = xor i8 [[X_IN:%.*]], 123 -; CHECK-NEXT: [[R:%.*]] = icmp ugt i8 [[X]], [[MASK]] +; CHECK-NEXT: [[TMP1:%.*]] = xor i8 [[X_IN:%.*]], 123 +; CHECK-NEXT: [[R:%.*]] = icmp ugt i8 [[TMP1]], [[MASK]] ; CHECK-NEXT: ret i1 [[R]] ; %x = xor i8 %x_in, 123 @@ -173,8 +173,8 @@ define i1 @src_is_mask_select(i8 %x_in, i8 %y, i1 %cond) { ; CHECK-NEXT: [[Y_M1:%.*]] = add i8 [[Y:%.*]], -1 ; CHECK-NEXT: [[YMASK:%.*]] = xor i8 [[Y_M1]], [[Y]] ; CHECK-NEXT: [[MASK:%.*]] = select i1 [[COND:%.*]], i8 [[YMASK]], i8 15 -; CHECK-NEXT: [[X:%.*]] = xor i8 [[X_IN:%.*]], 123 -; CHECK-NEXT: [[R:%.*]] = icmp ugt i8 [[X]], [[MASK]] +; CHECK-NEXT: [[TMP1:%.*]] = xor i8 [[X_IN:%.*]], 123 +; CHECK-NEXT: [[R:%.*]] = icmp ugt i8 [[TMP1]], [[MASK]] ; CHECK-NEXT: ret i1 [[R]] ; %x = xor i8 %x_in, 123 @@ -249,8 +249,8 @@ define i1 @src_is_mask_lshr(i8 %x_in, i8 %y, i8 %z, i1 %cond) { ; CHECK-NEXT: [[YMASK:%.*]] = xor 
i8 [[Y_M1]], [[Y]] ; CHECK-NEXT: [[SMASK:%.*]] = select i1 [[COND:%.*]], i8 [[YMASK]], i8 15 ; CHECK-NEXT: [[MASK:%.*]] = lshr i8 [[SMASK]], [[Z:%.*]] -; CHECK-NEXT: [[X:%.*]] = xor i8 [[X_IN:%.*]], 123 -; CHECK-NEXT: [[R:%.*]] = icmp ugt i8 [[X]], [[MASK]] +; CHECK-NEXT: [[TMP1:%.*]] = xor i8 [[X_IN:%.*]], 123 +; CHECK-NEXT: [[R:%.*]] = icmp ugt i8 [[TMP1]], [[MASK]] ; CHECK-NEXT: ret i1 [[R]] ; %x = xor i8 %x_in, 123 @@ -269,8 +269,8 @@ define i1 @src_is_mask_ashr(i8 %x_in, i8 %y, i8 %z, i1 %cond) { ; CHECK-NEXT: [[YMASK:%.*]] = xor i8 [[Y_M1]], [[Y]] ; CHECK-NEXT: [[SMASK:%.*]] = select i1 [[COND:%.*]], i8 [[YMASK]], i8 15 ; CHECK-NEXT: [[MASK:%.*]] = ashr i8 [[SMASK]], [[Z:%.*]] -; CHECK-NEXT: [[X:%.*]] = xor i8 [[X_IN:%.*]], 123 -; CHECK-NEXT: [[R:%.*]] = icmp ugt i8 [[X]], [[MASK]] +; CHECK-NEXT: [[TMP1:%.*]] = xor i8 [[X_IN:%.*]], 123 +; CHECK-NEXT: [[R:%.*]] = icmp ugt i8 [[TMP1]], [[MASK]] ; CHECK-NEXT: ret i1 [[R]] ; %x = xor i8 %x_in, 123 @@ -287,8 +287,8 @@ define i1 @src_is_mask_p2_m1(i8 %x_in, i8 %y) { ; CHECK-LABEL: @src_is_mask_p2_m1( ; CHECK-NEXT: [[P2ORZ:%.*]] = shl i8 2, [[Y:%.*]] ; CHECK-NEXT: [[MASK:%.*]] = add i8 [[P2ORZ]], -1 -; CHECK-NEXT: [[X:%.*]] = xor i8 [[X_IN:%.*]], 123 -; CHECK-NEXT: [[R:%.*]] = icmp ugt i8 [[X]], [[MASK]] +; CHECK-NEXT: [[TMP1:%.*]] = xor i8 [[X_IN:%.*]], 123 +; CHECK-NEXT: [[R:%.*]] = icmp ugt i8 [[TMP1]], [[MASK]] ; CHECK-NEXT: ret i1 [[R]] ; %x = xor i8 %x_in, 123 @@ -304,8 +304,8 @@ define i1 @src_is_mask_umax(i8 %x_in, i8 %y) { ; CHECK-NEXT: [[Y_M1:%.*]] = add i8 [[Y:%.*]], -1 ; CHECK-NEXT: [[YMASK:%.*]] = xor i8 [[Y_M1]], [[Y]] ; CHECK-NEXT: [[MASK:%.*]] = call i8 @llvm.umax.i8(i8 [[YMASK]], i8 3) -; CHECK-NEXT: [[X:%.*]] = xor i8 [[X_IN:%.*]], 123 -; CHECK-NEXT: [[R:%.*]] = icmp ugt i8 [[X]], [[MASK]] +; CHECK-NEXT: [[TMP1:%.*]] = xor i8 [[X_IN:%.*]], 123 +; CHECK-NEXT: [[R:%.*]] = icmp ugt i8 [[TMP1]], [[MASK]] ; CHECK-NEXT: ret i1 [[R]] ; %x = xor i8 %x_in, 123 @@ -324,8 +324,8 @@ define i1 
@src_is_mask_umin(i8 %x_in, i8 %y, i8 %z) { ; CHECK-NEXT: [[YMASK:%.*]] = xor i8 [[Y_M1]], [[Y]] ; CHECK-NEXT: [[ZMASK:%.*]] = lshr i8 15, [[Z:%.*]] ; CHECK-NEXT: [[MASK:%.*]] = call i8 @llvm.umin.i8(i8 [[YMASK]], i8 [[ZMASK]]) -; CHECK-NEXT: [[X:%.*]] = xor i8 [[X_IN:%.*]], 123 -; CHECK-NEXT: [[R:%.*]] = icmp ugt i8 [[X]], [[MASK]] +; CHECK-NEXT: [[TMP1:%.*]] = xor i8 [[X_IN:%.*]], 123 +; CHECK-NEXT: [[R:%.*]] = icmp ugt i8 [[TMP1]], [[MASK]] ; CHECK-NEXT: ret i1 [[R]] ; %x = xor i8 %x_in, 123 @@ -364,8 +364,8 @@ define i1 @src_is_mask_smax(i8 %x_in, i8 %y) { ; CHECK-NEXT: [[Y_M1:%.*]] = add i8 [[Y:%.*]], -1 ; CHECK-NEXT: [[YMASK:%.*]] = xor i8 [[Y_M1]], [[Y]] ; CHECK-NEXT: [[MASK:%.*]] = call i8 @llvm.smax.i8(i8 [[YMASK]], i8 -1) -; CHECK-NEXT: [[X:%.*]] = xor i8 [[X_IN:%.*]], 123 -; CHECK-NEXT: [[R:%.*]] = icmp ule i8 [[X]], [[MASK]] +; CHECK-NEXT: [[TMP1:%.*]] = xor i8 [[X_IN:%.*]], 123 +; CHECK-NEXT: [[R:%.*]] = icmp ule i8 [[TMP1]], [[MASK]] ; CHECK-NEXT: ret i1 [[R]] ; %x = xor i8 %x_in, 123 @@ -383,8 +383,8 @@ define i1 @src_is_mask_smin(i8 %x_in, i8 %y) { ; CHECK-NEXT: [[Y_M1:%.*]] = add i8 [[Y:%.*]], -1 ; CHECK-NEXT: [[YMASK:%.*]] = xor i8 [[Y_M1]], [[Y]] ; CHECK-NEXT: [[MASK:%.*]] = call i8 @llvm.smin.i8(i8 [[YMASK]], i8 0) -; CHECK-NEXT: [[X:%.*]] = xor i8 [[X_IN:%.*]], 123 -; CHECK-NEXT: [[R:%.*]] = icmp ule i8 [[X]], [[MASK]] +; CHECK-NEXT: [[TMP1:%.*]] = xor i8 [[X_IN:%.*]], 123 +; CHECK-NEXT: [[R:%.*]] = icmp ule i8 [[TMP1]], [[MASK]] ; CHECK-NEXT: ret i1 [[R]] ; %x = xor i8 %x_in, 123 @@ -401,8 +401,8 @@ define i1 @src_is_mask_bitreverse_not_mask(i8 %x_in, i8 %y) { ; CHECK-LABEL: @src_is_mask_bitreverse_not_mask( ; CHECK-NEXT: [[NMASK:%.*]] = shl nsw i8 -1, [[Y:%.*]] ; CHECK-NEXT: [[MASK:%.*]] = call i8 @llvm.bitreverse.i8(i8 [[NMASK]]) -; CHECK-NEXT: [[X:%.*]] = xor i8 [[X_IN:%.*]], 123 -; CHECK-NEXT: [[R:%.*]] = icmp ule i8 [[X]], [[MASK]] +; CHECK-NEXT: [[TMP1:%.*]] = xor i8 [[X_IN:%.*]], 123 +; CHECK-NEXT: [[R:%.*]] = icmp ule i8 [[TMP1]], 
[[MASK]] ; CHECK-NEXT: ret i1 [[R]] ; %x = xor i8 %x_in, 123 @@ -455,9 +455,9 @@ define i1 @src_is_notmask_shl(i8 %x_in, i8 %y, i1 %cond) { define i1 @src_is_notmask_x_xor_neg_x(i8 %x_in, i8 %y, i1 %cond) { ; CHECK-LABEL: @src_is_notmask_x_xor_neg_x( ; CHECK-NEXT: [[X:%.*]] = xor i8 [[X_IN:%.*]], 123 -; CHECK-NEXT: [[NEG_Y:%.*]] = add i8 [[Y:%.*]], -1 -; CHECK-NEXT: [[NOTMASK0:%.*]] = xor i8 [[NEG_Y]], [[Y]] -; CHECK-NEXT: [[TMP3:%.*]] = select i1 [[COND:%.*]], i8 [[NOTMASK0]], i8 7 +; CHECK-NEXT: [[TMP1:%.*]] = add i8 [[Y:%.*]], -1 +; CHECK-NEXT: [[TMP2:%.*]] = xor i8 [[TMP1]], [[Y]] +; CHECK-NEXT: [[TMP3:%.*]] = select i1 [[COND:%.*]], i8 [[TMP2]], i8 7 ; CHECK-NEXT: [[R:%.*]] = icmp ule i8 [[X]], [[TMP3]] ; CHECK-NEXT: ret i1 [[R]] ; @@ -473,9 +473,9 @@ define i1 @src_is_notmask_x_xor_neg_x(i8 %x_in, i8 %y, i1 %cond) { define i1 @src_is_notmask_x_xor_neg_x_inv(i8 %x_in, i8 %y, i1 %cond) { ; CHECK-LABEL: @src_is_notmask_x_xor_neg_x_inv( ; CHECK-NEXT: [[X:%.*]] = xor i8 [[X_IN:%.*]], 123 -; CHECK-NEXT: [[NEG_Y:%.*]] = add i8 [[Y:%.*]], -1 -; CHECK-NEXT: [[NOTMASK0:%.*]] = xor i8 [[NEG_Y]], [[Y]] -; CHECK-NEXT: [[TMP3:%.*]] = select i1 [[COND:%.*]], i8 [[NOTMASK0]], i8 7 +; CHECK-NEXT: [[TMP1:%.*]] = add i8 [[Y:%.*]], -1 +; CHECK-NEXT: [[TMP2:%.*]] = xor i8 [[TMP1]], [[Y]] +; CHECK-NEXT: [[TMP3:%.*]] = select i1 [[COND:%.*]], i8 [[TMP2]], i8 7 ; CHECK-NEXT: [[R:%.*]] = icmp ule i8 [[X]], [[TMP3]] ; CHECK-NEXT: ret i1 [[R]] ; @@ -625,9 +625,7 @@ define i1 @src_is_notmask_xor_fail(i8 %x_in, i8 %y) { define i1 @src_is_mask_const_slt(i8 %x_in) { ; CHECK-LABEL: @src_is_mask_const_slt( -; CHECK-NEXT: [[X:%.*]] = xor i8 [[X_IN:%.*]], 123 -; CHECK-NEXT: [[AND:%.*]] = and i8 [[X]], 7 -; CHECK-NEXT: [[R:%.*]] = icmp slt i8 [[X]], [[AND]] +; CHECK-NEXT: [[R:%.*]] = icmp slt i8 [[X_IN:%.*]], 0 ; CHECK-NEXT: ret i1 [[R]] ; %x = xor i8 %x_in, 123 @@ -650,9 +648,7 @@ define i1 @src_is_mask_const_sgt(i8 %x_in) { define i1 @src_is_mask_const_sle(i8 %x_in) { ; CHECK-LABEL: 
@src_is_mask_const_sle( -; CHECK-NEXT: [[X:%.*]] = xor i8 [[X_IN:%.*]], 123 -; CHECK-NEXT: [[AND:%.*]] = and i8 [[X]], 31 -; CHECK-NEXT: [[R:%.*]] = icmp sle i8 [[AND]], [[X]] +; CHECK-NEXT: [[R:%.*]] = icmp sgt i8 [[X_IN:%.*]], -1 ; CHECK-NEXT: ret i1 [[R]] ; %x = xor i8 %x_in, 123 diff --git a/llvm/test/Transforms/InstCombine/icmp-of-and-x.ll b/llvm/test/Transforms/InstCombine/icmp-of-and-x.ll index 0f26be1..75badab 100644 --- a/llvm/test/Transforms/InstCombine/icmp-of-and-x.ll +++ b/llvm/test/Transforms/InstCombine/icmp-of-and-x.ll @@ -58,7 +58,7 @@ define i1 @icmp_sge_x_negy(i8 %x, i8 %y) { ; CHECK-NEXT: [[CY:%.*]] = icmp slt i8 [[Y:%.*]], 0 ; CHECK-NEXT: call void @llvm.assume(i1 [[CY]]) ; CHECK-NEXT: [[AND:%.*]] = and i8 [[X:%.*]], [[Y]] -; CHECK-NEXT: [[Z:%.*]] = icmp sge i8 [[AND]], [[X]] +; CHECK-NEXT: [[Z:%.*]] = icmp eq i8 [[AND]], [[X]] ; CHECK-NEXT: ret i1 [[Z]] ; %cy = icmp slt i8 %y, 0 @@ -74,7 +74,7 @@ define i1 @icmp_slt_x_negy(i8 %x, i8 %y) { ; CHECK-NEXT: br i1 [[CY]], label [[NEGY:%.*]], label [[POSY:%.*]] ; CHECK: negy: ; CHECK-NEXT: [[AND:%.*]] = and i8 [[X:%.*]], [[Y]] -; CHECK-NEXT: [[Z:%.*]] = icmp slt i8 [[AND]], [[X]] +; CHECK-NEXT: [[Z:%.*]] = icmp ne i8 [[AND]], [[X]] ; CHECK-NEXT: ret i1 [[Z]] ; CHECK: posy: ; CHECK-NEXT: [[R:%.*]] = call i1 @barrier() @@ -116,10 +116,7 @@ posy: define i1 @icmp_sle_x_negy(i8 %x, i8 %yy) { ; CHECK-LABEL: @icmp_sle_x_negy( -; CHECK-NEXT: [[Y:%.*]] = or i8 [[YY:%.*]], -128 -; CHECK-NEXT: [[AND:%.*]] = and i8 [[Y]], [[X:%.*]] -; CHECK-NEXT: [[Z:%.*]] = icmp sle i8 [[AND]], [[X]] -; CHECK-NEXT: ret i1 [[Z]] +; CHECK-NEXT: ret i1 true ; %y = or i8 %yy, 128 %and = and i8 %y, %x @@ -129,10 +126,7 @@ define i1 @icmp_sle_x_negy(i8 %x, i8 %yy) { define <2 x i1> @icmp_sgt_x_negy(<2 x i8> %x, <2 x i8> %yy) { ; CHECK-LABEL: @icmp_sgt_x_negy( -; CHECK-NEXT: [[Y:%.*]] = or <2 x i8> [[YY:%.*]], <i8 -128, i8 -128> -; CHECK-NEXT: [[AND:%.*]] = and <2 x i8> [[Y]], [[X:%.*]] -; CHECK-NEXT: [[Z:%.*]] = icmp sgt <2 x i8> 
[[AND]], [[X]] -; CHECK-NEXT: ret <2 x i1> [[Z]] +; CHECK-NEXT: ret <2 x i1> zeroinitializer ; %y = or <2 x i8> %yy, <i8 128, i8 128> %and = and <2 x i8> %y, %x @@ -155,9 +149,7 @@ define <2 x i1> @icmp_sgt_x_negy_fail_partial(<2 x i8> %x, <2 x i8> %yy) { define <2 x i1> @icmp_sle_x_posy(<2 x i8> %x, <2 x i8> %yy) { ; CHECK-LABEL: @icmp_sle_x_posy( -; CHECK-NEXT: [[Y:%.*]] = and <2 x i8> [[YY:%.*]], <i8 127, i8 127> -; CHECK-NEXT: [[AND:%.*]] = and <2 x i8> [[Y]], [[X:%.*]] -; CHECK-NEXT: [[Z:%.*]] = icmp sle <2 x i8> [[AND]], [[X]] +; CHECK-NEXT: [[Z:%.*]] = icmp sgt <2 x i8> [[X:%.*]], <i8 -1, i8 -1> ; CHECK-NEXT: ret <2 x i1> [[Z]] ; %y = and <2 x i8> %yy, <i8 127, i8 127> @@ -183,8 +175,7 @@ define i1 @icmp_sgt_x_posy(i8 %x, i8 %y) { ; CHECK-LABEL: @icmp_sgt_x_posy( ; CHECK-NEXT: [[CY:%.*]] = icmp sgt i8 [[Y:%.*]], -1 ; CHECK-NEXT: call void @llvm.assume(i1 [[CY]]) -; CHECK-NEXT: [[AND:%.*]] = and i8 [[X:%.*]], [[Y]] -; CHECK-NEXT: [[Z:%.*]] = icmp sgt i8 [[AND]], [[X]] +; CHECK-NEXT: [[Z:%.*]] = icmp slt i8 [[X:%.*]], 0 ; CHECK-NEXT: ret i1 [[Z]] ; %cy = icmp sge i8 %y, 0 @@ -196,9 +187,7 @@ define i1 @icmp_sgt_x_posy(i8 %x, i8 %y) { define <2 x i1> @icmp_sgt_negx_y(<2 x i8> %xx, <2 x i8> %y) { ; CHECK-LABEL: @icmp_sgt_negx_y( -; CHECK-NEXT: [[X:%.*]] = or <2 x i8> [[XX:%.*]], <i8 -128, i8 -128> -; CHECK-NEXT: [[AND:%.*]] = and <2 x i8> [[X]], [[Y:%.*]] -; CHECK-NEXT: [[Z:%.*]] = icmp sgt <2 x i8> [[AND]], [[X]] +; CHECK-NEXT: [[Z:%.*]] = icmp sgt <2 x i8> [[Y:%.*]], <i8 -1, i8 -1> ; CHECK-NEXT: ret <2 x i1> [[Z]] ; %x = or <2 x i8> %xx, <i8 128, i8 128> @@ -211,8 +200,7 @@ define i1 @icmp_sle_negx_y(i8 %x, i8 %y) { ; CHECK-LABEL: @icmp_sle_negx_y( ; CHECK-NEXT: [[CX:%.*]] = icmp slt i8 [[X:%.*]], 0 ; CHECK-NEXT: call void @llvm.assume(i1 [[CX]]) -; CHECK-NEXT: [[AND:%.*]] = and i8 [[X]], [[Y:%.*]] -; CHECK-NEXT: [[Z:%.*]] = icmp sle i8 [[AND]], [[X]] +; CHECK-NEXT: [[Z:%.*]] = icmp slt i8 [[Y:%.*]], 0 ; CHECK-NEXT: ret i1 [[Z]] ; %cx = icmp slt i8 %x, 0 @@ 
-239,9 +227,9 @@ define i1 @icmp_sle_negx_y_fail_maybe_zero(i8 %x, i8 %y) { define i1 @icmp_eq_x_invertable_y_todo(i8 %x, i1 %y) { ; CHECK-LABEL: @icmp_eq_x_invertable_y_todo( -; CHECK-NEXT: [[YY:%.*]] = select i1 [[Y:%.*]], i8 -8, i8 -25 -; CHECK-NEXT: [[AND:%.*]] = and i8 [[YY]], [[X:%.*]] -; CHECK-NEXT: [[R:%.*]] = icmp eq i8 [[AND]], 0 +; CHECK-NEXT: [[TMP1:%.*]] = select i1 [[Y:%.*]], i8 -8, i8 -25 +; CHECK-NEXT: [[TMP2:%.*]] = and i8 [[TMP1]], [[X:%.*]] +; CHECK-NEXT: [[R:%.*]] = icmp eq i8 [[TMP2]], 0 ; CHECK-NEXT: ret i1 [[R]] ; %yy = select i1 %y, i8 7, i8 24 @@ -252,8 +240,8 @@ define i1 @icmp_eq_x_invertable_y_todo(i8 %x, i1 %y) { define i1 @icmp_eq_x_invertable_y(i8 %x, i8 %y) { ; CHECK-LABEL: @icmp_eq_x_invertable_y( -; CHECK-NEXT: [[AND:%.*]] = and i8 [[YY:%.*]], [[X:%.*]] -; CHECK-NEXT: [[R:%.*]] = icmp eq i8 [[AND]], 0 +; CHECK-NEXT: [[TMP1:%.*]] = and i8 [[X:%.*]], [[Y:%.*]] +; CHECK-NEXT: [[R:%.*]] = icmp eq i8 [[TMP1]], 0 ; CHECK-NEXT: ret i1 [[R]] ; %yy = xor i8 %y, -1 diff --git a/llvm/test/Transforms/InstCombine/icmp-of-or-x.ll b/llvm/test/Transforms/InstCombine/icmp-of-or-x.ll index 26f53cb..3048746 100644 --- a/llvm/test/Transforms/InstCombine/icmp-of-or-x.ll +++ b/llvm/test/Transforms/InstCombine/icmp-of-or-x.ll @@ -373,3 +373,29 @@ define i1 @pr64610(ptr %b) { %r = icmp ugt i32 %or, %s ret i1 %r } + +define i1 @icmp_eq_x_invertable_y2_todo(i8 %x, i1 %y, i8 %z) { +; CHECK-LABEL: @icmp_eq_x_invertable_y2_todo( +; CHECK-NEXT: [[TMP1:%.*]] = select i1 [[Y:%.*]], i8 -8, i8 [[Z:%.*]] +; CHECK-NEXT: [[TMP2:%.*]] = and i8 [[TMP1]], [[X:%.*]] +; CHECK-NEXT: [[R:%.*]] = icmp eq i8 [[TMP2]], 0 +; CHECK-NEXT: ret i1 [[R]] +; + %zz = xor i8 %z, -1 + %yy = select i1 %y, i8 7, i8 %zz + %or = or i8 %x, %yy + %r = icmp eq i8 %yy, %or + ret i1 %r +} + +define i1 @icmp_eq_x_invertable_y2(i8 %x, i8 %y) { +; CHECK-LABEL: @icmp_eq_x_invertable_y2( +; CHECK-NEXT: [[TMP1:%.*]] = and i8 [[X:%.*]], [[Y:%.*]] +; CHECK-NEXT: [[R:%.*]] = icmp eq i8 [[TMP1]], 0 +; 
CHECK-NEXT: ret i1 [[R]] +; + %yy = xor i8 %y, -1 + %or = or i8 %x, %yy + %r = icmp eq i8 %yy, %or + ret i1 %r +} diff --git a/llvm/test/Transforms/InstCombine/select.ll b/llvm/test/Transforms/InstCombine/select.ll index d3cf2af..a0ee438 100644 --- a/llvm/test/Transforms/InstCombine/select.ll +++ b/llvm/test/Transforms/InstCombine/select.ll @@ -2787,11 +2787,23 @@ define <2 x i8> @select_replacement_add_eq_vec_nonuniform(<2 x i8> %x, <2 x i8> define <2 x i8> @select_replacement_add_eq_vec_poison(<2 x i8> %x, <2 x i8> %y) { ; CHECK-LABEL: @select_replacement_add_eq_vec_poison( ; CHECK-NEXT: [[CMP:%.*]] = icmp eq <2 x i8> [[X:%.*]], <i8 1, i8 poison> +; CHECK-NEXT: [[SEL:%.*]] = select <2 x i1> [[CMP]], <2 x i8> <i8 2, i8 poison>, <2 x i8> [[Y:%.*]] +; CHECK-NEXT: ret <2 x i8> [[SEL]] +; + %cmp = icmp eq <2 x i8> %x, <i8 1, i8 poison> + %add = add <2 x i8> %x, <i8 1, i8 1> + %sel = select <2 x i1> %cmp, <2 x i8> %add, <2 x i8> %y + ret <2 x i8> %sel +} + +define <2 x i8> @select_replacement_add_eq_vec_undef(<2 x i8> %x, <2 x i8> %y) { +; CHECK-LABEL: @select_replacement_add_eq_vec_undef( +; CHECK-NEXT: [[CMP:%.*]] = icmp eq <2 x i8> [[X:%.*]], <i8 1, i8 undef> ; CHECK-NEXT: [[ADD:%.*]] = add <2 x i8> [[X]], <i8 1, i8 1> ; CHECK-NEXT: [[SEL:%.*]] = select <2 x i1> [[CMP]], <2 x i8> [[ADD]], <2 x i8> [[Y:%.*]] ; CHECK-NEXT: ret <2 x i8> [[SEL]] ; - %cmp = icmp eq <2 x i8> %x, <i8 1, i8 poison> + %cmp = icmp eq <2 x i8> %x, <i8 1, i8 undef> %add = add <2 x i8> %x, <i8 1, i8 1> %sel = select <2 x i1> %cmp, <2 x i8> %add, <2 x i8> %y ret <2 x i8> %sel @@ -2835,6 +2847,20 @@ define i8 @select_replacement_sub_noundef(i8 %x, i8 noundef %y, i8 %z) { ret i8 %sel } +define i8 @select_replacement_sub_noundef_but_may_be_poison(i8 %x, i8 noundef %yy, i8 %z) { +; CHECK-LABEL: @select_replacement_sub_noundef_but_may_be_poison( +; CHECK-NEXT: [[Y:%.*]] = shl nuw i8 [[YY:%.*]], 1 +; CHECK-NEXT: [[CMP:%.*]] = icmp eq i8 [[Y]], [[X:%.*]] +; CHECK-NEXT: [[SEL:%.*]] = select i1 [[CMP]], 
i8 0, i8 [[Z:%.*]] +; CHECK-NEXT: ret i8 [[SEL]] +; + %y = shl nuw i8 %yy, 1 + %cmp = icmp eq i8 %x, %y + %sub = sub i8 %x, %y + %sel = select i1 %cmp, i8 %sub, i8 %z + ret i8 %sel +} + ; TODO: The transform is also safe without noundef. define i8 @select_replacement_sub(i8 %x, i8 %y, i8 %z) { ; CHECK-LABEL: @select_replacement_sub( diff --git a/llvm/test/Transforms/LoopStrengthReduce/AArch64/lsr-reuse.ll b/llvm/test/Transforms/LoopStrengthReduce/AArch64/lsr-reuse.ll index 323e242..64e8a6b 100644 --- a/llvm/test/Transforms/LoopStrengthReduce/AArch64/lsr-reuse.ll +++ b/llvm/test/Transforms/LoopStrengthReduce/AArch64/lsr-reuse.ll @@ -2,7 +2,7 @@ declare void @foo(i64) -; Verify that redundant adds aren't inserted by LSR. +; Verify that redundant adds or geps aren't inserted by LSR. ; CHECK-LABEL: @bar( define void @bar(ptr %A) { entry: @@ -10,9 +10,11 @@ entry: while.cond: ; CHECK-LABEL: while.cond: -; CHECK: add i64 %lsr.iv, 1 ; CHECK-NOT: add i64 %lsr.iv, 1 ; CHECK-LABEL: land.rhs: +; CHECK: getelementptr i8, ptr %lsr.iv, i64 -8 +; CHECK-NOT: getelementptr i8, ptr %lsr.iv, i64 -8 +; CHECK-NOT: add i64, %lsr.iv, 1 %indvars.iv28 = phi i64 [ %indvars.iv.next29, %land.rhs ], [ 50, %entry ] %cmp = icmp sgt i64 %indvars.iv28, 0 br i1 %cmp, label %land.rhs, label %while.end diff --git a/llvm/test/Transforms/LoopStrengthReduce/RISCV/many-geps.ll b/llvm/test/Transforms/LoopStrengthReduce/RISCV/many-geps.ll new file mode 100644 index 0000000..4914bb7 --- /dev/null +++ b/llvm/test/Transforms/LoopStrengthReduce/RISCV/many-geps.ll @@ -0,0 +1,109 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 +; RUN: opt < %s -loop-reduce -S | FileCheck %s + +target datalayout = "e-m:e-p:64:64-i64:64-i128:128-n64-S128" +target triple = "riscv64" + +; This test was added as example motivation for the changes in #89927, which +; causes LSR to drop solutions if deemed to be less profitable than the +; starting point. 
At the time of adding this test, LSR's search heuristics +; best identified solution was an unprofitable one. This could of course +; change with future LSR improvements. + +%struct = type { i64, i32, i32, i32, i32, i32, i32, i32, i32, i64, i32, i64, i64, i32, i64 } + +define i32 @main() { +; CHECK-LABEL: define i32 @main() { +; CHECK-NEXT: [[CALL:%.*]] = tail call ptr null(i64 0) +; CHECK-NEXT: br label %[[BB2:.*]] +; CHECK: [[BB1:.*:]] +; CHECK-NEXT: [[LOAD:%.*]] = load i32, ptr [[CALL]], align 4 +; CHECK-NEXT: ret i32 0 +; CHECK: [[BB2]]: +; CHECK-NEXT: [[LSR_IV30:%.*]] = phi i64 [ [[LSR_IV_NEXT31:%.*]], %[[BB2]] ], [ 8, [[BB:%.*]] ] +; CHECK-NEXT: [[LSR_IV27:%.*]] = phi i64 [ [[LSR_IV_NEXT28:%.*]], %[[BB2]] ], [ 12, [[BB]] ] +; CHECK-NEXT: [[LSR_IV24:%.*]] = phi i64 [ [[LSR_IV_NEXT25:%.*]], %[[BB2]] ], [ 16, [[BB]] ] +; CHECK-NEXT: [[LSR_IV21:%.*]] = phi i64 [ [[LSR_IV_NEXT22:%.*]], %[[BB2]] ], [ 20, [[BB]] ] +; CHECK-NEXT: [[LSR_IV18:%.*]] = phi i64 [ [[LSR_IV_NEXT19:%.*]], %[[BB2]] ], [ 24, [[BB]] ] +; CHECK-NEXT: [[LSR_IV15:%.*]] = phi i64 [ [[LSR_IV_NEXT16:%.*]], %[[BB2]] ], [ 28, [[BB]] ] +; CHECK-NEXT: [[LSR_IV12:%.*]] = phi i64 [ [[LSR_IV_NEXT13:%.*]], %[[BB2]] ], [ 32, [[BB]] ] +; CHECK-NEXT: [[LSR_IV9:%.*]] = phi i64 [ [[LSR_IV_NEXT10:%.*]], %[[BB2]] ], [ 36, [[BB]] ] +; CHECK-NEXT: [[LSR_IV4:%.*]] = phi i64 [ [[LSR_IV_NEXT5:%.*]], %[[BB2]] ], [ 40, [[BB]] ] +; CHECK-NEXT: [[LSR_IV1:%.*]] = phi i64 [ [[LSR_IV_NEXT2:%.*]], %[[BB2]] ], [ 48, [[BB]] ] +; CHECK-NEXT: [[LSR_IV:%.*]] = phi i64 [ [[LSR_IV_NEXT:%.*]], %[[BB2]] ], [ 72, [[BB]] ] +; CHECK-NEXT: [[SCEVGEP32:%.*]] = getelementptr i8, ptr [[CALL]], i64 [[LSR_IV30]] +; CHECK-NEXT: store i32 0, ptr [[SCEVGEP32]], align 8 +; CHECK-NEXT: [[SCEVGEP29:%.*]] = getelementptr i8, ptr [[CALL]], i64 [[LSR_IV27]] +; CHECK-NEXT: store i32 0, ptr [[SCEVGEP29]], align 4 +; CHECK-NEXT: [[SCEVGEP26:%.*]] = getelementptr i8, ptr [[CALL]], i64 [[LSR_IV24]] +; CHECK-NEXT: store i32 0, ptr [[SCEVGEP26]], align 8 +; 
CHECK-NEXT: [[SCEVGEP23:%.*]] = getelementptr i8, ptr [[CALL]], i64 [[LSR_IV21]] +; CHECK-NEXT: store i32 0, ptr [[SCEVGEP23]], align 4 +; CHECK-NEXT: [[SCEVGEP20:%.*]] = getelementptr i8, ptr [[CALL]], i64 [[LSR_IV18]] +; CHECK-NEXT: store i32 0, ptr [[SCEVGEP20]], align 8 +; CHECK-NEXT: [[SCEVGEP17:%.*]] = getelementptr i8, ptr [[CALL]], i64 [[LSR_IV15]] +; CHECK-NEXT: store i32 0, ptr [[SCEVGEP17]], align 4 +; CHECK-NEXT: [[SCEVGEP14:%.*]] = getelementptr i8, ptr [[CALL]], i64 [[LSR_IV12]] +; CHECK-NEXT: store i32 0, ptr [[SCEVGEP14]], align 8 +; CHECK-NEXT: [[SCEVGEP11:%.*]] = getelementptr i8, ptr [[CALL]], i64 [[LSR_IV9]] +; CHECK-NEXT: store i32 0, ptr [[SCEVGEP11]], align 4 +; CHECK-NEXT: [[SCEVGEP6:%.*]] = getelementptr i8, ptr [[CALL]], i64 [[LSR_IV4]] +; CHECK-NEXT: store i64 0, ptr [[SCEVGEP6]], align 8 +; CHECK-NEXT: [[SCEVGEP3:%.*]] = getelementptr i8, ptr [[CALL]], i64 [[LSR_IV1]] +; CHECK-NEXT: store i32 0, ptr [[SCEVGEP3]], align 8 +; CHECK-NEXT: [[SCEVGEP:%.*]] = getelementptr i8, ptr [[CALL]], i64 [[LSR_IV]] +; CHECK-NEXT: store i32 0, ptr [[SCEVGEP]], align 8 +; CHECK-NEXT: [[SCEVGEP7:%.*]] = getelementptr i8, ptr [[CALL]], i64 [[LSR_IV4]] +; CHECK-NEXT: [[SCEVGEP8:%.*]] = getelementptr i8, ptr [[SCEVGEP7]], i64 40 +; CHECK-NEXT: store i64 0, ptr [[SCEVGEP8]], align 8 +; CHECK-NEXT: [[LSR_IV_NEXT]] = add i64 [[LSR_IV]], 88 +; CHECK-NEXT: [[LSR_IV_NEXT2]] = add i64 [[LSR_IV1]], 88 +; CHECK-NEXT: [[LSR_IV_NEXT5]] = add i64 [[LSR_IV4]], 88 +; CHECK-NEXT: [[LSR_IV_NEXT10]] = add i64 [[LSR_IV9]], 88 +; CHECK-NEXT: [[LSR_IV_NEXT13]] = add i64 [[LSR_IV12]], 88 +; CHECK-NEXT: [[LSR_IV_NEXT16]] = add i64 [[LSR_IV15]], 88 +; CHECK-NEXT: [[LSR_IV_NEXT19]] = add i64 [[LSR_IV18]], 88 +; CHECK-NEXT: [[LSR_IV_NEXT22]] = add i64 [[LSR_IV21]], 88 +; CHECK-NEXT: [[LSR_IV_NEXT25]] = add i64 [[LSR_IV24]], 88 +; CHECK-NEXT: [[LSR_IV_NEXT28]] = add i64 [[LSR_IV27]], 88 +; CHECK-NEXT: [[LSR_IV_NEXT31]] = add i64 [[LSR_IV30]], 88 +; CHECK-NEXT: br label %[[BB2]] +; +0: 
+ %call = tail call ptr null(i64 0) + br label %2 + +1: + %load = load i32, ptr %call, align 4 + ret i32 0 + +2: + %phi = phi i64 [ 0, %0 ], [ %add, %2 ] + %getelementptr = getelementptr %struct, ptr %call, i64 %phi + %getelementptr3 = getelementptr i8, ptr %getelementptr, i64 8 + store i32 0, ptr %getelementptr3, align 8 + %getelementptr4 = getelementptr i8, ptr %getelementptr, i64 12 + store i32 0, ptr %getelementptr4, align 4 + %getelementptr5 = getelementptr i8, ptr %getelementptr, i64 16 + store i32 0, ptr %getelementptr5, align 8 + %getelementptr6 = getelementptr i8, ptr %getelementptr, i64 20 + store i32 0, ptr %getelementptr6, align 4 + %getelementptr7 = getelementptr i8, ptr %getelementptr, i64 24 + store i32 0, ptr %getelementptr7, align 8 + %getelementptr8 = getelementptr i8, ptr %getelementptr, i64 28 + store i32 0, ptr %getelementptr8, align 4 + %getelementptr9 = getelementptr i8, ptr %getelementptr, i64 32 + store i32 0, ptr %getelementptr9, align 8 + %getelementptr10 = getelementptr i8, ptr %getelementptr, i64 36 + store i32 0, ptr %getelementptr10, align 4 + %getelementptr11 = getelementptr i8, ptr %getelementptr, i64 40 + store i64 0, ptr %getelementptr11, align 8 + %getelementptr12 = getelementptr i8, ptr %getelementptr, i64 48 + store i32 0, ptr %getelementptr12, align 8 + %getelementptr13 = getelementptr i8, ptr %getelementptr, i64 72 + store i32 0, ptr %getelementptr13, align 8 + %getelementptr14 = getelementptr i8, ptr %getelementptr, i64 80 + store i64 0, ptr %getelementptr14, align 8 + %add = add i64 %phi, 1 + br label %2 +} diff --git a/llvm/test/Transforms/LoopUnroll/convergent.controlled.ll b/llvm/test/Transforms/LoopUnroll/convergent.controlled.ll new file mode 100644 index 0000000..7fd4eb1 --- /dev/null +++ b/llvm/test/Transforms/LoopUnroll/convergent.controlled.ll @@ -0,0 +1,562 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt < %s -passes=loop-unroll -unroll-runtime -unroll-allow-partial -S | 
FileCheck %s + +declare void @f() convergent +declare void @g() + +; Although this loop contains a convergent instruction, it should be +; fully unrolled. +define i32 @full_unroll() { +; CHECK-LABEL: @full_unroll( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[ANCHOR:%.*]] = call token @llvm.experimental.convergence.anchor() +; CHECK-NEXT: br label [[L3:%.*]] +; CHECK: l3: +; CHECK-NEXT: [[TOK_LOOP:%.*]] = call token @llvm.experimental.convergence.loop() [ "convergencectrl"(token [[ANCHOR]]) ] +; CHECK-NEXT: br label [[A:%.*]] +; CHECK: a: +; CHECK-NEXT: call void @f() [ "convergencectrl"(token [[TOK_LOOP]]) ] +; CHECK-NEXT: br label [[A_1:%.*]] +; CHECK: a.1: +; CHECK-NEXT: call void @f() [ "convergencectrl"(token [[TOK_LOOP]]) ] +; CHECK-NEXT: br label [[A_2:%.*]] +; CHECK: a.2: +; CHECK-NEXT: call void @f() [ "convergencectrl"(token [[TOK_LOOP]]) ] +; CHECK-NEXT: ret i32 0 +; +entry: + %anchor = call token @llvm.experimental.convergence.anchor() + br label %l3 + +l3: + %x.0 = phi i32 [ 0, %entry ], [ %inc, %a ] + %tok.loop = call token @llvm.experimental.convergence.loop() [ "convergencectrl"(token %anchor) ] + %inc = add nsw i32 %x.0, 1 + %exitcond = icmp eq i32 %inc, 3 + br label %a + +a: + call void @f() [ "convergencectrl"(token %tok.loop) ] + br i1 %exitcond, label %exit, label %l3 + +exit: + ret i32 0 +} + +; This loop contains a convergent instruction, but it should be partially +; unrolled. The unroll count is the largest power of 2 that divides the +; multiple -- 4, in this case. 
+define i32 @runtime_unroll(i32 %n) { +; CHECK-LABEL: @runtime_unroll( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[ANCHOR:%.*]] = call token @llvm.experimental.convergence.anchor() +; CHECK-NEXT: [[LOOP_CTL:%.*]] = mul nsw i32 [[N:%.*]], 12 +; CHECK-NEXT: br label [[L3:%.*]] +; CHECK: l3: +; CHECK-NEXT: [[X_0:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INC_3:%.*]], [[A_3:%.*]] ] +; CHECK-NEXT: [[TOK_LOOP:%.*]] = call token @llvm.experimental.convergence.loop() [ "convergencectrl"(token [[ANCHOR]]) ] +; CHECK-NEXT: br label [[A:%.*]] +; CHECK: a: +; CHECK-NEXT: call void @f() [ "convergencectrl"(token [[TOK_LOOP]]) ] +; CHECK-NEXT: br label [[A_1:%.*]] +; CHECK: a.1: +; CHECK-NEXT: call void @f() [ "convergencectrl"(token [[TOK_LOOP]]) ] +; CHECK-NEXT: br label [[A_2:%.*]] +; CHECK: a.2: +; CHECK-NEXT: call void @f() [ "convergencectrl"(token [[TOK_LOOP]]) ] +; CHECK-NEXT: br label [[A_3]] +; CHECK: a.3: +; CHECK-NEXT: call void @f() [ "convergencectrl"(token [[TOK_LOOP]]) ] +; CHECK-NEXT: [[INC_3]] = add nsw i32 [[X_0]], 4 +; CHECK-NEXT: [[EXITCOND_3:%.*]] = icmp eq i32 [[INC_3]], [[LOOP_CTL]] +; CHECK-NEXT: br i1 [[EXITCOND_3]], label [[EXIT:%.*]], label [[L3]] +; CHECK: exit: +; CHECK-NEXT: ret i32 0 +; +entry: + %anchor = call token @llvm.experimental.convergence.anchor() + %loop_ctl = mul nsw i32 %n, 12 + br label %l3 + +l3: + %x.0 = phi i32 [ 0, %entry ], [ %inc, %a ] + %tok.loop = call token @llvm.experimental.convergence.loop() [ "convergencectrl"(token %anchor) ] + br label %a + +a: + call void @f() [ "convergencectrl"(token %tok.loop) ] + %inc = add nsw i32 %x.0, 1 + %exitcond = icmp eq i32 %inc, %loop_ctl + br i1 %exitcond, label %exit, label %l3 + +exit: + ret i32 0 +} + +; This loop contains a convergent instruction, so its partial unroll +; count must divide its trip multiple. This overrides its unroll +; pragma -- we unroll exactly 8 times, even though 16 is requested. 
+define i32 @pragma_unroll(i32 %n) { +; CHECK-LABEL: @pragma_unroll( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[ANCHOR:%.*]] = call token @llvm.experimental.convergence.anchor() +; CHECK-NEXT: [[LOOP_CTL:%.*]] = mul nsw i32 [[N:%.*]], 24 +; CHECK-NEXT: br label [[L3:%.*]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK: l3: +; CHECK-NEXT: [[X_0:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INC_7:%.*]], [[A_7:%.*]] ] +; CHECK-NEXT: [[TOK_LOOP:%.*]] = call token @llvm.experimental.convergence.loop() [ "convergencectrl"(token [[ANCHOR]]) ] +; CHECK-NEXT: br label [[A:%.*]] +; CHECK: a: +; CHECK-NEXT: call void @f() [ "convergencectrl"(token [[TOK_LOOP]]) ] +; CHECK-NEXT: br label [[A_1:%.*]] +; CHECK: a.1: +; CHECK-NEXT: call void @f() [ "convergencectrl"(token [[TOK_LOOP]]) ] +; CHECK-NEXT: br label [[A_2:%.*]] +; CHECK: a.2: +; CHECK-NEXT: call void @f() [ "convergencectrl"(token [[TOK_LOOP]]) ] +; CHECK-NEXT: br label [[A_3:%.*]] +; CHECK: a.3: +; CHECK-NEXT: call void @f() [ "convergencectrl"(token [[TOK_LOOP]]) ] +; CHECK-NEXT: br label [[A_4:%.*]] +; CHECK: a.4: +; CHECK-NEXT: call void @f() [ "convergencectrl"(token [[TOK_LOOP]]) ] +; CHECK-NEXT: br label [[A_5:%.*]] +; CHECK: a.5: +; CHECK-NEXT: call void @f() [ "convergencectrl"(token [[TOK_LOOP]]) ] +; CHECK-NEXT: br label [[A_6:%.*]] +; CHECK: a.6: +; CHECK-NEXT: call void @f() [ "convergencectrl"(token [[TOK_LOOP]]) ] +; CHECK-NEXT: br label [[A_7]] +; CHECK: a.7: +; CHECK-NEXT: call void @f() [ "convergencectrl"(token [[TOK_LOOP]]) ] +; CHECK-NEXT: [[INC_7]] = add nsw i32 [[X_0]], 8 +; CHECK-NEXT: [[EXITCOND_7:%.*]] = icmp eq i32 [[INC_7]], [[LOOP_CTL]] +; CHECK-NEXT: br i1 [[EXITCOND_7]], label [[EXIT:%.*]], label [[L3]], !llvm.loop [[LOOP2:![0-9]+]] +; CHECK: exit: +; CHECK-NEXT: ret i32 0 +; +entry: + %anchor = call token @llvm.experimental.convergence.anchor() + %loop_ctl = mul nsw i32 %n, 24 + br label %l3, !llvm.loop !0 + +l3: + %x.0 = phi i32 [ 0, %entry ], [ %inc, %a ] + %tok.loop = call token 
@llvm.experimental.convergence.loop() [ "convergencectrl"(token %anchor) ] + br label %a + +a: + call void @f() [ "convergencectrl"(token %tok.loop) ] + %inc = add nsw i32 %x.0, 1 + %exitcond = icmp eq i32 %inc, %loop_ctl + br i1 %exitcond, label %exit, label %l3, !llvm.loop !0 + +exit: + ret i32 0 +} + +; This loop contains a convergent instruction. Since the pragma loop unroll +; count 2 divides trip count 4. The loop unroll should respect the pragma. +define void @pragma_unroll_divisible_trip_count() { +; CHECK-LABEL: @pragma_unroll_divisible_trip_count( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[ANCHOR:%.*]] = call token @llvm.experimental.convergence.anchor() +; CHECK-NEXT: br label [[L3:%.*]], !llvm.loop [[LOOP4:![0-9]+]] +; CHECK: l3: +; CHECK-NEXT: [[X_0:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INC_1:%.*]], [[L3]] ] +; CHECK-NEXT: [[TOK_LOOP:%.*]] = call token @llvm.experimental.convergence.loop() [ "convergencectrl"(token [[ANCHOR]]) ] +; CHECK-NEXT: call void @f() [ "convergencectrl"(token [[TOK_LOOP]]) ] +; CHECK-NEXT: call void @f() [ "convergencectrl"(token [[TOK_LOOP]]) ] +; CHECK-NEXT: [[INC_1]] = add nuw nsw i32 [[X_0]], 2 +; CHECK-NEXT: [[EXITCOND_1:%.*]] = icmp eq i32 [[INC_1]], 4 +; CHECK-NEXT: br i1 [[EXITCOND_1]], label [[EXIT:%.*]], label [[L3]], !llvm.loop [[LOOP6:![0-9]+]] +; CHECK: exit: +; CHECK-NEXT: ret void +; +entry: + %anchor = call token @llvm.experimental.convergence.anchor() + br label %l3, !llvm.loop !1 + +l3: + %x.0 = phi i32 [ 0, %entry ], [ %inc, %l3 ] + %tok.loop = call token @llvm.experimental.convergence.loop() [ "convergencectrl"(token %anchor) ] + call void @f() [ "convergencectrl"(token %tok.loop) ] + %inc = add nsw i32 %x.0, 1 + %exitcond = icmp eq i32 %inc, 4 + br i1 %exitcond, label %exit, label %l3, !llvm.loop !1 + +exit: + ret void +} + +; This loop contains a convergent instruction. Since the pragma loop unroll +; count 2 divides trip multiple 2. The loop unroll should respect the pragma. 
+define i32 @pragma_unroll_divisible_trip_multiple(i32 %n) { +; CHECK-LABEL: @pragma_unroll_divisible_trip_multiple( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[ANCHOR:%.*]] = call token @llvm.experimental.convergence.anchor() +; CHECK-NEXT: [[LOOP_CTL:%.*]] = mul nsw i32 [[N:%.*]], 2 +; CHECK-NEXT: br label [[L3:%.*]], !llvm.loop [[LOOP4]] +; CHECK: l3: +; CHECK-NEXT: [[X_0:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INC_1:%.*]], [[L3]] ] +; CHECK-NEXT: [[TOK_LOOP:%.*]] = call token @llvm.experimental.convergence.loop() [ "convergencectrl"(token [[ANCHOR]]) ] +; CHECK-NEXT: call void @f() [ "convergencectrl"(token [[TOK_LOOP]]) ] +; CHECK-NEXT: call void @f() [ "convergencectrl"(token [[TOK_LOOP]]) ] +; CHECK-NEXT: [[INC_1]] = add nsw i32 [[X_0]], 2 +; CHECK-NEXT: [[EXITCOND_1:%.*]] = icmp eq i32 [[INC_1]], [[LOOP_CTL]] +; CHECK-NEXT: br i1 [[EXITCOND_1]], label [[EXIT:%.*]], label [[L3]], !llvm.loop [[LOOP7:![0-9]+]] +; CHECK: exit: +; CHECK-NEXT: ret i32 0 +; +entry: + %anchor = call token @llvm.experimental.convergence.anchor() + %loop_ctl = mul nsw i32 %n, 2 + br label %l3, !llvm.loop !1 + +l3: + %x.0 = phi i32 [ 0, %entry ], [ %inc, %l3 ] + %tok.loop = call token @llvm.experimental.convergence.loop() [ "convergencectrl"(token %anchor) ] + call void @f() [ "convergencectrl"(token %tok.loop) ] + %inc = add nsw i32 %x.0, 1 + %exitcond = icmp eq i32 %inc, %loop_ctl + br i1 %exitcond, label %exit, label %l3, !llvm.loop !1 + +exit: + ret i32 0 +} + +; This loop contains a convergent instruction. Since the pragma loop unroll +; count 2 is unknown to divide runtime trip count, the loop is not unrolled +; since remainder is forbidden for unrolling convergent loop. 
+define i32 @pragma_unroll_indivisible_runtime_trip_count(i32 %n) { +; CHECK-LABEL: @pragma_unroll_indivisible_runtime_trip_count( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[ANCHOR:%.*]] = call token @llvm.experimental.convergence.anchor() +; CHECK-NEXT: br label [[L3:%.*]], !llvm.loop [[LOOP4]] +; CHECK: l3: +; CHECK-NEXT: [[X_0:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INC:%.*]], [[L3]] ] +; CHECK-NEXT: [[TOK_LOOP:%.*]] = call token @llvm.experimental.convergence.loop() [ "convergencectrl"(token [[ANCHOR]]) ] +; CHECK-NEXT: call void @f() [ "convergencectrl"(token [[TOK_LOOP]]) ] +; CHECK-NEXT: [[INC]] = add nsw i32 [[X_0]], 1 +; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[INC]], [[N:%.*]] +; CHECK-NEXT: br i1 [[EXITCOND]], label [[EXIT:%.*]], label [[L3]], !llvm.loop [[LOOP4]] +; CHECK: exit: +; CHECK-NEXT: ret i32 0 +; +entry: + %anchor = call token @llvm.experimental.convergence.anchor() + br label %l3, !llvm.loop !1 + +l3: + %x.0 = phi i32 [ 0, %entry ], [ %inc, %l3 ] + %tok.loop = call token @llvm.experimental.convergence.loop() [ "convergencectrl"(token %anchor) ] + call void @f() [ "convergencectrl"(token %tok.loop) ] + %inc = add nsw i32 %x.0, 1 + %exitcond = icmp eq i32 %inc, %n + br i1 %exitcond, label %exit, label %l3, !llvm.loop !1 + +exit: + ret i32 0 +} + +; This loop contains a convergent instruction. Since the pragma loop unroll +; count 2 does not divide trip count 5, the loop is not unrolled by 2 +; since remainder is forbidden for unrolling convergent loop. Instead, the +; loop gets fully unrolled. 
+define i32 @pragma_unroll_indivisible_trip_count() { +; CHECK-LABEL: @pragma_unroll_indivisible_trip_count( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[ANCHOR:%.*]] = call token @llvm.experimental.convergence.anchor() +; CHECK-NEXT: br label [[L3:%.*]], !llvm.loop [[LOOP4]] +; CHECK: l3: +; CHECK-NEXT: [[TOK_LOOP:%.*]] = call token @llvm.experimental.convergence.loop() [ "convergencectrl"(token [[ANCHOR]]) ] +; CHECK-NEXT: call void @f() [ "convergencectrl"(token [[TOK_LOOP]]) ] +; CHECK-NEXT: call void @f() [ "convergencectrl"(token [[TOK_LOOP]]) ] +; CHECK-NEXT: call void @f() [ "convergencectrl"(token [[TOK_LOOP]]) ] +; CHECK-NEXT: call void @f() [ "convergencectrl"(token [[TOK_LOOP]]) ] +; CHECK-NEXT: call void @f() [ "convergencectrl"(token [[TOK_LOOP]]) ] +; CHECK-NEXT: ret i32 0 +; +entry: + %anchor = call token @llvm.experimental.convergence.anchor() + br label %l3, !llvm.loop !1 + +l3: + %x.0 = phi i32 [ 0, %entry ], [ %inc, %l3 ] + %tok.loop = call token @llvm.experimental.convergence.loop() [ "convergencectrl"(token %anchor) ] + call void @f() [ "convergencectrl"(token %tok.loop) ] + %inc = add nsw i32 %x.0, 1 + %exitcond = icmp eq i32 %inc, 5 + br i1 %exitcond, label %exit, label %l3, !llvm.loop !1 + +exit: + ret i32 0 +} + +; This loop contains a convergent instruction that is anchored inside the loop +; itself. It is unrolled by 2 with remainder, as requested by the loop metadata. 
+define i32 @pragma_unroll_with_remainder(i32 %n) { +; CHECK-LABEL: @pragma_unroll_with_remainder( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP0:%.*]] = freeze i32 [[N:%.*]] +; CHECK-NEXT: [[TMP1:%.*]] = add i32 [[TMP0]], -1 +; CHECK-NEXT: [[XTRAITER:%.*]] = and i32 [[TMP0]], 1 +; CHECK-NEXT: [[TMP2:%.*]] = icmp ult i32 [[TMP1]], 1 +; CHECK-NEXT: br i1 [[TMP2]], label [[EXIT_UNR_LCSSA:%.*]], label [[ENTRY_NEW:%.*]] +; CHECK: entry.new: +; CHECK-NEXT: [[UNROLL_ITER:%.*]] = sub i32 [[TMP0]], [[XTRAITER]] +; CHECK-NEXT: br label [[L3:%.*]], !llvm.loop [[LOOP4]] +; CHECK: l3: +; CHECK-NEXT: [[X_0:%.*]] = phi i32 [ 0, [[ENTRY_NEW]] ], [ [[INC_1:%.*]], [[L3]] ] +; CHECK-NEXT: [[NITER:%.*]] = phi i32 [ 0, [[ENTRY_NEW]] ], [ [[NITER_NEXT_1:%.*]], [[L3]] ] +; CHECK-NEXT: [[TOK_LOOP:%.*]] = call token @llvm.experimental.convergence.anchor() +; CHECK-NEXT: call void @f() [ "convergencectrl"(token [[TOK_LOOP]]) ] +; CHECK-NEXT: [[TOK_LOOP_1:%.*]] = call token @llvm.experimental.convergence.anchor() +; CHECK-NEXT: call void @f() [ "convergencectrl"(token [[TOK_LOOP_1]]) ] +; CHECK-NEXT: [[INC_1]] = add nsw i32 [[X_0]], 2 +; CHECK-NEXT: [[NITER_NEXT_1]] = add i32 [[NITER]], 2 +; CHECK-NEXT: [[NITER_NCMP_1:%.*]] = icmp eq i32 [[NITER_NEXT_1]], [[UNROLL_ITER]] +; CHECK-NEXT: br i1 [[NITER_NCMP_1]], label [[EXIT_UNR_LCSSA_LOOPEXIT:%.*]], label [[L3]], !llvm.loop [[LOOP8:![0-9]+]] +; CHECK: exit.unr-lcssa.loopexit: +; CHECK-NEXT: br label [[EXIT_UNR_LCSSA]] +; CHECK: exit.unr-lcssa: +; CHECK-NEXT: [[LCMP_MOD:%.*]] = icmp ne i32 [[XTRAITER]], 0 +; CHECK-NEXT: br i1 [[LCMP_MOD]], label [[L3_EPIL_PREHEADER:%.*]], label [[EXIT:%.*]] +; CHECK: l3.epil.preheader: +; CHECK-NEXT: br label [[L3_EPIL:%.*]] +; CHECK: l3.epil: +; CHECK-NEXT: [[TOK_LOOP_EPIL:%.*]] = call token @llvm.experimental.convergence.anchor() +; CHECK-NEXT: call void @f() [ "convergencectrl"(token [[TOK_LOOP_EPIL]]) ] +; CHECK-NEXT: br label [[EXIT]] +; CHECK: exit: +; CHECK-NEXT: ret i32 0 +; +entry: + br label %l3, 
!llvm.loop !1 + +l3: + %x.0 = phi i32 [ 0, %entry ], [ %inc, %l3 ] + %tok.loop = call token @llvm.experimental.convergence.anchor() + call void @f() [ "convergencectrl"(token %tok.loop) ] + %inc = add nsw i32 %x.0, 1 + %exitcond = icmp eq i32 %inc, %n + br i1 %exitcond, label %exit, label %l3, !llvm.loop !1 + +exit: + ret i32 0 +} + +; Don't unroll a loop that is extended by convergence controls. +; +; We could theoretically duplicate the extension part, but this is not +; implemented. +define i32 @extended_loop(i32 %n) { +; CHECK-LABEL: @extended_loop( +; CHECK-NEXT: entry: +; CHECK-NEXT: br label [[L3:%.*]], !llvm.loop [[LOOP4]] +; CHECK: l3: +; CHECK-NEXT: [[X_0:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INC:%.*]], [[L3]] ] +; CHECK-NEXT: [[TOK_LOOP:%.*]] = call token @llvm.experimental.convergence.anchor() +; CHECK-NEXT: [[INC]] = add nsw i32 [[X_0]], 1 +; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[INC]], [[N:%.*]] +; CHECK-NEXT: br i1 [[EXITCOND]], label [[EXIT:%.*]], label [[L3]], !llvm.loop [[LOOP4]] +; CHECK: exit: +; CHECK-NEXT: call void @f() [ "convergencectrl"(token [[TOK_LOOP]]) ] +; CHECK-NEXT: ret i32 0 +; +entry: + br label %l3, !llvm.loop !1 + +l3: + %x.0 = phi i32 [ 0, %entry ], [ %inc, %l3 ] + %tok.loop = call token @llvm.experimental.convergence.anchor() + %inc = add nsw i32 %x.0, 1 + %exitcond = icmp eq i32 %inc, %n + br i1 %exitcond, label %exit, label %l3, !llvm.loop !1 + +exit: + call void @f() [ "convergencectrl"(token %tok.loop) ] + ret i32 0 +} + +; Inner loop is extended beyond the outer loop. No unrolling possible. 
+ +define i32 @extended_inner_loop_1(i32 %n, i1 %cond) { +; CHECK-LABEL: @extended_inner_loop_1( +; CHECK-NEXT: entry: +; CHECK-NEXT: br label [[L3:%.*]] +; CHECK: l3: +; CHECK-NEXT: [[X_0:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INC:%.*]], [[LATCH:%.*]] ] +; CHECK-NEXT: [[TOK_LOOP:%.*]] = call token @llvm.experimental.convergence.anchor() +; CHECK-NEXT: [[INC]] = add nsw i32 [[X_0]], 1 +; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[INC]], 4 +; CHECK-NEXT: br label [[L2:%.*]], !llvm.loop [[LOOP4]] +; CHECK: l2: +; CHECK-NEXT: [[TOK_L2:%.*]] = call token @llvm.experimental.convergence.anchor() +; CHECK-NEXT: call void @f() [ "convergencectrl"(token [[TOK_L2]]) ] +; CHECK-NEXT: br i1 [[COND:%.*]], label [[L2]], label [[LATCH]], !llvm.loop [[LOOP4]] +; CHECK: latch: +; CHECK-NEXT: br i1 [[EXITCOND]], label [[EXIT:%.*]], label [[L3]] +; CHECK: exit: +; CHECK-NEXT: call void @f() [ "convergencectrl"(token [[TOK_L2]]) ] +; CHECK-NEXT: ret i32 0 +; +entry: + br label %l3 + +l3: + %x.0 = phi i32 [ 0, %entry ], [ %inc, %latch ] + %tok.loop = call token @llvm.experimental.convergence.anchor() + %inc = add nsw i32 %x.0, 1 + %exitcond = icmp eq i32 %inc, 4 + br label %l2, !llvm.loop !1 + +l2: + %tok.l2 = call token @llvm.experimental.convergence.anchor() + call void @f() [ "convergencectrl"(token %tok.l2) ] + br i1 %cond, label %l2, label %latch, !llvm.loop !1 + +latch: + br i1 %exitcond, label %exit, label %l3 + +exit: + call void @f() [ "convergencectrl"(token %tok.l2) ] + ret i32 0 +} + +; Inner loop is extended inside the outer loop. Outer loop is unrolled. 
+ +define i32 @extended_inner_loop_2(i32 %n, i1 %cond) { +; CHECK-LABEL: @extended_inner_loop_2( +; CHECK-NEXT: entry: +; CHECK-NEXT: br label [[L3:%.*]] +; CHECK: l3: +; CHECK-NEXT: br label [[L2:%.*]], !llvm.loop [[LOOP4]] +; CHECK: l2: +; CHECK-NEXT: [[TOK_L2:%.*]] = call token @llvm.experimental.convergence.anchor() +; CHECK-NEXT: call void @f() [ "convergencectrl"(token [[TOK_L2]]) ] +; CHECK-NEXT: br i1 [[COND:%.*]], label [[L2]], label [[LATCH:%.*]], !llvm.loop [[LOOP4]] +; CHECK: latch: +; CHECK-NEXT: call void @f() [ "convergencectrl"(token [[TOK_L2]]) ] +; CHECK-NEXT: br label [[L2_1:%.*]], !llvm.loop [[LOOP4]] +; CHECK: l2.1: +; CHECK-NEXT: [[TOK_L2_1:%.*]] = call token @llvm.experimental.convergence.anchor() +; CHECK-NEXT: call void @f() [ "convergencectrl"(token [[TOK_L2_1]]) ] +; CHECK-NEXT: br i1 [[COND]], label [[L2_1]], label [[LATCH_1:%.*]], !llvm.loop [[LOOP4]] +; CHECK: latch.1: +; CHECK-NEXT: call void @f() [ "convergencectrl"(token [[TOK_L2_1]]) ] +; CHECK-NEXT: br label [[L2_2:%.*]], !llvm.loop [[LOOP4]] +; CHECK: l2.2: +; CHECK-NEXT: [[TOK_L2_2:%.*]] = call token @llvm.experimental.convergence.anchor() +; CHECK-NEXT: call void @f() [ "convergencectrl"(token [[TOK_L2_2]]) ] +; CHECK-NEXT: br i1 [[COND]], label [[L2_2]], label [[LATCH_2:%.*]], !llvm.loop [[LOOP4]] +; CHECK: latch.2: +; CHECK-NEXT: call void @f() [ "convergencectrl"(token [[TOK_L2_2]]) ] +; CHECK-NEXT: br label [[L2_3:%.*]], !llvm.loop [[LOOP4]] +; CHECK: l2.3: +; CHECK-NEXT: [[TOK_L2_3:%.*]] = call token @llvm.experimental.convergence.anchor() +; CHECK-NEXT: call void @f() [ "convergencectrl"(token [[TOK_L2_3]]) ] +; CHECK-NEXT: br i1 [[COND]], label [[L2_3]], label [[LATCH_3:%.*]], !llvm.loop [[LOOP4]] +; CHECK: latch.3: +; CHECK-NEXT: call void @f() [ "convergencectrl"(token [[TOK_L2_3]]) ] +; CHECK-NEXT: ret i32 0 +; +entry: + br label %l3 + +l3: + %x.0 = phi i32 [ 0, %entry ], [ %inc, %latch ] + %tok.loop = call token @llvm.experimental.convergence.anchor() + %inc = add 
nsw i32 %x.0, 1 + %exitcond = icmp eq i32 %inc, 4 + br label %l2, !llvm.loop !1 + +l2: + %tok.l2 = call token @llvm.experimental.convergence.anchor() + call void @f() [ "convergencectrl"(token %tok.l2) ] + br i1 %cond, label %l2, label %latch, !llvm.loop !1 + +latch: + call void @f() [ "convergencectrl"(token %tok.l2) ] + br i1 %exitcond, label %exit, label %l3 + +exit: + ret i32 0 +} + +; No extension. Both loops unrolled. + +define i32 @unroll_nest(i32 %n, i1 %cond) { +; CHECK-LABEL: @unroll_nest( +; CHECK-NEXT: entry: +; CHECK-NEXT: br label [[L3:%.*]] +; CHECK: l3: +; CHECK-NEXT: br label [[L2:%.*]], !llvm.loop [[LOOP4]] +; CHECK: l2: +; CHECK-NEXT: [[TOK_L2:%.*]] = call token @llvm.experimental.convergence.anchor() +; CHECK-NEXT: call void @f() [ "convergencectrl"(token [[TOK_L2]]) ] +; CHECK-NEXT: br i1 [[COND:%.*]], label [[L2_1:%.*]], label [[LATCH:%.*]], !llvm.loop [[LOOP4]] +; CHECK: l2.1: +; CHECK-NEXT: [[TOK_L2_1:%.*]] = call token @llvm.experimental.convergence.anchor() +; CHECK-NEXT: call void @f() [ "convergencectrl"(token [[TOK_L2_1]]) ] +; CHECK-NEXT: br i1 [[COND]], label [[L2]], label [[LATCH]], !llvm.loop [[LOOP9:![0-9]+]] +; CHECK: latch: +; CHECK-NEXT: br label [[L2_12:%.*]], !llvm.loop [[LOOP4]] +; CHECK: l2.12: +; CHECK-NEXT: [[TOK_L2_11:%.*]] = call token @llvm.experimental.convergence.anchor() +; CHECK-NEXT: call void @f() [ "convergencectrl"(token [[TOK_L2_11]]) ] +; CHECK-NEXT: br i1 [[COND]], label [[L2_1_1:%.*]], label [[LATCH_1:%.*]], !llvm.loop [[LOOP4]] +; CHECK: l2.1.1: +; CHECK-NEXT: [[TOK_L2_1_1:%.*]] = call token @llvm.experimental.convergence.anchor() +; CHECK-NEXT: call void @f() [ "convergencectrl"(token [[TOK_L2_1_1]]) ] +; CHECK-NEXT: br i1 [[COND]], label [[L2_12]], label [[LATCH_1]], !llvm.loop [[LOOP9]] +; CHECK: latch.1: +; CHECK-NEXT: br label [[L2_2:%.*]], !llvm.loop [[LOOP4]] +; CHECK: l2.2: +; CHECK-NEXT: [[TOK_L2_2:%.*]] = call token @llvm.experimental.convergence.anchor() +; CHECK-NEXT: call void @f() [ 
"convergencectrl"(token [[TOK_L2_2]]) ] +; CHECK-NEXT: br i1 [[COND]], label [[L2_1_2:%.*]], label [[LATCH_2:%.*]], !llvm.loop [[LOOP4]] +; CHECK: l2.1.2: +; CHECK-NEXT: [[TOK_L2_1_2:%.*]] = call token @llvm.experimental.convergence.anchor() +; CHECK-NEXT: call void @f() [ "convergencectrl"(token [[TOK_L2_1_2]]) ] +; CHECK-NEXT: br i1 [[COND]], label [[L2_2]], label [[LATCH_2]], !llvm.loop [[LOOP9]] +; CHECK: latch.2: +; CHECK-NEXT: br label [[L2_3:%.*]], !llvm.loop [[LOOP4]] +; CHECK: l2.3: +; CHECK-NEXT: [[TOK_L2_3:%.*]] = call token @llvm.experimental.convergence.anchor() +; CHECK-NEXT: call void @f() [ "convergencectrl"(token [[TOK_L2_3]]) ] +; CHECK-NEXT: br i1 [[COND]], label [[L2_1_3:%.*]], label [[LATCH_3:%.*]], !llvm.loop [[LOOP4]] +; CHECK: l2.1.3: +; CHECK-NEXT: [[TOK_L2_1_3:%.*]] = call token @llvm.experimental.convergence.anchor() +; CHECK-NEXT: call void @f() [ "convergencectrl"(token [[TOK_L2_1_3]]) ] +; CHECK-NEXT: br i1 [[COND]], label [[L2_3]], label [[LATCH_3]], !llvm.loop [[LOOP9]] +; CHECK: latch.3: +; CHECK-NEXT: ret i32 0 +; +entry: + br label %l3 + +l3: + %x.0 = phi i32 [ 0, %entry ], [ %inc, %latch ] + %tok.loop = call token @llvm.experimental.convergence.anchor() + %inc = add nsw i32 %x.0, 1 + %exitcond = icmp eq i32 %inc, 4 + br label %l2, !llvm.loop !1 + +l2: + %tok.l2 = call token @llvm.experimental.convergence.anchor() + call void @f() [ "convergencectrl"(token %tok.l2) ] + br i1 %cond, label %l2, label %latch, !llvm.loop !1 + +latch: + br i1 %exitcond, label %exit, label %l3 + +exit: + ret i32 0 +} + +declare token @llvm.experimental.convergence.anchor() +declare token @llvm.experimental.convergence.loop() + +!0 = !{!0, !{!"llvm.loop.unroll.count", i32 16}} +!1 = !{!1, !{!"llvm.loop.unroll.count", i32 2}} diff --git a/llvm/test/Transforms/LoopVectorize/X86/amdlibm-calls.ll b/llvm/test/Transforms/LoopVectorize/X86/amdlibm-calls.ll index 8d2820a..1627292 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/amdlibm-calls.ll +++ 
b/llvm/test/Transforms/LoopVectorize/X86/amdlibm-calls.ll @@ -15,6 +15,11 @@ declare float @cosf(float) #0 declare double @llvm.cos.f64(double) #0 declare float @llvm.cos.f32(float) #0 +declare double @tan(double) #0 +declare float @tanf(float) #0 +declare double @llvm.tan.f64(double) #0 +declare float @llvm.tan.f32(float) #0 + declare double @pow(double, double) #0 declare float @powf(float, float) #0 declare double @llvm.pow.f64(double, double) #0 @@ -264,6 +269,114 @@ for.end: ret void } +define void @tan_f64(ptr nocapture %varray) { +; CHECK-LABEL: @tan_f64( +; CHECK: [[TMP5:%.*]] = call <4 x double> @amd_vrd4_tan(<4 x double> [[TMP4:%.*]]) +; CHECK: ret void +; +; CHECK-AVX512-VF8-LABEL: @tan_f64( +; CHECK-AVX512-VF8: [[TMP5:%.*]] = call <8 x double> @amd_vrd8_tan(<8 x double> [[TMP4:%.*]]) +; CHECK-AVX512-VF8: ret void +; +entry: + br label %for.body + +for.body: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] + %tmp = trunc i64 %iv to i32 + %conv = sitofp i32 %tmp to double + %call = tail call double @tan(double %conv) + %arrayidx = getelementptr inbounds double, ptr %varray, i64 %iv + store double %call, ptr %arrayidx, align 4 + %iv.next = add nuw nsw i64 %iv, 1 + %exitcond = icmp eq i64 %iv.next, 1000 + br i1 %exitcond, label %for.end, label %for.body + +for.end: + ret void +} + +define void @tan_f32(ptr nocapture %varray) { +; CHECK-LABEL: @tan_f32( +; CHECK: [[TMP5:%.*]] = call <4 x float> @amd_vrs4_tanf(<4 x float> [[TMP4:%.*]]) +; CHECK: ret void +; +; CHECK-AVX512-VF16-LABEL: @tan_f32( +; CHECK-AVX512-VF16: [[TMP5:%.*]] = call <16 x float> @amd_vrs16_tanf(<16 x float> [[TMP4:%.*]]) +; CHECK-AVX512-VF16: ret void +; +entry: + br label %for.body + +for.body: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] + %tmp = trunc i64 %iv to i32 + %conv = sitofp i32 %tmp to float + %call = tail call float @tanf(float %conv) + %arrayidx = getelementptr inbounds float, ptr %varray, i64 %iv + store float %call, ptr %arrayidx, align 4 + %iv.next = add 
nuw nsw i64 %iv, 1 + %exitcond = icmp eq i64 %iv.next, 1000 + br i1 %exitcond, label %for.end, label %for.body + +for.end: + ret void +} + +define void @tan_f64_intrinsic(ptr nocapture %varray) { +; CHECK-LABEL: @tan_f64_intrinsic( +; CHECK: [[TMP5:%.*]] = call <4 x double> @amd_vrd4_tan(<4 x double> [[TMP4:%.*]]) +; CHECK: ret void +; +; CHECK-AVX512-VF8-LABEL: @tan_f64_intrinsic( +; CHECK-AVX512-VF8: [[TMP5:%.*]] = call <8 x double> @amd_vrd8_tan(<8 x double> [[TMP4:%.*]]) +; CHECK-AVX512-VF8: ret void +; +entry: + br label %for.body + +for.body: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] + %tmp = trunc i64 %iv to i32 + %conv = sitofp i32 %tmp to double + %call = tail call double @llvm.tan.f64(double %conv) + %arrayidx = getelementptr inbounds double, ptr %varray, i64 %iv + store double %call, ptr %arrayidx, align 4 + %iv.next = add nuw nsw i64 %iv, 1 + %exitcond = icmp eq i64 %iv.next, 1000 + br i1 %exitcond, label %for.end, label %for.body + +for.end: + ret void +} + +define void @tan_f32_intrinsic(ptr nocapture %varray) { +; CHECK-LABEL: @tan_f32_intrinsic( +; CHECK: [[TMP5:%.*]] = call <4 x float> @amd_vrs4_tanf(<4 x float> [[TMP4:%.*]]) +; CHECK: ret void +; +; CHECK-AVX512-VF16-LABEL: @tan_f32_intrinsic( +; CHECK-AVX512-VF16: [[TMP5:%.*]] = call <16 x float> @amd_vrs16_tanf(<16 x float> [[TMP4:%.*]]) +; CHECK-AVX512-VF16: ret void +; +entry: + br label %for.body + +for.body: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] + %tmp = trunc i64 %iv to i32 + %conv = sitofp i32 %tmp to float + %call = tail call float @llvm.tan.f32(float %conv) + %arrayidx = getelementptr inbounds float, ptr %varray, i64 %iv + store float %call, ptr %arrayidx, align 4 + %iv.next = add nuw nsw i64 %iv, 1 + %exitcond = icmp eq i64 %iv.next, 1000 + br i1 %exitcond, label %for.end, label %for.body + +for.end: + ret void +} + define void @pow_f64(ptr nocapture %varray, ptr nocapture readonly %exp) { ; CHECK-LABEL: @pow_f64( ; CHECK: [[TMP8:%.*]] = call <4 x double> 
@amd_vrd4_pow(<4 x double> [[TMP4:%.*]], <4 x double> [[WIDE_LOAD:%.*]]) diff --git a/llvm/test/Transforms/LoopVectorize/X86/libm-vector-calls-VF2-VF8.ll b/llvm/test/Transforms/LoopVectorize/X86/libm-vector-calls-VF2-VF8.ll index 038852f..67a2cf2 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/libm-vector-calls-VF2-VF8.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/libm-vector-calls-VF2-VF8.ll @@ -356,6 +356,117 @@ for.end: ; preds = %for.body !132 = !{!"llvm.loop.vectorize.width", i32 8} !133 = !{!"llvm.loop.vectorize.enable", i1 true} +define void @tan_f64(ptr nocapture %varray) { +; CHECK-LABEL: @tan_f64( +; CHECK-LABEL: vector.body +; CHECK: [[TMP5:%.*]] = call <2 x double> @_ZGVbN2v_tan(<2 x double> [[TMP4:%.*]]) +; +entry: + br label %for.body + +for.body: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] + %tmp = trunc i64 %iv to i32 + %conv = sitofp i32 %tmp to double + %call = tail call double @tan(double %conv) + %arrayidx = getelementptr inbounds double, ptr %varray, i64 %iv + store double %call, ptr %arrayidx, align 4 + %iv.next = add nuw nsw i64 %iv, 1 + %exitcond = icmp eq i64 %iv.next, 1000 + br i1 %exitcond, label %for.end, label %for.body, !llvm.loop !1 + +for.end: + ret void +} + +!141 = distinct !{!141, !142, !143} +!142 = !{!"llvm.loop.vectorize.width", i32 2} +!143 = !{!"llvm.loop.vectorize.enable", i1 true} + + +define void @tan_f32(ptr nocapture %varray) { +; CHECK-LABEL: @tan_f32( +; CHECK-LABEL: vector.body +; CHECK: [[TMP5:%.*]] = call <8 x float> @_ZGVdN8v_tanf(<8 x float> [[TMP4:%.*]]) +; +entry: + br label %for.body + +for.body: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] + %tmp = trunc i64 %iv to i32 + %conv = sitofp i32 %tmp to float + %call = tail call float @tanf(float %conv) + %arrayidx = getelementptr inbounds float, ptr %varray, i64 %iv + store float %call, ptr %arrayidx, align 4 + %iv.next = add nuw nsw i64 %iv, 1 + %exitcond = icmp eq i64 %iv.next, 1000 + br i1 %exitcond, label %for.end, label %for.body, 
!llvm.loop !21 + +for.end: + ret void +} + +!151 = distinct !{!151, !152, !153} +!152 = !{!"llvm.loop.vectorize.width", i32 8} +!153 = !{!"llvm.loop.vectorize.enable", i1 true} + +define void @tan_f64_intrinsic(ptr nocapture %varray) { +; CHECK-LABEL: @tan_f64_intrinsic( +; CHECK-LABEL: vector.body +; CHECK: [[TMP5:%.*]] = call <2 x double> @_ZGVbN2v_tan(<2 x double> [[TMP4:%.*]]) +; +entry: + br label %for.body + +for.body: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] + %tmp = trunc i64 %iv to i32 + %conv = sitofp i32 %tmp to double + %call = tail call double @llvm.tan.f64(double %conv) + %arrayidx = getelementptr inbounds double, ptr %varray, i64 %iv + store double %call, ptr %arrayidx, align 4 + %iv.next = add nuw nsw i64 %iv, 1 + %exitcond = icmp eq i64 %iv.next, 1000 + br i1 %exitcond, label %for.end, label %for.body, !llvm.loop !31 + +for.end: + ret void +} + +!161 = distinct !{!161, !162, !163} +!162 = !{!"llvm.loop.vectorize.width", i32 2} +!163 = !{!"llvm.loop.vectorize.enable", i1 true} + +define void @tan_f32_intrinsic(ptr nocapture %varray) { +; CHECK-LABEL: @tan_f32_intrinsic( +; CHECK-LABEL: vector.body +; CHECK: [[TMP5:%.*]] = call <8 x float> @_ZGVdN8v_tanf(<8 x float> [[TMP4:%.*]]) +; +entry: + br label %for.body + +for.body: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] + %tmp = trunc i64 %iv to i32 + %conv = sitofp i32 %tmp to float + %call = tail call float @llvm.tan.f32(float %conv) + %arrayidx = getelementptr inbounds float, ptr %varray, i64 %iv + store float %call, ptr %arrayidx, align 4 + %iv.next = add nuw nsw i64 %iv, 1 + %exitcond = icmp eq i64 %iv.next, 1000 + br i1 %exitcond, label %for.end, label %for.body, !llvm.loop !41 + +for.end: + ret void +} + + + +!171 = distinct !{!171, !172, !173} +!172 = !{!"llvm.loop.vectorize.width", i32 8} +!173 = !{!"llvm.loop.vectorize.enable", i1 true} + attributes #0 = { nounwind readnone } declare double @sin(double) #0 @@ -366,6 +477,10 @@ declare double @cos(double) #0 declare 
float @cosf(float) #0 declare double @llvm.cos.f64(double) #0 declare float @llvm.cos.f32(float) #0 +declare double @tan(double) #0 +declare float @tanf(float) #0 +declare double @llvm.tan.f64(double) #0 +declare float @llvm.tan.f32(float) #0 declare float @expf(float) #0 declare float @powf(float, float) #0 declare float @llvm.exp.f32(float) #0 diff --git a/llvm/test/Transforms/LoopVectorize/X86/svml-calls.ll b/llvm/test/Transforms/LoopVectorize/X86/svml-calls.ll index 005557d..2e78e36 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/svml-calls.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/svml-calls.ll @@ -230,6 +230,52 @@ for.end: ret void } +define void @tan_f64_intrinsic(ptr nocapture %varray) { +; CHECK-LABEL: @tan_f64_intrinsic( +; CHECK: [[TMP5:%.*]] = call <4 x double> @__svml_tan4(<4 x double> [[TMP4:%.*]]) +; CHECK: ret void +; +entry: + br label %for.body + +for.body: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] + %tmp = trunc i64 %iv to i32 + %conv = sitofp i32 %tmp to double + %call = tail call double @llvm.tan.f64(double %conv) + %arrayidx = getelementptr inbounds double, ptr %varray, i64 %iv + store double %call, ptr %arrayidx, align 4 + %iv.next = add nuw nsw i64 %iv, 1 + %exitcond = icmp eq i64 %iv.next, 1000 + br i1 %exitcond, label %for.end, label %for.body + +for.end: + ret void +} + +define void @tan_f32_intrinsic(ptr nocapture %varray) { +; CHECK-LABEL: @tan_f32_intrinsic( +; CHECK: [[TMP5:%.*]] = call <4 x float> @__svml_tanf4(<4 x float> [[TMP4:%.*]]) +; CHECK: ret void +; +entry: + br label %for.body + +for.body: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] + %tmp = trunc i64 %iv to i32 + %conv = sitofp i32 %tmp to float + %call = tail call float @llvm.tan.f32(float %conv) + %arrayidx = getelementptr inbounds float, ptr %varray, i64 %iv + store float %call, ptr %arrayidx, align 4 + %iv.next = add nuw nsw i64 %iv, 1 + %exitcond = icmp eq i64 %iv.next, 1000 + br i1 %exitcond, label %for.end, label %for.body + 
+for.end: + ret void +} + define void @pow_f64(ptr nocapture %varray, ptr nocapture readonly %exp) { ; CHECK-LABEL: @pow_f64( ; CHECK: [[TMP8:%.*]] = call <4 x double> @__svml_pow4(<4 x double> [[TMP4:%.*]], <4 x double> [[WIDE_LOAD:%.*]]) diff --git a/llvm/test/Transforms/LoopVectorize/X86/veclib-calls.ll b/llvm/test/Transforms/LoopVectorize/X86/veclib-calls.ll index 2e78a96..27038f3 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/veclib-calls.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/veclib-calls.ll @@ -406,6 +406,31 @@ for.end: ; preds = %for.body, %entry ret void } +;CHECK-LABEL: @tan_f32_intrinsic( +;CHECK: vtanf{{.*}}<4 x float> +;CHECK: ret void +declare float @llvm.tan.f32(float) nounwind readnone +define void @tan_f32_intrinsic(i32 %n, ptr noalias %y, ptr noalias %x) nounwind uwtable { +entry: + %cmp6 = icmp sgt i32 %n, 0 + br i1 %cmp6, label %for.body, label %for.end + +for.body: ; preds = %entry, %for.body + %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ] + %arrayidx = getelementptr inbounds float, ptr %y, i64 %indvars.iv + %0 = load float, ptr %arrayidx, align 4 + %call = tail call float @llvm.tan.f32(float %0) nounwind readnone + %arrayidx2 = getelementptr inbounds float, ptr %x, i64 %indvars.iv + store float %call, ptr %arrayidx2, align 4 + %indvars.iv.next = add i64 %indvars.iv, 1 + %lftr.wideiv = trunc i64 %indvars.iv.next to i32 + %exitcond = icmp eq i32 %lftr.wideiv, %n + br i1 %exitcond, label %for.end, label %for.body + +for.end: ; preds = %for.body, %entry + ret void +} + ;CHECK-LABEL: @asin_f32( ;CHECK: vasinf{{.*}}<4 x float> ;CHECK: ret void diff --git a/llvm/test/tools/llvm-cov/gcov/intermediate-format.test b/llvm/test/tools/llvm-cov/gcov/intermediate-format.test index 583e670..a3f4695 100644 --- a/llvm/test/tools/llvm-cov/gcov/intermediate-format.test +++ b/llvm/test/tools/llvm-cov/gcov/intermediate-format.test @@ -1,5 +1,3 @@ -REQUIRES: shell - RUN: rm -rf %t && mkdir %t && cd %t RUN: cp %S/Inputs/test.gcno 
%S/Inputs/test.gcda . diff --git a/llvm/test/tools/llvm-mca/X86/IceLakeServer/resources-avx1.s b/llvm/test/tools/llvm-mca/X86/IceLakeServer/resources-avx1.s index bd7a489..7150a58 100644 --- a/llvm/test/tools/llvm-mca/X86/IceLakeServer/resources-avx1.s +++ b/llvm/test/tools/llvm-mca/X86/IceLakeServer/resources-avx1.s @@ -1455,20 +1455,20 @@ vzeroupper # CHECK-NEXT: 3 2 1.00 * vpextrq $1, %xmm0, (%rax) # CHECK-NEXT: 2 3 1.00 vpextrw $1, %xmm0, %ecx # CHECK-NEXT: 3 2 1.00 * vpextrw $1, %xmm0, (%rax) -# CHECK-NEXT: 3 3 2.00 vphaddd %xmm0, %xmm1, %xmm2 -# CHECK-NEXT: 4 9 2.00 * vphaddd (%rax), %xmm1, %xmm2 -# CHECK-NEXT: 3 3 2.00 vphaddsw %xmm0, %xmm1, %xmm2 -# CHECK-NEXT: 4 9 2.00 * vphaddsw (%rax), %xmm1, %xmm2 -# CHECK-NEXT: 3 3 2.00 vphaddw %xmm0, %xmm1, %xmm2 -# CHECK-NEXT: 4 9 2.00 * vphaddw (%rax), %xmm1, %xmm2 +# CHECK-NEXT: 3 3 1.00 vphaddd %xmm0, %xmm1, %xmm2 +# CHECK-NEXT: 4 9 1.00 * vphaddd (%rax), %xmm1, %xmm2 +# CHECK-NEXT: 3 3 1.00 vphaddsw %xmm0, %xmm1, %xmm2 +# CHECK-NEXT: 4 9 1.00 * vphaddsw (%rax), %xmm1, %xmm2 +# CHECK-NEXT: 3 3 1.00 vphaddw %xmm0, %xmm1, %xmm2 +# CHECK-NEXT: 4 9 1.00 * vphaddw (%rax), %xmm1, %xmm2 # CHECK-NEXT: 1 4 1.00 vphminposuw %xmm0, %xmm2 # CHECK-NEXT: 2 10 1.00 * vphminposuw (%rax), %xmm2 -# CHECK-NEXT: 3 3 2.00 vphsubd %xmm0, %xmm1, %xmm2 -# CHECK-NEXT: 4 9 2.00 * vphsubd (%rax), %xmm1, %xmm2 -# CHECK-NEXT: 3 3 2.00 vphsubsw %xmm0, %xmm1, %xmm2 -# CHECK-NEXT: 4 9 2.00 * vphsubsw (%rax), %xmm1, %xmm2 -# CHECK-NEXT: 3 3 2.00 vphsubw %xmm0, %xmm1, %xmm2 -# CHECK-NEXT: 4 9 2.00 * vphsubw (%rax), %xmm1, %xmm2 +# CHECK-NEXT: 3 3 1.00 vphsubd %xmm0, %xmm1, %xmm2 +# CHECK-NEXT: 4 9 1.00 * vphsubd (%rax), %xmm1, %xmm2 +# CHECK-NEXT: 3 3 1.00 vphsubsw %xmm0, %xmm1, %xmm2 +# CHECK-NEXT: 4 9 1.00 * vphsubsw (%rax), %xmm1, %xmm2 +# CHECK-NEXT: 3 3 1.00 vphsubw %xmm0, %xmm1, %xmm2 +# CHECK-NEXT: 4 9 1.00 * vphsubw (%rax), %xmm1, %xmm2 # CHECK-NEXT: 2 2 2.00 vpinsrb $1, %eax, %xmm1, %xmm2 # CHECK-NEXT: 2 6 1.00 * vpinsrb $1, (%rax), 
%xmm1, %xmm2 # CHECK-NEXT: 2 2 2.00 vpinsrd $1, %eax, %xmm1, %xmm2 @@ -1738,7 +1738,7 @@ vzeroupper # CHECK: Resource pressure per iteration: # CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6] [7] [8] [9] [10] [11] -# CHECK-NEXT: - 126.00 322.92 237.92 160.50 160.50 19.00 291.92 6.25 19.00 19.00 19.00 +# CHECK-NEXT: - 126.00 325.58 252.58 160.50 160.50 19.00 274.58 6.25 19.00 19.00 19.00 # CHECK: Resource pressure by instruction: # CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6] [7] [8] [9] [10] [11] Instructions: @@ -1908,22 +1908,22 @@ vzeroupper # CHECK-NEXT: - - - - - - 0.50 - - 0.50 0.50 0.50 vextractf128 $1, %ymm0, (%rax) # CHECK-NEXT: - - 1.00 - - - - 1.00 - - - - vextractps $1, %xmm0, %ecx # CHECK-NEXT: - - - - - - 0.50 1.00 - 0.50 0.50 0.50 vextractps $1, %xmm0, (%rax) -# CHECK-NEXT: - - 0.33 0.33 - - - 2.33 - - - - vhaddpd %xmm0, %xmm1, %xmm2 -# CHECK-NEXT: - - 0.33 0.33 0.50 0.50 - 2.33 - - - - vhaddpd (%rax), %xmm1, %xmm2 -# CHECK-NEXT: - - 0.33 0.33 - - - 2.33 - - - - vhaddpd %ymm0, %ymm1, %ymm2 -# CHECK-NEXT: - - 0.33 0.33 0.50 0.50 - 2.33 - - - - vhaddpd (%rax), %ymm1, %ymm2 -# CHECK-NEXT: - - 0.33 0.33 - - - 2.33 - - - - vhaddps %xmm0, %xmm1, %xmm2 -# CHECK-NEXT: - - 0.33 0.33 0.50 0.50 - 2.33 - - - - vhaddps (%rax), %xmm1, %xmm2 -# CHECK-NEXT: - - 0.33 0.33 - - - 2.33 - - - - vhaddps %ymm0, %ymm1, %ymm2 -# CHECK-NEXT: - - 0.33 0.33 0.50 0.50 - 2.33 - - - - vhaddps (%rax), %ymm1, %ymm2 -# CHECK-NEXT: - - 0.33 0.33 - - - 2.33 - - - - vhsubpd %xmm0, %xmm1, %xmm2 -# CHECK-NEXT: - - 0.33 0.33 0.50 0.50 - 2.33 - - - - vhsubpd (%rax), %xmm1, %xmm2 -# CHECK-NEXT: - - 0.33 0.33 - - - 2.33 - - - - vhsubpd %ymm0, %ymm1, %ymm2 -# CHECK-NEXT: - - 0.33 0.33 0.50 0.50 - 2.33 - - - - vhsubpd (%rax), %ymm1, %ymm2 -# CHECK-NEXT: - - 0.33 0.33 - - - 2.33 - - - - vhsubps %xmm0, %xmm1, %xmm2 -# CHECK-NEXT: - - 0.33 0.33 0.50 0.50 - 2.33 - - - - vhsubps (%rax), %xmm1, %xmm2 -# CHECK-NEXT: - - 0.33 0.33 - - - 2.33 - - - - vhsubps %ymm0, %ymm1, %ymm2 -# CHECK-NEXT: - - 0.33 0.33 0.50 0.50 
- 2.33 - - - - vhsubps (%rax), %ymm1, %ymm2 +# CHECK-NEXT: - - 0.50 0.50 - - - 2.00 - - - - vhaddpd %xmm0, %xmm1, %xmm2 +# CHECK-NEXT: - - 0.50 0.50 0.50 0.50 - 2.00 - - - - vhaddpd (%rax), %xmm1, %xmm2 +# CHECK-NEXT: - - 0.50 0.50 - - - 2.00 - - - - vhaddpd %ymm0, %ymm1, %ymm2 +# CHECK-NEXT: - - 0.50 0.50 0.50 0.50 - 2.00 - - - - vhaddpd (%rax), %ymm1, %ymm2 +# CHECK-NEXT: - - 0.50 0.50 - - - 2.00 - - - - vhaddps %xmm0, %xmm1, %xmm2 +# CHECK-NEXT: - - 0.50 0.50 0.50 0.50 - 2.00 - - - - vhaddps (%rax), %xmm1, %xmm2 +# CHECK-NEXT: - - 0.50 0.50 - - - 2.00 - - - - vhaddps %ymm0, %ymm1, %ymm2 +# CHECK-NEXT: - - 0.50 0.50 0.50 0.50 - 2.00 - - - - vhaddps (%rax), %ymm1, %ymm2 +# CHECK-NEXT: - - 0.50 0.50 - - - 2.00 - - - - vhsubpd %xmm0, %xmm1, %xmm2 +# CHECK-NEXT: - - 0.50 0.50 0.50 0.50 - 2.00 - - - - vhsubpd (%rax), %xmm1, %xmm2 +# CHECK-NEXT: - - 0.50 0.50 - - - 2.00 - - - - vhsubpd %ymm0, %ymm1, %ymm2 +# CHECK-NEXT: - - 0.50 0.50 0.50 0.50 - 2.00 - - - - vhsubpd (%rax), %ymm1, %ymm2 +# CHECK-NEXT: - - 0.50 0.50 - - - 2.00 - - - - vhsubps %xmm0, %xmm1, %xmm2 +# CHECK-NEXT: - - 0.50 0.50 0.50 0.50 - 2.00 - - - - vhsubps (%rax), %xmm1, %xmm2 +# CHECK-NEXT: - - 0.50 0.50 - - - 2.00 - - - - vhsubps %ymm0, %ymm1, %ymm2 +# CHECK-NEXT: - - 0.50 0.50 0.50 0.50 - 2.00 - - - - vhsubps (%rax), %ymm1, %ymm2 # CHECK-NEXT: - - - - - - - 1.00 - - - - vinsertf128 $1, %xmm0, %ymm1, %ymm2 # CHECK-NEXT: - - 0.33 0.33 0.50 0.50 - 0.33 - - - - vinsertf128 $1, (%rax), %ymm1, %ymm2 # CHECK-NEXT: - - - - - - - 1.00 - - - - vinsertps $1, %xmm0, %xmm1, %xmm2 @@ -2167,20 +2167,20 @@ vzeroupper # CHECK-NEXT: - - - - - - 0.50 1.00 - 0.50 0.50 0.50 vpextrq $1, %xmm0, (%rax) # CHECK-NEXT: - - 1.00 - - - - 1.00 - - - - vpextrw $1, %xmm0, %ecx # CHECK-NEXT: - - - - - - 0.50 1.00 - 0.50 0.50 0.50 vpextrw $1, %xmm0, (%rax) -# CHECK-NEXT: - - 0.33 0.33 - - - 2.33 - - - - vphaddd %xmm0, %xmm1, %xmm2 -# CHECK-NEXT: - - 0.33 0.33 0.50 0.50 - 2.33 - - - - vphaddd (%rax), %xmm1, %xmm2 -# CHECK-NEXT: - - 
0.50 0.50 - - - 2.00 - - - - vphaddsw %xmm0, %xmm1, %xmm2 -# CHECK-NEXT: - - 0.50 0.50 0.50 0.50 - 2.00 - - - - vphaddsw (%rax), %xmm1, %xmm2 -# CHECK-NEXT: - - 0.33 0.33 - - - 2.33 - - - - vphaddw %xmm0, %xmm1, %xmm2 -# CHECK-NEXT: - - 0.33 0.33 0.50 0.50 - 2.33 - - - - vphaddw (%rax), %xmm1, %xmm2 +# CHECK-NEXT: - - 0.33 1.33 - - - 1.33 - - - - vphaddd %xmm0, %xmm1, %xmm2 +# CHECK-NEXT: - - 0.33 1.33 0.50 0.50 - 1.33 - - - - vphaddd (%rax), %xmm1, %xmm2 +# CHECK-NEXT: - - 0.50 1.50 - - - 1.00 - - - - vphaddsw %xmm0, %xmm1, %xmm2 +# CHECK-NEXT: - - 0.50 1.50 0.50 0.50 - 1.00 - - - - vphaddsw (%rax), %xmm1, %xmm2 +# CHECK-NEXT: - - 0.33 1.33 - - - 1.33 - - - - vphaddw %xmm0, %xmm1, %xmm2 +# CHECK-NEXT: - - 0.33 1.33 0.50 0.50 - 1.33 - - - - vphaddw (%rax), %xmm1, %xmm2 # CHECK-NEXT: - - 1.00 - - - - - - - - - vphminposuw %xmm0, %xmm2 # CHECK-NEXT: - - 1.00 - 0.50 0.50 - - - - - - vphminposuw (%rax), %xmm2 -# CHECK-NEXT: - - 0.33 0.33 - - - 2.33 - - - - vphsubd %xmm0, %xmm1, %xmm2 -# CHECK-NEXT: - - 0.33 0.33 0.50 0.50 - 2.33 - - - - vphsubd (%rax), %xmm1, %xmm2 -# CHECK-NEXT: - - 0.50 0.50 - - - 2.00 - - - - vphsubsw %xmm0, %xmm1, %xmm2 -# CHECK-NEXT: - - 0.50 0.50 0.50 0.50 - 2.00 - - - - vphsubsw (%rax), %xmm1, %xmm2 -# CHECK-NEXT: - - 0.33 0.33 - - - 2.33 - - - - vphsubw %xmm0, %xmm1, %xmm2 -# CHECK-NEXT: - - 0.33 0.33 0.50 0.50 - 2.33 - - - - vphsubw (%rax), %xmm1, %xmm2 +# CHECK-NEXT: - - 0.33 1.33 - - - 1.33 - - - - vphsubd %xmm0, %xmm1, %xmm2 +# CHECK-NEXT: - - 0.33 1.33 0.50 0.50 - 1.33 - - - - vphsubd (%rax), %xmm1, %xmm2 +# CHECK-NEXT: - - 0.50 1.50 - - - 1.00 - - - - vphsubsw %xmm0, %xmm1, %xmm2 +# CHECK-NEXT: - - 0.50 1.50 0.50 0.50 - 1.00 - - - - vphsubsw (%rax), %xmm1, %xmm2 +# CHECK-NEXT: - - 0.33 1.33 - - - 1.33 - - - - vphsubw %xmm0, %xmm1, %xmm2 +# CHECK-NEXT: - - 0.33 1.33 0.50 0.50 - 1.33 - - - - vphsubw (%rax), %xmm1, %xmm2 # CHECK-NEXT: - - - - - - - 2.00 - - - - vpinsrb $1, %eax, %xmm1, %xmm2 # CHECK-NEXT: - - - - 0.50 0.50 - 1.00 - - - - 
vpinsrb $1, (%rax), %xmm1, %xmm2 # CHECK-NEXT: - - - - - - - 2.00 - - - - vpinsrd $1, %eax, %xmm1, %xmm2 diff --git a/llvm/test/tools/llvm-mca/X86/IceLakeServer/resources-avx2.s b/llvm/test/tools/llvm-mca/X86/IceLakeServer/resources-avx2.s index dcf8834..c251dc3 100644 --- a/llvm/test/tools/llvm-mca/X86/IceLakeServer/resources-avx2.s +++ b/llvm/test/tools/llvm-mca/X86/IceLakeServer/resources-avx2.s @@ -576,18 +576,18 @@ vpxor (%rax), %ymm1, %ymm2 # CHECK-NEXT: 5 20 2.00 * vpgatherqd %xmm0, (%rax,%ymm1,2), %xmm2 # CHECK-NEXT: 5 18 1.00 * vpgatherqq %xmm0, (%rax,%xmm1,2), %xmm2 # CHECK-NEXT: 5 20 2.00 * vpgatherqq %ymm0, (%rax,%ymm1,2), %ymm2 -# CHECK-NEXT: 3 3 2.00 vphaddd %ymm0, %ymm1, %ymm2 -# CHECK-NEXT: 4 10 2.00 * vphaddd (%rax), %ymm1, %ymm2 -# CHECK-NEXT: 3 3 2.00 vphaddsw %ymm0, %ymm1, %ymm2 -# CHECK-NEXT: 4 10 2.00 * vphaddsw (%rax), %ymm1, %ymm2 -# CHECK-NEXT: 3 3 2.00 vphaddw %ymm0, %ymm1, %ymm2 -# CHECK-NEXT: 4 10 2.00 * vphaddw (%rax), %ymm1, %ymm2 -# CHECK-NEXT: 3 3 2.00 vphsubd %ymm0, %ymm1, %ymm2 -# CHECK-NEXT: 4 10 2.00 * vphsubd (%rax), %ymm1, %ymm2 -# CHECK-NEXT: 3 3 2.00 vphsubsw %ymm0, %ymm1, %ymm2 -# CHECK-NEXT: 4 10 2.00 * vphsubsw (%rax), %ymm1, %ymm2 -# CHECK-NEXT: 3 3 2.00 vphsubw %ymm0, %ymm1, %ymm2 -# CHECK-NEXT: 4 10 2.00 * vphsubw (%rax), %ymm1, %ymm2 +# CHECK-NEXT: 3 3 1.00 vphaddd %ymm0, %ymm1, %ymm2 +# CHECK-NEXT: 4 10 1.00 * vphaddd (%rax), %ymm1, %ymm2 +# CHECK-NEXT: 3 3 1.00 vphaddsw %ymm0, %ymm1, %ymm2 +# CHECK-NEXT: 4 10 1.00 * vphaddsw (%rax), %ymm1, %ymm2 +# CHECK-NEXT: 3 3 1.00 vphaddw %ymm0, %ymm1, %ymm2 +# CHECK-NEXT: 4 10 1.00 * vphaddw (%rax), %ymm1, %ymm2 +# CHECK-NEXT: 3 3 1.00 vphsubd %ymm0, %ymm1, %ymm2 +# CHECK-NEXT: 4 10 1.00 * vphsubd (%rax), %ymm1, %ymm2 +# CHECK-NEXT: 3 3 1.00 vphsubsw %ymm0, %ymm1, %ymm2 +# CHECK-NEXT: 4 10 1.00 * vphsubsw (%rax), %ymm1, %ymm2 +# CHECK-NEXT: 3 3 1.00 vphsubw %ymm0, %ymm1, %ymm2 +# CHECK-NEXT: 4 10 1.00 * vphsubw (%rax), %ymm1, %ymm2 # CHECK-NEXT: 1 5 0.50 vpmaddubsw %ymm0, 
%ymm1, %ymm2 # CHECK-NEXT: 2 12 0.50 * vpmaddubsw (%rax), %ymm1, %ymm2 # CHECK-NEXT: 1 5 0.50 vpmaddwd %ymm0, %ymm1, %ymm2 @@ -778,7 +778,7 @@ vpxor (%rax), %ymm1, %ymm2 # CHECK: Resource pressure per iteration: # CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6] [7] [8] [9] [10] [11] -# CHECK-NEXT: - - 110.33 104.33 98.00 98.00 2.50 149.33 - 2.50 2.50 2.50 +# CHECK-NEXT: - - 110.33 116.33 98.00 98.00 2.50 137.33 - 2.50 2.50 2.50 # CHECK: Resource pressure by instruction: # CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6] [7] [8] [9] [10] [11] Instructions: @@ -898,18 +898,18 @@ vpxor (%rax), %ymm1, %ymm2 # CHECK-NEXT: - - 1.33 0.33 2.00 2.00 - 1.33 - - - - vpgatherqd %xmm0, (%rax,%ymm1,2), %xmm2 # CHECK-NEXT: - - 1.33 0.33 1.00 1.00 - 1.33 - - - - vpgatherqq %xmm0, (%rax,%xmm1,2), %xmm2 # CHECK-NEXT: - - 1.33 0.33 2.00 2.00 - 1.33 - - - - vpgatherqq %ymm0, (%rax,%ymm1,2), %ymm2 -# CHECK-NEXT: - - 0.33 0.33 - - - 2.33 - - - - vphaddd %ymm0, %ymm1, %ymm2 -# CHECK-NEXT: - - 0.33 0.33 0.50 0.50 - 2.33 - - - - vphaddd (%rax), %ymm1, %ymm2 -# CHECK-NEXT: - - 0.50 0.50 - - - 2.00 - - - - vphaddsw %ymm0, %ymm1, %ymm2 -# CHECK-NEXT: - - 0.50 0.50 0.50 0.50 - 2.00 - - - - vphaddsw (%rax), %ymm1, %ymm2 -# CHECK-NEXT: - - 0.33 0.33 - - - 2.33 - - - - vphaddw %ymm0, %ymm1, %ymm2 -# CHECK-NEXT: - - 0.33 0.33 0.50 0.50 - 2.33 - - - - vphaddw (%rax), %ymm1, %ymm2 -# CHECK-NEXT: - - 0.33 0.33 - - - 2.33 - - - - vphsubd %ymm0, %ymm1, %ymm2 -# CHECK-NEXT: - - 0.33 0.33 0.50 0.50 - 2.33 - - - - vphsubd (%rax), %ymm1, %ymm2 -# CHECK-NEXT: - - 0.50 0.50 - - - 2.00 - - - - vphsubsw %ymm0, %ymm1, %ymm2 -# CHECK-NEXT: - - 0.50 0.50 0.50 0.50 - 2.00 - - - - vphsubsw (%rax), %ymm1, %ymm2 -# CHECK-NEXT: - - 0.33 0.33 - - - 2.33 - - - - vphsubw %ymm0, %ymm1, %ymm2 -# CHECK-NEXT: - - 0.33 0.33 0.50 0.50 - 2.33 - - - - vphsubw (%rax), %ymm1, %ymm2 +# CHECK-NEXT: - - 0.33 1.33 - - - 1.33 - - - - vphaddd %ymm0, %ymm1, %ymm2 +# CHECK-NEXT: - - 0.33 1.33 0.50 0.50 - 1.33 - - - - vphaddd (%rax), %ymm1, %ymm2 +# 
CHECK-NEXT: - - 0.50 1.50 - - - 1.00 - - - - vphaddsw %ymm0, %ymm1, %ymm2 +# CHECK-NEXT: - - 0.50 1.50 0.50 0.50 - 1.00 - - - - vphaddsw (%rax), %ymm1, %ymm2 +# CHECK-NEXT: - - 0.33 1.33 - - - 1.33 - - - - vphaddw %ymm0, %ymm1, %ymm2 +# CHECK-NEXT: - - 0.33 1.33 0.50 0.50 - 1.33 - - - - vphaddw (%rax), %ymm1, %ymm2 +# CHECK-NEXT: - - 0.33 1.33 - - - 1.33 - - - - vphsubd %ymm0, %ymm1, %ymm2 +# CHECK-NEXT: - - 0.33 1.33 0.50 0.50 - 1.33 - - - - vphsubd (%rax), %ymm1, %ymm2 +# CHECK-NEXT: - - 0.50 1.50 - - - 1.00 - - - - vphsubsw %ymm0, %ymm1, %ymm2 +# CHECK-NEXT: - - 0.50 1.50 0.50 0.50 - 1.00 - - - - vphsubsw (%rax), %ymm1, %ymm2 +# CHECK-NEXT: - - 0.33 1.33 - - - 1.33 - - - - vphsubw %ymm0, %ymm1, %ymm2 +# CHECK-NEXT: - - 0.33 1.33 0.50 0.50 - 1.33 - - - - vphsubw (%rax), %ymm1, %ymm2 # CHECK-NEXT: - - 0.50 0.50 - - - - - - - - vpmaddubsw %ymm0, %ymm1, %ymm2 # CHECK-NEXT: - - 0.50 0.50 0.50 0.50 - - - - - - vpmaddubsw (%rax), %ymm1, %ymm2 # CHECK-NEXT: - - 0.50 0.50 - - - - - - - - vpmaddwd %ymm0, %ymm1, %ymm2 diff --git a/llvm/test/tools/llvm-mca/X86/IceLakeServer/resources-sse3.s b/llvm/test/tools/llvm-mca/X86/IceLakeServer/resources-sse3.s index 4d19424..0d075a9 100644 --- a/llvm/test/tools/llvm-mca/X86/IceLakeServer/resources-sse3.s +++ b/llvm/test/tools/llvm-mca/X86/IceLakeServer/resources-sse3.s @@ -81,7 +81,7 @@ mwait # CHECK: Resource pressure per iteration: # CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6] [7] [8] [9] [10] [11] -# CHECK-NEXT: - - 6.67 7.67 5.00 5.00 - 23.67 4.00 - - - +# CHECK-NEXT: - - 8.00 9.00 5.00 5.00 - 21.00 4.00 - - - # CHECK: Resource pressure by instruction: # CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6] [7] [8] [9] [10] [11] Instructions: @@ -89,14 +89,14 @@ mwait # CHECK-NEXT: - - 0.50 0.50 0.50 0.50 - - - - - - addsubpd (%rax), %xmm2 # CHECK-NEXT: - - 0.50 0.50 - - - - - - - - addsubps %xmm0, %xmm2 # CHECK-NEXT: - - 0.50 0.50 0.50 0.50 - - - - - - addsubps (%rax), %xmm2 -# CHECK-NEXT: - - 0.33 0.33 - - - 2.33 - - - - haddpd %xmm0, %xmm2 -# 
CHECK-NEXT: - - 0.33 0.33 0.50 0.50 - 2.33 - - - - haddpd (%rax), %xmm2 -# CHECK-NEXT: - - 0.33 0.33 - - - 2.33 - - - - haddps %xmm0, %xmm2 -# CHECK-NEXT: - - 0.33 0.33 0.50 0.50 - 2.33 - - - - haddps (%rax), %xmm2 -# CHECK-NEXT: - - 0.33 0.33 - - - 2.33 - - - - hsubpd %xmm0, %xmm2 -# CHECK-NEXT: - - 0.33 0.33 0.50 0.50 - 2.33 - - - - hsubpd (%rax), %xmm2 -# CHECK-NEXT: - - 0.33 0.33 - - - 2.33 - - - - hsubps %xmm0, %xmm2 -# CHECK-NEXT: - - 0.33 0.33 0.50 0.50 - 2.33 - - - - hsubps (%rax), %xmm2 +# CHECK-NEXT: - - 0.50 0.50 - - - 2.00 - - - - haddpd %xmm0, %xmm2 +# CHECK-NEXT: - - 0.50 0.50 0.50 0.50 - 2.00 - - - - haddpd (%rax), %xmm2 +# CHECK-NEXT: - - 0.50 0.50 - - - 2.00 - - - - haddps %xmm0, %xmm2 +# CHECK-NEXT: - - 0.50 0.50 0.50 0.50 - 2.00 - - - - haddps (%rax), %xmm2 +# CHECK-NEXT: - - 0.50 0.50 - - - 2.00 - - - - hsubpd %xmm0, %xmm2 +# CHECK-NEXT: - - 0.50 0.50 0.50 0.50 - 2.00 - - - - hsubpd (%rax), %xmm2 +# CHECK-NEXT: - - 0.50 0.50 - - - 2.00 - - - - hsubps %xmm0, %xmm2 +# CHECK-NEXT: - - 0.50 0.50 0.50 0.50 - 2.00 - - - - hsubps (%rax), %xmm2 # CHECK-NEXT: - - - - 0.50 0.50 - - - - - - lddqu (%rax), %xmm2 # CHECK-NEXT: - - 0.25 0.25 - - - 0.25 0.25 - - - monitor # CHECK-NEXT: - - - - - - - 1.00 - - - - movddup %xmm0, %xmm2 diff --git a/llvm/test/tools/llvm-mca/X86/IceLakeServer/resources-ssse3.s b/llvm/test/tools/llvm-mca/X86/IceLakeServer/resources-ssse3.s index 3a6668ce..d034cbd 100644 --- a/llvm/test/tools/llvm-mca/X86/IceLakeServer/resources-ssse3.s +++ b/llvm/test/tools/llvm-mca/X86/IceLakeServer/resources-ssse3.s @@ -124,28 +124,28 @@ psignw (%rax), %xmm2 # CHECK-NEXT: 2 7 1.00 * palignr $1, (%rax), %xmm2 # CHECK-NEXT: 3 3 2.00 phaddd %mm0, %mm2 # CHECK-NEXT: 4 8 2.00 * phaddd (%rax), %mm2 -# CHECK-NEXT: 3 3 2.00 phaddd %xmm0, %xmm2 -# CHECK-NEXT: 4 9 2.00 * phaddd (%rax), %xmm2 +# CHECK-NEXT: 3 3 1.00 phaddd %xmm0, %xmm2 +# CHECK-NEXT: 4 9 1.00 * phaddd (%rax), %xmm2 # CHECK-NEXT: 3 3 2.00 phaddsw %mm0, %mm2 # CHECK-NEXT: 4 8 2.00 * phaddsw 
(%rax), %mm2 -# CHECK-NEXT: 3 3 2.00 phaddsw %xmm0, %xmm2 -# CHECK-NEXT: 4 9 2.00 * phaddsw (%rax), %xmm2 +# CHECK-NEXT: 3 3 1.00 phaddsw %xmm0, %xmm2 +# CHECK-NEXT: 4 9 1.00 * phaddsw (%rax), %xmm2 # CHECK-NEXT: 3 3 2.00 phaddw %mm0, %mm2 # CHECK-NEXT: 4 8 2.00 * phaddw (%rax), %mm2 -# CHECK-NEXT: 3 3 2.00 phaddw %xmm0, %xmm2 -# CHECK-NEXT: 4 9 2.00 * phaddw (%rax), %xmm2 +# CHECK-NEXT: 3 3 1.00 phaddw %xmm0, %xmm2 +# CHECK-NEXT: 4 9 1.00 * phaddw (%rax), %xmm2 # CHECK-NEXT: 3 3 2.00 phsubd %mm0, %mm2 # CHECK-NEXT: 4 8 2.00 * phsubd (%rax), %mm2 -# CHECK-NEXT: 3 3 2.00 phsubd %xmm0, %xmm2 -# CHECK-NEXT: 4 9 2.00 * phsubd (%rax), %xmm2 +# CHECK-NEXT: 3 3 1.00 phsubd %xmm0, %xmm2 +# CHECK-NEXT: 4 9 1.00 * phsubd (%rax), %xmm2 # CHECK-NEXT: 3 3 2.00 phsubsw %mm0, %mm2 # CHECK-NEXT: 4 8 2.00 * phsubsw (%rax), %mm2 -# CHECK-NEXT: 3 3 2.00 phsubsw %xmm0, %xmm2 -# CHECK-NEXT: 4 9 2.00 * phsubsw (%rax), %xmm2 +# CHECK-NEXT: 3 3 1.00 phsubsw %xmm0, %xmm2 +# CHECK-NEXT: 4 9 1.00 * phsubsw (%rax), %xmm2 # CHECK-NEXT: 3 3 2.00 phsubw %mm0, %mm2 # CHECK-NEXT: 4 8 2.00 * phsubw (%rax), %mm2 -# CHECK-NEXT: 3 3 2.00 phsubw %xmm0, %xmm2 -# CHECK-NEXT: 4 9 2.00 * phsubw (%rax), %xmm2 +# CHECK-NEXT: 3 3 1.00 phsubw %xmm0, %xmm2 +# CHECK-NEXT: 4 9 1.00 * phsubw (%rax), %xmm2 # CHECK-NEXT: 1 5 1.00 pmaddubsw %mm0, %mm2 # CHECK-NEXT: 2 10 1.00 * pmaddubsw (%rax), %mm2 # CHECK-NEXT: 1 5 0.50 pmaddubsw %xmm0, %xmm2 @@ -187,7 +187,7 @@ psignw (%rax), %xmm2 # CHECK: Resource pressure per iteration: # CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6] [7] [8] [9] [10] [11] -# CHECK-NEXT: - - 30.67 13.67 16.00 16.00 - 67.67 - - - - +# CHECK-NEXT: - - 30.67 25.67 16.00 16.00 - 55.67 - - - - # CHECK: Resource pressure by instruction: # CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6] [7] [8] [9] [10] [11] Instructions: @@ -209,28 +209,28 @@ psignw (%rax), %xmm2 # CHECK-NEXT: - - - - 0.50 0.50 - 1.00 - - - - palignr $1, (%rax), %xmm2 # CHECK-NEXT: - - 0.50 - - - - 2.50 - - - - phaddd %mm0, %mm2 # CHECK-NEXT: - - 
0.50 - 0.50 0.50 - 2.50 - - - - phaddd (%rax), %mm2 -# CHECK-NEXT: - - 0.33 0.33 - - - 2.33 - - - - phaddd %xmm0, %xmm2 -# CHECK-NEXT: - - 0.33 0.33 0.50 0.50 - 2.33 - - - - phaddd (%rax), %xmm2 +# CHECK-NEXT: - - 0.33 1.33 - - - 1.33 - - - - phaddd %xmm0, %xmm2 +# CHECK-NEXT: - - 0.33 1.33 0.50 0.50 - 1.33 - - - - phaddd (%rax), %xmm2 # CHECK-NEXT: - - 1.00 - - - - 2.00 - - - - phaddsw %mm0, %mm2 # CHECK-NEXT: - - 1.00 - 0.50 0.50 - 2.00 - - - - phaddsw (%rax), %mm2 -# CHECK-NEXT: - - 0.50 0.50 - - - 2.00 - - - - phaddsw %xmm0, %xmm2 -# CHECK-NEXT: - - 0.50 0.50 0.50 0.50 - 2.00 - - - - phaddsw (%rax), %xmm2 +# CHECK-NEXT: - - 0.50 1.50 - - - 1.00 - - - - phaddsw %xmm0, %xmm2 +# CHECK-NEXT: - - 0.50 1.50 0.50 0.50 - 1.00 - - - - phaddsw (%rax), %xmm2 # CHECK-NEXT: - - 0.50 - - - - 2.50 - - - - phaddw %mm0, %mm2 # CHECK-NEXT: - - 0.50 - 0.50 0.50 - 2.50 - - - - phaddw (%rax), %mm2 -# CHECK-NEXT: - - 0.33 0.33 - - - 2.33 - - - - phaddw %xmm0, %xmm2 -# CHECK-NEXT: - - 0.33 0.33 0.50 0.50 - 2.33 - - - - phaddw (%rax), %xmm2 +# CHECK-NEXT: - - 0.33 1.33 - - - 1.33 - - - - phaddw %xmm0, %xmm2 +# CHECK-NEXT: - - 0.33 1.33 0.50 0.50 - 1.33 - - - - phaddw (%rax), %xmm2 # CHECK-NEXT: - - 0.50 - - - - 2.50 - - - - phsubd %mm0, %mm2 # CHECK-NEXT: - - 0.50 - 0.50 0.50 - 2.50 - - - - phsubd (%rax), %mm2 -# CHECK-NEXT: - - 0.33 0.33 - - - 2.33 - - - - phsubd %xmm0, %xmm2 -# CHECK-NEXT: - - 0.33 0.33 0.50 0.50 - 2.33 - - - - phsubd (%rax), %xmm2 +# CHECK-NEXT: - - 0.33 1.33 - - - 1.33 - - - - phsubd %xmm0, %xmm2 +# CHECK-NEXT: - - 0.33 1.33 0.50 0.50 - 1.33 - - - - phsubd (%rax), %xmm2 # CHECK-NEXT: - - 1.00 - - - - 2.00 - - - - phsubsw %mm0, %mm2 # CHECK-NEXT: - - 1.00 - 0.50 0.50 - 2.00 - - - - phsubsw (%rax), %mm2 -# CHECK-NEXT: - - 0.50 0.50 - - - 2.00 - - - - phsubsw %xmm0, %xmm2 -# CHECK-NEXT: - - 0.50 0.50 0.50 0.50 - 2.00 - - - - phsubsw (%rax), %xmm2 +# CHECK-NEXT: - - 0.50 1.50 - - - 1.00 - - - - phsubsw %xmm0, %xmm2 +# CHECK-NEXT: - - 0.50 1.50 0.50 0.50 - 1.00 - - - - 
phsubsw (%rax), %xmm2 # CHECK-NEXT: - - 0.50 - - - - 2.50 - - - - phsubw %mm0, %mm2 # CHECK-NEXT: - - 0.50 - 0.50 0.50 - 2.50 - - - - phsubw (%rax), %mm2 -# CHECK-NEXT: - - 0.33 0.33 - - - 2.33 - - - - phsubw %xmm0, %xmm2 -# CHECK-NEXT: - - 0.33 0.33 0.50 0.50 - 2.33 - - - - phsubw (%rax), %xmm2 +# CHECK-NEXT: - - 0.33 1.33 - - - 1.33 - - - - phsubw %xmm0, %xmm2 +# CHECK-NEXT: - - 0.33 1.33 0.50 0.50 - 1.33 - - - - phsubw (%rax), %xmm2 # CHECK-NEXT: - - 1.00 - - - - - - - - - pmaddubsw %mm0, %mm2 # CHECK-NEXT: - - 1.00 - 0.50 0.50 - - - - - - pmaddubsw (%rax), %mm2 # CHECK-NEXT: - - 0.50 0.50 - - - - - - - - pmaddubsw %xmm0, %xmm2 diff --git a/llvm/test/tools/llvm-mca/X86/SkylakeServer/resources-avx1.s b/llvm/test/tools/llvm-mca/X86/SkylakeServer/resources-avx1.s index cabb002..f4904e4 100644 --- a/llvm/test/tools/llvm-mca/X86/SkylakeServer/resources-avx1.s +++ b/llvm/test/tools/llvm-mca/X86/SkylakeServer/resources-avx1.s @@ -1736,7 +1736,7 @@ vzeroupper # CHECK: Resource pressure per iteration: # CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6] [7] [8] [9] -# CHECK-NEXT: - 126.00 325.25 202.25 173.83 173.83 38.00 326.25 7.25 11.33 +# CHECK-NEXT: - 126.00 327.92 204.92 173.83 173.83 38.00 320.92 7.25 11.33 # CHECK: Resource pressure by instruction: # CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6] [7] [8] [9] Instructions: @@ -1906,22 +1906,22 @@ vzeroupper # CHECK-NEXT: - - - - 0.33 0.33 1.00 - - 0.33 vextractf128 $1, %ymm0, (%rax) # CHECK-NEXT: - - 1.00 - - - - 1.00 - - vextractps $1, %xmm0, %ecx # CHECK-NEXT: - - - - 0.33 0.33 1.00 1.00 - 0.33 vextractps $1, %xmm0, (%rax) -# CHECK-NEXT: - - 0.33 0.33 - - - 2.33 - - vhaddpd %xmm0, %xmm1, %xmm2 -# CHECK-NEXT: - - 0.33 0.33 0.50 0.50 - 2.33 - - vhaddpd (%rax), %xmm1, %xmm2 -# CHECK-NEXT: - - 0.33 0.33 - - - 2.33 - - vhaddpd %ymm0, %ymm1, %ymm2 -# CHECK-NEXT: - - 0.33 0.33 0.50 0.50 - 2.33 - - vhaddpd (%rax), %ymm1, %ymm2 -# CHECK-NEXT: - - 0.33 0.33 - - - 2.33 - - vhaddps %xmm0, %xmm1, %xmm2 -# CHECK-NEXT: - - 0.33 0.33 0.50 0.50 - 
2.33 - - vhaddps (%rax), %xmm1, %xmm2 -# CHECK-NEXT: - - 0.33 0.33 - - - 2.33 - - vhaddps %ymm0, %ymm1, %ymm2 -# CHECK-NEXT: - - 0.33 0.33 0.50 0.50 - 2.33 - - vhaddps (%rax), %ymm1, %ymm2 -# CHECK-NEXT: - - 0.33 0.33 - - - 2.33 - - vhsubpd %xmm0, %xmm1, %xmm2 -# CHECK-NEXT: - - 0.33 0.33 0.50 0.50 - 2.33 - - vhsubpd (%rax), %xmm1, %xmm2 -# CHECK-NEXT: - - 0.33 0.33 - - - 2.33 - - vhsubpd %ymm0, %ymm1, %ymm2 -# CHECK-NEXT: - - 0.33 0.33 0.50 0.50 - 2.33 - - vhsubpd (%rax), %ymm1, %ymm2 -# CHECK-NEXT: - - 0.33 0.33 - - - 2.33 - - vhsubps %xmm0, %xmm1, %xmm2 -# CHECK-NEXT: - - 0.33 0.33 0.50 0.50 - 2.33 - - vhsubps (%rax), %xmm1, %xmm2 -# CHECK-NEXT: - - 0.33 0.33 - - - 2.33 - - vhsubps %ymm0, %ymm1, %ymm2 -# CHECK-NEXT: - - 0.33 0.33 0.50 0.50 - 2.33 - - vhsubps (%rax), %ymm1, %ymm2 +# CHECK-NEXT: - - 0.50 0.50 - - - 2.00 - - vhaddpd %xmm0, %xmm1, %xmm2 +# CHECK-NEXT: - - 0.50 0.50 0.50 0.50 - 2.00 - - vhaddpd (%rax), %xmm1, %xmm2 +# CHECK-NEXT: - - 0.50 0.50 - - - 2.00 - - vhaddpd %ymm0, %ymm1, %ymm2 +# CHECK-NEXT: - - 0.50 0.50 0.50 0.50 - 2.00 - - vhaddpd (%rax), %ymm1, %ymm2 +# CHECK-NEXT: - - 0.50 0.50 - - - 2.00 - - vhaddps %xmm0, %xmm1, %xmm2 +# CHECK-NEXT: - - 0.50 0.50 0.50 0.50 - 2.00 - - vhaddps (%rax), %xmm1, %xmm2 +# CHECK-NEXT: - - 0.50 0.50 - - - 2.00 - - vhaddps %ymm0, %ymm1, %ymm2 +# CHECK-NEXT: - - 0.50 0.50 0.50 0.50 - 2.00 - - vhaddps (%rax), %ymm1, %ymm2 +# CHECK-NEXT: - - 0.50 0.50 - - - 2.00 - - vhsubpd %xmm0, %xmm1, %xmm2 +# CHECK-NEXT: - - 0.50 0.50 0.50 0.50 - 2.00 - - vhsubpd (%rax), %xmm1, %xmm2 +# CHECK-NEXT: - - 0.50 0.50 - - - 2.00 - - vhsubpd %ymm0, %ymm1, %ymm2 +# CHECK-NEXT: - - 0.50 0.50 0.50 0.50 - 2.00 - - vhsubpd (%rax), %ymm1, %ymm2 +# CHECK-NEXT: - - 0.50 0.50 - - - 2.00 - - vhsubps %xmm0, %xmm1, %xmm2 +# CHECK-NEXT: - - 0.50 0.50 0.50 0.50 - 2.00 - - vhsubps (%rax), %xmm1, %xmm2 +# CHECK-NEXT: - - 0.50 0.50 - - - 2.00 - - vhsubps %ymm0, %ymm1, %ymm2 +# CHECK-NEXT: - - 0.50 0.50 0.50 0.50 - 2.00 - - vhsubps (%rax), %ymm1, 
%ymm2 # CHECK-NEXT: - - - - - - - 1.00 - - vinsertf128 $1, %xmm0, %ymm1, %ymm2 # CHECK-NEXT: - - 0.33 0.33 0.50 0.50 - 0.33 - - vinsertf128 $1, (%rax), %ymm1, %ymm2 # CHECK-NEXT: - - - - - - - 1.00 - - vinsertps $1, %xmm0, %xmm1, %xmm2 diff --git a/llvm/test/tools/llvm-mca/X86/SkylakeServer/resources-sse3.s b/llvm/test/tools/llvm-mca/X86/SkylakeServer/resources-sse3.s index e6bec19..0b6b035 100644 --- a/llvm/test/tools/llvm-mca/X86/SkylakeServer/resources-sse3.s +++ b/llvm/test/tools/llvm-mca/X86/SkylakeServer/resources-sse3.s @@ -79,7 +79,7 @@ mwait # CHECK: Resource pressure per iteration: # CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6] [7] [8] [9] -# CHECK-NEXT: - - 6.67 6.67 5.00 5.00 - 24.67 4.00 - +# CHECK-NEXT: - - 8.00 8.00 5.00 5.00 - 22.00 4.00 - # CHECK: Resource pressure by instruction: # CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6] [7] [8] [9] Instructions: @@ -87,14 +87,14 @@ mwait # CHECK-NEXT: - - 0.50 0.50 0.50 0.50 - - - - addsubpd (%rax), %xmm2 # CHECK-NEXT: - - 0.50 0.50 - - - - - - addsubps %xmm0, %xmm2 # CHECK-NEXT: - - 0.50 0.50 0.50 0.50 - - - - addsubps (%rax), %xmm2 -# CHECK-NEXT: - - 0.33 0.33 - - - 2.33 - - haddpd %xmm0, %xmm2 -# CHECK-NEXT: - - 0.33 0.33 0.50 0.50 - 2.33 - - haddpd (%rax), %xmm2 -# CHECK-NEXT: - - 0.33 0.33 - - - 2.33 - - haddps %xmm0, %xmm2 -# CHECK-NEXT: - - 0.33 0.33 0.50 0.50 - 2.33 - - haddps (%rax), %xmm2 -# CHECK-NEXT: - - 0.33 0.33 - - - 2.33 - - hsubpd %xmm0, %xmm2 -# CHECK-NEXT: - - 0.33 0.33 0.50 0.50 - 2.33 - - hsubpd (%rax), %xmm2 -# CHECK-NEXT: - - 0.33 0.33 - - - 2.33 - - hsubps %xmm0, %xmm2 -# CHECK-NEXT: - - 0.33 0.33 0.50 0.50 - 2.33 - - hsubps (%rax), %xmm2 +# CHECK-NEXT: - - 0.50 0.50 - - - 2.00 - - haddpd %xmm0, %xmm2 +# CHECK-NEXT: - - 0.50 0.50 0.50 0.50 - 2.00 - - haddpd (%rax), %xmm2 +# CHECK-NEXT: - - 0.50 0.50 - - - 2.00 - - haddps %xmm0, %xmm2 +# CHECK-NEXT: - - 0.50 0.50 0.50 0.50 - 2.00 - - haddps (%rax), %xmm2 +# CHECK-NEXT: - - 0.50 0.50 - - - 2.00 - - hsubpd %xmm0, %xmm2 +# CHECK-NEXT: - - 0.50 
0.50 0.50 0.50 - 2.00 - - hsubpd (%rax), %xmm2 +# CHECK-NEXT: - - 0.50 0.50 - - - 2.00 - - hsubps %xmm0, %xmm2 +# CHECK-NEXT: - - 0.50 0.50 0.50 0.50 - 2.00 - - hsubps (%rax), %xmm2 # CHECK-NEXT: - - - - 0.50 0.50 - - - - lddqu (%rax), %xmm2 # CHECK-NEXT: - - 0.25 0.25 - - - 0.25 0.25 - monitor # CHECK-NEXT: - - - - - - - 1.00 - - movddup %xmm0, %xmm2 diff --git a/llvm/test/tools/llvm-objdump/ELF/AMDGPU/subtarget.ll b/llvm/test/tools/llvm-objdump/ELF/AMDGPU/subtarget.ll index f1262c5..f79f358 100644 --- a/llvm/test/tools/llvm-objdump/ELF/AMDGPU/subtarget.ll +++ b/llvm/test/tools/llvm-objdump/ELF/AMDGPU/subtarget.ll @@ -28,6 +28,11 @@ define amdgpu_kernel void @test_kernel() { ; RUN: llvm-objdump -D -mllvm --amdhsa-code-object-version=6 %t.o > %t-detect.txt ; RUN: diff %t-specify.txt %t-detect.txt +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1152 -filetype=obj -O0 -o %t.o %s +; RUN: llvm-objdump -D --arch-name=amdgcn --mcpu=gfx1152 %t.o > %t-specify.txt +; RUN: llvm-objdump -D %t.o > %t-detect.txt +; RUN: diff %t-specify.txt %t-detect.txt + ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1151 -filetype=obj -O0 -o %t.o %s ; RUN: llvm-objdump -D --arch-name=amdgcn --mcpu=gfx1151 %t.o > %t-specify.txt ; RUN: llvm-objdump -D %t.o > %t-detect.txt diff --git a/llvm/test/tools/llvm-rc/windres-prefix.test b/llvm/test/tools/llvm-rc/windres-prefix.test index 4c53fdf..7dda51d 100644 --- a/llvm/test/tools/llvm-rc/windres-prefix.test +++ b/llvm/test/tools/llvm-rc/windres-prefix.test @@ -1,5 +1,3 @@ -; REQUIRES: shell - ; RUN: rm -rf %t && mkdir %t ; Check that a triple prefix on the executable gets picked up as target triple. 
diff --git a/llvm/test/tools/llvm-readobj/ELF/AMDGPU/elf-headers.test b/llvm/test/tools/llvm-readobj/ELF/AMDGPU/elf-headers.test index f9e90e2..50d437b 100644 --- a/llvm/test/tools/llvm-readobj/ELF/AMDGPU/elf-headers.test +++ b/llvm/test/tools/llvm-readobj/ELF/AMDGPU/elf-headers.test @@ -421,6 +421,15 @@ # RUN: yaml2obj %s -o %t -DABI_VERSION=2 -DFLAG_NAME=EF_AMDGPU_MACH_AMDGCN_GFX1151 # RUN: llvm-readobj -h %t | FileCheck %s --check-prefixes=ALL,KNOWN-ABI-VERSION,SINGLE-FLAG --match-full-lines -DABI_VERSION=2 -DFILE=%t -DFLAG_NAME=EF_AMDGPU_MACH_AMDGCN_GFX1151 -DFLAG_VALUE=0x4A +# RUN: yaml2obj %s -o %t -DABI_VERSION=0 -DFLAG_NAME=EF_AMDGPU_MACH_AMDGCN_GFX1152 +# RUN: llvm-readobj -h %t | FileCheck %s --check-prefixes=ALL,KNOWN-ABI-VERSION,SINGLE-FLAG --match-full-lines -DABI_VERSION=0 -DFILE=%t -DFLAG_NAME=EF_AMDGPU_MACH_AMDGCN_GFX1152 -DFLAG_VALUE=0x55 + +# RUN: yaml2obj %s -o %t -DABI_VERSION=1 -DFLAG_NAME=EF_AMDGPU_MACH_AMDGCN_GFX1152 +# RUN: llvm-readobj -h %t | FileCheck %s --check-prefixes=ALL,KNOWN-ABI-VERSION,SINGLE-FLAG --match-full-lines -DABI_VERSION=1 -DFILE=%t -DFLAG_NAME=EF_AMDGPU_MACH_AMDGCN_GFX1152 -DFLAG_VALUE=0x55 + +# RUN: yaml2obj %s -o %t -DABI_VERSION=2 -DFLAG_NAME=EF_AMDGPU_MACH_AMDGCN_GFX1152 +# RUN: llvm-readobj -h %t | FileCheck %s --check-prefixes=ALL,KNOWN-ABI-VERSION,SINGLE-FLAG --match-full-lines -DABI_VERSION=2 -DFILE=%t -DFLAG_NAME=EF_AMDGPU_MACH_AMDGCN_GFX1152 -DFLAG_VALUE=0x55 + # RUN: yaml2obj %s -o %t -DABI_VERSION=0 -DFLAG_NAME=EF_AMDGPU_MACH_AMDGCN_GFX1200 # RUN: llvm-readobj -h %t | FileCheck %s --check-prefixes=ALL,KNOWN-ABI-VERSION,SINGLE-FLAG --match-full-lines -DABI_VERSION=0 -DFILE=%t -DFLAG_NAME=EF_AMDGPU_MACH_AMDGCN_GFX1200 -DFLAG_VALUE=0x48 diff --git a/llvm/test/tools/llvm-reduce/remove-debug-info-nodes.ll b/llvm/test/tools/llvm-reduce/remove-debug-info-nodes.ll index 1ceeca8..127543c 100644 --- a/llvm/test/tools/llvm-reduce/remove-debug-info-nodes.ll +++ b/llvm/test/tools/llvm-reduce/remove-debug-info-nodes.ll @@ 
-2,7 +2,7 @@ ; DICompileUnit and DISuprogram. ; ; RUN: llvm-reduce --delta-passes=di-metadata --abort-on-invalid-reduction --test FileCheck --test-arg --check-prefixes=CHECK-INTERESTINGNESS --test-arg %s --test-arg --input-file %s -o %t -; RUN: FileCheck <%t --enable-var-scope %s +; RUN: FileCheck <%t --enable-var-scope %s --implicit-check-not=DIGlobalVariableExpression ; CHECK-INTERESTINGNESS: define void @test() !dbg [[SUBPROG:![0-9]+]] ; CHECK-INTERESTINGNESS: !llvm.module.flags = !{ @@ -21,12 +21,10 @@ ; CHECK: !llvm.dbg.cu = !{[[CU:.+]]} -; CHECK-DAG: [[CU]] = distinct !DICompileUnit(language: DW_LANG_C99,{{.*}}, retainedTypes: [[TYPES:![0-9]+]], globals: [[GLOBALS:![0-9]+]] -; CHECK-DAG: [[EMPTY:![0-9]+]] = !{} +; CHECK-DAG: [[CU]] = distinct !DICompileUnit(language: DW_LANG_C99,{{.*}}, retainedTypes: [[TYPES:![0-9]+]], globals: [[EMPTY:![0-9]+]] +; CHECK-DAG: [[EMPTY]] = !{} ; CHECK-DAG: [[TYPES]] = !{[[T0:![0-9]+]] ; CHECK-DAG: [[T0]] = !DIBasicType(name: "unsigned int", -; CHECK-DAG: [[GLOBALS]] = !{{{![0-9]+}} - ; CHECK-DAG: [[SUBPROG]] = distinct !DISubprogram(name: "test", {{.*}}retainedNodes: [[EMPTY]]) define void @test() !dbg !17 { diff --git a/llvm/test/tools/split-file/output-is-special.test b/llvm/test/tools/split-file/output-is-special.test index 98bb4d3..0b1e0f7 100644 --- a/llvm/test/tools/split-file/output-is-special.test +++ b/llvm/test/tools/split-file/output-is-special.test @@ -1,5 +1,4 @@ # UNSUPPORTED: system-windows -# REQUIRES: shell ## Don't delete the output if it is special, otherwise root may accidentally ## remove important special files. 
diff --git a/llvm/tools/llvm-readobj/ELFDumper.cpp b/llvm/tools/llvm-readobj/ELFDumper.cpp index 40494da..c696934 100644 --- a/llvm/tools/llvm-readobj/ELFDumper.cpp +++ b/llvm/tools/llvm-readobj/ELFDumper.cpp @@ -1631,6 +1631,7 @@ const EnumEntry<unsigned> ElfHeaderMipsFlags[] = { ENUM_ENT(EF_AMDGPU_MACH_AMDGCN_GFX1103, "gfx1103"), \ ENUM_ENT(EF_AMDGPU_MACH_AMDGCN_GFX1150, "gfx1150"), \ ENUM_ENT(EF_AMDGPU_MACH_AMDGCN_GFX1151, "gfx1151"), \ + ENUM_ENT(EF_AMDGPU_MACH_AMDGCN_GFX1152, "gfx1152"), \ ENUM_ENT(EF_AMDGPU_MACH_AMDGCN_GFX1200, "gfx1200"), \ ENUM_ENT(EF_AMDGPU_MACH_AMDGCN_GFX1201, "gfx1201"), \ ENUM_ENT(EF_AMDGPU_MACH_AMDGCN_GFX9_GENERIC, "gfx9-generic"), \ diff --git a/llvm/tools/llvm-reduce/deltas/ReduceDIMetadata.cpp b/llvm/tools/llvm-reduce/deltas/ReduceDIMetadata.cpp index f4d8496..38352d634 100644 --- a/llvm/tools/llvm-reduce/deltas/ReduceDIMetadata.cpp +++ b/llvm/tools/llvm-reduce/deltas/ReduceDIMetadata.cpp @@ -65,12 +65,13 @@ void identifyUninterestingMDNodes(Oracle &O, MDNodeList &MDs) { SmallVector<Metadata *, 16> TN; for (size_t I = 0; I < Tup->getNumOperands(); ++I) { // Ignore any operands that are not DebugInfo metadata nodes. - if (isa_and_nonnull<DINode>(Tup->getOperand(I))) - // Don't add uninteresting operands to the tuple. - if (!O.shouldKeep()) - continue; - - TN.push_back(Tup->getOperand(I)); + if (Metadata *Op = Tup->getOperand(I).get()) { + if (isa<DINode>(Op) || isa<DIGlobalVariableExpression>(Op)) + // Don't add uninteresting operands to the tuple. 
+ if (!O.shouldKeep()) + continue; + TN.push_back(Op); + } } if (TN.size() != Tup->getNumOperands()) DbgNode->replaceOperandWith(OpIdx, DbgNode->get(DbgNode->getContext(), TN)); diff --git a/llvm/unittests/Bitcode/DataLayoutUpgradeTest.cpp b/llvm/unittests/Bitcode/DataLayoutUpgradeTest.cpp index 7148e29..ca50187 100644 --- a/llvm/unittests/Bitcode/DataLayoutUpgradeTest.cpp +++ b/llvm/unittests/Bitcode/DataLayoutUpgradeTest.cpp @@ -63,6 +63,11 @@ TEST(DataLayoutUpgradeTest, ValidDataLayoutUpgrade) { "riscv64"), "e-m:e-p:64:64-i64:64-i128:128-n32:64-S128"); + // Check that LoongArch64 upgrades -n64 to -n32:64. + EXPECT_EQ(UpgradeDataLayoutString("e-m:e-p:64:64-i64:64-i128:128-n64-S128", + "loongarch64"), + "e-m:e-p:64:64-i64:64-i128:128-n32:64-S128"); + // Check that SPIR && SPIRV targets add -G1 if it's not present. EXPECT_EQ(UpgradeDataLayoutString("e-p:32:32", "spir"), "e-p:32:32-G1"); EXPECT_EQ(UpgradeDataLayoutString("e-p:32:32", "spir64"), "e-p:32:32-G1"); diff --git a/llvm/unittests/Support/VirtualFileSystemTest.cpp b/llvm/unittests/Support/VirtualFileSystemTest.cpp index e9fd967..9e9b4fb 100644 --- a/llvm/unittests/Support/VirtualFileSystemTest.cpp +++ b/llvm/unittests/Support/VirtualFileSystemTest.cpp @@ -1138,6 +1138,11 @@ TEST_F(InMemoryFileSystemTest, DuplicatedFile) { ASSERT_FALSE(FS.addFile("/a/b", 0, MemoryBuffer::getMemBuffer("a"))); ASSERT_TRUE(FS.addFile("/a", 0, MemoryBuffer::getMemBuffer("a"))); ASSERT_FALSE(FS.addFile("/a", 0, MemoryBuffer::getMemBuffer("b"))); + ASSERT_TRUE(FS.addFile("/b/c/d", 0, MemoryBuffer::getMemBuffer("a"))); + ASSERT_FALSE(FS.addFile("/b/c", 0, MemoryBuffer::getMemBuffer("a"))); + ASSERT_TRUE(FS.addFile( + "/b/c", 0, MemoryBuffer::getMemBuffer(""), /*User=*/std::nullopt, + /*Group=*/std::nullopt, sys::fs::file_type::directory_file)); } TEST_F(InMemoryFileSystemTest, DirectoryIteration) { diff --git a/llvm/unittests/TargetParser/Host.cpp b/llvm/unittests/TargetParser/Host.cpp index 6aa1d7a..61921a9 100644 --- 
a/llvm/unittests/TargetParser/Host.cpp +++ b/llvm/unittests/TargetParser/Host.cpp @@ -125,6 +125,9 @@ TEST(getLinuxHostCPUName, AArch64) { EXPECT_EQ(sys::detail::getHostCPUNameForARM("CPU implementer : 0xc0\n" "CPU part : 0xac5"), "ampere1b"); + EXPECT_EQ(sys::detail::getHostCPUNameForARM("CPU implementer : 0x51\n" + "CPU part : 0x001"), + "oryon-1"); // MSM8992/4 weirdness StringRef MSM8992ProcCpuInfo = R"( diff --git a/llvm/unittests/TargetParser/TargetParserTest.cpp b/llvm/unittests/TargetParser/TargetParserTest.cpp index 797d7df..571031d 100644 --- a/llvm/unittests/TargetParser/TargetParserTest.cpp +++ b/llvm/unittests/TargetParser/TargetParserTest.cpp @@ -1815,11 +1815,23 @@ INSTANTIATE_TEST_SUITE_P( {AArch64::AEK_CRC, AArch64::AEK_AES, AArch64::AEK_SHA2, AArch64::AEK_FP, AArch64::AEK_SIMD, AArch64::AEK_FP16, AArch64::AEK_RAS, AArch64::AEK_LSE, AArch64::AEK_RDM}), - "8.2-A")), + "8.2-A"), + ARMCPUTestParams<AArch64::ExtensionBitset>( + "oryon-1", "armv8.6-a", "crypto-neon-fp-armv8", + (AArch64::ExtensionBitset( + {AArch64::AEK_CRC, AArch64::AEK_FP, AArch64::AEK_PAUTH, + AArch64::AEK_FCMA, AArch64::AEK_JSCVT, AArch64::AEK_SIMD, + AArch64::AEK_RAS, AArch64::AEK_LSE, AArch64::AEK_RDM, + AArch64::AEK_RCPC, AArch64::AEK_DOTPROD, AArch64::AEK_SM4, + AArch64::AEK_SHA3, AArch64::AEK_BF16, AArch64::AEK_SHA2, + AArch64::AEK_AES, AArch64::AEK_I8MM, AArch64::AEK_RAND, + AArch64::AEK_PROFILE, AArch64::AEK_CRYPTO})), + "8.6-A")), + ARMCPUTestParams<AArch64::ExtensionBitset>::PrintToStringParamName); // Note: number of CPUs includes aliases. 
-static constexpr unsigned NumAArch64CPUArchs = 76; +static constexpr unsigned NumAArch64CPUArchs = 77; TEST(TargetParserTest, testAArch64CPUArchList) { SmallVector<StringRef, NumAArch64CPUArchs> List; diff --git a/llvm/utils/gn/secondary/bolt/lib/Core/BUILD.gn b/llvm/utils/gn/secondary/bolt/lib/Core/BUILD.gn index 210dd12..e88df02 100644 --- a/llvm/utils/gn/secondary/bolt/lib/Core/BUILD.gn +++ b/llvm/utils/gn/secondary/bolt/lib/Core/BUILD.gn @@ -29,6 +29,7 @@ static_library("Core") { "DynoStats.cpp", "Exceptions.cpp", "FunctionLayout.cpp", + "GDBIndex.cpp", "HashUtilities.cpp", "JumpTable.cpp", "MCPlusBuilder.cpp", diff --git a/llvm/utils/gn/secondary/clang/lib/AST/BUILD.gn b/llvm/utils/gn/secondary/clang/lib/AST/BUILD.gn index 0cf9925..d3a3ee75 100644 --- a/llvm/utils/gn/secondary/clang/lib/AST/BUILD.gn +++ b/llvm/utils/gn/secondary/clang/lib/AST/BUILD.gn @@ -110,6 +110,7 @@ static_library("AST") { "Interp/InterpShared.cpp", "Interp/InterpStack.cpp", "Interp/InterpState.cpp", + "Interp/MemberPointer.cpp", "Interp/Pointer.cpp", "Interp/PrimType.cpp", "Interp/Program.cpp", diff --git a/llvm/utils/gn/secondary/llvm/lib/Transforms/IPO/BUILD.gn b/llvm/utils/gn/secondary/llvm/lib/Transforms/IPO/BUILD.gn index 0d134c7..bcf2ea7 100644 --- a/llvm/utils/gn/secondary/llvm/lib/Transforms/IPO/BUILD.gn +++ b/llvm/utils/gn/secondary/llvm/lib/Transforms/IPO/BUILD.gn @@ -33,6 +33,7 @@ static_library("IPO") { "DeadArgumentElimination.cpp", "ElimAvailExtern.cpp", "EmbedBitcodePass.cpp", + "ExpandVariadics.cpp", "ExtractGV.cpp", "ForceFunctionAttrs.cpp", "FunctionAttrs.cpp", diff --git a/llvm/utils/lit/lit/llvm/config.py b/llvm/utils/lit/lit/llvm/config.py index 1d4babc..afb7f07 100644 --- a/llvm/utils/lit/lit/llvm/config.py +++ b/llvm/utils/lit/lit/llvm/config.py @@ -588,7 +588,10 @@ class LLVMConfig(object): if getattr(self.config, pp, None) ] - self.with_environment("LD_LIBRARY_PATH", lib_paths, append_path=True) + if platform.system() == "AIX": + self.with_environment("LIBPATH", 
lib_paths, append_path=True) + else: + self.with_environment("LD_LIBRARY_PATH", lib_paths, append_path=True) shl = getattr(self.config, "llvm_shlib_dir", None) pext = getattr(self.config, "llvm_plugin_ext", None) diff --git a/mlir/include/mlir-c/IR.h b/mlir/include/mlir-c/IR.h index 32abacf..e3d69b7 100644 --- a/mlir/include/mlir-c/IR.h +++ b/mlir/include/mlir-c/IR.h @@ -858,6 +858,9 @@ MLIR_CAPI_EXPORTED MlirValue mlirBlockAddArgument(MlirBlock block, MlirType type, MlirLocation loc); +/// Erase the argument at 'index' and remove it from the argument list. +MLIR_CAPI_EXPORTED void mlirBlockEraseArgument(MlirBlock block, unsigned index); + /// Inserts an argument of the specified type at a specified index to the block. /// Returns the newly added argument. MLIR_CAPI_EXPORTED MlirValue mlirBlockInsertArgument(MlirBlock block, diff --git a/mlir/include/mlir/Dialect/LLVMIR/FunctionCallUtils.h b/mlir/include/mlir/Dialect/LLVMIR/FunctionCallUtils.h index 123ce36..852490c 100644 --- a/mlir/include/mlir/Dialect/LLVMIR/FunctionCallUtils.h +++ b/mlir/include/mlir/Dialect/LLVMIR/FunctionCallUtils.h @@ -33,36 +33,36 @@ class LLVMFuncOp; /// external C function calls. The list of functions provided here must be /// implemented separately (e.g. as part of a support runtime library or as part /// of the libc). 
-LLVM::LLVMFuncOp lookupOrCreatePrintI64Fn(ModuleOp moduleOp); -LLVM::LLVMFuncOp lookupOrCreatePrintU64Fn(ModuleOp moduleOp); -LLVM::LLVMFuncOp lookupOrCreatePrintF16Fn(ModuleOp moduleOp); -LLVM::LLVMFuncOp lookupOrCreatePrintBF16Fn(ModuleOp moduleOp); -LLVM::LLVMFuncOp lookupOrCreatePrintF32Fn(ModuleOp moduleOp); -LLVM::LLVMFuncOp lookupOrCreatePrintF64Fn(ModuleOp moduleOp); +LLVM::LLVMFuncOp lookupOrCreatePrintI64Fn(Operation *moduleOp); +LLVM::LLVMFuncOp lookupOrCreatePrintU64Fn(Operation *moduleOp); +LLVM::LLVMFuncOp lookupOrCreatePrintF16Fn(Operation *moduleOp); +LLVM::LLVMFuncOp lookupOrCreatePrintBF16Fn(Operation *moduleOp); +LLVM::LLVMFuncOp lookupOrCreatePrintF32Fn(Operation *moduleOp); +LLVM::LLVMFuncOp lookupOrCreatePrintF64Fn(Operation *moduleOp); /// Declares a function to print a C-string. /// If a custom runtime function is defined via `runtimeFunctionName`, it must /// have the signature void(char const*). The default function is `printString`. LLVM::LLVMFuncOp -lookupOrCreatePrintStringFn(ModuleOp moduleOp, +lookupOrCreatePrintStringFn(Operation *moduleOp, std::optional<StringRef> runtimeFunctionName = {}); -LLVM::LLVMFuncOp lookupOrCreatePrintOpenFn(ModuleOp moduleOp); -LLVM::LLVMFuncOp lookupOrCreatePrintCloseFn(ModuleOp moduleOp); -LLVM::LLVMFuncOp lookupOrCreatePrintCommaFn(ModuleOp moduleOp); -LLVM::LLVMFuncOp lookupOrCreatePrintNewlineFn(ModuleOp moduleOp); -LLVM::LLVMFuncOp lookupOrCreateMallocFn(ModuleOp moduleOp, Type indexType); -LLVM::LLVMFuncOp lookupOrCreateAlignedAllocFn(ModuleOp moduleOp, +LLVM::LLVMFuncOp lookupOrCreatePrintOpenFn(Operation *moduleOp); +LLVM::LLVMFuncOp lookupOrCreatePrintCloseFn(Operation *moduleOp); +LLVM::LLVMFuncOp lookupOrCreatePrintCommaFn(Operation *moduleOp); +LLVM::LLVMFuncOp lookupOrCreatePrintNewlineFn(Operation *moduleOp); +LLVM::LLVMFuncOp lookupOrCreateMallocFn(Operation *moduleOp, Type indexType); +LLVM::LLVMFuncOp lookupOrCreateAlignedAllocFn(Operation *moduleOp, Type indexType); -LLVM::LLVMFuncOp 
lookupOrCreateFreeFn(ModuleOp moduleOp); -LLVM::LLVMFuncOp lookupOrCreateGenericAllocFn(ModuleOp moduleOp, +LLVM::LLVMFuncOp lookupOrCreateFreeFn(Operation *moduleOp); +LLVM::LLVMFuncOp lookupOrCreateGenericAllocFn(Operation *moduleOp, Type indexType); -LLVM::LLVMFuncOp lookupOrCreateGenericAlignedAllocFn(ModuleOp moduleOp, +LLVM::LLVMFuncOp lookupOrCreateGenericAlignedAllocFn(Operation *moduleOp, Type indexType); -LLVM::LLVMFuncOp lookupOrCreateGenericFreeFn(ModuleOp moduleOp); -LLVM::LLVMFuncOp lookupOrCreateMemRefCopyFn(ModuleOp moduleOp, Type indexType, +LLVM::LLVMFuncOp lookupOrCreateGenericFreeFn(Operation *moduleOp); +LLVM::LLVMFuncOp lookupOrCreateMemRefCopyFn(Operation *moduleOp, Type indexType, Type unrankedDescriptorType); /// Create a FuncOp with signature `resultType`(`paramTypes`)` and name `name`. -LLVM::LLVMFuncOp lookupOrCreateFn(ModuleOp moduleOp, StringRef name, +LLVM::LLVMFuncOp lookupOrCreateFn(Operation *moduleOp, StringRef name, ArrayRef<Type> paramTypes = {}, Type resultType = {}, bool isVarArg = false); diff --git a/mlir/include/mlir/Dialect/Tensor/TransformOps/TensorTransformOps.td b/mlir/include/mlir/Dialect/Tensor/TransformOps/TensorTransformOps.td index fea5afa..81bab1b 100644 --- a/mlir/include/mlir/Dialect/Tensor/TransformOps/TensorTransformOps.td +++ b/mlir/include/mlir/Dialect/Tensor/TransformOps/TensorTransformOps.td @@ -114,12 +114,14 @@ def ApplyReassociativeReshapeFoldingPatternsOp : Op<Transform_Dialect, def ApplyRewriteTensorOpsAsConstantPatternsOp : Op<Transform_Dialect, "apply_patterns.tensor.rewrite_as_constant", [DeclareOpInterfaceMethods<PatternDescriptorOpInterface>]> { + let arguments = (ins UnitAttr:$aggressive); let description = [{ Indicates that tensor ops (such as tensor.generate) should be replaced with constants (arith.constant) when possible. }]; - let assemblyFormat = "attr-dict"; + let assemblyFormat = + "(`aggressive` $aggressive^)? 
attr-dict"; } def Transform_TensorPadOp : Transform_ConcreteOpType<"tensor.pad">; diff --git a/mlir/include/mlir/Dialect/Tensor/Transforms/Transforms.h b/mlir/include/mlir/Dialect/Tensor/Transforms/Transforms.h index 7dabc26..7f983b8 100644 --- a/mlir/include/mlir/Dialect/Tensor/Transforms/Transforms.h +++ b/mlir/include/mlir/Dialect/Tensor/Transforms/Transforms.h @@ -91,9 +91,12 @@ void populateSimplifyPackAndUnpackPatterns(RewritePatternSet &patterns); /// respectively. void populateFoldIntoPackAndUnpackPatterns(RewritePatternSet &patterns); +using ControlFoldFn = std::function<bool(OpOperand *)>; + /// Populates `patterns` with patterns that replace tensor ops (such as /// tensor.generate) with constants when possible. -void populateRewriteAsConstantPatterns(RewritePatternSet &patterns); +void populateRewriteAsConstantPatterns(RewritePatternSet &patterns, + const ControlFoldFn &controlFn); //===----------------------------------------------------------------------===// // Transform helpers diff --git a/mlir/include/mlir/Target/LLVMIR/Export.h b/mlir/include/mlir/Target/LLVMIR/Export.h index 2244968..893aaaa 100644 --- a/mlir/include/mlir/Target/LLVMIR/Export.h +++ b/mlir/include/mlir/Target/LLVMIR/Export.h @@ -20,10 +20,11 @@ class Module; namespace mlir { class Operation; -/// Translate operation that satisfies LLVM dialect module requirements into an -/// LLVM IR module living in the given context. This translates operations from -/// any dilalect that has a registered implementation of -/// LLVMTranslationDialectInterface. +/// Translates a given LLVM dialect `module` into an LLVM IR module living in +/// the given context. Operates on any operation from dialects that provide a +/// registered implementation of the LLVMTranslationDialectInterface. Returns +/// nullptr when the translation fails. +/// Verifies the produced LLVM module, except when `disableVerification` is set. 
std::unique_ptr<llvm::Module> translateModuleToLLVMIR(Operation *module, llvm::LLVMContext &llvmContext, llvm::StringRef name = "LLVMDialectModule", diff --git a/mlir/lib/Bindings/Python/IRCore.cpp b/mlir/lib/Bindings/Python/IRCore.cpp index de20632..4b6b54d 100644 --- a/mlir/lib/Bindings/Python/IRCore.cpp +++ b/mlir/lib/Bindings/Python/IRCore.cpp @@ -3238,6 +3238,19 @@ void mlir::python::populateIRCore(py::module &m) { return PyBlockArgumentList(self.getParentOperation(), self.get()); }, "Returns a list of block arguments.") + .def( + "add_argument", + [](PyBlock &self, const PyType &type, const PyLocation &loc) { + return mlirBlockAddArgument(self.get(), type, loc); + }, + "Append an argument of the specified type to the block and returns " + "the newly added argument.") + .def( + "erase_argument", + [](PyBlock &self, unsigned index) { + return mlirBlockEraseArgument(self.get(), index); + }, + "Erase the argument at 'index' and remove it from the argument list.") .def_property_readonly( "operations", [](PyBlock &self) { diff --git a/mlir/lib/CAPI/IR/IR.cpp b/mlir/lib/CAPI/IR/IR.cpp index a72cd24..4e823c8 100644 --- a/mlir/lib/CAPI/IR/IR.cpp +++ b/mlir/lib/CAPI/IR/IR.cpp @@ -906,6 +906,10 @@ MlirValue mlirBlockAddArgument(MlirBlock block, MlirType type, return wrap(unwrap(block)->addArgument(unwrap(type), unwrap(loc))); } +void mlirBlockEraseArgument(MlirBlock block, unsigned index) { + return unwrap(block)->eraseArgument(index); +} + MlirValue mlirBlockInsertArgument(MlirBlock block, intptr_t pos, MlirType type, MlirLocation loc) { return wrap(unwrap(block)->insertArgument(pos, unwrap(type), unwrap(loc))); diff --git a/mlir/lib/Conversion/MemRefToLLVM/AllocLikeConversion.cpp b/mlir/lib/Conversion/MemRefToLLVM/AllocLikeConversion.cpp index b29abc9..e48ca51 100644 --- a/mlir/lib/Conversion/MemRefToLLVM/AllocLikeConversion.cpp +++ b/mlir/lib/Conversion/MemRefToLLVM/AllocLikeConversion.cpp @@ -10,18 +10,14 @@ #include "mlir/Analysis/DataLayoutAnalysis.h" #include 
"mlir/Dialect/LLVMIR/FunctionCallUtils.h" #include "mlir/Dialect/LLVMIR/LLVMDialect.h" +#include "mlir/IR/SymbolTable.h" using namespace mlir; namespace { -// TODO: Fix the LLVM utilities for looking up functions to take Operation* -// with SymbolTable trait instead of ModuleOp and make similar change here. This -// allows call sites to use getParentWithTrait<OpTrait::SymbolTable> instead -// of getParentOfType<ModuleOp> to pass down the operation. LLVM::LLVMFuncOp getNotalignedAllocFn(const LLVMTypeConverter *typeConverter, - ModuleOp module, Type indexType) { + Operation *module, Type indexType) { bool useGenericFn = typeConverter->getOptions().useGenericFunctions; - if (useGenericFn) return LLVM::lookupOrCreateGenericAllocFn(module, indexType); @@ -29,7 +25,7 @@ LLVM::LLVMFuncOp getNotalignedAllocFn(const LLVMTypeConverter *typeConverter, } LLVM::LLVMFuncOp getAlignedAllocFn(const LLVMTypeConverter *typeConverter, - ModuleOp module, Type indexType) { + Operation *module, Type indexType) { bool useGenericFn = typeConverter->getOptions().useGenericFunctions; if (useGenericFn) @@ -79,7 +75,8 @@ std::tuple<Value, Value> AllocationOpLLVMLowering::allocateBufferManuallyAlign( // Allocate the underlying buffer. 
Type elementPtrType = this->getElementPtrType(memRefType); LLVM::LLVMFuncOp allocFuncOp = getNotalignedAllocFn( - getTypeConverter(), op->getParentOfType<ModuleOp>(), getIndexType()); + getTypeConverter(), op->getParentWithTrait<OpTrait::SymbolTable>(), + getIndexType()); auto results = rewriter.create<LLVM::CallOp>(loc, allocFuncOp, sizeBytes); Value allocatedPtr = @@ -144,7 +141,8 @@ Value AllocationOpLLVMLowering::allocateBufferAutoAlign( Type elementPtrType = this->getElementPtrType(memRefType); LLVM::LLVMFuncOp allocFuncOp = getAlignedAllocFn( - getTypeConverter(), op->getParentOfType<ModuleOp>(), getIndexType()); + getTypeConverter(), op->getParentWithTrait<OpTrait::SymbolTable>(), + getIndexType()); auto results = rewriter.create<LLVM::CallOp>( loc, allocFuncOp, ValueRange({allocAlignment, sizeBytes})); diff --git a/mlir/lib/Dialect/LLVMIR/IR/FunctionCallUtils.cpp b/mlir/lib/Dialect/LLVMIR/IR/FunctionCallUtils.cpp index 0004c2e..88421a1 100644 --- a/mlir/lib/Dialect/LLVMIR/IR/FunctionCallUtils.cpp +++ b/mlir/lib/Dialect/LLVMIR/IR/FunctionCallUtils.cpp @@ -45,49 +45,53 @@ static constexpr llvm::StringRef kGenericFree = "_mlir_memref_to_llvm_free"; static constexpr llvm::StringRef kMemRefCopy = "memrefCopy"; /// Generic print function lookupOrCreate helper. 
-LLVM::LLVMFuncOp mlir::LLVM::lookupOrCreateFn(ModuleOp moduleOp, StringRef name, +LLVM::LLVMFuncOp mlir::LLVM::lookupOrCreateFn(Operation *moduleOp, + StringRef name, ArrayRef<Type> paramTypes, Type resultType, bool isVarArg) { - auto func = moduleOp.lookupSymbol<LLVM::LLVMFuncOp>(name); + assert(moduleOp->hasTrait<OpTrait::SymbolTable>() && + "expected SymbolTable operation"); + auto func = llvm::dyn_cast_or_null<LLVM::LLVMFuncOp>( + SymbolTable::lookupSymbolIn(moduleOp, name)); if (func) return func; - OpBuilder b(moduleOp.getBodyRegion()); + OpBuilder b(moduleOp->getRegion(0)); return b.create<LLVM::LLVMFuncOp>( moduleOp->getLoc(), name, LLVM::LLVMFunctionType::get(resultType, paramTypes, isVarArg)); } -LLVM::LLVMFuncOp mlir::LLVM::lookupOrCreatePrintI64Fn(ModuleOp moduleOp) { +LLVM::LLVMFuncOp mlir::LLVM::lookupOrCreatePrintI64Fn(Operation *moduleOp) { return lookupOrCreateFn(moduleOp, kPrintI64, IntegerType::get(moduleOp->getContext(), 64), LLVM::LLVMVoidType::get(moduleOp->getContext())); } -LLVM::LLVMFuncOp mlir::LLVM::lookupOrCreatePrintU64Fn(ModuleOp moduleOp) { +LLVM::LLVMFuncOp mlir::LLVM::lookupOrCreatePrintU64Fn(Operation *moduleOp) { return lookupOrCreateFn(moduleOp, kPrintU64, IntegerType::get(moduleOp->getContext(), 64), LLVM::LLVMVoidType::get(moduleOp->getContext())); } -LLVM::LLVMFuncOp mlir::LLVM::lookupOrCreatePrintF16Fn(ModuleOp moduleOp) { +LLVM::LLVMFuncOp mlir::LLVM::lookupOrCreatePrintF16Fn(Operation *moduleOp) { return lookupOrCreateFn(moduleOp, kPrintF16, IntegerType::get(moduleOp->getContext(), 16), // bits! LLVM::LLVMVoidType::get(moduleOp->getContext())); } -LLVM::LLVMFuncOp mlir::LLVM::lookupOrCreatePrintBF16Fn(ModuleOp moduleOp) { +LLVM::LLVMFuncOp mlir::LLVM::lookupOrCreatePrintBF16Fn(Operation *moduleOp) { return lookupOrCreateFn(moduleOp, kPrintBF16, IntegerType::get(moduleOp->getContext(), 16), // bits! 
LLVM::LLVMVoidType::get(moduleOp->getContext())); } -LLVM::LLVMFuncOp mlir::LLVM::lookupOrCreatePrintF32Fn(ModuleOp moduleOp) { +LLVM::LLVMFuncOp mlir::LLVM::lookupOrCreatePrintF32Fn(Operation *moduleOp) { return lookupOrCreateFn(moduleOp, kPrintF32, Float32Type::get(moduleOp->getContext()), LLVM::LLVMVoidType::get(moduleOp->getContext())); } -LLVM::LLVMFuncOp mlir::LLVM::lookupOrCreatePrintF64Fn(ModuleOp moduleOp) { +LLVM::LLVMFuncOp mlir::LLVM::lookupOrCreatePrintF64Fn(Operation *moduleOp) { return lookupOrCreateFn(moduleOp, kPrintF64, Float64Type::get(moduleOp->getContext()), LLVM::LLVMVoidType::get(moduleOp->getContext())); @@ -103,72 +107,72 @@ static LLVM::LLVMPointerType getVoidPtr(MLIRContext *context) { } LLVM::LLVMFuncOp mlir::LLVM::lookupOrCreatePrintStringFn( - ModuleOp moduleOp, std::optional<StringRef> runtimeFunctionName) { + Operation *moduleOp, std::optional<StringRef> runtimeFunctionName) { return lookupOrCreateFn(moduleOp, runtimeFunctionName.value_or(kPrintString), getCharPtr(moduleOp->getContext()), LLVM::LLVMVoidType::get(moduleOp->getContext())); } -LLVM::LLVMFuncOp mlir::LLVM::lookupOrCreatePrintOpenFn(ModuleOp moduleOp) { +LLVM::LLVMFuncOp mlir::LLVM::lookupOrCreatePrintOpenFn(Operation *moduleOp) { return lookupOrCreateFn(moduleOp, kPrintOpen, {}, LLVM::LLVMVoidType::get(moduleOp->getContext())); } -LLVM::LLVMFuncOp mlir::LLVM::lookupOrCreatePrintCloseFn(ModuleOp moduleOp) { +LLVM::LLVMFuncOp mlir::LLVM::lookupOrCreatePrintCloseFn(Operation *moduleOp) { return lookupOrCreateFn(moduleOp, kPrintClose, {}, LLVM::LLVMVoidType::get(moduleOp->getContext())); } -LLVM::LLVMFuncOp mlir::LLVM::lookupOrCreatePrintCommaFn(ModuleOp moduleOp) { +LLVM::LLVMFuncOp mlir::LLVM::lookupOrCreatePrintCommaFn(Operation *moduleOp) { return lookupOrCreateFn(moduleOp, kPrintComma, {}, LLVM::LLVMVoidType::get(moduleOp->getContext())); } -LLVM::LLVMFuncOp mlir::LLVM::lookupOrCreatePrintNewlineFn(ModuleOp moduleOp) { +LLVM::LLVMFuncOp 
mlir::LLVM::lookupOrCreatePrintNewlineFn(Operation *moduleOp) { return lookupOrCreateFn(moduleOp, kPrintNewline, {}, LLVM::LLVMVoidType::get(moduleOp->getContext())); } -LLVM::LLVMFuncOp mlir::LLVM::lookupOrCreateMallocFn(ModuleOp moduleOp, +LLVM::LLVMFuncOp mlir::LLVM::lookupOrCreateMallocFn(Operation *moduleOp, Type indexType) { return LLVM::lookupOrCreateFn(moduleOp, kMalloc, indexType, getVoidPtr(moduleOp->getContext())); } -LLVM::LLVMFuncOp mlir::LLVM::lookupOrCreateAlignedAllocFn(ModuleOp moduleOp, +LLVM::LLVMFuncOp mlir::LLVM::lookupOrCreateAlignedAllocFn(Operation *moduleOp, Type indexType) { return LLVM::lookupOrCreateFn(moduleOp, kAlignedAlloc, {indexType, indexType}, getVoidPtr(moduleOp->getContext())); } -LLVM::LLVMFuncOp mlir::LLVM::lookupOrCreateFreeFn(ModuleOp moduleOp) { +LLVM::LLVMFuncOp mlir::LLVM::lookupOrCreateFreeFn(Operation *moduleOp) { return LLVM::lookupOrCreateFn( moduleOp, kFree, getVoidPtr(moduleOp->getContext()), LLVM::LLVMVoidType::get(moduleOp->getContext())); } -LLVM::LLVMFuncOp mlir::LLVM::lookupOrCreateGenericAllocFn(ModuleOp moduleOp, +LLVM::LLVMFuncOp mlir::LLVM::lookupOrCreateGenericAllocFn(Operation *moduleOp, Type indexType) { return LLVM::lookupOrCreateFn(moduleOp, kGenericAlloc, indexType, getVoidPtr(moduleOp->getContext())); } LLVM::LLVMFuncOp -mlir::LLVM::lookupOrCreateGenericAlignedAllocFn(ModuleOp moduleOp, +mlir::LLVM::lookupOrCreateGenericAlignedAllocFn(Operation *moduleOp, Type indexType) { return LLVM::lookupOrCreateFn(moduleOp, kGenericAlignedAlloc, {indexType, indexType}, getVoidPtr(moduleOp->getContext())); } -LLVM::LLVMFuncOp mlir::LLVM::lookupOrCreateGenericFreeFn(ModuleOp moduleOp) { +LLVM::LLVMFuncOp mlir::LLVM::lookupOrCreateGenericFreeFn(Operation *moduleOp) { return LLVM::lookupOrCreateFn( moduleOp, kGenericFree, getVoidPtr(moduleOp->getContext()), LLVM::LLVMVoidType::get(moduleOp->getContext())); } LLVM::LLVMFuncOp -mlir::LLVM::lookupOrCreateMemRefCopyFn(ModuleOp moduleOp, Type indexType, 
+mlir::LLVM::lookupOrCreateMemRefCopyFn(Operation *moduleOp, Type indexType, Type unrankedDescriptorType) { return LLVM::lookupOrCreateFn( moduleOp, kMemRefCopy, diff --git a/mlir/lib/Dialect/Tensor/TransformOps/TensorTransformOps.cpp b/mlir/lib/Dialect/Tensor/TransformOps/TensorTransformOps.cpp index 5c6a32c..33016f8 100644 --- a/mlir/lib/Dialect/Tensor/TransformOps/TensorTransformOps.cpp +++ b/mlir/lib/Dialect/Tensor/TransformOps/TensorTransformOps.cpp @@ -127,7 +127,20 @@ void transform::ApplyReassociativeReshapeFoldingPatternsOp::populatePatterns( void transform::ApplyRewriteTensorOpsAsConstantPatternsOp::populatePatterns( RewritePatternSet &patterns) { - tensor::populateRewriteAsConstantPatterns(patterns); + ControlFoldFn defaultControlFn = [](OpOperand *fusedOperand) { + Operation *producer = fusedOperand->get().getDefiningOp(); + return producer && producer->hasOneUse(); + }; + + ControlFoldFn aggressiveControlFn = [](OpOperand *fusedOperand) { + return true; + }; + + // Add folding with reshape by expansion patterns. + if (getAggressive()) + tensor::populateRewriteAsConstantPatterns(patterns, aggressiveControlFn); + else + tensor::populateRewriteAsConstantPatterns(patterns, defaultControlFn); } //===----------------------------------------------------------------------===// diff --git a/mlir/lib/Dialect/Tensor/Transforms/PackAndUnpackPatterns.cpp b/mlir/lib/Dialect/Tensor/Transforms/PackAndUnpackPatterns.cpp index 5d6e3ec..c681cad 100644 --- a/mlir/lib/Dialect/Tensor/Transforms/PackAndUnpackPatterns.cpp +++ b/mlir/lib/Dialect/Tensor/Transforms/PackAndUnpackPatterns.cpp @@ -48,6 +48,34 @@ static LogicalResult isPackOn1D(RewriterBase &rewriter, Operation *op, return success(); } +// If the `linalgOp` represents a transpose, return the permutation vector for +// the transpose. Otherwise, return failure. 
+static FailureOr<SmallVector<int64_t>> +getTransposeOpPermutation(linalg::LinalgOp linalgOp) { + if (auto transposeOp = dyn_cast<linalg::TransposeOp>(linalgOp.getOperation())) + return SmallVector<int64_t>(transposeOp.getPermutation()); + if (linalgOp.getNumParallelLoops() != linalgOp.getNumLoops()) + return failure(); + + if (linalgOp.getNumDpsInputs() != 1 || linalgOp.getNumDpsInits() != 1) + return failure(); + auto mapRange = linalgOp.getIndexingMapsArray(); + if (!mapRange.front().isPermutation() || !mapRange.back().isPermutation() || + mapRange.front() == mapRange.back()) { + return failure(); + } + if (!llvm::hasSingleElement(linalgOp.getBlock()->getOperations())) + return failure(); + AffineMap outMap = mapRange.back(); + AffineMap inMap = mapRange.front(); + // To get the permutation, look at each output index and find which + // dimension in the input we're reading from for that index. + return llvm::map_to_vector(outMap.getResults(), + [&](AffineExpr expr) -> int64_t { + return *inMap.getResultPosition(expr); + }); +} + /// Packing one-dimensional tensor can be expressed as an expand shape op. struct SimplifyPackToExpandShape : public OpRewritePattern<PackOp> { using OpRewritePattern<PackOp>::OpRewritePattern; @@ -246,14 +274,10 @@ static bool checkAndPermute(ArrayRef<int64_t> permutation, for (unsigned int i = 0; i < rank; ++i) { int64_t remappedPosition = permutation[i]; - - if (!inVec.empty()) { - if (remappedPosition >= rank) { - return false; - } + if (remappedPosition >= rank) + return false; + if (!inVec.empty()) remappedPosition = inVec[remappedPosition]; - } - resVec.push_back(remappedPosition); } @@ -263,20 +287,25 @@ static bool checkAndPermute(ArrayRef<int64_t> permutation, /// Fold 'pack' -> 'transpose' into 'pack' since 'pack' already has transpose /// semantics. 
struct FoldProducerPackWithConsumerLinalgTransposeOp - : public OpRewritePattern<linalg::TransposeOp> { - using OpRewritePattern<linalg::TransposeOp>::OpRewritePattern; + : public OpInterfaceRewritePattern<linalg::LinalgOp> { + using OpInterfaceRewritePattern<linalg::LinalgOp>::OpInterfaceRewritePattern; - LogicalResult matchAndRewrite(linalg::TransposeOp transposeOp, + LogicalResult matchAndRewrite(linalg::LinalgOp linalgOp, PatternRewriter &rewriter) const override { - auto packOp = transposeOp.getOperand(0).getDefiningOp<PackOp>(); + auto packOp = linalgOp->getOperand(0).getDefiningOp<PackOp>(); if (!packOp) return failure(); + FailureOr<SmallVector<int64_t>> maybePerm = + getTransposeOpPermutation(linalgOp); + if (failed(maybePerm)) + return failure(); + auto innerDimsPos = packOp.getInnerDimsPos(); auto mixedInnerTiles = packOp.getMixedTiles(); auto outerDimsPerm = packOp.getOuterDimsPerm(); - auto transposePerm = transposeOp.getPermutation(); + auto transposePerm = maybePerm.value(); SmallVector<int64_t> newOuterDimsPermVec; SmallVector<int64_t> newInnerDimsPosVec; SmallVector<OpFoldResult> newMixedInnerTilesVec; @@ -285,7 +314,7 @@ struct FoldProducerPackWithConsumerLinalgTransposeOp if (!checkAndPermute(transposePerm, outerDimsPerm, newOuterDimsPermVec, srcRank)) return rewriter.notifyMatchFailure( - transposeOp, + linalgOp, "Cannot fold in tensor.pack if a tile dimension was transposed " "with a non-tile dimension in linalg.transpose."); @@ -297,11 +326,11 @@ struct FoldProducerPackWithConsumerLinalgTransposeOp } Value output = packOp.createDestinationTensor( - rewriter, transposeOp.getLoc(), packOp.getSource(), - newMixedInnerTilesVec, newInnerDimsPosVec, newOuterDimsPermVec); + rewriter, linalgOp.getLoc(), packOp.getSource(), newMixedInnerTilesVec, + newInnerDimsPosVec, newOuterDimsPermVec); rewriter.replaceOpWithNewOp<PackOp>( - transposeOp, packOp.getSource(), output, newInnerDimsPosVec, + linalgOp, packOp.getSource(), output, newInnerDimsPosVec, 
newMixedInnerTilesVec, packOp.getPaddingValue(), newOuterDimsPermVec); return success(); @@ -316,12 +345,16 @@ struct FoldConsumerPackWithProducerLinalgTransposeOp LogicalResult matchAndRewrite(PackOp packOp, PatternRewriter &rewriter) const override { - auto transposeOp = packOp.getSource().getDefiningOp<linalg::TransposeOp>(); + auto linalgOp = packOp.getSource().getDefiningOp<linalg::LinalgOp>(); + if (!linalgOp) + return failure(); - if (!transposeOp) + FailureOr<SmallVector<int64_t>> maybePerm = + getTransposeOpPermutation(linalgOp); + if (failed(maybePerm)) return failure(); - auto transposePermutation = transposeOp.getPermutation(); + auto transposePermutation = maybePerm.value(); auto outerDimsPerm = packOp.getOuterDimsPerm(); auto innerDimsPos = packOp.getInnerDimsPos(); SmallVector<int64_t> newInnerDimsPosVec; @@ -337,11 +370,11 @@ struct FoldConsumerPackWithProducerLinalgTransposeOp newInnerDimsPosVec.push_back(transposePermutation[dim]); Value output = packOp.createDestinationTensor( - rewriter, packOp.getLoc(), transposeOp.getOperand(0), + rewriter, packOp.getLoc(), linalgOp->getOperand(0), packOp.getMixedTiles(), newInnerDimsPosVec, newOuterDimsPermVec); rewriter.replaceOpWithNewOp<PackOp>( - packOp, transposeOp.getOperand(0), output, newInnerDimsPosVec, + packOp, linalgOp->getOperand(0), output, newInnerDimsPosVec, packOp.getMixedTiles(), packOp.getPaddingValue(), newOuterDimsPermVec); return success(); @@ -351,34 +384,38 @@ struct FoldConsumerPackWithProducerLinalgTransposeOp /// Fold 'unpack' -> 'transpose' into 'unpack' since 'unpack' already has /// transpose semantics. 
struct FoldProducerUnPackWithConsumerLinalgTransposeOp - : public OpRewritePattern<linalg::TransposeOp> { - using OpRewritePattern<linalg::TransposeOp>::OpRewritePattern; + : public OpInterfaceRewritePattern<linalg::LinalgOp> { + using OpInterfaceRewritePattern<linalg::LinalgOp>::OpInterfaceRewritePattern; - LogicalResult matchAndRewrite(linalg::TransposeOp transposeOp, + LogicalResult matchAndRewrite(linalg::LinalgOp linalgOp, PatternRewriter &rewriter) const override { - auto unPackOp = transposeOp.getOperand(0).getDefiningOp<UnPackOp>(); + auto unPackOp = linalgOp->getOperand(0).getDefiningOp<UnPackOp>(); if (!unPackOp) return failure(); - auto transposePermutation = transposeOp.getPermutation(); + FailureOr<SmallVector<int64_t>> maybePerm = + getTransposeOpPermutation(linalgOp); + if (failed(maybePerm)) + return failure(); + auto outerDimsPerm = unPackOp.getOuterDimsPerm(); auto innerDimsPos = unPackOp.getInnerDimsPos(); SmallVector<int64_t> newInnerDimsPosVec; SmallVector<int64_t> newOuterDimsPermVec = - llvm::to_vector(transposePermutation); - - if (!outerDimsPerm.empty()) - applyPermutationToVector(newOuterDimsPermVec, outerDimsPerm); + invertPermutationVector(maybePerm.value()); // Can't use applyPermutationToVector for newInnerDimsPosVec since input and // permutation rank won't necessarily be equal in all cases. for (auto dim : innerDimsPos) - newInnerDimsPosVec.push_back(transposePermutation[dim]); + newInnerDimsPosVec.push_back(newOuterDimsPermVec[dim]); + + if (!outerDimsPerm.empty()) + applyPermutationToVector(newOuterDimsPermVec, outerDimsPerm); // Reuse the destination of the transpose op. 
rewriter.replaceOpWithNewOp<UnPackOp>( - transposeOp, unPackOp.getSource(), transposeOp.getDpsInits()[0], + linalgOp, unPackOp.getSource(), linalgOp.getDpsInits()[0], newInnerDimsPosVec, unPackOp.getMixedTiles(), newOuterDimsPermVec); return success(); @@ -393,13 +430,17 @@ struct FoldConsumerUnPackWithProducerLinalgTransposeOp LogicalResult matchAndRewrite(UnPackOp unPackOp, PatternRewriter &rewriter) const override { - auto transposeOp = - unPackOp.getSource().getDefiningOp<linalg::TransposeOp>(); + auto linalgOp = unPackOp.getSource().getDefiningOp<linalg::LinalgOp>(); + if (!linalgOp) + return failure(); - if (!transposeOp) + FailureOr<SmallVector<int64_t>> maybePerm = + getTransposeOpPermutation(linalgOp); + if (failed(maybePerm)) return failure(); - auto transposePermutation = transposeOp.getPermutation(); + SmallVector<int64_t> inverseTransposePerm = + invertPermutationVector(maybePerm.value()); auto outerDimsPerm = unPackOp.getOuterDimsPerm(); auto innerDimsPos = unPackOp.getInnerDimsPos(); int64_t destRank = unPackOp.getSourceRank() - innerDimsPos.size(); @@ -408,7 +449,7 @@ struct FoldConsumerUnPackWithProducerLinalgTransposeOp SmallVector<int64_t> newInnerDimsPosVec; SmallVector<OpFoldResult> newMixedInnerTilesVec; - if (!checkAndPermute(transposePermutation, outerDimsPerm, + if (!checkAndPermute(inverseTransposePerm, outerDimsPerm, newOuterDimsPermVec, destRank)) return rewriter.notifyMatchFailure( unPackOp, @@ -416,18 +457,18 @@ struct FoldConsumerUnPackWithProducerLinalgTransposeOp "with a non-tile dimension in linalg.transpose."); // Process transpose operation for tiled inner dimensions - for (unsigned int i = destRank; i < transposePermutation.size(); ++i) { - int64_t remappedPosition = transposePermutation[i] - destRank; + for (unsigned int i = destRank; i < inverseTransposePerm.size(); ++i) { + int64_t remappedPosition = inverseTransposePerm[i] - destRank; newMixedInnerTilesVec.push_back(mixedInnerTilesVec[remappedPosition]); 
newInnerDimsPosVec.push_back(innerDimsPos[remappedPosition]); } Value output = unPackOp.createDestinationTensor( - rewriter, unPackOp.getLoc(), transposeOp.getOperand(0), + rewriter, unPackOp.getLoc(), linalgOp->getOperand(0), newMixedInnerTilesVec, newInnerDimsPosVec, newOuterDimsPermVec); rewriter.replaceOpWithNewOp<UnPackOp>( - unPackOp, transposeOp.getOperand(0), output, newInnerDimsPosVec, + unPackOp, linalgOp->getOperand(0), output, newInnerDimsPosVec, newMixedInnerTilesVec, newOuterDimsPermVec); return success(); diff --git a/mlir/lib/Dialect/Tensor/Transforms/RewriteAsConstant.cpp b/mlir/lib/Dialect/Tensor/Transforms/RewriteAsConstant.cpp index 11e1de5..7c9fced 100644 --- a/mlir/lib/Dialect/Tensor/Transforms/RewriteAsConstant.cpp +++ b/mlir/lib/Dialect/Tensor/Transforms/RewriteAsConstant.cpp @@ -8,9 +8,12 @@ // #include "mlir/Dialect/Tensor/IR/Tensor.h" #include "mlir/Dialect/Tensor/Transforms/Transforms.h" +#include "mlir/Dialect/Utils/IndexingUtils.h" #include "mlir/IR/Matchers.h" #include "mlir/IR/PatternMatch.h" +#include "llvm/ADT/TypeSwitch.h" + using namespace mlir; using namespace mlir::tensor; @@ -45,9 +48,169 @@ struct GenerateToConstant : public OpRewritePattern<GenerateOp> { } }; +/// Transform a linear index from one indexing space to another given: +/// +/// - the shape of the source indexing space, +/// - the strides of the target indexing space, +/// - a linear index into the source indexing space. +/// +/// This function is logically a sequence of linearize/delinearize over +/// different bases but avoids allocating intermediate SmallVectors. +int64_t transformIndexSpace(ArrayRef<int64_t> inputShape, + ArrayRef<int64_t> outputStrides, + int64_t srcLinearIndex) { + assert(inputShape.size() == outputStrides.size()); + + int64_t dstLinearIndex = 0; + + for (int64_t dim = inputShape.size() - 1; dim >= 0; --dim) { + // Compute the index into the current dimension of the source tensor. 
+ // `quotient` is the remaining linear index after accounting for the + // current dimension. + // + // `remainder` is the index into the source tensor for the current + // dimension. + auto [quotient, remainder] = std::div(srcLinearIndex, inputShape[dim]); + + srcLinearIndex = quotient; + + // Add the contribution of the current dimension to the output using the + // permutation map. + dstLinearIndex += outputStrides[dim] * remainder; + } + + return dstLinearIndex; +} + +template <typename ElemType, typename AttrType> +Value constantFoldPadOp(PatternRewriter &rewriter, Location loc, + DenseElementsAttr input, AttrType padValue, + ArrayRef<int64_t> padLow, ArrayRef<int64_t> padHigh) { + auto inputValues = input.tryGetValues<ElemType>(); + if (failed(inputValues)) + return nullptr; + + auto oldShape = input.getType().getShape(); + + // Compute the output shape of the new value. + auto newShape = + llvm::map_to_vector(llvm::zip(oldShape, padLow, padHigh), + [](std::tuple<int64_t, int64_t, int64_t> pack) { + auto [old, low, high] = pack; + return old + low + high; + }); + + int64_t outputSize = computeProduct(newShape); + + // Fully initialize the vector with the padding value. + // The non-padded area will then be copied. + SmallVector<ElemType> values(outputSize, padValue.getValue()); + + // Strides for input and output are used to transform between the indexing + // space of the input and output tensors. + SmallVector<int64_t> outputStrides = computeStrides(newShape); + + // The contribution of the low padding to the offset in the output tensor. + // This is the starting position of the source tensor within the padding + // tensor. + int64_t startingOffset = linearize(padLow, outputStrides); + + // Copy values from the input tensor to the corresponding sub-region + // of the output tensor. 
+ for (auto [inputIndex, inputValue] : llvm::enumerate(*inputValues)) { + auto outputIndex = transformIndexSpace(oldShape, outputStrides, inputIndex); + values[outputIndex + startingOffset] = inputValue; + } + + // Create an attribute for the folded value. + auto newType = input.getType().clone(newShape); + auto newAttr = DenseElementsAttr::get(newType, values); + + Operation *constantOp = + rewriter.getContext() + ->getLoadedDialect<TensorDialect>() + ->materializeConstant(rewriter, newAttr, newType, loc); + + return constantOp ? constantOp->getResult(0) : nullptr; +} + +struct PadOpToConstant final : public OpRewritePattern<PadOp> { + + PadOpToConstant(MLIRContext *context, const ControlFoldFn &controlFn, + PatternBenefit benefit = 1) + : OpRewritePattern<PadOp>(context, benefit), controlFn{controlFn} {} + + LogicalResult matchAndRewrite(PadOp padTensorOp, + PatternRewriter &rewriter) const override { + if (padTensorOp.getNofold()) + return rewriter.notifyMatchFailure( + padTensorOp, "refusing to fold nofold pad operation"); + + TypedValue<RankedTensorType> input = padTensorOp.getSource(); + RankedTensorType resultType = padTensorOp.getResult().getType(); + + DenseElementsAttr inputAttr = nullptr; + if (!matchPattern(input, m_Constant(&inputAttr))) + return failure(); + + Value paddingValue = padTensorOp.getConstantPaddingValue(); + + // Extract the constant value used for padding or bail out. + Attribute paddingAttr = nullptr; + if (!paddingValue || !matchPattern(paddingValue, m_Constant(&paddingAttr))) + return rewriter.notifyMatchFailure(padTensorOp, + "unable to get constant value"); + + // Try to extract the constant values of the low and high padding. + auto lowPad = getConstantIntValues(padTensorOp.getMixedLowPad()); + auto highPad = getConstantIntValues(padTensorOp.getMixedHighPad()); + + // If the padding cannot be extracted, bail out. 
+ if (!lowPad || !highPad) + return rewriter.notifyMatchFailure(padTensorOp, + "unable to extract constant padding"); + + // We have a potential candidate, consult the control function to + // determine if the op should fold. + if (!controlFn(&padTensorOp.getSourceMutable())) + return rewriter.notifyMatchFailure(padTensorOp, + "not folding due to cost function"); + + Location loc = padTensorOp.getLoc(); + + // Try constant folding the supported cases of integer and float values. + Value newOp = + llvm::TypeSwitch<Attribute, Value>(paddingAttr) + .Case([&](FloatAttr floatAttr) { + return constantFoldPadOp<llvm::APFloat>( + rewriter, loc, inputAttr, floatAttr, *lowPad, *highPad); + }) + .Case([&](IntegerAttr integerAttr) { + return constantFoldPadOp<llvm::APInt>( + rewriter, loc, inputAttr, integerAttr, *lowPad, *highPad); + }) + .Default(Value()); + + if (!newOp) + return rewriter.notifyMatchFailure(padTensorOp, + "tensor type not supported"); + + if (newOp.getType() != resultType) + newOp = rewriter.create<tensor::CastOp>(loc, resultType, newOp); + + rewriter.replaceOp(padTensorOp, newOp); + return success(); + } + +private: + ControlFoldFn controlFn; +}; + } // namespace void mlir::tensor::populateRewriteAsConstantPatterns( - RewritePatternSet &patterns) { + RewritePatternSet &patterns, const ControlFoldFn &controlFn) { patterns.add<GenerateToConstant>(patterns.getContext()); + + patterns.add<PadOpToConstant>(patterns.getContext(), controlFn); } diff --git a/mlir/lib/Dialect/Utils/IndexingUtils.cpp b/mlir/lib/Dialect/Utils/IndexingUtils.cpp index 4c96065..aba225b 100644 --- a/mlir/lib/Dialect/Utils/IndexingUtils.cpp +++ b/mlir/lib/Dialect/Utils/IndexingUtils.cpp @@ -92,7 +92,7 @@ int64_t mlir::computeProduct(ArrayRef<int64_t> basis) { assert(llvm::all_of(basis, [](int64_t s) { return s > 0; }) && "basis must be nonnegative"); if (basis.empty()) - return 0; + return 1; return std::accumulate(basis.begin(), basis.end(), 1, std::multiplies<int64_t>()); } diff --git 
a/mlir/lib/Dialect/Vector/Transforms/VectorTransferOpTransforms.cpp b/mlir/lib/Dialect/Vector/Transforms/VectorTransferOpTransforms.cpp index 997b56a..c131fde 100644 --- a/mlir/lib/Dialect/Vector/Transforms/VectorTransferOpTransforms.cpp +++ b/mlir/lib/Dialect/Vector/Transforms/VectorTransferOpTransforms.cpp @@ -505,25 +505,61 @@ static Value collapseInnerDims(PatternRewriter &rewriter, mlir::Location loc, return rewriter.create<memref::CollapseShapeOp>(loc, input, reassociation); } -/// Checks that the indices corresponding to dimensions starting at -/// `firstDimToCollapse` are constant 0, and writes to `outIndices` -/// the truncated indices where `firstDimToCollapse` is now the innermost dim. -/// TODO: Extract the logic that writes to outIndices so that this method -/// simply checks one pre-condition. -static LogicalResult -checkAndCollapseInnerZeroIndices(ValueRange indices, int64_t firstDimToCollapse, - SmallVector<Value> &outIndices) { - int64_t rank = indices.size(); - if (firstDimToCollapse >= rank) - return failure(); - for (int64_t i = firstDimToCollapse; i < rank; ++i) { - std::optional<int64_t> cst = getConstantIntValue(indices[i]); - if (!cst || cst.value() != 0) - return failure(); +/// Returns the new indices that collapses the inner dimensions starting from +/// the `firstDimToCollapse` dimension. +static SmallVector<Value> getCollapsedIndices(RewriterBase &rewriter, + Location loc, + ArrayRef<int64_t> shape, + ValueRange indices, + int64_t firstDimToCollapse) { + assert(firstDimToCollapse < static_cast<int64_t>(indices.size())); + + // If all the collapsed indices are zero then no extra logic is needed. + // Otherwise, a new offset/index has to be computed. 
+ SmallVector<Value> indicesAfterCollapsing( + indices.begin(), indices.begin() + firstDimToCollapse); + SmallVector<Value> indicesToCollapse(indices.begin() + firstDimToCollapse, + indices.end()); + if (llvm::all_of(indicesToCollapse, isZeroIndex)) { + indicesAfterCollapsing.push_back(indicesToCollapse[0]); + return indicesAfterCollapsing; + } + + // Compute the remaining trailing index/offset required for reading from + // the collapsed memref: + // + // offset = 0 + // for (i = firstDimToCollapse; i < outputRank; ++i) + // offset += sourceType.getDimSize(i) * transferReadOp.indices[i] + // + // For this example: + // %2 = vector.transfer_read/write %arg4[%c0, %arg0, %c0] (...) : + // memref<1x43x2xi32>, vector<1x2xi32> + // which would be collapsed to: + // %1 = vector.transfer_read/write %collapse_shape[%c0, %offset] (...) : + // memref<1x86xi32>, vector<2xi32> + // one would get the following offset: + // %offset = %arg0 * 43 + OpFoldResult collapsedOffset = + rewriter.create<arith::ConstantIndexOp>(loc, 0).getResult(); + + auto collapsedStrides = computeSuffixProduct( + ArrayRef<int64_t>(shape.begin() + firstDimToCollapse, shape.end())); + + // Compute the collapsed offset. + auto &&[collapsedExpr, collapsedVals] = + computeLinearIndex(collapsedOffset, collapsedStrides, indicesToCollapse); + collapsedOffset = affine::makeComposedFoldedAffineApply( + rewriter, loc, collapsedExpr, collapsedVals); + + if (collapsedOffset.is<Value>()) { + indicesAfterCollapsing.push_back(collapsedOffset.get<Value>()); + } else { + indicesAfterCollapsing.push_back(rewriter.create<arith::ConstantIndexOp>( + loc, *getConstantIntValue(collapsedOffset))); } - outIndices = indices; - outIndices.resize(firstDimToCollapse + 1); - return success(); + + return indicesAfterCollapsing; } namespace { @@ -594,54 +630,9 @@ public: AffineMap::get(collapsedRank, 0, dimExprs, rewriter.getContext()); // 2.2 New indices - // If all the collapsed indices are zero then no extra logic is needed. 
- // Otherwise, a new offset/index has to be computed. - SmallVector<Value> collapsedIndices; - if (failed(checkAndCollapseInnerZeroIndices(transferReadOp.getIndices(), - firstDimToCollapse, - collapsedIndices))) { - // Copy all the leading indices. - SmallVector<Value> indices = transferReadOp.getIndices(); - collapsedIndices.append(indices.begin(), - indices.begin() + firstDimToCollapse); - - // Compute the remaining trailing index/offset required for reading from - // the collapsed memref: - // - // offset = 0 - // for (i = firstDimToCollapse; i < outputRank; ++i) - // offset += sourceType.getDimSize(i) * transferReadOp.indices[i] - // - // For this example: - // %2 = vector.transfer_read %arg4[%c0, %arg0, %c0] (...) : - // memref<1x43x2xi32>, vector<1x2xi32> - // which would be collapsed to: - // %1 = vector.transfer_read %collapse_shape[%c0, %offset] (...) : - // memref<1x86xi32>, vector<2xi32> - // one would get the following offset: - // %offset = %arg0 * 43 - OpFoldResult collapsedOffset = - rewriter.create<arith::ConstantIndexOp>(loc, 0).getResult(); - - auto sourceShape = sourceType.getShape(); - auto collapsedStrides = computeSuffixProduct(ArrayRef<int64_t>( - sourceShape.begin() + firstDimToCollapse, sourceShape.end())); - - // Compute the collapsed offset. 
- ArrayRef<Value> indicesToCollapse(indices.begin() + firstDimToCollapse, - indices.end()); - auto &&[collapsedExpr, collapsedVals] = computeLinearIndex( - collapsedOffset, collapsedStrides, indicesToCollapse); - collapsedOffset = affine::makeComposedFoldedAffineApply( - rewriter, loc, collapsedExpr, collapsedVals); - - if (collapsedOffset.is<Value>()) { - collapsedIndices.push_back(collapsedOffset.get<Value>()); - } else { - collapsedIndices.push_back(rewriter.create<arith::ConstantIndexOp>( - loc, *getConstantIntValue(collapsedOffset))); - } - } + SmallVector<Value> collapsedIndices = + getCollapsedIndices(rewriter, loc, sourceType.getShape(), + transferReadOp.getIndices(), firstDimToCollapse); // 3. Create new vector.transfer_read that reads from the collapsed memref VectorType flatVectorType = VectorType::get({vectorType.getNumElements()}, @@ -697,8 +688,7 @@ public: return failure(); if (!vector::isContiguousSlice(sourceType, vectorType)) return failure(); - int64_t firstContiguousInnerDim = - sourceType.getRank() - vectorType.getRank(); + int64_t firstDimToCollapse = sourceType.getRank() - vectorType.getRank(); // TODO: generalize this pattern, relax the requirements here. 
if (transferWriteOp.hasOutOfBoundsDim()) return failure(); @@ -706,22 +696,23 @@ public: return failure(); if (transferWriteOp.getMask()) return failure(); - SmallVector<Value> collapsedIndices; - if (failed(checkAndCollapseInnerZeroIndices(transferWriteOp.getIndices(), - firstContiguousInnerDim, - collapsedIndices))) - return failure(); + + SmallVector<Value> collapsedIndices = + getCollapsedIndices(rewriter, loc, sourceType.getShape(), + transferWriteOp.getIndices(), firstDimToCollapse); Value collapsedSource = - collapseInnerDims(rewriter, loc, source, firstContiguousInnerDim); + collapseInnerDims(rewriter, loc, source, firstDimToCollapse); MemRefType collapsedSourceType = cast<MemRefType>(collapsedSource.getType()); int64_t collapsedRank = collapsedSourceType.getRank(); - assert(collapsedRank == firstContiguousInnerDim + 1); + assert(collapsedRank == firstDimToCollapse + 1); + SmallVector<AffineExpr, 1> dimExprs{ - getAffineDimExpr(firstContiguousInnerDim, rewriter.getContext())}; + getAffineDimExpr(firstDimToCollapse, rewriter.getContext())}; auto collapsedMap = AffineMap::get(collapsedRank, 0, dimExprs, rewriter.getContext()); + VectorType flatVectorType = VectorType::get({vectorType.getNumElements()}, vectorType.getElementType()); Value flatVector = diff --git a/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp b/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp index 2295922..5ef47fb 100644 --- a/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp +++ b/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp @@ -110,7 +110,7 @@ void CreateNdDescOp::build(OpBuilder &builder, OperationState &state, dispatchIndexOpFoldResults(offsets, dynamicOffsets, staticOffsets); dispatchIndexOpFoldResults(shape, dynamicShape, staticShape); - dispatchIndexOpFoldResults(strides, dynamicStrides, staticOffsets); + dispatchIndexOpFoldResults(strides, dynamicStrides, staticStrides); auto staticOffsetsAttr = builder.getDenseI64ArrayAttr(staticOffsets); auto staticShapeAttr = builder.getDenseI64ArrayAttr(staticShape); diff --git 
a/mlir/lib/IR/AsmPrinter.cpp b/mlir/lib/IR/AsmPrinter.cpp index 29e3621..6a362af 100644 --- a/mlir/lib/IR/AsmPrinter.cpp +++ b/mlir/lib/IR/AsmPrinter.cpp @@ -2061,7 +2061,8 @@ void AsmPrinter::Impl::printLocationInternal(LocationAttr loc, bool pretty, /// Print a floating point value in a way that the parser will be able to /// round-trip losslessly. -static void printFloatValue(const APFloat &apValue, raw_ostream &os) { +static void printFloatValue(const APFloat &apValue, raw_ostream &os, + bool *printedHex = nullptr) { // We would like to output the FP constant value in exponential notation, // but we cannot do this if doing so will lose precision. Check here to // make sure that we only output it in exponential format if we can parse @@ -2102,6 +2103,8 @@ static void printFloatValue(const APFloat &apValue, raw_ostream &os) { // Print special values in hexadecimal format. The sign bit should be included // in the literal. + if (printedHex) + *printedHex = true; SmallVector<char, 16> str; APInt apInt = apValue.bitcastToAPInt(); apInt.toString(str, /*Radix=*/16, /*Signed=*/false, @@ -2275,10 +2278,12 @@ void AsmPrinter::Impl::printAttributeImpl(Attribute attr, return; } else if (auto floatAttr = llvm::dyn_cast<FloatAttr>(attr)) { - printFloatValue(floatAttr.getValue(), os); + bool printedHex = false; + printFloatValue(floatAttr.getValue(), os, &printedHex); // FloatAttr elides the type if F64. 
- if (typeElision == AttrTypeElision::May && floatAttr.getType().isF64()) + if (typeElision == AttrTypeElision::May && floatAttr.getType().isF64() && + !printedHex) return; } else if (auto strAttr = llvm::dyn_cast<StringAttr>(attr)) { diff --git a/mlir/test/Dialect/Tensor/fold-into-pack-and-unpack.mlir b/mlir/test/Dialect/Tensor/fold-into-pack-and-unpack.mlir index 9a3143f5..629a4c2 100644 --- a/mlir/test/Dialect/Tensor/fold-into-pack-and-unpack.mlir +++ b/mlir/test/Dialect/Tensor/fold-into-pack-and-unpack.mlir @@ -636,3 +636,142 @@ func.func @tensor_padded_unpack_linalg_transpose_fold(%arg0: tensor<71x7x4x16x16 // CHECK-SAME: into %[[OUT:.+]] : tensor<71x7x4x16x16xf32> -> tensor<100x71x64xf32> // CHECK: return %[[UNPACK]] : tensor<100x71x64xf32> // CHECK: } + +// ----- + +func.func @non_involution_transpose_unpack_fold(%arg0: tensor<2x3x5x4x16xi32>) -> tensor<5x48x8xi32> { + %0 = tensor.empty() : tensor<5x2x3x16x4xi32> + %transposed = linalg.transpose ins(%arg0 : tensor<2x3x5x4x16xi32>) + outs(%0 : tensor<5x2x3x16x4xi32>) + permutation = [2, 0, 1, 4, 3] + %1 = tensor.empty() : tensor<5x48x8xi32> + %unpack = tensor.unpack %transposed + outer_dims_perm = [0, 2, 1] + inner_dims_pos = [1, 2] + inner_tiles = [16, 4] into + %1 : tensor<5x2x3x16x4xi32> -> tensor<5x48x8xi32> + return %unpack : tensor<5x48x8xi32> +} +//CHECK-LABEL: func.func @non_involution_transpose_unpack_fold( +// CHECK-SAME: %[[ARG0:.+]]: tensor<2x3x5x4x16xi32>) -> tensor<5x48x8xi32> { +// CHECK: %[[OUT:.+]] = tensor.empty() : tensor<5x48x8xi32> +// CHECK: %[[UNPACK:.+]] = tensor.unpack %[[ARG0]] +// CHECK-SAME: outer_dims_perm = [2, 1, 0] +// CHECK-SAME: inner_dims_pos = [2, 1] +// CHECK-SAME: inner_tiles = [4, 16] +// CHEKC-SAME: into %[[OUT]] : tensor<2x3x5x4x16xi32> -> tensor<5x48x8xi32> +// CHECK: return %[[UNPACK]] : tensor<5x48x8xi32> +// CHECK: } + +// ----- + +func.func @unpack_non_involution_transpose_fold(%arg0: tensor<57x3x56x1x64xf32>) -> tensor<3648x3x56xf32> { + %0 = tensor.empty() : 
tensor<3x56x3648xf32> + %unpack = tensor.unpack %arg0 + outer_dims_perm = [2, 0, 1] + inner_dims_pos = [1, 2] + inner_tiles = [1, 64] + into %0 : tensor<57x3x56x1x64xf32> -> tensor<3x56x3648xf32> + + %1 = tensor.empty() : tensor<3648x3x56xf32> + %transposed = linalg.transpose + ins(%unpack : tensor<3x56x3648xf32>) + outs(%1 : tensor<3648x3x56xf32>) + permutation = [2, 0, 1] + return %transposed : tensor<3648x3x56xf32> +} +// CHECK-LABEL: func.func @unpack_non_involution_transpose_fold( +// CHECK-SAME: %[[ARG0:.+]]: tensor<57x3x56x1x64xf32>) -> tensor<3648x3x56xf32> { +// CHECK: %[[OUT:.+]] = tensor.empty() : tensor<3648x3x56xf32> +// CHECK: %[[UNPACK:.+]] = tensor.unpack %[[ARG0]] +// CHECK-SAME: outer_dims_perm = [0, 1, 2] +// CHECK-SAME: inner_dims_pos = [2, 0] +// CHECK-SAME: inner_tiles = [1, 64] +// CHECK-SAME: into %[[OUT:.+]] : tensor<57x3x56x1x64xf32> -> tensor<3648x3x56xf32> +// CHECK: return %[[UNPACK]] : tensor<3648x3x56xf32> +// CHECK: } + +// ----- + +func.func @transpose_unpacked_dims_no_fold(%arg0: tensor<2x16x5x4x3xi32>) -> tensor<5x32x12xi32> { + %0 = tensor.empty() : tensor<5x2x3x16x4xi32> + %transposed = linalg.transpose ins(%arg0 : tensor<2x16x5x4x3xi32>) + outs(%0 : tensor<5x2x3x16x4xi32>) + permutation = [2, 0, 4, 1, 3] + %1 = tensor.empty() : tensor<5x32x12xi32> + %unpack = tensor.unpack %transposed + inner_dims_pos = [1, 2] + inner_tiles = [16, 4] into + %1 : tensor<5x2x3x16x4xi32> -> tensor<5x32x12xi32> + return %unpack : tensor<5x32x12xi32> +} +//CHECK-LABEL: func.func @transpose_unpacked_dims_no_fold( +// CHECK: linalg.transpose +// CHECK: tensor.unpack + +// ----- + +#map = affine_map<(d0, d1, d2, d3, d4)->(d1, d2, d0, d4, d3)> +#map1 = affine_map<(d0, d1, d2, d3, d4)->(d0, d1, d2, d3, d4)> +func.func @generic_transpose_unpack_fold(%arg0: tensor<2x3x5x4x16xi32>) -> tensor<5x48x8xi32> { + %0 = tensor.empty() : tensor<5x2x3x16x4xi32> + %transposed = linalg.generic { + iterator_types = ["parallel", "parallel", "parallel", "parallel", 
"parallel"], + indexing_maps = [#map, #map1]} + ins(%arg0 : tensor<2x3x5x4x16xi32>) + outs(%0 : tensor<5x2x3x16x4xi32>) { + ^bb0(%in : i32, %out : i32): + linalg.yield %in : i32 + } -> tensor<5x2x3x16x4xi32> + %1 = tensor.empty() : tensor<5x48x8xi32> + %unpack = tensor.unpack %transposed + outer_dims_perm = [0, 2, 1] + inner_dims_pos = [1, 2] + inner_tiles = [16, 4] into + %1 : tensor<5x2x3x16x4xi32> -> tensor<5x48x8xi32> + return %unpack : tensor<5x48x8xi32> +} +//CHECK-LABEL: func.func @generic_transpose_unpack_fold( +// CHECK-SAME: %[[ARG0:.+]]: tensor<2x3x5x4x16xi32>) -> tensor<5x48x8xi32> { +// CHECK: %[[OUT:.+]] = tensor.empty() : tensor<5x48x8xi32> +// CHECK: %[[UNPACK:.+]] = tensor.unpack %[[ARG0]] +// CHECK-SAME: outer_dims_perm = [2, 1, 0] +// CHECK-SAME: inner_dims_pos = [2, 1] +// CHECK-SAME: inner_tiles = [4, 16] +// CHEKC-SAME: into %[[OUT]] : tensor<2x3x5x4x16xi32> -> tensor<5x48x8xi32> +// CHECK: return %[[UNPACK]] : tensor<5x48x8xi32> +// CHECK: } + +// ----- + +#map = affine_map<(d0, d1, d2)->(d1, d2, d0)> +#map1 = affine_map<(d0, d1, d2)->(d0, d1, d2)> +func.func @unpack_generic_transpose_fold(%arg0: tensor<57x3x56x1x64xf32>) -> tensor<3648x3x56xf32> { + %0 = tensor.empty() : tensor<3x56x3648xf32> + %unpack = tensor.unpack %arg0 + outer_dims_perm = [2, 0, 1] + inner_dims_pos = [1, 2] + inner_tiles = [1, 64] + into %0 : tensor<57x3x56x1x64xf32> -> tensor<3x56x3648xf32> + + %1 = tensor.empty() : tensor<3648x3x56xf32> + %transposed = linalg.generic { + iterator_types = ["parallel", "parallel", "parallel"], + indexing_maps = [#map, #map1]} + ins(%unpack : tensor<3x56x3648xf32>) + outs(%1 : tensor<3648x3x56xf32>) { + ^bb0(%in : f32, %out : f32): + linalg.yield %in : f32 + } -> tensor<3648x3x56xf32> + return %transposed : tensor<3648x3x56xf32> +} +// CHECK-LABEL: func.func @unpack_generic_transpose_fold( +// CHECK-SAME: %[[ARG0:.+]]: tensor<57x3x56x1x64xf32>) -> tensor<3648x3x56xf32> { +// CHECK: %[[OUT:.+]] = tensor.empty() : tensor<3648x3x56xf32> +// 
CHECK: %[[UNPACK:.+]] = tensor.unpack %[[ARG0]] +// CHECK-SAME: outer_dims_perm = [0, 1, 2] +// CHECK-SAME: inner_dims_pos = [2, 0] +// CHECK-SAME: inner_tiles = [1, 64] +// CHECK-SAME: into %[[OUT:.+]] : tensor<57x3x56x1x64xf32> -> tensor<3648x3x56xf32> +// CHECK: return %[[UNPACK]] : tensor<3648x3x56xf32> +// CHECK: } diff --git a/mlir/test/Dialect/Tensor/rewrite-as-constant.mlir b/mlir/test/Dialect/Tensor/rewrite-as-constant.mlir index 1a1cf9e..35ee6f1 100644 --- a/mlir/test/Dialect/Tensor/rewrite-as-constant.mlir +++ b/mlir/test/Dialect/Tensor/rewrite-as-constant.mlir @@ -21,3 +21,138 @@ func.func @tensor_generate_constant() -> tensor<2x3x5xf32> { } : tensor<2x3x5xf32> return %0 : tensor<2x3x5xf32> } + +// CHECK-LABEL: func @pad_of_ints( +// CHECK: %[[cst:.*]] = arith.constant dense<[ +// CHECK-SAME{LITERAL}: [0, 0, 0, 0], +// CHECK-SAME{LITERAL}: [0, 6, 7, 0], +// CHECK-SAME{LITERAL}: [0, 8, 9, 0], +// CHECK-SAME{LITERAL}: [0, 0, 0, 0] +// CHECK-SAME{LITERAL}: ]> : tensor<4x4xi32> +// CHECK: %[[cast:.*]] = tensor.cast %[[cst]] : tensor<4x4xi32> to tensor<?x?xi32> +// CHECK: return %[[cast]] +func.func @pad_of_ints() -> tensor<?x?xi32> { + %init = arith.constant dense<[[6, 7], [8, 9]]> : tensor<2x2xi32> + %pad_value = arith.constant 0 : i32 + + %c1 = arith.constant 1 : index + + %0 = tensor.pad %init low[%c1, %c1] high[%c1, %c1] { + ^bb0(%arg1: index, %arg2: index): + tensor.yield %pad_value : i32 + } : tensor<2x2xi32> to tensor<?x?xi32> + + return %0 : tensor<?x?xi32> +} + +// CHECK-LABEL: func @pad_of_floats( +// CHECK: %[[cst:.*]] = arith.constant dense<[ +// CHECK-SAME{LITERAL}: [0.000000e+00, 0.000000e+00, 0.000000e+00, 0.000000e+00], +// CHECK-SAME{LITERAL}: [0.000000e+00, 6.000000e+00, 7.000000e+00, 0.000000e+00], +// CHECK-SAME{LITERAL}: [0.000000e+00, 8.000000e+00, 9.000000e+00, 0.000000e+00], +// CHECK-SAME{LITERAL}: [0.000000e+00, 0.000000e+00, 0.000000e+00, 0.000000e+00] +// CHECK-SAME{LITERAL}: ]> : tensor<4x4xf32> +// CHECK: return %[[cst]] + 
+func.func @pad_of_floats() -> tensor<4x4xf32> { + %init = arith.constant dense<[[6.0, 7.0], [8.0, 9.0]]> : tensor<2x2xf32> + %pad_value = arith.constant 0.0 : f32 + + %0 = tensor.pad %init low[1, 1] high[1, 1] { + ^bb0(%arg1: index, %arg2: index): + tensor.yield %pad_value : f32 + } : tensor<2x2xf32> to tensor<4x4xf32> + + return %0 : tensor<4x4xf32> +} + +// CHECK-LABEL: func @pad_of_ints_no_low_dims( +// CHECK: %[[cst:.*]] = arith.constant dense<[ +// CHECK-SAME{LITERAL}: [6, 7, 0], +// CHECK-SAME{LITERAL}: [8, 9, 0], +// CHECK-SAME{LITERAL}: [0, 0, 0] +// CHECK-SAME{LITERAL}: ]> : tensor<3x3xi32> +// CHECK: return %[[cst]] +func.func @pad_of_ints_no_low_dims() -> tensor<3x3xi32> { + %init = arith.constant dense<[[6, 7], [8, 9]]> : tensor<2x2xi32> + %pad_value = arith.constant 0 : i32 + + %0 = tensor.pad %init low[0, 0] high[1, 1] { + ^bb0(%arg1: index, %arg2: index): + tensor.yield %pad_value : i32 + } : tensor<2x2xi32> to tensor<3x3xi32> + + return %0 : tensor<3x3xi32> +} + +// CHECK-LABEL: func @pad_of_ints_no_high_dims( +// CHECK: %[[cst:.*]] = arith.constant dense<[ +// CHECK-SAME{LITERAL}: [0, 0, 0], +// CHECK-SAME{LITERAL}: [0, 6, 7], +// CHECK-SAME{LITERAL}: [0, 8, 9] +// CHECK-SAME{LITERAL}: ]> : tensor<3x3xi32> +// CHECK: return %[[cst]] +func.func @pad_of_ints_no_high_dims() -> tensor<3x3xi32> { + %init = arith.constant dense<[[6, 7], [8, 9]]> : tensor<2x2xi32> + %pad_value = arith.constant 0 : i32 + + %0 = tensor.pad %init low[1, 1] high[0, 0] { + ^bb0(%arg1: index, %arg2: index): + tensor.yield %pad_value : i32 + } : tensor<2x2xi32> to tensor<3x3xi32> + + return %0 : tensor<3x3xi32> +} + +// CHECK-LABEL: func @pad_multi_use_do_not_fold( +// CHECK: %[[pad:.+]] = tensor.pad +// CHECK: return %[[pad]] +func.func @pad_multi_use_do_not_fold() -> (tensor<?x?xi32>, tensor<2x2xi32>) { + %init = arith.constant dense<[[6, 7], [8, 9]]> : tensor<2x2xi32> + %pad_value = arith.constant 0 : i32 + + %c1 = arith.constant 1 : index + + %0 = tensor.pad %init low[%c1, 
%c1] high[%c1, %c1] { + ^bb0(%arg1: index, %arg2: index): + tensor.yield %pad_value : i32 + } : tensor<2x2xi32> to tensor<?x?xi32> + + return %0, %init : tensor<?x?xi32>, tensor<2x2xi32> +} + +// ----- + +module attributes {transform.with_named_sequence} { + transform.named_sequence @__transform_main(%root : !transform.any_op {transform.readonly}) { + %func_op = transform.structured.match ops{["func.func"]} in %root : (!transform.any_op) -> !transform.op<"func.func"> + transform.apply_patterns to %func_op { + transform.apply_patterns.tensor.rewrite_as_constant aggressive + } : !transform.op<"func.func"> + transform.yield + } +} + +// CHECK-LABEL: func @pad_aggressive_fold( +// CHECK: %[[init:.*]] = arith.constant dense<7> : tensor<2x2xi32> +// CHECK: %[[cst:.*]] = arith.constant dense<[ +// CHECK-SAME{LITERAL}: [0, 0, 0, 0], +// CHECK-SAME{LITERAL}: [0, 7, 7, 0], +// CHECK-SAME{LITERAL}: [0, 7, 7, 0], +// CHECK-SAME{LITERAL}: [0, 0, 0, 0] +// CHECK-SAME{LITERAL}: ]> : tensor<4x4xi32> +// CHECK: %[[cast:.*]] = tensor.cast %[[cst]] : tensor<4x4xi32> to tensor<?x?xi32> +// CHECK: return %[[cast]] +func.func @pad_aggressive_fold() -> (tensor<?x?xi32>, tensor<2x2xi32>) { + %init = arith.constant dense<7> : tensor<2x2xi32> + %pad_value = arith.constant 0 : i32 + + %c1 = arith.constant 1 : index + + %0 = tensor.pad %init low[%c1, %c1] high[%c1, %c1] { + ^bb0(%arg1: index, %arg2: index): + tensor.yield %pad_value : i32 + } : tensor<2x2xi32> to tensor<?x?xi32> + + return %0, %init : tensor<?x?xi32>, tensor<2x2xi32> +} diff --git a/mlir/test/Dialect/Vector/vector-transfer-flatten.mlir b/mlir/test/Dialect/Vector/vector-transfer-flatten.mlir index 788ae9a..65bf0b9 100644 --- a/mlir/test/Dialect/Vector/vector-transfer-flatten.mlir +++ b/mlir/test/Dialect/Vector/vector-transfer-flatten.mlir @@ -471,16 +471,16 @@ func.func @regression_non_contiguous_dim_read(%subview : memref<1x3x3x2xf32, str } // CHECK: #[[$MAP:.+]] = affine_map<()[s0] -> (s0 * 2)> -// CHECK-LABEL: func.func 
@regression_non_contiguous_dim_read( -// CHECK: %[[COLLAPSE:.+]] = memref.collapse_shape %{{.*}} {{\[}}[0], [1], [2, 3]] : memref<1x3x3x2xf32, strided<[40, 10, 2, 1], offset: ?>> into memref<1x3x6xf32, strided<[40, 10, 1], offset: ?>> -// CHECK: %[[APPLY:.*]] = affine.apply #[[$MAP]]() +// CHECK-LABEL: func.func @regression_non_contiguous_dim_read( +// CHECK: %[[COLLAPSE:.+]] = memref.collapse_shape %{{.*}} {{\[}}[0], [1], [2, 3]] : memref<1x3x3x2xf32, strided<[40, 10, 2, 1], offset: ?>> into memref<1x3x6xf32, strided<[40, 10, 1], offset: ?>> +// CHECK: %[[APPLY:.*]] = affine.apply #[[$MAP]]() // CHECK-128B-LABEL: func @regression_non_contiguous_dim_read( // CHECK-128B: memref.collapse_shape // ----- -func.func @unsupported_non_contiguous_dim_write(%value : vector<2x2xf32>, +func.func @regression_non_contiguous_dim_write(%value : vector<2x2xf32>, %subview : memref<1x3x3x2xf32, strided<[40, 10, 2, 1], offset: ?>>, %idx0 : index, %idx1 : index) { %c0 = arith.constant 0 : index @@ -488,8 +488,35 @@ func.func @unsupported_non_contiguous_dim_write(%value : vector<2x2xf32>, return } -// CHECK-LABEL: func.func @unsupported_non_contiguous_dim_write( -// CHECK-NOT: memref.collapse_shape +// CHECK: #[[$MAP:.+]] = affine_map<()[s0] -> (s0 * 2)> +// CHECK-LABEL: func.func @regression_non_contiguous_dim_write( +// CHECK: %[[APPLY:.*]] = affine.apply #[[$MAP]]() +// CHECK: %[[COLLAPSE:.+]] = memref.collapse_shape %{{.*}} {{\[}}[0], [1], [2, 3]] : memref<1x3x3x2xf32, strided<[40, 10, 2, 1], offset: ?>> into memref<1x3x6xf32, strided<[40, 10, 1], offset: ?>> -// CHECK-128B-LABEL: func @unsupported_non_contiguous_dim_write( -// CHECK-128B-NOT: memref.collapse_shape +// CHECK-128B-LABEL: func @regression_non_contiguous_dim_write( +// CHECK-128B: memref.collapse_shape + +// ----- + +func.func @negative_out_of_bound_transfer_read( + %arg : memref<?x4x3x2xi8, strided<[24, 6, 2, 1], offset: ?>>) -> vector<5x4x3x2xi8> { + %c0 = arith.constant 0 : index + %cst = arith.constant 0 : i8 + %v 
= vector.transfer_read %arg[%c0, %c0, %c0, %c0], %cst {in_bounds = [false, true, true, true]} : + memref<?x4x3x2xi8, strided<[24, 6, 2, 1], offset: ?>>, vector<5x4x3x2xi8> + return %v : vector<5x4x3x2xi8> +} +// CHECK: func.func @negative_out_of_bound_transfer_read +// CHECK-NOT: memref.collapse_shape + +// ----- + +func.func @negative_out_of_bound_transfer_write( + %arg : memref<?x4x3x2xi8, strided<[24, 6, 2, 1], offset: ?>>, %vec : vector<1x1x3x2xi8>) { + %c0 = arith.constant 0 : index + vector.transfer_write %vec, %arg [%c0, %c0, %c0, %c0] {in_bounds = [false, true, true, true]} : + vector<1x1x3x2xi8>, memref<?x4x3x2xi8, strided<[24, 6, 2, 1], offset: ?>> + return +} +// CHECK: func.func @negative_out_of_bound_transfer_write +// CHECK-NOT: memref.collapse_shape diff --git a/mlir/test/IR/array-of-attr.mlir b/mlir/test/IR/array-of-attr.mlir index 1b6fe55..c2a6075 100644 --- a/mlir/test/IR/array-of-attr.mlir +++ b/mlir/test/IR/array-of-attr.mlir @@ -12,3 +12,7 @@ test.array_of_attr_op // CHECK: test.array_of_attr_op // CHECK-SAME: a = [], b = [], c = [] test.array_of_attr_op a = [], b = [], c = [] + +// CHECK: "test.test_array_float" +// CHECK-SAME: 1.000000e+00 : f32, 1.000000e+00, 0x7FF0000000000000 : f64 +"test.test_array_float"() {test.float_arr = [1.0 : f32, 1.0 : f64, 0x7FF0000000000000 : f64]} : () -> () diff --git a/mlir/test/python/ir/blocks.py b/mlir/test/python/ir/blocks.py index 8b4d946..70ccaee 100644 --- a/mlir/test/python/ir/blocks.py +++ b/mlir/test/python/ir/blocks.py @@ -145,3 +145,35 @@ def testBlockHash(): block1 = Block.create_at_start(dummy.operation.regions[0], [f32]) block2 = Block.create_at_start(dummy.operation.regions[0], [f32]) assert hash(block1) != hash(block2) + + +# CHECK-LABEL: TEST: testBlockAddArgs +@run +def testBlockAddArgs(): + with Context() as ctx, Location.unknown(ctx) as loc: + ctx.allow_unregistered_dialects = True + f32 = F32Type.get() + op = Operation.create("test", regions=1, loc=Location.unknown()) + blocks = 
op.regions[0].blocks + blocks.append() + # CHECK: ^bb0: + op.print(enable_debug_info=True) + blocks[0].add_argument(f32, loc) + # CHECK: ^bb0(%{{.+}}: f32 loc(unknown)): + op.print(enable_debug_info=True) + + +# CHECK-LABEL: TEST: testBlockEraseArgs +@run +def testBlockEraseArgs(): + with Context() as ctx, Location.unknown(ctx) as loc: + ctx.allow_unregistered_dialects = True + f32 = F32Type.get() + op = Operation.create("test", regions=1, loc=Location.unknown()) + blocks = op.regions[0].blocks + blocks.append(f32) + # CHECK: ^bb0(%{{.+}}: f32 loc(unknown)): + op.print(enable_debug_info=True) + blocks[0].erase_argument(0) + # CHECK: ^bb0: + op.print(enable_debug_info=True) diff --git a/offload/DeviceRTL/CMakeLists.txt b/offload/DeviceRTL/CMakeLists.txt index b3560036..d88430a 100644 --- a/offload/DeviceRTL/CMakeLists.txt +++ b/offload/DeviceRTL/CMakeLists.txt @@ -46,7 +46,7 @@ set(all_amdgpu_architectures "gfx700;gfx701;gfx801;gfx803;gfx900;gfx902;gfx906" "gfx908;gfx90a;gfx90c;gfx940;gfx941;gfx942;gfx1010" "gfx1030;gfx1031;gfx1032;gfx1033;gfx1034;gfx1035" "gfx1036;gfx1100;gfx1101;gfx1102;gfx1103;gfx1150" - "gfx1151") + "gfx1151;gfx1152") set(all_nvptx_architectures "sm_35;sm_37;sm_50;sm_52;sm_53;sm_60;sm_61;sm_62" "sm_70;sm_72;sm_75;sm_80;sm_86;sm_87;sm_89;sm_90") set(all_gpu_architectures diff --git a/offload/include/PluginManager.h b/offload/include/PluginManager.h index 1d6804d..fce2adc 100644 --- a/offload/include/PluginManager.h +++ b/offload/include/PluginManager.h @@ -64,10 +64,6 @@ struct PluginManager { std::make_unique<DeviceImageTy>(TgtBinDesc, TgtDeviceImage)); } - /// Initialize as many devices as possible for this plugin. Devices that fail - /// to initialize are ignored. - void initDevices(GenericPluginTy &RTL); - /// Return the device presented to the user as device \p DeviceNo if it is /// initialized and ready. Otherwise return an error explaining the problem. 
llvm::Expected<DeviceTy &> getDevice(uint32_t DeviceNo); @@ -117,20 +113,31 @@ struct PluginManager { return Devices.getExclusiveAccessor(); } - int getNumUsedPlugins() const { return DeviceOffsets.size(); } - // Initialize all plugins. void initAllPlugins(); /// Iterator range for all plugins (in use or not, but always valid). auto plugins() { return llvm::make_pointee_range(Plugins); } + /// Iterator range for all plugins (in use or not, but always valid). + auto plugins() const { return llvm::make_pointee_range(Plugins); } + /// Return the user provided requirements. int64_t getRequirements() const { return Requirements.getRequirements(); } /// Add \p Flags to the user provided requirements. void addRequirements(int64_t Flags) { Requirements.addRequirements(Flags); } + /// Returns the number of plugins that are active. + int getNumActivePlugins() const { + int count = 0; + for (auto &R : plugins()) + if (R.is_initialized()) + ++count; + + return count; + } + private: bool RTLsLoaded = false; llvm::SmallVector<__tgt_bin_desc *> DelayedBinDesc; @@ -138,11 +145,9 @@ private: // List of all plugins, in use or not. llvm::SmallVector<std::unique_ptr<GenericPluginTy>> Plugins; - // Mapping of plugins to offsets in the device table. - llvm::DenseMap<const GenericPluginTy *, int32_t> DeviceOffsets; - - // Mapping of plugins to the number of used devices. - llvm::DenseMap<const GenericPluginTy *, int32_t> DeviceUsed; + // Mapping of plugins to the OpenMP device identifier. + llvm::DenseMap<std::pair<const GenericPluginTy *, int32_t>, int32_t> + DeviceIds; // Set of all device images currently in use. 
llvm::DenseSet<const __tgt_device_image *> UsedImages; diff --git a/offload/plugins-nextgen/amdgpu/src/rtl.cpp b/offload/plugins-nextgen/amdgpu/src/rtl.cpp index c6dd954..663cfdc 100644 --- a/offload/plugins-nextgen/amdgpu/src/rtl.cpp +++ b/offload/plugins-nextgen/amdgpu/src/rtl.cpp @@ -3163,25 +3163,24 @@ struct AMDGPUPluginTy final : public GenericPluginTy { uint16_t getMagicElfBits() const override { return ELF::EM_AMDGPU; } /// Check whether the image is compatible with an AMDGPU device. - Expected<bool> isELFCompatible(StringRef Image) const override { + Expected<bool> isELFCompatible(uint32_t DeviceId, + StringRef Image) const override { // Get the associated architecture and flags from the ELF. auto ElfOrErr = ELF64LEObjectFile::create( MemoryBufferRef(Image, /*Identifier=*/""), /*InitContent=*/false); if (!ElfOrErr) return ElfOrErr.takeError(); std::optional<StringRef> Processor = ElfOrErr->tryGetCPUName(); + if (!Processor) + return false; - for (hsa_agent_t Agent : KernelAgents) { - auto TargeTripleAndFeaturesOrError = - utils::getTargetTripleAndFeatures(Agent); - if (!TargeTripleAndFeaturesOrError) - return TargeTripleAndFeaturesOrError.takeError(); - if (!utils::isImageCompatibleWithEnv(Processor ? *Processor : "", + auto TargeTripleAndFeaturesOrError = + utils::getTargetTripleAndFeatures(getKernelAgent(DeviceId)); + if (!TargeTripleAndFeaturesOrError) + return TargeTripleAndFeaturesOrError.takeError(); + return utils::isImageCompatibleWithEnv(Processor ? *Processor : "", ElfOrErr->getPlatformFlags(), - *TargeTripleAndFeaturesOrError)) - return false; - } - return true; + *TargeTripleAndFeaturesOrError); } bool isDataExchangable(int32_t SrcDeviceId, int32_t DstDeviceId) override { @@ -3273,19 +3272,13 @@ Error AMDGPUKernelTy::launchImpl(GenericDeviceTy &GenericDevice, if (ArgsSize < KernelArgsSize) return Plugin::error("Mismatch of kernel arguments size"); - // The args size reported by HSA may or may not contain the implicit args. 
- // For now, assume that HSA does not consider the implicit arguments when - // reporting the arguments of a kernel. In the worst case, we can waste - // 56 bytes per allocation. - uint32_t AllArgsSize = KernelArgsSize + ImplicitArgsSize; - AMDGPUPluginTy &AMDGPUPlugin = static_cast<AMDGPUPluginTy &>(GenericDevice.Plugin); AMDHostDeviceTy &HostDevice = AMDGPUPlugin.getHostDevice(); AMDGPUMemoryManagerTy &ArgsMemoryManager = HostDevice.getArgsMemoryManager(); void *AllArgs = nullptr; - if (auto Err = ArgsMemoryManager.allocate(AllArgsSize, &AllArgs)) + if (auto Err = ArgsMemoryManager.allocate(ArgsSize, &AllArgs)) return Err; // Account for user requested dynamic shared memory. diff --git a/offload/plugins-nextgen/common/include/PluginInterface.h b/offload/plugins-nextgen/common/include/PluginInterface.h index eda6a4f..88423be 100644 --- a/offload/plugins-nextgen/common/include/PluginInterface.h +++ b/offload/plugins-nextgen/common/include/PluginInterface.h @@ -993,11 +993,11 @@ struct GenericPluginTy { /// Get the number of active devices. int32_t getNumDevices() const { return NumDevices; } - /// Get the plugin-specific device identifier offset. - int32_t getDeviceIdStartIndex() const { return DeviceIdStartIndex; } - - /// Set the plugin-specific device identifier offset. - void setDeviceIdStartIndex(int32_t Offset) { DeviceIdStartIndex = Offset; } + /// Get the plugin-specific device identifier. + int32_t getUserId(int32_t DeviceId) const { + assert(UserDeviceIds.contains(DeviceId) && "No user-id registered"); + return UserDeviceIds.at(DeviceId); + } /// Get the ELF code to recognize the binary image of this plugin. virtual uint16_t getMagicElfBits() const = 0; @@ -1059,7 +1059,8 @@ struct GenericPluginTy { /// Indicate if an image is compatible with the plugin devices. Notice that /// this function may be called before actually initializing the devices. So /// we could not move this function into GenericDeviceTy. 
- virtual Expected<bool> isELFCompatible(StringRef Image) const = 0; + virtual Expected<bool> isELFCompatible(uint32_t DeviceID, + StringRef Image) const = 0; protected: /// Indicate whether a device id is valid. @@ -1070,11 +1071,18 @@ protected: public: // TODO: This plugin interface needs to be cleaned up. - /// Returns true if the plugin has been initialized. + /// Returns non-zero if the plugin runtime has been initialized. int32_t is_initialized() const; - /// Returns non-zero if the provided \p Image can be executed by the runtime. - int32_t is_valid_binary(__tgt_device_image *Image, bool Initialized = true); + /// Returns non-zero if the \p Image is compatible with the plugin. This + /// function does not require the plugin to be initialized before use. + int32_t is_plugin_compatible(__tgt_device_image *Image); + + /// Returns non-zero if the \p Image is compatible with the device. + int32_t is_device_compatible(int32_t DeviceId, __tgt_device_image *Image); + + /// Returns non-zero if the plugin device has been initialized. + int32_t is_device_initialized(int32_t DeviceId) const; /// Initialize the device inside of the plugin. int32_t init_device(int32_t DeviceId); @@ -1180,7 +1188,7 @@ public: const char **ErrStr); /// Sets the offset into the devices for use by OMPT. - int32_t set_device_offset(int32_t DeviceIdOffset); + int32_t set_device_identifier(int32_t UserId, int32_t DeviceId); /// Returns if the plugin can support auotmatic copy. int32_t use_auto_zero_copy(int32_t DeviceId); @@ -1200,10 +1208,8 @@ private: /// Number of devices available for the plugin. int32_t NumDevices = 0; - /// Index offset, which when added to a DeviceId, will yield a unique - /// user-observable device identifier. This is especially important when - /// DeviceIds of multiple plugins / RTLs need to be distinguishable. - int32_t DeviceIdStartIndex = 0; + /// Map of plugin device identifiers to the user device identifier. 
+ llvm::DenseMap<int32_t, int32_t> UserDeviceIds; /// Array of pointers to the devices. Initially, they are all set to nullptr. /// Once a device is initialized, the pointer is stored in the position given diff --git a/offload/plugins-nextgen/common/src/PluginInterface.cpp b/offload/plugins-nextgen/common/src/PluginInterface.cpp index 913721a..5a53c47 100644 --- a/offload/plugins-nextgen/common/src/PluginInterface.cpp +++ b/offload/plugins-nextgen/common/src/PluginInterface.cpp @@ -748,8 +748,7 @@ Error GenericDeviceTy::init(GenericPluginTy &Plugin) { if (ompt::Initialized) { bool ExpectedStatus = false; if (OmptInitialized.compare_exchange_strong(ExpectedStatus, true)) - performOmptCallback(device_initialize, /*device_num=*/DeviceId + - Plugin.getDeviceIdStartIndex(), + performOmptCallback(device_initialize, Plugin.getUserId(DeviceId), /*type=*/getComputeUnitKind().c_str(), /*device=*/reinterpret_cast<ompt_device_t *>(this), /*lookup=*/ompt::lookupCallbackByName, @@ -847,9 +846,7 @@ Error GenericDeviceTy::deinit(GenericPluginTy &Plugin) { if (ompt::Initialized) { bool ExpectedStatus = true; if (OmptInitialized.compare_exchange_strong(ExpectedStatus, false)) - performOmptCallback(device_finalize, - /*device_num=*/DeviceId + - Plugin.getDeviceIdStartIndex()); + performOmptCallback(device_finalize, Plugin.getUserId(DeviceId)); } #endif @@ -908,7 +905,7 @@ GenericDeviceTy::loadBinary(GenericPluginTy &Plugin, size_t Bytes = getPtrDiff(InputTgtImage->ImageEnd, InputTgtImage->ImageStart); performOmptCallback( - device_load, /*device_num=*/DeviceId + Plugin.getDeviceIdStartIndex(), + device_load, Plugin.getUserId(DeviceId), /*FileName=*/nullptr, /*FileOffset=*/0, /*VmaInFile=*/nullptr, /*ImgSize=*/Bytes, /*HostAddr=*/InputTgtImage->ImageStart, /*DeviceAddr=*/nullptr, /* FIXME: ModuleId */ 0); @@ -1492,11 +1489,14 @@ Error GenericDeviceTy::syncEvent(void *EventPtr) { bool GenericDeviceTy::useAutoZeroCopy() { return useAutoZeroCopyImpl(); } Error GenericPluginTy::init() { + 
if (Initialized) + return Plugin::success(); + auto NumDevicesOrErr = initImpl(); if (!NumDevicesOrErr) return NumDevicesOrErr.takeError(); - Initialized = true; + NumDevices = *NumDevicesOrErr; if (NumDevices == 0) return Plugin::success(); @@ -1517,6 +1517,8 @@ Error GenericPluginTy::init() { } Error GenericPluginTy::deinit() { + assert(Initialized && "Plugin was not initialized!"); + // Deinitialize all active devices. for (int32_t DeviceId = 0; DeviceId < NumDevices; ++DeviceId) { if (Devices[DeviceId]) { @@ -1537,7 +1539,11 @@ Error GenericPluginTy::deinit() { delete RecordReplay; // Perform last deinitializations on the plugin. - return deinitImpl(); + if (Error Err = deinitImpl()) + return Err; + Initialized = false; + + return Plugin::success(); } Error GenericPluginTy::initDevice(int32_t DeviceId) { @@ -1599,8 +1605,7 @@ Expected<bool> GenericPluginTy::checkBitcodeImage(StringRef Image) const { int32_t GenericPluginTy::is_initialized() const { return Initialized; } -int32_t GenericPluginTy::is_valid_binary(__tgt_device_image *Image, - bool Initialized) { +int32_t GenericPluginTy::is_plugin_compatible(__tgt_device_image *Image) { StringRef Buffer(reinterpret_cast<const char *>(Image->ImageStart), target::getPtrDiff(Image->ImageEnd, Image->ImageStart)); @@ -1618,11 +1623,43 @@ int32_t GenericPluginTy::is_valid_binary(__tgt_device_image *Image, auto MatchOrErr = checkELFImage(Buffer); if (Error Err = MatchOrErr.takeError()) return HandleError(std::move(Err)); - if (!Initialized || !*MatchOrErr) - return *MatchOrErr; + return *MatchOrErr; + } + case file_magic::bitcode: { + auto MatchOrErr = checkBitcodeImage(Buffer); + if (Error Err = MatchOrErr.takeError()) + return HandleError(std::move(Err)); + return *MatchOrErr; + } + default: + return false; + } +} + +int32_t GenericPluginTy::is_device_compatible(int32_t DeviceId, + __tgt_device_image *Image) { + StringRef Buffer(reinterpret_cast<const char *>(Image->ImageStart), + target::getPtrDiff(Image->ImageEnd, 
Image->ImageStart)); + + auto HandleError = [&](Error Err) -> bool { + [[maybe_unused]] std::string ErrStr = toString(std::move(Err)); + DP("Failure to check validity of image %p: %s", Image, ErrStr.c_str()); + return false; + }; + switch (identify_magic(Buffer)) { + case file_magic::elf: + case file_magic::elf_relocatable: + case file_magic::elf_executable: + case file_magic::elf_shared_object: + case file_magic::elf_core: { + auto MatchOrErr = checkELFImage(Buffer); + if (Error Err = MatchOrErr.takeError()) + return HandleError(std::move(Err)); + if (!*MatchOrErr) + return false; // Perform plugin-dependent checks for the specific architecture if needed. - auto CompatibleOrErr = isELFCompatible(Buffer); + auto CompatibleOrErr = isELFCompatible(DeviceId, Buffer); if (Error Err = CompatibleOrErr.takeError()) return HandleError(std::move(Err)); return *CompatibleOrErr; @@ -1638,6 +1675,10 @@ int32_t GenericPluginTy::is_valid_binary(__tgt_device_image *Image, } } +int32_t GenericPluginTy::is_device_initialized(int32_t DeviceId) const { + return isValidDeviceId(DeviceId) && Devices[DeviceId] != nullptr; +} + int32_t GenericPluginTy::init_device(int32_t DeviceId) { auto Err = initDevice(DeviceId); if (Err) { @@ -1985,8 +2026,9 @@ int32_t GenericPluginTy::init_device_info(int32_t DeviceId, return OFFLOAD_SUCCESS; } -int32_t GenericPluginTy::set_device_offset(int32_t DeviceIdOffset) { - setDeviceIdStartIndex(DeviceIdOffset); +int32_t GenericPluginTy::set_device_identifier(int32_t UserId, + int32_t DeviceId) { + UserDeviceIds[DeviceId] = UserId; return OFFLOAD_SUCCESS; } diff --git a/offload/plugins-nextgen/cuda/src/rtl.cpp b/offload/plugins-nextgen/cuda/src/rtl.cpp index b260334..62460c0 100644 --- a/offload/plugins-nextgen/cuda/src/rtl.cpp +++ b/offload/plugins-nextgen/cuda/src/rtl.cpp @@ -1388,8 +1388,9 @@ struct CUDAPluginTy final : public GenericPluginTy { const char *getName() const override { return GETNAME(TARGET_NAME); } - /// Check whether the image is 
compatible with the available CUDA devices. - Expected<bool> isELFCompatible(StringRef Image) const override { + /// Check whether the image is compatible with a CUDA device. + Expected<bool> isELFCompatible(uint32_t DeviceId, + StringRef Image) const override { auto ElfOrErr = ELF64LEObjectFile::create(MemoryBufferRef(Image, /*Identifier=*/""), /*InitContent=*/false); @@ -1399,33 +1400,29 @@ struct CUDAPluginTy final : public GenericPluginTy { // Get the numeric value for the image's `sm_` value. auto SM = ElfOrErr->getPlatformFlags() & ELF::EF_CUDA_SM; - for (int32_t DevId = 0; DevId < getNumDevices(); ++DevId) { - CUdevice Device; - CUresult Res = cuDeviceGet(&Device, DevId); - if (auto Err = Plugin::check(Res, "Error in cuDeviceGet: %s")) - return std::move(Err); - - int32_t Major, Minor; - Res = cuDeviceGetAttribute( - &Major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, Device); - if (auto Err = Plugin::check(Res, "Error in cuDeviceGetAttribute: %s")) - return std::move(Err); - - Res = cuDeviceGetAttribute( - &Minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, Device); - if (auto Err = Plugin::check(Res, "Error in cuDeviceGetAttribute: %s")) - return std::move(Err); - - int32_t ImageMajor = SM / 10; - int32_t ImageMinor = SM % 10; - - // A cubin generated for a certain compute capability is supported to - // run on any GPU with the same major revision and same or higher minor - // revision. 
- if (Major != ImageMajor || Minor < ImageMinor) - return false; - } - return true; + CUdevice Device; + CUresult Res = cuDeviceGet(&Device, DeviceId); + if (auto Err = Plugin::check(Res, "Error in cuDeviceGet: %s")) + return std::move(Err); + + int32_t Major, Minor; + Res = cuDeviceGetAttribute( + &Major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, Device); + if (auto Err = Plugin::check(Res, "Error in cuDeviceGetAttribute: %s")) + return std::move(Err); + + Res = cuDeviceGetAttribute( + &Minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, Device); + if (auto Err = Plugin::check(Res, "Error in cuDeviceGetAttribute: %s")) + return std::move(Err); + + int32_t ImageMajor = SM / 10; + int32_t ImageMinor = SM % 10; + + // A cubin generated for a certain compute capability is supported to + // run on any GPU with the same major revision and same or higher minor + // revision. + return Major == ImageMajor && Minor >= ImageMinor; } }; diff --git a/offload/plugins-nextgen/host/src/rtl.cpp b/offload/plugins-nextgen/host/src/rtl.cpp index ef84cba..aa59ea6 100644 --- a/offload/plugins-nextgen/host/src/rtl.cpp +++ b/offload/plugins-nextgen/host/src/rtl.cpp @@ -418,7 +418,9 @@ struct GenELF64PluginTy final : public GenericPluginTy { } /// All images (ELF-compatible) should be compatible with this plugin. 
- Expected<bool> isELFCompatible(StringRef) const override { return true; } + Expected<bool> isELFCompatible(uint32_t, StringRef) const override { + return true; + } Triple::ArchType getTripleArch() const override { #if defined(__x86_64__) diff --git a/offload/src/PluginManager.cpp b/offload/src/PluginManager.cpp index 13f08b1..5e8f917 100644 --- a/offload/src/PluginManager.cpp +++ b/offload/src/PluginManager.cpp @@ -47,6 +47,9 @@ void PluginManager::deinit() { DP("Unloading RTLs...\n"); for (auto &Plugin : Plugins) { + if (!Plugin->is_initialized()) + continue; + if (auto Err = Plugin->deinit()) { [[maybe_unused]] std::string InfoMsg = toString(std::move(Err)); DP("Failed to deinit plugin: %s\n", InfoMsg.c_str()); @@ -57,90 +60,15 @@ void PluginManager::deinit() { DP("RTLs unloaded!\n"); } -void PluginManager::initDevices(GenericPluginTy &RTL) { - // If this RTL has already been initialized. - if (PM->DeviceOffsets.contains(&RTL)) - return; - TIMESCOPE(); - - // If this RTL is not already in use, initialize it. - assert(RTL.number_of_devices() > 0 && "Tried to initialize useless plugin!"); - - // Initialize the device information for the RTL we are about to use. - auto ExclusiveDevicesAccessor = getExclusiveDevicesAccessor(); - - // Initialize the index of this RTL and save it in the used RTLs. - int32_t DeviceOffset = ExclusiveDevicesAccessor->size(); - - // Set the device identifier offset in the plugin. - RTL.set_device_offset(DeviceOffset); - - int32_t NumberOfUserDevices = 0; - int32_t NumPD = RTL.number_of_devices(); - ExclusiveDevicesAccessor->reserve(DeviceOffset + NumPD); - // Auto zero-copy is a per-device property. We need to ensure - // that all devices are suggesting to use it. 
- bool UseAutoZeroCopy = !(NumPD == 0); - for (int32_t PDevI = 0, UserDevId = DeviceOffset; PDevI < NumPD; PDevI++) { - auto Device = std::make_unique<DeviceTy>(&RTL, UserDevId, PDevI); - if (auto Err = Device->init()) { - DP("Skip plugin known device %d: %s\n", PDevI, - toString(std::move(Err)).c_str()); - continue; - } - UseAutoZeroCopy = UseAutoZeroCopy && Device->useAutoZeroCopy(); - - ExclusiveDevicesAccessor->push_back(std::move(Device)); - ++NumberOfUserDevices; - ++UserDevId; - } - - // Auto Zero-Copy can only be currently triggered when the system is an - // homogeneous APU architecture without attached discrete GPUs. - // If all devices suggest to use it, change requirment flags to trigger - // zero-copy behavior when mapping memory. - if (UseAutoZeroCopy) - addRequirements(OMPX_REQ_AUTO_ZERO_COPY); - - DeviceOffsets[&RTL] = DeviceOffset; - DeviceUsed[&RTL] = NumberOfUserDevices; - DP("Plugin has index %d, exposes %d out of %d devices!\n", DeviceOffset, - NumberOfUserDevices, RTL.number_of_devices()); -} - void PluginManager::initAllPlugins() { - for (auto &R : Plugins) - initDevices(*R); -} - -static void registerImageIntoTranslationTable(TranslationTable &TT, - int32_t DeviceOffset, - int32_t NumberOfUserDevices, - __tgt_device_image *Image) { - - // same size, as when we increase one, we also increase the other. - assert(TT.TargetsTable.size() == TT.TargetsImages.size() && - "We should have as many images as we have tables!"); - - // Resize the Targets Table and Images to accommodate the new targets if - // required - unsigned TargetsTableMinimumSize = DeviceOffset + NumberOfUserDevices; - - if (TT.TargetsTable.size() < TargetsTableMinimumSize) { - TT.DeviceTables.resize(TargetsTableMinimumSize, {}); - TT.TargetsImages.resize(TargetsTableMinimumSize, 0); - TT.TargetsEntries.resize(TargetsTableMinimumSize, {}); - TT.TargetsTable.resize(TargetsTableMinimumSize, 0); - } - - // Register the image in all devices for this target type. 
- for (int32_t I = 0; I < NumberOfUserDevices; ++I) { - // If we are changing the image we are also invalidating the target table. - if (TT.TargetsImages[DeviceOffset + I] != Image) { - TT.TargetsImages[DeviceOffset + I] = Image; - TT.TargetsTable[DeviceOffset + I] = - 0; // lazy initialization of target table. + for (auto &R : plugins()) { + if (auto Err = R.init()) { + [[maybe_unused]] std::string InfoMsg = toString(std::move(Err)); + DP("Failed to init plugin: %s\n", InfoMsg.c_str()); + continue; } + DP("Registered plugin %s with %d visible device(s)\n", R.getName(), + R.number_of_devices()); } } @@ -153,27 +81,6 @@ void PluginManager::registerLib(__tgt_bin_desc *Desc) { if (Entry.flags == OMP_REGISTER_REQUIRES) PM->addRequirements(Entry.data); - // Initialize all the plugins that have associated images. - for (auto &Plugin : Plugins) { - // Extract the exectuable image and extra information if availible. - for (int32_t i = 0; i < Desc->NumDeviceImages; ++i) { - if (Plugin->is_initialized()) - continue; - - if (!Plugin->is_valid_binary(&Desc->DeviceImages[i], - /*Initialized=*/false)) - continue; - - if (auto Err = Plugin->init()) { - [[maybe_unused]] std::string InfoMsg = toString(std::move(Err)); - DP("Failed to init plugin: %s\n", InfoMsg.c_str()); - } else { - DP("Registered plugin %s with %d visible device(s)\n", - Plugin->getName(), Plugin->number_of_devices()); - } - } - } - // Extract the exectuable image and extra information if availible. for (int32_t i = 0; i < Desc->NumDeviceImages; ++i) PM->addDeviceImage(*Desc, Desc->DeviceImages[i]); @@ -188,54 +95,110 @@ void PluginManager::registerLib(__tgt_bin_desc *Desc) { // Scan the RTLs that have associated images until we find one that supports // the current image. 
for (auto &R : PM->plugins()) { - if (!R.number_of_devices()) + if (!R.is_plugin_compatible(Img)) continue; - if (!R.is_valid_binary(Img, /*Initialized=*/true)) { - DP("Image " DPxMOD " is NOT compatible with RTL %s!\n", - DPxPTR(Img->ImageStart), R.getName()); - continue; + if (!R.is_initialized()) { + if (auto Err = R.init()) { + [[maybe_unused]] std::string InfoMsg = toString(std::move(Err)); + DP("Failed to init plugin: %s\n", InfoMsg.c_str()); + continue; + } + DP("Registered plugin %s with %d visible device(s)\n", R.getName(), + R.number_of_devices()); } - DP("Image " DPxMOD " is compatible with RTL %s!\n", - DPxPTR(Img->ImageStart), R.getName()); - - PM->initDevices(R); + if (!R.number_of_devices()) { + DP("Skipping plugin %s with no visible devices\n", R.getName()); + continue; + } - // Initialize (if necessary) translation table for this library. - PM->TrlTblMtx.lock(); - if (!PM->HostEntriesBeginToTransTable.count(Desc->HostEntriesBegin)) { - PM->HostEntriesBeginRegistrationOrder.push_back(Desc->HostEntriesBegin); - TranslationTable &TransTable = + for (int32_t DeviceId = 0; DeviceId < R.number_of_devices(); ++DeviceId) { + if (!R.is_device_compatible(DeviceId, Img)) + continue; + + DP("Image " DPxMOD " is compatible with RTL %s device %d!\n", + DPxPTR(Img->ImageStart), R.getName(), DeviceId); + + if (!R.is_device_initialized(DeviceId)) { + // Initialize the device information for the RTL we are about to use. + auto ExclusiveDevicesAccessor = getExclusiveDevicesAccessor(); + + int32_t UserId = ExclusiveDevicesAccessor->size(); + + // Set the device identifier offset in the plugin. 
+#ifdef OMPT_SUPPORT + R.set_device_identifier(UserId, DeviceId); +#endif + + auto Device = std::make_unique<DeviceTy>(&R, UserId, DeviceId); + if (auto Err = Device->init()) { + [[maybe_unused]] std::string InfoMsg = toString(std::move(Err)); + DP("Failed to init device %d: %s\n", DeviceId, InfoMsg.c_str()); + continue; + } + + ExclusiveDevicesAccessor->push_back(std::move(Device)); + + // We need to map between the plugin's device identifier and the one + // that OpenMP will use. + PM->DeviceIds[std::make_pair(&R, DeviceId)] = UserId; + } + + // Initialize (if necessary) translation table for this library. + PM->TrlTblMtx.lock(); + if (!PM->HostEntriesBeginToTransTable.count(Desc->HostEntriesBegin)) { + PM->HostEntriesBeginRegistrationOrder.push_back( + Desc->HostEntriesBegin); + TranslationTable &TT = + (PM->HostEntriesBeginToTransTable)[Desc->HostEntriesBegin]; + TT.HostTable.EntriesBegin = Desc->HostEntriesBegin; + TT.HostTable.EntriesEnd = Desc->HostEntriesEnd; + } + + // Retrieve translation table for this library. + TranslationTable &TT = (PM->HostEntriesBeginToTransTable)[Desc->HostEntriesBegin]; - TransTable.HostTable.EntriesBegin = Desc->HostEntriesBegin; - TransTable.HostTable.EntriesEnd = Desc->HostEntriesEnd; - } - // Retrieve translation table for this library. 
- TranslationTable &TransTable = - (PM->HostEntriesBeginToTransTable)[Desc->HostEntriesBegin]; + DP("Registering image " DPxMOD " with RTL %s!\n", + DPxPTR(Img->ImageStart), R.getName()); - DP("Registering image " DPxMOD " with RTL %s!\n", DPxPTR(Img->ImageStart), - R.getName()); + auto UserId = PM->DeviceIds[std::make_pair(&R, DeviceId)]; + if (TT.TargetsTable.size() < static_cast<size_t>(UserId + 1)) { + TT.DeviceTables.resize(UserId + 1, {}); + TT.TargetsImages.resize(UserId + 1, nullptr); + TT.TargetsEntries.resize(UserId + 1, {}); + TT.TargetsTable.resize(UserId + 1, nullptr); + } - registerImageIntoTranslationTable(TransTable, PM->DeviceOffsets[&R], - PM->DeviceUsed[&R], Img); - PM->UsedImages.insert(Img); + // Register the image for this target type and invalidate the table. + TT.TargetsImages[UserId] = Img; + TT.TargetsTable[UserId] = nullptr; - PM->TrlTblMtx.unlock(); - FoundRTL = &R; + PM->UsedImages.insert(Img); + FoundRTL = &R; - // if an RTL was found we are done - proceed to register the next image - break; + PM->TrlTblMtx.unlock(); + } } - - if (!FoundRTL) { + if (!FoundRTL) DP("No RTL found for image " DPxMOD "!\n", DPxPTR(Img->ImageStart)); - } } PM->RTLsMtx.unlock(); + bool UseAutoZeroCopy = Plugins.size() > 0; + + auto ExclusiveDevicesAccessor = getExclusiveDevicesAccessor(); + for (const auto &Device : *ExclusiveDevicesAccessor) + UseAutoZeroCopy &= Device->useAutoZeroCopy(); + + // Auto Zero-Copy can only be currently triggered when the system is an + // homogeneous APU architecture without attached discrete GPUs. + // If all devices suggest to use it, change requirement flags to trigger + // zero-copy behavior when mapping memory. + if (UseAutoZeroCopy) + addRequirements(OMPX_REQ_AUTO_ZERO_COPY); + DP("Done registering entries!\n"); } @@ -257,7 +220,7 @@ void PluginManager::unregisterLib(__tgt_bin_desc *Desc) { // Scan the RTLs that have associated images until we find one that supports // the current image. 
We only need to scan RTLs that are already being used. for (auto &R : PM->plugins()) { - if (!DeviceOffsets.contains(&R)) + if (!R.is_initialized()) continue; // Ensure that we do not use any unused images associated with this RTL. diff --git a/offload/src/omptarget.cpp b/offload/src/omptarget.cpp index 91e1213..9bca852 100644 --- a/offload/src/omptarget.cpp +++ b/offload/src/omptarget.cpp @@ -315,7 +315,7 @@ void handleTargetOutcome(bool Success, ident_t *Loc) { FAILURE_MESSAGE("Consult https://openmp.llvm.org/design/Runtimes.html " "for debugging options.\n"); - if (!PM->getNumUsedPlugins()) { + if (!PM->getNumActivePlugins()) { FAILURE_MESSAGE( "No images found compatible with the installed hardware. "); diff --git a/offload/test/offloading/ompx_bare_shfl_down_sync.cpp b/offload/test/offloading/ompx_bare_shfl_down_sync.cpp index d2569a5..c924689 100644 --- a/offload/test/offloading/ompx_bare_shfl_down_sync.cpp +++ b/offload/test/offloading/ompx_bare_shfl_down_sync.cpp @@ -23,7 +23,7 @@ bool equal(T LHS, T RHS) { template <typename T, std::enable_if_t<std::is_floating_point<T>::value, bool> = true> bool equal(T LHS, T RHS) { - return std::abs(LHS - RHS) < std::numeric_limits<T>::epsilon(); + return __builtin_fabs(LHS - RHS) < std::numeric_limits<T>::epsilon(); } template <typename T> void test() { diff --git a/openmp/CMakeLists.txt b/openmp/CMakeLists.txt index c228a39..e565354 100644 --- a/openmp/CMakeLists.txt +++ b/openmp/CMakeLists.txt @@ -137,8 +137,10 @@ if (OPENMP_ENABLE_OMPT_TOOLS) endif() # Propagate OMPT support to offload -set(LIBOMP_HAVE_OMPT_SUPPORT ${LIBOMP_HAVE_OMPT_SUPPORT} PARENT_SCOPE) -set(LIBOMP_OMP_TOOLS_INCLUDE_DIR ${LIBOMP_OMP_TOOLS_INCLUDE_DIR} PARENT_SCOPE) +if(NOT ${OPENMP_STANDALONE_BUILD}) + set(LIBOMP_HAVE_OMPT_SUPPORT ${LIBOMP_HAVE_OMPT_SUPPORT} PARENT_SCOPE) + set(LIBOMP_OMP_TOOLS_INCLUDE_DIR ${LIBOMP_OMP_TOOLS_INCLUDE_DIR} PARENT_SCOPE) +endif() option(OPENMP_MSVC_NAME_SCHEME "Build dll with MSVC naming scheme."
OFF) diff --git a/third-party/unittest/googletest/include/gtest/internal/gtest-port.h b/third-party/unittest/googletest/include/gtest/internal/gtest-port.h index a17349e..02e1eb0 100644 --- a/third-party/unittest/googletest/include/gtest/internal/gtest-port.h +++ b/third-party/unittest/googletest/include/gtest/internal/gtest-port.h @@ -652,7 +652,7 @@ typedef struct _RTL_CRITICAL_SECTION GTEST_CRITICAL_SECTION; // Determines whether to support death tests. // pops up a dialog window that cannot be suppressed programmatically. #if (defined(GTEST_OS_LINUX) || defined(GTEST_OS_CYGWIN) || \ - defined(GTEST_OS_SOLARIS) || \ + defined(GTEST_OS_SOLARIS) || defined(GTEST_OS_ZOS) || \ (defined(GTEST_OS_MAC) && !defined(GTEST_OS_IOS)) || \ (defined(GTEST_OS_WINDOWS_DESKTOP) && _MSC_VER) || \ defined(GTEST_OS_WINDOWS_MINGW) || defined(GTEST_OS_AIX) || \ diff --git a/utils/bazel/llvm-project-overlay/libc/BUILD.bazel b/utils/bazel/llvm-project-overlay/libc/BUILD.bazel index f3809bd..96cc895 100644 --- a/utils/bazel/llvm-project-overlay/libc/BUILD.bazel +++ b/utils/bazel/llvm-project-overlay/libc/BUILD.bazel @@ -1111,7 +1111,7 @@ libc_support_library( ], defines = [ "LIBC_COPT_TIMEOUT_ENSURE_MONOTONICITY", - "LIBC_COPT_RAW_MUTEX_DEFAULT_SPIN_COUNT" + "LIBC_COPT_RAW_MUTEX_DEFAULT_SPIN_COUNT", ], target_compatible_with = select({ "@platforms//os:linux": [], @@ -1119,9 +1119,9 @@ libc_support_library( }), deps = [ ":__support_cpp_optional", - ":__support_time_linux", ":__support_threads_linux_futex_utils", ":__support_threads_sleep", + ":__support_time_linux", ":types_pid_t", ], ) @@ -3580,6 +3580,7 @@ libc_function( }), weak = True, deps = [ + ":__support_macros_sanitizer", ":__support_osutil_syscall", ":errno", ":hdr_signal_macros", @@ -3599,6 +3600,7 @@ libc_function( }), weak = True, deps = [ + ":__support_macros_sanitizer", ":__support_osutil_syscall", ":errno", ":hdr_signal_macros", @@ -3620,6 +3622,7 @@ libc_function( # }), # weak = True, # deps = [ +# 
":__support_macros_sanitizer", # ":__support_osutil_syscall", # ":errno", # ":hdr_signal_macros", diff --git a/utils/bazel/llvm-project-overlay/libc/test/src/__support/BUILD.bazel b/utils/bazel/llvm-project-overlay/libc/test/src/__support/BUILD.bazel index 3980ef6..c8001fe 100644 --- a/utils/bazel/llvm-project-overlay/libc/test/src/__support/BUILD.bazel +++ b/utils/bazel/llvm-project-overlay/libc/test/src/__support/BUILD.bazel @@ -99,6 +99,7 @@ libc_test( name = "fixedvector_test", srcs = ["fixedvector_test.cpp"], deps = [ + "//libc:__support_cpp_array", "//libc:__support_fixedvector", ], ) diff --git a/utils/bazel/llvm-project-overlay/libc/test/src/math/BUILD.bazel b/utils/bazel/llvm-project-overlay/libc/test/src/math/BUILD.bazel index 4f72a0a..fac692a 100644 --- a/utils/bazel/llvm-project-overlay/libc/test/src/math/BUILD.bazel +++ b/utils/bazel/llvm-project-overlay/libc/test/src/math/BUILD.bazel @@ -529,7 +529,9 @@ libc_support_library( "//libc:__support_cpp_bit", "//libc:__support_cpp_type_traits", "//libc:__support_fputil_basic_operations", + "//libc:__support_fputil_fenv_impl", "//libc:__support_fputil_fp_bits", + "//libc:hdr_fenv_macros", "//libc:hdr_math_macros", "//libc/test/UnitTest:LibcUnitTest", "//libc/test/UnitTest:fp_test_helpers", diff --git a/utils/bazel/llvm-project-overlay/libc/test/src/math/libc_math_test_rules.bzl b/utils/bazel/llvm-project-overlay/libc/test/src/math/libc_math_test_rules.bzl index 11d2e5a..9ec3a5e 100644 --- a/utils/bazel/llvm-project-overlay/libc/test/src/math/libc_math_test_rules.bzl +++ b/utils/bazel/llvm-project-overlay/libc/test/src/math/libc_math_test_rules.bzl @@ -25,6 +25,7 @@ def math_test(name, hdrs = [], deps = [], **kwargs): srcs = [test_name + ".cpp"] + hdrs, libc_function_deps = ["//libc:func_name".replace("func_name", name)], deps = [ + "//libc/test/UnitTest:fp_test_helpers", "//libc:__support_cpp_algorithm", "//libc:__support_fputil_basic_operations", "//libc:__support_fputil_fenv_impl", @@ -32,9 +33,9 @@ def 
math_test(name, hdrs = [], deps = [], **kwargs): "//libc:__support_fputil_manipulation_functions", "//libc:__support_fputil_nearest_integer_operations", "//libc:__support_fputil_normal_float", + "//libc:__support_macros_properties_architectures", "//libc:__support_math_extras", "//libc:__support_uint128", - "//libc/test/UnitTest:fp_test_helpers", "//libc:hdr_math_macros", ] + deps, **kwargs diff --git a/utils/bazel/llvm-project-overlay/libc/test/src/math/smoke/BUILD.bazel b/utils/bazel/llvm-project-overlay/libc/test/src/math/smoke/BUILD.bazel index 7d4b9978..2ad2209 100644 --- a/utils/bazel/llvm-project-overlay/libc/test/src/math/smoke/BUILD.bazel +++ b/utils/bazel/llvm-project-overlay/libc/test/src/math/smoke/BUILD.bazel @@ -117,6 +117,7 @@ math_test( name = "llrintf128", hdrs = ["RoundToIntegerTest.h"], ) + math_test( name = "lroundf128", hdrs = ["RoundToIntegerTest.h"], @@ -135,7 +136,9 @@ libc_support_library( "//libc:__support_cpp_bit", "//libc:__support_cpp_type_traits", "//libc:__support_fputil_basic_operations", + "//libc:__support_fputil_fenv_impl", "//libc:__support_fputil_fp_bits", + "//libc:hdr_fenv_macros", "//libc:hdr_math_macros", "//libc/test/UnitTest:LibcUnitTest", "//libc/test/UnitTest:fp_test_helpers", diff --git a/utils/bazel/llvm-project-overlay/libc/utils/MPFRWrapper/BUILD.bazel b/utils/bazel/llvm-project-overlay/libc/utils/MPFRWrapper/BUILD.bazel index 53a8c9b..6dd1fc4 100644 --- a/utils/bazel/llvm-project-overlay/libc/utils/MPFRWrapper/BUILD.bazel +++ b/utils/bazel/llvm-project-overlay/libc/utils/MPFRWrapper/BUILD.bazel @@ -46,6 +46,7 @@ libc_support_library( "//libc:__support_cpp_type_traits", "//libc:__support_fputil_fp_bits", "//libc:__support_fputil_fpbits_str", + "//libc:__support_macros_properties_types", "//libc:hdr_math_macros", "//libc/test/UnitTest:LibcUnitTest", "//libc/test/UnitTest:fp_test_helpers", diff --git a/utils/bazel/llvm-project-overlay/llvm/BUILD.bazel b/utils/bazel/llvm-project-overlay/llvm/BUILD.bazel index 
a67f205..aebb05d 100644 --- a/utils/bazel/llvm-project-overlay/llvm/BUILD.bazel +++ b/utils/bazel/llvm-project-overlay/llvm/BUILD.bazel @@ -164,6 +164,10 @@ td_library( includes = ["include"], ) +llvm_config_target_defines = [ + "LLVM_HAS_{}_TARGET=1".format(t) for t in llvm_targets +] + cc_library( name = "config", hdrs = [ @@ -171,7 +175,7 @@ cc_library( "include/llvm/Config/llvm-config.h", ], copts = llvm_copts, - defines = llvm_config_defines, + defines = llvm_config_defines + llvm_config_target_defines, includes = ["include"], textual_hdrs = [ "include/llvm/Config/AsmParsers.def", @@ -287,6 +291,7 @@ cc_library( linkopts = select({ "@platforms//os:windows": [ "ws2_32.lib", + "ntdll.lib", ], "@platforms//os:freebsd": [ "-pthread", @@ -1754,6 +1759,7 @@ cc_library( ":TransformUtils", ":Vectorize", ":config", + ":ir_headers", ], ) |